[patch] assembly clean-up: xor



Browsing through the amd64 assembly code, I found that it could use a
little clean-up. For example, many functions use 64-bit instructions
even in cases where they are not necessary.

I prepared a patch set which
 * uses shorter instruction encodings where it does not affect speed
 * prefers instructions with lower latency
 * unifies some constructs between functions
Altogether it's not too intrusive, i.e. it's more of an adjustment than
a rewrite.

I've attached the first patch, which is the simplest one (it's also the
one that makes the most noise - with it applied, I think the following
ones are easier to review):

All XOR operations for clearing a register are converted to 32-bit.
This is sufficient because writing a 32-bit register automatically
zeroes the upper 32 bits, and it saves one byte (the REX.W prefix) for
the first eight GPRs.

You can see that compilers also prefer the 32-bit variant. As an
example, none of the 64-bit ones are left in the kernel once this patch
is applied:

~$ objdump -d /netbsd.old | \
grep -Ec '[[:blank:]]xor[[:blank:]]+%r([a-s][ipx]|[89]|1[0-5]),%r\1'
41
~$ objdump -d /netbsd | \
grep -Ec '[[:blank:]]xor[[:blank:]]+%r([a-s][ipx]|[89]|1[0-5]),%r\1'
0

(and the .text section of the kernel shrinks by 64 bytes)
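(One way to see the shrink, for instance, is to compare the text column
reported by size(1):

~$ size /netbsd.old /netbsd
)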

Best regards,
Jo.
Index: common/lib/libc/arch/x86_64/string/memchr.S
===================================================================
RCS file: /cvsroot/src/common/lib/libc/arch/x86_64/string/memchr.S,v
retrieving revision 1.6
diff -u -r1.6 memchr.S
--- common/lib/libc/arch/x86_64/string/memchr.S	22 Mar 2014 19:16:34 -0000	1.6
+++ common/lib/libc/arch/x86_64/string/memchr.S	4 May 2015 15:38:55 -0000
@@ -104,6 +104,6 @@
 	jmp	2b
 
 /* Not found */
-30:	xorq	%rax,%rax
+30:	xorl	%eax,%eax
 	ret
 END(memchr)
Index: common/lib/libc/arch/x86_64/string/strchr.S
===================================================================
RCS file: /cvsroot/src/common/lib/libc/arch/x86_64/string/strchr.S,v
retrieving revision 1.7
diff -u -r1.7 strchr.S
--- common/lib/libc/arch/x86_64/string/strchr.S	22 Mar 2014 19:16:34 -0000	1.7
+++ common/lib/libc/arch/x86_64/string/strchr.S	4 May 2015 15:38:55 -0000
@@ -111,7 +111,7 @@
 /* Source misaligned: read aligned word and make low bytes invalid */
 /* I (dsl) think a _ALIGN_TEXT here will slow things down! */
 20:
-	xor	%rcx,%rcx
+	xor	%ecx,%ecx
 	sub	%dil,%cl	/* Convert low address values 1..7 ... */
 	sbb	%rsi,%rsi	/* carry was set, so %rsi now ~0u! */
 	and	$7,%cl		/* ... to 7..1 */
Index: common/lib/libc/arch/x86_64/string/strrchr.S
===================================================================
RCS file: /cvsroot/src/common/lib/libc/arch/x86_64/string/strrchr.S,v
retrieving revision 1.3
diff -u -r1.3 strrchr.S
--- common/lib/libc/arch/x86_64/string/strrchr.S	22 Mar 2014 19:16:34 -0000	1.3
+++ common/lib/libc/arch/x86_64/string/strrchr.S	4 May 2015 15:38:55 -0000
@@ -13,7 +13,7 @@
 	movzbq	%sil,%rcx
 
 	/* zero return value */
-	xorq	%rax,%rax
+	xorl	%eax,%eax
 
 	/*
 	 * Align to word boundary.
Index: lib/libc/arch/x86_64/gen/__setjmp14.S
===================================================================
RCS file: /cvsroot/src/lib/libc/arch/x86_64/gen/__setjmp14.S,v
retrieving revision 1.3
diff -u -r1.3 __setjmp14.S
--- lib/libc/arch/x86_64/gen/__setjmp14.S	22 May 2014 15:01:56 -0000	1.3
+++ lib/libc/arch/x86_64/gen/__setjmp14.S	4 May 2015 15:39:37 -0000
@@ -66,7 +66,7 @@
 
 	leaq	(_JB_SIGMASK * 8)(%rdi),%rdx
 	xorl	%edi,%edi
-	xorq	%rsi,%rsi
+	xorl	%esi,%esi
 
 #ifdef __PIC__
 	call	PIC_PLT(_C_LABEL(__sigprocmask14))
@@ -83,7 +83,7 @@
 
 	leaq	(_JB_SIGMASK * 8)(%rdi),%rsi
 	movl	$3,%edi		/* SIG_SETMASK */
-	xorq	%rdx,%rdx
+	xorl	%edx,%edx
 
 	pushq	%r8
 #ifdef __PIC__
Index: lib/libc/arch/x86_64/gen/__sigsetjmp14.S
===================================================================
RCS file: /cvsroot/src/lib/libc/arch/x86_64/gen/__sigsetjmp14.S,v
retrieving revision 1.3
diff -u -r1.3 __sigsetjmp14.S
--- lib/libc/arch/x86_64/gen/__sigsetjmp14.S	22 May 2014 15:01:56 -0000	1.3
+++ lib/libc/arch/x86_64/gen/__sigsetjmp14.S	4 May 2015 15:39:37 -0000
@@ -70,7 +70,7 @@
 
 	leaq	(_JB_SIGMASK * 8)(%rdi),%rdx
 	xorl	%edi,%edi
-	xorq	%rsi,%rsi
+	xorl	%esi,%esi
 
 #ifdef __PIC__
 	call	PIC_PLT(_C_LABEL(__sigprocmask14))
@@ -89,7 +89,7 @@
 	jz	2f
 	leaq	(_JB_SIGMASK * 8)(%rdi),%rsi
 	movl	$3,%edi		/* SIG_SETMASK */
-	xorq	%rdx,%rdx
+	xorl	%edx,%edx
 
 #ifdef __PIC__
 	call	PIC_PLT(_C_LABEL(__sigprocmask14))
Index: sys/arch/amd64/acpi/acpi_wakeup_low.S
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/acpi/acpi_wakeup_low.S,v
retrieving revision 1.4
diff -u -r1.4 acpi_wakeup_low.S
--- sys/arch/amd64/acpi/acpi_wakeup_low.S	11 May 2008 15:32:20 -0000	1.4
+++ sys/arch/amd64/acpi/acpi_wakeup_low.S	4 May 2015 15:39:39 -0000
@@ -99,7 +99,7 @@
 	movq	ACPI_SUSPEND_REG+(5*8)(%r8),%r14
 	movq	ACPI_SUSPEND_REG+(6*8)(%r8),%r15
 
-	xorq	%rax,%rax
+	xorl	%eax,%eax
 
 	pushq	ACPI_SUSPEND_REG+(7*8)(%r8)
 	popfq
Index: sys/arch/amd64/amd64/amd64_trap.S
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/amd64/amd64_trap.S,v
retrieving revision 1.2
diff -u -r1.2 amd64_trap.S
--- sys/arch/amd64/amd64/amd64_trap.S	12 Feb 2014 19:53:49 -0000	1.2
+++ sys/arch/amd64/amd64/amd64_trap.S	4 May 2015 15:39:39 -0000
@@ -415,7 +415,7 @@
 	movabsq	$4f,%rdi
 	movl	CPUVAR(ILEVEL),%esi
 	movl	%ebx,%edx
-	xorq	%rax,%rax
+	xorl	%eax,%eax
 	call	_C_LABEL(printf)
 	movl	%ebx,%edi
 	call	_C_LABEL(spllower)
Index: sys/arch/amd64/amd64/copy.S
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/amd64/copy.S,v
retrieving revision 1.18
diff -u -r1.18 copy.S
--- sys/arch/amd64/amd64/copy.S	7 Jul 2010 01:13:29 -0000	1.18
+++ sys/arch/amd64/amd64/copy.S	4 May 2015 15:39:39 -0000
@@ -93,7 +93,7 @@
 	jnz	2f
 	cmpl	$0, L_DOPREEMPT(%rbx)
 	jz	2f
-	xorq	%rdi, %rdi
+	xorl	%edi, %edi
 	call	_C_LABEL(kpreempt)
 2:
 	cmpl	$0, CPUVAR(WANT_PMAPLOAD)
@@ -143,7 +143,7 @@
 	rep
 	movsb
 
-	xorq	%rax,%rax
+	xorl	%eax,%eax
 	ret
 
 # Using 'rep movs' to copy backwards is not as fast as for forwards copies
@@ -167,7 +167,7 @@
 	movsq
 	cld
 .Lkcopy_end:
-	xorq	%rax,%rax
+	xorl	%eax,%eax
 	ret
 
 ENTRY(copyout)
@@ -266,7 +266,7 @@
 .Lcopyoutstr_end:
 	/* Success -- 0 byte reached. */
 	decq	%rdx
-	xorq	%rax,%rax
+	xorl	%eax,%eax
 	jmp	copystr_return
 
 2:	/* rdx is zero -- return EFAULT or ENAMETOOLONG. */
@@ -306,7 +306,7 @@
 
 	/* Success -- 0 byte reached. */
 	decq	%rdx
-	xorq	%rax,%rax
+	xorl	%eax,%eax
 	jmp	copystr_return
 
 2:	/* edx is zero -- return EFAULT or ENAMETOOLONG. */
@@ -422,7 +422,7 @@
 	movq	%r11,PCB_ONFAULT(%rcx)
 
 	movq	%rsi,(%rdi)
-	xorq	%rax,%rax
+	xorl	%eax,%eax
 	movq	%rax,PCB_ONFAULT(%rcx)
 	ret
 	DEFERRED_SWITCH_CALL
@@ -438,7 +438,7 @@
 	movq	%r11,PCB_ONFAULT(%rcx)
 
 	movw	%si,(%rdi)
-	xorq	%rax,%rax
+	xorl	%eax,%eax
 	movq	%rax,PCB_ONFAULT(%rcx)
 	ret
 	DEFERRED_SWITCH_CALL
@@ -453,7 +453,7 @@
 	leaq	_C_LABEL(fusuintrfailure)(%rip),%r11
 	movq	%r11,PCB_ONFAULT(%rcx)
 	movw	%si,(%rdi)
-	xorq	%rax,%rax
+	xorl	%eax,%eax
 	movq	%rax,PCB_ONFAULT(%rcx)
 	ret
 
@@ -468,7 +468,7 @@
 	movq	%r11,PCB_ONFAULT(%rcx)
 
 	movb	%sil,(%rdi)
-	xorq	%rax,%rax
+	xorl	%eax,%eax
 	movq	%rax,PCB_ONFAULT(%rcx)
 	ret
 	DEFERRED_SWITCH_CALL
@@ -513,7 +513,7 @@
 	 * Set the return values.
 	 */
 	movq	%rax, (%rcx)
-	xorq	%rax, %rax
+	xorl	%eax, %eax
 	ret
 	DEFERRED_SWITCH_CALL
 
@@ -537,7 +537,7 @@
 	 * Set the return values.
 	 */
 	movl	%eax, (%rcx)
-	xorq	%rax, %rax
+	xorl	%eax, %eax
 	ret
 	DEFERRED_SWITCH_CALL
 
Index: sys/arch/amd64/amd64/cpu_in_cksum.S
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/amd64/cpu_in_cksum.S,v
retrieving revision 1.2
diff -u -r1.2 cpu_in_cksum.S
--- sys/arch/amd64/amd64/cpu_in_cksum.S	22 Jun 2013 05:56:32 -0000	1.2
+++ sys/arch/amd64/amd64/cpu_in_cksum.S	4 May 2015 15:39:39 -0000
@@ -100,8 +100,8 @@
 	cmovb	%esi, %ebp
 	subl	%ebp, %esi
 
-	xorq	%r9, %r9
-	xorq	%r10, %r10
+	xorl	%r9d, %r9d
+	xorl	%r10d, %r10d
 
 .Mmbuf_align_word:
 	/* Already aligned on a word boundary? */
@@ -179,7 +179,7 @@
 	addq	%r9, %r10
 	movq	%r10, %rax
 	shrq	$62, %rax
-	xorq	%r9, %r9
+	xorl	%r9d, %r9d
 	testb	%al, %al
 	jz	.Mmbuf_inner_loop
 
@@ -189,7 +189,7 @@
 1:
 	addq	%r10, %r8
 	adcq	$0, %r8
-	xorq	%r10, %r10
+	xorl	%r10d, %r10d
 
 	jmp	.Mmbuf_inner_loop
 
Index: sys/arch/amd64/amd64/cpufunc.S
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/amd64/cpufunc.S,v
retrieving revision 1.25
diff -u -r1.25 cpufunc.S
--- sys/arch/amd64/amd64/cpufunc.S	12 Feb 2014 23:24:09 -0000	1.25
+++ sys/arch/amd64/amd64/cpufunc.S	4 May 2015 15:39:39 -0000
@@ -191,7 +191,7 @@
 
 ENTRY(rdmsr)
 	movq	%rdi, %rcx
-	xorq	%rax, %rax
+	xorl	%eax, %eax
 	rdmsr
 	shlq	$32, %rdx
 	orq	%rdx, %rax
@@ -207,7 +207,7 @@
 
 ENTRY(rdmsr_locked)
 	movq	%rdi, %rcx
-	xorq	%rax, %rax
+	xorl	%eax, %eax
 	movl	$OPTERON_MSR_PASSCODE, %edi
 	rdmsr
 	shlq	$32, %rdx
@@ -239,7 +239,7 @@
 	movl	%eax, %eax	/* zero-extend %eax -> %rax */
 	orq	%rdx, %rax
 	movq	%rax, (%rsi)  /* *data */
-	xorq	%rax, %rax    /* "no error" */
+	xorl	%eax, %eax    /* "no error" */
 
 	movq	%rax, PCB_ONFAULT(%r8)
 	ret
@@ -276,7 +276,7 @@
 #endif
 
 ENTRY(cpu_counter)
-	xorq	%rax, %rax
+	xorl	%eax, %eax
 	rdtsc
 	shlq	$32, %rdx
 	orq	%rdx, %rax
@@ -290,7 +290,7 @@
 
 ENTRY(rdpmc)
 	movq	%rdi, %rcx
-	xorq	%rax, %rax
+	xorl	%eax, %eax
 	rdpmc
 	shlq	$32, %rdx
 	orq	%rdx, %rax
@@ -518,7 +518,7 @@
 
 ENTRY(inb)
 	movq	%rdi, %rdx
-	xorq	%rax, %rax
+	xorl	%eax, %eax
 	inb	%dx, %al
 	ret
 
@@ -532,7 +532,7 @@
 
 ENTRY(inw)
 	movq	%rdi, %rdx
-	xorq	%rax, %rax
+	xorl	%eax, %eax
 	inw	%dx, %ax
 	ret
 
@@ -546,7 +546,7 @@
 
 ENTRY(inl)
 	movq	%rdi, %rdx
-	xorq	%rax, %rax
+	xorl	%eax, %eax
 	inl	%dx, %eax
 	ret
 
Index: sys/arch/amd64/amd64/lock_stubs.S
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/amd64/lock_stubs.S,v
retrieving revision 1.25
diff -u -r1.25 lock_stubs.S
--- sys/arch/amd64/amd64/lock_stubs.S	22 Jun 2013 06:23:28 -0000	1.25
+++ sys/arch/amd64/amd64/lock_stubs.S	4 May 2015 15:39:39 -0000
@@ -61,7 +61,7 @@
 
 ENTRY(mutex_enter)
 	movq	CPUVAR(CURLWP), %rcx
-	xorq	%rax, %rax
+	xorl	%eax, %eax
 	LOCK(1)
 	cmpxchgq %rcx, (%rdi)
 	jnz	1f
@@ -81,7 +81,7 @@
  */
 ENTRY(mutex_exit)
 	movq	CPUVAR(CURLWP), %rax
-	xorq	%rdx, %rdx
+	xorl	%edx, %edx
 	cmpxchgq %rdx, (%rdi)
 	jnz	1f
 	ret
@@ -200,7 +200,7 @@
 	 * Writer: if the compare-and-set fails, don't bother retrying.
 	 */
 2:	movq	CPUVAR(CURLWP), %rcx
-	xorq	%rax, %rax
+	xorl	%eax, %eax
 	orq	$RW_WRITE_LOCKED, %rcx
 	LOCK(3)
 	cmpxchgq %rcx, (%rdi)
@@ -278,7 +278,7 @@
 	 * Writer: if the compare-and-set fails, don't bother retrying.
 	 */
 2:	movq	CPUVAR(CURLWP), %rcx
-	xorq	%rax, %rax
+	xorl	%eax, %eax
 	orq	$RW_WRITE_LOCKED, %rcx
 	LOCK(9)
 	cmpxchgq %rcx, (%rdi)
Index: sys/arch/amd64/amd64/locore.S
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/amd64/locore.S,v
retrieving revision 1.77
diff -u -r1.77 locore.S
--- sys/arch/amd64/amd64/locore.S	17 Aug 2014 21:17:43 -0000	1.77
+++ sys/arch/amd64/amd64/locore.S	4 May 2015 15:39:39 -0000
@@ -762,7 +762,7 @@
 	movq	%rax,(_C_LABEL(lwp0)+L_PCB)(%rip) /* XXX L_PCB != uarea */
 	leaq	(USPACE-FRAMESIZE)(%rax),%rsp
 	movq	%rsi,PCB_CR3(%rax)	# pcb->pcb_cr3
-	xorq	%rbp,%rbp		# mark end of frames
+	xorl	%ebp,%ebp		# mark end of frames
 
 	xorw	%ax,%ax
 	movw	%ax,%gs
@@ -787,7 +787,7 @@
 	movq	%rsi, %rbx
 
 	/* Clear BSS. */
-	xorq	%rax,%rax
+	xorl	%eax,%eax
 	movq	$_C_LABEL(__bss_start),%rdi
 	movq	$_C_LABEL(_end),%rcx
 	subq	%rdi,%rcx
@@ -849,7 +849,7 @@
 	 */
 
 	leaq	(USPACE-FRAMESIZE)(%rsi),%rsp
-	xorq	%rbp,%rbp
+	xorl	%ebp,%ebp
 
 	xorw	%ax,%ax
 	movw	%ax,%gs
@@ -1043,7 +1043,7 @@
 	jne	32f
 
 	/* Zero out %fs/%gs registers and GDT descriptors. */
-	xorq	%rax, %rax
+	xorl	%eax, %eax
 	movw	%ax, %fs
 	CLI(cx)
 	SWAPGS
@@ -1245,7 +1245,7 @@
 	movl	TF_RDI(%rsp),%edx
 	movl	%ebx,%ecx
 	movl	CPUVAR(ILEVEL),%r8d
-	xorq	%rax,%rax
+	xorl	%eax,%eax
 	call	_C_LABEL(printf)
 	movl	$IPL_NONE,%edi
 	call	_C_LABEL(spllower)
@@ -1279,7 +1279,7 @@
 	movq	%rbp,%rsi
 	movq	%rbp,%r14	/* for .Lsyscall_checkast */
 	movq	%rax,%rdi
-	xorq	%rbp,%rbp
+	xorl	%ebp,%ebp
 	call	_C_LABEL(lwp_startup)
 	movq	%r13,%rdi
 	call	*%r12
@@ -1337,7 +1337,7 @@
 	pushq	%rbp
 	movq	%rsp,%rbp
 	movl	$(PAGE_SIZE/64), %ecx
-	xorq	%rax, %rax
+	xorl	%eax, %eax
 	.align	16
 1:
 	testl	$RESCHED_KPREEMPT, CPUVAR(RESCHED)
@@ -1372,7 +1372,7 @@
 ENTRY(pagezero)
 	movq	$-PAGE_SIZE,%rdx
 	subq	%rdx,%rdi
-	xorq	%rax,%rax
+	xorl	%eax,%eax
 1:
 	movnti	%rax,(%rdi,%rdx)
 	movnti	%rax,8(%rdi,%rdx)
Index: sys/arch/amd64/amd64/spl.S
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/amd64/spl.S,v
retrieving revision 1.29
diff -u -r1.29 spl.S
--- sys/arch/amd64/amd64/spl.S	27 Nov 2014 04:48:39 -0000	1.29
+++ sys/arch/amd64/amd64/spl.S	4 May 2015 15:39:39 -0000
@@ -152,7 +152,7 @@
 IDTVEC(preemptrecurse)
 	movl	$IPL_PREEMPT, CPUVAR(ILEVEL)
 	sti
-	xorq	%rdi, %rdi
+	xorl	%edi, %edi
 	call	_C_LABEL(kpreempt)
 	cli
 	jmp	*%r13			/* back to Xspllower */
Index: sys/arch/amd64/amd64/vector.S
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/amd64/vector.S,v
retrieving revision 1.44
diff -u -r1.44 vector.S
--- sys/arch/amd64/amd64/vector.S	25 Jun 2013 00:27:22 -0000	1.44
+++ sys/arch/amd64/amd64/vector.S	4 May 2015 15:39:39 -0000
@@ -193,7 +193,7 @@
 	sti
 	pushq	%rbx
 	movq	%rsp,%rsi
-	xorq	%rdi,%rdi
+	xorl	%edi,%edi
 	call	_C_LABEL(lapic_clockintr)
 	jmp	_C_LABEL(Xdoreti)
 2:
Index: sys/lib/libkern/arch/x86_64/scanc.S
===================================================================
RCS file: /cvsroot/src/sys/lib/libkern/arch/x86_64/scanc.S,v
retrieving revision 1.2
diff -u -r1.2 scanc.S
--- sys/lib/libkern/arch/x86_64/scanc.S	28 Apr 2008 20:24:06 -0000	1.2
+++ sys/lib/libkern/arch/x86_64/scanc.S	4 May 2015 15:39:43 -0000
@@ -42,7 +42,7 @@
 	testl	%ecx,%ecx
 	jz	2f
 	movq	%r11,%rdi
-	xorq	%rax,%rax
+	xorl	%eax,%eax
 	cld
 1:
 	lodsb

