Subject: Re: Performance of various memcpy()'s
To: None <tech-perform@netbsd.org>
From: Bang Jun-Young <junyoung@mogua.com>
List: tech-perform
Date: 10/28/2002 14:33:03
--Dxnq1zWXvFF0Q93v
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

On Wed, Oct 23, 2002 at 11:54:42PM +0900, Bang Jun-Young wrote:
> In this test, non-temporal movntq instruction was obviously a big win.
> Since it doesn't pollute cache lines, you can get 2x performance for
> copying data not in cache. 

This time I implemented i686_copyin() and i686_copyout() using the
non-temporal MOVNTQ instruction. The benchmark results are as follows:

memcpy 128B -- 8192 loops
  aligned blocks
      libc memcpy                                        2.871323 s
      i686_copyin (MOVQ, FNSAVE/FRSTOR)                  3.784806 s
      i686_copyin3 (MOVNTQ, FNSAVE/FRSTOR)               3.375719 s
      MMX memcpy using MOVQ                              2.746474 s

memcpy 256B -- 4096 loops
  aligned blocks
      libc memcpy                                        2.857081 s
      i686_copyin (MOVQ, FNSAVE/FRSTOR)                  2.859079 s
      i686_copyin3 (MOVNTQ, FNSAVE/FRSTOR)               2.503540 s
      MMX memcpy using MOVQ                              2.692716 s

memcpy 512B -- 2048 loops
  aligned blocks
      libc memcpy                                        2.858101 s
      i686_copyin (MOVQ, FNSAVE/FRSTOR)                  2.741855 s
      i686_copyin3 (MOVNTQ, FNSAVE/FRSTOR)               1.982627 s
      MMX memcpy using MOVQ                              2.653495 s

memcpy 1024B -- 1024 loops
  aligned blocks
      libc memcpy                                        2.859076 s
      i686_copyin (MOVQ, FNSAVE/FRSTOR)                  2.679616 s
      i686_copyin3 (MOVNTQ, FNSAVE/FRSTOR)               1.857517 s
      MMX memcpy using MOVQ                              2.643854 s

I'd appreciate it if someone would apply the patch and perform 
a "real world" benchmark with it.

Jun-Young

-- 
Bang Jun-Young <junyoung@mogua.com>

--Dxnq1zWXvFF0Q93v
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="locore.s.i686.diff"

Index: locore.s
===================================================================
RCS file: /cvsroot/syssrc/sys/arch/i386/i386/locore.s,v
retrieving revision 1.267
diff -u -r1.267 locore.s
--- locore.s	2002/10/23 03:28:34	1.267
+++ locore.s	2002/10/28 05:11:13
@@ -951,8 +951,8 @@
 #define	DEFAULT_COPYOUT		_C_LABEL(i486_copyout)	/* XXX */
 #define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)	/* XXX */
 #elif defined(I686_CPU)
-#define	DEFAULT_COPYOUT		_C_LABEL(i486_copyout)	/* XXX */
-#define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)	/* XXX */
+#define	DEFAULT_COPYOUT		_C_LABEL(i686_copyout)	/* XXX */
+#define	DEFAULT_COPYIN		_C_LABEL(i686_copyin)	/* XXX */
 #endif
 
 	.data
@@ -1108,6 +1108,143 @@
 	ret
 #endif /* I486_CPU || I586_CPU || I686_CPU */
 
+#if defined(I686_CPU)
+/*
+ * int i686_copyout(const void *kaddr, void *uaddr, size_t len);
+ * Copy len bytes from kaddr into the user's address space at uaddr.
+ * Returns 0 on success.  If uaddr + len wraps or ends above
+ * VM_MAXUSER_ADDRESS the result is EFAULT; a fault taken during the
+ * copy returns through i686_copy_fault with whatever the fault path
+ * left in %eax.
+ *
+ * Copies of 512 bytes or more move 64 bytes per iteration through
+ * %mm0-%mm7 using non-temporal MOVNTQ stores; the remainder, and any
+ * shorter copy, is done with rep movsl/movsb.
+ *
+ * NOTE(review): MOVNTQ and SFENCE are not available on every processor
+ * an I686_CPU kernel can boot on (presumably Pentium Pro/Pentium II);
+ * confirm how this routine gets selected before making it the default.
+ * NOTE(review): a page fault inside the MMX loop may sleep while CR0.TS
+ * is clear and %mm0-%mm7 are live; confirm that the FPU context-switch
+ * code copes with that.
+ */
+/* LINTSTUB: Func: int i686_copyout(const void *kaddr, void *uaddr, size_t len) */
+ENTRY(i686_copyout)
+	pushl	%esi
+	pushl	%edi
+	pushl	%ebx
+
+	movl	16(%esp),%esi		/* kaddr */
+	movl	20(%esp),%edi		/* uaddr */
+	movl	24(%esp),%eax		/* len */
+
+	/*
+	 * We check that the end of the destination buffer is not past the end
+	 * of the user's address space.
+	 */
+	movl	%edi,%edx
+	addl	%eax,%edx
+	jc	_C_LABEL(i686_copy_efault)
+	cmpl	$VM_MAXUSER_ADDRESS,%edx
+	ja	_C_LABEL(i686_copy_efault)
+
+	GET_CURPCB(%edx)
+	movl	$_C_LABEL(i686_copy_fault),PCB_ONFAULT(%edx)
+
+	cmpl	$512,%eax		/* short copies skip the MMX path */
+	jb	2f
+
+	movl	%eax,%edx
+	shrl	$6,%edx			/* %edx = number of 64-byte chunks */
+
+	/*
+	 * Save FPU state on the stack.  %cx keeps the old machine status
+	 * word so that it can be reloaded afterwards.
+	 */
+	smsw	%cx
+	clts
+	subl	$108,%esp
+	fnsave	0(%esp)
+
+	/*
+	 * The save area now sits between %esp and the pushed registers, so
+	 * a fault in the loop has to unwind it before the common handler
+	 * pops them.
+	 */
+	GET_CURPCB(%ebx)
+	movl	$_C_LABEL(i686_copyout_mmx_fault),PCB_ONFAULT(%ebx)
+
+1:	movq	(%esi),%mm0
+	movq	8(%esi),%mm1
+	movq	16(%esi),%mm2
+	movq	24(%esi),%mm3
+	movq	32(%esi),%mm4
+	movq	40(%esi),%mm5
+	movq	48(%esi),%mm6
+	movq	56(%esi),%mm7
+	movntq	%mm0,(%edi)
+	movntq	%mm1,8(%edi)
+	movntq	%mm2,16(%edi)
+	movntq	%mm3,24(%edi)
+	movntq	%mm4,32(%edi)
+	movntq	%mm5,40(%edi)
+	movntq	%mm6,48(%edi)
+	movntq	%mm7,56(%edi)
+
+	addl	$64,%esi
+	addl	$64,%edi
+	decl	%edx
+	jnz	1b
+
+	/*
+	 * Make the non-temporal stores globally visible, then restore FPU
+	 * state.  frstor reloads the complete x87/MMX state, so no emms is
+	 * wanted: run after frstor it would mark the restored registers
+	 * empty, and run after lmsw it would trap when TS is set again.
+	 */
+	sfence
+	movl	$_C_LABEL(i686_copy_fault),PCB_ONFAULT(%ebx)
+	frstor	0(%esp)
+	addl	$108,%esp
+	lmsw	%cx
+
+	andl	$63,%eax		/* bytes left over after the chunks */
+	jz	3f
+
+2:	/* plain old bcopy(%esi, %edi, %eax); */
+	cld
+	movl	%eax,%ecx
+	shrl	$2,%ecx
+	rep
+	movsl
+	andl	$3,%eax
+	jz	3f
+	movl	%eax,%ecx
+	rep
+	movsb
+
+3:	GET_CURPCB(%edx)
+	xorl	%eax,%eax
+	popl	%ebx
+	popl	%edi
+	popl	%esi
+	movl	%eax,PCB_ONFAULT(%edx)
+	ret
+
+/*
+ * Fault handler for the MMX loop above: the fnsave area is still on the
+ * stack, so put the FPU state and machine status word back and then
+ * join the common handler.  Assumes the fault path resumes here with
+ * %esp and %ecx as they were at the faulting instruction.
+ */
+/* LINTSTUB: Ignore */
+NENTRY(i686_copyout_mmx_fault)
+	frstor	0(%esp)
+	addl	$108,%esp
+	lmsw	%cx
+	jmp	_C_LABEL(i686_copy_fault)
+#endif /* I686_CPU */
+
 /*
  * int copyin(const void *from, void *to, size_t len);
  * Copy len bytes from the user's address space.
@@ -1160,6 +1255,156 @@
 	xorl	%eax,%eax
 	ret
 #endif /* I386_CPU || I486_CPU || I586_CPU || I686_CPU */
+
+#if defined(I686_CPU)
+/*
+ * int i686_copyin(const void *uaddr, void *kaddr, size_t len);
+ * Copy len bytes from the user's address space at uaddr to kaddr.
+ * Returns 0 on success.  If uaddr + len wraps or ends above
+ * VM_MAXUSER_ADDRESS the result is EFAULT; a fault taken during the
+ * copy returns through i686_copy_fault with whatever the fault path
+ * left in %eax.
+ *
+ * Copies of 512 bytes or more move 64 bytes per iteration through
+ * %mm0-%mm7 using non-temporal MOVNTQ stores; the remainder, and any
+ * shorter copy, is done with rep movsl/movsb.
+ *
+ * NOTE(review): MOVNTQ and SFENCE are not available on every processor
+ * an I686_CPU kernel can boot on (presumably Pentium Pro/Pentium II);
+ * confirm how this routine gets selected before making it the default.
+ * NOTE(review): a page fault inside the MMX loop may sleep while CR0.TS
+ * is clear and %mm0-%mm7 are live; confirm that the FPU context-switch
+ * code copes with that.
+ */
+/* LINTSTUB: Func: int i686_copyin(const void *uaddr, void *kaddr, size_t len) */
+ENTRY(i686_copyin)
+	pushl	%esi
+	pushl	%edi
+	pushl	%ebx
+	GET_CURPCB(%eax)
+	movl	$_C_LABEL(i686_copy_fault),PCB_ONFAULT(%eax)
+
+	movl	16(%esp),%esi		/* uaddr */
+	movl	20(%esp),%edi		/* kaddr */
+	movl	24(%esp),%eax		/* len */
+
+	/*
+	 * We check that the end of the source buffer is not past the end
+	 * of the user's address space.  If it's not, then we only need to
+	 * check that each page is readable, and the CPU will do that for us.
+	 */
+	movl	%esi,%edx
+	addl	%eax,%edx
+	jc	_C_LABEL(i686_copy_efault)
+	cmpl	$VM_MAXUSER_ADDRESS,%edx
+	ja	_C_LABEL(i686_copy_efault)
+
+	cmpl	$512,%eax		/* short copies skip the MMX path */
+	jb	2f
+
+	movl	%eax,%edx
+	shrl	$6,%edx			/* %edx = number of 64-byte chunks */
+
+	/*
+	 * Save FPU state on the stack.  %cx keeps the old machine status
+	 * word so that it can be reloaded afterwards.
+	 */
+	smsw	%cx
+	clts
+	subl	$108,%esp
+	fnsave	0(%esp)
+
+	/*
+	 * The save area now sits between %esp and the pushed registers, so
+	 * a fault in the loop has to unwind it before the common handler
+	 * pops them.
+	 */
+	GET_CURPCB(%ebx)
+	movl	$_C_LABEL(i686_copyin_mmx_fault),PCB_ONFAULT(%ebx)
+
+1:	movq	(%esi),%mm0
+	movq	8(%esi),%mm1
+	movq	16(%esi),%mm2
+	movq	24(%esi),%mm3
+	movq	32(%esi),%mm4
+	movq	40(%esi),%mm5
+	movq	48(%esi),%mm6
+	movq	56(%esi),%mm7
+	movntq	%mm0,(%edi)
+	movntq	%mm1,8(%edi)
+	movntq	%mm2,16(%edi)
+	movntq	%mm3,24(%edi)
+	movntq	%mm4,32(%edi)
+	movntq	%mm5,40(%edi)
+	movntq	%mm6,48(%edi)
+	movntq	%mm7,56(%edi)
+
+	addl	$64,%esi
+	addl	$64,%edi
+	decl	%edx
+	jnz	1b
+
+	/*
+	 * Make the non-temporal stores globally visible, then restore FPU
+	 * state.  frstor reloads the complete x87/MMX state, so no emms is
+	 * wanted: run after frstor it would mark the restored registers
+	 * empty, and run after lmsw it would trap when TS is set again.
+	 */
+	sfence
+	movl	$_C_LABEL(i686_copy_fault),PCB_ONFAULT(%ebx)
+	frstor	0(%esp)
+	addl	$108,%esp
+	lmsw	%cx
+
+	andl	$63,%eax		/* bytes left over after the chunks */
+	jz	3f
+
+2:	/* plain old bcopy(%esi, %edi, %eax); */
+	cld
+	movl	%eax,%ecx
+	shrl	$2,%ecx
+	rep
+	movsl
+	andl	$3,%eax
+	jz	3f
+	movl	%eax,%ecx
+	rep
+	movsb
+
+3:	GET_CURPCB(%edx)
+	xorl	%eax,%eax
+	popl	%ebx
+	popl	%edi
+	popl	%esi
+	movl	%eax,PCB_ONFAULT(%edx)
+	ret
+
+/*
+ * Fault handler for the MMX loop above: the fnsave area is still on the
+ * stack, so put the FPU state and machine status word back and then
+ * join the common handler.  Assumes the fault path resumes here with
+ * %esp and %ecx as they were at the faulting instruction.
+ */
+/* LINTSTUB: Ignore */
+NENTRY(i686_copyin_mmx_fault)
+	frstor	0(%esp)
+	addl	$108,%esp
+	lmsw	%cx
+	jmp	_C_LABEL(i686_copy_fault)
+
+/* LINTSTUB: Ignore */
+NENTRY(i686_copy_efault)
+	movl	$EFAULT,%eax		/* range check failed; FALLTHROUGH */
+
+/* LINTSTUB: Ignore */
+NENTRY(i686_copy_fault)
+	GET_CURPCB(%edx)
+	movl	$0,PCB_ONFAULT(%edx)	/* disarm; %eax holds the error, not 0 */
+	popl	%ebx			/* undo the three pushes made on entry */
+	popl	%edi			/* to i686_copyin/i686_copyout */
+	popl	%esi
+	ret				/* return %eax to the original caller */
+#endif /* I686_CPU */
 
 /* LINTSTUB: Ignore */
 NENTRY(copy_efault)

--Dxnq1zWXvFF0Q93v--