Subject: Re: Performance of various memcpy()'s
To: Frank van der Linden <fvdl@wasabisystems.com>
From: Bang Jun-Young <junyoung@mogua.com>
List: tech-perform
Date: 10/23/2002 02:08:24
On Wed, Oct 16, 2002 at 12:04:58AM +0200, Frank van der Linden wrote:
> On Wed, Oct 16, 2002 at 04:18:30AM +0900, Bang Jun-Young wrote:
> > Another attached patch is an i686 version of copyin(9) that makes use
> > of MMX insns. It works well with integer-only programs, but not with
> > ones like XFree86 that use FP ops. In this case, it would be
> > helpful if the NPX handling code were imported from FreeBSD (they have
> > an i586-optimized version of copyin/out(9)). Can anybody give me some
> > comments on this?
> 
> Yup, there's a lot to be had by using SSE(2) instructions, copying
> in 128bit quantities is quite a useful thing to do. It's been
> on my todo list for a while.
> 
> I've been playing with a few SSE memcpy functions myself, but
> did not get around to adding the extra checks to the FP
> save/restore code yet. There are some checks that need to
> be done. It comes down to:
> 
> 	* Don't mess up the current process' FP state, so save it if necessary. 
> 	* Don't bother if there's not enough bytes to copy, since you're
> 	  paying the price of an entire FP save if someone was using the FPU.
> 	* If you're going all the way, and are using memcpy with SSE in
> 	  the kernel too, be careful about interrupts. If you come in
> 	  during the FP save path, it will mess up things. And maybe
> 	  you don't want to use FP in an interrupt at all, it'll
> 	  cause a ton of fp save/restore actions.
> 
> It's not overly complicated to do, but it's important to take all
> scenarios into account. copyin/out is the simplest case, since
> you should be in a process context when doing those.
> 
> I'll probably have some time to spend on this soon (next month).
> If you're going to work on it before then, please let me review
> the changes.

Here is a new version of i686_copyin(). By saving the FPU state on the
stack, I could make it work with programs that use FP operations,
including XFree86, xmms, mozilla, etc.

In this version, the minimum length for which the MMX copy loop is used
is set to 512 bytes. Since I don't know of a kernel profiling tool or a
method to measure copyin performance at kernel level, that number may
be too small or too large.

Possible todo:
 - i686_copyout(), i686_kcopy(), i686_memcpy(), ...
 - use prefetch and movntq instructions for PIII/4 or Athlon.
 - use npxproc to eliminate the overhead of saving the FPU state, as
   FreeBSD does.

Index: locore.s
===================================================================
RCS file: /cvsroot/syssrc/sys/arch/i386/i386/locore.s,v
retrieving revision 1.265
diff -u -r1.265 locore.s
--- locore.s	2002/10/05 21:20:00	1.265
+++ locore.s	2002/10/22 16:42:17
@@ -951,7 +951,7 @@
 #define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)	/* XXX */
 #elif defined(I686_CPU)
 #define	DEFAULT_COPYOUT		_C_LABEL(i486_copyout)	/* XXX */
-#define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)	/* XXX */
+#define	DEFAULT_COPYIN		_C_LABEL(i686_copyin)	/* XXX */
 #endif
 
 	.data
@@ -1159,6 +1159,165 @@
 	xorl	%eax,%eax
 	ret
 #endif /* I386_CPU || I486_CPU || I586_CPU || I686_CPU */
+
+#if defined(I686_CPU)
+/* Size in bytes of the image fnsave writes and frstor reads here. */
+#define	I686_FSAVE_SIZE	108
+/* Copies shorter than this many bytes skip the MMX loop (untuned). */
+#define	I686_MMX_MINLEN	512
+
+/*
+ * Copy len bytes from user address uaddr to kernel address kaddr.
+ * Returns 0 on success, or EFAULT when uaddr + len wraps around or
+ * ends above VM_MAXUSER_ADDRESS.  A fault taken during the copy resumes
+ * at the handler stored in PCB_ONFAULT, which returns with %eax as the
+ * trap code left it (presumably an error number -- confirm in trap()).
+ *
+ * Registers: %esi = source, %edi = destination, %eax = length,
+ * %edx = number of 64-byte chunks, %ebx = chunks copied so far,
+ * %cx = machine status word saved around the MMX loop.
+ *
+ * NOTE(review): a page fault inside the MMX loop may sleep; whether the
+ * MMX registers and CR0.TS survive a context switch there has not been
+ * checked -- confirm against the npx code before relying on this.
+ */
+/* LINTSTUB: Func: int i686_copyin(const void *uaddr, void *kaddr, size_t len) */
+ENTRY(i686_copyin)
+	pushl	%esi
+	pushl	%edi
+	pushl	%ebx
+	GET_CURPCB(%eax)
+	movl	$_C_LABEL(i686_copy_fault),PCB_ONFAULT(%eax)
+
+	/* Three registers are pushed, so the arguments start at 16(%esp). */
+	movl	16(%esp),%esi
+	movl	20(%esp),%edi
+	movl	24(%esp),%eax
+
+	/*
+	 * We check that the end of the source buffer is not past the end
+	 * of the user's address space.  If it's not, then we only need to
+	 * check that each page is readable, and the CPU will do that for us.
+	 */
+	movl	%esi,%edx
+	addl	%eax,%edx
+	jc	_C_LABEL(i686_copy_efault)
+	cmpl	$VM_MAXUSER_ADDRESS,%edx
+	ja	_C_LABEL(i686_copy_efault)
+
+	cmpl	$I686_MMX_MINLEN,%eax
+	jb	2f
+
+	movl	%eax,%edx
+	shrl	$6,%edx
+
+	/*
+	 * Save FPU state in stack.  %cx keeps the old machine status word
+	 * so that it can be reloaded once the loop is done.
+	 */
+	smsw	%cx
+	clts
+	subl	$I686_FSAVE_SIZE,%esp
+	fnsave	0(%esp)
+
+	/*
+	 * While the save area sits on the stack a fault has to unwind it
+	 * before the registers are popped, so switch fault handlers.
+	 */
+	GET_CURPCB(%ebx)
+	movl	$_C_LABEL(i686_copy_mmx_fault),PCB_ONFAULT(%ebx)
+	xorl	%ebx,%ebx
+
+1:
+	movq	(%esi),%mm0
+	movq	8(%esi),%mm1
+	movq	16(%esi),%mm2
+	movq	24(%esi),%mm3
+	movq	32(%esi),%mm4
+	movq	40(%esi),%mm5
+	movq	48(%esi),%mm6
+	movq	56(%esi),%mm7
+	movq	%mm0,(%edi)
+	movq	%mm1,8(%edi)
+	movq	%mm2,16(%edi)
+	movq	%mm3,24(%edi)
+	movq	%mm4,32(%edi)
+	movq	%mm5,40(%edi)
+	movq	%mm6,48(%edi)
+	movq	%mm7,56(%edi)
+
+	addl	$64,%esi
+	addl	$64,%edi
+	incl	%ebx
+	cmpl	%edx,%ebx
+	jb	1b
+
+	/*
+	 * Restore FPU state, then return to the plain fault handler now
+	 * that the save area is off the stack.
+	 */
+	frstor	0(%esp)
+	addl	$I686_FSAVE_SIZE,%esp
+	lmsw	%cx
+	GET_CURPCB(%ebx)
+	movl	$_C_LABEL(i686_copy_fault),PCB_ONFAULT(%ebx)
+
+	/* Bytes left over after the 64-byte chunks. */
+	andl	$63,%eax
+	je	3f
+
+2:
+	/* bcopy(%esi, %edi, %eax); */
+	cld
+	movl	%eax,%ecx
+	shrl	$2,%ecx
+	rep
+	movsl
+	movb	%al,%cl
+	andb	$3,%cl
+	rep
+	movsb
+
+3:
+	/* Success: return 0 and clear the fault handler. */
+	GET_CURPCB(%edx)
+	xorl	%eax,%eax
+	popl	%ebx
+	popl	%edi
+	popl	%esi
+	movl	%eax,PCB_ONFAULT(%edx)
+	ret
+
+/*
+ * Fault handler for the MMX loop: put the FPU state and machine status
+ * word back and drop the save area, then unwind like i686_copy_fault.
+ * Relies on %esp and %cx still holding their values from the faulting
+ * instruction, as i686_copy_fault already relies on %esp.
+ */
+/* LINTSTUB: Ignore */
+NENTRY(i686_copy_mmx_fault)
+	frstor	0(%esp)
+	addl	$I686_FSAVE_SIZE,%esp
+	lmsw	%cx
+	jmp	_C_LABEL(i686_copy_fault)
+
+/* LINTSTUB: Ignore */
+NENTRY(i686_copy_efault)
+	movl	$EFAULT,%eax
+
+/*
+ * Common unwind: clear the fault handler (with zero, not with the
+ * error code in %eax), pop the saved registers and return %eax.
+ */
+/* LINTSTUB: Ignore */
+NENTRY(i686_copy_fault)
+	GET_CURPCB(%edx)
+	movl	$0,PCB_ONFAULT(%edx)
+	popl	%ebx
+	popl	%edi
+	popl	%esi
+	ret
+#endif /* I686_CPU */
 
 /* LINTSTUB: Ignore */
 NENTRY(copy_efault)

Jun-Young

-- 
Bang Jun-Young <junyoung@mogua.com>