Subject: Re: Performance of various memcpy()'s
To: <>
From: David Laight <david@l8s.co.uk>
List: tech-perform
Date: 10/16/2002 12:58:52
On Wed, Oct 16, 2002 at 04:18:30AM +0900, Bang Jun-Young wrote:
> Hi,
> 
> About 14 months ago, I had some discussion on memcpy performance on
> i386 platform here. Months later, I took a look into it again, and
> now am coming with (not-so-)new benchmark results (attached). The
> tests were performed on Athlon XP 1800 and DDR 256MB. 
> 
> >>From the results, it's obvious that memcpy() using MMX insns is the
> best for in-cache sized data, typically 50-100% faster than plain old
> memcpy for data <= 32 KB.

I've done some experiments on my slot-A athlon 700.

The libc memcpy is slow (on modern cpus) because of the setup cost of
executing 'rep movs' instructions.  The one used to
copy the remaining (0-3) bytes is particularly expensive.
'rep movsl' only starts to win for copies over (about) 200 bytes.
(when the mmx copy is still 50% faster).

For small blocks (probably the commonest?) I get:

addr1=0x804c000 addr2=0x804c080
memcpy 64B -- 16777216 loops
  aligned blocks
      libc memcpy                                        1.721654 s
      rep movsw                                          1.310823 s
      asm loop                                           1.000972 s
      MMX memcpy using MOVQ                              0.762467 s
      arjanv's MOVQ (with prefetch)                      0.905702 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        1.559139 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  1.556865 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        1.715516 s
      rep movsw                                          1.310894 s
      asm loop                                           1.000683 s
      MMX memcpy using MOVQ                              0.881484 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        1.996214 s
      rep movsw                                          1.619813 s
      asm loop                                           1.190194 s
      MMX memcpy using MOVQ                              1.024688 s

where the 'rep movsl' and 'asm loop' are:


#include <machine/asm.h>

/*
 * void *memcpy_rep_movsl(void *dst, const void *src, size_t len)
 *
 * i386 cdecl: args on the stack, returns dst in %eax.
 * Copies len/4 dwords with 'rep movsl', then the remaining 0-3
 * bytes with 'rep movsb'.  Callee-saved %esi/%edi are preserved.
 * Non-overlapping buffers only (forward copy).
 */
ENTRY(memcpy_rep_movsl)
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%ecx		/* ecx = len (args moved by 2 pushes) */
	movl	12(%esp),%edi		/* edi = dst */
	movl	16(%esp),%esi		/* esi = src */
	movl	%edi,%eax		/* return value = dst */
	movl	%ecx,%edx		/* keep full length for the tail */
	cld				/* copy forwards. */
	shrl	$2,%ecx			/* copy by words */
	rep
	movsl
	testl	$3,%edx			/* any trailing bytes? */
	jne	1f
2:	popl	%edi
	popl	%esi
	ret
1:
	movl	%edx,%ecx
	andl	$3,%ecx			/* only the 0-3 leftover bytes, not len */
	rep
	movsb
	jmp	2b

/*
 * void *memcpy_words(void *dst, const void *src, size_t len)
 *
 * i386 cdecl: args on the stack, returns dst in %eax.
 * Unrolled copy: four dword loads then four dword stores per
 * iteration (16 bytes), then the 0-15 trailing bytes via
 * 'rep movsb'.  Callee-saved %esi/%edi/%ebp/%ebx are preserved.
 * Non-overlapping buffers only (forward copy).
 */
ENTRY(memcpy_words)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%edi		/* edi = dst */
	movl	16(%esp),%esi		/* esi = src */
	movl	20(%esp),%ecx		/* ecx = len */
	pushl	%ebp
	pushl	%ebx
	cld				/* forward copy for the tail movsb */
	shrl	$4,%ecx			/* ecx = number of 16-byte chunks */
	jz	2f			/* len < 16: avoid ecx underflow loop */
1:
	movl	0(%esi),%eax
	movl	4(%esi),%edx
	movl	8(%esi),%ebx
	movl	12(%esi),%ebp
	addl	$16,%esi
	subl	$1,%ecx
	movl	%eax,0(%edi)
	movl	%edx,4(%edi)
	movl	%ebx,8(%edi)
	movl	%ebp,12(%edi)
	leal	16(%edi),%edi		/* lea: advance edi without touching flags */
	jne	1b
2:
	movl	28(%esp),%ecx		/* reload len (args moved by 4 pushes) */
	andl	$15,%ecx		/* 0-15 remaining bytes */
	jz	3f
	rep
	movsb
3:
	popl	%ebx
	popl	%ebp
	movl	12(%esp),%eax		/* return dst (2 pushes still active) */
	popl	%edi
	popl	%esi
	ret

	David

-- 
David Laight: david@l8s.co.uk