Subject: Re: speeding up bzero
From: David Laight <david@l8s.co.uk>
List: port-i386
Date: 04/12/2003 14:56:10
On Fri, Apr 11, 2003 at 03:56:07PM -0700, Jason Thorpe wrote:
> 
> On Friday, April 11, 2003, at 01:43  PM, David Laight wrote:
> 
> > My Athlon 700 gains about 1.5% on 8k aligned calls.
> > For 20 byte aligned transfers the gain is 38%
> > For 20 byte misaligned transfers the gain is 29%
> 
> Can you make the same tweaks to memset()?  That is what we prefer to 
> use these days, after all :-)

I have a nice merged memset/bcopy...

I've also redone memcpy/memmove/bcopy, giving a 30% improvement for
small copies.
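
Something like the following is what I mean by "small copies" (a
rough sketch only; the sizes, iteration count and clock_gettime()
timing are for illustration, not the actual test behind the 30%
figure):

#include <stdio.h>
#include <string.h>
#include <time.h>

/* Call through a volatile function pointer so the compiler cannot
 * inline the copy and optimise the loop away. */
static void *(*volatile copyfn)(void *, const void *, size_t) = memcpy;

int
main(void)
{
	static char src[64], dst[64];
	struct timespec t0, t1;
	size_t len;
	int i;

	for (len = 4; len <= 24; len += 4) {
		clock_gettime(CLOCK_MONOTONIC, &t0);
		for (i = 0; i < 1000000; i++)
			copyfn(dst, src, len);
		clock_gettime(CLOCK_MONOTONIC, &t1);
		printf("len %2lu: %ld ns\n", (unsigned long)len,
		    (long)((t1.tv_sec - t0.tv_sec) * 1000000000L +
		    (t1.tv_nsec - t0.tv_nsec)));
	}
	return 0;
}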

Note that this is still a 'rep movsl'-based routine.
Some experiments done last year showed that significant improvements
were possible by using SSE2 instructions and/or avoiding movsl.
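
For illustration, a userland sketch of the SSE2 idea using the gcc
intrinsics might look like the code below (my assumptions: 16-byte
aligned buffers, length a multiple of 64, and it ignores the FPU/SSE
state saving a kernel routine would need; build with -msse2):

#include <emmintrin.h>
#include <stddef.h>

static void
copy_sse2(void *dst, const void *src, size_t len)
{
	const __m128i *s = (const __m128i *)src;
	__m128i *d = (__m128i *)dst;

	/* Move 64 bytes per iteration through the XMM registers
	 * instead of relying on rep movsl. */
	for (; len >= 64; len -= 64, s += 4, d += 4) {
		__m128i x0 = _mm_load_si128(s + 0);
		__m128i x1 = _mm_load_si128(s + 1);
		__m128i x2 = _mm_load_si128(s + 2);
		__m128i x3 = _mm_load_si128(s + 3);
		_mm_store_si128(d + 0, x0);
		_mm_store_si128(d + 1, x1);
		_mm_store_si128(d + 2, x2);
		_mm_store_si128(d + 3, x3);
	}
}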

As a slight aside, what is the current ruling on whether the kernel
bcopy() has to behave like memcpy() or like memmove() with respect to
overlapping copies?
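
To make the overlap question concrete, a throwaway userland example
(mine, not from the tree): memmove() must behave as if the data went
through a temporary buffer, while memcpy() makes no such promise for
overlapping arguments:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char buf[] = "abcdef";

	/* Shift five bytes up by one within the same buffer.
	 * memmove() is required to produce "aabcde"; memcpy() with
	 * the same overlapping arguments is undefined, which is why
	 * it matters which semantics kernel bcopy() promises. */
	memmove(buf + 1, buf, 5);
	printf("%s\n", buf);
	return 0;
}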

Also, is there a mechanism to automatically update sys/lib/libkern/arch/*
from lib/libc/*/arch/*?

Or could the kernel use a 'reachover' makefile?

	David

/*	$NetBSD: memcpy.S,v 1.2 1998/01/09 03:45:07 perry Exp $	*/

/*-
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 * ...
 */

#include <machine/asm.h>

#if defined(LIBC_SCCS)
	RCSID("$NetBSD: bcopy.S,v 1.11 2002/10/29 07:01:44 junyoung Exp $")
#endif

	/*
	 * (ov)bcopy (src,dst,cnt)
	 *  ws@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
	 */

#ifdef BCOPY
ENTRY(bcopy)
#else
#ifdef MEMMOVE
ENTRY(memmove)
#else
#define MEMCPY
#define NO_OVERLAP
ENTRY(memcpy)
#endif
#endif
	push	%esi
	mov	%edi,%edx	/* stash caller's %edi in a scratch reg */
#if defined(MEMCPY) || defined(MEMMOVE)
	movl	8(%esp),%edi	/* memcpy/memmove(dst, src, len) */
	movl	12(%esp),%esi
#else
	movl	8(%esp),%esi	/* bcopy(src, dst, len) */
	movl	12(%esp),%edi
#endif
	movl	16(%esp),%ecx
#if defined(NO_OVERLAP)
	movl	%ecx,%eax
#else
	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax	/* overlapping? */
	movl	%ecx,%eax
	jb	backwards
#endif
	cld			/* nope, copy forwards. */
	shrl	$2,%ecx		/* copy by words */
	rep
	movsl
	and	$3,%eax		/* any bytes left? */
	jnz	trailing
done:
#if defined(MEMCPY) || defined(MEMMOVE)
	movl	8(%esp),%eax	/* return the original dst */
#endif
	mov	%edx,%edi	/* restore caller's %edi */
	pop	%esi
	ret

trailing:
	cmp	$2,%eax		/* 1, 2 or 3 bytes remain */
	jb	1f
	movw	(%esi),%ax	/* copy two bytes */
	movw	%ax,(%edi)
	je	done
	movb	2(%esi),%al	/* and the third */
	movb	%al,2(%edi)
	jmp	done
1:	movb	(%esi),%al	/* copy the single byte */
	movb	%al,(%edi)
	jmp	done

#if !defined(NO_OVERLAP)
backwards:
	addl	%ecx,%edi	/* copy backwards. */
	addl	%ecx,%esi
	and	$3,%eax		/* any fractional bytes? */
	jnz	back_align
back_aligned:
	shrl	$2,%ecx
	subl	$4,%esi
	subl	$4,%edi
	std
	rep
	movsl
	cld
	jmp	done

back_align:
	sub	%eax,%esi	/* copy the 1-3 byte tail first */
	sub	%eax,%edi
	cmp	$2,%eax
	jb	1f
	je	2f
	movb	2(%esi),%al
	movb	%al,2(%edi)
2:	movw	(%esi),%ax
	movw	%ax,(%edi)
	jmp	back_aligned
1:	movb	(%esi),%al
	movb	%al,(%edi)
	jmp	back_aligned
#endif

-- 
David Laight: david@l8s.co.uk