Subject: port-amd64/25411: much faster version of amd64 str* functions
To: None <gnats-bugs@gnats.netbsd.org>
From: J.T. Conklin <jtc@orac.acorntoolworks.com>
List: netbsd-bugs
Date: 04/30/2004 18:41:35
>Number:         25411
>Category:       port-amd64
>Synopsis:       much faster version of amd64 str* functions
>Confidential:   yes
>Severity:       non-critical
>Priority:       low
>Responsible:    port-amd64-maintainer
>State:          open
>Class:          change-request
>Submitter-Id:   net
>Arrival-Date:   Sat May 01 01:42:00 UTC 2004
>Closed-Date:
>Last-Modified:
>Originator:     J.T. Conklin
>Release:        NetBSD 2.0_BETA
>Organization:
J.T. Conklin
>Environment:
	
	
System: NetBSD orac 1.6T NetBSD 1.6T (GENERIC) #0: Fri Jun 6 22:31:11 PDT 2003 jtc@orac:/home/jtc/netbsd/NetBSD-current/obj/sys/arch/i386/compile/GENERIC i386
Architecture: i386
Machine: i386
>Description:
I got a k8 machine earlier this week, and decided to adapt the much
faster versions of the x86 str* functions I wrote and submitted in
port-i386/25263.  See the earlier PR for rationale, background, and
implementation details.

As I had guessed, the k8's ability to processes 8 bytes at a time
yielded even better performance ratios over the curent implementation
than were achieved on the x86.  Functions like strlen() and memchr()
are about 4 times faster, and strchr() and strrchr() are better than
10 times better (comparisons are between the slopes of the ratios of
number of bytes processed over time --- in most cases, the overhead 
for processing 0 bytes is the same or slightly better and can be 
ignored).  

There is probably a tiny bit more performance that can be squeezed
out, as I've only played with the machine for a couple of days and
I haven't had time to figure out all the scheduling details.  I think
it is a very good baseline nonetheless.

Like pr-25263, the changes are provided in a shar file.  Diffs were
larger and more confusing.

>How-To-Repeat:
	
>Fix:
# This is a shell archive.  Save it in a file, remove anything before
# this line, and then unpack it by entering "sh file".  Note, it may
# create directories; files and directories will be owned by you and
# have default permissions.
#
# This archive contains:
#
#	index.S
#	memchr.S
#	rindex.S
#	strcat.S
#	strchr.S
#	strcmp.S
#	strcpy.S
#	strlen.S
#	strrchr.S
#
echo x - index.S
sed 's/^X//' >index.S << 'END-of-index.S'
X/*	$NetBSD: $	*/
X
X#define INDEX
X#include "strchr.S"
END-of-index.S
echo x - memchr.S
sed 's/^X//' >memchr.S << 'END-of-memchr.S'
X/*
X * Written by J.T. Conklin <jtc@acorntoolworks.com>
X * Public domain.
X */
X
X#include <machine/asm.h>
X	
X#if defined(LIBC_SCCS)
X	RCSID("$NetBSD: $")
X#endif
X
XENTRY(memchr)
X	movzbq	%sil,%rcx
X
X	/*
X	 * Align to word boundry
X	 * Consider unrolling loop?
X	 */
X	testq	%rdx,%rdx			/* nbytes == 0? */
X	je	.Lzero
X.Lalign:		
X	testb	$7,%dil
X	je	.Lword_aligned
X	cmpb	(%rdi),%cl
X	je	.Ldone
X	incq	%rdi
X	decq	%rdx
X	jnz	.Lalign
X	jmp	.Lzero
X	
X.Lword_aligned:
X	/* copy char to all bytes in word */
X	movb	%cl,%ch
X	movq	%rcx,%rsi
X	salq	$16,%rcx
X	orq	%rsi,%rcx
X	movq	%rcx,%rsi
X	salq	$32,%rcx
X	orq	%rsi,%rcx
X
X	movabsq	$0x0101010101010101,%r8
X	movabsq	$0x8080808080808080,%r9
X	
X	_ALIGN_TEXT	
X.Lloop:
X	cmpq	$7,%rdx				/* nbytes > 8 */
X	jbe	.Lbyte
X	movq	(%rdi),%rsi
X	addq	$8,%rdi
X	xorq	%rcx,%rsi
X	subq	$8,%rdx
X	subq	%r8,%rsi
X	testq	%r9,%rsi
X	je	.Lloop
X
X	/*
X	 * In rare cases, the above loop may exit prematurely. We must
X	 * return to the loop if none of the bytes in the word are
X	 * equal to ch.
X	 */
X
X	leaq	-8(%rdi),%rax
X	cmpb	-8(%rdi),%cl			/* 1st byte == ch? */
X	je	.Ldone
X
X	leaq	-7(%rdi),%rax
X	cmpb	-7(%rdi),%cl			/* 2nd byte == ch? */
X	je	.Ldone
X
X	leaq	-6(%rdi),%rax
X	cmpb	-6(%rdi),%cl			/* 3rd byte == ch? */
X	je	.Ldone
X
X	leaq	-5(%rdi),%rax
X	cmpb	-5(%rdi),%cl			/* 4th byte == ch? */
X	je	.Ldone
X
X	leaq	-4(%rdi),%rax
X	cmpb	-4(%rdi),%cl			/* 5th byte == ch? */
X	je	.Ldone
X
X	leaq	-3(%rdi),%rax
X	cmpb	-3(%rdi),%cl			/* 6th byte == ch? */
X	je	.Ldone
X
X	leaq	-2(%rdi),%rax
X	cmpb	-2(%rdi),%cl			/* 7th byte == ch? */
X	je	.Ldone
X
X	leaq	-1(%rdi),%rax
X	cmpb	-1(%rdi),%cl			/* 7th byte == ch? */
X	jne	.Lloop
X	ret
X
X.Lbyte:		
X	testq	%rdx,%rdx
X	je	.Lzero
X.Lbyte_loop:
X	cmpb	(%rdi),%cl
X	je	.Ldone
X	incq	%rdi
X	decq	%rdx
X	jnz	.Lbyte_loop
X	
X.Lzero:
X	xorq	%rax,%rax
X	
X.Ldone:
X	ret
END-of-memchr.S
echo x - rindex.S
sed 's/^X//' >rindex.S << 'END-of-rindex.S'
X/*	$NetBSD: $	*/
X
X#define RINDEX
X#include "strrchr.S"
END-of-rindex.S
echo x - strcat.S
sed 's/^X//' >strcat.S << 'END-of-strcat.S'
X/*
X * Written by J.T. Conklin <jtc@acorntoolworks.com>
X * Public domain.
X */
X
X#include <machine/asm.h>
X
X#if defined(LIBC_SCCS)
X	RCSID("$NetBSD: $")
X#endif
X
XENTRY(strcat)
X	movq	%rdi,%rax
X	movabsq	$0x0101010101010101,%r8
X	movabsq	$0x8080808080808080,%r9
X
X	/*
X	 * Align destination to word boundary.
X	 * Consider unrolling loop?
X	 */
X.Lscan:	
X.Lscan_align:
X	testb	$7,%dil
X	je	.Lscan_aligned
X	cmpb	$0,(%rdi)
X	je	.Lcopy
X	incq	%rdi
X	jmp	.Lscan_align
X
X	_ALIGN_TEXT
X.Lscan_aligned:
X.Lscan_loop:
X	movq	(%rdi),%rdx
X	addq	$8,%rdi	
X	subq	%r8,%rdx
X	testq	%r9,%rdx
X	je	.Lscan_loop
X
X	/*
X	 * In rare cases, the above loop may exit prematurely. We must
X	 * return to the loop if none of the bytes in the word equal 0.
X	 */
X
X	cmpb	$0,-8(%rdi)			/* 1st byte == 0? */
X	jne	1f
X	subq	$8,%rdi
X	jmp	.Lcopy
X
X1:	cmpb	$0,-7(%rdi)			/* 2nd byte == 0? */
X	jne	1f
X	subq	$7,%rdi
X	jmp	.Lcopy
X
X1:	cmpb	$0,-6(%rdi)			/* 3rd byte == 0? */
X	jne	1f
X	subq	$6,%rdi
X	jmp	.Lcopy
X
X1:	cmpb	$0,-5(%rdi)			/* 4th byte == 0? */
X	jne	1f
X	subq	$5,%rdi
X	jmp	.Lcopy
X
X1:	cmpb	$0,-4(%rdi)			/* 5th byte == 0? */
X	jne	1f
X	subq	$4,%rdi
X	jmp	.Lcopy
X
X1:	cmpb	$0,-3(%rdi)			/* 6th byte == 0? */
X	jne	1f
X	subq	$3,%rdi
X	jmp	.Lcopy
X
X1:	cmpb	$0,-2(%rdi)			/* 7th byte == 0? */
X	jne	1f
X	subq	$2,%rdi
X	jmp	.Lcopy
X
X1:	cmpb	$0,-1(%rdi)			/* 8th byte == 0? */
X	jne	.Lscan_loop
X	subq	$1,%rdi
X
X	/*
X	 * Align source to a word boundary.
X	 * Consider unrolling loop?
X	 */
X.Lcopy:
X.Lcopy_align:	
X	testb	$3,%sil
X	je	.Lcopy_aligned
X	movb	(%rsi),%dl
X	incq	%rsi
X	movb	%dl,(%rdi)
X	incq	%rdi
X	testb	%dl,%dl
X	jne	.Lcopy_align
X	ret
X	
X	_ALIGN_TEXT
X.Lcopy_loop:
X	movq	%rdx,(%rdi)
X	addq	$8,%rdi
X.Lcopy_aligned:
X	movq	(%rsi),%rdx
X	movq	%rdx,%rcx
X	addq	$8,%rsi
X	subq	%r8,%rcx
X	testq	%r9,%rcx
X	je	.Lcopy_loop
X	
X	/*
X	 * In rare cases, the above loop may exit prematurely. We must
X	 * return to the loop if none of the bytes in the word equal 0.
X	 */
X
X	movb	%dl,(%rdi)
X	incq	%rdi
X	testb	%dl,%dl				/* 1st byte == 0? */
X	je	.Ldone	
X
X	shrq	$8,%rdx
X	movb	%dl,(%rdi)
X	incq	%rdi
X	testb	%dl,%dl				/* 2nd byte == 0? */
X	je	.Ldone
X
X	shrq	$8,%rdx
X	movb	%dl,(%rdi)
X	incq	%rdi
X	testb	%dl,%dl				/* 3rd byte == 0? */
X	je	.Ldone
X
X	shrq	$8,%rdx
X	movb	%dl,(%rdi)
X	incq	%rdi
X	testb	%dl,%dl				/* 4th byte == 0? */
X	je	.Ldone
X
X	shrq	$8,%rdx
X	movb	%dl,(%rdi)
X	incq	%rdi
X	testb	%dl,%dl				/* 5th byte == 0? */
X	je	.Ldone
X
X	shrq	$8,%rdx
X	movb	%dl,(%rdi)
X	incq	%rdi
X	testb	%dl,%dl				/* 6th byte == 0? */
X	je	.Ldone
X
X	shrq	$8,%rdx	
X	movb	%dl,(%rdi)
X	incq	%rdi
X	testb	%dl,%dl				/* 7th byte == 0? */
X	je	.Ldone
X
X	shrq	$8,%rdx
X	movb	%dl,(%rdi)
X	incq	%rdi
X	testb	%dl,%dl				/* 8th byte == 0? */
X	jne	.Lcopy_aligned
X
X.Ldone:
X	ret
END-of-strcat.S
echo x - strchr.S
sed 's/^X//' >strchr.S << 'END-of-strchr.S'
X/*
X * Written by J.T. Conklin <jtc@acorntoolworks.com>
X * Public domain.
X */
X
X#include <machine/asm.h>
X	
X#if defined(LIBC_SCCS)
X	RCSID("$NetBSD: $")
X#endif
X
X#ifdef INDEX
XENTRY(index)
X#else
XENTRY(strchr)
X#endif
X	movzbq	%sil,%rcx
X
X	/*
X	 * Align to word boundary.
X	 * Consider unrolling loop?
X	 */
X.Lalign:
X	testb	$7,%dil
X	je	.Lword_aligned
X	movb	(%rdi),%dl
X	cmpb	%cl,%dl
X	je	.Ldone
X	incq	%rdi
X	testb	%dl,%dl
X	jne	.Lalign
X	jmp	.Lzero
X	
X.Lword_aligned:
X	/* copy char to all bytes in word */
X	movb	%cl,%ch
X	movq	%rcx,%rdx
X	salq	$16,%rcx
X	orq	%rdx,%rcx
X	movq	%rcx,%rdx
X	salq	$32,%rcx
X	orq	%rdx,%rcx
X
X	movabsq $0x0101010101010101,%r8
X	movabsq $0x8080808080808080,%r9
X	
X	/* Check whether any byte in the word is equal to ch or 0. */
X	_ALIGN_TEXT
X.Lloop:
X	movq	(%rdi),%rdx
X	addq	$8,%rdi
X	movq	%rdx,%rsi
X	subq	%r8,%rdx
X	xorq	%rcx,%rsi
X	subq	%r8,%rsi
X	orq	%rsi,%rdx
X	testq	%r9,%rdx
X	je	.Lloop
X
X	/*
X	 * In rare cases, the above loop may exit prematurely. We must
X	 * return to the loop if none of the bytes in the word match
X	 * ch or are equal to 0.
X	 */
X
X	movb	-8(%rdi),%dl	
X	cmpb	%cl,%dl				/* 1st byte == ch? */
X	jne	1f
X	subq	$8,%rdi
X	jmp	.Ldone
X1:	testb	%dl,%dl				/* 1st byte == 0? */
X	je	.Lzero
X
X	movb	-7(%rdi),%dl	
X	cmpb	%cl,%dl				/* 2nd byte == ch? */
X	jne	1f
X	subq	$7,%rdi
X	jmp	.Ldone
X1:	testb	%dl,%dl				/* 2nd byte == 0? */
X	je	.Lzero
X
X	movb	-6(%rdi),%dl	
X	cmpb	%cl,%dl				/* 3rd byte == ch? */
X	jne	1f
X	subq	$6,%rdi
X	jmp	.Ldone
X1:	testb	%dl,%dl				/* 3rd byte == 0? */
X	je	.Lzero
X
X	movb	-5(%rdi),%dl	
X	cmpb	%cl,%dl				/* 4th byte == ch? */
X	jne	1f
X	subq	$5,%rdi
X	jmp	.Ldone
X1:	testb	%dl,%dl				/* 4th byte == 0? */
X	je	.Lzero
X
X	movb	-4(%rdi),%dl	
X	cmpb	%cl,%dl				/* 5th byte == ch? */
X	jne	1f
X	subq	$4,%rdi
X	jmp	.Ldone
X1:	testb	%dl,%dl				/* 5th byte == 0? */
X	je	.Lzero
X
X	movb	-3(%rdi),%dl	
X	cmpb	%cl,%dl				/* 6th byte == ch? */
X	jne	1f
X	subq	$3,%rdi
X	jmp	.Ldone
X1:	testb	%dl,%dl				/* 6th byte == 0? */
X	je	.Lzero
X
X	movb	-2(%rdi),%dl	
X	cmpb	%cl,%dl				/* 7th byte == ch? */
X	jne	1f
X	subq	$2,%rdi
X	jmp	.Ldone
X1:	testb	%dl,%dl				/* 7th byte == 0? */
X	je	.Lzero
X
X	movb	-1(%rdi),%dl	
X	cmpb	%cl,%dl				/* 8th byte == ch? */
X	jne	1f
X	subq	$1,%rdi
X	jmp	.Ldone
X1:	testb	%dl,%dl				/* 8th byte == 0? */
X	jne	.Lloop
X
X.Lzero:
X	/* If a ch wasn't found, return 0. */
X	xorq	%rdi,%rdi
X
X.Ldone:
X	movq	%rdi,%rax
X	ret
END-of-strchr.S
echo x - strcmp.S
sed 's/^X//' >strcmp.S << 'END-of-strcmp.S'
X/*
X * Written by J.T. Conklin <jtc@acorntoolworks.com>
X * Public domain.
X */
X
X#include <machine/asm.h>
X
X#if defined(LIBC_SCCS)
X	RCSID("$NetBSD: $")
X#endif
X
XENTRY(strcmp)
X	/*
X	 * Align s1 to word boundary.
X	 * Consider unrolling loop?
X	 */
X.Ls1align:
X	testb	$7,%dil
X	je	.Ls1aligned
X	movb	(%rdi),%al
X	incq	%rdi
X	movb	(%rsi),%dl
X	incq	%rsi
X	testb	%al,%al
X	je	.Ldone
X	cmpb	%al,%dl
X	je	.Ls1align
X	jmp	.Ldone
X
X	/*
X	 * Check whether s2 is aligned to a word boundry.  If it is, we 
X	 * can compare by words.  Otherwise we have to compare by bytes.
X	 */
X.Ls1aligned:
X	testb	$7,%sil
X	jne	.Lbyte_loop
X
X	movabsq	$0x0101010101010101,%r8
X	subq	$8,%rdi
X	movabsq	$0x8080808080808080,%r9
X	subq	$8,%rsi
X
X	_ALIGN_TEXT
X.Lword_loop:
X	movq	8(%rdi),%rax
X	addq	$8,%rdi
X	movq	8(%rsi),%rdx
X	addq	$8,%rsi
X	cmpq	%rax,%rdx
X	jne	.Lbyte_loop
X	subq	%r8,%rdx
X	notq	%rax
X	andq	%rax,%rdx
X	testq	%r9,%rdx
X	je	.Lword_loop
X
X	_ALIGN_TEXT
X.Lbyte_loop:
X	movb	(%rdi),%al
X	incq	%rdi
X	movb	(%rsi),%dl
X	incq	%rsi
X	testb	%al,%al
X	je	.Ldone
X	cmpb	%al,%dl
X	je	.Lbyte_loop
X
X.Ldone:
X	movzbq	%al,%rax
X	movzbq	%dl,%rdx
X	subq	%rdx,%rax
X	ret
END-of-strcmp.S
echo x - strcpy.S
sed 's/^X//' >strcpy.S << 'END-of-strcpy.S'
X/*
X * Written by J.T. Conklin <jtc@acorntoolworks.com>
X * Public domain.
X */
X
X#include <machine/asm.h>
X
X#if defined(LIBC_SCCS)
X	RCSID("$NetBSD: $")
X#endif
X
X/*
X * This strcpy implementation copies a byte at a time until the
X * source pointer is aligned to a word boundary, it then copies by
X * words until it finds a word containing a zero byte, and finally
X * copies by bytes until the end of the string is reached.
X *	
X * While this may result in unaligned stores if the source and
X * destination pointers are unaligned with respect to each other,
X * it is still faster than either byte copies or the overhead of
X * an implementation suitable for machines with strict alignment
X * requirements.
X */
X
XENTRY(strcpy)
X	movq	%rdi,%rax
X	movabsq	$0x0101010101010101,%r8
X	movabsq	$0x8080808080808080,%r9
X
X	/*
X	 * Align source to a word boundary.
X	 * Consider unrolling loop?
X	 */
X	_ALIGN_TEXT
X.Lalign:
X	testb	$7,%sil
X	je	.Lword_aligned
X	movb	(%rsi),%dl
X	incq	%rsi
X	movb	%dl,(%rdi)
X	incq	%rdi
X	testb	%dl,%dl
X	jne	.Lalign
X	ret
X
X	_ALIGN_TEXT
X.Lloop:
X	movq	%rdx,(%rdi)
X	addq	$8,%rdi
X.Lword_aligned:
X	movq	(%rsi),%rdx
X	movq	%rdx,%rcx
X	addq	$8,%rsi
X	subq	%r8,%rcx
X	testq	%r9,%rcx
X	je	.Lloop
X	
X	/*
X	 * In rare cases, the above loop may exit prematurely. We must
X	 * return to the loop if none of the bytes in the word equal 0.
X	 */
X
X	movb	%dl,(%rdi)
X	incq	%rdi
X	testb	%dl,%dl				/* 1st byte == 0? */
X	je	.Ldone
X
X	shrq	$8,%rdx
X	movb	%dl,(%rdi)
X	incq	%rdi
X	testb	%dl,%dl				/* 2nd byte == 0? */
X	je	.Ldone
X
X	shrq	$8,%rdx
X	movb	%dl,(%rdi)
X	incq	%rdi
X	testb	%dl,%dl				/* 3rd byte == 0? */
X	je	.Ldone
X
X	shrq	$8,%rdx	
X	movb	%dl,(%rdi)
X	incq	%rdi
X	testb	%dl,%dl				/* 4th byte == 0? */
X	je	.Ldone
X
X	shrq	$8,%rdx
X	movb	%dl,(%rdi)
X	incq	%rdi
X	testb	%dl,%dl				/* 5th byte == 0? */
X	je	.Ldone
X
X	shrq	$8,%rdx
X	movb	%dl,(%rdi)
X	incq	%rdi
X	testb	%dl,%dl				/* 6th byte == 0? */
X	je	.Ldone
X
X	shrq	$8,%rdx
X	movb	%dl,(%rdi)
X	incq	%rdi
X	testb	%dl,%dl				/* 7th byte == 0? */
X	je	.Ldone
X
X	shrq	$8,%rdx	
X	movb	%dl,(%rdi)
X	incq	%rdi
X	testb	%dl,%dl				/* 8th byte == 0? */
X	jne	.Lword_aligned
X
X.Ldone:
X	ret
END-of-strcpy.S
echo x - strlen.S
sed 's/^X//' >strlen.S << 'END-of-strlen.S'
X/*
X * Written by J.T. Conklin <jtc@acorntoolworks.com>
X * Public domain.
X */
X
X#include <machine/asm.h>
X
X#if defined(LIBC_SCCS)
X	RCSID("$NetBSD: $")
X#endif
X
XENTRY(strlen)
X	movq	%rdi,%rax
X	negq	%rdi
X
X.Lalign:
X	/* Consider unrolling loop? */
X	testb	$7,%al
X	je	.Lword_aligned
X	cmpb	$0,(%rax)
X	jne	1f
X	leaq	(%rdi,%rax),%rax
X	ret
X1:	incq	%rax
X	jmp	.Lalign
X
X	/*
X	 * There are many well known branch-free sequences which are used
X	 * for determining whether a zero-byte is contained within a word.
X	 * These sequences are generally much more efficent than loading
X	 * and comparing each byte individually.
X	 *
X	 * The expression [1,2]:
X	 *
X	 * (1)	~(((x & 0x7f....7f) + 0x7f....7f) | (x | 0x7f....7f))
X	 *
X	 * evaluates to a non-zero value if any of the bytes in the
X	 * original word is zero.
X	 *
X	 * It also has the useful property that bytes in the result word
X	 * that coorespond to non-zero bytes in the original word have
X	 * the value 0x00, while bytes cooresponding to zero bytes have
X	 * the value 0x80. This allows calculation of the first (and
X	 * last) occurance of a zero byte within the word (useful for C's
X	 * str* primitives) by counting the number of leading (or
X	 * trailing) zeros and dividing the result by 8.  On machines
X	 * without (or with slow) clz() / ctz() instructions, testing
X	 * each byte in the result word for zero is necessary.
X	 *
X	 * This typically takes 4 instructions (5 on machines without
X	 * "not-or") not including those needed to load the constant.
X	 *
X	 *
X	 * The expression:
X	 *
X	 * (2)	((x - 0x01....01) & ~x & 0x80....80)
X	 *
X	 * evaluates to a non-zero value if any of the bytes in the
X	 * original word is zero.
X	 *
X	 * On little endian machines, the first byte in the result word
X	 * that cooresponds to a zero byte in the original byte is 0x80,
X	 * so clz() can be used as above.  On big endian machines, and
X	 * little endian machines without (or with a slow) clz() insn,
X	 * testing each byte in the original for zero is necessary
X	 * 
X	 * This typically takes 3 instructions (4 on machines without
X	 * "and with complement") not including those needed to load
X	 * constants.
X	 *
X	 *
X	 * The expression:
X	 *
X	 * (3)	((x - 0x01....01) & 0x80....80)
X	 *
X	 * always evaluates to a non-zero value if any of the bytes in
X	 * the original word is zero.  However, in rare cases, it also
X	 * evaluates to a non-zero value when none of the bytes in the
X	 * original word is zero.	
X	 * 
X	 * To account for possible false positives, each byte of the
X	 * original word must be checked when the expression evaluates to
X	 * a non-zero value.  However, because it is simpler than those
X	 * presented above, code that uses it will be faster as long as
X	 * the rate of false positives is low.
X	 * 
X	 * This is likely, because the the false positive can only occur
X	 * if the most siginificant bit of a byte within the word is set.
X	 * The expression will never fail for typical 7-bit ASCII strings.
X	 *
X	 * This typically takes 2 instructions not including those needed
X	 * to load constants.
X	 *
X	 *
X	 * [1] Henry S. Warren Jr., "Hacker's Delight", Addison-Westley 2003
X	 *
X	 * [2] International Business Machines, "The PowerPC Compiler Writer's
X	 *     Guide", Warthman Associates, 1996
X	 */
X
X	_ALIGN_TEXT
X.Lword_aligned:
X	movabsq	$0x0101010101010101,%r8
X	movabsq $0x8080808080808080,%r9
X.Lloop:
X	movq	(%rax),%rdx
X	addq	$8,%rax	
X	subq	%r8,%rdx
X	testq	%r9,%rdx
X	je	.Lloop
X
X	/*
X	 * In rare cases, the above loop may exit prematurely. We must
X	 * return to the loop if none of the bytes in the word equal 0.
X	 */
X	cmpb	$0,-8(%rax)				/* 1st byte == 0? */
X	je	.Lsub8
X	cmpb	$0,-7(%rax)				/* 2nd byte == 0? */
X	je	.Lsub7
X	cmpb	$0,-6(%rax)				/* 3rd byte == 0? */
X	je	.Lsub6
X	cmpb	$0,-5(%rax)				/* 4th byte == 0? */
X	je	.Lsub5
X	cmpb	$0,-4(%rax)				/* 5th byte == 0? */
X	je	.Lsub4
X	cmpb	$0,-3(%rax)				/* 6th byte == 0? */
X	je	.Lsub3
X	cmpb	$0,-2(%rax)				/* 7th byte == 0? */
X	je	.Lsub2
X	cmpb	$0,-1(%rax)				/* 8th byte == 0? */
X	jne	.Lloop
X
X.Lsub1:
X	leaq	-1(%rdi,%rax),%rax
X	ret
X.Lsub2:
X	leaq	-2(%rdi,%rax),%rax
X	ret
X.Lsub3:
X	leaq	-3(%rdi,%rax),%rax
X	ret
X.Lsub4:
X	leaq	-4(%rdi,%rax),%rax
X	ret
X.Lsub5:
X	leaq	-5(%rdi,%rax),%rax
X	ret
X.Lsub6:
X	leaq	-6(%rdi,%rax),%rax
X	ret
X.Lsub7:
X	leaq	-7(%rdi,%rax),%rax
X	ret
X.Lsub8:
X	leaq	-8(%rdi,%rax),%rax
X	ret
END-of-strlen.S
echo x - strrchr.S
sed 's/^X//' >strrchr.S << 'END-of-strrchr.S'
X/*
X * Written by J.T. Conklin <jtc@acorntoolworks.com>
X * Public domain.
X */
X
X#include <machine/asm.h>
X
X#if defined(LIBC_SCCS)
X	RCSID("$NetBSD: $")
X#endif
X
X#ifdef RINDEX
XENTRY(rindex)
X#else
XENTRY(strrchr)
X#endif
X	movzbq	%sil,%rcx
X
X	/* zero return value */
X	xorq	%rax,%rax
X
X	/*
X	 * Align to word boundary.
X	 * Consider unrolling loop?
X	 */
X.Lalign:
X	testb	$7,%dil
X	je	.Lword_aligned
X	movb	(%rdi),%dl
X	cmpb	%cl,%dl
X	cmoveq	%rdi,%rax
X	incq	%rdi
X	testb	%dl,%dl
X	jne	.Lalign
X	jmp	.Ldone
X	
X.Lword_aligned:
X	/* copy char to all bytes in word */
X	movb	%cl,%ch
X	movq	%rcx,%rdx
X	salq	$16,%rcx
X	orq	%rdx,%rcx
X	movq	%rcx,%rdx
X	salq	$32,%rcx
X	orq	%rdx,%rcx
X
X	movabsq $0x0101010101010101,%r8
X	movabsq $0x8080808080808080,%r9
X
X	/* Check whether any byte in the word is equal to ch or 0.  */
X	_ALIGN_TEXT
X.Lloop:
X	movq	(%rdi),%rdx
X	addq	$8,%rdi
X	movq	%rdx,%rsi
X	subq	%r8,%rdx
X	xorq	%rcx,%rsi
X	subq	%r8,%rsi
X	orq	%rsi,%rdx
X	testq	%r9,%rdx
X	je	.Lloop
X
X	/*
X	 * In rare cases, the above loop may exit prematurely. We must
X	 * return to the loop if none of the bytes in the word match
X	 * ch or are equal to 0.
X	 */
X
X	movb	-8(%rdi),%dl
X	cmpb	%cl,%dl				/* 1st byte == ch? */
X	jne	1f
X	leaq	-8(%rdi),%rax
X1:	testb	%dl,%dl				/* 1st byte == 0? */
X	je	.Ldone
X
X	movb	-7(%rdi),%dl
X	cmpb	%cl,%dl				/* 2nd byte == ch? */
X	jne	1f
X	leaq	-7(%rdi),%rax
X1:	testb	%dl,%dl				/* 2nd byte == 0? */
X	je	.Ldone
X
X	movb	-6(%rdi),%dl
X	cmpb	%cl,%dl				/* 3rd byte == ch? */
X	jne	1f
X	leaq	-6(%rdi),%rax
X1:	testb	%dl,%dl				/* 3rd byte == 0? */
X	je	.Ldone
X
X	movb	-5(%rdi),%dl
X	cmpb	%cl,%dl				/* 4th byte == ch? */
X	jne	1f
X	leaq	-5(%rdi),%rax
X1:	testb	%dl,%dl				/* 4th byte == 0? */
X	je	.Ldone
X
X	movb	-4(%rdi),%dl
X	cmpb	%cl,%dl				/* 5th byte == ch? */
X	jne	1f
X	leaq	-4(%rdi),%rax
X1:	testb	%dl,%dl				/* 5th byte == 0? */
X	je	.Ldone
X
X	movb	-3(%rdi),%dl
X	cmpb	%cl,%dl				/* 6th byte == ch? */
X	jne	1f
X	leaq	-3(%rdi),%rax
X1:	testb	%dl,%dl				/* 6th byte == 0? */
X	je	.Ldone
X
X	movb	-2(%rdi),%dl
X	cmpb	%cl,%dl				/* 7th byte == ch? */
X	jne	1f
X	leaq	-2(%rdi),%rax
X1:	testb	%dl,%dl				/* 7th byte == 0? */
X	je	.Ldone
X
X	movb	-1(%rdi),%dl
X	cmpb	%cl,%dl				/* 8th byte == ch? */
X	jne	1f
X	leaq	-1(%rdi),%rax
X1:	testb	%dl,%dl				/* 8th byte == 0? */
X	jne	.Lloop
X						
X.Ldone:
X	ret
END-of-strrchr.S
exit


>Release-Note:
>Audit-Trail:
>Unformatted: