Subject: port-amd64/25411: much faster version of amd64 str* functions
To: None <gnats-bugs@gnats.netbsd.org>
From: J.T. Conklin <jtc@orac.acorntoolworks.com>
List: netbsd-bugs
Date: 04/30/2004 18:41:35
>Number: 25411
>Category: port-amd64
>Synopsis: much faster version of amd64 str* functions
>Confidential: yes
>Severity: non-critical
>Priority: low
>Responsible: port-amd64-maintainer
>State: open
>Class: change-request
>Submitter-Id: net
>Arrival-Date: Sat May 01 01:42:00 UTC 2004
>Closed-Date:
>Last-Modified:
>Originator: J.T. Conklin
>Release: NetBSD 2.0_BETA
>Organization:
J.T. Conklin
>Environment:
System: NetBSD orac 1.6T NetBSD 1.6T (GENERIC) #0: Fri Jun 6 22:31:11 PDT 2003 jtc@orac:/home/jtc/netbsd/NetBSD-current/obj/sys/arch/i386/compile/GENERIC i386
Architecture: i386
Machine: i386
>Description:
I got a k8 machine earlier this week, and decided to adapt the much
faster versions of the x86 str* functions I wrote and submitted in
port-i386/25263. See the earlier PR for rationale, background, and
implementation details.
As I had guessed, the k8's ability to processes 8 bytes at a time
yielded even better performance ratios over the curent implementation
than were achieved on the x86. Functions like strlen() and memchr()
are about 4 times faster, and strchr() and strrchr() are better than
10 times better (comparisons are between the slopes of the ratios of
number of bytes processed over time --- in most cases, the overhead
for processing 0 bytes is the same or slightly better and can be
ignored).
There is probably a tiny bit more performance that can be squeezed
out, as I've only played with the machine for a couple of days and
I haven't had time to figure out all the scheduling details. I think
it is a very good baseline nonetheless.
Like pr-25263, the changes are provided in a shar file. Diffs were
larger and more confusing.
>How-To-Repeat:
>Fix:
# This is a shell archive. Save it in a file, remove anything before
# this line, and then unpack it by entering "sh file". Note, it may
# create directories; files and directories will be owned by you and
# have default permissions.
#
# This archive contains:
#
# index.S
# memchr.S
# rindex.S
# strcat.S
# strchr.S
# strcmp.S
# strcpy.S
# strlen.S
# strrchr.S
#
echo x - index.S
sed 's/^X//' >index.S << 'END-of-index.S'
X/* $NetBSD: $ */
X
X#define INDEX
X#include "strchr.S"
END-of-index.S
echo x - memchr.S
sed 's/^X//' >memchr.S << 'END-of-memchr.S'
X/*
X * Written by J.T. Conklin <jtc@acorntoolworks.com>
X * Public domain.
X */
X
X#include <machine/asm.h>
X
X#if defined(LIBC_SCCS)
X RCSID("$NetBSD: $")
X#endif
X
XENTRY(memchr)
X movzbq %sil,%rcx
X
X /*
X * Align to word boundry
X * Consider unrolling loop?
X */
X testq %rdx,%rdx /* nbytes == 0? */
X je .Lzero
X.Lalign:
X testb $7,%dil
X je .Lword_aligned
X cmpb (%rdi),%cl
X je .Ldone
X incq %rdi
X decq %rdx
X jnz .Lalign
X jmp .Lzero
X
X.Lword_aligned:
X /* copy char to all bytes in word */
X movb %cl,%ch
X movq %rcx,%rsi
X salq $16,%rcx
X orq %rsi,%rcx
X movq %rcx,%rsi
X salq $32,%rcx
X orq %rsi,%rcx
X
X movabsq $0x0101010101010101,%r8
X movabsq $0x8080808080808080,%r9
X
X _ALIGN_TEXT
X.Lloop:
X cmpq $7,%rdx /* nbytes > 8 */
X jbe .Lbyte
X movq (%rdi),%rsi
X addq $8,%rdi
X xorq %rcx,%rsi
X subq $8,%rdx
X subq %r8,%rsi
X testq %r9,%rsi
X je .Lloop
X
X /*
X * In rare cases, the above loop may exit prematurely. We must
X * return to the loop if none of the bytes in the word are
X * equal to ch.
X */
X
X leaq -8(%rdi),%rax
X cmpb -8(%rdi),%cl /* 1st byte == ch? */
X je .Ldone
X
X leaq -7(%rdi),%rax
X cmpb -7(%rdi),%cl /* 2nd byte == ch? */
X je .Ldone
X
X leaq -6(%rdi),%rax
X cmpb -6(%rdi),%cl /* 3rd byte == ch? */
X je .Ldone
X
X leaq -5(%rdi),%rax
X cmpb -5(%rdi),%cl /* 4th byte == ch? */
X je .Ldone
X
X leaq -4(%rdi),%rax
X cmpb -4(%rdi),%cl /* 5th byte == ch? */
X je .Ldone
X
X leaq -3(%rdi),%rax
X cmpb -3(%rdi),%cl /* 6th byte == ch? */
X je .Ldone
X
X leaq -2(%rdi),%rax
X cmpb -2(%rdi),%cl /* 7th byte == ch? */
X je .Ldone
X
X leaq -1(%rdi),%rax
X cmpb -1(%rdi),%cl /* 7th byte == ch? */
X jne .Lloop
X ret
X
X.Lbyte:
X testq %rdx,%rdx
X je .Lzero
X.Lbyte_loop:
X cmpb (%rdi),%cl
X je .Ldone
X incq %rdi
X decq %rdx
X jnz .Lbyte_loop
X
X.Lzero:
X xorq %rax,%rax
X
X.Ldone:
X ret
END-of-memchr.S
echo x - rindex.S
sed 's/^X//' >rindex.S << 'END-of-rindex.S'
X/* $NetBSD: $ */
X
X#define RINDEX
X#include "strrchr.S"
END-of-rindex.S
echo x - strcat.S
sed 's/^X//' >strcat.S << 'END-of-strcat.S'
X/*
X * Written by J.T. Conklin <jtc@acorntoolworks.com>
X * Public domain.
X */
X
X#include <machine/asm.h>
X
X#if defined(LIBC_SCCS)
X RCSID("$NetBSD: $")
X#endif
X
XENTRY(strcat)
X movq %rdi,%rax
X movabsq $0x0101010101010101,%r8
X movabsq $0x8080808080808080,%r9
X
X /*
X * Align destination to word boundary.
X * Consider unrolling loop?
X */
X.Lscan:
X.Lscan_align:
X testb $7,%dil
X je .Lscan_aligned
X cmpb $0,(%rdi)
X je .Lcopy
X incq %rdi
X jmp .Lscan_align
X
X _ALIGN_TEXT
X.Lscan_aligned:
X.Lscan_loop:
X movq (%rdi),%rdx
X addq $8,%rdi
X subq %r8,%rdx
X testq %r9,%rdx
X je .Lscan_loop
X
X /*
X * In rare cases, the above loop may exit prematurely. We must
X * return to the loop if none of the bytes in the word equal 0.
X */
X
X cmpb $0,-8(%rdi) /* 1st byte == 0? */
X jne 1f
X subq $8,%rdi
X jmp .Lcopy
X
X1: cmpb $0,-7(%rdi) /* 2nd byte == 0? */
X jne 1f
X subq $7,%rdi
X jmp .Lcopy
X
X1: cmpb $0,-6(%rdi) /* 3rd byte == 0? */
X jne 1f
X subq $6,%rdi
X jmp .Lcopy
X
X1: cmpb $0,-5(%rdi) /* 4th byte == 0? */
X jne 1f
X subq $5,%rdi
X jmp .Lcopy
X
X1: cmpb $0,-4(%rdi) /* 5th byte == 0? */
X jne 1f
X subq $4,%rdi
X jmp .Lcopy
X
X1: cmpb $0,-3(%rdi) /* 6th byte == 0? */
X jne 1f
X subq $3,%rdi
X jmp .Lcopy
X
X1: cmpb $0,-2(%rdi) /* 7th byte == 0? */
X jne 1f
X subq $2,%rdi
X jmp .Lcopy
X
X1: cmpb $0,-1(%rdi) /* 8th byte == 0? */
X jne .Lscan_loop
X subq $1,%rdi
X
X /*
X * Align source to a word boundary.
X * Consider unrolling loop?
X */
X.Lcopy:
X.Lcopy_align:
X testb $3,%sil
X je .Lcopy_aligned
X movb (%rsi),%dl
X incq %rsi
X movb %dl,(%rdi)
X incq %rdi
X testb %dl,%dl
X jne .Lcopy_align
X ret
X
X _ALIGN_TEXT
X.Lcopy_loop:
X movq %rdx,(%rdi)
X addq $8,%rdi
X.Lcopy_aligned:
X movq (%rsi),%rdx
X movq %rdx,%rcx
X addq $8,%rsi
X subq %r8,%rcx
X testq %r9,%rcx
X je .Lcopy_loop
X
X /*
X * In rare cases, the above loop may exit prematurely. We must
X * return to the loop if none of the bytes in the word equal 0.
X */
X
X movb %dl,(%rdi)
X incq %rdi
X testb %dl,%dl /* 1st byte == 0? */
X je .Ldone
X
X shrq $8,%rdx
X movb %dl,(%rdi)
X incq %rdi
X testb %dl,%dl /* 2nd byte == 0? */
X je .Ldone
X
X shrq $8,%rdx
X movb %dl,(%rdi)
X incq %rdi
X testb %dl,%dl /* 3rd byte == 0? */
X je .Ldone
X
X shrq $8,%rdx
X movb %dl,(%rdi)
X incq %rdi
X testb %dl,%dl /* 4th byte == 0? */
X je .Ldone
X
X shrq $8,%rdx
X movb %dl,(%rdi)
X incq %rdi
X testb %dl,%dl /* 5th byte == 0? */
X je .Ldone
X
X shrq $8,%rdx
X movb %dl,(%rdi)
X incq %rdi
X testb %dl,%dl /* 6th byte == 0? */
X je .Ldone
X
X shrq $8,%rdx
X movb %dl,(%rdi)
X incq %rdi
X testb %dl,%dl /* 7th byte == 0? */
X je .Ldone
X
X shrq $8,%rdx
X movb %dl,(%rdi)
X incq %rdi
X testb %dl,%dl /* 8th byte == 0? */
X jne .Lcopy_aligned
X
X.Ldone:
X ret
END-of-strcat.S
echo x - strchr.S
sed 's/^X//' >strchr.S << 'END-of-strchr.S'
X/*
X * Written by J.T. Conklin <jtc@acorntoolworks.com>
X * Public domain.
X */
X
X#include <machine/asm.h>
X
X#if defined(LIBC_SCCS)
X RCSID("$NetBSD: $")
X#endif
X
X#ifdef INDEX
XENTRY(index)
X#else
XENTRY(strchr)
X#endif
X movzbq %sil,%rcx
X
X /*
X * Align to word boundary.
X * Consider unrolling loop?
X */
X.Lalign:
X testb $7,%dil
X je .Lword_aligned
X movb (%rdi),%dl
X cmpb %cl,%dl
X je .Ldone
X incq %rdi
X testb %dl,%dl
X jne .Lalign
X jmp .Lzero
X
X.Lword_aligned:
X /* copy char to all bytes in word */
X movb %cl,%ch
X movq %rcx,%rdx
X salq $16,%rcx
X orq %rdx,%rcx
X movq %rcx,%rdx
X salq $32,%rcx
X orq %rdx,%rcx
X
X movabsq $0x0101010101010101,%r8
X movabsq $0x8080808080808080,%r9
X
X /* Check whether any byte in the word is equal to ch or 0. */
X _ALIGN_TEXT
X.Lloop:
X movq (%rdi),%rdx
X addq $8,%rdi
X movq %rdx,%rsi
X subq %r8,%rdx
X xorq %rcx,%rsi
X subq %r8,%rsi
X orq %rsi,%rdx
X testq %r9,%rdx
X je .Lloop
X
X /*
X * In rare cases, the above loop may exit prematurely. We must
X * return to the loop if none of the bytes in the word match
X * ch or are equal to 0.
X */
X
X movb -8(%rdi),%dl
X cmpb %cl,%dl /* 1st byte == ch? */
X jne 1f
X subq $8,%rdi
X jmp .Ldone
X1: testb %dl,%dl /* 1st byte == 0? */
X je .Lzero
X
X movb -7(%rdi),%dl
X cmpb %cl,%dl /* 2nd byte == ch? */
X jne 1f
X subq $7,%rdi
X jmp .Ldone
X1: testb %dl,%dl /* 2nd byte == 0? */
X je .Lzero
X
X movb -6(%rdi),%dl
X cmpb %cl,%dl /* 3rd byte == ch? */
X jne 1f
X subq $6,%rdi
X jmp .Ldone
X1: testb %dl,%dl /* 3rd byte == 0? */
X je .Lzero
X
X movb -5(%rdi),%dl
X cmpb %cl,%dl /* 4th byte == ch? */
X jne 1f
X subq $5,%rdi
X jmp .Ldone
X1: testb %dl,%dl /* 4th byte == 0? */
X je .Lzero
X
X movb -4(%rdi),%dl
X cmpb %cl,%dl /* 5th byte == ch? */
X jne 1f
X subq $4,%rdi
X jmp .Ldone
X1: testb %dl,%dl /* 5th byte == 0? */
X je .Lzero
X
X movb -3(%rdi),%dl
X cmpb %cl,%dl /* 6th byte == ch? */
X jne 1f
X subq $3,%rdi
X jmp .Ldone
X1: testb %dl,%dl /* 6th byte == 0? */
X je .Lzero
X
X movb -2(%rdi),%dl
X cmpb %cl,%dl /* 7th byte == ch? */
X jne 1f
X subq $2,%rdi
X jmp .Ldone
X1: testb %dl,%dl /* 7th byte == 0? */
X je .Lzero
X
X movb -1(%rdi),%dl
X cmpb %cl,%dl /* 8th byte == ch? */
X jne 1f
X subq $1,%rdi
X jmp .Ldone
X1: testb %dl,%dl /* 8th byte == 0? */
X jne .Lloop
X
X.Lzero:
X /* If a ch wasn't found, return 0. */
X xorq %rdi,%rdi
X
X.Ldone:
X movq %rdi,%rax
X ret
END-of-strchr.S
echo x - strcmp.S
sed 's/^X//' >strcmp.S << 'END-of-strcmp.S'
X/*
X * Written by J.T. Conklin <jtc@acorntoolworks.com>
X * Public domain.
X */
X
X#include <machine/asm.h>
X
X#if defined(LIBC_SCCS)
X RCSID("$NetBSD: $")
X#endif
X
XENTRY(strcmp)
X /*
X * Align s1 to word boundary.
X * Consider unrolling loop?
X */
X.Ls1align:
X testb $7,%dil
X je .Ls1aligned
X movb (%rdi),%al
X incq %rdi
X movb (%rsi),%dl
X incq %rsi
X testb %al,%al
X je .Ldone
X cmpb %al,%dl
X je .Ls1align
X jmp .Ldone
X
X /*
X * Check whether s2 is aligned to a word boundry. If it is, we
X * can compare by words. Otherwise we have to compare by bytes.
X */
X.Ls1aligned:
X testb $7,%sil
X jne .Lbyte_loop
X
X movabsq $0x0101010101010101,%r8
X subq $8,%rdi
X movabsq $0x8080808080808080,%r9
X subq $8,%rsi
X
X _ALIGN_TEXT
X.Lword_loop:
X movq 8(%rdi),%rax
X addq $8,%rdi
X movq 8(%rsi),%rdx
X addq $8,%rsi
X cmpq %rax,%rdx
X jne .Lbyte_loop
X subq %r8,%rdx
X notq %rax
X andq %rax,%rdx
X testq %r9,%rdx
X je .Lword_loop
X
X _ALIGN_TEXT
X.Lbyte_loop:
X movb (%rdi),%al
X incq %rdi
X movb (%rsi),%dl
X incq %rsi
X testb %al,%al
X je .Ldone
X cmpb %al,%dl
X je .Lbyte_loop
X
X.Ldone:
X movzbq %al,%rax
X movzbq %dl,%rdx
X subq %rdx,%rax
X ret
END-of-strcmp.S
echo x - strcpy.S
sed 's/^X//' >strcpy.S << 'END-of-strcpy.S'
X/*
X * Written by J.T. Conklin <jtc@acorntoolworks.com>
X * Public domain.
X */
X
X#include <machine/asm.h>
X
X#if defined(LIBC_SCCS)
X RCSID("$NetBSD: $")
X#endif
X
X/*
X * This strcpy implementation copies a byte at a time until the
X * source pointer is aligned to a word boundary, it then copies by
X * words until it finds a word containing a zero byte, and finally
X * copies by bytes until the end of the string is reached.
X *
X * While this may result in unaligned stores if the source and
X * destination pointers are unaligned with respect to each other,
X * it is still faster than either byte copies or the overhead of
X * an implementation suitable for machines with strict alignment
X * requirements.
X */
X
XENTRY(strcpy)
X movq %rdi,%rax
X movabsq $0x0101010101010101,%r8
X movabsq $0x8080808080808080,%r9
X
X /*
X * Align source to a word boundary.
X * Consider unrolling loop?
X */
X _ALIGN_TEXT
X.Lalign:
X testb $7,%sil
X je .Lword_aligned
X movb (%rsi),%dl
X incq %rsi
X movb %dl,(%rdi)
X incq %rdi
X testb %dl,%dl
X jne .Lalign
X ret
X
X _ALIGN_TEXT
X.Lloop:
X movq %rdx,(%rdi)
X addq $8,%rdi
X.Lword_aligned:
X movq (%rsi),%rdx
X movq %rdx,%rcx
X addq $8,%rsi
X subq %r8,%rcx
X testq %r9,%rcx
X je .Lloop
X
X /*
X * In rare cases, the above loop may exit prematurely. We must
X * return to the loop if none of the bytes in the word equal 0.
X */
X
X movb %dl,(%rdi)
X incq %rdi
X testb %dl,%dl /* 1st byte == 0? */
X je .Ldone
X
X shrq $8,%rdx
X movb %dl,(%rdi)
X incq %rdi
X testb %dl,%dl /* 2nd byte == 0? */
X je .Ldone
X
X shrq $8,%rdx
X movb %dl,(%rdi)
X incq %rdi
X testb %dl,%dl /* 3rd byte == 0? */
X je .Ldone
X
X shrq $8,%rdx
X movb %dl,(%rdi)
X incq %rdi
X testb %dl,%dl /* 4th byte == 0? */
X je .Ldone
X
X shrq $8,%rdx
X movb %dl,(%rdi)
X incq %rdi
X testb %dl,%dl /* 5th byte == 0? */
X je .Ldone
X
X shrq $8,%rdx
X movb %dl,(%rdi)
X incq %rdi
X testb %dl,%dl /* 6th byte == 0? */
X je .Ldone
X
X shrq $8,%rdx
X movb %dl,(%rdi)
X incq %rdi
X testb %dl,%dl /* 7th byte == 0? */
X je .Ldone
X
X shrq $8,%rdx
X movb %dl,(%rdi)
X incq %rdi
X testb %dl,%dl /* 8th byte == 0? */
X jne .Lword_aligned
X
X.Ldone:
X ret
END-of-strcpy.S
echo x - strlen.S
sed 's/^X//' >strlen.S << 'END-of-strlen.S'
X/*
X * Written by J.T. Conklin <jtc@acorntoolworks.com>
X * Public domain.
X */
X
X#include <machine/asm.h>
X
X#if defined(LIBC_SCCS)
X RCSID("$NetBSD: $")
X#endif
X
XENTRY(strlen)
X movq %rdi,%rax
X negq %rdi
X
X.Lalign:
X /* Consider unrolling loop? */
X testb $7,%al
X je .Lword_aligned
X cmpb $0,(%rax)
X jne 1f
X leaq (%rdi,%rax),%rax
X ret
X1: incq %rax
X jmp .Lalign
X
X /*
X * There are many well known branch-free sequences which are used
X * for determining whether a zero-byte is contained within a word.
X * These sequences are generally much more efficent than loading
X * and comparing each byte individually.
X *
X * The expression [1,2]:
X *
X * (1) ~(((x & 0x7f....7f) + 0x7f....7f) | (x | 0x7f....7f))
X *
X * evaluates to a non-zero value if any of the bytes in the
X * original word is zero.
X *
X * It also has the useful property that bytes in the result word
X * that coorespond to non-zero bytes in the original word have
X * the value 0x00, while bytes cooresponding to zero bytes have
X * the value 0x80. This allows calculation of the first (and
X * last) occurance of a zero byte within the word (useful for C's
X * str* primitives) by counting the number of leading (or
X * trailing) zeros and dividing the result by 8. On machines
X * without (or with slow) clz() / ctz() instructions, testing
X * each byte in the result word for zero is necessary.
X *
X * This typically takes 4 instructions (5 on machines without
X * "not-or") not including those needed to load the constant.
X *
X *
X * The expression:
X *
X * (2) ((x - 0x01....01) & ~x & 0x80....80)
X *
X * evaluates to a non-zero value if any of the bytes in the
X * original word is zero.
X *
X * On little endian machines, the first byte in the result word
X * that cooresponds to a zero byte in the original byte is 0x80,
X * so clz() can be used as above. On big endian machines, and
X * little endian machines without (or with a slow) clz() insn,
X * testing each byte in the original for zero is necessary
X *
X * This typically takes 3 instructions (4 on machines without
X * "and with complement") not including those needed to load
X * constants.
X *
X *
X * The expression:
X *
X * (3) ((x - 0x01....01) & 0x80....80)
X *
X * always evaluates to a non-zero value if any of the bytes in
X * the original word is zero. However, in rare cases, it also
X * evaluates to a non-zero value when none of the bytes in the
X * original word is zero.
X *
X * To account for possible false positives, each byte of the
X * original word must be checked when the expression evaluates to
X * a non-zero value. However, because it is simpler than those
X * presented above, code that uses it will be faster as long as
X * the rate of false positives is low.
X *
X * This is likely, because the the false positive can only occur
X * if the most siginificant bit of a byte within the word is set.
X * The expression will never fail for typical 7-bit ASCII strings.
X *
X * This typically takes 2 instructions not including those needed
X * to load constants.
X *
X *
X * [1] Henry S. Warren Jr., "Hacker's Delight", Addison-Westley 2003
X *
X * [2] International Business Machines, "The PowerPC Compiler Writer's
X * Guide", Warthman Associates, 1996
X */
X
X _ALIGN_TEXT
X.Lword_aligned:
X movabsq $0x0101010101010101,%r8
X movabsq $0x8080808080808080,%r9
X.Lloop:
X movq (%rax),%rdx
X addq $8,%rax
X subq %r8,%rdx
X testq %r9,%rdx
X je .Lloop
X
X /*
X * In rare cases, the above loop may exit prematurely. We must
X * return to the loop if none of the bytes in the word equal 0.
X */
X cmpb $0,-8(%rax) /* 1st byte == 0? */
X je .Lsub8
X cmpb $0,-7(%rax) /* 2nd byte == 0? */
X je .Lsub7
X cmpb $0,-6(%rax) /* 3rd byte == 0? */
X je .Lsub6
X cmpb $0,-5(%rax) /* 4th byte == 0? */
X je .Lsub5
X cmpb $0,-4(%rax) /* 5th byte == 0? */
X je .Lsub4
X cmpb $0,-3(%rax) /* 6th byte == 0? */
X je .Lsub3
X cmpb $0,-2(%rax) /* 7th byte == 0? */
X je .Lsub2
X cmpb $0,-1(%rax) /* 8th byte == 0? */
X jne .Lloop
X
X.Lsub1:
X leaq -1(%rdi,%rax),%rax
X ret
X.Lsub2:
X leaq -2(%rdi,%rax),%rax
X ret
X.Lsub3:
X leaq -3(%rdi,%rax),%rax
X ret
X.Lsub4:
X leaq -4(%rdi,%rax),%rax
X ret
X.Lsub5:
X leaq -5(%rdi,%rax),%rax
X ret
X.Lsub6:
X leaq -6(%rdi,%rax),%rax
X ret
X.Lsub7:
X leaq -7(%rdi,%rax),%rax
X ret
X.Lsub8:
X leaq -8(%rdi,%rax),%rax
X ret
END-of-strlen.S
echo x - strrchr.S
sed 's/^X//' >strrchr.S << 'END-of-strrchr.S'
X/*
X * Written by J.T. Conklin <jtc@acorntoolworks.com>
X * Public domain.
X */
X
X#include <machine/asm.h>
X
X#if defined(LIBC_SCCS)
X RCSID("$NetBSD: $")
X#endif
X
X#ifdef RINDEX
XENTRY(rindex)
X#else
XENTRY(strrchr)
X#endif
X movzbq %sil,%rcx
X
X /* zero return value */
X xorq %rax,%rax
X
X /*
X * Align to word boundary.
X * Consider unrolling loop?
X */
X.Lalign:
X testb $7,%dil
X je .Lword_aligned
X movb (%rdi),%dl
X cmpb %cl,%dl
X cmoveq %rdi,%rax
X incq %rdi
X testb %dl,%dl
X jne .Lalign
X jmp .Ldone
X
X.Lword_aligned:
X /* copy char to all bytes in word */
X movb %cl,%ch
X movq %rcx,%rdx
X salq $16,%rcx
X orq %rdx,%rcx
X movq %rcx,%rdx
X salq $32,%rcx
X orq %rdx,%rcx
X
X movabsq $0x0101010101010101,%r8
X movabsq $0x8080808080808080,%r9
X
X /* Check whether any byte in the word is equal to ch or 0. */
X _ALIGN_TEXT
X.Lloop:
X movq (%rdi),%rdx
X addq $8,%rdi
X movq %rdx,%rsi
X subq %r8,%rdx
X xorq %rcx,%rsi
X subq %r8,%rsi
X orq %rsi,%rdx
X testq %r9,%rdx
X je .Lloop
X
X /*
X * In rare cases, the above loop may exit prematurely. We must
X * return to the loop if none of the bytes in the word match
X * ch or are equal to 0.
X */
X
X movb -8(%rdi),%dl
X cmpb %cl,%dl /* 1st byte == ch? */
X jne 1f
X leaq -8(%rdi),%rax
X1: testb %dl,%dl /* 1st byte == 0? */
X je .Ldone
X
X movb -7(%rdi),%dl
X cmpb %cl,%dl /* 2nd byte == ch? */
X jne 1f
X leaq -7(%rdi),%rax
X1: testb %dl,%dl /* 2nd byte == 0? */
X je .Ldone
X
X movb -6(%rdi),%dl
X cmpb %cl,%dl /* 3rd byte == ch? */
X jne 1f
X leaq -6(%rdi),%rax
X1: testb %dl,%dl /* 3rd byte == 0? */
X je .Ldone
X
X movb -5(%rdi),%dl
X cmpb %cl,%dl /* 4th byte == ch? */
X jne 1f
X leaq -5(%rdi),%rax
X1: testb %dl,%dl /* 4th byte == 0? */
X je .Ldone
X
X movb -4(%rdi),%dl
X cmpb %cl,%dl /* 5th byte == ch? */
X jne 1f
X leaq -4(%rdi),%rax
X1: testb %dl,%dl /* 5th byte == 0? */
X je .Ldone
X
X movb -3(%rdi),%dl
X cmpb %cl,%dl /* 6th byte == ch? */
X jne 1f
X leaq -3(%rdi),%rax
X1: testb %dl,%dl /* 6th byte == 0? */
X je .Ldone
X
X movb -2(%rdi),%dl
X cmpb %cl,%dl /* 7th byte == ch? */
X jne 1f
X leaq -2(%rdi),%rax
X1: testb %dl,%dl /* 7th byte == 0? */
X je .Ldone
X
X movb -1(%rdi),%dl
X cmpb %cl,%dl /* 8th byte == ch? */
X jne 1f
X leaq -1(%rdi),%rax
X1: testb %dl,%dl /* 8th byte == 0? */
X jne .Lloop
X
X.Ldone:
X ret
END-of-strrchr.S
exit
>Release-Note:
>Audit-Trail:
>Unformatted: