Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/lib/libc/arch/i386/string switch to J.T.Conklin's optimized ...



details:   https://anonhg.NetBSD.org/src/rev/00a0298a8ed3
branches:  trunk
changeset: 573670:00a0298a8ed3
user:      drochner <drochner%NetBSD.org@localhost>
date:      Fri Feb 04 18:12:52 2005 +0000

description:
switch to J.T.Conklin's optimized str* functions
(submitted per PR i386/25263)

diffstat:

 lib/libc/arch/i386/string/index.S   |   31 +------
 lib/libc/arch/i386/string/memchr.S  |  115 ++++++++++++++++++++---
 lib/libc/arch/i386/string/rindex.S  |   34 +------
 lib/libc/arch/i386/string/strcat.S  |  168 ++++++++++++++++++++++++-----------
 lib/libc/arch/i386/string/strchr.S  |  107 ++++++++++++++++++++++-
 lib/libc/arch/i386/string/strcmp.S  |  135 +++++++++++++---------------
 lib/libc/arch/i386/string/strlen.S  |  142 +++++++++++++++++++++++++++--
 lib/libc/arch/i386/string/strrchr.S |  100 ++++++++++++++++++++-
 8 files changed, 611 insertions(+), 221 deletions(-)

diffs (truncated from 917 to 300 lines):

diff -r 9328f2ee8480 -r 00a0298a8ed3 lib/libc/arch/i386/string/index.S
--- a/lib/libc/arch/i386/string/index.S Fri Feb 04 17:10:40 2005 +0000
+++ b/lib/libc/arch/i386/string/index.S Fri Feb 04 18:12:52 2005 +0000
@@ -1,29 +1,4 @@
-/*
- * Written by J.T. Conklin <jtc%NetBSD.org@localhost>.
- * Public domain.
- */
-
-#include <machine/asm.h>
-
-#if defined(LIBC_SCCS)
-       RCSID("$NetBSD: index.S,v 1.11 2003/07/26 19:24:34 salo Exp $")
-#endif
+/* $NetBSD: index.S,v 1.12 2005/02/04 18:12:52 drochner Exp $ */
 
-#ifdef STRCHR
-ENTRY(strchr)
-#else
-ENTRY(index)
-#endif
-       movl    4(%esp),%eax
-       movb    8(%esp),%cl
-       _ALIGN_TEXT,0x90
-L1:
-       movb    (%eax),%dl
-       cmpb    %dl,%cl                 /* found char??? */
-       je      L2
-       incl    %eax
-       testb   %dl,%dl                 /* null terminator??? */
-       jnz     L1
-       xorl    %eax,%eax
-L2:
-       ret
+#define INDEX
+#include "strchr.S"
diff -r 9328f2ee8480 -r 00a0298a8ed3 lib/libc/arch/i386/string/memchr.S
--- a/lib/libc/arch/i386/string/memchr.S        Fri Feb 04 17:10:40 2005 +0000
+++ b/lib/libc/arch/i386/string/memchr.S        Fri Feb 04 18:12:52 2005 +0000
@@ -1,29 +1,108 @@
 /*
- * Written by J.T. Conklin <jtc%NetBSD.org@localhost>.
+ * Written by J.T. Conklin <jtc%acorntoolworks.com@localhost>
  * Public domain.
  */
 
 #include <machine/asm.h>
 
 #if defined(LIBC_SCCS)
-       RCSID("$NetBSD: memchr.S,v 1.10 2003/07/26 19:24:34 salo Exp $")
+       RCSID("$NetBSD: memchr.S,v 1.11 2005/02/04 18:12:52 drochner Exp $")
 #endif
 
 ENTRY(memchr)
-       pushl   %edi
-       movl    8(%esp),%edi            /* string address */
-       movl    12(%esp),%eax           /* set character to search for */
-       movl    16(%esp),%ecx           /* set length of search */
-       testl   %ecx,%ecx               /* test for len == 0 */
-       jz      L1
-       cld                             /* set search forward */
-       repne                           /* search! */
-       scasb
-       jne     L1                      /* scan failed, return null */
-       leal    -1(%edi),%eax           /* adjust result of scan */
-       popl    %edi
+       pushl   %esi            /* save %esi; it holds the remaining byte count */
+       movl    8(%esp),%eax    /* eax = s, the current search pointer */
+       movzbl  12(%esp),%ecx   /* ecx = (unsigned char)c, bits 8-31 cleared */
+       movl    16(%esp),%esi   /* esi = n, bytes left to examine */
+
+       /*
+        * Align to word boundary
+        * Consider unrolling loop?
+        */
+       testl   %esi,%esi       /* nbytes == 0? */
+       je      .Lzero
+.Lalign:
+       testb   $3,%al
+       je      .Lword_aligned
+       cmpb    (%eax),%cl
+       je      .Ldone
+       incl    %eax
+       decl    %esi
+       jnz     .Lalign
+
+.Lword_aligned:
+       /* copy char to all bytes in word (needs bits 8-31 of %ecx clear) */
+       movb    %cl,%ch
+       movl    %ecx,%edx
+       sall    $16,%ecx
+       orl     %edx,%ecx
+
+       _ALIGN_TEXT
+.Lloop:
+       cmpl    $3,%esi         /* nbytes >= 4? */
+       jbe     .Lbyte
+       movl    (%eax),%edx
+       addl    $4,%eax
+       xorl    %ecx,%edx       /* bytes equal to ch become zero */
+       subl    $4,%esi
+       subl    $0x01010101,%edx
+       testl   $0x80808080,%edx
+       je      .Lloop
+
+       /*
+        * In rare cases, the above loop may exit prematurely. We must
+        * return to the loop if none of the bytes in the word are
+        * equal to ch.
+        */
+
+       /*
+        * High load-use latency on the Athlon leads to significant
+        * stalls, so we preload the next char as soon as possible
+        * instead of using cmp mem8, reg8.
+        *
+        * Alignment here avoids a stall on the Athlon, even though
+        * it's not a branch target.
+        */
+       _ALIGN_TEXT
+       cmpb    -4(%eax),%cl    /* 1st byte == ch? */
+       movb    -3(%eax),%dl
+       jne     1f
+       subl    $4,%eax
+       jmp     .Ldone
+
+       _ALIGN_TEXT
+1:     cmpb    %dl,%cl         /* 2nd byte == ch? */
+       movb    -2(%eax),%dl
+       jne     1f
+       subl    $3,%eax
+       jmp     .Ldone
+
+       _ALIGN_TEXT
+1:     cmpb    %dl,%cl         /* 3rd byte == ch? */
+       movb    -1(%eax),%dl
+       jne     1f
+       subl    $2,%eax
+       jmp     .Ldone
+
+       _ALIGN_TEXT
+1:     cmpb    %dl,%cl         /* 4th byte == ch? */
+       jne     .Lloop
+       decl    %eax
+       jmp     .Ldone
+
+.Lbyte:
+       testl   %esi,%esi
+       je      .Lzero
+.Lbyte_loop:
+       cmpb    (%eax),%cl
+       je      .Ldone
+       incl    %eax
+       decl    %esi
+       jnz     .Lbyte_loop
+
+.Lzero:
+       xorl    %eax,%eax       /* not found: return NULL */
+
+.Ldone:
+       popl    %esi            /* eax = address of the match, or NULL */
        ret
-       _ALIGN_TEXT,0x90
-L1:    xorl    %eax,%eax
-       popl    %edi
-       ret
diff -r 9328f2ee8480 -r 00a0298a8ed3 lib/libc/arch/i386/string/rindex.S
--- a/lib/libc/arch/i386/string/rindex.S        Fri Feb 04 17:10:40 2005 +0000
+++ b/lib/libc/arch/i386/string/rindex.S        Fri Feb 04 18:12:52 2005 +0000
@@ -1,32 +1,4 @@
-/*
- * Written by J.T. Conklin <jtc%NetBSD.org@localhost>.
- * Public domain.
- */
-
-#include <machine/asm.h>
-
-#if defined(LIBC_SCCS)
-       RCSID("$NetBSD: rindex.S,v 1.11 2003/07/26 19:24:34 salo Exp $")
-#endif
+/* $NetBSD: rindex.S,v 1.12 2005/02/04 18:12:52 drochner Exp $ */
 
-#ifdef STRRCHR
-ENTRY(strrchr)
-#else
-ENTRY(rindex)
-#endif
-       pushl   %ebx
-       movl    8(%esp),%edx
-       movb    12(%esp),%cl
-       xorl    %eax,%eax               /* init pointer to null */
-       _ALIGN_TEXT,0x90
-L1:
-       movb    (%edx),%bl
-       cmpb    %bl,%cl
-       jne     L2
-       movl    %edx,%eax
-L2:
-       incl    %edx
-       testb   %bl,%bl                 /* null terminator??? */
-       jnz     L1
-       popl    %ebx
-       ret
+#define RINDEX
+#include "strrchr.S"
diff -r 9328f2ee8480 -r 00a0298a8ed3 lib/libc/arch/i386/string/strcat.S
--- a/lib/libc/arch/i386/string/strcat.S        Fri Feb 04 17:10:40 2005 +0000
+++ b/lib/libc/arch/i386/string/strcat.S        Fri Feb 04 18:12:52 2005 +0000
@@ -1,69 +1,127 @@
 /*
- * Written by J.T. Conklin <jtc%NetBSD.org@localhost>.
+ * Written by J.T. Conklin <jtc%acorntoolworks.com@localhost>
  * Public domain.
  */
 
 #include <machine/asm.h>
 
 #if defined(LIBC_SCCS)
-       RCSID("$NetBSD: strcat.S,v 1.10 2003/07/26 19:24:35 salo Exp $")
+       RCSID("$NetBSD: strcat.S,v 1.11 2005/02/04 18:12:52 drochner Exp $")
 #endif
 
-/*
- * NOTE: I've unrolled the loop eight times: large enough to make a
- * significant difference, and small enough not to totally trash the
- * cache.
- */
-
 ENTRY(strcat)
-       pushl   %edi                    /* save edi */
-       movl    8(%esp),%edi            /* dst address */
-       movl    12(%esp),%edx           /* src address */
-       pushl   %edi                    /* push destination address */
+       pushl   %ebx
+       movl    8(%esp),%ecx
+       movl    12(%esp),%eax
+
+       /*
+        * Align destination to word boundary.
+        * Consider unrolling loop?
+        */
+.Lscan:
+.Lscan_align:
+       testb   $3,%cl
+       je      .Lscan_aligned
+       cmpb    $0,(%ecx)
+       je      .Lcopy
+       incl    %ecx
+       jmp     .Lscan_align
+
+       _ALIGN_TEXT
+.Lscan_aligned:
+.Lscan_loop:
+       movl    (%ecx),%ebx
+       addl    $4,%ecx
+       leal    -0x01010101(%ebx),%edx
+       testl   $0x80808080,%edx
+       je      .Lscan_loop
 
-       cld                             /* set search forward */
-       xorl    %eax,%eax               /* set search for null terminator */
-       movl    $-1,%ecx                /* set search for lots of characters */
-       repne                           /* search! */
-       scasb
+       /*
+        * In rare cases, the above loop may exit prematurely. We must
+        * return to the loop if none of the bytes in the word equal 0.
+        */
+
+       /*
+        * The optimal code for determining whether each byte is zero
+        * differs by processor.  This space-optimized code should be
+        * acceptable on all, especially since we don't expect it to
+        * be run frequently.
+        */
 
-       leal    -1(%edi),%ecx           /* correct dst address */
+       testb   %bl,%bl         /* 1st byte == 0? */
+       jne     1f
+       subl    $4,%ecx
+       jmp     .Lcopy
+
+1:     testb   %bh,%bh         /* 2nd byte == 0? */
+       jne     1f
+       subl    $3,%ecx
+       jmp     .Lcopy
+
+1:     shrl    $16,%ebx
+       testb   %bl,%bl         /* 3rd byte == 0? */
+       jne     1f
+       subl    $2,%ecx
+       jmp     .Lcopy
+
+1:     testb   %bh,%bh         /* 4th byte == 0? */
+       jne     .Lscan_loop
+       subl    $1,%ecx
 
-       _ALIGN_TEXT,0x90
-L1:    movb    (%edx),%al              /* unroll loop, but not too much */
-       movb    %al,(%ecx)



Home | Main Index | Thread Index | Old Index