Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/lib/libc/arch/i386/string switch to J.T.Conklin's optimized ...
details: https://anonhg.NetBSD.org/src/rev/00a0298a8ed3
branches: trunk
changeset: 573670:00a0298a8ed3
user: drochner <drochner%NetBSD.org@localhost>
date: Fri Feb 04 18:12:52 2005 +0000
description:
switch to J.T.Conklin's optimized str* functions
(submitted per PR i386/25263)
diffstat:
lib/libc/arch/i386/string/index.S | 31 +------
lib/libc/arch/i386/string/memchr.S | 115 ++++++++++++++++++++---
lib/libc/arch/i386/string/rindex.S | 34 +------
lib/libc/arch/i386/string/strcat.S | 168 ++++++++++++++++++++++++-----------
lib/libc/arch/i386/string/strchr.S | 107 ++++++++++++++++++++++-
lib/libc/arch/i386/string/strcmp.S | 135 +++++++++++++---------------
lib/libc/arch/i386/string/strlen.S | 142 +++++++++++++++++++++++++++--
lib/libc/arch/i386/string/strrchr.S | 100 ++++++++++++++++++++-
8 files changed, 611 insertions(+), 221 deletions(-)
diffs (truncated from 917 to 300 lines):
diff -r 9328f2ee8480 -r 00a0298a8ed3 lib/libc/arch/i386/string/index.S
--- a/lib/libc/arch/i386/string/index.S Fri Feb 04 17:10:40 2005 +0000
+++ b/lib/libc/arch/i386/string/index.S Fri Feb 04 18:12:52 2005 +0000
@@ -1,29 +1,4 @@
-/*
- * Written by J.T. Conklin <jtc%NetBSD.org@localhost>.
- * Public domain.
- */
-
-#include <machine/asm.h>
-
-#if defined(LIBC_SCCS)
- RCSID("$NetBSD: index.S,v 1.11 2003/07/26 19:24:34 salo Exp $")
-#endif
+/* $NetBSD: index.S,v 1.12 2005/02/04 18:12:52 drochner Exp $ */
-#ifdef STRCHR
-ENTRY(strchr)
-#else
-ENTRY(index)
-#endif
- movl 4(%esp),%eax
- movb 8(%esp),%cl
- _ALIGN_TEXT,0x90
-L1:
- movb (%eax),%dl
- cmpb %dl,%cl /* found char??? */
- je L2
- incl %eax
- testb %dl,%dl /* null terminator??? */
- jnz L1
- xorl %eax,%eax
-L2:
- ret
+#define INDEX
+#include "strchr.S"
diff -r 9328f2ee8480 -r 00a0298a8ed3 lib/libc/arch/i386/string/memchr.S
--- a/lib/libc/arch/i386/string/memchr.S Fri Feb 04 17:10:40 2005 +0000
+++ b/lib/libc/arch/i386/string/memchr.S Fri Feb 04 18:12:52 2005 +0000
@@ -1,29 +1,108 @@
/*
- * Written by J.T. Conklin <jtc%NetBSD.org@localhost>.
+ * Written by J.T. Conklin <jtc%acorntoolworks.com@localhost>
* Public domain.
*/
#include <machine/asm.h>
#if defined(LIBC_SCCS)
- RCSID("$NetBSD: memchr.S,v 1.10 2003/07/26 19:24:34 salo Exp $")
+ RCSID("$NetBSD: memchr.S,v 1.11 2005/02/04 18:12:52 drochner Exp $")
#endif
ENTRY(memchr)
- pushl %edi
- movl 8(%esp),%edi /* string address */
- movl 12(%esp),%eax /* set character to search for */
- movl 16(%esp),%ecx /* set length of search */
- testl %ecx,%ecx /* test for len == 0 */
- jz L1
- cld /* set search forward */
- repne /* search! */
- scasb
- jne L1 /* scan failed, return null */
- leal -1(%edi),%eax /* adjust result of scan */
- popl %edi
+ /*
+ * void *memchr(const void *s, int c, size_t n)
+ *
+ * Stack arguments (offsets are after the pushl below):
+ * 8(%esp) = s, 12(%esp) = c, 16(%esp) = n
+ * Returns in %eax a pointer to the first byte equal to
+ * (unsigned char)c within the first n bytes of s, or 0 if none.
+ *
+ * Register roles:
+ * %eax = current position in the buffer
+ * %ecx = search byte (later replicated into all four bytes)
+ * %edx = scratch
+ * %esi = bytes remaining (saved and restored)
+ */
+ pushl %esi
+ movl 8(%esp),%eax /* eax = s */
+ /*
+ * Load c zero-extended: only the low byte takes part in the
+ * comparison. Any high bits left in %ecx would be or'ed into
+ * the replicated word pattern below and make the word loop
+ * miss matches (e.g. c == -128 passed from a signed char).
+ */
+ movzbl 12(%esp),%ecx /* ecx = (unsigned char)c */
+ movl 16(%esp),%esi /* esi = n */
+
+ /*
+ * Align to word boundary, comparing one byte at a time.
+ * Consider unrolling loop?
+ */
+ testl %esi,%esi /* nbytes == 0? */
+ je .Lzero
+.Lalign:
+ testb $3,%al
+ je .Lword_aligned
+ cmpb (%eax),%cl
+ je .Ldone
+ incl %eax
+ decl %esi
+ jnz .Lalign
+ /* nbytes ran out: the word loop below sends us on to .Lzero */
+
+.Lword_aligned:
+ /* copy char to all bytes in word */
+ movb %cl,%ch
+ movl %ecx,%edx
+ sall $16,%ecx
+ orl %edx,%ecx
+
+ _ALIGN_TEXT
+.Lloop:
+ cmpl $3,%esi /* fewer than 4 bytes left? */
+ jbe .Lbyte
+ movl (%eax),%edx
+ addl $4,%eax
+ xorl %ecx,%edx /* matching bytes become zero */
+ subl $4,%esi
+ subl $0x01010101,%edx /* zero bytes borrow into bit 7 */
+ testl $0x80808080,%edx
+ je .Lloop
+
+ /*
+ * In rare cases, the above loop may exit prematurely. We must
+ * return to the loop if none of the bytes in the word are
+ * equal to ch.
+ */
+
+ /*
+ * High load-use latency on the Athlon leads to significant
+ * stalls, so we preload the next char as soon as possible
+ * instead of using cmp mem8, reg8.
+ *
+ * Alignment here avoids a stall on the Athlon, even though
+ * it's not a branch target.
+ */
+ _ALIGN_TEXT
+ cmpb -4(%eax),%cl /* 1st byte == ch? */
+ movb -3(%eax),%dl /* movb leaves the cmpb flags intact */
+ jne 1f
+ subl $4,%eax
+ jmp .Ldone
+
+ _ALIGN_TEXT
+1: cmpb %dl,%cl /* 2nd byte == ch? */
+ movb -2(%eax),%dl
+ jne 1f
+ subl $3,%eax
+ jmp .Ldone
+
+ _ALIGN_TEXT
+1: cmpb %dl,%cl /* 3rd byte == ch? */
+ movb -1(%eax),%dl
+ jne 1f
+ subl $2,%eax
+ jmp .Ldone
+
+ _ALIGN_TEXT
+1: cmpb %dl,%cl /* 4th byte == ch? */
+ jne .Lloop /* false alarm, keep scanning */
+ decl %eax
+ jmp .Ldone
+
+.Lbyte:
+ /* at most 3 bytes left: finish one byte at a time */
+ testl %esi,%esi
+ je .Lzero
+.Lbyte_loop:
+ cmpb (%eax),%cl
+ je .Ldone
+ incl %eax
+ decl %esi
+ jnz .Lbyte_loop
+
+.Lzero:
+ xorl %eax,%eax /* not found: return NULL */
+
+.Ldone:
+ popl %esi
ret
- _ALIGN_TEXT,0x90
-L1: xorl %eax,%eax
- popl %edi
- ret
diff -r 9328f2ee8480 -r 00a0298a8ed3 lib/libc/arch/i386/string/rindex.S
--- a/lib/libc/arch/i386/string/rindex.S Fri Feb 04 17:10:40 2005 +0000
+++ b/lib/libc/arch/i386/string/rindex.S Fri Feb 04 18:12:52 2005 +0000
@@ -1,32 +1,4 @@
-/*
- * Written by J.T. Conklin <jtc%NetBSD.org@localhost>.
- * Public domain.
- */
-
-#include <machine/asm.h>
-
-#if defined(LIBC_SCCS)
- RCSID("$NetBSD: rindex.S,v 1.11 2003/07/26 19:24:34 salo Exp $")
-#endif
+/* $NetBSD: rindex.S,v 1.12 2005/02/04 18:12:52 drochner Exp $ */
-#ifdef STRRCHR
-ENTRY(strrchr)
-#else
-ENTRY(rindex)
-#endif
- pushl %ebx
- movl 8(%esp),%edx
- movb 12(%esp),%cl
- xorl %eax,%eax /* init pointer to null */
- _ALIGN_TEXT,0x90
-L1:
- movb (%edx),%bl
- cmpb %bl,%cl
- jne L2
- movl %edx,%eax
-L2:
- incl %edx
- testb %bl,%bl /* null terminator??? */
- jnz L1
- popl %ebx
- ret
+#define RINDEX
+#include "strrchr.S"
diff -r 9328f2ee8480 -r 00a0298a8ed3 lib/libc/arch/i386/string/strcat.S
--- a/lib/libc/arch/i386/string/strcat.S Fri Feb 04 17:10:40 2005 +0000
+++ b/lib/libc/arch/i386/string/strcat.S Fri Feb 04 18:12:52 2005 +0000
@@ -1,69 +1,127 @@
/*
- * Written by J.T. Conklin <jtc%NetBSD.org@localhost>.
+ * Written by J.T. Conklin <jtc%acorntoolworks.com@localhost>
* Public domain.
*/
#include <machine/asm.h>
#if defined(LIBC_SCCS)
- RCSID("$NetBSD: strcat.S,v 1.10 2003/07/26 19:24:35 salo Exp $")
+ RCSID("$NetBSD: strcat.S,v 1.11 2005/02/04 18:12:52 drochner Exp $")
#endif
-/*
- * NOTE: I've unrolled the loop eight times: large enough to make a
- * significant difference, and small enough not to totally trash the
- * cache.
- */
-
ENTRY(strcat)
- pushl %edi /* save edi */
- movl 8(%esp),%edi /* dst address */
- movl 12(%esp),%edx /* src address */
- pushl %edi /* push destination address */
+ pushl %ebx
+ movl 8(%esp),%ecx
+ movl 12(%esp),%eax
+
+ /*
+ * Align destination to word boundary.
+ * Consider unrolling loop?
+ */
+.Lscan:
+.Lscan_align:
+ testb $3,%cl
+ je .Lscan_aligned
+ cmpb $0,(%ecx)
+ je .Lcopy
+ incl %ecx
+ jmp .Lscan_align
+
+ _ALIGN_TEXT
+.Lscan_aligned:
+.Lscan_loop:
+ movl (%ecx),%ebx
+ addl $4,%ecx
+ leal -0x01010101(%ebx),%edx
+ testl $0x80808080,%edx
+ je .Lscan_loop
- cld /* set search forward */
- xorl %eax,%eax /* set search for null terminator */
- movl $-1,%ecx /* set search for lots of characters */
- repne /* search! */
- scasb
+ /*
+ * In rare cases, the above loop may exit prematurely. We must
+ * return to the loop if none of the bytes in the word equal 0.
+ */
+
+ /*
+ * The optimal code for determining whether each byte is zero
+ * differs by processor. This space-optimized code should be
+ * acceptable on all, especially since we don't expect it to
+ * be run frequently.
+ */
- leal -1(%edi),%ecx /* correct dst address */
+ testb %bl,%bl /* 1st byte == 0? */
+ jne 1f
+ subl $4,%ecx
+ jmp .Lcopy
+
+1: testb %bh,%bh /* 2nd byte == 0? */
+ jne 1f
+ subl $3,%ecx
+ jmp .Lcopy
+
+1: shrl $16,%ebx
+ testb %bl,%bl /* 3rd byte == 0? */
+ jne 1f
+ subl $2,%ecx
+ jmp .Lcopy
+
+1: testb %bh,%bh /* 4th byte == 0? */
+ jne .Lscan_loop
+ subl $1,%ecx
- _ALIGN_TEXT,0x90
-L1: movb (%edx),%al /* unroll loop, but not too much */
- movb %al,(%ecx)
Home |
Main Index |
Thread Index |
Old Index