[src/trunk]: src/common/lib/libc/arch/arm/string Make this work on all ARMs but keep the armv6 optimizations
details: https://anonhg.NetBSD.org/src/rev/a243311703e7
branches: trunk
changeset: 783596:a243311703e7
user: matt <matt%NetBSD.org@localhost>
date: Mon Dec 31 07:58:44 2012 +0000
description:
Make this work on all ARMs but keep the armv6 optimizations. It is as fast as
the existing strlen for small strings, and once strings are 8 bytes or more in
length it starts getting significantly faster. For really long strings,
compared to the existing strlen, this uses about 1/2 of the cycles for the
non-armv6 version and about 1/3 of the cycles for the armv6 version.
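For readers who do not read ARM assembly, here is a rough C model of the
armv6 trick this change keeps. The helper and builtin names below are
illustrative stand-ins, not part of the patch: UQADD8 is a single
instruction on armv6, and the byte-reverse/count-leading-zeros pair
corresponds to the REV/CLZ epilogue in the diff that follows.

    #include <stdint.h>

    /* Model of ARM UQADD8: per-byte unsigned saturating add. */
    static uint32_t
    uqadd8_model(uint32_t a, uint32_t b)
    {
            uint32_t r = 0;
            for (int i = 0; i < 32; i += 8) {
                    uint32_t s = ((a >> i) & 0xff) + ((b >> i) & 0xff);
                    r |= (s > 0xff ? 0xff : s) << i; /* saturate, don't wrap */
            }
            return r;
    }

    /*
     * Adding 254 to every byte saturates each non-NUL byte to 0xff, so
     * the complemented word is zero unless some byte was NUL.  On a hit,
     * a byte-reverse plus count-leading-zeros yields the index of the
     * first NUL byte (little-endian case shown; big-endian skips the
     * byte-reverse, as the assembly does).
     */
    static int
    first_nul_index(uint32_t w)
    {
            uint32_t m = ~uqadd8_model(w, 0xfefefefe);
            if (m == 0)
                    return -1;      /* no NUL in this word */
            return __builtin_clz(__builtin_bswap32(m)) >> 3;
    }
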
diffstat:
common/lib/libc/arch/arm/string/strlen_armv6.S | 84 +++++++++++++++++--------
1 files changed, 57 insertions(+), 27 deletions(-)
diffs (123 lines):
diff -r fa461905efa1 -r a243311703e7 common/lib/libc/arch/arm/string/strlen_armv6.S
--- a/common/lib/libc/arch/arm/string/strlen_armv6.S Mon Dec 31 03:23:53 2012 +0000
+++ b/common/lib/libc/arch/arm/string/strlen_armv6.S Mon Dec 31 07:58:44 2012 +0000
@@ -29,37 +29,44 @@
#include <machine/asm.h>
-RCSID("$NetBSD: strlen_armv6.S,v 1.2 2012/12/29 05:36:57 matt Exp $")
+RCSID("$NetBSD: strlen_armv6.S,v 1.3 2012/12/31 07:58:44 matt Exp $")
+
+#ifdef __ARMEL__
+#define BYTE0 0x000000ff
+#define BYTE1 0x0000ff00
+#define BYTE2 0x00ff0000
+#define BYTE3 0xff000000
+#else
+#define BYTE0 0xff000000
+#define BYTE1 0x00ff0000
+#define BYTE2 0x0000ff00
+#define BYTE3 0x000000ff
+#endif
.text
ENTRY(strlen)
add ip, r0, #4 /* for the final post-inc */
- ands r1, r0, #3 /* get misalignment */
- bic r0, r0, #3 /* align to word boundary */
- ldr r3, [r0], #4 /* load first word */
- beq .Lpre_main_loop /* misaligned? no, go to loop */
- /*
- * For misaligned string, we need to make sure that the bytes before
- * the start of the string will not cause a false match to a NUL.
- */
- mvn r2, #0 /* create a mask */
- mov r1, r1, lsl #3 /* bytes -> bits */
-#ifdef __ARMEL__
- mov r2, r2, lsl r1 /* clear relavent bytes */
-#else
- mov r2, r2, lsr r1 /* clear relavent bytes */
-#endif
- mvn r2, r2 /* invert mask */
- orr r3, r3, r2 /* orr in mask for leading bytes */
+1: tst r0, #3 /* test for word alignment */
+ beq .Lpre_main_loop /* finally word aligned */
+ ldrb r3, [r0], #1 /* load a byte */
+ teq r3, #0 /* is it 0? */
+ bne 1b /* no, try next byte */
+ sub ip, ip, #3 /* subtract (4 - the NUL) */
+ sub r0, r0, ip /* subtract start */
+ RET /* return */
.Lpre_main_loop:
-#ifdef _ARM_ARCH_7
+#if defined(_ARM_ARCH_6)
+#if defined(_ARM_ARCH_7)
movw r1, #0xfefe /* magic constant; 254 in each byte */
#else
mov r1, #0xfe /* put 254 in low byte */
orr r1, r1, r1, lsl #8 /* move to next byte */
#endif
orr r1, r1, r1, lsl #16 /* move to next halfword */
+#endif /* _ARM_ARCH_6 */
.Lmain_loop:
+ ldr r3, [r0], #4 /* load next word */
+#if defined(_ARM_ARCH_6)
/*
* Add 254 to each byte using the UQADD8 (unsigned saturating add 8)
* instruction. For every non-NUL byte, the result for that byte will
@@ -67,24 +74,47 @@
* result, if the result is non-0 then we must have encountered a NUL.
*/
uqadd8 r3, r3, r1 /* magic happens here */
- mvns r3, r3 /* is the complemented result 0? */
- bne .Lreturn /* no, then we encountered a NUL */
- ldr r3, [r0], #4 /* load next word */
- b .Lmain_loop /* and go */
+ mvns r3, r3 /* is the complemented result non-0? */
+ beq .Lmain_loop /* no, then we encountered no NULs */
+#else
+ /*
+ * No fancy shortcuts so just test each byte lane for a NUL.
+ * (other tests for NULs in a word take more instructions/cycles).
+ */
+ tst r3, #BYTE0 /* is this byte 0? */
+ tstne r3, #BYTE1 /* no, is this byte 0? */
+ tstne r3, #BYTE2 /* no, is this byte 0? */
+ tstne r3, #BYTE3 /* no, is this byte 0? */
+ bne .Lmain_loop /* no, then get next word */
+#endif
+#if defined(_ARM_ARCH_6)
/*
* We encountered a NUL. Find out where by doing a CLZ and then
* shifting right by 3. That will be the number of non-NUL bytes.
*/
-.Lreturn:
#ifdef __ARMEL__
rev r3, r3 /* we want this in BE for the CLZ */
#endif
clz r3, r3 /* count how many leading zeros */
add r0, r0, r3, lsr #3 /* divide that by 8 and add to count */
+#else
/*
- * r0 now points to 4 past the NUL due to the post-inc. Subtract
- * the start of the string (which also has 4 added to it to compensate
- * for the post-inc.
+ * We encountered a NUL.
+ */
+ tst r3, #BYTE0 /* 1st byte was NUL? */
+ beq 1f /* yes, done adding */
+ add r0, r0, #1 /* we have one more non-NUL byte */
+ tst r3, #BYTE1 /* 2nd byte was NUL? */
+ beq 1f /* yes, done adding */
+ add r0, r0, #1 /* we have one more non-NUL byte */
+ tst r3, #BYTE2 /* 3rd byte was NUL? */
+ addne r0, r0, #1 /* no, we have one more non-NUL byte */
+1:
+#endif /* _ARM_ARCH_6 */
+ /*
+ * r0 now points to 4 past the NUL due to the post-inc. Subtract the
+ * start of the string (which also has 4 added to it to compensate for
+ * the post-inc.
*/
sub r0, r0, ip /* subtract start to get length */
RET
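
To tie the pieces together, here is a portable C sketch of the overall flow
the patch produces: the bytewise alignment prologue, the per-lane NUL test
used when UQADD8 is unavailable, and the closing add ladder. sketch_strlen
and its BYTE* macros mirror, but are not, the committed code; the per-lane
tst chain reflects the diff's note that other word-wide NUL tests cost more
instructions on pre-armv6 cores.

    #include <stdint.h>
    #include <stddef.h>

    #if defined(__ARMEB__)          /* big-endian: first byte is the MSB */
    #define BYTE0   0xff000000u
    #define BYTE1   0x00ff0000u
    #define BYTE2   0x0000ff00u
    #define BYTE3   0x000000ffu
    #else                           /* little-endian (__ARMEL__) */
    #define BYTE0   0x000000ffu
    #define BYTE1   0x0000ff00u
    #define BYTE2   0x00ff0000u
    #define BYTE3   0xff000000u
    #endif

    static size_t
    sketch_strlen(const char *s)
    {
            const char *p = s;
            uint32_t w;

            /* Prologue: step one byte at a time until p is word-aligned
             * (the "1:" loop in the patch). */
            while ((uintptr_t)p & 3) {
                    if (*p == '\0')
                            return (size_t)(p - s);
                    p++;
            }

            /* Main loop: load a word and test each byte lane for a NUL,
             * mirroring the tst/tstne chain of the non-armv6 path.  (The
             * aligned load past the NUL is safe in the assembly; in C it
             * is merely a sketch.) */
            for (;;) {
                    w = *(const uint32_t *)(const void *)p;
                    p += 4;
                    if ((w & BYTE0) == 0 || (w & BYTE1) == 0 ||
                        (w & BYTE2) == 0 || (w & BYTE3) == 0)
                            break;  /* some lane holds the NUL */
            }

            /* Epilogue: count the non-NUL bytes before the NUL, as the
             * tst/beq/add ladder does; p, like r0, is 4 past the word. */
            size_t len = (size_t)(p - 4 - s);
            if (w & BYTE0) {
                    len++;
                    if (w & BYTE1) {
                            len++;
                            if (w & BYTE2)
                                    len++;
                    }
            }
            return len;
    }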