Source-Changes-HG archive

[src/trunk]: src/common/lib/libc/arch/arm/string Slightly improved (can deal w...



details:   https://anonhg.NetBSD.org/src/rev/bfd0fcbe1e3f
branches:  trunk
changeset: 783341:bfd0fcbe1e3f
user:      matt <matt%NetBSD.org@localhost>
date:      Sat Dec 15 22:23:31 2012 +0000

description:
Slightly improved (can deal with all 16 bytes being non-NUL and quickly
proceed to the next qword).

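As a rough illustration of what the routine does, here is a hypothetical plain C
sketch of the loop structure (the names are illustrative, and the leading-byte
masking the real routine does for an unaligned start is omitted): scan one
16-byte quad-word per iteration, and when all 16 bytes are non-NUL simply add
16 to the length and fetch the next quad-word, which is the path this revision
makes cheaper.

#include <stddef.h>

/*
 * Hypothetical, simplified model of the qword-at-a-time scan; the real
 * implementation is the NEON assembly below and also masks the bytes
 * before an unaligned start, which this sketch ignores.
 */
size_t
strlen_by_qword(const char *s)
{
	size_t len = 0;

	for (;;) {
		int nul_at = -1;

		/* look for a NUL anywhere in the next 16 bytes */
		for (int i = 0; i < 16; i++) {
			if (s[len + i] == '\0') {
				nul_at = i;
				break;
			}
		}
		if (nul_at < 0) {
			len += 16;	/* all 16 bytes non-NUL: next qword */
			continue;
		}
		return len + nul_at;	/* NUL found inside this qword */
	}
}
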
diffstat:

 common/lib/libc/arch/arm/string/strlen_neon.S |  36 +++++++++++++++------------
 1 files changed, 20 insertions(+), 16 deletions(-)

diffs (61 lines):

diff -r a1aa781491fa -r bfd0fcbe1e3f common/lib/libc/arch/arm/string/strlen_neon.S
--- a/common/lib/libc/arch/arm/string/strlen_neon.S     Sat Dec 15 21:50:43 2012 +0000
+++ b/common/lib/libc/arch/arm/string/strlen_neon.S     Sat Dec 15 22:23:31 2012 +0000
@@ -29,7 +29,7 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: strlen_neon.S,v 1.1 2012/12/15 19:26:34 matt Exp $")
+RCSID("$NetBSD: strlen_neon.S,v 1.2 2012/12/15 22:23:31 matt Exp $")
        .text
 
 ENTRY(strlen)
@@ -39,6 +39,9 @@
        veor    q2, q2, q2      /* clear mask */
        mov     r3, #7          /* NBBY - 1 */
        vdup.32 q3, r3          /* dup throughout q3 */
+       mov     r3, #0x04       /* magic since there are 4 bytes per U32 */
+       orr     r3, r3, lsl #8  /* copy to next 8 bits */
+       orr     r3, r3, lsl #16 /* copy to upper 16 bits */
        beq     .Lmain_loop
        veor    q0, q0, q0      /* clear q0 */
        vmvn    q2, q2          /* set all 16 bytes of mask to all 1s */
@@ -64,22 +67,23 @@
        vorr    q0, q0, q2      /* or "in" leading byte mask */
        veor    q2, q2, q2      /* clear byte mask */
        vceq.i8 q1, q0, #0      /* test each byte for 0 */
+       /* Why couldn't there be a 64-bit CLZ? */
        vclz.i32 q1, q1         /* count leading zeroes to find the 0 byte */
        vadd.i32 q1, q1, q3     /* round up to byte boundary */
        vshr.u32 q1, q1, #3     /* convert to bytes */
-       vmov    r2, r3, d3      /* get lo & hi counts */
-       add     r0, r0, r3      /* add bytes to count */
-       cmp     r3, #4          /* less than 4 means a NUL encountered */
-       bxlt    lr              /* return */
-       add     r0, r0, r2      /* add bytes to count */
-       cmp     r2, #4          /* less than 4 means a NUL encountered */
-       bxlt    lr              /* return */
-       vmov    r2, r3, d2      /* get lo & hi counts */
-       add     r0, r0, r3      /* add bytes to count */
-       cmp     r3, #4          /* less than 4 means a NUL encountered */
-       bxlt    lr              /* return */
-       add     r0, r0, r2      /* add bytes to count */
-       cmp     r2, #4          /* less than 4 means a NUL encountered */
-       bxlt    lr              /* return */
-       b       .Lmain_loop
+       vmovn.i32 d0, q1        /* 4 I32 -> 4 I16 */
+       vmovn.i16 d0, q0        /* 4 I16 -> 4  I8 */
+       vmov    r2, s0          /* get counts */
+       cmp     r2, r3          /* count eq 4 in each byte? */
+       addeq   r0, #16         /*  no NULs */
+       beq     .Lmain_loop     /* get next qword */
+                               /* r2[31:24] already has 1st word byte count */
+       tst     r2, #(4 << 24)  /* first word has 4 non-NUL? */
+       addne   r2, r2, r2, lsl #8 /* add second word byte-count */
+       tstne   r2, #(4 << 16)  /* second word has 4 non-NUL? */
+       addne   r2, r2, r2, lsl #16 /* add third word byte-count */
+       tstne   r2, #(4 << 8)   /* third word has 4 non-NUL? */
+       addne   r2, r2, r2, lsl #24 /* add fourth word byte-count */
+       add     r0, r0, r2, lsr #24 /* add accumulated byte-count to length */
+       RET                     /* and return. */
 END(strlen)
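
The tail of the new code is easier to follow with a hypothetical C model of what
the vclz.i32/vadd/vshr and vmovn.i32/vmovn.i16 sequence computes (the function
names and the exact lane-to-word mapping are illustrative assumptions; the byte
ordering within each word is set up earlier in the routine, outside this hunk):

#include <stdint.h>

/*
 * Hypothetical model of the per-word count.  "mask" stands for one
 * 32-bit lane of the vceq.i8 result: 0xff in each byte that was NUL,
 * 0x00 elsewhere, assuming the string byte that comes first in memory
 * sits in the most significant byte of the lane.
 */
static inline uint32_t
bytes_before_nul(uint32_t mask)
{
	/* vclz.i32: leading zero bits before the first NUL marker (32 if none) */
	uint32_t clz = (mask == 0) ? 32 : (uint32_t)__builtin_clz(mask);

	/* vadd #7, vshr #3: round up to whole bytes; 32 -> 4, 8 -> 1, 0 -> 0 */
	return (clz + 7) >> 3;
}

/*
 * The vmovn.i32/vmovn.i16 pair narrows the four 32-bit counts to one
 * byte each, so a single scalar compare against the 0x04040404 magic
 * built in r3 answers "is there no NUL anywhere in these 16 bytes?".
 */
static inline int
qword_has_no_nul(uint32_t m0, uint32_t m1, uint32_t m2, uint32_t m3)
{
	uint32_t packed = (bytes_before_nul(m0) << 24) |
	    (bytes_before_nul(m1) << 16) |
	    (bytes_before_nul(m2) << 8) |
	    bytes_before_nul(m3);

	return packed == 0x04040404;
}

When the packed counts are not 0x04040404, the assembly accumulates them in the
top byte of r2, adding each later word's count only while the preceding word was
completely non-NUL, and so replaces the four cmp/bxlt pairs of revision 1.1 with
a single conditional tst/add chain ending in one return.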


