[src/trunk]: src/common/lib/libc/arch/arm/string Make this work on all ARMs but keep the armv6 optimizations
details: https://anonhg.NetBSD.org/src/rev/a243311703e7
branches: trunk
changeset: 783596:a243311703e7
user: matt <matt%NetBSD.org@localhost>
date: Mon Dec 31 07:58:44 2012 +0000
description:
Make this work on all ARMs but keep the armv6 optimizations. It is as fast as
the existing strlen for small strings, and once strings are 8 bytes or more in
length it starts getting significantly faster. For really long strings,
compared to the existing strlen, this uses about 1/2 of the cycles for the
non-armv6 version and about 1/3 of the cycles for the armv6 version.
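For readers who do not read ARM assembly, here is a rough C model of the
armv6 trick this change keeps. The helper and builtin names below are
illustrative stand-ins, not part of the patch: UQADD8 is a single
instruction on armv6, and the byte-reverse/count-leading-zeros pair
corresponds to the REV/CLZ epilogue in the diff that follows.

    #include <stdint.h>

    /* Model of ARM UQADD8: per-byte unsigned saturating add. */
    static uint32_t
    uqadd8_model(uint32_t a, uint32_t b)
    {
            uint32_t r = 0;
            for (int i = 0; i < 32; i += 8) {
                    uint32_t s = ((a >> i) & 0xff) + ((b >> i) & 0xff);
                    r |= (s > 0xff ? 0xff : s) << i; /* saturate, don't wrap */
            }
            return r;
    }

    /*
     * Adding 254 to every byte saturates each non-NUL byte to 0xff, so
     * the complemented word is zero unless some byte was NUL.  On a hit,
     * a byte-reverse plus count-leading-zeros yields the index of the
     * first NUL byte (little-endian case shown; big-endian skips the
     * byte-reverse, as the assembly does).
     */
    static int
    first_nul_index(uint32_t w)
    {
            uint32_t m = ~uqadd8_model(w, 0xfefefefe);
            if (m == 0)
                    return -1;      /* no NUL in this word */
            return __builtin_clz(__builtin_bswap32(m)) >> 3;
    }
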
diffstat:
common/lib/libc/arch/arm/string/strlen_armv6.S | 84 +++++++++++++++++--------
1 files changed, 57 insertions(+), 27 deletions(-)
diffs (123 lines):
diff -r fa461905efa1 -r a243311703e7 common/lib/libc/arch/arm/string/strlen_armv6.S
--- a/common/lib/libc/arch/arm/string/strlen_armv6.S Mon Dec 31 03:23:53 2012 +0000
+++ b/common/lib/libc/arch/arm/string/strlen_armv6.S Mon Dec 31 07:58:44 2012 +0000
@@ -29,37 +29,44 @@
#include <machine/asm.h>
-RCSID("$NetBSD: strlen_armv6.S,v 1.2 2012/12/29 05:36:57 matt Exp $")
+RCSID("$NetBSD: strlen_armv6.S,v 1.3 2012/12/31 07:58:44 matt Exp $")
+
+#ifdef __ARMEL__
+#define BYTE0 0x000000ff
+#define BYTE1 0x0000ff00
+#define BYTE2 0x00ff0000
+#define BYTE3 0xff000000
+#else
+#define BYTE0 0xff000000
+#define BYTE1 0x00ff0000
+#define BYTE2 0x0000ff00
+#define BYTE3 0x000000ff
+#endif
.text
ENTRY(strlen)
add ip, r0, #4 /* for the final post-inc */
- ands r1, r0, #3 /* get misalignment */
- bic r0, r0, #3 /* align to word boundary */
- ldr r3, [r0], #4 /* load first word */
- beq .Lpre_main_loop /* misaligned? no, go to loop */
- /*
- * For misaligned string, we need to make sure that the bytes before
- * the start of the string will not cause a false match to a NUL.
- */
- mvn r2, #0 /* create a mask */
- mov r1, r1, lsl #3 /* bytes -> bits */
-#ifdef __ARMEL__
- mov r2, r2, lsl r1 /* clear relavent bytes */
-#else
- mov r2, r2, lsr r1 /* clear relavent bytes */
-#endif
- mvn r2, r2 /* invert mask */
- orr r3, r3, r2 /* orr in mask for leading bytes */
+1: tst r0, #3 /* test for word alignment */
+ beq .Lpre_main_loop /* finally word aligned */
+ ldrb r3, [r0], #1 /* load a byte */
+ teq r3, #0 /* is it 0? */
+ bne 1b /* no, try next byte */
+ sub ip, ip, #3 /* subtract (4 - the NUL) */
+ sub r0, r0, ip /* subtract start */
+ RET /* return */
.Lpre_main_loop:
-#ifdef _ARM_ARCH_7
+#if defined(_ARM_ARCH_6)
+#if defined(_ARM_ARCH_7)
movw r1, #0xfefe /* magic constant; 254 in each byte */
#else
mov r1, #0xfe /* put 254 in low byte */
orr r1, r1, r1, lsl #8 /* move to next byte */
#endif
orr r1, r1, r1, lsl #16 /* move to next halfword */
+#endif /* _ARM_ARCH_6 */
.Lmain_loop:
+ ldr r3, [r0], #4 /* load next word */
+#if defined(_ARM_ARCH_6)
/*
* Add 254 to each byte using the UQADD8 (unsigned saturating add 8)
* instruction. For every non-NUL byte, the result for that byte will
@@ -67,24 +74,47 @@
* result, if the result is non-0 then we must have encountered a NUL.
*/
uqadd8 r3, r3, r1 /* magic happens here */
- mvns r3, r3 /* is the complemented result 0? */
- bne .Lreturn /* no, then we encountered a NUL */
- ldr r3, [r0], #4 /* load next word */
- b .Lmain_loop /* and go */
+ mvns r3, r3 /* is the complemented result non-0? */
+ beq .Lmain_loop /* no, then we encountered no NULs */
+#else
+ /*
+ * No fancy shortcuts so just test each byte lane for a NUL.
+ * (other tests for NULs in a word take more instructions/cycles).
+ */
+ tst r3, #BYTE0 /* is this byte 0? */
+ tstne r3, #BYTE1 /* no, is this byte 0? */
+ tstne r3, #BYTE2 /* no, is this byte 0? */
+ tstne r3, #BYTE3 /* no, is this byte 0? */
+ bne .Lmain_loop /* no, then get next word */
+#endif
+#if defined(_ARM_ARCH_6)
/*
* We encountered a NUL. Find out where by doing a CLZ and then
* shifting right by 3. That will be the number of non-NUL bytes.
*/
-.Lreturn:
#ifdef __ARMEL__
rev r3, r3 /* we want this in BE for the CLZ */
#endif
clz r3, r3 /* count how many leading zeros */
add r0, r0, r3, lsr #3 /* divide that by 8 and add to count */
+#else
/*
- * r0 now points to 4 past the NUL due to the post-inc. Subtract
- * the start of the string (which also has 4 added to it to compensate
- * for the post-inc.
+ * We encountered a NUL.
+ */
+ tst r3, #BYTE0 /* 1st byte was NUL? */
+ beq 1f /* yes, done adding */
+ add r0, r0, #1 /* we have one more non-NUL byte */
+ tst r3, #BYTE1 /* 2nd byte was NUL? */
+ beq 1f /* yes, done adding */
+ add r0, r0, #1 /* we have one more non-NUL byte */
+ tst r3, #BYTE2 /* 3rd byte was NUL? */
+ addne r0, r0, #1 /* no, we have one more non-NUL byte */
+1:
+#endif /* _ARM_ARCH_6 */
+ /*
+ * r0 now points to 4 past the NUL due to the post-inc. Subtract the
+ * start of the string (which also has 4 added to it to compensate for
+ * the post-inc.
*/
sub r0, r0, ip /* subtract start to get length */
RET
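
To tie the pieces together, here is a portable C sketch of the overall flow
the patch produces: the bytewise alignment prologue, the per-lane NUL test
used when UQADD8 is unavailable, and the closing add ladder. sketch_strlen
and its BYTE* macros mirror, but are not, the committed code; the per-lane
tst chain reflects the diff's note that other word-wide NUL tests cost more
instructions on pre-armv6 cores.

    #include <stdint.h>
    #include <stddef.h>

    #if defined(__ARMEB__)          /* big-endian: first byte is the MSB */
    #define BYTE0   0xff000000u
    #define BYTE1   0x00ff0000u
    #define BYTE2   0x0000ff00u
    #define BYTE3   0x000000ffu
    #else                           /* little-endian (__ARMEL__) */
    #define BYTE0   0x000000ffu
    #define BYTE1   0x0000ff00u
    #define BYTE2   0x00ff0000u
    #define BYTE3   0xff000000u
    #endif

    static size_t
    sketch_strlen(const char *s)
    {
            const char *p = s;
            uint32_t w;

            /* Prologue: step one byte at a time until p is word-aligned
             * (the "1:" loop in the patch). */
            while ((uintptr_t)p & 3) {
                    if (*p == '\0')
                            return (size_t)(p - s);
                    p++;
            }

            /* Main loop: load a word and test each byte lane for a NUL,
             * mirroring the tst/tstne chain of the non-armv6 path.  (The
             * aligned load past the NUL is safe in the assembly; in C it
             * is merely a sketch.) */
            for (;;) {
                    w = *(const uint32_t *)(const void *)p;
                    p += 4;
                    if ((w & BYTE0) == 0 || (w & BYTE1) == 0 ||
                        (w & BYTE2) == 0 || (w & BYTE3) == 0)
                            break;  /* some lane holds the NUL */
            }

            /* Epilogue: count the non-NUL bytes before the NUL, as the
             * tst/beq/add ladder does; p, like r0, is 4 past the word. */
            size_t len = (size_t)(p - 4 - s);
            if (w & BYTE0) {
                    len++;
                    if (w & BYTE1) {
                            len++;
                            if (w & BYTE2)
                                    len++;
                    }
            }
            return len;
    }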