Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/sys/arch/arm/arm More optimizations (have separate 64-byte l...
details: https://anonhg.NetBSD.org/src/rev/6da9b30ea8dc
branches: trunk
changeset: 783440:6da9b30ea8dc
user: matt <matt%NetBSD.org@localhost>
date: Fri Dec 21 06:35:34 2012 +0000
description:
More optimizations (have separate 64-byte loop which alternates loads
and add of different registers). Be more consistent on endian issues.
Use pld.
diffstat:
sys/arch/arm/arm/cpu_in_cksum_buffer.S | 238 +++++++++++++++++++-------------
1 files changed, 144 insertions(+), 94 deletions(-)
diffs (truncated from 334 to 300 lines):
diff -r 0c21ef65fb66 -r 6da9b30ea8dc sys/arch/arm/arm/cpu_in_cksum_buffer.S
--- a/sys/arch/arm/arm/cpu_in_cksum_buffer.S Thu Dec 20 22:56:38 2012 +0000
+++ b/sys/arch/arm/arm/cpu_in_cksum_buffer.S Fri Dec 21 06:35:34 2012 +0000
@@ -29,7 +29,7 @@
#include <machine/asm.h>
-RCSID("$NetBSD: cpu_in_cksum_buffer.S,v 1.3 2012/12/20 08:03:21 matt Exp $")
+RCSID("$NetBSD: cpu_in_cksum_buffer.S,v 1.4 2012/12/21 06:35:34 matt Exp $")
/*
* Special note:
@@ -38,8 +38,25 @@
#ifdef _ARM_ARCH_DWORD_OK
#define LOAD_DWORD_INTO_R4(r) ldrd r4, [r], #8
+#define LOAD_DWORD_INTO_R6(r) ldrd r6, [r], #8 /* second pair, alternated with r4/r5 in the 64-byte loop */
#else
-#define LOAD_DWORD_INTO_R4(r) ldr r4, [r], #4; ldr r5, [r], #4
+#define LOAD_DWORD_INTO_R4(r) ldmia r!, {r4-r5}
+#define LOAD_DWORD_INTO_R6(r) ldmia r!, {r6-r7} /* must be the R6 name: loads r6/r7, not a second R4 define */
+#endif
+
+#if defined(__ARMEL__) || !defined(_ARM_ARCH_DWORD_OK)
+#define RLO r4 /* added to the accumulator first at .Lfinal_add_one_dword */
+#define RHI r5 /* added second; also receives a lone trailing word */
+#else /* !__ARMEL__ && _ARM_ARCH_DWORD_OK: the r4/r5 roles are swapped */
+#define RLO r5
+#define RHI r4
+#endif
+#if defined(__ARMEL__)
+#define BYTE0 0x000000ff /* byte at the lowest address of a word, assuming __ARMEL__ means little-endian */
+#define BYTE3 0xff000000 /* byte at the highest address of a word; cleared from RHI when an odd byte is discarded */
+#else /* !__ARMEL__: the same two masks with their positions exchanged */
+#define BYTE0 0xff000000
+#define BYTE3 0x000000ff
#endif
/*
@@ -47,95 +64,124 @@
*/
ENTRY(cpu_in_cksum_buffer)
+#ifdef _ARM_ARCH_DWORD_OK
+ pld [r0] /* prefetch the first data */
+#endif
mov ip, r2 /* initialize accumulator */
adds ip, ip, #0 /* clear carry */
- push {r4-r5} /* save temporaries */
teq r1, #0 /* did we get passed a zero length? */
- beq .Lfold /* fold the checksum */
+ beq .Lfold_nopop /* fold the checksum */
ands r2, r0, #7 /* test for dword alignment */
bne .Ldword_misaligned /* no, fixup non dword aligned */
+ push {r4-r5} /* save temporaries */
add r2, r1, r0 /* point r2 just past end */
-#ifndef __OPTIMIZE_SIZE__
+ LOAD_DWORD_INTO_R4(r0) /* load first dword */
+ sub r1, r2, r0 /* we've read one dword */
+.Ldword_aligned_noload:
+#if !defined(__OPTIMIZE_SIZE__)
bics r3, r1, #63 /* at least 64 bytes to do? */
- bne 4f /* yes, then do them */
-#endif /* __OPTIMIZE_SIZE__ */
- bics r3, r1, #7 /* at least 8 bytes to do? */
- beq .Lfinal_dword /* no, handle the final dword */
-3:
-#ifndef __OPTIMIZE_SIZE__
- rsb r3, r3, #64 /* subtract from 64 */
-#ifdef _ARM_ARCH_DWORD_OK
- add r3, r3, r3, lsr #1 /* multiply by 1.5 */
- add pc, pc, r3 /* and jump! */
-#else
- add pc, pc, r3, lsl #1 /* multiply by 2 and jump! */
-#endif
- nop
-4: LOAD_DWORD_INTO_R4(r0) /* 8 dwords left */
+ beq 2f /* no, then do final collection */
+ push {r6-r7}
+1:
+ LOAD_DWORD_INTO_R6(r0) /* 8 dwords left */
adcs ip, ip, r4
adcs ip, ip, r5
LOAD_DWORD_INTO_R4(r0) /* 7 dwords left */
- adcs ip, ip, r4
- adcs ip, ip, r5
- LOAD_DWORD_INTO_R4(r0) /* 6 dwords left */
+ adcs ip, ip, r6
+ adcs ip, ip, r7
+ LOAD_DWORD_INTO_R6(r0) /* 6 dwords left */
adcs ip, ip, r4
adcs ip, ip, r5
LOAD_DWORD_INTO_R4(r0) /* 5 dwords left */
- adcs ip, ip, r4
- adcs ip, ip, r5
- LOAD_DWORD_INTO_R4(r0) /* 4 dwords left */
+ adcs ip, ip, r6
+ adcs ip, ip, r7
+ LOAD_DWORD_INTO_R6(r0) /* 4 dwords left */
adcs ip, ip, r4
adcs ip, ip, r5
LOAD_DWORD_INTO_R4(r0) /* 3 dwords left */
- adcs ip, ip, r4
- adcs ip, ip, r5
- LOAD_DWORD_INTO_R4(r0) /* 2 dwords left */
+ adcs ip, ip, r6
+ adcs ip, ip, r7
+ LOAD_DWORD_INTO_R6(r0) /* 2 dwords left */
adcs ip, ip, r4
adcs ip, ip, r5
+ LOAD_DWORD_INTO_R4(r0) /* 1 dword left */
+ adcs ip, ip, r6
+ adcs ip, ip, r7
+
+ sub r1, r2, r0 /* find how much is left */
+ bics r3, r1, #63 /* at least 64 bytes to do? */
+ bne 1b /* yes, run the loop again */
+
+ pop {r6-r7} /* done with these so restore them */
#endif /* __OPTIMIZE_SIZE__ */
- LOAD_DWORD_INTO_R4(r0) /* 1 dword left */
-.Ladd_one_dword:
- adcs ip, ip, r4
-.Ladd_one_word:
- adcs ip, ip, r5
- teq r2, r0 /* nothing left? */
- beq .Lfold /* yep, proceed to hold */
-
- sub r1, r2, r0 /* find out much left to do? */
-#ifndef __OPTIMIZE_SIZE__
- bics r3, r1, #63 /* at least 64 bytes left? */
- bne 4b /* yep, do 64 at time */
-#endif
- bics r3, r1, #7 /* at least 8 bytes left? */
- bne 3b /* yep, do them */
-.Lfinal_dword:
- ldr r5, [r0], #4 /* load next word */
- tst r1, #3 /* final amount one word exactly? */
- beq .Lfinal_add_one_word /* yes, and go add it */
- sub r3, r1, #1 /* 0-3 = 1 word, 4-7 = 2 words */
- tst r3, #4 /* one more word left? */
- moveq r4, #0 /* no, use 0 for 1st word */
- movne r4, r5 /* yes, move from 2nd word to 1st */
- ldrne r5, [r0] /* yes, load last word */
+2: teq r1, #0 /* at the end? */
+ beq .Lfinal_add_one_dword /* yes, do the final add */
+ bmi .Lfinal_dword_noload /* past it, handle the final dword */
+3:
+#ifdef _ARM_ARCH_DWORD_OK
+ pld [r0, #32] /* grab next cache line */
+#endif
+#ifndef __OPTIMIZE_SIZE__
+ bic r3, r1, #7 /* find out how many dwords to do */
+ rsb r3, r3, #56 /* subtract from 56 */
+ add r3, r3, r3, lsr #1 /* multiply by 1.5 */
+ add pc, pc, r3 /* and jump! */
+ nop
+ adcs ip, ip, r4 /* 7 dwords left */
+ adcs ip, ip, r5
+ LOAD_DWORD_INTO_R4(r0)
+ adcs ip, ip, r4 /* 6 dwords left */
+ adcs ip, ip, r5
+ LOAD_DWORD_INTO_R4(r0)
+ adcs ip, ip, r4 /* 5 dwords left */
+ adcs ip, ip, r5
+ LOAD_DWORD_INTO_R4(r0)
+ adcs ip, ip, r4 /* 4 dwords left */
+ adcs ip, ip, r5
+ LOAD_DWORD_INTO_R4(r0)
+ adcs ip, ip, r4 /* 3 dwords left */
+ adcs ip, ip, r5
+ LOAD_DWORD_INTO_R4(r0)
+ adcs ip, ip, r4 /* 2 dwords left */
+ adcs ip, ip, r5
+ LOAD_DWORD_INTO_R4(r0)
+#endif /* __OPTIMIZE_SIZE__ */
+ adcs ip, ip, r4 /* 1 dword left */
+ adcs ip, ip, r5
+ LOAD_DWORD_INTO_R4(r0)
+ sub r1, r2, r0 /* find out how much is left to do */
+ teq r1, #0 /* at the end? */
+ beq .Lfinal_add_one_dword /* yep, proceed to fold */
+#ifdef __OPTIMIZE_SIZE__
+ bics r3, r1, #7 /* exhaust all dwords? */
+ bne 3b /* not yet, do more */
+#endif
+ adcs ip, ip, r4
+ adcs ip, ip, r5
+ ldr RHI, [r0], #4 /* we have at least one word to read */
+ sub r3, r1, #4 /* subtract 4 from length */
+ teq r3, #0 /* is the result positive? */
+ beq .Lfinal_add_one_word /* = 0? just add that word. */
+ movpl RLO, RHI /* > 0? move from hi to lo word */
+ ldrpl RHI, [r0] /* > 0? load new hi word */
+ movmi RLO, #0 /* < 0? clear lo word */
+
.Lfinal_dword_noload:
- rsb r1, r1, #4 /* find out many bytes to discard */
-#ifdef __ARMEL__
+ rsb r1, r1, #8 /* find out how many bytes to discard */
tst r1, #2 /* discard at least 2? */
- movne r5, r5, lsl #16 /* yes, discard upper halfword */
- tst r1, #1 /* discard odd? */
- bicne r5, r5, #0xff000000 /* yes, discard odd byte */
+#ifdef __ARMEL__
+ movne RHI, RHI, lsl #16 /* yes, discard upper halfword */
#else
- tst r1, #2 /* discard at least 2? */
- movne r5, r5, lsr #16 /* yes, discard lower halfword */
+ movne RHI, RHI, lsr #16 /* yes, discard lower halfword */
+#endif
tst r1, #1 /* discard odd? */
- bicne r5, r5, #0x000000ff /* yes, discard odd byte */
-#endif
+ bicne RHI, RHI, #BYTE3 /* yes, discard odd byte */
.Lfinal_add_one_dword:
- adcs ip, ip, r4 /* add 1st to accumulator */
+ adcs ip, ip, RLO /* add 1st to accumulator */
.Lfinal_add_one_word:
- adcs ip, ip, r5 /* add 2nd to accumulator */
+ adcs ip, ip, RHI /* add 2nd to accumulator */
/*
* Fall into fold.
@@ -143,6 +189,7 @@
.Lfold:
pop {r4-r5} /* we don't need these anymore */
+.Lfold_nopop:
/*
* We now have the 33-bit result in <carry>, ip. Pull in the
* standard folding code.
@@ -150,57 +197,60 @@
#include "cpu_in_cksum_fold.S"
.Ldword_misaligned:
+#ifdef _ARM_ARCH_DWORD_OK
+ pld [r0, #32] /* preload next cacheline */
+#endif
tst r0, #3 /* are at least word aligned? */
bne .Lword_misaligned /* no, do it the hard way */
- ldr r5, [r0], #4 /* load word here in case of partial */
+ push {r4-r5} /* save temporaries */
+ ldr RHI, [r0], #4 /* load word here in case of partial */
sub r1, r1, #4 /* subtract length of one word */
teq r1, #0 /* what is length? */
beq .Lfinal_add_one_word /* = 0? just do the final add */
- addgt r2, r1, r0 /* > 0? point r2 just past end */
- bgt .Ladd_one_word /* > 0? accumulate it and loop */
- mov r4, #0 /* < 0? zero this */
- b .Lfinal_dword_noload /* < 0? handle final partial dword */
+ mov RLO, #0 /* <= 0? zero this */
+ bmi .Lfinal_dword_noload /* < 0? handle final partial dword */
+ add r2, r1, r0 /* > 0? point r2 just past end */
+ b .Ldword_aligned_noload /* > 0? accumulate it and loop */
.Lword_misaligned:
+ /*
+ * If we start on an odd boundary, set up our stack frame so we
+ * can fixup the return value to be byteswapped.
+ */
+ tst r0, #1 /* start address odd? */
+ strne lr, [sp, #-8]! /* yes, save our return address */
+ adrne lr, .Lmisaligned_fixup /* yes, return to fixup code. */
+ push {r4-r5} /* save temporaries */
tst r0, #4 /* do we load 1 or 2 words? */
bic r0, r0, #3 /* force word alignment */
add r1, r1, r2 /* add initial offset to length */
sub r1, r1, #8 /* subtract length of one dword */
+#ifdef _ARM_ARCH_DWORD_OK
+ ldreqd r4, [r0], #8 /* load first dword */
+#else
ldmeqia r0!, {r4-r5} /* load first dword */
- ldrne r4, [r0], #4 /* load first word */
- movne r5, #0 /* no second word */
+#endif
+ ldrne RLO, [r0], #4 /* load first word */
+ movne RHI, #0 /* no second word */
/*
* We are now dword aligned.
*/
+ tst r2, #2 /* discard at least 2? */
#ifdef __ARMEL__
- tst r2, #2 /* discard at least 2? */
- movne r4, r4, lsr #16 /* yes, discard lower halfword */
- tst r2, #1 /* start odd? */
- bicne r4, r4, #0x000000ff /* yes, discard even byte */
+ movne RLO, RLO, lsr #16 /* yes, discard lower halfword */
#else
- tst r2, #2 /* discard at least 2? */
- movne r4, r4, lsl #16 /* yes, discard upper halfword */
- tst r2, #1 /* start odd? */
- bicne r4, r4, #0xff000000 /* yes, discard even byte */
+ movne RLO, RLO, lsl #16 /* yes, discard upper halfword */
#endif
- /*
- * Since we started on an odd boundary, set up our stack frame so we
- * fixup the return value to be byteswapped.
Home |
Main Index |
Thread Index |
Old Index