Subject: tweaks to blockio.S
To: None <port-arm32@netbsd.org>
From: Richard Earnshaw <rearnsha@buzzard.freeserve.co.uk>
List: port-arm32
Date: 03/18/2001 17:15:49
This is a multipart MIME message.
--==_Exmh_-622954600
Content-Type: text/plain; charset=us-ascii
Here's another of my smaller performance tweaks for the ARM32 kernels (I
think this is mainly for podule-bus type interfaces, but it may also be
used elsewhere). There are minor tweaks to make use of load-delay slots on
later ARM processors (when such alterations won't impact other CPUs), but
the main change is to use an XOR sequence to more efficiently manipulate
the words that are being transferred: this makes the routines a little more
compact.
If I don't hear any objections, I'll probably install this sometime
tomorrow.
R.
--==_Exmh_-622954600
Content-Type: application/x-patch ; name="blockio.patch"
Content-Description: blockio.patch
Content-Disposition: attachment; filename="blockio.patch"
Index: blockio.S
===================================================================
RCS file: /cvsroot/syssrc/sys/arch/arm32/arm32/blockio.S,v
retrieving revision 1.9
diff -p -p -r1.9 blockio.S
*** blockio.S 1999/10/26 06:53:41 1.9
--- blockio.S 2001/03/18 17:05:24
***************
*** 41,46 ****
--- 41,48 ----
* optimised block read/write from/to IO routines.
*
* Created : 08/10/94
+ * Modified : 22/01/99 -- R.Earnshaw
+ * Faster, and small tweaks for StrongARM
*/
#include <machine/asm.h>
*************** ENTRY(insw)
*** 68,77 ****
inswloop:
ldr r3, [r0]
strb r3, [r1], #0x0001
mov r3, r3, lsr #8
strb r3, [r1], #0x0001
- subs r2, r2, #0x00000001
bgt inswloop
mov pc, lr
--- 70,79 ----
inswloop:
ldr r3, [r0]
+ subs r2, r2, #0x00000001 /* Loop test in load delay slot */
strb r3, [r1], #0x0001
mov r3, r3, lsr #8
strb r3, [r1], #0x0001
bgt inswloop
mov pc, lr
*************** inswloop:
*** 79,98 ****
/* Word aligned insw */
fastinsw:
- stmfd sp!, {r4}
fastinswloop:
ldr r3, [r0, #0x0002] /* take advantage of nonaligned
* word accesses */
! ldr r4, [r0]
mov r3, r3, lsr #16 /* Put the two shorts together */
! orr r3, r3, r4, lsl #16
str r3, [r1], #0x0004 /* Store */
subs r2, r2, #0x00000002 /* Next */
bgt fastinswloop
- ldmfd sp!, {r4}
-
mov pc, lr
--- 81,97 ----
/* Word aligned insw */
fastinsw:
fastinswloop:
ldr r3, [r0, #0x0002] /* take advantage of nonaligned
* word accesses */
! ldr ip, [r0]
mov r3, r3, lsr #16 /* Put the two shorts together */
! orr r3, r3, ip, lsl #16
str r3, [r1], #0x0004 /* Store */
subs r2, r2, #0x00000002 /* Next */
bgt fastinswloop
mov pc, lr
*************** ENTRY(outsw)
*** 117,158 ****
/* Non aligned outsw */
- stmfd sp!, {r4}
-
outswloop:
ldrb r3, [r1], #0x0001
! ldrb r4, [r1], #0x0001
! orr r3, r3, r4, lsl #8
orr r3, r3, r3, lsl #16
str r3, [r0]
- subs r2, r2, #0x00000001
bgt outswloop
- ldmfd sp!, {r4}
-
mov pc, lr
/* Word aligned outsw */
fastoutsw:
- stmfd sp!, {r4}
fastoutswloop:
! ldr r3, [r1], #0x0004
! mov r4, r3, lsl #16
! orr r4, r4, r4, lsr #16
! str r4, [r0]
! mov r4, r3, lsr #16
! orr r4, r4, r4, lsl #16
! str r4, [r0]
- subs r2, r2, #0x00000002
bgt fastoutswloop
- ldmfd sp!, {r4}
-
mov pc, lr
/*
--- 116,158 ----
/* Non aligned outsw */
outswloop:
ldrb r3, [r1], #0x0001
! ldrb ip, [r1], #0x0001
! subs r2, r2, #0x00000001 /* Loop test in load delay slot */
! orr r3, r3, ip, lsl #8
orr r3, r3, r3, lsl #16
str r3, [r0]
bgt outswloop
mov pc, lr
/* Word aligned outsw */
fastoutsw:
fastoutswloop:
! ldr r3, [r1], #0x0004 /* r3 = (H)(L) */
! subs r2, r2, #0x00000002 /* Loop test in load delay slot */
! eor ip, r3, r3, lsr #16 /* ip = (H)(H^L) */
! eor r3, r3, ip, lsl #16 /* r3 = (H^H^L)(L) = (L)(L) */
! eor ip, ip, r3, lsr #16 /* ip = (H)(H^L^L) = (H)(H) */
! str r3, [r0]
! str ip, [r0]
!
! /* mov ip, r3, lsl #16
! * orr ip, ip, ip, lsr #16
! * str ip, [r0]
! *
! * mov ip, r3, lsr #16
! * orr ip, ip, ip, lsl #16
! * str ip, [r0]
! */
bgt fastoutswloop
mov pc, lr
/*
*************** ENTRY(insw16)
*** 170,176 ****
cmp r2, #0x00000000
movle pc, lr
! /* If the destination address is word aligned and the size suitably aligned, do it fast */
tst r2, #0x00000007
tsteq r1, #0x00000003
--- 170,177 ----
cmp r2, #0x00000000
movle pc, lr
! /* If the destination address is word aligned and the size suitably
! aligned, do it fast */
tst r2, #0x00000007
tsteq r1, #0x00000003
*************** ENTRY(insw16)
*** 179,219 ****
/* Word aligned insw */
! stmfd sp!, {r4-r7}
insw16loop:
ldr r3, [r0, #0x0002] /* take advantage of nonaligned
* word accesses */
! ldr r7, [r0]
mov r3, r3, lsr #16 /* Put the two shorts together */
! orr r3, r3, r7, lsl #16
ldr r4, [r0, #0x0002] /* take advantage of nonaligned
* word accesses */
! ldr r7, [r0]
mov r4, r4, lsr #16 /* Put the two shorts together */
! orr r4, r4, r7, lsl #16
ldr r5, [r0, #0x0002] /* take advantage of nonaligned
* word accesses */
! ldr r7, [r0]
mov r5, r5, lsr #16 /* Put the two shorts together */
! orr r5, r5, r7, lsl #16
! ldr r6, [r0, #0x0002] /* take advantage of nonaligned
* word accesses */
! ldr r7, [r0]
! mov r6, r6, lsr #16 /* Put the two shorts together */
! orr r6, r6, r7, lsl #16
! stmia r1!, {r3-r6}
subs r2, r2, #0x00000008 /* Next */
bgt insw16loop
! ldmfd sp!, {r4-r7}
- mov pc, lr
-
/*
* Writes short ints (16 bits) from a block of memory to an I/O address
--- 180,218 ----
/* Word aligned insw */
! stmfd sp!, {r4,r5,lr}
insw16loop:
ldr r3, [r0, #0x0002] /* take advantage of nonaligned
* word accesses */
! ldr lr, [r0]
mov r3, r3, lsr #16 /* Put the two shorts together */
! orr r3, r3, lr, lsl #16
ldr r4, [r0, #0x0002] /* take advantage of nonaligned
* word accesses */
! ldr lr, [r0]
mov r4, r4, lsr #16 /* Put the two shorts together */
! orr r4, r4, lr, lsl #16
ldr r5, [r0, #0x0002] /* take advantage of nonaligned
* word accesses */
! ldr lr, [r0]
mov r5, r5, lsr #16 /* Put the two shorts together */
! orr r5, r5, lr, lsl #16
! ldr ip, [r0, #0x0002] /* take advantage of nonaligned
* word accesses */
! ldr lr, [r0]
! mov ip, ip, lsr #16 /* Put the two shorts together */
! orr ip, ip, lr, lsl #16
! stmia r1!, {r3-r5,ip}
subs r2, r2, #0x00000008 /* Next */
bgt insw16loop
! ldmfd sp!, {r4,r5,pc} /* Restore regs and go home */
/*
* Writes short ints (16 bits) from a block of memory to an I/O address
*************** ENTRY(outsw16)
*** 228,234 ****
cmp r2, #0x00000000
movle pc, lr
! /* If the destination address is word aligned and the size suitably aligned, do it fast */
tst r2, #0x00000007
tsteq r1, #0x00000003
--- 227,234 ----
cmp r2, #0x00000000
movle pc, lr
! /* If the source address is word aligned and the size suitably
! aligned, do it fast */
tst r2, #0x00000007
tsteq r1, #0x00000003
*************** ENTRY(outsw16)
*** 237,285 ****
/* Word aligned outsw */
! stmfd sp!, {r4-r7}
outsw16loop:
! ldmia r1!, {r4-r7}
! mov r3, r4, lsl #16
! orr r3, r3, r3, lsr #16
str r3, [r0]
!
! mov r3, r4, lsr #16
! orr r3, r3, r3, lsl #16
! str r3, [r0]
!
! mov r3, r5, lsl #16
! orr r3, r3, r3, lsr #16
! str r3, [r0]
!
! mov r3, r5, lsr #16
! orr r3, r3, r3, lsl #16
! str r3, [r0]
!
! mov r3, r6, lsl #16
! orr r3, r3, r3, lsr #16
! str r3, [r0]
! mov r3, r6, lsr #16
! orr r3, r3, r3, lsl #16
str r3, [r0]
! mov r3, r7, lsl #16
! orr r3, r3, r3, lsr #16
str r3, [r0]
! mov r3, r7, lsr #16
! orr r3, r3, r3, lsl #16
str r3, [r0]
subs r2, r2, #0x00000008
bgt outsw16loop
! ldmfd sp!, {r4-r7}
!
! mov pc, lr
/*
* reads short ints (16 bits) from an I/O address into a block of memory
--- 237,284 ----
/* Word aligned outsw */
! stmfd sp!, {r4,r5,lr}
outsw16loop:
! ldmia r1!, {r4,r5,ip,lr}
! eor r3, r4, r4, lsl #16 /* r3 = (A^B)(B) */
! eor r4, r4, r3, lsr #16 /* r4 = (A)(B^A^B) = (A)(A) */
! eor r3, r3, r4, lsl #16 /* r3 = (A^B^A)(B) = (B)(B) */
str r3, [r0]
! str r4, [r0]
!
! /* mov r3, r4, lsl #16
! * orr r3, r3, r3, lsr #16
! * str r3, [r0]
! *
! * mov r3, r4, lsr #16
! * orr r3, r3, r3, lsl #16
! * str r3, [r0]
! */
! eor r3, r5, r5, lsl #16 /* r3 = (A^B)(B) */
! eor r5, r5, r3, lsr #16 /* r5 = (A)(B^A^B) = (A)(A) */
! eor r3, r3, r5, lsl #16 /* r3 = (A^B^A)(B) = (B)(B) */
str r3, [r0]
+ str r5, [r0]
! eor r3, ip, ip, lsl #16 /* r3 = (A^B)(B) */
! eor ip, ip, r3, lsr #16 /* ip = (A)(B^A^B) = (A)(A) */
! eor r3, r3, ip, lsl #16 /* r3 = (A^B^A)(B) = (B)(B) */
str r3, [r0]
+ str ip, [r0]
! eor r3, lr, lr, lsl #16 /* r3 = (A^B)(B) */
! eor lr, lr, r3, lsr #16 /* lr = (A)(B^A^B) = (A)(A) */
! eor r3, r3, lr, lsl #16 /* r3 = (A^B^A)(B) = (B)(B) */
str r3, [r0]
+ str lr, [r0]
subs r2, r2, #0x00000008
bgt outsw16loop
! ldmfd sp!, {r4,r5,pc} /* and go home */
/*
* reads short ints (16 bits) from an I/O address into a block of memory
*************** ENTRY(inswm8)
*** 297,303 ****
cmp r2, #0x00000000
movle pc, lr
! /* If the destination address is word aligned and the size suitably aligned, do it fast */
tst r1, #0x00000003
--- 296,303 ----
cmp r2, #0x00000000
movle pc, lr
! /* If the destination address is word aligned and the size suitably
! aligned, do it fast */
tst r1, #0x00000003
*************** ENTRY(inswm8)
*** 305,329 ****
/* Word aligned insw */
! stmfd sp!, {r4-r11}
! mov r11, #0xff000000
! orr r11, r11, #0x00ff0000
inswm8_loop8:
cmp r2, #8
bcc inswm8_l8
! ldmia r0, {r3-r10}
! bic r3, r3, r11
orr r3, r3, r4, lsl #16
! bic r5, r5, r11
orr r4, r5, r6, lsl #16
! bic r7, r7, r11
orr r5, r7, r8, lsl #16
! bic r9, r9, r11
! orr r6, r9, r10, lsl #16
stmia r1!, {r3-r6}
--- 305,329 ----
/* Word aligned insw */
! stmfd sp!, {r4-r9,lr}
! mov lr, #0xff000000
! orr lr, lr, #0x00ff0000
inswm8_loop8:
cmp r2, #8
bcc inswm8_l8
! ldmia r0, {r3-r9,ip}
! bic r3, r3, lr
orr r3, r3, r4, lsl #16
! bic r5, r5, lr
orr r4, r5, r6, lsl #16
! bic r7, r7, lr
orr r5, r7, r8, lsl #16
! bic r9, r9, lr
! orr r6, r9, ip, lsl #16
stmia r1!, {r3-r6}
*************** inswm8_l8:
*** 337,345 ****
ldmia r0, {r3-r6}
! bic r3, r3, r11
orr r3, r3, r4, lsl #16
! bic r5, r5, r11
orr r4, r5, r6, lsl #16
stmia r1!, {r3-r4}
--- 337,345 ----
ldmia r0, {r3-r6}
! bic r3, r3, lr
orr r3, r3, r4, lsl #16
! bic r5, r5, lr
orr r4, r5, r6, lsl #16
stmia r1!, {r3-r4}
*************** inswm8_l4:
*** 353,359 ****
ldmia r0, {r3-r4}
! bic r3, r3, r11
orr r3, r3, r4, lsl #16
str r3, [r1], #0x0004
--- 353,359 ----
ldmia r0, {r3-r4}
! bic r3, r3, lr
orr r3, r3, r4, lsl #16
str r3, [r1], #0x0004
*************** inswm8_l2:
*** 365,382 ****
bcc inswm8_l1
ldr r3, [r0]
strb r3, [r1], #0x0001
mov r3, r3, lsr #8
strb r3, [r1], #0x0001
- subs r2, r2, #0x00000001
inswm8_l1:
! ldmfd sp!, {r4-r11}
- mov pc, lr
-
/*
* write short ints (16 bits) to an I/O address from a block of memory
* The I/O address is assumed to be mapped multiple times in a block of
--- 365,381 ----
bcc inswm8_l1
ldr r3, [r0]
+ subs r2, r2, #0x00000001 /* Test in load delay slot */
+ /* XXX, why don't we use result? */
strb r3, [r1], #0x0001
mov r3, r3, lsr #8
strb r3, [r1], #0x0001
inswm8_l1:
! ldmfd sp!, {r4-r9,pc} /* And go home */
/*
* write short ints (16 bits) to an I/O address from a block of memory
* The I/O address is assumed to be mapped multiple times in a block of
*************** ENTRY(outswm8)
*** 393,399 ****
cmp r2, #0x00000000
movle pc, lr
! /* If the destination address is word aligned and the size suitably aligned, do it fast */
tst r1, #0x00000003
--- 392,399 ----
cmp r2, #0x00000000
movle pc, lr
! /* If the source address is word aligned and the size suitably
! aligned, do it fast */
tst r1, #0x00000003
*************** ENTRY(outswm8)
*** 401,432 ****
/* Word aligned outsw */
! stmfd sp!, {r4-r10}
outswm8_loop8:
cmp r2, #8
bcc outswm8_l8
! ldmia r1!, {r3,r5,r7,r9}
! mov r4, r3, lsr #16
! orr r4, r4, r4, lsl #16
! mov r3, r3, lsl #16
! orr r3, r3, r3, lsr #16
! mov r6, r5, lsr #16
! orr r6, r6, r6, lsl #16
! mov r5, r5, lsl #16
! orr r5, r5, r5, lsr #16
! mov r8, r7, lsr #16
! orr r8, r8, r8, lsl #16
! mov r7, r7, lsl #16
! orr r7, r7, r7, lsr #16
! mov r10, r9, lsr #16
! orr r10, r10, r10, lsl #16
! mov r9, r9, lsl #16
! orr r9, r9, r9, lsr #16
! stmia r0, {r3-r10}
subs r2, r2, #0x00000008 /* Next */
bne outswm8_loop8
--- 401,431 ----
/* Word aligned outsw */
! stmfd sp!, {r4-r8,lr}
outswm8_loop8:
cmp r2, #8
bcc outswm8_l8
+
+ ldmia r1!, {r3,r5,r7,ip}
+
+ eor r4, r3, r3, lsr #16 /* r4 = (A)(A^B) */
+ eor r3, r3, r4, lsl #16 /* r3 = (A^A^B)(B) = (B)(B) */
+ eor r4, r4, r3, lsr #16 /* r4 = (A)(B^A^B) = (A)(A) */
+
+ eor r6, r5, r5, lsr #16 /* r6 = (A)(A^B) */
+ eor r5, r5, r6, lsl #16 /* r5 = (A^A^B)(B) = (B)(B) */
+ eor r6, r6, r5, lsr #16 /* r6 = (A)(B^A^B) = (A)(A) */
! eor r8, r7, r7, lsr #16 /* r8 = (A)(A^B) */
! eor r7, r7, r8, lsl #16 /* r7 = (A^A^B)(B) = (B)(B) */
! eor r8, r8, r7, lsr #16 /* r8 = (A)(B^A^B) = (A)(A) */
! eor lr, ip, ip, lsr #16 /* lr = (A)(A^B) */
! eor ip, ip, lr, lsl #16 /* ip = (A^A^B)(B) = (B)(B) */
! eor lr, lr, ip, lsr #16 /* lr = (A)(B^A^B) = (A)(A) */
! stmia r0, {r3-r8,ip,lr}
subs r2, r2, #0x00000008 /* Next */
bne outswm8_loop8
*************** outswm8_l8:
*** 438,451 ****
ldmia r1!, {r3-r4}
! mov r5, r3, lsl #16
! orr r5, r5, r5, lsr #16
! mov r6, r3, lsr #16
! orr r6, r6, r6, lsl #16
! mov r7, r4, lsl #16
! orr r7, r7, r7, lsr #16
! mov r8, r4, lsr #16
! orr r8, r8, r8, lsl #16
stmia r0, {r5-r8}
--- 437,449 ----
ldmia r1!, {r3-r4}
! eor r6, r3, r3, lsr #16 /* r6 = (A)(A^B) */
! eor r5, r3, r6, lsl #16 /* r5 = (A^A^B)(B) = (B)(B) */
! eor r6, r6, r5, lsr #16 /* r6 = (A)(B^A^B) = (A)(A) */
!
! eor r8, r4, r4, lsr #16 /* r8 = (A)(A^B) */
! eor r7, r4, r8, lsl #16 /* r7 = (A^A^B)(B) = (B)(B) */
! eor r8, r8, r7, lsr #16 /* r8 = (A)(B^A^B) = (A)(A) */
stmia r0, {r5-r8}
*************** outswm8_l4:
*** 456,471 ****
cmp r2, #2
bcc outswm8_l2
! ldr r3, [r1], #0x0004
! mov r4, r3, lsl #16
! orr r4, r4, r4, lsr #16
! mov r5, r3, lsr #16
! orr r5, r5, r5, lsl #16
stmia r0, {r4, r5}
- subs r2, r2, #0x00000002
beq outswm8_l1
outswm8_l2:
--- 454,468 ----
cmp r2, #2
bcc outswm8_l2
! ldr r3, [r1], #0x0004 /* r3 = (A)(B) */
! subs r2, r2, #0x00000002 /* Done test in Load delay slot */
! eor r5, r3, r3, lsr #16 /* r5 = (A)(A^B)*/
! eor r4, r3, r5, lsl #16 /* r4 = (A^A^B)(B) = (B)(B) */
! eor r5, r5, r4, lsr #16 /* r5 = (A)(B^A^B) = (A)(A) */
stmia r0, {r4, r5}
beq outswm8_l1
outswm8_l2:
*************** outswm8_l2:
*** 474,487 ****
ldrb r3, [r1], #0x0001
ldrb r4, [r1], #0x0001
orr r3, r3, r4, lsl #8
orr r3, r3, r3, lsl #16
str r3, [r0]
- subs r2, r2, #0x00000001
-
outswm8_l1:
! ldmfd sp!, {r4-r10}
!
! mov pc, lr
!
--- 471,481 ----
ldrb r3, [r1], #0x0001
ldrb r4, [r1], #0x0001
+ subs r2, r2, #0x00000001 /* Done test in load delay slot */
+ /* XXX This test isn't used? */
orr r3, r3, r4, lsl #8
orr r3, r3, r3, lsl #16
str r3, [r0]
outswm8_l1:
! ldmfd sp!, {r4-r8,pc} /* And go home */
--==_Exmh_-622954600--