Subject: Minor performance tweak to bcopy_page.S
To: None <port-arm32@netbsd.org>
From: Richard Earnshaw <rearnsha@buzzard.freeserve.co.uk>
List: port-arm32
Date: 03/07/2001 21:37:19
This is a multipart MIME message.
--==_Exmh_-11931549860
Content-Type: text/plain; charset=us-ascii
Unless folks don't want them here, over the next few days I'll post some of
the performance tweaks that I've made. Folks are free to criticise or add
them to the master sources as they see fit. Things are changing so
rapidly at the moment that it is costing me serious amounts of time trying
to keep these in sync with the master sources.
This patch is a minor tweak to arm/arm32/bcopy_page.S. It makes slightly
better use of the call-clobbered registers and reduces the unwinding of
the loops slightly: on cached machines we gain very little, and may even
lose slightly (since the loop takes up more space in the I$), by
unwinding the loop more than necessary. The stalls waiting for the write
buffers to drain will more than absorb the overhead of the additional
branch instructions.
--==_Exmh_-11931549860
Content-Type: application/x-patch ; name="bcopy_page.patch"
Content-Description: bcopy_page.patch
Content-Disposition: attachment; filename="bcopy_page.patch"
Index: arm32/bcopy_page.S
===================================================================
RCS file: /cvsroot/syssrc/sys/arch/arm/arm32/bcopy_page.S,v
retrieving revision 1.1
diff -p -p -r1.1 bcopy_page.S
*** bcopy_page.S 2001/03/04 08:25:39 1.1
--- bcopy_page.S 2001/03/07 21:07:19
***************
*** 41,46 ****
--- 41,48 ----
#include <machine/param.h>
#include <machine/asm.h>
+ /* #define BIG_LOOPS */
+
/*
* bcopy_page(src, dest)
*
***************
*** 51,104 ****
* r1 - dest address
*
* Requires:
! * number of bytes per page (NBPG) is a multiple of 512
*/
ENTRY(bcopy_page)
! stmfd sp!, {r4-r10, lr}
mov r2, #(NBPG >> 9)
Lloopcopy:
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
!
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
!
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
!
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
!
subs r2, r2, #1
bne Lloopcopy
! ldmfd sp!, {r4-r10, pc}
/*
* bzero_page(dest)
--- 53,115 ----
* r1 - dest address
*
* Requires:
! * number of bytes per page (NBPG) is a multiple of 512 (BIG_LOOPS), 128
! * otherwise.
*/
ENTRY(bcopy_page)
! stmfd sp!, {r4-r8, lr}
! #ifdef BIG_LOOPS
mov r2, #(NBPG >> 9)
+ #else
+ mov r2, #(NBPG >> 7)
+ #endif
Lloopcopy:
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
!
! #ifdef BIG_LOOPS
! /* There is little point making the loop any larger; unless we are
! running with the cache off, the load/store overheads will
! completely dominate this loop. */
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
!
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
!
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! #endif
subs r2, r2, #1
bne Lloopcopy
! ldmfd sp!, {r4-r8, pc}
/*
* bzero_page(dest)
*************** Lloopcopy:
*** 109,152 ****
* r0 - dest address
*
* Requires:
! * number of bytes per page (NBPG) is a multiple of 512
*/
ENTRY(bzero_page)
! stmfd sp!, {r4-r10, lr}
mov r2, #(NBPG >> 9)
!
mov r3, #0
mov r4, #0
mov r5, #0
mov r6, #0
mov r7, #0
mov r8, #0
! mov r9, #0
! mov r10, #0
Lloopzero:
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
!
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
!
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
!
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
subs r2, r2, #1
bne Lloopzero
! ldmfd sp!, {r4-r10, pc}
--- 120,173 ----
* r0 - dest address
*
* Requires:
! * number of bytes per page (NBPG) is a multiple of 512 (BIG_LOOPS), 128
! * otherwise
*/
ENTRY(bzero_page)
! stmfd sp!, {r4-r8, lr}
! #ifdef BIG_LOOPS
mov r2, #(NBPG >> 9)
! #else
! mov r2, #(NBPG >> 7)
! #endif
mov r3, #0
mov r4, #0
mov r5, #0
mov r6, #0
mov r7, #0
mov r8, #0
! mov ip, #0
! mov lr, #0
Lloopzero:
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
!
! #ifdef BIG_LOOPS
! /* There is little point making the loop any larger; unless we are
! running with the cache off, the load/store overheads will
! completely dominate this loop. */
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
!
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
!
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
!
! #endif
subs r2, r2, #1
bne Lloopzero
! ldmfd sp!, {r4-r8, pc}
--==_Exmh_-11931549860--