Subject: port-arm32/12363: Minor performance tweak to bcopy_page.S
To: None <gnats-bugs@netbsd.org>
From: Ben Harris <bjh21@netbsd.org>
List: netbsd-bugs
Date: 03/09/2001 13:13:23
>Number: 12363
>Category: port-arm32
>Synopsis: Minor performance tweak to bcopy_page.S
>Confidential: no
>Severity: non-critical
>Priority: low
>Responsible: port-arm32-maintainer
>State: open
>Class: change-request
>Submitter-Id: net
>Arrival-Date: Fri Mar 09 05:14:00 PST 2001
>Closed-Date:
>Last-Modified:
>Originator: Ben Harris
>Release: 2001-03-04
>Organization:
>Environment:
>Description:
To: port-arm32@netbsd.org
Subject: Minor performance tweak to bcopy_page.S
From: Richard Earnshaw <rearnsha@buzzard.freeserve.co.uk>
This patch is a minor tweak to arm/arm32/bcopy_page.S. It makes slightly
better use of the call-clobbered registers and reduces the unrolling of
the loops slightly: on cached machines we gain very little, and may even
lose slightly (since the loop takes up more space in the I$), by
unrolling the loop more than necessary. The stalls waiting for the write
buffers to drain will more than absorb the overhead of the additional
branch instructions.
>How-To-Repeat:
>Fix:
--==_Exmh_-11931549860
Content-Type: application/x-patch ; name="bcopy_page.patch"
Content-Description: bcopy_page.patch
Content-Disposition: attachment; filename="bcopy_page.patch"
Index: arm32/bcopy_page.S
===================================================================
RCS file: /cvsroot/syssrc/sys/arch/arm/arm32/bcopy_page.S,v
retrieving revision 1.1
diff -p -p -r1.1 bcopy_page.S
*** bcopy_page.S 2001/03/04 08:25:39 1.1
--- bcopy_page.S 2001/03/07 21:07:19
***************
*** 41,46 ****
--- 41,48 ----
#include <machine/param.h>
#include <machine/asm.h>
+ /* #define BIG_LOOPS */
+
/*
* bcopy_page(src, dest)
*
***************
*** 51,104 ****
* r1 - dest address
*
* Requires:
! * number of bytes per page (NBPG) is a multiple of 512
*/
ENTRY(bcopy_page)
! stmfd sp!, {r4-r10, lr}
mov r2, #(NBPG >> 9)
Lloopcopy:
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
!
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
!
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
!
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
! ldmia r0!, {r3-r10}
! stmia r1!, {r3-r10}
!
subs r2, r2, #1
bne Lloopcopy
! ldmfd sp!, {r4-r10, pc}
/*
* bzero_page(dest)
--- 53,115 ----
* r1 - dest address
*
* Requires:
! * number of bytes per page (NBPG) is a multiple of 512 (BIG_LOOPS), 128
! * otherwise.
*/
ENTRY(bcopy_page)
! stmfd sp!, {r4-r8, lr}
! #ifdef BIG_LOOPS
mov r2, #(NBPG >> 9)
+ #else
+ mov r2, #(NBPG >> 7)
+ #endif
Lloopcopy:
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
!
! #ifdef BIG_LOOPS
! /* There is little point making the loop any larger; unless we are
! running with the cache off, the load/store overheads will
! completely dominate this loop. */
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
!
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
!
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! ldmia r0!, {r3-r8,ip,lr}
! stmia r1!, {r3-r8,ip,lr}
! #endif
subs r2, r2, #1
bne Lloopcopy
! ldmfd sp!, {r4-r8, pc}
/*
* bzero_page(dest)
*************** Lloopcopy:
*** 109,152 ****
* r0 - dest address
*
* Requires:
! * number of bytes per page (NBPG) is a multiple of 512
*/
ENTRY(bzero_page)
! stmfd sp!, {r4-r10, lr}
mov r2, #(NBPG >> 9)
!
mov r3, #0
mov r4, #0
mov r5, #0
mov r6, #0
mov r7, #0
mov r8, #0
! mov r9, #0
! mov r10, #0
Lloopzero:
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
!
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
!
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
!
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
! stmia r0!, {r3-r10}
subs r2, r2, #1
bne Lloopzero
! ldmfd sp!, {r4-r10, pc}
--- 120,173 ----
* r0 - dest address
*
* Requires:
! * number of bytes per page (NBPG) is a multiple of 512 (BIG_LOOPS), 128
! * otherwise
*/
ENTRY(bzero_page)
! stmfd sp!, {r4-r8, lr}
! #ifdef BIG_LOOPS
mov r2, #(NBPG >> 9)
! #else
! mov r2, #(NBPG >> 7)
! #endif
mov r3, #0
mov r4, #0
mov r5, #0
mov r6, #0
mov r7, #0
mov r8, #0
! mov ip, #0
! mov lr, #0
Lloopzero:
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
!
! #ifdef BIG_LOOPS
! /* There is little point making the loop any larger; unless we are
! running with the cache off, the load/store overheads will
! completely dominate this loop. */
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
!
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
!
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
! stmia r0!, {r3-r8,ip,lr}
!
! #endif
subs r2, r2, #1
bne Lloopzero
! ldmfd sp!, {r4-r8, pc}
--==_Exmh_-11931549860--
>Release-Note:
>Audit-Trail:
>Unformatted: