netbsd-bugs: port-arm32/12363: Minor performance tweak to bcopy

Subject: port-arm32/12363: Minor performance tweak to bcopy_page.S
To: None <gnats-bugs@netbsd.org>
From: Ben Harris <bjh21@netbsd.org>
List: netbsd-bugs
Date: 03/09/2001 13:13:23
>Number:         12363
>Category:       port-arm32
>Synopsis:       Minor performance tweak to bcopy_page.S
>Confidential:   no
>Severity:       non-critical
>Priority:       low
>Responsible:    port-arm32-maintainer
>State:          open
>Class:          change-request
>Submitter-Id:   net
>Arrival-Date:   Fri Mar 09 05:14:00 PST 2001
>Closed-Date:
>Last-Modified:
>Originator:     Ben Harris
>Release:        2001-03-04
>Organization:
>Environment:
>Description:

To: port-arm32@netbsd.org
Subject: Minor performance tweak to bcopy_page.S
From: Richard Earnshaw <rearnsha@buzzard.freeserve.co.uk>

This patch is a minor tweak to arm/arm32/bcopy_page.S.  It makes slightly 
better use of the call-clobbered registers and reduces the unrolling of 
the loops slightly: on cached machines we gain very little, and may even 
lose slightly (since the loop takes up more space in the I$), by 
unrolling the loop more than necessary.  The stalls waiting for the write 
buffers to drain will more than absorb the overhead of the additional 
branch instructions.

>How-To-Repeat:
>Fix:

--==_Exmh_-11931549860
Content-Type: application/x-patch ; name="bcopy_page.patch"
Content-Description: bcopy_page.patch
Content-Disposition: attachment; filename="bcopy_page.patch"

Index: arm32/bcopy_page.S
===================================================================
RCS file: /cvsroot/syssrc/sys/arch/arm/arm32/bcopy_page.S,v
retrieving revision 1.1
diff -p -p -r1.1 bcopy_page.S
*** bcopy_page.S	2001/03/04 08:25:39	1.1
--- bcopy_page.S	2001/03/07 21:07:19
***************
*** 41,46 ****
--- 41,48 ----
  #include <machine/param.h>
  #include <machine/asm.h>
  
+ /* #define BIG_LOOPS */
+ 
  /*
   * bcopy_page(src, dest)
   *
***************
*** 51,104 ****
   *   r1 - dest address
   *
   * Requires:
!  *   number of bytes per page (NBPG) is a multiple of 512
   */
  
  ENTRY(bcopy_page)
! 	stmfd	sp!, {r4-r10, lr}
  	mov	r2, #(NBPG >> 9)
  
  Lloopcopy:
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 
  	subs	r2, r2, #1
  	bne	Lloopcopy
  
! 	ldmfd	sp!, {r4-r10, pc}
  
  /*
   * bzero_page(dest)
--- 53,115 ----
   *   r1 - dest address
   *
   * Requires:
!  *   number of bytes per page (NBPG) is a multiple of 512 (BIG_LOOPS), 128
!  *   otherwise.
   */
  
  ENTRY(bcopy_page)
! 	stmfd	sp!, {r4-r8, lr}
! #ifdef BIG_LOOPS
  	mov	r2, #(NBPG >> 9)
+ #else
+ 	mov	r2, #(NBPG >> 7)
+ #endif
  
  Lloopcopy:
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 
! #ifdef BIG_LOOPS
! 	/* There is little point making the loop any larger; unless we are
! 	   running with the cache off, the load/store overheads will
! 	   completely dominate this loop.  */
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! #endif
  	subs	r2, r2, #1
  	bne	Lloopcopy
  
! 	ldmfd	sp!, {r4-r8, pc}
  
  /*
   * bzero_page(dest)
*************** Lloopcopy:
*** 109,152 ****
   *   r0 - dest address
   *
   * Requires:
!  *   number of bytes per page (NBPG) is a multiple of 512
   */
  
  ENTRY(bzero_page)
! 	stmfd	sp!, {r4-r10, lr}
  	mov	r2, #(NBPG >> 9)
! 
  	mov	r3, #0
  	mov	r4, #0
  	mov	r5, #0
  	mov	r6, #0
  	mov	r7, #0
  	mov	r8, #0
! 	mov	r9, #0
! 	mov	r10, #0
  
  Lloopzero:
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
! 
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
! 
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
! 
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
  
  	subs	r2, r2, #1
  	bne	Lloopzero
  
! 	ldmfd	sp!, {r4-r10, pc}
--- 120,173 ----
   *   r0 - dest address
   *
   * Requires:
!  *   number of bytes per page (NBPG) is a multiple of 512 (BIG_LOOPS), 128
!  *   otherwise
   */
  
  ENTRY(bzero_page)
! 	stmfd	sp!, {r4-r8, lr}
! #ifdef BIG_LOOPS
  	mov	r2, #(NBPG >> 9)
! #else
! 	mov	r2, #(NBPG >> 7)
! #endif
  	mov	r3, #0
  	mov	r4, #0
  	mov	r5, #0
  	mov	r6, #0
  	mov	r7, #0
  	mov	r8, #0
! 	mov	ip, #0
! 	mov	lr, #0
  
  Lloopzero:
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 
! #ifdef BIG_LOOPS
! 	/* There is little point making the loop any larger; unless we are
! 	   running with the cache off, the load/store overheads will
! 	   completely dominate this loop.  */
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 
! #endif
  
  	subs	r2, r2, #1
  	bne	Lloopzero
  
! 	ldmfd	sp!, {r4-r8, pc}

--==_Exmh_-11931549860--




>Release-Note:
>Audit-Trail:
>Unformatted: