port-arm32: Minor performance tweak to bcopy

Subject: Minor performance tweak to bcopy_page.S
To: None <port-arm32@netbsd.org>
From: Richard Earnshaw <rearnsha@buzzard.freeserve.co.uk>
List: port-arm32
Date: 03/07/2001 21:37:19
This is a multipart MIME message.

--==_Exmh_-11931549860
Content-Type: text/plain; charset=us-ascii

Unless folks don't want them here, over the next few days I'll post some 
of the performance tweaks that I've made.  Folks are free to criticise 
them or add them to the master sources as they see fit.  Things are 
changing so rapidly at the moment that it is costing me serious amounts 
of time trying to keep these in sync with the master sources.

This patch is a minor tweak to arm/arm32/bcopy_page.S.  It makes slightly 
better use of the call-clobbered registers and reduces the unrolling of 
the loops slightly: on cached machines we gain very little, and may even 
lose slightly (since the loop takes up more space in the I$), by 
unrolling the loop more than necessary.  The stalls waiting for the write 
buffers to drain will more than absorb the overhead of the additional 
branch instructions.



--==_Exmh_-11931549860
Content-Type: application/x-patch ; name="bcopy_page.patch"
Content-Description: bcopy_page.patch
Content-Disposition: attachment; filename="bcopy_page.patch"

Index: arm32/bcopy_page.S
===================================================================
RCS file: /cvsroot/syssrc/sys/arch/arm/arm32/bcopy_page.S,v
retrieving revision 1.1
diff -p -p -r1.1 bcopy_page.S
*** bcopy_page.S	2001/03/04 08:25:39	1.1
--- bcopy_page.S	2001/03/07 21:07:19
***************
*** 41,46 ****
--- 41,48 ----
  #include <machine/param.h>
  #include <machine/asm.h>
  
+ /* #define BIG_LOOPS */
+ 
  /*
   * bcopy_page(src, dest)
   *
***************
*** 51,104 ****
   *   r1 - dest address
   *
   * Requires:
!  *   number of bytes per page (NBPG) is a multiple of 512
   */
  
  ENTRY(bcopy_page)
! 	stmfd	sp!, {r4-r10, lr}
  	mov	r2, #(NBPG >> 9)
  
  Lloopcopy:
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 	ldmia	r0!, {r3-r10}
! 	stmia	r1!, {r3-r10}
! 
  	subs	r2, r2, #1
  	bne	Lloopcopy
  
! 	ldmfd	sp!, {r4-r10, pc}
  
  /*
   * bzero_page(dest)
--- 53,115 ----
   *   r1 - dest address
   *
   * Requires:
!  *   number of bytes per page (NBPG) is a multiple of 512 (BIG_LOOPS), 128
!  *   otherwise.
   */
  
  ENTRY(bcopy_page)
! 	stmfd	sp!, {r4-r8, lr}
! #ifdef BIG_LOOPS
  	mov	r2, #(NBPG >> 9)
+ #else
+ 	mov	r2, #(NBPG >> 7)
+ #endif
  
  Lloopcopy:
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 
! #ifdef BIG_LOOPS
! 	/* There is little point making the loop any larger; unless we are
! 	   running with the cache off, the load/store overheads will
! 	   completely dominate this loop.  */
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! 	ldmia	r0!, {r3-r8,ip,lr}
! 	stmia	r1!, {r3-r8,ip,lr}
! #endif
  	subs	r2, r2, #1
  	bne	Lloopcopy
  
! 	ldmfd	sp!, {r4-r8, pc}
  
  /*
   * bzero_page(dest)
*************** Lloopcopy:
*** 109,152 ****
   *   r0 - dest address
   *
   * Requires:
!  *   number of bytes per page (NBPG) is a multiple of 512
   */
  
  ENTRY(bzero_page)
! 	stmfd	sp!, {r4-r10, lr}
  	mov	r2, #(NBPG >> 9)
! 
  	mov	r3, #0
  	mov	r4, #0
  	mov	r5, #0
  	mov	r6, #0
  	mov	r7, #0
  	mov	r8, #0
! 	mov	r9, #0
! 	mov	r10, #0
  
  Lloopzero:
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
! 
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
! 
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
! 
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
! 	stmia	r0!, {r3-r10}
  
  	subs	r2, r2, #1
  	bne	Lloopzero
  
! 	ldmfd	sp!, {r4-r10, pc}
--- 120,173 ----
   *   r0 - dest address
   *
   * Requires:
!  *   number of bytes per page (NBPG) is a multiple of 512 (BIG_LOOPS), 128
!  *   otherwise
   */
  
  ENTRY(bzero_page)
! 	stmfd	sp!, {r4-r8, lr}
! #ifdef BIG_LOOPS
  	mov	r2, #(NBPG >> 9)
! #else
! 	mov	r2, #(NBPG >> 7)
! #endif
  	mov	r3, #0
  	mov	r4, #0
  	mov	r5, #0
  	mov	r6, #0
  	mov	r7, #0
  	mov	r8, #0
! 	mov	ip, #0
! 	mov	lr, #0
  
  Lloopzero:
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 
! #ifdef BIG_LOOPS
! 	/* There is little point making the loop any larger; unless we are
! 	   running with the cache off, the load/store overheads will
! 	   completely dominate this loop.  */
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 	stmia	r0!, {r3-r8,ip,lr}
! 
! #endif
  
  	subs	r2, r2, #1
  	bne	Lloopzero
  
! 	ldmfd	sp!, {r4-r8, pc}

--==_Exmh_-11931549860--