Subject: port-arm32/12364: StrongARM performance tweaks cpufunc_asm.S
To: None <gnats-bugs@gnats.netbsd.org>
From: Ben Harris <bjh21@netbsd.org>
List: netbsd-bugs
Date: 03/09/2001 14:27:46
>Number:         12364
>Category:       port-arm32
>Synopsis:       StrongARM performance tweaks cpufunc_asm.S
>Confidential:   no
>Severity:       non-critical
>Priority:       medium
>Responsible:    port-arm32-maintainer
>State:          open
>Class:          change-request
>Submitter-Id:   net
>Arrival-Date:   Fri Mar 09 06:28:00 PST 2001
>Closed-Date:
>Last-Modified:
>Originator:     Ben Harris
>Release:        1999-10-26
>Organization:
>Environment:
>Description:

To: port-arm32@netbsd.org
Subject: StrongARM performance tweaks cpufunc_asm.S
From: Richard Earnshaw <rearnsha@buzzard.freeserve.co.uk>

2 tweaks and (I think) some corrections, this time for StrongARM chips only.

First tweak:  The existing StrongARM code maintains two blocks of RAM for
use when flushing the cache; on each cache flush the blocks are switched.
This is unnecessary because we never use this RAM for any other purpose, so
all we achieve by switching blocks is to make the second and subsequent
cache flushes less efficient (since all they do is evict lines of data from
the other flush block).  With this change the switching code is compiled in
only when DOUBLE_CACHE_CLEAN_BANK is defined.

Second tweak: increase the size of the regions for which we use the
single-line flush commands (the threshold in sa110_cache_purgeID_rng and
sa110_cache_purgeD_rng is raised from 0x2000 to 0x4000) -- profiling shows
this to be a major win over flushing the entire cache (since the likelihood
is that such lines won't be in the cache anyway).

Finally, the correctness fix is to add instructions that drain the write
buffer after the cache clean operations -- these are particularly important
when we are trying to synchronize the instruction cache (I$) and the data
cache (D$).

R.

>How-To-Repeat:
>Fix:

--==_Exmh_-11899661060
Content-Type: text/plain ; name="arm32cpu.diffs"; charset=us-ascii
Content-Description: arm32cpu.diffs
Content-Disposition: attachment; filename="arm32cpu.diffs"

Index: cpufunc_asm.S
===================================================================
RCS file: /cvsroot/syssrc/sys/arch/arm32/arm32/cpufunc_asm.S,v
retrieving revision 1.10
diff -p -p -r1.10 cpufunc_asm.S
*** cpufunc_asm.S	1999/10/26 06:53:41	1.10
--- cpufunc_asm.S	2001/03/07 21:58:06
*************** ENTRY(sa110_cache_cleanD)
*** 513,525 ****
--- 513,528 ----
  #endif
  	ldr	r2, Lsa110_cache_clean_addr
  	ldmia	r2, {r0, r1}
+ #ifdef DOUBLE_CACHE_CLEAN_BANK
  	eor	r0, r0, r1
  	str	r0, [r2]
+ #endif
  
  Lsa110_cache_cleanD_loop:
  	ldr	r2, [r0], #32
  	subs	r1, r1, #32
  	bne	Lsa110_cache_cleanD_loop
+ 	mcr	15, 0, r0, c7, c10, 4		/* drain write buffer */
  
  #ifdef CACHE_CLEAN_BLOCK_INTR
  	msr	cpsr_all , r3
*************** ENTRY(sa110_cache_purgeID)
*** 542,555 ****
--- 545,561 ----
  #endif
  	ldr	r2, Lsa110_cache_clean_addr
  	ldmia	r2, {r0, r1}
+ #ifdef DOUBLE_CACHE_CLEAN_BANK
  	eor	r0, r0, r1
  	str	r0, [r2]
+ #endif
  
  Lsa110_cache_purgeID_loop:
  	ldr	r2, [r0], #32
  	subs	r1, r1, #32
  	bne	Lsa110_cache_purgeID_loop
  
+ 	mcr	15, 0, r0, c7, c10, 4		/* drain write buffer */
  	mcr	15, 0, r0, c7, c5, 0	/* flush I cache (D flushed above) */
  #ifdef CACHE_CLEAN_BLOCK_INTR
  	msr	cpsr_all , r3
*************** ENTRY(sa110_cache_purgeD)
*** 571,584 ****
--- 577,593 ----
  #endif
  	ldr	r2, Lsa110_cache_clean_addr
  	ldmia	r2, {r0, r1}
+ #ifdef DOUBLE_CACHE_CLEAN_BANK
  	eor	r0, r0, r1
  	str	r0, [r2]
+ #endif
  
  Lsa110_cache_purgeD_loop:
  	ldr	r2, [r0], #32
  	subs	r1, r1, #32
  	bne	Lsa110_cache_purgeD_loop
  
+ 	mcr	15, 0, r0, c7, c10, 4		/* drain write buffer */
  #ifdef CACHE_CLEAN_BLOCK_INTR
  	msr	cpsr_all , r3
  #else
*************** Lsa110_cache_purgeD_loop:
*** 588,599 ****
--- 597,610 ----
  
  ENTRY(sa110_cache_purgeID_E)
  	mcr	15, 0, r0, c7, c10, 1		/* clean dcache entry */
+ 	mcr	15, 0, r0, c7, c10, 4		/* drain write buffer */
  	mcr	15, 0, r0, c7, c5, 0		/* flush I cache */
  	mcr	15, 0, r0, c7, c6, 1		/* flush D cache single entry */
  	mov	pc, lr
  
  ENTRY(sa110_cache_purgeD_E)
  	mcr	15, 0, r0, c7, c10, 1		/* clean dcache entry */
+ 	mcr	15, 0, r0, c7, c10, 4		/* drain write buffer */
  	mcr	15, 0, r0, c7, c6, 1		/* flush D cache single entry */
  	mov	pc, lr
  #endif	/* CPU_SA110 */
*************** ENTRY(sa110_cache_syncI)
*** 626,639 ****
--- 637,653 ----
  #endif
  	ldr	r2, Lsa110_cache_clean_addr
  	ldmia	r2, {r0, r1}
+ #ifdef DOUBLE_CACHE_CLEAN_BANK
  	eor	r0, r0, r1
  	str	r0, [r2]
+ #endif
  
  Lsa110_cache_syncI_loop:
  	ldr	r2, [r0], #32
  	subs	r1, r1, #32
  	bne	Lsa110_cache_syncI_loop
  
+ 	mcr	15, 0, r0, c7, c10, 4		/* drain write buffer */
  	mcr	15, 0, r0, c7, c5, 0		/* flush I cache */
  #ifdef CACHE_CLEAN_BLOCK_INTR
  	msr	cpsr_all , r3
*************** sa110_cache_cleanD_rng_loop:
*** 656,666 ****
  	add	r0, r0, #32
  	subs	r1, r1, #32
  	bpl	sa110_cache_cleanD_rng_loop
  
  	mov	pc, lr
  
  ENTRY(sa110_cache_purgeID_rng)
! 	cmp	r1, #0x2000
  	bcs	_C_LABEL(sa110_cache_purgeID)
  
  	and	r2, r0, #0x1f
--- 670,681 ----
  	add	r0, r0, #32
  	subs	r1, r1, #32
  	bpl	sa110_cache_cleanD_rng_loop
+ 	mcr	15, 0, r0, c7, c10, 4		/* drain write buffer */
  
  	mov	pc, lr
  
  ENTRY(sa110_cache_purgeID_rng)
! 	cmp	r1, #0x4000
  	bcs	_C_LABEL(sa110_cache_purgeID)
  
  	and	r2, r0, #0x1f
*************** sa110_cache_purgeID_rng_loop:
*** 674,685 ****
  	subs	r1, r1, #32
  	bpl	sa110_cache_purgeID_rng_loop
  
  	mcr	15, 0, r0, c7, c5, 0		/* flush I cache */
  
  	mov	pc, lr
  
  ENTRY(sa110_cache_purgeD_rng)
! 	cmp	r1, #0x2000
  	bcs	_C_LABEL(sa110_cache_purgeD)
  
  	and	r2, r0, #0x1f
--- 689,701 ----
  	subs	r1, r1, #32
  	bpl	sa110_cache_purgeID_rng_loop
  
+ 	mcr	15, 0, r0, c7, c10, 4		/* drain write buffer */
  	mcr	15, 0, r0, c7, c5, 0		/* flush I cache */
  
  	mov	pc, lr
  
  ENTRY(sa110_cache_purgeD_rng)
! 	cmp	r1, #0x4000
  	bcs	_C_LABEL(sa110_cache_purgeD)
  
  	and	r2, r0, #0x1f
*************** sa110_cache_purgeD_rng_loop:
*** 693,698 ****
--- 709,715 ----
  	subs	r1, r1, #32
  	bpl	sa110_cache_purgeD_rng_loop
  
+ 	mcr	15, 0, r0, c7, c10, 4		/* drain write buffer */
  	mov	pc, lr
  
  ENTRY(sa110_cache_syncI_rng)
*************** sa110_cache_syncI_rng_loop:
*** 709,714 ****
--- 726,732 ----
  	subs	r1, r1, #32
  	bpl	sa110_cache_syncI_rng_loop
  
+ 	mcr	15, 0, r0, c7, c10, 4		/* drain write buffer */
  	mcr	15, 0, r0, c7, c5, 0		/* flush I cache */
  
  	mov	pc, lr

--==_Exmh_-11899661060--




>Release-Note:
>Audit-Trail:
>Unformatted: