Source-Changes-HG archive
[src/trunk]: src/lib/libc/arch/sparc64/string New bzero() using block store insns.
details: https://anonhg.NetBSD.org/src/rev/1956b56653ce
branches: trunk
changeset: 480007:1956b56653ce
user: eeh <eeh@NetBSD.org>
date: Thu Dec 30 15:31:39 1999 +0000
description:
New bzero() using block store insns.
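For readers skimming the patch below: the new memset() entry point duplicates its fill byte across a full 64-bit register before falling through into the bzero path, so the bulk loops can issue 8-byte stores (and, for large lengths, VIS block stores). The following is a minimal C sketch of that duplication step, mirroring the and/sllx/or sequence in the diff; the function name is illustrative only, not part of the libc source.

#include <stdint.h>

/*
 * Replicate the low byte of c into all eight bytes of a 64-bit
 * pattern, doubling the filled width on each step.
 */
static uint64_t
spread_byte(int c)
{
	uint64_t pat = (uint64_t)(c & 0xff);	/* and %o1, 0x0ff, %o3 */

	pat |= pat << 8;			/* 2 copies */
	pat |= pat << 16;			/* 4 copies */
	pat |= pat << 32;			/* 8 copies */
	return pat;
}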
diffstat:
lib/libc/arch/sparc64/string/bzero.S | 294 +++++++++++++++++++++++++---------
1 files changed, 212 insertions(+), 82 deletions(-)
diffs (truncated from 335 to 300 lines):
diff -r 8a07920ad508 -r 1956b56653ce lib/libc/arch/sparc64/string/bzero.S
--- a/lib/libc/arch/sparc64/string/bzero.S Thu Dec 30 15:30:26 1999 +0000
+++ b/lib/libc/arch/sparc64/string/bzero.S Thu Dec 30 15:31:39 1999 +0000
@@ -1,7 +1,7 @@
-/* $NetBSD: bzero.S,v 1.1 1998/09/11 04:56:32 eeh Exp $ */
+/* $NetBSD: bzero.S,v 1.2 1999/12/30 15:31:39 eeh Exp $ */
/*
- * Copyright (c) 1992, 1993
+ * Copyright (c) 1992, 1993, 1999
* The Regents of the University of California. All rights reserved.
*
* This software was developed by the Computer Systems Engineering group
@@ -40,111 +40,241 @@
*/
#include <machine/asm.h>
+#ifndef _LOCORE
+#define _LOCORE
+#endif
+#include <machine/ctlreg.h>
+#include <machine/frame.h>
+#include <machine/psl.h>
+
#if defined(LIBC_SCCS) && !defined(lint)
#if 0
.asciz "@(#)bzero.s 8.1 (Berkeley) 6/4/93"
#else
- RCSID("$NetBSD: bzero.S,v 1.1 1998/09/11 04:56:32 eeh Exp $")
+ RCSID("$NetBSD: bzero.S,v 1.2 1999/12/30 15:31:39 eeh Exp $")
#endif
#endif /* LIBC_SCCS and not lint */
+#ifdef MEMSET
+/*
+ * memset(addr, c, len)
+ *
+ * Duplicate the pattern so it fills 64-bits, then swap around the
+ * arguments and call bzero.
+ */
+ENTRY(memset)
+ and %o1, 0x0ff, %o3
+ mov %o2, %o1
+ sllx %o3, 8, %o2
+ or %o2, %o3, %o2
+ mov %o0, %o4 ! Save original pointer
+ sllx %o2, 16, %o3
+ or %o2, %o3, %o2
+ sllx %o2, 32, %o3
+ or %o2, %o3, %o2
+#else
/*
* bzero(addr, len)
*
- * We should unroll the loop, but at the moment this would
- * gain nothing since the `std' instructions are what limits us.
+ * We want to use VIS instructions if we're clearing out more than
+ * 256 bytes, but to do that we need to properly save and restore the
+ * FP registers. Unfortunately the code to do that in the kernel needs
+ * to keep track of the current owner of the FPU, hence the different
+ * code.
+ *
*/
ENTRY(bzero)
! %o0 = addr, %o1 = len
-
- ! Optimize a common case: addr and len are both multiples of 8.
- or %o0, %o1, %o2
- btst 7, %o2 ! ((addr | len) & 7) != 0?
- bnz,pt %icc, 1f ! if so, cannot optimize
- cmp %o1, 15 ! len >= 15? -- 1st instr of 1: below
-
- /* `Good' operands, can just store doubles. */
+ clr %o2 ! Initialize our pattern
+#endif
+Lbzero_internal:
+ brz,pn %o1, Lbzero_done ! No bytes to copy??
+! cmp %o1, 8 ! Less than 8 bytes to go?
+! ble,a,pn %icc, Lbzero_small ! Do it byte at a time.
+! deccc 8, %o1 ! pre-decrement
+
+ btst 7, %o0 ! 64-bit aligned? Optimization
+ bz,pt %xcc, 2f
+ btst 3, %o0 ! 32-bit aligned?
+ bz,pt %xcc, 1f
+ btst 1, %o0 ! 16-bit aligned?
+ bz,pt %xcc, 0f
+ btst 3, %o0
+
+ !! unaligned -- store 1 byte
+ stb %o2, [%o0]
+ dec 1, %o1 ! Record storing 1 byte
+ inc %o0
+ cmp %o1, 2
+ bl,a,pn %icc, 7f ! 1 or 0 left
+ dec 8, %o1 ! Fixup count -8
0:
- deccc 8, %o1 ! while ((len -= 8) >= 0)
- bge,a 0b
- stx %g0, [%o0 + %o1] ! *(quad *)(addr + len) = 0;
- retl
- nop
-
- /*
- * Either the address is unaligned, or the count is not a
- * multiple of 8, or both. We will have to align the address
- * in order to use anything `better' than stb.
- */
-1:
-! cmp %o1, 15 ! len >= 15?
- bge,a,pn %xcc, Lstx ! yes, use stx
- btst 1, %o0 ! (but first check alignment)
-
- ! not enough to bother: do byte-at-a-time loop.
-2:
- deccc %o1 ! while (--len >= 0)
- brnz,a,pt %o1, 2b
- stb %g0, [%o0 + %o1] ! addr[len] = 0;
- retl
- nop
+ btst 3, %o0
+ bz,pt %xcc, 1f
+ btst 7, %o0 ! 64-bit aligned?
-Lstx:
- /*
- * There are at least 15 bytes to zero.
- * We may have to zero some initial stuff to align
- * the address.
- */
- bz,a %icc, 1f ! if (addr & 1) {
- btst 2, %o0
- stb %g0, [%o0] ! *addr = 0;
- inc %o0 ! addr++;
- dec %o1 ! len--;
- btst 2, %o0 ! }
+ !! 16-bit aligned -- store half word
+ sth %o2, [%o0]
+ dec 2, %o1 ! Prepare to store 2 bytes
+ inc 2, %o0
+ cmp %o1, 4
+ bl,a,pn %icc, 5f ! Less than 4 left
+ dec 8, %o1 ! Fixup count -8
1:
- bz,a 1f ! if (addr & 2) {
- btst 4, %o0
- sth %g0, [%o0] ! *(short *)addr = 0;
- inc 2, %o0 ! addr += 2;
- dec 2, %o1 ! len -= 2;
- btst 4, %o0 ! }
-1:
- bz 1f ! if (addr & 4) {
- dec 8, %o1
- st %g0, [%o0] ! *(int *)addr = 0;
- inc 4, %o0 ! addr += 4;
- dec 4, %o1 ! len -= 4;
- ! }
- /*
- * Address is double word aligned; len is 8 less than
- * the number of bytes remaining (i.e., len is 0 if
- * the remaining count is 8, 1 if it is 9, etc.).
- */
-1:
- stx %g0, [%o0] ! do {
-2: ! *(quad *)addr = 0;
- inc 8, %o0 ! addr += 8;
- deccc 8, %o1 ! } while ((len -= 8) >= 0);
- bge,a 2b
- stx %g0, [%o0]
+ btst 7, %o0 ! 64-bit aligned?
+ bz,pt %xcc, 2f
+ nop
+ !! 32-bit aligned -- store word
+ stw %o2, [%o0]
+ dec 4, %o1
+ inc 4, %o0
+ cmp %o1, 8
+ bl,a,pn %icc, Lbzero_cleanup ! Less than 8 left
+ dec 8, %o1 ! Fixup count -8
+2:
+ !! Now we're 64-bit aligned
+ cmp %o1, 256 ! Use block clear if len > 256
+ bge,pt %xcc, Lbzero_block ! use block store insns
+ deccc 8, %o1
+Lbzero_longs:
+ bl,pn %xcc, Lbzero_cleanup ! Less than 8 bytes left
+ nop
+3:
+ stx %o2, [%o0] ! Do 1 longword at a time
+ deccc 8, %o1
+ bge,pt %xcc, 3b
+ inc 8, %o0
/*
* Len is in [-8..-1] where -8 => done, -7 => 1 byte to zero,
* -6 => two bytes, etc. Mop up this remainder, if any.
*/
+Lbzero_cleanup:
btst 4, %o1
- bz 1f ! if (len & 4) {
+ bz,pt %xcc, 6f ! if (len & 4) {
btst 2, %o1
- stw %g0, [%o0] ! *(int *)addr = 0;
+ stw %o2, [%o0] ! *(int *)addr = 0;
inc 4, %o0 ! addr += 4;
-1:
- bz 1f ! if (len & 2) {
+5:
+ btst 2, %o1
+6:
+ bz,pt %xcc, 8f ! if (len & 2) {
btst 1, %o1
- sth %g0, [%o0] ! *(short *)addr = 0;
+ sth %o2, [%o0] ! *(short *)addr = 0;
inc 2, %o0 ! addr += 2;
-1:
- bnz,a 1f ! if (len & 1)
- stb %g0, [%o0] ! *addr = 0;
-1:
+7:
+ btst 1, %o1
+8:
+ bnz,a %icc, Lbzero_done ! if (len & 1)
+ stb %o2, [%o0] ! *addr = 0;
+Lbzero_done:
retl
+ mov %o4, %o0 ! Restore pointer for memset (ugh)
+
+ /*
+ * Len is in [-8..-1] where -8 => done, -7 => 1 byte to zero,
+ * -6 => two bytes, etc. but we're potentially unaligned.
+ * Do byte stores since it's easiest.
+ */
+Lbzero_small:
+ inccc 8, %o1
+ bz,pn %icc, Lbzero_done
+1:
+ deccc %o1
+ stb %o2, [%o0]
+ bge,pt %icc, 1b
+ inc %o0
+ ba,a,pt %icc, Lbzero_done
+ nop ! XXX spitfire bug?
+
+Lbzero_block:
+/*
+ * Userland:
+ *
+ * We allocate enough space on the stack to save our registers and save
+ * our floating point state. We really don't need to do this if the
+ * registers were not in use before, but we can't really tell if they
+ * were in use or not.
+ *
+ * See locore.s for the kernel version.
+ *
+ */
+ save %sp, -(CC64FSZ+32*8+BLOCK_SIZE), %sp ! Allocate an fpstate
+ add %sp, (CC64FSZ+BLOCK_SIZE-1), %l0 ! Calculate pointer to fpstate
+ andn %l0, BLOCK_ALIGN, %l0 ! And make it block aligned
+ btst 1, %sp
+ add %l0, BIAS, %l1 ! Fixup 64-bit stack pointers
+ movnz %xcc, %l1, %l0
+
+! wr %g0, FPRS_FEF, %fprs ! Enable FPU
+ stda %f0, [%l0] ASI_BLK_P
+ add %l0, BLOCK_SIZE, %l1
+ stda %f16, [%l1] ASI_BLK_COMMIT_P ! We only need two banks
+
+ !! We are now 8-byte aligned. We need to become 64-byte aligned.
+ btst 63, %i0
+ bz,pt %xcc, 2f
nop
+1:
+ stx %i2, [%i0]
+ inc 8, %i0
+ btst 63, %i0
+ bnz,pt %xcc, 1b
+ dec 8, %i1
+
+2:
+ brz,pt %i2, 4f ! Do we have a pattern to load?
+ fzero %f0 ! Set up FPU
+
+ btst 1, %fp
+ bnz,pt %icc, 3f ! 64-bit stack?
+ nop
+ stw %i2, [%fp + 0x28] ! Flush this puppy to RAM
+ membar #StoreLoad
+ ld [%fp + 0x28], %f0
+ ba,pt %icc, 4f
+ fmovsa %icc, %f0, %f1
+3:
+ stx %i2, [%fp + BIAS + 0x50] ! Flush this puppy to RAM
+ membar #StoreLoad
+ ldd [%fp + BIAS + 0x50], %f0
+4:
+ fmovda %icc, %f0, %f2 ! Duplicate the pattern
+ fmovda %icc, %f0, %f4
+ fmovda %icc, %f0, %f6
+ fmovda %icc, %f0, %f8
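Putting the truncated diff together, the non-block path behaves roughly like the C model below. This is a sketch under stated assumptions: it omits the VIS block-store path taken for lengths of 256 bytes and up (and the FP register save/restore that path requires), it glosses over C's strict-aliasing and unaligned-store rules, and the function name is illustrative. Here pat is the 64-bit pattern built by memset, or zero for bzero.

#include <stddef.h>
#include <stdint.h>

static void
fill_sketch(void *addr, uint64_t pat, size_t len)
{
	char *p = addr;

	/* Store 1, 2, then 4 bytes until p is 8-byte aligned. */
	if (len >= 1 && ((uintptr_t)p & 1)) {
		*p = (char)pat; p += 1; len -= 1;
	}
	if (len >= 2 && ((uintptr_t)p & 2)) {
		*(uint16_t *)p = (uint16_t)pat; p += 2; len -= 2;
	}
	if (len >= 4 && ((uintptr_t)p & 4)) {
		*(uint32_t *)p = (uint32_t)pat; p += 4; len -= 4;
	}

	/* Bulk loop: one 64-bit store per iteration (stx above). */
	while (len >= 8) {
		*(uint64_t *)p = pat; p += 8; len -= 8;
	}

	/* Mop up the 0..7 remaining bytes, largest stores first. */
	if (len & 4) { *(uint32_t *)p = (uint32_t)pat; p += 4; }
	if (len & 2) { *(uint16_t *)p = (uint16_t)pat; p += 2; }
	if (len & 1) { *p = (char)pat; }
}

The real routine carries len in a biased form through the cleanup (so [-8..-1] encodes 0..7 bytes remaining), which the model above replaces with a plain count for readability.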