Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/lib/libc/arch/sparc64/string I seem to have stumbled on an even faster bcopy implementation....
details: https://anonhg.NetBSD.org/src/rev/9c2491f711bb
branches: trunk
changeset: 511990:9c2491f711bb
user: eeh <eeh%NetBSD.org@localhost>
date: Sun Jul 01 22:19:51 2001 +0000
description:
I seem to have stumbled on an even faster bcopy implementation....
diffstat:
lib/libc/arch/sparc64/string/bcopy.S | 370 +++++++++++++++++++++-------------
1 files changed, 231 insertions(+), 139 deletions(-)
diffs (truncated from 425 to 300 lines):
diff -r 0c3f04fb0032 -r 9c2491f711bb lib/libc/arch/sparc64/string/bcopy.S
--- a/lib/libc/arch/sparc64/string/bcopy.S Sun Jul 01 21:41:58 2001 +0000
+++ b/lib/libc/arch/sparc64/string/bcopy.S Sun Jul 01 22:19:51 2001 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: bcopy.S,v 1.1 2001/06/30 00:10:48 eeh Exp $ */
+/* $NetBSD: bcopy.S,v 1.2 2001/07/01 22:19:51 eeh Exp $ */
/*
* Copyright (c) 2001 Eduardo E. Horvath
@@ -46,7 +46,7 @@
#include <machine/psl.h>
#if defined(LIBC_SCCS) && !defined(lint)
- RCSID("$NetBSD: bcopy.S,v 1.1 2001/06/30 00:10:48 eeh Exp $")
+ RCSID("$NetBSD: bcopy.S,v 1.2 2001/07/01 22:19:51 eeh Exp $")
#endif /* LIBC_SCCS and not lint */
#define EMPTY nop
@@ -101,7 +101,7 @@
cmp %o2, BCOPY_SMALL
Lbcopy_start:
bge Lbcopy_fancy ! if >= this many, go be fancy.
- btst 7, %o0 ! (part of being fancy)
+ cmp %o2, 256
/*
* Not much to copy, just do it a byte at a time.
@@ -124,147 +124,247 @@
/*
* Plenty of data to copy, so try to do it optimally.
*/
+1:
+#if 1
+ ! If it is big enough, use VIS instructions
+ bge Lbcopy_block
+ nop
+#endif
Lbcopy_fancy:
- ! check for common case first: everything lines up.
-! btst 7, %o0 ! done already
- bne 1f
- EMPTY
- btst 7, %o1
- be,a Lbcopy_doubles
- dec 8, %o2 ! if all lined up, len -= 8, goto bcopy_doubes
-1:
- ! If it is big enough, use VIS instructions
- cmp %o2, 256
- bge Lbcopy_block
+
+ !!
+ !! First align the output to a 8-byte entity
+ !!
+
+ save %sp, -CC64FSZ, %sp
+ mov %i0, %o0
+ mov %i1, %o1
+ mov %i2, %o2
+
+ btst 1, %o1
+ bz,pt %icc, 4f
+ btst 2, %o1
- ! If the low bits match, we can make these line up.
-1:
- xor %o0, %o1, %o3 ! t = src ^ dst;
- btst 1, %o3 ! if (t & 1) {
- be 1f
- btst 1, %o0 ! [delay slot: if (src & 1)]
+ ldub [%o0], %o4 ! Load 1st byte
+ dec 1, %o2
+ brlez,pn %o2, Lbcopy_finish ! XXXX
+ inc 1, %o0
+ stb %o4, [%o1] ! Store 1st byte
+ inc 1, %o1 ! Update address
+ btst 2, %o1
+4:
+ bz,pt %icc, 4f
+ btst 1, %o0
+
+ bz,a 1f
+ lduh [%o0], %o4 ! Load short
- ! low bits do not match, must copy by bytes.
-0:
- ldsb [%o0], %o4 ! do {
- inc %o0 ! (++dst)[-1] = *src++;
- inc %o1
- deccc %o2
- bnz 0b ! } while (--len != 0);
- stb %o4, [%o1 - 1]
- retl
- nop
- NOTREACHED
+ ldub [%o0], %o4 ! Load bytes
+ ldub [%o0+1], %o3
+ sllx %o4, 8, %o4
+ or %o3, %o4, %o4
+1:
+ dec 2, %o2
+ brlez,pn %o2, Lbcopy_finish ! XXXX
+ inc 2, %o0
+ sth %o4, [%o1] ! Store 1st short
+ inc 2, %o1
+4:
+ btst 4, %o1
+ bz 4f
+ btst 3, %o0
- ! lowest bit matches, so we can copy by words, if nothing else
-1:
- be 1f ! if (src & 1) {
- btst 2, %o3 ! [delay slot: if (t & 2)]
+ bz,a 1f
+ lduw [%o0], %o4 ! Load word -1
- ! although low bits match, both are 1: must copy 1 byte to align
- ldsb [%o0], %o4 ! *dst++ = *src++;
- stb %o4, [%o1]
- inc %o0
- inc %o1
- dec %o2 ! len--;
- btst 2, %o3 ! } [if (t & 2)]
-1:
- be 1f ! if (t & 2) {
- btst 2, %o0 ! [delay slot: if (src & 2)]
- dec 2, %o2 ! len -= 2;
-0:
- ldsh [%o0], %o4 ! do {
- sth %o4, [%o1] ! *(short *)dst = *(short *)src;
- inc 2, %o0 ! dst += 2, src += 2;
- deccc 2, %o2 ! } while ((len -= 2) >= 0);
- bge 0b
- inc 2, %o1
- b Lbcopy_mopb ! goto mop_up_byte;
- btst 1, %o2 ! } [delay slot: if (len & 1)]
- NOTREACHED
+ btst 1, %o0
+ bz,a 2f
+ lduh [%o0], %o4
+
+ ldub [%o0], %o4
+ lduh [%o0+1], %o3
+ sllx %o4, 16, %o4
+ or %o4, %o3, %o4
+ ldub [%o0+3], %o3
+ sllx %o4, 8, %o4
+ ba 1f
+ or %o4, %o3, %o4
+2:
+ lduh [%o0+2], %o3
+ sllx %o4, 16, %o4
+ or %o4, %o3, %o4
+1:
+ dec 4, %o2
+ brlez,pn %o2, Lbcopy_finish ! XXXX
+ inc 4, %o0
+ st %o4, [%o1] ! Store word
+ inc 4, %o1
+4:
+ !!
+ !! We are now 32-bit aligned in the dest.
+ !!
+Lbcopy__common:
- ! low two bits match, so we can copy by longwords
-1:
- be 1f ! if (src & 2) {
- btst 4, %o3 ! [delay slot: if (t & 4)]
+ and %o0, 7, %o4 ! Shift amount
+ andn %o0, 7, %o3 ! Source addr
+ sllx %o4, 3, %o4 ! In bits
+
+ brz %o4, Lbcopy_noshift8
+ nop
+
+ ldx [%o3], %l0 ! Load word -1
+ add %o3, 8, %o0 ! now use %o0 for src
+ ldx [%o0], %l1 ! Load word 0
+
+ add %o3, 8, %o0 ! now use %o0 for src
+ sllx %l0, %o4, %l0 ! Shift high word
+
+ mov 8<<3, %o3
+ sub %o3, %o4, %o3 ! Reverse shift
+ and %o3, 0x38, %o3
+ !!
+ !! Continue until our dest is block aligned
+ !!
- ! although low 2 bits match, they are 10: must copy one short to align
- ldsh [%o0], %o4 ! (*short *)dst = *(short *)src;
- sth %o4, [%o1]
- inc 2, %o0 ! dst += 2;
- inc 2, %o1 ! src += 2;
- dec 2, %o2 ! len -= 2;
- btst 4, %o3 ! } [if (t & 4)]
+ !! Unrolled 8 times
+Lbcopy_aligned8:
+ brz %o2, Lbcopy_finish
+ srlx %l1, %o3, %o5 ! Shift low word
+
+ inc 8, %o0
+ ldx [%o0], %l2 ! Load next part
1:
- be 1f ! if (t & 4) {
- btst 4, %o0 ! [delay slot: if (src & 4)]
- dec 4, %o2 ! len -= 4;
-0:
- ld [%o0], %o4 ! do {
- st %o4, [%o1] ! *(int *)dst = *(int *)src;
- inc 4, %o0 ! dst += 4, src += 4;
- deccc 4, %o2 ! } while ((len -= 4) >= 0);
- bge 0b
- inc 4, %o1
- b Lbcopy_mopw ! goto mop_up_word_and_byte;
- btst 2, %o2 ! } [delay slot: if (len & 2)]
- NOTREACHED
+
+ dec 8, %o2
+ srlx %l1, %o3, %o5 ! Shift low word
+ brlez,pn %o2, Lbcopy_finish ! Should never happen
+ or %o5, %l0, %o5 ! Combine
+
+ inc 8, %o0
+ sllx %l1, %o4, %l0
+
+ ldx [%o0], %l3 ! Load next part
+ stx %o5, [%o1] ! Store result
+ inc 8, %o1
+
+ dec 8, %o2
+ srlx %l2, %o3, %o5 ! Shift low word
+ brlez,pn %o2, Lbcopy_finish ! Should never happen
+ or %o5, %l0, %o5 ! Combine
+
+ inc 8, %o0
+ sllx %l2, %o4, %l0
+
+ ldx [%o0], %l4 ! Load next part
+ stx %o5, [%o1] ! Store result
+ inc 8, %o1
+
+ dec 8, %o2
+ srlx %l3, %o3, %o5 ! Shift low word
+ brlez,pn %o2, Lbcopy_finish ! Should never happen
+ or %o5, %l0, %o5 ! Combine
+
+ inc 8, %o0
+ sllx %l3, %o4, %l0
+
+ ldx [%o0], %l5 ! Load next part
+ stx %o5, [%o1] ! Store result
+ inc 8, %o1
+
+ dec 8, %o2
+ srlx %l4, %o3, %o5 ! Shift low word
+ brlez,pn %o2, Lbcopy_finish ! Should never happen
+ or %o5, %l0, %o5 ! Combine
- ! low three bits match, so we can copy by doublewords
-1:
- be 1f ! if (src & 4) {
- dec 8, %o2 ! [delay slot: len -= 8]
- ld [%o0], %o4 ! *(int *)dst = *(int *)src;
- st %o4, [%o1]
- inc 4, %o0 ! dst += 4, src += 4, len -= 4;
- inc 4, %o1
- dec 4, %o2 ! }
-1:
-Lbcopy_doubles:
- ldx [%o0], %g5 ! do {
- stx %g5, [%o1] ! *(double *)dst = *(double *)src;
- inc 8, %o0 ! dst += 8, src += 8;
- deccc 8, %o2 ! } while ((len -= 8) >= 0);
- bge Lbcopy_doubles
+ inc 8, %o0
+ sllx %l4, %o4, %l0
+
+ ldx [%o0], %l6 ! Load next part
+ stx %o5, [%o1] ! Store result
+ inc 8, %o1
+
+ dec 8, %o2
+ srlx %l5, %o3, %o5 ! Shift low word
+ brlez,pn %o2, Lbcopy_finish ! Should never happen
+ or %o5, %l0, %o5 ! Combine
+
+ inc 8, %o0
+ sllx %l5, %o4, %l0
+
+ ldx [%o0], %l7 ! Load next part
+ stx %o5, [%o1] ! Store result
+ inc 8, %o1
+
+ dec 8, %o2
+ srlx %l6, %o3, %o5 ! Shift low word
+ brlez,pn %o2, Lbcopy_finish ! Should never happen
+ or %o5, %l0, %o5 ! Combine
+
Home | Main Index | Thread Index | Old Index