Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/sys/lib/libkern/arch/sh5 Replace the SuperH memcpy() with ho...
details: https://anonhg.NetBSD.org/src/rev/f23534e851ea
branches: trunk
changeset: 538542:f23534e851ea
user: scw <scw%NetBSD.org@localhost>
date: Tue Oct 22 12:25:18 2002 +0000
description:
Replace the SuperH memcpy() with homebrewed code. The former seems to have
a subtle failure mode which can result in corruption of memory outside the
bounds of the destination buffer.
diffstat:
sys/lib/libkern/arch/sh5/memcpy.S | 337 ++++++++++++++++++-------------------
1 files changed, 168 insertions(+), 169 deletions(-)
diffs (truncated from 366 to 300 lines):
diff -r 9a54e4a179e9 -r f23534e851ea sys/lib/libkern/arch/sh5/memcpy.S
--- a/sys/lib/libkern/arch/sh5/memcpy.S Tue Oct 22 12:25:17 2002 +0000
+++ b/sys/lib/libkern/arch/sh5/memcpy.S Tue Oct 22 12:25:18 2002 +0000
@@ -1,194 +1,193 @@
-/* $NetBSD: memcpy.S,v 1.1 2002/10/17 11:53:33 scw Exp $ */
+/* $NetBSD: memcpy.S,v 1.2 2002/10/22 12:25:18 scw Exp $ */
/*
- * Fast SH5 memcpy, by J"orn Rennecke (joern.rennecke%superh.com@localhost)
- *
- * Copyright 2002 SuperH, Inc. All rights reserved
+ * Copyright 2002 Wasabi Systems, Inc.
+ * All rights reserved.
*
- * This software is the property of SuperH, Inc (SuperH) which specifically
- * grants the user the right to modify, use and distribute this software
- * provided this notice is not removed or altered. All other rights are
- * reserved by SuperH.
- *
- * SUPERH MAKES NO WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, WITH REGARD TO
- * THIS SOFTWARE. IN NO EVENT SHALL SUPERH BE LIABLE FOR INDIRECT, SPECIAL,
- * INCIDENTAL OR CONSEQUENTIAL DAMAGES IN CONNECTION WITH OR ARISING FROM
- * THE FURNISHING, PERFORMANCE, OR USE OF THIS SOFTWARE.
+ * Written by Steve C. Woodford for Wasabi Systems, Inc.
*
- * So that all may benefit from your experience, please report any problems
- * or suggestions about this software to the SuperH Support Center via
- * e-mail at softwaresupport%superh.com@localhost .
- *
- * SuperH, Inc.
- * 405 River Oaks Parkway
- * San Jose
- * CA 95134
- * USA
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed for the NetBSD Project by
+ * Wasabi Systems, Inc.
+ * 4. The name of Wasabi Systems, Inc. may not be used to endorse
+ * or promote products derived from this software without specific prior
+ * written permission.
*
- * The code assumes that any quadword can be read in its
- * enirety if at least one byte is included in the copy.
- */
-
-/*
- * Slightly modified for use in NetBSD
- * by Steve Woodford (scw%wasabisystems.com@localhost):
- * - LP64 support,
- * - tweak register usage, mostly to avoid using r24.
+ * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
*/
#include <machine/asm.h>
-#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
-#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
-#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
-#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
+/*
+ * void *memcpy(void *dest, void *src, size_t bytes)
+ *
+ * This is a reasonably fast memcpy() routine.
+ *
+ * If the src/dest parameters are suitably aligned, it will try to align
+ * things such that "alloco" can be used to pre-allocate a cache-line for
+ * "dest".
+ *
+ * If the low 3 bits of src and dest differ, the routine falls back
+ * to a byte-wise copy. This ain't great, but it serves the caller right.
+ *
+ * This algorithm could be improved upon, but I'm wary of trying to be
+ * too smart, given the lossage experienced with SuperH's memcpy() from
+ * newlib.
+ */
ENTRY(memcpy)
#ifndef _LP64
- add.l r2, r63, r2
+ add.l r2, r63, r7
add.l r3, r63, r3
addz.l r4, r63, r4
+#else
+ add r2, r63, r7
#endif
- ld.b r3, 0, r63
- pta/l Large, tr0
- movi 25, r0
- bgeu/u r4, r0, tr0
- nsb r4, r0
- shlli r0, 5, r0
- movi (L1 - L0 + 63*32 + 1) & 0xffff, r1
- sub r1, r0, r0
-L0: ptrel r0, tr0
- add r2, r4, r5
- ptabs r18, tr1
- add r3, r4, r6
- blink tr0, r63
+ ptabs/u r18, tr0
+ beq/u r4, r63, tr0 /* Bail now if bytes == 0 */
+
+ /*
+ * First, try to align operands. This can only be done if the low 3
+ * bits match.
+ */
+ pta/l Laligned, tr1
+ or r7, r3, r1
+ andi r1, 7, r1
+ beq/l r1, r63, tr1 /* Operands are already aligned */
+
+ pta/u Lbyte_copy, tr1
+ xor r7, r3, r0
+ andi r0, 7, r0 /* Operands misaligned differently? */
+ bne/u r0, r63, tr1 /* Yup. Fallback to copying byte-wise */
+
+ add r4, r1, r0
+ movi 8, r8
+ bgtu/l r8, r0, tr1
+
+ ldlo.q r3, 0, r0
+ stlo.q r7, 0, r0
+ sub r8, r1, r0
+ sub r4, r0, r4
+ add r7, r0, r7
+ add r3, r0, r3
- .balign 8
-L1:
- /* 0 byte memcpy */
+ /*
+ * The buffers are quad aligned. Now align dest to a 32-byte boundary
+ * if possible.
+ */
+Laligned:
+ movi 0x1f, r6
+ pta/u Ltrailer, tr2
+ bgeu/u r6, r4, tr2 /* Jump if less than 32 bytes left */
+ add r7, r63, r5
+ add r7, r6, r7
+ andc r7, r6, r7 /* Round dst up to 32-byte boundary */
+ sub r7, r5, r1
+ add r3, r1, r3 /* Adjust src to match */
+ sub r4, r1, r4
+ xor r1, r6, r1
+ addi r1, 2, r1
+ ptrel/l r1, tr1
blink tr1, r63
+ ld.q r3, -24, r0
+ st.q r7, -24, r0
+ ld.q r3, -16, r0
+ st.q r7, -16, r0
+ ld.q r3, -8, r0
+ st.q r7, -8, r0
-L4_7: /* 4..7 byte memcpy cntd. */
- stlo.l r2, 0, r0
- or r6, r7, r6
- sthi.l r5, -1, r6
- stlo.l r5, -4, r6
- blink tr1, r63
-
-L2_3: /* 2 or 3 byte memcpy cntd. */
- st.b r5, -1, r6
+ /*
+ * "dest" is now aligned to a multiple of 32 bytes
+ */
+ bgeu/u r6, r4, tr2 /* Jump if less than 32 bytes left */
+ pta/l Lcache_enter, tr1
+ pta/u Lcache_loop, tr2
+ ld.q r3, 0, r63 /* Prefetch one cache-line in advance */
+ alloco r7, 0 /* Allocate one cache-line in advance */
+ add r7, r4, r5
+ and r4, r6, r4
+ andc r5, r6, r5
blink tr1, r63
- /* 1 byte memcpy */
- ld.b r3, 0, r0
- st.b r2, 0, r0
- blink tr1, r63
+Lcache_loop:
+ ld.q r3, 0, r63 /* Prefetch in advance */
+ alloco r7, 0 /* Allocate one cache-line in advance */
+ ld.q r3, -32, r19
+ ld.q r3, -24, r20
+ ld.q r3, -16, r21
+ ld.q r3, -8, r22
+ st.q r7, -32, r19 /* Copy the previous cache-line */
+ st.q r7, -24, r20
+ st.q r7, -16, r21
+ st.q r7, -8, r22
+Lcache_enter:
+ addi r7, 32, r7 /* Next cache-line */
+ addi r3, 32, r3
+ bne/l r5, r7, tr2
-L8_15: /* 8..15 byte memcpy cntd. */
- stlo.q r2, 0, r0
- or r6, r7, r6
- sthi.q r5, -1, r6
- stlo.q r5, -8, r6
+ ld.q r3, -32, r19
+ ld.q r3, -24, r20
+ ld.q r3, -16, r21
+ ld.q r3, -8, r22
+ st.q r7, -32, r19
+ st.q r7, -24, r20
+ st.q r7, -16, r21
+ st.q r7, -8, r22
+
+ /*
+ * We have, at most, 31 bytes left to deal with.
+ */
+Ltrailer:
+ beq/u r4, r63, tr0 /* Return to caller if done. */
+ add r4, r7, r8
+ add r4, r3, r9
+ andi r4, 0x18, r4
+ add r7, r4, r7
+ add r3, r4, r3
+ xori r4, 0x1f, r4
+ addi r4, 2, r4
+ ptrel/l r4, tr1
blink tr1, r63
-
- /* 2 or 3 byte memcpy */
- ld.b r3, 0, r0
- ld.b r2, 0, r63
- ld.b r3, 1, r1
- st.b r2, 0, r0
- pta/l L2_3, tr0
- ld.b r6, -1, r6
- st.b r2, 1, r1
- blink tr0, r63
-
- /* 4 .. 7 byte memcpy */
- LDUAL (r3, 0, r0, r1)
- pta L4_7, tr0
- ldlo.l r6, -4, r7
- or r0, r1, r0
- sthi.l r2, 3, r0
- ldhi.l r6, -1, r6
- blink tr0, r63
-
- /* 8 .. 15 byte memcpy */
- LDUAQ (r3, 0, r0, r1)
- pta L8_15, tr0
- ldlo.q r6, -8, r7
- or r0, r1, r0
- sthi.q r2, 7, r0
- ldhi.q r6, -1, r6
+ ld.q r3, -24, r0
+ st.q r7, -24, r0
+ ld.q r3, -16, r0
+ st.q r7, -16, r0
+ ld.q r3, -8, r0
+ st.q r7, -8, r0
+ ldhi.q r9, -1, r0
+ sthi.q r8, -1, r0
blink tr0, r63
- /* 16 .. 24 byte memcpy */
- LDUAQ (r3, 0, r0, r1)
- LDUAQ (r3, 8, r8, r9)
- or r0, r1, r0
- sthi.q r2, 7, r0
- or r8, r9, r8
- sthi.q r2, 15, r8
- ldlo.q r6, -8, r7
- ldhi.q r6, -1, r6
- stlo.q r2, 8, r8
- stlo.q r2, 0, r0
- or r6, r7, r6
- sthi.q r5, -1, r6
- stlo.q r5, -8, r6
- blink tr1, r63
-Large:
- ld.b r2, 0, r63
- pta/l Loop_ua, tr1
- ori r3, -8, r7
Home |
Main Index |
Thread Index |
Old Index