Source-Changes-HG archive
[src/trunk]: src/common/lib/libc/arch/x86_64/string Align to the destination ...
details: https://anonhg.NetBSD.org/src/rev/9174d514d4c2
branches: trunk
changeset: 749232:9174d514d4c2
user: dsl <dsl%NetBSD.org@localhost>
date: Sun Nov 22 17:25:47 2009 +0000
description:
Align to the destination buffer.
This probably costs 1 clock (on modern CPUs) in the normal case,
but gives a big benefit when the destination is misaligned,
in particular when the source has the same misalignment - although
that may not be a gain on Nehalem!
Fixes PR/35535
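
The post-realignment quadword count computed by the new code below
(lea -9(%r11,%rdx),%rcx followed by shr $3,%rcx) corresponds roughly to
the following C, given as an illustrative sketch only (the names are
not from the commit):

    /* misalign = dst & 7 (non-zero), len = total byte count, len >= 8.
     * The first word (8 bytes at dst) and the last word (8 bytes ending
     * at dst + len) are written separately with unaligned 8-byte stores,
     * so the bulk 'rep movsq' only has to cover whatever lies between. */
    size_t quadwords = (len + misalign - 9) >> 3;

For example, len = 32 with the destination misaligned by 3 gives
(32 + 3 - 9) >> 3 = 3 quadwords, copied starting at dst + 5; the first
and last unaligned stores then cover bytes 0..7 and 24..31.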
diffstat:
common/lib/libc/arch/x86_64/string/bcopy.S | 39 +++++++++++++++++++++++++----
1 files changed, 33 insertions(+), 6 deletions(-)
diffs (92 lines):
diff -r 6f54beb27b63 -r 9174d514d4c2 common/lib/libc/arch/x86_64/string/bcopy.S
--- a/common/lib/libc/arch/x86_64/string/bcopy.S Sun Nov 22 17:09:58 2009 +0000
+++ b/common/lib/libc/arch/x86_64/string/bcopy.S Sun Nov 22 17:25:47 2009 +0000
@@ -32,14 +32,14 @@
#include <machine/asm.h>
#if defined(LIBC_SCCS)
- RCSID("$NetBSD: bcopy.S,v 1.3 2009/11/21 19:52:54 dsl Exp $")
+ RCSID("$NetBSD: bcopy.S,v 1.4 2009/11/22 17:25:47 dsl Exp $")
#endif
/*
* (ov)bcopy (src,dst,cnt)
* ws%tools.de@localhost (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
*
- * Hacked about by dsl%netnsd.org@localhost
+ * Hacked about by dsl%netbsd.org@localhost
*/
#ifdef MEMCOPY
@@ -55,7 +55,9 @@
movq %rdx,%rcx
#if defined(MEMCOPY) || defined(MEMMOVE)
movq %rdi,%rax /* must return destination address */
+ mov %rdi,%r11 /* for misaligned check */
#else
+ mov %rsi,%r11 /* for misaligned check */
xchgq %rdi,%rsi /* bcopy() has arg order reversed */
#endif
@@ -68,7 +70,7 @@
jz 8f /* j if less than 8 bytes */
lea -8(%rdi,%rdx),%r9 /* target address of last 8 */
- mov -8(%rsi,%rdx),%r10 /* get last bytes */
+ mov -8(%rsi,%rdx),%r10 /* get last word */
#if !defined(NO_OVERLAP)
cmpq %rdx,%r8 /* overlapping? */
jb 10f
@@ -80,16 +82,41 @@
* if %ecx is more than 76.
* AMD might do something similar some day.
*/
+ and $7,%r11 /* destination misaligned ? */
+ jnz 2f
rep
movsq
- mov %r10,(%r9) /* write last bytes */
+ mov %r10,(%r9) /* write last word */
+ ret
+
+/*
+ * Destination misaligned
+ * AMD say it is better to align the destination (not the source).
+ * This will also re-align copies if the source and dest are both
+ * misaligned by the same amount)
+ * (I think Nehalem will use its accelerated copy if the source
+ * and destination have the same alignment.)
+ */
+2:
+ lea -9(%r11,%rdx),%rcx /* post re-alignment count */
+ neg %r11 /* now -1 .. -7 */
+ mov (%rsi),%rdx /* get first word */
+ mov %rdi,%r8 /* target for first word */
+ lea 8(%rsi,%r11),%rsi
+ lea 8(%rdi,%r11),%rdi
+ shr $3,%rcx
+ rep
+ movsq
+ mov %rdx,(%r8) /* write first word */
+ mov %r10,(%r9) /* write last word */
ret
#if !defined(NO_OVERLAP)
/* Must copy backwards.
* Reverse copy is probably easy to code faster than 'rep movds'
- * since that requires (IIRC) an extra clock per iteration.
+ * since that requires (IIRC) an extra clock every 3 iterations (AMD).
* However I don't suppose anything cares that much!
+ * The big cost is the std/cld pair - reputedly 50+ cycles on Netburst P4.
* The copy is aligned with the buffer start (more likely to
* be a multiple of 8 than the end).
*/
@@ -106,7 +133,7 @@
/* Less than 8 bytes to copy, copy by bytes */
/* Intel Nehalem optimise 'rep movsb' for <= 7 bytes (9-15 clocks).
- * For long transfers it is 50+ !
+ * For longer transfers it is 50+ !
*/
8: mov %rdx,%rcx
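
As a rough C sketch of the forward-copy strategy after this change
(non-overlapping buffers, length >= 8; the helper and variable names
are illustrative and not part of the source):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Stand-in for 'rep movsq': copy nq 8-byte words, lowest address first. */
static void
copy_quadwords(unsigned char *d, const unsigned char *s, size_t nq)
{
	while (nq-- > 0) {
		memcpy(d, s, 8);
		d += 8;
		s += 8;
	}
}

/* Forward copy for non-overlapping buffers with len >= 8. */
static void
copy_forward(unsigned char *d, const unsigned char *s, size_t len)
{
	size_t misalign = (uintptr_t)d & 7;
	uint64_t first, last;

	/* The last word is loaded early and stored at the very end,
	 * which also mops up a length that is not a multiple of 8. */
	memcpy(&last, s + len - 8, 8);

	if (misalign == 0) {
		copy_quadwords(d, s, len >> 3);
	} else {
		/* Save the first (unaligned) word, then advance both
		 * pointers so the destination is 8-byte aligned before
		 * the bulk quadword copy. */
		memcpy(&first, s, 8);
		copy_quadwords(d + 8 - misalign, s + 8 - misalign,
		    (len + misalign - 9) >> 3);
		memcpy(d, &first, 8);
	}
	memcpy(d + len - 8, &last, 8);
}

Storing the first and last words after the bulk copy mirrors the order
in the assembly; since both were loaded from the source up front, the
bytes they overlap are simply rewritten with the same data.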