Source-Changes-HG archive
[src/trunk]: src/common/lib/libc/arch/x86_64/string Avoid doing two 'rep movs...
details: https://anonhg.NetBSD.org/src/rev/a544ac70e1a6
branches: trunk
changeset: 749217:a544ac70e1a6
user: dsl <dsl%NetBSD.org@localhost>
date: Sat Nov 21 19:52:54 2009 +0000
description:
Avoid doing two 'rep movs' operations.
diffstat:
common/lib/libc/arch/x86_64/string/bcopy.S | 100 ++++++++++++++++++++--------
1 files changed, 70 insertions(+), 30 deletions(-)
diffs (136 lines):
diff -r f979b750312e -r a544ac70e1a6 common/lib/libc/arch/x86_64/string/bcopy.S
--- a/common/lib/libc/arch/x86_64/string/bcopy.S Sat Nov 21 18:53:08 2009 +0000
+++ b/common/lib/libc/arch/x86_64/string/bcopy.S Sat Nov 21 19:52:54 2009 +0000
@@ -32,16 +32,19 @@
#include <machine/asm.h>
#if defined(LIBC_SCCS)
- RCSID("$NetBSD: bcopy.S,v 1.2 2007/11/12 18:41:59 ad Exp $")
+ RCSID("$NetBSD: bcopy.S,v 1.3 2009/11/21 19:52:54 dsl Exp $")
#endif
/*
* (ov)bcopy (src,dst,cnt)
* ws%tools.de@localhost (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
+ *
+ * Hacked about by dsl%netbsd.org@localhost
*/
#ifdef MEMCOPY
ENTRY(memcpy)
+#define NO_OVERLAP
#else
#ifdef MEMMOVE
ENTRY(memmove)
@@ -49,45 +52,82 @@
ENTRY(bcopy)
#endif
#endif
+ movq %rdx,%rcx
#if defined(MEMCOPY) || defined(MEMMOVE)
- movq %rdi,%r11 /* save dest */
+ movq %rdi,%rax /* must return destination address */
#else
- xchgq %rdi,%rsi
+ xchgq %rdi,%rsi /* bcopy() has arg order reversed */
+#endif
+
+#if !defined(NO_OVERLAP)
+ movq %rdi,%r8
+ subq %rsi,%r8
#endif
- movq %rdx,%rcx
- movq %rdi,%rax
- subq %rsi,%rax
- cmpq %rcx,%rax /* overlapping? */
- jb 1f
- /* nope, copy forwards. */
- shrq $3,%rcx /* copy by words */
+
+ shrq $3,%rcx /* count for copy by words */
+ jz 8f /* j if less than 8 bytes */
+
+ lea -8(%rdi,%rdx),%r9 /* target address of last 8 */
+ mov -8(%rsi,%rdx),%r10 /* get last bytes */
+#if !defined(NO_OVERLAP)
+ cmpq %rdx,%r8 /* overlapping? */
+ jb 10f
+#endif
+
+/*
+ * Non-overlapping, copy forwards.
+ * Newer Intel CPUs (Nehalem) will do 16-byte read/write transfers
+ * if %ecx is more than 76.
+ * AMD might do something similar some day.
+ */
rep
movsq
- movq %rdx,%rcx
- andq $7,%rcx /* any bytes left? */
+ mov %r10,(%r9) /* write last bytes */
+ ret
+
+#if !defined(NO_OVERLAP)
+/* Must copy backwards.
+ * Reverse copy could probably be hand-coded faster than 'rep movsq'
+ * since that requires (IIRC) an extra clock per iteration.
+ * However I don't suppose anything cares that much!
+ * The copy is aligned with the buffer start (more likely to
+ * be a multiple of 8 than the end).
+ */
+10:
+ lea -8(%rsi,%rcx,8),%rsi
+ lea -8(%rdi,%rcx,8),%rdi
+ std
+ rep
+ movsq
+ cld
+ mov %r10,(%r9) /* write last bytes */
+ ret
+#endif
+
+/* Less than 8 bytes to copy; copy by bytes. */
+/* Intel Nehalem optimises 'rep movsb' for <= 7 bytes (9-15 clocks).
+ * For long transfers it is 50+ clocks!
+ */
+8: mov %rdx,%rcx
+
+#if !defined(NO_OVERLAP)
+ cmpq %rdx,%r8 /* overlapping? */
+ jb 81f
+#endif
+
+ /* nope, copy forwards. */
rep
movsb
-#if defined(MEMCOPY) || defined(MEMMOVE)
- movq %r11,%rax
-#endif
ret
-1:
- addq %rcx,%rdi /* copy backwards. */
- addq %rcx,%rsi
+
+#if !defined(NO_OVERLAP)
+/* Must copy backwards */
+81:
+ lea -1(%rsi,%rcx),%rsi
+ lea -1(%rdi,%rcx),%rdi
std
- andq $7,%rcx /* any fractional bytes? */
- decq %rdi
- decq %rsi
rep
movsb
- movq %rdx,%rcx /* copy remainder by words */
- shrq $3,%rcx
- subq $7,%rsi
- subq $7,%rdi
- rep
- movsq
-#if defined(MEMCOPY) || defined(MEMMOVE)
- movq %r11,%rax
-#endif
cld
ret
+#endif
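
For readers who don't follow AT&T-syntax assembly, the logic of the new
code can be sketched in C. This is an illustrative model, not NetBSD
code: copy_sketch() and its internal names are invented here, and the
"label 8/10/81" comments refer back to the assembly above. The point of
the change is to load the final 8 source bytes *before* the word copy,
so that a single ordinary store replaces the second 'rep movs' which
previously handled the 1-7 byte remainder.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative C model of the new bcopy.S code paths (names invented). */
void *
copy_sketch(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;
	size_t words = len >> 3;	/* shrq $3,%rcx */
	uint64_t w, tail;

	/*
	 * Unsigned test: (dst - src) < len is true only when dst lies
	 * above src and close enough that a forward copy would read
	 * bytes it has already overwritten.  memcpy() (NO_OVERLAP)
	 * skips this test entirely.  (The raw pointer subtraction
	 * mirrors the subq; strict C only defines it within one object.)
	 */
	int overlap = (uintptr_t)(d - s) < len;

	if (words == 0) {		/* label 8: fewer than 8 bytes */
		if (overlap) {		/* label 81: backwards, by bytes */
			while (len--)
				d[len] = s[len];
		} else {		/* forwards 'rep movsb' */
			while (len--)
				*d++ = *s++;
		}
		return dst;
	}

	/* Load the last 8 source bytes now; stored once at the end. */
	memcpy(&tail, s + len - 8, 8);

	if (overlap) {			/* label 10: backwards 'rep movsq' */
		for (size_t i = words; i-- > 0; ) {
			memcpy(&w, s + i * 8, 8);
			memcpy(d + i * 8, &w, 8);
		}
	} else {			/* forwards 'rep movsq' */
		for (size_t i = 0; i < words; i++) {
			memcpy(&w, s + i * 8, 8);
			memcpy(d + i * 8, &w, 8);
		}
	}

	/* One store covers the 1-7 remainder bytes; it may harmlessly
	 * rewrite part of the last word copied above. */
	memcpy(d + len - 8, &tail, 8);
	return dst;
}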
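A quick (hypothetical) harness to convince yourself the sketch matches
memmove() semantics, including the overlapping cases; it assumes
copy_sketch() from the sketch above:

#include <assert.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	char ref[64], buf[64];

	/* Exercise every small dst/src offset and length combination. */
	for (size_t dst = 0; dst < 16; dst++)
		for (size_t src = 0; src < 16; src++)
			for (size_t len = 0; len <= 40; len++) {
				for (int i = 0; i < 64; i++)
					ref[i] = buf[i] = (char)i;
				memmove(ref + dst, ref + src, len);
				copy_sketch(buf + dst, buf + src, len);
				assert(memcmp(ref, buf, sizeof buf) == 0);
			}
	printf("copy_sketch agrees with memmove\n");
	return 0;
}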