Source-Changes-HG archive


[src/trunk]: src/common/lib/libc/arch/x86_64/string Align to the destination ...



details:   https://anonhg.NetBSD.org/src/rev/9174d514d4c2
branches:  trunk
changeset: 749232:9174d514d4c2
user:      dsl <dsl%NetBSD.org@localhost>
date:      Sun Nov 22 17:25:47 2009 +0000

description:
Align to the destination buffer.
This probably costs 1 clock (on modern CPUs) in the normal, already-aligned case,
but gives a big benefit when the destination is misaligned - in particular
when the source has the same misalignment, although that may not be a
gain on Nehalem!
Fixes PR/35535
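
In C terms the new fast path does roughly the following. This is a
rough sketch of the technique, not the committed assembly: it assumes
a forward copy with no overlap and len >= 8, and the function name is
made up for illustration.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative sketch of the destination-alignment trick. */
static void
copy_dst_aligned(void *dst, const void *src, size_t len)
{
        unsigned char *d = dst;
        const unsigned char *s = src;
        uint64_t first, last;

        /* Read the last word up front, as the assembly keeps it in %r10. */
        memcpy(&last, s + len - 8, 8);

        size_t mis = (uintptr_t)d & 7;          /* destination misalignment */
        if (mis != 0) {
                size_t skip = 8 - mis;          /* bytes to reach alignment */
                memcpy(&first, s, 8);           /* first word (%rdx in the diff) */
                /*
                 * Bulk qword copy starting at the first 8-byte-aligned
                 * destination address; (len - skip - 1) & ~7 mirrors
                 * lea -9(%r11,%rdx),%rcx / shr $3,%rcx in the diff.
                 */
                memcpy(d + skip, s + skip, (len - skip - 1) & ~(size_t)7);
                memcpy(d, &first, 8);           /* write first word, misaligned */
        } else {
                memcpy(d, s, len & ~(size_t)7); /* aligned: plain qword copy */
        }

        memcpy(d + len - 8, &last, 8);          /* last word absorbs the tail */
}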

diffstat:

 common/lib/libc/arch/x86_64/string/bcopy.S |  39 +++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 6 deletions(-)

diffs (92 lines):

diff -r 6f54beb27b63 -r 9174d514d4c2 common/lib/libc/arch/x86_64/string/bcopy.S
--- a/common/lib/libc/arch/x86_64/string/bcopy.S        Sun Nov 22 17:09:58 2009 +0000
+++ b/common/lib/libc/arch/x86_64/string/bcopy.S        Sun Nov 22 17:25:47 2009 +0000
@@ -32,14 +32,14 @@
 #include <machine/asm.h>
 
 #if defined(LIBC_SCCS)
-       RCSID("$NetBSD: bcopy.S,v 1.3 2009/11/21 19:52:54 dsl Exp $")
+       RCSID("$NetBSD: bcopy.S,v 1.4 2009/11/22 17:25:47 dsl Exp $")
 #endif
 
        /*
         * (ov)bcopy (src,dst,cnt)
         *  ws%tools.de@localhost     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
         *
-        * Hacked about by dsl%netnsd.org@localhost
+        * Hacked about by dsl%netbsd.org@localhost
         */
 
 #ifdef MEMCOPY
@@ -55,7 +55,9 @@
        movq    %rdx,%rcx
 #if defined(MEMCOPY) || defined(MEMMOVE)
        movq    %rdi,%rax       /* must return destination address */
+       mov     %rdi,%r11       /* for misaligned check */
 #else
+       mov     %rsi,%r11       /* for misaligned check */
        xchgq   %rdi,%rsi       /* bcopy() has arg order reversed */
 #endif
 
@@ -68,7 +70,7 @@
        jz      8f              /* j if less than 8 bytes */
 
        lea     -8(%rdi,%rdx),%r9       /* target address of last 8 */
-       mov     -8(%rsi,%rdx),%r10      /* get last bytes */
+       mov     -8(%rsi,%rdx),%r10      /* get last word */
 #if !defined(NO_OVERLAP)
        cmpq    %rdx,%r8        /* overlapping? */
        jb      10f
@@ -80,16 +82,41 @@
  * if %ecx is more than 76.
  * AMD might do something similar some day.
  */
+       and     $7,%r11         /* destination misaligned ? */
+       jnz     2f
        rep
        movsq
-       mov     %r10,(%r9)      /* write last bytes */
+       mov     %r10,(%r9)      /* write last word */
+       ret
+
+/*
+ * Destination misaligned
+ * AMD say it is better to align the destination (not the source).
+ * This will also re-align copies if the source and dest are both
+ * misaligned by the same amount.
+ * (I think Nehalem will use its accelerated copy if the source
+ * and destination have the same alignment.)
+ */
+2:
+       lea     -9(%r11,%rdx),%rcx      /* post re-alignment count */
+       neg     %r11                    /* now -1 .. -7 */
+       mov     (%rsi),%rdx             /* get first word */
+       mov     %rdi,%r8                /* target for first word */
+       lea     8(%rsi,%r11),%rsi
+       lea     8(%rdi,%r11),%rdi
+       shr     $3,%rcx
+       rep
+       movsq
+       mov     %rdx,(%r8)              /* write first word */
+       mov     %r10,(%r9)              /* write last word */
        ret
 
 #if !defined(NO_OVERLAP)
 /* Must copy backwards.
 * Reverse copy is probably easy to code faster than 'rep movs'
- * since that requires (IIRC) an extra clock per iteration.
+ * since that requires (IIRC) an extra clock every 3 iterations (AMD).
  * However I don't suppose anything cares that much!
+ * The big cost is the std/cld pair - reputedly 50+ cycles on Netburst P4.
  * The copy is aligned with the buffer start (more likely to
  * be a multiple of 8 than the end).
  */
@@ -106,7 +133,7 @@
 
 /* Less than 8 bytes to copy, copy by bytes */
 /* Intel Nehalem optimise 'rep movsb' for <= 7 bytes (9-15 clocks).
- * For long transfers it is 50+ !
+ * For longer transfers it is 50+ !
  */
 8:     mov     %rdx,%rcx
 


