Source-Changes-HG archive
[src/trunk]: src/common/lib/libc/arch/x86_64/string Avoid doing two 'rep movs...
details: https://anonhg.NetBSD.org/src/rev/a544ac70e1a6
branches: trunk
changeset: 749217:a544ac70e1a6
user: dsl <dsl%NetBSD.org@localhost>
date: Sat Nov 21 19:52:54 2009 +0000
description:
Avoid doing two 'rep movs' operations.
diffstat:
common/lib/libc/arch/x86_64/string/bcopy.S | 100 ++++++++++++++++++++--------
1 files changed, 70 insertions(+), 30 deletions(-)
diffs (136 lines):
diff -r f979b750312e -r a544ac70e1a6 common/lib/libc/arch/x86_64/string/bcopy.S
--- a/common/lib/libc/arch/x86_64/string/bcopy.S Sat Nov 21 18:53:08 2009 +0000
+++ b/common/lib/libc/arch/x86_64/string/bcopy.S Sat Nov 21 19:52:54 2009 +0000
@@ -32,16 +32,19 @@
#include <machine/asm.h>
#if defined(LIBC_SCCS)
- RCSID("$NetBSD: bcopy.S,v 1.2 2007/11/12 18:41:59 ad Exp $")
+ RCSID("$NetBSD: bcopy.S,v 1.3 2009/11/21 19:52:54 dsl Exp $")
#endif
/*
* (ov)bcopy (src,dst,cnt)
* ws%tools.de@localhost (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
+ *
+ * Hacked about by dsl%netbsd.org@localhost
*/
#ifdef MEMCOPY
ENTRY(memcpy)
+#define NO_OVERLAP
#else
#ifdef MEMMOVE
ENTRY(memmove)
@@ -49,45 +52,82 @@
ENTRY(bcopy)
#endif
#endif
+ movq %rdx,%rcx
#if defined(MEMCOPY) || defined(MEMMOVE)
- movq %rdi,%r11 /* save dest */
+ movq %rdi,%rax /* must return destination address */
#else
- xchgq %rdi,%rsi
+ xchgq %rdi,%rsi /* bcopy() has arg order reversed */
+#endif
+
+#if !defined(NO_OVERLAP)
+ movq %rdi,%r8
+ subq %rsi,%r8
#endif
- movq %rdx,%rcx
- movq %rdi,%rax
- subq %rsi,%rax
- cmpq %rcx,%rax /* overlapping? */
- jb 1f
- /* nope, copy forwards. */
- shrq $3,%rcx /* copy by words */
+
+ shrq $3,%rcx /* count for copy by words */
+ jz 8f /* j if less than 8 bytes */
+
+ lea -8(%rdi,%rdx),%r9 /* target address of last 8 */
+ mov -8(%rsi,%rdx),%r10 /* get last bytes */
+#if !defined(NO_OVERLAP)
+ cmpq %rdx,%r8 /* overlapping? */
+ jb 10f
+#endif
+
+/*
+ * Non-overlapping, copy forwards.
+ * Newer Intel CPUs (Nehalem) will do 16-byte read/write transfers
+ * if %ecx is more than 76.
+ * AMD might do something similar some day.
+ */
rep
movsq
- movq %rdx,%rcx
- andq $7,%rcx /* any bytes left? */
+ mov %r10,(%r9) /* write last bytes */
+ ret
+
+#if !defined(NO_OVERLAP)
+/* Must copy backwards.
+ * Reverse copy could probably be hand-coded faster than 'rep movsq'
+ * since that requires (IIRC) an extra clock per iteration.
+ * However I don't suppose anything cares that much!
+ * The copy is aligned with the buffer start (more likely to
+ * be a multiple of 8 than the end).
+ */
+10:
+ lea -8(%rsi,%rcx,8),%rsi
+ lea -8(%rdi,%rcx,8),%rdi
+ std
+ rep
+ movsq
+ cld
+ mov %r10,(%r9) /* write last bytes */
+ ret
+#endif
+
+/* Less than 8 bytes to copy; copy by bytes. */
+/* Intel Nehalem optimises 'rep movsb' for <= 7 bytes (9-15 clocks).
+ * For long transfers it is 50+ clocks!
+ */
+8: mov %rdx,%rcx
+
+#if !defined(NO_OVERLAP)
+ cmpq %rdx,%r8 /* overlapping? */
+ jb 81f
+#endif
+
+ /* nope, copy forwards. */
rep
movsb
-#if defined(MEMCOPY) || defined(MEMMOVE)
- movq %r11,%rax
-#endif
ret
-1:
- addq %rcx,%rdi /* copy backwards. */
- addq %rcx,%rsi
+
+#if !defined(NO_OVERLAP)
+/* Must copy backwards */
+81:
+ lea -1(%rsi,%rcx),%rsi
+ lea -1(%rdi,%rcx),%rdi
std
- andq $7,%rcx /* any fractional bytes? */
- decq %rdi
- decq %rsi
rep
movsb
- movq %rdx,%rcx /* copy remainder by words */
- shrq $3,%rcx
- subq $7,%rsi
- subq $7,%rdi
- rep
- movsq
-#if defined(MEMCOPY) || defined(MEMMOVE)
- movq %r11,%rax
-#endif
cld
ret
+#endif
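
For readers who don't follow AT&T-syntax assembly, the logic of the new
code can be sketched in C. This is an illustrative model, not NetBSD
code: copy_sketch() and its internal names are invented here, and the
"label 8/10/81" comments refer back to the assembly above. The point of
the change is to load the final 8 source bytes *before* the word copy,
so that a single ordinary store replaces the second 'rep movs' which
previously handled the 1-7 byte remainder.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative C model of the new bcopy.S code paths (names invented). */
void *
copy_sketch(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;
	size_t words = len >> 3;	/* shrq $3,%rcx */
	uint64_t w, tail;

	/*
	 * Unsigned test: (dst - src) < len is true only when dst lies
	 * above src and close enough that a forward copy would read
	 * bytes it has already overwritten.  memcpy() (NO_OVERLAP)
	 * skips this test entirely.  (The raw pointer subtraction
	 * mirrors the subq; strict C only defines it within one object.)
	 */
	int overlap = (uintptr_t)(d - s) < len;

	if (words == 0) {		/* label 8: fewer than 8 bytes */
		if (overlap) {		/* label 81: backwards, by bytes */
			while (len--)
				d[len] = s[len];
		} else {		/* forwards 'rep movsb' */
			while (len--)
				*d++ = *s++;
		}
		return dst;
	}

	/* Load the last 8 source bytes now; stored once at the end. */
	memcpy(&tail, s + len - 8, 8);

	if (overlap) {			/* label 10: backwards 'rep movsq' */
		for (size_t i = words; i-- > 0; ) {
			memcpy(&w, s + i * 8, 8);
			memcpy(d + i * 8, &w, 8);
		}
	} else {			/* forwards 'rep movsq' */
		for (size_t i = 0; i < words; i++) {
			memcpy(&w, s + i * 8, 8);
			memcpy(d + i * 8, &w, 8);
		}
	}

	/* One store covers the 1-7 remainder bytes; it may harmlessly
	 * rewrite part of the last word copied above. */
	memcpy(d + len - 8, &tail, 8);
	return dst;
}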
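A quick (hypothetical) harness to convince yourself the sketch matches
memmove() semantics, including the overlapping cases; it assumes
copy_sketch() from the sketch above:

#include <assert.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	char ref[64], buf[64];

	/* Exercise every small dst/src offset and length combination. */
	for (size_t dst = 0; dst < 16; dst++)
		for (size_t src = 0; src < 16; src++)
			for (size_t len = 0; len <= 40; len++) {
				for (int i = 0; i < 64; i++)
					ref[i] = buf[i] = (char)i;
				memmove(ref + dst, ref + src, len);
				copy_sketch(buf + dst, buf + src, len);
				assert(memcmp(ref, buf, sizeof buf) == 0);
			}
	printf("copy_sketch agrees with memmove\n");
	return 0;
}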