Source-Changes-HG archive


[src/trunk]: src/common/lib/libc/arch/aarch64/string Fixed to not use the "br" instruction



details:   https://anonhg.NetBSD.org/src/rev/8154df8ce927
branches:  trunk
changeset: 971017:8154df8ce927
user:      ryo <ryo%NetBSD.org@localhost>
date:      Sat Apr 11 05:12:52 2020 +0000

description:
Fixed to not use the "br" instruction. Under Branch Target Identification (BTI), an indirect "br" must land on a BTI instruction, and the computed jump targets used here had no such landing pads.

requested by maxv@
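
The pattern being removed computed a target address with adr/sub and
jumped with "br" into the middle of an unrolled load/store sequence;
those mid-sequence targets carry no BTI landing pads, so the indirect
branch faults on BTI-enabled hardware. The replacement decomposes the
remaining length bit by bit using only direct branches (tbz). A rough
C sketch of the new tail handling, with illustrative names (not the
libc code itself):

    #include <stddef.h>
    #include <stdint.h>

    /*
     * Sketch of the BTI-friendly tail copy: test one bit of len at a
     * time, largest chunk first.  In the (unrolled) assembly each
     * test is a direct tbz branch, so no indirect "br" is needed.
     */
    static void copy_tail(uint8_t *dst, const uint8_t *src, size_t len)
    {
        for (size_t chunk = 512; chunk >= 16; chunk >>= 1) {
            if (len & chunk) {
                for (size_t i = 0; i < chunk; i++)
                    dst[i] = src[i];
                dst += chunk;
                src += chunk;
            }
        }
        /* the final 0-15 bytes are handled the same way, bit by bit */
    }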

diffstat:

 common/lib/libc/arch/aarch64/string/bcopy.S  |  134 ++++++++++++++++++--------
 common/lib/libc/arch/aarch64/string/memset.S |   30 +++--
 2 files changed, 110 insertions(+), 54 deletions(-)

diffs (245 lines):

diff -r 2f2fedfbb37b -r 8154df8ce927 common/lib/libc/arch/aarch64/string/bcopy.S
--- a/common/lib/libc/arch/aarch64/string/bcopy.S       Sat Apr 11 01:51:14 2020 +0000
+++ b/common/lib/libc/arch/aarch64/string/bcopy.S       Sat Apr 11 05:12:52 2020 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: bcopy.S,v 1.1 2018/02/04 21:52:16 skrll Exp $ */
+/* $NetBSD: bcopy.S,v 1.2 2020/04/11 05:12:52 ryo Exp $ */
 
 /*
  * Copyright (c) 2018 Ryo Shimizu <ryo%nerv.org@localhost>
@@ -29,7 +29,7 @@
 #include <machine/asm.h>
 
 #if defined(LIBC_SCCS)
-RCSID("$NetBSD: bcopy.S,v 1.1 2018/02/04 21:52:16 skrll Exp $")
+RCSID("$NetBSD: bcopy.S,v 1.2 2020/04/11 05:12:52 ryo Exp $")
 #endif
 
 #if defined(MEMCOPY)
@@ -207,32 +207,60 @@
 #endif /* (STP_ALIGN > 8) */
 9:
 
+backward_copy1k:
+       /* while (len >= 1024) */
+       /* { src -= 1024; dst -= 1024; copy1024(dst, src); len -= 1024; } */
        cmp     LEN, #1024
-       bhs     backward_copy1k
-backward_less1k:
-       /* copy 16*n bytes */
-       and     TMP_D, LEN, #(1023-15)          /* len &= 1023; len &= ~15; */
-       adr     TMP_X, 8f
-       sub     LEN, LEN, TMP_D
-       sub     TMP_X, TMP_X, TMP_D, lsr #1     /* jump to (8f - len/2) */
-       br      TMP_X
-backward_copy1k:       /* copy 16*64 bytes */
+       blo     9f
+1:
        sub     LEN, LEN, #1024
        .rept   (1024 / 16)
        ldp     DATA0, DATA1, [SRC0, #-16]!     /* *--dst = *--src; */
        stp     DATA0, DATA1, [DST, #-16]!
        .endr
-8:
-       cbz     LEN, done
        cmp     LEN, #1024
-       bhs     backward_copy1k
-       cmp     LEN, #16
-       bhs     backward_less1k
+       bhs     1b
+9:
 
+       /* if (len & 512) { src -= 512; dst -= 512; copy512(dst, src); } */
+       tbz     LEN, #9, 1f
+       .rept   (512 / 16)
+       ldp     DATA0, DATA1, [SRC0, #-16]!
+       stp     DATA0, DATA1, [DST, #-16]!
+       .endr
+1:
+       /* if (len & 256) { src -= 256; dst -= 256; copy256(dst, src); } */
+       tbz     LEN, #8, 1f
+       .rept   (256 / 16)
+       ldp     DATA0, DATA1, [SRC0, #-16]!
+       stp     DATA0, DATA1, [DST, #-16]!
+       .endr
+1:
+       /* if (len & 128) { src -= 128; dst -= 128; copy128(dst, src); } */
+       tbz     LEN, #7, 1f
+       .rept   (128 / 16)
+       ldp     DATA0, DATA1, [SRC0, #-16]!
+       stp     DATA0, DATA1, [DST, #-16]!
+       .endr
+1:
+       /* if (len & 64) { src -= 64; dst -= 64; copy64(dst, src); } */
+       tbz     LEN, #6, 1f
+       .rept   (64 / 16)
+       ldp     DATA0, DATA1, [SRC0, #-16]!
+       stp     DATA0, DATA1, [DST, #-16]!
+       .endr
+1:
+       /* if (len & 32) { src -= 32; dst -= 32; copy32(dst, src); } */
+       tbz     LEN, #5, 1f
+       .rept   (32 / 16)
+       ldp     DATA0, DATA1, [SRC0, #-16]!
+       stp     DATA0, DATA1, [DST, #-16]!
+       .endr
+1:
        /* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
        tbz     LEN, #4, 1f
        ldp     DATA0, DATA1, [SRC0, #-16]!
-       ldp     DATA0, DATA1, [DST, #-16]!
+       stp     DATA0, DATA1, [DST, #-16]!
 1:
        /* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
        tbz     LEN, #3, 1f
@@ -271,14 +299,10 @@
        bcs     9f
 backward_tiny:
        /* copy 1-10 bytes */
-       adr     TMP_X, 8f
-       sub     TMP_X, TMP_X, LEN, lsl #3       /* jump to (8f - len*2) */
-       br      TMP_X
-       .rept   10
+1:     sub     LEN, LEN, #1
        ldrb    TMP_Xw, [SRC0, #-1]!
        strb    TMP_Xw, [DST, #-1]!
-       .endr
-8:
+       cbnz    LEN, 1b
        ret
 9:
        /* length is small(<32), and src or dst may be unaligned */
@@ -548,14 +572,10 @@
        bcs     9f
 forward_tiny:
        /* copy 1-10 bytes */
-       adr     TMP_X, 8f
-       sub     TMP_X, TMP_X, LEN, lsl #3       /* jump to (8f - len*2) */
-       br      TMP_X
-       .rept   10
+1:     sub     LEN, LEN, #1
        ldrb    TMP_Xw, [SRC0], #1
        strb    TMP_Xw, [DST], #1
-       .endr
-8:
+       cbnz    LEN, 1b
        ret
 9:
        /* length is small(<32), and src or dst may be unaligned */
@@ -938,28 +958,56 @@
 #endif /* (STP_ALIGN > 8) */
 9:
 
+forward_copy1k:
+       /* while (len >= 1024) */
+       /* { copy1024(dst, src); src += 1024; dst += 1024; len -= 1024; } */
        cmp     LEN, #1024
-       bhs     forward_copy1k
-forward_less1k:
-       /* copy 16*n bytes */
-       and     TMP_D, LEN, #(1023-15)          /* len &= 1023; len &= ~15; */
-       adr     TMP_X, 8f
-       sub     LEN, LEN, TMP_D
-       sub     TMP_X, TMP_X, TMP_D, lsr #1     /* jump to (8f - len/2) */
-       br      TMP_X
-forward_copy1k:        /* copy 16*64 bytes */
+       blo     9f
+1:
        sub     LEN, LEN, #1024
        .rept   (1024 / 16)
        ldp     DATA0, DATA1, [SRC0], #16       /* *dst++ = *src++; */
        stp     DATA0, DATA1, [DST], #16
        .endr
-8:
-       cbz     LEN, done
        cmp     LEN, #1024
-       bhs     forward_copy1k
-       cmp     LEN, #16
-       bhs     forward_less1k
+       bhs     1b
+9:
 
+       /* if (len & 512) { copy512(dst, src); src += 512; dst += 512; } */
+       tbz     LEN, #9, 1f
+       .rept   (512 / 16)
+       ldp     DATA0, DATA1, [SRC0], #16
+       stp     DATA0, DATA1, [DST], #16
+       .endr
+1:
+       /* if (len & 256) { copy256(dst, src); src += 256; dst += 256; } */
+       tbz     LEN, #8, 1f
+       .rept   (256 / 16)
+       ldp     DATA0, DATA1, [SRC0], #16
+       stp     DATA0, DATA1, [DST], #16
+       .endr
+1:
+       /* if (len & 128) { copy128(dst, src); src += 128; dst += 128; } */
+       tbz     LEN, #7, 1f
+       .rept   (128 / 16)
+       ldp     DATA0, DATA1, [SRC0], #16
+       stp     DATA0, DATA1, [DST], #16
+       .endr
+1:
+       /* if (len & 64) { copy64(dst, src); src += 64; dst += 64; } */
+       tbz     LEN, #6, 1f
+       .rept   (64 / 16)
+       ldp     DATA0, DATA1, [SRC0], #16
+       stp     DATA0, DATA1, [DST], #16
+       .endr
+1:
+       /* if (len & 32) { copy32(dst, src); src += 32; dst += 32; } */
+       tbz     LEN, #5, 1f
+       .rept   (32 / 16)
+       ldp     DATA0, DATA1, [SRC0], #16
+       stp     DATA0, DATA1, [DST], #16
+       .endr
+1:
        /* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
        tbz     LEN, #4, 1f
        ldp     DATA0, DATA1, [SRC0], #16
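
Both "tiny" paths (1-10 bytes, forward and backward) get the same
treatment: the computed jump into an unrolled run of ldrb/strb pairs
becomes a plain counted byte loop. A minimal C sketch of the forward
case, with illustrative names (the assembly keeps the count in LEN,
which is known to be at least 1 on entry):

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of forward_tiny: copy len bytes one at a time, len >= 1. */
    static void copy_tiny_forward(uint8_t *dst, const uint8_t *src,
        size_t len)
    {
        do {
            *dst++ = *src++;        /* ldrb/strb with post-increment */
        } while (--len != 0);       /* sub LEN, LEN, #1; cbnz LEN, 1b */
    }
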
diff -r 2f2fedfbb37b -r 8154df8ce927 common/lib/libc/arch/aarch64/string/memset.S
--- a/common/lib/libc/arch/aarch64/string/memset.S      Sat Apr 11 01:51:14 2020 +0000
+++ b/common/lib/libc/arch/aarch64/string/memset.S      Sat Apr 11 05:12:52 2020 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: memset.S,v 1.2 2017/08/29 15:00:23 ryo Exp $ */
+/* $NetBSD: memset.S,v 1.3 2020/04/11 05:12:52 ryo Exp $ */
 
 /*-
  * Copyright (c) 2014 The NetBSD Foundation, Inc.
@@ -158,18 +158,26 @@
        tbz     x15, #3, .Lzero_qword_aligned
        str     xzr, [x15], #8
 .Lzero_qword_aligned:
-       cbz     x7, .Lblock_aligned     /* less than 16 bytes? just branch */
-       adr     x6, .Lunrolled_end
-       sub     x6, x6, x7, lsl #2      /* backup to write the last N insn */
-       br      x6                      /* and do it */
+       cbz     x7, .Lblock_aligned     /* aligned? just branch */
 
-       /*
-        * The maximum size of DCZID_EL0:BS supported is 2048 bytes.
-        */
-       .rept (2048 / 16) - 1
+       /* align to DCZID_EL0:BS boundary */
+       tbz     x7, #0, 0f              /* fill 16 bytes? */
+       stp     xzr, xzr, [x15], #16
+0:
+       tbz     x7, #1, 1f              /* fill 32 bytes? */
+       stp     xzr, xzr, [x15], #16
        stp     xzr, xzr, [x15], #16
-       .endr
-.Lunrolled_end:
+1:
+       lsr     x7, x7, #2
+       cbz     x7, 9f
+.L64bytes_fill:
+       sub     x7, x7, #1
+       stp     xzr, xzr, [x15], #16
+       stp     xzr, xzr, [x15], #16
+       stp     xzr, xzr, [x15], #16
+       stp     xzr, xzr, [x15], #16
+       cbnz    x7, .L64bytes_fill
+9:
 
 /*
  * Now we are block aligned.

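The memset.S change follows the same idea: instead of "br"-ing
backwards into a 2048-byte run of stp instructions to emit exactly the
right number of 16-byte stores, the new code peels off 16- and 32-byte
stores by bit-testing the count, then clears the rest 64 bytes per
iteration. A rough C sketch with illustrative names (x7 in the
assembly holds the number of 16-byte stores still needed before the
pointer reaches the DCZID_EL0:BS block boundary):

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of the new zeroing path; p is 16-byte aligned. */
    static uint8_t *zero_to_block_boundary(uint8_t *p, size_t n16)
    {
        if (n16 & 1) {                     /* one stp xzr, xzr */
            for (int i = 0; i < 16; i++) p[i] = 0;
            p += 16;
        }
        if (n16 & 2) {                     /* two stp xzr, xzr */
            for (int i = 0; i < 32; i++) p[i] = 0;
            p += 32;
        }
        for (n16 >>= 2; n16 != 0; n16--) { /* .L64bytes_fill loop */
            for (int i = 0; i < 64; i++) p[i] = 0;
            p += 64;
        }
        return p;
    }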

