Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/common/lib/libc/arch/aarch64/string Fixed to not use the "br...
details: https://anonhg.NetBSD.org/src/rev/d8841c5c464f
branches: trunk
changeset: 930644:d8841c5c464f
user: ryo <ryo%NetBSD.org@localhost>
date: Sat Apr 11 05:12:52 2020 +0000
description:
Fixed to not use the "br" instruction. Branch Target Identification (BTI) doesn't like "br".
requested by maxv@
diffstat:
common/lib/libc/arch/aarch64/string/bcopy.S | 134 ++++++++++++++++++--------
common/lib/libc/arch/aarch64/string/memset.S | 30 +++--
2 files changed, 110 insertions(+), 54 deletions(-)
diffs (245 lines):
diff -r 931d8a067d40 -r d8841c5c464f common/lib/libc/arch/aarch64/string/bcopy.S
--- a/common/lib/libc/arch/aarch64/string/bcopy.S Sat Apr 11 01:51:14 2020 +0000
+++ b/common/lib/libc/arch/aarch64/string/bcopy.S Sat Apr 11 05:12:52 2020 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: bcopy.S,v 1.1 2018/02/04 21:52:16 skrll Exp $ */
+/* $NetBSD: bcopy.S,v 1.2 2020/04/11 05:12:52 ryo Exp $ */
/*
* Copyright (c) 2018 Ryo Shimizu <ryo%nerv.org@localhost>
@@ -29,7 +29,7 @@
#include <machine/asm.h>
#if defined(LIBC_SCCS)
-RCSID("$NetBSD: bcopy.S,v 1.1 2018/02/04 21:52:16 skrll Exp $")
+RCSID("$NetBSD: bcopy.S,v 1.2 2020/04/11 05:12:52 ryo Exp $")
#endif
#if defined(MEMCOPY)
@@ -207,32 +207,60 @@
#endif /* (STP_ALIGN > 8) */
9:
+backward_copy1k:
+ /* while (len >= 1024) */
+ /* { src -= 1024; dst -= 1024; copy1024(dst, src); len -= 1024; } */
cmp LEN, #1024
- bhs backward_copy1k
-backward_less1k:
- /* copy 16*n bytes */
- and TMP_D, LEN, #(1023-15) /* len &= 1023; len &= ~15; */
- adr TMP_X, 8f
- sub LEN, LEN, TMP_D
- sub TMP_X, TMP_X, TMP_D, lsr #1 /* jump to (8f - len/2) */
- br TMP_X
-backward_copy1k: /* copy 16*64 bytes */
+ blo 9f
+1:
sub LEN, LEN, #1024
.rept (1024 / 16)
ldp DATA0, DATA1, [SRC0, #-16]! /* *--dst = *--src; */
stp DATA0, DATA1, [DST, #-16]!
.endr
-8:
- cbz LEN, done
cmp LEN, #1024
- bhs backward_copy1k
- cmp LEN, #16
- bhs backward_less1k
+ bhs 1b
+9:
+ /* if (len & 512) { src -= 512; dst -= 512; copy512(dst, src); } */
+ tbz LEN, #9, 1f
+ .rept (512 / 16)
+ ldp DATA0, DATA1, [SRC0, #-16]!
+ stp DATA0, DATA1, [DST, #-16]!
+ .endr
+1:
+ /* if (len & 256) { src -= 256; dst -= 256; copy256(dst, src); } */
+ tbz LEN, #8, 1f
+ .rept (256 / 16)
+ ldp DATA0, DATA1, [SRC0, #-16]!
+ stp DATA0, DATA1, [DST, #-16]!
+ .endr
+1:
+ /* if (len & 128) { src -= 128; dst -= 128; copy128(dst, src); } */
+ tbz LEN, #7, 1f
+ .rept (128 / 16)
+ ldp DATA0, DATA1, [SRC0, #-16]!
+ stp DATA0, DATA1, [DST, #-16]!
+ .endr
+1:
+ /* if (len & 64) { src -= 64; dst -= 64; copy64(dst, src); } */
+ tbz LEN, #6, 1f
+ .rept (64 / 16)
+ ldp DATA0, DATA1, [SRC0, #-16]!
+ stp DATA0, DATA1, [DST, #-16]!
+ .endr
+1:
+ /* if (len & 32) { src -= 32; dst -= 32; copy32(dst, src); } */
+ tbz LEN, #5, 1f
+ .rept (32 / 16)
+ ldp DATA0, DATA1, [SRC0, #-16]!
+ stp DATA0, DATA1, [DST, #-16]!
+ .endr
+1:
/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
tbz LEN, #4, 1f
ldp DATA0, DATA1, [SRC0, #-16]!
- ldp DATA0, DATA1, [DST, #-16]!
+ stp DATA0, DATA1, [DST, #-16]!
1:
/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
tbz LEN, #3, 1f
@@ -271,14 +299,10 @@
bcs 9f
backward_tiny:
/* copy 1-10 bytes */
- adr TMP_X, 8f
- sub TMP_X, TMP_X, LEN, lsl #3 /* jump to (8f - len*2) */
- br TMP_X
- .rept 10
+1: sub LEN, LEN, #1 /* do { len--; *--dst = *--src; } while (len != 0); assumes len >= 1 on entry - TODO confirm */
ldrb TMP_Xw, [SRC0, #-1]!
strb TMP_Xw, [DST, #-1]!
- .endr
-8:
+ cbnz LEN, 1b /* loop while bytes remain; cbz here would repeat only once LEN reached 0 */
ret
9:
/* length is small(<32), and src or dst may be unaligned */
@@ -548,14 +572,10 @@
bcs 9f
forward_tiny:
/* copy 1-10 bytes */
- adr TMP_X, 8f
- sub TMP_X, TMP_X, LEN, lsl #3 /* jump to (8f - len*2) */
- br TMP_X
- .rept 10
+1: sub LEN, LEN, #1 /* do { len--; *dst++ = *src++; } while (len != 0); assumes len >= 1 on entry - TODO confirm */
ldrb TMP_Xw, [SRC0], #1
strb TMP_Xw, [DST], #1
- .endr
-8:
+ cbnz LEN, 1b /* loop while bytes remain; cbz here would repeat only once LEN reached 0 */
ret
9:
/* length is small(<32), and src or dst may be unaligned */
@@ -938,28 +958,56 @@
#endif /* (STP_ALIGN > 8) */
9:
+forward_copy1k:
+ /* while (len >= 1024) */
+ /* { copy1024(dst, src); src += 1024; dst += 1024; len -= 1024; } */
cmp LEN, #1024
- bhs forward_copy1k
-forward_less1k:
- /* copy 16*n bytes */
- and TMP_D, LEN, #(1023-15) /* len &= 1023; len &= ~15; */
- adr TMP_X, 8f
- sub LEN, LEN, TMP_D
- sub TMP_X, TMP_X, TMP_D, lsr #1 /* jump to (8f - len/2) */
- br TMP_X
-forward_copy1k: /* copy 16*64 bytes */
+ blo 9f
+1:
sub LEN, LEN, #1024
.rept (1024 / 16)
ldp DATA0, DATA1, [SRC0], #16 /* *dst++ = *src++; */
stp DATA0, DATA1, [DST], #16
.endr
-8:
- cbz LEN, done
cmp LEN, #1024
- bhs forward_copy1k
- cmp LEN, #16
- bhs forward_less1k
+ bhs 1b
+9:
+ /* if (len & 512) { copy512(dst, src); src += 512; dst += 512; } */
+ tbz LEN, #9, 1f
+ .rept (512 / 16)
+ ldp DATA0, DATA1, [SRC0], #16
+ stp DATA0, DATA1, [DST], #16
+ .endr
+1:
+ /* if (len & 256) { copy256(dst, src); src += 256; dst += 256; } */
+ tbz LEN, #8, 1f
+ .rept (256 / 16)
+ ldp DATA0, DATA1, [SRC0], #16
+ stp DATA0, DATA1, [DST], #16
+ .endr
+1:
+ /* if (len & 128) { copy128(dst, src); src += 128; dst += 128; } */
+ tbz LEN, #7, 1f
+ .rept (128 / 16)
+ ldp DATA0, DATA1, [SRC0], #16
+ stp DATA0, DATA1, [DST], #16
+ .endr
+1:
+ /* if (len & 64) { copy64(dst, src); src += 64; dst += 64; } */
+ tbz LEN, #6, 1f
+ .rept (64 / 16)
+ ldp DATA0, DATA1, [SRC0], #16
+ stp DATA0, DATA1, [DST], #16
+ .endr
+1:
+ /* if (len & 32) { copy32(dst, src); src += 32; dst += 32; } */
+ tbz LEN, #5, 1f
+ .rept (32 / 16)
+ ldp DATA0, DATA1, [SRC0], #16
+ stp DATA0, DATA1, [DST], #16
+ .endr
+1:
/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
tbz LEN, #4, 1f
ldp DATA0, DATA1, [SRC0], #16
diff -r 931d8a067d40 -r d8841c5c464f common/lib/libc/arch/aarch64/string/memset.S
--- a/common/lib/libc/arch/aarch64/string/memset.S Sat Apr 11 01:51:14 2020 +0000
+++ b/common/lib/libc/arch/aarch64/string/memset.S Sat Apr 11 05:12:52 2020 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: memset.S,v 1.2 2017/08/29 15:00:23 ryo Exp $ */
+/* $NetBSD: memset.S,v 1.3 2020/04/11 05:12:52 ryo Exp $ */
/*-
* Copyright (c) 2014 The NetBSD Foundation, Inc.
@@ -158,18 +158,26 @@
tbz x15, #3, .Lzero_qword_aligned
str xzr, [x15], #8
.Lzero_qword_aligned:
- cbz x7, .Lblock_aligned /* less than 16 bytes? just branch */
- adr x6, .Lunrolled_end
- sub x6, x6, x7, lsl #2 /* backup to write the last N insn */
- br x6 /* and do it */
+ cbz x7, .Lblock_aligned /* aligned? just branch */
- /*
- * The maximum size of DCZID_EL0:BS supported is 2048 bytes.
- */
- .rept (2048 / 16) - 1
+ /* align to DCZID_EL0:BS boundary */
+ tbz x7, #0, 0f /* fill 16byte? */
+ stp xzr, xzr, [x15], #16
+0:
+ tbz x7, #1, 1f /* fill 32byte? */
+ stp xzr, xzr, [x15], #16
stp xzr, xzr, [x15], #16
- .endr
-.Lunrolled_end:
+1:
+ lsr x7, x7, #2
+ cbz x7, 9f
+.L64bytes_fill:
+ sub x7, x7, #1
+ stp xzr, xzr, [x15], #16
+ stp xzr, xzr, [x15], #16
+ stp xzr, xzr, [x15], #16
+ stp xzr, xzr, [x15], #16
+ cbnz x7, .L64bytes_fill
+9:
/*
* Now we are block aligned.
Home |
Main Index |
Thread Index |
Old Index