[src/trunk]: src/sys/crypto/chacha/arch Reduce some duplication.
details: https://anonhg.NetBSD.org/src/rev/a285cc0013a5
branches: trunk
changeset: 936476:a285cc0013a5
user: riastradh <riastradh@NetBSD.org>
date: Mon Jul 27 20:48:18 2020 +0000
description:
Reduce some duplication.
Shouldn't substantively hurt performance -- the comparison that has
been moved into the loop was essentially the former loop condition --
and may improve performance by reducing code size since there's only
one inline call to chacha_permute instead of two.
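The shape of the change, distilled: the tail case moves inside the loop behind the inverted former loop condition, so the permute-and-add sequence is emitted at one call site instead of two. A minimal scalar sketch of the pattern follows; permute() is a hypothetical stand-in for the inlined chacha_permute core, not the NetBSD sources verbatim.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define BLKSZ 64

/* Hypothetical stand-in for the expensive, inlined chacha_permute core. */
static void
permute(uint8_t blk[BLKSZ])
{
	unsigned i;

	for (i = 0; i < BLKSZ; i++)	/* dummy mixing, for illustration */
		blk[i] = (uint8_t)(5*blk[i] + 1);
}

/* Before: the permute sequence is instantiated at two call sites. */
static void
stream_before(uint8_t *s, size_t n)
{
	uint8_t blk[BLKSZ] = {0};

	for (; n >= BLKSZ; s += BLKSZ, n -= BLKSZ) {
		permute(blk);		/* call site 1: full blocks */
		memcpy(s, blk, BLKSZ);
	}
	if (n) {			/* former tail branch */
		permute(blk);		/* call site 2: duplicated logic */
		memcpy(s, blk, n);
	}
}

/* After: one call site; the old loop condition becomes an early break. */
static void
stream_after(uint8_t *s, size_t n)
{
	uint8_t blk[BLKSZ] = {0};

	for (; n; s += BLKSZ, n -= BLKSZ) {
		permute(blk);			/* single call site */
		if (n < BLKSZ) {		/* inverted former loop condition */
			memcpy(s, blk, n);	/* partial final block */
			break;
		}
		memcpy(s, blk, BLKSZ);
	}
}

In the real SIMD code the full-block path stores the registers straight to the output and only the tail goes through a 16-byte-aligned bounce buffer, but the control flow is the same as above.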
diffstat:
sys/crypto/chacha/arch/arm/chacha_neon.c | 83 ++++++++++++-----------------
sys/crypto/chacha/arch/x86/chacha_sse2.c | 87 ++++++++++++-------------------
2 files changed, 70 insertions(+), 100 deletions(-)
diffs (266 lines):
diff -r 185330b3c140 -r a285cc0013a5 sys/crypto/chacha/arch/arm/chacha_neon.c
--- a/sys/crypto/chacha/arch/arm/chacha_neon.c Mon Jul 27 20:46:17 2020 +0000
+++ b/sys/crypto/chacha/arch/arm/chacha_neon.c Mon Jul 27 20:48:18 2020 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: chacha_neon.c,v 1.1 2020/07/25 22:51:57 riastradh Exp $ */
+/* $NetBSD: chacha_neon.c,v 1.2 2020/07/27 20:48:18 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -168,7 +168,7 @@
le32dec(nonce + 8)
};
- for (; n >= 64; s += 64, n -= 64) {
+ for (; n; s += 64, n -= 64) {
r0 = in0;
r1 = in1;
r2 = in2;
@@ -178,32 +178,25 @@
r1 = vhtole_u32(vaddq_u32(r1, in1));
r2 = vhtole_u32(vaddq_u32(r2, in2));
r3 = vhtole_u32(vaddq_u32(r3, in3));
+
+ if (n < 64) {
+ uint8_t buf[64] __aligned(16);
+
+ vst1q_u32((uint32_t *)buf + 4*0, r0);
+ vst1q_u32((uint32_t *)buf + 4*1, r1);
+ vst1q_u32((uint32_t *)buf + 4*2, r2);
+ vst1q_u32((uint32_t *)buf + 4*3, r3);
+ memcpy(s, buf, n);
+
+ break;
+ }
+
vst1q_u32((uint32_t *)s + 4*0, r0);
vst1q_u32((uint32_t *)s + 4*1, r1);
vst1q_u32((uint32_t *)s + 4*2, r2);
vst1q_u32((uint32_t *)s + 4*3, r3);
in3 = vaddq_u32(in3, blkno_inc);
}
-
- if (n) {
- uint8_t buf[64];
-
- r0 = in0;
- r1 = in1;
- r2 = in2;
- r3 = in3;
- chacha_permute(&r0, &r1, &r2, &r3, nr);
- r0 = vhtole_u32(vaddq_u32(r0, in0));
- r1 = vhtole_u32(vaddq_u32(r1, in1));
- r2 = vhtole_u32(vaddq_u32(r2, in2));
- r3 = vhtole_u32(vaddq_u32(r3, in3));
- vst1q_u32((uint32_t *)buf + 4*0, r0);
- vst1q_u32((uint32_t *)buf + 4*1, r1);
- vst1q_u32((uint32_t *)buf + 4*2, r2);
- vst1q_u32((uint32_t *)buf + 4*3, r3);
-
- memcpy(s, buf, n);
- }
}
}
@@ -234,7 +227,7 @@
le32dec(nonce + 8)
};
- for (; n >= 64; s += 64, p += 64, n -= 64) {
+ for (; n; s += 64, p += 64, n -= 64) {
r0 = in0;
r1 = in1;
r2 = in2;
@@ -244,6 +237,25 @@
r1 = vhtole_u32(vaddq_u32(r1, in1));
r2 = vhtole_u32(vaddq_u32(r2, in2));
r3 = vhtole_u32(vaddq_u32(r3, in3));
+
+ if (n < 64) {
+ uint8_t buf[64] __aligned(16);
+ unsigned i;
+
+ vst1q_u32((uint32_t *)buf + 4*0, r0);
+ vst1q_u32((uint32_t *)buf + 4*1, r1);
+ vst1q_u32((uint32_t *)buf + 4*2, r2);
+ vst1q_u32((uint32_t *)buf + 4*3, r3);
+
+ for (i = 0; i < n - n%4; i += 4)
+ le32enc(s + i,
+ le32dec(p + i) ^ le32dec(buf + i));
+ for (; i < n; i++)
+ s[i] = p[i] ^ buf[i];
+
+ break;
+ }
+
r0 ^= vld1q_u32((const uint32_t *)p + 4*0);
r1 ^= vld1q_u32((const uint32_t *)p + 4*1);
r2 ^= vld1q_u32((const uint32_t *)p + 4*2);
@@ -254,31 +266,6 @@
vst1q_u32((uint32_t *)s + 4*3, r3);
in3 = vaddq_u32(in3, blkno_inc);
}
-
- if (n) {
- uint8_t buf[64];
- unsigned i;
-
- r0 = in0;
- r1 = in1;
- r2 = in2;
- r3 = in3;
- chacha_permute(&r0, &r1, &r2, &r3, nr);
- r0 = vhtole_u32(vaddq_u32(r0, in0));
- r1 = vhtole_u32(vaddq_u32(r1, in1));
- r2 = vhtole_u32(vaddq_u32(r2, in2));
- r3 = vhtole_u32(vaddq_u32(r3, in3));
- vst1q_u32((uint32_t *)buf + 4*0, r0);
- vst1q_u32((uint32_t *)buf + 4*1, r1);
- vst1q_u32((uint32_t *)buf + 4*2, r2);
- vst1q_u32((uint32_t *)buf + 4*3, r3);
-
- for (i = 0; i < n - n%4; i += 4)
- le32enc(s + i,
- le32dec(p + i) ^ le32dec(buf + i));
- for (; i < n; i++)
- s[i] = p[i] ^ buf[i];
- }
}
}
diff -r 185330b3c140 -r a285cc0013a5 sys/crypto/chacha/arch/x86/chacha_sse2.c
--- a/sys/crypto/chacha/arch/x86/chacha_sse2.c Mon Jul 27 20:46:17 2020 +0000
+++ b/sys/crypto/chacha/arch/x86/chacha_sse2.c Mon Jul 27 20:48:18 2020 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: chacha_sse2.c,v 1.1 2020/07/25 22:49:20 riastradh Exp $ */
+/* $NetBSD: chacha_sse2.c,v 1.2 2020/07/27 20:48:18 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -313,7 +313,7 @@
in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4),
le32dec(nonce), blkno);
- for (; n >= 64; s += 64, n -= 64) {
+ for (; n; s += 64, n -= 64) {
r0 = in0;
r1 = in1;
r2 = in2;
@@ -323,36 +323,25 @@
r1 = _mm_add_epi32(r1, in1);
r2 = _mm_add_epi32(r2, in2);
r3 = _mm_add_epi32(r3, in3);
+
+ if (n < 64) {
+ uint8_t buf[64] __aligned(16);
+
+ _mm_storeu_si128((__m128i *)buf + 0, r0);
+ _mm_storeu_si128((__m128i *)buf + 1, r1);
+ _mm_storeu_si128((__m128i *)buf + 2, r2);
+ _mm_storeu_si128((__m128i *)buf + 3, r3);
+ memcpy(s, buf, n);
+
+ break;
+ }
+
_mm_storeu_si128((__m128i *)s + 0, r0);
_mm_storeu_si128((__m128i *)s + 1, r1);
_mm_storeu_si128((__m128i *)s + 2, r2);
_mm_storeu_si128((__m128i *)s + 3, r3);
in3 = _mm_add_epi32(in3, blkno_inc);
}
-
- if (n) {
- uint8_t buf[64];
- unsigned i;
-
- r0 = in0;
- r1 = in1;
- r2 = in2;
- r3 = in3;
- chacha_permute(&r0, &r1, &r2, &r3, nr);
- r0 = _mm_add_epi32(r0, in0);
- r1 = _mm_add_epi32(r1, in1);
- r2 = _mm_add_epi32(r2, in2);
- r3 = _mm_add_epi32(r3, in3);
- _mm_storeu_si128((__m128i *)buf + 0, r0);
- _mm_storeu_si128((__m128i *)buf + 1, r1);
- _mm_storeu_si128((__m128i *)buf + 2, r2);
- _mm_storeu_si128((__m128i *)buf + 3, r3);
-
- for (i = 0; i < n - n%4; i += 4)
- le32enc(s + i, le32dec(buf + i));
- for (; i < n; i++)
- s[i] = buf[i];
- }
}
}
@@ -480,7 +469,7 @@
in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4),
le32dec(nonce), blkno);
- for (; n >= 64; s += 64, p += 64, n -= 64) {
+ for (; n; s += 64, p += 64, n -= 64) {
r0 = in0;
r1 = in1;
r2 = in2;
@@ -490,6 +479,25 @@
r1 = _mm_add_epi32(r1, in1);
r2 = _mm_add_epi32(r2, in2);
r3 = _mm_add_epi32(r3, in3);
+
+ if (n < 64) {
+ uint8_t buf[64] __aligned(16);
+ unsigned i;
+
+ _mm_storeu_si128((__m128i *)buf + 0, r0);
+ _mm_storeu_si128((__m128i *)buf + 1, r1);
+ _mm_storeu_si128((__m128i *)buf + 2, r2);
+ _mm_storeu_si128((__m128i *)buf + 3, r3);
+
+ for (i = 0; i < n - n%4; i += 4)
+ le32enc(s + i,
+ le32dec(p + i) ^ le32dec(buf + i));
+ for (; i < n; i++)
+ s[i] = p[i] ^ buf[i];
+
+ break;
+ }
+
r0 ^= _mm_loadu_si128((const __m128i *)p + 0);
r1 ^= _mm_loadu_si128((const __m128i *)p + 1);
r2 ^= _mm_loadu_si128((const __m128i *)p + 2);
@@ -500,31 +508,6 @@
_mm_storeu_si128((__m128i *)s + 3, r3);
in3 = _mm_add_epi32(in3, blkno_inc);
}
-
- if (n) {
- uint8_t buf[64];
- unsigned i;
-
- r0 = in0;
- r1 = in1;
- r2 = in2;
- r3 = in3;
- chacha_permute(&r0, &r1, &r2, &r3, nr);
- r0 = _mm_add_epi32(r0, in0);
- r1 = _mm_add_epi32(r1, in1);
- r2 = _mm_add_epi32(r2, in2);
- r3 = _mm_add_epi32(r3, in3);
- _mm_storeu_si128((__m128i *)buf + 0, r0);
- _mm_storeu_si128((__m128i *)buf + 1, r1);
- _mm_storeu_si128((__m128i *)buf + 2, r2);
- _mm_storeu_si128((__m128i *)buf + 3, r3);
-
- for (i = 0; i < n - n%4; i += 4)
- le32enc(s + i,
- le32dec(p + i) ^ le32dec(buf + i));
- for (; i < n; i++)
- s[i] = p[i] ^ buf[i];
- }
}
}
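For the xor variants, the new tail path spills the keystream registers to a 16-byte-aligned bounce buffer and XORs the plaintext against it a 32-bit word at a time, finishing the last n%4 bytes individually. A portable restatement of that idiom, where le32dec_/le32enc_ are stand-ins for NetBSD's le32dec/le32enc from <sys/endian.h>:

#include <stdint.h>

/* Portable stand-ins for NetBSD's le32dec/le32enc. */
static uint32_t
le32dec_(const void *p)
{
	const uint8_t *q = p;

	return (uint32_t)q[0] | (uint32_t)q[1] << 8 |
	    (uint32_t)q[2] << 16 | (uint32_t)q[3] << 24;
}

static void
le32enc_(void *p, uint32_t v)
{
	uint8_t *q = p;

	q[0] = (uint8_t)v;
	q[1] = (uint8_t)(v >> 8);
	q[2] = (uint8_t)(v >> 16);
	q[3] = (uint8_t)(v >> 24);
}

/*
 * XOR n (< 64) bytes of plaintext p with keystream buf into s:
 * whole 32-bit words first, then the remaining n%4 bytes.
 */
static void
xor_tail(uint8_t *s, const uint8_t *p, const uint8_t *buf, unsigned n)
{
	unsigned i;

	for (i = 0; i < n - n%4; i += 4)
		le32enc_(s + i, le32dec_(p + i) ^ le32dec_(buf + i));
	for (; i < n; i++)
		s[i] = p[i] ^ buf[i];
}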