Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/sys/crypto/chacha/arch Reduce some duplication.



details:   https://anonhg.NetBSD.org/src/rev/7e0282052c11
branches:  trunk
changeset: 1012338:7e0282052c11
user:      riastradh <riastradh%NetBSD.org@localhost>
date:      Mon Jul 27 20:48:18 2020 +0000

description:
Reduce some duplication.

Shouldn't substantively hurt performance -- the comparison that has
been moved into the loop was essentially the former loop condition --
and may improve performance by reducing code size since there's only
one inline call to chacha_permute instead of two.

diffstat:

 sys/crypto/chacha/arch/arm/chacha_neon.c |  83 ++++++++++++-----------------
 sys/crypto/chacha/arch/x86/chacha_sse2.c |  87 ++++++++++++-------------------
 2 files changed, 70 insertions(+), 100 deletions(-)

diffs (266 lines):

diff -r 7defb5f81a8a -r 7e0282052c11 sys/crypto/chacha/arch/arm/chacha_neon.c
--- a/sys/crypto/chacha/arch/arm/chacha_neon.c  Mon Jul 27 20:46:17 2020 +0000
+++ b/sys/crypto/chacha/arch/arm/chacha_neon.c  Mon Jul 27 20:48:18 2020 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: chacha_neon.c,v 1.1 2020/07/25 22:51:57 riastradh Exp $        */
+/*     $NetBSD: chacha_neon.c,v 1.2 2020/07/27 20:48:18 riastradh Exp $        */
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -168,7 +168,7 @@
                        le32dec(nonce + 8)
                };
 
-               for (; n >= 64; s += 64, n -= 64) {
+               for (; n; s += 64, n -= 64) {
                        r0 = in0;
                        r1 = in1;
                        r2 = in2;
@@ -178,32 +178,25 @@
                        r1 = vhtole_u32(vaddq_u32(r1, in1));
                        r2 = vhtole_u32(vaddq_u32(r2, in2));
                        r3 = vhtole_u32(vaddq_u32(r3, in3));
+
+                       if (n < 64) {
+                               uint8_t buf[64] __aligned(16);
+
+                               vst1q_u32((uint32_t *)buf + 4*0, r0);
+                               vst1q_u32((uint32_t *)buf + 4*1, r1);
+                               vst1q_u32((uint32_t *)buf + 4*2, r2);
+                               vst1q_u32((uint32_t *)buf + 4*3, r3);
+                               memcpy(s, buf, n);
+
+                               break;
+                       }
+
                        vst1q_u32((uint32_t *)s + 4*0, r0);
                        vst1q_u32((uint32_t *)s + 4*1, r1);
                        vst1q_u32((uint32_t *)s + 4*2, r2);
                        vst1q_u32((uint32_t *)s + 4*3, r3);
                        in3 = vaddq_u32(in3, blkno_inc);
                }
-
-               if (n) {
-                       uint8_t buf[64];
-
-                       r0 = in0;
-                       r1 = in1;
-                       r2 = in2;
-                       r3 = in3;
-                       chacha_permute(&r0, &r1, &r2, &r3, nr);
-                       r0 = vhtole_u32(vaddq_u32(r0, in0));
-                       r1 = vhtole_u32(vaddq_u32(r1, in1));
-                       r2 = vhtole_u32(vaddq_u32(r2, in2));
-                       r3 = vhtole_u32(vaddq_u32(r3, in3));
-                       vst1q_u32((uint32_t *)buf + 4*0, r0);
-                       vst1q_u32((uint32_t *)buf + 4*1, r1);
-                       vst1q_u32((uint32_t *)buf + 4*2, r2);
-                       vst1q_u32((uint32_t *)buf + 4*3, r3);
-
-                       memcpy(s, buf, n);
-               }
        }
 }
 
@@ -234,7 +227,7 @@
                        le32dec(nonce + 8)
                };
 
-               for (; n >= 64; s += 64, p += 64, n -= 64) {
+               for (; n; s += 64, p += 64, n -= 64) {
                        r0 = in0;
                        r1 = in1;
                        r2 = in2;
@@ -244,6 +237,25 @@
                        r1 = vhtole_u32(vaddq_u32(r1, in1));
                        r2 = vhtole_u32(vaddq_u32(r2, in2));
                        r3 = vhtole_u32(vaddq_u32(r3, in3));
+
+                       if (n < 64) {
+                               uint8_t buf[64] __aligned(16);
+                               unsigned i;
+
+                               vst1q_u32((uint32_t *)buf + 4*0, r0);
+                               vst1q_u32((uint32_t *)buf + 4*1, r1);
+                               vst1q_u32((uint32_t *)buf + 4*2, r2);
+                               vst1q_u32((uint32_t *)buf + 4*3, r3);
+
+                               for (i = 0; i < n - n%4; i += 4)
+                                       le32enc(s + i,
+                                           le32dec(p + i) ^ le32dec(buf + i));
+                               for (; i < n; i++)
+                                       s[i] = p[i] ^ buf[i];
+
+                               break;
+                       }
+
                        r0 ^= vld1q_u32((const uint32_t *)p + 4*0);
                        r1 ^= vld1q_u32((const uint32_t *)p + 4*1);
                        r2 ^= vld1q_u32((const uint32_t *)p + 4*2);
@@ -254,31 +266,6 @@
                        vst1q_u32((uint32_t *)s + 4*3, r3);
                        in3 = vaddq_u32(in3, blkno_inc);
                }
-
-               if (n) {
-                       uint8_t buf[64];
-                       unsigned i;
-
-                       r0 = in0;
-                       r1 = in1;
-                       r2 = in2;
-                       r3 = in3;
-                       chacha_permute(&r0, &r1, &r2, &r3, nr);
-                       r0 = vhtole_u32(vaddq_u32(r0, in0));
-                       r1 = vhtole_u32(vaddq_u32(r1, in1));
-                       r2 = vhtole_u32(vaddq_u32(r2, in2));
-                       r3 = vhtole_u32(vaddq_u32(r3, in3));
-                       vst1q_u32((uint32_t *)buf + 4*0, r0);
-                       vst1q_u32((uint32_t *)buf + 4*1, r1);
-                       vst1q_u32((uint32_t *)buf + 4*2, r2);
-                       vst1q_u32((uint32_t *)buf + 4*3, r3);
-
-                       for (i = 0; i < n - n%4; i += 4)
-                               le32enc(s + i,
-                                   le32dec(p + i) ^ le32dec(buf + i));
-                       for (; i < n; i++)
-                               s[i] = p[i] ^ buf[i];
-               }
        }
 }
 
diff -r 7defb5f81a8a -r 7e0282052c11 sys/crypto/chacha/arch/x86/chacha_sse2.c
--- a/sys/crypto/chacha/arch/x86/chacha_sse2.c  Mon Jul 27 20:46:17 2020 +0000
+++ b/sys/crypto/chacha/arch/x86/chacha_sse2.c  Mon Jul 27 20:48:18 2020 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: chacha_sse2.c,v 1.1 2020/07/25 22:49:20 riastradh Exp $        */
+/*     $NetBSD: chacha_sse2.c,v 1.2 2020/07/27 20:48:18 riastradh Exp $        */
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -313,7 +313,7 @@
                in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4),
                    le32dec(nonce), blkno);
 
-               for (; n >= 64; s += 64, n -= 64) {
+               for (; n; s += 64, n -= 64) {
                        r0 = in0;
                        r1 = in1;
                        r2 = in2;
@@ -323,36 +323,25 @@
                        r1 = _mm_add_epi32(r1, in1);
                        r2 = _mm_add_epi32(r2, in2);
                        r3 = _mm_add_epi32(r3, in3);
+
+                       if (n < 64) {
+                               uint8_t buf[64] __aligned(16);
+
+                               _mm_storeu_si128((__m128i *)buf + 0, r0);
+                               _mm_storeu_si128((__m128i *)buf + 1, r1);
+                               _mm_storeu_si128((__m128i *)buf + 2, r2);
+                               _mm_storeu_si128((__m128i *)buf + 3, r3);
+                               memcpy(s, buf, n);
+
+                               break;
+                       }
+
                        _mm_storeu_si128((__m128i *)s + 0, r0);
                        _mm_storeu_si128((__m128i *)s + 1, r1);
                        _mm_storeu_si128((__m128i *)s + 2, r2);
                        _mm_storeu_si128((__m128i *)s + 3, r3);
                        in3 = _mm_add_epi32(in3, blkno_inc);
                }
-
-               if (n) {
-                       uint8_t buf[64];
-                       unsigned i;
-
-                       r0 = in0;
-                       r1 = in1;
-                       r2 = in2;
-                       r3 = in3;
-                       chacha_permute(&r0, &r1, &r2, &r3, nr);
-                       r0 = _mm_add_epi32(r0, in0);
-                       r1 = _mm_add_epi32(r1, in1);
-                       r2 = _mm_add_epi32(r2, in2);
-                       r3 = _mm_add_epi32(r3, in3);
-                       _mm_storeu_si128((__m128i *)buf + 0, r0);
-                       _mm_storeu_si128((__m128i *)buf + 1, r1);
-                       _mm_storeu_si128((__m128i *)buf + 2, r2);
-                       _mm_storeu_si128((__m128i *)buf + 3, r3);
-
-                       for (i = 0; i < n - n%4; i += 4)
-                               le32enc(s + i, le32dec(buf + i));
-                       for (; i < n; i++)
-                               s[i] = buf[i];
-               }
        }
 }
 
@@ -480,7 +469,7 @@
                in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4),
                    le32dec(nonce), blkno);
 
-               for (; n >= 64; s += 64, p += 64, n -= 64) {
+               for (; n; s += 64, p += 64, n -= 64) {
                        r0 = in0;
                        r1 = in1;
                        r2 = in2;
@@ -490,6 +479,25 @@
                        r1 = _mm_add_epi32(r1, in1);
                        r2 = _mm_add_epi32(r2, in2);
                        r3 = _mm_add_epi32(r3, in3);
+
+                       if (n < 64) {
+                               uint8_t buf[64] __aligned(16);
+                               unsigned i;
+
+                               _mm_storeu_si128((__m128i *)buf + 0, r0);
+                               _mm_storeu_si128((__m128i *)buf + 1, r1);
+                               _mm_storeu_si128((__m128i *)buf + 2, r2);
+                               _mm_storeu_si128((__m128i *)buf + 3, r3);
+
+                               for (i = 0; i < n - n%4; i += 4)
+                                       le32enc(s + i,
+                                           le32dec(p + i) ^ le32dec(buf + i));
+                               for (; i < n; i++)
+                                       s[i] = p[i] ^ buf[i];
+
+                               break;
+                       }
+
                        r0 ^= _mm_loadu_si128((const __m128i *)p + 0);
                        r1 ^= _mm_loadu_si128((const __m128i *)p + 1);
                        r2 ^= _mm_loadu_si128((const __m128i *)p + 2);
@@ -500,31 +508,6 @@
                        _mm_storeu_si128((__m128i *)s + 3, r3);
                        in3 = _mm_add_epi32(in3, blkno_inc);
                }
-
-               if (n) {
-                       uint8_t buf[64];
-                       unsigned i;
-
-                       r0 = in0;
-                       r1 = in1;
-                       r2 = in2;
-                       r3 = in3;
-                       chacha_permute(&r0, &r1, &r2, &r3, nr);
-                       r0 = _mm_add_epi32(r0, in0);
-                       r1 = _mm_add_epi32(r1, in1);
-                       r2 = _mm_add_epi32(r2, in2);
-                       r3 = _mm_add_epi32(r3, in3);
-                       _mm_storeu_si128((__m128i *)buf + 0, r0);
-                       _mm_storeu_si128((__m128i *)buf + 1, r1);
-                       _mm_storeu_si128((__m128i *)buf + 2, r2);
-                       _mm_storeu_si128((__m128i *)buf + 3, r3);
-
-                       for (i = 0; i < n - n%4; i += 4)
-                               le32enc(s + i,
-                                   le32dec(p + i) ^ le32dec(buf + i));
-                       for (; i < n; i++)
-                               s[i] = p[i] ^ buf[i];
-               }
        }
 }
 



Home | Main Index | Thread Index | Old Index