Source-Changes-HG archive
[src/trunk]: src/sys Implement ChaCha with SSE2 on x86 machines.
details: https://anonhg.NetBSD.org/src/rev/48da887bfd5c
branches: trunk
changeset: 974167:48da887bfd5c
user: riastradh <riastradh%NetBSD.org@localhost>
date: Sat Jul 25 22:49:20 2020 +0000
description:
Implement ChaCha with SSE2 on x86 machines.
Slightly disappointed that it only doubles, rather than quadruples,
throughput on my Ivy Bridge laptop. Worth investigating.
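For context on the hoped-for 4x: chacha_stream_sse2 (in the diff below) runs four 64-byte blocks at a time, keeping the same state word from each of the four blocks in the four 32-bit lanes of a single XMM register. Here is a minimal sketch of one quarterround in that layout, written with the standard SSE2 intrinsic names; the committed code instead goes through a local immintrin.h wrapper that lets | and ^ operate directly on __m128i, so the helper names here are illustrative only, not the in-tree ones.

#include <emmintrin.h>	/* SSE2 intrinsics */

/*
 * Rotate each 32-bit lane left by the constant n.  SSE2 has no vector
 * rotate instruction, so every rotate is two shifts and an OR.
 */
#define ROL32(x, n) \
	_mm_or_si128(_mm_slli_epi32((x), (n)), _mm_srli_epi32((x), 32 - (n)))

/*
 * One ChaCha quarterround over four independent blocks: each of a, b,
 * c, d holds the same state word of four different blocks, one block
 * per 32-bit lane.
 */
static inline void
quarterround_x4(__m128i *a, __m128i *b, __m128i *c, __m128i *d)
{

	*a = _mm_add_epi32(*a, *b); *d = _mm_xor_si128(*d, *a); *d = ROL32(*d, 16);
	*c = _mm_add_epi32(*c, *d); *b = _mm_xor_si128(*b, *c); *b = ROL32(*b, 12);
	*a = _mm_add_epi32(*a, *b); *d = _mm_xor_si128(*d, *a); *d = ROL32(*d, 8);
	*c = _mm_add_epi32(*c, *d); *b = _mm_xor_si128(*b, *c); *b = ROL32(*b, 7);
}

Each rotate therefore costs three operations, and the four-block path also pays to transpose the interleaved state back into per-block byte order on output (the unpack*_epi32 helpers below); those overheads may partly account for the gain landing nearer 2x than 4x, though that remains to be measured.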
diffstat:
sys/arch/x86/conf/files.x86 | 5 +-
sys/arch/x86/x86/identcpu.c | 12 +-
sys/crypto/chacha/arch/x86/chacha_sse2.c | 561 ++++++++++++++++++++++++++
sys/crypto/chacha/arch/x86/chacha_sse2.h | 69 +++
sys/crypto/chacha/arch/x86/chacha_sse2_impl.c | 153 +++++++
sys/crypto/chacha/arch/x86/files.chacha_x86 | 6 +
sys/crypto/chacha/arch/x86/immintrin.h | 351 ++++++++++++++++
7 files changed, 1154 insertions(+), 3 deletions(-)
diffs (truncated from 1223 to 300 lines):
diff -r 24a8ffb32b57 -r 48da887bfd5c sys/arch/x86/conf/files.x86
--- a/sys/arch/x86/conf/files.x86 Sat Jul 25 22:47:16 2020 +0000
+++ b/sys/arch/x86/conf/files.x86 Sat Jul 25 22:49:20 2020 +0000
@@ -1,4 +1,4 @@
-# $NetBSD: files.x86,v 1.117 2020/07/14 00:45:53 yamaguchi Exp $
+# $NetBSD: files.x86,v 1.118 2020/07/25 22:49:20 riastradh Exp $
# options for MP configuration through the MP spec
defflag opt_mpbios.h MPBIOS MPDEBUG MPBIOS_SCANPCI
@@ -179,3 +179,6 @@
# Permutation-based AES with PSHUFB
include "crypto/aes/arch/x86/files.aesssse3"
+
+# ChaCha with SSE2
+include "crypto/chacha/arch/x86/files.chacha_x86"
diff -r 24a8ffb32b57 -r 48da887bfd5c sys/arch/x86/x86/identcpu.c
--- a/sys/arch/x86/x86/identcpu.c Sat Jul 25 22:47:16 2020 +0000
+++ b/sys/arch/x86/x86/identcpu.c Sat Jul 25 22:49:20 2020 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: identcpu.c,v 1.115 2020/07/25 22:44:02 riastradh Exp $ */
+/* $NetBSD: identcpu.c,v 1.116 2020/07/25 22:49:20 riastradh Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2006, 2007, 2008 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: identcpu.c,v 1.115 2020/07/25 22:44:02 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: identcpu.c,v 1.116 2020/07/25 22:49:20 riastradh Exp $");
#include "opt_xen.h"
@@ -44,6 +44,8 @@
#include <crypto/aes/arch/x86/aes_sse2.h>
#include <crypto/aes/arch/x86/aes_ssse3.h>
#include <crypto/aes/arch/x86/aes_via.h>
+#include <crypto/chacha/chacha_impl.h>
+#include <crypto/chacha/arch/x86/chacha_sse2.h>
#include <uvm/uvm_extern.h>
@@ -1001,6 +1003,8 @@
/* Early patch of text segment. */
x86_patch(true);
#endif
+
+ /* AES */
#ifdef __x86_64__ /* not yet implemented on i386 */
if (cpu_feature[1] & CPUID2_AES)
aes_md_init(&aes_ni_impl);
@@ -1014,6 +1018,10 @@
aes_md_init(&aes_ssse3_impl);
else if (i386_has_sse && i386_has_sse2)
aes_md_init(&aes_sse2_impl);
+
+ /* ChaCha */
+ if (i386_has_sse && i386_has_sse2)
+ chacha_md_init(&chacha_sse2_impl);
} else {
/*
* If not first. Warn about cpu_feature mismatch for
diff -r 24a8ffb32b57 -r 48da887bfd5c sys/crypto/chacha/arch/x86/chacha_sse2.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/crypto/chacha/arch/x86/chacha_sse2.c Sat Jul 25 22:49:20 2020 +0000
@@ -0,0 +1,561 @@
+/* $NetBSD: chacha_sse2.c,v 1.1 2020/07/25 22:49:20 riastradh Exp $ */
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+#include <sys/endian.h>
+
+#include "immintrin.h"
+
+#include "chacha_sse2.h"
+
+static inline __m128i
+rol32(__m128i x, uint8_t n)
+{
+
+ return _mm_slli_epi32(x, n) | _mm_srli_epi32(x, 32 - n);
+}
+
+static inline void
+chacha_permute(__m128i *p0, __m128i *p1, __m128i *p2, __m128i *p3,
+ unsigned nr)
+{
+ __m128i r0, r1, r2, r3;
+ __m128i c0, c1, c2, c3;
+
+ r0 = *p0;
+ r1 = *p1;
+ r2 = *p2;
+ r3 = *p3;
+
+ for (; nr > 0; nr -= 2) {
+ r0 = _mm_add_epi32(r0, r1); r3 ^= r0; r3 = rol32(r3, 16);
+ r2 = _mm_add_epi32(r2, r3); r1 ^= r2; r1 = rol32(r1, 12);
+ r0 = _mm_add_epi32(r0, r1); r3 ^= r0; r3 = rol32(r3, 8);
+ r2 = _mm_add_epi32(r2, r3); r1 ^= r2; r1 = rol32(r1, 7);
+
+ c0 = r0;
+ c1 = _mm_shuffle_epi32(r1, 0x39);
+ c2 = _mm_shuffle_epi32(r2, 0x4e);
+ c3 = _mm_shuffle_epi32(r3, 0x93);
+
+ c0 = _mm_add_epi32(c0, c1); c3 ^= c0; c3 = rol32(c3, 16);
+ c2 = _mm_add_epi32(c2, c3); c1 ^= c2; c1 = rol32(c1, 12);
+ c0 = _mm_add_epi32(c0, c1); c3 ^= c0; c3 = rol32(c3, 8);
+ c2 = _mm_add_epi32(c2, c3); c1 ^= c2; c1 = rol32(c1, 7);
+
+ r0 = c0;
+ r1 = _mm_shuffle_epi32(c1, 0x93);
+ r2 = _mm_shuffle_epi32(c2, 0x4e);
+ r3 = _mm_shuffle_epi32(c3, 0x39);
+ }
+
+ *p0 = r0;
+ *p1 = r1;
+ *p2 = r2;
+ *p3 = r3;
+}
+
+void
+chacha_core_sse2(uint8_t out[restrict static 64],
+ const uint8_t in[static 16],
+ const uint8_t k[static 32],
+ const uint8_t c[static 16],
+ unsigned nr)
+{
+ __m128i in0, in1, in2, in3;
+ __m128i r0, r1, r2, r3;
+
+ r0 = in0 = _mm_loadu_si128((const __m128i *)c);
+ r1 = in1 = _mm_loadu_si128((const __m128i *)k);
+ r2 = in2 = _mm_loadu_si128((const __m128i *)k + 1);
+ r3 = in3 = _mm_loadu_si128((const __m128i *)in);
+
+ chacha_permute(&r0, &r1, &r2, &r3, nr);
+
+ _mm_storeu_si128((__m128i *)out + 0, _mm_add_epi32(r0, in0));
+ _mm_storeu_si128((__m128i *)out + 1, _mm_add_epi32(r1, in1));
+ _mm_storeu_si128((__m128i *)out + 2, _mm_add_epi32(r2, in2));
+ _mm_storeu_si128((__m128i *)out + 3, _mm_add_epi32(r3, in3));
+}
+
+void
+hchacha_sse2(uint8_t out[restrict static 32],
+ const uint8_t in[static 16],
+ const uint8_t k[static 32],
+ const uint8_t c[static 16],
+ unsigned nr)
+{
+ __m128i r0, r1, r2, r3;
+
+ r0 = _mm_loadu_si128((const __m128i *)c);
+ r1 = _mm_loadu_si128((const __m128i *)k);
+ r2 = _mm_loadu_si128((const __m128i *)k + 1);
+ r3 = _mm_loadu_si128((const __m128i *)in);
+
+ chacha_permute(&r0, &r1, &r2, &r3, nr);
+
+ _mm_storeu_si128((__m128i *)out + 0, r0);
+ _mm_storeu_si128((__m128i *)out + 1, r3);
+}
+
+#define CHACHA_QUARTERROUND(a, b, c, d) do \
+{ \
+ (a) = _mm_add_epi32((a), (b)); (d) ^= a; (d) = rol32((d), 16); \
+ (c) = _mm_add_epi32((c), (d)); (b) ^= c; (b) = rol32((b), 12); \
+ (a) = _mm_add_epi32((a), (b)); (d) ^= a; (d) = rol32((d), 8); \
+ (c) = _mm_add_epi32((c), (d)); (b) ^= c; (b) = rol32((b), 7); \
+} while (/*CONSTCOND*/0)
+
+static inline __m128i
+load1_epi32(const void *p)
+{
+ return (__m128i)_mm_load1_ps(p);
+}
+
+static inline __m128i
+loadu_epi32(const void *p)
+{
+ return _mm_loadu_si128(p);
+}
+
+static inline void
+storeu_epi32(void *p, __m128i v)
+{
+ return _mm_storeu_si128(p, v);
+}
+
+static inline __m128i
+unpack0_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+ __m128 lo = (__m128)_mm_unpacklo_epi32(a, b); /* (a[0], b[0], ...) */
+ __m128 hi = (__m128)_mm_unpacklo_epi32(c, d); /* (c[0], d[0], ...) */
+
+ /* (lo[0]=a[0], lo[1]=b[0], hi[0]=c[0], hi[1]=d[0]) */
+ return (__m128i)_mm_movelh_ps(lo, hi);
+}
+
+static inline __m128i
+unpack1_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+ __m128 lo = (__m128)_mm_unpacklo_epi32(a, b); /* (..., a[1], b[1]) */
+ __m128 hi = (__m128)_mm_unpacklo_epi32(c, d); /* (..., c[1], d[1]) */
+
+ /* (lo[2]=a[1], lo[3]=b[1], hi[2]=c[1], hi[3]=d[1]) */
+ return (__m128i)_mm_movehl_ps(hi, lo);
+}
+
+static inline __m128i
+unpack2_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+ __m128 lo = (__m128)_mm_unpackhi_epi32(a, b); /* (a[2], b[2], ...) */
+ __m128 hi = (__m128)_mm_unpackhi_epi32(c, d); /* (c[2], d[2], ...) */
+
+ /* (lo[0]=a[2], lo[1]=b[2], hi[0]=c[2], hi[1]=d[2]) */
+ return (__m128i)_mm_movelh_ps(lo, hi);
+}
+
+static inline __m128i
+unpack3_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+ __m128 lo = (__m128)_mm_unpackhi_epi32(a, b); /* (..., a[3], b[3]) */
+ __m128 hi = (__m128)_mm_unpackhi_epi32(c, d); /* (..., c[3], d[3]) */
+
+ /* (lo[2]=a[3], lo[3]=b[3], hi[2]=c[3], hi[3]=d[3]) */
+ return (__m128i)_mm_movehl_ps(hi, lo);
+}
+
+void
+chacha_stream_sse2(uint8_t *restrict s, size_t n,
+ uint32_t blkno,
+ const uint8_t nonce[static 12],
+ const uint8_t k[static 32],
+ unsigned nr)
+{
+ __m128i x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
+ __m128i y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15;
+ __m128i z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15;
+ unsigned r;
+
+ if (n < 256)
+ goto out;
+
+ x0 = load1_epi32(chacha_const32 + 0);
+ x1 = load1_epi32(chacha_const32 + 4);
+ x2 = load1_epi32(chacha_const32 + 8);
+ x3 = load1_epi32(chacha_const32 + 12);
+ x4 = load1_epi32(k + 0);
+ x5 = load1_epi32(k + 4);
+ x6 = load1_epi32(k + 8);
+ x7 = load1_epi32(k + 12);
+ x8 = load1_epi32(k + 16);
+ x9 = load1_epi32(k + 20);
+ x10 = load1_epi32(k + 24);
+ x11 = load1_epi32(k + 28);
+ /* x12 set in the loop */
+ x13 = load1_epi32(nonce + 0);
+ x14 = load1_epi32(nonce + 4);
+ x15 = load1_epi32(nonce + 8);
+
+ for (; n >= 256; s += 256, n -= 256, blkno += 4) {
+ x12 = _mm_add_epi32(_mm_set1_epi32(blkno),
+ _mm_set_epi32(3,2,1,0));
+ y0 = x0;
+ y1 = x1;
+ y2 = x2;
+ y3 = x3;
+ y4 = x4;
+ y5 = x5;
+ y6 = x6;