Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/sys Implement ChaCha with SSE2 on x86 machines.
details: https://anonhg.NetBSD.org/src/rev/a9e32b980b04
branches: trunk
changeset: 1012202:a9e32b980b04
user: riastradh <riastradh%NetBSD.org@localhost>
date: Sat Jul 25 22:49:20 2020 +0000
description:
Implement ChaCha with SSE2 on x86 machines.
Slightly disappointed that it only doubles, rather than quadruples,
throughput on my Ivy Bridge laptop. Worth investigating.
diffstat:
sys/arch/x86/conf/files.x86 | 5 +-
sys/arch/x86/x86/identcpu.c | 12 +-
sys/crypto/chacha/arch/x86/chacha_sse2.c | 561 ++++++++++++++++++++++++++
sys/crypto/chacha/arch/x86/chacha_sse2.h | 69 +++
sys/crypto/chacha/arch/x86/chacha_sse2_impl.c | 153 +++++++
sys/crypto/chacha/arch/x86/files.chacha_x86 | 6 +
sys/crypto/chacha/arch/x86/immintrin.h | 351 ++++++++++++++++
7 files changed, 1154 insertions(+), 3 deletions(-)
diffs (truncated from 1223 to 300 lines):
diff -r 8c92e6090e9b -r a9e32b980b04 sys/arch/x86/conf/files.x86
--- a/sys/arch/x86/conf/files.x86 Sat Jul 25 22:47:16 2020 +0000
+++ b/sys/arch/x86/conf/files.x86 Sat Jul 25 22:49:20 2020 +0000
@@ -1,4 +1,4 @@
-# $NetBSD: files.x86,v 1.117 2020/07/14 00:45:53 yamaguchi Exp $
+# $NetBSD: files.x86,v 1.118 2020/07/25 22:49:20 riastradh Exp $
# options for MP configuration through the MP spec
defflag opt_mpbios.h MPBIOS MPDEBUG MPBIOS_SCANPCI
@@ -179,3 +179,6 @@
# Permutation-based AES with PSHUFB
include "crypto/aes/arch/x86/files.aesssse3"
+
+# ChaCha with SSE2
+include "crypto/chacha/arch/x86/files.chacha_x86"
diff -r 8c92e6090e9b -r a9e32b980b04 sys/arch/x86/x86/identcpu.c
--- a/sys/arch/x86/x86/identcpu.c Sat Jul 25 22:47:16 2020 +0000
+++ b/sys/arch/x86/x86/identcpu.c Sat Jul 25 22:49:20 2020 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: identcpu.c,v 1.115 2020/07/25 22:44:02 riastradh Exp $ */
+/* $NetBSD: identcpu.c,v 1.116 2020/07/25 22:49:20 riastradh Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2006, 2007, 2008 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: identcpu.c,v 1.115 2020/07/25 22:44:02 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: identcpu.c,v 1.116 2020/07/25 22:49:20 riastradh Exp $");
#include "opt_xen.h"
@@ -44,6 +44,8 @@
#include <crypto/aes/arch/x86/aes_sse2.h>
#include <crypto/aes/arch/x86/aes_ssse3.h>
#include <crypto/aes/arch/x86/aes_via.h>
+#include <crypto/chacha/chacha_impl.h>
+#include <crypto/chacha/arch/x86/chacha_sse2.h>
#include <uvm/uvm_extern.h>
@@ -1001,6 +1003,8 @@
/* Early patch of text segment. */
x86_patch(true);
#endif
+
+ /* AES */
#ifdef __x86_64__ /* not yet implemented on i386 */
if (cpu_feature[1] & CPUID2_AES)
aes_md_init(&aes_ni_impl);
@@ -1014,6 +1018,10 @@
aes_md_init(&aes_ssse3_impl);
else if (i386_has_sse && i386_has_sse2)
aes_md_init(&aes_sse2_impl);
+
+ /* ChaCha */
+ if (i386_has_sse && i386_has_sse2)
+ chacha_md_init(&chacha_sse2_impl);
} else {
/*
* If not first. Warn about cpu_feature mismatch for
diff -r 8c92e6090e9b -r a9e32b980b04 sys/crypto/chacha/arch/x86/chacha_sse2.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/crypto/chacha/arch/x86/chacha_sse2.c Sat Jul 25 22:49:20 2020 +0000
@@ -0,0 +1,561 @@
+/* $NetBSD: chacha_sse2.c,v 1.1 2020/07/25 22:49:20 riastradh Exp $ */
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+#include <sys/endian.h>
+
+#include "immintrin.h"
+
+#include "chacha_sse2.h"
+
+static inline __m128i
+rol32(__m128i x, uint8_t n)
+{
+
+ return _mm_slli_epi32(x, n) | _mm_srli_epi32(x, 32 - n);
+}
+
+static inline void
+chacha_permute(__m128i *p0, __m128i *p1, __m128i *p2, __m128i *p3,
+ unsigned nr)
+{
+ __m128i r0, r1, r2, r3;
+ __m128i c0, c1, c2, c3;
+
+ r0 = *p0;
+ r1 = *p1;
+ r2 = *p2;
+ r3 = *p3;
+
+ for (; nr > 0; nr -= 2) {
+ r0 = _mm_add_epi32(r0, r1); r3 ^= r0; r3 = rol32(r3, 16);
+ r2 = _mm_add_epi32(r2, r3); r1 ^= r2; r1 = rol32(r1, 12);
+ r0 = _mm_add_epi32(r0, r1); r3 ^= r0; r3 = rol32(r3, 8);
+ r2 = _mm_add_epi32(r2, r3); r1 ^= r2; r1 = rol32(r1, 7);
+
+ c0 = r0;
+ c1 = _mm_shuffle_epi32(r1, 0x39);
+ c2 = _mm_shuffle_epi32(r2, 0x4e);
+ c3 = _mm_shuffle_epi32(r3, 0x93);
+
+ c0 = _mm_add_epi32(c0, c1); c3 ^= c0; c3 = rol32(c3, 16);
+ c2 = _mm_add_epi32(c2, c3); c1 ^= c2; c1 = rol32(c1, 12);
+ c0 = _mm_add_epi32(c0, c1); c3 ^= c0; c3 = rol32(c3, 8);
+ c2 = _mm_add_epi32(c2, c3); c1 ^= c2; c1 = rol32(c1, 7);
+
+ r0 = c0;
+ r1 = _mm_shuffle_epi32(c1, 0x93);
+ r2 = _mm_shuffle_epi32(c2, 0x4e);
+ r3 = _mm_shuffle_epi32(c3, 0x39);
+ }
+
+ *p0 = r0;
+ *p1 = r1;
+ *p2 = r2;
+ *p3 = r3;
+}
+
+void
+chacha_core_sse2(uint8_t out[restrict static 64],
+ const uint8_t in[static 16],
+ const uint8_t k[static 32],
+ const uint8_t c[static 16],
+ unsigned nr)
+{
+ __m128i in0, in1, in2, in3;
+ __m128i r0, r1, r2, r3;
+
+ r0 = in0 = _mm_loadu_si128((const __m128i *)c);
+ r1 = in1 = _mm_loadu_si128((const __m128i *)k);
+ r2 = in2 = _mm_loadu_si128((const __m128i *)k + 1);
+ r3 = in3 = _mm_loadu_si128((const __m128i *)in);
+
+ chacha_permute(&r0, &r1, &r2, &r3, nr);
+
+ _mm_storeu_si128((__m128i *)out + 0, _mm_add_epi32(r0, in0));
+ _mm_storeu_si128((__m128i *)out + 1, _mm_add_epi32(r1, in1));
+ _mm_storeu_si128((__m128i *)out + 2, _mm_add_epi32(r2, in2));
+ _mm_storeu_si128((__m128i *)out + 3, _mm_add_epi32(r3, in3));
+}
+
+void
+hchacha_sse2(uint8_t out[restrict static 32],
+ const uint8_t in[static 16],
+ const uint8_t k[static 32],
+ const uint8_t c[static 16],
+ unsigned nr)
+{
+ __m128i r0, r1, r2, r3;
+
+ r0 = _mm_loadu_si128((const __m128i *)c);
+ r1 = _mm_loadu_si128((const __m128i *)k);
+ r2 = _mm_loadu_si128((const __m128i *)k + 1);
+ r3 = _mm_loadu_si128((const __m128i *)in);
+
+ chacha_permute(&r0, &r1, &r2, &r3, nr);
+
+ _mm_storeu_si128((__m128i *)out + 0, r0);
+ _mm_storeu_si128((__m128i *)out + 1, r3);
+}
+
+#define CHACHA_QUARTERROUND(a, b, c, d) do \
+{ \
+ (a) = _mm_add_epi32((a), (b)); (d) ^= a; (d) = rol32((d), 16); \
+ (c) = _mm_add_epi32((c), (d)); (b) ^= c; (b) = rol32((b), 12); \
+ (a) = _mm_add_epi32((a), (b)); (d) ^= a; (d) = rol32((d), 8); \
+ (c) = _mm_add_epi32((c), (d)); (b) ^= c; (b) = rol32((b), 7); \
+} while (/*CONSTCOND*/0)
+
+static inline __m128i
+load1_epi32(const void *p)
+{
+ return (__m128i)_mm_load1_ps(p);
+}
+
+static inline __m128i
+loadu_epi32(const void *p)
+{
+ return _mm_loadu_si128(p);
+}
+
+static inline void
+storeu_epi32(void *p, __m128i v)
+{
+ return _mm_storeu_si128(p, v);
+}
+
+static inline __m128i
+unpack0_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+ __m128 lo = (__m128)_mm_unpacklo_epi32(a, b); /* (a[0], b[0], ...) */
+ __m128 hi = (__m128)_mm_unpacklo_epi32(c, d); /* (c[0], d[0], ...) */
+
+ /* (lo[0]=a[0], lo[1]=b[0], hi[0]=c[0], hi[1]=d[0]) */
+ return (__m128i)_mm_movelh_ps(lo, hi);
+}
+
+static inline __m128i
+unpack1_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+ __m128 lo = (__m128)_mm_unpacklo_epi32(a, b); /* (..., a[1], b[1]) */
+ __m128 hi = (__m128)_mm_unpacklo_epi32(c, d); /* (..., c[1], d[1]) */
+
+ /* (lo[2]=a[1], lo[3]=b[1], hi[2]=c[1], hi[3]=d[1]) */
+ return (__m128i)_mm_movehl_ps(hi, lo);
+}
+
+static inline __m128i
+unpack2_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+ __m128 lo = (__m128)_mm_unpackhi_epi32(a, b); /* (a[2], b[2], ...) */
+ __m128 hi = (__m128)_mm_unpackhi_epi32(c, d); /* (c[2], d[2], ...) */
+
+ /* (lo[0]=a[2], lo[1]=b[2], hi[0]=c[2], hi[1]=d[2]) */
+ return (__m128i)_mm_movelh_ps(lo, hi);
+}
+
+static inline __m128i
+unpack3_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+ __m128 lo = (__m128)_mm_unpackhi_epi32(a, b); /* (..., a[3], b[3]) */
+ __m128 hi = (__m128)_mm_unpackhi_epi32(c, d); /* (..., c[3], d[3]) */
+
+ /* (lo[2]=a[3], lo[3]=b[3], hi[2]=c[3], hi[3]=d[3]) */
+ return (__m128i)_mm_movehl_ps(hi, lo);
+}
+
+void
+chacha_stream_sse2(uint8_t *restrict s, size_t n,
+ uint32_t blkno,
+ const uint8_t nonce[static 12],
+ const uint8_t k[static 32],
+ unsigned nr)
+{
+ __m128i x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
+ __m128i y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15;
+ __m128i z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15;
+ unsigned r;
+
+ if (n < 256)
+ goto out;
+
+ x0 = load1_epi32(chacha_const32 + 0);
+ x1 = load1_epi32(chacha_const32 + 4);
+ x2 = load1_epi32(chacha_const32 + 8);
+ x3 = load1_epi32(chacha_const32 + 12);
+ x4 = load1_epi32(k + 0);
+ x5 = load1_epi32(k + 4);
+ x6 = load1_epi32(k + 8);
+ x7 = load1_epi32(k + 12);
+ x8 = load1_epi32(k + 16);
+ x9 = load1_epi32(k + 20);
+ x10 = load1_epi32(k + 24);
+ x11 = load1_epi32(k + 28);
+ /* x12 set in the loop */
+ x13 = load1_epi32(nonce + 0);
+ x14 = load1_epi32(nonce + 4);
+ x15 = load1_epi32(nonce + 8);
+
+ for (; n >= 256; s += 256, n -= 256, blkno += 4) {
+ x12 = _mm_add_epi32(_mm_set1_epi32(blkno),
+ _mm_set_epi32(3,2,1,0));
+ y0 = x0;
+ y1 = x1;
+ y2 = x2;
+ y3 = x3;
+ y4 = x4;
+ y5 = x5;
+ y6 = x6;
Home | Main Index | Thread Index | Old Index