Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/sys/crypto/chacha/arch/arm Take advantage of REV32 and TBL for 16-bit and 8-bit rotations.



details:   https://anonhg.NetBSD.org/src/rev/5d70ec4d5d70
branches:  trunk
changeset: 1012349:5d70ec4d5d70
user:      riastradh <riastradh%NetBSD.org@localhost>
date:      Mon Jul 27 20:58:06 2020 +0000

description:
Take advantage of REV32 and TBL for 16-bit and 8-bit rotations.

However, disable use of (V)TBL on armv7/aarch32 for now, because for
some reason GCC spills things to the stack despite having plenty of
free registers, which hurts performance more than it helps, at least
on ARM Cortex-A8.

diffstat:

 sys/crypto/chacha/arch/arm/arm_neon.h    |  99 +++++++++++++++++++++++++++++++-
 sys/crypto/chacha/arch/arm/chacha_neon.c |  81 +++++++++++++++++++++++--
 2 files changed, 170 insertions(+), 10 deletions(-)

diffs (264 lines):

diff -r 2efe1b61b1aa -r 5d70ec4d5d70 sys/crypto/chacha/arch/arm/arm_neon.h
--- a/sys/crypto/chacha/arch/arm/arm_neon.h     Mon Jul 27 20:57:23 2020 +0000
+++ b/sys/crypto/chacha/arch/arm/arm_neon.h     Mon Jul 27 20:58:06 2020 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: arm_neon.h,v 1.1 2020/07/25 22:51:57 riastradh Exp $   */
+/*     $NetBSD: arm_neon.h,v 1.2 2020/07/27 20:58:06 riastradh Exp $   */
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -39,6 +39,7 @@
 typedef __Int32x4_t int32x4_t;
 typedef __Int64x2_t int64x2_t;
 typedef __Int8x16_t int8x16_t;
+typedef __Uint16x8_t uint16x8_t;
 typedef __Uint32x4_t uint32x4_t;
 typedef __Uint64x2_t uint64x2_t;
 typedef __Uint8x16_t uint8x16_t;
@@ -46,6 +47,7 @@
 typedef __simd128_int32_t int32x4_t;
 typedef __simd128_int64_t int64x2_t;
 typedef __simd128_int8_t int8x16_t;
+typedef __simd128_uint16_t uint16x8_t;
 typedef __simd128_uint32_t uint32x4_t;
 typedef __simd128_uint64_t uint64x2_t;
 typedef __simd128_uint8_t uint8x16_t;
@@ -70,9 +72,11 @@
 typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
 typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
 typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
+
 typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
 typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
 typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
+typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;
 
 typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
 typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
@@ -330,6 +334,27 @@
 }
 
 _INTRINSATTR
+static __inline uint16x8_t
+vreinterpretq_u16_u32(uint32x4_t __v)
+{
+       return (uint16x8_t)__v;
+}
+
+_INTRINSATTR
+static __inline uint32x4_t
+vreinterpretq_u32_u16(uint16x8_t __v)
+{
+       return (uint32x4_t)__v;
+}
+
+_INTRINSATTR
+static __inline uint32x4_t
+vreinterpretq_u32_u64(uint64x2_t __v)
+{
+       return (uint32x4_t)__v;
+}
+
+_INTRINSATTR
 static __inline uint32x4_t
 vreinterpretq_u32_u8(uint8x16_t __v)
 {
@@ -338,6 +363,13 @@
 
 _INTRINSATTR
 static __inline uint64x2_t
+vreinterpretq_u64_u32(uint32x4_t __v)
+{
+       return (uint64x2_t)__v;
+}
+
+_INTRINSATTR
+static __inline uint64x2_t
 vreinterpretq_u64_u8(uint8x16_t __v)
 {
        return (uint64x2_t)__v;
@@ -365,6 +397,17 @@
 }
 
 _INTRINSATTR
+static __inline uint16x8_t
+vrev32q_u16(uint16x8_t __v)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+       return __builtin_shuffle(__v, (uint16x8_t) { 1,0, 3,2, 5,4, 7,6 });
+#elif defined(__clang__)
+       return __builtin_shufflevector(__v,  1,0, 3,2, 5,4, 7,6);
+#endif
+}
+
+_INTRINSATTR
 static __inline uint8x16_t
 vrev32q_u8(uint8x16_t __v)
 {
@@ -531,4 +574,58 @@
 #endif
 }
 
+#ifndef __aarch64__            /* XXX */
+
+_INTRINSATTR
+static __inline uint8x8_t
+vtbl1_u8(uint8x8_t __tab, uint8x8_t __idx)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+       return (uint8x8_t)__builtin_neon_vtbl1v8qi((int8x8_t)__tab,
+           (int8x8_t)__idx);
+#elif defined(__clang__)
+       uint8x8_t __ret;
+#ifndef __LITTLE_ENDIAN__
+       __tab = __builtin_shufflevector(__tab, __tab, 7,6,5,4,3,2,1,0);
+       __idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0);
+#endif
+       __ret = (uint8x8_t)__builtin_neon_vtbl1_v((int8x8_t)__tab,
+           (int8x8_t)__idx, 16);
+#ifndef __LITTLE_ENDIAN__
+       __ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0);
+#endif
+       return __ret;
+#endif
+}
+
+_INTRINSATTR
+static __inline uint8x8_t
+vtbl2_u8(uint8x8x2_t __tab, uint8x8_t __idx)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+       union {
+               uint8x8x2_t __u8x8x82;
+               __builtin_neon_ti __ti;
+       } __u = { __tab };
+       return (uint8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, (int8x8_t)__idx);
+#elif defined(__clang__)
+       uint8x8_t __ret;
+#ifndef __LITTLE_ENDIAN__
+       __tab.val[0] = __builtin_shufflevector(__tab.val[0], __tab.val[0],
+           7,6,5,4,3,2,1,0);
+       __tab.val[1] = __builtin_shufflevector(__tab.val[1], __tab.val[1],
+           7,6,5,4,3,2,1,0);
+       __idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0);
+#endif
+       __ret = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tab.val[0],
+           (int8x8_t)__tab.val[1], (int8x8_t)__idx, 16);
+#ifndef __LITTLE_ENDIAN__
+       __ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0);
+#endif
+       return __ret;
+#endif
+}
+
+#endif /* !defined(__aarch64__) */
+
 #endif /* _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H */
diff -r 2efe1b61b1aa -r 5d70ec4d5d70 sys/crypto/chacha/arch/arm/chacha_neon.c
--- a/sys/crypto/chacha/arch/arm/chacha_neon.c  Mon Jul 27 20:57:23 2020 +0000
+++ b/sys/crypto/chacha/arch/arm/chacha_neon.c  Mon Jul 27 20:58:06 2020 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: chacha_neon.c,v 1.3 2020/07/27 20:51:29 riastradh Exp $        */
+/*     $NetBSD: chacha_neon.c,v 1.4 2020/07/27 20:58:06 riastradh Exp $        */
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -59,6 +59,69 @@
 #endif
 }
 
+static inline uint32x4_t
+rol16(uint32x4_t x)
+{
+       uint16x8_t y16, x16 = vreinterpretq_u16_u32(x);
+
+       y16 = vrev32q_u16(x16);
+
+       return vreinterpretq_u32_u16(y16);
+}
+
+static inline uint32x4_t
+rol12(uint32x4_t x)
+{
+
+       return vrolq_n_u32(x, 12);
+}
+
+static inline uint32x4_t
+rol8(uint32x4_t x)
+{
+#if defined(__aarch64__)
+       static const uint8x16_t rol8_tab = {
+                 3, 0, 1, 2,  7, 4, 5, 6,
+                11, 8, 9,10, 15,12,13,14,
+       };
+       uint8x16_t y8, x8 = vreinterpretq_u8_u32(x);
+
+       y8 = vqtbl1q_u8(x8, rol8_tab);
+
+       return vreinterpretq_u32_u8(y8);
+#elif 0
+       /*
+        * GCC does a lousy job with this, spilling two 64-bit vector
+        * registers to the stack every time.  There should be plenty
+        * of vector registers free, requiring no spills at all, and
+        * GCC should be able to hoist the load of rol8_tab out of any
+        * loops, but it doesn't and so attempting to use VTBL hurts
+        * more than it helps.
+        */
+       static const uint8x8_t rol8_tab = {
+                3, 0, 1, 2,  7, 4, 5, 6,
+       };
+
+       uint64x2_t y64, x64 = vreinterpretq_u64_u32(x);
+
+       y64 = (uint64x2_t) {
+               (uint64_t)vtbl1_u8((uint8x8_t)x64[0], rol8_tab),
+               (uint64_t)vtbl1_u8((uint8x8_t)x64[1], rol8_tab),
+       };
+
+       return vreinterpretq_u32_u64(y64);
+#else
+       return vrolq_n_u32(x, 8);
+#endif
+}
+
+static inline uint32x4_t
+rol7(uint32x4_t x)
+{
+
+       return vrolq_n_u32(x, 7);
+}
+
 static inline void
 chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3,
     unsigned nr)
@@ -72,20 +135,20 @@
        r3 = *p3;
 
        for (; nr > 0; nr -= 2) {
-               r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = vrolq_n_u32(r3, 16);
-               r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = vrolq_n_u32(r1, 12);
-               r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = vrolq_n_u32(r3, 8);
-               r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = vrolq_n_u32(r1, 7);
+               r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol16(r3);
+               r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol12(r1);
+               r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol8(r3);
+               r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol7(r1);
 
                c0 = r0;
                c1 = vextq_u32(r1, r1, 1);
                c2 = vextq_u32(r2, r2, 2);
                c3 = vextq_u32(r3, r3, 3);
 
-               c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = vrolq_n_u32(c3, 16);
-               c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = vrolq_n_u32(c1, 12);
-               c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = vrolq_n_u32(c3, 8);
-               c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = vrolq_n_u32(c1, 7);
+               c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol16(c3);
+               c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol12(c1);
+               c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol8(c3);
+               c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol7(c1);
 
                r0 = c0;
                r1 = vextq_u32(c1, c1, 3);



Home | Main Index | Thread Index | Old Index