Source-Changes-HG archive


[src/trunk]: src/sys Implement ChaCha with NEON on ARM.



details:   https://anonhg.NetBSD.org/src/rev/1960b4c095f1
branches:  trunk
changeset: 974168:1960b4c095f1
user:      riastradh <riastradh@NetBSD.org>
date:      Sat Jul 25 22:51:57 2020 +0000

description:
Implement ChaCha with NEON on ARM.

XXX Needs performance measurement.
XXX Needs adaptation to arm32 neon which has half the registers.
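
For illustration, a minimal sketch of the vectorized quarter-round at
the heart of such an implementation, using standard NEON intrinsics
from <arm_neon.h>; chacha_qround and vrotlq_n_u32 are hypothetical
helper names for this sketch, not identifiers from chacha_neon.c:

#include <arm_neon.h>

/* Rotate each 32-bit lane left by the constant n (hypothetical helper). */
#define vrotlq_n_u32(x, n)                                                    \
	vorrq_u32(vshlq_n_u32((x), (n)), vshrq_n_u32((x), 32 - (n)))

/*
 * One ChaCha quarter-round over whole rows: a, b, c, d each hold four
 * 32-bit words, so a single call advances four columns in parallel.
 */
static inline void
chacha_qround(uint32x4_t *a, uint32x4_t *b, uint32x4_t *c, uint32x4_t *d)
{
	*a = vaddq_u32(*a, *b); *d = veorq_u32(*d, *a); *d = vrotlq_n_u32(*d, 16);
	*c = vaddq_u32(*c, *d); *b = veorq_u32(*b, *c); *b = vrotlq_n_u32(*b, 12);
	*a = vaddq_u32(*a, *b); *d = veorq_u32(*d, *a); *d = vrotlq_n_u32(*d, 8);
	*c = vaddq_u32(*c, *d); *b = veorq_u32(*b, *c); *b = vrotlq_n_u32(*b, 7);
}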

diffstat:

 sys/arch/aarch64/aarch64/cpu.c                |   26 +-
 sys/arch/aarch64/conf/files.aarch64           |    5 +-
 sys/crypto/chacha/arch/arm/arm_neon.h         |  534 ++++++++++++++++++++++++++
 sys/crypto/chacha/arch/arm/chacha_neon.c      |  315 +++++++++++++++
 sys/crypto/chacha/arch/arm/chacha_neon.h      |   83 ++++
 sys/crypto/chacha/arch/arm/chacha_neon_64.S   |  491 +++++++++++++++++++++++
 sys/crypto/chacha/arch/arm/chacha_neon_impl.c |  181 ++++++++
 sys/crypto/chacha/arch/arm/files.chacha_arm   |    9 +
 8 files changed, 1641 insertions(+), 3 deletions(-)

diffs (truncated from 1721 to 300 lines):

diff -r 48da887bfd5c -r 1960b4c095f1 sys/arch/aarch64/aarch64/cpu.c
--- a/sys/arch/aarch64/aarch64/cpu.c    Sat Jul 25 22:49:20 2020 +0000
+++ b/sys/arch/aarch64/aarch64/cpu.c    Sat Jul 25 22:51:57 2020 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: cpu.c,v 1.53 2020/07/25 22:12:56 riastradh Exp $ */
+/* $NetBSD: cpu.c,v 1.54 2020/07/25 22:51:57 riastradh Exp $ */
 
 /*
 * Copyright (c) 2017 Ryo Shimizu <ryo@nerv.org>
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: cpu.c,v 1.53 2020/07/25 22:12:56 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: cpu.c,v 1.54 2020/07/25 22:51:57 riastradh Exp $");
 
 #include "locators.h"
 #include "opt_arm_debug.h"
@@ -47,6 +47,8 @@
 #include <crypto/aes/aes_impl.h>
 #include <crypto/aes/arch/arm/aes_armv8.h>
 #include <crypto/aes/arch/arm/aes_neon.h>
+#include <crypto/chacha/chacha_impl.h>
+#include <crypto/chacha/arch/arm/chacha_neon.h>
 
 #include <aarch64/armreg.h>
 #include <aarch64/cpu.h>
@@ -75,6 +77,7 @@
 static void cpu_setup_sysctl(device_t, struct cpu_info *);
 static void cpu_setup_rng(device_t, struct cpu_info *);
 static void cpu_setup_aes(device_t, struct cpu_info *);
+static void cpu_setup_chacha(device_t, struct cpu_info *);
 
 #ifdef MULTIPROCESSOR
 #define NCPUINFO       MAXCPUS
@@ -164,6 +167,7 @@
        cpu_setup_sysctl(dv, ci);
        cpu_setup_rng(dv, ci);
        cpu_setup_aes(dv, ci);
+       cpu_setup_chacha(dv, ci);
 }
 
 struct cpuidtab {
@@ -633,6 +637,24 @@
        }
 }
 
+/*
+ * setup the ChaCha implementation
+ */
+static void
+cpu_setup_chacha(device_t dv, struct cpu_info *ci)
+{
+       struct aarch64_sysctl_cpu_id *id = &ci->ci_id;
+
+       /* Check for SIMD support.  */
+       switch (__SHIFTOUT(id->ac_aa64pfr0, ID_AA64PFR0_EL1_ADVSIMD)) {
+       case ID_AA64PFR0_EL1_ADV_SIMD_IMPL:
+               chacha_md_init(&chacha_neon_impl);
+               return;
+       default:
+               break;
+       }
+}
+
 #ifdef MULTIPROCESSOR
 void
 cpu_hatch(struct cpu_info *ci)
diff -r 48da887bfd5c -r 1960b4c095f1 sys/arch/aarch64/conf/files.aarch64
--- a/sys/arch/aarch64/conf/files.aarch64       Sat Jul 25 22:49:20 2020 +0000
+++ b/sys/arch/aarch64/conf/files.aarch64       Sat Jul 25 22:51:57 2020 +0000
@@ -1,4 +1,4 @@
-#      $NetBSD: files.aarch64,v 1.25 2020/07/17 07:16:10 ryo Exp $
+#      $NetBSD: files.aarch64,v 1.26 2020/07/25 22:51:57 riastradh Exp $
 
 defflag opt_cpuoptions.h       AARCH64_ALIGNMENT_CHECK
 defflag opt_cpuoptions.h       AARCH64_EL0_STACK_ALIGNMENT_CHECK
@@ -145,3 +145,6 @@
 
 # vpaes with ARM NEON
 include "crypto/aes/arch/arm/files.aesneon"
+
+# ChaCha with ARM NEON
+include "crypto/chacha/arch/arm/files.chacha_arm"
diff -r 48da887bfd5c -r 1960b4c095f1 sys/crypto/chacha/arch/arm/arm_neon.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/crypto/chacha/arch/arm/arm_neon.h     Sat Jul 25 22:51:57 2020 +0000
@@ -0,0 +1,534 @@
+/*     $NetBSD: arm_neon.h,v 1.1 2020/07/25 22:51:57 riastradh Exp $   */
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef        _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H
+#define        _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H
+
+#if defined(__GNUC__) && !defined(__clang__)
+
+#define        _INTRINSATTR                                                          \
+       __extension__                                                         \
+       __attribute__((__always_inline__, __gnu_inline__, __artificial__))
+
+#ifdef __aarch64__
+typedef __Int32x4_t int32x4_t;
+typedef __Int64x2_t int64x2_t;
+typedef __Int8x16_t int8x16_t;
+typedef __Uint32x4_t uint32x4_t;
+typedef __Uint64x2_t uint64x2_t;
+typedef __Uint8x16_t uint8x16_t;
+#else
+typedef __simd128_int32_t int32x4_t;
+typedef __simd128_int64_t int64x2_t;
+typedef __simd128_int8_t int8x16_t;
+typedef __simd128_uint32_t uint32x4_t;
+typedef __simd128_uint64_t uint64x2_t;
+typedef __simd128_uint8_t uint8x16_t;
+
+typedef __simd64_int8_t int8x8_t;
+typedef __simd64_uint8_t uint8x8_t;
+typedef __builtin_neon_udi uint64x1_t;
+typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
+#endif
+
+#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
+#define        __neon_lane_index(__v, __i)     (__arraycount(__v) - 1 - __i)
+#else
+#define        __neon_lane_index(__v, __i)     __i
+#endif
+
+#elif defined(__clang__)
+
+#define        _INTRINSATTR                                                          \
+       __attribute__((__always_inline__, __nodebug__))
+
+typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
+typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
+typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
+typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
+typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
+typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
+
+typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
+typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
+
+#ifdef __LITTLE_ENDIAN__
+#define        __neon_lane_index(__v, __i)     __i
+#else
+#define        __neon_lane_index(__v, __i)     (__arraycount(__v) - 1 - __i)
+#endif
+
+#else
+
+#error Teach me how to neon in your compile!
+
+#endif
+
+_INTRINSATTR
+static __inline uint32x4_t
+vaddq_u32(uint32x4_t __v0, uint32x4_t __v1)
+{
+       return __v0 + __v1;
+}
+
+_INTRINSATTR
+static __inline uint32x4_t
+vcltq_s32(int32x4_t __v0, int32x4_t __v1)
+{
+       return (uint32x4_t)(__v0 < __v1);
+}
+
+_INTRINSATTR
+static __inline int32x4_t
+vdupq_n_s32(int32_t __x)
+{
+       return (int32x4_t) { __x, __x, __x, __x };
+}
+
+_INTRINSATTR
+static __inline uint32x4_t
+vdupq_n_u32(uint32_t __x)
+{
+       return (uint32x4_t) { __x, __x, __x, __x };
+}
+
+_INTRINSATTR
+static __inline uint8x16_t
+vdupq_n_u8(uint8_t __x)
+{
+       return (uint8x16_t) {
+               __x, __x, __x, __x, __x, __x, __x, __x,
+               __x, __x, __x, __x, __x, __x, __x, __x,
+       };
+}
+
+#if defined(__GNUC__) && !defined(__clang__)
+_INTRINSATTR
+static __inline uint32x4_t
+vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i)
+{
+#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
+       return __builtin_shuffle(__hi, __lo,
+           (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i });
+#else
+       return __builtin_shuffle(__lo, __hi,
+           (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 });
+#endif
+}
+#elif defined(__clang__)
+#ifdef __LITTLE_ENDIAN__
+#define        vextq_u32(__lo, __hi, __i)                                            \
+       (uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo),                 \
+           (int8x16_t)(__hi), (__i), 50)
+#else
+#define        vextq_u32(__lo, __hi, __i) (                                          \
+{                                                                            \
+       uint32x4_t __tlo = (__lo);                                            \
+       uint32x4_t __thi = (__hi);                                            \
+       uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo, 3,2,1,0);   \
+       uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi, 3,2,1,0);   \
+       uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r,            \
+           (int8x16_t)__hi_r, __i, 50);                                      \
+       __builtin_shufflevector(__r, __r, 3,2,1,0);                           \
+})
+#endif /* __LITTLE_ENDIAN__ */
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+_INTRINSATTR
+static __inline uint8x16_t
+vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i)
+{
+#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
+       return __builtin_shuffle(__hi, __lo,
+           (uint8x16_t) {
+               16 - __i, 17 - __i, 18 - __i, 19 - __i,
+               20 - __i, 21 - __i, 22 - __i, 23 - __i,
+               24 - __i, 25 - __i, 26 - __i, 27 - __i,
+               28 - __i, 29 - __i, 30 - __i, 31 - __i,
+       });
+#else
+       return __builtin_shuffle(__lo, __hi,
+           (uint8x16_t) {
+               __i +  0, __i +  1, __i +  2, __i +  3,
+               __i +  4, __i +  5, __i +  6, __i +  7,
+               __i +  8, __i +  9, __i + 10, __i + 11,
+               __i + 12, __i + 13, __i + 14, __i + 15,
+       });
+#endif
+}
+#elif defined(__clang__)
+#ifdef __LITTLE_ENDIAN__
+#define        vextq_u8(__lo, __hi, __i)                                             \
+       (uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo),                 \
+           (int8x16_t)(__hi), (__i), 48)
+#else
+#define        vextq_u8(__lo, __hi, __i) (                                           \
+{                                                                            \
+       uint8x16_t __tlo = (__lo);                                            \
+       uint8x16_t __thi = (__hi);                                            \
+       uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo,             \
+           15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);                           \
+       uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi,             \
+           15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);                           \
+       uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r,            \
+           (int8x16_t)__hi_r, (__i), 48);                                    \
+       __builtin_shufflevector(__r, __r,                                     \
+           15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);                           \
+})
+#endif /* __LITTLE_ENDIAN__ */
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+_INTRINSATTR
+static __inline uint32_t
+vgetq_lane_u32(uint32x4_t __v, uint8_t __i)
+{
+#ifdef __aarch64__


