Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/sys/crypto/chacha/arch/arm Note that VSRI seems to hurt here.
details: https://anonhg.NetBSD.org/src/rev/b81932f2cd0c
branches: trunk
changeset: 936488:b81932f2cd0c
user: riastradh <riastradh%NetBSD.org@localhost>
date: Mon Jul 27 20:58:56 2020 +0000
description:
Note that VSRI seems to hurt here.
diffstat:
sys/crypto/chacha/arch/arm/arm_neon.h | 36 +++++++++++++++++++++++++++++++-
sys/crypto/chacha/arch/arm/chacha_neon.c | 10 ++++++++-
2 files changed, 44 insertions(+), 2 deletions(-)
diffs (75 lines):
diff -r eeda7666768a -r b81932f2cd0c sys/crypto/chacha/arch/arm/arm_neon.h
--- a/sys/crypto/chacha/arch/arm/arm_neon.h Mon Jul 27 20:58:06 2020 +0000
+++ b/sys/crypto/chacha/arch/arm/arm_neon.h Mon Jul 27 20:58:56 2020 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: arm_neon.h,v 1.2 2020/07/27 20:58:06 riastradh Exp $ */
+/* $NetBSD: arm_neon.h,v 1.3 2020/07/27 20:58:56 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -529,6 +529,40 @@
#endif /* __LITTLE_ENDIAN__ */
#endif
+/*
+ * vsriq_n_u32(vins, vsh, bits)
+ *
+ * VSRI on four 32-bit lanes: each lane of vsh is shifted right by
+ * bits and inserted into the matching lane of vins.  The result is
+ * a uint32x4_t under every compiler/endianness combination below.
+ *
+ * NOTE(review): bits is handed to an immediate-operand builtin, so
+ * it is presumably required to be a compile-time constant; the
+ * clang macros therefore pass it through without a temporary.
+ */
+#if defined(__GNUC__) && !defined(__clang__)
+_INTRINSATTR
+static __inline uint32x4_t
+vsriq_n_u32(uint32x4_t __vins, uint32x4_t __vsh, uint8_t __bits)
+{
+#ifdef __aarch64__
+ return __builtin_aarch64_usri_nv4si_uuus(__vins, __vsh, __bits);
+#else
+ return (uint32x4_t)__builtin_neon_vsri_nv4si((int32x4_t)__vins,
+ (int32x4_t)__vsh, __bits);
+#endif
+}
+#elif defined(__clang__)
+#ifdef __LITTLE_ENDIAN__
+#define vsriq_n_u32(__vins, __vsh, __bits) \
+ (uint32x4_t)__builtin_neon_vsriq_n_v((int32x4_t)(__vins), \
+ (int32x4_t)(__vsh), (__bits), 34)
+#else
+/* Big-endian: reverse lanes going into the builtin and coming out. */
+#define vsriq_n_u32(__vins, __vsh, __bits) ( \
+{ \
+ int32x4_t __tvins = (int32x4_t)(__vins); \
+ int32x4_t __tvsh = (int32x4_t)(__vsh); \
+ int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \
+ 3,2,1,0); \
+ int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \
+ 3,2,1,0); \
+ int32x4_t __r = __builtin_neon_vsriq_n_v(__vins_r, __vsh_r, \
+ (__bits), 34); \
+ (uint32x4_t)__builtin_shufflevector(__r, __r, 3,2,1,0); \
+})
+#endif
+#endif
+
_INTRINSATTR
static __inline void
vst1q_u32(uint32_t *__p32, uint32x4_t __v)
diff -r eeda7666768a -r b81932f2cd0c sys/crypto/chacha/arch/arm/chacha_neon.c
--- a/sys/crypto/chacha/arch/arm/chacha_neon.c Mon Jul 27 20:58:06 2020 +0000
+++ b/sys/crypto/chacha/arch/arm/chacha_neon.c Mon Jul 27 20:58:56 2020 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: chacha_neon.c,v 1.4 2020/07/27 20:58:06 riastradh Exp $ */
+/* $NetBSD: chacha_neon.c,v 1.5 2020/07/27 20:58:56 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -36,7 +36,15 @@
vrolq_n_u32(uint32x4_t x, uint8_t n)
{
+ /*
+ * Tempting to use VSHL/VSRI instead of VSHL/VSHR/VORR, but in
+ * practice it hurts performance at least on Cortex-A8.
+ */
+#if 1
return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n);
+#else
+ return vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - n);
+#endif
}
static inline uint32x4_t
Home |
Main Index |
Thread Index |
Old Index