Source-Changes-HG archive


[src/trunk]: src/sys/crypto Fix ARM NEON implementations of AES and ChaCha on big-endian ARM



details:   https://anonhg.NetBSD.org/src/rev/7201e6d4c70f
branches:  trunk
changeset: 1012712:7201e6d4c70f
user:      riastradh <riastradh%NetBSD.org@localhost>
date:      Sat Aug 08 14:47:01 2020 +0000

description:
Fix ARM NEON implementations of AES and ChaCha on big-endian ARM.

Add new macros such as VQ_N_U32(a,b,c,d) for NEON vector initializers.
These are needed because GCC and Clang disagree on the ordering of
lanes, depending on whether the target is 64-bit big-endian, 32-bit
big-endian, or little-endian -- and, bizarrely, both of them disagree
with the architectural numbering of lanes.
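
For illustration only -- this is a simplified sketch, not the
arm_neon_imm.h added by this change, which also has to distinguish
GCC from Clang and 32-bit from 64-bit big-endian -- such an
initializer macro boils down to reordering the element list to match
the lane layout the compiler expects; names and the exact big-endian
reordering below are illustrative:

        #include <arm_neon.h>

        #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
        #define VQ_N_U32(a,b,c,d)       { a, b, c, d }
        #else
        /* illustrative reversal only; the real mapping is per-compiler */
        #define VQ_N_U32(a,b,c,d)       { d, c, b, a }
        #endif

        /* (0,0,0,1) -- the 32-bit counter increment used by the CTR loops. */
        static const uint32x4_t ctr32_inc = VQ_N_U32(0, 0, 0, 1);

Callers write the elements in one fixed order and the macro absorbs
whatever order the compiler wants for the target.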

Experimented with using

        static const uint8_t x8[16] = {...};

        uint8x16_t x = vld1q_u8(x8);

which doesn't require knowing anything about the ordering of lanes,
but this generates considerably worse code and apparently confuses
GCC into not recognizing the constant value of x8.
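
Written out as a self-contained example, that rejected alternative
looks like the sketch below; the names mc_forward0_bytes and
load_mc_forward0 are illustrative, and the byte values are the first
mc_forward entry from the diff that follows:

        #include <stdint.h>
        #include <arm_neon.h>

        /* Byte table in memory order; no lane-order knowledge needed. */
        static const uint8_t mc_forward0_bytes[16] = {
                0x01,0x02,0x03,0x00, 0x05,0x06,0x07,0x04,
                0x09,0x0A,0x0B,0x08, 0x0D,0x0E,0x0F,0x0C,
        };

        static inline uint8x16_t
        load_mc_forward0(void)
        {
                /*
                 * vld1q_u8 reads the 16 bytes in memory order on either
                 * endianness, but the compiler no longer sees a
                 * compile-time vector constant here, hence the worse
                 * generated code noted above.
                 */
                return vld1q_u8(mc_forward0_bytes);
        }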

Fix some clang mistakes while here, too.

diffstat:

 sys/crypto/aes/arch/arm/aes_armv8_64.S      |   27 +---
 sys/crypto/aes/arch/arm/aes_neon.c          |  200 ++++++++++++++--------------
 sys/crypto/aes/arch/arm/aes_neon_32.S       |   74 +++++-----
 sys/crypto/aes/arch/arm/aes_neon_impl.h     |    3 +-
 sys/crypto/aes/arch/arm/aes_neon_subr.c     |   80 +++++-----
 sys/crypto/aes/arch/arm/arm_neon.h          |  171 ++++++++++++++++++++++-
 sys/crypto/aes/arch/arm/arm_neon_imm.h      |   80 +++++++++++
 sys/crypto/chacha/arch/arm/arm_neon.h       |   44 ++++-
 sys/crypto/chacha/arch/arm/arm_neon_imm.h   |   80 +++++++++++
 sys/crypto/chacha/arch/arm/chacha_neon.c    |  162 +++++++++------------
 sys/crypto/chacha/arch/arm/chacha_neon_32.S |  178 +++++++-----------------
 sys/crypto/chacha/arch/arm/chacha_neon_64.S |  132 +++++++++---------
 12 files changed, 720 insertions(+), 511 deletions(-)

diffs (truncated from 2141 to 300 lines):

diff -r d614b4182592 -r 7201e6d4c70f sys/crypto/aes/arch/arm/aes_armv8_64.S
--- a/sys/crypto/aes/arch/arm/aes_armv8_64.S    Sat Aug 08 14:43:28 2020 +0000
+++ b/sys/crypto/aes/arch/arm/aes_armv8_64.S    Sat Aug 08 14:47:01 2020 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: aes_armv8_64.S,v 1.11 2020/07/27 20:57:23 riastradh Exp $      */
+/*     $NetBSD: aes_armv8_64.S,v 1.12 2020/08/08 14:47:01 riastradh Exp $      */
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -26,11 +26,9 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include <sys/endian.h>
-
 #include <aarch64/asm.h>
 
-RCSID("$NetBSD: aes_armv8_64.S,v 1.11 2020/07/27 20:57:23 riastradh Exp $")
+RCSID("$NetBSD: aes_armv8_64.S,v 1.12 2020/08/08 14:47:01 riastradh Exp $")
 
        .arch_extension aes
 
@@ -921,19 +919,13 @@
        ld1     {v5.4s}, [x11]          /* q5 := (0,0,0,1) (host-endian) */
        mov     x9, x0                  /* x9 := enckey */
        mov     x10, x3                 /* x10 := nbytes */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
        rev32   v2.16b, v2.16b          /* q2 := ctr (host-endian) */
-#endif
        _ALIGN_TEXT
 1:     ldr     q3, [x1], #0x10         /* q3 := plaintext block */
        add     v2.4s, v2.4s, v5.4s     /* increment ctr (32-bit) */
        mov     x0, x9                  /* x0 := enckey */
        mov     x3, x5                  /* x3 := nrounds */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
        rev32   v1.16b, v2.16b          /* q1 := ctr (big-endian) */
-#else
-       mov     v1.16b, v2.16b          /* q1 := ctr (big-endian) */
-#endif
        eor     v0.16b, v0.16b, v3.16b  /* q0 := auth ^ ptxt */
        bl      aesarmv8_enc2           /* q0 := auth', q1 := pad;
                                         * trash x0/x3/q16 */
@@ -941,9 +933,7 @@
        subs    x10, x10, #0x10         /* count down bytes */
        str     q3, [x2], #0x10         /* store ciphertext block */
        b.ne    1b                      /* repeat if more blocks */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
        rev32   v2.16b, v2.16b          /* q2 := ctr (big-endian) */
-#endif
        stp     q0, q2, [x4]            /* store updated auth/ctr */
        ldp     fp, lr, [sp], #16       /* pop stack frame */
        ret
@@ -968,18 +958,12 @@
        ld1     {v5.4s}, [x11]          /* q5 := (0,0,0,1) (host-endian) */
        mov     x9, x0                  /* x9 := enckey */
        mov     x10, x3                 /* x10 := nbytes */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
        rev32   v2.16b, v2.16b          /* q2 := ctr (host-endian) */
-#endif
 
        /* Decrypt the first block.  */
        add     v2.4s, v2.4s, v5.4s     /* increment ctr (32-bit) */
        mov     x3, x5                  /* x3 := nrounds */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
        rev32   v0.16b, v2.16b          /* q0 := ctr (big-endian) */
-#else
-       mov     v0.16b, v2.16b          /* q0 := ctr (big-endian) */
-#endif
        ldr     q3, [x1], #0x10         /* q3 := ctxt */
        bl      aesarmv8_enc1           /* q0 := pad; trash x0/x3/q16 */
        b       2f
@@ -995,11 +979,7 @@
        add     v2.4s, v2.4s, v5.4s     /* increment ctr (32-bit) */
        mov     x0, x9                  /* x0 := enckey */
        mov     x3, x5                  /* x3 := nrounds */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
        rev32   v0.16b, v2.16b          /* q0 := ctr (big-endian) */
-#else
-       mov     v0.16b, v2.16b          /* q0 := ctr (big-endian) */
-#endif
        ldr     q3, [x1], #0x10         /* q3 := ctxt */
        bl      aesarmv8_enc2           /* q0 := pad, q1 := auth';
                                         * trash x0/x3/q16 */
@@ -1009,15 +989,14 @@
        eor     v1.16b, v1.16b, v3.16b  /* q1 := auth ^ ptxt */
        b.ne    1b
 
-#if _BYTE_ORDER == _LITTLE_ENDIAN
        rev32   v2.16b, v2.16b          /* q2 := ctr (big-endian) */
-#endif
 
        /* Authenticate the last block.  */
        mov     x0, x9                  /* x0 := enckey */
        mov     x3, x5                  /* x3 := nrounds */
        mov     v0.16b, v1.16b          /* q0 := auth ^ ptxt */
        bl      aesarmv8_enc1           /* q0 := auth'; trash x0/x3/q16 */
+
        stp     q0, q2, [x4]            /* store updated auth/ctr */
        ldp     fp, lr, [sp], #16       /* pop stack frame */
        ret
diff -r d614b4182592 -r 7201e6d4c70f sys/crypto/aes/arch/arm/aes_neon.c
--- a/sys/crypto/aes/arch/arm/aes_neon.c        Sat Aug 08 14:43:28 2020 +0000
+++ b/sys/crypto/aes/arch/arm/aes_neon.c        Sat Aug 08 14:47:01 2020 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $   */
+/*     $NetBSD: aes_neon.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $   */
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -39,7 +39,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $");
 
 #include <sys/types.h>
 
@@ -60,141 +60,141 @@
 
 static const uint8x16_t
 mc_forward[4] = {
-       {0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04,
-        0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C},
-       {0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08,
-        0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00},
-       {0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C,
-        0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04},
-       {0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00,
-        0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08},
+       VQ_N_U8(0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04,
+           0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C),
+       VQ_N_U8(0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08,
+           0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00),
+       VQ_N_U8(0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C,
+           0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04),
+       VQ_N_U8(0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00,
+           0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08),
 },
 mc_backward[4] __aarch64_used = {
-       {0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06,
-        0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E},
-       {0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02,
-        0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A},
-       {0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E,
-        0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06},
-       {0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A,
-        0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02},
+       VQ_N_U8(0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06,
+           0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E),
+       VQ_N_U8(0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02,
+           0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A),
+       VQ_N_U8(0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E,
+           0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06),
+       VQ_N_U8(0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A,
+           0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02),
 },
 ipt[2] __aarch64_used = {
-       {0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2,
-        0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA},
-       {0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C,
-        0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD},
+       VQ_N_U8(0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2,
+           0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA),
+       VQ_N_U8(0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C,
+           0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD),
 },
 opt[2] = {
-       {0x00,0x60,0xB6,0xD6,0x29,0x49,0x9F,0xFF,
-        0x08,0x68,0xBE,0xDE,0x21,0x41,0x97,0xF7},
-       {0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01,
-        0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1},
+       VQ_N_U8(0x00,0x60,0xB6,0xD6,0x29,0x49,0x9F,0xFF,
+           0x08,0x68,0xBE,0xDE,0x21,0x41,0x97,0xF7),
+       VQ_N_U8(0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01,
+           0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1),
 },
 dipt[2] __aarch64_used = {
-       {0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F,
-        0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15},
-       {0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86,
-        0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12},
+       VQ_N_U8(0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F,
+           0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15),
+       VQ_N_U8(0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86,
+           0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12),
 },
 sb1[2] __aarch64_used = {
-       {0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1,
-        0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5},
-       {0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36,
-        0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B},
+       VQ_N_U8(0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1,
+           0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5),
+       VQ_N_U8(0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36,
+           0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B),
 },
 sb2[2] __aarch64_used = {
-       {0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2,
-        0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E},
-       {0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69,
-        0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2},
+       VQ_N_U8(0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2,
+           0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E),
+       VQ_N_U8(0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69,
+           0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2),
 },
 sbo[2] __aarch64_used = {
-       {0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0,
-        0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15},
-       {0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF,
-        0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E},
+       VQ_N_U8(0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0,
+           0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15),
+       VQ_N_U8(0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF,
+           0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E),
 },
 dsb9[2] __aarch64_used = {
-       {0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85,
-        0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA},
-       {0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0,
-        0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72},
+       VQ_N_U8(0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85,
+           0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA),
+       VQ_N_U8(0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0,
+           0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72),
 },
 dsbd[2] __aarch64_used = {
-       {0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D,
-        0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5},
-       {0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C,
-        0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29},
+       VQ_N_U8(0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D,
+           0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5),
+       VQ_N_U8(0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C,
+           0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29),
 },
 dsbb[2] __aarch64_used = {
-       {0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0,
-        0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60},
-       {0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1,
-        0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3},
+       VQ_N_U8(0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0,
+           0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60),
+       VQ_N_U8(0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1,
+           0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3),
 },
 dsbe[2] __aarch64_used = {
-       {0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46,
-        0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22},
-       {0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C,
-        0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94},
+       VQ_N_U8(0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46,
+           0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22),
+       VQ_N_U8(0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C,
+           0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94),
 },
 dsbo[2] __aarch64_used = {
-       {0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13,
-        0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7},
-       {0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12,
-        0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA},
+       VQ_N_U8(0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13,
+           0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7),
+       VQ_N_U8(0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12,
+           0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA),
 },
 dks1[2] = {
-       {0x00,0xA7,0xD9,0x7E,0xC8,0x6F,0x11,0xB6,
-        0xFC,0x5B,0x25,0x82,0x34,0x93,0xED,0x4A},
-       {0x00,0x33,0x14,0x27,0x62,0x51,0x76,0x45,
-        0xCE,0xFD,0xDA,0xE9,0xAC,0x9F,0xB8,0x8B},
+       VQ_N_U8(0x00,0xA7,0xD9,0x7E,0xC8,0x6F,0x11,0xB6,
+           0xFC,0x5B,0x25,0x82,0x34,0x93,0xED,0x4A),
+       VQ_N_U8(0x00,0x33,0x14,0x27,0x62,0x51,0x76,0x45,
+           0xCE,0xFD,0xDA,0xE9,0xAC,0x9F,0xB8,0x8B),
 },
 dks2[2] = {
-       {0x00,0x64,0xA8,0xCC,0xEB,0x8F,0x43,0x27,
-        0x61,0x05,0xC9,0xAD,0x8A,0xEE,0x22,0x46},
-       {0x00,0xDD,0x92,0x4F,0xCE,0x13,0x5C,0x81,
-        0xF2,0x2F,0x60,0xBD,0x3C,0xE1,0xAE,0x73},
+       VQ_N_U8(0x00,0x64,0xA8,0xCC,0xEB,0x8F,0x43,0x27,
+           0x61,0x05,0xC9,0xAD,0x8A,0xEE,0x22,0x46),
+       VQ_N_U8(0x00,0xDD,0x92,0x4F,0xCE,0x13,0x5C,0x81,
+           0xF2,0x2F,0x60,0xBD,0x3C,0xE1,0xAE,0x73),
 },
 dks3[2] = {
-       {0x00,0xC7,0xC6,0x01,0x02,0xC5,0xC4,0x03,
-        0xFB,0x3C,0x3D,0xFA,0xF9,0x3E,0x3F,0xF8},
-       {0x00,0xF7,0xCF,0x38,0xD6,0x21,0x19,0xEE,
-        0x4B,0xBC,0x84,0x73,0x9D,0x6A,0x52,0xA5},
+       VQ_N_U8(0x00,0xC7,0xC6,0x01,0x02,0xC5,0xC4,0x03,
+           0xFB,0x3C,0x3D,0xFA,0xF9,0x3E,0x3F,0xF8),
+       VQ_N_U8(0x00,0xF7,0xCF,0x38,0xD6,0x21,0x19,0xEE,
+           0x4B,0xBC,0x84,0x73,0x9D,0x6A,0x52,0xA5),
 },
 dks4[2] = {
-       {0x00,0x20,0x73,0x53,0xB0,0x90,0xC3,0xE3,
-        0x43,0x63,0x30,0x10,0xF3,0xD3,0x80,0xA0},


