Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/sys/crypto/aes/arch/arm Provide hand-written AES NEON assembly for arm32.
details: https://anonhg.NetBSD.org/src/rev/698f039f31ac
branches: trunk
changeset: 1011443:698f039f31ac
user: riastradh <riastradh%NetBSD.org@localhost>
date: Mon Jun 29 23:57:56 2020 +0000
description:
Provide hand-written AES NEON assembly for arm32.
gcc does a lousy job at compiling 128-bit NEON intrinsics on arm32;
hand-writing it made it about 12x faster, by avoiding a zillion loads
and stores to spill everything and the kitchen sink onto the stack.
(But gcc does fine on aarch64, presumably because it has twice as
many registers and doesn't have to deal with q2=d4/d5 overlapping.)
diffstat:
sys/crypto/aes/arch/arm/aes_neon.c | 44 +-
sys/crypto/aes/arch/arm/aes_neon_32.S | 653 ++++++++++++++++++++++++++++++++++
sys/crypto/aes/arch/arm/files.aesneon | 6 +-
3 files changed, 688 insertions(+), 15 deletions(-)
diffs (truncated from 818 to 300 lines):
diff -r ea4f95a9ad32 -r 698f039f31ac sys/crypto/aes/arch/arm/aes_neon.c
--- a/sys/crypto/aes/arch/arm/aes_neon.c Mon Jun 29 23:56:30 2020 +0000
+++ b/sys/crypto/aes/arch/arm/aes_neon.c Mon Jun 29 23:57:56 2020 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: aes_neon.c,v 1.1 2020/06/29 23:56:31 riastradh Exp $ */
+/* $NetBSD: aes_neon.c,v 1.2 2020/06/29 23:57:56 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -39,7 +39,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.1 2020/06/29 23:56:31 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.2 2020/06/29 23:57:56 riastradh Exp $");
#include <sys/types.h>
@@ -47,6 +47,12 @@
#include "aes_neon_impl.h"
+#ifdef __aarch64__
+#define __aarch64_used
+#else
+#define __aarch64_used __unused
+#endif
+
static const uint8x16_t
mc_forward[4] = {
{0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04,
@@ -58,7 +64,7 @@
{0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00,
0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08},
},
-mc_backward[4] = {
+mc_backward[4] __aarch64_used = {
{0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06,
0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E},
{0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02,
@@ -68,7 +74,7 @@
{0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A,
0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02},
},
-ipt[2] = {
+ipt[2] __aarch64_used = {
{0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2,
0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA},
{0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C,
@@ -80,55 +86,55 @@
{0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01,
0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1},
},
-dipt[2] = {
+dipt[2] __aarch64_used = {
{0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F,
0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15},
{0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86,
0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12},
},
-sb1[2] = {
+sb1[2] __aarch64_used = {
{0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1,
0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5},
{0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36,
0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B},
},
-sb2[2] = {
+sb2[2] __aarch64_used = {
{0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2,
0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E},
{0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69,
0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2},
},
-sbo[2] = {
+sbo[2] __aarch64_used = {
{0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0,
0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15},
{0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF,
0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E},
},
-dsb9[2] = {
+dsb9[2] __aarch64_used = {
{0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85,
0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA},
{0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0,
0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72},
},
-dsbd[2] = {
+dsbd[2] __aarch64_used = {
{0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D,
0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5},
{0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C,
0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29},
},
-dsbb[2] = {
+dsbb[2] __aarch64_used = {
{0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0,
0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60},
{0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1,
0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3},
},
-dsbe[2] = {
+dsbe[2] __aarch64_used = {
{0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46,
0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22},
{0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C,
0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94},
},
-dsbo[2] = {
+dsbo[2] __aarch64_used = {
{0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13,
0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7},
{0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12,
@@ -164,7 +170,7 @@
{0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F,
0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28},
},
-sr[4] = {
+sr[4] __aarch64_used = {
{0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F},
{0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03,
@@ -533,6 +539,14 @@
storeroundkey(rk32, aes_schedule_mangle_last_dec(rk));
}
+#ifdef __aarch64__
+
+/*
+ * GCC does a lousy job of compiling NEON intrinsics for arm32, so the
+ * performance-critical parts below -- encryption and decryption -- are
+ * aarch64-only; arm32 uses hand-written assembly in aes_neon_32.S.
+ */
+
uint8x16_t
aes_neon_enc1(const struct aesenc *enc, uint8x16_t x, unsigned nrounds)
{
@@ -608,3 +622,5 @@
x ^= loadroundkey(rk32);
return vqtbl1q_u8(x, sr[i]);
}
+
+#endif
diff -r ea4f95a9ad32 -r 698f039f31ac sys/crypto/aes/arch/arm/aes_neon_32.S
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/crypto/aes/arch/arm/aes_neon_32.S Mon Jun 29 23:57:56 2020 +0000
@@ -0,0 +1,653 @@
+/* $NetBSD: aes_neon_32.S,v 1.1 2020/06/29 23:57:56 riastradh Exp $ */
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <arm/asm.h>
+
+ .fpu neon
+
+ .section .rodata
+ .p2align 4
+
+ .type inv,_ASM_TYPE_OBJECT
+inv:
+ .byte 0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E
+ .byte 0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04
+END(inv)
+
+ .type inva,_ASM_TYPE_OBJECT
+inva:
+ .byte 0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01
+ .byte 0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03
+END(inva)
+
+ .type mc_forward,_ASM_TYPE_OBJECT
+mc_forward:
+ .byte 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04 /* 0 */
+ .byte 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C
+
+ .byte 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08 /* 1 */
+ .byte 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00
+
+ .byte 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C /* 2 */
+ .byte 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04
+
+.Lmc_forward_3:
+ .byte 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00 /* 3 */
+ .byte 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08
+END(mc_forward)
+
+ .type mc_backward,_ASM_TYPE_OBJECT
+mc_backward:
+ .byte 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06 /* 0 */
+ .byte 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E
+
+ .byte 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02 /* 1 */
+ .byte 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A
+
+ .byte 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E /* 2 */
+ .byte 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06
+
+ .byte 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A /* 3 */
+ .byte 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02
+END(mc_backward)
+
+ .type sr,_ASM_TYPE_OBJECT
+sr:
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 /* 0 */
+ .byte 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F
+
+ .byte 0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03 /* 1 */
+ .byte 0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B
+
+ .byte 0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F /* 2 */
+ .byte 0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07
+
+ .byte 0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B /* 3 */
+ .byte 0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03
+END(sr)
+
+ .type iptlo,_ASM_TYPE_OBJECT
+iptlo:
+ .byte 0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2
+ .byte 0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA
+END(iptlo)
+
+ .type ipthi,_ASM_TYPE_OBJECT
+ipthi:
+ .byte 0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C
+ .byte 0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD
+END(ipthi)
+
+ .type sb1_0,_ASM_TYPE_OBJECT
+sb1_0:
+ .byte 0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1
+ .byte 0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5
+END(sb1_0)
+
+ .type sb1_1,_ASM_TYPE_OBJECT
+sb1_1:
+ .byte 0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36
+ .byte 0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B
+END(sb1_1)
+
+ .type sb2_0,_ASM_TYPE_OBJECT
+sb2_0:
+ .byte 0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2
+ .byte 0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E
+END(sb2_0)
+
+ .type sb2_1,_ASM_TYPE_OBJECT
+sb2_1:
+ .byte 0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69
+ .byte 0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2
+END(sb2_1)
+
+ .type sbo_0,_ASM_TYPE_OBJECT
+sbo_0:
+ .byte 0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0
+ .byte 0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15
+END(sbo_0)
+
+ .type sbo_1,_ASM_TYPE_OBJECT
+sbo_1:
+ .byte 0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF
+ .byte 0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E
+END(sbo_1)
+
+ .type diptlo,_ASM_TYPE_OBJECT
+diptlo:
+ .byte 0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F
+ .byte 0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15
+END(diptlo)
+
+ .type dipthi,_ASM_TYPE_OBJECT
+dipthi:
+ .byte 0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86
+ .byte 0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12
+END(dipthi)
Home |
Main Index |
Thread Index |
Old Index