Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/sys/crypto/aes/arch/arm aes neon: Issue 256-bit loads rather than pairs of 128-bit loads.
details: https://anonhg.NetBSD.org/src/rev/3b140607dcac
branches: trunk
changeset: 1014013:3b140607dcac
user: riastradh <riastradh%NetBSD.org@localhost>
date: Thu Sep 10 11:29:02 2020 +0000
description:
aes neon: Issue 256-bit loads rather than pairs of 128-bit loads.
Not sure why I didn't realize you could do this before!
Saves some temporary registers that can now be allocated to shave off
a few cycles.
diffstat:
sys/crypto/aes/arch/arm/aes_neon_32.S | 247 ++++++++++++---------------------
1 file changed, 93 insertions(+), 154 deletions(-)
diffs (truncated from 389 to 300 lines):
diff -r 256116917028 -r 3b140607dcac sys/crypto/aes/arch/arm/aes_neon_32.S
--- a/sys/crypto/aes/arch/arm/aes_neon_32.S Thu Sep 10 06:02:30 2020 +0000
+++ b/sys/crypto/aes/arch/arm/aes_neon_32.S Thu Sep 10 11:29:02 2020 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: aes_neon_32.S,v 1.6 2020/08/16 18:02:03 riastradh Exp $ */
+/* $NetBSD: aes_neon_32.S,v 1.7 2020/09/10 11:29:02 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -28,7 +28,7 @@
#include <arm/asm.h>
-RCSID("$NetBSD: aes_neon_32.S,v 1.6 2020/08/16 18:02:03 riastradh Exp $")
+RCSID("$NetBSD: aes_neon_32.S,v 1.7 2020/09/10 11:29:02 riastradh Exp $")
.fpu neon
@@ -38,9 +38,10 @@
.long .Lconstants - .
.section .rodata
- .p2align 4
+ .p2align 5
.Lconstants:
+.Linv_inva: /* inv and inva must be consecutive */
.type inv,_ASM_TYPE_OBJECT
inv:
.byte 0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E
@@ -99,125 +100,85 @@
.byte 0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03
END(sr)
- .type iptlo,_ASM_TYPE_OBJECT
-iptlo:
- .byte 0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2
+ .type ipt,_ASM_TYPE_OBJECT
+ipt:
+ .byte 0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2 /* lo */
.byte 0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA
-END(iptlo)
-
- .type ipthi,_ASM_TYPE_OBJECT
-ipthi:
- .byte 0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C
+ .byte 0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C /* hi */
.byte 0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD
-END(ipthi)
+END(ipt)
- .type sb1_0,_ASM_TYPE_OBJECT
-sb1_0:
- .byte 0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1
+ .type sb1,_ASM_TYPE_OBJECT
+sb1:
+ .byte 0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1 /* 0 */
.byte 0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5
-END(sb1_0)
-
- .type sb1_1,_ASM_TYPE_OBJECT
-sb1_1:
- .byte 0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36
+ .byte 0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36 /* 1 */
.byte 0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B
-END(sb1_1)
-
- .type sb2_0,_ASM_TYPE_OBJECT
-sb2_0:
- .byte 0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2
- .byte 0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E
-END(sb2_0)
+END(sb1)
- .type sb2_1,_ASM_TYPE_OBJECT
-sb2_1:
- .byte 0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69
+ .type sb2,_ASM_TYPE_OBJECT
+sb2:
+ .byte 0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2 /* 0 */
+ .byte 0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E
+ .byte 0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69 /* 1 */
.byte 0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2
-END(sb2_1)
-
- .type sbo_0,_ASM_TYPE_OBJECT
-sbo_0:
- .byte 0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0
- .byte 0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15
-END(sbo_0)
+END(sb2)
- .type sbo_1,_ASM_TYPE_OBJECT
-sbo_1:
- .byte 0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF
+ .type sbo,_ASM_TYPE_OBJECT
+sbo:
+ .byte 0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0 /* 0 */
+ .byte 0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15
+ .byte 0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF /* 1 */
.byte 0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E
-END(sbo_1)
+END(sbo)
- .type diptlo,_ASM_TYPE_OBJECT
-diptlo:
- .byte 0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F
+ .type dipt,_ASM_TYPE_OBJECT
+dipt:
+ .byte 0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F /* lo */
.byte 0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15
-END(diptlo)
-
- .type dipthi,_ASM_TYPE_OBJECT
-dipthi:
- .byte 0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86
+ .byte 0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86 /* hi */
.byte 0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12
-END(dipthi)
+END(dipt)
- .type dsb9_0,_ASM_TYPE_OBJECT
-dsb9_0:
- .byte 0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85
+ .type dsb9,_ASM_TYPE_OBJECT
+dsb9:
+ .byte 0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85 /* 0 */
.byte 0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA
-END(dsb9_0)
-
- .type dsb9_1,_ASM_TYPE_OBJECT
-dsb9_1:
- .byte 0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0
+ .byte 0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0 /* 1 */
.byte 0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72
-END(dsb9_1)
+END(dsb9)
- .type dsbd_0,_ASM_TYPE_OBJECT
-dsbd_0:
- .byte 0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D
+ .type dsbd,_ASM_TYPE_OBJECT
+dsbd:
+ .byte 0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D /* 0 */
.byte 0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5
-END(dsbd_0)
-
- .type dsbd_1,_ASM_TYPE_OBJECT
-dsbd_1:
- .byte 0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C
+ .byte 0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C /* 1 */
.byte 0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29
-END(dsbd_1)
-
- .type dsbb_0,_ASM_TYPE_OBJECT
-dsbb_0:
- .byte 0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0
- .byte 0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60
-END(dsbb_0)
+END(dsbd)
- .type dsbb_1,_ASM_TYPE_OBJECT
-dsbb_1:
- .byte 0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1
+ .type dsbb,_ASM_TYPE_OBJECT
+dsbb:
+ .byte 0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0 /* 0 */
+ .byte 0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60
+ .byte 0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1 /* 1 */
.byte 0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3
-END(dsbb_1)
-
- .type dsbe_0,_ASM_TYPE_OBJECT
-dsbe_0:
- .byte 0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46
- .byte 0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22
-END(dsbe_0)
+END(dsbb)
- .type dsbe_1,_ASM_TYPE_OBJECT
-dsbe_1:
- .byte 0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C
+ .type dsbe,_ASM_TYPE_OBJECT
+dsbe:
+ .byte 0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46 /* 0 */
+ .byte 0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22
+ .byte 0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C /* 1 */
.byte 0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94
-END(dsbe_1)
+END(dsbe)
- .type dsbo_0,_ASM_TYPE_OBJECT
-dsbo_0:
- .byte 0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13
+ .type dsbo,_ASM_TYPE_OBJECT
+dsbo:
+ .byte 0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13 /* 0 */
.byte 0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7
-END(dsbo_0)
-
- .type dsbo_1,_ASM_TYPE_OBJECT
-dsbo_1:
- .byte 0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12
+ .byte 0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12 /* 1 */
.byte 0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA
-END(dsbo_1)
+END(dsbo)
/*
* aes_neon_enc1(enc, x, nrounds)
@@ -274,7 +235,7 @@
ldr r12, .Lconstants_addr
adr r11, .Lconstants_addr
- vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */
+ vld1.8 {q14}, [r0 :128]! /* q14 = *rk++ */
movw r3, #0
vmov.i8 q1, #0x0f
@@ -282,24 +243,16 @@
add r12, r12, r11
/* (q4, q5) := (iptlo, ipthi) */
- add r6, r12, #(iptlo - .Lconstants)
- add r7, r12, #(ipthi - .Lconstants)
- vld1.8 {d8-d9}, [r6 :128]
- vld1.8 {d10-d11}, [r7 :128]
+ add r6, r12, #(ipt - .Lconstants)
+ vld1.8 {q4-q5}, [r6 :256]
/* load the rest of the constants */
- add r4, r12, #(sb1_0 - .Lconstants)
- add r5, r12, #(sb1_1 - .Lconstants)
- add r6, r12, #(sb2_0 - .Lconstants)
- add r7, r12, #(sb2_1 - .Lconstants)
- add r8, r12, #(inv - .Lconstants)
- add r10, r12, #(inva - .Lconstants)
- vld1.8 {d12-d13}, [r4 :128] /* q6 = sb1[0] */
- vld1.8 {d14-d15}, [r5 :128] /* q7 = sb1[1] */
- vld1.8 {d16-d17}, [r6 :128] /* q8 = sb2[0] */
- vld1.8 {d18-d19}, [r7 :128] /* q9 = sb2[1] */
- vld1.8 {d20-d21}, [r8 :128] /* q10 = inv */
- vld1.8 {d22-d23}, [r10 :128] /* q11 = inva */
+ add r4, r12, #(sb1 - .Lconstants)
+ add r6, r12, #(sb2 - .Lconstants)
+ add r8, r12, #(.Linv_inva - .Lconstants)
+ vld1.8 {q6-q7}, [r4 :256] /* q6 = sb1[0], q7 = sb1[1] */
+ vld1.8 {q8-q9}, [r6 :256] /* q8 = sb2[0], q9 = sb2[1] */
+ vld1.8 {q10-q11}, [r8 :256] /* q10 = inv, q11 = inva */
/* (r4, r5) := (&mc_forward[0], &mc_backward[0]) */
add r4, r12, #(mc_forward - .Lconstants)
@@ -323,7 +276,7 @@
b 2f
_ALIGN_TEXT
-1: vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */
+1: vld1.8 {q14}, [r0 :128]! /* q14 = *rk++ */
/* q0 := A = rk[i] + sb1_0(io) + sb1_1(jo) */
vtbl.8 d24, {d12-d13}, d4
@@ -343,8 +296,8 @@
/* (q12, q13) := (mc_forward[rmod4], mc_backward[rmod4]) */
add r6, r4, r3, lsl #4
add r7, r5, r3, lsl #4
- vld1.8 {d24-d25}, [r6]
- vld1.8 {d26-d27}, [r7]
+ vld1.8 {q12}, [r6 :128]
+ vld1.8 {q13}, [r7 :128]
/* q15 := A2_B = A2 + A(mcf) */
vtbl.8 d30, {d0-d1}, d24
@@ -413,14 +366,12 @@
/* (q6, q7, q15) := (sbo[0], sbo[1], sr[rmod4]) */
add r8, r12, #(sr - .Lconstants)
- add r6, r12, #(sbo_0 - .Lconstants)
- add r7, r12, #(sbo_1 - .Lconstants)
+ add r6, r12, #(sbo - .Lconstants)
add r8, r8, r3, lsl #4
- vld1.8 {d12-d13}, [r6 :128]
- vld1.8 {d14-d15}, [r7 :128]
- vld1.8 {d30-d31}, [r8 :128]
+ vld1.8 {q6-q7}, [r6 :256]
+ vld1.8 {q15}, [r8 :128]
- vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */
+ vld1.8 {q14}, [r0 :128]! /* q14 = *rk++ */
/* (q2, q3) := (sbo_0(io), sbo_1(jo)) */
vtbl.8 d4, {d12-d13}, d4
@@ -502,7 +453,7 @@
ldr r12, .Lconstants_addr
adr r11, .Lconstants_addr
- vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */
+ vld1.8 {q14}, [r0 :128]! /* q14 = *rk++ */
rsb r3, r1, #0 /* r3 := ~(x - 1) = -x */
vmov.i8 q1, #0x0f
and r3, r3, #3 /* r3 := 3 & ~(x - 1) */
Home |
Main Index |
Thread Index |
Old Index