Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/sys/crypto/aes/arch/arm Reallocate registers to avoid abusing callee-saves registers, v8-v15.
details: https://anonhg.NetBSD.org/src/rev/f9785a1fdf00
branches: trunk
changeset: 935368:f9785a1fdf00
user: riastradh <riastradh%NetBSD.org@localhost>
date: Tue Jun 30 23:06:02 2020 +0000
description:
Reallocate registers to avoid abusing callee-saves registers, v8-v15.
Forgot to consult the AAPCS before committing this earlier -- oops!
While here, take advantage of the 32 aarch64 simd registers to avoid
all stack spills.
diffstat:
sys/crypto/aes/arch/arm/aes_armv8_64.S | 423 ++++++++++++++++----------------
1 files changed, 207 insertions(+), 216 deletions(-)
diffs (truncated from 726 to 300 lines):
diff -r 3c6fe576e0fa -r f9785a1fdf00 sys/crypto/aes/arch/arm/aes_armv8_64.S
--- a/sys/crypto/aes/arch/arm/aes_armv8_64.S Tue Jun 30 21:53:39 2020 +0000
+++ b/sys/crypto/aes/arch/arm/aes_armv8_64.S Tue Jun 30 23:06:02 2020 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: aes_armv8_64.S,v 1.3 2020/06/30 21:53:39 riastradh Exp $ */
+/* $NetBSD: aes_armv8_64.S,v 1.4 2020/06/30 23:06:02 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -116,7 +116,7 @@
adrl x4, unshiftrows_rotword_3
eor v0.16b, v0.16b, v0.16b /* q0 := 0 */
- ldr q8, [x4] /* q8 := unshiftrows_rotword_3 table */
+ ldr q16, [x4] /* q16 := unshiftrows_rotword_3 table */
str q1, [x0], #0x10 /* store master key as first round key */
mov x2, #10 /* round count */
@@ -136,7 +136,7 @@
/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
ld1r {v4.4s}, [x3], #4
- tbl v3.16b, {v3.16b}, v8.16b
+ tbl v3.16b, {v3.16b}, v16.16b
eor v3.16b, v3.16b, v4.16b
/*
@@ -175,8 +175,8 @@
adrl x4, unshiftrows_rotword_1
adrl x5, unshiftrows_rotword_3
eor v0.16b, v0.16b, v0.16b /* q0 := 0 */
- ldr q8, [x4] /* q8 := unshiftrows_rotword_1 */
- ldr q9, [x5] /* q9 := unshiftrows_rotword_3 */
+ ldr q16, [x4] /* q16 := unshiftrows_rotword_1 */
+ ldr q17, [x5] /* q17 := unshiftrows_rotword_3 */
str q1, [x0], #0x10 /* store master key[0:128) as round key */
mov x2, #12 /* round count */
@@ -197,7 +197,7 @@
/* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */
ld1r {v4.4s}, [x3], #4
- tbl v3.16b, {v3.16b}, v8.16b
+ tbl v3.16b, {v3.16b}, v16.16b
eor v3.16b, v3.16b, v4.16b
/*
@@ -269,8 +269,8 @@
* q2 = rk
* q3 = nrk
* v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
- * q8 = unshiftrows_rotword_1
- * q9 = unshiftrows_rotword_3
+ * q16 = unshiftrows_rotword_1
+ * q17 = unshiftrows_rotword_3
*
* We have to compute, in q1:
*
@@ -294,7 +294,7 @@
/* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */
ld1r {v4.4s}, [x3], #4
- tbl v1.16b, {v1.16b}, v9.16b
+ tbl v1.16b, {v1.16b}, v17.16b
eor v1.16b, v1.16b, v4.16b
/*
@@ -354,8 +354,8 @@
adrl x4, unshiftrows_rotword_3
adrl x5, unshiftrows_3
eor v0.16b, v0.16b, v0.16b /* q0 := 0 */
- ldr q8, [x4] /* q8 := unshiftrows_rotword_3 */
- ldr q9, [x5] /* q9 := unshiftrows_3 */
+ ldr q16, [x4] /* q16 := unshiftrows_rotword_3 */
+ ldr q17, [x5] /* q17 := unshiftrows_3 */
/* store master key as first two round keys */
stp q1, q2, [x0], #0x20
@@ -376,7 +376,7 @@
/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
ld1r {v4.4s}, [x3], #4
- tbl v3.16b, {v3.16b}, v8.16b
+ tbl v3.16b, {v3.16b}, v16.16b
eor v3.16b, v3.16b, v4.16b
/*
@@ -402,7 +402,7 @@
aese v3.16b, v0.16b
/* v3.4s[i] := SubBytes(rk[3]) */
- tbl v3.16b, {v3.16b}, v9.16b
+ tbl v3.16b, {v3.16b}, v17.16b
/*
* v5.4s := (0,prk[0],prk[1],prk[2])
@@ -458,9 +458,9 @@
ENTRY(aesarmv8_enc)
stp fp, lr, [sp, #-16]! /* push stack frame */
mov fp, sp
- ldr q0, [x1] /* q0 := block */
- bl aesarmv8_enc1
- str q0, [x2] /* store block */
+ ldr q0, [x1] /* q0 := ptxt */
+ bl aesarmv8_enc1 /* q0 := ctxt; trash x0/x3/q16 */
+ str q0, [x2] /* store ctxt */
ldp fp, lr, [sp], #16 /* pop stack frame */
ret
END(aesarmv8_enc)
@@ -476,9 +476,9 @@
ENTRY(aesarmv8_dec)
stp fp, lr, [sp, #-16]! /* push stack frame */
mov fp, sp
- ldr q0, [x1] /* q0 := block */
- bl aesarmv8_dec1
- str q0, [x2] /* store block */
+ ldr q0, [x1] /* q0 := ctxt */
+ bl aesarmv8_dec1 /* q0 := ptxt; trash x0/x3/q16 */
+ str q0, [x2] /* store ptxt */
ldp fp, lr, [sp], #16 /* pop stack frame */
ret
END(aesarmv8_dec)
@@ -505,7 +505,7 @@
eor v0.16b, v0.16b, v1.16b /* q0 := cv ^ ptxt */
mov x0, x9 /* x0 := enckey */
mov x3, x5 /* x3 := nrounds */
- bl aesarmv8_enc1 /* q0 := ciphertext block */
+ bl aesarmv8_enc1 /* q0 := ctxt; trash x0/x3/q16 */
subs x10, x10, #0x10 /* count down nbytes */
str q0, [x2], #0x10 /* store ciphertext block */
b.ne 1b /* repeat if x10 is nonzero */
@@ -527,10 +527,9 @@
* Standard ABI calling convention.
*/
ENTRY(aesarmv8_cbc_dec1)
- stp fp, lr, [sp, #-32]! /* push stack frame with uint128 */
+ stp fp, lr, [sp, #-16]! /* push stack frame */
mov fp, sp
- ldr q8, [x4] /* q8 := iv */
- str q8, [sp, #16] /* save iv */
+ ldr q24, [x4] /* q24 := iv */
mov x9, x0 /* x9 := enckey */
mov x10, x3 /* x10 := nbytes */
add x1, x1, x3 /* x1 := pointer past end of in */
@@ -539,18 +538,17 @@
str q0, [x4] /* update iv */
1: mov x0, x9 /* x0 := enckey */
mov x3, x5 /* x3 := nrounds */
- bl aesarmv8_dec1 /* q0 := cv ^ ptxt; trash x0/x3 */
+ bl aesarmv8_dec1 /* q0 := cv ^ ptxt; trash x0/x3/q16 */
subs x10, x10, #0x10 /* count down nbytes */
b.eq 2f /* stop if this is the first block */
- ldr q8, [x1, #-0x10]! /* q8 := chaining value */
- eor v0.16b, v0.16b, v8.16b /* q0 := plaintext block */
+ ldr q31, [x1, #-0x10]! /* q31 := chaining value */
+ eor v0.16b, v0.16b, v31.16b /* q0 := plaintext block */
str q0, [x2, #-0x10]! /* store plaintext block */
- mov v0.16b, v8.16b /* move cv = ciphertext block */
+ mov v0.16b, v31.16b /* move cv = ciphertext block */
b 1b
-2: ldr q8, [sp, #16] /* q8 := iv */
- eor v0.16b, v0.16b, v8.16b /* q0 := first plaintext block */
+2: eor v0.16b, v0.16b, v24.16b /* q0 := first plaintext block */
str q0, [x2, #-0x10]! /* store first plaintext block */
- ldp fp, lr, [sp], #32 /* pop stack frame */
+ ldp fp, lr, [sp], #16 /* pop stack frame */
ret
END(aesarmv8_cbc_dec1)
@@ -566,10 +564,9 @@
* Standard ABI calling convention.
*/
ENTRY(aesarmv8_cbc_dec8)
- stp fp, lr, [sp, #-32]! /* push stack frame with uint128 */
+ stp fp, lr, [sp, #-16]! /* push stack frame */
mov fp, sp
- ldr q8, [x4] /* q8 := iv */
- str q8, [sp, #16] /* save iv */
+ ldr q24, [x4] /* q24 := iv */
mov x9, x0 /* x9 := enckey */
mov x10, x3 /* x10 := nbytes */
add x1, x1, x3 /* x1 := pointer past end of in */
@@ -579,23 +576,24 @@
1: ldp q4, q5, [x1, #-0x20]!
ldp q2, q3, [x1, #-0x20]!
ldp q0, q1, [x1, #-0x20]!
- mov v15.16b, v6.16b /* q[8+i] := cv[i], 0<i<8 */
- mov v14.16b, v5.16b
- mov v13.16b, v4.16b
- mov v12.16b, v3.16b
- mov v11.16b, v2.16b
- mov v10.16b, v1.16b
- mov v9.16b, v0.16b
+ mov v31.16b, v6.16b /* q[24+i] := cv[i], 0<i<8 */
+ mov v30.16b, v5.16b
+ mov v29.16b, v4.16b
+ mov v28.16b, v3.16b
+ mov v27.16b, v2.16b
+ mov v26.16b, v1.16b
+ mov v25.16b, v0.16b
mov x0, x9 /* x0 := enckey */
mov x3, x5 /* x3 := nrounds */
- bl aesarmv8_dec8 /* q[i] := cv[i] ^ pt[i] */
- eor v7.16b, v7.16b, v15.16b /* q[i] := pt[i] */
- eor v6.16b, v6.16b, v14.16b
- eor v5.16b, v5.16b, v13.16b
- eor v4.16b, v4.16b, v12.16b
- eor v3.16b, v3.16b, v11.16b
- eor v2.16b, v2.16b, v10.16b
- eor v1.16b, v1.16b, v9.16b
+ bl aesarmv8_dec8 /* q[i] := cv[i] ^ pt[i];
+ * trash x0/x3/q16 */
+ eor v7.16b, v7.16b, v31.16b /* q[i] := pt[i] */
+ eor v6.16b, v6.16b, v30.16b
+ eor v5.16b, v5.16b, v29.16b
+ eor v4.16b, v4.16b, v28.16b
+ eor v3.16b, v3.16b, v27.16b
+ eor v2.16b, v2.16b, v26.16b
+ eor v1.16b, v1.16b, v25.16b
subs x10, x10, #0x80 /* count down nbytes */
stp q6, q7, [x2, #-0x20]! /* store plaintext blocks */
stp q4, q5, [x2, #-0x20]!
@@ -605,10 +603,9 @@
eor v0.16b, v0.16b, v7.16b /* q0 := pt0 */
stp q0, q1, [x2, #-0x20]!
b 1b
-2: ldr q8, [sp, #16] /* q8 := iv */
- eor v0.16b, v0.16b, v8.16b /* q0 := pt0 */
+2: eor v0.16b, v0.16b, v24.16b /* q0 := pt0 */
stp q0, q1, [x2, #-0x20]! /* store first two plaintext blocks */
- ldp fp, lr, [sp], #32 /* pop stack frame */
+ ldp fp, lr, [sp], #16 /* pop stack frame */
ret
END(aesarmv8_cbc_dec8)
@@ -629,18 +626,18 @@
mov fp, sp
mov x9, x0 /* x9 := enckey */
mov x10, x3 /* x10 := nbytes */
- ldr q9, [x4] /* q9 := tweak */
+ ldr q31, [x4] /* q31 := tweak */
1: ldr q0, [x1], #0x10 /* q0 := ptxt */
mov x0, x9 /* x0 := enckey */
mov x3, x5 /* x3 := nrounds */
- eor v0.16b, v0.16b, v9.16b /* q0 := ptxt ^ tweak */
- bl aesarmv8_enc1 /* q0 := AES(ptxt ^ tweak) */
- eor v0.16b, v0.16b, v9.16b /* q0 := AES(ptxt ^ tweak) ^ tweak */
+ eor v0.16b, v0.16b, v31.16b /* q0 := ptxt ^ tweak */
+ bl aesarmv8_enc1 /* q0 := AES(...); trash x0/x3/q16 */
+ eor v0.16b, v0.16b, v31.16b /* q0 := AES(ptxt ^ tweak) ^ tweak */
str q0, [x2], #0x10 /* store ciphertext block */
- bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
+ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
subs x10, x10, #0x10 /* count down nbytes */
b.ne 1b /* repeat if more blocks */
- str q9, [x4] /* update tweak */
+ str q31, [x4] /* update tweak */
ldp fp, lr, [sp], #16 /* pop stack frame */
ret
END(aesarmv8_xts_enc1)
@@ -657,61 +654,58 @@
* Standard ABI calling convention.
*/
ENTRY(aesarmv8_xts_enc8)
- stp fp, lr, [sp, #-48]! /* push stack frame uint128[2] */
+ stp fp, lr, [sp, #-16]! /* push stack frame */
mov fp, sp
mov x9, x0 /* x9 := enckey */
mov x10, x3 /* x10 := nbytes */
- ldr q9, [x4] /* q9 := tweak */
-1: str q9, [sp, #16] /* save tweak[0] */
- bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
- str q9, [sp, #32] /* save tweak[1] */
- bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
- mov v10.16b, v9.16b /* q10 := tweak[2] */
- bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
- mov v11.16b, v9.16b /* q11 := tweak[3] */
- bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
- mov v12.16b, v9.16b /* q11 := tweak[4] */
- bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
- mov v13.16b, v9.16b /* q11 := tweak[5] */
- bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
- mov v14.16b, v9.16b /* q11 := tweak[6] */
- bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
- mov v15.16b, v9.16b /* q11 := tweak[7] */
- ldp q8, q9, [sp, #16] /* q8 := tweak[0], q9 := tweak[1] */
- ldp q0, q1, [x1], #0x20 /* q[i] := pt[i] */
+ ldr q31, [x4] /* q31 := tweak */
+1: mov v24.16b, v31.16b /* q24 := tweak[0] */
+ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
+ mov v25.16b, v31.16b /* q25 := tweak[1] */
+ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
+ mov v26.16b, v31.16b /* q26 := tweak[2] */
+ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
+ mov v27.16b, v31.16b /* q27 := tweak[3] */
+ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
+ mov v28.16b, v31.16b /* q28 := tweak[4] */
+ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
+ mov v29.16b, v31.16b /* q29 := tweak[5] */
+ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
Home |
Main Index |
Thread Index |
Old Index