Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/sys/crypto/aes/arch/arm Reallocate registers to avoid abusing callee-saves registers, v8-v15.



details:   https://anonhg.NetBSD.org/src/rev/f9785a1fdf00
branches:  trunk
changeset: 935368:f9785a1fdf00
user:      riastradh <riastradh%NetBSD.org@localhost>
date:      Tue Jun 30 23:06:02 2020 +0000

description:
Reallocate registers to avoid abusing callee-saves registers, v8-v15.

Forgot to consult the AAPCS before committing this before -- oops!

While here, take advantage of the 32 aarch64 simd registers to avoid
all stack spills.

diffstat:

 sys/crypto/aes/arch/arm/aes_armv8_64.S |  423 ++++++++++++++++----------------
 1 files changed, 207 insertions(+), 216 deletions(-)

diffs (truncated from 726 to 300 lines):

diff -r 3c6fe576e0fa -r f9785a1fdf00 sys/crypto/aes/arch/arm/aes_armv8_64.S
--- a/sys/crypto/aes/arch/arm/aes_armv8_64.S    Tue Jun 30 21:53:39 2020 +0000
+++ b/sys/crypto/aes/arch/arm/aes_armv8_64.S    Tue Jun 30 23:06:02 2020 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: aes_armv8_64.S,v 1.3 2020/06/30 21:53:39 riastradh Exp $       */
+/*     $NetBSD: aes_armv8_64.S,v 1.4 2020/06/30 23:06:02 riastradh Exp $       */
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -116,7 +116,7 @@
 
        adrl    x4, unshiftrows_rotword_3
        eor     v0.16b, v0.16b, v0.16b  /* q0 := 0 */
-       ldr     q8, [x4]        /* q8 := unshiftrows_rotword_3 table */
+       ldr     q16, [x4]       /* q16 := unshiftrows_rotword_3 table */
 
        str     q1, [x0], #0x10 /* store master key as first round key */
        mov     x2, #10         /* round count */
@@ -136,7 +136,7 @@
 
        /* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
        ld1r    {v4.4s}, [x3], #4
-       tbl     v3.16b, {v3.16b}, v8.16b
+       tbl     v3.16b, {v3.16b}, v16.16b
        eor     v3.16b, v3.16b, v4.16b
 
        /*
@@ -175,8 +175,8 @@
        adrl    x4, unshiftrows_rotword_1
        adrl    x5, unshiftrows_rotword_3
        eor     v0.16b, v0.16b, v0.16b  /* q0 := 0 */
-       ldr     q8, [x4]        /* q8 := unshiftrows_rotword_1 */
-       ldr     q9, [x5]        /* q9 := unshiftrows_rotword_3 */
+       ldr     q16, [x4]       /* q16 := unshiftrows_rotword_1 */
+       ldr     q17, [x5]       /* q17 := unshiftrows_rotword_3 */
 
        str     q1, [x0], #0x10 /* store master key[0:128) as round key */
        mov     x2, #12         /* round count */
@@ -197,7 +197,7 @@
 
        /* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */
        ld1r    {v4.4s}, [x3], #4
-       tbl     v3.16b, {v3.16b}, v8.16b
+       tbl     v3.16b, {v3.16b}, v16.16b
        eor     v3.16b, v3.16b, v4.16b
 
        /*
@@ -269,8 +269,8 @@
         *      q2 = rk
         *      q3 = nrk
         *      v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
-        *      q8 = unshiftrows_rotword_1
-        *      q9 = unshiftrows_rotword_3
+        *      q16 = unshiftrows_rotword_1
+        *      q17 = unshiftrows_rotword_3
         *
         * We have to compute, in q1:
         *
@@ -294,7 +294,7 @@
 
        /* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */
        ld1r    {v4.4s}, [x3], #4
-       tbl     v1.16b, {v1.16b}, v9.16b
+       tbl     v1.16b, {v1.16b}, v17.16b
        eor     v1.16b, v1.16b, v4.16b
 
        /*
@@ -354,8 +354,8 @@
        adrl    x4, unshiftrows_rotword_3
        adrl    x5, unshiftrows_3
        eor     v0.16b, v0.16b, v0.16b  /* q0 := 0 */
-       ldr     q8, [x4]        /* q8 := unshiftrows_rotword_3 */
-       ldr     q9, [x5]        /* q9 := unshiftrows_3 */
+       ldr     q16, [x4]       /* q16 := unshiftrows_rotword_3 */
+       ldr     q17, [x5]       /* q17 := unshiftrows_3 */
 
        /* store master key as first two round keys */
        stp     q1, q2, [x0], #0x20
@@ -376,7 +376,7 @@
 
        /* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
        ld1r    {v4.4s}, [x3], #4
-       tbl     v3.16b, {v3.16b}, v8.16b
+       tbl     v3.16b, {v3.16b}, v16.16b
        eor     v3.16b, v3.16b, v4.16b
 
        /*
@@ -402,7 +402,7 @@
        aese    v3.16b, v0.16b
 
        /* v3.4s[i] := SubBytes(rk[3]) */
-       tbl     v3.16b, {v3.16b}, v9.16b
+       tbl     v3.16b, {v3.16b}, v17.16b
 
        /*
         * v5.4s := (0,prk[0],prk[1],prk[2])
@@ -458,9 +458,9 @@
 ENTRY(aesarmv8_enc)
        stp     fp, lr, [sp, #-16]!     /* push stack frame */
        mov     fp, sp
-       ldr     q0, [x1]        /* q0 := block */
-       bl      aesarmv8_enc1
-       str     q0, [x2]        /* store block */
+       ldr     q0, [x1]        /* q0 := ptxt */
+       bl      aesarmv8_enc1   /* q0 := ctxt; trash x0/x3/q16 */
+       str     q0, [x2]        /* store ctxt */
        ldp     fp, lr, [sp], #16       /* pop stack frame */
        ret
 END(aesarmv8_enc)
@@ -476,9 +476,9 @@
 ENTRY(aesarmv8_dec)
        stp     fp, lr, [sp, #-16]!     /* push stack frame */
        mov     fp, sp
-       ldr     q0, [x1]        /* q0 := block */
-       bl      aesarmv8_dec1
-       str     q0, [x2]        /* store block */
+       ldr     q0, [x1]        /* q0 := ctxt */
+       bl      aesarmv8_dec1   /* q0 := ptxt; trash x0/x3/q16 */
+       str     q0, [x2]        /* store ptxt */
        ldp     fp, lr, [sp], #16       /* pop stack frame */
        ret
 END(aesarmv8_dec)
@@ -505,7 +505,7 @@
        eor     v0.16b, v0.16b, v1.16b  /* q0 := cv ^ ptxt */
        mov     x0, x9                  /* x0 := enckey */
        mov     x3, x5                  /* x3 := nrounds */
-       bl      aesarmv8_enc1           /* q0 := ciphertext block */
+       bl      aesarmv8_enc1           /* q0 := ctxt; trash x0/x3/q16 */
        subs    x10, x10, #0x10         /* count down nbytes */
        str     q0, [x2], #0x10         /* store ciphertext block */
        b.ne    1b                      /* repeat if x10 is nonzero */
@@ -527,10 +527,9 @@
  *     Standard ABI calling convention.
  */
 ENTRY(aesarmv8_cbc_dec1)
-       stp     fp, lr, [sp, #-32]!     /* push stack frame with uint128 */
+       stp     fp, lr, [sp, #-16]!     /* push stack frame */
        mov     fp, sp
-       ldr     q8, [x4]                /* q8 := iv */
-       str     q8, [sp, #16]           /* save iv */
+       ldr     q24, [x4]               /* q24 := iv */
        mov     x9, x0                  /* x9 := enckey */
        mov     x10, x3                 /* x10 := nbytes */
        add     x1, x1, x3              /* x1 := pointer past end of in */
@@ -539,18 +538,17 @@
        str     q0, [x4]                /* update iv */
 1:     mov     x0, x9                  /* x0 := enckey */
        mov     x3, x5                  /* x3 := nrounds */
-       bl      aesarmv8_dec1           /* q0 := cv ^ ptxt; trash x0/x3 */
+       bl      aesarmv8_dec1           /* q0 := cv ^ ptxt; trash x0/x3/q16 */
        subs    x10, x10, #0x10         /* count down nbytes */
        b.eq    2f                      /* stop if this is the first block */
-       ldr     q8, [x1, #-0x10]!       /* q8 := chaining value */
-       eor     v0.16b, v0.16b, v8.16b  /* q0 := plaintext block */
+       ldr     q31, [x1, #-0x10]!      /* q31 := chaining value */
+       eor     v0.16b, v0.16b, v31.16b /* q0 := plaintext block */
        str     q0, [x2, #-0x10]!       /* store plaintext block */
-       mov     v0.16b, v8.16b          /* move cv = ciphertext block */
+       mov     v0.16b, v31.16b         /* move cv = ciphertext block */
        b       1b
-2:     ldr     q8, [sp, #16]           /* q8 := iv */
-       eor     v0.16b, v0.16b, v8.16b  /* q0 := first plaintext block */
+2:     eor     v0.16b, v0.16b, v24.16b /* q0 := first plaintext block */
        str     q0, [x2, #-0x10]!       /* store first plaintext block */
-       ldp     fp, lr, [sp], #32       /* pop stack frame */
+       ldp     fp, lr, [sp], #16       /* pop stack frame */
        ret
 END(aesarmv8_cbc_dec1)
 
@@ -566,10 +564,9 @@
  *     Standard ABI calling convention.
  */
 ENTRY(aesarmv8_cbc_dec8)
-       stp     fp, lr, [sp, #-32]!     /* push stack frame with uint128 */
+       stp     fp, lr, [sp, #-16]!     /* push stack frame */
        mov     fp, sp
-       ldr     q8, [x4]                /* q8 := iv */
-       str     q8, [sp, #16]           /* save iv */
+       ldr     q24, [x4]               /* q24 := iv */
        mov     x9, x0                  /* x9 := enckey */
        mov     x10, x3                 /* x10 := nbytes */
        add     x1, x1, x3              /* x1 := pointer past end of in */
@@ -579,23 +576,24 @@
 1:     ldp     q4, q5, [x1, #-0x20]!
        ldp     q2, q3, [x1, #-0x20]!
        ldp     q0, q1, [x1, #-0x20]!
-       mov     v15.16b, v6.16b         /* q[8+i] := cv[i], 0<i<8 */
-       mov     v14.16b, v5.16b
-       mov     v13.16b, v4.16b
-       mov     v12.16b, v3.16b
-       mov     v11.16b, v2.16b
-       mov     v10.16b, v1.16b
-       mov     v9.16b, v0.16b
+       mov     v31.16b, v6.16b         /* q[24+i] := cv[i], 0<i<8 */
+       mov     v30.16b, v5.16b
+       mov     v29.16b, v4.16b
+       mov     v28.16b, v3.16b
+       mov     v27.16b, v2.16b
+       mov     v26.16b, v1.16b
+       mov     v25.16b, v0.16b
        mov     x0, x9                  /* x0 := enckey */
        mov     x3, x5                  /* x3 := nrounds */
-       bl      aesarmv8_dec8           /* q[i] := cv[i] ^ pt[i] */
-       eor     v7.16b, v7.16b, v15.16b /* q[i] := pt[i] */
-       eor     v6.16b, v6.16b, v14.16b
-       eor     v5.16b, v5.16b, v13.16b
-       eor     v4.16b, v4.16b, v12.16b
-       eor     v3.16b, v3.16b, v11.16b
-       eor     v2.16b, v2.16b, v10.16b
-       eor     v1.16b, v1.16b, v9.16b
+       bl      aesarmv8_dec8           /* q[i] := cv[i] ^ pt[i];
+                                        * trash x0/x3/q16 */
+       eor     v7.16b, v7.16b, v31.16b /* q[i] := pt[i] */
+       eor     v6.16b, v6.16b, v30.16b
+       eor     v5.16b, v5.16b, v29.16b
+       eor     v4.16b, v4.16b, v28.16b
+       eor     v3.16b, v3.16b, v27.16b
+       eor     v2.16b, v2.16b, v26.16b
+       eor     v1.16b, v1.16b, v25.16b
        subs    x10, x10, #0x80         /* count down nbytes */
        stp     q6, q7, [x2, #-0x20]!   /* store plaintext blocks */
        stp     q4, q5, [x2, #-0x20]!
@@ -605,10 +603,9 @@
        eor     v0.16b, v0.16b, v7.16b  /* q0 := pt0 */
        stp     q0, q1, [x2, #-0x20]!
        b       1b
-2:     ldr     q8, [sp, #16]           /* q8 := iv */
-       eor     v0.16b, v0.16b, v8.16b  /* q0 := pt0 */
+2:     eor     v0.16b, v0.16b, v24.16b /* q0 := pt0 */
        stp     q0, q1, [x2, #-0x20]!   /* store first two plaintext blocks */
-       ldp     fp, lr, [sp], #32       /* pop stack frame */
+       ldp     fp, lr, [sp], #16       /* pop stack frame */
        ret
 END(aesarmv8_cbc_dec8)
 
@@ -629,18 +626,18 @@
        mov     fp, sp
        mov     x9, x0                  /* x9 := enckey */
        mov     x10, x3                 /* x10 := nbytes */
-       ldr     q9, [x4]                /* q9 := tweak */
+       ldr     q31, [x4]               /* q31 := tweak */
 1:     ldr     q0, [x1], #0x10         /* q0 := ptxt */
        mov     x0, x9                  /* x0 := enckey */
        mov     x3, x5                  /* x3 := nrounds */
-       eor     v0.16b, v0.16b, v9.16b  /* q0 := ptxt ^ tweak */
-       bl      aesarmv8_enc1           /* q0 := AES(ptxt ^ tweak) */
-       eor     v0.16b, v0.16b, v9.16b  /* q0 := AES(ptxt ^ tweak) ^ tweak */
+       eor     v0.16b, v0.16b, v31.16b /* q0 := ptxt ^ tweak */
+       bl      aesarmv8_enc1           /* q0 := AES(...); trash x0/x3/q16 */
+       eor     v0.16b, v0.16b, v31.16b /* q0 := AES(ptxt ^ tweak) ^ tweak */
        str     q0, [x2], #0x10         /* store ciphertext block */
-       bl      aesarmv8_xts_mulx       /* q9 *= x; trash x0/q0/q1 */
+       bl      aesarmv8_xts_mulx       /* q31 *= x; trash x0/q0/q1 */
        subs    x10, x10, #0x10         /* count down nbytes */
        b.ne    1b                      /* repeat if more blocks */
-       str     q9, [x4]                /* update tweak */
+       str     q31, [x4]               /* update tweak */
        ldp     fp, lr, [sp], #16       /* pop stack frame */
        ret
 END(aesarmv8_xts_enc1)
@@ -657,61 +654,58 @@
  *     Standard ABI calling convention.
  */
 ENTRY(aesarmv8_xts_enc8)
-       stp     fp, lr, [sp, #-48]!     /* push stack frame uint128[2] */
+       stp     fp, lr, [sp, #-16]!     /* push stack frame */
        mov     fp, sp
        mov     x9, x0                  /* x9 := enckey */
        mov     x10, x3                 /* x10 := nbytes */
-       ldr     q9, [x4]                /* q9 := tweak */
-1:     str     q9, [sp, #16]           /* save tweak[0] */
-       bl      aesarmv8_xts_mulx       /* q9 *= x; trash x0/q0/q1 */
-       str     q9, [sp, #32]           /* save tweak[1] */
-       bl      aesarmv8_xts_mulx       /* q9 *= x; trash x0/q0/q1 */
-       mov     v10.16b, v9.16b         /* q10 := tweak[2] */
-       bl      aesarmv8_xts_mulx       /* q9 *= x; trash x0/q0/q1 */
-       mov     v11.16b, v9.16b         /* q11 := tweak[3] */
-       bl      aesarmv8_xts_mulx       /* q9 *= x; trash x0/q0/q1 */
-       mov     v12.16b, v9.16b         /* q11 := tweak[4] */
-       bl      aesarmv8_xts_mulx       /* q9 *= x; trash x0/q0/q1 */
-       mov     v13.16b, v9.16b         /* q11 := tweak[5] */
-       bl      aesarmv8_xts_mulx       /* q9 *= x; trash x0/q0/q1 */
-       mov     v14.16b, v9.16b         /* q11 := tweak[6] */
-       bl      aesarmv8_xts_mulx       /* q9 *= x; trash x0/q0/q1 */
-       mov     v15.16b, v9.16b         /* q11 := tweak[7] */
-       ldp     q8, q9, [sp, #16]       /* q8 := tweak[0], q9 := tweak[1] */
-       ldp     q0, q1, [x1], #0x20     /* q[i] := pt[i] */
+       ldr     q31, [x4]               /* q31 := tweak */
+1:     mov     v24.16b, v31.16b        /* q24 := tweak[0] */
+       bl      aesarmv8_xts_mulx       /* q31 *= x; trash x0/q0/q1 */
+       mov     v25.16b, v31.16b        /* q25 := tweak[1] */
+       bl      aesarmv8_xts_mulx       /* q31 *= x; trash x0/q0/q1 */
+       mov     v26.16b, v31.16b        /* q26 := tweak[2] */
+       bl      aesarmv8_xts_mulx       /* q31 *= x; trash x0/q0/q1 */
+       mov     v27.16b, v31.16b        /* q27 := tweak[3] */
+       bl      aesarmv8_xts_mulx       /* q31 *= x; trash x0/q0/q1 */
+       mov     v28.16b, v31.16b        /* q28 := tweak[4] */
+       bl      aesarmv8_xts_mulx       /* q31 *= x; trash x0/q0/q1 */
+       mov     v29.16b, v31.16b        /* q29 := tweak[5] */
+       bl      aesarmv8_xts_mulx       /* q31 *= x; trash x0/q0/q1 */



Home | Main Index | Thread Index | Old Index