Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/sys/crypto/aes/arch/arm aes neon: Issue 256-bit loads rather than pairs of 128-bit loads.



details:   https://anonhg.NetBSD.org/src/rev/3b140607dcac
branches:  trunk
changeset: 1014013:3b140607dcac
user:      riastradh <riastradh%NetBSD.org@localhost>
date:      Thu Sep 10 11:29:02 2020 +0000

description:
aes neon: Issue 256-bit loads rather than pairs of 128-bit loads.

Not sure why I didn't realize you could do this before!

This frees up some temporary registers, which can now be put to other
use to shave off a few cycles.

diffstat:

 sys/crypto/aes/arch/arm/aes_neon_32.S |  247 ++++++++++++---------------------
 1 files changed, 93 insertions(+), 154 deletions(-)

diffs (truncated from 389 to 300 lines):

diff -r 256116917028 -r 3b140607dcac sys/crypto/aes/arch/arm/aes_neon_32.S
--- a/sys/crypto/aes/arch/arm/aes_neon_32.S     Thu Sep 10 06:02:30 2020 +0000
+++ b/sys/crypto/aes/arch/arm/aes_neon_32.S     Thu Sep 10 11:29:02 2020 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: aes_neon_32.S,v 1.6 2020/08/16 18:02:03 riastradh Exp $        */
+/*     $NetBSD: aes_neon_32.S,v 1.7 2020/09/10 11:29:02 riastradh Exp $        */
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -28,7 +28,7 @@
 
 #include <arm/asm.h>
 
-RCSID("$NetBSD: aes_neon_32.S,v 1.6 2020/08/16 18:02:03 riastradh Exp $")
+RCSID("$NetBSD: aes_neon_32.S,v 1.7 2020/09/10 11:29:02 riastradh Exp $")
 
        .fpu    neon
 
@@ -38,9 +38,10 @@
        .long   .Lconstants - .
 
        .section .rodata
-       .p2align 4
+       .p2align 5
 .Lconstants:
 
+.Linv_inva:    /* inv and inva must be consecutive */
        .type   inv,_ASM_TYPE_OBJECT
 inv:
        .byte   0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E
@@ -99,125 +100,85 @@
        .byte   0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03
 END(sr)
 
-       .type   iptlo,_ASM_TYPE_OBJECT
-iptlo:
-       .byte   0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2
+       .type   ipt,_ASM_TYPE_OBJECT
+ipt:
+       .byte   0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2 /* lo */
        .byte   0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA
-END(iptlo)
-
-       .type   ipthi,_ASM_TYPE_OBJECT
-ipthi:
-       .byte   0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C
+       .byte   0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C /* hi */
        .byte   0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD
-END(ipthi)
+END(ipt)
 
-       .type   sb1_0,_ASM_TYPE_OBJECT
-sb1_0:
-       .byte   0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1
+       .type   sb1,_ASM_TYPE_OBJECT
+sb1:
+       .byte   0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1 /* 0 */
        .byte   0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5
-END(sb1_0)
-
-       .type   sb1_1,_ASM_TYPE_OBJECT
-sb1_1:
-       .byte   0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36
+       .byte   0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36 /* 1 */
        .byte   0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B
-END(sb1_1)
-
-       .type   sb2_0,_ASM_TYPE_OBJECT
-sb2_0:
-       .byte   0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2
-       .byte   0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E
-END(sb2_0)
+END(sb1)
 
-       .type   sb2_1,_ASM_TYPE_OBJECT
-sb2_1:
-       .byte   0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69
+       .type   sb2,_ASM_TYPE_OBJECT
+sb2:
+       .byte   0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2 /* 0 */
+       .byte   0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E
+       .byte   0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69 /* 1 */
        .byte   0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2
-END(sb2_1)
-
-       .type   sbo_0,_ASM_TYPE_OBJECT
-sbo_0:
-       .byte   0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0
-       .byte   0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15
-END(sbo_0)
+END(sb2)
 
-       .type   sbo_1,_ASM_TYPE_OBJECT
-sbo_1:
-       .byte   0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF
+       .type   sbo,_ASM_TYPE_OBJECT
+sbo:
+       .byte   0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0 /* 0 */
+       .byte   0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15
+       .byte   0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF /* 1 */
        .byte   0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E
-END(sbo_1)
+END(sbo)
 
-       .type   diptlo,_ASM_TYPE_OBJECT
-diptlo:
-       .byte   0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F
+       .type   dipt,_ASM_TYPE_OBJECT
+dipt:
+       .byte   0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F /* lo */
        .byte   0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15
-END(diptlo)
-
-       .type   dipthi,_ASM_TYPE_OBJECT
-dipthi:
-       .byte   0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86
+       .byte   0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86 /* hi */
        .byte   0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12
-END(dipthi)
+END(dipt)
 
-       .type   dsb9_0,_ASM_TYPE_OBJECT
-dsb9_0:
-       .byte   0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85
+       .type   dsb9,_ASM_TYPE_OBJECT
+dsb9:
+       .byte   0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85 /* 0 */
        .byte   0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA
-END(dsb9_0)
-
-       .type   dsb9_1,_ASM_TYPE_OBJECT
-dsb9_1:
-       .byte   0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0
+       .byte   0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0 /* 1 */
        .byte   0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72
-END(dsb9_1)
+END(dsb9)
 
-       .type   dsbd_0,_ASM_TYPE_OBJECT
-dsbd_0:
-       .byte   0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D
+       .type   dsbd,_ASM_TYPE_OBJECT
+dsbd:
+       .byte   0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D /* 0 */
        .byte   0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5
-END(dsbd_0)
-
-       .type   dsbd_1,_ASM_TYPE_OBJECT
-dsbd_1:
-       .byte   0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C
+       .byte   0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C /* 1 */
        .byte   0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29
-END(dsbd_1)
-
-       .type   dsbb_0,_ASM_TYPE_OBJECT
-dsbb_0:
-       .byte   0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0
-       .byte   0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60
-END(dsbb_0)
+END(dsbd)
 
-       .type   dsbb_1,_ASM_TYPE_OBJECT
-dsbb_1:
-       .byte   0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1
+       .type   dsbb,_ASM_TYPE_OBJECT
+dsbb:
+       .byte   0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0 /* 0 */
+       .byte   0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60
+       .byte   0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1 /* 1 */
        .byte   0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3
-END(dsbb_1)
-
-       .type   dsbe_0,_ASM_TYPE_OBJECT
-dsbe_0:
-       .byte   0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46
-       .byte   0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22
-END(dsbe_0)
+END(dsbb)
 
-       .type   dsbe_1,_ASM_TYPE_OBJECT
-dsbe_1:
-       .byte   0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C
+       .type   dsbe,_ASM_TYPE_OBJECT
+dsbe:
+       .byte   0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46 /* 0 */
+       .byte   0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22
+       .byte   0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C /* 1 */
        .byte   0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94
-END(dsbe_1)
+END(dsbe)
 
-       .type   dsbo_0,_ASM_TYPE_OBJECT
-dsbo_0:
-       .byte   0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13
+       .type   dsbo,_ASM_TYPE_OBJECT
+dsbo:
+       .byte   0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13 /* 0 */
        .byte   0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7
-END(dsbo_0)
-
-       .type   dsbo_1,_ASM_TYPE_OBJECT
-dsbo_1:
-       .byte   0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12
+       .byte   0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12 /* 1 */
        .byte   0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA
-END(dsbo_1)
+END(dsbo)
 
 /*
  * aes_neon_enc1(enc, x, nrounds)
@@ -274,7 +235,7 @@
        ldr     r12, .Lconstants_addr
        adr     r11, .Lconstants_addr
 
-       vld1.8  {d28-d29}, [r0 :128]!   /* q14 = *rk++ */
+       vld1.8  {q14}, [r0 :128]!       /* q14 = *rk++ */
        movw    r3, #0
        vmov.i8 q1, #0x0f
 
@@ -282,24 +243,16 @@
        add     r12, r12, r11
 
        /* (q4, q5) := (iptlo, ipthi) */
-       add     r6, r12, #(iptlo - .Lconstants)
-       add     r7, r12, #(ipthi - .Lconstants)
-       vld1.8  {d8-d9}, [r6 :128]
-       vld1.8  {d10-d11}, [r7 :128]
+       add     r6, r12, #(ipt - .Lconstants)
+       vld1.8  {q4-q5}, [r6 :256]
 
        /* load the rest of the constants */
-       add     r4, r12, #(sb1_0 - .Lconstants)
-       add     r5, r12, #(sb1_1 - .Lconstants)
-       add     r6, r12, #(sb2_0 - .Lconstants)
-       add     r7, r12, #(sb2_1 - .Lconstants)
-       add     r8, r12, #(inv - .Lconstants)
-       add     r10, r12, #(inva - .Lconstants)
-       vld1.8  {d12-d13}, [r4 :128]    /* q6 = sb1[0] */
-       vld1.8  {d14-d15}, [r5 :128]    /* q7 = sb1[1] */
-       vld1.8  {d16-d17}, [r6 :128]    /* q8 = sb2[0] */
-       vld1.8  {d18-d19}, [r7 :128]    /* q9 = sb2[1] */
-       vld1.8  {d20-d21}, [r8 :128]    /* q10 = inv */
-       vld1.8  {d22-d23}, [r10 :128]   /* q11 = inva */
+       add     r4, r12, #(sb1 - .Lconstants)
+       add     r6, r12, #(sb2 - .Lconstants)
+       add     r8, r12, #(.Linv_inva - .Lconstants)
+       vld1.8  {q6-q7}, [r4 :256]      /* q6 = sb1[0], q7 = sb1[1] */
+       vld1.8  {q8-q9}, [r6 :256]      /* q8 = sb2[0], q9 = sb2[1] */
+       vld1.8  {q10-q11}, [r8 :256]    /* q10 = inv, q11 = inva */
 
        /* (r4, r5) := (&mc_forward[0], &mc_backward[0]) */
        add     r4, r12, #(mc_forward - .Lconstants)
@@ -323,7 +276,7 @@
        b       2f
 
        _ALIGN_TEXT
-1:     vld1.8  {d28-d29}, [r0 :128]!   /* q14 = *rk++ */
+1:     vld1.8  {q14}, [r0 :128]!       /* q14 = *rk++ */
 
        /* q0 := A = rk[i] + sb1_0(io) + sb1_1(jo) */
        vtbl.8  d24, {d12-d13}, d4
@@ -343,8 +296,8 @@
        /* (q12, q13) := (mc_forward[rmod4], mc_backward[rmod4]) */
        add     r6, r4, r3, lsl #4
        add     r7, r5, r3, lsl #4
-       vld1.8  {d24-d25}, [r6]
-       vld1.8  {d26-d27}, [r7]
+       vld1.8  {q12}, [r6 :128]
+       vld1.8  {q13}, [r7 :128]
 
        /* q15 := A2_B = A2 + A(mcf) */
        vtbl.8  d30, {d0-d1}, d24
@@ -413,14 +366,12 @@
 
        /* (q6, q7, q15) := (sbo[0], sbo[1], sr[rmod4]) */
        add     r8, r12, #(sr - .Lconstants)
-       add     r6, r12, #(sbo_0 - .Lconstants)
-       add     r7, r12, #(sbo_1 - .Lconstants)
+       add     r6, r12, #(sbo - .Lconstants)
        add     r8, r8, r3, lsl #4
-       vld1.8  {d12-d13}, [r6 :128]
-       vld1.8  {d14-d15}, [r7 :128]
-       vld1.8  {d30-d31}, [r8 :128]
+       vld1.8  {q6-q7}, [r6 :256]
+       vld1.8  {q15}, [r8 :128]
 
-       vld1.8  {d28-d29}, [r0 :128]!   /* q14 = *rk++ */
+       vld1.8  {q14}, [r0 :128]!       /* q14 = *rk++ */
 
        /* (q2, q3) := (sbo_0(io), sbo_1(jo)) */
        vtbl.8  d4, {d12-d13}, d4
@@ -502,7 +453,7 @@
        ldr     r12, .Lconstants_addr
        adr     r11, .Lconstants_addr
 
-       vld1.8  {d28-d29}, [r0 :128]!   /* q14 = *rk++ */
+       vld1.8  {q14}, [r0 :128]!       /* q14 = *rk++ */
        rsb     r3, r1, #0              /* r3 := ~(x - 1) = -x */
        vmov.i8 q1, #0x0f
        and     r3, r3, #3              /* r3 := 3 & ~(x - 1) */



Home | Main Index | Thread Index | Old Index