[src/trunk]: src/sys/crypto/chacha/arch/arm Issue three more swaps to save ei...

To: source-changes-hg%NetBSD.org@localhost
Subject: [src/trunk]: src/sys/crypto/chacha/arch/arm Issue three more swaps to save ei...
From: riastradh <riastradh%NetBSD.org@localhost>
Date: Sat, 01 May 2021 04:38:34 +0000
details:   https://anonhg.NetBSD.org/src/rev/18dc63745187
branches:  trunk
changeset: 974373:18dc63745187
user:      riastradh <riastradh%NetBSD.org@localhost>
date:      Wed Jul 29 14:23:59 2020 +0000

description:
Issue three more swaps to save eight stores.

Reduces code size and yields a small (~2%) cgd throughput boost.

Remove duplicate comment while here.

diffstat:

 sys/crypto/chacha/arch/arm/chacha_neon_32.S |  167 +++++++++------------------
 1 files changed, 58 insertions(+), 109 deletions(-)

diffs (297 lines):

diff -r 254476381bb4 -r 18dc63745187 sys/crypto/chacha/arch/arm/chacha_neon_32.S
--- a/sys/crypto/chacha/arch/arm/chacha_neon_32.S       Wed Jul 29 13:03:36 2020 +0000
+++ b/sys/crypto/chacha/arch/arm/chacha_neon_32.S       Wed Jul 29 14:23:59 2020 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $     */
+/*     $NetBSD: chacha_neon_32.S,v 1.2 2020/07/29 14:23:59 riastradh Exp $     */
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -28,7 +28,7 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $")
+RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2020/07/29 14:23:59 riastradh Exp $")
 
        .fpu    neon
 
@@ -305,21 +305,29 @@
         *      q7 = (x3[4], x3[5]; x3[6], x3[7])
         *
         * The first two rows to write out are q0 = x0[0:4) and q4 =
-        * x0[4:8).  If we first swap q1 and q4, then once we've
-        * written them out we free up consecutive registers q0-q1 for
-        * store-multiple.
+        * x0[4:8).  Swapping q1<->q4, q3<->q6, q9<->q12, and q11<->q14
+        * enables us to issue all stores in consecutive pairs:
+        *      x0 in q0-q1
+        *      x1 in q8-q9
+        *      x2 in q2-q3
+        *      x3 in q10-q11
+        *      x4 in q4-q5
+        *      x5 in q12-q13
+        *      x6 in q6-q7
+        *      x7 in q14-q15
         */
 
        vswp    q1, q4
+       vswp    q3, q6
 
        vadd.u32 q0, q0, q9
        vadd.u32 q4, q4, q9
        vadd.u32 q2, q2, q9
-       vadd.u32 q3, q3, q9
+       vadd.u32 q6, q6, q9
 
        vadd.u32 q1, q1, q8
        vadd.u32 q5, q5, q8
-       vadd.u32 q6, q6, q8
+       vadd.u32 q3, q3, q8
        vadd.u32 q7, q7, q8
 
        vld1.32 {q8-q9}, [fp, :256]     /* restore q8-q9 */
@@ -349,14 +357,17 @@
        vswp    d19, d22
        vswp    d27, d30
 
+       vswp    q9, q12
+       vswp    q11, q14
+
        vadd.u32 q8, q8, q0
-       vadd.u32 q9, q9, q0
+       vadd.u32 q12, q12, q0
        vadd.u32 q10, q10, q0
-       vadd.u32 q11, q11, q0
+       vadd.u32 q14, q14, q0
 
-       vadd.u32 q12, q12, q1
+       vadd.u32 q9, q9, q1
        vadd.u32 q13, q13, q1
-       vadd.u32 q14, q14, q1
+       vadd.u32 q11, q11, q1
        vadd.u32 q15, q15, q1
 
        LE32TOH(q8)
@@ -368,28 +379,18 @@
        LE32TOH(q14)
        LE32TOH(q15)
 
-       /* prepare to zero temporary space on stack */
+       /* vst1.32      {q0-q1}, [r0]! */
+       vst1.32 {q8-q9}, [r0]!
+       vst1.32 {q2-q3}, [r0]!
+       vst1.32 {q10-q11}, [r0]!
+       vst1.32 {q4-q5}, [r0]!
+       vst1.32 {q12-q13}, [r0]!
+       vst1.32 {q6-q7}, [r0]!
+       vst1.32 {q14-q15}, [r0]
+
+       /* zero temporary space on the stack */
        vmov.i32 q0, #0
        vmov.i32 q1, #0
-
-       /* vst1.32      {q0}, [r0]! */
-       /* vst1.32      {q1}, [r0]! */  /* (was q4 before vswp) */
-       vst1.32 {q8}, [r0]!
-       vst1.32 {q12}, [r0]!
-       vst1.32 {q2}, [r0]!
-       vst1.32 {q6}, [r0]!
-       vst1.32 {q10}, [r0]!
-       vst1.32 {q14}, [r0]!
-       vst1.32 {q4}, [r0]!     /* (was q1 before vswp) */
-       vst1.32 {q5}, [r0]!
-       vst1.32 {q9}, [r0]!
-       vst1.32 {q13}, [r0]!
-       vst1.32 {q3}, [r0]!
-       vst1.32 {q7}, [r0]!
-       vst1.32 {q11}, [r0]!
-       vst1.32 {q15}, [r0]
-
-       /* zero temporary space on the stack */
        vst1.8  {q0-q1}, [fp, :256]
 
        /* restore callee-saves registers and stack */
@@ -481,42 +482,8 @@
         * in only 16 registers, compute p[i] ^ (y[i] + x[i]) for i in
         * {0,1,2,...,15}.  The twist is that the p[i] and the y[i] are
         * transposed from one another, and the x[i] are in general
-        * registers and memory.  So we have:
-        *
-        *      q0 = (x0[0], x1[0]; x2[0], x3[0])
-        *      q1 = (x0[1], x1[1]; x2[1], x3[1])
-        *      q2 = (x0[2], x1[2]; x2[2], x3[2])
-        *      q3 = (x0[3], x1[3]; x2[3], x3[3])
-        *      ...
-        *      q15 = (x0[15], x1[15]; x2[15], x3[15])
-        *
-        * where xi[j] is the jth word of the ith 16-word block.  Zip
-        * consecutive pairs with vzip.32, and you get:
-        *
-        *      q0 = (x0[0], x0[1]; x1[0], x1[1])
-        *      q1 = (x2[0], x2[1]; x3[0], x3[1])
-        *      q2 = (x0[2], x0[3]; x1[2], x1[3])
-        *      q3 = (x2[2], x2[3]; x3[2], x3[3])
-        *      ...
-        *      q15 = (x2[14], x2[15]; x3[14], x3[15])
-        *
-        * As 64-bit d registers, this is:
-        *
-        *      d0 = (x0[0], x0[1])     d1 = (x1[0], x1[1])
-        *      d2 = (x2[0], x2[1])     d3 = (x3[0], x3[1])
-        *      d4 = (x0[2], x0[3])     d5 = (x1[2], x1[3])
-        *      d6 = (x2[2], x2[3])     d7 = (x3[2], x3[3])
-        *      ...
-        *      d30 = (x2[14], x2[15])  d31 = (x3[14], x3[15])
-        *
-        * Swap d1<->d4, d3<->d6, ..., and you get:
-        *
-        *      q0 = (x0[0], x0[1]; x0[2], x0[3])
-        *      q1 = (x2[0], x2[1]; x2[2], x2[3])
-        *      q2 = (x1[0], x1[1]; x1[2], x1[3])
-        *      q3 = (x3[0], x3[1]; x3[2], x3[3])
-        *      ...
-        *      q15 = (x15[0], x15[1]; x15[2], x15[3])
+        * registers and memory.  See comments in chacha_stream256_neon
+        * for the layout with swaps.
         */
 
        sub     r7, r7, #0x10
@@ -533,39 +500,22 @@
        vadd.u32 q12, q12, q8   /* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
        vld1.32 {q8}, [r4]!     /* q8 := key[0:16) */
 
-       vswp    d1, d4
+       vswp    d3, d6
        vswp    d9, d12
-       vswp    d3, d6
+       vswp    d1, d4
        vswp    d11, d14
 
-       /*
-        * At this point, the blocks are:
-        *
-        *      q0 = (x0[0], x0[1]; x0[2], x0[3])
-        *      q1 = (x2[0], x2[1]; x2[2], x2[3])
-        *      q2 = (x1[0], x1[1]; x1[2], x1[3])
-        *      q3 = (x3[0], x3[1]; x3[2], x3[3])
-        *      q4 = (x0[4], x0[5]; x0[6], x0[7])
-        *      q5 = (x2[4], x2[5]; x2[6], x2[7])
-        *      q6 = (x1[4], x1[5]; x1[6], x1[7])
-        *      q7 = (x3[4], x3[5]; x3[6], x3[7])
-        *
-        * The first two rows to write out are q0 = x0[0:4) and q4 =
-        * x0[4:8).  If we first swap q1 and q4, then once we've
-        * written them out we free up consecutive registers q0-q1 for
-        * store-multiple.
-        */
-
        vswp    q1, q4
+       vswp    q3, q6
 
        vadd.u32 q0, q0, q9
        vadd.u32 q4, q4, q9
        vadd.u32 q2, q2, q9
-       vadd.u32 q3, q3, q9
+       vadd.u32 q6, q6, q9
 
        vadd.u32 q1, q1, q8
        vadd.u32 q5, q5, q8
-       vadd.u32 q6, q6, q8
+       vadd.u32 q3, q3, q8
        vadd.u32 q7, q7, q8
 
        vld1.32 {q8-q9}, [r1]!  /* load plaintext bytes [0:32) */
@@ -595,21 +545,22 @@
        vzip.32 q12, q13
        vzip.32 q14, q15
 
-       vswp    d17, d20
+       vswp    d19, d22
        vswp    d25, d28
-       vswp    d19, d22
+       vswp    d17, d20
        vswp    d27, d30
 
        vswp    q9, q12         /* free up q9 earlier for consecutive q8-q9 */
+       vswp    q11, q14
 
        vadd.u32 q8, q8, q0
        vadd.u32 q12, q12, q0
        vadd.u32 q10, q10, q0
-       vadd.u32 q11, q11, q0
+       vadd.u32 q14, q14, q0
 
        vadd.u32 q9, q9, q1
        vadd.u32 q13, q13, q1
-       vadd.u32 q14, q14, q1
+       vadd.u32 q11, q11, q1
        vadd.u32 q15, q15, q1
 
        vld1.32 {q0-q1}, [r1]!  /* load plaintext bytes [32:64) */
@@ -617,10 +568,10 @@
        LE32TOH(q8)
        LE32TOH(q9)
        LE32TOH(q10)
-       LE32TOH(q14)
+       LE32TOH(q11)
        LE32TOH(q12)
        LE32TOH(q13)
-       LE32TOH(q11)
+       LE32TOH(q14)
        LE32TOH(q15)
 
        veor    q0, q0, q8      /* compute ciphertext bytes [32:64) */
@@ -631,40 +582,38 @@
        vld1.32 {q0-q1}, [r1]!  /* load plaintext bytes [96:128) */
 
        veor    q2, q2, q8      /* compute ciphertext bytes [64:96) */
-       veor    q6, q6, q9
+       veor    q3, q3, q9
 
        vld1.32 {q8-q9}, [r1]!  /* load plaintext bytes [128:160) */
-       vst1.32 {q2}, [r0]!     /* store ciphertext bytes [64:80) */
+       vst1.32 {q2-q3}, [r0]!  /* store ciphertext bytes [64:96) */
 
        veor    q10, q10, q0    /* compute ciphertext bytes [96:128) */
-       veor    q14, q14, q1
+       veor    q11, q11, q1
 
        vld1.32 {q0-q1}, [r1]!  /* load plaintext bytes [160:192) */
-       vst1.32 {q6}, [r0]!     /* store ciphertext bytes [80:96) */
+       vst1.32 {q10-q11}, [r0]!        /* store ciphertext bytes [96:128) */
 
        veor    q4, q4, q8      /* compute ciphertext bytes [128:160) */
        veor    q5, q5, q9
 
        vld1.32 {q8-q9}, [r1]!  /* load plaintext bytes [192:224) */
-       vst1.32 {q10}, [r0]!    /* store ciphertext bytes [96:112) */
+       vst1.32 {q4-q5}, [r0]!  /* store ciphertext bytes [128:160) */
 
        veor    q12, q12, q0    /* compute ciphertext bytes [160:192) */
        veor    q13, q13, q1
 
        vld1.32 {q0-q1}, [r1]   /* load plaintext bytes [224:256) */
-       vst1.32 {q14}, [r0]!    /* store ciphertext bytes [112:128) */
+       vst1.32 {q12-q13}, [r0]!        /* store ciphertext bytes [160:192) */
 
-       veor    q8, q3, q8      /* compute ciphertext bytes [192:224) */
-       veor    q9, q7, q9
+       veor    q6, q6, q8      /* compute ciphertext bytes [192:224) */
+       veor    q7, q7, q9
 
-       vst1.32 {q4-q5}, [r0]!  /* store ciphertext bytes [128:160) */
-       vst1.32 {q12-q13}, [r0]!        /* store ciphertext bytes [160:192) */
+       vst1.32 {q6-q7}, [r0]!  /* store ciphertext bytes [192:224) */
 
-       veor    q0, q11, q0     /* compute ciphertext bytes [224:256) */
-       veor    q1, q15, q1
+       veor    q14, q14, q0    /* compute ciphertext bytes [224:256) */
+       veor    q15, q15, q1
 
-       vst1.32 {q8-q9}, [r0]!  /* store ciphertext bytes [192:224) */
-       vst1.32 {q0-q1}, [r0]   /* store ciphertext bytes [224:256) */
+       vst1.32 {q14-q15}, [r0] /* store ciphertext bytes [224:256) */
 
        /* zero temporary space on the stack */
        vmov.i32 q0, #0
Prev by Date: [src/trunk]: src/sys/dev/pci fix xmm7360_os_msleep() macro to actually work w...
Next by Date: [src/trunk]: src/usr.bin/make/unit-tests make(1): add unit tests for parse er...
Previous by Thread: [src/trunk]: src/sys/dev/pci fix xmm7360_os_msleep() macro to actually work w...
Next by Thread: [src/trunk]: src/usr.bin/make/unit-tests make(1): add unit tests for parse er...
Indexes:
Home | Main Index | Thread Index | Old Index