Source-Changes-HG archive


[src/trunk]: src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64 re-gen...



details:   https://anonhg.NetBSD.org/src/rev/2892b70f5446
branches:  trunk
changeset: 814441:2892b70f5446
user:      christos <christos%NetBSD.org@localhost>
date:      Sun Mar 20 22:18:43 2016 +0000

description:
Re-generate to fix SHA-1. There were also improvements to the Montgomery
multiplication code from a previous change that we did not have.
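
For context, here is a minimal C sketch of the Montgomery multiplication
primitive that x86_64-mont5.S implements for multi-limb operands. It is an
illustration only, not the OpenSSL code; the modulus, the helper names and
the n < 2^63 restriction are assumptions chosen to keep the example short.

/*
 * Illustration only (not the OpenSSL code): Montgomery multiplication on a
 * single 64-bit limb.  x86_64-mont5.S performs the same computation on
 * multi-limb operands.  Computes a*b*R^-1 mod n with R = 2^64; n must be
 * odd, and here n < 2^63 so the intermediate sum fits in 128 bits.
 */
#include <stdint.h>
#include <stdio.h>

/* n0 = -n^-1 mod 2^64 (the precomputed "n0" value the assembly receives). */
static uint64_t
mont_n0(uint64_t n)
{
	uint64_t x = n;			/* inverse of n mod 8 for odd n */
	for (int i = 0; i < 5; i++)
		x *= 2 - n * x;		/* each Newton step doubles the precision */
	return 0 - x;
}

/* One Montgomery multiplication: a*b*2^-64 mod n, for a, b < n. */
static uint64_t
mont_mul(uint64_t a, uint64_t b, uint64_t n, uint64_t n0)
{
	unsigned __int128 t = (unsigned __int128)a * b;
	uint64_t m = (uint64_t)t * n0;			/* t*(-n^-1) mod 2^64 */
	unsigned __int128 u = t + (unsigned __int128)m * n; /* low limb is now 0 */
	uint64_t r = (uint64_t)(u >> 64);		/* divide by R */
	return r >= n ? r - n : r;			/* one final subtraction */
}

int
main(void)
{
	uint64_t n  = 0x7fffffffffffffe7ULL;	/* arbitrary odd modulus < 2^63 */
	uint64_t n0 = mont_n0(n);
	uint64_t r1 = (0 - n) % n;		/* 2^64 mod n */
	uint64_t r2 = (uint64_t)(((unsigned __int128)r1 * r1) % n); /* 2^128 mod n */

	uint64_t x = 123456789, y = 987654321;
	uint64_t xm = mont_mul(x, r2, n, n0);	/* into Montgomery form: x*R mod n */
	uint64_t ym = mont_mul(y, r2, n, n0);	/* y*R mod n */
	uint64_t zm = mont_mul(xm, ym, n, n0);	/* x*y*R mod n */
	uint64_t z  = mont_mul(zm, 1, n, n0);	/* back out: x*y mod n */

	uint64_t want = (uint64_t)(((unsigned __int128)x * y) % n);
	printf("montgomery %llu, direct %llu\n",
	    (unsigned long long)z, (unsigned long long)want);
	return z == want ? 0 : 1;
}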

diffstat:

 crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/sha1-x86_64.S  |    8 +-
 crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont5.S |  677 +++++++--
 2 files changed, 537 insertions(+), 148 deletions(-)

diffs (truncated from 890 to 300 lines):

diff -r fa17161478ec -r 2892b70f5446 crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/sha1-x86_64.S
--- a/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/sha1-x86_64.S       Sun Mar 20 22:17:13 2016 +0000
+++ b/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/sha1-x86_64.S       Sun Mar 20 22:18:43 2016 +0000
@@ -1298,7 +1298,7 @@
        pushq   %rbx
        pushq   %rbp
        pushq   %r12
-       leaq    -64(%rsp),%rsp
+       leaq    -72(%rsp),%rsp
        movq    %rdi,%r8
        movq    %rsi,%r9
        movq    %rdx,%r10
@@ -2470,7 +2470,7 @@
        movl    %ecx,8(%r8)
        movl    %edx,12(%r8)
        movl    %ebp,16(%r8)
-       leaq    64(%rsp),%rsi
+       leaq    72(%rsp),%rsi
        movq    0(%rsi),%r12
        movq    8(%rsi),%rbp
        movq    16(%rsi),%rbx
@@ -2485,7 +2485,7 @@
        pushq   %rbx
        pushq   %rbp
        pushq   %r12
-       leaq    -64(%rsp),%rsp
+       leaq    -72(%rsp),%rsp
        movq    %rdi,%r8
        movq    %rsi,%r9
        movq    %rdx,%r10
@@ -3621,7 +3621,7 @@
        movl    %ecx,8(%r8)
        movl    %edx,12(%r8)
        movl    %ebp,16(%r8)
-       leaq    64(%rsp),%rsi
+       leaq    72(%rsp),%rsi
        movq    0(%rsi),%r12
        movq    8(%rsi),%rbp
        movq    16(%rsi),%rbx
diff -r fa17161478ec -r 2892b70f5446 crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont5.S
--- a/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont5.S      Sun Mar 20 22:17:13 2016 +0000
+++ b/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont5.S      Sun Mar 20 22:18:43 2016 +0000
@@ -14,47 +14,153 @@
 .align 16
 .Lmul_enter:
        movl    %r9d,%r9d
-       movl    8(%rsp),%r10d
+       movd    8(%rsp),%xmm5
+       leaq    .Linc(%rip),%r10
        pushq   %rbx
        pushq   %rbp
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
+
+.Lmul_alloca:
        movq    %rsp,%rax
        leaq    2(%r9),%r11
        negq    %r11
-       leaq    (%rsp,%r11,8),%rsp
+       leaq    -264(%rsp,%r11,8),%rsp
        andq    $-1024,%rsp
 
        movq    %rax,8(%rsp,%r9,8)
 .Lmul_body:
-       movq    %rdx,%r12
-       movq    %r10,%r11
-       shrq    $3,%r10
-       andq    $7,%r11
-       notq    %r10
-       leaq    .Lmagic_masks(%rip),%rax
-       andq    $3,%r10
-       leaq    96(%r12,%r11,8),%r12
-       movq    0(%rax,%r10,8),%xmm4
-       movq    8(%rax,%r10,8),%xmm5
-       movq    16(%rax,%r10,8),%xmm6
-       movq    24(%rax,%r10,8),%xmm7
+       leaq    128(%rdx),%r12
+       movdqa  0(%r10),%xmm0
+       movdqa  16(%r10),%xmm1
+       leaq    24-112(%rsp,%r9,8),%r10
+       andq    $-16,%r10
+
+       pshufd  $0,%xmm5,%xmm5
+       movdqa  %xmm1,%xmm4
+       movdqa  %xmm1,%xmm2
+       paddd   %xmm0,%xmm1
+       pcmpeqd %xmm5,%xmm0
+.byte  0x67
+       movdqa  %xmm4,%xmm3
+       paddd   %xmm1,%xmm2
+       pcmpeqd %xmm5,%xmm1
+       movdqa  %xmm0,112(%r10)
+       movdqa  %xmm4,%xmm0
+
+       paddd   %xmm2,%xmm3
+       pcmpeqd %xmm5,%xmm2
+       movdqa  %xmm1,128(%r10)
+       movdqa  %xmm4,%xmm1
+
+       paddd   %xmm3,%xmm0
+       pcmpeqd %xmm5,%xmm3
+       movdqa  %xmm2,144(%r10)
+       movdqa  %xmm4,%xmm2
+
+       paddd   %xmm0,%xmm1
+       pcmpeqd %xmm5,%xmm0
+       movdqa  %xmm3,160(%r10)
+       movdqa  %xmm4,%xmm3
+       paddd   %xmm1,%xmm2
+       pcmpeqd %xmm5,%xmm1
+       movdqa  %xmm0,176(%r10)
+       movdqa  %xmm4,%xmm0
+
+       paddd   %xmm2,%xmm3
+       pcmpeqd %xmm5,%xmm2
+       movdqa  %xmm1,192(%r10)
+       movdqa  %xmm4,%xmm1
+
+       paddd   %xmm3,%xmm0
+       pcmpeqd %xmm5,%xmm3
+       movdqa  %xmm2,208(%r10)
+       movdqa  %xmm4,%xmm2
+
+       paddd   %xmm0,%xmm1
+       pcmpeqd %xmm5,%xmm0
+       movdqa  %xmm3,224(%r10)
+       movdqa  %xmm4,%xmm3
+       paddd   %xmm1,%xmm2
+       pcmpeqd %xmm5,%xmm1
+       movdqa  %xmm0,240(%r10)
+       movdqa  %xmm4,%xmm0
+
+       paddd   %xmm2,%xmm3
+       pcmpeqd %xmm5,%xmm2
+       movdqa  %xmm1,256(%r10)
+       movdqa  %xmm4,%xmm1
 
-       movq    -96(%r12),%xmm0
-       movq    -32(%r12),%xmm1
-       pand    %xmm4,%xmm0
-       movq    32(%r12),%xmm2
-       pand    %xmm5,%xmm1
-       movq    96(%r12),%xmm3
-       pand    %xmm6,%xmm2
+       paddd   %xmm3,%xmm0
+       pcmpeqd %xmm5,%xmm3
+       movdqa  %xmm2,272(%r10)
+       movdqa  %xmm4,%xmm2
+
+       paddd   %xmm0,%xmm1
+       pcmpeqd %xmm5,%xmm0
+       movdqa  %xmm3,288(%r10)
+       movdqa  %xmm4,%xmm3
+       paddd   %xmm1,%xmm2
+       pcmpeqd %xmm5,%xmm1
+       movdqa  %xmm0,304(%r10)
+
+       paddd   %xmm2,%xmm3
+.byte  0x67
+       pcmpeqd %xmm5,%xmm2
+       movdqa  %xmm1,320(%r10)
+
+       pcmpeqd %xmm5,%xmm3
+       movdqa  %xmm2,336(%r10)
+       pand    64(%r12),%xmm0
+
+       pand    80(%r12),%xmm1
+       pand    96(%r12),%xmm2
+       movdqa  %xmm3,352(%r10)
+       pand    112(%r12),%xmm3
+       por     %xmm2,%xmm0
+       por     %xmm3,%xmm1
+       movdqa  -128(%r12),%xmm4
+       movdqa  -112(%r12),%xmm5
+       movdqa  -96(%r12),%xmm2
+       pand    112(%r10),%xmm4
+       movdqa  -80(%r12),%xmm3
+       pand    128(%r10),%xmm5
+       por     %xmm4,%xmm0
+       pand    144(%r10),%xmm2
+       por     %xmm5,%xmm1
+       pand    160(%r10),%xmm3
+       por     %xmm2,%xmm0
+       por     %xmm3,%xmm1
+       movdqa  -64(%r12),%xmm4
+       movdqa  -48(%r12),%xmm5
+       movdqa  -32(%r12),%xmm2
+       pand    176(%r10),%xmm4
+       movdqa  -16(%r12),%xmm3
+       pand    192(%r10),%xmm5
+       por     %xmm4,%xmm0
+       pand    208(%r10),%xmm2
+       por     %xmm5,%xmm1
+       pand    224(%r10),%xmm3
+       por     %xmm2,%xmm0
+       por     %xmm3,%xmm1
+       movdqa  0(%r12),%xmm4
+       movdqa  16(%r12),%xmm5
+       movdqa  32(%r12),%xmm2
+       pand    240(%r10),%xmm4
+       movdqa  48(%r12),%xmm3
+       pand    256(%r10),%xmm5
+       por     %xmm4,%xmm0
+       pand    272(%r10),%xmm2
+       por     %xmm5,%xmm1
+       pand    288(%r10),%xmm3
+       por     %xmm2,%xmm0
+       por     %xmm3,%xmm1
        por     %xmm1,%xmm0
-       pand    %xmm7,%xmm3
-       por     %xmm2,%xmm0
+       pshufd  $78,%xmm0,%xmm1
+       por     %xmm1,%xmm0
        leaq    256(%r12),%r12
-       por     %xmm3,%xmm0
-
 .byte  102,72,15,126,195
 
        movq    (%r8),%r8
@@ -63,29 +169,14 @@
        xorq    %r14,%r14
        xorq    %r15,%r15
 
-       movq    -96(%r12),%xmm0
-       movq    -32(%r12),%xmm1
-       pand    %xmm4,%xmm0
-       movq    32(%r12),%xmm2
-       pand    %xmm5,%xmm1
-
        movq    %r8,%rbp
        mulq    %rbx
        movq    %rax,%r10
        movq    (%rcx),%rax
 
-       movq    96(%r12),%xmm3
-       pand    %xmm6,%xmm2
-       por     %xmm1,%xmm0
-       pand    %xmm7,%xmm3
-
        imulq   %r10,%rbp
        movq    %rdx,%r11
 
-       por     %xmm2,%xmm0
-       leaq    256(%r12),%r12
-       por     %xmm3,%xmm0
-
        mulq    %rbp
        addq    %rax,%r10
        movq    8(%rsi),%rax
@@ -118,8 +209,6 @@
        cmpq    %r9,%r15
        jne     .L1st
 
-.byte  102,72,15,126,195
-
        addq    %rax,%r13
        movq    (%rsi),%rax
        adcq    $0,%rdx
@@ -139,33 +228,76 @@
        jmp     .Louter
 .align 16
 .Louter:
+       leaq    24+128(%rsp,%r9,8),%rdx
+       andq    $-16,%rdx
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       movdqa  -128(%r12),%xmm0
+       movdqa  -112(%r12),%xmm1
+       movdqa  -96(%r12),%xmm2
+       movdqa  -80(%r12),%xmm3
+       pand    -128(%rdx),%xmm0
+       pand    -112(%rdx),%xmm1
+       por     %xmm0,%xmm4
+       pand    -96(%rdx),%xmm2
+       por     %xmm1,%xmm5
+       pand    -80(%rdx),%xmm3
+       por     %xmm2,%xmm4
+       por     %xmm3,%xmm5
+       movdqa  -64(%r12),%xmm0
+       movdqa  -48(%r12),%xmm1
+       movdqa  -32(%r12),%xmm2
+       movdqa  -16(%r12),%xmm3
+       pand    -64(%rdx),%xmm0
+       pand    -48(%rdx),%xmm1
+       por     %xmm0,%xmm4
+       pand    -32(%rdx),%xmm2
+       por     %xmm1,%xmm5
+       pand    -16(%rdx),%xmm3
+       por     %xmm2,%xmm4
+       por     %xmm3,%xmm5
+       movdqa  0(%r12),%xmm0
+       movdqa  16(%r12),%xmm1
+       movdqa  32(%r12),%xmm2
+       movdqa  48(%r12),%xmm3
+       pand    0(%rdx),%xmm0
+       pand    16(%rdx),%xmm1
+       por     %xmm0,%xmm4
+       pand    32(%rdx),%xmm2


