Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64 re-gen...
details: https://anonhg.NetBSD.org/src/rev/2892b70f5446
branches: trunk
changeset: 814441:2892b70f5446
user: christos <christos%NetBSD.org@localhost>
date: Sun Mar 20 22:18:43 2016 +0000
description:
Regenerate to fix sha1. This also picks up improvements to the Montgomery
multiplication code that we were missing from a previous change.
diffstat:
crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/sha1-x86_64.S | 8 +-
crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont5.S | 677 +++++++--
2 files changed, 537 insertions(+), 148 deletions(-)
diffs (truncated from 890 to 300 lines):
diff -r fa17161478ec -r 2892b70f5446 crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/sha1-x86_64.S
--- a/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/sha1-x86_64.S Sun Mar 20 22:17:13 2016 +0000
+++ b/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/sha1-x86_64.S Sun Mar 20 22:18:43 2016 +0000
@@ -1298,7 +1298,7 @@
pushq %rbx
pushq %rbp
pushq %r12
- leaq -64(%rsp),%rsp
+ leaq -72(%rsp),%rsp
movq %rdi,%r8
movq %rsi,%r9
movq %rdx,%r10
@@ -2470,7 +2470,7 @@
movl %ecx,8(%r8)
movl %edx,12(%r8)
movl %ebp,16(%r8)
- leaq 64(%rsp),%rsi
+ leaq 72(%rsp),%rsi
movq 0(%rsi),%r12
movq 8(%rsi),%rbp
movq 16(%rsi),%rbx
@@ -2485,7 +2485,7 @@
pushq %rbx
pushq %rbp
pushq %r12
- leaq -64(%rsp),%rsp
+ leaq -72(%rsp),%rsp
movq %rdi,%r8
movq %rsi,%r9
movq %rdx,%r10
@@ -3621,7 +3621,7 @@
movl %ecx,8(%r8)
movl %edx,12(%r8)
movl %ebp,16(%r8)
- leaq 64(%rsp),%rsi
+ leaq 72(%rsp),%rsi
movq 0(%rsi),%r12
movq 8(%rsi),%rbp
movq 16(%rsi),%rbx
diff -r fa17161478ec -r 2892b70f5446 crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont5.S
--- a/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont5.S Sun Mar 20 22:17:13 2016 +0000
+++ b/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont5.S Sun Mar 20 22:18:43 2016 +0000
@@ -14,47 +14,153 @@
.align 16
.Lmul_enter:
movl %r9d,%r9d
- movl 8(%rsp),%r10d
+ movd 8(%rsp),%xmm5
+ leaq .Linc(%rip),%r10
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
+
+.Lmul_alloca:
movq %rsp,%rax
leaq 2(%r9),%r11
negq %r11
- leaq (%rsp,%r11,8),%rsp
+ leaq -264(%rsp,%r11,8),%rsp
andq $-1024,%rsp
movq %rax,8(%rsp,%r9,8)
.Lmul_body:
- movq %rdx,%r12
- movq %r10,%r11
- shrq $3,%r10
- andq $7,%r11
- notq %r10
- leaq .Lmagic_masks(%rip),%rax
- andq $3,%r10
- leaq 96(%r12,%r11,8),%r12
- movq 0(%rax,%r10,8),%xmm4
- movq 8(%rax,%r10,8),%xmm5
- movq 16(%rax,%r10,8),%xmm6
- movq 24(%rax,%r10,8),%xmm7
+ leaq 128(%rdx),%r12
+ movdqa 0(%r10),%xmm0
+ movdqa 16(%r10),%xmm1
+ leaq 24-112(%rsp,%r9,8),%r10
+ andq $-16,%r10
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+.byte 0x67
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
+.byte 0x67
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+ pand 64(%r12),%xmm0
+
+ pand 80(%r12),%xmm1
+ pand 96(%r12),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%r12),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%r12),%xmm4
+ movdqa -112(%r12),%xmm5
+ movdqa -96(%r12),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%r12),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%r12),%xmm4
+ movdqa -48(%r12),%xmm5
+ movdqa -32(%r12),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%r12),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%r12),%xmm4
+ movdqa 16(%r12),%xmm5
+ movdqa 32(%r12),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%r12),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
por %xmm1,%xmm0
- pand %xmm7,%xmm3
- por %xmm2,%xmm0
+ pshufd $78,%xmm0,%xmm1
+ por %xmm1,%xmm0
leaq 256(%r12),%r12
- por %xmm3,%xmm0
-
.byte 102,72,15,126,195
movq (%r8),%r8
@@ -63,29 +169,14 @@
xorq %r14,%r14
xorq %r15,%r15
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
-
movq %r8,%rbp
mulq %rbx
movq %rax,%r10
movq (%rcx),%rax
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-
imulq %r10,%rbp
movq %rdx,%r11
- por %xmm2,%xmm0
- leaq 256(%r12),%r12
- por %xmm3,%xmm0
-
mulq %rbp
addq %rax,%r10
movq 8(%rsi),%rax
@@ -118,8 +209,6 @@
cmpq %r9,%r15
jne .L1st
-.byte 102,72,15,126,195
-
addq %rax,%r13
movq (%rsi),%rax
adcq $0,%rdx
@@ -139,33 +228,76 @@
jmp .Louter
.align 16
.Louter:
+ leaq 24+128(%rsp,%r9,8),%rdx
+ andq $-16,%rdx
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r12),%xmm0
+ movdqa -112(%r12),%xmm1
+ movdqa -96(%r12),%xmm2
+ movdqa -80(%r12),%xmm3
+ pand -128(%rdx),%xmm0
+ pand -112(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r12),%xmm0
+ movdqa -48(%r12),%xmm1
+ movdqa -32(%r12),%xmm2
+ movdqa -16(%r12),%xmm3
+ pand -64(%rdx),%xmm0
+ pand -48(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r12),%xmm0
+ movdqa 16(%r12),%xmm1
+ movdqa 32(%r12),%xmm2
+ movdqa 48(%r12),%xmm3
+ pand 0(%rdx),%xmm0
+ pand 16(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rdx),%xmm2
Home |
Main Index |
Thread Index |
Old Index