--- crypto/openssl/crypto/bn/asm/rsaz-avx2.pl.orig
+++ crypto/openssl/crypto/bn/asm/rsaz-avx2.pl
@@ -239,7 +239,7 @@
 	vmovdqu	32*8-128($ap), $ACC8
 
 	lea	192(%rsp), $tp0			# 64+128=192
-	vpbroadcastq	.Land_mask(%rip), $AND_MASK
+	vmovdqu	.Land_mask(%rip), $AND_MASK
 	jmp	.LOOP_GRANDE_SQR_1024
 
 .align	32
@@ -1070,10 +1070,10 @@
 	vpmuludq	32*6-128($np),$Yi,$TEMP1
 	vpaddq	$TEMP1,$ACC6,$ACC6
 	vpmuludq	32*7-128($np),$Yi,$TEMP2
-	vpblendd	\$3, $ZERO, $ACC9, $ACC9	# correct $ACC3
+	vpblendd	\$3, $ZERO, $ACC9, $TEMP1	# correct $ACC3
 	vpaddq	$TEMP2,$ACC7,$ACC7
 	vpmuludq	32*8-128($np),$Yi,$TEMP0
-	vpaddq	$ACC9, $ACC3, $ACC3		# correct $ACC3
+	vpaddq	$TEMP1, $ACC3, $ACC3		# correct $ACC3
 	vpaddq	$TEMP0,$ACC8,$ACC8
 
 	mov	%rbx, %rax
@@ -1086,7 +1086,9 @@
 	vmovdqu	-8+32*2-128($ap),$TEMP2
 
 	mov	$r1, %rax
+	vpblendd	\$0xfc, $ZERO, $ACC9, $ACC9	# correct $ACC3
 	imull	$n0, %eax
+	vpaddq	$ACC9,$ACC4,$ACC4		# correct $ACC3
 	and	\$0x1fffffff, %eax
 
 	imulq	16-128($ap),%rbx
@@ -1322,15 +1324,12 @@
 # But as we underutilize resources, it's possible to correct in
 # each iteration with marginal performance loss. But then, as
 # we do it in each iteration, we can correct less digits, and
-# avoid performance penalties completely. Also note that we
-# correct only three digits out of four. This works because
-# most significant digit is subjected to less additions.
+# avoid performance penalties completely.
 
 $TEMP0 = $ACC9;
 $TEMP3 = $Bi;
 $TEMP4 = $Yi;
 $code.=<<___;
-	vpermq	\$0, $AND_MASK, $AND_MASK
 	vpaddq	(%rsp), $TEMP1, $ACC0
 
 	vpsrlq	\$29, $ACC0, $TEMP1
@@ -1763,7 +1762,7 @@
 
 .align	64
 .Land_mask:
-	.quad	0x1fffffff,0x1fffffff,0x1fffffff,-1
+	.quad	0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
 .Lscatter_permd:
 	.long	0,2,4,6,7,7,7,7
 .Lgather_permd:
--- crypto/openssl/ssl/ssl.h.orig
+++ crypto/openssl/ssl/ssl.h
@@ -1727,7 +1727,7 @@
 # define SSL_ST_BEFORE 0x4000
 # define SSL_ST_OK 0x03
 # define SSL_ST_RENEGOTIATE (0x04|SSL_ST_INIT)
-# define SSL_ST_ERR 0x05
+# define SSL_ST_ERR (0x05|SSL_ST_INIT)
 
 # define SSL_CB_LOOP 0x01
 # define SSL_CB_EXIT 0x02
--- secure/lib/libcrypto/amd64/rsaz-avx2.S.orig
+++ secure/lib/libcrypto/amd64/rsaz-avx2.S
@@ -68,7 +68,7 @@
 	vmovdqu	256-128(%rsi),%ymm8
 
 	leaq	192(%rsp),%rbx
-	vpbroadcastq	.Land_mask(%rip),%ymm15
+	vmovdqu	.Land_mask(%rip),%ymm15
 	jmp	.LOOP_GRANDE_SQR_1024
 
 .align	32
@@ -801,10 +801,10 @@
 	vpmuludq	192-128(%rcx),%ymm11,%ymm12
 	vpaddq	%ymm12,%ymm6,%ymm6
 	vpmuludq	224-128(%rcx),%ymm11,%ymm13
-	vpblendd	$3,%ymm14,%ymm9,%ymm9
+	vpblendd	$3,%ymm14,%ymm9,%ymm12
 	vpaddq	%ymm13,%ymm7,%ymm7
 	vpmuludq	256-128(%rcx),%ymm11,%ymm0
-	vpaddq	%ymm9,%ymm3,%ymm3
+	vpaddq	%ymm12,%ymm3,%ymm3
 	vpaddq	%ymm0,%ymm8,%ymm8
 
 	movq	%rbx,%rax
@@ -817,7 +817,9 @@
 	vmovdqu	-8+64-128(%rsi),%ymm13
 
 	movq	%r10,%rax
+	vpblendd	$0xfc,%ymm14,%ymm9,%ymm9
 	imull	%r8d,%eax
+	vpaddq	%ymm9,%ymm4,%ymm4
 	andl	$0x1fffffff,%eax
 
 	imulq	16-128(%rsi),%rbx
@@ -1046,7 +1048,6 @@
 
 	decl	%r14d
 	jnz	.Loop_mul_1024
-	vpermq	$0,%ymm15,%ymm15
 	vpaddq	(%rsp),%ymm12,%ymm0
 
 	vpsrlq	$29,%ymm0,%ymm12
@@ -1686,7 +1687,7 @@
 
 .align	64
 .Land_mask:
-.quad	0x1fffffff,0x1fffffff,0x1fffffff,-1
+.quad	0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
 .Lscatter_permd:
 .long	0,2,4,6,7,7,7,7
 .Lgather_permd: