diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/accelerated/x86/coff/ghash-x86_64.s | 42 | ||||
-rw-r--r-- | lib/accelerated/x86/coff/sha1-ssse3-x86_64.s | 229 | ||||
-rw-r--r-- | lib/accelerated/x86/coff/sha512-ssse3-x86_64.s | 274 | ||||
-rw-r--r-- | lib/accelerated/x86/elf/ghash-x86_64.s | 42 | ||||
-rw-r--r-- | lib/accelerated/x86/elf/sha1-ssse3-x86_64.s | 170 | ||||
-rw-r--r-- | lib/accelerated/x86/elf/sha512-ssse3-x86_64.s | 213 | ||||
-rw-r--r-- | lib/accelerated/x86/macosx/ghash-x86_64.s | 42 | ||||
-rw-r--r-- | lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s | 170 | ||||
-rw-r--r-- | lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s | 213 |
9 files changed, 72 insertions, 1323 deletions
diff --git a/lib/accelerated/x86/coff/ghash-x86_64.s b/lib/accelerated/x86/coff/ghash-x86_64.s index 2b4c72acd4..b1d69911c7 100644 --- a/lib/accelerated/x86/coff/ghash-x86_64.s +++ b/lib/accelerated/x86/coff/ghash-x86_64.s @@ -990,8 +990,8 @@ gcm_ghash_clmul: pxor %xmm11,%xmm12 .byte 102,68,15,58,68,222,0 .byte 102,68,15,58,68,238,17 -.byte 102,68,15,58,68,231,16 xorps %xmm11,%xmm3 +.byte 102,68,15,58,68,231,16 xorps %xmm13,%xmm5 movups 80(%rdx),%xmm7 xorps %xmm12,%xmm4 @@ -1009,8 +1009,8 @@ gcm_ghash_clmul: pshufd $78,%xmm0,%xmm8 pxor %xmm0,%xmm8 .byte 102,69,15,58,68,238,17 -.byte 102,68,15,58,68,231,0 xorps %xmm11,%xmm3 +.byte 102,68,15,58,68,231,0 xorps %xmm13,%xmm5 leaq 64(%r8),%r8 @@ -1028,23 +1028,23 @@ gcm_ghash_clmul: xorps %xmm3,%xmm0 movdqu 32(%r8),%xmm3 movdqa %xmm11,%xmm13 -.byte 102,68,15,58,68,199,16 pshufd $78,%xmm11,%xmm12 +.byte 102,68,15,58,68,199,16 xorps %xmm5,%xmm1 pxor %xmm11,%xmm12 .byte 102,65,15,56,0,218 movups 32(%rdx),%xmm7 - xorps %xmm4,%xmm8 .byte 102,68,15,58,68,218,0 + xorps %xmm4,%xmm8 + movdqa %xmm3,%xmm5 pshufd $78,%xmm3,%xmm4 pxor %xmm0,%xmm8 - movdqa %xmm3,%xmm5 - pxor %xmm1,%xmm8 pxor %xmm3,%xmm4 + pxor %xmm1,%xmm8 movdqa %xmm8,%xmm9 -.byte 102,68,15,58,68,234,17 pslldq $8,%xmm8 +.byte 102,68,15,58,68,234,17 psrldq $8,%xmm9 pxor %xmm8,%xmm0 movdqa .L7_mask(%rip),%xmm8 @@ -1053,8 +1053,8 @@ gcm_ghash_clmul: pand %xmm0,%xmm8 .byte 102,69,15,56,0,200 - pxor %xmm0,%xmm9 .byte 102,68,15,58,68,231,0 + pxor %xmm0,%xmm9 psllq $57,%xmm9 movdqa %xmm9,%xmm8 pslldq $8,%xmm9 @@ -1081,31 +1081,32 @@ gcm_ghash_clmul: movdqa %xmm11,%xmm13 pxor %xmm12,%xmm4 pshufd $78,%xmm11,%xmm12 - pxor %xmm9,%xmm0 - pxor %xmm8,%xmm1 pxor %xmm11,%xmm12 .byte 102,69,15,58,68,222,0 + pxor %xmm9,%xmm0 + pxor %xmm8,%xmm1 psrlq $1,%xmm0 - pxor %xmm1,%xmm0 - movdqa %xmm0,%xmm1 .byte 102,69,15,58,68,238,17 xorps %xmm11,%xmm3 - pshufd $78,%xmm0,%xmm8 - pxor %xmm0,%xmm8 + pxor %xmm1,%xmm0 .byte 102,68,15,58,68,231,0 xorps %xmm13,%xmm5 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 + leaq 64(%r8),%r8 subq $64,%r9 jnc .Lmod4_loop .Ltail4x: .byte 102,65,15,58,68,199,0 -.byte 102,65,15,58,68,207,17 -.byte 102,68,15,58,68,199,16 xorps %xmm12,%xmm4 +.byte 102,65,15,58,68,207,17 xorps %xmm3,%xmm0 +.byte 102,68,15,58,68,199,16 xorps %xmm5,%xmm1 pxor %xmm0,%xmm1 pxor %xmm4,%xmm8 @@ -1185,13 +1186,13 @@ gcm_ghash_clmul: pxor %xmm3,%xmm0 pxor %xmm5,%xmm1 - movdqu (%r8),%xmm9 + movdqu (%r8),%xmm5 pxor %xmm0,%xmm8 -.byte 102,69,15,56,0,202 +.byte 102,65,15,56,0,234 movdqu 16(%r8),%xmm3 pxor %xmm1,%xmm8 - pxor %xmm9,%xmm1 + pxor %xmm5,%xmm1 pxor %xmm8,%xmm4 .byte 102,65,15,56,0,218 movdqa %xmm4,%xmm8 @@ -1218,9 +1219,9 @@ gcm_ghash_clmul: pxor %xmm8,%xmm1 pxor %xmm5,%xmm4 +.byte 102,15,58,68,234,17 movdqa %xmm0,%xmm9 psrlq $1,%xmm0 -.byte 102,15,58,68,234,17 pxor %xmm9,%xmm1 pxor %xmm0,%xmm9 psrlq $5,%xmm0 @@ -1229,6 +1230,7 @@ gcm_ghash_clmul: psrlq $1,%xmm0 .byte 102,15,58,68,231,0 pxor %xmm1,%xmm0 +.byte 0x66,0x90 subq $32,%r9 ja .Lmod_loop diff --git a/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s b/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s index 1b7fe3a51c..e8136025e8 100644 --- a/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s +++ b/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s @@ -57,8 +57,6 @@ sha1_block_data_order: movl _gnutls_x86_cpuid_s+8(%rip),%r10d testl $512,%r8d jz .Lialu - testl $536870912,%r10d - jnz _shaext_shortcut jmp _ssse3_shortcut .p2align 4 @@ -1280,195 +1278,6 @@ sha1_block_data_order: movq 16(%rsp),%rsi .byte 0xf3,0xc3 .LSEH_end_sha1_block_data_order: -.def sha1_block_data_order_shaext; .scl 3; .type 32; .endef -.p2align 5 -sha1_block_data_order_shaext: - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%rax -.LSEH_begin_sha1_block_data_order_shaext: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - -_shaext_shortcut: - leaq -72(%rsp),%rsp - movaps %xmm6,-8-64(%rax) - movaps %xmm7,-8-48(%rax) - movaps %xmm8,-8-32(%rax) - movaps %xmm9,-8-16(%rax) -.Lprologue_shaext: - movdqu (%rdi),%xmm0 - movd 16(%rdi),%xmm1 - movdqa K_XX_XX+160(%rip),%xmm3 - - movdqu (%rsi),%xmm4 - pshufd $27,%xmm0,%xmm0 - movdqu 16(%rsi),%xmm5 - pshufd $27,%xmm1,%xmm1 - movdqu 32(%rsi),%xmm6 -.byte 102,15,56,0,227 - movdqu 48(%rsi),%xmm7 -.byte 102,15,56,0,235 -.byte 102,15,56,0,243 - movdqa %xmm1,%xmm9 -.byte 102,15,56,0,251 - jmp .Loop_shaext - -.p2align 4 -.Loop_shaext: - decq %rdx - leaq 64(%rsi),%rax - paddd %xmm4,%xmm1 - cmovneq %rax,%rsi - movdqa %xmm0,%xmm8 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,0 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,0 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,1 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,1 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,2 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,2 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 - movdqu (%rsi),%xmm4 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,3 -.byte 15,56,200,213 - movdqu 16(%rsi),%xmm5 -.byte 102,15,56,0,227 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 15,56,200,206 - movdqu 32(%rsi),%xmm6 -.byte 102,15,56,0,235 - - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,3 -.byte 15,56,200,215 - movdqu 48(%rsi),%xmm7 -.byte 102,15,56,0,243 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 65,15,56,200,201 -.byte 102,15,56,0,251 - - paddd %xmm8,%xmm0 - movdqa %xmm1,%xmm9 - - jnz .Loop_shaext - - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm1,%xmm1 - movdqu %xmm0,(%rdi) - movd %xmm1,16(%rdi) - movaps -8-64(%rax),%xmm6 - movaps -8-48(%rax),%xmm7 - movaps -8-32(%rax),%xmm8 - movaps -8-16(%rax),%xmm9 - movq %rax,%rsp -.Lepilogue_shaext: - movq 8(%rsp),%rdi - movq 16(%rsp),%rsi - .byte 0xf3,0xc3 -.LSEH_end_sha1_block_data_order_shaext: .def sha1_block_data_order_ssse3; .scl 3; .type 32; .endef .p2align 4 sha1_block_data_order_ssse3: @@ -2680,7 +2489,6 @@ K_XX_XX: .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f -.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 6 @@ -2726,37 +2534,6 @@ se_handler: jmp .Lcommon_seh_tail -.def shaext_handler; .scl 3; .type 32; .endef -.p2align 4 -shaext_handler: - pushq %rsi - pushq %rdi - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushfq - subq $64,%rsp - - movq 120(%r8),%rax - movq 248(%r8),%rbx - - leaq .Lprologue_shaext(%rip),%r10 - cmpq %r10,%rbx - jb .Lcommon_seh_tail - - leaq .Lepilogue_shaext(%rip),%r10 - cmpq %r10,%rbx - jae .Lcommon_seh_tail - - leaq -8-64(%rax),%rsi - leaq 512(%r8),%rdi - movl $8,%ecx -.long 0xa548f3fc - - jmp .Lcommon_seh_tail .def ssse3_handler; .scl 3; .type 32; .endef .p2align 4 @@ -2853,9 +2630,6 @@ ssse3_handler: .rva .LSEH_begin_sha1_block_data_order .rva .LSEH_end_sha1_block_data_order .rva .LSEH_info_sha1_block_data_order -.rva .LSEH_begin_sha1_block_data_order_shaext -.rva .LSEH_end_sha1_block_data_order_shaext -.rva .LSEH_info_sha1_block_data_order_shaext .rva .LSEH_begin_sha1_block_data_order_ssse3 .rva .LSEH_end_sha1_block_data_order_ssse3 .rva .LSEH_info_sha1_block_data_order_ssse3 @@ -2864,9 +2638,6 @@ ssse3_handler: .LSEH_info_sha1_block_data_order: .byte 9,0,0,0 .rva se_handler -.LSEH_info_sha1_block_data_order_shaext: -.byte 9,0,0,0 -.rva shaext_handler .LSEH_info_sha1_block_data_order_ssse3: .byte 9,0,0,0 .rva ssse3_handler diff --git a/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s b/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s index 959a2525bf..edaa67b95f 100644 --- a/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s +++ b/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s @@ -56,8 +56,6 @@ sha256_block_data_order: movl 0(%r11),%r9d movl 4(%r11),%r10d movl 8(%r11),%r11d - testl $536870912,%r11d - jnz _shaext_shortcut testl $512,%r10d jnz .Lssse3_shortcut pushq %rbx @@ -1794,237 +1792,6 @@ K256: .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.def sha256_block_data_order_shaext; .scl 3; .type 32; .endef -.p2align 6 -sha256_block_data_order_shaext: - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%rax -.LSEH_begin_sha256_block_data_order_shaext: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - -_shaext_shortcut: - leaq -88(%rsp),%rsp - movaps %xmm6,-8-80(%rax) - movaps %xmm7,-8-64(%rax) - movaps %xmm8,-8-48(%rax) - movaps %xmm9,-8-32(%rax) - movaps %xmm10,-8-16(%rax) -.Lprologue_shaext: - leaq K256+128(%rip),%rcx - movdqu (%rdi),%xmm1 - movdqu 16(%rdi),%xmm2 - movdqa 512-128(%rcx),%xmm7 - - pshufd $27,%xmm1,%xmm0 - pshufd $177,%xmm1,%xmm1 - pshufd $27,%xmm2,%xmm2 - movdqa %xmm7,%xmm8 -.byte 102,15,58,15,202,8 - punpcklqdq %xmm0,%xmm2 - jmp .Loop_shaext - -.p2align 4 -.Loop_shaext: - movdqu (%rsi),%xmm3 - movdqu 16(%rsi),%xmm4 - movdqu 32(%rsi),%xmm5 -.byte 102,15,56,0,223 - movdqu 48(%rsi),%xmm6 - - movdqa 0-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 102,15,56,0,231 - movdqa %xmm2,%xmm10 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - nop - movdqa %xmm1,%xmm9 -.byte 15,56,203,202 - - movdqa 32-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 102,15,56,0,239 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - leaq 64(%rsi),%rsi -.byte 15,56,204,220 -.byte 15,56,203,202 - - movdqa 64-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 102,15,56,0,247 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - - movdqa 96-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 128-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 160-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa 192-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa 224-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 256-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 288-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa 320-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa 352-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 384-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 416-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 -.byte 15,56,203,202 - paddd %xmm7,%xmm6 - - movdqa 448-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 -.byte 15,56,205,245 - movdqa %xmm8,%xmm7 -.byte 15,56,203,202 - - movdqa 480-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 - nop -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - decq %rdx - nop -.byte 15,56,203,202 - - paddd %xmm10,%xmm2 - paddd %xmm9,%xmm1 - jnz .Loop_shaext - - pshufd $177,%xmm2,%xmm2 - pshufd $27,%xmm1,%xmm7 - pshufd $177,%xmm1,%xmm1 - punpckhqdq %xmm2,%xmm1 -.byte 102,15,58,15,215,8 - - movdqu %xmm1,(%rdi) - movdqu %xmm2,16(%rdi) - movaps -8-80(%rax),%xmm6 - movaps -8-64(%rax),%xmm7 - movaps -8-48(%rax),%xmm8 - movaps -8-32(%rax),%xmm9 - movaps -8-16(%rax),%xmm10 - movq %rax,%rsp -.Lepilogue_shaext: - movq 8(%rsp),%rdi - movq 16(%rsp),%rsi - .byte 0xf3,0xc3 -.LSEH_end_sha256_block_data_order_shaext: .def sha256_block_data_order_ssse3; .scl 3; .type 32; .endef .p2align 6 sha256_block_data_order_ssse3: @@ -2075,13 +1842,13 @@ sha256_block_data_order_ssse3: movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 -.byte 102,15,56,0,199 movdqu 48(%rsi),%xmm3 +.byte 102,15,56,0,199 leaq K256(%rip),%rbp .byte 102,15,56,0,207 movdqa 0(%rbp),%xmm4 - movdqa 32(%rbp),%xmm5 .byte 102,15,56,0,215 + movdqa 32(%rbp),%xmm5 paddd %xmm0,%xmm4 movdqa 64(%rbp),%xmm6 .byte 102,15,56,0,223 @@ -3235,46 +3002,12 @@ se_handler: popq %rsi .byte 0xf3,0xc3 -.def shaext_handler; .scl 3; .type 32; .endef -.p2align 4 -shaext_handler: - pushq %rsi - pushq %rdi - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushfq - subq $64,%rsp - - movq 120(%r8),%rax - movq 248(%r8),%rbx - - leaq .Lprologue_shaext(%rip),%r10 - cmpq %r10,%rbx - jb .Lin_prologue - - leaq .Lepilogue_shaext(%rip),%r10 - cmpq %r10,%rbx - jae .Lin_prologue - - leaq -8-80(%rax),%rsi - leaq 512(%r8),%rdi - movl $10,%ecx -.long 0xa548f3fc - - jmp .Lin_prologue .section .pdata .p2align 2 .rva .LSEH_begin_sha256_block_data_order .rva .LSEH_end_sha256_block_data_order .rva .LSEH_info_sha256_block_data_order -.rva .LSEH_begin_sha256_block_data_order_shaext -.rva .LSEH_end_sha256_block_data_order_shaext -.rva .LSEH_info_sha256_block_data_order_shaext .rva .LSEH_begin_sha256_block_data_order_ssse3 .rva .LSEH_end_sha256_block_data_order_ssse3 .rva .LSEH_info_sha256_block_data_order_ssse3 @@ -3284,9 +3017,6 @@ shaext_handler: .byte 9,0,0,0 .rva se_handler .rva .Lprologue,.Lepilogue -.LSEH_info_sha256_block_data_order_shaext: -.byte 9,0,0,0 -.rva shaext_handler .LSEH_info_sha256_block_data_order_ssse3: .byte 9,0,0,0 .rva se_handler diff --git a/lib/accelerated/x86/elf/ghash-x86_64.s b/lib/accelerated/x86/elf/ghash-x86_64.s index afa6714a6e..17f6bebe6e 100644 --- a/lib/accelerated/x86/elf/ghash-x86_64.s +++ b/lib/accelerated/x86/elf/ghash-x86_64.s @@ -949,8 +949,8 @@ gcm_ghash_clmul: pxor %xmm11,%xmm12 .byte 102,68,15,58,68,222,0 .byte 102,68,15,58,68,238,17 -.byte 102,68,15,58,68,231,16 xorps %xmm11,%xmm3 +.byte 102,68,15,58,68,231,16 xorps %xmm13,%xmm5 movups 80(%rsi),%xmm7 xorps %xmm12,%xmm4 @@ -968,8 +968,8 @@ gcm_ghash_clmul: pshufd $78,%xmm0,%xmm8 pxor %xmm0,%xmm8 .byte 102,69,15,58,68,238,17 -.byte 102,68,15,58,68,231,0 xorps %xmm11,%xmm3 +.byte 102,68,15,58,68,231,0 xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx @@ -987,23 +987,23 @@ gcm_ghash_clmul: xorps %xmm3,%xmm0 movdqu 32(%rdx),%xmm3 movdqa %xmm11,%xmm13 -.byte 102,68,15,58,68,199,16 pshufd $78,%xmm11,%xmm12 +.byte 102,68,15,58,68,199,16 xorps %xmm5,%xmm1 pxor %xmm11,%xmm12 .byte 102,65,15,56,0,218 movups 32(%rsi),%xmm7 - xorps %xmm4,%xmm8 .byte 102,68,15,58,68,218,0 + xorps %xmm4,%xmm8 + movdqa %xmm3,%xmm5 pshufd $78,%xmm3,%xmm4 pxor %xmm0,%xmm8 - movdqa %xmm3,%xmm5 - pxor %xmm1,%xmm8 pxor %xmm3,%xmm4 + pxor %xmm1,%xmm8 movdqa %xmm8,%xmm9 -.byte 102,68,15,58,68,234,17 pslldq $8,%xmm8 +.byte 102,68,15,58,68,234,17 psrldq $8,%xmm9 pxor %xmm8,%xmm0 movdqa .L7_mask(%rip),%xmm8 @@ -1012,8 +1012,8 @@ gcm_ghash_clmul: pand %xmm0,%xmm8 .byte 102,69,15,56,0,200 - pxor %xmm0,%xmm9 .byte 102,68,15,58,68,231,0 + pxor %xmm0,%xmm9 psllq $57,%xmm9 movdqa %xmm9,%xmm8 pslldq $8,%xmm9 @@ -1040,31 +1040,32 @@ gcm_ghash_clmul: movdqa %xmm11,%xmm13 pxor %xmm12,%xmm4 pshufd $78,%xmm11,%xmm12 - pxor %xmm9,%xmm0 - pxor %xmm8,%xmm1 pxor %xmm11,%xmm12 .byte 102,69,15,58,68,222,0 + pxor %xmm9,%xmm0 + pxor %xmm8,%xmm1 psrlq $1,%xmm0 - pxor %xmm1,%xmm0 - movdqa %xmm0,%xmm1 .byte 102,69,15,58,68,238,17 xorps %xmm11,%xmm3 - pshufd $78,%xmm0,%xmm8 - pxor %xmm0,%xmm8 + pxor %xmm1,%xmm0 .byte 102,68,15,58,68,231,0 xorps %xmm13,%xmm5 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 + leaq 64(%rdx),%rdx subq $64,%rcx jnc .Lmod4_loop .Ltail4x: .byte 102,65,15,58,68,199,0 -.byte 102,65,15,58,68,207,17 -.byte 102,68,15,58,68,199,16 xorps %xmm12,%xmm4 +.byte 102,65,15,58,68,207,17 xorps %xmm3,%xmm0 +.byte 102,68,15,58,68,199,16 xorps %xmm5,%xmm1 pxor %xmm0,%xmm1 pxor %xmm4,%xmm8 @@ -1144,13 +1145,13 @@ gcm_ghash_clmul: pxor %xmm3,%xmm0 pxor %xmm5,%xmm1 - movdqu (%rdx),%xmm9 + movdqu (%rdx),%xmm5 pxor %xmm0,%xmm8 -.byte 102,69,15,56,0,202 +.byte 102,65,15,56,0,234 movdqu 16(%rdx),%xmm3 pxor %xmm1,%xmm8 - pxor %xmm9,%xmm1 + pxor %xmm5,%xmm1 pxor %xmm8,%xmm4 .byte 102,65,15,56,0,218 movdqa %xmm4,%xmm8 @@ -1177,9 +1178,9 @@ gcm_ghash_clmul: pxor %xmm8,%xmm1 pxor %xmm5,%xmm4 +.byte 102,15,58,68,234,17 movdqa %xmm0,%xmm9 psrlq $1,%xmm0 -.byte 102,15,58,68,234,17 pxor %xmm9,%xmm1 pxor %xmm0,%xmm9 psrlq $5,%xmm0 @@ -1188,6 +1189,7 @@ gcm_ghash_clmul: psrlq $1,%xmm0 .byte 102,15,58,68,231,0 pxor %xmm1,%xmm0 +.byte 0x66,0x90 subq $32,%rcx ja .Lmod_loop diff --git a/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s b/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s index bed632617f..116efd0c18 100644 --- a/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s +++ b/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s @@ -49,8 +49,6 @@ sha1_block_data_order: movl _gnutls_x86_cpuid_s+8(%rip),%r10d testl $512,%r8d jz .Lialu - testl $536870912,%r10d - jnz _shaext_shortcut jmp _ssse3_shortcut .align 16 @@ -1270,173 +1268,6 @@ sha1_block_data_order: .Lepilogue: .byte 0xf3,0xc3 .size sha1_block_data_order,.-sha1_block_data_order -.type sha1_block_data_order_shaext,@function -.align 32 -sha1_block_data_order_shaext: -_shaext_shortcut: - movdqu (%rdi),%xmm0 - movd 16(%rdi),%xmm1 - movdqa K_XX_XX+160(%rip),%xmm3 - - movdqu (%rsi),%xmm4 - pshufd $27,%xmm0,%xmm0 - movdqu 16(%rsi),%xmm5 - pshufd $27,%xmm1,%xmm1 - movdqu 32(%rsi),%xmm6 -.byte 102,15,56,0,227 - movdqu 48(%rsi),%xmm7 -.byte 102,15,56,0,235 -.byte 102,15,56,0,243 - movdqa %xmm1,%xmm9 -.byte 102,15,56,0,251 - jmp .Loop_shaext - -.align 16 -.Loop_shaext: - decq %rdx - leaq 64(%rsi),%rax - paddd %xmm4,%xmm1 - cmovneq %rax,%rsi - movdqa %xmm0,%xmm8 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,0 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,0 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,1 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,1 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,2 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,2 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 - movdqu (%rsi),%xmm4 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,3 -.byte 15,56,200,213 - movdqu 16(%rsi),%xmm5 -.byte 102,15,56,0,227 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 15,56,200,206 - movdqu 32(%rsi),%xmm6 -.byte 102,15,56,0,235 - - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,3 -.byte 15,56,200,215 - movdqu 48(%rsi),%xmm7 -.byte 102,15,56,0,243 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 65,15,56,200,201 -.byte 102,15,56,0,251 - - paddd %xmm8,%xmm0 - movdqa %xmm1,%xmm9 - - jnz .Loop_shaext - - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm1,%xmm1 - movdqu %xmm0,(%rdi) - movd %xmm1,16(%rdi) - .byte 0xf3,0xc3 -.size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext .type sha1_block_data_order_ssse3,@function .align 16 sha1_block_data_order_ssse3: @@ -2625,7 +2456,6 @@ K_XX_XX: .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f -.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 diff --git a/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s b/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s index e1f92d4a78..f85e0bb6d8 100644 --- a/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s +++ b/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s @@ -48,8 +48,6 @@ sha256_block_data_order: movl 0(%r11),%r9d movl 4(%r11),%r10d movl 8(%r11),%r11d - testl $536870912,%r11d - jnz _shaext_shortcut testl $512,%r10d jnz .Lssse3_shortcut pushq %rbx @@ -1784,213 +1782,6 @@ K256: .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.type sha256_block_data_order_shaext,@function -.align 64 -sha256_block_data_order_shaext: -_shaext_shortcut: - leaq K256+128(%rip),%rcx - movdqu (%rdi),%xmm1 - movdqu 16(%rdi),%xmm2 - movdqa 512-128(%rcx),%xmm7 - - pshufd $27,%xmm1,%xmm0 - pshufd $177,%xmm1,%xmm1 - pshufd $27,%xmm2,%xmm2 - movdqa %xmm7,%xmm8 -.byte 102,15,58,15,202,8 - punpcklqdq %xmm0,%xmm2 - jmp .Loop_shaext - -.align 16 -.Loop_shaext: - movdqu (%rsi),%xmm3 - movdqu 16(%rsi),%xmm4 - movdqu 32(%rsi),%xmm5 -.byte 102,15,56,0,223 - movdqu 48(%rsi),%xmm6 - - movdqa 0-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 102,15,56,0,231 - movdqa %xmm2,%xmm10 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - nop - movdqa %xmm1,%xmm9 -.byte 15,56,203,202 - - movdqa 32-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 102,15,56,0,239 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - leaq 64(%rsi),%rsi -.byte 15,56,204,220 -.byte 15,56,203,202 - - movdqa 64-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 102,15,56,0,247 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - - movdqa 96-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 128-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 160-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa 192-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa 224-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 256-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 288-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa 320-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa 352-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 384-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 416-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 -.byte 15,56,203,202 - paddd %xmm7,%xmm6 - - movdqa 448-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 -.byte 15,56,205,245 - movdqa %xmm8,%xmm7 -.byte 15,56,203,202 - - movdqa 480-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 - nop -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - decq %rdx - nop -.byte 15,56,203,202 - - paddd %xmm10,%xmm2 - paddd %xmm9,%xmm1 - jnz .Loop_shaext - - pshufd $177,%xmm2,%xmm2 - pshufd $27,%xmm1,%xmm7 - pshufd $177,%xmm1,%xmm1 - punpckhqdq %xmm2,%xmm1 -.byte 102,15,58,15,215,8 - - movdqu %xmm1,(%rdi) - movdqu %xmm2,16(%rdi) - .byte 0xf3,0xc3 -.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext .type sha256_block_data_order_ssse3,@function .align 64 sha256_block_data_order_ssse3: @@ -2029,13 +1820,13 @@ sha256_block_data_order_ssse3: movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 -.byte 102,15,56,0,199 movdqu 48(%rsi),%xmm3 +.byte 102,15,56,0,199 leaq K256(%rip),%rbp .byte 102,15,56,0,207 movdqa 0(%rbp),%xmm4 - movdqa 32(%rbp),%xmm5 .byte 102,15,56,0,215 + movdqa 32(%rbp),%xmm5 paddd %xmm0,%xmm4 movdqa 64(%rbp),%xmm6 .byte 102,15,56,0,223 diff --git a/lib/accelerated/x86/macosx/ghash-x86_64.s b/lib/accelerated/x86/macosx/ghash-x86_64.s index 9e246794f0..a63034a6fc 100644 --- a/lib/accelerated/x86/macosx/ghash-x86_64.s +++ b/lib/accelerated/x86/macosx/ghash-x86_64.s @@ -949,8 +949,8 @@ L$_ghash_clmul: pxor %xmm11,%xmm12 .byte 102,68,15,58,68,222,0 .byte 102,68,15,58,68,238,17 -.byte 102,68,15,58,68,231,16 xorps %xmm11,%xmm3 +.byte 102,68,15,58,68,231,16 xorps %xmm13,%xmm5 movups 80(%rsi),%xmm7 xorps %xmm12,%xmm4 @@ -968,8 +968,8 @@ L$_ghash_clmul: pshufd $78,%xmm0,%xmm8 pxor %xmm0,%xmm8 .byte 102,69,15,58,68,238,17 -.byte 102,68,15,58,68,231,0 xorps %xmm11,%xmm3 +.byte 102,68,15,58,68,231,0 xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx @@ -987,23 +987,23 @@ L$mod4_loop: xorps %xmm3,%xmm0 movdqu 32(%rdx),%xmm3 movdqa %xmm11,%xmm13 -.byte 102,68,15,58,68,199,16 pshufd $78,%xmm11,%xmm12 +.byte 102,68,15,58,68,199,16 xorps %xmm5,%xmm1 pxor %xmm11,%xmm12 .byte 102,65,15,56,0,218 movups 32(%rsi),%xmm7 - xorps %xmm4,%xmm8 .byte 102,68,15,58,68,218,0 + xorps %xmm4,%xmm8 + movdqa %xmm3,%xmm5 pshufd $78,%xmm3,%xmm4 pxor %xmm0,%xmm8 - movdqa %xmm3,%xmm5 - pxor %xmm1,%xmm8 pxor %xmm3,%xmm4 + pxor %xmm1,%xmm8 movdqa %xmm8,%xmm9 -.byte 102,68,15,58,68,234,17 pslldq $8,%xmm8 +.byte 102,68,15,58,68,234,17 psrldq $8,%xmm9 pxor %xmm8,%xmm0 movdqa L$7_mask(%rip),%xmm8 @@ -1012,8 +1012,8 @@ L$mod4_loop: pand %xmm0,%xmm8 .byte 102,69,15,56,0,200 - pxor %xmm0,%xmm9 .byte 102,68,15,58,68,231,0 + pxor %xmm0,%xmm9 psllq $57,%xmm9 movdqa %xmm9,%xmm8 pslldq $8,%xmm9 @@ -1040,31 +1040,32 @@ L$mod4_loop: movdqa %xmm11,%xmm13 pxor %xmm12,%xmm4 pshufd $78,%xmm11,%xmm12 - pxor %xmm9,%xmm0 - pxor %xmm8,%xmm1 pxor %xmm11,%xmm12 .byte 102,69,15,58,68,222,0 + pxor %xmm9,%xmm0 + pxor %xmm8,%xmm1 psrlq $1,%xmm0 - pxor %xmm1,%xmm0 - movdqa %xmm0,%xmm1 .byte 102,69,15,58,68,238,17 xorps %xmm11,%xmm3 - pshufd $78,%xmm0,%xmm8 - pxor %xmm0,%xmm8 + pxor %xmm1,%xmm0 .byte 102,68,15,58,68,231,0 xorps %xmm13,%xmm5 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 + leaq 64(%rdx),%rdx subq $64,%rcx jnc L$mod4_loop L$tail4x: .byte 102,65,15,58,68,199,0 -.byte 102,65,15,58,68,207,17 -.byte 102,68,15,58,68,199,16 xorps %xmm12,%xmm4 +.byte 102,65,15,58,68,207,17 xorps %xmm3,%xmm0 +.byte 102,68,15,58,68,199,16 xorps %xmm5,%xmm1 pxor %xmm0,%xmm1 pxor %xmm4,%xmm8 @@ -1144,13 +1145,13 @@ L$mod_loop: pxor %xmm3,%xmm0 pxor %xmm5,%xmm1 - movdqu (%rdx),%xmm9 + movdqu (%rdx),%xmm5 pxor %xmm0,%xmm8 -.byte 102,69,15,56,0,202 +.byte 102,65,15,56,0,234 movdqu 16(%rdx),%xmm3 pxor %xmm1,%xmm8 - pxor %xmm9,%xmm1 + pxor %xmm5,%xmm1 pxor %xmm8,%xmm4 .byte 102,65,15,56,0,218 movdqa %xmm4,%xmm8 @@ -1177,9 +1178,9 @@ L$mod_loop: pxor %xmm8,%xmm1 pxor %xmm5,%xmm4 +.byte 102,15,58,68,234,17 movdqa %xmm0,%xmm9 psrlq $1,%xmm0 -.byte 102,15,58,68,234,17 pxor %xmm9,%xmm1 pxor %xmm0,%xmm9 psrlq $5,%xmm0 @@ -1188,6 +1189,7 @@ L$mod_loop: psrlq $1,%xmm0 .byte 102,15,58,68,231,0 pxor %xmm1,%xmm0 +.byte 0x66,0x90 subq $32,%rcx ja L$mod_loop diff --git a/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s b/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s index 6e63270971..9091bd802e 100644 --- a/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s +++ b/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s @@ -49,8 +49,6 @@ _sha1_block_data_order: movl __gnutls_x86_cpuid_s+8(%rip),%r10d testl $512,%r8d jz L$ialu - testl $536870912,%r10d - jnz _shaext_shortcut jmp _ssse3_shortcut .p2align 4 @@ -1271,173 +1269,6 @@ L$epilogue: .byte 0xf3,0xc3 -.p2align 5 -sha1_block_data_order_shaext: -_shaext_shortcut: - movdqu (%rdi),%xmm0 - movd 16(%rdi),%xmm1 - movdqa K_XX_XX+160(%rip),%xmm3 - - movdqu (%rsi),%xmm4 - pshufd $27,%xmm0,%xmm0 - movdqu 16(%rsi),%xmm5 - pshufd $27,%xmm1,%xmm1 - movdqu 32(%rsi),%xmm6 -.byte 102,15,56,0,227 - movdqu 48(%rsi),%xmm7 -.byte 102,15,56,0,235 -.byte 102,15,56,0,243 - movdqa %xmm1,%xmm9 -.byte 102,15,56,0,251 - jmp L$oop_shaext - -.p2align 4 -L$oop_shaext: - decq %rdx - leaq 64(%rsi),%rax - paddd %xmm4,%xmm1 - cmovneq %rax,%rsi - movdqa %xmm0,%xmm8 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,0 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,0 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,1 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,1 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,2 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,2 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 - movdqu (%rsi),%xmm4 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,3 -.byte 15,56,200,213 - movdqu 16(%rsi),%xmm5 -.byte 102,15,56,0,227 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 15,56,200,206 - movdqu 32(%rsi),%xmm6 -.byte 102,15,56,0,235 - - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,3 -.byte 15,56,200,215 - movdqu 48(%rsi),%xmm7 -.byte 102,15,56,0,243 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 65,15,56,200,201 -.byte 102,15,56,0,251 - - paddd %xmm8,%xmm0 - movdqa %xmm1,%xmm9 - - jnz L$oop_shaext - - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm1,%xmm1 - movdqu %xmm0,(%rdi) - movd %xmm1,16(%rdi) - .byte 0xf3,0xc3 - - .p2align 4 sha1_block_data_order_ssse3: _ssse3_shortcut: @@ -2625,7 +2456,6 @@ K_XX_XX: .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f -.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 6 diff --git a/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s b/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s index 94ea73c417..c48240f457 100644 --- a/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s +++ b/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s @@ -48,8 +48,6 @@ _sha256_block_data_order: movl 0(%r11),%r9d movl 4(%r11),%r10d movl 8(%r11),%r11d - testl $536870912,%r11d - jnz _shaext_shortcut testl $512,%r10d jnz L$ssse3_shortcut pushq %rbx @@ -1786,213 +1784,6 @@ K256: .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 6 -sha256_block_data_order_shaext: -_shaext_shortcut: - leaq K256+128(%rip),%rcx - movdqu (%rdi),%xmm1 - movdqu 16(%rdi),%xmm2 - movdqa 512-128(%rcx),%xmm7 - - pshufd $27,%xmm1,%xmm0 - pshufd $177,%xmm1,%xmm1 - pshufd $27,%xmm2,%xmm2 - movdqa %xmm7,%xmm8 -.byte 102,15,58,15,202,8 - punpcklqdq %xmm0,%xmm2 - jmp L$oop_shaext - -.p2align 4 -L$oop_shaext: - movdqu (%rsi),%xmm3 - movdqu 16(%rsi),%xmm4 - movdqu 32(%rsi),%xmm5 -.byte 102,15,56,0,223 - movdqu 48(%rsi),%xmm6 - - movdqa 0-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 102,15,56,0,231 - movdqa %xmm2,%xmm10 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - nop - movdqa %xmm1,%xmm9 -.byte 15,56,203,202 - - movdqa 32-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 102,15,56,0,239 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - leaq 64(%rsi),%rsi -.byte 15,56,204,220 -.byte 15,56,203,202 - - movdqa 64-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 102,15,56,0,247 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - - movdqa 96-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 128-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 160-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa 192-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa 224-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 256-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 288-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa 320-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa 352-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 384-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 416-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 -.byte 15,56,203,202 - paddd %xmm7,%xmm6 - - movdqa 448-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 -.byte 15,56,205,245 - movdqa %xmm8,%xmm7 -.byte 15,56,203,202 - - movdqa 480-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 - nop -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - decq %rdx - nop -.byte 15,56,203,202 - - paddd %xmm10,%xmm2 - paddd %xmm9,%xmm1 - jnz L$oop_shaext - - pshufd $177,%xmm2,%xmm2 - pshufd $27,%xmm1,%xmm7 - pshufd $177,%xmm1,%xmm1 - punpckhqdq %xmm2,%xmm1 -.byte 102,15,58,15,215,8 - - movdqu %xmm1,(%rdi) - movdqu %xmm2,16(%rdi) - .byte 0xf3,0xc3 - - -.p2align 6 sha256_block_data_order_ssse3: L$ssse3_shortcut: pushq %rbx @@ -2029,13 +1820,13 @@ L$loop_ssse3: movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 -.byte 102,15,56,0,199 movdqu 48(%rsi),%xmm3 +.byte 102,15,56,0,199 leaq K256(%rip),%rbp .byte 102,15,56,0,207 movdqa 0(%rbp),%xmm4 - movdqa 32(%rbp),%xmm5 .byte 102,15,56,0,215 + movdqa 32(%rbp),%xmm5 paddd %xmm0,%xmm4 movdqa 64(%rbp),%xmm6 .byte 102,15,56,0,223 |