author     Nikos Mavrogiannopoulos <nmav@gnutls.org>   2012-03-19 22:55:14 +0100
committer  Nikos Mavrogiannopoulos <nmav@gnutls.org>   2012-03-19 22:58:02 +0100
commit     9567d93c07f87ecb5c8560b7a45125de28710bc1 (patch)
tree       31a779ef6d1e51589dc257599dca05ea6a768c01 /lib
parent     abbfc182f738c654ebeaf75cf6893acc0947699b (diff)
download   gnutls-9567d93c07f87ecb5c8560b7a45125de28710bc1.tar.gz
updated openssl code
Diffstat (limited to 'lib')
-rw-r--r--  lib/accelerated/x86/README                            4
-rw-r--r--  lib/accelerated/x86/coff/padlock-x86-64-coff.s      162
-rw-r--r--  lib/accelerated/x86/coff/padlock-x86-coff.s         232
-rw-r--r--  lib/accelerated/x86/elf/padlock-x86-64.s            162
-rwxr-xr-x  lib/accelerated/x86/license.txt                       2
-rw-r--r--  lib/accelerated/x86/macosx/padlock-x86-64-macosx.s  162
-rw-r--r--  lib/accelerated/x86/macosx/padlock-x86-macosx.s     234
7 files changed, 704 insertions, 254 deletions
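
The main change in the diff below is that the old .Lecb_short/.Lcbc_short paths, which copied any short input through a stack buffer, are removed; instead both the aligned and unaligned code now bounce only the tail of the data through the stack when it would end too close to a 4 KB page boundary, presumably so the PadLock engine never touches bytes past the caller's buffer. A rough C sketch of that boundary check, as read from the new aligned ECB code (the function and macro names are invented for illustration and are not part of the patch):

```c
#include <stddef.h>
#include <stdint.h>

/* Illustrative only: mirrors the "leaq (%rsi,%rcx,1) / negq / andq $4095 /
 * cmpq $128" sequence added in the aligned ECB path.  The CBC path uses a
 * margin of 64 instead of 128. */
#define PAGE_MASK   4095u
#define ECB_MARGIN   128u

/* Returns how many trailing bytes should be staged through a stack bounce
 * buffer rather than handed to the PadLock engine in place. */
static size_t padlock_tail_bytes(const void *in, size_t len)
{
    uintptr_t end = (uintptr_t)in + len;

    /* Distance from the end of the data to the next page boundary:
     * (-(in + len)) & 4095, exactly as the neg/and pair computes it. */
    size_t to_boundary = (size_t)(-end) & PAGE_MASK;

    if (to_boundary >= ECB_MARGIN)
        return 0;                 /* whole buffer can be processed in place */

    /* Otherwise the last (len mod 128) bytes are copied to scratch space
     * on the stack and encrypted there; the rest is processed in place. */
    return len & (ECB_MARGIN - 1);
}
```

The unaligned path appears to apply the same distance test to round the final chunk down (the cmpq $128 / movq $-128 / cmovaeq sequence), so that only a last pass, if any, is routed through the new *_unaligned_tail labels.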
diff --git a/lib/accelerated/x86/README b/lib/accelerated/x86/README index 0dd5cb9855..ca3c546381 100644 --- a/lib/accelerated/x86/README +++ b/lib/accelerated/x86/README @@ -1,4 +1,4 @@ -The AES-NI and Padlock implementation by Andy Polyakov is not part of the -GnuTLS library, but is used with GnuTLS. Its license is included in +The AES-NI and Padlock implementation by Andy Polyakov are not part of the +GnuTLS library, but is used with GnuTLS. Their license is included in license.txt. diff --git a/lib/accelerated/x86/coff/padlock-x86-64-coff.s b/lib/accelerated/x86/coff/padlock-x86-64-coff.s index b69b33275e..9f658ee761 100644 --- a/lib/accelerated/x86/coff/padlock-x86-64-coff.s +++ b/lib/accelerated/x86/coff/padlock-x86-64-coff.s @@ -354,8 +354,6 @@ padlock_ecb_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $128,%rcx - jbe .Lecb_short testl $32,(%rdx) jnz .Lecb_aligned testq $15,%rdi @@ -375,6 +373,21 @@ padlock_ecb_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + cmpq %rbx,%rcx + ja .Lecb_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $128,%rax + movq $-128,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz .Lecb_unaligned_tail jmp .Lecb_loop .p2align 4 .Lecb_loop: @@ -404,8 +417,8 @@ padlock_ecb_encrypt: testq $15,%rdi jz .Lecb_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi + shrq $3,%rcx .byte 0xf3,0x48,0xa5 subq %rbx,%rdi .Lecb_out_aligned: @@ -415,9 +428,26 @@ padlock_ecb_encrypt: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx - jnz .Lecb_loop - + jz .Lecb_break + cmpq %rbx,%rcx + jae .Lecb_loop +.Lecb_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp .Lecb_loop +.p2align 4 +.Lecb_break: + cmpq %rbp,%rsp je .Lecb_done pxor %xmm0,%xmm0 @@ -431,26 +461,39 @@ padlock_ecb_encrypt: .Lecb_done: leaq (%rbp),%rsp jmp .Lecb_exit -.p2align 4 -.Lecb_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -.Lecb_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja .Lecb_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp .Lecb_loop + .p2align 4 .Lecb_aligned: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $128,%rbp + movq $128-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz .Lecb_aligned_tail leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx .byte 0xf3,0x0f,0xa7,200 + testq %rbp,%rbp + jz .Lecb_exit + +.Lecb_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp .Lecb_loop .Lecb_exit: movl $1,%eax leaq 8(%rsp),%rsp @@ -489,8 +532,6 @@ padlock_cbc_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $64,%rcx - jbe .Lcbc_short testl $32,(%rdx) jnz .Lcbc_aligned testq $15,%rdi @@ -510,6 +551,21 @@ padlock_cbc_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + cmpq %rbx,%rcx + ja .Lcbc_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $64,%rax + movq $-64,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz .Lcbc_unaligned_tail jmp .Lcbc_loop .p2align 4 .Lcbc_loop: @@ -541,8 +597,8 @@ 
padlock_cbc_encrypt: testq $15,%rdi jz .Lcbc_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi + shrq $3,%rcx .byte 0xf3,0x48,0xa5 subq %rbx,%rdi .Lcbc_out_aligned: @@ -552,9 +608,26 @@ padlock_cbc_encrypt: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx - jnz .Lcbc_loop - + jz .Lcbc_break + cmpq %rbx,%rcx + jae .Lcbc_loop +.Lcbc_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp .Lcbc_loop +.p2align 4 +.Lcbc_break: + cmpq %rbp,%rsp je .Lcbc_done pxor %xmm0,%xmm0 @@ -568,28 +641,41 @@ padlock_cbc_encrypt: .Lcbc_done: leaq (%rbp),%rsp jmp .Lcbc_exit -.p2align 4 -.Lcbc_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -.Lcbc_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja .Lcbc_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp .Lcbc_loop + .p2align 4 .Lcbc_aligned: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $64,%rbp + movq $64-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz .Lcbc_aligned_tail leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx .byte 0xf3,0x0f,0xa7,208 movdqa (%rax),%xmm0 movdqa %xmm0,-16(%rdx) + testq %rbp,%rbp + jz .Lcbc_exit + +.Lcbc_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp .Lcbc_loop .Lcbc_exit: movl $1,%eax leaq 8(%rsp),%rsp diff --git a/lib/accelerated/x86/coff/padlock-x86-coff.s b/lib/accelerated/x86/coff/padlock-x86-coff.s index b068083fa6..69eb468638 100644 --- a/lib/accelerated/x86/coff/padlock-x86-coff.s +++ b/lib/accelerated/x86/coff/padlock-x86-coff.s @@ -180,16 +180,14 @@ _padlock_ecb_encrypt: leal 16(%edx),%edx xorl %eax,%eax xorl %ebx,%ebx - cmpl $128,%ecx - jbe .L006ecb_short testl $32,(%edx) - jnz .L007ecb_aligned + jnz .L006ecb_aligned testl $15,%edi setz %al testl $15,%esi setz %bl testl %ebx,%eax - jnz .L007ecb_aligned + jnz .L006ecb_aligned negl %eax movl $512,%ebx notl %eax @@ -201,10 +199,28 @@ _padlock_ecb_encrypt: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp - jmp .L008ecb_loop + movl %eax,16(%ebp) + cmpl %ebx,%ecx + ja .L007ecb_loop + movl %esi,%eax + cmpl %esp,%ebp + cmovel %edi,%eax + addl %ecx,%eax + negl %eax + andl $4095,%eax + cmpl $128,%eax + movl $-128,%eax + cmovael %ebx,%eax + andl %eax,%ebx + jz .L008ecb_unaligned_tail + jmp .L007ecb_loop .align 16 -.L008ecb_loop: +.L007ecb_loop: movl %edi,(%ebp) movl %esi,4(%ebp) movl %ecx,8(%ebp) @@ -229,8 +245,8 @@ _padlock_ecb_encrypt: testl $15,%edi jz .L010ecb_out_aligned movl %ebx,%ecx - shrl $2,%ecx leal (%esp),%esi + shrl $2,%ecx .byte 243,165 subl %ebx,%edi .L010ecb_out_aligned: @@ -240,43 +256,75 @@ _padlock_ecb_encrypt: addl %ebx,%esi subl %ebx,%ecx movl $512,%ebx - jnz .L008ecb_loop + jz .L011ecb_break + cmpl %ebx,%ecx + jae .L007ecb_loop +.L008ecb_unaligned_tail: + xorl %eax,%eax + cmpl %ebp,%esp + cmovel %ecx,%eax + subl %eax,%esp + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp .L007ecb_loop +.align 16 +.L011ecb_break: cmpl %ebp,%esp - je .L011ecb_done + je .L012ecb_done pxor %xmm0,%xmm0 leal (%esp),%eax -.L012ecb_bzero: 
+.L013ecb_bzero: movaps %xmm0,(%eax) leal 16(%eax),%eax cmpl %eax,%ebp - ja .L012ecb_bzero -.L011ecb_done: + ja .L013ecb_bzero +.L012ecb_done: + movl 16(%ebp),%ebp leal 24(%ebp),%esp - jmp .L013ecb_exit + jmp .L014ecb_exit .align 16 -.L006ecb_short: +.L006ecb_aligned: + leal (%esi,%ecx,1),%ebp + negl %ebp + andl $4095,%ebp xorl %eax,%eax - leal -24(%esp),%ebp - subl %ecx,%eax - leal (%eax,%ebp,1),%esp - andl $-16,%esp - xorl %ebx,%ebx -.L014ecb_short_copy: - movups (%esi,%ebx,1),%xmm0 - leal 16(%ebx),%ebx - cmpl %ebx,%ecx - movaps %xmm0,-16(%esp,%ebx,1) - ja .L014ecb_short_copy - movl %esp,%esi - movl %ecx,%ebx - jmp .L008ecb_loop -.align 16 -.L007ecb_aligned: + cmpl $128,%ebp + movl $127,%ebp + cmovael %eax,%ebp + andl %ecx,%ebp + subl %ebp,%ecx + jz .L015ecb_aligned_tail leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx .byte 243,15,167,200 -.L013ecb_exit: + testl %ebp,%ebp + jz .L014ecb_exit +.L015ecb_aligned_tail: + movl %ebp,%ecx + leal -24(%esp),%ebp + movl %ebp,%esp + movl %ebp,%eax + subl %ecx,%esp + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp .L007ecb_loop +.L014ecb_exit: movl $1,%eax leal 4(%esp),%esp .L004ecb_abort: @@ -299,19 +347,17 @@ _padlock_cbc_encrypt: movl 28(%esp),%edx movl 32(%esp),%ecx testl $15,%edx - jnz .L015cbc_abort + jnz .L016cbc_abort testl $15,%ecx - jnz .L015cbc_abort + jnz .L016cbc_abort leal .Lpadlock_saved_context,%eax pushfl cld call __padlock_verify_ctx -.L016cbc_pic_point: +.L017cbc_pic_point: leal 16(%edx),%edx xorl %eax,%eax xorl %ebx,%ebx - cmpl $64,%ecx - jbe .L017cbc_short testl $32,(%edx) jnz .L018cbc_aligned testl $15,%edi @@ -331,7 +377,25 @@ _padlock_cbc_encrypt: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp + movl %eax,16(%ebp) + cmpl %ebx,%ecx + ja .L019cbc_loop + movl %esi,%eax + cmpl %esp,%ebp + cmovel %edi,%eax + addl %ecx,%eax + negl %eax + andl $4095,%eax + cmpl $64,%eax + movl $-64,%eax + cmovael %ebx,%eax + andl %eax,%ebx + jz .L020cbc_unaligned_tail jmp .L019cbc_loop .align 16 .L019cbc_loop: @@ -343,13 +407,13 @@ _padlock_cbc_encrypt: testl $15,%edi cmovnzl %esp,%edi testl $15,%esi - jz .L020cbc_inp_aligned + jz .L021cbc_inp_aligned shrl $2,%ecx .byte 243,165 subl %ebx,%edi movl %ebx,%ecx movl %edi,%esi -.L020cbc_inp_aligned: +.L021cbc_inp_aligned: leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx @@ -359,61 +423,93 @@ _padlock_cbc_encrypt: movl (%ebp),%edi movl 12(%ebp),%ebx testl $15,%edi - jz .L021cbc_out_aligned + jz .L022cbc_out_aligned movl %ebx,%ecx - shrl $2,%ecx leal (%esp),%esi + shrl $2,%ecx .byte 243,165 subl %ebx,%edi -.L021cbc_out_aligned: +.L022cbc_out_aligned: movl 4(%ebp),%esi movl 8(%ebp),%ecx addl %ebx,%edi addl %ebx,%esi subl %ebx,%ecx movl $512,%ebx - jnz .L019cbc_loop + jz .L023cbc_break + cmpl %ebx,%ecx + jae .L019cbc_loop +.L020cbc_unaligned_tail: + xorl %eax,%eax + cmpl %ebp,%esp + cmovel %ecx,%eax + subl %eax,%esp + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp .L019cbc_loop +.align 16 +.L023cbc_break: cmpl %ebp,%esp - je .L022cbc_done + je .L024cbc_done pxor %xmm0,%xmm0 leal (%esp),%eax -.L023cbc_bzero: +.L025cbc_bzero: movaps %xmm0,(%eax) leal 16(%eax),%eax cmpl %eax,%ebp - ja .L023cbc_bzero -.L022cbc_done: + ja .L025cbc_bzero +.L024cbc_done: + movl 16(%ebp),%ebp leal 
24(%ebp),%esp - jmp .L024cbc_exit -.align 16 -.L017cbc_short: - xorl %eax,%eax - leal -24(%esp),%ebp - subl %ecx,%eax - leal (%eax,%ebp,1),%esp - andl $-16,%esp - xorl %ebx,%ebx -.L025cbc_short_copy: - movups (%esi,%ebx,1),%xmm0 - leal 16(%ebx),%ebx - cmpl %ebx,%ecx - movaps %xmm0,-16(%esp,%ebx,1) - ja .L025cbc_short_copy - movl %esp,%esi - movl %ecx,%ebx - jmp .L019cbc_loop + jmp .L026cbc_exit .align 16 .L018cbc_aligned: + leal (%esi,%ecx,1),%ebp + negl %ebp + andl $4095,%ebp + xorl %eax,%eax + cmpl $64,%ebp + movl $63,%ebp + cmovael %eax,%ebp + andl %ecx,%ebp + subl %ebp,%ecx + jz .L027cbc_aligned_tail leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx .byte 243,15,167,208 movaps (%eax),%xmm0 movaps %xmm0,-16(%edx) -.L024cbc_exit: + testl %ebp,%ebp + jz .L026cbc_exit +.L027cbc_aligned_tail: + movl %ebp,%ecx + leal -24(%esp),%ebp + movl %ebp,%esp + movl %ebp,%eax + subl %ecx,%esp + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp .L019cbc_loop +.L026cbc_exit: movl $1,%eax leal 4(%esp),%esp -.L015cbc_abort: +.L016cbc_abort: popl %edi popl %esi popl %ebx @@ -437,10 +533,10 @@ __win32_segv_handler: movl 4(%esp),%edx movl 12(%esp),%ecx cmpl $3221225477,(%edx) - jne .L026ret + jne .L028ret addl $4,184(%ecx) movl $0,%eax -.L026ret: +.L028ret: ret .globl _padlock_sha1_oneshot .def _padlock_sha1_oneshot; .scl 2; .type 32; .endef diff --git a/lib/accelerated/x86/elf/padlock-x86-64.s b/lib/accelerated/x86/elf/padlock-x86-64.s index bf5f62600c..4709ac2273 100644 --- a/lib/accelerated/x86/elf/padlock-x86-64.s +++ b/lib/accelerated/x86/elf/padlock-x86-64.s @@ -276,8 +276,6 @@ padlock_ecb_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $128,%rcx - jbe .Lecb_short testl $32,(%rdx) jnz .Lecb_aligned testq $15,%rdi @@ -297,6 +295,21 @@ padlock_ecb_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + cmpq %rbx,%rcx + ja .Lecb_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $128,%rax + movq $-128,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz .Lecb_unaligned_tail jmp .Lecb_loop .align 16 .Lecb_loop: @@ -326,8 +339,8 @@ padlock_ecb_encrypt: testq $15,%rdi jz .Lecb_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi + shrq $3,%rcx .byte 0xf3,0x48,0xa5 subq %rbx,%rdi .Lecb_out_aligned: @@ -337,9 +350,26 @@ padlock_ecb_encrypt: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx - jnz .Lecb_loop - + jz .Lecb_break + cmpq %rbx,%rcx + jae .Lecb_loop +.Lecb_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp .Lecb_loop +.align 16 +.Lecb_break: + cmpq %rbp,%rsp je .Lecb_done pxor %xmm0,%xmm0 @@ -353,26 +383,39 @@ padlock_ecb_encrypt: .Lecb_done: leaq (%rbp),%rsp jmp .Lecb_exit -.align 16 -.Lecb_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -.Lecb_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja .Lecb_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp .Lecb_loop + .align 16 .Lecb_aligned: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $128,%rbp + movq $128-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz .Lecb_aligned_tail leaq -16(%rdx),%rax leaq 
16(%rdx),%rbx shrq $4,%rcx .byte 0xf3,0x0f,0xa7,200 + testq %rbp,%rbp + jz .Lecb_exit + +.Lecb_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp .Lecb_loop .Lecb_exit: movl $1,%eax leaq 8(%rsp),%rsp @@ -400,8 +443,6 @@ padlock_cbc_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $64,%rcx - jbe .Lcbc_short testl $32,(%rdx) jnz .Lcbc_aligned testq $15,%rdi @@ -421,6 +462,21 @@ padlock_cbc_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + cmpq %rbx,%rcx + ja .Lcbc_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $64,%rax + movq $-64,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz .Lcbc_unaligned_tail jmp .Lcbc_loop .align 16 .Lcbc_loop: @@ -452,8 +508,8 @@ padlock_cbc_encrypt: testq $15,%rdi jz .Lcbc_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi + shrq $3,%rcx .byte 0xf3,0x48,0xa5 subq %rbx,%rdi .Lcbc_out_aligned: @@ -463,9 +519,26 @@ padlock_cbc_encrypt: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx - jnz .Lcbc_loop - + jz .Lcbc_break + cmpq %rbx,%rcx + jae .Lcbc_loop +.Lcbc_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp .Lcbc_loop +.align 16 +.Lcbc_break: + cmpq %rbp,%rsp je .Lcbc_done pxor %xmm0,%xmm0 @@ -479,28 +552,41 @@ padlock_cbc_encrypt: .Lcbc_done: leaq (%rbp),%rsp jmp .Lcbc_exit -.align 16 -.Lcbc_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -.Lcbc_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja .Lcbc_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp .Lcbc_loop + .align 16 .Lcbc_aligned: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $64,%rbp + movq $64-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz .Lcbc_aligned_tail leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx .byte 0xf3,0x0f,0xa7,208 movdqa (%rax),%xmm0 movdqa %xmm0,-16(%rdx) + testq %rbp,%rbp + jz .Lcbc_exit + +.Lcbc_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp .Lcbc_loop .Lcbc_exit: movl $1,%eax leaq 8(%rsp),%rsp diff --git a/lib/accelerated/x86/license.txt b/lib/accelerated/x86/license.txt index c87ba42b08..929ddd59e3 100755 --- a/lib/accelerated/x86/license.txt +++ b/lib/accelerated/x86/license.txt @@ -5,7 +5,7 @@ CRYPTOGAMS licenses depending on where you obtain it. For further details see http://www.openssl.org/~appro/cryptogams/. ==================================================================== -Copyright (c) 2006, CRYPTOGAMS by <appro@openssl.org> +Copyright (c) 2006-2012, CRYPTOGAMS by <appro@openssl.org> All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s b/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s index 9b912f9202..dbd89daabc 100644 --- a/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s +++ b/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s @@ -276,8 +276,6 @@ _padlock_ecb_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $128,%rcx - jbe L$ecb_short testl $32,(%rdx) jnz L$ecb_aligned testq $15,%rdi @@ -297,6 +295,21 @@ _padlock_ecb_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + cmpq %rbx,%rcx + ja L$ecb_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $128,%rax + movq $-128,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz L$ecb_unaligned_tail jmp L$ecb_loop .p2align 4 L$ecb_loop: @@ -326,8 +339,8 @@ L$ecb_inp_aligned: testq $15,%rdi jz L$ecb_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi + shrq $3,%rcx .byte 0xf3,0x48,0xa5 subq %rbx,%rdi L$ecb_out_aligned: @@ -337,9 +350,26 @@ L$ecb_out_aligned: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx - jnz L$ecb_loop - + jz L$ecb_break + cmpq %rbx,%rcx + jae L$ecb_loop +L$ecb_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp L$ecb_loop +.p2align 4 +L$ecb_break: + cmpq %rbp,%rsp je L$ecb_done pxor %xmm0,%xmm0 @@ -353,26 +383,39 @@ L$ecb_bzero: L$ecb_done: leaq (%rbp),%rsp jmp L$ecb_exit -.p2align 4 -L$ecb_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -L$ecb_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja L$ecb_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp L$ecb_loop + .p2align 4 L$ecb_aligned: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $128,%rbp + movq $128-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz L$ecb_aligned_tail leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx .byte 0xf3,0x0f,0xa7,200 + testq %rbp,%rbp + jz L$ecb_exit + +L$ecb_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp L$ecb_loop L$ecb_exit: movl $1,%eax leaq 8(%rsp),%rsp @@ -400,8 +443,6 @@ _padlock_cbc_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $64,%rcx - jbe L$cbc_short testl $32,(%rdx) jnz L$cbc_aligned testq $15,%rdi @@ -421,6 +462,21 @@ _padlock_cbc_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + cmpq %rbx,%rcx + ja L$cbc_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $64,%rax + movq $-64,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz L$cbc_unaligned_tail jmp L$cbc_loop .p2align 4 L$cbc_loop: @@ -452,8 +508,8 @@ L$cbc_inp_aligned: testq $15,%rdi jz L$cbc_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi + shrq $3,%rcx .byte 0xf3,0x48,0xa5 subq %rbx,%rdi L$cbc_out_aligned: @@ -463,9 +519,26 @@ L$cbc_out_aligned: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx - jnz L$cbc_loop - + jz L$cbc_break + cmpq %rbx,%rcx + jae L$cbc_loop +L$cbc_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx 
+ subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp L$cbc_loop +.p2align 4 +L$cbc_break: + cmpq %rbp,%rsp je L$cbc_done pxor %xmm0,%xmm0 @@ -479,28 +552,41 @@ L$cbc_bzero: L$cbc_done: leaq (%rbp),%rsp jmp L$cbc_exit -.p2align 4 -L$cbc_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -L$cbc_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja L$cbc_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp L$cbc_loop + .p2align 4 L$cbc_aligned: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $64,%rbp + movq $64-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz L$cbc_aligned_tail leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx .byte 0xf3,0x0f,0xa7,208 movdqa (%rax),%xmm0 movdqa %xmm0,-16(%rdx) + testq %rbp,%rbp + jz L$cbc_exit + +L$cbc_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp L$cbc_loop L$cbc_exit: movl $1,%eax leaq 8(%rsp),%rsp diff --git a/lib/accelerated/x86/macosx/padlock-x86-macosx.s b/lib/accelerated/x86/macosx/padlock-x86-macosx.s index 02b427e6a8..40cfce9af2 100644 --- a/lib/accelerated/x86/macosx/padlock-x86-macosx.s +++ b/lib/accelerated/x86/macosx/padlock-x86-macosx.s @@ -174,16 +174,14 @@ L005ecb_pic_point: leal 16(%edx),%edx xorl %eax,%eax xorl %ebx,%ebx - cmpl $128,%ecx - jbe L006ecb_short testl $32,(%edx) - jnz L007ecb_aligned + jnz L006ecb_aligned testl $15,%edi setz %al testl $15,%esi setz %bl testl %ebx,%eax - jnz L007ecb_aligned + jnz L006ecb_aligned negl %eax movl $512,%ebx notl %eax @@ -195,10 +193,28 @@ L005ecb_pic_point: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp - jmp L008ecb_loop + movl %eax,16(%ebp) + cmpl %ebx,%ecx + ja L007ecb_loop + movl %esi,%eax + cmpl %esp,%ebp + cmovel %edi,%eax + addl %ecx,%eax + negl %eax + andl $4095,%eax + cmpl $128,%eax + movl $-128,%eax + cmovael %ebx,%eax + andl %eax,%ebx + jz L008ecb_unaligned_tail + jmp L007ecb_loop .align 4,0x90 -L008ecb_loop: +L007ecb_loop: movl %edi,(%ebp) movl %esi,4(%ebp) movl %ecx,8(%ebp) @@ -223,8 +239,8 @@ L009ecb_inp_aligned: testl $15,%edi jz L010ecb_out_aligned movl %ebx,%ecx - shrl $2,%ecx leal (%esp),%esi + shrl $2,%ecx .byte 243,165 subl %ebx,%edi L010ecb_out_aligned: @@ -234,43 +250,75 @@ L010ecb_out_aligned: addl %ebx,%esi subl %ebx,%ecx movl $512,%ebx - jnz L008ecb_loop + jz L011ecb_break + cmpl %ebx,%ecx + jae L007ecb_loop +L008ecb_unaligned_tail: + xorl %eax,%eax + cmpl %ebp,%esp + cmovel %ecx,%eax + subl %eax,%esp + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp L007ecb_loop +.align 4,0x90 +L011ecb_break: cmpl %ebp,%esp - je L011ecb_done + je L012ecb_done pxor %xmm0,%xmm0 leal (%esp),%eax -L012ecb_bzero: +L013ecb_bzero: movaps %xmm0,(%eax) leal 16(%eax),%eax cmpl %eax,%ebp - ja L012ecb_bzero -L011ecb_done: + ja L013ecb_bzero +L012ecb_done: + movl 16(%ebp),%ebp leal 24(%ebp),%esp - jmp L013ecb_exit + jmp L014ecb_exit .align 4,0x90 -L006ecb_short: +L006ecb_aligned: + leal (%esi,%ecx,1),%ebp + negl %ebp + andl $4095,%ebp xorl %eax,%eax - leal -24(%esp),%ebp - subl %ecx,%eax - leal (%eax,%ebp,1),%esp - andl $-16,%esp - xorl %ebx,%ebx -L014ecb_short_copy: - 
movups (%esi,%ebx,1),%xmm0 - leal 16(%ebx),%ebx - cmpl %ebx,%ecx - movaps %xmm0,-16(%esp,%ebx,1) - ja L014ecb_short_copy - movl %esp,%esi - movl %ecx,%ebx - jmp L008ecb_loop -.align 4,0x90 -L007ecb_aligned: + cmpl $128,%ebp + movl $127,%ebp + cmovael %eax,%ebp + andl %ecx,%ebp + subl %ebp,%ecx + jz L015ecb_aligned_tail leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx .byte 243,15,167,200 -L013ecb_exit: + testl %ebp,%ebp + jz L014ecb_exit +L015ecb_aligned_tail: + movl %ebp,%ecx + leal -24(%esp),%ebp + movl %ebp,%esp + movl %ebp,%eax + subl %ecx,%esp + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp L007ecb_loop +L014ecb_exit: movl $1,%eax leal 4(%esp),%esp L004ecb_abort: @@ -292,19 +340,17 @@ L_padlock_cbc_encrypt_begin: movl 28(%esp),%edx movl 32(%esp),%ecx testl $15,%edx - jnz L015cbc_abort + jnz L016cbc_abort testl $15,%ecx - jnz L015cbc_abort - leal Lpadlock_saved_context-L016cbc_pic_point,%eax + jnz L016cbc_abort + leal Lpadlock_saved_context-L017cbc_pic_point,%eax pushfl cld call __padlock_verify_ctx -L016cbc_pic_point: +L017cbc_pic_point: leal 16(%edx),%edx xorl %eax,%eax xorl %ebx,%ebx - cmpl $64,%ecx - jbe L017cbc_short testl $32,(%edx) jnz L018cbc_aligned testl $15,%edi @@ -324,7 +370,25 @@ L016cbc_pic_point: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp + movl %eax,16(%ebp) + cmpl %ebx,%ecx + ja L019cbc_loop + movl %esi,%eax + cmpl %esp,%ebp + cmovel %edi,%eax + addl %ecx,%eax + negl %eax + andl $4095,%eax + cmpl $64,%eax + movl $-64,%eax + cmovael %ebx,%eax + andl %eax,%ebx + jz L020cbc_unaligned_tail jmp L019cbc_loop .align 4,0x90 L019cbc_loop: @@ -336,13 +400,13 @@ L019cbc_loop: testl $15,%edi cmovnzl %esp,%edi testl $15,%esi - jz L020cbc_inp_aligned + jz L021cbc_inp_aligned shrl $2,%ecx .byte 243,165 subl %ebx,%edi movl %ebx,%ecx movl %edi,%esi -L020cbc_inp_aligned: +L021cbc_inp_aligned: leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx @@ -352,61 +416,93 @@ L020cbc_inp_aligned: movl (%ebp),%edi movl 12(%ebp),%ebx testl $15,%edi - jz L021cbc_out_aligned + jz L022cbc_out_aligned movl %ebx,%ecx - shrl $2,%ecx leal (%esp),%esi + shrl $2,%ecx .byte 243,165 subl %ebx,%edi -L021cbc_out_aligned: +L022cbc_out_aligned: movl 4(%ebp),%esi movl 8(%ebp),%ecx addl %ebx,%edi addl %ebx,%esi subl %ebx,%ecx movl $512,%ebx - jnz L019cbc_loop + jz L023cbc_break + cmpl %ebx,%ecx + jae L019cbc_loop +L020cbc_unaligned_tail: + xorl %eax,%eax + cmpl %ebp,%esp + cmovel %ecx,%eax + subl %eax,%esp + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp L019cbc_loop +.align 4,0x90 +L023cbc_break: cmpl %ebp,%esp - je L022cbc_done + je L024cbc_done pxor %xmm0,%xmm0 leal (%esp),%eax -L023cbc_bzero: +L025cbc_bzero: movaps %xmm0,(%eax) leal 16(%eax),%eax cmpl %eax,%ebp - ja L023cbc_bzero -L022cbc_done: + ja L025cbc_bzero +L024cbc_done: + movl 16(%ebp),%ebp leal 24(%ebp),%esp - jmp L024cbc_exit -.align 4,0x90 -L017cbc_short: - xorl %eax,%eax - leal -24(%esp),%ebp - subl %ecx,%eax - leal (%eax,%ebp,1),%esp - andl $-16,%esp - xorl %ebx,%ebx -L025cbc_short_copy: - movups (%esi,%ebx,1),%xmm0 - leal 16(%ebx),%ebx - cmpl %ebx,%ecx - movaps %xmm0,-16(%esp,%ebx,1) - ja L025cbc_short_copy - movl %esp,%esi - movl %ecx,%ebx - jmp L019cbc_loop + jmp L026cbc_exit .align 4,0x90 L018cbc_aligned: + leal 
(%esi,%ecx,1),%ebp + negl %ebp + andl $4095,%ebp + xorl %eax,%eax + cmpl $64,%ebp + movl $63,%ebp + cmovael %eax,%ebp + andl %ecx,%ebp + subl %ebp,%ecx + jz L027cbc_aligned_tail leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx .byte 243,15,167,208 movaps (%eax),%xmm0 movaps %xmm0,-16(%edx) -L024cbc_exit: + testl %ebp,%ebp + jz L026cbc_exit +L027cbc_aligned_tail: + movl %ebp,%ecx + leal -24(%esp),%ebp + movl %ebp,%esp + movl %ebp,%eax + subl %ecx,%esp + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp L019cbc_loop +L026cbc_exit: movl $1,%eax leal 4(%esp),%esp -L015cbc_abort: +L016cbc_abort: popl %edi popl %esi popl %ebx @@ -428,10 +524,10 @@ __win32_segv_handler: movl 4(%esp),%edx movl 12(%esp),%ecx cmpl $3221225477,(%edx) - jne L026ret + jne L028ret addl $4,184(%ecx) movl $0,%eax -L026ret: +L028ret: ret .globl _padlock_sha1_oneshot .align 4 |