author     Nikos Mavrogiannopoulos <nmav@gnutls.org>   2012-03-19 22:55:14 +0100
committer  Nikos Mavrogiannopoulos <nmav@gnutls.org>   2012-03-19 22:58:02 +0100
commit     9567d93c07f87ecb5c8560b7a45125de28710bc1 (patch)
tree       31a779ef6d1e51589dc257599dca05ea6a768c01 /lib
parent     abbfc182f738c654ebeaf75cf6893acc0947699b (diff)
download   gnutls-9567d93c07f87ecb5c8560b7a45125de28710bc1.tar.gz
updated openssl code
Diffstat (limited to 'lib')
-rw-r--r--  lib/accelerated/x86/README                            4
-rw-r--r--  lib/accelerated/x86/coff/padlock-x86-64-coff.s      162
-rw-r--r--  lib/accelerated/x86/coff/padlock-x86-coff.s         232
-rw-r--r--  lib/accelerated/x86/elf/padlock-x86-64.s            162
-rwxr-xr-x  lib/accelerated/x86/license.txt                       2
-rw-r--r--  lib/accelerated/x86/macosx/padlock-x86-64-macosx.s  162
-rw-r--r--  lib/accelerated/x86/macosx/padlock-x86-macosx.s     234
7 files changed, 704 insertions, 254 deletions
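
The main change in the diff below is that the old .Lecb_short/.Lcbc_short paths, which copied any short input through a stack buffer, are removed; instead both the aligned and unaligned code now bounce only the tail of the data through the stack when it would end too close to a 4 KB page boundary, presumably so the PadLock engine never touches bytes past the caller's buffer. A rough C sketch of that boundary check, as read from the new aligned ECB code (the function and macro names are invented for illustration and are not part of the patch):

```c
#include <stddef.h>
#include <stdint.h>

/* Illustrative only: mirrors the "leaq (%rsi,%rcx,1) / negq / andq $4095 /
 * cmpq $128" sequence added in the aligned ECB path.  The CBC path uses a
 * margin of 64 instead of 128. */
#define PAGE_MASK   4095u
#define ECB_MARGIN   128u

/* Returns how many trailing bytes should be staged through a stack bounce
 * buffer rather than handed to the PadLock engine in place. */
static size_t padlock_tail_bytes(const void *in, size_t len)
{
    uintptr_t end = (uintptr_t)in + len;

    /* Distance from the end of the data to the next page boundary:
     * (-(in + len)) & 4095, exactly as the neg/and pair computes it. */
    size_t to_boundary = (size_t)(-end) & PAGE_MASK;

    if (to_boundary >= ECB_MARGIN)
        return 0;                 /* whole buffer can be processed in place */

    /* Otherwise the last (len mod 128) bytes are copied to scratch space
     * on the stack and encrypted there; the rest is processed in place. */
    return len & (ECB_MARGIN - 1);
}
```

The unaligned path appears to apply the same distance test to round the final chunk down (the cmpq $128 / movq $-128 / cmovaeq sequence), so that only a last pass, if any, is routed through the new *_unaligned_tail labels.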
diff --git a/lib/accelerated/x86/README b/lib/accelerated/x86/README index 0dd5cb9855..ca3c546381 100644 --- a/lib/accelerated/x86/README +++ b/lib/accelerated/x86/README @@ -1,4 +1,4 @@ -The AES-NI and Padlock implementation by Andy Polyakov is not part of the -GnuTLS library, but is used with GnuTLS. Its license is included in +The AES-NI and Padlock implementation by Andy Polyakov are not part of the +GnuTLS library, but is used with GnuTLS. Their license is included in license.txt. diff --git a/lib/accelerated/x86/coff/padlock-x86-64-coff.s b/lib/accelerated/x86/coff/padlock-x86-64-coff.s index b69b33275e..9f658ee761 100644 --- a/lib/accelerated/x86/coff/padlock-x86-64-coff.s +++ b/lib/accelerated/x86/coff/padlock-x86-64-coff.s @@ -354,8 +354,6 @@ padlock_ecb_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $128,%rcx - jbe .Lecb_short testl $32,(%rdx) jnz .Lecb_aligned testq $15,%rdi @@ -375,6 +373,21 @@ padlock_ecb_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + cmpq %rbx,%rcx + ja .Lecb_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $128,%rax + movq $-128,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz .Lecb_unaligned_tail jmp .Lecb_loop .p2align 4 .Lecb_loop: @@ -404,8 +417,8 @@ padlock_ecb_encrypt: testq $15,%rdi jz .Lecb_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi + shrq $3,%rcx .byte 0xf3,0x48,0xa5 subq %rbx,%rdi .Lecb_out_aligned: @@ -415,9 +428,26 @@ padlock_ecb_encrypt: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx - jnz .Lecb_loop - + jz .Lecb_break + cmpq %rbx,%rcx + jae .Lecb_loop +.Lecb_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp .Lecb_loop +.p2align 4 +.Lecb_break: + cmpq %rbp,%rsp je .Lecb_done pxor %xmm0,%xmm0 @@ -431,26 +461,39 @@ padlock_ecb_encrypt: .Lecb_done: leaq (%rbp),%rsp jmp .Lecb_exit -.p2align 4 -.Lecb_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -.Lecb_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja .Lecb_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp .Lecb_loop + .p2align 4 .Lecb_aligned: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $128,%rbp + movq $128-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz .Lecb_aligned_tail leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx .byte 0xf3,0x0f,0xa7,200 + testq %rbp,%rbp + jz .Lecb_exit + +.Lecb_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp .Lecb_loop .Lecb_exit: movl $1,%eax leaq 8(%rsp),%rsp @@ -489,8 +532,6 @@ padlock_cbc_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $64,%rcx - jbe .Lcbc_short testl $32,(%rdx) jnz .Lcbc_aligned testq $15,%rdi @@ -510,6 +551,21 @@ padlock_cbc_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + cmpq %rbx,%rcx + ja .Lcbc_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $64,%rax + movq $-64,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz .Lcbc_unaligned_tail jmp .Lcbc_loop .p2align 4 .Lcbc_loop: @@ -541,8 +597,8 @@ 
padlock_cbc_encrypt: testq $15,%rdi jz .Lcbc_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi + shrq $3,%rcx .byte 0xf3,0x48,0xa5 subq %rbx,%rdi .Lcbc_out_aligned: @@ -552,9 +608,26 @@ padlock_cbc_encrypt: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx - jnz .Lcbc_loop - + jz .Lcbc_break + cmpq %rbx,%rcx + jae .Lcbc_loop +.Lcbc_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp .Lcbc_loop +.p2align 4 +.Lcbc_break: + cmpq %rbp,%rsp je .Lcbc_done pxor %xmm0,%xmm0 @@ -568,28 +641,41 @@ padlock_cbc_encrypt: .Lcbc_done: leaq (%rbp),%rsp jmp .Lcbc_exit -.p2align 4 -.Lcbc_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -.Lcbc_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja .Lcbc_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp .Lcbc_loop + .p2align 4 .Lcbc_aligned: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $64,%rbp + movq $64-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz .Lcbc_aligned_tail leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx .byte 0xf3,0x0f,0xa7,208 movdqa (%rax),%xmm0 movdqa %xmm0,-16(%rdx) + testq %rbp,%rbp + jz .Lcbc_exit + +.Lcbc_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp .Lcbc_loop .Lcbc_exit: movl $1,%eax leaq 8(%rsp),%rsp diff --git a/lib/accelerated/x86/coff/padlock-x86-coff.s b/lib/accelerated/x86/coff/padlock-x86-coff.s index b068083fa6..69eb468638 100644 --- a/lib/accelerated/x86/coff/padlock-x86-coff.s +++ b/lib/accelerated/x86/coff/padlock-x86-coff.s @@ -180,16 +180,14 @@ _padlock_ecb_encrypt: leal 16(%edx),%edx xorl %eax,%eax xorl %ebx,%ebx - cmpl $128,%ecx - jbe .L006ecb_short testl $32,(%edx) - jnz .L007ecb_aligned + jnz .L006ecb_aligned testl $15,%edi setz %al testl $15,%esi setz %bl testl %ebx,%eax - jnz .L007ecb_aligned + jnz .L006ecb_aligned negl %eax movl $512,%ebx notl %eax @@ -201,10 +199,28 @@ _padlock_ecb_encrypt: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp - jmp .L008ecb_loop + movl %eax,16(%ebp) + cmpl %ebx,%ecx + ja .L007ecb_loop + movl %esi,%eax + cmpl %esp,%ebp + cmovel %edi,%eax + addl %ecx,%eax + negl %eax + andl $4095,%eax + cmpl $128,%eax + movl $-128,%eax + cmovael %ebx,%eax + andl %eax,%ebx + jz .L008ecb_unaligned_tail + jmp .L007ecb_loop .align 16 -.L008ecb_loop: +.L007ecb_loop: movl %edi,(%ebp) movl %esi,4(%ebp) movl %ecx,8(%ebp) @@ -229,8 +245,8 @@ _padlock_ecb_encrypt: testl $15,%edi jz .L010ecb_out_aligned movl %ebx,%ecx - shrl $2,%ecx leal (%esp),%esi + shrl $2,%ecx .byte 243,165 subl %ebx,%edi .L010ecb_out_aligned: @@ -240,43 +256,75 @@ _padlock_ecb_encrypt: addl %ebx,%esi subl %ebx,%ecx movl $512,%ebx - jnz .L008ecb_loop + jz .L011ecb_break + cmpl %ebx,%ecx + jae .L007ecb_loop +.L008ecb_unaligned_tail: + xorl %eax,%eax + cmpl %ebp,%esp + cmovel %ecx,%eax + subl %eax,%esp + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp .L007ecb_loop +.align 16 +.L011ecb_break: cmpl %ebp,%esp - je .L011ecb_done + je .L012ecb_done pxor %xmm0,%xmm0 leal (%esp),%eax -.L012ecb_bzero: 
+.L013ecb_bzero: movaps %xmm0,(%eax) leal 16(%eax),%eax cmpl %eax,%ebp - ja .L012ecb_bzero -.L011ecb_done: + ja .L013ecb_bzero +.L012ecb_done: + movl 16(%ebp),%ebp leal 24(%ebp),%esp - jmp .L013ecb_exit + jmp .L014ecb_exit .align 16 -.L006ecb_short: +.L006ecb_aligned: + leal (%esi,%ecx,1),%ebp + negl %ebp + andl $4095,%ebp xorl %eax,%eax - leal -24(%esp),%ebp - subl %ecx,%eax - leal (%eax,%ebp,1),%esp - andl $-16,%esp - xorl %ebx,%ebx -.L014ecb_short_copy: - movups (%esi,%ebx,1),%xmm0 - leal 16(%ebx),%ebx - cmpl %ebx,%ecx - movaps %xmm0,-16(%esp,%ebx,1) - ja .L014ecb_short_copy - movl %esp,%esi - movl %ecx,%ebx - jmp .L008ecb_loop -.align 16 -.L007ecb_aligned: + cmpl $128,%ebp + movl $127,%ebp + cmovael %eax,%ebp + andl %ecx,%ebp + subl %ebp,%ecx + jz .L015ecb_aligned_tail leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx .byte 243,15,167,200 -.L013ecb_exit: + testl %ebp,%ebp + jz .L014ecb_exit +.L015ecb_aligned_tail: + movl %ebp,%ecx + leal -24(%esp),%ebp + movl %ebp,%esp + movl %ebp,%eax + subl %ecx,%esp + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp .L007ecb_loop +.L014ecb_exit: movl $1,%eax leal 4(%esp),%esp .L004ecb_abort: @@ -299,19 +347,17 @@ _padlock_cbc_encrypt: movl 28(%esp),%edx movl 32(%esp),%ecx testl $15,%edx - jnz .L015cbc_abort + jnz .L016cbc_abort testl $15,%ecx - jnz .L015cbc_abort + jnz .L016cbc_abort leal .Lpadlock_saved_context,%eax pushfl cld call __padlock_verify_ctx -.L016cbc_pic_point: +.L017cbc_pic_point: leal 16(%edx),%edx xorl %eax,%eax xorl %ebx,%ebx - cmpl $64,%ecx - jbe .L017cbc_short testl $32,(%edx) jnz .L018cbc_aligned testl $15,%edi @@ -331,7 +377,25 @@ _padlock_cbc_encrypt: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp + movl %eax,16(%ebp) + cmpl %ebx,%ecx + ja .L019cbc_loop + movl %esi,%eax + cmpl %esp,%ebp + cmovel %edi,%eax + addl %ecx,%eax + negl %eax + andl $4095,%eax + cmpl $64,%eax + movl $-64,%eax + cmovael %ebx,%eax + andl %eax,%ebx + jz .L020cbc_unaligned_tail jmp .L019cbc_loop .align 16 .L019cbc_loop: @@ -343,13 +407,13 @@ _padlock_cbc_encrypt: testl $15,%edi cmovnzl %esp,%edi testl $15,%esi - jz .L020cbc_inp_aligned + jz .L021cbc_inp_aligned shrl $2,%ecx .byte 243,165 subl %ebx,%edi movl %ebx,%ecx movl %edi,%esi -.L020cbc_inp_aligned: +.L021cbc_inp_aligned: leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx @@ -359,61 +423,93 @@ _padlock_cbc_encrypt: movl (%ebp),%edi movl 12(%ebp),%ebx testl $15,%edi - jz .L021cbc_out_aligned + jz .L022cbc_out_aligned movl %ebx,%ecx - shrl $2,%ecx leal (%esp),%esi + shrl $2,%ecx .byte 243,165 subl %ebx,%edi -.L021cbc_out_aligned: +.L022cbc_out_aligned: movl 4(%ebp),%esi movl 8(%ebp),%ecx addl %ebx,%edi addl %ebx,%esi subl %ebx,%ecx movl $512,%ebx - jnz .L019cbc_loop + jz .L023cbc_break + cmpl %ebx,%ecx + jae .L019cbc_loop +.L020cbc_unaligned_tail: + xorl %eax,%eax + cmpl %ebp,%esp + cmovel %ecx,%eax + subl %eax,%esp + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp .L019cbc_loop +.align 16 +.L023cbc_break: cmpl %ebp,%esp - je .L022cbc_done + je .L024cbc_done pxor %xmm0,%xmm0 leal (%esp),%eax -.L023cbc_bzero: +.L025cbc_bzero: movaps %xmm0,(%eax) leal 16(%eax),%eax cmpl %eax,%ebp - ja .L023cbc_bzero -.L022cbc_done: + ja .L025cbc_bzero +.L024cbc_done: + movl 16(%ebp),%ebp leal 
24(%ebp),%esp - jmp .L024cbc_exit -.align 16 -.L017cbc_short: - xorl %eax,%eax - leal -24(%esp),%ebp - subl %ecx,%eax - leal (%eax,%ebp,1),%esp - andl $-16,%esp - xorl %ebx,%ebx -.L025cbc_short_copy: - movups (%esi,%ebx,1),%xmm0 - leal 16(%ebx),%ebx - cmpl %ebx,%ecx - movaps %xmm0,-16(%esp,%ebx,1) - ja .L025cbc_short_copy - movl %esp,%esi - movl %ecx,%ebx - jmp .L019cbc_loop + jmp .L026cbc_exit .align 16 .L018cbc_aligned: + leal (%esi,%ecx,1),%ebp + negl %ebp + andl $4095,%ebp + xorl %eax,%eax + cmpl $64,%ebp + movl $63,%ebp + cmovael %eax,%ebp + andl %ecx,%ebp + subl %ebp,%ecx + jz .L027cbc_aligned_tail leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx .byte 243,15,167,208 movaps (%eax),%xmm0 movaps %xmm0,-16(%edx) -.L024cbc_exit: + testl %ebp,%ebp + jz .L026cbc_exit +.L027cbc_aligned_tail: + movl %ebp,%ecx + leal -24(%esp),%ebp + movl %ebp,%esp + movl %ebp,%eax + subl %ecx,%esp + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp .L019cbc_loop +.L026cbc_exit: movl $1,%eax leal 4(%esp),%esp -.L015cbc_abort: +.L016cbc_abort: popl %edi popl %esi popl %ebx @@ -437,10 +533,10 @@ __win32_segv_handler: movl 4(%esp),%edx movl 12(%esp),%ecx cmpl $3221225477,(%edx) - jne .L026ret + jne .L028ret addl $4,184(%ecx) movl $0,%eax -.L026ret: +.L028ret: ret .globl _padlock_sha1_oneshot .def _padlock_sha1_oneshot; .scl 2; .type 32; .endef diff --git a/lib/accelerated/x86/elf/padlock-x86-64.s b/lib/accelerated/x86/elf/padlock-x86-64.s index bf5f62600c..4709ac2273 100644 --- a/lib/accelerated/x86/elf/padlock-x86-64.s +++ b/lib/accelerated/x86/elf/padlock-x86-64.s @@ -276,8 +276,6 @@ padlock_ecb_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $128,%rcx - jbe .Lecb_short testl $32,(%rdx) jnz .Lecb_aligned testq $15,%rdi @@ -297,6 +295,21 @@ padlock_ecb_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + cmpq %rbx,%rcx + ja .Lecb_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $128,%rax + movq $-128,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz .Lecb_unaligned_tail jmp .Lecb_loop .align 16 .Lecb_loop: @@ -326,8 +339,8 @@ padlock_ecb_encrypt: testq $15,%rdi jz .Lecb_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi + shrq $3,%rcx .byte 0xf3,0x48,0xa5 subq %rbx,%rdi .Lecb_out_aligned: @@ -337,9 +350,26 @@ padlock_ecb_encrypt: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx - jnz .Lecb_loop - + jz .Lecb_break + cmpq %rbx,%rcx + jae .Lecb_loop +.Lecb_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp .Lecb_loop +.align 16 +.Lecb_break: + cmpq %rbp,%rsp je .Lecb_done pxor %xmm0,%xmm0 @@ -353,26 +383,39 @@ padlock_ecb_encrypt: .Lecb_done: leaq (%rbp),%rsp jmp .Lecb_exit -.align 16 -.Lecb_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -.Lecb_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja .Lecb_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp .Lecb_loop + .align 16 .Lecb_aligned: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $128,%rbp + movq $128-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz .Lecb_aligned_tail leaq -16(%rdx),%rax leaq 
16(%rdx),%rbx shrq $4,%rcx .byte 0xf3,0x0f,0xa7,200 + testq %rbp,%rbp + jz .Lecb_exit + +.Lecb_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp .Lecb_loop .Lecb_exit: movl $1,%eax leaq 8(%rsp),%rsp @@ -400,8 +443,6 @@ padlock_cbc_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $64,%rcx - jbe .Lcbc_short testl $32,(%rdx) jnz .Lcbc_aligned testq $15,%rdi @@ -421,6 +462,21 @@ padlock_cbc_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + cmpq %rbx,%rcx + ja .Lcbc_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $64,%rax + movq $-64,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz .Lcbc_unaligned_tail jmp .Lcbc_loop .align 16 .Lcbc_loop: @@ -452,8 +508,8 @@ padlock_cbc_encrypt: testq $15,%rdi jz .Lcbc_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi + shrq $3,%rcx .byte 0xf3,0x48,0xa5 subq %rbx,%rdi .Lcbc_out_aligned: @@ -463,9 +519,26 @@ padlock_cbc_encrypt: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx - jnz .Lcbc_loop - + jz .Lcbc_break + cmpq %rbx,%rcx + jae .Lcbc_loop +.Lcbc_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp .Lcbc_loop +.align 16 +.Lcbc_break: + cmpq %rbp,%rsp je .Lcbc_done pxor %xmm0,%xmm0 @@ -479,28 +552,41 @@ padlock_cbc_encrypt: .Lcbc_done: leaq (%rbp),%rsp jmp .Lcbc_exit -.align 16 -.Lcbc_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -.Lcbc_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja .Lcbc_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp .Lcbc_loop + .align 16 .Lcbc_aligned: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $64,%rbp + movq $64-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz .Lcbc_aligned_tail leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx .byte 0xf3,0x0f,0xa7,208 movdqa (%rax),%xmm0 movdqa %xmm0,-16(%rdx) + testq %rbp,%rbp + jz .Lcbc_exit + +.Lcbc_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp .Lcbc_loop .Lcbc_exit: movl $1,%eax leaq 8(%rsp),%rsp diff --git a/lib/accelerated/x86/license.txt b/lib/accelerated/x86/license.txt index c87ba42b08..929ddd59e3 100755 --- a/lib/accelerated/x86/license.txt +++ b/lib/accelerated/x86/license.txt @@ -5,7 +5,7 @@ CRYPTOGAMS licenses depending on where you obtain it. For further details see http://www.openssl.org/~appro/cryptogams/. ==================================================================== -Copyright (c) 2006, CRYPTOGAMS by <appro@openssl.org> +Copyright (c) 2006-2012, CRYPTOGAMS by <appro@openssl.org> All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s b/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s index 9b912f9202..dbd89daabc 100644 --- a/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s +++ b/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s @@ -276,8 +276,6 @@ _padlock_ecb_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $128,%rcx - jbe L$ecb_short testl $32,(%rdx) jnz L$ecb_aligned testq $15,%rdi @@ -297,6 +295,21 @@ _padlock_ecb_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + cmpq %rbx,%rcx + ja L$ecb_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $128,%rax + movq $-128,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz L$ecb_unaligned_tail jmp L$ecb_loop .p2align 4 L$ecb_loop: @@ -326,8 +339,8 @@ L$ecb_inp_aligned: testq $15,%rdi jz L$ecb_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi + shrq $3,%rcx .byte 0xf3,0x48,0xa5 subq %rbx,%rdi L$ecb_out_aligned: @@ -337,9 +350,26 @@ L$ecb_out_aligned: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx - jnz L$ecb_loop - + jz L$ecb_break + cmpq %rbx,%rcx + jae L$ecb_loop +L$ecb_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp L$ecb_loop +.p2align 4 +L$ecb_break: + cmpq %rbp,%rsp je L$ecb_done pxor %xmm0,%xmm0 @@ -353,26 +383,39 @@ L$ecb_bzero: L$ecb_done: leaq (%rbp),%rsp jmp L$ecb_exit -.p2align 4 -L$ecb_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -L$ecb_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja L$ecb_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp L$ecb_loop + .p2align 4 L$ecb_aligned: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $128,%rbp + movq $128-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz L$ecb_aligned_tail leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx .byte 0xf3,0x0f,0xa7,200 + testq %rbp,%rbp + jz L$ecb_exit + +L$ecb_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp L$ecb_loop L$ecb_exit: movl $1,%eax leaq 8(%rsp),%rsp @@ -400,8 +443,6 @@ _padlock_cbc_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $64,%rcx - jbe L$cbc_short testl $32,(%rdx) jnz L$cbc_aligned testq $15,%rdi @@ -421,6 +462,21 @@ _padlock_cbc_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + cmpq %rbx,%rcx + ja L$cbc_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $64,%rax + movq $-64,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz L$cbc_unaligned_tail jmp L$cbc_loop .p2align 4 L$cbc_loop: @@ -452,8 +508,8 @@ L$cbc_inp_aligned: testq $15,%rdi jz L$cbc_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi + shrq $3,%rcx .byte 0xf3,0x48,0xa5 subq %rbx,%rdi L$cbc_out_aligned: @@ -463,9 +519,26 @@ L$cbc_out_aligned: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx - jnz L$cbc_loop - + jz L$cbc_break + cmpq %rbx,%rcx + jae L$cbc_loop +L$cbc_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx 
+ subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp L$cbc_loop +.p2align 4 +L$cbc_break: + cmpq %rbp,%rsp je L$cbc_done pxor %xmm0,%xmm0 @@ -479,28 +552,41 @@ L$cbc_bzero: L$cbc_done: leaq (%rbp),%rsp jmp L$cbc_exit -.p2align 4 -L$cbc_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -L$cbc_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja L$cbc_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp L$cbc_loop + .p2align 4 L$cbc_aligned: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $64,%rbp + movq $64-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz L$cbc_aligned_tail leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx .byte 0xf3,0x0f,0xa7,208 movdqa (%rax),%xmm0 movdqa %xmm0,-16(%rdx) + testq %rbp,%rbp + jz L$cbc_exit + +L$cbc_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp L$cbc_loop L$cbc_exit: movl $1,%eax leaq 8(%rsp),%rsp diff --git a/lib/accelerated/x86/macosx/padlock-x86-macosx.s b/lib/accelerated/x86/macosx/padlock-x86-macosx.s index 02b427e6a8..40cfce9af2 100644 --- a/lib/accelerated/x86/macosx/padlock-x86-macosx.s +++ b/lib/accelerated/x86/macosx/padlock-x86-macosx.s @@ -174,16 +174,14 @@ L005ecb_pic_point: leal 16(%edx),%edx xorl %eax,%eax xorl %ebx,%ebx - cmpl $128,%ecx - jbe L006ecb_short testl $32,(%edx) - jnz L007ecb_aligned + jnz L006ecb_aligned testl $15,%edi setz %al testl $15,%esi setz %bl testl %ebx,%eax - jnz L007ecb_aligned + jnz L006ecb_aligned negl %eax movl $512,%ebx notl %eax @@ -195,10 +193,28 @@ L005ecb_pic_point: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp - jmp L008ecb_loop + movl %eax,16(%ebp) + cmpl %ebx,%ecx + ja L007ecb_loop + movl %esi,%eax + cmpl %esp,%ebp + cmovel %edi,%eax + addl %ecx,%eax + negl %eax + andl $4095,%eax + cmpl $128,%eax + movl $-128,%eax + cmovael %ebx,%eax + andl %eax,%ebx + jz L008ecb_unaligned_tail + jmp L007ecb_loop .align 4,0x90 -L008ecb_loop: +L007ecb_loop: movl %edi,(%ebp) movl %esi,4(%ebp) movl %ecx,8(%ebp) @@ -223,8 +239,8 @@ L009ecb_inp_aligned: testl $15,%edi jz L010ecb_out_aligned movl %ebx,%ecx - shrl $2,%ecx leal (%esp),%esi + shrl $2,%ecx .byte 243,165 subl %ebx,%edi L010ecb_out_aligned: @@ -234,43 +250,75 @@ L010ecb_out_aligned: addl %ebx,%esi subl %ebx,%ecx movl $512,%ebx - jnz L008ecb_loop + jz L011ecb_break + cmpl %ebx,%ecx + jae L007ecb_loop +L008ecb_unaligned_tail: + xorl %eax,%eax + cmpl %ebp,%esp + cmovel %ecx,%eax + subl %eax,%esp + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp L007ecb_loop +.align 4,0x90 +L011ecb_break: cmpl %ebp,%esp - je L011ecb_done + je L012ecb_done pxor %xmm0,%xmm0 leal (%esp),%eax -L012ecb_bzero: +L013ecb_bzero: movaps %xmm0,(%eax) leal 16(%eax),%eax cmpl %eax,%ebp - ja L012ecb_bzero -L011ecb_done: + ja L013ecb_bzero +L012ecb_done: + movl 16(%ebp),%ebp leal 24(%ebp),%esp - jmp L013ecb_exit + jmp L014ecb_exit .align 4,0x90 -L006ecb_short: +L006ecb_aligned: + leal (%esi,%ecx,1),%ebp + negl %ebp + andl $4095,%ebp xorl %eax,%eax - leal -24(%esp),%ebp - subl %ecx,%eax - leal (%eax,%ebp,1),%esp - andl $-16,%esp - xorl %ebx,%ebx -L014ecb_short_copy: - 
movups (%esi,%ebx,1),%xmm0 - leal 16(%ebx),%ebx - cmpl %ebx,%ecx - movaps %xmm0,-16(%esp,%ebx,1) - ja L014ecb_short_copy - movl %esp,%esi - movl %ecx,%ebx - jmp L008ecb_loop -.align 4,0x90 -L007ecb_aligned: + cmpl $128,%ebp + movl $127,%ebp + cmovael %eax,%ebp + andl %ecx,%ebp + subl %ebp,%ecx + jz L015ecb_aligned_tail leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx .byte 243,15,167,200 -L013ecb_exit: + testl %ebp,%ebp + jz L014ecb_exit +L015ecb_aligned_tail: + movl %ebp,%ecx + leal -24(%esp),%ebp + movl %ebp,%esp + movl %ebp,%eax + subl %ecx,%esp + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp L007ecb_loop +L014ecb_exit: movl $1,%eax leal 4(%esp),%esp L004ecb_abort: @@ -292,19 +340,17 @@ L_padlock_cbc_encrypt_begin: movl 28(%esp),%edx movl 32(%esp),%ecx testl $15,%edx - jnz L015cbc_abort + jnz L016cbc_abort testl $15,%ecx - jnz L015cbc_abort - leal Lpadlock_saved_context-L016cbc_pic_point,%eax + jnz L016cbc_abort + leal Lpadlock_saved_context-L017cbc_pic_point,%eax pushfl cld call __padlock_verify_ctx -L016cbc_pic_point: +L017cbc_pic_point: leal 16(%edx),%edx xorl %eax,%eax xorl %ebx,%ebx - cmpl $64,%ecx - jbe L017cbc_short testl $32,(%edx) jnz L018cbc_aligned testl $15,%edi @@ -324,7 +370,25 @@ L016cbc_pic_point: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp + movl %eax,16(%ebp) + cmpl %ebx,%ecx + ja L019cbc_loop + movl %esi,%eax + cmpl %esp,%ebp + cmovel %edi,%eax + addl %ecx,%eax + negl %eax + andl $4095,%eax + cmpl $64,%eax + movl $-64,%eax + cmovael %ebx,%eax + andl %eax,%ebx + jz L020cbc_unaligned_tail jmp L019cbc_loop .align 4,0x90 L019cbc_loop: @@ -336,13 +400,13 @@ L019cbc_loop: testl $15,%edi cmovnzl %esp,%edi testl $15,%esi - jz L020cbc_inp_aligned + jz L021cbc_inp_aligned shrl $2,%ecx .byte 243,165 subl %ebx,%edi movl %ebx,%ecx movl %edi,%esi -L020cbc_inp_aligned: +L021cbc_inp_aligned: leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx @@ -352,61 +416,93 @@ L020cbc_inp_aligned: movl (%ebp),%edi movl 12(%ebp),%ebx testl $15,%edi - jz L021cbc_out_aligned + jz L022cbc_out_aligned movl %ebx,%ecx - shrl $2,%ecx leal (%esp),%esi + shrl $2,%ecx .byte 243,165 subl %ebx,%edi -L021cbc_out_aligned: +L022cbc_out_aligned: movl 4(%ebp),%esi movl 8(%ebp),%ecx addl %ebx,%edi addl %ebx,%esi subl %ebx,%ecx movl $512,%ebx - jnz L019cbc_loop + jz L023cbc_break + cmpl %ebx,%ecx + jae L019cbc_loop +L020cbc_unaligned_tail: + xorl %eax,%eax + cmpl %ebp,%esp + cmovel %ecx,%eax + subl %eax,%esp + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp L019cbc_loop +.align 4,0x90 +L023cbc_break: cmpl %ebp,%esp - je L022cbc_done + je L024cbc_done pxor %xmm0,%xmm0 leal (%esp),%eax -L023cbc_bzero: +L025cbc_bzero: movaps %xmm0,(%eax) leal 16(%eax),%eax cmpl %eax,%ebp - ja L023cbc_bzero -L022cbc_done: + ja L025cbc_bzero +L024cbc_done: + movl 16(%ebp),%ebp leal 24(%ebp),%esp - jmp L024cbc_exit -.align 4,0x90 -L017cbc_short: - xorl %eax,%eax - leal -24(%esp),%ebp - subl %ecx,%eax - leal (%eax,%ebp,1),%esp - andl $-16,%esp - xorl %ebx,%ebx -L025cbc_short_copy: - movups (%esi,%ebx,1),%xmm0 - leal 16(%ebx),%ebx - cmpl %ebx,%ecx - movaps %xmm0,-16(%esp,%ebx,1) - ja L025cbc_short_copy - movl %esp,%esi - movl %ecx,%ebx - jmp L019cbc_loop + jmp L026cbc_exit .align 4,0x90 L018cbc_aligned: + leal 
(%esi,%ecx,1),%ebp + negl %ebp + andl $4095,%ebp + xorl %eax,%eax + cmpl $64,%ebp + movl $63,%ebp + cmovael %eax,%ebp + andl %ecx,%ebp + subl %ebp,%ecx + jz L027cbc_aligned_tail leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx .byte 243,15,167,208 movaps (%eax),%xmm0 movaps %xmm0,-16(%edx) -L024cbc_exit: + testl %ebp,%ebp + jz L026cbc_exit +L027cbc_aligned_tail: + movl %ebp,%ecx + leal -24(%esp),%ebp + movl %ebp,%esp + movl %ebp,%eax + subl %ecx,%esp + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp L019cbc_loop +L026cbc_exit: movl $1,%eax leal 4(%esp),%esp -L015cbc_abort: +L016cbc_abort: popl %edi popl %esi popl %ebx @@ -428,10 +524,10 @@ __win32_segv_handler: movl 4(%esp),%edx movl 12(%esp),%ecx cmpl $3221225477,(%edx) - jne L026ret + jne L028ret addl $4,184(%ecx) movl $0,%eax -L026ret: +L028ret: ret .globl _padlock_sha1_oneshot .align 4 |