summary | refs | log | tree | commit | diff
path: root/x64dll.asm
diff options
context:
space:
mode:
author: weidai <weidai11@users.noreply.github.com> 2009-03-12 11:24:12 +0000
committer: weidai <weidai11@users.noreply.github.com> 2009-03-12 11:24:12 +0000
commit: 2779fc60506e2042ab1569ffad4061f1187d186c (patch)
tree: 68edc0bccf003f5615716b3ae2d6b97067af39c4 /x64dll.asm
parent: 64af4560dc8ba66ef0e2ac3b05dec6f445ec96fe (diff)
download: cryptopp-git-2779fc60506e2042ab1569ffad4061f1187d186c.tar.gz
- add EAX mode, XSalsa20
- speed up GCM key setup
- wipe stack in AES assembly code
- speed up CFB mode
Diffstat (limited to 'x64dll.asm')
-rw-r--r-- x64dll.asm 467
1 files changed, 240 insertions, 227 deletions
diff --git a/x64dll.asm b/x64dll.asm
index 6b94e1e1..18270982 100644
--- a/x64dll.asm
+++ b/x64dll.asm
@@ -62,357 +62,356 @@ Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
rex_push_reg rsi
push_reg rdi
push_reg rbx
-push_reg rbp
push_reg r12
.endprolog
mov r8, rcx
-mov rsi, ?Te@rdtable@CryptoPP@@3PA_KA
+mov r11, ?Te@rdtable@CryptoPP@@3PA_KA
mov rdi, QWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
-mov rbp, [(r8+16*19)]
+mov rsi, [(r8+16*19)]
mov rax, 16
-and rax, rbp
+and rax, rsi
movdqa xmm3, XMMWORD PTR [rdx+16+rax]
movdqa [(r8+16*12)], xmm3
lea rax, [rdx+rax+2*16]
-sub rax, rbp
+sub rax, rsi
label0:
-movdqa xmm0, [rax+rbp]
-movdqa XMMWORD PTR [(r8+0)+rbp], xmm0
-add rbp, 16
-cmp rbp, 16*12
+movdqa xmm0, [rax+rsi]
+movdqa XMMWORD PTR [(r8+0)+rsi], xmm0
+add rsi, 16
+cmp rsi, 16*12
jl label0
-movdqa xmm4, [rax+rbp]
+movdqa xmm4, [rax+rsi]
movdqa xmm1, [rdx]
-mov r11d, [rdx+4*4]
+mov r12d, [rdx+4*4]
mov ebx, [rdx+5*4]
mov ecx, [rdx+6*4]
mov edx, [rdx+7*4]
xor rax, rax
label9:
-mov ebp, [rsi+rax]
+mov esi, [r11+rax]
add rax, rdi
-mov ebp, [rsi+rax]
+mov esi, [r11+rax]
add rax, rdi
-mov ebp, [rsi+rax]
+mov esi, [r11+rax]
add rax, rdi
-mov ebp, [rsi+rax]
+mov esi, [r11+rax]
add rax, rdi
cmp rax, 2048
jl label9
lfence
test DWORD PTR [(r8+16*18+8)], 1
jz label8
-mov rbp, [(r8+16*14)]
-movdqa xmm2, [rbp]
+mov rsi, [(r8+16*14)]
+movdqu xmm2, [rsi]
pxor xmm2, xmm1
psrldq xmm1, 14
movd eax, xmm1
-mov al, BYTE PTR [rbp+15]
-mov r12d, eax
+mov al, BYTE PTR [rsi+15]
+mov r10d, eax
movd eax, xmm2
psrldq xmm2, 4
movd edi, xmm2
psrldq xmm2, 4
-movzx ebp, al
-xor r11d, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
-movzx ebp, ah
-xor edx, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+movzx esi, al
+xor r12d, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
+movzx esi, ah
+xor edx, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
shr eax, 16
-movzx ebp, al
-xor ecx, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
-movzx ebp, ah
-xor ebx, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+movzx esi, al
+xor ecx, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
+movzx esi, ah
+xor ebx, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
mov eax, edi
movd edi, xmm2
psrldq xmm2, 4
-movzx ebp, al
-xor ebx, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
-movzx ebp, ah
-xor r11d, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+movzx esi, al
+xor ebx, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
+movzx esi, ah
+xor r12d, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
shr eax, 16
-movzx ebp, al
-xor edx, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
-movzx ebp, ah
-xor ecx, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+movzx esi, al
+xor edx, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
+movzx esi, ah
+xor ecx, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
mov eax, edi
movd edi, xmm2
-movzx ebp, al
-xor ecx, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
-movzx ebp, ah
-xor ebx, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+movzx esi, al
+xor ecx, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
+movzx esi, ah
+xor ebx, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
shr eax, 16
-movzx ebp, al
-xor r11d, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
-movzx ebp, ah
-xor edx, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+movzx esi, al
+xor r12d, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
+movzx esi, ah
+xor edx, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
mov eax, edi
-movzx ebp, al
-xor edx, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
-movzx ebp, ah
-xor ecx, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+movzx esi, al
+xor edx, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
+movzx esi, ah
+xor ecx, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
shr eax, 16
-movzx ebp, al
-xor ebx, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+movzx esi, al
+xor ebx, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
psrldq xmm2, 3
mov eax, [(r8+16*12)+0*4]
mov edi, [(r8+16*12)+2*4]
-mov r10d, [(r8+16*12)+3*4]
-movzx ebp, cl
-xor r10d, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
-movzx ebp, bl
-xor edi, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
-movzx ebp, bh
-xor r10d, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+mov r9d, [(r8+16*12)+3*4]
+movzx esi, cl
+xor r9d, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
+movzx esi, bl
+xor edi, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
+movzx esi, bh
+xor r9d, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
shr ebx, 16
-movzx ebp, bl
-xor eax, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
-movzx ebp, bh
-mov ebx, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+movzx esi, bl
+xor eax, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
+movzx esi, bh
+mov ebx, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
xor ebx, [(r8+16*12)+1*4]
-movzx ebp, ch
-xor eax, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+movzx esi, ch
+xor eax, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
shr ecx, 16
-movzx ebp, dl
-xor eax, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
-movzx ebp, dh
-xor ebx, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+movzx esi, dl
+xor eax, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
+movzx esi, dh
+xor ebx, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
shr edx, 16
-movzx ebp, ch
-xor edi, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
-movzx ebp, cl
-xor ebx, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
-movzx ebp, dl
-xor edi, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
-movzx ebp, dh
-xor r10d, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+movzx esi, ch
+xor edi, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
+movzx esi, cl
+xor ebx, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
+movzx esi, dl
+xor edi, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
+movzx esi, dh
+xor r9d, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
movd ecx, xmm2
-mov edx, r11d
-mov [(r8+0)+3*4], r10d
+mov edx, r12d
+mov [(r8+0)+3*4], r9d
mov [(r8+0)+0*4], eax
mov [(r8+0)+1*4], ebx
mov [(r8+0)+2*4], edi
jmp label5
label3:
-mov r11d, [(r8+16*12)+0*4]
+mov r12d, [(r8+16*12)+0*4]
mov ebx, [(r8+16*12)+1*4]
mov ecx, [(r8+16*12)+2*4]
mov edx, [(r8+16*12)+3*4]
label8:
mov rax, [(r8+16*14)]
movdqu xmm2, [rax]
-mov rbp, [(r8+16*14)+8]
-movdqu xmm5, [rbp]
+mov rsi, [(r8+16*14)+8]
+movdqu xmm5, [rsi]
pxor xmm2, xmm1
pxor xmm2, xmm5
movd eax, xmm2
psrldq xmm2, 4
movd edi, xmm2
psrldq xmm2, 4
-movzx ebp, al
-xor r11d, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
-movzx ebp, ah
-xor edx, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+movzx esi, al
+xor r12d, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
+movzx esi, ah
+xor edx, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
shr eax, 16
-movzx ebp, al
-xor ecx, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
-movzx ebp, ah
-xor ebx, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+movzx esi, al
+xor ecx, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
+movzx esi, ah
+xor ebx, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
mov eax, edi
movd edi, xmm2
psrldq xmm2, 4
-movzx ebp, al
-xor ebx, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
-movzx ebp, ah
-xor r11d, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+movzx esi, al
+xor ebx, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
+movzx esi, ah
+xor r12d, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
shr eax, 16
-movzx ebp, al
-xor edx, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
-movzx ebp, ah
-xor ecx, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+movzx esi, al
+xor edx, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
+movzx esi, ah
+xor ecx, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
mov eax, edi
movd edi, xmm2
-movzx ebp, al
-xor ecx, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
-movzx ebp, ah
-xor ebx, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+movzx esi, al
+xor ecx, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
+movzx esi, ah
+xor ebx, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
shr eax, 16
-movzx ebp, al
-xor r11d, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
-movzx ebp, ah
-xor edx, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+movzx esi, al
+xor r12d, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
+movzx esi, ah
+xor edx, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
mov eax, edi
-movzx ebp, al
-xor edx, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
-movzx ebp, ah
-xor ecx, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+movzx esi, al
+xor edx, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
+movzx esi, ah
+xor ecx, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
shr eax, 16
-movzx ebp, al
-xor ebx, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
-movzx ebp, ah
-xor r11d, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
-mov eax, r11d
+movzx esi, al
+xor ebx, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
+movzx esi, ah
+xor r12d, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
+mov eax, r12d
add r8, [(r8+16*19)]
add r8, 4*16
jmp label2
label1:
-mov ecx, r12d
-mov edx, r11d
+mov ecx, r10d
+mov edx, r12d
mov eax, [(r8+0)+0*4]
mov ebx, [(r8+0)+1*4]
xor cl, ch
and rcx, 255
label5:
-add r12d, 1
-xor edx, DWORD PTR [rsi+rcx*8+3]
-movzx ebp, dl
-xor ebx, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
-movzx ebp, dh
-mov ecx, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+add r10d, 1
+xor edx, DWORD PTR [r11+rcx*8+3]
+movzx esi, dl
+xor ebx, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
+movzx esi, dh
+mov ecx, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
shr edx, 16
xor ecx, [(r8+0)+2*4]
-movzx ebp, dh
-xor eax, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
-movzx ebp, dl
-mov edx, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+movzx esi, dh
+xor eax, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
+movzx esi, dl
+mov edx, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
xor edx, [(r8+0)+3*4]
add r8, [(r8+16*19)]
add r8, 3*16
jmp label4
label2:
-mov r10d, [(r8+0)-4*16+3*4]
+mov r9d, [(r8+0)-4*16+3*4]
mov edi, [(r8+0)-4*16+2*4]
-movzx ebp, cl
-xor r10d, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+movzx esi, cl
+xor r9d, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
mov cl, al
-movzx ebp, ah
-xor edi, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+movzx esi, ah
+xor edi, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
shr eax, 16
-movzx ebp, bl
-xor edi, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
-movzx ebp, bh
-xor r10d, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+movzx esi, bl
+xor edi, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
+movzx esi, bh
+xor r9d, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
shr ebx, 16
-movzx ebp, al
-xor r10d, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
-movzx ebp, ah
-mov eax, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
-movzx ebp, bl
-xor eax, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
-movzx ebp, bh
-mov ebx, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
-movzx ebp, ch
-xor eax, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
-movzx ebp, cl
-xor ebx, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+movzx esi, al
+xor r9d, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
+movzx esi, ah
+mov eax, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
+movzx esi, bl
+xor eax, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
+movzx esi, bh
+mov ebx, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
+movzx esi, ch
+xor eax, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
+movzx esi, cl
+xor ebx, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
shr ecx, 16
-movzx ebp, dl
-xor eax, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
-movzx ebp, dh
-xor ebx, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+movzx esi, dl
+xor eax, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
+movzx esi, dh
+xor ebx, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
shr edx, 16
-movzx ebp, ch
-xor edi, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
-movzx ebp, cl
-xor ebx, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
-movzx ebp, dl
-xor edi, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
-movzx ebp, dh
-xor r10d, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+movzx esi, ch
+xor edi, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
+movzx esi, cl
+xor ebx, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
+movzx esi, dl
+xor edi, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
+movzx esi, dh
+xor r9d, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
mov ecx, edi
xor eax, [(r8+0)-4*16+0*4]
xor ebx, [(r8+0)-4*16+1*4]
-mov edx, r10d
+mov edx, r9d
label4:
-mov r10d, [(r8+0)-4*16+7*4]
+mov r9d, [(r8+0)-4*16+7*4]
mov edi, [(r8+0)-4*16+6*4]
-movzx ebp, cl
-xor r10d, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+movzx esi, cl
+xor r9d, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
mov cl, al
-movzx ebp, ah
-xor edi, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+movzx esi, ah
+xor edi, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
shr eax, 16
-movzx ebp, bl
-xor edi, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
-movzx ebp, bh
-xor r10d, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+movzx esi, bl
+xor edi, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
+movzx esi, bh
+xor r9d, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
shr ebx, 16
-movzx ebp, al
-xor r10d, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
-movzx ebp, ah
-mov eax, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
-movzx ebp, bl
-xor eax, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
-movzx ebp, bh
-mov ebx, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
-movzx ebp, ch
-xor eax, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
-movzx ebp, cl
-xor ebx, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+movzx esi, al
+xor r9d, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
+movzx esi, ah
+mov eax, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
+movzx esi, bl
+xor eax, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
+movzx esi, bh
+mov ebx, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
+movzx esi, ch
+xor eax, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
+movzx esi, cl
+xor ebx, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
shr ecx, 16
-movzx ebp, dl
-xor eax, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
-movzx ebp, dh
-xor ebx, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+movzx esi, dl
+xor eax, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
+movzx esi, dh
+xor ebx, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
shr edx, 16
-movzx ebp, ch
-xor edi, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
-movzx ebp, cl
-xor ebx, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
-movzx ebp, dl
-xor edi, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
-movzx ebp, dh
-xor r10d, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+movzx esi, ch
+xor edi, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
+movzx esi, cl
+xor ebx, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
+movzx esi, dl
+xor edi, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
+movzx esi, dh
+xor r9d, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
mov ecx, edi
xor eax, [(r8+0)-4*16+4*4]
xor ebx, [(r8+0)-4*16+5*4]
-mov edx, r10d
+mov edx, r9d
add r8, 32
test r8, 255
jnz label2
sub r8, 16*16
-movzx ebp, ch
-movzx edi, BYTE PTR [rsi+rbp*8+1]
-movzx ebp, dl
-xor edi, DWORD PTR [rsi+rbp*8+0]
+movzx esi, ch
+movzx edi, BYTE PTR [r11+rsi*8+1]
+movzx esi, dl
+xor edi, DWORD PTR [r11+rsi*8+0]
mov WORD PTR [(r8+16*13)+2], di
-movzx ebp, dh
-movzx edi, BYTE PTR [rsi+rbp*8+1]
-movzx ebp, al
-xor edi, DWORD PTR [rsi+rbp*8+0]
+movzx esi, dh
+movzx edi, BYTE PTR [r11+rsi*8+1]
+movzx esi, al
+xor edi, DWORD PTR [r11+rsi*8+0]
mov WORD PTR [(r8+16*13)+6], di
shr edx, 16
-movzx ebp, ah
-movzx edi, BYTE PTR [rsi+rbp*8+1]
-movzx ebp, bl
-xor edi, DWORD PTR [rsi+rbp*8+0]
+movzx esi, ah
+movzx edi, BYTE PTR [r11+rsi*8+1]
+movzx esi, bl
+xor edi, DWORD PTR [r11+rsi*8+0]
mov WORD PTR [(r8+16*13)+10], di
shr eax, 16
-movzx ebp, bh
-movzx edi, BYTE PTR [rsi+rbp*8+1]
-movzx ebp, cl
-xor edi, DWORD PTR [rsi+rbp*8+0]
+movzx esi, bh
+movzx edi, BYTE PTR [r11+rsi*8+1]
+movzx esi, cl
+xor edi, DWORD PTR [r11+rsi*8+0]
mov WORD PTR [(r8+16*13)+14], di
shr ebx, 16
-movzx ebp, dh
-movzx edi, BYTE PTR [rsi+rbp*8+1]
-movzx ebp, al
-xor edi, DWORD PTR [rsi+rbp*8+0]
+movzx esi, dh
+movzx edi, BYTE PTR [r11+rsi*8+1]
+movzx esi, al
+xor edi, DWORD PTR [r11+rsi*8+0]
mov WORD PTR [(r8+16*13)+12], di
shr ecx, 16
-movzx ebp, ah
-movzx edi, BYTE PTR [rsi+rbp*8+1]
-movzx ebp, bl
-xor edi, DWORD PTR [rsi+rbp*8+0]
+movzx esi, ah
+movzx edi, BYTE PTR [r11+rsi*8+1]
+movzx esi, bl
+xor edi, DWORD PTR [r11+rsi*8+0]
mov WORD PTR [(r8+16*13)+0], di
-movzx ebp, bh
-movzx edi, BYTE PTR [rsi+rbp*8+1]
-movzx ebp, cl
-xor edi, DWORD PTR [rsi+rbp*8+0]
+movzx esi, bh
+movzx edi, BYTE PTR [r11+rsi*8+1]
+movzx esi, cl
+xor edi, DWORD PTR [r11+rsi*8+0]
mov WORD PTR [(r8+16*13)+4], di
-movzx ebp, ch
-movzx edi, BYTE PTR [rsi+rbp*8+1]
-movzx ebp, dl
-xor edi, DWORD PTR [rsi+rbp*8+0]
+movzx esi, ch
+movzx edi, BYTE PTR [r11+rsi*8+1]
+movzx esi, dl
+xor edi, DWORD PTR [r11+rsi*8+0]
mov WORD PTR [(r8+16*13)+8], di
mov rax, [(r8+16*14)+16]
mov rbx, [(r8+16*14)+24]
@@ -430,13 +429,27 @@ mov [(r8+16*18+8)], rcx
test rcx, 1
jnz label1
movdqa xmm0, [(r8+16*16)]
-paddd xmm0, [(r8+16*14)]
+paddq xmm0, [(r8+16*14)]
movdqa [(r8+16*14)], xmm0
jmp label3
label7:
-mov rbp, [(r8+16*18)]
+xorps xmm0, xmm0
+lea rax, [(r8+0)+7*16]
+movaps [rax-7*16], xmm0
+movaps [rax-6*16], xmm0
+movaps [rax-5*16], xmm0
+movaps [rax-4*16], xmm0
+movaps [rax-3*16], xmm0
+movaps [rax-2*16], xmm0
+movaps [rax-1*16], xmm0
+movaps [rax+0*16], xmm0
+movaps [rax+1*16], xmm0
+movaps [rax+2*16], xmm0
+movaps [rax+3*16], xmm0
+movaps [rax+4*16], xmm0
+movaps [rax+5*16], xmm0
+movaps [rax+6*16], xmm0
pop r12
-pop rbp
pop rbx
pop rdi
pop rsi