author     weidai <weidai11@users.noreply.github.com>   2009-03-02 02:39:17 +0000
committer  weidai <weidai11@users.noreply.github.com>   2009-03-02 02:39:17 +0000
commit     d8a644fc4ee2af9dc62f2a8c167b023d0c71d13b (patch)
tree       0fecaa7a6728d07549a41864ea2cedfb245f0bd3 /x64dll.asm
parent     fa25129ac981ceed9569496c02b83771b394fa40 (diff)
download   cryptopp-git-d8a644fc4ee2af9dc62f2a8c167b023d0c71d13b.tar.gz
changes for 5.6:
- added AuthenticatedSymmetricCipher interface class and Filter wrappers
- added CCM, GCM (with SSE2 assembly), CMAC, and SEED
- improved AES speed on x86 and x64
- removed WORD64_AVAILABLE; compiler 64-bit int support is now required
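
The AuthenticatedSymmetricCipher interface and Filter wrappers listed above are driven from C++ roughly as in the sketch below (not code from this commit; header paths, key/IV sizes, and buffer names are illustrative). The GCM_AuthenticateBlocks_2K/_64K routines in the diff below supply the SSE2 GHASH table lookups that back this mode.

    // Minimal sketch: GCM encryption through the new filter wrappers.
    // Header paths, key/IV sizes, and buffer names are illustrative.
    #include <cryptopp/aes.h>
    #include <cryptopp/gcm.h>
    #include <cryptopp/filters.h>
    #include <cryptopp/osrng.h>
    #include <cryptopp/secblock.h>
    #include <string>

    int main()
    {
        using namespace CryptoPP;

        AutoSeededRandomPool prng;
        SecByteBlock key(AES::DEFAULT_KEYLENGTH);
        SecByteBlock iv(12);                    // 96-bit IV, the common GCM choice
        prng.GenerateBlock(key, key.size());
        prng.GenerateBlock(iv, iv.size());

        std::string plain = "attack at dawn", cipher;

        GCM<AES>::Encryption enc;               // an AuthenticatedSymmetricCipher
        enc.SetKeyWithIV(key, key.size(), iv, iv.size());

        // The filter writes ciphertext followed by the authentication tag
        // into 'cipher'.
        StringSource ss(plain, true,
            new AuthenticatedEncryptionFilter(enc,
                new StringSink(cipher)));
        return 0;
    }
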
Diffstat (limited to 'x64dll.asm')
-rw-r--r--  x64dll.asm  665
1 file changed, 665 insertions, 0 deletions
diff --git a/x64dll.asm b/x64dll.asm
new file mode 100644
index 00000000..643dbe42
--- /dev/null
+++ b/x64dll.asm
@@ -0,0 +1,665 @@
+include ksamd64.inc
+EXTERNDEF ?Te@rdtable@CryptoPP@@3PA_KA:FAR
+EXTERNDEF ?g_cacheLineSize@CryptoPP@@3IA:FAR
+.CODE
+
+ ALIGN 8
+Baseline_Add PROC
+ lea rdx, [rdx+8*rcx]
+ lea r8, [r8+8*rcx]
+ lea r9, [r9+8*rcx]
+ neg rcx ; rcx is negative index
+ jz $1@Baseline_Add
+ mov rax,[r8+8*rcx]
+ add rax,[r9+8*rcx]
+ mov [rdx+8*rcx],rax
+$0@Baseline_Add:
+ mov rax,[r8+8*rcx+8]
+ adc rax,[r9+8*rcx+8]
+ mov [rdx+8*rcx+8],rax
+ lea rcx,[rcx+2] ; advance index, avoid inc which causes slowdown on Intel Core 2
+ jrcxz $1@Baseline_Add ; loop until rcx overflows and becomes zero
+ mov rax,[r8+8*rcx]
+ adc rax,[r9+8*rcx]
+ mov [rdx+8*rcx],rax
+ jmp $0@Baseline_Add
+$1@Baseline_Add:
+ mov rax, 0
+ adc rax, rax ; store carry into rax (return result register)
+ ret
+Baseline_Add ENDP
+
+ ALIGN 8
+Baseline_Sub PROC
+ lea rdx, [rdx+8*rcx]
+ lea r8, [r8+8*rcx]
+ lea r9, [r9+8*rcx]
+ neg rcx ; rcx is negative index
+ jz $1@Baseline_Sub
+ mov rax,[r8+8*rcx]
+ sub rax,[r9+8*rcx]
+ mov [rdx+8*rcx],rax
+$0@Baseline_Sub:
+ mov rax,[r8+8*rcx+8]
+ sbb rax,[r9+8*rcx+8]
+ mov [rdx+8*rcx+8],rax
+ lea rcx,[rcx+2] ; advance index, avoid inc which causes slowdown on Intel Core 2
+ jrcxz $1@Baseline_Sub ; loop until rcx overflows and becomes zero
+ mov rax,[r8+8*rcx]
+ sbb rax,[r9+8*rcx]
+ mov [rdx+8*rcx],rax
+ jmp $0@Baseline_Sub
+$1@Baseline_Sub:
+ mov rax, 0
+ adc rax, rax ; store carry into rax (return result register)
+
+ ret
+Baseline_Sub ENDP
+
+ALIGN 8
+Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
+rex_push_reg rsi
+push_reg rdi
+push_reg rbx
+push_reg rbp
+push_reg r12
+.endprolog
+mov r8, rcx
+mov rsi, ?Te@rdtable@CryptoPP@@3PA_KA
+mov rdi, QWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
+mov rbp, [(r8+16*19)]
+mov rax, 16
+and rax, rbp
+movdqa xmm3, XMMWORD PTR [rdx+16+rax]
+movdqa [(r8+16*12)], xmm3
+lea rax, [rdx+rax+2*16]
+sub rax, rbp
+label0:
+movdqa xmm0, [rax+rbp]
+movdqa XMMWORD PTR [(r8+0)+rbp], xmm0
+add rbp, 16
+cmp rbp, 16*12
+jl label0
+movdqa xmm4, [rax+rbp]
+movdqa xmm1, [rdx]
+mov r11d, [rdx+4*4]
+mov ebx, [rdx+5*4]
+mov ecx, [rdx+6*4]
+mov edx, [rdx+7*4]
+xor rax, rax
+label9:
+mov ebp, [rsi+rax]
+add rax, rdi
+mov ebp, [rsi+rax]
+add rax, rdi
+mov ebp, [rsi+rax]
+add rax, rdi
+mov ebp, [rsi+rax]
+add rax, rdi
+cmp rax, 2048
+jl label9
+lfence
+test DWORD PTR [(r8+16*18+8)], 1
+jz label8
+mov rbp, [(r8+16*14)]
+movdqa xmm2, [rbp]
+pxor xmm2, xmm1
+psrldq xmm1, 14
+movd eax, xmm1
+mov al, BYTE PTR [rbp+15]
+mov r12d, eax
+movd eax, xmm2
+psrldq xmm2, 4
+movd edi, xmm2
+psrldq xmm2, 4
+movzx ebp, al
+xor r11d, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+movzx ebp, ah
+xor edx, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+shr eax, 16
+movzx ebp, al
+xor ecx, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+movzx ebp, ah
+xor ebx, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+mov eax, edi
+movd edi, xmm2
+psrldq xmm2, 4
+movzx ebp, al
+xor ebx, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+movzx ebp, ah
+xor r11d, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+shr eax, 16
+movzx ebp, al
+xor edx, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+movzx ebp, ah
+xor ecx, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+mov eax, edi
+movd edi, xmm2
+movzx ebp, al
+xor ecx, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+movzx ebp, ah
+xor ebx, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+shr eax, 16
+movzx ebp, al
+xor r11d, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+movzx ebp, ah
+xor edx, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+mov eax, edi
+movzx ebp, al
+xor edx, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+movzx ebp, ah
+xor ecx, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+shr eax, 16
+movzx ebp, al
+xor ebx, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+psrldq xmm2, 3
+mov eax, [(r8+16*12)+0*4]
+mov edi, [(r8+16*12)+2*4]
+mov r10d, [(r8+16*12)+3*4]
+movzx ebp, cl
+xor r10d, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+movzx ebp, bl
+xor edi, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+movzx ebp, bh
+xor r10d, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+shr ebx, 16
+movzx ebp, bl
+xor eax, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+movzx ebp, bh
+mov ebx, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+xor ebx, [(r8+16*12)+1*4]
+movzx ebp, ch
+xor eax, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+shr ecx, 16
+movzx ebp, dl
+xor eax, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+movzx ebp, dh
+xor ebx, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+shr edx, 16
+movzx ebp, ch
+xor edi, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+movzx ebp, cl
+xor ebx, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+movzx ebp, dl
+xor edi, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+movzx ebp, dh
+xor r10d, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+movd ecx, xmm2
+mov edx, r11d
+mov [(r8+0)+3*4], r10d
+mov [(r8+0)+0*4], eax
+mov [(r8+0)+1*4], ebx
+mov [(r8+0)+2*4], edi
+jmp label5
+label3:
+mov r11d, [(r8+16*12)+0*4]
+mov ebx, [(r8+16*12)+1*4]
+mov ecx, [(r8+16*12)+2*4]
+mov edx, [(r8+16*12)+3*4]
+label8:
+mov rax, [(r8+16*14)]
+movdqu xmm2, [rax]
+mov rbp, [(r8+16*14)+8]
+movdqu xmm5, [rbp]
+pxor xmm2, xmm1
+pxor xmm2, xmm5
+movd eax, xmm2
+psrldq xmm2, 4
+movd edi, xmm2
+psrldq xmm2, 4
+movzx ebp, al
+xor r11d, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+movzx ebp, ah
+xor edx, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+shr eax, 16
+movzx ebp, al
+xor ecx, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+movzx ebp, ah
+xor ebx, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+mov eax, edi
+movd edi, xmm2
+psrldq xmm2, 4
+movzx ebp, al
+xor ebx, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+movzx ebp, ah
+xor r11d, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+shr eax, 16
+movzx ebp, al
+xor edx, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+movzx ebp, ah
+xor ecx, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+mov eax, edi
+movd edi, xmm2
+movzx ebp, al
+xor ecx, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+movzx ebp, ah
+xor ebx, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+shr eax, 16
+movzx ebp, al
+xor r11d, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+movzx ebp, ah
+xor edx, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+mov eax, edi
+movzx ebp, al
+xor edx, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+movzx ebp, ah
+xor ecx, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+shr eax, 16
+movzx ebp, al
+xor ebx, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+movzx ebp, ah
+xor r11d, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+mov eax, r11d
+add r8, [(r8+16*19)]
+add r8, 4*16
+jmp label2
+label1:
+mov ecx, r12d
+mov edx, r11d
+mov eax, [(r8+0)+0*4]
+mov ebx, [(r8+0)+1*4]
+xor cl, ch
+and rcx, 255
+label5:
+add r12d, 1
+xor edx, DWORD PTR [rsi+rcx*8+3]
+movzx ebp, dl
+xor ebx, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+movzx ebp, dh
+mov ecx, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+shr edx, 16
+xor ecx, [(r8+0)+2*4]
+movzx ebp, dh
+xor eax, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+movzx ebp, dl
+mov edx, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+xor edx, [(r8+0)+3*4]
+add r8, [(r8+16*19)]
+add r8, 3*16
+jmp label4
+label2:
+mov r10d, [(r8+0)-4*16+3*4]
+mov edi, [(r8+0)-4*16+2*4]
+movzx ebp, cl
+xor r10d, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+mov cl, al
+movzx ebp, ah
+xor edi, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+shr eax, 16
+movzx ebp, bl
+xor edi, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+movzx ebp, bh
+xor r10d, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+shr ebx, 16
+movzx ebp, al
+xor r10d, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+movzx ebp, ah
+mov eax, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+movzx ebp, bl
+xor eax, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+movzx ebp, bh
+mov ebx, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+movzx ebp, ch
+xor eax, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+movzx ebp, cl
+xor ebx, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+shr ecx, 16
+movzx ebp, dl
+xor eax, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+movzx ebp, dh
+xor ebx, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+shr edx, 16
+movzx ebp, ch
+xor edi, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+movzx ebp, cl
+xor ebx, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+movzx ebp, dl
+xor edi, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+movzx ebp, dh
+xor r10d, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+mov ecx, edi
+xor eax, [(r8+0)-4*16+0*4]
+xor ebx, [(r8+0)-4*16+1*4]
+mov edx, r10d
+label4:
+mov r10d, [(r8+0)-4*16+7*4]
+mov edi, [(r8+0)-4*16+6*4]
+movzx ebp, cl
+xor r10d, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+mov cl, al
+movzx ebp, ah
+xor edi, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+shr eax, 16
+movzx ebp, bl
+xor edi, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+movzx ebp, bh
+xor r10d, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+shr ebx, 16
+movzx ebp, al
+xor r10d, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+movzx ebp, ah
+mov eax, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+movzx ebp, bl
+xor eax, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+movzx ebp, bh
+mov ebx, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+movzx ebp, ch
+xor eax, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+movzx ebp, cl
+xor ebx, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+shr ecx, 16
+movzx ebp, dl
+xor eax, DWORD PTR [rsi+8*rbp+(((3+3) MOD (4))+1)]
+movzx ebp, dh
+xor ebx, DWORD PTR [rsi+8*rbp+(((2+3) MOD (4))+1)]
+shr edx, 16
+movzx ebp, ch
+xor edi, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+movzx ebp, cl
+xor ebx, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+movzx ebp, dl
+xor edi, DWORD PTR [rsi+8*rbp+(((1+3) MOD (4))+1)]
+movzx ebp, dh
+xor r10d, DWORD PTR [rsi+8*rbp+(((0+3) MOD (4))+1)]
+mov ecx, edi
+xor eax, [(r8+0)-4*16+4*4]
+xor ebx, [(r8+0)-4*16+5*4]
+mov edx, r10d
+add r8, 32
+test r8, 255
+jnz label2
+sub r8, 16*16
+movzx ebp, ch
+movzx edi, BYTE PTR [rsi+rbp*8+1]
+movzx ebp, dl
+xor edi, DWORD PTR [rsi+rbp*8+0]
+mov WORD PTR [(r8+16*13)+2], di
+movzx ebp, dh
+movzx edi, BYTE PTR [rsi+rbp*8+1]
+movzx ebp, al
+xor edi, DWORD PTR [rsi+rbp*8+0]
+mov WORD PTR [(r8+16*13)+6], di
+shr edx, 16
+movzx ebp, ah
+movzx edi, BYTE PTR [rsi+rbp*8+1]
+movzx ebp, bl
+xor edi, DWORD PTR [rsi+rbp*8+0]
+mov WORD PTR [(r8+16*13)+10], di
+shr eax, 16
+movzx ebp, bh
+movzx edi, BYTE PTR [rsi+rbp*8+1]
+movzx ebp, cl
+xor edi, DWORD PTR [rsi+rbp*8+0]
+mov WORD PTR [(r8+16*13)+14], di
+shr ebx, 16
+movzx ebp, dh
+movzx edi, BYTE PTR [rsi+rbp*8+1]
+movzx ebp, al
+xor edi, DWORD PTR [rsi+rbp*8+0]
+mov WORD PTR [(r8+16*13)+12], di
+shr ecx, 16
+movzx ebp, ah
+movzx edi, BYTE PTR [rsi+rbp*8+1]
+movzx ebp, bl
+xor edi, DWORD PTR [rsi+rbp*8+0]
+mov WORD PTR [(r8+16*13)+0], di
+movzx ebp, bh
+movzx edi, BYTE PTR [rsi+rbp*8+1]
+movzx ebp, cl
+xor edi, DWORD PTR [rsi+rbp*8+0]
+mov WORD PTR [(r8+16*13)+4], di
+movzx ebp, ch
+movzx edi, BYTE PTR [rsi+rbp*8+1]
+movzx ebp, dl
+xor edi, DWORD PTR [rsi+rbp*8+0]
+mov WORD PTR [(r8+16*13)+8], di
+mov rax, [(r8+16*14)+16]
+mov rbx, [(r8+16*14)+24]
+mov rcx, [(r8+16*18+8)]
+sub rcx, 16
+movdqu xmm2, [rax]
+pxor xmm2, xmm4
+movdqa xmm0, [(r8+16*16)+16]
+paddq xmm0, [(r8+16*14)+16]
+movdqa [(r8+16*14)+16], xmm0
+pxor xmm2, [(r8+16*13)]
+movdqu [rbx], xmm2
+jle label7
+mov [(r8+16*18+8)], rcx
+test rcx, 1
+jnz label1
+movdqa xmm0, [(r8+16*16)]
+paddd xmm0, [(r8+16*14)]
+movdqa [(r8+16*14)], xmm0
+jmp label3
+label7:
+mov rbp, [(r8+16*18)]
+pop r12
+pop rbp
+pop rbx
+pop rdi
+pop rsi
+ret
+Rijndael_Enc_AdvancedProcessBlocks ENDP
+
+ALIGN 8
+GCM_AuthenticateBlocks_2K PROC FRAME
+rex_push_reg rsi
+push_reg rdi
+push_reg rbx
+.endprolog
+mov rsi, r8
+mov r11, r9
+movdqa xmm0, [rsi]
+label0:
+movdqu xmm4, [rcx]
+pxor xmm0, xmm4
+movd ebx, xmm0
+mov eax, 0f0f0f0f0h
+and eax, ebx
+shl ebx, 4
+and ebx, 0f0f0f0f0h
+movzx edi, ah
+movdqa xmm5, XMMWORD PTR [rsi + 32 + 1024 + rdi]
+movzx edi, al
+movdqa xmm4, XMMWORD PTR [rsi + 32 + 1024 + rdi]
+shr eax, 16
+movzx edi, ah
+movdqa xmm3, XMMWORD PTR [rsi + 32 + 1024 + rdi]
+movzx edi, al
+movdqa xmm2, XMMWORD PTR [rsi + 32 + 1024 + rdi]
+psrldq xmm0, 4
+movd eax, xmm0
+and eax, 0f0f0f0f0h
+movzx edi, bh
+pxor xmm5, XMMWORD PTR [rsi + 32 + (1-1)*256 + rdi]
+movzx edi, bl
+pxor xmm4, XMMWORD PTR [rsi + 32 + (1-1)*256 + rdi]
+shr ebx, 16
+movzx edi, bh
+pxor xmm3, XMMWORD PTR [rsi + 32 + (1-1)*256 + rdi]
+movzx edi, bl
+pxor xmm2, XMMWORD PTR [rsi + 32 + (1-1)*256 + rdi]
+movd ebx, xmm0
+shl ebx, 4
+and ebx, 0f0f0f0f0h
+movzx edi, ah
+pxor xmm5, XMMWORD PTR [rsi + 32 + 1024 + 1*256 + rdi]
+movzx edi, al
+pxor xmm4, XMMWORD PTR [rsi + 32 + 1024 + 1*256 + rdi]
+shr eax, 16
+movzx edi, ah
+pxor xmm3, XMMWORD PTR [rsi + 32 + 1024 + 1*256 + rdi]
+movzx edi, al
+pxor xmm2, XMMWORD PTR [rsi + 32 + 1024 + 1*256 + rdi]
+psrldq xmm0, 4
+movd eax, xmm0
+and eax, 0f0f0f0f0h
+movzx edi, bh
+pxor xmm5, XMMWORD PTR [rsi + 32 + (2-1)*256 + rdi]
+movzx edi, bl
+pxor xmm4, XMMWORD PTR [rsi + 32 + (2-1)*256 + rdi]
+shr ebx, 16
+movzx edi, bh
+pxor xmm3, XMMWORD PTR [rsi + 32 + (2-1)*256 + rdi]
+movzx edi, bl
+pxor xmm2, XMMWORD PTR [rsi + 32 + (2-1)*256 + rdi]
+movd ebx, xmm0
+shl ebx, 4
+and ebx, 0f0f0f0f0h
+movzx edi, ah
+pxor xmm5, XMMWORD PTR [rsi + 32 + 1024 + 2*256 + rdi]
+movzx edi, al
+pxor xmm4, XMMWORD PTR [rsi + 32 + 1024 + 2*256 + rdi]
+shr eax, 16
+movzx edi, ah
+pxor xmm3, XMMWORD PTR [rsi + 32 + 1024 + 2*256 + rdi]
+movzx edi, al
+pxor xmm2, XMMWORD PTR [rsi + 32 + 1024 + 2*256 + rdi]
+psrldq xmm0, 4
+movd eax, xmm0
+and eax, 0f0f0f0f0h
+movzx edi, bh
+pxor xmm5, XMMWORD PTR [rsi + 32 + (3-1)*256 + rdi]
+movzx edi, bl
+pxor xmm4, XMMWORD PTR [rsi + 32 + (3-1)*256 + rdi]
+shr ebx, 16
+movzx edi, bh
+pxor xmm3, XMMWORD PTR [rsi + 32 + (3-1)*256 + rdi]
+movzx edi, bl
+pxor xmm2, XMMWORD PTR [rsi + 32 + (3-1)*256 + rdi]
+movd ebx, xmm0
+shl ebx, 4
+and ebx, 0f0f0f0f0h
+movzx edi, ah
+pxor xmm5, XMMWORD PTR [rsi + 32 + 1024 + 3*256 + rdi]
+movzx edi, al
+pxor xmm4, XMMWORD PTR [rsi + 32 + 1024 + 3*256 + rdi]
+shr eax, 16
+movzx edi, ah
+pxor xmm3, XMMWORD PTR [rsi + 32 + 1024 + 3*256 + rdi]
+movzx edi, al
+pxor xmm2, XMMWORD PTR [rsi + 32 + 1024 + 3*256 + rdi]
+movzx edi, bh
+pxor xmm5, XMMWORD PTR [rsi + 32 + 3*256 + rdi]
+movzx edi, bl
+pxor xmm4, XMMWORD PTR [rsi + 32 + 3*256 + rdi]
+shr ebx, 16
+movzx edi, bh
+pxor xmm3, XMMWORD PTR [rsi + 32 + 3*256 + rdi]
+movzx edi, bl
+pxor xmm2, XMMWORD PTR [rsi + 32 + 3*256 + rdi]
+movdqa xmm0, xmm3
+pslldq xmm3, 1
+pxor xmm2, xmm3
+movdqa xmm1, xmm2
+pslldq xmm2, 1
+pxor xmm5, xmm2
+psrldq xmm0, 15
+movd rdi, xmm0
+movzx eax, WORD PTR [r11 + rdi*2]
+shl eax, 8
+movdqa xmm0, xmm5
+pslldq xmm5, 1
+pxor xmm4, xmm5
+psrldq xmm1, 15
+movd rdi, xmm1
+xor ax, WORD PTR [r11 + rdi*2]
+shl eax, 8
+psrldq xmm0, 15
+movd rdi, xmm0
+xor ax, WORD PTR [r11 + rdi*2]
+movd xmm0, eax
+pxor xmm0, xmm4
+add rcx, 16
+sub rdx, 1
+jnz label0
+movdqa [rsi], xmm0
+pop rbx
+pop rdi
+pop rsi
+ret
+GCM_AuthenticateBlocks_2K ENDP
+
+ALIGN 8
+GCM_AuthenticateBlocks_64K PROC FRAME
+rex_push_reg rsi
+push_reg rdi
+.endprolog
+mov rsi, r8
+movdqa xmm0, [rsi]
+label1:
+movdqu xmm1, [rcx]
+pxor xmm1, xmm0
+pxor xmm0, xmm0
+movd eax, xmm1
+psrldq xmm1, 4
+movzx edi, al
+add rdi, rdi
+pxor xmm0, [rsi + 32 + (0*4+0)*256*16 + rdi*8]
+movzx edi, ah
+add rdi, rdi
+pxor xmm0, [rsi + 32 + (0*4+1)*256*16 + rdi*8]
+shr eax, 16
+movzx edi, al
+add rdi, rdi
+pxor xmm0, [rsi + 32 + (0*4+2)*256*16 + rdi*8]
+movzx edi, ah
+add rdi, rdi
+pxor xmm0, [rsi + 32 + (0*4+3)*256*16 + rdi*8]
+movd eax, xmm1
+psrldq xmm1, 4
+movzx edi, al
+add rdi, rdi
+pxor xmm0, [rsi + 32 + (1*4+0)*256*16 + rdi*8]
+movzx edi, ah
+add rdi, rdi
+pxor xmm0, [rsi + 32 + (1*4+1)*256*16 + rdi*8]
+shr eax, 16
+movzx edi, al
+add rdi, rdi
+pxor xmm0, [rsi + 32 + (1*4+2)*256*16 + rdi*8]
+movzx edi, ah
+add rdi, rdi
+pxor xmm0, [rsi + 32 + (1*4+3)*256*16 + rdi*8]
+movd eax, xmm1
+psrldq xmm1, 4
+movzx edi, al
+add rdi, rdi
+pxor xmm0, [rsi + 32 + (2*4+0)*256*16 + rdi*8]
+movzx edi, ah
+add rdi, rdi
+pxor xmm0, [rsi + 32 + (2*4+1)*256*16 + rdi*8]
+shr eax, 16
+movzx edi, al
+add rdi, rdi
+pxor xmm0, [rsi + 32 + (2*4+2)*256*16 + rdi*8]
+movzx edi, ah
+add rdi, rdi
+pxor xmm0, [rsi + 32 + (2*4+3)*256*16 + rdi*8]
+movd eax, xmm1
+psrldq xmm1, 4
+movzx edi, al
+add rdi, rdi
+pxor xmm0, [rsi + 32 + (3*4+0)*256*16 + rdi*8]
+movzx edi, ah
+add rdi, rdi
+pxor xmm0, [rsi + 32 + (3*4+1)*256*16 + rdi*8]
+shr eax, 16
+movzx edi, al
+add rdi, rdi
+pxor xmm0, [rsi + 32 + (3*4+2)*256*16 + rdi*8]
+movzx edi, ah
+add rdi, rdi
+pxor xmm0, [rsi + 32 + (3*4+3)*256*16 + rdi*8]
+add rcx, 16
+sub rdx, 1
+jnz label1
+movdqa [rsi], xmm0
+pop rdi
+pop rsi
+ret
+GCM_AuthenticateBlocks_64K ENDP
+
+_TEXT ENDS
+END
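
All of the routines above use the Microsoft x64 calling convention (first four integer arguments in rcx, rdx, r8, r9; return value in rax). For the multiprecision helpers this implies C++-side prototypes along the lines of the sketch below; only the symbol names come from the PROC labels in this file, while the word typedef and parameter names are assumptions.

    // Sketch of extern "C" declarations matching Baseline_Add/Baseline_Sub.
    // Assumes "word" is the library's 64-bit limb type; parameter names are
    // illustrative, only the function names come from this file.
    #include <cstddef>
    typedef unsigned long long word;

    extern "C" {
        // C = A + B over N 64-bit words; returns the final carry (0 or 1)
        int Baseline_Add(std::size_t N, word *C, const word *A, const word *B);
        // C = A - B over N 64-bit words; returns the final borrow (0 or 1)
        int Baseline_Sub(std::size_t N, word *C, const word *A, const word *B);
    }
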