summaryrefslogtreecommitdiff
path: root/lib/freebl/intel-gcm-x64-masm.asm
diff options
context:
space:
mode:
authorShay Gueron <shay.gueron@intel.com>2014-04-08 19:01:48 -0700
committerShay Gueron <shay.gueron@intel.com>2014-04-08 19:01:48 -0700
commit8535b43e4cdd2ac4dfc93053052c666f1774d292 (patch)
tree91759ab81b466c39c2b77f6237a588cf05c577fe /lib/freebl/intel-gcm-x64-masm.asm
parentb6fd78e9cc5058fa37808d7478b54c37bc93d3df (diff)
downloadnss-hg-8535b43e4cdd2ac4dfc93053052c666f1774d292.tar.gz
Bug 979703: Implementation of AES in different modes of operation, using
AES-NI and PCLMULQDQ-NI, for WIN32 and WIN64 platforms. r=wtc.
Diffstat (limited to 'lib/freebl/intel-gcm-x64-masm.asm')
-rw-r--r--lib/freebl/intel-gcm-x64-masm.asm1301
1 files changed, 1301 insertions, 0 deletions
diff --git a/lib/freebl/intel-gcm-x64-masm.asm b/lib/freebl/intel-gcm-x64-masm.asm
new file mode 100644
index 000000000..408879d38
--- /dev/null
+++ b/lib/freebl/intel-gcm-x64-masm.asm
@@ -0,0 +1,1301 @@
+; LICENSE:
+; This submission to NSS is to be made available under the terms of the
+; Mozilla Public License, v. 2.0. You can obtain one at http:
+; //mozilla.org/MPL/2.0/.
+;###############################################################################
+; Copyright(c) 2014, Intel Corp.
+; Developers and authors:
+; Shay Gueron and Vlad Krasnov
+; Intel Corporation, Israel Development Centre, Haifa, Israel
+; Please send feedback directly to crypto.feedback.alias@intel.com
+
+
.DATA
ALIGN 16
; 16-byte constants used by the CTR counter arithmetic and GHASH reduction.
Lone        dq 1,0              ; 128-bit constant 1 (low qword first) - CTR increment
Ltwo        dq 2,0              ; 128-bit constant 2 - CTR increment by two
Lbswap_mask db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 ; PSHUFB mask: full 16-byte byte-reversal (big-endian <-> little-endian)
Lshuff_mask dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh  ; nibble mask; NOTE(review): not referenced by any code in this file
Lpoly       dq 01h, 0c200000000000000h ; GCM reduction constant for the polynomial x^128 + x^7 + x^2 + x + 1

.CODE
+
+
;-----------------------------------------------------------------------
; GFMUL: DST = SRC1 * SRC2 in GF(2^128), reduced modulo the GCM polynomial.
; Karatsuba schoolbook: 3 carry-less multiplies instead of 4, followed by
; a two-step folding reduction using the Lpoly constant.
; Clobbers TMP1..TMP4. As used in this file, DST aliases SRC1 (GFMUL T,T,...).
;-----------------------------------------------------------------------
GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
    vpclmulqdq  TMP1, SRC2, SRC1, 0h        ; TMP1 = lo(SRC1)*lo(SRC2)
    vpclmulqdq  TMP4, SRC2, SRC1, 011h      ; TMP4 = hi(SRC1)*hi(SRC2)

    vpshufd     TMP2, SRC2, 78              ; swap 64-bit halves of SRC2
    vpshufd     TMP3, SRC1, 78              ; swap 64-bit halves of SRC1
    vpxor       TMP2, TMP2, SRC2            ; TMP2 = hi(SRC2) ^ lo(SRC2)
    vpxor       TMP3, TMP3, SRC1            ; TMP3 = hi(SRC1) ^ lo(SRC1)

    vpclmulqdq  TMP2, TMP2, TMP3, 0h        ; middle Karatsuba product
    vpxor       TMP2, TMP2, TMP1            ; middle ^= lo-product
    vpxor       TMP2, TMP2, TMP4            ; middle ^= hi-product

    vpslldq     TMP3, TMP2, 8               ; split middle into its two halves
    vpsrldq     TMP2, TMP2, 8

    vpxor       TMP1, TMP1, TMP3            ; 256-bit product: low  128 bits in TMP1
    vpxor       TMP4, TMP4, TMP2            ;                  high 128 bits in TMP4

    ; First reduction fold: multiply low half by the poly constant and rotate.
    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h
    vpshufd     TMP3, TMP1, 78
    vpxor       TMP1, TMP2, TMP3

    ; Second reduction fold (same operation again).
    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h
    vpshufd     TMP3, TMP1, 78
    vpxor       TMP1, TMP2, TMP3

    vpxor       DST, TMP1, TMP4             ; combine with the high half

    ENDM
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Generates the final GCM tag
+; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
+; unsigned char *Tp,
+; unsigned int Mlen,
+; unsigned int Alen,
+; unsigned char *X0,
+; unsigned char *TAG);
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
ALIGN 16
; Compute the final GCM authentication tag:
;   TAG = byteswap( (T ^ (Mlen*8 || Alen*8)) * H ) ^ X0
; Microsoft x64 ABI: Htbl=rcx, Tp=rdx, Mlen=r8, Alen=r9; the 5th and 6th
; arguments (X0, TAG) live on the stack above the 32-byte shadow space.
intel_aes_gcmTAG PROC

Htbl textequ <rcx>
Tp textequ <rdx>
Mlen textequ <r8>
Alen textequ <r9>
X0 textequ <r10>
TAG textequ <r11>

T textequ <xmm0>
TMP0 textequ <xmm1>

    ; [rsp + 8] is the return address; +4*8 skips the 4 register home slots,
    ; so these are the 5th and 6th stack arguments.
    mov     X0, [rsp + 1*8 + 4*8]
    mov     TAG, [rsp + 1*8 + 5*8]

    vzeroupper
    vmovdqu T, XMMWORD PTR[Tp]          ; running GHASH value
    vpxor   TMP0, TMP0, TMP0            ; TMP0 = 0 (build the length block here)

    shl     Mlen, 3                     ; byte counts -> bit counts
    shl     Alen, 3

    ;vpinsrq TMP0, TMP0, Mlen, 0
    ;vpinsrq TMP0, TMP0, Alen, 1
    ; workaround the ml64.exe vpinsrq issue: assemble the two 64-bit
    ; lengths from 32-bit halves instead.
    vpinsrd TMP0, TMP0, r8d, 0          ; low dword of Mlen*8
    vpinsrd TMP0, TMP0, r9d, 2          ; low dword of Alen*8
    shr     Mlen, 32
    shr     Alen, 32
    vpinsrd TMP0, TMP0, r8d, 1          ; high dword of Mlen*8
    vpinsrd TMP0, TMP0, r9d, 3          ; high dword of Alen*8

    vpxor   T, T, TMP0                  ; absorb the length block
    vmovdqu TMP0, XMMWORD PTR[Htbl]     ; H (first entry of the H table)
    GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5

    vpshufb T, T, [Lbswap_mask]         ; back to byte order
    vpxor   T, T, [X0]                  ; xor with E_K(ICB) to finish the tag
    vmovdqu XMMWORD PTR[TAG], T
    vzeroupper

    ret

intel_aes_gcmTAG ENDP
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Generates the H table
+; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
ALIGN 16
; Build the GHASH multiplication table:
;   Htbl[0..7]  = H^1 .. H^8 (bit-reflected), where H = AES-ENC_KS(0)
;   Htbl[8..15] = hi(H^i) ^ lo(H^i) - precomputed Karatsuba middle terms.
; Microsoft x64 ABI: Htbl=rcx, KS=rdx, NR=r8d (number of AES rounds).
intel_aes_gcmINIT PROC

Htbl textequ <rcx>
KS textequ <rdx>
NR textequ <r8d>

T textequ <xmm0>
TMP0 textequ <xmm1>

    vzeroupper
    ; AES-ENC(0): encrypt the all-zero block with the expanded key.
    vmovdqu T, XMMWORD PTR[KS]          ; round-0 key (xor with zero block)
    lea     KS, [16 + KS]
    dec     NR
Lenc_loop:
    vaesenc T, T, [KS]
    lea     KS, [16 + KS]
    dec     NR
    jnz     Lenc_loop

    vaesenclast T, T, [KS]
    vpshufb T, T, [Lbswap_mask]         ; reflect into GHASH bit order

    ;Calculate H` = GFMUL(H, 2): multiply by x in GF(2^128), i.e. a
    ; 128-bit left shift with conditional reduction by Lpoly.
    vpsrad  xmm3, T, 31                 ; broadcast sign bits
    vpshufd xmm3, xmm3, 0ffh            ; replicate the top bit across the register
    vpand   xmm5, xmm3, [Lpoly]         ; reduction term if the top bit was set
    vpsrld  xmm3, T, 31                 ; carry bits between dwords
    vpslld  xmm4, T, 1                  ; shift each dword left by one
    vpslldq xmm3, xmm3, 4               ; move carries into the next dword up
    vpxor   T, xmm4, xmm3               ; combined 128-bit shift
    vpxor   T, T, xmm5                  ; apply reduction

    vmovdqu TMP0, T                     ; keep H for the repeated multiplies
    vmovdqu XMMWORD PTR[Htbl + 0*16], T ; Htbl[0] = H

    vpshufd xmm2, T, 78
    vpxor   xmm2, xmm2, T               ; hi(H) ^ lo(H)
    vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2

    ; Assembly-time loop: emit GFMUL + stores for H^2 .. H^8.
    i = 1
    WHILE i LT 8
        GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5
        vmovdqu XMMWORD PTR[Htbl + i*16], T             ; Htbl[i] = H^(i+1)
        vpshufd xmm2, T, 78
        vpxor   xmm2, xmm2, T
        vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2   ; Karatsuba half-sum
        i = i+1
    ENDM
    vzeroupper
    ret
intel_aes_gcmINIT ENDP
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Authenticate only
+; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
ALIGN 16
; GHASH-only path for additional authenticated data.
; Microsoft x64 ABI: Htbl=rcx, AAD=rdx, Alen(bytes)=r8, Tp=r9.
; Processes 8 blocks per main-loop iteration with a single deferred
; reduction (aggregated reduction); a prefix of len mod 128 bytes is
; hashed first so the main loop always sees whole 8-block groups.
; 'len' is assumed to be a multiple of 16 - TODO confirm against callers.
intel_aes_gcmAAD PROC

Htbl textequ <rcx>
inp textequ <rdx>
len textequ <r8>
Tp textequ <r9>
hlp0 textequ <r10>

DATA textequ <xmm0>
T textequ <xmm1>
TMP0 textequ <xmm2>
TMP1 textequ <xmm3>
TMP2 textequ <xmm4>
TMP3 textequ <xmm5>
TMP4 textequ <xmm6>
Xhi textequ <xmm7>

; Accumulate one Karatsuba partial product of DATA with Htbl[i]:
; TMP0 += middle term, TMP1 += high product, TMP2 += low product.
KARATSUBA_AAD MACRO i
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 0h
    vpxor       TMP0, TMP0, TMP3
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 011h
    vpxor       TMP1, TMP1, TMP3
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
    vpxor       TMP2, TMP2, TMP3
ENDM

    test    len, len
    jnz     LbeginAAD
    ret                                 ; nothing to hash

LbeginAAD:
    vzeroupper

    ; xmm6/xmm7 are callee-saved in the Microsoft x64 ABI - spill them.
    sub     rsp, 2*16
    vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
    vmovdqu XMMWORD PTR[rsp + 1*16], xmm7

    vpxor   Xhi, Xhi, Xhi               ; high half of the unreduced product

    vmovdqu T, XMMWORD PTR[Tp]          ; running GHASH value
    ;we hash 8 block each iteration, if the total amount of blocks is not a multiple of 8, we hash the first n%8 blocks first
    mov     hlp0, len
    and     hlp0, 128-1                 ; hlp0 = len mod 128 (prefix byte count)
    jz      Lmod_loop

    and     len, -128                   ; len = whole 8-block groups remaining
    sub     hlp0, 16

    ; Prefix block: first AAD block absorbs the incoming T value.
    ; Multiplying block k (of n prefix blocks) by H^(n-k) keeps all prefix
    ; products alignable into one accumulated sum.
    vmovdqu DATA, XMMWORD PTR[inp]
    vpshufb DATA, DATA, [Lbswap_mask]
    vpxor   DATA, DATA, T

    vpclmulqdq  TMP0, DATA, [Htbl + hlp0], 0h
    vpclmulqdq  TMP1, DATA, [Htbl + hlp0], 011h
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP2, TMP3, [Htbl + 8*16 + hlp0], 0h

    lea     inp, [inp+16]
    test    hlp0, hlp0
    jnz     Lpre_loop
    jmp     Lred1

    ;hash remaining prefix bocks (up to 7 total prefix blocks)
Lpre_loop:

    sub     hlp0, 16                    ; next block pairs with the next-lower power of H

    vmovdqu DATA, XMMWORD PTR[inp]
    vpshufb DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP3, DATA, [Htbl + hlp0], 0h
    vpxor       TMP0, TMP0, TMP3
    vpclmulqdq  TMP3, DATA, [Htbl + hlp0], 011h
    vpxor       TMP1, TMP1, TMP3
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + hlp0], 0h
    vpxor       TMP2, TMP2, TMP3

    test    hlp0, hlp0
    lea     inp, [inp+16]
    jnz     Lpre_loop

Lred1:
    ; Karatsuba fixup: fold the middle term into the 256-bit product
    ; (T = low 128 bits, Xhi = high 128 bits); reduction is deferred.
    vpxor   TMP2, TMP2, TMP0
    vpxor   TMP2, TMP2, TMP1
    vpsrldq TMP3, TMP2, 8
    vpslldq TMP2, TMP2, 8

    vpxor   Xhi, TMP1, TMP3
    vpxor   T, TMP0, TMP2


Lmod_loop:
    ; Main loop: hash 8 blocks against H^8..H^1 while simultaneously
    ; reducing the previous iteration's 256-bit result (stages interleaved
    ; between block multiplies to hide pclmulqdq latency).
    sub     len, 16*8
    jb      Ldone
    ; Block #0 (last input block of the group, pairs with H^8 via Htbl[0]
    ; after the aggregation order below)
    vmovdqu DATA, XMMWORD PTR[inp + 16*7]
    vpshufb DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP0, DATA, [Htbl + 0*16], 0h
    vpclmulqdq  TMP1, DATA, [Htbl + 0*16], 011h
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP2, TMP3, [Htbl + 8*16 + 0*16], 0h

    ; Block #1
    vmovdqu DATA, XMMWORD PTR[inp + 16*6]
    vpshufb DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 1

    ; Block #2
    vmovdqu DATA, XMMWORD PTR[inp + 16*5]
    vpshufb DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP4, T, [Lpoly], 010h  ;reduction stage 1a
    vpalignr    T, T, T, 8
    KARATSUBA_AAD 2
    vpxor       T, T, TMP4              ;reduction stage 1b

    ; Block #3
    vmovdqu DATA, XMMWORD PTR[inp + 16*4]
    vpshufb DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 3
    ; Block #4
    vmovdqu DATA, XMMWORD PTR[inp + 16*3]
    vpshufb DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP4, T, [Lpoly], 010h  ;reduction stage 2a
    vpalignr    T, T, T, 8
    KARATSUBA_AAD 4
    vpxor       T, T, TMP4              ;reduction stage 2b
    ; Block #5
    vmovdqu DATA, XMMWORD PTR[inp + 16*2]
    vpshufb DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 5

    vpxor   T, T, Xhi                   ;reduction finalize
    ; Block #6
    vmovdqu DATA, XMMWORD PTR[inp + 16*1]
    vpshufb DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 6
    ; Block #7: the first input block of the group absorbs the (now fully
    ; reduced) running value T before its multiply.
    vmovdqu DATA, XMMWORD PTR[inp + 16*0]
    vpshufb DATA, DATA, [Lbswap_mask]
    vpxor   DATA, DATA, T
    KARATSUBA_AAD 7
    ; Aggregated 8 blocks, now karatsuba fixup
    vpxor   TMP2, TMP2, TMP0
    vpxor   TMP2, TMP2, TMP1
    vpsrldq TMP3, TMP2, 8
    vpslldq TMP2, TMP2, 8

    vpxor   Xhi, TMP1, TMP3
    vpxor   T, TMP0, TMP2

    lea     inp, [inp + 16*8]
    jmp     Lmod_loop

Ldone:
    ; Final reduction of the last unreduced 256-bit product.
    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpxor   T, T, Xhi
    vmovdqu XMMWORD PTR[Tp], T          ; write back the updated GHASH value
    vzeroupper

    vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
    vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
    add     rsp, 16*2

    ret

intel_aes_gcmAAD ENDP
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Encrypt and Authenticate
+; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
ALIGN 16
; AES-GCM encrypt: CTR-encrypt 'len' bytes PT -> CT and fold the produced
; ciphertext into the running GHASH value stored in the Gctx structure.
; Microsoft x64 ABI: PT=rcx, CT=rdx, Gctx=r8, len=r9.
; Gctx layout (as read below; defined by the C side - confirm against
; intel-gcm.h): +16*16+1*16 = GHASH value T, +16*16+2*16 = counter block,
; +16*16+3*16 = pointer to the AES key schedule; [KS+4] = round count.
; Main path encrypts 8 blocks per iteration while hashing the previous
; 8 ciphertext blocks; leftovers go through a single-block loop and a
; byte-granular tail.
intel_aes_gcmENC PROC

PT textequ <rcx>
CT textequ <rdx>
Htbl textequ <r8>
Gctx textequ <r8>
len textequ <r9>
KS textequ <r10>
NR textequ <eax>

aluCTR textequ <r11d>
aluKSl textequ <r12d>
aluTMP textequ <r13d>

T textequ <xmm0>
TMP0 textequ <xmm1>
TMP1 textequ <xmm2>
TMP2 textequ <xmm3>
TMP3 textequ <xmm4>
TMP4 textequ <xmm5>
TMP5 textequ <xmm6>
CTR0 textequ <xmm7>
CTR1 textequ <xmm8>
CTR2 textequ <xmm9>
CTR3 textequ <xmm10>
CTR4 textequ <xmm11>
CTR5 textequ <xmm12>
CTR6 textequ <xmm13>
CTR7 textequ <xmm14>
BSWAPMASK textequ <xmm15>

; One AES round over all 8 counter blocks.
ROUND MACRO i
    vmovdqu TMP3, XMMWORD PTR[i*16 + KS]
    vaesenc CTR0, CTR0, TMP3
    vaesenc CTR1, CTR1, TMP3
    vaesenc CTR2, CTR2, TMP3
    vaesenc CTR3, CTR3, TMP3
    vaesenc CTR4, CTR4, TMP3
    vaesenc CTR5, CTR5, TMP3
    vaesenc CTR6, CTR6, TMP3
    vaesenc CTR7, CTR7, TMP3
ENDM
; One AES round over all 8 counter blocks, interleaved with one Karatsuba
; GHASH partial product of TMP5 (a previous ciphertext block) with Htbl[i].
ROUNDMUL MACRO i
    vmovdqu TMP3, XMMWORD PTR[i*16 + KS]

    vaesenc CTR0, CTR0, TMP3
    vaesenc CTR1, CTR1, TMP3
    vaesenc CTR2, CTR2, TMP3
    vaesenc CTR3, CTR3, TMP3

    vpshufd TMP4, TMP5, 78
    vpxor   TMP4, TMP4, TMP5

    vaesenc CTR4, CTR4, TMP3
    vaesenc CTR5, CTR5, TMP3
    vaesenc CTR6, CTR6, TMP3
    vaesenc CTR7, CTR7, TMP3

    vpclmulqdq TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
    vpxor   TMP0, TMP0, TMP3
    vmovdqu TMP4, XMMWORD PTR[i*16 + Htbl]
    vpclmulqdq TMP3, TMP5, TMP4, 011h
    vpxor   TMP1, TMP1, TMP3
    vpclmulqdq TMP3, TMP5, TMP4, 000h
    vpxor   TMP2, TMP2, TMP3
ENDM
; Karatsuba partial product only (no AES interleave) - used to drain the
; GHASH pipeline after the last encryption group.
KARATSUBA MACRO i
    vpshufd TMP4, TMP5, 78
    vpxor   TMP4, TMP4, TMP5
    vpclmulqdq TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
    vpxor   TMP0, TMP0, TMP3
    vmovdqu TMP4, XMMWORD PTR[i*16 + Htbl]
    vpclmulqdq TMP3, TMP5, TMP4, 011h
    vpxor   TMP1, TMP1, TMP3
    vpclmulqdq TMP3, TMP5, TMP4, 000h
    vpxor   TMP2, TMP2, TMP3
ENDM
; Advance the 32-bit big-endian counter of pre-keyed counter block i.
; The blocks stored at rsp+8*16 are already xored with round key 0, so the
; fresh counter dword is xored with the key's last dword (aluKSl) before
; being byte-swapped into place.
NEXTCTR MACRO i
    add     aluCTR, 1
    mov     aluTMP, aluCTR
    xor     aluTMP, aluKSl
    bswap   aluTMP
    mov     [3*4 + 8*16 + i*16 + rsp], aluTMP
ENDM


    test    len, len
    jnz     LbeginENC
    ret                                 ; zero-length: nothing to do

LbeginENC:

    vzeroupper
    ; r12/r13 are callee-saved; r11 is pushed alongside for layout.
    push    r11
    push    r12
    push    r13
    push    rbp
    sub     rsp, 10*16
    ; xmm6-xmm15 are callee-saved in the Microsoft x64 ABI.
    vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
    vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
    vmovdqu XMMWORD PTR[rsp + 2*16], xmm8
    vmovdqu XMMWORD PTR[rsp + 3*16], xmm9
    vmovdqu XMMWORD PTR[rsp + 4*16], xmm10
    vmovdqu XMMWORD PTR[rsp + 5*16], xmm11
    vmovdqu XMMWORD PTR[rsp + 6*16], xmm12
    vmovdqu XMMWORD PTR[rsp + 7*16], xmm13
    vmovdqu XMMWORD PTR[rsp + 8*16], xmm14
    vmovdqu XMMWORD PTR[rsp + 9*16], xmm15

    ; Aligned scratch frame: [rsp..8*16) holds byte-swapped ciphertext
    ; blocks awaiting GHASH, [rsp+8*16..16*16) holds pre-keyed counters.
    mov     rbp, rsp
    sub     rsp, 16*16
    and     rsp, -16

    vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx]     ; running GHASH value
    vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]  ; current counter block
    vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask]
    mov     KS, [16*16 + 3*16 + Gctx]               ; key schedule pointer
    mov     NR, [4 + KS]                            ; AES round count (10/12/14)
    lea     KS, [48 + KS]                           ; skip schedule header to round keys

    vpshufb CTR0, CTR0, BSWAPMASK                   ; counter into little-endian for vpaddd

    ; Keep the low 32 counter bits in a GPR (host byte order) so counter
    ; updates are cheap integer ops instead of SIMD byte swaps.
    mov     aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    mov     aluKSl, [3*4 + KS]
    bswap   aluCTR
    bswap   aluKSl

    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]  ; counter ^ round key 0
    vmovdqu XMMWORD PTR[8*16 + 0*16 + rsp], TMP0

    cmp     len, 128
    jb      LEncDataSingles
; Prepare the "top" counters
    vmovdqu XMMWORD PTR[8*16 + 1*16 + rsp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 2*16 + rsp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 3*16 + rsp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 4*16 + rsp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 5*16 + rsp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 6*16 + rsp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 7*16 + rsp], TMP0

; Encrypt the initial 8 blocks (no ciphertext to hash yet, so plain rounds)
    sub     len, 128
    vpaddd  CTR1, CTR0, XMMWORD PTR[Lone]
    vpaddd  CTR2, CTR0, XMMWORD PTR[Ltwo]
    vpaddd  CTR3, CTR2, XMMWORD PTR[Lone]
    vpaddd  CTR4, CTR2, XMMWORD PTR[Ltwo]
    vpaddd  CTR5, CTR4, XMMWORD PTR[Lone]
    vpaddd  CTR6, CTR4, XMMWORD PTR[Ltwo]
    vpaddd  CTR7, CTR6, XMMWORD PTR[Lone]

    vpshufb CTR0, CTR0, BSWAPMASK
    vpshufb CTR1, CTR1, BSWAPMASK
    vpshufb CTR2, CTR2, BSWAPMASK
    vpshufb CTR3, CTR3, BSWAPMASK
    vpshufb CTR4, CTR4, BSWAPMASK
    vpshufb CTR5, CTR5, BSWAPMASK
    vpshufb CTR6, CTR6, BSWAPMASK
    vpshufb CTR7, CTR7, BSWAPMASK

    vmovdqu TMP3, XMMWORD PTR[0*16 + KS]            ; whitening round
    vpxor   CTR0, CTR0, TMP3
    vpxor   CTR1, CTR1, TMP3
    vpxor   CTR2, CTR2, TMP3
    vpxor   CTR3, CTR3, TMP3
    vpxor   CTR4, CTR4, TMP3
    vpxor   CTR5, CTR5, TMP3
    vpxor   CTR6, CTR6, TMP3
    vpxor   CTR7, CTR7, TMP3

    ROUND   1

    ; Advance the stashed counter 0 past all 8 in-flight blocks.
    add     aluCTR, 8
    mov     aluTMP, aluCTR
    xor     aluTMP, aluKSl
    bswap   aluTMP
    mov     [8*16 + 0*16 + 3*4 + rsp], aluTMP

    ROUND   2
    NEXTCTR 1
    ROUND   3
    NEXTCTR 2
    ROUND   4
    NEXTCTR 3
    ROUND   5
    NEXTCTR 4
    ROUND   6
    NEXTCTR 5
    ROUND   7
    NEXTCTR 6
    ROUND   8
    NEXTCTR 7
    ROUND   9
    ; Key-length dispatch: run extra rounds for AES-192/256 and pick the
    ; matching last round key into TMP5.
    vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f

    ROUND   10
    ROUND   11
    vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f

    ROUND   12
    ROUND   13
    vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
@@:
    ; Last round folded with the plaintext xor: enclast(state, K_last ^ PT).
    vpxor   TMP3, TMP5, XMMWORD PTR[0*16 + PT]
    vaesenclast CTR0, CTR0, TMP3
    vpxor   TMP3, TMP5, XMMWORD PTR[1*16 + PT]
    vaesenclast CTR1, CTR1, TMP3
    vpxor   TMP3, TMP5, XMMWORD PTR[2*16 + PT]
    vaesenclast CTR2, CTR2, TMP3
    vpxor   TMP3, TMP5, XMMWORD PTR[3*16 + PT]
    vaesenclast CTR3, CTR3, TMP3
    vpxor   TMP3, TMP5, XMMWORD PTR[4*16 + PT]
    vaesenclast CTR4, CTR4, TMP3
    vpxor   TMP3, TMP5, XMMWORD PTR[5*16 + PT]
    vaesenclast CTR5, CTR5, TMP3
    vpxor   TMP3, TMP5, XMMWORD PTR[6*16 + PT]
    vaesenclast CTR6, CTR6, TMP3
    vpxor   TMP3, TMP5, XMMWORD PTR[7*16 + PT]
    vaesenclast CTR7, CTR7, TMP3

    ; Store ciphertext; keep byte-swapped copies for next iteration's GHASH.
    vmovdqu XMMWORD PTR[0*16 + CT], CTR0
    vpshufb CTR0, CTR0, BSWAPMASK
    vmovdqu XMMWORD PTR[1*16 + CT], CTR1
    vpshufb CTR1, CTR1, BSWAPMASK
    vmovdqu XMMWORD PTR[2*16 + CT], CTR2
    vpshufb CTR2, CTR2, BSWAPMASK
    vmovdqu XMMWORD PTR[3*16 + CT], CTR3
    vpshufb CTR3, CTR3, BSWAPMASK
    vmovdqu XMMWORD PTR[4*16 + CT], CTR4
    vpshufb CTR4, CTR4, BSWAPMASK
    vmovdqu XMMWORD PTR[5*16 + CT], CTR5
    vpshufb CTR5, CTR5, BSWAPMASK
    vmovdqu XMMWORD PTR[6*16 + CT], CTR6
    vpshufb CTR6, CTR6, BSWAPMASK
    vmovdqu XMMWORD PTR[7*16 + CT], CTR7
    vpshufb TMP5, CTR7, BSWAPMASK       ; last block goes straight into TMP5 for Htbl[0]

    ; Spill swapped blocks 6..0 (reverse order matches Htbl powers H^2..H^8).
    vmovdqa XMMWORD PTR[1*16 + rsp], CTR6
    vmovdqa XMMWORD PTR[2*16 + rsp], CTR5
    vmovdqa XMMWORD PTR[3*16 + rsp], CTR4
    vmovdqa XMMWORD PTR[4*16 + rsp], CTR3
    vmovdqa XMMWORD PTR[5*16 + rsp], CTR2
    vmovdqa XMMWORD PTR[6*16 + rsp], CTR1
    vmovdqa XMMWORD PTR[7*16 + rsp], CTR0

    lea     CT, [8*16 + CT]
    lea     PT, [8*16 + PT]
    jmp     LEncDataOctets

LEncDataOctets:
    ; Steady state: encrypt the next 8 blocks while hashing the previous 8.
    cmp     len, 128
    jb      LEndEncOctets
    sub     len, 128

    vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + rsp]
    vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + rsp]
    vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + rsp]
    vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + rsp]
    vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + rsp]
    vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + rsp]
    vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + rsp]
    vmovdqa CTR7, XMMWORD PTR[8*16 + 7*16 + rsp]

    ; TMP5 still holds the newest previous ciphertext block: pair with H^1.
    vpshufd TMP4, TMP5, 78
    vpxor   TMP4, TMP4, TMP5
    vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
    vpclmulqdq TMP1, TMP5, TMP4, 011h
    vpclmulqdq TMP2, TMP5, TMP4, 000h

    vmovdqu TMP5, XMMWORD PTR[1*16 + rsp]
    ROUNDMUL 1
    NEXTCTR 0
    vmovdqu TMP5, XMMWORD PTR[2*16 + rsp]
    ROUNDMUL 2
    NEXTCTR 1
    vmovdqu TMP5, XMMWORD PTR[3*16 + rsp]
    ROUNDMUL 3
    NEXTCTR 2
    vmovdqu TMP5, XMMWORD PTR[4*16 + rsp]
    ROUNDMUL 4
    NEXTCTR 3
    vmovdqu TMP5, XMMWORD PTR[5*16 + rsp]
    ROUNDMUL 5
    NEXTCTR 4
    vmovdqu TMP5, XMMWORD PTR[6*16 + rsp]
    ROUNDMUL 6
    NEXTCTR 5
    ; Oldest block absorbs the running GHASH value before its multiply.
    vpxor   TMP5, T, XMMWORD PTR[7*16 + rsp]
    ROUNDMUL 7
    NEXTCTR 6

    ROUND   8
    NEXTCTR 7

    ; Karatsuba fixup into (T = low half, TMP4 = high half).
    vpxor   TMP0, TMP0, TMP1
    vpxor   TMP0, TMP0, TMP2
    vpsrldq TMP3, TMP0, 8
    vpxor   TMP4, TMP1, TMP3
    vpslldq TMP3, TMP0, 8
    vpxor   T, TMP2, TMP3

    vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h    ; reduction fold 1
    vpalignr T,T,T,8
    vpxor   T, T, TMP1

    ROUND   9

    vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h    ; reduction fold 2
    vpalignr T,T,T,8
    vpxor   T, T, TMP1

    vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f

    ROUND   10
    ROUND   11
    vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f

    ROUND   12
    ROUND   13
    vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
@@:
    vpxor   TMP3, TMP5, XMMWORD PTR[0*16 + PT]
    vaesenclast CTR0, CTR0, TMP3
    vpxor   TMP3, TMP5, XMMWORD PTR[1*16 + PT]
    vaesenclast CTR1, CTR1, TMP3
    vpxor   TMP3, TMP5, XMMWORD PTR[2*16 + PT]
    vaesenclast CTR2, CTR2, TMP3
    vpxor   TMP3, TMP5, XMMWORD PTR[3*16 + PT]
    vaesenclast CTR3, CTR3, TMP3
    vpxor   TMP3, TMP5, XMMWORD PTR[4*16 + PT]
    vaesenclast CTR4, CTR4, TMP3
    vpxor   TMP3, TMP5, XMMWORD PTR[5*16 + PT]
    vaesenclast CTR5, CTR5, TMP3
    vpxor   TMP3, TMP5, XMMWORD PTR[6*16 + PT]
    vaesenclast CTR6, CTR6, TMP3
    vpxor   TMP3, TMP5, XMMWORD PTR[7*16 + PT]
    vaesenclast CTR7, CTR7, TMP3

    vmovdqu XMMWORD PTR[0*16 + CT], CTR0
    vpshufb CTR0, CTR0, BSWAPMASK
    vmovdqu XMMWORD PTR[1*16 + CT], CTR1
    vpshufb CTR1, CTR1, BSWAPMASK
    vmovdqu XMMWORD PTR[2*16 + CT], CTR2
    vpshufb CTR2, CTR2, BSWAPMASK
    vmovdqu XMMWORD PTR[3*16 + CT], CTR3
    vpshufb CTR3, CTR3, BSWAPMASK
    vmovdqu XMMWORD PTR[4*16 + CT], CTR4
    vpshufb CTR4, CTR4, BSWAPMASK
    vmovdqu XMMWORD PTR[5*16 + CT], CTR5
    vpshufb CTR5, CTR5, BSWAPMASK
    vmovdqu XMMWORD PTR[6*16 + CT], CTR6
    vpshufb CTR6, CTR6, BSWAPMASK
    vmovdqu XMMWORD PTR[7*16 + CT], CTR7
    vpshufb TMP5, CTR7, BSWAPMASK

    vmovdqa XMMWORD PTR[1*16 + rsp], CTR6
    vmovdqa XMMWORD PTR[2*16 + rsp], CTR5
    vmovdqa XMMWORD PTR[3*16 + rsp], CTR4
    vmovdqa XMMWORD PTR[4*16 + rsp], CTR3
    vmovdqa XMMWORD PTR[5*16 + rsp], CTR2
    vmovdqa XMMWORD PTR[6*16 + rsp], CTR1
    vmovdqa XMMWORD PTR[7*16 + rsp], CTR0

    vpxor   T, T, TMP4                  ; complete the reduction with the high half

    lea     CT, [8*16 + CT]
    lea     PT, [8*16 + PT]
    jmp     LEncDataOctets

LEndEncOctets:
    ; Drain: hash the final 8 ciphertext blocks (no more AES interleave).
    vpshufd TMP4, TMP5, 78
    vpxor   TMP4, TMP4, TMP5
    vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
    vpclmulqdq TMP1, TMP5, TMP4, 011h
    vpclmulqdq TMP2, TMP5, TMP4, 000h

    vmovdqu TMP5, XMMWORD PTR[1*16 + rsp]
    KARATSUBA 1
    vmovdqu TMP5, XMMWORD PTR[2*16 + rsp]
    KARATSUBA 2
    vmovdqu TMP5, XMMWORD PTR[3*16 + rsp]
    KARATSUBA 3
    vmovdqu TMP5, XMMWORD PTR[4*16 + rsp]
    KARATSUBA 4
    vmovdqu TMP5, XMMWORD PTR[5*16 + rsp]
    KARATSUBA 5
    vmovdqu TMP5, XMMWORD PTR[6*16 + rsp]
    KARATSUBA 6
    vpxor   TMP5, T, XMMWORD PTR[7*16 + rsp]
    KARATSUBA 7

    vpxor   TMP0, TMP0, TMP1
    vpxor   TMP0, TMP0, TMP2
    vpsrldq TMP3, TMP0, 8
    vpxor   TMP4, TMP1, TMP3
    vpslldq TMP3, TMP0, 8
    vpxor   T, TMP2, TMP3

    vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
    vpalignr T,T,T,8
    vpxor   T, T, TMP1

    vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
    vpalignr T,T,T,8
    vpxor   T, T, TMP1

    vpxor   T, T, TMP4

    ; Counter 0's slot was advanced 8 past the others; rewind for singles.
    sub     aluCTR, 7

LEncDataSingles:
    ; One block at a time: AES-CTR then GFMUL of its ciphertext.
    cmp     len, 16
    jb      LEncDataTail
    sub     len, 16

    vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]    ; next pre-keyed counter
    NEXTCTR 0

    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2
    vpxor   TMP1, TMP1, XMMWORD PTR[PT]
    vmovdqu XMMWORD PTR[CT], TMP1

    lea     PT, [16+PT]
    lea     CT, [16+CT]

    vpshufb TMP1, TMP1, BSWAPMASK
    vpxor   T, T, TMP1
    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   T, T, TMP0, TMP1, TMP2, TMP3, TMP4

    jmp     LEncDataSingles

LEncDataTail:
    ; Final partial block (len < 16 bytes).
    test    len, len
    jz      LEncDataEnd

    vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]

    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2        ; TMP1 = keystream block
; zero a temp location
    vpxor   TMP2, TMP2, TMP2
    vmovdqa XMMWORD PTR[rsp], TMP2
; copy as many bytes as needed (KS register is reused as a byte index here)
    xor     KS, KS

@@:
    cmp     len, KS
    je      @f
    mov     al, [PT + KS]
    mov     [rsp + KS], al
    inc     KS
    jmp     @b
@@:
    vpxor   TMP1, TMP1, XMMWORD PTR[rsp]    ; ciphertext (padded with keystream)
    vmovdqa XMMWORD PTR[rsp], TMP1
    xor     KS, KS
@@:
    ; Emit only len ciphertext bytes...
    cmp     len, KS
    je      @f
    mov     al, [rsp + KS]
    mov     [CT + KS], al
    inc     KS
    jmp     @b
@@:
    ; ...then zero the rest of the scratch block so GHASH sees CT || 0-pad.
    cmp     KS, 16
    je      @f
    mov     BYTE PTR[rsp + KS], 0
    inc     KS
    jmp     @b
@@:
BAIL:
    vmovdqa TMP1, XMMWORD PTR[rsp]
    vpshufb TMP1, TMP1, BSWAPMASK
    vpxor   T, T, TMP1
    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   T, T, TMP0, TMP1, TMP2, TMP3, TMP4

LEncDataEnd:
    ; Persist the updated GHASH value and counter back into Gctx.
    vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T
    bswap   aluCTR
    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR

    mov     rsp, rbp                    ; discard the aligned scratch frame

    vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
    vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
    vmovdqu xmm8, XMMWORD PTR[rsp + 2*16]
    vmovdqu xmm9, XMMWORD PTR[rsp + 3*16]
    vmovdqu xmm10, XMMWORD PTR[rsp + 4*16]
    vmovdqu xmm11, XMMWORD PTR[rsp + 5*16]
    vmovdqu xmm12, XMMWORD PTR[rsp + 6*16]
    vmovdqu xmm13, XMMWORD PTR[rsp + 7*16]
    vmovdqu xmm14, XMMWORD PTR[rsp + 8*16]
    vmovdqu xmm15, XMMWORD PTR[rsp + 9*16]

    add     rsp, 10*16
    pop     rbp
    pop     r13
    pop     r12
    pop     r11

    vzeroupper

    ret
intel_aes_gcmENC ENDP
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Decrypt and Authenticate
+; void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len);
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
ALIGN 16
; AES-GCM decrypt: CTR-decrypt 'len' bytes CT -> PT and fold the consumed
; CIPHERTEXT into the running GHASH value stored in Gctx.
; Microsoft x64 ABI, with PT/CT re-bound below so the signature
;   intel_aes_gcmDEC(PT=rcx? see below, ...)
; matches the C prototype: PT=rdx, CT=rcx, Gctx=r8, len=r9.
; All other register aliases (Gctx, len, KS, NR, aluCTR, T, CTR0..CTR7,
; BSWAPMASK, TMP0..TMP5) are inherited from the intel_aes_gcmENC block above.
; Unlike ENC, GHASH input is the incoming ciphertext, so each 8-block group
; is hashed in the same iteration in which it is decrypted - no pipeline
; warm-up/drain phases are needed.
; Fix vs. original: removed a stray duplicate 'ret' that followed the real
; 'ret' before ENDP (unreachable dead code).
intel_aes_gcmDEC PROC

; Redefine NEXTCTR: DEC keeps its pre-keyed counters at [rsp + i*16]
; (the GHASH spill area used by ENC is not needed here).
NEXTCTR MACRO i
    add     aluCTR, 1
    mov     aluTMP, aluCTR
    xor     aluTMP, aluKSl              ; counters are stored pre-xored with round key 0
    bswap   aluTMP                      ; back to big-endian wire order
    mov     [3*4 + i*16 + rsp], aluTMP
ENDM

PT textequ <rdx>
CT textequ <rcx>

    test    len, len
    jnz     LbeginDEC
    ret                                 ; zero-length: nothing to do

LbeginDEC:

    vzeroupper
    push    r11
    push    r12
    push    r13
    push    rbp
    sub     rsp, 10*16
    ; xmm6-xmm15 are callee-saved in the Microsoft x64 ABI.
    vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
    vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
    vmovdqu XMMWORD PTR[rsp + 2*16], xmm8
    vmovdqu XMMWORD PTR[rsp + 3*16], xmm9
    vmovdqu XMMWORD PTR[rsp + 4*16], xmm10
    vmovdqu XMMWORD PTR[rsp + 5*16], xmm11
    vmovdqu XMMWORD PTR[rsp + 6*16], xmm12
    vmovdqu XMMWORD PTR[rsp + 7*16], xmm13
    vmovdqu XMMWORD PTR[rsp + 8*16], xmm14
    vmovdqu XMMWORD PTR[rsp + 9*16], xmm15

    ; Aligned scratch frame: 8 pre-keyed counter blocks at [rsp + 0..7*16].
    mov     rbp, rsp
    sub     rsp, 8*16
    and     rsp, -16

    vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx]     ; running GHASH value
    vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]  ; current counter block
    vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask]
    mov     KS, [16*16 + 3*16 + Gctx]               ; key schedule pointer
    mov     NR, [4 + KS]                            ; AES round count (10/12/14)
    lea     KS, [48 + KS]                           ; skip schedule header to round keys

    vpshufb CTR0, CTR0, BSWAPMASK

    ; Low 32 counter bits maintained in a GPR in host byte order.
    mov     aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    mov     aluKSl, [3*4 + KS]
    bswap   aluCTR
    bswap   aluKSl

    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]  ; counter ^ round key 0
    vmovdqu XMMWORD PTR[0*16 + rsp], TMP0

    cmp     len, 128
    jb      LDecDataSingles
; Prepare the "top" counters
    vmovdqu XMMWORD PTR[1*16 + rsp], TMP0
    vmovdqu XMMWORD PTR[2*16 + rsp], TMP0
    vmovdqu XMMWORD PTR[3*16 + rsp], TMP0
    vmovdqu XMMWORD PTR[4*16 + rsp], TMP0
    vmovdqu XMMWORD PTR[5*16 + rsp], TMP0
    vmovdqu XMMWORD PTR[6*16 + rsp], TMP0
    vmovdqu XMMWORD PTR[7*16 + rsp], TMP0

    NEXTCTR 1
    NEXTCTR 2
    NEXTCTR 3
    NEXTCTR 4
    NEXTCTR 5
    NEXTCTR 6
    NEXTCTR 7

LDecDataOctets:
    ; 8 blocks per iteration: AES rounds on the counters interleaved with
    ; GHASH of the same group's ciphertext (hashed newest-first so block k
    ; pairs with H^(k+1)).
    cmp     len, 128
    jb      LEndDecOctets
    sub     len, 128

    vmovdqa CTR0, XMMWORD PTR[0*16 + rsp]
    vmovdqa CTR1, XMMWORD PTR[1*16 + rsp]
    vmovdqa CTR2, XMMWORD PTR[2*16 + rsp]
    vmovdqa CTR3, XMMWORD PTR[3*16 + rsp]
    vmovdqa CTR4, XMMWORD PTR[4*16 + rsp]
    vmovdqa CTR5, XMMWORD PTR[5*16 + rsp]
    vmovdqa CTR6, XMMWORD PTR[6*16 + rsp]
    vmovdqa CTR7, XMMWORD PTR[7*16 + rsp]

    ; Last ciphertext block of the group pairs with H^1 (Htbl[0]).
    vmovdqu TMP5, XMMWORD PTR[7*16 + CT]
    vpshufb TMP5, TMP5, BSWAPMASK
    vpshufd TMP4, TMP5, 78
    vpxor   TMP4, TMP4, TMP5
    vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
    vpclmulqdq TMP1, TMP5, TMP4, 011h
    vpclmulqdq TMP2, TMP5, TMP4, 000h

    vmovdqu TMP5, XMMWORD PTR[6*16 + CT]
    vpshufb TMP5, TMP5, BSWAPMASK
    ROUNDMUL 1
    NEXTCTR 0
    vmovdqu TMP5, XMMWORD PTR[5*16 + CT]
    vpshufb TMP5, TMP5, BSWAPMASK
    ROUNDMUL 2
    NEXTCTR 1
    vmovdqu TMP5, XMMWORD PTR[4*16 + CT]
    vpshufb TMP5, TMP5, BSWAPMASK
    ROUNDMUL 3
    NEXTCTR 2
    vmovdqu TMP5, XMMWORD PTR[3*16 + CT]
    vpshufb TMP5, TMP5, BSWAPMASK
    ROUNDMUL 4
    NEXTCTR 3
    vmovdqu TMP5, XMMWORD PTR[2*16 + CT]
    vpshufb TMP5, TMP5, BSWAPMASK
    ROUNDMUL 5
    NEXTCTR 4
    vmovdqu TMP5, XMMWORD PTR[1*16 + CT]
    vpshufb TMP5, TMP5, BSWAPMASK
    ROUNDMUL 6
    NEXTCTR 5
    ; First (oldest) block absorbs the running GHASH value T.
    vmovdqu TMP5, XMMWORD PTR[0*16 + CT]
    vpshufb TMP5, TMP5, BSWAPMASK
    vpxor   TMP5, TMP5, T
    ROUNDMUL 7
    NEXTCTR 6

    ROUND   8
    NEXTCTR 7

    ; Karatsuba fixup into (T = low half, TMP4 = high half).
    vpxor   TMP0, TMP0, TMP1
    vpxor   TMP0, TMP0, TMP2
    vpsrldq TMP3, TMP0, 8
    vpxor   TMP4, TMP1, TMP3
    vpslldq TMP3, TMP0, 8
    vpxor   T, TMP2, TMP3

    vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h    ; reduction fold 1
    vpalignr T,T,T,8
    vpxor   T, T, TMP1

    ROUND   9

    vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h    ; reduction fold 2
    vpalignr T,T,T,8
    vpxor   T, T, TMP1

    ; Key-length dispatch: extra rounds for AES-192/256, last round key in TMP5.
    vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f

    ROUND   10
    ROUND   11
    vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f

    ROUND   12
    ROUND   13
    vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
@@:
    ; Last round folded with the ciphertext xor: enclast(state, K_last ^ CT).
    vpxor   TMP3, TMP5, XMMWORD PTR[0*16 + CT]
    vaesenclast CTR0, CTR0, TMP3
    vpxor   TMP3, TMP5, XMMWORD PTR[1*16 + CT]
    vaesenclast CTR1, CTR1, TMP3
    vpxor   TMP3, TMP5, XMMWORD PTR[2*16 + CT]
    vaesenclast CTR2, CTR2, TMP3
    vpxor   TMP3, TMP5, XMMWORD PTR[3*16 + CT]
    vaesenclast CTR3, CTR3, TMP3
    vpxor   TMP3, TMP5, XMMWORD PTR[4*16 + CT]
    vaesenclast CTR4, CTR4, TMP3
    vpxor   TMP3, TMP5, XMMWORD PTR[5*16 + CT]
    vaesenclast CTR5, CTR5, TMP3
    vpxor   TMP3, TMP5, XMMWORD PTR[6*16 + CT]
    vaesenclast CTR6, CTR6, TMP3
    vpxor   TMP3, TMP5, XMMWORD PTR[7*16 + CT]
    vaesenclast CTR7, CTR7, TMP3

    vmovdqu XMMWORD PTR[0*16 + PT], CTR0
    vmovdqu XMMWORD PTR[1*16 + PT], CTR1
    vmovdqu XMMWORD PTR[2*16 + PT], CTR2
    vmovdqu XMMWORD PTR[3*16 + PT], CTR3
    vmovdqu XMMWORD PTR[4*16 + PT], CTR4
    vmovdqu XMMWORD PTR[5*16 + PT], CTR5
    vmovdqu XMMWORD PTR[6*16 + PT], CTR6
    vmovdqu XMMWORD PTR[7*16 + PT], CTR7

    vpxor   T, T, TMP4                  ; complete the reduction with the high half

    lea     CT, [8*16 + CT]
    lea     PT, [8*16 + PT]
    jmp     LDecDataOctets

LEndDecOctets:

    sub     aluCTR, 7                   ; rewind: counter 0's slot ran ahead of the others

LDecDataSingles:
    ; One block at a time: AES-CTR then GFMUL of the consumed ciphertext.
    cmp     len, 16
    jb      LDecDataTail
    sub     len, 16

    vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]   ; next pre-keyed counter
    NEXTCTR 0

    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2

    vmovdqu TMP2, XMMWORD PTR[CT]       ; keep the ciphertext for GHASH
    vpxor   TMP1, TMP1, TMP2
    vmovdqu XMMWORD PTR[PT], TMP1

    lea     PT, [16+PT]
    lea     CT, [16+CT]

    vpshufb TMP2, TMP2, BSWAPMASK
    vpxor   T, T, TMP2
    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   T, T, TMP0, TMP1, TMP2, TMP3, TMP4

    jmp     LDecDataSingles

LDecDataTail:
    ; Final partial block (len < 16 bytes).
    test    len, len
    jz      LDecDataEnd

    vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]
    inc     aluCTR
    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2        ; TMP1 = keystream block
; copy as many bytes as needed (KS register is reused as a byte index here)
    xor     KS, KS
@@:
    cmp     len, KS
    je      @f
    mov     al, [CT + KS]
    mov     [rsp + KS], al
    inc     KS
    jmp     @b
@@:
    ; Zero-pad the partial ciphertext block so GHASH sees CT || 0-pad.
    cmp     KS, 16
    je      @f
    mov     BYTE PTR[rsp + KS], 0
    inc     KS
    jmp     @b
@@:
    vmovdqa TMP2, XMMWORD PTR[rsp]
    vpshufb TMP2, TMP2, BSWAPMASK
    vpxor   T, T, TMP2
    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   T, T, TMP0, TMP5, TMP2, TMP3, TMP4


    ; Decrypt the padded block and emit only len plaintext bytes.
    vpxor   TMP1, TMP1, XMMWORD PTR[rsp]
    vmovdqa XMMWORD PTR[rsp], TMP1
    xor     KS, KS
@@:
    cmp     len, KS
    je      @f
    mov     al, [rsp + KS]
    mov     [PT + KS], al
    inc     KS
    jmp     @b
@@:
    ; Scrub the remaining keystream bytes from the stack scratch block.
    cmp     KS, 16
    je      @f
    mov     BYTE PTR[rsp + KS], 0
    inc     KS
    jmp     @b
@@:

LDecDataEnd:
    ; Persist the updated GHASH value and counter back into Gctx.
    vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T
    bswap   aluCTR
    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR

    mov     rsp, rbp                    ; discard the aligned scratch frame

    vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
    vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
    vmovdqu xmm8, XMMWORD PTR[rsp + 2*16]
    vmovdqu xmm9, XMMWORD PTR[rsp + 3*16]
    vmovdqu xmm10, XMMWORD PTR[rsp + 4*16]
    vmovdqu xmm11, XMMWORD PTR[rsp + 5*16]
    vmovdqu xmm12, XMMWORD PTR[rsp + 6*16]
    vmovdqu xmm13, XMMWORD PTR[rsp + 7*16]
    vmovdqu xmm14, XMMWORD PTR[rsp + 8*16]
    vmovdqu xmm15, XMMWORD PTR[rsp + 9*16]

    add     rsp, 10*16
    pop     rbp
    pop     r13
    pop     r12
    pop     r11

    vzeroupper

    ret
intel_aes_gcmDEC ENDP
+
+
+END