diff options
author | Shay Gueron <shay.gueron@intel.com> | 2014-04-08 19:01:48 -0700 |
---|---|---|
committer | Shay Gueron <shay.gueron@intel.com> | 2014-04-08 19:01:48 -0700 |
commit | 8535b43e4cdd2ac4dfc93053052c666f1774d292 (patch) | |
tree | 91759ab81b466c39c2b77f6237a588cf05c577fe /lib/freebl/intel-gcm-x64-masm.asm | |
parent | b6fd78e9cc5058fa37808d7478b54c37bc93d3df (diff) | |
download | nss-hg-8535b43e4cdd2ac4dfc93053052c666f1774d292.tar.gz |
Bug 979703: Implementation of AES in different modes of operation, using
AES-NI and PCLMULQDQ-NI, for WIN32 and WIN64 platforms. r=wtc.
Diffstat (limited to 'lib/freebl/intel-gcm-x64-masm.asm')
-rw-r--r-- | lib/freebl/intel-gcm-x64-masm.asm | 1301 |
1 files changed, 1301 insertions, 0 deletions
; File: lib/freebl/intel-gcm-x64-masm.asm
; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at http:
; //mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright(c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com
;
; Syntax: MASM (ml64).
; ABI:    Microsoft x64 (integer args in rcx, rdx, r8, r9, then stack;
;         xmm6-xmm15 are callee-saved).

.DATA
ALIGN 16
Lone            dq 1,0                                  ; +1 on the low qword (counter increment)
Ltwo            dq 2,0                                  ; +2 on the low qword
Lbswap_mask     db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 ; vpshufb mask: reverse all 16 bytes
Lshuff_mask     dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh
Lpoly           dq 01h, 0c200000000000000h              ; reduction constant used by the folds

.CODE

;-------------------------------------------------------------------------------
; GFMUL DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
;
; DST = SRC1 * SRC2, carry-less multiplied and reduced with Lpoly.
;   1. Karatsuba: TMP1 = lo*lo, TMP4 = hi*hi, TMP2 = (lo^hi)*(lo^hi),
;      then the middle term is split and folded into TMP1 (low half) and
;      TMP4 (high half).
;   2. Two folding steps on the low half (multiply by the high qword of
;      Lpoly, swap qwords, xor), then xor with the high half.
; Clobbers TMP1..TMP4. DST may be the same register as SRC1 or SRC2 because
; it is written only by the final instruction.
;-------------------------------------------------------------------------------
GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
    vpclmulqdq  TMP1, SRC2, SRC1, 0h
    vpclmulqdq  TMP4, SRC2, SRC1, 011h

    vpshufd     TMP2, SRC2, 78              ; 78 = swap the two qwords
    vpshufd     TMP3, SRC1, 78
    vpxor       TMP2, TMP2, SRC2
    vpxor       TMP3, TMP3, SRC1

    vpclmulqdq  TMP2, TMP2, TMP3, 0h
    vpxor       TMP2, TMP2, TMP1
    vpxor       TMP2, TMP2, TMP4

    vpslldq     TMP3, TMP2, 8
    vpsrldq     TMP2, TMP2, 8

    vpxor       TMP1, TMP1, TMP3            ; TMP1 = low 128 bits of the product
    vpxor       TMP4, TMP4, TMP2            ; TMP4 = high 128 bits of the product

    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h   ; first fold
    vpshufd     TMP3, TMP1, 78
    vpxor       TMP1, TMP2, TMP3

    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h   ; second fold
    vpshufd     TMP3, TMP1, 78
    vpxor       TMP1, TMP2, TMP3

    vpxor       DST, TMP1, TMP4

    ENDM

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the final GCM tag
; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
;                       unsigned char *Tp,
;                       unsigned int Mlen,
;                       unsigned int Alen,
;                       unsigned char *X0,
;                       unsigned char *TAG);
;
; In:    rcx = Htbl, rdx = Tp (running hash, 16 bytes), r8d = Mlen (bytes),
;        r9d = Alen (bytes), [rsp+40] = X0, [rsp+48] = TAG
; Out:   16 bytes written to TAG:
;        byte-reversed( (T xor lengths-in-bits) * Htbl[0] ) xor X0
; Clobb: r8, r9, r10, r11, xmm0-xmm5, flags
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmTAG PROC

Htbl    textequ <rcx>
Tp      textequ <rdx>
Mlen    textequ <r8>
Alen    textequ <r9>
X0      textequ <r10>
TAG     textequ <r11>

T       textequ <xmm0>
TMP0    textequ <xmm1>

    ; 5th and 6th arguments: return address (1*8) + 4 home slots (4*8).
    mov         X0, [rsp + 1*8 + 4*8]
    mov         TAG, [rsp + 1*8 + 5*8]

    ; Mlen/Alen are 32-bit arguments; the upper halves of r8/r9 are not
    ; defined by the calling convention. Zero-extend before the 64-bit
    ; shifts below, otherwise stale upper bits would leak into the length
    ; block.
    mov         r8d, r8d
    mov         r9d, r9d

    vzeroupper
    vmovdqu     T, XMMWORD PTR[Tp]
    vpxor       TMP0, TMP0, TMP0

    shl         Mlen, 3                     ; bytes -> bits
    shl         Alen, 3

    ;vpinsrq    TMP0, TMP0, Mlen, 0
    ;vpinsrq    TMP0, TMP0, Alen, 1
    ; workaround the ml64.exe vpinsrq issue: insert each 64-bit length as
    ; two dwords (low halves first, then the high halves after shr 32).
    vpinsrd     TMP0, TMP0, r8d, 0
    vpinsrd     TMP0, TMP0, r9d, 2
    shr         Mlen, 32
    shr         Alen, 32
    vpinsrd     TMP0, TMP0, r8d, 1
    vpinsrd     TMP0, TMP0, r9d, 3

    vpxor       T, T, TMP0
    vmovdqu     TMP0, XMMWORD PTR[Htbl]
    GFMUL       T, T, TMP0, xmm2, xmm3, xmm4, xmm5

    vpshufb     T, T, [Lbswap_mask]
    vpxor       T, T, [X0]
    vmovdqu     XMMWORD PTR[TAG], T
    vzeroupper

    ret

intel_aes_gcmTAG ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the H table
; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
;
; In:    rcx = Htbl (256 bytes, written), rdx = KS (round keys, 16 bytes
;        each, starting at offset 0), r8d = NR (number of AES rounds)
; Out:   Htbl[0..7]  = successive powers of the hash key (Htbl[0] is the
;                      encrypted zero block, byte-reversed and multiplied
;                      by 2; Htbl[i] = Htbl[i-1] * Htbl[0])
;        Htbl[8..15] = for each entry above, (high qword xor low qword) in
;                      both halves, i.e. the precomputed Karatsuba operand
; Clobb: rdx, r8d, xmm0-xmm5, flags
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmINIT PROC

Htbl    textequ <rcx>
KS      textequ <rdx>
NR      textequ <r8d>

T       textequ <xmm0>
TMP0    textequ <xmm1>

    vzeroupper
    ; AES-ENC(0): an all-zero block xor round key 0 is just round key 0.
    vmovdqu     T, XMMWORD PTR[KS]
    lea         KS, [16 + KS]
    dec         NR                          ; NR-1 middle rounds, then the last round
Lenc_loop:
    vaesenc     T, T, [KS]
    lea         KS, [16 + KS]
    dec         NR
    jnz         Lenc_loop

    vaesenclast T, T, [KS]
    vpshufb     T, T, [Lbswap_mask]

    ;Calculate H` = GFMUL(H, 2)
    vpsrad      xmm3, T, 31
    vpshufd     xmm3, xmm3, 0ffh            ; broadcast the sign of the top dword
    vpand       xmm5, xmm3, [Lpoly]         ; Lpoly if the top bit was set, else 0
    vpsrld      xmm3, T, 31
    vpslld      xmm4, T, 1
    vpslldq     xmm3, xmm3, 4               ; carry bits move up one dword
    vpxor       T, xmm4, xmm3               ; T = T << 1 (128-bit)
    vpxor       T, T, xmm5                  ; conditional reduction

    vmovdqu     TMP0, T                     ; TMP0 = first power, kept as the multiplier
    vmovdqu     XMMWORD PTR[Htbl + 0*16], T

    vpshufd     xmm2, T, 78
    vpxor       xmm2, xmm2, T
    vmovdqu     XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2

    ; Remaining seven powers, unrolled at assembly time.
    i = 1
    WHILE i LT 8
        GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5
        vmovdqu XMMWORD PTR[Htbl + i*16], T
        vpshufd xmm2, T, 78
        vpxor   xmm2, xmm2, T
        vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2
        i = i+1
    ENDM
    vzeroupper
    ret
intel_aes_gcmINIT ENDP


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Authenticate only
; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
;
; In:    rcx = Htbl, rdx = AAD input, r8d = Alen in bytes, r9 = Tp (16-byte
;        running hash, read and written)
; Out:   *Tp updated with the hash of the input blocks
; Clobb: rdx, r8, r10, xmm0-xmm5, flags (xmm6/xmm7 are saved and restored)
; Stack: 32 bytes for the xmm6/xmm7 save area
; Note:  the block arithmetic below only handles lengths that are a
;        multiple of 16; any other length indexes Htbl out of range.
;        NOTE(review): no unwind information is emitted although rsp is
;        adjusted - confirm this is acceptable for the callers.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmAAD PROC

Htbl    textequ <rcx>
inp     textequ <rdx>
len     textequ <r8>
Tp      textequ <r9>
hlp0    textequ <r10>

DATA    textequ <xmm0>
T       textequ <xmm1>
TMP0    textequ <xmm2>
TMP1    textequ <xmm3>
TMP2    textequ <xmm4>
TMP3    textequ <xmm5>
TMP4    textequ <xmm6>
Xhi     textequ <xmm7>

; Accumulate DATA * Htbl[i] into the three Karatsuba partial sums:
; TMP0 += lo*lo, TMP1 += hi*hi, TMP2 += (lo^hi)*(precomputed Htbl[8+i]).
; Clobbers TMP3.
KARATSUBA_AAD MACRO i
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 0h
    vpxor       TMP0, TMP0, TMP3
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 011h
    vpxor       TMP1, TMP1, TMP3
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
    vpxor       TMP2, TMP2, TMP3
ENDM

    ; Alen is a 32-bit argument; the upper half of r8 is not defined by the
    ; calling convention, so zero-extend before using the 64-bit register.
    mov         r8d, r8d
    test        len, len
    jnz         LbeginAAD
    ret

LbeginAAD:
    vzeroupper

    sub         rsp, 2*16
    vmovdqu     XMMWORD PTR[rsp + 0*16], xmm6
    vmovdqu     XMMWORD PTR[rsp + 1*16], xmm7

    ; The running hash is kept as an unreduced pair (Xhi, T): the reduced
    ; value is fold(fold(T)) xor Xhi. Start from (Tp, 0) so that both the
    ; prefix path and the 8-block loop see the incoming hash correctly.
    ; (Starting from (0, Tp) makes the loop fold a non-zero Tp as if it
    ; were an unreduced low half whenever there are no prefix blocks.)
    vmovdqu     Xhi, XMMWORD PTR[Tp]
    vpxor       T, T, T

    ;we hash 8 block each iteration, if the total amount of blocks is not a multiple of 8, we hash the first n%8 blocks first
    mov         hlp0, len
    and         hlp0, 128-1
    jz          Lmod_loop

    and         len, -128
    sub         hlp0, 16                    ; hlp0 = Htbl byte offset for the first prefix block

    ; Prefix block: the incoming hash is xored into the first block only.
    vmovdqu     DATA, XMMWORD PTR[inp]
    vpshufb     DATA, DATA, [Lbswap_mask]
    vpxor       DATA, DATA, Xhi

    vpclmulqdq  TMP0, DATA, [Htbl + hlp0], 0h
    vpclmulqdq  TMP1, DATA, [Htbl + hlp0], 011h
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP2, TMP3, [Htbl + 8*16 + hlp0], 0h

    lea         inp, [inp+16]
    test        hlp0, hlp0
    jnz         Lpre_loop
    jmp         Lred1

    ;hash remaining prefix blocks (up to 7 total prefix blocks)
Lpre_loop:

    sub         hlp0, 16

    vmovdqu     DATA, XMMWORD PTR[inp]
    vpshufb     DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP3, DATA, [Htbl + hlp0], 0h
    vpxor       TMP0, TMP0, TMP3
    vpclmulqdq  TMP3, DATA, [Htbl + hlp0], 011h
    vpxor       TMP1, TMP1, TMP3
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + hlp0], 0h
    vpxor       TMP2, TMP2, TMP3

    test        hlp0, hlp0
    lea         inp, [inp+16]               ; lea leaves the flags from test intact
    jnz         Lpre_loop

Lred1:
    ; Karatsuba fixup: split the middle term into the high and low halves.
    vpxor       TMP2, TMP2, TMP0
    vpxor       TMP2, TMP2, TMP1
    vpsrldq     TMP3, TMP2, 8
    vpslldq     TMP2, TMP2, 8

    vpxor       Xhi, TMP1, TMP3
    vpxor       T, TMP0, TMP2


Lmod_loop:
    ; Each iteration hashes 8 blocks while reducing the previous (Xhi, T)
    ; pair in stages between the multiplications.
    sub         len, 16*8
    jb          Ldone
    ; Block #0
    vmovdqu     DATA, XMMWORD PTR[inp + 16*7]
    vpshufb     DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP0, DATA, [Htbl + 0*16], 0h
    vpclmulqdq  TMP1, DATA, [Htbl + 0*16], 011h
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP2, TMP3, [Htbl + 8*16 + 0*16], 0h

    ; Block #1
    vmovdqu     DATA, XMMWORD PTR[inp + 16*6]
    vpshufb     DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 1

    ; Block #2
    vmovdqu     DATA, XMMWORD PTR[inp + 16*5]
    vpshufb     DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP4, T, [Lpoly], 010h      ;reduction stage 1a
    vpalignr    T, T, T, 8

    KARATSUBA_AAD 2

    vpxor       T, T, TMP4                  ;reduction stage 1b

    ; Block #3
    vmovdqu     DATA, XMMWORD PTR[inp + 16*4]
    vpshufb     DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 3
    ; Block #4
    vmovdqu     DATA, XMMWORD PTR[inp + 16*3]
    vpshufb     DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP4, T, [Lpoly], 010h      ;reduction stage 2a
    vpalignr    T, T, T, 8

    KARATSUBA_AAD 4

    vpxor       T, T, TMP4                  ;reduction stage 2b
    ; Block #5
    vmovdqu     DATA, XMMWORD PTR[inp + 16*2]
    vpshufb     DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 5

    vpxor       T, T, Xhi                   ;reduction finalize
    ; Block #6
    vmovdqu     DATA, XMMWORD PTR[inp + 16*1]
    vpshufb     DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 6
    ; Block #7 (the earliest block of the group absorbs the reduced hash)
    vmovdqu     DATA, XMMWORD PTR[inp + 16*0]
    vpshufb     DATA, DATA, [Lbswap_mask]
    vpxor       DATA, DATA, T
    KARATSUBA_AAD 7
    ; Aggregated 8 blocks, now karatsuba fixup
    vpxor       TMP2, TMP2, TMP0
    vpxor       TMP2, TMP2, TMP1
    vpsrldq     TMP3, TMP2, 8
    vpslldq     TMP2, TMP2, 8

    vpxor       Xhi, TMP1, TMP3
    vpxor       T, TMP0, TMP2

    lea         inp, [inp + 16*8]
    jmp         Lmod_loop

Ldone:
    ; Final reduction of (Xhi, T).
    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpxor       T, T, Xhi
    vmovdqu     XMMWORD PTR[Tp], T
    vzeroupper

    vmovdqu     xmm6, XMMWORD PTR[rsp + 0*16]
    vmovdqu     xmm7, XMMWORD PTR[rsp + 1*16]
    add         rsp, 16*2

    ret

intel_aes_gcmAAD ENDP


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Encrypt and Authenticate
; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
;
; In:    rcx = PT (input), rdx = CT (output), r8 = Gctx, r9d = len in bytes
; Gctx fields read/written here (byte offsets):
;        0             Htbl (16 entries of 16 bytes)
;        16*16 + 1*16  T    running hash, read and written back
;        16*16 + 2*16  CTR  counter block; only its last dword is written back
;        16*16 + 3*16  pointer to the key structure: number of rounds at
;                      +4, round keys from +48
; Clobb: rax, rcx, rdx, r9, r10, xmm0-xmm5, flags. r11-r13, rbp and
;        xmm6-xmm15 are saved and restored.
; Stack: 4 pushes + 160-byte xmm save area + 256-byte 16-aligned scratch:
;        [rsp + 0*16]            tail staging buffer
;        [rsp + 1*16 .. 7*16]    byte-reversed ciphertext awaiting hashing
;        [rsp + 8*16 .. 15*16]   next 8 counter blocks, already xored with
;                                round key 0
;        NOTE(review): no unwind information is emitted although rsp and
;        callee-saved registers are modified - confirm this is acceptable.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmENC PROC

PT      textequ <rcx>
CT      textequ <rdx>
Htbl    textequ <r8>
Gctx    textequ <r8>
len     textequ <r9>
KS      textequ <r10>
NR      textequ <eax>

aluCTR  textequ <r11d>
aluKSl  textequ <r12d>
aluTMP  textequ <r13d>

T       textequ <xmm0>
TMP0    textequ <xmm1>
TMP1    textequ <xmm2>
TMP2    textequ <xmm3>
TMP3    textequ <xmm4>
TMP4    textequ <xmm5>
TMP5    textequ <xmm6>
CTR0    textequ <xmm7>
CTR1    textequ <xmm8>
CTR2    textequ <xmm9>
CTR3    textequ <xmm10>
CTR4    textequ <xmm11>
CTR5    textequ <xmm12>
CTR6    textequ <xmm13>
CTR7    textequ <xmm14>
BSWAPMASK   textequ <xmm15>

; One AES round (round key i) on all eight counter blocks. Clobbers TMP3.
ROUND MACRO i
    vmovdqu     TMP3, XMMWORD PTR[i*16 + KS]
    vaesenc     CTR0, CTR0, TMP3
    vaesenc     CTR1, CTR1, TMP3
    vaesenc     CTR2, CTR2, TMP3
    vaesenc     CTR3, CTR3, TMP3
    vaesenc     CTR4, CTR4, TMP3
    vaesenc     CTR5, CTR5, TMP3
    vaesenc     CTR6, CTR6, TMP3
    vaesenc     CTR7, CTR7, TMP3
ENDM

; ROUND i interleaved with one hash multiplication: TMP5 * Htbl[i] is
; accumulated into TMP0 (Karatsuba middle), TMP1 (high) and TMP2 (low).
; Clobbers TMP3, TMP4.
ROUNDMUL MACRO i
    vmovdqu     TMP3, XMMWORD PTR[i*16 + KS]

    vaesenc     CTR0, CTR0, TMP3
    vaesenc     CTR1, CTR1, TMP3
    vaesenc     CTR2, CTR2, TMP3
    vaesenc     CTR3, CTR3, TMP3

    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5

    vaesenc     CTR4, CTR4, TMP3
    vaesenc     CTR5, CTR5, TMP3
    vaesenc     CTR6, CTR6, TMP3
    vaesenc     CTR7, CTR7, TMP3

    vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
    vpxor       TMP0, TMP0, TMP3
    vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
    vpclmulqdq  TMP3, TMP5, TMP4, 011h
    vpxor       TMP1, TMP1, TMP3
    vpclmulqdq  TMP3, TMP5, TMP4, 000h
    vpxor       TMP2, TMP2, TMP3
ENDM

; The hash half of ROUNDMUL on its own. Clobbers TMP3, TMP4.
KARATSUBA MACRO i
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
    vpxor       TMP0, TMP0, TMP3
    vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
    vpclmulqdq  TMP3, TMP5, TMP4, 011h
    vpxor       TMP1, TMP1, TMP3
    vpclmulqdq  TMP3, TMP5, TMP4, 000h
    vpxor       TMP2, TMP2, TMP3
ENDM

; Advance the 32-bit counter and patch it, already xored with the last dword
; of round key 0, into the last dword of counter slot i of the scratch area.
NEXTCTR MACRO i
    add         aluCTR, 1
    mov         aluTMP, aluCTR
    xor         aluTMP, aluKSl
    bswap       aluTMP
    mov         [3*4 + 8*16 + i*16 + rsp], aluTMP
ENDM

; Rounds 10 .. NR-1 on the eight counter blocks (none when NR = 10); leaves
; the last round key in TMP5. Expects rounds 1..9 to be done already.
AES_TAIL_ROUNDS MACRO
    LOCAL Lhave_last_key
    vmovdqu     TMP5, XMMWORD PTR[10*16 + KS]
    cmp         NR, 10
    je          Lhave_last_key

    ROUND 10
    ROUND 11
    vmovdqu     TMP5, XMMWORD PTR[12*16 + KS]
    cmp         NR, 12
    je          Lhave_last_key

    ROUND 12
    ROUND 13
    vmovdqu     TMP5, XMMWORD PTR[14*16 + KS]
Lhave_last_key:
ENDM

; Last AES round for eight blocks with the xor against 128 bytes at SRC
; folded in: CTRi = aesenclast(CTRi, TMP5 xor SRC[i]). Clobbers TMP3.
AES_LAST8 MACRO SRC
    vpxor       TMP3, TMP5, XMMWORD PTR[0*16 + SRC]
    vaesenclast CTR0, CTR0, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[1*16 + SRC]
    vaesenclast CTR1, CTR1, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[2*16 + SRC]
    vaesenclast CTR2, CTR2, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[3*16 + SRC]
    vaesenclast CTR3, CTR3, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[4*16 + SRC]
    vaesenclast CTR4, CTR4, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[5*16 + SRC]
    vaesenclast CTR5, CTR5, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[6*16 + SRC]
    vaesenclast CTR6, CTR6, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[7*16 + SRC]
    vaesenclast CTR7, CTR7, TMP3
ENDM

; Write eight ciphertext blocks to CT and keep byte-reversed copies for the
; hash of the next pass: the last block in TMP5, the others in scratch slots
; 1..7 in reverse order (slot 7 holds the first block).
ENC_STORE8 MACRO
    vmovdqu     XMMWORD PTR[0*16 + CT], CTR0
    vpshufb     CTR0, CTR0, BSWAPMASK
    vmovdqu     XMMWORD PTR[1*16 + CT], CTR1
    vpshufb     CTR1, CTR1, BSWAPMASK
    vmovdqu     XMMWORD PTR[2*16 + CT], CTR2
    vpshufb     CTR2, CTR2, BSWAPMASK
    vmovdqu     XMMWORD PTR[3*16 + CT], CTR3
    vpshufb     CTR3, CTR3, BSWAPMASK
    vmovdqu     XMMWORD PTR[4*16 + CT], CTR4
    vpshufb     CTR4, CTR4, BSWAPMASK
    vmovdqu     XMMWORD PTR[5*16 + CT], CTR5
    vpshufb     CTR5, CTR5, BSWAPMASK
    vmovdqu     XMMWORD PTR[6*16 + CT], CTR6
    vpshufb     CTR6, CTR6, BSWAPMASK
    vmovdqu     XMMWORD PTR[7*16 + CT], CTR7
    vpshufb     TMP5, CTR7, BSWAPMASK

    vmovdqa     XMMWORD PTR[1*16 + rsp], CTR6
    vmovdqa     XMMWORD PTR[2*16 + rsp], CTR5
    vmovdqa     XMMWORD PTR[3*16 + rsp], CTR4
    vmovdqa     XMMWORD PTR[4*16 + rsp], CTR3
    vmovdqa     XMMWORD PTR[5*16 + rsp], CTR2
    vmovdqa     XMMWORD PTR[6*16 + rsp], CTR1
    vmovdqa     XMMWORD PTR[7*16 + rsp], CTR0
ENDM

; Rounds 1 .. NR of AES on the single block in TMP1, which must already be
; xored with round key 0. Clobbers TMP2.
AES_SINGLE MACRO
    LOCAL Lsingle_last
    vaesenc     TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[10*16 + KS]
    cmp         NR, 10
    je          Lsingle_last
    vaesenc     TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[12*16 + KS]
    cmp         NR, 12
    je          Lsingle_last
    vaesenc     TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[14*16 + KS]
Lsingle_last:
    vaesenclast TMP1, TMP1, TMP2
ENDM


    ; len is a 32-bit argument; the upper half of r9 is not defined by the
    ; calling convention, so zero-extend before using the 64-bit register.
    mov         r9d, r9d
    test        len, len
    jnz         LbeginENC
    ret

LbeginENC:

    vzeroupper
    push        r11
    push        r12
    push        r13
    push        rbp
    sub         rsp, 10*16
    vmovdqu     XMMWORD PTR[rsp + 0*16], xmm6
    vmovdqu     XMMWORD PTR[rsp + 1*16], xmm7
    vmovdqu     XMMWORD PTR[rsp + 2*16], xmm8
    vmovdqu     XMMWORD PTR[rsp + 3*16], xmm9
    vmovdqu     XMMWORD PTR[rsp + 4*16], xmm10
    vmovdqu     XMMWORD PTR[rsp + 5*16], xmm11
    vmovdqu     XMMWORD PTR[rsp + 6*16], xmm12
    vmovdqu     XMMWORD PTR[rsp + 7*16], xmm13
    vmovdqu     XMMWORD PTR[rsp + 8*16], xmm14
    vmovdqu     XMMWORD PTR[rsp + 9*16], xmm15

    mov         rbp, rsp                    ; rbp = base of the xmm save area
    sub         rsp, 16*16
    and         rsp, -16                    ; scratch is accessed with vmovdqa

    vmovdqu     T, XMMWORD PTR[16*16 + 1*16 + Gctx]
    vmovdqu     CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu     BSWAPMASK, XMMWORD PTR[Lbswap_mask]
    mov         KS, [16*16 + 3*16 + Gctx]
    mov         NR, [4 + KS]
    lea         KS, [48 + KS]

    vpshufb     CTR0, CTR0, BSWAPMASK

    mov         aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    mov         aluKSl, [3*4 + KS]
    bswap       aluCTR
    bswap       aluKSl

    ; Slot 0 = counter block xor round key 0.
    vmovdqu     TMP0, XMMWORD PTR[0*16 + KS]
    vpxor       TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu     XMMWORD PTR[8*16 + 0*16 + rsp], TMP0

    cmp         len, 128
    jb          LEncDataSingles
; Prepare the "top" counters
    vmovdqu     XMMWORD PTR[8*16 + 1*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[8*16 + 2*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[8*16 + 3*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[8*16 + 4*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[8*16 + 5*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[8*16 + 6*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[8*16 + 7*16 + rsp], TMP0

; Encrypt the initial 8 blocks
    sub         len, 128
    vpaddd      CTR1, CTR0, XMMWORD PTR[Lone]
    vpaddd      CTR2, CTR0, XMMWORD PTR[Ltwo]
    vpaddd      CTR3, CTR2, XMMWORD PTR[Lone]
    vpaddd      CTR4, CTR2, XMMWORD PTR[Ltwo]
    vpaddd      CTR5, CTR4, XMMWORD PTR[Lone]
    vpaddd      CTR6, CTR4, XMMWORD PTR[Ltwo]
    vpaddd      CTR7, CTR6, XMMWORD PTR[Lone]

    vpshufb     CTR0, CTR0, BSWAPMASK
    vpshufb     CTR1, CTR1, BSWAPMASK
    vpshufb     CTR2, CTR2, BSWAPMASK
    vpshufb     CTR3, CTR3, BSWAPMASK
    vpshufb     CTR4, CTR4, BSWAPMASK
    vpshufb     CTR5, CTR5, BSWAPMASK
    vpshufb     CTR6, CTR6, BSWAPMASK
    vpshufb     CTR7, CTR7, BSWAPMASK

    vmovdqu     TMP3, XMMWORD PTR[0*16 + KS]
    vpxor       CTR0, CTR0, TMP3
    vpxor       CTR1, CTR1, TMP3
    vpxor       CTR2, CTR2, TMP3
    vpxor       CTR3, CTR3, TMP3
    vpxor       CTR4, CTR4, TMP3
    vpxor       CTR5, CTR5, TMP3
    vpxor       CTR6, CTR6, TMP3
    vpxor       CTR7, CTR7, TMP3

    ROUND 1

    ; Slot 0 of the next group is counter + 8; slots 1..7 follow through
    ; NEXTCTR, interleaved with the AES rounds.
    add         aluCTR, 8
    mov         aluTMP, aluCTR
    xor         aluTMP, aluKSl
    bswap       aluTMP
    mov         [8*16 + 0*16 + 3*4 + rsp], aluTMP

    ROUND 2
    NEXTCTR 1
    ROUND 3
    NEXTCTR 2
    ROUND 4
    NEXTCTR 3
    ROUND 5
    NEXTCTR 4
    ROUND 6
    NEXTCTR 5
    ROUND 7
    NEXTCTR 6
    ROUND 8
    NEXTCTR 7
    ROUND 9
    AES_TAIL_ROUNDS
    AES_LAST8 PT
    ENC_STORE8

    lea         CT, [8*16 + CT]
    lea         PT, [8*16 + PT]

LEncDataOctets:
    ; Encrypt the next 8 blocks while hashing the previous 8 ciphertext
    ; blocks (TMP5 plus scratch slots 1..7).
    cmp         len, 128
    jb          LEndEncOctets
    sub         len, 128

    vmovdqa     CTR0, XMMWORD PTR[8*16 + 0*16 + rsp]
    vmovdqa     CTR1, XMMWORD PTR[8*16 + 1*16 + rsp]
    vmovdqa     CTR2, XMMWORD PTR[8*16 + 2*16 + rsp]
    vmovdqa     CTR3, XMMWORD PTR[8*16 + 3*16 + rsp]
    vmovdqa     CTR4, XMMWORD PTR[8*16 + 4*16 + rsp]
    vmovdqa     CTR5, XMMWORD PTR[8*16 + 5*16 + rsp]
    vmovdqa     CTR6, XMMWORD PTR[8*16 + 6*16 + rsp]
    vmovdqa     CTR7, XMMWORD PTR[8*16 + 7*16 + rsp]

    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    vmovdqu     TMP5, XMMWORD PTR[1*16 + rsp]
    ROUNDMUL 1
    NEXTCTR 0
    vmovdqu     TMP5, XMMWORD PTR[2*16 + rsp]
    ROUNDMUL 2
    NEXTCTR 1
    vmovdqu     TMP5, XMMWORD PTR[3*16 + rsp]
    ROUNDMUL 3
    NEXTCTR 2
    vmovdqu     TMP5, XMMWORD PTR[4*16 + rsp]
    ROUNDMUL 4
    NEXTCTR 3
    vmovdqu     TMP5, XMMWORD PTR[5*16 + rsp]
    ROUNDMUL 5
    NEXTCTR 4
    vmovdqu     TMP5, XMMWORD PTR[6*16 + rsp]
    ROUNDMUL 6
    NEXTCTR 5
    vpxor       TMP5, T, XMMWORD PTR[7*16 + rsp]    ; first block absorbs the running hash
    ROUNDMUL 7
    NEXTCTR 6

    ROUND 8
    NEXTCTR 7

    ; Karatsuba fixup: TMP4 = high half, T = low half of the sum.
    vpxor       TMP0, TMP0, TMP1
    vpxor       TMP0, TMP0, TMP2
    vpsrldq     TMP3, TMP0, 8
    vpxor       TMP4, TMP1, TMP3
    vpslldq     TMP3, TMP0, 8
    vpxor       T, TMP2, TMP3

    vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
    vpalignr    T,T,T,8
    vpxor       T, T, TMP1

    ROUND 9

    vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
    vpalignr    T,T,T,8
    vpxor       T, T, TMP1

    AES_TAIL_ROUNDS
    AES_LAST8 PT
    ENC_STORE8

    vpxor       T, T, TMP4                  ; finish the reduction

    lea         CT, [8*16 + CT]
    lea         PT, [8*16 + PT]
    jmp         LEncDataOctets

LEndEncOctets:
    ; Hash the last 8 ciphertext blocks produced above (no AES to overlap).
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    vmovdqu     TMP5, XMMWORD PTR[1*16 + rsp]
    KARATSUBA 1
    vmovdqu     TMP5, XMMWORD PTR[2*16 + rsp]
    KARATSUBA 2
    vmovdqu     TMP5, XMMWORD PTR[3*16 + rsp]
    KARATSUBA 3
    vmovdqu     TMP5, XMMWORD PTR[4*16 + rsp]
    KARATSUBA 4
    vmovdqu     TMP5, XMMWORD PTR[5*16 + rsp]
    KARATSUBA 5
    vmovdqu     TMP5, XMMWORD PTR[6*16 + rsp]
    KARATSUBA 6
    vpxor       TMP5, T, XMMWORD PTR[7*16 + rsp]
    KARATSUBA 7

    vpxor       TMP0, TMP0, TMP1
    vpxor       TMP0, TMP0, TMP2
    vpsrldq     TMP3, TMP0, 8
    vpxor       TMP4, TMP1, TMP3
    vpslldq     TMP3, TMP0, 8
    vpxor       T, TMP2, TMP3

    vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
    vpalignr    T,T,T,8
    vpxor       T, T, TMP1

    vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
    vpalignr    T,T,T,8
    vpxor       T, T, TMP1

    vpxor       T, T, TMP4

    ; aluCTR ran ahead to the value of slot 7; bring it back to slot 0,
    ; which is the next counter block actually consumed.
    sub         aluCTR, 7

LEncDataSingles:
    ; One whole block per iteration.
    cmp         len, 16
    jb          LEncDataTail
    sub         len, 16

    vmovdqa     TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]
    NEXTCTR 0

    AES_SINGLE
    vpxor       TMP1, TMP1, XMMWORD PTR[PT]
    vmovdqu     XMMWORD PTR[CT], TMP1

    lea         PT, [16+PT]
    lea         CT, [16+CT]

    vpshufb     TMP1, TMP1, BSWAPMASK
    vpxor       T, T, TMP1
    vmovdqu     TMP0, XMMWORD PTR[Htbl]
    GFMUL       T, T, TMP0, TMP1, TMP2, TMP3, TMP4

    jmp         LEncDataSingles

LEncDataTail:
    ; Final partial block (len < 16).
    ; NOTE(review): unlike the decrypt tail, the counter is not advanced
    ; here before it is written back to Gctx - confirm that is intended.
    test        len, len
    jz          LEncDataEnd

    vmovdqa     TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]

    AES_SINGLE                              ; TMP1 = key stream block
; zero a temp location
    vpxor       TMP2, TMP2, TMP2
    vmovdqa     XMMWORD PTR[rsp], TMP2
; copy as many bytes as needed
    xor         KS, KS                      ; KS is reused as the byte index from here on

LEncTailIn:
    cmp         len, KS
    je          LEncTailXor
    mov         al, [PT + KS]               ; al aliases NR, which is no longer needed
    mov         [rsp + KS], al
    inc         KS
    jmp         LEncTailIn
LEncTailXor:
    vpxor       TMP1, TMP1, XMMWORD PTR[rsp]
    vmovdqa     XMMWORD PTR[rsp], TMP1
    xor         KS, KS
LEncTailOut:
    cmp         len, KS
    je          LEncTailPad
    mov         al, [rsp + KS]
    mov         [CT + KS], al
    inc         KS
    jmp         LEncTailOut
LEncTailPad:
    ; Zero the bytes past len so the hashed block is ciphertext || zeros.
    cmp         KS, 16
    je          LEncTailHash
    mov         BYTE PTR[rsp + KS], 0
    inc         KS
    jmp         LEncTailPad
LEncTailHash:
    vmovdqa     TMP1, XMMWORD PTR[rsp]
    vpshufb     TMP1, TMP1, BSWAPMASK
    vpxor       T, T, TMP1
    vmovdqu     TMP0, XMMWORD PTR[Htbl]
    GFMUL       T, T, TMP0, TMP1, TMP2, TMP3, TMP4

LEncDataEnd:

    vmovdqu     XMMWORD PTR[16*16 + 1*16 + Gctx], T
    bswap       aluCTR
    mov         [16*16 + 2*16 + 3*4 + Gctx], aluCTR

    mov         rsp, rbp

    vmovdqu     xmm6, XMMWORD PTR[rsp + 0*16]
    vmovdqu     xmm7, XMMWORD PTR[rsp + 1*16]
    vmovdqu     xmm8, XMMWORD PTR[rsp + 2*16]
    vmovdqu     xmm9, XMMWORD PTR[rsp + 3*16]
    vmovdqu     xmm10, XMMWORD PTR[rsp + 4*16]
    vmovdqu     xmm11, XMMWORD PTR[rsp + 5*16]
    vmovdqu     xmm12, XMMWORD PTR[rsp + 6*16]
    vmovdqu     xmm13, XMMWORD PTR[rsp + 7*16]
    vmovdqu     xmm14, XMMWORD PTR[rsp + 8*16]
    vmovdqu     xmm15, XMMWORD PTR[rsp + 9*16]

    add         rsp, 10*16
    pop         rbp
    pop         r13
    pop         r12
    pop         r11

    vzeroupper

    ret
intel_aes_gcmENC ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Decrypt and Authenticate
; void intel_aes_gcmDEC(uint8_t* CT, uint8_t* PT, void *Gctx, unsigned int len);
;
; In:    rcx = CT (input), rdx = PT (output), r8 = Gctx, r9d = len in bytes
;        (the first argument is the ciphertext: see the CT/PT bindings
;        below). Gctx is used exactly as in intel_aes_gcmENC.
; Clobb: rax, rcx, rdx, r9, r10, xmm0-xmm5, flags. r11-r13, rbp and
;        xmm6-xmm15 are saved and restored.
; Stack: 4 pushes + 160-byte xmm save area + 128-byte 16-aligned scratch
;        holding the next 8 counter blocks (slot 0 doubles as the tail
;        staging buffer).
; Reuses the register names and the ROUND / ROUNDMUL / AES_* macros defined
; in intel_aes_gcmENC; only NEXTCTR differs because the counter slots start
; at [rsp] here.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmDEC PROC

NEXTCTR MACRO i
    add         aluCTR, 1
    mov         aluTMP, aluCTR
    xor         aluTMP, aluKSl
    bswap       aluTMP
    mov         [3*4 + i*16 + rsp], aluTMP
ENDM

PT      textequ <rdx>
CT      textequ <rcx>

    ; len is a 32-bit argument; the upper half of r9 is not defined by the
    ; calling convention, so zero-extend before using the 64-bit register.
    mov         r9d, r9d
    test        len, len
    jnz         LbeginDEC
    ret

LbeginDEC:

    vzeroupper
    push        r11
    push        r12
    push        r13
    push        rbp
    sub         rsp, 10*16
    vmovdqu     XMMWORD PTR[rsp + 0*16], xmm6
    vmovdqu     XMMWORD PTR[rsp + 1*16], xmm7
    vmovdqu     XMMWORD PTR[rsp + 2*16], xmm8
    vmovdqu     XMMWORD PTR[rsp + 3*16], xmm9
    vmovdqu     XMMWORD PTR[rsp + 4*16], xmm10
    vmovdqu     XMMWORD PTR[rsp + 5*16], xmm11
    vmovdqu     XMMWORD PTR[rsp + 6*16], xmm12
    vmovdqu     XMMWORD PTR[rsp + 7*16], xmm13
    vmovdqu     XMMWORD PTR[rsp + 8*16], xmm14
    vmovdqu     XMMWORD PTR[rsp + 9*16], xmm15

    mov         rbp, rsp                    ; rbp = base of the xmm save area
    sub         rsp, 8*16
    and         rsp, -16                    ; scratch is accessed with vmovdqa

    vmovdqu     T, XMMWORD PTR[16*16 + 1*16 + Gctx]
    vmovdqu     CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu     BSWAPMASK, XMMWORD PTR[Lbswap_mask]
    mov         KS, [16*16 + 3*16 + Gctx]
    mov         NR, [4 + KS]
    lea         KS, [48 + KS]

    vpshufb     CTR0, CTR0, BSWAPMASK

    mov         aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    mov         aluKSl, [3*4 + KS]
    bswap       aluCTR
    bswap       aluKSl

    ; Slot 0 = counter block xor round key 0.
    vmovdqu     TMP0, XMMWORD PTR[0*16 + KS]
    vpxor       TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu     XMMWORD PTR[0*16 + rsp], TMP0

    cmp         len, 128
    jb          LDecDataSingles
; Prepare the "top" counters
    vmovdqu     XMMWORD PTR[1*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[2*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[3*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[4*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[5*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[6*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[7*16 + rsp], TMP0

    NEXTCTR 1
    NEXTCTR 2
    NEXTCTR 3
    NEXTCTR 4
    NEXTCTR 5
    NEXTCTR 6
    NEXTCTR 7

LDecDataOctets:
    ; Decrypt 8 blocks and hash the same 8 ciphertext blocks in one pass.
    ; All reads of CT happen before the stores to PT.
    cmp         len, 128
    jb          LEndDecOctets
    sub         len, 128

    vmovdqa     CTR0, XMMWORD PTR[0*16 + rsp]
    vmovdqa     CTR1, XMMWORD PTR[1*16 + rsp]
    vmovdqa     CTR2, XMMWORD PTR[2*16 + rsp]
    vmovdqa     CTR3, XMMWORD PTR[3*16 + rsp]
    vmovdqa     CTR4, XMMWORD PTR[4*16 + rsp]
    vmovdqa     CTR5, XMMWORD PTR[5*16 + rsp]
    vmovdqa     CTR6, XMMWORD PTR[6*16 + rsp]
    vmovdqa     CTR7, XMMWORD PTR[7*16 + rsp]

    vmovdqu     TMP5, XMMWORD PTR[7*16 + CT]
    vpshufb     TMP5, TMP5, BSWAPMASK
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    vmovdqu     TMP5, XMMWORD PTR[6*16 + CT]
    vpshufb     TMP5, TMP5, BSWAPMASK
    ROUNDMUL 1
    NEXTCTR 0
    vmovdqu     TMP5, XMMWORD PTR[5*16 + CT]
    vpshufb     TMP5, TMP5, BSWAPMASK
    ROUNDMUL 2
    NEXTCTR 1
    vmovdqu     TMP5, XMMWORD PTR[4*16 + CT]
    vpshufb     TMP5, TMP5, BSWAPMASK
    ROUNDMUL 3
    NEXTCTR 2
    vmovdqu     TMP5, XMMWORD PTR[3*16 + CT]
    vpshufb     TMP5, TMP5, BSWAPMASK
    ROUNDMUL 4
    NEXTCTR 3
    vmovdqu     TMP5, XMMWORD PTR[2*16 + CT]
    vpshufb     TMP5, TMP5, BSWAPMASK
    ROUNDMUL 5
    NEXTCTR 4
    vmovdqu     TMP5, XMMWORD PTR[1*16 + CT]
    vpshufb     TMP5, TMP5, BSWAPMASK
    ROUNDMUL 6
    NEXTCTR 5
    vmovdqu     TMP5, XMMWORD PTR[0*16 + CT]
    vpshufb     TMP5, TMP5, BSWAPMASK
    vpxor       TMP5, TMP5, T               ; first block absorbs the running hash
    ROUNDMUL 7
    NEXTCTR 6

    ROUND 8
    NEXTCTR 7

    ; Karatsuba fixup: TMP4 = high half, T = low half of the sum.
    vpxor       TMP0, TMP0, TMP1
    vpxor       TMP0, TMP0, TMP2
    vpsrldq     TMP3, TMP0, 8
    vpxor       TMP4, TMP1, TMP3
    vpslldq     TMP3, TMP0, 8
    vpxor       T, TMP2, TMP3

    vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
    vpalignr    T,T,T,8
    vpxor       T, T, TMP1

    ROUND 9

    vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
    vpalignr    T,T,T,8
    vpxor       T, T, TMP1

    AES_TAIL_ROUNDS
    AES_LAST8 CT

    vmovdqu     XMMWORD PTR[0*16 + PT], CTR0
    vmovdqu     XMMWORD PTR[1*16 + PT], CTR1
    vmovdqu     XMMWORD PTR[2*16 + PT], CTR2
    vmovdqu     XMMWORD PTR[3*16 + PT], CTR3
    vmovdqu     XMMWORD PTR[4*16 + PT], CTR4
    vmovdqu     XMMWORD PTR[5*16 + PT], CTR5
    vmovdqu     XMMWORD PTR[6*16 + PT], CTR6
    vmovdqu     XMMWORD PTR[7*16 + PT], CTR7

    vpxor       T, T, TMP4                  ; finish the reduction

    lea         CT, [8*16 + CT]
    lea         PT, [8*16 + PT]
    jmp         LDecDataOctets

LEndDecOctets:

    ; aluCTR ran ahead to the value of slot 7; bring it back to slot 0.
    sub         aluCTR, 7

LDecDataSingles:
    ; One whole block per iteration.
    cmp         len, 16
    jb          LDecDataTail
    sub         len, 16

    vmovdqa     TMP1, XMMWORD PTR[0*16 + rsp]
    NEXTCTR 0

    AES_SINGLE

    vmovdqu     TMP2, XMMWORD PTR[CT]       ; keep the ciphertext for hashing
    vpxor       TMP1, TMP1, TMP2
    vmovdqu     XMMWORD PTR[PT], TMP1

    lea         PT, [16+PT]
    lea         CT, [16+CT]

    vpshufb     TMP2, TMP2, BSWAPMASK
    vpxor       T, T, TMP2
    vmovdqu     TMP0, XMMWORD PTR[Htbl]
    GFMUL       T, T, TMP0, TMP1, TMP2, TMP3, TMP4

    jmp         LDecDataSingles

LDecDataTail:
    ; Final partial block (len < 16).
    test        len, len
    jz          LDecDataEnd

    vmovdqa     TMP1, XMMWORD PTR[0*16 + rsp]
    inc         aluCTR
    AES_SINGLE                              ; TMP1 = key stream block
; copy as many bytes as needed
    xor         KS, KS                      ; KS is reused as the byte index from here on
LDecTailIn:
    cmp         len, KS
    je          LDecTailPad
    mov         al, [CT + KS]               ; al aliases NR, which is no longer needed
    mov         [rsp + KS], al
    inc         KS
    jmp         LDecTailIn
LDecTailPad:
    ; Zero the bytes past len so the hashed block is ciphertext || zeros.
    cmp         KS, 16
    je          LDecTailHash
    mov         BYTE PTR[rsp + KS], 0
    inc         KS
    jmp         LDecTailPad
LDecTailHash:
    vmovdqa     TMP2, XMMWORD PTR[rsp]
    vpshufb     TMP2, TMP2, BSWAPMASK
    vpxor       T, T, TMP2
    vmovdqu     TMP0, XMMWORD PTR[Htbl]
    ; TMP5 serves as the first temporary because TMP1 still holds the key stream.
    GFMUL       T, T, TMP0, TMP5, TMP2, TMP3, TMP4


    vpxor       TMP1, TMP1, XMMWORD PTR[rsp]
    vmovdqa     XMMWORD PTR[rsp], TMP1
    xor         KS, KS
LDecTailOut:
    cmp         len, KS
    je          LDecTailWipe
    mov         al, [rsp + KS]
    mov         [PT + KS], al
    inc         KS
    jmp         LDecTailOut
LDecTailWipe:
    ; Zero the staging bytes past len.
    cmp         KS, 16
    je          LDecDataEnd
    mov         BYTE PTR[rsp + KS], 0
    inc         KS
    jmp         LDecTailWipe

LDecDataEnd:

    vmovdqu     XMMWORD PTR[16*16 + 1*16 + Gctx], T
    bswap       aluCTR
    mov         [16*16 + 2*16 + 3*4 + Gctx], aluCTR

    mov         rsp, rbp

    vmovdqu     xmm6, XMMWORD PTR[rsp + 0*16]
    vmovdqu     xmm7, XMMWORD PTR[rsp + 1*16]
    vmovdqu     xmm8, XMMWORD PTR[rsp + 2*16]
    vmovdqu     xmm9, XMMWORD PTR[rsp + 3*16]
    vmovdqu     xmm10, XMMWORD PTR[rsp + 4*16]
    vmovdqu     xmm11, XMMWORD PTR[rsp + 5*16]
    vmovdqu     xmm12, XMMWORD PTR[rsp + 6*16]
    vmovdqu     xmm13, XMMWORD PTR[rsp + 7*16]
    vmovdqu     xmm14, XMMWORD PTR[rsp + 8*16]
    vmovdqu     xmm15, XMMWORD PTR[rsp + 9*16]

    add         rsp, 10*16
    pop         rbp
    pop         r13
    pop         r12
    pop         r11

    vzeroupper

    ret
intel_aes_gcmDEC ENDP


END