summary refs log tree commit diff
path: root/lib/freebl/intel-aes.s
diff options
context:
space:
mode:
Diffstat (limited to 'lib/freebl/intel-aes.s')
-rw-r--r-- lib/freebl/intel-aes.s 2488
1 files changed, 2488 insertions, 0 deletions
diff --git a/lib/freebl/intel-aes.s b/lib/freebl/intel-aes.s
new file mode 100644
index 000000000..a83529a48
--- /dev/null
+++ b/lib/freebl/intel-aes.s
@@ -0,0 +1,2488 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+ .text
+
+#define IV_OFFSET 16
+#define EXPANDED_KEY_OFFSET 48
+
+
+/* intel_aes_encrypt_init_128: expand a 128-bit key into the eleven
+ 16-byte round keys used by the 128-bit encrypt routines.
+ in %rdi : the key (16 bytes are read)
+ in %rsi : buffer for expanded key (11 * 16 = 176 bytes are written)
+ Leaves 0 in %eax. Overwrites %rsi and %xmm1, plus %xmm2 and %xmm3
+ through key_expansion128.
+*/
+ .type intel_aes_encrypt_init_128,@function
+ .globl intel_aes_encrypt_init_128
+ .align 16
+intel_aes_encrypt_init_128:
+ movups (%rdi), %xmm1 /* round key 0 is the key itself */
+ movups %xmm1, (%rsi)
+ leaq 16(%rsi), %rsi /* %rsi -> slot for round key 1 */
+ xorl %eax, %eax /* key_expansion128 loads %xmm3 from %eax */
+
+/* Ten expansion steps follow. The immediate of each aeskeygenassist is
+ the round constant for that step; key_expansion128 turns the result
+ into the next round key in %xmm1, stores it at (%rsi) and advances
+ %rsi by 16. */
+ .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x01 /* aeskeygenassist $0x01, %xmm1, %xmm2 */
+ call key_expansion128
+ .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x02 /* aeskeygenassist $0x02, %xmm1, %xmm2 */
+ call key_expansion128
+ .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x04 /* aeskeygenassist $0x04, %xmm1, %xmm2 */
+ call key_expansion128
+ .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x08 /* aeskeygenassist $0x08, %xmm1, %xmm2 */
+ call key_expansion128
+ .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x10 /* aeskeygenassist $0x10, %xmm1, %xmm2 */
+ call key_expansion128
+ .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x20 /* aeskeygenassist $0x20, %xmm1, %xmm2 */
+ call key_expansion128
+ .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x40 /* aeskeygenassist $0x40, %xmm1, %xmm2 */
+ call key_expansion128
+ .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x80 /* aeskeygenassist $0x80, %xmm1, %xmm2 */
+ call key_expansion128
+ .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x1b /* aeskeygenassist $0x1b, %xmm1, %xmm2 */
+ call key_expansion128
+ .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x36 /* aeskeygenassist $0x36, %xmm1, %xmm2 */
+ call key_expansion128
+
+ ret
+ .size intel_aes_encrypt_init_128, .-intel_aes_encrypt_init_128
+
+
+/* intel_aes_decrypt_init_128: expand a 128-bit key into the eleven
+ 16-byte round keys used by the 128-bit decrypt routines (which apply
+ them with aesdec / aesdeclast).
+ in %rdi : the key (16 bytes are read)
+ in %rsi : buffer for expanded key (11 * 16 = 176 bytes are written)
+ Round keys 1 through 9 are stored after being passed through aesimc;
+ round key 0 (the key itself) and round key 10 are stored unchanged.
+ Leaves 0 in %eax. Overwrites %rsi and %xmm1, plus %xmm2 and %xmm3
+ (%xmm3 only through key_expansion128).
+*/
+ .type intel_aes_decrypt_init_128,@function
+ .globl intel_aes_decrypt_init_128
+ .align 16
+intel_aes_decrypt_init_128:
+ movups (%rdi), %xmm1 /* round key 0 is the key itself */
+ movups %xmm1, (%rsi)
+ leaq 16(%rsi), %rsi /* %rsi -> slot for round key 1 */
+ xorl %eax, %eax /* key_expansion128 loads %xmm3 from %eax */
+
+/* Pattern for round keys 1..9: key_expansion128 leaves the new key in
+ %xmm1 and has already advanced %rsi past the slot it wrote, so the
+ aesimc result is written to -16(%rsi), replacing that slot. %xmm1
+ keeps the untransformed key, which feeds the next expansion step. */
+ .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x01 /* aeskeygenassist $0x01, %xmm1, %xmm2 */
+ call key_expansion128
+ .byte 0x66,0x0f,0x38,0xdb,0xd1 /* aesimc %xmm1, %xmm2 */
+ movups %xmm2, -16(%rsi)
+ .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x02 /* aeskeygenassist $0x02, %xmm1, %xmm2 */
+ call key_expansion128
+ .byte 0x66,0x0f,0x38,0xdb,0xd1 /* aesimc %xmm1, %xmm2 */
+ movups %xmm2, -16(%rsi)
+ .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x04 /* aeskeygenassist $0x04, %xmm1, %xmm2 */
+ call key_expansion128
+ .byte 0x66,0x0f,0x38,0xdb,0xd1 /* aesimc %xmm1, %xmm2 */
+ movups %xmm2, -16(%rsi)
+ .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x08 /* aeskeygenassist $0x08, %xmm1, %xmm2 */
+ call key_expansion128
+ .byte 0x66,0x0f,0x38,0xdb,0xd1 /* aesimc %xmm1, %xmm2 */
+ movups %xmm2, -16(%rsi)
+ .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x10 /* aeskeygenassist $0x10, %xmm1, %xmm2 */
+ call key_expansion128
+ .byte 0x66,0x0f,0x38,0xdb,0xd1 /* aesimc %xmm1, %xmm2 */
+ movups %xmm2, -16(%rsi)
+ .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x20 /* aeskeygenassist $0x20, %xmm1, %xmm2 */
+ call key_expansion128
+ .byte 0x66,0x0f,0x38,0xdb,0xd1 /* aesimc %xmm1, %xmm2 */
+ movups %xmm2, -16(%rsi)
+ .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x40 /* aeskeygenassist $0x40, %xmm1, %xmm2 */
+ call key_expansion128
+ .byte 0x66,0x0f,0x38,0xdb,0xd1 /* aesimc %xmm1, %xmm2 */
+ movups %xmm2, -16(%rsi)
+ .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x80 /* aeskeygenassist $0x80, %xmm1, %xmm2 */
+ call key_expansion128
+ .byte 0x66,0x0f,0x38,0xdb,0xd1 /* aesimc %xmm1, %xmm2 */
+ movups %xmm2, -16(%rsi)
+ .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x1b /* aeskeygenassist $0x1b, %xmm1, %xmm2 */
+ call key_expansion128
+ .byte 0x66,0x0f,0x38,0xdb,0xd1 /* aesimc %xmm1, %xmm2 */
+ movups %xmm2, -16(%rsi)
+/* The last round key is stored as produced, without aesimc. */
+ .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x36 /* aeskeygenassist $0x36, %xmm1, %xmm2 */
+ call key_expansion128
+
+ ret
+ .size intel_aes_decrypt_init_128, .-intel_aes_decrypt_init_128
+
+
+/* key_expansion128: file-local helper (no .globl) that derives the next
+ 128-bit round key and appends it to the key schedule.
+ in %xmm1 : previous round key
+ in %xmm2 : aeskeygenassist result for the previous round key
+ in %eax : loaded into %xmm3 as the "zero" source for the shuffles;
+ both callers in this file set it to 0 before the first call
+ in %rsi : where to store the new round key
+ out %xmm1 : new round key (also stored at the incoming %rsi)
+ out %rsi : advanced by 16
+ Overwrites %xmm2 and %xmm3; does not use the stack.
+*/
+ .type key_expansion128,@function
+ .align 16
+key_expansion128:
+ movd %eax, %xmm3 /* %xmm3 = { %eax, 0, 0, 0 } */
+ pshufd $0xff, %xmm2, %xmm2 /* broadcast dword 3 of the assist result */
+ shufps $0x10, %xmm1, %xmm3 /* the two shufps/pxor pairs XOR each key */
+ pxor %xmm3, %xmm1 /* dword with all lower-numbered dwords */
+ shufps $0x8c, %xmm1, %xmm3
+ pxor %xmm2, %xmm1 /* fold in the broadcast assist word */
+ pxor %xmm3, %xmm1
+ movdqu %xmm1, (%rsi)
+ addq $16, %rsi
+ ret
+ .size key_expansion128, .-key_expansion128
+
+
+/* intel_aes_encrypt_ecb_128: AES-128 ECB encryption of inputLen bytes.
+ Input is handled eight blocks (128 bytes) at a time while at least
+ 128 bytes remain, then one 16-byte block at a time.
+ in %rdi : cx - context (expanded key is read from offset 48)
+ in %rsi : output - pointer to output buffer
+ in %rdx : outputLen - pointer to variable for length of output
+ (filled by caller)
+ in %rcx : maxOutputLen - length of output buffer
+ in %r8 : input - pointer to input buffer
+ in %r9 : inputLen - length of input buffer
+ on stack: blocksize - AES blocksize (always 16, unused)
+ %rdx and %rcx are not touched here. Returns 0 in %eax.
+ The single-block loop ends only when the offset equals inputLen
+ exactly, so inputLen must be a multiple of 16.
+*/
+ .type intel_aes_encrypt_ecb_128,@function
+ .globl intel_aes_encrypt_ecb_128
+ .align 16
+intel_aes_encrypt_ecb_128:
+// leaq EXPANDED_KEY_OFFSET(%rdi), %rdi
+ leaq 48(%rdi), %rdi
+
+ movdqu (%rdi), %xmm2 /* round key 0 */
+ movdqu 160(%rdi), %xmm12 /* round key 10 (last) */
+ xor %eax, %eax /* %rax = byte offset into input/output */
+// cmpq $8*16, %r9
+ cmpq $128, %r9
+ jb 1f
+// leaq -8*16(%r9), %r11
+ leaq -128(%r9), %r11 /* last offset at which 8 blocks still fit */
+2: movdqu (%r8, %rax), %xmm3
+ movdqu 16(%r8, %rax), %xmm4
+ movdqu 32(%r8, %rax), %xmm5
+ movdqu 48(%r8, %rax), %xmm6
+ movdqu 64(%r8, %rax), %xmm7
+ movdqu 80(%r8, %rax), %xmm8
+ movdqu 96(%r8, %rax), %xmm9
+ movdqu 112(%r8, %rax), %xmm10
+ pxor %xmm2, %xmm3
+ pxor %xmm2, %xmm4
+ pxor %xmm2, %xmm5
+ pxor %xmm2, %xmm6
+ pxor %xmm2, %xmm7
+ pxor %xmm2, %xmm8
+ pxor %xmm2, %xmm9
+ pxor %xmm2, %xmm10
+
+// complete loop unrolling
+/* round keys 1 and 2 */
+ movdqu 16(%rdi), %xmm1
+ movdqu 32(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xdc,0xd9 /* aesenc %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe1 /* aesenc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe9 /* aesenc %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf1 /* aesenc %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf9 /* aesenc %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1 /* aesenc %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1 /* aesenc %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb /* aesenc %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3 /* aesenc %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb /* aesenc %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3 /* aesenc %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb /* aesenc %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3 /* aesenc %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb /* aesenc %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3 /* aesenc %xmm11, %xmm10 */
+
+/* round keys 3 and 4 */
+ movdqu 48(%rdi), %xmm1
+ movdqu 64(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xdc,0xd9 /* aesenc %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe1 /* aesenc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe9 /* aesenc %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf1 /* aesenc %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf9 /* aesenc %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1 /* aesenc %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1 /* aesenc %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb /* aesenc %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3 /* aesenc %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb /* aesenc %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3 /* aesenc %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb /* aesenc %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3 /* aesenc %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb /* aesenc %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3 /* aesenc %xmm11, %xmm10 */
+
+/* round keys 5 and 6 */
+ movdqu 80(%rdi), %xmm1
+ movdqu 96(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xdc,0xd9 /* aesenc %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe1 /* aesenc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe9 /* aesenc %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf1 /* aesenc %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf9 /* aesenc %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1 /* aesenc %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1 /* aesenc %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb /* aesenc %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3 /* aesenc %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb /* aesenc %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3 /* aesenc %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb /* aesenc %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3 /* aesenc %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb /* aesenc %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3 /* aesenc %xmm11, %xmm10 */
+
+/* round keys 7 and 8 */
+ movdqu 112(%rdi), %xmm1
+ movdqu 128(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xdc,0xd9 /* aesenc %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe1 /* aesenc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe9 /* aesenc %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf1 /* aesenc %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf9 /* aesenc %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1 /* aesenc %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1 /* aesenc %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb /* aesenc %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3 /* aesenc %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb /* aesenc %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3 /* aesenc %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb /* aesenc %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3 /* aesenc %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb /* aesenc %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3 /* aesenc %xmm11, %xmm10 */
+
+/* round key 9, then the final round with round key 10 */
+ movdqu 144(%rdi), %xmm1
+ .byte 0x66,0x0f,0x38,0xdc,0xd9 /* aesenc %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe1 /* aesenc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe9 /* aesenc %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf1 /* aesenc %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf9 /* aesenc %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1 /* aesenc %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1 /* aesenc %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xdc /* aesenclast %xmm12, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xe4 /* aesenclast %xmm12, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xec /* aesenclast %xmm12, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xf4 /* aesenclast %xmm12, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xfc /* aesenclast %xmm12, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xdd,0xc4 /* aesenclast %xmm12, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xdd,0xcc /* aesenclast %xmm12, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xdd,0xd4 /* aesenclast %xmm12, %xmm10 */
+
+ movdqu %xmm3, (%rsi, %rax)
+ movdqu %xmm4, 16(%rsi, %rax)
+ movdqu %xmm5, 32(%rsi, %rax)
+ movdqu %xmm6, 48(%rsi, %rax)
+ movdqu %xmm7, 64(%rsi, %rax)
+ movdqu %xmm8, 80(%rsi, %rax)
+ movdqu %xmm9, 96(%rsi, %rax)
+ movdqu %xmm10, 112(%rsi, %rax)
+// addq $8*16, %rax
+ addq $128, %rax
+ cmpq %r11, %rax
+ jbe 2b /* another full group of 8 blocks fits */
+1: cmpq %rax, %r9
+ je 5f /* nothing left */
+
+/* Fewer than 8 blocks remain: keep round keys 1..9 in %xmm3..%xmm11
+ and encrypt one block per iteration. */
+ movdqu 16(%rdi), %xmm3
+ movdqu 32(%rdi), %xmm4
+ movdqu 48(%rdi), %xmm5
+ movdqu 64(%rdi), %xmm6
+ movdqu 80(%rdi), %xmm7
+ movdqu 96(%rdi), %xmm8
+ movdqu 112(%rdi), %xmm9
+ movdqu 128(%rdi), %xmm10
+ movdqu 144(%rdi), %xmm11
+
+4: movdqu (%r8, %rax), %xmm1
+ pxor %xmm2, %xmm1
+ .byte 0x66,0x0f,0x38,0xdc,0xcb /* aesenc %xmm3, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xcc /* aesenc %xmm4, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xcd /* aesenc %xmm5, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xce /* aesenc %xmm6, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xcf /* aesenc %xmm7, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xc8 /* aesenc %xmm8, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm9, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xca /* aesenc %xmm10, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xcb /* aesenc %xmm11, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xcc /* aesenclast %xmm12, %xmm1 */
+ movdqu %xmm1, (%rsi, %rax)
+ addq $16, %rax
+ cmpq %rax, %r9
+ jne 4b
+
+5: xor %eax, %eax /* return 0 */
+ ret
+ .size intel_aes_encrypt_ecb_128, .-intel_aes_encrypt_ecb_128
+
+
+/* intel_aes_decrypt_ecb_128: AES-128 ECB decryption of inputLen bytes.
+ Round keys are applied in reverse order (offset 160 first, offset 0
+ last) with aesdec / aesdeclast. Input is handled eight blocks
+ (128 bytes) at a time while at least 128 bytes remain, then one
+ 16-byte block at a time.
+ in %rdi : cx - context (expanded key is read from offset 48)
+ in %rsi : output - pointer to output buffer
+ in %rdx : outputLen - pointer to variable for length of output
+ (filled by caller)
+ in %rcx : maxOutputLen - length of output buffer
+ in %r8 : input - pointer to input buffer
+ in %r9 : inputLen - length of input buffer
+ on stack: blocksize - AES blocksize (always 16, unused)
+ %rdx and %rcx are not touched here. Returns 0 in %eax.
+ The single-block loop ends only when the offset equals inputLen
+ exactly, so inputLen must be a multiple of 16.
+*/
+ .type intel_aes_decrypt_ecb_128,@function
+ .globl intel_aes_decrypt_ecb_128
+ .align 16
+intel_aes_decrypt_ecb_128:
+// leaq EXPANDED_KEY_OFFSET(%rdi), %rdi
+ leaq 48(%rdi), %rdi
+
+ movdqu (%rdi), %xmm2 /* key at offset 0, used by aesdeclast */
+ movdqu 160(%rdi), %xmm12 /* key at offset 160, XORed in first */
+ xorl %eax, %eax /* %rax = byte offset into input/output */
+// cmpq $8*16, %r9
+ cmpq $128, %r9
+ jb 1f
+// leaq -8*16(%r9), %r11
+ leaq -128(%r9), %r11 /* last offset at which 8 blocks still fit */
+2: movdqu (%r8, %rax), %xmm3
+ movdqu 16(%r8, %rax), %xmm4
+ movdqu 32(%r8, %rax), %xmm5
+ movdqu 48(%r8, %rax), %xmm6
+ movdqu 64(%r8, %rax), %xmm7
+ movdqu 80(%r8, %rax), %xmm8
+ movdqu 96(%r8, %rax), %xmm9
+ movdqu 112(%r8, %rax), %xmm10
+ pxor %xmm12, %xmm3
+ pxor %xmm12, %xmm4
+ pxor %xmm12, %xmm5
+ pxor %xmm12, %xmm6
+ pxor %xmm12, %xmm7
+ pxor %xmm12, %xmm8
+ pxor %xmm12, %xmm9
+ pxor %xmm12, %xmm10
+
+// complete loop unrolling
+/* keys at offsets 144 and 128 */
+ movdqu 144(%rdi), %xmm1
+ movdqu 128(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+/* keys at offsets 112 and 96 */
+ movdqu 112(%rdi), %xmm1
+ movdqu 96(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+/* keys at offsets 80 and 64 */
+ movdqu 80(%rdi), %xmm1
+ movdqu 64(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+/* keys at offsets 48 and 32 */
+ movdqu 48(%rdi), %xmm1
+ movdqu 32(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+/* key at offset 16, then the final round with the key at offset 0 */
+ movdqu 16(%rdi), %xmm1
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x0f,0x38,0xdf,0xda /* aesdeclast %xmm2, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdf,0xe2 /* aesdeclast %xmm2, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdf,0xea /* aesdeclast %xmm2, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdf,0xf2 /* aesdeclast %xmm2, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdf,0xfa /* aesdeclast %xmm2, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdf,0xc2 /* aesdeclast %xmm2, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdf,0xca /* aesdeclast %xmm2, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdf,0xd2 /* aesdeclast %xmm2, %xmm10 */
+
+ movdqu %xmm3, (%rsi, %rax)
+ movdqu %xmm4, 16(%rsi, %rax)
+ movdqu %xmm5, 32(%rsi, %rax)
+ movdqu %xmm6, 48(%rsi, %rax)
+ movdqu %xmm7, 64(%rsi, %rax)
+ movdqu %xmm8, 80(%rsi, %rax)
+ movdqu %xmm9, 96(%rsi, %rax)
+ movdqu %xmm10, 112(%rsi, %rax)
+// addq $8*16, %rax
+ addq $128, %rax
+ cmpq %r11, %rax
+ jbe 2b /* another full group of 8 blocks fits */
+1: cmpq %rax, %r9
+ je 5f /* nothing left */
+
+/* Fewer than 8 blocks remain: keep the keys at offsets 16..144 in
+ %xmm3..%xmm11 and decrypt one block per iteration, applying them
+ from %xmm11 down to %xmm3. */
+ movdqu 16(%rdi), %xmm3
+ movdqu 32(%rdi), %xmm4
+ movdqu 48(%rdi), %xmm5
+ movdqu 64(%rdi), %xmm6
+ movdqu 80(%rdi), %xmm7
+ movdqu 96(%rdi), %xmm8
+ movdqu 112(%rdi), %xmm9
+ movdqu 128(%rdi), %xmm10
+ movdqu 144(%rdi), %xmm11
+
+4: movdqu (%r8, %rax), %xmm1
+ pxor %xmm12, %xmm1
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xca /* aesdec %xmm10, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xc9 /* aesdec %xmm9, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xc8 /* aesdec %xmm8, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xcf /* aesdec %xmm7, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xce /* aesdec %xmm6, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xcd /* aesdec %xmm5, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xcc /* aesdec %xmm4, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xcb /* aesdec %xmm3, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdf,0xca /* aesdeclast %xmm2, %xmm1 */
+ movdqu %xmm1, (%rsi, %rax)
+ addq $16, %rax
+ cmpq %rax, %r9
+ jne 4b
+
+5: xor %eax, %eax /* return 0 */
+ ret
+ .size intel_aes_decrypt_ecb_128, .-intel_aes_decrypt_ecb_128
+
+
+/* intel_aes_encrypt_cbc_128: AES-128 CBC encryption of inputLen bytes,
+ one 16-byte block per iteration. The chaining value is read from
+ offset 16 of the context and the last ciphertext block is written
+ back there when the loop finishes.
+ in %rdi : cx - context (IV at offset 16, expanded key at offset 48)
+ in %rsi : output - pointer to output buffer
+ in %rdx : outputLen - pointer to variable for length of output
+ (filled by caller); the incoming value is not used and
+ the register is reused as the pointer to the IV
+ in %rcx : maxOutputLen - length of output buffer
+ in %r8 : input - pointer to input buffer
+ in %r9 : inputLen - length of input buffer
+ on stack: blocksize - AES blocksize (always 16, unused)
+ Returns 0 in %eax. When inputLen is 0 nothing is read or written.
+ The loop ends only when the offset equals inputLen exactly, so a
+ non-zero inputLen must be a multiple of 16.
+*/
+ .type intel_aes_encrypt_cbc_128,@function
+ .globl intel_aes_encrypt_cbc_128
+ .align 16
+intel_aes_encrypt_cbc_128:
+ testq %r9, %r9
+ je 2f /* empty input: leave the IV untouched */
+
+// leaq IV_OFFSET(%rdi), %rdx
+// leaq EXPANDED_KEY_OFFSET(%rdi), %rdi
+ leaq 16(%rdi), %rdx
+ leaq 48(%rdi), %rdi
+
+ movdqu (%rdx), %xmm0 /* chaining value (IV) */
+ movdqu (%rdi), %xmm2 /* round keys 0..10 in %xmm2..%xmm12 */
+ movdqu 16(%rdi), %xmm3
+ movdqu 32(%rdi), %xmm4
+ movdqu 48(%rdi), %xmm5
+ movdqu 64(%rdi), %xmm6
+ movdqu 80(%rdi), %xmm7
+ movdqu 96(%rdi), %xmm8
+ movdqu 112(%rdi), %xmm9
+ movdqu 128(%rdi), %xmm10
+ movdqu 144(%rdi), %xmm11
+ movdqu 160(%rdi), %xmm12
+
+ xorl %eax, %eax /* %rax = byte offset into input/output */
+1: movdqu (%r8, %rax), %xmm1
+ pxor %xmm0, %xmm1 /* chain with previous ciphertext block / IV */
+ pxor %xmm2, %xmm1 /* round key 0 */
+ .byte 0x66,0x0f,0x38,0xdc,0xcb /* aesenc %xmm3, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xcc /* aesenc %xmm4, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xcd /* aesenc %xmm5, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xce /* aesenc %xmm6, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xcf /* aesenc %xmm7, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xc8 /* aesenc %xmm8, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm9, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xca /* aesenc %xmm10, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xcb /* aesenc %xmm11, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xcc /* aesenclast %xmm12, %xmm1 */
+ movdqu %xmm1, (%rsi, %rax)
+ movdqa %xmm1, %xmm0 /* ciphertext becomes the next chaining value */
+ addq $16, %rax
+ cmpq %rax, %r9
+ jne 1b
+
+ movdqu %xmm0, (%rdx) /* save chaining value back into the context */
+
+2: xor %eax, %eax /* return 0 */
+ ret
+ .size intel_aes_encrypt_cbc_128, .-intel_aes_encrypt_cbc_128
+
+
+/* intel_aes_decrypt_cbc_128: AES-128 CBC decryption of inputLen bytes.
+ Input is handled eight blocks (128 bytes) at a time while at least
+ 128 bytes remain, then one 16-byte block at a time. The chaining
+ value is read from offset 16 of the context; the last ciphertext
+ block consumed (or the unchanged IV when inputLen is 0) is written
+ back there before returning.
+ in %rdi : cx - context (IV at offset 16, expanded key at offset 48)
+ in %rsi : output - pointer to output buffer
+ in %rdx : outputLen - pointer to variable for length of output
+ (filled by caller); the incoming value is not used and
+ the register is reused as the pointer to the IV
+ in %rcx : maxOutputLen - length of output buffer
+ in %r8 : input - pointer to input buffer
+ in %r9 : inputLen - length of input buffer
+ on stack: blocksize - AES blocksize (always 16, unused)
+ Returns 0 in %eax. Uses %xmm0-%xmm13.
+ The single-block loop ends only when the offset equals inputLen
+ exactly, so inputLen must be a multiple of 16.
+*/
+ .type intel_aes_decrypt_cbc_128,@function
+ .globl intel_aes_decrypt_cbc_128
+ .align 16
+intel_aes_decrypt_cbc_128:
+// leaq IV_OFFSET(%rdi), %rdx
+// leaq EXPANDED_KEY_OFFSET(%rdi), %rdi
+ leaq 16(%rdi), %rdx
+ leaq 48(%rdi), %rdi
+
+ movdqu (%rdx), %xmm0 /* iv */
+ movdqu (%rdi), %xmm2 /* first key block */
+ movdqu 160(%rdi), %xmm12 /* last key block */
+ xorl %eax, %eax /* %rax = byte offset into input/output */
+ cmpq $128, %r9
+ jb 1f
+ leaq -128(%r9), %r11 /* last offset at which 8 blocks still fit */
+2: movdqu (%r8, %rax), %xmm3 /* 1st data block */
+ movdqu 16(%r8, %rax), %xmm4 /* 2d data block */
+ movdqu 32(%r8, %rax), %xmm5
+ movdqu 48(%r8, %rax), %xmm6
+ movdqu 64(%r8, %rax), %xmm7
+ movdqu 80(%r8, %rax), %xmm8
+ movdqu 96(%r8, %rax), %xmm9
+ movdqu 112(%r8, %rax), %xmm10
+ pxor %xmm12, %xmm3
+ pxor %xmm12, %xmm4
+ pxor %xmm12, %xmm5
+ pxor %xmm12, %xmm6
+ pxor %xmm12, %xmm7
+ pxor %xmm12, %xmm8
+ pxor %xmm12, %xmm9
+ pxor %xmm12, %xmm10
+
+// complete loop unrolling
+/* keys at offsets 144 and 128 */
+ movdqu 144(%rdi), %xmm1
+ movdqu 128(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+/* keys at offsets 112 and 96 */
+ movdqu 112(%rdi), %xmm1
+ movdqu 96(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+/* keys at offsets 80 and 64 */
+ movdqu 80(%rdi), %xmm1
+ movdqu 64(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+/* keys at offsets 48 and 32 */
+ movdqu 48(%rdi), %xmm1
+ movdqu 32(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+/* key at offset 16, then the final round with the key at offset 0 */
+ movdqu 16(%rdi), %xmm1
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x0f,0x38,0xdf,0xda /* aesdeclast %xmm2, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdf,0xe2 /* aesdeclast %xmm2, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdf,0xea /* aesdeclast %xmm2, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdf,0xf2 /* aesdeclast %xmm2, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdf,0xfa /* aesdeclast %xmm2, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdf,0xc2 /* aesdeclast %xmm2, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdf,0xca /* aesdeclast %xmm2, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdf,0xd2 /* aesdeclast %xmm2, %xmm10 */
+
+/* CBC chaining: block i is XORed with ciphertext block i-1 (block 0
+ with the chaining value in %xmm0). Each ciphertext block is re-read
+ from the input buffer here, before any of the eight outputs is
+ stored; the eighth one stays in %xmm0 as the next chaining value. */
+ pxor %xmm0, %xmm3
+ movdqu (%r8, %rax), %xmm0
+ pxor %xmm0, %xmm4
+ movdqu 16(%r8, %rax), %xmm0
+ pxor %xmm0, %xmm5
+ movdqu 32(%r8, %rax), %xmm0
+ pxor %xmm0, %xmm6
+ movdqu 48(%r8, %rax), %xmm0
+ pxor %xmm0, %xmm7
+ movdqu 64(%r8, %rax), %xmm0
+ pxor %xmm0, %xmm8
+ movdqu 80(%r8, %rax), %xmm0
+ pxor %xmm0, %xmm9
+ movdqu 96(%r8, %rax), %xmm0
+ pxor %xmm0, %xmm10
+ movdqu 112(%r8, %rax), %xmm0
+ movdqu %xmm3, (%rsi, %rax)
+ movdqu %xmm4, 16(%rsi, %rax)
+ movdqu %xmm5, 32(%rsi, %rax)
+ movdqu %xmm6, 48(%rsi, %rax)
+ movdqu %xmm7, 64(%rsi, %rax)
+ movdqu %xmm8, 80(%rsi, %rax)
+ movdqu %xmm9, 96(%rsi, %rax)
+ movdqu %xmm10, 112(%rsi, %rax)
+ addq $128, %rax
+ cmpq %r11, %rax
+ jbe 2b /* another full group of 8 blocks fits */
+1: cmpq %rax, %r9
+ je 5f /* nothing left */
+
+/* Fewer than 8 blocks remain: keep the keys at offsets 16..144 in
+ %xmm3..%xmm11 and decrypt one block per iteration. */
+ movdqu 16(%rdi), %xmm3
+ movdqu 32(%rdi), %xmm4
+ movdqu 48(%rdi), %xmm5
+ movdqu 64(%rdi), %xmm6
+ movdqu 80(%rdi), %xmm7
+ movdqu 96(%rdi), %xmm8
+ movdqu 112(%rdi), %xmm9
+ movdqu 128(%rdi), %xmm10
+ movdqu 144(%rdi), %xmm11
+
+4: movdqu (%r8, %rax), %xmm1
+ movdqa %xmm1, %xmm13 /* keep the ciphertext for the next chaining step */
+ pxor %xmm12, %xmm1
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xca /* aesdec %xmm10, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xc9 /* aesdec %xmm9, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xc8 /* aesdec %xmm8, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xcf /* aesdec %xmm7, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xce /* aesdec %xmm6, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xcd /* aesdec %xmm5, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xcc /* aesdec %xmm4, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xcb /* aesdec %xmm3, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdf,0xca /* aesdeclast %xmm2, %xmm1 */
+ pxor %xmm0, %xmm1 /* undo chaining with previous ciphertext / IV */
+ movdqu %xmm1, (%rsi, %rax)
+ movdqa %xmm13, %xmm0
+ addq $16, %rax
+ cmpq %rax, %r9
+ jne 4b
+
+5: movdqu %xmm0, (%rdx) /* save chaining value back into the context */
+
+ xor %eax, %eax /* return 0 */
+ ret
+ .size intel_aes_decrypt_cbc_128, .-intel_aes_decrypt_cbc_128
+
+/* intel_aes_encrypt_init_192: start the encryption key schedule for a
+ 192-bit key.
+ in %rdi : the key (24 bytes are read)
+ in %rsi : buffer for expanded key
+ The 24 key bytes are copied to the start of the buffer and %rsi is
+ advanced past them; the rest of the schedule is produced by eight
+ aeskeygenassist + key_expansion192 steps.
+ NOTE(review): key_expansion192 is not defined in this part of the
+ file; presumably it consumes %xmm1/%xmm2/%xmm3, stores the new key
+ words at (%rsi) and advances %rsi - confirm against its definition.
+*/
+ .type intel_aes_encrypt_init_192,@function
+ .globl intel_aes_encrypt_init_192
+ .align 16
+intel_aes_encrypt_init_192:
+ movdqu (%rdi), %xmm1 /* key bytes 0..15 */
+ movq 16(%rdi), %xmm3 /* key bytes 16..23 in the low half of %xmm3 */
+ movdqu %xmm1, (%rsi)
+ movq %xmm3, 16(%rsi)
+ leaq 24(%rsi), %rsi /* %rsi -> first byte after the copied key */
+
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x01 /* aeskeygenassist $0x01, %xmm3, %xmm2 */
+ call key_expansion192
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x02 /* aeskeygenassist $0x02, %xmm3, %xmm2 */
+ call key_expansion192
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x04 /* aeskeygenassist $0x04, %xmm3, %xmm2 */
+ call key_expansion192
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x08 /* aeskeygenassist $0x08, %xmm3, %xmm2 */
+ call key_expansion192
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x10 /* aeskeygenassist $0x10, %xmm3, %xmm2 */
+ call key_expansion192
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x20 /* aeskeygenassist $0x20, %xmm3, %xmm2 */
+ call key_expansion192
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x40 /* aeskeygenassist $0x40, %xmm3, %xmm2 */
+ call key_expansion192
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x80 /* aeskeygenassist $0x80, %xmm3, %xmm2 */
+ call key_expansion192
+
+ ret
+ .size intel_aes_encrypt_init_192, .-intel_aes_encrypt_init_192
+
+
+/* intel_aes_decrypt_init_192: expand a 192-bit (24-byte) key into the
+ decryption key schedule.  The expansion steps are the same as in
+ intel_aes_encrypt_init_192, but aesimc is applied in place to the
+ 16-byte round keys at schedule offsets 16 .. 176 (round keys 1..11).
+ The round keys at offsets 0 and 192 are stored untransformed.
+
+ in %rdi : the key (24 bytes are read)
+ in %rsi : buffer for expanded key (216 bytes are written)
+
+ Because each key_expansion192 step emits 24 bytes, finished 16-byte
+ round keys become available alternately two at a time and one at a
+ time; the negative offsets from %rsi below select them.
+ Returns with %eax = 0 (cleared inside key_expansion192).
+ Modifies %rsi, %eax and %xmm1-%xmm5.
+*/
+ .type intel_aes_decrypt_init_192,@function
+ .globl intel_aes_decrypt_init_192
+ .align 16
+intel_aes_decrypt_init_192:
+ movdqu (%rdi), %xmm1 /* xmm1 = key bytes 0..15 */
+ movq 16(%rdi), %xmm3 /* low 64 bits of xmm3 = key bytes 16..23 */
+ movdqu %xmm1, (%rsi) /* round key 0 is stored as is */
+ movq %xmm3, 16(%rsi)
+ leaq 24(%rsi), %rsi /* rsi = next write position in the schedule */
+
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x01 /* aeskeygenassist $0x01, %xmm3, %xmm2 */
+ call key_expansion192 /* rsi = schedule + 48 afterwards */
+ movups -32(%rsi), %xmm2 /* round key 1 (schedule offset 16) */
+ movups -16(%rsi), %xmm4 /* round key 2 (schedule offset 32) */
+ .byte 0x66,0x0f,0x38,0xdb,0xd2 /* aesimc %xmm2, %xmm2 */
+ .byte 0x66,0x0f,0x38,0xdb,0xe4 /* aesimc %xmm4, %xmm4 */
+ movups %xmm2, -32(%rsi)
+ movups %xmm4, -16(%rsi)
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x02 /* aeskeygenassist $0x02, %xmm3, %xmm2 */
+ call key_expansion192 /* rsi = schedule + 72; xmm1 = round key 3 */
+ .byte 0x66,0x0f,0x38,0xdb,0xd1 /* aesimc %xmm1, %xmm2 */
+ movups %xmm2, -24(%rsi) /* overwrite round key 3 (offset 48); xmm1 keeps the untransformed value for the next step */
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x04 /* aeskeygenassist $0x04, %xmm3, %xmm2 */
+ call key_expansion192 /* rsi = schedule + 96 */
+ movups -32(%rsi), %xmm2 /* round key 4 (offset 64) */
+ movups -16(%rsi), %xmm4 /* round key 5 (offset 80) */
+ .byte 0x66,0x0f,0x38,0xdb,0xd2 /* aesimc %xmm2, %xmm2 */
+ .byte 0x66,0x0f,0x38,0xdb,0xe4 /* aesimc %xmm4, %xmm4 */
+ movups %xmm2, -32(%rsi)
+ movups %xmm4, -16(%rsi)
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x08 /* aeskeygenassist $0x08, %xmm3, %xmm2 */
+ call key_expansion192 /* rsi = schedule + 120; xmm1 = round key 6 */
+ .byte 0x66,0x0f,0x38,0xdb,0xd1 /* aesimc %xmm1, %xmm2 */
+ movups %xmm2, -24(%rsi) /* overwrite round key 6 (offset 96) */
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x10 /* aeskeygenassist $0x10, %xmm3, %xmm2 */
+ call key_expansion192 /* rsi = schedule + 144 */
+ movups -32(%rsi), %xmm2 /* round key 7 (offset 112) */
+ movups -16(%rsi), %xmm4 /* round key 8 (offset 128) */
+ .byte 0x66,0x0f,0x38,0xdb,0xd2 /* aesimc %xmm2, %xmm2 */
+ .byte 0x66,0x0f,0x38,0xdb,0xe4 /* aesimc %xmm4, %xmm4 */
+ movups %xmm2, -32(%rsi)
+ movups %xmm4, -16(%rsi)
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x20 /* aeskeygenassist $0x20, %xmm3, %xmm2 */
+ call key_expansion192 /* rsi = schedule + 168; xmm1 = round key 9 */
+ .byte 0x66,0x0f,0x38,0xdb,0xd1 /* aesimc %xmm1, %xmm2 */
+ movups %xmm2, -24(%rsi) /* overwrite round key 9 (offset 144) */
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x40 /* aeskeygenassist $0x40, %xmm3, %xmm2 */
+ call key_expansion192 /* rsi = schedule + 192 */
+ movups -32(%rsi), %xmm2 /* round key 10 (offset 160) */
+ movups -16(%rsi), %xmm4 /* round key 11 (offset 176) */
+ .byte 0x66,0x0f,0x38,0xdb,0xd2 /* aesimc %xmm2, %xmm2 */
+ .byte 0x66,0x0f,0x38,0xdb,0xe4 /* aesimc %xmm4, %xmm4 */
+ movups %xmm2, -32(%rsi)
+ movups %xmm4, -16(%rsi)
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x80 /* aeskeygenassist $0x80, %xmm3, %xmm2 */
+ call key_expansion192 /* round key 12 (offset 192) is left untransformed */
+
+ ret
+ .size intel_aes_decrypt_init_192, .-intel_aes_decrypt_init_192
+
+
+ .type key_expansion192,@function /* key_expansion192: file-local helper
+ (no .globl) that appends 24 bytes to the key schedule being built.
+
+ in %xmm1 : the 16 schedule bytes that precede the most recent 8
+ in %xmm3 : low 64 bits = the most recent 8 schedule bytes
+ in %xmm2 : aeskeygenassist result computed from %xmm3
+ in %rsi : write position in the schedule
+
+ out: %xmm1 and the low 64 bits of %xmm3 hold the 16 + 8 new bytes,
+ which are also stored at (%rsi); %rsi is advanced by 24; %eax = 0.
+ %xmm2, %xmm4 and %xmm5 are overwritten.
+
+ Dwords are numbered 0 (lowest) to 3 in the comments below. */
+ .align 16
+key_expansion192:
+ pshufd $0x55, %xmm2, %xmm2 /* t = dword 1 of the aeskeygenassist result, broadcast to all four dwords */
+ xor %eax, %eax
+ movd %eax, %xmm4 /* xmm4 = 0 */
+ shufps $0x10, %xmm1, %xmm4 /* xmm4 = (0, 0, x1, x0), x = dwords of xmm1 on entry */
+ pxor %xmm4, %xmm1 /* xmm1 = (x0, x1, x2^x1, x3^x0) */
+ shufps $0x8c, %xmm1, %xmm4 /* xmm4 = (0, x0, x0, x1^x2) */
+ pxor %xmm2, %xmm1
+ pxor %xmm4, %xmm1 /* xmm1 dword k = t ^ x0 ^ ... ^ xk */
+ movdqu %xmm1, (%rsi) /* emit the first 16 new bytes */
+ addq $16, %rsi
+
+ pshufd $0xff, %xmm1, %xmm4 /* n3 = dword 3 of the new xmm1, broadcast */
+ movd %eax, %xmm5 /* xmm5 = 0 (eax is still 0) */
+ shufps $0x00, %xmm3, %xmm5 /* xmm5 = (0, 0, y0, y0), y = dwords of xmm3 on entry */
+ shufps $0x08, %xmm3, %xmm5 /* xmm5 = (0, y0, y0, y0) */
+ pxor %xmm4, %xmm3
+ pxor %xmm5, %xmm3 /* xmm3 dword 0 = y0 ^ n3, dword 1 = y1 ^ y0 ^ n3 */
+ movq %xmm3, (%rsi) /* emit the remaining 8 new bytes (low half only) */
+ addq $8, %rsi
+ ret
+ .size key_expansion192, .-key_expansion192
+
+
+/* intel_aes_encrypt_ecb_192: ECB-encrypt inputLen bytes from input to
+ output using the 13 round keys stored at offset 48 of the context
+ (key offsets 0 .. 192).  Eight blocks are processed per iteration
+ while at least 128 bytes remain, then one block at a time.
+ Returns 0 in %eax.
+
+ in %rdi : cx - context
+ in %rsi : output - pointer to output buffer
+ in %rdx : outputLen - pointer to variable for length of output
+ (filled by caller)
+ in %rcx : maxOutputLen - length of output buffer
+ in %r8 : input - pointer to input buffer
+ in %r9 : inputLen - length of input buffer
+ on stack: blocksize - AES blocksize (always 16, unused)
+
+ %rdx and %rcx are not referenced.  The single-block loop exits only
+ when the byte offset equals inputLen, so inputLen must be a multiple
+ of 16.  Modifies %rdi, %rax, %r11 and %xmm1-%xmm14.
+*/
+ .type intel_aes_encrypt_ecb_192,@function
+ .globl intel_aes_encrypt_ecb_192
+ .align 16
+intel_aes_encrypt_ecb_192:
+// leaq EXPANDED_KEY_OFFSET(%rdi), %rdi
+ leaq 48(%rdi), %rdi /* rdi = start of the expanded key */
+
+ movdqu (%rdi), %xmm2 /* xmm2 = round key 0 */
+ movdqu 192(%rdi), %xmm14 /* xmm14 = round key 12 (used by aesenclast) */
+ xorl %eax, %eax /* rax = byte offset into input and output */
+// cmpq $8*16, %r9
+ cmpq $128, %r9
+ jb 1f /* fewer than 8 blocks: skip the unrolled loop */
+// leaq -8*16(%r9), %r11
+ leaq -128(%r9), %r11 /* r11 = largest offset at which 8 blocks still remain */
+2: movdqu (%r8, %rax), %xmm3 /* load 8 input blocks into xmm3 .. xmm10 */
+ movdqu 16(%r8, %rax), %xmm4
+ movdqu 32(%r8, %rax), %xmm5
+ movdqu 48(%r8, %rax), %xmm6
+ movdqu 64(%r8, %rax), %xmm7
+ movdqu 80(%r8, %rax), %xmm8
+ movdqu 96(%r8, %rax), %xmm9
+ movdqu 112(%r8, %rax), %xmm10
+ pxor %xmm2, %xmm3 /* XOR round key 0 into every block */
+ pxor %xmm2, %xmm4
+ pxor %xmm2, %xmm5
+ pxor %xmm2, %xmm6
+ pxor %xmm2, %xmm7
+ pxor %xmm2, %xmm8
+ pxor %xmm2, %xmm9
+ pxor %xmm2, %xmm10
+
+// complete loop unrolling
+ movdqu 16(%rdi), %xmm1 /* round keys 1 and 2 */
+ movdqu 32(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xdc,0xd9 /* aesenc %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe1 /* aesenc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe9 /* aesenc %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf1 /* aesenc %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf9 /* aesenc %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1 /* aesenc %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1 /* aesenc %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb /* aesenc %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3 /* aesenc %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb /* aesenc %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3 /* aesenc %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb /* aesenc %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3 /* aesenc %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb /* aesenc %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3 /* aesenc %xmm11, %xmm10 */
+
+ movdqu 48(%rdi), %xmm1 /* round keys 3 and 4 */
+ movdqu 64(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xdc,0xd9 /* aesenc %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe1 /* aesenc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe9 /* aesenc %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf1 /* aesenc %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf9 /* aesenc %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1 /* aesenc %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1 /* aesenc %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb /* aesenc %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3 /* aesenc %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb /* aesenc %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3 /* aesenc %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb /* aesenc %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3 /* aesenc %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb /* aesenc %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3 /* aesenc %xmm11, %xmm10 */
+
+ movdqu 80(%rdi), %xmm1 /* round keys 5 and 6 */
+ movdqu 96(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xdc,0xd9 /* aesenc %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe1 /* aesenc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe9 /* aesenc %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf1 /* aesenc %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf9 /* aesenc %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1 /* aesenc %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1 /* aesenc %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb /* aesenc %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3 /* aesenc %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb /* aesenc %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3 /* aesenc %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb /* aesenc %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3 /* aesenc %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb /* aesenc %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3 /* aesenc %xmm11, %xmm10 */
+
+ movdqu 112(%rdi), %xmm1 /* round keys 7 and 8 */
+ movdqu 128(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xdc,0xd9 /* aesenc %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe1 /* aesenc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe9 /* aesenc %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf1 /* aesenc %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf9 /* aesenc %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1 /* aesenc %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1 /* aesenc %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb /* aesenc %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3 /* aesenc %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb /* aesenc %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3 /* aesenc %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb /* aesenc %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3 /* aesenc %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb /* aesenc %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3 /* aesenc %xmm11, %xmm10 */
+
+ movdqu 144(%rdi), %xmm1 /* round keys 9 and 10 */
+ movdqu 160(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xdc,0xd9 /* aesenc %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe1 /* aesenc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe9 /* aesenc %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf1 /* aesenc %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf9 /* aesenc %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1 /* aesenc %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1 /* aesenc %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb /* aesenc %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3 /* aesenc %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb /* aesenc %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3 /* aesenc %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb /* aesenc %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3 /* aesenc %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb /* aesenc %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3 /* aesenc %xmm11, %xmm10 */
+
+ movdqu 176(%rdi), %xmm1 /* round key 11, followed by the final round with key 12 */
+ .byte 0x66,0x0f,0x38,0xdc,0xd9 /* aesenc %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe1 /* aesenc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe9 /* aesenc %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf1 /* aesenc %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf9 /* aesenc %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1 /* aesenc %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1 /* aesenc %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xde /* aesenclast %xmm14, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xe6 /* aesenclast %xmm14, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xee /* aesenclast %xmm14, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xf6 /* aesenclast %xmm14, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xfe /* aesenclast %xmm14, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xdd,0xc6 /* aesenclast %xmm14, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xdd,0xce /* aesenclast %xmm14, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xdd,0xd6 /* aesenclast %xmm14, %xmm10 */
+
+ movdqu %xmm3, (%rsi, %rax) /* store the 8 output blocks */
+ movdqu %xmm4, 16(%rsi, %rax)
+ movdqu %xmm5, 32(%rsi, %rax)
+ movdqu %xmm6, 48(%rsi, %rax)
+ movdqu %xmm7, 64(%rsi, %rax)
+ movdqu %xmm8, 80(%rsi, %rax)
+ movdqu %xmm9, 96(%rsi, %rax)
+ movdqu %xmm10, 112(%rsi, %rax)
+// addq $8*16, %rax
+ addq $128, %rax
+ cmpq %r11, %rax
+ jbe 2b /* repeat while another 8 blocks remain */
+1: cmpq %rax, %r9
+ je 5f /* nothing left: done */
+
+ movdqu 16(%rdi), %xmm3 /* keep round keys 1..11 in xmm3 .. xmm13 for the tail loop */
+ movdqu 32(%rdi), %xmm4
+ movdqu 48(%rdi), %xmm5
+ movdqu 64(%rdi), %xmm6
+ movdqu 80(%rdi), %xmm7
+ movdqu 96(%rdi), %xmm8
+ movdqu 112(%rdi), %xmm9
+ movdqu 128(%rdi), %xmm10
+ movdqu 144(%rdi), %xmm11
+ movdqu 160(%rdi), %xmm12
+ movdqu 176(%rdi), %xmm13
+
+4: movdqu (%r8, %rax), %xmm1 /* tail loop: one block per iteration */
+ pxor %xmm2, %xmm1
+ .byte 0x66,0x0f,0x38,0xdc,0xcb /* aesenc %xmm3, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xcc /* aesenc %xmm4, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xcd /* aesenc %xmm5, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xce /* aesenc %xmm6, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xcf /* aesenc %xmm7, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xc8 /* aesenc %xmm8, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm9, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xca /* aesenc %xmm10, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xcb /* aesenc %xmm11, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xcc /* aesenc %xmm12, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xcd /* aesenc %xmm13, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xce /* aesenclast %xmm14, %xmm1 */
+ movdqu %xmm1, (%rsi, %rax)
+ addq $16, %rax
+ cmpq %rax, %r9
+ jne 4b /* exits only on exact equality with inputLen */
+
+5: xor %eax, %eax /* return 0 */
+ ret
+ .size intel_aes_encrypt_ecb_192, .-intel_aes_encrypt_ecb_192
+
+
+/* intel_aes_decrypt_ecb_192: ECB-decrypt inputLen bytes from input to
+ output.  The 13 round keys at offset 48 of the context are applied in
+ reverse order: the key at offset 192 is XORed in first, aesdec uses
+ the keys at offsets 176 down to 16, and aesdeclast uses the key at
+ offset 0.  Eight blocks are processed per iteration while at least
+ 128 bytes remain, then one block at a time.  Returns 0 in %eax.
+
+ in %rdi : cx - context
+ in %rsi : output - pointer to output buffer
+ in %rdx : outputLen - pointer to variable for length of output
+ (filled by caller)
+ in %rcx : maxOutputLen - length of output buffer
+ in %r8 : input - pointer to input buffer
+ in %r9 : inputLen - length of input buffer
+ on stack: blocksize - AES blocksize (always 16, unused)
+
+ %rdx and %rcx are not referenced.  The single-block loop exits only
+ when the byte offset equals inputLen, so inputLen must be a multiple
+ of 16.  Modifies %rdi, %rax, %r11 and %xmm1-%xmm14.
+*/
+ .type intel_aes_decrypt_ecb_192,@function
+ .globl intel_aes_decrypt_ecb_192
+ .align 16
+intel_aes_decrypt_ecb_192:
+// leaq EXPANDED_KEY_OFFSET(%rdi), %rdi
+ leaq 48(%rdi), %rdi /* rdi = start of the expanded key */
+
+ movdqu (%rdi), %xmm2 /* xmm2 = round key 0 (used by aesdeclast) */
+ movdqu 192(%rdi), %xmm14 /* xmm14 = round key 12 (XORed in first) */
+ xorl %eax, %eax /* rax = byte offset into input and output */
+// cmpq $8*16, %r9
+ cmpq $128, %r9
+ jb 1f /* fewer than 8 blocks: skip the unrolled loop */
+// leaq -8*16(%r9), %r11
+ leaq -128(%r9), %r11 /* r11 = largest offset at which 8 blocks still remain */
+2: movdqu (%r8, %rax), %xmm3 /* load 8 input blocks into xmm3 .. xmm10 */
+ movdqu 16(%r8, %rax), %xmm4
+ movdqu 32(%r8, %rax), %xmm5
+ movdqu 48(%r8, %rax), %xmm6
+ movdqu 64(%r8, %rax), %xmm7
+ movdqu 80(%r8, %rax), %xmm8
+ movdqu 96(%r8, %rax), %xmm9
+ movdqu 112(%r8, %rax), %xmm10
+ pxor %xmm14, %xmm3 /* XOR round key 12 into every block */
+ pxor %xmm14, %xmm4
+ pxor %xmm14, %xmm5
+ pxor %xmm14, %xmm6
+ pxor %xmm14, %xmm7
+ pxor %xmm14, %xmm8
+ pxor %xmm14, %xmm9
+ pxor %xmm14, %xmm10
+
+// complete loop unrolling
+ movdqu 176(%rdi), %xmm1 /* round keys 11 and 10 */
+ movdqu 160(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+ movdqu 144(%rdi), %xmm1 /* round keys 9 and 8 */
+ movdqu 128(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+ movdqu 112(%rdi), %xmm1 /* round keys 7 and 6 */
+ movdqu 96(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+ movdqu 80(%rdi), %xmm1 /* round keys 5 and 4 */
+ movdqu 64(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+ movdqu 48(%rdi), %xmm1 /* round keys 3 and 2 */
+ movdqu 32(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+ movdqu 16(%rdi), %xmm1 /* round key 1, followed by the final round with key 0 */
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x0f,0x38,0xdf,0xda /* aesdeclast %xmm2, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdf,0xe2 /* aesdeclast %xmm2, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdf,0xea /* aesdeclast %xmm2, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdf,0xf2 /* aesdeclast %xmm2, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdf,0xfa /* aesdeclast %xmm2, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdf,0xc2 /* aesdeclast %xmm2, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdf,0xca /* aesdeclast %xmm2, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdf,0xd2 /* aesdeclast %xmm2, %xmm10 */
+
+ movdqu %xmm3, (%rsi, %rax) /* store the 8 output blocks */
+ movdqu %xmm4, 16(%rsi, %rax)
+ movdqu %xmm5, 32(%rsi, %rax)
+ movdqu %xmm6, 48(%rsi, %rax)
+ movdqu %xmm7, 64(%rsi, %rax)
+ movdqu %xmm8, 80(%rsi, %rax)
+ movdqu %xmm9, 96(%rsi, %rax)
+ movdqu %xmm10, 112(%rsi, %rax)
+// addq $8*16, %rax
+ addq $128, %rax
+ cmpq %r11, %rax
+ jbe 2b /* repeat while another 8 blocks remain */
+1: cmpq %rax, %r9
+ je 5f /* nothing left: done */
+
+ movdqu 16(%rdi), %xmm3 /* keep round keys 1..11 in xmm3 .. xmm13 for the tail loop */
+ movdqu 32(%rdi), %xmm4
+ movdqu 48(%rdi), %xmm5
+ movdqu 64(%rdi), %xmm6
+ movdqu 80(%rdi), %xmm7
+ movdqu 96(%rdi), %xmm8
+ movdqu 112(%rdi), %xmm9
+ movdqu 128(%rdi), %xmm10
+ movdqu 144(%rdi), %xmm11
+ movdqu 160(%rdi), %xmm12
+ movdqu 176(%rdi), %xmm13
+
+4: movdqu (%r8, %rax), %xmm1 /* tail loop: one block per iteration */
+ pxor %xmm14, %xmm1
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xcd /* aesdec %xmm13, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xcc /* aesdec %xmm12, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xca /* aesdec %xmm10, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xc9 /* aesdec %xmm9, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xc8 /* aesdec %xmm8, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xcf /* aesdec %xmm7, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xce /* aesdec %xmm6, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xcd /* aesdec %xmm5, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xcc /* aesdec %xmm4, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xcb /* aesdec %xmm3, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdf,0xca /* aesdeclast %xmm2, %xmm1 */
+ movdqu %xmm1, (%rsi, %rax)
+ addq $16, %rax
+ cmpq %rax, %r9
+ jne 4b /* exits only on exact equality with inputLen */
+
+5: xor %eax, %eax /* return 0 */
+ ret
+ .size intel_aes_decrypt_ecb_192, .-intel_aes_decrypt_ecb_192
+
+
+/* intel_aes_encrypt_cbc_192: CBC-encrypt inputLen bytes from input to
+ output, one 16-byte block per iteration.  The chaining value is read
+ from offset 16 of the context and the last ciphertext block is
+ written back there; the 13 round keys are read from offset 48.
+ Returns 0 in %eax.  When inputLen is 0 nothing is read or written.
+
+ in %rdi : cx - context
+ in %rsi : output - pointer to output buffer
+ in %rdx : outputLen - pointer to variable for length of output
+ (filled by caller)
+ in %rcx : maxOutputLen - length of output buffer
+ in %r8 : input - pointer to input buffer
+ in %r9 : inputLen - length of input buffer
+ on stack: blocksize - AES blocksize (always 16, unused)
+
+ The incoming %rdx value is discarded (the register is reused as the
+ IV pointer) and %rcx is not referenced.  The loop exits only when the
+ byte offset equals inputLen, so inputLen must be a multiple of 16.
+ Modifies %rdi, %rdx, %rax and %xmm0-%xmm14.
+*/
+ .type intel_aes_encrypt_cbc_192,@function
+ .globl intel_aes_encrypt_cbc_192
+ .align 16
+intel_aes_encrypt_cbc_192:
+ testq %r9, %r9
+ je 2f /* empty input: return 0 without touching the IV */
+
+// leaq IV_OFFSET(%rdi), %rdx
+// leaq EXPANDED_KEY_OFFSET(%rdi), %rdi
+ leaq 16(%rdi), %rdx /* rdx = address of the IV inside the context */
+ leaq 48(%rdi), %rdi /* rdi = start of the expanded key */
+
+ movdqu (%rdx), %xmm0 /* xmm0 = chaining value */
+ movdqu (%rdi), %xmm2 /* xmm2 .. xmm14 = round keys 0 .. 12 */
+ movdqu 16(%rdi), %xmm3
+ movdqu 32(%rdi), %xmm4
+ movdqu 48(%rdi), %xmm5
+ movdqu 64(%rdi), %xmm6
+ movdqu 80(%rdi), %xmm7
+ movdqu 96(%rdi), %xmm8
+ movdqu 112(%rdi), %xmm9
+ movdqu 128(%rdi), %xmm10
+ movdqu 144(%rdi), %xmm11
+ movdqu 160(%rdi), %xmm12
+ movdqu 176(%rdi), %xmm13
+ movdqu 192(%rdi), %xmm14
+
+ xorl %eax, %eax /* rax = byte offset into input and output */
+1: movdqu (%r8, %rax), %xmm1
+ pxor %xmm0, %xmm1 /* input block ^ chaining value */
+ pxor %xmm2, %xmm1 /* ^ round key 0 */
+ .byte 0x66,0x0f,0x38,0xdc,0xcb /* aesenc %xmm3, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xcc /* aesenc %xmm4, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xcd /* aesenc %xmm5, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xce /* aesenc %xmm6, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xcf /* aesenc %xmm7, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xc8 /* aesenc %xmm8, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm9, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xca /* aesenc %xmm10, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xcb /* aesenc %xmm11, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xcc /* aesenc %xmm12, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xcd /* aesenc %xmm13, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xce /* aesenclast %xmm14, %xmm1 */
+ movdqu %xmm1, (%rsi, %rax)
+ movdqa %xmm1, %xmm0 /* the ciphertext is the next chaining value */
+ addq $16, %rax
+ cmpq %rax, %r9
+ jne 1b /* exits only on exact equality with inputLen */
+
+ movdqu %xmm0, (%rdx) /* save the last ciphertext block as the new IV */
+
+2: xor %eax, %eax /* return 0 */
+ ret
+ .size intel_aes_encrypt_cbc_192, .-intel_aes_encrypt_cbc_192
+
+
+/* intel_aes_decrypt_cbc_192: CBC-decrypt inputLen bytes from input to
+ output.  The chaining value is read from offset 16 of the context and
+ the last ciphertext block is written back there; the 13 round keys at
+ offset 48 are applied in reverse order (offset 192 first, offset 0
+ with aesdeclast).  Eight blocks are processed per iteration while at
+ least 128 bytes remain, then one block at a time.  Returns 0 in %eax.
+
+ in %rdi : cx - context
+ in %rsi : output - pointer to output buffer
+ in %rdx : outputLen - pointer to variable for length of output
+ (filled by caller)
+ in %rcx : maxOutputLen - length of output buffer
+ in %r8 : input - pointer to input buffer
+ in %r9 : inputLen - length of input buffer
+ on stack: blocksize - AES blocksize (always 16, unused)
+
+ The incoming %rdx value is discarded (the register is reused as the
+ IV pointer) and %rcx is not referenced.  The single-block loop exits
+ only when the byte offset equals inputLen, so inputLen must be a
+ multiple of 16.  In each loop iteration every read of the input
+ precedes the stores to the output.
+ Modifies %rdi, %rdx, %rax, %r11 and %xmm0-%xmm15.
+*/
+ .type intel_aes_decrypt_cbc_192,@function
+ .globl intel_aes_decrypt_cbc_192
+ .align 16
+intel_aes_decrypt_cbc_192:
+ leaq 16(%rdi), %rdx /* rdx = address of the IV inside the context */
+ leaq 48(%rdi), %rdi /* rdi = start of the expanded key */
+
+ movdqu (%rdx), %xmm0 /* xmm0 = chaining value */
+ movdqu (%rdi), %xmm2 /* xmm2 = round key 0 (used by aesdeclast) */
+ movdqu 192(%rdi), %xmm14 /* xmm14 = round key 12 (XORed in first) */
+ xorl %eax, %eax /* rax = byte offset into input and output */
+ cmpq $128, %r9
+ jb 1f /* fewer than 8 blocks: skip the unrolled loop */
+ leaq -128(%r9), %r11 /* r11 = largest offset at which 8 blocks still remain */
+2: movdqu (%r8, %rax), %xmm3 /* load 8 input blocks into xmm3 .. xmm10 */
+ movdqu 16(%r8, %rax), %xmm4
+ movdqu 32(%r8, %rax), %xmm5
+ movdqu 48(%r8, %rax), %xmm6
+ movdqu 64(%r8, %rax), %xmm7
+ movdqu 80(%r8, %rax), %xmm8
+ movdqu 96(%r8, %rax), %xmm9
+ movdqu 112(%r8, %rax), %xmm10
+ pxor %xmm14, %xmm3 /* XOR round key 12 into every block */
+ pxor %xmm14, %xmm4
+ pxor %xmm14, %xmm5
+ pxor %xmm14, %xmm6
+ pxor %xmm14, %xmm7
+ pxor %xmm14, %xmm8
+ pxor %xmm14, %xmm9
+ pxor %xmm14, %xmm10
+
+// complete loop unrolling
+ movdqu 176(%rdi), %xmm1 /* round keys 11 and 10 */
+ movdqu 160(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+ movdqu 144(%rdi), %xmm1 /* round keys 9 and 8 */
+ movdqu 128(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+ movdqu 112(%rdi), %xmm1 /* round keys 7 and 6 */
+ movdqu 96(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+ movdqu 80(%rdi), %xmm1 /* round keys 5 and 4 */
+ movdqu 64(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+ movdqu 48(%rdi), %xmm1 /* round keys 3 and 2 */
+ movdqu 32(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+ movdqu 16(%rdi), %xmm1 /* round key 1, followed by the final round with key 0 */
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x0f,0x38,0xdf,0xda /* aesdeclast %xmm2, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdf,0xe2 /* aesdeclast %xmm2, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdf,0xea /* aesdeclast %xmm2, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdf,0xf2 /* aesdeclast %xmm2, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdf,0xfa /* aesdeclast %xmm2, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdf,0xc2 /* aesdeclast %xmm2, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdf,0xca /* aesdeclast %xmm2, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdf,0xd2 /* aesdeclast %xmm2, %xmm10 */
+
+ pxor %xmm0, %xmm3 /* block 0 ^= chaining value carried in xmm0 */
+ movdqu (%r8, %rax), %xmm0 /* blocks 1..7 ^= the preceding ciphertext block, re-read from input */
+ pxor %xmm0, %xmm4
+ movdqu 16(%r8, %rax), %xmm0
+ pxor %xmm0, %xmm5
+ movdqu 32(%r8, %rax), %xmm0
+ pxor %xmm0, %xmm6
+ movdqu 48(%r8, %rax), %xmm0
+ pxor %xmm0, %xmm7
+ movdqu 64(%r8, %rax), %xmm0
+ pxor %xmm0, %xmm8
+ movdqu 80(%r8, %rax), %xmm0
+ pxor %xmm0, %xmm9
+ movdqu 96(%r8, %rax), %xmm0
+ pxor %xmm0, %xmm10
+ movdqu 112(%r8, %rax), %xmm0 /* last ciphertext block of the group = next chaining value */
+ movdqu %xmm3, (%rsi, %rax) /* stores start only after all input reads of this iteration */
+ movdqu %xmm4, 16(%rsi, %rax)
+ movdqu %xmm5, 32(%rsi, %rax)
+ movdqu %xmm6, 48(%rsi, %rax)
+ movdqu %xmm7, 64(%rsi, %rax)
+ movdqu %xmm8, 80(%rsi, %rax)
+ movdqu %xmm9, 96(%rsi, %rax)
+ movdqu %xmm10, 112(%rsi, %rax)
+ addq $128, %rax
+ cmpq %r11, %rax
+ jbe 2b /* repeat while another 8 blocks remain */
+1: cmpq %rax, %r9
+ je 5f /* nothing left: write the IV back and return */
+
+ movdqu 16(%rdi), %xmm3 /* keep round keys 1..11 in xmm3 .. xmm13 for the tail loop */
+ movdqu 32(%rdi), %xmm4
+ movdqu 48(%rdi), %xmm5
+ movdqu 64(%rdi), %xmm6
+ movdqu 80(%rdi), %xmm7
+ movdqu 96(%rdi), %xmm8
+ movdqu 112(%rdi), %xmm9
+ movdqu 128(%rdi), %xmm10
+ movdqu 144(%rdi), %xmm11
+ movdqu 160(%rdi), %xmm12
+ movdqu 176(%rdi), %xmm13
+
+4: movdqu (%r8, %rax), %xmm1 /* tail loop: one block per iteration */
+ movdqa %xmm1, %xmm15 /* keep the ciphertext: it is the next chaining value */
+ pxor %xmm14, %xmm1
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xcd /* aesdec %xmm13, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xcc /* aesdec %xmm12, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xca /* aesdec %xmm10, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xc9 /* aesdec %xmm9, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xc8 /* aesdec %xmm8, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xcf /* aesdec %xmm7, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xce /* aesdec %xmm6, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xcd /* aesdec %xmm5, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xcc /* aesdec %xmm4, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xcb /* aesdec %xmm3, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdf,0xca /* aesdeclast %xmm2, %xmm1 */
+ pxor %xmm0, %xmm1 /* ^ chaining value gives the output block */
+ movdqu %xmm1, (%rsi, %rax)
+ movdqa %xmm15, %xmm0
+ addq $16, %rax
+ cmpq %rax, %r9
+ jne 4b /* exits only on exact equality with inputLen */
+
+5: movdqu %xmm0, (%rdx) /* save the final chaining value as the new IV */
+
+ xor %eax, %eax /* return 0 */
+ ret
+ .size intel_aes_decrypt_cbc_192, .-intel_aes_decrypt_cbc_192
+
+/* intel_aes_encrypt_init_256
+ *
+ * Expands a 32-byte key into fifteen consecutive 16-byte schedule
+ * entries (240 bytes) in the output buffer: the two raw key halves,
+ * twelve entries produced by six calls to key_expansion256, and one
+ * final entry computed inline.
+ *
+ * in %rdi : the key (32 bytes are read)
+ * in %rsi : buffer for expanded key (240 bytes are written)
+ *
+ * Modifies %rsi, %eax (left at 0 on return), %xmm1-%xmm4 and %xmm6.
+*/
+ .type intel_aes_encrypt_init_256,@function
+ .globl intel_aes_encrypt_init_256
+ .align 16
+intel_aes_encrypt_init_256:
+ /* Entries 0 and 1 are the two halves of the key, stored unchanged. */
+ movdqu (%rdi), %xmm1
+ movdqu 16(%rdi), %xmm3
+ movdqu %xmm1, (%rsi)
+ movdqu %xmm3, 16(%rsi)
+ leaq 32(%rsi), %rsi
+ /* key_expansion256 seeds %xmm6 from %eax, so %eax must stay 0. */
+ xor %eax, %eax
+
+ /* Each aeskeygenassist/call pair appends two entries (32 bytes) and
+    leaves the newest pair in %xmm1/%xmm3 for the next step.  The
+    immediate is the round constant, doubling on every step. */
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x01 /* aeskeygenassist $0x01, %xmm3, %xmm2 */
+ call key_expansion256
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x02 /* aeskeygenassist $0x02, %xmm3, %xmm2 */
+ call key_expansion256
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x04 /* aeskeygenassist $0x04, %xmm3, %xmm2 */
+ call key_expansion256
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x08 /* aeskeygenassist $0x08, %xmm3, %xmm2 */
+ call key_expansion256
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x10 /* aeskeygenassist $0x10, %xmm3, %xmm2 */
+ call key_expansion256
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x20 /* aeskeygenassist $0x20, %xmm3, %xmm2 */
+ call key_expansion256
+ /* Final entry (index 14): same computation as the first half of
+    key_expansion256, done inline because only one more 16-byte entry
+    is needed. */
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x40 /* aeskeygenassist $0x40, %xmm3, %xmm2 */
+ pxor %xmm6, %xmm6
+ pshufd $0xff, %xmm2, %xmm2
+ shufps $0x10, %xmm1, %xmm6
+ pxor %xmm6, %xmm1
+ shufps $0x8c, %xmm1, %xmm6
+ pxor %xmm2, %xmm1
+ pxor %xmm6, %xmm1
+ movdqu %xmm1, (%rsi)
+
+ ret
+ .size intel_aes_encrypt_init_256, .-intel_aes_encrypt_init_256
+
+
+/* intel_aes_decrypt_init_256
+ *
+ * Builds the same fifteen-entry schedule as the encrypt variant, but
+ * entries 1..13 are stored after being transformed with aesimc.
+ * Entry 0 and entry 14 are stored untransformed.  The running values
+ * in %xmm1/%xmm3 are never transformed, so each expansion step still
+ * continues from the untransformed entries.
+ *
+ * in %rdi : the key (32 bytes are read)
+ * in %rsi : buffer for expanded key (240 bytes are written)
+ *
+ * Modifies %rsi, %eax (left at 0 on return) and %xmm1-%xmm6.
+*/
+ .type intel_aes_decrypt_init_256,@function
+ .globl intel_aes_decrypt_init_256
+ .align 16
+intel_aes_decrypt_init_256:
+ /* Entry 0 is the first key half as-is; entry 1 is aesimc of the
+    second half. */
+ movdqu (%rdi), %xmm1
+ movdqu 16(%rdi), %xmm3
+ movdqu %xmm1, (%rsi)
+ .byte 0x66,0x0f,0x38,0xdb,0xe3 /* aesimc %xmm3, %xmm4 */
+ movdqu %xmm4, 16(%rsi)
+ leaq 32(%rsi), %rsi
+ /* key_expansion256 seeds %xmm6 from %eax, so %eax must stay 0. */
+ xor %eax, %eax
+
+ /* Pattern repeated six times: key_expansion256 writes two new entries
+    and advances %rsi by 32; the two entries just written, at
+    -32(%rsi) and -16(%rsi), are then overwritten with their aesimc
+    images. */
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x01 /* aeskeygenassist $0x01, %xmm3, %xmm2 */
+ call key_expansion256
+ .byte 0x66,0x0f,0x38,0xdb,0xe1 /* aesimc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdb,0xeb /* aesimc %xmm3, %xmm5 */
+ movdqu %xmm4, -32(%rsi)
+ movdqu %xmm5, -16(%rsi)
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x02 /* aeskeygenassist $0x02, %xmm3, %xmm2 */
+ call key_expansion256
+ .byte 0x66,0x0f,0x38,0xdb,0xe1 /* aesimc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdb,0xeb /* aesimc %xmm3, %xmm5 */
+ movdqu %xmm4, -32(%rsi)
+ movdqu %xmm5, -16(%rsi)
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x04 /* aeskeygenassist $0x04, %xmm3, %xmm2 */
+ call key_expansion256
+ .byte 0x66,0x0f,0x38,0xdb,0xe1 /* aesimc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdb,0xeb /* aesimc %xmm3, %xmm5 */
+ movdqu %xmm4, -32(%rsi)
+ movdqu %xmm5, -16(%rsi)
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x08 /* aeskeygenassist $0x08, %xmm3, %xmm2 */
+ call key_expansion256
+ .byte 0x66,0x0f,0x38,0xdb,0xe1 /* aesimc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdb,0xeb /* aesimc %xmm3, %xmm5 */
+ movdqu %xmm4, -32(%rsi)
+ movdqu %xmm5, -16(%rsi)
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x10 /* aeskeygenassist $0x10, %xmm3, %xmm2 */
+ call key_expansion256
+ .byte 0x66,0x0f,0x38,0xdb,0xe1 /* aesimc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdb,0xeb /* aesimc %xmm3, %xmm5 */
+ movdqu %xmm4, -32(%rsi)
+ movdqu %xmm5, -16(%rsi)
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x20 /* aeskeygenassist $0x20, %xmm3, %xmm2 */
+ call key_expansion256
+ .byte 0x66,0x0f,0x38,0xdb,0xe1 /* aesimc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdb,0xeb /* aesimc %xmm3, %xmm5 */
+ movdqu %xmm4, -32(%rsi)
+ movdqu %xmm5, -16(%rsi)
+ /* Final entry (index 14): computed inline and stored without aesimc. */
+ .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x40 /* aeskeygenassist $0x40, %xmm3, %xmm2 */
+ pxor %xmm6, %xmm6
+ pshufd $0xff, %xmm2, %xmm2
+ shufps $0x10, %xmm1, %xmm6
+ pxor %xmm6, %xmm1
+ shufps $0x8c, %xmm1, %xmm6
+ pxor %xmm2, %xmm1
+ pxor %xmm6, %xmm1
+ movdqu %xmm1, (%rsi)
+
+ ret
+ .size intel_aes_decrypt_init_256, .-intel_aes_decrypt_init_256
+
+
+/* key_expansion256 (file-local helper, no .globl)
+ *
+ * Produces the next two 16-byte schedule entries from the previous two
+ * and appends them at (%rsi).
+ *
+ * in %xmm1 : second-to-last entry written so far
+ * in %xmm3 : last entry written so far
+ * in %xmm2 : aeskeygenassist result computed from %xmm3 by the caller
+ * in %eax  : must be 0 (it is copied into %xmm6 as the zero seed)
+ * in %rsi  : where to store the two new entries
+ *
+ * out %xmm1, %xmm3 : the two new entries
+ * out %rsi         : advanced by 32
+ *
+ * Also modifies %xmm2, %xmm4 and %xmm6.
+ */
+ .type key_expansion256,@function
+ .align 16
+key_expansion256:
+ /* %xmm6 = 0.  movd zero-extends %eax into the whole register. */
+ movd %eax, %xmm6
+ /* Broadcast dword 3 of the aeskeygenassist result to all four lanes. */
+ pshufd $0xff, %xmm2, %xmm2
+ /* With dword 0 of %xmm6 equal to zero, the two shufps/pxor pairs turn
+    %xmm1 = [a0,a1,a2,a3] into the running XOR
+    [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3]. */
+ shufps $0x10, %xmm1, %xmm6
+ pxor %xmm6, %xmm1
+ shufps $0x8c, %xmm1, %xmm6
+ pxor %xmm2, %xmm1
+ pxor %xmm6, %xmm1
+ movdqu %xmm1, (%rsi)
+
+ addq $16, %rsi
+ /* Second entry: derived from the entry just produced, with round
+    constant 0 and dword 2 of the aeskeygenassist result broadcast. */
+ .byte 0x66,0x0f,0x3a,0xdf,0xe1,0x00 /* aeskeygenassist $0, %xmm1, %xmm4 */
+ pshufd $0xaa, %xmm4, %xmm4
+ /* %xmm6 is not cleared again: both shufps immediates above keep its
+    dword 0 in place, so that lane is still zero, and the first shufps
+    below rebuilds the other lanes from that lane and %xmm3. */
+ shufps $0x10, %xmm3, %xmm6
+ pxor %xmm6, %xmm3
+ shufps $0x8c, %xmm3, %xmm6
+ pxor %xmm4, %xmm3
+ pxor %xmm6, %xmm3
+ movdqu %xmm3, (%rsi)
+ addq $16, %rsi
+ ret
+ .size key_expansion256, .-key_expansion256
+
+
+/* intel_aes_encrypt_ecb_256
+ *
+ * Encrypts inputLen bytes from input to output, each 16-byte block
+ * independently, using the fifteen 16-byte round keys stored at
+ * offset 48 inside the context.  Groups of eight blocks are processed
+ * together while at least 128 bytes remain; the rest is handled one
+ * block at a time.
+ *
+ * in %rdi : cx - context
+ in %rsi : output - pointer to output buffer
+ in %rdx : outputLen - pointer to variable for length of output
+ (filled by caller)
+ in %rcx : maxOutputLen - length of output buffer
+ in %r8 : input - pointer to input buffer
+ in %r9 : inputLen - length of input buffer
+ on stack: blocksize - AES blocksize (always 16, unused)
+ *
+ * %rdx and %rcx are not referenced by this routine.
+ * The single-block loop exits only when the running offset equals
+ * %r9 exactly, so inputLen must be a multiple of 16.
+ *
+ * Returns 0 in %eax.
+ * Modifies %rdi, %rax, %r11 and %xmm1-%xmm15.
+*/
+ .type intel_aes_encrypt_ecb_256,@function
+ .globl intel_aes_encrypt_ecb_256
+ .align 16
+intel_aes_encrypt_ecb_256:
+// leaq EXPANDED_KEY_OFFSET(%rdi), %rdi
+ leaq 48(%rdi), %rdi
+
+ /* %xmm2 = first round key, %xmm15 = last round key, %rax = byte
+    offset into input/output. */
+ movdqu (%rdi), %xmm2
+ movdqu 224(%rdi), %xmm15
+ xorl %eax, %eax
+// cmpq $8*16, %r9
+ cmpq $128, %r9
+ jb 1f
+ /* %r11 = highest offset at which a full 8-block group can start. */
+// leaq -8*16(%r9), %r11
+ leaq -128(%r9), %r11
+ /* 8-block loop: load eight blocks and XOR each with the first key. */
+2: movdqu (%r8, %rax), %xmm3
+ movdqu 16(%r8, %rax), %xmm4
+ movdqu 32(%r8, %rax), %xmm5
+ movdqu 48(%r8, %rax), %xmm6
+ movdqu 64(%r8, %rax), %xmm7
+ movdqu 80(%r8, %rax), %xmm8
+ movdqu 96(%r8, %rax), %xmm9
+ movdqu 112(%r8, %rax), %xmm10
+ pxor %xmm2, %xmm3
+ pxor %xmm2, %xmm4
+ pxor %xmm2, %xmm5
+ pxor %xmm2, %xmm6
+ pxor %xmm2, %xmm7
+ pxor %xmm2, %xmm8
+ pxor %xmm2, %xmm9
+ pxor %xmm2, %xmm10
+
+// complete loop unrolling
+ /* Round keys are fetched two at a time into %xmm1/%xmm11 and each is
+    applied to all eight blocks.  Keys at offsets 16 and 32: */
+ movdqu 16(%rdi), %xmm1
+ movdqu 32(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xdc,0xd9 /* aesenc %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe1 /* aesenc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe9 /* aesenc %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf1 /* aesenc %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf9 /* aesenc %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1 /* aesenc %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1 /* aesenc %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb /* aesenc %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3 /* aesenc %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb /* aesenc %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3 /* aesenc %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb /* aesenc %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3 /* aesenc %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb /* aesenc %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3 /* aesenc %xmm11, %xmm10 */
+
+ /* Keys at offsets 48 and 64. */
+ movdqu 48(%rdi), %xmm1
+ movdqu 64(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xdc,0xd9 /* aesenc %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe1 /* aesenc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe9 /* aesenc %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf1 /* aesenc %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf9 /* aesenc %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1 /* aesenc %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1 /* aesenc %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb /* aesenc %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3 /* aesenc %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb /* aesenc %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3 /* aesenc %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb /* aesenc %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3 /* aesenc %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb /* aesenc %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3 /* aesenc %xmm11, %xmm10 */
+
+ /* Keys at offsets 80 and 96. */
+ movdqu 80(%rdi), %xmm1
+ movdqu 96(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xdc,0xd9 /* aesenc %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe1 /* aesenc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe9 /* aesenc %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf1 /* aesenc %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf9 /* aesenc %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1 /* aesenc %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1 /* aesenc %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb /* aesenc %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3 /* aesenc %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb /* aesenc %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3 /* aesenc %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb /* aesenc %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3 /* aesenc %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb /* aesenc %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3 /* aesenc %xmm11, %xmm10 */
+
+ /* Keys at offsets 112 and 128. */
+ movdqu 112(%rdi), %xmm1
+ movdqu 128(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xdc,0xd9 /* aesenc %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe1 /* aesenc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe9 /* aesenc %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf1 /* aesenc %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf9 /* aesenc %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1 /* aesenc %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1 /* aesenc %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb /* aesenc %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3 /* aesenc %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb /* aesenc %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3 /* aesenc %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb /* aesenc %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3 /* aesenc %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb /* aesenc %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3 /* aesenc %xmm11, %xmm10 */
+
+ /* Keys at offsets 144 and 160. */
+ movdqu 144(%rdi), %xmm1
+ movdqu 160(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xdc,0xd9 /* aesenc %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe1 /* aesenc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe9 /* aesenc %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf1 /* aesenc %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf9 /* aesenc %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1 /* aesenc %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1 /* aesenc %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb /* aesenc %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3 /* aesenc %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb /* aesenc %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3 /* aesenc %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb /* aesenc %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3 /* aesenc %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb /* aesenc %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3 /* aesenc %xmm11, %xmm10 */
+
+ /* Keys at offsets 176 and 192. */
+ movdqu 176(%rdi), %xmm1
+ movdqu 192(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xdc,0xd9 /* aesenc %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe1 /* aesenc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe9 /* aesenc %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf1 /* aesenc %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf9 /* aesenc %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1 /* aesenc %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1 /* aesenc %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb /* aesenc %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3 /* aesenc %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb /* aesenc %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3 /* aesenc %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb /* aesenc %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3 /* aesenc %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb /* aesenc %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3 /* aesenc %xmm11, %xmm10 */
+
+ /* Key at offset 208, then the final round with the key held in
+    %xmm15 (offset 224). */
+ movdqu 208(%rdi), %xmm1
+ .byte 0x66,0x0f,0x38,0xdc,0xd9 /* aesenc %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe1 /* aesenc %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdc,0xe9 /* aesenc %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf1 /* aesenc %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdc,0xf9 /* aesenc %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1 /* aesenc %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1 /* aesenc %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xdf /* aesenclast %xmm15, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xe7 /* aesenclast %xmm15, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xef /* aesenclast %xmm15, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xf7 /* aesenclast %xmm15, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xff /* aesenclast %xmm15, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xdd,0xc7 /* aesenclast %xmm15, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xdd,0xcf /* aesenclast %xmm15, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xdd,0xd7 /* aesenclast %xmm15, %xmm10 */
+
+ /* Store the eight result blocks and advance by 128 bytes. */
+ movdqu %xmm3, (%rsi, %rax)
+ movdqu %xmm4, 16(%rsi, %rax)
+ movdqu %xmm5, 32(%rsi, %rax)
+ movdqu %xmm6, 48(%rsi, %rax)
+ movdqu %xmm7, 64(%rsi, %rax)
+ movdqu %xmm8, 80(%rsi, %rax)
+ movdqu %xmm9, 96(%rsi, %rax)
+ movdqu %xmm10, 112(%rsi, %rax)
+// addq $8*16, %rax
+ addq $128, %rax
+ cmpq %r11, %rax
+ jbe 2b
+ /* Done if everything has been consumed (also covers inputLen == 0). */
+1: cmpq %rax, %r9
+ je 5f
+
+ /* Single-block path: keep the round keys resident in registers.
+    The key at offset 112 is not preloaded here; %xmm8 alternates
+    between the key at offset 0 and the key at offset 112 inside the
+    loop below. */
+ movdqu (%rdi), %xmm8
+ movdqu 16(%rdi), %xmm2
+ movdqu 32(%rdi), %xmm3
+ movdqu 48(%rdi), %xmm4
+ movdqu 64(%rdi), %xmm5
+ movdqu 80(%rdi), %xmm6
+ movdqu 96(%rdi), %xmm7
+ movdqu 128(%rdi), %xmm9
+ movdqu 144(%rdi), %xmm10
+ movdqu 160(%rdi), %xmm11
+ movdqu 176(%rdi), %xmm12
+ movdqu 192(%rdi), %xmm13
+ movdqu 208(%rdi), %xmm14
+
+4: movdqu (%r8, %rax), %xmm1
+ pxor %xmm8, %xmm1
+ /* %xmm8 now takes the key at offset 112 ... */
+ movdqu 112(%rdi), %xmm8
+ .byte 0x66,0x0f,0x38,0xdc,0xca /* aesenc %xmm2, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xcb /* aesenc %xmm3, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xcc /* aesenc %xmm4, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xcd /* aesenc %xmm5, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xce /* aesenc %xmm6, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xcf /* aesenc %xmm7, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xc8 /* aesenc %xmm8, %xmm1 */
+ /* ... and is restored to the key at offset 0 for the next block. */
+ movdqu (%rdi), %xmm8
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm9, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xca /* aesenc %xmm10, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xcb /* aesenc %xmm11, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xcc /* aesenc %xmm12, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xcd /* aesenc %xmm13, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xce /* aesenc %xmm14, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xcf /* aesenclast %xmm15, %xmm1 */
+ movdqu %xmm1, (%rsi, %rax)
+ addq $16, %rax
+ cmpq %rax, %r9
+ jne 4b
+
+ /* Return 0. */
+5: xor %eax, %eax
+ ret
+ .size intel_aes_encrypt_ecb_256, .-intel_aes_encrypt_ecb_256
+
+
+/* intel_aes_decrypt_ecb_256
+ *
+ * Decrypts inputLen bytes from input to output, each 16-byte block
+ * independently.  The fifteen 16-byte round keys at offset 48 inside
+ * the context are applied in descending offset order: 224 first, then
+ * 208 down to 16 with aesdec, and offset 0 with aesdeclast.
+ * NOTE(review): presumably the schedule is the one written by
+ * intel_aes_decrypt_init_256 - confirm against the caller.
+ *
+ * in %rdi : cx - context
+ in %rsi : output - pointer to output buffer
+ in %rdx : outputLen - pointer to variable for length of output
+ (filled by caller)
+ in %rcx : maxOutputLen - length of output buffer
+ in %r8 : input - pointer to input buffer
+ in %r9 : inputLen - length of input buffer
+ on stack: blocksize - AES blocksize (always 16, unused)
+ *
+ * %rdx and %rcx are not referenced by this routine.
+ * The single-block loop exits only when the running offset equals
+ * %r9 exactly, so inputLen must be a multiple of 16.
+ *
+ * Returns 0 in %eax.
+ * Modifies %rdi, %rax, %r11 and %xmm1-%xmm15.
+*/
+ .type intel_aes_decrypt_ecb_256,@function
+ .globl intel_aes_decrypt_ecb_256
+ .align 16
+intel_aes_decrypt_ecb_256:
+// leaq EXPANDED_KEY_OFFSET(%rdi), %rdi
+ leaq 48(%rdi), %rdi
+
+ /* %xmm2 = key at offset 0 (used last), %xmm15 = key at offset 224
+    (used first), %rax = byte offset into input/output. */
+ movdqu (%rdi), %xmm2
+ movdqu 224(%rdi), %xmm15
+ xorl %eax, %eax
+// cmpq $8*16, %r9
+ cmpq $128, %r9
+ jb 1f
+ /* %r11 = highest offset at which a full 8-block group can start. */
+// leaq -8*16(%r9), %r11
+ leaq -128(%r9), %r11
+ /* 8-block loop: load eight blocks and XOR each with the key at
+    offset 224. */
+2: movdqu (%r8, %rax), %xmm3
+ movdqu 16(%r8, %rax), %xmm4
+ movdqu 32(%r8, %rax), %xmm5
+ movdqu 48(%r8, %rax), %xmm6
+ movdqu 64(%r8, %rax), %xmm7
+ movdqu 80(%r8, %rax), %xmm8
+ movdqu 96(%r8, %rax), %xmm9
+ movdqu 112(%r8, %rax), %xmm10
+ pxor %xmm15, %xmm3
+ pxor %xmm15, %xmm4
+ pxor %xmm15, %xmm5
+ pxor %xmm15, %xmm6
+ pxor %xmm15, %xmm7
+ pxor %xmm15, %xmm8
+ pxor %xmm15, %xmm9
+ pxor %xmm15, %xmm10
+
+// complete loop unrolling
+ /* Round keys are fetched two at a time into %xmm1/%xmm11 and each is
+    applied to all eight blocks.  Keys at offsets 208 and 192: */
+ movdqu 208(%rdi), %xmm1
+ movdqu 192(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+ /* Keys at offsets 176 and 160. */
+ movdqu 176(%rdi), %xmm1
+ movdqu 160(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+ /* Keys at offsets 144 and 128. */
+ movdqu 144(%rdi), %xmm1
+ movdqu 128(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+ /* Keys at offsets 112 and 96. */
+ movdqu 112(%rdi), %xmm1
+ movdqu 96(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+ /* Keys at offsets 80 and 64. */
+ movdqu 80(%rdi), %xmm1
+ movdqu 64(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+ /* Keys at offsets 48 and 32. */
+ movdqu 48(%rdi), %xmm1
+ movdqu 32(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+ /* Key at offset 16, then the final round with the key held in
+    %xmm2 (offset 0). */
+ movdqu 16(%rdi), %xmm1
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x0f,0x38,0xdf,0xda /* aesdeclast %xmm2, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdf,0xe2 /* aesdeclast %xmm2, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdf,0xea /* aesdeclast %xmm2, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdf,0xf2 /* aesdeclast %xmm2, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdf,0xfa /* aesdeclast %xmm2, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdf,0xc2 /* aesdeclast %xmm2, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdf,0xca /* aesdeclast %xmm2, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdf,0xd2 /* aesdeclast %xmm2, %xmm10 */
+
+ /* Store the eight result blocks and advance by 128 bytes. */
+ movdqu %xmm3, (%rsi, %rax)
+ movdqu %xmm4, 16(%rsi, %rax)
+ movdqu %xmm5, 32(%rsi, %rax)
+ movdqu %xmm6, 48(%rsi, %rax)
+ movdqu %xmm7, 64(%rsi, %rax)
+ movdqu %xmm8, 80(%rsi, %rax)
+ movdqu %xmm9, 96(%rsi, %rax)
+ movdqu %xmm10, 112(%rsi, %rax)
+// addq $8*16, %rax
+ addq $128, %rax
+ cmpq %r11, %rax
+ jbe 2b
+ /* Done if everything has been consumed (also covers inputLen == 0). */
+1: cmpq %rax, %r9
+ je 5f
+
+ /* Single-block path: keys at offsets 16..208 live in %xmm2-%xmm14
+    and %xmm15 still holds offset 224.  The key at offset 0 has no
+    register of its own; it is swapped through %xmm8 inside the loop. */
+ movdqu 16(%rdi), %xmm2
+ movdqu 32(%rdi), %xmm3
+ movdqu 48(%rdi), %xmm4
+ movdqu 64(%rdi), %xmm5
+ movdqu 80(%rdi), %xmm6
+ movdqu 96(%rdi), %xmm7
+ movdqu 112(%rdi), %xmm8
+ movdqu 128(%rdi), %xmm9
+ movdqu 144(%rdi), %xmm10
+ movdqu 160(%rdi), %xmm11
+ movdqu 176(%rdi), %xmm12
+ movdqu 192(%rdi), %xmm13
+ movdqu 208(%rdi), %xmm14
+
+4: movdqu (%r8, %rax), %xmm1
+ pxor %xmm15, %xmm1
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xce /* aesdec %xmm14, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xcd /* aesdec %xmm13, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xcc /* aesdec %xmm12, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xca /* aesdec %xmm10, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xc9 /* aesdec %xmm9, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xc8 /* aesdec %xmm8, %xmm1 */
+ /* The key at offset 112 has been used; reuse %xmm8 for offset 0. */
+ movdqu (%rdi), %xmm8
+ .byte 0x66,0x0f,0x38,0xde,0xcf /* aesdec %xmm7, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xce /* aesdec %xmm6, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xcd /* aesdec %xmm5, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xcc /* aesdec %xmm4, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xcb /* aesdec %xmm3, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xca /* aesdec %xmm2, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdf,0xc8 /* aesdeclast %xmm8, %xmm1 */
+ /* Put the key at offset 112 back in %xmm8 for the next block. */
+ movdqu 112(%rdi), %xmm8
+ movdqu %xmm1, (%rsi, %rax)
+ addq $16, %rax
+ cmpq %rax, %r9
+ jne 4b
+
+ /* Return 0. */
+5: xor %eax, %eax
+ ret
+ .size intel_aes_decrypt_ecb_256, .-intel_aes_decrypt_ecb_256
+
+
+/* intel_aes_encrypt_cbc_256
+ *
+ * Encrypts inputLen bytes one 16-byte block at a time with chaining:
+ * each input block is XORed with the previous output block (initially
+ * the 16 bytes at offset 16 of the context) before the rounds are
+ * applied.  The last output block is written back to offset 16 of the
+ * context before returning.  Round keys are read from offset 48 of
+ * the context.
+ *
+ * in %rdi : cx - context
+ in %rsi : output - pointer to output buffer
+ in %rdx : outputLen - pointer to variable for length of output
+ (filled by caller)
+ in %rcx : maxOutputLen - length of output buffer
+ in %r8 : input - pointer to input buffer
+ in %r9 : inputLen - length of input buffer
+ on stack: blocksize - AES blocksize (always 16, unused)
+ *
+ * The incoming %rdx value is ignored and overwritten; %rcx is not
+ * referenced.  If inputLen is 0 nothing is read or written.
+ * The loop exits only when the running offset equals %r9 exactly, so
+ * inputLen must be a multiple of 16.
+ *
+ * Returns 0 in %eax.
+ * Modifies %rdi, %rdx, %rax and %xmm0-%xmm15.
+*/
+ .type intel_aes_encrypt_cbc_256,@function
+ .globl intel_aes_encrypt_cbc_256
+ .align 16
+intel_aes_encrypt_cbc_256:
+ /* Nothing to do for empty input; leave the chaining value alone. */
+ testq %r9, %r9
+ je 2f
+
+// leaq IV_OFFSET(%rdi), %rdx
+// leaq EXPANDED_KEY_OFFSET(%rdi), %rdi
+ leaq 16(%rdi), %rdx
+ leaq 48(%rdi), %rdi
+
+ /* %xmm0 = chaining value.  Round keys are kept in registers; the key
+    at offset 112 is not preloaded because %xmm8 alternates between the
+    key at offset 0 and the key at offset 112 inside the loop. */
+ movdqu (%rdx), %xmm0
+ movdqu (%rdi), %xmm8
+ movdqu 16(%rdi), %xmm2
+ movdqu 32(%rdi), %xmm3
+ movdqu 48(%rdi), %xmm4
+ movdqu 64(%rdi), %xmm5
+ movdqu 80(%rdi), %xmm6
+ movdqu 96(%rdi), %xmm7
+ movdqu 128(%rdi), %xmm9
+ movdqu 144(%rdi), %xmm10
+ movdqu 160(%rdi), %xmm11
+ movdqu 176(%rdi), %xmm12
+ movdqu 192(%rdi), %xmm13
+ movdqu 208(%rdi), %xmm14
+ movdqu 224(%rdi), %xmm15
+
+ /* %rax = byte offset into input/output. */
+ xorl %eax, %eax
+1: movdqu (%r8, %rax), %xmm1
+ /* XOR in the chaining value, then the first round key. */
+ pxor %xmm0, %xmm1
+ pxor %xmm8, %xmm1
+ /* %xmm8 now takes the key at offset 112 ... */
+ movdqu 112(%rdi), %xmm8
+ .byte 0x66,0x0f,0x38,0xdc,0xca /* aesenc %xmm2, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xcb /* aesenc %xmm3, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xcc /* aesenc %xmm4, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xcd /* aesenc %xmm5, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xce /* aesenc %xmm6, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xdc,0xcf /* aesenc %xmm7, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xc8 /* aesenc %xmm8, %xmm1 */
+ /* ... and is restored to the key at offset 0 for the next block. */
+ movdqu (%rdi), %xmm8
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xc9 /* aesenc %xmm9, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xca /* aesenc %xmm10, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xcb /* aesenc %xmm11, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xcc /* aesenc %xmm12, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xcd /* aesenc %xmm13, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xce /* aesenc %xmm14, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xcf /* aesenclast %xmm15, %xmm1 */
+ movdqu %xmm1, (%rsi, %rax)
+ /* The block just produced becomes the next chaining value. */
+ movdqa %xmm1, %xmm0
+ addq $16, %rax
+ cmpq %rax, %r9
+ jne 1b
+
+ /* Save the final chaining value back into the context. */
+ movdqu %xmm0, (%rdx)
+
+ /* Return 0. */
+2: xor %eax, %eax
+ ret
+ .size intel_aes_encrypt_cbc_256, .-intel_aes_encrypt_cbc_256
+
+
+/* in %rdi : cx - context
+ in %rsi : output - pointer to output buffer
+ in %rdx : outputLen - pointer to variable for length of output
+ (filled by caller)
+ in %rcx : maxOutputLen - length of output buffer
+ in %r8 : input - pointer to input buffer
+ in %r9 : inputLen - length of input buffer
+ on stack: blocksize - AES blocksize (always 16, unused)
+*/
+ .type intel_aes_decrypt_cbc_256,@function
+ .globl intel_aes_decrypt_cbc_256
+ .align 16
+intel_aes_decrypt_cbc_256:
+// leaq IV_OFFSET(%rdi), %rdx
+// leaq EXPANDED_KEY_OFFSET(%rdi), %rdi
+ leaq 16(%rdi), %rdx
+ leaq 48(%rdi), %rdi
+
+ movdqu (%rdx), %xmm0
+ movdqu (%rdi), %xmm2
+ movdqu 224(%rdi), %xmm15
+ xorl %eax, %eax
+// cmpq $8*16, %r9
+ cmpq $128, %r9
+ jb 1f
+// leaq -8*16(%r9), %r11
+ leaq -128(%r9), %r11
+2: movdqu (%r8, %rax), %xmm3
+ movdqu 16(%r8, %rax), %xmm4
+ movdqu 32(%r8, %rax), %xmm5
+ movdqu 48(%r8, %rax), %xmm6
+ movdqu 64(%r8, %rax), %xmm7
+ movdqu 80(%r8, %rax), %xmm8
+ movdqu 96(%r8, %rax), %xmm9
+ movdqu 112(%r8, %rax), %xmm10
+ pxor %xmm15, %xmm3
+ pxor %xmm15, %xmm4
+ pxor %xmm15, %xmm5
+ pxor %xmm15, %xmm6
+ pxor %xmm15, %xmm7
+ pxor %xmm15, %xmm8
+ pxor %xmm15, %xmm9
+ pxor %xmm15, %xmm10
+
+// complete loop unrolling
+ movdqu 208(%rdi), %xmm1
+ movdqu 192(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+ movdqu 176(%rdi), %xmm1
+ movdqu 160(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+ movdqu 144(%rdi), %xmm1
+ movdqu 128(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+ movdqu 112(%rdi), %xmm1
+ movdqu 96(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+ movdqu 80(%rdi), %xmm1
+ movdqu 64(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+ movdqu 48(%rdi), %xmm1
+ movdqu 32(%rdi), %xmm11
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xdb /* aesdec %xmm11, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xe3 /* aesdec %xmm11, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xeb /* aesdec %xmm11, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xf3 /* aesdec %xmm11, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xfb /* aesdec %xmm11, %xmm7 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xc3 /* aesdec %xmm11, %xmm8 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm9 */
+ .byte 0x66,0x45,0x0f,0x38,0xde,0xd3 /* aesdec %xmm11, %xmm10 */
+
+ movdqu 16(%rdi), %xmm1
+ .byte 0x66,0x0f,0x38,0xde,0xd9 /* aesdec %xmm1, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xde,0xe1 /* aesdec %xmm1, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xde,0xe9 /* aesdec %xmm1, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xde,0xf1 /* aesdec %xmm1, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xde,0xf9 /* aesdec %xmm1, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc1 /* aesdec %xmm1, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xc9 /* aesdec %xmm1, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xde,0xd1 /* aesdec %xmm1, %xmm10 */
+ .byte 0x66,0x0f,0x38,0xdf,0xda /* aesdeclast %xmm2, %xmm3 */
+ .byte 0x66,0x0f,0x38,0xdf,0xe2 /* aesdeclast %xmm2, %xmm4 */
+ .byte 0x66,0x0f,0x38,0xdf,0xea /* aesdeclast %xmm2, %xmm5 */
+ .byte 0x66,0x0f,0x38,0xdf,0xf2 /* aesdeclast %xmm2, %xmm6 */
+ .byte 0x66,0x0f,0x38,0xdf,0xfa /* aesdeclast %xmm2, %xmm7 */
+ .byte 0x66,0x44,0x0f,0x38,0xdf,0xc2 /* aesdeclast %xmm2, %xmm8 */
+ .byte 0x66,0x44,0x0f,0x38,0xdf,0xca /* aesdeclast %xmm2, %xmm9 */
+ .byte 0x66,0x44,0x0f,0x38,0xdf,0xd2 /* aesdeclast %xmm2, %xmm10 */
+
+ pxor %xmm0, %xmm3
+ movdqu (%r8, %rax), %xmm0
+ pxor %xmm0, %xmm4
+ movdqu 16(%r8, %rax), %xmm0
+ pxor %xmm0, %xmm5
+ movdqu 32(%r8, %rax), %xmm0
+ pxor %xmm0, %xmm6
+ movdqu 48(%r8, %rax), %xmm0
+ pxor %xmm0, %xmm7
+ movdqu 64(%r8, %rax), %xmm0
+ pxor %xmm0, %xmm8
+ movdqu 80(%r8, %rax), %xmm0
+ pxor %xmm0, %xmm9
+ movdqu 96(%r8, %rax), %xmm0
+ pxor %xmm0, %xmm10
+ movdqu 112(%r8, %rax), %xmm0
+ movdqu %xmm3, (%rsi, %rax)
+ movdqu %xmm4, 16(%rsi, %rax)
+ movdqu %xmm5, 32(%rsi, %rax)
+ movdqu %xmm6, 48(%rsi, %rax)
+ movdqu %xmm7, 64(%rsi, %rax)
+ movdqu %xmm8, 80(%rsi, %rax)
+ movdqu %xmm9, 96(%rsi, %rax)
+ movdqu %xmm10, 112(%rsi, %rax)
+/* advance the byte offset by the 8 blocks just written: 8 * 16 = 128 bytes */
+ addq $128, %rax
+ cmpq %r11, %rax
+ jbe 2b
+1: cmpq %rax, %r9
+ je 5f
+
+ movdqu 16(%rdi), %xmm2
+ movdqu 32(%rdi), %xmm3
+ movdqu 48(%rdi), %xmm4
+ movdqu 64(%rdi), %xmm5
+ movdqu 80(%rdi), %xmm6
+ movdqu 96(%rdi), %xmm7
+ movdqu 112(%rdi), %xmm8
+ movdqu 128(%rdi), %xmm9
+ movdqu 144(%rdi), %xmm10
+ movdqu 160(%rdi), %xmm11
+ movdqu 176(%rdi), %xmm12
+ movdqu 192(%rdi), %xmm13
+ movdqu 208(%rdi), %xmm14
+
+4: movdqu (%r8, %rax), %xmm1
+ pxor %xmm15, %xmm1
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xce /* aesdec %xmm14, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xcd /* aesdec %xmm13, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xcc /* aesdec %xmm12, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xcb /* aesdec %xmm11, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xca /* aesdec %xmm10, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xc9 /* aesdec %xmm9, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xde,0xc8 /* aesdec %xmm8, %xmm1 */
+ movdqu (%rdi), %xmm8
+ .byte 0x66,0x0f,0x38,0xde,0xcf /* aesdec %xmm7, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xce /* aesdec %xmm6, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xcd /* aesdec %xmm5, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xcc /* aesdec %xmm4, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xcb /* aesdec %xmm3, %xmm1 */
+ .byte 0x66,0x0f,0x38,0xde,0xca /* aesdec %xmm2, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdf,0xc8 /* aesdeclast %xmm8, %xmm1 */
+ movdqu 112(%rdi), %xmm8
+ pxor %xmm0, %xmm1
+ movdqu (%r8, %rax), %xmm0 /* fetch the IV before we store the block */
+ movdqu %xmm1, (%rsi, %rax) /* in case input buf = output buf */
+ addq $16, %rax
+ cmpq %rax, %r9
+ jne 4b
+
+5: movdqu %xmm0, (%rdx)
+
+ xor %eax, %eax
+ ret
+ .size intel_aes_decrypt_cbc_256, .-intel_aes_decrypt_cbc_256