diff options
Diffstat (limited to 'nss/lib/freebl/intel-aes.s')
-rw-r--r-- | nss/lib/freebl/intel-aes.s | 284 |
1 files changed, 155 insertions, 129 deletions
diff --git a/nss/lib/freebl/intel-aes.s b/nss/lib/freebl/intel-aes.s index a83529a..2dfcfa1 100644 --- a/nss/lib/freebl/intel-aes.s +++ b/nss/lib/freebl/intel-aes.s @@ -7,6 +7,20 @@ #define IV_OFFSET 16 #define EXPANDED_KEY_OFFSET 48 +/* + * Warning: the length values used in this module are "unsigned int" + * in C, which is 32-bit. When they're passed in registers, use only + * the low 32 bits, because the top half is unspecified. + * + * This is called from C code, so the contents of those bits can + * depend on the C compiler's optimization decisions. This means that + * mistakes might not be obvious in testing if those bits happen to be + * zero in your build. + * + * Exception: 32-bit lea instructions use a 64-bit address because the + * address size doesn't affect the result, and that form is more + * compactly encoded and preferred by compilers over a 32-bit address. + */ /* in %rdi : the key in %rsi : buffer for expanded key @@ -119,10 +133,11 @@ key_expansion128: /* in %rdi : cx - context in %rsi : output - pointer to output buffer in %rdx : outputLen - pointer to variable for length of output - (filled by caller) - in %rcx : maxOutputLen - length of output buffer + (already filled in by caller) + in %ecx : maxOutputLen - length of output buffer + (already checked by caller) in %r8 : input - pointer to input buffer - in %r9 : inputLen - length of input buffer + in %r9d : inputLen - length of input buffer on stack: blocksize - AES blocksize (always 16, unused) */ .type intel_aes_encrypt_ecb_128,@function @@ -135,11 +150,11 @@ intel_aes_encrypt_ecb_128: movdqu (%rdi), %xmm2 movdqu 160(%rdi), %xmm12 xor %eax, %eax -// cmpq $8*16, %r9 - cmpq $128, %r9 +// cmpl $8*16, %r9d + cmpl $128, %r9d jb 1f -// leaq -8*16(%r9), %r11 - leaq -128(%r9), %r11 +// leal -8*16(%r9), %r11d + leal -128(%r9), %r11d 2: movdqu (%r8, %rax), %xmm3 movdqu 16(%r8, %rax), %xmm4 movdqu 32(%r8, %rax), %xmm5 @@ -260,11 +275,11 @@ intel_aes_encrypt_ecb_128: movdqu %xmm8, 80(%rsi, %rax) movdqu %xmm9, 96(%rsi, %rax) movdqu %xmm10, 112(%rsi, %rax) -// addq $8*16, %rax - addq $128, %rax - cmpq %r11, %rax +// addl $8*16, %eax + addl $128, %eax + cmpl %r11d, %eax jbe 2b -1: cmpq %rax, %r9 +1: cmpl %eax, %r9d je 5f movdqu 16(%rdi), %xmm3 @@ -290,8 +305,8 @@ intel_aes_encrypt_ecb_128: .byte 0x66,0x41,0x0f,0x38,0xdc,0xcb /* aesenc %xmm11, %xmm1 */ .byte 0x66,0x41,0x0f,0x38,0xdd,0xcc /* aesenclast %xmm12, %xmm1 */ movdqu %xmm1, (%rsi, %rax) - addq $16, %rax - cmpq %rax, %r9 + addl $16, %eax + cmpl %eax, %r9d jne 4b 5: xor %eax, %eax @@ -302,10 +317,11 @@ intel_aes_encrypt_ecb_128: /* in %rdi : cx - context in %rsi : output - pointer to output buffer in %rdx : outputLen - pointer to variable for length of output - (filled by caller) - in %rcx : maxOutputLen - length of output buffer + (already filled in by caller) + in %ecx : maxOutputLen - length of output buffer + (already checked by caller) in %r8 : input - pointer to input buffer - in %r9 : inputLen - length of input buffer + in %r9d : inputLen - length of input buffer on stack: blocksize - AES blocksize (always 16, unused) */ .type intel_aes_decrypt_ecb_128,@function @@ -318,11 +334,11 @@ intel_aes_decrypt_ecb_128: movdqu (%rdi), %xmm2 movdqu 160(%rdi), %xmm12 xorl %eax, %eax -// cmpq $8*16, %r9 - cmpq $128, %r9 +// cmpl $8*16, %r9d + cmpl $128, %r9d jb 1f -// leaq -8*16(%r9), %r11 - leaq -128(%r9), %r11 +// leal -8*16(%r9), %r11d + leal -128(%r9), %r11d 2: movdqu (%r8, %rax), %xmm3 movdqu 16(%r8, %rax), %xmm4 movdqu 32(%r8, %rax), %xmm5 @@ -443,11 +459,11 @@ intel_aes_decrypt_ecb_128: movdqu %xmm8, 80(%rsi, %rax) movdqu %xmm9, 96(%rsi, %rax) movdqu %xmm10, 112(%rsi, %rax) -// addq $8*16, %rax - addq $128, %rax - cmpq %r11, %rax +// addl $8*16, %eax + addl $128, %eax + cmpl %r11d, %eax jbe 2b -1: cmpq %rax, %r9 +1: cmpl %eax, %r9d je 5f movdqu 16(%rdi), %xmm3 @@ -473,8 +489,8 @@ intel_aes_decrypt_ecb_128: .byte 0x66,0x0f,0x38,0xde,0xcb /* aesdec %xmm7, %xmm1 */ .byte 0x66,0x0f,0x38,0xdf,0xca /* aesdeclast %xmm2, %xmm1 */ movdqu %xmm1, (%rsi, %rax) - addq $16, %rax - cmpq %rax, %r9 + addl $16, %eax + cmpl %eax, %r9d jne 4b 5: xor %eax, %eax @@ -485,17 +501,18 @@ intel_aes_decrypt_ecb_128: /* in %rdi : cx - context in %rsi : output - pointer to output buffer in %rdx : outputLen - pointer to variable for length of output - (filled by caller) - in %rcx : maxOutputLen - length of output buffer + (already filled in by caller) + in %ecx : maxOutputLen - length of output buffer + (already checked by caller) in %r8 : input - pointer to input buffer - in %r9 : inputLen - length of input buffer + in %r9d : inputLen - length of input buffer on stack: blocksize - AES blocksize (always 16, unused) */ .type intel_aes_encrypt_cbc_128,@function .globl intel_aes_encrypt_cbc_128 .align 16 intel_aes_encrypt_cbc_128: - testq %r9, %r9 + testl %r9d, %r9d je 2f // leaq IV_OFFSET(%rdi), %rdx @@ -532,8 +549,8 @@ intel_aes_encrypt_cbc_128: .byte 0x66,0x41,0x0f,0x38,0xdd,0xcc /* aesenclast %xmm12, %xmm1 */ movdqu %xmm1, (%rsi, %rax) movdqa %xmm1, %xmm0 - addq $16, %rax - cmpq %rax, %r9 + addl $16, %eax + cmpl %eax, %r9d jne 1b movdqu %xmm0, (%rdx) @@ -546,10 +563,11 @@ intel_aes_encrypt_cbc_128: /* in %rdi : cx - context in %rsi : output - pointer to output buffer in %rdx : outputLen - pointer to variable for length of output - (filled by caller) - in %rcx : maxOutputLen - length of output buffer + (already filled in by caller) + in %ecx : maxOutputLen - length of output buffer + (already checked by caller) in %r8 : input - pointer to input buffer - in %r9 : inputLen - length of input buffer + in %r9d : inputLen - length of input buffer on stack: blocksize - AES blocksize (always 16, unused) */ .type intel_aes_decrypt_cbc_128,@function @@ -565,9 +583,9 @@ intel_aes_decrypt_cbc_128: movdqu (%rdi), %xmm2 /* first key block */ movdqu 160(%rdi), %xmm12 /* last key block */ xorl %eax, %eax - cmpq $128, %r9 + cmpl $128, %r9d jb 1f - leaq -128(%r9), %r11 + leal -128(%r9), %r11d 2: movdqu (%r8, %rax), %xmm3 /* 1st data block */ movdqu 16(%r8, %rax), %xmm4 /* 2d data block */ movdqu 32(%r8, %rax), %xmm5 @@ -704,10 +722,10 @@ intel_aes_decrypt_cbc_128: movdqu %xmm8, 80(%rsi, %rax) movdqu %xmm9, 96(%rsi, %rax) movdqu %xmm10, 112(%rsi, %rax) - addq $128, %rax - cmpq %r11, %rax + addl $128, %eax + cmpl %r11d, %eax jbe 2b -1: cmpq %rax, %r9 +1: cmpl %eax, %r9d je 5f movdqu 16(%rdi), %xmm3 @@ -736,8 +754,8 @@ intel_aes_decrypt_cbc_128: pxor %xmm0, %xmm1 movdqu %xmm1, (%rsi, %rax) movdqa %xmm13, %xmm0 - addq $16, %rax - cmpq %rax, %r9 + addl $16, %eax + cmpl %eax, %r9d jne 4b 5: movdqu %xmm0, (%rdx) @@ -873,10 +891,11 @@ key_expansion192: /* in %rdi : cx - context in %rsi : output - pointer to output buffer in %rdx : outputLen - pointer to variable for length of output - (filled by caller) - in %rcx : maxOutputLen - length of output buffer + (already filled in by caller) + in %ecx : maxOutputLen - length of output buffer + (already checked by caller) in %r8 : input - pointer to input buffer - in %r9 : inputLen - length of input buffer + in %r9d : inputLen - length of input buffer on stack: blocksize - AES blocksize (always 16, unused) */ .type intel_aes_encrypt_ecb_192,@function @@ -889,11 +908,11 @@ intel_aes_encrypt_ecb_192: movdqu (%rdi), %xmm2 movdqu 192(%rdi), %xmm14 xorl %eax, %eax -// cmpq $8*16, %r9 - cmpq $128, %r9 +// cmpl $8*16, %r9d + cmpl $128, %r9d jb 1f -// leaq -8*16(%r9), %r11 - leaq -128(%r9), %r11 +// leal -8*16(%r9), %r11d + leal -128(%r9), %r11d 2: movdqu (%r8, %rax), %xmm3 movdqu 16(%r8, %rax), %xmm4 movdqu 32(%r8, %rax), %xmm5 @@ -1033,11 +1052,11 @@ intel_aes_encrypt_ecb_192: movdqu %xmm8, 80(%rsi, %rax) movdqu %xmm9, 96(%rsi, %rax) movdqu %xmm10, 112(%rsi, %rax) -// addq $8*16, %rax - addq $128, %rax - cmpq %r11, %rax +// addl $8*16, %eax + addl $128, %eax + cmpl %r11d, %eax jbe 2b -1: cmpq %rax, %r9 +1: cmpl %eax, %r9d je 5f movdqu 16(%rdi), %xmm3 @@ -1067,8 +1086,8 @@ intel_aes_encrypt_ecb_192: .byte 0x66,0x41,0x0f,0x38,0xdc,0xcd /* aesenc %xmm13, %xmm1 */ .byte 0x66,0x41,0x0f,0x38,0xdd,0xce /* aesenclast %xmm14, %xmm1 */ movdqu %xmm1, (%rsi, %rax) - addq $16, %rax - cmpq %rax, %r9 + addl $16, %eax + cmpl %eax, %r9d jne 4b 5: xor %eax, %eax @@ -1079,10 +1098,11 @@ intel_aes_encrypt_ecb_192: /* in %rdi : cx - context in %rsi : output - pointer to output buffer in %rdx : outputLen - pointer to variable for length of output - (filled by caller) - in %rcx : maxOutputLen - length of output buffer + (already filled in by caller) + in %ecx : maxOutputLen - length of output buffer + (already checked by caller) in %r8 : input - pointer to input buffer - in %r9 : inputLen - length of input buffer + in %r9d : inputLen - length of input buffer on stack: blocksize - AES blocksize (always 16, unused) */ .type intel_aes_decrypt_ecb_192,@function @@ -1095,11 +1115,11 @@ intel_aes_decrypt_ecb_192: movdqu (%rdi), %xmm2 movdqu 192(%rdi), %xmm14 xorl %eax, %eax -// cmpq $8*16, %r9 - cmpq $128, %r9 +// cmpl $8*16, %r9d + cmpl $128, %r9d jb 1f -// leaq -8*16(%r9), %r11 - leaq -128(%r9), %r11 +// leal -8*16(%r9), %r11d + leal -128(%r9), %r11d 2: movdqu (%r8, %rax), %xmm3 movdqu 16(%r8, %rax), %xmm4 movdqu 32(%r8, %rax), %xmm5 @@ -1239,11 +1259,11 @@ intel_aes_decrypt_ecb_192: movdqu %xmm8, 80(%rsi, %rax) movdqu %xmm9, 96(%rsi, %rax) movdqu %xmm10, 112(%rsi, %rax) -// addq $8*16, %rax - addq $128, %rax - cmpq %r11, %rax +// addl $8*16, %eax + addl $128, %eax + cmpl %r11d, %eax jbe 2b -1: cmpq %rax, %r9 +1: cmpl %eax, %r9d je 5f movdqu 16(%rdi), %xmm3 @@ -1273,8 +1293,8 @@ intel_aes_decrypt_ecb_192: .byte 0x66,0x0f,0x38,0xde,0xcb /* aesdec %xmm3, %xmm1 */ .byte 0x66,0x0f,0x38,0xdf,0xca /* aesdeclast %xmm2, %xmm1 */ movdqu %xmm1, (%rsi, %rax) - addq $16, %rax - cmpq %rax, %r9 + addl $16, %eax + cmpl %eax, %r9d jne 4b 5: xor %eax, %eax @@ -1285,17 +1305,18 @@ intel_aes_decrypt_ecb_192: /* in %rdi : cx - context in %rsi : output - pointer to output buffer in %rdx : outputLen - pointer to variable for length of output - (filled by caller) - in %rcx : maxOutputLen - length of output buffer + (already filled in by caller) + in %ecx : maxOutputLen - length of output buffer + (already checked by caller) in %r8 : input - pointer to input buffer - in %r9 : inputLen - length of input buffer + in %r9d : inputLen - length of input buffer on stack: blocksize - AES blocksize (always 16, unused) */ .type intel_aes_encrypt_cbc_192,@function .globl intel_aes_encrypt_cbc_192 .align 16 intel_aes_encrypt_cbc_192: - testq %r9, %r9 + testl %r9d, %r9d je 2f // leaq IV_OFFSET(%rdi), %rdx @@ -1336,8 +1357,8 @@ intel_aes_encrypt_cbc_192: .byte 0x66,0x41,0x0f,0x38,0xdd,0xce /* aesenclast %xmm14, %xmm1 */ movdqu %xmm1, (%rsi, %rax) movdqa %xmm1, %xmm0 - addq $16, %rax - cmpq %rax, %r9 + addl $16, %eax + cmpl %eax, %r9d jne 1b movdqu %xmm0, (%rdx) @@ -1350,10 +1371,11 @@ intel_aes_encrypt_cbc_192: /* in %rdi : cx - context in %rsi : output - pointer to output buffer in %rdx : outputLen - pointer to variable for length of output - (filled by caller) - in %rcx : maxOutputLen - length of output buffer + (already filled in by caller) + in %exx : maxOutputLen - length of output buffer + (already checked by caller) in %r8 : input - pointer to input buffer - in %r9 : inputLen - length of input buffer + in %r9d : inputLen - length of input buffer on stack: blocksize - AES blocksize (always 16, unused) */ .type intel_aes_decrypt_cbc_192,@function @@ -1367,9 +1389,9 @@ intel_aes_decrypt_cbc_192: movdqu (%rdi), %xmm2 movdqu 192(%rdi), %xmm14 xorl %eax, %eax - cmpq $128, %r9 + cmpl $128, %r9d jb 1f - leaq -128(%r9), %r11 + leal -128(%r9), %r11d 2: movdqu (%r8, %rax), %xmm3 movdqu 16(%r8, %rax), %xmm4 movdqu 32(%r8, %rax), %xmm5 @@ -1525,10 +1547,10 @@ intel_aes_decrypt_cbc_192: movdqu %xmm8, 80(%rsi, %rax) movdqu %xmm9, 96(%rsi, %rax) movdqu %xmm10, 112(%rsi, %rax) - addq $128, %rax - cmpq %r11, %rax + addl $128, %eax + cmpl %r11d, %eax jbe 2b -1: cmpq %rax, %r9 +1: cmpl %eax, %r9d je 5f movdqu 16(%rdi), %xmm3 @@ -1561,8 +1583,8 @@ intel_aes_decrypt_cbc_192: pxor %xmm0, %xmm1 movdqu %xmm1, (%rsi, %rax) movdqa %xmm15, %xmm0 - addq $16, %rax - cmpq %rax, %r9 + addl $16, %eax + cmpl %eax, %r9d jne 4b 5: movdqu %xmm0, (%rdx) @@ -1705,10 +1727,11 @@ key_expansion256: /* in %rdi : cx - context in %rsi : output - pointer to output buffer in %rdx : outputLen - pointer to variable for length of output - (filled by caller) - in %rcx : maxOutputLen - length of output buffer + (already filled in by caller) + in %ecx : maxOutputLen - length of output buffer + (already checked by caller) in %r8 : input - pointer to input buffer - in %r9 : inputLen - length of input buffer + in %r9d : inputLen - length of input buffer on stack: blocksize - AES blocksize (always 16, unused) */ .type intel_aes_encrypt_ecb_256,@function @@ -1721,11 +1744,11 @@ intel_aes_encrypt_ecb_256: movdqu (%rdi), %xmm2 movdqu 224(%rdi), %xmm15 xorl %eax, %eax -// cmpq $8*16, %r9 - cmpq $128, %r9 +// cmpl $8*16, %r9d + cmpl $128, %r9d jb 1f -// leaq -8*16(%r9), %r11 - leaq -128(%r9), %r11 +// leal -8*16(%r9), %r11d + leal -128(%r9), %r11d 2: movdqu (%r8, %rax), %xmm3 movdqu 16(%r8, %rax), %xmm4 movdqu 32(%r8, %rax), %xmm5 @@ -1884,11 +1907,11 @@ intel_aes_encrypt_ecb_256: movdqu %xmm8, 80(%rsi, %rax) movdqu %xmm9, 96(%rsi, %rax) movdqu %xmm10, 112(%rsi, %rax) -// addq $8*16, %rax - addq $128, %rax - cmpq %r11, %rax +// addl $8*16, %eax + addl $128, %eax + cmpl %r11d, %eax jbe 2b -1: cmpq %rax, %r9 +1: cmpl %eax, %r9d je 5f movdqu (%rdi), %xmm8 @@ -1924,8 +1947,8 @@ intel_aes_encrypt_ecb_256: .byte 0x66,0x41,0x0f,0x38,0xdc,0xce /* aesenc %xmm14, %xmm1 */ .byte 0x66,0x41,0x0f,0x38,0xdd,0xcf /* aesenclast %xmm15, %xmm1 */ movdqu %xmm1, (%rsi, %rax) - addq $16, %rax - cmpq %rax, %r9 + addl $16, %eax + cmpl %eax, %r9d jne 4b 5: xor %eax, %eax @@ -1936,10 +1959,11 @@ intel_aes_encrypt_ecb_256: /* in %rdi : cx - context in %rsi : output - pointer to output buffer in %rdx : outputLen - pointer to variable for length of output - (filled by caller) - in %rcx : maxOutputLen - length of output buffer + (already filled in by caller) + in %ecx : maxOutputLen - length of output buffer + (already checked by caller) in %r8 : input - pointer to input buffer - in %r9 : inputLen - length of input buffer + in %r9d : inputLen - length of input buffer on stack: blocksize - AES blocksize (always 16, unused) */ .type intel_aes_decrypt_ecb_256,@function @@ -1952,11 +1976,11 @@ intel_aes_decrypt_ecb_256: movdqu (%rdi), %xmm2 movdqu 224(%rdi), %xmm15 xorl %eax, %eax -// cmpq $8*16, %r9 - cmpq $128, %r9 +// cmpl $8*16, %r9d + cmpl $128, %r9d jb 1f -// leaq -8*16(%r9), %r11 - leaq -128(%r9), %r11 +// leal -8*16(%r9), %r11d + leal -128(%r9), %r11d 2: movdqu (%r8, %rax), %xmm3 movdqu 16(%r8, %rax), %xmm4 movdqu 32(%r8, %rax), %xmm5 @@ -2115,11 +2139,11 @@ intel_aes_decrypt_ecb_256: movdqu %xmm8, 80(%rsi, %rax) movdqu %xmm9, 96(%rsi, %rax) movdqu %xmm10, 112(%rsi, %rax) -// addq $8*16, %rax - addq $128, %rax - cmpq %r11, %rax +// addl $8*16, %eax + addl $128, %eax + cmpl %r11d, %eax jbe 2b -1: cmpq %rax, %r9 +1: cmpl %eax, %r9d je 5f movdqu 16(%rdi), %xmm2 @@ -2155,8 +2179,8 @@ intel_aes_decrypt_ecb_256: .byte 0x66,0x41,0x0f,0x38,0xdf,0xc8 /* aesdeclast %xmm8, %xmm1 */ movdqu 112(%rdi), %xmm8 movdqu %xmm1, (%rsi, %rax) - addq $16, %rax - cmpq %rax, %r9 + addl $16, %eax + cmpl %eax, %r9d jne 4b 5: xor %eax, %eax @@ -2167,17 +2191,18 @@ intel_aes_decrypt_ecb_256: /* in %rdi : cx - context in %rsi : output - pointer to output buffer in %rdx : outputLen - pointer to variable for length of output - (filled by caller) - in %rcx : maxOutputLen - length of output buffer + (already filled in by caller) + in %ecx : maxOutputLen - length of output buffer + (already checked by caller) in %r8 : input - pointer to input buffer - in %r9 : inputLen - length of input buffer + in %r9d : inputLen - length of input buffer on stack: blocksize - AES blocksize (always 16, unused) */ .type intel_aes_encrypt_cbc_256,@function .globl intel_aes_encrypt_cbc_256 .align 16 intel_aes_encrypt_cbc_256: - testq %r9, %r9 + testl %r9d, %r9d je 2f // leaq IV_OFFSET(%rdi), %rdx @@ -2223,8 +2248,8 @@ intel_aes_encrypt_cbc_256: .byte 0x66,0x41,0x0f,0x38,0xdd,0xcf /* aesenclast %xmm15, %xmm1 */ movdqu %xmm1, (%rsi, %rax) movdqa %xmm1, %xmm0 - addq $16, %rax - cmpq %rax, %r9 + addl $16, %eax + cmpl %eax, %r9d jne 1b movdqu %xmm0, (%rdx) @@ -2237,10 +2262,11 @@ intel_aes_encrypt_cbc_256: /* in %rdi : cx - context in %rsi : output - pointer to output buffer in %rdx : outputLen - pointer to variable for length of output - (filled by caller) - in %rcx : maxOutputLen - length of output buffer + (already filled in by caller) + in %ecx : maxOutputLen - length of output buffer + (already checked by caller) in %r8 : input - pointer to input buffer - in %r9 : inputLen - length of input buffer + in %r9d : inputLen - length of input buffer on stack: blocksize - AES blocksize (always 16, unused) */ .type intel_aes_decrypt_cbc_256,@function @@ -2256,11 +2282,11 @@ intel_aes_decrypt_cbc_256: movdqu (%rdi), %xmm2 movdqu 224(%rdi), %xmm15 xorl %eax, %eax -// cmpq $8*16, %r9 - cmpq $128, %r9 +// cmpl $8*16, %r9d + cmpl $128, %r9d jb 1f -// leaq -8*16(%r9), %r11 - leaq -128(%r9), %r11 +// leal -8*16(%r9), %r11d + leal -128(%r9), %r11d 2: movdqu (%r8, %rax), %xmm3 movdqu 16(%r8, %rax), %xmm4 movdqu 32(%r8, %rax), %xmm5 @@ -2435,11 +2461,11 @@ intel_aes_decrypt_cbc_256: movdqu %xmm8, 80(%rsi, %rax) movdqu %xmm9, 96(%rsi, %rax) movdqu %xmm10, 112(%rsi, %rax) -// addq $8*16, %rax - addq $128, %rax - cmpq %r11, %rax +// addl $8*16, %eax + addl $128, %eax + cmpl %r11d, %eax jbe 2b -1: cmpq %rax, %r9 +1: cmpl %eax, %r9d je 5f movdqu 16(%rdi), %xmm2 @@ -2477,8 +2503,8 @@ intel_aes_decrypt_cbc_256: pxor %xmm0, %xmm1 movdqu (%r8, %rax), %xmm0 /* fetch the IV before we store the block */ movdqu %xmm1, (%rsi, %rax) /* in case input buf = output buf */ - addq $16, %rax - cmpq %rax, %r9 + addl $16, %eax + cmpl %eax, %r9d jne 4b 5: movdqu %xmm0, (%rdx) |