From bc36db1ebad82ec3da8cc30259bc293093c0aaab Mon Sep 17 00:00:00 2001
From: Nikos Mavrogiannopoulos
Date: Fri, 26 Apr 2019 14:43:19 +0200
Subject: Updated asm files to latest version under cryptogams license

Signed-off-by: Nikos Mavrogiannopoulos
---
 cfg.mk | 19 +-
 devel/openssl | 2 +-
 devel/perlasm/cpuid-x86.pl | 5 +
 devel/perlasm/sha256-ssse3-x86_64.pl | 1 +
 devel/perlasm/sha256-ssse3-x86_64.pl.license | 1 +
 lib/accelerated/aarch64/elf/aes-aarch64.s | 4 +-
 lib/accelerated/aarch64/elf/ghash-aarch64.s | 5 +-
 lib/accelerated/aarch64/elf/sha1-armv8.s | 2 -
 lib/accelerated/aarch64/elf/sha256-armv8.s | 4 +-
 lib/accelerated/aarch64/elf/sha512-armv8.s | 4 +-
 lib/accelerated/aarch64/macosx/aes-aarch64.s | 4 +-
 lib/accelerated/aarch64/macosx/ghash-aarch64.s | 5 +-
 lib/accelerated/aarch64/macosx/sha1-armv8.s | 2 -
 lib/accelerated/aarch64/macosx/sha256-armv8.s | 4 +-
 lib/accelerated/aarch64/macosx/sha512-armv8.s | 4 +-
 lib/accelerated/x86/coff/aes-ssse3-x86.s | 5 +-
 lib/accelerated/x86/coff/aes-ssse3-x86_64.s | 30 +-
 lib/accelerated/x86/coff/aesni-gcm-x86_64.s | 34 +-
 lib/accelerated/x86/coff/aesni-x86.s | 2038 ++++--
 lib/accelerated/x86/coff/aesni-x86_64.s | 1710 ++++-
 lib/accelerated/x86/coff/cpuid-x86.s | 1 -
 lib/accelerated/x86/coff/ghash-x86_64.s | 76 +-
 lib/accelerated/x86/coff/sha1-ssse3-x86.s | 3 +-
 lib/accelerated/x86/coff/sha1-ssse3-x86_64.s | 5238 +++++++++++---
 lib/accelerated/x86/coff/sha256-ssse3-x86.s | 56 +-
 lib/accelerated/x86/coff/sha256-ssse3-x86_64.s | 5731 +++++++++++++++
 lib/accelerated/x86/coff/sha512-ssse3-x86.s | 5 +-
 lib/accelerated/x86/coff/sha512-ssse3-x86_64.s | 8315 ++++++++++++++--------
 lib/accelerated/x86/elf/aes-ssse3-x86.s | 5 +-
 lib/accelerated/x86/elf/aes-ssse3-x86_64.s | 30 +-
 lib/accelerated/x86/elf/aesni-gcm-x86_64.s | 34 +-
 lib/accelerated/x86/elf/aesni-x86.s | 2045 ++++--
 lib/accelerated/x86/elf/aesni-x86_64.s | 1423 +++-
 lib/accelerated/x86/elf/cpuid-x86.s | 1 -
 lib/accelerated/x86/elf/cpuid-x86_64.s | 3 -
 lib/accelerated/x86/elf/ghash-x86_64.s | 80 +-
 lib/accelerated/x86/elf/sha1-ssse3-x86.s | 6 +-
 lib/accelerated/x86/elf/sha1-ssse3-x86_64.s | 4793 ++++++++++---
 lib/accelerated/x86/elf/sha256-ssse3-x86.s | 59 +-
 lib/accelerated/x86/elf/sha256-ssse3-x86_64.s | 5471 ++++++++++++++
 lib/accelerated/x86/elf/sha512-ssse3-x86.s | 8 +-
 lib/accelerated/x86/elf/sha512-ssse3-x86_64.s | 8199 +++++++++++++--------
 lib/accelerated/x86/files.mk | 6 +-
 lib/accelerated/x86/macosx/aes-ssse3-x86.s | 5 +-
 lib/accelerated/x86/macosx/aes-ssse3-x86_64.s | 30 +-
 lib/accelerated/x86/macosx/aesni-gcm-x86_64.s | 34 +-
 lib/accelerated/x86/macosx/aesni-x86.s | 2038 ++++--
 lib/accelerated/x86/macosx/aesni-x86_64.s | 1313 +++-
 lib/accelerated/x86/macosx/cpuid-x86.s | 1 -
 lib/accelerated/x86/macosx/ghash-x86_64.s | 68 +-
 lib/accelerated/x86/macosx/sha1-ssse3-x86.s | 3 +-
 lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s | 4784 ++++++++++---
 lib/accelerated/x86/macosx/sha256-ssse3-x86.s | 60 +-
 lib/accelerated/x86/macosx/sha256-ssse3-x86_64.s | 5470 ++++++++++++++
 lib/accelerated/x86/macosx/sha512-ssse3-x86.s | 5 +-
 lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s | 8188 +++++++++++++--------
 lib/accelerated/x86/sha-x86.h | 3 -
 57 files changed, 53711 insertions(+), 13762 deletions(-)
 create mode 120000 devel/perlasm/sha256-ssse3-x86_64.pl
 create mode 120000 devel/perlasm/sha256-ssse3-x86_64.pl.license
 create mode 100644 lib/accelerated/x86/coff/sha256-ssse3-x86_64.s
 create mode 100644 lib/accelerated/x86/elf/sha256-ssse3-x86_64.s
 create mode 100644
lib/accelerated/x86/macosx/sha256-ssse3-x86_64.s diff --git a/cfg.mk b/cfg.mk index 3837d9619e..f7ae6408fd 100644 --- a/cfg.mk +++ b/cfg.mk @@ -138,6 +138,7 @@ ASM_SOURCES_XXX := \ lib/accelerated/x86/XXX/sha1-ssse3-x86.s \ lib/accelerated/x86/XXX/sha1-ssse3-x86_64.s \ lib/accelerated/x86/XXX/sha256-ssse3-x86.s \ + lib/accelerated/x86/XXX/sha256-ssse3-x86_64.s \ lib/accelerated/x86/XXX/sha512-ssse3-x86.s \ lib/accelerated/x86/XXX/sha512-ssse3-x86_64.s \ lib/accelerated/x86/XXX/aesni-gcm-x86_64.s \ @@ -158,7 +159,7 @@ X86_FILES=XXX/aesni-x86.s XXX/cpuid-x86.s XXX/sha1-ssse3-x86.s \ X86_64_FILES=XXX/aesni-x86_64.s XXX/cpuid-x86_64.s XXX/ghash-x86_64.s \ XXX/sha1-ssse3-x86_64.s XXX/sha512-ssse3-x86_64.s XXX/aes-ssse3-x86_64.s \ - XXX/aesni-gcm-x86_64.s + XXX/aesni-gcm-x86_64.s XXX/sha256-ssse3-x86_64.s X86_PADLOCK_FILES=XXX/e_padlock-x86.s X86_64_PADLOCK_FILES=XXX/e_padlock-x86_64.s @@ -194,27 +195,27 @@ lib/accelerated/x86/files.mk: $(ASM_SOURCES_ELF) # Appro's code lib/accelerated/x86/elf/%.s: devel/perlasm/%.pl .submodule.stamp - cat $<.license > $@ - CC=gcc perl $< elf >> $@ + CC=gcc perl $< elf $@.tmp + cat $<.license $@.tmp > $@ && rm -f $@.tmp echo "" >> $@ echo ".section .note.GNU-stack,\"\",%progbits" >> $@ sed -i 's/OPENSSL_ia32cap_P/_gnutls_x86_cpuid_s/g' $@ lib/accelerated/x86/coff/%-x86.s: devel/perlasm/%-x86.pl .submodule.stamp - cat $<.license > $@ - CC=gcc perl $< coff >> $@ + CC=gcc perl $< coff $@.tmp + cat $<.license $@.tmp > $@ && rm -f $@.tmp echo "" >> $@ sed -i 's/OPENSSL_ia32cap_P/_gnutls_x86_cpuid_s/g' $@ lib/accelerated/x86/coff/%-x86_64.s: devel/perlasm/%-x86_64.pl .submodule.stamp - cat $<.license > $@ - CC=gcc perl $< mingw64 >> $@ + CC=gcc perl $< mingw64 $@.tmp + cat $<.license $@.tmp > $@ && rm -f $@.tmp echo "" >> $@ sed -i 's/OPENSSL_ia32cap_P/_gnutls_x86_cpuid_s/g' $@ lib/accelerated/x86/macosx/%.s: devel/perlasm/%.pl .submodule.stamp - cat $<.license > $@ - CC=gcc perl $< macosx >> $@ + CC=gcc perl $< macosx $@.tmp + cat $<.license $@.tmp > $@ && rm -f $@.tmp echo "" >> $@ sed -i 's/OPENSSL_ia32cap_P/_gnutls_x86_cpuid_s/g' $@ diff --git a/devel/openssl b/devel/openssl index 2805ee1e09..7216e9a20a 160000 --- a/devel/openssl +++ b/devel/openssl @@ -1 +1 @@ -Subproject commit 2805ee1e095a78f596dc7adf778441e2edb9f15c +Subproject commit 7216e9a20aee620d85185a6ddb8caa30f11f2192 diff --git a/devel/perlasm/cpuid-x86.pl b/devel/perlasm/cpuid-x86.pl index fa9c14e577..a5541d4586 100644 --- a/devel/perlasm/cpuid-x86.pl +++ b/devel/perlasm/cpuid-x86.pl @@ -10,6 +10,9 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../crypto/perlasm"); require "x86asm.pl"; +$output=pop; +open STDOUT,">$output"; + &asm_init($ARGV[0],$0); &function_begin_B("gnutls_cpuid"); @@ -55,3 +58,5 @@ require "x86asm.pl"; &asciz("CPUID for x86"); &asm_finish(); + +close STDOUT; diff --git a/devel/perlasm/sha256-ssse3-x86_64.pl b/devel/perlasm/sha256-ssse3-x86_64.pl new file mode 120000 index 0000000000..126162756a --- /dev/null +++ b/devel/perlasm/sha256-ssse3-x86_64.pl @@ -0,0 +1 @@ +sha512-ssse3-x86_64.pl \ No newline at end of file diff --git a/devel/perlasm/sha256-ssse3-x86_64.pl.license b/devel/perlasm/sha256-ssse3-x86_64.pl.license new file mode 120000 index 0000000000..614714ae16 --- /dev/null +++ b/devel/perlasm/sha256-ssse3-x86_64.pl.license @@ -0,0 +1 @@ +sha512-ssse3-x86_64.pl.license \ No newline at end of file diff --git a/lib/accelerated/aarch64/elf/aes-aarch64.s b/lib/accelerated/aarch64/elf/aes-aarch64.s index e5a2dc500d..ab227a8c14 100644 --- 
a/lib/accelerated/aarch64/elf/aes-aarch64.s +++ b/lib/accelerated/aarch64/elf/aes-aarch64.s @@ -40,8 +40,6 @@ # 1 "lib/accelerated/aarch64/elf/aes-aarch64.s.tmp.S" # 1 "" # 1 "" -# 1 "/usr/aarch64-linux-gnu/include/stdc-predef.h" 1 3 -# 1 "" 2 # 1 "lib/accelerated/aarch64/elf/aes-aarch64.s.tmp.S" # 1 "lib/accelerated/aarch64/aarch64-common.h" 1 # 2 "lib/accelerated/aarch64/elf/aes-aarch64.s.tmp.S" 2 @@ -226,6 +224,7 @@ aes_v8_set_encrypt_key: .type aes_v8_set_decrypt_key,%function .align 5 aes_v8_set_decrypt_key: +.inst 0xd503233f stp x29,x30,[sp,#-16]! add x29,sp,#0 bl .Lenc_key @@ -259,6 +258,7 @@ aes_v8_set_decrypt_key: eor x0,x0,x0 .Ldec_key_abort: ldp x29,x30,[sp],#16 +.inst 0xd50323bf ret .size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key .globl aes_v8_encrypt diff --git a/lib/accelerated/aarch64/elf/ghash-aarch64.s b/lib/accelerated/aarch64/elf/ghash-aarch64.s index 3875047c5e..c30139985b 100644 --- a/lib/accelerated/aarch64/elf/ghash-aarch64.s +++ b/lib/accelerated/aarch64/elf/ghash-aarch64.s @@ -40,12 +40,11 @@ # 1 "lib/accelerated/aarch64/elf/ghash-aarch64.s.tmp.S" # 1 "" # 1 "" -# 1 "/usr/aarch64-linux-gnu/include/stdc-predef.h" 1 3 -# 1 "" 2 # 1 "lib/accelerated/aarch64/elf/ghash-aarch64.s.tmp.S" # 1 "lib/accelerated/aarch64/aarch64-common.h" 1 # 2 "lib/accelerated/aarch64/elf/ghash-aarch64.s.tmp.S" 2 + .text .arch armv8-a+crypto .globl gcm_init_v8 @@ -193,7 +192,7 @@ gcm_ghash_v8: subs x3,x3,#32 mov x12,#16 -# 158 "lib/accelerated/aarch64/elf/ghash-aarch64.s.tmp.S" +# 159 "lib/accelerated/aarch64/elf/ghash-aarch64.s.tmp.S" ld1 {v20.2d,v21.2d},[x1],#32 movi v19.16b,#0xe1 ld1 {v22.2d},[x1] diff --git a/lib/accelerated/aarch64/elf/sha1-armv8.s b/lib/accelerated/aarch64/elf/sha1-armv8.s index 5c588ff7a6..4b65cf6ea8 100644 --- a/lib/accelerated/aarch64/elf/sha1-armv8.s +++ b/lib/accelerated/aarch64/elf/sha1-armv8.s @@ -40,8 +40,6 @@ # 1 "lib/accelerated/aarch64/elf/sha1-armv8.s.tmp.S" # 1 "" # 1 "" -# 1 "/usr/aarch64-linux-gnu/include/stdc-predef.h" 1 3 -# 1 "" 2 # 1 "lib/accelerated/aarch64/elf/sha1-armv8.s.tmp.S" # 1 "lib/accelerated/aarch64/aarch64-common.h" 1 # 2 "lib/accelerated/aarch64/elf/sha1-armv8.s.tmp.S" 2 diff --git a/lib/accelerated/aarch64/elf/sha256-armv8.s b/lib/accelerated/aarch64/elf/sha256-armv8.s index 4439e3cc69..bc3f146c68 100644 --- a/lib/accelerated/aarch64/elf/sha256-armv8.s +++ b/lib/accelerated/aarch64/elf/sha256-armv8.s @@ -40,8 +40,6 @@ # 1 "lib/accelerated/aarch64/elf/sha256-armv8.s.tmp.S" # 1 "" # 1 "" -# 1 "/usr/aarch64-linux-gnu/include/stdc-predef.h" 1 3 -# 1 "" 2 # 1 "lib/accelerated/aarch64/elf/sha256-armv8.s.tmp.S" # 56 "lib/accelerated/aarch64/elf/sha256-armv8.s.tmp.S" # 1 "lib/accelerated/aarch64/aarch64-common.h" 1 @@ -69,6 +67,7 @@ sha256_block_data_order: tst w16,#(1<<0) b.ne .Lneon_entry +.inst 0xd503233f stp x29,x30,[sp,#-128]! 
add x29,sp,#0 @@ -1028,6 +1027,7 @@ sha256_block_data_order: ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#128 +.inst 0xd50323bf ret .size sha256_block_data_order,.-sha256_block_data_order diff --git a/lib/accelerated/aarch64/elf/sha512-armv8.s b/lib/accelerated/aarch64/elf/sha512-armv8.s index 0d65657dd9..b036c2a121 100644 --- a/lib/accelerated/aarch64/elf/sha512-armv8.s +++ b/lib/accelerated/aarch64/elf/sha512-armv8.s @@ -40,8 +40,6 @@ # 1 "lib/accelerated/aarch64/elf/sha512-armv8.s.tmp.S" # 1 "" # 1 "" -# 1 "/usr/aarch64-linux-gnu/include/stdc-predef.h" 1 3 -# 1 "" 2 # 1 "lib/accelerated/aarch64/elf/sha512-armv8.s.tmp.S" # 56 "lib/accelerated/aarch64/elf/sha512-armv8.s.tmp.S" # 1 "lib/accelerated/aarch64/aarch64-common.h" 1 @@ -67,6 +65,7 @@ sha512_block_data_order: tst w16,#(1<<6) b.ne .Lv8_entry +.inst 0xd503233f stp x29,x30,[sp,#-128]! add x29,sp,#0 @@ -1026,6 +1025,7 @@ sha512_block_data_order: ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#128 +.inst 0xd50323bf ret .size sha512_block_data_order,.-sha512_block_data_order diff --git a/lib/accelerated/aarch64/macosx/aes-aarch64.s b/lib/accelerated/aarch64/macosx/aes-aarch64.s index f017bcd95d..7acabf3f25 100644 --- a/lib/accelerated/aarch64/macosx/aes-aarch64.s +++ b/lib/accelerated/aarch64/macosx/aes-aarch64.s @@ -40,8 +40,6 @@ # 1 "lib/accelerated/aarch64/macosx/aes-aarch64.s.tmp.S" # 1 "" # 1 "" -# 1 "/usr/aarch64-linux-gnu/include/stdc-predef.h" 1 3 -# 1 "" 2 # 1 "lib/accelerated/aarch64/macosx/aes-aarch64.s.tmp.S" # 1 "lib/accelerated/aarch64/aarch64-common.h" 1 # 2 "lib/accelerated/aarch64/macosx/aes-aarch64.s.tmp.S" 2 @@ -226,6 +224,7 @@ Lenc_key_abort: .align 5 _aes_v8_set_decrypt_key: +.long 0xd503233f stp x29,x30,[sp,#-16]! add x29,sp,#0 bl Lenc_key @@ -259,6 +258,7 @@ Loop_imc: eor x0,x0,x0 Ldec_key_abort: ldp x29,x30,[sp],#16 +.long 0xd50323bf ret .globl _aes_v8_encrypt diff --git a/lib/accelerated/aarch64/macosx/ghash-aarch64.s b/lib/accelerated/aarch64/macosx/ghash-aarch64.s index f49a8dbb70..bf33773aa8 100644 --- a/lib/accelerated/aarch64/macosx/ghash-aarch64.s +++ b/lib/accelerated/aarch64/macosx/ghash-aarch64.s @@ -40,12 +40,11 @@ # 1 "lib/accelerated/aarch64/macosx/ghash-aarch64.s.tmp.S" # 1 "" # 1 "" -# 1 "/usr/aarch64-linux-gnu/include/stdc-predef.h" 1 3 -# 1 "" 2 # 1 "lib/accelerated/aarch64/macosx/ghash-aarch64.s.tmp.S" # 1 "lib/accelerated/aarch64/aarch64-common.h" 1 # 2 "lib/accelerated/aarch64/macosx/ghash-aarch64.s.tmp.S" 2 + .text .globl _gcm_init_v8 @@ -193,7 +192,7 @@ _gcm_ghash_v8: subs x3,x3,#32 mov x12,#16 -# 158 "lib/accelerated/aarch64/macosx/ghash-aarch64.s.tmp.S" +# 159 "lib/accelerated/aarch64/macosx/ghash-aarch64.s.tmp.S" ld1 {v20.2d,v21.2d},[x1],#32 movi v19.16b,#0xe1 ld1 {v22.2d},[x1] diff --git a/lib/accelerated/aarch64/macosx/sha1-armv8.s b/lib/accelerated/aarch64/macosx/sha1-armv8.s index 221dc11731..8e1e12edf6 100644 --- a/lib/accelerated/aarch64/macosx/sha1-armv8.s +++ b/lib/accelerated/aarch64/macosx/sha1-armv8.s @@ -40,8 +40,6 @@ # 1 "lib/accelerated/aarch64/macosx/sha1-armv8.s.tmp.S" # 1 "" # 1 "" -# 1 "/usr/aarch64-linux-gnu/include/stdc-predef.h" 1 3 -# 1 "" 2 # 1 "lib/accelerated/aarch64/macosx/sha1-armv8.s.tmp.S" # 1 "lib/accelerated/aarch64/aarch64-common.h" 1 # 2 "lib/accelerated/aarch64/macosx/sha1-armv8.s.tmp.S" 2 diff --git a/lib/accelerated/aarch64/macosx/sha256-armv8.s b/lib/accelerated/aarch64/macosx/sha256-armv8.s index b48f6ca42a..fc6424975c 100644 --- a/lib/accelerated/aarch64/macosx/sha256-armv8.s +++ 
b/lib/accelerated/aarch64/macosx/sha256-armv8.s @@ -40,8 +40,6 @@ # 1 "lib/accelerated/aarch64/macosx/sha256-armv8.s.tmp.S" # 1 "" # 1 "" -# 1 "/usr/aarch64-linux-gnu/include/stdc-predef.h" 1 3 -# 1 "" 2 # 1 "lib/accelerated/aarch64/macosx/sha256-armv8.s.tmp.S" # 56 "lib/accelerated/aarch64/macosx/sha256-armv8.s.tmp.S" # 1 "lib/accelerated/aarch64/aarch64-common.h" 1 @@ -69,6 +67,7 @@ _sha256_block_data_order: tst w16,#(1<<0) b.ne Lneon_entry +.long 0xd503233f stp x29,x30,[sp,#-128]! add x29,sp,#0 @@ -1028,6 +1027,7 @@ Loop_16_xx: ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#128 +.long 0xd50323bf ret diff --git a/lib/accelerated/aarch64/macosx/sha512-armv8.s b/lib/accelerated/aarch64/macosx/sha512-armv8.s index 798619bc9a..43af71fa48 100644 --- a/lib/accelerated/aarch64/macosx/sha512-armv8.s +++ b/lib/accelerated/aarch64/macosx/sha512-armv8.s @@ -40,8 +40,6 @@ # 1 "lib/accelerated/aarch64/macosx/sha512-armv8.s.tmp.S" # 1 "" # 1 "" -# 1 "/usr/aarch64-linux-gnu/include/stdc-predef.h" 1 3 -# 1 "" 2 # 1 "lib/accelerated/aarch64/macosx/sha512-armv8.s.tmp.S" # 56 "lib/accelerated/aarch64/macosx/sha512-armv8.s.tmp.S" # 1 "lib/accelerated/aarch64/aarch64-common.h" 1 @@ -67,6 +65,7 @@ _sha512_block_data_order: tst w16,#(1<<6) b.ne Lv8_entry +.long 0xd503233f stp x29,x30,[sp,#-128]! add x29,sp,#0 @@ -1026,6 +1025,7 @@ Loop_16_xx: ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#128 +.long 0xd50323bf ret diff --git a/lib/accelerated/x86/coff/aes-ssse3-x86.s b/lib/accelerated/x86/coff/aes-ssse3-x86.s index 6e6ea90973..c58ea23597 100644 --- a/lib/accelerated/x86/coff/aes-ssse3-x86.s +++ b/lib/accelerated/x86/coff/aes-ssse3-x86.s @@ -5,12 +5,11 @@ ## By Mike Hamburg (Stanford University), 2009 ## Public domain. ## -## For details see https://shiftleft.org/papers/vector_aes/ and -## https://crypto.stanford.edu/vpaes/. +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. # # *** This file is auto-generated *** # -.file "vpaes-x86.s" .text .align 64 .L_vpaes_consts: diff --git a/lib/accelerated/x86/coff/aes-ssse3-x86_64.s b/lib/accelerated/x86/coff/aes-ssse3-x86_64.s index 8c4a7d709d..150c9921d7 100644 --- a/lib/accelerated/x86/coff/aes-ssse3-x86_64.s +++ b/lib/accelerated/x86/coff/aes-ssse3-x86_64.s @@ -5,8 +5,8 @@ ## By Mike Hamburg (Stanford University), 2009 ## Public domain. ## -## For details see https://shiftleft.org/papers/vector_aes/ and -## https://crypto.stanford.edu/vpaes/. +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. 
# # *** This file is auto-generated *** # @@ -30,6 +30,7 @@ .def _vpaes_encrypt_core; .scl 3; .type 32; .endef .p2align 4 _vpaes_encrypt_core: + movq %rdx,%r9 movq $16,%r11 movl 240(%rdx),%eax @@ -117,9 +118,11 @@ _vpaes_encrypt_core: + .def _vpaes_decrypt_core; .scl 3; .type 32; .endef .p2align 4 _vpaes_decrypt_core: + movq %rdx,%r9 movl 240(%rdx),%eax movdqa %xmm9,%xmm1 @@ -223,6 +226,7 @@ _vpaes_decrypt_core: + .def _vpaes_schedule_core; .scl 3; .type 32; .endef .p2align 4 _vpaes_schedule_core: @@ -231,6 +235,7 @@ _vpaes_schedule_core: + call _vpaes_preheat movdqa .Lk_rcon(%rip),%xmm8 movdqu (%rdi),%xmm0 @@ -408,9 +413,11 @@ _vpaes_schedule_core: + .def _vpaes_schedule_192_smear; .scl 3; .type 32; .endef .p2align 4 _vpaes_schedule_192_smear: + pshufd $0x80,%xmm6,%xmm1 pshufd $0xFE,%xmm7,%xmm0 pxor %xmm1,%xmm6 @@ -438,11 +445,13 @@ _vpaes_schedule_192_smear: + .def _vpaes_schedule_round; .scl 3; .type 32; .endef .p2align 4 _vpaes_schedule_round: + pxor %xmm1,%xmm1 .byte 102,65,15,58,15,200,15 .byte 102,69,15,58,15,192,15 @@ -506,9 +515,11 @@ _vpaes_schedule_low_round: + .def _vpaes_schedule_transform; .scl 3; .type 32; .endef .p2align 4 _vpaes_schedule_transform: + movdqa %xmm9,%xmm1 pandn %xmm0,%xmm1 psrld $4,%xmm1 @@ -538,6 +549,7 @@ _vpaes_schedule_transform: + @@ -547,6 +559,7 @@ _vpaes_schedule_transform: .def _vpaes_schedule_mangle; .scl 3; .type 32; .endef .p2align 4 _vpaes_schedule_mangle: + movdqa %xmm0,%xmm4 movdqa .Lk_mc_forward(%rip),%xmm5 testq %rcx,%rcx @@ -616,6 +629,7 @@ _vpaes_schedule_mangle: + .globl vpaes_set_encrypt_key .def vpaes_set_encrypt_key; .scl 2; .type 32; .endef .p2align 4 @@ -628,6 +642,7 @@ vpaes_set_encrypt_key: movq %rdx,%rsi movq %r8,%rdx + leaq -184(%rsp),%rsp movaps %xmm6,16(%rsp) movaps %xmm7,32(%rsp) @@ -664,6 +679,7 @@ vpaes_set_encrypt_key: movq 8(%rsp),%rdi movq 16(%rsp),%rsi .byte 0xf3,0xc3 + .LSEH_end_vpaes_set_encrypt_key: .globl vpaes_set_decrypt_key @@ -678,6 +694,7 @@ vpaes_set_decrypt_key: movq %rdx,%rsi movq %r8,%rdx + leaq -184(%rsp),%rsp movaps %xmm6,16(%rsp) movaps %xmm7,32(%rsp) @@ -719,6 +736,7 @@ vpaes_set_decrypt_key: movq 8(%rsp),%rdi movq 16(%rsp),%rsi .byte 0xf3,0xc3 + .LSEH_end_vpaes_set_decrypt_key: .globl vpaes_encrypt @@ -733,6 +751,7 @@ vpaes_encrypt: movq %rdx,%rsi movq %r8,%rdx + leaq -184(%rsp),%rsp movaps %xmm6,16(%rsp) movaps %xmm7,32(%rsp) @@ -764,6 +783,7 @@ vpaes_encrypt: movq 8(%rsp),%rdi movq 16(%rsp),%rsi .byte 0xf3,0xc3 + .LSEH_end_vpaes_encrypt: .globl vpaes_decrypt @@ -778,6 +798,7 @@ vpaes_decrypt: movq %rdx,%rsi movq %r8,%rdx + leaq -184(%rsp),%rsp movaps %xmm6,16(%rsp) movaps %xmm7,32(%rsp) @@ -809,6 +830,7 @@ vpaes_decrypt: movq 8(%rsp),%rdi movq 16(%rsp),%rsi .byte 0xf3,0xc3 + .LSEH_end_vpaes_decrypt: .globl vpaes_cbc_encrypt .def vpaes_cbc_encrypt; .scl 2; .type 32; .endef @@ -825,6 +847,7 @@ vpaes_cbc_encrypt: movq 40(%rsp),%r8 movq 48(%rsp),%r9 + xchgq %rcx,%rdx subq $16,%rcx jc .Lcbc_abort @@ -886,6 +909,7 @@ vpaes_cbc_encrypt: movq 8(%rsp),%rdi movq 16(%rsp),%rsi .byte 0xf3,0xc3 + .LSEH_end_vpaes_cbc_encrypt: @@ -896,6 +920,7 @@ vpaes_cbc_encrypt: .def _vpaes_preheat; .scl 3; .type 32; .endef .p2align 4 _vpaes_preheat: + leaq .Lk_s0F(%rip),%r10 movdqa -32(%r10),%xmm10 movdqa -16(%r10),%xmm11 @@ -912,6 +937,7 @@ _vpaes_preheat: + .p2align 6 _vpaes_consts: .Lk_inv: diff --git a/lib/accelerated/x86/coff/aesni-gcm-x86_64.s b/lib/accelerated/x86/coff/aesni-gcm-x86_64.s index bc3554ca07..7988004cb0 100644 --- a/lib/accelerated/x86/coff/aesni-gcm-x86_64.s +++ b/lib/accelerated/x86/coff/aesni-gcm-x86_64.s 
@@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -365,17 +365,25 @@ aesni_gcm_decrypt: movq 40(%rsp),%r8 movq 48(%rsp),%r9 + xorq %r10,%r10 cmpq $0x60,%rdx jb .Lgcm_dec_abort leaq (%rsp),%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + leaq -168(%rsp),%rsp movaps %xmm6,-216(%rax) movaps %xmm7,-200(%rax) @@ -459,17 +467,25 @@ aesni_gcm_decrypt: movaps -88(%rax),%xmm14 movaps -72(%rax),%xmm15 movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + .Lgcm_dec_abort: movq %r10,%rax movq 8(%rsp),%rdi movq 16(%rsp),%rsi .byte 0xf3,0xc3 + .LSEH_end_aesni_gcm_decrypt: .def _aesni_ctr32_6x; .scl 3; .type 32; .endef .p2align 5 @@ -577,17 +593,25 @@ aesni_gcm_encrypt: movq 40(%rsp),%r8 movq 48(%rsp),%r9 + xorq %r10,%r10 cmpq $288,%rdx jb .Lgcm_enc_abort leaq (%rsp),%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + leaq -168(%rsp),%rsp movaps %xmm6,-216(%rax) movaps %xmm7,-200(%rax) @@ -835,17 +859,25 @@ aesni_gcm_encrypt: movaps -88(%rax),%xmm14 movaps -72(%rax),%xmm15 movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + .Lgcm_enc_abort: movq %r10,%rax movq 8(%rsp),%rdi movq 16(%rsp),%rsi .byte 0xf3,0xc3 + .LSEH_end_aesni_gcm_encrypt: .p2align 6 .Lbswap_mask: diff --git a/lib/accelerated/x86/coff/aesni-x86.s b/lib/accelerated/x86/coff/aesni-x86.s index 502be77883..c6aa1a1e2a 100644 --- a/lib/accelerated/x86/coff/aesni-x86.s +++ b/lib/accelerated/x86/coff/aesni-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -37,7 +37,6 @@ # # *** This file is auto-generated *** # -.file "devel/perlasm/aesni-x86.s" .text .globl _aesni_encrypt .def _aesni_encrypt; .scl 2; .type 32; .endef @@ -60,7 +59,10 @@ _aesni_encrypt: leal 16(%edx),%edx jnz .L000enc1_loop_1 .byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%eax) + pxor %xmm2,%xmm2 ret .globl _aesni_decrypt .def _aesni_decrypt; .scl 2; .type 32; .endef @@ -83,31 +85,87 @@ _aesni_decrypt: leal 16(%edx),%edx jnz .L001dec1_loop_2 .byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%eax) + pxor %xmm2,%xmm2 + ret +.def __aesni_encrypt2; .scl 3; .type 32; .endef +.align 16 +__aesni_encrypt2: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L002enc2_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L002enc2_loop +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + ret +.def __aesni_decrypt2; .scl 3; .type 32; .endef +.align 16 +__aesni_decrypt2: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L003dec2_loop: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L003dec2_loop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,223,208 +.byte 102,15,56,223,216 ret .def __aesni_encrypt3; .scl 3; .type 32; .endef .align 16 __aesni_encrypt3: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 - movups (%edx),%xmm0 -.L002enc3_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L004enc3_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %ecx .byte 102,15,56,220,225 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leal 32(%edx),%edx .byte 102,15,56,220,224 - movups (%edx),%xmm0 - jnz .L002enc3_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L004enc3_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -119,25 +177,26 @@ __aesni_encrypt3: .align 16 __aesni_decrypt3: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 - movups (%edx),%xmm0 -.L003dec3_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L005dec3_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %ecx .byte 102,15,56,222,225 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leal 32(%edx),%edx .byte 102,15,56,222,224 - movups (%edx),%xmm0 - jnz .L003dec3_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L005dec3_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -150,27 +209,29 @@ __aesni_decrypt3: __aesni_encrypt4: movups (%edx),%xmm0 movups 16(%edx),%xmm1 - shrl $1,%ecx - leal 32(%edx),%edx + shll $4,%ecx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor 
%xmm0,%xmm4 pxor %xmm0,%xmm5 - movups (%edx),%xmm0 -.L004enc4_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 15,31,64,0 + addl $16,%ecx +.L006enc4_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %ecx .byte 102,15,56,220,225 .byte 102,15,56,220,233 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leal 32(%edx),%edx .byte 102,15,56,220,224 .byte 102,15,56,220,232 - movups (%edx),%xmm0 - jnz .L004enc4_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L006enc4_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -185,27 +246,29 @@ __aesni_encrypt4: __aesni_decrypt4: movups (%edx),%xmm0 movups 16(%edx),%xmm1 - shrl $1,%ecx - leal 32(%edx),%edx + shll $4,%ecx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 pxor %xmm0,%xmm5 - movups (%edx),%xmm0 -.L005dec4_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 15,31,64,0 + addl $16,%ecx +.L007dec4_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %ecx .byte 102,15,56,222,225 .byte 102,15,56,222,233 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leal 32(%edx),%edx .byte 102,15,56,222,224 .byte 102,15,56,222,232 - movups (%edx),%xmm0 - jnz .L005dec4_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L007dec4_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -219,45 +282,42 @@ __aesni_decrypt4: .align 16 __aesni_encrypt6: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 -.byte 102,15,56,220,209 pxor %xmm0,%xmm4 -.byte 102,15,56,220,217 +.byte 102,15,56,220,209 pxor %xmm0,%xmm5 - decl %ecx -.byte 102,15,56,220,225 pxor %xmm0,%xmm6 -.byte 102,15,56,220,233 +.byte 102,15,56,220,217 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 102,15,56,220,225 pxor %xmm0,%xmm7 -.byte 102,15,56,220,241 - movups (%edx),%xmm0 -.byte 102,15,56,220,249 - jmp .L_aesni_encrypt6_enter + movups (%edx,%ecx,1),%xmm0 + addl $16,%ecx + jmp .L008_aesni_encrypt6_inner .align 16 -.L006enc6_loop: +.L009enc6_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %ecx .byte 102,15,56,220,225 +.L008_aesni_encrypt6_inner: .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -.align 16 .L_aesni_encrypt6_enter: - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leal 32(%edx),%edx .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups (%edx),%xmm0 - jnz .L006enc6_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L009enc6_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -275,45 +335,42 @@ __aesni_encrypt6: .align 16 __aesni_decrypt6: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 -.byte 102,15,56,222,209 pxor %xmm0,%xmm4 -.byte 102,15,56,222,217 +.byte 102,15,56,222,209 pxor %xmm0,%xmm5 - decl %ecx -.byte 102,15,56,222,225 pxor %xmm0,%xmm6 -.byte 102,15,56,222,233 +.byte 102,15,56,222,217 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 102,15,56,222,225 pxor %xmm0,%xmm7 -.byte 102,15,56,222,241 - movups (%edx),%xmm0 -.byte 102,15,56,222,249 - jmp .L_aesni_decrypt6_enter + movups (%edx,%ecx,1),%xmm0 + addl $16,%ecx + jmp .L010_aesni_decrypt6_inner .align 16 -.L007dec6_loop: +.L011dec6_loop: .byte 
102,15,56,222,209 .byte 102,15,56,222,217 - decl %ecx .byte 102,15,56,222,225 +.L010_aesni_decrypt6_inner: .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 -.align 16 .L_aesni_decrypt6_enter: - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leal 32(%edx),%edx .byte 102,15,56,222,224 .byte 102,15,56,222,232 .byte 102,15,56,222,240 .byte 102,15,56,222,248 - movups (%edx),%xmm0 - jnz .L007dec6_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L011dec6_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -342,14 +399,14 @@ _aesni_ecb_encrypt: movl 32(%esp),%edx movl 36(%esp),%ebx andl $-16,%eax - jz .L008ecb_ret + jz .L012ecb_ret movl 240(%edx),%ecx testl %ebx,%ebx - jz .L009ecb_decrypt + jz .L013ecb_decrypt movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb .L010ecb_enc_tail + jb .L014ecb_enc_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -358,9 +415,9 @@ _aesni_ecb_encrypt: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp .L011ecb_enc_loop6_enter + jmp .L015ecb_enc_loop6_enter .align 16 -.L012ecb_enc_loop6: +.L016ecb_enc_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -375,12 +432,12 @@ _aesni_ecb_encrypt: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -.L011ecb_enc_loop6_enter: +.L015ecb_enc_loop6_enter: call __aesni_encrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc .L012ecb_enc_loop6 + jnc .L016ecb_enc_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -389,18 +446,18 @@ _aesni_ecb_encrypt: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz .L008ecb_ret -.L010ecb_enc_tail: + jz .L012ecb_ret +.L014ecb_enc_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb .L013ecb_enc_one + jb .L017ecb_enc_one movups 16(%esi),%xmm3 - je .L014ecb_enc_two + je .L018ecb_enc_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb .L015ecb_enc_three + jb .L019ecb_enc_three movups 48(%esi),%xmm5 - je .L016ecb_enc_four + je .L020ecb_enc_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call __aesni_encrypt6 @@ -409,50 +466,49 @@ _aesni_ecb_encrypt: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L008ecb_ret + jmp .L012ecb_ret .align 16 -.L013ecb_enc_one: +.L017ecb_enc_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L017enc1_loop_3: +.L021enc1_loop_3: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L017enc1_loop_3 + jnz .L021enc1_loop_3 .byte 102,15,56,221,209 movups %xmm2,(%edi) - jmp .L008ecb_ret + jmp .L012ecb_ret .align 16 -.L014ecb_enc_two: - xorps %xmm4,%xmm4 - call __aesni_encrypt3 +.L018ecb_enc_two: + call __aesni_encrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L008ecb_ret + jmp .L012ecb_ret .align 16 -.L015ecb_enc_three: +.L019ecb_enc_three: call __aesni_encrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L008ecb_ret + jmp .L012ecb_ret .align 16 -.L016ecb_enc_four: +.L020ecb_enc_four: call __aesni_encrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) - jmp .L008ecb_ret + jmp .L012ecb_ret .align 16 -.L009ecb_decrypt: +.L013ecb_decrypt: movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb .L018ecb_dec_tail + jb .L022ecb_dec_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -461,9 +517,9 @@ _aesni_ecb_encrypt: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp .L019ecb_dec_loop6_enter + jmp 
.L023ecb_dec_loop6_enter .align 16 -.L020ecb_dec_loop6: +.L024ecb_dec_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -478,12 +534,12 @@ _aesni_ecb_encrypt: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -.L019ecb_dec_loop6_enter: +.L023ecb_dec_loop6_enter: call __aesni_decrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc .L020ecb_dec_loop6 + jnc .L024ecb_dec_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -492,18 +548,18 @@ _aesni_ecb_encrypt: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz .L008ecb_ret -.L018ecb_dec_tail: + jz .L012ecb_ret +.L022ecb_dec_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb .L021ecb_dec_one + jb .L025ecb_dec_one movups 16(%esi),%xmm3 - je .L022ecb_dec_two + je .L026ecb_dec_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb .L023ecb_dec_three + jb .L027ecb_dec_three movups 48(%esi),%xmm5 - je .L024ecb_dec_four + je .L028ecb_dec_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call __aesni_decrypt6 @@ -512,44 +568,51 @@ _aesni_ecb_encrypt: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L008ecb_ret + jmp .L012ecb_ret .align 16 -.L021ecb_dec_one: +.L025ecb_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L025dec1_loop_4: +.L029dec1_loop_4: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L025dec1_loop_4 + jnz .L029dec1_loop_4 .byte 102,15,56,223,209 movups %xmm2,(%edi) - jmp .L008ecb_ret + jmp .L012ecb_ret .align 16 -.L022ecb_dec_two: - xorps %xmm4,%xmm4 - call __aesni_decrypt3 +.L026ecb_dec_two: + call __aesni_decrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L008ecb_ret + jmp .L012ecb_ret .align 16 -.L023ecb_dec_three: +.L027ecb_dec_three: call __aesni_decrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L008ecb_ret + jmp .L012ecb_ret .align 16 -.L024ecb_dec_four: +.L028ecb_dec_four: call __aesni_decrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) -.L008ecb_ret: +.L012ecb_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -587,48 +650,56 @@ _aesni_ccm64_encrypt_blocks: movl %ebp,20(%esp) movl %ebp,24(%esp) movl %ebp,28(%esp) - shrl $1,%ecx + shll $4,%ecx + movl $16,%ebx leal (%edx),%ebp movdqa (%esp),%xmm5 movdqa %xmm7,%xmm2 - movl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + subl %ecx,%ebx .byte 102,15,56,0,253 -.L026ccm64_enc_outer: +.L030ccm64_enc_outer: movups (%ebp),%xmm0 movl %ebx,%ecx movups (%esi),%xmm6 xorps %xmm0,%xmm2 movups 16(%ebp),%xmm1 xorps %xmm6,%xmm0 - leal 32(%ebp),%edx xorps %xmm0,%xmm3 - movups (%edx),%xmm0 -.L027ccm64_enc2_loop: + movups 32(%ebp),%xmm0 +.L031ccm64_enc2_loop: .byte 102,15,56,220,209 - decl %ecx .byte 102,15,56,220,217 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 - leal 32(%edx),%edx .byte 102,15,56,220,216 - movups (%edx),%xmm0 - jnz .L027ccm64_enc2_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L031ccm64_enc2_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 paddq 16(%esp),%xmm7 + decl %eax .byte 102,15,56,221,208 .byte 102,15,56,221,216 - decl %eax leal 16(%esi),%esi xorps %xmm2,%xmm6 movdqa %xmm7,%xmm2 movups %xmm6,(%edi) - leal 16(%edi),%edi .byte 102,15,56,0,213 - jnz .L026ccm64_enc_outer + leal 16(%edi),%edi + jnz .L030ccm64_enc_outer movl 48(%esp),%esp movl 40(%esp),%edi movups %xmm3,(%edi) + pxor 
%xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -675,71 +746,82 @@ _aesni_ccm64_decrypt_blocks: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L028enc1_loop_5: +.L032enc1_loop_5: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L028enc1_loop_5 + jnz .L032enc1_loop_5 .byte 102,15,56,221,209 + shll $4,%ebx + movl $16,%ecx movups (%esi),%xmm6 paddq 16(%esp),%xmm7 leal 16(%esi),%esi - jmp .L029ccm64_dec_outer + subl %ebx,%ecx + leal 32(%ebp,%ebx,1),%edx + movl %ecx,%ebx + jmp .L033ccm64_dec_outer .align 16 -.L029ccm64_dec_outer: +.L033ccm64_dec_outer: xorps %xmm2,%xmm6 movdqa %xmm7,%xmm2 - movl %ebx,%ecx movups %xmm6,(%edi) leal 16(%edi),%edi .byte 102,15,56,0,213 subl $1,%eax - jz .L030ccm64_dec_break + jz .L034ccm64_dec_break movups (%ebp),%xmm0 - shrl $1,%ecx + movl %ebx,%ecx movups 16(%ebp),%xmm1 xorps %xmm0,%xmm6 - leal 32(%ebp),%edx xorps %xmm0,%xmm2 xorps %xmm6,%xmm3 - movups (%edx),%xmm0 -.L031ccm64_dec2_loop: + movups 32(%ebp),%xmm0 +.L035ccm64_dec2_loop: .byte 102,15,56,220,209 - decl %ecx .byte 102,15,56,220,217 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 - leal 32(%edx),%edx .byte 102,15,56,220,216 - movups (%edx),%xmm0 - jnz .L031ccm64_dec2_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L035ccm64_dec2_loop movups (%esi),%xmm6 paddq 16(%esp),%xmm7 .byte 102,15,56,220,209 .byte 102,15,56,220,217 - leal 16(%esi),%esi .byte 102,15,56,221,208 .byte 102,15,56,221,216 - jmp .L029ccm64_dec_outer + leal 16(%esi),%esi + jmp .L033ccm64_dec_outer .align 16 -.L030ccm64_dec_break: +.L034ccm64_dec_break: + movl 240(%ebp),%ecx movl %ebp,%edx movups (%edx),%xmm0 movups 16(%edx),%xmm1 xorps %xmm0,%xmm6 leal 32(%edx),%edx xorps %xmm6,%xmm3 -.L032enc1_loop_6: +.L036enc1_loop_6: .byte 102,15,56,220,217 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L032enc1_loop_6 + jnz .L036enc1_loop_6 .byte 102,15,56,221,217 movl 48(%esp),%esp movl 40(%esp),%edi movups %xmm3,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -764,7 +846,7 @@ _aesni_ctr32_encrypt_blocks: andl $-16,%esp movl %ebp,80(%esp) cmpl $1,%eax - je .L033ctr32_one_shortcut + je .L037ctr32_one_shortcut movdqu (%ebx),%xmm7 movl $202182159,(%esp) movl $134810123,4(%esp) @@ -780,63 +862,59 @@ _aesni_ctr32_encrypt_blocks: .byte 102,15,58,34,253,3 movl 240(%edx),%ecx bswap %ebx - pxor %xmm1,%xmm1 pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movdqa (%esp),%xmm2 -.byte 102,15,58,34,203,0 +.byte 102,15,58,34,195,0 leal 3(%ebx),%ebp -.byte 102,15,58,34,197,0 +.byte 102,15,58,34,205,0 incl %ebx -.byte 102,15,58,34,203,1 +.byte 102,15,58,34,195,1 incl %ebp -.byte 102,15,58,34,197,1 +.byte 102,15,58,34,205,1 incl %ebx -.byte 102,15,58,34,203,2 +.byte 102,15,58,34,195,2 incl %ebp -.byte 102,15,58,34,197,2 - movdqa %xmm1,48(%esp) -.byte 102,15,56,0,202 - movdqa %xmm0,64(%esp) +.byte 102,15,58,34,205,2 + movdqa %xmm0,48(%esp) .byte 102,15,56,0,194 - pshufd $192,%xmm1,%xmm2 - pshufd $128,%xmm1,%xmm3 + movdqu (%edx),%xmm6 + movdqa %xmm1,64(%esp) +.byte 102,15,56,0,202 + pshufd $192,%xmm0,%xmm2 + pshufd $128,%xmm0,%xmm3 cmpl $6,%eax - jb .L034ctr32_tail + jb .L038ctr32_tail + pxor %xmm6,%xmm7 + shll $4,%ecx + movl $16,%ebx movdqa %xmm7,32(%esp) - shrl $1,%ecx movl %edx,%ebp - movl %ecx,%ebx + subl %ecx,%ebx 
+ leal 32(%edx,%ecx,1),%edx subl $6,%eax - jmp .L035ctr32_loop6 -.align 16 -.L035ctr32_loop6: - pshufd $64,%xmm1,%xmm4 - movdqa 32(%esp),%xmm1 - pshufd $192,%xmm0,%xmm5 - por %xmm1,%xmm2 - pshufd $128,%xmm0,%xmm6 - por %xmm1,%xmm3 - pshufd $64,%xmm0,%xmm7 - por %xmm1,%xmm4 - por %xmm1,%xmm5 - por %xmm1,%xmm6 - por %xmm1,%xmm7 - movups (%ebp),%xmm0 - movups 16(%ebp),%xmm1 - leal 32(%ebp),%edx - decl %ecx + jmp .L039ctr32_loop6 +.align 16 +.L039ctr32_loop6: + pshufd $64,%xmm0,%xmm4 + movdqa 32(%esp),%xmm0 + pshufd $192,%xmm1,%xmm5 pxor %xmm0,%xmm2 + pshufd $128,%xmm1,%xmm6 pxor %xmm0,%xmm3 -.byte 102,15,56,220,209 + pshufd $64,%xmm1,%xmm7 + movups 16(%ebp),%xmm1 pxor %xmm0,%xmm4 -.byte 102,15,56,220,217 pxor %xmm0,%xmm5 -.byte 102,15,56,220,225 +.byte 102,15,56,220,209 pxor %xmm0,%xmm6 -.byte 102,15,56,220,233 pxor %xmm0,%xmm7 +.byte 102,15,56,220,217 + movups 32(%ebp),%xmm0 + movl %ebx,%ecx +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 .byte 102,15,56,220,241 - movups (%edx),%xmm0 .byte 102,15,56,220,249 call .L_aesni_encrypt6_enter movups (%esi),%xmm1 @@ -847,51 +925,51 @@ _aesni_ctr32_encrypt_blocks: movups %xmm2,(%edi) movdqa 16(%esp),%xmm0 xorps %xmm1,%xmm4 - movdqa 48(%esp),%xmm1 + movdqa 64(%esp),%xmm1 movups %xmm3,16(%edi) movups %xmm4,32(%edi) paddd %xmm0,%xmm1 - paddd 64(%esp),%xmm0 + paddd 48(%esp),%xmm0 movdqa (%esp),%xmm2 movups 48(%esi),%xmm3 movups 64(%esi),%xmm4 xorps %xmm3,%xmm5 movups 80(%esi),%xmm3 leal 96(%esi),%esi - movdqa %xmm1,48(%esp) -.byte 102,15,56,0,202 + movdqa %xmm0,48(%esp) +.byte 102,15,56,0,194 xorps %xmm4,%xmm6 movups %xmm5,48(%edi) xorps %xmm3,%xmm7 - movdqa %xmm0,64(%esp) -.byte 102,15,56,0,194 + movdqa %xmm1,64(%esp) +.byte 102,15,56,0,202 movups %xmm6,64(%edi) - pshufd $192,%xmm1,%xmm2 + pshufd $192,%xmm0,%xmm2 movups %xmm7,80(%edi) leal 96(%edi),%edi - movl %ebx,%ecx - pshufd $128,%xmm1,%xmm3 + pshufd $128,%xmm0,%xmm3 subl $6,%eax - jnc .L035ctr32_loop6 + jnc .L039ctr32_loop6 addl $6,%eax - jz .L036ctr32_ret + jz .L040ctr32_ret + movdqu (%ebp),%xmm7 movl %ebp,%edx - leal 1(,%ecx,2),%ecx - movdqa 32(%esp),%xmm7 -.L034ctr32_tail: + pxor 32(%esp),%xmm7 + movl 240(%ebp),%ecx +.L038ctr32_tail: por %xmm7,%xmm2 cmpl $2,%eax - jb .L037ctr32_one - pshufd $64,%xmm1,%xmm4 + jb .L041ctr32_one + pshufd $64,%xmm0,%xmm4 por %xmm7,%xmm3 - je .L038ctr32_two - pshufd $192,%xmm0,%xmm5 + je .L042ctr32_two + pshufd $192,%xmm1,%xmm5 por %xmm7,%xmm4 cmpl $4,%eax - jb .L039ctr32_three - pshufd $128,%xmm0,%xmm6 + jb .L043ctr32_three + pshufd $128,%xmm1,%xmm6 por %xmm7,%xmm5 - je .L040ctr32_four + je .L044ctr32_four por %xmm7,%xmm6 call __aesni_encrypt6 movups (%esi),%xmm1 @@ -909,39 +987,39 @@ _aesni_ctr32_encrypt_blocks: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L036ctr32_ret + jmp .L040ctr32_ret .align 16 -.L033ctr32_one_shortcut: +.L037ctr32_one_shortcut: movups (%ebx),%xmm2 movl 240(%edx),%ecx -.L037ctr32_one: +.L041ctr32_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L041enc1_loop_7: +.L045enc1_loop_7: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L041enc1_loop_7 + jnz .L045enc1_loop_7 .byte 102,15,56,221,209 movups (%esi),%xmm6 xorps %xmm2,%xmm6 movups %xmm6,(%edi) - jmp .L036ctr32_ret + jmp .L040ctr32_ret .align 16 -.L038ctr32_two: - call __aesni_encrypt3 +.L042ctr32_two: + call __aesni_encrypt2 movups (%esi),%xmm5 movups 16(%esi),%xmm6 xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L036ctr32_ret + jmp .L040ctr32_ret .align 16 
-.L039ctr32_three: +.L043ctr32_three: call __aesni_encrypt3 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -952,9 +1030,9 @@ _aesni_ctr32_encrypt_blocks: xorps %xmm7,%xmm4 movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L036ctr32_ret + jmp .L040ctr32_ret .align 16 -.L040ctr32_four: +.L044ctr32_four: call __aesni_encrypt4 movups (%esi),%xmm6 movups 16(%esi),%xmm7 @@ -968,7 +1046,18 @@ _aesni_ctr32_encrypt_blocks: xorps %xmm0,%xmm5 movups %xmm4,32(%edi) movups %xmm5,48(%edi) -.L036ctr32_ret: +.L040ctr32_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 movl 80(%esp),%esp popl %edi popl %esi @@ -992,12 +1081,12 @@ _aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L042enc1_loop_8: +.L046enc1_loop_8: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L042enc1_loop_8 + jnz .L046enc1_loop_8 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1021,12 +1110,14 @@ _aesni_xts_encrypt: movl %edx,%ebp movl %ecx,%ebx subl $96,%eax - jc .L043xts_enc_short - shrl $1,%ecx - movl %ecx,%ebx - jmp .L044xts_enc_loop6 + jc .L047xts_enc_short + shll $4,%ecx + movl $16,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + jmp .L048xts_enc_loop6 .align 16 -.L044xts_enc_loop6: +.L048xts_enc_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1062,6 +1153,7 @@ _aesni_xts_encrypt: pand %xmm3,%xmm7 movups (%esi),%xmm2 pxor %xmm1,%xmm7 + movl %ebx,%ecx movdqu 16(%esi),%xmm3 xorps %xmm0,%xmm2 movdqu 32(%esi),%xmm4 @@ -1077,19 +1169,17 @@ _aesni_xts_encrypt: movdqa %xmm7,80(%esp) pxor %xmm1,%xmm7 movups 16(%ebp),%xmm1 - leal 32(%ebp),%edx pxor 16(%esp),%xmm3 -.byte 102,15,56,220,209 pxor 32(%esp),%xmm4 -.byte 102,15,56,220,217 +.byte 102,15,56,220,209 pxor 48(%esp),%xmm5 - decl %ecx -.byte 102,15,56,220,225 pxor 64(%esp),%xmm6 -.byte 102,15,56,220,233 +.byte 102,15,56,220,217 pxor %xmm0,%xmm7 + movups 32(%ebp),%xmm0 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 .byte 102,15,56,220,241 - movups (%edx),%xmm0 .byte 102,15,56,220,249 call .L_aesni_encrypt6_enter movdqa 80(%esp),%xmm1 @@ -1114,26 +1204,25 @@ _aesni_xts_encrypt: paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 - movl %ebx,%ecx pxor %xmm2,%xmm1 subl $96,%eax - jnc .L044xts_enc_loop6 - leal 1(,%ecx,2),%ecx + jnc .L048xts_enc_loop6 + movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -.L043xts_enc_short: +.L047xts_enc_short: addl $96,%eax - jz .L045xts_enc_done6x + jz .L049xts_enc_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb .L046xts_enc_one + jb .L050xts_enc_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je .L047xts_enc_two + je .L051xts_enc_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1142,7 +1231,7 @@ _aesni_xts_encrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb .L048xts_enc_three + jb .L052xts_enc_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1152,7 +1241,7 @@ _aesni_xts_encrypt: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je .L049xts_enc_four + je .L053xts_enc_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1184,9 +1273,9 @@ _aesni_xts_encrypt: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp .L050xts_enc_done + jmp .L054xts_enc_done .align 16 -.L046xts_enc_one: +.L050xts_enc_one: movups 
(%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1194,37 +1283,36 @@ _aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L051enc1_loop_9: +.L055enc1_loop_9: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L051enc1_loop_9 + jnz .L055enc1_loop_9 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp .L050xts_enc_done + jmp .L054xts_enc_done .align 16 -.L047xts_enc_two: +.L051xts_enc_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 leal 32(%esi),%esi xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 - xorps %xmm4,%xmm4 - call __aesni_encrypt3 + call __aesni_encrypt2 xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L050xts_enc_done + jmp .L054xts_enc_done .align 16 -.L048xts_enc_three: +.L052xts_enc_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1242,9 +1330,9 @@ _aesni_xts_encrypt: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp .L050xts_enc_done + jmp .L054xts_enc_done .align 16 -.L049xts_enc_four: +.L053xts_enc_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1266,28 +1354,28 @@ _aesni_xts_encrypt: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L050xts_enc_done + jmp .L054xts_enc_done .align 16 -.L045xts_enc_done6x: +.L049xts_enc_done6x: movl 112(%esp),%eax andl $15,%eax - jz .L052xts_enc_ret + jz .L056xts_enc_ret movdqa %xmm1,%xmm5 movl %eax,112(%esp) - jmp .L053xts_enc_steal + jmp .L057xts_enc_steal .align 16 -.L050xts_enc_done: +.L054xts_enc_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz .L052xts_enc_ret + jz .L056xts_enc_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm5 paddq %xmm1,%xmm1 pand 96(%esp),%xmm5 pxor %xmm1,%xmm5 -.L053xts_enc_steal: +.L057xts_enc_steal: movzbl (%esi),%ecx movzbl -16(%edi),%edx leal 1(%esi),%esi @@ -1295,7 +1383,7 @@ _aesni_xts_encrypt: movb %dl,(%edi) leal 1(%edi),%edi subl $1,%eax - jnz .L053xts_enc_steal + jnz .L057xts_enc_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1305,16 +1393,30 @@ _aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L054enc1_loop_10: +.L058enc1_loop_10: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L054enc1_loop_10 + jnz .L058enc1_loop_10 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,-16(%edi) -.L052xts_enc_ret: +.L056xts_enc_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) movl 116(%esp),%esp popl %edi popl %esi @@ -1338,12 +1440,12 @@ _aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L055enc1_loop_11: +.L059enc1_loop_11: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L055enc1_loop_11 + jnz .L059enc1_loop_11 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1372,12 +1474,14 @@ _aesni_xts_decrypt: pcmpgtd %xmm1,%xmm0 andl $-16,%eax subl $96,%eax - jc .L056xts_dec_short - shrl $1,%ecx - movl %ecx,%ebx - jmp .L057xts_dec_loop6 + jc .L060xts_dec_short + shll $4,%ecx + movl $16,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + jmp .L061xts_dec_loop6 .align 16 -.L057xts_dec_loop6: +.L061xts_dec_loop6: pshufd $19,%xmm0,%xmm2 
pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1413,6 +1517,7 @@ _aesni_xts_decrypt: pand %xmm3,%xmm7 movups (%esi),%xmm2 pxor %xmm1,%xmm7 + movl %ebx,%ecx movdqu 16(%esi),%xmm3 xorps %xmm0,%xmm2 movdqu 32(%esi),%xmm4 @@ -1428,19 +1533,17 @@ _aesni_xts_decrypt: movdqa %xmm7,80(%esp) pxor %xmm1,%xmm7 movups 16(%ebp),%xmm1 - leal 32(%ebp),%edx pxor 16(%esp),%xmm3 -.byte 102,15,56,222,209 pxor 32(%esp),%xmm4 -.byte 102,15,56,222,217 +.byte 102,15,56,222,209 pxor 48(%esp),%xmm5 - decl %ecx -.byte 102,15,56,222,225 pxor 64(%esp),%xmm6 -.byte 102,15,56,222,233 +.byte 102,15,56,222,217 pxor %xmm0,%xmm7 + movups 32(%ebp),%xmm0 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 .byte 102,15,56,222,241 - movups (%edx),%xmm0 .byte 102,15,56,222,249 call .L_aesni_decrypt6_enter movdqa 80(%esp),%xmm1 @@ -1465,26 +1568,25 @@ _aesni_xts_decrypt: paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 - movl %ebx,%ecx pxor %xmm2,%xmm1 subl $96,%eax - jnc .L057xts_dec_loop6 - leal 1(,%ecx,2),%ecx + jnc .L061xts_dec_loop6 + movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -.L056xts_dec_short: +.L060xts_dec_short: addl $96,%eax - jz .L058xts_dec_done6x + jz .L062xts_dec_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb .L059xts_dec_one + jb .L063xts_dec_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je .L060xts_dec_two + je .L064xts_dec_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1493,7 +1595,7 @@ _aesni_xts_decrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb .L061xts_dec_three + jb .L065xts_dec_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1503,7 +1605,7 @@ _aesni_xts_decrypt: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je .L062xts_dec_four + je .L066xts_dec_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1535,9 +1637,9 @@ _aesni_xts_decrypt: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp .L063xts_dec_done + jmp .L067xts_dec_done .align 16 -.L059xts_dec_one: +.L063xts_dec_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1545,36 +1647,36 @@ _aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L064dec1_loop_12: +.L068dec1_loop_12: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L064dec1_loop_12 + jnz .L068dec1_loop_12 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp .L063xts_dec_done + jmp .L067xts_dec_done .align 16 -.L060xts_dec_two: +.L064xts_dec_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 leal 32(%esi),%esi xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 - call __aesni_decrypt3 + call __aesni_decrypt2 xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L063xts_dec_done + jmp .L067xts_dec_done .align 16 -.L061xts_dec_three: +.L065xts_dec_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1592,9 +1694,9 @@ _aesni_xts_decrypt: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp .L063xts_dec_done + jmp .L067xts_dec_done .align 16 -.L062xts_dec_four: +.L066xts_dec_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1616,20 +1718,20 @@ _aesni_xts_decrypt: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L063xts_dec_done + jmp .L067xts_dec_done .align 16 -.L058xts_dec_done6x: +.L062xts_dec_done6x: movl 112(%esp),%eax andl $15,%eax - jz 
.L065xts_dec_ret + jz .L069xts_dec_ret movl %eax,112(%esp) - jmp .L066xts_dec_only_one_more + jmp .L070xts_dec_only_one_more .align 16 -.L063xts_dec_done: +.L067xts_dec_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz .L065xts_dec_ret + jz .L069xts_dec_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm2 @@ -1639,7 +1741,7 @@ _aesni_xts_decrypt: pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 -.L066xts_dec_only_one_more: +.L070xts_dec_only_one_more: pshufd $19,%xmm0,%xmm5 movdqa %xmm1,%xmm6 paddq %xmm1,%xmm1 @@ -1653,16 +1755,16 @@ _aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L067dec1_loop_13: +.L071dec1_loop_13: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L067dec1_loop_13 + jnz .L071dec1_loop_13 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) -.L068xts_dec_steal: +.L072xts_dec_steal: movzbl 16(%esi),%ecx movzbl (%edi),%edx leal 1(%esi),%esi @@ -1670,7 +1772,7 @@ _aesni_xts_decrypt: movb %dl,16(%edi) leal 1(%edi),%edi subl $1,%eax - jnz .L068xts_dec_steal + jnz .L072xts_dec_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1680,105 +1782,908 @@ _aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L069dec1_loop_14: +.L073dec1_loop_14: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L069dec1_loop_14 + jnz .L073dec1_loop_14 .byte 102,15,56,223,209 xorps %xmm6,%xmm2 movups %xmm2,(%edi) -.L065xts_dec_ret: +.L069xts_dec_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) movl 116(%esp),%esp popl %edi popl %esi popl %ebx popl %ebp ret -.globl _aesni_cbc_encrypt -.def _aesni_cbc_encrypt; .scl 2; .type 32; .endef +.globl _aesni_ocb_encrypt +.def _aesni_ocb_encrypt; .scl 2; .type 32; .endef .align 16 -_aesni_cbc_encrypt: -.L_aesni_cbc_encrypt_begin: +_aesni_ocb_encrypt: +.L_aesni_ocb_encrypt_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi + movl 40(%esp),%ecx + movl 48(%esp),%ebx movl 20(%esp),%esi - movl %esp,%ebx movl 24(%esp),%edi - subl $24,%ebx movl 28(%esp),%eax - andl $-16,%ebx movl 32(%esp),%edx + movdqu (%ecx),%xmm0 movl 36(%esp),%ebp - testl %eax,%eax - jz .L070cbc_abort - cmpl $0,40(%esp) - xchgl %esp,%ebx - movups (%ebp),%xmm7 + movdqu (%ebx),%xmm1 + movl 44(%esp),%ebx + movl %esp,%ecx + subl $132,%esp + andl $-16,%esp + subl %esi,%edi + shll $4,%eax + leal -96(%esi,%eax,1),%eax + movl %edi,120(%esp) + movl %eax,124(%esp) + movl %ecx,128(%esp) movl 240(%edx),%ecx - movl %edx,%ebp - movl %ebx,16(%esp) - movl %ecx,%ebx - je .L071cbc_decrypt - movaps %xmm7,%xmm2 - cmpl $16,%eax - jb .L072cbc_enc_tail - subl $16,%eax - jmp .L073cbc_enc_loop -.align 16 -.L073cbc_enc_loop: - movups (%esi),%xmm7 + testl $1,%ebp + jnz .L074odd + bsfl %ebp,%eax + addl $1,%ebp + shll $4,%eax + movdqu (%ebx,%eax,1),%xmm7 + movl %edx,%eax + movdqu (%esi),%xmm2 leal 16(%esi),%esi + pxor %xmm0,%xmm7 + pxor %xmm2,%xmm1 + pxor %xmm7,%xmm2 + movdqa %xmm1,%xmm6 movups (%edx),%xmm0 movups 16(%edx),%xmm1 - xorps %xmm0,%xmm7 leal 32(%edx),%edx - xorps %xmm7,%xmm2 -.L074enc1_loop_15: + xorps %xmm0,%xmm2 +.L075enc1_loop_15: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L074enc1_loop_15 + jnz .L075enc1_loop_15 .byte 102,15,56,221,209 - movl %ebx,%ecx - movl 
%ebp,%edx - movups %xmm2,(%edi) - leal 16(%edi),%edi - subl $16,%eax - jnc .L073cbc_enc_loop - addl $16,%eax - jnz .L072cbc_enc_tail - movaps %xmm2,%xmm7 - jmp .L075cbc_ret -.L072cbc_enc_tail: - movl %eax,%ecx -.long 2767451785 - movl $16,%ecx - subl %eax,%ecx - xorl %eax,%eax -.long 2868115081 - leal -16(%edi),%edi - movl %ebx,%ecx - movl %edi,%esi - movl %ebp,%edx - jmp .L073cbc_enc_loop -.align 16 -.L071cbc_decrypt: - cmpl $80,%eax - jbe .L076cbc_dec_tail - movaps %xmm7,(%esp) - subl $80,%eax - jmp .L077cbc_dec_loop6_enter -.align 16 -.L078cbc_dec_loop6: - movaps %xmm0,(%esp) - movups %xmm7,(%edi) + xorps %xmm7,%xmm2 + movdqa %xmm7,%xmm0 + movdqa %xmm6,%xmm1 + movups %xmm2,-16(%edi,%esi,1) + movl 240(%eax),%ecx + movl %eax,%edx + movl 124(%esp),%eax +.L074odd: + shll $4,%ecx + movl $16,%edi + subl %ecx,%edi + movl %edx,112(%esp) + leal 32(%edx,%ecx,1),%edx + movl %edi,116(%esp) + cmpl %eax,%esi + ja .L076short + jmp .L077grandloop +.align 32 +.L077grandloop: + leal 1(%ebp),%ecx + leal 3(%ebp),%eax + leal 5(%ebp),%edi + addl $6,%ebp + bsfl %ecx,%ecx + bsfl %eax,%eax + bsfl %edi,%edi + shll $4,%ecx + shll $4,%eax + shll $4,%edi + movdqu (%ebx),%xmm2 + movdqu (%ebx,%ecx,1),%xmm3 + movl 116(%esp),%ecx + movdqa %xmm2,%xmm4 + movdqu (%ebx,%eax,1),%xmm5 + movdqa %xmm2,%xmm6 + movdqu (%ebx,%edi,1),%xmm7 + pxor %xmm0,%xmm2 + pxor %xmm2,%xmm3 + movdqa %xmm2,(%esp) + pxor %xmm3,%xmm4 + movdqa %xmm3,16(%esp) + pxor %xmm4,%xmm5 + movdqa %xmm4,32(%esp) + pxor %xmm5,%xmm6 + movdqa %xmm5,48(%esp) + pxor %xmm6,%xmm7 + movdqa %xmm6,64(%esp) + movdqa %xmm7,80(%esp) + movups -48(%edx,%ecx,1),%xmm0 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + movdqu 80(%esi),%xmm7 + leal 96(%esi),%esi + pxor %xmm2,%xmm1 + pxor %xmm0,%xmm2 + pxor %xmm3,%xmm1 + pxor %xmm0,%xmm3 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + pxor %xmm5,%xmm1 + pxor %xmm0,%xmm5 + pxor %xmm6,%xmm1 + pxor %xmm0,%xmm6 + pxor %xmm7,%xmm1 + pxor %xmm0,%xmm7 + movdqa %xmm1,96(%esp) + movups -32(%edx,%ecx,1),%xmm1 + pxor (%esp),%xmm2 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 + pxor 80(%esp),%xmm7 + movups -16(%edx,%ecx,1),%xmm0 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movl 120(%esp),%edi + movl 124(%esp),%eax + call .L_aesni_encrypt6_enter + movdqa 80(%esp),%xmm0 + pxor (%esp),%xmm2 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 + pxor %xmm0,%xmm7 + movdqa 96(%esp),%xmm1 + movdqu %xmm2,-96(%edi,%esi,1) + movdqu %xmm3,-80(%edi,%esi,1) + movdqu %xmm4,-64(%edi,%esi,1) + movdqu %xmm5,-48(%edi,%esi,1) + movdqu %xmm6,-32(%edi,%esi,1) + movdqu %xmm7,-16(%edi,%esi,1) + cmpl %eax,%esi + jb .L077grandloop +.L076short: + addl $96,%eax + subl %esi,%eax + jz .L078done + cmpl $32,%eax + jb .L079one + je .L080two + cmpl $64,%eax + jb .L081three + je .L082four + leal 1(%ebp),%ecx + leal 3(%ebp),%eax + bsfl %ecx,%ecx + bsfl %eax,%eax + shll $4,%ecx + shll $4,%eax + movdqu (%ebx),%xmm2 + movdqu (%ebx,%ecx,1),%xmm3 + movl 116(%esp),%ecx + movdqa %xmm2,%xmm4 + movdqu (%ebx,%eax,1),%xmm5 + movdqa %xmm2,%xmm6 + pxor %xmm0,%xmm2 + pxor %xmm2,%xmm3 + movdqa %xmm2,(%esp) + pxor %xmm3,%xmm4 + movdqa %xmm3,16(%esp) + pxor %xmm4,%xmm5 + movdqa %xmm4,32(%esp) + pxor %xmm5,%xmm6 + movdqa %xmm5,48(%esp) + pxor %xmm6,%xmm7 + movdqa %xmm6,64(%esp) + movups -48(%edx,%ecx,1),%xmm0 + movdqu (%esi),%xmm2 + movdqu 
16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm2,%xmm1 + pxor %xmm0,%xmm2 + pxor %xmm3,%xmm1 + pxor %xmm0,%xmm3 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + pxor %xmm5,%xmm1 + pxor %xmm0,%xmm5 + pxor %xmm6,%xmm1 + pxor %xmm0,%xmm6 + movdqa %xmm1,96(%esp) + movups -32(%edx,%ecx,1),%xmm1 + pxor (%esp),%xmm2 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 + movups -16(%edx,%ecx,1),%xmm0 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movl 120(%esp),%edi + call .L_aesni_encrypt6_enter + movdqa 64(%esp),%xmm0 + pxor (%esp),%xmm2 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor %xmm0,%xmm6 + movdqa 96(%esp),%xmm1 + movdqu %xmm2,(%edi,%esi,1) + movdqu %xmm3,16(%edi,%esi,1) + movdqu %xmm4,32(%edi,%esi,1) + movdqu %xmm5,48(%edi,%esi,1) + movdqu %xmm6,64(%edi,%esi,1) + jmp .L078done +.align 16 +.L079one: + movdqu (%ebx),%xmm7 + movl 112(%esp),%edx + movdqu (%esi),%xmm2 + movl 240(%edx),%ecx + pxor %xmm0,%xmm7 + pxor %xmm2,%xmm1 + pxor %xmm7,%xmm2 + movdqa %xmm1,%xmm6 + movl 120(%esp),%edi + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +.L083enc1_loop_16: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L083enc1_loop_16 +.byte 102,15,56,221,209 + xorps %xmm7,%xmm2 + movdqa %xmm7,%xmm0 + movdqa %xmm6,%xmm1 + movups %xmm2,(%edi,%esi,1) + jmp .L078done +.align 16 +.L080two: + leal 1(%ebp),%ecx + movl 112(%esp),%edx + bsfl %ecx,%ecx + shll $4,%ecx + movdqu (%ebx),%xmm6 + movdqu (%ebx,%ecx,1),%xmm7 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movl 240(%edx),%ecx + pxor %xmm0,%xmm6 + pxor %xmm6,%xmm7 + pxor %xmm2,%xmm1 + pxor %xmm6,%xmm2 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm3 + movdqa %xmm1,%xmm5 + movl 120(%esp),%edi + call __aesni_encrypt2 + xorps %xmm6,%xmm2 + xorps %xmm7,%xmm3 + movdqa %xmm7,%xmm0 + movdqa %xmm5,%xmm1 + movups %xmm2,(%edi,%esi,1) + movups %xmm3,16(%edi,%esi,1) + jmp .L078done +.align 16 +.L081three: + leal 1(%ebp),%ecx + movl 112(%esp),%edx + bsfl %ecx,%ecx + shll $4,%ecx + movdqu (%ebx),%xmm5 + movdqu (%ebx,%ecx,1),%xmm6 + movdqa %xmm5,%xmm7 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movl 240(%edx),%ecx + pxor %xmm0,%xmm5 + pxor %xmm5,%xmm6 + pxor %xmm6,%xmm7 + pxor %xmm2,%xmm1 + pxor %xmm5,%xmm2 + pxor %xmm3,%xmm1 + pxor %xmm6,%xmm3 + pxor %xmm4,%xmm1 + pxor %xmm7,%xmm4 + movdqa %xmm1,96(%esp) + movl 120(%esp),%edi + call __aesni_encrypt3 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm4 + movdqa %xmm7,%xmm0 + movdqa 96(%esp),%xmm1 + movups %xmm2,(%edi,%esi,1) + movups %xmm3,16(%edi,%esi,1) + movups %xmm4,32(%edi,%esi,1) + jmp .L078done +.align 16 +.L082four: + leal 1(%ebp),%ecx + leal 3(%ebp),%eax + bsfl %ecx,%ecx + bsfl %eax,%eax + movl 112(%esp),%edx + shll $4,%ecx + shll $4,%eax + movdqu (%ebx),%xmm4 + movdqu (%ebx,%ecx,1),%xmm5 + movdqa %xmm4,%xmm6 + movdqu (%ebx,%eax,1),%xmm7 + pxor %xmm0,%xmm4 + movdqu (%esi),%xmm2 + pxor %xmm4,%xmm5 + movdqu 16(%esi),%xmm3 + pxor %xmm5,%xmm6 + movdqa %xmm4,(%esp) + pxor %xmm6,%xmm7 + movdqa %xmm5,16(%esp) + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movl 240(%edx),%ecx + pxor %xmm2,%xmm1 + pxor (%esp),%xmm2 + pxor %xmm3,%xmm1 + pxor 16(%esp),%xmm3 + pxor %xmm4,%xmm1 + pxor %xmm6,%xmm4 + pxor %xmm5,%xmm1 + pxor %xmm7,%xmm5 + movdqa %xmm1,96(%esp) + movl 120(%esp),%edi + call __aesni_encrypt4 + 
xorps (%esp),%xmm2 + xorps 16(%esp),%xmm3 + xorps %xmm6,%xmm4 + movups %xmm2,(%edi,%esi,1) + xorps %xmm7,%xmm5 + movups %xmm3,16(%edi,%esi,1) + movdqa %xmm7,%xmm0 + movups %xmm4,32(%edi,%esi,1) + movdqa 96(%esp),%xmm1 + movups %xmm5,48(%edi,%esi,1) +.L078done: + movl 128(%esp),%edx + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + movdqa %xmm2,(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm2,16(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm2,32(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm2,48(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm2,64(%esp) + movdqa %xmm2,80(%esp) + movdqa %xmm2,96(%esp) + leal (%edx),%esp + movl 40(%esp),%ecx + movl 48(%esp),%ebx + movdqu %xmm0,(%ecx) + pxor %xmm0,%xmm0 + movdqu %xmm1,(%ebx) + pxor %xmm1,%xmm1 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aesni_ocb_decrypt +.def _aesni_ocb_decrypt; .scl 2; .type 32; .endef +.align 16 +_aesni_ocb_decrypt: +.L_aesni_ocb_decrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 40(%esp),%ecx + movl 48(%esp),%ebx + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movdqu (%ecx),%xmm0 + movl 36(%esp),%ebp + movdqu (%ebx),%xmm1 + movl 44(%esp),%ebx + movl %esp,%ecx + subl $132,%esp + andl $-16,%esp + subl %esi,%edi + shll $4,%eax + leal -96(%esi,%eax,1),%eax + movl %edi,120(%esp) + movl %eax,124(%esp) + movl %ecx,128(%esp) + movl 240(%edx),%ecx + testl $1,%ebp + jnz .L084odd + bsfl %ebp,%eax + addl $1,%ebp + shll $4,%eax + movdqu (%ebx,%eax,1),%xmm7 + movl %edx,%eax + movdqu (%esi),%xmm2 + leal 16(%esi),%esi + pxor %xmm0,%xmm7 + pxor %xmm7,%xmm2 + movdqa %xmm1,%xmm6 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +.L085dec1_loop_17: +.byte 102,15,56,222,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L085dec1_loop_17 +.byte 102,15,56,223,209 + xorps %xmm7,%xmm2 + movaps %xmm6,%xmm1 + movdqa %xmm7,%xmm0 + xorps %xmm2,%xmm1 + movups %xmm2,-16(%edi,%esi,1) + movl 240(%eax),%ecx + movl %eax,%edx + movl 124(%esp),%eax +.L084odd: + shll $4,%ecx + movl $16,%edi + subl %ecx,%edi + movl %edx,112(%esp) + leal 32(%edx,%ecx,1),%edx + movl %edi,116(%esp) + cmpl %eax,%esi + ja .L086short + jmp .L087grandloop +.align 32 +.L087grandloop: + leal 1(%ebp),%ecx + leal 3(%ebp),%eax + leal 5(%ebp),%edi + addl $6,%ebp + bsfl %ecx,%ecx + bsfl %eax,%eax + bsfl %edi,%edi + shll $4,%ecx + shll $4,%eax + shll $4,%edi + movdqu (%ebx),%xmm2 + movdqu (%ebx,%ecx,1),%xmm3 + movl 116(%esp),%ecx + movdqa %xmm2,%xmm4 + movdqu (%ebx,%eax,1),%xmm5 + movdqa %xmm2,%xmm6 + movdqu (%ebx,%edi,1),%xmm7 + pxor %xmm0,%xmm2 + pxor %xmm2,%xmm3 + movdqa %xmm2,(%esp) + pxor %xmm3,%xmm4 + movdqa %xmm3,16(%esp) + pxor %xmm4,%xmm5 + movdqa %xmm4,32(%esp) + pxor %xmm5,%xmm6 + movdqa %xmm5,48(%esp) + pxor %xmm6,%xmm7 + movdqa %xmm6,64(%esp) + movdqa %xmm7,80(%esp) + movups -48(%edx,%ecx,1),%xmm0 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + movdqu 80(%esi),%xmm7 + leal 96(%esi),%esi + movdqa %xmm1,96(%esp) + pxor %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 + movups -32(%edx,%ecx,1),%xmm1 + pxor (%esp),%xmm2 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 + pxor 80(%esp),%xmm7 + movups -16(%edx,%ecx,1),%xmm0 +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movl 120(%esp),%edi + movl 124(%esp),%eax + call 
.L_aesni_decrypt6_enter + movdqa 80(%esp),%xmm0 + pxor (%esp),%xmm2 + movdqa 96(%esp),%xmm1 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 + pxor %xmm0,%xmm7 + pxor %xmm2,%xmm1 + movdqu %xmm2,-96(%edi,%esi,1) + pxor %xmm3,%xmm1 + movdqu %xmm3,-80(%edi,%esi,1) + pxor %xmm4,%xmm1 + movdqu %xmm4,-64(%edi,%esi,1) + pxor %xmm5,%xmm1 + movdqu %xmm5,-48(%edi,%esi,1) + pxor %xmm6,%xmm1 + movdqu %xmm6,-32(%edi,%esi,1) + pxor %xmm7,%xmm1 + movdqu %xmm7,-16(%edi,%esi,1) + cmpl %eax,%esi + jb .L087grandloop +.L086short: + addl $96,%eax + subl %esi,%eax + jz .L088done + cmpl $32,%eax + jb .L089one + je .L090two + cmpl $64,%eax + jb .L091three + je .L092four + leal 1(%ebp),%ecx + leal 3(%ebp),%eax + bsfl %ecx,%ecx + bsfl %eax,%eax + shll $4,%ecx + shll $4,%eax + movdqu (%ebx),%xmm2 + movdqu (%ebx,%ecx,1),%xmm3 + movl 116(%esp),%ecx + movdqa %xmm2,%xmm4 + movdqu (%ebx,%eax,1),%xmm5 + movdqa %xmm2,%xmm6 + pxor %xmm0,%xmm2 + pxor %xmm2,%xmm3 + movdqa %xmm2,(%esp) + pxor %xmm3,%xmm4 + movdqa %xmm3,16(%esp) + pxor %xmm4,%xmm5 + movdqa %xmm4,32(%esp) + pxor %xmm5,%xmm6 + movdqa %xmm5,48(%esp) + pxor %xmm6,%xmm7 + movdqa %xmm6,64(%esp) + movups -48(%edx,%ecx,1),%xmm0 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + pxor %xmm7,%xmm7 + movdqa %xmm1,96(%esp) + pxor %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + movups -32(%edx,%ecx,1),%xmm1 + pxor (%esp),%xmm2 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 + movups -16(%edx,%ecx,1),%xmm0 +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movl 120(%esp),%edi + call .L_aesni_decrypt6_enter + movdqa 64(%esp),%xmm0 + pxor (%esp),%xmm2 + movdqa 96(%esp),%xmm1 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor %xmm0,%xmm6 + pxor %xmm2,%xmm1 + movdqu %xmm2,(%edi,%esi,1) + pxor %xmm3,%xmm1 + movdqu %xmm3,16(%edi,%esi,1) + pxor %xmm4,%xmm1 + movdqu %xmm4,32(%edi,%esi,1) + pxor %xmm5,%xmm1 + movdqu %xmm5,48(%edi,%esi,1) + pxor %xmm6,%xmm1 + movdqu %xmm6,64(%edi,%esi,1) + jmp .L088done +.align 16 +.L089one: + movdqu (%ebx),%xmm7 + movl 112(%esp),%edx + movdqu (%esi),%xmm2 + movl 240(%edx),%ecx + pxor %xmm0,%xmm7 + pxor %xmm7,%xmm2 + movdqa %xmm1,%xmm6 + movl 120(%esp),%edi + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +.L093dec1_loop_18: +.byte 102,15,56,222,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L093dec1_loop_18 +.byte 102,15,56,223,209 + xorps %xmm7,%xmm2 + movaps %xmm6,%xmm1 + movdqa %xmm7,%xmm0 + xorps %xmm2,%xmm1 + movups %xmm2,(%edi,%esi,1) + jmp .L088done +.align 16 +.L090two: + leal 1(%ebp),%ecx + movl 112(%esp),%edx + bsfl %ecx,%ecx + shll $4,%ecx + movdqu (%ebx),%xmm6 + movdqu (%ebx,%ecx,1),%xmm7 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movl 240(%edx),%ecx + movdqa %xmm1,%xmm5 + pxor %xmm0,%xmm6 + pxor %xmm6,%xmm7 + pxor %xmm6,%xmm2 + pxor %xmm7,%xmm3 + movl 120(%esp),%edi + call __aesni_decrypt2 + xorps %xmm6,%xmm2 + xorps %xmm7,%xmm3 + movdqa %xmm7,%xmm0 + xorps %xmm2,%xmm5 + movups %xmm2,(%edi,%esi,1) + xorps %xmm3,%xmm5 + movups %xmm3,16(%edi,%esi,1) + movaps %xmm5,%xmm1 + jmp .L088done +.align 16 +.L091three: + leal 1(%ebp),%ecx + movl 112(%esp),%edx + bsfl %ecx,%ecx + shll $4,%ecx + movdqu (%ebx),%xmm5 + movdqu (%ebx,%ecx,1),%xmm6 + movdqa %xmm5,%xmm7 + movdqu 
(%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movl 240(%edx),%ecx + movdqa %xmm1,96(%esp) + pxor %xmm0,%xmm5 + pxor %xmm5,%xmm6 + pxor %xmm6,%xmm7 + pxor %xmm5,%xmm2 + pxor %xmm6,%xmm3 + pxor %xmm7,%xmm4 + movl 120(%esp),%edi + call __aesni_decrypt3 + movdqa 96(%esp),%xmm1 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm4 + movups %xmm2,(%edi,%esi,1) + pxor %xmm2,%xmm1 + movdqa %xmm7,%xmm0 + movups %xmm3,16(%edi,%esi,1) + pxor %xmm3,%xmm1 + movups %xmm4,32(%edi,%esi,1) + pxor %xmm4,%xmm1 + jmp .L088done +.align 16 +.L092four: + leal 1(%ebp),%ecx + leal 3(%ebp),%eax + bsfl %ecx,%ecx + bsfl %eax,%eax + movl 112(%esp),%edx + shll $4,%ecx + shll $4,%eax + movdqu (%ebx),%xmm4 + movdqu (%ebx,%ecx,1),%xmm5 + movdqa %xmm4,%xmm6 + movdqu (%ebx,%eax,1),%xmm7 + pxor %xmm0,%xmm4 + movdqu (%esi),%xmm2 + pxor %xmm4,%xmm5 + movdqu 16(%esi),%xmm3 + pxor %xmm5,%xmm6 + movdqa %xmm4,(%esp) + pxor %xmm6,%xmm7 + movdqa %xmm5,16(%esp) + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movl 240(%edx),%ecx + movdqa %xmm1,96(%esp) + pxor (%esp),%xmm2 + pxor 16(%esp),%xmm3 + pxor %xmm6,%xmm4 + pxor %xmm7,%xmm5 + movl 120(%esp),%edi + call __aesni_decrypt4 + movdqa 96(%esp),%xmm1 + xorps (%esp),%xmm2 + xorps 16(%esp),%xmm3 + xorps %xmm6,%xmm4 + movups %xmm2,(%edi,%esi,1) + pxor %xmm2,%xmm1 + xorps %xmm7,%xmm5 + movups %xmm3,16(%edi,%esi,1) + pxor %xmm3,%xmm1 + movdqa %xmm7,%xmm0 + movups %xmm4,32(%edi,%esi,1) + pxor %xmm4,%xmm1 + movups %xmm5,48(%edi,%esi,1) + pxor %xmm5,%xmm1 +.L088done: + movl 128(%esp),%edx + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + movdqa %xmm2,(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm2,16(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm2,32(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm2,48(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm2,64(%esp) + movdqa %xmm2,80(%esp) + movdqa %xmm2,96(%esp) + leal (%edx),%esp + movl 40(%esp),%ecx + movl 48(%esp),%ebx + movdqu %xmm0,(%ecx) + pxor %xmm0,%xmm0 + movdqu %xmm1,(%ebx) + pxor %xmm1,%xmm1 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aesni_cbc_encrypt +.def _aesni_cbc_encrypt; .scl 2; .type 32; .endef +.align 16 +_aesni_cbc_encrypt: +.L_aesni_cbc_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl %esp,%ebx + movl 24(%esp),%edi + subl $24,%ebx + movl 28(%esp),%eax + andl $-16,%ebx + movl 32(%esp),%edx + movl 36(%esp),%ebp + testl %eax,%eax + jz .L094cbc_abort + cmpl $0,40(%esp) + xchgl %esp,%ebx + movups (%ebp),%xmm7 + movl 240(%edx),%ecx + movl %edx,%ebp + movl %ebx,16(%esp) + movl %ecx,%ebx + je .L095cbc_decrypt + movaps %xmm7,%xmm2 + cmpl $16,%eax + jb .L096cbc_enc_tail + subl $16,%eax + jmp .L097cbc_enc_loop +.align 16 +.L097cbc_enc_loop: + movups (%esi),%xmm7 + leal 16(%esi),%esi + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm7 + leal 32(%edx),%edx + xorps %xmm7,%xmm2 +.L098enc1_loop_19: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L098enc1_loop_19 +.byte 102,15,56,221,209 + movl %ebx,%ecx + movl %ebp,%edx + movups %xmm2,(%edi) leal 16(%edi),%edi -.L077cbc_dec_loop6_enter: + subl $16,%eax + jnc .L097cbc_enc_loop + addl $16,%eax + jnz .L096cbc_enc_tail + movaps %xmm2,%xmm7 + pxor %xmm2,%xmm2 + jmp .L099cbc_ret +.L096cbc_enc_tail: + movl %eax,%ecx +.long 2767451785 + movl $16,%ecx + subl %eax,%ecx + xorl %eax,%eax +.long 2868115081 + leal -16(%edi),%edi + movl %ebx,%ecx + movl %edi,%esi + movl %ebp,%edx + jmp .L097cbc_enc_loop +.align 16 +.L095cbc_decrypt: + cmpl $80,%eax + jbe .L100cbc_dec_tail + movaps %xmm7,(%esp) + subl $80,%eax + 
jmp .L101cbc_dec_loop6_enter +.align 16 +.L102cbc_dec_loop6: + movaps %xmm0,(%esp) + movups %xmm7,(%edi) + leal 16(%edi),%edi +.L101cbc_dec_loop6_enter: movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -1808,28 +2713,28 @@ _aesni_cbc_encrypt: movups %xmm6,64(%edi) leal 80(%edi),%edi subl $96,%eax - ja .L078cbc_dec_loop6 + ja .L102cbc_dec_loop6 movaps %xmm7,%xmm2 movaps %xmm0,%xmm7 addl $80,%eax - jle .L079cbc_dec_tail_collected + jle .L103cbc_dec_clear_tail_collected movups %xmm2,(%edi) leal 16(%edi),%edi -.L076cbc_dec_tail: +.L100cbc_dec_tail: movups (%esi),%xmm2 movaps %xmm2,%xmm6 cmpl $16,%eax - jbe .L080cbc_dec_one + jbe .L104cbc_dec_one movups 16(%esi),%xmm3 movaps %xmm3,%xmm5 cmpl $32,%eax - jbe .L081cbc_dec_two + jbe .L105cbc_dec_two movups 32(%esi),%xmm4 cmpl $48,%eax - jbe .L082cbc_dec_three + jbe .L106cbc_dec_three movups 48(%esi),%xmm5 cmpl $64,%eax - jbe .L083cbc_dec_four + jbe .L107cbc_dec_four movups 64(%esi),%xmm6 movaps %xmm7,(%esp) movups (%esi),%xmm2 @@ -1847,56 +2752,62 @@ _aesni_cbc_encrypt: xorps %xmm0,%xmm6 movups %xmm2,(%edi) movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 movups %xmm5,48(%edi) + pxor %xmm5,%xmm5 leal 64(%edi),%edi movaps %xmm6,%xmm2 + pxor %xmm6,%xmm6 subl $80,%eax - jmp .L079cbc_dec_tail_collected + jmp .L108cbc_dec_tail_collected .align 16 -.L080cbc_dec_one: +.L104cbc_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L084dec1_loop_16: +.L109dec1_loop_20: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L084dec1_loop_16 + jnz .L109dec1_loop_20 .byte 102,15,56,223,209 xorps %xmm7,%xmm2 movaps %xmm6,%xmm7 subl $16,%eax - jmp .L079cbc_dec_tail_collected + jmp .L108cbc_dec_tail_collected .align 16 -.L081cbc_dec_two: - xorps %xmm4,%xmm4 - call __aesni_decrypt3 +.L105cbc_dec_two: + call __aesni_decrypt2 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movaps %xmm3,%xmm2 + pxor %xmm3,%xmm3 leal 16(%edi),%edi movaps %xmm5,%xmm7 subl $32,%eax - jmp .L079cbc_dec_tail_collected + jmp .L108cbc_dec_tail_collected .align 16 -.L082cbc_dec_three: +.L106cbc_dec_three: call __aesni_decrypt3 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 xorps %xmm5,%xmm4 movups %xmm2,(%edi) movaps %xmm4,%xmm2 + pxor %xmm4,%xmm4 movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 leal 32(%edi),%edi movups 32(%esi),%xmm7 subl $48,%eax - jmp .L079cbc_dec_tail_collected + jmp .L108cbc_dec_tail_collected .align 16 -.L083cbc_dec_four: +.L107cbc_dec_four: call __aesni_decrypt4 movups 16(%esi),%xmm1 movups 32(%esi),%xmm0 @@ -1906,28 +2817,44 @@ _aesni_cbc_encrypt: movups %xmm2,(%edi) xorps %xmm1,%xmm4 movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 xorps %xmm0,%xmm5 movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 leal 48(%edi),%edi movaps %xmm5,%xmm2 + pxor %xmm5,%xmm5 subl $64,%eax -.L079cbc_dec_tail_collected: + jmp .L108cbc_dec_tail_collected +.align 16 +.L103cbc_dec_clear_tail_collected: + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 +.L108cbc_dec_tail_collected: andl $15,%eax - jnz .L085cbc_dec_tail_partial + jnz .L110cbc_dec_tail_partial movups %xmm2,(%edi) - jmp .L075cbc_ret + pxor %xmm0,%xmm0 + jmp .L099cbc_ret .align 16 -.L085cbc_dec_tail_partial: +.L110cbc_dec_tail_partial: movaps %xmm2,(%esp) + pxor %xmm0,%xmm0 movl $16,%ecx movl %esp,%esi subl %eax,%ecx .long 2767451785 -.L075cbc_ret: + movdqa %xmm2,(%esp) +.L099cbc_ret: movl 16(%esp),%esp movl 36(%esp),%ebp + pxor %xmm2,%xmm2 + pxor %xmm1,%xmm1 movups %xmm7,(%ebp) -.L070cbc_abort: + pxor %xmm7,%xmm7 
+.L094cbc_abort: popl %edi popl %esi popl %ebx @@ -1936,52 +2863,62 @@ _aesni_cbc_encrypt: .def __aesni_set_encrypt_key; .scl 3; .type 32; .endef .align 16 __aesni_set_encrypt_key: + pushl %ebp + pushl %ebx testl %eax,%eax - jz .L086bad_pointer + jz .L111bad_pointer testl %edx,%edx - jz .L086bad_pointer + jz .L111bad_pointer + call .L112pic +.L112pic: + popl %ebx + leal .Lkey_const-.L112pic(%ebx),%ebx + leal __gnutls_x86_cpuid_s,%ebp movups (%eax),%xmm0 xorps %xmm4,%xmm4 + movl 4(%ebp),%ebp leal 16(%edx),%edx + andl $268437504,%ebp cmpl $256,%ecx - je .L08714rounds + je .L11314rounds cmpl $192,%ecx - je .L08812rounds + je .L11412rounds cmpl $128,%ecx - jne .L089bad_keybits + jne .L115bad_keybits .align 16 -.L09010rounds: +.L11610rounds: + cmpl $268435456,%ebp + je .L11710rounds_alt movl $9,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,200,1 - call .L091key_128_cold + call .L118key_128_cold .byte 102,15,58,223,200,2 - call .L092key_128 + call .L119key_128 .byte 102,15,58,223,200,4 - call .L092key_128 + call .L119key_128 .byte 102,15,58,223,200,8 - call .L092key_128 + call .L119key_128 .byte 102,15,58,223,200,16 - call .L092key_128 + call .L119key_128 .byte 102,15,58,223,200,32 - call .L092key_128 + call .L119key_128 .byte 102,15,58,223,200,64 - call .L092key_128 + call .L119key_128 .byte 102,15,58,223,200,128 - call .L092key_128 + call .L119key_128 .byte 102,15,58,223,200,27 - call .L092key_128 + call .L119key_128 .byte 102,15,58,223,200,54 - call .L092key_128 + call .L119key_128 movups %xmm0,(%edx) movl %ecx,80(%edx) - xorl %eax,%eax - ret + jmp .L120good_key .align 16 -.L092key_128: +.L119key_128: movups %xmm0,(%edx) leal 16(%edx),%edx -.L091key_128_cold: +.L118key_128_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -1990,38 +2927,91 @@ __aesni_set_encrypt_key: xorps %xmm1,%xmm0 ret .align 16 -.L08812rounds: +.L11710rounds_alt: + movdqa (%ebx),%xmm5 + movl $8,%ecx + movdqa 32(%ebx),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,-16(%edx) +.L121loop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leal 16(%edx),%edx + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%edx) + movdqa %xmm0,%xmm2 + decl %ecx + jnz .L121loop_key128 + movdqa 48(%ebx),%xmm4 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%edx) + movl $9,%ecx + movl %ecx,96(%edx) + jmp .L120good_key +.align 16 +.L11412rounds: movq 16(%eax),%xmm2 + cmpl $268435456,%ebp + je .L12212rounds_alt movl $11,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,202,1 - call .L093key_192a_cold + call .L123key_192a_cold .byte 102,15,58,223,202,2 - call .L094key_192b + call .L124key_192b .byte 102,15,58,223,202,4 - call .L095key_192a + call .L125key_192a .byte 102,15,58,223,202,8 - call .L094key_192b + call .L124key_192b .byte 102,15,58,223,202,16 - call .L095key_192a + call .L125key_192a .byte 102,15,58,223,202,32 - call .L094key_192b + call .L124key_192b .byte 102,15,58,223,202,64 - call .L095key_192a + call .L125key_192a .byte 102,15,58,223,202,128 - call 
.L094key_192b + call .L124key_192b movups %xmm0,(%edx) movl %ecx,48(%edx) - xorl %eax,%eax - ret + jmp .L120good_key .align 16 -.L095key_192a: +.L125key_192a: movups %xmm0,(%edx) leal 16(%edx),%edx .align 16 -.L093key_192a_cold: +.L123key_192a_cold: movaps %xmm2,%xmm5 -.L096key_192b_warm: +.L126key_192b_warm: shufps $16,%xmm0,%xmm4 movdqa %xmm2,%xmm3 xorps %xmm4,%xmm0 @@ -2035,56 +3025,90 @@ __aesni_set_encrypt_key: pxor %xmm3,%xmm2 ret .align 16 -.L094key_192b: +.L124key_192b: movaps %xmm0,%xmm3 shufps $68,%xmm0,%xmm5 movups %xmm5,(%edx) shufps $78,%xmm2,%xmm3 movups %xmm3,16(%edx) leal 32(%edx),%edx - jmp .L096key_192b_warm + jmp .L126key_192b_warm +.align 16 +.L12212rounds_alt: + movdqa 16(%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $8,%ecx + movdqu %xmm0,-16(%edx) +.L127loop_key192: + movq %xmm2,(%edx) + movdqa %xmm2,%xmm1 +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + pslld $1,%xmm4 + leal 24(%edx),%edx + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm2 + movdqu %xmm0,-16(%edx) + decl %ecx + jnz .L127loop_key192 + movl $11,%ecx + movl %ecx,32(%edx) + jmp .L120good_key .align 16 -.L08714rounds: +.L11314rounds: movups 16(%eax),%xmm2 - movl $13,%ecx leal 16(%edx),%edx + cmpl $268435456,%ebp + je .L12814rounds_alt + movl $13,%ecx movups %xmm0,-32(%edx) movups %xmm2,-16(%edx) .byte 102,15,58,223,202,1 - call .L097key_256a_cold + call .L129key_256a_cold .byte 102,15,58,223,200,1 - call .L098key_256b + call .L130key_256b .byte 102,15,58,223,202,2 - call .L099key_256a + call .L131key_256a .byte 102,15,58,223,200,2 - call .L098key_256b + call .L130key_256b .byte 102,15,58,223,202,4 - call .L099key_256a + call .L131key_256a .byte 102,15,58,223,200,4 - call .L098key_256b + call .L130key_256b .byte 102,15,58,223,202,8 - call .L099key_256a + call .L131key_256a .byte 102,15,58,223,200,8 - call .L098key_256b + call .L130key_256b .byte 102,15,58,223,202,16 - call .L099key_256a + call .L131key_256a .byte 102,15,58,223,200,16 - call .L098key_256b + call .L130key_256b .byte 102,15,58,223,202,32 - call .L099key_256a + call .L131key_256a .byte 102,15,58,223,200,32 - call .L098key_256b + call .L130key_256b .byte 102,15,58,223,202,64 - call .L099key_256a + call .L131key_256a movups %xmm0,(%edx) movl %ecx,16(%edx) xorl %eax,%eax - ret + jmp .L120good_key .align 16 -.L099key_256a: +.L131key_256a: movups %xmm2,(%edx) leal 16(%edx),%edx -.L097key_256a_cold: +.L129key_256a_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2093,7 +3117,7 @@ __aesni_set_encrypt_key: xorps %xmm1,%xmm0 ret .align 16 -.L098key_256b: +.L130key_256b: movups %xmm0,(%edx) leal 16(%edx),%edx shufps $16,%xmm2,%xmm4 @@ -2103,13 +3127,70 @@ __aesni_set_encrypt_key: shufps $170,%xmm1,%xmm1 xorps %xmm1,%xmm2 ret +.align 16 +.L12814rounds_alt: + movdqa (%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $7,%ecx + movdqu %xmm0,-32(%edx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,-16(%edx) +.L132loop_key256: +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + decl %ecx + jz .L133done_key256 + pshufd $255,%xmm0,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,221,211 + movdqa %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 
+ pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%edx) + leal 32(%edx),%edx + movdqa %xmm2,%xmm1 + jmp .L132loop_key256 +.L133done_key256: + movl $13,%ecx + movl %ecx,16(%edx) +.L120good_key: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + xorl %eax,%eax + popl %ebx + popl %ebp + ret .align 4 -.L086bad_pointer: +.L111bad_pointer: movl $-1,%eax + popl %ebx + popl %ebp ret .align 4 -.L089bad_keybits: +.L115bad_keybits: + pxor %xmm0,%xmm0 movl $-2,%eax + popl %ebx + popl %ebp ret .globl _aesni_set_encrypt_key .def _aesni_set_encrypt_key; .scl 2; .type 32; .endef @@ -2133,7 +3214,7 @@ _aesni_set_decrypt_key: movl 12(%esp),%edx shll $4,%ecx testl %eax,%eax - jnz .L100dec_key_ret + jnz .L134dec_key_ret leal 16(%edx,%ecx,1),%eax movups (%edx),%xmm0 movups (%eax),%xmm1 @@ -2141,7 +3222,7 @@ _aesni_set_decrypt_key: movups %xmm1,(%edx) leal 16(%edx),%edx leal -16(%eax),%eax -.L101dec_key_inverse: +.L135dec_key_inverse: movups (%edx),%xmm0 movups (%eax),%xmm1 .byte 102,15,56,219,192 @@ -2151,15 +3232,24 @@ _aesni_set_decrypt_key: movups %xmm0,16(%eax) movups %xmm1,-16(%edx) cmpl %edx,%eax - ja .L101dec_key_inverse + ja .L135dec_key_inverse movups (%edx),%xmm0 .byte 102,15,56,219,192 movups %xmm0,(%edx) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 xorl %eax,%eax -.L100dec_key_ret: +.L134dec_key_ret: ret +.align 64 +.Lkey_const: +.long 202313229,202313229,202313229,202313229 +.long 67569157,67569157,67569157,67569157 +.long 1,1,1,1 +.long 27,27,27,27 .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 .byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 .byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 .byte 115,108,46,111,114,103,62,0 +.comm __gnutls_x86_cpuid_s,16 diff --git a/lib/accelerated/x86/coff/aesni-x86_64.s b/lib/accelerated/x86/coff/aesni-x86_64.s index 79ffbf70c7..4e8de065f2 100644 --- a/lib/accelerated/x86/coff/aesni-x86_64.s +++ b/lib/accelerated/x86/coff/aesni-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -43,6 +43,7 @@ .def aesni_encrypt; .scl 2; .type 32; .endef .p2align 4 aesni_encrypt: + movups (%rcx),%xmm2 movl 240(%r8),%eax movups (%r8),%xmm0 @@ -63,10 +64,12 @@ aesni_encrypt: .byte 0xf3,0xc3 + .globl aesni_decrypt .def aesni_decrypt; .scl 2; .type 32; .endef .p2align 4 aesni_decrypt: + movups (%rcx),%xmm2 movl 240(%r8),%eax movups (%r8),%xmm0 @@ -86,9 +89,11 @@ aesni_decrypt: pxor %xmm2,%xmm2 .byte 0xf3,0xc3 + .def _aesni_encrypt2; .scl 3; .type 32; .endef .p2align 4 _aesni_encrypt2: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -115,9 +120,11 @@ _aesni_encrypt2: .byte 102,15,56,221,216 .byte 0xf3,0xc3 + .def _aesni_decrypt2; .scl 3; .type 32; .endef .p2align 4 _aesni_decrypt2: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -144,9 +151,11 @@ _aesni_decrypt2: .byte 102,15,56,223,216 .byte 0xf3,0xc3 + .def _aesni_encrypt3; .scl 3; .type 32; .endef .p2align 4 _aesni_encrypt3: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -178,9 +187,11 @@ _aesni_encrypt3: .byte 102,15,56,221,224 .byte 0xf3,0xc3 + .def _aesni_decrypt3; .scl 3; .type 32; .endef .p2align 4 _aesni_decrypt3: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -212,9 +223,11 @@ _aesni_decrypt3: .byte 102,15,56,223,224 .byte 0xf3,0xc3 + .def _aesni_encrypt4; .scl 3; .type 32; .endef .p2align 4 _aesni_encrypt4: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -252,9 +265,11 @@ _aesni_encrypt4: .byte 102,15,56,221,232 .byte 0xf3,0xc3 + .def _aesni_decrypt4; .scl 3; .type 32; .endef .p2align 4 _aesni_decrypt4: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -292,9 +307,11 @@ _aesni_decrypt4: .byte 102,15,56,223,232 .byte 0xf3,0xc3 + .def _aesni_encrypt6; .scl 3; .type 32; .endef .p2align 4 _aesni_encrypt6: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -346,9 +363,11 @@ _aesni_encrypt6: .byte 102,15,56,221,248 .byte 0xf3,0xc3 + .def _aesni_decrypt6; .scl 3; .type 32; .endef .p2align 4 _aesni_decrypt6: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -400,9 +419,11 @@ _aesni_decrypt6: .byte 102,15,56,223,248 .byte 0xf3,0xc3 + .def _aesni_encrypt8; .scl 3; .type 32; .endef .p2align 4 _aesni_encrypt8: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -464,9 +485,11 @@ _aesni_encrypt8: .byte 102,68,15,56,221,200 .byte 0xf3,0xc3 + .def _aesni_decrypt8; .scl 3; .type 32; .endef .p2align 4 _aesni_decrypt8: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -528,6 +551,7 @@ _aesni_decrypt8: .byte 102,68,15,56,223,200 .byte 0xf3,0xc3 + .globl aesni_ecb_encrypt .def aesni_ecb_encrypt; .scl 2; .type 32; .endef .p2align 4 @@ -542,6 +566,7 @@ aesni_ecb_encrypt: movq %r9,%rcx movq 40(%rsp),%r8 + leaq -88(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,16(%rsp) @@ -897,6 +922,7 @@ aesni_ecb_encrypt: movq 8(%rsp),%rdi movq 16(%rsp),%rsi .byte 0xf3,0xc3 + .LSEH_end_aesni_ecb_encrypt: .globl aesni_ccm64_encrypt_blocks .def aesni_ccm64_encrypt_blocks; .scl 2; .type 32; .endef @@ -1130,6 +1156,7 @@ aesni_ctr32_encrypt_blocks: movq %r9,%rcx movq 40(%rsp),%r8 + cmpq $1,%rdx jne .Lctr32_bulk @@ -1159,22 +1186,23 @@ aesni_ctr32_encrypt_blocks: .p2align 4 .Lctr32_bulk: - leaq (%rsp),%rax + leaq (%rsp),%r11 + pushq %rbp + subq $288,%rsp andq $-16,%rsp - movaps %xmm6,-168(%rax) - movaps %xmm7,-152(%rax) - movaps %xmm8,-136(%rax) - movaps %xmm9,-120(%rax) - movaps %xmm10,-104(%rax) - movaps %xmm11,-88(%rax) - movaps %xmm12,-72(%rax) - movaps %xmm13,-56(%rax) - movaps 
%xmm14,-40(%rax) - movaps %xmm15,-24(%rax) + movaps %xmm6,-168(%r11) + movaps %xmm7,-152(%r11) + movaps %xmm8,-136(%r11) + movaps %xmm9,-120(%r11) + movaps %xmm10,-104(%r11) + movaps %xmm11,-88(%r11) + movaps %xmm12,-72(%r11) + movaps %xmm13,-56(%r11) + movaps %xmm14,-40(%r11) + movaps %xmm15,-24(%r11) .Lctr32_body: - leaq -8(%rax),%rbp @@ -1183,7 +1211,7 @@ aesni_ctr32_encrypt_blocks: movdqu (%rcx),%xmm0 movl 12(%r8),%r8d pxor %xmm0,%xmm2 - movl 12(%rcx),%r11d + movl 12(%rcx),%ebp movdqa %xmm2,0(%rsp) bswapl %r8d movdqa %xmm2,%xmm3 @@ -1199,8 +1227,8 @@ aesni_ctr32_encrypt_blocks: leaq 2(%r8),%rdx bswapl %eax bswapl %edx - xorl %r11d,%eax - xorl %r11d,%edx + xorl %ebp,%eax + xorl %ebp,%edx .byte 102,15,58,34,216,3 leaq 3(%r8),%rax movdqa %xmm3,16(%rsp) @@ -1209,25 +1237,25 @@ aesni_ctr32_encrypt_blocks: movq %r10,%rdx leaq 4(%r8),%r10 movdqa %xmm4,32(%rsp) - xorl %r11d,%eax + xorl %ebp,%eax bswapl %r10d .byte 102,15,58,34,232,3 - xorl %r11d,%r10d + xorl %ebp,%r10d movdqa %xmm5,48(%rsp) leaq 5(%r8),%r9 movl %r10d,64+12(%rsp) bswapl %r9d leaq 6(%r8),%r10 movl 240(%rcx),%eax - xorl %r11d,%r9d + xorl %ebp,%r9d bswapl %r10d movl %r9d,80+12(%rsp) - xorl %r11d,%r10d + xorl %ebp,%r10d leaq 7(%r8),%r9 movl %r10d,96+12(%rsp) bswapl %r9d movl _gnutls_x86_cpuid_s+4(%rip),%r10d - xorl %r11d,%r9d + xorl %ebp,%r9d andl $71303168,%r10d movl %r9d,112+12(%rsp) @@ -1251,7 +1279,7 @@ aesni_ctr32_encrypt_blocks: .Lctr32_6x: shll $4,%eax movl $48,%r10d - bswapl %r11d + bswapl %ebp leaq 32(%rcx,%rax,1),%rcx subq %rax,%r10 jmp .Lctr32_loop6 @@ -1262,32 +1290,32 @@ aesni_ctr32_encrypt_blocks: movups -48(%rcx,%r10,1),%xmm0 .byte 102,15,56,220,209 movl %r8d,%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,217 .byte 0x0f,0x38,0xf1,0x44,0x24,12 leal 1(%r8),%eax .byte 102,15,56,220,225 - xorl %r11d,%eax + xorl %ebp,%eax .byte 0x0f,0x38,0xf1,0x44,0x24,28 .byte 102,15,56,220,233 leal 2(%r8),%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,241 .byte 0x0f,0x38,0xf1,0x44,0x24,44 leal 3(%r8),%eax .byte 102,15,56,220,249 movups -32(%rcx,%r10,1),%xmm1 - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,208 .byte 0x0f,0x38,0xf1,0x44,0x24,60 leal 4(%r8),%eax .byte 102,15,56,220,216 - xorl %r11d,%eax + xorl %ebp,%eax .byte 0x0f,0x38,0xf1,0x44,0x24,76 .byte 102,15,56,220,224 leal 5(%r8),%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,232 .byte 0x0f,0x38,0xf1,0x44,0x24,92 movq %r10,%rax @@ -1348,7 +1376,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d movups 32-128(%rcx),%xmm0 .byte 102,15,56,220,225 - xorl %r11d,%r9d + xorl %ebp,%r9d nop .byte 102,15,56,220,233 movl %r9d,0+12(%rsp) @@ -1361,7 +1389,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1375,7 +1403,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1389,7 +1417,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1403,7 +1431,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1417,7 +1445,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,208 .byte 
102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1431,7 +1459,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1446,7 +1474,7 @@ aesni_ctr32_encrypt_blocks: .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 - xorl %r11d,%r9d + xorl %ebp,%r9d movdqu 0(%rdi),%xmm10 .byte 102,15,56,220,232 movl %r9d,112+12(%rsp) @@ -1681,32 +1709,32 @@ aesni_ctr32_encrypt_blocks: .Lctr32_done: xorps %xmm0,%xmm0 - xorl %r11d,%r11d + xorl %ebp,%ebp pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 - movaps -160(%rbp),%xmm6 - movaps %xmm0,-160(%rbp) - movaps -144(%rbp),%xmm7 - movaps %xmm0,-144(%rbp) - movaps -128(%rbp),%xmm8 - movaps %xmm0,-128(%rbp) - movaps -112(%rbp),%xmm9 - movaps %xmm0,-112(%rbp) - movaps -96(%rbp),%xmm10 - movaps %xmm0,-96(%rbp) - movaps -80(%rbp),%xmm11 - movaps %xmm0,-80(%rbp) - movaps -64(%rbp),%xmm12 - movaps %xmm0,-64(%rbp) - movaps -48(%rbp),%xmm13 - movaps %xmm0,-48(%rbp) - movaps -32(%rbp),%xmm14 - movaps %xmm0,-32(%rbp) - movaps -16(%rbp),%xmm15 - movaps %xmm0,-16(%rbp) + movaps -168(%r11),%xmm6 + movaps %xmm0,-168(%r11) + movaps -152(%r11),%xmm7 + movaps %xmm0,-152(%r11) + movaps -136(%r11),%xmm8 + movaps %xmm0,-136(%r11) + movaps -120(%r11),%xmm9 + movaps %xmm0,-120(%r11) + movaps -104(%r11),%xmm10 + movaps %xmm0,-104(%r11) + movaps -88(%r11),%xmm11 + movaps %xmm0,-88(%r11) + movaps -72(%r11),%xmm12 + movaps %xmm0,-72(%r11) + movaps -56(%r11),%xmm13 + movaps %xmm0,-56(%r11) + movaps -40(%r11),%xmm14 + movaps %xmm0,-40(%r11) + movaps -24(%r11),%xmm15 + movaps %xmm0,-24(%r11) movaps %xmm0,0(%rsp) movaps %xmm0,16(%rsp) movaps %xmm0,32(%rsp) @@ -1715,12 +1743,15 @@ aesni_ctr32_encrypt_blocks: movaps %xmm0,80(%rsp) movaps %xmm0,96(%rsp) movaps %xmm0,112(%rsp) - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + + leaq (%r11),%rsp + .Lctr32_epilogue: movq 8(%rsp),%rdi movq 16(%rsp),%rsi .byte 0xf3,0xc3 + .LSEH_end_aesni_ctr32_encrypt_blocks: .globl aesni_xts_encrypt .def aesni_xts_encrypt; .scl 2; .type 32; .endef @@ -1737,22 +1768,24 @@ aesni_xts_encrypt: movq 40(%rsp),%r8 movq 48(%rsp),%r9 - leaq (%rsp),%rax + + leaq (%rsp),%r11 + pushq %rbp + subq $272,%rsp andq $-16,%rsp - movaps %xmm6,-168(%rax) - movaps %xmm7,-152(%rax) - movaps %xmm8,-136(%rax) - movaps %xmm9,-120(%rax) - movaps %xmm10,-104(%rax) - movaps %xmm11,-88(%rax) - movaps %xmm12,-72(%rax) - movaps %xmm13,-56(%rax) - movaps %xmm14,-40(%rax) - movaps %xmm15,-24(%rax) + movaps %xmm6,-168(%r11) + movaps %xmm7,-152(%r11) + movaps %xmm8,-136(%r11) + movaps %xmm9,-120(%r11) + movaps %xmm10,-104(%r11) + movaps %xmm11,-88(%r11) + movaps %xmm12,-72(%r11) + movaps %xmm13,-56(%r11) + movaps %xmm14,-40(%r11) + movaps %xmm15,-24(%r11) .Lxts_enc_body: - leaq -8(%rax),%rbp movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -1768,7 +1801,7 @@ aesni_xts_encrypt: jnz .Loop_enc1_8 .byte 102,15,56,221,209 movups (%rcx),%xmm0 - movq %rcx,%r11 + movq %rcx,%rbp movl %r10d,%eax shll $4,%r10d movq %rdx,%r9 @@ -1824,9 +1857,9 @@ aesni_xts_encrypt: jc .Lxts_enc_short movl $16+96,%eax - leaq 32(%r11,%r10,1),%rcx + leaq 32(%rbp,%r10,1),%rcx subq %r10,%rax - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 movq %rax,%r10 leaq .Lxts_magic(%rip),%r8 jmp .Lxts_enc_grandloop @@ -1851,7 +1884,7 @@ aesni_xts_encrypt: movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 .byte 
102,15,56,220,233 - movups 32(%r11),%xmm0 + movups 32(%rbp),%xmm0 leaq 96(%rdi),%rdi pxor %xmm8,%xmm7 @@ -1860,7 +1893,7 @@ aesni_xts_encrypt: pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) .byte 102,15,56,220,249 - movups 48(%r11),%xmm1 + movups 48(%rbp),%xmm1 pxor %xmm9,%xmm12 .byte 102,15,56,220,208 @@ -1875,7 +1908,7 @@ aesni_xts_encrypt: movdqa %xmm14,64(%rsp) .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups 64(%r11),%xmm0 + movups 64(%rbp),%xmm0 movdqa %xmm8,80(%rsp) pshufd $0x5f,%xmm15,%xmm9 jmp .Lxts_enc_loop6 @@ -1907,7 +1940,7 @@ aesni_xts_encrypt: psrad $31,%xmm14 .byte 102,15,56,220,217 pand %xmm8,%xmm14 - movups (%r11),%xmm10 + movups (%rbp),%xmm10 .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 @@ -1975,10 +2008,10 @@ aesni_xts_encrypt: .byte 102,15,56,220,225 .byte 102,15,56,220,233 pxor %xmm0,%xmm15 - movups (%r11),%xmm0 + movups (%rbp),%xmm0 .byte 102,15,56,220,241 .byte 102,15,56,220,249 - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 pxor %xmm15,%xmm14 .byte 102,15,56,221,84,36,0 @@ -2005,7 +2038,7 @@ aesni_xts_encrypt: movl $16+96,%eax subl %r10d,%eax - movq %r11,%rcx + movq %rbp,%rcx shrl $4,%eax .Lxts_enc_short: @@ -2161,7 +2194,7 @@ aesni_xts_encrypt: jnz .Lxts_enc_steal subq %r9,%rsi - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups -16(%rsi),%xmm2 @@ -2187,26 +2220,26 @@ aesni_xts_encrypt: pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 - movaps -160(%rbp),%xmm6 - movaps %xmm0,-160(%rbp) - movaps -144(%rbp),%xmm7 - movaps %xmm0,-144(%rbp) - movaps -128(%rbp),%xmm8 - movaps %xmm0,-128(%rbp) - movaps -112(%rbp),%xmm9 - movaps %xmm0,-112(%rbp) - movaps -96(%rbp),%xmm10 - movaps %xmm0,-96(%rbp) - movaps -80(%rbp),%xmm11 - movaps %xmm0,-80(%rbp) - movaps -64(%rbp),%xmm12 - movaps %xmm0,-64(%rbp) - movaps -48(%rbp),%xmm13 - movaps %xmm0,-48(%rbp) - movaps -32(%rbp),%xmm14 - movaps %xmm0,-32(%rbp) - movaps -16(%rbp),%xmm15 - movaps %xmm0,-16(%rbp) + movaps -168(%r11),%xmm6 + movaps %xmm0,-168(%r11) + movaps -152(%r11),%xmm7 + movaps %xmm0,-152(%r11) + movaps -136(%r11),%xmm8 + movaps %xmm0,-136(%r11) + movaps -120(%r11),%xmm9 + movaps %xmm0,-120(%r11) + movaps -104(%r11),%xmm10 + movaps %xmm0,-104(%r11) + movaps -88(%r11),%xmm11 + movaps %xmm0,-88(%r11) + movaps -72(%r11),%xmm12 + movaps %xmm0,-72(%r11) + movaps -56(%r11),%xmm13 + movaps %xmm0,-56(%r11) + movaps -40(%r11),%xmm14 + movaps %xmm0,-40(%r11) + movaps -24(%r11),%xmm15 + movaps %xmm0,-24(%r11) movaps %xmm0,0(%rsp) movaps %xmm0,16(%rsp) movaps %xmm0,32(%rsp) @@ -2214,12 +2247,15 @@ aesni_xts_encrypt: movaps %xmm0,64(%rsp) movaps %xmm0,80(%rsp) movaps %xmm0,96(%rsp) - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + + leaq (%r11),%rsp + .Lxts_enc_epilogue: movq 8(%rsp),%rdi movq 16(%rsp),%rsi .byte 0xf3,0xc3 + .LSEH_end_aesni_xts_encrypt: .globl aesni_xts_decrypt .def aesni_xts_decrypt; .scl 2; .type 32; .endef @@ -2236,22 +2272,24 @@ aesni_xts_decrypt: movq 40(%rsp),%r8 movq 48(%rsp),%r9 - leaq (%rsp),%rax + + leaq (%rsp),%r11 + pushq %rbp + subq $272,%rsp andq $-16,%rsp - movaps %xmm6,-168(%rax) - movaps %xmm7,-152(%rax) - movaps %xmm8,-136(%rax) - movaps %xmm9,-120(%rax) - movaps %xmm10,-104(%rax) - movaps %xmm11,-88(%rax) - movaps %xmm12,-72(%rax) - movaps %xmm13,-56(%rax) - movaps %xmm14,-40(%rax) - movaps %xmm15,-24(%rax) + movaps %xmm6,-168(%r11) + movaps %xmm7,-152(%r11) + movaps %xmm8,-136(%r11) + movaps %xmm9,-120(%r11) + movaps %xmm10,-104(%r11) + movaps %xmm11,-88(%r11) + movaps %xmm12,-72(%r11) + movaps %xmm13,-56(%r11) + movaps %xmm14,-40(%r11) + movaps 
%xmm15,-24(%r11) .Lxts_dec_body: - leaq -8(%rax),%rbp movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -2273,7 +2311,7 @@ aesni_xts_decrypt: subq %rax,%rdx movups (%rcx),%xmm0 - movq %rcx,%r11 + movq %rcx,%rbp movl %r10d,%eax shll $4,%r10d movq %rdx,%r9 @@ -2329,9 +2367,9 @@ aesni_xts_decrypt: jc .Lxts_dec_short movl $16+96,%eax - leaq 32(%r11,%r10,1),%rcx + leaq 32(%rbp,%r10,1),%rcx subq %r10,%rax - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 movq %rax,%r10 leaq .Lxts_magic(%rip),%r8 jmp .Lxts_dec_grandloop @@ -2356,7 +2394,7 @@ aesni_xts_decrypt: movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 .byte 102,15,56,222,233 - movups 32(%r11),%xmm0 + movups 32(%rbp),%xmm0 leaq 96(%rdi),%rdi pxor %xmm8,%xmm7 @@ -2365,7 +2403,7 @@ aesni_xts_decrypt: pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) .byte 102,15,56,222,249 - movups 48(%r11),%xmm1 + movups 48(%rbp),%xmm1 pxor %xmm9,%xmm12 .byte 102,15,56,222,208 @@ -2380,7 +2418,7 @@ aesni_xts_decrypt: movdqa %xmm14,64(%rsp) .byte 102,15,56,222,240 .byte 102,15,56,222,248 - movups 64(%r11),%xmm0 + movups 64(%rbp),%xmm0 movdqa %xmm8,80(%rsp) pshufd $0x5f,%xmm15,%xmm9 jmp .Lxts_dec_loop6 @@ -2412,7 +2450,7 @@ aesni_xts_decrypt: psrad $31,%xmm14 .byte 102,15,56,222,217 pand %xmm8,%xmm14 - movups (%r11),%xmm10 + movups (%rbp),%xmm10 .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 @@ -2480,10 +2518,10 @@ aesni_xts_decrypt: .byte 102,15,56,222,225 .byte 102,15,56,222,233 pxor %xmm0,%xmm15 - movups (%r11),%xmm0 + movups (%rbp),%xmm0 .byte 102,15,56,222,241 .byte 102,15,56,222,249 - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 pxor %xmm15,%xmm14 .byte 102,15,56,223,84,36,0 @@ -2510,7 +2548,7 @@ aesni_xts_decrypt: movl $16+96,%eax subl %r10d,%eax - movq %r11,%rcx + movq %rbp,%rcx shrl $4,%eax .Lxts_dec_short: @@ -2667,7 +2705,7 @@ aesni_xts_decrypt: jz .Lxts_dec_ret .Lxts_dec_done2: movq %r9,%rdx - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups (%rdi),%xmm2 @@ -2697,7 +2735,7 @@ aesni_xts_decrypt: jnz .Lxts_dec_steal subq %r9,%rsi - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups (%rsi),%xmm2 @@ -2723,26 +2761,26 @@ aesni_xts_decrypt: pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 - movaps -160(%rbp),%xmm6 - movaps %xmm0,-160(%rbp) - movaps -144(%rbp),%xmm7 - movaps %xmm0,-144(%rbp) - movaps -128(%rbp),%xmm8 - movaps %xmm0,-128(%rbp) - movaps -112(%rbp),%xmm9 - movaps %xmm0,-112(%rbp) - movaps -96(%rbp),%xmm10 - movaps %xmm0,-96(%rbp) - movaps -80(%rbp),%xmm11 - movaps %xmm0,-80(%rbp) - movaps -64(%rbp),%xmm12 - movaps %xmm0,-64(%rbp) - movaps -48(%rbp),%xmm13 - movaps %xmm0,-48(%rbp) - movaps -32(%rbp),%xmm14 - movaps %xmm0,-32(%rbp) - movaps -16(%rbp),%xmm15 - movaps %xmm0,-16(%rbp) + movaps -168(%r11),%xmm6 + movaps %xmm0,-168(%r11) + movaps -152(%r11),%xmm7 + movaps %xmm0,-152(%r11) + movaps -136(%r11),%xmm8 + movaps %xmm0,-136(%r11) + movaps -120(%r11),%xmm9 + movaps %xmm0,-120(%r11) + movaps -104(%r11),%xmm10 + movaps %xmm0,-104(%r11) + movaps -88(%r11),%xmm11 + movaps %xmm0,-88(%r11) + movaps -72(%r11),%xmm12 + movaps %xmm0,-72(%r11) + movaps -56(%r11),%xmm13 + movaps %xmm0,-56(%r11) + movaps -40(%r11),%xmm14 + movaps %xmm0,-40(%r11) + movaps -24(%r11),%xmm15 + movaps %xmm0,-24(%r11) movaps %xmm0,0(%rsp) movaps %xmm0,16(%rsp) movaps %xmm0,32(%rsp) @@ -2750,21 +2788,24 @@ aesni_xts_decrypt: movaps %xmm0,64(%rsp) movaps %xmm0,80(%rsp) movaps %xmm0,96(%rsp) - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + + leaq (%r11),%rsp + .Lxts_dec_epilogue: movq 8(%rsp),%rdi movq 16(%rsp),%rsi .byte 0xf3,0xc3 + 
.LSEH_end_aesni_xts_decrypt: -.globl aesni_cbc_encrypt -.def aesni_cbc_encrypt; .scl 2; .type 32; .endef -.p2align 4 -aesni_cbc_encrypt: +.globl aesni_ocb_encrypt +.def aesni_ocb_encrypt; .scl 2; .type 32; .endef +.p2align 5 +aesni_ocb_encrypt: movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%rax -.LSEH_begin_aesni_cbc_encrypt: +.LSEH_begin_aesni_ocb_encrypt: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx @@ -2772,181 +2813,1096 @@ aesni_cbc_encrypt: movq 40(%rsp),%r8 movq 48(%rsp),%r9 - testq %rdx,%rdx - jz .Lcbc_ret + + leaq (%rsp),%rax + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + leaq -160(%rsp),%rsp + movaps %xmm6,0(%rsp) + movaps %xmm7,16(%rsp) + movaps %xmm8,32(%rsp) + movaps %xmm9,48(%rsp) + movaps %xmm10,64(%rsp) + movaps %xmm11,80(%rsp) + movaps %xmm12,96(%rsp) + movaps %xmm13,112(%rsp) + movaps %xmm14,128(%rsp) + movaps %xmm15,144(%rsp) +.Locb_enc_body: + movq 56(%rax),%rbx + movq 56+8(%rax),%rbp movl 240(%rcx),%r10d movq %rcx,%r11 - testl %r9d,%r9d - jz .Lcbc_decrypt + shll $4,%r10d + movups (%rcx),%xmm9 + movups 16(%rcx,%r10,1),%xmm1 - movups (%r8),%xmm2 - movl %r10d,%eax - cmpq $16,%rdx - jb .Lcbc_enc_tail - subq $16,%rdx - jmp .Lcbc_enc_loop -.p2align 4 -.Lcbc_enc_loop: - movups (%rdi),%xmm3 - leaq 16(%rdi),%rdi + movdqu (%r9),%xmm15 + pxor %xmm1,%xmm9 + pxor %xmm1,%xmm15 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm3 - leaq 32(%rcx),%rcx - xorps %xmm3,%xmm2 -.Loop_enc1_15: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_enc1_15 -.byte 102,15,56,221,209 - movl %r10d,%eax - movq %r11,%rcx - movups %xmm2,0(%rsi) - leaq 16(%rsi),%rsi - subq $16,%rdx - jnc .Lcbc_enc_loop - addq $16,%rdx - jnz .Lcbc_enc_tail - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movups %xmm2,(%r8) - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - jmp .Lcbc_ret + movl $16+32,%eax + leaq 32(%r11,%r10,1),%rcx + movups 16(%r11),%xmm1 + subq %r10,%rax + movq %rax,%r10 -.Lcbc_enc_tail: - movq %rdx,%rcx - xchgq %rdi,%rsi -.long 0x9066A4F3 - movl $16,%ecx - subq %rdx,%rcx - xorl %eax,%eax -.long 0x9066AAF3 - leaq -16(%rdi),%rdi - movl %r10d,%eax - movq %rdi,%rsi - movq %r11,%rcx - xorq %rdx,%rdx - jmp .Lcbc_enc_loop + movdqu (%rbx),%xmm10 + movdqu (%rbp),%xmm8 -.p2align 4 -.Lcbc_decrypt: - cmpq $16,%rdx - jne .Lcbc_decrypt_bulk + testq $1,%r8 + jnz .Locb_enc_odd + bsfq %r8,%r12 + addq $1,%r8 + shlq $4,%r12 + movdqu (%rbx,%r12,1),%xmm7 + movdqu (%rdi),%xmm2 + leaq 16(%rdi),%rdi + call __ocb_encrypt1 - movdqu (%rdi),%xmm2 - movdqu (%r8),%xmm3 - movdqa %xmm2,%xmm4 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_dec1_16: -.byte 102,15,56,222,209 - decl %r10d - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_dec1_16 -.byte 102,15,56,223,209 - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movdqu %xmm4,(%r8) - xorps %xmm3,%xmm2 - pxor %xmm3,%xmm3 + movdqa %xmm7,%xmm15 movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - jmp .Lcbc_ret -.p2align 4 -.Lcbc_decrypt_bulk: - leaq (%rsp),%rax - pushq %rbp - subq $176,%rsp - andq $-16,%rsp - movaps %xmm6,16(%rsp) - movaps %xmm7,32(%rsp) - movaps %xmm8,48(%rsp) - movaps %xmm9,64(%rsp) - movaps %xmm10,80(%rsp) - movaps %xmm11,96(%rsp) - movaps %xmm12,112(%rsp) - movaps %xmm13,128(%rsp) - movaps %xmm14,144(%rsp) - movaps %xmm15,160(%rsp) -.Lcbc_decrypt_body: - leaq -8(%rax),%rbp - movups (%r8),%xmm10 - movl %r10d,%eax - cmpq $0x50,%rdx - jbe .Lcbc_dec_tail + leaq 16(%rsi),%rsi + subq $1,%rdx + jz .Locb_enc_done + +.Locb_enc_odd: + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 
5(%r8),%r14 + leaq 6(%r8),%r8 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + shlq $4,%r12 + shlq $4,%r13 + shlq $4,%r14 - movups (%rcx),%xmm0 + subq $6,%rdx + jc .Locb_enc_short + jmp .Locb_enc_grandloop + +.p2align 5 +.Locb_enc_grandloop: movdqu 0(%rdi),%xmm2 movdqu 16(%rdi),%xmm3 - movdqa %xmm2,%xmm11 movdqu 32(%rdi),%xmm4 - movdqa %xmm3,%xmm12 movdqu 48(%rdi),%xmm5 - movdqa %xmm4,%xmm13 movdqu 64(%rdi),%xmm6 - movdqa %xmm5,%xmm14 movdqu 80(%rdi),%xmm7 - movdqa %xmm6,%xmm15 - movl _gnutls_x86_cpuid_s+4(%rip),%r9d - cmpq $0x70,%rdx - jbe .Lcbc_dec_six_or_seven + leaq 96(%rdi),%rdi - andl $71303168,%r9d - subq $0x50,%rdx - cmpl $4194304,%r9d - je .Lcbc_dec_loop6_enter - subq $0x20,%rdx - leaq 112(%rcx),%rcx - jmp .Lcbc_dec_loop8_enter -.p2align 4 -.Lcbc_dec_loop8: - movups %xmm9,(%rsi) - leaq 16(%rsi),%rsi -.Lcbc_dec_loop8_enter: - movdqu 96(%rdi),%xmm8 - pxor %xmm0,%xmm2 - movdqu 112(%rdi),%xmm9 - pxor %xmm0,%xmm3 - movups 16-112(%rcx),%xmm1 - pxor %xmm0,%xmm4 - xorq %r11,%r11 - cmpq $0x70,%rdx - pxor %xmm0,%xmm5 - pxor %xmm0,%xmm6 - pxor %xmm0,%xmm7 - pxor %xmm0,%xmm8 + call __ocb_encrypt6 -.byte 102,15,56,222,209 - pxor %xmm0,%xmm9 - movups 32-112(%rcx),%xmm0 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 - setnc %r11b - shlq $7,%r11 -.byte 102,68,15,56,222,201 - addq %rdi,%r11 - movups 48-112(%rcx),%xmm1 -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 -.byte 102,68,15,56,222,192 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + movups %xmm7,80(%rsi) + leaq 96(%rsi),%rsi + subq $6,%rdx + jnc .Locb_enc_grandloop + +.Locb_enc_short: + addq $6,%rdx + jz .Locb_enc_done + + movdqu 0(%rdi),%xmm2 + cmpq $2,%rdx + jb .Locb_enc_one + movdqu 16(%rdi),%xmm3 + je .Locb_enc_two + + movdqu 32(%rdi),%xmm4 + cmpq $4,%rdx + jb .Locb_enc_three + movdqu 48(%rdi),%xmm5 + je .Locb_enc_four + + movdqu 64(%rdi),%xmm6 + pxor %xmm7,%xmm7 + + call __ocb_encrypt6 + + movdqa %xmm14,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + + jmp .Locb_enc_done + +.p2align 4 +.Locb_enc_one: + movdqa %xmm10,%xmm7 + + call __ocb_encrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,0(%rsi) + jmp .Locb_enc_done + +.p2align 4 +.Locb_enc_two: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + call __ocb_encrypt4 + + movdqa %xmm11,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + + jmp .Locb_enc_done + +.p2align 4 +.Locb_enc_three: + pxor %xmm5,%xmm5 + + call __ocb_encrypt4 + + movdqa %xmm12,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + + jmp .Locb_enc_done + +.p2align 4 +.Locb_enc_four: + call __ocb_encrypt4 + + movdqa %xmm13,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + +.Locb_enc_done: + pxor %xmm0,%xmm15 + movdqu %xmm8,(%rbp) + movdqu %xmm15,(%r9) + + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movaps 0(%rsp),%xmm6 + movaps %xmm0,0(%rsp) + movaps 16(%rsp),%xmm7 + movaps %xmm0,16(%rsp) + movaps 32(%rsp),%xmm8 + movaps %xmm0,32(%rsp) + movaps 48(%rsp),%xmm9 + movaps %xmm0,48(%rsp) + movaps 64(%rsp),%xmm10 + movaps %xmm0,64(%rsp) + movaps 80(%rsp),%xmm11 + movaps %xmm0,80(%rsp) + movaps 96(%rsp),%xmm12 + movaps 
%xmm0,96(%rsp) + movaps 112(%rsp),%xmm13 + movaps %xmm0,112(%rsp) + movaps 128(%rsp),%xmm14 + movaps %xmm0,128(%rsp) + movaps 144(%rsp),%xmm15 + movaps %xmm0,144(%rsp) + leaq 160+40(%rsp),%rax +.Locb_enc_pop: + movq -40(%rax),%r14 + + movq -32(%rax),%r13 + + movq -24(%rax),%r12 + + movq -16(%rax),%rbp + + movq -8(%rax),%rbx + + leaq (%rax),%rsp + +.Locb_enc_epilogue: + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 + +.LSEH_end_aesni_ocb_encrypt: + +.def __ocb_encrypt6; .scl 3; .type 32; .endef +.p2align 5 +__ocb_encrypt6: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + movdqa %xmm10,%xmm14 + pxor %xmm15,%xmm10 + movdqu (%rbx,%r14,1),%xmm15 + pxor %xmm10,%xmm11 + pxor %xmm2,%xmm8 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm3,%xmm8 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm4,%xmm8 + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm14 + pxor %xmm5,%xmm8 + pxor %xmm13,%xmm5 + pxor %xmm14,%xmm15 + pxor %xmm6,%xmm8 + pxor %xmm14,%xmm6 + pxor %xmm7,%xmm8 + pxor %xmm15,%xmm7 + movups 32(%r11),%xmm0 + + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + addq $6,%r8 + pxor %xmm9,%xmm10 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 +.byte 102,15,56,220,241 + pxor %xmm9,%xmm13 + pxor %xmm9,%xmm14 +.byte 102,15,56,220,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm15 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups 64(%r11),%xmm0 + shlq $4,%r12 + shlq $4,%r13 + jmp .Locb_enc_loop6 + +.p2align 5 +.Locb_enc_loop6: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_enc_loop6 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups 16(%r11),%xmm1 + shlq $4,%r14 + +.byte 102,65,15,56,221,210 + movdqu (%rbx),%xmm10 + movq %r10,%rax +.byte 102,65,15,56,221,219 +.byte 102,65,15,56,221,228 +.byte 102,65,15,56,221,237 +.byte 102,65,15,56,221,246 +.byte 102,65,15,56,221,255 + .byte 0xf3,0xc3 + + +.def __ocb_encrypt4; .scl 3; .type 32; .endef +.p2align 5 +__ocb_encrypt4: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + pxor %xmm15,%xmm10 + pxor %xmm10,%xmm11 + pxor %xmm2,%xmm8 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm3,%xmm8 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm4,%xmm8 + pxor %xmm12,%xmm4 + pxor %xmm5,%xmm8 + pxor %xmm13,%xmm5 + movups 32(%r11),%xmm0 + + pxor %xmm9,%xmm10 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 + pxor %xmm9,%xmm13 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 48(%r11),%xmm1 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups 64(%r11),%xmm0 + jmp .Locb_enc_loop4 + +.p2align 5 +.Locb_enc_loop4: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%rcx,%rax,1),%xmm1 + addq 
$32,%rax + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_enc_loop4 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,65,15,56,221,210 +.byte 102,65,15,56,221,219 +.byte 102,65,15,56,221,228 +.byte 102,65,15,56,221,237 + .byte 0xf3,0xc3 + + +.def __ocb_encrypt1; .scl 3; .type 32; .endef +.p2align 5 +__ocb_encrypt1: + pxor %xmm15,%xmm7 + pxor %xmm9,%xmm7 + pxor %xmm2,%xmm8 + pxor %xmm7,%xmm2 + movups 32(%r11),%xmm0 + +.byte 102,15,56,220,209 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm7 + +.byte 102,15,56,220,208 + movups 64(%r11),%xmm0 + jmp .Locb_enc_loop1 + +.p2align 5 +.Locb_enc_loop1: +.byte 102,15,56,220,209 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,220,208 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_enc_loop1 + +.byte 102,15,56,220,209 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,15,56,221,215 + .byte 0xf3,0xc3 + + +.globl aesni_ocb_decrypt +.def aesni_ocb_decrypt; .scl 2; .type 32; .endef +.p2align 5 +aesni_ocb_decrypt: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_aesni_ocb_decrypt: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + movq 48(%rsp),%r9 + + + leaq (%rsp),%rax + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + leaq -160(%rsp),%rsp + movaps %xmm6,0(%rsp) + movaps %xmm7,16(%rsp) + movaps %xmm8,32(%rsp) + movaps %xmm9,48(%rsp) + movaps %xmm10,64(%rsp) + movaps %xmm11,80(%rsp) + movaps %xmm12,96(%rsp) + movaps %xmm13,112(%rsp) + movaps %xmm14,128(%rsp) + movaps %xmm15,144(%rsp) +.Locb_dec_body: + movq 56(%rax),%rbx + movq 56+8(%rax),%rbp + + movl 240(%rcx),%r10d + movq %rcx,%r11 + shll $4,%r10d + movups (%rcx),%xmm9 + movups 16(%rcx,%r10,1),%xmm1 + + movdqu (%r9),%xmm15 + pxor %xmm1,%xmm9 + pxor %xmm1,%xmm15 + + movl $16+32,%eax + leaq 32(%r11,%r10,1),%rcx + movups 16(%r11),%xmm1 + subq %r10,%rax + movq %rax,%r10 + + movdqu (%rbx),%xmm10 + movdqu (%rbp),%xmm8 + + testq $1,%r8 + jnz .Locb_dec_odd + + bsfq %r8,%r12 + addq $1,%r8 + shlq $4,%r12 + movdqu (%rbx,%r12,1),%xmm7 + movdqu (%rdi),%xmm2 + leaq 16(%rdi),%rdi + + call __ocb_decrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,(%rsi) + xorps %xmm2,%xmm8 + leaq 16(%rsi),%rsi + subq $1,%rdx + jz .Locb_dec_done + +.Locb_dec_odd: + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + leaq 6(%r8),%r8 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + shlq $4,%r12 + shlq $4,%r13 + shlq $4,%r14 + + subq $6,%rdx + jc .Locb_dec_short + jmp .Locb_dec_grandloop + +.p2align 5 +.Locb_dec_grandloop: + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqu 32(%rdi),%xmm4 + movdqu 48(%rdi),%xmm5 + movdqu 64(%rdi),%xmm6 + movdqu 80(%rdi),%xmm7 + leaq 96(%rdi),%rdi + + call __ocb_decrypt6 + + movups %xmm2,0(%rsi) + pxor %xmm2,%xmm8 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm8 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm8 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm8 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm8 + movups %xmm7,80(%rsi) + pxor %xmm7,%xmm8 + leaq 96(%rsi),%rsi + subq $6,%rdx + jnc .Locb_dec_grandloop + +.Locb_dec_short: + addq $6,%rdx + jz .Locb_dec_done + + movdqu 0(%rdi),%xmm2 + cmpq $2,%rdx + jb .Locb_dec_one + movdqu 16(%rdi),%xmm3 + je .Locb_dec_two + + movdqu 32(%rdi),%xmm4 + cmpq $4,%rdx + jb .Locb_dec_three + movdqu 48(%rdi),%xmm5 + je .Locb_dec_four + + movdqu 64(%rdi),%xmm6 + pxor %xmm7,%xmm7 + + call 
__ocb_decrypt6 + + movdqa %xmm14,%xmm15 + movups %xmm2,0(%rsi) + pxor %xmm2,%xmm8 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm8 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm8 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm8 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm8 + + jmp .Locb_dec_done + +.p2align 4 +.Locb_dec_one: + movdqa %xmm10,%xmm7 + + call __ocb_decrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,0(%rsi) + xorps %xmm2,%xmm8 + jmp .Locb_dec_done + +.p2align 4 +.Locb_dec_two: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + call __ocb_decrypt4 + + movdqa %xmm11,%xmm15 + movups %xmm2,0(%rsi) + xorps %xmm2,%xmm8 + movups %xmm3,16(%rsi) + xorps %xmm3,%xmm8 + + jmp .Locb_dec_done + +.p2align 4 +.Locb_dec_three: + pxor %xmm5,%xmm5 + + call __ocb_decrypt4 + + movdqa %xmm12,%xmm15 + movups %xmm2,0(%rsi) + xorps %xmm2,%xmm8 + movups %xmm3,16(%rsi) + xorps %xmm3,%xmm8 + movups %xmm4,32(%rsi) + xorps %xmm4,%xmm8 + + jmp .Locb_dec_done + +.p2align 4 +.Locb_dec_four: + call __ocb_decrypt4 + + movdqa %xmm13,%xmm15 + movups %xmm2,0(%rsi) + pxor %xmm2,%xmm8 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm8 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm8 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm8 + +.Locb_dec_done: + pxor %xmm0,%xmm15 + movdqu %xmm8,(%rbp) + movdqu %xmm15,(%r9) + + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movaps 0(%rsp),%xmm6 + movaps %xmm0,0(%rsp) + movaps 16(%rsp),%xmm7 + movaps %xmm0,16(%rsp) + movaps 32(%rsp),%xmm8 + movaps %xmm0,32(%rsp) + movaps 48(%rsp),%xmm9 + movaps %xmm0,48(%rsp) + movaps 64(%rsp),%xmm10 + movaps %xmm0,64(%rsp) + movaps 80(%rsp),%xmm11 + movaps %xmm0,80(%rsp) + movaps 96(%rsp),%xmm12 + movaps %xmm0,96(%rsp) + movaps 112(%rsp),%xmm13 + movaps %xmm0,112(%rsp) + movaps 128(%rsp),%xmm14 + movaps %xmm0,128(%rsp) + movaps 144(%rsp),%xmm15 + movaps %xmm0,144(%rsp) + leaq 160+40(%rsp),%rax +.Locb_dec_pop: + movq -40(%rax),%r14 + + movq -32(%rax),%r13 + + movq -24(%rax),%r12 + + movq -16(%rax),%rbp + + movq -8(%rax),%rbx + + leaq (%rax),%rsp + +.Locb_dec_epilogue: + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 + +.LSEH_end_aesni_ocb_decrypt: + +.def __ocb_decrypt6; .scl 3; .type 32; .endef +.p2align 5 +__ocb_decrypt6: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + movdqa %xmm10,%xmm14 + pxor %xmm15,%xmm10 + movdqu (%rbx,%r14,1),%xmm15 + pxor %xmm10,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm14 + pxor %xmm13,%xmm5 + pxor %xmm14,%xmm15 + pxor %xmm14,%xmm6 + pxor %xmm15,%xmm7 + movups 32(%r11),%xmm0 + + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + addq $6,%r8 + pxor %xmm9,%xmm10 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 +.byte 102,15,56,222,241 + pxor %xmm9,%xmm13 + pxor %xmm9,%xmm14 +.byte 102,15,56,222,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm15 + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 + movups 64(%r11),%xmm0 + shlq $4,%r12 + shlq $4,%r13 + jmp .Locb_dec_loop6 + +.p2align 5 +.Locb_dec_loop6: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 
102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_dec_loop6 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups 16(%r11),%xmm1 + shlq $4,%r14 + +.byte 102,65,15,56,223,210 + movdqu (%rbx),%xmm10 + movq %r10,%rax +.byte 102,65,15,56,223,219 +.byte 102,65,15,56,223,228 +.byte 102,65,15,56,223,237 +.byte 102,65,15,56,223,246 +.byte 102,65,15,56,223,255 + .byte 0xf3,0xc3 + + +.def __ocb_decrypt4; .scl 3; .type 32; .endef +.p2align 5 +__ocb_decrypt4: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + pxor %xmm15,%xmm10 + pxor %xmm10,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm5 + movups 32(%r11),%xmm0 + + pxor %xmm9,%xmm10 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 + pxor %xmm9,%xmm13 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups 48(%r11),%xmm1 + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups 64(%r11),%xmm0 + jmp .Locb_dec_loop4 + +.p2align 5 +.Locb_dec_loop4: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_dec_loop4 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,65,15,56,223,210 +.byte 102,65,15,56,223,219 +.byte 102,65,15,56,223,228 +.byte 102,65,15,56,223,237 + .byte 0xf3,0xc3 + + +.def __ocb_decrypt1; .scl 3; .type 32; .endef +.p2align 5 +__ocb_decrypt1: + pxor %xmm15,%xmm7 + pxor %xmm9,%xmm7 + pxor %xmm7,%xmm2 + movups 32(%r11),%xmm0 + +.byte 102,15,56,222,209 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm7 + +.byte 102,15,56,222,208 + movups 64(%r11),%xmm0 + jmp .Locb_dec_loop1 + +.p2align 5 +.Locb_dec_loop1: +.byte 102,15,56,222,209 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,222,208 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_dec_loop1 + +.byte 102,15,56,222,209 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,15,56,223,215 + .byte 0xf3,0xc3 + +.globl aesni_cbc_encrypt +.def aesni_cbc_encrypt; .scl 2; .type 32; .endef +.p2align 4 +aesni_cbc_encrypt: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_aesni_cbc_encrypt: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + movq 48(%rsp),%r9 + + + testq %rdx,%rdx + jz .Lcbc_ret + + movl 240(%rcx),%r10d + movq %rcx,%r11 + testl %r9d,%r9d + jz .Lcbc_decrypt + + movups (%r8),%xmm2 + movl %r10d,%eax + cmpq $16,%rdx + jb .Lcbc_enc_tail + subq $16,%rdx + jmp .Lcbc_enc_loop +.p2align 4 +.Lcbc_enc_loop: + movups (%rdi),%xmm3 + leaq 16(%rdi),%rdi + + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm3 + leaq 32(%rcx),%rcx + xorps %xmm3,%xmm2 +.Loop_enc1_15: +.byte 102,15,56,220,209 + decl %eax + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_enc1_15 +.byte 102,15,56,221,209 + movl %r10d,%eax + movq %r11,%rcx + movups %xmm2,0(%rsi) + leaq 16(%rsi),%rsi + subq $16,%rdx + jnc .Lcbc_enc_loop + addq $16,%rdx + jnz 
.Lcbc_enc_tail + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm2,(%r8) + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + jmp .Lcbc_ret + +.Lcbc_enc_tail: + movq %rdx,%rcx + xchgq %rdi,%rsi +.long 0x9066A4F3 + movl $16,%ecx + subq %rdx,%rcx + xorl %eax,%eax +.long 0x9066AAF3 + leaq -16(%rdi),%rdi + movl %r10d,%eax + movq %rdi,%rsi + movq %r11,%rcx + xorq %rdx,%rdx + jmp .Lcbc_enc_loop + +.p2align 4 +.Lcbc_decrypt: + cmpq $16,%rdx + jne .Lcbc_decrypt_bulk + + + + movdqu (%rdi),%xmm2 + movdqu (%r8),%xmm3 + movdqa %xmm2,%xmm4 + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_dec1_16: +.byte 102,15,56,222,209 + decl %r10d + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_dec1_16 +.byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movdqu %xmm4,(%r8) + xorps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + jmp .Lcbc_ret +.p2align 4 +.Lcbc_decrypt_bulk: + leaq (%rsp),%r11 + + pushq %rbp + + subq $176,%rsp + andq $-16,%rsp + movaps %xmm6,16(%rsp) + movaps %xmm7,32(%rsp) + movaps %xmm8,48(%rsp) + movaps %xmm9,64(%rsp) + movaps %xmm10,80(%rsp) + movaps %xmm11,96(%rsp) + movaps %xmm12,112(%rsp) + movaps %xmm13,128(%rsp) + movaps %xmm14,144(%rsp) + movaps %xmm15,160(%rsp) +.Lcbc_decrypt_body: + movq %rcx,%rbp + movups (%r8),%xmm10 + movl %r10d,%eax + cmpq $0x50,%rdx + jbe .Lcbc_dec_tail + + movups (%rcx),%xmm0 + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqa %xmm2,%xmm11 + movdqu 32(%rdi),%xmm4 + movdqa %xmm3,%xmm12 + movdqu 48(%rdi),%xmm5 + movdqa %xmm4,%xmm13 + movdqu 64(%rdi),%xmm6 + movdqa %xmm5,%xmm14 + movdqu 80(%rdi),%xmm7 + movdqa %xmm6,%xmm15 + movl _gnutls_x86_cpuid_s+4(%rip),%r9d + cmpq $0x70,%rdx + jbe .Lcbc_dec_six_or_seven + + andl $71303168,%r9d + subq $0x50,%rdx + cmpl $4194304,%r9d + je .Lcbc_dec_loop6_enter + subq $0x20,%rdx + leaq 112(%rcx),%rcx + jmp .Lcbc_dec_loop8_enter +.p2align 4 +.Lcbc_dec_loop8: + movups %xmm9,(%rsi) + leaq 16(%rsi),%rsi +.Lcbc_dec_loop8_enter: + movdqu 96(%rdi),%xmm8 + pxor %xmm0,%xmm2 + movdqu 112(%rdi),%xmm9 + pxor %xmm0,%xmm3 + movups 16-112(%rcx),%xmm1 + pxor %xmm0,%xmm4 + movq $-1,%rbp + cmpq $0x70,%rdx + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 + pxor %xmm0,%xmm8 + +.byte 102,15,56,222,209 + pxor %xmm0,%xmm9 + movups 32-112(%rcx),%xmm0 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 + adcq $0,%rbp + andq $128,%rbp +.byte 102,68,15,56,222,201 + addq %rdi,%rbp + movups 48-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 .byte 102,68,15,56,222,200 movups 64-112(%rcx),%xmm0 nop @@ -3076,18 +4032,18 @@ aesni_cbc_encrypt: movdqu 112(%rdi),%xmm0 .byte 102,65,15,56,223,228 leaq 128(%rdi),%rdi - movdqu 0(%r11),%xmm11 + movdqu 0(%rbp),%xmm11 .byte 102,65,15,56,223,237 .byte 102,65,15,56,223,246 - movdqu 16(%r11),%xmm12 - movdqu 32(%r11),%xmm13 + movdqu 16(%rbp),%xmm12 + movdqu 32(%rbp),%xmm13 .byte 102,65,15,56,223,255 .byte 102,68,15,56,223,193 - movdqu 48(%r11),%xmm14 - movdqu 64(%r11),%xmm15 + movdqu 48(%rbp),%xmm14 + movdqu 64(%rbp),%xmm15 .byte 102,69,15,56,223,202 movdqa %xmm0,%xmm10 - movdqu 80(%r11),%xmm1 + movdqu 80(%rbp),%xmm1 movups -112(%rcx),%xmm0 movups %xmm2,(%rsi) @@ -3206,7 +4162,7 @@ aesni_cbc_encrypt: pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) pxor %xmm14,%xmm6 - movq %r11,%rcx + movq %rbp,%rcx movdqu 
%xmm5,48(%rsi) pxor %xmm15,%xmm7 movl %r10d,%eax @@ -3375,18 +4331,23 @@ aesni_cbc_encrypt: movaps %xmm0,144(%rsp) movaps 160(%rsp),%xmm15 movaps %xmm0,160(%rsp) - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + + leaq (%r11),%rsp + .Lcbc_ret: movq 8(%rsp),%rdi movq 16(%rsp),%rsi .byte 0xf3,0xc3 + .LSEH_end_aesni_cbc_encrypt: .globl aesni_set_decrypt_key .def aesni_set_decrypt_key; .scl 2; .type 32; .endef .p2align 4 aesni_set_decrypt_key: + .byte 0x48,0x83,0xEC,0x08 + call __aesni_set_encrypt_key shll $4,%edx testl %eax,%eax @@ -3419,7 +4380,9 @@ aesni_set_decrypt_key: pxor %xmm0,%xmm0 .Ldec_key_ret: addq $8,%rsp + .byte 0xf3,0xc3 + .LSEH_end_set_decrypt_key: .globl aesni_set_encrypt_key @@ -3427,7 +4390,9 @@ aesni_set_decrypt_key: .p2align 4 aesni_set_encrypt_key: __aesni_set_encrypt_key: + .byte 0x48,0x83,0xEC,0x08 + movq $-1,%rax testq %rcx,%rcx jz .Lenc_key_ret @@ -3720,7 +4685,9 @@ __aesni_set_encrypt_key: pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 addq $8,%rsp + .byte 0xf3,0xc3 + .LSEH_end_set_encrypt_key: .p2align 4 @@ -3889,13 +4856,75 @@ ctr_xts_se_handler: cmpq %r10,%rbx jae .Lcommon_seh_tail - movq 160(%r8),%rax - leaq -160(%rax),%rsi + movq 208(%r8),%rax + + leaq -168(%rax),%rsi + leaq 512(%r8),%rdi + movl $20,%ecx +.long 0xa548f3fc + + movq -8(%rax),%rbp + movq %rbp,160(%r8) + jmp .Lcommon_seh_tail + + +.def ocb_se_handler; .scl 3; .type 32; .endef +.p2align 4 +ocb_se_handler: + pushq %rsi + pushq %rdi + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushfq + subq $64,%rsp + + movq 120(%r8),%rax + movq 248(%r8),%rbx + + movq 8(%r9),%rsi + movq 56(%r9),%r11 + + movl 0(%r11),%r10d + leaq (%rsi,%r10,1),%r10 + cmpq %r10,%rbx + jb .Lcommon_seh_tail + + movl 4(%r11),%r10d + leaq (%rsi,%r10,1),%r10 + cmpq %r10,%rbx + jae .Lcommon_seh_tail + + movl 8(%r11),%r10d + leaq (%rsi,%r10,1),%r10 + cmpq %r10,%rbx + jae .Locb_no_xmm + + movq 152(%r8),%rax + + leaq (%rax),%rsi leaq 512(%r8),%rdi movl $20,%ecx .long 0xa548f3fc + leaq 160+40(%rax),%rax + +.Locb_no_xmm: + movq -8(%rax),%rbx + movq -16(%rax),%rbp + movq -24(%rax),%r12 + movq -32(%rax),%r13 + movq -40(%rax),%r14 - jmp .Lcommon_rbp_tail + movq %rbx,144(%r8) + movq %rbp,160(%r8) + movq %r12,216(%r8) + movq %r13,224(%r8) + movq %r14,232(%r8) + + jmp .Lcommon_seh_tail .def cbc_se_handler; .scl 3; .type 32; .endef .p2align 4 @@ -3918,9 +4947,13 @@ cbc_se_handler: cmpq %r10,%rbx jb .Lcommon_seh_tail + movq 120(%r8),%rax + leaq .Lcbc_decrypt_body(%rip),%r10 cmpq %r10,%rbx - jb .Lrestore_cbc_rax + jb .Lcommon_seh_tail + + movq 152(%r8),%rax leaq .Lcbc_ret(%rip),%r10 cmpq %r10,%rbx @@ -3931,15 +4964,10 @@ cbc_se_handler: movl $20,%ecx .long 0xa548f3fc -.Lcommon_rbp_tail: - movq 160(%r8),%rax - movq (%rax),%rbp - leaq 8(%rax),%rax - movq %rbp,160(%r8) - jmp .Lcommon_seh_tail + movq 208(%r8),%rax -.Lrestore_cbc_rax: - movq 120(%r8),%rax + movq -8(%rax),%rbp + movq %rbp,160(%r8) .Lcommon_seh_tail: movq 8(%rax),%rdi @@ -4006,6 +5034,14 @@ cbc_se_handler: .rva .LSEH_begin_aesni_xts_decrypt .rva .LSEH_end_aesni_xts_decrypt .rva .LSEH_info_xts_dec + +.rva .LSEH_begin_aesni_ocb_encrypt +.rva .LSEH_end_aesni_ocb_encrypt +.rva .LSEH_info_ocb_enc + +.rva .LSEH_begin_aesni_ocb_decrypt +.rva .LSEH_end_aesni_ocb_decrypt +.rva .LSEH_info_ocb_dec .rva .LSEH_begin_aesni_cbc_encrypt .rva .LSEH_end_aesni_cbc_encrypt .rva .LSEH_info_cbc @@ -4043,6 +5079,18 @@ cbc_se_handler: .byte 9,0,0,0 .rva ctr_xts_se_handler .rva .Lxts_dec_body,.Lxts_dec_epilogue +.LSEH_info_ocb_enc: +.byte 9,0,0,0 +.rva ocb_se_handler +.rva 
.Locb_enc_body,.Locb_enc_epilogue +.rva .Locb_enc_pop +.long 0 +.LSEH_info_ocb_dec: +.byte 9,0,0,0 +.rva ocb_se_handler +.rva .Locb_dec_body,.Locb_dec_epilogue +.rva .Locb_dec_pop +.long 0 .LSEH_info_cbc: .byte 9,0,0,0 .rva cbc_se_handler diff --git a/lib/accelerated/x86/coff/cpuid-x86.s b/lib/accelerated/x86/coff/cpuid-x86.s index 4baa8b0b45..610e9617db 100644 --- a/lib/accelerated/x86/coff/cpuid-x86.s +++ b/lib/accelerated/x86/coff/cpuid-x86.s @@ -21,7 +21,6 @@ # # *** This file is auto-generated *** # -.file "devel/perlasm/cpuid-x86.s" .text .globl _gnutls_cpuid .def _gnutls_cpuid; .scl 2; .type 32; .endef diff --git a/lib/accelerated/x86/coff/ghash-x86_64.s b/lib/accelerated/x86/coff/ghash-x86_64.s index f4bcee28f0..de207e4002 100644 --- a/lib/accelerated/x86/coff/ghash-x86_64.s +++ b/lib/accelerated/x86/coff/ghash-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -51,9 +51,21 @@ gcm_gmult_4bit: movq %rcx,%rdi movq %rdx,%rsi + pushq %rbx + pushq %rbp + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $280,%rsp + .Lgmult_prologue: movzbq 15(%rdi),%r8 @@ -130,12 +142,17 @@ gcm_gmult_4bit: movq %r8,8(%rdi) movq %r9,(%rdi) - movq 16(%rsp),%rbx - leaq 24(%rsp),%rsp + leaq 280+48(%rsp),%rsi + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + .Lgmult_epilogue: movq 8(%rsp),%rdi movq 16(%rsp),%rsi .byte 0xf3,0xc3 + .LSEH_end_gcm_gmult_4bit: .globl gcm_ghash_4bit .def gcm_ghash_4bit; .scl 2; .type 32; .endef @@ -150,13 +167,21 @@ gcm_ghash_4bit: movq %r8,%rdx movq %r9,%rcx + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $280,%rsp + .Lghash_prologue: movq %rdx,%r14 movq %rcx,%r15 @@ -701,23 +726,33 @@ gcm_ghash_4bit: movq %r8,8(%rdi) movq %r9,(%rdi) - leaq 280(%rsp),%rsi - movq 0(%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + leaq 280+48(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq 0(%rsi),%rsp + .Lghash_epilogue: movq 8(%rsp),%rdi movq 16(%rsp),%rsi .byte 0xf3,0xc3 + .LSEH_end_gcm_ghash_4bit: .globl gcm_init_clmul .def gcm_init_clmul; .scl 2; .type 32; .endef .p2align 4 gcm_init_clmul: + .L_init_clmul: .LSEH_begin_gcm_init_clmul: @@ -877,10 +912,12 @@ gcm_init_clmul: .LSEH_end_gcm_init_clmul: .byte 0xf3,0xc3 + .globl gcm_gmult_clmul .def gcm_gmult_clmul; .scl 2; .type 32; .endef .p2align 4 gcm_gmult_clmul: + .L_gmult_clmul: movdqu (%rcx),%xmm0 movdqa .Lbswap_mask(%rip),%xmm5 @@ -928,10 +965,12 @@ gcm_gmult_clmul: movdqu %xmm0,(%rcx) .byte 0xf3,0xc3 + .globl gcm_ghash_clmul .def gcm_ghash_clmul; .scl 2; .type 32; .endef .p2align 5 gcm_ghash_clmul: + .L_ghash_clmul: leaq -136(%rsp),%rax .LSEH_begin_gcm_ghash_clmul: @@ -1337,10 +1376,12 @@ gcm_ghash_clmul: .LSEH_end_gcm_ghash_clmul: .byte 0xf3,0xc3 + .globl gcm_init_avx .def gcm_init_avx; .scl 2; .type 32; .endef .p2align 5 gcm_init_avx: + .LSEH_begin_gcm_init_avx: .byte 0x48,0x83,0xec,0x18 @@ -1451,16 +1492,20 @@ gcm_init_avx: .LSEH_end_gcm_init_avx: .byte 0xf3,0xc3 + .globl gcm_gmult_avx .def gcm_gmult_avx; .scl 2; .type 32; .endef .p2align 5 gcm_gmult_avx: + jmp .L_gmult_clmul + .globl gcm_ghash_avx .def gcm_ghash_avx; .scl 2; .type 32; .endef .p2align 5 gcm_ghash_avx: + leaq -136(%rsp),%rax .LSEH_begin_gcm_ghash_avx: @@ -1859,6 +1904,7 @@ 
gcm_ghash_avx: .LSEH_end_gcm_ghash_avx: .byte 0xf3,0xc3 + .p2align 6 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 @@ -1945,14 +1991,20 @@ se_handler: cmpq %r10,%rbx jae .Lin_prologue - leaq 24(%rax),%rax + leaq 48+280(%rax),%rax movq -8(%rax),%rbx movq -16(%rax),%rbp movq -24(%rax),%r12 + movq -32(%rax),%r13 + movq -40(%rax),%r14 + movq -48(%rax),%r15 movq %rbx,144(%r8) movq %rbp,160(%r8) movq %r12,216(%r8) + movq %r13,224(%r8) + movq %r14,232(%r8) + movq %r15,240(%r8) .Lin_prologue: movq 8(%rax),%rdi diff --git a/lib/accelerated/x86/coff/sha1-ssse3-x86.s b/lib/accelerated/x86/coff/sha1-ssse3-x86.s index 22c17e7353..30f9ded212 100644 --- a/lib/accelerated/x86/coff/sha1-ssse3-x86.s +++ b/lib/accelerated/x86/coff/sha1-ssse3-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -37,7 +37,6 @@ # # *** This file is auto-generated *** # -.file "sha1-586.s" .text .globl _sha1_block_data_order .def _sha1_block_data_order; .scl 2; .type 32; .endef diff --git a/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s b/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s index 13203c2b90..cdfc88254e 100644 --- a/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s +++ b/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -52,25 +52,45 @@ sha1_block_data_order: movq %rdx,%rsi movq %r8,%rdx + movl _gnutls_x86_cpuid_s+0(%rip),%r9d movl _gnutls_x86_cpuid_s+4(%rip),%r8d + movl _gnutls_x86_cpuid_s+8(%rip),%r10d testl $512,%r8d jz .Lialu + testl $536870912,%r10d + jnz _shaext_shortcut + andl $296,%r10d + cmpl $296,%r10d + je _avx2_shortcut + andl $268435456,%r8d + andl $1073741824,%r9d + orl %r9d,%r8d + cmpl $1342177280,%r8d + je _avx_shortcut jmp _ssse3_shortcut .p2align 4 .Lialu: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 - movq %rsp,%r11 + + pushq %r14 + movq %rdi,%r8 subq $72,%rsp movq %rsi,%r9 andq $-64,%rsp movq %rdx,%r10 - movq %r11,64(%rsp) + movq %rax,64(%rsp) + .Lprologue: movl 0(%r8),%esi @@ -84,1230 +104,1168 @@ sha1_block_data_order: .Lloop: movl 0(%r9),%edx bswapl %edx - movl %edx,0(%rsp) - movl %r11d,%eax movl 4(%r9),%ebp + movl %r12d,%eax + movl %edx,0(%rsp) movl %esi,%ecx - xorl %r12d,%eax bswapl %ebp + xorl %r11d,%eax roll $5,%ecx - leal 1518500249(%rdx,%r13,1),%r13d andl %edi,%eax - movl %ebp,4(%rsp) + leal 1518500249(%rdx,%r13,1),%r13d addl %ecx,%r13d xorl %r12d,%eax roll $30,%edi addl %eax,%r13d - movl %edi,%eax - movl 8(%r9),%edx + movl 8(%r9),%r14d + movl %r11d,%eax + movl %ebp,4(%rsp) movl %r13d,%ecx - xorl %r11d,%eax - bswapl %edx + bswapl %r14d + xorl %edi,%eax roll $5,%ecx - leal 1518500249(%rbp,%r12,1),%r12d andl %esi,%eax - movl %edx,8(%rsp) + leal 1518500249(%rbp,%r12,1),%r12d addl %ecx,%r12d xorl %r11d,%eax roll $30,%esi addl %eax,%r12d - movl %esi,%eax - movl 12(%r9),%ebp + movl 12(%r9),%edx + movl %edi,%eax + movl %r14d,8(%rsp) movl %r12d,%ecx - xorl %edi,%eax - bswapl %ebp + bswapl %edx + xorl %esi,%eax roll $5,%ecx - leal 1518500249(%rdx,%r11,1),%r11d andl %r13d,%eax - movl %ebp,12(%rsp) + leal 1518500249(%r14,%r11,1),%r11d addl %ecx,%r11d xorl %edi,%eax roll $30,%r13d addl %eax,%r11d - movl %r13d,%eax - movl 16(%r9),%edx + movl 16(%r9),%ebp + movl %esi,%eax + movl %edx,12(%rsp) movl %r11d,%ecx - xorl %esi,%eax - bswapl %edx + 
bswapl %ebp + xorl %r13d,%eax roll $5,%ecx - leal 1518500249(%rbp,%rdi,1),%edi andl %r12d,%eax - movl %edx,16(%rsp) + leal 1518500249(%rdx,%rdi,1),%edi addl %ecx,%edi xorl %esi,%eax roll $30,%r12d addl %eax,%edi - movl %r12d,%eax - movl 20(%r9),%ebp + movl 20(%r9),%r14d + movl %r13d,%eax + movl %ebp,16(%rsp) movl %edi,%ecx - xorl %r13d,%eax - bswapl %ebp + bswapl %r14d + xorl %r12d,%eax roll $5,%ecx - leal 1518500249(%rdx,%rsi,1),%esi andl %r11d,%eax - movl %ebp,20(%rsp) + leal 1518500249(%rbp,%rsi,1),%esi addl %ecx,%esi xorl %r13d,%eax roll $30,%r11d addl %eax,%esi - movl %r11d,%eax movl 24(%r9),%edx + movl %r12d,%eax + movl %r14d,20(%rsp) movl %esi,%ecx - xorl %r12d,%eax bswapl %edx + xorl %r11d,%eax roll $5,%ecx - leal 1518500249(%rbp,%r13,1),%r13d andl %edi,%eax - movl %edx,24(%rsp) + leal 1518500249(%r14,%r13,1),%r13d addl %ecx,%r13d xorl %r12d,%eax roll $30,%edi addl %eax,%r13d - movl %edi,%eax movl 28(%r9),%ebp + movl %r11d,%eax + movl %edx,24(%rsp) movl %r13d,%ecx - xorl %r11d,%eax bswapl %ebp + xorl %edi,%eax roll $5,%ecx - leal 1518500249(%rdx,%r12,1),%r12d andl %esi,%eax - movl %ebp,28(%rsp) + leal 1518500249(%rdx,%r12,1),%r12d addl %ecx,%r12d xorl %r11d,%eax roll $30,%esi addl %eax,%r12d - movl %esi,%eax - movl 32(%r9),%edx + movl 32(%r9),%r14d + movl %edi,%eax + movl %ebp,28(%rsp) movl %r12d,%ecx - xorl %edi,%eax - bswapl %edx + bswapl %r14d + xorl %esi,%eax roll $5,%ecx - leal 1518500249(%rbp,%r11,1),%r11d andl %r13d,%eax - movl %edx,32(%rsp) + leal 1518500249(%rbp,%r11,1),%r11d addl %ecx,%r11d xorl %edi,%eax roll $30,%r13d addl %eax,%r11d - movl %r13d,%eax - movl 36(%r9),%ebp + movl 36(%r9),%edx + movl %esi,%eax + movl %r14d,32(%rsp) movl %r11d,%ecx - xorl %esi,%eax - bswapl %ebp + bswapl %edx + xorl %r13d,%eax roll $5,%ecx - leal 1518500249(%rdx,%rdi,1),%edi andl %r12d,%eax - movl %ebp,36(%rsp) + leal 1518500249(%r14,%rdi,1),%edi addl %ecx,%edi xorl %esi,%eax roll $30,%r12d addl %eax,%edi - movl %r12d,%eax - movl 40(%r9),%edx + movl 40(%r9),%ebp + movl %r13d,%eax + movl %edx,36(%rsp) movl %edi,%ecx - xorl %r13d,%eax - bswapl %edx + bswapl %ebp + xorl %r12d,%eax roll $5,%ecx - leal 1518500249(%rbp,%rsi,1),%esi andl %r11d,%eax - movl %edx,40(%rsp) + leal 1518500249(%rdx,%rsi,1),%esi addl %ecx,%esi xorl %r13d,%eax roll $30,%r11d addl %eax,%esi - movl %r11d,%eax - movl 44(%r9),%ebp + movl 44(%r9),%r14d + movl %r12d,%eax + movl %ebp,40(%rsp) movl %esi,%ecx - xorl %r12d,%eax - bswapl %ebp + bswapl %r14d + xorl %r11d,%eax roll $5,%ecx - leal 1518500249(%rdx,%r13,1),%r13d andl %edi,%eax - movl %ebp,44(%rsp) + leal 1518500249(%rbp,%r13,1),%r13d addl %ecx,%r13d xorl %r12d,%eax roll $30,%edi addl %eax,%r13d - movl %edi,%eax movl 48(%r9),%edx + movl %r11d,%eax + movl %r14d,44(%rsp) movl %r13d,%ecx - xorl %r11d,%eax bswapl %edx + xorl %edi,%eax roll $5,%ecx - leal 1518500249(%rbp,%r12,1),%r12d andl %esi,%eax - movl %edx,48(%rsp) + leal 1518500249(%r14,%r12,1),%r12d addl %ecx,%r12d xorl %r11d,%eax roll $30,%esi addl %eax,%r12d - movl %esi,%eax movl 52(%r9),%ebp + movl %edi,%eax + movl %edx,48(%rsp) movl %r12d,%ecx - xorl %edi,%eax bswapl %ebp + xorl %esi,%eax roll $5,%ecx - leal 1518500249(%rdx,%r11,1),%r11d andl %r13d,%eax - movl %ebp,52(%rsp) + leal 1518500249(%rdx,%r11,1),%r11d addl %ecx,%r11d xorl %edi,%eax roll $30,%r13d addl %eax,%r11d - movl %r13d,%eax - movl 56(%r9),%edx + movl 56(%r9),%r14d + movl %esi,%eax + movl %ebp,52(%rsp) movl %r11d,%ecx - xorl %esi,%eax - bswapl %edx + bswapl %r14d + xorl %r13d,%eax roll $5,%ecx - leal 1518500249(%rbp,%rdi,1),%edi andl %r12d,%eax - movl 
%edx,56(%rsp) + leal 1518500249(%rbp,%rdi,1),%edi addl %ecx,%edi xorl %esi,%eax roll $30,%r12d addl %eax,%edi - movl %r12d,%eax - movl 60(%r9),%ebp + movl 60(%r9),%edx + movl %r13d,%eax + movl %r14d,56(%rsp) movl %edi,%ecx - xorl %r13d,%eax - bswapl %ebp + bswapl %edx + xorl %r12d,%eax roll $5,%ecx - leal 1518500249(%rdx,%rsi,1),%esi andl %r11d,%eax - movl %ebp,60(%rsp) + leal 1518500249(%r14,%rsi,1),%esi addl %ecx,%esi xorl %r13d,%eax roll $30,%r11d addl %eax,%esi - movl 0(%rsp),%edx - movl %r11d,%eax + xorl 0(%rsp),%ebp + movl %r12d,%eax + movl %edx,60(%rsp) movl %esi,%ecx - xorl 8(%rsp),%edx - xorl %r12d,%eax + xorl 8(%rsp),%ebp + xorl %r11d,%eax roll $5,%ecx - xorl 32(%rsp),%edx + xorl 32(%rsp),%ebp andl %edi,%eax - leal 1518500249(%rbp,%r13,1),%r13d - xorl 52(%rsp),%edx + leal 1518500249(%rdx,%r13,1),%r13d + roll $30,%edi xorl %r12d,%eax - roll $1,%edx addl %ecx,%r13d - roll $30,%edi - movl %edx,0(%rsp) + roll $1,%ebp addl %eax,%r13d - movl 4(%rsp),%ebp - movl %edi,%eax + xorl 4(%rsp),%r14d + movl %r11d,%eax + movl %ebp,0(%rsp) movl %r13d,%ecx - xorl 12(%rsp),%ebp - xorl %r11d,%eax + xorl 12(%rsp),%r14d + xorl %edi,%eax roll $5,%ecx - xorl 36(%rsp),%ebp + xorl 36(%rsp),%r14d andl %esi,%eax - leal 1518500249(%rdx,%r12,1),%r12d - xorl 56(%rsp),%ebp + leal 1518500249(%rbp,%r12,1),%r12d + roll $30,%esi xorl %r11d,%eax - roll $1,%ebp addl %ecx,%r12d - roll $30,%esi - movl %ebp,4(%rsp) + roll $1,%r14d addl %eax,%r12d - movl 8(%rsp),%edx - movl %esi,%eax + xorl 8(%rsp),%edx + movl %edi,%eax + movl %r14d,4(%rsp) movl %r12d,%ecx xorl 16(%rsp),%edx - xorl %edi,%eax + xorl %esi,%eax roll $5,%ecx xorl 40(%rsp),%edx andl %r13d,%eax - leal 1518500249(%rbp,%r11,1),%r11d - xorl 60(%rsp),%edx + leal 1518500249(%r14,%r11,1),%r11d + roll $30,%r13d xorl %edi,%eax - roll $1,%edx addl %ecx,%r11d - roll $30,%r13d - movl %edx,8(%rsp) + roll $1,%edx addl %eax,%r11d - movl 12(%rsp),%ebp - movl %r13d,%eax + xorl 12(%rsp),%ebp + movl %esi,%eax + movl %edx,8(%rsp) movl %r11d,%ecx xorl 20(%rsp),%ebp - xorl %esi,%eax + xorl %r13d,%eax roll $5,%ecx xorl 44(%rsp),%ebp andl %r12d,%eax leal 1518500249(%rdx,%rdi,1),%edi - xorl 0(%rsp),%ebp + roll $30,%r12d xorl %esi,%eax - roll $1,%ebp addl %ecx,%edi - roll $30,%r12d - movl %ebp,12(%rsp) + roll $1,%ebp addl %eax,%edi - movl 16(%rsp),%edx - movl %r12d,%eax + xorl 16(%rsp),%r14d + movl %r13d,%eax + movl %ebp,12(%rsp) movl %edi,%ecx - xorl 24(%rsp),%edx - xorl %r13d,%eax + xorl 24(%rsp),%r14d + xorl %r12d,%eax roll $5,%ecx - xorl 48(%rsp),%edx + xorl 48(%rsp),%r14d andl %r11d,%eax leal 1518500249(%rbp,%rsi,1),%esi - xorl 4(%rsp),%edx + roll $30,%r11d xorl %r13d,%eax - roll $1,%edx addl %ecx,%esi - roll $30,%r11d - movl %edx,16(%rsp) + roll $1,%r14d addl %eax,%esi - movl 20(%rsp),%ebp - movl %r11d,%eax + xorl 20(%rsp),%edx + movl %edi,%eax + movl %r14d,16(%rsp) movl %esi,%ecx - xorl 28(%rsp),%ebp - xorl %edi,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r13,1),%r13d - xorl 52(%rsp),%ebp + xorl 28(%rsp),%edx xorl %r12d,%eax + roll $5,%ecx + xorl 52(%rsp),%edx + leal 1859775393(%r14,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 8(%rsp),%ebp roll $30,%edi addl %eax,%r13d - roll $1,%ebp - movl %ebp,20(%rsp) - movl 24(%rsp),%edx - movl %edi,%eax + roll $1,%edx + xorl 24(%rsp),%ebp + movl %esi,%eax + movl %edx,20(%rsp) movl %r13d,%ecx - xorl 32(%rsp),%edx - xorl %esi,%eax - roll $5,%ecx - leal 1859775393(%rbp,%r12,1),%r12d - xorl 56(%rsp),%edx + xorl 32(%rsp),%ebp xorl %r11d,%eax + roll $5,%ecx + xorl 56(%rsp),%ebp + leal 1859775393(%rdx,%r12,1),%r12d + xorl %edi,%eax addl 
%ecx,%r12d - xorl 12(%rsp),%edx roll $30,%esi addl %eax,%r12d - roll $1,%edx - movl %edx,24(%rsp) - movl 28(%rsp),%ebp - movl %esi,%eax + roll $1,%ebp + xorl 28(%rsp),%r14d + movl %r13d,%eax + movl %ebp,24(%rsp) movl %r12d,%ecx - xorl 36(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r11,1),%r11d - xorl 60(%rsp),%ebp + xorl 36(%rsp),%r14d xorl %edi,%eax + roll $5,%ecx + xorl 60(%rsp),%r14d + leal 1859775393(%rbp,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 16(%rsp),%ebp roll $30,%r13d addl %eax,%r11d - roll $1,%ebp - movl %ebp,28(%rsp) - movl 32(%rsp),%edx - movl %r13d,%eax + roll $1,%r14d + xorl 32(%rsp),%edx + movl %r12d,%eax + movl %r14d,28(%rsp) movl %r11d,%ecx xorl 40(%rsp),%edx - xorl %r12d,%eax + xorl %esi,%eax roll $5,%ecx - leal 1859775393(%rbp,%rdi,1),%edi xorl 0(%rsp),%edx - xorl %esi,%eax + leal 1859775393(%r14,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 20(%rsp),%edx roll $30,%r12d addl %eax,%edi roll $1,%edx + xorl 36(%rsp),%ebp + movl %r11d,%eax movl %edx,32(%rsp) - movl 36(%rsp),%ebp - movl %r12d,%eax movl %edi,%ecx xorl 44(%rsp),%ebp - xorl %r11d,%eax + xorl %r13d,%eax roll $5,%ecx - leal 1859775393(%rdx,%rsi,1),%esi xorl 4(%rsp),%ebp - xorl %r13d,%eax + leal 1859775393(%rdx,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 24(%rsp),%ebp roll $30,%r11d addl %eax,%esi roll $1,%ebp + xorl 40(%rsp),%r14d + movl %edi,%eax movl %ebp,36(%rsp) - movl 40(%rsp),%edx - movl %r11d,%eax movl %esi,%ecx - xorl 48(%rsp),%edx - xorl %edi,%eax + xorl 48(%rsp),%r14d + xorl %r12d,%eax roll $5,%ecx + xorl 8(%rsp),%r14d leal 1859775393(%rbp,%r13,1),%r13d - xorl 8(%rsp),%edx - xorl %r12d,%eax + xorl %r11d,%eax addl %ecx,%r13d - xorl 28(%rsp),%edx roll $30,%edi addl %eax,%r13d - roll $1,%edx - movl %edx,40(%rsp) - movl 44(%rsp),%ebp - movl %edi,%eax + roll $1,%r14d + xorl 44(%rsp),%edx + movl %esi,%eax + movl %r14d,40(%rsp) movl %r13d,%ecx - xorl 52(%rsp),%ebp - xorl %esi,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r12,1),%r12d - xorl 12(%rsp),%ebp + xorl 52(%rsp),%edx xorl %r11d,%eax + roll $5,%ecx + xorl 12(%rsp),%edx + leal 1859775393(%r14,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 32(%rsp),%ebp roll $30,%esi addl %eax,%r12d - roll $1,%ebp - movl %ebp,44(%rsp) - movl 48(%rsp),%edx - movl %esi,%eax + roll $1,%edx + xorl 48(%rsp),%ebp + movl %r13d,%eax + movl %edx,44(%rsp) movl %r12d,%ecx - xorl 56(%rsp),%edx - xorl %r13d,%eax - roll $5,%ecx - leal 1859775393(%rbp,%r11,1),%r11d - xorl 16(%rsp),%edx + xorl 56(%rsp),%ebp xorl %edi,%eax + roll $5,%ecx + xorl 16(%rsp),%ebp + leal 1859775393(%rdx,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 36(%rsp),%edx roll $30,%r13d addl %eax,%r11d - roll $1,%edx - movl %edx,48(%rsp) - movl 52(%rsp),%ebp - movl %r13d,%eax + roll $1,%ebp + xorl 52(%rsp),%r14d + movl %r12d,%eax + movl %ebp,48(%rsp) movl %r11d,%ecx - xorl 60(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%rdi,1),%edi - xorl 20(%rsp),%ebp + xorl 60(%rsp),%r14d xorl %esi,%eax + roll $5,%ecx + xorl 20(%rsp),%r14d + leal 1859775393(%rbp,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 40(%rsp),%ebp roll $30,%r12d addl %eax,%edi - roll $1,%ebp - movl %ebp,52(%rsp) - movl 56(%rsp),%edx - movl %r12d,%eax + roll $1,%r14d + xorl 56(%rsp),%edx + movl %r11d,%eax + movl %r14d,52(%rsp) movl %edi,%ecx xorl 0(%rsp),%edx - xorl %r11d,%eax + xorl %r13d,%eax roll $5,%ecx - leal 1859775393(%rbp,%rsi,1),%esi xorl 24(%rsp),%edx - xorl %r13d,%eax + leal 1859775393(%r14,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 44(%rsp),%edx 
roll $30,%r11d addl %eax,%esi roll $1,%edx + xorl 60(%rsp),%ebp + movl %edi,%eax movl %edx,56(%rsp) - movl 60(%rsp),%ebp - movl %r11d,%eax movl %esi,%ecx xorl 4(%rsp),%ebp - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal 1859775393(%rdx,%r13,1),%r13d xorl 28(%rsp),%ebp - xorl %r12d,%eax + leal 1859775393(%rdx,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 48(%rsp),%ebp roll $30,%edi addl %eax,%r13d roll $1,%ebp + xorl 0(%rsp),%r14d + movl %esi,%eax movl %ebp,60(%rsp) - movl 0(%rsp),%edx - movl %edi,%eax movl %r13d,%ecx - xorl 8(%rsp),%edx - xorl %esi,%eax + xorl 8(%rsp),%r14d + xorl %r11d,%eax roll $5,%ecx + xorl 32(%rsp),%r14d leal 1859775393(%rbp,%r12,1),%r12d - xorl 32(%rsp),%edx - xorl %r11d,%eax + xorl %edi,%eax addl %ecx,%r12d - xorl 52(%rsp),%edx roll $30,%esi addl %eax,%r12d - roll $1,%edx - movl %edx,0(%rsp) - movl 4(%rsp),%ebp - movl %esi,%eax + roll $1,%r14d + xorl 4(%rsp),%edx + movl %r13d,%eax + movl %r14d,0(%rsp) movl %r12d,%ecx - xorl 12(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r11,1),%r11d - xorl 36(%rsp),%ebp + xorl 12(%rsp),%edx xorl %edi,%eax + roll $5,%ecx + xorl 36(%rsp),%edx + leal 1859775393(%r14,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 56(%rsp),%ebp roll $30,%r13d addl %eax,%r11d - roll $1,%ebp - movl %ebp,4(%rsp) - movl 8(%rsp),%edx - movl %r13d,%eax + roll $1,%edx + xorl 8(%rsp),%ebp + movl %r12d,%eax + movl %edx,4(%rsp) movl %r11d,%ecx - xorl 16(%rsp),%edx - xorl %r12d,%eax - roll $5,%ecx - leal 1859775393(%rbp,%rdi,1),%edi - xorl 40(%rsp),%edx + xorl 16(%rsp),%ebp xorl %esi,%eax + roll $5,%ecx + xorl 40(%rsp),%ebp + leal 1859775393(%rdx,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 60(%rsp),%edx roll $30,%r12d addl %eax,%edi - roll $1,%edx - movl %edx,8(%rsp) - movl 12(%rsp),%ebp - movl %r12d,%eax + roll $1,%ebp + xorl 12(%rsp),%r14d + movl %r11d,%eax + movl %ebp,8(%rsp) movl %edi,%ecx - xorl 20(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%rsi,1),%esi - xorl 44(%rsp),%ebp + xorl 20(%rsp),%r14d xorl %r13d,%eax + roll $5,%ecx + xorl 44(%rsp),%r14d + leal 1859775393(%rbp,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 0(%rsp),%ebp roll $30,%r11d addl %eax,%esi - roll $1,%ebp - movl %ebp,12(%rsp) - movl 16(%rsp),%edx - movl %r11d,%eax + roll $1,%r14d + xorl 16(%rsp),%edx + movl %edi,%eax + movl %r14d,12(%rsp) movl %esi,%ecx xorl 24(%rsp),%edx - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal 1859775393(%rbp,%r13,1),%r13d xorl 48(%rsp),%edx - xorl %r12d,%eax + leal 1859775393(%r14,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 4(%rsp),%edx roll $30,%edi addl %eax,%r13d roll $1,%edx + xorl 20(%rsp),%ebp + movl %esi,%eax movl %edx,16(%rsp) - movl 20(%rsp),%ebp - movl %edi,%eax movl %r13d,%ecx xorl 28(%rsp),%ebp - xorl %esi,%eax + xorl %r11d,%eax roll $5,%ecx - leal 1859775393(%rdx,%r12,1),%r12d xorl 52(%rsp),%ebp - xorl %r11d,%eax + leal 1859775393(%rdx,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 8(%rsp),%ebp roll $30,%esi addl %eax,%r12d roll $1,%ebp + xorl 24(%rsp),%r14d + movl %r13d,%eax movl %ebp,20(%rsp) - movl 24(%rsp),%edx - movl %esi,%eax movl %r12d,%ecx - xorl 32(%rsp),%edx - xorl %r13d,%eax + xorl 32(%rsp),%r14d + xorl %edi,%eax roll $5,%ecx + xorl 56(%rsp),%r14d leal 1859775393(%rbp,%r11,1),%r11d - xorl 56(%rsp),%edx - xorl %edi,%eax + xorl %esi,%eax addl %ecx,%r11d - xorl 12(%rsp),%edx roll $30,%r13d addl %eax,%r11d - roll $1,%edx - movl %edx,24(%rsp) - movl 28(%rsp),%ebp - movl %r13d,%eax + roll $1,%r14d + xorl 28(%rsp),%edx + movl %r12d,%eax + 
movl %r14d,24(%rsp) movl %r11d,%ecx - xorl 36(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%rdi,1),%edi - xorl 60(%rsp),%ebp + xorl 36(%rsp),%edx xorl %esi,%eax + roll $5,%ecx + xorl 60(%rsp),%edx + leal 1859775393(%r14,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 16(%rsp),%ebp roll $30,%r12d addl %eax,%edi - roll $1,%ebp - movl %ebp,28(%rsp) - movl 32(%rsp),%edx - movl %r12d,%eax + roll $1,%edx + xorl 32(%rsp),%ebp + movl %r11d,%eax + movl %edx,28(%rsp) movl %edi,%ecx - xorl 40(%rsp),%edx - xorl %r11d,%eax - roll $5,%ecx - leal 1859775393(%rbp,%rsi,1),%esi - xorl 0(%rsp),%edx + xorl 40(%rsp),%ebp xorl %r13d,%eax + roll $5,%ecx + xorl 0(%rsp),%ebp + leal 1859775393(%rdx,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 20(%rsp),%edx roll $30,%r11d addl %eax,%esi - roll $1,%edx - movl %edx,32(%rsp) - movl 36(%rsp),%ebp - movl %r11d,%eax - movl %r11d,%ebx - xorl 44(%rsp),%ebp - andl %r12d,%eax + roll $1,%ebp + xorl 36(%rsp),%r14d + movl %r12d,%eax + movl %ebp,32(%rsp) + movl %r12d,%ebx + xorl 44(%rsp),%r14d + andl %r11d,%eax movl %esi,%ecx - xorl 4(%rsp),%ebp - xorl %r12d,%ebx - leal -1894007588(%rdx,%r13,1),%r13d + xorl 4(%rsp),%r14d + leal -1894007588(%rbp,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 24(%rsp),%ebp addl %eax,%r13d + roll $1,%r14d andl %edi,%ebx - roll $1,%ebp - addl %ebx,%r13d - roll $30,%edi - movl %ebp,36(%rsp) addl %ecx,%r13d - movl 40(%rsp),%edx - movl %edi,%eax - movl %edi,%ebx + roll $30,%edi + addl %ebx,%r13d + xorl 40(%rsp),%edx + movl %r11d,%eax + movl %r14d,36(%rsp) + movl %r11d,%ebx xorl 48(%rsp),%edx - andl %r11d,%eax + andl %edi,%eax movl %r13d,%ecx xorl 8(%rsp),%edx - xorl %r11d,%ebx - leal -1894007588(%rbp,%r12,1),%r12d + leal -1894007588(%r14,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 28(%rsp),%edx addl %eax,%r12d - andl %esi,%ebx roll $1,%edx - addl %ebx,%r12d + andl %esi,%ebx + addl %ecx,%r12d roll $30,%esi + addl %ebx,%r12d + xorl 44(%rsp),%ebp + movl %edi,%eax movl %edx,40(%rsp) - addl %ecx,%r12d - movl 44(%rsp),%ebp - movl %esi,%eax - movl %esi,%ebx + movl %edi,%ebx xorl 52(%rsp),%ebp - andl %edi,%eax + andl %esi,%eax movl %r12d,%ecx xorl 12(%rsp),%ebp - xorl %edi,%ebx leal -1894007588(%rdx,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 32(%rsp),%ebp addl %eax,%r11d - andl %r13d,%ebx roll $1,%ebp - addl %ebx,%r11d + andl %r13d,%ebx + addl %ecx,%r11d roll $30,%r13d + addl %ebx,%r11d + xorl 48(%rsp),%r14d + movl %esi,%eax movl %ebp,44(%rsp) - addl %ecx,%r11d - movl 48(%rsp),%edx - movl %r13d,%eax - movl %r13d,%ebx - xorl 56(%rsp),%edx - andl %esi,%eax + movl %esi,%ebx + xorl 56(%rsp),%r14d + andl %r13d,%eax movl %r11d,%ecx - xorl 16(%rsp),%edx - xorl %esi,%ebx + xorl 16(%rsp),%r14d leal -1894007588(%rbp,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 36(%rsp),%edx addl %eax,%edi + roll $1,%r14d andl %r12d,%ebx - roll $1,%edx - addl %ebx,%edi - roll $30,%r12d - movl %edx,48(%rsp) addl %ecx,%edi - movl 52(%rsp),%ebp - movl %r12d,%eax - movl %r12d,%ebx - xorl 60(%rsp),%ebp - andl %r13d,%eax + roll $30,%r12d + addl %ebx,%edi + xorl 52(%rsp),%edx + movl %r13d,%eax + movl %r14d,48(%rsp) + movl %r13d,%ebx + xorl 60(%rsp),%edx + andl %r12d,%eax movl %edi,%ecx - xorl 20(%rsp),%ebp - xorl %r13d,%ebx - leal -1894007588(%rdx,%rsi,1),%esi + xorl 20(%rsp),%edx + leal -1894007588(%r14,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 40(%rsp),%ebp addl %eax,%esi + roll $1,%edx andl %r11d,%ebx - roll $1,%ebp - addl %ebx,%esi - roll $30,%r11d - movl %ebp,52(%rsp) addl %ecx,%esi - movl 56(%rsp),%edx - movl %r11d,%eax 
- movl %r11d,%ebx - xorl 0(%rsp),%edx - andl %r12d,%eax + roll $30,%r11d + addl %ebx,%esi + xorl 56(%rsp),%ebp + movl %r12d,%eax + movl %edx,52(%rsp) + movl %r12d,%ebx + xorl 0(%rsp),%ebp + andl %r11d,%eax movl %esi,%ecx - xorl 24(%rsp),%edx - xorl %r12d,%ebx - leal -1894007588(%rbp,%r13,1),%r13d + xorl 24(%rsp),%ebp + leal -1894007588(%rdx,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 44(%rsp),%edx addl %eax,%r13d + roll $1,%ebp andl %edi,%ebx - roll $1,%edx - addl %ebx,%r13d - roll $30,%edi - movl %edx,56(%rsp) addl %ecx,%r13d - movl 60(%rsp),%ebp - movl %edi,%eax - movl %edi,%ebx - xorl 4(%rsp),%ebp - andl %r11d,%eax + roll $30,%edi + addl %ebx,%r13d + xorl 60(%rsp),%r14d + movl %r11d,%eax + movl %ebp,56(%rsp) + movl %r11d,%ebx + xorl 4(%rsp),%r14d + andl %edi,%eax movl %r13d,%ecx - xorl 28(%rsp),%ebp - xorl %r11d,%ebx - leal -1894007588(%rdx,%r12,1),%r12d + xorl 28(%rsp),%r14d + leal -1894007588(%rbp,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 48(%rsp),%ebp addl %eax,%r12d + roll $1,%r14d andl %esi,%ebx - roll $1,%ebp - addl %ebx,%r12d - roll $30,%esi - movl %ebp,60(%rsp) addl %ecx,%r12d - movl 0(%rsp),%edx - movl %esi,%eax - movl %esi,%ebx + roll $30,%esi + addl %ebx,%r12d + xorl 0(%rsp),%edx + movl %edi,%eax + movl %r14d,60(%rsp) + movl %edi,%ebx xorl 8(%rsp),%edx - andl %edi,%eax + andl %esi,%eax movl %r12d,%ecx xorl 32(%rsp),%edx - xorl %edi,%ebx - leal -1894007588(%rbp,%r11,1),%r11d + leal -1894007588(%r14,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 52(%rsp),%edx addl %eax,%r11d - andl %r13d,%ebx roll $1,%edx - addl %ebx,%r11d + andl %r13d,%ebx + addl %ecx,%r11d roll $30,%r13d + addl %ebx,%r11d + xorl 4(%rsp),%ebp + movl %esi,%eax movl %edx,0(%rsp) - addl %ecx,%r11d - movl 4(%rsp),%ebp - movl %r13d,%eax - movl %r13d,%ebx + movl %esi,%ebx xorl 12(%rsp),%ebp - andl %esi,%eax + andl %r13d,%eax movl %r11d,%ecx xorl 36(%rsp),%ebp - xorl %esi,%ebx leal -1894007588(%rdx,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 56(%rsp),%ebp addl %eax,%edi - andl %r12d,%ebx roll $1,%ebp - addl %ebx,%edi + andl %r12d,%ebx + addl %ecx,%edi roll $30,%r12d + addl %ebx,%edi + xorl 8(%rsp),%r14d + movl %r13d,%eax movl %ebp,4(%rsp) - addl %ecx,%edi - movl 8(%rsp),%edx - movl %r12d,%eax - movl %r12d,%ebx - xorl 16(%rsp),%edx - andl %r13d,%eax + movl %r13d,%ebx + xorl 16(%rsp),%r14d + andl %r12d,%eax movl %edi,%ecx - xorl 40(%rsp),%edx - xorl %r13d,%ebx + xorl 40(%rsp),%r14d leal -1894007588(%rbp,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 60(%rsp),%edx addl %eax,%esi + roll $1,%r14d andl %r11d,%ebx - roll $1,%edx - addl %ebx,%esi - roll $30,%r11d - movl %edx,8(%rsp) addl %ecx,%esi - movl 12(%rsp),%ebp - movl %r11d,%eax - movl %r11d,%ebx - xorl 20(%rsp),%ebp - andl %r12d,%eax + roll $30,%r11d + addl %ebx,%esi + xorl 12(%rsp),%edx + movl %r12d,%eax + movl %r14d,8(%rsp) + movl %r12d,%ebx + xorl 20(%rsp),%edx + andl %r11d,%eax movl %esi,%ecx - xorl 44(%rsp),%ebp - xorl %r12d,%ebx - leal -1894007588(%rdx,%r13,1),%r13d + xorl 44(%rsp),%edx + leal -1894007588(%r14,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 0(%rsp),%ebp addl %eax,%r13d + roll $1,%edx andl %edi,%ebx - roll $1,%ebp - addl %ebx,%r13d - roll $30,%edi - movl %ebp,12(%rsp) addl %ecx,%r13d - movl 16(%rsp),%edx - movl %edi,%eax - movl %edi,%ebx - xorl 24(%rsp),%edx - andl %r11d,%eax + roll $30,%edi + addl %ebx,%r13d + xorl 16(%rsp),%ebp + movl %r11d,%eax + movl %edx,12(%rsp) + movl %r11d,%ebx + xorl 24(%rsp),%ebp + andl %edi,%eax movl %r13d,%ecx - xorl 48(%rsp),%edx - xorl %r11d,%ebx - leal 
-1894007588(%rbp,%r12,1),%r12d + xorl 48(%rsp),%ebp + leal -1894007588(%rdx,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 4(%rsp),%edx addl %eax,%r12d + roll $1,%ebp andl %esi,%ebx - roll $1,%edx - addl %ebx,%r12d - roll $30,%esi - movl %edx,16(%rsp) addl %ecx,%r12d - movl 20(%rsp),%ebp - movl %esi,%eax - movl %esi,%ebx - xorl 28(%rsp),%ebp - andl %edi,%eax + roll $30,%esi + addl %ebx,%r12d + xorl 20(%rsp),%r14d + movl %edi,%eax + movl %ebp,16(%rsp) + movl %edi,%ebx + xorl 28(%rsp),%r14d + andl %esi,%eax movl %r12d,%ecx - xorl 52(%rsp),%ebp - xorl %edi,%ebx - leal -1894007588(%rdx,%r11,1),%r11d + xorl 52(%rsp),%r14d + leal -1894007588(%rbp,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 8(%rsp),%ebp addl %eax,%r11d + roll $1,%r14d andl %r13d,%ebx - roll $1,%ebp - addl %ebx,%r11d - roll $30,%r13d - movl %ebp,20(%rsp) addl %ecx,%r11d - movl 24(%rsp),%edx - movl %r13d,%eax - movl %r13d,%ebx + roll $30,%r13d + addl %ebx,%r11d + xorl 24(%rsp),%edx + movl %esi,%eax + movl %r14d,20(%rsp) + movl %esi,%ebx xorl 32(%rsp),%edx - andl %esi,%eax + andl %r13d,%eax movl %r11d,%ecx xorl 56(%rsp),%edx - xorl %esi,%ebx - leal -1894007588(%rbp,%rdi,1),%edi + leal -1894007588(%r14,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 12(%rsp),%edx addl %eax,%edi - andl %r12d,%ebx roll $1,%edx - addl %ebx,%edi + andl %r12d,%ebx + addl %ecx,%edi roll $30,%r12d + addl %ebx,%edi + xorl 28(%rsp),%ebp + movl %r13d,%eax movl %edx,24(%rsp) - addl %ecx,%edi - movl 28(%rsp),%ebp - movl %r12d,%eax - movl %r12d,%ebx + movl %r13d,%ebx xorl 36(%rsp),%ebp - andl %r13d,%eax + andl %r12d,%eax movl %edi,%ecx xorl 60(%rsp),%ebp - xorl %r13d,%ebx leal -1894007588(%rdx,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 16(%rsp),%ebp addl %eax,%esi - andl %r11d,%ebx roll $1,%ebp - addl %ebx,%esi + andl %r11d,%ebx + addl %ecx,%esi roll $30,%r11d + addl %ebx,%esi + xorl 32(%rsp),%r14d + movl %r12d,%eax movl %ebp,28(%rsp) - addl %ecx,%esi - movl 32(%rsp),%edx - movl %r11d,%eax - movl %r11d,%ebx - xorl 40(%rsp),%edx - andl %r12d,%eax + movl %r12d,%ebx + xorl 40(%rsp),%r14d + andl %r11d,%eax movl %esi,%ecx - xorl 0(%rsp),%edx - xorl %r12d,%ebx + xorl 0(%rsp),%r14d leal -1894007588(%rbp,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 20(%rsp),%edx addl %eax,%r13d + roll $1,%r14d andl %edi,%ebx - roll $1,%edx - addl %ebx,%r13d - roll $30,%edi - movl %edx,32(%rsp) addl %ecx,%r13d - movl 36(%rsp),%ebp - movl %edi,%eax - movl %edi,%ebx - xorl 44(%rsp),%ebp - andl %r11d,%eax + roll $30,%edi + addl %ebx,%r13d + xorl 36(%rsp),%edx + movl %r11d,%eax + movl %r14d,32(%rsp) + movl %r11d,%ebx + xorl 44(%rsp),%edx + andl %edi,%eax movl %r13d,%ecx - xorl 4(%rsp),%ebp - xorl %r11d,%ebx - leal -1894007588(%rdx,%r12,1),%r12d + xorl 4(%rsp),%edx + leal -1894007588(%r14,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 24(%rsp),%ebp addl %eax,%r12d + roll $1,%edx andl %esi,%ebx - roll $1,%ebp - addl %ebx,%r12d - roll $30,%esi - movl %ebp,36(%rsp) addl %ecx,%r12d - movl 40(%rsp),%edx - movl %esi,%eax - movl %esi,%ebx - xorl 48(%rsp),%edx - andl %edi,%eax + roll $30,%esi + addl %ebx,%r12d + xorl 40(%rsp),%ebp + movl %edi,%eax + movl %edx,36(%rsp) + movl %edi,%ebx + xorl 48(%rsp),%ebp + andl %esi,%eax movl %r12d,%ecx - xorl 8(%rsp),%edx - xorl %edi,%ebx - leal -1894007588(%rbp,%r11,1),%r11d + xorl 8(%rsp),%ebp + leal -1894007588(%rdx,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 28(%rsp),%edx addl %eax,%r11d + roll $1,%ebp andl %r13d,%ebx - roll $1,%edx - addl %ebx,%r11d - roll $30,%r13d - movl %edx,40(%rsp) addl %ecx,%r11d - movl 
44(%rsp),%ebp - movl %r13d,%eax - movl %r13d,%ebx - xorl 52(%rsp),%ebp - andl %esi,%eax + roll $30,%r13d + addl %ebx,%r11d + xorl 44(%rsp),%r14d + movl %esi,%eax + movl %ebp,40(%rsp) + movl %esi,%ebx + xorl 52(%rsp),%r14d + andl %r13d,%eax movl %r11d,%ecx - xorl 12(%rsp),%ebp - xorl %esi,%ebx - leal -1894007588(%rdx,%rdi,1),%edi + xorl 12(%rsp),%r14d + leal -1894007588(%rbp,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 32(%rsp),%ebp addl %eax,%edi + roll $1,%r14d andl %r12d,%ebx - roll $1,%ebp - addl %ebx,%edi - roll $30,%r12d - movl %ebp,44(%rsp) addl %ecx,%edi - movl 48(%rsp),%edx - movl %r12d,%eax - movl %r12d,%ebx + roll $30,%r12d + addl %ebx,%edi + xorl 48(%rsp),%edx + movl %r13d,%eax + movl %r14d,44(%rsp) + movl %r13d,%ebx xorl 56(%rsp),%edx - andl %r13d,%eax + andl %r12d,%eax movl %edi,%ecx xorl 16(%rsp),%edx - xorl %r13d,%ebx - leal -1894007588(%rbp,%rsi,1),%esi + leal -1894007588(%r14,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 36(%rsp),%edx addl %eax,%esi - andl %r11d,%ebx roll $1,%edx - addl %ebx,%esi + andl %r11d,%ebx + addl %ecx,%esi roll $30,%r11d + addl %ebx,%esi + xorl 52(%rsp),%ebp + movl %edi,%eax movl %edx,48(%rsp) - addl %ecx,%esi - movl 52(%rsp),%ebp - movl %r11d,%eax movl %esi,%ecx xorl 60(%rsp),%ebp - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal -899497514(%rdx,%r13,1),%r13d xorl 20(%rsp),%ebp - xorl %r12d,%eax + leal -899497514(%rdx,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 40(%rsp),%ebp roll $30,%edi addl %eax,%r13d roll $1,%ebp + xorl 56(%rsp),%r14d + movl %esi,%eax movl %ebp,52(%rsp) - movl 56(%rsp),%edx - movl %edi,%eax movl %r13d,%ecx - xorl 0(%rsp),%edx - xorl %esi,%eax + xorl 0(%rsp),%r14d + xorl %r11d,%eax roll $5,%ecx + xorl 24(%rsp),%r14d leal -899497514(%rbp,%r12,1),%r12d - xorl 24(%rsp),%edx - xorl %r11d,%eax + xorl %edi,%eax addl %ecx,%r12d - xorl 44(%rsp),%edx roll $30,%esi addl %eax,%r12d - roll $1,%edx - movl %edx,56(%rsp) - movl 60(%rsp),%ebp - movl %esi,%eax + roll $1,%r14d + xorl 60(%rsp),%edx + movl %r13d,%eax + movl %r14d,56(%rsp) movl %r12d,%ecx - xorl 4(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - leal -899497514(%rdx,%r11,1),%r11d - xorl 28(%rsp),%ebp + xorl 4(%rsp),%edx xorl %edi,%eax + roll $5,%ecx + xorl 28(%rsp),%edx + leal -899497514(%r14,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 48(%rsp),%ebp roll $30,%r13d addl %eax,%r11d - roll $1,%ebp - movl %ebp,60(%rsp) - movl 0(%rsp),%edx - movl %r13d,%eax + roll $1,%edx + xorl 0(%rsp),%ebp + movl %r12d,%eax + movl %edx,60(%rsp) movl %r11d,%ecx - xorl 8(%rsp),%edx - xorl %r12d,%eax - roll $5,%ecx - leal -899497514(%rbp,%rdi,1),%edi - xorl 32(%rsp),%edx + xorl 8(%rsp),%ebp xorl %esi,%eax + roll $5,%ecx + xorl 32(%rsp),%ebp + leal -899497514(%rdx,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 52(%rsp),%edx roll $30,%r12d addl %eax,%edi - roll $1,%edx - movl %edx,0(%rsp) - movl 4(%rsp),%ebp - movl %r12d,%eax + roll $1,%ebp + xorl 4(%rsp),%r14d + movl %r11d,%eax + movl %ebp,0(%rsp) movl %edi,%ecx - xorl 12(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - leal -899497514(%rdx,%rsi,1),%esi - xorl 36(%rsp),%ebp + xorl 12(%rsp),%r14d xorl %r13d,%eax + roll $5,%ecx + xorl 36(%rsp),%r14d + leal -899497514(%rbp,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 56(%rsp),%ebp roll $30,%r11d addl %eax,%esi - roll $1,%ebp - movl %ebp,4(%rsp) - movl 8(%rsp),%edx - movl %r11d,%eax + roll $1,%r14d + xorl 8(%rsp),%edx + movl %edi,%eax + movl %r14d,4(%rsp) movl %esi,%ecx xorl 16(%rsp),%edx - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal 
-899497514(%rbp,%r13,1),%r13d xorl 40(%rsp),%edx - xorl %r12d,%eax + leal -899497514(%r14,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 60(%rsp),%edx roll $30,%edi addl %eax,%r13d roll $1,%edx + xorl 12(%rsp),%ebp + movl %esi,%eax movl %edx,8(%rsp) - movl 12(%rsp),%ebp - movl %edi,%eax movl %r13d,%ecx xorl 20(%rsp),%ebp - xorl %esi,%eax + xorl %r11d,%eax roll $5,%ecx - leal -899497514(%rdx,%r12,1),%r12d xorl 44(%rsp),%ebp - xorl %r11d,%eax + leal -899497514(%rdx,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 0(%rsp),%ebp roll $30,%esi addl %eax,%r12d roll $1,%ebp + xorl 16(%rsp),%r14d + movl %r13d,%eax movl %ebp,12(%rsp) - movl 16(%rsp),%edx - movl %esi,%eax movl %r12d,%ecx - xorl 24(%rsp),%edx - xorl %r13d,%eax + xorl 24(%rsp),%r14d + xorl %edi,%eax roll $5,%ecx + xorl 48(%rsp),%r14d leal -899497514(%rbp,%r11,1),%r11d - xorl 48(%rsp),%edx - xorl %edi,%eax + xorl %esi,%eax addl %ecx,%r11d - xorl 4(%rsp),%edx roll $30,%r13d addl %eax,%r11d - roll $1,%edx - movl %edx,16(%rsp) - movl 20(%rsp),%ebp - movl %r13d,%eax + roll $1,%r14d + xorl 20(%rsp),%edx + movl %r12d,%eax + movl %r14d,16(%rsp) movl %r11d,%ecx - xorl 28(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - leal -899497514(%rdx,%rdi,1),%edi - xorl 52(%rsp),%ebp + xorl 28(%rsp),%edx xorl %esi,%eax + roll $5,%ecx + xorl 52(%rsp),%edx + leal -899497514(%r14,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 8(%rsp),%ebp roll $30,%r12d addl %eax,%edi - roll $1,%ebp - movl %ebp,20(%rsp) - movl 24(%rsp),%edx - movl %r12d,%eax + roll $1,%edx + xorl 24(%rsp),%ebp + movl %r11d,%eax + movl %edx,20(%rsp) movl %edi,%ecx - xorl 32(%rsp),%edx - xorl %r11d,%eax - roll $5,%ecx - leal -899497514(%rbp,%rsi,1),%esi - xorl 56(%rsp),%edx + xorl 32(%rsp),%ebp xorl %r13d,%eax + roll $5,%ecx + xorl 56(%rsp),%ebp + leal -899497514(%rdx,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 12(%rsp),%edx roll $30,%r11d addl %eax,%esi - roll $1,%edx - movl %edx,24(%rsp) - movl 28(%rsp),%ebp - movl %r11d,%eax + roll $1,%ebp + xorl 28(%rsp),%r14d + movl %edi,%eax + movl %ebp,24(%rsp) movl %esi,%ecx - xorl 36(%rsp),%ebp - xorl %edi,%eax - roll $5,%ecx - leal -899497514(%rdx,%r13,1),%r13d - xorl 60(%rsp),%ebp + xorl 36(%rsp),%r14d xorl %r12d,%eax + roll $5,%ecx + xorl 60(%rsp),%r14d + leal -899497514(%rbp,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 16(%rsp),%ebp roll $30,%edi addl %eax,%r13d - roll $1,%ebp - movl %ebp,28(%rsp) - movl 32(%rsp),%edx - movl %edi,%eax + roll $1,%r14d + xorl 32(%rsp),%edx + movl %esi,%eax + movl %r14d,28(%rsp) movl %r13d,%ecx xorl 40(%rsp),%edx - xorl %esi,%eax + xorl %r11d,%eax roll $5,%ecx - leal -899497514(%rbp,%r12,1),%r12d xorl 0(%rsp),%edx - xorl %r11d,%eax + leal -899497514(%r14,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 20(%rsp),%edx roll $30,%esi addl %eax,%r12d roll $1,%edx - movl %edx,32(%rsp) - movl 36(%rsp),%ebp - movl %esi,%eax + xorl 36(%rsp),%ebp + movl %r13d,%eax + movl %r12d,%ecx xorl 44(%rsp),%ebp - xorl %r13d,%eax + xorl %edi,%eax roll $5,%ecx - leal -899497514(%rdx,%r11,1),%r11d xorl 4(%rsp),%ebp - xorl %edi,%eax + leal -899497514(%rdx,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 24(%rsp),%ebp roll $30,%r13d addl %eax,%r11d roll $1,%ebp - movl %ebp,36(%rsp) - movl 40(%rsp),%edx - movl %r13d,%eax + xorl 40(%rsp),%r14d + movl %r12d,%eax + movl %r11d,%ecx - xorl 48(%rsp),%edx - xorl %r12d,%eax + xorl 48(%rsp),%r14d + xorl %esi,%eax roll $5,%ecx + xorl 8(%rsp),%r14d leal -899497514(%rbp,%rdi,1),%edi - xorl 8(%rsp),%edx - xorl %esi,%eax + xorl %r13d,%eax addl %ecx,%edi - xorl 
28(%rsp),%edx roll $30,%r12d addl %eax,%edi - roll $1,%edx - movl %edx,40(%rsp) - movl 44(%rsp),%ebp - movl %r12d,%eax + roll $1,%r14d + xorl 44(%rsp),%edx + movl %r11d,%eax + movl %edi,%ecx - xorl 52(%rsp),%ebp - xorl %r11d,%eax + xorl 52(%rsp),%edx + xorl %r13d,%eax roll $5,%ecx - leal -899497514(%rdx,%rsi,1),%esi - xorl 12(%rsp),%ebp - xorl %r13d,%eax + xorl 12(%rsp),%edx + leal -899497514(%r14,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 32(%rsp),%ebp roll $30,%r11d addl %eax,%esi - roll $1,%ebp - movl %ebp,44(%rsp) - movl 48(%rsp),%edx - movl %r11d,%eax + roll $1,%edx + xorl 48(%rsp),%ebp + movl %edi,%eax + movl %esi,%ecx - xorl 56(%rsp),%edx - xorl %edi,%eax - roll $5,%ecx - leal -899497514(%rbp,%r13,1),%r13d - xorl 16(%rsp),%edx + xorl 56(%rsp),%ebp xorl %r12d,%eax + roll $5,%ecx + xorl 16(%rsp),%ebp + leal -899497514(%rdx,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 36(%rsp),%edx roll $30,%edi addl %eax,%r13d - roll $1,%edx - movl %edx,48(%rsp) - movl 52(%rsp),%ebp - movl %edi,%eax + roll $1,%ebp + xorl 52(%rsp),%r14d + movl %esi,%eax + movl %r13d,%ecx - xorl 60(%rsp),%ebp - xorl %esi,%eax - roll $5,%ecx - leal -899497514(%rdx,%r12,1),%r12d - xorl 20(%rsp),%ebp + xorl 60(%rsp),%r14d xorl %r11d,%eax + roll $5,%ecx + xorl 20(%rsp),%r14d + leal -899497514(%rbp,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 40(%rsp),%ebp roll $30,%esi addl %eax,%r12d - roll $1,%ebp - movl 56(%rsp),%edx - movl %esi,%eax + roll $1,%r14d + xorl 56(%rsp),%edx + movl %r13d,%eax + movl %r12d,%ecx xorl 0(%rsp),%edx - xorl %r13d,%eax + xorl %edi,%eax roll $5,%ecx - leal -899497514(%rbp,%r11,1),%r11d xorl 24(%rsp),%edx - xorl %edi,%eax + leal -899497514(%r14,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 44(%rsp),%edx roll $30,%r13d addl %eax,%r11d roll $1,%edx - movl 60(%rsp),%ebp - movl %r13d,%eax + xorl 60(%rsp),%ebp + movl %r12d,%eax + movl %r11d,%ecx xorl 4(%rsp),%ebp - xorl %r12d,%eax + xorl %esi,%eax roll $5,%ecx - leal -899497514(%rdx,%rdi,1),%edi xorl 28(%rsp),%ebp - xorl %esi,%eax + leal -899497514(%rdx,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 48(%rsp),%ebp roll $30,%r12d addl %eax,%edi roll $1,%ebp - movl %r12d,%eax + movl %r11d,%eax movl %edi,%ecx - xorl %r11d,%eax + xorl %r13d,%eax leal -899497514(%rbp,%rsi,1),%esi roll $5,%ecx - xorl %r13d,%eax + xorl %r12d,%eax addl %ecx,%esi roll $30,%r11d addl %eax,%esi @@ -1327,16 +1285,216 @@ sha1_block_data_order: jnz .Lloop movq 64(%rsp),%rsi - movq (%rsi),%r13 - movq 8(%rsi),%r12 - movq 16(%rsi),%rbp - movq 24(%rsi),%rbx - leaq 32(%rsi),%rsp + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + .Lepilogue: movq 8(%rsp),%rdi movq 16(%rsp),%rsi .byte 0xf3,0xc3 + .LSEH_end_sha1_block_data_order: +.def sha1_block_data_order_shaext; .scl 3; .type 32; .endef +.p2align 5 +sha1_block_data_order_shaext: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_sha1_block_data_order_shaext: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + +_shaext_shortcut: + + leaq -72(%rsp),%rsp + movaps %xmm6,-8-64(%rax) + movaps %xmm7,-8-48(%rax) + movaps %xmm8,-8-32(%rax) + movaps %xmm9,-8-16(%rax) +.Lprologue_shaext: + movdqu (%rdi),%xmm0 + movd 16(%rdi),%xmm1 + movdqa K_XX_XX+160(%rip),%xmm3 + + movdqu (%rsi),%xmm4 + pshufd $27,%xmm0,%xmm0 + movdqu 16(%rsi),%xmm5 + pshufd $27,%xmm1,%xmm1 + movdqu 32(%rsi),%xmm6 +.byte 102,15,56,0,227 + movdqu 48(%rsi),%xmm7 +.byte 102,15,56,0,235 +.byte 102,15,56,0,243 + movdqa %xmm1,%xmm9 +.byte 
102,15,56,0,251 + jmp .Loop_shaext + +.p2align 4 +.Loop_shaext: + decq %rdx + leaq 64(%rsi),%r8 + paddd %xmm4,%xmm1 + cmovneq %r8,%rsi + movdqa %xmm0,%xmm8 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,0 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,0 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,1 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,1 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,2 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,2 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 + movdqu (%rsi),%xmm4 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,3 +.byte 15,56,200,213 + movdqu 16(%rsi),%xmm5 +.byte 102,15,56,0,227 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 15,56,200,206 + movdqu 32(%rsi),%xmm6 +.byte 102,15,56,0,235 + + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,3 +.byte 15,56,200,215 + movdqu 48(%rsi),%xmm7 +.byte 102,15,56,0,243 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 65,15,56,200,201 +.byte 102,15,56,0,251 + + paddd %xmm8,%xmm0 + movdqa %xmm1,%xmm9 + + jnz .Loop_shaext + + pshufd $27,%xmm0,%xmm0 + pshufd $27,%xmm1,%xmm1 + movdqu %xmm0,(%rdi) + movd %xmm1,16(%rdi) + movaps -8-64(%rax),%xmm6 + movaps -8-48(%rax),%xmm7 + movaps -8-32(%rax),%xmm8 + movaps -8-16(%rax),%xmm9 + movq %rax,%rsp +.Lepilogue_shaext: + + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_sha1_block_data_order_shaext: .def sha1_block_data_order_ssse3; .scl 3; .type 32; .endef .p2align 4 sha1_block_data_order_ssse3: @@ -1349,24 +1507,35 @@ sha1_block_data_order_ssse3: movq %r8,%rdx _ssse3_shortcut: + + movq %rsp,%r11 + pushq %rbx + pushq %rbp + pushq %r12 + + pushq %r13 + + pushq %r14 + leaq -160(%rsp),%rsp - movaps %xmm6,64+0(%rsp) - movaps %xmm7,64+16(%rsp) - movaps %xmm8,64+32(%rsp) - movaps %xmm9,64+48(%rsp) - movaps %xmm10,64+64(%rsp) - movaps %xmm11,64+80(%rsp) + movaps %xmm6,-40-96(%r11) + movaps 
%xmm7,-40-80(%r11) + movaps %xmm8,-40-64(%r11) + movaps %xmm9,-40-48(%r11) + movaps %xmm10,-40-32(%r11) + movaps %xmm11,-40-16(%r11) .Lprologue_ssse3: + andq $-64,%rsp movq %rdi,%r8 movq %rsi,%r9 movq %rdx,%r10 shlq $6,%r10 addq %r9,%r10 - leaq K_XX_XX(%rip),%r11 + leaq K_XX_XX+64(%rip),%r14 movl 0(%r8),%eax movl 4(%r8),%ebx @@ -1378,18 +1547,18 @@ _ssse3_shortcut: xorl %edx,%edi andl %edi,%esi - movdqa 64(%r11),%xmm6 - movdqa 0(%r11),%xmm9 + movdqa 64(%r14),%xmm6 + movdqa -64(%r14),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 movdqu 48(%r9),%xmm3 .byte 102,15,56,0,198 - addq $64,%r9 .byte 102,15,56,0,206 .byte 102,15,56,0,214 -.byte 102,15,56,0,222 + addq $64,%r9 paddd %xmm9,%xmm0 +.byte 102,15,56,0,222 paddd %xmm9,%xmm1 paddd %xmm9,%xmm2 movdqa %xmm0,0(%rsp) @@ -1401,24 +1570,24 @@ _ssse3_shortcut: jmp .Loop_ssse3 .p2align 4 .Loop_ssse3: - movdqa %xmm1,%xmm4 rorl $2,%ebx + pshufd $238,%xmm0,%xmm4 xorl %edx,%esi movdqa %xmm3,%xmm8 -.byte 102,15,58,15,224,8 + paddd %xmm3,%xmm9 movl %eax,%edi addl 0(%rsp),%ebp - paddd %xmm3,%xmm9 + punpcklqdq %xmm1,%xmm4 xorl %ecx,%ebx roll $5,%eax - psrldq $4,%xmm8 addl %esi,%ebp + psrldq $4,%xmm8 andl %ebx,%edi - pxor %xmm0,%xmm4 xorl %ecx,%ebx + pxor %xmm0,%xmm4 addl %eax,%ebp - pxor %xmm2,%xmm8 rorl $7,%eax + pxor %xmm2,%xmm8 xorl %ecx,%edi movl %ebp,%esi addl 4(%rsp),%edx @@ -1429,57 +1598,57 @@ _ssse3_shortcut: addl %edi,%edx andl %eax,%esi movdqa %xmm4,%xmm10 - movdqa %xmm4,%xmm8 xorl %ebx,%eax addl %ebp,%edx rorl $7,%ebp + movdqa %xmm4,%xmm8 xorl %ebx,%esi pslldq $12,%xmm10 paddd %xmm4,%xmm4 movl %edx,%edi addl 8(%rsp),%ecx + psrld $31,%xmm8 xorl %eax,%ebp roll $5,%edx - psrld $31,%xmm8 addl %esi,%ecx - andl %ebp,%edi movdqa %xmm10,%xmm9 + andl %ebp,%edi xorl %eax,%ebp - addl %edx,%ecx psrld $30,%xmm10 - por %xmm8,%xmm4 + addl %edx,%ecx rorl $7,%edx + por %xmm8,%xmm4 xorl %eax,%edi movl %ecx,%esi addl 12(%rsp),%ebx pslld $2,%xmm9 pxor %xmm10,%xmm4 xorl %ebp,%edx + movdqa -64(%r14),%xmm10 roll $5,%ecx - movdqa 0(%r11),%xmm10 addl %edi,%ebx andl %edx,%esi pxor %xmm9,%xmm4 xorl %ebp,%edx addl %ecx,%ebx - movdqa %xmm2,%xmm5 rorl $7,%ecx + pshufd $238,%xmm1,%xmm5 xorl %ebp,%esi movdqa %xmm4,%xmm9 -.byte 102,15,58,15,233,8 + paddd %xmm4,%xmm10 movl %ebx,%edi addl 16(%rsp),%eax - paddd %xmm4,%xmm10 + punpcklqdq %xmm2,%xmm5 xorl %edx,%ecx roll $5,%ebx - psrldq $4,%xmm9 addl %esi,%eax + psrldq $4,%xmm9 andl %ecx,%edi - pxor %xmm1,%xmm5 xorl %edx,%ecx + pxor %xmm1,%xmm5 addl %ebx,%eax - pxor %xmm3,%xmm9 rorl $7,%ebx + pxor %xmm3,%xmm9 xorl %edx,%edi movl %eax,%esi addl 20(%rsp),%ebp @@ -1490,57 +1659,57 @@ _ssse3_shortcut: addl %edi,%ebp andl %ebx,%esi movdqa %xmm5,%xmm8 - movdqa %xmm5,%xmm9 xorl %ecx,%ebx addl %eax,%ebp rorl $7,%eax + movdqa %xmm5,%xmm9 xorl %ecx,%esi pslldq $12,%xmm8 paddd %xmm5,%xmm5 movl %ebp,%edi addl 24(%rsp),%edx + psrld $31,%xmm9 xorl %ebx,%eax roll $5,%ebp - psrld $31,%xmm9 addl %esi,%edx - andl %eax,%edi movdqa %xmm8,%xmm10 + andl %eax,%edi xorl %ebx,%eax - addl %ebp,%edx psrld $30,%xmm8 - por %xmm9,%xmm5 + addl %ebp,%edx rorl $7,%ebp + por %xmm9,%xmm5 xorl %ebx,%edi movl %edx,%esi addl 28(%rsp),%ecx pslld $2,%xmm10 pxor %xmm8,%xmm5 xorl %eax,%ebp + movdqa -32(%r14),%xmm8 roll $5,%edx - movdqa 16(%r11),%xmm8 addl %edi,%ecx andl %ebp,%esi pxor %xmm10,%xmm5 xorl %eax,%ebp addl %edx,%ecx - movdqa %xmm3,%xmm6 rorl $7,%edx + pshufd $238,%xmm2,%xmm6 xorl %eax,%esi movdqa %xmm5,%xmm10 -.byte 102,15,58,15,242,8 + paddd %xmm5,%xmm8 movl %ecx,%edi addl 32(%rsp),%ebx - paddd %xmm5,%xmm8 + punpcklqdq %xmm3,%xmm6 xorl 
%ebp,%edx roll $5,%ecx - psrldq $4,%xmm10 addl %esi,%ebx + psrldq $4,%xmm10 andl %edx,%edi - pxor %xmm2,%xmm6 xorl %ebp,%edx + pxor %xmm2,%xmm6 addl %ecx,%ebx - pxor %xmm4,%xmm10 rorl $7,%ecx + pxor %xmm4,%xmm10 xorl %ebp,%edi movl %ebx,%esi addl 36(%rsp),%eax @@ -1551,57 +1720,57 @@ _ssse3_shortcut: addl %edi,%eax andl %ecx,%esi movdqa %xmm6,%xmm9 - movdqa %xmm6,%xmm10 xorl %edx,%ecx addl %ebx,%eax rorl $7,%ebx + movdqa %xmm6,%xmm10 xorl %edx,%esi pslldq $12,%xmm9 paddd %xmm6,%xmm6 movl %eax,%edi addl 40(%rsp),%ebp + psrld $31,%xmm10 xorl %ecx,%ebx roll $5,%eax - psrld $31,%xmm10 addl %esi,%ebp - andl %ebx,%edi movdqa %xmm9,%xmm8 + andl %ebx,%edi xorl %ecx,%ebx - addl %eax,%ebp psrld $30,%xmm9 - por %xmm10,%xmm6 + addl %eax,%ebp rorl $7,%eax + por %xmm10,%xmm6 xorl %ecx,%edi movl %ebp,%esi addl 44(%rsp),%edx pslld $2,%xmm8 pxor %xmm9,%xmm6 xorl %ebx,%eax + movdqa -32(%r14),%xmm9 roll $5,%ebp - movdqa 16(%r11),%xmm9 addl %edi,%edx andl %eax,%esi pxor %xmm8,%xmm6 xorl %ebx,%eax addl %ebp,%edx - movdqa %xmm4,%xmm7 rorl $7,%ebp + pshufd $238,%xmm3,%xmm7 xorl %ebx,%esi movdqa %xmm6,%xmm8 -.byte 102,15,58,15,251,8 + paddd %xmm6,%xmm9 movl %edx,%edi addl 48(%rsp),%ecx - paddd %xmm6,%xmm9 + punpcklqdq %xmm4,%xmm7 xorl %eax,%ebp roll $5,%edx - psrldq $4,%xmm8 addl %esi,%ecx + psrldq $4,%xmm8 andl %ebp,%edi - pxor %xmm3,%xmm7 xorl %eax,%ebp + pxor %xmm3,%xmm7 addl %edx,%ecx - pxor %xmm5,%xmm8 rorl $7,%edx + pxor %xmm5,%xmm8 xorl %eax,%edi movl %ecx,%esi addl 52(%rsp),%ebx @@ -1612,78 +1781,78 @@ _ssse3_shortcut: addl %edi,%ebx andl %edx,%esi movdqa %xmm7,%xmm10 - movdqa %xmm7,%xmm8 xorl %ebp,%edx addl %ecx,%ebx rorl $7,%ecx + movdqa %xmm7,%xmm8 xorl %ebp,%esi pslldq $12,%xmm10 paddd %xmm7,%xmm7 movl %ebx,%edi addl 56(%rsp),%eax + psrld $31,%xmm8 xorl %edx,%ecx roll $5,%ebx - psrld $31,%xmm8 addl %esi,%eax - andl %ecx,%edi movdqa %xmm10,%xmm9 + andl %ecx,%edi xorl %edx,%ecx - addl %ebx,%eax psrld $30,%xmm10 - por %xmm8,%xmm7 + addl %ebx,%eax rorl $7,%ebx + por %xmm8,%xmm7 xorl %edx,%edi movl %eax,%esi addl 60(%rsp),%ebp pslld $2,%xmm9 pxor %xmm10,%xmm7 xorl %ecx,%ebx + movdqa -32(%r14),%xmm10 roll $5,%eax - movdqa 16(%r11),%xmm10 addl %edi,%ebp andl %ebx,%esi pxor %xmm9,%xmm7 + pshufd $238,%xmm6,%xmm9 xorl %ecx,%ebx addl %eax,%ebp - movdqa %xmm7,%xmm9 rorl $7,%eax pxor %xmm4,%xmm0 -.byte 102,68,15,58,15,206,8 xorl %ecx,%esi movl %ebp,%edi addl 0(%rsp),%edx - pxor %xmm1,%xmm0 + punpcklqdq %xmm7,%xmm9 xorl %ebx,%eax roll $5,%ebp - movdqa %xmm10,%xmm8 - paddd %xmm7,%xmm10 + pxor %xmm1,%xmm0 addl %esi,%edx andl %eax,%edi - pxor %xmm9,%xmm0 + movdqa %xmm10,%xmm8 xorl %ebx,%eax + paddd %xmm7,%xmm10 addl %ebp,%edx + pxor %xmm9,%xmm0 rorl $7,%ebp xorl %ebx,%edi - movdqa %xmm0,%xmm9 - movdqa %xmm10,48(%rsp) movl %edx,%esi addl 4(%rsp),%ecx + movdqa %xmm0,%xmm9 xorl %eax,%ebp roll $5,%edx - pslld $2,%xmm0 + movdqa %xmm10,48(%rsp) addl %edi,%ecx andl %ebp,%esi - psrld $30,%xmm9 xorl %eax,%ebp + pslld $2,%xmm0 addl %edx,%ecx rorl $7,%edx + psrld $30,%xmm9 xorl %eax,%esi movl %ecx,%edi addl 8(%rsp),%ebx por %xmm9,%xmm0 xorl %ebp,%edx roll $5,%ecx - movdqa %xmm0,%xmm10 + pshufd $238,%xmm7,%xmm10 addl %esi,%ebx andl %edx,%edi xorl %ebp,%edx @@ -1696,18 +1865,18 @@ _ssse3_shortcut: xorl %edx,%esi rorl $7,%ecx addl %ebx,%eax - addl 16(%rsp),%ebp pxor %xmm5,%xmm1 -.byte 102,68,15,58,15,215,8 + addl 16(%rsp),%ebp xorl %ecx,%esi + punpcklqdq %xmm0,%xmm10 movl %eax,%edi roll $5,%eax pxor %xmm2,%xmm1 addl %esi,%ebp xorl %ecx,%edi movdqa %xmm8,%xmm9 - paddd %xmm0,%xmm8 rorl $7,%ebx + paddd %xmm0,%xmm8 addl %eax,%ebp pxor 
%xmm10,%xmm1 addl 20(%rsp),%edx @@ -1715,43 +1884,43 @@ _ssse3_shortcut: movl %ebp,%esi roll $5,%ebp movdqa %xmm1,%xmm10 - movdqa %xmm8,0(%rsp) addl %edi,%edx xorl %ebx,%esi + movdqa %xmm8,0(%rsp) rorl $7,%eax addl %ebp,%edx - pslld $2,%xmm1 addl 24(%rsp),%ecx + pslld $2,%xmm1 xorl %eax,%esi - psrld $30,%xmm10 movl %edx,%edi + psrld $30,%xmm10 roll $5,%edx addl %esi,%ecx xorl %eax,%edi rorl $7,%ebp - addl %edx,%ecx por %xmm10,%xmm1 + addl %edx,%ecx addl 28(%rsp),%ebx + pshufd $238,%xmm0,%xmm8 xorl %ebp,%edi - movdqa %xmm1,%xmm8 movl %ecx,%esi roll $5,%ecx addl %edi,%ebx xorl %ebp,%esi rorl $7,%edx addl %ecx,%ebx - addl 32(%rsp),%eax pxor %xmm6,%xmm2 -.byte 102,68,15,58,15,192,8 + addl 32(%rsp),%eax xorl %edx,%esi + punpcklqdq %xmm1,%xmm8 movl %ebx,%edi roll $5,%ebx pxor %xmm3,%xmm2 addl %esi,%eax xorl %edx,%edi - movdqa 32(%r11),%xmm10 - paddd %xmm1,%xmm9 + movdqa 0(%r14),%xmm10 rorl $7,%ecx + paddd %xmm1,%xmm9 addl %ebx,%eax pxor %xmm8,%xmm2 addl 36(%rsp),%ebp @@ -1759,43 +1928,43 @@ _ssse3_shortcut: movl %eax,%esi roll $5,%eax movdqa %xmm2,%xmm8 - movdqa %xmm9,16(%rsp) addl %edi,%ebp xorl %ecx,%esi + movdqa %xmm9,16(%rsp) rorl $7,%ebx addl %eax,%ebp - pslld $2,%xmm2 addl 40(%rsp),%edx + pslld $2,%xmm2 xorl %ebx,%esi - psrld $30,%xmm8 movl %ebp,%edi + psrld $30,%xmm8 roll $5,%ebp addl %esi,%edx xorl %ebx,%edi rorl $7,%eax - addl %ebp,%edx por %xmm8,%xmm2 + addl %ebp,%edx addl 44(%rsp),%ecx + pshufd $238,%xmm1,%xmm9 xorl %eax,%edi - movdqa %xmm2,%xmm9 movl %edx,%esi roll $5,%edx addl %edi,%ecx xorl %eax,%esi rorl $7,%ebp addl %edx,%ecx - addl 48(%rsp),%ebx pxor %xmm7,%xmm3 -.byte 102,68,15,58,15,201,8 + addl 48(%rsp),%ebx xorl %ebp,%esi + punpcklqdq %xmm2,%xmm9 movl %ecx,%edi roll $5,%ecx pxor %xmm4,%xmm3 addl %esi,%ebx xorl %ebp,%edi movdqa %xmm10,%xmm8 - paddd %xmm2,%xmm10 rorl $7,%edx + paddd %xmm2,%xmm10 addl %ecx,%ebx pxor %xmm9,%xmm3 addl 52(%rsp),%eax @@ -1803,43 +1972,43 @@ _ssse3_shortcut: movl %ebx,%esi roll $5,%ebx movdqa %xmm3,%xmm9 - movdqa %xmm10,32(%rsp) addl %edi,%eax xorl %edx,%esi + movdqa %xmm10,32(%rsp) rorl $7,%ecx addl %ebx,%eax - pslld $2,%xmm3 addl 56(%rsp),%ebp + pslld $2,%xmm3 xorl %ecx,%esi - psrld $30,%xmm9 movl %eax,%edi + psrld $30,%xmm9 roll $5,%eax addl %esi,%ebp xorl %ecx,%edi rorl $7,%ebx - addl %eax,%ebp por %xmm9,%xmm3 + addl %eax,%ebp addl 60(%rsp),%edx + pshufd $238,%xmm2,%xmm10 xorl %ebx,%edi - movdqa %xmm3,%xmm10 movl %ebp,%esi roll $5,%ebp addl %edi,%edx xorl %ebx,%esi rorl $7,%eax addl %ebp,%edx - addl 0(%rsp),%ecx pxor %xmm0,%xmm4 -.byte 102,68,15,58,15,210,8 + addl 0(%rsp),%ecx xorl %eax,%esi + punpcklqdq %xmm3,%xmm10 movl %edx,%edi roll $5,%edx pxor %xmm5,%xmm4 addl %esi,%ecx xorl %eax,%edi movdqa %xmm8,%xmm9 - paddd %xmm3,%xmm8 rorl $7,%ebp + paddd %xmm3,%xmm8 addl %edx,%ecx pxor %xmm10,%xmm4 addl 4(%rsp),%ebx @@ -1847,43 +2016,43 @@ _ssse3_shortcut: movl %ecx,%esi roll $5,%ecx movdqa %xmm4,%xmm10 - movdqa %xmm8,48(%rsp) addl %edi,%ebx xorl %ebp,%esi + movdqa %xmm8,48(%rsp) rorl $7,%edx addl %ecx,%ebx - pslld $2,%xmm4 addl 8(%rsp),%eax + pslld $2,%xmm4 xorl %edx,%esi - psrld $30,%xmm10 movl %ebx,%edi + psrld $30,%xmm10 roll $5,%ebx addl %esi,%eax xorl %edx,%edi rorl $7,%ecx - addl %ebx,%eax por %xmm10,%xmm4 + addl %ebx,%eax addl 12(%rsp),%ebp + pshufd $238,%xmm3,%xmm8 xorl %ecx,%edi - movdqa %xmm4,%xmm8 movl %eax,%esi roll $5,%eax addl %edi,%ebp xorl %ecx,%esi rorl $7,%ebx addl %eax,%ebp - addl 16(%rsp),%edx pxor %xmm1,%xmm5 -.byte 102,68,15,58,15,195,8 + addl 16(%rsp),%edx xorl %ebx,%esi + punpcklqdq %xmm4,%xmm8 movl %ebp,%edi roll $5,%ebp 
pxor %xmm6,%xmm5 addl %esi,%edx xorl %ebx,%edi movdqa %xmm9,%xmm10 - paddd %xmm4,%xmm9 rorl $7,%eax + paddd %xmm4,%xmm9 addl %ebp,%edx pxor %xmm8,%xmm5 addl 20(%rsp),%ecx @@ -1891,24 +2060,24 @@ _ssse3_shortcut: movl %edx,%esi roll $5,%edx movdqa %xmm5,%xmm8 - movdqa %xmm9,0(%rsp) addl %edi,%ecx xorl %eax,%esi + movdqa %xmm9,0(%rsp) rorl $7,%ebp addl %edx,%ecx - pslld $2,%xmm5 addl 24(%rsp),%ebx + pslld $2,%xmm5 xorl %ebp,%esi - psrld $30,%xmm8 movl %ecx,%edi + psrld $30,%xmm8 roll $5,%ecx addl %esi,%ebx xorl %ebp,%edi rorl $7,%edx - addl %ecx,%ebx por %xmm8,%xmm5 + addl %ecx,%ebx addl 28(%rsp),%eax - movdqa %xmm5,%xmm9 + pshufd $238,%xmm4,%xmm9 rorl $7,%ecx movl %ebx,%esi xorl %edx,%edi @@ -1917,47 +2086,47 @@ _ssse3_shortcut: xorl %ecx,%esi xorl %edx,%ecx addl %ebx,%eax - addl 32(%rsp),%ebp pxor %xmm2,%xmm6 -.byte 102,68,15,58,15,204,8 + addl 32(%rsp),%ebp andl %ecx,%esi xorl %edx,%ecx rorl $7,%ebx - pxor %xmm7,%xmm6 + punpcklqdq %xmm5,%xmm9 movl %eax,%edi xorl %ecx,%esi - movdqa %xmm10,%xmm8 - paddd %xmm5,%xmm10 + pxor %xmm7,%xmm6 roll $5,%eax addl %esi,%ebp - pxor %xmm9,%xmm6 + movdqa %xmm10,%xmm8 xorl %ebx,%edi + paddd %xmm5,%xmm10 xorl %ecx,%ebx + pxor %xmm9,%xmm6 addl %eax,%ebp addl 36(%rsp),%edx - movdqa %xmm6,%xmm9 - movdqa %xmm10,16(%rsp) andl %ebx,%edi xorl %ecx,%ebx rorl $7,%eax + movdqa %xmm6,%xmm9 movl %ebp,%esi - pslld $2,%xmm6 xorl %ebx,%edi + movdqa %xmm10,16(%rsp) roll $5,%ebp - psrld $30,%xmm9 addl %edi,%edx xorl %eax,%esi + pslld $2,%xmm6 xorl %ebx,%eax addl %ebp,%edx + psrld $30,%xmm9 addl 40(%rsp),%ecx andl %eax,%esi - por %xmm9,%xmm6 xorl %ebx,%eax + por %xmm9,%xmm6 rorl $7,%ebp - movdqa %xmm6,%xmm10 movl %edx,%edi xorl %eax,%esi roll $5,%edx + pshufd $238,%xmm5,%xmm10 addl %esi,%ecx xorl %ebp,%edi xorl %eax,%ebp @@ -1973,47 +2142,47 @@ _ssse3_shortcut: xorl %edx,%esi xorl %ebp,%edx addl %ecx,%ebx - addl 48(%rsp),%eax pxor %xmm3,%xmm7 -.byte 102,68,15,58,15,213,8 + addl 48(%rsp),%eax andl %edx,%esi xorl %ebp,%edx rorl $7,%ecx - pxor %xmm0,%xmm7 + punpcklqdq %xmm6,%xmm10 movl %ebx,%edi xorl %edx,%esi - movdqa 48(%r11),%xmm9 - paddd %xmm6,%xmm8 + pxor %xmm0,%xmm7 roll $5,%ebx addl %esi,%eax - pxor %xmm10,%xmm7 + movdqa 32(%r14),%xmm9 xorl %ecx,%edi + paddd %xmm6,%xmm8 xorl %edx,%ecx + pxor %xmm10,%xmm7 addl %ebx,%eax addl 52(%rsp),%ebp - movdqa %xmm7,%xmm10 - movdqa %xmm8,32(%rsp) andl %ecx,%edi xorl %edx,%ecx rorl $7,%ebx + movdqa %xmm7,%xmm10 movl %eax,%esi - pslld $2,%xmm7 xorl %ecx,%edi + movdqa %xmm8,32(%rsp) roll $5,%eax - psrld $30,%xmm10 addl %edi,%ebp xorl %ebx,%esi + pslld $2,%xmm7 xorl %ecx,%ebx addl %eax,%ebp + psrld $30,%xmm10 addl 56(%rsp),%edx andl %ebx,%esi - por %xmm10,%xmm7 xorl %ecx,%ebx + por %xmm10,%xmm7 rorl $7,%eax - movdqa %xmm7,%xmm8 movl %ebp,%edi xorl %ebx,%esi roll $5,%ebp + pshufd $238,%xmm6,%xmm8 addl %esi,%edx xorl %eax,%edi xorl %ebx,%eax @@ -2029,47 +2198,47 @@ _ssse3_shortcut: xorl %ebp,%esi xorl %eax,%ebp addl %edx,%ecx - addl 0(%rsp),%ebx pxor %xmm4,%xmm0 -.byte 102,68,15,58,15,198,8 + addl 0(%rsp),%ebx andl %ebp,%esi xorl %eax,%ebp rorl $7,%edx - pxor %xmm1,%xmm0 + punpcklqdq %xmm7,%xmm8 movl %ecx,%edi xorl %ebp,%esi - movdqa %xmm9,%xmm10 - paddd %xmm7,%xmm9 + pxor %xmm1,%xmm0 roll $5,%ecx addl %esi,%ebx - pxor %xmm8,%xmm0 + movdqa %xmm9,%xmm10 xorl %edx,%edi + paddd %xmm7,%xmm9 xorl %ebp,%edx + pxor %xmm8,%xmm0 addl %ecx,%ebx addl 4(%rsp),%eax - movdqa %xmm0,%xmm8 - movdqa %xmm9,48(%rsp) andl %edx,%edi xorl %ebp,%edx rorl $7,%ecx + movdqa %xmm0,%xmm8 movl %ebx,%esi - pslld $2,%xmm0 xorl %edx,%edi + movdqa %xmm9,48(%rsp) roll 
$5,%ebx - psrld $30,%xmm8 addl %edi,%eax xorl %ecx,%esi + pslld $2,%xmm0 xorl %edx,%ecx addl %ebx,%eax + psrld $30,%xmm8 addl 8(%rsp),%ebp andl %ecx,%esi - por %xmm8,%xmm0 xorl %edx,%ecx + por %xmm8,%xmm0 rorl $7,%ebx - movdqa %xmm0,%xmm9 movl %eax,%edi xorl %ecx,%esi roll $5,%eax + pshufd $238,%xmm7,%xmm9 addl %esi,%ebp xorl %ebx,%edi xorl %ecx,%ebx @@ -2085,47 +2254,47 @@ _ssse3_shortcut: xorl %eax,%esi xorl %ebx,%eax addl %ebp,%edx - addl 16(%rsp),%ecx pxor %xmm5,%xmm1 -.byte 102,68,15,58,15,207,8 + addl 16(%rsp),%ecx andl %eax,%esi xorl %ebx,%eax rorl $7,%ebp - pxor %xmm2,%xmm1 + punpcklqdq %xmm0,%xmm9 movl %edx,%edi xorl %eax,%esi - movdqa %xmm10,%xmm8 - paddd %xmm0,%xmm10 + pxor %xmm2,%xmm1 roll $5,%edx addl %esi,%ecx - pxor %xmm9,%xmm1 + movdqa %xmm10,%xmm8 xorl %ebp,%edi + paddd %xmm0,%xmm10 xorl %eax,%ebp + pxor %xmm9,%xmm1 addl %edx,%ecx addl 20(%rsp),%ebx - movdqa %xmm1,%xmm9 - movdqa %xmm10,0(%rsp) andl %ebp,%edi xorl %eax,%ebp rorl $7,%edx + movdqa %xmm1,%xmm9 movl %ecx,%esi - pslld $2,%xmm1 xorl %ebp,%edi + movdqa %xmm10,0(%rsp) roll $5,%ecx - psrld $30,%xmm9 addl %edi,%ebx xorl %edx,%esi + pslld $2,%xmm1 xorl %ebp,%edx addl %ecx,%ebx + psrld $30,%xmm9 addl 24(%rsp),%eax andl %edx,%esi - por %xmm9,%xmm1 xorl %ebp,%edx + por %xmm9,%xmm1 rorl $7,%ecx - movdqa %xmm1,%xmm10 movl %ebx,%edi xorl %edx,%esi roll $5,%ebx + pshufd $238,%xmm0,%xmm10 addl %esi,%eax xorl %ecx,%edi xorl %edx,%ecx @@ -2141,47 +2310,47 @@ _ssse3_shortcut: xorl %ebx,%esi xorl %ecx,%ebx addl %eax,%ebp - addl 32(%rsp),%edx pxor %xmm6,%xmm2 -.byte 102,68,15,58,15,208,8 + addl 32(%rsp),%edx andl %ebx,%esi xorl %ecx,%ebx rorl $7,%eax - pxor %xmm3,%xmm2 + punpcklqdq %xmm1,%xmm10 movl %ebp,%edi xorl %ebx,%esi - movdqa %xmm8,%xmm9 - paddd %xmm1,%xmm8 + pxor %xmm3,%xmm2 roll $5,%ebp addl %esi,%edx - pxor %xmm10,%xmm2 + movdqa %xmm8,%xmm9 xorl %eax,%edi + paddd %xmm1,%xmm8 xorl %ebx,%eax + pxor %xmm10,%xmm2 addl %ebp,%edx addl 36(%rsp),%ecx - movdqa %xmm2,%xmm10 - movdqa %xmm8,16(%rsp) andl %eax,%edi xorl %ebx,%eax rorl $7,%ebp + movdqa %xmm2,%xmm10 movl %edx,%esi - pslld $2,%xmm2 xorl %eax,%edi + movdqa %xmm8,16(%rsp) roll $5,%edx - psrld $30,%xmm10 addl %edi,%ecx xorl %ebp,%esi + pslld $2,%xmm2 xorl %eax,%ebp addl %edx,%ecx + psrld $30,%xmm10 addl 40(%rsp),%ebx andl %ebp,%esi - por %xmm10,%xmm2 xorl %eax,%ebp + por %xmm10,%xmm2 rorl $7,%edx - movdqa %xmm2,%xmm8 movl %ecx,%edi xorl %ebp,%esi roll $5,%ecx + pshufd $238,%xmm1,%xmm8 addl %esi,%ebx xorl %edx,%edi xorl %ebp,%edx @@ -2196,18 +2365,18 @@ _ssse3_shortcut: addl %edi,%eax xorl %edx,%esi addl %ebx,%eax - addl 48(%rsp),%ebp pxor %xmm7,%xmm3 -.byte 102,68,15,58,15,193,8 + addl 48(%rsp),%ebp xorl %ecx,%esi + punpcklqdq %xmm2,%xmm8 movl %eax,%edi roll $5,%eax pxor %xmm4,%xmm3 addl %esi,%ebp xorl %ecx,%edi movdqa %xmm9,%xmm10 - paddd %xmm2,%xmm9 rorl $7,%ebx + paddd %xmm2,%xmm9 addl %eax,%ebp pxor %xmm8,%xmm3 addl 52(%rsp),%edx @@ -2215,22 +2384,22 @@ _ssse3_shortcut: movl %ebp,%esi roll $5,%ebp movdqa %xmm3,%xmm8 - movdqa %xmm9,32(%rsp) addl %edi,%edx xorl %ebx,%esi + movdqa %xmm9,32(%rsp) rorl $7,%eax addl %ebp,%edx - pslld $2,%xmm3 addl 56(%rsp),%ecx + pslld $2,%xmm3 xorl %eax,%esi - psrld $30,%xmm8 movl %edx,%edi + psrld $30,%xmm8 roll $5,%edx addl %esi,%ecx xorl %eax,%edi rorl $7,%ebp - addl %edx,%ecx por %xmm8,%xmm3 + addl %edx,%ecx addl 60(%rsp),%ebx xorl %ebp,%edi movl %ecx,%esi @@ -2240,13 +2409,13 @@ _ssse3_shortcut: rorl $7,%edx addl %ecx,%ebx addl 0(%rsp),%eax - paddd %xmm3,%xmm10 xorl %edx,%esi movl %ebx,%edi roll $5,%ebx + paddd %xmm3,%xmm10 addl 
%esi,%eax - movdqa %xmm10,48(%rsp) xorl %edx,%edi + movdqa %xmm10,48(%rsp) rorl $7,%ecx addl %ebx,%eax addl 4(%rsp),%ebp @@ -2275,8 +2444,8 @@ _ssse3_shortcut: addl %edx,%ecx cmpq %r10,%r9 je .Ldone_ssse3 - movdqa 64(%r11),%xmm6 - movdqa 0(%r11),%xmm9 + movdqa 64(%r14),%xmm6 + movdqa -64(%r14),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 @@ -2285,23 +2454,23 @@ _ssse3_shortcut: addq $64,%r9 addl 16(%rsp),%ebx xorl %ebp,%esi -.byte 102,15,56,0,206 movl %ecx,%edi +.byte 102,15,56,0,206 roll $5,%ecx - paddd %xmm9,%xmm0 addl %esi,%ebx xorl %ebp,%edi rorl $7,%edx + paddd %xmm9,%xmm0 addl %ecx,%ebx - movdqa %xmm0,0(%rsp) addl 20(%rsp),%eax xorl %edx,%edi - psubd %xmm9,%xmm0 movl %ebx,%esi + movdqa %xmm0,0(%rsp) roll $5,%ebx addl %edi,%eax xorl %edx,%esi rorl $7,%ecx + psubd %xmm9,%xmm0 addl %ebx,%eax addl 24(%rsp),%ebp xorl %ecx,%esi @@ -2321,23 +2490,23 @@ _ssse3_shortcut: addl %ebp,%edx addl 32(%rsp),%ecx xorl %eax,%esi -.byte 102,15,56,0,214 movl %edx,%edi +.byte 102,15,56,0,214 roll $5,%edx - paddd %xmm9,%xmm1 addl %esi,%ecx xorl %eax,%edi rorl $7,%ebp + paddd %xmm9,%xmm1 addl %edx,%ecx - movdqa %xmm1,16(%rsp) addl 36(%rsp),%ebx xorl %ebp,%edi - psubd %xmm9,%xmm1 movl %ecx,%esi + movdqa %xmm1,16(%rsp) roll $5,%ecx addl %edi,%ebx xorl %ebp,%esi rorl $7,%edx + psubd %xmm9,%xmm1 addl %ecx,%ebx addl 40(%rsp),%eax xorl %edx,%esi @@ -2345,197 +2514,3079 @@ _ssse3_shortcut: roll $5,%ebx addl %esi,%eax xorl %edx,%edi - rorl $7,%ecx - addl %ebx,%eax - addl 44(%rsp),%ebp + rorl $7,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + roll $5,%eax + addl %edi,%ebp + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi +.byte 102,15,56,0,222 + roll $5,%ebp + addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax + paddd %xmm9,%xmm2 + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + movdqa %xmm2,32(%rsp) + roll $5,%edx + addl %edi,%ecx + xorl %eax,%esi + rorl $7,%ebp + psubd %xmm9,%xmm2 + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + roll $5,%ebx + addl %edi,%eax + rorl $7,%ecx + addl %ebx,%eax + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + addl 12(%r8),%edx + movl %eax,0(%r8) + addl 16(%r8),%ebp + movl %esi,4(%r8) + movl %esi,%ebx + movl %ecx,8(%r8) + movl %ecx,%edi + movl %edx,12(%r8) + xorl %edx,%edi + movl %ebp,16(%r8) + andl %edi,%esi + jmp .Loop_ssse3 + +.p2align 4 +.Ldone_ssse3: + addl 16(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + roll $5,%ebx + addl %edi,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + roll $5,%eax + addl %esi,%ebp + xorl %ecx,%edi + rorl $7,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + addl %edi,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + roll $5,%edx + addl %esi,%ecx + xorl %eax,%edi + rorl $7,%ebp + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + roll $5,%ecx + addl %edi,%ebx + xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + roll $5,%ebx + addl %esi,%eax + xorl %edx,%edi + rorl $7,%ecx + addl %ebx,%eax + 
addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + roll $5,%eax + addl %edi,%ebp + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + roll $5,%ebp + addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + roll $5,%edx + addl %edi,%ecx + xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + roll $5,%ebx + addl %edi,%eax + rorl $7,%ecx + addl %ebx,%eax + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + movl %eax,0(%r8) + addl 12(%r8),%edx + movl %esi,4(%r8) + addl 16(%r8),%ebp + movl %ecx,8(%r8) + movl %edx,12(%r8) + movl %ebp,16(%r8) + movaps -40-96(%r11),%xmm6 + movaps -40-80(%r11),%xmm7 + movaps -40-64(%r11),%xmm8 + movaps -40-48(%r11),%xmm9 + movaps -40-32(%r11),%xmm10 + movaps -40-16(%r11),%xmm11 + movq -40(%r11),%r14 + + movq -32(%r11),%r13 + + movq -24(%r11),%r12 + + movq -16(%r11),%rbp + + movq -8(%r11),%rbx + + leaq (%r11),%rsp + +.Lepilogue_ssse3: + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 + +.LSEH_end_sha1_block_data_order_ssse3: +.def sha1_block_data_order_avx; .scl 3; .type 32; .endef +.p2align 4 +sha1_block_data_order_avx: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_sha1_block_data_order_avx: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + +_avx_shortcut: + + movq %rsp,%r11 + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + leaq -160(%rsp),%rsp + vzeroupper + vmovaps %xmm6,-40-96(%r11) + vmovaps %xmm7,-40-80(%r11) + vmovaps %xmm8,-40-64(%r11) + vmovaps %xmm9,-40-48(%r11) + vmovaps %xmm10,-40-32(%r11) + vmovaps %xmm11,-40-16(%r11) +.Lprologue_avx: + andq $-64,%rsp + movq %rdi,%r8 + movq %rsi,%r9 + movq %rdx,%r10 + + shlq $6,%r10 + addq %r9,%r10 + leaq K_XX_XX+64(%rip),%r14 + + movl 0(%r8),%eax + movl 4(%r8),%ebx + movl 8(%r8),%ecx + movl 12(%r8),%edx + movl %ebx,%esi + movl 16(%r8),%ebp + movl %ecx,%edi + xorl %edx,%edi + andl %edi,%esi + + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 + vmovdqu 0(%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r9 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm11,%xmm0,%xmm4 + vpaddd %xmm11,%xmm1,%xmm5 + vpaddd %xmm11,%xmm2,%xmm6 + vmovdqa %xmm4,0(%rsp) + vmovdqa %xmm5,16(%rsp) + vmovdqa %xmm6,32(%rsp) + jmp .Loop_avx +.p2align 4 +.Loop_avx: + shrdl $2,%ebx,%ebx + xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%edi + addl 0(%rsp),%ebp + vpaddd %xmm3,%xmm11,%xmm9 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm8 + addl %esi,%ebp + andl %ebx,%edi + vpxor %xmm0,%xmm4,%xmm4 + xorl %ecx,%ebx + addl %eax,%ebp + vpxor %xmm2,%xmm8,%xmm8 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 4(%rsp),%edx + vpxor %xmm8,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vmovdqa %xmm9,48(%rsp) + addl %edi,%edx + andl %eax,%esi + vpsrld $31,%xmm4,%xmm8 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm10 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%edi + addl 8(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm4,%xmm4 + addl %esi,%ecx + andl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + vpslld $2,%xmm10,%xmm10 + vpxor 
%xmm9,%xmm4,%xmm4 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 12(%rsp),%ebx + vpxor %xmm10,%xmm4,%xmm4 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %edi,%ebx + andl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpalignr $8,%xmm1,%xmm2,%xmm5 + movl %ebx,%edi + addl 16(%rsp),%eax + vpaddd %xmm4,%xmm11,%xmm9 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm8 + addl %esi,%eax + andl %ecx,%edi + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm8,%xmm8 + shrdl $7,%ebx,%ebx + xorl %edx,%edi + movl %eax,%esi + addl 20(%rsp),%ebp + vpxor %xmm8,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vmovdqa %xmm9,0(%rsp) + addl %edi,%ebp + andl %ebx,%esi + vpsrld $31,%xmm5,%xmm8 + xorl %ecx,%ebx + addl %eax,%ebp + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm10 + vpaddd %xmm5,%xmm5,%xmm5 + movl %ebp,%edi + addl 24(%rsp),%edx + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm5,%xmm5 + addl %esi,%edx + andl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm5,%xmm5 + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + movl %edx,%esi + addl 28(%rsp),%ecx + vpxor %xmm10,%xmm5,%xmm5 + xorl %eax,%ebp + shldl $5,%edx,%edx + vmovdqa -32(%r14),%xmm11 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 + movl %ecx,%edi + addl 32(%rsp),%ebx + vpaddd %xmm5,%xmm11,%xmm9 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm8 + addl %esi,%ebx + andl %edx,%edi + vpxor %xmm2,%xmm6,%xmm6 + xorl %ebp,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm8,%xmm8 + shrdl $7,%ecx,%ecx + xorl %ebp,%edi + movl %ebx,%esi + addl 36(%rsp),%eax + vpxor %xmm8,%xmm6,%xmm6 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vmovdqa %xmm9,16(%rsp) + addl %edi,%eax + andl %ecx,%esi + vpsrld $31,%xmm6,%xmm8 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm10 + vpaddd %xmm6,%xmm6,%xmm6 + movl %eax,%edi + addl 40(%rsp),%ebp + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm6,%xmm6 + addl %esi,%ebp + andl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 44(%rsp),%edx + vpxor %xmm10,%xmm6,%xmm6 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + addl %edi,%edx + andl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%edi + addl 48(%rsp),%ecx + vpaddd %xmm6,%xmm11,%xmm9 + xorl %eax,%ebp + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm8 + addl %esi,%ecx + andl %ebp,%edi + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%ebp + addl %edx,%ecx + vpxor %xmm5,%xmm8,%xmm8 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 52(%rsp),%ebx + vpxor %xmm8,%xmm7,%xmm7 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vmovdqa %xmm9,32(%rsp) + addl %edi,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm8 + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpslldq $12,%xmm7,%xmm10 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%edi + addl 56(%rsp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm7,%xmm7 + addl %esi,%eax + andl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + xorl %edx,%edi + movl %eax,%esi + addl 60(%rsp),%ebp + vpxor %xmm10,%xmm7,%xmm7 + xorl %ecx,%ebx + shldl 
$5,%eax,%eax + addl %edi,%ebp + andl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %ebp,%edi + addl 0(%rsp),%edx + vpxor %xmm1,%xmm0,%xmm0 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpaddd %xmm7,%xmm11,%xmm9 + addl %esi,%edx + andl %eax,%edi + vpxor %xmm8,%xmm0,%xmm0 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + movl %edx,%esi + addl 4(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + movl %ecx,%edi + addl 8(%rsp),%ebx + vpor %xmm8,%xmm0,%xmm0 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %esi,%ebx + andl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 12(%rsp),%eax + xorl %ebp,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm0,%xmm11,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm1,%xmm1 + addl 20(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm1,%xmm1 + addl 28(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + addl %esi,%eax + xorl %edx,%edi + vpaddd %xmm1,%xmm11,%xmm9 + vmovdqa 0(%r14),%xmm11 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm8,%xmm2,%xmm2 + addl 36(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpslld $2,%xmm2,%xmm2 + addl 40(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpor %xmm8,%xmm2,%xmm2 + addl 44(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebx + xorl %ebp,%edi + vpaddd %xmm2,%xmm11,%xmm9 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpalignr $8,%xmm2,%xmm3,%xmm8 + 
vpxor %xmm0,%xmm4,%xmm4 + addl 0(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + addl %esi,%ecx + xorl %eax,%edi + vpaddd %xmm3,%xmm11,%xmm9 + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpxor %xmm8,%xmm4,%xmm4 + addl 4(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm8 + vmovdqa %xmm9,48(%rsp) + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 + addl 8(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm8,%xmm4,%xmm4 + addl 12(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm3,%xmm4,%xmm8 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpxor %xmm6,%xmm5,%xmm5 + addl %esi,%edx + xorl %ebx,%edi + vpaddd %xmm4,%xmm11,%xmm9 + shrdl $7,%eax,%eax + addl %ebp,%edx + vpxor %xmm8,%xmm5,%xmm5 + addl 20(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor %xmm8,%xmm5,%xmm5 + addl 28(%rsp),%eax + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm8 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%rsp),%ebp + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + movl %eax,%edi + xorl %ecx,%esi + vpaddd %xmm5,%xmm11,%xmm9 + shldl $5,%eax,%eax + addl %esi,%ebp + vpxor %xmm8,%xmm6,%xmm6 + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 36(%rsp),%edx + vpsrld $30,%xmm6,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + addl 40(%rsp),%ecx + andl %eax,%esi + vpor %xmm8,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%edi + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 44(%rsp),%ebx + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%esi + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm8 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%rsp),%eax + andl %edx,%esi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + movl %ebx,%edi + xorl %edx,%esi + vpaddd %xmm6,%xmm11,%xmm9 + vmovdqa 32(%r14),%xmm11 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm8,%xmm7,%xmm7 + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%rsp),%ebp + vpsrld $30,%xmm7,%xmm8 + vmovdqa %xmm9,32(%rsp) + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + vpslld $2,%xmm7,%xmm7 + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + addl 56(%rsp),%edx + andl %ebx,%esi + vpor %xmm8,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%edi + xorl %ebx,%esi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 60(%rsp),%ecx + 
andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + addl 0(%rsp),%ebx + andl %ebp,%esi + xorl %eax,%ebp + shrdl $7,%edx,%edx + vpxor %xmm1,%xmm0,%xmm0 + movl %ecx,%edi + xorl %ebp,%esi + vpaddd %xmm7,%xmm11,%xmm9 + shldl $5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm8,%xmm0,%xmm0 + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 4(%rsp),%eax + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%rsp),%ebp + andl %ecx,%esi + vpor %xmm8,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%edi + xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 12(%rsp),%edx + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + vpxor %xmm2,%xmm1,%xmm1 + movl %edx,%edi + xorl %eax,%esi + vpaddd %xmm0,%xmm11,%xmm9 + shldl $5,%edx,%edx + addl %esi,%ecx + vpxor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 20(%rsp),%ebx + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + addl 24(%rsp),%eax + andl %edx,%esi + vpor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%edi + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%rsp),%ebp + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + movl %ebp,%edi + xorl %ebx,%esi + vpaddd %xmm1,%xmm11,%xmm9 + shldl $5,%ebp,%ebp + addl %esi,%edx + vpxor %xmm8,%xmm2,%xmm2 + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 36(%rsp),%ecx + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + addl 40(%rsp),%ebx + andl %ebp,%esi + vpor %xmm8,%xmm2,%xmm2 + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%edi + xorl %ebp,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 44(%rsp),%eax + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm2,%xmm11,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm3,%xmm3 + addl 
52(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 0(%rsp),%eax + vpaddd %xmm3,%xmm11,%xmm9 + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + vmovdqa %xmm9,48(%rsp) + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 4(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 8(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 12(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + cmpq %r10,%r9 + je .Ldone_avx + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 + vmovdqu 0(%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r9 + addl 16(%rsp),%ebx + xorl %ebp,%esi + vpshufb %xmm6,%xmm1,%xmm1 + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpaddd %xmm11,%xmm0,%xmm4 + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm4,0(%rsp) + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + vpshufb %xmm6,%xmm2,%xmm2 + movl %edx,%edi + shldl $5,%edx,%edx + vpaddd %xmm11,%xmm1,%xmm5 + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vmovdqa %xmm5,16(%rsp) + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + vpshufb %xmm6,%xmm3,%xmm3 + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpaddd %xmm11,%xmm2,%xmm6 + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vmovdqa %xmm6,32(%rsp) + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + addl 12(%r8),%edx + movl %eax,0(%r8) + addl 16(%r8),%ebp + movl %esi,4(%r8) + movl %esi,%ebx + 
movl %ecx,8(%r8) + movl %ecx,%edi + movl %edx,12(%r8) + xorl %edx,%edi + movl %ebp,16(%r8) + andl %edi,%esi + jmp .Loop_avx + +.p2align 4 +.Ldone_avx: + addl 16(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vzeroupper + + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + movl %eax,0(%r8) + addl 12(%r8),%edx + movl %esi,4(%r8) + addl 16(%r8),%ebp + movl %ecx,8(%r8) + movl %edx,12(%r8) + movl %ebp,16(%r8) + movaps -40-96(%r11),%xmm6 + movaps -40-80(%r11),%xmm7 + movaps -40-64(%r11),%xmm8 + movaps -40-48(%r11),%xmm9 + movaps -40-32(%r11),%xmm10 + movaps -40-16(%r11),%xmm11 + movq -40(%r11),%r14 + + movq -32(%r11),%r13 + + movq -24(%r11),%r12 + + movq -16(%r11),%rbp + + movq -8(%r11),%rbx + + leaq (%r11),%rsp + +.Lepilogue_avx: + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 + +.LSEH_end_sha1_block_data_order_avx: +.def sha1_block_data_order_avx2; .scl 3; .type 32; .endef +.p2align 4 +sha1_block_data_order_avx2: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_sha1_block_data_order_avx2: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + +_avx2_shortcut: + + movq %rsp,%r11 + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + vzeroupper + leaq -96(%rsp),%rsp + vmovaps %xmm6,-40-96(%r11) + vmovaps %xmm7,-40-80(%r11) + vmovaps %xmm8,-40-64(%r11) + vmovaps %xmm9,-40-48(%r11) + vmovaps %xmm10,-40-32(%r11) + vmovaps %xmm11,-40-16(%r11) +.Lprologue_avx2: + movq %rdi,%r8 + movq %rsi,%r9 + movq %rdx,%r10 + + leaq -640(%rsp),%rsp + shlq $6,%r10 + leaq 64(%r9),%r13 + andq $-128,%rsp + addq %r9,%r10 + leaq K_XX_XX+64(%rip),%r14 + + movl 0(%r8),%eax + cmpq %r10,%r13 + cmovaeq %r9,%r13 + movl 4(%r8),%ebp + movl 8(%r8),%ecx + movl 12(%r8),%edx + movl 16(%r8),%esi + vmovdqu 64(%r14),%ymm6 + + vmovdqu (%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 
48(%r9),%xmm3 + leaq 64(%r9),%r9 + vinserti128 $1,(%r13),%ymm0,%ymm0 + vinserti128 $1,16(%r13),%ymm1,%ymm1 + vpshufb %ymm6,%ymm0,%ymm0 + vinserti128 $1,32(%r13),%ymm2,%ymm2 + vpshufb %ymm6,%ymm1,%ymm1 + vinserti128 $1,48(%r13),%ymm3,%ymm3 + vpshufb %ymm6,%ymm2,%ymm2 + vmovdqu -64(%r14),%ymm11 + vpshufb %ymm6,%ymm3,%ymm3 + + vpaddd %ymm11,%ymm0,%ymm4 + vpaddd %ymm11,%ymm1,%ymm5 + vmovdqu %ymm4,0(%rsp) + vpaddd %ymm11,%ymm2,%ymm6 + vmovdqu %ymm5,32(%rsp) + vpaddd %ymm11,%ymm3,%ymm7 + vmovdqu %ymm6,64(%rsp) + vmovdqu %ymm7,96(%rsp) + vpalignr $8,%ymm0,%ymm1,%ymm4 + vpsrldq $4,%ymm3,%ymm8 + vpxor %ymm0,%ymm4,%ymm4 + vpxor %ymm2,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $31,%ymm4,%ymm8 + vpslldq $12,%ymm4,%ymm10 + vpaddd %ymm4,%ymm4,%ymm4 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm4,%ymm4 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm4,%ymm4 + vpxor %ymm10,%ymm4,%ymm4 + vpaddd %ymm11,%ymm4,%ymm9 + vmovdqu %ymm9,128(%rsp) + vpalignr $8,%ymm1,%ymm2,%ymm5 + vpsrldq $4,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm3,%ymm8,%ymm8 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $31,%ymm5,%ymm8 + vmovdqu -32(%r14),%ymm11 + vpslldq $12,%ymm5,%ymm10 + vpaddd %ymm5,%ymm5,%ymm5 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm5,%ymm5 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm10,%ymm5,%ymm5 + vpaddd %ymm11,%ymm5,%ymm9 + vmovdqu %ymm9,160(%rsp) + vpalignr $8,%ymm2,%ymm3,%ymm6 + vpsrldq $4,%ymm5,%ymm8 + vpxor %ymm2,%ymm6,%ymm6 + vpxor %ymm4,%ymm8,%ymm8 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $31,%ymm6,%ymm8 + vpslldq $12,%ymm6,%ymm10 + vpaddd %ymm6,%ymm6,%ymm6 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm6,%ymm6 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm6,%ymm6 + vpxor %ymm10,%ymm6,%ymm6 + vpaddd %ymm11,%ymm6,%ymm9 + vmovdqu %ymm9,192(%rsp) + vpalignr $8,%ymm3,%ymm4,%ymm7 + vpsrldq $4,%ymm6,%ymm8 + vpxor %ymm3,%ymm7,%ymm7 + vpxor %ymm5,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm7,%ymm8 + vpslldq $12,%ymm7,%ymm10 + vpaddd %ymm7,%ymm7,%ymm7 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm7,%ymm7 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm7,%ymm7 + vpxor %ymm10,%ymm7,%ymm7 + vpaddd %ymm11,%ymm7,%ymm9 + vmovdqu %ymm9,224(%rsp) + leaq 128(%rsp),%r13 + jmp .Loop_avx2 +.p2align 5 +.Loop_avx2: + rorxl $2,%ebp,%ebx + andnl %edx,%ebp,%edi + andl %ecx,%ebp + xorl %edi,%ebp + jmp .Lalign32_1 +.p2align 5 +.Lalign32_1: + vpalignr $8,%ymm6,%ymm7,%ymm8 + vpxor %ymm4,%ymm0,%ymm0 + addl -128(%r13),%esi + andnl %ecx,%eax,%edi + vpxor %ymm1,%ymm0,%ymm0 + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + vpxor %ymm8,%ymm0,%ymm0 + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + vpsrld $30,%ymm0,%ymm8 + vpslld $2,%ymm0,%ymm0 + addl -124(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + vpor %ymm8,%ymm0,%ymm0 + addl %r12d,%edx + xorl %edi,%esi + addl -120(%r13),%ecx + andnl %ebp,%edx,%edi + vpaddd %ymm11,%ymm0,%ymm9 + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + vmovdqu %ymm9,256(%rsp) + addl %r12d,%ecx + xorl %edi,%edx + addl -116(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl -96(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + vpalignr $8,%ymm7,%ymm0,%ymm8 + vpxor %ymm5,%ymm1,%ymm1 + addl -92(%r13),%eax + andnl %edx,%ebp,%edi + vpxor %ymm2,%ymm1,%ymm1 + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl 
$2,%ebp,%ebx + vpxor %ymm8,%ymm1,%ymm1 + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + vpsrld $30,%ymm1,%ymm8 + vpslld $2,%ymm1,%ymm1 + addl -88(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + vpor %ymm8,%ymm1,%ymm1 + addl %r12d,%esi + xorl %edi,%eax + addl -84(%r13),%edx + andnl %ebx,%esi,%edi + vpaddd %ymm11,%ymm1,%ymm9 + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + vmovdqu %ymm9,288(%rsp) + addl %r12d,%edx + xorl %edi,%esi + addl -64(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -60(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + vpalignr $8,%ymm0,%ymm1,%ymm8 + vpxor %ymm6,%ymm2,%ymm2 + addl -56(%r13),%ebp + andnl %esi,%ebx,%edi + vpxor %ymm3,%ymm2,%ymm2 + vmovdqu 0(%r14),%ymm11 + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + vpxor %ymm8,%ymm2,%ymm2 + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + vpsrld $30,%ymm2,%ymm8 + vpslld $2,%ymm2,%ymm2 + addl -52(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + vpor %ymm8,%ymm2,%ymm2 + addl %r12d,%eax + xorl %edi,%ebp + addl -32(%r13),%esi + andnl %ecx,%eax,%edi + vpaddd %ymm11,%ymm2,%ymm9 + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + vmovdqu %ymm9,320(%rsp) + addl %r12d,%esi + xorl %edi,%eax + addl -28(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -24(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + vpalignr $8,%ymm1,%ymm2,%ymm8 + vpxor %ymm7,%ymm3,%ymm3 + addl -20(%r13),%ebx + andnl %eax,%ecx,%edi + vpxor %ymm4,%ymm3,%ymm3 + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + vpxor %ymm8,%ymm3,%ymm3 + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + vpsrld $30,%ymm3,%ymm8 + vpslld $2,%ymm3,%ymm3 + addl 0(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + vpor %ymm8,%ymm3,%ymm3 + addl %r12d,%ebp + xorl %edi,%ebx + addl 4(%r13),%eax + andnl %edx,%ebp,%edi + vpaddd %ymm11,%ymm3,%ymm9 + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + vmovdqu %ymm9,352(%rsp) + addl %r12d,%eax + xorl %edi,%ebp + addl 8(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl 12(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + vpalignr $8,%ymm2,%ymm3,%ymm8 + vpxor %ymm0,%ymm4,%ymm4 + addl 32(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + vpxor %ymm5,%ymm4,%ymm4 + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + vpxor %ymm8,%ymm4,%ymm4 + addl %r12d,%ecx + xorl %ebp,%edx + addl 36(%r13),%ebx + vpsrld $30,%ymm4,%ymm8 + vpslld $2,%ymm4,%ymm4 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + vpor %ymm8,%ymm4,%ymm4 + addl 40(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + vpaddd %ymm11,%ymm4,%ymm9 + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + 
addl 44(%r13),%eax + vmovdqu %ymm9,384(%rsp) + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl 64(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vpalignr $8,%ymm3,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + addl 68(%r13),%edx + leal (%rdx,%rax,1),%edx + vpxor %ymm6,%ymm5,%ymm5 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + vpxor %ymm8,%ymm5,%ymm5 + addl %r12d,%edx + xorl %ebx,%esi + addl 72(%r13),%ecx + vpsrld $30,%ymm5,%ymm8 + vpslld $2,%ymm5,%ymm5 + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + vpor %ymm8,%ymm5,%ymm5 + addl 76(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + vpaddd %ymm11,%ymm5,%ymm9 + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl 96(%r13),%ebp + vmovdqu %ymm9,416(%rsp) + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 100(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpalignr $8,%ymm4,%ymm5,%ymm8 + vpxor %ymm2,%ymm6,%ymm6 + addl 104(%r13),%esi + leal (%rsi,%rbp,1),%esi + vpxor %ymm7,%ymm6,%ymm6 + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + vpxor %ymm8,%ymm6,%ymm6 + addl %r12d,%esi + xorl %ecx,%eax + addl 108(%r13),%edx + leaq 256(%r13),%r13 + vpsrld $30,%ymm6,%ymm8 + vpslld $2,%ymm6,%ymm6 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + vpor %ymm8,%ymm6,%ymm6 + addl -128(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + vpaddd %ymm11,%ymm6,%ymm9 + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -124(%r13),%ebx + vmovdqu %ymm9,448(%rsp) + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -120(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vpalignr $8,%ymm5,%ymm6,%ymm8 + vpxor %ymm3,%ymm7,%ymm7 + addl -116(%r13),%eax + leal (%rax,%rbx,1),%eax + vpxor %ymm0,%ymm7,%ymm7 + vmovdqu 32(%r14),%ymm11 + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + vpxor %ymm8,%ymm7,%ymm7 + addl %r12d,%eax + xorl %edx,%ebp + addl -96(%r13),%esi + vpsrld $30,%ymm7,%ymm8 + vpslld $2,%ymm7,%ymm7 + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vpor %ymm8,%ymm7,%ymm7 + addl -92(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpaddd %ymm11,%ymm7,%ymm9 + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -88(%r13),%ecx + vmovdqu %ymm9,480(%rsp) + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -84(%r13),%ebx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + jmp .Lalign32_2 +.p2align 5 +.Lalign32_2: + vpalignr $8,%ymm6,%ymm7,%ymm8 + vpxor %ymm4,%ymm0,%ymm0 + addl -64(%r13),%ebp + xorl %esi,%ecx + vpxor %ymm1,%ymm0,%ymm0 + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + vpxor %ymm8,%ymm0,%ymm0 + rorxl $27,%ebx,%r12d + rorxl 
$2,%ebx,%ecx + xorl %edx,%ebx + vpsrld $30,%ymm0,%ymm8 + vpslld $2,%ymm0,%ymm0 + addl %r12d,%ebp + andl %edi,%ebx + addl -60(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + vpor %ymm8,%ymm0,%ymm0 + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + vpaddd %ymm11,%ymm0,%ymm9 + addl %r12d,%eax + andl %edi,%ebp + addl -56(%r13),%esi + xorl %ecx,%ebp + vmovdqu %ymm9,512(%rsp) + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl -52(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + addl -32(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + andl %edi,%edx + vpalignr $8,%ymm7,%ymm0,%ymm8 + vpxor %ymm5,%ymm1,%ymm1 + addl -28(%r13),%ebx + xorl %eax,%edx + vpxor %ymm2,%ymm1,%ymm1 + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + vpxor %ymm8,%ymm1,%ymm1 + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + vpsrld $30,%ymm1,%ymm8 + vpslld $2,%ymm1,%ymm1 + addl %r12d,%ebx + andl %edi,%ecx + addl -24(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + vpor %ymm8,%ymm1,%ymm1 + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + vpaddd %ymm11,%ymm1,%ymm9 + addl %r12d,%ebp + andl %edi,%ebx + addl -20(%r13),%eax + xorl %edx,%ebx + vmovdqu %ymm9,544(%rsp) + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 0(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl 4(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + vpalignr $8,%ymm0,%ymm1,%ymm8 + vpxor %ymm6,%ymm2,%ymm2 + addl 8(%r13),%ecx + xorl %ebp,%esi + vpxor %ymm3,%ymm2,%ymm2 + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + vpxor %ymm8,%ymm2,%ymm2 + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + vpsrld $30,%ymm2,%ymm8 + vpslld $2,%ymm2,%ymm2 + addl %r12d,%ecx + andl %edi,%edx + addl 12(%r13),%ebx + xorl %eax,%edx + movl %esi,%edi + xorl %eax,%edi + vpor %ymm8,%ymm2,%ymm2 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + vpaddd %ymm11,%ymm2,%ymm9 + addl %r12d,%ebx + andl %edi,%ecx + addl 32(%r13),%ebp + xorl %esi,%ecx + vmovdqu %ymm9,576(%rsp) + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 36(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 40(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + vpalignr $8,%ymm1,%ymm2,%ymm8 + vpxor %ymm7,%ymm3,%ymm3 + addl 44(%r13),%edx + xorl %ebx,%eax + vpxor %ymm4,%ymm3,%ymm3 + movl %ebp,%edi + xorl %ebx,%edi + leal 
(%rdx,%rax,1),%edx + vpxor %ymm8,%ymm3,%ymm3 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + vpsrld $30,%ymm3,%ymm8 + vpslld $2,%ymm3,%ymm3 + addl %r12d,%edx + andl %edi,%esi + addl 64(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + vpor %ymm8,%ymm3,%ymm3 + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + vpaddd %ymm11,%ymm3,%ymm9 + addl %r12d,%ecx + andl %edi,%edx + addl 68(%r13),%ebx + xorl %eax,%edx + vmovdqu %ymm9,608(%rsp) + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + addl 72(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 76(%r13),%eax + xorl %edx,%ebx + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl 96(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl 100(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl 104(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl 108(%r13),%ebx + leaq 256(%r13),%r13 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -128(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -124(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -120(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -116(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -96(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -92(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -88(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -84(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -64(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -60(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -56(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -52(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -32(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -28(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d 
+ rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -24(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -20(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + addl %r12d,%edx + leaq 128(%r9),%r13 + leaq 128(%r9),%rdi + cmpq %r10,%r13 + cmovaeq %r9,%r13 + + + addl 0(%r8),%edx + addl 4(%r8),%esi + addl 8(%r8),%ebp + movl %edx,0(%r8) + addl 12(%r8),%ebx + movl %esi,4(%r8) + movl %edx,%eax + addl 16(%r8),%ecx + movl %ebp,%r12d + movl %ebp,8(%r8) + movl %ebx,%edx + + movl %ebx,12(%r8) + movl %esi,%ebp + movl %ecx,16(%r8) + + movl %ecx,%esi + movl %r12d,%ecx + + + cmpq %r10,%r9 + je .Ldone_avx2 + vmovdqu 64(%r14),%ymm6 + cmpq %r10,%rdi + ja .Last_avx2 + + vmovdqu -64(%rdi),%xmm0 + vmovdqu -48(%rdi),%xmm1 + vmovdqu -32(%rdi),%xmm2 + vmovdqu -16(%rdi),%xmm3 + vinserti128 $1,0(%r13),%ymm0,%ymm0 + vinserti128 $1,16(%r13),%ymm1,%ymm1 + vinserti128 $1,32(%r13),%ymm2,%ymm2 + vinserti128 $1,48(%r13),%ymm3,%ymm3 + jmp .Last_avx2 + +.p2align 5 +.Last_avx2: + leaq 128+16(%rsp),%r13 + rorxl $2,%ebp,%ebx + andnl %edx,%ebp,%edi + andl %ecx,%ebp + xorl %edi,%ebp + subq $-128,%r9 + addl -128(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl -124(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -120(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -116(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl -96(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + addl -92(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + addl -88(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl -84(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -64(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -60(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl -56(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + addl -52(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + addl -32(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl -28(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -24(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl 
%r12d,%ecx + xorl %edi,%edx + addl -20(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl 0(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + addl 4(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + addl 8(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl 12(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl 32(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl 36(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl 40(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 44(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl 64(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vmovdqu -64(%r14),%ymm11 + vpshufb %ymm6,%ymm0,%ymm0 + addl 68(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl 72(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl 76(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl 96(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 100(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpshufb %ymm6,%ymm1,%ymm1 + vpaddd %ymm11,%ymm0,%ymm8 + addl 104(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl 108(%r13),%edx + leaq 256(%r13),%r13 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -128(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -124(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -120(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vmovdqu %ymm8,0(%rsp) + vpshufb %ymm6,%ymm2,%ymm2 + vpaddd %ymm11,%ymm1,%ymm9 + addl -116(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -96(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -92(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl 
%ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -88(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -84(%r13),%ebx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + vmovdqu %ymm9,32(%rsp) + vpshufb %ymm6,%ymm3,%ymm3 + vpaddd %ymm11,%ymm2,%ymm6 + addl -64(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl -60(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl -56(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi xorl %ecx,%edi - movl %eax,%esi - roll $5,%eax - addl %edi,%ebp - xorl %ecx,%esi - rorl $7,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi -.byte 102,15,56,0,222 + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl -52(%r13),%edx + xorl %ebx,%eax movl %ebp,%edi - roll $5,%ebp - paddd %xmm9,%xmm2 - addl %esi,%edx xorl %ebx,%edi - rorl $7,%eax - addl %ebp,%edx - movdqa %xmm2,32(%rsp) - addl 52(%rsp),%ecx - xorl %eax,%edi - psubd %xmm9,%xmm2 - movl %edx,%esi - roll $5,%edx - addl %edi,%ecx - xorl %eax,%esi - rorl $7,%ebp - addl %edx,%ecx - addl 56(%rsp),%ebx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax xorl %ebp,%esi - movl %ecx,%edi - roll $5,%ecx - addl %esi,%ebx - xorl %ebp,%edi - rorl $7,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - roll $5,%ebx - addl %edi,%eax - rorl $7,%ecx - addl %ebx,%eax - addl 0(%r8),%eax - addl 4(%r8),%esi - addl 8(%r8),%ecx - addl 12(%r8),%edx - movl %eax,0(%r8) - addl 16(%r8),%ebp - movl %esi,4(%r8) - movl %esi,%ebx - movl %ecx,8(%r8) - movl %ecx,%edi - movl %edx,12(%r8) - xorl %edx,%edi - movl %ebp,16(%r8) + addl %r12d,%edx andl %edi,%esi - jmp .Loop_ssse3 - -.p2align 4 -.Ldone_ssse3: - addl 16(%rsp),%ebx + addl -32(%r13),%ecx xorl %ebp,%esi - movl %ecx,%edi - roll $5,%ecx - addl %esi,%ebx + movl %eax,%edi xorl %ebp,%edi - rorl $7,%edx - addl %ecx,%ebx - addl 20(%rsp),%eax + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + andl %edi,%edx + jmp .Lalign32_3 +.p2align 5 +.Lalign32_3: + vmovdqu %ymm6,64(%rsp) + vpaddd %ymm11,%ymm3,%ymm7 + addl -28(%r13),%ebx + xorl %eax,%edx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + addl -24(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl -20(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi xorl %edx,%edi - movl %ebx,%esi - roll $5,%ebx - addl %edi,%eax - xorl %edx,%esi - rorl $7,%ecx - addl %ebx,%eax - addl 24(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - roll $5,%eax - addl %esi,%ebp + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 0(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi xorl %ecx,%edi - rorl $7,%ebx - addl %eax,%ebp - addl 28(%rsp),%edx + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl 
%ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl 4(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi xorl %ebx,%edi - movl %ebp,%esi - roll $5,%ebp - addl %edi,%edx - xorl %ebx,%esi - rorl $7,%eax - addl %ebp,%edx - addl 32(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - roll $5,%edx - addl %esi,%ecx - xorl %eax,%edi - rorl $7,%ebp - addl %edx,%ecx - addl 36(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - roll $5,%ecx - addl %edi,%ebx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax xorl %ebp,%esi - rorl $7,%edx - addl %ecx,%ebx - addl 40(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - roll $5,%ebx - addl %esi,%eax + addl %r12d,%edx + andl %edi,%esi + vmovdqu %ymm7,96(%rsp) + addl 8(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + andl %edi,%edx + addl 12(%r13),%ebx + xorl %eax,%edx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + addl 32(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 36(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi xorl %edx,%edi - rorl $7,%ecx - addl %ebx,%eax - addl 44(%rsp),%ebp + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 40(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi xorl %ecx,%edi - movl %eax,%esi - roll $5,%eax - addl %edi,%ebp - xorl %ecx,%esi - rorl $7,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + vpalignr $8,%ymm0,%ymm1,%ymm4 + addl 44(%r13),%edx + xorl %ebx,%eax movl %ebp,%edi - roll $5,%ebp - addl %esi,%edx xorl %ebx,%edi - rorl $7,%eax - addl %ebp,%edx - addl 52(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - roll $5,%edx - addl %edi,%ecx - xorl %eax,%esi - rorl $7,%ebp - addl %edx,%ecx - addl 56(%rsp),%ebx + vpsrldq $4,%ymm3,%ymm8 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpxor %ymm0,%ymm4,%ymm4 + vpxor %ymm2,%ymm8,%ymm8 xorl %ebp,%esi - movl %ecx,%edi - roll $5,%ecx - addl %esi,%ebx + addl %r12d,%edx + vpxor %ymm8,%ymm4,%ymm4 + andl %edi,%esi + addl 64(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + vpsrld $31,%ymm4,%ymm8 xorl %ebp,%edi - rorl $7,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - roll $5,%ebx - addl %edi,%eax - rorl $7,%ecx - addl %ebx,%eax - addl 0(%r8),%eax + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + vpslldq $12,%ymm4,%ymm10 + vpaddd %ymm4,%ymm4,%ymm4 + rorxl $2,%edx,%esi + xorl %eax,%edx + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm4,%ymm4 + addl %r12d,%ecx + andl %edi,%edx + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm4,%ymm4 + addl 68(%r13),%ebx + xorl %eax,%edx + vpxor %ymm10,%ymm4,%ymm4 + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + vpaddd %ymm11,%ymm4,%ymm9 + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + vmovdqu %ymm9,128(%rsp) + addl %r12d,%ebx + andl %edi,%ecx + addl 72(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 76(%r13),%eax + xorl %edx,%ebx + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl 
$2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpalignr $8,%ymm1,%ymm2,%ymm5 + addl 96(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + vpsrldq $4,%ymm4,%ymm8 + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm3,%ymm8,%ymm8 + addl 100(%r13),%edx + leal (%rdx,%rax,1),%edx + vpxor %ymm8,%ymm5,%ymm5 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + vpsrld $31,%ymm5,%ymm8 + vmovdqu -32(%r14),%ymm11 + xorl %ebx,%esi + addl 104(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + vpslldq $12,%ymm5,%ymm10 + vpaddd %ymm5,%ymm5,%ymm5 + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm5,%ymm5 + xorl %eax,%edx + addl %r12d,%ecx + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm5,%ymm5 + xorl %ebp,%edx + addl 108(%r13),%ebx + leaq 256(%r13),%r13 + vpxor %ymm10,%ymm5,%ymm5 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + vpaddd %ymm11,%ymm5,%ymm9 + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + vmovdqu %ymm9,160(%rsp) + addl -128(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vpalignr $8,%ymm2,%ymm3,%ymm6 + addl -124(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + vpsrldq $4,%ymm5,%ymm8 + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpxor %ymm2,%ymm6,%ymm6 + vpxor %ymm4,%ymm8,%ymm8 + addl -120(%r13),%esi + leal (%rsi,%rbp,1),%esi + vpxor %ymm8,%ymm6,%ymm6 + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + vpsrld $31,%ymm6,%ymm8 + xorl %ecx,%eax + addl -116(%r13),%edx + leal (%rdx,%rax,1),%edx + vpslldq $12,%ymm6,%ymm10 + vpaddd %ymm6,%ymm6,%ymm6 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm6,%ymm6 + xorl %ebp,%esi + addl %r12d,%edx + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm6,%ymm6 + xorl %ebx,%esi + addl -96(%r13),%ecx + vpxor %ymm10,%ymm6,%ymm6 + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + vpaddd %ymm11,%ymm6,%ymm9 + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + vmovdqu %ymm9,192(%rsp) + addl -92(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + vpalignr $8,%ymm3,%ymm4,%ymm7 + addl -88(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + vpsrldq $4,%ymm6,%ymm8 + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vpxor %ymm3,%ymm7,%ymm7 + vpxor %ymm5,%ymm8,%ymm8 + addl -84(%r13),%eax + leal (%rax,%rbx,1),%eax + vpxor %ymm8,%ymm7,%ymm7 + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + vpsrld $31,%ymm7,%ymm8 + xorl %edx,%ebp + addl -64(%r13),%esi + leal (%rsi,%rbp,1),%esi + vpslldq $12,%ymm7,%ymm10 + vpaddd %ymm7,%ymm7,%ymm7 + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm7,%ymm7 + xorl %ebx,%eax + addl %r12d,%esi + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm7,%ymm7 + xorl %ecx,%eax + addl -60(%r13),%edx + vpxor %ymm10,%ymm7,%ymm7 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpaddd %ymm11,%ymm7,%ymm9 + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + vmovdqu %ymm9,224(%rsp) + addl -56(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -52(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl 
$2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -32(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -28(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -24(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -20(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + addl %r12d,%edx + leaq 128(%rsp),%r13 + + + addl 0(%r8),%edx addl 4(%r8),%esi - addl 8(%r8),%ecx - movl %eax,0(%r8) - addl 12(%r8),%edx + addl 8(%r8),%ebp + movl %edx,0(%r8) + addl 12(%r8),%ebx movl %esi,4(%r8) - addl 16(%r8),%ebp - movl %ecx,8(%r8) - movl %edx,12(%r8) - movl %ebp,16(%r8) - movaps 64+0(%rsp),%xmm6 - movaps 64+16(%rsp),%xmm7 - movaps 64+32(%rsp),%xmm8 - movaps 64+48(%rsp),%xmm9 - movaps 64+64(%rsp),%xmm10 - movaps 64+80(%rsp),%xmm11 - leaq 160(%rsp),%rsi - movq 0(%rsi),%r12 - movq 8(%rsi),%rbp - movq 16(%rsi),%rbx - leaq 24(%rsi),%rsp -.Lepilogue_ssse3: + movl %edx,%eax + addl 16(%r8),%ecx + movl %ebp,%r12d + movl %ebp,8(%r8) + movl %ebx,%edx + + movl %ebx,12(%r8) + movl %esi,%ebp + movl %ecx,16(%r8) + + movl %ecx,%esi + movl %r12d,%ecx + + + cmpq %r10,%r9 + jbe .Loop_avx2 + +.Ldone_avx2: + vzeroupper + movaps -40-96(%r11),%xmm6 + movaps -40-80(%r11),%xmm7 + movaps -40-64(%r11),%xmm8 + movaps -40-48(%r11),%xmm9 + movaps -40-32(%r11),%xmm10 + movaps -40-16(%r11),%xmm11 + movq -40(%r11),%r14 + + movq -32(%r11),%r13 + + movq -24(%r11),%r12 + + movq -16(%r11),%rbp + + movq -8(%r11),%rbx + + leaq (%r11),%rsp + +.Lepilogue_avx2: movq 8(%rsp),%rdi movq 16(%rsp),%rsi .byte 0xf3,0xc3 -.LSEH_end_sha1_block_data_order_ssse3: + +.LSEH_end_sha1_block_data_order_avx2: .p2align 6 K_XX_XX: -.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 -.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 -.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc -.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 6 @@ -2567,19 +5618,51 @@ se_handler: jae .Lcommon_seh_tail movq 64(%rax),%rax - leaq 32(%rax),%rax movq -8(%rax),%rbx movq -16(%rax),%rbp movq -24(%rax),%r12 movq -32(%rax),%r13 + movq -40(%rax),%r14 movq %rbx,144(%r8) movq %rbp,160(%r8) movq %r12,216(%r8) movq %r13,224(%r8) + movq %r14,232(%r8) jmp .Lcommon_seh_tail +.def shaext_handler; .scl 3; .type 32; .endef +.p2align 4 +shaext_handler: + pushq %rsi + pushq %rdi + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushfq + subq $64,%rsp + + movq 120(%r8),%rax + movq 248(%r8),%rbx + + leaq 
.Lprologue_shaext(%rip),%r10 + cmpq %r10,%rbx + jb .Lcommon_seh_tail + + leaq .Lepilogue_shaext(%rip),%r10 + cmpq %r10,%rbx + jae .Lcommon_seh_tail + + leaq -8-64(%rax),%rsi + leaq 512(%r8),%rdi + movl $8,%ecx +.long 0xa548f3fc + + jmp .Lcommon_seh_tail .def ssse3_handler; .scl 3; .type 32; .endef .p2align 4 @@ -2606,25 +5689,28 @@ ssse3_handler: cmpq %r10,%rbx jb .Lcommon_seh_tail - movq 152(%r8),%rax + movq 208(%r8),%rax movl 4(%r11),%r10d leaq (%rsi,%r10,1),%r10 cmpq %r10,%rbx jae .Lcommon_seh_tail - leaq 64(%rax),%rsi + leaq -40-96(%rax),%rsi leaq 512(%r8),%rdi movl $12,%ecx -.long 0xa548f3fc - leaq 184(%rax),%rax +.long 0xa548f3fc movq -8(%rax),%rbx movq -16(%rax),%rbp movq -24(%rax),%r12 + movq -32(%rax),%r13 + movq -40(%rax),%r14 movq %rbx,144(%r8) movq %rbp,160(%r8) movq %r12,216(%r8) + movq %r13,224(%r8) + movq %r14,232(%r8) .Lcommon_seh_tail: movq 8(%rax),%rdi @@ -2636,7 +5722,7 @@ ssse3_handler: movq 40(%r9),%rdi movq %r8,%rsi movl $154,%ecx -.long 0xa548f3fc +.long 0xa548f3fc movq %r9,%rsi xorq %rcx,%rcx @@ -2671,16 +5757,36 @@ ssse3_handler: .rva .LSEH_begin_sha1_block_data_order .rva .LSEH_end_sha1_block_data_order .rva .LSEH_info_sha1_block_data_order +.rva .LSEH_begin_sha1_block_data_order_shaext +.rva .LSEH_end_sha1_block_data_order_shaext +.rva .LSEH_info_sha1_block_data_order_shaext .rva .LSEH_begin_sha1_block_data_order_ssse3 .rva .LSEH_end_sha1_block_data_order_ssse3 .rva .LSEH_info_sha1_block_data_order_ssse3 +.rva .LSEH_begin_sha1_block_data_order_avx +.rva .LSEH_end_sha1_block_data_order_avx +.rva .LSEH_info_sha1_block_data_order_avx +.rva .LSEH_begin_sha1_block_data_order_avx2 +.rva .LSEH_end_sha1_block_data_order_avx2 +.rva .LSEH_info_sha1_block_data_order_avx2 .section .xdata .p2align 3 .LSEH_info_sha1_block_data_order: .byte 9,0,0,0 .rva se_handler +.LSEH_info_sha1_block_data_order_shaext: +.byte 9,0,0,0 +.rva shaext_handler .LSEH_info_sha1_block_data_order_ssse3: .byte 9,0,0,0 .rva ssse3_handler -.rva .Lprologue_ssse3,.Lepilogue_ssse3 +.rva .Lprologue_ssse3,.Lepilogue_ssse3 +.LSEH_info_sha1_block_data_order_avx: +.byte 9,0,0,0 +.rva ssse3_handler +.rva .Lprologue_avx,.Lepilogue_avx +.LSEH_info_sha1_block_data_order_avx2: +.byte 9,0,0,0 +.rva ssse3_handler +.rva .Lprologue_avx2,.Lepilogue_avx2 diff --git a/lib/accelerated/x86/coff/sha256-ssse3-x86.s b/lib/accelerated/x86/coff/sha256-ssse3-x86.s index eaa435408e..05cd61d1b1 100644 --- a/lib/accelerated/x86/coff/sha256-ssse3-x86.s +++ b/lib/accelerated/x86/coff/sha256-ssse3-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -37,7 +37,6 @@ # # *** This file is auto-generated *** # -.file "sha512-586.s" .text .globl _sha256_block_data_order .def _sha256_block_data_order; .scl 2; .type 32; .endef @@ -64,20 +63,6 @@ _sha256_block_data_order: movl %edi,4(%esp) movl %eax,8(%esp) movl %ebx,12(%esp) - leal __gnutls_x86_cpuid_s-.L001K256(%ebp),%edx - movl (%edx),%ecx - movl 4(%edx),%edx - testl $1048576,%ecx - jnz .L002loop - testl $2048,%edx - andl $1073741824,%ecx - andl $268435456,%edx - orl %edx,%ecx - cmpl $1342177280,%ecx - je .L003loop_shrd - subl %edi,%eax - cmpl $256,%eax - jae .L004unrolled jmp .L002loop .align 16 .L002loop: @@ -149,7 +134,7 @@ _sha256_block_data_order: movl %ecx,28(%esp) movl %edi,32(%esp) .align 16 -.L00500_15: +.L00300_15: movl %edx,%ecx movl 24(%esp),%esi rorl $14,%ecx @@ -187,11 +172,11 @@ _sha256_block_data_order: addl $4,%ebp addl %ebx,%eax cmpl $3248222580,%esi - jne .L00500_15 + jne .L00300_15 movl 156(%esp),%ecx - jmp .L00616_63 + jmp .L00416_63 .align 16 -.L00616_63: +.L00416_63: movl %ecx,%ebx movl 104(%esp),%esi rorl $11,%ecx @@ -246,7 +231,7 @@ _sha256_block_data_order: addl $4,%ebp addl %ebx,%eax cmpl $3329325298,%esi - jne .L00616_63 + jne .L00416_63 movl 356(%esp),%esi movl 8(%esp),%ebx movl 16(%esp),%ecx @@ -280,8 +265,8 @@ _sha256_block_data_order: popl %ebx popl %ebp ret -.align 16 -.L003loop_shrd: +.align 32 +.L005loop_shrd: movl (%edi),%eax movl 4(%edi),%ebx movl 8(%edi),%ecx @@ -350,7 +335,7 @@ _sha256_block_data_order: movl %ecx,28(%esp) movl %edi,32(%esp) .align 16 -.L00700_15_shrd: +.L00600_15_shrd: movl %edx,%ecx movl 24(%esp),%esi shrdl $14,%ecx,%ecx @@ -388,11 +373,11 @@ _sha256_block_data_order: addl $4,%ebp addl %ebx,%eax cmpl $3248222580,%esi - jne .L00700_15_shrd + jne .L00600_15_shrd movl 156(%esp),%ecx - jmp .L00816_63_shrd + jmp .L00716_63_shrd .align 16 -.L00816_63_shrd: +.L00716_63_shrd: movl %ecx,%ebx movl 104(%esp),%esi shrdl $11,%ecx,%ecx @@ -447,7 +432,7 @@ _sha256_block_data_order: addl $4,%ebp addl %ebx,%eax cmpl $3329325298,%esi - jne .L00816_63_shrd + jne .L00716_63_shrd movl 356(%esp),%esi movl 8(%esp),%ebx movl 16(%esp),%ecx @@ -474,7 +459,7 @@ _sha256_block_data_order: leal 356(%esp),%esp subl $256,%ebp cmpl 8(%esp),%edi - jb .L003loop_shrd + jb .L005loop_shrd movl 12(%esp),%esp popl %edi popl %esi @@ -485,8 +470,13 @@ _sha256_block_data_order: .L001K256: .long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298 .long 66051,67438087,134810123,202182159 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 +.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 +.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 +.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 +.byte 62,0 .align 16 -.L004unrolled: +.L008unrolled: leal -96(%esp),%esp movl (%esi),%eax movl 4(%esi),%ebp @@ -3392,10 +3382,4 @@ 
_sha256_block_data_order: popl %ebx popl %ebp ret -.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 -.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 -.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 -.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 -.byte 62,0 -.comm __gnutls_x86_cpuid_s,16 diff --git a/lib/accelerated/x86/coff/sha256-ssse3-x86_64.s b/lib/accelerated/x86/coff/sha256-ssse3-x86_64.s new file mode 100644 index 0000000000..d2fc1957ea --- /dev/null +++ b/lib/accelerated/x86/coff/sha256-ssse3-x86_64.s @@ -0,0 +1,5731 @@ +# Copyright (c) 2011-2016, Andy Polyakov +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain copyright notices, +# this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# * Neither the name of the Andy Polyakov nor the names of its +# copyright holder and contributors may be used to endorse or +# promote products derived from this software without specific +# prior written permission. +# +# ALTERNATIVELY, provided that this notice is retained in full, this +# product may be distributed under the terms of the GNU General Public +# License (GPL), in which case the provisions of the GPL apply INSTEAD OF +# those given above. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# *** This file is auto-generated *** +# +.text + + +.globl sha256_block_data_order +.def sha256_block_data_order; .scl 2; .type 32; .endef +.p2align 4 +sha256_block_data_order: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_sha256_block_data_order: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + leaq _gnutls_x86_cpuid_s(%rip),%r11 + movl 0(%r11),%r9d + movl 4(%r11),%r10d + movl 8(%r11),%r11d + testl $536870912,%r11d + jnz _shaext_shortcut + andl $296,%r11d + cmpl $296,%r11d + je .Lavx2_shortcut + andl $1073741824,%r9d + andl $268435968,%r10d + orl %r9d,%r10d + cmpl $1342177792,%r10d + je .Lavx_shortcut + testl $512,%r10d + jnz .Lssse3_shortcut + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $64+32,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) + +.Lprologue: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp .Lloop + +.p2align 4 +.Lloop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + addl %r14d,%r10d + movl 8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl 
%r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + addl %r14d,%edx + movl 20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + addl %r14d,%eax + movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + addl %r14d,%r11d + movl 36(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl 
%r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl 
%r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + jmp .Lrounds_16_xx +.p2align 4 +.Lrounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + 
rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + movl 24(%rsp),%r13d + movl 12(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl 
%r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl 
%r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d + + addl 52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + 
addl %r12d,%eax + + leaq 20(%rbp),%rbp + cmpb $0,3(%rbp) + jnz .Lrounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop + + movq 88(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +.Lepilogue: + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 + +.LSEH_end_sha256_block_data_order: +.p2align 6 + +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.def sha256_block_data_order_shaext; .scl 3; .type 32; .endef +.p2align 6 +sha256_block_data_order_shaext: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_sha256_block_data_order_shaext: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + +_shaext_shortcut: + leaq -88(%rsp),%rsp + movaps %xmm6,-8-80(%rax) + movaps %xmm7,-8-64(%rax) + movaps %xmm8,-8-48(%rax) + movaps %xmm9,-8-32(%rax) + movaps %xmm10,-8-16(%rax) +.Lprologue_shaext: + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 
+ movdqa 512-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .Loop_shaext + +.p2align 4 +.Loop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 64-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 96-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 224-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 256-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 288-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 320-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 352-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 384-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 416-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 448-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 
15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 480-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz .Loop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + movaps -8-80(%rax),%xmm6 + movaps -8-64(%rax),%xmm7 + movaps -8-48(%rax),%xmm8 + movaps -8-32(%rax),%xmm9 + movaps -8-16(%rax),%xmm10 + movq %rax,%rsp +.Lepilogue_shaext: + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_sha256_block_data_order_shaext: +.def sha256_block_data_order_ssse3; .scl 3; .type 32; .endef +.p2align 6 +sha256_block_data_order_ssse3: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_sha256_block_data_order_ssse3: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + +.Lssse3_shortcut: + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) + + movaps %xmm6,64+32(%rsp) + movaps %xmm7,64+48(%rsp) + movaps %xmm8,64+64(%rsp) + movaps %xmm9,64+80(%rsp) +.Lprologue_ssse3: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp .Lloop_ssse3 +.p2align 4 +.Lloop_ssse3: + movdqa K256+512(%rip),%xmm7 + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rbp +.byte 102,15,56,0,207 + movdqa 0(%rbp),%xmm4 + movdqa 32(%rbp),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 64(%rbp),%xmm6 +.byte 102,15,56,0,223 + movdqa 96(%rbp),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lssse3_00_47 + +.p2align 4 +.Lssse3_00_47: + subq $-128,%rbp + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd 
%xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl 
%r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 32(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 64(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa 
%xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 96(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne .Lssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl 
%r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl 
%r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_ssse3 + + movq 88(%rsp),%rsi + + movaps 64+32(%rsp),%xmm6 + movaps 64+48(%rsp),%xmm7 + movaps 64+64(%rsp),%xmm8 + movaps 64+80(%rsp),%xmm9 + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +.Lepilogue_ssse3: + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 + +.LSEH_end_sha256_block_data_order_ssse3: +.def sha256_block_data_order_avx; .scl 3; .type 
32; .endef +.p2align 6 +sha256_block_data_order_avx: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_sha256_block_data_order_avx: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + +.Lavx_shortcut: + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) + + movaps %xmm6,64+32(%rsp) + movaps %xmm7,64+48(%rsp) + movaps %xmm8,64+64(%rsp) + movaps %xmm9,64+80(%rsp) +.Lprologue_avx: + + vzeroupper + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%xmm8 + vmovdqa K256+512+64(%rip),%xmm9 + jmp .Lloop_avx +.p2align 4 +.Lloop_avx: + vmovdqa K256+512(%rip),%xmm7 + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm7,%xmm0,%xmm0 + leaq K256(%rip),%rbp + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd 0(%rbp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 32(%rbp),%xmm1,%xmm5 + vpaddd 64(%rbp),%xmm2,%xmm6 + vpaddd 96(%rbp),%xmm3,%xmm7 + vmovdqa %xmm4,0(%rsp) + movl %eax,%r14d + vmovdqa %xmm5,16(%rsp) + movl %ebx,%edi + vmovdqa %xmm6,32(%rsp) + xorl %ecx,%edi + vmovdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lavx_00_47 + +.p2align 4 +.Lavx_00_47: + subq $-128,%rbp + vpalignr $4,%xmm0,%xmm1,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm2,%xmm3,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm0,%xmm0 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm3,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm0,%xmm0 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm0,%xmm0 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + vpshufd $80,%xmm0,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl 
%r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm0,%xmm0 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 0(%rbp),%xmm0,%xmm6 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,0(%rsp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm3,%xmm0,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm1,%xmm1 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm0,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm1,%xmm1 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm1,%xmm1 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + vpshufd $80,%xmm1,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm1,%xmm1 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 32(%rbp),%xmm1,%xmm6 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl 
$2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,16(%rsp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm0,%xmm1,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm2,%xmm2 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm1,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm2,%xmm2 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm2,%xmm2 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + vpshufd $80,%xmm2,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm2,%xmm2 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 64(%rbp),%xmm2,%xmm6 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,32(%rsp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm1,%xmm2,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm3,%xmm3 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + 
vpshufd $250,%xmm2,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm3,%xmm3 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm3,%xmm3 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + vpshufd $80,%xmm3,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm3,%xmm3 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 96(%rbp),%xmm3,%xmm6 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne .Lavx_00_47 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl 
$11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl 
%eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl 
%ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_avx + + movq 88(%rsp),%rsi + + vzeroupper + movaps 64+32(%rsp),%xmm6 + movaps 64+48(%rsp),%xmm7 + movaps 64+64(%rsp),%xmm8 + movaps 64+80(%rsp),%xmm9 + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +.Lepilogue_avx: + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 + +.LSEH_end_sha256_block_data_order_avx: +.def sha256_block_data_order_avx2; .scl 3; .type 32; .endef +.p2align 6 +sha256_block_data_order_avx2: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_sha256_block_data_order_avx2: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + +.Lavx2_shortcut: + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $608,%rsp + shlq $4,%rdx + andq $-1024,%rsp + leaq (%rsi,%rdx,4),%rdx + addq $448,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) + + movaps %xmm6,64+32(%rsp) + movaps %xmm7,64+48(%rsp) + movaps %xmm8,64+64(%rsp) + movaps %xmm9,64+80(%rsp) +.Lprologue_avx2: + + vzeroupper + subq $-64,%rsi + movl 0(%rdi),%eax + movq %rsi,%r12 + movl 4(%rdi),%ebx + cmpq %rdx,%rsi + movl 8(%rdi),%ecx + cmoveq %rsp,%r12 + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%ymm8 + vmovdqa K256+512+64(%rip),%ymm9 + jmp .Loop_avx2 +.p2align 4 +.Loop_avx2: + vmovdqa K256+512(%rip),%ymm7 + vmovdqu -64+0(%rsi),%xmm0 + vmovdqu -64+16(%rsi),%xmm1 + vmovdqu -64+32(%rsi),%xmm2 + vmovdqu -64+48(%rsi),%xmm3 + + vinserti128 $1,(%r12),%ymm0,%ymm0 + vinserti128 $1,16(%r12),%ymm1,%ymm1 + vpshufb %ymm7,%ymm0,%ymm0 + vinserti128 $1,32(%r12),%ymm2,%ymm2 + vpshufb %ymm7,%ymm1,%ymm1 + vinserti128 $1,48(%r12),%ymm3,%ymm3 + + leaq K256(%rip),%rbp + vpshufb %ymm7,%ymm2,%ymm2 + vpaddd 0(%rbp),%ymm0,%ymm4 + vpshufb %ymm7,%ymm3,%ymm3 + vpaddd 32(%rbp),%ymm1,%ymm5 + vpaddd 64(%rbp),%ymm2,%ymm6 + vpaddd 96(%rbp),%ymm3,%ymm7 + vmovdqa %ymm4,0(%rsp) + xorl %r14d,%r14d + vmovdqa %ymm5,32(%rsp) + leaq -64(%rsp),%rsp + movl %ebx,%edi + vmovdqa %ymm6,0(%rsp) + xorl %ecx,%edi + vmovdqa %ymm7,32(%rsp) + movl %r9d,%r12d + subq $-32*4,%rbp + jmp .Lavx2_00_47 + +.p2align 4 +.Lavx2_00_47: + leaq -64(%rsp),%rsp + vpalignr $4,%ymm0,%ymm1,%ymm4 + addl 0+128(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + vpalignr $4,%ymm2,%ymm3,%ymm7 + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + vpsrld $7,%ymm4,%ymm6 + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + vpaddd %ymm7,%ymm0,%ymm0 + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + vpshufd $250,%ymm3,%ymm7 + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 4+128(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + vpslld $11,%ymm5,%ymm5 + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%r10,%r12,1),%r10d + xorl 
%r14d,%r13d + movl %r11d,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + vpaddd %ymm4,%ymm0,%ymm0 + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 8+128(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + vpxor %ymm7,%ymm6,%ymm6 + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + vpaddd %ymm6,%ymm0,%ymm0 + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + vpshufd $80,%ymm0,%ymm7 + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 12+128(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + vpxor %ymm7,%ymm6,%ymm6 + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + vpaddd %ymm6,%ymm0,%ymm0 + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + vpaddd 0(%rbp),%ymm0,%ymm6 + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + vmovdqa %ymm6,0(%rsp) + vpalignr $4,%ymm1,%ymm2,%ymm4 + addl 32+128(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + vpalignr $4,%ymm3,%ymm0,%ymm7 + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + vpsrld $7,%ymm4,%ymm6 + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + vpaddd %ymm7,%ymm1,%ymm1 + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + vpshufd $250,%ymm0,%ymm7 + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 36+128(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + vpslld $11,%ymm5,%ymm5 + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + vpaddd %ymm4,%ymm1,%ymm1 + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 40+128(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + vpxor %ymm7,%ymm6,%ymm6 + andnl %eax,%r10d,%r12d + xorl 
%r15d,%r13d + rorxl $6,%r10d,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + vpaddd %ymm6,%ymm1,%ymm1 + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + vpshufd $80,%ymm1,%ymm7 + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 44+128(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + vpxor %ymm7,%ymm6,%ymm6 + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + vpaddd %ymm6,%ymm1,%ymm1 + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + vpaddd 32(%rbp),%ymm1,%ymm6 + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + vmovdqa %ymm6,32(%rsp) + leaq -64(%rsp),%rsp + vpalignr $4,%ymm2,%ymm3,%ymm4 + addl 0+128(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + vpalignr $4,%ymm0,%ymm1,%ymm7 + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + vpsrld $7,%ymm4,%ymm6 + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + vpaddd %ymm7,%ymm2,%ymm2 + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + vpshufd $250,%ymm1,%ymm7 + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 4+128(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + vpslld $11,%ymm5,%ymm5 + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + vpaddd %ymm4,%ymm2,%ymm2 + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 8+128(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + vpxor %ymm7,%ymm6,%ymm6 + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + vpaddd %ymm6,%ymm2,%ymm2 + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + vpshufd $80,%ymm2,%ymm7 + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 12+128(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl 
$11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + vpxor %ymm7,%ymm6,%ymm6 + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + vpaddd %ymm6,%ymm2,%ymm2 + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + vpaddd 64(%rbp),%ymm2,%ymm6 + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + vmovdqa %ymm6,0(%rsp) + vpalignr $4,%ymm3,%ymm0,%ymm4 + addl 32+128(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + vpalignr $4,%ymm1,%ymm2,%ymm7 + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + vpsrld $7,%ymm4,%ymm6 + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + vpaddd %ymm7,%ymm3,%ymm3 + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + vpshufd $250,%ymm2,%ymm7 + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 36+128(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + vpslld $11,%ymm5,%ymm5 + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + vpaddd %ymm4,%ymm3,%ymm3 + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 40+128(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + vpxor %ymm7,%ymm6,%ymm6 + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + vpaddd %ymm6,%ymm3,%ymm3 + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + vpshufd $80,%ymm3,%ymm7 + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 44+128(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + vpxor %ymm7,%ymm6,%ymm6 + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + vpaddd %ymm6,%ymm3,%ymm3 + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + vpaddd 96(%rbp),%ymm3,%ymm6 + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + vmovdqa %ymm6,32(%rsp) + leaq 128(%rbp),%rbp + cmpb $0,3(%rbp) + jne .Lavx2_00_47 
+ addl 0+64(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + addl 4+64(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8+64(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + addl 12+64(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32+64(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + addl 36+64(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40+64(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal 
(%r9,%rbx,1),%r9d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + addl 44+64(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + addl 0(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + addl 4(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + addl 12(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + addl 36(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + rorxl $22,%edx,%r12d + 
leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + addl 44(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + movq 512(%rsp),%rdi + addl %r14d,%eax + + leaq 448(%rsp),%rbp + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + + cmpq 80(%rbp),%rsi + je .Ldone_avx2 + + xorl %r14d,%r14d + movl %ebx,%edi + xorl %ecx,%edi + movl %r9d,%r12d + jmp .Lower_avx2 +.p2align 4 +.Lower_avx2: + addl 0+16(%rbp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + addl 4+16(%rbp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8+16(%rbp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + addl 12+16(%rbp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + 
andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32+16(%rbp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + addl 36+16(%rbp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40+16(%rbp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + addl 44+16(%rbp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + leaq -64(%rbp),%rbp + cmpq %rsp,%rbp + jae .Lower_avx2 + + movq 512(%rsp),%rdi + addl %r14d,%eax + + leaq 448(%rsp),%rsp + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + leaq 128(%rsi),%rsi + addl 24(%rdi),%r10d + movq %rsi,%r12 + addl 28(%rdi),%r11d + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + cmoveq %rsp,%r12 + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + + jbe .Loop_avx2 + leaq (%rsp),%rbp + +.Ldone_avx2: + leaq (%rbp),%rsp + movq 88(%rsp),%rsi + + vzeroupper + movaps 64+32(%rsp),%xmm6 + movaps 64+48(%rsp),%xmm7 + movaps 64+64(%rsp),%xmm8 + movaps 64+80(%rsp),%xmm9 + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +.Lepilogue_avx2: + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 + +.LSEH_end_sha256_block_data_order_avx2: + +.def se_handler; .scl 3; .type 
32; .endef +.p2align 4 +se_handler: + pushq %rsi + pushq %rdi + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushfq + subq $64,%rsp + + movq 120(%r8),%rax + movq 248(%r8),%rbx + + movq 8(%r9),%rsi + movq 56(%r9),%r11 + + movl 0(%r11),%r10d + leaq (%rsi,%r10,1),%r10 + cmpq %r10,%rbx + jb .Lin_prologue + + movq 152(%r8),%rax + + movl 4(%r11),%r10d + leaq (%rsi,%r10,1),%r10 + cmpq %r10,%rbx + jae .Lin_prologue + leaq .Lavx2_shortcut(%rip),%r10 + cmpq %r10,%rbx + jb .Lnot_in_avx2 + + andq $-1024,%rax + addq $448,%rax +.Lnot_in_avx2: + movq %rax,%rsi + movq 64+24(%rax),%rax + + movq -8(%rax),%rbx + movq -16(%rax),%rbp + movq -24(%rax),%r12 + movq -32(%rax),%r13 + movq -40(%rax),%r14 + movq -48(%rax),%r15 + movq %rbx,144(%r8) + movq %rbp,160(%r8) + movq %r12,216(%r8) + movq %r13,224(%r8) + movq %r14,232(%r8) + movq %r15,240(%r8) + + leaq .Lepilogue(%rip),%r10 + cmpq %r10,%rbx + jb .Lin_prologue + + leaq 64+32(%rsi),%rsi + leaq 512(%r8),%rdi + movl $8,%ecx +.long 0xa548f3fc + +.Lin_prologue: + movq 8(%rax),%rdi + movq 16(%rax),%rsi + movq %rax,152(%r8) + movq %rsi,168(%r8) + movq %rdi,176(%r8) + + movq 40(%r9),%rdi + movq %r8,%rsi + movl $154,%ecx +.long 0xa548f3fc + + movq %r9,%rsi + xorq %rcx,%rcx + movq 8(%rsi),%rdx + movq 0(%rsi),%r8 + movq 16(%rsi),%r9 + movq 40(%rsi),%r10 + leaq 56(%rsi),%r11 + leaq 24(%rsi),%r12 + movq %r10,32(%rsp) + movq %r11,40(%rsp) + movq %r12,48(%rsp) + movq %rcx,56(%rsp) + call *__imp_RtlVirtualUnwind(%rip) + + movl $1,%eax + addq $64,%rsp + popfq + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + popq %rdi + popq %rsi + .byte 0xf3,0xc3 + +.def shaext_handler; .scl 3; .type 32; .endef +.p2align 4 +shaext_handler: + pushq %rsi + pushq %rdi + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushfq + subq $64,%rsp + + movq 120(%r8),%rax + movq 248(%r8),%rbx + + leaq .Lprologue_shaext(%rip),%r10 + cmpq %r10,%rbx + jb .Lin_prologue + + leaq .Lepilogue_shaext(%rip),%r10 + cmpq %r10,%rbx + jae .Lin_prologue + + leaq -8-80(%rax),%rsi + leaq 512(%r8),%rdi + movl $10,%ecx +.long 0xa548f3fc + + jmp .Lin_prologue + +.section .pdata +.p2align 2 +.rva .LSEH_begin_sha256_block_data_order +.rva .LSEH_end_sha256_block_data_order +.rva .LSEH_info_sha256_block_data_order +.rva .LSEH_begin_sha256_block_data_order_shaext +.rva .LSEH_end_sha256_block_data_order_shaext +.rva .LSEH_info_sha256_block_data_order_shaext +.rva .LSEH_begin_sha256_block_data_order_ssse3 +.rva .LSEH_end_sha256_block_data_order_ssse3 +.rva .LSEH_info_sha256_block_data_order_ssse3 +.rva .LSEH_begin_sha256_block_data_order_avx +.rva .LSEH_end_sha256_block_data_order_avx +.rva .LSEH_info_sha256_block_data_order_avx +.rva .LSEH_begin_sha256_block_data_order_avx2 +.rva .LSEH_end_sha256_block_data_order_avx2 +.rva .LSEH_info_sha256_block_data_order_avx2 +.section .xdata +.p2align 3 +.LSEH_info_sha256_block_data_order: +.byte 9,0,0,0 +.rva se_handler +.rva .Lprologue,.Lepilogue +.LSEH_info_sha256_block_data_order_shaext: +.byte 9,0,0,0 +.rva shaext_handler +.LSEH_info_sha256_block_data_order_ssse3: +.byte 9,0,0,0 +.rva se_handler +.rva .Lprologue_ssse3,.Lepilogue_ssse3 +.LSEH_info_sha256_block_data_order_avx: +.byte 9,0,0,0 +.rva se_handler +.rva .Lprologue_avx,.Lepilogue_avx +.LSEH_info_sha256_block_data_order_avx2: +.byte 9,0,0,0 +.rva se_handler +.rva .Lprologue_avx2,.Lepilogue_avx2 + diff --git a/lib/accelerated/x86/coff/sha512-ssse3-x86.s b/lib/accelerated/x86/coff/sha512-ssse3-x86.s index acad0ec1e7..72a7f73d77 100644 --- 
a/lib/accelerated/x86/coff/sha512-ssse3-x86.s +++ b/lib/accelerated/x86/coff/sha512-ssse3-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -37,7 +37,6 @@ # # *** This file is auto-generated *** # -.file "sha512-586.s" .text .globl _sha512_block_data_order .def _sha512_block_data_order; .scl 2; .type 32; .endef @@ -594,6 +593,8 @@ _sha512_block_data_order: .long 4234509866,1501505948 .long 987167468,1607167915 .long 1246189591,1816402316 +.long 67438087,66051 +.long 202182159,134810123 .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97 .byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 .byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 diff --git a/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s b/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s index 034dab2388..419fa2a980 100644 --- a/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s +++ b/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -40,2880 +40,5516 @@ .text -.globl sha256_block_data_order -.def sha256_block_data_order; .scl 2; .type 32; .endef +.globl sha512_block_data_order +.def sha512_block_data_order; .scl 2; .type 32; .endef .p2align 4 -sha256_block_data_order: +sha512_block_data_order: movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%rax -.LSEH_begin_sha256_block_data_order: +.LSEH_begin_sha512_block_data_order: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx + leaq _gnutls_x86_cpuid_s(%rip),%r11 movl 0(%r11),%r9d movl 4(%r11),%r10d movl 8(%r11),%r11d - testl $512,%r10d - jnz .Lssse3_shortcut + testl $2048,%r10d + jnz .Lxop_shortcut + andl $296,%r11d + cmpl $296,%r11d + je .Lavx2_shortcut + andl $1073741824,%r9d + andl $268435968,%r10d + orl %r9d,%r10d + cmpl $1342177792,%r10d + je .Lavx_shortcut + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - movq %rsp,%r11 + shlq $4,%rdx - subq $64+32,%rsp - leaq (%rsi,%rdx,4),%rdx + subq $128+32,%rsp + leaq (%rsi,%rdx,8),%rdx andq $-64,%rsp - movq %rdi,64+0(%rsp) - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) + .Lprologue: - movl 0(%rdi),%eax - movl 4(%rdi),%ebx - movl 8(%rdi),%ecx - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 jmp .Lloop .p2align 4 .Lloop: - movl %ebx,%edi - leaq K256(%rip),%rbp - xorl %ecx,%edi - movl 0(%rsi),%r12d - movl %r8d,%r13d - movl %eax,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r9d,%r15d - - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r15d - - movl %r12d,0(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl (%rbp),%r12d - xorl %eax,%r14d - - rorl $6,%r13d - xorl %ebx,%r15d - movl %ebx,%r11d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - leaq 4(%rbp),%rbp - addl %r14d,%r11d - - movl 4(%rsi),%r12d - movl %edx,%r13d - movl 
%r11d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r8d,%edi - - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%edi - - movl %r12d,4(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl (%rbp),%r12d - xorl %r11d,%r14d - - rorl $6,%r13d - xorl %eax,%edi - movl %eax,%r10d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - leaq 4(%rbp),%rbp - addl %r14d,%r10d - - movl 8(%rsi),%r12d - movl %ecx,%r13d - movl %r10d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %edx,%r15d - - rorl $9,%r14d - xorl %ecx,%r13d - xorl %r8d,%r15d - - movl %r12d,8(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl (%rbp),%r12d - xorl %r10d,%r14d - - rorl $6,%r13d - xorl %r11d,%r15d - movl %r11d,%r9d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - leaq 4(%rbp),%rbp - addl %r14d,%r9d - - movl 12(%rsi),%r12d - movl %ebx,%r13d - movl %r9d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ecx,%edi - - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%edi - - movl %r12d,12(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl (%rbp),%r12d - xorl %r9d,%r14d - - rorl $6,%r13d - xorl %r10d,%edi - movl %r10d,%r8d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - leaq 20(%rbp),%rbp - addl %r14d,%r8d - - movl 16(%rsi),%r12d - movl %eax,%r13d - movl %r8d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ebx,%r15d - - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r15d - - movl %r12d,16(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl (%rbp),%r12d - xorl %r8d,%r14d - - rorl $6,%r13d - xorl %r9d,%r15d - movl %r9d,%edx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - leaq 4(%rbp),%rbp - addl %r14d,%edx - - movl 20(%rsi),%r12d - movl %r11d,%r13d - movl %edx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %eax,%edi - - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%edi - - movl %r12d,20(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl (%rbp),%r12d - xorl %edx,%r14d - - rorl $6,%r13d - xorl %r8d,%edi - movl %r8d,%ecx - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - leaq 4(%rbp),%rbp - addl %r14d,%ecx - - movl 24(%rsi),%r12d - movl %r10d,%r13d - movl %ecx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r11d,%r15d - - rorl $9,%r14d - xorl %r10d,%r13d - xorl %eax,%r15d - - movl %r12d,24(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl (%rbp),%r12d - xorl %ecx,%r14d - - rorl $6,%r13d - xorl %edx,%r15d - movl %edx,%ebx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - leaq 4(%rbp),%rbp - addl %r14d,%ebx - - movl 28(%rsi),%r12d - movl 
%r9d,%r13d - movl %ebx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r10d,%edi - - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%edi - - movl %r12d,28(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl (%rbp),%r12d - xorl %ebx,%r14d - - rorl $6,%r13d - xorl %ecx,%edi - movl %ecx,%eax - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - leaq 20(%rbp),%rbp - addl %r14d,%eax - - movl 32(%rsi),%r12d - movl %r8d,%r13d - movl %eax,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r9d,%r15d - - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r15d - - movl %r12d,32(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl (%rbp),%r12d - xorl %eax,%r14d - - rorl $6,%r13d - xorl %ebx,%r15d - movl %ebx,%r11d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - leaq 4(%rbp),%rbp - addl %r14d,%r11d - - movl 36(%rsi),%r12d - movl %edx,%r13d - movl %r11d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r8d,%edi - - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%edi - - movl %r12d,36(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl (%rbp),%r12d - xorl %r11d,%r14d - - rorl $6,%r13d - xorl %eax,%edi - movl %eax,%r10d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - leaq 4(%rbp),%rbp - addl %r14d,%r10d - - movl 40(%rsi),%r12d - movl %ecx,%r13d - movl %r10d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %edx,%r15d - - rorl $9,%r14d - xorl %ecx,%r13d - xorl %r8d,%r15d - - movl %r12d,40(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl (%rbp),%r12d - xorl %r10d,%r14d - - rorl $6,%r13d - xorl %r11d,%r15d - movl %r11d,%r9d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - leaq 4(%rbp),%rbp - addl %r14d,%r9d - - movl 44(%rsi),%r12d - movl %ebx,%r13d - movl %r9d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ecx,%edi - - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%edi - - movl %r12d,44(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl (%rbp),%r12d - xorl %r9d,%r14d - - rorl $6,%r13d - xorl %r10d,%edi - movl %r10d,%r8d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - leaq 20(%rbp),%rbp - addl %r14d,%r8d - - movl 48(%rsi),%r12d - movl %eax,%r13d - movl %r8d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ebx,%r15d - - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r15d - - movl %r12d,48(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl (%rbp),%r12d - xorl %r8d,%r14d - - rorl $6,%r13d - xorl %r9d,%r15d - movl %r9d,%edx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - leaq 4(%rbp),%rbp - addl %r14d,%edx - - movl 
52(%rsi),%r12d - movl %r11d,%r13d - movl %edx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %eax,%edi - - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%edi - - movl %r12d,52(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl (%rbp),%r12d - xorl %edx,%r14d - - rorl $6,%r13d - xorl %r8d,%edi - movl %r8d,%ecx - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - leaq 4(%rbp),%rbp - addl %r14d,%ecx - - movl 56(%rsi),%r12d - movl %r10d,%r13d - movl %ecx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r11d,%r15d - - rorl $9,%r14d - xorl %r10d,%r13d - xorl %eax,%r15d - - movl %r12d,56(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl (%rbp),%r12d - xorl %ecx,%r14d - - rorl $6,%r13d - xorl %edx,%r15d - movl %edx,%ebx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - leaq 4(%rbp),%rbp - addl %r14d,%ebx - - movl 60(%rsi),%r12d - movl %r9d,%r13d - movl %ebx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r10d,%edi - - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%edi - - movl %r12d,60(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl (%rbp),%r12d - xorl %ebx,%r14d - - rorl $6,%r13d - xorl %ecx,%edi - movl %ecx,%eax - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - movl 4(%rsp),%r13d - leaq 20(%rbp),%rbp - addl %r14d,%eax + movq %rbx,%rdi + leaq K512(%rip),%rbp + xorq %rcx,%rdi + movq 0(%rsi),%r12 + movq %r8,%r13 + movq %rax,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,0(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + addq %r14,%r11 + movq 8(%rsi),%r12 + movq %rdx,%r13 + movq %r11,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,8(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + addq %r14,%r10 + movq 16(%rsi),%r12 + movq %rcx,%r13 + movq %r10,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,16(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + addq %r14,%r9 + movq 24(%rsi),%r12 + movq 
%rbx,%r13 + movq %r9,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rcx,%rdi + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,24(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + addq %r14,%r8 + movq 32(%rsi),%r12 + movq %rax,%r13 + movq %r8,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,32(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + addq %r14,%rdx + movq 40(%rsi),%r12 + movq %r11,%r13 + movq %rdx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,40(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + addq %r14,%rcx + movq 48(%rsi),%r12 + movq %r10,%r13 + movq %rcx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,48(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + addq %r14,%rbx + movq 56(%rsi),%r12 + movq %r9,%r13 + movq %rbx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,56(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + addq %r14,%rax + movq 64(%rsi),%r12 + movq %r8,%r13 + movq %rax,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,64(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + addq %r14,%r11 + movq 72(%rsi),%r12 + movq %rdx,%r13 + movq %r11,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,72(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq 
%r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + addq %r14,%r10 + movq 80(%rsi),%r12 + movq %rcx,%r13 + movq %r10,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,80(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + addq %r14,%r9 + movq 88(%rsi),%r12 + movq %rbx,%r13 + movq %r9,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,88(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + addq %r14,%r8 + movq 96(%rsi),%r12 + movq %rax,%r13 + movq %r8,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,96(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + addq %r14,%rdx + movq 104(%rsi),%r12 + movq %r11,%r13 + movq %rdx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,104(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + addq %r14,%rcx + movq 112(%rsi),%r12 + movq %r10,%r13 + movq %rcx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,112(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + addq %r14,%rbx + movq 120(%rsi),%r12 + movq %r9,%r13 + movq %rbx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,120(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq 
%rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp jmp .Lrounds_16_xx .p2align 4 .Lrounds_16_xx: + movq 8(%rsp),%r13 + movq 112(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 72(%rsp),%r12 + + addq 0(%rsp),%r12 + movq %r8,%r13 + addq %r15,%r12 + movq %rax,%r14 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,0(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + movq 16(%rsp),%r13 + movq 120(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 80(%rsp),%r12 + + addq 8(%rsp),%r12 + movq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%r14 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,8(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + movq 24(%rsp),%r13 + movq 0(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 88(%rsp),%r12 + + addq 16(%rsp),%r12 + movq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r14 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,16(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + movq 32(%rsp),%r13 + movq 8(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 96(%rsp),%r12 + + addq 24(%rsp),%r12 + movq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%r14 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,24(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + movq 40(%rsp),%r13 + movq 16(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 
+ movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 104(%rsp),%r12 + + addq 32(%rsp),%r12 + movq %rax,%r13 + addq %r15,%r12 + movq %r8,%r14 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,32(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + movq 48(%rsp),%r13 + movq 24(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 112(%rsp),%r12 + + addq 40(%rsp),%r12 + movq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%r14 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,40(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + movq 56(%rsp),%r13 + movq 32(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 120(%rsp),%r12 + + addq 48(%rsp),%r12 + movq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r14 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,48(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + movq 64(%rsp),%r13 + movq 40(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 0(%rsp),%r12 + + addq 56(%rsp),%r12 + movq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%r14 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,56(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + movq 72(%rsp),%r13 + movq 48(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 8(%rsp),%r12 + + addq 64(%rsp),%r12 + movq %r8,%r13 + addq 
%r15,%r12 + movq %rax,%r14 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,64(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + movq 80(%rsp),%r13 + movq 56(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 16(%rsp),%r12 + + addq 72(%rsp),%r12 + movq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%r14 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,72(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + movq 88(%rsp),%r13 + movq 64(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 24(%rsp),%r12 + + addq 80(%rsp),%r12 + movq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r14 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,80(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + movq 96(%rsp),%r13 + movq 72(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 32(%rsp),%r12 + + addq 88(%rsp),%r12 + movq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%r14 + rorq $23,%r13 + movq %rcx,%rdi - movl 56(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 36(%rsp),%r12d - xorl %r15d,%r14d - - addl 0(%rsp),%r12d - movl %r8d,%r13d - addl %r14d,%r12d - movl %eax,%r14d - rorl $14,%r13d - movl %r9d,%r15d - - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r15d - - movl %r12d,0(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl (%rbp),%r12d - xorl %eax,%r14d - - rorl $6,%r13d - xorl %ebx,%r15d - movl %ebx,%r11d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - movl 8(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%r11d - - - movl 60(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl 
$3,%r12d - rorl $7,%r13d - xorl %edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 40(%rsp),%r12d - xorl %edi,%r14d - - addl 4(%rsp),%r12d - movl %edx,%r13d - addl %r14d,%r12d - movl %r11d,%r14d - rorl $14,%r13d - movl %r8d,%edi - - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%edi - - movl %r12d,4(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl (%rbp),%r12d - xorl %r11d,%r14d - - rorl $6,%r13d - xorl %eax,%edi - movl %eax,%r10d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - movl 12(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%r10d - - - movl 0(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 44(%rsp),%r12d - xorl %r15d,%r14d - - addl 8(%rsp),%r12d - movl %ecx,%r13d - addl %r14d,%r12d - movl %r10d,%r14d - rorl $14,%r13d - movl %edx,%r15d - - rorl $9,%r14d - xorl %ecx,%r13d - xorl %r8d,%r15d - - movl %r12d,8(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl (%rbp),%r12d - xorl %r10d,%r14d - - rorl $6,%r13d - xorl %r11d,%r15d - movl %r11d,%r9d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - movl 16(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%r9d - - - movl 4(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 48(%rsp),%r12d - xorl %edi,%r14d - - addl 12(%rsp),%r12d - movl %ebx,%r13d - addl %r14d,%r12d - movl %r9d,%r14d - rorl $14,%r13d - movl %ecx,%edi - - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%edi - - movl %r12d,12(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl (%rbp),%r12d - xorl %r9d,%r14d - - rorl $6,%r13d - xorl %r10d,%edi - movl %r10d,%r8d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - movl 20(%rsp),%r13d - leaq 20(%rbp),%rbp - addl %r14d,%r8d - - - movl 8(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 52(%rsp),%r12d - xorl %r15d,%r14d - - addl 16(%rsp),%r12d - movl %eax,%r13d - addl %r14d,%r12d - movl %r8d,%r14d - rorl $14,%r13d - movl %ebx,%r15d - - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r15d - - movl %r12d,16(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl (%rbp),%r12d - xorl %r8d,%r14d - - rorl $6,%r13d - xorl %r9d,%r15d - movl %r9d,%edx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - movl 24(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%edx - - - movl 12(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl 
$7,%r13d - xorl %edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 56(%rsp),%r12d - xorl %edi,%r14d - - addl 20(%rsp),%r12d - movl %r11d,%r13d - addl %r14d,%r12d - movl %edx,%r14d - rorl $14,%r13d - movl %eax,%edi - - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%edi - - movl %r12d,20(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl (%rbp),%r12d - xorl %edx,%r14d - - rorl $6,%r13d - xorl %r8d,%edi - movl %r8d,%ecx - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - movl 28(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%ecx - - - movl 16(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 60(%rsp),%r12d - xorl %r15d,%r14d - - addl 24(%rsp),%r12d - movl %r10d,%r13d - addl %r14d,%r12d - movl %ecx,%r14d - rorl $14,%r13d - movl %r11d,%r15d - - rorl $9,%r14d - xorl %r10d,%r13d - xorl %eax,%r15d - - movl %r12d,24(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl (%rbp),%r12d - xorl %ecx,%r14d - - rorl $6,%r13d - xorl %edx,%r15d - movl %edx,%ebx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - movl 32(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%ebx - - - movl 20(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 0(%rsp),%r12d - xorl %edi,%r14d - - addl 28(%rsp),%r12d - movl %r9d,%r13d - addl %r14d,%r12d - movl %ebx,%r14d - rorl $14,%r13d - movl %r10d,%edi - - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%edi - - movl %r12d,28(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl (%rbp),%r12d - xorl %ebx,%r14d - - rorl $6,%r13d - xorl %ecx,%edi - movl %ecx,%eax - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - movl 36(%rsp),%r13d - leaq 20(%rbp),%rbp - addl %r14d,%eax - - - movl 24(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 4(%rsp),%r12d - xorl %r15d,%r14d - - addl 32(%rsp),%r12d - movl %r8d,%r13d - addl %r14d,%r12d - movl %eax,%r14d - rorl $14,%r13d - movl %r9d,%r15d - - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r15d - - movl %r12d,32(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl (%rbp),%r12d - xorl %eax,%r14d - - rorl $6,%r13d - xorl %ebx,%r15d - movl %ebx,%r11d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - movl 40(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%r11d - - - movl 28(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - 
xorl %edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 8(%rsp),%r12d - xorl %edi,%r14d - - addl 36(%rsp),%r12d - movl %edx,%r13d - addl %r14d,%r12d - movl %r11d,%r14d - rorl $14,%r13d - movl %r8d,%edi - - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%edi - - movl %r12d,36(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl (%rbp),%r12d - xorl %r11d,%r14d - - rorl $6,%r13d - xorl %eax,%edi - movl %eax,%r10d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - movl 44(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%r10d - - - movl 32(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 12(%rsp),%r12d - xorl %r15d,%r14d - - addl 40(%rsp),%r12d - movl %ecx,%r13d - addl %r14d,%r12d - movl %r10d,%r14d - rorl $14,%r13d - movl %edx,%r15d - - rorl $9,%r14d - xorl %ecx,%r13d - xorl %r8d,%r15d - - movl %r12d,40(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl (%rbp),%r12d - xorl %r10d,%r14d - - rorl $6,%r13d - xorl %r11d,%r15d - movl %r11d,%r9d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - movl 48(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%r9d - - - movl 36(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 16(%rsp),%r12d - xorl %edi,%r14d - - addl 44(%rsp),%r12d - movl %ebx,%r13d - addl %r14d,%r12d - movl %r9d,%r14d - rorl $14,%r13d - movl %ecx,%edi - - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%edi - - movl %r12d,44(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl (%rbp),%r12d - xorl %r9d,%r14d - - rorl $6,%r13d - xorl %r10d,%edi - movl %r10d,%r8d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - movl 52(%rsp),%r13d - leaq 20(%rbp),%rbp - addl %r14d,%r8d - - - movl 40(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 20(%rsp),%r12d - xorl %r15d,%r14d - - addl 48(%rsp),%r12d - movl %eax,%r13d - addl %r14d,%r12d - movl %r8d,%r14d - rorl $14,%r13d - movl %ebx,%r15d - - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r15d - - movl %r12d,48(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl (%rbp),%r12d - xorl %r8d,%r14d - - rorl $6,%r13d - xorl %r9d,%r15d - movl %r9d,%edx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - movl 56(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%edx - - - movl 44(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl 
%edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 24(%rsp),%r12d - xorl %edi,%r14d - - addl 52(%rsp),%r12d - movl %r11d,%r13d - addl %r14d,%r12d - movl %edx,%r14d - rorl $14,%r13d - movl %eax,%edi - - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%edi - - movl %r12d,52(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl (%rbp),%r12d - xorl %edx,%r14d - - rorl $6,%r13d - xorl %r8d,%edi - movl %r8d,%ecx - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - movl 60(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%ecx - - - movl 48(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 28(%rsp),%r12d - xorl %r15d,%r14d - - addl 56(%rsp),%r12d - movl %r10d,%r13d - addl %r14d,%r12d - movl %ecx,%r14d - rorl $14,%r13d - movl %r11d,%r15d - - rorl $9,%r14d - xorl %r10d,%r13d - xorl %eax,%r15d - - movl %r12d,56(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl (%rbp),%r12d - xorl %ecx,%r14d - - rorl $6,%r13d - xorl %edx,%r15d - movl %edx,%ebx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - movl 0(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%ebx - - - movl 52(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 32(%rsp),%r12d - xorl %edi,%r14d - - addl 60(%rsp),%r12d - movl %r9d,%r13d - addl %r14d,%r12d - movl %ebx,%r14d - rorl $14,%r13d - movl %r10d,%edi - - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%edi - - movl %r12d,60(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl (%rbp),%r12d - xorl %ebx,%r14d - - rorl $6,%r13d - xorl %ecx,%edi - movl %ecx,%eax - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - movl 4(%rsp),%r13d - leaq 20(%rbp),%rbp - addl %r14d,%eax - - cmpb $0,3(%rbp) + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,88(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + movq 104(%rsp),%r13 + movq 80(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 40(%rsp),%r12 + + addq 96(%rsp),%r12 + movq %rax,%r13 + addq %r15,%r12 + movq %r8,%r14 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,96(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq 
%r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + movq 112(%rsp),%r13 + movq 88(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 48(%rsp),%r12 + + addq 104(%rsp),%r12 + movq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%r14 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,104(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + movq 120(%rsp),%r13 + movq 96(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 56(%rsp),%r12 + + addq 112(%rsp),%r12 + movq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r14 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,112(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + movq 0(%rsp),%r13 + movq 104(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 64(%rsp),%r12 + + addq 120(%rsp),%r12 + movq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%r14 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,120(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + cmpb $0,7(%rbp) jnz .Lrounds_16_xx - movq 64+0(%rsp),%rdi - leaq 64(%rsi),%rsi - - addl 0(%rdi),%eax - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - cmpq 64+16(%rsp),%rsi - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) + movq 128+0(%rsp),%rdi + addq %r14,%rax + leaq 128(%rsi),%rsi + + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq 
%r10,48(%rdi) + movq %r11,56(%rdi) jb .Lloop - movq 64+24(%rsp),%rsi - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq 152(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + .Lepilogue: movq 8(%rsp),%rdi movq 16(%rsp),%rsi .byte 0xf3,0xc3 -.LSEH_end_sha256_block_data_order: + +.LSEH_end_sha512_block_data_order: .p2align 6 -K256: -.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 -.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f -.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff -.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff -.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 -.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 -.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.def sha256_block_data_order_ssse3; .scl 3; .type 32; .endef +K512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 
0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.def sha512_block_data_order_xop; .scl 3; .type 32; .endef .p2align 6 -sha256_block_data_order_ssse3: +sha512_block_data_order_xop: movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%rax -.LSEH_begin_sha256_block_data_order_ssse3: +.LSEH_begin_sha512_block_data_order_xop: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx -.Lssse3_shortcut: + 
+.Lxop_shortcut: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - movq %rsp,%r11 + shlq $4,%rdx - subq $160,%rsp - leaq (%rsi,%rdx,4),%rdx + subq $256,%rsp + leaq (%rsi,%rdx,8),%rdx andq $-64,%rsp - movq %rdi,64+0(%rsp) - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) - movaps %xmm6,64+32(%rsp) - movaps %xmm7,64+48(%rsp) - movaps %xmm8,64+64(%rsp) - movaps %xmm9,64+80(%rsp) -.Lprologue_ssse3: - - movl 0(%rdi),%eax - movl 4(%rdi),%ebx - movl 8(%rdi),%ecx - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d - movdqa K256+512+32(%rip),%xmm8 - movdqa K256+512+64(%rip),%xmm9 - jmp .Lloop_ssse3 + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) + + movaps %xmm6,128+32(%rsp) + movaps %xmm7,128+48(%rsp) + movaps %xmm8,128+64(%rsp) + movaps %xmm9,128+80(%rsp) + movaps %xmm10,128+96(%rsp) + movaps %xmm11,128+112(%rsp) +.Lprologue_xop: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Lloop_xop .p2align 4 -.Lloop_ssse3: - movdqa K256+512(%rip),%xmm7 - movdqu 0(%rsi),%xmm0 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm3 -.byte 102,15,56,0,199 - leaq K256(%rip),%rbp -.byte 102,15,56,0,207 - movdqa 0(%rbp),%xmm4 -.byte 102,15,56,0,215 - movdqa 32(%rbp),%xmm5 - paddd %xmm0,%xmm4 - movdqa 64(%rbp),%xmm6 -.byte 102,15,56,0,223 - movdqa 96(%rbp),%xmm7 - paddd %xmm1,%xmm5 - paddd %xmm2,%xmm6 - paddd %xmm3,%xmm7 - movdqa %xmm4,0(%rsp) - movl %eax,%r14d - movdqa %xmm5,16(%rsp) - movl %ebx,%edi - movdqa %xmm6,32(%rsp) - xorl %ecx,%edi - movdqa %xmm7,48(%rsp) - movl %r8d,%r13d - jmp .Lssse3_00_47 +.Lloop_xop: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp .Lxop_00_47 .p2align 4 -.Lssse3_00_47: - subq $-32*4,%rbp - rorl $14,%r13d - movl %r14d,%eax - movdqa %xmm1,%xmm4 - movl %r9d,%r12d - movdqa %xmm3,%xmm7 - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r12d -.byte 102,15,58,15,224,4 - rorl $5,%r13d - xorl %eax,%r14d -.byte 102,15,58,15,250,4 - andl %r8d,%r12d - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - rorl $11,%r14d - xorl %r10d,%r12d - movdqa %xmm4,%xmm5 - xorl %ebx,%r15d - movdqa %xmm4,%xmm6 - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - paddd %xmm7,%xmm0 - addl %r11d,%edx - rorl $2,%r14d - addl %edi,%r11d - psrld $7,%xmm6 - movl %edx,%r13d - 
addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - pshufd $250,%xmm3,%xmm7 - movl %r8d,%r12d - pslld $14,%xmm5 - xorl %edx,%r13d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %r9d,%r12d - psrld $11,%xmm6 - rorl $5,%r13d - xorl %r11d,%r14d - pxor %xmm5,%xmm4 - andl %edx,%r12d - xorl %edx,%r13d - pslld $11,%xmm5 - addl 4(%rsp),%r10d - pxor %xmm6,%xmm4 - movl %r11d,%edi - rorl $11,%r14d - xorl %r9d,%r12d - movdqa %xmm7,%xmm6 - xorl %eax,%edi - rorl $6,%r13d - addl %r12d,%r10d - pxor %xmm5,%xmm4 - andl %edi,%r15d - xorl %r11d,%r14d - psrld $10,%xmm7 - addl %r13d,%r10d - xorl %eax,%r15d - paddd %xmm4,%xmm0 - addl %r10d,%ecx - rorl $2,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - psrlq $17,%xmm6 - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - pxor %xmm6,%xmm7 - xorl %ecx,%r13d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - pxor %xmm6,%xmm7 - andl %ecx,%r12d - xorl %ecx,%r13d - addl 8(%rsp),%r9d -.byte 102,65,15,56,0,248 - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - rorl $6,%r13d - paddd %xmm7,%xmm0 - addl %r12d,%r9d - pshufd $80,%xmm0,%xmm7 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - movdqa %xmm7,%xmm6 - addl %r9d,%ebx - rorl $2,%r14d - addl %edi,%r9d - psrld $10,%xmm7 - movl %ebx,%r13d - psrlq $17,%xmm6 - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - pxor %xmm6,%xmm7 - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - psrlq $2,%xmm6 - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - pxor %xmm6,%xmm7 - addl 12(%rsp),%r8d - movl %r9d,%edi - movdqa 0(%rbp),%xmm6 - rorl $11,%r14d - xorl %edx,%r12d -.byte 102,65,15,56,0,249 - xorl %r10d,%edi - rorl $6,%r13d - addl %r12d,%r8d - andl %edi,%r15d - xorl %r9d,%r14d - paddd %xmm7,%xmm0 - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - paddd %xmm0,%xmm6 - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - movdqa %xmm6,0(%rsp) - rorl $14,%r13d - movl %r14d,%r8d - movdqa %xmm2,%xmm4 - movl %ebx,%r12d - movdqa %xmm0,%xmm7 - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r12d -.byte 102,15,58,15,225,4 - rorl $5,%r13d - xorl %r8d,%r14d -.byte 102,15,58,15,251,4 - andl %eax,%r12d - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - rorl $11,%r14d - xorl %ecx,%r12d - movdqa %xmm4,%xmm5 - xorl %r9d,%r15d - movdqa %xmm4,%xmm6 - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%edi - psrld $3,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - paddd %xmm7,%xmm1 - addl %edx,%r11d - rorl $2,%r14d - addl %edi,%edx - psrld $7,%xmm6 - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - pshufd $250,%xmm0,%xmm7 - movl %eax,%r12d - pslld $14,%xmm5 - xorl %r11d,%r13d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %ebx,%r12d - psrld $11,%xmm6 - rorl $5,%r13d - xorl %edx,%r14d - pxor %xmm5,%xmm4 - andl %r11d,%r12d - xorl %r11d,%r13d - pslld $11,%xmm5 - addl 20(%rsp),%ecx - pxor %xmm6,%xmm4 - movl %edx,%edi - rorl $11,%r14d - xorl %ebx,%r12d - movdqa %xmm7,%xmm6 - xorl %r8d,%edi - rorl $6,%r13d - addl %r12d,%ecx - pxor %xmm5,%xmm4 - andl %edi,%r15d - xorl %edx,%r14d - psrld $10,%xmm7 - addl %r13d,%ecx - xorl %r8d,%r15d - paddd %xmm4,%xmm1 - addl %ecx,%r10d - rorl $2,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - psrlq $17,%xmm6 - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - pxor %xmm6,%xmm7 - xorl %r10d,%r13d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - pxor %xmm6,%xmm7 - andl %r10d,%r12d - 
xorl %r10d,%r13d - addl 24(%rsp),%ebx -.byte 102,65,15,56,0,248 - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - rorl $6,%r13d - paddd %xmm7,%xmm1 - addl %r12d,%ebx - pshufd $80,%xmm1,%xmm7 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - movdqa %xmm7,%xmm6 - addl %ebx,%r9d - rorl $2,%r14d - addl %edi,%ebx - psrld $10,%xmm7 - movl %r9d,%r13d - psrlq $17,%xmm6 - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - pxor %xmm6,%xmm7 - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - psrlq $2,%xmm6 - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - pxor %xmm6,%xmm7 - addl 28(%rsp),%eax - movl %ebx,%edi - movdqa 32(%rbp),%xmm6 - rorl $11,%r14d - xorl %r11d,%r12d -.byte 102,65,15,56,0,249 - xorl %ecx,%edi - rorl $6,%r13d - addl %r12d,%eax - andl %edi,%r15d - xorl %ebx,%r14d - paddd %xmm7,%xmm1 - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - paddd %xmm1,%xmm6 - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movdqa %xmm6,16(%rsp) - rorl $14,%r13d - movl %r14d,%eax - movdqa %xmm3,%xmm4 - movl %r9d,%r12d - movdqa %xmm1,%xmm7 - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r12d -.byte 102,15,58,15,226,4 - rorl $5,%r13d - xorl %eax,%r14d -.byte 102,15,58,15,248,4 - andl %r8d,%r12d - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - rorl $11,%r14d - xorl %r10d,%r12d - movdqa %xmm4,%xmm5 - xorl %ebx,%r15d - movdqa %xmm4,%xmm6 - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - paddd %xmm7,%xmm2 - addl %r11d,%edx - rorl $2,%r14d - addl %edi,%r11d - psrld $7,%xmm6 - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - pshufd $250,%xmm1,%xmm7 - movl %r8d,%r12d - pslld $14,%xmm5 - xorl %edx,%r13d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %r9d,%r12d - psrld $11,%xmm6 - rorl $5,%r13d - xorl %r11d,%r14d - pxor %xmm5,%xmm4 - andl %edx,%r12d - xorl %edx,%r13d - pslld $11,%xmm5 - addl 36(%rsp),%r10d - pxor %xmm6,%xmm4 - movl %r11d,%edi - rorl $11,%r14d - xorl %r9d,%r12d - movdqa %xmm7,%xmm6 - xorl %eax,%edi - rorl $6,%r13d - addl %r12d,%r10d - pxor %xmm5,%xmm4 - andl %edi,%r15d - xorl %r11d,%r14d - psrld $10,%xmm7 - addl %r13d,%r10d - xorl %eax,%r15d - paddd %xmm4,%xmm2 - addl %r10d,%ecx - rorl $2,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - psrlq $17,%xmm6 - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - pxor %xmm6,%xmm7 - xorl %ecx,%r13d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - pxor %xmm6,%xmm7 - andl %ecx,%r12d - xorl %ecx,%r13d - addl 40(%rsp),%r9d -.byte 102,65,15,56,0,248 - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - rorl $6,%r13d - paddd %xmm7,%xmm2 - addl %r12d,%r9d - pshufd $80,%xmm2,%xmm7 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - movdqa %xmm7,%xmm6 - addl %r9d,%ebx - rorl $2,%r14d - addl %edi,%r9d - psrld $10,%xmm7 - movl %ebx,%r13d - psrlq $17,%xmm6 - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - pxor %xmm6,%xmm7 - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - psrlq $2,%xmm6 - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - pxor %xmm6,%xmm7 - addl 44(%rsp),%r8d - movl %r9d,%edi - movdqa 64(%rbp),%xmm6 - rorl $11,%r14d - xorl %edx,%r12d -.byte 102,65,15,56,0,249 - xorl %r10d,%edi - rorl $6,%r13d - addl %r12d,%r8d - andl %edi,%r15d - xorl %r9d,%r14d - paddd %xmm7,%xmm2 - addl %r13d,%r8d - xorl 
%r10d,%r15d - addl %r8d,%eax - paddd %xmm2,%xmm6 - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - movdqa %xmm6,32(%rsp) - rorl $14,%r13d - movl %r14d,%r8d - movdqa %xmm0,%xmm4 - movl %ebx,%r12d - movdqa %xmm2,%xmm7 - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r12d -.byte 102,15,58,15,227,4 - rorl $5,%r13d - xorl %r8d,%r14d -.byte 102,15,58,15,249,4 - andl %eax,%r12d - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - rorl $11,%r14d - xorl %ecx,%r12d - movdqa %xmm4,%xmm5 - xorl %r9d,%r15d - movdqa %xmm4,%xmm6 - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%edi - psrld $3,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - paddd %xmm7,%xmm3 - addl %edx,%r11d - rorl $2,%r14d - addl %edi,%edx - psrld $7,%xmm6 - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - pshufd $250,%xmm2,%xmm7 - movl %eax,%r12d - pslld $14,%xmm5 - xorl %r11d,%r13d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %ebx,%r12d - psrld $11,%xmm6 - rorl $5,%r13d - xorl %edx,%r14d - pxor %xmm5,%xmm4 - andl %r11d,%r12d - xorl %r11d,%r13d - pslld $11,%xmm5 - addl 52(%rsp),%ecx - pxor %xmm6,%xmm4 - movl %edx,%edi - rorl $11,%r14d - xorl %ebx,%r12d - movdqa %xmm7,%xmm6 - xorl %r8d,%edi - rorl $6,%r13d - addl %r12d,%ecx - pxor %xmm5,%xmm4 - andl %edi,%r15d - xorl %edx,%r14d - psrld $10,%xmm7 - addl %r13d,%ecx - xorl %r8d,%r15d - paddd %xmm4,%xmm3 - addl %ecx,%r10d - rorl $2,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - psrlq $17,%xmm6 - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - pxor %xmm6,%xmm7 - xorl %r10d,%r13d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - pxor %xmm6,%xmm7 - andl %r10d,%r12d - xorl %r10d,%r13d - addl 56(%rsp),%ebx -.byte 102,65,15,56,0,248 - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - rorl $6,%r13d - paddd %xmm7,%xmm3 - addl %r12d,%ebx - pshufd $80,%xmm3,%xmm7 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - movdqa %xmm7,%xmm6 - addl %ebx,%r9d - rorl $2,%r14d - addl %edi,%ebx - psrld $10,%xmm7 - movl %r9d,%r13d - psrlq $17,%xmm6 - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - pxor %xmm6,%xmm7 - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - psrlq $2,%xmm6 - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - pxor %xmm6,%xmm7 - addl 60(%rsp),%eax - movl %ebx,%edi - movdqa 96(%rbp),%xmm6 - rorl $11,%r14d - xorl %r11d,%r12d -.byte 102,65,15,56,0,249 - xorl %ecx,%edi - rorl $6,%r13d - addl %r12d,%eax - andl %edi,%r15d - xorl %ebx,%r14d - paddd %xmm7,%xmm3 - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - paddd %xmm3,%xmm6 - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movdqa %xmm6,48(%rsp) - cmpb $0,131(%rbp) - jne .Lssse3_00_47 - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - rorl $11,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - addl %r11d,%edx - rorl $2,%r14d - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%r12d - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 4(%rsp),%r10d - movl %r11d,%edi - rorl $11,%r14d - xorl %r9d,%r12d - xorl %eax,%edi - rorl $6,%r13d - 
addl %r12d,%r10d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - rorl $2,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - rorl $6,%r13d - addl %r12d,%r9d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - addl %r9d,%ebx - rorl $2,%r14d - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%edi - rorl $11,%r14d - xorl %edx,%r12d - xorl %r10d,%edi - rorl $6,%r13d - addl %r12d,%r8d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - rorl $11,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - addl %edx,%r11d - rorl $2,%r14d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 20(%rsp),%ecx - movl %edx,%edi - rorl $11,%r14d - xorl %ebx,%r12d - xorl %r8d,%edi - rorl $6,%r13d - addl %r12d,%ecx - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - rorl $2,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - rorl $6,%r13d - addl %r12d,%ebx - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - addl %ebx,%r9d - rorl $2,%r14d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%edi - rorl $11,%r14d - xorl %r11d,%r12d - xorl %ecx,%edi - rorl $6,%r13d - addl %r12d,%eax - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - rorl $11,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - addl %r11d,%edx - rorl $2,%r14d - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%r12d - rorl $5,%r13d - xorl 
%r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 36(%rsp),%r10d - movl %r11d,%edi - rorl $11,%r14d - xorl %r9d,%r12d - xorl %eax,%edi - rorl $6,%r13d - addl %r12d,%r10d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - rorl $2,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - rorl $6,%r13d - addl %r12d,%r9d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - addl %r9d,%ebx - rorl $2,%r14d - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%edi - rorl $11,%r14d - xorl %edx,%r12d - xorl %r10d,%edi - rorl $6,%r13d - addl %r12d,%r8d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - rorl $11,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - addl %edx,%r11d - rorl $2,%r14d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 52(%rsp),%ecx - movl %edx,%edi - rorl $11,%r14d - xorl %ebx,%r12d - xorl %r8d,%edi - rorl $6,%r13d - addl %r12d,%ecx - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - rorl $2,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - rorl $6,%r13d - addl %r12d,%ebx - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - addl %ebx,%r9d - rorl $2,%r14d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%edi - rorl $11,%r14d - xorl %r11d,%r12d - xorl %ecx,%edi - rorl $6,%r13d - addl %r12d,%eax - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movq 64+0(%rsp),%rdi - movl %r14d,%eax - - addl 0(%rdi),%eax - leaq 64(%rsi),%rsi - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - cmpq 64+16(%rsp),%rsi - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - jb .Lloop_ssse3 - 
- movq 64+24(%rsp),%rsi - movaps 64+32(%rsp),%xmm6 - movaps 64+48(%rsp),%xmm7 - movaps 64+64(%rsp),%xmm8 - movaps 64+80(%rsp),%xmm9 - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp -.Lepilogue_ssse3: +.Lxop_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + rorq $23,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r8,%r13 + xorq %r10,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rax,%r14 + vpaddq %xmm11,%xmm0,%xmm0 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 +.byte 143,72,120,195,209,7 + xorq %r10,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,223,3 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm7,%xmm10 + addq %r11,%rdx + addq %rdi,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %rdx,%r13 + addq %r11,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r11 + vpxor %xmm10,%xmm11,%xmm11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + vpaddq %xmm11,%xmm0,%xmm0 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + rorq $23,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rcx,%r13 + xorq %r8,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r10,%r14 + vpaddq %xmm11,%xmm1,%xmm1 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 +.byte 143,72,120,195,209,7 + xorq %r8,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,216,3 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm0,%xmm10 + addq %r9,%rbx + addq %rdi,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rbx,%r13 + addq %r9,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r9 + vpxor %xmm10,%xmm11,%xmm11 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + vpaddq %xmm11,%xmm1,%xmm1 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + rorq $23,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rax,%r13 + xorq %rcx,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r8,%r14 + vpaddq %xmm11,%xmm2,%xmm2 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 +.byte 143,72,120,195,209,7 + xorq %rcx,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,217,3 + xorq %r8,%r14 + 
addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm1,%xmm10 + addq %rdx,%r11 + addq %rdi,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %r11,%r13 + addq %rdx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rdx + vpxor %xmm10,%xmm11,%xmm11 + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + vpaddq %xmm11,%xmm2,%xmm2 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + rorq $23,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r10,%r13 + xorq %rax,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rcx,%r14 + vpaddq %xmm11,%xmm3,%xmm3 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 +.byte 143,72,120,195,209,7 + xorq %rax,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,218,3 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm2,%xmm10 + addq %rbx,%r9 + addq %rdi,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r9,%r13 + addq %rbx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rbx + vpxor %xmm10,%xmm11,%xmm11 + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + vpaddq %xmm11,%xmm3,%xmm3 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + rorq $23,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r8,%r13 + xorq %r10,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rax,%r14 + vpaddq %xmm11,%xmm4,%xmm4 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 +.byte 143,72,120,195,209,7 + xorq %r10,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,219,3 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm3,%xmm10 + addq %r11,%rdx + addq %rdi,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %rdx,%r13 + addq %r11,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r11 + vpxor %xmm10,%xmm11,%xmm11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + vpaddq %xmm11,%xmm4,%xmm4 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + rorq $23,%r13 + movq 
%r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rcx,%r13 + xorq %r8,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r10,%r14 + vpaddq %xmm11,%xmm5,%xmm5 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 +.byte 143,72,120,195,209,7 + xorq %r8,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,220,3 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm4,%xmm10 + addq %r9,%rbx + addq %rdi,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rbx,%r13 + addq %r9,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r9 + vpxor %xmm10,%xmm11,%xmm11 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + vpaddq %xmm11,%xmm5,%xmm5 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + rorq $23,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rax,%r13 + xorq %rcx,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r8,%r14 + vpaddq %xmm11,%xmm6,%xmm6 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 +.byte 143,72,120,195,209,7 + xorq %rcx,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,221,3 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm5,%xmm10 + addq %rdx,%r11 + addq %rdi,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %r11,%r13 + addq %rdx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rdx + vpxor %xmm10,%xmm11,%xmm11 + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + vpaddq %xmm11,%xmm6,%xmm6 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + rorq $23,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r10,%r13 + xorq %rax,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rcx,%r14 + vpaddq %xmm11,%xmm7,%xmm7 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 +.byte 143,72,120,195,209,7 + xorq %rax,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,222,3 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm6,%xmm10 + addq %rbx,%r9 + addq %rdi,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r9,%r13 + addq %rbx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rbx + vpxor %xmm10,%xmm11,%xmm11 + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm9,%xmm11,%xmm11 
+ rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + vpaddq %xmm11,%xmm7,%xmm7 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne .Lxop_00_47 + rorq $23,%r13 + movq %r14,%rax + movq %r9,%r12 + rorq $5,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + rorq $4,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + rorq $6,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + rorq $28,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + rorq $23,%r13 + movq %r14,%r11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + rorq $23,%r13 + movq %r14,%r10 + movq %rdx,%r12 + rorq $5,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + rorq $4,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + rorq $6,%r14 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + rorq $28,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + rorq $23,%r13 + movq %r14,%r9 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + rorq $23,%r13 + movq %r14,%r8 + movq %rbx,%r12 + rorq $5,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + rorq $4,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + rorq $6,%r14 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + rorq $28,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + rorq $23,%r13 + movq %r14,%rdx + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + rorq $23,%r13 + movq %r14,%rcx + movq %r11,%r12 + rorq $5,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + rorq $4,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + rorq $6,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + rorq $28,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + rorq $23,%r13 + movq %r14,%rbx + movq %r10,%r12 + rorq $5,%r14 + xorq 
%r9,%r13 + xorq %r11,%r12 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + rorq $23,%r13 + movq %r14,%rax + movq %r9,%r12 + rorq $5,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + rorq $4,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + rorq $6,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + rorq $28,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + rorq $23,%r13 + movq %r14,%r11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + rorq $23,%r13 + movq %r14,%r10 + movq %rdx,%r12 + rorq $5,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + rorq $4,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + rorq $6,%r14 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + rorq $28,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + rorq $23,%r13 + movq %r14,%r9 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + rorq $23,%r13 + movq %r14,%r8 + movq %rbx,%r12 + rorq $5,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + rorq $4,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + rorq $6,%r14 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + rorq $28,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + rorq $23,%r13 + movq %r14,%rdx + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + rorq $23,%r13 + movq %r14,%rcx + movq %r11,%r12 + rorq $5,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + rorq $4,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + rorq $6,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + rorq $28,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + rorq $23,%r13 + movq %r14,%rbx + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 
120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb .Lloop_xop + + movq 152(%rsp),%rsi + + vzeroupper + movaps 128+32(%rsp),%xmm6 + movaps 128+48(%rsp),%xmm7 + movaps 128+64(%rsp),%xmm8 + movaps 128+80(%rsp),%xmm9 + movaps 128+96(%rsp),%xmm10 + movaps 128+112(%rsp),%xmm11 + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +.Lepilogue_xop: + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 + +.LSEH_end_sha512_block_data_order_xop: +.def sha512_block_data_order_avx; .scl 3; .type 32; .endef +.p2align 6 +sha512_block_data_order_avx: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_sha512_block_data_order_avx: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + +.Lavx_shortcut: + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $256,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) + + movaps %xmm6,128+32(%rsp) + movaps %xmm7,128+48(%rsp) + movaps %xmm8,128+64(%rsp) + movaps %xmm9,128+80(%rsp) + movaps %xmm10,128+96(%rsp) + movaps %xmm11,128+112(%rsp) +.Lprologue_avx: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Lloop_avx +.p2align 4 +.Lloop_avx: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp .Lavx_00_47 + +.p2align 4 +.Lavx_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm0,%xmm0 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + 
vpsllq $56,%xmm8,%xmm9 + addq 0(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm7,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm7,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm7,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 8(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm0,%xmm0 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm1,%xmm1 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 16(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm0,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm0,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm0,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 24(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm1,%xmm1 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm2,%xmm2 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 32(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor 
%xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm1,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm1,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm1,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 40(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm2,%xmm2 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm3,%xmm3 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 48(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm2,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm2,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm2,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 56(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm3,%xmm3 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm4,%xmm4 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 64(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm3,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm3,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq 
$19,%xmm3,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 72(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm4,%xmm4 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm5,%xmm5 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 80(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm4,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm4,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm4,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 88(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm5,%xmm5 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm6,%xmm6 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 96(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm5,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm5,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm5,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 104(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor 
%xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm6,%xmm6 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm7,%xmm7 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 112(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm6,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm6,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm6,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 120(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm7,%xmm7 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne .Lavx_00_47 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + 
andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq 
%rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb .Lloop_avx + + movq 152(%rsp),%rsi + + vzeroupper + movaps 128+32(%rsp),%xmm6 + movaps 128+48(%rsp),%xmm7 + movaps 128+64(%rsp),%xmm8 + movaps 128+80(%rsp),%xmm9 + movaps 128+96(%rsp),%xmm10 + movaps 128+112(%rsp),%xmm11 + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +.Lepilogue_avx: + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 + +.LSEH_end_sha512_block_data_order_avx: +.def sha512_block_data_order_avx2; .scl 3; .type 32; .endef +.p2align 6 +sha512_block_data_order_avx2: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_sha512_block_data_order_avx2: + movq %rcx,%rdi + movq %rdx,%rsi 
+ movq %r8,%rdx + + +.Lavx2_shortcut: + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $1408,%rsp + shlq $4,%rdx + andq $-2048,%rsp + leaq (%rsi,%rdx,8),%rdx + addq $1152,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) + + movaps %xmm6,128+32(%rsp) + movaps %xmm7,128+48(%rsp) + movaps %xmm8,128+64(%rsp) + movaps %xmm9,128+80(%rsp) + movaps %xmm10,128+96(%rsp) + movaps %xmm11,128+112(%rsp) +.Lprologue_avx2: + + vzeroupper + subq $-128,%rsi + movq 0(%rdi),%rax + movq %rsi,%r12 + movq 8(%rdi),%rbx + cmpq %rdx,%rsi + movq 16(%rdi),%rcx + cmoveq %rsp,%r12 + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Loop_avx2 +.p2align 4 +.Loop_avx2: + vmovdqu -128(%rsi),%xmm0 + vmovdqu -128+16(%rsi),%xmm1 + vmovdqu -128+32(%rsi),%xmm2 + leaq K512+128(%rip),%rbp + vmovdqu -128+48(%rsi),%xmm3 + vmovdqu -128+64(%rsi),%xmm4 + vmovdqu -128+80(%rsi),%xmm5 + vmovdqu -128+96(%rsi),%xmm6 + vmovdqu -128+112(%rsi),%xmm7 + + vmovdqa 1152(%rbp),%ymm10 + vinserti128 $1,(%r12),%ymm0,%ymm0 + vinserti128 $1,16(%r12),%ymm1,%ymm1 + vpshufb %ymm10,%ymm0,%ymm0 + vinserti128 $1,32(%r12),%ymm2,%ymm2 + vpshufb %ymm10,%ymm1,%ymm1 + vinserti128 $1,48(%r12),%ymm3,%ymm3 + vpshufb %ymm10,%ymm2,%ymm2 + vinserti128 $1,64(%r12),%ymm4,%ymm4 + vpshufb %ymm10,%ymm3,%ymm3 + vinserti128 $1,80(%r12),%ymm5,%ymm5 + vpshufb %ymm10,%ymm4,%ymm4 + vinserti128 $1,96(%r12),%ymm6,%ymm6 + vpshufb %ymm10,%ymm5,%ymm5 + vinserti128 $1,112(%r12),%ymm7,%ymm7 + + vpaddq -128(%rbp),%ymm0,%ymm8 + vpshufb %ymm10,%ymm6,%ymm6 + vpaddq -96(%rbp),%ymm1,%ymm9 + vpshufb %ymm10,%ymm7,%ymm7 + vpaddq -64(%rbp),%ymm2,%ymm10 + vpaddq -32(%rbp),%ymm3,%ymm11 + vmovdqa %ymm8,0(%rsp) + vpaddq 0(%rbp),%ymm4,%ymm8 + vmovdqa %ymm9,32(%rsp) + vpaddq 32(%rbp),%ymm5,%ymm9 + vmovdqa %ymm10,64(%rsp) + vpaddq 64(%rbp),%ymm6,%ymm10 + vmovdqa %ymm11,96(%rsp) + leaq -128(%rsp),%rsp + vpaddq 96(%rbp),%ymm7,%ymm11 + vmovdqa %ymm8,0(%rsp) + xorq %r14,%r14 + vmovdqa %ymm9,32(%rsp) + movq %rbx,%rdi + vmovdqa %ymm10,64(%rsp) + xorq %rcx,%rdi + vmovdqa %ymm11,96(%rsp) + movq %r9,%r12 + addq $32*8,%rbp + jmp .Lavx2_00_47 + +.p2align 4 +.Lavx2_00_47: + leaq -128(%rsp),%rsp + vpalignr $8,%ymm0,%ymm1,%ymm8 + addq 0+256(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + vpalignr $8,%ymm4,%ymm5,%ymm11 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + vpsrlq $1,%ymm8,%ymm10 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + vpaddq %ymm11,%ymm0,%ymm0 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + vpsrlq $6,%ymm7,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + vpsllq $3,%ymm7,%ymm10 + vpaddq %ymm8,%ymm0,%ymm0 + addq 8+256(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + vpsrlq $19,%ymm7,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq 
(%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + vpaddq %ymm11,%ymm0,%ymm0 + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + vpaddq -128(%rbp),%ymm0,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + vmovdqa %ymm10,0(%rsp) + vpalignr $8,%ymm1,%ymm2,%ymm8 + addq 32+256(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + vpalignr $8,%ymm5,%ymm6,%ymm11 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + vpsrlq $1,%ymm8,%ymm10 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + vpaddq %ymm11,%ymm1,%ymm1 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + vpsrlq $6,%ymm0,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + vpsllq $3,%ymm0,%ymm10 + vpaddq %ymm8,%ymm1,%ymm1 + addq 40+256(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + vpsrlq $19,%ymm0,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + vpaddq %ymm11,%ymm1,%ymm1 + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + vpaddq -96(%rbp),%ymm1,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + vmovdqa %ymm10,32(%rsp) + vpalignr $8,%ymm2,%ymm3,%ymm8 + addq 64+256(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + vpalignr $8,%ymm6,%ymm7,%ymm11 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + vpsrlq $1,%ymm8,%ymm10 + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + vpaddq %ymm11,%ymm2,%ymm2 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + vpsrlq $6,%ymm1,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + vpsllq $3,%ymm1,%ymm10 + vpaddq %ymm8,%ymm2,%ymm2 + addq 72+256(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + vpsrlq $19,%ymm1,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + vpaddq %ymm11,%ymm2,%ymm2 + rorxq $34,%rdx,%r14 + rorxq 
$28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + vpaddq -64(%rbp),%ymm2,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + vmovdqa %ymm10,64(%rsp) + vpalignr $8,%ymm3,%ymm4,%ymm8 + addq 96+256(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + vpalignr $8,%ymm7,%ymm0,%ymm11 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + vpsrlq $1,%ymm8,%ymm10 + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + vpaddq %ymm11,%ymm3,%ymm3 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + vpsrlq $6,%ymm2,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + vpsllq $3,%ymm2,%ymm10 + vpaddq %ymm8,%ymm3,%ymm3 + addq 104+256(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + vpsrlq $19,%ymm2,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + vpaddq %ymm11,%ymm3,%ymm3 + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + vpaddq -32(%rbp),%ymm3,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + vmovdqa %ymm10,96(%rsp) + leaq -128(%rsp),%rsp + vpalignr $8,%ymm4,%ymm5,%ymm8 + addq 0+256(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + vpalignr $8,%ymm0,%ymm1,%ymm11 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + vpsrlq $1,%ymm8,%ymm10 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + vpaddq %ymm11,%ymm4,%ymm4 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + vpsrlq $6,%ymm3,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + vpsllq $3,%ymm3,%ymm10 + vpaddq %ymm8,%ymm4,%ymm4 + addq 8+256(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + vpsrlq $19,%ymm3,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + vpaddq %ymm11,%ymm4,%ymm4 + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + vpaddq 0(%rbp),%ymm4,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + 
movq %rdx,%r12 + vmovdqa %ymm10,0(%rsp) + vpalignr $8,%ymm5,%ymm6,%ymm8 + addq 32+256(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + vpalignr $8,%ymm1,%ymm2,%ymm11 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + vpsrlq $1,%ymm8,%ymm10 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + vpaddq %ymm11,%ymm5,%ymm5 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + vpsrlq $6,%ymm4,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + vpsllq $3,%ymm4,%ymm10 + vpaddq %ymm8,%ymm5,%ymm5 + addq 40+256(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + vpsrlq $19,%ymm4,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + vpaddq %ymm11,%ymm5,%ymm5 + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + vpaddq 32(%rbp),%ymm5,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + vmovdqa %ymm10,32(%rsp) + vpalignr $8,%ymm6,%ymm7,%ymm8 + addq 64+256(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + vpalignr $8,%ymm2,%ymm3,%ymm11 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + vpsrlq $1,%ymm8,%ymm10 + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + vpaddq %ymm11,%ymm6,%ymm6 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + vpsrlq $6,%ymm5,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + vpsllq $3,%ymm5,%ymm10 + vpaddq %ymm8,%ymm6,%ymm6 + addq 72+256(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + vpsrlq $19,%ymm5,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + vpaddq %ymm11,%ymm6,%ymm6 + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + vpaddq 64(%rbp),%ymm6,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + vmovdqa %ymm10,64(%rsp) + vpalignr $8,%ymm7,%ymm0,%ymm8 + addq 96+256(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + vpalignr $8,%ymm3,%ymm4,%ymm11 + rorxq $18,%r10,%r15 + leaq 
(%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + vpsrlq $1,%ymm8,%ymm10 + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + vpaddq %ymm11,%ymm7,%ymm7 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + vpsrlq $6,%ymm6,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + vpsllq $3,%ymm6,%ymm10 + vpaddq %ymm8,%ymm7,%ymm7 + addq 104+256(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + vpsrlq $19,%ymm6,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + vpaddq %ymm11,%ymm7,%ymm7 + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + vpaddq 96(%rbp),%ymm7,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + vmovdqa %ymm10,96(%rsp) + leaq 256(%rbp),%rbp + cmpb $0,-121(%rbp) + jne .Lavx2_00_47 + addq 0+128(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + addq 8+128(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + addq 32+128(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + addq 40+128(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq 
(%r8,%r15,1),%r8 + movq %rbx,%r12 + addq 64+128(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + addq 72+128(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + addq 96+128(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + addq 104+128(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + addq 0(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + addq 8(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + addq 32(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + addq 40(%rsp),%r8 + andq 
%rbx,%r12 + rorxq $41,%rbx,%r13 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + addq 64(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + addq 72(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + addq 96(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + addq 104(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + movq 1280(%rsp),%rdi + addq %r14,%rax + + leaq 1152(%rsp),%rbp + + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + + cmpq 144(%rbp),%rsi + je .Ldone_avx2 + + xorq %r14,%r14 + movq %rbx,%rdi + xorq %rcx,%rdi + movq %r9,%r12 + jmp .Lower_avx2 +.p2align 4 +.Lower_avx2: + addq 0+16(%rbp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + addq 8+16(%rbp),%r10 + andq %rdx,%r12 + rorxq 
$41,%rdx,%r13 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + addq 32+16(%rbp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + addq 40+16(%rbp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + addq 64+16(%rbp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + addq 72+16(%rbp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + addq 96+16(%rbp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + addq 104+16(%rbp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + leaq -128(%rbp),%rbp + cmpq %rsp,%rbp + jae .Lower_avx2 + + movq 1280(%rsp),%rdi + addq %r14,%rax + + leaq 
1152(%rsp),%rsp + + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + leaq 256(%rsi),%rsi + addq 48(%rdi),%r10 + movq %rsi,%r12 + addq 56(%rdi),%r11 + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + cmoveq %rsp,%r12 + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + + jbe .Loop_avx2 + leaq (%rsp),%rbp + +.Ldone_avx2: + leaq (%rbp),%rsp + movq 152(%rsp),%rsi + + vzeroupper + movaps 128+32(%rsp),%xmm6 + movaps 128+48(%rsp),%xmm7 + movaps 128+64(%rsp),%xmm8 + movaps 128+80(%rsp),%xmm9 + movaps 128+96(%rsp),%xmm10 + movaps 128+112(%rsp),%xmm11 + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +.Lepilogue_avx2: movq 8(%rsp),%rdi movq 16(%rsp),%rsi .byte 0xf3,0xc3 -.LSEH_end_sha256_block_data_order_ssse3: + +.LSEH_end_sha512_block_data_order_avx2: .def se_handler; .scl 3; .type 32; .endef .p2align 4 @@ -2946,9 +5582,15 @@ se_handler: leaq (%rsi,%r10,1),%r10 cmpq %r10,%rbx jae .Lin_prologue + leaq .Lavx2_shortcut(%rip),%r10 + cmpq %r10,%rbx + jb .Lnot_in_avx2 + + andq $-2048,%rax + addq $1152,%rax +.Lnot_in_avx2: movq %rax,%rsi - movq 64+24(%rax),%rax - leaq 48(%rax),%rax + movq 128+24(%rax),%rax movq -8(%rax),%rbx movq -16(%rax),%rbp @@ -2965,12 +5607,12 @@ se_handler: leaq .Lepilogue(%rip),%r10 cmpq %r10,%rbx - jb .Lin_prologue + jb .Lin_prologue - leaq 64+32(%rsi),%rsi + leaq 128+32(%rsi),%rsi leaq 512(%r8),%rdi - movl $8,%ecx -.long 0xa548f3fc + movl $12,%ecx +.long 0xa548f3fc .Lin_prologue: movq 8(%rax),%rdi @@ -2982,7 +5624,7 @@ se_handler: movq 40(%r9),%rdi movq %r8,%rsi movl $154,%ecx -.long 0xa548f3fc +.long 0xa548f3fc movq %r9,%rsi xorq %rcx,%rcx @@ -3011,23 +5653,36 @@ se_handler: popq %rsi .byte 0xf3,0xc3 - .section .pdata .p2align 2 -.rva .LSEH_begin_sha256_block_data_order -.rva .LSEH_end_sha256_block_data_order -.rva .LSEH_info_sha256_block_data_order -.rva .LSEH_begin_sha256_block_data_order_ssse3 -.rva .LSEH_end_sha256_block_data_order_ssse3 -.rva .LSEH_info_sha256_block_data_order_ssse3 +.rva .LSEH_begin_sha512_block_data_order +.rva .LSEH_end_sha512_block_data_order +.rva .LSEH_info_sha512_block_data_order +.rva .LSEH_begin_sha512_block_data_order_xop +.rva .LSEH_end_sha512_block_data_order_xop +.rva .LSEH_info_sha512_block_data_order_xop +.rva .LSEH_begin_sha512_block_data_order_avx +.rva .LSEH_end_sha512_block_data_order_avx +.rva .LSEH_info_sha512_block_data_order_avx +.rva .LSEH_begin_sha512_block_data_order_avx2 +.rva .LSEH_end_sha512_block_data_order_avx2 +.rva .LSEH_info_sha512_block_data_order_avx2 .section .xdata .p2align 3 -.LSEH_info_sha256_block_data_order: +.LSEH_info_sha512_block_data_order: +.byte 9,0,0,0 +.rva se_handler +.rva .Lprologue,.Lepilogue +.LSEH_info_sha512_block_data_order_xop: +.byte 9,0,0,0 +.rva se_handler +.rva .Lprologue_xop,.Lepilogue_xop +.LSEH_info_sha512_block_data_order_avx: .byte 9,0,0,0 .rva se_handler -.rva .Lprologue,.Lepilogue -.LSEH_info_sha256_block_data_order_ssse3: +.rva .Lprologue_avx,.Lepilogue_avx +.LSEH_info_sha512_block_data_order_avx2: .byte 9,0,0,0 .rva se_handler -.rva .Lprologue_ssse3,.Lepilogue_ssse3 +.rva .Lprologue_avx2,.Lepilogue_avx2 diff --git a/lib/accelerated/x86/elf/aes-ssse3-x86.s b/lib/accelerated/x86/elf/aes-ssse3-x86.s index 2b677952d1..265e28a7ef 100644 --- a/lib/accelerated/x86/elf/aes-ssse3-x86.s +++ 
b/lib/accelerated/x86/elf/aes-ssse3-x86.s @@ -5,12 +5,11 @@ ## By Mike Hamburg (Stanford University), 2009 ## Public domain. ## -## For details see https://shiftleft.org/papers/vector_aes/ and -## https://crypto.stanford.edu/vpaes/. +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. # # *** This file is auto-generated *** # -.file "vpaes-x86.s" .text .align 64 .L_vpaes_consts: diff --git a/lib/accelerated/x86/elf/aes-ssse3-x86_64.s b/lib/accelerated/x86/elf/aes-ssse3-x86_64.s index d086050e37..ea1216baf7 100644 --- a/lib/accelerated/x86/elf/aes-ssse3-x86_64.s +++ b/lib/accelerated/x86/elf/aes-ssse3-x86_64.s @@ -5,8 +5,8 @@ ## By Mike Hamburg (Stanford University), 2009 ## Public domain. ## -## For details see https://shiftleft.org/papers/vector_aes/ and -## https://crypto.stanford.edu/vpaes/. +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. # # *** This file is auto-generated *** # @@ -30,6 +30,7 @@ .type _vpaes_encrypt_core,@function .align 16 _vpaes_encrypt_core: +.cfi_startproc movq %rdx,%r9 movq $16,%r11 movl 240(%rdx),%eax @@ -110,6 +111,7 @@ _vpaes_encrypt_core: pxor %xmm4,%xmm0 .byte 102,15,56,0,193 .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_encrypt_core,.-_vpaes_encrypt_core @@ -120,6 +122,7 @@ _vpaes_encrypt_core: .type _vpaes_decrypt_core,@function .align 16 _vpaes_decrypt_core: +.cfi_startproc movq %rdx,%r9 movl 240(%rdx),%eax movdqa %xmm9,%xmm1 @@ -216,6 +219,7 @@ _vpaes_decrypt_core: pxor %xmm4,%xmm0 .byte 102,15,56,0,194 .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_decrypt_core,.-_vpaes_decrypt_core @@ -226,6 +230,7 @@ _vpaes_decrypt_core: .type _vpaes_schedule_core,@function .align 16 _vpaes_schedule_core: +.cfi_startproc @@ -392,6 +397,7 @@ _vpaes_schedule_core: pxor %xmm6,%xmm6 pxor %xmm7,%xmm7 .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_schedule_core,.-_vpaes_schedule_core @@ -411,6 +417,7 @@ _vpaes_schedule_core: .type _vpaes_schedule_192_smear,@function .align 16 _vpaes_schedule_192_smear: +.cfi_startproc pshufd $0x80,%xmm6,%xmm1 pshufd $0xFE,%xmm7,%xmm0 pxor %xmm1,%xmm6 @@ -419,6 +426,7 @@ _vpaes_schedule_192_smear: movdqa %xmm6,%xmm0 movhlps %xmm1,%xmm6 .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear @@ -442,6 +450,7 @@ _vpaes_schedule_192_smear: .type _vpaes_schedule_round,@function .align 16 _vpaes_schedule_round: +.cfi_startproc pxor %xmm1,%xmm1 .byte 102,65,15,58,15,200,15 @@ -495,6 +504,7 @@ _vpaes_schedule_low_round: pxor %xmm7,%xmm0 movdqa %xmm0,%xmm7 .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_schedule_round,.-_vpaes_schedule_round @@ -509,6 +519,7 @@ _vpaes_schedule_low_round: .type _vpaes_schedule_transform,@function .align 16 _vpaes_schedule_transform: +.cfi_startproc movdqa %xmm9,%xmm1 pandn %xmm0,%xmm1 psrld $4,%xmm1 @@ -519,6 +530,7 @@ _vpaes_schedule_transform: .byte 102,15,56,0,193 pxor %xmm2,%xmm0 .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_schedule_transform,.-_vpaes_schedule_transform @@ -547,6 +559,7 @@ _vpaes_schedule_transform: .type _vpaes_schedule_mangle,@function .align 16 _vpaes_schedule_mangle: +.cfi_startproc movdqa %xmm0,%xmm4 movdqa .Lk_mc_forward(%rip),%xmm5 testq %rcx,%rcx @@ -611,6 +624,7 @@ _vpaes_schedule_mangle: andq $0x30,%r8 movdqu %xmm3,(%rdx) .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle @@ -620,6 +634,7 @@ _vpaes_schedule_mangle: .type vpaes_set_encrypt_key,@function .align 16 vpaes_set_encrypt_key: +.cfi_startproc movl %esi,%eax shrl $5,%eax addl 
$5,%eax @@ -630,12 +645,14 @@ vpaes_set_encrypt_key: call _vpaes_schedule_core xorl %eax,%eax .byte 0xf3,0xc3 +.cfi_endproc .size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key .globl vpaes_set_decrypt_key .type vpaes_set_decrypt_key,@function .align 16 vpaes_set_decrypt_key: +.cfi_startproc movl %esi,%eax shrl $5,%eax addl $5,%eax @@ -651,33 +668,39 @@ vpaes_set_decrypt_key: call _vpaes_schedule_core xorl %eax,%eax .byte 0xf3,0xc3 +.cfi_endproc .size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key .globl vpaes_encrypt .type vpaes_encrypt,@function .align 16 vpaes_encrypt: +.cfi_startproc movdqu (%rdi),%xmm0 call _vpaes_preheat call _vpaes_encrypt_core movdqu %xmm0,(%rsi) .byte 0xf3,0xc3 +.cfi_endproc .size vpaes_encrypt,.-vpaes_encrypt .globl vpaes_decrypt .type vpaes_decrypt,@function .align 16 vpaes_decrypt: +.cfi_startproc movdqu (%rdi),%xmm0 call _vpaes_preheat call _vpaes_decrypt_core movdqu %xmm0,(%rsi) .byte 0xf3,0xc3 +.cfi_endproc .size vpaes_decrypt,.-vpaes_decrypt .globl vpaes_cbc_encrypt .type vpaes_cbc_encrypt,@function .align 16 vpaes_cbc_encrypt: +.cfi_startproc xchgq %rcx,%rdx subq $16,%rcx jc .Lcbc_abort @@ -713,6 +736,7 @@ vpaes_cbc_encrypt: movdqu %xmm6,(%r8) .Lcbc_abort: .byte 0xf3,0xc3 +.cfi_endproc .size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt @@ -723,6 +747,7 @@ vpaes_cbc_encrypt: .type _vpaes_preheat,@function .align 16 _vpaes_preheat: +.cfi_startproc leaq .Lk_s0F(%rip),%r10 movdqa -32(%r10),%xmm10 movdqa -16(%r10),%xmm11 @@ -732,6 +757,7 @@ _vpaes_preheat: movdqa 80(%r10),%xmm15 movdqa 96(%r10),%xmm14 .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_preheat,.-_vpaes_preheat diff --git a/lib/accelerated/x86/elf/aesni-gcm-x86_64.s b/lib/accelerated/x86/elf/aesni-gcm-x86_64.s index 07f177d8d4..e26d18d69f 100644 --- a/lib/accelerated/x86/elf/aesni-gcm-x86_64.s +++ b/lib/accelerated/x86/elf/aesni-gcm-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -354,17 +354,25 @@ _aesni_ctr32_ghash_6x: .type aesni_gcm_decrypt,@function .align 32 aesni_gcm_decrypt: +.cfi_startproc xorq %r10,%r10 cmpq $0x60,%rdx jb .Lgcm_dec_abort leaq (%rsp),%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 vzeroupper vmovdqu (%r8),%xmm1 @@ -426,15 +434,23 @@ aesni_gcm_decrypt: vzeroupper movq -48(%rax),%r15 +.cfi_restore %r15 movq -40(%rax),%r14 +.cfi_restore %r14 movq -32(%rax),%r13 +.cfi_restore %r13 movq -24(%rax),%r12 +.cfi_restore %r12 movq -16(%rax),%rbp +.cfi_restore %rbp movq -8(%rax),%rbx +.cfi_restore %rbx leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lgcm_dec_abort: movq %r10,%rax .byte 0xf3,0xc3 +.cfi_endproc .size aesni_gcm_decrypt,.-aesni_gcm_decrypt .type _aesni_ctr32_6x,@function .align 32 @@ -531,17 +547,25 @@ _aesni_ctr32_6x: .type aesni_gcm_encrypt,@function .align 32 aesni_gcm_encrypt: +.cfi_startproc xorq %r10,%r10 cmpq $288,%rdx jb .Lgcm_enc_abort leaq (%rsp),%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 vzeroupper vmovdqu (%r8),%xmm1 @@ -767,15 +791,23 @@ aesni_gcm_encrypt: vzeroupper movq -48(%rax),%r15 +.cfi_restore %r15 movq -40(%rax),%r14 +.cfi_restore %r14 movq -32(%rax),%r13 +.cfi_restore %r13 movq -24(%rax),%r12 +.cfi_restore %r12 movq -16(%rax),%rbp +.cfi_restore %rbp movq -8(%rax),%rbx +.cfi_restore %rbx leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lgcm_enc_abort: movq %r10,%rax .byte 0xf3,0xc3 +.cfi_endproc .size aesni_gcm_encrypt,.-aesni_gcm_encrypt .align 64 .Lbswap_mask: diff --git a/lib/accelerated/x86/elf/aesni-x86.s b/lib/accelerated/x86/elf/aesni-x86.s index 5d70f2568f..aaf0bab635 100644 --- a/lib/accelerated/x86/elf/aesni-x86.s +++ b/lib/accelerated/x86/elf/aesni-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -37,7 +37,6 @@ # # *** This file is auto-generated *** # -.file "devel/perlasm/aesni-x86.s" .text .globl aesni_encrypt .type aesni_encrypt,@function @@ -60,7 +59,10 @@ aesni_encrypt: leal 16(%edx),%edx jnz .L000enc1_loop_1 .byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%eax) + pxor %xmm2,%xmm2 ret .size aesni_encrypt,.-.L_aesni_encrypt_begin .globl aesni_decrypt @@ -84,32 +86,90 @@ aesni_decrypt: leal 16(%edx),%edx jnz .L001dec1_loop_2 .byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%eax) + pxor %xmm2,%xmm2 ret .size aesni_decrypt,.-.L_aesni_decrypt_begin +.type _aesni_encrypt2,@function +.align 16 +_aesni_encrypt2: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L002enc2_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L002enc2_loop +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + ret +.size _aesni_encrypt2,.-_aesni_encrypt2 +.type _aesni_decrypt2,@function +.align 16 +_aesni_decrypt2: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L003dec2_loop: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L003dec2_loop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,223,208 +.byte 102,15,56,223,216 + ret +.size _aesni_decrypt2,.-_aesni_decrypt2 .type _aesni_encrypt3,@function .align 16 _aesni_encrypt3: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 - movups (%edx),%xmm0 -.L002enc3_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L004enc3_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %ecx .byte 102,15,56,220,225 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leal 32(%edx),%edx .byte 102,15,56,220,224 - movups (%edx),%xmm0 - jnz .L002enc3_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L004enc3_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -122,25 +182,26 @@ _aesni_encrypt3: .align 16 _aesni_decrypt3: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 - movups (%edx),%xmm0 -.L003dec3_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L005dec3_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %ecx .byte 102,15,56,222,225 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leal 32(%edx),%edx .byte 102,15,56,222,224 - movups (%edx),%xmm0 - jnz .L003dec3_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L005dec3_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -154,27 +215,29 @@ _aesni_decrypt3: _aesni_encrypt4: movups (%edx),%xmm0 movups 16(%edx),%xmm1 - shrl $1,%ecx - leal 32(%edx),%edx + 
shll $4,%ecx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 pxor %xmm0,%xmm5 - movups (%edx),%xmm0 -.L004enc4_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 15,31,64,0 + addl $16,%ecx +.L006enc4_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %ecx .byte 102,15,56,220,225 .byte 102,15,56,220,233 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leal 32(%edx),%edx .byte 102,15,56,220,224 .byte 102,15,56,220,232 - movups (%edx),%xmm0 - jnz .L004enc4_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L006enc4_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -190,27 +253,29 @@ _aesni_encrypt4: _aesni_decrypt4: movups (%edx),%xmm0 movups 16(%edx),%xmm1 - shrl $1,%ecx - leal 32(%edx),%edx + shll $4,%ecx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 pxor %xmm0,%xmm5 - movups (%edx),%xmm0 -.L005dec4_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 15,31,64,0 + addl $16,%ecx +.L007dec4_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %ecx .byte 102,15,56,222,225 .byte 102,15,56,222,233 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leal 32(%edx),%edx .byte 102,15,56,222,224 .byte 102,15,56,222,232 - movups (%edx),%xmm0 - jnz .L005dec4_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L007dec4_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -225,45 +290,42 @@ _aesni_decrypt4: .align 16 _aesni_encrypt6: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 -.byte 102,15,56,220,209 pxor %xmm0,%xmm4 -.byte 102,15,56,220,217 +.byte 102,15,56,220,209 pxor %xmm0,%xmm5 - decl %ecx -.byte 102,15,56,220,225 pxor %xmm0,%xmm6 -.byte 102,15,56,220,233 +.byte 102,15,56,220,217 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 102,15,56,220,225 pxor %xmm0,%xmm7 -.byte 102,15,56,220,241 - movups (%edx),%xmm0 -.byte 102,15,56,220,249 - jmp .L_aesni_encrypt6_enter + movups (%edx,%ecx,1),%xmm0 + addl $16,%ecx + jmp .L008_aesni_encrypt6_inner .align 16 -.L006enc6_loop: +.L009enc6_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %ecx .byte 102,15,56,220,225 +.L008_aesni_encrypt6_inner: .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -.align 16 .L_aesni_encrypt6_enter: - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leal 32(%edx),%edx .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups (%edx),%xmm0 - jnz .L006enc6_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L009enc6_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -282,45 +344,42 @@ _aesni_encrypt6: .align 16 _aesni_decrypt6: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 -.byte 102,15,56,222,209 pxor %xmm0,%xmm4 -.byte 102,15,56,222,217 +.byte 102,15,56,222,209 pxor %xmm0,%xmm5 - decl %ecx -.byte 102,15,56,222,225 pxor %xmm0,%xmm6 -.byte 102,15,56,222,233 +.byte 102,15,56,222,217 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 102,15,56,222,225 pxor %xmm0,%xmm7 -.byte 102,15,56,222,241 - movups (%edx),%xmm0 -.byte 102,15,56,222,249 - jmp .L_aesni_decrypt6_enter + movups (%edx,%ecx,1),%xmm0 + addl $16,%ecx + jmp .L010_aesni_decrypt6_inner .align 16 
-.L007dec6_loop: +.L011dec6_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %ecx .byte 102,15,56,222,225 +.L010_aesni_decrypt6_inner: .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 -.align 16 .L_aesni_decrypt6_enter: - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leal 32(%edx),%edx .byte 102,15,56,222,224 .byte 102,15,56,222,232 .byte 102,15,56,222,240 .byte 102,15,56,222,248 - movups (%edx),%xmm0 - jnz .L007dec6_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L011dec6_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -350,14 +409,14 @@ aesni_ecb_encrypt: movl 32(%esp),%edx movl 36(%esp),%ebx andl $-16,%eax - jz .L008ecb_ret + jz .L012ecb_ret movl 240(%edx),%ecx testl %ebx,%ebx - jz .L009ecb_decrypt + jz .L013ecb_decrypt movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb .L010ecb_enc_tail + jb .L014ecb_enc_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -366,9 +425,9 @@ aesni_ecb_encrypt: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp .L011ecb_enc_loop6_enter + jmp .L015ecb_enc_loop6_enter .align 16 -.L012ecb_enc_loop6: +.L016ecb_enc_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -383,12 +442,12 @@ aesni_ecb_encrypt: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -.L011ecb_enc_loop6_enter: +.L015ecb_enc_loop6_enter: call _aesni_encrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc .L012ecb_enc_loop6 + jnc .L016ecb_enc_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -397,18 +456,18 @@ aesni_ecb_encrypt: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz .L008ecb_ret -.L010ecb_enc_tail: + jz .L012ecb_ret +.L014ecb_enc_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb .L013ecb_enc_one + jb .L017ecb_enc_one movups 16(%esi),%xmm3 - je .L014ecb_enc_two + je .L018ecb_enc_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb .L015ecb_enc_three + jb .L019ecb_enc_three movups 48(%esi),%xmm5 - je .L016ecb_enc_four + je .L020ecb_enc_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call _aesni_encrypt6 @@ -417,50 +476,49 @@ aesni_ecb_encrypt: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L008ecb_ret + jmp .L012ecb_ret .align 16 -.L013ecb_enc_one: +.L017ecb_enc_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L017enc1_loop_3: +.L021enc1_loop_3: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L017enc1_loop_3 + jnz .L021enc1_loop_3 .byte 102,15,56,221,209 movups %xmm2,(%edi) - jmp .L008ecb_ret + jmp .L012ecb_ret .align 16 -.L014ecb_enc_two: - xorps %xmm4,%xmm4 - call _aesni_encrypt3 +.L018ecb_enc_two: + call _aesni_encrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L008ecb_ret + jmp .L012ecb_ret .align 16 -.L015ecb_enc_three: +.L019ecb_enc_three: call _aesni_encrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L008ecb_ret + jmp .L012ecb_ret .align 16 -.L016ecb_enc_four: +.L020ecb_enc_four: call _aesni_encrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) - jmp .L008ecb_ret + jmp .L012ecb_ret .align 16 -.L009ecb_decrypt: +.L013ecb_decrypt: movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb .L018ecb_dec_tail + jb .L022ecb_dec_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -469,9 +527,9 @@ aesni_ecb_encrypt: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp 
.L019ecb_dec_loop6_enter + jmp .L023ecb_dec_loop6_enter .align 16 -.L020ecb_dec_loop6: +.L024ecb_dec_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -486,12 +544,12 @@ aesni_ecb_encrypt: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -.L019ecb_dec_loop6_enter: +.L023ecb_dec_loop6_enter: call _aesni_decrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc .L020ecb_dec_loop6 + jnc .L024ecb_dec_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -500,18 +558,18 @@ aesni_ecb_encrypt: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz .L008ecb_ret -.L018ecb_dec_tail: + jz .L012ecb_ret +.L022ecb_dec_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb .L021ecb_dec_one + jb .L025ecb_dec_one movups 16(%esi),%xmm3 - je .L022ecb_dec_two + je .L026ecb_dec_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb .L023ecb_dec_three + jb .L027ecb_dec_three movups 48(%esi),%xmm5 - je .L024ecb_dec_four + je .L028ecb_dec_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call _aesni_decrypt6 @@ -520,44 +578,51 @@ aesni_ecb_encrypt: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L008ecb_ret + jmp .L012ecb_ret .align 16 -.L021ecb_dec_one: +.L025ecb_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L025dec1_loop_4: +.L029dec1_loop_4: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L025dec1_loop_4 + jnz .L029dec1_loop_4 .byte 102,15,56,223,209 movups %xmm2,(%edi) - jmp .L008ecb_ret + jmp .L012ecb_ret .align 16 -.L022ecb_dec_two: - xorps %xmm4,%xmm4 - call _aesni_decrypt3 +.L026ecb_dec_two: + call _aesni_decrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L008ecb_ret + jmp .L012ecb_ret .align 16 -.L023ecb_dec_three: +.L027ecb_dec_three: call _aesni_decrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L008ecb_ret + jmp .L012ecb_ret .align 16 -.L024ecb_dec_four: +.L028ecb_dec_four: call _aesni_decrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) -.L008ecb_ret: +.L012ecb_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -596,48 +661,56 @@ aesni_ccm64_encrypt_blocks: movl %ebp,20(%esp) movl %ebp,24(%esp) movl %ebp,28(%esp) - shrl $1,%ecx + shll $4,%ecx + movl $16,%ebx leal (%edx),%ebp movdqa (%esp),%xmm5 movdqa %xmm7,%xmm2 - movl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + subl %ecx,%ebx .byte 102,15,56,0,253 -.L026ccm64_enc_outer: +.L030ccm64_enc_outer: movups (%ebp),%xmm0 movl %ebx,%ecx movups (%esi),%xmm6 xorps %xmm0,%xmm2 movups 16(%ebp),%xmm1 xorps %xmm6,%xmm0 - leal 32(%ebp),%edx xorps %xmm0,%xmm3 - movups (%edx),%xmm0 -.L027ccm64_enc2_loop: + movups 32(%ebp),%xmm0 +.L031ccm64_enc2_loop: .byte 102,15,56,220,209 - decl %ecx .byte 102,15,56,220,217 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 - leal 32(%edx),%edx .byte 102,15,56,220,216 - movups (%edx),%xmm0 - jnz .L027ccm64_enc2_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L031ccm64_enc2_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 paddq 16(%esp),%xmm7 + decl %eax .byte 102,15,56,221,208 .byte 102,15,56,221,216 - decl %eax leal 16(%esi),%esi xorps %xmm2,%xmm6 movdqa %xmm7,%xmm2 movups %xmm6,(%edi) - leal 16(%edi),%edi .byte 102,15,56,0,213 - jnz .L026ccm64_enc_outer + leal 16(%edi),%edi + jnz .L030ccm64_enc_outer movl 48(%esp),%esp movl 40(%esp),%edi movups 
%xmm3,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -685,71 +758,82 @@ aesni_ccm64_decrypt_blocks: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L028enc1_loop_5: +.L032enc1_loop_5: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L028enc1_loop_5 + jnz .L032enc1_loop_5 .byte 102,15,56,221,209 + shll $4,%ebx + movl $16,%ecx movups (%esi),%xmm6 paddq 16(%esp),%xmm7 leal 16(%esi),%esi - jmp .L029ccm64_dec_outer + subl %ebx,%ecx + leal 32(%ebp,%ebx,1),%edx + movl %ecx,%ebx + jmp .L033ccm64_dec_outer .align 16 -.L029ccm64_dec_outer: +.L033ccm64_dec_outer: xorps %xmm2,%xmm6 movdqa %xmm7,%xmm2 - movl %ebx,%ecx movups %xmm6,(%edi) leal 16(%edi),%edi .byte 102,15,56,0,213 subl $1,%eax - jz .L030ccm64_dec_break + jz .L034ccm64_dec_break movups (%ebp),%xmm0 - shrl $1,%ecx + movl %ebx,%ecx movups 16(%ebp),%xmm1 xorps %xmm0,%xmm6 - leal 32(%ebp),%edx xorps %xmm0,%xmm2 xorps %xmm6,%xmm3 - movups (%edx),%xmm0 -.L031ccm64_dec2_loop: + movups 32(%ebp),%xmm0 +.L035ccm64_dec2_loop: .byte 102,15,56,220,209 - decl %ecx .byte 102,15,56,220,217 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 - leal 32(%edx),%edx .byte 102,15,56,220,216 - movups (%edx),%xmm0 - jnz .L031ccm64_dec2_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L035ccm64_dec2_loop movups (%esi),%xmm6 paddq 16(%esp),%xmm7 .byte 102,15,56,220,209 .byte 102,15,56,220,217 - leal 16(%esi),%esi .byte 102,15,56,221,208 .byte 102,15,56,221,216 - jmp .L029ccm64_dec_outer + leal 16(%esi),%esi + jmp .L033ccm64_dec_outer .align 16 -.L030ccm64_dec_break: +.L034ccm64_dec_break: + movl 240(%ebp),%ecx movl %ebp,%edx movups (%edx),%xmm0 movups 16(%edx),%xmm1 xorps %xmm0,%xmm6 leal 32(%edx),%edx xorps %xmm6,%xmm3 -.L032enc1_loop_6: +.L036enc1_loop_6: .byte 102,15,56,220,217 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L032enc1_loop_6 + jnz .L036enc1_loop_6 .byte 102,15,56,221,217 movl 48(%esp),%esp movl 40(%esp),%edi movups %xmm3,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -775,7 +859,7 @@ aesni_ctr32_encrypt_blocks: andl $-16,%esp movl %ebp,80(%esp) cmpl $1,%eax - je .L033ctr32_one_shortcut + je .L037ctr32_one_shortcut movdqu (%ebx),%xmm7 movl $202182159,(%esp) movl $134810123,4(%esp) @@ -791,63 +875,59 @@ aesni_ctr32_encrypt_blocks: .byte 102,15,58,34,253,3 movl 240(%edx),%ecx bswap %ebx - pxor %xmm1,%xmm1 pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movdqa (%esp),%xmm2 -.byte 102,15,58,34,203,0 +.byte 102,15,58,34,195,0 leal 3(%ebx),%ebp -.byte 102,15,58,34,197,0 +.byte 102,15,58,34,205,0 incl %ebx -.byte 102,15,58,34,203,1 +.byte 102,15,58,34,195,1 incl %ebp -.byte 102,15,58,34,197,1 +.byte 102,15,58,34,205,1 incl %ebx -.byte 102,15,58,34,203,2 +.byte 102,15,58,34,195,2 incl %ebp -.byte 102,15,58,34,197,2 - movdqa %xmm1,48(%esp) -.byte 102,15,56,0,202 - movdqa %xmm0,64(%esp) +.byte 102,15,58,34,205,2 + movdqa %xmm0,48(%esp) .byte 102,15,56,0,194 - pshufd $192,%xmm1,%xmm2 - pshufd $128,%xmm1,%xmm3 + movdqu (%edx),%xmm6 + movdqa %xmm1,64(%esp) +.byte 102,15,56,0,202 + pshufd $192,%xmm0,%xmm2 + pshufd $128,%xmm0,%xmm3 cmpl $6,%eax - jb .L034ctr32_tail + jb .L038ctr32_tail + pxor %xmm6,%xmm7 + shll $4,%ecx + movl $16,%ebx movdqa %xmm7,32(%esp) - shrl $1,%ecx movl %edx,%ebp - movl %ecx,%ebx 
+ subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx subl $6,%eax - jmp .L035ctr32_loop6 -.align 16 -.L035ctr32_loop6: - pshufd $64,%xmm1,%xmm4 - movdqa 32(%esp),%xmm1 - pshufd $192,%xmm0,%xmm5 - por %xmm1,%xmm2 - pshufd $128,%xmm0,%xmm6 - por %xmm1,%xmm3 - pshufd $64,%xmm0,%xmm7 - por %xmm1,%xmm4 - por %xmm1,%xmm5 - por %xmm1,%xmm6 - por %xmm1,%xmm7 - movups (%ebp),%xmm0 - movups 16(%ebp),%xmm1 - leal 32(%ebp),%edx - decl %ecx + jmp .L039ctr32_loop6 +.align 16 +.L039ctr32_loop6: + pshufd $64,%xmm0,%xmm4 + movdqa 32(%esp),%xmm0 + pshufd $192,%xmm1,%xmm5 pxor %xmm0,%xmm2 + pshufd $128,%xmm1,%xmm6 pxor %xmm0,%xmm3 -.byte 102,15,56,220,209 + pshufd $64,%xmm1,%xmm7 + movups 16(%ebp),%xmm1 pxor %xmm0,%xmm4 -.byte 102,15,56,220,217 pxor %xmm0,%xmm5 -.byte 102,15,56,220,225 +.byte 102,15,56,220,209 pxor %xmm0,%xmm6 -.byte 102,15,56,220,233 pxor %xmm0,%xmm7 +.byte 102,15,56,220,217 + movups 32(%ebp),%xmm0 + movl %ebx,%ecx +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 .byte 102,15,56,220,241 - movups (%edx),%xmm0 .byte 102,15,56,220,249 call .L_aesni_encrypt6_enter movups (%esi),%xmm1 @@ -858,51 +938,51 @@ aesni_ctr32_encrypt_blocks: movups %xmm2,(%edi) movdqa 16(%esp),%xmm0 xorps %xmm1,%xmm4 - movdqa 48(%esp),%xmm1 + movdqa 64(%esp),%xmm1 movups %xmm3,16(%edi) movups %xmm4,32(%edi) paddd %xmm0,%xmm1 - paddd 64(%esp),%xmm0 + paddd 48(%esp),%xmm0 movdqa (%esp),%xmm2 movups 48(%esi),%xmm3 movups 64(%esi),%xmm4 xorps %xmm3,%xmm5 movups 80(%esi),%xmm3 leal 96(%esi),%esi - movdqa %xmm1,48(%esp) -.byte 102,15,56,0,202 + movdqa %xmm0,48(%esp) +.byte 102,15,56,0,194 xorps %xmm4,%xmm6 movups %xmm5,48(%edi) xorps %xmm3,%xmm7 - movdqa %xmm0,64(%esp) -.byte 102,15,56,0,194 + movdqa %xmm1,64(%esp) +.byte 102,15,56,0,202 movups %xmm6,64(%edi) - pshufd $192,%xmm1,%xmm2 + pshufd $192,%xmm0,%xmm2 movups %xmm7,80(%edi) leal 96(%edi),%edi - movl %ebx,%ecx - pshufd $128,%xmm1,%xmm3 + pshufd $128,%xmm0,%xmm3 subl $6,%eax - jnc .L035ctr32_loop6 + jnc .L039ctr32_loop6 addl $6,%eax - jz .L036ctr32_ret + jz .L040ctr32_ret + movdqu (%ebp),%xmm7 movl %ebp,%edx - leal 1(,%ecx,2),%ecx - movdqa 32(%esp),%xmm7 -.L034ctr32_tail: + pxor 32(%esp),%xmm7 + movl 240(%ebp),%ecx +.L038ctr32_tail: por %xmm7,%xmm2 cmpl $2,%eax - jb .L037ctr32_one - pshufd $64,%xmm1,%xmm4 + jb .L041ctr32_one + pshufd $64,%xmm0,%xmm4 por %xmm7,%xmm3 - je .L038ctr32_two - pshufd $192,%xmm0,%xmm5 + je .L042ctr32_two + pshufd $192,%xmm1,%xmm5 por %xmm7,%xmm4 cmpl $4,%eax - jb .L039ctr32_three - pshufd $128,%xmm0,%xmm6 + jb .L043ctr32_three + pshufd $128,%xmm1,%xmm6 por %xmm7,%xmm5 - je .L040ctr32_four + je .L044ctr32_four por %xmm7,%xmm6 call _aesni_encrypt6 movups (%esi),%xmm1 @@ -920,39 +1000,39 @@ aesni_ctr32_encrypt_blocks: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L036ctr32_ret + jmp .L040ctr32_ret .align 16 -.L033ctr32_one_shortcut: +.L037ctr32_one_shortcut: movups (%ebx),%xmm2 movl 240(%edx),%ecx -.L037ctr32_one: +.L041ctr32_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L041enc1_loop_7: +.L045enc1_loop_7: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L041enc1_loop_7 + jnz .L045enc1_loop_7 .byte 102,15,56,221,209 movups (%esi),%xmm6 xorps %xmm2,%xmm6 movups %xmm6,(%edi) - jmp .L036ctr32_ret + jmp .L040ctr32_ret .align 16 -.L038ctr32_two: - call _aesni_encrypt3 +.L042ctr32_two: + call _aesni_encrypt2 movups (%esi),%xmm5 movups 16(%esi),%xmm6 xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L036ctr32_ret + jmp 
.L040ctr32_ret .align 16 -.L039ctr32_three: +.L043ctr32_three: call _aesni_encrypt3 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -963,9 +1043,9 @@ aesni_ctr32_encrypt_blocks: xorps %xmm7,%xmm4 movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L036ctr32_ret + jmp .L040ctr32_ret .align 16 -.L040ctr32_four: +.L044ctr32_four: call _aesni_encrypt4 movups (%esi),%xmm6 movups 16(%esi),%xmm7 @@ -979,7 +1059,18 @@ aesni_ctr32_encrypt_blocks: xorps %xmm0,%xmm5 movups %xmm4,32(%edi) movups %xmm5,48(%edi) -.L036ctr32_ret: +.L040ctr32_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 movl 80(%esp),%esp popl %edi popl %esi @@ -1004,12 +1095,12 @@ aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L042enc1_loop_8: +.L046enc1_loop_8: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L042enc1_loop_8 + jnz .L046enc1_loop_8 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1033,12 +1124,14 @@ aesni_xts_encrypt: movl %edx,%ebp movl %ecx,%ebx subl $96,%eax - jc .L043xts_enc_short - shrl $1,%ecx - movl %ecx,%ebx - jmp .L044xts_enc_loop6 + jc .L047xts_enc_short + shll $4,%ecx + movl $16,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + jmp .L048xts_enc_loop6 .align 16 -.L044xts_enc_loop6: +.L048xts_enc_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1074,6 +1167,7 @@ aesni_xts_encrypt: pand %xmm3,%xmm7 movups (%esi),%xmm2 pxor %xmm1,%xmm7 + movl %ebx,%ecx movdqu 16(%esi),%xmm3 xorps %xmm0,%xmm2 movdqu 32(%esi),%xmm4 @@ -1089,19 +1183,17 @@ aesni_xts_encrypt: movdqa %xmm7,80(%esp) pxor %xmm1,%xmm7 movups 16(%ebp),%xmm1 - leal 32(%ebp),%edx pxor 16(%esp),%xmm3 -.byte 102,15,56,220,209 pxor 32(%esp),%xmm4 -.byte 102,15,56,220,217 +.byte 102,15,56,220,209 pxor 48(%esp),%xmm5 - decl %ecx -.byte 102,15,56,220,225 pxor 64(%esp),%xmm6 -.byte 102,15,56,220,233 +.byte 102,15,56,220,217 pxor %xmm0,%xmm7 + movups 32(%ebp),%xmm0 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 .byte 102,15,56,220,241 - movups (%edx),%xmm0 .byte 102,15,56,220,249 call .L_aesni_encrypt6_enter movdqa 80(%esp),%xmm1 @@ -1126,26 +1218,25 @@ aesni_xts_encrypt: paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 - movl %ebx,%ecx pxor %xmm2,%xmm1 subl $96,%eax - jnc .L044xts_enc_loop6 - leal 1(,%ecx,2),%ecx + jnc .L048xts_enc_loop6 + movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -.L043xts_enc_short: +.L047xts_enc_short: addl $96,%eax - jz .L045xts_enc_done6x + jz .L049xts_enc_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb .L046xts_enc_one + jb .L050xts_enc_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je .L047xts_enc_two + je .L051xts_enc_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1154,7 +1245,7 @@ aesni_xts_encrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb .L048xts_enc_three + jb .L052xts_enc_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1164,7 +1255,7 @@ aesni_xts_encrypt: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je .L049xts_enc_four + je .L053xts_enc_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1196,9 +1287,9 @@ aesni_xts_encrypt: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp .L050xts_enc_done + jmp .L054xts_enc_done .align 16 -.L046xts_enc_one: 
+.L050xts_enc_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1206,37 +1297,36 @@ aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L051enc1_loop_9: +.L055enc1_loop_9: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L051enc1_loop_9 + jnz .L055enc1_loop_9 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp .L050xts_enc_done + jmp .L054xts_enc_done .align 16 -.L047xts_enc_two: +.L051xts_enc_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 leal 32(%esi),%esi xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 - xorps %xmm4,%xmm4 - call _aesni_encrypt3 + call _aesni_encrypt2 xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L050xts_enc_done + jmp .L054xts_enc_done .align 16 -.L048xts_enc_three: +.L052xts_enc_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1254,9 +1344,9 @@ aesni_xts_encrypt: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp .L050xts_enc_done + jmp .L054xts_enc_done .align 16 -.L049xts_enc_four: +.L053xts_enc_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1278,28 +1368,28 @@ aesni_xts_encrypt: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L050xts_enc_done + jmp .L054xts_enc_done .align 16 -.L045xts_enc_done6x: +.L049xts_enc_done6x: movl 112(%esp),%eax andl $15,%eax - jz .L052xts_enc_ret + jz .L056xts_enc_ret movdqa %xmm1,%xmm5 movl %eax,112(%esp) - jmp .L053xts_enc_steal + jmp .L057xts_enc_steal .align 16 -.L050xts_enc_done: +.L054xts_enc_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz .L052xts_enc_ret + jz .L056xts_enc_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm5 paddq %xmm1,%xmm1 pand 96(%esp),%xmm5 pxor %xmm1,%xmm5 -.L053xts_enc_steal: +.L057xts_enc_steal: movzbl (%esi),%ecx movzbl -16(%edi),%edx leal 1(%esi),%esi @@ -1307,7 +1397,7 @@ aesni_xts_encrypt: movb %dl,(%edi) leal 1(%edi),%edi subl $1,%eax - jnz .L053xts_enc_steal + jnz .L057xts_enc_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1317,16 +1407,30 @@ aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L054enc1_loop_10: +.L058enc1_loop_10: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L054enc1_loop_10 + jnz .L058enc1_loop_10 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,-16(%edi) -.L052xts_enc_ret: +.L056xts_enc_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) movl 116(%esp),%esp popl %edi popl %esi @@ -1351,12 +1455,12 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L055enc1_loop_11: +.L059enc1_loop_11: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L055enc1_loop_11 + jnz .L059enc1_loop_11 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1385,12 +1489,14 @@ aesni_xts_decrypt: pcmpgtd %xmm1,%xmm0 andl $-16,%eax subl $96,%eax - jc .L056xts_dec_short - shrl $1,%ecx - movl %ecx,%ebx - jmp .L057xts_dec_loop6 + jc .L060xts_dec_short + shll $4,%ecx + movl $16,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + jmp .L061xts_dec_loop6 .align 16 -.L057xts_dec_loop6: +.L061xts_dec_loop6: 
pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1426,6 +1532,7 @@ aesni_xts_decrypt: pand %xmm3,%xmm7 movups (%esi),%xmm2 pxor %xmm1,%xmm7 + movl %ebx,%ecx movdqu 16(%esi),%xmm3 xorps %xmm0,%xmm2 movdqu 32(%esi),%xmm4 @@ -1441,19 +1548,17 @@ aesni_xts_decrypt: movdqa %xmm7,80(%esp) pxor %xmm1,%xmm7 movups 16(%ebp),%xmm1 - leal 32(%ebp),%edx pxor 16(%esp),%xmm3 -.byte 102,15,56,222,209 pxor 32(%esp),%xmm4 -.byte 102,15,56,222,217 +.byte 102,15,56,222,209 pxor 48(%esp),%xmm5 - decl %ecx -.byte 102,15,56,222,225 pxor 64(%esp),%xmm6 -.byte 102,15,56,222,233 +.byte 102,15,56,222,217 pxor %xmm0,%xmm7 + movups 32(%ebp),%xmm0 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 .byte 102,15,56,222,241 - movups (%edx),%xmm0 .byte 102,15,56,222,249 call .L_aesni_decrypt6_enter movdqa 80(%esp),%xmm1 @@ -1478,26 +1583,25 @@ aesni_xts_decrypt: paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 - movl %ebx,%ecx pxor %xmm2,%xmm1 subl $96,%eax - jnc .L057xts_dec_loop6 - leal 1(,%ecx,2),%ecx + jnc .L061xts_dec_loop6 + movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -.L056xts_dec_short: +.L060xts_dec_short: addl $96,%eax - jz .L058xts_dec_done6x + jz .L062xts_dec_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb .L059xts_dec_one + jb .L063xts_dec_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je .L060xts_dec_two + je .L064xts_dec_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1506,7 +1610,7 @@ aesni_xts_decrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb .L061xts_dec_three + jb .L065xts_dec_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1516,7 +1620,7 @@ aesni_xts_decrypt: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je .L062xts_dec_four + je .L066xts_dec_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1548,9 +1652,9 @@ aesni_xts_decrypt: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp .L063xts_dec_done + jmp .L067xts_dec_done .align 16 -.L059xts_dec_one: +.L063xts_dec_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1558,36 +1662,36 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L064dec1_loop_12: +.L068dec1_loop_12: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L064dec1_loop_12 + jnz .L068dec1_loop_12 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp .L063xts_dec_done + jmp .L067xts_dec_done .align 16 -.L060xts_dec_two: +.L064xts_dec_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 leal 32(%esi),%esi xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 - call _aesni_decrypt3 + call _aesni_decrypt2 xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L063xts_dec_done + jmp .L067xts_dec_done .align 16 -.L061xts_dec_three: +.L065xts_dec_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1605,9 +1709,9 @@ aesni_xts_decrypt: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp .L063xts_dec_done + jmp .L067xts_dec_done .align 16 -.L062xts_dec_four: +.L066xts_dec_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1629,20 +1733,20 @@ aesni_xts_decrypt: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L063xts_dec_done + jmp .L067xts_dec_done .align 16 -.L058xts_dec_done6x: +.L062xts_dec_done6x: movl 112(%esp),%eax andl $15,%eax - jz 
.L065xts_dec_ret + jz .L069xts_dec_ret movl %eax,112(%esp) - jmp .L066xts_dec_only_one_more + jmp .L070xts_dec_only_one_more .align 16 -.L063xts_dec_done: +.L067xts_dec_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz .L065xts_dec_ret + jz .L069xts_dec_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm2 @@ -1652,7 +1756,7 @@ aesni_xts_decrypt: pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 -.L066xts_dec_only_one_more: +.L070xts_dec_only_one_more: pshufd $19,%xmm0,%xmm5 movdqa %xmm1,%xmm6 paddq %xmm1,%xmm1 @@ -1666,16 +1770,16 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L067dec1_loop_13: +.L071dec1_loop_13: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L067dec1_loop_13 + jnz .L071dec1_loop_13 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) -.L068xts_dec_steal: +.L072xts_dec_steal: movzbl 16(%esi),%ecx movzbl (%edi),%edx leal 1(%esi),%esi @@ -1683,7 +1787,7 @@ aesni_xts_decrypt: movb %dl,16(%edi) leal 1(%edi),%edi subl $1,%eax - jnz .L068xts_dec_steal + jnz .L072xts_dec_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1693,16 +1797,30 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L069dec1_loop_14: +.L073dec1_loop_14: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L069dec1_loop_14 + jnz .L073dec1_loop_14 .byte 102,15,56,223,209 xorps %xmm6,%xmm2 movups %xmm2,(%edi) -.L065xts_dec_ret: +.L069xts_dec_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) movl 116(%esp),%esp popl %edi popl %esi @@ -1710,89 +1828,880 @@ aesni_xts_decrypt: popl %ebp ret .size aesni_xts_decrypt,.-.L_aesni_xts_decrypt_begin -.globl aesni_cbc_encrypt -.type aesni_cbc_encrypt,@function +.globl aesni_ocb_encrypt +.type aesni_ocb_encrypt,@function .align 16 -aesni_cbc_encrypt: -.L_aesni_cbc_encrypt_begin: +aesni_ocb_encrypt: +.L_aesni_ocb_encrypt_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi + movl 40(%esp),%ecx + movl 48(%esp),%ebx movl 20(%esp),%esi - movl %esp,%ebx movl 24(%esp),%edi - subl $24,%ebx movl 28(%esp),%eax - andl $-16,%ebx movl 32(%esp),%edx + movdqu (%ecx),%xmm0 movl 36(%esp),%ebp - testl %eax,%eax - jz .L070cbc_abort - cmpl $0,40(%esp) - xchgl %esp,%ebx - movups (%ebp),%xmm7 + movdqu (%ebx),%xmm1 + movl 44(%esp),%ebx + movl %esp,%ecx + subl $132,%esp + andl $-16,%esp + subl %esi,%edi + shll $4,%eax + leal -96(%esi,%eax,1),%eax + movl %edi,120(%esp) + movl %eax,124(%esp) + movl %ecx,128(%esp) movl 240(%edx),%ecx - movl %edx,%ebp - movl %ebx,16(%esp) - movl %ecx,%ebx - je .L071cbc_decrypt - movaps %xmm7,%xmm2 - cmpl $16,%eax - jb .L072cbc_enc_tail - subl $16,%eax - jmp .L073cbc_enc_loop -.align 16 -.L073cbc_enc_loop: - movups (%esi),%xmm7 + testl $1,%ebp + jnz .L074odd + bsfl %ebp,%eax + addl $1,%ebp + shll $4,%eax + movdqu (%ebx,%eax,1),%xmm7 + movl %edx,%eax + movdqu (%esi),%xmm2 leal 16(%esi),%esi + pxor %xmm0,%xmm7 + pxor %xmm2,%xmm1 + pxor %xmm7,%xmm2 + movdqa %xmm1,%xmm6 movups (%edx),%xmm0 movups 16(%edx),%xmm1 - xorps %xmm0,%xmm7 leal 32(%edx),%edx - xorps %xmm7,%xmm2 -.L074enc1_loop_15: + xorps %xmm0,%xmm2 +.L075enc1_loop_15: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L074enc1_loop_15 + jnz .L075enc1_loop_15 .byte 
102,15,56,221,209 - movl %ebx,%ecx - movl %ebp,%edx - movups %xmm2,(%edi) - leal 16(%edi),%edi - subl $16,%eax - jnc .L073cbc_enc_loop - addl $16,%eax - jnz .L072cbc_enc_tail - movaps %xmm2,%xmm7 - jmp .L075cbc_ret -.L072cbc_enc_tail: - movl %eax,%ecx -.long 2767451785 - movl $16,%ecx - subl %eax,%ecx - xorl %eax,%eax -.long 2868115081 - leal -16(%edi),%edi - movl %ebx,%ecx - movl %edi,%esi - movl %ebp,%edx - jmp .L073cbc_enc_loop -.align 16 -.L071cbc_decrypt: - cmpl $80,%eax - jbe .L076cbc_dec_tail - movaps %xmm7,(%esp) - subl $80,%eax - jmp .L077cbc_dec_loop6_enter -.align 16 -.L078cbc_dec_loop6: - movaps %xmm0,(%esp) - movups %xmm7,(%edi) + xorps %xmm7,%xmm2 + movdqa %xmm7,%xmm0 + movdqa %xmm6,%xmm1 + movups %xmm2,-16(%edi,%esi,1) + movl 240(%eax),%ecx + movl %eax,%edx + movl 124(%esp),%eax +.L074odd: + shll $4,%ecx + movl $16,%edi + subl %ecx,%edi + movl %edx,112(%esp) + leal 32(%edx,%ecx,1),%edx + movl %edi,116(%esp) + cmpl %eax,%esi + ja .L076short + jmp .L077grandloop +.align 32 +.L077grandloop: + leal 1(%ebp),%ecx + leal 3(%ebp),%eax + leal 5(%ebp),%edi + addl $6,%ebp + bsfl %ecx,%ecx + bsfl %eax,%eax + bsfl %edi,%edi + shll $4,%ecx + shll $4,%eax + shll $4,%edi + movdqu (%ebx),%xmm2 + movdqu (%ebx,%ecx,1),%xmm3 + movl 116(%esp),%ecx + movdqa %xmm2,%xmm4 + movdqu (%ebx,%eax,1),%xmm5 + movdqa %xmm2,%xmm6 + movdqu (%ebx,%edi,1),%xmm7 + pxor %xmm0,%xmm2 + pxor %xmm2,%xmm3 + movdqa %xmm2,(%esp) + pxor %xmm3,%xmm4 + movdqa %xmm3,16(%esp) + pxor %xmm4,%xmm5 + movdqa %xmm4,32(%esp) + pxor %xmm5,%xmm6 + movdqa %xmm5,48(%esp) + pxor %xmm6,%xmm7 + movdqa %xmm6,64(%esp) + movdqa %xmm7,80(%esp) + movups -48(%edx,%ecx,1),%xmm0 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + movdqu 80(%esi),%xmm7 + leal 96(%esi),%esi + pxor %xmm2,%xmm1 + pxor %xmm0,%xmm2 + pxor %xmm3,%xmm1 + pxor %xmm0,%xmm3 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + pxor %xmm5,%xmm1 + pxor %xmm0,%xmm5 + pxor %xmm6,%xmm1 + pxor %xmm0,%xmm6 + pxor %xmm7,%xmm1 + pxor %xmm0,%xmm7 + movdqa %xmm1,96(%esp) + movups -32(%edx,%ecx,1),%xmm1 + pxor (%esp),%xmm2 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 + pxor 80(%esp),%xmm7 + movups -16(%edx,%ecx,1),%xmm0 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movl 120(%esp),%edi + movl 124(%esp),%eax + call .L_aesni_encrypt6_enter + movdqa 80(%esp),%xmm0 + pxor (%esp),%xmm2 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 + pxor %xmm0,%xmm7 + movdqa 96(%esp),%xmm1 + movdqu %xmm2,-96(%edi,%esi,1) + movdqu %xmm3,-80(%edi,%esi,1) + movdqu %xmm4,-64(%edi,%esi,1) + movdqu %xmm5,-48(%edi,%esi,1) + movdqu %xmm6,-32(%edi,%esi,1) + movdqu %xmm7,-16(%edi,%esi,1) + cmpl %eax,%esi + jb .L077grandloop +.L076short: + addl $96,%eax + subl %esi,%eax + jz .L078done + cmpl $32,%eax + jb .L079one + je .L080two + cmpl $64,%eax + jb .L081three + je .L082four + leal 1(%ebp),%ecx + leal 3(%ebp),%eax + bsfl %ecx,%ecx + bsfl %eax,%eax + shll $4,%ecx + shll $4,%eax + movdqu (%ebx),%xmm2 + movdqu (%ebx,%ecx,1),%xmm3 + movl 116(%esp),%ecx + movdqa %xmm2,%xmm4 + movdqu (%ebx,%eax,1),%xmm5 + movdqa %xmm2,%xmm6 + pxor %xmm0,%xmm2 + pxor %xmm2,%xmm3 + movdqa %xmm2,(%esp) + pxor %xmm3,%xmm4 + movdqa %xmm3,16(%esp) + pxor %xmm4,%xmm5 + movdqa %xmm4,32(%esp) + pxor %xmm5,%xmm6 + movdqa %xmm5,48(%esp) + pxor %xmm6,%xmm7 + movdqa %xmm6,64(%esp) + movups -48(%edx,%ecx,1),%xmm0 + 
movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm2,%xmm1 + pxor %xmm0,%xmm2 + pxor %xmm3,%xmm1 + pxor %xmm0,%xmm3 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + pxor %xmm5,%xmm1 + pxor %xmm0,%xmm5 + pxor %xmm6,%xmm1 + pxor %xmm0,%xmm6 + movdqa %xmm1,96(%esp) + movups -32(%edx,%ecx,1),%xmm1 + pxor (%esp),%xmm2 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 + movups -16(%edx,%ecx,1),%xmm0 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movl 120(%esp),%edi + call .L_aesni_encrypt6_enter + movdqa 64(%esp),%xmm0 + pxor (%esp),%xmm2 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor %xmm0,%xmm6 + movdqa 96(%esp),%xmm1 + movdqu %xmm2,(%edi,%esi,1) + movdqu %xmm3,16(%edi,%esi,1) + movdqu %xmm4,32(%edi,%esi,1) + movdqu %xmm5,48(%edi,%esi,1) + movdqu %xmm6,64(%edi,%esi,1) + jmp .L078done +.align 16 +.L079one: + movdqu (%ebx),%xmm7 + movl 112(%esp),%edx + movdqu (%esi),%xmm2 + movl 240(%edx),%ecx + pxor %xmm0,%xmm7 + pxor %xmm2,%xmm1 + pxor %xmm7,%xmm2 + movdqa %xmm1,%xmm6 + movl 120(%esp),%edi + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +.L083enc1_loop_16: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L083enc1_loop_16 +.byte 102,15,56,221,209 + xorps %xmm7,%xmm2 + movdqa %xmm7,%xmm0 + movdqa %xmm6,%xmm1 + movups %xmm2,(%edi,%esi,1) + jmp .L078done +.align 16 +.L080two: + leal 1(%ebp),%ecx + movl 112(%esp),%edx + bsfl %ecx,%ecx + shll $4,%ecx + movdqu (%ebx),%xmm6 + movdqu (%ebx,%ecx,1),%xmm7 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movl 240(%edx),%ecx + pxor %xmm0,%xmm6 + pxor %xmm6,%xmm7 + pxor %xmm2,%xmm1 + pxor %xmm6,%xmm2 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm3 + movdqa %xmm1,%xmm5 + movl 120(%esp),%edi + call _aesni_encrypt2 + xorps %xmm6,%xmm2 + xorps %xmm7,%xmm3 + movdqa %xmm7,%xmm0 + movdqa %xmm5,%xmm1 + movups %xmm2,(%edi,%esi,1) + movups %xmm3,16(%edi,%esi,1) + jmp .L078done +.align 16 +.L081three: + leal 1(%ebp),%ecx + movl 112(%esp),%edx + bsfl %ecx,%ecx + shll $4,%ecx + movdqu (%ebx),%xmm5 + movdqu (%ebx,%ecx,1),%xmm6 + movdqa %xmm5,%xmm7 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movl 240(%edx),%ecx + pxor %xmm0,%xmm5 + pxor %xmm5,%xmm6 + pxor %xmm6,%xmm7 + pxor %xmm2,%xmm1 + pxor %xmm5,%xmm2 + pxor %xmm3,%xmm1 + pxor %xmm6,%xmm3 + pxor %xmm4,%xmm1 + pxor %xmm7,%xmm4 + movdqa %xmm1,96(%esp) + movl 120(%esp),%edi + call _aesni_encrypt3 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm4 + movdqa %xmm7,%xmm0 + movdqa 96(%esp),%xmm1 + movups %xmm2,(%edi,%esi,1) + movups %xmm3,16(%edi,%esi,1) + movups %xmm4,32(%edi,%esi,1) + jmp .L078done +.align 16 +.L082four: + leal 1(%ebp),%ecx + leal 3(%ebp),%eax + bsfl %ecx,%ecx + bsfl %eax,%eax + movl 112(%esp),%edx + shll $4,%ecx + shll $4,%eax + movdqu (%ebx),%xmm4 + movdqu (%ebx,%ecx,1),%xmm5 + movdqa %xmm4,%xmm6 + movdqu (%ebx,%eax,1),%xmm7 + pxor %xmm0,%xmm4 + movdqu (%esi),%xmm2 + pxor %xmm4,%xmm5 + movdqu 16(%esi),%xmm3 + pxor %xmm5,%xmm6 + movdqa %xmm4,(%esp) + pxor %xmm6,%xmm7 + movdqa %xmm5,16(%esp) + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movl 240(%edx),%ecx + pxor %xmm2,%xmm1 + pxor (%esp),%xmm2 + pxor %xmm3,%xmm1 + pxor 16(%esp),%xmm3 + pxor %xmm4,%xmm1 + pxor %xmm6,%xmm4 + pxor %xmm5,%xmm1 + pxor %xmm7,%xmm5 + movdqa %xmm1,96(%esp) + movl 
120(%esp),%edi + call _aesni_encrypt4 + xorps (%esp),%xmm2 + xorps 16(%esp),%xmm3 + xorps %xmm6,%xmm4 + movups %xmm2,(%edi,%esi,1) + xorps %xmm7,%xmm5 + movups %xmm3,16(%edi,%esi,1) + movdqa %xmm7,%xmm0 + movups %xmm4,32(%edi,%esi,1) + movdqa 96(%esp),%xmm1 + movups %xmm5,48(%edi,%esi,1) +.L078done: + movl 128(%esp),%edx + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + movdqa %xmm2,(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm2,16(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm2,32(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm2,48(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm2,64(%esp) + movdqa %xmm2,80(%esp) + movdqa %xmm2,96(%esp) + leal (%edx),%esp + movl 40(%esp),%ecx + movl 48(%esp),%ebx + movdqu %xmm0,(%ecx) + pxor %xmm0,%xmm0 + movdqu %xmm1,(%ebx) + pxor %xmm1,%xmm1 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size aesni_ocb_encrypt,.-.L_aesni_ocb_encrypt_begin +.globl aesni_ocb_decrypt +.type aesni_ocb_decrypt,@function +.align 16 +aesni_ocb_decrypt: +.L_aesni_ocb_decrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 40(%esp),%ecx + movl 48(%esp),%ebx + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movdqu (%ecx),%xmm0 + movl 36(%esp),%ebp + movdqu (%ebx),%xmm1 + movl 44(%esp),%ebx + movl %esp,%ecx + subl $132,%esp + andl $-16,%esp + subl %esi,%edi + shll $4,%eax + leal -96(%esi,%eax,1),%eax + movl %edi,120(%esp) + movl %eax,124(%esp) + movl %ecx,128(%esp) + movl 240(%edx),%ecx + testl $1,%ebp + jnz .L084odd + bsfl %ebp,%eax + addl $1,%ebp + shll $4,%eax + movdqu (%ebx,%eax,1),%xmm7 + movl %edx,%eax + movdqu (%esi),%xmm2 + leal 16(%esi),%esi + pxor %xmm0,%xmm7 + pxor %xmm7,%xmm2 + movdqa %xmm1,%xmm6 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +.L085dec1_loop_17: +.byte 102,15,56,222,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L085dec1_loop_17 +.byte 102,15,56,223,209 + xorps %xmm7,%xmm2 + movaps %xmm6,%xmm1 + movdqa %xmm7,%xmm0 + xorps %xmm2,%xmm1 + movups %xmm2,-16(%edi,%esi,1) + movl 240(%eax),%ecx + movl %eax,%edx + movl 124(%esp),%eax +.L084odd: + shll $4,%ecx + movl $16,%edi + subl %ecx,%edi + movl %edx,112(%esp) + leal 32(%edx,%ecx,1),%edx + movl %edi,116(%esp) + cmpl %eax,%esi + ja .L086short + jmp .L087grandloop +.align 32 +.L087grandloop: + leal 1(%ebp),%ecx + leal 3(%ebp),%eax + leal 5(%ebp),%edi + addl $6,%ebp + bsfl %ecx,%ecx + bsfl %eax,%eax + bsfl %edi,%edi + shll $4,%ecx + shll $4,%eax + shll $4,%edi + movdqu (%ebx),%xmm2 + movdqu (%ebx,%ecx,1),%xmm3 + movl 116(%esp),%ecx + movdqa %xmm2,%xmm4 + movdqu (%ebx,%eax,1),%xmm5 + movdqa %xmm2,%xmm6 + movdqu (%ebx,%edi,1),%xmm7 + pxor %xmm0,%xmm2 + pxor %xmm2,%xmm3 + movdqa %xmm2,(%esp) + pxor %xmm3,%xmm4 + movdqa %xmm3,16(%esp) + pxor %xmm4,%xmm5 + movdqa %xmm4,32(%esp) + pxor %xmm5,%xmm6 + movdqa %xmm5,48(%esp) + pxor %xmm6,%xmm7 + movdqa %xmm6,64(%esp) + movdqa %xmm7,80(%esp) + movups -48(%edx,%ecx,1),%xmm0 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + movdqu 80(%esi),%xmm7 + leal 96(%esi),%esi + movdqa %xmm1,96(%esp) + pxor %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 + movups -32(%edx,%ecx,1),%xmm1 + pxor (%esp),%xmm2 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 + pxor 80(%esp),%xmm7 + movups -16(%edx,%ecx,1),%xmm0 +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 
102,15,56,222,249 + movl 120(%esp),%edi + movl 124(%esp),%eax + call .L_aesni_decrypt6_enter + movdqa 80(%esp),%xmm0 + pxor (%esp),%xmm2 + movdqa 96(%esp),%xmm1 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 + pxor %xmm0,%xmm7 + pxor %xmm2,%xmm1 + movdqu %xmm2,-96(%edi,%esi,1) + pxor %xmm3,%xmm1 + movdqu %xmm3,-80(%edi,%esi,1) + pxor %xmm4,%xmm1 + movdqu %xmm4,-64(%edi,%esi,1) + pxor %xmm5,%xmm1 + movdqu %xmm5,-48(%edi,%esi,1) + pxor %xmm6,%xmm1 + movdqu %xmm6,-32(%edi,%esi,1) + pxor %xmm7,%xmm1 + movdqu %xmm7,-16(%edi,%esi,1) + cmpl %eax,%esi + jb .L087grandloop +.L086short: + addl $96,%eax + subl %esi,%eax + jz .L088done + cmpl $32,%eax + jb .L089one + je .L090two + cmpl $64,%eax + jb .L091three + je .L092four + leal 1(%ebp),%ecx + leal 3(%ebp),%eax + bsfl %ecx,%ecx + bsfl %eax,%eax + shll $4,%ecx + shll $4,%eax + movdqu (%ebx),%xmm2 + movdqu (%ebx,%ecx,1),%xmm3 + movl 116(%esp),%ecx + movdqa %xmm2,%xmm4 + movdqu (%ebx,%eax,1),%xmm5 + movdqa %xmm2,%xmm6 + pxor %xmm0,%xmm2 + pxor %xmm2,%xmm3 + movdqa %xmm2,(%esp) + pxor %xmm3,%xmm4 + movdqa %xmm3,16(%esp) + pxor %xmm4,%xmm5 + movdqa %xmm4,32(%esp) + pxor %xmm5,%xmm6 + movdqa %xmm5,48(%esp) + pxor %xmm6,%xmm7 + movdqa %xmm6,64(%esp) + movups -48(%edx,%ecx,1),%xmm0 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + pxor %xmm7,%xmm7 + movdqa %xmm1,96(%esp) + pxor %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + movups -32(%edx,%ecx,1),%xmm1 + pxor (%esp),%xmm2 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 + movups -16(%edx,%ecx,1),%xmm0 +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movl 120(%esp),%edi + call .L_aesni_decrypt6_enter + movdqa 64(%esp),%xmm0 + pxor (%esp),%xmm2 + movdqa 96(%esp),%xmm1 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor %xmm0,%xmm6 + pxor %xmm2,%xmm1 + movdqu %xmm2,(%edi,%esi,1) + pxor %xmm3,%xmm1 + movdqu %xmm3,16(%edi,%esi,1) + pxor %xmm4,%xmm1 + movdqu %xmm4,32(%edi,%esi,1) + pxor %xmm5,%xmm1 + movdqu %xmm5,48(%edi,%esi,1) + pxor %xmm6,%xmm1 + movdqu %xmm6,64(%edi,%esi,1) + jmp .L088done +.align 16 +.L089one: + movdqu (%ebx),%xmm7 + movl 112(%esp),%edx + movdqu (%esi),%xmm2 + movl 240(%edx),%ecx + pxor %xmm0,%xmm7 + pxor %xmm7,%xmm2 + movdqa %xmm1,%xmm6 + movl 120(%esp),%edi + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +.L093dec1_loop_18: +.byte 102,15,56,222,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L093dec1_loop_18 +.byte 102,15,56,223,209 + xorps %xmm7,%xmm2 + movaps %xmm6,%xmm1 + movdqa %xmm7,%xmm0 + xorps %xmm2,%xmm1 + movups %xmm2,(%edi,%esi,1) + jmp .L088done +.align 16 +.L090two: + leal 1(%ebp),%ecx + movl 112(%esp),%edx + bsfl %ecx,%ecx + shll $4,%ecx + movdqu (%ebx),%xmm6 + movdqu (%ebx,%ecx,1),%xmm7 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movl 240(%edx),%ecx + movdqa %xmm1,%xmm5 + pxor %xmm0,%xmm6 + pxor %xmm6,%xmm7 + pxor %xmm6,%xmm2 + pxor %xmm7,%xmm3 + movl 120(%esp),%edi + call _aesni_decrypt2 + xorps %xmm6,%xmm2 + xorps %xmm7,%xmm3 + movdqa %xmm7,%xmm0 + xorps %xmm2,%xmm5 + movups %xmm2,(%edi,%esi,1) + xorps %xmm3,%xmm5 + movups %xmm3,16(%edi,%esi,1) + movaps %xmm5,%xmm1 + jmp .L088done +.align 16 +.L091three: + leal 1(%ebp),%ecx + movl 112(%esp),%edx + bsfl %ecx,%ecx + shll $4,%ecx + movdqu (%ebx),%xmm5 + 
movdqu (%ebx,%ecx,1),%xmm6 + movdqa %xmm5,%xmm7 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movl 240(%edx),%ecx + movdqa %xmm1,96(%esp) + pxor %xmm0,%xmm5 + pxor %xmm5,%xmm6 + pxor %xmm6,%xmm7 + pxor %xmm5,%xmm2 + pxor %xmm6,%xmm3 + pxor %xmm7,%xmm4 + movl 120(%esp),%edi + call _aesni_decrypt3 + movdqa 96(%esp),%xmm1 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm4 + movups %xmm2,(%edi,%esi,1) + pxor %xmm2,%xmm1 + movdqa %xmm7,%xmm0 + movups %xmm3,16(%edi,%esi,1) + pxor %xmm3,%xmm1 + movups %xmm4,32(%edi,%esi,1) + pxor %xmm4,%xmm1 + jmp .L088done +.align 16 +.L092four: + leal 1(%ebp),%ecx + leal 3(%ebp),%eax + bsfl %ecx,%ecx + bsfl %eax,%eax + movl 112(%esp),%edx + shll $4,%ecx + shll $4,%eax + movdqu (%ebx),%xmm4 + movdqu (%ebx,%ecx,1),%xmm5 + movdqa %xmm4,%xmm6 + movdqu (%ebx,%eax,1),%xmm7 + pxor %xmm0,%xmm4 + movdqu (%esi),%xmm2 + pxor %xmm4,%xmm5 + movdqu 16(%esi),%xmm3 + pxor %xmm5,%xmm6 + movdqa %xmm4,(%esp) + pxor %xmm6,%xmm7 + movdqa %xmm5,16(%esp) + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movl 240(%edx),%ecx + movdqa %xmm1,96(%esp) + pxor (%esp),%xmm2 + pxor 16(%esp),%xmm3 + pxor %xmm6,%xmm4 + pxor %xmm7,%xmm5 + movl 120(%esp),%edi + call _aesni_decrypt4 + movdqa 96(%esp),%xmm1 + xorps (%esp),%xmm2 + xorps 16(%esp),%xmm3 + xorps %xmm6,%xmm4 + movups %xmm2,(%edi,%esi,1) + pxor %xmm2,%xmm1 + xorps %xmm7,%xmm5 + movups %xmm3,16(%edi,%esi,1) + pxor %xmm3,%xmm1 + movdqa %xmm7,%xmm0 + movups %xmm4,32(%edi,%esi,1) + pxor %xmm4,%xmm1 + movups %xmm5,48(%edi,%esi,1) + pxor %xmm5,%xmm1 +.L088done: + movl 128(%esp),%edx + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + movdqa %xmm2,(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm2,16(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm2,32(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm2,48(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm2,64(%esp) + movdqa %xmm2,80(%esp) + movdqa %xmm2,96(%esp) + leal (%edx),%esp + movl 40(%esp),%ecx + movl 48(%esp),%ebx + movdqu %xmm0,(%ecx) + pxor %xmm0,%xmm0 + movdqu %xmm1,(%ebx) + pxor %xmm1,%xmm1 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size aesni_ocb_decrypt,.-.L_aesni_ocb_decrypt_begin +.globl aesni_cbc_encrypt +.type aesni_cbc_encrypt,@function +.align 16 +aesni_cbc_encrypt: +.L_aesni_cbc_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl %esp,%ebx + movl 24(%esp),%edi + subl $24,%ebx + movl 28(%esp),%eax + andl $-16,%ebx + movl 32(%esp),%edx + movl 36(%esp),%ebp + testl %eax,%eax + jz .L094cbc_abort + cmpl $0,40(%esp) + xchgl %esp,%ebx + movups (%ebp),%xmm7 + movl 240(%edx),%ecx + movl %edx,%ebp + movl %ebx,16(%esp) + movl %ecx,%ebx + je .L095cbc_decrypt + movaps %xmm7,%xmm2 + cmpl $16,%eax + jb .L096cbc_enc_tail + subl $16,%eax + jmp .L097cbc_enc_loop +.align 16 +.L097cbc_enc_loop: + movups (%esi),%xmm7 + leal 16(%esi),%esi + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm7 + leal 32(%edx),%edx + xorps %xmm7,%xmm2 +.L098enc1_loop_19: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L098enc1_loop_19 +.byte 102,15,56,221,209 + movl %ebx,%ecx + movl %ebp,%edx + movups %xmm2,(%edi) leal 16(%edi),%edi -.L077cbc_dec_loop6_enter: + subl $16,%eax + jnc .L097cbc_enc_loop + addl $16,%eax + jnz .L096cbc_enc_tail + movaps %xmm2,%xmm7 + pxor %xmm2,%xmm2 + jmp .L099cbc_ret +.L096cbc_enc_tail: + movl %eax,%ecx +.long 2767451785 + movl $16,%ecx + subl %eax,%ecx + xorl %eax,%eax +.long 2868115081 + leal -16(%edi),%edi + movl %ebx,%ecx + movl %edi,%esi + movl %ebp,%edx + jmp .L097cbc_enc_loop +.align 16 
+.L095cbc_decrypt: + cmpl $80,%eax + jbe .L100cbc_dec_tail + movaps %xmm7,(%esp) + subl $80,%eax + jmp .L101cbc_dec_loop6_enter +.align 16 +.L102cbc_dec_loop6: + movaps %xmm0,(%esp) + movups %xmm7,(%edi) + leal 16(%edi),%edi +.L101cbc_dec_loop6_enter: movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -1822,28 +2731,28 @@ aesni_cbc_encrypt: movups %xmm6,64(%edi) leal 80(%edi),%edi subl $96,%eax - ja .L078cbc_dec_loop6 + ja .L102cbc_dec_loop6 movaps %xmm7,%xmm2 movaps %xmm0,%xmm7 addl $80,%eax - jle .L079cbc_dec_tail_collected + jle .L103cbc_dec_clear_tail_collected movups %xmm2,(%edi) leal 16(%edi),%edi -.L076cbc_dec_tail: +.L100cbc_dec_tail: movups (%esi),%xmm2 movaps %xmm2,%xmm6 cmpl $16,%eax - jbe .L080cbc_dec_one + jbe .L104cbc_dec_one movups 16(%esi),%xmm3 movaps %xmm3,%xmm5 cmpl $32,%eax - jbe .L081cbc_dec_two + jbe .L105cbc_dec_two movups 32(%esi),%xmm4 cmpl $48,%eax - jbe .L082cbc_dec_three + jbe .L106cbc_dec_three movups 48(%esi),%xmm5 cmpl $64,%eax - jbe .L083cbc_dec_four + jbe .L107cbc_dec_four movups 64(%esi),%xmm6 movaps %xmm7,(%esp) movups (%esi),%xmm2 @@ -1861,56 +2770,62 @@ aesni_cbc_encrypt: xorps %xmm0,%xmm6 movups %xmm2,(%edi) movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 movups %xmm5,48(%edi) + pxor %xmm5,%xmm5 leal 64(%edi),%edi movaps %xmm6,%xmm2 + pxor %xmm6,%xmm6 subl $80,%eax - jmp .L079cbc_dec_tail_collected + jmp .L108cbc_dec_tail_collected .align 16 -.L080cbc_dec_one: +.L104cbc_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L084dec1_loop_16: +.L109dec1_loop_20: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L084dec1_loop_16 + jnz .L109dec1_loop_20 .byte 102,15,56,223,209 xorps %xmm7,%xmm2 movaps %xmm6,%xmm7 subl $16,%eax - jmp .L079cbc_dec_tail_collected + jmp .L108cbc_dec_tail_collected .align 16 -.L081cbc_dec_two: - xorps %xmm4,%xmm4 - call _aesni_decrypt3 +.L105cbc_dec_two: + call _aesni_decrypt2 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movaps %xmm3,%xmm2 + pxor %xmm3,%xmm3 leal 16(%edi),%edi movaps %xmm5,%xmm7 subl $32,%eax - jmp .L079cbc_dec_tail_collected + jmp .L108cbc_dec_tail_collected .align 16 -.L082cbc_dec_three: +.L106cbc_dec_three: call _aesni_decrypt3 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 xorps %xmm5,%xmm4 movups %xmm2,(%edi) movaps %xmm4,%xmm2 + pxor %xmm4,%xmm4 movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 leal 32(%edi),%edi movups 32(%esi),%xmm7 subl $48,%eax - jmp .L079cbc_dec_tail_collected + jmp .L108cbc_dec_tail_collected .align 16 -.L083cbc_dec_four: +.L107cbc_dec_four: call _aesni_decrypt4 movups 16(%esi),%xmm1 movups 32(%esi),%xmm0 @@ -1920,28 +2835,44 @@ aesni_cbc_encrypt: movups %xmm2,(%edi) xorps %xmm1,%xmm4 movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 xorps %xmm0,%xmm5 movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 leal 48(%edi),%edi movaps %xmm5,%xmm2 + pxor %xmm5,%xmm5 subl $64,%eax -.L079cbc_dec_tail_collected: + jmp .L108cbc_dec_tail_collected +.align 16 +.L103cbc_dec_clear_tail_collected: + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 +.L108cbc_dec_tail_collected: andl $15,%eax - jnz .L085cbc_dec_tail_partial + jnz .L110cbc_dec_tail_partial movups %xmm2,(%edi) - jmp .L075cbc_ret + pxor %xmm0,%xmm0 + jmp .L099cbc_ret .align 16 -.L085cbc_dec_tail_partial: +.L110cbc_dec_tail_partial: movaps %xmm2,(%esp) + pxor %xmm0,%xmm0 movl $16,%ecx movl %esp,%esi subl %eax,%ecx .long 2767451785 -.L075cbc_ret: + movdqa %xmm2,(%esp) +.L099cbc_ret: movl 16(%esp),%esp movl 36(%esp),%ebp + pxor 
%xmm2,%xmm2 + pxor %xmm1,%xmm1 movups %xmm7,(%ebp) -.L070cbc_abort: + pxor %xmm7,%xmm7 +.L094cbc_abort: popl %edi popl %esi popl %ebx @@ -1951,52 +2882,62 @@ aesni_cbc_encrypt: .type _aesni_set_encrypt_key,@function .align 16 _aesni_set_encrypt_key: + pushl %ebp + pushl %ebx testl %eax,%eax - jz .L086bad_pointer + jz .L111bad_pointer testl %edx,%edx - jz .L086bad_pointer + jz .L111bad_pointer + call .L112pic +.L112pic: + popl %ebx + leal .Lkey_const-.L112pic(%ebx),%ebx + leal _gnutls_x86_cpuid_s,%ebp movups (%eax),%xmm0 xorps %xmm4,%xmm4 + movl 4(%ebp),%ebp leal 16(%edx),%edx + andl $268437504,%ebp cmpl $256,%ecx - je .L08714rounds + je .L11314rounds cmpl $192,%ecx - je .L08812rounds + je .L11412rounds cmpl $128,%ecx - jne .L089bad_keybits + jne .L115bad_keybits .align 16 -.L09010rounds: +.L11610rounds: + cmpl $268435456,%ebp + je .L11710rounds_alt movl $9,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,200,1 - call .L091key_128_cold + call .L118key_128_cold .byte 102,15,58,223,200,2 - call .L092key_128 + call .L119key_128 .byte 102,15,58,223,200,4 - call .L092key_128 + call .L119key_128 .byte 102,15,58,223,200,8 - call .L092key_128 + call .L119key_128 .byte 102,15,58,223,200,16 - call .L092key_128 + call .L119key_128 .byte 102,15,58,223,200,32 - call .L092key_128 + call .L119key_128 .byte 102,15,58,223,200,64 - call .L092key_128 + call .L119key_128 .byte 102,15,58,223,200,128 - call .L092key_128 + call .L119key_128 .byte 102,15,58,223,200,27 - call .L092key_128 + call .L119key_128 .byte 102,15,58,223,200,54 - call .L092key_128 + call .L119key_128 movups %xmm0,(%edx) movl %ecx,80(%edx) - xorl %eax,%eax - ret + jmp .L120good_key .align 16 -.L092key_128: +.L119key_128: movups %xmm0,(%edx) leal 16(%edx),%edx -.L091key_128_cold: +.L118key_128_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2005,38 +2946,91 @@ _aesni_set_encrypt_key: xorps %xmm1,%xmm0 ret .align 16 -.L08812rounds: +.L11710rounds_alt: + movdqa (%ebx),%xmm5 + movl $8,%ecx + movdqa 32(%ebx),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,-16(%edx) +.L121loop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leal 16(%edx),%edx + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%edx) + movdqa %xmm0,%xmm2 + decl %ecx + jnz .L121loop_key128 + movdqa 48(%ebx),%xmm4 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%edx) + movl $9,%ecx + movl %ecx,96(%edx) + jmp .L120good_key +.align 16 +.L11412rounds: movq 16(%eax),%xmm2 + cmpl $268435456,%ebp + je .L12212rounds_alt movl $11,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,202,1 - call .L093key_192a_cold + call .L123key_192a_cold .byte 102,15,58,223,202,2 - call .L094key_192b + call .L124key_192b .byte 102,15,58,223,202,4 - call .L095key_192a + call .L125key_192a .byte 102,15,58,223,202,8 - call .L094key_192b + call .L124key_192b .byte 102,15,58,223,202,16 - call .L095key_192a + call .L125key_192a .byte 102,15,58,223,202,32 - call .L094key_192b + call .L124key_192b .byte 102,15,58,223,202,64 - call 
.L095key_192a + call .L125key_192a .byte 102,15,58,223,202,128 - call .L094key_192b + call .L124key_192b movups %xmm0,(%edx) movl %ecx,48(%edx) - xorl %eax,%eax - ret + jmp .L120good_key .align 16 -.L095key_192a: +.L125key_192a: movups %xmm0,(%edx) leal 16(%edx),%edx .align 16 -.L093key_192a_cold: +.L123key_192a_cold: movaps %xmm2,%xmm5 -.L096key_192b_warm: +.L126key_192b_warm: shufps $16,%xmm0,%xmm4 movdqa %xmm2,%xmm3 xorps %xmm4,%xmm0 @@ -2050,56 +3044,90 @@ _aesni_set_encrypt_key: pxor %xmm3,%xmm2 ret .align 16 -.L094key_192b: +.L124key_192b: movaps %xmm0,%xmm3 shufps $68,%xmm0,%xmm5 movups %xmm5,(%edx) shufps $78,%xmm2,%xmm3 movups %xmm3,16(%edx) leal 32(%edx),%edx - jmp .L096key_192b_warm + jmp .L126key_192b_warm +.align 16 +.L12212rounds_alt: + movdqa 16(%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $8,%ecx + movdqu %xmm0,-16(%edx) +.L127loop_key192: + movq %xmm2,(%edx) + movdqa %xmm2,%xmm1 +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + pslld $1,%xmm4 + leal 24(%edx),%edx + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm2 + movdqu %xmm0,-16(%edx) + decl %ecx + jnz .L127loop_key192 + movl $11,%ecx + movl %ecx,32(%edx) + jmp .L120good_key .align 16 -.L08714rounds: +.L11314rounds: movups 16(%eax),%xmm2 - movl $13,%ecx leal 16(%edx),%edx + cmpl $268435456,%ebp + je .L12814rounds_alt + movl $13,%ecx movups %xmm0,-32(%edx) movups %xmm2,-16(%edx) .byte 102,15,58,223,202,1 - call .L097key_256a_cold + call .L129key_256a_cold .byte 102,15,58,223,200,1 - call .L098key_256b + call .L130key_256b .byte 102,15,58,223,202,2 - call .L099key_256a + call .L131key_256a .byte 102,15,58,223,200,2 - call .L098key_256b + call .L130key_256b .byte 102,15,58,223,202,4 - call .L099key_256a + call .L131key_256a .byte 102,15,58,223,200,4 - call .L098key_256b + call .L130key_256b .byte 102,15,58,223,202,8 - call .L099key_256a + call .L131key_256a .byte 102,15,58,223,200,8 - call .L098key_256b + call .L130key_256b .byte 102,15,58,223,202,16 - call .L099key_256a + call .L131key_256a .byte 102,15,58,223,200,16 - call .L098key_256b + call .L130key_256b .byte 102,15,58,223,202,32 - call .L099key_256a + call .L131key_256a .byte 102,15,58,223,200,32 - call .L098key_256b + call .L130key_256b .byte 102,15,58,223,202,64 - call .L099key_256a + call .L131key_256a movups %xmm0,(%edx) movl %ecx,16(%edx) xorl %eax,%eax - ret + jmp .L120good_key .align 16 -.L099key_256a: +.L131key_256a: movups %xmm2,(%edx) leal 16(%edx),%edx -.L097key_256a_cold: +.L129key_256a_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2108,7 +3136,7 @@ _aesni_set_encrypt_key: xorps %xmm1,%xmm0 ret .align 16 -.L098key_256b: +.L130key_256b: movups %xmm0,(%edx) leal 16(%edx),%edx shufps $16,%xmm2,%xmm4 @@ -2118,13 +3146,70 @@ _aesni_set_encrypt_key: shufps $170,%xmm1,%xmm1 xorps %xmm1,%xmm2 ret +.align 16 +.L12814rounds_alt: + movdqa (%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $7,%ecx + movdqu %xmm0,-32(%edx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,-16(%edx) +.L132loop_key256: +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + decl %ecx + jz .L133done_key256 + pshufd $255,%xmm0,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,221,211 + movdqa %xmm1,%xmm3 + 
pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%edx) + leal 32(%edx),%edx + movdqa %xmm2,%xmm1 + jmp .L132loop_key256 +.L133done_key256: + movl $13,%ecx + movl %ecx,16(%edx) +.L120good_key: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + xorl %eax,%eax + popl %ebx + popl %ebp + ret .align 4 -.L086bad_pointer: +.L111bad_pointer: movl $-1,%eax + popl %ebx + popl %ebp ret .align 4 -.L089bad_keybits: +.L115bad_keybits: + pxor %xmm0,%xmm0 movl $-2,%eax + popl %ebx + popl %ebp ret .size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key .globl aesni_set_encrypt_key @@ -2150,7 +3235,7 @@ aesni_set_decrypt_key: movl 12(%esp),%edx shll $4,%ecx testl %eax,%eax - jnz .L100dec_key_ret + jnz .L134dec_key_ret leal 16(%edx,%ecx,1),%eax movups (%edx),%xmm0 movups (%eax),%xmm1 @@ -2158,7 +3243,7 @@ aesni_set_decrypt_key: movups %xmm1,(%edx) leal 16(%edx),%edx leal -16(%eax),%eax -.L101dec_key_inverse: +.L135dec_key_inverse: movups (%edx),%xmm0 movups (%eax),%xmm1 .byte 102,15,56,219,192 @@ -2168,20 +3253,26 @@ aesni_set_decrypt_key: movups %xmm0,16(%eax) movups %xmm1,-16(%edx) cmpl %edx,%eax - ja .L101dec_key_inverse + ja .L135dec_key_inverse movups (%edx),%xmm0 .byte 102,15,56,219,192 movups %xmm0,(%edx) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 xorl %eax,%eax -.L100dec_key_ret: +.L134dec_key_ret: ret .size aesni_set_decrypt_key,.-.L_aesni_set_decrypt_key_begin +.align 64 +.Lkey_const: +.long 202313229,202313229,202313229,202313229 +.long 67569157,67569157,67569157,67569157 +.long 1,1,1,1 +.long 27,27,27,27 .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 .byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 .byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 .byte 115,108,46,111,114,103,62,0 - +.comm _gnutls_x86_cpuid_s,16,4 .section .note.GNU-stack,"",%progbits - - diff --git a/lib/accelerated/x86/elf/aesni-x86_64.s b/lib/accelerated/x86/elf/aesni-x86_64.s index 76d44fc2a8..43cf4e68de 100644 --- a/lib/accelerated/x86/elf/aesni-x86_64.s +++ b/lib/accelerated/x86/elf/aesni-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -43,6 +43,7 @@ .type aesni_encrypt,@function .align 16 aesni_encrypt: +.cfi_startproc movups (%rdi),%xmm2 movl 240(%rdx),%eax movups (%rdx),%xmm0 @@ -61,12 +62,14 @@ aesni_encrypt: movups %xmm2,(%rsi) pxor %xmm2,%xmm2 .byte 0xf3,0xc3 +.cfi_endproc .size aesni_encrypt,.-aesni_encrypt .globl aesni_decrypt .type aesni_decrypt,@function .align 16 aesni_decrypt: +.cfi_startproc movups (%rdi),%xmm2 movl 240(%rdx),%eax movups (%rdx),%xmm0 @@ -85,10 +88,12 @@ aesni_decrypt: movups %xmm2,(%rsi) pxor %xmm2,%xmm2 .byte 0xf3,0xc3 +.cfi_endproc .size aesni_decrypt, .-aesni_decrypt .type _aesni_encrypt2,@function .align 16 _aesni_encrypt2: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -114,10 +119,12 @@ _aesni_encrypt2: .byte 102,15,56,221,208 .byte 102,15,56,221,216 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_encrypt2,.-_aesni_encrypt2 .type _aesni_decrypt2,@function .align 16 _aesni_decrypt2: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -143,10 +150,12 @@ _aesni_decrypt2: .byte 102,15,56,223,208 .byte 102,15,56,223,216 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_decrypt2,.-_aesni_decrypt2 .type _aesni_encrypt3,@function .align 16 _aesni_encrypt3: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -177,10 +186,12 @@ _aesni_encrypt3: .byte 102,15,56,221,216 .byte 102,15,56,221,224 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_encrypt3,.-_aesni_encrypt3 .type _aesni_decrypt3,@function .align 16 _aesni_decrypt3: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -211,10 +222,12 @@ _aesni_decrypt3: .byte 102,15,56,223,216 .byte 102,15,56,223,224 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_decrypt3,.-_aesni_decrypt3 .type _aesni_encrypt4,@function .align 16 _aesni_encrypt4: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -251,10 +264,12 @@ _aesni_encrypt4: .byte 102,15,56,221,224 .byte 102,15,56,221,232 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_encrypt4,.-_aesni_encrypt4 .type _aesni_decrypt4,@function .align 16 _aesni_decrypt4: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -291,10 +306,12 @@ _aesni_decrypt4: .byte 102,15,56,223,224 .byte 102,15,56,223,232 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_decrypt4,.-_aesni_decrypt4 .type _aesni_encrypt6,@function .align 16 _aesni_encrypt6: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -345,10 +362,12 @@ _aesni_encrypt6: .byte 102,15,56,221,240 .byte 102,15,56,221,248 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_encrypt6,.-_aesni_encrypt6 .type _aesni_decrypt6,@function .align 16 _aesni_decrypt6: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -399,10 +418,12 @@ _aesni_decrypt6: .byte 102,15,56,223,240 .byte 102,15,56,223,248 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_decrypt6,.-_aesni_decrypt6 .type _aesni_encrypt8,@function .align 16 _aesni_encrypt8: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -463,10 +484,12 @@ _aesni_encrypt8: .byte 102,68,15,56,221,192 .byte 102,68,15,56,221,200 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_encrypt8,.-_aesni_encrypt8 .type _aesni_decrypt8,@function .align 16 _aesni_decrypt8: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -527,11 +550,13 @@ _aesni_decrypt8: .byte 102,68,15,56,223,192 .byte 102,68,15,56,223,200 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_decrypt8,.-_aesni_decrypt8 .globl 
aesni_ecb_encrypt .type aesni_ecb_encrypt,@function .align 16 aesni_ecb_encrypt: +.cfi_startproc andq $-16,%rdx jz .Lecb_ret @@ -869,6 +894,7 @@ aesni_ecb_encrypt: xorps %xmm0,%xmm0 pxor %xmm1,%xmm1 .byte 0xf3,0xc3 +.cfi_endproc .size aesni_ecb_encrypt,.-aesni_ecb_encrypt .globl aesni_ccm64_encrypt_blocks .type aesni_ccm64_encrypt_blocks,@function @@ -1034,6 +1060,7 @@ aesni_ccm64_decrypt_blocks: .type aesni_ctr32_encrypt_blocks,@function .align 16 aesni_ctr32_encrypt_blocks: +.cfi_startproc cmpq $1,%rdx jne .Lctr32_bulk @@ -1063,11 +1090,12 @@ aesni_ctr32_encrypt_blocks: .align 16 .Lctr32_bulk: - leaq (%rsp),%rax + leaq (%rsp),%r11 +.cfi_def_cfa_register %r11 pushq %rbp +.cfi_offset %rbp,-16 subq $128,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp @@ -1076,7 +1104,7 @@ aesni_ctr32_encrypt_blocks: movdqu (%rcx),%xmm0 movl 12(%r8),%r8d pxor %xmm0,%xmm2 - movl 12(%rcx),%r11d + movl 12(%rcx),%ebp movdqa %xmm2,0(%rsp) bswapl %r8d movdqa %xmm2,%xmm3 @@ -1092,8 +1120,8 @@ aesni_ctr32_encrypt_blocks: leaq 2(%r8),%rdx bswapl %eax bswapl %edx - xorl %r11d,%eax - xorl %r11d,%edx + xorl %ebp,%eax + xorl %ebp,%edx .byte 102,15,58,34,216,3 leaq 3(%r8),%rax movdqa %xmm3,16(%rsp) @@ -1102,25 +1130,25 @@ aesni_ctr32_encrypt_blocks: movq %r10,%rdx leaq 4(%r8),%r10 movdqa %xmm4,32(%rsp) - xorl %r11d,%eax + xorl %ebp,%eax bswapl %r10d .byte 102,15,58,34,232,3 - xorl %r11d,%r10d + xorl %ebp,%r10d movdqa %xmm5,48(%rsp) leaq 5(%r8),%r9 movl %r10d,64+12(%rsp) bswapl %r9d leaq 6(%r8),%r10 movl 240(%rcx),%eax - xorl %r11d,%r9d + xorl %ebp,%r9d bswapl %r10d movl %r9d,80+12(%rsp) - xorl %r11d,%r10d + xorl %ebp,%r10d leaq 7(%r8),%r9 movl %r10d,96+12(%rsp) bswapl %r9d movl _gnutls_x86_cpuid_s+4(%rip),%r10d - xorl %r11d,%r9d + xorl %ebp,%r9d andl $71303168,%r10d movl %r9d,112+12(%rsp) @@ -1144,7 +1172,7 @@ aesni_ctr32_encrypt_blocks: .Lctr32_6x: shll $4,%eax movl $48,%r10d - bswapl %r11d + bswapl %ebp leaq 32(%rcx,%rax,1),%rcx subq %rax,%r10 jmp .Lctr32_loop6 @@ -1155,32 +1183,32 @@ aesni_ctr32_encrypt_blocks: movups -48(%rcx,%r10,1),%xmm0 .byte 102,15,56,220,209 movl %r8d,%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,217 .byte 0x0f,0x38,0xf1,0x44,0x24,12 leal 1(%r8),%eax .byte 102,15,56,220,225 - xorl %r11d,%eax + xorl %ebp,%eax .byte 0x0f,0x38,0xf1,0x44,0x24,28 .byte 102,15,56,220,233 leal 2(%r8),%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,241 .byte 0x0f,0x38,0xf1,0x44,0x24,44 leal 3(%r8),%eax .byte 102,15,56,220,249 movups -32(%rcx,%r10,1),%xmm1 - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,208 .byte 0x0f,0x38,0xf1,0x44,0x24,60 leal 4(%r8),%eax .byte 102,15,56,220,216 - xorl %r11d,%eax + xorl %ebp,%eax .byte 0x0f,0x38,0xf1,0x44,0x24,76 .byte 102,15,56,220,224 leal 5(%r8),%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,232 .byte 0x0f,0x38,0xf1,0x44,0x24,92 movq %r10,%rax @@ -1241,7 +1269,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d movups 32-128(%rcx),%xmm0 .byte 102,15,56,220,225 - xorl %r11d,%r9d + xorl %ebp,%r9d nop .byte 102,15,56,220,233 movl %r9d,0+12(%rsp) @@ -1254,7 +1282,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1268,7 +1296,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1282,7 +1310,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,208 .byte 
102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1296,7 +1324,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1310,7 +1338,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1324,7 +1352,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1339,7 +1367,7 @@ aesni_ctr32_encrypt_blocks: .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 - xorl %r11d,%r9d + xorl %ebp,%r9d movdqu 0(%rdi),%xmm10 .byte 102,15,56,220,232 movl %r9d,112+12(%rsp) @@ -1574,7 +1602,7 @@ aesni_ctr32_encrypt_blocks: .Lctr32_done: xorps %xmm0,%xmm0 - xorl %r11d,%r11d + xorl %ebp,%ebp pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 @@ -1598,20 +1626,25 @@ aesni_ctr32_encrypt_blocks: pxor %xmm14,%xmm14 movaps %xmm0,112(%rsp) pxor %xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp +.cfi_restore %rbp + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp .Lctr32_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks .globl aesni_xts_encrypt .type aesni_xts_encrypt,@function .align 16 aesni_xts_encrypt: - leaq (%rsp),%rax +.cfi_startproc + leaq (%rsp),%r11 +.cfi_def_cfa_register %r11 pushq %rbp +.cfi_offset %rbp,-16 subq $112,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -1627,7 +1660,7 @@ aesni_xts_encrypt: jnz .Loop_enc1_8 .byte 102,15,56,221,209 movups (%rcx),%xmm0 - movq %rcx,%r11 + movq %rcx,%rbp movl %r10d,%eax shll $4,%r10d movq %rdx,%r9 @@ -1683,9 +1716,9 @@ aesni_xts_encrypt: jc .Lxts_enc_short movl $16+96,%eax - leaq 32(%r11,%r10,1),%rcx + leaq 32(%rbp,%r10,1),%rcx subq %r10,%rax - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 movq %rax,%r10 leaq .Lxts_magic(%rip),%r8 jmp .Lxts_enc_grandloop @@ -1710,7 +1743,7 @@ aesni_xts_encrypt: movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 .byte 102,15,56,220,233 - movups 32(%r11),%xmm0 + movups 32(%rbp),%xmm0 leaq 96(%rdi),%rdi pxor %xmm8,%xmm7 @@ -1719,7 +1752,7 @@ aesni_xts_encrypt: pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) .byte 102,15,56,220,249 - movups 48(%r11),%xmm1 + movups 48(%rbp),%xmm1 pxor %xmm9,%xmm12 .byte 102,15,56,220,208 @@ -1734,7 +1767,7 @@ aesni_xts_encrypt: movdqa %xmm14,64(%rsp) .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups 64(%r11),%xmm0 + movups 64(%rbp),%xmm0 movdqa %xmm8,80(%rsp) pshufd $0x5f,%xmm15,%xmm9 jmp .Lxts_enc_loop6 @@ -1766,7 +1799,7 @@ aesni_xts_encrypt: psrad $31,%xmm14 .byte 102,15,56,220,217 pand %xmm8,%xmm14 - movups (%r11),%xmm10 + movups (%rbp),%xmm10 .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 @@ -1834,10 +1867,10 @@ aesni_xts_encrypt: .byte 102,15,56,220,225 .byte 102,15,56,220,233 pxor %xmm0,%xmm15 - movups (%r11),%xmm0 + movups (%rbp),%xmm0 .byte 102,15,56,220,241 .byte 102,15,56,220,249 - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 pxor %xmm15,%xmm14 .byte 102,15,56,221,84,36,0 @@ -1864,7 +1897,7 @@ aesni_xts_encrypt: movl $16+96,%eax subl %r10d,%eax - movq %r11,%rcx + movq %rbp,%rcx shrl $4,%eax .Lxts_enc_short: @@ -2020,7 +2053,7 @@ aesni_xts_encrypt: jnz 
.Lxts_enc_steal subq %r9,%rsi - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups -16(%rsi),%xmm2 @@ -2063,20 +2096,25 @@ aesni_xts_encrypt: movaps %xmm0,96(%rsp) pxor %xmm14,%xmm14 pxor %xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp +.cfi_restore %rbp + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp .Lxts_enc_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size aesni_xts_encrypt,.-aesni_xts_encrypt .globl aesni_xts_decrypt .type aesni_xts_decrypt,@function .align 16 aesni_xts_decrypt: - leaq (%rsp),%rax +.cfi_startproc + leaq (%rsp),%r11 +.cfi_def_cfa_register %r11 pushq %rbp +.cfi_offset %rbp,-16 subq $112,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -2098,7 +2136,7 @@ aesni_xts_decrypt: subq %rax,%rdx movups (%rcx),%xmm0 - movq %rcx,%r11 + movq %rcx,%rbp movl %r10d,%eax shll $4,%r10d movq %rdx,%r9 @@ -2154,9 +2192,9 @@ aesni_xts_decrypt: jc .Lxts_dec_short movl $16+96,%eax - leaq 32(%r11,%r10,1),%rcx + leaq 32(%rbp,%r10,1),%rcx subq %r10,%rax - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 movq %rax,%r10 leaq .Lxts_magic(%rip),%r8 jmp .Lxts_dec_grandloop @@ -2181,7 +2219,7 @@ aesni_xts_decrypt: movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 .byte 102,15,56,222,233 - movups 32(%r11),%xmm0 + movups 32(%rbp),%xmm0 leaq 96(%rdi),%rdi pxor %xmm8,%xmm7 @@ -2190,7 +2228,7 @@ aesni_xts_decrypt: pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) .byte 102,15,56,222,249 - movups 48(%r11),%xmm1 + movups 48(%rbp),%xmm1 pxor %xmm9,%xmm12 .byte 102,15,56,222,208 @@ -2205,7 +2243,7 @@ aesni_xts_decrypt: movdqa %xmm14,64(%rsp) .byte 102,15,56,222,240 .byte 102,15,56,222,248 - movups 64(%r11),%xmm0 + movups 64(%rbp),%xmm0 movdqa %xmm8,80(%rsp) pshufd $0x5f,%xmm15,%xmm9 jmp .Lxts_dec_loop6 @@ -2237,7 +2275,7 @@ aesni_xts_decrypt: psrad $31,%xmm14 .byte 102,15,56,222,217 pand %xmm8,%xmm14 - movups (%r11),%xmm10 + movups (%rbp),%xmm10 .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 @@ -2305,10 +2343,10 @@ aesni_xts_decrypt: .byte 102,15,56,222,225 .byte 102,15,56,222,233 pxor %xmm0,%xmm15 - movups (%r11),%xmm0 + movups (%rbp),%xmm0 .byte 102,15,56,222,241 .byte 102,15,56,222,249 - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 pxor %xmm15,%xmm14 .byte 102,15,56,223,84,36,0 @@ -2335,7 +2373,7 @@ aesni_xts_decrypt: movl $16+96,%eax subl %r10d,%eax - movq %r11,%rcx + movq %rbp,%rcx shrl $4,%eax .Lxts_dec_short: @@ -2492,7 +2530,7 @@ aesni_xts_decrypt: jz .Lxts_dec_ret .Lxts_dec_done2: movq %r9,%rdx - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups (%rdi),%xmm2 @@ -2522,7 +2560,7 @@ aesni_xts_decrypt: jnz .Lxts_dec_steal subq %r9,%rsi - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups (%rsi),%xmm2 @@ -2565,221 +2603,1079 @@ aesni_xts_decrypt: movaps %xmm0,96(%rsp) pxor %xmm14,%xmm14 pxor %xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp +.cfi_restore %rbp + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp .Lxts_dec_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size aesni_xts_decrypt,.-aesni_xts_decrypt -.globl aesni_cbc_encrypt -.type aesni_cbc_encrypt,@function -.align 16 -aesni_cbc_encrypt: - testq %rdx,%rdx - jz .Lcbc_ret +.globl aesni_ocb_encrypt +.type aesni_ocb_encrypt,@function +.align 32 +aesni_ocb_encrypt: +.cfi_startproc + leaq (%rsp),%rax + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + movq 8(%rax),%rbx + movq 8+8(%rax),%rbp movl 240(%rcx),%r10d movq %rcx,%r11 - testl %r9d,%r9d - jz .Lcbc_decrypt + shll $4,%r10d + movups (%rcx),%xmm9 + movups 16(%rcx,%r10,1),%xmm1 - movups (%r8),%xmm2 - movl %r10d,%eax - cmpq $16,%rdx - jb .Lcbc_enc_tail - subq $16,%rdx - jmp .Lcbc_enc_loop -.align 16 -.Lcbc_enc_loop: - movups (%rdi),%xmm3 - leaq 16(%rdi),%rdi + movdqu (%r9),%xmm15 + pxor %xmm1,%xmm9 + pxor %xmm1,%xmm15 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm3 - leaq 32(%rcx),%rcx - xorps %xmm3,%xmm2 -.Loop_enc1_15: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_enc1_15 -.byte 102,15,56,221,209 - movl %r10d,%eax - movq %r11,%rcx - movups %xmm2,0(%rsi) - leaq 16(%rsi),%rsi - subq $16,%rdx - jnc .Lcbc_enc_loop - addq $16,%rdx - jnz .Lcbc_enc_tail - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movups %xmm2,(%r8) - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - jmp .Lcbc_ret + movl $16+32,%eax + leaq 32(%r11,%r10,1),%rcx + movups 16(%r11),%xmm1 + subq %r10,%rax + movq %rax,%r10 -.Lcbc_enc_tail: - movq %rdx,%rcx - xchgq %rdi,%rsi -.long 0x9066A4F3 - movl $16,%ecx - subq %rdx,%rcx - xorl %eax,%eax -.long 0x9066AAF3 - leaq -16(%rdi),%rdi - movl %r10d,%eax - movq %rdi,%rsi - movq %r11,%rcx - xorq %rdx,%rdx - jmp .Lcbc_enc_loop + movdqu (%rbx),%xmm10 + movdqu (%rbp),%xmm8 -.align 16 -.Lcbc_decrypt: - cmpq $16,%rdx - jne .Lcbc_decrypt_bulk + testq $1,%r8 + jnz .Locb_enc_odd + bsfq %r8,%r12 + addq $1,%r8 + shlq $4,%r12 + movdqu (%rbx,%r12,1),%xmm7 + movdqu (%rdi),%xmm2 + leaq 16(%rdi),%rdi + call __ocb_encrypt1 - movdqu (%rdi),%xmm2 - movdqu (%r8),%xmm3 - movdqa %xmm2,%xmm4 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_dec1_16: -.byte 102,15,56,222,209 - decl %r10d - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_dec1_16 -.byte 102,15,56,223,209 - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movdqu %xmm4,(%r8) - xorps %xmm3,%xmm2 - pxor %xmm3,%xmm3 + movdqa %xmm7,%xmm15 movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - jmp .Lcbc_ret -.align 16 -.Lcbc_decrypt_bulk: - leaq (%rsp),%rax - pushq %rbp - subq $16,%rsp - andq $-16,%rsp - leaq -8(%rax),%rbp - movups (%r8),%xmm10 - movl %r10d,%eax - cmpq $0x50,%rdx - jbe .Lcbc_dec_tail + leaq 16(%rsi),%rsi + subq $1,%rdx + jz .Locb_enc_done + +.Locb_enc_odd: + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + leaq 6(%r8),%r8 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + shlq $4,%r12 + shlq $4,%r13 + shlq $4,%r14 - movups (%rcx),%xmm0 + subq $6,%rdx + jc .Locb_enc_short + jmp .Locb_enc_grandloop + +.align 32 +.Locb_enc_grandloop: movdqu 0(%rdi),%xmm2 movdqu 16(%rdi),%xmm3 - movdqa %xmm2,%xmm11 movdqu 32(%rdi),%xmm4 - movdqa %xmm3,%xmm12 movdqu 48(%rdi),%xmm5 - movdqa %xmm4,%xmm13 movdqu 64(%rdi),%xmm6 - movdqa %xmm5,%xmm14 movdqu 80(%rdi),%xmm7 - movdqa %xmm6,%xmm15 - movl _gnutls_x86_cpuid_s+4(%rip),%r9d - cmpq $0x70,%rdx - jbe .Lcbc_dec_six_or_seven + leaq 96(%rdi),%rdi + + call __ocb_encrypt6 + + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + movups %xmm7,80(%rsi) + leaq 96(%rsi),%rsi + subq $6,%rdx + jnc .Locb_enc_grandloop + +.Locb_enc_short: + addq $6,%rdx + jz .Locb_enc_done + + movdqu 0(%rdi),%xmm2 + cmpq $2,%rdx + jb .Locb_enc_one + movdqu 16(%rdi),%xmm3 + je .Locb_enc_two + + movdqu 32(%rdi),%xmm4 + cmpq $4,%rdx + jb .Locb_enc_three + movdqu 48(%rdi),%xmm5 + je .Locb_enc_four + + movdqu 64(%rdi),%xmm6 + pxor %xmm7,%xmm7 + + 
call __ocb_encrypt6 + + movdqa %xmm14,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + + jmp .Locb_enc_done - andl $71303168,%r9d - subq $0x50,%rdx - cmpl $4194304,%r9d - je .Lcbc_dec_loop6_enter - subq $0x20,%rdx - leaq 112(%rcx),%rcx - jmp .Lcbc_dec_loop8_enter .align 16 -.Lcbc_dec_loop8: - movups %xmm9,(%rsi) - leaq 16(%rsi),%rsi -.Lcbc_dec_loop8_enter: - movdqu 96(%rdi),%xmm8 - pxor %xmm0,%xmm2 - movdqu 112(%rdi),%xmm9 - pxor %xmm0,%xmm3 - movups 16-112(%rcx),%xmm1 - pxor %xmm0,%xmm4 - xorq %r11,%r11 - cmpq $0x70,%rdx - pxor %xmm0,%xmm5 - pxor %xmm0,%xmm6 - pxor %xmm0,%xmm7 - pxor %xmm0,%xmm8 +.Locb_enc_one: + movdqa %xmm10,%xmm7 -.byte 102,15,56,222,209 - pxor %xmm0,%xmm9 - movups 32-112(%rcx),%xmm0 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 - setnc %r11b - shlq $7,%r11 -.byte 102,68,15,56,222,201 - addq %rdi,%r11 - movups 48-112(%rcx),%xmm1 -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 -.byte 102,68,15,56,222,192 -.byte 102,68,15,56,222,200 - movups 64-112(%rcx),%xmm0 - nop -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 -.byte 102,68,15,56,222,201 - movups 80-112(%rcx),%xmm1 - nop -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 -.byte 102,68,15,56,222,192 -.byte 102,68,15,56,222,200 - movups 96-112(%rcx),%xmm0 - nop -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 -.byte 102,68,15,56,222,201 - movups 112-112(%rcx),%xmm1 - nop -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 -.byte 102,68,15,56,222,192 -.byte 102,68,15,56,222,200 - movups 128-112(%rcx),%xmm0 + call __ocb_encrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,0(%rsi) + jmp .Locb_enc_done + +.align 16 +.Locb_enc_two: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + call __ocb_encrypt4 + + movdqa %xmm11,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + + jmp .Locb_enc_done + +.align 16 +.Locb_enc_three: + pxor %xmm5,%xmm5 + + call __ocb_encrypt4 + + movdqa %xmm12,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + + jmp .Locb_enc_done + +.align 16 +.Locb_enc_four: + call __ocb_encrypt4 + + movdqa %xmm13,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + +.Locb_enc_done: + pxor %xmm0,%xmm15 + movdqu %xmm8,(%rbp) + movdqu %xmm15,(%r9) + + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + pxor %xmm10,%xmm10 + pxor %xmm11,%xmm11 + pxor %xmm12,%xmm12 + pxor %xmm13,%xmm13 + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 + leaq 40(%rsp),%rax +.cfi_def_cfa %rax,8 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp 
+.cfi_def_cfa_register %rsp +.Locb_enc_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_ocb_encrypt,.-aesni_ocb_encrypt + +.type __ocb_encrypt6,@function +.align 32 +__ocb_encrypt6: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + movdqa %xmm10,%xmm14 + pxor %xmm15,%xmm10 + movdqu (%rbx,%r14,1),%xmm15 + pxor %xmm10,%xmm11 + pxor %xmm2,%xmm8 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm3,%xmm8 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm4,%xmm8 + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm14 + pxor %xmm5,%xmm8 + pxor %xmm13,%xmm5 + pxor %xmm14,%xmm15 + pxor %xmm6,%xmm8 + pxor %xmm14,%xmm6 + pxor %xmm7,%xmm8 + pxor %xmm15,%xmm7 + movups 32(%r11),%xmm0 + + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + addq $6,%r8 + pxor %xmm9,%xmm10 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 +.byte 102,15,56,220,241 + pxor %xmm9,%xmm13 + pxor %xmm9,%xmm14 +.byte 102,15,56,220,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm15 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups 64(%r11),%xmm0 + shlq $4,%r12 + shlq $4,%r13 + jmp .Locb_enc_loop6 + +.align 32 +.Locb_enc_loop6: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_enc_loop6 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups 16(%r11),%xmm1 + shlq $4,%r14 + +.byte 102,65,15,56,221,210 + movdqu (%rbx),%xmm10 + movq %r10,%rax +.byte 102,65,15,56,221,219 +.byte 102,65,15,56,221,228 +.byte 102,65,15,56,221,237 +.byte 102,65,15,56,221,246 +.byte 102,65,15,56,221,255 + .byte 0xf3,0xc3 +.size __ocb_encrypt6,.-__ocb_encrypt6 + +.type __ocb_encrypt4,@function +.align 32 +__ocb_encrypt4: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + pxor %xmm15,%xmm10 + pxor %xmm10,%xmm11 + pxor %xmm2,%xmm8 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm3,%xmm8 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm4,%xmm8 + pxor %xmm12,%xmm4 + pxor %xmm5,%xmm8 + pxor %xmm13,%xmm5 + movups 32(%r11),%xmm0 + + pxor %xmm9,%xmm10 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 + pxor %xmm9,%xmm13 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 48(%r11),%xmm1 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups 64(%r11),%xmm0 + jmp .Locb_enc_loop4 + +.align 32 +.Locb_enc_loop4: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_enc_loop4 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 
102,65,15,56,221,210 +.byte 102,65,15,56,221,219 +.byte 102,65,15,56,221,228 +.byte 102,65,15,56,221,237 + .byte 0xf3,0xc3 +.size __ocb_encrypt4,.-__ocb_encrypt4 + +.type __ocb_encrypt1,@function +.align 32 +__ocb_encrypt1: + pxor %xmm15,%xmm7 + pxor %xmm9,%xmm7 + pxor %xmm2,%xmm8 + pxor %xmm7,%xmm2 + movups 32(%r11),%xmm0 + +.byte 102,15,56,220,209 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm7 + +.byte 102,15,56,220,208 + movups 64(%r11),%xmm0 + jmp .Locb_enc_loop1 + +.align 32 +.Locb_enc_loop1: +.byte 102,15,56,220,209 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,220,208 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_enc_loop1 + +.byte 102,15,56,220,209 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,15,56,221,215 + .byte 0xf3,0xc3 +.size __ocb_encrypt1,.-__ocb_encrypt1 + +.globl aesni_ocb_decrypt +.type aesni_ocb_decrypt,@function +.align 32 +aesni_ocb_decrypt: +.cfi_startproc + leaq (%rsp),%rax + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + movq 8(%rax),%rbx + movq 8+8(%rax),%rbp + + movl 240(%rcx),%r10d + movq %rcx,%r11 + shll $4,%r10d + movups (%rcx),%xmm9 + movups 16(%rcx,%r10,1),%xmm1 + + movdqu (%r9),%xmm15 + pxor %xmm1,%xmm9 + pxor %xmm1,%xmm15 + + movl $16+32,%eax + leaq 32(%r11,%r10,1),%rcx + movups 16(%r11),%xmm1 + subq %r10,%rax + movq %rax,%r10 + + movdqu (%rbx),%xmm10 + movdqu (%rbp),%xmm8 + + testq $1,%r8 + jnz .Locb_dec_odd + + bsfq %r8,%r12 + addq $1,%r8 + shlq $4,%r12 + movdqu (%rbx,%r12,1),%xmm7 + movdqu (%rdi),%xmm2 + leaq 16(%rdi),%rdi + + call __ocb_decrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,(%rsi) + xorps %xmm2,%xmm8 + leaq 16(%rsi),%rsi + subq $1,%rdx + jz .Locb_dec_done + +.Locb_dec_odd: + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + leaq 6(%r8),%r8 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + shlq $4,%r12 + shlq $4,%r13 + shlq $4,%r14 + + subq $6,%rdx + jc .Locb_dec_short + jmp .Locb_dec_grandloop + +.align 32 +.Locb_dec_grandloop: + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqu 32(%rdi),%xmm4 + movdqu 48(%rdi),%xmm5 + movdqu 64(%rdi),%xmm6 + movdqu 80(%rdi),%xmm7 + leaq 96(%rdi),%rdi + + call __ocb_decrypt6 + + movups %xmm2,0(%rsi) + pxor %xmm2,%xmm8 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm8 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm8 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm8 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm8 + movups %xmm7,80(%rsi) + pxor %xmm7,%xmm8 + leaq 96(%rsi),%rsi + subq $6,%rdx + jnc .Locb_dec_grandloop + +.Locb_dec_short: + addq $6,%rdx + jz .Locb_dec_done + + movdqu 0(%rdi),%xmm2 + cmpq $2,%rdx + jb .Locb_dec_one + movdqu 16(%rdi),%xmm3 + je .Locb_dec_two + + movdqu 32(%rdi),%xmm4 + cmpq $4,%rdx + jb .Locb_dec_three + movdqu 48(%rdi),%xmm5 + je .Locb_dec_four + + movdqu 64(%rdi),%xmm6 + pxor %xmm7,%xmm7 + + call __ocb_decrypt6 + + movdqa %xmm14,%xmm15 + movups %xmm2,0(%rsi) + pxor %xmm2,%xmm8 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm8 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm8 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm8 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm8 + + jmp .Locb_dec_done + +.align 16 +.Locb_dec_one: + movdqa %xmm10,%xmm7 + + call __ocb_decrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,0(%rsi) + xorps %xmm2,%xmm8 + jmp .Locb_dec_done + +.align 16 +.Locb_dec_two: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + call __ocb_decrypt4 + 
+ movdqa %xmm11,%xmm15 + movups %xmm2,0(%rsi) + xorps %xmm2,%xmm8 + movups %xmm3,16(%rsi) + xorps %xmm3,%xmm8 + + jmp .Locb_dec_done + +.align 16 +.Locb_dec_three: + pxor %xmm5,%xmm5 + + call __ocb_decrypt4 + + movdqa %xmm12,%xmm15 + movups %xmm2,0(%rsi) + xorps %xmm2,%xmm8 + movups %xmm3,16(%rsi) + xorps %xmm3,%xmm8 + movups %xmm4,32(%rsi) + xorps %xmm4,%xmm8 + + jmp .Locb_dec_done + +.align 16 +.Locb_dec_four: + call __ocb_decrypt4 + + movdqa %xmm13,%xmm15 + movups %xmm2,0(%rsi) + pxor %xmm2,%xmm8 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm8 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm8 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm8 + +.Locb_dec_done: + pxor %xmm0,%xmm15 + movdqu %xmm8,(%rbp) + movdqu %xmm15,(%r9) + + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + pxor %xmm10,%xmm10 + pxor %xmm11,%xmm11 + pxor %xmm12,%xmm12 + pxor %xmm13,%xmm13 + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 + leaq 40(%rsp),%rax +.cfi_def_cfa %rax,8 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Locb_dec_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_ocb_decrypt,.-aesni_ocb_decrypt + +.type __ocb_decrypt6,@function +.align 32 +__ocb_decrypt6: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + movdqa %xmm10,%xmm14 + pxor %xmm15,%xmm10 + movdqu (%rbx,%r14,1),%xmm15 + pxor %xmm10,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm14 + pxor %xmm13,%xmm5 + pxor %xmm14,%xmm15 + pxor %xmm14,%xmm6 + pxor %xmm15,%xmm7 + movups 32(%r11),%xmm0 + + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + addq $6,%r8 + pxor %xmm9,%xmm10 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 +.byte 102,15,56,222,241 + pxor %xmm9,%xmm13 + pxor %xmm9,%xmm14 +.byte 102,15,56,222,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm15 + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 + movups 64(%r11),%xmm0 + shlq $4,%r12 + shlq $4,%r13 + jmp .Locb_dec_loop6 + +.align 32 +.Locb_dec_loop6: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_dec_loop6 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups 16(%r11),%xmm1 + shlq $4,%r14 + +.byte 102,65,15,56,223,210 + movdqu (%rbx),%xmm10 + movq %r10,%rax +.byte 102,65,15,56,223,219 +.byte 102,65,15,56,223,228 +.byte 102,65,15,56,223,237 +.byte 102,65,15,56,223,246 +.byte 102,65,15,56,223,255 + .byte 0xf3,0xc3 +.size __ocb_decrypt6,.-__ocb_decrypt6 + +.type __ocb_decrypt4,@function +.align 32 +__ocb_decrypt4: + pxor %xmm9,%xmm15 + movdqu 
(%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + pxor %xmm15,%xmm10 + pxor %xmm10,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm5 + movups 32(%r11),%xmm0 + + pxor %xmm9,%xmm10 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 + pxor %xmm9,%xmm13 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups 48(%r11),%xmm1 + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups 64(%r11),%xmm0 + jmp .Locb_dec_loop4 + +.align 32 +.Locb_dec_loop4: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_dec_loop4 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,65,15,56,223,210 +.byte 102,65,15,56,223,219 +.byte 102,65,15,56,223,228 +.byte 102,65,15,56,223,237 + .byte 0xf3,0xc3 +.size __ocb_decrypt4,.-__ocb_decrypt4 + +.type __ocb_decrypt1,@function +.align 32 +__ocb_decrypt1: + pxor %xmm15,%xmm7 + pxor %xmm9,%xmm7 + pxor %xmm7,%xmm2 + movups 32(%r11),%xmm0 + +.byte 102,15,56,222,209 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm7 + +.byte 102,15,56,222,208 + movups 64(%r11),%xmm0 + jmp .Locb_dec_loop1 + +.align 32 +.Locb_dec_loop1: +.byte 102,15,56,222,209 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,222,208 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_dec_loop1 + +.byte 102,15,56,222,209 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,15,56,223,215 + .byte 0xf3,0xc3 +.size __ocb_decrypt1,.-__ocb_decrypt1 +.globl aesni_cbc_encrypt +.type aesni_cbc_encrypt,@function +.align 16 +aesni_cbc_encrypt: +.cfi_startproc + testq %rdx,%rdx + jz .Lcbc_ret + + movl 240(%rcx),%r10d + movq %rcx,%r11 + testl %r9d,%r9d + jz .Lcbc_decrypt + + movups (%r8),%xmm2 + movl %r10d,%eax + cmpq $16,%rdx + jb .Lcbc_enc_tail + subq $16,%rdx + jmp .Lcbc_enc_loop +.align 16 +.Lcbc_enc_loop: + movups (%rdi),%xmm3 + leaq 16(%rdi),%rdi + + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm3 + leaq 32(%rcx),%rcx + xorps %xmm3,%xmm2 +.Loop_enc1_15: +.byte 102,15,56,220,209 + decl %eax + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_enc1_15 +.byte 102,15,56,221,209 + movl %r10d,%eax + movq %r11,%rcx + movups %xmm2,0(%rsi) + leaq 16(%rsi),%rsi + subq $16,%rdx + jnc .Lcbc_enc_loop + addq $16,%rdx + jnz .Lcbc_enc_tail + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm2,(%r8) + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + jmp .Lcbc_ret + +.Lcbc_enc_tail: + movq %rdx,%rcx + xchgq %rdi,%rsi +.long 0x9066A4F3 + movl $16,%ecx + subq %rdx,%rcx + xorl %eax,%eax +.long 0x9066AAF3 + leaq -16(%rdi),%rdi + movl %r10d,%eax + movq %rdi,%rsi + movq %r11,%rcx + xorq %rdx,%rdx + jmp .Lcbc_enc_loop + +.align 16 +.Lcbc_decrypt: + cmpq $16,%rdx + jne .Lcbc_decrypt_bulk + + + + movdqu (%rdi),%xmm2 + movdqu (%r8),%xmm3 + movdqa %xmm2,%xmm4 + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_dec1_16: +.byte 102,15,56,222,209 + decl %r10d + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_dec1_16 +.byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movdqu %xmm4,(%r8) + xorps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movups %xmm2,(%rsi) + pxor 
%xmm2,%xmm2 + jmp .Lcbc_ret +.align 16 +.Lcbc_decrypt_bulk: + leaq (%rsp),%r11 +.cfi_def_cfa_register %r11 + pushq %rbp +.cfi_offset %rbp,-16 + subq $16,%rsp + andq $-16,%rsp + movq %rcx,%rbp + movups (%r8),%xmm10 + movl %r10d,%eax + cmpq $0x50,%rdx + jbe .Lcbc_dec_tail + + movups (%rcx),%xmm0 + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqa %xmm2,%xmm11 + movdqu 32(%rdi),%xmm4 + movdqa %xmm3,%xmm12 + movdqu 48(%rdi),%xmm5 + movdqa %xmm4,%xmm13 + movdqu 64(%rdi),%xmm6 + movdqa %xmm5,%xmm14 + movdqu 80(%rdi),%xmm7 + movdqa %xmm6,%xmm15 + movl _gnutls_x86_cpuid_s+4(%rip),%r9d + cmpq $0x70,%rdx + jbe .Lcbc_dec_six_or_seven + + andl $71303168,%r9d + subq $0x50,%rdx + cmpl $4194304,%r9d + je .Lcbc_dec_loop6_enter + subq $0x20,%rdx + leaq 112(%rcx),%rcx + jmp .Lcbc_dec_loop8_enter +.align 16 +.Lcbc_dec_loop8: + movups %xmm9,(%rsi) + leaq 16(%rsi),%rsi +.Lcbc_dec_loop8_enter: + movdqu 96(%rdi),%xmm8 + pxor %xmm0,%xmm2 + movdqu 112(%rdi),%xmm9 + pxor %xmm0,%xmm3 + movups 16-112(%rcx),%xmm1 + pxor %xmm0,%xmm4 + movq $-1,%rbp + cmpq $0x70,%rdx + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 + pxor %xmm0,%xmm8 + +.byte 102,15,56,222,209 + pxor %xmm0,%xmm9 + movups 32-112(%rcx),%xmm0 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 + adcq $0,%rbp + andq $128,%rbp +.byte 102,68,15,56,222,201 + addq %rdi,%rbp + movups 48-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 64-112(%rcx),%xmm0 + nop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 80-112(%rcx),%xmm1 + nop +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 96-112(%rcx),%xmm0 + nop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 112-112(%rcx),%xmm1 + nop +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 128-112(%rcx),%xmm0 nop .byte 102,15,56,222,209 .byte 102,15,56,222,217 @@ -2867,18 +3763,18 @@ aesni_cbc_encrypt: movdqu 112(%rdi),%xmm0 .byte 102,65,15,56,223,228 leaq 128(%rdi),%rdi - movdqu 0(%r11),%xmm11 + movdqu 0(%rbp),%xmm11 .byte 102,65,15,56,223,237 .byte 102,65,15,56,223,246 - movdqu 16(%r11),%xmm12 - movdqu 32(%r11),%xmm13 + movdqu 16(%rbp),%xmm12 + movdqu 32(%rbp),%xmm13 .byte 102,65,15,56,223,255 .byte 102,68,15,56,223,193 - movdqu 48(%r11),%xmm14 - movdqu 64(%r11),%xmm15 + movdqu 48(%rbp),%xmm14 + movdqu 64(%rbp),%xmm15 .byte 102,69,15,56,223,202 movdqa %xmm0,%xmm10 - movdqu 80(%r11),%xmm1 + movdqu 80(%rbp),%xmm1 movups -112(%rcx),%xmm0 movups %xmm2,(%rsi) @@ -2997,7 +3893,7 @@ aesni_cbc_encrypt: pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) pxor %xmm14,%xmm6 - movq %r11,%rcx + movq %rbp,%rcx movdqu %xmm5,48(%rsi) pxor %xmm15,%xmm7 movl %r10d,%eax @@ -3150,16 +4046,21 @@ aesni_cbc_encrypt: 
.Lcbc_dec_ret: xorps %xmm0,%xmm0 pxor %xmm1,%xmm1 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp +.cfi_restore %rbp + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp .Lcbc_ret: .byte 0xf3,0xc3 +.cfi_endproc .size aesni_cbc_encrypt,.-aesni_cbc_encrypt .globl aesni_set_decrypt_key .type aesni_set_decrypt_key,@function .align 16 aesni_set_decrypt_key: +.cfi_startproc .byte 0x48,0x83,0xEC,0x08 +.cfi_adjust_cfa_offset 8 call __aesni_set_encrypt_key shll $4,%esi testl %eax,%eax @@ -3192,7 +4093,9 @@ aesni_set_decrypt_key: pxor %xmm0,%xmm0 .Ldec_key_ret: addq $8,%rsp +.cfi_adjust_cfa_offset -8 .byte 0xf3,0xc3 +.cfi_endproc .LSEH_end_set_decrypt_key: .size aesni_set_decrypt_key,.-aesni_set_decrypt_key .globl aesni_set_encrypt_key @@ -3200,7 +4103,9 @@ aesni_set_decrypt_key: .align 16 aesni_set_encrypt_key: __aesni_set_encrypt_key: +.cfi_startproc .byte 0x48,0x83,0xEC,0x08 +.cfi_adjust_cfa_offset 8 movq $-1,%rax testq %rdi,%rdi jz .Lenc_key_ret @@ -3493,7 +4398,9 @@ __aesni_set_encrypt_key: pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 addq $8,%rsp +.cfi_adjust_cfa_offset -8 .byte 0xf3,0xc3 +.cfi_endproc .LSEH_end_set_encrypt_key: .align 16 diff --git a/lib/accelerated/x86/elf/cpuid-x86.s b/lib/accelerated/x86/elf/cpuid-x86.s index 8817cd34f2..4427b6bdd0 100644 --- a/lib/accelerated/x86/elf/cpuid-x86.s +++ b/lib/accelerated/x86/elf/cpuid-x86.s @@ -21,7 +21,6 @@ # # *** This file is auto-generated *** # -.file "devel/perlasm/cpuid-x86.s" .text .globl gnutls_cpuid .type gnutls_cpuid,@function diff --git a/lib/accelerated/x86/elf/cpuid-x86_64.s b/lib/accelerated/x86/elf/cpuid-x86_64.s index 82858b2444..0740edcd26 100644 --- a/lib/accelerated/x86/elf/cpuid-x86_64.s +++ b/lib/accelerated/x86/elf/cpuid-x86_64.s @@ -56,7 +56,4 @@ gnutls_cpuid: .byte 0xf3,0xc3 .size gnutls_cpuid,.-gnutls_cpuid - .section .note.GNU-stack,"",%progbits - - diff --git a/lib/accelerated/x86/elf/ghash-x86_64.s b/lib/accelerated/x86/elf/ghash-x86_64.s index e2568a6fd6..1e4d18b341 100644 --- a/lib/accelerated/x86/elf/ghash-x86_64.s +++ b/lib/accelerated/x86/elf/ghash-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -44,9 +44,27 @@ .type gcm_gmult_4bit,@function .align 16 gcm_gmult_4bit: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $280,%rsp +.cfi_adjust_cfa_offset 280 .Lgmult_prologue: movzbq 15(%rdi),%r8 @@ -123,22 +141,41 @@ gcm_gmult_4bit: movq %r8,8(%rdi) movq %r9,(%rdi) - movq 16(%rsp),%rbx - leaq 24(%rsp),%rsp + leaq 280+48(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lgmult_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size gcm_gmult_4bit,.-gcm_gmult_4bit .globl gcm_ghash_4bit .type gcm_ghash_4bit,@function .align 16 gcm_ghash_4bit: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $280,%rsp +.cfi_adjust_cfa_offset 280 .Lghash_prologue: movq %rdx,%r14 movq %rcx,%r15 @@ -683,21 +720,31 @@ gcm_ghash_4bit: movq %r8,8(%rdi) movq %r9,(%rdi) - leaq 280(%rsp),%rsi - movq 0(%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + leaq 280+48(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq 0(%rsi),%rsp +.cfi_def_cfa_register %rsp .Lghash_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size gcm_ghash_4bit,.-gcm_ghash_4bit .globl gcm_init_clmul .type gcm_init_clmul,@function .align 16 gcm_init_clmul: +.cfi_startproc .L_init_clmul: movdqu (%rsi),%xmm2 pshufd $78,%xmm2,%xmm2 @@ -849,11 +896,13 @@ gcm_init_clmul: .byte 102,15,58,15,227,8 movdqu %xmm4,80(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size gcm_init_clmul,.-gcm_init_clmul .globl gcm_gmult_clmul .type gcm_gmult_clmul,@function .align 16 gcm_gmult_clmul: +.cfi_startproc .L_gmult_clmul: movdqu (%rdi),%xmm0 movdqa .Lbswap_mask(%rip),%xmm5 @@ -900,11 +949,13 @@ gcm_gmult_clmul: .byte 102,15,56,0,197 movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size gcm_gmult_clmul,.-gcm_gmult_clmul .globl gcm_ghash_clmul .type gcm_ghash_clmul,@function .align 32 gcm_ghash_clmul: +.cfi_startproc .L_ghash_clmul: movdqa .Lbswap_mask(%rip),%xmm10 @@ -1283,11 +1334,13 @@ gcm_ghash_clmul: .byte 102,65,15,56,0,194 movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size gcm_ghash_clmul,.-gcm_ghash_clmul .globl gcm_init_avx .type gcm_init_avx,@function .align 32 gcm_init_avx: +.cfi_startproc vzeroupper vmovdqu (%rsi),%xmm2 @@ -1390,17 +1443,21 @@ gcm_init_avx: vzeroupper .byte 0xf3,0xc3 +.cfi_endproc .size gcm_init_avx,.-gcm_init_avx .globl gcm_gmult_avx .type gcm_gmult_avx,@function .align 32 gcm_gmult_avx: +.cfi_startproc jmp .L_gmult_clmul +.cfi_endproc .size gcm_gmult_avx,.-gcm_gmult_avx .globl gcm_ghash_avx .type gcm_ghash_avx,@function .align 32 gcm_ghash_avx: +.cfi_startproc 
vzeroupper vmovdqu (%rdi),%xmm10 @@ -1772,6 +1829,7 @@ gcm_ghash_avx: vmovdqu %xmm10,(%rdi) vzeroupper .byte 0xf3,0xc3 +.cfi_endproc .size gcm_ghash_avx,.-gcm_ghash_avx .align 64 .Lbswap_mask: diff --git a/lib/accelerated/x86/elf/sha1-ssse3-x86.s b/lib/accelerated/x86/elf/sha1-ssse3-x86.s index 7b585a0f3e..8bfbcb6b39 100644 --- a/lib/accelerated/x86/elf/sha1-ssse3-x86.s +++ b/lib/accelerated/x86/elf/sha1-ssse3-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -37,7 +37,6 @@ # # *** This file is auto-generated *** # -.file "sha1-586.s" .text .globl sha1_block_data_order .type sha1_block_data_order,@function @@ -1418,7 +1417,4 @@ sha1_block_data_order: .byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 .byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 - .section .note.GNU-stack,"",%progbits - - diff --git a/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s b/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s index af40532f12..1e6546e11e 100644 --- a/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s +++ b/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -44,25 +44,45 @@ .type sha1_block_data_order,@function .align 16 sha1_block_data_order: +.cfi_startproc movl _gnutls_x86_cpuid_s+0(%rip),%r9d movl _gnutls_x86_cpuid_s+4(%rip),%r8d + movl _gnutls_x86_cpuid_s+8(%rip),%r10d testl $512,%r8d jz .Lialu + testl $536870912,%r10d + jnz _shaext_shortcut + andl $296,%r10d + cmpl $296,%r10d + je _avx2_shortcut + andl $268435456,%r8d + andl $1073741824,%r9d + orl %r9d,%r8d + cmpl $1342177280,%r8d + je _avx_shortcut jmp _ssse3_shortcut .align 16 .Lialu: + movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 - movq %rsp,%r11 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 movq %rdi,%r8 subq $72,%rsp movq %rsi,%r9 andq $-64,%rsp movq %rdx,%r10 - movq %r11,64(%rsp) + movq %rax,64(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xc0,0x00,0x06,0x23,0x08 .Lprologue: movl 0(%r8),%esi @@ -76,1230 +96,1168 @@ sha1_block_data_order: .Lloop: movl 0(%r9),%edx bswapl %edx - movl %edx,0(%rsp) - movl %r11d,%eax movl 4(%r9),%ebp + movl %r12d,%eax + movl %edx,0(%rsp) movl %esi,%ecx - xorl %r12d,%eax bswapl %ebp + xorl %r11d,%eax roll $5,%ecx - leal 1518500249(%rdx,%r13,1),%r13d andl %edi,%eax - movl %ebp,4(%rsp) + leal 1518500249(%rdx,%r13,1),%r13d addl %ecx,%r13d xorl %r12d,%eax roll $30,%edi addl %eax,%r13d - movl %edi,%eax - movl 8(%r9),%edx + movl 8(%r9),%r14d + movl %r11d,%eax + movl %ebp,4(%rsp) movl %r13d,%ecx - xorl %r11d,%eax - bswapl %edx + bswapl %r14d + xorl %edi,%eax roll $5,%ecx - leal 1518500249(%rbp,%r12,1),%r12d andl %esi,%eax - movl %edx,8(%rsp) + leal 1518500249(%rbp,%r12,1),%r12d addl %ecx,%r12d xorl %r11d,%eax roll $30,%esi addl %eax,%r12d - movl %esi,%eax - movl 12(%r9),%ebp + movl 12(%r9),%edx + movl %edi,%eax + movl %r14d,8(%rsp) movl %r12d,%ecx - xorl %edi,%eax - bswapl %ebp + bswapl %edx + xorl %esi,%eax roll $5,%ecx - leal 1518500249(%rdx,%r11,1),%r11d andl %r13d,%eax - movl %ebp,12(%rsp) + leal 1518500249(%r14,%r11,1),%r11d addl %ecx,%r11d xorl %edi,%eax roll $30,%r13d addl %eax,%r11d - movl %r13d,%eax - movl 16(%r9),%edx + movl 16(%r9),%ebp + movl %esi,%eax + 
movl %edx,12(%rsp) movl %r11d,%ecx - xorl %esi,%eax - bswapl %edx + bswapl %ebp + xorl %r13d,%eax roll $5,%ecx - leal 1518500249(%rbp,%rdi,1),%edi andl %r12d,%eax - movl %edx,16(%rsp) + leal 1518500249(%rdx,%rdi,1),%edi addl %ecx,%edi xorl %esi,%eax roll $30,%r12d addl %eax,%edi - movl %r12d,%eax - movl 20(%r9),%ebp + movl 20(%r9),%r14d + movl %r13d,%eax + movl %ebp,16(%rsp) movl %edi,%ecx - xorl %r13d,%eax - bswapl %ebp + bswapl %r14d + xorl %r12d,%eax roll $5,%ecx - leal 1518500249(%rdx,%rsi,1),%esi andl %r11d,%eax - movl %ebp,20(%rsp) + leal 1518500249(%rbp,%rsi,1),%esi addl %ecx,%esi xorl %r13d,%eax roll $30,%r11d addl %eax,%esi - movl %r11d,%eax movl 24(%r9),%edx + movl %r12d,%eax + movl %r14d,20(%rsp) movl %esi,%ecx - xorl %r12d,%eax bswapl %edx + xorl %r11d,%eax roll $5,%ecx - leal 1518500249(%rbp,%r13,1),%r13d andl %edi,%eax - movl %edx,24(%rsp) + leal 1518500249(%r14,%r13,1),%r13d addl %ecx,%r13d xorl %r12d,%eax roll $30,%edi addl %eax,%r13d - movl %edi,%eax movl 28(%r9),%ebp + movl %r11d,%eax + movl %edx,24(%rsp) movl %r13d,%ecx - xorl %r11d,%eax bswapl %ebp + xorl %edi,%eax roll $5,%ecx - leal 1518500249(%rdx,%r12,1),%r12d andl %esi,%eax - movl %ebp,28(%rsp) + leal 1518500249(%rdx,%r12,1),%r12d addl %ecx,%r12d xorl %r11d,%eax roll $30,%esi addl %eax,%r12d - movl %esi,%eax - movl 32(%r9),%edx + movl 32(%r9),%r14d + movl %edi,%eax + movl %ebp,28(%rsp) movl %r12d,%ecx - xorl %edi,%eax - bswapl %edx + bswapl %r14d + xorl %esi,%eax roll $5,%ecx - leal 1518500249(%rbp,%r11,1),%r11d andl %r13d,%eax - movl %edx,32(%rsp) + leal 1518500249(%rbp,%r11,1),%r11d addl %ecx,%r11d xorl %edi,%eax roll $30,%r13d addl %eax,%r11d - movl %r13d,%eax - movl 36(%r9),%ebp + movl 36(%r9),%edx + movl %esi,%eax + movl %r14d,32(%rsp) movl %r11d,%ecx - xorl %esi,%eax - bswapl %ebp + bswapl %edx + xorl %r13d,%eax roll $5,%ecx - leal 1518500249(%rdx,%rdi,1),%edi andl %r12d,%eax - movl %ebp,36(%rsp) + leal 1518500249(%r14,%rdi,1),%edi addl %ecx,%edi xorl %esi,%eax roll $30,%r12d addl %eax,%edi - movl %r12d,%eax - movl 40(%r9),%edx + movl 40(%r9),%ebp + movl %r13d,%eax + movl %edx,36(%rsp) movl %edi,%ecx - xorl %r13d,%eax - bswapl %edx + bswapl %ebp + xorl %r12d,%eax roll $5,%ecx - leal 1518500249(%rbp,%rsi,1),%esi andl %r11d,%eax - movl %edx,40(%rsp) + leal 1518500249(%rdx,%rsi,1),%esi addl %ecx,%esi xorl %r13d,%eax roll $30,%r11d addl %eax,%esi - movl %r11d,%eax - movl 44(%r9),%ebp + movl 44(%r9),%r14d + movl %r12d,%eax + movl %ebp,40(%rsp) movl %esi,%ecx - xorl %r12d,%eax - bswapl %ebp + bswapl %r14d + xorl %r11d,%eax roll $5,%ecx - leal 1518500249(%rdx,%r13,1),%r13d andl %edi,%eax - movl %ebp,44(%rsp) + leal 1518500249(%rbp,%r13,1),%r13d addl %ecx,%r13d xorl %r12d,%eax roll $30,%edi addl %eax,%r13d - movl %edi,%eax movl 48(%r9),%edx + movl %r11d,%eax + movl %r14d,44(%rsp) movl %r13d,%ecx - xorl %r11d,%eax bswapl %edx + xorl %edi,%eax roll $5,%ecx - leal 1518500249(%rbp,%r12,1),%r12d andl %esi,%eax - movl %edx,48(%rsp) + leal 1518500249(%r14,%r12,1),%r12d addl %ecx,%r12d xorl %r11d,%eax roll $30,%esi addl %eax,%r12d - movl %esi,%eax movl 52(%r9),%ebp + movl %edi,%eax + movl %edx,48(%rsp) movl %r12d,%ecx - xorl %edi,%eax bswapl %ebp + xorl %esi,%eax roll $5,%ecx - leal 1518500249(%rdx,%r11,1),%r11d andl %r13d,%eax - movl %ebp,52(%rsp) + leal 1518500249(%rdx,%r11,1),%r11d addl %ecx,%r11d xorl %edi,%eax roll $30,%r13d addl %eax,%r11d - movl %r13d,%eax - movl 56(%r9),%edx + movl 56(%r9),%r14d + movl %esi,%eax + movl %ebp,52(%rsp) movl %r11d,%ecx - xorl %esi,%eax - bswapl %edx + bswapl %r14d + xorl %r13d,%eax roll 
$5,%ecx - leal 1518500249(%rbp,%rdi,1),%edi andl %r12d,%eax - movl %edx,56(%rsp) + leal 1518500249(%rbp,%rdi,1),%edi addl %ecx,%edi xorl %esi,%eax roll $30,%r12d addl %eax,%edi - movl %r12d,%eax - movl 60(%r9),%ebp + movl 60(%r9),%edx + movl %r13d,%eax + movl %r14d,56(%rsp) movl %edi,%ecx - xorl %r13d,%eax - bswapl %ebp + bswapl %edx + xorl %r12d,%eax roll $5,%ecx - leal 1518500249(%rdx,%rsi,1),%esi andl %r11d,%eax - movl %ebp,60(%rsp) + leal 1518500249(%r14,%rsi,1),%esi addl %ecx,%esi xorl %r13d,%eax roll $30,%r11d addl %eax,%esi - movl 0(%rsp),%edx - movl %r11d,%eax + xorl 0(%rsp),%ebp + movl %r12d,%eax + movl %edx,60(%rsp) movl %esi,%ecx - xorl 8(%rsp),%edx - xorl %r12d,%eax + xorl 8(%rsp),%ebp + xorl %r11d,%eax roll $5,%ecx - xorl 32(%rsp),%edx + xorl 32(%rsp),%ebp andl %edi,%eax - leal 1518500249(%rbp,%r13,1),%r13d - xorl 52(%rsp),%edx + leal 1518500249(%rdx,%r13,1),%r13d + roll $30,%edi xorl %r12d,%eax - roll $1,%edx addl %ecx,%r13d - roll $30,%edi - movl %edx,0(%rsp) + roll $1,%ebp addl %eax,%r13d - movl 4(%rsp),%ebp - movl %edi,%eax + xorl 4(%rsp),%r14d + movl %r11d,%eax + movl %ebp,0(%rsp) movl %r13d,%ecx - xorl 12(%rsp),%ebp - xorl %r11d,%eax + xorl 12(%rsp),%r14d + xorl %edi,%eax roll $5,%ecx - xorl 36(%rsp),%ebp + xorl 36(%rsp),%r14d andl %esi,%eax - leal 1518500249(%rdx,%r12,1),%r12d - xorl 56(%rsp),%ebp + leal 1518500249(%rbp,%r12,1),%r12d + roll $30,%esi xorl %r11d,%eax - roll $1,%ebp addl %ecx,%r12d - roll $30,%esi - movl %ebp,4(%rsp) + roll $1,%r14d addl %eax,%r12d - movl 8(%rsp),%edx - movl %esi,%eax + xorl 8(%rsp),%edx + movl %edi,%eax + movl %r14d,4(%rsp) movl %r12d,%ecx xorl 16(%rsp),%edx - xorl %edi,%eax + xorl %esi,%eax roll $5,%ecx xorl 40(%rsp),%edx andl %r13d,%eax - leal 1518500249(%rbp,%r11,1),%r11d - xorl 60(%rsp),%edx + leal 1518500249(%r14,%r11,1),%r11d + roll $30,%r13d xorl %edi,%eax - roll $1,%edx addl %ecx,%r11d - roll $30,%r13d - movl %edx,8(%rsp) + roll $1,%edx addl %eax,%r11d - movl 12(%rsp),%ebp - movl %r13d,%eax + xorl 12(%rsp),%ebp + movl %esi,%eax + movl %edx,8(%rsp) movl %r11d,%ecx xorl 20(%rsp),%ebp - xorl %esi,%eax + xorl %r13d,%eax roll $5,%ecx xorl 44(%rsp),%ebp andl %r12d,%eax leal 1518500249(%rdx,%rdi,1),%edi - xorl 0(%rsp),%ebp + roll $30,%r12d xorl %esi,%eax - roll $1,%ebp addl %ecx,%edi - roll $30,%r12d - movl %ebp,12(%rsp) + roll $1,%ebp addl %eax,%edi - movl 16(%rsp),%edx - movl %r12d,%eax + xorl 16(%rsp),%r14d + movl %r13d,%eax + movl %ebp,12(%rsp) movl %edi,%ecx - xorl 24(%rsp),%edx - xorl %r13d,%eax + xorl 24(%rsp),%r14d + xorl %r12d,%eax roll $5,%ecx - xorl 48(%rsp),%edx + xorl 48(%rsp),%r14d andl %r11d,%eax leal 1518500249(%rbp,%rsi,1),%esi - xorl 4(%rsp),%edx + roll $30,%r11d xorl %r13d,%eax - roll $1,%edx addl %ecx,%esi - roll $30,%r11d - movl %edx,16(%rsp) + roll $1,%r14d addl %eax,%esi - movl 20(%rsp),%ebp - movl %r11d,%eax + xorl 20(%rsp),%edx + movl %edi,%eax + movl %r14d,16(%rsp) movl %esi,%ecx - xorl 28(%rsp),%ebp - xorl %edi,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r13,1),%r13d - xorl 52(%rsp),%ebp + xorl 28(%rsp),%edx xorl %r12d,%eax + roll $5,%ecx + xorl 52(%rsp),%edx + leal 1859775393(%r14,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 8(%rsp),%ebp roll $30,%edi addl %eax,%r13d - roll $1,%ebp - movl %ebp,20(%rsp) - movl 24(%rsp),%edx - movl %edi,%eax + roll $1,%edx + xorl 24(%rsp),%ebp + movl %esi,%eax + movl %edx,20(%rsp) movl %r13d,%ecx - xorl 32(%rsp),%edx - xorl %esi,%eax - roll $5,%ecx - leal 1859775393(%rbp,%r12,1),%r12d - xorl 56(%rsp),%edx + xorl 32(%rsp),%ebp xorl %r11d,%eax + roll $5,%ecx + xorl 
56(%rsp),%ebp + leal 1859775393(%rdx,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 12(%rsp),%edx roll $30,%esi addl %eax,%r12d - roll $1,%edx - movl %edx,24(%rsp) - movl 28(%rsp),%ebp - movl %esi,%eax + roll $1,%ebp + xorl 28(%rsp),%r14d + movl %r13d,%eax + movl %ebp,24(%rsp) movl %r12d,%ecx - xorl 36(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r11,1),%r11d - xorl 60(%rsp),%ebp + xorl 36(%rsp),%r14d xorl %edi,%eax + roll $5,%ecx + xorl 60(%rsp),%r14d + leal 1859775393(%rbp,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 16(%rsp),%ebp roll $30,%r13d addl %eax,%r11d - roll $1,%ebp - movl %ebp,28(%rsp) - movl 32(%rsp),%edx - movl %r13d,%eax + roll $1,%r14d + xorl 32(%rsp),%edx + movl %r12d,%eax + movl %r14d,28(%rsp) movl %r11d,%ecx xorl 40(%rsp),%edx - xorl %r12d,%eax + xorl %esi,%eax roll $5,%ecx - leal 1859775393(%rbp,%rdi,1),%edi xorl 0(%rsp),%edx - xorl %esi,%eax + leal 1859775393(%r14,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 20(%rsp),%edx roll $30,%r12d addl %eax,%edi roll $1,%edx + xorl 36(%rsp),%ebp + movl %r11d,%eax movl %edx,32(%rsp) - movl 36(%rsp),%ebp - movl %r12d,%eax movl %edi,%ecx xorl 44(%rsp),%ebp - xorl %r11d,%eax + xorl %r13d,%eax roll $5,%ecx - leal 1859775393(%rdx,%rsi,1),%esi xorl 4(%rsp),%ebp - xorl %r13d,%eax + leal 1859775393(%rdx,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 24(%rsp),%ebp roll $30,%r11d addl %eax,%esi roll $1,%ebp + xorl 40(%rsp),%r14d + movl %edi,%eax movl %ebp,36(%rsp) - movl 40(%rsp),%edx - movl %r11d,%eax movl %esi,%ecx - xorl 48(%rsp),%edx - xorl %edi,%eax + xorl 48(%rsp),%r14d + xorl %r12d,%eax roll $5,%ecx + xorl 8(%rsp),%r14d leal 1859775393(%rbp,%r13,1),%r13d - xorl 8(%rsp),%edx - xorl %r12d,%eax + xorl %r11d,%eax addl %ecx,%r13d - xorl 28(%rsp),%edx roll $30,%edi addl %eax,%r13d - roll $1,%edx - movl %edx,40(%rsp) - movl 44(%rsp),%ebp - movl %edi,%eax + roll $1,%r14d + xorl 44(%rsp),%edx + movl %esi,%eax + movl %r14d,40(%rsp) movl %r13d,%ecx - xorl 52(%rsp),%ebp - xorl %esi,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r12,1),%r12d - xorl 12(%rsp),%ebp + xorl 52(%rsp),%edx xorl %r11d,%eax + roll $5,%ecx + xorl 12(%rsp),%edx + leal 1859775393(%r14,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 32(%rsp),%ebp roll $30,%esi addl %eax,%r12d - roll $1,%ebp - movl %ebp,44(%rsp) - movl 48(%rsp),%edx - movl %esi,%eax + roll $1,%edx + xorl 48(%rsp),%ebp + movl %r13d,%eax + movl %edx,44(%rsp) movl %r12d,%ecx - xorl 56(%rsp),%edx - xorl %r13d,%eax - roll $5,%ecx - leal 1859775393(%rbp,%r11,1),%r11d - xorl 16(%rsp),%edx + xorl 56(%rsp),%ebp xorl %edi,%eax + roll $5,%ecx + xorl 16(%rsp),%ebp + leal 1859775393(%rdx,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 36(%rsp),%edx roll $30,%r13d addl %eax,%r11d - roll $1,%edx - movl %edx,48(%rsp) - movl 52(%rsp),%ebp - movl %r13d,%eax + roll $1,%ebp + xorl 52(%rsp),%r14d + movl %r12d,%eax + movl %ebp,48(%rsp) movl %r11d,%ecx - xorl 60(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%rdi,1),%edi - xorl 20(%rsp),%ebp + xorl 60(%rsp),%r14d xorl %esi,%eax + roll $5,%ecx + xorl 20(%rsp),%r14d + leal 1859775393(%rbp,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 40(%rsp),%ebp roll $30,%r12d addl %eax,%edi - roll $1,%ebp - movl %ebp,52(%rsp) - movl 56(%rsp),%edx - movl %r12d,%eax + roll $1,%r14d + xorl 56(%rsp),%edx + movl %r11d,%eax + movl %r14d,52(%rsp) movl %edi,%ecx xorl 0(%rsp),%edx - xorl %r11d,%eax + xorl %r13d,%eax roll $5,%ecx - leal 1859775393(%rbp,%rsi,1),%esi xorl 24(%rsp),%edx - xorl %r13d,%eax + leal 
1859775393(%r14,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 44(%rsp),%edx roll $30,%r11d addl %eax,%esi roll $1,%edx + xorl 60(%rsp),%ebp + movl %edi,%eax movl %edx,56(%rsp) - movl 60(%rsp),%ebp - movl %r11d,%eax movl %esi,%ecx xorl 4(%rsp),%ebp - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal 1859775393(%rdx,%r13,1),%r13d xorl 28(%rsp),%ebp - xorl %r12d,%eax + leal 1859775393(%rdx,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 48(%rsp),%ebp roll $30,%edi addl %eax,%r13d roll $1,%ebp + xorl 0(%rsp),%r14d + movl %esi,%eax movl %ebp,60(%rsp) - movl 0(%rsp),%edx - movl %edi,%eax movl %r13d,%ecx - xorl 8(%rsp),%edx - xorl %esi,%eax + xorl 8(%rsp),%r14d + xorl %r11d,%eax roll $5,%ecx + xorl 32(%rsp),%r14d leal 1859775393(%rbp,%r12,1),%r12d - xorl 32(%rsp),%edx - xorl %r11d,%eax + xorl %edi,%eax addl %ecx,%r12d - xorl 52(%rsp),%edx roll $30,%esi addl %eax,%r12d - roll $1,%edx - movl %edx,0(%rsp) - movl 4(%rsp),%ebp - movl %esi,%eax + roll $1,%r14d + xorl 4(%rsp),%edx + movl %r13d,%eax + movl %r14d,0(%rsp) movl %r12d,%ecx - xorl 12(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r11,1),%r11d - xorl 36(%rsp),%ebp + xorl 12(%rsp),%edx xorl %edi,%eax + roll $5,%ecx + xorl 36(%rsp),%edx + leal 1859775393(%r14,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 56(%rsp),%ebp roll $30,%r13d addl %eax,%r11d - roll $1,%ebp - movl %ebp,4(%rsp) - movl 8(%rsp),%edx - movl %r13d,%eax + roll $1,%edx + xorl 8(%rsp),%ebp + movl %r12d,%eax + movl %edx,4(%rsp) movl %r11d,%ecx - xorl 16(%rsp),%edx - xorl %r12d,%eax - roll $5,%ecx - leal 1859775393(%rbp,%rdi,1),%edi - xorl 40(%rsp),%edx + xorl 16(%rsp),%ebp xorl %esi,%eax + roll $5,%ecx + xorl 40(%rsp),%ebp + leal 1859775393(%rdx,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 60(%rsp),%edx roll $30,%r12d addl %eax,%edi - roll $1,%edx - movl %edx,8(%rsp) - movl 12(%rsp),%ebp - movl %r12d,%eax + roll $1,%ebp + xorl 12(%rsp),%r14d + movl %r11d,%eax + movl %ebp,8(%rsp) movl %edi,%ecx - xorl 20(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%rsi,1),%esi - xorl 44(%rsp),%ebp + xorl 20(%rsp),%r14d xorl %r13d,%eax + roll $5,%ecx + xorl 44(%rsp),%r14d + leal 1859775393(%rbp,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 0(%rsp),%ebp roll $30,%r11d addl %eax,%esi - roll $1,%ebp - movl %ebp,12(%rsp) - movl 16(%rsp),%edx - movl %r11d,%eax + roll $1,%r14d + xorl 16(%rsp),%edx + movl %edi,%eax + movl %r14d,12(%rsp) movl %esi,%ecx xorl 24(%rsp),%edx - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal 1859775393(%rbp,%r13,1),%r13d xorl 48(%rsp),%edx - xorl %r12d,%eax + leal 1859775393(%r14,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 4(%rsp),%edx roll $30,%edi addl %eax,%r13d roll $1,%edx + xorl 20(%rsp),%ebp + movl %esi,%eax movl %edx,16(%rsp) - movl 20(%rsp),%ebp - movl %edi,%eax movl %r13d,%ecx xorl 28(%rsp),%ebp - xorl %esi,%eax + xorl %r11d,%eax roll $5,%ecx - leal 1859775393(%rdx,%r12,1),%r12d xorl 52(%rsp),%ebp - xorl %r11d,%eax + leal 1859775393(%rdx,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 8(%rsp),%ebp roll $30,%esi addl %eax,%r12d roll $1,%ebp + xorl 24(%rsp),%r14d + movl %r13d,%eax movl %ebp,20(%rsp) - movl 24(%rsp),%edx - movl %esi,%eax movl %r12d,%ecx - xorl 32(%rsp),%edx - xorl %r13d,%eax + xorl 32(%rsp),%r14d + xorl %edi,%eax roll $5,%ecx + xorl 56(%rsp),%r14d leal 1859775393(%rbp,%r11,1),%r11d - xorl 56(%rsp),%edx - xorl %edi,%eax + xorl %esi,%eax addl %ecx,%r11d - xorl 12(%rsp),%edx roll $30,%r13d addl %eax,%r11d - roll $1,%edx - movl %edx,24(%rsp) - movl 
28(%rsp),%ebp - movl %r13d,%eax + roll $1,%r14d + xorl 28(%rsp),%edx + movl %r12d,%eax + movl %r14d,24(%rsp) movl %r11d,%ecx - xorl 36(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%rdi,1),%edi - xorl 60(%rsp),%ebp + xorl 36(%rsp),%edx xorl %esi,%eax + roll $5,%ecx + xorl 60(%rsp),%edx + leal 1859775393(%r14,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 16(%rsp),%ebp roll $30,%r12d addl %eax,%edi - roll $1,%ebp - movl %ebp,28(%rsp) - movl 32(%rsp),%edx - movl %r12d,%eax + roll $1,%edx + xorl 32(%rsp),%ebp + movl %r11d,%eax + movl %edx,28(%rsp) movl %edi,%ecx - xorl 40(%rsp),%edx - xorl %r11d,%eax - roll $5,%ecx - leal 1859775393(%rbp,%rsi,1),%esi - xorl 0(%rsp),%edx + xorl 40(%rsp),%ebp xorl %r13d,%eax + roll $5,%ecx + xorl 0(%rsp),%ebp + leal 1859775393(%rdx,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 20(%rsp),%edx roll $30,%r11d addl %eax,%esi - roll $1,%edx - movl %edx,32(%rsp) - movl 36(%rsp),%ebp - movl %r11d,%eax - movl %r11d,%ebx - xorl 44(%rsp),%ebp - andl %r12d,%eax + roll $1,%ebp + xorl 36(%rsp),%r14d + movl %r12d,%eax + movl %ebp,32(%rsp) + movl %r12d,%ebx + xorl 44(%rsp),%r14d + andl %r11d,%eax movl %esi,%ecx - xorl 4(%rsp),%ebp - xorl %r12d,%ebx - leal -1894007588(%rdx,%r13,1),%r13d + xorl 4(%rsp),%r14d + leal -1894007588(%rbp,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 24(%rsp),%ebp addl %eax,%r13d + roll $1,%r14d andl %edi,%ebx - roll $1,%ebp - addl %ebx,%r13d - roll $30,%edi - movl %ebp,36(%rsp) addl %ecx,%r13d - movl 40(%rsp),%edx - movl %edi,%eax - movl %edi,%ebx + roll $30,%edi + addl %ebx,%r13d + xorl 40(%rsp),%edx + movl %r11d,%eax + movl %r14d,36(%rsp) + movl %r11d,%ebx xorl 48(%rsp),%edx - andl %r11d,%eax + andl %edi,%eax movl %r13d,%ecx xorl 8(%rsp),%edx - xorl %r11d,%ebx - leal -1894007588(%rbp,%r12,1),%r12d + leal -1894007588(%r14,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 28(%rsp),%edx addl %eax,%r12d - andl %esi,%ebx roll $1,%edx - addl %ebx,%r12d + andl %esi,%ebx + addl %ecx,%r12d roll $30,%esi + addl %ebx,%r12d + xorl 44(%rsp),%ebp + movl %edi,%eax movl %edx,40(%rsp) - addl %ecx,%r12d - movl 44(%rsp),%ebp - movl %esi,%eax - movl %esi,%ebx + movl %edi,%ebx xorl 52(%rsp),%ebp - andl %edi,%eax + andl %esi,%eax movl %r12d,%ecx xorl 12(%rsp),%ebp - xorl %edi,%ebx leal -1894007588(%rdx,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 32(%rsp),%ebp addl %eax,%r11d - andl %r13d,%ebx roll $1,%ebp - addl %ebx,%r11d + andl %r13d,%ebx + addl %ecx,%r11d roll $30,%r13d + addl %ebx,%r11d + xorl 48(%rsp),%r14d + movl %esi,%eax movl %ebp,44(%rsp) - addl %ecx,%r11d - movl 48(%rsp),%edx - movl %r13d,%eax - movl %r13d,%ebx - xorl 56(%rsp),%edx - andl %esi,%eax + movl %esi,%ebx + xorl 56(%rsp),%r14d + andl %r13d,%eax movl %r11d,%ecx - xorl 16(%rsp),%edx - xorl %esi,%ebx + xorl 16(%rsp),%r14d leal -1894007588(%rbp,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 36(%rsp),%edx addl %eax,%edi + roll $1,%r14d andl %r12d,%ebx - roll $1,%edx - addl %ebx,%edi - roll $30,%r12d - movl %edx,48(%rsp) addl %ecx,%edi - movl 52(%rsp),%ebp - movl %r12d,%eax - movl %r12d,%ebx - xorl 60(%rsp),%ebp - andl %r13d,%eax + roll $30,%r12d + addl %ebx,%edi + xorl 52(%rsp),%edx + movl %r13d,%eax + movl %r14d,48(%rsp) + movl %r13d,%ebx + xorl 60(%rsp),%edx + andl %r12d,%eax movl %edi,%ecx - xorl 20(%rsp),%ebp - xorl %r13d,%ebx - leal -1894007588(%rdx,%rsi,1),%esi + xorl 20(%rsp),%edx + leal -1894007588(%r14,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 40(%rsp),%ebp addl %eax,%esi + roll $1,%edx andl %r11d,%ebx - roll $1,%ebp - addl %ebx,%esi - 
roll $30,%r11d - movl %ebp,52(%rsp) addl %ecx,%esi - movl 56(%rsp),%edx - movl %r11d,%eax - movl %r11d,%ebx - xorl 0(%rsp),%edx - andl %r12d,%eax + roll $30,%r11d + addl %ebx,%esi + xorl 56(%rsp),%ebp + movl %r12d,%eax + movl %edx,52(%rsp) + movl %r12d,%ebx + xorl 0(%rsp),%ebp + andl %r11d,%eax movl %esi,%ecx - xorl 24(%rsp),%edx - xorl %r12d,%ebx - leal -1894007588(%rbp,%r13,1),%r13d + xorl 24(%rsp),%ebp + leal -1894007588(%rdx,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 44(%rsp),%edx addl %eax,%r13d + roll $1,%ebp andl %edi,%ebx - roll $1,%edx - addl %ebx,%r13d - roll $30,%edi - movl %edx,56(%rsp) addl %ecx,%r13d - movl 60(%rsp),%ebp - movl %edi,%eax - movl %edi,%ebx - xorl 4(%rsp),%ebp - andl %r11d,%eax + roll $30,%edi + addl %ebx,%r13d + xorl 60(%rsp),%r14d + movl %r11d,%eax + movl %ebp,56(%rsp) + movl %r11d,%ebx + xorl 4(%rsp),%r14d + andl %edi,%eax movl %r13d,%ecx - xorl 28(%rsp),%ebp - xorl %r11d,%ebx - leal -1894007588(%rdx,%r12,1),%r12d + xorl 28(%rsp),%r14d + leal -1894007588(%rbp,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 48(%rsp),%ebp addl %eax,%r12d + roll $1,%r14d andl %esi,%ebx - roll $1,%ebp - addl %ebx,%r12d - roll $30,%esi - movl %ebp,60(%rsp) addl %ecx,%r12d - movl 0(%rsp),%edx - movl %esi,%eax - movl %esi,%ebx + roll $30,%esi + addl %ebx,%r12d + xorl 0(%rsp),%edx + movl %edi,%eax + movl %r14d,60(%rsp) + movl %edi,%ebx xorl 8(%rsp),%edx - andl %edi,%eax + andl %esi,%eax movl %r12d,%ecx xorl 32(%rsp),%edx - xorl %edi,%ebx - leal -1894007588(%rbp,%r11,1),%r11d + leal -1894007588(%r14,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 52(%rsp),%edx addl %eax,%r11d - andl %r13d,%ebx roll $1,%edx - addl %ebx,%r11d + andl %r13d,%ebx + addl %ecx,%r11d roll $30,%r13d + addl %ebx,%r11d + xorl 4(%rsp),%ebp + movl %esi,%eax movl %edx,0(%rsp) - addl %ecx,%r11d - movl 4(%rsp),%ebp - movl %r13d,%eax - movl %r13d,%ebx + movl %esi,%ebx xorl 12(%rsp),%ebp - andl %esi,%eax + andl %r13d,%eax movl %r11d,%ecx xorl 36(%rsp),%ebp - xorl %esi,%ebx leal -1894007588(%rdx,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 56(%rsp),%ebp addl %eax,%edi - andl %r12d,%ebx roll $1,%ebp - addl %ebx,%edi + andl %r12d,%ebx + addl %ecx,%edi roll $30,%r12d + addl %ebx,%edi + xorl 8(%rsp),%r14d + movl %r13d,%eax movl %ebp,4(%rsp) - addl %ecx,%edi - movl 8(%rsp),%edx - movl %r12d,%eax - movl %r12d,%ebx - xorl 16(%rsp),%edx - andl %r13d,%eax + movl %r13d,%ebx + xorl 16(%rsp),%r14d + andl %r12d,%eax movl %edi,%ecx - xorl 40(%rsp),%edx - xorl %r13d,%ebx + xorl 40(%rsp),%r14d leal -1894007588(%rbp,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 60(%rsp),%edx addl %eax,%esi + roll $1,%r14d andl %r11d,%ebx - roll $1,%edx - addl %ebx,%esi - roll $30,%r11d - movl %edx,8(%rsp) addl %ecx,%esi - movl 12(%rsp),%ebp - movl %r11d,%eax - movl %r11d,%ebx - xorl 20(%rsp),%ebp - andl %r12d,%eax + roll $30,%r11d + addl %ebx,%esi + xorl 12(%rsp),%edx + movl %r12d,%eax + movl %r14d,8(%rsp) + movl %r12d,%ebx + xorl 20(%rsp),%edx + andl %r11d,%eax movl %esi,%ecx - xorl 44(%rsp),%ebp - xorl %r12d,%ebx - leal -1894007588(%rdx,%r13,1),%r13d + xorl 44(%rsp),%edx + leal -1894007588(%r14,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 0(%rsp),%ebp addl %eax,%r13d + roll $1,%edx andl %edi,%ebx - roll $1,%ebp - addl %ebx,%r13d - roll $30,%edi - movl %ebp,12(%rsp) addl %ecx,%r13d - movl 16(%rsp),%edx - movl %edi,%eax - movl %edi,%ebx - xorl 24(%rsp),%edx - andl %r11d,%eax + roll $30,%edi + addl %ebx,%r13d + xorl 16(%rsp),%ebp + movl %r11d,%eax + movl %edx,12(%rsp) + movl %r11d,%ebx + xorl 24(%rsp),%ebp + andl 
%edi,%eax movl %r13d,%ecx - xorl 48(%rsp),%edx - xorl %r11d,%ebx - leal -1894007588(%rbp,%r12,1),%r12d + xorl 48(%rsp),%ebp + leal -1894007588(%rdx,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 4(%rsp),%edx addl %eax,%r12d + roll $1,%ebp andl %esi,%ebx - roll $1,%edx - addl %ebx,%r12d - roll $30,%esi - movl %edx,16(%rsp) addl %ecx,%r12d - movl 20(%rsp),%ebp - movl %esi,%eax - movl %esi,%ebx - xorl 28(%rsp),%ebp - andl %edi,%eax + roll $30,%esi + addl %ebx,%r12d + xorl 20(%rsp),%r14d + movl %edi,%eax + movl %ebp,16(%rsp) + movl %edi,%ebx + xorl 28(%rsp),%r14d + andl %esi,%eax movl %r12d,%ecx - xorl 52(%rsp),%ebp - xorl %edi,%ebx - leal -1894007588(%rdx,%r11,1),%r11d + xorl 52(%rsp),%r14d + leal -1894007588(%rbp,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 8(%rsp),%ebp addl %eax,%r11d + roll $1,%r14d andl %r13d,%ebx - roll $1,%ebp - addl %ebx,%r11d - roll $30,%r13d - movl %ebp,20(%rsp) addl %ecx,%r11d - movl 24(%rsp),%edx - movl %r13d,%eax - movl %r13d,%ebx + roll $30,%r13d + addl %ebx,%r11d + xorl 24(%rsp),%edx + movl %esi,%eax + movl %r14d,20(%rsp) + movl %esi,%ebx xorl 32(%rsp),%edx - andl %esi,%eax + andl %r13d,%eax movl %r11d,%ecx xorl 56(%rsp),%edx - xorl %esi,%ebx - leal -1894007588(%rbp,%rdi,1),%edi + leal -1894007588(%r14,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 12(%rsp),%edx addl %eax,%edi - andl %r12d,%ebx roll $1,%edx - addl %ebx,%edi + andl %r12d,%ebx + addl %ecx,%edi roll $30,%r12d + addl %ebx,%edi + xorl 28(%rsp),%ebp + movl %r13d,%eax movl %edx,24(%rsp) - addl %ecx,%edi - movl 28(%rsp),%ebp - movl %r12d,%eax - movl %r12d,%ebx + movl %r13d,%ebx xorl 36(%rsp),%ebp - andl %r13d,%eax + andl %r12d,%eax movl %edi,%ecx xorl 60(%rsp),%ebp - xorl %r13d,%ebx leal -1894007588(%rdx,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 16(%rsp),%ebp addl %eax,%esi - andl %r11d,%ebx roll $1,%ebp - addl %ebx,%esi + andl %r11d,%ebx + addl %ecx,%esi roll $30,%r11d + addl %ebx,%esi + xorl 32(%rsp),%r14d + movl %r12d,%eax movl %ebp,28(%rsp) - addl %ecx,%esi - movl 32(%rsp),%edx - movl %r11d,%eax - movl %r11d,%ebx - xorl 40(%rsp),%edx - andl %r12d,%eax + movl %r12d,%ebx + xorl 40(%rsp),%r14d + andl %r11d,%eax movl %esi,%ecx - xorl 0(%rsp),%edx - xorl %r12d,%ebx + xorl 0(%rsp),%r14d leal -1894007588(%rbp,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 20(%rsp),%edx addl %eax,%r13d + roll $1,%r14d andl %edi,%ebx - roll $1,%edx - addl %ebx,%r13d - roll $30,%edi - movl %edx,32(%rsp) addl %ecx,%r13d - movl 36(%rsp),%ebp - movl %edi,%eax - movl %edi,%ebx - xorl 44(%rsp),%ebp - andl %r11d,%eax + roll $30,%edi + addl %ebx,%r13d + xorl 36(%rsp),%edx + movl %r11d,%eax + movl %r14d,32(%rsp) + movl %r11d,%ebx + xorl 44(%rsp),%edx + andl %edi,%eax movl %r13d,%ecx - xorl 4(%rsp),%ebp - xorl %r11d,%ebx - leal -1894007588(%rdx,%r12,1),%r12d + xorl 4(%rsp),%edx + leal -1894007588(%r14,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 24(%rsp),%ebp addl %eax,%r12d + roll $1,%edx andl %esi,%ebx - roll $1,%ebp - addl %ebx,%r12d - roll $30,%esi - movl %ebp,36(%rsp) addl %ecx,%r12d - movl 40(%rsp),%edx - movl %esi,%eax - movl %esi,%ebx - xorl 48(%rsp),%edx - andl %edi,%eax + roll $30,%esi + addl %ebx,%r12d + xorl 40(%rsp),%ebp + movl %edi,%eax + movl %edx,36(%rsp) + movl %edi,%ebx + xorl 48(%rsp),%ebp + andl %esi,%eax movl %r12d,%ecx - xorl 8(%rsp),%edx - xorl %edi,%ebx - leal -1894007588(%rbp,%r11,1),%r11d + xorl 8(%rsp),%ebp + leal -1894007588(%rdx,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 28(%rsp),%edx addl %eax,%r11d + roll $1,%ebp andl %r13d,%ebx - roll $1,%edx - addl %ebx,%r11d - 
roll $30,%r13d - movl %edx,40(%rsp) addl %ecx,%r11d - movl 44(%rsp),%ebp - movl %r13d,%eax - movl %r13d,%ebx - xorl 52(%rsp),%ebp - andl %esi,%eax + roll $30,%r13d + addl %ebx,%r11d + xorl 44(%rsp),%r14d + movl %esi,%eax + movl %ebp,40(%rsp) + movl %esi,%ebx + xorl 52(%rsp),%r14d + andl %r13d,%eax movl %r11d,%ecx - xorl 12(%rsp),%ebp - xorl %esi,%ebx - leal -1894007588(%rdx,%rdi,1),%edi + xorl 12(%rsp),%r14d + leal -1894007588(%rbp,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 32(%rsp),%ebp addl %eax,%edi + roll $1,%r14d andl %r12d,%ebx - roll $1,%ebp - addl %ebx,%edi - roll $30,%r12d - movl %ebp,44(%rsp) addl %ecx,%edi - movl 48(%rsp),%edx - movl %r12d,%eax - movl %r12d,%ebx + roll $30,%r12d + addl %ebx,%edi + xorl 48(%rsp),%edx + movl %r13d,%eax + movl %r14d,44(%rsp) + movl %r13d,%ebx xorl 56(%rsp),%edx - andl %r13d,%eax + andl %r12d,%eax movl %edi,%ecx xorl 16(%rsp),%edx - xorl %r13d,%ebx - leal -1894007588(%rbp,%rsi,1),%esi + leal -1894007588(%r14,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 36(%rsp),%edx addl %eax,%esi - andl %r11d,%ebx roll $1,%edx - addl %ebx,%esi + andl %r11d,%ebx + addl %ecx,%esi roll $30,%r11d + addl %ebx,%esi + xorl 52(%rsp),%ebp + movl %edi,%eax movl %edx,48(%rsp) - addl %ecx,%esi - movl 52(%rsp),%ebp - movl %r11d,%eax movl %esi,%ecx xorl 60(%rsp),%ebp - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal -899497514(%rdx,%r13,1),%r13d xorl 20(%rsp),%ebp - xorl %r12d,%eax + leal -899497514(%rdx,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 40(%rsp),%ebp roll $30,%edi addl %eax,%r13d roll $1,%ebp + xorl 56(%rsp),%r14d + movl %esi,%eax movl %ebp,52(%rsp) - movl 56(%rsp),%edx - movl %edi,%eax movl %r13d,%ecx - xorl 0(%rsp),%edx - xorl %esi,%eax + xorl 0(%rsp),%r14d + xorl %r11d,%eax roll $5,%ecx + xorl 24(%rsp),%r14d leal -899497514(%rbp,%r12,1),%r12d - xorl 24(%rsp),%edx - xorl %r11d,%eax + xorl %edi,%eax addl %ecx,%r12d - xorl 44(%rsp),%edx roll $30,%esi addl %eax,%r12d - roll $1,%edx - movl %edx,56(%rsp) - movl 60(%rsp),%ebp - movl %esi,%eax + roll $1,%r14d + xorl 60(%rsp),%edx + movl %r13d,%eax + movl %r14d,56(%rsp) movl %r12d,%ecx - xorl 4(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - leal -899497514(%rdx,%r11,1),%r11d - xorl 28(%rsp),%ebp + xorl 4(%rsp),%edx xorl %edi,%eax + roll $5,%ecx + xorl 28(%rsp),%edx + leal -899497514(%r14,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 48(%rsp),%ebp roll $30,%r13d addl %eax,%r11d - roll $1,%ebp - movl %ebp,60(%rsp) - movl 0(%rsp),%edx - movl %r13d,%eax + roll $1,%edx + xorl 0(%rsp),%ebp + movl %r12d,%eax + movl %edx,60(%rsp) movl %r11d,%ecx - xorl 8(%rsp),%edx - xorl %r12d,%eax - roll $5,%ecx - leal -899497514(%rbp,%rdi,1),%edi - xorl 32(%rsp),%edx + xorl 8(%rsp),%ebp xorl %esi,%eax + roll $5,%ecx + xorl 32(%rsp),%ebp + leal -899497514(%rdx,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 52(%rsp),%edx roll $30,%r12d addl %eax,%edi - roll $1,%edx - movl %edx,0(%rsp) - movl 4(%rsp),%ebp - movl %r12d,%eax + roll $1,%ebp + xorl 4(%rsp),%r14d + movl %r11d,%eax + movl %ebp,0(%rsp) movl %edi,%ecx - xorl 12(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - leal -899497514(%rdx,%rsi,1),%esi - xorl 36(%rsp),%ebp + xorl 12(%rsp),%r14d xorl %r13d,%eax + roll $5,%ecx + xorl 36(%rsp),%r14d + leal -899497514(%rbp,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 56(%rsp),%ebp roll $30,%r11d addl %eax,%esi - roll $1,%ebp - movl %ebp,4(%rsp) - movl 8(%rsp),%edx - movl %r11d,%eax + roll $1,%r14d + xorl 8(%rsp),%edx + movl %edi,%eax + movl %r14d,4(%rsp) movl %esi,%ecx xorl 16(%rsp),%edx - xorl %edi,%eax + 
xorl %r12d,%eax roll $5,%ecx - leal -899497514(%rbp,%r13,1),%r13d xorl 40(%rsp),%edx - xorl %r12d,%eax + leal -899497514(%r14,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 60(%rsp),%edx roll $30,%edi addl %eax,%r13d roll $1,%edx + xorl 12(%rsp),%ebp + movl %esi,%eax movl %edx,8(%rsp) - movl 12(%rsp),%ebp - movl %edi,%eax movl %r13d,%ecx xorl 20(%rsp),%ebp - xorl %esi,%eax + xorl %r11d,%eax roll $5,%ecx - leal -899497514(%rdx,%r12,1),%r12d xorl 44(%rsp),%ebp - xorl %r11d,%eax + leal -899497514(%rdx,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 0(%rsp),%ebp roll $30,%esi addl %eax,%r12d roll $1,%ebp + xorl 16(%rsp),%r14d + movl %r13d,%eax movl %ebp,12(%rsp) - movl 16(%rsp),%edx - movl %esi,%eax movl %r12d,%ecx - xorl 24(%rsp),%edx - xorl %r13d,%eax + xorl 24(%rsp),%r14d + xorl %edi,%eax roll $5,%ecx + xorl 48(%rsp),%r14d leal -899497514(%rbp,%r11,1),%r11d - xorl 48(%rsp),%edx - xorl %edi,%eax + xorl %esi,%eax addl %ecx,%r11d - xorl 4(%rsp),%edx roll $30,%r13d addl %eax,%r11d - roll $1,%edx - movl %edx,16(%rsp) - movl 20(%rsp),%ebp - movl %r13d,%eax + roll $1,%r14d + xorl 20(%rsp),%edx + movl %r12d,%eax + movl %r14d,16(%rsp) movl %r11d,%ecx - xorl 28(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - leal -899497514(%rdx,%rdi,1),%edi - xorl 52(%rsp),%ebp + xorl 28(%rsp),%edx xorl %esi,%eax + roll $5,%ecx + xorl 52(%rsp),%edx + leal -899497514(%r14,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 8(%rsp),%ebp roll $30,%r12d addl %eax,%edi - roll $1,%ebp - movl %ebp,20(%rsp) - movl 24(%rsp),%edx - movl %r12d,%eax + roll $1,%edx + xorl 24(%rsp),%ebp + movl %r11d,%eax + movl %edx,20(%rsp) movl %edi,%ecx - xorl 32(%rsp),%edx - xorl %r11d,%eax - roll $5,%ecx - leal -899497514(%rbp,%rsi,1),%esi - xorl 56(%rsp),%edx + xorl 32(%rsp),%ebp xorl %r13d,%eax + roll $5,%ecx + xorl 56(%rsp),%ebp + leal -899497514(%rdx,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 12(%rsp),%edx roll $30,%r11d addl %eax,%esi - roll $1,%edx - movl %edx,24(%rsp) - movl 28(%rsp),%ebp - movl %r11d,%eax + roll $1,%ebp + xorl 28(%rsp),%r14d + movl %edi,%eax + movl %ebp,24(%rsp) movl %esi,%ecx - xorl 36(%rsp),%ebp - xorl %edi,%eax - roll $5,%ecx - leal -899497514(%rdx,%r13,1),%r13d - xorl 60(%rsp),%ebp + xorl 36(%rsp),%r14d xorl %r12d,%eax + roll $5,%ecx + xorl 60(%rsp),%r14d + leal -899497514(%rbp,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 16(%rsp),%ebp roll $30,%edi addl %eax,%r13d - roll $1,%ebp - movl %ebp,28(%rsp) - movl 32(%rsp),%edx - movl %edi,%eax + roll $1,%r14d + xorl 32(%rsp),%edx + movl %esi,%eax + movl %r14d,28(%rsp) movl %r13d,%ecx xorl 40(%rsp),%edx - xorl %esi,%eax + xorl %r11d,%eax roll $5,%ecx - leal -899497514(%rbp,%r12,1),%r12d xorl 0(%rsp),%edx - xorl %r11d,%eax + leal -899497514(%r14,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 20(%rsp),%edx roll $30,%esi addl %eax,%r12d roll $1,%edx - movl %edx,32(%rsp) - movl 36(%rsp),%ebp - movl %esi,%eax + xorl 36(%rsp),%ebp + movl %r13d,%eax + movl %r12d,%ecx xorl 44(%rsp),%ebp - xorl %r13d,%eax + xorl %edi,%eax roll $5,%ecx - leal -899497514(%rdx,%r11,1),%r11d xorl 4(%rsp),%ebp - xorl %edi,%eax + leal -899497514(%rdx,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 24(%rsp),%ebp roll $30,%r13d addl %eax,%r11d roll $1,%ebp - movl %ebp,36(%rsp) - movl 40(%rsp),%edx - movl %r13d,%eax + xorl 40(%rsp),%r14d + movl %r12d,%eax + movl %r11d,%ecx - xorl 48(%rsp),%edx - xorl %r12d,%eax + xorl 48(%rsp),%r14d + xorl %esi,%eax roll $5,%ecx + xorl 8(%rsp),%r14d leal -899497514(%rbp,%rdi,1),%edi - xorl 8(%rsp),%edx - xorl %esi,%eax + xorl 
%r13d,%eax addl %ecx,%edi - xorl 28(%rsp),%edx roll $30,%r12d addl %eax,%edi - roll $1,%edx - movl %edx,40(%rsp) - movl 44(%rsp),%ebp - movl %r12d,%eax + roll $1,%r14d + xorl 44(%rsp),%edx + movl %r11d,%eax + movl %edi,%ecx - xorl 52(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - leal -899497514(%rdx,%rsi,1),%esi - xorl 12(%rsp),%ebp + xorl 52(%rsp),%edx xorl %r13d,%eax + roll $5,%ecx + xorl 12(%rsp),%edx + leal -899497514(%r14,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 32(%rsp),%ebp roll $30,%r11d addl %eax,%esi - roll $1,%ebp - movl %ebp,44(%rsp) - movl 48(%rsp),%edx - movl %r11d,%eax - movl %esi,%ecx - xorl 56(%rsp),%edx - xorl %edi,%eax - roll $5,%ecx - leal -899497514(%rbp,%r13,1),%r13d - xorl 16(%rsp),%edx + roll $1,%edx + xorl 48(%rsp),%ebp + movl %edi,%eax + + movl %esi,%ecx + xorl 56(%rsp),%ebp xorl %r12d,%eax + roll $5,%ecx + xorl 16(%rsp),%ebp + leal -899497514(%rdx,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 36(%rsp),%edx roll $30,%edi addl %eax,%r13d - roll $1,%edx - movl %edx,48(%rsp) - movl 52(%rsp),%ebp - movl %edi,%eax + roll $1,%ebp + xorl 52(%rsp),%r14d + movl %esi,%eax + movl %r13d,%ecx - xorl 60(%rsp),%ebp - xorl %esi,%eax - roll $5,%ecx - leal -899497514(%rdx,%r12,1),%r12d - xorl 20(%rsp),%ebp + xorl 60(%rsp),%r14d xorl %r11d,%eax + roll $5,%ecx + xorl 20(%rsp),%r14d + leal -899497514(%rbp,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 40(%rsp),%ebp roll $30,%esi addl %eax,%r12d - roll $1,%ebp - movl 56(%rsp),%edx - movl %esi,%eax + roll $1,%r14d + xorl 56(%rsp),%edx + movl %r13d,%eax + movl %r12d,%ecx xorl 0(%rsp),%edx - xorl %r13d,%eax + xorl %edi,%eax roll $5,%ecx - leal -899497514(%rbp,%r11,1),%r11d xorl 24(%rsp),%edx - xorl %edi,%eax + leal -899497514(%r14,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 44(%rsp),%edx roll $30,%r13d addl %eax,%r11d roll $1,%edx - movl 60(%rsp),%ebp - movl %r13d,%eax + xorl 60(%rsp),%ebp + movl %r12d,%eax + movl %r11d,%ecx xorl 4(%rsp),%ebp - xorl %r12d,%eax + xorl %esi,%eax roll $5,%ecx - leal -899497514(%rdx,%rdi,1),%edi xorl 28(%rsp),%ebp - xorl %esi,%eax + leal -899497514(%rdx,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 48(%rsp),%ebp roll $30,%r12d addl %eax,%edi roll $1,%ebp - movl %r12d,%eax + movl %r11d,%eax movl %edi,%ecx - xorl %r11d,%eax + xorl %r13d,%eax leal -899497514(%rbp,%rsi,1),%esi roll $5,%ecx - xorl %r13d,%eax + xorl %r12d,%eax addl %ecx,%esi roll $30,%r11d addl %eax,%esi @@ -1319,29 +1277,218 @@ sha1_block_data_order: jnz .Lloop movq 64(%rsp),%rsi - movq (%rsi),%r13 - movq 8(%rsi),%r12 - movq 16(%rsi),%rbp - movq 24(%rsi),%rbx - leaq 32(%rsi),%rsp +.cfi_def_cfa %rsi,8 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue: .byte 0xf3,0xc3 +.cfi_endproc .size sha1_block_data_order,.-sha1_block_data_order +.type sha1_block_data_order_shaext,@function +.align 32 +sha1_block_data_order_shaext: +_shaext_shortcut: +.cfi_startproc + movdqu (%rdi),%xmm0 + movd 16(%rdi),%xmm1 + movdqa K_XX_XX+160(%rip),%xmm3 + + movdqu (%rsi),%xmm4 + pshufd $27,%xmm0,%xmm0 + movdqu 16(%rsi),%xmm5 + pshufd $27,%xmm1,%xmm1 + movdqu 32(%rsi),%xmm6 +.byte 102,15,56,0,227 + movdqu 48(%rsi),%xmm7 +.byte 102,15,56,0,235 +.byte 102,15,56,0,243 + movdqa %xmm1,%xmm9 +.byte 102,15,56,0,251 + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + decq %rdx + leaq 64(%rsi),%r8 + paddd %xmm4,%xmm1 + cmovneq 
%r8,%rsi + movdqa %xmm0,%xmm8 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,0 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,0 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,1 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,1 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,2 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,2 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 + movdqu (%rsi),%xmm4 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,3 +.byte 15,56,200,213 + movdqu 16(%rsi),%xmm5 +.byte 102,15,56,0,227 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 15,56,200,206 + movdqu 32(%rsi),%xmm6 +.byte 102,15,56,0,235 + + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,3 +.byte 15,56,200,215 + movdqu 48(%rsi),%xmm7 +.byte 102,15,56,0,243 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 65,15,56,200,201 +.byte 102,15,56,0,251 + + paddd %xmm8,%xmm0 + movdqa %xmm1,%xmm9 + + jnz .Loop_shaext + + pshufd $27,%xmm0,%xmm0 + pshufd $27,%xmm1,%xmm1 + movdqu %xmm0,(%rdi) + movd %xmm1,16(%rdi) +.cfi_endproc + .byte 0xf3,0xc3 +.size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext .type sha1_block_data_order_ssse3,@function .align 16 sha1_block_data_order_ssse3: _ssse3_shortcut: +.cfi_startproc + movq %rsp,%r11 +.cfi_def_cfa_register %r11 pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 leaq -64(%rsp),%rsp + andq $-64,%rsp movq %rdi,%r8 movq %rsi,%r9 movq %rdx,%r10 shlq $6,%r10 addq %r9,%r10 - leaq K_XX_XX(%rip),%r11 + leaq K_XX_XX+64(%rip),%r14 movl 0(%r8),%eax movl 4(%r8),%ebx @@ -1353,18 +1500,18 @@ _ssse3_shortcut: xorl %edx,%edi andl %edi,%esi - movdqa 64(%r11),%xmm6 - movdqa 0(%r11),%xmm9 + movdqa 64(%r14),%xmm6 + movdqa -64(%r14),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 movdqu 48(%r9),%xmm3 
.byte 102,15,56,0,198 - addq $64,%r9 .byte 102,15,56,0,206 .byte 102,15,56,0,214 -.byte 102,15,56,0,222 + addq $64,%r9 paddd %xmm9,%xmm0 +.byte 102,15,56,0,222 paddd %xmm9,%xmm1 paddd %xmm9,%xmm2 movdqa %xmm0,0(%rsp) @@ -1376,24 +1523,24 @@ _ssse3_shortcut: jmp .Loop_ssse3 .align 16 .Loop_ssse3: - movdqa %xmm1,%xmm4 rorl $2,%ebx + pshufd $238,%xmm0,%xmm4 xorl %edx,%esi movdqa %xmm3,%xmm8 -.byte 102,15,58,15,224,8 + paddd %xmm3,%xmm9 movl %eax,%edi addl 0(%rsp),%ebp - paddd %xmm3,%xmm9 + punpcklqdq %xmm1,%xmm4 xorl %ecx,%ebx roll $5,%eax - psrldq $4,%xmm8 addl %esi,%ebp + psrldq $4,%xmm8 andl %ebx,%edi - pxor %xmm0,%xmm4 xorl %ecx,%ebx + pxor %xmm0,%xmm4 addl %eax,%ebp - pxor %xmm2,%xmm8 rorl $7,%eax + pxor %xmm2,%xmm8 xorl %ecx,%edi movl %ebp,%esi addl 4(%rsp),%edx @@ -1404,57 +1551,57 @@ _ssse3_shortcut: addl %edi,%edx andl %eax,%esi movdqa %xmm4,%xmm10 - movdqa %xmm4,%xmm8 xorl %ebx,%eax addl %ebp,%edx rorl $7,%ebp + movdqa %xmm4,%xmm8 xorl %ebx,%esi pslldq $12,%xmm10 paddd %xmm4,%xmm4 movl %edx,%edi addl 8(%rsp),%ecx + psrld $31,%xmm8 xorl %eax,%ebp roll $5,%edx - psrld $31,%xmm8 addl %esi,%ecx - andl %ebp,%edi movdqa %xmm10,%xmm9 + andl %ebp,%edi xorl %eax,%ebp - addl %edx,%ecx psrld $30,%xmm10 - por %xmm8,%xmm4 + addl %edx,%ecx rorl $7,%edx + por %xmm8,%xmm4 xorl %eax,%edi movl %ecx,%esi addl 12(%rsp),%ebx pslld $2,%xmm9 pxor %xmm10,%xmm4 xorl %ebp,%edx + movdqa -64(%r14),%xmm10 roll $5,%ecx - movdqa 0(%r11),%xmm10 addl %edi,%ebx andl %edx,%esi pxor %xmm9,%xmm4 xorl %ebp,%edx addl %ecx,%ebx - movdqa %xmm2,%xmm5 rorl $7,%ecx + pshufd $238,%xmm1,%xmm5 xorl %ebp,%esi movdqa %xmm4,%xmm9 -.byte 102,15,58,15,233,8 + paddd %xmm4,%xmm10 movl %ebx,%edi addl 16(%rsp),%eax - paddd %xmm4,%xmm10 + punpcklqdq %xmm2,%xmm5 xorl %edx,%ecx roll $5,%ebx - psrldq $4,%xmm9 addl %esi,%eax + psrldq $4,%xmm9 andl %ecx,%edi - pxor %xmm1,%xmm5 xorl %edx,%ecx + pxor %xmm1,%xmm5 addl %ebx,%eax - pxor %xmm3,%xmm9 rorl $7,%ebx + pxor %xmm3,%xmm9 xorl %edx,%edi movl %eax,%esi addl 20(%rsp),%ebp @@ -1465,57 +1612,57 @@ _ssse3_shortcut: addl %edi,%ebp andl %ebx,%esi movdqa %xmm5,%xmm8 - movdqa %xmm5,%xmm9 xorl %ecx,%ebx addl %eax,%ebp rorl $7,%eax + movdqa %xmm5,%xmm9 xorl %ecx,%esi pslldq $12,%xmm8 paddd %xmm5,%xmm5 movl %ebp,%edi addl 24(%rsp),%edx + psrld $31,%xmm9 xorl %ebx,%eax roll $5,%ebp - psrld $31,%xmm9 addl %esi,%edx - andl %eax,%edi movdqa %xmm8,%xmm10 + andl %eax,%edi xorl %ebx,%eax - addl %ebp,%edx psrld $30,%xmm8 - por %xmm9,%xmm5 + addl %ebp,%edx rorl $7,%ebp + por %xmm9,%xmm5 xorl %ebx,%edi movl %edx,%esi addl 28(%rsp),%ecx pslld $2,%xmm10 pxor %xmm8,%xmm5 xorl %eax,%ebp + movdqa -32(%r14),%xmm8 roll $5,%edx - movdqa 16(%r11),%xmm8 addl %edi,%ecx andl %ebp,%esi pxor %xmm10,%xmm5 xorl %eax,%ebp addl %edx,%ecx - movdqa %xmm3,%xmm6 rorl $7,%edx + pshufd $238,%xmm2,%xmm6 xorl %eax,%esi movdqa %xmm5,%xmm10 -.byte 102,15,58,15,242,8 + paddd %xmm5,%xmm8 movl %ecx,%edi addl 32(%rsp),%ebx - paddd %xmm5,%xmm8 + punpcklqdq %xmm3,%xmm6 xorl %ebp,%edx roll $5,%ecx - psrldq $4,%xmm10 addl %esi,%ebx + psrldq $4,%xmm10 andl %edx,%edi - pxor %xmm2,%xmm6 xorl %ebp,%edx + pxor %xmm2,%xmm6 addl %ecx,%ebx - pxor %xmm4,%xmm10 rorl $7,%ecx + pxor %xmm4,%xmm10 xorl %ebp,%edi movl %ebx,%esi addl 36(%rsp),%eax @@ -1526,57 +1673,57 @@ _ssse3_shortcut: addl %edi,%eax andl %ecx,%esi movdqa %xmm6,%xmm9 - movdqa %xmm6,%xmm10 xorl %edx,%ecx addl %ebx,%eax rorl $7,%ebx + movdqa %xmm6,%xmm10 xorl %edx,%esi pslldq $12,%xmm9 paddd %xmm6,%xmm6 movl %eax,%edi addl 40(%rsp),%ebp + psrld $31,%xmm10 xorl %ecx,%ebx roll $5,%eax - psrld 
$31,%xmm10 addl %esi,%ebp - andl %ebx,%edi movdqa %xmm9,%xmm8 + andl %ebx,%edi xorl %ecx,%ebx - addl %eax,%ebp psrld $30,%xmm9 - por %xmm10,%xmm6 + addl %eax,%ebp rorl $7,%eax + por %xmm10,%xmm6 xorl %ecx,%edi movl %ebp,%esi addl 44(%rsp),%edx pslld $2,%xmm8 pxor %xmm9,%xmm6 xorl %ebx,%eax + movdqa -32(%r14),%xmm9 roll $5,%ebp - movdqa 16(%r11),%xmm9 addl %edi,%edx andl %eax,%esi pxor %xmm8,%xmm6 xorl %ebx,%eax addl %ebp,%edx - movdqa %xmm4,%xmm7 rorl $7,%ebp + pshufd $238,%xmm3,%xmm7 xorl %ebx,%esi movdqa %xmm6,%xmm8 -.byte 102,15,58,15,251,8 + paddd %xmm6,%xmm9 movl %edx,%edi addl 48(%rsp),%ecx - paddd %xmm6,%xmm9 + punpcklqdq %xmm4,%xmm7 xorl %eax,%ebp roll $5,%edx - psrldq $4,%xmm8 addl %esi,%ecx + psrldq $4,%xmm8 andl %ebp,%edi - pxor %xmm3,%xmm7 xorl %eax,%ebp + pxor %xmm3,%xmm7 addl %edx,%ecx - pxor %xmm5,%xmm8 rorl $7,%edx + pxor %xmm5,%xmm8 xorl %eax,%edi movl %ecx,%esi addl 52(%rsp),%ebx @@ -1587,78 +1734,78 @@ _ssse3_shortcut: addl %edi,%ebx andl %edx,%esi movdqa %xmm7,%xmm10 - movdqa %xmm7,%xmm8 xorl %ebp,%edx addl %ecx,%ebx rorl $7,%ecx + movdqa %xmm7,%xmm8 xorl %ebp,%esi pslldq $12,%xmm10 paddd %xmm7,%xmm7 movl %ebx,%edi addl 56(%rsp),%eax + psrld $31,%xmm8 xorl %edx,%ecx roll $5,%ebx - psrld $31,%xmm8 addl %esi,%eax - andl %ecx,%edi movdqa %xmm10,%xmm9 + andl %ecx,%edi xorl %edx,%ecx - addl %ebx,%eax psrld $30,%xmm10 - por %xmm8,%xmm7 + addl %ebx,%eax rorl $7,%ebx + por %xmm8,%xmm7 xorl %edx,%edi movl %eax,%esi addl 60(%rsp),%ebp pslld $2,%xmm9 pxor %xmm10,%xmm7 xorl %ecx,%ebx + movdqa -32(%r14),%xmm10 roll $5,%eax - movdqa 16(%r11),%xmm10 addl %edi,%ebp andl %ebx,%esi pxor %xmm9,%xmm7 + pshufd $238,%xmm6,%xmm9 xorl %ecx,%ebx addl %eax,%ebp - movdqa %xmm7,%xmm9 rorl $7,%eax pxor %xmm4,%xmm0 -.byte 102,68,15,58,15,206,8 xorl %ecx,%esi movl %ebp,%edi addl 0(%rsp),%edx - pxor %xmm1,%xmm0 + punpcklqdq %xmm7,%xmm9 xorl %ebx,%eax roll $5,%ebp - movdqa %xmm10,%xmm8 - paddd %xmm7,%xmm10 + pxor %xmm1,%xmm0 addl %esi,%edx andl %eax,%edi - pxor %xmm9,%xmm0 + movdqa %xmm10,%xmm8 xorl %ebx,%eax + paddd %xmm7,%xmm10 addl %ebp,%edx + pxor %xmm9,%xmm0 rorl $7,%ebp xorl %ebx,%edi - movdqa %xmm0,%xmm9 - movdqa %xmm10,48(%rsp) movl %edx,%esi addl 4(%rsp),%ecx + movdqa %xmm0,%xmm9 xorl %eax,%ebp roll $5,%edx - pslld $2,%xmm0 + movdqa %xmm10,48(%rsp) addl %edi,%ecx andl %ebp,%esi - psrld $30,%xmm9 xorl %eax,%ebp + pslld $2,%xmm0 addl %edx,%ecx rorl $7,%edx + psrld $30,%xmm9 xorl %eax,%esi movl %ecx,%edi addl 8(%rsp),%ebx por %xmm9,%xmm0 xorl %ebp,%edx roll $5,%ecx - movdqa %xmm0,%xmm10 + pshufd $238,%xmm7,%xmm10 addl %esi,%ebx andl %edx,%edi xorl %ebp,%edx @@ -1671,18 +1818,18 @@ _ssse3_shortcut: xorl %edx,%esi rorl $7,%ecx addl %ebx,%eax - addl 16(%rsp),%ebp pxor %xmm5,%xmm1 -.byte 102,68,15,58,15,215,8 + addl 16(%rsp),%ebp xorl %ecx,%esi + punpcklqdq %xmm0,%xmm10 movl %eax,%edi roll $5,%eax pxor %xmm2,%xmm1 addl %esi,%ebp xorl %ecx,%edi movdqa %xmm8,%xmm9 - paddd %xmm0,%xmm8 rorl $7,%ebx + paddd %xmm0,%xmm8 addl %eax,%ebp pxor %xmm10,%xmm1 addl 20(%rsp),%edx @@ -1690,43 +1837,43 @@ _ssse3_shortcut: movl %ebp,%esi roll $5,%ebp movdqa %xmm1,%xmm10 - movdqa %xmm8,0(%rsp) addl %edi,%edx xorl %ebx,%esi + movdqa %xmm8,0(%rsp) rorl $7,%eax addl %ebp,%edx - pslld $2,%xmm1 addl 24(%rsp),%ecx + pslld $2,%xmm1 xorl %eax,%esi - psrld $30,%xmm10 movl %edx,%edi + psrld $30,%xmm10 roll $5,%edx addl %esi,%ecx xorl %eax,%edi rorl $7,%ebp - addl %edx,%ecx por %xmm10,%xmm1 + addl %edx,%ecx addl 28(%rsp),%ebx + pshufd $238,%xmm0,%xmm8 xorl %ebp,%edi - movdqa %xmm1,%xmm8 movl %ecx,%esi roll $5,%ecx addl %edi,%ebx 
xorl %ebp,%esi rorl $7,%edx addl %ecx,%ebx - addl 32(%rsp),%eax pxor %xmm6,%xmm2 -.byte 102,68,15,58,15,192,8 + addl 32(%rsp),%eax xorl %edx,%esi + punpcklqdq %xmm1,%xmm8 movl %ebx,%edi roll $5,%ebx pxor %xmm3,%xmm2 addl %esi,%eax xorl %edx,%edi - movdqa 32(%r11),%xmm10 - paddd %xmm1,%xmm9 + movdqa 0(%r14),%xmm10 rorl $7,%ecx + paddd %xmm1,%xmm9 addl %ebx,%eax pxor %xmm8,%xmm2 addl 36(%rsp),%ebp @@ -1734,43 +1881,43 @@ _ssse3_shortcut: movl %eax,%esi roll $5,%eax movdqa %xmm2,%xmm8 - movdqa %xmm9,16(%rsp) addl %edi,%ebp xorl %ecx,%esi + movdqa %xmm9,16(%rsp) rorl $7,%ebx addl %eax,%ebp - pslld $2,%xmm2 addl 40(%rsp),%edx + pslld $2,%xmm2 xorl %ebx,%esi - psrld $30,%xmm8 movl %ebp,%edi + psrld $30,%xmm8 roll $5,%ebp addl %esi,%edx xorl %ebx,%edi rorl $7,%eax - addl %ebp,%edx por %xmm8,%xmm2 + addl %ebp,%edx addl 44(%rsp),%ecx + pshufd $238,%xmm1,%xmm9 xorl %eax,%edi - movdqa %xmm2,%xmm9 movl %edx,%esi roll $5,%edx addl %edi,%ecx xorl %eax,%esi rorl $7,%ebp addl %edx,%ecx - addl 48(%rsp),%ebx pxor %xmm7,%xmm3 -.byte 102,68,15,58,15,201,8 + addl 48(%rsp),%ebx xorl %ebp,%esi + punpcklqdq %xmm2,%xmm9 movl %ecx,%edi roll $5,%ecx pxor %xmm4,%xmm3 addl %esi,%ebx xorl %ebp,%edi movdqa %xmm10,%xmm8 - paddd %xmm2,%xmm10 rorl $7,%edx + paddd %xmm2,%xmm10 addl %ecx,%ebx pxor %xmm9,%xmm3 addl 52(%rsp),%eax @@ -1778,43 +1925,43 @@ _ssse3_shortcut: movl %ebx,%esi roll $5,%ebx movdqa %xmm3,%xmm9 - movdqa %xmm10,32(%rsp) addl %edi,%eax xorl %edx,%esi + movdqa %xmm10,32(%rsp) rorl $7,%ecx addl %ebx,%eax - pslld $2,%xmm3 addl 56(%rsp),%ebp + pslld $2,%xmm3 xorl %ecx,%esi - psrld $30,%xmm9 movl %eax,%edi + psrld $30,%xmm9 roll $5,%eax addl %esi,%ebp xorl %ecx,%edi rorl $7,%ebx - addl %eax,%ebp por %xmm9,%xmm3 + addl %eax,%ebp addl 60(%rsp),%edx + pshufd $238,%xmm2,%xmm10 xorl %ebx,%edi - movdqa %xmm3,%xmm10 movl %ebp,%esi roll $5,%ebp addl %edi,%edx xorl %ebx,%esi rorl $7,%eax addl %ebp,%edx - addl 0(%rsp),%ecx pxor %xmm0,%xmm4 -.byte 102,68,15,58,15,210,8 + addl 0(%rsp),%ecx xorl %eax,%esi + punpcklqdq %xmm3,%xmm10 movl %edx,%edi roll $5,%edx pxor %xmm5,%xmm4 addl %esi,%ecx xorl %eax,%edi movdqa %xmm8,%xmm9 - paddd %xmm3,%xmm8 rorl $7,%ebp + paddd %xmm3,%xmm8 addl %edx,%ecx pxor %xmm10,%xmm4 addl 4(%rsp),%ebx @@ -1822,43 +1969,43 @@ _ssse3_shortcut: movl %ecx,%esi roll $5,%ecx movdqa %xmm4,%xmm10 - movdqa %xmm8,48(%rsp) addl %edi,%ebx xorl %ebp,%esi + movdqa %xmm8,48(%rsp) rorl $7,%edx addl %ecx,%ebx - pslld $2,%xmm4 addl 8(%rsp),%eax + pslld $2,%xmm4 xorl %edx,%esi - psrld $30,%xmm10 movl %ebx,%edi + psrld $30,%xmm10 roll $5,%ebx addl %esi,%eax xorl %edx,%edi rorl $7,%ecx - addl %ebx,%eax por %xmm10,%xmm4 + addl %ebx,%eax addl 12(%rsp),%ebp + pshufd $238,%xmm3,%xmm8 xorl %ecx,%edi - movdqa %xmm4,%xmm8 movl %eax,%esi roll $5,%eax addl %edi,%ebp xorl %ecx,%esi rorl $7,%ebx addl %eax,%ebp - addl 16(%rsp),%edx pxor %xmm1,%xmm5 -.byte 102,68,15,58,15,195,8 + addl 16(%rsp),%edx xorl %ebx,%esi + punpcklqdq %xmm4,%xmm8 movl %ebp,%edi roll $5,%ebp pxor %xmm6,%xmm5 addl %esi,%edx xorl %ebx,%edi movdqa %xmm9,%xmm10 - paddd %xmm4,%xmm9 rorl $7,%eax + paddd %xmm4,%xmm9 addl %ebp,%edx pxor %xmm8,%xmm5 addl 20(%rsp),%ecx @@ -1866,24 +2013,24 @@ _ssse3_shortcut: movl %edx,%esi roll $5,%edx movdqa %xmm5,%xmm8 - movdqa %xmm9,0(%rsp) addl %edi,%ecx xorl %eax,%esi + movdqa %xmm9,0(%rsp) rorl $7,%ebp addl %edx,%ecx - pslld $2,%xmm5 addl 24(%rsp),%ebx + pslld $2,%xmm5 xorl %ebp,%esi - psrld $30,%xmm8 movl %ecx,%edi + psrld $30,%xmm8 roll $5,%ecx addl %esi,%ebx xorl %ebp,%edi rorl $7,%edx - addl %ecx,%ebx por %xmm8,%xmm5 + addl 
%ecx,%ebx addl 28(%rsp),%eax - movdqa %xmm5,%xmm9 + pshufd $238,%xmm4,%xmm9 rorl $7,%ecx movl %ebx,%esi xorl %edx,%edi @@ -1892,47 +2039,47 @@ _ssse3_shortcut: xorl %ecx,%esi xorl %edx,%ecx addl %ebx,%eax - addl 32(%rsp),%ebp pxor %xmm2,%xmm6 -.byte 102,68,15,58,15,204,8 + addl 32(%rsp),%ebp andl %ecx,%esi xorl %edx,%ecx rorl $7,%ebx - pxor %xmm7,%xmm6 + punpcklqdq %xmm5,%xmm9 movl %eax,%edi xorl %ecx,%esi - movdqa %xmm10,%xmm8 - paddd %xmm5,%xmm10 + pxor %xmm7,%xmm6 roll $5,%eax addl %esi,%ebp - pxor %xmm9,%xmm6 + movdqa %xmm10,%xmm8 xorl %ebx,%edi + paddd %xmm5,%xmm10 xorl %ecx,%ebx + pxor %xmm9,%xmm6 addl %eax,%ebp addl 36(%rsp),%edx - movdqa %xmm6,%xmm9 - movdqa %xmm10,16(%rsp) andl %ebx,%edi xorl %ecx,%ebx rorl $7,%eax + movdqa %xmm6,%xmm9 movl %ebp,%esi - pslld $2,%xmm6 xorl %ebx,%edi + movdqa %xmm10,16(%rsp) roll $5,%ebp - psrld $30,%xmm9 addl %edi,%edx xorl %eax,%esi + pslld $2,%xmm6 xorl %ebx,%eax addl %ebp,%edx + psrld $30,%xmm9 addl 40(%rsp),%ecx andl %eax,%esi - por %xmm9,%xmm6 xorl %ebx,%eax + por %xmm9,%xmm6 rorl $7,%ebp - movdqa %xmm6,%xmm10 movl %edx,%edi xorl %eax,%esi roll $5,%edx + pshufd $238,%xmm5,%xmm10 addl %esi,%ecx xorl %ebp,%edi xorl %eax,%ebp @@ -1948,47 +2095,47 @@ _ssse3_shortcut: xorl %edx,%esi xorl %ebp,%edx addl %ecx,%ebx - addl 48(%rsp),%eax pxor %xmm3,%xmm7 -.byte 102,68,15,58,15,213,8 + addl 48(%rsp),%eax andl %edx,%esi xorl %ebp,%edx rorl $7,%ecx - pxor %xmm0,%xmm7 + punpcklqdq %xmm6,%xmm10 movl %ebx,%edi xorl %edx,%esi - movdqa 48(%r11),%xmm9 - paddd %xmm6,%xmm8 + pxor %xmm0,%xmm7 roll $5,%ebx addl %esi,%eax - pxor %xmm10,%xmm7 + movdqa 32(%r14),%xmm9 xorl %ecx,%edi + paddd %xmm6,%xmm8 xorl %edx,%ecx + pxor %xmm10,%xmm7 addl %ebx,%eax addl 52(%rsp),%ebp - movdqa %xmm7,%xmm10 - movdqa %xmm8,32(%rsp) andl %ecx,%edi xorl %edx,%ecx rorl $7,%ebx + movdqa %xmm7,%xmm10 movl %eax,%esi - pslld $2,%xmm7 xorl %ecx,%edi + movdqa %xmm8,32(%rsp) roll $5,%eax - psrld $30,%xmm10 addl %edi,%ebp xorl %ebx,%esi + pslld $2,%xmm7 xorl %ecx,%ebx addl %eax,%ebp + psrld $30,%xmm10 addl 56(%rsp),%edx andl %ebx,%esi - por %xmm10,%xmm7 xorl %ecx,%ebx + por %xmm10,%xmm7 rorl $7,%eax - movdqa %xmm7,%xmm8 movl %ebp,%edi xorl %ebx,%esi roll $5,%ebp + pshufd $238,%xmm6,%xmm8 addl %esi,%edx xorl %eax,%edi xorl %ebx,%eax @@ -2004,47 +2151,47 @@ _ssse3_shortcut: xorl %ebp,%esi xorl %eax,%ebp addl %edx,%ecx - addl 0(%rsp),%ebx pxor %xmm4,%xmm0 -.byte 102,68,15,58,15,198,8 + addl 0(%rsp),%ebx andl %ebp,%esi xorl %eax,%ebp rorl $7,%edx - pxor %xmm1,%xmm0 + punpcklqdq %xmm7,%xmm8 movl %ecx,%edi xorl %ebp,%esi - movdqa %xmm9,%xmm10 - paddd %xmm7,%xmm9 + pxor %xmm1,%xmm0 roll $5,%ecx addl %esi,%ebx - pxor %xmm8,%xmm0 + movdqa %xmm9,%xmm10 xorl %edx,%edi + paddd %xmm7,%xmm9 xorl %ebp,%edx + pxor %xmm8,%xmm0 addl %ecx,%ebx addl 4(%rsp),%eax - movdqa %xmm0,%xmm8 - movdqa %xmm9,48(%rsp) andl %edx,%edi xorl %ebp,%edx rorl $7,%ecx + movdqa %xmm0,%xmm8 movl %ebx,%esi - pslld $2,%xmm0 xorl %edx,%edi + movdqa %xmm9,48(%rsp) roll $5,%ebx - psrld $30,%xmm8 addl %edi,%eax xorl %ecx,%esi + pslld $2,%xmm0 xorl %edx,%ecx addl %ebx,%eax + psrld $30,%xmm8 addl 8(%rsp),%ebp andl %ecx,%esi - por %xmm8,%xmm0 xorl %edx,%ecx + por %xmm8,%xmm0 rorl $7,%ebx - movdqa %xmm0,%xmm9 movl %eax,%edi xorl %ecx,%esi roll $5,%eax + pshufd $238,%xmm7,%xmm9 addl %esi,%ebp xorl %ebx,%edi xorl %ecx,%ebx @@ -2060,47 +2207,47 @@ _ssse3_shortcut: xorl %eax,%esi xorl %ebx,%eax addl %ebp,%edx - addl 16(%rsp),%ecx pxor %xmm5,%xmm1 -.byte 102,68,15,58,15,207,8 + addl 16(%rsp),%ecx andl %eax,%esi xorl %ebx,%eax rorl $7,%ebp - pxor 
%xmm2,%xmm1 + punpcklqdq %xmm0,%xmm9 movl %edx,%edi xorl %eax,%esi - movdqa %xmm10,%xmm8 - paddd %xmm0,%xmm10 + pxor %xmm2,%xmm1 roll $5,%edx addl %esi,%ecx - pxor %xmm9,%xmm1 + movdqa %xmm10,%xmm8 xorl %ebp,%edi + paddd %xmm0,%xmm10 xorl %eax,%ebp + pxor %xmm9,%xmm1 addl %edx,%ecx addl 20(%rsp),%ebx - movdqa %xmm1,%xmm9 - movdqa %xmm10,0(%rsp) andl %ebp,%edi xorl %eax,%ebp rorl $7,%edx + movdqa %xmm1,%xmm9 movl %ecx,%esi - pslld $2,%xmm1 xorl %ebp,%edi + movdqa %xmm10,0(%rsp) roll $5,%ecx - psrld $30,%xmm9 addl %edi,%ebx xorl %edx,%esi + pslld $2,%xmm1 xorl %ebp,%edx addl %ecx,%ebx + psrld $30,%xmm9 addl 24(%rsp),%eax andl %edx,%esi - por %xmm9,%xmm1 xorl %ebp,%edx + por %xmm9,%xmm1 rorl $7,%ecx - movdqa %xmm1,%xmm10 movl %ebx,%edi xorl %edx,%esi roll $5,%ebx + pshufd $238,%xmm0,%xmm10 addl %esi,%eax xorl %ecx,%edi xorl %edx,%ecx @@ -2116,47 +2263,47 @@ _ssse3_shortcut: xorl %ebx,%esi xorl %ecx,%ebx addl %eax,%ebp - addl 32(%rsp),%edx pxor %xmm6,%xmm2 -.byte 102,68,15,58,15,208,8 + addl 32(%rsp),%edx andl %ebx,%esi xorl %ecx,%ebx rorl $7,%eax - pxor %xmm3,%xmm2 + punpcklqdq %xmm1,%xmm10 movl %ebp,%edi xorl %ebx,%esi - movdqa %xmm8,%xmm9 - paddd %xmm1,%xmm8 + pxor %xmm3,%xmm2 roll $5,%ebp addl %esi,%edx - pxor %xmm10,%xmm2 + movdqa %xmm8,%xmm9 xorl %eax,%edi + paddd %xmm1,%xmm8 xorl %ebx,%eax + pxor %xmm10,%xmm2 addl %ebp,%edx addl 36(%rsp),%ecx - movdqa %xmm2,%xmm10 - movdqa %xmm8,16(%rsp) andl %eax,%edi xorl %ebx,%eax rorl $7,%ebp + movdqa %xmm2,%xmm10 movl %edx,%esi - pslld $2,%xmm2 xorl %eax,%edi + movdqa %xmm8,16(%rsp) roll $5,%edx - psrld $30,%xmm10 addl %edi,%ecx xorl %ebp,%esi + pslld $2,%xmm2 xorl %eax,%ebp addl %edx,%ecx + psrld $30,%xmm10 addl 40(%rsp),%ebx andl %ebp,%esi - por %xmm10,%xmm2 xorl %eax,%ebp + por %xmm10,%xmm2 rorl $7,%edx - movdqa %xmm2,%xmm8 movl %ecx,%edi xorl %ebp,%esi roll $5,%ecx + pshufd $238,%xmm1,%xmm8 addl %esi,%ebx xorl %edx,%edi xorl %ebp,%edx @@ -2171,18 +2318,18 @@ _ssse3_shortcut: addl %edi,%eax xorl %edx,%esi addl %ebx,%eax - addl 48(%rsp),%ebp pxor %xmm7,%xmm3 -.byte 102,68,15,58,15,193,8 + addl 48(%rsp),%ebp xorl %ecx,%esi + punpcklqdq %xmm2,%xmm8 movl %eax,%edi roll $5,%eax pxor %xmm4,%xmm3 addl %esi,%ebp xorl %ecx,%edi movdqa %xmm9,%xmm10 - paddd %xmm2,%xmm9 rorl $7,%ebx + paddd %xmm2,%xmm9 addl %eax,%ebp pxor %xmm8,%xmm3 addl 52(%rsp),%edx @@ -2190,22 +2337,22 @@ _ssse3_shortcut: movl %ebp,%esi roll $5,%ebp movdqa %xmm3,%xmm8 - movdqa %xmm9,32(%rsp) addl %edi,%edx xorl %ebx,%esi + movdqa %xmm9,32(%rsp) rorl $7,%eax addl %ebp,%edx - pslld $2,%xmm3 addl 56(%rsp),%ecx + pslld $2,%xmm3 xorl %eax,%esi - psrld $30,%xmm8 movl %edx,%edi + psrld $30,%xmm8 roll $5,%edx addl %esi,%ecx xorl %eax,%edi rorl $7,%ebp - addl %edx,%ecx por %xmm8,%xmm3 + addl %edx,%ecx addl 60(%rsp),%ebx xorl %ebp,%edi movl %ecx,%esi @@ -2215,13 +2362,13 @@ _ssse3_shortcut: rorl $7,%edx addl %ecx,%ebx addl 0(%rsp),%eax - paddd %xmm3,%xmm10 xorl %edx,%esi movl %ebx,%edi roll $5,%ebx + paddd %xmm3,%xmm10 addl %esi,%eax - movdqa %xmm10,48(%rsp) xorl %edx,%edi + movdqa %xmm10,48(%rsp) rorl $7,%ecx addl %ebx,%eax addl 4(%rsp),%ebp @@ -2250,8 +2397,8 @@ _ssse3_shortcut: addl %edx,%ecx cmpq %r10,%r9 je .Ldone_ssse3 - movdqa 64(%r11),%xmm6 - movdqa 0(%r11),%xmm9 + movdqa 64(%r14),%xmm6 + movdqa -64(%r14),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 @@ -2260,23 +2407,23 @@ _ssse3_shortcut: addq $64,%r9 addl 16(%rsp),%ebx xorl %ebp,%esi -.byte 102,15,56,0,206 movl %ecx,%edi +.byte 102,15,56,0,206 roll $5,%ecx - paddd %xmm9,%xmm0 addl %esi,%ebx xorl %ebp,%edi rorl 
$7,%edx + paddd %xmm9,%xmm0 addl %ecx,%ebx - movdqa %xmm0,0(%rsp) addl 20(%rsp),%eax xorl %edx,%edi - psubd %xmm9,%xmm0 movl %ebx,%esi + movdqa %xmm0,0(%rsp) roll $5,%ebx addl %edi,%eax xorl %edx,%esi rorl $7,%ecx + psubd %xmm9,%xmm0 addl %ebx,%eax addl 24(%rsp),%ebp xorl %ecx,%esi @@ -2296,23 +2443,23 @@ _ssse3_shortcut: addl %ebp,%edx addl 32(%rsp),%ecx xorl %eax,%esi -.byte 102,15,56,0,214 movl %edx,%edi +.byte 102,15,56,0,214 roll $5,%edx - paddd %xmm9,%xmm1 addl %esi,%ecx xorl %eax,%edi rorl $7,%ebp + paddd %xmm9,%xmm1 addl %edx,%ecx - movdqa %xmm1,16(%rsp) addl 36(%rsp),%ebx xorl %ebp,%edi - psubd %xmm9,%xmm1 movl %ecx,%esi + movdqa %xmm1,16(%rsp) roll $5,%ecx addl %edi,%ebx xorl %ebp,%esi rorl $7,%edx + psubd %xmm9,%xmm1 addl %ecx,%ebx addl 40(%rsp),%eax xorl %edx,%esi @@ -2332,23 +2479,23 @@ _ssse3_shortcut: addl %eax,%ebp addl 48(%rsp),%edx xorl %ebx,%esi -.byte 102,15,56,0,222 movl %ebp,%edi +.byte 102,15,56,0,222 roll $5,%ebp - paddd %xmm9,%xmm2 addl %esi,%edx xorl %ebx,%edi rorl $7,%eax + paddd %xmm9,%xmm2 addl %ebp,%edx - movdqa %xmm2,32(%rsp) addl 52(%rsp),%ecx xorl %eax,%edi - psubd %xmm9,%xmm2 movl %edx,%esi + movdqa %xmm2,32(%rsp) roll $5,%edx addl %edi,%ecx xorl %eax,%esi rorl $7,%ebp + psubd %xmm9,%xmm2 addl %edx,%ecx addl 56(%rsp),%ebx xorl %ebp,%esi @@ -2488,25 +2635,2857 @@ _ssse3_shortcut: movl %ecx,8(%r8) movl %edx,12(%r8) movl %ebp,16(%r8) - leaq 64(%rsp),%rsi - movq 0(%rsi),%r12 - movq 8(%rsi),%rbp - movq 16(%rsi),%rbx - leaq 24(%rsi),%rsp + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbp +.cfi_restore %rbp + movq -8(%r11),%rbx +.cfi_restore %rbx + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_ssse3: .byte 0xf3,0xc3 +.cfi_endproc .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 -.align 64 -K_XX_XX: -.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 -.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 -.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc -.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f -.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 64 - +.type sha1_block_data_order_avx,@function +.align 16 +sha1_block_data_order_avx: +_avx_shortcut: +.cfi_startproc + movq %rsp,%r11 +.cfi_def_cfa_register %r11 + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + leaq -64(%rsp),%rsp + vzeroupper + andq $-64,%rsp + movq %rdi,%r8 + movq %rsi,%r9 + movq %rdx,%r10 -.section .note.GNU-stack,"",%progbits + shlq $6,%r10 + addq %r9,%r10 + leaq K_XX_XX+64(%rip),%r14 + movl 0(%r8),%eax + movl 4(%r8),%ebx + movl 8(%r8),%ecx + movl 12(%r8),%edx + movl %ebx,%esi + movl 16(%r8),%ebp + movl %ecx,%edi + xorl %edx,%edi + andl %edi,%esi + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 + vmovdqu 0(%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r9 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm11,%xmm0,%xmm4 + vpaddd %xmm11,%xmm1,%xmm5 + vpaddd %xmm11,%xmm2,%xmm6 + vmovdqa %xmm4,0(%rsp) + vmovdqa %xmm5,16(%rsp) + vmovdqa %xmm6,32(%rsp) + jmp .Loop_avx +.align 16 +.Loop_avx: + 
shrdl $2,%ebx,%ebx + xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%edi + addl 0(%rsp),%ebp + vpaddd %xmm3,%xmm11,%xmm9 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm8 + addl %esi,%ebp + andl %ebx,%edi + vpxor %xmm0,%xmm4,%xmm4 + xorl %ecx,%ebx + addl %eax,%ebp + vpxor %xmm2,%xmm8,%xmm8 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 4(%rsp),%edx + vpxor %xmm8,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vmovdqa %xmm9,48(%rsp) + addl %edi,%edx + andl %eax,%esi + vpsrld $31,%xmm4,%xmm8 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm10 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%edi + addl 8(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm4,%xmm4 + addl %esi,%ecx + andl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm4,%xmm4 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 12(%rsp),%ebx + vpxor %xmm10,%xmm4,%xmm4 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %edi,%ebx + andl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpalignr $8,%xmm1,%xmm2,%xmm5 + movl %ebx,%edi + addl 16(%rsp),%eax + vpaddd %xmm4,%xmm11,%xmm9 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm8 + addl %esi,%eax + andl %ecx,%edi + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm8,%xmm8 + shrdl $7,%ebx,%ebx + xorl %edx,%edi + movl %eax,%esi + addl 20(%rsp),%ebp + vpxor %xmm8,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vmovdqa %xmm9,0(%rsp) + addl %edi,%ebp + andl %ebx,%esi + vpsrld $31,%xmm5,%xmm8 + xorl %ecx,%ebx + addl %eax,%ebp + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm10 + vpaddd %xmm5,%xmm5,%xmm5 + movl %ebp,%edi + addl 24(%rsp),%edx + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm5,%xmm5 + addl %esi,%edx + andl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm5,%xmm5 + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + movl %edx,%esi + addl 28(%rsp),%ecx + vpxor %xmm10,%xmm5,%xmm5 + xorl %eax,%ebp + shldl $5,%edx,%edx + vmovdqa -32(%r14),%xmm11 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 + movl %ecx,%edi + addl 32(%rsp),%ebx + vpaddd %xmm5,%xmm11,%xmm9 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm8 + addl %esi,%ebx + andl %edx,%edi + vpxor %xmm2,%xmm6,%xmm6 + xorl %ebp,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm8,%xmm8 + shrdl $7,%ecx,%ecx + xorl %ebp,%edi + movl %ebx,%esi + addl 36(%rsp),%eax + vpxor %xmm8,%xmm6,%xmm6 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vmovdqa %xmm9,16(%rsp) + addl %edi,%eax + andl %ecx,%esi + vpsrld $31,%xmm6,%xmm8 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm10 + vpaddd %xmm6,%xmm6,%xmm6 + movl %eax,%edi + addl 40(%rsp),%ebp + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm6,%xmm6 + addl %esi,%ebp + andl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 44(%rsp),%edx + vpxor %xmm10,%xmm6,%xmm6 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + addl %edi,%edx + andl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%edi + addl 48(%rsp),%ecx + vpaddd %xmm6,%xmm11,%xmm9 + xorl %eax,%ebp + 
shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm8 + addl %esi,%ecx + andl %ebp,%edi + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%ebp + addl %edx,%ecx + vpxor %xmm5,%xmm8,%xmm8 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 52(%rsp),%ebx + vpxor %xmm8,%xmm7,%xmm7 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vmovdqa %xmm9,32(%rsp) + addl %edi,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm8 + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpslldq $12,%xmm7,%xmm10 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%edi + addl 56(%rsp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm7,%xmm7 + addl %esi,%eax + andl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + xorl %edx,%edi + movl %eax,%esi + addl 60(%rsp),%ebp + vpxor %xmm10,%xmm7,%xmm7 + xorl %ecx,%ebx + shldl $5,%eax,%eax + addl %edi,%ebp + andl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %ebp,%edi + addl 0(%rsp),%edx + vpxor %xmm1,%xmm0,%xmm0 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpaddd %xmm7,%xmm11,%xmm9 + addl %esi,%edx + andl %eax,%edi + vpxor %xmm8,%xmm0,%xmm0 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + movl %edx,%esi + addl 4(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + movl %ecx,%edi + addl 8(%rsp),%ebx + vpor %xmm8,%xmm0,%xmm0 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %esi,%ebx + andl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 12(%rsp),%eax + xorl %ebp,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm0,%xmm11,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm1,%xmm1 + addl 20(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm1,%xmm1 + addl 28(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + addl %esi,%eax + xorl %edx,%edi + vpaddd %xmm1,%xmm11,%xmm9 + vmovdqa 0(%r14),%xmm11 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm8,%xmm2,%xmm2 + addl 36(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpslld $2,%xmm2,%xmm2 + addl 40(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpor %xmm8,%xmm2,%xmm2 + addl 44(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl 
$7,%ebp,%ebp + addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebx + xorl %ebp,%edi + vpaddd %xmm2,%xmm11,%xmm9 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpalignr $8,%xmm2,%xmm3,%xmm8 + vpxor %xmm0,%xmm4,%xmm4 + addl 0(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + addl %esi,%ecx + xorl %eax,%edi + vpaddd %xmm3,%xmm11,%xmm9 + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpxor %xmm8,%xmm4,%xmm4 + addl 4(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm8 + vmovdqa %xmm9,48(%rsp) + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 + addl 8(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm8,%xmm4,%xmm4 + addl 12(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm3,%xmm4,%xmm8 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpxor %xmm6,%xmm5,%xmm5 + addl %esi,%edx + xorl %ebx,%edi + vpaddd %xmm4,%xmm11,%xmm9 + shrdl $7,%eax,%eax + addl %ebp,%edx + vpxor %xmm8,%xmm5,%xmm5 + addl 20(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor %xmm8,%xmm5,%xmm5 + addl 28(%rsp),%eax + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm8 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%rsp),%ebp + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + movl %eax,%edi + xorl %ecx,%esi + vpaddd %xmm5,%xmm11,%xmm9 + shldl $5,%eax,%eax + addl %esi,%ebp + vpxor %xmm8,%xmm6,%xmm6 + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 36(%rsp),%edx + vpsrld $30,%xmm6,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + addl 40(%rsp),%ecx + andl %eax,%esi + vpor %xmm8,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%edi + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 44(%rsp),%ebx + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%esi + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl 
%ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm8 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%rsp),%eax + andl %edx,%esi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + movl %ebx,%edi + xorl %edx,%esi + vpaddd %xmm6,%xmm11,%xmm9 + vmovdqa 32(%r14),%xmm11 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm8,%xmm7,%xmm7 + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%rsp),%ebp + vpsrld $30,%xmm7,%xmm8 + vmovdqa %xmm9,32(%rsp) + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + vpslld $2,%xmm7,%xmm7 + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + addl 56(%rsp),%edx + andl %ebx,%esi + vpor %xmm8,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%edi + xorl %ebx,%esi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 60(%rsp),%ecx + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + addl 0(%rsp),%ebx + andl %ebp,%esi + xorl %eax,%ebp + shrdl $7,%edx,%edx + vpxor %xmm1,%xmm0,%xmm0 + movl %ecx,%edi + xorl %ebp,%esi + vpaddd %xmm7,%xmm11,%xmm9 + shldl $5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm8,%xmm0,%xmm0 + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 4(%rsp),%eax + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%rsp),%ebp + andl %ecx,%esi + vpor %xmm8,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%edi + xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 12(%rsp),%edx + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + vpxor %xmm2,%xmm1,%xmm1 + movl %edx,%edi + xorl %eax,%esi + vpaddd %xmm0,%xmm11,%xmm9 + shldl $5,%edx,%edx + addl %esi,%ecx + vpxor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 20(%rsp),%ebx + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + addl 24(%rsp),%eax + andl %edx,%esi + vpor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%edi + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%rsp),%ebp + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + movl %ebp,%edi + xorl %ebx,%esi + vpaddd %xmm1,%xmm11,%xmm9 + shldl $5,%ebp,%ebp + addl %esi,%edx + vpxor %xmm8,%xmm2,%xmm2 + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 36(%rsp),%ecx + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl 
%eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + addl 40(%rsp),%ebx + andl %ebp,%esi + vpor %xmm8,%xmm2,%xmm2 + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%edi + xorl %ebp,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 44(%rsp),%eax + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm2,%xmm11,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 0(%rsp),%eax + vpaddd %xmm3,%xmm11,%xmm9 + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + vmovdqa %xmm9,48(%rsp) + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 4(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 8(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 12(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + cmpq %r10,%r9 + je .Ldone_avx + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 + vmovdqu 0(%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r9 + addl 16(%rsp),%ebx + xorl %ebp,%esi + vpshufb %xmm6,%xmm1,%xmm1 + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpaddd %xmm11,%xmm0,%xmm4 + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm4,0(%rsp) + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + vpshufb %xmm6,%xmm2,%xmm2 + movl %edx,%edi + shldl $5,%edx,%edx + vpaddd %xmm11,%xmm1,%xmm5 + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vmovdqa %xmm5,16(%rsp) + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl 
$5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + vpshufb %xmm6,%xmm3,%xmm3 + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpaddd %xmm11,%xmm2,%xmm6 + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vmovdqa %xmm6,32(%rsp) + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + addl 12(%r8),%edx + movl %eax,0(%r8) + addl 16(%r8),%ebp + movl %esi,4(%r8) + movl %esi,%ebx + movl %ecx,8(%r8) + movl %ecx,%edi + movl %edx,12(%r8) + xorl %edx,%edi + movl %ebp,16(%r8) + andl %edi,%esi + jmp .Loop_avx + +.align 16 +.Ldone_avx: + addl 16(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vzeroupper + + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + movl %eax,0(%r8) + addl 12(%r8),%edx + movl %esi,4(%r8) + addl 16(%r8),%ebp + movl %ecx,8(%r8) + movl %edx,12(%r8) + movl %ebp,16(%r8) + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbp +.cfi_restore %rbp + movq -8(%r11),%rbx +.cfi_restore %rbx + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha1_block_data_order_avx,.-sha1_block_data_order_avx +.type sha1_block_data_order_avx2,@function +.align 16 +sha1_block_data_order_avx2: +_avx2_shortcut: +.cfi_startproc + movq %rsp,%r11 +.cfi_def_cfa_register %r11 + pushq %rbx 
+.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + vzeroupper + movq %rdi,%r8 + movq %rsi,%r9 + movq %rdx,%r10 + + leaq -640(%rsp),%rsp + shlq $6,%r10 + leaq 64(%r9),%r13 + andq $-128,%rsp + addq %r9,%r10 + leaq K_XX_XX+64(%rip),%r14 + + movl 0(%r8),%eax + cmpq %r10,%r13 + cmovaeq %r9,%r13 + movl 4(%r8),%ebp + movl 8(%r8),%ecx + movl 12(%r8),%edx + movl 16(%r8),%esi + vmovdqu 64(%r14),%ymm6 + + vmovdqu (%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + leaq 64(%r9),%r9 + vinserti128 $1,(%r13),%ymm0,%ymm0 + vinserti128 $1,16(%r13),%ymm1,%ymm1 + vpshufb %ymm6,%ymm0,%ymm0 + vinserti128 $1,32(%r13),%ymm2,%ymm2 + vpshufb %ymm6,%ymm1,%ymm1 + vinserti128 $1,48(%r13),%ymm3,%ymm3 + vpshufb %ymm6,%ymm2,%ymm2 + vmovdqu -64(%r14),%ymm11 + vpshufb %ymm6,%ymm3,%ymm3 + + vpaddd %ymm11,%ymm0,%ymm4 + vpaddd %ymm11,%ymm1,%ymm5 + vmovdqu %ymm4,0(%rsp) + vpaddd %ymm11,%ymm2,%ymm6 + vmovdqu %ymm5,32(%rsp) + vpaddd %ymm11,%ymm3,%ymm7 + vmovdqu %ymm6,64(%rsp) + vmovdqu %ymm7,96(%rsp) + vpalignr $8,%ymm0,%ymm1,%ymm4 + vpsrldq $4,%ymm3,%ymm8 + vpxor %ymm0,%ymm4,%ymm4 + vpxor %ymm2,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $31,%ymm4,%ymm8 + vpslldq $12,%ymm4,%ymm10 + vpaddd %ymm4,%ymm4,%ymm4 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm4,%ymm4 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm4,%ymm4 + vpxor %ymm10,%ymm4,%ymm4 + vpaddd %ymm11,%ymm4,%ymm9 + vmovdqu %ymm9,128(%rsp) + vpalignr $8,%ymm1,%ymm2,%ymm5 + vpsrldq $4,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm3,%ymm8,%ymm8 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $31,%ymm5,%ymm8 + vmovdqu -32(%r14),%ymm11 + vpslldq $12,%ymm5,%ymm10 + vpaddd %ymm5,%ymm5,%ymm5 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm5,%ymm5 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm10,%ymm5,%ymm5 + vpaddd %ymm11,%ymm5,%ymm9 + vmovdqu %ymm9,160(%rsp) + vpalignr $8,%ymm2,%ymm3,%ymm6 + vpsrldq $4,%ymm5,%ymm8 + vpxor %ymm2,%ymm6,%ymm6 + vpxor %ymm4,%ymm8,%ymm8 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $31,%ymm6,%ymm8 + vpslldq $12,%ymm6,%ymm10 + vpaddd %ymm6,%ymm6,%ymm6 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm6,%ymm6 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm6,%ymm6 + vpxor %ymm10,%ymm6,%ymm6 + vpaddd %ymm11,%ymm6,%ymm9 + vmovdqu %ymm9,192(%rsp) + vpalignr $8,%ymm3,%ymm4,%ymm7 + vpsrldq $4,%ymm6,%ymm8 + vpxor %ymm3,%ymm7,%ymm7 + vpxor %ymm5,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm7,%ymm8 + vpslldq $12,%ymm7,%ymm10 + vpaddd %ymm7,%ymm7,%ymm7 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm7,%ymm7 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm7,%ymm7 + vpxor %ymm10,%ymm7,%ymm7 + vpaddd %ymm11,%ymm7,%ymm9 + vmovdqu %ymm9,224(%rsp) + leaq 128(%rsp),%r13 + jmp .Loop_avx2 +.align 32 +.Loop_avx2: + rorxl $2,%ebp,%ebx + andnl %edx,%ebp,%edi + andl %ecx,%ebp + xorl %edi,%ebp + jmp .Lalign32_1 +.align 32 +.Lalign32_1: + vpalignr $8,%ymm6,%ymm7,%ymm8 + vpxor %ymm4,%ymm0,%ymm0 + addl -128(%r13),%esi + andnl %ecx,%eax,%edi + vpxor %ymm1,%ymm0,%ymm0 + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + vpxor %ymm8,%ymm0,%ymm0 + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + vpsrld $30,%ymm0,%ymm8 + vpslld $2,%ymm0,%ymm0 + addl -124(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + vpor %ymm8,%ymm0,%ymm0 + addl %r12d,%edx + xorl %edi,%esi + addl -120(%r13),%ecx + andnl %ebp,%edx,%edi + vpaddd %ymm11,%ymm0,%ymm9 + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl 
$2,%edx,%esi + andl %eax,%edx + vmovdqu %ymm9,256(%rsp) + addl %r12d,%ecx + xorl %edi,%edx + addl -116(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl -96(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + vpalignr $8,%ymm7,%ymm0,%ymm8 + vpxor %ymm5,%ymm1,%ymm1 + addl -92(%r13),%eax + andnl %edx,%ebp,%edi + vpxor %ymm2,%ymm1,%ymm1 + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + vpxor %ymm8,%ymm1,%ymm1 + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + vpsrld $30,%ymm1,%ymm8 + vpslld $2,%ymm1,%ymm1 + addl -88(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + vpor %ymm8,%ymm1,%ymm1 + addl %r12d,%esi + xorl %edi,%eax + addl -84(%r13),%edx + andnl %ebx,%esi,%edi + vpaddd %ymm11,%ymm1,%ymm9 + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + vmovdqu %ymm9,288(%rsp) + addl %r12d,%edx + xorl %edi,%esi + addl -64(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -60(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + vpalignr $8,%ymm0,%ymm1,%ymm8 + vpxor %ymm6,%ymm2,%ymm2 + addl -56(%r13),%ebp + andnl %esi,%ebx,%edi + vpxor %ymm3,%ymm2,%ymm2 + vmovdqu 0(%r14),%ymm11 + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + vpxor %ymm8,%ymm2,%ymm2 + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + vpsrld $30,%ymm2,%ymm8 + vpslld $2,%ymm2,%ymm2 + addl -52(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + vpor %ymm8,%ymm2,%ymm2 + addl %r12d,%eax + xorl %edi,%ebp + addl -32(%r13),%esi + andnl %ecx,%eax,%edi + vpaddd %ymm11,%ymm2,%ymm9 + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + vmovdqu %ymm9,320(%rsp) + addl %r12d,%esi + xorl %edi,%eax + addl -28(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -24(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + vpalignr $8,%ymm1,%ymm2,%ymm8 + vpxor %ymm7,%ymm3,%ymm3 + addl -20(%r13),%ebx + andnl %eax,%ecx,%edi + vpxor %ymm4,%ymm3,%ymm3 + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + vpxor %ymm8,%ymm3,%ymm3 + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + vpsrld $30,%ymm3,%ymm8 + vpslld $2,%ymm3,%ymm3 + addl 0(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + vpor %ymm8,%ymm3,%ymm3 + addl %r12d,%ebp + xorl %edi,%ebx + addl 4(%r13),%eax + andnl %edx,%ebp,%edi + vpaddd %ymm11,%ymm3,%ymm9 + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + vmovdqu %ymm9,352(%rsp) + addl %r12d,%eax + xorl %edi,%ebp + addl 8(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl 12(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + vpalignr $8,%ymm2,%ymm3,%ymm8 + vpxor %ymm0,%ymm4,%ymm4 
+ addl 32(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + vpxor %ymm5,%ymm4,%ymm4 + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + vpxor %ymm8,%ymm4,%ymm4 + addl %r12d,%ecx + xorl %ebp,%edx + addl 36(%r13),%ebx + vpsrld $30,%ymm4,%ymm8 + vpslld $2,%ymm4,%ymm4 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + vpor %ymm8,%ymm4,%ymm4 + addl 40(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + vpaddd %ymm11,%ymm4,%ymm9 + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 44(%r13),%eax + vmovdqu %ymm9,384(%rsp) + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl 64(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vpalignr $8,%ymm3,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + addl 68(%r13),%edx + leal (%rdx,%rax,1),%edx + vpxor %ymm6,%ymm5,%ymm5 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + vpxor %ymm8,%ymm5,%ymm5 + addl %r12d,%edx + xorl %ebx,%esi + addl 72(%r13),%ecx + vpsrld $30,%ymm5,%ymm8 + vpslld $2,%ymm5,%ymm5 + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + vpor %ymm8,%ymm5,%ymm5 + addl 76(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + vpaddd %ymm11,%ymm5,%ymm9 + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl 96(%r13),%ebp + vmovdqu %ymm9,416(%rsp) + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 100(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpalignr $8,%ymm4,%ymm5,%ymm8 + vpxor %ymm2,%ymm6,%ymm6 + addl 104(%r13),%esi + leal (%rsi,%rbp,1),%esi + vpxor %ymm7,%ymm6,%ymm6 + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + vpxor %ymm8,%ymm6,%ymm6 + addl %r12d,%esi + xorl %ecx,%eax + addl 108(%r13),%edx + leaq 256(%r13),%r13 + vpsrld $30,%ymm6,%ymm8 + vpslld $2,%ymm6,%ymm6 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + vpor %ymm8,%ymm6,%ymm6 + addl -128(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + vpaddd %ymm11,%ymm6,%ymm9 + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -124(%r13),%ebx + vmovdqu %ymm9,448(%rsp) + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -120(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vpalignr $8,%ymm5,%ymm6,%ymm8 + vpxor %ymm3,%ymm7,%ymm7 + addl -116(%r13),%eax + leal (%rax,%rbx,1),%eax + vpxor %ymm0,%ymm7,%ymm7 + vmovdqu 32(%r14),%ymm11 + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + vpxor %ymm8,%ymm7,%ymm7 + addl %r12d,%eax + xorl %edx,%ebp + addl -96(%r13),%esi + vpsrld $30,%ymm7,%ymm8 + vpslld $2,%ymm7,%ymm7 + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vpor %ymm8,%ymm7,%ymm7 + addl -92(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpaddd %ymm11,%ymm7,%ymm9 + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -88(%r13),%ecx + vmovdqu %ymm9,480(%rsp) + 
leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -84(%r13),%ebx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + jmp .Lalign32_2 +.align 32 +.Lalign32_2: + vpalignr $8,%ymm6,%ymm7,%ymm8 + vpxor %ymm4,%ymm0,%ymm0 + addl -64(%r13),%ebp + xorl %esi,%ecx + vpxor %ymm1,%ymm0,%ymm0 + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + vpxor %ymm8,%ymm0,%ymm0 + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + vpsrld $30,%ymm0,%ymm8 + vpslld $2,%ymm0,%ymm0 + addl %r12d,%ebp + andl %edi,%ebx + addl -60(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + vpor %ymm8,%ymm0,%ymm0 + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + vpaddd %ymm11,%ymm0,%ymm9 + addl %r12d,%eax + andl %edi,%ebp + addl -56(%r13),%esi + xorl %ecx,%ebp + vmovdqu %ymm9,512(%rsp) + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl -52(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + addl -32(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + andl %edi,%edx + vpalignr $8,%ymm7,%ymm0,%ymm8 + vpxor %ymm5,%ymm1,%ymm1 + addl -28(%r13),%ebx + xorl %eax,%edx + vpxor %ymm2,%ymm1,%ymm1 + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + vpxor %ymm8,%ymm1,%ymm1 + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + vpsrld $30,%ymm1,%ymm8 + vpslld $2,%ymm1,%ymm1 + addl %r12d,%ebx + andl %edi,%ecx + addl -24(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + vpor %ymm8,%ymm1,%ymm1 + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + vpaddd %ymm11,%ymm1,%ymm9 + addl %r12d,%ebp + andl %edi,%ebx + addl -20(%r13),%eax + xorl %edx,%ebx + vmovdqu %ymm9,544(%rsp) + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 0(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl 4(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + vpalignr $8,%ymm0,%ymm1,%ymm8 + vpxor %ymm6,%ymm2,%ymm2 + addl 8(%r13),%ecx + xorl %ebp,%esi + vpxor %ymm3,%ymm2,%ymm2 + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + vpxor %ymm8,%ymm2,%ymm2 + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + vpsrld $30,%ymm2,%ymm8 + vpslld $2,%ymm2,%ymm2 + addl %r12d,%ecx + andl %edi,%edx + addl 12(%r13),%ebx + xorl %eax,%edx + movl %esi,%edi + xorl %eax,%edi + vpor %ymm8,%ymm2,%ymm2 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + vpaddd %ymm11,%ymm2,%ymm9 + addl %r12d,%ebx + andl %edi,%ecx + addl 32(%r13),%ebp + xorl %esi,%ecx + vmovdqu %ymm9,576(%rsp) + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl 
%r12d,%ebp + andl %edi,%ebx + addl 36(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 40(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + vpalignr $8,%ymm1,%ymm2,%ymm8 + vpxor %ymm7,%ymm3,%ymm3 + addl 44(%r13),%edx + xorl %ebx,%eax + vpxor %ymm4,%ymm3,%ymm3 + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + vpxor %ymm8,%ymm3,%ymm3 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + vpsrld $30,%ymm3,%ymm8 + vpslld $2,%ymm3,%ymm3 + addl %r12d,%edx + andl %edi,%esi + addl 64(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + vpor %ymm8,%ymm3,%ymm3 + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + vpaddd %ymm11,%ymm3,%ymm9 + addl %r12d,%ecx + andl %edi,%edx + addl 68(%r13),%ebx + xorl %eax,%edx + vmovdqu %ymm9,608(%rsp) + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + addl 72(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 76(%r13),%eax + xorl %edx,%ebx + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl 96(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl 100(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl 104(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl 108(%r13),%ebx + leaq 256(%r13),%r13 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -128(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -124(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -120(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -116(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -96(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -92(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -88(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -84(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -64(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -60(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl 
$27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -56(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -52(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -32(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -28(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -24(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -20(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + addl %r12d,%edx + leaq 128(%r9),%r13 + leaq 128(%r9),%rdi + cmpq %r10,%r13 + cmovaeq %r9,%r13 + + + addl 0(%r8),%edx + addl 4(%r8),%esi + addl 8(%r8),%ebp + movl %edx,0(%r8) + addl 12(%r8),%ebx + movl %esi,4(%r8) + movl %edx,%eax + addl 16(%r8),%ecx + movl %ebp,%r12d + movl %ebp,8(%r8) + movl %ebx,%edx + + movl %ebx,12(%r8) + movl %esi,%ebp + movl %ecx,16(%r8) + + movl %ecx,%esi + movl %r12d,%ecx + + + cmpq %r10,%r9 + je .Ldone_avx2 + vmovdqu 64(%r14),%ymm6 + cmpq %r10,%rdi + ja .Last_avx2 + + vmovdqu -64(%rdi),%xmm0 + vmovdqu -48(%rdi),%xmm1 + vmovdqu -32(%rdi),%xmm2 + vmovdqu -16(%rdi),%xmm3 + vinserti128 $1,0(%r13),%ymm0,%ymm0 + vinserti128 $1,16(%r13),%ymm1,%ymm1 + vinserti128 $1,32(%r13),%ymm2,%ymm2 + vinserti128 $1,48(%r13),%ymm3,%ymm3 + jmp .Last_avx2 + +.align 32 +.Last_avx2: + leaq 128+16(%rsp),%r13 + rorxl $2,%ebp,%ebx + andnl %edx,%ebp,%edi + andl %ecx,%ebp + xorl %edi,%ebp + subq $-128,%r9 + addl -128(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl -124(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -120(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -116(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl -96(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + addl -92(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + addl -88(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl -84(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -64(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -60(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl -56(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + addl 
-52(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + addl -32(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl -28(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -24(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -20(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl 0(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + addl 4(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + addl 8(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl 12(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl 32(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl 36(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl 40(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 44(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl 64(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vmovdqu -64(%r14),%ymm11 + vpshufb %ymm6,%ymm0,%ymm0 + addl 68(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl 72(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl 76(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl 96(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 100(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpshufb %ymm6,%ymm1,%ymm1 + vpaddd %ymm11,%ymm0,%ymm8 + addl 104(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl 108(%r13),%edx + leaq 256(%r13),%r13 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -128(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -124(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl 
-120(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vmovdqu %ymm8,0(%rsp) + vpshufb %ymm6,%ymm2,%ymm2 + vpaddd %ymm11,%ymm1,%ymm9 + addl -116(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -96(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -92(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -88(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -84(%r13),%ebx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + vmovdqu %ymm9,32(%rsp) + vpshufb %ymm6,%ymm3,%ymm3 + vpaddd %ymm11,%ymm2,%ymm6 + addl -64(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl -60(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl -56(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl -52(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + addl -32(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + andl %edi,%edx + jmp .Lalign32_3 +.align 32 +.Lalign32_3: + vmovdqu %ymm6,64(%rsp) + vpaddd %ymm11,%ymm3,%ymm7 + addl -28(%r13),%ebx + xorl %eax,%edx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + addl -24(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl -20(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 0(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl 4(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + vmovdqu %ymm7,96(%rsp) + addl 8(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + andl %edi,%edx + addl 12(%r13),%ebx + xorl %eax,%edx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + addl 32(%r13),%ebp + xorl %esi,%ecx + movl 
%edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 36(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 40(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + vpalignr $8,%ymm0,%ymm1,%ymm4 + addl 44(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + vpsrldq $4,%ymm3,%ymm8 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpxor %ymm0,%ymm4,%ymm4 + vpxor %ymm2,%ymm8,%ymm8 + xorl %ebp,%esi + addl %r12d,%edx + vpxor %ymm8,%ymm4,%ymm4 + andl %edi,%esi + addl 64(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + vpsrld $31,%ymm4,%ymm8 + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + vpslldq $12,%ymm4,%ymm10 + vpaddd %ymm4,%ymm4,%ymm4 + rorxl $2,%edx,%esi + xorl %eax,%edx + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm4,%ymm4 + addl %r12d,%ecx + andl %edi,%edx + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm4,%ymm4 + addl 68(%r13),%ebx + xorl %eax,%edx + vpxor %ymm10,%ymm4,%ymm4 + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + vpaddd %ymm11,%ymm4,%ymm9 + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + vmovdqu %ymm9,128(%rsp) + addl %r12d,%ebx + andl %edi,%ecx + addl 72(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 76(%r13),%eax + xorl %edx,%ebx + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpalignr $8,%ymm1,%ymm2,%ymm5 + addl 96(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + vpsrldq $4,%ymm4,%ymm8 + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm3,%ymm8,%ymm8 + addl 100(%r13),%edx + leal (%rdx,%rax,1),%edx + vpxor %ymm8,%ymm5,%ymm5 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + vpsrld $31,%ymm5,%ymm8 + vmovdqu -32(%r14),%ymm11 + xorl %ebx,%esi + addl 104(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + vpslldq $12,%ymm5,%ymm10 + vpaddd %ymm5,%ymm5,%ymm5 + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm5,%ymm5 + xorl %eax,%edx + addl %r12d,%ecx + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm5,%ymm5 + xorl %ebp,%edx + addl 108(%r13),%ebx + leaq 256(%r13),%r13 + vpxor %ymm10,%ymm5,%ymm5 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + vpaddd %ymm11,%ymm5,%ymm9 + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + vmovdqu %ymm9,160(%rsp) + addl -128(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vpalignr $8,%ymm2,%ymm3,%ymm6 + addl -124(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + vpsrldq $4,%ymm5,%ymm8 + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpxor %ymm2,%ymm6,%ymm6 + vpxor %ymm4,%ymm8,%ymm8 + addl -120(%r13),%esi + leal (%rsi,%rbp,1),%esi + vpxor %ymm8,%ymm6,%ymm6 + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + vpsrld $31,%ymm6,%ymm8 + xorl %ecx,%eax + addl -116(%r13),%edx + leal (%rdx,%rax,1),%edx + vpslldq 
$12,%ymm6,%ymm10 + vpaddd %ymm6,%ymm6,%ymm6 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm6,%ymm6 + xorl %ebp,%esi + addl %r12d,%edx + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm6,%ymm6 + xorl %ebx,%esi + addl -96(%r13),%ecx + vpxor %ymm10,%ymm6,%ymm6 + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + vpaddd %ymm11,%ymm6,%ymm9 + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + vmovdqu %ymm9,192(%rsp) + addl -92(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + vpalignr $8,%ymm3,%ymm4,%ymm7 + addl -88(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + vpsrldq $4,%ymm6,%ymm8 + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vpxor %ymm3,%ymm7,%ymm7 + vpxor %ymm5,%ymm8,%ymm8 + addl -84(%r13),%eax + leal (%rax,%rbx,1),%eax + vpxor %ymm8,%ymm7,%ymm7 + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + vpsrld $31,%ymm7,%ymm8 + xorl %edx,%ebp + addl -64(%r13),%esi + leal (%rsi,%rbp,1),%esi + vpslldq $12,%ymm7,%ymm10 + vpaddd %ymm7,%ymm7,%ymm7 + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm7,%ymm7 + xorl %ebx,%eax + addl %r12d,%esi + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm7,%ymm7 + xorl %ecx,%eax + addl -60(%r13),%edx + vpxor %ymm10,%ymm7,%ymm7 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpaddd %ymm11,%ymm7,%ymm9 + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + vmovdqu %ymm9,224(%rsp) + addl -56(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -52(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -32(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -28(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -24(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -20(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + addl %r12d,%edx + leaq 128(%rsp),%r13 + + + addl 0(%r8),%edx + addl 4(%r8),%esi + addl 8(%r8),%ebp + movl %edx,0(%r8) + addl 12(%r8),%ebx + movl %esi,4(%r8) + movl %edx,%eax + addl 16(%r8),%ecx + movl %ebp,%r12d + movl %ebp,8(%r8) + movl %ebx,%edx + + movl %ebx,12(%r8) + movl %esi,%ebp + movl %ecx,16(%r8) + + movl %ecx,%esi + movl %r12d,%ecx + + + cmpq %r10,%r9 + jbe .Loop_avx2 + +.Ldone_avx2: + vzeroupper + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbp +.cfi_restore %rbp + movq -8(%r11),%rbx +.cfi_restore %rbx + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx2: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2 +.align 64 +K_XX_XX: +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 
0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 +.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 + +.section .note.GNU-stack,"",%progbits diff --git a/lib/accelerated/x86/elf/sha256-ssse3-x86.s b/lib/accelerated/x86/elf/sha256-ssse3-x86.s index 7470ef7407..8d9aaa4a81 100644 --- a/lib/accelerated/x86/elf/sha256-ssse3-x86.s +++ b/lib/accelerated/x86/elf/sha256-ssse3-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -37,7 +37,6 @@ # # *** This file is auto-generated *** # -.file "sha512-586.s" .text .globl sha256_block_data_order .type sha256_block_data_order,@function @@ -64,20 +63,6 @@ sha256_block_data_order: movl %edi,4(%esp) movl %eax,8(%esp) movl %ebx,12(%esp) - leal _gnutls_x86_cpuid_s-.L001K256(%ebp),%edx - movl (%edx),%ecx - movl 4(%edx),%edx - testl $1048576,%ecx - jnz .L002loop - testl $2048,%edx - andl $1073741824,%ecx - andl $268435456,%edx - orl %edx,%ecx - cmpl $1342177280,%ecx - je .L003loop_shrd - subl %edi,%eax - cmpl $256,%eax - jae .L004unrolled jmp .L002loop .align 16 .L002loop: @@ -149,7 +134,7 @@ sha256_block_data_order: movl %ecx,28(%esp) movl %edi,32(%esp) .align 16 -.L00500_15: +.L00300_15: movl %edx,%ecx movl 24(%esp),%esi rorl $14,%ecx @@ -187,11 +172,11 @@ sha256_block_data_order: addl $4,%ebp addl %ebx,%eax cmpl $3248222580,%esi - jne .L00500_15 + jne .L00300_15 movl 156(%esp),%ecx - jmp .L00616_63 + jmp .L00416_63 .align 16 -.L00616_63: +.L00416_63: movl %ecx,%ebx movl 104(%esp),%esi rorl $11,%ecx @@ -246,7 +231,7 @@ sha256_block_data_order: addl $4,%ebp addl %ebx,%eax cmpl $3329325298,%esi - jne .L00616_63 + jne .L00416_63 movl 356(%esp),%esi movl 8(%esp),%ebx movl 16(%esp),%ecx @@ -280,8 +265,8 @@ sha256_block_data_order: popl %ebx popl %ebp ret -.align 16 -.L003loop_shrd: +.align 32 +.L005loop_shrd: movl (%edi),%eax movl 4(%edi),%ebx movl 8(%edi),%ecx @@ -350,7 +335,7 @@ sha256_block_data_order: movl %ecx,28(%esp) movl %edi,32(%esp) .align 16 -.L00700_15_shrd: +.L00600_15_shrd: movl %edx,%ecx movl 24(%esp),%esi shrdl $14,%ecx,%ecx @@ -388,11 +373,11 @@ sha256_block_data_order: addl $4,%ebp addl %ebx,%eax cmpl $3248222580,%esi - jne .L00700_15_shrd + jne .L00600_15_shrd movl 156(%esp),%ecx - jmp .L00816_63_shrd + jmp .L00716_63_shrd .align 16 -.L00816_63_shrd: +.L00716_63_shrd: movl %ecx,%ebx movl 104(%esp),%esi shrdl $11,%ecx,%ecx @@ -447,7 +432,7 @@ sha256_block_data_order: addl $4,%ebp addl %ebx,%eax cmpl $3329325298,%esi - jne .L00816_63_shrd + jne .L00716_63_shrd movl 356(%esp),%esi movl 8(%esp),%ebx movl 16(%esp),%ecx @@ -474,7 +459,7 @@ sha256_block_data_order: leal 356(%esp),%esp subl $256,%ebp cmpl 8(%esp),%edi - jb .L003loop_shrd + jb .L005loop_shrd movl 12(%esp),%esp popl %edi popl %esi @@ -485,8 +470,13 @@ sha256_block_data_order: .L001K256: .long 
1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298 .long 66051,67438087,134810123,202182159 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 +.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 +.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 +.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 +.byte 62,0 .align 16 -.L004unrolled: +.L008unrolled: leal -96(%esp),%esp movl (%esi),%eax movl 4(%esi),%ebp @@ -3393,14 +3383,5 @@ sha256_block_data_order: popl %ebp ret .size sha256_block_data_order,.-.L_sha256_block_data_order_begin -.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 -.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 -.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 -.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 -.byte 62,0 -.comm _gnutls_x86_cpuid_s,16,4 - .section .note.GNU-stack,"",%progbits - - diff --git a/lib/accelerated/x86/elf/sha256-ssse3-x86_64.s b/lib/accelerated/x86/elf/sha256-ssse3-x86_64.s new file mode 100644 index 0000000000..4b08e0c85e --- /dev/null +++ b/lib/accelerated/x86/elf/sha256-ssse3-x86_64.s @@ -0,0 +1,5471 @@ +# Copyright (c) 2011-2016, Andy Polyakov +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain copyright notices, +# this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# * Neither the name of the Andy Polyakov nor the names of its +# copyright holder and contributors may be used to endorse or +# promote products derived from this software without specific +# prior written permission. +# +# ALTERNATIVELY, provided that this notice is retained in full, this +# product may be distributed under the terms of the GNU General Public +# License (GPL), in which case the provisions of the GPL apply INSTEAD OF +# those given above. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# *** This file is auto-generated *** +# +.text + + +.globl sha256_block_data_order +.type sha256_block_data_order,@function +.align 16 +sha256_block_data_order: +.cfi_startproc + leaq _gnutls_x86_cpuid_s(%rip),%r11 + movl 0(%r11),%r9d + movl 4(%r11),%r10d + movl 8(%r11),%r11d + testl $536870912,%r11d + jnz _shaext_shortcut + andl $296,%r11d + cmpl $296,%r11d + je .Lavx2_shortcut + andl $1073741824,%r9d + andl $268435968,%r10d + orl %r9d,%r10d + cmpl $1342177792,%r10d + je .Lavx_shortcut + testl $512,%r10d + jnz .Lssse3_shortcut + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $64+32,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp .Lloop + +.align 16 +.Lloop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + addl %r14d,%r10d + movl 8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx 
+ addl %r12d,%r9d + + leaq 4(%rbp),%rbp + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + addl %r14d,%edx + movl 20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + addl %r14d,%eax + movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl 
%r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + addl %r14d,%r11d + movl 36(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl 
%edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl 
%r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + movl 24(%rsp),%r13d + movl 12(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + 
rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl 
%r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d + + addl 52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + 
addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + cmpb $0,3(%rbp) + jnz .Lrounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop + + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha256_block_data_order,.-sha256_block_data_order +.align 64 +.type K256,@object +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 
83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.type sha256_block_data_order_shaext,@function +.align 64 +sha256_block_data_order_shaext: +_shaext_shortcut: + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 512-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 64-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 96-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 224-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 256-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 288-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 320-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 352-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 384-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 
15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 416-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 448-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 480-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz .Loop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + .byte 0xf3,0xc3 +.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext +.type sha256_block_data_order_ssse3,@function +.align 64 +sha256_block_data_order_ssse3: +.cfi_startproc +.Lssse3_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue_ssse3: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa K256+512(%rip),%xmm7 + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rbp +.byte 102,15,56,0,207 + movdqa 0(%rbp),%xmm4 + movdqa 32(%rbp),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 64(%rbp),%xmm6 +.byte 102,15,56,0,223 + movdqa 96(%rbp),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + subq $-128,%rbp + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl 
%eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld 
$10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 32(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 64(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 
102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 96(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne .Lssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl 
$5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + 
movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_ssse3 + + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq 
(%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_ssse3: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3 +.type sha256_block_data_order_avx,@function +.align 64 +sha256_block_data_order_avx: +.cfi_startproc +.Lavx_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue_avx: + + vzeroupper + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%xmm8 + vmovdqa K256+512+64(%rip),%xmm9 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa K256+512(%rip),%xmm7 + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm7,%xmm0,%xmm0 + leaq K256(%rip),%rbp + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd 0(%rbp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 32(%rbp),%xmm1,%xmm5 + vpaddd 64(%rbp),%xmm2,%xmm6 + vpaddd 96(%rbp),%xmm3,%xmm7 + vmovdqa %xmm4,0(%rsp) + movl %eax,%r14d + vmovdqa %xmm5,16(%rsp) + movl %ebx,%edi + vmovdqa %xmm6,32(%rsp) + xorl %ecx,%edi + vmovdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + subq $-128,%rbp + vpalignr $4,%xmm0,%xmm1,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm2,%xmm3,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm0,%xmm0 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm3,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm0,%xmm0 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm0,%xmm0 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + vpshufd $80,%xmm0,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + 
vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm0,%xmm0 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 0(%rbp),%xmm0,%xmm6 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,0(%rsp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm3,%xmm0,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm1,%xmm1 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm0,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm1,%xmm1 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm1,%xmm1 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + vpshufd $80,%xmm1,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm1,%xmm1 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 32(%rbp),%xmm1,%xmm6 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl 
%ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,16(%rsp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm0,%xmm1,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm2,%xmm2 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm1,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm2,%xmm2 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm2,%xmm2 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + vpshufd $80,%xmm2,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm2,%xmm2 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 64(%rbp),%xmm2,%xmm6 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,32(%rsp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm1,%xmm2,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm3,%xmm3 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl 
%r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm2,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm3,%xmm3 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm3,%xmm3 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + vpshufd $80,%xmm3,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm3,%xmm3 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 96(%rbp),%xmm3,%xmm6 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne .Lavx_00_47 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl 
$5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl 
%r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 
20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_avx + + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha256_block_data_order_avx,.-sha256_block_data_order_avx +.type sha256_block_data_order_avx2,@function +.align 64 +sha256_block_data_order_avx2: +.cfi_startproc +.Lavx2_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + subq $544,%rsp + shlq $4,%rdx + andq $-1024,%rsp + leaq (%rsi,%rdx,4),%rdx + addq $448,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue_avx2: + + vzeroupper + subq $-64,%rsi + movl 0(%rdi),%eax + movq %rsi,%r12 + movl 4(%rdi),%ebx + cmpq %rdx,%rsi + movl 8(%rdi),%ecx + cmoveq %rsp,%r12 + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%ymm8 + vmovdqa K256+512+64(%rip),%ymm9 + jmp .Loop_avx2 +.align 16 +.Loop_avx2: + vmovdqa K256+512(%rip),%ymm7 + vmovdqu -64+0(%rsi),%xmm0 + vmovdqu -64+16(%rsi),%xmm1 + vmovdqu -64+32(%rsi),%xmm2 + vmovdqu -64+48(%rsi),%xmm3 + + vinserti128 $1,(%r12),%ymm0,%ymm0 + vinserti128 $1,16(%r12),%ymm1,%ymm1 + vpshufb %ymm7,%ymm0,%ymm0 + vinserti128 $1,32(%r12),%ymm2,%ymm2 + vpshufb %ymm7,%ymm1,%ymm1 + vinserti128 $1,48(%r12),%ymm3,%ymm3 + + leaq K256(%rip),%rbp + vpshufb %ymm7,%ymm2,%ymm2 + vpaddd 0(%rbp),%ymm0,%ymm4 + vpshufb %ymm7,%ymm3,%ymm3 + vpaddd 32(%rbp),%ymm1,%ymm5 + vpaddd 64(%rbp),%ymm2,%ymm6 + vpaddd 96(%rbp),%ymm3,%ymm7 + vmovdqa %ymm4,0(%rsp) + xorl %r14d,%r14d + vmovdqa %ymm5,32(%rsp) + leaq -64(%rsp),%rsp + movl %ebx,%edi + vmovdqa %ymm6,0(%rsp) + xorl %ecx,%edi + vmovdqa %ymm7,32(%rsp) + movl %r9d,%r12d + subq $-32*4,%rbp + jmp .Lavx2_00_47 + +.align 16 +.Lavx2_00_47: + leaq -64(%rsp),%rsp + vpalignr $4,%ymm0,%ymm1,%ymm4 + addl 0+128(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + vpalignr $4,%ymm2,%ymm3,%ymm7 + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + vpsrld $7,%ymm4,%ymm6 + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + vpaddd %ymm7,%ymm0,%ymm0 + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + vpshufd $250,%ymm3,%ymm7 + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 4+128(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + vpslld $11,%ymm5,%ymm5 + andnl %r9d,%edx,%r12d + xorl 
%edi,%r13d + rorxl $6,%edx,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + vpaddd %ymm4,%ymm0,%ymm0 + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 8+128(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + vpxor %ymm7,%ymm6,%ymm6 + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + vpaddd %ymm6,%ymm0,%ymm0 + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + vpshufd $80,%ymm0,%ymm7 + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 12+128(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + vpxor %ymm7,%ymm6,%ymm6 + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + vpaddd %ymm6,%ymm0,%ymm0 + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + vpaddd 0(%rbp),%ymm0,%ymm6 + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + vmovdqa %ymm6,0(%rsp) + vpalignr $4,%ymm1,%ymm2,%ymm4 + addl 32+128(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + vpalignr $4,%ymm3,%ymm0,%ymm7 + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + vpsrld $7,%ymm4,%ymm6 + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + vpaddd %ymm7,%ymm1,%ymm1 + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + vpshufd $250,%ymm0,%ymm7 + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 36+128(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + vpslld $11,%ymm5,%ymm5 + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + vpaddd %ymm4,%ymm1,%ymm1 + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 40+128(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r10d,%r15d + leal 
(%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + vpxor %ymm7,%ymm6,%ymm6 + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + vpaddd %ymm6,%ymm1,%ymm1 + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + vpshufd $80,%ymm1,%ymm7 + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 44+128(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + vpxor %ymm7,%ymm6,%ymm6 + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + vpaddd %ymm6,%ymm1,%ymm1 + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + vpaddd 32(%rbp),%ymm1,%ymm6 + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + vmovdqa %ymm6,32(%rsp) + leaq -64(%rsp),%rsp + vpalignr $4,%ymm2,%ymm3,%ymm4 + addl 0+128(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + vpalignr $4,%ymm0,%ymm1,%ymm7 + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + vpsrld $7,%ymm4,%ymm6 + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + vpaddd %ymm7,%ymm2,%ymm2 + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + vpshufd $250,%ymm1,%ymm7 + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 4+128(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + vpslld $11,%ymm5,%ymm5 + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + vpaddd %ymm4,%ymm2,%ymm2 + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 8+128(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + vpxor %ymm7,%ymm6,%ymm6 + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + vpaddd %ymm6,%ymm2,%ymm2 + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + vpshufd $80,%ymm2,%ymm7 + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + vpxor %ymm7,%ymm6,%ymm6 
+ addl 12+128(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + vpxor %ymm7,%ymm6,%ymm6 + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + vpaddd %ymm6,%ymm2,%ymm2 + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + vpaddd 64(%rbp),%ymm2,%ymm6 + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + vmovdqa %ymm6,0(%rsp) + vpalignr $4,%ymm3,%ymm0,%ymm4 + addl 32+128(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + vpalignr $4,%ymm1,%ymm2,%ymm7 + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + vpsrld $7,%ymm4,%ymm6 + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + vpaddd %ymm7,%ymm3,%ymm3 + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + vpshufd $250,%ymm2,%ymm7 + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 36+128(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + vpslld $11,%ymm5,%ymm5 + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + vpaddd %ymm4,%ymm3,%ymm3 + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 40+128(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + vpxor %ymm7,%ymm6,%ymm6 + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + vpaddd %ymm6,%ymm3,%ymm3 + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + vpshufd $80,%ymm3,%ymm7 + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 44+128(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + vpxor %ymm7,%ymm6,%ymm6 + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + vpaddd %ymm6,%ymm3,%ymm3 + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + vpaddd 96(%rbp),%ymm3,%ymm6 + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + 
movl %r9d,%r12d + vmovdqa %ymm6,32(%rsp) + leaq 128(%rbp),%rbp + cmpb $0,3(%rbp) + jne .Lavx2_00_47 + addl 0+64(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + addl 4+64(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8+64(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + addl 12+64(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32+64(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + addl 36+64(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40+64(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal 
(%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + addl 44+64(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + addl 0(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + addl 4(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + addl 12(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + addl 36(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d 
+ leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + addl 44(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + movq 512(%rsp),%rdi + addl %r14d,%eax + + leaq 448(%rsp),%rbp + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + + cmpq 80(%rbp),%rsi + je .Ldone_avx2 + + xorl %r14d,%r14d + movl %ebx,%edi + xorl %ecx,%edi + movl %r9d,%r12d + jmp .Lower_avx2 +.align 16 +.Lower_avx2: + addl 0+16(%rbp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + addl 4+16(%rbp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8+16(%rbp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + addl 12+16(%rbp),%r8d + andl %ebx,%r12d + rorxl 
$25,%ebx,%r13d + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32+16(%rbp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + addl 36+16(%rbp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40+16(%rbp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + addl 44+16(%rbp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + leaq -64(%rbp),%rbp + cmpq %rsp,%rbp + jae .Lower_avx2 + + movq 512(%rsp),%rdi + addl %r14d,%eax + + leaq 448(%rsp),%rsp + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + leaq 128(%rsi),%rsi + addl 24(%rdi),%r10d + movq %rsi,%r12 + addl 28(%rdi),%r11d + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + cmoveq %rsp,%r12 + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + + jbe .Loop_avx2 + leaq (%rsp),%rbp + +.Ldone_avx2: + leaq (%rbp),%rsp + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx2: + .byte 
0xf3,0xc3 +.cfi_endproc +.size sha256_block_data_order_avx2,.-sha256_block_data_order_avx2 + +.section .note.GNU-stack,"",%progbits diff --git a/lib/accelerated/x86/elf/sha512-ssse3-x86.s b/lib/accelerated/x86/elf/sha512-ssse3-x86.s index 0b99b22ec9..481c777154 100644 --- a/lib/accelerated/x86/elf/sha512-ssse3-x86.s +++ b/lib/accelerated/x86/elf/sha512-ssse3-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -37,7 +37,6 @@ # # *** This file is auto-generated *** # -.file "sha512-586.s" .text .globl sha512_block_data_order .type sha512_block_data_order,@function @@ -594,6 +593,8 @@ sha512_block_data_order: .long 4234509866,1501505948 .long 987167468,1607167915 .long 1246189591,1816402316 +.long 67438087,66051 +.long 202182159,134810123 .size sha512_block_data_order,.-.L_sha512_block_data_order_begin .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97 .byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 @@ -601,7 +602,4 @@ sha512_block_data_order: .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 - .section .note.GNU-stack,"",%progbits - - diff --git a/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s b/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s index d51d8169bc..e384d7e9e8 100644 --- a/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s +++ b/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -40,2854 +40,5439 @@ .text -.globl sha256_block_data_order -.type sha256_block_data_order,@function +.globl sha512_block_data_order +.type sha512_block_data_order,@function .align 16 -sha256_block_data_order: +sha512_block_data_order: +.cfi_startproc leaq _gnutls_x86_cpuid_s(%rip),%r11 movl 0(%r11),%r9d movl 4(%r11),%r10d movl 8(%r11),%r11d - testl $512,%r10d - jnz .Lssse3_shortcut + testl $2048,%r10d + jnz .Lxop_shortcut + andl $296,%r11d + cmpl $296,%r11d + je .Lavx2_shortcut + andl $1073741824,%r9d + andl $268435968,%r10d + orl %r9d,%r10d + cmpl $1342177792,%r10d + je .Lavx_shortcut + movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 - movq %rsp,%r11 +.cfi_offset %r15,-56 shlq $4,%rdx - subq $64+32,%rsp - leaq (%rsi,%rdx,4),%rdx + subq $128+32,%rsp + leaq (%rsi,%rdx,8),%rdx andq $-64,%rsp - movq %rdi,64+0(%rsp) - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 .Lprologue: - movl 0(%rdi),%eax - movl 4(%rdi),%ebx - movl 8(%rdi),%ecx - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 jmp .Lloop .align 16 .Lloop: - movl %ebx,%edi - leaq K256(%rip),%rbp - xorl %ecx,%edi - movl 0(%rsi),%r12d - movl %r8d,%r13d - movl %eax,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r9d,%r15d - - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r15d - - movl %r12d,0(%rsp) - xorl %eax,%r14d - andl 
%r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl (%rbp),%r12d - xorl %eax,%r14d - - rorl $6,%r13d - xorl %ebx,%r15d - movl %ebx,%r11d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - leaq 4(%rbp),%rbp - addl %r14d,%r11d - - movl 4(%rsi),%r12d - movl %edx,%r13d - movl %r11d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r8d,%edi - - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%edi - - movl %r12d,4(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl (%rbp),%r12d - xorl %r11d,%r14d - - rorl $6,%r13d - xorl %eax,%edi - movl %eax,%r10d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - leaq 4(%rbp),%rbp - addl %r14d,%r10d - - movl 8(%rsi),%r12d - movl %ecx,%r13d - movl %r10d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %edx,%r15d - - rorl $9,%r14d - xorl %ecx,%r13d - xorl %r8d,%r15d - - movl %r12d,8(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl (%rbp),%r12d - xorl %r10d,%r14d - - rorl $6,%r13d - xorl %r11d,%r15d - movl %r11d,%r9d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - leaq 4(%rbp),%rbp - addl %r14d,%r9d - - movl 12(%rsi),%r12d - movl %ebx,%r13d - movl %r9d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ecx,%edi - - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%edi - - movl %r12d,12(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl (%rbp),%r12d - xorl %r9d,%r14d - - rorl $6,%r13d - xorl %r10d,%edi - movl %r10d,%r8d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - leaq 20(%rbp),%rbp - addl %r14d,%r8d - - movl 16(%rsi),%r12d - movl %eax,%r13d - movl %r8d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ebx,%r15d - - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r15d - - movl %r12d,16(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl (%rbp),%r12d - xorl %r8d,%r14d - - rorl $6,%r13d - xorl %r9d,%r15d - movl %r9d,%edx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - leaq 4(%rbp),%rbp - addl %r14d,%edx - - movl 20(%rsi),%r12d - movl %r11d,%r13d - movl %edx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %eax,%edi - - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%edi - - movl %r12d,20(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl (%rbp),%r12d - xorl %edx,%r14d - - rorl $6,%r13d - xorl %r8d,%edi - movl %r8d,%ecx - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - leaq 4(%rbp),%rbp - addl %r14d,%ecx - - movl 24(%rsi),%r12d - movl %r10d,%r13d - movl %ecx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r11d,%r15d - - rorl $9,%r14d - xorl %r10d,%r13d - xorl %eax,%r15d - - movl %r12d,24(%rsp) - xorl 
%ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl (%rbp),%r12d - xorl %ecx,%r14d - - rorl $6,%r13d - xorl %edx,%r15d - movl %edx,%ebx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - leaq 4(%rbp),%rbp - addl %r14d,%ebx - - movl 28(%rsi),%r12d - movl %r9d,%r13d - movl %ebx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r10d,%edi - - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%edi - - movl %r12d,28(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl (%rbp),%r12d - xorl %ebx,%r14d - - rorl $6,%r13d - xorl %ecx,%edi - movl %ecx,%eax - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - leaq 20(%rbp),%rbp - addl %r14d,%eax - - movl 32(%rsi),%r12d - movl %r8d,%r13d - movl %eax,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r9d,%r15d - - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r15d - - movl %r12d,32(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl (%rbp),%r12d - xorl %eax,%r14d - - rorl $6,%r13d - xorl %ebx,%r15d - movl %ebx,%r11d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - leaq 4(%rbp),%rbp - addl %r14d,%r11d - - movl 36(%rsi),%r12d - movl %edx,%r13d - movl %r11d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r8d,%edi - - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%edi - - movl %r12d,36(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl (%rbp),%r12d - xorl %r11d,%r14d - - rorl $6,%r13d - xorl %eax,%edi - movl %eax,%r10d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - leaq 4(%rbp),%rbp - addl %r14d,%r10d - - movl 40(%rsi),%r12d - movl %ecx,%r13d - movl %r10d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %edx,%r15d - - rorl $9,%r14d - xorl %ecx,%r13d - xorl %r8d,%r15d - - movl %r12d,40(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl (%rbp),%r12d - xorl %r10d,%r14d - - rorl $6,%r13d - xorl %r11d,%r15d - movl %r11d,%r9d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - leaq 4(%rbp),%rbp - addl %r14d,%r9d - - movl 44(%rsi),%r12d - movl %ebx,%r13d - movl %r9d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ecx,%edi - - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%edi - - movl %r12d,44(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl (%rbp),%r12d - xorl %r9d,%r14d - - rorl $6,%r13d - xorl %r10d,%edi - movl %r10d,%r8d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - leaq 20(%rbp),%rbp - addl %r14d,%r8d - - movl 48(%rsi),%r12d - movl %eax,%r13d - movl %r8d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ebx,%r15d - - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r15d - - movl 
%r12d,48(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl (%rbp),%r12d - xorl %r8d,%r14d - - rorl $6,%r13d - xorl %r9d,%r15d - movl %r9d,%edx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - leaq 4(%rbp),%rbp - addl %r14d,%edx - - movl 52(%rsi),%r12d - movl %r11d,%r13d - movl %edx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %eax,%edi - - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%edi - - movl %r12d,52(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl (%rbp),%r12d - xorl %edx,%r14d - - rorl $6,%r13d - xorl %r8d,%edi - movl %r8d,%ecx - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - leaq 4(%rbp),%rbp - addl %r14d,%ecx - - movl 56(%rsi),%r12d - movl %r10d,%r13d - movl %ecx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r11d,%r15d - - rorl $9,%r14d - xorl %r10d,%r13d - xorl %eax,%r15d - - movl %r12d,56(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl (%rbp),%r12d - xorl %ecx,%r14d - - rorl $6,%r13d - xorl %edx,%r15d - movl %edx,%ebx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - leaq 4(%rbp),%rbp - addl %r14d,%ebx - - movl 60(%rsi),%r12d - movl %r9d,%r13d - movl %ebx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r10d,%edi - - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%edi - - movl %r12d,60(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl (%rbp),%r12d - xorl %ebx,%r14d - - rorl $6,%r13d - xorl %ecx,%edi - movl %ecx,%eax - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - movl 4(%rsp),%r13d - leaq 20(%rbp),%rbp - addl %r14d,%eax - + movq %rbx,%rdi + leaq K512(%rip),%rbp + xorq %rcx,%rdi + movq 0(%rsi),%r12 + movq %r8,%r13 + movq %rax,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,0(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + addq %r14,%r11 + movq 8(%rsi),%r12 + movq %rdx,%r13 + movq %r11,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,8(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + addq %r14,%r10 + movq 16(%rsi),%r12 + movq %rcx,%r13 + movq %r10,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + 
movq %r12,16(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + addq %r14,%r9 + movq 24(%rsi),%r12 + movq %rbx,%r13 + movq %r9,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,24(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + addq %r14,%r8 + movq 32(%rsi),%r12 + movq %rax,%r13 + movq %r8,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,32(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + addq %r14,%rdx + movq 40(%rsi),%r12 + movq %r11,%r13 + movq %rdx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,40(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + addq %r14,%rcx + movq 48(%rsi),%r12 + movq %r10,%r13 + movq %rcx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,48(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + addq %r14,%rbx + movq 56(%rsi),%r12 + movq %r9,%r13 + movq %rbx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,56(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + addq %r14,%rax + movq 64(%rsi),%r12 + movq %r8,%r13 + movq %rax,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,64(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq 
%rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + addq %r14,%r11 + movq 72(%rsi),%r12 + movq %rdx,%r13 + movq %r11,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,72(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + addq %r14,%r10 + movq 80(%rsi),%r12 + movq %rcx,%r13 + movq %r10,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,80(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + addq %r14,%r9 + movq 88(%rsi),%r12 + movq %rbx,%r13 + movq %r9,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,88(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + addq %r14,%r8 + movq 96(%rsi),%r12 + movq %rax,%r13 + movq %r8,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,96(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + addq %r14,%rdx + movq 104(%rsi),%r12 + movq %r11,%r13 + movq %rdx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,104(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + addq %r14,%rcx + movq 112(%rsi),%r12 + movq %r10,%r13 + movq %rcx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,112(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + addq 
%r14,%rbx + movq 120(%rsi),%r12 + movq %r9,%r13 + movq %rbx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,120(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp jmp .Lrounds_16_xx .align 16 .Lrounds_16_xx: - - movl 56(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 36(%rsp),%r12d - xorl %r15d,%r14d - - addl 0(%rsp),%r12d - movl %r8d,%r13d - addl %r14d,%r12d - movl %eax,%r14d - rorl $14,%r13d - movl %r9d,%r15d - - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r15d - - movl %r12d,0(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl (%rbp),%r12d - xorl %eax,%r14d - - rorl $6,%r13d - xorl %ebx,%r15d - movl %ebx,%r11d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - movl 8(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%r11d - - - movl 60(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 40(%rsp),%r12d - xorl %edi,%r14d - - addl 4(%rsp),%r12d - movl %edx,%r13d - addl %r14d,%r12d - movl %r11d,%r14d - rorl $14,%r13d - movl %r8d,%edi - - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%edi - - movl %r12d,4(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl (%rbp),%r12d - xorl %r11d,%r14d - - rorl $6,%r13d - xorl %eax,%edi - movl %eax,%r10d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - movl 12(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%r10d - - - movl 0(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 44(%rsp),%r12d - xorl %r15d,%r14d - - addl 8(%rsp),%r12d - movl %ecx,%r13d - addl %r14d,%r12d - movl %r10d,%r14d - rorl $14,%r13d - movl %edx,%r15d - - rorl $9,%r14d - xorl %ecx,%r13d - xorl %r8d,%r15d - - movl %r12d,8(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl (%rbp),%r12d - xorl %r10d,%r14d - - rorl $6,%r13d - xorl %r11d,%r15d - movl %r11d,%r9d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - movl 16(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%r9d - - - movl 4(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 48(%rsp),%r12d - xorl %edi,%r14d - - addl 12(%rsp),%r12d - movl 
%ebx,%r13d - addl %r14d,%r12d - movl %r9d,%r14d - rorl $14,%r13d - movl %ecx,%edi - - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%edi - - movl %r12d,12(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl (%rbp),%r12d - xorl %r9d,%r14d - - rorl $6,%r13d - xorl %r10d,%edi - movl %r10d,%r8d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - movl 20(%rsp),%r13d - leaq 20(%rbp),%rbp - addl %r14d,%r8d - - - movl 8(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 52(%rsp),%r12d - xorl %r15d,%r14d - - addl 16(%rsp),%r12d - movl %eax,%r13d - addl %r14d,%r12d - movl %r8d,%r14d - rorl $14,%r13d - movl %ebx,%r15d - - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r15d - - movl %r12d,16(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl (%rbp),%r12d - xorl %r8d,%r14d - - rorl $6,%r13d - xorl %r9d,%r15d - movl %r9d,%edx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - movl 24(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%edx - - - movl 12(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 56(%rsp),%r12d - xorl %edi,%r14d - - addl 20(%rsp),%r12d - movl %r11d,%r13d - addl %r14d,%r12d - movl %edx,%r14d - rorl $14,%r13d - movl %eax,%edi - - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%edi - - movl %r12d,20(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl (%rbp),%r12d - xorl %edx,%r14d - - rorl $6,%r13d - xorl %r8d,%edi - movl %r8d,%ecx - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - movl 28(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%ecx - - - movl 16(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 60(%rsp),%r12d - xorl %r15d,%r14d - - addl 24(%rsp),%r12d - movl %r10d,%r13d - addl %r14d,%r12d - movl %ecx,%r14d - rorl $14,%r13d - movl %r11d,%r15d - - rorl $9,%r14d - xorl %r10d,%r13d - xorl %eax,%r15d - - movl %r12d,24(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl (%rbp),%r12d - xorl %ecx,%r14d - - rorl $6,%r13d - xorl %edx,%r15d - movl %edx,%ebx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - movl 32(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%ebx - - - movl 20(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 0(%rsp),%r12d - xorl %edi,%r14d - - addl 28(%rsp),%r12d - movl %r9d,%r13d - addl 
%r14d,%r12d - movl %ebx,%r14d - rorl $14,%r13d - movl %r10d,%edi - - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%edi - - movl %r12d,28(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl (%rbp),%r12d - xorl %ebx,%r14d - - rorl $6,%r13d - xorl %ecx,%edi - movl %ecx,%eax - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - movl 36(%rsp),%r13d - leaq 20(%rbp),%rbp - addl %r14d,%eax - - - movl 24(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 4(%rsp),%r12d - xorl %r15d,%r14d - - addl 32(%rsp),%r12d - movl %r8d,%r13d - addl %r14d,%r12d - movl %eax,%r14d - rorl $14,%r13d - movl %r9d,%r15d - - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r15d - - movl %r12d,32(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl (%rbp),%r12d - xorl %eax,%r14d - - rorl $6,%r13d - xorl %ebx,%r15d - movl %ebx,%r11d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - movl 40(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%r11d - - - movl 28(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 8(%rsp),%r12d - xorl %edi,%r14d - - addl 36(%rsp),%r12d - movl %edx,%r13d - addl %r14d,%r12d - movl %r11d,%r14d - rorl $14,%r13d - movl %r8d,%edi - - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%edi - - movl %r12d,36(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl (%rbp),%r12d - xorl %r11d,%r14d - - rorl $6,%r13d - xorl %eax,%edi - movl %eax,%r10d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - movl 44(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%r10d - - - movl 32(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 12(%rsp),%r12d - xorl %r15d,%r14d - - addl 40(%rsp),%r12d - movl %ecx,%r13d - addl %r14d,%r12d - movl %r10d,%r14d - rorl $14,%r13d - movl %edx,%r15d - - rorl $9,%r14d - xorl %ecx,%r13d - xorl %r8d,%r15d - - movl %r12d,40(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl (%rbp),%r12d - xorl %r10d,%r14d - - rorl $6,%r13d - xorl %r11d,%r15d - movl %r11d,%r9d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - movl 48(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%r9d - - - movl 36(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 16(%rsp),%r12d - xorl %edi,%r14d - - addl 44(%rsp),%r12d - movl %ebx,%r13d - addl 
%r14d,%r12d - movl %r9d,%r14d - rorl $14,%r13d - movl %ecx,%edi - - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%edi - - movl %r12d,44(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl (%rbp),%r12d - xorl %r9d,%r14d - - rorl $6,%r13d - xorl %r10d,%edi - movl %r10d,%r8d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - movl 52(%rsp),%r13d - leaq 20(%rbp),%rbp - addl %r14d,%r8d - - - movl 40(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 20(%rsp),%r12d - xorl %r15d,%r14d - - addl 48(%rsp),%r12d - movl %eax,%r13d - addl %r14d,%r12d - movl %r8d,%r14d - rorl $14,%r13d - movl %ebx,%r15d - - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r15d - - movl %r12d,48(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl (%rbp),%r12d - xorl %r8d,%r14d - - rorl $6,%r13d - xorl %r9d,%r15d - movl %r9d,%edx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - movl 56(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%edx - - - movl 44(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 24(%rsp),%r12d - xorl %edi,%r14d - - addl 52(%rsp),%r12d - movl %r11d,%r13d - addl %r14d,%r12d - movl %edx,%r14d - rorl $14,%r13d - movl %eax,%edi - - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%edi - - movl %r12d,52(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl (%rbp),%r12d - xorl %edx,%r14d - - rorl $6,%r13d - xorl %r8d,%edi - movl %r8d,%ecx - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - movl 60(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%ecx - - - movl 48(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 28(%rsp),%r12d - xorl %r15d,%r14d - - addl 56(%rsp),%r12d - movl %r10d,%r13d - addl %r14d,%r12d - movl %ecx,%r14d - rorl $14,%r13d - movl %r11d,%r15d - - rorl $9,%r14d - xorl %r10d,%r13d - xorl %eax,%r15d - - movl %r12d,56(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl (%rbp),%r12d - xorl %ecx,%r14d - - rorl $6,%r13d - xorl %edx,%r15d - movl %edx,%ebx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - movl 0(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%ebx - - - movl 52(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 32(%rsp),%r12d - xorl %edi,%r14d - - addl 60(%rsp),%r12d - movl %r9d,%r13d - addl %r14d,%r12d - 
movl %ebx,%r14d - rorl $14,%r13d - movl %r10d,%edi - - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%edi - - movl %r12d,60(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl (%rbp),%r12d - xorl %ebx,%r14d - - rorl $6,%r13d - xorl %ecx,%edi - movl %ecx,%eax - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - movl 4(%rsp),%r13d - leaq 20(%rbp),%rbp - addl %r14d,%eax - - cmpb $0,3(%rbp) + movq 8(%rsp),%r13 + movq 112(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 72(%rsp),%r12 + + addq 0(%rsp),%r12 + movq %r8,%r13 + addq %r15,%r12 + movq %rax,%r14 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,0(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + movq 16(%rsp),%r13 + movq 120(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 80(%rsp),%r12 + + addq 8(%rsp),%r12 + movq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%r14 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,8(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + movq 24(%rsp),%r13 + movq 0(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 88(%rsp),%r12 + + addq 16(%rsp),%r12 + movq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r14 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,16(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + movq 32(%rsp),%r13 + movq 8(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 96(%rsp),%r12 + + addq 24(%rsp),%r12 + movq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%r14 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,24(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq 
$4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + movq 40(%rsp),%r13 + movq 16(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 104(%rsp),%r12 + + addq 32(%rsp),%r12 + movq %rax,%r13 + addq %r15,%r12 + movq %r8,%r14 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,32(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + movq 48(%rsp),%r13 + movq 24(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 112(%rsp),%r12 + + addq 40(%rsp),%r12 + movq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%r14 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,40(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + movq 56(%rsp),%r13 + movq 32(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 120(%rsp),%r12 + + addq 48(%rsp),%r12 + movq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r14 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,48(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + movq 64(%rsp),%r13 + movq 40(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 0(%rsp),%r12 + + addq 56(%rsp),%r12 + movq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%r14 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,56(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq 
%r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + movq 72(%rsp),%r13 + movq 48(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 8(%rsp),%r12 + + addq 64(%rsp),%r12 + movq %r8,%r13 + addq %r15,%r12 + movq %rax,%r14 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,64(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + movq 80(%rsp),%r13 + movq 56(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 16(%rsp),%r12 + + addq 72(%rsp),%r12 + movq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%r14 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,72(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + movq 88(%rsp),%r13 + movq 64(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 24(%rsp),%r12 + + addq 80(%rsp),%r12 + movq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r14 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,80(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + movq 96(%rsp),%r13 + movq 72(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 32(%rsp),%r12 + + addq 88(%rsp),%r12 + movq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%r14 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,88(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + movq 104(%rsp),%r13 + movq 80(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq 
$1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 40(%rsp),%r12 + + addq 96(%rsp),%r12 + movq %rax,%r13 + addq %r15,%r12 + movq %r8,%r14 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,96(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + movq 112(%rsp),%r13 + movq 88(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 48(%rsp),%r12 + + addq 104(%rsp),%r12 + movq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%r14 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,104(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + movq 120(%rsp),%r13 + movq 96(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 56(%rsp),%r12 + + addq 112(%rsp),%r12 + movq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r14 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,112(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + movq 0(%rsp),%r13 + movq 104(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 64(%rsp),%r12 + + addq 120(%rsp),%r12 + movq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%r14 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,120(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + cmpb $0,7(%rbp) jnz .Lrounds_16_xx - movq 64+0(%rsp),%rdi - leaq 64(%rsi),%rsi - - addl 0(%rdi),%eax - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - cmpq 64+16(%rsp),%rsi - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) 
- movl %r10d,24(%rdi) - movl %r11d,28(%rdi) + movq 128+0(%rsp),%rdi + addq %r14,%rax + leaq 128(%rsi),%rsi + + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) jb .Lloop - movq 64+24(%rsp),%rsi - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue: .byte 0xf3,0xc3 -.size sha256_block_data_order,.-sha256_block_data_order +.cfi_endproc +.size sha512_block_data_order,.-sha512_block_data_order .align 64 -.type K256,@object -K256: -.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 -.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f -.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff -.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff -.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 -.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 -.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.type sha256_block_data_order_ssse3,@function +.type K512,@object +K512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 
0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +.quad 0x0001020304050607,0x08090a0b0c0d0e0f 
+.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.type sha512_block_data_order_xop,@function .align 64 -sha256_block_data_order_ssse3: -.Lssse3_shortcut: +sha512_block_data_order_xop: +.cfi_startproc +.Lxop_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 - movq %rsp,%r11 +.cfi_offset %r15,-56 shlq $4,%rdx - subq $96,%rsp - leaq (%rsi,%rdx,4),%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx andq $-64,%rsp - movq %rdi,64+0(%rsp) - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) -.Lprologue_ssse3: - - movl 0(%rdi),%eax - movl 4(%rdi),%ebx - movl 8(%rdi),%ecx - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d - movdqa K256+512+32(%rip),%xmm8 - movdqa K256+512+64(%rip),%xmm9 - jmp .Lloop_ssse3 + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 +.Lprologue_xop: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Lloop_xop .align 16 -.Lloop_ssse3: - movdqa K256+512(%rip),%xmm7 - movdqu 0(%rsi),%xmm0 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm3 -.byte 102,15,56,0,199 - leaq K256(%rip),%rbp -.byte 102,15,56,0,207 - movdqa 0(%rbp),%xmm4 -.byte 102,15,56,0,215 - movdqa 32(%rbp),%xmm5 - paddd %xmm0,%xmm4 - movdqa 64(%rbp),%xmm6 -.byte 102,15,56,0,223 - movdqa 96(%rbp),%xmm7 - paddd %xmm1,%xmm5 - paddd %xmm2,%xmm6 - paddd %xmm3,%xmm7 - movdqa %xmm4,0(%rsp) - movl %eax,%r14d - movdqa %xmm5,16(%rsp) - movl %ebx,%edi - movdqa %xmm6,32(%rsp) - xorl %ecx,%edi - movdqa %xmm7,48(%rsp) - movl %r8d,%r13d - jmp .Lssse3_00_47 +.Lloop_xop: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp .Lxop_00_47 .align 16 -.Lssse3_00_47: - subq $-32*4,%rbp - rorl $14,%r13d - movl %r14d,%eax - movdqa %xmm1,%xmm4 - movl %r9d,%r12d - movdqa %xmm3,%xmm7 - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r12d -.byte 102,15,58,15,224,4 - rorl $5,%r13d - xorl %eax,%r14d -.byte 102,15,58,15,250,4 - andl %r8d,%r12d - xorl %r8d,%r13d - addl 
0(%rsp),%r11d - movl %eax,%r15d - rorl $11,%r14d - xorl %r10d,%r12d - movdqa %xmm4,%xmm5 - xorl %ebx,%r15d - movdqa %xmm4,%xmm6 - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - paddd %xmm7,%xmm0 - addl %r11d,%edx - rorl $2,%r14d - addl %edi,%r11d - psrld $7,%xmm6 - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - pshufd $250,%xmm3,%xmm7 - movl %r8d,%r12d - pslld $14,%xmm5 - xorl %edx,%r13d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %r9d,%r12d - psrld $11,%xmm6 - rorl $5,%r13d - xorl %r11d,%r14d - pxor %xmm5,%xmm4 - andl %edx,%r12d - xorl %edx,%r13d - pslld $11,%xmm5 - addl 4(%rsp),%r10d - pxor %xmm6,%xmm4 - movl %r11d,%edi - rorl $11,%r14d - xorl %r9d,%r12d - movdqa %xmm7,%xmm6 - xorl %eax,%edi - rorl $6,%r13d - addl %r12d,%r10d - pxor %xmm5,%xmm4 - andl %edi,%r15d - xorl %r11d,%r14d - psrld $10,%xmm7 - addl %r13d,%r10d - xorl %eax,%r15d - paddd %xmm4,%xmm0 - addl %r10d,%ecx - rorl $2,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - psrlq $17,%xmm6 - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - pxor %xmm6,%xmm7 - xorl %ecx,%r13d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - pxor %xmm6,%xmm7 - andl %ecx,%r12d - xorl %ecx,%r13d - addl 8(%rsp),%r9d -.byte 102,65,15,56,0,248 - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - rorl $6,%r13d - paddd %xmm7,%xmm0 - addl %r12d,%r9d - pshufd $80,%xmm0,%xmm7 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - movdqa %xmm7,%xmm6 - addl %r9d,%ebx - rorl $2,%r14d - addl %edi,%r9d - psrld $10,%xmm7 - movl %ebx,%r13d - psrlq $17,%xmm6 - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - pxor %xmm6,%xmm7 - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - psrlq $2,%xmm6 - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - pxor %xmm6,%xmm7 - addl 12(%rsp),%r8d - movl %r9d,%edi - movdqa 0(%rbp),%xmm6 - rorl $11,%r14d - xorl %edx,%r12d -.byte 102,65,15,56,0,249 - xorl %r10d,%edi - rorl $6,%r13d - addl %r12d,%r8d - andl %edi,%r15d - xorl %r9d,%r14d - paddd %xmm7,%xmm0 - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - paddd %xmm0,%xmm6 - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - movdqa %xmm6,0(%rsp) - rorl $14,%r13d - movl %r14d,%r8d - movdqa %xmm2,%xmm4 - movl %ebx,%r12d - movdqa %xmm0,%xmm7 - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r12d -.byte 102,15,58,15,225,4 - rorl $5,%r13d - xorl %r8d,%r14d -.byte 102,15,58,15,251,4 - andl %eax,%r12d - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - rorl $11,%r14d - xorl %ecx,%r12d - movdqa %xmm4,%xmm5 - xorl %r9d,%r15d - movdqa %xmm4,%xmm6 - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%edi - psrld $3,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - paddd %xmm7,%xmm1 - addl %edx,%r11d - rorl $2,%r14d - addl %edi,%edx - psrld $7,%xmm6 - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - pshufd $250,%xmm0,%xmm7 - movl %eax,%r12d - pslld $14,%xmm5 - xorl %r11d,%r13d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %ebx,%r12d - psrld $11,%xmm6 - rorl $5,%r13d - xorl %edx,%r14d - pxor %xmm5,%xmm4 - andl %r11d,%r12d - xorl %r11d,%r13d - pslld $11,%xmm5 - addl 20(%rsp),%ecx - pxor %xmm6,%xmm4 - movl %edx,%edi - rorl $11,%r14d - xorl %ebx,%r12d - movdqa %xmm7,%xmm6 - xorl %r8d,%edi - rorl $6,%r13d - addl %r12d,%ecx - pxor %xmm5,%xmm4 - andl %edi,%r15d - xorl %edx,%r14d - psrld $10,%xmm7 - addl %r13d,%ecx - 
xorl %r8d,%r15d - paddd %xmm4,%xmm1 - addl %ecx,%r10d - rorl $2,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - psrlq $17,%xmm6 - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - pxor %xmm6,%xmm7 - xorl %r10d,%r13d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - pxor %xmm6,%xmm7 - andl %r10d,%r12d - xorl %r10d,%r13d - addl 24(%rsp),%ebx -.byte 102,65,15,56,0,248 - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - rorl $6,%r13d - paddd %xmm7,%xmm1 - addl %r12d,%ebx - pshufd $80,%xmm1,%xmm7 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - movdqa %xmm7,%xmm6 - addl %ebx,%r9d - rorl $2,%r14d - addl %edi,%ebx - psrld $10,%xmm7 - movl %r9d,%r13d - psrlq $17,%xmm6 - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - pxor %xmm6,%xmm7 - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - psrlq $2,%xmm6 - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - pxor %xmm6,%xmm7 - addl 28(%rsp),%eax - movl %ebx,%edi - movdqa 32(%rbp),%xmm6 - rorl $11,%r14d - xorl %r11d,%r12d -.byte 102,65,15,56,0,249 - xorl %ecx,%edi - rorl $6,%r13d - addl %r12d,%eax - andl %edi,%r15d - xorl %ebx,%r14d - paddd %xmm7,%xmm1 - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - paddd %xmm1,%xmm6 - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movdqa %xmm6,16(%rsp) - rorl $14,%r13d - movl %r14d,%eax - movdqa %xmm3,%xmm4 - movl %r9d,%r12d - movdqa %xmm1,%xmm7 - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r12d -.byte 102,15,58,15,226,4 - rorl $5,%r13d - xorl %eax,%r14d -.byte 102,15,58,15,248,4 - andl %r8d,%r12d - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - rorl $11,%r14d - xorl %r10d,%r12d - movdqa %xmm4,%xmm5 - xorl %ebx,%r15d - movdqa %xmm4,%xmm6 - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - paddd %xmm7,%xmm2 - addl %r11d,%edx - rorl $2,%r14d - addl %edi,%r11d - psrld $7,%xmm6 - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - pshufd $250,%xmm1,%xmm7 - movl %r8d,%r12d - pslld $14,%xmm5 - xorl %edx,%r13d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %r9d,%r12d - psrld $11,%xmm6 - rorl $5,%r13d - xorl %r11d,%r14d - pxor %xmm5,%xmm4 - andl %edx,%r12d - xorl %edx,%r13d - pslld $11,%xmm5 - addl 36(%rsp),%r10d - pxor %xmm6,%xmm4 - movl %r11d,%edi - rorl $11,%r14d - xorl %r9d,%r12d - movdqa %xmm7,%xmm6 - xorl %eax,%edi - rorl $6,%r13d - addl %r12d,%r10d - pxor %xmm5,%xmm4 - andl %edi,%r15d - xorl %r11d,%r14d - psrld $10,%xmm7 - addl %r13d,%r10d - xorl %eax,%r15d - paddd %xmm4,%xmm2 - addl %r10d,%ecx - rorl $2,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - psrlq $17,%xmm6 - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - pxor %xmm6,%xmm7 - xorl %ecx,%r13d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - pxor %xmm6,%xmm7 - andl %ecx,%r12d - xorl %ecx,%r13d - addl 40(%rsp),%r9d -.byte 102,65,15,56,0,248 - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - rorl $6,%r13d - paddd %xmm7,%xmm2 - addl %r12d,%r9d - pshufd $80,%xmm2,%xmm7 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - movdqa %xmm7,%xmm6 - addl %r9d,%ebx - rorl $2,%r14d - addl %edi,%r9d - psrld $10,%xmm7 - movl %ebx,%r13d - psrlq $17,%xmm6 - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - pxor %xmm6,%xmm7 - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - psrlq 
$2,%xmm6 - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - pxor %xmm6,%xmm7 - addl 44(%rsp),%r8d - movl %r9d,%edi - movdqa 64(%rbp),%xmm6 - rorl $11,%r14d - xorl %edx,%r12d -.byte 102,65,15,56,0,249 - xorl %r10d,%edi - rorl $6,%r13d - addl %r12d,%r8d - andl %edi,%r15d - xorl %r9d,%r14d - paddd %xmm7,%xmm2 - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - paddd %xmm2,%xmm6 - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - movdqa %xmm6,32(%rsp) - rorl $14,%r13d - movl %r14d,%r8d - movdqa %xmm0,%xmm4 - movl %ebx,%r12d - movdqa %xmm2,%xmm7 - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r12d -.byte 102,15,58,15,227,4 - rorl $5,%r13d - xorl %r8d,%r14d -.byte 102,15,58,15,249,4 - andl %eax,%r12d - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - rorl $11,%r14d - xorl %ecx,%r12d - movdqa %xmm4,%xmm5 - xorl %r9d,%r15d - movdqa %xmm4,%xmm6 - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%edi - psrld $3,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - paddd %xmm7,%xmm3 - addl %edx,%r11d - rorl $2,%r14d - addl %edi,%edx - psrld $7,%xmm6 - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - pshufd $250,%xmm2,%xmm7 - movl %eax,%r12d - pslld $14,%xmm5 - xorl %r11d,%r13d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %ebx,%r12d - psrld $11,%xmm6 - rorl $5,%r13d - xorl %edx,%r14d - pxor %xmm5,%xmm4 - andl %r11d,%r12d - xorl %r11d,%r13d - pslld $11,%xmm5 - addl 52(%rsp),%ecx - pxor %xmm6,%xmm4 - movl %edx,%edi - rorl $11,%r14d - xorl %ebx,%r12d - movdqa %xmm7,%xmm6 - xorl %r8d,%edi - rorl $6,%r13d - addl %r12d,%ecx - pxor %xmm5,%xmm4 - andl %edi,%r15d - xorl %edx,%r14d - psrld $10,%xmm7 - addl %r13d,%ecx - xorl %r8d,%r15d - paddd %xmm4,%xmm3 - addl %ecx,%r10d - rorl $2,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - psrlq $17,%xmm6 - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - pxor %xmm6,%xmm7 - xorl %r10d,%r13d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - pxor %xmm6,%xmm7 - andl %r10d,%r12d - xorl %r10d,%r13d - addl 56(%rsp),%ebx -.byte 102,65,15,56,0,248 - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - rorl $6,%r13d - paddd %xmm7,%xmm3 - addl %r12d,%ebx - pshufd $80,%xmm3,%xmm7 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - movdqa %xmm7,%xmm6 - addl %ebx,%r9d - rorl $2,%r14d - addl %edi,%ebx - psrld $10,%xmm7 - movl %r9d,%r13d - psrlq $17,%xmm6 - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - pxor %xmm6,%xmm7 - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - psrlq $2,%xmm6 - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - pxor %xmm6,%xmm7 - addl 60(%rsp),%eax - movl %ebx,%edi - movdqa 96(%rbp),%xmm6 - rorl $11,%r14d - xorl %r11d,%r12d -.byte 102,65,15,56,0,249 - xorl %ecx,%edi - rorl $6,%r13d - addl %r12d,%eax - andl %edi,%r15d - xorl %ebx,%r14d - paddd %xmm7,%xmm3 - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - paddd %xmm3,%xmm6 - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movdqa %xmm6,48(%rsp) - cmpb $0,131(%rbp) - jne .Lssse3_00_47 - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - rorl $11,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - addl %r11d,%edx - 
rorl $2,%r14d - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%r12d - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 4(%rsp),%r10d - movl %r11d,%edi - rorl $11,%r14d - xorl %r9d,%r12d - xorl %eax,%edi - rorl $6,%r13d - addl %r12d,%r10d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - rorl $2,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - rorl $6,%r13d - addl %r12d,%r9d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - addl %r9d,%ebx - rorl $2,%r14d - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%edi - rorl $11,%r14d - xorl %edx,%r12d - xorl %r10d,%edi - rorl $6,%r13d - addl %r12d,%r8d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - rorl $11,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - addl %edx,%r11d - rorl $2,%r14d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 20(%rsp),%ecx - movl %edx,%edi - rorl $11,%r14d - xorl %ebx,%r12d - xorl %r8d,%edi - rorl $6,%r13d - addl %r12d,%ecx - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - rorl $2,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - rorl $6,%r13d - addl %r12d,%ebx - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - addl %ebx,%r9d - rorl $2,%r14d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%edi - rorl $11,%r14d - xorl %r11d,%r12d - xorl %ecx,%edi - rorl $6,%r13d - addl %r12d,%eax - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - rorl $11,%r14d - xorl 
%r10d,%r12d - xorl %ebx,%r15d - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - addl %r11d,%edx - rorl $2,%r14d - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%r12d - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 36(%rsp),%r10d - movl %r11d,%edi - rorl $11,%r14d - xorl %r9d,%r12d - xorl %eax,%edi - rorl $6,%r13d - addl %r12d,%r10d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - rorl $2,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - rorl $6,%r13d - addl %r12d,%r9d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - addl %r9d,%ebx - rorl $2,%r14d - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%edi - rorl $11,%r14d - xorl %edx,%r12d - xorl %r10d,%edi - rorl $6,%r13d - addl %r12d,%r8d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - rorl $11,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - addl %edx,%r11d - rorl $2,%r14d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 52(%rsp),%ecx - movl %edx,%edi - rorl $11,%r14d - xorl %ebx,%r12d - xorl %r8d,%edi - rorl $6,%r13d - addl %r12d,%ecx - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - rorl $2,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - rorl $6,%r13d - addl %r12d,%ebx - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - addl %ebx,%r9d - rorl $2,%r14d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%edi - rorl $11,%r14d - xorl %r11d,%r12d - xorl %ecx,%edi - rorl $6,%r13d - addl %r12d,%eax - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movq 64+0(%rsp),%rdi - movl %r14d,%eax - - addl 0(%rdi),%eax - leaq 64(%rsi),%rsi - addl 
4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - cmpq 64+16(%rsp),%rsi - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - jb .Lloop_ssse3 - - movq 64+24(%rsp),%rsi - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp -.Lepilogue_ssse3: +.Lxop_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + rorq $23,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r8,%r13 + xorq %r10,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rax,%r14 + vpaddq %xmm11,%xmm0,%xmm0 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 +.byte 143,72,120,195,209,7 + xorq %r10,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,223,3 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm7,%xmm10 + addq %r11,%rdx + addq %rdi,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %rdx,%r13 + addq %r11,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r11 + vpxor %xmm10,%xmm11,%xmm11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + vpaddq %xmm11,%xmm0,%xmm0 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + rorq $23,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rcx,%r13 + xorq %r8,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r10,%r14 + vpaddq %xmm11,%xmm1,%xmm1 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 +.byte 143,72,120,195,209,7 + xorq %r8,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,216,3 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm0,%xmm10 + addq %r9,%rbx + addq %rdi,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rbx,%r13 + addq %r9,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r9 + vpxor %xmm10,%xmm11,%xmm11 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + vpaddq %xmm11,%xmm1,%xmm1 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + rorq $23,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rax,%r13 + xorq %rcx,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r8,%r14 + vpaddq %xmm11,%xmm2,%xmm2 + andq %rax,%r12 + xorq 
%rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 +.byte 143,72,120,195,209,7 + xorq %rcx,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,217,3 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm1,%xmm10 + addq %rdx,%r11 + addq %rdi,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %r11,%r13 + addq %rdx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rdx + vpxor %xmm10,%xmm11,%xmm11 + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + vpaddq %xmm11,%xmm2,%xmm2 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + rorq $23,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r10,%r13 + xorq %rax,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rcx,%r14 + vpaddq %xmm11,%xmm3,%xmm3 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 +.byte 143,72,120,195,209,7 + xorq %rax,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,218,3 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm2,%xmm10 + addq %rbx,%r9 + addq %rdi,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r9,%r13 + addq %rbx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rbx + vpxor %xmm10,%xmm11,%xmm11 + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + vpaddq %xmm11,%xmm3,%xmm3 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + rorq $23,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r8,%r13 + xorq %r10,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rax,%r14 + vpaddq %xmm11,%xmm4,%xmm4 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 +.byte 143,72,120,195,209,7 + xorq %r10,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,219,3 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm3,%xmm10 + addq %r11,%rdx + addq %rdi,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %rdx,%r13 + addq %r11,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r11 + vpxor %xmm10,%xmm11,%xmm11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + vpaddq %xmm11,%xmm4,%xmm4 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%rdi + addq %r12,%r10 + rorq 
$14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + rorq $23,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rcx,%r13 + xorq %r8,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r10,%r14 + vpaddq %xmm11,%xmm5,%xmm5 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 +.byte 143,72,120,195,209,7 + xorq %r8,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,220,3 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm4,%xmm10 + addq %r9,%rbx + addq %rdi,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rbx,%r13 + addq %r9,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r9 + vpxor %xmm10,%xmm11,%xmm11 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + vpaddq %xmm11,%xmm5,%xmm5 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + rorq $23,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rax,%r13 + xorq %rcx,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r8,%r14 + vpaddq %xmm11,%xmm6,%xmm6 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 +.byte 143,72,120,195,209,7 + xorq %rcx,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,221,3 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm5,%xmm10 + addq %rdx,%r11 + addq %rdi,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %r11,%r13 + addq %rdx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rdx + vpxor %xmm10,%xmm11,%xmm11 + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + vpaddq %xmm11,%xmm6,%xmm6 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + rorq $23,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r10,%r13 + xorq %rax,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rcx,%r14 + vpaddq %xmm11,%xmm7,%xmm7 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 +.byte 143,72,120,195,209,7 + xorq %rax,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,222,3 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm6,%xmm10 + addq %rbx,%r9 + addq %rdi,%rbx + 
vpaddq %xmm8,%xmm7,%xmm7 + movq %r9,%r13 + addq %rbx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rbx + vpxor %xmm10,%xmm11,%xmm11 + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + vpaddq %xmm11,%xmm7,%xmm7 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne .Lxop_00_47 + rorq $23,%r13 + movq %r14,%rax + movq %r9,%r12 + rorq $5,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + rorq $4,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + rorq $6,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + rorq $28,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + rorq $23,%r13 + movq %r14,%r11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + rorq $23,%r13 + movq %r14,%r10 + movq %rdx,%r12 + rorq $5,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + rorq $4,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + rorq $6,%r14 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + rorq $28,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + rorq $23,%r13 + movq %r14,%r9 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + rorq $23,%r13 + movq %r14,%r8 + movq %rbx,%r12 + rorq $5,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + rorq $4,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + rorq $6,%r14 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + rorq $28,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + rorq $23,%r13 + movq %r14,%rdx + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + rorq $23,%r13 + movq %r14,%rcx + movq %r11,%r12 + rorq $5,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + rorq $4,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + rorq $6,%r14 + xorq %rdx,%r15 + addq 
%r12,%rbx + rorq $14,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + rorq $28,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + rorq $23,%r13 + movq %r14,%rbx + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + rorq $23,%r13 + movq %r14,%rax + movq %r9,%r12 + rorq $5,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + rorq $4,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + rorq $6,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + rorq $28,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + rorq $23,%r13 + movq %r14,%r11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + rorq $23,%r13 + movq %r14,%r10 + movq %rdx,%r12 + rorq $5,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + rorq $4,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + rorq $6,%r14 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + rorq $28,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + rorq $23,%r13 + movq %r14,%r9 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + rorq $23,%r13 + movq %r14,%r8 + movq %rbx,%r12 + rorq $5,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + rorq $4,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + rorq $6,%r14 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + rorq $28,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + rorq $23,%r13 + movq %r14,%rdx + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + rorq $23,%r13 + movq %r14,%rcx + movq %r11,%r12 + rorq $5,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + rorq $4,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + rorq $6,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + 
rorq $28,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + rorq $23,%r13 + movq %r14,%rbx + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb .Lloop_xop + + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_xop: .byte 0xf3,0xc3 -.size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3 - +.cfi_endproc +.size sha512_block_data_order_xop,.-sha512_block_data_order_xop +.type sha512_block_data_order_avx,@function +.align 64 +sha512_block_data_order_avx: +.cfi_startproc +.Lavx_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 +.Lprologue_avx: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp .Lavx_00_47 -.section .note.GNU-stack,"",%progbits +.align 16 +.Lavx_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + 
shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm0,%xmm0 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 0(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm7,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm7,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm7,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 8(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm0,%xmm0 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm1,%xmm1 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 16(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm0,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm0,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm0,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 24(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm1,%xmm1 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm2,%xmm2 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 32(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + 
vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm1,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm1,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm1,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 40(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm2,%xmm2 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm3,%xmm3 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 48(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm2,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm2,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm2,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 56(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm3,%xmm3 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm4,%xmm4 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 64(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm3,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor 
%xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm3,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm3,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 72(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm4,%xmm4 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm5,%xmm5 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 80(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm4,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm4,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm4,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 88(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm5,%xmm5 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm6,%xmm6 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 96(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm5,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm5,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm5,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 
+ andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 104(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm6,%xmm6 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm7,%xmm7 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 112(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm6,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm6,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm6,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 120(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm7,%xmm7 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne .Lavx_00_47 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq 
%rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq 
$6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb .Lloop_avx + + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha512_block_data_order_avx,.-sha512_block_data_order_avx +.type sha512_block_data_order_avx2,@function +.align 64 +sha512_block_data_order_avx2: 
+.cfi_startproc +.Lavx2_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + subq $1312,%rsp + shlq $4,%rdx + andq $-2048,%rsp + leaq (%rsi,%rdx,8),%rdx + addq $1152,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 +.Lprologue_avx2: + + vzeroupper + subq $-128,%rsi + movq 0(%rdi),%rax + movq %rsi,%r12 + movq 8(%rdi),%rbx + cmpq %rdx,%rsi + movq 16(%rdi),%rcx + cmoveq %rsp,%r12 + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Loop_avx2 +.align 16 +.Loop_avx2: + vmovdqu -128(%rsi),%xmm0 + vmovdqu -128+16(%rsi),%xmm1 + vmovdqu -128+32(%rsi),%xmm2 + leaq K512+128(%rip),%rbp + vmovdqu -128+48(%rsi),%xmm3 + vmovdqu -128+64(%rsi),%xmm4 + vmovdqu -128+80(%rsi),%xmm5 + vmovdqu -128+96(%rsi),%xmm6 + vmovdqu -128+112(%rsi),%xmm7 + + vmovdqa 1152(%rbp),%ymm10 + vinserti128 $1,(%r12),%ymm0,%ymm0 + vinserti128 $1,16(%r12),%ymm1,%ymm1 + vpshufb %ymm10,%ymm0,%ymm0 + vinserti128 $1,32(%r12),%ymm2,%ymm2 + vpshufb %ymm10,%ymm1,%ymm1 + vinserti128 $1,48(%r12),%ymm3,%ymm3 + vpshufb %ymm10,%ymm2,%ymm2 + vinserti128 $1,64(%r12),%ymm4,%ymm4 + vpshufb %ymm10,%ymm3,%ymm3 + vinserti128 $1,80(%r12),%ymm5,%ymm5 + vpshufb %ymm10,%ymm4,%ymm4 + vinserti128 $1,96(%r12),%ymm6,%ymm6 + vpshufb %ymm10,%ymm5,%ymm5 + vinserti128 $1,112(%r12),%ymm7,%ymm7 + + vpaddq -128(%rbp),%ymm0,%ymm8 + vpshufb %ymm10,%ymm6,%ymm6 + vpaddq -96(%rbp),%ymm1,%ymm9 + vpshufb %ymm10,%ymm7,%ymm7 + vpaddq -64(%rbp),%ymm2,%ymm10 + vpaddq -32(%rbp),%ymm3,%ymm11 + vmovdqa %ymm8,0(%rsp) + vpaddq 0(%rbp),%ymm4,%ymm8 + vmovdqa %ymm9,32(%rsp) + vpaddq 32(%rbp),%ymm5,%ymm9 + vmovdqa %ymm10,64(%rsp) + vpaddq 64(%rbp),%ymm6,%ymm10 + vmovdqa %ymm11,96(%rsp) + leaq -128(%rsp),%rsp + vpaddq 96(%rbp),%ymm7,%ymm11 + vmovdqa %ymm8,0(%rsp) + xorq %r14,%r14 + vmovdqa %ymm9,32(%rsp) + movq %rbx,%rdi + vmovdqa %ymm10,64(%rsp) + xorq %rcx,%rdi + vmovdqa %ymm11,96(%rsp) + movq %r9,%r12 + addq $32*8,%rbp + jmp .Lavx2_00_47 +.align 16 +.Lavx2_00_47: + leaq -128(%rsp),%rsp + vpalignr $8,%ymm0,%ymm1,%ymm8 + addq 0+256(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + vpalignr $8,%ymm4,%ymm5,%ymm11 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + vpsrlq $1,%ymm8,%ymm10 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + vpaddq %ymm11,%ymm0,%ymm0 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + vpsrlq $6,%ymm7,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + vpsllq $3,%ymm7,%ymm10 + vpaddq %ymm8,%ymm0,%ymm0 + addq 8+256(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + vpsrlq $19,%ymm7,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor 
%ymm10,%ymm11,%ymm11 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + vpaddq %ymm11,%ymm0,%ymm0 + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + vpaddq -128(%rbp),%ymm0,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + vmovdqa %ymm10,0(%rsp) + vpalignr $8,%ymm1,%ymm2,%ymm8 + addq 32+256(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + vpalignr $8,%ymm5,%ymm6,%ymm11 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + vpsrlq $1,%ymm8,%ymm10 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + vpaddq %ymm11,%ymm1,%ymm1 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + vpsrlq $6,%ymm0,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + vpsllq $3,%ymm0,%ymm10 + vpaddq %ymm8,%ymm1,%ymm1 + addq 40+256(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + vpsrlq $19,%ymm0,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + vpaddq %ymm11,%ymm1,%ymm1 + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + vpaddq -96(%rbp),%ymm1,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + vmovdqa %ymm10,32(%rsp) + vpalignr $8,%ymm2,%ymm3,%ymm8 + addq 64+256(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + vpalignr $8,%ymm6,%ymm7,%ymm11 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + vpsrlq $1,%ymm8,%ymm10 + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + vpaddq %ymm11,%ymm2,%ymm2 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + vpsrlq $6,%ymm1,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + vpsllq $3,%ymm1,%ymm10 + vpaddq %ymm8,%ymm2,%ymm2 + addq 72+256(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + vpsrlq $19,%ymm1,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + vpaddq %ymm11,%ymm2,%ymm2 + 
rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + vpaddq -64(%rbp),%ymm2,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + vmovdqa %ymm10,64(%rsp) + vpalignr $8,%ymm3,%ymm4,%ymm8 + addq 96+256(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + vpalignr $8,%ymm7,%ymm0,%ymm11 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + vpsrlq $1,%ymm8,%ymm10 + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + vpaddq %ymm11,%ymm3,%ymm3 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + vpsrlq $6,%ymm2,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + vpsllq $3,%ymm2,%ymm10 + vpaddq %ymm8,%ymm3,%ymm3 + addq 104+256(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + vpsrlq $19,%ymm2,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + vpaddq %ymm11,%ymm3,%ymm3 + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + vpaddq -32(%rbp),%ymm3,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + vmovdqa %ymm10,96(%rsp) + leaq -128(%rsp),%rsp + vpalignr $8,%ymm4,%ymm5,%ymm8 + addq 0+256(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + vpalignr $8,%ymm0,%ymm1,%ymm11 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + vpsrlq $1,%ymm8,%ymm10 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + vpaddq %ymm11,%ymm4,%ymm4 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + vpsrlq $6,%ymm3,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + vpsllq $3,%ymm3,%ymm10 + vpaddq %ymm8,%ymm4,%ymm4 + addq 8+256(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + vpsrlq $19,%ymm3,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + vpaddq %ymm11,%ymm4,%ymm4 + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + vpaddq 0(%rbp),%ymm4,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + 
leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + vmovdqa %ymm10,0(%rsp) + vpalignr $8,%ymm5,%ymm6,%ymm8 + addq 32+256(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + vpalignr $8,%ymm1,%ymm2,%ymm11 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + vpsrlq $1,%ymm8,%ymm10 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + vpaddq %ymm11,%ymm5,%ymm5 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + vpsrlq $6,%ymm4,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + vpsllq $3,%ymm4,%ymm10 + vpaddq %ymm8,%ymm5,%ymm5 + addq 40+256(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + vpsrlq $19,%ymm4,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + vpaddq %ymm11,%ymm5,%ymm5 + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + vpaddq 32(%rbp),%ymm5,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + vmovdqa %ymm10,32(%rsp) + vpalignr $8,%ymm6,%ymm7,%ymm8 + addq 64+256(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + vpalignr $8,%ymm2,%ymm3,%ymm11 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + vpsrlq $1,%ymm8,%ymm10 + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + vpaddq %ymm11,%ymm6,%ymm6 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + vpsrlq $6,%ymm5,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + vpsllq $3,%ymm5,%ymm10 + vpaddq %ymm8,%ymm6,%ymm6 + addq 72+256(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + vpsrlq $19,%ymm5,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + vpaddq %ymm11,%ymm6,%ymm6 + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + vpaddq 64(%rbp),%ymm6,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + vmovdqa %ymm10,64(%rsp) + vpalignr $8,%ymm7,%ymm0,%ymm8 + addq 96+256(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + vpalignr $8,%ymm3,%ymm4,%ymm11 + rorxq 
$18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + vpsrlq $1,%ymm8,%ymm10 + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + vpaddq %ymm11,%ymm7,%ymm7 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + vpsrlq $6,%ymm6,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + vpsllq $3,%ymm6,%ymm10 + vpaddq %ymm8,%ymm7,%ymm7 + addq 104+256(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + vpsrlq $19,%ymm6,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + vpaddq %ymm11,%ymm7,%ymm7 + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + vpaddq 96(%rbp),%ymm7,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + vmovdqa %ymm10,96(%rsp) + leaq 256(%rbp),%rbp + cmpb $0,-121(%rbp) + jne .Lavx2_00_47 + addq 0+128(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + addq 8+128(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + addq 32+128(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + addq 40+128(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + 
leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + addq 64+128(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + addq 72+128(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + addq 96+128(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + addq 104+128(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + addq 0(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + addq 8(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + addq 32(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + addq 40(%rsp),%r8 + 
andq %rbx,%r12 + rorxq $41,%rbx,%r13 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + addq 64(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + addq 72(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + addq 96(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + addq 104(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + movq 1280(%rsp),%rdi + addq %r14,%rax + + leaq 1152(%rsp),%rbp + + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + + cmpq 144(%rbp),%rsi + je .Ldone_avx2 + + xorq %r14,%r14 + movq %rbx,%rdi + xorq %rcx,%rdi + movq %r9,%r12 + jmp .Lower_avx2 +.align 16 +.Lower_avx2: + addq 0+16(%rbp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + addq 8+16(%rbp),%r10 + andq %rdx,%r12 + rorxq 
$41,%rdx,%r13 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + addq 32+16(%rbp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + addq 40+16(%rbp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + addq 64+16(%rbp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + addq 72+16(%rbp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + addq 96+16(%rbp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + addq 104+16(%rbp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + leaq -128(%rbp),%rbp + cmpq %rsp,%rbp + jae .Lower_avx2 + + movq 1280(%rsp),%rdi + addq %r14,%rax + + leaq 
1152(%rsp),%rsp + + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + leaq 256(%rsi),%rsi + addq 48(%rdi),%r10 + movq %rsi,%r12 + addq 56(%rdi),%r11 + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + cmoveq %rsp,%r12 + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + + jbe .Loop_avx2 + leaq (%rsp),%rbp + +.Ldone_avx2: + leaq (%rbp),%rsp + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx2: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha512_block_data_order_avx2,.-sha512_block_data_order_avx2 +.section .note.GNU-stack,"",%progbits diff --git a/lib/accelerated/x86/files.mk b/lib/accelerated/x86/files.mk index a134213922..cf688b3883 100644 --- a/lib/accelerated/x86/files.mk +++ b/lib/accelerated/x86/files.mk @@ -1,9 +1,9 @@ X86_FILES_ELF=elf/aesni-x86.s elf/cpuid-x86.s elf/sha1-ssse3-x86.s elf/sha256-ssse3-x86.s elf/sha512-ssse3-x86.s elf/aes-ssse3-x86.s X86_FILES_COFF=coff/aesni-x86.s coff/cpuid-x86.s coff/sha1-ssse3-x86.s coff/sha256-ssse3-x86.s coff/sha512-ssse3-x86.s coff/aes-ssse3-x86.s X86_FILES_MACOSX=macosx/aesni-x86.s macosx/cpuid-x86.s macosx/sha1-ssse3-x86.s macosx/sha256-ssse3-x86.s macosx/sha512-ssse3-x86.s macosx/aes-ssse3-x86.s -X86_64_FILES_ELF=elf/aesni-x86_64.s elf/cpuid-x86_64.s elf/ghash-x86_64.s elf/sha1-ssse3-x86_64.s elf/sha512-ssse3-x86_64.s elf/aes-ssse3-x86_64.s elf/aesni-gcm-x86_64.s -X86_64_FILES_COFF=coff/aesni-x86_64.s coff/cpuid-x86_64.s coff/ghash-x86_64.s coff/sha1-ssse3-x86_64.s coff/sha512-ssse3-x86_64.s coff/aes-ssse3-x86_64.s coff/aesni-gcm-x86_64.s -X86_64_FILES_MACOSX=macosx/aesni-x86_64.s macosx/cpuid-x86_64.s macosx/ghash-x86_64.s macosx/sha1-ssse3-x86_64.s macosx/sha512-ssse3-x86_64.s macosx/aes-ssse3-x86_64.s macosx/aesni-gcm-x86_64.s +X86_64_FILES_ELF=elf/aesni-x86_64.s elf/cpuid-x86_64.s elf/ghash-x86_64.s elf/sha1-ssse3-x86_64.s elf/sha512-ssse3-x86_64.s elf/aes-ssse3-x86_64.s elf/aesni-gcm-x86_64.s elf/sha256-ssse3-x86_64.s +X86_64_FILES_COFF=coff/aesni-x86_64.s coff/cpuid-x86_64.s coff/ghash-x86_64.s coff/sha1-ssse3-x86_64.s coff/sha512-ssse3-x86_64.s coff/aes-ssse3-x86_64.s coff/aesni-gcm-x86_64.s coff/sha256-ssse3-x86_64.s +X86_64_FILES_MACOSX=macosx/aesni-x86_64.s macosx/cpuid-x86_64.s macosx/ghash-x86_64.s macosx/sha1-ssse3-x86_64.s macosx/sha512-ssse3-x86_64.s macosx/aes-ssse3-x86_64.s macosx/aesni-gcm-x86_64.s macosx/sha256-ssse3-x86_64.s X86_PADLOCK_FILES_ELF=elf/e_padlock-x86.s X86_PADLOCK_FILES_COFF=coff/e_padlock-x86.s X86_PADLOCK_FILES_MACOSX=macosx/e_padlock-x86.s diff --git a/lib/accelerated/x86/macosx/aes-ssse3-x86.s b/lib/accelerated/x86/macosx/aes-ssse3-x86.s index c881a3d888..4be899281b 100644 --- a/lib/accelerated/x86/macosx/aes-ssse3-x86.s +++ b/lib/accelerated/x86/macosx/aes-ssse3-x86.s @@ -5,12 +5,11 @@ ## By Mike Hamburg (Stanford University), 2009 ## Public domain. ## -## For details see https://shiftleft.org/papers/vector_aes/ and -## https://crypto.stanford.edu/vpaes/. +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. 
# # *** This file is auto-generated *** # -.file "vpaes-x86.s" .text .align 6,0x90 L_vpaes_consts: diff --git a/lib/accelerated/x86/macosx/aes-ssse3-x86_64.s b/lib/accelerated/x86/macosx/aes-ssse3-x86_64.s index 414bb483e0..3d5c652266 100644 --- a/lib/accelerated/x86/macosx/aes-ssse3-x86_64.s +++ b/lib/accelerated/x86/macosx/aes-ssse3-x86_64.s @@ -5,8 +5,8 @@ ## By Mike Hamburg (Stanford University), 2009 ## Public domain. ## -## For details see https://shiftleft.org/papers/vector_aes/ and -## https://crypto.stanford.edu/vpaes/. +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. # # *** This file is auto-generated *** # @@ -30,6 +30,7 @@ .p2align 4 _vpaes_encrypt_core: + movq %rdx,%r9 movq $16,%r11 movl 240(%rdx),%eax @@ -118,8 +119,10 @@ L$enc_entry: + .p2align 4 _vpaes_decrypt_core: + movq %rdx,%r9 movl 240(%rdx),%eax movdqa %xmm9,%xmm1 @@ -224,6 +227,7 @@ L$dec_entry: + .p2align 4 _vpaes_schedule_core: @@ -231,6 +235,7 @@ _vpaes_schedule_core: + call _vpaes_preheat movdqa L$k_rcon(%rip),%xmm8 movdqu (%rdi),%xmm0 @@ -409,8 +414,10 @@ L$schedule_mangle_last_dec: + .p2align 4 _vpaes_schedule_192_smear: + pshufd $0x80,%xmm6,%xmm1 pshufd $0xFE,%xmm7,%xmm0 pxor %xmm1,%xmm6 @@ -438,11 +445,13 @@ _vpaes_schedule_192_smear: + .p2align 4 _vpaes_schedule_round: + pxor %xmm1,%xmm1 .byte 102,65,15,58,15,200,15 .byte 102,69,15,58,15,192,15 @@ -507,8 +516,10 @@ _vpaes_schedule_low_round: + .p2align 4 _vpaes_schedule_transform: + movdqa %xmm9,%xmm1 pandn %xmm0,%xmm1 psrld $4,%xmm1 @@ -538,6 +549,7 @@ _vpaes_schedule_transform: + @@ -547,6 +559,7 @@ _vpaes_schedule_transform: .p2align 4 _vpaes_schedule_mangle: + movdqa %xmm0,%xmm4 movdqa L$k_mc_forward(%rip),%xmm5 testq %rcx,%rcx @@ -616,10 +629,12 @@ L$schedule_mangle_both: + .globl _vpaes_set_encrypt_key .p2align 4 _vpaes_set_encrypt_key: + movl %esi,%eax shrl $5,%eax addl $5,%eax @@ -632,10 +647,12 @@ _vpaes_set_encrypt_key: .byte 0xf3,0xc3 + .globl _vpaes_set_decrypt_key .p2align 4 _vpaes_set_decrypt_key: + movl %esi,%eax shrl $5,%eax addl $5,%eax @@ -653,10 +670,12 @@ _vpaes_set_decrypt_key: .byte 0xf3,0xc3 + .globl _vpaes_encrypt .p2align 4 _vpaes_encrypt: + movdqu (%rdi),%xmm0 call _vpaes_preheat call _vpaes_encrypt_core @@ -664,20 +683,24 @@ _vpaes_encrypt: .byte 0xf3,0xc3 + .globl _vpaes_decrypt .p2align 4 _vpaes_decrypt: + movdqu (%rdi),%xmm0 call _vpaes_preheat call _vpaes_decrypt_core movdqu %xmm0,(%rsi) .byte 0xf3,0xc3 + .globl _vpaes_cbc_encrypt .p2align 4 _vpaes_cbc_encrypt: + xchgq %rcx,%rdx subq $16,%rcx jc L$cbc_abort @@ -721,8 +744,10 @@ L$cbc_abort: + .p2align 4 _vpaes_preheat: + leaq L$k_s0F(%rip),%r10 movdqa -32(%r10),%xmm10 movdqa -16(%r10),%xmm11 @@ -739,6 +764,7 @@ _vpaes_preheat: + .p2align 6 _vpaes_consts: L$k_inv: diff --git a/lib/accelerated/x86/macosx/aesni-gcm-x86_64.s b/lib/accelerated/x86/macosx/aesni-gcm-x86_64.s index 002041cee2..d540930b5b 100644 --- a/lib/accelerated/x86/macosx/aesni-gcm-x86_64.s +++ b/lib/accelerated/x86/macosx/aesni-gcm-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -354,17 +354,25 @@ L$6x_done: .p2align 5 _aesni_gcm_decrypt: + xorq %r10,%r10 cmpq $0x60,%rdx jb L$gcm_dec_abort leaq (%rsp),%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + vzeroupper vmovdqu (%r8),%xmm1 @@ -426,17 +434,25 @@ L$dec_no_key_aliasing: vzeroupper movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + L$gcm_dec_abort: movq %r10,%rax .byte 0xf3,0xc3 + .p2align 5 _aesni_ctr32_6x: vmovdqu 0-128(%rcx),%xmm4 @@ -531,17 +547,25 @@ L$handle_ctr32_2: .p2align 5 _aesni_gcm_encrypt: + xorq %r10,%r10 cmpq $288,%rdx jb L$gcm_enc_abort leaq (%rsp),%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + vzeroupper vmovdqu (%r8),%xmm1 @@ -767,16 +791,24 @@ L$enc_no_key_aliasing: vzeroupper movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + L$gcm_enc_abort: movq %r10,%rax .byte 0xf3,0xc3 + .p2align 6 L$bswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 diff --git a/lib/accelerated/x86/macosx/aesni-x86.s b/lib/accelerated/x86/macosx/aesni-x86.s index 09ca1cbc5c..ee50089146 100644 --- a/lib/accelerated/x86/macosx/aesni-x86.s +++ b/lib/accelerated/x86/macosx/aesni-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -37,7 +37,6 @@ # # *** This file is auto-generated *** # -.file "devel/perlasm/aesni-x86.s" .text .globl _aesni_encrypt .align 4 @@ -59,7 +58,10 @@ L000enc1_loop_1: leal 16(%edx),%edx jnz L000enc1_loop_1 .byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%eax) + pxor %xmm2,%xmm2 ret .globl _aesni_decrypt .align 4 @@ -81,30 +83,84 @@ L001dec1_loop_2: leal 16(%edx),%edx jnz L001dec1_loop_2 .byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%eax) + pxor %xmm2,%xmm2 + ret +.align 4 +__aesni_encrypt2: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +L002enc2_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%edx,%ecx,1),%xmm0 + jnz L002enc2_loop +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + ret +.align 4 +__aesni_decrypt2: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +L003dec2_loop: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 + movups -16(%edx,%ecx,1),%xmm0 + jnz L003dec2_loop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,223,208 +.byte 102,15,56,223,216 ret .align 4 __aesni_encrypt3: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 - movups (%edx),%xmm0 -L002enc3_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +L004enc3_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl 
%ecx .byte 102,15,56,220,225 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leal 32(%edx),%edx .byte 102,15,56,220,224 - movups (%edx),%xmm0 - jnz L002enc3_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz L004enc3_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -115,25 +171,26 @@ L002enc3_loop: .align 4 __aesni_decrypt3: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 - movups (%edx),%xmm0 -L003dec3_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +L005dec3_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %ecx .byte 102,15,56,222,225 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leal 32(%edx),%edx .byte 102,15,56,222,224 - movups (%edx),%xmm0 - jnz L003dec3_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz L005dec3_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -145,27 +202,29 @@ L003dec3_loop: __aesni_encrypt4: movups (%edx),%xmm0 movups 16(%edx),%xmm1 - shrl $1,%ecx - leal 32(%edx),%edx + shll $4,%ecx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 pxor %xmm0,%xmm5 - movups (%edx),%xmm0 -L004enc4_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 15,31,64,0 + addl $16,%ecx +L006enc4_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %ecx .byte 102,15,56,220,225 .byte 102,15,56,220,233 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leal 32(%edx),%edx .byte 102,15,56,220,224 .byte 102,15,56,220,232 - movups (%edx),%xmm0 - jnz L004enc4_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz L006enc4_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -179,27 +238,29 @@ L004enc4_loop: __aesni_decrypt4: movups (%edx),%xmm0 movups 16(%edx),%xmm1 - shrl $1,%ecx - leal 32(%edx),%edx + shll $4,%ecx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 pxor %xmm0,%xmm5 - movups (%edx),%xmm0 -L005dec4_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 15,31,64,0 + addl $16,%ecx +L007dec4_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %ecx .byte 102,15,56,222,225 .byte 102,15,56,222,233 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leal 32(%edx),%edx .byte 102,15,56,222,224 .byte 102,15,56,222,232 - movups (%edx),%xmm0 - jnz L005dec4_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz L007dec4_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -212,45 +273,42 @@ L005dec4_loop: .align 4 __aesni_encrypt6: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 -.byte 102,15,56,220,209 pxor %xmm0,%xmm4 -.byte 102,15,56,220,217 +.byte 102,15,56,220,209 pxor %xmm0,%xmm5 - decl %ecx -.byte 102,15,56,220,225 pxor %xmm0,%xmm6 -.byte 102,15,56,220,233 +.byte 102,15,56,220,217 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 102,15,56,220,225 pxor %xmm0,%xmm7 -.byte 102,15,56,220,241 - movups (%edx),%xmm0 -.byte 102,15,56,220,249 - jmp L_aesni_encrypt6_enter + movups (%edx,%ecx,1),%xmm0 + addl $16,%ecx + jmp L008_aesni_encrypt6_inner .align 4,0x90 -L006enc6_loop: +L009enc6_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %ecx .byte 
102,15,56,220,225 +L008_aesni_encrypt6_inner: .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -.align 4,0x90 L_aesni_encrypt6_enter: - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leal 32(%edx),%edx .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups (%edx),%xmm0 - jnz L006enc6_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz L009enc6_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -267,45 +325,42 @@ L_aesni_encrypt6_enter: .align 4 __aesni_decrypt6: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 -.byte 102,15,56,222,209 pxor %xmm0,%xmm4 -.byte 102,15,56,222,217 +.byte 102,15,56,222,209 pxor %xmm0,%xmm5 - decl %ecx -.byte 102,15,56,222,225 pxor %xmm0,%xmm6 -.byte 102,15,56,222,233 +.byte 102,15,56,222,217 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 102,15,56,222,225 pxor %xmm0,%xmm7 -.byte 102,15,56,222,241 - movups (%edx),%xmm0 -.byte 102,15,56,222,249 - jmp L_aesni_decrypt6_enter + movups (%edx,%ecx,1),%xmm0 + addl $16,%ecx + jmp L010_aesni_decrypt6_inner .align 4,0x90 -L007dec6_loop: +L011dec6_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %ecx .byte 102,15,56,222,225 +L010_aesni_decrypt6_inner: .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 -.align 4,0x90 L_aesni_decrypt6_enter: - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leal 32(%edx),%edx .byte 102,15,56,222,224 .byte 102,15,56,222,232 .byte 102,15,56,222,240 .byte 102,15,56,222,248 - movups (%edx),%xmm0 - jnz L007dec6_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz L011dec6_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -333,14 +388,14 @@ L_aesni_ecb_encrypt_begin: movl 32(%esp),%edx movl 36(%esp),%ebx andl $-16,%eax - jz L008ecb_ret + jz L012ecb_ret movl 240(%edx),%ecx testl %ebx,%ebx - jz L009ecb_decrypt + jz L013ecb_decrypt movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb L010ecb_enc_tail + jb L014ecb_enc_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -349,9 +404,9 @@ L_aesni_ecb_encrypt_begin: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp L011ecb_enc_loop6_enter + jmp L015ecb_enc_loop6_enter .align 4,0x90 -L012ecb_enc_loop6: +L016ecb_enc_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -366,12 +421,12 @@ L012ecb_enc_loop6: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -L011ecb_enc_loop6_enter: +L015ecb_enc_loop6_enter: call __aesni_encrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc L012ecb_enc_loop6 + jnc L016ecb_enc_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -380,18 +435,18 @@ L011ecb_enc_loop6_enter: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz L008ecb_ret -L010ecb_enc_tail: + jz L012ecb_ret +L014ecb_enc_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb L013ecb_enc_one + jb L017ecb_enc_one movups 16(%esi),%xmm3 - je L014ecb_enc_two + je L018ecb_enc_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb L015ecb_enc_three + jb L019ecb_enc_three movups 48(%esi),%xmm5 - je L016ecb_enc_four + je L020ecb_enc_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call __aesni_encrypt6 @@ -400,50 +455,49 @@ L010ecb_enc_tail: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp L008ecb_ret + jmp L012ecb_ret .align 4,0x90 
-L013ecb_enc_one: +L017ecb_enc_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L017enc1_loop_3: +L021enc1_loop_3: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L017enc1_loop_3 + jnz L021enc1_loop_3 .byte 102,15,56,221,209 movups %xmm2,(%edi) - jmp L008ecb_ret + jmp L012ecb_ret .align 4,0x90 -L014ecb_enc_two: - xorps %xmm4,%xmm4 - call __aesni_encrypt3 +L018ecb_enc_two: + call __aesni_encrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp L008ecb_ret + jmp L012ecb_ret .align 4,0x90 -L015ecb_enc_three: +L019ecb_enc_three: call __aesni_encrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp L008ecb_ret + jmp L012ecb_ret .align 4,0x90 -L016ecb_enc_four: +L020ecb_enc_four: call __aesni_encrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) - jmp L008ecb_ret + jmp L012ecb_ret .align 4,0x90 -L009ecb_decrypt: +L013ecb_decrypt: movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb L018ecb_dec_tail + jb L022ecb_dec_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -452,9 +506,9 @@ L009ecb_decrypt: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp L019ecb_dec_loop6_enter + jmp L023ecb_dec_loop6_enter .align 4,0x90 -L020ecb_dec_loop6: +L024ecb_dec_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -469,12 +523,12 @@ L020ecb_dec_loop6: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -L019ecb_dec_loop6_enter: +L023ecb_dec_loop6_enter: call __aesni_decrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc L020ecb_dec_loop6 + jnc L024ecb_dec_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -483,18 +537,18 @@ L019ecb_dec_loop6_enter: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz L008ecb_ret -L018ecb_dec_tail: + jz L012ecb_ret +L022ecb_dec_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb L021ecb_dec_one + jb L025ecb_dec_one movups 16(%esi),%xmm3 - je L022ecb_dec_two + je L026ecb_dec_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb L023ecb_dec_three + jb L027ecb_dec_three movups 48(%esi),%xmm5 - je L024ecb_dec_four + je L028ecb_dec_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call __aesni_decrypt6 @@ -503,44 +557,51 @@ L018ecb_dec_tail: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp L008ecb_ret + jmp L012ecb_ret .align 4,0x90 -L021ecb_dec_one: +L025ecb_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L025dec1_loop_4: +L029dec1_loop_4: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L025dec1_loop_4 + jnz L029dec1_loop_4 .byte 102,15,56,223,209 movups %xmm2,(%edi) - jmp L008ecb_ret + jmp L012ecb_ret .align 4,0x90 -L022ecb_dec_two: - xorps %xmm4,%xmm4 - call __aesni_decrypt3 +L026ecb_dec_two: + call __aesni_decrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp L008ecb_ret + jmp L012ecb_ret .align 4,0x90 -L023ecb_dec_three: +L027ecb_dec_three: call __aesni_decrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp L008ecb_ret + jmp L012ecb_ret .align 4,0x90 -L024ecb_dec_four: +L028ecb_dec_four: call __aesni_decrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) -L008ecb_ret: +L012ecb_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -577,48 +638,56 @@ L_aesni_ccm64_encrypt_blocks_begin: movl 
%ebp,20(%esp) movl %ebp,24(%esp) movl %ebp,28(%esp) - shrl $1,%ecx + shll $4,%ecx + movl $16,%ebx leal (%edx),%ebp movdqa (%esp),%xmm5 movdqa %xmm7,%xmm2 - movl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + subl %ecx,%ebx .byte 102,15,56,0,253 -L026ccm64_enc_outer: +L030ccm64_enc_outer: movups (%ebp),%xmm0 movl %ebx,%ecx movups (%esi),%xmm6 xorps %xmm0,%xmm2 movups 16(%ebp),%xmm1 xorps %xmm6,%xmm0 - leal 32(%ebp),%edx xorps %xmm0,%xmm3 - movups (%edx),%xmm0 -L027ccm64_enc2_loop: + movups 32(%ebp),%xmm0 +L031ccm64_enc2_loop: .byte 102,15,56,220,209 - decl %ecx .byte 102,15,56,220,217 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 - leal 32(%edx),%edx .byte 102,15,56,220,216 - movups (%edx),%xmm0 - jnz L027ccm64_enc2_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz L031ccm64_enc2_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 paddq 16(%esp),%xmm7 + decl %eax .byte 102,15,56,221,208 .byte 102,15,56,221,216 - decl %eax leal 16(%esi),%esi xorps %xmm2,%xmm6 movdqa %xmm7,%xmm2 movups %xmm6,(%edi) - leal 16(%edi),%edi .byte 102,15,56,0,213 - jnz L026ccm64_enc_outer + leal 16(%edi),%edi + jnz L030ccm64_enc_outer movl 48(%esp),%esp movl 40(%esp),%edi movups %xmm3,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -664,71 +733,82 @@ L_aesni_ccm64_decrypt_blocks_begin: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L028enc1_loop_5: +L032enc1_loop_5: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L028enc1_loop_5 + jnz L032enc1_loop_5 .byte 102,15,56,221,209 + shll $4,%ebx + movl $16,%ecx movups (%esi),%xmm6 paddq 16(%esp),%xmm7 leal 16(%esi),%esi - jmp L029ccm64_dec_outer + subl %ebx,%ecx + leal 32(%ebp,%ebx,1),%edx + movl %ecx,%ebx + jmp L033ccm64_dec_outer .align 4,0x90 -L029ccm64_dec_outer: +L033ccm64_dec_outer: xorps %xmm2,%xmm6 movdqa %xmm7,%xmm2 - movl %ebx,%ecx movups %xmm6,(%edi) leal 16(%edi),%edi .byte 102,15,56,0,213 subl $1,%eax - jz L030ccm64_dec_break + jz L034ccm64_dec_break movups (%ebp),%xmm0 - shrl $1,%ecx + movl %ebx,%ecx movups 16(%ebp),%xmm1 xorps %xmm0,%xmm6 - leal 32(%ebp),%edx xorps %xmm0,%xmm2 xorps %xmm6,%xmm3 - movups (%edx),%xmm0 -L031ccm64_dec2_loop: + movups 32(%ebp),%xmm0 +L035ccm64_dec2_loop: .byte 102,15,56,220,209 - decl %ecx .byte 102,15,56,220,217 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 - leal 32(%edx),%edx .byte 102,15,56,220,216 - movups (%edx),%xmm0 - jnz L031ccm64_dec2_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz L035ccm64_dec2_loop movups (%esi),%xmm6 paddq 16(%esp),%xmm7 .byte 102,15,56,220,209 .byte 102,15,56,220,217 - leal 16(%esi),%esi .byte 102,15,56,221,208 .byte 102,15,56,221,216 - jmp L029ccm64_dec_outer + leal 16(%esi),%esi + jmp L033ccm64_dec_outer .align 4,0x90 -L030ccm64_dec_break: +L034ccm64_dec_break: + movl 240(%ebp),%ecx movl %ebp,%edx movups (%edx),%xmm0 movups 16(%edx),%xmm1 xorps %xmm0,%xmm6 leal 32(%edx),%edx xorps %xmm6,%xmm3 -L032enc1_loop_6: +L036enc1_loop_6: .byte 102,15,56,220,217 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L032enc1_loop_6 + jnz L036enc1_loop_6 .byte 102,15,56,221,217 movl 48(%esp),%esp movl 40(%esp),%edi movups %xmm3,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -752,7 +832,7 @@ 
L_aesni_ctr32_encrypt_blocks_begin: andl $-16,%esp movl %ebp,80(%esp) cmpl $1,%eax - je L033ctr32_one_shortcut + je L037ctr32_one_shortcut movdqu (%ebx),%xmm7 movl $202182159,(%esp) movl $134810123,4(%esp) @@ -768,63 +848,59 @@ L_aesni_ctr32_encrypt_blocks_begin: .byte 102,15,58,34,253,3 movl 240(%edx),%ecx bswap %ebx - pxor %xmm1,%xmm1 pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movdqa (%esp),%xmm2 -.byte 102,15,58,34,203,0 +.byte 102,15,58,34,195,0 leal 3(%ebx),%ebp -.byte 102,15,58,34,197,0 +.byte 102,15,58,34,205,0 incl %ebx -.byte 102,15,58,34,203,1 +.byte 102,15,58,34,195,1 incl %ebp -.byte 102,15,58,34,197,1 +.byte 102,15,58,34,205,1 incl %ebx -.byte 102,15,58,34,203,2 +.byte 102,15,58,34,195,2 incl %ebp -.byte 102,15,58,34,197,2 - movdqa %xmm1,48(%esp) -.byte 102,15,56,0,202 - movdqa %xmm0,64(%esp) +.byte 102,15,58,34,205,2 + movdqa %xmm0,48(%esp) .byte 102,15,56,0,194 - pshufd $192,%xmm1,%xmm2 - pshufd $128,%xmm1,%xmm3 + movdqu (%edx),%xmm6 + movdqa %xmm1,64(%esp) +.byte 102,15,56,0,202 + pshufd $192,%xmm0,%xmm2 + pshufd $128,%xmm0,%xmm3 cmpl $6,%eax - jb L034ctr32_tail + jb L038ctr32_tail + pxor %xmm6,%xmm7 + shll $4,%ecx + movl $16,%ebx movdqa %xmm7,32(%esp) - shrl $1,%ecx movl %edx,%ebp - movl %ecx,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx subl $6,%eax - jmp L035ctr32_loop6 + jmp L039ctr32_loop6 .align 4,0x90 -L035ctr32_loop6: - pshufd $64,%xmm1,%xmm4 - movdqa 32(%esp),%xmm1 - pshufd $192,%xmm0,%xmm5 - por %xmm1,%xmm2 - pshufd $128,%xmm0,%xmm6 - por %xmm1,%xmm3 - pshufd $64,%xmm0,%xmm7 - por %xmm1,%xmm4 - por %xmm1,%xmm5 - por %xmm1,%xmm6 - por %xmm1,%xmm7 - movups (%ebp),%xmm0 - movups 16(%ebp),%xmm1 - leal 32(%ebp),%edx - decl %ecx +L039ctr32_loop6: + pshufd $64,%xmm0,%xmm4 + movdqa 32(%esp),%xmm0 + pshufd $192,%xmm1,%xmm5 pxor %xmm0,%xmm2 + pshufd $128,%xmm1,%xmm6 pxor %xmm0,%xmm3 -.byte 102,15,56,220,209 + pshufd $64,%xmm1,%xmm7 + movups 16(%ebp),%xmm1 pxor %xmm0,%xmm4 -.byte 102,15,56,220,217 pxor %xmm0,%xmm5 -.byte 102,15,56,220,225 +.byte 102,15,56,220,209 pxor %xmm0,%xmm6 -.byte 102,15,56,220,233 pxor %xmm0,%xmm7 +.byte 102,15,56,220,217 + movups 32(%ebp),%xmm0 + movl %ebx,%ecx +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 .byte 102,15,56,220,241 - movups (%edx),%xmm0 .byte 102,15,56,220,249 call L_aesni_encrypt6_enter movups (%esi),%xmm1 @@ -835,51 +911,51 @@ L035ctr32_loop6: movups %xmm2,(%edi) movdqa 16(%esp),%xmm0 xorps %xmm1,%xmm4 - movdqa 48(%esp),%xmm1 + movdqa 64(%esp),%xmm1 movups %xmm3,16(%edi) movups %xmm4,32(%edi) paddd %xmm0,%xmm1 - paddd 64(%esp),%xmm0 + paddd 48(%esp),%xmm0 movdqa (%esp),%xmm2 movups 48(%esi),%xmm3 movups 64(%esi),%xmm4 xorps %xmm3,%xmm5 movups 80(%esi),%xmm3 leal 96(%esi),%esi - movdqa %xmm1,48(%esp) -.byte 102,15,56,0,202 + movdqa %xmm0,48(%esp) +.byte 102,15,56,0,194 xorps %xmm4,%xmm6 movups %xmm5,48(%edi) xorps %xmm3,%xmm7 - movdqa %xmm0,64(%esp) -.byte 102,15,56,0,194 + movdqa %xmm1,64(%esp) +.byte 102,15,56,0,202 movups %xmm6,64(%edi) - pshufd $192,%xmm1,%xmm2 + pshufd $192,%xmm0,%xmm2 movups %xmm7,80(%edi) leal 96(%edi),%edi - movl %ebx,%ecx - pshufd $128,%xmm1,%xmm3 + pshufd $128,%xmm0,%xmm3 subl $6,%eax - jnc L035ctr32_loop6 + jnc L039ctr32_loop6 addl $6,%eax - jz L036ctr32_ret + jz L040ctr32_ret + movdqu (%ebp),%xmm7 movl %ebp,%edx - leal 1(,%ecx,2),%ecx - movdqa 32(%esp),%xmm7 -L034ctr32_tail: + pxor 32(%esp),%xmm7 + movl 240(%ebp),%ecx +L038ctr32_tail: por %xmm7,%xmm2 cmpl $2,%eax - jb L037ctr32_one - pshufd $64,%xmm1,%xmm4 + jb L041ctr32_one + pshufd $64,%xmm0,%xmm4 por %xmm7,%xmm3 - je L038ctr32_two - pshufd 
$192,%xmm0,%xmm5 + je L042ctr32_two + pshufd $192,%xmm1,%xmm5 por %xmm7,%xmm4 cmpl $4,%eax - jb L039ctr32_three - pshufd $128,%xmm0,%xmm6 + jb L043ctr32_three + pshufd $128,%xmm1,%xmm6 por %xmm7,%xmm5 - je L040ctr32_four + je L044ctr32_four por %xmm7,%xmm6 call __aesni_encrypt6 movups (%esi),%xmm1 @@ -897,39 +973,39 @@ L034ctr32_tail: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp L036ctr32_ret + jmp L040ctr32_ret .align 4,0x90 -L033ctr32_one_shortcut: +L037ctr32_one_shortcut: movups (%ebx),%xmm2 movl 240(%edx),%ecx -L037ctr32_one: +L041ctr32_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L041enc1_loop_7: +L045enc1_loop_7: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L041enc1_loop_7 + jnz L045enc1_loop_7 .byte 102,15,56,221,209 movups (%esi),%xmm6 xorps %xmm2,%xmm6 movups %xmm6,(%edi) - jmp L036ctr32_ret + jmp L040ctr32_ret .align 4,0x90 -L038ctr32_two: - call __aesni_encrypt3 +L042ctr32_two: + call __aesni_encrypt2 movups (%esi),%xmm5 movups 16(%esi),%xmm6 xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp L036ctr32_ret + jmp L040ctr32_ret .align 4,0x90 -L039ctr32_three: +L043ctr32_three: call __aesni_encrypt3 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -940,9 +1016,9 @@ L039ctr32_three: xorps %xmm7,%xmm4 movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp L036ctr32_ret + jmp L040ctr32_ret .align 4,0x90 -L040ctr32_four: +L044ctr32_four: call __aesni_encrypt4 movups (%esi),%xmm6 movups 16(%esi),%xmm7 @@ -956,7 +1032,18 @@ L040ctr32_four: xorps %xmm0,%xmm5 movups %xmm4,32(%edi) movups %xmm5,48(%edi) -L036ctr32_ret: +L040ctr32_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 movl 80(%esp),%esp popl %edi popl %esi @@ -979,12 +1066,12 @@ L_aesni_xts_encrypt_begin: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L042enc1_loop_8: +L046enc1_loop_8: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L042enc1_loop_8 + jnz L046enc1_loop_8 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1008,12 +1095,14 @@ L042enc1_loop_8: movl %edx,%ebp movl %ecx,%ebx subl $96,%eax - jc L043xts_enc_short - shrl $1,%ecx - movl %ecx,%ebx - jmp L044xts_enc_loop6 + jc L047xts_enc_short + shll $4,%ecx + movl $16,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + jmp L048xts_enc_loop6 .align 4,0x90 -L044xts_enc_loop6: +L048xts_enc_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1049,6 +1138,7 @@ L044xts_enc_loop6: pand %xmm3,%xmm7 movups (%esi),%xmm2 pxor %xmm1,%xmm7 + movl %ebx,%ecx movdqu 16(%esi),%xmm3 xorps %xmm0,%xmm2 movdqu 32(%esi),%xmm4 @@ -1064,19 +1154,17 @@ L044xts_enc_loop6: movdqa %xmm7,80(%esp) pxor %xmm1,%xmm7 movups 16(%ebp),%xmm1 - leal 32(%ebp),%edx pxor 16(%esp),%xmm3 -.byte 102,15,56,220,209 pxor 32(%esp),%xmm4 -.byte 102,15,56,220,217 +.byte 102,15,56,220,209 pxor 48(%esp),%xmm5 - decl %ecx -.byte 102,15,56,220,225 pxor 64(%esp),%xmm6 -.byte 102,15,56,220,233 +.byte 102,15,56,220,217 pxor %xmm0,%xmm7 + movups 32(%ebp),%xmm0 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 .byte 102,15,56,220,241 - movups (%edx),%xmm0 .byte 102,15,56,220,249 call L_aesni_encrypt6_enter movdqa 80(%esp),%xmm1 @@ -1101,26 +1189,25 @@ L044xts_enc_loop6: paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 - movl %ebx,%ecx pxor 
%xmm2,%xmm1 subl $96,%eax - jnc L044xts_enc_loop6 - leal 1(,%ecx,2),%ecx + jnc L048xts_enc_loop6 + movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -L043xts_enc_short: +L047xts_enc_short: addl $96,%eax - jz L045xts_enc_done6x + jz L049xts_enc_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb L046xts_enc_one + jb L050xts_enc_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je L047xts_enc_two + je L051xts_enc_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1129,7 +1216,7 @@ L043xts_enc_short: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb L048xts_enc_three + jb L052xts_enc_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1139,7 +1226,7 @@ L043xts_enc_short: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je L049xts_enc_four + je L053xts_enc_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1171,9 +1258,9 @@ L043xts_enc_short: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp L050xts_enc_done + jmp L054xts_enc_done .align 4,0x90 -L046xts_enc_one: +L050xts_enc_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1181,37 +1268,36 @@ L046xts_enc_one: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L051enc1_loop_9: +L055enc1_loop_9: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L051enc1_loop_9 + jnz L055enc1_loop_9 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp L050xts_enc_done + jmp L054xts_enc_done .align 4,0x90 -L047xts_enc_two: +L051xts_enc_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 leal 32(%esi),%esi xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 - xorps %xmm4,%xmm4 - call __aesni_encrypt3 + call __aesni_encrypt2 xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp L050xts_enc_done + jmp L054xts_enc_done .align 4,0x90 -L048xts_enc_three: +L052xts_enc_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1229,9 +1315,9 @@ L048xts_enc_three: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp L050xts_enc_done + jmp L054xts_enc_done .align 4,0x90 -L049xts_enc_four: +L053xts_enc_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1253,28 +1339,28 @@ L049xts_enc_four: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp L050xts_enc_done + jmp L054xts_enc_done .align 4,0x90 -L045xts_enc_done6x: +L049xts_enc_done6x: movl 112(%esp),%eax andl $15,%eax - jz L052xts_enc_ret + jz L056xts_enc_ret movdqa %xmm1,%xmm5 movl %eax,112(%esp) - jmp L053xts_enc_steal + jmp L057xts_enc_steal .align 4,0x90 -L050xts_enc_done: +L054xts_enc_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz L052xts_enc_ret + jz L056xts_enc_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm5 paddq %xmm1,%xmm1 pand 96(%esp),%xmm5 pxor %xmm1,%xmm5 -L053xts_enc_steal: +L057xts_enc_steal: movzbl (%esi),%ecx movzbl -16(%edi),%edx leal 1(%esi),%esi @@ -1282,7 +1368,7 @@ L053xts_enc_steal: movb %dl,(%edi) leal 1(%edi),%edi subl $1,%eax - jnz L053xts_enc_steal + jnz L057xts_enc_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1292,16 +1378,30 @@ L053xts_enc_steal: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L054enc1_loop_10: +L058enc1_loop_10: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L054enc1_loop_10 + jnz L058enc1_loop_10 .byte 
102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,-16(%edi) -L052xts_enc_ret: +L056xts_enc_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) movl 116(%esp),%esp popl %edi popl %esi @@ -1324,12 +1424,12 @@ L_aesni_xts_decrypt_begin: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L055enc1_loop_11: +L059enc1_loop_11: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L055enc1_loop_11 + jnz L059enc1_loop_11 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1358,12 +1458,14 @@ L055enc1_loop_11: pcmpgtd %xmm1,%xmm0 andl $-16,%eax subl $96,%eax - jc L056xts_dec_short - shrl $1,%ecx - movl %ecx,%ebx - jmp L057xts_dec_loop6 + jc L060xts_dec_short + shll $4,%ecx + movl $16,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + jmp L061xts_dec_loop6 .align 4,0x90 -L057xts_dec_loop6: +L061xts_dec_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1399,6 +1501,7 @@ L057xts_dec_loop6: pand %xmm3,%xmm7 movups (%esi),%xmm2 pxor %xmm1,%xmm7 + movl %ebx,%ecx movdqu 16(%esi),%xmm3 xorps %xmm0,%xmm2 movdqu 32(%esi),%xmm4 @@ -1414,19 +1517,17 @@ L057xts_dec_loop6: movdqa %xmm7,80(%esp) pxor %xmm1,%xmm7 movups 16(%ebp),%xmm1 - leal 32(%ebp),%edx pxor 16(%esp),%xmm3 -.byte 102,15,56,222,209 pxor 32(%esp),%xmm4 -.byte 102,15,56,222,217 +.byte 102,15,56,222,209 pxor 48(%esp),%xmm5 - decl %ecx -.byte 102,15,56,222,225 pxor 64(%esp),%xmm6 -.byte 102,15,56,222,233 +.byte 102,15,56,222,217 pxor %xmm0,%xmm7 + movups 32(%ebp),%xmm0 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 .byte 102,15,56,222,241 - movups (%edx),%xmm0 .byte 102,15,56,222,249 call L_aesni_decrypt6_enter movdqa 80(%esp),%xmm1 @@ -1451,26 +1552,25 @@ L057xts_dec_loop6: paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 - movl %ebx,%ecx pxor %xmm2,%xmm1 subl $96,%eax - jnc L057xts_dec_loop6 - leal 1(,%ecx,2),%ecx + jnc L061xts_dec_loop6 + movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -L056xts_dec_short: +L060xts_dec_short: addl $96,%eax - jz L058xts_dec_done6x + jz L062xts_dec_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb L059xts_dec_one + jb L063xts_dec_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je L060xts_dec_two + je L064xts_dec_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1479,7 +1579,7 @@ L056xts_dec_short: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb L061xts_dec_three + jb L065xts_dec_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1489,7 +1589,7 @@ L056xts_dec_short: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je L062xts_dec_four + je L066xts_dec_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1521,9 +1621,9 @@ L056xts_dec_short: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp L063xts_dec_done + jmp L067xts_dec_done .align 4,0x90 -L059xts_dec_one: +L063xts_dec_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1531,36 +1631,36 @@ L059xts_dec_one: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L064dec1_loop_12: +L068dec1_loop_12: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L064dec1_loop_12 + jnz L068dec1_loop_12 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups 
%xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp L063xts_dec_done + jmp L067xts_dec_done .align 4,0x90 -L060xts_dec_two: +L064xts_dec_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 leal 32(%esi),%esi xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 - call __aesni_decrypt3 + call __aesni_decrypt2 xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp L063xts_dec_done + jmp L067xts_dec_done .align 4,0x90 -L061xts_dec_three: +L065xts_dec_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1578,9 +1678,9 @@ L061xts_dec_three: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp L063xts_dec_done + jmp L067xts_dec_done .align 4,0x90 -L062xts_dec_four: +L066xts_dec_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1602,20 +1702,20 @@ L062xts_dec_four: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp L063xts_dec_done + jmp L067xts_dec_done .align 4,0x90 -L058xts_dec_done6x: +L062xts_dec_done6x: movl 112(%esp),%eax andl $15,%eax - jz L065xts_dec_ret + jz L069xts_dec_ret movl %eax,112(%esp) - jmp L066xts_dec_only_one_more + jmp L070xts_dec_only_one_more .align 4,0x90 -L063xts_dec_done: +L067xts_dec_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz L065xts_dec_ret + jz L069xts_dec_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm2 @@ -1625,7 +1725,7 @@ L063xts_dec_done: pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 -L066xts_dec_only_one_more: +L070xts_dec_only_one_more: pshufd $19,%xmm0,%xmm5 movdqa %xmm1,%xmm6 paddq %xmm1,%xmm1 @@ -1639,16 +1739,16 @@ L066xts_dec_only_one_more: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L067dec1_loop_13: +L071dec1_loop_13: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L067dec1_loop_13 + jnz L071dec1_loop_13 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) -L068xts_dec_steal: +L072xts_dec_steal: movzbl 16(%esi),%ecx movzbl (%edi),%edx leal 1(%esi),%esi @@ -1656,7 +1756,7 @@ L068xts_dec_steal: movb %dl,16(%edi) leal 1(%edi),%edi subl $1,%eax - jnz L068xts_dec_steal + jnz L072xts_dec_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1666,105 +1766,906 @@ L068xts_dec_steal: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L069dec1_loop_14: +L073dec1_loop_14: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L069dec1_loop_14 + jnz L073dec1_loop_14 .byte 102,15,56,223,209 xorps %xmm6,%xmm2 movups %xmm2,(%edi) -L065xts_dec_ret: +L069xts_dec_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) movl 116(%esp),%esp popl %edi popl %esi popl %ebx popl %ebp ret -.globl _aesni_cbc_encrypt +.globl _aesni_ocb_encrypt .align 4 -_aesni_cbc_encrypt: -L_aesni_cbc_encrypt_begin: +_aesni_ocb_encrypt: +L_aesni_ocb_encrypt_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi + movl 40(%esp),%ecx + movl 48(%esp),%ebx movl 20(%esp),%esi - movl %esp,%ebx movl 24(%esp),%edi - subl $24,%ebx movl 28(%esp),%eax - andl $-16,%ebx movl 32(%esp),%edx + movdqu (%ecx),%xmm0 movl 36(%esp),%ebp - testl %eax,%eax - jz L070cbc_abort - cmpl $0,40(%esp) - xchgl %esp,%ebx - movups (%ebp),%xmm7 + movdqu (%ebx),%xmm1 + movl 44(%esp),%ebx + movl %esp,%ecx + subl $132,%esp + 
andl $-16,%esp + subl %esi,%edi + shll $4,%eax + leal -96(%esi,%eax,1),%eax + movl %edi,120(%esp) + movl %eax,124(%esp) + movl %ecx,128(%esp) movl 240(%edx),%ecx - movl %edx,%ebp - movl %ebx,16(%esp) - movl %ecx,%ebx - je L071cbc_decrypt - movaps %xmm7,%xmm2 - cmpl $16,%eax - jb L072cbc_enc_tail - subl $16,%eax - jmp L073cbc_enc_loop -.align 4,0x90 -L073cbc_enc_loop: - movups (%esi),%xmm7 + testl $1,%ebp + jnz L074odd + bsfl %ebp,%eax + addl $1,%ebp + shll $4,%eax + movdqu (%ebx,%eax,1),%xmm7 + movl %edx,%eax + movdqu (%esi),%xmm2 leal 16(%esi),%esi + pxor %xmm0,%xmm7 + pxor %xmm2,%xmm1 + pxor %xmm7,%xmm2 + movdqa %xmm1,%xmm6 movups (%edx),%xmm0 movups 16(%edx),%xmm1 - xorps %xmm0,%xmm7 leal 32(%edx),%edx - xorps %xmm7,%xmm2 -L074enc1_loop_15: + xorps %xmm0,%xmm2 +L075enc1_loop_15: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L074enc1_loop_15 + jnz L075enc1_loop_15 .byte 102,15,56,221,209 - movl %ebx,%ecx - movl %ebp,%edx - movups %xmm2,(%edi) - leal 16(%edi),%edi - subl $16,%eax - jnc L073cbc_enc_loop - addl $16,%eax - jnz L072cbc_enc_tail - movaps %xmm2,%xmm7 - jmp L075cbc_ret -L072cbc_enc_tail: - movl %eax,%ecx -.long 2767451785 - movl $16,%ecx - subl %eax,%ecx - xorl %eax,%eax -.long 2868115081 - leal -16(%edi),%edi - movl %ebx,%ecx - movl %edi,%esi - movl %ebp,%edx - jmp L073cbc_enc_loop -.align 4,0x90 -L071cbc_decrypt: - cmpl $80,%eax - jbe L076cbc_dec_tail - movaps %xmm7,(%esp) - subl $80,%eax - jmp L077cbc_dec_loop6_enter -.align 4,0x90 -L078cbc_dec_loop6: - movaps %xmm0,(%esp) - movups %xmm7,(%edi) - leal 16(%edi),%edi -L077cbc_dec_loop6_enter: - movdqu (%esi),%xmm2 + xorps %xmm7,%xmm2 + movdqa %xmm7,%xmm0 + movdqa %xmm6,%xmm1 + movups %xmm2,-16(%edi,%esi,1) + movl 240(%eax),%ecx + movl %eax,%edx + movl 124(%esp),%eax +L074odd: + shll $4,%ecx + movl $16,%edi + subl %ecx,%edi + movl %edx,112(%esp) + leal 32(%edx,%ecx,1),%edx + movl %edi,116(%esp) + cmpl %eax,%esi + ja L076short + jmp L077grandloop +.align 5,0x90 +L077grandloop: + leal 1(%ebp),%ecx + leal 3(%ebp),%eax + leal 5(%ebp),%edi + addl $6,%ebp + bsfl %ecx,%ecx + bsfl %eax,%eax + bsfl %edi,%edi + shll $4,%ecx + shll $4,%eax + shll $4,%edi + movdqu (%ebx),%xmm2 + movdqu (%ebx,%ecx,1),%xmm3 + movl 116(%esp),%ecx + movdqa %xmm2,%xmm4 + movdqu (%ebx,%eax,1),%xmm5 + movdqa %xmm2,%xmm6 + movdqu (%ebx,%edi,1),%xmm7 + pxor %xmm0,%xmm2 + pxor %xmm2,%xmm3 + movdqa %xmm2,(%esp) + pxor %xmm3,%xmm4 + movdqa %xmm3,16(%esp) + pxor %xmm4,%xmm5 + movdqa %xmm4,32(%esp) + pxor %xmm5,%xmm6 + movdqa %xmm5,48(%esp) + pxor %xmm6,%xmm7 + movdqa %xmm6,64(%esp) + movdqa %xmm7,80(%esp) + movups -48(%edx,%ecx,1),%xmm0 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + movdqu 80(%esi),%xmm7 + leal 96(%esi),%esi + pxor %xmm2,%xmm1 + pxor %xmm0,%xmm2 + pxor %xmm3,%xmm1 + pxor %xmm0,%xmm3 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + pxor %xmm5,%xmm1 + pxor %xmm0,%xmm5 + pxor %xmm6,%xmm1 + pxor %xmm0,%xmm6 + pxor %xmm7,%xmm1 + pxor %xmm0,%xmm7 + movdqa %xmm1,96(%esp) + movups -32(%edx,%ecx,1),%xmm1 + pxor (%esp),%xmm2 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 + pxor 80(%esp),%xmm7 + movups -16(%edx,%ecx,1),%xmm0 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movl 120(%esp),%edi + movl 124(%esp),%eax + call L_aesni_encrypt6_enter + movdqa 80(%esp),%xmm0 + pxor (%esp),%xmm2 + pxor 16(%esp),%xmm3 + pxor 
32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 + pxor %xmm0,%xmm7 + movdqa 96(%esp),%xmm1 + movdqu %xmm2,-96(%edi,%esi,1) + movdqu %xmm3,-80(%edi,%esi,1) + movdqu %xmm4,-64(%edi,%esi,1) + movdqu %xmm5,-48(%edi,%esi,1) + movdqu %xmm6,-32(%edi,%esi,1) + movdqu %xmm7,-16(%edi,%esi,1) + cmpl %eax,%esi + jb L077grandloop +L076short: + addl $96,%eax + subl %esi,%eax + jz L078done + cmpl $32,%eax + jb L079one + je L080two + cmpl $64,%eax + jb L081three + je L082four + leal 1(%ebp),%ecx + leal 3(%ebp),%eax + bsfl %ecx,%ecx + bsfl %eax,%eax + shll $4,%ecx + shll $4,%eax + movdqu (%ebx),%xmm2 + movdqu (%ebx,%ecx,1),%xmm3 + movl 116(%esp),%ecx + movdqa %xmm2,%xmm4 + movdqu (%ebx,%eax,1),%xmm5 + movdqa %xmm2,%xmm6 + pxor %xmm0,%xmm2 + pxor %xmm2,%xmm3 + movdqa %xmm2,(%esp) + pxor %xmm3,%xmm4 + movdqa %xmm3,16(%esp) + pxor %xmm4,%xmm5 + movdqa %xmm4,32(%esp) + pxor %xmm5,%xmm6 + movdqa %xmm5,48(%esp) + pxor %xmm6,%xmm7 + movdqa %xmm6,64(%esp) + movups -48(%edx,%ecx,1),%xmm0 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm2,%xmm1 + pxor %xmm0,%xmm2 + pxor %xmm3,%xmm1 + pxor %xmm0,%xmm3 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + pxor %xmm5,%xmm1 + pxor %xmm0,%xmm5 + pxor %xmm6,%xmm1 + pxor %xmm0,%xmm6 + movdqa %xmm1,96(%esp) + movups -32(%edx,%ecx,1),%xmm1 + pxor (%esp),%xmm2 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 + movups -16(%edx,%ecx,1),%xmm0 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movl 120(%esp),%edi + call L_aesni_encrypt6_enter + movdqa 64(%esp),%xmm0 + pxor (%esp),%xmm2 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor %xmm0,%xmm6 + movdqa 96(%esp),%xmm1 + movdqu %xmm2,(%edi,%esi,1) + movdqu %xmm3,16(%edi,%esi,1) + movdqu %xmm4,32(%edi,%esi,1) + movdqu %xmm5,48(%edi,%esi,1) + movdqu %xmm6,64(%edi,%esi,1) + jmp L078done +.align 4,0x90 +L079one: + movdqu (%ebx),%xmm7 + movl 112(%esp),%edx + movdqu (%esi),%xmm2 + movl 240(%edx),%ecx + pxor %xmm0,%xmm7 + pxor %xmm2,%xmm1 + pxor %xmm7,%xmm2 + movdqa %xmm1,%xmm6 + movl 120(%esp),%edi + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L083enc1_loop_16: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L083enc1_loop_16 +.byte 102,15,56,221,209 + xorps %xmm7,%xmm2 + movdqa %xmm7,%xmm0 + movdqa %xmm6,%xmm1 + movups %xmm2,(%edi,%esi,1) + jmp L078done +.align 4,0x90 +L080two: + leal 1(%ebp),%ecx + movl 112(%esp),%edx + bsfl %ecx,%ecx + shll $4,%ecx + movdqu (%ebx),%xmm6 + movdqu (%ebx,%ecx,1),%xmm7 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movl 240(%edx),%ecx + pxor %xmm0,%xmm6 + pxor %xmm6,%xmm7 + pxor %xmm2,%xmm1 + pxor %xmm6,%xmm2 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm3 + movdqa %xmm1,%xmm5 + movl 120(%esp),%edi + call __aesni_encrypt2 + xorps %xmm6,%xmm2 + xorps %xmm7,%xmm3 + movdqa %xmm7,%xmm0 + movdqa %xmm5,%xmm1 + movups %xmm2,(%edi,%esi,1) + movups %xmm3,16(%edi,%esi,1) + jmp L078done +.align 4,0x90 +L081three: + leal 1(%ebp),%ecx + movl 112(%esp),%edx + bsfl %ecx,%ecx + shll $4,%ecx + movdqu (%ebx),%xmm5 + movdqu (%ebx,%ecx,1),%xmm6 + movdqa %xmm5,%xmm7 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movl 240(%edx),%ecx + pxor %xmm0,%xmm5 + pxor %xmm5,%xmm6 + pxor %xmm6,%xmm7 + pxor %xmm2,%xmm1 + pxor %xmm5,%xmm2 + pxor %xmm3,%xmm1 + pxor %xmm6,%xmm3 + pxor 
%xmm4,%xmm1 + pxor %xmm7,%xmm4 + movdqa %xmm1,96(%esp) + movl 120(%esp),%edi + call __aesni_encrypt3 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm4 + movdqa %xmm7,%xmm0 + movdqa 96(%esp),%xmm1 + movups %xmm2,(%edi,%esi,1) + movups %xmm3,16(%edi,%esi,1) + movups %xmm4,32(%edi,%esi,1) + jmp L078done +.align 4,0x90 +L082four: + leal 1(%ebp),%ecx + leal 3(%ebp),%eax + bsfl %ecx,%ecx + bsfl %eax,%eax + movl 112(%esp),%edx + shll $4,%ecx + shll $4,%eax + movdqu (%ebx),%xmm4 + movdqu (%ebx,%ecx,1),%xmm5 + movdqa %xmm4,%xmm6 + movdqu (%ebx,%eax,1),%xmm7 + pxor %xmm0,%xmm4 + movdqu (%esi),%xmm2 + pxor %xmm4,%xmm5 + movdqu 16(%esi),%xmm3 + pxor %xmm5,%xmm6 + movdqa %xmm4,(%esp) + pxor %xmm6,%xmm7 + movdqa %xmm5,16(%esp) + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movl 240(%edx),%ecx + pxor %xmm2,%xmm1 + pxor (%esp),%xmm2 + pxor %xmm3,%xmm1 + pxor 16(%esp),%xmm3 + pxor %xmm4,%xmm1 + pxor %xmm6,%xmm4 + pxor %xmm5,%xmm1 + pxor %xmm7,%xmm5 + movdqa %xmm1,96(%esp) + movl 120(%esp),%edi + call __aesni_encrypt4 + xorps (%esp),%xmm2 + xorps 16(%esp),%xmm3 + xorps %xmm6,%xmm4 + movups %xmm2,(%edi,%esi,1) + xorps %xmm7,%xmm5 + movups %xmm3,16(%edi,%esi,1) + movdqa %xmm7,%xmm0 + movups %xmm4,32(%edi,%esi,1) + movdqa 96(%esp),%xmm1 + movups %xmm5,48(%edi,%esi,1) +L078done: + movl 128(%esp),%edx + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + movdqa %xmm2,(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm2,16(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm2,32(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm2,48(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm2,64(%esp) + movdqa %xmm2,80(%esp) + movdqa %xmm2,96(%esp) + leal (%edx),%esp + movl 40(%esp),%ecx + movl 48(%esp),%ebx + movdqu %xmm0,(%ecx) + pxor %xmm0,%xmm0 + movdqu %xmm1,(%ebx) + pxor %xmm1,%xmm1 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aesni_ocb_decrypt +.align 4 +_aesni_ocb_decrypt: +L_aesni_ocb_decrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 40(%esp),%ecx + movl 48(%esp),%ebx + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movdqu (%ecx),%xmm0 + movl 36(%esp),%ebp + movdqu (%ebx),%xmm1 + movl 44(%esp),%ebx + movl %esp,%ecx + subl $132,%esp + andl $-16,%esp + subl %esi,%edi + shll $4,%eax + leal -96(%esi,%eax,1),%eax + movl %edi,120(%esp) + movl %eax,124(%esp) + movl %ecx,128(%esp) + movl 240(%edx),%ecx + testl $1,%ebp + jnz L084odd + bsfl %ebp,%eax + addl $1,%ebp + shll $4,%eax + movdqu (%ebx,%eax,1),%xmm7 + movl %edx,%eax + movdqu (%esi),%xmm2 + leal 16(%esi),%esi + pxor %xmm0,%xmm7 + pxor %xmm7,%xmm2 + movdqa %xmm1,%xmm6 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L085dec1_loop_17: +.byte 102,15,56,222,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L085dec1_loop_17 +.byte 102,15,56,223,209 + xorps %xmm7,%xmm2 + movaps %xmm6,%xmm1 + movdqa %xmm7,%xmm0 + xorps %xmm2,%xmm1 + movups %xmm2,-16(%edi,%esi,1) + movl 240(%eax),%ecx + movl %eax,%edx + movl 124(%esp),%eax +L084odd: + shll $4,%ecx + movl $16,%edi + subl %ecx,%edi + movl %edx,112(%esp) + leal 32(%edx,%ecx,1),%edx + movl %edi,116(%esp) + cmpl %eax,%esi + ja L086short + jmp L087grandloop +.align 5,0x90 +L087grandloop: + leal 1(%ebp),%ecx + leal 3(%ebp),%eax + leal 5(%ebp),%edi + addl $6,%ebp + bsfl %ecx,%ecx + bsfl %eax,%eax + bsfl %edi,%edi + shll $4,%ecx + shll $4,%eax + shll $4,%edi + movdqu (%ebx),%xmm2 + movdqu (%ebx,%ecx,1),%xmm3 + movl 116(%esp),%ecx + movdqa %xmm2,%xmm4 + movdqu (%ebx,%eax,1),%xmm5 + movdqa %xmm2,%xmm6 + movdqu (%ebx,%edi,1),%xmm7 + pxor %xmm0,%xmm2 + 
pxor %xmm2,%xmm3 + movdqa %xmm2,(%esp) + pxor %xmm3,%xmm4 + movdqa %xmm3,16(%esp) + pxor %xmm4,%xmm5 + movdqa %xmm4,32(%esp) + pxor %xmm5,%xmm6 + movdqa %xmm5,48(%esp) + pxor %xmm6,%xmm7 + movdqa %xmm6,64(%esp) + movdqa %xmm7,80(%esp) + movups -48(%edx,%ecx,1),%xmm0 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + movdqu 80(%esi),%xmm7 + leal 96(%esi),%esi + movdqa %xmm1,96(%esp) + pxor %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 + movups -32(%edx,%ecx,1),%xmm1 + pxor (%esp),%xmm2 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 + pxor 80(%esp),%xmm7 + movups -16(%edx,%ecx,1),%xmm0 +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movl 120(%esp),%edi + movl 124(%esp),%eax + call L_aesni_decrypt6_enter + movdqa 80(%esp),%xmm0 + pxor (%esp),%xmm2 + movdqa 96(%esp),%xmm1 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 + pxor %xmm0,%xmm7 + pxor %xmm2,%xmm1 + movdqu %xmm2,-96(%edi,%esi,1) + pxor %xmm3,%xmm1 + movdqu %xmm3,-80(%edi,%esi,1) + pxor %xmm4,%xmm1 + movdqu %xmm4,-64(%edi,%esi,1) + pxor %xmm5,%xmm1 + movdqu %xmm5,-48(%edi,%esi,1) + pxor %xmm6,%xmm1 + movdqu %xmm6,-32(%edi,%esi,1) + pxor %xmm7,%xmm1 + movdqu %xmm7,-16(%edi,%esi,1) + cmpl %eax,%esi + jb L087grandloop +L086short: + addl $96,%eax + subl %esi,%eax + jz L088done + cmpl $32,%eax + jb L089one + je L090two + cmpl $64,%eax + jb L091three + je L092four + leal 1(%ebp),%ecx + leal 3(%ebp),%eax + bsfl %ecx,%ecx + bsfl %eax,%eax + shll $4,%ecx + shll $4,%eax + movdqu (%ebx),%xmm2 + movdqu (%ebx,%ecx,1),%xmm3 + movl 116(%esp),%ecx + movdqa %xmm2,%xmm4 + movdqu (%ebx,%eax,1),%xmm5 + movdqa %xmm2,%xmm6 + pxor %xmm0,%xmm2 + pxor %xmm2,%xmm3 + movdqa %xmm2,(%esp) + pxor %xmm3,%xmm4 + movdqa %xmm3,16(%esp) + pxor %xmm4,%xmm5 + movdqa %xmm4,32(%esp) + pxor %xmm5,%xmm6 + movdqa %xmm5,48(%esp) + pxor %xmm6,%xmm7 + movdqa %xmm6,64(%esp) + movups -48(%edx,%ecx,1),%xmm0 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + pxor %xmm7,%xmm7 + movdqa %xmm1,96(%esp) + pxor %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + movups -32(%edx,%ecx,1),%xmm1 + pxor (%esp),%xmm2 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 + movups -16(%edx,%ecx,1),%xmm0 +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movl 120(%esp),%edi + call L_aesni_decrypt6_enter + movdqa 64(%esp),%xmm0 + pxor (%esp),%xmm2 + movdqa 96(%esp),%xmm1 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + pxor 48(%esp),%xmm5 + pxor %xmm0,%xmm6 + pxor %xmm2,%xmm1 + movdqu %xmm2,(%edi,%esi,1) + pxor %xmm3,%xmm1 + movdqu %xmm3,16(%edi,%esi,1) + pxor %xmm4,%xmm1 + movdqu %xmm4,32(%edi,%esi,1) + pxor %xmm5,%xmm1 + movdqu %xmm5,48(%edi,%esi,1) + pxor %xmm6,%xmm1 + movdqu %xmm6,64(%edi,%esi,1) + jmp L088done +.align 4,0x90 +L089one: + movdqu (%ebx),%xmm7 + movl 112(%esp),%edx + movdqu (%esi),%xmm2 + movl 240(%edx),%ecx + pxor %xmm0,%xmm7 + pxor %xmm7,%xmm2 + movdqa %xmm1,%xmm6 + movl 120(%esp),%edi + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L093dec1_loop_18: +.byte 102,15,56,222,209 + decl %ecx + movups 
(%edx),%xmm1 + leal 16(%edx),%edx + jnz L093dec1_loop_18 +.byte 102,15,56,223,209 + xorps %xmm7,%xmm2 + movaps %xmm6,%xmm1 + movdqa %xmm7,%xmm0 + xorps %xmm2,%xmm1 + movups %xmm2,(%edi,%esi,1) + jmp L088done +.align 4,0x90 +L090two: + leal 1(%ebp),%ecx + movl 112(%esp),%edx + bsfl %ecx,%ecx + shll $4,%ecx + movdqu (%ebx),%xmm6 + movdqu (%ebx,%ecx,1),%xmm7 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movl 240(%edx),%ecx + movdqa %xmm1,%xmm5 + pxor %xmm0,%xmm6 + pxor %xmm6,%xmm7 + pxor %xmm6,%xmm2 + pxor %xmm7,%xmm3 + movl 120(%esp),%edi + call __aesni_decrypt2 + xorps %xmm6,%xmm2 + xorps %xmm7,%xmm3 + movdqa %xmm7,%xmm0 + xorps %xmm2,%xmm5 + movups %xmm2,(%edi,%esi,1) + xorps %xmm3,%xmm5 + movups %xmm3,16(%edi,%esi,1) + movaps %xmm5,%xmm1 + jmp L088done +.align 4,0x90 +L091three: + leal 1(%ebp),%ecx + movl 112(%esp),%edx + bsfl %ecx,%ecx + shll $4,%ecx + movdqu (%ebx),%xmm5 + movdqu (%ebx,%ecx,1),%xmm6 + movdqa %xmm5,%xmm7 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movl 240(%edx),%ecx + movdqa %xmm1,96(%esp) + pxor %xmm0,%xmm5 + pxor %xmm5,%xmm6 + pxor %xmm6,%xmm7 + pxor %xmm5,%xmm2 + pxor %xmm6,%xmm3 + pxor %xmm7,%xmm4 + movl 120(%esp),%edi + call __aesni_decrypt3 + movdqa 96(%esp),%xmm1 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm4 + movups %xmm2,(%edi,%esi,1) + pxor %xmm2,%xmm1 + movdqa %xmm7,%xmm0 + movups %xmm3,16(%edi,%esi,1) + pxor %xmm3,%xmm1 + movups %xmm4,32(%edi,%esi,1) + pxor %xmm4,%xmm1 + jmp L088done +.align 4,0x90 +L092four: + leal 1(%ebp),%ecx + leal 3(%ebp),%eax + bsfl %ecx,%ecx + bsfl %eax,%eax + movl 112(%esp),%edx + shll $4,%ecx + shll $4,%eax + movdqu (%ebx),%xmm4 + movdqu (%ebx,%ecx,1),%xmm5 + movdqa %xmm4,%xmm6 + movdqu (%ebx,%eax,1),%xmm7 + pxor %xmm0,%xmm4 + movdqu (%esi),%xmm2 + pxor %xmm4,%xmm5 + movdqu 16(%esi),%xmm3 + pxor %xmm5,%xmm6 + movdqa %xmm4,(%esp) + pxor %xmm6,%xmm7 + movdqa %xmm5,16(%esp) + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movl 240(%edx),%ecx + movdqa %xmm1,96(%esp) + pxor (%esp),%xmm2 + pxor 16(%esp),%xmm3 + pxor %xmm6,%xmm4 + pxor %xmm7,%xmm5 + movl 120(%esp),%edi + call __aesni_decrypt4 + movdqa 96(%esp),%xmm1 + xorps (%esp),%xmm2 + xorps 16(%esp),%xmm3 + xorps %xmm6,%xmm4 + movups %xmm2,(%edi,%esi,1) + pxor %xmm2,%xmm1 + xorps %xmm7,%xmm5 + movups %xmm3,16(%edi,%esi,1) + pxor %xmm3,%xmm1 + movdqa %xmm7,%xmm0 + movups %xmm4,32(%edi,%esi,1) + pxor %xmm4,%xmm1 + movups %xmm5,48(%edi,%esi,1) + pxor %xmm5,%xmm1 +L088done: + movl 128(%esp),%edx + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + movdqa %xmm2,(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm2,16(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm2,32(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm2,48(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm2,64(%esp) + movdqa %xmm2,80(%esp) + movdqa %xmm2,96(%esp) + leal (%edx),%esp + movl 40(%esp),%ecx + movl 48(%esp),%ebx + movdqu %xmm0,(%ecx) + pxor %xmm0,%xmm0 + movdqu %xmm1,(%ebx) + pxor %xmm1,%xmm1 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aesni_cbc_encrypt +.align 4 +_aesni_cbc_encrypt: +L_aesni_cbc_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl %esp,%ebx + movl 24(%esp),%edi + subl $24,%ebx + movl 28(%esp),%eax + andl $-16,%ebx + movl 32(%esp),%edx + movl 36(%esp),%ebp + testl %eax,%eax + jz L094cbc_abort + cmpl $0,40(%esp) + xchgl %esp,%ebx + movups (%ebp),%xmm7 + movl 240(%edx),%ecx + movl %edx,%ebp + movl %ebx,16(%esp) + movl %ecx,%ebx + je L095cbc_decrypt + movaps %xmm7,%xmm2 + cmpl $16,%eax + jb L096cbc_enc_tail + subl $16,%eax + jmp L097cbc_enc_loop +.align 
4,0x90 +L097cbc_enc_loop: + movups (%esi),%xmm7 + leal 16(%esi),%esi + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm7 + leal 32(%edx),%edx + xorps %xmm7,%xmm2 +L098enc1_loop_19: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L098enc1_loop_19 +.byte 102,15,56,221,209 + movl %ebx,%ecx + movl %ebp,%edx + movups %xmm2,(%edi) + leal 16(%edi),%edi + subl $16,%eax + jnc L097cbc_enc_loop + addl $16,%eax + jnz L096cbc_enc_tail + movaps %xmm2,%xmm7 + pxor %xmm2,%xmm2 + jmp L099cbc_ret +L096cbc_enc_tail: + movl %eax,%ecx +.long 2767451785 + movl $16,%ecx + subl %eax,%ecx + xorl %eax,%eax +.long 2868115081 + leal -16(%edi),%edi + movl %ebx,%ecx + movl %edi,%esi + movl %ebp,%edx + jmp L097cbc_enc_loop +.align 4,0x90 +L095cbc_decrypt: + cmpl $80,%eax + jbe L100cbc_dec_tail + movaps %xmm7,(%esp) + subl $80,%eax + jmp L101cbc_dec_loop6_enter +.align 4,0x90 +L102cbc_dec_loop6: + movaps %xmm0,(%esp) + movups %xmm7,(%edi) + leal 16(%edi),%edi +L101cbc_dec_loop6_enter: + movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 movdqu 48(%esi),%xmm5 @@ -1793,28 +2694,28 @@ L077cbc_dec_loop6_enter: movups %xmm6,64(%edi) leal 80(%edi),%edi subl $96,%eax - ja L078cbc_dec_loop6 + ja L102cbc_dec_loop6 movaps %xmm7,%xmm2 movaps %xmm0,%xmm7 addl $80,%eax - jle L079cbc_dec_tail_collected + jle L103cbc_dec_clear_tail_collected movups %xmm2,(%edi) leal 16(%edi),%edi -L076cbc_dec_tail: +L100cbc_dec_tail: movups (%esi),%xmm2 movaps %xmm2,%xmm6 cmpl $16,%eax - jbe L080cbc_dec_one + jbe L104cbc_dec_one movups 16(%esi),%xmm3 movaps %xmm3,%xmm5 cmpl $32,%eax - jbe L081cbc_dec_two + jbe L105cbc_dec_two movups 32(%esi),%xmm4 cmpl $48,%eax - jbe L082cbc_dec_three + jbe L106cbc_dec_three movups 48(%esi),%xmm5 cmpl $64,%eax - jbe L083cbc_dec_four + jbe L107cbc_dec_four movups 64(%esi),%xmm6 movaps %xmm7,(%esp) movups (%esi),%xmm2 @@ -1832,56 +2733,62 @@ L076cbc_dec_tail: xorps %xmm0,%xmm6 movups %xmm2,(%edi) movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 movups %xmm5,48(%edi) + pxor %xmm5,%xmm5 leal 64(%edi),%edi movaps %xmm6,%xmm2 + pxor %xmm6,%xmm6 subl $80,%eax - jmp L079cbc_dec_tail_collected + jmp L108cbc_dec_tail_collected .align 4,0x90 -L080cbc_dec_one: +L104cbc_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L084dec1_loop_16: +L109dec1_loop_20: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L084dec1_loop_16 + jnz L109dec1_loop_20 .byte 102,15,56,223,209 xorps %xmm7,%xmm2 movaps %xmm6,%xmm7 subl $16,%eax - jmp L079cbc_dec_tail_collected + jmp L108cbc_dec_tail_collected .align 4,0x90 -L081cbc_dec_two: - xorps %xmm4,%xmm4 - call __aesni_decrypt3 +L105cbc_dec_two: + call __aesni_decrypt2 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movaps %xmm3,%xmm2 + pxor %xmm3,%xmm3 leal 16(%edi),%edi movaps %xmm5,%xmm7 subl $32,%eax - jmp L079cbc_dec_tail_collected + jmp L108cbc_dec_tail_collected .align 4,0x90 -L082cbc_dec_three: +L106cbc_dec_three: call __aesni_decrypt3 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 xorps %xmm5,%xmm4 movups %xmm2,(%edi) movaps %xmm4,%xmm2 + pxor %xmm4,%xmm4 movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 leal 32(%edi),%edi movups 32(%esi),%xmm7 subl $48,%eax - jmp L079cbc_dec_tail_collected + jmp L108cbc_dec_tail_collected .align 4,0x90 -L083cbc_dec_four: +L107cbc_dec_four: call __aesni_decrypt4 movups 16(%esi),%xmm1 movups 32(%esi),%xmm0 @@ -1891,28 +2798,44 @@ L083cbc_dec_four: movups %xmm2,(%edi) xorps %xmm1,%xmm4 movups %xmm3,16(%edi) + 
pxor %xmm3,%xmm3 xorps %xmm0,%xmm5 movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 leal 48(%edi),%edi movaps %xmm5,%xmm2 + pxor %xmm5,%xmm5 subl $64,%eax -L079cbc_dec_tail_collected: + jmp L108cbc_dec_tail_collected +.align 4,0x90 +L103cbc_dec_clear_tail_collected: + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 +L108cbc_dec_tail_collected: andl $15,%eax - jnz L085cbc_dec_tail_partial + jnz L110cbc_dec_tail_partial movups %xmm2,(%edi) - jmp L075cbc_ret + pxor %xmm0,%xmm0 + jmp L099cbc_ret .align 4,0x90 -L085cbc_dec_tail_partial: +L110cbc_dec_tail_partial: movaps %xmm2,(%esp) + pxor %xmm0,%xmm0 movl $16,%ecx movl %esp,%esi subl %eax,%ecx .long 2767451785 -L075cbc_ret: + movdqa %xmm2,(%esp) +L099cbc_ret: movl 16(%esp),%esp movl 36(%esp),%ebp + pxor %xmm2,%xmm2 + pxor %xmm1,%xmm1 movups %xmm7,(%ebp) -L070cbc_abort: + pxor %xmm7,%xmm7 +L094cbc_abort: popl %edi popl %esi popl %ebx @@ -1920,52 +2843,62 @@ L070cbc_abort: ret .align 4 __aesni_set_encrypt_key: + pushl %ebp + pushl %ebx testl %eax,%eax - jz L086bad_pointer + jz L111bad_pointer testl %edx,%edx - jz L086bad_pointer + jz L111bad_pointer + call L112pic +L112pic: + popl %ebx + leal Lkey_const-L112pic(%ebx),%ebx + movl L__gnutls_x86_cpuid_s$non_lazy_ptr-Lkey_const(%ebx),%ebp movups (%eax),%xmm0 xorps %xmm4,%xmm4 + movl 4(%ebp),%ebp leal 16(%edx),%edx + andl $268437504,%ebp cmpl $256,%ecx - je L08714rounds + je L11314rounds cmpl $192,%ecx - je L08812rounds + je L11412rounds cmpl $128,%ecx - jne L089bad_keybits + jne L115bad_keybits .align 4,0x90 -L09010rounds: +L11610rounds: + cmpl $268435456,%ebp + je L11710rounds_alt movl $9,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,200,1 - call L091key_128_cold + call L118key_128_cold .byte 102,15,58,223,200,2 - call L092key_128 + call L119key_128 .byte 102,15,58,223,200,4 - call L092key_128 + call L119key_128 .byte 102,15,58,223,200,8 - call L092key_128 + call L119key_128 .byte 102,15,58,223,200,16 - call L092key_128 + call L119key_128 .byte 102,15,58,223,200,32 - call L092key_128 + call L119key_128 .byte 102,15,58,223,200,64 - call L092key_128 + call L119key_128 .byte 102,15,58,223,200,128 - call L092key_128 + call L119key_128 .byte 102,15,58,223,200,27 - call L092key_128 + call L119key_128 .byte 102,15,58,223,200,54 - call L092key_128 + call L119key_128 movups %xmm0,(%edx) movl %ecx,80(%edx) - xorl %eax,%eax - ret + jmp L120good_key .align 4,0x90 -L092key_128: +L119key_128: movups %xmm0,(%edx) leal 16(%edx),%edx -L091key_128_cold: +L118key_128_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -1974,38 +2907,91 @@ L091key_128_cold: xorps %xmm1,%xmm0 ret .align 4,0x90 -L08812rounds: +L11710rounds_alt: + movdqa (%ebx),%xmm5 + movl $8,%ecx + movdqa 32(%ebx),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,-16(%edx) +L121loop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leal 16(%edx),%edx + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%edx) + movdqa %xmm0,%xmm2 + decl %ecx + jnz L121loop_key128 + movdqa 48(%ebx),%xmm4 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 
+ pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%edx) + movl $9,%ecx + movl %ecx,96(%edx) + jmp L120good_key +.align 4,0x90 +L11412rounds: movq 16(%eax),%xmm2 + cmpl $268435456,%ebp + je L12212rounds_alt movl $11,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,202,1 - call L093key_192a_cold + call L123key_192a_cold .byte 102,15,58,223,202,2 - call L094key_192b + call L124key_192b .byte 102,15,58,223,202,4 - call L095key_192a + call L125key_192a .byte 102,15,58,223,202,8 - call L094key_192b + call L124key_192b .byte 102,15,58,223,202,16 - call L095key_192a + call L125key_192a .byte 102,15,58,223,202,32 - call L094key_192b + call L124key_192b .byte 102,15,58,223,202,64 - call L095key_192a + call L125key_192a .byte 102,15,58,223,202,128 - call L094key_192b + call L124key_192b movups %xmm0,(%edx) movl %ecx,48(%edx) - xorl %eax,%eax - ret + jmp L120good_key .align 4,0x90 -L095key_192a: +L125key_192a: movups %xmm0,(%edx) leal 16(%edx),%edx .align 4,0x90 -L093key_192a_cold: +L123key_192a_cold: movaps %xmm2,%xmm5 -L096key_192b_warm: +L126key_192b_warm: shufps $16,%xmm0,%xmm4 movdqa %xmm2,%xmm3 xorps %xmm4,%xmm0 @@ -2019,56 +3005,90 @@ L096key_192b_warm: pxor %xmm3,%xmm2 ret .align 4,0x90 -L094key_192b: +L124key_192b: movaps %xmm0,%xmm3 shufps $68,%xmm0,%xmm5 movups %xmm5,(%edx) shufps $78,%xmm2,%xmm3 movups %xmm3,16(%edx) leal 32(%edx),%edx - jmp L096key_192b_warm + jmp L126key_192b_warm .align 4,0x90 -L08714rounds: +L12212rounds_alt: + movdqa 16(%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $8,%ecx + movdqu %xmm0,-16(%edx) +L127loop_key192: + movq %xmm2,(%edx) + movdqa %xmm2,%xmm1 +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + pslld $1,%xmm4 + leal 24(%edx),%edx + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm2 + movdqu %xmm0,-16(%edx) + decl %ecx + jnz L127loop_key192 + movl $11,%ecx + movl %ecx,32(%edx) + jmp L120good_key +.align 4,0x90 +L11314rounds: movups 16(%eax),%xmm2 - movl $13,%ecx leal 16(%edx),%edx + cmpl $268435456,%ebp + je L12814rounds_alt + movl $13,%ecx movups %xmm0,-32(%edx) movups %xmm2,-16(%edx) .byte 102,15,58,223,202,1 - call L097key_256a_cold + call L129key_256a_cold .byte 102,15,58,223,200,1 - call L098key_256b + call L130key_256b .byte 102,15,58,223,202,2 - call L099key_256a + call L131key_256a .byte 102,15,58,223,200,2 - call L098key_256b + call L130key_256b .byte 102,15,58,223,202,4 - call L099key_256a + call L131key_256a .byte 102,15,58,223,200,4 - call L098key_256b + call L130key_256b .byte 102,15,58,223,202,8 - call L099key_256a + call L131key_256a .byte 102,15,58,223,200,8 - call L098key_256b + call L130key_256b .byte 102,15,58,223,202,16 - call L099key_256a + call L131key_256a .byte 102,15,58,223,200,16 - call L098key_256b + call L130key_256b .byte 102,15,58,223,202,32 - call L099key_256a + call L131key_256a .byte 102,15,58,223,200,32 - call L098key_256b + call L130key_256b .byte 102,15,58,223,202,64 - call L099key_256a + call L131key_256a movups %xmm0,(%edx) movl %ecx,16(%edx) xorl %eax,%eax - ret + jmp L120good_key .align 4,0x90 -L099key_256a: +L131key_256a: movups %xmm2,(%edx) leal 16(%edx),%edx -L097key_256a_cold: +L129key_256a_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2077,7 +3097,7 @@ L097key_256a_cold: xorps %xmm1,%xmm0 ret .align 4,0x90 -L098key_256b: +L130key_256b: movups %xmm0,(%edx) leal 
16(%edx),%edx shufps $16,%xmm2,%xmm4 @@ -2087,13 +3107,70 @@ L098key_256b: shufps $170,%xmm1,%xmm1 xorps %xmm1,%xmm2 ret +.align 4,0x90 +L12814rounds_alt: + movdqa (%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $7,%ecx + movdqu %xmm0,-32(%edx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,-16(%edx) +L132loop_key256: +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + decl %ecx + jz L133done_key256 + pshufd $255,%xmm0,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,221,211 + movdqa %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%edx) + leal 32(%edx),%edx + movdqa %xmm2,%xmm1 + jmp L132loop_key256 +L133done_key256: + movl $13,%ecx + movl %ecx,16(%edx) +L120good_key: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + xorl %eax,%eax + popl %ebx + popl %ebp + ret .align 2,0x90 -L086bad_pointer: +L111bad_pointer: movl $-1,%eax + popl %ebx + popl %ebp ret .align 2,0x90 -L089bad_keybits: +L115bad_keybits: + pxor %xmm0,%xmm0 movl $-2,%eax + popl %ebx + popl %ebp ret .globl _aesni_set_encrypt_key .align 4 @@ -2115,7 +3192,7 @@ L_aesni_set_decrypt_key_begin: movl 12(%esp),%edx shll $4,%ecx testl %eax,%eax - jnz L100dec_key_ret + jnz L134dec_key_ret leal 16(%edx,%ecx,1),%eax movups (%edx),%xmm0 movups (%eax),%xmm1 @@ -2123,7 +3200,7 @@ L_aesni_set_decrypt_key_begin: movups %xmm1,(%edx) leal 16(%edx),%edx leal -16(%eax),%eax -L101dec_key_inverse: +L135dec_key_inverse: movups (%edx),%xmm0 movups (%eax),%xmm1 .byte 102,15,56,219,192 @@ -2133,15 +3210,28 @@ L101dec_key_inverse: movups %xmm0,16(%eax) movups %xmm1,-16(%edx) cmpl %edx,%eax - ja L101dec_key_inverse + ja L135dec_key_inverse movups (%edx),%xmm0 .byte 102,15,56,219,192 movups %xmm0,(%edx) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 xorl %eax,%eax -L100dec_key_ret: +L134dec_key_ret: ret +.align 6,0x90 +Lkey_const: +.long 202313229,202313229,202313229,202313229 +.long 67569157,67569157,67569157,67569157 +.long 1,1,1,1 +.long 27,27,27,27 .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 .byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 .byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 .byte 115,108,46,111,114,103,62,0 +.section __IMPORT,__pointers,non_lazy_symbol_pointers +L__gnutls_x86_cpuid_s$non_lazy_ptr: +.indirect_symbol __gnutls_x86_cpuid_s +.long 0 +.comm __gnutls_x86_cpuid_s,16,2 diff --git a/lib/accelerated/x86/macosx/aesni-x86_64.s b/lib/accelerated/x86/macosx/aesni-x86_64.s index f0a5606348..f6145f166b 100644 --- a/lib/accelerated/x86/macosx/aesni-x86_64.s +++ b/lib/accelerated/x86/macosx/aesni-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -43,6 +43,7 @@ .p2align 4 _aesni_encrypt: + movups (%rdi),%xmm2 movl 240(%rdx),%eax movups (%rdx),%xmm0 @@ -63,10 +64,12 @@ L$oop_enc1_1: .byte 0xf3,0xc3 + .globl _aesni_decrypt .p2align 4 _aesni_decrypt: + movups (%rdi),%xmm2 movl 240(%rdx),%eax movups (%rdx),%xmm0 @@ -87,8 +90,10 @@ L$oop_dec1_2: .byte 0xf3,0xc3 + .p2align 4 _aesni_encrypt2: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -116,8 +121,10 @@ L$enc_loop2: .byte 0xf3,0xc3 + .p2align 4 _aesni_decrypt2: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -145,8 +152,10 @@ L$dec_loop2: .byte 0xf3,0xc3 + .p2align 4 _aesni_encrypt3: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -179,8 +188,10 @@ L$enc_loop3: .byte 0xf3,0xc3 + .p2align 4 _aesni_decrypt3: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -213,8 +224,10 @@ L$dec_loop3: .byte 0xf3,0xc3 + .p2align 4 _aesni_encrypt4: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -253,8 +266,10 @@ L$enc_loop4: .byte 0xf3,0xc3 + .p2align 4 _aesni_decrypt4: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -293,8 +308,10 @@ L$dec_loop4: .byte 0xf3,0xc3 + .p2align 4 _aesni_encrypt6: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -347,8 +364,10 @@ L$enc_loop6_enter: .byte 0xf3,0xc3 + .p2align 4 _aesni_decrypt6: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -401,8 +420,10 @@ L$dec_loop6_enter: .byte 0xf3,0xc3 + .p2align 4 _aesni_encrypt8: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -465,8 +486,10 @@ L$enc_loop8_enter: .byte 0xf3,0xc3 + .p2align 4 _aesni_decrypt8: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -528,10 +551,12 @@ L$dec_loop8_enter: .byte 102,68,15,56,223,200 .byte 0xf3,0xc3 + .globl _aesni_ecb_encrypt .p2align 4 _aesni_ecb_encrypt: + andq $-16,%rdx jz L$ecb_ret @@ -870,6 +895,7 @@ L$ecb_ret: pxor %xmm1,%xmm1 .byte 0xf3,0xc3 + .globl _aesni_ccm64_encrypt_blocks .p2align 4 @@ -1034,6 +1060,7 @@ L$oop_enc1_6: .p2align 4 _aesni_ctr32_encrypt_blocks: + cmpq $1,%rdx jne L$ctr32_bulk @@ -1063,11 +1090,12 @@ L$oop_enc1_7: .p2align 4 L$ctr32_bulk: - leaq (%rsp),%rax + leaq (%rsp),%r11 + pushq %rbp + subq $128,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp @@ -1076,7 +1104,7 @@ L$ctr32_bulk: movdqu (%rcx),%xmm0 movl 12(%r8),%r8d pxor %xmm0,%xmm2 - movl 12(%rcx),%r11d + movl 12(%rcx),%ebp movdqa %xmm2,0(%rsp) bswapl %r8d movdqa %xmm2,%xmm3 @@ -1092,8 +1120,8 @@ L$ctr32_bulk: leaq 2(%r8),%rdx bswapl %eax bswapl %edx - xorl %r11d,%eax - xorl %r11d,%edx + xorl %ebp,%eax + xorl %ebp,%edx .byte 102,15,58,34,216,3 leaq 3(%r8),%rax movdqa %xmm3,16(%rsp) @@ -1102,25 +1130,25 @@ L$ctr32_bulk: movq %r10,%rdx leaq 4(%r8),%r10 movdqa %xmm4,32(%rsp) - xorl %r11d,%eax + xorl %ebp,%eax bswapl %r10d .byte 102,15,58,34,232,3 - xorl %r11d,%r10d + xorl %ebp,%r10d movdqa %xmm5,48(%rsp) leaq 5(%r8),%r9 movl %r10d,64+12(%rsp) bswapl %r9d leaq 6(%r8),%r10 movl 240(%rcx),%eax - xorl %r11d,%r9d + xorl %ebp,%r9d bswapl %r10d movl %r9d,80+12(%rsp) - xorl %r11d,%r10d + xorl %ebp,%r10d leaq 7(%r8),%r9 movl %r10d,96+12(%rsp) bswapl %r9d movl __gnutls_x86_cpuid_s+4(%rip),%r10d - xorl %r11d,%r9d + xorl %ebp,%r9d andl $71303168,%r10d movl %r9d,112+12(%rsp) @@ -1144,7 +1172,7 @@ L$ctr32_bulk: L$ctr32_6x: shll $4,%eax movl $48,%r10d - bswapl %r11d + bswapl %ebp leaq 32(%rcx,%rax,1),%rcx subq %rax,%r10 jmp L$ctr32_loop6 @@ -1155,32 +1183,32 @@ L$ctr32_loop6: movups -48(%rcx,%r10,1),%xmm0 .byte 102,15,56,220,209 movl %r8d,%eax - xorl 
%r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,217 .byte 0x0f,0x38,0xf1,0x44,0x24,12 leal 1(%r8),%eax .byte 102,15,56,220,225 - xorl %r11d,%eax + xorl %ebp,%eax .byte 0x0f,0x38,0xf1,0x44,0x24,28 .byte 102,15,56,220,233 leal 2(%r8),%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,241 .byte 0x0f,0x38,0xf1,0x44,0x24,44 leal 3(%r8),%eax .byte 102,15,56,220,249 movups -32(%rcx,%r10,1),%xmm1 - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,208 .byte 0x0f,0x38,0xf1,0x44,0x24,60 leal 4(%r8),%eax .byte 102,15,56,220,216 - xorl %r11d,%eax + xorl %ebp,%eax .byte 0x0f,0x38,0xf1,0x44,0x24,76 .byte 102,15,56,220,224 leal 5(%r8),%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,232 .byte 0x0f,0x38,0xf1,0x44,0x24,92 movq %r10,%rax @@ -1241,7 +1269,7 @@ L$ctr32_loop8: bswapl %r9d movups 32-128(%rcx),%xmm0 .byte 102,15,56,220,225 - xorl %r11d,%r9d + xorl %ebp,%r9d nop .byte 102,15,56,220,233 movl %r9d,0+12(%rsp) @@ -1254,7 +1282,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1268,7 +1296,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1282,7 +1310,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1296,7 +1324,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1310,7 +1338,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1324,7 +1352,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1339,7 +1367,7 @@ L$ctr32_loop8: .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 - xorl %r11d,%r9d + xorl %ebp,%r9d movdqu 0(%rdi),%xmm10 .byte 102,15,56,220,232 movl %r9d,112+12(%rsp) @@ -1574,7 +1602,7 @@ L$ctr32_loop3: L$ctr32_done: xorps %xmm0,%xmm0 - xorl %r11d,%r11d + xorl %ebp,%ebp pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 @@ -1598,20 +1626,25 @@ L$ctr32_done: pxor %xmm14,%xmm14 movaps %xmm0,112(%rsp) pxor %xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + + leaq (%r11),%rsp + L$ctr32_epilogue: .byte 0xf3,0xc3 + .globl _aesni_xts_encrypt .p2align 4 _aesni_xts_encrypt: - leaq (%rsp),%rax + + leaq (%rsp),%r11 + pushq %rbp + subq $112,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -1627,7 +1660,7 @@ L$oop_enc1_8: jnz L$oop_enc1_8 .byte 102,15,56,221,209 movups (%rcx),%xmm0 - movq %rcx,%r11 + movq %rcx,%rbp movl %r10d,%eax shll $4,%r10d movq %rdx,%r9 @@ -1683,9 +1716,9 @@ L$oop_enc1_8: jc L$xts_enc_short movl $16+96,%eax - leaq 32(%r11,%r10,1),%rcx + leaq 32(%rbp,%r10,1),%rcx subq %r10,%rax - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 movq %rax,%r10 leaq L$xts_magic(%rip),%r8 jmp L$xts_enc_grandloop @@ -1710,7 +1743,7 @@ L$xts_enc_grandloop: movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 .byte 102,15,56,220,233 - movups 32(%r11),%xmm0 + movups 32(%rbp),%xmm0 leaq 96(%rdi),%rdi pxor %xmm8,%xmm7 @@ -1719,7 +1752,7 
@@ L$xts_enc_grandloop: pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) .byte 102,15,56,220,249 - movups 48(%r11),%xmm1 + movups 48(%rbp),%xmm1 pxor %xmm9,%xmm12 .byte 102,15,56,220,208 @@ -1734,7 +1767,7 @@ L$xts_enc_grandloop: movdqa %xmm14,64(%rsp) .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups 64(%r11),%xmm0 + movups 64(%rbp),%xmm0 movdqa %xmm8,80(%rsp) pshufd $0x5f,%xmm15,%xmm9 jmp L$xts_enc_loop6 @@ -1766,7 +1799,7 @@ L$xts_enc_loop6: psrad $31,%xmm14 .byte 102,15,56,220,217 pand %xmm8,%xmm14 - movups (%r11),%xmm10 + movups (%rbp),%xmm10 .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 @@ -1834,10 +1867,10 @@ L$xts_enc_loop6: .byte 102,15,56,220,225 .byte 102,15,56,220,233 pxor %xmm0,%xmm15 - movups (%r11),%xmm0 + movups (%rbp),%xmm0 .byte 102,15,56,220,241 .byte 102,15,56,220,249 - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 pxor %xmm15,%xmm14 .byte 102,15,56,221,84,36,0 @@ -1864,7 +1897,7 @@ L$xts_enc_loop6: movl $16+96,%eax subl %r10d,%eax - movq %r11,%rcx + movq %rbp,%rcx shrl $4,%eax L$xts_enc_short: @@ -2020,7 +2053,7 @@ L$xts_enc_steal: jnz L$xts_enc_steal subq %r9,%rsi - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups -16(%rsi),%xmm2 @@ -2063,20 +2096,25 @@ L$xts_enc_ret: movaps %xmm0,96(%rsp) pxor %xmm14,%xmm14 pxor %xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + + leaq (%r11),%rsp + L$xts_enc_epilogue: .byte 0xf3,0xc3 + .globl _aesni_xts_decrypt .p2align 4 _aesni_xts_decrypt: - leaq (%rsp),%rax + + leaq (%rsp),%r11 + pushq %rbp + subq $112,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -2098,7 +2136,7 @@ L$oop_enc1_11: subq %rax,%rdx movups (%rcx),%xmm0 - movq %rcx,%r11 + movq %rcx,%rbp movl %r10d,%eax shll $4,%r10d movq %rdx,%r9 @@ -2154,9 +2192,9 @@ L$oop_enc1_11: jc L$xts_dec_short movl $16+96,%eax - leaq 32(%r11,%r10,1),%rcx + leaq 32(%rbp,%r10,1),%rcx subq %r10,%rax - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 movq %rax,%r10 leaq L$xts_magic(%rip),%r8 jmp L$xts_dec_grandloop @@ -2181,7 +2219,7 @@ L$xts_dec_grandloop: movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 .byte 102,15,56,222,233 - movups 32(%r11),%xmm0 + movups 32(%rbp),%xmm0 leaq 96(%rdi),%rdi pxor %xmm8,%xmm7 @@ -2190,7 +2228,7 @@ L$xts_dec_grandloop: pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) .byte 102,15,56,222,249 - movups 48(%r11),%xmm1 + movups 48(%rbp),%xmm1 pxor %xmm9,%xmm12 .byte 102,15,56,222,208 @@ -2205,7 +2243,7 @@ L$xts_dec_grandloop: movdqa %xmm14,64(%rsp) .byte 102,15,56,222,240 .byte 102,15,56,222,248 - movups 64(%r11),%xmm0 + movups 64(%rbp),%xmm0 movdqa %xmm8,80(%rsp) pshufd $0x5f,%xmm15,%xmm9 jmp L$xts_dec_loop6 @@ -2237,7 +2275,7 @@ L$xts_dec_loop6: psrad $31,%xmm14 .byte 102,15,56,222,217 pand %xmm8,%xmm14 - movups (%r11),%xmm10 + movups (%rbp),%xmm10 .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 @@ -2305,10 +2343,10 @@ L$xts_dec_loop6: .byte 102,15,56,222,225 .byte 102,15,56,222,233 pxor %xmm0,%xmm15 - movups (%r11),%xmm0 + movups (%rbp),%xmm0 .byte 102,15,56,222,241 .byte 102,15,56,222,249 - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 pxor %xmm15,%xmm14 .byte 102,15,56,223,84,36,0 @@ -2335,7 +2373,7 @@ L$xts_dec_loop6: movl $16+96,%eax subl %r10d,%eax - movq %r11,%rcx + movq %rbp,%rcx shrl $4,%eax L$xts_dec_short: @@ -2492,7 +2530,7 @@ L$xts_dec_done: jz L$xts_dec_ret L$xts_dec_done2: movq %r9,%rdx - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups (%rdi),%xmm2 @@ -2522,7 +2560,7 @@ L$xts_dec_steal: jnz L$xts_dec_steal subq %r9,%rsi - movq %r11,%rcx + 
movq %rbp,%rcx movl %r10d,%eax movups (%rsi),%xmm2 @@ -2565,172 +2603,1020 @@ L$xts_dec_ret: movaps %xmm0,96(%rsp) pxor %xmm14,%xmm14 pxor %xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + + leaq (%r11),%rsp + L$xts_dec_epilogue: .byte 0xf3,0xc3 -.globl _aesni_cbc_encrypt -.p2align 4 -_aesni_cbc_encrypt: - testq %rdx,%rdx - jz L$cbc_ret +.globl _aesni_ocb_encrypt + +.p2align 5 +_aesni_ocb_encrypt: + + leaq (%rsp),%rax + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + movq 8(%rax),%rbx + movq 8+8(%rax),%rbp movl 240(%rcx),%r10d movq %rcx,%r11 - testl %r9d,%r9d - jz L$cbc_decrypt + shll $4,%r10d + movups (%rcx),%xmm9 + movups 16(%rcx,%r10,1),%xmm1 - movups (%r8),%xmm2 - movl %r10d,%eax - cmpq $16,%rdx - jb L$cbc_enc_tail - subq $16,%rdx - jmp L$cbc_enc_loop -.p2align 4 -L$cbc_enc_loop: - movups (%rdi),%xmm3 - leaq 16(%rdi),%rdi + movdqu (%r9),%xmm15 + pxor %xmm1,%xmm9 + pxor %xmm1,%xmm15 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm3 - leaq 32(%rcx),%rcx - xorps %xmm3,%xmm2 -L$oop_enc1_15: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_enc1_15 -.byte 102,15,56,221,209 - movl %r10d,%eax - movq %r11,%rcx - movups %xmm2,0(%rsi) - leaq 16(%rsi),%rsi - subq $16,%rdx - jnc L$cbc_enc_loop - addq $16,%rdx - jnz L$cbc_enc_tail - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movups %xmm2,(%r8) - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - jmp L$cbc_ret + movl $16+32,%eax + leaq 32(%r11,%r10,1),%rcx + movups 16(%r11),%xmm1 + subq %r10,%rax + movq %rax,%r10 -L$cbc_enc_tail: - movq %rdx,%rcx - xchgq %rdi,%rsi -.long 0x9066A4F3 - movl $16,%ecx - subq %rdx,%rcx - xorl %eax,%eax -.long 0x9066AAF3 - leaq -16(%rdi),%rdi - movl %r10d,%eax - movq %rdi,%rsi - movq %r11,%rcx - xorq %rdx,%rdx - jmp L$cbc_enc_loop + movdqu (%rbx),%xmm10 + movdqu (%rbp),%xmm8 -.p2align 4 -L$cbc_decrypt: - cmpq $16,%rdx - jne L$cbc_decrypt_bulk + testq $1,%r8 + jnz L$ocb_enc_odd + bsfq %r8,%r12 + addq $1,%r8 + shlq $4,%r12 + movdqu (%rbx,%r12,1),%xmm7 + movdqu (%rdi),%xmm2 + leaq 16(%rdi),%rdi + call __ocb_encrypt1 - movdqu (%rdi),%xmm2 - movdqu (%r8),%xmm3 - movdqa %xmm2,%xmm4 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_dec1_16: -.byte 102,15,56,222,209 - decl %r10d - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_dec1_16 -.byte 102,15,56,223,209 - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movdqu %xmm4,(%r8) - xorps %xmm3,%xmm2 - pxor %xmm3,%xmm3 + movdqa %xmm7,%xmm15 movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - jmp L$cbc_ret -.p2align 4 -L$cbc_decrypt_bulk: - leaq (%rsp),%rax - pushq %rbp - subq $16,%rsp - andq $-16,%rsp - leaq -8(%rax),%rbp - movups (%r8),%xmm10 - movl %r10d,%eax - cmpq $0x50,%rdx - jbe L$cbc_dec_tail + leaq 16(%rsi),%rsi + subq $1,%rdx + jz L$ocb_enc_done + +L$ocb_enc_odd: + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + leaq 6(%r8),%r8 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + shlq $4,%r12 + shlq $4,%r13 + shlq $4,%r14 - movups (%rcx),%xmm0 + subq $6,%rdx + jc L$ocb_enc_short + jmp L$ocb_enc_grandloop + +.p2align 5 +L$ocb_enc_grandloop: movdqu 0(%rdi),%xmm2 movdqu 16(%rdi),%xmm3 - movdqa %xmm2,%xmm11 movdqu 32(%rdi),%xmm4 - movdqa %xmm3,%xmm12 movdqu 48(%rdi),%xmm5 - movdqa %xmm4,%xmm13 movdqu 64(%rdi),%xmm6 - movdqa %xmm5,%xmm14 movdqu 80(%rdi),%xmm7 - movdqa %xmm6,%xmm15 - movl __gnutls_x86_cpuid_s+4(%rip),%r9d - cmpq $0x70,%rdx - jbe L$cbc_dec_six_or_seven + leaq 96(%rdi),%rdi + + call __ocb_encrypt6 + + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) 
+ movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + movups %xmm7,80(%rsi) + leaq 96(%rsi),%rsi + subq $6,%rdx + jnc L$ocb_enc_grandloop + +L$ocb_enc_short: + addq $6,%rdx + jz L$ocb_enc_done + + movdqu 0(%rdi),%xmm2 + cmpq $2,%rdx + jb L$ocb_enc_one + movdqu 16(%rdi),%xmm3 + je L$ocb_enc_two + + movdqu 32(%rdi),%xmm4 + cmpq $4,%rdx + jb L$ocb_enc_three + movdqu 48(%rdi),%xmm5 + je L$ocb_enc_four + + movdqu 64(%rdi),%xmm6 + pxor %xmm7,%xmm7 + + call __ocb_encrypt6 + + movdqa %xmm14,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + + jmp L$ocb_enc_done - andl $71303168,%r9d - subq $0x50,%rdx - cmpl $4194304,%r9d - je L$cbc_dec_loop6_enter - subq $0x20,%rdx - leaq 112(%rcx),%rcx - jmp L$cbc_dec_loop8_enter .p2align 4 -L$cbc_dec_loop8: - movups %xmm9,(%rsi) - leaq 16(%rsi),%rsi -L$cbc_dec_loop8_enter: - movdqu 96(%rdi),%xmm8 - pxor %xmm0,%xmm2 - movdqu 112(%rdi),%xmm9 - pxor %xmm0,%xmm3 - movups 16-112(%rcx),%xmm1 - pxor %xmm0,%xmm4 - xorq %r11,%r11 - cmpq $0x70,%rdx - pxor %xmm0,%xmm5 - pxor %xmm0,%xmm6 - pxor %xmm0,%xmm7 - pxor %xmm0,%xmm8 +L$ocb_enc_one: + movdqa %xmm10,%xmm7 -.byte 102,15,56,222,209 - pxor %xmm0,%xmm9 - movups 32-112(%rcx),%xmm0 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 - setnc %r11b - shlq $7,%r11 -.byte 102,68,15,56,222,201 - addq %rdi,%r11 - movups 48-112(%rcx),%xmm1 + call __ocb_encrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,0(%rsi) + jmp L$ocb_enc_done + +.p2align 4 +L$ocb_enc_two: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + call __ocb_encrypt4 + + movdqa %xmm11,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + + jmp L$ocb_enc_done + +.p2align 4 +L$ocb_enc_three: + pxor %xmm5,%xmm5 + + call __ocb_encrypt4 + + movdqa %xmm12,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + + jmp L$ocb_enc_done + +.p2align 4 +L$ocb_enc_four: + call __ocb_encrypt4 + + movdqa %xmm13,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + +L$ocb_enc_done: + pxor %xmm0,%xmm15 + movdqu %xmm8,(%rbp) + movdqu %xmm15,(%r9) + + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + pxor %xmm10,%xmm10 + pxor %xmm11,%xmm11 + pxor %xmm12,%xmm12 + pxor %xmm13,%xmm13 + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 + leaq 40(%rsp),%rax + + movq -40(%rax),%r14 + + movq -32(%rax),%r13 + + movq -24(%rax),%r12 + + movq -16(%rax),%rbp + + movq -8(%rax),%rbx + + leaq (%rax),%rsp + +L$ocb_enc_epilogue: + .byte 0xf3,0xc3 + + + + +.p2align 5 +__ocb_encrypt6: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + movdqa %xmm10,%xmm14 + pxor %xmm15,%xmm10 + movdqu (%rbx,%r14,1),%xmm15 + pxor %xmm10,%xmm11 + pxor %xmm2,%xmm8 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm3,%xmm8 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm4,%xmm8 + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm14 + pxor %xmm5,%xmm8 + pxor %xmm13,%xmm5 + pxor %xmm14,%xmm15 + pxor %xmm6,%xmm8 + pxor %xmm14,%xmm6 + pxor %xmm7,%xmm8 + pxor %xmm15,%xmm7 + movups 32(%r11),%xmm0 + + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + addq $6,%r8 + pxor %xmm9,%xmm10 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 
102,15,56,220,225 +.byte 102,15,56,220,233 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 +.byte 102,15,56,220,241 + pxor %xmm9,%xmm13 + pxor %xmm9,%xmm14 +.byte 102,15,56,220,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm15 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups 64(%r11),%xmm0 + shlq $4,%r12 + shlq $4,%r13 + jmp L$ocb_enc_loop6 + +.p2align 5 +L$ocb_enc_loop6: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$ocb_enc_loop6 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups 16(%r11),%xmm1 + shlq $4,%r14 + +.byte 102,65,15,56,221,210 + movdqu (%rbx),%xmm10 + movq %r10,%rax +.byte 102,65,15,56,221,219 +.byte 102,65,15,56,221,228 +.byte 102,65,15,56,221,237 +.byte 102,65,15,56,221,246 +.byte 102,65,15,56,221,255 + .byte 0xf3,0xc3 + + + +.p2align 5 +__ocb_encrypt4: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + pxor %xmm15,%xmm10 + pxor %xmm10,%xmm11 + pxor %xmm2,%xmm8 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm3,%xmm8 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm4,%xmm8 + pxor %xmm12,%xmm4 + pxor %xmm5,%xmm8 + pxor %xmm13,%xmm5 + movups 32(%r11),%xmm0 + + pxor %xmm9,%xmm10 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 + pxor %xmm9,%xmm13 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 48(%r11),%xmm1 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups 64(%r11),%xmm0 + jmp L$ocb_enc_loop4 + +.p2align 5 +L$ocb_enc_loop4: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$ocb_enc_loop4 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,65,15,56,221,210 +.byte 102,65,15,56,221,219 +.byte 102,65,15,56,221,228 +.byte 102,65,15,56,221,237 + .byte 0xf3,0xc3 + + + +.p2align 5 +__ocb_encrypt1: + pxor %xmm15,%xmm7 + pxor %xmm9,%xmm7 + pxor %xmm2,%xmm8 + pxor %xmm7,%xmm2 + movups 32(%r11),%xmm0 + +.byte 102,15,56,220,209 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm7 + +.byte 102,15,56,220,208 + movups 64(%r11),%xmm0 + jmp L$ocb_enc_loop1 + +.p2align 5 +L$ocb_enc_loop1: +.byte 102,15,56,220,209 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,220,208 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$ocb_enc_loop1 + +.byte 102,15,56,220,209 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,15,56,221,215 + .byte 0xf3,0xc3 + + +.globl _aesni_ocb_decrypt + +.p2align 5 +_aesni_ocb_decrypt: + + leaq (%rsp),%rax + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + movq 8(%rax),%rbx + movq 8+8(%rax),%rbp + + movl 240(%rcx),%r10d + movq %rcx,%r11 + shll $4,%r10d + movups (%rcx),%xmm9 + movups 16(%rcx,%r10,1),%xmm1 + 
+ movdqu (%r9),%xmm15 + pxor %xmm1,%xmm9 + pxor %xmm1,%xmm15 + + movl $16+32,%eax + leaq 32(%r11,%r10,1),%rcx + movups 16(%r11),%xmm1 + subq %r10,%rax + movq %rax,%r10 + + movdqu (%rbx),%xmm10 + movdqu (%rbp),%xmm8 + + testq $1,%r8 + jnz L$ocb_dec_odd + + bsfq %r8,%r12 + addq $1,%r8 + shlq $4,%r12 + movdqu (%rbx,%r12,1),%xmm7 + movdqu (%rdi),%xmm2 + leaq 16(%rdi),%rdi + + call __ocb_decrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,(%rsi) + xorps %xmm2,%xmm8 + leaq 16(%rsi),%rsi + subq $1,%rdx + jz L$ocb_dec_done + +L$ocb_dec_odd: + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + leaq 6(%r8),%r8 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + shlq $4,%r12 + shlq $4,%r13 + shlq $4,%r14 + + subq $6,%rdx + jc L$ocb_dec_short + jmp L$ocb_dec_grandloop + +.p2align 5 +L$ocb_dec_grandloop: + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqu 32(%rdi),%xmm4 + movdqu 48(%rdi),%xmm5 + movdqu 64(%rdi),%xmm6 + movdqu 80(%rdi),%xmm7 + leaq 96(%rdi),%rdi + + call __ocb_decrypt6 + + movups %xmm2,0(%rsi) + pxor %xmm2,%xmm8 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm8 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm8 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm8 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm8 + movups %xmm7,80(%rsi) + pxor %xmm7,%xmm8 + leaq 96(%rsi),%rsi + subq $6,%rdx + jnc L$ocb_dec_grandloop + +L$ocb_dec_short: + addq $6,%rdx + jz L$ocb_dec_done + + movdqu 0(%rdi),%xmm2 + cmpq $2,%rdx + jb L$ocb_dec_one + movdqu 16(%rdi),%xmm3 + je L$ocb_dec_two + + movdqu 32(%rdi),%xmm4 + cmpq $4,%rdx + jb L$ocb_dec_three + movdqu 48(%rdi),%xmm5 + je L$ocb_dec_four + + movdqu 64(%rdi),%xmm6 + pxor %xmm7,%xmm7 + + call __ocb_decrypt6 + + movdqa %xmm14,%xmm15 + movups %xmm2,0(%rsi) + pxor %xmm2,%xmm8 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm8 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm8 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm8 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm8 + + jmp L$ocb_dec_done + +.p2align 4 +L$ocb_dec_one: + movdqa %xmm10,%xmm7 + + call __ocb_decrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,0(%rsi) + xorps %xmm2,%xmm8 + jmp L$ocb_dec_done + +.p2align 4 +L$ocb_dec_two: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + call __ocb_decrypt4 + + movdqa %xmm11,%xmm15 + movups %xmm2,0(%rsi) + xorps %xmm2,%xmm8 + movups %xmm3,16(%rsi) + xorps %xmm3,%xmm8 + + jmp L$ocb_dec_done + +.p2align 4 +L$ocb_dec_three: + pxor %xmm5,%xmm5 + + call __ocb_decrypt4 + + movdqa %xmm12,%xmm15 + movups %xmm2,0(%rsi) + xorps %xmm2,%xmm8 + movups %xmm3,16(%rsi) + xorps %xmm3,%xmm8 + movups %xmm4,32(%rsi) + xorps %xmm4,%xmm8 + + jmp L$ocb_dec_done + +.p2align 4 +L$ocb_dec_four: + call __ocb_decrypt4 + + movdqa %xmm13,%xmm15 + movups %xmm2,0(%rsi) + pxor %xmm2,%xmm8 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm8 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm8 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm8 + +L$ocb_dec_done: + pxor %xmm0,%xmm15 + movdqu %xmm8,(%rbp) + movdqu %xmm15,(%r9) + + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + pxor %xmm10,%xmm10 + pxor %xmm11,%xmm11 + pxor %xmm12,%xmm12 + pxor %xmm13,%xmm13 + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 + leaq 40(%rsp),%rax + + movq -40(%rax),%r14 + + movq -32(%rax),%r13 + + movq -24(%rax),%r12 + + movq -16(%rax),%rbp + + movq -8(%rax),%rbx + + leaq (%rax),%rsp + +L$ocb_dec_epilogue: + .byte 0xf3,0xc3 + + + + +.p2align 5 +__ocb_decrypt6: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + 
movdqa %xmm10,%xmm14 + pxor %xmm15,%xmm10 + movdqu (%rbx,%r14,1),%xmm15 + pxor %xmm10,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm14 + pxor %xmm13,%xmm5 + pxor %xmm14,%xmm15 + pxor %xmm14,%xmm6 + pxor %xmm15,%xmm7 + movups 32(%r11),%xmm0 + + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + addq $6,%r8 + pxor %xmm9,%xmm10 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 +.byte 102,15,56,222,241 + pxor %xmm9,%xmm13 + pxor %xmm9,%xmm14 +.byte 102,15,56,222,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm15 + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 + movups 64(%r11),%xmm0 + shlq $4,%r12 + shlq $4,%r13 + jmp L$ocb_dec_loop6 + +.p2align 5 +L$ocb_dec_loop6: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$ocb_dec_loop6 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups 16(%r11),%xmm1 + shlq $4,%r14 + +.byte 102,65,15,56,223,210 + movdqu (%rbx),%xmm10 + movq %r10,%rax +.byte 102,65,15,56,223,219 +.byte 102,65,15,56,223,228 +.byte 102,65,15,56,223,237 +.byte 102,65,15,56,223,246 +.byte 102,65,15,56,223,255 + .byte 0xf3,0xc3 + + + +.p2align 5 +__ocb_decrypt4: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + pxor %xmm15,%xmm10 + pxor %xmm10,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm5 + movups 32(%r11),%xmm0 + + pxor %xmm9,%xmm10 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 + pxor %xmm9,%xmm13 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups 48(%r11),%xmm1 + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups 64(%r11),%xmm0 + jmp L$ocb_dec_loop4 + +.p2align 5 +L$ocb_dec_loop4: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$ocb_dec_loop4 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,65,15,56,223,210 +.byte 102,65,15,56,223,219 +.byte 102,65,15,56,223,228 +.byte 102,65,15,56,223,237 + .byte 0xf3,0xc3 + + + +.p2align 5 +__ocb_decrypt1: + pxor %xmm15,%xmm7 + pxor %xmm9,%xmm7 + pxor %xmm7,%xmm2 + movups 32(%r11),%xmm0 + +.byte 102,15,56,222,209 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm7 + +.byte 102,15,56,222,208 + movups 64(%r11),%xmm0 + jmp L$ocb_dec_loop1 + +.p2align 5 +L$ocb_dec_loop1: +.byte 102,15,56,222,209 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,222,208 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$ocb_dec_loop1 
+ +.byte 102,15,56,222,209 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,15,56,223,215 + .byte 0xf3,0xc3 + +.globl _aesni_cbc_encrypt + +.p2align 4 +_aesni_cbc_encrypt: + + testq %rdx,%rdx + jz L$cbc_ret + + movl 240(%rcx),%r10d + movq %rcx,%r11 + testl %r9d,%r9d + jz L$cbc_decrypt + + movups (%r8),%xmm2 + movl %r10d,%eax + cmpq $16,%rdx + jb L$cbc_enc_tail + subq $16,%rdx + jmp L$cbc_enc_loop +.p2align 4 +L$cbc_enc_loop: + movups (%rdi),%xmm3 + leaq 16(%rdi),%rdi + + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm3 + leaq 32(%rcx),%rcx + xorps %xmm3,%xmm2 +L$oop_enc1_15: +.byte 102,15,56,220,209 + decl %eax + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz L$oop_enc1_15 +.byte 102,15,56,221,209 + movl %r10d,%eax + movq %r11,%rcx + movups %xmm2,0(%rsi) + leaq 16(%rsi),%rsi + subq $16,%rdx + jnc L$cbc_enc_loop + addq $16,%rdx + jnz L$cbc_enc_tail + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm2,(%r8) + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + jmp L$cbc_ret + +L$cbc_enc_tail: + movq %rdx,%rcx + xchgq %rdi,%rsi +.long 0x9066A4F3 + movl $16,%ecx + subq %rdx,%rcx + xorl %eax,%eax +.long 0x9066AAF3 + leaq -16(%rdi),%rdi + movl %r10d,%eax + movq %rdi,%rsi + movq %r11,%rcx + xorq %rdx,%rdx + jmp L$cbc_enc_loop + +.p2align 4 +L$cbc_decrypt: + cmpq $16,%rdx + jne L$cbc_decrypt_bulk + + + + movdqu (%rdi),%xmm2 + movdqu (%r8),%xmm3 + movdqa %xmm2,%xmm4 + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +L$oop_dec1_16: +.byte 102,15,56,222,209 + decl %r10d + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz L$oop_dec1_16 +.byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movdqu %xmm4,(%r8) + xorps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + jmp L$cbc_ret +.p2align 4 +L$cbc_decrypt_bulk: + leaq (%rsp),%r11 + + pushq %rbp + + subq $16,%rsp + andq $-16,%rsp + movq %rcx,%rbp + movups (%r8),%xmm10 + movl %r10d,%eax + cmpq $0x50,%rdx + jbe L$cbc_dec_tail + + movups (%rcx),%xmm0 + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqa %xmm2,%xmm11 + movdqu 32(%rdi),%xmm4 + movdqa %xmm3,%xmm12 + movdqu 48(%rdi),%xmm5 + movdqa %xmm4,%xmm13 + movdqu 64(%rdi),%xmm6 + movdqa %xmm5,%xmm14 + movdqu 80(%rdi),%xmm7 + movdqa %xmm6,%xmm15 + movl __gnutls_x86_cpuid_s+4(%rip),%r9d + cmpq $0x70,%rdx + jbe L$cbc_dec_six_or_seven + + andl $71303168,%r9d + subq $0x50,%rdx + cmpl $4194304,%r9d + je L$cbc_dec_loop6_enter + subq $0x20,%rdx + leaq 112(%rcx),%rcx + jmp L$cbc_dec_loop8_enter +.p2align 4 +L$cbc_dec_loop8: + movups %xmm9,(%rsi) + leaq 16(%rsi),%rsi +L$cbc_dec_loop8_enter: + movdqu 96(%rdi),%xmm8 + pxor %xmm0,%xmm2 + movdqu 112(%rdi),%xmm9 + pxor %xmm0,%xmm3 + movups 16-112(%rcx),%xmm1 + pxor %xmm0,%xmm4 + movq $-1,%rbp + cmpq $0x70,%rdx + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 + pxor %xmm0,%xmm8 + +.byte 102,15,56,222,209 + pxor %xmm0,%xmm9 + movups 32-112(%rcx),%xmm0 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 + adcq $0,%rbp + andq $128,%rbp +.byte 102,68,15,56,222,201 + addq %rdi,%rbp + movups 48-112(%rcx),%xmm1 .byte 102,15,56,222,208 .byte 102,15,56,222,216 .byte 102,15,56,222,224 @@ -2867,18 +3753,18 @@ L$cbc_dec_done: movdqu 112(%rdi),%xmm0 .byte 102,65,15,56,223,228 leaq 128(%rdi),%rdi - movdqu 0(%r11),%xmm11 + movdqu 0(%rbp),%xmm11 .byte 102,65,15,56,223,237 .byte 102,65,15,56,223,246 - movdqu 16(%r11),%xmm12 - movdqu 32(%r11),%xmm13 + movdqu 16(%rbp),%xmm12 + movdqu 32(%rbp),%xmm13 .byte 
102,65,15,56,223,255 .byte 102,68,15,56,223,193 - movdqu 48(%r11),%xmm14 - movdqu 64(%r11),%xmm15 + movdqu 48(%rbp),%xmm14 + movdqu 64(%rbp),%xmm15 .byte 102,69,15,56,223,202 movdqa %xmm0,%xmm10 - movdqu 80(%r11),%xmm1 + movdqu 80(%rbp),%xmm1 movups -112(%rcx),%xmm0 movups %xmm2,(%rsi) @@ -2997,7 +3883,7 @@ L$cbc_dec_loop6_enter: pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) pxor %xmm14,%xmm6 - movq %r11,%rcx + movq %rbp,%rcx movdqu %xmm5,48(%rsi) pxor %xmm15,%xmm7 movl %r10d,%eax @@ -3150,16 +4036,21 @@ L$cbc_dec_tail_partial: L$cbc_dec_ret: xorps %xmm0,%xmm0 pxor %xmm1,%xmm1 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + + leaq (%r11),%rsp + L$cbc_ret: .byte 0xf3,0xc3 + .globl _aesni_set_decrypt_key .p2align 4 _aesni_set_decrypt_key: + .byte 0x48,0x83,0xEC,0x08 + call __aesni_set_encrypt_key shll $4,%esi testl %eax,%eax @@ -3192,7 +4083,9 @@ L$dec_key_inverse: pxor %xmm0,%xmm0 L$dec_key_ret: addq $8,%rsp + .byte 0xf3,0xc3 + L$SEH_end_set_decrypt_key: .globl _aesni_set_encrypt_key @@ -3200,7 +4093,9 @@ L$SEH_end_set_decrypt_key: .p2align 4 _aesni_set_encrypt_key: __aesni_set_encrypt_key: + .byte 0x48,0x83,0xEC,0x08 + movq $-1,%rax testq %rdi,%rdi jz L$enc_key_ret @@ -3493,7 +4388,9 @@ L$enc_key_ret: pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 addq $8,%rsp + .byte 0xf3,0xc3 + L$SEH_end_set_encrypt_key: .p2align 4 diff --git a/lib/accelerated/x86/macosx/cpuid-x86.s b/lib/accelerated/x86/macosx/cpuid-x86.s index a8371ab56e..bd8e443fa9 100644 --- a/lib/accelerated/x86/macosx/cpuid-x86.s +++ b/lib/accelerated/x86/macosx/cpuid-x86.s @@ -21,7 +21,6 @@ # # *** This file is auto-generated *** # -.file "devel/perlasm/cpuid-x86.s" .text .globl _gnutls_cpuid .align 4 diff --git a/lib/accelerated/x86/macosx/ghash-x86_64.s b/lib/accelerated/x86/macosx/ghash-x86_64.s index 8fe772fd35..5fd3216755 100644 --- a/lib/accelerated/x86/macosx/ghash-x86_64.s +++ b/lib/accelerated/x86/macosx/ghash-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -44,9 +44,21 @@ .p2align 4 _gcm_gmult_4bit: + pushq %rbx + pushq %rbp + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $280,%rsp + L$gmult_prologue: movzbq 15(%rdi),%r8 @@ -123,22 +135,35 @@ L$break1: movq %r8,8(%rdi) movq %r9,(%rdi) - movq 16(%rsp),%rbx - leaq 24(%rsp),%rsp + leaq 280+48(%rsp),%rsi + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + L$gmult_epilogue: .byte 0xf3,0xc3 + .globl _gcm_ghash_4bit .p2align 4 _gcm_ghash_4bit: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $280,%rsp + L$ghash_prologue: movq %rdx,%r14 movq %rcx,%r15 @@ -683,21 +708,31 @@ L$outer_loop: movq %r8,8(%rdi) movq %r9,(%rdi) - leaq 280(%rsp),%rsi - movq 0(%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + leaq 280+48(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq 0(%rsi),%rsp + L$ghash_epilogue: .byte 0xf3,0xc3 + .globl _gcm_init_clmul .p2align 4 _gcm_init_clmul: + L$_init_clmul: movdqu (%rsi),%xmm2 pshufd $78,%xmm2,%xmm2 @@ -850,10 +885,12 @@ L$_init_clmul: movdqu %xmm4,80(%rdi) .byte 0xf3,0xc3 + .globl _gcm_gmult_clmul .p2align 4 _gcm_gmult_clmul: + L$_gmult_clmul: movdqu (%rdi),%xmm0 movdqa L$bswap_mask(%rip),%xmm5 @@ -901,10 +938,12 @@ L$_gmult_clmul: movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 + .globl _gcm_ghash_clmul .p2align 5 _gcm_ghash_clmul: + L$_ghash_clmul: movdqa L$bswap_mask(%rip),%xmm10 @@ -1284,10 +1323,12 @@ L$done: movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 + .globl _gcm_init_avx .p2align 5 _gcm_init_avx: + vzeroupper vmovdqu (%rsi),%xmm2 @@ -1391,16 +1432,20 @@ L$init_start_avx: vzeroupper .byte 0xf3,0xc3 + .globl _gcm_gmult_avx .p2align 5 _gcm_gmult_avx: + jmp L$_gmult_clmul + .globl _gcm_ghash_avx .p2align 5 _gcm_ghash_avx: + vzeroupper vmovdqu (%rdi),%xmm10 @@ -1773,6 +1818,7 @@ L$tail_no_xor_avx: vzeroupper .byte 0xf3,0xc3 + .p2align 6 L$bswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 diff --git a/lib/accelerated/x86/macosx/sha1-ssse3-x86.s b/lib/accelerated/x86/macosx/sha1-ssse3-x86.s index 8e01010ce3..985d4af8db 100644 --- a/lib/accelerated/x86/macosx/sha1-ssse3-x86.s +++ b/lib/accelerated/x86/macosx/sha1-ssse3-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -37,7 +37,6 @@ # # *** This file is auto-generated *** # -.file "sha1-586.s" .text .globl _sha1_block_data_order .align 4 diff --git a/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s b/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s index 79c10de2ed..a576acc25f 100644 --- a/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s +++ b/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -44,25 +44,45 @@ .p2align 4 _sha1_block_data_order: + movl __gnutls_x86_cpuid_s+0(%rip),%r9d movl __gnutls_x86_cpuid_s+4(%rip),%r8d + movl __gnutls_x86_cpuid_s+8(%rip),%r10d testl $512,%r8d jz L$ialu + testl $536870912,%r10d + jnz _shaext_shortcut + andl $296,%r10d + cmpl $296,%r10d + je _avx2_shortcut + andl $268435456,%r8d + andl $1073741824,%r9d + orl %r9d,%r8d + cmpl $1342177280,%r8d + je _avx_shortcut jmp _ssse3_shortcut .p2align 4 L$ialu: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 - movq %rsp,%r11 + + pushq %r14 + movq %rdi,%r8 subq $72,%rsp movq %rsi,%r9 andq $-64,%rsp movq %rdx,%r10 - movq %r11,64(%rsp) + movq %rax,64(%rsp) + L$prologue: movl 0(%r8),%esi @@ -76,1230 +96,1168 @@ L$prologue: L$loop: movl 0(%r9),%edx bswapl %edx - movl %edx,0(%rsp) - movl %r11d,%eax movl 4(%r9),%ebp + movl %r12d,%eax + movl %edx,0(%rsp) movl %esi,%ecx - xorl %r12d,%eax bswapl %ebp + xorl %r11d,%eax roll $5,%ecx - leal 1518500249(%rdx,%r13,1),%r13d andl %edi,%eax - movl %ebp,4(%rsp) + leal 1518500249(%rdx,%r13,1),%r13d addl %ecx,%r13d xorl %r12d,%eax roll $30,%edi addl %eax,%r13d - movl %edi,%eax - movl 8(%r9),%edx + movl 8(%r9),%r14d + movl %r11d,%eax + movl %ebp,4(%rsp) movl %r13d,%ecx - xorl %r11d,%eax - bswapl %edx + bswapl %r14d + xorl %edi,%eax roll $5,%ecx - leal 1518500249(%rbp,%r12,1),%r12d andl %esi,%eax - movl %edx,8(%rsp) + leal 1518500249(%rbp,%r12,1),%r12d addl %ecx,%r12d xorl %r11d,%eax roll $30,%esi addl %eax,%r12d - movl %esi,%eax - movl 12(%r9),%ebp + movl 12(%r9),%edx + movl %edi,%eax + movl %r14d,8(%rsp) movl %r12d,%ecx - xorl %edi,%eax - bswapl %ebp + bswapl %edx + xorl %esi,%eax roll $5,%ecx - leal 1518500249(%rdx,%r11,1),%r11d andl %r13d,%eax - movl %ebp,12(%rsp) + leal 1518500249(%r14,%r11,1),%r11d addl %ecx,%r11d xorl %edi,%eax roll $30,%r13d addl %eax,%r11d - movl %r13d,%eax - movl 16(%r9),%edx + movl 16(%r9),%ebp + movl %esi,%eax + movl %edx,12(%rsp) movl %r11d,%ecx - xorl %esi,%eax - bswapl %edx + bswapl %ebp + xorl %r13d,%eax roll $5,%ecx - leal 1518500249(%rbp,%rdi,1),%edi andl %r12d,%eax - movl %edx,16(%rsp) + leal 1518500249(%rdx,%rdi,1),%edi addl %ecx,%edi xorl %esi,%eax roll $30,%r12d addl %eax,%edi - movl %r12d,%eax - movl 20(%r9),%ebp + movl 20(%r9),%r14d + movl %r13d,%eax + movl %ebp,16(%rsp) movl %edi,%ecx - xorl %r13d,%eax - bswapl %ebp + bswapl %r14d + xorl %r12d,%eax roll $5,%ecx - leal 1518500249(%rdx,%rsi,1),%esi andl %r11d,%eax - movl %ebp,20(%rsp) + leal 1518500249(%rbp,%rsi,1),%esi addl %ecx,%esi xorl %r13d,%eax roll $30,%r11d addl %eax,%esi - movl %r11d,%eax movl 24(%r9),%edx + movl %r12d,%eax + movl %r14d,20(%rsp) movl %esi,%ecx - xorl %r12d,%eax bswapl %edx + xorl %r11d,%eax roll $5,%ecx - leal 1518500249(%rbp,%r13,1),%r13d andl %edi,%eax - movl %edx,24(%rsp) + leal 1518500249(%r14,%r13,1),%r13d addl %ecx,%r13d xorl %r12d,%eax roll $30,%edi addl %eax,%r13d - movl %edi,%eax movl 28(%r9),%ebp + movl %r11d,%eax + movl %edx,24(%rsp) movl %r13d,%ecx - xorl %r11d,%eax bswapl %ebp + xorl %edi,%eax roll $5,%ecx - leal 1518500249(%rdx,%r12,1),%r12d andl %esi,%eax - movl %ebp,28(%rsp) + leal 1518500249(%rdx,%r12,1),%r12d addl %ecx,%r12d xorl %r11d,%eax roll $30,%esi addl %eax,%r12d - movl %esi,%eax - movl 32(%r9),%edx + movl 32(%r9),%r14d + movl %edi,%eax + movl %ebp,28(%rsp) movl %r12d,%ecx - xorl %edi,%eax - bswapl %edx + bswapl %r14d + xorl %esi,%eax roll $5,%ecx - leal 1518500249(%rbp,%r11,1),%r11d andl %r13d,%eax - movl %edx,32(%rsp) + leal 
1518500249(%rbp,%r11,1),%r11d addl %ecx,%r11d xorl %edi,%eax roll $30,%r13d addl %eax,%r11d - movl %r13d,%eax - movl 36(%r9),%ebp + movl 36(%r9),%edx + movl %esi,%eax + movl %r14d,32(%rsp) movl %r11d,%ecx - xorl %esi,%eax - bswapl %ebp + bswapl %edx + xorl %r13d,%eax roll $5,%ecx - leal 1518500249(%rdx,%rdi,1),%edi andl %r12d,%eax - movl %ebp,36(%rsp) + leal 1518500249(%r14,%rdi,1),%edi addl %ecx,%edi xorl %esi,%eax roll $30,%r12d addl %eax,%edi - movl %r12d,%eax - movl 40(%r9),%edx + movl 40(%r9),%ebp + movl %r13d,%eax + movl %edx,36(%rsp) movl %edi,%ecx - xorl %r13d,%eax - bswapl %edx + bswapl %ebp + xorl %r12d,%eax roll $5,%ecx - leal 1518500249(%rbp,%rsi,1),%esi andl %r11d,%eax - movl %edx,40(%rsp) + leal 1518500249(%rdx,%rsi,1),%esi addl %ecx,%esi xorl %r13d,%eax roll $30,%r11d addl %eax,%esi - movl %r11d,%eax - movl 44(%r9),%ebp + movl 44(%r9),%r14d + movl %r12d,%eax + movl %ebp,40(%rsp) movl %esi,%ecx - xorl %r12d,%eax - bswapl %ebp + bswapl %r14d + xorl %r11d,%eax roll $5,%ecx - leal 1518500249(%rdx,%r13,1),%r13d andl %edi,%eax - movl %ebp,44(%rsp) + leal 1518500249(%rbp,%r13,1),%r13d addl %ecx,%r13d xorl %r12d,%eax roll $30,%edi addl %eax,%r13d - movl %edi,%eax movl 48(%r9),%edx + movl %r11d,%eax + movl %r14d,44(%rsp) movl %r13d,%ecx - xorl %r11d,%eax bswapl %edx + xorl %edi,%eax roll $5,%ecx - leal 1518500249(%rbp,%r12,1),%r12d andl %esi,%eax - movl %edx,48(%rsp) + leal 1518500249(%r14,%r12,1),%r12d addl %ecx,%r12d xorl %r11d,%eax roll $30,%esi addl %eax,%r12d - movl %esi,%eax movl 52(%r9),%ebp + movl %edi,%eax + movl %edx,48(%rsp) movl %r12d,%ecx - xorl %edi,%eax bswapl %ebp + xorl %esi,%eax roll $5,%ecx - leal 1518500249(%rdx,%r11,1),%r11d andl %r13d,%eax - movl %ebp,52(%rsp) + leal 1518500249(%rdx,%r11,1),%r11d addl %ecx,%r11d xorl %edi,%eax roll $30,%r13d addl %eax,%r11d - movl %r13d,%eax - movl 56(%r9),%edx + movl 56(%r9),%r14d + movl %esi,%eax + movl %ebp,52(%rsp) movl %r11d,%ecx - xorl %esi,%eax - bswapl %edx + bswapl %r14d + xorl %r13d,%eax roll $5,%ecx - leal 1518500249(%rbp,%rdi,1),%edi andl %r12d,%eax - movl %edx,56(%rsp) + leal 1518500249(%rbp,%rdi,1),%edi addl %ecx,%edi xorl %esi,%eax roll $30,%r12d addl %eax,%edi - movl %r12d,%eax - movl 60(%r9),%ebp + movl 60(%r9),%edx + movl %r13d,%eax + movl %r14d,56(%rsp) movl %edi,%ecx - xorl %r13d,%eax - bswapl %ebp + bswapl %edx + xorl %r12d,%eax roll $5,%ecx - leal 1518500249(%rdx,%rsi,1),%esi andl %r11d,%eax - movl %ebp,60(%rsp) + leal 1518500249(%r14,%rsi,1),%esi addl %ecx,%esi xorl %r13d,%eax roll $30,%r11d addl %eax,%esi - movl 0(%rsp),%edx - movl %r11d,%eax + xorl 0(%rsp),%ebp + movl %r12d,%eax + movl %edx,60(%rsp) movl %esi,%ecx - xorl 8(%rsp),%edx - xorl %r12d,%eax + xorl 8(%rsp),%ebp + xorl %r11d,%eax roll $5,%ecx - xorl 32(%rsp),%edx + xorl 32(%rsp),%ebp andl %edi,%eax - leal 1518500249(%rbp,%r13,1),%r13d - xorl 52(%rsp),%edx + leal 1518500249(%rdx,%r13,1),%r13d + roll $30,%edi xorl %r12d,%eax - roll $1,%edx addl %ecx,%r13d - roll $30,%edi - movl %edx,0(%rsp) + roll $1,%ebp addl %eax,%r13d - movl 4(%rsp),%ebp - movl %edi,%eax + xorl 4(%rsp),%r14d + movl %r11d,%eax + movl %ebp,0(%rsp) movl %r13d,%ecx - xorl 12(%rsp),%ebp - xorl %r11d,%eax + xorl 12(%rsp),%r14d + xorl %edi,%eax roll $5,%ecx - xorl 36(%rsp),%ebp + xorl 36(%rsp),%r14d andl %esi,%eax - leal 1518500249(%rdx,%r12,1),%r12d - xorl 56(%rsp),%ebp + leal 1518500249(%rbp,%r12,1),%r12d + roll $30,%esi xorl %r11d,%eax - roll $1,%ebp addl %ecx,%r12d - roll $30,%esi - movl %ebp,4(%rsp) + roll $1,%r14d addl %eax,%r12d - movl 8(%rsp),%edx - movl %esi,%eax + xorl 
8(%rsp),%edx + movl %edi,%eax + movl %r14d,4(%rsp) movl %r12d,%ecx xorl 16(%rsp),%edx - xorl %edi,%eax + xorl %esi,%eax roll $5,%ecx xorl 40(%rsp),%edx andl %r13d,%eax - leal 1518500249(%rbp,%r11,1),%r11d - xorl 60(%rsp),%edx + leal 1518500249(%r14,%r11,1),%r11d + roll $30,%r13d xorl %edi,%eax - roll $1,%edx addl %ecx,%r11d - roll $30,%r13d - movl %edx,8(%rsp) + roll $1,%edx addl %eax,%r11d - movl 12(%rsp),%ebp - movl %r13d,%eax + xorl 12(%rsp),%ebp + movl %esi,%eax + movl %edx,8(%rsp) movl %r11d,%ecx xorl 20(%rsp),%ebp - xorl %esi,%eax + xorl %r13d,%eax roll $5,%ecx xorl 44(%rsp),%ebp andl %r12d,%eax leal 1518500249(%rdx,%rdi,1),%edi - xorl 0(%rsp),%ebp + roll $30,%r12d xorl %esi,%eax - roll $1,%ebp addl %ecx,%edi - roll $30,%r12d - movl %ebp,12(%rsp) + roll $1,%ebp addl %eax,%edi - movl 16(%rsp),%edx - movl %r12d,%eax + xorl 16(%rsp),%r14d + movl %r13d,%eax + movl %ebp,12(%rsp) movl %edi,%ecx - xorl 24(%rsp),%edx - xorl %r13d,%eax + xorl 24(%rsp),%r14d + xorl %r12d,%eax roll $5,%ecx - xorl 48(%rsp),%edx + xorl 48(%rsp),%r14d andl %r11d,%eax leal 1518500249(%rbp,%rsi,1),%esi - xorl 4(%rsp),%edx + roll $30,%r11d xorl %r13d,%eax - roll $1,%edx addl %ecx,%esi - roll $30,%r11d - movl %edx,16(%rsp) + roll $1,%r14d addl %eax,%esi - movl 20(%rsp),%ebp - movl %r11d,%eax + xorl 20(%rsp),%edx + movl %edi,%eax + movl %r14d,16(%rsp) movl %esi,%ecx - xorl 28(%rsp),%ebp - xorl %edi,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r13,1),%r13d - xorl 52(%rsp),%ebp + xorl 28(%rsp),%edx xorl %r12d,%eax + roll $5,%ecx + xorl 52(%rsp),%edx + leal 1859775393(%r14,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 8(%rsp),%ebp roll $30,%edi addl %eax,%r13d - roll $1,%ebp - movl %ebp,20(%rsp) - movl 24(%rsp),%edx - movl %edi,%eax + roll $1,%edx + xorl 24(%rsp),%ebp + movl %esi,%eax + movl %edx,20(%rsp) movl %r13d,%ecx - xorl 32(%rsp),%edx - xorl %esi,%eax - roll $5,%ecx - leal 1859775393(%rbp,%r12,1),%r12d - xorl 56(%rsp),%edx + xorl 32(%rsp),%ebp xorl %r11d,%eax + roll $5,%ecx + xorl 56(%rsp),%ebp + leal 1859775393(%rdx,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 12(%rsp),%edx roll $30,%esi addl %eax,%r12d - roll $1,%edx - movl %edx,24(%rsp) - movl 28(%rsp),%ebp - movl %esi,%eax + roll $1,%ebp + xorl 28(%rsp),%r14d + movl %r13d,%eax + movl %ebp,24(%rsp) movl %r12d,%ecx - xorl 36(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r11,1),%r11d - xorl 60(%rsp),%ebp + xorl 36(%rsp),%r14d xorl %edi,%eax + roll $5,%ecx + xorl 60(%rsp),%r14d + leal 1859775393(%rbp,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 16(%rsp),%ebp roll $30,%r13d addl %eax,%r11d - roll $1,%ebp - movl %ebp,28(%rsp) - movl 32(%rsp),%edx - movl %r13d,%eax + roll $1,%r14d + xorl 32(%rsp),%edx + movl %r12d,%eax + movl %r14d,28(%rsp) movl %r11d,%ecx xorl 40(%rsp),%edx - xorl %r12d,%eax + xorl %esi,%eax roll $5,%ecx - leal 1859775393(%rbp,%rdi,1),%edi xorl 0(%rsp),%edx - xorl %esi,%eax + leal 1859775393(%r14,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 20(%rsp),%edx roll $30,%r12d addl %eax,%edi roll $1,%edx + xorl 36(%rsp),%ebp + movl %r11d,%eax movl %edx,32(%rsp) - movl 36(%rsp),%ebp - movl %r12d,%eax movl %edi,%ecx xorl 44(%rsp),%ebp - xorl %r11d,%eax + xorl %r13d,%eax roll $5,%ecx - leal 1859775393(%rdx,%rsi,1),%esi xorl 4(%rsp),%ebp - xorl %r13d,%eax + leal 1859775393(%rdx,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 24(%rsp),%ebp roll $30,%r11d addl %eax,%esi roll $1,%ebp + xorl 40(%rsp),%r14d + movl %edi,%eax movl %ebp,36(%rsp) - movl 40(%rsp),%edx - movl %r11d,%eax movl %esi,%ecx - xorl 
48(%rsp),%edx - xorl %edi,%eax + xorl 48(%rsp),%r14d + xorl %r12d,%eax roll $5,%ecx + xorl 8(%rsp),%r14d leal 1859775393(%rbp,%r13,1),%r13d - xorl 8(%rsp),%edx - xorl %r12d,%eax + xorl %r11d,%eax addl %ecx,%r13d - xorl 28(%rsp),%edx roll $30,%edi addl %eax,%r13d - roll $1,%edx - movl %edx,40(%rsp) - movl 44(%rsp),%ebp - movl %edi,%eax + roll $1,%r14d + xorl 44(%rsp),%edx + movl %esi,%eax + movl %r14d,40(%rsp) movl %r13d,%ecx - xorl 52(%rsp),%ebp - xorl %esi,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r12,1),%r12d - xorl 12(%rsp),%ebp + xorl 52(%rsp),%edx xorl %r11d,%eax + roll $5,%ecx + xorl 12(%rsp),%edx + leal 1859775393(%r14,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 32(%rsp),%ebp roll $30,%esi addl %eax,%r12d - roll $1,%ebp - movl %ebp,44(%rsp) - movl 48(%rsp),%edx - movl %esi,%eax + roll $1,%edx + xorl 48(%rsp),%ebp + movl %r13d,%eax + movl %edx,44(%rsp) movl %r12d,%ecx - xorl 56(%rsp),%edx - xorl %r13d,%eax - roll $5,%ecx - leal 1859775393(%rbp,%r11,1),%r11d - xorl 16(%rsp),%edx + xorl 56(%rsp),%ebp xorl %edi,%eax + roll $5,%ecx + xorl 16(%rsp),%ebp + leal 1859775393(%rdx,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 36(%rsp),%edx roll $30,%r13d addl %eax,%r11d - roll $1,%edx - movl %edx,48(%rsp) - movl 52(%rsp),%ebp - movl %r13d,%eax + roll $1,%ebp + xorl 52(%rsp),%r14d + movl %r12d,%eax + movl %ebp,48(%rsp) movl %r11d,%ecx - xorl 60(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%rdi,1),%edi - xorl 20(%rsp),%ebp + xorl 60(%rsp),%r14d xorl %esi,%eax + roll $5,%ecx + xorl 20(%rsp),%r14d + leal 1859775393(%rbp,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 40(%rsp),%ebp roll $30,%r12d addl %eax,%edi - roll $1,%ebp - movl %ebp,52(%rsp) - movl 56(%rsp),%edx - movl %r12d,%eax + roll $1,%r14d + xorl 56(%rsp),%edx + movl %r11d,%eax + movl %r14d,52(%rsp) movl %edi,%ecx xorl 0(%rsp),%edx - xorl %r11d,%eax + xorl %r13d,%eax roll $5,%ecx - leal 1859775393(%rbp,%rsi,1),%esi xorl 24(%rsp),%edx - xorl %r13d,%eax + leal 1859775393(%r14,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 44(%rsp),%edx roll $30,%r11d addl %eax,%esi roll $1,%edx + xorl 60(%rsp),%ebp + movl %edi,%eax movl %edx,56(%rsp) - movl 60(%rsp),%ebp - movl %r11d,%eax movl %esi,%ecx xorl 4(%rsp),%ebp - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal 1859775393(%rdx,%r13,1),%r13d xorl 28(%rsp),%ebp - xorl %r12d,%eax + leal 1859775393(%rdx,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 48(%rsp),%ebp roll $30,%edi addl %eax,%r13d roll $1,%ebp + xorl 0(%rsp),%r14d + movl %esi,%eax movl %ebp,60(%rsp) - movl 0(%rsp),%edx - movl %edi,%eax movl %r13d,%ecx - xorl 8(%rsp),%edx - xorl %esi,%eax + xorl 8(%rsp),%r14d + xorl %r11d,%eax roll $5,%ecx + xorl 32(%rsp),%r14d leal 1859775393(%rbp,%r12,1),%r12d - xorl 32(%rsp),%edx - xorl %r11d,%eax + xorl %edi,%eax addl %ecx,%r12d - xorl 52(%rsp),%edx roll $30,%esi addl %eax,%r12d - roll $1,%edx - movl %edx,0(%rsp) - movl 4(%rsp),%ebp - movl %esi,%eax + roll $1,%r14d + xorl 4(%rsp),%edx + movl %r13d,%eax + movl %r14d,0(%rsp) movl %r12d,%ecx - xorl 12(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r11,1),%r11d - xorl 36(%rsp),%ebp + xorl 12(%rsp),%edx xorl %edi,%eax + roll $5,%ecx + xorl 36(%rsp),%edx + leal 1859775393(%r14,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 56(%rsp),%ebp roll $30,%r13d addl %eax,%r11d - roll $1,%ebp - movl %ebp,4(%rsp) - movl 8(%rsp),%edx - movl %r13d,%eax + roll $1,%edx + xorl 8(%rsp),%ebp + movl %r12d,%eax + movl %edx,4(%rsp) movl %r11d,%ecx - xorl 16(%rsp),%edx - xorl %r12d,%eax - 
roll $5,%ecx - leal 1859775393(%rbp,%rdi,1),%edi - xorl 40(%rsp),%edx + xorl 16(%rsp),%ebp xorl %esi,%eax + roll $5,%ecx + xorl 40(%rsp),%ebp + leal 1859775393(%rdx,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 60(%rsp),%edx roll $30,%r12d addl %eax,%edi - roll $1,%edx - movl %edx,8(%rsp) - movl 12(%rsp),%ebp - movl %r12d,%eax + roll $1,%ebp + xorl 12(%rsp),%r14d + movl %r11d,%eax + movl %ebp,8(%rsp) movl %edi,%ecx - xorl 20(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%rsi,1),%esi - xorl 44(%rsp),%ebp + xorl 20(%rsp),%r14d xorl %r13d,%eax + roll $5,%ecx + xorl 44(%rsp),%r14d + leal 1859775393(%rbp,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 0(%rsp),%ebp roll $30,%r11d addl %eax,%esi - roll $1,%ebp - movl %ebp,12(%rsp) - movl 16(%rsp),%edx - movl %r11d,%eax + roll $1,%r14d + xorl 16(%rsp),%edx + movl %edi,%eax + movl %r14d,12(%rsp) movl %esi,%ecx xorl 24(%rsp),%edx - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal 1859775393(%rbp,%r13,1),%r13d xorl 48(%rsp),%edx - xorl %r12d,%eax + leal 1859775393(%r14,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 4(%rsp),%edx roll $30,%edi addl %eax,%r13d roll $1,%edx + xorl 20(%rsp),%ebp + movl %esi,%eax movl %edx,16(%rsp) - movl 20(%rsp),%ebp - movl %edi,%eax movl %r13d,%ecx xorl 28(%rsp),%ebp - xorl %esi,%eax + xorl %r11d,%eax roll $5,%ecx - leal 1859775393(%rdx,%r12,1),%r12d xorl 52(%rsp),%ebp - xorl %r11d,%eax + leal 1859775393(%rdx,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 8(%rsp),%ebp roll $30,%esi addl %eax,%r12d roll $1,%ebp + xorl 24(%rsp),%r14d + movl %r13d,%eax movl %ebp,20(%rsp) - movl 24(%rsp),%edx - movl %esi,%eax movl %r12d,%ecx - xorl 32(%rsp),%edx - xorl %r13d,%eax + xorl 32(%rsp),%r14d + xorl %edi,%eax roll $5,%ecx + xorl 56(%rsp),%r14d leal 1859775393(%rbp,%r11,1),%r11d - xorl 56(%rsp),%edx - xorl %edi,%eax + xorl %esi,%eax addl %ecx,%r11d - xorl 12(%rsp),%edx roll $30,%r13d addl %eax,%r11d - roll $1,%edx - movl %edx,24(%rsp) - movl 28(%rsp),%ebp - movl %r13d,%eax + roll $1,%r14d + xorl 28(%rsp),%edx + movl %r12d,%eax + movl %r14d,24(%rsp) movl %r11d,%ecx - xorl 36(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%rdi,1),%edi - xorl 60(%rsp),%ebp + xorl 36(%rsp),%edx xorl %esi,%eax + roll $5,%ecx + xorl 60(%rsp),%edx + leal 1859775393(%r14,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 16(%rsp),%ebp roll $30,%r12d addl %eax,%edi - roll $1,%ebp - movl %ebp,28(%rsp) - movl 32(%rsp),%edx - movl %r12d,%eax + roll $1,%edx + xorl 32(%rsp),%ebp + movl %r11d,%eax + movl %edx,28(%rsp) movl %edi,%ecx - xorl 40(%rsp),%edx - xorl %r11d,%eax - roll $5,%ecx - leal 1859775393(%rbp,%rsi,1),%esi - xorl 0(%rsp),%edx + xorl 40(%rsp),%ebp xorl %r13d,%eax + roll $5,%ecx + xorl 0(%rsp),%ebp + leal 1859775393(%rdx,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 20(%rsp),%edx roll $30,%r11d addl %eax,%esi - roll $1,%edx - movl %edx,32(%rsp) - movl 36(%rsp),%ebp - movl %r11d,%eax - movl %r11d,%ebx - xorl 44(%rsp),%ebp - andl %r12d,%eax + roll $1,%ebp + xorl 36(%rsp),%r14d + movl %r12d,%eax + movl %ebp,32(%rsp) + movl %r12d,%ebx + xorl 44(%rsp),%r14d + andl %r11d,%eax movl %esi,%ecx - xorl 4(%rsp),%ebp - xorl %r12d,%ebx - leal -1894007588(%rdx,%r13,1),%r13d + xorl 4(%rsp),%r14d + leal -1894007588(%rbp,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 24(%rsp),%ebp addl %eax,%r13d + roll $1,%r14d andl %edi,%ebx - roll $1,%ebp - addl %ebx,%r13d - roll $30,%edi - movl %ebp,36(%rsp) addl %ecx,%r13d - movl 40(%rsp),%edx - movl %edi,%eax - movl %edi,%ebx + roll $30,%edi + 
addl %ebx,%r13d + xorl 40(%rsp),%edx + movl %r11d,%eax + movl %r14d,36(%rsp) + movl %r11d,%ebx xorl 48(%rsp),%edx - andl %r11d,%eax + andl %edi,%eax movl %r13d,%ecx xorl 8(%rsp),%edx - xorl %r11d,%ebx - leal -1894007588(%rbp,%r12,1),%r12d + leal -1894007588(%r14,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 28(%rsp),%edx addl %eax,%r12d - andl %esi,%ebx roll $1,%edx - addl %ebx,%r12d + andl %esi,%ebx + addl %ecx,%r12d roll $30,%esi + addl %ebx,%r12d + xorl 44(%rsp),%ebp + movl %edi,%eax movl %edx,40(%rsp) - addl %ecx,%r12d - movl 44(%rsp),%ebp - movl %esi,%eax - movl %esi,%ebx + movl %edi,%ebx xorl 52(%rsp),%ebp - andl %edi,%eax + andl %esi,%eax movl %r12d,%ecx xorl 12(%rsp),%ebp - xorl %edi,%ebx leal -1894007588(%rdx,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 32(%rsp),%ebp addl %eax,%r11d - andl %r13d,%ebx roll $1,%ebp - addl %ebx,%r11d + andl %r13d,%ebx + addl %ecx,%r11d roll $30,%r13d + addl %ebx,%r11d + xorl 48(%rsp),%r14d + movl %esi,%eax movl %ebp,44(%rsp) - addl %ecx,%r11d - movl 48(%rsp),%edx - movl %r13d,%eax - movl %r13d,%ebx - xorl 56(%rsp),%edx - andl %esi,%eax + movl %esi,%ebx + xorl 56(%rsp),%r14d + andl %r13d,%eax movl %r11d,%ecx - xorl 16(%rsp),%edx - xorl %esi,%ebx + xorl 16(%rsp),%r14d leal -1894007588(%rbp,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 36(%rsp),%edx addl %eax,%edi + roll $1,%r14d andl %r12d,%ebx - roll $1,%edx - addl %ebx,%edi - roll $30,%r12d - movl %edx,48(%rsp) addl %ecx,%edi - movl 52(%rsp),%ebp - movl %r12d,%eax - movl %r12d,%ebx - xorl 60(%rsp),%ebp - andl %r13d,%eax + roll $30,%r12d + addl %ebx,%edi + xorl 52(%rsp),%edx + movl %r13d,%eax + movl %r14d,48(%rsp) + movl %r13d,%ebx + xorl 60(%rsp),%edx + andl %r12d,%eax movl %edi,%ecx - xorl 20(%rsp),%ebp - xorl %r13d,%ebx - leal -1894007588(%rdx,%rsi,1),%esi + xorl 20(%rsp),%edx + leal -1894007588(%r14,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 40(%rsp),%ebp addl %eax,%esi + roll $1,%edx andl %r11d,%ebx - roll $1,%ebp - addl %ebx,%esi - roll $30,%r11d - movl %ebp,52(%rsp) addl %ecx,%esi - movl 56(%rsp),%edx - movl %r11d,%eax - movl %r11d,%ebx - xorl 0(%rsp),%edx - andl %r12d,%eax + roll $30,%r11d + addl %ebx,%esi + xorl 56(%rsp),%ebp + movl %r12d,%eax + movl %edx,52(%rsp) + movl %r12d,%ebx + xorl 0(%rsp),%ebp + andl %r11d,%eax movl %esi,%ecx - xorl 24(%rsp),%edx - xorl %r12d,%ebx - leal -1894007588(%rbp,%r13,1),%r13d + xorl 24(%rsp),%ebp + leal -1894007588(%rdx,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 44(%rsp),%edx addl %eax,%r13d + roll $1,%ebp andl %edi,%ebx - roll $1,%edx - addl %ebx,%r13d - roll $30,%edi - movl %edx,56(%rsp) addl %ecx,%r13d - movl 60(%rsp),%ebp - movl %edi,%eax - movl %edi,%ebx - xorl 4(%rsp),%ebp - andl %r11d,%eax + roll $30,%edi + addl %ebx,%r13d + xorl 60(%rsp),%r14d + movl %r11d,%eax + movl %ebp,56(%rsp) + movl %r11d,%ebx + xorl 4(%rsp),%r14d + andl %edi,%eax movl %r13d,%ecx - xorl 28(%rsp),%ebp - xorl %r11d,%ebx - leal -1894007588(%rdx,%r12,1),%r12d + xorl 28(%rsp),%r14d + leal -1894007588(%rbp,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 48(%rsp),%ebp addl %eax,%r12d + roll $1,%r14d andl %esi,%ebx - roll $1,%ebp - addl %ebx,%r12d - roll $30,%esi - movl %ebp,60(%rsp) addl %ecx,%r12d - movl 0(%rsp),%edx - movl %esi,%eax - movl %esi,%ebx + roll $30,%esi + addl %ebx,%r12d + xorl 0(%rsp),%edx + movl %edi,%eax + movl %r14d,60(%rsp) + movl %edi,%ebx xorl 8(%rsp),%edx - andl %edi,%eax + andl %esi,%eax movl %r12d,%ecx xorl 32(%rsp),%edx - xorl %edi,%ebx - leal -1894007588(%rbp,%r11,1),%r11d + leal -1894007588(%r14,%r11,1),%r11d + xorl 
%esi,%ebx roll $5,%ecx - xorl 52(%rsp),%edx addl %eax,%r11d - andl %r13d,%ebx roll $1,%edx - addl %ebx,%r11d + andl %r13d,%ebx + addl %ecx,%r11d roll $30,%r13d + addl %ebx,%r11d + xorl 4(%rsp),%ebp + movl %esi,%eax movl %edx,0(%rsp) - addl %ecx,%r11d - movl 4(%rsp),%ebp - movl %r13d,%eax - movl %r13d,%ebx + movl %esi,%ebx xorl 12(%rsp),%ebp - andl %esi,%eax + andl %r13d,%eax movl %r11d,%ecx xorl 36(%rsp),%ebp - xorl %esi,%ebx leal -1894007588(%rdx,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 56(%rsp),%ebp addl %eax,%edi - andl %r12d,%ebx roll $1,%ebp - addl %ebx,%edi + andl %r12d,%ebx + addl %ecx,%edi roll $30,%r12d + addl %ebx,%edi + xorl 8(%rsp),%r14d + movl %r13d,%eax movl %ebp,4(%rsp) - addl %ecx,%edi - movl 8(%rsp),%edx - movl %r12d,%eax - movl %r12d,%ebx - xorl 16(%rsp),%edx - andl %r13d,%eax + movl %r13d,%ebx + xorl 16(%rsp),%r14d + andl %r12d,%eax movl %edi,%ecx - xorl 40(%rsp),%edx - xorl %r13d,%ebx + xorl 40(%rsp),%r14d leal -1894007588(%rbp,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 60(%rsp),%edx addl %eax,%esi + roll $1,%r14d andl %r11d,%ebx - roll $1,%edx - addl %ebx,%esi - roll $30,%r11d - movl %edx,8(%rsp) addl %ecx,%esi - movl 12(%rsp),%ebp - movl %r11d,%eax - movl %r11d,%ebx - xorl 20(%rsp),%ebp - andl %r12d,%eax + roll $30,%r11d + addl %ebx,%esi + xorl 12(%rsp),%edx + movl %r12d,%eax + movl %r14d,8(%rsp) + movl %r12d,%ebx + xorl 20(%rsp),%edx + andl %r11d,%eax movl %esi,%ecx - xorl 44(%rsp),%ebp - xorl %r12d,%ebx - leal -1894007588(%rdx,%r13,1),%r13d + xorl 44(%rsp),%edx + leal -1894007588(%r14,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 0(%rsp),%ebp addl %eax,%r13d + roll $1,%edx andl %edi,%ebx - roll $1,%ebp - addl %ebx,%r13d - roll $30,%edi - movl %ebp,12(%rsp) addl %ecx,%r13d - movl 16(%rsp),%edx - movl %edi,%eax - movl %edi,%ebx - xorl 24(%rsp),%edx - andl %r11d,%eax + roll $30,%edi + addl %ebx,%r13d + xorl 16(%rsp),%ebp + movl %r11d,%eax + movl %edx,12(%rsp) + movl %r11d,%ebx + xorl 24(%rsp),%ebp + andl %edi,%eax movl %r13d,%ecx - xorl 48(%rsp),%edx - xorl %r11d,%ebx - leal -1894007588(%rbp,%r12,1),%r12d + xorl 48(%rsp),%ebp + leal -1894007588(%rdx,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 4(%rsp),%edx addl %eax,%r12d + roll $1,%ebp andl %esi,%ebx - roll $1,%edx - addl %ebx,%r12d - roll $30,%esi - movl %edx,16(%rsp) addl %ecx,%r12d - movl 20(%rsp),%ebp - movl %esi,%eax - movl %esi,%ebx - xorl 28(%rsp),%ebp - andl %edi,%eax + roll $30,%esi + addl %ebx,%r12d + xorl 20(%rsp),%r14d + movl %edi,%eax + movl %ebp,16(%rsp) + movl %edi,%ebx + xorl 28(%rsp),%r14d + andl %esi,%eax movl %r12d,%ecx - xorl 52(%rsp),%ebp - xorl %edi,%ebx - leal -1894007588(%rdx,%r11,1),%r11d + xorl 52(%rsp),%r14d + leal -1894007588(%rbp,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 8(%rsp),%ebp addl %eax,%r11d + roll $1,%r14d andl %r13d,%ebx - roll $1,%ebp - addl %ebx,%r11d - roll $30,%r13d - movl %ebp,20(%rsp) addl %ecx,%r11d - movl 24(%rsp),%edx - movl %r13d,%eax - movl %r13d,%ebx + roll $30,%r13d + addl %ebx,%r11d + xorl 24(%rsp),%edx + movl %esi,%eax + movl %r14d,20(%rsp) + movl %esi,%ebx xorl 32(%rsp),%edx - andl %esi,%eax + andl %r13d,%eax movl %r11d,%ecx xorl 56(%rsp),%edx - xorl %esi,%ebx - leal -1894007588(%rbp,%rdi,1),%edi + leal -1894007588(%r14,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 12(%rsp),%edx addl %eax,%edi - andl %r12d,%ebx roll $1,%edx - addl %ebx,%edi + andl %r12d,%ebx + addl %ecx,%edi roll $30,%r12d + addl %ebx,%edi + xorl 28(%rsp),%ebp + movl %r13d,%eax movl %edx,24(%rsp) - addl %ecx,%edi - movl 28(%rsp),%ebp - movl %r12d,%eax - 
movl %r12d,%ebx + movl %r13d,%ebx xorl 36(%rsp),%ebp - andl %r13d,%eax + andl %r12d,%eax movl %edi,%ecx xorl 60(%rsp),%ebp - xorl %r13d,%ebx leal -1894007588(%rdx,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 16(%rsp),%ebp addl %eax,%esi - andl %r11d,%ebx roll $1,%ebp - addl %ebx,%esi + andl %r11d,%ebx + addl %ecx,%esi roll $30,%r11d + addl %ebx,%esi + xorl 32(%rsp),%r14d + movl %r12d,%eax movl %ebp,28(%rsp) - addl %ecx,%esi - movl 32(%rsp),%edx - movl %r11d,%eax - movl %r11d,%ebx - xorl 40(%rsp),%edx - andl %r12d,%eax + movl %r12d,%ebx + xorl 40(%rsp),%r14d + andl %r11d,%eax movl %esi,%ecx - xorl 0(%rsp),%edx - xorl %r12d,%ebx + xorl 0(%rsp),%r14d leal -1894007588(%rbp,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 20(%rsp),%edx addl %eax,%r13d + roll $1,%r14d andl %edi,%ebx - roll $1,%edx - addl %ebx,%r13d - roll $30,%edi - movl %edx,32(%rsp) addl %ecx,%r13d - movl 36(%rsp),%ebp - movl %edi,%eax - movl %edi,%ebx - xorl 44(%rsp),%ebp - andl %r11d,%eax + roll $30,%edi + addl %ebx,%r13d + xorl 36(%rsp),%edx + movl %r11d,%eax + movl %r14d,32(%rsp) + movl %r11d,%ebx + xorl 44(%rsp),%edx + andl %edi,%eax movl %r13d,%ecx - xorl 4(%rsp),%ebp - xorl %r11d,%ebx - leal -1894007588(%rdx,%r12,1),%r12d - roll $5,%ecx - xorl 24(%rsp),%ebp + xorl 4(%rsp),%edx + leal -1894007588(%r14,%r12,1),%r12d + xorl %edi,%ebx + roll $5,%ecx addl %eax,%r12d + roll $1,%edx andl %esi,%ebx - roll $1,%ebp - addl %ebx,%r12d - roll $30,%esi - movl %ebp,36(%rsp) addl %ecx,%r12d - movl 40(%rsp),%edx - movl %esi,%eax - movl %esi,%ebx - xorl 48(%rsp),%edx - andl %edi,%eax + roll $30,%esi + addl %ebx,%r12d + xorl 40(%rsp),%ebp + movl %edi,%eax + movl %edx,36(%rsp) + movl %edi,%ebx + xorl 48(%rsp),%ebp + andl %esi,%eax movl %r12d,%ecx - xorl 8(%rsp),%edx - xorl %edi,%ebx - leal -1894007588(%rbp,%r11,1),%r11d + xorl 8(%rsp),%ebp + leal -1894007588(%rdx,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 28(%rsp),%edx addl %eax,%r11d + roll $1,%ebp andl %r13d,%ebx - roll $1,%edx - addl %ebx,%r11d - roll $30,%r13d - movl %edx,40(%rsp) addl %ecx,%r11d - movl 44(%rsp),%ebp - movl %r13d,%eax - movl %r13d,%ebx - xorl 52(%rsp),%ebp - andl %esi,%eax + roll $30,%r13d + addl %ebx,%r11d + xorl 44(%rsp),%r14d + movl %esi,%eax + movl %ebp,40(%rsp) + movl %esi,%ebx + xorl 52(%rsp),%r14d + andl %r13d,%eax movl %r11d,%ecx - xorl 12(%rsp),%ebp - xorl %esi,%ebx - leal -1894007588(%rdx,%rdi,1),%edi + xorl 12(%rsp),%r14d + leal -1894007588(%rbp,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 32(%rsp),%ebp addl %eax,%edi + roll $1,%r14d andl %r12d,%ebx - roll $1,%ebp - addl %ebx,%edi - roll $30,%r12d - movl %ebp,44(%rsp) addl %ecx,%edi - movl 48(%rsp),%edx - movl %r12d,%eax - movl %r12d,%ebx + roll $30,%r12d + addl %ebx,%edi + xorl 48(%rsp),%edx + movl %r13d,%eax + movl %r14d,44(%rsp) + movl %r13d,%ebx xorl 56(%rsp),%edx - andl %r13d,%eax + andl %r12d,%eax movl %edi,%ecx xorl 16(%rsp),%edx - xorl %r13d,%ebx - leal -1894007588(%rbp,%rsi,1),%esi + leal -1894007588(%r14,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 36(%rsp),%edx addl %eax,%esi - andl %r11d,%ebx roll $1,%edx - addl %ebx,%esi + andl %r11d,%ebx + addl %ecx,%esi roll $30,%r11d + addl %ebx,%esi + xorl 52(%rsp),%ebp + movl %edi,%eax movl %edx,48(%rsp) - addl %ecx,%esi - movl 52(%rsp),%ebp - movl %r11d,%eax movl %esi,%ecx xorl 60(%rsp),%ebp - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal -899497514(%rdx,%r13,1),%r13d xorl 20(%rsp),%ebp - xorl %r12d,%eax + leal -899497514(%rdx,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 40(%rsp),%ebp roll $30,%edi addl 
%eax,%r13d roll $1,%ebp + xorl 56(%rsp),%r14d + movl %esi,%eax movl %ebp,52(%rsp) - movl 56(%rsp),%edx - movl %edi,%eax movl %r13d,%ecx - xorl 0(%rsp),%edx - xorl %esi,%eax + xorl 0(%rsp),%r14d + xorl %r11d,%eax roll $5,%ecx + xorl 24(%rsp),%r14d leal -899497514(%rbp,%r12,1),%r12d - xorl 24(%rsp),%edx - xorl %r11d,%eax + xorl %edi,%eax addl %ecx,%r12d - xorl 44(%rsp),%edx roll $30,%esi addl %eax,%r12d - roll $1,%edx - movl %edx,56(%rsp) - movl 60(%rsp),%ebp - movl %esi,%eax + roll $1,%r14d + xorl 60(%rsp),%edx + movl %r13d,%eax + movl %r14d,56(%rsp) movl %r12d,%ecx - xorl 4(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - leal -899497514(%rdx,%r11,1),%r11d - xorl 28(%rsp),%ebp + xorl 4(%rsp),%edx xorl %edi,%eax + roll $5,%ecx + xorl 28(%rsp),%edx + leal -899497514(%r14,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 48(%rsp),%ebp roll $30,%r13d addl %eax,%r11d - roll $1,%ebp - movl %ebp,60(%rsp) - movl 0(%rsp),%edx - movl %r13d,%eax + roll $1,%edx + xorl 0(%rsp),%ebp + movl %r12d,%eax + movl %edx,60(%rsp) movl %r11d,%ecx - xorl 8(%rsp),%edx - xorl %r12d,%eax - roll $5,%ecx - leal -899497514(%rbp,%rdi,1),%edi - xorl 32(%rsp),%edx + xorl 8(%rsp),%ebp xorl %esi,%eax + roll $5,%ecx + xorl 32(%rsp),%ebp + leal -899497514(%rdx,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 52(%rsp),%edx roll $30,%r12d addl %eax,%edi - roll $1,%edx - movl %edx,0(%rsp) - movl 4(%rsp),%ebp - movl %r12d,%eax + roll $1,%ebp + xorl 4(%rsp),%r14d + movl %r11d,%eax + movl %ebp,0(%rsp) movl %edi,%ecx - xorl 12(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - leal -899497514(%rdx,%rsi,1),%esi - xorl 36(%rsp),%ebp + xorl 12(%rsp),%r14d xorl %r13d,%eax + roll $5,%ecx + xorl 36(%rsp),%r14d + leal -899497514(%rbp,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 56(%rsp),%ebp roll $30,%r11d addl %eax,%esi - roll $1,%ebp - movl %ebp,4(%rsp) - movl 8(%rsp),%edx - movl %r11d,%eax + roll $1,%r14d + xorl 8(%rsp),%edx + movl %edi,%eax + movl %r14d,4(%rsp) movl %esi,%ecx xorl 16(%rsp),%edx - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal -899497514(%rbp,%r13,1),%r13d xorl 40(%rsp),%edx - xorl %r12d,%eax + leal -899497514(%r14,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 60(%rsp),%edx roll $30,%edi addl %eax,%r13d roll $1,%edx + xorl 12(%rsp),%ebp + movl %esi,%eax movl %edx,8(%rsp) - movl 12(%rsp),%ebp - movl %edi,%eax movl %r13d,%ecx xorl 20(%rsp),%ebp - xorl %esi,%eax + xorl %r11d,%eax roll $5,%ecx - leal -899497514(%rdx,%r12,1),%r12d xorl 44(%rsp),%ebp - xorl %r11d,%eax + leal -899497514(%rdx,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 0(%rsp),%ebp roll $30,%esi addl %eax,%r12d roll $1,%ebp + xorl 16(%rsp),%r14d + movl %r13d,%eax movl %ebp,12(%rsp) - movl 16(%rsp),%edx - movl %esi,%eax movl %r12d,%ecx - xorl 24(%rsp),%edx - xorl %r13d,%eax + xorl 24(%rsp),%r14d + xorl %edi,%eax roll $5,%ecx + xorl 48(%rsp),%r14d leal -899497514(%rbp,%r11,1),%r11d - xorl 48(%rsp),%edx - xorl %edi,%eax + xorl %esi,%eax addl %ecx,%r11d - xorl 4(%rsp),%edx roll $30,%r13d addl %eax,%r11d - roll $1,%edx - movl %edx,16(%rsp) - movl 20(%rsp),%ebp - movl %r13d,%eax + roll $1,%r14d + xorl 20(%rsp),%edx + movl %r12d,%eax + movl %r14d,16(%rsp) movl %r11d,%ecx - xorl 28(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - leal -899497514(%rdx,%rdi,1),%edi - xorl 52(%rsp),%ebp + xorl 28(%rsp),%edx xorl %esi,%eax + roll $5,%ecx + xorl 52(%rsp),%edx + leal -899497514(%r14,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 8(%rsp),%ebp roll $30,%r12d addl %eax,%edi - roll $1,%ebp - movl %ebp,20(%rsp) - movl 24(%rsp),%edx - movl 
%r12d,%eax + roll $1,%edx + xorl 24(%rsp),%ebp + movl %r11d,%eax + movl %edx,20(%rsp) movl %edi,%ecx - xorl 32(%rsp),%edx - xorl %r11d,%eax - roll $5,%ecx - leal -899497514(%rbp,%rsi,1),%esi - xorl 56(%rsp),%edx + xorl 32(%rsp),%ebp xorl %r13d,%eax + roll $5,%ecx + xorl 56(%rsp),%ebp + leal -899497514(%rdx,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 12(%rsp),%edx roll $30,%r11d addl %eax,%esi - roll $1,%edx - movl %edx,24(%rsp) - movl 28(%rsp),%ebp - movl %r11d,%eax + roll $1,%ebp + xorl 28(%rsp),%r14d + movl %edi,%eax + movl %ebp,24(%rsp) movl %esi,%ecx - xorl 36(%rsp),%ebp - xorl %edi,%eax - roll $5,%ecx - leal -899497514(%rdx,%r13,1),%r13d - xorl 60(%rsp),%ebp + xorl 36(%rsp),%r14d xorl %r12d,%eax + roll $5,%ecx + xorl 60(%rsp),%r14d + leal -899497514(%rbp,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 16(%rsp),%ebp roll $30,%edi addl %eax,%r13d - roll $1,%ebp - movl %ebp,28(%rsp) - movl 32(%rsp),%edx - movl %edi,%eax + roll $1,%r14d + xorl 32(%rsp),%edx + movl %esi,%eax + movl %r14d,28(%rsp) movl %r13d,%ecx xorl 40(%rsp),%edx - xorl %esi,%eax + xorl %r11d,%eax roll $5,%ecx - leal -899497514(%rbp,%r12,1),%r12d xorl 0(%rsp),%edx - xorl %r11d,%eax + leal -899497514(%r14,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 20(%rsp),%edx roll $30,%esi addl %eax,%r12d roll $1,%edx - movl %edx,32(%rsp) - movl 36(%rsp),%ebp - movl %esi,%eax + xorl 36(%rsp),%ebp + movl %r13d,%eax + movl %r12d,%ecx xorl 44(%rsp),%ebp - xorl %r13d,%eax + xorl %edi,%eax roll $5,%ecx - leal -899497514(%rdx,%r11,1),%r11d xorl 4(%rsp),%ebp - xorl %edi,%eax + leal -899497514(%rdx,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 24(%rsp),%ebp roll $30,%r13d addl %eax,%r11d roll $1,%ebp - movl %ebp,36(%rsp) - movl 40(%rsp),%edx - movl %r13d,%eax + xorl 40(%rsp),%r14d + movl %r12d,%eax + movl %r11d,%ecx - xorl 48(%rsp),%edx - xorl %r12d,%eax + xorl 48(%rsp),%r14d + xorl %esi,%eax roll $5,%ecx + xorl 8(%rsp),%r14d leal -899497514(%rbp,%rdi,1),%edi - xorl 8(%rsp),%edx - xorl %esi,%eax + xorl %r13d,%eax addl %ecx,%edi - xorl 28(%rsp),%edx roll $30,%r12d addl %eax,%edi - roll $1,%edx - movl %edx,40(%rsp) - movl 44(%rsp),%ebp - movl %r12d,%eax + roll $1,%r14d + xorl 44(%rsp),%edx + movl %r11d,%eax + movl %edi,%ecx - xorl 52(%rsp),%ebp - xorl %r11d,%eax + xorl 52(%rsp),%edx + xorl %r13d,%eax roll $5,%ecx - leal -899497514(%rdx,%rsi,1),%esi - xorl 12(%rsp),%ebp - xorl %r13d,%eax + xorl 12(%rsp),%edx + leal -899497514(%r14,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 32(%rsp),%ebp roll $30,%r11d addl %eax,%esi - roll $1,%ebp - movl %ebp,44(%rsp) - movl 48(%rsp),%edx - movl %r11d,%eax + roll $1,%edx + xorl 48(%rsp),%ebp + movl %edi,%eax + movl %esi,%ecx - xorl 56(%rsp),%edx - xorl %edi,%eax - roll $5,%ecx - leal -899497514(%rbp,%r13,1),%r13d - xorl 16(%rsp),%edx + xorl 56(%rsp),%ebp xorl %r12d,%eax + roll $5,%ecx + xorl 16(%rsp),%ebp + leal -899497514(%rdx,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 36(%rsp),%edx roll $30,%edi addl %eax,%r13d - roll $1,%edx - movl %edx,48(%rsp) - movl 52(%rsp),%ebp - movl %edi,%eax + roll $1,%ebp + xorl 52(%rsp),%r14d + movl %esi,%eax + movl %r13d,%ecx - xorl 60(%rsp),%ebp - xorl %esi,%eax - roll $5,%ecx - leal -899497514(%rdx,%r12,1),%r12d - xorl 20(%rsp),%ebp + xorl 60(%rsp),%r14d xorl %r11d,%eax + roll $5,%ecx + xorl 20(%rsp),%r14d + leal -899497514(%rbp,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 40(%rsp),%ebp roll $30,%esi addl %eax,%r12d - roll $1,%ebp - movl 56(%rsp),%edx - movl %esi,%eax + roll $1,%r14d + xorl 56(%rsp),%edx + movl 
%r13d,%eax + movl %r12d,%ecx xorl 0(%rsp),%edx - xorl %r13d,%eax + xorl %edi,%eax roll $5,%ecx - leal -899497514(%rbp,%r11,1),%r11d xorl 24(%rsp),%edx - xorl %edi,%eax + leal -899497514(%r14,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 44(%rsp),%edx roll $30,%r13d addl %eax,%r11d roll $1,%edx - movl 60(%rsp),%ebp - movl %r13d,%eax + xorl 60(%rsp),%ebp + movl %r12d,%eax + movl %r11d,%ecx xorl 4(%rsp),%ebp - xorl %r12d,%eax + xorl %esi,%eax roll $5,%ecx - leal -899497514(%rdx,%rdi,1),%edi xorl 28(%rsp),%ebp - xorl %esi,%eax + leal -899497514(%rdx,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 48(%rsp),%ebp roll $30,%r12d addl %eax,%edi roll $1,%ebp - movl %r12d,%eax + movl %r11d,%eax movl %edi,%ecx - xorl %r11d,%eax + xorl %r13d,%eax leal -899497514(%rbp,%rsi,1),%esi roll $5,%ecx - xorl %r13d,%eax + xorl %r12d,%eax addl %ecx,%esi roll $30,%r11d addl %eax,%esi @@ -1319,29 +1277,218 @@ L$loop: jnz L$loop movq 64(%rsp),%rsi - movq (%rsi),%r13 - movq 8(%rsi),%r12 - movq 16(%rsi),%rbp - movq 24(%rsi),%rbx - leaq 32(%rsi),%rsp + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + L$epilogue: .byte 0xf3,0xc3 + +.p2align 5 +sha1_block_data_order_shaext: +_shaext_shortcut: + + movdqu (%rdi),%xmm0 + movd 16(%rdi),%xmm1 + movdqa K_XX_XX+160(%rip),%xmm3 + + movdqu (%rsi),%xmm4 + pshufd $27,%xmm0,%xmm0 + movdqu 16(%rsi),%xmm5 + pshufd $27,%xmm1,%xmm1 + movdqu 32(%rsi),%xmm6 +.byte 102,15,56,0,227 + movdqu 48(%rsi),%xmm7 +.byte 102,15,56,0,235 +.byte 102,15,56,0,243 + movdqa %xmm1,%xmm9 +.byte 102,15,56,0,251 + jmp L$oop_shaext + +.p2align 4 +L$oop_shaext: + decq %rdx + leaq 64(%rsi),%r8 + paddd %xmm4,%xmm1 + cmovneq %r8,%rsi + movdqa %xmm0,%xmm8 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,0 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,0 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,1 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,1 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,2 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,2 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 
15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 + movdqu (%rsi),%xmm4 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,3 +.byte 15,56,200,213 + movdqu 16(%rsi),%xmm5 +.byte 102,15,56,0,227 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 15,56,200,206 + movdqu 32(%rsi),%xmm6 +.byte 102,15,56,0,235 + + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,3 +.byte 15,56,200,215 + movdqu 48(%rsi),%xmm7 +.byte 102,15,56,0,243 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 65,15,56,200,201 +.byte 102,15,56,0,251 + + paddd %xmm8,%xmm0 + movdqa %xmm1,%xmm9 + + jnz L$oop_shaext + + pshufd $27,%xmm0,%xmm0 + pshufd $27,%xmm1,%xmm1 + movdqu %xmm0,(%rdi) + movd %xmm1,16(%rdi) + + .byte 0xf3,0xc3 + + .p2align 4 sha1_block_data_order_ssse3: _ssse3_shortcut: + + movq %rsp,%r11 + pushq %rbx + pushq %rbp + pushq %r12 + + pushq %r13 + + pushq %r14 + leaq -64(%rsp),%rsp + andq $-64,%rsp movq %rdi,%r8 movq %rsi,%r9 movq %rdx,%r10 shlq $6,%r10 addq %r9,%r10 - leaq K_XX_XX(%rip),%r11 + leaq K_XX_XX+64(%rip),%r14 movl 0(%r8),%eax movl 4(%r8),%ebx @@ -1353,18 +1500,18 @@ _ssse3_shortcut: xorl %edx,%edi andl %edi,%esi - movdqa 64(%r11),%xmm6 - movdqa 0(%r11),%xmm9 + movdqa 64(%r14),%xmm6 + movdqa -64(%r14),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 movdqu 48(%r9),%xmm3 .byte 102,15,56,0,198 - addq $64,%r9 .byte 102,15,56,0,206 .byte 102,15,56,0,214 -.byte 102,15,56,0,222 + addq $64,%r9 paddd %xmm9,%xmm0 +.byte 102,15,56,0,222 paddd %xmm9,%xmm1 paddd %xmm9,%xmm2 movdqa %xmm0,0(%rsp) @@ -1376,24 +1523,24 @@ _ssse3_shortcut: jmp L$oop_ssse3 .p2align 4 L$oop_ssse3: - movdqa %xmm1,%xmm4 rorl $2,%ebx + pshufd $238,%xmm0,%xmm4 xorl %edx,%esi movdqa %xmm3,%xmm8 -.byte 102,15,58,15,224,8 + paddd %xmm3,%xmm9 movl %eax,%edi addl 0(%rsp),%ebp - paddd %xmm3,%xmm9 + punpcklqdq %xmm1,%xmm4 xorl %ecx,%ebx roll $5,%eax - psrldq $4,%xmm8 addl %esi,%ebp + psrldq $4,%xmm8 andl %ebx,%edi - pxor %xmm0,%xmm4 xorl %ecx,%ebx + pxor %xmm0,%xmm4 addl %eax,%ebp - pxor %xmm2,%xmm8 rorl $7,%eax + pxor %xmm2,%xmm8 xorl %ecx,%edi movl %ebp,%esi addl 4(%rsp),%edx @@ -1404,57 +1551,57 @@ L$oop_ssse3: addl %edi,%edx andl %eax,%esi movdqa %xmm4,%xmm10 - movdqa %xmm4,%xmm8 xorl %ebx,%eax addl %ebp,%edx rorl $7,%ebp + movdqa %xmm4,%xmm8 xorl %ebx,%esi pslldq $12,%xmm10 paddd %xmm4,%xmm4 movl %edx,%edi addl 8(%rsp),%ecx + psrld $31,%xmm8 xorl %eax,%ebp roll $5,%edx - psrld $31,%xmm8 addl %esi,%ecx - andl %ebp,%edi movdqa %xmm10,%xmm9 + andl %ebp,%edi xorl %eax,%ebp - addl %edx,%ecx psrld $30,%xmm10 - por %xmm8,%xmm4 + addl %edx,%ecx rorl $7,%edx + por %xmm8,%xmm4 xorl %eax,%edi movl %ecx,%esi addl 12(%rsp),%ebx pslld $2,%xmm9 pxor %xmm10,%xmm4 xorl %ebp,%edx + movdqa -64(%r14),%xmm10 roll $5,%ecx - movdqa 0(%r11),%xmm10 addl %edi,%ebx andl %edx,%esi pxor %xmm9,%xmm4 xorl %ebp,%edx addl %ecx,%ebx - movdqa %xmm2,%xmm5 rorl $7,%ecx + pshufd $238,%xmm1,%xmm5 xorl %ebp,%esi movdqa %xmm4,%xmm9 -.byte 102,15,58,15,233,8 + paddd %xmm4,%xmm10 movl %ebx,%edi addl 16(%rsp),%eax - paddd %xmm4,%xmm10 + punpcklqdq %xmm2,%xmm5 xorl %edx,%ecx roll $5,%ebx - psrldq $4,%xmm9 addl %esi,%eax + psrldq $4,%xmm9 andl %ecx,%edi - pxor %xmm1,%xmm5 xorl %edx,%ecx + pxor %xmm1,%xmm5 addl %ebx,%eax - pxor %xmm3,%xmm9 rorl $7,%ebx + pxor %xmm3,%xmm9 xorl %edx,%edi movl %eax,%esi addl 20(%rsp),%ebp @@ -1465,57 +1612,57 @@ L$oop_ssse3: addl %edi,%ebp andl %ebx,%esi movdqa 
%xmm5,%xmm8 - movdqa %xmm5,%xmm9 xorl %ecx,%ebx addl %eax,%ebp rorl $7,%eax + movdqa %xmm5,%xmm9 xorl %ecx,%esi pslldq $12,%xmm8 paddd %xmm5,%xmm5 movl %ebp,%edi addl 24(%rsp),%edx + psrld $31,%xmm9 xorl %ebx,%eax roll $5,%ebp - psrld $31,%xmm9 addl %esi,%edx - andl %eax,%edi movdqa %xmm8,%xmm10 + andl %eax,%edi xorl %ebx,%eax - addl %ebp,%edx psrld $30,%xmm8 - por %xmm9,%xmm5 + addl %ebp,%edx rorl $7,%ebp + por %xmm9,%xmm5 xorl %ebx,%edi movl %edx,%esi addl 28(%rsp),%ecx pslld $2,%xmm10 pxor %xmm8,%xmm5 xorl %eax,%ebp + movdqa -32(%r14),%xmm8 roll $5,%edx - movdqa 16(%r11),%xmm8 addl %edi,%ecx andl %ebp,%esi pxor %xmm10,%xmm5 xorl %eax,%ebp addl %edx,%ecx - movdqa %xmm3,%xmm6 rorl $7,%edx + pshufd $238,%xmm2,%xmm6 xorl %eax,%esi movdqa %xmm5,%xmm10 -.byte 102,15,58,15,242,8 + paddd %xmm5,%xmm8 movl %ecx,%edi addl 32(%rsp),%ebx - paddd %xmm5,%xmm8 + punpcklqdq %xmm3,%xmm6 xorl %ebp,%edx roll $5,%ecx - psrldq $4,%xmm10 addl %esi,%ebx + psrldq $4,%xmm10 andl %edx,%edi - pxor %xmm2,%xmm6 xorl %ebp,%edx + pxor %xmm2,%xmm6 addl %ecx,%ebx - pxor %xmm4,%xmm10 rorl $7,%ecx + pxor %xmm4,%xmm10 xorl %ebp,%edi movl %ebx,%esi addl 36(%rsp),%eax @@ -1526,57 +1673,57 @@ L$oop_ssse3: addl %edi,%eax andl %ecx,%esi movdqa %xmm6,%xmm9 - movdqa %xmm6,%xmm10 xorl %edx,%ecx addl %ebx,%eax rorl $7,%ebx + movdqa %xmm6,%xmm10 xorl %edx,%esi pslldq $12,%xmm9 paddd %xmm6,%xmm6 movl %eax,%edi addl 40(%rsp),%ebp + psrld $31,%xmm10 xorl %ecx,%ebx roll $5,%eax - psrld $31,%xmm10 addl %esi,%ebp - andl %ebx,%edi movdqa %xmm9,%xmm8 + andl %ebx,%edi xorl %ecx,%ebx - addl %eax,%ebp psrld $30,%xmm9 - por %xmm10,%xmm6 + addl %eax,%ebp rorl $7,%eax + por %xmm10,%xmm6 xorl %ecx,%edi movl %ebp,%esi addl 44(%rsp),%edx pslld $2,%xmm8 pxor %xmm9,%xmm6 xorl %ebx,%eax + movdqa -32(%r14),%xmm9 roll $5,%ebp - movdqa 16(%r11),%xmm9 addl %edi,%edx andl %eax,%esi pxor %xmm8,%xmm6 xorl %ebx,%eax addl %ebp,%edx - movdqa %xmm4,%xmm7 rorl $7,%ebp + pshufd $238,%xmm3,%xmm7 xorl %ebx,%esi movdqa %xmm6,%xmm8 -.byte 102,15,58,15,251,8 + paddd %xmm6,%xmm9 movl %edx,%edi addl 48(%rsp),%ecx - paddd %xmm6,%xmm9 + punpcklqdq %xmm4,%xmm7 xorl %eax,%ebp roll $5,%edx - psrldq $4,%xmm8 addl %esi,%ecx + psrldq $4,%xmm8 andl %ebp,%edi - pxor %xmm3,%xmm7 xorl %eax,%ebp + pxor %xmm3,%xmm7 addl %edx,%ecx - pxor %xmm5,%xmm8 rorl $7,%edx + pxor %xmm5,%xmm8 xorl %eax,%edi movl %ecx,%esi addl 52(%rsp),%ebx @@ -1587,78 +1734,78 @@ L$oop_ssse3: addl %edi,%ebx andl %edx,%esi movdqa %xmm7,%xmm10 - movdqa %xmm7,%xmm8 xorl %ebp,%edx addl %ecx,%ebx rorl $7,%ecx + movdqa %xmm7,%xmm8 xorl %ebp,%esi pslldq $12,%xmm10 paddd %xmm7,%xmm7 movl %ebx,%edi addl 56(%rsp),%eax + psrld $31,%xmm8 xorl %edx,%ecx roll $5,%ebx - psrld $31,%xmm8 addl %esi,%eax - andl %ecx,%edi movdqa %xmm10,%xmm9 + andl %ecx,%edi xorl %edx,%ecx - addl %ebx,%eax psrld $30,%xmm10 - por %xmm8,%xmm7 + addl %ebx,%eax rorl $7,%ebx + por %xmm8,%xmm7 xorl %edx,%edi movl %eax,%esi addl 60(%rsp),%ebp pslld $2,%xmm9 pxor %xmm10,%xmm7 xorl %ecx,%ebx + movdqa -32(%r14),%xmm10 roll $5,%eax - movdqa 16(%r11),%xmm10 addl %edi,%ebp andl %ebx,%esi pxor %xmm9,%xmm7 + pshufd $238,%xmm6,%xmm9 xorl %ecx,%ebx addl %eax,%ebp - movdqa %xmm7,%xmm9 rorl $7,%eax pxor %xmm4,%xmm0 -.byte 102,68,15,58,15,206,8 xorl %ecx,%esi movl %ebp,%edi addl 0(%rsp),%edx - pxor %xmm1,%xmm0 + punpcklqdq %xmm7,%xmm9 xorl %ebx,%eax roll $5,%ebp - movdqa %xmm10,%xmm8 - paddd %xmm7,%xmm10 + pxor %xmm1,%xmm0 addl %esi,%edx andl %eax,%edi - pxor %xmm9,%xmm0 + movdqa %xmm10,%xmm8 xorl %ebx,%eax + paddd %xmm7,%xmm10 addl %ebp,%edx + pxor %xmm9,%xmm0 rorl 
$7,%ebp xorl %ebx,%edi - movdqa %xmm0,%xmm9 - movdqa %xmm10,48(%rsp) movl %edx,%esi addl 4(%rsp),%ecx + movdqa %xmm0,%xmm9 xorl %eax,%ebp roll $5,%edx - pslld $2,%xmm0 + movdqa %xmm10,48(%rsp) addl %edi,%ecx andl %ebp,%esi - psrld $30,%xmm9 xorl %eax,%ebp + pslld $2,%xmm0 addl %edx,%ecx rorl $7,%edx + psrld $30,%xmm9 xorl %eax,%esi movl %ecx,%edi addl 8(%rsp),%ebx por %xmm9,%xmm0 xorl %ebp,%edx roll $5,%ecx - movdqa %xmm0,%xmm10 + pshufd $238,%xmm7,%xmm10 addl %esi,%ebx andl %edx,%edi xorl %ebp,%edx @@ -1671,18 +1818,18 @@ L$oop_ssse3: xorl %edx,%esi rorl $7,%ecx addl %ebx,%eax - addl 16(%rsp),%ebp pxor %xmm5,%xmm1 -.byte 102,68,15,58,15,215,8 + addl 16(%rsp),%ebp xorl %ecx,%esi + punpcklqdq %xmm0,%xmm10 movl %eax,%edi roll $5,%eax pxor %xmm2,%xmm1 addl %esi,%ebp xorl %ecx,%edi movdqa %xmm8,%xmm9 - paddd %xmm0,%xmm8 rorl $7,%ebx + paddd %xmm0,%xmm8 addl %eax,%ebp pxor %xmm10,%xmm1 addl 20(%rsp),%edx @@ -1690,43 +1837,43 @@ L$oop_ssse3: movl %ebp,%esi roll $5,%ebp movdqa %xmm1,%xmm10 - movdqa %xmm8,0(%rsp) addl %edi,%edx xorl %ebx,%esi + movdqa %xmm8,0(%rsp) rorl $7,%eax addl %ebp,%edx - pslld $2,%xmm1 addl 24(%rsp),%ecx + pslld $2,%xmm1 xorl %eax,%esi - psrld $30,%xmm10 movl %edx,%edi + psrld $30,%xmm10 roll $5,%edx addl %esi,%ecx xorl %eax,%edi rorl $7,%ebp - addl %edx,%ecx por %xmm10,%xmm1 + addl %edx,%ecx addl 28(%rsp),%ebx + pshufd $238,%xmm0,%xmm8 xorl %ebp,%edi - movdqa %xmm1,%xmm8 movl %ecx,%esi roll $5,%ecx addl %edi,%ebx xorl %ebp,%esi rorl $7,%edx addl %ecx,%ebx - addl 32(%rsp),%eax pxor %xmm6,%xmm2 -.byte 102,68,15,58,15,192,8 + addl 32(%rsp),%eax xorl %edx,%esi + punpcklqdq %xmm1,%xmm8 movl %ebx,%edi roll $5,%ebx pxor %xmm3,%xmm2 addl %esi,%eax xorl %edx,%edi - movdqa 32(%r11),%xmm10 - paddd %xmm1,%xmm9 + movdqa 0(%r14),%xmm10 rorl $7,%ecx + paddd %xmm1,%xmm9 addl %ebx,%eax pxor %xmm8,%xmm2 addl 36(%rsp),%ebp @@ -1734,43 +1881,43 @@ L$oop_ssse3: movl %eax,%esi roll $5,%eax movdqa %xmm2,%xmm8 - movdqa %xmm9,16(%rsp) addl %edi,%ebp xorl %ecx,%esi + movdqa %xmm9,16(%rsp) rorl $7,%ebx addl %eax,%ebp - pslld $2,%xmm2 addl 40(%rsp),%edx + pslld $2,%xmm2 xorl %ebx,%esi - psrld $30,%xmm8 movl %ebp,%edi + psrld $30,%xmm8 roll $5,%ebp addl %esi,%edx xorl %ebx,%edi rorl $7,%eax - addl %ebp,%edx por %xmm8,%xmm2 + addl %ebp,%edx addl 44(%rsp),%ecx + pshufd $238,%xmm1,%xmm9 xorl %eax,%edi - movdqa %xmm2,%xmm9 movl %edx,%esi roll $5,%edx addl %edi,%ecx xorl %eax,%esi rorl $7,%ebp addl %edx,%ecx - addl 48(%rsp),%ebx pxor %xmm7,%xmm3 -.byte 102,68,15,58,15,201,8 + addl 48(%rsp),%ebx xorl %ebp,%esi + punpcklqdq %xmm2,%xmm9 movl %ecx,%edi roll $5,%ecx pxor %xmm4,%xmm3 addl %esi,%ebx xorl %ebp,%edi movdqa %xmm10,%xmm8 - paddd %xmm2,%xmm10 rorl $7,%edx + paddd %xmm2,%xmm10 addl %ecx,%ebx pxor %xmm9,%xmm3 addl 52(%rsp),%eax @@ -1778,43 +1925,43 @@ L$oop_ssse3: movl %ebx,%esi roll $5,%ebx movdqa %xmm3,%xmm9 - movdqa %xmm10,32(%rsp) addl %edi,%eax xorl %edx,%esi + movdqa %xmm10,32(%rsp) rorl $7,%ecx addl %ebx,%eax - pslld $2,%xmm3 addl 56(%rsp),%ebp + pslld $2,%xmm3 xorl %ecx,%esi - psrld $30,%xmm9 movl %eax,%edi + psrld $30,%xmm9 roll $5,%eax addl %esi,%ebp xorl %ecx,%edi rorl $7,%ebx - addl %eax,%ebp por %xmm9,%xmm3 + addl %eax,%ebp addl 60(%rsp),%edx + pshufd $238,%xmm2,%xmm10 xorl %ebx,%edi - movdqa %xmm3,%xmm10 movl %ebp,%esi roll $5,%ebp addl %edi,%edx xorl %ebx,%esi rorl $7,%eax addl %ebp,%edx - addl 0(%rsp),%ecx pxor %xmm0,%xmm4 -.byte 102,68,15,58,15,210,8 + addl 0(%rsp),%ecx xorl %eax,%esi + punpcklqdq %xmm3,%xmm10 movl %edx,%edi roll $5,%edx pxor %xmm5,%xmm4 addl %esi,%ecx xorl %eax,%edi 
movdqa %xmm8,%xmm9 - paddd %xmm3,%xmm8 rorl $7,%ebp + paddd %xmm3,%xmm8 addl %edx,%ecx pxor %xmm10,%xmm4 addl 4(%rsp),%ebx @@ -1822,43 +1969,43 @@ L$oop_ssse3: movl %ecx,%esi roll $5,%ecx movdqa %xmm4,%xmm10 - movdqa %xmm8,48(%rsp) addl %edi,%ebx xorl %ebp,%esi + movdqa %xmm8,48(%rsp) rorl $7,%edx addl %ecx,%ebx - pslld $2,%xmm4 addl 8(%rsp),%eax + pslld $2,%xmm4 xorl %edx,%esi - psrld $30,%xmm10 movl %ebx,%edi + psrld $30,%xmm10 roll $5,%ebx addl %esi,%eax xorl %edx,%edi rorl $7,%ecx - addl %ebx,%eax por %xmm10,%xmm4 + addl %ebx,%eax addl 12(%rsp),%ebp + pshufd $238,%xmm3,%xmm8 xorl %ecx,%edi - movdqa %xmm4,%xmm8 movl %eax,%esi roll $5,%eax addl %edi,%ebp xorl %ecx,%esi rorl $7,%ebx addl %eax,%ebp - addl 16(%rsp),%edx pxor %xmm1,%xmm5 -.byte 102,68,15,58,15,195,8 + addl 16(%rsp),%edx xorl %ebx,%esi + punpcklqdq %xmm4,%xmm8 movl %ebp,%edi roll $5,%ebp pxor %xmm6,%xmm5 addl %esi,%edx xorl %ebx,%edi movdqa %xmm9,%xmm10 - paddd %xmm4,%xmm9 rorl $7,%eax + paddd %xmm4,%xmm9 addl %ebp,%edx pxor %xmm8,%xmm5 addl 20(%rsp),%ecx @@ -1866,24 +2013,24 @@ L$oop_ssse3: movl %edx,%esi roll $5,%edx movdqa %xmm5,%xmm8 - movdqa %xmm9,0(%rsp) addl %edi,%ecx xorl %eax,%esi + movdqa %xmm9,0(%rsp) rorl $7,%ebp addl %edx,%ecx - pslld $2,%xmm5 addl 24(%rsp),%ebx + pslld $2,%xmm5 xorl %ebp,%esi - psrld $30,%xmm8 movl %ecx,%edi + psrld $30,%xmm8 roll $5,%ecx addl %esi,%ebx xorl %ebp,%edi rorl $7,%edx - addl %ecx,%ebx por %xmm8,%xmm5 + addl %ecx,%ebx addl 28(%rsp),%eax - movdqa %xmm5,%xmm9 + pshufd $238,%xmm4,%xmm9 rorl $7,%ecx movl %ebx,%esi xorl %edx,%edi @@ -1892,47 +2039,47 @@ L$oop_ssse3: xorl %ecx,%esi xorl %edx,%ecx addl %ebx,%eax - addl 32(%rsp),%ebp pxor %xmm2,%xmm6 -.byte 102,68,15,58,15,204,8 + addl 32(%rsp),%ebp andl %ecx,%esi xorl %edx,%ecx rorl $7,%ebx - pxor %xmm7,%xmm6 + punpcklqdq %xmm5,%xmm9 movl %eax,%edi xorl %ecx,%esi - movdqa %xmm10,%xmm8 - paddd %xmm5,%xmm10 + pxor %xmm7,%xmm6 roll $5,%eax addl %esi,%ebp - pxor %xmm9,%xmm6 + movdqa %xmm10,%xmm8 xorl %ebx,%edi + paddd %xmm5,%xmm10 xorl %ecx,%ebx + pxor %xmm9,%xmm6 addl %eax,%ebp addl 36(%rsp),%edx - movdqa %xmm6,%xmm9 - movdqa %xmm10,16(%rsp) andl %ebx,%edi xorl %ecx,%ebx rorl $7,%eax + movdqa %xmm6,%xmm9 movl %ebp,%esi - pslld $2,%xmm6 xorl %ebx,%edi + movdqa %xmm10,16(%rsp) roll $5,%ebp - psrld $30,%xmm9 addl %edi,%edx xorl %eax,%esi + pslld $2,%xmm6 xorl %ebx,%eax addl %ebp,%edx + psrld $30,%xmm9 addl 40(%rsp),%ecx andl %eax,%esi - por %xmm9,%xmm6 xorl %ebx,%eax + por %xmm9,%xmm6 rorl $7,%ebp - movdqa %xmm6,%xmm10 movl %edx,%edi xorl %eax,%esi roll $5,%edx + pshufd $238,%xmm5,%xmm10 addl %esi,%ecx xorl %ebp,%edi xorl %eax,%ebp @@ -1948,47 +2095,47 @@ L$oop_ssse3: xorl %edx,%esi xorl %ebp,%edx addl %ecx,%ebx - addl 48(%rsp),%eax pxor %xmm3,%xmm7 -.byte 102,68,15,58,15,213,8 + addl 48(%rsp),%eax andl %edx,%esi xorl %ebp,%edx rorl $7,%ecx - pxor %xmm0,%xmm7 + punpcklqdq %xmm6,%xmm10 movl %ebx,%edi xorl %edx,%esi - movdqa 48(%r11),%xmm9 - paddd %xmm6,%xmm8 + pxor %xmm0,%xmm7 roll $5,%ebx addl %esi,%eax - pxor %xmm10,%xmm7 + movdqa 32(%r14),%xmm9 xorl %ecx,%edi + paddd %xmm6,%xmm8 xorl %edx,%ecx + pxor %xmm10,%xmm7 addl %ebx,%eax addl 52(%rsp),%ebp - movdqa %xmm7,%xmm10 - movdqa %xmm8,32(%rsp) andl %ecx,%edi xorl %edx,%ecx rorl $7,%ebx + movdqa %xmm7,%xmm10 movl %eax,%esi - pslld $2,%xmm7 xorl %ecx,%edi + movdqa %xmm8,32(%rsp) roll $5,%eax - psrld $30,%xmm10 addl %edi,%ebp xorl %ebx,%esi + pslld $2,%xmm7 xorl %ecx,%ebx addl %eax,%ebp + psrld $30,%xmm10 addl 56(%rsp),%edx andl %ebx,%esi - por %xmm10,%xmm7 xorl %ecx,%ebx + por %xmm10,%xmm7 rorl 
$7,%eax - movdqa %xmm7,%xmm8 movl %ebp,%edi xorl %ebx,%esi roll $5,%ebp + pshufd $238,%xmm6,%xmm8 addl %esi,%edx xorl %eax,%edi xorl %ebx,%eax @@ -2004,47 +2151,47 @@ L$oop_ssse3: xorl %ebp,%esi xorl %eax,%ebp addl %edx,%ecx - addl 0(%rsp),%ebx pxor %xmm4,%xmm0 -.byte 102,68,15,58,15,198,8 + addl 0(%rsp),%ebx andl %ebp,%esi xorl %eax,%ebp rorl $7,%edx - pxor %xmm1,%xmm0 + punpcklqdq %xmm7,%xmm8 movl %ecx,%edi xorl %ebp,%esi - movdqa %xmm9,%xmm10 - paddd %xmm7,%xmm9 + pxor %xmm1,%xmm0 roll $5,%ecx addl %esi,%ebx - pxor %xmm8,%xmm0 + movdqa %xmm9,%xmm10 xorl %edx,%edi + paddd %xmm7,%xmm9 xorl %ebp,%edx + pxor %xmm8,%xmm0 addl %ecx,%ebx addl 4(%rsp),%eax - movdqa %xmm0,%xmm8 - movdqa %xmm9,48(%rsp) andl %edx,%edi xorl %ebp,%edx rorl $7,%ecx + movdqa %xmm0,%xmm8 movl %ebx,%esi - pslld $2,%xmm0 xorl %edx,%edi + movdqa %xmm9,48(%rsp) roll $5,%ebx - psrld $30,%xmm8 addl %edi,%eax xorl %ecx,%esi + pslld $2,%xmm0 xorl %edx,%ecx addl %ebx,%eax + psrld $30,%xmm8 addl 8(%rsp),%ebp andl %ecx,%esi - por %xmm8,%xmm0 xorl %edx,%ecx + por %xmm8,%xmm0 rorl $7,%ebx - movdqa %xmm0,%xmm9 movl %eax,%edi xorl %ecx,%esi roll $5,%eax + pshufd $238,%xmm7,%xmm9 addl %esi,%ebp xorl %ebx,%edi xorl %ecx,%ebx @@ -2060,47 +2207,47 @@ L$oop_ssse3: xorl %eax,%esi xorl %ebx,%eax addl %ebp,%edx - addl 16(%rsp),%ecx pxor %xmm5,%xmm1 -.byte 102,68,15,58,15,207,8 + addl 16(%rsp),%ecx andl %eax,%esi xorl %ebx,%eax rorl $7,%ebp - pxor %xmm2,%xmm1 + punpcklqdq %xmm0,%xmm9 movl %edx,%edi xorl %eax,%esi - movdqa %xmm10,%xmm8 - paddd %xmm0,%xmm10 + pxor %xmm2,%xmm1 roll $5,%edx addl %esi,%ecx - pxor %xmm9,%xmm1 + movdqa %xmm10,%xmm8 xorl %ebp,%edi + paddd %xmm0,%xmm10 xorl %eax,%ebp + pxor %xmm9,%xmm1 addl %edx,%ecx addl 20(%rsp),%ebx - movdqa %xmm1,%xmm9 - movdqa %xmm10,0(%rsp) andl %ebp,%edi xorl %eax,%ebp rorl $7,%edx + movdqa %xmm1,%xmm9 movl %ecx,%esi - pslld $2,%xmm1 xorl %ebp,%edi + movdqa %xmm10,0(%rsp) roll $5,%ecx - psrld $30,%xmm9 addl %edi,%ebx xorl %edx,%esi + pslld $2,%xmm1 xorl %ebp,%edx addl %ecx,%ebx + psrld $30,%xmm9 addl 24(%rsp),%eax andl %edx,%esi - por %xmm9,%xmm1 xorl %ebp,%edx + por %xmm9,%xmm1 rorl $7,%ecx - movdqa %xmm1,%xmm10 movl %ebx,%edi xorl %edx,%esi roll $5,%ebx + pshufd $238,%xmm0,%xmm10 addl %esi,%eax xorl %ecx,%edi xorl %edx,%ecx @@ -2116,47 +2263,47 @@ L$oop_ssse3: xorl %ebx,%esi xorl %ecx,%ebx addl %eax,%ebp - addl 32(%rsp),%edx pxor %xmm6,%xmm2 -.byte 102,68,15,58,15,208,8 + addl 32(%rsp),%edx andl %ebx,%esi xorl %ecx,%ebx rorl $7,%eax - pxor %xmm3,%xmm2 + punpcklqdq %xmm1,%xmm10 movl %ebp,%edi xorl %ebx,%esi - movdqa %xmm8,%xmm9 - paddd %xmm1,%xmm8 + pxor %xmm3,%xmm2 roll $5,%ebp addl %esi,%edx - pxor %xmm10,%xmm2 + movdqa %xmm8,%xmm9 xorl %eax,%edi + paddd %xmm1,%xmm8 xorl %ebx,%eax + pxor %xmm10,%xmm2 addl %ebp,%edx addl 36(%rsp),%ecx - movdqa %xmm2,%xmm10 - movdqa %xmm8,16(%rsp) andl %eax,%edi xorl %ebx,%eax rorl $7,%ebp + movdqa %xmm2,%xmm10 movl %edx,%esi - pslld $2,%xmm2 xorl %eax,%edi + movdqa %xmm8,16(%rsp) roll $5,%edx - psrld $30,%xmm10 addl %edi,%ecx xorl %ebp,%esi + pslld $2,%xmm2 xorl %eax,%ebp addl %edx,%ecx + psrld $30,%xmm10 addl 40(%rsp),%ebx andl %ebp,%esi - por %xmm10,%xmm2 xorl %eax,%ebp + por %xmm10,%xmm2 rorl $7,%edx - movdqa %xmm2,%xmm8 movl %ecx,%edi xorl %ebp,%esi roll $5,%ecx + pshufd $238,%xmm1,%xmm8 addl %esi,%ebx xorl %edx,%edi xorl %ebp,%edx @@ -2171,18 +2318,18 @@ L$oop_ssse3: addl %edi,%eax xorl %edx,%esi addl %ebx,%eax - addl 48(%rsp),%ebp pxor %xmm7,%xmm3 -.byte 102,68,15,58,15,193,8 + addl 48(%rsp),%ebp xorl %ecx,%esi + punpcklqdq %xmm2,%xmm8 movl %eax,%edi 
roll $5,%eax pxor %xmm4,%xmm3 addl %esi,%ebp xorl %ecx,%edi movdqa %xmm9,%xmm10 - paddd %xmm2,%xmm9 rorl $7,%ebx + paddd %xmm2,%xmm9 addl %eax,%ebp pxor %xmm8,%xmm3 addl 52(%rsp),%edx @@ -2190,22 +2337,22 @@ L$oop_ssse3: movl %ebp,%esi roll $5,%ebp movdqa %xmm3,%xmm8 - movdqa %xmm9,32(%rsp) addl %edi,%edx xorl %ebx,%esi + movdqa %xmm9,32(%rsp) rorl $7,%eax addl %ebp,%edx - pslld $2,%xmm3 addl 56(%rsp),%ecx + pslld $2,%xmm3 xorl %eax,%esi - psrld $30,%xmm8 movl %edx,%edi + psrld $30,%xmm8 roll $5,%edx addl %esi,%ecx xorl %eax,%edi rorl $7,%ebp - addl %edx,%ecx por %xmm8,%xmm3 + addl %edx,%ecx addl 60(%rsp),%ebx xorl %ebp,%edi movl %ecx,%esi @@ -2215,13 +2362,13 @@ L$oop_ssse3: rorl $7,%edx addl %ecx,%ebx addl 0(%rsp),%eax - paddd %xmm3,%xmm10 xorl %edx,%esi movl %ebx,%edi roll $5,%ebx + paddd %xmm3,%xmm10 addl %esi,%eax - movdqa %xmm10,48(%rsp) xorl %edx,%edi + movdqa %xmm10,48(%rsp) rorl $7,%ecx addl %ebx,%eax addl 4(%rsp),%ebp @@ -2250,8 +2397,8 @@ L$oop_ssse3: addl %edx,%ecx cmpq %r10,%r9 je L$done_ssse3 - movdqa 64(%r11),%xmm6 - movdqa 0(%r11),%xmm9 + movdqa 64(%r14),%xmm6 + movdqa -64(%r14),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 @@ -2260,23 +2407,23 @@ L$oop_ssse3: addq $64,%r9 addl 16(%rsp),%ebx xorl %ebp,%esi -.byte 102,15,56,0,206 movl %ecx,%edi +.byte 102,15,56,0,206 roll $5,%ecx - paddd %xmm9,%xmm0 addl %esi,%ebx xorl %ebp,%edi rorl $7,%edx + paddd %xmm9,%xmm0 addl %ecx,%ebx - movdqa %xmm0,0(%rsp) addl 20(%rsp),%eax xorl %edx,%edi - psubd %xmm9,%xmm0 movl %ebx,%esi + movdqa %xmm0,0(%rsp) roll $5,%ebx addl %edi,%eax xorl %edx,%esi rorl $7,%ecx + psubd %xmm9,%xmm0 addl %ebx,%eax addl 24(%rsp),%ebp xorl %ecx,%esi @@ -2296,23 +2443,23 @@ L$oop_ssse3: addl %ebp,%edx addl 32(%rsp),%ecx xorl %eax,%esi -.byte 102,15,56,0,214 movl %edx,%edi +.byte 102,15,56,0,214 roll $5,%edx - paddd %xmm9,%xmm1 addl %esi,%ecx xorl %eax,%edi rorl $7,%ebp + paddd %xmm9,%xmm1 addl %edx,%ecx - movdqa %xmm1,16(%rsp) addl 36(%rsp),%ebx xorl %ebp,%edi - psubd %xmm9,%xmm1 movl %ecx,%esi + movdqa %xmm1,16(%rsp) roll $5,%ecx addl %edi,%ebx xorl %ebp,%esi rorl $7,%edx + psubd %xmm9,%xmm1 addl %ecx,%ebx addl 40(%rsp),%eax xorl %edx,%esi @@ -2332,23 +2479,23 @@ L$oop_ssse3: addl %eax,%ebp addl 48(%rsp),%edx xorl %ebx,%esi -.byte 102,15,56,0,222 movl %ebp,%edi +.byte 102,15,56,0,222 roll $5,%ebp - paddd %xmm9,%xmm2 addl %esi,%edx xorl %ebx,%edi rorl $7,%eax + paddd %xmm9,%xmm2 addl %ebp,%edx - movdqa %xmm2,32(%rsp) addl 52(%rsp),%ecx xorl %eax,%edi - psubd %xmm9,%xmm2 movl %edx,%esi + movdqa %xmm2,32(%rsp) roll $5,%edx addl %edi,%ecx xorl %eax,%esi rorl $7,%ebp + psubd %xmm9,%xmm2 addl %edx,%ecx addl 56(%rsp),%ebx xorl %ebp,%esi @@ -2488,21 +2635,2856 @@ L$done_ssse3: movl %ecx,8(%r8) movl %edx,12(%r8) movl %ebp,16(%r8) - leaq 64(%rsp),%rsi - movq 0(%rsi),%r12 - movq 8(%rsi),%rbp - movq 16(%rsi),%rbx - leaq 24(%rsi),%rsp + movq -40(%r11),%r14 + + movq -32(%r11),%r13 + + movq -24(%r11),%r12 + + movq -16(%r11),%rbp + + movq -8(%r11),%rbx + + leaq (%r11),%rsp + L$epilogue_ssse3: .byte 0xf3,0xc3 + + +.p2align 4 +sha1_block_data_order_avx: +_avx_shortcut: + + movq %rsp,%r11 + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + leaq -64(%rsp),%rsp + vzeroupper + andq $-64,%rsp + movq %rdi,%r8 + movq %rsi,%r9 + movq %rdx,%r10 + + shlq $6,%r10 + addq %r9,%r10 + leaq K_XX_XX+64(%rip),%r14 + + movl 0(%r8),%eax + movl 4(%r8),%ebx + movl 8(%r8),%ecx + movl 12(%r8),%edx + movl %ebx,%esi + movl 16(%r8),%ebp + movl %ecx,%edi + xorl %edx,%edi + andl %edi,%esi + + vmovdqa 
64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 + vmovdqu 0(%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r9 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm11,%xmm0,%xmm4 + vpaddd %xmm11,%xmm1,%xmm5 + vpaddd %xmm11,%xmm2,%xmm6 + vmovdqa %xmm4,0(%rsp) + vmovdqa %xmm5,16(%rsp) + vmovdqa %xmm6,32(%rsp) + jmp L$oop_avx +.p2align 4 +L$oop_avx: + shrdl $2,%ebx,%ebx + xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%edi + addl 0(%rsp),%ebp + vpaddd %xmm3,%xmm11,%xmm9 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm8 + addl %esi,%ebp + andl %ebx,%edi + vpxor %xmm0,%xmm4,%xmm4 + xorl %ecx,%ebx + addl %eax,%ebp + vpxor %xmm2,%xmm8,%xmm8 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 4(%rsp),%edx + vpxor %xmm8,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vmovdqa %xmm9,48(%rsp) + addl %edi,%edx + andl %eax,%esi + vpsrld $31,%xmm4,%xmm8 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm10 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%edi + addl 8(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm4,%xmm4 + addl %esi,%ecx + andl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm4,%xmm4 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 12(%rsp),%ebx + vpxor %xmm10,%xmm4,%xmm4 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %edi,%ebx + andl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpalignr $8,%xmm1,%xmm2,%xmm5 + movl %ebx,%edi + addl 16(%rsp),%eax + vpaddd %xmm4,%xmm11,%xmm9 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm8 + addl %esi,%eax + andl %ecx,%edi + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm8,%xmm8 + shrdl $7,%ebx,%ebx + xorl %edx,%edi + movl %eax,%esi + addl 20(%rsp),%ebp + vpxor %xmm8,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vmovdqa %xmm9,0(%rsp) + addl %edi,%ebp + andl %ebx,%esi + vpsrld $31,%xmm5,%xmm8 + xorl %ecx,%ebx + addl %eax,%ebp + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm10 + vpaddd %xmm5,%xmm5,%xmm5 + movl %ebp,%edi + addl 24(%rsp),%edx + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm5,%xmm5 + addl %esi,%edx + andl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm5,%xmm5 + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + movl %edx,%esi + addl 28(%rsp),%ecx + vpxor %xmm10,%xmm5,%xmm5 + xorl %eax,%ebp + shldl $5,%edx,%edx + vmovdqa -32(%r14),%xmm11 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 + movl %ecx,%edi + addl 32(%rsp),%ebx + vpaddd %xmm5,%xmm11,%xmm9 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm8 + addl %esi,%ebx + andl %edx,%edi + vpxor %xmm2,%xmm6,%xmm6 + xorl %ebp,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm8,%xmm8 + shrdl $7,%ecx,%ecx + xorl %ebp,%edi + movl %ebx,%esi + addl 36(%rsp),%eax + vpxor %xmm8,%xmm6,%xmm6 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vmovdqa %xmm9,16(%rsp) + addl %edi,%eax + andl %ecx,%esi + vpsrld $31,%xmm6,%xmm8 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm10 + vpaddd %xmm6,%xmm6,%xmm6 + movl %eax,%edi + addl 40(%rsp),%ebp + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm6,%xmm6 + addl %esi,%ebp + andl 
%ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 44(%rsp),%edx + vpxor %xmm10,%xmm6,%xmm6 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + addl %edi,%edx + andl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%edi + addl 48(%rsp),%ecx + vpaddd %xmm6,%xmm11,%xmm9 + xorl %eax,%ebp + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm8 + addl %esi,%ecx + andl %ebp,%edi + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%ebp + addl %edx,%ecx + vpxor %xmm5,%xmm8,%xmm8 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 52(%rsp),%ebx + vpxor %xmm8,%xmm7,%xmm7 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vmovdqa %xmm9,32(%rsp) + addl %edi,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm8 + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpslldq $12,%xmm7,%xmm10 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%edi + addl 56(%rsp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm7,%xmm7 + addl %esi,%eax + andl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + xorl %edx,%edi + movl %eax,%esi + addl 60(%rsp),%ebp + vpxor %xmm10,%xmm7,%xmm7 + xorl %ecx,%ebx + shldl $5,%eax,%eax + addl %edi,%ebp + andl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %ebp,%edi + addl 0(%rsp),%edx + vpxor %xmm1,%xmm0,%xmm0 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpaddd %xmm7,%xmm11,%xmm9 + addl %esi,%edx + andl %eax,%edi + vpxor %xmm8,%xmm0,%xmm0 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + movl %edx,%esi + addl 4(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + movl %ecx,%edi + addl 8(%rsp),%ebx + vpor %xmm8,%xmm0,%xmm0 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %esi,%ebx + andl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 12(%rsp),%eax + xorl %ebp,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm0,%xmm11,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm1,%xmm1 + addl 20(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm1,%xmm1 + addl 28(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + addl %esi,%eax + xorl %edx,%edi + vpaddd %xmm1,%xmm11,%xmm9 + vmovdqa 0(%r14),%xmm11 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm8,%xmm2,%xmm2 + addl 36(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + 
shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpslld $2,%xmm2,%xmm2 + addl 40(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpor %xmm8,%xmm2,%xmm2 + addl 44(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebx + xorl %ebp,%edi + vpaddd %xmm2,%xmm11,%xmm9 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpalignr $8,%xmm2,%xmm3,%xmm8 + vpxor %xmm0,%xmm4,%xmm4 + addl 0(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + addl %esi,%ecx + xorl %eax,%edi + vpaddd %xmm3,%xmm11,%xmm9 + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpxor %xmm8,%xmm4,%xmm4 + addl 4(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm8 + vmovdqa %xmm9,48(%rsp) + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 + addl 8(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm8,%xmm4,%xmm4 + addl 12(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm3,%xmm4,%xmm8 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpxor %xmm6,%xmm5,%xmm5 + addl %esi,%edx + xorl %ebx,%edi + vpaddd %xmm4,%xmm11,%xmm9 + shrdl $7,%eax,%eax + addl %ebp,%edx + vpxor %xmm8,%xmm5,%xmm5 + addl 20(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor %xmm8,%xmm5,%xmm5 + addl 28(%rsp),%eax + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm8 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%rsp),%ebp + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + movl %eax,%edi + xorl %ecx,%esi + vpaddd %xmm5,%xmm11,%xmm9 + shldl $5,%eax,%eax + addl %esi,%ebp + vpxor %xmm8,%xmm6,%xmm6 + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 36(%rsp),%edx + vpsrld $30,%xmm6,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl 
%eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + addl 40(%rsp),%ecx + andl %eax,%esi + vpor %xmm8,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%edi + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 44(%rsp),%ebx + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%esi + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm8 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%rsp),%eax + andl %edx,%esi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + movl %ebx,%edi + xorl %edx,%esi + vpaddd %xmm6,%xmm11,%xmm9 + vmovdqa 32(%r14),%xmm11 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm8,%xmm7,%xmm7 + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%rsp),%ebp + vpsrld $30,%xmm7,%xmm8 + vmovdqa %xmm9,32(%rsp) + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + vpslld $2,%xmm7,%xmm7 + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + addl 56(%rsp),%edx + andl %ebx,%esi + vpor %xmm8,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%edi + xorl %ebx,%esi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 60(%rsp),%ecx + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + addl 0(%rsp),%ebx + andl %ebp,%esi + xorl %eax,%ebp + shrdl $7,%edx,%edx + vpxor %xmm1,%xmm0,%xmm0 + movl %ecx,%edi + xorl %ebp,%esi + vpaddd %xmm7,%xmm11,%xmm9 + shldl $5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm8,%xmm0,%xmm0 + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 4(%rsp),%eax + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%rsp),%ebp + andl %ecx,%esi + vpor %xmm8,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%edi + xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 12(%rsp),%edx + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + vpxor %xmm2,%xmm1,%xmm1 + movl %edx,%edi + xorl %eax,%esi + vpaddd %xmm0,%xmm11,%xmm9 + shldl $5,%edx,%edx + addl %esi,%ecx + vpxor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 20(%rsp),%ebx + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + addl 24(%rsp),%eax + andl %edx,%esi + vpor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%edi + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%rsp),%ebp + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi 
+ xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + movl %ebp,%edi + xorl %ebx,%esi + vpaddd %xmm1,%xmm11,%xmm9 + shldl $5,%ebp,%ebp + addl %esi,%edx + vpxor %xmm8,%xmm2,%xmm2 + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 36(%rsp),%ecx + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + addl 40(%rsp),%ebx + andl %ebp,%esi + vpor %xmm8,%xmm2,%xmm2 + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%edi + xorl %ebp,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 44(%rsp),%eax + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm2,%xmm11,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 0(%rsp),%eax + vpaddd %xmm3,%xmm11,%xmm9 + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + vmovdqa %xmm9,48(%rsp) + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 4(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 8(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 12(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + cmpq %r10,%r9 + je L$done_avx + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 + vmovdqu 0(%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r9 + addl 16(%rsp),%ebx + xorl %ebp,%esi + vpshufb %xmm6,%xmm1,%xmm1 + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpaddd %xmm11,%xmm0,%xmm4 + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm4,0(%rsp) + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + vpshufb %xmm6,%xmm2,%xmm2 + movl %edx,%edi + shldl $5,%edx,%edx + vpaddd %xmm11,%xmm1,%xmm5 + addl 
%esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vmovdqa %xmm5,16(%rsp) + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + vpshufb %xmm6,%xmm3,%xmm3 + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpaddd %xmm11,%xmm2,%xmm6 + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vmovdqa %xmm6,32(%rsp) + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + addl 12(%r8),%edx + movl %eax,0(%r8) + addl 16(%r8),%ebp + movl %esi,4(%r8) + movl %esi,%ebx + movl %ecx,8(%r8) + movl %ecx,%edi + movl %edx,12(%r8) + xorl %edx,%edi + movl %ebp,16(%r8) + andl %edi,%esi + jmp L$oop_avx + +.p2align 4 +L$done_avx: + addl 16(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vzeroupper + + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + movl %eax,0(%r8) + addl 12(%r8),%edx + movl %esi,4(%r8) + addl 16(%r8),%ebp + movl %ecx,8(%r8) + movl %edx,12(%r8) + movl %ebp,16(%r8) + movq -40(%r11),%r14 + + movq -32(%r11),%r13 + + movq -24(%r11),%r12 + + movq 
-16(%r11),%rbp + + movq -8(%r11),%rbx + + leaq (%r11),%rsp + +L$epilogue_avx: + .byte 0xf3,0xc3 + + + +.p2align 4 +sha1_block_data_order_avx2: +_avx2_shortcut: + + movq %rsp,%r11 + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + vzeroupper + movq %rdi,%r8 + movq %rsi,%r9 + movq %rdx,%r10 + + leaq -640(%rsp),%rsp + shlq $6,%r10 + leaq 64(%r9),%r13 + andq $-128,%rsp + addq %r9,%r10 + leaq K_XX_XX+64(%rip),%r14 + + movl 0(%r8),%eax + cmpq %r10,%r13 + cmovaeq %r9,%r13 + movl 4(%r8),%ebp + movl 8(%r8),%ecx + movl 12(%r8),%edx + movl 16(%r8),%esi + vmovdqu 64(%r14),%ymm6 + + vmovdqu (%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + leaq 64(%r9),%r9 + vinserti128 $1,(%r13),%ymm0,%ymm0 + vinserti128 $1,16(%r13),%ymm1,%ymm1 + vpshufb %ymm6,%ymm0,%ymm0 + vinserti128 $1,32(%r13),%ymm2,%ymm2 + vpshufb %ymm6,%ymm1,%ymm1 + vinserti128 $1,48(%r13),%ymm3,%ymm3 + vpshufb %ymm6,%ymm2,%ymm2 + vmovdqu -64(%r14),%ymm11 + vpshufb %ymm6,%ymm3,%ymm3 + + vpaddd %ymm11,%ymm0,%ymm4 + vpaddd %ymm11,%ymm1,%ymm5 + vmovdqu %ymm4,0(%rsp) + vpaddd %ymm11,%ymm2,%ymm6 + vmovdqu %ymm5,32(%rsp) + vpaddd %ymm11,%ymm3,%ymm7 + vmovdqu %ymm6,64(%rsp) + vmovdqu %ymm7,96(%rsp) + vpalignr $8,%ymm0,%ymm1,%ymm4 + vpsrldq $4,%ymm3,%ymm8 + vpxor %ymm0,%ymm4,%ymm4 + vpxor %ymm2,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $31,%ymm4,%ymm8 + vpslldq $12,%ymm4,%ymm10 + vpaddd %ymm4,%ymm4,%ymm4 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm4,%ymm4 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm4,%ymm4 + vpxor %ymm10,%ymm4,%ymm4 + vpaddd %ymm11,%ymm4,%ymm9 + vmovdqu %ymm9,128(%rsp) + vpalignr $8,%ymm1,%ymm2,%ymm5 + vpsrldq $4,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm3,%ymm8,%ymm8 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $31,%ymm5,%ymm8 + vmovdqu -32(%r14),%ymm11 + vpslldq $12,%ymm5,%ymm10 + vpaddd %ymm5,%ymm5,%ymm5 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm5,%ymm5 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm10,%ymm5,%ymm5 + vpaddd %ymm11,%ymm5,%ymm9 + vmovdqu %ymm9,160(%rsp) + vpalignr $8,%ymm2,%ymm3,%ymm6 + vpsrldq $4,%ymm5,%ymm8 + vpxor %ymm2,%ymm6,%ymm6 + vpxor %ymm4,%ymm8,%ymm8 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $31,%ymm6,%ymm8 + vpslldq $12,%ymm6,%ymm10 + vpaddd %ymm6,%ymm6,%ymm6 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm6,%ymm6 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm6,%ymm6 + vpxor %ymm10,%ymm6,%ymm6 + vpaddd %ymm11,%ymm6,%ymm9 + vmovdqu %ymm9,192(%rsp) + vpalignr $8,%ymm3,%ymm4,%ymm7 + vpsrldq $4,%ymm6,%ymm8 + vpxor %ymm3,%ymm7,%ymm7 + vpxor %ymm5,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm7,%ymm8 + vpslldq $12,%ymm7,%ymm10 + vpaddd %ymm7,%ymm7,%ymm7 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm7,%ymm7 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm7,%ymm7 + vpxor %ymm10,%ymm7,%ymm7 + vpaddd %ymm11,%ymm7,%ymm9 + vmovdqu %ymm9,224(%rsp) + leaq 128(%rsp),%r13 + jmp L$oop_avx2 +.p2align 5 +L$oop_avx2: + rorxl $2,%ebp,%ebx + andnl %edx,%ebp,%edi + andl %ecx,%ebp + xorl %edi,%ebp + jmp L$align32_1 +.p2align 5 +L$align32_1: + vpalignr $8,%ymm6,%ymm7,%ymm8 + vpxor %ymm4,%ymm0,%ymm0 + addl -128(%r13),%esi + andnl %ecx,%eax,%edi + vpxor %ymm1,%ymm0,%ymm0 + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + vpxor %ymm8,%ymm0,%ymm0 + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + vpsrld $30,%ymm0,%ymm8 + vpslld $2,%ymm0,%ymm0 + addl -124(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + vpor %ymm8,%ymm0,%ymm0 + addl %r12d,%edx + xorl %edi,%esi + addl -120(%r13),%ecx + 
andnl %ebp,%edx,%edi + vpaddd %ymm11,%ymm0,%ymm9 + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + vmovdqu %ymm9,256(%rsp) + addl %r12d,%ecx + xorl %edi,%edx + addl -116(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl -96(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + vpalignr $8,%ymm7,%ymm0,%ymm8 + vpxor %ymm5,%ymm1,%ymm1 + addl -92(%r13),%eax + andnl %edx,%ebp,%edi + vpxor %ymm2,%ymm1,%ymm1 + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + vpxor %ymm8,%ymm1,%ymm1 + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + vpsrld $30,%ymm1,%ymm8 + vpslld $2,%ymm1,%ymm1 + addl -88(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + vpor %ymm8,%ymm1,%ymm1 + addl %r12d,%esi + xorl %edi,%eax + addl -84(%r13),%edx + andnl %ebx,%esi,%edi + vpaddd %ymm11,%ymm1,%ymm9 + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + vmovdqu %ymm9,288(%rsp) + addl %r12d,%edx + xorl %edi,%esi + addl -64(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -60(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + vpalignr $8,%ymm0,%ymm1,%ymm8 + vpxor %ymm6,%ymm2,%ymm2 + addl -56(%r13),%ebp + andnl %esi,%ebx,%edi + vpxor %ymm3,%ymm2,%ymm2 + vmovdqu 0(%r14),%ymm11 + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + vpxor %ymm8,%ymm2,%ymm2 + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + vpsrld $30,%ymm2,%ymm8 + vpslld $2,%ymm2,%ymm2 + addl -52(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + vpor %ymm8,%ymm2,%ymm2 + addl %r12d,%eax + xorl %edi,%ebp + addl -32(%r13),%esi + andnl %ecx,%eax,%edi + vpaddd %ymm11,%ymm2,%ymm9 + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + vmovdqu %ymm9,320(%rsp) + addl %r12d,%esi + xorl %edi,%eax + addl -28(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -24(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + vpalignr $8,%ymm1,%ymm2,%ymm8 + vpxor %ymm7,%ymm3,%ymm3 + addl -20(%r13),%ebx + andnl %eax,%ecx,%edi + vpxor %ymm4,%ymm3,%ymm3 + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + vpxor %ymm8,%ymm3,%ymm3 + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + vpsrld $30,%ymm3,%ymm8 + vpslld $2,%ymm3,%ymm3 + addl 0(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + vpor %ymm8,%ymm3,%ymm3 + addl %r12d,%ebp + xorl %edi,%ebx + addl 4(%r13),%eax + andnl %edx,%ebp,%edi + vpaddd %ymm11,%ymm3,%ymm9 + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + vmovdqu %ymm9,352(%rsp) + addl %r12d,%eax + xorl %edi,%ebp + addl 8(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl 12(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl 
%ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + vpalignr $8,%ymm2,%ymm3,%ymm8 + vpxor %ymm0,%ymm4,%ymm4 + addl 32(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + vpxor %ymm5,%ymm4,%ymm4 + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + vpxor %ymm8,%ymm4,%ymm4 + addl %r12d,%ecx + xorl %ebp,%edx + addl 36(%r13),%ebx + vpsrld $30,%ymm4,%ymm8 + vpslld $2,%ymm4,%ymm4 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + vpor %ymm8,%ymm4,%ymm4 + addl 40(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + vpaddd %ymm11,%ymm4,%ymm9 + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 44(%r13),%eax + vmovdqu %ymm9,384(%rsp) + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl 64(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vpalignr $8,%ymm3,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + addl 68(%r13),%edx + leal (%rdx,%rax,1),%edx + vpxor %ymm6,%ymm5,%ymm5 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + vpxor %ymm8,%ymm5,%ymm5 + addl %r12d,%edx + xorl %ebx,%esi + addl 72(%r13),%ecx + vpsrld $30,%ymm5,%ymm8 + vpslld $2,%ymm5,%ymm5 + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + vpor %ymm8,%ymm5,%ymm5 + addl 76(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + vpaddd %ymm11,%ymm5,%ymm9 + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl 96(%r13),%ebp + vmovdqu %ymm9,416(%rsp) + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 100(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpalignr $8,%ymm4,%ymm5,%ymm8 + vpxor %ymm2,%ymm6,%ymm6 + addl 104(%r13),%esi + leal (%rsi,%rbp,1),%esi + vpxor %ymm7,%ymm6,%ymm6 + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + vpxor %ymm8,%ymm6,%ymm6 + addl %r12d,%esi + xorl %ecx,%eax + addl 108(%r13),%edx + leaq 256(%r13),%r13 + vpsrld $30,%ymm6,%ymm8 + vpslld $2,%ymm6,%ymm6 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + vpor %ymm8,%ymm6,%ymm6 + addl -128(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + vpaddd %ymm11,%ymm6,%ymm9 + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -124(%r13),%ebx + vmovdqu %ymm9,448(%rsp) + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -120(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vpalignr $8,%ymm5,%ymm6,%ymm8 + vpxor %ymm3,%ymm7,%ymm7 + addl -116(%r13),%eax + leal (%rax,%rbx,1),%eax + vpxor %ymm0,%ymm7,%ymm7 + vmovdqu 32(%r14),%ymm11 + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + vpxor %ymm8,%ymm7,%ymm7 + addl %r12d,%eax + xorl %edx,%ebp + addl -96(%r13),%esi + vpsrld $30,%ymm7,%ymm8 + vpslld $2,%ymm7,%ymm7 + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vpor %ymm8,%ymm7,%ymm7 + addl -92(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpaddd %ymm11,%ymm7,%ymm9 
+ xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -88(%r13),%ecx + vmovdqu %ymm9,480(%rsp) + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -84(%r13),%ebx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + jmp L$align32_2 +.p2align 5 +L$align32_2: + vpalignr $8,%ymm6,%ymm7,%ymm8 + vpxor %ymm4,%ymm0,%ymm0 + addl -64(%r13),%ebp + xorl %esi,%ecx + vpxor %ymm1,%ymm0,%ymm0 + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + vpxor %ymm8,%ymm0,%ymm0 + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + vpsrld $30,%ymm0,%ymm8 + vpslld $2,%ymm0,%ymm0 + addl %r12d,%ebp + andl %edi,%ebx + addl -60(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + vpor %ymm8,%ymm0,%ymm0 + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + vpaddd %ymm11,%ymm0,%ymm9 + addl %r12d,%eax + andl %edi,%ebp + addl -56(%r13),%esi + xorl %ecx,%ebp + vmovdqu %ymm9,512(%rsp) + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl -52(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + addl -32(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + andl %edi,%edx + vpalignr $8,%ymm7,%ymm0,%ymm8 + vpxor %ymm5,%ymm1,%ymm1 + addl -28(%r13),%ebx + xorl %eax,%edx + vpxor %ymm2,%ymm1,%ymm1 + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + vpxor %ymm8,%ymm1,%ymm1 + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + vpsrld $30,%ymm1,%ymm8 + vpslld $2,%ymm1,%ymm1 + addl %r12d,%ebx + andl %edi,%ecx + addl -24(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + vpor %ymm8,%ymm1,%ymm1 + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + vpaddd %ymm11,%ymm1,%ymm9 + addl %r12d,%ebp + andl %edi,%ebx + addl -20(%r13),%eax + xorl %edx,%ebx + vmovdqu %ymm9,544(%rsp) + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 0(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl 4(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + vpalignr $8,%ymm0,%ymm1,%ymm8 + vpxor %ymm6,%ymm2,%ymm2 + addl 8(%r13),%ecx + xorl %ebp,%esi + vpxor %ymm3,%ymm2,%ymm2 + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + vpxor %ymm8,%ymm2,%ymm2 + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + vpsrld $30,%ymm2,%ymm8 + vpslld $2,%ymm2,%ymm2 + addl %r12d,%ecx + andl %edi,%edx + addl 12(%r13),%ebx + xorl %eax,%edx + movl %esi,%edi + xorl %eax,%edi + vpor %ymm8,%ymm2,%ymm2 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + vpaddd %ymm11,%ymm2,%ymm9 + addl %r12d,%ebx + andl %edi,%ecx + addl 32(%r13),%ebp + xorl %esi,%ecx + vmovdqu %ymm9,576(%rsp) + movl %edx,%edi + xorl 
%esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 36(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 40(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + vpalignr $8,%ymm1,%ymm2,%ymm8 + vpxor %ymm7,%ymm3,%ymm3 + addl 44(%r13),%edx + xorl %ebx,%eax + vpxor %ymm4,%ymm3,%ymm3 + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + vpxor %ymm8,%ymm3,%ymm3 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + vpsrld $30,%ymm3,%ymm8 + vpslld $2,%ymm3,%ymm3 + addl %r12d,%edx + andl %edi,%esi + addl 64(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + vpor %ymm8,%ymm3,%ymm3 + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + vpaddd %ymm11,%ymm3,%ymm9 + addl %r12d,%ecx + andl %edi,%edx + addl 68(%r13),%ebx + xorl %eax,%edx + vmovdqu %ymm9,608(%rsp) + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + addl 72(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 76(%r13),%eax + xorl %edx,%ebx + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl 96(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl 100(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl 104(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl 108(%r13),%ebx + leaq 256(%r13),%r13 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -128(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -124(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -120(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -116(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -96(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -92(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -88(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -84(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -64(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl 
%ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -60(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -56(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -52(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -32(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -28(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -24(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -20(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + addl %r12d,%edx + leaq 128(%r9),%r13 + leaq 128(%r9),%rdi + cmpq %r10,%r13 + cmovaeq %r9,%r13 + + + addl 0(%r8),%edx + addl 4(%r8),%esi + addl 8(%r8),%ebp + movl %edx,0(%r8) + addl 12(%r8),%ebx + movl %esi,4(%r8) + movl %edx,%eax + addl 16(%r8),%ecx + movl %ebp,%r12d + movl %ebp,8(%r8) + movl %ebx,%edx + + movl %ebx,12(%r8) + movl %esi,%ebp + movl %ecx,16(%r8) + + movl %ecx,%esi + movl %r12d,%ecx + + + cmpq %r10,%r9 + je L$done_avx2 + vmovdqu 64(%r14),%ymm6 + cmpq %r10,%rdi + ja L$ast_avx2 + + vmovdqu -64(%rdi),%xmm0 + vmovdqu -48(%rdi),%xmm1 + vmovdqu -32(%rdi),%xmm2 + vmovdqu -16(%rdi),%xmm3 + vinserti128 $1,0(%r13),%ymm0,%ymm0 + vinserti128 $1,16(%r13),%ymm1,%ymm1 + vinserti128 $1,32(%r13),%ymm2,%ymm2 + vinserti128 $1,48(%r13),%ymm3,%ymm3 + jmp L$ast_avx2 + +.p2align 5 +L$ast_avx2: + leaq 128+16(%rsp),%r13 + rorxl $2,%ebp,%ebx + andnl %edx,%ebp,%edi + andl %ecx,%ebp + xorl %edi,%ebp + subq $-128,%r9 + addl -128(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl -124(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -120(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -116(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl -96(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + addl -92(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + addl -88(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl -84(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -64(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -60(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl -56(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl 
$27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + addl -52(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + addl -32(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl -28(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -24(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -20(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl 0(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + addl 4(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + addl 8(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl 12(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl 32(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl 36(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl 40(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 44(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl 64(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vmovdqu -64(%r14),%ymm11 + vpshufb %ymm6,%ymm0,%ymm0 + addl 68(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl 72(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl 76(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl 96(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 100(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpshufb %ymm6,%ymm1,%ymm1 + vpaddd %ymm11,%ymm0,%ymm8 + addl 104(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl 108(%r13),%edx + leaq 256(%r13),%r13 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -128(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -124(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl 
$27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -120(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vmovdqu %ymm8,0(%rsp) + vpshufb %ymm6,%ymm2,%ymm2 + vpaddd %ymm11,%ymm1,%ymm9 + addl -116(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -96(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -92(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -88(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -84(%r13),%ebx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + vmovdqu %ymm9,32(%rsp) + vpshufb %ymm6,%ymm3,%ymm3 + vpaddd %ymm11,%ymm2,%ymm6 + addl -64(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl -60(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl -56(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl -52(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + addl -32(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + andl %edi,%edx + jmp L$align32_3 +.p2align 5 +L$align32_3: + vmovdqu %ymm6,64(%rsp) + vpaddd %ymm11,%ymm3,%ymm7 + addl -28(%r13),%ebx + xorl %eax,%edx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + addl -24(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl -20(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 0(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl 4(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + vmovdqu %ymm7,96(%rsp) + addl 8(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + andl %edi,%edx + addl 12(%r13),%ebx + xorl %eax,%edx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + 
xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + addl 32(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 36(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 40(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + vpalignr $8,%ymm0,%ymm1,%ymm4 + addl 44(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + vpsrldq $4,%ymm3,%ymm8 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpxor %ymm0,%ymm4,%ymm4 + vpxor %ymm2,%ymm8,%ymm8 + xorl %ebp,%esi + addl %r12d,%edx + vpxor %ymm8,%ymm4,%ymm4 + andl %edi,%esi + addl 64(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + vpsrld $31,%ymm4,%ymm8 + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + vpslldq $12,%ymm4,%ymm10 + vpaddd %ymm4,%ymm4,%ymm4 + rorxl $2,%edx,%esi + xorl %eax,%edx + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm4,%ymm4 + addl %r12d,%ecx + andl %edi,%edx + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm4,%ymm4 + addl 68(%r13),%ebx + xorl %eax,%edx + vpxor %ymm10,%ymm4,%ymm4 + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + vpaddd %ymm11,%ymm4,%ymm9 + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + vmovdqu %ymm9,128(%rsp) + addl %r12d,%ebx + andl %edi,%ecx + addl 72(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 76(%r13),%eax + xorl %edx,%ebx + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpalignr $8,%ymm1,%ymm2,%ymm5 + addl 96(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + vpsrldq $4,%ymm4,%ymm8 + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm3,%ymm8,%ymm8 + addl 100(%r13),%edx + leal (%rdx,%rax,1),%edx + vpxor %ymm8,%ymm5,%ymm5 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + vpsrld $31,%ymm5,%ymm8 + vmovdqu -32(%r14),%ymm11 + xorl %ebx,%esi + addl 104(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + vpslldq $12,%ymm5,%ymm10 + vpaddd %ymm5,%ymm5,%ymm5 + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm5,%ymm5 + xorl %eax,%edx + addl %r12d,%ecx + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm5,%ymm5 + xorl %ebp,%edx + addl 108(%r13),%ebx + leaq 256(%r13),%r13 + vpxor %ymm10,%ymm5,%ymm5 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + vpaddd %ymm11,%ymm5,%ymm9 + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + vmovdqu %ymm9,160(%rsp) + addl -128(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vpalignr $8,%ymm2,%ymm3,%ymm6 + addl -124(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + vpsrldq $4,%ymm5,%ymm8 + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpxor %ymm2,%ymm6,%ymm6 + vpxor %ymm4,%ymm8,%ymm8 + addl -120(%r13),%esi + leal (%rsi,%rbp,1),%esi + vpxor %ymm8,%ymm6,%ymm6 + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + vpsrld 
$31,%ymm6,%ymm8 + xorl %ecx,%eax + addl -116(%r13),%edx + leal (%rdx,%rax,1),%edx + vpslldq $12,%ymm6,%ymm10 + vpaddd %ymm6,%ymm6,%ymm6 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm6,%ymm6 + xorl %ebp,%esi + addl %r12d,%edx + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm6,%ymm6 + xorl %ebx,%esi + addl -96(%r13),%ecx + vpxor %ymm10,%ymm6,%ymm6 + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + vpaddd %ymm11,%ymm6,%ymm9 + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + vmovdqu %ymm9,192(%rsp) + addl -92(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + vpalignr $8,%ymm3,%ymm4,%ymm7 + addl -88(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + vpsrldq $4,%ymm6,%ymm8 + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vpxor %ymm3,%ymm7,%ymm7 + vpxor %ymm5,%ymm8,%ymm8 + addl -84(%r13),%eax + leal (%rax,%rbx,1),%eax + vpxor %ymm8,%ymm7,%ymm7 + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + vpsrld $31,%ymm7,%ymm8 + xorl %edx,%ebp + addl -64(%r13),%esi + leal (%rsi,%rbp,1),%esi + vpslldq $12,%ymm7,%ymm10 + vpaddd %ymm7,%ymm7,%ymm7 + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm7,%ymm7 + xorl %ebx,%eax + addl %r12d,%esi + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm7,%ymm7 + xorl %ecx,%eax + addl -60(%r13),%edx + vpxor %ymm10,%ymm7,%ymm7 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpaddd %ymm11,%ymm7,%ymm9 + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + vmovdqu %ymm9,224(%rsp) + addl -56(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -52(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -32(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -28(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -24(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -20(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + addl %r12d,%edx + leaq 128(%rsp),%r13 + + + addl 0(%r8),%edx + addl 4(%r8),%esi + addl 8(%r8),%ebp + movl %edx,0(%r8) + addl 12(%r8),%ebx + movl %esi,4(%r8) + movl %edx,%eax + addl 16(%r8),%ecx + movl %ebp,%r12d + movl %ebp,8(%r8) + movl %ebx,%edx + + movl %ebx,12(%r8) + movl %esi,%ebp + movl %ecx,16(%r8) + + movl %ecx,%esi + movl %r12d,%ecx + + + cmpq %r10,%r9 + jbe L$oop_avx2 + +L$done_avx2: + vzeroupper + movq -40(%r11),%r14 + + movq -32(%r11),%r13 + + movq -24(%r11),%r12 + + movq -16(%r11),%rbp + + movq -8(%r11),%rbx + + leaq (%r11),%rsp + +L$epilogue_avx2: + .byte 0xf3,0xc3 + + .p2align 6 K_XX_XX: -.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 -.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 -.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc -.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 
0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 6 diff --git a/lib/accelerated/x86/macosx/sha256-ssse3-x86.s b/lib/accelerated/x86/macosx/sha256-ssse3-x86.s index 300212c915..8d257109ca 100644 --- a/lib/accelerated/x86/macosx/sha256-ssse3-x86.s +++ b/lib/accelerated/x86/macosx/sha256-ssse3-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -37,7 +37,6 @@ # # *** This file is auto-generated *** # -.file "sha512-586.s" .text .globl _sha256_block_data_order .align 4 @@ -63,20 +62,6 @@ L000pic_point: movl %edi,4(%esp) movl %eax,8(%esp) movl %ebx,12(%esp) - movl L__gnutls_x86_cpuid_s$non_lazy_ptr-L001K256(%ebp),%edx - movl (%edx),%ecx - movl 4(%edx),%edx - testl $1048576,%ecx - jnz L002loop - testl $2048,%edx - andl $1073741824,%ecx - andl $268435456,%edx - orl %edx,%ecx - cmpl $1342177280,%ecx - je L003loop_shrd - subl %edi,%eax - cmpl $256,%eax - jae L004unrolled jmp L002loop .align 4,0x90 L002loop: @@ -148,7 +133,7 @@ L002loop: movl %ecx,28(%esp) movl %edi,32(%esp) .align 4,0x90 -L00500_15: +L00300_15: movl %edx,%ecx movl 24(%esp),%esi rorl $14,%ecx @@ -186,11 +171,11 @@ L00500_15: addl $4,%ebp addl %ebx,%eax cmpl $3248222580,%esi - jne L00500_15 + jne L00300_15 movl 156(%esp),%ecx - jmp L00616_63 + jmp L00416_63 .align 4,0x90 -L00616_63: +L00416_63: movl %ecx,%ebx movl 104(%esp),%esi rorl $11,%ecx @@ -245,7 +230,7 @@ L00616_63: addl $4,%ebp addl %ebx,%eax cmpl $3329325298,%esi - jne L00616_63 + jne L00416_63 movl 356(%esp),%esi movl 8(%esp),%ebx movl 16(%esp),%ecx @@ -279,8 +264,8 @@ L00616_63: popl %ebx popl %ebp ret -.align 4,0x90 -L003loop_shrd: +.align 5,0x90 +L005loop_shrd: movl (%edi),%eax movl 4(%edi),%ebx movl 8(%edi),%ecx @@ -349,7 +334,7 @@ L003loop_shrd: movl %ecx,28(%esp) movl %edi,32(%esp) .align 4,0x90 -L00700_15_shrd: +L00600_15_shrd: movl %edx,%ecx movl 24(%esp),%esi shrdl $14,%ecx,%ecx @@ -387,11 +372,11 @@ L00700_15_shrd: addl $4,%ebp addl %ebx,%eax cmpl $3248222580,%esi - jne L00700_15_shrd + jne L00600_15_shrd movl 156(%esp),%ecx - jmp L00816_63_shrd + jmp L00716_63_shrd .align 4,0x90 -L00816_63_shrd: +L00716_63_shrd: movl %ecx,%ebx movl 104(%esp),%esi shrdl $11,%ecx,%ecx @@ -446,7 +431,7 @@ L00816_63_shrd: addl $4,%ebp addl %ebx,%eax cmpl $3329325298,%esi - jne L00816_63_shrd + jne L00716_63_shrd movl 356(%esp),%esi movl 8(%esp),%ebx movl 16(%esp),%ecx @@ -473,7 +458,7 @@ L00816_63_shrd: leal 356(%esp),%esp subl $256,%ebp cmpl 8(%esp),%edi - jb L003loop_shrd + jb L005loop_shrd movl 12(%esp),%esp popl %edi popl %esi @@ -484,8 +469,13 @@ L00816_63_shrd: L001K256: .long 
1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298 .long 66051,67438087,134810123,202182159 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 +.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 +.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 +.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 +.byte 62,0 .align 4,0x90 -L004unrolled: +L008unrolled: leal -96(%esp),%esp movl (%esi),%eax movl 4(%esi),%ebp @@ -3391,14 +3381,4 @@ L009grand_loop: popl %ebx popl %ebp ret -.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 -.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 -.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 -.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 -.byte 62,0 -.section __IMPORT,__pointers,non_lazy_symbol_pointers -L__gnutls_x86_cpuid_s$non_lazy_ptr: -.indirect_symbol __gnutls_x86_cpuid_s -.long 0 -.comm __gnutls_x86_cpuid_s,16,2 diff --git a/lib/accelerated/x86/macosx/sha256-ssse3-x86_64.s b/lib/accelerated/x86/macosx/sha256-ssse3-x86_64.s new file mode 100644 index 0000000000..fd0c247359 --- /dev/null +++ b/lib/accelerated/x86/macosx/sha256-ssse3-x86_64.s @@ -0,0 +1,5470 @@ +# Copyright (c) 2011-2016, Andy Polyakov +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain copyright notices, +# this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# * Neither the name of the Andy Polyakov nor the names of its +# copyright holder and contributors may be used to endorse or +# promote products derived from this software without specific +# prior written permission. +# +# ALTERNATIVELY, provided that this notice is retained in full, this +# product may be distributed under the terms of the GNU General Public +# License (GPL), in which case the provisions of the GPL apply INSTEAD OF +# those given above. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# *** This file is auto-generated *** +# +.text + + +.globl _sha256_block_data_order + +.p2align 4 +_sha256_block_data_order: + + leaq __gnutls_x86_cpuid_s(%rip),%r11 + movl 0(%r11),%r9d + movl 4(%r11),%r10d + movl 8(%r11),%r11d + testl $536870912,%r11d + jnz _shaext_shortcut + andl $296,%r11d + cmpl $296,%r11d + je L$avx2_shortcut + andl $1073741824,%r9d + andl $268435968,%r10d + orl %r9d,%r10d + cmpl $1342177792,%r10d + je L$avx_shortcut + testl $512,%r10d + jnz L$ssse3_shortcut + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $64+32,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) + +L$prologue: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp L$loop + +.p2align 4 +L$loop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + addl %r14d,%r10d + movl 8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + 
xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + addl %r14d,%edx + movl 20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + addl %r14d,%eax + movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + addl %r14d,%r11d + movl 36(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl 
%r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl 
%r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + jmp L$rounds_16_xx +.p2align 4 +L$rounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + 
rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + movl 24(%rsp),%r13d + movl 12(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi 
+ + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl 
%r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d + + addl 52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl 
%ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + cmpb $0,3(%rbp) + jnz L$rounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb L$loop + + movq 88(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$epilogue: + .byte 0xf3,0xc3 + + +.p2align 6 + +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 + +.p2align 6 +sha256_block_data_order_shaext: +_shaext_shortcut: + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 512-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa 
%xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp L$oop_shaext + +.p2align 4 +L$oop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 64-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 96-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 224-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 256-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 288-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 320-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 352-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 384-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 416-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 448-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 
480-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz L$oop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + .byte 0xf3,0xc3 + + +.p2align 6 +sha256_block_data_order_ssse3: + +L$ssse3_shortcut: + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) + +L$prologue_ssse3: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp L$loop_ssse3 +.p2align 4 +L$loop_ssse3: + movdqa K256+512(%rip),%xmm7 + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rbp +.byte 102,15,56,0,207 + movdqa 0(%rbp),%xmm4 + movdqa 32(%rbp),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 64(%rbp),%xmm6 +.byte 102,15,56,0,223 + movdqa 96(%rbp),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp L$ssse3_00_47 + +.p2align 4 +L$ssse3_00_47: + subq $-128,%rbp + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa 
%xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 32(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl 
$5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 64(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + 
movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 96(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne L$ssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl 
%ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl 
%r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb L$loop_ssse3 + + movq 88(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$epilogue_ssse3: + .byte 0xf3,0xc3 + + + +.p2align 6 +sha256_block_data_order_avx: + +L$avx_shortcut: + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) + +L$prologue_avx: + + vzeroupper + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%xmm8 + vmovdqa K256+512+64(%rip),%xmm9 + jmp L$loop_avx +.p2align 4 +L$loop_avx: + vmovdqa K256+512(%rip),%xmm7 + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm7,%xmm0,%xmm0 + leaq K256(%rip),%rbp + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd 0(%rbp),%xmm0,%xmm4 + 
vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 32(%rbp),%xmm1,%xmm5 + vpaddd 64(%rbp),%xmm2,%xmm6 + vpaddd 96(%rbp),%xmm3,%xmm7 + vmovdqa %xmm4,0(%rsp) + movl %eax,%r14d + vmovdqa %xmm5,16(%rsp) + movl %ebx,%edi + vmovdqa %xmm6,32(%rsp) + xorl %ecx,%edi + vmovdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp L$avx_00_47 + +.p2align 4 +L$avx_00_47: + subq $-128,%rbp + vpalignr $4,%xmm0,%xmm1,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm2,%xmm3,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm0,%xmm0 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm3,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm0,%xmm0 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm0,%xmm0 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + vpshufd $80,%xmm0,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm0,%xmm0 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 0(%rbp),%xmm0,%xmm6 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,0(%rsp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm3,%xmm0,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm1,%xmm1 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + vpsrld 
$3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm0,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm1,%xmm1 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm1,%xmm1 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + vpshufd $80,%xmm1,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm1,%xmm1 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 32(%rbp),%xmm1,%xmm6 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,16(%rsp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm0,%xmm1,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm2,%xmm2 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm1,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 36(%rsp),%r10d + movl 
%r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm2,%xmm2 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm2,%xmm2 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + vpshufd $80,%xmm2,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm2,%xmm2 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 64(%rbp),%xmm2,%xmm6 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,32(%rsp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm1,%xmm2,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm3,%xmm3 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm2,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm3,%xmm3 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm3,%xmm3 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + vpshufd 
$80,%xmm3,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm3,%xmm3 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 96(%rbp),%xmm3,%xmm6 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne L$avx_00_47 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl 
%edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + 
movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb L$loop_avx + + movq 88(%rsp),%rsi + + vzeroupper + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$epilogue_avx: + .byte 0xf3,0xc3 + + + +.p2align 6 +sha256_block_data_order_avx2: + +L$avx2_shortcut: + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $544,%rsp + shlq $4,%rdx + andq $-1024,%rsp + leaq (%rsi,%rdx,4),%rdx + addq $448,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) + +L$prologue_avx2: + + vzeroupper + subq $-64,%rsi + movl 0(%rdi),%eax + movq %rsi,%r12 + movl 4(%rdi),%ebx + cmpq %rdx,%rsi + movl 8(%rdi),%ecx + cmoveq %rsp,%r12 + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%ymm8 + vmovdqa K256+512+64(%rip),%ymm9 + jmp L$oop_avx2 +.p2align 4 +L$oop_avx2: + vmovdqa K256+512(%rip),%ymm7 + vmovdqu -64+0(%rsi),%xmm0 + vmovdqu 
-64+16(%rsi),%xmm1 + vmovdqu -64+32(%rsi),%xmm2 + vmovdqu -64+48(%rsi),%xmm3 + + vinserti128 $1,(%r12),%ymm0,%ymm0 + vinserti128 $1,16(%r12),%ymm1,%ymm1 + vpshufb %ymm7,%ymm0,%ymm0 + vinserti128 $1,32(%r12),%ymm2,%ymm2 + vpshufb %ymm7,%ymm1,%ymm1 + vinserti128 $1,48(%r12),%ymm3,%ymm3 + + leaq K256(%rip),%rbp + vpshufb %ymm7,%ymm2,%ymm2 + vpaddd 0(%rbp),%ymm0,%ymm4 + vpshufb %ymm7,%ymm3,%ymm3 + vpaddd 32(%rbp),%ymm1,%ymm5 + vpaddd 64(%rbp),%ymm2,%ymm6 + vpaddd 96(%rbp),%ymm3,%ymm7 + vmovdqa %ymm4,0(%rsp) + xorl %r14d,%r14d + vmovdqa %ymm5,32(%rsp) + leaq -64(%rsp),%rsp + movl %ebx,%edi + vmovdqa %ymm6,0(%rsp) + xorl %ecx,%edi + vmovdqa %ymm7,32(%rsp) + movl %r9d,%r12d + subq $-32*4,%rbp + jmp L$avx2_00_47 + +.p2align 4 +L$avx2_00_47: + leaq -64(%rsp),%rsp + vpalignr $4,%ymm0,%ymm1,%ymm4 + addl 0+128(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + vpalignr $4,%ymm2,%ymm3,%ymm7 + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + vpsrld $7,%ymm4,%ymm6 + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + vpaddd %ymm7,%ymm0,%ymm0 + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + vpshufd $250,%ymm3,%ymm7 + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 4+128(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + vpslld $11,%ymm5,%ymm5 + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + vpaddd %ymm4,%ymm0,%ymm0 + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 8+128(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + vpxor %ymm7,%ymm6,%ymm6 + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + vpaddd %ymm6,%ymm0,%ymm0 + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + vpshufd $80,%ymm0,%ymm7 + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 12+128(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + vpxor %ymm7,%ymm6,%ymm6 + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + vpaddd %ymm6,%ymm0,%ymm0 + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + vpaddd 0(%rbp),%ymm0,%ymm6 + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d 
+ xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + vmovdqa %ymm6,0(%rsp) + vpalignr $4,%ymm1,%ymm2,%ymm4 + addl 32+128(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + vpalignr $4,%ymm3,%ymm0,%ymm7 + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + vpsrld $7,%ymm4,%ymm6 + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + vpaddd %ymm7,%ymm1,%ymm1 + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + vpshufd $250,%ymm0,%ymm7 + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 36+128(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + vpslld $11,%ymm5,%ymm5 + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + vpaddd %ymm4,%ymm1,%ymm1 + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 40+128(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + vpxor %ymm7,%ymm6,%ymm6 + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + vpaddd %ymm6,%ymm1,%ymm1 + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + vpshufd $80,%ymm1,%ymm7 + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 44+128(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + vpxor %ymm7,%ymm6,%ymm6 + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + vpaddd %ymm6,%ymm1,%ymm1 + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + vpaddd 32(%rbp),%ymm1,%ymm6 + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + vmovdqa %ymm6,32(%rsp) + leaq -64(%rsp),%rsp + vpalignr $4,%ymm2,%ymm3,%ymm4 + addl 0+128(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + vpalignr $4,%ymm0,%ymm1,%ymm7 + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + vpsrld $7,%ymm4,%ymm6 + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + vpaddd %ymm7,%ymm2,%ymm2 + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + vpslld 
$14,%ymm4,%ymm5 + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + vpshufd $250,%ymm1,%ymm7 + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 4+128(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + vpslld $11,%ymm5,%ymm5 + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + vpaddd %ymm4,%ymm2,%ymm2 + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 8+128(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + vpxor %ymm7,%ymm6,%ymm6 + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + vpaddd %ymm6,%ymm2,%ymm2 + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + vpshufd $80,%ymm2,%ymm7 + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 12+128(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + vpxor %ymm7,%ymm6,%ymm6 + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + vpaddd %ymm6,%ymm2,%ymm2 + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + vpaddd 64(%rbp),%ymm2,%ymm6 + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + vmovdqa %ymm6,0(%rsp) + vpalignr $4,%ymm3,%ymm0,%ymm4 + addl 32+128(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + vpalignr $4,%ymm1,%ymm2,%ymm7 + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + vpsrld $7,%ymm4,%ymm6 + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + vpaddd %ymm7,%ymm3,%ymm3 + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + vpshufd $250,%ymm2,%ymm7 + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 36+128(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + vpslld $11,%ymm5,%ymm5 + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + vpsrld 
$10,%ymm7,%ymm6 + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + vpaddd %ymm4,%ymm3,%ymm3 + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 40+128(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + vpxor %ymm7,%ymm6,%ymm6 + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + vpaddd %ymm6,%ymm3,%ymm3 + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + vpshufd $80,%ymm3,%ymm7 + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 44+128(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + vpxor %ymm7,%ymm6,%ymm6 + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + vpaddd %ymm6,%ymm3,%ymm3 + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + vpaddd 96(%rbp),%ymm3,%ymm6 + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + vmovdqa %ymm6,32(%rsp) + leaq 128(%rbp),%rbp + cmpb $0,3(%rbp) + jne L$avx2_00_47 + addl 0+64(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + addl 4+64(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8+64(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + addl 12+64(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal 
(%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32+64(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + addl 36+64(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40+64(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + addl 44+64(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + addl 0(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + addl 4(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 
8(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + addl 12(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + addl 36(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + addl 44(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + movq 512(%rsp),%rdi + addl %r14d,%eax + + leaq 448(%rsp),%rbp + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + + cmpq 80(%rbp),%rsi + je L$done_avx2 
+ + xorl %r14d,%r14d + movl %ebx,%edi + xorl %ecx,%edi + movl %r9d,%r12d + jmp L$ower_avx2 +.p2align 4 +L$ower_avx2: + addl 0+16(%rbp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + addl 4+16(%rbp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8+16(%rbp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + addl 12+16(%rbp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32+16(%rbp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + addl 36+16(%rbp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40+16(%rbp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl 
$22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + addl 44+16(%rbp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + leaq -64(%rbp),%rbp + cmpq %rsp,%rbp + jae L$ower_avx2 + + movq 512(%rsp),%rdi + addl %r14d,%eax + + leaq 448(%rsp),%rsp + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + leaq 128(%rsi),%rsi + addl 24(%rdi),%r10d + movq %rsi,%r12 + addl 28(%rdi),%r11d + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + cmoveq %rsp,%r12 + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + + jbe L$oop_avx2 + leaq (%rsp),%rbp + +L$done_avx2: + leaq (%rbp),%rsp + movq 88(%rsp),%rsi + + vzeroupper + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$epilogue_avx2: + .byte 0xf3,0xc3 + + + diff --git a/lib/accelerated/x86/macosx/sha512-ssse3-x86.s b/lib/accelerated/x86/macosx/sha512-ssse3-x86.s index 0014a8116b..4e60bb45f6 100644 --- a/lib/accelerated/x86/macosx/sha512-ssse3-x86.s +++ b/lib/accelerated/x86/macosx/sha512-ssse3-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -37,7 +37,6 @@ # # *** This file is auto-generated *** # -.file "sha512-586.s" .text .globl _sha512_block_data_order .align 4 @@ -593,6 +592,8 @@ L001K512: .long 4234509866,1501505948 .long 987167468,1607167915 .long 1246189591,1816402316 +.long 67438087,66051 +.long 202182159,134810123 .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97 .byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 .byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 diff --git a/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s b/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s index 7e73227c2b..8bf161601e 100644 --- a/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s +++ b/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2013, Andy Polyakov +# Copyright (c) 2011-2016, Andy Polyakov # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -40,2850 +40,5438 @@ .text -.globl _sha256_block_data_order +.globl _sha512_block_data_order .p2align 4 -_sha256_block_data_order: +_sha512_block_data_order: + leaq __gnutls_x86_cpuid_s(%rip),%r11 movl 0(%r11),%r9d movl 4(%r11),%r10d movl 8(%r11),%r11d - testl $512,%r10d - jnz L$ssse3_shortcut + testl $2048,%r10d + jnz L$xop_shortcut + andl $296,%r11d + cmpl $296,%r11d + je L$avx2_shortcut + andl $1073741824,%r9d + andl $268435968,%r10d + orl %r9d,%r10d + cmpl $1342177792,%r10d + je L$avx_shortcut + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - movq %rsp,%r11 + shlq $4,%rdx - subq $64+32,%rsp - leaq (%rsi,%rdx,4),%rdx + subq $128+32,%rsp + leaq (%rsi,%rdx,8),%rdx andq $-64,%rsp - movq %rdi,64+0(%rsp) - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) + L$prologue: - movl 0(%rdi),%eax - movl 4(%rdi),%ebx - movl 8(%rdi),%ecx - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 jmp L$loop .p2align 4 L$loop: - movl %ebx,%edi - leaq K256(%rip),%rbp - xorl %ecx,%edi - movl 0(%rsi),%r12d - movl %r8d,%r13d - movl %eax,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r9d,%r15d - - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r15d - - movl %r12d,0(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl (%rbp),%r12d - xorl %eax,%r14d - - rorl $6,%r13d - xorl %ebx,%r15d - movl %ebx,%r11d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - leaq 4(%rbp),%rbp - addl %r14d,%r11d - - movl 4(%rsi),%r12d - movl %edx,%r13d - movl %r11d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r8d,%edi - - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%edi - - movl %r12d,4(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl (%rbp),%r12d - xorl %r11d,%r14d - - rorl $6,%r13d - xorl %eax,%edi - movl %eax,%r10d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - leaq 4(%rbp),%rbp - addl %r14d,%r10d - - movl 8(%rsi),%r12d - movl %ecx,%r13d - movl %r10d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %edx,%r15d - - rorl $9,%r14d - xorl %ecx,%r13d - xorl %r8d,%r15d - - movl %r12d,8(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl (%rbp),%r12d - xorl %r10d,%r14d - - rorl $6,%r13d - xorl %r11d,%r15d - movl %r11d,%r9d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - leaq 4(%rbp),%rbp - addl %r14d,%r9d - - movl 12(%rsi),%r12d - movl %ebx,%r13d - movl %r9d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ecx,%edi - - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%edi - - movl %r12d,12(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - 
movl %r9d,%edi - addl (%rbp),%r12d - xorl %r9d,%r14d - - rorl $6,%r13d - xorl %r10d,%edi - movl %r10d,%r8d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - leaq 20(%rbp),%rbp - addl %r14d,%r8d - - movl 16(%rsi),%r12d - movl %eax,%r13d - movl %r8d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ebx,%r15d - - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r15d - - movl %r12d,16(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl (%rbp),%r12d - xorl %r8d,%r14d - - rorl $6,%r13d - xorl %r9d,%r15d - movl %r9d,%edx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - leaq 4(%rbp),%rbp - addl %r14d,%edx - - movl 20(%rsi),%r12d - movl %r11d,%r13d - movl %edx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %eax,%edi - - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%edi - - movl %r12d,20(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl (%rbp),%r12d - xorl %edx,%r14d - - rorl $6,%r13d - xorl %r8d,%edi - movl %r8d,%ecx - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - leaq 4(%rbp),%rbp - addl %r14d,%ecx - - movl 24(%rsi),%r12d - movl %r10d,%r13d - movl %ecx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r11d,%r15d - - rorl $9,%r14d - xorl %r10d,%r13d - xorl %eax,%r15d - - movl %r12d,24(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl (%rbp),%r12d - xorl %ecx,%r14d - - rorl $6,%r13d - xorl %edx,%r15d - movl %edx,%ebx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - leaq 4(%rbp),%rbp - addl %r14d,%ebx - - movl 28(%rsi),%r12d - movl %r9d,%r13d - movl %ebx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r10d,%edi - - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%edi - - movl %r12d,28(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl (%rbp),%r12d - xorl %ebx,%r14d - - rorl $6,%r13d - xorl %ecx,%edi - movl %ecx,%eax - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - leaq 20(%rbp),%rbp - addl %r14d,%eax - - movl 32(%rsi),%r12d - movl %r8d,%r13d - movl %eax,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r9d,%r15d - - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r15d - - movl %r12d,32(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl (%rbp),%r12d - xorl %eax,%r14d - - rorl $6,%r13d - xorl %ebx,%r15d - movl %ebx,%r11d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - leaq 4(%rbp),%rbp - addl %r14d,%r11d - - movl 36(%rsi),%r12d - movl %edx,%r13d - movl %r11d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r8d,%edi - - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%edi - - movl %r12d,36(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl 
%edi,%r12d - - movl %r11d,%edi - addl (%rbp),%r12d - xorl %r11d,%r14d - - rorl $6,%r13d - xorl %eax,%edi - movl %eax,%r10d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - leaq 4(%rbp),%rbp - addl %r14d,%r10d - - movl 40(%rsi),%r12d - movl %ecx,%r13d - movl %r10d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %edx,%r15d - - rorl $9,%r14d - xorl %ecx,%r13d - xorl %r8d,%r15d - - movl %r12d,40(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl (%rbp),%r12d - xorl %r10d,%r14d - - rorl $6,%r13d - xorl %r11d,%r15d - movl %r11d,%r9d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - leaq 4(%rbp),%rbp - addl %r14d,%r9d - - movl 44(%rsi),%r12d - movl %ebx,%r13d - movl %r9d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ecx,%edi - - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%edi - - movl %r12d,44(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl (%rbp),%r12d - xorl %r9d,%r14d - - rorl $6,%r13d - xorl %r10d,%edi - movl %r10d,%r8d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - leaq 20(%rbp),%rbp - addl %r14d,%r8d - - movl 48(%rsi),%r12d - movl %eax,%r13d - movl %r8d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ebx,%r15d - - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r15d - - movl %r12d,48(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl (%rbp),%r12d - xorl %r8d,%r14d - - rorl $6,%r13d - xorl %r9d,%r15d - movl %r9d,%edx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - leaq 4(%rbp),%rbp - addl %r14d,%edx - - movl 52(%rsi),%r12d - movl %r11d,%r13d - movl %edx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %eax,%edi - - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%edi - - movl %r12d,52(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl (%rbp),%r12d - xorl %edx,%r14d - - rorl $6,%r13d - xorl %r8d,%edi - movl %r8d,%ecx - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - leaq 4(%rbp),%rbp - addl %r14d,%ecx - - movl 56(%rsi),%r12d - movl %r10d,%r13d - movl %ecx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r11d,%r15d - - rorl $9,%r14d - xorl %r10d,%r13d - xorl %eax,%r15d - - movl %r12d,56(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl (%rbp),%r12d - xorl %ecx,%r14d - - rorl $6,%r13d - xorl %edx,%r15d - movl %edx,%ebx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - leaq 4(%rbp),%rbp - addl %r14d,%ebx - - movl 60(%rsi),%r12d - movl %r9d,%r13d - movl %ebx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r10d,%edi - - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%edi - - movl %r12d,60(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl 
%r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl (%rbp),%r12d - xorl %ebx,%r14d - - rorl $6,%r13d - xorl %ecx,%edi - movl %ecx,%eax - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - movl 4(%rsp),%r13d - leaq 20(%rbp),%rbp - addl %r14d,%eax - + movq %rbx,%rdi + leaq K512(%rip),%rbp + xorq %rcx,%rdi + movq 0(%rsi),%r12 + movq %r8,%r13 + movq %rax,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,0(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + addq %r14,%r11 + movq 8(%rsi),%r12 + movq %rdx,%r13 + movq %r11,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,8(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + addq %r14,%r10 + movq 16(%rsi),%r12 + movq %rcx,%r13 + movq %r10,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,16(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + addq %r14,%r9 + movq 24(%rsi),%r12 + movq %rbx,%r13 + movq %r9,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,24(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + addq %r14,%r8 + movq 32(%rsi),%r12 + movq %rax,%r13 + movq %r8,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,32(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + addq %r14,%rdx + movq 40(%rsi),%r12 + movq %r11,%r13 + movq %rdx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,40(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + 
movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + addq %r14,%rcx + movq 48(%rsi),%r12 + movq %r10,%r13 + movq %rcx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,48(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + addq %r14,%rbx + movq 56(%rsi),%r12 + movq %r9,%r13 + movq %rbx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,56(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + addq %r14,%rax + movq 64(%rsi),%r12 + movq %r8,%r13 + movq %rax,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,64(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + addq %r14,%r11 + movq 72(%rsi),%r12 + movq %rdx,%r13 + movq %r11,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,72(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + addq %r14,%r10 + movq 80(%rsi),%r12 + movq %rcx,%r13 + movq %r10,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,80(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + addq %r14,%r9 + movq 88(%rsi),%r12 + movq %rbx,%r13 + movq %r9,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,88(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + addq %r14,%r8 + movq 96(%rsi),%r12 + movq %rax,%r13 + movq 
%r8,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,96(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + addq %r14,%rdx + movq 104(%rsi),%r12 + movq %r11,%r13 + movq %rdx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,104(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + addq %r14,%rcx + movq 112(%rsi),%r12 + movq %r10,%r13 + movq %rcx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,112(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + addq %r14,%rbx + movq 120(%rsi),%r12 + movq %r9,%r13 + movq %rbx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,120(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp jmp L$rounds_16_xx .p2align 4 L$rounds_16_xx: - - movl 56(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 36(%rsp),%r12d - xorl %r15d,%r14d - - addl 0(%rsp),%r12d - movl %r8d,%r13d - addl %r14d,%r12d - movl %eax,%r14d - rorl $14,%r13d - movl %r9d,%r15d - - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r15d - - movl %r12d,0(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl (%rbp),%r12d - xorl %eax,%r14d - - rorl $6,%r13d - xorl %ebx,%r15d - movl %ebx,%r11d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - movl 8(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%r11d - - - movl 60(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 40(%rsp),%r12d - xorl %edi,%r14d - - addl 4(%rsp),%r12d - movl %edx,%r13d - addl %r14d,%r12d - movl %r11d,%r14d - rorl $14,%r13d - movl %r8d,%edi - - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%edi - - 
movl %r12d,4(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl (%rbp),%r12d - xorl %r11d,%r14d - - rorl $6,%r13d - xorl %eax,%edi - movl %eax,%r10d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - movl 12(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%r10d - - - movl 0(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 44(%rsp),%r12d - xorl %r15d,%r14d - - addl 8(%rsp),%r12d - movl %ecx,%r13d - addl %r14d,%r12d - movl %r10d,%r14d - rorl $14,%r13d - movl %edx,%r15d - - rorl $9,%r14d - xorl %ecx,%r13d - xorl %r8d,%r15d - - movl %r12d,8(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl (%rbp),%r12d - xorl %r10d,%r14d - - rorl $6,%r13d - xorl %r11d,%r15d - movl %r11d,%r9d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - movl 16(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%r9d - - - movl 4(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 48(%rsp),%r12d - xorl %edi,%r14d - - addl 12(%rsp),%r12d - movl %ebx,%r13d - addl %r14d,%r12d - movl %r9d,%r14d - rorl $14,%r13d - movl %ecx,%edi - - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%edi - - movl %r12d,12(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl (%rbp),%r12d - xorl %r9d,%r14d - - rorl $6,%r13d - xorl %r10d,%edi - movl %r10d,%r8d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - movl 20(%rsp),%r13d - leaq 20(%rbp),%rbp - addl %r14d,%r8d - - - movl 8(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 52(%rsp),%r12d - xorl %r15d,%r14d - - addl 16(%rsp),%r12d - movl %eax,%r13d - addl %r14d,%r12d - movl %r8d,%r14d - rorl $14,%r13d - movl %ebx,%r15d - - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r15d - - movl %r12d,16(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl (%rbp),%r12d - xorl %r8d,%r14d - - rorl $6,%r13d - xorl %r9d,%r15d - movl %r9d,%edx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - movl 24(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%edx - - - movl 12(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 56(%rsp),%r12d - xorl %edi,%r14d - - addl 20(%rsp),%r12d - movl %r11d,%r13d - addl %r14d,%r12d - movl %edx,%r14d - rorl $14,%r13d - movl %eax,%edi - - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%edi - - movl 
%r12d,20(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl (%rbp),%r12d - xorl %edx,%r14d - - rorl $6,%r13d - xorl %r8d,%edi - movl %r8d,%ecx - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - movl 28(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%ecx - - - movl 16(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 60(%rsp),%r12d - xorl %r15d,%r14d - - addl 24(%rsp),%r12d - movl %r10d,%r13d - addl %r14d,%r12d - movl %ecx,%r14d - rorl $14,%r13d - movl %r11d,%r15d - - rorl $9,%r14d - xorl %r10d,%r13d - xorl %eax,%r15d - - movl %r12d,24(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl (%rbp),%r12d - xorl %ecx,%r14d - - rorl $6,%r13d - xorl %edx,%r15d - movl %edx,%ebx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - movl 32(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%ebx - - - movl 20(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 0(%rsp),%r12d - xorl %edi,%r14d - - addl 28(%rsp),%r12d - movl %r9d,%r13d - addl %r14d,%r12d - movl %ebx,%r14d - rorl $14,%r13d - movl %r10d,%edi - - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%edi - - movl %r12d,28(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl (%rbp),%r12d - xorl %ebx,%r14d - - rorl $6,%r13d - xorl %ecx,%edi - movl %ecx,%eax - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - movl 36(%rsp),%r13d - leaq 20(%rbp),%rbp - addl %r14d,%eax - - - movl 24(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 4(%rsp),%r12d - xorl %r15d,%r14d - - addl 32(%rsp),%r12d - movl %r8d,%r13d - addl %r14d,%r12d - movl %eax,%r14d - rorl $14,%r13d - movl %r9d,%r15d - - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r15d - - movl %r12d,32(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl (%rbp),%r12d - xorl %eax,%r14d - - rorl $6,%r13d - xorl %ebx,%r15d - movl %ebx,%r11d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - movl 40(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%r11d - - - movl 28(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 8(%rsp),%r12d - xorl %edi,%r14d - - addl 36(%rsp),%r12d - movl %edx,%r13d - addl %r14d,%r12d - movl %r11d,%r14d - rorl $14,%r13d - movl %r8d,%edi - - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%edi - - movl 
%r12d,36(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl (%rbp),%r12d - xorl %r11d,%r14d - - rorl $6,%r13d - xorl %eax,%edi - movl %eax,%r10d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - movl 44(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%r10d - - - movl 32(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 12(%rsp),%r12d - xorl %r15d,%r14d - - addl 40(%rsp),%r12d - movl %ecx,%r13d - addl %r14d,%r12d - movl %r10d,%r14d - rorl $14,%r13d - movl %edx,%r15d - - rorl $9,%r14d - xorl %ecx,%r13d - xorl %r8d,%r15d - - movl %r12d,40(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl (%rbp),%r12d - xorl %r10d,%r14d - - rorl $6,%r13d - xorl %r11d,%r15d - movl %r11d,%r9d - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - movl 48(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%r9d - - - movl 36(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 16(%rsp),%r12d - xorl %edi,%r14d - - addl 44(%rsp),%r12d - movl %ebx,%r13d - addl %r14d,%r12d - movl %r9d,%r14d - rorl $14,%r13d - movl %ecx,%edi - - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%edi - - movl %r12d,44(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl (%rbp),%r12d - xorl %r9d,%r14d - - rorl $6,%r13d - xorl %r10d,%edi - movl %r10d,%r8d - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - movl 52(%rsp),%r13d - leaq 20(%rbp),%rbp - addl %r14d,%r8d - - - movl 40(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 20(%rsp),%r12d - xorl %r15d,%r14d - - addl 48(%rsp),%r12d - movl %eax,%r13d - addl %r14d,%r12d - movl %r8d,%r14d - rorl $14,%r13d - movl %ebx,%r15d - - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r15d - - movl %r12d,48(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl (%rbp),%r12d - xorl %r8d,%r14d - - rorl $6,%r13d - xorl %r9d,%r15d - movl %r9d,%edx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - movl 56(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%edx - - - movl 44(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 24(%rsp),%r12d - xorl %edi,%r14d - - addl 52(%rsp),%r12d - movl %r11d,%r13d - addl %r14d,%r12d - movl %edx,%r14d - rorl $14,%r13d - movl %eax,%edi - - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%edi - - movl 
%r12d,52(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl (%rbp),%r12d - xorl %edx,%r14d - - rorl $6,%r13d - xorl %r8d,%edi - movl %r8d,%ecx - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - movl 60(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%ecx - - - movl 48(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%r15d - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r15d,%r14d - shrl $10,%r15d - - xorl %r13d,%r12d - rorl $17,%r14d - addl 28(%rsp),%r12d - xorl %r15d,%r14d - - addl 56(%rsp),%r12d - movl %r10d,%r13d - addl %r14d,%r12d - movl %ecx,%r14d - rorl $14,%r13d - movl %r11d,%r15d - - rorl $9,%r14d - xorl %r10d,%r13d - xorl %eax,%r15d - - movl %r12d,56(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl (%rbp),%r12d - xorl %ecx,%r14d - - rorl $6,%r13d - xorl %edx,%r15d - movl %edx,%ebx - - rorl $2,%r14d - andl %r15d,%edi - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - movl 0(%rsp),%r13d - leaq 4(%rbp),%rbp - addl %r14d,%ebx - - - movl 52(%rsp),%r14d - - movl %r13d,%r12d - rorl $11,%r13d - movl %r14d,%edi - rorl $2,%r14d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %edi,%r14d - shrl $10,%edi - - xorl %r13d,%r12d - rorl $17,%r14d - addl 32(%rsp),%r12d - xorl %edi,%r14d - - addl 60(%rsp),%r12d - movl %r9d,%r13d - addl %r14d,%r12d - movl %ebx,%r14d - rorl $14,%r13d - movl %r10d,%edi - - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%edi - - movl %r12d,60(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl (%rbp),%r12d - xorl %ebx,%r14d - - rorl $6,%r13d - xorl %ecx,%edi - movl %ecx,%eax - - rorl $2,%r14d - andl %edi,%r15d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - movl 4(%rsp),%r13d - leaq 20(%rbp),%rbp - addl %r14d,%eax - - cmpb $0,3(%rbp) + movq 8(%rsp),%r13 + movq 112(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 72(%rsp),%r12 + + addq 0(%rsp),%r12 + movq %r8,%r13 + addq %r15,%r12 + movq %rax,%r14 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,0(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + movq 16(%rsp),%r13 + movq 120(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 80(%rsp),%r12 + + addq 8(%rsp),%r12 + movq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%r14 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,8(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq 
$4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + movq 24(%rsp),%r13 + movq 0(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 88(%rsp),%r12 + + addq 16(%rsp),%r12 + movq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r14 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,16(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + movq 32(%rsp),%r13 + movq 8(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 96(%rsp),%r12 + + addq 24(%rsp),%r12 + movq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%r14 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,24(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + movq 40(%rsp),%r13 + movq 16(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 104(%rsp),%r12 + + addq 32(%rsp),%r12 + movq %rax,%r13 + addq %r15,%r12 + movq %r8,%r14 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,32(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + movq 48(%rsp),%r13 + movq 24(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 112(%rsp),%r12 + + addq 40(%rsp),%r12 + movq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%r14 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,40(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + 
+ xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + movq 56(%rsp),%r13 + movq 32(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 120(%rsp),%r12 + + addq 48(%rsp),%r12 + movq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r14 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,48(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + movq 64(%rsp),%r13 + movq 40(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 0(%rsp),%r12 + + addq 56(%rsp),%r12 + movq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%r14 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,56(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + movq 72(%rsp),%r13 + movq 48(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 8(%rsp),%r12 + + addq 64(%rsp),%r12 + movq %r8,%r13 + addq %r15,%r12 + movq %rax,%r14 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,64(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + movq 80(%rsp),%r13 + movq 56(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 16(%rsp),%r12 + + addq 72(%rsp),%r12 + movq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%r14 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,72(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + movq 88(%rsp),%r13 + movq 64(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq 
$1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 24(%rsp),%r12 + + addq 80(%rsp),%r12 + movq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r14 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,80(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + movq 96(%rsp),%r13 + movq 72(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 32(%rsp),%r12 + + addq 88(%rsp),%r12 + movq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%r14 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,88(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + movq 104(%rsp),%r13 + movq 80(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 40(%rsp),%r12 + + addq 96(%rsp),%r12 + movq %rax,%r13 + addq %r15,%r12 + movq %r8,%r14 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,96(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + movq 112(%rsp),%r13 + movq 88(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 48(%rsp),%r12 + + addq 104(%rsp),%r12 + movq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%r14 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,104(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + movq 120(%rsp),%r13 + movq 96(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 56(%rsp),%r12 + + addq 112(%rsp),%r12 + movq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r14 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq 
$5,%r14 + xorq %rax,%r15 + + movq %r12,112(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + movq 0(%rsp),%r13 + movq 104(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 64(%rsp),%r12 + + addq 120(%rsp),%r12 + movq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%r14 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,120(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + cmpb $0,7(%rbp) jnz L$rounds_16_xx - movq 64+0(%rsp),%rdi - leaq 64(%rsi),%rsi - - addl 0(%rdi),%eax - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - cmpq 64+16(%rsp),%rsi - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) + movq 128+0(%rsp),%rdi + addq %r14,%rax + leaq 128(%rsi),%rsi + + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) jb L$loop - movq 64+24(%rsp),%rsi - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq 152(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + L$epilogue: .byte 0xf3,0xc3 + .p2align 6 -K256: -.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.long 
0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 -.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f -.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff -.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff -.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 -.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 -.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +K512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 
0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 6 -sha256_block_data_order_ssse3: -L$ssse3_shortcut: +sha512_block_data_order_xop: + +L$xop_shortcut: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - movq %rsp,%r11 + shlq $4,%rdx - subq $96,%rsp - leaq (%rsi,%rdx,4),%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx andq $-64,%rsp - movq %rdi,64+0(%rsp) - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) -L$prologue_ssse3: - - movl 0(%rdi),%eax - movl 4(%rdi),%ebx - movl 8(%rdi),%ecx - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d - movdqa K256+512+32(%rip),%xmm8 - movdqa K256+512+64(%rip),%xmm9 - jmp L$loop_ssse3 + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) + +L$prologue_xop: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp L$loop_xop .p2align 4 -L$loop_ssse3: - movdqa K256+512(%rip),%xmm7 - movdqu 0(%rsi),%xmm0 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm3 -.byte 102,15,56,0,199 - leaq K256(%rip),%rbp -.byte 102,15,56,0,207 - movdqa 0(%rbp),%xmm4 -.byte 102,15,56,0,215 - movdqa 32(%rbp),%xmm5 - paddd %xmm0,%xmm4 - movdqa 64(%rbp),%xmm6 -.byte 102,15,56,0,223 - movdqa 96(%rbp),%xmm7 - paddd %xmm1,%xmm5 - paddd %xmm2,%xmm6 - paddd %xmm3,%xmm7 - movdqa %xmm4,0(%rsp) - movl %eax,%r14d - movdqa %xmm5,16(%rsp) - movl %ebx,%edi - movdqa %xmm6,32(%rsp) - xorl %ecx,%edi - movdqa %xmm7,48(%rsp) - movl %r8d,%r13d - jmp L$ssse3_00_47 +L$loop_xop: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + 
vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp L$xop_00_47 .p2align 4 -L$ssse3_00_47: - subq $-32*4,%rbp - rorl $14,%r13d - movl %r14d,%eax - movdqa %xmm1,%xmm4 - movl %r9d,%r12d - movdqa %xmm3,%xmm7 - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r12d -.byte 102,15,58,15,224,4 - rorl $5,%r13d - xorl %eax,%r14d -.byte 102,15,58,15,250,4 - andl %r8d,%r12d - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - rorl $11,%r14d - xorl %r10d,%r12d - movdqa %xmm4,%xmm5 - xorl %ebx,%r15d - movdqa %xmm4,%xmm6 - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - paddd %xmm7,%xmm0 - addl %r11d,%edx - rorl $2,%r14d - addl %edi,%r11d - psrld $7,%xmm6 - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - pshufd $250,%xmm3,%xmm7 - movl %r8d,%r12d - pslld $14,%xmm5 - xorl %edx,%r13d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %r9d,%r12d - psrld $11,%xmm6 - rorl $5,%r13d - xorl %r11d,%r14d - pxor %xmm5,%xmm4 - andl %edx,%r12d - xorl %edx,%r13d - pslld $11,%xmm5 - addl 4(%rsp),%r10d - pxor %xmm6,%xmm4 - movl %r11d,%edi - rorl $11,%r14d - xorl %r9d,%r12d - movdqa %xmm7,%xmm6 - xorl %eax,%edi - rorl $6,%r13d - addl %r12d,%r10d - pxor %xmm5,%xmm4 - andl %edi,%r15d - xorl %r11d,%r14d - psrld $10,%xmm7 - addl %r13d,%r10d - xorl %eax,%r15d - paddd %xmm4,%xmm0 - addl %r10d,%ecx - rorl $2,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - psrlq $17,%xmm6 - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - pxor %xmm6,%xmm7 - xorl %ecx,%r13d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - pxor %xmm6,%xmm7 - andl %ecx,%r12d - xorl %ecx,%r13d - addl 8(%rsp),%r9d -.byte 102,65,15,56,0,248 - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - rorl $6,%r13d - paddd %xmm7,%xmm0 - addl %r12d,%r9d - pshufd $80,%xmm0,%xmm7 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - movdqa %xmm7,%xmm6 - addl %r9d,%ebx - rorl $2,%r14d - addl %edi,%r9d - psrld $10,%xmm7 - movl %ebx,%r13d - psrlq $17,%xmm6 - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - pxor %xmm6,%xmm7 - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - psrlq $2,%xmm6 - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - pxor %xmm6,%xmm7 - addl 12(%rsp),%r8d - movl %r9d,%edi - movdqa 0(%rbp),%xmm6 - rorl $11,%r14d - xorl %edx,%r12d -.byte 102,65,15,56,0,249 - xorl %r10d,%edi - rorl $6,%r13d - addl %r12d,%r8d - andl %edi,%r15d - xorl %r9d,%r14d - paddd %xmm7,%xmm0 - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - paddd %xmm0,%xmm6 - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - movdqa %xmm6,0(%rsp) - rorl $14,%r13d - movl %r14d,%r8d - movdqa %xmm2,%xmm4 - movl %ebx,%r12d - movdqa %xmm0,%xmm7 - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r12d -.byte 102,15,58,15,225,4 - rorl $5,%r13d - xorl 
%r8d,%r14d -.byte 102,15,58,15,251,4 - andl %eax,%r12d - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - rorl $11,%r14d - xorl %ecx,%r12d - movdqa %xmm4,%xmm5 - xorl %r9d,%r15d - movdqa %xmm4,%xmm6 - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%edi - psrld $3,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - paddd %xmm7,%xmm1 - addl %edx,%r11d - rorl $2,%r14d - addl %edi,%edx - psrld $7,%xmm6 - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - pshufd $250,%xmm0,%xmm7 - movl %eax,%r12d - pslld $14,%xmm5 - xorl %r11d,%r13d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %ebx,%r12d - psrld $11,%xmm6 - rorl $5,%r13d - xorl %edx,%r14d - pxor %xmm5,%xmm4 - andl %r11d,%r12d - xorl %r11d,%r13d - pslld $11,%xmm5 - addl 20(%rsp),%ecx - pxor %xmm6,%xmm4 - movl %edx,%edi - rorl $11,%r14d - xorl %ebx,%r12d - movdqa %xmm7,%xmm6 - xorl %r8d,%edi - rorl $6,%r13d - addl %r12d,%ecx - pxor %xmm5,%xmm4 - andl %edi,%r15d - xorl %edx,%r14d - psrld $10,%xmm7 - addl %r13d,%ecx - xorl %r8d,%r15d - paddd %xmm4,%xmm1 - addl %ecx,%r10d - rorl $2,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - psrlq $17,%xmm6 - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - pxor %xmm6,%xmm7 - xorl %r10d,%r13d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - pxor %xmm6,%xmm7 - andl %r10d,%r12d - xorl %r10d,%r13d - addl 24(%rsp),%ebx -.byte 102,65,15,56,0,248 - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - rorl $6,%r13d - paddd %xmm7,%xmm1 - addl %r12d,%ebx - pshufd $80,%xmm1,%xmm7 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - movdqa %xmm7,%xmm6 - addl %ebx,%r9d - rorl $2,%r14d - addl %edi,%ebx - psrld $10,%xmm7 - movl %r9d,%r13d - psrlq $17,%xmm6 - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - pxor %xmm6,%xmm7 - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - psrlq $2,%xmm6 - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - pxor %xmm6,%xmm7 - addl 28(%rsp),%eax - movl %ebx,%edi - movdqa 32(%rbp),%xmm6 - rorl $11,%r14d - xorl %r11d,%r12d -.byte 102,65,15,56,0,249 - xorl %ecx,%edi - rorl $6,%r13d - addl %r12d,%eax - andl %edi,%r15d - xorl %ebx,%r14d - paddd %xmm7,%xmm1 - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - paddd %xmm1,%xmm6 - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movdqa %xmm6,16(%rsp) - rorl $14,%r13d - movl %r14d,%eax - movdqa %xmm3,%xmm4 - movl %r9d,%r12d - movdqa %xmm1,%xmm7 - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r12d -.byte 102,15,58,15,226,4 - rorl $5,%r13d - xorl %eax,%r14d -.byte 102,15,58,15,248,4 - andl %r8d,%r12d - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - rorl $11,%r14d - xorl %r10d,%r12d - movdqa %xmm4,%xmm5 - xorl %ebx,%r15d - movdqa %xmm4,%xmm6 - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - paddd %xmm7,%xmm2 - addl %r11d,%edx - rorl $2,%r14d - addl %edi,%r11d - psrld $7,%xmm6 - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - pshufd $250,%xmm1,%xmm7 - movl %r8d,%r12d - pslld $14,%xmm5 - xorl %edx,%r13d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %r9d,%r12d - psrld $11,%xmm6 - rorl $5,%r13d - xorl %r11d,%r14d - pxor %xmm5,%xmm4 - andl %edx,%r12d - xorl %edx,%r13d - pslld $11,%xmm5 - addl 36(%rsp),%r10d - pxor %xmm6,%xmm4 - movl %r11d,%edi - rorl $11,%r14d - xorl %r9d,%r12d - movdqa %xmm7,%xmm6 - xorl %eax,%edi - rorl $6,%r13d - addl %r12d,%r10d - pxor 
%xmm5,%xmm4 - andl %edi,%r15d - xorl %r11d,%r14d - psrld $10,%xmm7 - addl %r13d,%r10d - xorl %eax,%r15d - paddd %xmm4,%xmm2 - addl %r10d,%ecx - rorl $2,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - psrlq $17,%xmm6 - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - pxor %xmm6,%xmm7 - xorl %ecx,%r13d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - pxor %xmm6,%xmm7 - andl %ecx,%r12d - xorl %ecx,%r13d - addl 40(%rsp),%r9d -.byte 102,65,15,56,0,248 - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - rorl $6,%r13d - paddd %xmm7,%xmm2 - addl %r12d,%r9d - pshufd $80,%xmm2,%xmm7 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - movdqa %xmm7,%xmm6 - addl %r9d,%ebx - rorl $2,%r14d - addl %edi,%r9d - psrld $10,%xmm7 - movl %ebx,%r13d - psrlq $17,%xmm6 - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - pxor %xmm6,%xmm7 - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - psrlq $2,%xmm6 - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - pxor %xmm6,%xmm7 - addl 44(%rsp),%r8d - movl %r9d,%edi - movdqa 64(%rbp),%xmm6 - rorl $11,%r14d - xorl %edx,%r12d -.byte 102,65,15,56,0,249 - xorl %r10d,%edi - rorl $6,%r13d - addl %r12d,%r8d - andl %edi,%r15d - xorl %r9d,%r14d - paddd %xmm7,%xmm2 - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - paddd %xmm2,%xmm6 - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - movdqa %xmm6,32(%rsp) - rorl $14,%r13d - movl %r14d,%r8d - movdqa %xmm0,%xmm4 - movl %ebx,%r12d - movdqa %xmm2,%xmm7 - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r12d -.byte 102,15,58,15,227,4 - rorl $5,%r13d - xorl %r8d,%r14d -.byte 102,15,58,15,249,4 - andl %eax,%r12d - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - rorl $11,%r14d - xorl %ecx,%r12d - movdqa %xmm4,%xmm5 - xorl %r9d,%r15d - movdqa %xmm4,%xmm6 - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%edi - psrld $3,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - paddd %xmm7,%xmm3 - addl %edx,%r11d - rorl $2,%r14d - addl %edi,%edx - psrld $7,%xmm6 - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - pshufd $250,%xmm2,%xmm7 - movl %eax,%r12d - pslld $14,%xmm5 - xorl %r11d,%r13d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %ebx,%r12d - psrld $11,%xmm6 - rorl $5,%r13d - xorl %edx,%r14d - pxor %xmm5,%xmm4 - andl %r11d,%r12d - xorl %r11d,%r13d - pslld $11,%xmm5 - addl 52(%rsp),%ecx - pxor %xmm6,%xmm4 - movl %edx,%edi - rorl $11,%r14d - xorl %ebx,%r12d - movdqa %xmm7,%xmm6 - xorl %r8d,%edi - rorl $6,%r13d - addl %r12d,%ecx - pxor %xmm5,%xmm4 - andl %edi,%r15d - xorl %edx,%r14d - psrld $10,%xmm7 - addl %r13d,%ecx - xorl %r8d,%r15d - paddd %xmm4,%xmm3 - addl %ecx,%r10d - rorl $2,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - psrlq $17,%xmm6 - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - pxor %xmm6,%xmm7 - xorl %r10d,%r13d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - pxor %xmm6,%xmm7 - andl %r10d,%r12d - xorl %r10d,%r13d - addl 56(%rsp),%ebx -.byte 102,65,15,56,0,248 - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - rorl $6,%r13d - paddd %xmm7,%xmm3 - addl %r12d,%ebx - pshufd $80,%xmm3,%xmm7 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - movdqa %xmm7,%xmm6 - addl %ebx,%r9d - rorl $2,%r14d - addl %edi,%ebx - psrld $10,%xmm7 - movl %r9d,%r13d - psrlq $17,%xmm6 - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - 
pxor %xmm6,%xmm7 - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - psrlq $2,%xmm6 - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - pxor %xmm6,%xmm7 - addl 60(%rsp),%eax - movl %ebx,%edi - movdqa 96(%rbp),%xmm6 - rorl $11,%r14d - xorl %r11d,%r12d -.byte 102,65,15,56,0,249 - xorl %ecx,%edi - rorl $6,%r13d - addl %r12d,%eax - andl %edi,%r15d - xorl %ebx,%r14d - paddd %xmm7,%xmm3 - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - paddd %xmm3,%xmm6 - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movdqa %xmm6,48(%rsp) - cmpb $0,131(%rbp) - jne L$ssse3_00_47 - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - rorl $11,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - addl %r11d,%edx - rorl $2,%r14d - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%r12d - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 4(%rsp),%r10d - movl %r11d,%edi - rorl $11,%r14d - xorl %r9d,%r12d - xorl %eax,%edi - rorl $6,%r13d - addl %r12d,%r10d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - rorl $2,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - rorl $6,%r13d - addl %r12d,%r9d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - addl %r9d,%ebx - rorl $2,%r14d - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%edi - rorl $11,%r14d - xorl %edx,%r12d - xorl %r10d,%edi - rorl $6,%r13d - addl %r12d,%r8d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - rorl $11,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - addl %edx,%r11d - rorl $2,%r14d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 20(%rsp),%ecx - movl %edx,%edi - rorl $11,%r14d - xorl %ebx,%r12d - xorl %r8d,%edi - rorl $6,%r13d - addl %r12d,%ecx - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - rorl $2,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - 
xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - rorl $6,%r13d - addl %r12d,%ebx - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - addl %ebx,%r9d - rorl $2,%r14d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%edi - rorl $11,%r14d - xorl %r11d,%r12d - xorl %ecx,%edi - rorl $6,%r13d - addl %r12d,%eax - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - rorl $11,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - addl %r11d,%edx - rorl $2,%r14d - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%r12d - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 36(%rsp),%r10d - movl %r11d,%edi - rorl $11,%r14d - xorl %r9d,%r12d - xorl %eax,%edi - rorl $6,%r13d - addl %r12d,%r10d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - rorl $2,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - rorl $6,%r13d - addl %r12d,%r9d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - addl %r9d,%ebx - rorl $2,%r14d - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%edi - rorl $11,%r14d - xorl %edx,%r12d - xorl %r10d,%edi - rorl $6,%r13d - addl %r12d,%r8d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - rorl $11,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - addl %edx,%r11d - rorl $2,%r14d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 52(%rsp),%ecx - movl %edx,%edi - rorl $11,%r14d - xorl %ebx,%r12d - xorl %r8d,%edi - rorl $6,%r13d - addl %r12d,%ecx - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - rorl $2,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - rorl 
$14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - rorl $6,%r13d - addl %r12d,%ebx - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - addl %ebx,%r9d - rorl $2,%r14d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%edi - rorl $11,%r14d - xorl %r11d,%r12d - xorl %ecx,%edi - rorl $6,%r13d - addl %r12d,%eax - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movq 64+0(%rsp),%rdi - movl %r14d,%eax - - addl 0(%rdi),%eax - leaq 64(%rsi),%rsi - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - cmpq 64+16(%rsp),%rsi - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - jb L$loop_ssse3 - - movq 64+24(%rsp),%rsi - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp -L$epilogue_ssse3: +L$xop_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + rorq $23,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r8,%r13 + xorq %r10,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rax,%r14 + vpaddq %xmm11,%xmm0,%xmm0 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 +.byte 143,72,120,195,209,7 + xorq %r10,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,223,3 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm7,%xmm10 + addq %r11,%rdx + addq %rdi,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %rdx,%r13 + addq %r11,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r11 + vpxor %xmm10,%xmm11,%xmm11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + vpaddq %xmm11,%xmm0,%xmm0 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + rorq $23,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rcx,%r13 + xorq %r8,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r10,%r14 + vpaddq %xmm11,%xmm1,%xmm1 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 +.byte 143,72,120,195,209,7 + xorq %r8,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,216,3 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + rorq $28,%r14 + vpsrlq 
$6,%xmm0,%xmm10 + addq %r9,%rbx + addq %rdi,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rbx,%r13 + addq %r9,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r9 + vpxor %xmm10,%xmm11,%xmm11 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + vpaddq %xmm11,%xmm1,%xmm1 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + rorq $23,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rax,%r13 + xorq %rcx,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r8,%r14 + vpaddq %xmm11,%xmm2,%xmm2 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 +.byte 143,72,120,195,209,7 + xorq %rcx,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,217,3 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm1,%xmm10 + addq %rdx,%r11 + addq %rdi,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %r11,%r13 + addq %rdx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rdx + vpxor %xmm10,%xmm11,%xmm11 + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + vpaddq %xmm11,%xmm2,%xmm2 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + rorq $23,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r10,%r13 + xorq %rax,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rcx,%r14 + vpaddq %xmm11,%xmm3,%xmm3 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 +.byte 143,72,120,195,209,7 + xorq %rax,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,218,3 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm2,%xmm10 + addq %rbx,%r9 + addq %rdi,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r9,%r13 + addq %rbx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rbx + vpxor %xmm10,%xmm11,%xmm11 + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + vpaddq %xmm11,%xmm3,%xmm3 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + rorq $23,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + rorq $5,%r14 +.byte 
143,72,120,195,200,56 + xorq %r8,%r13 + xorq %r10,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rax,%r14 + vpaddq %xmm11,%xmm4,%xmm4 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 +.byte 143,72,120,195,209,7 + xorq %r10,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,219,3 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm3,%xmm10 + addq %r11,%rdx + addq %rdi,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %rdx,%r13 + addq %r11,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r11 + vpxor %xmm10,%xmm11,%xmm11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + vpaddq %xmm11,%xmm4,%xmm4 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + rorq $23,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rcx,%r13 + xorq %r8,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r10,%r14 + vpaddq %xmm11,%xmm5,%xmm5 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 +.byte 143,72,120,195,209,7 + xorq %r8,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,220,3 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm4,%xmm10 + addq %r9,%rbx + addq %rdi,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rbx,%r13 + addq %r9,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r9 + vpxor %xmm10,%xmm11,%xmm11 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + vpaddq %xmm11,%xmm5,%xmm5 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + rorq $23,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rax,%r13 + xorq %rcx,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r8,%r14 + vpaddq %xmm11,%xmm6,%xmm6 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 +.byte 143,72,120,195,209,7 + xorq %rcx,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,221,3 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm5,%xmm10 + addq %rdx,%r11 + addq %rdi,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %r11,%r13 + addq %rdx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rdx + vpxor %xmm10,%xmm11,%xmm11 + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + vpaddq 
%xmm11,%xmm6,%xmm6 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + rorq $23,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r10,%r13 + xorq %rax,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rcx,%r14 + vpaddq %xmm11,%xmm7,%xmm7 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 +.byte 143,72,120,195,209,7 + xorq %rax,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,222,3 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm6,%xmm10 + addq %rbx,%r9 + addq %rdi,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r9,%r13 + addq %rbx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rbx + vpxor %xmm10,%xmm11,%xmm11 + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + vpaddq %xmm11,%xmm7,%xmm7 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne L$xop_00_47 + rorq $23,%r13 + movq %r14,%rax + movq %r9,%r12 + rorq $5,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + rorq $4,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + rorq $6,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + rorq $28,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + rorq $23,%r13 + movq %r14,%r11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + rorq $23,%r13 + movq %r14,%r10 + movq %rdx,%r12 + rorq $5,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + rorq $4,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + rorq $6,%r14 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + rorq $28,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + rorq $23,%r13 + movq %r14,%r9 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + rorq $23,%r13 + movq %r14,%r8 + movq %rbx,%r12 + rorq $5,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + 
rorq $4,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + rorq $6,%r14 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + rorq $28,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + rorq $23,%r13 + movq %r14,%rdx + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + rorq $23,%r13 + movq %r14,%rcx + movq %r11,%r12 + rorq $5,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + rorq $4,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + rorq $6,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + rorq $28,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + rorq $23,%r13 + movq %r14,%rbx + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + rorq $23,%r13 + movq %r14,%rax + movq %r9,%r12 + rorq $5,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + rorq $4,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + rorq $6,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + rorq $28,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + rorq $23,%r13 + movq %r14,%r11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + rorq $23,%r13 + movq %r14,%r10 + movq %rdx,%r12 + rorq $5,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + rorq $4,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + rorq $6,%r14 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + rorq $28,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + rorq $23,%r13 + movq %r14,%r9 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + rorq $23,%r13 + movq %r14,%r8 + movq %rbx,%r12 + rorq $5,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + rorq $4,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq 
%r8,%r15 + xorq %rcx,%r12 + rorq $6,%r14 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + rorq $28,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + rorq $23,%r13 + movq %r14,%rdx + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + rorq $23,%r13 + movq %r14,%rcx + movq %r11,%r12 + rorq $5,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + rorq $4,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + rorq $6,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + rorq $28,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + rorq $23,%r13 + movq %r14,%rbx + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb L$loop_xop + + movq 152(%rsp),%rsi + + vzeroupper + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$epilogue_xop: + .byte 0xf3,0xc3 + + + +.p2align 6 +sha512_block_data_order_avx: + +L$avx_shortcut: + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) + +L$prologue_avx: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp L$loop_avx +.p2align 4 +L$loop_avx: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa 
%xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp L$avx_00_47 + +.p2align 4 +L$avx_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm0,%xmm0 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 0(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm7,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm7,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm7,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 8(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm0,%xmm0 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm1,%xmm1 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 16(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm0,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm0,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm0,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 24(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm1,%xmm1 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa 
%xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm2,%xmm2 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 32(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm1,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm1,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm1,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 40(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm2,%xmm2 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm3,%xmm3 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 48(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm2,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm2,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm2,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 56(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm3,%xmm3 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm4,%xmm4 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq 
%r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 64(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm3,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm3,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm3,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 72(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm4,%xmm4 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm5,%xmm5 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 80(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm4,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm4,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm4,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 88(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm5,%xmm5 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm6,%xmm6 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 96(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq 
%r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm5,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm5,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm5,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 104(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm6,%xmm6 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm7,%xmm7 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 112(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm6,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm6,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm6,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 120(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm7,%xmm7 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne L$avx_00_47 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 
+ movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq 
$6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb L$loop_avx + + movq 152(%rsp),%rsi + + vzeroupper + movq -48(%rsi),%r15 + + movq 
-40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$epilogue_avx: + .byte 0xf3,0xc3 + + + +.p2align 6 +sha512_block_data_order_avx2: + +L$avx2_shortcut: + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $1312,%rsp + shlq $4,%rdx + andq $-2048,%rsp + leaq (%rsi,%rdx,8),%rdx + addq $1152,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) + +L$prologue_avx2: + + vzeroupper + subq $-128,%rsi + movq 0(%rdi),%rax + movq %rsi,%r12 + movq 8(%rdi),%rbx + cmpq %rdx,%rsi + movq 16(%rdi),%rcx + cmoveq %rsp,%r12 + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp L$oop_avx2 +.p2align 4 +L$oop_avx2: + vmovdqu -128(%rsi),%xmm0 + vmovdqu -128+16(%rsi),%xmm1 + vmovdqu -128+32(%rsi),%xmm2 + leaq K512+128(%rip),%rbp + vmovdqu -128+48(%rsi),%xmm3 + vmovdqu -128+64(%rsi),%xmm4 + vmovdqu -128+80(%rsi),%xmm5 + vmovdqu -128+96(%rsi),%xmm6 + vmovdqu -128+112(%rsi),%xmm7 + + vmovdqa 1152(%rbp),%ymm10 + vinserti128 $1,(%r12),%ymm0,%ymm0 + vinserti128 $1,16(%r12),%ymm1,%ymm1 + vpshufb %ymm10,%ymm0,%ymm0 + vinserti128 $1,32(%r12),%ymm2,%ymm2 + vpshufb %ymm10,%ymm1,%ymm1 + vinserti128 $1,48(%r12),%ymm3,%ymm3 + vpshufb %ymm10,%ymm2,%ymm2 + vinserti128 $1,64(%r12),%ymm4,%ymm4 + vpshufb %ymm10,%ymm3,%ymm3 + vinserti128 $1,80(%r12),%ymm5,%ymm5 + vpshufb %ymm10,%ymm4,%ymm4 + vinserti128 $1,96(%r12),%ymm6,%ymm6 + vpshufb %ymm10,%ymm5,%ymm5 + vinserti128 $1,112(%r12),%ymm7,%ymm7 + + vpaddq -128(%rbp),%ymm0,%ymm8 + vpshufb %ymm10,%ymm6,%ymm6 + vpaddq -96(%rbp),%ymm1,%ymm9 + vpshufb %ymm10,%ymm7,%ymm7 + vpaddq -64(%rbp),%ymm2,%ymm10 + vpaddq -32(%rbp),%ymm3,%ymm11 + vmovdqa %ymm8,0(%rsp) + vpaddq 0(%rbp),%ymm4,%ymm8 + vmovdqa %ymm9,32(%rsp) + vpaddq 32(%rbp),%ymm5,%ymm9 + vmovdqa %ymm10,64(%rsp) + vpaddq 64(%rbp),%ymm6,%ymm10 + vmovdqa %ymm11,96(%rsp) + leaq -128(%rsp),%rsp + vpaddq 96(%rbp),%ymm7,%ymm11 + vmovdqa %ymm8,0(%rsp) + xorq %r14,%r14 + vmovdqa %ymm9,32(%rsp) + movq %rbx,%rdi + vmovdqa %ymm10,64(%rsp) + xorq %rcx,%rdi + vmovdqa %ymm11,96(%rsp) + movq %r9,%r12 + addq $32*8,%rbp + jmp L$avx2_00_47 + +.p2align 4 +L$avx2_00_47: + leaq -128(%rsp),%rsp + vpalignr $8,%ymm0,%ymm1,%ymm8 + addq 0+256(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + vpalignr $8,%ymm4,%ymm5,%ymm11 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + vpsrlq $1,%ymm8,%ymm10 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + vpaddq %ymm11,%ymm0,%ymm0 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + vpsrlq $6,%ymm7,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + vpsllq $3,%ymm7,%ymm10 + vpaddq %ymm8,%ymm0,%ymm0 + addq 8+256(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + vpsrlq $19,%ymm7,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor 
%ymm10,%ymm11,%ymm11 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + vpaddq %ymm11,%ymm0,%ymm0 + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + vpaddq -128(%rbp),%ymm0,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + vmovdqa %ymm10,0(%rsp) + vpalignr $8,%ymm1,%ymm2,%ymm8 + addq 32+256(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + vpalignr $8,%ymm5,%ymm6,%ymm11 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + vpsrlq $1,%ymm8,%ymm10 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + vpaddq %ymm11,%ymm1,%ymm1 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + vpsrlq $6,%ymm0,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + vpsllq $3,%ymm0,%ymm10 + vpaddq %ymm8,%ymm1,%ymm1 + addq 40+256(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + vpsrlq $19,%ymm0,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + vpaddq %ymm11,%ymm1,%ymm1 + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + vpaddq -96(%rbp),%ymm1,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + vmovdqa %ymm10,32(%rsp) + vpalignr $8,%ymm2,%ymm3,%ymm8 + addq 64+256(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + vpalignr $8,%ymm6,%ymm7,%ymm11 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + vpsrlq $1,%ymm8,%ymm10 + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + vpaddq %ymm11,%ymm2,%ymm2 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + vpsrlq $6,%ymm1,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + vpsllq $3,%ymm1,%ymm10 + vpaddq %ymm8,%ymm2,%ymm2 + addq 72+256(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + vpsrlq $19,%ymm1,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + vpaddq %ymm11,%ymm2,%ymm2 + 
rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + vpaddq -64(%rbp),%ymm2,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + vmovdqa %ymm10,64(%rsp) + vpalignr $8,%ymm3,%ymm4,%ymm8 + addq 96+256(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + vpalignr $8,%ymm7,%ymm0,%ymm11 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + vpsrlq $1,%ymm8,%ymm10 + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + vpaddq %ymm11,%ymm3,%ymm3 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + vpsrlq $6,%ymm2,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + vpsllq $3,%ymm2,%ymm10 + vpaddq %ymm8,%ymm3,%ymm3 + addq 104+256(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + vpsrlq $19,%ymm2,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + vpaddq %ymm11,%ymm3,%ymm3 + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + vpaddq -32(%rbp),%ymm3,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + vmovdqa %ymm10,96(%rsp) + leaq -128(%rsp),%rsp + vpalignr $8,%ymm4,%ymm5,%ymm8 + addq 0+256(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + vpalignr $8,%ymm0,%ymm1,%ymm11 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + vpsrlq $1,%ymm8,%ymm10 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + vpaddq %ymm11,%ymm4,%ymm4 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + vpsrlq $6,%ymm3,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + vpsllq $3,%ymm3,%ymm10 + vpaddq %ymm8,%ymm4,%ymm4 + addq 8+256(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + vpsrlq $19,%ymm3,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + vpaddq %ymm11,%ymm4,%ymm4 + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + vpaddq 0(%rbp),%ymm4,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + 
leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + vmovdqa %ymm10,0(%rsp) + vpalignr $8,%ymm5,%ymm6,%ymm8 + addq 32+256(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + vpalignr $8,%ymm1,%ymm2,%ymm11 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + vpsrlq $1,%ymm8,%ymm10 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + vpaddq %ymm11,%ymm5,%ymm5 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + vpsrlq $6,%ymm4,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + vpsllq $3,%ymm4,%ymm10 + vpaddq %ymm8,%ymm5,%ymm5 + addq 40+256(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + vpsrlq $19,%ymm4,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + vpaddq %ymm11,%ymm5,%ymm5 + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + vpaddq 32(%rbp),%ymm5,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + vmovdqa %ymm10,32(%rsp) + vpalignr $8,%ymm6,%ymm7,%ymm8 + addq 64+256(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + vpalignr $8,%ymm2,%ymm3,%ymm11 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + vpsrlq $1,%ymm8,%ymm10 + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + vpaddq %ymm11,%ymm6,%ymm6 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + vpsrlq $6,%ymm5,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + vpsllq $3,%ymm5,%ymm10 + vpaddq %ymm8,%ymm6,%ymm6 + addq 72+256(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + vpsrlq $19,%ymm5,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + vpaddq %ymm11,%ymm6,%ymm6 + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + vpaddq 64(%rbp),%ymm6,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + vmovdqa %ymm10,64(%rsp) + vpalignr $8,%ymm7,%ymm0,%ymm8 + addq 96+256(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + vpalignr $8,%ymm3,%ymm4,%ymm11 + rorxq 
$18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + vpsrlq $1,%ymm8,%ymm10 + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + vpaddq %ymm11,%ymm7,%ymm7 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + vpsrlq $6,%ymm6,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + vpsllq $3,%ymm6,%ymm10 + vpaddq %ymm8,%ymm7,%ymm7 + addq 104+256(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + vpsrlq $19,%ymm6,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + vpaddq %ymm11,%ymm7,%ymm7 + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + vpaddq 96(%rbp),%ymm7,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + vmovdqa %ymm10,96(%rsp) + leaq 256(%rbp),%rbp + cmpb $0,-121(%rbp) + jne L$avx2_00_47 + addq 0+128(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + addq 8+128(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + addq 32+128(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + addq 40+128(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + 
leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + addq 64+128(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + addq 72+128(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + addq 96+128(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + addq 104+128(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + addq 0(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + addq 8(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + addq 32(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + addq 40(%rsp),%r8 + 
andq %rbx,%r12 + rorxq $41,%rbx,%r13 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + addq 64(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + addq 72(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + addq 96(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + addq 104(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + movq 1280(%rsp),%rdi + addq %r14,%rax + + leaq 1152(%rsp),%rbp + + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + + cmpq 144(%rbp),%rsi + je L$done_avx2 + + xorq %r14,%r14 + movq %rbx,%rdi + xorq %rcx,%rdi + movq %r9,%r12 + jmp L$ower_avx2 +.p2align 4 +L$ower_avx2: + addq 0+16(%rbp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + addq 8+16(%rbp),%r10 + andq %rdx,%r12 + rorxq 
$41,%rdx,%r13 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + addq 32+16(%rbp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + addq 40+16(%rbp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + addq 64+16(%rbp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + addq 72+16(%rbp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + addq 96+16(%rbp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + addq 104+16(%rbp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + leaq -128(%rbp),%rbp + cmpq %rsp,%rbp + jae L$ower_avx2 + + movq 1280(%rsp),%rdi + addq %r14,%rax + + leaq 
1152(%rsp),%rsp
+
+	addq	0(%rdi),%rax
+	addq	8(%rdi),%rbx
+	addq	16(%rdi),%rcx
+	addq	24(%rdi),%rdx
+	addq	32(%rdi),%r8
+	addq	40(%rdi),%r9
+	leaq	256(%rsi),%rsi
+	addq	48(%rdi),%r10
+	movq	%rsi,%r12
+	addq	56(%rdi),%r11
+	cmpq	128+16(%rsp),%rsi
+
+	movq	%rax,0(%rdi)
+	cmoveq	%rsp,%r12
+	movq	%rbx,8(%rdi)
+	movq	%rcx,16(%rdi)
+	movq	%rdx,24(%rdi)
+	movq	%r8,32(%rdi)
+	movq	%r9,40(%rdi)
+	movq	%r10,48(%rdi)
+	movq	%r11,56(%rdi)
+
+	jbe	L$oop_avx2
+	leaq	(%rsp),%rbp
+
+L$done_avx2:
+	leaq	(%rbp),%rsp
+	movq	152(%rsp),%rsi
+
+	vzeroupper
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbp
+
+	movq	-8(%rsi),%rbx
+
+	leaq	(%rsi),%rsp
+
+L$epilogue_avx2:
+	.byte	0xf3,0xc3
+
diff --git a/lib/accelerated/x86/sha-x86.h b/lib/accelerated/x86/sha-x86.h
index 4724604bdd..6bbbfb8641 100644
--- a/lib/accelerated/x86/sha-x86.h
+++ b/lib/accelerated/x86/sha-x86.h
@@ -3,9 +3,6 @@
 
 #include 
 
-/* nettle's SHA512 is faster than openssl's */
-#undef ENABLE_SHA512
-
 extern const struct nettle_hash x86_sha1;
 extern const struct nettle_hash x86_sha224;
 extern const struct nettle_hash x86_sha256;
-- 
cgit v1.2.1