author     weidai <weidai@57ff6487-cd31-0410-9ec3-f628ee90f5f0>   2007-09-24 00:43:57 +0000
committer  weidai <weidai@57ff6487-cd31-0410-9ec3-f628ee90f5f0>   2007-09-24 00:43:57 +0000
commit     982ba6fa712d44275c2541b6b9badf489cf9eda6 (patch)
tree       7d4e77f11bb8dc49557b634d8380767aef1b8502
parent     489a156f9bc41028439b6375af6314e473565847 (diff)
- port x64 assembly code to MASM
- improve stack unwindability on x64 for GCC by not modifying RBP/RSP registers in inline assembly
git-svn-id: svn://svn.code.sf.net/p/cryptopp/code/trunk/c5@396 57ff6487-cd31-0410-9ec3-f628ee90f5f0
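
The port keeps a single assembly-language source for all three toolchains. Each routine is written with AS1/AS2/ASL-style macros: compiled normally, they expand to GCC inline assembly or MSVC __asm blocks, while preprocessing with "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM" (as the comments added to each .cpp file note) expands the same lines into bare MASM text for x64masm.asm. A simplified sketch of the dispatch — the real cpu.h macros below also append a *newline* marker so the generated MASM gets line breaks:

    // Simplified sketch of the macro dispatch this commit adds to cpu.h.
    // One source line such as AS2(mov rax, rcx) becomes:
    //   - bare MASM text when generating x64masm.asm,
    //   - a stringized fragment of an asm("...") statement under GCC,
    //   - an __asm block under the MSVC inline assembler.
    #ifdef CRYPTOPP_GENERATE_X64_MASM
        #define AS2(x, y)     x, y
        #define ASL(x)        label##x:
    #elif defined(__GNUC__)
        #define GNU_AS2(x, y) #x ", " #y ";"   // two steps so arguments expand first
        #define AS2(x, y)     GNU_AS2(x, y)
        #define ASL(x)        #x ":"
    #else
        #define AS2(x, y)     __asm {x, y}
        #define ASL(x)        __asm {label##x:}
    #endif

Since the x64 VC++ compiler has no inline assembler, this is what lets one body of code serve the MASM build (via the generated .asm file) as well as the x86 MSVC and GCC builds.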
-rwxr-xr-x   cpu.h            121
-rw-r--r--   panama.cpp       230
-rw-r--r--   rijndael.cpp     154
-rwxr-xr-x   salsa.cpp        719
-rwxr-xr-x   sosemanuk.cpp    228
-rw-r--r--   whrlpool.cpp      94
-rwxr-xr-x   x64masm.asm     1842

7 files changed, 2854 insertions, 534 deletions
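
Much of the churn in panama.cpp, salsa.cpp, and sosemanuk.cpp funnels their keystream output through the new AS_XMM_OUTPUT4 macro that the cpu.h hunk below introduces: it XORs four XMM registers of keystream against optional input and stores them to the output, choosing movdqa or movdqu per pointer at runtime. For a single 16-byte block, a rough C++ intrinsics equivalent of that dispatch (an illustrative helper, not code from the library) looks like this:

    #include <emmintrin.h>
    #include <cstdint>

    // Illustrative equivalent of one lane of AS_XMM_OUTPUT4: XOR a 16-byte
    // keystream block with optional input, using aligned SSE2 loads/stores
    // only when the pointer permits them.
    static void xmm_output_block(const unsigned char *in, unsigned char *out, __m128i k)
    {
        if (in)   // input may be NULL (pure keystream generation)
        {
            __m128i x = ((uintptr_t)in & 15) ? _mm_loadu_si128((const __m128i *)in)
                                             : _mm_load_si128((const __m128i *)in);
            k = _mm_xor_si128(k, x);
        }
        if ((uintptr_t)out & 15)
            _mm_storeu_si128((__m128i *)out, k);   // movdqu path
        else
            _mm_store_si128((__m128i *)out, k);    // movdqa path
    }

Centralizing this in one macro is what allows the per-cipher output blocks (previously open-coded in each file, as the removed lines in sosemanuk.cpp show) to be deleted.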
@@ -1,6 +1,15 @@ #ifndef CRYPTOPP_CPU_H #define CRYPTOPP_CPU_H +#ifdef CRYPTOPP_GENERATE_X64_MASM + +#define CRYPTOPP_X86_ASM_AVAILABLE +#define CRYPTOPP_BOOL_X64 1 +#define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 1 +#define NAMESPACE_END + +#else + #include "config.h" #ifdef CRYPTOPP_MSVC6PP_OR_LATER @@ -98,7 +107,18 @@ inline bool HasMMX() {return false;} #endif // #ifdef CRYPTOPP_X86_ASM_AVAILABLE || _MSC_VER >= 1400 -#if defined(__GNUC__) +#endif + +#ifdef CRYPTOPP_GENERATE_X64_MASM + #define AS1(x) x*newline* + #define AS2(x, y) x, y*newline* + #define AS3(x, y, z) x, y, z*newline* + #define ASS(x, y, a, b, c, d) x, y, a*64+b*16+c*4+d*newline* + #define ASL(x) label##x:*newline* + #define ASJ(x, y, z) x label##y*newline* + #define ASC(x, y) x label##y*newline* + #define AS_HEX(y) y##h +#elif defined(__GNUC__) // define these in two steps to allow arguments to be expanded #define GNU_AS1(x) #x ";" #define GNU_AS2(x, y) #x ", " #y ";" @@ -113,6 +133,7 @@ inline bool HasMMX() {return false;} #define ASJ(x, y, z) GNU_ASJ(x, y, z) #define ASC(x, y) #x " " #y ";" #define CRYPTOPP_NAKED + #define AS_HEX(y) 0x##y #else #define AS1(x) __asm {x} #define AS2(x, y) __asm {x, y} @@ -122,25 +143,115 @@ inline bool HasMMX() {return false;} #define ASJ(x, y, z) __asm {x label##y} #define ASC(x, y) __asm {x label##y} #define CRYPTOPP_NAKED __declspec(naked) + #define AS_HEX(y) 0x##y #endif +#ifdef CRYPTOPP_GENERATE_X64_MASM +#define ASM_MOD(x, y) ((x) MOD (y)) +#else // GNU assembler doesn't seem to have mod operator #define ASM_MOD(x, y) ((x)-((x)/(y))*(y)) +#endif #if CRYPTOPP_BOOL_X86 + #define AS_REG_1 ecx + #define AS_REG_2 edx + #define AS_REG_3 esi + #define AS_REG_4 edi + #define AS_REG_5 eax + #define AS_REG_6 ebx + #define AS_REG_7 ebp + #define AS_REG_1d ecx + #define AS_REG_2d edx + #define AS_REG_3d esi + #define AS_REG_4d edi + #define AS_REG_5d eax + #define AS_REG_6d ebx + #define AS_REG_7d ebp #define WORD_SZ 4 #define WORD_REG(x) e##x #define WORD_PTR DWORD PTR - #define AS_PUSH(x) AS1(push e##x) - #define AS_POP(x) AS1(pop e##x) + #define AS_PUSH_IF86(x) AS1(push e##x) + #define AS_POP_IF86(x) AS1(pop e##x) + #define AS_JCXZ jecxz #elif CRYPTOPP_BOOL_X64 + #ifdef CRYPTOPP_GENERATE_X64_MASM + #define AS_REG_1 rcx + #define AS_REG_2 rdx + #define AS_REG_3 r8 + #define AS_REG_4 r9 + #define AS_REG_5 rax + #define AS_REG_6 r10 + #define AS_REG_7 r11 + #define AS_REG_1d ecx + #define AS_REG_2d edx + #define AS_REG_3d r8d + #define AS_REG_4d r9d + #define AS_REG_5d eax + #define AS_REG_6d r10d + #define AS_REG_7d r11d + #else + #define AS_REG_1 rdi + #define AS_REG_2 rsi + #define AS_REG_3 rdx + #define AS_REG_4 rcx + #define AS_REG_5 r8 + #define AS_REG_6 r9 + #define AS_REG_7 r10 + #define AS_REG_1d edi + #define AS_REG_2d esi + #define AS_REG_3d edx + #define AS_REG_4d ecx + #define AS_REG_5d r8d + #define AS_REG_6d r9d + #define AS_REG_7d r10d + #endif #define WORD_SZ 8 #define WORD_REG(x) r##x #define WORD_PTR QWORD PTR - #define AS_PUSH(x) AS1(pushq r##x) - #define AS_POP(x) AS1(popq r##x) + #define AS_PUSH_IF86(x) + #define AS_POP_IF86(x) + #define AS_JCXZ jrcxz #endif +// helper macro for stream cipher output +#define AS_XMM_OUTPUT4(labelPrefix, inputPtr, outputPtr, x0, x1, x2, x3, t, p0, p1, p2, p3, increment)\ + AS2( test inputPtr, inputPtr)\ + ASC( jz, labelPrefix##3)\ + AS2( test inputPtr, 15)\ + ASC( jnz, labelPrefix##7)\ + AS2( pxor xmm##x0, [inputPtr+p0*16])\ + AS2( pxor xmm##x1, [inputPtr+p1*16])\ + AS2( pxor xmm##x2, [inputPtr+p2*16])\ + AS2( pxor xmm##x3, [inputPtr+p3*16])\ + 
AS2( add inputPtr, increment*16)\ + ASC( jmp, labelPrefix##3)\ + ASL(labelPrefix##7)\ + AS2( movdqu xmm##t, [inputPtr+p0*16])\ + AS2( pxor xmm##x0, xmm##t)\ + AS2( movdqu xmm##t, [inputPtr+p1*16])\ + AS2( pxor xmm##x1, xmm##t)\ + AS2( movdqu xmm##t, [inputPtr+p2*16])\ + AS2( pxor xmm##x2, xmm##t)\ + AS2( movdqu xmm##t, [inputPtr+p3*16])\ + AS2( pxor xmm##x3, xmm##t)\ + AS2( add inputPtr, increment*16)\ + ASL(labelPrefix##3)\ + AS2( test outputPtr, 15)\ + ASC( jnz, labelPrefix##8)\ + AS2( movdqa [outputPtr+p0*16], xmm##x0)\ + AS2( movdqa [outputPtr+p1*16], xmm##x1)\ + AS2( movdqa [outputPtr+p2*16], xmm##x2)\ + AS2( movdqa [outputPtr+p3*16], xmm##x3)\ + ASC( jmp, labelPrefix##9)\ + ASL(labelPrefix##8)\ + AS2( movdqu [outputPtr+p0*16], xmm##x0)\ + AS2( movdqu [outputPtr+p1*16], xmm##x1)\ + AS2( movdqu [outputPtr+p2*16], xmm##x2)\ + AS2( movdqu [outputPtr+p3*16], xmm##x3)\ + ASL(labelPrefix##9)\ + AS2( add outputPtr, increment*16) + NAMESPACE_END #endif @@ -1,6 +1,11 @@ // panama.cpp - written and placed in the public domain by Wei Dai +// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM panama.cpp" to generate MASM code + #include "pch.h" + +#ifndef CRYPTOPP_GENERATE_X64_MASM + #include "panama.h" #include "misc.h" #include "cpu.h" @@ -16,41 +21,67 @@ void Panama<B>::Reset() #endif } -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE +#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM +#ifdef CRYPTOPP_X64_MASM_AVAILABLE +extern "C" { +void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y); +} +#elif CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + +#ifdef CRYPTOPP_GENERATE_X64_MASM + Panama_SSE2_Pull PROC FRAME + alloc_stack(2*16+8) + save_xmm128 xmm6, 0h + save_xmm128 xmm7, 10h + .endprolog +#else #pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code - void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y) { #ifdef __GNUC__ __asm__ __volatile__ ( ".intel_syntax noprefix;" - AS_PUSH( bx) + AS_POP_IF86( bx) #else - AS2( mov WORD_REG(cx), count) - AS2( mov WORD_REG(si), state) - AS2( mov WORD_REG(di), z) - AS2( mov WORD_REG(dx), y) + AS2( mov AS_REG_1, count) + AS2( mov AS_REG_2, state) + AS2( mov AS_REG_3, z) + AS2( mov AS_REG_4, y) #endif - AS2( shl WORD_REG(cx), 5) - ASJ( jz, 5, f) - AS2( mov ebx, [WORD_REG(si)+4*17]) - AS2( add WORD_REG(cx), WORD_REG(bx)) +#endif // #ifdef CRYPTOPP_GENERATE_X64_MASM - AS_PUSH( bp) - AS_PUSH( cx) +#if CRYPTOPP_BOOL_X86 + #define REG_loopEnd [esp] +#elif defined(CRYPTOPP_GENERATE_X64_MASM) + #define REG_loopEnd rdi +#else + #define REG_loopEnd r8 +#endif - AS2( movdqa xmm0, [WORD_REG(si)+0*16]) - AS2( movdqa xmm1, [WORD_REG(si)+1*16]) - AS2( movdqa xmm2, [WORD_REG(si)+2*16]) - AS2( movdqa xmm3, [WORD_REG(si)+3*16]) - AS2( mov eax, [WORD_REG(si)+4*16]) + AS2( shl AS_REG_1, 5) + ASJ( jz, 5, f) + AS2( mov AS_REG_6d, [AS_REG_2+4*17]) + AS2( add AS_REG_1, AS_REG_6) + + #if CRYPTOPP_BOOL_X64 + AS2( mov REG_loopEnd, AS_REG_1) + #else + AS1( push ebp) + AS1( push AS_REG_1) + #endif + + AS2( movdqa xmm0, XMMWORD PTR [AS_REG_2+0*16]) + AS2( movdqa xmm1, XMMWORD PTR [AS_REG_2+1*16]) + AS2( movdqa xmm2, XMMWORD PTR [AS_REG_2+2*16]) + AS2( movdqa xmm3, XMMWORD PTR [AS_REG_2+3*16]) + AS2( mov eax, dword ptr [AS_REG_2+4*16]) ASL(4) // gamma and pi #if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE - AS2( test WORD_REG(bx), 1) + AS2( test AS_REG_6, 1) ASJ( jnz, 6, f) #endif AS2( movdqa xmm6, xmm2) @@ -70,18 +101,18 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y) ASL(7) #endif - AS2( movd ecx, xmm2) - 
AS1( not ecx) - AS2( movd ebp, xmm3) - AS2( or ecx, ebp) - AS2( xor eax, ecx) + AS2( movd AS_REG_1d, xmm2) + AS1( not AS_REG_1d) + AS2( movd AS_REG_7d, xmm3) + AS2( or AS_REG_1d, AS_REG_7d) + AS2( xor eax, AS_REG_1d) #define SSE2_Index(i) ASM_MOD(((i)*13+16), 17) #define pi(i) \ - AS2( movd ecx, xmm7)\ - AS2( rol ecx, ASM_MOD((ASM_MOD(5*i,17)*(ASM_MOD(5*i,17)+1)/2), 32))\ - AS2( mov [WORD_REG(si)+SSE2_Index(ASM_MOD(5*(i), 17))*4], ecx) + AS2( movd AS_REG_1d, xmm7)\ + AS2( rol AS_REG_1d, ASM_MOD((ASM_MOD(5*i,17)*(ASM_MOD(5*i,17)+1)/2), 32))\ + AS2( mov [AS_REG_2+SSE2_Index(ASM_MOD(5*(i), 17))*4], AS_REG_1d) #define pi4(x, y, z, a, b, c, d) \ AS2( pcmpeqb xmm7, xmm7)\ @@ -110,65 +141,65 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y) AS2( punpckhdq xmm2, xmm0) // 11 12 15 16 // keystream - AS2( test WORD_REG(di), WORD_REG(di)) + AS2( test AS_REG_3, AS_REG_3) ASJ( jz, 0, f) AS2( movdqa xmm6, xmm4) AS2( punpcklqdq xmm4, xmm2) AS2( punpckhqdq xmm6, xmm2) - AS2( test WORD_REG(dx), 0xf) + AS2( test AS_REG_4, 15) ASJ( jnz, 2, f) - AS2( test WORD_REG(dx), WORD_REG(dx)) + AS2( test AS_REG_4, AS_REG_4) ASJ( jz, 1, f) - AS2( pxor xmm4, [WORD_REG(dx)]) - AS2( pxor xmm6, [WORD_REG(dx)+16]) - AS2( add WORD_REG(dx), 32) + AS2( pxor xmm4, [AS_REG_4]) + AS2( pxor xmm6, [AS_REG_4+16]) + AS2( add AS_REG_4, 32) ASJ( jmp, 1, f) ASL(2) - AS2( movdqu xmm0, [WORD_REG(dx)]) - AS2( movdqu xmm2, [WORD_REG(dx)+16]) + AS2( movdqu xmm0, [AS_REG_4]) + AS2( movdqu xmm2, [AS_REG_4+16]) AS2( pxor xmm4, xmm0) AS2( pxor xmm6, xmm2) - AS2( add WORD_REG(dx), 32) + AS2( add AS_REG_4, 32) ASL(1) - AS2( test WORD_REG(di), 0xf) + AS2( test AS_REG_3, 15) ASJ( jnz, 3, f) - AS2( movdqa [WORD_REG(di)], xmm4) - AS2( movdqa [WORD_REG(di)+16], xmm6) - AS2( add WORD_REG(di), 32) + AS2( movdqa XMMWORD PTR [AS_REG_3], xmm4) + AS2( movdqa XMMWORD PTR [AS_REG_3+16], xmm6) + AS2( add AS_REG_3, 32) ASJ( jmp, 0, f) ASL(3) - AS2( movdqu [WORD_REG(di)], xmm4) - AS2( movdqu [WORD_REG(di)+16], xmm6) - AS2( add WORD_REG(di), 32) + AS2( movdqu XMMWORD PTR [AS_REG_3], xmm4) + AS2( movdqu XMMWORD PTR [AS_REG_3+16], xmm6) + AS2( add AS_REG_3, 32) ASL(0) // buffer update - AS2( lea WORD_REG(cx), [WORD_REG(bx) + 32]) - AS2( and WORD_REG(cx), 31*32) - AS2( lea WORD_REG(bp), [WORD_REG(bx) + (32-24)*32]) - AS2( and WORD_REG(bp), 31*32) + AS2( lea AS_REG_1, [AS_REG_6 + 32]) + AS2( and AS_REG_1, 31*32) + AS2( lea AS_REG_7, [AS_REG_6 + (32-24)*32]) + AS2( and AS_REG_7, 31*32) - AS2( movdqa xmm0, [WORD_REG(si)+20*4+WORD_REG(cx)+0*8]) + AS2( movdqa xmm0, XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+0*8]) AS2( pxor xmm3, xmm0) ASS( pshufd xmm0, xmm0, 2, 3, 0, 1) - AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(cx)+0*8], xmm3) - AS2( pxor xmm0, [WORD_REG(si)+20*4+WORD_REG(bp)+2*8]) - AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(bp)+2*8], xmm0) + AS2( movdqa XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+0*8], xmm3) + AS2( pxor xmm0, XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+2*8]) + AS2( movdqa XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+2*8], xmm0) - AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+2*8]) + AS2( movdqa xmm4, XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+2*8]) AS2( pxor xmm1, xmm4) - AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(cx)+2*8], xmm1) - AS2( pxor xmm4, [WORD_REG(si)+20*4+WORD_REG(bp)+0*8]) - AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(bp)+0*8], xmm4) + AS2( movdqa XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+2*8], xmm1) + AS2( pxor xmm4, XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+0*8]) + AS2( movdqa XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+0*8], xmm4) // theta - AS2( movdqa xmm3, 
[WORD_REG(si)+3*16]) - AS2( movdqa xmm2, [WORD_REG(si)+2*16]) - AS2( movdqa xmm1, [WORD_REG(si)+1*16]) - AS2( movdqa xmm0, [WORD_REG(si)+0*16]) + AS2( movdqa xmm3, XMMWORD PTR [AS_REG_2+3*16]) + AS2( movdqa xmm2, XMMWORD PTR [AS_REG_2+2*16]) + AS2( movdqa xmm1, XMMWORD PTR [AS_REG_2+1*16]) + AS2( movdqa xmm0, XMMWORD PTR [AS_REG_2+0*16]) #if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE - AS2( test WORD_REG(bx), 1) + AS2( test AS_REG_6, 1) ASJ( jnz, 8, f) #endif AS2( movd xmm6, eax) @@ -199,10 +230,10 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y) #endif AS2( xor eax, 1) - AS2( movd ecx, xmm0) - AS2( xor eax, ecx) - AS2( movd ecx, xmm3) - AS2( xor eax, ecx) + AS2( movd AS_REG_1d, xmm0) + AS2( xor eax, AS_REG_1d) + AS2( movd AS_REG_1d, xmm3) + AS2( xor eax, AS_REG_1d) AS2( pxor xmm3, xmm2) AS2( pxor xmm2, xmm1) @@ -214,21 +245,21 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y) AS2( pxor xmm0, xmm4) // sigma - AS2( lea WORD_REG(cx), [WORD_REG(bx) + (32-4)*32]) - AS2( and WORD_REG(cx), 31*32) - AS2( lea WORD_REG(bp), [WORD_REG(bx) + 16*32]) - AS2( and WORD_REG(bp), 31*32) + AS2( lea AS_REG_1, [AS_REG_6 + (32-4)*32]) + AS2( and AS_REG_1, 31*32) + AS2( lea AS_REG_7, [AS_REG_6 + 16*32]) + AS2( and AS_REG_7, 31*32) - AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+0*16]) - AS2( movdqa xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+0*16]) + AS2( movdqa xmm4, XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+0*16]) + AS2( movdqa xmm5, XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+0*16]) AS2( movdqa xmm6, xmm4) AS2( punpcklqdq xmm4, xmm5) AS2( punpckhqdq xmm6, xmm5) AS2( pxor xmm3, xmm4) AS2( pxor xmm2, xmm6) - AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+1*16]) - AS2( movdqa xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+1*16]) + AS2( movdqa xmm4, XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+1*16]) + AS2( movdqa xmm5, XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+1*16]) AS2( movdqa xmm6, xmm4) AS2( punpcklqdq xmm4, xmm5) AS2( punpckhqdq xmm6, xmm5) @@ -236,31 +267,48 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y) AS2( pxor xmm0, xmm6) // loop - AS2( add WORD_REG(bx), 32) - AS2( cmp WORD_REG(bx), [WORD_REG(sp)]) + AS2( add AS_REG_6, 32) + AS2( cmp AS_REG_6, REG_loopEnd) ASJ( jne, 4, b) // save state - AS2( add WORD_REG(sp), WORD_SZ) - AS_POP( bp) - AS2( mov [WORD_REG(si)+4*16], eax) - AS2( movdqa [WORD_REG(si)+3*16], xmm3) - AS2( movdqa [WORD_REG(si)+2*16], xmm2) - AS2( movdqa [WORD_REG(si)+1*16], xmm1) - AS2( movdqa [WORD_REG(si)+0*16], xmm0) + AS2( mov [AS_REG_2+4*16], eax) + AS2( movdqa XMMWORD PTR [AS_REG_2+3*16], xmm3) + AS2( movdqa XMMWORD PTR [AS_REG_2+2*16], xmm2) + AS2( movdqa XMMWORD PTR [AS_REG_2+1*16], xmm1) + AS2( movdqa XMMWORD PTR [AS_REG_2+0*16], xmm0) + + #if CRYPTOPP_BOOL_X86 + AS2( add esp, 4) + AS1( pop ebp) + #endif ASL(5) #ifdef __GNUC__ - AS_POP( bx) - ".att_syntax prefix;" - : - : "c" (count), "S" (state), "D" (z), "d" (y) - : "%eax", "memory", "cc" + AS_POP_IF86( bx) + ".att_syntax prefix;" + : + #if CRYPTOPP_BOOL_X64 + : "D" (count), "S" (state), "d" (z), "c" (y) + : "%r8", "%r9", "r10", "%eax", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7" + #else + : "c" (count), "d" (state), "S" (z), "D" (y) + : "%eax", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7" + #endif ); #endif +#ifdef CRYPTOPP_GENERATE_X64_MASM + movdqa xmm6, [rsp + 0h] + movdqa xmm7, [rsp + 10h] + add rsp, 2*16+8 + ret + Panama_SSE2_Pull ENDP +#else } - #endif +#endif // #ifdef 
CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + +#ifndef CRYPTOPP_GENERATE_X64_MASM template <class B> void Panama<B>::Iterate(size_t count, const word32 *p, word32 *z, const word32 *y) @@ -411,7 +459,7 @@ void PanamaCipherPolicy<B>::CipherResynchronize(byte *keystreamBuffer, const byt this->Iterate(1, buf); } -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2()) Panama_SSE2_Pull(32, this->m_state, NULL, NULL); else @@ -423,7 +471,7 @@ void PanamaCipherPolicy<B>::CipherResynchronize(byte *keystreamBuffer, const byt template <class B> unsigned int PanamaCipherPolicy<B>::GetAlignment() const { -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2()) return 16; else @@ -435,7 +483,7 @@ unsigned int PanamaCipherPolicy<B>::GetAlignment() const template <class B> void PanamaCipherPolicy<B>::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) { -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2()) Panama_SSE2_Pull(iterationCount, this->m_state, (word32 *)output, (const word32 *)input); else @@ -453,3 +501,5 @@ template class PanamaCipherPolicy<BigEndian>; template class PanamaCipherPolicy<LittleEndian>; NAMESPACE_END + +#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM diff --git a/rijndael.cpp b/rijndael.cpp index ac4f769..b89e3b3 100644 --- a/rijndael.cpp +++ b/rijndael.cpp @@ -2,6 +2,8 @@ // and Wei Dai from Paulo Baretto's Rijndael implementation // The original code and all modifications are in the public domain. +// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code + /* Defense against timing attacks was added in July 2006 by Wei Dai. @@ -48,6 +50,7 @@ being unloaded from L1 cache, until that round is finished. 
#include "pch.h" #ifndef CRYPTOPP_IMPORTS +#ifndef CRYPTOPP_GENERATE_X64_MASM #include "rijndael.h" #include "misc.h" @@ -145,27 +148,56 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key + m_rounds*4, m_key + m_rounds*4, 16); } +#ifdef CRYPTOPP_X64_MASM_AVAILABLE +extern "C" { +void Rijndael_Enc_ProcessAndXorBlock(const word32 *table, word32 cacheLineSize, const word32 *k, const word32 *kLoopEnd, const byte *inBlock, const byte *xorBlock, byte *outBlock); +} +#endif + #pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const { +#endif // #ifdef CRYPTOPP_GENERATE_X64_MASM + +#ifdef CRYPTOPP_X64_MASM_AVAILABLE + Rijndael_Enc_ProcessAndXorBlock(Te, g_cacheLineSize, m_key, m_key + m_rounds*4, inBlock, xorBlock, outBlock); + return; +#endif + #if defined(CRYPTOPP_X86_ASM_AVAILABLE) + #ifdef CRYPTOPP_GENERATE_X64_MASM + ALIGN 8 + Rijndael_Enc_ProcessAndXorBlock PROC FRAME + rex_push_reg rbx + push_reg rsi + push_reg rdi + push_reg r12 + push_reg r13 + push_reg r14 + push_reg r15 + .endprolog + mov AS_REG_7, rcx + mov rdi, [rsp + 5*8 + 7*8] ; inBlock + #else if (HasMMX()) { const word32 *k = m_key; const word32 *kLoopEnd = k + m_rounds*4; + #endif + #if CRYPTOPP_BOOL_X64 #define K_REG r8 #define K_END_REG r9 #define SAVE_K #define RESTORE_K #define RESTORE_K_END - #define SAVE_0(x) AS2(mov r10d, x) - #define SAVE_1(x) AS2(mov r11d, x) - #define SAVE_2(x) AS2(mov r12d, x) - #define RESTORE_0(x) AS2(mov x, r10d) - #define RESTORE_1(x) AS2(mov x, r11d) - #define RESTORE_2(x) AS2(mov x, r12d) + #define SAVE_0(x) AS2(mov r13d, x) + #define SAVE_1(x) AS2(mov r14d, x) + #define SAVE_2(x) AS2(mov r15d, x) + #define RESTORE_0(x) AS2(mov x, r13d) + #define RESTORE_1(x) AS2(mov x, r14d) + #define RESTORE_2(x) AS2(mov x, r15d) #else #define K_REG esi #define K_END_REG edi @@ -184,22 +216,16 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock __asm__ __volatile__ ( ".intel_syntax noprefix;" - AS_PUSH( bx) - AS_PUSH( bp) - AS2( mov WORD_REG(bp), WORD_REG(ax)) #if CRYPTOPP_BOOL_X64 - // save these manually. 
clobber list doesn't seem to work as of GCC 4.1.0 - AS1( pushq K_REG) - AS1( pushq K_END_REG) - AS1( pushq r10) - AS1( pushq r11) - AS1( pushq r12) AS2( mov K_REG, rsi) AS2( mov K_END_REG, rcx) #else + AS1( push ebx) + AS1( push ebp) AS2( movd mm5, ecx) #endif -#else + AS2( mov AS_REG_7, WORD_REG(ax)) +#elif CRYPTOPP_BOOL_X86 #if _MSC_VER < 1300 const word32 *t = Te; AS2( mov eax, t) @@ -209,12 +235,12 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock AS2( mov K_REG, k) AS2( movd mm5, kLoopEnd) #if _MSC_VER < 1300 - AS_PUSH( bx) - AS_PUSH( bp) - AS2( mov ebp, eax) + AS1( push ebx) + AS1( push ebp) + AS2( mov AS_REG_7, eax) #else - AS_PUSH( bp) - AS2( lea ebp, Te) + AS1( push ebp) + AS2( lea AS_REG_7, Te) #endif #endif AS2( mov eax, [K_REG+0*4]) // s0 @@ -236,21 +262,21 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock AS2( and ebx, 0) AS2( mov edi, ebx) // make index depend on previous loads to simulate lfence ASL(2) - AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)]) + AS2( and ebx, [AS_REG_7+WORD_REG(di)]) AS2( add edi, edx) - AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)]) + AS2( and ebx, [AS_REG_7+WORD_REG(di)]) AS2( add edi, edx) - AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)]) + AS2( and ebx, [AS_REG_7+WORD_REG(di)]) AS2( add edi, edx) - AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)]) + AS2( and ebx, [AS_REG_7+WORD_REG(di)]) AS2( add edi, edx) AS2( cmp edi, 1024) ASJ( jl, 2, b) - AS2( and ebx, [WORD_REG(bp)+1020]) + AS2( and ebx, [AS_REG_7+1020]) #if CRYPTOPP_BOOL_X64 - AS2( xor r10d, ebx) - AS2( xor r11d, ebx) - AS2( xor r12d, ebx) + AS2( xor r13d, ebx) + AS2( xor r14d, ebx) + AS2( xor r15d, ebx) #else AS2( movd mm6, ebx) AS2( pxor mm2, mm6) @@ -268,14 +294,14 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock #define QUARTER_ROUND(t, a, b, c, d) \ AS2(movzx esi, t##l)\ - AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)])\ + AS2(d, [AS_REG_7+0*1024+4*WORD_REG(si)])\ AS2(movzx esi, t##h)\ - AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\ + AS2(c, [AS_REG_7+1*1024+4*WORD_REG(si)])\ AS2(shr e##t##x, 16)\ AS2(movzx esi, t##l)\ - AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\ + AS2(b, [AS_REG_7+2*1024+4*WORD_REG(si)])\ AS2(movzx esi, t##h)\ - AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)]) + AS2(a, [AS_REG_7+3*1024+4*WORD_REG(si)]) #define s0 xor edi #define s1 xor eax @@ -308,14 +334,14 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock #define QUARTER_ROUND(t, a, b, c, d) \ AS2(movzx esi, t##l)\ - AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)])\ + AS2(a, [AS_REG_7+3*1024+4*WORD_REG(si)])\ AS2(movzx esi, t##h)\ - AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\ + AS2(b, [AS_REG_7+2*1024+4*WORD_REG(si)])\ AS2(shr e##t##x, 16)\ AS2(movzx esi, t##l)\ - AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\ + AS2(c, [AS_REG_7+1*1024+4*WORD_REG(si)])\ AS2(movzx esi, t##h)\ - AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)]) + AS2(d, [AS_REG_7+0*1024+4*WORD_REG(si)]) QUARTER_ROUND(d, s0, s1, s2, s3) RESTORE_2(edx) @@ -369,20 +395,20 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock #define QUARTER_ROUND(a, b, c, d) \ AS2( movzx ebx, dl)\ - AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\ + AS2( movzx ebx, BYTE PTR [AS_REG_7+1+4*WORD_REG(bx)])\ AS2( shl ebx, 3*8)\ AS2( xor a, ebx)\ AS2( movzx ebx, dh)\ - AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\ + AS2( movzx ebx, BYTE PTR [AS_REG_7+1+4*WORD_REG(bx)])\ AS2( shl ebx, 2*8)\ AS2( xor b, ebx)\ AS2( shr edx, 
16)\ AS2( movzx ebx, dl)\ AS2( shr edx, 8)\ - AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\ + AS2( movzx ebx, BYTE PTR [AS_REG_7+1+4*WORD_REG(bx)])\ AS2( shl ebx, 1*8)\ AS2( xor c, ebx)\ - AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(dx)])\ + AS2( movzx ebx, BYTE PTR [AS_REG_7+1+4*WORD_REG(dx)])\ AS2( xor d, ebx) QUARTER_ROUND(eax, ecx, esi, edi) @@ -395,25 +421,22 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock #undef QUARTER_ROUND -#if CRYPTOPP_BOOL_X64 - AS1(popq r12) - AS1(popq r11) - AS1(popq r10) - AS1(popq K_END_REG) - AS1(popq K_REG) -#else +#if CRYPTOPP_BOOL_X86 AS1(emms) + AS1(pop ebp) + #if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300) + AS1(pop ebx) + #endif #endif - AS_POP( bp) -#if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300) - AS_POP( bx) -#endif #ifdef __GNUC__ ".att_syntax prefix;" : "=a" (t0), "=c" (t1), "=S" (t2), "=D" (t3) : "a" (Te), "D" (inBlock), "S" (k), "c" (kLoopEnd), "d" (g_cacheLineSize) : "memory", "cc" + #if CRYPTOPP_BOOL_X64 + , "%ebx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" + #endif ); if (xorBlock) @@ -428,7 +451,11 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock ((word32 *)outBlock)[2] = t2; ((word32 *)outBlock)[3] = t3; #else - AS2( mov WORD_REG(bx), xorBlock) + #if CRYPTOPP_BOOL_X64 + mov rbx, [rsp + 6*8 + 7*8] ; xorBlock + #else + AS2( mov ebx, xorBlock) + #endif AS2( test WORD_REG(bx), WORD_REG(bx)) ASJ( jz, 1, f) AS2( xor eax, [WORD_REG(bx)+0*4]) @@ -436,15 +463,33 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock AS2( xor esi, [WORD_REG(bx)+2*4]) AS2( xor edi, [WORD_REG(bx)+3*4]) ASL(1) - AS2( mov WORD_REG(bx), outBlock) + #if CRYPTOPP_BOOL_X64 + mov rbx, [rsp + 7*8 + 7*8] ; outBlock + #else + AS2( mov ebx, outBlock) + #endif AS2( mov [WORD_REG(bx)+0*4], eax) AS2( mov [WORD_REG(bx)+1*4], ecx) AS2( mov [WORD_REG(bx)+2*4], esi) AS2( mov [WORD_REG(bx)+3*4], edi) #endif + +#if CRYPTOPP_GENERATE_X64_MASM + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbx + ret + Rijndael_Enc_ProcessAndXorBlock ENDP +#else } else +#endif #endif // #ifdef CRYPTOPP_X86_ASM_AVAILABLE +#ifndef CRYPTOPP_GENERATE_X64_MASM { word32 s0, s1, s2, s3, t0, t1, t2, t3; const word32 *rk = m_key; @@ -674,3 +719,4 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock NAMESPACE_END #endif +#endif @@ -1,6 +1,11 @@ // salsa.cpp - written and placed in the public domain by Wei Dai +// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM salsa.cpp" to generate MASM code + #include "pch.h" + +#ifndef CRYPTOPP_GENERATE_X64_MASM + #include "salsa.h" #include "misc.h" #include "argnames.h" @@ -53,7 +58,7 @@ void Salsa20_Policy::SeekToIteration(lword iterationCount) #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64 unsigned int Salsa20_Policy::GetAlignment() const { -#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE if (HasSSE2()) return 16; else @@ -63,7 +68,7 @@ unsigned int Salsa20_Policy::GetAlignment() const unsigned int Salsa20_Policy::GetOptimalBlockSize() const { -#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE if (HasSSE2()) return 4*BYTES_PER_ITERATION; else @@ -72,267 +77,489 @@ unsigned int Salsa20_Policy::GetOptimalBlockSize() const } #endif +#ifdef CRYPTOPP_X64_MASM_AVAILABLE +extern "C" { +void Salsa20_OperateKeystream(byte *output, const byte *input, size_t iterationCount, int rounds, void *state); +} +#endif + 
void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) { - int i; -#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE - #define SSE2_QUARTER_ROUND(a, b, d, i) {\ - __m128i t = _mm_add_epi32(a, d); \ - b = _mm_xor_si128(b, _mm_slli_epi32(t, i)); \ - b = _mm_xor_si128(b, _mm_srli_epi32(t, 32-i));} +#endif // #ifdef CRYPTOPP_GENERATE_X64_MASM + +#ifdef CRYPTOPP_X64_MASM_AVAILABLE + Salsa20_OperateKeystream(output, input, iterationCount, m_rounds, m_state.data()); + return; +#endif +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE +#ifdef CRYPTOPP_GENERATE_X64_MASM + ALIGN 8 + Salsa20_OperateKeystream PROC FRAME + mov r10, [rsp + 5*8] ; state + alloc_stack(10*16 + 32*16 + 8) + save_xmm128 xmm6, 0200h + save_xmm128 xmm7, 0210h + save_xmm128 xmm8, 0220h + save_xmm128 xmm9, 0230h + save_xmm128 xmm10, 0240h + save_xmm128 xmm11, 0250h + save_xmm128 xmm12, 0260h + save_xmm128 xmm13, 0270h + save_xmm128 xmm14, 0280h + save_xmm128 xmm15, 0290h + .endprolog + + #define REG_output rcx + #define REG_input rdx + #define REG_iterationCount r8 + #define REG_state r10 + #define REG_rounds eax + #define REG_temp32 r11d + #define REG_temp r11 + #define SSE2_WORKSPACE rsp + #define SSE2_LOAD_ROUNDS mov eax, r9d +#else if (HasSSE2()) { - __m128i *s = (__m128i *)m_state.data(); - -#if _MSC_VER > 1400 || (defined(_MSC_VER) && CRYPTOPP_BOOL_X86) || (CRYPTOPP_GCC_VERSION >= 40000 && CRYPTOPP_BOOL_X86) - // This code triggers an internal compiler error on MSVC 2005 when compiling - // for x64 with optimizations on. hopefully it will get fixed in the next release. - // A bug report has been submitted at http://connect.microsoft.com/VisualStudio/feedback/ViewFeedback.aspx?FeedbackID=274123 - // Also, GCC 3.4.4 generates incorrect code for x86 at -O2. 
- // GCC 4.1.1 generates incorrect code for x64 at -O2 - if (iterationCount >= 4) - { - __m128i ss[16]; - ss[0] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(0, 0, 0, 0)); - ss[1] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(1, 1, 1, 1)); - ss[2] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(2, 2, 2, 2)); - ss[3] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(3, 3, 3, 3)); - ss[4] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(0, 0, 0, 0)); - ss[6] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(2, 2, 2, 2)); - ss[7] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(3, 3, 3, 3)); - ss[9] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(1, 1, 1, 1)); - ss[10] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(2, 2, 2, 2)); - ss[11] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(3, 3, 3, 3)); - ss[12] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(0, 0, 0, 0)); - ss[13] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(1, 1, 1, 1)); - ss[14] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(2, 2, 2, 2)); - ss[15] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(3, 3, 3, 3)); - - do - { - word32 *countersLo = (word32*)&(ss[8]), *countersHi = (word32*)&(ss[5]); - for (i=0; i<4; i++) - { - countersLo[i] = m_state[8]; - countersHi[i] = m_state[5]; - if (++m_state[8] == 0) - ++m_state[5]; - } - - __m128i x0 = ss[0]; - __m128i x1 = ss[1]; - __m128i x2 = ss[2]; - __m128i x3 = ss[3]; - __m128i x4 = ss[4]; - __m128i x5 = ss[5]; - __m128i x6 = ss[6]; - __m128i x7 = ss[7]; - __m128i x8 = ss[8]; - __m128i x9 = ss[9]; - __m128i x10 = ss[10]; - __m128i x11 = ss[11]; - __m128i x12 = ss[12]; - __m128i x13 = ss[13]; - __m128i x14 = ss[14]; - __m128i x15 = ss[15]; - - for (i=m_rounds; i>0; i-=2) - { - #define QUARTER_ROUND(a, b, c, d) \ - SSE2_QUARTER_ROUND(a, b, d, 7) \ - SSE2_QUARTER_ROUND(b, c, a, 9) \ - SSE2_QUARTER_ROUND(c, d, b, 13) \ - SSE2_QUARTER_ROUND(d, a, c, 18) - - QUARTER_ROUND(x0, x4, x8, x12) - QUARTER_ROUND(x1, x5, x9, x13) - QUARTER_ROUND(x2, x6, x10, x14) - QUARTER_ROUND(x3, x7, x11, x15) - - QUARTER_ROUND(x0, x13, x10, x7) - QUARTER_ROUND(x1, x14, x11, x4) - QUARTER_ROUND(x2, x15, x8, x5) - QUARTER_ROUND(x3, x12, x9, x6) - - #undef QUARTER_ROUND - } - - x0 = _mm_add_epi32(x0, ss[0]); - x1 = _mm_add_epi32(x1, ss[1]); - x2 = _mm_add_epi32(x2, ss[2]); - x3 = _mm_add_epi32(x3, ss[3]); - x4 = _mm_add_epi32(x4, ss[4]); - x5 = _mm_add_epi32(x5, ss[5]); - x6 = _mm_add_epi32(x6, ss[6]); - x7 = _mm_add_epi32(x7, ss[7]); - x8 = _mm_add_epi32(x8, ss[8]); - x9 = _mm_add_epi32(x9, ss[9]); - x10 = _mm_add_epi32(x10, ss[10]); - x11 = _mm_add_epi32(x11, ss[11]); - x12 = _mm_add_epi32(x12, ss[12]); - x13 = _mm_add_epi32(x13, ss[13]); - x14 = _mm_add_epi32(x14, ss[14]); - x15 = _mm_add_epi32(x15, ss[15]); - - #define OUTPUT_4(x, a, b, c, d, e, f, g, h) {\ - __m128i t0 = _mm_unpacklo_epi32(a, b);\ - __m128i t1 = _mm_unpacklo_epi32(c, d);\ - __m128i t2 = _mm_unpacklo_epi64(t0, t1);\ - CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, e, t2)\ - t2 = _mm_unpackhi_epi64(t0, t1);\ - CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, f, t2)\ - t0 = _mm_unpackhi_epi32(a, b);\ - t1 = _mm_unpackhi_epi32(c, d);\ - t2 = _mm_unpacklo_epi64(t0, t1);\ - CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, g, t2)\ - t2 = _mm_unpackhi_epi64(t0, t1);\ - CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, h, t2)} - - #define SALSA_OUTPUT(x) \ - OUTPUT_4(x, x0, x13, x10, x7, 0, 4, 8, 12)\ - OUTPUT_4(x, x4, x1, x14, x11, 1, 5, 9, 13)\ - OUTPUT_4(x, x8, x5, x2, x15, 2, 6, 10, 14)\ - OUTPUT_4(x, x12, x9, x6, x3, 3, 7, 11, 15) - - CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, 4*BYTES_PER_ITERATION) - - #undef SALSA_OUTPUT - } while ((iterationCount-=4) >= 4); - } + #if CRYPTOPP_BOOL_X64 + #define REG_output %4 + #define REG_input %1 + 
#define REG_iterationCount %2 + #define REG_state %3 + #define REG_rounds eax + #define REG_temp32 edx + #define REG_temp rdx + #define SSE2_WORKSPACE %5 + #define SSE2_LOAD_ROUNDS AS2(mov eax, %0) + + __m128i workspace[32]; + #else + #define REG_output edi + #define REG_input eax + #define REG_iterationCount ecx + #define REG_state esi + #define REG_rounds ebx + #define REG_temp32 edx + #define REG_temp edx + #define SSE2_WORKSPACE esp + WORD_SZ + #ifdef __GNUC__ + // this assumes that a frame pointer is used + #define SSE2_LOAD_ROUNDS ".att_syntax prefix;movl %0, %%ebx;.intel_syntax noprefix;" + #else + #define SSE2_LOAD_ROUNDS AS2(mov REG_rounds, r) + #endif + #endif + + word32 r = m_rounds; + + #ifdef __GNUC__ + __asm__ __volatile__ + ( + ".intel_syntax noprefix;" + AS_PUSH_IF86( bx) + #else + void *s = m_state.data(); + + AS2( mov REG_iterationCount, iterationCount) + AS2( mov REG_state, s) + AS2( mov REG_input, input) + AS2( mov REG_output, output) + #endif +#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM + + AS2( cmp REG_iterationCount, 4) + ASJ( jl, 5, f) + +#if CRYPTOPP_BOOL_X86 + AS2( mov ebx, esp) + AS2( and esp, -16) + AS2( sub esp, 32*16) + AS1( push ebx) #endif - if (!IsP4() && iterationCount > 0) - { - const __m128i s_maskLo32 = _mm_shuffle_epi32(_mm_cvtsi32_si128(-1), _MM_SHUFFLE(1, 0, 1, 0)); - const __m128i s_maskHi32 = _mm_slli_epi64(s_maskLo32, 32); - - do - { - __m128i x0 = s[0]; - __m128i x1 = s[1]; - __m128i x2 = s[2]; - __m128i x3 = s[3]; - - for (i=m_rounds; i>0; i-=2) - { - SSE2_QUARTER_ROUND(x0, x1, x3, 7) - SSE2_QUARTER_ROUND(x1, x2, x0, 9) - SSE2_QUARTER_ROUND(x2, x3, x1, 13) - SSE2_QUARTER_ROUND(x3, x0, x2, 18) - - x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(2, 1, 0, 3)); - x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2)); - x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(0, 3, 2, 1)); - - SSE2_QUARTER_ROUND(x0, x3, x1, 7) - SSE2_QUARTER_ROUND(x3, x2, x0, 9) - SSE2_QUARTER_ROUND(x2, x1, x3, 13) - SSE2_QUARTER_ROUND(x1, x0, x2, 18) - - x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(0, 3, 2, 1)); - x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2)); - x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(2, 1, 0, 3)); - } - - x0 = _mm_add_epi32(x0, s[0]); - x1 = _mm_add_epi32(x1, s[1]); - x2 = _mm_add_epi32(x2, s[2]); - x3 = _mm_add_epi32(x3, s[3]); - - if (++m_state[8] == 0) - ++m_state[5]; - - __m128i k02 = _mm_or_si128(_mm_slli_epi64(x0, 32), _mm_srli_epi64(x3, 32)); - k02 = _mm_shuffle_epi32(k02, _MM_SHUFFLE(0, 1, 2, 3)); - __m128i k13 = _mm_or_si128(_mm_slli_epi64(x1, 32), _mm_srli_epi64(x0, 32)); - k13 = _mm_shuffle_epi32(k13, _MM_SHUFFLE(0, 1, 2, 3)); - __m128i k20 = _mm_or_si128(_mm_and_si128(x2, s_maskLo32), _mm_and_si128(x1, s_maskHi32)); - __m128i k31 = _mm_or_si128(_mm_and_si128(x3, s_maskLo32), _mm_and_si128(x2, s_maskHi32)); - - __m128i k0 = _mm_unpackhi_epi64(k02, k20); - __m128i k1 = _mm_unpackhi_epi64(k13, k31); - __m128i k2 = _mm_unpacklo_epi64(k20, k02); - __m128i k3 = _mm_unpacklo_epi64(k31, k13); - - #define SSE2_OUTPUT(x) {\ - CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 0, k0)\ - CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 1, k1)\ - CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 2, k2)\ - CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 3, k3)} - - CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SSE2_OUTPUT, BYTES_PER_ITERATION); - } - while (--iterationCount); - } +#define SSE2_EXPAND_S(i, j) \ + ASS( pshufd xmm4, xmm##i, j, j, j, j) \ + AS2( movdqa [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4) + + AS2( movdqa xmm0, [REG_state + 0*16]) + AS2( movdqa xmm1, [REG_state + 1*16]) + AS2( movdqa xmm2, [REG_state + 2*16]) + AS2( movdqa xmm3, 
[REG_state + 3*16]) + SSE2_EXPAND_S(0, 0) + SSE2_EXPAND_S(0, 1) + SSE2_EXPAND_S(0, 2) + SSE2_EXPAND_S(0, 3) + SSE2_EXPAND_S(1, 0) + SSE2_EXPAND_S(1, 2) + SSE2_EXPAND_S(1, 3) + SSE2_EXPAND_S(2, 1) + SSE2_EXPAND_S(2, 2) + SSE2_EXPAND_S(2, 3) + SSE2_EXPAND_S(3, 0) + SSE2_EXPAND_S(3, 1) + SSE2_EXPAND_S(3, 2) + SSE2_EXPAND_S(3, 3) + +#define SSE2_EXPAND_S85(i) \ + AS2( mov dword ptr [SSE2_WORKSPACE + 8*16 + i*4 + 256], REG_rounds) \ + AS2( mov dword ptr [SSE2_WORKSPACE + 5*16 + i*4 + 256], REG_temp32) \ + AS2( add REG_rounds, 1) \ + AS2( adc REG_temp32, 0) + + ASL(1) + AS2( mov REG_rounds, dword ptr [REG_state + 8*4]) + AS2( mov REG_temp32, dword ptr [REG_state + 5*4]) + SSE2_EXPAND_S85(0) + SSE2_EXPAND_S85(1) + SSE2_EXPAND_S85(2) + SSE2_EXPAND_S85(3) + AS2( mov dword ptr [REG_state + 8*4], REG_rounds) + AS2( mov dword ptr [REG_state + 5*4], REG_temp32) + +#define SSE2_QUARTER_ROUND(a, b, d, i) \ + AS2( movdqa xmm4, xmm##d) \ + AS2( paddd xmm4, xmm##a) \ + AS2( movdqa xmm5, xmm4) \ + AS2( pslld xmm4, i) \ + AS2( psrld xmm5, 32-i) \ + AS2( pxor xmm##b, xmm4) \ + AS2( pxor xmm##b, xmm5) + +#define L01(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) /* y3 */ +#define L02(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##C, [SSE2_WORKSPACE + a*16 + i*256]) /* y0 */ +#define L03(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* y0+y3 */ +#define L04(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A) +#define L05(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 7) +#define L06(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-7) +#define L07(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256]) +#define L08(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z1 */ +#define L09(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + b*16], xmm##A) +#define L10(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A) +#define L11(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* z1+y0 */ +#define L12(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A) +#define L13(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 9) +#define L14(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-9) +#define L15(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256]) +#define L16(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z2 */ +#define L17(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + c*16], xmm##A) +#define L18(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A) +#define L19(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##B) /* z2+z1 */ +#define L20(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A) +#define L21(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 13) +#define L22(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-13) +#define L23(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) +#define L24(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z3 */ +#define L25(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + d*16], xmm##A) +#define L26(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##D) /* z3+z2 */ +#define L27(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A) +#define L28(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 18) +#define L29(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-18) +#define L30(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##C) /* xor y0 */ +#define L31(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z0 */ +#define L32(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + a*16], xmm##A) + +#define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h) \ + L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) \ + L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) \ + L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) \ + L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) \ + 
L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) \ + L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) \ + L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) \ + L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) \ + L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) \ + L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) \ + L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) \ + L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) \ + L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) \ + L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) \ + L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) \ + L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) \ + L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) \ + L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) \ + L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) \ + L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) \ + L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) \ + L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) \ + L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) \ + L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) \ + L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) \ + L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) \ + L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) \ + L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) \ + L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) \ + L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) \ + L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) \ + L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i) + +#define SSE2_QUARTER_ROUND_X16(i, a, b, c, d, e, f, g, h, A, B, C, D, E, F, G, H) \ + L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) L01(8,9,10,11, A,B,C,D, i) L01(12,13,14,15, E,F,G,H, i) \ + L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) L02(8,9,10,11, A,B,C,D, i) L02(12,13,14,15, E,F,G,H, i) \ + L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) L03(8,9,10,11, A,B,C,D, i) L03(12,13,14,15, E,F,G,H, i) \ + L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) L04(8,9,10,11, A,B,C,D, i) L04(12,13,14,15, E,F,G,H, i) \ + L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) L05(8,9,10,11, A,B,C,D, i) L05(12,13,14,15, E,F,G,H, i) \ + L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) L06(8,9,10,11, A,B,C,D, i) L06(12,13,14,15, E,F,G,H, i) \ + L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) L07(8,9,10,11, A,B,C,D, i) L07(12,13,14,15, E,F,G,H, i) \ + L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) L08(8,9,10,11, A,B,C,D, i) L08(12,13,14,15, E,F,G,H, i) \ + L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) L09(8,9,10,11, A,B,C,D, i) L09(12,13,14,15, E,F,G,H, i) \ + L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) L10(8,9,10,11, A,B,C,D, i) L10(12,13,14,15, E,F,G,H, i) \ + L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) L11(8,9,10,11, A,B,C,D, i) L11(12,13,14,15, E,F,G,H, i) \ + L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) L12(8,9,10,11, A,B,C,D, i) L12(12,13,14,15, E,F,G,H, i) \ + L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) L13(8,9,10,11, A,B,C,D, i) L13(12,13,14,15, E,F,G,H, i) \ + L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) L14(8,9,10,11, A,B,C,D, i) L14(12,13,14,15, E,F,G,H, i) \ + L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) L15(8,9,10,11, A,B,C,D, i) L15(12,13,14,15, E,F,G,H, i) \ + L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) L16(8,9,10,11, A,B,C,D, i) L16(12,13,14,15, E,F,G,H, i) \ + L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) L17(8,9,10,11, A,B,C,D, i) L17(12,13,14,15, E,F,G,H, i) \ + L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) L18(8,9,10,11, A,B,C,D, i) 
L18(12,13,14,15, E,F,G,H, i) \ + L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) L19(8,9,10,11, A,B,C,D, i) L19(12,13,14,15, E,F,G,H, i) \ + L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) L20(8,9,10,11, A,B,C,D, i) L20(12,13,14,15, E,F,G,H, i) \ + L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) L21(8,9,10,11, A,B,C,D, i) L21(12,13,14,15, E,F,G,H, i) \ + L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) L22(8,9,10,11, A,B,C,D, i) L22(12,13,14,15, E,F,G,H, i) \ + L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) L23(8,9,10,11, A,B,C,D, i) L23(12,13,14,15, E,F,G,H, i) \ + L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) L24(8,9,10,11, A,B,C,D, i) L24(12,13,14,15, E,F,G,H, i) \ + L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) L25(8,9,10,11, A,B,C,D, i) L25(12,13,14,15, E,F,G,H, i) \ + L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) L26(8,9,10,11, A,B,C,D, i) L26(12,13,14,15, E,F,G,H, i) \ + L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) L27(8,9,10,11, A,B,C,D, i) L27(12,13,14,15, E,F,G,H, i) \ + L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) L28(8,9,10,11, A,B,C,D, i) L28(12,13,14,15, E,F,G,H, i) \ + L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) L29(8,9,10,11, A,B,C,D, i) L29(12,13,14,15, E,F,G,H, i) \ + L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) L30(8,9,10,11, A,B,C,D, i) L30(12,13,14,15, E,F,G,H, i) \ + L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) L31(8,9,10,11, A,B,C,D, i) L31(12,13,14,15, E,F,G,H, i) \ + L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i) L32(8,9,10,11, A,B,C,D, i) L32(12,13,14,15, E,F,G,H, i) + +#if CRYPTOPP_BOOL_X64 + SSE2_QUARTER_ROUND_X16(1, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15) +#else + SSE2_QUARTER_ROUND_X8(1, 2, 6, 10, 14, 3, 7, 11, 15) + SSE2_QUARTER_ROUND_X8(1, 0, 4, 8, 12, 1, 5, 9, 13) +#endif + SSE2_LOAD_ROUNDS + ASJ( jmp, 2, f) + + ASL(SSE2_Salsa_Output) + AS2( movdqa xmm0, xmm4) + AS2( punpckldq xmm4, xmm5) + AS2( movdqa xmm1, xmm6) + AS2( punpckldq xmm6, xmm7) + AS2( movdqa xmm2, xmm4) + AS2( punpcklqdq xmm4, xmm6) // e + AS2( punpckhqdq xmm2, xmm6) // f + AS2( punpckhdq xmm0, xmm5) + AS2( punpckhdq xmm1, xmm7) + AS2( movdqa xmm6, xmm0) + AS2( punpcklqdq xmm0, xmm1) // g + AS2( punpckhqdq xmm6, xmm1) // h + AS_XMM_OUTPUT4(SSE2_Salsa_Output_A, REG_input, REG_output, 4, 2, 0, 6, 1, 0, 4, 8, 12, 1) + AS1( ret) + + ASL(6) +#if CRYPTOPP_BOOL_X64 + SSE2_QUARTER_ROUND_X16(0, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15) + ASL(2) + SSE2_QUARTER_ROUND_X16(0, 0, 13, 10, 7, 1, 14, 11, 4, 2, 15, 8, 5, 3, 12, 9, 6) +#else + SSE2_QUARTER_ROUND_X8(0, 2, 6, 10, 14, 3, 7, 11, 15) + SSE2_QUARTER_ROUND_X8(0, 0, 4, 8, 12, 1, 5, 9, 13) + ASL(2) + SSE2_QUARTER_ROUND_X8(0, 2, 15, 8, 5, 3, 12, 9, 6) + SSE2_QUARTER_ROUND_X8(0, 0, 13, 10, 7, 1, 14, 11, 4) +#endif + AS2( sub REG_rounds, 2) + ASJ( jnz, 6, b) + +#define SSE2_OUTPUT_4(a, b, c, d) \ + AS2( movdqa xmm4, [SSE2_WORKSPACE + a*16 + 256])\ + AS2( paddd xmm4, [SSE2_WORKSPACE + a*16])\ + AS2( movdqa xmm5, [SSE2_WORKSPACE + b*16 + 256])\ + AS2( paddd xmm5, [SSE2_WORKSPACE + b*16])\ + AS2( movdqa xmm6, [SSE2_WORKSPACE + c*16 + 256])\ + AS2( paddd xmm6, [SSE2_WORKSPACE + c*16])\ + AS2( movdqa xmm7, [SSE2_WORKSPACE + d*16 + 256])\ + AS2( paddd xmm7, [SSE2_WORKSPACE + d*16])\ + ASC( call, SSE2_Salsa_Output) + + SSE2_OUTPUT_4(0, 13, 10, 7) + SSE2_OUTPUT_4(4, 1, 14, 11) + SSE2_OUTPUT_4(8, 5, 2, 15) + SSE2_OUTPUT_4(12, 9, 6, 3) + AS2( test REG_input, REG_input) + ASJ( jz, 9, f) + AS2( add REG_input, 12*16) + ASL(9) + AS2( add REG_output, 12*16) + AS2( sub REG_iterationCount, 4) + 
AS2( cmp REG_iterationCount, 4) + ASJ( jge, 1, b) + AS_POP_IF86( sp) + + ASL(5) + AS2( sub REG_iterationCount, 1) + ASJ( jl, 4, f) + AS2( movdqa xmm0, [REG_state + 0*16]) + AS2( movdqa xmm1, [REG_state + 1*16]) + AS2( movdqa xmm2, [REG_state + 2*16]) + AS2( movdqa xmm3, [REG_state + 3*16]) + SSE2_LOAD_ROUNDS + + ASL(0) + SSE2_QUARTER_ROUND(0, 1, 3, 7) + SSE2_QUARTER_ROUND(1, 2, 0, 9) + SSE2_QUARTER_ROUND(2, 3, 1, 13) + SSE2_QUARTER_ROUND(3, 0, 2, 18) + ASS( pshufd xmm1, xmm1, 2, 1, 0, 3) + ASS( pshufd xmm2, xmm2, 1, 0, 3, 2) + ASS( pshufd xmm3, xmm3, 0, 3, 2, 1) + SSE2_QUARTER_ROUND(0, 3, 1, 7) + SSE2_QUARTER_ROUND(3, 2, 0, 9) + SSE2_QUARTER_ROUND(2, 1, 3, 13) + SSE2_QUARTER_ROUND(1, 0, 2, 18) + ASS( pshufd xmm1, xmm1, 0, 3, 2, 1) + ASS( pshufd xmm2, xmm2, 1, 0, 3, 2) + ASS( pshufd xmm3, xmm3, 2, 1, 0, 3) + AS2( sub REG_rounds, 2) + ASJ( jnz, 0, b) + + AS2( paddd xmm0, [REG_state + 0*16]) + AS2( paddd xmm1, [REG_state + 1*16]) + AS2( paddd xmm2, [REG_state + 2*16]) + AS2( paddd xmm3, [REG_state + 3*16]) + + AS2( add dword ptr [REG_state + 8*4], 1) + AS2( adc dword ptr [REG_state + 5*4], 0) + + AS2( pcmpeqb xmm6, xmm6) // all ones + AS2( psrlq xmm6, 32) // lo32 mask + ASS( pshufd xmm7, xmm6, 0, 1, 2, 3) // hi32 mask + AS2( movdqa xmm4, xmm0) + AS2( movdqa xmm5, xmm3) + AS2( pand xmm0, xmm7) + AS2( pand xmm4, xmm6) + AS2( pand xmm3, xmm6) + AS2( pand xmm5, xmm7) + AS2( por xmm4, xmm5) // 0,13,2,15 + AS2( movdqa xmm5, xmm1) + AS2( pand xmm1, xmm7) + AS2( pand xmm5, xmm6) + AS2( por xmm0, xmm5) // 4,1,6,3 + AS2( pand xmm6, xmm2) + AS2( pand xmm2, xmm7) + AS2( por xmm1, xmm6) // 8,5,10,7 + AS2( por xmm2, xmm3) // 12,9,14,11 + + AS2( movdqa xmm5, xmm4) + AS2( movdqa xmm6, xmm0) + AS3( shufpd xmm4, xmm1, 2) // 0,13,10,7 + AS3( shufpd xmm0, xmm2, 2) // 4,1,14,11 + AS3( shufpd xmm1, xmm5, 2) // 8,5,2,15 + AS3( shufpd xmm2, xmm6, 2) // 12,9,6,3 + + // output keystream + AS_XMM_OUTPUT4(SSE2_Salsa_Output_B, REG_input, REG_output, 4, 0, 1, 2, 3, 0, 1, 2, 3, 4) + ASJ( jmp, 5, b) + ASL(4) + +#ifdef __GNUC__ + AS_POP_IF86( bx) + ".att_syntax prefix;" + : + #if CRYPTOPP_BOOL_X64 + : "r" (r), "r" (input), "r" (iterationCount), "r" (m_state.data()), "r" (output), "r" (workspace) + : "%eax", "%edx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" + #else + : "m" (r), "a" (input), "c" (iterationCount), "S" (m_state.data()), "D" (output) + : "%edx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7" + #endif + ); +#endif +#ifdef CRYPTOPP_GENERATE_X64_MASM + movdqa xmm6, [rsp + 0200h] + movdqa xmm7, [rsp + 0210h] + movdqa xmm8, [rsp + 0220h] + movdqa xmm9, [rsp + 0230h] + movdqa xmm10, [rsp + 0240h] + movdqa xmm11, [rsp + 0250h] + movdqa xmm12, [rsp + 0260h] + movdqa xmm13, [rsp + 0270h] + movdqa xmm14, [rsp + 0280h] + movdqa xmm15, [rsp + 0290h] + add rsp, 10*16 + 32*16 + 8 + ret +Salsa20_OperateKeystream ENDP +#else } + else #endif - - word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - - while (iterationCount--) +#endif +#ifndef CRYPTOPP_GENERATE_X64_MASM { - x0 = m_state[0]; - x1 = m_state[1]; - x2 = m_state[2]; - x3 = m_state[3]; - x4 = m_state[4]; - x5 = m_state[5]; - x6 = m_state[6]; - x7 = m_state[7]; - x8 = m_state[8]; - x9 = m_state[9]; - x10 = m_state[10]; - x11 = m_state[11]; - x12 = m_state[12]; - x13 = m_state[13]; - x14 = m_state[14]; - x15 = m_state[15]; - - for (i=m_rounds; i>0; i-=2) + word32 x0, x1, x2, x3, x4, x5, 
x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; + + while (iterationCount--) { - #define QUARTER_ROUND(a, b, c, d) \ - b = b ^ rotlFixed(a + d, 7); \ - c = c ^ rotlFixed(b + a, 9); \ - d = d ^ rotlFixed(c + b, 13); \ - a = a ^ rotlFixed(d + c, 18); - - QUARTER_ROUND(x0, x4, x8, x12) - QUARTER_ROUND(x1, x5, x9, x13) - QUARTER_ROUND(x2, x6, x10, x14) - QUARTER_ROUND(x3, x7, x11, x15) - - QUARTER_ROUND(x0, x13, x10, x7) - QUARTER_ROUND(x1, x14, x11, x4) - QUARTER_ROUND(x2, x15, x8, x5) - QUARTER_ROUND(x3, x12, x9, x6) - } + x0 = m_state[0]; + x1 = m_state[1]; + x2 = m_state[2]; + x3 = m_state[3]; + x4 = m_state[4]; + x5 = m_state[5]; + x6 = m_state[6]; + x7 = m_state[7]; + x8 = m_state[8]; + x9 = m_state[9]; + x10 = m_state[10]; + x11 = m_state[11]; + x12 = m_state[12]; + x13 = m_state[13]; + x14 = m_state[14]; + x15 = m_state[15]; + + for (int i=m_rounds; i>0; i-=2) + { + #define QUARTER_ROUND(a, b, c, d) \ + b = b ^ rotlFixed(a + d, 7); \ + c = c ^ rotlFixed(b + a, 9); \ + d = d ^ rotlFixed(c + b, 13); \ + a = a ^ rotlFixed(d + c, 18); + + QUARTER_ROUND(x0, x4, x8, x12) + QUARTER_ROUND(x1, x5, x9, x13) + QUARTER_ROUND(x2, x6, x10, x14) + QUARTER_ROUND(x3, x7, x11, x15) + + QUARTER_ROUND(x0, x13, x10, x7) + QUARTER_ROUND(x1, x14, x11, x4) + QUARTER_ROUND(x2, x15, x8, x5) + QUARTER_ROUND(x3, x12, x9, x6) + } - #define SALSA_OUTPUT(x) {\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);} + #define SALSA_OUTPUT(x) {\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\ + 
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);} #ifndef CRYPTOPP_DOXYGEN_PROCESSING - CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION); + CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION); #endif - if (++m_state[8] == 0) - ++m_state[5]; + if (++m_state[8] == 0) + ++m_state[5]; + } } } // see comment above if an internal compiler error occurs here NAMESPACE_END + +#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM diff --git a/sosemanuk.cpp b/sosemanuk.cpp index c86b877..b8c0c6c 100755 --- a/sosemanuk.cpp +++ b/sosemanuk.cpp @@ -1,12 +1,21 @@ // sosemanuk.cpp - written and placed in the public domain by Wei Dai +// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sosemanuk.cpp" to generate MASM code + #include "pch.h" + +#ifndef CRYPTOPP_GENERATE_X64_MASM + #include "sosemanuk.h" #include "misc.h" #include "cpu.h" #include "serpentp.h" +#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE +#include <emmintrin.h> +#endif + NAMESPACE_BEGIN(CryptoPP) void SosemanukPolicy::CipherSetKey(const NameValuePairs ¶ms, const byte *userKey, size_t keylen) @@ -74,7 +83,8 @@ void SosemanukPolicy::CipherResynchronize(byte *keystreamBuffer, const byte *iv) m_state[10] = rotlFixed(m_state[10] * 0x54655307, 7); } -static word32 s_mulTables[512] = { +extern "C" { +word32 s_sosemanukMulTables[512] = { #if CRYPTOPP_BOOL_X86 | CRYPTOPP_BOOL_X64 0x00000000, 0xE19FCF12, 0x6B973724, 0x8A08F836, 0xD6876E48, 0x3718A15A, 0xBD10596C, 0x5C8F967E, @@ -271,7 +281,7 @@ static word32 s_mulTables[512] = { 0xFEDECC7A, 0xE6D18CB7, 0xCEC04C49, 0xD6CF0C84, 0x9EE2651C, 0x86ED25D1, 0xAEFCE52F, 0xB6F3A5E2 }; - +} #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64 unsigned int SosemanukPolicy::GetAlignment() const @@ -303,11 +313,36 @@ unsigned int SosemanukPolicy::GetOptimalBlockSize() const } #endif +#ifdef CRYPTOPP_X64_MASM_AVAILABLE +extern "C" { +void Sosemanuk_OperateKeystream(size_t iterationCount, const byte *input, byte *output, word32 *state); +} +#endif + #pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) { +#endif // #ifdef CRYPTOPP_GENERATE_X64_MASM + +#ifdef CRYPTOPP_X64_MASM_AVAILABLE + Sosemanuk_OperateKeystream(iterationCount, input, output, m_state.data()); + return; +#endif + #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE +#ifdef CRYPTOPP_GENERATE_X64_MASM + ALIGN 8 + Sosemanuk_OperateKeystream PROC FRAME + rex_push_reg rsi + push_reg rdi + alloc_stack(80*4*2+12*4+8*WORD_SZ + 2*16+8) + save_xmm128 xmm6, 02f0h + save_xmm128 xmm7, 0300h + .endprolog + mov rdi, r8 + mov rax, r9 +#else #ifdef __INTEL_COMPILER if (HasSSE2() && !IsP4()) // Intel compiler produces faster code for this algorithm on the P4 #else @@ -315,10 +350,13 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu #endif { #ifdef __GNUC__ + #if CRYPTOPP_BOOL_X64 + __m128i workspace[(80*4*2+12*4+8*WORD_SZ)/16]; + #endif __asm__ __volatile__ ( ".intel_syntax noprefix;" - AS_PUSH( bx) + AS_PUSH_IF86( bx) #else word32 *state = m_state; AS2( mov WORD_REG(ax), state) @@ -326,22 +364,31 @@ void 
SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu AS2( mov WORD_REG(dx), input) AS2( mov WORD_REG(cx), iterationCount) #endif +#endif // #ifdef CRYPTOPP_GENERATE_X64_MASM + +#if defined(__GNUC__) && CRYPTOPP_BOOL_X64 + #define SSE2_workspace %5 +#else + #define SSE2_workspace WORD_REG(sp) +#endif -#define SSE2_output WORD_PTR [WORD_REG(sp)+1*WORD_SZ] -#define SSE2_input WORD_PTR [WORD_REG(sp)+2*WORD_SZ] -#define SSE2_wordsLeft WORD_PTR [WORD_REG(sp)+3*WORD_SZ] -#define SSE2_diEnd WORD_PTR [WORD_REG(sp)+4*WORD_SZ] -#define SSE2_pMulTables WORD_PTR [WORD_REG(sp)+5*WORD_SZ] -#define SSE2_state WORD_PTR [WORD_REG(sp)+6*WORD_SZ] -#define SSE2_wordsLeft2 WORD_PTR [WORD_REG(sp)+7*WORD_SZ] -#define SSE2_stateCopy WORD_REG(sp) + 8*WORD_SZ +#define SSE2_output WORD_PTR [SSE2_workspace+1*WORD_SZ] +#define SSE2_input WORD_PTR [SSE2_workspace+2*WORD_SZ] +#define SSE2_wordsLeft WORD_PTR [SSE2_workspace+3*WORD_SZ] +#define SSE2_diEnd WORD_PTR [SSE2_workspace+4*WORD_SZ] +#define SSE2_pMulTables WORD_PTR [SSE2_workspace+5*WORD_SZ] +#define SSE2_state WORD_PTR [SSE2_workspace+6*WORD_SZ] +#define SSE2_wordsLeft2 WORD_PTR [SSE2_workspace+7*WORD_SZ] +#define SSE2_stateCopy SSE2_workspace + 8*WORD_SZ #define SSE2_uvStart SSE2_stateCopy + 12*4 - AS_PUSH( bp) - AS2( mov WORD_REG(bx), WORD_REG(sp)) - AS2( and WORD_REG(sp), -16) - AS2( sub WORD_REG(sp), 80*4*2+12*4+8*WORD_SZ) // 80 v's, 80 u's, 12 state, 8 locals - AS2( mov [WORD_REG(sp)], WORD_REG(bx)) +#if CRYPTOPP_BOOL_X86 + AS_PUSH_IF86( bp) + AS2( mov AS_REG_6, esp) + AS2( and esp, -16) + AS2( sub esp, 80*4*2+12*4+8*WORD_SZ) // 80 v's, 80 u's, 12 state, 8 locals + AS2( mov [esp], AS_REG_6) +#endif AS2( mov SSE2_output, WORD_REG(di)) AS2( mov SSE2_input, WORD_REG(dx)) AS2( mov SSE2_state, WORD_REG(ax)) @@ -358,7 +405,7 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu AS2( movq xmm0, QWORD PTR [WORD_REG(ax)+2*16]) AS2( movq QWORD PTR [SSE2_stateCopy+2*16], xmm0) AS2( psrlq xmm0, 32) - AS2( movd ebx, xmm0) // s(9) + AS2( movd AS_REG_6d, xmm0) // s(9) AS2( mov ecx, [WORD_REG(ax)+10*4]) AS2( mov edx, [WORD_REG(ax)+11*4]) AS2( pcmpeqb xmm7, xmm7) // all ones @@ -367,35 +414,35 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu #define u(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4 #define v(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4 + 80*4 -#define r10 ecx -#define r11 edx -#define r20 edx -#define r21 ecx +#define R10 ecx +#define R11 edx +#define R20 edx +#define R21 ecx #define SSE2_STEP(i, j) \ AS2( mov eax, [s(i+0)])\ AS2( mov [v(i)], eax)\ AS2( rol eax, 8)\ - AS2( lea ebp, [ebx + r2##j])\ - AS2( xor ebp, r1##j)\ - AS2( mov [u(i)], ebp)\ - AS2( mov ebp, 1)\ - AS2( and ebp, r2##j)\ - AS1( neg ebp)\ - AS2( and ebp, ebx)\ - AS2( xor ebx, eax)\ + AS2( lea AS_REG_7d, [AS_REG_6d + R2##j])\ + AS2( xor AS_REG_7d, R1##j)\ + AS2( mov [u(i)], AS_REG_7d)\ + AS2( mov AS_REG_7d, 1)\ + AS2( and AS_REG_7d, R2##j)\ + AS1( neg AS_REG_7d)\ + AS2( and AS_REG_7d, AS_REG_6d)\ + AS2( xor AS_REG_6d, eax)\ AS2( movzx eax, al)\ - AS2( xor ebx, [WORD_REG(si)+WORD_REG(ax)*4])\ + AS2( xor AS_REG_6d, [WORD_REG(si)+WORD_REG(ax)*4])\ AS2( mov eax, [s(i+3)])\ - AS2( xor ebp, [s(i+2)])\ - AS2( add r1##j, ebp)\ - AS2( movzx ebp, al)\ + AS2( xor AS_REG_7d, [s(i+2)])\ + AS2( add R1##j, AS_REG_7d)\ + AS2( movzx AS_REG_7d, al)\ AS2( shr eax, 8)\ - AS2( xor ebx, [WORD_REG(si)+1024+WORD_REG(bp)*4])\ - AS2( xor ebx, eax)\ - AS2( imul r2##j, 0x54655307)\ - AS2( rol r2##j, 7)\ - AS2( mov [s(i+0)], ebx)\ 
+ AS2( xor AS_REG_6d, [WORD_REG(si)+1024+AS_REG_7*4])\ + AS2( xor AS_REG_6d, eax)\ + AS2( imul R2##j, AS_HEX(54655307))\ + AS2( rol R2##j, 7)\ + AS2( mov [s(i+0)], AS_REG_6d)\ ASL(2) // outer loop, each iteration of this processes 80 words AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u @@ -406,7 +453,7 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu AS2( lea WORD_REG(si), [WORD_REG(di)+WORD_REG(si)]) // use to end first inner loop AS2( mov SSE2_diEnd, WORD_REG(si)) #ifdef _MSC_VER - AS2( lea WORD_REG(si), s_mulTables) + AS2( lea WORD_REG(si), s_sosemanukMulTables) #else AS2( mov WORD_REG(si), SSE2_pMulTables) #endif @@ -438,7 +485,7 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu ASJ( jne, 0, b) AS2( mov WORD_REG(ax), SSE2_input) - AS2( mov WORD_REG(bp), SSE2_output) + AS2( mov AS_REG_7, SSE2_output) AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u AS2( mov WORD_REG(si), SSE2_wordsLeft2) @@ -487,43 +534,10 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu AS2( punpcklqdq xmm6, xmm5) AS2( punpckhqdq xmm3, xmm5) // output keystream - AS2( test WORD_REG(ax), WORD_REG(ax)) - ASJ( jz, 3, f) - AS2( test eax, 0xf) - ASJ( jnz, 7, f) - AS2( pxor xmm2, [WORD_REG(ax)+0*16]) - AS2( pxor xmm0, [WORD_REG(ax)+1*16]) - AS2( pxor xmm6, [WORD_REG(ax)+2*16]) - AS2( pxor xmm3, [WORD_REG(ax)+3*16]) - AS2( add WORD_REG(ax), 4*16) - ASJ( jmp, 3, f) - ASL(7) - AS2( movdqu xmm1, [WORD_REG(ax)+0*16]) - AS2( pxor xmm2, xmm1) - AS2( movdqu xmm1, [WORD_REG(ax)+1*16]) - AS2( pxor xmm0, xmm1) - AS2( movdqu xmm1, [WORD_REG(ax)+2*16]) - AS2( pxor xmm6, xmm1) - AS2( movdqu xmm1, [WORD_REG(ax)+3*16]) - AS2( pxor xmm3, xmm1) - AS2( add WORD_REG(ax), 4*16) - ASL(3) - AS2( test ebp, 0xf) - ASJ( jnz, 8, f) - AS2( movdqa [WORD_REG(bp)+0*16], xmm2) - AS2( movdqa [WORD_REG(bp)+1*16], xmm0) - AS2( movdqa [WORD_REG(bp)+2*16], xmm6) - AS2( movdqa [WORD_REG(bp)+3*16], xmm3) - ASJ( jmp, 9, f) - ASL(8) - AS2( movdqu [WORD_REG(bp)+0*16], xmm2) - AS2( movdqu [WORD_REG(bp)+1*16], xmm0) - AS2( movdqu [WORD_REG(bp)+2*16], xmm6) - AS2( movdqu [WORD_REG(bp)+3*16], xmm3) - ASL(9) + AS_XMM_OUTPUT4(SSE2_Sosemanuk_Output, WORD_REG(ax), AS_REG_7, 2,0,6,3, 1, 0,1,2,3, 4) + // loop AS2( add WORD_REG(di), 4*4) - AS2( add WORD_REG(bp), 4*16) AS2( sub WORD_REG(si), 16) ASJ( jnz, 1, b) @@ -533,29 +547,29 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu ASJ( jz, 6, f) AS2( mov SSE2_wordsLeft, WORD_REG(si)) AS2( mov SSE2_input, WORD_REG(ax)) - AS2( mov SSE2_output, WORD_REG(bp)) + AS2( mov SSE2_output, AS_REG_7) ASJ( jmp, 2, b) ASL(4) // final output of less than 16 words AS2( test WORD_REG(ax), WORD_REG(ax)) ASJ( jz, 5, f) - AS2( movd xmm0, [WORD_REG(ax)+0*4]) + AS2( movd xmm0, dword ptr [WORD_REG(ax)+0*4]) AS2( pxor xmm2, xmm0) - AS2( movd xmm0, [WORD_REG(ax)+1*4]) + AS2( movd xmm0, dword ptr [WORD_REG(ax)+1*4]) AS2( pxor xmm3, xmm0) - AS2( movd xmm0, [WORD_REG(ax)+2*4]) + AS2( movd xmm0, dword ptr [WORD_REG(ax)+2*4]) AS2( pxor xmm1, xmm0) - AS2( movd xmm0, [WORD_REG(ax)+3*4]) + AS2( movd xmm0, dword ptr [WORD_REG(ax)+3*4]) AS2( pxor xmm4, xmm0) AS2( add WORD_REG(ax), 16) ASL(5) - AS2( movd [WORD_REG(bp)+0*4], xmm2) - AS2( movd [WORD_REG(bp)+1*4], xmm3) - AS2( movd [WORD_REG(bp)+2*4], xmm1) - AS2( movd [WORD_REG(bp)+3*4], xmm4) + AS2( movd dword ptr [AS_REG_7+0*4], xmm2) + AS2( movd dword ptr [AS_REG_7+1*4], xmm3) + AS2( movd dword ptr [AS_REG_7+2*4], xmm1) + AS2( movd dword ptr [AS_REG_7+3*4], 
xmm4) AS2( sub WORD_REG(si), 4) ASJ( jz, 6, f) - AS2( add WORD_REG(bp), 16) + AS2( add AS_REG_7, 16) AS2( psrldq xmm2, 4) AS2( psrldq xmm3, 4) AS2( psrldq xmm1, 4) @@ -563,38 +577,52 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu ASJ( jmp, 4, b) ASL(6) // save state - AS2( mov WORD_REG(bx), SSE2_state) + AS2( mov AS_REG_6, SSE2_state) AS2( movdqa xmm0, [SSE2_stateCopy+0*16]) - AS2( movdqa [WORD_REG(bx)+0*16], xmm0) + AS2( movdqa [AS_REG_6+0*16], xmm0) AS2( movdqa xmm0, [SSE2_stateCopy+1*16]) - AS2( movdqa [WORD_REG(bx)+1*16], xmm0) + AS2( movdqa [AS_REG_6+1*16], xmm0) AS2( movq xmm0, QWORD PTR [SSE2_stateCopy+2*16]) - AS2( movq QWORD PTR [WORD_REG(bx)+2*16], xmm0) - AS2( mov [WORD_REG(bx)+10*4], ecx) - AS2( mov [WORD_REG(bx)+11*4], edx) + AS2( movq QWORD PTR [AS_REG_6+2*16], xmm0) + AS2( mov [AS_REG_6+10*4], ecx) + AS2( mov [AS_REG_6+11*4], edx) - AS_POP( sp) - AS_POP( bp) + AS_POP_IF86( sp) + AS_POP_IF86( bp) #ifdef __GNUC__ - AS_POP( bx) + AS_POP_IF86( bx) ".att_syntax prefix;" : - : "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_mulTables), "D" (output), "d" (input) - : "memory", "cc" + : "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_sosemanukMulTables), "D" (output), "d" (input) + #if CRYPTOPP_BOOL_X64 + , "r" (workspace) + #endif + : "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7" ); #endif +#ifdef CRYPTOPP_GENERATE_X64_MASM + movdqa xmm6, [rsp + 02f0h] + movdqa xmm7, [rsp + 0300h] + add rsp, 80*4*2+12*4+8*WORD_SZ + 2*16+8 + pop rdi + pop rsi + ret + Sosemanuk_OperateKeystream ENDP +#else } else #endif +#endif +#ifndef CRYPTOPP_GENERATE_X64_MASM { #if CRYPTOPP_BOOL_X86 | CRYPTOPP_BOOL_X64 -#define MUL_A(x) (x = rotlFixed(x, 8), x ^ s_mulTables[byte(x)]) +#define MUL_A(x) (x = rotlFixed(x, 8), x ^ s_sosemanukMulTables[byte(x)]) #else -#define MUL_A(x) (((x) << 8) ^ s_mulTables[(x) >> 24]) +#define MUL_A(x) (((x) << 8) ^ s_sosemanukMulTables[(x) >> 24]) #endif -#define DIV_A(x) (((x) >> 8) ^ s_mulTables[256 + byte(x)]) +#define DIV_A(x) (((x) >> 8) ^ s_sosemanukMulTables[256 + byte(x)]) #define r1(i) ((i%2) ? reg2 : reg1) #define r2(i) ((i%2) ? reg1 : reg2) @@ -676,3 +704,5 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu } NAMESPACE_END + +#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM diff --git a/whrlpool.cpp b/whrlpool.cpp index 149be39..59f0751 100644 --- a/whrlpool.cpp +++ b/whrlpool.cpp @@ -1,7 +1,7 @@ // whrlpool.cpp - originally modified by Kevin Springle from // Paulo Barreto and Vincent Rijmen's public domain code, whirlpool.c. 
// Updated to Whirlpool version 3.0, optimized and SSE version added by Wei Dai -// Any modifications are placed in the public domain +// All modifications are placed in the public domain // This is the original introductory comment: @@ -71,6 +71,10 @@ #include "misc.h" #include "cpu.h" +#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE +#include <emmintrin.h> +#endif + NAMESPACE_BEGIN(CryptoPP) void Whirlpool_TestInstantiations() @@ -395,29 +399,37 @@ void Whirlpool::Transform(word64 *digest, const word64 *block) { // MMX version has the same structure as C version below #ifdef __GNUC__ + #if CRYPTOPP_BOOL_X64 + __m128i workspace[8]; + #endif __asm__ __volatile__ ( ".intel_syntax noprefix;" - AS_PUSH( bx) - AS2( mov WORD_REG(bx), WORD_REG(ax)) + AS_PUSH_IF86( bx) + AS2( mov AS_REG_6, WORD_REG(ax)) #else #if _MSC_VER < 1300 - AS_PUSH( bx) + AS_PUSH_IF86( bx) #endif - AS2( lea WORD_REG(bx), [Whirlpool_C]) + AS2( lea AS_REG_6, [Whirlpool_C]) AS2( mov WORD_REG(cx), digest) AS2( mov WORD_REG(dx), block) #endif - AS2( mov WORD_REG(ax), WORD_REG(sp)) - AS2( and WORD_REG(sp), -16) - AS2( sub WORD_REG(sp), 16*8) - AS_PUSH( ax) +#if CRYPTOPP_BOOL_X86 + AS2( mov eax, esp) + AS2( and esp, -16) + AS2( sub esp, 16*8) + AS1( push eax) + #define SSE2_workspace esp+WORD_SZ +#else + #define SSE2_workspace %3 +#endif AS2( xor esi, esi) ASL(0) AS2( movq mm0, [WORD_REG(cx)+8*WORD_REG(si)]) - AS2( movq [WORD_REG(sp)+WORD_SZ+8*WORD_REG(si)], mm0) // k + AS2( movq [SSE2_workspace+8*WORD_REG(si)], mm0) // k AS2( pxor mm0, [WORD_REG(dx)+8*WORD_REG(si)]) - AS2( movq [WORD_REG(sp)+WORD_SZ+64+8*WORD_REG(si)], mm0) // s + AS2( movq [SSE2_workspace+64+8*WORD_REG(si)], mm0) // s AS2( movq [WORD_REG(cx)+8*WORD_REG(si)], mm0) AS1( inc WORD_REG(si)) AS2( cmp WORD_REG(si), 8) @@ -430,16 +442,16 @@ void Whirlpool::Transform(word64 *digest, const word64 *block) #define KSL1(a, b) AS2(pxor mm##a, b) #define KSL(op, i, a, b, c, d) \ - AS2(mov eax, [WORD_REG(sp)+WORD_SZ+8*i])\ + AS2(mov eax, [SSE2_workspace+8*i])\ AS2(movzx edi, al)\ - KSL##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\ + KSL##op(a, [AS_REG_6+3*2048+8*WORD_REG(di)])\ AS2(movzx edi, ah)\ - KSL##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\ + KSL##op(b, [AS_REG_6+2*2048+8*WORD_REG(di)])\ AS2(shr eax, 16)\ AS2(movzx edi, al)\ AS2(shr eax, 8)\ - KSL##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\ - KSL##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)]) + KSL##op(c, [AS_REG_6+1*2048+8*WORD_REG(di)])\ + KSL##op(d, [AS_REG_6+0*2048+8*WORD_REG(ax)]) #define KSH0(a, b) \ ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\ @@ -448,57 +460,57 @@ void Whirlpool::Transform(word64 *digest, const word64 *block) AS2(pxor mm##a, b) #define KSH2(a, b) \ AS2(pxor mm##a, b)\ - AS2(movq [WORD_REG(sp)+WORD_SZ+8*a], mm##a) + AS2(movq [SSE2_workspace+8*a], mm##a) #define KSH(op, i, a, b, c, d) \ - AS2(mov eax, [WORD_REG(sp)+WORD_SZ+8*((i+4)-8*((i+4)/8))+4])\ + AS2(mov eax, [SSE2_workspace+8*((i+4)-8*((i+4)/8))+4])\ AS2(movzx edi, al)\ - KSH##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\ + KSH##op(a, [AS_REG_6+3*2048+8*WORD_REG(di)])\ AS2(movzx edi, ah)\ - KSH##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\ + KSH##op(b, [AS_REG_6+2*2048+8*WORD_REG(di)])\ AS2(shr eax, 16)\ AS2(movzx edi, al)\ AS2(shr eax, 8)\ - KSH##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\ - KSH##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)]) + KSH##op(c, [AS_REG_6+1*2048+8*WORD_REG(di)])\ + KSH##op(d, [AS_REG_6+0*2048+8*WORD_REG(ax)]) #define TSL(op, i, a, b, c, d) \ - AS2(mov eax, [WORD_REG(sp)+WORD_SZ+64+8*i])\ + AS2(mov eax, 
[SSE2_workspace+64+8*i])\ AS2(movzx edi, al)\ - KSL##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\ + KSL##op(a, [AS_REG_6+3*2048+8*WORD_REG(di)])\ AS2(movzx edi, ah)\ - KSL##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\ + KSL##op(b, [AS_REG_6+2*2048+8*WORD_REG(di)])\ AS2(shr eax, 16)\ AS2(movzx edi, al)\ AS2(shr eax, 8)\ - KSL##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\ - KSL##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)]) + KSL##op(c, [AS_REG_6+1*2048+8*WORD_REG(di)])\ + KSL##op(d, [AS_REG_6+0*2048+8*WORD_REG(ax)]) #define TSH0(a, b) \ ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\ - AS2(pxor mm##a, [WORD_REG(sp)+WORD_SZ+8*a])\ + AS2(pxor mm##a, [SSE2_workspace+8*a])\ AS2(pxor mm##a, b) #define TSH1(a, b) \ AS2(pxor mm##a, b) #define TSH2(a, b) \ AS2(pxor mm##a, b)\ - AS2(movq [WORD_REG(sp)+WORD_SZ+64+8*a], mm##a) + AS2(movq [SSE2_workspace+64+8*a], mm##a) #define TSH3(a, b) \ AS2(pxor mm##a, b)\ AS2(pxor mm##a, [WORD_REG(cx)+8*a])\ AS2(movq [WORD_REG(cx)+8*a], mm##a) #define TSH(op, i, a, b, c, d) \ - AS2(mov eax, [WORD_REG(sp)+WORD_SZ+64+8*((i+4)-8*((i+4)/8))+4])\ + AS2(mov eax, [SSE2_workspace+64+8*((i+4)-8*((i+4)/8))+4])\ AS2(movzx edi, al)\ - TSH##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\ + TSH##op(a, [AS_REG_6+3*2048+8*WORD_REG(di)])\ AS2(movzx edi, ah)\ - TSH##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\ + TSH##op(b, [AS_REG_6+2*2048+8*WORD_REG(di)])\ AS2(shr eax, 16)\ AS2(movzx edi, al)\ AS2(shr eax, 8)\ - TSH##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\ - TSH##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)]) + TSH##op(c, [AS_REG_6+1*2048+8*WORD_REG(di)])\ + TSH##op(d, [AS_REG_6+0*2048+8*WORD_REG(ax)]) KSL(0, 4, 3, 2, 1, 0) KSL(0, 0, 7, 6, 5, 4) @@ -517,8 +529,8 @@ void Whirlpool::Transform(word64 *digest, const word64 *block) KSH(2, 3, 2, 1, 0, 7) KSH(2, 7, 6, 5, 4, 3) - AS2( pxor mm0, [WORD_REG(bx) + 8*1024 + WORD_REG(si)*8]) - AS2( movq [WORD_REG(sp)+WORD_SZ], mm0) + AS2( pxor mm0, [AS_REG_6 + 8*1024 + WORD_REG(si)*8]) + AS2( movq [SSE2_workspace], mm0) TSL(0, 4, 3, 2, 1, 0) TSL(0, 0, 7, 6, 5, 4) @@ -553,17 +565,23 @@ void Whirlpool::Transform(word64 *digest, const word64 *block) #undef TSL #undef TSH - AS_POP( sp) + AS_POP_IF86( sp) AS1( emms) #if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300) - AS_POP( bx) + AS_POP_IF86( bx) #endif #ifdef __GNUC__ ".att_syntax prefix;" : : "a" (Whirlpool_C), "c" (digest), "d" (block) + #if CRYPTOPP_BOOL_X64 + , "r" (workspace) + #endif : "%esi", "%edi", "memory", "cc" + #if CRYPTOPP_BOOL_X64 + , "%r9" + #endif ); #endif } diff --git a/x64masm.asm b/x64masm.asm index 76676a7..a395c9a 100755 --- a/x64masm.asm +++ b/x64masm.asm @@ -1,5 +1,6 @@ -PUBLIC Baseline_Add -PUBLIC Baseline_Sub +include ksamd64.inc +EXTERNDEF s_sosemanukMulTables:FAR + .CODE ALIGN 8 Baseline_Add PROC @@ -54,5 +55,1842 @@ $1@Baseline_Sub: ret Baseline_Sub ENDP +ALIGN 8 +Salsa20_OperateKeystream PROC FRAME +mov r10, [rsp + 5*8] +alloc_stack(10*16 + 32*16 + 8) +save_xmm128 xmm6, 0200h +save_xmm128 xmm7, 0210h +save_xmm128 xmm8, 0220h +save_xmm128 xmm9, 0230h +save_xmm128 xmm10, 0240h +save_xmm128 xmm11, 0250h +save_xmm128 xmm12, 0260h +save_xmm128 xmm13, 0270h +save_xmm128 xmm14, 0280h +save_xmm128 xmm15, 0290h +.endprolog +cmp r8, 4 +jl label5 +movdqa xmm0, [r10 + 0*16] +movdqa xmm1, [r10 + 1*16] +movdqa xmm2, [r10 + 2*16] +movdqa xmm3, [r10 + 3*16] +pshufd xmm4, xmm0, 0*64+0*16+0*4+0 +movdqa [rsp + (0*4+0)*16 + 256], xmm4 +pshufd xmm4, xmm0, 1*64+1*16+1*4+1 +movdqa [rsp + (0*4+1)*16 + 256], xmm4 +pshufd xmm4, xmm0, 2*64+2*16+2*4+2 +movdqa [rsp + (0*4+2)*16 + 256], xmm4 
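+; (FRAME, the save_xmm128 stores above, and .endprolog emit Win64 unwind data for this procedure)
+; each pshufd/movdqa pair broadcasts one 32-bit state word across a 16-byte lane, building four parallel copies of the state at rsp+256; the counter words are refreshed per four-block batch at label1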
+pshufd xmm4, xmm0, 3*64+3*16+3*4+3 +movdqa [rsp + (0*4+3)*16 + 256], xmm4 +pshufd xmm4, xmm1, 0*64+0*16+0*4+0 +movdqa [rsp + (1*4+0)*16 + 256], xmm4 +pshufd xmm4, xmm1, 2*64+2*16+2*4+2 +movdqa [rsp + (1*4+2)*16 + 256], xmm4 +pshufd xmm4, xmm1, 3*64+3*16+3*4+3 +movdqa [rsp + (1*4+3)*16 + 256], xmm4 +pshufd xmm4, xmm2, 1*64+1*16+1*4+1 +movdqa [rsp + (2*4+1)*16 + 256], xmm4 +pshufd xmm4, xmm2, 2*64+2*16+2*4+2 +movdqa [rsp + (2*4+2)*16 + 256], xmm4 +pshufd xmm4, xmm2, 3*64+3*16+3*4+3 +movdqa [rsp + (2*4+3)*16 + 256], xmm4 +pshufd xmm4, xmm3, 0*64+0*16+0*4+0 +movdqa [rsp + (3*4+0)*16 + 256], xmm4 +pshufd xmm4, xmm3, 1*64+1*16+1*4+1 +movdqa [rsp + (3*4+1)*16 + 256], xmm4 +pshufd xmm4, xmm3, 2*64+2*16+2*4+2 +movdqa [rsp + (3*4+2)*16 + 256], xmm4 +pshufd xmm4, xmm3, 3*64+3*16+3*4+3 +movdqa [rsp + (3*4+3)*16 + 256], xmm4 +label1: +mov eax, dword ptr [r10 + 8*4] +mov r11d, dword ptr [r10 + 5*4] +mov dword ptr [rsp + 8*16 + 0*4 + 256], eax +mov dword ptr [rsp + 5*16 + 0*4 + 256], r11d +add eax, 1 +adc r11d, 0 +mov dword ptr [rsp + 8*16 + 1*4 + 256], eax +mov dword ptr [rsp + 5*16 + 1*4 + 256], r11d +add eax, 1 +adc r11d, 0 +mov dword ptr [rsp + 8*16 + 2*4 + 256], eax +mov dword ptr [rsp + 5*16 + 2*4 + 256], r11d +add eax, 1 +adc r11d, 0 +mov dword ptr [rsp + 8*16 + 3*4 + 256], eax +mov dword ptr [rsp + 5*16 + 3*4 + 256], r11d +add eax, 1 +adc r11d, 0 +mov dword ptr [r10 + 8*4], eax +mov dword ptr [r10 + 5*4], r11d +movdqa xmm0, [rsp + 12*16 + 1*256] +movdqa xmm4, [rsp + 13*16 + 1*256] +movdqa xmm8, [rsp + 14*16 + 1*256] +movdqa xmm12, [rsp + 15*16 + 1*256] +movdqa xmm2, [rsp + 0*16 + 1*256] +movdqa xmm6, [rsp + 1*16 + 1*256] +movdqa xmm10, [rsp + 2*16 + 1*256] +movdqa xmm14, [rsp + 3*16 + 1*256] +paddd xmm0, xmm2 +paddd xmm4, xmm6 +paddd xmm8, xmm10 +paddd xmm12, xmm14 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +pslld xmm0, 7 +pslld xmm4, 7 +pslld xmm8, 7 +pslld xmm12, 7 +psrld xmm1, 32-7 +psrld xmm5, 32-7 +psrld xmm9, 32-7 +psrld xmm13, 32-7 +pxor xmm0, [rsp + 4*16 + 1*256] +pxor xmm4, [rsp + 5*16 + 1*256] +pxor xmm8, [rsp + 6*16 + 1*256] +pxor xmm12, [rsp + 7*16 + 1*256] +pxor xmm0, xmm1 +pxor xmm4, xmm5 +pxor xmm8, xmm9 +pxor xmm12, xmm13 +movdqa [rsp + 4*16], xmm0 +movdqa [rsp + 5*16], xmm4 +movdqa [rsp + 6*16], xmm8 +movdqa [rsp + 7*16], xmm12 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +paddd xmm0, xmm2 +paddd xmm4, xmm6 +paddd xmm8, xmm10 +paddd xmm12, xmm14 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +pslld xmm0, 9 +pslld xmm4, 9 +pslld xmm8, 9 +pslld xmm12, 9 +psrld xmm3, 32-9 +psrld xmm7, 32-9 +psrld xmm11, 32-9 +psrld xmm15, 32-9 +pxor xmm0, [rsp + 8*16 + 1*256] +pxor xmm4, [rsp + 9*16 + 1*256] +pxor xmm8, [rsp + 10*16 + 1*256] +pxor xmm12, [rsp + 11*16 + 1*256] +pxor xmm0, xmm3 +pxor xmm4, xmm7 +pxor xmm8, xmm11 +pxor xmm12, xmm15 +movdqa [rsp + 8*16], xmm0 +movdqa [rsp + 9*16], xmm4 +movdqa [rsp + 10*16], xmm8 +movdqa [rsp + 11*16], xmm12 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +paddd xmm0, xmm1 +paddd xmm4, xmm5 +paddd xmm8, xmm9 +paddd xmm12, xmm13 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +pslld xmm0, 13 +pslld xmm4, 13 +pslld xmm8, 13 +pslld xmm12, 13 +psrld xmm1, 32-13 +psrld xmm5, 32-13 +psrld xmm9, 32-13 +psrld xmm13, 32-13 +pxor xmm0, [rsp + 12*16 + 1*256] +pxor xmm4, [rsp + 13*16 + 1*256] +pxor xmm8, [rsp + 14*16 + 1*256] +pxor xmm12, [rsp + 15*16 + 1*256] +pxor xmm0, xmm1 +pxor xmm4, xmm5 +pxor xmm8, 
xmm9 +pxor xmm12, xmm13 +movdqa [rsp + 12*16], xmm0 +movdqa [rsp + 13*16], xmm4 +movdqa [rsp + 14*16], xmm8 +movdqa [rsp + 15*16], xmm12 +paddd xmm0, xmm3 +paddd xmm4, xmm7 +paddd xmm8, xmm11 +paddd xmm12, xmm15 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +pslld xmm0, 18 +pslld xmm4, 18 +pslld xmm8, 18 +pslld xmm12, 18 +psrld xmm3, 32-18 +psrld xmm7, 32-18 +psrld xmm11, 32-18 +psrld xmm15, 32-18 +pxor xmm0, xmm2 +pxor xmm4, xmm6 +pxor xmm8, xmm10 +pxor xmm12, xmm14 +pxor xmm0, xmm3 +pxor xmm4, xmm7 +pxor xmm8, xmm11 +pxor xmm12, xmm15 +movdqa [rsp + 0*16], xmm0 +movdqa [rsp + 1*16], xmm4 +movdqa [rsp + 2*16], xmm8 +movdqa [rsp + 3*16], xmm12 +mov rax, r9 +jmp label2 +labelSSE2_Salsa_Output: +movdqa xmm0, xmm4 +punpckldq xmm4, xmm5 +movdqa xmm1, xmm6 +punpckldq xmm6, xmm7 +movdqa xmm2, xmm4 +punpcklqdq xmm4, xmm6 +punpckhqdq xmm2, xmm6 +punpckhdq xmm0, xmm5 +punpckhdq xmm1, xmm7 +movdqa xmm6, xmm0 +punpcklqdq xmm0, xmm1 +punpckhqdq xmm6, xmm1 +test rdx, rdx +jz labelSSE2_Salsa_Output_A3 +test rdx, 15 +jnz labelSSE2_Salsa_Output_A7 +pxor xmm4, [rdx+0*16] +pxor xmm2, [rdx+4*16] +pxor xmm0, [rdx+8*16] +pxor xmm6, [rdx+12*16] +add rdx, 1*16 +jmp labelSSE2_Salsa_Output_A3 +labelSSE2_Salsa_Output_A7: +movdqu xmm1, [rdx+0*16] +pxor xmm4, xmm1 +movdqu xmm1, [rdx+4*16] +pxor xmm2, xmm1 +movdqu xmm1, [rdx+8*16] +pxor xmm0, xmm1 +movdqu xmm1, [rdx+12*16] +pxor xmm6, xmm1 +add rdx, 1*16 +labelSSE2_Salsa_Output_A3: +test rcx, 15 +jnz labelSSE2_Salsa_Output_A8 +movdqa [rcx+0*16], xmm4 +movdqa [rcx+4*16], xmm2 +movdqa [rcx+8*16], xmm0 +movdqa [rcx+12*16], xmm6 +jmp labelSSE2_Salsa_Output_A9 +labelSSE2_Salsa_Output_A8: +movdqu [rcx+0*16], xmm4 +movdqu [rcx+4*16], xmm2 +movdqu [rcx+8*16], xmm0 +movdqu [rcx+12*16], xmm6 +labelSSE2_Salsa_Output_A9: +add rcx, 1*16 +ret +label6: +movdqa xmm0, [rsp + 12*16 + 0*256] +movdqa xmm4, [rsp + 13*16 + 0*256] +movdqa xmm8, [rsp + 14*16 + 0*256] +movdqa xmm12, [rsp + 15*16 + 0*256] +movdqa xmm2, [rsp + 0*16 + 0*256] +movdqa xmm6, [rsp + 1*16 + 0*256] +movdqa xmm10, [rsp + 2*16 + 0*256] +movdqa xmm14, [rsp + 3*16 + 0*256] +paddd xmm0, xmm2 +paddd xmm4, xmm6 +paddd xmm8, xmm10 +paddd xmm12, xmm14 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +pslld xmm0, 7 +pslld xmm4, 7 +pslld xmm8, 7 +pslld xmm12, 7 +psrld xmm1, 32-7 +psrld xmm5, 32-7 +psrld xmm9, 32-7 +psrld xmm13, 32-7 +pxor xmm0, [rsp + 4*16 + 0*256] +pxor xmm4, [rsp + 5*16 + 0*256] +pxor xmm8, [rsp + 6*16 + 0*256] +pxor xmm12, [rsp + 7*16 + 0*256] +pxor xmm0, xmm1 +pxor xmm4, xmm5 +pxor xmm8, xmm9 +pxor xmm12, xmm13 +movdqa [rsp + 4*16], xmm0 +movdqa [rsp + 5*16], xmm4 +movdqa [rsp + 6*16], xmm8 +movdqa [rsp + 7*16], xmm12 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +paddd xmm0, xmm2 +paddd xmm4, xmm6 +paddd xmm8, xmm10 +paddd xmm12, xmm14 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +pslld xmm0, 9 +pslld xmm4, 9 +pslld xmm8, 9 +pslld xmm12, 9 +psrld xmm3, 32-9 +psrld xmm7, 32-9 +psrld xmm11, 32-9 +psrld xmm15, 32-9 +pxor xmm0, [rsp + 8*16 + 0*256] +pxor xmm4, [rsp + 9*16 + 0*256] +pxor xmm8, [rsp + 10*16 + 0*256] +pxor xmm12, [rsp + 11*16 + 0*256] +pxor xmm0, xmm3 +pxor xmm4, xmm7 +pxor xmm8, xmm11 +pxor xmm12, xmm15 +movdqa [rsp + 8*16], xmm0 +movdqa [rsp + 9*16], xmm4 +movdqa [rsp + 10*16], xmm8 +movdqa [rsp + 11*16], xmm12 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +paddd xmm0, xmm1 +paddd xmm4, xmm5 +paddd xmm8, xmm9 +paddd xmm12, xmm13 +movdqa 
xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +pslld xmm0, 13 +pslld xmm4, 13 +pslld xmm8, 13 +pslld xmm12, 13 +psrld xmm1, 32-13 +psrld xmm5, 32-13 +psrld xmm9, 32-13 +psrld xmm13, 32-13 +pxor xmm0, [rsp + 12*16 + 0*256] +pxor xmm4, [rsp + 13*16 + 0*256] +pxor xmm8, [rsp + 14*16 + 0*256] +pxor xmm12, [rsp + 15*16 + 0*256] +pxor xmm0, xmm1 +pxor xmm4, xmm5 +pxor xmm8, xmm9 +pxor xmm12, xmm13 +movdqa [rsp + 12*16], xmm0 +movdqa [rsp + 13*16], xmm4 +movdqa [rsp + 14*16], xmm8 +movdqa [rsp + 15*16], xmm12 +paddd xmm0, xmm3 +paddd xmm4, xmm7 +paddd xmm8, xmm11 +paddd xmm12, xmm15 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +pslld xmm0, 18 +pslld xmm4, 18 +pslld xmm8, 18 +pslld xmm12, 18 +psrld xmm3, 32-18 +psrld xmm7, 32-18 +psrld xmm11, 32-18 +psrld xmm15, 32-18 +pxor xmm0, xmm2 +pxor xmm4, xmm6 +pxor xmm8, xmm10 +pxor xmm12, xmm14 +pxor xmm0, xmm3 +pxor xmm4, xmm7 +pxor xmm8, xmm11 +pxor xmm12, xmm15 +movdqa [rsp + 0*16], xmm0 +movdqa [rsp + 1*16], xmm4 +movdqa [rsp + 2*16], xmm8 +movdqa [rsp + 3*16], xmm12 +label2: +movdqa xmm0, [rsp + 7*16 + 0*256] +movdqa xmm4, [rsp + 4*16 + 0*256] +movdqa xmm8, [rsp + 5*16 + 0*256] +movdqa xmm12, [rsp + 6*16 + 0*256] +movdqa xmm2, [rsp + 0*16 + 0*256] +movdqa xmm6, [rsp + 1*16 + 0*256] +movdqa xmm10, [rsp + 2*16 + 0*256] +movdqa xmm14, [rsp + 3*16 + 0*256] +paddd xmm0, xmm2 +paddd xmm4, xmm6 +paddd xmm8, xmm10 +paddd xmm12, xmm14 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +pslld xmm0, 7 +pslld xmm4, 7 +pslld xmm8, 7 +pslld xmm12, 7 +psrld xmm1, 32-7 +psrld xmm5, 32-7 +psrld xmm9, 32-7 +psrld xmm13, 32-7 +pxor xmm0, [rsp + 13*16 + 0*256] +pxor xmm4, [rsp + 14*16 + 0*256] +pxor xmm8, [rsp + 15*16 + 0*256] +pxor xmm12, [rsp + 12*16 + 0*256] +pxor xmm0, xmm1 +pxor xmm4, xmm5 +pxor xmm8, xmm9 +pxor xmm12, xmm13 +movdqa [rsp + 13*16], xmm0 +movdqa [rsp + 14*16], xmm4 +movdqa [rsp + 15*16], xmm8 +movdqa [rsp + 12*16], xmm12 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +paddd xmm0, xmm2 +paddd xmm4, xmm6 +paddd xmm8, xmm10 +paddd xmm12, xmm14 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +pslld xmm0, 9 +pslld xmm4, 9 +pslld xmm8, 9 +pslld xmm12, 9 +psrld xmm3, 32-9 +psrld xmm7, 32-9 +psrld xmm11, 32-9 +psrld xmm15, 32-9 +pxor xmm0, [rsp + 10*16 + 0*256] +pxor xmm4, [rsp + 11*16 + 0*256] +pxor xmm8, [rsp + 8*16 + 0*256] +pxor xmm12, [rsp + 9*16 + 0*256] +pxor xmm0, xmm3 +pxor xmm4, xmm7 +pxor xmm8, xmm11 +pxor xmm12, xmm15 +movdqa [rsp + 10*16], xmm0 +movdqa [rsp + 11*16], xmm4 +movdqa [rsp + 8*16], xmm8 +movdqa [rsp + 9*16], xmm12 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +paddd xmm0, xmm1 +paddd xmm4, xmm5 +paddd xmm8, xmm9 +paddd xmm12, xmm13 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +pslld xmm0, 13 +pslld xmm4, 13 +pslld xmm8, 13 +pslld xmm12, 13 +psrld xmm1, 32-13 +psrld xmm5, 32-13 +psrld xmm9, 32-13 +psrld xmm13, 32-13 +pxor xmm0, [rsp + 7*16 + 0*256] +pxor xmm4, [rsp + 4*16 + 0*256] +pxor xmm8, [rsp + 5*16 + 0*256] +pxor xmm12, [rsp + 6*16 + 0*256] +pxor xmm0, xmm1 +pxor xmm4, xmm5 +pxor xmm8, xmm9 +pxor xmm12, xmm13 +movdqa [rsp + 7*16], xmm0 +movdqa [rsp + 4*16], xmm4 +movdqa [rsp + 5*16], xmm8 +movdqa [rsp + 6*16], xmm12 +paddd xmm0, xmm3 +paddd xmm4, xmm7 +paddd xmm8, xmm11 +paddd xmm12, xmm15 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +pslld xmm0, 18 +pslld xmm4, 18 +pslld xmm8, 18 
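+; each pslld/psrld pair forms a 32-bit left-rotate (SSE2 has no packed rotate instruction), applied to four block columns at once; Salsa20's quarter-round rotates by 7, 9, 13 and 18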
+pslld xmm12, 18 +psrld xmm3, 32-18 +psrld xmm7, 32-18 +psrld xmm11, 32-18 +psrld xmm15, 32-18 +pxor xmm0, xmm2 +pxor xmm4, xmm6 +pxor xmm8, xmm10 +pxor xmm12, xmm14 +pxor xmm0, xmm3 +pxor xmm4, xmm7 +pxor xmm8, xmm11 +pxor xmm12, xmm15 +movdqa [rsp + 0*16], xmm0 +movdqa [rsp + 1*16], xmm4 +movdqa [rsp + 2*16], xmm8 +movdqa [rsp + 3*16], xmm12 +sub eax, 2 +jnz label6 +movdqa xmm4, [rsp + 0*16 + 256] +paddd xmm4, [rsp + 0*16] +movdqa xmm5, [rsp + 13*16 + 256] +paddd xmm5, [rsp + 13*16] +movdqa xmm6, [rsp + 10*16 + 256] +paddd xmm6, [rsp + 10*16] +movdqa xmm7, [rsp + 7*16 + 256] +paddd xmm7, [rsp + 7*16] +call labelSSE2_Salsa_Output +movdqa xmm4, [rsp + 4*16 + 256] +paddd xmm4, [rsp + 4*16] +movdqa xmm5, [rsp + 1*16 + 256] +paddd xmm5, [rsp + 1*16] +movdqa xmm6, [rsp + 14*16 + 256] +paddd xmm6, [rsp + 14*16] +movdqa xmm7, [rsp + 11*16 + 256] +paddd xmm7, [rsp + 11*16] +call labelSSE2_Salsa_Output +movdqa xmm4, [rsp + 8*16 + 256] +paddd xmm4, [rsp + 8*16] +movdqa xmm5, [rsp + 5*16 + 256] +paddd xmm5, [rsp + 5*16] +movdqa xmm6, [rsp + 2*16 + 256] +paddd xmm6, [rsp + 2*16] +movdqa xmm7, [rsp + 15*16 + 256] +paddd xmm7, [rsp + 15*16] +call labelSSE2_Salsa_Output +movdqa xmm4, [rsp + 12*16 + 256] +paddd xmm4, [rsp + 12*16] +movdqa xmm5, [rsp + 9*16 + 256] +paddd xmm5, [rsp + 9*16] +movdqa xmm6, [rsp + 6*16 + 256] +paddd xmm6, [rsp + 6*16] +movdqa xmm7, [rsp + 3*16 + 256] +paddd xmm7, [rsp + 3*16] +call labelSSE2_Salsa_Output +test rdx, rdx +jz label9 +add rdx, 12*16 +label9: +add rcx, 12*16 +sub r8, 4 +cmp r8, 4 +jge label1 +label5: +sub r8, 1 +jl label4 +movdqa xmm0, [r10 + 0*16] +movdqa xmm1, [r10 + 1*16] +movdqa xmm2, [r10 + 2*16] +movdqa xmm3, [r10 + 3*16] +mov rax, r9 +label0: +movdqa xmm4, xmm3 +paddd xmm4, xmm0 +movdqa xmm5, xmm4 +pslld xmm4, 7 +psrld xmm5, 32-7 +pxor xmm1, xmm4 +pxor xmm1, xmm5 +movdqa xmm4, xmm0 +paddd xmm4, xmm1 +movdqa xmm5, xmm4 +pslld xmm4, 9 +psrld xmm5, 32-9 +pxor xmm2, xmm4 +pxor xmm2, xmm5 +movdqa xmm4, xmm1 +paddd xmm4, xmm2 +movdqa xmm5, xmm4 +pslld xmm4, 13 +psrld xmm5, 32-13 +pxor xmm3, xmm4 +pxor xmm3, xmm5 +movdqa xmm4, xmm2 +paddd xmm4, xmm3 +movdqa xmm5, xmm4 +pslld xmm4, 18 +psrld xmm5, 32-18 +pxor xmm0, xmm4 +pxor xmm0, xmm5 +pshufd xmm1, xmm1, 2*64+1*16+0*4+3 +pshufd xmm2, xmm2, 1*64+0*16+3*4+2 +pshufd xmm3, xmm3, 0*64+3*16+2*4+1 +movdqa xmm4, xmm1 +paddd xmm4, xmm0 +movdqa xmm5, xmm4 +pslld xmm4, 7 +psrld xmm5, 32-7 +pxor xmm3, xmm4 +pxor xmm3, xmm5 +movdqa xmm4, xmm0 +paddd xmm4, xmm3 +movdqa xmm5, xmm4 +pslld xmm4, 9 +psrld xmm5, 32-9 +pxor xmm2, xmm4 +pxor xmm2, xmm5 +movdqa xmm4, xmm3 +paddd xmm4, xmm2 +movdqa xmm5, xmm4 +pslld xmm4, 13 +psrld xmm5, 32-13 +pxor xmm1, xmm4 +pxor xmm1, xmm5 +movdqa xmm4, xmm2 +paddd xmm4, xmm1 +movdqa xmm5, xmm4 +pslld xmm4, 18 +psrld xmm5, 32-18 +pxor xmm0, xmm4 +pxor xmm0, xmm5 +pshufd xmm1, xmm1, 0*64+3*16+2*4+1 +pshufd xmm2, xmm2, 1*64+0*16+3*4+2 +pshufd xmm3, xmm3, 2*64+1*16+0*4+3 +sub eax, 2 +jnz label0 +paddd xmm0, [r10 + 0*16] +paddd xmm1, [r10 + 1*16] +paddd xmm2, [r10 + 2*16] +paddd xmm3, [r10 + 3*16] +add dword ptr [r10 + 8*4], 1 +adc dword ptr [r10 + 5*4], 0 +pcmpeqb xmm6, xmm6 +psrlq xmm6, 32 +pshufd xmm7, xmm6, 0*64+1*16+2*4+3 +movdqa xmm4, xmm0 +movdqa xmm5, xmm3 +pand xmm0, xmm7 +pand xmm4, xmm6 +pand xmm3, xmm6 +pand xmm5, xmm7 +por xmm4, xmm5 +movdqa xmm5, xmm1 +pand xmm1, xmm7 +pand xmm5, xmm6 +por xmm0, xmm5 +pand xmm6, xmm2 +pand xmm2, xmm7 +por xmm1, xmm6 +por xmm2, xmm3 +movdqa xmm5, xmm4 +movdqa xmm6, xmm0 +shufpd xmm4, xmm1, 2 +shufpd xmm0, xmm2, 2 +shufpd xmm1, xmm5, 2 +shufpd xmm2, xmm6, 2 
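+; output: if the input pointer rdx is non-null, XOR the keystream with it (movdqa when 16-byte aligned, movdqu otherwise), then store 64 bytes through the output pointer rcx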
+test rdx, rdx +jz labelSSE2_Salsa_Output_B3 +test rdx, 15 +jnz labelSSE2_Salsa_Output_B7 +pxor xmm4, [rdx+0*16] +pxor xmm0, [rdx+1*16] +pxor xmm1, [rdx+2*16] +pxor xmm2, [rdx+3*16] +add rdx, 4*16 +jmp labelSSE2_Salsa_Output_B3 +labelSSE2_Salsa_Output_B7: +movdqu xmm3, [rdx+0*16] +pxor xmm4, xmm3 +movdqu xmm3, [rdx+1*16] +pxor xmm0, xmm3 +movdqu xmm3, [rdx+2*16] +pxor xmm1, xmm3 +movdqu xmm3, [rdx+3*16] +pxor xmm2, xmm3 +add rdx, 4*16 +labelSSE2_Salsa_Output_B3: +test rcx, 15 +jnz labelSSE2_Salsa_Output_B8 +movdqa [rcx+0*16], xmm4 +movdqa [rcx+1*16], xmm0 +movdqa [rcx+2*16], xmm1 +movdqa [rcx+3*16], xmm2 +jmp labelSSE2_Salsa_Output_B9 +labelSSE2_Salsa_Output_B8: +movdqu [rcx+0*16], xmm4 +movdqu [rcx+1*16], xmm0 +movdqu [rcx+2*16], xmm1 +movdqu [rcx+3*16], xmm2 +labelSSE2_Salsa_Output_B9: +add rcx, 4*16 +jmp label5 +label4: +movdqa xmm6, [rsp + 0200h] +movdqa xmm7, [rsp + 0210h] +movdqa xmm8, [rsp + 0220h] +movdqa xmm9, [rsp + 0230h] +movdqa xmm10, [rsp + 0240h] +movdqa xmm11, [rsp + 0250h] +movdqa xmm12, [rsp + 0260h] +movdqa xmm13, [rsp + 0270h] +movdqa xmm14, [rsp + 0280h] +movdqa xmm15, [rsp + 0290h] +add rsp, 10*16 + 32*16 + 8 +ret +Salsa20_OperateKeystream ENDP +ALIGN 8 +Rijndael_Enc_ProcessAndXorBlock PROC FRAME +rex_push_reg rbx +push_reg rsi +push_reg rdi +push_reg r12 +push_reg r13 +push_reg r14 +push_reg r15 +.endprolog +mov r11, rcx +mov rdi, [rsp + 5*8 + 7*8] ; inBlock +mov eax, [r8+0*4] +xor eax, [rdi+0*4] +mov r13d, eax +mov ebx, [r8+1*4] +xor ebx, [rdi+1*4] +mov r14d, ebx +and ebx, eax +mov eax, [r8+2*4] +xor eax, [rdi+2*4] +mov r15d, eax +and ebx, eax +mov ecx, [r8+3*4] +xor ecx, [rdi+3*4] +and ebx, ecx +and ebx, 0 +mov edi, ebx +label2: +and ebx, [r11+rdi] +add edi, edx +and ebx, [r11+rdi] +add edi, edx +and ebx, [r11+rdi] +add edi, edx +and ebx, [r11+rdi] +add edi, edx +cmp edi, 1024 +jl label2 +and ebx, [r11+1020] +xor r13d, ebx +xor r14d, ebx +xor r15d, ebx +xor ecx, ebx +mov edi, [r8+4*4] +mov eax, [r8+5*4] +mov ebx, [r8+6*4] +mov edx, [r8+7*4] +add r8, 8*4 +movzx esi, cl +xor edx, [r11+0*1024+4*rsi] +movzx esi, ch +xor ebx, [r11+1*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor eax, [r11+2*1024+4*rsi] +movzx esi, ch +xor edi, [r11+3*1024+4*rsi] +mov ecx, r15d +movzx esi, cl +xor ebx, [r11+0*1024+4*rsi] +movzx esi, ch +xor eax, [r11+1*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor edi, [r11+2*1024+4*rsi] +movzx esi, ch +xor edx, [r11+3*1024+4*rsi] +mov ecx, r14d +movzx esi, cl +xor eax, [r11+0*1024+4*rsi] +movzx esi, ch +xor edi, [r11+1*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor edx, [r11+2*1024+4*rsi] +movzx esi, ch +xor ebx, [r11+3*1024+4*rsi] +mov ecx, r13d +movzx esi, cl +xor edi, [r11+0*1024+4*rsi] +movzx esi, ch +xor edx, [r11+1*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor ebx, [r11+2*1024+4*rsi] +movzx esi, ch +xor eax, [r11+3*1024+4*rsi] +mov r15d, ebx +mov r14d, eax +mov r13d, edi +label0: +mov edi, [r8+0*4] +mov eax, [r8+1*4] +mov ebx, [r8+2*4] +mov ecx, [r8+3*4] +movzx esi, dl +xor edi, [r11+3*1024+4*rsi] +movzx esi, dh +xor eax, [r11+2*1024+4*rsi] +shr edx, 16 +movzx esi, dl +xor ebx, [r11+1*1024+4*rsi] +movzx esi, dh +xor ecx, [r11+0*1024+4*rsi] +mov edx, r15d +movzx esi, dl +xor ecx, [r11+3*1024+4*rsi] +movzx esi, dh +xor edi, [r11+2*1024+4*rsi] +shr edx, 16 +movzx esi, dl +xor eax, [r11+1*1024+4*rsi] +movzx esi, dh +xor ebx, [r11+0*1024+4*rsi] +mov edx, r14d +movzx esi, dl +xor ebx, [r11+3*1024+4*rsi] +movzx esi, dh +xor ecx, [r11+2*1024+4*rsi] +shr edx, 16 +movzx esi, dl +xor edi, [r11+1*1024+4*rsi] +movzx esi, dh +xor eax, [r11+0*1024+4*rsi] +mov edx, 
r13d +movzx esi, dl +xor eax, [r11+3*1024+4*rsi] +movzx esi, dh +xor ebx, [r11+2*1024+4*rsi] +shr edx, 16 +movzx esi, dl +xor ecx, [r11+1*1024+4*rsi] +movzx esi, dh +xor edi, [r11+0*1024+4*rsi] +mov r15d, ebx +mov r14d, eax +mov r13d, edi +mov edi, [r8+4*4] +mov eax, [r8+5*4] +mov ebx, [r8+6*4] +mov edx, [r8+7*4] +movzx esi, cl +xor edi, [r11+3*1024+4*rsi] +movzx esi, ch +xor eax, [r11+2*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor ebx, [r11+1*1024+4*rsi] +movzx esi, ch +xor edx, [r11+0*1024+4*rsi] +mov ecx, r15d +movzx esi, cl +xor edx, [r11+3*1024+4*rsi] +movzx esi, ch +xor edi, [r11+2*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor eax, [r11+1*1024+4*rsi] +movzx esi, ch +xor ebx, [r11+0*1024+4*rsi] +mov ecx, r14d +movzx esi, cl +xor ebx, [r11+3*1024+4*rsi] +movzx esi, ch +xor edx, [r11+2*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor edi, [r11+1*1024+4*rsi] +movzx esi, ch +xor eax, [r11+0*1024+4*rsi] +mov ecx, r13d +movzx esi, cl +xor eax, [r11+3*1024+4*rsi] +movzx esi, ch +xor ebx, [r11+2*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor edx, [r11+1*1024+4*rsi] +movzx esi, ch +xor edi, [r11+0*1024+4*rsi] +mov r15d, ebx +mov r14d, eax +mov r13d, edi +add r8, 8*4 +cmp r9, r8 +jne label0 +mov eax, [r9+0*4] +mov ecx, [r9+1*4] +mov esi, [r9+2*4] +mov edi, [r9+3*4] +movzx ebx, dl +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 3*8 +xor eax, ebx +movzx ebx, dh +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 2*8 +xor ecx, ebx +shr edx, 16 +movzx ebx, dl +shr edx, 8 +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 1*8 +xor esi, ebx +movzx ebx, BYTE PTR [r11+1+4*rdx] +xor edi, ebx +mov edx, r15d +movzx ebx, dl +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 3*8 +xor edi, ebx +movzx ebx, dh +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 2*8 +xor eax, ebx +shr edx, 16 +movzx ebx, dl +shr edx, 8 +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 1*8 +xor ecx, ebx +movzx ebx, BYTE PTR [r11+1+4*rdx] +xor esi, ebx +mov edx, r14d +movzx ebx, dl +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 3*8 +xor esi, ebx +movzx ebx, dh +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 2*8 +xor edi, ebx +shr edx, 16 +movzx ebx, dl +shr edx, 8 +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 1*8 +xor eax, ebx +movzx ebx, BYTE PTR [r11+1+4*rdx] +xor ecx, ebx +mov edx, r13d +movzx ebx, dl +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 3*8 +xor ecx, ebx +movzx ebx, dh +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 2*8 +xor esi, ebx +shr edx, 16 +movzx ebx, dl +shr edx, 8 +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 1*8 +xor edi, ebx +movzx ebx, BYTE PTR [r11+1+4*rdx] +xor eax, ebx +mov rbx, [rsp + 6*8 + 7*8] ; xorBlock +test rbx, rbx +jz label1 +xor eax, [rbx+0*4] +xor ecx, [rbx+1*4] +xor esi, [rbx+2*4] +xor edi, [rbx+3*4] +label1: +mov rbx, [rsp + 7*8 + 7*8] ; outBlock +mov [rbx+0*4], eax +mov [rbx+1*4], ecx +mov [rbx+2*4], esi +mov [rbx+3*4], edi +pop r15 +pop r14 +pop r13 +pop r12 +pop rdi +pop rsi +pop rbx +ret +Rijndael_Enc_ProcessAndXorBlock ENDP + +ALIGN 8 +Sosemanuk_OperateKeystream PROC FRAME +rex_push_reg rsi +push_reg rdi +alloc_stack(80*4*2+12*4+8*8 + 2*16+8) +save_xmm128 xmm6, 02f0h +save_xmm128 xmm7, 0300h +.endprolog +mov rdi, r8 +mov rax, r9 +mov QWORD PTR [rsp+1*8], rdi +mov QWORD PTR [rsp+2*8], rdx +mov QWORD PTR [rsp+6*8], rax +lea rcx, [4*rcx+rcx] +lea rsi, [4*rcx] +mov QWORD PTR [rsp+3*8], rsi +movdqa xmm0, [rax+0*16] +movdqa [rsp + 8*8+0*16], xmm0 +movdqa xmm0, [rax+1*16] +movdqa [rsp + 8*8+1*16], xmm0 +movq xmm0, QWORD PTR [rax+2*16] +movq QWORD PTR [rsp + 8*8+2*16], xmm0 +psrlq xmm0, 32 +movd r10d, xmm0 +mov ecx, [rax+10*4] +mov edx, 
[rax+11*4] +pcmpeqb xmm7, xmm7 +label2: +lea rdi, [rsp + 8*8 + 12*4] +mov rax, 80 +cmp rsi, 80 +cmovg rsi, rax +mov QWORD PTR [rsp+7*8], rsi +lea rsi, [rdi+rsi] +mov QWORD PTR [rsp+4*8], rsi +lea rsi, s_sosemanukMulTables +label0: +mov eax, [rsp + 8*8 + ((0+0)-((0+0)/(10))*(10))*4] +mov [rdi + (((0)-((0)/(4))*(4))*20 + (0/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((0)-((0)/(4))*(4))*20 + (0/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((0+3)-((0+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((0+2)-((0+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((0+0)-((0+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((1+0)-((1+0)/(10))*(10))*4] +mov [rdi + (((1)-((1)/(4))*(4))*20 + (1/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((1)-((1)/(4))*(4))*20 + (1/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((1+3)-((1+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((1+2)-((1+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((1+0)-((1+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((2+0)-((2+0)/(10))*(10))*4] +mov [rdi + (((2)-((2)/(4))*(4))*20 + (2/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((2)-((2)/(4))*(4))*20 + (2/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((2+3)-((2+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((2+2)-((2+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((2+0)-((2+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((3+0)-((3+0)/(10))*(10))*4] +mov [rdi + (((3)-((3)/(4))*(4))*20 + (3/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((3)-((3)/(4))*(4))*20 + (3/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((3+3)-((3+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((3+2)-((3+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((3+0)-((3+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((4+0)-((4+0)/(10))*(10))*4] +mov [rdi + (((4)-((4)/(4))*(4))*20 + (4/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((4)-((4)/(4))*(4))*20 + (4/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((4+3)-((4+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((4+2)-((4+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((4+0)-((4+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((5+0)-((5+0)/(10))*(10))*4] +mov [rdi + (((5)-((5)/(4))*(4))*20 + (5/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((5)-((5)/(4))*(4))*20 + (5/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx 
eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((5+3)-((5+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((5+2)-((5+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((5+0)-((5+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((6+0)-((6+0)/(10))*(10))*4] +mov [rdi + (((6)-((6)/(4))*(4))*20 + (6/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((6)-((6)/(4))*(4))*20 + (6/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((6+3)-((6+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((6+2)-((6+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((6+0)-((6+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((7+0)-((7+0)/(10))*(10))*4] +mov [rdi + (((7)-((7)/(4))*(4))*20 + (7/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((7)-((7)/(4))*(4))*20 + (7/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((7+3)-((7+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((7+2)-((7+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((7+0)-((7+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((8+0)-((8+0)/(10))*(10))*4] +mov [rdi + (((8)-((8)/(4))*(4))*20 + (8/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((8)-((8)/(4))*(4))*20 + (8/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((8+3)-((8+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((8+2)-((8+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((8+0)-((8+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((9+0)-((9+0)/(10))*(10))*4] +mov [rdi + (((9)-((9)/(4))*(4))*20 + (9/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((9)-((9)/(4))*(4))*20 + (9/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((9+3)-((9+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((9+2)-((9+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((9+0)-((9+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((10+0)-((10+0)/(10))*(10))*4] +mov [rdi + (((10)-((10)/(4))*(4))*20 + (10/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((10)-((10)/(4))*(4))*20 + (10/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((10+3)-((10+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((10+2)-((10+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((10+0)-((10+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((11+0)-((11+0)/(10))*(10))*4] +mov [rdi + (((11)-((11)/(4))*(4))*20 + (11/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((11)-((11)/(4))*(4))*20 
+ (11/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((11+3)-((11+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((11+2)-((11+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((11+0)-((11+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((12+0)-((12+0)/(10))*(10))*4] +mov [rdi + (((12)-((12)/(4))*(4))*20 + (12/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((12)-((12)/(4))*(4))*20 + (12/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((12+3)-((12+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((12+2)-((12+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((12+0)-((12+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((13+0)-((13+0)/(10))*(10))*4] +mov [rdi + (((13)-((13)/(4))*(4))*20 + (13/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((13)-((13)/(4))*(4))*20 + (13/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((13+3)-((13+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((13+2)-((13+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((13+0)-((13+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((14+0)-((14+0)/(10))*(10))*4] +mov [rdi + (((14)-((14)/(4))*(4))*20 + (14/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((14)-((14)/(4))*(4))*20 + (14/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((14+3)-((14+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((14+2)-((14+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((14+0)-((14+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((15+0)-((15+0)/(10))*(10))*4] +mov [rdi + (((15)-((15)/(4))*(4))*20 + (15/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((15)-((15)/(4))*(4))*20 + (15/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((15+3)-((15+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((15+2)-((15+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((15+0)-((15+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((16+0)-((16+0)/(10))*(10))*4] +mov [rdi + (((16)-((16)/(4))*(4))*20 + (16/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((16)-((16)/(4))*(4))*20 + (16/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((16+3)-((16+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((16+2)-((16+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((16+0)-((16+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + 
((17+0)-((17+0)/(10))*(10))*4] +mov [rdi + (((17)-((17)/(4))*(4))*20 + (17/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((17)-((17)/(4))*(4))*20 + (17/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((17+3)-((17+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((17+2)-((17+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((17+0)-((17+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((18+0)-((18+0)/(10))*(10))*4] +mov [rdi + (((18)-((18)/(4))*(4))*20 + (18/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((18)-((18)/(4))*(4))*20 + (18/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((18+3)-((18+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((18+2)-((18+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((18+0)-((18+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((19+0)-((19+0)/(10))*(10))*4] +mov [rdi + (((19)-((19)/(4))*(4))*20 + (19/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((19)-((19)/(4))*(4))*20 + (19/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((19+3)-((19+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((19+2)-((19+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((19+0)-((19+0)/(10))*(10))*4], r10d +add rdi, 5*4 +cmp rdi, QWORD PTR [rsp+4*8] +jne label0 +mov rax, QWORD PTR [rsp+2*8] +mov r11, QWORD PTR [rsp+1*8] +lea rdi, [rsp + 8*8 + 12*4] +mov rsi, QWORD PTR [rsp+7*8] +label1: +movdqa xmm0, [rdi+0*20*4] +movdqa xmm2, [rdi+2*20*4] +movdqa xmm3, [rdi+3*20*4] +movdqa xmm1, [rdi+1*20*4] +movdqa xmm4, xmm0 +pand xmm0, xmm2 +pxor xmm0, xmm3 +pxor xmm2, xmm1 +pxor xmm2, xmm0 +por xmm3, xmm4 +pxor xmm3, xmm1 +pxor xmm4, xmm2 +movdqa xmm1, xmm3 +por xmm3, xmm4 +pxor xmm3, xmm0 +pand xmm0, xmm1 +pxor xmm4, xmm0 +pxor xmm1, xmm3 +pxor xmm1, xmm4 +pxor xmm4, xmm7 +pxor xmm2, [rdi+80*4] +pxor xmm3, [rdi+80*5] +pxor xmm1, [rdi+80*6] +pxor xmm4, [rdi+80*7] +cmp rsi, 16 +jl label4 +movdqa xmm6, xmm2 +punpckldq xmm2, xmm3 +movdqa xmm5, xmm1 +punpckldq xmm1, xmm4 +movdqa xmm0, xmm2 +punpcklqdq xmm2, xmm1 +punpckhqdq xmm0, xmm1 +punpckhdq xmm6, xmm3 +punpckhdq xmm5, xmm4 +movdqa xmm3, xmm6 +punpcklqdq xmm6, xmm5 +punpckhqdq xmm3, xmm5 +test rax, rax +jz labelSSE2_Sosemanuk_Output3 +test rax, 15 +jnz labelSSE2_Sosemanuk_Output7 +pxor xmm2, [rax+0*16] +pxor xmm0, [rax+1*16] +pxor xmm6, [rax+2*16] +pxor xmm3, [rax+3*16] +add rax, 4*16 +jmp labelSSE2_Sosemanuk_Output3 +labelSSE2_Sosemanuk_Output7: +movdqu xmm1, [rax+0*16] +pxor xmm2, xmm1 +movdqu xmm1, [rax+1*16] +pxor xmm0, xmm1 +movdqu xmm1, [rax+2*16] +pxor xmm6, xmm1 +movdqu xmm1, [rax+3*16] +pxor xmm3, xmm1 +add rax, 4*16 +labelSSE2_Sosemanuk_Output3: +test r11, 15 +jnz labelSSE2_Sosemanuk_Output8 +movdqa [r11+0*16], xmm2 +movdqa [r11+1*16], xmm0 +movdqa [r11+2*16], xmm6 +movdqa [r11+3*16], xmm3 +jmp labelSSE2_Sosemanuk_Output9 +labelSSE2_Sosemanuk_Output8: +movdqu [r11+0*16], xmm2 +movdqu [r11+1*16], xmm0 +movdqu [r11+2*16], xmm6 +movdqu [r11+3*16], 
xmm3 +labelSSE2_Sosemanuk_Output9: +add r11, 4*16 +add rdi, 4*4 +sub rsi, 16 +jnz label1 +mov rsi, QWORD PTR [rsp+3*8] +sub rsi, 80 +jz label6 +mov QWORD PTR [rsp+3*8], rsi +mov QWORD PTR [rsp+2*8], rax +mov QWORD PTR [rsp+1*8], r11 +jmp label2 +label4: +test rax, rax +jz label5 +movd xmm0, dword ptr [rax+0*4] +pxor xmm2, xmm0 +movd xmm0, dword ptr [rax+1*4] +pxor xmm3, xmm0 +movd xmm0, dword ptr [rax+2*4] +pxor xmm1, xmm0 +movd xmm0, dword ptr [rax+3*4] +pxor xmm4, xmm0 +add rax, 16 +label5: +movd dword ptr [r11+0*4], xmm2 +movd dword ptr [r11+1*4], xmm3 +movd dword ptr [r11+2*4], xmm1 +movd dword ptr [r11+3*4], xmm4 +sub rsi, 4 +jz label6 +add r11, 16 +psrldq xmm2, 4 +psrldq xmm3, 4 +psrldq xmm1, 4 +psrldq xmm4, 4 +jmp label4 +label6: +mov r10, QWORD PTR [rsp+6*8] +movdqa xmm0, [rsp + 8*8+0*16] +movdqa [r10+0*16], xmm0 +movdqa xmm0, [rsp + 8*8+1*16] +movdqa [r10+1*16], xmm0 +movq xmm0, QWORD PTR [rsp + 8*8+2*16] +movq QWORD PTR [r10+2*16], xmm0 +mov [r10+10*4], ecx +mov [r10+11*4], edx +movdqa xmm6, [rsp + 02f0h] +movdqa xmm7, [rsp + 0300h] +add rsp, 80*4*2+12*4+8*8 + 2*16+8 +pop rdi +pop rsi +ret +Sosemanuk_OperateKeystream ENDP + +Panama_SSE2_Pull PROC FRAME +alloc_stack(2*16+8) +save_xmm128 xmm6, 0h +save_xmm128 xmm7, 10h +.endprolog +shl rcx, 5 +jz label5 +mov r10d, [rdx+4*17] +add rcx, r10 +mov rdi, rcx +movdqa xmm0, xmmword ptr [rdx+0*16] +movdqa xmm1, xmmword ptr [rdx+1*16] +movdqa xmm2, xmmword ptr [rdx+2*16] +movdqa xmm3, xmmword ptr [rdx+3*16] +mov eax, dword ptr [rdx+4*16] +label4: +movdqa xmm6, xmm2 +movss xmm6, xmm3 +pshufd xmm5, xmm6, 0*64+3*16+2*4+1 +movd xmm6, eax +movdqa xmm7, xmm3 +movss xmm7, xmm6 +pshufd xmm6, xmm7, 0*64+3*16+2*4+1 +movd ecx, xmm2 +not ecx +movd r11d, xmm3 +or ecx, r11d +xor eax, ecx +pcmpeqb xmm7, xmm7 +pxor xmm7, xmm1 +por xmm7, xmm2 +pxor xmm7, xmm3 +movd ecx, xmm7 +rol ecx, (((((5*1) MOD (17))*(((5*1) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(1)) MOD (17)))*13+16)) MOD (17))*4], ecx +pshuflw xmm7, xmm7, 1*64+0*16+3*4+2 +movd ecx, xmm7 +rol ecx, (((((5*5) MOD (17))*(((5*5) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(5)) MOD (17)))*13+16)) MOD (17))*4], ecx +punpckhqdq xmm7, xmm7 +movd ecx, xmm7 +rol ecx, (((((5*9) MOD (17))*(((5*9) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(9)) MOD (17)))*13+16)) MOD (17))*4], ecx +pshuflw xmm7, xmm7, 1*64+0*16+3*4+2 +movd ecx, xmm7 +rol ecx, (((((5*13) MOD (17))*(((5*13) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(13)) MOD (17)))*13+16)) MOD (17))*4], ecx +pcmpeqb xmm7, xmm7 +pxor xmm7, xmm0 +por xmm7, xmm1 +pxor xmm7, xmm2 +movd ecx, xmm7 +rol ecx, (((((5*2) MOD (17))*(((5*2) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(2)) MOD (17)))*13+16)) MOD (17))*4], ecx +pshuflw xmm7, xmm7, 1*64+0*16+3*4+2 +movd ecx, xmm7 +rol ecx, (((((5*6) MOD (17))*(((5*6) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(6)) MOD (17)))*13+16)) MOD (17))*4], ecx +punpckhqdq xmm7, xmm7 +movd ecx, xmm7 +rol ecx, (((((5*10) MOD (17))*(((5*10) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(10)) MOD (17)))*13+16)) MOD (17))*4], ecx +pshuflw xmm7, xmm7, 1*64+0*16+3*4+2 +movd ecx, xmm7 +rol ecx, (((((5*14) MOD (17))*(((5*14) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(14)) MOD (17)))*13+16)) MOD (17))*4], ecx +pcmpeqb xmm7, xmm7 +pxor xmm7, xmm6 +por xmm7, xmm0 +pxor xmm7, xmm1 +movd ecx, xmm7 +rol ecx, (((((5*3) MOD (17))*(((5*3) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(3)) MOD (17)))*13+16)) MOD (17))*4], ecx +pshuflw xmm7, xmm7, 1*64+0*16+3*4+2 +movd ecx, xmm7 +rol ecx, (((((5*7) MOD (17))*(((5*7) MOD (17))+1)/2)) MOD (32)) 
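+; rotation counts and word indices here are assembler-time constant expressions built with MASM's MOD operator (ASM_MOD in cpu.h): word i = 5*j mod 17 is rotated by i*(i+1)/2 mod 32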
+mov [rdx+((((((5*(7)) MOD (17)))*13+16)) MOD (17))*4], ecx +punpckhqdq xmm7, xmm7 +movd ecx, xmm7 +rol ecx, (((((5*11) MOD (17))*(((5*11) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(11)) MOD (17)))*13+16)) MOD (17))*4], ecx +pshuflw xmm7, xmm7, 1*64+0*16+3*4+2 +movd ecx, xmm7 +rol ecx, (((((5*15) MOD (17))*(((5*15) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(15)) MOD (17)))*13+16)) MOD (17))*4], ecx +pcmpeqb xmm7, xmm7 +pxor xmm7, xmm5 +por xmm7, xmm6 +pxor xmm7, xmm0 +movd ecx, xmm7 +rol ecx, (((((5*4) MOD (17))*(((5*4) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(4)) MOD (17)))*13+16)) MOD (17))*4], ecx +pshuflw xmm7, xmm7, 1*64+0*16+3*4+2 +movd ecx, xmm7 +rol ecx, (((((5*8) MOD (17))*(((5*8) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(8)) MOD (17)))*13+16)) MOD (17))*4], ecx +punpckhqdq xmm7, xmm7 +movd ecx, xmm7 +rol ecx, (((((5*12) MOD (17))*(((5*12) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(12)) MOD (17)))*13+16)) MOD (17))*4], ecx +pshuflw xmm7, xmm7, 1*64+0*16+3*4+2 +movd ecx, xmm7 +rol ecx, (((((5*16) MOD (17))*(((5*16) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(16)) MOD (17)))*13+16)) MOD (17))*4], ecx +movdqa xmm4, xmm3 +punpcklqdq xmm3, xmm2 +punpckhdq xmm4, xmm2 +movdqa xmm2, xmm1 +punpcklqdq xmm1, xmm0 +punpckhdq xmm2, xmm0 +test r8, r8 +jz label0 +movdqa xmm6, xmm4 +punpcklqdq xmm4, xmm2 +punpckhqdq xmm6, xmm2 +test r9, 15 +jnz label2 +test r9, r9 +jz label1 +pxor xmm4, [r9] +pxor xmm6, [r9+16] +add r9, 32 +jmp label1 +label2: +movdqu xmm0, [r9] +movdqu xmm2, [r9+16] +pxor xmm4, xmm0 +pxor xmm6, xmm2 +add r9, 32 +label1: +test r8, 15 +jnz label3 +movdqa xmmword ptr [r8], xmm4 +movdqa xmmword ptr [r8+16], xmm6 +add r8, 32 +jmp label0 +label3: +movdqu xmmword ptr [r8], xmm4 +movdqu xmmword ptr [r8+16], xmm6 +add r8, 32 +label0: +lea rcx, [r10 + 32] +and rcx, 31*32 +lea r11, [r10 + (32-24)*32] +and r11, 31*32 +movdqa xmm0, xmmword ptr [rdx+20*4+rcx+0*8] +pxor xmm3, xmm0 +pshufd xmm0, xmm0, 2*64+3*16+0*4+1 +movdqa xmmword ptr [rdx+20*4+rcx+0*8], xmm3 +pxor xmm0, xmmword ptr [rdx+20*4+r11+2*8] +movdqa xmmword ptr [rdx+20*4+r11+2*8], xmm0 +movdqa xmm4, xmmword ptr [rdx+20*4+rcx+2*8] +pxor xmm1, xmm4 +movdqa xmmword ptr [rdx+20*4+rcx+2*8], xmm1 +pxor xmm4, xmmword ptr [rdx+20*4+r11+0*8] +movdqa xmmword ptr [rdx+20*4+r11+0*8], xmm4 +movdqa xmm3, xmmword ptr [rdx+3*16] +movdqa xmm2, xmmword ptr [rdx+2*16] +movdqa xmm1, xmmword ptr [rdx+1*16] +movdqa xmm0, xmmword ptr [rdx+0*16] +movd xmm6, eax +movdqa xmm7, xmm3 +movss xmm7, xmm6 +movdqa xmm6, xmm2 +movss xmm6, xmm3 +movdqa xmm5, xmm1 +movss xmm5, xmm2 +movdqa xmm4, xmm0 +movss xmm4, xmm1 +pshufd xmm7, xmm7, 0*64+3*16+2*4+1 +pshufd xmm6, xmm6, 0*64+3*16+2*4+1 +pshufd xmm5, xmm5, 0*64+3*16+2*4+1 +pshufd xmm4, xmm4, 0*64+3*16+2*4+1 +xor eax, 1 +movd ecx, xmm0 +xor eax, ecx +movd ecx, xmm3 +xor eax, ecx +pxor xmm3, xmm2 +pxor xmm2, xmm1 +pxor xmm1, xmm0 +pxor xmm0, xmm7 +pxor xmm3, xmm7 +pxor xmm2, xmm6 +pxor xmm1, xmm5 +pxor xmm0, xmm4 +lea rcx, [r10 + (32-4)*32] +and rcx, 31*32 +lea r11, [r10 + 16*32] +and r11, 31*32 +movdqa xmm4, xmmword ptr [rdx+20*4+rcx+0*16] +movdqa xmm5, xmmword ptr [rdx+20*4+r11+0*16] +movdqa xmm6, xmm4 +punpcklqdq xmm4, xmm5 +punpckhqdq xmm6, xmm5 +pxor xmm3, xmm4 +pxor xmm2, xmm6 +movdqa xmm4, xmmword ptr [rdx+20*4+rcx+1*16] +movdqa xmm5, xmmword ptr [rdx+20*4+r11+1*16] +movdqa xmm6, xmm4 +punpcklqdq xmm4, xmm5 +punpckhqdq xmm6, xmm5 +pxor xmm1, xmm4 +pxor xmm0, xmm6 +add r10, 32 +cmp r10, rdi +jne label4 +mov [rdx+4*16], eax +movdqa xmmword ptr [rdx+3*16], xmm3 +movdqa xmmword ptr [rdx+2*16], 
xmm2 +movdqa xmmword ptr [rdx+1*16], xmm1 +movdqa xmmword ptr [rdx+0*16], xmm0 +label5: +movdqa xmm6, [rsp + 0h] +movdqa xmm7, [rsp + 10h] +add rsp, 2*16+8 +ret +Panama_SSE2_Pull ENDP + _TEXT ENDS END
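
Note on the workspace technique visible in the sosemanuk.cpp and whrlpool.cpp
hunks above: on x64, the GCC inline-assembly paths no longer carve locals out
of the stack by adjusting the stack pointer; a C++ array (workspace) is
declared next to the __asm__ block and passed in as an extra "r" input
operand, which is what the SSE2_workspace binding (%5 in sosemanuk.cpp, %3 in
whrlpool.cpp) refers to. A minimal self-contained sketch of the pattern, with
a hypothetical function name:

#include <emmintrin.h>

void WorkspaceSketch()
{
	__m128i workspace[8];	// 16-byte aligned scratch allocated by the compiler
	__asm__ __volatile__ (
		".intel_syntax noprefix;"
		"pxor    xmm0, xmm0;"
		"movdqa  [%0], xmm0;"	// spill into the C++ workspace, not below RSP
		".att_syntax prefix;"
		:
		: "r" (workspace)
		: "memory", "%xmm0"
	);
}

Since the asm body never modifies RSP or RBP, the compiler-generated prologue
stays the only code that moves the stack pointer, so the frame can still be
unwound; the MASM procedures above get the same property on Windows x64 from
PROC FRAME, alloc_stack/save_xmm128 and .endprolog, which emit explicit
unwind records.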