author     weidai <weidai@57ff6487-cd31-0410-9ec3-f628ee90f5f0>   2007-09-24 00:43:57 +0000
committer  weidai <weidai@57ff6487-cd31-0410-9ec3-f628ee90f5f0>   2007-09-24 00:43:57 +0000
commit     982ba6fa712d44275c2541b6b9badf489cf9eda6 (patch)
tree       7d4e77f11bb8dc49557b634d8380767aef1b8502
parent     489a156f9bc41028439b6375af6314e473565847 (diff)
- port x64 assembly code to MASM
- improve stack unwindability on x64 for GCC by not modifying RBP/RSP registers in inline assembly
git-svn-id: svn://svn.code.sf.net/p/cryptopp/code/trunk/c5@396 57ff6487-cd31-0410-9ec3-f628ee90f5f0
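
The port keeps a single assembly-language source for all three toolchains. Each routine is written with AS1/AS2/ASL-style macros: compiled normally, they expand to GCC inline assembly or MSVC __asm blocks, while preprocessing with "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM" (as the comments added to each .cpp file note) expands the same lines into bare MASM text for x64masm.asm. A simplified sketch of the dispatch — the real cpu.h macros below also append a *newline* marker so the generated MASM gets line breaks:

    // Simplified sketch of the macro dispatch this commit adds to cpu.h.
    // One source line such as AS2(mov rax, rcx) becomes:
    //   - bare MASM text when generating x64masm.asm,
    //   - a stringized fragment of an asm("...") statement under GCC,
    //   - an __asm block under the MSVC inline assembler.
    #ifdef CRYPTOPP_GENERATE_X64_MASM
        #define AS2(x, y)     x, y
        #define ASL(x)        label##x:
    #elif defined(__GNUC__)
        #define GNU_AS2(x, y) #x ", " #y ";"   // two steps so arguments expand first
        #define AS2(x, y)     GNU_AS2(x, y)
        #define ASL(x)        #x ":"
    #else
        #define AS2(x, y)     __asm {x, y}
        #define ASL(x)        __asm {label##x:}
    #endif

Since the x64 VC++ compiler has no inline assembler, this is what lets one body of code serve the MASM build (via the generated .asm file) as well as the x86 MSVC and GCC builds.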
-rwxr-xr-x   cpu.h            121
-rw-r--r--   panama.cpp       230
-rw-r--r--   rijndael.cpp     154
-rwxr-xr-x   salsa.cpp        719
-rwxr-xr-x   sosemanuk.cpp    228
-rw-r--r--   whrlpool.cpp      94
-rwxr-xr-x   x64masm.asm     1842

7 files changed, 2854 insertions, 534 deletions
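
Much of the churn in panama.cpp, salsa.cpp, and sosemanuk.cpp funnels their keystream output through the new AS_XMM_OUTPUT4 macro that the cpu.h hunk below introduces: it XORs four XMM registers of keystream against optional input and stores them to the output, choosing movdqa or movdqu per pointer at runtime. For a single 16-byte block, a rough C++ intrinsics equivalent of that dispatch (an illustrative helper, not code from the library) looks like this:

    #include <emmintrin.h>
    #include <cstdint>

    // Illustrative equivalent of one lane of AS_XMM_OUTPUT4: XOR a 16-byte
    // keystream block with optional input, using aligned SSE2 loads/stores
    // only when the pointer permits them.
    static void xmm_output_block(const unsigned char *in, unsigned char *out, __m128i k)
    {
        if (in)   // input may be NULL (pure keystream generation)
        {
            __m128i x = ((uintptr_t)in & 15) ? _mm_loadu_si128((const __m128i *)in)
                                             : _mm_load_si128((const __m128i *)in);
            k = _mm_xor_si128(k, x);
        }
        if ((uintptr_t)out & 15)
            _mm_storeu_si128((__m128i *)out, k);   // movdqu path
        else
            _mm_store_si128((__m128i *)out, k);    // movdqa path
    }

Centralizing this in one macro is what allows the per-cipher output blocks (previously open-coded in each file, as the removed lines in sosemanuk.cpp show) to be deleted.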
@@ -1,6 +1,15 @@ #ifndef CRYPTOPP_CPU_H #define CRYPTOPP_CPU_H +#ifdef CRYPTOPP_GENERATE_X64_MASM + +#define CRYPTOPP_X86_ASM_AVAILABLE +#define CRYPTOPP_BOOL_X64 1 +#define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 1 +#define NAMESPACE_END + +#else + #include "config.h" #ifdef CRYPTOPP_MSVC6PP_OR_LATER @@ -98,7 +107,18 @@ inline bool HasMMX() {return false;} #endif // #ifdef CRYPTOPP_X86_ASM_AVAILABLE || _MSC_VER >= 1400 -#if defined(__GNUC__) +#endif + +#ifdef CRYPTOPP_GENERATE_X64_MASM + #define AS1(x) x*newline* + #define AS2(x, y) x, y*newline* + #define AS3(x, y, z) x, y, z*newline* + #define ASS(x, y, a, b, c, d) x, y, a*64+b*16+c*4+d*newline* + #define ASL(x) label##x:*newline* + #define ASJ(x, y, z) x label##y*newline* + #define ASC(x, y) x label##y*newline* + #define AS_HEX(y) y##h +#elif defined(__GNUC__) // define these in two steps to allow arguments to be expanded #define GNU_AS1(x) #x ";" #define GNU_AS2(x, y) #x ", " #y ";" @@ -113,6 +133,7 @@ inline bool HasMMX() {return false;} #define ASJ(x, y, z) GNU_ASJ(x, y, z) #define ASC(x, y) #x " " #y ";" #define CRYPTOPP_NAKED + #define AS_HEX(y) 0x##y #else #define AS1(x) __asm {x} #define AS2(x, y) __asm {x, y} @@ -122,25 +143,115 @@ inline bool HasMMX() {return false;} #define ASJ(x, y, z) __asm {x label##y} #define ASC(x, y) __asm {x label##y} #define CRYPTOPP_NAKED __declspec(naked) + #define AS_HEX(y) 0x##y #endif +#ifdef CRYPTOPP_GENERATE_X64_MASM +#define ASM_MOD(x, y) ((x) MOD (y)) +#else // GNU assembler doesn't seem to have mod operator #define ASM_MOD(x, y) ((x)-((x)/(y))*(y)) +#endif #if CRYPTOPP_BOOL_X86 + #define AS_REG_1 ecx + #define AS_REG_2 edx + #define AS_REG_3 esi + #define AS_REG_4 edi + #define AS_REG_5 eax + #define AS_REG_6 ebx + #define AS_REG_7 ebp + #define AS_REG_1d ecx + #define AS_REG_2d edx + #define AS_REG_3d esi + #define AS_REG_4d edi + #define AS_REG_5d eax + #define AS_REG_6d ebx + #define AS_REG_7d ebp #define WORD_SZ 4 #define WORD_REG(x) e##x #define WORD_PTR DWORD PTR - #define AS_PUSH(x) AS1(push e##x) - #define AS_POP(x) AS1(pop e##x) + #define AS_PUSH_IF86(x) AS1(push e##x) + #define AS_POP_IF86(x) AS1(pop e##x) + #define AS_JCXZ jecxz #elif CRYPTOPP_BOOL_X64 + #ifdef CRYPTOPP_GENERATE_X64_MASM + #define AS_REG_1 rcx + #define AS_REG_2 rdx + #define AS_REG_3 r8 + #define AS_REG_4 r9 + #define AS_REG_5 rax + #define AS_REG_6 r10 + #define AS_REG_7 r11 + #define AS_REG_1d ecx + #define AS_REG_2d edx + #define AS_REG_3d r8d + #define AS_REG_4d r9d + #define AS_REG_5d eax + #define AS_REG_6d r10d + #define AS_REG_7d r11d + #else + #define AS_REG_1 rdi + #define AS_REG_2 rsi + #define AS_REG_3 rdx + #define AS_REG_4 rcx + #define AS_REG_5 r8 + #define AS_REG_6 r9 + #define AS_REG_7 r10 + #define AS_REG_1d edi + #define AS_REG_2d esi + #define AS_REG_3d edx + #define AS_REG_4d ecx + #define AS_REG_5d r8d + #define AS_REG_6d r9d + #define AS_REG_7d r10d + #endif #define WORD_SZ 8 #define WORD_REG(x) r##x #define WORD_PTR QWORD PTR - #define AS_PUSH(x) AS1(pushq r##x) - #define AS_POP(x) AS1(popq r##x) + #define AS_PUSH_IF86(x) + #define AS_POP_IF86(x) + #define AS_JCXZ jrcxz #endif +// helper macro for stream cipher output +#define AS_XMM_OUTPUT4(labelPrefix, inputPtr, outputPtr, x0, x1, x2, x3, t, p0, p1, p2, p3, increment)\ + AS2( test inputPtr, inputPtr)\ + ASC( jz, labelPrefix##3)\ + AS2( test inputPtr, 15)\ + ASC( jnz, labelPrefix##7)\ + AS2( pxor xmm##x0, [inputPtr+p0*16])\ + AS2( pxor xmm##x1, [inputPtr+p1*16])\ + AS2( pxor xmm##x2, [inputPtr+p2*16])\ + AS2( pxor xmm##x3, [inputPtr+p3*16])\ + 
AS2( add inputPtr, increment*16)\ + ASC( jmp, labelPrefix##3)\ + ASL(labelPrefix##7)\ + AS2( movdqu xmm##t, [inputPtr+p0*16])\ + AS2( pxor xmm##x0, xmm##t)\ + AS2( movdqu xmm##t, [inputPtr+p1*16])\ + AS2( pxor xmm##x1, xmm##t)\ + AS2( movdqu xmm##t, [inputPtr+p2*16])\ + AS2( pxor xmm##x2, xmm##t)\ + AS2( movdqu xmm##t, [inputPtr+p3*16])\ + AS2( pxor xmm##x3, xmm##t)\ + AS2( add inputPtr, increment*16)\ + ASL(labelPrefix##3)\ + AS2( test outputPtr, 15)\ + ASC( jnz, labelPrefix##8)\ + AS2( movdqa [outputPtr+p0*16], xmm##x0)\ + AS2( movdqa [outputPtr+p1*16], xmm##x1)\ + AS2( movdqa [outputPtr+p2*16], xmm##x2)\ + AS2( movdqa [outputPtr+p3*16], xmm##x3)\ + ASC( jmp, labelPrefix##9)\ + ASL(labelPrefix##8)\ + AS2( movdqu [outputPtr+p0*16], xmm##x0)\ + AS2( movdqu [outputPtr+p1*16], xmm##x1)\ + AS2( movdqu [outputPtr+p2*16], xmm##x2)\ + AS2( movdqu [outputPtr+p3*16], xmm##x3)\ + ASL(labelPrefix##9)\ + AS2( add outputPtr, increment*16) + NAMESPACE_END #endif @@ -1,6 +1,11 @@ // panama.cpp - written and placed in the public domain by Wei Dai +// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM panama.cpp" to generate MASM code + #include "pch.h" + +#ifndef CRYPTOPP_GENERATE_X64_MASM + #include "panama.h" #include "misc.h" #include "cpu.h" @@ -16,41 +21,67 @@ void Panama<B>::Reset() #endif } -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE +#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM +#ifdef CRYPTOPP_X64_MASM_AVAILABLE +extern "C" { +void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y); +} +#elif CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + +#ifdef CRYPTOPP_GENERATE_X64_MASM + Panama_SSE2_Pull PROC FRAME + alloc_stack(2*16+8) + save_xmm128 xmm6, 0h + save_xmm128 xmm7, 10h + .endprolog +#else #pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code - void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y) { #ifdef __GNUC__ __asm__ __volatile__ ( ".intel_syntax noprefix;" - AS_PUSH( bx) + AS_POP_IF86( bx) #else - AS2( mov WORD_REG(cx), count) - AS2( mov WORD_REG(si), state) - AS2( mov WORD_REG(di), z) - AS2( mov WORD_REG(dx), y) + AS2( mov AS_REG_1, count) + AS2( mov AS_REG_2, state) + AS2( mov AS_REG_3, z) + AS2( mov AS_REG_4, y) #endif - AS2( shl WORD_REG(cx), 5) - ASJ( jz, 5, f) - AS2( mov ebx, [WORD_REG(si)+4*17]) - AS2( add WORD_REG(cx), WORD_REG(bx)) +#endif // #ifdef CRYPTOPP_GENERATE_X64_MASM - AS_PUSH( bp) - AS_PUSH( cx) +#if CRYPTOPP_BOOL_X86 + #define REG_loopEnd [esp] +#elif defined(CRYPTOPP_GENERATE_X64_MASM) + #define REG_loopEnd rdi +#else + #define REG_loopEnd r8 +#endif - AS2( movdqa xmm0, [WORD_REG(si)+0*16]) - AS2( movdqa xmm1, [WORD_REG(si)+1*16]) - AS2( movdqa xmm2, [WORD_REG(si)+2*16]) - AS2( movdqa xmm3, [WORD_REG(si)+3*16]) - AS2( mov eax, [WORD_REG(si)+4*16]) + AS2( shl AS_REG_1, 5) + ASJ( jz, 5, f) + AS2( mov AS_REG_6d, [AS_REG_2+4*17]) + AS2( add AS_REG_1, AS_REG_6) + + #if CRYPTOPP_BOOL_X64 + AS2( mov REG_loopEnd, AS_REG_1) + #else + AS1( push ebp) + AS1( push AS_REG_1) + #endif + + AS2( movdqa xmm0, XMMWORD PTR [AS_REG_2+0*16]) + AS2( movdqa xmm1, XMMWORD PTR [AS_REG_2+1*16]) + AS2( movdqa xmm2, XMMWORD PTR [AS_REG_2+2*16]) + AS2( movdqa xmm3, XMMWORD PTR [AS_REG_2+3*16]) + AS2( mov eax, dword ptr [AS_REG_2+4*16]) ASL(4) // gamma and pi #if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE - AS2( test WORD_REG(bx), 1) + AS2( test AS_REG_6, 1) ASJ( jnz, 6, f) #endif AS2( movdqa xmm6, xmm2) @@ -70,18 +101,18 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y) ASL(7) #endif - AS2( movd ecx, xmm2) - 
AS1( not ecx) - AS2( movd ebp, xmm3) - AS2( or ecx, ebp) - AS2( xor eax, ecx) + AS2( movd AS_REG_1d, xmm2) + AS1( not AS_REG_1d) + AS2( movd AS_REG_7d, xmm3) + AS2( or AS_REG_1d, AS_REG_7d) + AS2( xor eax, AS_REG_1d) #define SSE2_Index(i) ASM_MOD(((i)*13+16), 17) #define pi(i) \ - AS2( movd ecx, xmm7)\ - AS2( rol ecx, ASM_MOD((ASM_MOD(5*i,17)*(ASM_MOD(5*i,17)+1)/2), 32))\ - AS2( mov [WORD_REG(si)+SSE2_Index(ASM_MOD(5*(i), 17))*4], ecx) + AS2( movd AS_REG_1d, xmm7)\ + AS2( rol AS_REG_1d, ASM_MOD((ASM_MOD(5*i,17)*(ASM_MOD(5*i,17)+1)/2), 32))\ + AS2( mov [AS_REG_2+SSE2_Index(ASM_MOD(5*(i), 17))*4], AS_REG_1d) #define pi4(x, y, z, a, b, c, d) \ AS2( pcmpeqb xmm7, xmm7)\ @@ -110,65 +141,65 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y) AS2( punpckhdq xmm2, xmm0) // 11 12 15 16 // keystream - AS2( test WORD_REG(di), WORD_REG(di)) + AS2( test AS_REG_3, AS_REG_3) ASJ( jz, 0, f) AS2( movdqa xmm6, xmm4) AS2( punpcklqdq xmm4, xmm2) AS2( punpckhqdq xmm6, xmm2) - AS2( test WORD_REG(dx), 0xf) + AS2( test AS_REG_4, 15) ASJ( jnz, 2, f) - AS2( test WORD_REG(dx), WORD_REG(dx)) + AS2( test AS_REG_4, AS_REG_4) ASJ( jz, 1, f) - AS2( pxor xmm4, [WORD_REG(dx)]) - AS2( pxor xmm6, [WORD_REG(dx)+16]) - AS2( add WORD_REG(dx), 32) + AS2( pxor xmm4, [AS_REG_4]) + AS2( pxor xmm6, [AS_REG_4+16]) + AS2( add AS_REG_4, 32) ASJ( jmp, 1, f) ASL(2) - AS2( movdqu xmm0, [WORD_REG(dx)]) - AS2( movdqu xmm2, [WORD_REG(dx)+16]) + AS2( movdqu xmm0, [AS_REG_4]) + AS2( movdqu xmm2, [AS_REG_4+16]) AS2( pxor xmm4, xmm0) AS2( pxor xmm6, xmm2) - AS2( add WORD_REG(dx), 32) + AS2( add AS_REG_4, 32) ASL(1) - AS2( test WORD_REG(di), 0xf) + AS2( test AS_REG_3, 15) ASJ( jnz, 3, f) - AS2( movdqa [WORD_REG(di)], xmm4) - AS2( movdqa [WORD_REG(di)+16], xmm6) - AS2( add WORD_REG(di), 32) + AS2( movdqa XMMWORD PTR [AS_REG_3], xmm4) + AS2( movdqa XMMWORD PTR [AS_REG_3+16], xmm6) + AS2( add AS_REG_3, 32) ASJ( jmp, 0, f) ASL(3) - AS2( movdqu [WORD_REG(di)], xmm4) - AS2( movdqu [WORD_REG(di)+16], xmm6) - AS2( add WORD_REG(di), 32) + AS2( movdqu XMMWORD PTR [AS_REG_3], xmm4) + AS2( movdqu XMMWORD PTR [AS_REG_3+16], xmm6) + AS2( add AS_REG_3, 32) ASL(0) // buffer update - AS2( lea WORD_REG(cx), [WORD_REG(bx) + 32]) - AS2( and WORD_REG(cx), 31*32) - AS2( lea WORD_REG(bp), [WORD_REG(bx) + (32-24)*32]) - AS2( and WORD_REG(bp), 31*32) + AS2( lea AS_REG_1, [AS_REG_6 + 32]) + AS2( and AS_REG_1, 31*32) + AS2( lea AS_REG_7, [AS_REG_6 + (32-24)*32]) + AS2( and AS_REG_7, 31*32) - AS2( movdqa xmm0, [WORD_REG(si)+20*4+WORD_REG(cx)+0*8]) + AS2( movdqa xmm0, XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+0*8]) AS2( pxor xmm3, xmm0) ASS( pshufd xmm0, xmm0, 2, 3, 0, 1) - AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(cx)+0*8], xmm3) - AS2( pxor xmm0, [WORD_REG(si)+20*4+WORD_REG(bp)+2*8]) - AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(bp)+2*8], xmm0) + AS2( movdqa XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+0*8], xmm3) + AS2( pxor xmm0, XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+2*8]) + AS2( movdqa XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+2*8], xmm0) - AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+2*8]) + AS2( movdqa xmm4, XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+2*8]) AS2( pxor xmm1, xmm4) - AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(cx)+2*8], xmm1) - AS2( pxor xmm4, [WORD_REG(si)+20*4+WORD_REG(bp)+0*8]) - AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(bp)+0*8], xmm4) + AS2( movdqa XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+2*8], xmm1) + AS2( pxor xmm4, XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+0*8]) + AS2( movdqa XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+0*8], xmm4) // theta - AS2( movdqa xmm3, 
[WORD_REG(si)+3*16]) - AS2( movdqa xmm2, [WORD_REG(si)+2*16]) - AS2( movdqa xmm1, [WORD_REG(si)+1*16]) - AS2( movdqa xmm0, [WORD_REG(si)+0*16]) + AS2( movdqa xmm3, XMMWORD PTR [AS_REG_2+3*16]) + AS2( movdqa xmm2, XMMWORD PTR [AS_REG_2+2*16]) + AS2( movdqa xmm1, XMMWORD PTR [AS_REG_2+1*16]) + AS2( movdqa xmm0, XMMWORD PTR [AS_REG_2+0*16]) #if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE - AS2( test WORD_REG(bx), 1) + AS2( test AS_REG_6, 1) ASJ( jnz, 8, f) #endif AS2( movd xmm6, eax) @@ -199,10 +230,10 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y) #endif AS2( xor eax, 1) - AS2( movd ecx, xmm0) - AS2( xor eax, ecx) - AS2( movd ecx, xmm3) - AS2( xor eax, ecx) + AS2( movd AS_REG_1d, xmm0) + AS2( xor eax, AS_REG_1d) + AS2( movd AS_REG_1d, xmm3) + AS2( xor eax, AS_REG_1d) AS2( pxor xmm3, xmm2) AS2( pxor xmm2, xmm1) @@ -214,21 +245,21 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y) AS2( pxor xmm0, xmm4) // sigma - AS2( lea WORD_REG(cx), [WORD_REG(bx) + (32-4)*32]) - AS2( and WORD_REG(cx), 31*32) - AS2( lea WORD_REG(bp), [WORD_REG(bx) + 16*32]) - AS2( and WORD_REG(bp), 31*32) + AS2( lea AS_REG_1, [AS_REG_6 + (32-4)*32]) + AS2( and AS_REG_1, 31*32) + AS2( lea AS_REG_7, [AS_REG_6 + 16*32]) + AS2( and AS_REG_7, 31*32) - AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+0*16]) - AS2( movdqa xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+0*16]) + AS2( movdqa xmm4, XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+0*16]) + AS2( movdqa xmm5, XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+0*16]) AS2( movdqa xmm6, xmm4) AS2( punpcklqdq xmm4, xmm5) AS2( punpckhqdq xmm6, xmm5) AS2( pxor xmm3, xmm4) AS2( pxor xmm2, xmm6) - AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+1*16]) - AS2( movdqa xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+1*16]) + AS2( movdqa xmm4, XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+1*16]) + AS2( movdqa xmm5, XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+1*16]) AS2( movdqa xmm6, xmm4) AS2( punpcklqdq xmm4, xmm5) AS2( punpckhqdq xmm6, xmm5) @@ -236,31 +267,48 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y) AS2( pxor xmm0, xmm6) // loop - AS2( add WORD_REG(bx), 32) - AS2( cmp WORD_REG(bx), [WORD_REG(sp)]) + AS2( add AS_REG_6, 32) + AS2( cmp AS_REG_6, REG_loopEnd) ASJ( jne, 4, b) // save state - AS2( add WORD_REG(sp), WORD_SZ) - AS_POP( bp) - AS2( mov [WORD_REG(si)+4*16], eax) - AS2( movdqa [WORD_REG(si)+3*16], xmm3) - AS2( movdqa [WORD_REG(si)+2*16], xmm2) - AS2( movdqa [WORD_REG(si)+1*16], xmm1) - AS2( movdqa [WORD_REG(si)+0*16], xmm0) + AS2( mov [AS_REG_2+4*16], eax) + AS2( movdqa XMMWORD PTR [AS_REG_2+3*16], xmm3) + AS2( movdqa XMMWORD PTR [AS_REG_2+2*16], xmm2) + AS2( movdqa XMMWORD PTR [AS_REG_2+1*16], xmm1) + AS2( movdqa XMMWORD PTR [AS_REG_2+0*16], xmm0) + + #if CRYPTOPP_BOOL_X86 + AS2( add esp, 4) + AS1( pop ebp) + #endif ASL(5) #ifdef __GNUC__ - AS_POP( bx) - ".att_syntax prefix;" - : - : "c" (count), "S" (state), "D" (z), "d" (y) - : "%eax", "memory", "cc" + AS_POP_IF86( bx) + ".att_syntax prefix;" + : + #if CRYPTOPP_BOOL_X64 + : "D" (count), "S" (state), "d" (z), "c" (y) + : "%r8", "%r9", "r10", "%eax", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7" + #else + : "c" (count), "d" (state), "S" (z), "D" (y) + : "%eax", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7" + #endif ); #endif +#ifdef CRYPTOPP_GENERATE_X64_MASM + movdqa xmm6, [rsp + 0h] + movdqa xmm7, [rsp + 10h] + add rsp, 2*16+8 + ret + Panama_SSE2_Pull ENDP +#else } - #endif +#endif // #ifdef 
CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + +#ifndef CRYPTOPP_GENERATE_X64_MASM template <class B> void Panama<B>::Iterate(size_t count, const word32 *p, word32 *z, const word32 *y) @@ -411,7 +459,7 @@ void PanamaCipherPolicy<B>::CipherResynchronize(byte *keystreamBuffer, const byt this->Iterate(1, buf); } -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2()) Panama_SSE2_Pull(32, this->m_state, NULL, NULL); else @@ -423,7 +471,7 @@ void PanamaCipherPolicy<B>::CipherResynchronize(byte *keystreamBuffer, const byt template <class B> unsigned int PanamaCipherPolicy<B>::GetAlignment() const { -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2()) return 16; else @@ -435,7 +483,7 @@ unsigned int PanamaCipherPolicy<B>::GetAlignment() const template <class B> void PanamaCipherPolicy<B>::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) { -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2()) Panama_SSE2_Pull(iterationCount, this->m_state, (word32 *)output, (const word32 *)input); else @@ -453,3 +501,5 @@ template class PanamaCipherPolicy<BigEndian>; template class PanamaCipherPolicy<LittleEndian>; NAMESPACE_END + +#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM diff --git a/rijndael.cpp b/rijndael.cpp index ac4f769..b89e3b3 100644 --- a/rijndael.cpp +++ b/rijndael.cpp @@ -2,6 +2,8 @@ // and Wei Dai from Paulo Baretto's Rijndael implementation // The original code and all modifications are in the public domain. +// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code + /* Defense against timing attacks was added in July 2006 by Wei Dai. @@ -48,6 +50,7 @@ being unloaded from L1 cache, until that round is finished. 
#include "pch.h" #ifndef CRYPTOPP_IMPORTS +#ifndef CRYPTOPP_GENERATE_X64_MASM #include "rijndael.h" #include "misc.h" @@ -145,27 +148,56 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key + m_rounds*4, m_key + m_rounds*4, 16); } +#ifdef CRYPTOPP_X64_MASM_AVAILABLE +extern "C" { +void Rijndael_Enc_ProcessAndXorBlock(const word32 *table, word32 cacheLineSize, const word32 *k, const word32 *kLoopEnd, const byte *inBlock, const byte *xorBlock, byte *outBlock); +} +#endif + #pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const { +#endif // #ifdef CRYPTOPP_GENERATE_X64_MASM + +#ifdef CRYPTOPP_X64_MASM_AVAILABLE + Rijndael_Enc_ProcessAndXorBlock(Te, g_cacheLineSize, m_key, m_key + m_rounds*4, inBlock, xorBlock, outBlock); + return; +#endif + #if defined(CRYPTOPP_X86_ASM_AVAILABLE) + #ifdef CRYPTOPP_GENERATE_X64_MASM + ALIGN 8 + Rijndael_Enc_ProcessAndXorBlock PROC FRAME + rex_push_reg rbx + push_reg rsi + push_reg rdi + push_reg r12 + push_reg r13 + push_reg r14 + push_reg r15 + .endprolog + mov AS_REG_7, rcx + mov rdi, [rsp + 5*8 + 7*8] ; inBlock + #else if (HasMMX()) { const word32 *k = m_key; const word32 *kLoopEnd = k + m_rounds*4; + #endif + #if CRYPTOPP_BOOL_X64 #define K_REG r8 #define K_END_REG r9 #define SAVE_K #define RESTORE_K #define RESTORE_K_END - #define SAVE_0(x) AS2(mov r10d, x) - #define SAVE_1(x) AS2(mov r11d, x) - #define SAVE_2(x) AS2(mov r12d, x) - #define RESTORE_0(x) AS2(mov x, r10d) - #define RESTORE_1(x) AS2(mov x, r11d) - #define RESTORE_2(x) AS2(mov x, r12d) + #define SAVE_0(x) AS2(mov r13d, x) + #define SAVE_1(x) AS2(mov r14d, x) + #define SAVE_2(x) AS2(mov r15d, x) + #define RESTORE_0(x) AS2(mov x, r13d) + #define RESTORE_1(x) AS2(mov x, r14d) + #define RESTORE_2(x) AS2(mov x, r15d) #else #define K_REG esi #define K_END_REG edi @@ -184,22 +216,16 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock __asm__ __volatile__ ( ".intel_syntax noprefix;" - AS_PUSH( bx) - AS_PUSH( bp) - AS2( mov WORD_REG(bp), WORD_REG(ax)) #if CRYPTOPP_BOOL_X64 - // save these manually. 
clobber list doesn't seem to work as of GCC 4.1.0 - AS1( pushq K_REG) - AS1( pushq K_END_REG) - AS1( pushq r10) - AS1( pushq r11) - AS1( pushq r12) AS2( mov K_REG, rsi) AS2( mov K_END_REG, rcx) #else + AS1( push ebx) + AS1( push ebp) AS2( movd mm5, ecx) #endif -#else + AS2( mov AS_REG_7, WORD_REG(ax)) +#elif CRYPTOPP_BOOL_X86 #if _MSC_VER < 1300 const word32 *t = Te; AS2( mov eax, t) @@ -209,12 +235,12 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock AS2( mov K_REG, k) AS2( movd mm5, kLoopEnd) #if _MSC_VER < 1300 - AS_PUSH( bx) - AS_PUSH( bp) - AS2( mov ebp, eax) + AS1( push ebx) + AS1( push ebp) + AS2( mov AS_REG_7, eax) #else - AS_PUSH( bp) - AS2( lea ebp, Te) + AS1( push ebp) + AS2( lea AS_REG_7, Te) #endif #endif AS2( mov eax, [K_REG+0*4]) // s0 @@ -236,21 +262,21 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock AS2( and ebx, 0) AS2( mov edi, ebx) // make index depend on previous loads to simulate lfence ASL(2) - AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)]) + AS2( and ebx, [AS_REG_7+WORD_REG(di)]) AS2( add edi, edx) - AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)]) + AS2( and ebx, [AS_REG_7+WORD_REG(di)]) AS2( add edi, edx) - AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)]) + AS2( and ebx, [AS_REG_7+WORD_REG(di)]) AS2( add edi, edx) - AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)]) + AS2( and ebx, [AS_REG_7+WORD_REG(di)]) AS2( add edi, edx) AS2( cmp edi, 1024) ASJ( jl, 2, b) - AS2( and ebx, [WORD_REG(bp)+1020]) + AS2( and ebx, [AS_REG_7+1020]) #if CRYPTOPP_BOOL_X64 - AS2( xor r10d, ebx) - AS2( xor r11d, ebx) - AS2( xor r12d, ebx) + AS2( xor r13d, ebx) + AS2( xor r14d, ebx) + AS2( xor r15d, ebx) #else AS2( movd mm6, ebx) AS2( pxor mm2, mm6) @@ -268,14 +294,14 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock #define QUARTER_ROUND(t, a, b, c, d) \ AS2(movzx esi, t##l)\ - AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)])\ + AS2(d, [AS_REG_7+0*1024+4*WORD_REG(si)])\ AS2(movzx esi, t##h)\ - AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\ + AS2(c, [AS_REG_7+1*1024+4*WORD_REG(si)])\ AS2(shr e##t##x, 16)\ AS2(movzx esi, t##l)\ - AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\ + AS2(b, [AS_REG_7+2*1024+4*WORD_REG(si)])\ AS2(movzx esi, t##h)\ - AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)]) + AS2(a, [AS_REG_7+3*1024+4*WORD_REG(si)]) #define s0 xor edi #define s1 xor eax @@ -308,14 +334,14 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock #define QUARTER_ROUND(t, a, b, c, d) \ AS2(movzx esi, t##l)\ - AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)])\ + AS2(a, [AS_REG_7+3*1024+4*WORD_REG(si)])\ AS2(movzx esi, t##h)\ - AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\ + AS2(b, [AS_REG_7+2*1024+4*WORD_REG(si)])\ AS2(shr e##t##x, 16)\ AS2(movzx esi, t##l)\ - AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\ + AS2(c, [AS_REG_7+1*1024+4*WORD_REG(si)])\ AS2(movzx esi, t##h)\ - AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)]) + AS2(d, [AS_REG_7+0*1024+4*WORD_REG(si)]) QUARTER_ROUND(d, s0, s1, s2, s3) RESTORE_2(edx) @@ -369,20 +395,20 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock #define QUARTER_ROUND(a, b, c, d) \ AS2( movzx ebx, dl)\ - AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\ + AS2( movzx ebx, BYTE PTR [AS_REG_7+1+4*WORD_REG(bx)])\ AS2( shl ebx, 3*8)\ AS2( xor a, ebx)\ AS2( movzx ebx, dh)\ - AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\ + AS2( movzx ebx, BYTE PTR [AS_REG_7+1+4*WORD_REG(bx)])\ AS2( shl ebx, 2*8)\ AS2( xor b, ebx)\ AS2( shr edx, 
16)\ AS2( movzx ebx, dl)\ AS2( shr edx, 8)\ - AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\ + AS2( movzx ebx, BYTE PTR [AS_REG_7+1+4*WORD_REG(bx)])\ AS2( shl ebx, 1*8)\ AS2( xor c, ebx)\ - AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(dx)])\ + AS2( movzx ebx, BYTE PTR [AS_REG_7+1+4*WORD_REG(dx)])\ AS2( xor d, ebx) QUARTER_ROUND(eax, ecx, esi, edi) @@ -395,25 +421,22 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock #undef QUARTER_ROUND -#if CRYPTOPP_BOOL_X64 - AS1(popq r12) - AS1(popq r11) - AS1(popq r10) - AS1(popq K_END_REG) - AS1(popq K_REG) -#else +#if CRYPTOPP_BOOL_X86 AS1(emms) + AS1(pop ebp) + #if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300) + AS1(pop ebx) + #endif #endif - AS_POP( bp) -#if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300) - AS_POP( bx) -#endif #ifdef __GNUC__ ".att_syntax prefix;" : "=a" (t0), "=c" (t1), "=S" (t2), "=D" (t3) : "a" (Te), "D" (inBlock), "S" (k), "c" (kLoopEnd), "d" (g_cacheLineSize) : "memory", "cc" + #if CRYPTOPP_BOOL_X64 + , "%ebx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" + #endif ); if (xorBlock) @@ -428,7 +451,11 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock ((word32 *)outBlock)[2] = t2; ((word32 *)outBlock)[3] = t3; #else - AS2( mov WORD_REG(bx), xorBlock) + #if CRYPTOPP_BOOL_X64 + mov rbx, [rsp + 6*8 + 7*8] ; xorBlock + #else + AS2( mov ebx, xorBlock) + #endif AS2( test WORD_REG(bx), WORD_REG(bx)) ASJ( jz, 1, f) AS2( xor eax, [WORD_REG(bx)+0*4]) @@ -436,15 +463,33 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock AS2( xor esi, [WORD_REG(bx)+2*4]) AS2( xor edi, [WORD_REG(bx)+3*4]) ASL(1) - AS2( mov WORD_REG(bx), outBlock) + #if CRYPTOPP_BOOL_X64 + mov rbx, [rsp + 7*8 + 7*8] ; outBlock + #else + AS2( mov ebx, outBlock) + #endif AS2( mov [WORD_REG(bx)+0*4], eax) AS2( mov [WORD_REG(bx)+1*4], ecx) AS2( mov [WORD_REG(bx)+2*4], esi) AS2( mov [WORD_REG(bx)+3*4], edi) #endif + +#if CRYPTOPP_GENERATE_X64_MASM + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbx + ret + Rijndael_Enc_ProcessAndXorBlock ENDP +#else } else +#endif #endif // #ifdef CRYPTOPP_X86_ASM_AVAILABLE +#ifndef CRYPTOPP_GENERATE_X64_MASM { word32 s0, s1, s2, s3, t0, t1, t2, t3; const word32 *rk = m_key; @@ -674,3 +719,4 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock NAMESPACE_END #endif +#endif @@ -1,6 +1,11 @@ // salsa.cpp - written and placed in the public domain by Wei Dai +// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM salsa.cpp" to generate MASM code + #include "pch.h" + +#ifndef CRYPTOPP_GENERATE_X64_MASM + #include "salsa.h" #include "misc.h" #include "argnames.h" @@ -53,7 +58,7 @@ void Salsa20_Policy::SeekToIteration(lword iterationCount) #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64 unsigned int Salsa20_Policy::GetAlignment() const { -#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE if (HasSSE2()) return 16; else @@ -63,7 +68,7 @@ unsigned int Salsa20_Policy::GetAlignment() const unsigned int Salsa20_Policy::GetOptimalBlockSize() const { -#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE if (HasSSE2()) return 4*BYTES_PER_ITERATION; else @@ -72,267 +77,489 @@ unsigned int Salsa20_Policy::GetOptimalBlockSize() const } #endif +#ifdef CRYPTOPP_X64_MASM_AVAILABLE +extern "C" { +void Salsa20_OperateKeystream(byte *output, const byte *input, size_t iterationCount, int rounds, void *state); +} +#endif + 
void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) { - int i; -#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE - #define SSE2_QUARTER_ROUND(a, b, d, i) {\ - __m128i t = _mm_add_epi32(a, d); \ - b = _mm_xor_si128(b, _mm_slli_epi32(t, i)); \ - b = _mm_xor_si128(b, _mm_srli_epi32(t, 32-i));} +#endif // #ifdef CRYPTOPP_GENERATE_X64_MASM + +#ifdef CRYPTOPP_X64_MASM_AVAILABLE + Salsa20_OperateKeystream(output, input, iterationCount, m_rounds, m_state.data()); + return; +#endif +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE +#ifdef CRYPTOPP_GENERATE_X64_MASM + ALIGN 8 + Salsa20_OperateKeystream PROC FRAME + mov r10, [rsp + 5*8] ; state + alloc_stack(10*16 + 32*16 + 8) + save_xmm128 xmm6, 0200h + save_xmm128 xmm7, 0210h + save_xmm128 xmm8, 0220h + save_xmm128 xmm9, 0230h + save_xmm128 xmm10, 0240h + save_xmm128 xmm11, 0250h + save_xmm128 xmm12, 0260h + save_xmm128 xmm13, 0270h + save_xmm128 xmm14, 0280h + save_xmm128 xmm15, 0290h + .endprolog + + #define REG_output rcx + #define REG_input rdx + #define REG_iterationCount r8 + #define REG_state r10 + #define REG_rounds eax + #define REG_temp32 r11d + #define REG_temp r11 + #define SSE2_WORKSPACE rsp + #define SSE2_LOAD_ROUNDS mov eax, r9d +#else if (HasSSE2()) { - __m128i *s = (__m128i *)m_state.data(); - -#if _MSC_VER > 1400 || (defined(_MSC_VER) && CRYPTOPP_BOOL_X86) || (CRYPTOPP_GCC_VERSION >= 40000 && CRYPTOPP_BOOL_X86) - // This code triggers an internal compiler error on MSVC 2005 when compiling - // for x64 with optimizations on. hopefully it will get fixed in the next release. - // A bug report has been submitted at http://connect.microsoft.com/VisualStudio/feedback/ViewFeedback.aspx?FeedbackID=274123 - // Also, GCC 3.4.4 generates incorrect code for x86 at -O2. 
- // GCC 4.1.1 generates incorrect code for x64 at -O2 - if (iterationCount >= 4) - { - __m128i ss[16]; - ss[0] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(0, 0, 0, 0)); - ss[1] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(1, 1, 1, 1)); - ss[2] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(2, 2, 2, 2)); - ss[3] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(3, 3, 3, 3)); - ss[4] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(0, 0, 0, 0)); - ss[6] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(2, 2, 2, 2)); - ss[7] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(3, 3, 3, 3)); - ss[9] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(1, 1, 1, 1)); - ss[10] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(2, 2, 2, 2)); - ss[11] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(3, 3, 3, 3)); - ss[12] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(0, 0, 0, 0)); - ss[13] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(1, 1, 1, 1)); - ss[14] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(2, 2, 2, 2)); - ss[15] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(3, 3, 3, 3)); - - do - { - word32 *countersLo = (word32*)&(ss[8]), *countersHi = (word32*)&(ss[5]); - for (i=0; i<4; i++) - { - countersLo[i] = m_state[8]; - countersHi[i] = m_state[5]; - if (++m_state[8] == 0) - ++m_state[5]; - } - - __m128i x0 = ss[0]; - __m128i x1 = ss[1]; - __m128i x2 = ss[2]; - __m128i x3 = ss[3]; - __m128i x4 = ss[4]; - __m128i x5 = ss[5]; - __m128i x6 = ss[6]; - __m128i x7 = ss[7]; - __m128i x8 = ss[8]; - __m128i x9 = ss[9]; - __m128i x10 = ss[10]; - __m128i x11 = ss[11]; - __m128i x12 = ss[12]; - __m128i x13 = ss[13]; - __m128i x14 = ss[14]; - __m128i x15 = ss[15]; - - for (i=m_rounds; i>0; i-=2) - { - #define QUARTER_ROUND(a, b, c, d) \ - SSE2_QUARTER_ROUND(a, b, d, 7) \ - SSE2_QUARTER_ROUND(b, c, a, 9) \ - SSE2_QUARTER_ROUND(c, d, b, 13) \ - SSE2_QUARTER_ROUND(d, a, c, 18) - - QUARTER_ROUND(x0, x4, x8, x12) - QUARTER_ROUND(x1, x5, x9, x13) - QUARTER_ROUND(x2, x6, x10, x14) - QUARTER_ROUND(x3, x7, x11, x15) - - QUARTER_ROUND(x0, x13, x10, x7) - QUARTER_ROUND(x1, x14, x11, x4) - QUARTER_ROUND(x2, x15, x8, x5) - QUARTER_ROUND(x3, x12, x9, x6) - - #undef QUARTER_ROUND - } - - x0 = _mm_add_epi32(x0, ss[0]); - x1 = _mm_add_epi32(x1, ss[1]); - x2 = _mm_add_epi32(x2, ss[2]); - x3 = _mm_add_epi32(x3, ss[3]); - x4 = _mm_add_epi32(x4, ss[4]); - x5 = _mm_add_epi32(x5, ss[5]); - x6 = _mm_add_epi32(x6, ss[6]); - x7 = _mm_add_epi32(x7, ss[7]); - x8 = _mm_add_epi32(x8, ss[8]); - x9 = _mm_add_epi32(x9, ss[9]); - x10 = _mm_add_epi32(x10, ss[10]); - x11 = _mm_add_epi32(x11, ss[11]); - x12 = _mm_add_epi32(x12, ss[12]); - x13 = _mm_add_epi32(x13, ss[13]); - x14 = _mm_add_epi32(x14, ss[14]); - x15 = _mm_add_epi32(x15, ss[15]); - - #define OUTPUT_4(x, a, b, c, d, e, f, g, h) {\ - __m128i t0 = _mm_unpacklo_epi32(a, b);\ - __m128i t1 = _mm_unpacklo_epi32(c, d);\ - __m128i t2 = _mm_unpacklo_epi64(t0, t1);\ - CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, e, t2)\ - t2 = _mm_unpackhi_epi64(t0, t1);\ - CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, f, t2)\ - t0 = _mm_unpackhi_epi32(a, b);\ - t1 = _mm_unpackhi_epi32(c, d);\ - t2 = _mm_unpacklo_epi64(t0, t1);\ - CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, g, t2)\ - t2 = _mm_unpackhi_epi64(t0, t1);\ - CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, h, t2)} - - #define SALSA_OUTPUT(x) \ - OUTPUT_4(x, x0, x13, x10, x7, 0, 4, 8, 12)\ - OUTPUT_4(x, x4, x1, x14, x11, 1, 5, 9, 13)\ - OUTPUT_4(x, x8, x5, x2, x15, 2, 6, 10, 14)\ - OUTPUT_4(x, x12, x9, x6, x3, 3, 7, 11, 15) - - CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, 4*BYTES_PER_ITERATION) - - #undef SALSA_OUTPUT - } while ((iterationCount-=4) >= 4); - } + #if CRYPTOPP_BOOL_X64 + #define REG_output %4 + #define REG_input %1 + 
#define REG_iterationCount %2 + #define REG_state %3 + #define REG_rounds eax + #define REG_temp32 edx + #define REG_temp rdx + #define SSE2_WORKSPACE %5 + #define SSE2_LOAD_ROUNDS AS2(mov eax, %0) + + __m128i workspace[32]; + #else + #define REG_output edi + #define REG_input eax + #define REG_iterationCount ecx + #define REG_state esi + #define REG_rounds ebx + #define REG_temp32 edx + #define REG_temp edx + #define SSE2_WORKSPACE esp + WORD_SZ + #ifdef __GNUC__ + // this assumes that a frame pointer is used + #define SSE2_LOAD_ROUNDS ".att_syntax prefix;movl %0, %%ebx;.intel_syntax noprefix;" + #else + #define SSE2_LOAD_ROUNDS AS2(mov REG_rounds, r) + #endif + #endif + + word32 r = m_rounds; + + #ifdef __GNUC__ + __asm__ __volatile__ + ( + ".intel_syntax noprefix;" + AS_PUSH_IF86( bx) + #else + void *s = m_state.data(); + + AS2( mov REG_iterationCount, iterationCount) + AS2( mov REG_state, s) + AS2( mov REG_input, input) + AS2( mov REG_output, output) + #endif +#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM + + AS2( cmp REG_iterationCount, 4) + ASJ( jl, 5, f) + +#if CRYPTOPP_BOOL_X86 + AS2( mov ebx, esp) + AS2( and esp, -16) + AS2( sub esp, 32*16) + AS1( push ebx) #endif - if (!IsP4() && iterationCount > 0) - { - const __m128i s_maskLo32 = _mm_shuffle_epi32(_mm_cvtsi32_si128(-1), _MM_SHUFFLE(1, 0, 1, 0)); - const __m128i s_maskHi32 = _mm_slli_epi64(s_maskLo32, 32); - - do - { - __m128i x0 = s[0]; - __m128i x1 = s[1]; - __m128i x2 = s[2]; - __m128i x3 = s[3]; - - for (i=m_rounds; i>0; i-=2) - { - SSE2_QUARTER_ROUND(x0, x1, x3, 7) - SSE2_QUARTER_ROUND(x1, x2, x0, 9) - SSE2_QUARTER_ROUND(x2, x3, x1, 13) - SSE2_QUARTER_ROUND(x3, x0, x2, 18) - - x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(2, 1, 0, 3)); - x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2)); - x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(0, 3, 2, 1)); - - SSE2_QUARTER_ROUND(x0, x3, x1, 7) - SSE2_QUARTER_ROUND(x3, x2, x0, 9) - SSE2_QUARTER_ROUND(x2, x1, x3, 13) - SSE2_QUARTER_ROUND(x1, x0, x2, 18) - - x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(0, 3, 2, 1)); - x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2)); - x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(2, 1, 0, 3)); - } - - x0 = _mm_add_epi32(x0, s[0]); - x1 = _mm_add_epi32(x1, s[1]); - x2 = _mm_add_epi32(x2, s[2]); - x3 = _mm_add_epi32(x3, s[3]); - - if (++m_state[8] == 0) - ++m_state[5]; - - __m128i k02 = _mm_or_si128(_mm_slli_epi64(x0, 32), _mm_srli_epi64(x3, 32)); - k02 = _mm_shuffle_epi32(k02, _MM_SHUFFLE(0, 1, 2, 3)); - __m128i k13 = _mm_or_si128(_mm_slli_epi64(x1, 32), _mm_srli_epi64(x0, 32)); - k13 = _mm_shuffle_epi32(k13, _MM_SHUFFLE(0, 1, 2, 3)); - __m128i k20 = _mm_or_si128(_mm_and_si128(x2, s_maskLo32), _mm_and_si128(x1, s_maskHi32)); - __m128i k31 = _mm_or_si128(_mm_and_si128(x3, s_maskLo32), _mm_and_si128(x2, s_maskHi32)); - - __m128i k0 = _mm_unpackhi_epi64(k02, k20); - __m128i k1 = _mm_unpackhi_epi64(k13, k31); - __m128i k2 = _mm_unpacklo_epi64(k20, k02); - __m128i k3 = _mm_unpacklo_epi64(k31, k13); - - #define SSE2_OUTPUT(x) {\ - CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 0, k0)\ - CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 1, k1)\ - CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 2, k2)\ - CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 3, k3)} - - CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SSE2_OUTPUT, BYTES_PER_ITERATION); - } - while (--iterationCount); - } +#define SSE2_EXPAND_S(i, j) \ + ASS( pshufd xmm4, xmm##i, j, j, j, j) \ + AS2( movdqa [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4) + + AS2( movdqa xmm0, [REG_state + 0*16]) + AS2( movdqa xmm1, [REG_state + 1*16]) + AS2( movdqa xmm2, [REG_state + 2*16]) + AS2( movdqa xmm3, 
[REG_state + 3*16]) + SSE2_EXPAND_S(0, 0) + SSE2_EXPAND_S(0, 1) + SSE2_EXPAND_S(0, 2) + SSE2_EXPAND_S(0, 3) + SSE2_EXPAND_S(1, 0) + SSE2_EXPAND_S(1, 2) + SSE2_EXPAND_S(1, 3) + SSE2_EXPAND_S(2, 1) + SSE2_EXPAND_S(2, 2) + SSE2_EXPAND_S(2, 3) + SSE2_EXPAND_S(3, 0) + SSE2_EXPAND_S(3, 1) + SSE2_EXPAND_S(3, 2) + SSE2_EXPAND_S(3, 3) + +#define SSE2_EXPAND_S85(i) \ + AS2( mov dword ptr [SSE2_WORKSPACE + 8*16 + i*4 + 256], REG_rounds) \ + AS2( mov dword ptr [SSE2_WORKSPACE + 5*16 + i*4 + 256], REG_temp32) \ + AS2( add REG_rounds, 1) \ + AS2( adc REG_temp32, 0) + + ASL(1) + AS2( mov REG_rounds, dword ptr [REG_state + 8*4]) + AS2( mov REG_temp32, dword ptr [REG_state + 5*4]) + SSE2_EXPAND_S85(0) + SSE2_EXPAND_S85(1) + SSE2_EXPAND_S85(2) + SSE2_EXPAND_S85(3) + AS2( mov dword ptr [REG_state + 8*4], REG_rounds) + AS2( mov dword ptr [REG_state + 5*4], REG_temp32) + +#define SSE2_QUARTER_ROUND(a, b, d, i) \ + AS2( movdqa xmm4, xmm##d) \ + AS2( paddd xmm4, xmm##a) \ + AS2( movdqa xmm5, xmm4) \ + AS2( pslld xmm4, i) \ + AS2( psrld xmm5, 32-i) \ + AS2( pxor xmm##b, xmm4) \ + AS2( pxor xmm##b, xmm5) + +#define L01(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) /* y3 */ +#define L02(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##C, [SSE2_WORKSPACE + a*16 + i*256]) /* y0 */ +#define L03(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* y0+y3 */ +#define L04(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A) +#define L05(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 7) +#define L06(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-7) +#define L07(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256]) +#define L08(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z1 */ +#define L09(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + b*16], xmm##A) +#define L10(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A) +#define L11(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* z1+y0 */ +#define L12(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A) +#define L13(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 9) +#define L14(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-9) +#define L15(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256]) +#define L16(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z2 */ +#define L17(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + c*16], xmm##A) +#define L18(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A) +#define L19(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##B) /* z2+z1 */ +#define L20(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A) +#define L21(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 13) +#define L22(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-13) +#define L23(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) +#define L24(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z3 */ +#define L25(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + d*16], xmm##A) +#define L26(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##D) /* z3+z2 */ +#define L27(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A) +#define L28(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 18) +#define L29(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-18) +#define L30(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##C) /* xor y0 */ +#define L31(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z0 */ +#define L32(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + a*16], xmm##A) + +#define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h) \ + L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) \ + L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) \ + L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) \ + L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) \ + 
L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) \ + L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) \ + L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) \ + L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) \ + L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) \ + L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) \ + L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) \ + L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) \ + L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) \ + L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) \ + L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) \ + L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) \ + L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) \ + L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) \ + L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) \ + L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) \ + L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) \ + L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) \ + L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) \ + L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) \ + L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) \ + L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) \ + L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) \ + L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) \ + L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) \ + L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) \ + L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) \ + L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i) + +#define SSE2_QUARTER_ROUND_X16(i, a, b, c, d, e, f, g, h, A, B, C, D, E, F, G, H) \ + L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) L01(8,9,10,11, A,B,C,D, i) L01(12,13,14,15, E,F,G,H, i) \ + L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) L02(8,9,10,11, A,B,C,D, i) L02(12,13,14,15, E,F,G,H, i) \ + L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) L03(8,9,10,11, A,B,C,D, i) L03(12,13,14,15, E,F,G,H, i) \ + L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) L04(8,9,10,11, A,B,C,D, i) L04(12,13,14,15, E,F,G,H, i) \ + L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) L05(8,9,10,11, A,B,C,D, i) L05(12,13,14,15, E,F,G,H, i) \ + L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) L06(8,9,10,11, A,B,C,D, i) L06(12,13,14,15, E,F,G,H, i) \ + L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) L07(8,9,10,11, A,B,C,D, i) L07(12,13,14,15, E,F,G,H, i) \ + L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) L08(8,9,10,11, A,B,C,D, i) L08(12,13,14,15, E,F,G,H, i) \ + L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) L09(8,9,10,11, A,B,C,D, i) L09(12,13,14,15, E,F,G,H, i) \ + L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) L10(8,9,10,11, A,B,C,D, i) L10(12,13,14,15, E,F,G,H, i) \ + L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) L11(8,9,10,11, A,B,C,D, i) L11(12,13,14,15, E,F,G,H, i) \ + L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) L12(8,9,10,11, A,B,C,D, i) L12(12,13,14,15, E,F,G,H, i) \ + L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) L13(8,9,10,11, A,B,C,D, i) L13(12,13,14,15, E,F,G,H, i) \ + L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) L14(8,9,10,11, A,B,C,D, i) L14(12,13,14,15, E,F,G,H, i) \ + L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) L15(8,9,10,11, A,B,C,D, i) L15(12,13,14,15, E,F,G,H, i) \ + L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) L16(8,9,10,11, A,B,C,D, i) L16(12,13,14,15, E,F,G,H, i) \ + L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) L17(8,9,10,11, A,B,C,D, i) L17(12,13,14,15, E,F,G,H, i) \ + L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) L18(8,9,10,11, A,B,C,D, i) 
L18(12,13,14,15, E,F,G,H, i) \ + L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) L19(8,9,10,11, A,B,C,D, i) L19(12,13,14,15, E,F,G,H, i) \ + L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) L20(8,9,10,11, A,B,C,D, i) L20(12,13,14,15, E,F,G,H, i) \ + L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) L21(8,9,10,11, A,B,C,D, i) L21(12,13,14,15, E,F,G,H, i) \ + L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) L22(8,9,10,11, A,B,C,D, i) L22(12,13,14,15, E,F,G,H, i) \ + L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) L23(8,9,10,11, A,B,C,D, i) L23(12,13,14,15, E,F,G,H, i) \ + L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) L24(8,9,10,11, A,B,C,D, i) L24(12,13,14,15, E,F,G,H, i) \ + L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) L25(8,9,10,11, A,B,C,D, i) L25(12,13,14,15, E,F,G,H, i) \ + L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) L26(8,9,10,11, A,B,C,D, i) L26(12,13,14,15, E,F,G,H, i) \ + L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) L27(8,9,10,11, A,B,C,D, i) L27(12,13,14,15, E,F,G,H, i) \ + L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) L28(8,9,10,11, A,B,C,D, i) L28(12,13,14,15, E,F,G,H, i) \ + L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) L29(8,9,10,11, A,B,C,D, i) L29(12,13,14,15, E,F,G,H, i) \ + L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) L30(8,9,10,11, A,B,C,D, i) L30(12,13,14,15, E,F,G,H, i) \ + L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) L31(8,9,10,11, A,B,C,D, i) L31(12,13,14,15, E,F,G,H, i) \ + L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i) L32(8,9,10,11, A,B,C,D, i) L32(12,13,14,15, E,F,G,H, i) + +#if CRYPTOPP_BOOL_X64 + SSE2_QUARTER_ROUND_X16(1, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15) +#else + SSE2_QUARTER_ROUND_X8(1, 2, 6, 10, 14, 3, 7, 11, 15) + SSE2_QUARTER_ROUND_X8(1, 0, 4, 8, 12, 1, 5, 9, 13) +#endif + SSE2_LOAD_ROUNDS + ASJ( jmp, 2, f) + + ASL(SSE2_Salsa_Output) + AS2( movdqa xmm0, xmm4) + AS2( punpckldq xmm4, xmm5) + AS2( movdqa xmm1, xmm6) + AS2( punpckldq xmm6, xmm7) + AS2( movdqa xmm2, xmm4) + AS2( punpcklqdq xmm4, xmm6) // e + AS2( punpckhqdq xmm2, xmm6) // f + AS2( punpckhdq xmm0, xmm5) + AS2( punpckhdq xmm1, xmm7) + AS2( movdqa xmm6, xmm0) + AS2( punpcklqdq xmm0, xmm1) // g + AS2( punpckhqdq xmm6, xmm1) // h + AS_XMM_OUTPUT4(SSE2_Salsa_Output_A, REG_input, REG_output, 4, 2, 0, 6, 1, 0, 4, 8, 12, 1) + AS1( ret) + + ASL(6) +#if CRYPTOPP_BOOL_X64 + SSE2_QUARTER_ROUND_X16(0, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15) + ASL(2) + SSE2_QUARTER_ROUND_X16(0, 0, 13, 10, 7, 1, 14, 11, 4, 2, 15, 8, 5, 3, 12, 9, 6) +#else + SSE2_QUARTER_ROUND_X8(0, 2, 6, 10, 14, 3, 7, 11, 15) + SSE2_QUARTER_ROUND_X8(0, 0, 4, 8, 12, 1, 5, 9, 13) + ASL(2) + SSE2_QUARTER_ROUND_X8(0, 2, 15, 8, 5, 3, 12, 9, 6) + SSE2_QUARTER_ROUND_X8(0, 0, 13, 10, 7, 1, 14, 11, 4) +#endif + AS2( sub REG_rounds, 2) + ASJ( jnz, 6, b) + +#define SSE2_OUTPUT_4(a, b, c, d) \ + AS2( movdqa xmm4, [SSE2_WORKSPACE + a*16 + 256])\ + AS2( paddd xmm4, [SSE2_WORKSPACE + a*16])\ + AS2( movdqa xmm5, [SSE2_WORKSPACE + b*16 + 256])\ + AS2( paddd xmm5, [SSE2_WORKSPACE + b*16])\ + AS2( movdqa xmm6, [SSE2_WORKSPACE + c*16 + 256])\ + AS2( paddd xmm6, [SSE2_WORKSPACE + c*16])\ + AS2( movdqa xmm7, [SSE2_WORKSPACE + d*16 + 256])\ + AS2( paddd xmm7, [SSE2_WORKSPACE + d*16])\ + ASC( call, SSE2_Salsa_Output) + + SSE2_OUTPUT_4(0, 13, 10, 7) + SSE2_OUTPUT_4(4, 1, 14, 11) + SSE2_OUTPUT_4(8, 5, 2, 15) + SSE2_OUTPUT_4(12, 9, 6, 3) + AS2( test REG_input, REG_input) + ASJ( jz, 9, f) + AS2( add REG_input, 12*16) + ASL(9) + AS2( add REG_output, 12*16) + AS2( sub REG_iterationCount, 4) + 
AS2( cmp REG_iterationCount, 4) + ASJ( jge, 1, b) + AS_POP_IF86( sp) + + ASL(5) + AS2( sub REG_iterationCount, 1) + ASJ( jl, 4, f) + AS2( movdqa xmm0, [REG_state + 0*16]) + AS2( movdqa xmm1, [REG_state + 1*16]) + AS2( movdqa xmm2, [REG_state + 2*16]) + AS2( movdqa xmm3, [REG_state + 3*16]) + SSE2_LOAD_ROUNDS + + ASL(0) + SSE2_QUARTER_ROUND(0, 1, 3, 7) + SSE2_QUARTER_ROUND(1, 2, 0, 9) + SSE2_QUARTER_ROUND(2, 3, 1, 13) + SSE2_QUARTER_ROUND(3, 0, 2, 18) + ASS( pshufd xmm1, xmm1, 2, 1, 0, 3) + ASS( pshufd xmm2, xmm2, 1, 0, 3, 2) + ASS( pshufd xmm3, xmm3, 0, 3, 2, 1) + SSE2_QUARTER_ROUND(0, 3, 1, 7) + SSE2_QUARTER_ROUND(3, 2, 0, 9) + SSE2_QUARTER_ROUND(2, 1, 3, 13) + SSE2_QUARTER_ROUND(1, 0, 2, 18) + ASS( pshufd xmm1, xmm1, 0, 3, 2, 1) + ASS( pshufd xmm2, xmm2, 1, 0, 3, 2) + ASS( pshufd xmm3, xmm3, 2, 1, 0, 3) + AS2( sub REG_rounds, 2) + ASJ( jnz, 0, b) + + AS2( paddd xmm0, [REG_state + 0*16]) + AS2( paddd xmm1, [REG_state + 1*16]) + AS2( paddd xmm2, [REG_state + 2*16]) + AS2( paddd xmm3, [REG_state + 3*16]) + + AS2( add dword ptr [REG_state + 8*4], 1) + AS2( adc dword ptr [REG_state + 5*4], 0) + + AS2( pcmpeqb xmm6, xmm6) // all ones + AS2( psrlq xmm6, 32) // lo32 mask + ASS( pshufd xmm7, xmm6, 0, 1, 2, 3) // hi32 mask + AS2( movdqa xmm4, xmm0) + AS2( movdqa xmm5, xmm3) + AS2( pand xmm0, xmm7) + AS2( pand xmm4, xmm6) + AS2( pand xmm3, xmm6) + AS2( pand xmm5, xmm7) + AS2( por xmm4, xmm5) // 0,13,2,15 + AS2( movdqa xmm5, xmm1) + AS2( pand xmm1, xmm7) + AS2( pand xmm5, xmm6) + AS2( por xmm0, xmm5) // 4,1,6,3 + AS2( pand xmm6, xmm2) + AS2( pand xmm2, xmm7) + AS2( por xmm1, xmm6) // 8,5,10,7 + AS2( por xmm2, xmm3) // 12,9,14,11 + + AS2( movdqa xmm5, xmm4) + AS2( movdqa xmm6, xmm0) + AS3( shufpd xmm4, xmm1, 2) // 0,13,10,7 + AS3( shufpd xmm0, xmm2, 2) // 4,1,14,11 + AS3( shufpd xmm1, xmm5, 2) // 8,5,2,15 + AS3( shufpd xmm2, xmm6, 2) // 12,9,6,3 + + // output keystream + AS_XMM_OUTPUT4(SSE2_Salsa_Output_B, REG_input, REG_output, 4, 0, 1, 2, 3, 0, 1, 2, 3, 4) + ASJ( jmp, 5, b) + ASL(4) + +#ifdef __GNUC__ + AS_POP_IF86( bx) + ".att_syntax prefix;" + : + #if CRYPTOPP_BOOL_X64 + : "r" (r), "r" (input), "r" (iterationCount), "r" (m_state.data()), "r" (output), "r" (workspace) + : "%eax", "%edx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" + #else + : "m" (r), "a" (input), "c" (iterationCount), "S" (m_state.data()), "D" (output) + : "%edx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7" + #endif + ); +#endif +#ifdef CRYPTOPP_GENERATE_X64_MASM + movdqa xmm6, [rsp + 0200h] + movdqa xmm7, [rsp + 0210h] + movdqa xmm8, [rsp + 0220h] + movdqa xmm9, [rsp + 0230h] + movdqa xmm10, [rsp + 0240h] + movdqa xmm11, [rsp + 0250h] + movdqa xmm12, [rsp + 0260h] + movdqa xmm13, [rsp + 0270h] + movdqa xmm14, [rsp + 0280h] + movdqa xmm15, [rsp + 0290h] + add rsp, 10*16 + 32*16 + 8 + ret +Salsa20_OperateKeystream ENDP +#else } + else #endif - - word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - - while (iterationCount--) +#endif +#ifndef CRYPTOPP_GENERATE_X64_MASM { - x0 = m_state[0]; - x1 = m_state[1]; - x2 = m_state[2]; - x3 = m_state[3]; - x4 = m_state[4]; - x5 = m_state[5]; - x6 = m_state[6]; - x7 = m_state[7]; - x8 = m_state[8]; - x9 = m_state[9]; - x10 = m_state[10]; - x11 = m_state[11]; - x12 = m_state[12]; - x13 = m_state[13]; - x14 = m_state[14]; - x15 = m_state[15]; - - for (i=m_rounds; i>0; i-=2) + word32 x0, x1, x2, x3, x4, x5, 
x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; + + while (iterationCount--) { - #define QUARTER_ROUND(a, b, c, d) \ - b = b ^ rotlFixed(a + d, 7); \ - c = c ^ rotlFixed(b + a, 9); \ - d = d ^ rotlFixed(c + b, 13); \ - a = a ^ rotlFixed(d + c, 18); - - QUARTER_ROUND(x0, x4, x8, x12) - QUARTER_ROUND(x1, x5, x9, x13) - QUARTER_ROUND(x2, x6, x10, x14) - QUARTER_ROUND(x3, x7, x11, x15) - - QUARTER_ROUND(x0, x13, x10, x7) - QUARTER_ROUND(x1, x14, x11, x4) - QUARTER_ROUND(x2, x15, x8, x5) - QUARTER_ROUND(x3, x12, x9, x6) - } + x0 = m_state[0]; + x1 = m_state[1]; + x2 = m_state[2]; + x3 = m_state[3]; + x4 = m_state[4]; + x5 = m_state[5]; + x6 = m_state[6]; + x7 = m_state[7]; + x8 = m_state[8]; + x9 = m_state[9]; + x10 = m_state[10]; + x11 = m_state[11]; + x12 = m_state[12]; + x13 = m_state[13]; + x14 = m_state[14]; + x15 = m_state[15]; + + for (int i=m_rounds; i>0; i-=2) + { + #define QUARTER_ROUND(a, b, c, d) \ + b = b ^ rotlFixed(a + d, 7); \ + c = c ^ rotlFixed(b + a, 9); \ + d = d ^ rotlFixed(c + b, 13); \ + a = a ^ rotlFixed(d + c, 18); + + QUARTER_ROUND(x0, x4, x8, x12) + QUARTER_ROUND(x1, x5, x9, x13) + QUARTER_ROUND(x2, x6, x10, x14) + QUARTER_ROUND(x3, x7, x11, x15) + + QUARTER_ROUND(x0, x13, x10, x7) + QUARTER_ROUND(x1, x14, x11, x4) + QUARTER_ROUND(x2, x15, x8, x5) + QUARTER_ROUND(x3, x12, x9, x6) + } - #define SALSA_OUTPUT(x) {\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);} + #define SALSA_OUTPUT(x) {\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\ + 
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);} #ifndef CRYPTOPP_DOXYGEN_PROCESSING - CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION); + CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION); #endif - if (++m_state[8] == 0) - ++m_state[5]; + if (++m_state[8] == 0) + ++m_state[5]; + } } } // see comment above if an internal compiler error occurs here NAMESPACE_END + +#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM diff --git a/sosemanuk.cpp b/sosemanuk.cpp index c86b877..b8c0c6c 100755 --- a/sosemanuk.cpp +++ b/sosemanuk.cpp @@ -1,12 +1,21 @@ // sosemanuk.cpp - written and placed in the public domain by Wei Dai +// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sosemanuk.cpp" to generate MASM code + #include "pch.h" + +#ifndef CRYPTOPP_GENERATE_X64_MASM + #include "sosemanuk.h" #include "misc.h" #include "cpu.h" #include "serpentp.h" +#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE +#include <emmintrin.h> +#endif + NAMESPACE_BEGIN(CryptoPP) void SosemanukPolicy::CipherSetKey(const NameValuePairs ¶ms, const byte *userKey, size_t keylen) @@ -74,7 +83,8 @@ void SosemanukPolicy::CipherResynchronize(byte *keystreamBuffer, const byte *iv) m_state[10] = rotlFixed(m_state[10] * 0x54655307, 7); } -static word32 s_mulTables[512] = { +extern "C" { +word32 s_sosemanukMulTables[512] = { #if CRYPTOPP_BOOL_X86 | CRYPTOPP_BOOL_X64 0x00000000, 0xE19FCF12, 0x6B973724, 0x8A08F836, 0xD6876E48, 0x3718A15A, 0xBD10596C, 0x5C8F967E, @@ -271,7 +281,7 @@ static word32 s_mulTables[512] = { 0xFEDECC7A, 0xE6D18CB7, 0xCEC04C49, 0xD6CF0C84, 0x9EE2651C, 0x86ED25D1, 0xAEFCE52F, 0xB6F3A5E2 }; - +} #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64 unsigned int SosemanukPolicy::GetAlignment() const @@ -303,11 +313,36 @@ unsigned int SosemanukPolicy::GetOptimalBlockSize() const } #endif +#ifdef CRYPTOPP_X64_MASM_AVAILABLE +extern "C" { +void Sosemanuk_OperateKeystream(size_t iterationCount, const byte *input, byte *output, word32 *state); +} +#endif + #pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) { +#endif // #ifdef CRYPTOPP_GENERATE_X64_MASM + +#ifdef CRYPTOPP_X64_MASM_AVAILABLE + Sosemanuk_OperateKeystream(iterationCount, input, output, m_state.data()); + return; +#endif + #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE +#ifdef CRYPTOPP_GENERATE_X64_MASM + ALIGN 8 + Sosemanuk_OperateKeystream PROC FRAME + rex_push_reg rsi + push_reg rdi + alloc_stack(80*4*2+12*4+8*WORD_SZ + 2*16+8) + save_xmm128 xmm6, 02f0h + save_xmm128 xmm7, 0300h + .endprolog + mov rdi, r8 + mov rax, r9 +#else #ifdef __INTEL_COMPILER if (HasSSE2() && !IsP4()) // Intel compiler produces faster code for this algorithm on the P4 #else @@ -315,10 +350,13 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu #endif { #ifdef __GNUC__ + #if CRYPTOPP_BOOL_X64 + __m128i workspace[(80*4*2+12*4+8*WORD_SZ)/16]; + #endif __asm__ __volatile__ ( ".intel_syntax noprefix;" - AS_PUSH( bx) + AS_PUSH_IF86( bx) #else word32 *state = m_state; AS2( mov WORD_REG(ax), state) @@ -326,22 +364,31 @@ void 
SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu AS2( mov WORD_REG(dx), input) AS2( mov WORD_REG(cx), iterationCount) #endif +#endif // #ifdef CRYPTOPP_GENERATE_X64_MASM + +#if defined(__GNUC__) && CRYPTOPP_BOOL_X64 + #define SSE2_workspace %5 +#else + #define SSE2_workspace WORD_REG(sp) +#endif -#define SSE2_output WORD_PTR [WORD_REG(sp)+1*WORD_SZ] -#define SSE2_input WORD_PTR [WORD_REG(sp)+2*WORD_SZ] -#define SSE2_wordsLeft WORD_PTR [WORD_REG(sp)+3*WORD_SZ] -#define SSE2_diEnd WORD_PTR [WORD_REG(sp)+4*WORD_SZ] -#define SSE2_pMulTables WORD_PTR [WORD_REG(sp)+5*WORD_SZ] -#define SSE2_state WORD_PTR [WORD_REG(sp)+6*WORD_SZ] -#define SSE2_wordsLeft2 WORD_PTR [WORD_REG(sp)+7*WORD_SZ] -#define SSE2_stateCopy WORD_REG(sp) + 8*WORD_SZ +#define SSE2_output WORD_PTR [SSE2_workspace+1*WORD_SZ] +#define SSE2_input WORD_PTR [SSE2_workspace+2*WORD_SZ] +#define SSE2_wordsLeft WORD_PTR [SSE2_workspace+3*WORD_SZ] +#define SSE2_diEnd WORD_PTR [SSE2_workspace+4*WORD_SZ] +#define SSE2_pMulTables WORD_PTR [SSE2_workspace+5*WORD_SZ] +#define SSE2_state WORD_PTR [SSE2_workspace+6*WORD_SZ] +#define SSE2_wordsLeft2 WORD_PTR [SSE2_workspace+7*WORD_SZ] +#define SSE2_stateCopy SSE2_workspace + 8*WORD_SZ #define SSE2_uvStart SSE2_stateCopy + 12*4 - AS_PUSH( bp) - AS2( mov WORD_REG(bx), WORD_REG(sp)) - AS2( and WORD_REG(sp), -16) - AS2( sub WORD_REG(sp), 80*4*2+12*4+8*WORD_SZ) // 80 v's, 80 u's, 12 state, 8 locals - AS2( mov [WORD_REG(sp)], WORD_REG(bx)) +#if CRYPTOPP_BOOL_X86 + AS_PUSH_IF86( bp) + AS2( mov AS_REG_6, esp) + AS2( and esp, -16) + AS2( sub esp, 80*4*2+12*4+8*WORD_SZ) // 80 v's, 80 u's, 12 state, 8 locals + AS2( mov [esp], AS_REG_6) +#endif AS2( mov SSE2_output, WORD_REG(di)) AS2( mov SSE2_input, WORD_REG(dx)) AS2( mov SSE2_state, WORD_REG(ax)) @@ -358,7 +405,7 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu AS2( movq xmm0, QWORD PTR [WORD_REG(ax)+2*16]) AS2( movq QWORD PTR [SSE2_stateCopy+2*16], xmm0) AS2( psrlq xmm0, 32) - AS2( movd ebx, xmm0) // s(9) + AS2( movd AS_REG_6d, xmm0) // s(9) AS2( mov ecx, [WORD_REG(ax)+10*4]) AS2( mov edx, [WORD_REG(ax)+11*4]) AS2( pcmpeqb xmm7, xmm7) // all ones @@ -367,35 +414,35 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu #define u(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4 #define v(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4 + 80*4 -#define r10 ecx -#define r11 edx -#define r20 edx -#define r21 ecx +#define R10 ecx +#define R11 edx +#define R20 edx +#define R21 ecx #define SSE2_STEP(i, j) \ AS2( mov eax, [s(i+0)])\ AS2( mov [v(i)], eax)\ AS2( rol eax, 8)\ - AS2( lea ebp, [ebx + r2##j])\ - AS2( xor ebp, r1##j)\ - AS2( mov [u(i)], ebp)\ - AS2( mov ebp, 1)\ - AS2( and ebp, r2##j)\ - AS1( neg ebp)\ - AS2( and ebp, ebx)\ - AS2( xor ebx, eax)\ + AS2( lea AS_REG_7d, [AS_REG_6d + R2##j])\ + AS2( xor AS_REG_7d, R1##j)\ + AS2( mov [u(i)], AS_REG_7d)\ + AS2( mov AS_REG_7d, 1)\ + AS2( and AS_REG_7d, R2##j)\ + AS1( neg AS_REG_7d)\ + AS2( and AS_REG_7d, AS_REG_6d)\ + AS2( xor AS_REG_6d, eax)\ AS2( movzx eax, al)\ - AS2( xor ebx, [WORD_REG(si)+WORD_REG(ax)*4])\ + AS2( xor AS_REG_6d, [WORD_REG(si)+WORD_REG(ax)*4])\ AS2( mov eax, [s(i+3)])\ - AS2( xor ebp, [s(i+2)])\ - AS2( add r1##j, ebp)\ - AS2( movzx ebp, al)\ + AS2( xor AS_REG_7d, [s(i+2)])\ + AS2( add R1##j, AS_REG_7d)\ + AS2( movzx AS_REG_7d, al)\ AS2( shr eax, 8)\ - AS2( xor ebx, [WORD_REG(si)+1024+WORD_REG(bp)*4])\ - AS2( xor ebx, eax)\ - AS2( imul r2##j, 0x54655307)\ - AS2( rol r2##j, 7)\ - AS2( mov [s(i+0)], ebx)\ 
+ AS2( xor AS_REG_6d, [WORD_REG(si)+1024+AS_REG_7*4])\ + AS2( xor AS_REG_6d, eax)\ + AS2( imul R2##j, AS_HEX(54655307))\ + AS2( rol R2##j, 7)\ + AS2( mov [s(i+0)], AS_REG_6d)\ ASL(2) // outer loop, each iteration of this processes 80 words AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u @@ -406,7 +453,7 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu AS2( lea WORD_REG(si), [WORD_REG(di)+WORD_REG(si)]) // use to end first inner loop AS2( mov SSE2_diEnd, WORD_REG(si)) #ifdef _MSC_VER - AS2( lea WORD_REG(si), s_mulTables) + AS2( lea WORD_REG(si), s_sosemanukMulTables) #else AS2( mov WORD_REG(si), SSE2_pMulTables) #endif @@ -438,7 +485,7 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu ASJ( jne, 0, b) AS2( mov WORD_REG(ax), SSE2_input) - AS2( mov WORD_REG(bp), SSE2_output) + AS2( mov AS_REG_7, SSE2_output) AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u AS2( mov WORD_REG(si), SSE2_wordsLeft2) @@ -487,43 +534,10 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu AS2( punpcklqdq xmm6, xmm5) AS2( punpckhqdq xmm3, xmm5) // output keystream - AS2( test WORD_REG(ax), WORD_REG(ax)) - ASJ( jz, 3, f) - AS2( test eax, 0xf) - ASJ( jnz, 7, f) - AS2( pxor xmm2, [WORD_REG(ax)+0*16]) - AS2( pxor xmm0, [WORD_REG(ax)+1*16]) - AS2( pxor xmm6, [WORD_REG(ax)+2*16]) - AS2( pxor xmm3, [WORD_REG(ax)+3*16]) - AS2( add WORD_REG(ax), 4*16) - ASJ( jmp, 3, f) - ASL(7) - AS2( movdqu xmm1, [WORD_REG(ax)+0*16]) - AS2( pxor xmm2, xmm1) - AS2( movdqu xmm1, [WORD_REG(ax)+1*16]) - AS2( pxor xmm0, xmm1) - AS2( movdqu xmm1, [WORD_REG(ax)+2*16]) - AS2( pxor xmm6, xmm1) - AS2( movdqu xmm1, [WORD_REG(ax)+3*16]) - AS2( pxor xmm3, xmm1) - AS2( add WORD_REG(ax), 4*16) - ASL(3) - AS2( test ebp, 0xf) - ASJ( jnz, 8, f) - AS2( movdqa [WORD_REG(bp)+0*16], xmm2) - AS2( movdqa [WORD_REG(bp)+1*16], xmm0) - AS2( movdqa [WORD_REG(bp)+2*16], xmm6) - AS2( movdqa [WORD_REG(bp)+3*16], xmm3) - ASJ( jmp, 9, f) - ASL(8) - AS2( movdqu [WORD_REG(bp)+0*16], xmm2) - AS2( movdqu [WORD_REG(bp)+1*16], xmm0) - AS2( movdqu [WORD_REG(bp)+2*16], xmm6) - AS2( movdqu [WORD_REG(bp)+3*16], xmm3) - ASL(9) + AS_XMM_OUTPUT4(SSE2_Sosemanuk_Output, WORD_REG(ax), AS_REG_7, 2,0,6,3, 1, 0,1,2,3, 4) + // loop AS2( add WORD_REG(di), 4*4) - AS2( add WORD_REG(bp), 4*16) AS2( sub WORD_REG(si), 16) ASJ( jnz, 1, b) @@ -533,29 +547,29 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu ASJ( jz, 6, f) AS2( mov SSE2_wordsLeft, WORD_REG(si)) AS2( mov SSE2_input, WORD_REG(ax)) - AS2( mov SSE2_output, WORD_REG(bp)) + AS2( mov SSE2_output, AS_REG_7) ASJ( jmp, 2, b) ASL(4) // final output of less than 16 words AS2( test WORD_REG(ax), WORD_REG(ax)) ASJ( jz, 5, f) - AS2( movd xmm0, [WORD_REG(ax)+0*4]) + AS2( movd xmm0, dword ptr [WORD_REG(ax)+0*4]) AS2( pxor xmm2, xmm0) - AS2( movd xmm0, [WORD_REG(ax)+1*4]) + AS2( movd xmm0, dword ptr [WORD_REG(ax)+1*4]) AS2( pxor xmm3, xmm0) - AS2( movd xmm0, [WORD_REG(ax)+2*4]) + AS2( movd xmm0, dword ptr [WORD_REG(ax)+2*4]) AS2( pxor xmm1, xmm0) - AS2( movd xmm0, [WORD_REG(ax)+3*4]) + AS2( movd xmm0, dword ptr [WORD_REG(ax)+3*4]) AS2( pxor xmm4, xmm0) AS2( add WORD_REG(ax), 16) ASL(5) - AS2( movd [WORD_REG(bp)+0*4], xmm2) - AS2( movd [WORD_REG(bp)+1*4], xmm3) - AS2( movd [WORD_REG(bp)+2*4], xmm1) - AS2( movd [WORD_REG(bp)+3*4], xmm4) + AS2( movd dword ptr [AS_REG_7+0*4], xmm2) + AS2( movd dword ptr [AS_REG_7+1*4], xmm3) + AS2( movd dword ptr [AS_REG_7+2*4], xmm1) + AS2( movd dword ptr [AS_REG_7+3*4], 
xmm4) AS2( sub WORD_REG(si), 4) ASJ( jz, 6, f) - AS2( add WORD_REG(bp), 16) + AS2( add AS_REG_7, 16) AS2( psrldq xmm2, 4) AS2( psrldq xmm3, 4) AS2( psrldq xmm1, 4) @@ -563,38 +577,52 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu ASJ( jmp, 4, b) ASL(6) // save state - AS2( mov WORD_REG(bx), SSE2_state) + AS2( mov AS_REG_6, SSE2_state) AS2( movdqa xmm0, [SSE2_stateCopy+0*16]) - AS2( movdqa [WORD_REG(bx)+0*16], xmm0) + AS2( movdqa [AS_REG_6+0*16], xmm0) AS2( movdqa xmm0, [SSE2_stateCopy+1*16]) - AS2( movdqa [WORD_REG(bx)+1*16], xmm0) + AS2( movdqa [AS_REG_6+1*16], xmm0) AS2( movq xmm0, QWORD PTR [SSE2_stateCopy+2*16]) - AS2( movq QWORD PTR [WORD_REG(bx)+2*16], xmm0) - AS2( mov [WORD_REG(bx)+10*4], ecx) - AS2( mov [WORD_REG(bx)+11*4], edx) + AS2( movq QWORD PTR [AS_REG_6+2*16], xmm0) + AS2( mov [AS_REG_6+10*4], ecx) + AS2( mov [AS_REG_6+11*4], edx) - AS_POP( sp) - AS_POP( bp) + AS_POP_IF86( sp) + AS_POP_IF86( bp) #ifdef __GNUC__ - AS_POP( bx) + AS_POP_IF86( bx) ".att_syntax prefix;" : - : "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_mulTables), "D" (output), "d" (input) - : "memory", "cc" + : "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_sosemanukMulTables), "D" (output), "d" (input) + #if CRYPTOPP_BOOL_X64 + , "r" (workspace) + #endif + : "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7" ); #endif +#ifdef CRYPTOPP_GENERATE_X64_MASM + movdqa xmm6, [rsp + 02f0h] + movdqa xmm7, [rsp + 0300h] + add rsp, 80*4*2+12*4+8*WORD_SZ + 2*16+8 + pop rdi + pop rsi + ret + Sosemanuk_OperateKeystream ENDP +#else } else #endif +#endif +#ifndef CRYPTOPP_GENERATE_X64_MASM { #if CRYPTOPP_BOOL_X86 | CRYPTOPP_BOOL_X64 -#define MUL_A(x) (x = rotlFixed(x, 8), x ^ s_mulTables[byte(x)]) +#define MUL_A(x) (x = rotlFixed(x, 8), x ^ s_sosemanukMulTables[byte(x)]) #else -#define MUL_A(x) (((x) << 8) ^ s_mulTables[(x) >> 24]) +#define MUL_A(x) (((x) << 8) ^ s_sosemanukMulTables[(x) >> 24]) #endif -#define DIV_A(x) (((x) >> 8) ^ s_mulTables[256 + byte(x)]) +#define DIV_A(x) (((x) >> 8) ^ s_sosemanukMulTables[256 + byte(x)]) #define r1(i) ((i%2) ? reg2 : reg1) #define r2(i) ((i%2) ? reg1 : reg2) @@ -676,3 +704,5 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu } NAMESPACE_END + +#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM diff --git a/whrlpool.cpp b/whrlpool.cpp index 149be39..59f0751 100644 --- a/whrlpool.cpp +++ b/whrlpool.cpp @@ -1,7 +1,7 @@ // whrlpool.cpp - originally modified by Kevin Springle from // Paulo Barreto and Vincent Rijmen's public domain code, whirlpool.c. 
// Updated to Whirlpool version 3.0, optimized and SSE version added by Wei Dai -// Any modifications are placed in the public domain +// All modifications are placed in the public domain // This is the original introductory comment: @@ -71,6 +71,10 @@ #include "misc.h" #include "cpu.h" +#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE +#include <emmintrin.h> +#endif + NAMESPACE_BEGIN(CryptoPP) void Whirlpool_TestInstantiations() @@ -395,29 +399,37 @@ void Whirlpool::Transform(word64 *digest, const word64 *block) { // MMX version has the same structure as C version below #ifdef __GNUC__ + #if CRYPTOPP_BOOL_X64 + __m128i workspace[8]; + #endif __asm__ __volatile__ ( ".intel_syntax noprefix;" - AS_PUSH( bx) - AS2( mov WORD_REG(bx), WORD_REG(ax)) + AS_PUSH_IF86( bx) + AS2( mov AS_REG_6, WORD_REG(ax)) #else #if _MSC_VER < 1300 - AS_PUSH( bx) + AS_PUSH_IF86( bx) #endif - AS2( lea WORD_REG(bx), [Whirlpool_C]) + AS2( lea AS_REG_6, [Whirlpool_C]) AS2( mov WORD_REG(cx), digest) AS2( mov WORD_REG(dx), block) #endif - AS2( mov WORD_REG(ax), WORD_REG(sp)) - AS2( and WORD_REG(sp), -16) - AS2( sub WORD_REG(sp), 16*8) - AS_PUSH( ax) +#if CRYPTOPP_BOOL_X86 + AS2( mov eax, esp) + AS2( and esp, -16) + AS2( sub esp, 16*8) + AS1( push eax) + #define SSE2_workspace esp+WORD_SZ +#else + #define SSE2_workspace %3 +#endif AS2( xor esi, esi) ASL(0) AS2( movq mm0, [WORD_REG(cx)+8*WORD_REG(si)]) - AS2( movq [WORD_REG(sp)+WORD_SZ+8*WORD_REG(si)], mm0) // k + AS2( movq [SSE2_workspace+8*WORD_REG(si)], mm0) // k AS2( pxor mm0, [WORD_REG(dx)+8*WORD_REG(si)]) - AS2( movq [WORD_REG(sp)+WORD_SZ+64+8*WORD_REG(si)], mm0) // s + AS2( movq [SSE2_workspace+64+8*WORD_REG(si)], mm0) // s AS2( movq [WORD_REG(cx)+8*WORD_REG(si)], mm0) AS1( inc WORD_REG(si)) AS2( cmp WORD_REG(si), 8) @@ -430,16 +442,16 @@ void Whirlpool::Transform(word64 *digest, const word64 *block) #define KSL1(a, b) AS2(pxor mm##a, b) #define KSL(op, i, a, b, c, d) \ - AS2(mov eax, [WORD_REG(sp)+WORD_SZ+8*i])\ + AS2(mov eax, [SSE2_workspace+8*i])\ AS2(movzx edi, al)\ - KSL##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\ + KSL##op(a, [AS_REG_6+3*2048+8*WORD_REG(di)])\ AS2(movzx edi, ah)\ - KSL##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\ + KSL##op(b, [AS_REG_6+2*2048+8*WORD_REG(di)])\ AS2(shr eax, 16)\ AS2(movzx edi, al)\ AS2(shr eax, 8)\ - KSL##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\ - KSL##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)]) + KSL##op(c, [AS_REG_6+1*2048+8*WORD_REG(di)])\ + KSL##op(d, [AS_REG_6+0*2048+8*WORD_REG(ax)]) #define KSH0(a, b) \ ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\ @@ -448,57 +460,57 @@ void Whirlpool::Transform(word64 *digest, const word64 *block) AS2(pxor mm##a, b) #define KSH2(a, b) \ AS2(pxor mm##a, b)\ - AS2(movq [WORD_REG(sp)+WORD_SZ+8*a], mm##a) + AS2(movq [SSE2_workspace+8*a], mm##a) #define KSH(op, i, a, b, c, d) \ - AS2(mov eax, [WORD_REG(sp)+WORD_SZ+8*((i+4)-8*((i+4)/8))+4])\ + AS2(mov eax, [SSE2_workspace+8*((i+4)-8*((i+4)/8))+4])\ AS2(movzx edi, al)\ - KSH##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\ + KSH##op(a, [AS_REG_6+3*2048+8*WORD_REG(di)])\ AS2(movzx edi, ah)\ - KSH##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\ + KSH##op(b, [AS_REG_6+2*2048+8*WORD_REG(di)])\ AS2(shr eax, 16)\ AS2(movzx edi, al)\ AS2(shr eax, 8)\ - KSH##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\ - KSH##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)]) + KSH##op(c, [AS_REG_6+1*2048+8*WORD_REG(di)])\ + KSH##op(d, [AS_REG_6+0*2048+8*WORD_REG(ax)]) #define TSL(op, i, a, b, c, d) \ - AS2(mov eax, [WORD_REG(sp)+WORD_SZ+64+8*i])\ + AS2(mov eax, 
[SSE2_workspace+64+8*i])\ AS2(movzx edi, al)\ - KSL##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\ + KSL##op(a, [AS_REG_6+3*2048+8*WORD_REG(di)])\ AS2(movzx edi, ah)\ - KSL##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\ + KSL##op(b, [AS_REG_6+2*2048+8*WORD_REG(di)])\ AS2(shr eax, 16)\ AS2(movzx edi, al)\ AS2(shr eax, 8)\ - KSL##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\ - KSL##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)]) + KSL##op(c, [AS_REG_6+1*2048+8*WORD_REG(di)])\ + KSL##op(d, [AS_REG_6+0*2048+8*WORD_REG(ax)]) #define TSH0(a, b) \ ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\ - AS2(pxor mm##a, [WORD_REG(sp)+WORD_SZ+8*a])\ + AS2(pxor mm##a, [SSE2_workspace+8*a])\ AS2(pxor mm##a, b) #define TSH1(a, b) \ AS2(pxor mm##a, b) #define TSH2(a, b) \ AS2(pxor mm##a, b)\ - AS2(movq [WORD_REG(sp)+WORD_SZ+64+8*a], mm##a) + AS2(movq [SSE2_workspace+64+8*a], mm##a) #define TSH3(a, b) \ AS2(pxor mm##a, b)\ AS2(pxor mm##a, [WORD_REG(cx)+8*a])\ AS2(movq [WORD_REG(cx)+8*a], mm##a) #define TSH(op, i, a, b, c, d) \ - AS2(mov eax, [WORD_REG(sp)+WORD_SZ+64+8*((i+4)-8*((i+4)/8))+4])\ + AS2(mov eax, [SSE2_workspace+64+8*((i+4)-8*((i+4)/8))+4])\ AS2(movzx edi, al)\ - TSH##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\ + TSH##op(a, [AS_REG_6+3*2048+8*WORD_REG(di)])\ AS2(movzx edi, ah)\ - TSH##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\ + TSH##op(b, [AS_REG_6+2*2048+8*WORD_REG(di)])\ AS2(shr eax, 16)\ AS2(movzx edi, al)\ AS2(shr eax, 8)\ - TSH##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\ - TSH##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)]) + TSH##op(c, [AS_REG_6+1*2048+8*WORD_REG(di)])\ + TSH##op(d, [AS_REG_6+0*2048+8*WORD_REG(ax)]) KSL(0, 4, 3, 2, 1, 0) KSL(0, 0, 7, 6, 5, 4) @@ -517,8 +529,8 @@ void Whirlpool::Transform(word64 *digest, const word64 *block) KSH(2, 3, 2, 1, 0, 7) KSH(2, 7, 6, 5, 4, 3) - AS2( pxor mm0, [WORD_REG(bx) + 8*1024 + WORD_REG(si)*8]) - AS2( movq [WORD_REG(sp)+WORD_SZ], mm0) + AS2( pxor mm0, [AS_REG_6 + 8*1024 + WORD_REG(si)*8]) + AS2( movq [SSE2_workspace], mm0) TSL(0, 4, 3, 2, 1, 0) TSL(0, 0, 7, 6, 5, 4) @@ -553,17 +565,23 @@ void Whirlpool::Transform(word64 *digest, const word64 *block) #undef TSL #undef TSH - AS_POP( sp) + AS_POP_IF86( sp) AS1( emms) #if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300) - AS_POP( bx) + AS_POP_IF86( bx) #endif #ifdef __GNUC__ ".att_syntax prefix;" : : "a" (Whirlpool_C), "c" (digest), "d" (block) + #if CRYPTOPP_BOOL_X64 + , "r" (workspace) + #endif : "%esi", "%edi", "memory", "cc" + #if CRYPTOPP_BOOL_X64 + , "%r9" + #endif ); #endif } diff --git a/x64masm.asm b/x64masm.asm index 76676a7..a395c9a 100755 --- a/x64masm.asm +++ b/x64masm.asm @@ -1,5 +1,6 @@ -PUBLIC Baseline_Add -PUBLIC Baseline_Sub +include ksamd64.inc +EXTERNDEF s_sosemanukMulTables:FAR + .CODE ALIGN 8 Baseline_Add PROC @@ -54,5 +55,1842 @@ $1@Baseline_Sub: ret Baseline_Sub ENDP +ALIGN 8 +Salsa20_OperateKeystream PROC FRAME +mov r10, [rsp + 5*8] +alloc_stack(10*16 + 32*16 + 8) +save_xmm128 xmm6, 0200h +save_xmm128 xmm7, 0210h +save_xmm128 xmm8, 0220h +save_xmm128 xmm9, 0230h +save_xmm128 xmm10, 0240h +save_xmm128 xmm11, 0250h +save_xmm128 xmm12, 0260h +save_xmm128 xmm13, 0270h +save_xmm128 xmm14, 0280h +save_xmm128 xmm15, 0290h +.endprolog +cmp r8, 4 +jl label5 +movdqa xmm0, [r10 + 0*16] +movdqa xmm1, [r10 + 1*16] +movdqa xmm2, [r10 + 2*16] +movdqa xmm3, [r10 + 3*16] +pshufd xmm4, xmm0, 0*64+0*16+0*4+0 +movdqa [rsp + (0*4+0)*16 + 256], xmm4 +pshufd xmm4, xmm0, 1*64+1*16+1*4+1 +movdqa [rsp + (0*4+1)*16 + 256], xmm4 +pshufd xmm4, xmm0, 2*64+2*16+2*4+2 +movdqa [rsp + (0*4+2)*16 + 256], xmm4 
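+; (FRAME, the save_xmm128 stores above, and .endprolog emit Win64 unwind data for this procedure)
+; each pshufd/movdqa pair broadcasts one 32-bit state word across a 16-byte lane, building four parallel copies of the state at rsp+256; the counter words are refreshed per four-block batch at label1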
+pshufd xmm4, xmm0, 3*64+3*16+3*4+3 +movdqa [rsp + (0*4+3)*16 + 256], xmm4 +pshufd xmm4, xmm1, 0*64+0*16+0*4+0 +movdqa [rsp + (1*4+0)*16 + 256], xmm4 +pshufd xmm4, xmm1, 2*64+2*16+2*4+2 +movdqa [rsp + (1*4+2)*16 + 256], xmm4 +pshufd xmm4, xmm1, 3*64+3*16+3*4+3 +movdqa [rsp + (1*4+3)*16 + 256], xmm4 +pshufd xmm4, xmm2, 1*64+1*16+1*4+1 +movdqa [rsp + (2*4+1)*16 + 256], xmm4 +pshufd xmm4, xmm2, 2*64+2*16+2*4+2 +movdqa [rsp + (2*4+2)*16 + 256], xmm4 +pshufd xmm4, xmm2, 3*64+3*16+3*4+3 +movdqa [rsp + (2*4+3)*16 + 256], xmm4 +pshufd xmm4, xmm3, 0*64+0*16+0*4+0 +movdqa [rsp + (3*4+0)*16 + 256], xmm4 +pshufd xmm4, xmm3, 1*64+1*16+1*4+1 +movdqa [rsp + (3*4+1)*16 + 256], xmm4 +pshufd xmm4, xmm3, 2*64+2*16+2*4+2 +movdqa [rsp + (3*4+2)*16 + 256], xmm4 +pshufd xmm4, xmm3, 3*64+3*16+3*4+3 +movdqa [rsp + (3*4+3)*16 + 256], xmm4 +label1: +mov eax, dword ptr [r10 + 8*4] +mov r11d, dword ptr [r10 + 5*4] +mov dword ptr [rsp + 8*16 + 0*4 + 256], eax +mov dword ptr [rsp + 5*16 + 0*4 + 256], r11d +add eax, 1 +adc r11d, 0 +mov dword ptr [rsp + 8*16 + 1*4 + 256], eax +mov dword ptr [rsp + 5*16 + 1*4 + 256], r11d +add eax, 1 +adc r11d, 0 +mov dword ptr [rsp + 8*16 + 2*4 + 256], eax +mov dword ptr [rsp + 5*16 + 2*4 + 256], r11d +add eax, 1 +adc r11d, 0 +mov dword ptr [rsp + 8*16 + 3*4 + 256], eax +mov dword ptr [rsp + 5*16 + 3*4 + 256], r11d +add eax, 1 +adc r11d, 0 +mov dword ptr [r10 + 8*4], eax +mov dword ptr [r10 + 5*4], r11d +movdqa xmm0, [rsp + 12*16 + 1*256] +movdqa xmm4, [rsp + 13*16 + 1*256] +movdqa xmm8, [rsp + 14*16 + 1*256] +movdqa xmm12, [rsp + 15*16 + 1*256] +movdqa xmm2, [rsp + 0*16 + 1*256] +movdqa xmm6, [rsp + 1*16 + 1*256] +movdqa xmm10, [rsp + 2*16 + 1*256] +movdqa xmm14, [rsp + 3*16 + 1*256] +paddd xmm0, xmm2 +paddd xmm4, xmm6 +paddd xmm8, xmm10 +paddd xmm12, xmm14 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +pslld xmm0, 7 +pslld xmm4, 7 +pslld xmm8, 7 +pslld xmm12, 7 +psrld xmm1, 32-7 +psrld xmm5, 32-7 +psrld xmm9, 32-7 +psrld xmm13, 32-7 +pxor xmm0, [rsp + 4*16 + 1*256] +pxor xmm4, [rsp + 5*16 + 1*256] +pxor xmm8, [rsp + 6*16 + 1*256] +pxor xmm12, [rsp + 7*16 + 1*256] +pxor xmm0, xmm1 +pxor xmm4, xmm5 +pxor xmm8, xmm9 +pxor xmm12, xmm13 +movdqa [rsp + 4*16], xmm0 +movdqa [rsp + 5*16], xmm4 +movdqa [rsp + 6*16], xmm8 +movdqa [rsp + 7*16], xmm12 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +paddd xmm0, xmm2 +paddd xmm4, xmm6 +paddd xmm8, xmm10 +paddd xmm12, xmm14 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +pslld xmm0, 9 +pslld xmm4, 9 +pslld xmm8, 9 +pslld xmm12, 9 +psrld xmm3, 32-9 +psrld xmm7, 32-9 +psrld xmm11, 32-9 +psrld xmm15, 32-9 +pxor xmm0, [rsp + 8*16 + 1*256] +pxor xmm4, [rsp + 9*16 + 1*256] +pxor xmm8, [rsp + 10*16 + 1*256] +pxor xmm12, [rsp + 11*16 + 1*256] +pxor xmm0, xmm3 +pxor xmm4, xmm7 +pxor xmm8, xmm11 +pxor xmm12, xmm15 +movdqa [rsp + 8*16], xmm0 +movdqa [rsp + 9*16], xmm4 +movdqa [rsp + 10*16], xmm8 +movdqa [rsp + 11*16], xmm12 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +paddd xmm0, xmm1 +paddd xmm4, xmm5 +paddd xmm8, xmm9 +paddd xmm12, xmm13 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +pslld xmm0, 13 +pslld xmm4, 13 +pslld xmm8, 13 +pslld xmm12, 13 +psrld xmm1, 32-13 +psrld xmm5, 32-13 +psrld xmm9, 32-13 +psrld xmm13, 32-13 +pxor xmm0, [rsp + 12*16 + 1*256] +pxor xmm4, [rsp + 13*16 + 1*256] +pxor xmm8, [rsp + 14*16 + 1*256] +pxor xmm12, [rsp + 15*16 + 1*256] +pxor xmm0, xmm1 +pxor xmm4, xmm5 +pxor xmm8, 
xmm9 +pxor xmm12, xmm13 +movdqa [rsp + 12*16], xmm0 +movdqa [rsp + 13*16], xmm4 +movdqa [rsp + 14*16], xmm8 +movdqa [rsp + 15*16], xmm12 +paddd xmm0, xmm3 +paddd xmm4, xmm7 +paddd xmm8, xmm11 +paddd xmm12, xmm15 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +pslld xmm0, 18 +pslld xmm4, 18 +pslld xmm8, 18 +pslld xmm12, 18 +psrld xmm3, 32-18 +psrld xmm7, 32-18 +psrld xmm11, 32-18 +psrld xmm15, 32-18 +pxor xmm0, xmm2 +pxor xmm4, xmm6 +pxor xmm8, xmm10 +pxor xmm12, xmm14 +pxor xmm0, xmm3 +pxor xmm4, xmm7 +pxor xmm8, xmm11 +pxor xmm12, xmm15 +movdqa [rsp + 0*16], xmm0 +movdqa [rsp + 1*16], xmm4 +movdqa [rsp + 2*16], xmm8 +movdqa [rsp + 3*16], xmm12 +mov rax, r9 +jmp label2 +labelSSE2_Salsa_Output: +movdqa xmm0, xmm4 +punpckldq xmm4, xmm5 +movdqa xmm1, xmm6 +punpckldq xmm6, xmm7 +movdqa xmm2, xmm4 +punpcklqdq xmm4, xmm6 +punpckhqdq xmm2, xmm6 +punpckhdq xmm0, xmm5 +punpckhdq xmm1, xmm7 +movdqa xmm6, xmm0 +punpcklqdq xmm0, xmm1 +punpckhqdq xmm6, xmm1 +test rdx, rdx +jz labelSSE2_Salsa_Output_A3 +test rdx, 15 +jnz labelSSE2_Salsa_Output_A7 +pxor xmm4, [rdx+0*16] +pxor xmm2, [rdx+4*16] +pxor xmm0, [rdx+8*16] +pxor xmm6, [rdx+12*16] +add rdx, 1*16 +jmp labelSSE2_Salsa_Output_A3 +labelSSE2_Salsa_Output_A7: +movdqu xmm1, [rdx+0*16] +pxor xmm4, xmm1 +movdqu xmm1, [rdx+4*16] +pxor xmm2, xmm1 +movdqu xmm1, [rdx+8*16] +pxor xmm0, xmm1 +movdqu xmm1, [rdx+12*16] +pxor xmm6, xmm1 +add rdx, 1*16 +labelSSE2_Salsa_Output_A3: +test rcx, 15 +jnz labelSSE2_Salsa_Output_A8 +movdqa [rcx+0*16], xmm4 +movdqa [rcx+4*16], xmm2 +movdqa [rcx+8*16], xmm0 +movdqa [rcx+12*16], xmm6 +jmp labelSSE2_Salsa_Output_A9 +labelSSE2_Salsa_Output_A8: +movdqu [rcx+0*16], xmm4 +movdqu [rcx+4*16], xmm2 +movdqu [rcx+8*16], xmm0 +movdqu [rcx+12*16], xmm6 +labelSSE2_Salsa_Output_A9: +add rcx, 1*16 +ret +label6: +movdqa xmm0, [rsp + 12*16 + 0*256] +movdqa xmm4, [rsp + 13*16 + 0*256] +movdqa xmm8, [rsp + 14*16 + 0*256] +movdqa xmm12, [rsp + 15*16 + 0*256] +movdqa xmm2, [rsp + 0*16 + 0*256] +movdqa xmm6, [rsp + 1*16 + 0*256] +movdqa xmm10, [rsp + 2*16 + 0*256] +movdqa xmm14, [rsp + 3*16 + 0*256] +paddd xmm0, xmm2 +paddd xmm4, xmm6 +paddd xmm8, xmm10 +paddd xmm12, xmm14 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +pslld xmm0, 7 +pslld xmm4, 7 +pslld xmm8, 7 +pslld xmm12, 7 +psrld xmm1, 32-7 +psrld xmm5, 32-7 +psrld xmm9, 32-7 +psrld xmm13, 32-7 +pxor xmm0, [rsp + 4*16 + 0*256] +pxor xmm4, [rsp + 5*16 + 0*256] +pxor xmm8, [rsp + 6*16 + 0*256] +pxor xmm12, [rsp + 7*16 + 0*256] +pxor xmm0, xmm1 +pxor xmm4, xmm5 +pxor xmm8, xmm9 +pxor xmm12, xmm13 +movdqa [rsp + 4*16], xmm0 +movdqa [rsp + 5*16], xmm4 +movdqa [rsp + 6*16], xmm8 +movdqa [rsp + 7*16], xmm12 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +paddd xmm0, xmm2 +paddd xmm4, xmm6 +paddd xmm8, xmm10 +paddd xmm12, xmm14 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +pslld xmm0, 9 +pslld xmm4, 9 +pslld xmm8, 9 +pslld xmm12, 9 +psrld xmm3, 32-9 +psrld xmm7, 32-9 +psrld xmm11, 32-9 +psrld xmm15, 32-9 +pxor xmm0, [rsp + 8*16 + 0*256] +pxor xmm4, [rsp + 9*16 + 0*256] +pxor xmm8, [rsp + 10*16 + 0*256] +pxor xmm12, [rsp + 11*16 + 0*256] +pxor xmm0, xmm3 +pxor xmm4, xmm7 +pxor xmm8, xmm11 +pxor xmm12, xmm15 +movdqa [rsp + 8*16], xmm0 +movdqa [rsp + 9*16], xmm4 +movdqa [rsp + 10*16], xmm8 +movdqa [rsp + 11*16], xmm12 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +paddd xmm0, xmm1 +paddd xmm4, xmm5 +paddd xmm8, xmm9 +paddd xmm12, xmm13 +movdqa 
xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +pslld xmm0, 13 +pslld xmm4, 13 +pslld xmm8, 13 +pslld xmm12, 13 +psrld xmm1, 32-13 +psrld xmm5, 32-13 +psrld xmm9, 32-13 +psrld xmm13, 32-13 +pxor xmm0, [rsp + 12*16 + 0*256] +pxor xmm4, [rsp + 13*16 + 0*256] +pxor xmm8, [rsp + 14*16 + 0*256] +pxor xmm12, [rsp + 15*16 + 0*256] +pxor xmm0, xmm1 +pxor xmm4, xmm5 +pxor xmm8, xmm9 +pxor xmm12, xmm13 +movdqa [rsp + 12*16], xmm0 +movdqa [rsp + 13*16], xmm4 +movdqa [rsp + 14*16], xmm8 +movdqa [rsp + 15*16], xmm12 +paddd xmm0, xmm3 +paddd xmm4, xmm7 +paddd xmm8, xmm11 +paddd xmm12, xmm15 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +pslld xmm0, 18 +pslld xmm4, 18 +pslld xmm8, 18 +pslld xmm12, 18 +psrld xmm3, 32-18 +psrld xmm7, 32-18 +psrld xmm11, 32-18 +psrld xmm15, 32-18 +pxor xmm0, xmm2 +pxor xmm4, xmm6 +pxor xmm8, xmm10 +pxor xmm12, xmm14 +pxor xmm0, xmm3 +pxor xmm4, xmm7 +pxor xmm8, xmm11 +pxor xmm12, xmm15 +movdqa [rsp + 0*16], xmm0 +movdqa [rsp + 1*16], xmm4 +movdqa [rsp + 2*16], xmm8 +movdqa [rsp + 3*16], xmm12 +label2: +movdqa xmm0, [rsp + 7*16 + 0*256] +movdqa xmm4, [rsp + 4*16 + 0*256] +movdqa xmm8, [rsp + 5*16 + 0*256] +movdqa xmm12, [rsp + 6*16 + 0*256] +movdqa xmm2, [rsp + 0*16 + 0*256] +movdqa xmm6, [rsp + 1*16 + 0*256] +movdqa xmm10, [rsp + 2*16 + 0*256] +movdqa xmm14, [rsp + 3*16 + 0*256] +paddd xmm0, xmm2 +paddd xmm4, xmm6 +paddd xmm8, xmm10 +paddd xmm12, xmm14 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +pslld xmm0, 7 +pslld xmm4, 7 +pslld xmm8, 7 +pslld xmm12, 7 +psrld xmm1, 32-7 +psrld xmm5, 32-7 +psrld xmm9, 32-7 +psrld xmm13, 32-7 +pxor xmm0, [rsp + 13*16 + 0*256] +pxor xmm4, [rsp + 14*16 + 0*256] +pxor xmm8, [rsp + 15*16 + 0*256] +pxor xmm12, [rsp + 12*16 + 0*256] +pxor xmm0, xmm1 +pxor xmm4, xmm5 +pxor xmm8, xmm9 +pxor xmm12, xmm13 +movdqa [rsp + 13*16], xmm0 +movdqa [rsp + 14*16], xmm4 +movdqa [rsp + 15*16], xmm8 +movdqa [rsp + 12*16], xmm12 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +paddd xmm0, xmm2 +paddd xmm4, xmm6 +paddd xmm8, xmm10 +paddd xmm12, xmm14 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +pslld xmm0, 9 +pslld xmm4, 9 +pslld xmm8, 9 +pslld xmm12, 9 +psrld xmm3, 32-9 +psrld xmm7, 32-9 +psrld xmm11, 32-9 +psrld xmm15, 32-9 +pxor xmm0, [rsp + 10*16 + 0*256] +pxor xmm4, [rsp + 11*16 + 0*256] +pxor xmm8, [rsp + 8*16 + 0*256] +pxor xmm12, [rsp + 9*16 + 0*256] +pxor xmm0, xmm3 +pxor xmm4, xmm7 +pxor xmm8, xmm11 +pxor xmm12, xmm15 +movdqa [rsp + 10*16], xmm0 +movdqa [rsp + 11*16], xmm4 +movdqa [rsp + 8*16], xmm8 +movdqa [rsp + 9*16], xmm12 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +paddd xmm0, xmm1 +paddd xmm4, xmm5 +paddd xmm8, xmm9 +paddd xmm12, xmm13 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +pslld xmm0, 13 +pslld xmm4, 13 +pslld xmm8, 13 +pslld xmm12, 13 +psrld xmm1, 32-13 +psrld xmm5, 32-13 +psrld xmm9, 32-13 +psrld xmm13, 32-13 +pxor xmm0, [rsp + 7*16 + 0*256] +pxor xmm4, [rsp + 4*16 + 0*256] +pxor xmm8, [rsp + 5*16 + 0*256] +pxor xmm12, [rsp + 6*16 + 0*256] +pxor xmm0, xmm1 +pxor xmm4, xmm5 +pxor xmm8, xmm9 +pxor xmm12, xmm13 +movdqa [rsp + 7*16], xmm0 +movdqa [rsp + 4*16], xmm4 +movdqa [rsp + 5*16], xmm8 +movdqa [rsp + 6*16], xmm12 +paddd xmm0, xmm3 +paddd xmm4, xmm7 +paddd xmm8, xmm11 +paddd xmm12, xmm15 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +pslld xmm0, 18 +pslld xmm4, 18 +pslld xmm8, 18 
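+; each pslld/psrld pair forms a 32-bit left-rotate (SSE2 has no packed rotate instruction), applied to four block columns at once; Salsa20's quarter-round rotates by 7, 9, 13 and 18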
+pslld xmm12, 18 +psrld xmm3, 32-18 +psrld xmm7, 32-18 +psrld xmm11, 32-18 +psrld xmm15, 32-18 +pxor xmm0, xmm2 +pxor xmm4, xmm6 +pxor xmm8, xmm10 +pxor xmm12, xmm14 +pxor xmm0, xmm3 +pxor xmm4, xmm7 +pxor xmm8, xmm11 +pxor xmm12, xmm15 +movdqa [rsp + 0*16], xmm0 +movdqa [rsp + 1*16], xmm4 +movdqa [rsp + 2*16], xmm8 +movdqa [rsp + 3*16], xmm12 +sub eax, 2 +jnz label6 +movdqa xmm4, [rsp + 0*16 + 256] +paddd xmm4, [rsp + 0*16] +movdqa xmm5, [rsp + 13*16 + 256] +paddd xmm5, [rsp + 13*16] +movdqa xmm6, [rsp + 10*16 + 256] +paddd xmm6, [rsp + 10*16] +movdqa xmm7, [rsp + 7*16 + 256] +paddd xmm7, [rsp + 7*16] +call labelSSE2_Salsa_Output +movdqa xmm4, [rsp + 4*16 + 256] +paddd xmm4, [rsp + 4*16] +movdqa xmm5, [rsp + 1*16 + 256] +paddd xmm5, [rsp + 1*16] +movdqa xmm6, [rsp + 14*16 + 256] +paddd xmm6, [rsp + 14*16] +movdqa xmm7, [rsp + 11*16 + 256] +paddd xmm7, [rsp + 11*16] +call labelSSE2_Salsa_Output +movdqa xmm4, [rsp + 8*16 + 256] +paddd xmm4, [rsp + 8*16] +movdqa xmm5, [rsp + 5*16 + 256] +paddd xmm5, [rsp + 5*16] +movdqa xmm6, [rsp + 2*16 + 256] +paddd xmm6, [rsp + 2*16] +movdqa xmm7, [rsp + 15*16 + 256] +paddd xmm7, [rsp + 15*16] +call labelSSE2_Salsa_Output +movdqa xmm4, [rsp + 12*16 + 256] +paddd xmm4, [rsp + 12*16] +movdqa xmm5, [rsp + 9*16 + 256] +paddd xmm5, [rsp + 9*16] +movdqa xmm6, [rsp + 6*16 + 256] +paddd xmm6, [rsp + 6*16] +movdqa xmm7, [rsp + 3*16 + 256] +paddd xmm7, [rsp + 3*16] +call labelSSE2_Salsa_Output +test rdx, rdx +jz label9 +add rdx, 12*16 +label9: +add rcx, 12*16 +sub r8, 4 +cmp r8, 4 +jge label1 +label5: +sub r8, 1 +jl label4 +movdqa xmm0, [r10 + 0*16] +movdqa xmm1, [r10 + 1*16] +movdqa xmm2, [r10 + 2*16] +movdqa xmm3, [r10 + 3*16] +mov rax, r9 +label0: +movdqa xmm4, xmm3 +paddd xmm4, xmm0 +movdqa xmm5, xmm4 +pslld xmm4, 7 +psrld xmm5, 32-7 +pxor xmm1, xmm4 +pxor xmm1, xmm5 +movdqa xmm4, xmm0 +paddd xmm4, xmm1 +movdqa xmm5, xmm4 +pslld xmm4, 9 +psrld xmm5, 32-9 +pxor xmm2, xmm4 +pxor xmm2, xmm5 +movdqa xmm4, xmm1 +paddd xmm4, xmm2 +movdqa xmm5, xmm4 +pslld xmm4, 13 +psrld xmm5, 32-13 +pxor xmm3, xmm4 +pxor xmm3, xmm5 +movdqa xmm4, xmm2 +paddd xmm4, xmm3 +movdqa xmm5, xmm4 +pslld xmm4, 18 +psrld xmm5, 32-18 +pxor xmm0, xmm4 +pxor xmm0, xmm5 +pshufd xmm1, xmm1, 2*64+1*16+0*4+3 +pshufd xmm2, xmm2, 1*64+0*16+3*4+2 +pshufd xmm3, xmm3, 0*64+3*16+2*4+1 +movdqa xmm4, xmm1 +paddd xmm4, xmm0 +movdqa xmm5, xmm4 +pslld xmm4, 7 +psrld xmm5, 32-7 +pxor xmm3, xmm4 +pxor xmm3, xmm5 +movdqa xmm4, xmm0 +paddd xmm4, xmm3 +movdqa xmm5, xmm4 +pslld xmm4, 9 +psrld xmm5, 32-9 +pxor xmm2, xmm4 +pxor xmm2, xmm5 +movdqa xmm4, xmm3 +paddd xmm4, xmm2 +movdqa xmm5, xmm4 +pslld xmm4, 13 +psrld xmm5, 32-13 +pxor xmm1, xmm4 +pxor xmm1, xmm5 +movdqa xmm4, xmm2 +paddd xmm4, xmm1 +movdqa xmm5, xmm4 +pslld xmm4, 18 +psrld xmm5, 32-18 +pxor xmm0, xmm4 +pxor xmm0, xmm5 +pshufd xmm1, xmm1, 0*64+3*16+2*4+1 +pshufd xmm2, xmm2, 1*64+0*16+3*4+2 +pshufd xmm3, xmm3, 2*64+1*16+0*4+3 +sub eax, 2 +jnz label0 +paddd xmm0, [r10 + 0*16] +paddd xmm1, [r10 + 1*16] +paddd xmm2, [r10 + 2*16] +paddd xmm3, [r10 + 3*16] +add dword ptr [r10 + 8*4], 1 +adc dword ptr [r10 + 5*4], 0 +pcmpeqb xmm6, xmm6 +psrlq xmm6, 32 +pshufd xmm7, xmm6, 0*64+1*16+2*4+3 +movdqa xmm4, xmm0 +movdqa xmm5, xmm3 +pand xmm0, xmm7 +pand xmm4, xmm6 +pand xmm3, xmm6 +pand xmm5, xmm7 +por xmm4, xmm5 +movdqa xmm5, xmm1 +pand xmm1, xmm7 +pand xmm5, xmm6 +por xmm0, xmm5 +pand xmm6, xmm2 +pand xmm2, xmm7 +por xmm1, xmm6 +por xmm2, xmm3 +movdqa xmm5, xmm4 +movdqa xmm6, xmm0 +shufpd xmm4, xmm1, 2 +shufpd xmm0, xmm2, 2 +shufpd xmm1, xmm5, 2 +shufpd xmm2, xmm6, 2 
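+; output: if the input pointer rdx is non-null, XOR the keystream with it (movdqa when 16-byte aligned, movdqu otherwise), then store 64 bytes through the output pointer rcx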
+test rdx, rdx +jz labelSSE2_Salsa_Output_B3 +test rdx, 15 +jnz labelSSE2_Salsa_Output_B7 +pxor xmm4, [rdx+0*16] +pxor xmm0, [rdx+1*16] +pxor xmm1, [rdx+2*16] +pxor xmm2, [rdx+3*16] +add rdx, 4*16 +jmp labelSSE2_Salsa_Output_B3 +labelSSE2_Salsa_Output_B7: +movdqu xmm3, [rdx+0*16] +pxor xmm4, xmm3 +movdqu xmm3, [rdx+1*16] +pxor xmm0, xmm3 +movdqu xmm3, [rdx+2*16] +pxor xmm1, xmm3 +movdqu xmm3, [rdx+3*16] +pxor xmm2, xmm3 +add rdx, 4*16 +labelSSE2_Salsa_Output_B3: +test rcx, 15 +jnz labelSSE2_Salsa_Output_B8 +movdqa [rcx+0*16], xmm4 +movdqa [rcx+1*16], xmm0 +movdqa [rcx+2*16], xmm1 +movdqa [rcx+3*16], xmm2 +jmp labelSSE2_Salsa_Output_B9 +labelSSE2_Salsa_Output_B8: +movdqu [rcx+0*16], xmm4 +movdqu [rcx+1*16], xmm0 +movdqu [rcx+2*16], xmm1 +movdqu [rcx+3*16], xmm2 +labelSSE2_Salsa_Output_B9: +add rcx, 4*16 +jmp label5 +label4: +movdqa xmm6, [rsp + 0200h] +movdqa xmm7, [rsp + 0210h] +movdqa xmm8, [rsp + 0220h] +movdqa xmm9, [rsp + 0230h] +movdqa xmm10, [rsp + 0240h] +movdqa xmm11, [rsp + 0250h] +movdqa xmm12, [rsp + 0260h] +movdqa xmm13, [rsp + 0270h] +movdqa xmm14, [rsp + 0280h] +movdqa xmm15, [rsp + 0290h] +add rsp, 10*16 + 32*16 + 8 +ret +Salsa20_OperateKeystream ENDP +ALIGN 8 +Rijndael_Enc_ProcessAndXorBlock PROC FRAME +rex_push_reg rbx +push_reg rsi +push_reg rdi +push_reg r12 +push_reg r13 +push_reg r14 +push_reg r15 +.endprolog +mov r11, rcx +mov rdi, [rsp + 5*8 + 7*8] ; inBlock +mov eax, [r8+0*4] +xor eax, [rdi+0*4] +mov r13d, eax +mov ebx, [r8+1*4] +xor ebx, [rdi+1*4] +mov r14d, ebx +and ebx, eax +mov eax, [r8+2*4] +xor eax, [rdi+2*4] +mov r15d, eax +and ebx, eax +mov ecx, [r8+3*4] +xor ecx, [rdi+3*4] +and ebx, ecx +and ebx, 0 +mov edi, ebx +label2: +and ebx, [r11+rdi] +add edi, edx +and ebx, [r11+rdi] +add edi, edx +and ebx, [r11+rdi] +add edi, edx +and ebx, [r11+rdi] +add edi, edx +cmp edi, 1024 +jl label2 +and ebx, [r11+1020] +xor r13d, ebx +xor r14d, ebx +xor r15d, ebx +xor ecx, ebx +mov edi, [r8+4*4] +mov eax, [r8+5*4] +mov ebx, [r8+6*4] +mov edx, [r8+7*4] +add r8, 8*4 +movzx esi, cl +xor edx, [r11+0*1024+4*rsi] +movzx esi, ch +xor ebx, [r11+1*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor eax, [r11+2*1024+4*rsi] +movzx esi, ch +xor edi, [r11+3*1024+4*rsi] +mov ecx, r15d +movzx esi, cl +xor ebx, [r11+0*1024+4*rsi] +movzx esi, ch +xor eax, [r11+1*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor edi, [r11+2*1024+4*rsi] +movzx esi, ch +xor edx, [r11+3*1024+4*rsi] +mov ecx, r14d +movzx esi, cl +xor eax, [r11+0*1024+4*rsi] +movzx esi, ch +xor edi, [r11+1*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor edx, [r11+2*1024+4*rsi] +movzx esi, ch +xor ebx, [r11+3*1024+4*rsi] +mov ecx, r13d +movzx esi, cl +xor edi, [r11+0*1024+4*rsi] +movzx esi, ch +xor edx, [r11+1*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor ebx, [r11+2*1024+4*rsi] +movzx esi, ch +xor eax, [r11+3*1024+4*rsi] +mov r15d, ebx +mov r14d, eax +mov r13d, edi +label0: +mov edi, [r8+0*4] +mov eax, [r8+1*4] +mov ebx, [r8+2*4] +mov ecx, [r8+3*4] +movzx esi, dl +xor edi, [r11+3*1024+4*rsi] +movzx esi, dh +xor eax, [r11+2*1024+4*rsi] +shr edx, 16 +movzx esi, dl +xor ebx, [r11+1*1024+4*rsi] +movzx esi, dh +xor ecx, [r11+0*1024+4*rsi] +mov edx, r15d +movzx esi, dl +xor ecx, [r11+3*1024+4*rsi] +movzx esi, dh +xor edi, [r11+2*1024+4*rsi] +shr edx, 16 +movzx esi, dl +xor eax, [r11+1*1024+4*rsi] +movzx esi, dh +xor ebx, [r11+0*1024+4*rsi] +mov edx, r14d +movzx esi, dl +xor ebx, [r11+3*1024+4*rsi] +movzx esi, dh +xor ecx, [r11+2*1024+4*rsi] +shr edx, 16 +movzx esi, dl +xor edi, [r11+1*1024+4*rsi] +movzx esi, dh +xor eax, [r11+0*1024+4*rsi] +mov edx, 
r13d +movzx esi, dl +xor eax, [r11+3*1024+4*rsi] +movzx esi, dh +xor ebx, [r11+2*1024+4*rsi] +shr edx, 16 +movzx esi, dl +xor ecx, [r11+1*1024+4*rsi] +movzx esi, dh +xor edi, [r11+0*1024+4*rsi] +mov r15d, ebx +mov r14d, eax +mov r13d, edi +mov edi, [r8+4*4] +mov eax, [r8+5*4] +mov ebx, [r8+6*4] +mov edx, [r8+7*4] +movzx esi, cl +xor edi, [r11+3*1024+4*rsi] +movzx esi, ch +xor eax, [r11+2*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor ebx, [r11+1*1024+4*rsi] +movzx esi, ch +xor edx, [r11+0*1024+4*rsi] +mov ecx, r15d +movzx esi, cl +xor edx, [r11+3*1024+4*rsi] +movzx esi, ch +xor edi, [r11+2*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor eax, [r11+1*1024+4*rsi] +movzx esi, ch +xor ebx, [r11+0*1024+4*rsi] +mov ecx, r14d +movzx esi, cl +xor ebx, [r11+3*1024+4*rsi] +movzx esi, ch +xor edx, [r11+2*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor edi, [r11+1*1024+4*rsi] +movzx esi, ch +xor eax, [r11+0*1024+4*rsi] +mov ecx, r13d +movzx esi, cl +xor eax, [r11+3*1024+4*rsi] +movzx esi, ch +xor ebx, [r11+2*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor edx, [r11+1*1024+4*rsi] +movzx esi, ch +xor edi, [r11+0*1024+4*rsi] +mov r15d, ebx +mov r14d, eax +mov r13d, edi +add r8, 8*4 +cmp r9, r8 +jne label0 +mov eax, [r9+0*4] +mov ecx, [r9+1*4] +mov esi, [r9+2*4] +mov edi, [r9+3*4] +movzx ebx, dl +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 3*8 +xor eax, ebx +movzx ebx, dh +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 2*8 +xor ecx, ebx +shr edx, 16 +movzx ebx, dl +shr edx, 8 +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 1*8 +xor esi, ebx +movzx ebx, BYTE PTR [r11+1+4*rdx] +xor edi, ebx +mov edx, r15d +movzx ebx, dl +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 3*8 +xor edi, ebx +movzx ebx, dh +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 2*8 +xor eax, ebx +shr edx, 16 +movzx ebx, dl +shr edx, 8 +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 1*8 +xor ecx, ebx +movzx ebx, BYTE PTR [r11+1+4*rdx] +xor esi, ebx +mov edx, r14d +movzx ebx, dl +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 3*8 +xor esi, ebx +movzx ebx, dh +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 2*8 +xor edi, ebx +shr edx, 16 +movzx ebx, dl +shr edx, 8 +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 1*8 +xor eax, ebx +movzx ebx, BYTE PTR [r11+1+4*rdx] +xor ecx, ebx +mov edx, r13d +movzx ebx, dl +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 3*8 +xor ecx, ebx +movzx ebx, dh +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 2*8 +xor esi, ebx +shr edx, 16 +movzx ebx, dl +shr edx, 8 +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 1*8 +xor edi, ebx +movzx ebx, BYTE PTR [r11+1+4*rdx] +xor eax, ebx +mov rbx, [rsp + 6*8 + 7*8] ; xorBlock +test rbx, rbx +jz label1 +xor eax, [rbx+0*4] +xor ecx, [rbx+1*4] +xor esi, [rbx+2*4] +xor edi, [rbx+3*4] +label1: +mov rbx, [rsp + 7*8 + 7*8] ; outBlock +mov [rbx+0*4], eax +mov [rbx+1*4], ecx +mov [rbx+2*4], esi +mov [rbx+3*4], edi +pop r15 +pop r14 +pop r13 +pop r12 +pop rdi +pop rsi +pop rbx +ret +Rijndael_Enc_ProcessAndXorBlock ENDP + +ALIGN 8 +Sosemanuk_OperateKeystream PROC FRAME +rex_push_reg rsi +push_reg rdi +alloc_stack(80*4*2+12*4+8*8 + 2*16+8) +save_xmm128 xmm6, 02f0h +save_xmm128 xmm7, 0300h +.endprolog +mov rdi, r8 +mov rax, r9 +mov QWORD PTR [rsp+1*8], rdi +mov QWORD PTR [rsp+2*8], rdx +mov QWORD PTR [rsp+6*8], rax +lea rcx, [4*rcx+rcx] +lea rsi, [4*rcx] +mov QWORD PTR [rsp+3*8], rsi +movdqa xmm0, [rax+0*16] +movdqa [rsp + 8*8+0*16], xmm0 +movdqa xmm0, [rax+1*16] +movdqa [rsp + 8*8+1*16], xmm0 +movq xmm0, QWORD PTR [rax+2*16] +movq QWORD PTR [rsp + 8*8+2*16], xmm0 +psrlq xmm0, 32 +movd r10d, xmm0 +mov ecx, [rax+10*4] +mov edx, 
[rax+11*4] +pcmpeqb xmm7, xmm7 +label2: +lea rdi, [rsp + 8*8 + 12*4] +mov rax, 80 +cmp rsi, 80 +cmovg rsi, rax +mov QWORD PTR [rsp+7*8], rsi +lea rsi, [rdi+rsi] +mov QWORD PTR [rsp+4*8], rsi +lea rsi, s_sosemanukMulTables +label0: +mov eax, [rsp + 8*8 + ((0+0)-((0+0)/(10))*(10))*4] +mov [rdi + (((0)-((0)/(4))*(4))*20 + (0/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((0)-((0)/(4))*(4))*20 + (0/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((0+3)-((0+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((0+2)-((0+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((0+0)-((0+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((1+0)-((1+0)/(10))*(10))*4] +mov [rdi + (((1)-((1)/(4))*(4))*20 + (1/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((1)-((1)/(4))*(4))*20 + (1/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((1+3)-((1+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((1+2)-((1+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((1+0)-((1+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((2+0)-((2+0)/(10))*(10))*4] +mov [rdi + (((2)-((2)/(4))*(4))*20 + (2/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((2)-((2)/(4))*(4))*20 + (2/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((2+3)-((2+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((2+2)-((2+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((2+0)-((2+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((3+0)-((3+0)/(10))*(10))*4] +mov [rdi + (((3)-((3)/(4))*(4))*20 + (3/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((3)-((3)/(4))*(4))*20 + (3/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((3+3)-((3+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((3+2)-((3+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((3+0)-((3+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((4+0)-((4+0)/(10))*(10))*4] +mov [rdi + (((4)-((4)/(4))*(4))*20 + (4/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((4)-((4)/(4))*(4))*20 + (4/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((4+3)-((4+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((4+2)-((4+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((4+0)-((4+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((5+0)-((5+0)/(10))*(10))*4] +mov [rdi + (((5)-((5)/(4))*(4))*20 + (5/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((5)-((5)/(4))*(4))*20 + (5/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx 
eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((5+3)-((5+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((5+2)-((5+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((5+0)-((5+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((6+0)-((6+0)/(10))*(10))*4] +mov [rdi + (((6)-((6)/(4))*(4))*20 + (6/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((6)-((6)/(4))*(4))*20 + (6/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((6+3)-((6+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((6+2)-((6+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((6+0)-((6+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((7+0)-((7+0)/(10))*(10))*4] +mov [rdi + (((7)-((7)/(4))*(4))*20 + (7/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((7)-((7)/(4))*(4))*20 + (7/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((7+3)-((7+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((7+2)-((7+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((7+0)-((7+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((8+0)-((8+0)/(10))*(10))*4] +mov [rdi + (((8)-((8)/(4))*(4))*20 + (8/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((8)-((8)/(4))*(4))*20 + (8/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((8+3)-((8+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((8+2)-((8+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((8+0)-((8+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((9+0)-((9+0)/(10))*(10))*4] +mov [rdi + (((9)-((9)/(4))*(4))*20 + (9/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((9)-((9)/(4))*(4))*20 + (9/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((9+3)-((9+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((9+2)-((9+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((9+0)-((9+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((10+0)-((10+0)/(10))*(10))*4] +mov [rdi + (((10)-((10)/(4))*(4))*20 + (10/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((10)-((10)/(4))*(4))*20 + (10/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((10+3)-((10+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((10+2)-((10+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((10+0)-((10+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((11+0)-((11+0)/(10))*(10))*4] +mov [rdi + (((11)-((11)/(4))*(4))*20 + (11/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((11)-((11)/(4))*(4))*20 
+ (11/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((11+3)-((11+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((11+2)-((11+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((11+0)-((11+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((12+0)-((12+0)/(10))*(10))*4] +mov [rdi + (((12)-((12)/(4))*(4))*20 + (12/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((12)-((12)/(4))*(4))*20 + (12/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((12+3)-((12+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((12+2)-((12+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((12+0)-((12+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((13+0)-((13+0)/(10))*(10))*4] +mov [rdi + (((13)-((13)/(4))*(4))*20 + (13/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((13)-((13)/(4))*(4))*20 + (13/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((13+3)-((13+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((13+2)-((13+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((13+0)-((13+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((14+0)-((14+0)/(10))*(10))*4] +mov [rdi + (((14)-((14)/(4))*(4))*20 + (14/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((14)-((14)/(4))*(4))*20 + (14/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((14+3)-((14+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((14+2)-((14+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((14+0)-((14+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((15+0)-((15+0)/(10))*(10))*4] +mov [rdi + (((15)-((15)/(4))*(4))*20 + (15/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((15)-((15)/(4))*(4))*20 + (15/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((15+3)-((15+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((15+2)-((15+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((15+0)-((15+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((16+0)-((16+0)/(10))*(10))*4] +mov [rdi + (((16)-((16)/(4))*(4))*20 + (16/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((16)-((16)/(4))*(4))*20 + (16/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((16+3)-((16+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((16+2)-((16+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((16+0)-((16+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + 
((17+0)-((17+0)/(10))*(10))*4] +mov [rdi + (((17)-((17)/(4))*(4))*20 + (17/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((17)-((17)/(4))*(4))*20 + (17/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((17+3)-((17+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((17+2)-((17+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((17+0)-((17+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((18+0)-((18+0)/(10))*(10))*4] +mov [rdi + (((18)-((18)/(4))*(4))*20 + (18/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((18)-((18)/(4))*(4))*20 + (18/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((18+3)-((18+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((18+2)-((18+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((18+0)-((18+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((19+0)-((19+0)/(10))*(10))*4] +mov [rdi + (((19)-((19)/(4))*(4))*20 + (19/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((19)-((19)/(4))*(4))*20 + (19/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((19+3)-((19+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((19+2)-((19+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((19+0)-((19+0)/(10))*(10))*4], r10d +add rdi, 5*4 +cmp rdi, QWORD PTR [rsp+4*8] +jne label0 +mov rax, QWORD PTR [rsp+2*8] +mov r11, QWORD PTR [rsp+1*8] +lea rdi, [rsp + 8*8 + 12*4] +mov rsi, QWORD PTR [rsp+7*8] +label1: +movdqa xmm0, [rdi+0*20*4] +movdqa xmm2, [rdi+2*20*4] +movdqa xmm3, [rdi+3*20*4] +movdqa xmm1, [rdi+1*20*4] +movdqa xmm4, xmm0 +pand xmm0, xmm2 +pxor xmm0, xmm3 +pxor xmm2, xmm1 +pxor xmm2, xmm0 +por xmm3, xmm4 +pxor xmm3, xmm1 +pxor xmm4, xmm2 +movdqa xmm1, xmm3 +por xmm3, xmm4 +pxor xmm3, xmm0 +pand xmm0, xmm1 +pxor xmm4, xmm0 +pxor xmm1, xmm3 +pxor xmm1, xmm4 +pxor xmm4, xmm7 +pxor xmm2, [rdi+80*4] +pxor xmm3, [rdi+80*5] +pxor xmm1, [rdi+80*6] +pxor xmm4, [rdi+80*7] +cmp rsi, 16 +jl label4 +movdqa xmm6, xmm2 +punpckldq xmm2, xmm3 +movdqa xmm5, xmm1 +punpckldq xmm1, xmm4 +movdqa xmm0, xmm2 +punpcklqdq xmm2, xmm1 +punpckhqdq xmm0, xmm1 +punpckhdq xmm6, xmm3 +punpckhdq xmm5, xmm4 +movdqa xmm3, xmm6 +punpcklqdq xmm6, xmm5 +punpckhqdq xmm3, xmm5 +test rax, rax +jz labelSSE2_Sosemanuk_Output3 +test rax, 15 +jnz labelSSE2_Sosemanuk_Output7 +pxor xmm2, [rax+0*16] +pxor xmm0, [rax+1*16] +pxor xmm6, [rax+2*16] +pxor xmm3, [rax+3*16] +add rax, 4*16 +jmp labelSSE2_Sosemanuk_Output3 +labelSSE2_Sosemanuk_Output7: +movdqu xmm1, [rax+0*16] +pxor xmm2, xmm1 +movdqu xmm1, [rax+1*16] +pxor xmm0, xmm1 +movdqu xmm1, [rax+2*16] +pxor xmm6, xmm1 +movdqu xmm1, [rax+3*16] +pxor xmm3, xmm1 +add rax, 4*16 +labelSSE2_Sosemanuk_Output3: +test r11, 15 +jnz labelSSE2_Sosemanuk_Output8 +movdqa [r11+0*16], xmm2 +movdqa [r11+1*16], xmm0 +movdqa [r11+2*16], xmm6 +movdqa [r11+3*16], xmm3 +jmp labelSSE2_Sosemanuk_Output9 +labelSSE2_Sosemanuk_Output8: +movdqu [r11+0*16], xmm2 +movdqu [r11+1*16], xmm0 +movdqu [r11+2*16], xmm6 +movdqu [r11+3*16], 
xmm3 +labelSSE2_Sosemanuk_Output9: +add r11, 4*16 +add rdi, 4*4 +sub rsi, 16 +jnz label1 +mov rsi, QWORD PTR [rsp+3*8] +sub rsi, 80 +jz label6 +mov QWORD PTR [rsp+3*8], rsi +mov QWORD PTR [rsp+2*8], rax +mov QWORD PTR [rsp+1*8], r11 +jmp label2 +label4: +test rax, rax +jz label5 +movd xmm0, dword ptr [rax+0*4] +pxor xmm2, xmm0 +movd xmm0, dword ptr [rax+1*4] +pxor xmm3, xmm0 +movd xmm0, dword ptr [rax+2*4] +pxor xmm1, xmm0 +movd xmm0, dword ptr [rax+3*4] +pxor xmm4, xmm0 +add rax, 16 +label5: +movd dword ptr [r11+0*4], xmm2 +movd dword ptr [r11+1*4], xmm3 +movd dword ptr [r11+2*4], xmm1 +movd dword ptr [r11+3*4], xmm4 +sub rsi, 4 +jz label6 +add r11, 16 +psrldq xmm2, 4 +psrldq xmm3, 4 +psrldq xmm1, 4 +psrldq xmm4, 4 +jmp label4 +label6: +mov r10, QWORD PTR [rsp+6*8] +movdqa xmm0, [rsp + 8*8+0*16] +movdqa [r10+0*16], xmm0 +movdqa xmm0, [rsp + 8*8+1*16] +movdqa [r10+1*16], xmm0 +movq xmm0, QWORD PTR [rsp + 8*8+2*16] +movq QWORD PTR [r10+2*16], xmm0 +mov [r10+10*4], ecx +mov [r10+11*4], edx +movdqa xmm6, [rsp + 02f0h] +movdqa xmm7, [rsp + 0300h] +add rsp, 80*4*2+12*4+8*8 + 2*16+8 +pop rdi +pop rsi +ret +Sosemanuk_OperateKeystream ENDP + +Panama_SSE2_Pull PROC FRAME +alloc_stack(2*16+8) +save_xmm128 xmm6, 0h +save_xmm128 xmm7, 10h +.endprolog +shl rcx, 5 +jz label5 +mov r10d, [rdx+4*17] +add rcx, r10 +mov rdi, rcx +movdqa xmm0, xmmword ptr [rdx+0*16] +movdqa xmm1, xmmword ptr [rdx+1*16] +movdqa xmm2, xmmword ptr [rdx+2*16] +movdqa xmm3, xmmword ptr [rdx+3*16] +mov eax, dword ptr [rdx+4*16] +label4: +movdqa xmm6, xmm2 +movss xmm6, xmm3 +pshufd xmm5, xmm6, 0*64+3*16+2*4+1 +movd xmm6, eax +movdqa xmm7, xmm3 +movss xmm7, xmm6 +pshufd xmm6, xmm7, 0*64+3*16+2*4+1 +movd ecx, xmm2 +not ecx +movd r11d, xmm3 +or ecx, r11d +xor eax, ecx +pcmpeqb xmm7, xmm7 +pxor xmm7, xmm1 +por xmm7, xmm2 +pxor xmm7, xmm3 +movd ecx, xmm7 +rol ecx, (((((5*1) MOD (17))*(((5*1) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(1)) MOD (17)))*13+16)) MOD (17))*4], ecx +pshuflw xmm7, xmm7, 1*64+0*16+3*4+2 +movd ecx, xmm7 +rol ecx, (((((5*5) MOD (17))*(((5*5) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(5)) MOD (17)))*13+16)) MOD (17))*4], ecx +punpckhqdq xmm7, xmm7 +movd ecx, xmm7 +rol ecx, (((((5*9) MOD (17))*(((5*9) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(9)) MOD (17)))*13+16)) MOD (17))*4], ecx +pshuflw xmm7, xmm7, 1*64+0*16+3*4+2 +movd ecx, xmm7 +rol ecx, (((((5*13) MOD (17))*(((5*13) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(13)) MOD (17)))*13+16)) MOD (17))*4], ecx +pcmpeqb xmm7, xmm7 +pxor xmm7, xmm0 +por xmm7, xmm1 +pxor xmm7, xmm2 +movd ecx, xmm7 +rol ecx, (((((5*2) MOD (17))*(((5*2) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(2)) MOD (17)))*13+16)) MOD (17))*4], ecx +pshuflw xmm7, xmm7, 1*64+0*16+3*4+2 +movd ecx, xmm7 +rol ecx, (((((5*6) MOD (17))*(((5*6) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(6)) MOD (17)))*13+16)) MOD (17))*4], ecx +punpckhqdq xmm7, xmm7 +movd ecx, xmm7 +rol ecx, (((((5*10) MOD (17))*(((5*10) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(10)) MOD (17)))*13+16)) MOD (17))*4], ecx +pshuflw xmm7, xmm7, 1*64+0*16+3*4+2 +movd ecx, xmm7 +rol ecx, (((((5*14) MOD (17))*(((5*14) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(14)) MOD (17)))*13+16)) MOD (17))*4], ecx +pcmpeqb xmm7, xmm7 +pxor xmm7, xmm6 +por xmm7, xmm0 +pxor xmm7, xmm1 +movd ecx, xmm7 +rol ecx, (((((5*3) MOD (17))*(((5*3) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(3)) MOD (17)))*13+16)) MOD (17))*4], ecx +pshuflw xmm7, xmm7, 1*64+0*16+3*4+2 +movd ecx, xmm7 +rol ecx, (((((5*7) MOD (17))*(((5*7) MOD (17))+1)/2)) MOD (32)) 
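+; rotation counts and word indices here are assembler-time constant expressions built with MASM's MOD operator (ASM_MOD in cpu.h): word i = 5*j mod 17 is rotated by i*(i+1)/2 mod 32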
+mov [rdx+((((((5*(7)) MOD (17)))*13+16)) MOD (17))*4], ecx +punpckhqdq xmm7, xmm7 +movd ecx, xmm7 +rol ecx, (((((5*11) MOD (17))*(((5*11) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(11)) MOD (17)))*13+16)) MOD (17))*4], ecx +pshuflw xmm7, xmm7, 1*64+0*16+3*4+2 +movd ecx, xmm7 +rol ecx, (((((5*15) MOD (17))*(((5*15) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(15)) MOD (17)))*13+16)) MOD (17))*4], ecx +pcmpeqb xmm7, xmm7 +pxor xmm7, xmm5 +por xmm7, xmm6 +pxor xmm7, xmm0 +movd ecx, xmm7 +rol ecx, (((((5*4) MOD (17))*(((5*4) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(4)) MOD (17)))*13+16)) MOD (17))*4], ecx +pshuflw xmm7, xmm7, 1*64+0*16+3*4+2 +movd ecx, xmm7 +rol ecx, (((((5*8) MOD (17))*(((5*8) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(8)) MOD (17)))*13+16)) MOD (17))*4], ecx +punpckhqdq xmm7, xmm7 +movd ecx, xmm7 +rol ecx, (((((5*12) MOD (17))*(((5*12) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(12)) MOD (17)))*13+16)) MOD (17))*4], ecx +pshuflw xmm7, xmm7, 1*64+0*16+3*4+2 +movd ecx, xmm7 +rol ecx, (((((5*16) MOD (17))*(((5*16) MOD (17))+1)/2)) MOD (32)) +mov [rdx+((((((5*(16)) MOD (17)))*13+16)) MOD (17))*4], ecx +movdqa xmm4, xmm3 +punpcklqdq xmm3, xmm2 +punpckhdq xmm4, xmm2 +movdqa xmm2, xmm1 +punpcklqdq xmm1, xmm0 +punpckhdq xmm2, xmm0 +test r8, r8 +jz label0 +movdqa xmm6, xmm4 +punpcklqdq xmm4, xmm2 +punpckhqdq xmm6, xmm2 +test r9, 15 +jnz label2 +test r9, r9 +jz label1 +pxor xmm4, [r9] +pxor xmm6, [r9+16] +add r9, 32 +jmp label1 +label2: +movdqu xmm0, [r9] +movdqu xmm2, [r9+16] +pxor xmm4, xmm0 +pxor xmm6, xmm2 +add r9, 32 +label1: +test r8, 15 +jnz label3 +movdqa xmmword ptr [r8], xmm4 +movdqa xmmword ptr [r8+16], xmm6 +add r8, 32 +jmp label0 +label3: +movdqu xmmword ptr [r8], xmm4 +movdqu xmmword ptr [r8+16], xmm6 +add r8, 32 +label0: +lea rcx, [r10 + 32] +and rcx, 31*32 +lea r11, [r10 + (32-24)*32] +and r11, 31*32 +movdqa xmm0, xmmword ptr [rdx+20*4+rcx+0*8] +pxor xmm3, xmm0 +pshufd xmm0, xmm0, 2*64+3*16+0*4+1 +movdqa xmmword ptr [rdx+20*4+rcx+0*8], xmm3 +pxor xmm0, xmmword ptr [rdx+20*4+r11+2*8] +movdqa xmmword ptr [rdx+20*4+r11+2*8], xmm0 +movdqa xmm4, xmmword ptr [rdx+20*4+rcx+2*8] +pxor xmm1, xmm4 +movdqa xmmword ptr [rdx+20*4+rcx+2*8], xmm1 +pxor xmm4, xmmword ptr [rdx+20*4+r11+0*8] +movdqa xmmword ptr [rdx+20*4+r11+0*8], xmm4 +movdqa xmm3, xmmword ptr [rdx+3*16] +movdqa xmm2, xmmword ptr [rdx+2*16] +movdqa xmm1, xmmword ptr [rdx+1*16] +movdqa xmm0, xmmword ptr [rdx+0*16] +movd xmm6, eax +movdqa xmm7, xmm3 +movss xmm7, xmm6 +movdqa xmm6, xmm2 +movss xmm6, xmm3 +movdqa xmm5, xmm1 +movss xmm5, xmm2 +movdqa xmm4, xmm0 +movss xmm4, xmm1 +pshufd xmm7, xmm7, 0*64+3*16+2*4+1 +pshufd xmm6, xmm6, 0*64+3*16+2*4+1 +pshufd xmm5, xmm5, 0*64+3*16+2*4+1 +pshufd xmm4, xmm4, 0*64+3*16+2*4+1 +xor eax, 1 +movd ecx, xmm0 +xor eax, ecx +movd ecx, xmm3 +xor eax, ecx +pxor xmm3, xmm2 +pxor xmm2, xmm1 +pxor xmm1, xmm0 +pxor xmm0, xmm7 +pxor xmm3, xmm7 +pxor xmm2, xmm6 +pxor xmm1, xmm5 +pxor xmm0, xmm4 +lea rcx, [r10 + (32-4)*32] +and rcx, 31*32 +lea r11, [r10 + 16*32] +and r11, 31*32 +movdqa xmm4, xmmword ptr [rdx+20*4+rcx+0*16] +movdqa xmm5, xmmword ptr [rdx+20*4+r11+0*16] +movdqa xmm6, xmm4 +punpcklqdq xmm4, xmm5 +punpckhqdq xmm6, xmm5 +pxor xmm3, xmm4 +pxor xmm2, xmm6 +movdqa xmm4, xmmword ptr [rdx+20*4+rcx+1*16] +movdqa xmm5, xmmword ptr [rdx+20*4+r11+1*16] +movdqa xmm6, xmm4 +punpcklqdq xmm4, xmm5 +punpckhqdq xmm6, xmm5 +pxor xmm1, xmm4 +pxor xmm0, xmm6 +add r10, 32 +cmp r10, rdi +jne label4 +mov [rdx+4*16], eax +movdqa xmmword ptr [rdx+3*16], xmm3 +movdqa xmmword ptr [rdx+2*16], 
xmm2 +movdqa xmmword ptr [rdx+1*16], xmm1 +movdqa xmmword ptr [rdx+0*16], xmm0 +label5: +movdqa xmm6, [rsp + 0h] +movdqa xmm7, [rsp + 10h] +add rsp, 2*16+8 +ret +Panama_SSE2_Pull ENDP + _TEXT ENDS END
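
Note on the workspace technique visible in the sosemanuk.cpp and whrlpool.cpp
hunks above: on x64, the GCC inline-assembly paths no longer carve locals out
of the stack by adjusting the stack pointer; a C++ array (workspace) is
declared next to the __asm__ block and passed in as an extra "r" input
operand, which is what the SSE2_workspace binding (%5 in sosemanuk.cpp, %3 in
whrlpool.cpp) refers to. A minimal self-contained sketch of the pattern, with
a hypothetical function name:

#include <emmintrin.h>

void WorkspaceSketch()
{
	__m128i workspace[8];	// 16-byte aligned scratch allocated by the compiler
	__asm__ __volatile__ (
		".intel_syntax noprefix;"
		"pxor    xmm0, xmm0;"
		"movdqa  [%0], xmm0;"	// spill into the C++ workspace, not below RSP
		".att_syntax prefix;"
		:
		: "r" (workspace)
		: "memory", "%xmm0"
	);
}

Since the asm body never modifies RSP or RBP, the compiler-generated prologue
stays the only code that moves the stack pointer, so the frame can still be
unwound; the MASM procedures above get the same property on Windows x64 from
PROC FRAME, alloc_stack/save_xmm128 and .endprolog, which emit explicit
unwind records.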