10 files changed, 2765 insertions, 2156 deletions
diff --git a/integer.cpp b/integer.cpp
index 0c5018ee..515643ed 100644
--- a/integer.cpp
+++ b/integer.cpp
@@ -14,30 +14,20 @@
 #include "algparam.h"
 #include "pubkey.h"		// for P1363_KDF2
 #include "sha.h"
+#include "cpu.h"
 
 #include <iostream>
 
-#ifdef _M_X64
-#include <Intrin.h>
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+	#include <intrin.h>
 #endif
 
-#ifdef SSE2_INTRINSICS_AVAILABLE
-	#ifdef __GNUC__
-		#include <xmmintrin.h>
-		#include <signal.h>
-		#include <setjmp.h>
-		#ifdef CRYPTOPP_MEMALIGN_AVAILABLE
-			#include <malloc.h>
-		#else
-			#include <stdlib.h>
-		#endif
-	#else
-		#include <emmintrin.h>
-	#endif
-#elif defined(_MSC_VER) && defined(_M_IX86)
-	#pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 intrinsics will be disabled.")
-#elif defined(__GNUC__) && defined(__i386__)
-	#warning "You do not have GCC 3.3 or later, or did not specify -msse2 compiler option, so use of SSE2 intrinsics will be disabled."
+#ifdef __DECCXX
+	#include <c_asm.h>
+#endif
+
+#ifdef CRYPTOPP_MSVC6_NO_PP
+	#pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 instructions will be disabled.")
 #endif
 
 NAMESPACE_BEGIN(CryptoPP)
@@ -50,67 +40,7 @@ bool AssignIntToInteger(const std::type_info &valueType, void *pInteger, const v
 	return true;
 }
 
-#ifdef SSE2_INTRINSICS_AVAILABLE
-template <class T>
-CPP_TYPENAME AlignedAllocator<T>::pointer AlignedAllocator<T>::allocate(size_type n, const void *)
-{
-	CheckSize(n);
-	if (n == 0)
-		return NULL;
-	if (n >= 4)
-	{
-		void *p;
-	#ifdef CRYPTOPP_MM_MALLOC_AVAILABLE
-		while (!(p = _mm_malloc(sizeof(T)*n, 16)))
-	#elif defined(CRYPTOPP_MEMALIGN_AVAILABLE)
-		while (!(p = memalign(16, sizeof(T)*n)))
-	#elif defined(CRYPTOPP_MALLOC_ALIGNMENT_IS_16)
-		while (!(p = malloc(sizeof(T)*n)))
-	#else
-		while (!(p = (byte *)malloc(sizeof(T)*n + 8)))	// assume malloc alignment is at least 8
-	#endif
-			CallNewHandler();
-
-	#ifdef CRYPTOPP_NO_ALIGNED_ALLOC
-		assert(m_pBlock == NULL);
-		m_pBlock = p;
-		if (!IsAlignedOn(p, 16))
-		{
-			assert(IsAlignedOn(p, 8));
-			p = (byte *)p + 8;
-		}
-	#endif
-
-		assert(IsAlignedOn(p, 16));
-		return (T*)p;
-	}
-	return new T[n];
-}
-
-template <class T>
-void AlignedAllocator<T>::deallocate(void *p, size_type n)
-{
-	memset(p, 0, n*sizeof(T));
-	if (n >= 4)
-	{
-		#ifdef CRYPTOPP_MM_MALLOC_AVAILABLE
-			_mm_free(p);
-		#elif defined(CRYPTOPP_NO_ALIGNED_ALLOC)
-			assert(m_pBlock == p || (byte *)m_pBlock+8 == p);
-			free(m_pBlock);
-			m_pBlock = NULL;
-		#else
-			free(p);
-		#endif
-	}
-	else
-		delete [] (T *)p;
-}
-
-template class CRYPTOPP_DLL AlignedAllocator<word>;
-#endif
-
-static int Compare(const word *A, const word *B, size_t N)
+inline static int Compare(const word *A, const word *B, size_t N)
 {
 	while (N--)
 		if (A[N] > B[N])
@@ -121,7 +51,7 @@ static int Compare(const word *A, const word *B, size_t N)
 	return 0;
 }
 
-static int Increment(word *A, size_t N, word B=1)
+inline static int Increment(word *A, size_t N, word B=1)
 {
 	assert(N);
 	word t = A[0];
@@ -134,7 +64,7 @@ static int Increment(word *A, size_t N, word B=1)
 	return 1;
 }
 
-static int Decrement(word *A, size_t N, word B=1)
+inline static int Decrement(word *A, size_t N, word B=1)
 {
 	assert(N);
 	word t = A[0];
@@ -169,6 +99,45 @@ static word AtomicInverseModPower2(word A)
 
 // ********************************************************
 
+#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE	// a two-word value is held in one native dword
+	#define Declare2Words(x)			dword x;	// declares only; x is NOT initialized
+	#if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER)
+		#define MultiplyWords(p, a, b)		p = __emulu(a, b);
+	#else
+		#define MultiplyWords(p, a, b)		p = (dword)a*b;
+	#endif
+	#define AssignWord(a, b)			a = b;
+	#define Add2WordsBy1(a, b, c)		a = b + c;
+	#define Acc2WordsBy1(a, b)			a += b;
+	#define Acc2WordsBy2(a, b)			a += b;
+	#define LowWord(a)					(word)a
+	#define HighWord(a)					(word)(a>>WORD_BITS)
+	#define Double2Words(a)				a += a;
+	#define AddWithCarry(u, a, b)		u = dword(a) + b + GetCarry(u);	// carry-in is the high word left in u by the previous step
+	#define SubtractWithBorrow(u, a, b)	u = dword(a) - b - GetBorrow(u);	// borrow-in is the top bit left in u by the previous step
+	#define GetCarry(u)					HighWord(u)
+	#define GetBorrow(u)				word(u>>(WORD_BITS*2-1))
+#else	// a two-word value x is a pair of words named x0 (low) and x1 (high)
+	#define Declare2Words(x)			word x##0, x##1;	// declares only; x0 and x1 are NOT initialized
+	#define AssignWord(a, b)			a##0 = b; a##1 = 0;
+	#define Add2WordsBy1(a, b, c)		a##0 = b##0 + c; a##1 = b##1 + (a##0 < c);	// (a0 < c) is the carry out of the low word
+	#define Acc2WordsBy1(a, b)			Add2WordsBy1(a, a, b)
+	#define Acc2WordsBy2(a, b)			a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1;
+	#define LowWord(a)					a##0
+	#define HighWord(a)					a##1
+	#ifdef _MSC_VER
+		#define MultiplyWords(p, a, b)		p##0 = _umul128(a, b, &p##1);
+		#define Double2Words(a)				a##1 = __shiftleft128(a##0, a##1, 1); a##0 += a##0;
+	#elif defined(__DECCXX)
+		#define MultiplyWords(p, a, b)		p##0 = a*b; p##1 = asm("umulh %a0, %a1, %v0", a, b);
+		#define Double2Words(a)				a##1 = (a##1 + a##1) + (a##0 >> (WORD_BITS-1)); a##0 += a##0;
+	#endif	// NOTE(review): no MultiplyWords/Double2Words is defined here for any other compiler
+	#define AddWithCarry(u, a, b)		{word t = a+b; u##0 = t + u##1; u##1 = (t<a) + (u##0<t);}	// u1 is the carry in and receives the carry out
+	#define SubtractWithBorrow(u, a, b)	{word t = a-b; u##0 = t - u##1; u##1 = (t>a) + (u##0>t);}	// u1 is the borrow in and receives the borrow out
+	#define GetCarry(u)					u##1
+	#define GetBorrow(u)				u##1
+#endif
+
 class DWord
 {
 public:
@@ -198,25 +167,8 @@ public:
 		DWord r;
 		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
 			r.m_whole = (dword)a * b;
-		#elif defined(__alpha__)
-			r.m_halfs.low = a*b; __asm__("umulh %1,%2,%0" : "=r" (r.m_halfs.high) : "r" (a), "r" (b));
-		#elif defined(__ia64__)
-			r.m_halfs.low = a*b; __asm__("xmpy.hu %0=%1,%2" : "=f" (r.m_halfs.high) : "f" (a), "f" (b));
-		#elif defined(_ARCH_PPC64)
-			r.m_halfs.low = a*b; __asm__("mulhdu %0,%1,%2" : "=r" (r.m_halfs.high) : "r" (a), "r" (b) : "cc");
-		#elif defined(__x86_64__)
-			__asm__("mulq %3" : "=d" (r.m_halfs.high), "=a" (r.m_halfs.low) : "a" (a), "rm" (b) : "cc");
-		#elif defined(__mips64)
-			__asm__("dmultu %2,%3" : "=h" (r.m_halfs.high), "=l" (r.m_halfs.low) : "r" (a), "r" (b));
-		#elif defined(_M_X64)
-			r.m_halfs.low = _umul128(a, b, &r.m_halfs.high);
-		#elif defined(_M_IX86)
-			// for testing
-			word64 t = (word64)a * b;
-			r.m_halfs.high = ((word32 *)(&t))[1];
-			r.m_halfs.low = (word32)t;
 		#else
-			#error can not implement DWord
+			r.m_halfs.low = _umul128(a, b, &r.m_halfs.high);
 		#endif
 		return r;
 	}
@@ -457,1529 +409,1449 @@ inline word DWord::operator%(word a)
 
 // ********************************************************
 
-class Portable
-{
-public:
-	static int Add(word *C, const word *A, const word *B, size_t N);
-	static int Subtract(word *C, const word *A, const word *B, size_t N);
-
-	static inline void Multiply2(word *C, const word *A, const word *B);
-	static inline word Multiply2Add(word *C, const word *A, const word *B);
-	static void Multiply4(word *C, const word *A, const word *B);
-	static void Multiply8(word *C, const word *A, const word *B);
-	static inline unsigned int MultiplyRecursionLimit() {return 8;}
-
-	static inline void Multiply2Bottom(word *C, const word *A, const word *B);
-	static void Multiply4Bottom(word *C, const word *A, const word *B);
-	static void Multiply8Bottom(word *C, const word *A, const word *B);
-	static inline unsigned int MultiplyBottomRecursionLimit() {return 8;}
-
-	static void Square2(word *R, const word *A);
-	static void Square4(word *R, const word *A);
-	static void Square8(word *R, const word *A) {assert(false);}
-	static inline unsigned int SquareRecursionLimit() {return 4;}
-};
+// use some tricks to share assembly code between MSVC and GCC
+#if defined(__GNUC__)
+	#define CRYPTOPP_NAKED	// empty: the GCC path wraps the shared asm in an ordinary function body
+	#define AddPrologue /* opens the asm statement that AddEpilogue closes */ \
+		__asm__ __volatile__ \
+		( \
+			"push %%ebx;"	/* save this manually, in case of -fPIC */ \
+			"mov %2, %%ebx;" /* operand 2 is B, see the input list in AddEpilogue */ \
+			".intel_syntax noprefix;"
+	#define AddEpilogue /* restores ebx, then binds edx = C, eax = A, ecx = N and passes B as a memory operand */ \
+			".att_syntax prefix;" \
+			"pop %%ebx;" \
+					: \
+					: "d" (C), "a" (A), "m" (B), "c" (N) \
+					: "%esi", "memory", "cc" \
+		);
+	#define MulPrologue /* opens the asm statement that Mul/Squ/TopEpilogue closes */ \
+		__asm__ __volatile__ \
+		( \
+			".intel_syntax noprefix;" \
+			AS1(	push	ebx) \
+			AS2(	mov		ebx, edx) /* edx carries the "d" input operand, s_maskLow16 */
+	#define MulEpilogue \
+			AS1(	pop		ebx) \
+			".att_syntax prefix;" \
+			: \
+			: "d" (s_maskLow16), "c" (C), "a" (A), "D" (B) \
+			: "%esi", "memory", "cc" \
+		);
+	#define SquPrologue		MulPrologue
+	#define SquEpilogue	/* no B operand here, so edi is listed as clobbered instead */ \
+			AS1(	pop		ebx) \
+			".att_syntax prefix;" \
+			: \
+			: "d" (s_maskLow16), "c" (C), "a" (A) \
+			: "%esi", "%edi", "memory", "cc" \
+		);
+	#define TopPrologue		MulPrologue
+	#define TopEpilogue	/* L is passed in esi, so esi is an input rather than a clobber */ \
+			AS1(	pop		ebx) \
+			".att_syntax prefix;" \
+			: \
+			: "d" (s_maskLow16), "c" (C), "a" (A), "D" (B), "S" (L) \
+			: "memory", "cc" \
+		);
+#else
+	#define CRYPTOPP_NAKED __declspec(naked)	// the prologue/epilogue macros below save registers and return by hand
+	#define AddPrologue /* assumes CRYPTOPP_FASTCALL puts N in ecx and C in edx, with A and B on the stack - TODO confirm */ \
+		__asm	push ebx \
+		__asm	push esi \
+		__asm	mov		eax, [esp+12] /* A: sits above the two saved registers and the return address */ \
+		__asm	mov		ebx, [esp+16] /* B: the next stack argument */
+	#define AddEpilogue \
+		__asm	pop esi \
+		__asm	pop ebx \
+		__asm	ret 8	// return and pop the two stack arguments (8 bytes)
+	#define SquPrologue					\
+		AS2(	mov		eax, A)			\
+		AS2(	mov		ecx, C)			\
+		AS2(	lea		ebx, s_maskLow16)
+	#define SquEpilogue
+	#define MulPrologue					\
+		AS2(	mov		eax, A)			\
+		AS2(	mov		edi, B)			\
+		AS2(	mov		ecx, C)			\
+		AS2(	lea		ebx, s_maskLow16)
+	#define MulEpilogue
+	#define TopPrologue					\
+		AS2(	mov		eax, A)			\
+		AS2(	mov		edi, B)			\
+		AS2(	mov		ecx, C)			\
+		AS2(	mov		esi, L)			\
+		AS2(	lea		ebx, s_maskLow16)
+	#define TopEpilogue
+#endif
 
-int Portable::Add(word *C, const word *A, const word *B, size_t N)
+#if defined(_MSC_VER) && defined(_M_X64)
+extern "C" {	// declarations only; no definition appears in this branch - presumably supplied by a separate assembly source (TODO confirm)
+int Baseline_Add(size_t N, word *C, const word *A, const word *B);
+int Baseline_Sub(size_t N, word *C, const word *A, const word *B);
+}
+#elif defined(CRYPTOPP_X86_ASM_AVAILABLE)
+CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)	// C = A + B over N 32-bit words (N must be even); returns the carry out
 {
-	assert (N%2 == 0);
+	AddPrologue
 
-	DWord u(0, 0);
-	for (unsigned int i = 0; i < N; i+=2)
-	{
-		u = DWord(A[i]) + B[i] + u.GetHighHalf();
-		C[i] = u.GetLowHalf();
-		u = DWord(A[i+1]) + B[i+1] + u.GetHighHalf();
-		C[i+1] = u.GetLowHalf();
-	}
-	return int(u.GetHighHalf());
+	// now: eax = A, ebx = B, edx = C, ecx = N
+	AS2(	lea		eax, [eax+4*ecx])	// point all three arrays one past their last word
+	AS2(	lea		ebx, [ebx+4*ecx])
+	AS2(	lea		edx, [edx+4*ecx])
+
+	AS1(	neg		ecx)				// ecx is negative index
+	AS2(	test	ecx, 2)				// this clears carry flag
+	ASJ(	jz,		0, f)				// bit 1 clear: N % 4 == 0, enter at the top of the 4-word loop
+	AS2(	sub		ecx, 2)				// N % 4 == 2: bias the index so label 1 starts at word 0 (no borrow, carry stays clear)
+	ASJ(	jmp,	1, f)
+
+	ASL(0)
+	ASJ(	jecxz,	2, f)				// loop until ecx overflows and becomes zero
+	AS2(	mov		esi,[eax+4*ecx])
+	AS2(	adc		esi,[ebx+4*ecx])
+	AS2(	mov		[edx+4*ecx],esi)
+	AS2(	mov		esi,[eax+4*ecx+4])
+	AS2(	adc		esi,[ebx+4*ecx+4])
+	AS2(	mov		[edx+4*ecx+4],esi)
+	ASL(1)
+	AS2(	mov		esi,[eax+4*ecx+8])
+	AS2(	adc		esi,[ebx+4*ecx+8])
+	AS2(	mov		[edx+4*ecx+8],esi)
+	AS2(	mov		esi,[eax+4*ecx+12])
+	AS2(	adc		esi,[ebx+4*ecx+12])
+	AS2(	mov		[edx+4*ecx+12],esi)
+
+	AS2(	lea		ecx,[ecx+4])		// advance index, avoid inc which causes slowdown on Intel Core 2
+	ASJ(	jmp,	0, b)
+
+	ASL(2)
+	AS2(	mov		eax, 0)				// mov does not modify flags, so the carry from the last adc survives
+	AS1(	setc	al)					// store carry into eax (return result register)
+
+	AddEpilogue
 }
 
-int Portable::Subtract(word *C, const word *A, const word *B, size_t N)
+CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)	// C = A - B over N 32-bit words (N must be even); returns the borrow out
 {
-	assert (N%2 == 0);
+	AddPrologue
 
-	DWord u(0, 0);
-	for (unsigned int i = 0; i < N; i+=2)
-	{
-		u = (DWord) A[i] - B[i] - u.GetHighHalfAsBorrow();
-		C[i] = u.GetLowHalf();
-		u = (DWord) A[i+1] - B[i+1] - u.GetHighHalfAsBorrow();
-		C[i+1] = u.GetLowHalf();
-	}
-	return int(0-u.GetHighHalf());
+	// now: eax = A, ebx = B, edx = C, ecx = N
+	AS2(	lea		eax, [eax+4*ecx])	// point all three arrays one past their last word
+	AS2(	lea		ebx, [ebx+4*ecx])
+	AS2(	lea		edx, [edx+4*ecx])
+
+	AS1(	neg		ecx)				// ecx is negative index
+	AS2(	test	ecx, 2)				// this clears carry flag
+	ASJ(	jz,		0, f)				// bit 1 clear: N % 4 == 0, enter at the top of the 4-word loop
+	AS2(	sub		ecx, 2)				// N % 4 == 2: bias the index so label 1 starts at word 0 (no borrow, carry stays clear)
+	ASJ(	jmp,	1, f)
+
+	ASL(0)
+	ASJ(	jecxz,	2, f)				// loop until ecx overflows and becomes zero
+	AS2(	mov		esi,[eax+4*ecx])
+	AS2(	sbb		esi,[ebx+4*ecx])
+	AS2(	mov		[edx+4*ecx],esi)
+	AS2(	mov		esi,[eax+4*ecx+4])
+	AS2(	sbb		esi,[ebx+4*ecx+4])
+	AS2(	mov		[edx+4*ecx+4],esi)
+	ASL(1)
+	AS2(	mov		esi,[eax+4*ecx+8])
+	AS2(	sbb		esi,[ebx+4*ecx+8])
+	AS2(	mov		[edx+4*ecx+8],esi)
+	AS2(	mov		esi,[eax+4*ecx+12])
+	AS2(	sbb		esi,[ebx+4*ecx+12])
+	AS2(	mov		[edx+4*ecx+12],esi)
+
+	AS2(	lea		ecx,[ecx+4])		// advance index, avoid inc which causes slowdown on Intel Core 2
+	ASJ(	jmp,	0, b)
+
+	ASL(2)
+	AS2(	mov		eax, 0)				// mov does not modify flags, so the borrow from the last sbb survives
+	AS1(	setc	al)					// store borrow (the carry flag after sbb) into eax (return result register)
+
+	AddEpilogue
 }
 
-void Portable::Multiply2(word *C, const word *A, const word *B)
+CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B)	// C = A + B over N 32-bit words using 64-bit adds in MMX registers; returns the carry out
 {
-/*
-	word s;
-	dword d;
+	AddPrologue
 
-	if (A1 >= A0)
-		if (B0 >= B1)
-		{
-			s = 0;
-			d = (dword)(A1-A0)*(B0-B1);
-		}
-		else
-		{
-			s = (A1-A0);
-			d = (dword)s*(word)(B0-B1);
-		}
-	else
-		if (B0 > B1)
-		{
-			s = (B0-B1);
-			d = (word)(A1-A0)*(dword)s;
-		}
-		else
-		{
-			s = 0;
-			d = (dword)(A0-A1)*(B1-B0);
-		}
-*/
-	// this segment is the branchless equivalent of above
-	word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
-	unsigned int ai = A[1] < A[0];
-	unsigned int bi = B[0] < B[1];
-	unsigned int di = ai & bi;
-	DWord d = DWord::Multiply(D[di], D[di+2]);
-	D[1] = D[3] = 0;
-	unsigned int si = ai + !bi;
-	word s = D[si];
-
-	DWord A0B0 = DWord::Multiply(A[0], B[0]);
-	C[0] = A0B0.GetLowHalf();
-
-	DWord A1B1 = DWord::Multiply(A[1], B[1]);
-	DWord t = (DWord) A0B0.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() + A1B1.GetLowHalf();
-	C[1] = t.GetLowHalf();
-
-	t = A1B1 + t.GetHighHalf() + A0B0.GetHighHalf() + d.GetHighHalf() + A1B1.GetHighHalf() - s;
-	C[2] = t.GetLowHalf();
-	C[3] = t.GetHighHalf();
-}
-
-inline void Portable::Multiply2Bottom(word *C, const word *A, const word *B)
-{
-	DWord t = DWord::Multiply(A[0], B[0]);
-	C[0] = t.GetLowHalf();
-	C[1] = t.GetHighHalf() + A[0]*B[1] + A[1]*B[0];
-}
-
-word Portable::Multiply2Add(word *C, const word *A, const word *B)
-{
-	word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
-	unsigned int ai = A[1] < A[0];
-	unsigned int bi = B[0] < B[1];
-	unsigned int di = ai & bi;
-	DWord d = DWord::Multiply(D[di], D[di+2]);
-	D[1] = D[3] = 0;
-	unsigned int si = ai + !bi;
-	word s = D[si];
-
-	DWord A0B0 = DWord::Multiply(A[0], B[0]);
-	DWord t = A0B0 + C[0];
-	C[0] = t.GetLowHalf();
-
-	DWord A1B1 = DWord::Multiply(A[1], B[1]);
-	t = (DWord) t.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() + A1B1.GetLowHalf() + C[1];
-	C[1] = t.GetLowHalf();
-
-	t = (DWord) t.GetHighHalf() + A1B1.GetLowHalf() + A0B0.GetHighHalf() + d.GetHighHalf() + A1B1.GetHighHalf() - s + C[2];
-	C[2] = t.GetLowHalf();
-
-	t = (DWord) t.GetHighHalf() + A1B1.GetHighHalf() + C[3];
-	C[3] = t.GetLowHalf();
-	return t.GetHighHalf();
-}
-
-#define MulAcc(x, y)								\
-	p = DWord::MultiplyAndAdd(A[x], B[y], c);		\
-	c = p.GetLowHalf();								\
-	p = (DWord) d + p.GetHighHalf();					\
-	d = p.GetLowHalf();								\
-	e += p.GetHighHalf();
-
-#define SaveMulAcc(s, x, y) 						\
-	R[s] = c;										\
-	p = DWord::MultiplyAndAdd(A[x], B[y], d);				\
-	c = p.GetLowHalf();								\
-	p = (DWord) e + p.GetHighHalf();					\
-	d = p.GetLowHalf();								\
-	e = p.GetHighHalf();
-
-#define SquAcc(x, y)								\
-	q = DWord::Multiply(A[x], A[y]);	\
-	p = q + c; 					\
-	c = p.GetLowHalf();								\
-	p = (DWord) d + p.GetHighHalf();					\
-	d = p.GetLowHalf();								\
-	e += p.GetHighHalf();			\
-	p = q + c; 					\
-	c = p.GetLowHalf();								\
-	p = (DWord) d + p.GetHighHalf();					\
-	d = p.GetLowHalf();								\
-	e += p.GetHighHalf();
-
-#define SaveSquAcc(s, x, y) 						\
-	R[s] = c;										\
-	q = DWord::Multiply(A[x], A[y]);	\
-	p = q + d; 					\
-	c = p.GetLowHalf();								\
-	p = (DWord) e + p.GetHighHalf();					\
-	d = p.GetLowHalf();								\
-	e = p.GetHighHalf();			\
-	p = q + c; 					\
-	c = p.GetLowHalf();								\
-	p = (DWord) d + p.GetHighHalf();					\
-	d = p.GetLowHalf();								\
-	e += p.GetHighHalf();
-
-void Portable::Multiply4(word *R, const word *A, const word *B)
-{
-	DWord p;
-	word c, d, e;
-
-	p = DWord::Multiply(A[0], B[0]);
-	R[0] = p.GetLowHalf();
-	c = p.GetHighHalf();
-	d = e = 0;
-
-	MulAcc(0, 1);
-	MulAcc(1, 0);
-
-	SaveMulAcc(1, 2, 0);
-	MulAcc(1, 1);
-	MulAcc(0, 2);
-
-	SaveMulAcc(2, 0, 3);
-	MulAcc(1, 2);
-	MulAcc(2, 1);
-	MulAcc(3, 0);
-
-	SaveMulAcc(3, 3, 1);
-	MulAcc(2, 2);
-	MulAcc(1, 3);
-
-	SaveMulAcc(4, 2, 3);
-	MulAcc(3, 2);
-
-	R[5] = c;
-	p = DWord::MultiplyAndAdd(A[3], B[3], d);
-	R[6] = p.GetLowHalf();
-	R[7] = e + p.GetHighHalf();
-}
-
-void Portable::Square2(word *R, const word *A)
-{
-	DWord p, q;
-	word c, d, e;
-
-	p = DWord::Multiply(A[0], A[0]);
-	R[0] = p.GetLowHalf();
-	c = p.GetHighHalf();
-	d = e = 0;
-
-	SquAcc(0, 1);
-
-	R[1] = c;
-	p = DWord::MultiplyAndAdd(A[1], A[1], d);
-	R[2] = p.GetLowHalf();
-	R[3] = e + p.GetHighHalf();
-}
-
-void Portable::Square4(word *R, const word *A)
-{
-#ifdef _MSC_VER
-	// VC60 workaround: MSVC 6.0 has an optimization bug that makes
-	// (dword)A*B where either A or B has been cast to a dword before
-	// very expensive. Revisit this function when this
-	// bug is fixed.
-	Multiply4(R, A, A);
-#else
-	const word *B = A;
-	DWord p, q;
-	word c, d, e;
+	// now: eax = A, ebx = B, edx = C, ecx = N
+	AS2(	lea		eax, [eax+4*ecx])	// point all three arrays one past their last word
+	AS2(	lea		ebx, [ebx+4*ecx])
+	AS2(	lea		edx, [edx+4*ecx])
+
+	AS1(	neg		ecx)				// ecx is negative index
+	AS2(	pxor    mm2, mm2)			// mm2 = running carry, starts at 0 (pxor leaves the flags set by neg intact)
+	ASJ(	jz,		2, f)				// N == 0: nothing to add
+	AS2(	test	ecx, 2)				// bit 1 of -N set means N % 4 == 2 (N assumed even); the carry lives in mm2, not in the flags
+	ASJ(	jz,		0, f)
+	AS2(	sub		ecx, 2)				// bias the index so label 1 starts at word 0
+	ASJ(	jmp,	1, f)
+
+	ASL(0)
+	AS2(	movd     mm0, DWORD PTR [eax+4*ecx])
+	AS2(	movd     mm1, DWORD PTR [ebx+4*ecx])
+	AS2(	paddq    mm0, mm1)
+	AS2(	paddq	 mm2, mm0)			// mm2 = A word + B word + carry-in, as a 64-bit sum
+	AS2(	movd	 DWORD PTR [edx+4*ecx], mm2)
+	AS2(	psrlq    mm2, 32)			// keep only the carry (bit 32) for the next word
+
+	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+4])
+	AS2(	movd     mm1, DWORD PTR [ebx+4*ecx+4])
+	AS2(	paddq    mm0, mm1)
+	AS2(	paddq	 mm2, mm0)
+	AS2(	movd	 DWORD PTR [edx+4*ecx+4], mm2)
+	AS2(	psrlq    mm2, 32)
+
+	ASL(1)
+	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+8])
+	AS2(	movd     mm1, DWORD PTR [ebx+4*ecx+8])
+	AS2(	paddq    mm0, mm1)
+	AS2(	paddq	 mm2, mm0)
+	AS2(	movd	 DWORD PTR [edx+4*ecx+8], mm2)
+	AS2(	psrlq    mm2, 32)
+
+	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+12])
+	AS2(	movd     mm1, DWORD PTR [ebx+4*ecx+12])
+	AS2(	paddq    mm0, mm1)
+	AS2(	paddq	 mm2, mm0)
+	AS2(	movd	 DWORD PTR [edx+4*ecx+12], mm2)
+	AS2(	psrlq    mm2, 32)
+
+	AS2(	add		ecx, 4)
+	ASJ(	jnz,	0, b)				// loop until the negative index reaches zero
+
+	ASL(2)
+	AS2(	movd	eax, mm2)			// return the final carry
+	AS1(	emms)						// empty the MMX state before returning
 
-	p = DWord::Multiply(A[0], A[0]);
-	R[0] = p.GetLowHalf();
-	c = p.GetHighHalf();
-	d = e = 0;
+	AddEpilogue
+}
 
-	SquAcc(0, 1);
+CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B)	// C = A - B over N 32-bit words using 64-bit subtracts in MMX registers; returns the borrow out
+{
+	AddPrologue
 
-	SaveSquAcc(1, 2, 0);
-	MulAcc(1, 1);
+	// now: eax = A, ebx = B, edx = C, ecx = N
+	AS2(	lea		eax, [eax+4*ecx])	// point all three arrays one past their last word
+	AS2(	lea		ebx, [ebx+4*ecx])
+	AS2(	lea		edx, [edx+4*ecx])
+
+	AS1(	neg		ecx)				// ecx is negative index
+	AS2(	pxor    mm2, mm2)			// mm2 = incoming borrow, starts at 0 (pxor leaves the flags set by neg intact)
+	ASJ(	jz,		2, f)				// N == 0: nothing to subtract
+	AS2(	test	ecx, 2)				// bit 1 of -N set means N % 4 == 2 (N assumed even); the borrow lives in mm0/mm2, not in the flags
+	ASJ(	jz,		0, f)
+	AS2(	sub		ecx, 2)				// bias the index so label 1 starts at word 0
+	ASJ(	jmp,	1, f)
+
+	ASL(0)
+	AS2(	movd     mm0, DWORD PTR [eax+4*ecx])
+	AS2(	movd     mm1, DWORD PTR [ebx+4*ecx])
+	AS2(	psubq    mm0, mm1)
+	AS2(	psubq	 mm0, mm2)			// mm0 = A word - B word - borrow-in, as a 64-bit difference
+	AS2(	movd	 DWORD PTR [edx+4*ecx], mm0)
+	AS2(	psrlq    mm0, 63)			// the top bit of the difference becomes the borrow for the next word
+
+	AS2(	movd     mm2, DWORD PTR [eax+4*ecx+4])	// mm0 and mm2 alternate as difference and borrow registers
+	AS2(	movd     mm1, DWORD PTR [ebx+4*ecx+4])
+	AS2(	psubq    mm2, mm1)
+	AS2(	psubq	 mm2, mm0)
+	AS2(	movd	 DWORD PTR [edx+4*ecx+4], mm2)
+	AS2(	psrlq    mm2, 63)
+
+	ASL(1)
+	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+8])
+	AS2(	movd     mm1, DWORD PTR [ebx+4*ecx+8])
+	AS2(	psubq    mm0, mm1)
+	AS2(	psubq	 mm0, mm2)
+	AS2(	movd	 DWORD PTR [edx+4*ecx+8], mm0)
+	AS2(	psrlq    mm0, 63)
+
+	AS2(	movd     mm2, DWORD PTR [eax+4*ecx+12])
+	AS2(	movd     mm1, DWORD PTR [ebx+4*ecx+12])
+	AS2(	psubq    mm2, mm1)
+	AS2(	psubq	 mm2, mm0)
+	AS2(	movd	 DWORD PTR [edx+4*ecx+12], mm2)
+	AS2(	psrlq    mm2, 63)
+
+	AS2(	add		ecx, 4)
+	ASJ(	jnz,	0, b)				// loop until the negative index reaches zero
+
+	ASL(2)
+	AS2(	movd	eax, mm2)			// return the final borrow
+	AS1(	emms)						// empty the MMX state before returning
 
-	SaveSquAcc(2, 0, 3);
-	SquAcc(1, 2);
+	AddEpilogue
+}
+#else
+int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)	// portable C = A + B over N words; returns the carry out (0 or 1)
+{
+	assert (N%2 == 0);	// the loop below consumes two words per iteration
 
-	SaveSquAcc(3, 3, 1);
-	MulAcc(2, 2);
+	Declare2Words(u); AssignWord(u, 0);	// carry-in must start at 0: Declare2Words alone leaves u uninitialized and AddWithCarry reads GetCarry(u)
+	for (size_t i=0; i<N; i+=2)
+	{
+		AddWithCarry(u, A[i], B[i]);	// u = A[i] + B[i] + previous carry
+		C[i] = LowWord(u);
+		AddWithCarry(u, A[i+1], B[i+1]);
+		C[i+1] = LowWord(u);
+	}
+	return int(GetCarry(u));	// 0 when N == 0, thanks to the initialization above
+}
 
-	SaveSquAcc(4, 2, 3);
+int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)	// portable C = A - B over N words; returns the borrow out (0 or 1)
+{
+	assert (N%2 == 0);	// the loop below consumes two words per iteration
 
-	R[5] = c;
-	p = DWord::MultiplyAndAdd(A[3], A[3], d);
-	R[6] = p.GetLowHalf();
-	R[7] = e + p.GetHighHalf();
+	Declare2Words(u); AssignWord(u, 0);	// borrow-in must start at 0: Declare2Words alone leaves u uninitialized and SubtractWithBorrow reads GetBorrow(u)
+	for (size_t i=0; i<N; i+=2)
+	{
+		SubtractWithBorrow(u, A[i], B[i]);	// u = A[i] - B[i] - previous borrow
+		C[i] = LowWord(u);
+		SubtractWithBorrow(u, A[i+1], B[i+1]);
+		C[i+1] = LowWord(u);
+	}
+	return int(GetBorrow(u));	// 0 when N == 0, thanks to the initialization above
+}
 #endif
+
+static word LinearMultiply(word *C, const word *A, word B, size_t N)	// C[0..N) = low words of A[0..N) * B; returns the final carry word
+{
+	word carry=0;
+	for(size_t i=0; i<N; i++)	// index uses N's own type, matching the loops in Baseline_Add/Baseline_Sub
+	{
+		Declare2Words(p);
+		MultiplyWords(p, A[i], B);
+		Acc2WordsBy1(p, carry);	// p = A[i]*B + carry from the previous word
+		C[i] = LowWord(p);
+		carry = HighWord(p);
+	}
+	return carry;
 }
 
-void Portable::Multiply8(word *R, const word *A, const word *B)
-{
-	DWord p;
-	word c, d, e;
-
-	p = DWord::Multiply(A[0], B[0]);
-	R[0] = p.GetLowHalf();
-	c = p.GetHighHalf();
-	d = e = 0;
-
-	MulAcc(0, 1);
-	MulAcc(1, 0);
-
-	SaveMulAcc(1, 2, 0);
-	MulAcc(1, 1);
-	MulAcc(0, 2);
+#define Mul_2 /* 2-word schedule: the row starting Mul_SaveAcc(k, ...) lists the (i, j) pairs with i + j == k + 1 */ \
+	Mul_Begin(2) \
+	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
+	Mul_End(2)
+
+#define Mul_4 /* 4-word schedule: the row starting Mul_SaveAcc(k, ...) lists every (i, j) in 0..3 with i + j == k + 1 */ \
+	Mul_Begin(4) \
+	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
+	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0)  \
+	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
+	Mul_SaveAcc(3, 1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1)  \
+	Mul_SaveAcc(4, 2, 3) Mul_Acc(3, 2) \
+	Mul_End(4)
+
+#define Mul_8 /* 8-word schedule: the row starting Mul_SaveAcc(k, ...) lists every (i, j) in 0..7 with i + j == k + 1 */ \
+	Mul_Begin(8) \
+	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
+	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0)  \
+	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
+	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
+	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
+	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
+	Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
+	Mul_SaveAcc(7, 1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
+	Mul_SaveAcc(8, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
+	Mul_SaveAcc(9, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
+	Mul_SaveAcc(10, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
+	Mul_SaveAcc(11, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
+	Mul_SaveAcc(12, 6, 7) Mul_Acc(7, 6) \
+	Mul_End(8)
+
+#define Mul_16 /* 16-word schedule: the row starting Mul_SaveAcc(k, ...) lists every (i, j) in 0..15 with i + j == k + 1 */ \
+	Mul_Begin(16) \
+	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
+	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
+	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
+	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
+	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
+	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
+	Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
+	Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
+	Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
+	Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
+	Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
+	Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
+	Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
+	Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
+	Mul_SaveAcc(14, 0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
+	Mul_SaveAcc(15, 1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
+	Mul_SaveAcc(16, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
+	Mul_SaveAcc(17, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
+	Mul_SaveAcc(18, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
+	Mul_SaveAcc(19, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
+	Mul_SaveAcc(20, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
+	Mul_SaveAcc(21, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
+	Mul_SaveAcc(22, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
+	Mul_SaveAcc(23, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
+	Mul_SaveAcc(24, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
+	Mul_SaveAcc(25, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
+	Mul_SaveAcc(26, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
+	Mul_SaveAcc(27, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
+	Mul_SaveAcc(28, 14, 15) Mul_Acc(15, 14) \
+	Mul_End(16)
+
+#define Squ_2 /* 2-word squaring schedule: no middle rows, only Squ_Begin and Squ_End */ \
+	Squ_Begin(2) \
+	Squ_End(2)
+
+#define Squ_4 /* row k lists the pairs i < j with i + j == k + 1; it ends in Squ_Diag(m) when k + 1 == 2*m, otherwise in Squ_NonDiag */ \
+	Squ_Begin(4) \
+	Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
+	Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
+	Squ_SaveAcc(3, 1, 3) Squ_Diag(2) \
+	Squ_SaveAcc(4, 2, 3) Squ_NonDiag \
+	Squ_End(4)
+
+#define Squ_8 /* row k lists the pairs i < j with i + j == k + 1; it ends in Squ_Diag(m) when k + 1 == 2*m, otherwise in Squ_NonDiag */ \
+	Squ_Begin(8) \
+	Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
+	Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
+	Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
+	Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
+	Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
+	Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
+	Squ_SaveAcc(7, 1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
+	Squ_SaveAcc(8, 2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5)  Squ_NonDiag \
+	Squ_SaveAcc(9, 3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
+	Squ_SaveAcc(10, 4, 7) Squ_Acc(5, 6) Squ_NonDiag \
+	Squ_SaveAcc(11, 5, 7) Squ_Diag(6) \
+	Squ_SaveAcc(12, 6, 7) Squ_NonDiag \
+	Squ_End(8)
+
+#define Squ_16 /* row k lists the pairs i < j with i + j == k + 1; it ends in Squ_Diag(m) when k + 1 == 2*m, otherwise in Squ_NonDiag */ \
+	Squ_Begin(16) \
+	Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
+	Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
+	Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
+	Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
+	Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
+	Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
+	Squ_SaveAcc(7, 0, 8) Squ_Acc(1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
+	Squ_SaveAcc(8, 0, 9) Squ_Acc(1, 8) Squ_Acc(2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \
+	Squ_SaveAcc(9, 0, 10) Squ_Acc(1, 9) Squ_Acc(2, 8) Squ_Acc(3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
+	Squ_SaveAcc(10, 0, 11) Squ_Acc(1, 10) Squ_Acc(2, 9) Squ_Acc(3, 8) Squ_Acc(4, 7) Squ_Acc(5, 6) Squ_NonDiag \
+	Squ_SaveAcc(11, 0, 12) Squ_Acc(1, 11) Squ_Acc(2, 10) Squ_Acc(3, 9) Squ_Acc(4, 8) Squ_Acc(5, 7) Squ_Diag(6) \
+	Squ_SaveAcc(12, 0, 13) Squ_Acc(1, 12) Squ_Acc(2, 11) Squ_Acc(3, 10) Squ_Acc(4, 9) Squ_Acc(5, 8) Squ_Acc(6, 7) Squ_NonDiag \
+	Squ_SaveAcc(13, 0, 14) Squ_Acc(1, 13) Squ_Acc(2, 12) Squ_Acc(3, 11) Squ_Acc(4, 10) Squ_Acc(5, 9) Squ_Acc(6, 8) Squ_Diag(7) \
+	Squ_SaveAcc(14, 0, 15) Squ_Acc(1, 14) Squ_Acc(2, 13) Squ_Acc(3, 12) Squ_Acc(4, 11) Squ_Acc(5, 10) Squ_Acc(6, 9) Squ_Acc(7, 8) Squ_NonDiag \
+	Squ_SaveAcc(15, 1, 15) Squ_Acc(2, 14) Squ_Acc(3, 13) Squ_Acc(4, 12) Squ_Acc(5, 11) Squ_Acc(6, 10) Squ_Acc(7, 9) Squ_Diag(8) \
+	Squ_SaveAcc(16, 2, 15) Squ_Acc(3, 14) Squ_Acc(4, 13) Squ_Acc(5, 12) Squ_Acc(6, 11) Squ_Acc(7, 10) Squ_Acc(8, 9) Squ_NonDiag \
+	Squ_SaveAcc(17, 3, 15) Squ_Acc(4, 14) Squ_Acc(5, 13) Squ_Acc(6, 12) Squ_Acc(7, 11) Squ_Acc(8, 10) Squ_Diag(9) \
+	Squ_SaveAcc(18, 4, 15) Squ_Acc(5, 14) Squ_Acc(6, 13) Squ_Acc(7, 12) Squ_Acc(8, 11) Squ_Acc(9, 10) Squ_NonDiag \
+	Squ_SaveAcc(19, 5, 15) Squ_Acc(6, 14) Squ_Acc(7, 13) Squ_Acc(8, 12) Squ_Acc(9, 11) Squ_Diag(10) \
+	Squ_SaveAcc(20, 6, 15) Squ_Acc(7, 14) Squ_Acc(8, 13) Squ_Acc(9, 12) Squ_Acc(10, 11) Squ_NonDiag \
+	Squ_SaveAcc(21, 7, 15) Squ_Acc(8, 14) Squ_Acc(9, 13) Squ_Acc(10, 12) Squ_Diag(11) \
+	Squ_SaveAcc(22, 8, 15) Squ_Acc(9, 14) Squ_Acc(10, 13) Squ_Acc(11, 12) Squ_NonDiag \
+	Squ_SaveAcc(23, 9, 15) Squ_Acc(10, 14) Squ_Acc(11, 13) Squ_Diag(12) \
+	Squ_SaveAcc(24, 10, 15) Squ_Acc(11, 14) Squ_Acc(12, 13) Squ_NonDiag \
+	Squ_SaveAcc(25, 11, 15) Squ_Acc(12, 14) Squ_Diag(13) \
+	Squ_SaveAcc(26, 12, 15) Squ_Acc(13, 14) Squ_NonDiag \
+	Squ_SaveAcc(27, 13, 15) Squ_Diag(14) \
+	Squ_SaveAcc(28, 14, 15) Squ_NonDiag \
+	Squ_End(16)
+
+#define Bot_2 /* 2-word schedule with a single row (i + j == 1), built from the Bot_ variants */ \
+	Mul_Begin(2) \
+	Bot_SaveAcc(0, 0, 1) Bot_Acc(1, 0) \
+	Bot_End(2)
+
+#define Bot_4 /* Mul_ rows cover i + j <= 2; the last row (i + j == 3) uses the Bot_ variants; no pair with i + j > 3 appears */ \
+	Mul_Begin(4) \
+	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
+	Mul_SaveAcc(1, 2, 0) Mul_Acc(1, 1) Mul_Acc(0, 2)  \
+	Bot_SaveAcc(2, 0, 3) Bot_Acc(1, 2) Bot_Acc(2, 1) Bot_Acc(3, 0)  \
+	Bot_End(4)
+
+#define Bot_8 /* Mul_ rows cover i + j <= 6; the last row (i + j == 7) uses the Bot_ variants; no pair with i + j > 7 appears */ \
+	Mul_Begin(8) \
+	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
+	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0)  \
+	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
+	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
+	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
+	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
+	Bot_SaveAcc(6, 0, 7) Bot_Acc(1, 6) Bot_Acc(2, 5) Bot_Acc(3, 4) Bot_Acc(4, 3) Bot_Acc(5, 2) Bot_Acc(6, 1) Bot_Acc(7, 0) \
+	Bot_End(8)
+
+#define Bot_16 /* Mul_ rows cover i + j <= 14; the last row (i + j == 15) uses the Bot_ variants; no pair with i + j > 15 appears */ \
+	Mul_Begin(16) \
+	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
+	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
+	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
+	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
+	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
+	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
+	Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
+	Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
+	Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
+	Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
+	Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
+	Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
+	Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
+	Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
+	Bot_SaveAcc(14, 0, 15) Bot_Acc(1, 14) Bot_Acc(2, 13) Bot_Acc(3, 12) Bot_Acc(4, 11) Bot_Acc(5, 10) Bot_Acc(6, 9) Bot_Acc(7, 8) Bot_Acc(8, 7) Bot_Acc(9, 6) Bot_Acc(10, 5) Bot_Acc(11, 4) Bot_Acc(12, 3) Bot_Acc(13, 2) Bot_Acc(14, 1) Bot_Acc(15, 0) \
+	Bot_End(16)
+// Mul_Begin(n): declares the two-word temporaries p (latest product), c (sum of low product words for the current column) and d (sum of high product words), then seeds them from A[0]*B[0]. The parameter n is not used by this portable version.
+#define Mul_Begin(n)				\
+	Declare2Words(p)				\
+	Declare2Words(c)				\
+	Declare2Words(d)				\
+	MultiplyWords(p, A[0], B[0])	\
+	AssignWord(c, LowWord(p))		\
+	AssignWord(d, HighWord(p))
+// Mul_Acc(i, j): adds the product A[i]*B[j] to the current column: its low word is accumulated into c and its high word into d. Expects p, c and d to have been declared by Mul_Begin.
+#define Mul_Acc(i, j)				\
+	MultiplyWords(p, A[i], B[j])	\
+	Acc2WordsBy1(c, LowWord(p))		\
+	Acc2WordsBy1(d, HighWord(p))
+// Mul_SaveAcc(k, i, j): closes the current column and opens the next. Stores LowWord(c) to R[k], combines d with the carry HighWord(c) into c (Add2WordsBy1 is defined elsewhere; presumably c = d + HighWord(c)), then starts the new column with A[i]*B[j]: high word replaces d, low word is added to c.
+#define Mul_SaveAcc(k, i, j) 		\
+	R[k] = LowWord(c);				\
+	Add2WordsBy1(c, d, HighWord(c))	\
+	MultiplyWords(p, A[i], B[j])	\
+	AssignWord(d, HighWord(p))		\
+	Acc2WordsBy1(c, LowWord(p))
+// Mul_End(n): writes the last three result words of a full n x n product. R[2n-3] comes from c; the carry HighWord(c) and the final product A[n-1]*B[n-1] are added into d, whose two words become R[2n-2] and R[2n-1]. n is pasted unparenthesized, so callers pass a plain literal.
+#define Mul_End(n)					\
+	R[2*n-3] = LowWord(c);			\
+	Acc2WordsBy1(d, HighWord(c))	\
+	MultiplyWords(p, A[n-1], B[n-1])\
+	Acc2WordsBy2(d, p)				\
+	R[2*n-2] = LowWord(d);			\
+	R[2*n-1] = HighWord(d);
+// Bot_SaveAcc(k, i, j): stores R[k], then opens the final column of a "bottom half" product in a single word e = LowWord(d) + HighWord(c) + A[i]*B[j]. Only the low word of that column is kept, so plain word arithmetic suffices. Declares e, so it can appear only once per scope.
+#define Bot_SaveAcc(k, i, j)		\
+	R[k] = LowWord(c);				\
+	word e = LowWord(d) + HighWord(c);	\
+	e += A[i] * B[j];
+// Bot_Acc(i, j): adds the single-word product A[i]*B[j] to e, the final-column sum declared by Bot_SaveAcc.
+#define Bot_Acc(i, j)	\
+	e += A[i] * B[j];
+// Bot_End(n): stores the final-column sum e as the last result word R[n-1].
+#define Bot_End(n)		\
+	R[n-1] = e;
 
-	SaveMulAcc(2, 0, 3);
-	MulAcc(1, 2);
-	MulAcc(2, 1);
-	MulAcc(3, 0);
-
-	SaveMulAcc(3, 0, 4);
-	MulAcc(1, 3);
-	MulAcc(2, 2);
-	MulAcc(3, 1);
-	MulAcc(4, 0);
-
-	SaveMulAcc(4, 0, 5);
-	MulAcc(1, 4);
-	MulAcc(2, 3);
-	MulAcc(3, 2);
-	MulAcc(4, 1);
-	MulAcc(5, 0);
-
-	SaveMulAcc(5, 0, 6);
-	MulAcc(1, 5);
-	MulAcc(2, 4);
-	MulAcc(3, 3);
-	MulAcc(4, 2);
-	MulAcc(5, 1);
-	MulAcc(6, 0);
-
-	SaveMulAcc(6, 0, 7);
-	MulAcc(1, 6);
-	MulAcc(2, 5);
-	MulAcc(3, 4);
-	MulAcc(4, 3);
-	MulAcc(5, 2);
-	MulAcc(6, 1);
-	MulAcc(7, 0);
-
-	SaveMulAcc(7, 1, 7);
-	MulAcc(2, 6);
-	MulAcc(3, 5);
-	MulAcc(4, 4);
-	MulAcc(5, 3);
-	MulAcc(6, 2);
-	MulAcc(7, 1);
-
-	SaveMulAcc(8, 2, 7);
-	MulAcc(3, 6);
-	MulAcc(4, 5);
-	MulAcc(5, 4);
-	MulAcc(6, 3);
-	MulAcc(7, 2);
-
-	SaveMulAcc(9, 3, 7);
-	MulAcc(4, 6);
-	MulAcc(5, 5);
-	MulAcc(6, 4);
-	MulAcc(7, 3);
+/*
+// this is slower on MSVC 2005 Win32
+#define Mul_Begin(n)				\
+	Declare2Words(p)				\
+	word c;	\
+	Declare2Words(d)				\
+	MultiplyWords(p, A[0], B[0])	\
+	c = LowWord(p);		\
+	AssignWord(d, HighWord(p))
+
+#define Mul_Acc(i, j)				\
+	MultiplyWords(p, A[i], B[j])	\
+	Acc2WordsBy1(p, c)		\
+	c = LowWord(p);	\
+	Acc2WordsBy1(d, HighWord(p))
+
+#define Mul_SaveAcc(k, i, j) 		\
+	R[k] = c;				\
+	MultiplyWords(p, A[i], B[j])	\
+	Acc2WordsBy1(p, LowWord(d))		\
+	c = LowWord(p);	\
+	AssignWord(d, HighWord(d))	\
+	Acc2WordsBy1(d, HighWord(p))
+
+#define Mul_End(n)					\
+	R[2*n-3] = c;			\
+	MultiplyWords(p, A[n-1], B[n-1])\
+	Acc2WordsBy2(d, p)				\
+	R[2*n-2] = LowWord(d);			\
+	R[2*n-1] = HighWord(d);
+
+#define Bot_SaveAcc(k, i, j)		\
+	R[k] = c;				\
+	c = LowWord(d);	\
+	c += A[i] * B[j];
+
+#define Bot_Acc(i, j)	\
+	c += A[i] * B[j];
+
+#define Bot_End(n)		\
+	R[n-1] = c;
+*/
 
-	SaveMulAcc(10, 4, 7);
-	MulAcc(5, 6);
-	MulAcc(6, 5);
-	MulAcc(7, 4);
-
-	SaveMulAcc(11, 5, 7);
-	MulAcc(6, 6);
-	MulAcc(7, 5);
-
-	SaveMulAcc(12, 6, 7);
-	MulAcc(7, 6);
-
-	R[13] = c;
-	p = DWord::MultiplyAndAdd(A[7], B[7], d);
-	R[14] = p.GetLowHalf();
-	R[15] = e + p.GetHighHalf();
-}
-
-void Portable::Multiply4Bottom(word *R, const word *A, const word *B)
-{
-	DWord p;
-	word c, d, e;
-
-	p = DWord::Multiply(A[0], B[0]);
-	R[0] = p.GetLowHalf();
-	c = p.GetHighHalf();
-	d = e = 0;
-
-	MulAcc(0, 1);
-	MulAcc(1, 0);
-
-	SaveMulAcc(1, 2, 0);
-	MulAcc(1, 1);
-	MulAcc(0, 2);
-
-	R[2] = c;
-	R[3] = d + A[0] * B[3] + A[1] * B[2] + A[2] * B[1] + A[3] * B[0];
-}
-
-void Portable::Multiply8Bottom(word *R, const word *A, const word *B)
-{
-	DWord p;
-	word c, d, e;
-
-	p = DWord::Multiply(A[0], B[0]);
-	R[0] = p.GetLowHalf();
-	c = p.GetHighHalf();
-	d = e = 0;
-
-	MulAcc(0, 1);
-	MulAcc(1, 0);
-
-	SaveMulAcc(1, 2, 0);
-	MulAcc(1, 1);
-	MulAcc(0, 2);
-
-	SaveMulAcc(2, 0, 3);
-	MulAcc(1, 2);
-	MulAcc(2, 1);
-	MulAcc(3, 0);
-
-	SaveMulAcc(3, 0, 4);
-	MulAcc(1, 3);
-	MulAcc(2, 2);
-	MulAcc(3, 1);
-	MulAcc(4, 0);
-
-	SaveMulAcc(4, 0, 5);
-	MulAcc(1, 4);
-	MulAcc(2, 3);
-	MulAcc(3, 2);
-	MulAcc(4, 1);
-	MulAcc(5, 0);
-
-	SaveMulAcc(5, 0, 6);
-	MulAcc(1, 5);
-	MulAcc(2, 4);
-	MulAcc(3, 3);
-	MulAcc(4, 2);
-	MulAcc(5, 1);
-	MulAcc(6, 0);
+#define Squ_Begin(n)				/* Declares p, c, d and the carry e; stores the low word of A[0]*A[0] in R[0] and keeps its high word in e; then opens column 1 with A[0]*A[1] in c/d and doubles it via Squ_NonDiag. n is unused here. The last line ends in a backslash, so the following blank line belongs to the macro. */ \
+	Declare2Words(p)				\
+	Declare2Words(c)				\
+	Declare2Words(d)				\
+	Declare2Words(e)				\
+	MultiplyWords(p, A[0], A[0])	\
+	R[0] = LowWord(p);				\
+	AssignWord(e, HighWord(p))		\
+	MultiplyWords(p, A[0], A[1])	\
+	AssignWord(c, LowWord(p))		\
+	AssignWord(d, HighWord(p))		\
+	Squ_NonDiag						\
 
-	R[6] = c;
-	R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] +
-				A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0];
-}
+#define Squ_NonDiag				/* Doubles the column sums c and d: an off-diagonal product A[i]*A[j] with i != j occurs twice in a square. */ \
+	Double2Words(c)				\
+	Double2Words(d)				\
 
-#undef MulAcc
-#undef SaveMulAcc
-#undef SquAcc
-#undef SaveSquAcc
+#define Squ_SaveAcc(k, i, j) 		/* Closes a column: adds the carry e into c, stores LowWord(c) to R[k], forms the next carry in e from d and HighWord(c) (Add2WordsBy1 is defined elsewhere; presumably e = d + HighWord(c)), then restarts c/d with A[i]*A[j]. */ \
+	Acc2WordsBy2(c, e)				\
+	R[k] = LowWord(c);				\
+	Add2WordsBy1(e, d, HighWord(c))	\
+	MultiplyWords(p, A[i], A[j])	\
+	AssignWord(c, LowWord(p))		\
+	AssignWord(d, HighWord(p))		\
 
-#ifdef CRYPTOPP_X86ASM_AVAILABLE
+#define Squ_Acc(i, j)				/* Adds the off-diagonal product A[i]*A[j] to the column: low word into c, high word into d (not yet doubled). */ \
+	MultiplyWords(p, A[i], A[j])	\
+	Acc2WordsBy1(c, LowWord(p))		\
+	Acc2WordsBy1(d, HighWord(p))
 
-// ************** x86 feature detection ***************
+#define Squ_Diag(i)					/* Finishes a column that contains a diagonal term: doubles the accumulated off-diagonal sums first, then adds A[i]*A[i] once. */ \
+	Squ_NonDiag						\
+	MultiplyWords(p, A[i], A[i])	\
+	Acc2WordsBy1(c, LowWord(p))		\
+	Acc2WordsBy1(d, HighWord(p))	\
 
-static bool s_sse2Enabled = true;
+#define Squ_End(n)					/* Writes the last three words of the square: adds the carry e into c and stores R[2n-3]; then d += HighWord(c) + A[n-1]*A[n-1] supplies R[2n-2] and R[2n-1]. */ \
+	Acc2WordsBy2(c, e)				\
+	R[2*n-3] = LowWord(c);			\
+	Acc2WordsBy1(d, HighWord(c))	\
+	MultiplyWords(p, A[n-1], A[n-1])\
+	Acc2WordsBy2(d, p)				\
+	R[2*n-2] = LowWord(d);			\
+	R[2*n-1] = HighWord(d);
 
-static void CpuId(word32 input, word32 *output)
+void Baseline_Multiply2(word *R, const word *A, const word *B)	// Portable C multiply; the body is the Mul_2 macro (defined earlier in the file), which refers to the parameters by the names R, A and B.
 {
-#ifdef __GNUC__
-	__asm__
-	(
-		// save ebx in case -fPIC is being used
-		"push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx"
-		: "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d" (output[3])
-		: "a" (input)
-	);
-#else
-	__asm
-	{
-		mov eax, input
-		cpuid
-		mov edi, output
-		mov [edi], eax
-		mov [edi+4], ebx
-		mov [edi+8], ecx
-		mov [edi+12], edx
-	}
-#endif
+	Mul_2
 }
 
-#ifdef SSE2_INTRINSICS_AVAILABLE
-#ifndef _MSC_VER
-static jmp_buf s_env;
-static void SigIllHandler(int)
+void Baseline_Multiply4(word *R, const word *A, const word *B)	// Portable C multiply; the body is the Mul_4 macro (defined earlier in the file), which refers to the parameters by the names R, A and B.
 {
-	longjmp(s_env, 1);
+	Mul_4
 }
-#endif
 
-static bool HasSSE2()
+void Baseline_Multiply8(word *R, const word *A, const word *B)	// Portable C multiply; the body is the Mul_8 macro (defined earlier in the file), which refers to the parameters by the names R, A and B.
 {
-	if (!s_sse2Enabled)
-		return false;
-
-	word32 cpuid[4];
-	CpuId(1, cpuid);
-	if ((cpuid[3] & (1 << 26)) == 0)
-		return false;
-
-#ifdef _MSC_VER
-    __try
-	{
-        __asm xorpd xmm0, xmm0        // executing SSE2 instruction
-	}
-    __except (1)
-	{
-		return false;
-    }
-	return true;
-#else
-	typedef void (*SigHandler)(int);
+	Mul_8
+}
 
-	SigHandler oldHandler = signal(SIGILL, SigIllHandler);
-	if (oldHandler == SIG_ERR)
-		return false;
+void Baseline_Square2(word *R, const word *A)	// Portable C squaring; the body is the Squ_2 macro (defined earlier in the file), which refers to the parameters by the names R and A.
+{
+	Squ_2
+}
 
-	bool result = true;
-	if (setjmp(s_env))
-		result = false;
-	else
-		__asm __volatile ("xorps %xmm0, %xmm0");
+void Baseline_Square4(word *R, const word *A)	// Portable C squaring; the body is the Squ_4 macro (defined earlier in the file), which refers to the parameters by the names R and A.
+{
+	Squ_4
+}
 
-	signal(SIGILL, oldHandler);
-	return result;
-#endif
+void Baseline_Square8(word *R, const word *A)	// Portable C squaring; the body is the Squ_8 macro (defined earlier in the file), which refers to the parameters by the names R and A.
+{
+	Squ_8
 }
-#endif
 
-static bool IsP4()
+void Baseline_MultiplyBottom2(word *R, const word *A, const word *B)	// Portable C "bottom half" multiply: Bot_2 stores only R[0] and R[1], the low two words of A*B.
 {
-	word32 cpuid[4];
+	Bot_2
+}
 
-	CpuId(0, cpuid);
-	std::swap(cpuid[2], cpuid[3]);
-	if (memcmp(cpuid+1, "GenuineIntel", 12) != 0)
-		return false;
+void Baseline_MultiplyBottom4(word *R, const word *A, const word *B)	// Portable C "bottom half" multiply: Bot_4 stores only R[0..3], the low four words of A*B.
+{
+	Bot_4
+}
 
-	CpuId(1, cpuid);
-	return ((cpuid[0] >> 8) & 0xf) == 0xf;
+void Baseline_MultiplyBottom8(word *R, const word *A, const word *B)	// Portable C "bottom half" multiply: Bot_8 stores only R[0..7], the low eight words of A*B.
+{
+	Bot_8
 }
 
-// ************** Pentium/P4 optimizations ***************
+/*
+void Baseline_Multiply16(word *R, const word *A, const word *B)
+{
+	Mul_16
+}
 
-class PentiumOptimized : public Portable
+void Baseline_Square16(word *R, const word *A)
 {
-public:
-	static int Add(word *C, const word *A, const word *B, size_t N);
-	static int Subtract(word *C, const word *A, const word *B, size_t N);
-	static void Multiply4(word *C, const word *A, const word *B);
-	static void Multiply8(word *C, const word *A, const word *B);
-	static void Multiply8Bottom(word *C, const word *A, const word *B);
-};
+	Squ_16
+}
 
-class P4Optimized
+void Baseline_MultiplyBottom16(word *R, const word *A, const word *B)
 {
-public:
-	static int Add(word *C, const word *A, const word *B, size_t N);
-	static int Subtract(word *C, const word *A, const word *B, size_t N);
-#ifdef SSE2_INTRINSICS_AVAILABLE
-	static void Multiply4(word *C, const word *A, const word *B);
-	static void Multiply8(word *C, const word *A, const word *B);
-	static void Multiply8Bottom(word *C, const word *A, const word *B);
-#endif
-};
+	Bot_16
+}
+*/
 
-typedef int (* PAddSub)(word *C, const word *A, const word *B, size_t N);
-typedef void (* PMul)(word *C, const word *A, const word *B);
+// ********************************************************
+
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+// 16-byte-aligned mask with the low 16 bits set in each of four 32-bit lanes. NOTE(review): the SSE2 macros below load a mask with movdqa from [ebx]; presumably the prologue macros point ebx at this table - confirm.
+CRYPTOPP_ALIGN_DATA(16) static const word32 s_maskLow16[4] CRYPTOPP_SECTION_ALIGN16 = {0xffff,0xffff,0xffff,0xffff};
+// Drop the portable C definitions of the column macros so that the SSE2 assembly versions defined below can reuse the same names.
+#undef Mul_Begin
+#undef Mul_Acc
+#undef Squ_Acc
+#undef Squ_NonDiag
+#undef Squ_Diag
+#undef Squ_SaveAcc
+#undef Squ_Begin
+#undef Mul_SaveAcc
+#undef Bot_Acc
+#undef Bot_SaveAcc
+#undef Bot_End
+#undef Squ_End
+#undef Mul_End
+// SSE2_FinalSave(k): recombines the split accumulators one last time: xmm5 (sums of high 16-bit halves) is shifted left 16 bits and added to xmm4 (sums of low halves) as 64-bit lanes, and the low quadword of xmm4 is stored at [ecx+8*k].
+#define SSE2_FinalSave(k)			\
+	AS2(	psllq		xmm5, 16)	\
+	AS2(	paddq		xmm4, xmm5)	\
+	AS2(	movq		QWORD PTR [ecx+8*(k)], xmm4)
+// SSE2_SaveShift(k): finishes one 64-bit result column. Folds the two quadword halves of xmm6/xmm7 together, adds them (moved up one dword) into xmm4/xmm5, writes a dword at [ecx+8*k] and a quadword at [ecx+8*k+2], then moves the high quadwords of xmm4/xmm5 down and adds the bits above bit 48 of the stored sum into the new xmm4 as carry. Clobbers xmm0, xmm1, xmm6, xmm7.
+#define SSE2_SaveShift(k)			\
+	AS2(	movq		xmm0, xmm6)	\
+	AS2(	punpckhqdq	xmm6, xmm0)	\
+	AS2(	movq		xmm1, xmm7)	\
+	AS2(	punpckhqdq	xmm7, xmm1)	\
+	AS2(	paddd		xmm6, xmm0)	\
+	AS2(	pslldq		xmm6, 4)	\
+	AS2(	paddd		xmm7, xmm1)	\
+	AS2(	paddd		xmm4, xmm6)	\
+	AS2(	pslldq		xmm7, 4)	\
+	AS2(	movq		xmm6, xmm4)	\
+	AS2(	paddd		xmm5, xmm7)	\
+	AS2(	movq		xmm7, xmm5)	\
+	AS2(	movd		DWORD PTR [ecx+8*(k)], xmm4)	\
+	AS2(	psrlq		xmm6, 16)	\
+	AS2(	paddq		xmm6, xmm7)	\
+	AS2(	punpckhqdq	xmm4, xmm0)	\
+	AS2(	punpckhqdq	xmm5, xmm0)	\
+	AS2(	movq		QWORD PTR [ecx+8*(k)+2], xmm6)	\
+	AS2(	psrlq		xmm6, 3*16)	\
+	AS2(	paddd		xmm4, xmm6)	\
+
+#define Squ_SSE2_SaveShift(k)			/* Squaring variant of SSE2_SaveShift: stores column k at [ecx+8*k] the same way, but instead of keeping the carry-over in xmm4/xmm5 it spills it to the stack (high quadword of xmm5 to [esp+12], low quadword of xmm4 to [esp+4]) so Squ_Column0/1 can add it back after doubling the new column. */ \
+	AS2(	movq		xmm0, xmm6)	\
+	AS2(	punpckhqdq	xmm6, xmm0)	\
+	AS2(	movq		xmm1, xmm7)	\
+	AS2(	punpckhqdq	xmm7, xmm1)	\
+	AS2(	paddd		xmm6, xmm0)	\
+	AS2(	pslldq		xmm6, 4)	\
+	AS2(	paddd		xmm7, xmm1)	\
+	AS2(	paddd		xmm4, xmm6)	\
+	AS2(	pslldq		xmm7, 4)	\
+	AS2(	movhlps		xmm6, xmm4)	\
+	AS2(	movd		DWORD PTR [ecx+8*(k)], xmm4)	\
+	AS2(	paddd		xmm5, xmm7)	\
+	AS2(	movhps		QWORD PTR [esp+12], xmm5)\
+	AS2(	psrlq		xmm4, 16)	\
+	AS2(	paddq		xmm4, xmm5)	\
+	AS2(	movq		QWORD PTR [ecx+8*(k)+2], xmm4)	\
+	AS2(	psrlq		xmm4, 3*16)	\
+	AS2(	paddd		xmm4, xmm6)	\
+	AS2(	movq		QWORD PTR [esp+4], xmm4)\
+
+#define SSE2_FirstMultiply(i)				/* Opens a column by overwriting all four accumulators: pmuludq forms [edi-16*i]*[esi+16*i] and [edx-16*i]*[esi+16*i]; using the mask loaded from [ebx], each 32-bit lane of the first product is split into its low 16 bits (xmm4) and high 16 bits (xmm5), and of the second into xmm6 and xmm7. */ \
+	AS2(	movdqa		xmm7, [esi+(i)*16])\
+	AS2(	movdqa		xmm5, [edi-(i)*16])\
+	AS2(	pmuludq		xmm5, xmm7)		\
+	AS2(	movdqa		xmm4, [ebx])\
+	AS2(	movdqa		xmm6, xmm4)		\
+	AS2(	pand		xmm4, xmm5)		\
+	AS2(	psrld		xmm5, 16)		\
+	AS2(	pmuludq		xmm7, [edx-(i)*16])\
+	AS2(	pand		xmm6, xmm7)		\
+	AS2(	psrld		xmm7, 16)
+// Squ_Begin(n), SSE2 version: runs SquPrologue (defined elsewhere), saves esp in esi, aligns esp to 16 bytes, reserves 32*n+16 bytes and pushes the saved esp (restored by "pop esp" in SSE2_End). The loop at label 1 reads 8*n bytes from [eax+edx], 16 at a time, and writes two pshufd-permuted copies, each followed by a copy shifted right 32 bits, to scratch at edi and edi+16*n. It then sets edx = edi+16*n and opens column 0.
+#define Squ_Begin(n)							\
+	SquPrologue									\
+	AS2(	mov		esi, esp)\
+	AS2(	and		esp, 0xfffffff0)\
+	AS2(	lea		edi, [esp-32*n])\
+	AS2(	sub		esp, 32*n+16)\
+	AS1(	push	esi)\
+	AS2(	mov		esi, edi)					\
+	AS2(	xor		edx, edx)					\
+	ASL(1)										\
+	ASS(	pshufd	xmm0, [eax+edx], 3,1,2,0)	\
+	ASS(	pshufd	xmm1, [eax+edx], 2,0,3,1)	\
+	AS2(	movdqa	[edi+2*edx], xmm0)		\
+	AS2(	psrlq	xmm0, 32)					\
+	AS2(	movdqa	[edi+2*edx+16], xmm0)	\
+	AS2(	movdqa	[edi+16*n+2*edx], xmm1)		\
+	AS2(	psrlq	xmm1, 32)					\
+	AS2(	movdqa	[edi+16*n+2*edx+16], xmm1)	\
+	AS2(	add		edx, 16)					\
+	AS2(	cmp		edx, 8*(n))					\
+	ASJ(	jne,	1, b)						\
+	AS2(	lea		edx, [edi+16*n])\
+	SSE2_FirstMultiply(0)							\
+
+#define Squ_Acc(i)								/* Emits label LSqu<i> followed by one multiply-accumulate step: the products [edi-16*i]*[esi+16*i] and [edx-16*i]*[esi+16*i] are split into 16-bit halves with the mask at [ebx] and added into xmm4/xmm5 and xmm6/xmm7. Execution falls through to whatever follows, so Squ_Acc(k) ... Squ_Acc(2) followed by ret forms a chain that can be entered at any label with call. */ \
+	ASL(LSqu##i)								\
+	AS2(	movdqa		xmm1, [esi+(i)*16])	\
+	AS2(	movdqa		xmm0, [edi-(i)*16])	\
+	AS2(	movdqa		xmm2, [ebx])	\
+	AS2(	pmuludq		xmm0, xmm1)				\
+	AS2(	pmuludq		xmm1, [edx-(i)*16])	\
+	AS2(	movdqa		xmm3, xmm2)			\
+	AS2(	pand		xmm2, xmm0)			\
+	AS2(	psrld		xmm0, 16)			\
+	AS2(	paddd		xmm4, xmm2)			\
+	AS2(	paddd		xmm5, xmm0)			\
+	AS2(	pand		xmm3, xmm1)			\
+	AS2(	psrld		xmm1, 16)			\
+	AS2(	paddd		xmm6, xmm3)			\
+	AS2(	paddd		xmm7, xmm1)		\
+
+#define Squ_Acc1(i)		// Squ_Acc<N>(i) is selected by token pasting in Squ_Column0/1: N == 1 expands to nothing, every other N to "call LSqu<i>" (ASC is defined elsewhere), which enters the Squ_Acc label chain.
+#define Squ_Acc2(i)		ASC(call, LSqu##i)
+#define Squ_Acc3(i)		Squ_Acc2(i)
+#define Squ_Acc4(i)		Squ_Acc2(i)
+#define Squ_Acc5(i)		Squ_Acc2(i)
+#define Squ_Acc6(i)		Squ_Acc2(i)
+#define Squ_Acc7(i)		Squ_Acc2(i)
+#define Squ_Acc8(i)		Squ_Acc2(i)
+// SSE2_End(E, n): common tail of the SSE2 routines. Saves column 2n-3, performs the last multiply-accumulate ([edi]*[esi+16] added into xmm4/xmm5, [edx]*[esi+16] restarting xmm6/xmm7), saves column 2n-2, stores the final quadword at column 2n-1, restores the original stack pointer with "pop esp" and expands the epilogue macro E.
+#define SSE2_End(E, n)					\
+	SSE2_SaveShift(2*(n)-3)			\
+	AS2(	movdqa		xmm7, [esi+16])	\
+	AS2(	movdqa		xmm0, [edi])	\
+	AS2(	pmuludq		xmm0, xmm7)				\
+	AS2(	movdqa		xmm2, [ebx])		\
+	AS2(	pmuludq		xmm7, [edx])	\
+	AS2(	movdqa		xmm6, xmm2)				\
+	AS2(	pand		xmm2, xmm0)				\
+	AS2(	psrld		xmm0, 16)				\
+	AS2(	paddd		xmm4, xmm2)				\
+	AS2(	paddd		xmm5, xmm0)				\
+	AS2(	pand		xmm6, xmm7)				\
+	AS2(	psrld		xmm7, 16)	\
+	SSE2_SaveShift(2*(n)-2)			\
+	SSE2_FinalSave(2*(n)-1)			\
+	AS1(	pop		esp)\
+	E
+// The square, multiply and top-half routines share SSE2_End and differ only in the epilogue macro expanded at the end (SquEpilogue, MulEpilogue, TopEpilogue; all defined elsewhere).
+#define Squ_End(n)		SSE2_End(SquEpilogue, n)
+#define Mul_End(n)		SSE2_End(MulEpilogue, n)
+#define Top_End(n)		SSE2_End(TopEpilogue, n)
+// Squ_Column1(k, i): squaring column step that advances esi by 16. Saves column k, opens the next column with SSE2_FirstMultiply(1), enters the LSqu<i> chain for further cross products (nothing when i is 1) and doubles xmm4/xmm5. It then adds one more cross product from [esi] into xmm6/xmm7 and doubles those, adds the undoubled square term (pmuludq xmm3, xmm3) to xmm4/xmm5, and finally adds back the carry spilled at [esp+4] and [esp+12].
+#define Squ_Column1(k, i)	\
+	Squ_SSE2_SaveShift(k)					\
+	AS2(	add			esi, 16)	\
+	SSE2_FirstMultiply(1)\
+	Squ_Acc##i(i)	\
+	AS2(	paddd		xmm4, xmm4)		\
+	AS2(	paddd		xmm5, xmm5)		\
+	AS2(	movdqa		xmm3, [esi])				\
+	AS2(	movq		xmm1, QWORD PTR [esi+8])	\
+	AS2(	pmuludq		xmm1, xmm3)		\
+	AS2(	pmuludq		xmm3, xmm3)		\
+	AS2(	movdqa		xmm0, [ebx])\
+	AS2(	movdqa		xmm2, xmm0)		\
+	AS2(	pand		xmm0, xmm1)		\
+	AS2(	psrld		xmm1, 16)		\
+	AS2(	paddd		xmm6, xmm0)		\
+	AS2(	paddd		xmm7, xmm1)		\
+	AS2(	pand		xmm2, xmm3)		\
+	AS2(	psrld		xmm3, 16)		\
+	AS2(	paddd		xmm6, xmm6)		\
+	AS2(	paddd		xmm7, xmm7)		\
+	AS2(	paddd		xmm4, xmm2)		\
+	AS2(	paddd		xmm5, xmm3)		\
+	AS2(	movq		xmm0, QWORD PTR [esp+4])\
+	AS2(	movq		xmm1, QWORD PTR [esp+12])\
+	AS2(	paddd		xmm4, xmm0)\
+	AS2(	paddd		xmm5, xmm1)\
+
+#define Squ_Column0(k, i)	/* Squaring column step that advances edi and edx by 16. Saves column k, opens the next column with SSE2_FirstMultiply(1), enters the LSqu<i> chain (nothing when i is 1), doubles all four accumulators, then adds back the carry spilled at [esp+4] and [esp+12] by Squ_SSE2_SaveShift. */ \
+	Squ_SSE2_SaveShift(k)					\
+	AS2(	add			edi, 16)	\
+	AS2(	add			edx, 16)	\
+	SSE2_FirstMultiply(1)\
+	Squ_Acc##i(i)	\
+	AS2(	paddd		xmm6, xmm6)		\
+	AS2(	paddd		xmm7, xmm7)		\
+	AS2(	paddd		xmm4, xmm4)		\
+	AS2(	paddd		xmm5, xmm5)		\
+	AS2(	movq		xmm0, QWORD PTR [esp+4])\
+	AS2(	movq		xmm1, QWORD PTR [esp+12])\
+	AS2(	paddd		xmm4, xmm0)\
+	AS2(	paddd		xmm5, xmm1)\
+
+#define SSE2_MulAdd45						/* Multiply step at offset 0: the 16-bit halves of [edi]*[esi] are ADDED into xmm4/xmm5 (which hold the carry left by SSE2_SaveShift), while the halves of [edx]*[esi] OVERWRITE xmm6/xmm7. */ \
+	AS2(	movdqa		xmm7, [esi])	\
+	AS2(	movdqa		xmm0, [edi])	\
+	AS2(	pmuludq		xmm0, xmm7)				\
+	AS2(	movdqa		xmm2, [ebx])		\
+	AS2(	pmuludq		xmm7, [edx])	\
+	AS2(	movdqa		xmm6, xmm2)				\
+	AS2(	pand		xmm2, xmm0)				\
+	AS2(	psrld		xmm0, 16)				\
+	AS2(	paddd		xmm4, xmm2)				\
+	AS2(	paddd		xmm5, xmm0)				\
+	AS2(	pand		xmm6, xmm7)				\
+	AS2(	psrld		xmm7, 16)
+// Mul_Begin(n), SSE2 version: runs MulPrologue (defined elsewhere), saves esp in esi, aligns esp to 16 bytes, reserves 48*n+16 bytes and pushes the saved esp. After the push, esp+20 is 16-byte aligned, as the movdqa stores need. The loop copies 8*n bytes from [eax+edx] (two pshufd permutations) and from [edi+edx] (one), each followed by a copy shifted right 32 bits, into three scratch regions at esp+20, esp+20+16*n and esp+20+32*n. edi, edx and esi are then pointed at those regions and column 0 is opened.
+#define Mul_Begin(n)							\
+	MulPrologue									\
+	AS2(	mov		esi, esp)\
+	AS2(	and		esp, 0xfffffff0)\
+	AS2(	sub		esp, 48*n+16)\
+	AS1(	push	esi)\
+	AS2(	xor		edx, edx)					\
+	ASL(1)										\
+	ASS(	pshufd	xmm0, [eax+edx], 3,1,2,0)	\
+	ASS(	pshufd	xmm1, [eax+edx], 2,0,3,1)	\
+	ASS(	pshufd	xmm2, [edi+edx], 3,1,2,0)	\
+	AS2(	movdqa	[esp+20+2*edx], xmm0)		\
+	AS2(	psrlq	xmm0, 32)					\
+	AS2(	movdqa	[esp+20+2*edx+16], xmm0)	\
+	AS2(	movdqa	[esp+20+16*n+2*edx], xmm1)		\
+	AS2(	psrlq	xmm1, 32)					\
+	AS2(	movdqa	[esp+20+16*n+2*edx+16], xmm1)	\
+	AS2(	movdqa	[esp+20+32*n+2*edx], xmm2)		\
+	AS2(	psrlq	xmm2, 32)					\
+	AS2(	movdqa	[esp+20+32*n+2*edx+16], xmm2)	\
+	AS2(	add		edx, 16)					\
+	AS2(	cmp		edx, 8*(n))					\
+	ASJ(	jne,	1, b)						\
+	AS2(	lea		edi, [esp+20])\
+	AS2(	lea		edx, [esp+20+16*n])\
+	AS2(	lea		esi, [esp+20+32*n])\
+	SSE2_FirstMultiply(0)							\
+
+#define Mul_Acc(i)								/* Emits label LMul<i> and one multiply-accumulate step into xmm4..xmm7. The offset expression i/2*(1-(i-2*(i/2))*2)*16 evaluates to +16*(i/2) for even i and -16*(i/2) for odd i; it is added to esi and subtracted from edi and edx. Falls through to the next step, so a run ending in ret forms a chain entered with call. */ \
+	ASL(LMul##i)										\
+	AS2(	movdqa		xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16])	\
+	AS2(	movdqa		xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16])	\
+	AS2(	movdqa		xmm2, [ebx])	\
+	AS2(	pmuludq		xmm0, xmm1)				\
+	AS2(	pmuludq		xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16])	\
+	AS2(	movdqa		xmm3, xmm2)			\
+	AS2(	pand		xmm2, xmm0)			\
+	AS2(	psrld		xmm0, 16)			\
+	AS2(	paddd		xmm4, xmm2)			\
+	AS2(	paddd		xmm5, xmm0)			\
+	AS2(	pand		xmm3, xmm1)			\
+	AS2(	psrld		xmm1, 16)			\
+	AS2(	paddd		xmm6, xmm3)			\
+	AS2(	paddd		xmm7, xmm1)		\
+
+#define Mul_Acc1(i)		// Mul_Acc<N>(i) is selected by token pasting in the *_Column macros: N == 1 expands to nothing, every other N to "call LMul<i>" (ASC is defined elsewhere), which enters the Mul_Acc label chain.
+#define Mul_Acc2(i)		ASC(call, LMul##i)
+#define Mul_Acc3(i)		Mul_Acc2(i)
+#define Mul_Acc4(i)		Mul_Acc2(i)
+#define Mul_Acc5(i)		Mul_Acc2(i)
+#define Mul_Acc6(i)		Mul_Acc2(i)
+#define Mul_Acc7(i)		Mul_Acc2(i)
+#define Mul_Acc8(i)		Mul_Acc2(i)
+#define Mul_Acc9(i)		Mul_Acc2(i)
+#define Mul_Acc10(i)	Mul_Acc2(i)
+#define Mul_Acc11(i)	Mul_Acc2(i)
+#define Mul_Acc12(i)	Mul_Acc2(i)
+#define Mul_Acc13(i)	Mul_Acc2(i)
+#define Mul_Acc14(i)	Mul_Acc2(i)
+#define Mul_Acc15(i)	Mul_Acc2(i)
+#define Mul_Acc16(i)	Mul_Acc2(i)
+// Mul_Column1(k, i): multiply column step that advances esi by 16. Saves column k, performs the offset-0 multiply (SSE2_MulAdd45), then enters the LMul<i> chain for the remaining products of the column.
+#define Mul_Column1(k, i)	\
+	SSE2_SaveShift(k)					\
+	AS2(	add			esi, 16)	\
+	SSE2_MulAdd45\
+	Mul_Acc##i(i)	\
+
+#define Mul_Column0(k, i)	/* Multiply column step that advances edi and edx by 16. Saves column k, performs the offset-0 multiply (SSE2_MulAdd45), then enters the LMul<i> chain for the remaining products of the column. */ \
+	SSE2_SaveShift(k)					\
+	AS2(	add			edi, 16)	\
+	AS2(	add			edx, 16)	\
+	SSE2_MulAdd45\
+	Mul_Acc##i(i)	\
+
+#define Bot_Acc(i)							/* Final-column accumulate for the bottom-half product: the unsplit products are added directly, [edi-off]*[esi+off] into xmm4 with a 64-bit add and [edx-off]*[esi+off] into xmm6 with a 32-bit add. off is +16*(i/2) for even i and -16*(i/2) for odd i. */ \
+	AS2(	movdqa		xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16])	\
+	AS2(	movdqa		xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16])	\
+	AS2(	pmuludq		xmm0, xmm1)				\
+	AS2(	pmuludq		xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16])		\
+	AS2(	paddq		xmm4, xmm0)				\
+	AS2(	paddd		xmm6, xmm1)
+// Bot_SaveAcc(k), SSE2 version: saves column k, advances edi and edx by 16, then opens the final column: adds [edi]*[esi] to xmm4 (64-bit add), folds xmm5 shifted left 16 bits into xmm4, and leaves [edx]*[esi] in xmm6 for Bot_Acc/Bot_End.
+#define Bot_SaveAcc(k)					\
+	SSE2_SaveShift(k)							\
+	AS2(	add			edi, 16)	\
+	AS2(	add			edx, 16)	\
+	AS2(	movdqa		xmm6, [esi])	\
+	AS2(	movdqa		xmm0, [edi])	\
+	AS2(	pmuludq		xmm0, xmm6)				\
+	AS2(	paddq		xmm4, xmm0)				\
+	AS2(	psllq		xmm5, 16)				\
+	AS2(	paddq		xmm4, xmm5)				\
+	AS2(	pmuludq		xmm6, [edx])
+// Bot_End(n), SSE2 version: adds the two quadword halves of xmm6 together, shifts the sum left 32 bits and adds it to xmm4, stores the low quadword of xmm4 at [ecx+8*(n-1)], restores the stack pointer with "pop esp" and expands MulEpilogue.
+#define Bot_End(n)							\
+	AS2(	movhlps		xmm7, xmm6)			\
+	AS2(	paddd		xmm6, xmm7)			\
+	AS2(	psllq		xmm6, 32)			\
+	AS2(	paddd		xmm4, xmm6)			\
+	AS2(	movq		QWORD PTR [ecx+8*((n)-1)], xmm4)	\
+	AS1(	pop		esp)\
+	MulEpilogue
 
-static PAddSub s_pAdd, s_pSub;
-#ifdef SSE2_INTRINSICS_AVAILABLE
-static PMul s_pMul4, s_pMul8, s_pMul8B;
+#define Top_Begin(n)							/* Setup for the top-half multiply. Same scratch layout and copy loop as Mul_Begin, but it runs TopPrologue, saves esp through edx, copies esi into eax once the loop no longer needs eax as a source pointer, points edi/edx/esi 16*(n/2-1) bytes into the three scratch regions, and clears xmm4/xmm5 instead of doing a first multiply. NOTE(review): esi presumably carries the argument L at that point - confirm against TopPrologue. */ \
+	TopPrologue									\
+	AS2(	mov		edx, esp)\
+	AS2(	and		esp, 0xfffffff0)\
+	AS2(	sub		esp, 48*n+16)\
+	AS1(	push	edx)\
+	AS2(	xor		edx, edx)					\
+	ASL(1)										\
+	ASS(	pshufd	xmm0, [eax+edx], 3,1,2,0)	\
+	ASS(	pshufd	xmm1, [eax+edx], 2,0,3,1)	\
+	ASS(	pshufd	xmm2, [edi+edx], 3,1,2,0)	\
+	AS2(	movdqa	[esp+20+2*edx], xmm0)		\
+	AS2(	psrlq	xmm0, 32)					\
+	AS2(	movdqa	[esp+20+2*edx+16], xmm0)	\
+	AS2(	movdqa	[esp+20+16*n+2*edx], xmm1)		\
+	AS2(	psrlq	xmm1, 32)					\
+	AS2(	movdqa	[esp+20+16*n+2*edx+16], xmm1)	\
+	AS2(	movdqa	[esp+20+32*n+2*edx], xmm2)		\
+	AS2(	psrlq	xmm2, 32)					\
+	AS2(	movdqa	[esp+20+32*n+2*edx+16], xmm2)	\
+	AS2(	add		edx, 16)					\
+	AS2(	cmp		edx, 8*(n))					\
+	ASJ(	jne,	1, b)						\
+	AS2(	mov		eax, esi)					\
+	AS2(	lea		edi, [esp+20+00*n+16*(n/2-1)])\
+	AS2(	lea		edx, [esp+20+16*n+16*(n/2-1)])\
+	AS2(	lea		esi, [esp+20+32*n+16*(n/2-1)])\
+	AS2(	pxor	xmm4, xmm4)\
+	AS2(	pxor	xmm5, xmm5)
+// Top_Acc(i): multiplies the quadword at [esi+off+8] by [edx-off] and adds only bits 48 and up of the 64-bit product to xmm5. off is +16*(i/2) for even i and -16*(i/2) for odd i.
+#define Top_Acc(i)							\
+	AS2(	movq		xmm0, QWORD PTR [esi+i/2*(1-(i-2*(i/2))*2)*16+8])	\
+	AS2(	pmuludq		xmm0, [edx-i/2*(1-(i-2*(i/2))*2)*16])	\
+	AS2(	psrlq		xmm0, 48)				\
+	AS2(	paddd		xmm5, xmm0)\
+
+#define Top_Column0(i)	/* First full column of the top-half multiply: shifts the Top_Acc sum in xmm5 left by 32 bits, advances edi and edx by 16, then does SSE2_MulAdd45 and enters the LMul<i> chain. Nothing is stored to the result here. */ \
+	AS2(	psllq		xmm5, 32)				\
+	AS2(	add			edi, 16)	\
+	AS2(	add			edx, 16)	\
+	SSE2_MulAdd45\
+	Mul_Acc##i(i)	\
+
+#define Top_Column1(i)	/* Second column of the top-half multiply. SSE2_SaveShift(0) writes the preceding column to [ecx]; esi is advanced and the next column accumulated. Then the upper 16 bits of the dword at [ecx+4] are compared with eax >> 16 (pcmpgtd) and 1 is added to xmm4 when the stored value is greater. NOTE(review): presumably a carry correction against the caller-supplied L - confirm. */ \
+	SSE2_SaveShift(0)					\
+	AS2(	add			esi, 16)	\
+	SSE2_MulAdd45\
+	Mul_Acc##i(i)	\
+	AS2(	shr			eax, 16)	\
+	AS2(	movd		xmm0, eax)\
+	AS2(	movd		xmm1, [ecx+4])\
+	AS2(	psrld		xmm1, 16)\
+	AS2(	pcmpgtd		xmm1, xmm0)\
+	AS2(	psrld		xmm1, 31)\
+	AS2(	paddd		xmm4, xmm1)\
+
+void SSE2_Square4(word *C, const word *A)	// SSE2 squaring of a 16-byte input (Squ_Begin(2) copies 8*2 bytes); the body is inline assembly built from the Squ_* macros, with results stored through ecx. Needs no LSqu label chain because its only column uses Squ_Acc1, which is empty.
+{
+	Squ_Begin(2)
+	Squ_Column0(0, 1)
+	Squ_End(2)
+}
+// SSE2 squaring of a 32-byte input (Squ_Begin(4) copies 8*4 bytes). The jmp ... ret sequence plants the callable LSqu2 step used by Squ_Column0(2, 2); it is compiled only when __GNUC__ is not defined. NOTE(review): GCC builds presumably reuse the labels emitted unconditionally in SSE2_Square32 - confirm.
+void SSE2_Square8(word *C, const word *A)
+{
+	Squ_Begin(4)
+#ifndef __GNUC__
+	ASJ(	jmp,	0, f)
+	Squ_Acc(2)
+	AS1(	ret) ASL(0)
 #endif
+	Squ_Column0(0, 1)
+	Squ_Column1(1, 1)
+	Squ_Column0(2, 2)
+	Squ_Column1(3, 1)
+	Squ_Column0(4, 1)
+	Squ_End(4)
+}
 
-static void SetPentiumFunctionPointers()
+void SSE2_Square16(word *C, const word *A)	// SSE2 squaring of a 64-byte input (Squ_Begin(8) copies 8*8 bytes). The LSqu4..LSqu2 chain is emitted only when __GNUC__ is not defined; the second argument of each Squ_Column is the chain entry point for that column.
 {
-	if (IsP4())
-	{
-		s_pAdd = &P4Optimized::Add;
-		s_pSub = &P4Optimized::Subtract;
-	}
-	else
-	{
-		s_pAdd = &PentiumOptimized::Add;
-		s_pSub = &PentiumOptimized::Subtract;
-	}
-
-#ifdef SSE2_INTRINSICS_AVAILABLE
-	if (HasSSE2())
-	{
-		s_pMul4 = &P4Optimized::Multiply4;
-		s_pMul8 = &P4Optimized::Multiply8;
-		s_pMul8B = &P4Optimized::Multiply8Bottom;
-	}
-	else
-	{
-		s_pMul4 = &PentiumOptimized::Multiply4;
-		s_pMul8 = &PentiumOptimized::Multiply8;
-		s_pMul8B = &PentiumOptimized::Multiply8Bottom;
-	}
+	Squ_Begin(8)
+#ifndef __GNUC__
+	ASJ(	jmp,	0, f)
+	Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
+	AS1(	ret) ASL(0)
+#endif
+	Squ_Column0(0, 1)
+	Squ_Column1(1, 1)
+	Squ_Column0(2, 2)
+	Squ_Column1(3, 2)
+	Squ_Column0(4, 3)
+	Squ_Column1(5, 3)
+	Squ_Column0(6, 4)
+	Squ_Column1(7, 3)
+	Squ_Column0(8, 3)
+	Squ_Column1(9, 2)
+	Squ_Column0(10, 2)
+	Squ_Column1(11, 1)
+	Squ_Column0(12, 1)
+	Squ_End(8)
+}
+// SSE2 squaring of a 128-byte input (Squ_Begin(16) copies 8*16 bytes). Unlike the smaller variants, the LSqu8..LSqu2 chain is emitted here without an #ifndef __GNUC__ guard. NOTE(review): presumably so that GCC builds get exactly one definition of each label, shared by all the Square routines - confirm.
+void SSE2_Square32(word *C, const word *A)
+{
+	Squ_Begin(16)
+	ASJ(	jmp,	0, f)
+	Squ_Acc(8) Squ_Acc(7) Squ_Acc(6) Squ_Acc(5) Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
+	AS1(	ret) ASL(0)
+	Squ_Column0(0, 1)
+	Squ_Column1(1, 1)
+	Squ_Column0(2, 2)
+	Squ_Column1(3, 2)
+	Squ_Column0(4, 3)
+	Squ_Column1(5, 3)
+	Squ_Column0(6, 4)
+	Squ_Column1(7, 4)
+	Squ_Column0(8, 5)
+	Squ_Column1(9, 5)
+	Squ_Column0(10, 6)
+	Squ_Column1(11, 6)
+	Squ_Column0(12, 7)
+	Squ_Column1(13, 7)
+	Squ_Column0(14, 8)
+	Squ_Column1(15, 7)
+	Squ_Column0(16, 7)
+	Squ_Column1(17, 6)
+	Squ_Column0(18, 6)
+	Squ_Column1(19, 5)
+	Squ_Column0(20, 5)
+	Squ_Column1(21, 4)
+	Squ_Column0(22, 4)
+	Squ_Column1(23, 3)
+	Squ_Column0(24, 3)
+	Squ_Column1(25, 2)
+	Squ_Column0(26, 2)
+	Squ_Column1(27, 1)
+	Squ_Column0(28, 1)
+	Squ_End(16)
+}
+// SSE2 multiply of two 16-byte inputs (Mul_Begin(2) copies 8*2 bytes from each operand). The callable LMul2 step is emitted only when __GNUC__ is not defined; Mul_Column0(0, 2) enters it with call.
+void SSE2_Multiply4(word *C, const word *A, const word *B)
+{
+	Mul_Begin(2)
+#ifndef __GNUC__
+	ASJ(	jmp,	0, f)
+	Mul_Acc(2)
+	AS1(	ret) ASL(0)
 #endif
+	Mul_Column0(0, 2)
+	Mul_End(2)
 }
 
-void DisableSSE2()
+void SSE2_Multiply8(word *C, const word *A, const word *B)	// SSE2 multiply of two 32-byte inputs (Mul_Begin(4)). The LMul4..LMul2 chain is emitted only when __GNUC__ is not defined; the second argument of each Mul_Column is the chain entry point for that column.
 {
-	s_sse2Enabled = false;
-	SetPentiumFunctionPointers();
+	Mul_Begin(4)
+#ifndef __GNUC__
+	ASJ(	jmp,	0, f)
+	Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
+	AS1(	ret) ASL(0)
+#endif
+	Mul_Column0(0, 2)
+	Mul_Column1(1, 3)
+	Mul_Column0(2, 4)
+	Mul_Column1(3, 3)
+	Mul_Column0(4, 2)
+	Mul_End(4)
 }
 
-class LowLevel : public PentiumOptimized
+void SSE2_Multiply16(word *C, const word *A, const word *B)	// SSE2 multiply of two 64-byte inputs (Mul_Begin(8)). The LMul8..LMul2 chain is emitted only when __GNUC__ is not defined; the second argument of each Mul_Column is the chain entry point for that column.
 {
-public:
-	inline static int Add(word *C, const word *A, const word *B, size_t N)
-		{return s_pAdd(C, A, B, N);}
-	inline static int Subtract(word *C, const word *A, const word *B, size_t N)
-		{return s_pSub(C, A, B, N);}
-	inline static void Square4(word *R, const word *A)
-		{Multiply4(R, A, A);}
-#ifdef SSE2_INTRINSICS_AVAILABLE
-	inline static void Multiply4(word *C, const word *A, const word *B)
-		{s_pMul4(C, A, B);}
-	inline static void Multiply8(word *C, const word *A, const word *B)
-		{s_pMul8(C, A, B);}
-	inline static void Multiply8Bottom(word *C, const word *A, const word *B)
-		{s_pMul8B(C, A, B);}
+	Mul_Begin(8)
+#ifndef __GNUC__
+	ASJ(	jmp,	0, f)
+	Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
+	AS1(	ret) ASL(0)
 #endif
-};
-
-// use some tricks to share assembly code between MSVC and GCC
-#ifdef _MSC_VER
-	#define CRYPTOPP_NAKED __declspec(naked)
-	#define AS1(x) __asm x
-	#define AS2(x, y) __asm x, y
-	#define AddPrologue \
-		__asm	push ebp \
-		__asm	push ebx \
-		__asm	push esi \
-		__asm	push edi \
-		__asm	mov		ecx, [esp+20] \
-		__asm	mov		edx, [esp+24] \
-		__asm	mov		ebx, [esp+28] \
-		__asm	mov		esi, [esp+32]
-	#define AddEpilogue \
-		__asm	pop edi \
-		__asm	pop esi \
-		__asm	pop ebx \
-		__asm	pop ebp \
-		__asm	ret
-	#define MulPrologue \
-		__asm	push ebp \
-		__asm	push ebx \
-		__asm	push esi \
-		__asm	push edi \
-		__asm	mov ecx, [esp+28] \
-		__asm	mov esi, [esp+24] \
-		__asm	push [esp+20]
-	#define MulEpilogue \
-		__asm	add esp, 4 \
-		__asm	pop edi \
-		__asm	pop esi \
-		__asm	pop ebx \
-		__asm	pop ebp \
-		__asm	ret
-#else
-	#define CRYPTOPP_NAKED
-	#define AS1(x) #x ";"
-	#define AS2(x, y) #x ", " #y ";"
-	#define AddPrologue \
-		__asm__ __volatile__ \
-		( \
-			"push %%ebx;"	/* save this manually, in case of -fPIC */ \
-			"mov %2, %%ebx;" \
-			".intel_syntax noprefix;" \
-			"push ebp;"
-	#define AddEpilogue \
-			"pop ebp;" \
-			".att_syntax prefix;" \
-			"pop %%ebx;" \
-					: \
-					: "c" (C), "d" (A), "m" (B), "S" (N) \
-					: "%edi", "memory", "cc" \
-		);
-	#define MulPrologue \
-		__asm__ __volatile__ \
-		( \
-			"push %%ebx;"	/* save this manually, in case of -fPIC */ \
-			"push %%ebp;" \
-			"push %0;" \
-			".intel_syntax noprefix;"
-	#define MulEpilogue \
-			"add esp, 4;" \
-			"pop ebp;" \
-			"pop ebx;" \
-			".att_syntax prefix;" \
-			: \
-			: "rm" (Z), "S" (X), "c" (Y) \
-			: "%eax", "%edx", "%edi", "memory", "cc" \
-		);
+	Mul_Column0(0, 2)
+	Mul_Column1(1, 3)
+	Mul_Column0(2, 4)
+	Mul_Column1(3, 5)
+	Mul_Column0(4, 6)
+	Mul_Column1(5, 7)
+	Mul_Column0(6, 8)
+	Mul_Column1(7, 7)
+	Mul_Column0(8, 6)
+	Mul_Column1(9, 5)
+	Mul_Column0(10, 4)
+	Mul_Column1(11, 3)
+	Mul_Column0(12, 2)
+	Mul_End(8)
+}
+// SSE2 multiply of two 128-byte inputs (Mul_Begin(16)). Unlike the smaller variants, the LMul16..LMul2 chain is emitted here without an #ifndef __GNUC__ guard. NOTE(review): presumably so that GCC builds get exactly one definition of each label, shared by all the Multiply routines - confirm.
+void SSE2_Multiply32(word *C, const word *A, const word *B)
+{
+	Mul_Begin(16)
+	ASJ(	jmp,	0, f)
+	Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
+	AS1(	ret) ASL(0)
+	Mul_Column0(0, 2)
+	Mul_Column1(1, 3)
+	Mul_Column0(2, 4)
+	Mul_Column1(3, 5)
+	Mul_Column0(4, 6)
+	Mul_Column1(5, 7)
+	Mul_Column0(6, 8)
+	Mul_Column1(7, 9)
+	Mul_Column0(8, 10)
+	Mul_Column1(9, 11)
+	Mul_Column0(10, 12)
+	Mul_Column1(11, 13)
+	Mul_Column0(12, 14)
+	Mul_Column1(13, 15)
+	Mul_Column0(14, 16)
+	Mul_Column1(15, 15)
+	Mul_Column0(16, 14)
+	Mul_Column1(17, 13)
+	Mul_Column0(18, 12)
+	Mul_Column1(19, 11)
+	Mul_Column0(20, 10)
+	Mul_Column1(21, 9)
+	Mul_Column0(22, 8)
+	Mul_Column1(23, 7)
+	Mul_Column0(24, 6)
+	Mul_Column1(25, 5)
+	Mul_Column0(26, 4)
+	Mul_Column1(27, 3)
+	Mul_Column0(28, 2)
+	Mul_End(16)
+}
+// SSE2 bottom-half multiply of two 16-byte inputs: stores column 0 and the final quadword at [ecx+8], 16 bytes in all. Needs no label chain because Bot_Acc is expanded inline.
+void SSE2_MultiplyBottom4(word *C, const word *A, const word *B)
+{
+	Mul_Begin(2)
+	Bot_SaveAcc(0) Bot_Acc(2)
+	Bot_End(2)
+}
+// SSE2 bottom-half multiply of two 32-byte inputs: columns 0-1 use the full split accumulation (Mul_Column*), the last column uses the inline Bot_* forms, and Bot_End(4) stores the final quadword at [ecx+24]. The LMul3..LMul2 chain is emitted only when __GNUC__ is not defined.
+void SSE2_MultiplyBottom8(word *C, const word *A, const word *B)
+{
+	Mul_Begin(4)
+#ifndef __GNUC__
+	ASJ(	jmp,	0, f)
+	Mul_Acc(3) Mul_Acc(2)
+	AS1(	ret) ASL(0)
 #endif
+	Mul_Column0(0, 2)
+	Mul_Column1(1, 3)
+	Bot_SaveAcc(2) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
+	Bot_End(4)
+}
 
-CRYPTOPP_NAKED int PentiumOptimized::Add(word *C, const word *A, const word *B, size_t N)
+void SSE2_MultiplyBottom16(word *C, const word *A, const word *B)	// SSE2 bottom-half multiply of two 64-byte inputs: columns 0-5 use Mul_Column*, the last column uses the inline Bot_* forms, and Bot_End(8) stores the final quadword at [ecx+56]. The LMul7..LMul2 chain is emitted only when __GNUC__ is not defined.
 {
-	AddPrologue
-
-	// now: ebx = B, ecx = C, edx = A, esi = N
-	AS2(	sub ecx, edx)	// hold the distance between C & A so we can add this to A to get C
-	AS2(	xor eax, eax)	// clear eax
-
-	AS2(	sub eax, esi)	// eax is a negative index from end of B
-	AS2(	lea ebx, [ebx+4*esi])	// ebx is end of B
-
-	AS2(	sar eax, 1)		// unit of eax is now dwords; this also clears the carry flag
-	AS1(	jz	loopendAdd)		// if no dwords then nothing to do
-
-	AS1(loopstartAdd:)
-	AS2(	mov    esi,[edx])			// load lower word of A
-	AS2(	mov    ebp,[edx+4])			// load higher word of A
-
-	AS2(	mov    edi,[ebx+8*eax])		// load lower word of B
-	AS2(	lea    edx,[edx+8])			// advance A and C
-
-	AS2(	adc    esi,edi)				// add lower words
-	AS2(	mov    edi,[ebx+8*eax+4])	// load higher word of B
-
-	AS2(	adc    ebp,edi)				// add higher words
-	AS1(	inc    eax)					// advance B
-
-	AS2(	mov    [edx+ecx-8],esi)		// store lower word result
-	AS2(	mov    [edx+ecx-4],ebp)		// store higher word result
-
-	AS1(	jnz    loopstartAdd)			// loop until eax overflows and becomes zero
-
-	AS1(loopendAdd:)
-	AS2(	adc eax, 0)		// store carry into eax (return result register)
-
-	AddEpilogue
+	Mul_Begin(8)
+#ifndef __GNUC__
+	ASJ(	jmp,	0, f)
+	Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
+	AS1(	ret) ASL(0)
+#endif
+	Mul_Column0(0, 2)
+	Mul_Column1(1, 3)
+	Mul_Column0(2, 4)
+	Mul_Column1(3, 5)
+	Mul_Column0(4, 6)
+	Mul_Column1(5, 7)
+	Bot_SaveAcc(6) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
+	Bot_End(8)
+}
+// SSE2 bottom-half multiply of two 128-byte inputs: columns 0-13 use Mul_Column*, the last column uses the inline Bot_* forms, and Bot_End(16) stores the final quadword at [ecx+120]. The LMul15..LMul2 chain is emitted only when __GNUC__ is not defined.
+void SSE2_MultiplyBottom32(word *C, const word *A, const word *B)
+{
+	Mul_Begin(16)
+#ifndef __GNUC__
+	ASJ(	jmp,	0, f)
+	Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
+	AS1(	ret) ASL(0)
+#endif
+	Mul_Column0(0, 2)
+	Mul_Column1(1, 3)
+	Mul_Column0(2, 4)
+	Mul_Column1(3, 5)
+	Mul_Column0(4, 6)
+	Mul_Column1(5, 7)
+	Mul_Column0(6, 8)
+	Mul_Column1(7, 9)
+	Mul_Column0(8, 10)
+	Mul_Column1(9, 11)
+	Mul_Column0(10, 12)
+	Mul_Column1(11, 13)
+	Mul_Column0(12, 14)
+	Mul_Column1(13, 15)
+	Bot_SaveAcc(14) Bot_Acc(16) Bot_Acc(15) Bot_Acc(14) Bot_Acc(13) Bot_Acc(12) Bot_Acc(11) Bot_Acc(10) Bot_Acc(9) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
+	Bot_End(16)
+}
+// SSE2 top-half multiply of two 32-byte inputs with the extra word L. Top_Acc sums the high bits of the products below the first kept column. Top_Column1 writes that column to [ecx] as scratch and applies a correction; the following Mul_Column0(0, ...) overwrites the same location. Results are stored at columns 0-3 through ecx. NOTE(review): the exact role of L is set by TopPrologue, which is not visible here.
+void SSE2_MultiplyTop8(word *C, const word *A, const word *B, word L)
+{
+	Top_Begin(4)
+	Top_Acc(3) Top_Acc(2) Top_Acc(1)
+#ifndef __GNUC__
+	ASJ(	jmp,	0, f)
+	Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
+	AS1(	ret) ASL(0)
+#endif
+	Top_Column0(4)
+	Top_Column1(3)
+	Mul_Column0(0, 2)
+	Top_End(2)
 }
 
-CRYPTOPP_NAKED int PentiumOptimized::Subtract(word *C, const word *A, const word *B, size_t N)
+void SSE2_MultiplyTop16(word *C, const word *A, const word *B, word L)	// SSE2 top-half multiply of two 64-byte inputs with the extra word L: Top_Acc pre-sums the high bits of the lower products, Top_Column0/1 handle the first two full columns, then ordinary Mul_Column steps and Top_End(4) store columns 0-7 through ecx. The LMul8..LMul2 chain is emitted only when __GNUC__ is not defined.
 {
-	AddPrologue
-
-	// now: ebx = B, ecx = C, edx = A, esi = N
-	AS2(	sub ecx, edx)	// hold the distance between C & A so we can add this to A to get C
-	AS2(	xor eax, eax)	// clear eax
-
-	AS2(	sub eax, esi)	// eax is a negative index from end of B
-	AS2(	lea ebx, [ebx+4*esi])	// ebx is end of B
-
-	AS2(	sar eax, 1)		// unit of eax is now dwords; this also clears the carry flag
-	AS1(	jz	loopendSub)		// if no dwords then nothing to do
-
-	AS1(loopstartSub:)
-	AS2(	mov    esi,[edx])			// load lower word of A
-	AS2(	mov    ebp,[edx+4])			// load higher word of A
-
-	AS2(	mov    edi,[ebx+8*eax])		// load lower word of B
-	AS2(	lea    edx,[edx+8])			// advance A and C
-
-	AS2(	sbb    esi,edi)				// subtract lower words
-	AS2(	mov    edi,[ebx+8*eax+4])	// load higher word of B
-
-	AS2(	sbb    ebp,edi)				// subtract higher words
-	AS1(	inc    eax)					// advance B
-
-	AS2(	mov    [edx+ecx-8],esi)		// store lower word result
-	AS2(	mov    [edx+ecx-4],ebp)		// store higher word result
+	Top_Begin(8)
+	Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
+#ifndef __GNUC__
+	ASJ(	jmp,	0, f)
+	Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
+	AS1(	ret) ASL(0)
+#endif
+	Top_Column0(8)
+	Top_Column1(7)
+	Mul_Column0(0, 6)
+	Mul_Column1(1, 5)
+	Mul_Column0(2, 4)
+	Mul_Column1(3, 3)
+	Mul_Column0(4, 2)
+	Top_End(4)
+}
+
+void SSE2_MultiplyTop32(word *C, const word *A, const word *B, word L)	// installed as s_pTop[2]; MultiplyTop() invokes it as s_pTop[N/16](R, A, B, L[N-1]) when N == 32
+{
+	Top_Begin(16)
+	Top_Acc(15) Top_Acc(14) Top_Acc(13) Top_Acc(12) Top_Acc(11) Top_Acc(10) Top_Acc(9) Top_Acc(8) Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
+#ifndef __GNUC__	// non-GCC builds only: a Mul_Acc run ending in ret, stepped over by the jmp to label 0
+	ASJ(	jmp,	0, f)
+	Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
+	AS1(	ret) ASL(0)	// NOTE(review): presumably a local subroutine used by the column macros; macro bodies are outside this chunk, confirm
+#endif
+	Top_Column0(16)
+	Top_Column1(15)
+	Mul_Column0(0, 14)	// second argument steps down by one per column (14 ... 2) while the first steps up (0 ... 12)
+	Mul_Column1(1, 13)
+	Mul_Column0(2, 12)
+	Mul_Column1(3, 11)
+	Mul_Column0(4, 10)
+	Mul_Column1(5, 9)
+	Mul_Column0(6, 8)
+	Mul_Column1(7, 7)
+	Mul_Column0(8, 6)
+	Mul_Column1(9, 5)
+	Mul_Column0(10, 4)
+	Mul_Column1(11, 3)
+	Mul_Column0(12, 2)
+	Top_End(8)
+}
+
+#endif	// #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
 
-	AS1(	jnz    loopstartSub)			// loop until eax overflows and becomes zero
+// ********************************************************
 
-	AS1(loopendSub:)
-	AS2(	adc eax, 0)		// store carry into eax (return result register)
+typedef int (CRYPTOPP_FASTCALL * PAdd)(size_t N, word *C, const word *A, const word *B);	// add/subtract kernel (type of s_pAdd and s_pSub); note that N comes first
+typedef void (* PMul)(word *C, const word *A, const word *B);	// fixed-size multiply kernel (entry type of s_pMul[] and s_pBot[])
+typedef void (* PSqu)(word *C, const word *A);	// fixed-size squaring kernel (entry type of s_pSqu[])
+typedef void (* PMulTop)(word *C, const word *A, const word *B, word L);	// multiply-top kernel (entry type of s_pTop[]); L receives a single word
 
-	AddEpilogue
-}
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE	// SSE2 asm compiled in: kernels are chosen at run time by SetFunctionPointers()
+static PAdd s_pAdd = &Baseline_Add, s_pSub = &Baseline_Sub;	// defaults; replaced by SSE2_Add/SSE2_Sub when HasSSE2() && IsP4()
+static PMulTop s_pTop[3];	// indexed by N/16 in MultiplyTop(): [0]=Top8, [1]=Top16, [2]=Top32; filled only when HasSSE2()
+static size_t s_recursionLimit = 8;	// largest N sent straight to a kernel table; raised to 32 when HasSSE2()
+#else
+static const size_t s_recursionLimit = 8;	// no SSE2 asm: fixed limit, the Baseline tables stop at the 8-word kernels
+#endif
 
-// On Pentium 4, the adc and sbb instructions are very expensive, so avoid them.
+static PMul s_pMul[9], s_pBot[9];	// full / bottom-half multiply kernels indexed by N/4: [0]=2 words, [1]=4, [2]=8; [4]=16 and [8]=32 are set only on the SSE2 path
+static PSqu s_pSqu[9];	// squaring kernels, same N/4 indexing as s_pMul
 
-CRYPTOPP_NAKED int P4Optimized::Add(word *C, const word *A, const word *B, size_t N)
+static void SetFunctionPointers()
 {
-	AddPrologue
+	s_pMul[0] = &Baseline_Multiply2;
+	s_pBot[0] = &Baseline_MultiplyBottom2;
+	s_pSqu[0] = &Baseline_Square2;
 
-	// now: ebx = B, ecx = C, edx = A, esi = N
-	AS2(	xor		eax, eax)
-	AS1(	neg		esi)
-	AS1(	jz		loopendAddP4)		// if no dwords then nothing to do
-
-	AS2(	mov		edi, [edx])
-	AS2(	mov		ebp, [ebx])
-	AS1(	jmp		carry1AddP4)
-
-	AS1(loopstartAddP4:)
-	AS2(	mov		edi, [edx+8])
-	AS2(	add		ecx, 8)
-	AS2(	add		edx, 8)
-	AS2(	mov		ebp, [ebx])
-	AS2(	add		edi, eax)
-	AS1(	jc		carry1AddP4)
-	AS2(	xor		eax, eax)
-
-	AS1(carry1AddP4:)
-	AS2(	add		edi, ebp)
-	AS2(	mov		ebp, 1)
-	AS2(	mov		[ecx], edi)
-	AS2(	mov		edi, [edx+4])
-	AS2(	cmovc	eax, ebp)
-	AS2(	mov		ebp, [ebx+4])
-	AS2(	add		ebx, 8)
-	AS2(	add		edi, eax)
-	AS1(	jc		carry2AddP4)
-	AS2(	xor		eax, eax)
-
-	AS1(carry2AddP4:)
-	AS2(	add		edi, ebp)
-	AS2(	mov		ebp, 1)
-	AS2(	cmovc	eax, ebp)
-	AS2(	mov		[ecx+4], edi)
-	AS2(	add		esi, 2)
-	AS1(	jnz		loopstartAddP4)
-
-	AS1(loopendAddP4:)
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+	if (HasSSE2())
+	{
+		if (IsP4())
+		{
+			s_pAdd = &SSE2_Add;
+			s_pSub = &SSE2_Sub;
+		}
 
-	AddEpilogue
-}
+		s_recursionLimit = 32;
 
-CRYPTOPP_NAKED int P4Optimized::Subtract(word *C, const word *A, const word *B, size_t N)
-{
-	AddPrologue
+		s_pMul[1] = &SSE2_Multiply4;
+		s_pMul[2] = &SSE2_Multiply8;
+		s_pMul[4] = &SSE2_Multiply16;
+		s_pMul[8] = &SSE2_Multiply32;
 
-	// now: ebx = B, ecx = C, edx = A, esi = N
-	AS2(	xor		eax, eax)
-	AS1(	neg		esi)
-	AS1(	jz		loopendSubP4)		// if no dwords then nothing to do
-
-	AS2(	mov		edi, [edx])
-	AS2(	mov		ebp, [ebx])
-	AS1(	jmp		carry1SubP4)
-
-	AS1(loopstartSubP4:)
-	AS2(	mov		edi, [edx+8])
-	AS2(	add		edx, 8)
-	AS2(	add		ecx, 8)
-	AS2(	mov		ebp, [ebx])
-	AS2(	sub		edi, eax)
-	AS1(	jc		carry1SubP4)
-	AS2(	xor		eax, eax)
-
-	AS1(carry1SubP4:)
-	AS2(	sub		edi, ebp)
-	AS2(	mov		ebp, 1)
-	AS2(	mov		[ecx], edi)
-	AS2(	mov		edi, [edx+4])
-	AS2(	cmovc	eax, ebp)
-	AS2(	mov		ebp, [ebx+4])
-	AS2(	add		ebx, 8)
-	AS2(	sub		edi, eax)
-	AS1(	jc		carry2SubP4)
-	AS2(	xor		eax, eax)
-
-	AS1(carry2SubP4:)
-	AS2(	sub		edi, ebp)
-	AS2(	mov		ebp, 1)
-	AS2(	cmovc	eax, ebp)
-	AS2(	mov		[ecx+4], edi)
-	AS2(	add		esi, 2)
-	AS1(	jnz		loopstartSubP4)
-
-	AS1(loopendSubP4:)
+		s_pBot[1] = &SSE2_MultiplyBottom4;
+		s_pBot[2] = &SSE2_MultiplyBottom8;
+		s_pBot[4] = &SSE2_MultiplyBottom16;
+		s_pBot[8] = &SSE2_MultiplyBottom32;
 
-	AddEpilogue
-}
+		s_pSqu[1] = &SSE2_Square4;
+		s_pSqu[2] = &SSE2_Square8;
+		s_pSqu[4] = &SSE2_Square16;
+		s_pSqu[8] = &SSE2_Square32;
 
-// multiply assembly code originally contributed by Leonard Janke
-
-#define MulStartup \
-	AS2(xor ebp, ebp) \
-	AS2(xor edi, edi) \
-	AS2(xor ebx, ebx) 
-
-#define MulShiftCarry \
-	AS2(mov ebp, edx) \
-	AS2(mov edi, ebx) \
-	AS2(xor ebx, ebx)
-
-#define MulAccumulateBottom(i,j) \
-	AS2(mov eax, [ecx+4*j]) \
-	AS2(imul eax, dword ptr [esi+4*i]) \
-	AS2(add ebp, eax)
-
-#define MulAccumulate(i,j) \
-	AS2(mov eax, [ecx+4*j]) \
-	AS1(mul dword ptr [esi+4*i]) \
-	AS2(add ebp, eax) \
-	AS2(adc edi, edx) \
-	AS2(adc bl, bh)
-
-#define MulStoreDigit(i)  \
-	AS2(mov edx, edi) \
-	AS2(mov edi, [esp]) \
-	AS2(mov [edi+4*i], ebp)
-
-#define MulLastDiagonal(digits) \
-	AS2(mov eax, [ecx+4*(digits-1)]) \
-	AS1(mul dword ptr [esi+4*(digits-1)]) \
-	AS2(add ebp, eax) \
-	AS2(adc edx, edi) \
-	AS2(mov edi, [esp]) \
-	AS2(mov [edi+4*(2*digits-2)], ebp) \
-	AS2(mov [edi+4*(2*digits-1)], edx)
-
-CRYPTOPP_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
-{
-	MulPrologue
-	// now: [esp] = Z, esi = X, ecx = Y
-	MulStartup
-	MulAccumulate(0,0)
-	MulStoreDigit(0)
-	MulShiftCarry
-
-	MulAccumulate(1,0)
-	MulAccumulate(0,1)
-	MulStoreDigit(1)
-	MulShiftCarry
-
-	MulAccumulate(2,0)
-	MulAccumulate(1,1)
-	MulAccumulate(0,2)
-	MulStoreDigit(2)
-	MulShiftCarry
-
-	MulAccumulate(3,0)
-	MulAccumulate(2,1)
-	MulAccumulate(1,2)
-	MulAccumulate(0,3)
-	MulStoreDigit(3)
-	MulShiftCarry
-
-	MulAccumulate(3,1)
-	MulAccumulate(2,2)
-	MulAccumulate(1,3)
-	MulStoreDigit(4)
-	MulShiftCarry
-
-	MulAccumulate(3,2)
-	MulAccumulate(2,3)
-	MulStoreDigit(5)
-	MulShiftCarry
-
-	MulLastDiagonal(4)
-	MulEpilogue
-}
+		s_pTop[0] = &SSE2_MultiplyTop8;
+		s_pTop[1] = &SSE2_MultiplyTop16;
+		s_pTop[2] = &SSE2_MultiplyTop32;
+	}
+	else
+#endif
+	{
+		s_pMul[1] = &Baseline_Multiply4;
+		s_pMul[2] = &Baseline_Multiply8;
+//		s_pMul[4] = &Baseline_Multiply16;
 
-CRYPTOPP_NAKED void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
-{
-	MulPrologue
-	// now: [esp] = Z, esi = X, ecx = Y
-	MulStartup
-	MulAccumulate(0,0)
-	MulStoreDigit(0)
-	MulShiftCarry
-
-	MulAccumulate(1,0)
-	MulAccumulate(0,1)
-	MulStoreDigit(1)
-	MulShiftCarry
-
-	MulAccumulate(2,0)
-	MulAccumulate(1,1)
-	MulAccumulate(0,2)
-	MulStoreDigit(2)
-	MulShiftCarry
-
-	MulAccumulate(3,0)
-	MulAccumulate(2,1)
-	MulAccumulate(1,2)
-	MulAccumulate(0,3)
-	MulStoreDigit(3)
-	MulShiftCarry
-
-	MulAccumulate(4,0)
-	MulAccumulate(3,1)
-	MulAccumulate(2,2)
-	MulAccumulate(1,3)
-	MulAccumulate(0,4)
-	MulStoreDigit(4)
-	MulShiftCarry
-
-	MulAccumulate(5,0)
-	MulAccumulate(4,1)
-	MulAccumulate(3,2)
-	MulAccumulate(2,3)
-	MulAccumulate(1,4)
-	MulAccumulate(0,5)
-	MulStoreDigit(5)
-	MulShiftCarry
-
-	MulAccumulate(6,0)
-	MulAccumulate(5,1)
-	MulAccumulate(4,2)
-	MulAccumulate(3,3)
-	MulAccumulate(2,4)
-	MulAccumulate(1,5)
-	MulAccumulate(0,6)
-	MulStoreDigit(6)
-	MulShiftCarry
-
-	MulAccumulate(7,0)
-	MulAccumulate(6,1)
-	MulAccumulate(5,2)
-	MulAccumulate(4,3)
-	MulAccumulate(3,4)
-	MulAccumulate(2,5)
-	MulAccumulate(1,6)
-	MulAccumulate(0,7)
-	MulStoreDigit(7)
-	MulShiftCarry
-
-	MulAccumulate(7,1)
-	MulAccumulate(6,2)
-	MulAccumulate(5,3)
-	MulAccumulate(4,4)
-	MulAccumulate(3,5)
-	MulAccumulate(2,6)
-	MulAccumulate(1,7)
-	MulStoreDigit(8)
-	MulShiftCarry
-
-	MulAccumulate(7,2)
-	MulAccumulate(6,3)
-	MulAccumulate(5,4)
-	MulAccumulate(4,5)
-	MulAccumulate(3,6)
-	MulAccumulate(2,7)
-	MulStoreDigit(9)
-	MulShiftCarry
-
-	MulAccumulate(7,3)
-	MulAccumulate(6,4)
-	MulAccumulate(5,5)
-	MulAccumulate(4,6)
-	MulAccumulate(3,7)
-	MulStoreDigit(10)
-	MulShiftCarry
-
-	MulAccumulate(7,4)
-	MulAccumulate(6,5)
-	MulAccumulate(5,6)
-	MulAccumulate(4,7)
-	MulStoreDigit(11)
-	MulShiftCarry
-
-	MulAccumulate(7,5)
-	MulAccumulate(6,6)
-	MulAccumulate(5,7)
-	MulStoreDigit(12)
-	MulShiftCarry
-
-	MulAccumulate(7,6)
-	MulAccumulate(6,7)
-	MulStoreDigit(13)
-	MulShiftCarry
-
-	MulLastDiagonal(8)
-	MulEpilogue
-}
+		s_pBot[1] = &Baseline_MultiplyBottom4;
+		s_pBot[2] = &Baseline_MultiplyBottom8;
+//		s_pBot[4] = &Baseline_MultiplyBottom16;
 
-CRYPTOPP_NAKED void PentiumOptimized::Multiply8Bottom(word* Z, const word* X, const word* Y)
-{
-	MulPrologue
-	// now: [esp] = Z, esi = X, ecx = Y
-	MulStartup
-	MulAccumulate(0,0)
-	MulStoreDigit(0)
-	MulShiftCarry
-
-	MulAccumulate(1,0)
-	MulAccumulate(0,1)
-	MulStoreDigit(1)
-	MulShiftCarry
-
-	MulAccumulate(2,0)
-	MulAccumulate(1,1)
-	MulAccumulate(0,2)
-	MulStoreDigit(2)
-	MulShiftCarry
-
-	MulAccumulate(3,0)
-	MulAccumulate(2,1)
-	MulAccumulate(1,2)
-	MulAccumulate(0,3)
-	MulStoreDigit(3)
-	MulShiftCarry
-
-	MulAccumulate(4,0)
-	MulAccumulate(3,1)
-	MulAccumulate(2,2)
-	MulAccumulate(1,3)
-	MulAccumulate(0,4)
-	MulStoreDigit(4)
-	MulShiftCarry
-
-	MulAccumulate(5,0)
-	MulAccumulate(4,1)
-	MulAccumulate(3,2)
-	MulAccumulate(2,3)
-	MulAccumulate(1,4)
-	MulAccumulate(0,5)
-	MulStoreDigit(5)
-	MulShiftCarry
-
-	MulAccumulate(6,0)
-	MulAccumulate(5,1)
-	MulAccumulate(4,2)
-	MulAccumulate(3,3)
-	MulAccumulate(2,4)
-	MulAccumulate(1,5)
-	MulAccumulate(0,6)
-	MulStoreDigit(6)
-	MulShiftCarry
-
-	MulAccumulateBottom(7,0)
-	MulAccumulateBottom(6,1)
-	MulAccumulateBottom(5,2)
-	MulAccumulateBottom(4,3)
-	MulAccumulateBottom(3,4)
-	MulAccumulateBottom(2,5)
-	MulAccumulateBottom(1,6)
-	MulAccumulateBottom(0,7)
-	MulStoreDigit(7)
-	MulEpilogue
+		s_pSqu[1] = &Baseline_Square4;
+		s_pSqu[2] = &Baseline_Square8;
+//		s_pSqu[4] = &Baseline_Square16;
+	}
 }
 
-#undef AS1
-#undef AS2
-
-#else	// not x86 - no processor specific code at this layer
-
-typedef Portable LowLevel;
-
-#endif
-
-#ifdef SSE2_INTRINSICS_AVAILABLE
-
-#ifdef __GNUC__
-#define CRYPTOPP_FASTCALL
+inline int Add(word *C, const word *A, const word *B, size_t N)
+{
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+	return s_pAdd(N, C, A, B);
 #else
-#define CRYPTOPP_FASTCALL __fastcall
+	return Baseline_Add(N, C, A, B);
 #endif
-
-static void CRYPTOPP_FASTCALL P4_Mul(__m128i *C, const __m128i *A, const __m128i *B)
-{
-	__m128i a3210 = _mm_load_si128(A);
-	__m128i b3210 = _mm_load_si128(B);
-
-	__m128i sum;
-
-	__m128i z = _mm_setzero_si128();
-	__m128i a2b2_a0b0 = _mm_mul_epu32(a3210, b3210);
-	C[0] = a2b2_a0b0;
-
-	__m128i a3120 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(3, 1, 2, 0));
-	__m128i b3021 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 2, 1));
-	__m128i a1b0_a0b1 = _mm_mul_epu32(a3120, b3021);
-	__m128i a1b0 = _mm_unpackhi_epi32(a1b0_a0b1, z);
-	__m128i a0b1 = _mm_unpacklo_epi32(a1b0_a0b1, z);
-	C[1] = _mm_add_epi64(a1b0, a0b1);
-
-	__m128i a31 = _mm_srli_epi64(a3210, 32);
-	__m128i b31 = _mm_srli_epi64(b3210, 32);
-	__m128i a3b3_a1b1 = _mm_mul_epu32(a31, b31);
-	C[6] = a3b3_a1b1;
-
-	__m128i a1b1 = _mm_unpacklo_epi32(a3b3_a1b1, z);
-	__m128i b3012 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 1, 2));
-	__m128i a2b0_a0b2 = _mm_mul_epu32(a3210, b3012);
-	__m128i a0b2 = _mm_unpacklo_epi32(a2b0_a0b2, z);
-	__m128i a2b0 = _mm_unpackhi_epi32(a2b0_a0b2, z);
-	sum = _mm_add_epi64(a1b1, a0b2);
-	C[2] = _mm_add_epi64(sum, a2b0);
-
-	__m128i a2301 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(2, 3, 0, 1));
-	__m128i b2103 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(2, 1, 0, 3));
-	__m128i a3b0_a1b2 = _mm_mul_epu32(a2301, b3012);
-	__m128i a2b1_a0b3 = _mm_mul_epu32(a3210, b2103);
-	__m128i a3b0 = _mm_unpackhi_epi32(a3b0_a1b2, z);
-	__m128i a1b2 = _mm_unpacklo_epi32(a3b0_a1b2, z);
-	__m128i a2b1 = _mm_unpackhi_epi32(a2b1_a0b3, z);
-	__m128i a0b3 = _mm_unpacklo_epi32(a2b1_a0b3, z);
-	__m128i sum1 = _mm_add_epi64(a3b0, a1b2);
-	sum = _mm_add_epi64(a2b1, a0b3);
-	C[3] = _mm_add_epi64(sum, sum1);
-
-	__m128i	a3b1_a1b3 = _mm_mul_epu32(a2301, b2103);
-	__m128i a2b2 = _mm_unpackhi_epi32(a2b2_a0b0, z);
-	__m128i a3b1 = _mm_unpackhi_epi32(a3b1_a1b3, z);
-	__m128i a1b3 = _mm_unpacklo_epi32(a3b1_a1b3, z);
-	sum = _mm_add_epi64(a2b2, a3b1);
-	C[4] = _mm_add_epi64(sum, a1b3);
-
-	__m128i a1302 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(1, 3, 0, 2));
-	__m128i b1203 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(1, 2, 0, 3));
-	__m128i a3b2_a2b3 = _mm_mul_epu32(a1302, b1203);
-	__m128i a3b2 = _mm_unpackhi_epi32(a3b2_a2b3, z);
-	__m128i a2b3 = _mm_unpacklo_epi32(a3b2_a2b3, z);
-	C[5] = _mm_add_epi64(a3b2, a2b3);
-}
-
-void P4Optimized::Multiply4(word *C, const word *A, const word *B)
-{
-	__m128i temp[7];
-	const word *w = (word *)temp;
-	const __m64 *mw = (__m64 *)w;
-
-	P4_Mul(temp, (__m128i *)A, (__m128i *)B);
-
-	C[0] = w[0];
-
-	__m64 s1, s2;
-
-	__m64 w1 = _mm_cvtsi32_si64(w[1]);
-	__m64 w4 = mw[2];
-	__m64 w6 = mw[3];
-	__m64 w8 = mw[4];
-	__m64 w10 = mw[5];
-	__m64 w12 = mw[6];
-	__m64 w14 = mw[7];
-	__m64 w16 = mw[8];
-	__m64 w18 = mw[9];
-	__m64 w20 = mw[10];
-	__m64 w22 = mw[11];
-	__m64 w26 = _mm_cvtsi32_si64(w[26]);
-
-	s1 = _mm_add_si64(w1, w4);
-	C[1] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s2 = _mm_add_si64(w6, w8);
-	s1 = _mm_add_si64(s1, s2);
-	C[2] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s2 = _mm_add_si64(w10, w12);
-	s1 = _mm_add_si64(s1, s2);
-	C[3] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s2 = _mm_add_si64(w14, w16);
-	s1 = _mm_add_si64(s1, s2);
-	C[4] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s2 = _mm_add_si64(w18, w20);
-	s1 = _mm_add_si64(s1, s2);
-	C[5] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s2 = _mm_add_si64(w22, w26);
-	s1 = _mm_add_si64(s1, s2);
-	C[6] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	C[7] = _mm_cvtsi64_si32(s1) + w[27];
-	_mm_empty();
-}
-
-void P4Optimized::Multiply8(word *C, const word *A, const word *B)
-{
-	__m128i temp[28];
-	const word *w = (word *)temp;
-	const __m64 *mw = (__m64 *)w;
-	const word *x = (word *)temp+7*4;
-	const __m64 *mx = (__m64 *)x;
-	const word *y = (word *)temp+7*4*2;
-	const __m64 *my = (__m64 *)y;
-	const word *z = (word *)temp+7*4*3;
-	const __m64 *mz = (__m64 *)z;
-
-	P4_Mul(temp, (__m128i *)A, (__m128i *)B);
-
-	P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
-
-	P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
-
-	P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1);
-
-	C[0] = w[0];
-
-	__m64 s1, s2, s3, s4;
-
-	__m64 w1 = _mm_cvtsi32_si64(w[1]);
-	__m64 w4 = mw[2];
-	__m64 w6 = mw[3];
-	__m64 w8 = mw[4];
-	__m64 w10 = mw[5];
-	__m64 w12 = mw[6];
-	__m64 w14 = mw[7];
-	__m64 w16 = mw[8];
-	__m64 w18 = mw[9];
-	__m64 w20 = mw[10];
-	__m64 w22 = mw[11];
-	__m64 w26 = _mm_cvtsi32_si64(w[26]);
-	__m64 w27 = _mm_cvtsi32_si64(w[27]);
-
-	__m64 x0 = _mm_cvtsi32_si64(x[0]);
-	__m64 x1 = _mm_cvtsi32_si64(x[1]);
-	__m64 x4 = mx[2];
-	__m64 x6 = mx[3];
-	__m64 x8 = mx[4];
-	__m64 x10 = mx[5];
-	__m64 x12 = mx[6];
-	__m64 x14 = mx[7];
-	__m64 x16 = mx[8];
-	__m64 x18 = mx[9];
-	__m64 x20 = mx[10];
-	__m64 x22 = mx[11];
-	__m64 x26 = _mm_cvtsi32_si64(x[26]);
-	__m64 x27 = _mm_cvtsi32_si64(x[27]);
-
-	__m64 y0 = _mm_cvtsi32_si64(y[0]);
-	__m64 y1 = _mm_cvtsi32_si64(y[1]);
-	__m64 y4 = my[2];
-	__m64 y6 = my[3];
-	__m64 y8 = my[4];
-	__m64 y10 = my[5];
-	__m64 y12 = my[6];
-	__m64 y14 = my[7];
-	__m64 y16 = my[8];
-	__m64 y18 = my[9];
-	__m64 y20 = my[10];
-	__m64 y22 = my[11];
-	__m64 y26 = _mm_cvtsi32_si64(y[26]);
-	__m64 y27 = _mm_cvtsi32_si64(y[27]);
-
-	__m64 z0 = _mm_cvtsi32_si64(z[0]);
-	__m64 z1 = _mm_cvtsi32_si64(z[1]);
-	__m64 z4 = mz[2];
-	__m64 z6 = mz[3];
-	__m64 z8 = mz[4];
-	__m64 z10 = mz[5];
-	__m64 z12 = mz[6];
-	__m64 z14 = mz[7];
-	__m64 z16 = mz[8];
-	__m64 z18 = mz[9];
-	__m64 z20 = mz[10];
-	__m64 z22 = mz[11];
-	__m64 z26 = _mm_cvtsi32_si64(z[26]);
-
-	s1 = _mm_add_si64(w1, w4);
-	C[1] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s2 = _mm_add_si64(w6, w8);
-	s1 = _mm_add_si64(s1, s2);
-	C[2] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s2 = _mm_add_si64(w10, w12);
-	s1 = _mm_add_si64(s1, s2);
-	C[3] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s3 = _mm_add_si64(x0, y0);
-	s2 = _mm_add_si64(w14, w16);
-	s1 = _mm_add_si64(s1, s3);
-	s1 = _mm_add_si64(s1, s2);
-	C[4] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s3 = _mm_add_si64(x1, y1);
-	s4 = _mm_add_si64(x4, y4);
-	s1 = _mm_add_si64(s1, w18);
-	s3 = _mm_add_si64(s3, s4);
-	s1 = _mm_add_si64(s1, w20);
-	s1 = _mm_add_si64(s1, s3);
-	C[5] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s3 = _mm_add_si64(x6, y6);
-	s4 = _mm_add_si64(x8, y8);
-	s1 = _mm_add_si64(s1, w22);
-	s3 = _mm_add_si64(s3, s4);
-	s1 = _mm_add_si64(s1, w26);
-	s1 = _mm_add_si64(s1, s3);
-	C[6] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s3 = _mm_add_si64(x10, y10);
-	s4 = _mm_add_si64(x12, y12);
-	s1 = _mm_add_si64(s1, w27);
-	s3 = _mm_add_si64(s3, s4);
-	s1 = _mm_add_si64(s1, s3);
-	C[7] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s3 = _mm_add_si64(x14, y14);
-	s4 = _mm_add_si64(x16, y16);
-	s1 = _mm_add_si64(s1, z0);
-	s3 = _mm_add_si64(s3, s4);
-	s1 = _mm_add_si64(s1, s3);
-	C[8] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s3 = _mm_add_si64(x18, y18);
-	s4 = _mm_add_si64(x20, y20);
-	s1 = _mm_add_si64(s1, z1);
-	s3 = _mm_add_si64(s3, s4);
-	s1 = _mm_add_si64(s1, z4);
-	s1 = _mm_add_si64(s1, s3);
-	C[9] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s3 = _mm_add_si64(x22, y22);
-	s4 = _mm_add_si64(x26, y26);
-	s1 = _mm_add_si64(s1, z6);
-	s3 = _mm_add_si64(s3, s4);
-	s1 = _mm_add_si64(s1, z8);
-	s1 = _mm_add_si64(s1, s3);
-	C[10] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s3 = _mm_add_si64(x27, y27);
-	s1 = _mm_add_si64(s1, z10);
-	s1 = _mm_add_si64(s1, z12);
-	s1 = _mm_add_si64(s1, s3);
-	C[11] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s3 = _mm_add_si64(z14, z16);
-	s1 = _mm_add_si64(s1, s3);
-	C[12] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s3 = _mm_add_si64(z18, z20);
-	s1 = _mm_add_si64(s1, s3);
-	C[13] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s3 = _mm_add_si64(z22, z26);
-	s1 = _mm_add_si64(s1, s3);
-	C[14] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	C[15] = z[27] + _mm_cvtsi64_si32(s1);
-	_mm_empty();
 }
 
-void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B)
-{
-	__m128i temp[21];
-	const word *w = (word *)temp;
-	const __m64 *mw = (__m64 *)w;
-	const word *x = (word *)temp+7*4;
-	const __m64 *mx = (__m64 *)x;
-	const word *y = (word *)temp+7*4*2;
-	const __m64 *my = (__m64 *)y;
-
-	P4_Mul(temp, (__m128i *)A, (__m128i *)B);
-
-	P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
-
-	P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
-
-	C[0] = w[0];
-
-	__m64 s1, s2, s3, s4;
-
-	__m64 w1 = _mm_cvtsi32_si64(w[1]);
-	__m64 w4 = mw[2];
-	__m64 w6 = mw[3];
-	__m64 w8 = mw[4];
-	__m64 w10 = mw[5];
-	__m64 w12 = mw[6];
-	__m64 w14 = mw[7];
-	__m64 w16 = mw[8];
-	__m64 w18 = mw[9];
-	__m64 w20 = mw[10];
-	__m64 w22 = mw[11];
-	__m64 w26 = _mm_cvtsi32_si64(w[26]);
-
-	__m64 x0 = _mm_cvtsi32_si64(x[0]);
-	__m64 x1 = _mm_cvtsi32_si64(x[1]);
-	__m64 x4 = mx[2];
-	__m64 x6 = mx[3];
-	__m64 x8 = mx[4];
-
-	__m64 y0 = _mm_cvtsi32_si64(y[0]);
-	__m64 y1 = _mm_cvtsi32_si64(y[1]);
-	__m64 y4 = my[2];
-	__m64 y6 = my[3];
-	__m64 y8 = my[4];
-
-	s1 = _mm_add_si64(w1, w4);
-	C[1] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s2 = _mm_add_si64(w6, w8);
-	s1 = _mm_add_si64(s1, s2);
-	C[2] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s2 = _mm_add_si64(w10, w12);
-	s1 = _mm_add_si64(s1, s2);
-	C[3] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s3 = _mm_add_si64(x0, y0);
-	s2 = _mm_add_si64(w14, w16);
-	s1 = _mm_add_si64(s1, s3);
-	s1 = _mm_add_si64(s1, s2);
-	C[4] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s3 = _mm_add_si64(x1, y1);
-	s4 = _mm_add_si64(x4, y4);
-	s1 = _mm_add_si64(s1, w18);
-	s3 = _mm_add_si64(s3, s4);
-	s1 = _mm_add_si64(s1, w20);
-	s1 = _mm_add_si64(s1, s3);
-	C[5] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s3 = _mm_add_si64(x6, y6);
-	s4 = _mm_add_si64(x8, y8);
-	s1 = _mm_add_si64(s1, w22);
-	s3 = _mm_add_si64(s3, s4);
-	s1 = _mm_add_si64(s1, w26);
-	s1 = _mm_add_si64(s1, s3);
-	C[6] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	C[7] = _mm_cvtsi64_si32(s1) + w[27] + x[10] + y[10] + x[12] + y[12];
-	_mm_empty();
+inline int Subtract(word *C, const word *A, const word *B, size_t N)	// thin wrapper: callers keep the (C, A, B, N) order while the kernels take N first; callers use the result as a borrow
+{
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+	return s_pSub(N, C, A, B);	// run-time selected kernel: Baseline_Sub by default, SSE2_Sub when SetFunctionPointers() sees HasSSE2() && IsP4()
+#else
+	return Baseline_Sub(N, C, A, B);	// no SSE2 asm compiled in: call Baseline_Sub directly, without the function-pointer indirection
+#endif
 }
 
-#endif	// #ifdef SSE2_INTRINSICS_AVAILABLE
-
 // ********************************************************
 
+
 #define A0		A
 #define A1		(A+N2)
 #define B0		B
@@ -2004,64 +1876,37 @@ void RecursiveMultiply(word *R, word *T, const word *A, const word *B, size_t N)
 {
 	assert(N>=2 && N%2==0);
 
-	if (LowLevel::MultiplyRecursionLimit() >= 8 && N==8)
-		LowLevel::Multiply8(R, A, B);
-	else if (LowLevel::MultiplyRecursionLimit() >= 4 && N==4)
-		LowLevel::Multiply4(R, A, B);
-	else if (N==2)
-		LowLevel::Multiply2(R, A, B);
+	if (N <= s_recursionLimit)
+		s_pMul[N/4](R, A, B);
 	else
 	{
 		const size_t N2 = N/2;
-		int carry;
 
-		int aComp = Compare(A0, A1, N2);
-		int bComp = Compare(B0, B1, N2);
+		size_t AN2 = Compare(A0, A1, N2) > 0 ?  0 : N2;
+		Subtract(R0, A + AN2, A + (N2 ^ AN2), N2);
 
-		switch (2*aComp + aComp + bComp)
-		{
-		case -4:
-			LowLevel::Subtract(R0, A1, A0, N2);
-			LowLevel::Subtract(R1, B0, B1, N2);
-			RecursiveMultiply(T0, T2, R0, R1, N2);
-			LowLevel::Subtract(T1, T1, R0, N2);
-			carry = -1;
-			break;
-		case -2:
-			LowLevel::Subtract(R0, A1, A0, N2);
-			LowLevel::Subtract(R1, B0, B1, N2);
-			RecursiveMultiply(T0, T2, R0, R1, N2);
-			carry = 0;
-			break;
-		case 2:
-			LowLevel::Subtract(R0, A0, A1, N2);
-			LowLevel::Subtract(R1, B1, B0, N2);
-			RecursiveMultiply(T0, T2, R0, R1, N2);
-			carry = 0;
-			break;
-		case 4:
-			LowLevel::Subtract(R0, A1, A0, N2);
-			LowLevel::Subtract(R1, B0, B1, N2);
-			RecursiveMultiply(T0, T2, R0, R1, N2);
-			LowLevel::Subtract(T1, T1, R1, N2);
-			carry = -1;
-			break;
-		default:
-			SetWords(T0, 0, N);
-			carry = 0;
-		}
+		size_t BN2 = Compare(B0, B1, N2) > 0 ?  0 : N2;
+		Subtract(R1, B + BN2, B + (N2 ^ BN2), N2);
 
-		RecursiveMultiply(R0, T2, A0, B0, N2);
 		RecursiveMultiply(R2, T2, A1, B1, N2);
+		RecursiveMultiply(T0, T2, R0, R1, N2);
+		RecursiveMultiply(R0, T2, A0, B0, N2);
 
 		// now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1
 
-		carry += LowLevel::Add(T0, T0, R0, N);
-		carry += LowLevel::Add(T0, T0, R2, N);
-		carry += LowLevel::Add(R1, R1, T0, N);
+		int c2 = Add(R2, R2, R1, N2);
+		int c3 = c2;
+		c2 += Add(R1, R2, R0, N2);
+		c3 += Add(R2, R2, R3, N2);
 
-		assert (carry >= 0 && carry <= 2);
-		Increment(R3, N2, carry);
+		if (AN2 == BN2)
+			c3 -= Subtract(R1, R1, T0, N);
+		else
+			c3 += Add(R1, R1, T0, N);
+
+		c3 += Increment(R2, N2, c2);
+		assert (c3 >= 0 && c3 <= 2);
+		Increment(R3, N2, c3);
 	}
 }
 
@@ -2072,12 +1917,9 @@ void RecursiveMultiply(word *R, word *T, const word *A, const word *B, size_t N)
 void RecursiveSquare(word *R, word *T, const word *A, size_t N)
 {
 	assert(N && N%2==0);
-	if (LowLevel::SquareRecursionLimit() >= 8 && N==8)
-		LowLevel::Square8(R, A);
-	if (LowLevel::SquareRecursionLimit() >= 4 && N==4)
-		LowLevel::Square4(R, A);
-	else if (N==2)
-		LowLevel::Square2(R, A);
+
+	if (N <= s_recursionLimit)
+		s_pSqu[N/4](R, A);
 	else
 	{
 		const size_t N2 = N/2;
@@ -2086,35 +1928,32 @@ void RecursiveSquare(word *R, word *T, const word *A, size_t N)
 		RecursiveSquare(R2, T2, A1, N2);
 		RecursiveMultiply(T0, T2, A0, A1, N2);
 
-		int carry = LowLevel::Add(R1, R1, T0, N);
-		carry += LowLevel::Add(R1, R1, T0, N);
+		int carry = Add(R1, R1, T0, N);
+		carry += Add(R1, R1, T0, N);
 		Increment(R3, N2, carry);
 	}
 }
 
 // R[N] - bottom half of A*B
-// T[N] - temporary work space
+// T[3*N/2] - temporary work space
 // A[N] - multiplier
 // B[N] - multiplicant
 
 void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N)
 {
 	assert(N>=2 && N%2==0);
-	if (LowLevel::MultiplyBottomRecursionLimit() >= 8 && N==8)
-		LowLevel::Multiply8Bottom(R, A, B);
-	else if (LowLevel::MultiplyBottomRecursionLimit() >= 4 && N==4)
-		LowLevel::Multiply4Bottom(R, A, B);
-	else if (N==2)
-		LowLevel::Multiply2Bottom(R, A, B);
+
+	if (N <= s_recursionLimit)	// small sizes go straight to a fixed-size kernel (limit is 8, or 32 once SSE2 kernels are installed)
+		s_pBot[N/4](R, A, B);	// N/4 maps N = 2, 4, 8, 16, 32 to slots 0, 1, 2, 4, 8
 	else
 	{
 		const size_t N2 = N/2;
 
 		RecursiveMultiply(R, T, A0, B0, N2);
 		RecursiveMultiplyBottom(T0, T1, A1, B0, N2);
-		LowLevel::Add(R1, R1, T0, N2);
+		Add(R1, R1, T0, N2);	// fold the bottom half of A1*B0 into R1; the carry-out is ignored since only N result words are kept
 		RecursiveMultiplyBottom(T0, T1, A0, B1, N2);
-		LowLevel::Add(R1, R1, T0, N2);
+		Add(R1, R1, T0, N2);	// likewise for the bottom half of A0*B1
 	}
 }
 
@@ -2124,88 +1963,61 @@ void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, siz
 // A[N] --- multiplier
 // B[N] --- multiplicant
 
-void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N)
+void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N)	// no longer recursive into itself: SSE2 kernel, small-N kernel, or one split step built on RecursiveMultiply
 {
 	assert(N>=2 && N%2==0);
 
-	if (N==4)
-	{
-		LowLevel::Multiply4(T, A, B);
-		memcpy(R, T+4, 4*WORD_SIZE);
-	}
-	else if (N==2)
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+	if (HasSSE2() && ((N>=8) & (N<=32)))	// s_pTop[] is only filled when HasSSE2(), so the check is repeated here
+		s_pTop[N/16](R, A, B, L[N-1]);	// N/16 picks Top8/Top16/Top32; NOTE(review): assumes N is exactly 8, 16 or 32 in this range - confirm callers
+	else
+#endif
+	if (N<=4)	// N is 2 or 4: form the full product in T, then keep words N..2N-1
 	{
-		LowLevel::Multiply2(T, A, B);
-		memcpy(R, T+2, 2*WORD_SIZE);
+		s_pMul[N/4](T, A, B);	// slot 0 for N == 2, slot 1 for N == 4
+		memcpy(R, T+N, N*WORD_SIZE);
 	}
 	else
 	{
 		const size_t N2 = N/2;
-		int carry;
 
-		int aComp = Compare(A0, A1, N2);
-		int bComp = Compare(B0, B1, N2);
+		size_t AN2 = Compare(A0, A1, N2) > 0 ?  0 : N2;	// offset of the half used as minuend: 0 when A0 > A1, else N2
+		Subtract(R0, A + AN2, A + (N2 ^ AN2), N2);	// N2 ^ AN2 is the other offset, so R0 = |A0 - A1| and no borrow can occur
 
-		switch (2*aComp + aComp + bComp)
-		{
-		case -4:
-			LowLevel::Subtract(R0, A1, A0, N2);
-			LowLevel::Subtract(R1, B0, B1, N2);
-			RecursiveMultiply(T0, T2, R0, R1, N2);
-			LowLevel::Subtract(T1, T1, R0, N2);
-			carry = -1;
-			break;
-		case -2:
-			LowLevel::Subtract(R0, A1, A0, N2);
-			LowLevel::Subtract(R1, B0, B1, N2);
-			RecursiveMultiply(T0, T2, R0, R1, N2);
-			carry = 0;
-			break;
-		case 2:
-			LowLevel::Subtract(R0, A0, A1, N2);
-			LowLevel::Subtract(R1, B1, B0, N2);
-			RecursiveMultiply(T0, T2, R0, R1, N2);
-			carry = 0;
-			break;
-		case 4:
-			LowLevel::Subtract(R0, A1, A0, N2);
-			LowLevel::Subtract(R1, B0, B1, N2);
-			RecursiveMultiply(T0, T2, R0, R1, N2);
-			LowLevel::Subtract(T1, T1, R1, N2);
-			carry = -1;
-			break;
-		default:
-			SetWords(T0, 0, N);
-			carry = 0;
-		}
-
-		RecursiveMultiply(T2, R0, A1, B1, N2);
+		size_t BN2 = Compare(B0, B1, N2) > 0 ?  0 : N2;	// same selection for B
+		Subtract(R1, B + BN2, B + (N2 ^ BN2), N2);	// R1 = |B0 - B1|
 
-		// now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1
+		RecursiveMultiply(T0, T2, R0, R1, N2);	// T[01] = |A0-A1| * |B0-B1|, with T2 as scratch
+		RecursiveMultiply(R0, T2, A1, B1, N2);	// R[01] = A1*B1, overwriting the two differences
 
-		int c2 = LowLevel::Subtract(R0, L+N2, L, N2);
-		c2 += LowLevel::Subtract(R0, R0, T0, N2);
-		int t = (Compare(R0, T2, N2) == -1);
+		// now T[01] holds |A0-A1|*|B0-B1|, i.e. -(A1-A0)*(B0-B1) if AN2 == BN2, else +(A1-A0)*(B0-B1), where (A1-A0)*(B0-B1) = A1*B0+A0*B1-A1*B1-A0*B0; R[01] holds A1*B1
 
-		carry += t;
-		carry += Increment(R0, N2, c2+t);
-		carry += LowLevel::Add(R0, R0, T1, N2);
-		carry += LowLevel::Add(R0, R0, T3, N2);
-		assert (carry >= 0 && carry <= 2);
+		int t, c3;
+		int c2 = Subtract(T2, L+N2, L, N2);	// T2 = upper half of L minus lower half of L; c2 starts as that borrow
 
-		CopyWords(R1, T3, N2);
-		Increment(R1, N2, carry);
-	}
-}
+		if (AN2 == BN2)	// differences have the same sign, so the signed cross term is -T: add where the other branch subtracts
+		{
+			c2 -= Add(T2, T2, T0, N2);
+			t = (Compare(T2, R0, N2) == -1);
+			c3 = t - Subtract(T2, T2, T1, N2);
+		}
+		else	// differences have opposite signs: the signed cross term is +T
+		{
+			c2 += Subtract(T2, T2, T0, N2);
+			t = (Compare(T2, R0, N2) == -1);
+			c3 = t + Add(T2, T2, T1, N2);
+		}
 
-inline int Add(word *C, const word *A, const word *B, size_t N)
-{
-	return LowLevel::Add(C, A, B, N);
-}
+		c2 += t;
+		if (c2 >= 0)	// c2 may be negative at this point, hence the Decrement branch
+			c3 += Increment(T2, N2, c2);
+		else
+			c3 -= Decrement(T2, N2, -c2);
+		c3 += Add(R0, T2, R1, N2);	// lower half of the result: T2 plus the upper half of A1*B1
 
-inline int Subtract(word *C, const word *A, const word *B, size_t N)
-{
-	return LowLevel::Subtract(C, A, B, N);
+		assert (c3 >= 0 && c3 <= 2);
+		Increment(R1, N2, c3);	// propagate the accumulated carry into the upper half of the result
+	}
 }
 
 inline void Multiply(word *R, word *T, const word *A, const word *B, size_t N)
@@ -2223,23 +2035,6 @@ inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, size_
 	RecursiveMultiplyBottom(R, T, A, B, N);
 }
 
-inline void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N)
-{
-	RecursiveMultiplyTop(R, T, L, A, B, N);
-}
-
-static word LinearMultiply(word *C, const word *A, word B, size_t N)
-{
-	word carry=0;
-	for(unsigned i=0; i<N; i++)
-	{
-		DWord p = DWord::MultiplyAndAdd(A[i], B, carry);
-		C[i] = p.GetLowHalf();
-		carry = p.GetHighHalf();
-	}
-	return carry;
-}
-
 // R[NA+NB] - result = A*B
 // T[NA+NB] - temporary work space
 // A[NA] ---- multiplier
@@ -2264,7 +2059,6 @@ void AsymmetricMultiply(word *R, word *T, const word *A, size_t NA, const word *
 	}
 
 	assert(NB % NA == 0);
-	assert((NB/NA)%2 == 0); 	// NB is an even multiple of NA
 
 	if (NA==2 && !A[1])
 	{
@@ -2284,15 +2078,24 @@ void AsymmetricMultiply(word *R, word *T, const word *A, size_t NA, const word *
 		}
 	}
 
-	Multiply(R, T, A, B, NA);
-	CopyWords(T+2*NA, R+NA, NA);
-
 	size_t i;
+	if ((NB/NA)%2 == 0)
+	{
+		Multiply(R, T, A, B, NA);
+		CopyWords(T+2*NA, R+NA, NA);
 
-	for (i=2*NA; i<NB; i+=2*NA)
-		Multiply(T+NA+i, T, A, B+i, NA);
-	for (i=NA; i<NB; i+=2*NA)
-		Multiply(R+i, T, A, B+i, NA);
+		for (i=2*NA; i<NB; i+=2*NA)
+			Multiply(T+NA+i, T, A, B+i, NA);
+		for (i=NA; i<NB; i+=2*NA)
+			Multiply(R+i, T, A, B+i, NA);
+	}
+	else
+	{
+		for (i=0; i<NB; i+=2*NA)
+			Multiply(R+i, T, A, B+i, NA);
+		for (i=NA; i<NB; i+=2*NA)
+			Multiply(T+NA+i, T, A, B+i, NA);
+	}
 
 	if (Add(R+NA, R+NA, T+2*NA, NB-NA))
 		Increment(R+NB, NA);
@@ -2308,10 +2111,10 @@ void RecursiveInverseModPower2(word *R, word *T, const word *A, size_t N)
 	{
 		T[0] = AtomicInverseModPower2(A[0]);
 		T[1] = 0;
-		LowLevel::Multiply2Bottom(T+2, T, A);
+		s_pBot[0](T+2, T, A);
 		TwosComplement(T+2, 2);
 		Increment(T+2, 2, 2);
-		LowLevel::Multiply2Bottom(R, T, T+2);
+		s_pBot[0](R, T, T+2);
 	}
 	else
 	{
@@ -2333,8 +2136,9 @@ void RecursiveInverseModPower2(word *R, word *T, const word *A, size_t N)
 // M[N] --- modulus
 // U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N)
 
-void MontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, size_t N)
+void MontgomeryReduce(word *R, word *T, word *X, const word *M, const word *U, size_t N)
 {
+#if 1
 	MultiplyBottom(R, T, X, U, N);
 	MultiplyTop(T, T+N, X, R, M, N);
 	word borrow = Subtract(T, X+N, T, N);
@@ -2342,6 +2146,60 @@ void MontgomeryReduce(word *R, word *T, const word *X, const word *M, const word
 	word carry = Add(T+N, T, M, N);
 	assert(carry || !borrow);
 	CopyWords(R, T + (borrow ? N : 0), N);
+#elif 0
+	const word u = 0-U[0];
+	Declare2Words(p)
+	for (size_t i=0; i<N; i++)
+	{
+		const word t = u * X[i];
+		word c = 0;
+		for (size_t j=0; j<N; j+=2)
+		{
+			MultiplyWords(p, t, M[j]);
+			Acc2WordsBy1(p, X[i+j]);
+			Acc2WordsBy1(p, c);
+			X[i+j] = LowWord(p);
+			c = HighWord(p);
+			MultiplyWords(p, t, M[j+1]);
+			Acc2WordsBy1(p, X[i+j+1]);
+			Acc2WordsBy1(p, c);
+			X[i+j+1] = LowWord(p);
+			c = HighWord(p);
+		}
+
+		if (Increment(X+N+i, N-i, c))
+			while (!Subtract(X+N, X+N, M, N)) {}
+	}
+
+	memcpy(R, X+N, N*WORD_SIZE);
+#else
+	__m64 u = _mm_cvtsi32_si64(0-U[0]), p;
+	for (size_t i=0; i<N; i++)
+	{
+		__m64 t = _mm_cvtsi32_si64(X[i]);
+		t = _mm_mul_su32(t, u);
+		__m64 c = _mm_setzero_si64();
+		for (size_t j=0; j<N; j+=2)
+		{
+			p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j]));
+			p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j]));
+			c = _mm_add_si64(c, p);
+			X[i+j] = _mm_cvtsi64_si32(c);
+			c = _mm_srli_si64(c, 32);
+			p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j+1]));
+			p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j+1]));
+			c = _mm_add_si64(c, p);
+			X[i+j+1] = _mm_cvtsi64_si32(c);
+			c = _mm_srli_si64(c, 32);
+		}
+
+		if (Increment(X+N+i, N-i, _mm_cvtsi64_si32(c)))
+			while (!Subtract(X+N, X+N, M, N)) {}
+	}
+
+	memcpy(R, X+N, N*WORD_SIZE);
+	_mm_empty();
+#endif
 }
 
 // R[N] --- result = X/(2**(WORD_BITS*N/2)) mod M
@@ -2491,7 +2349,7 @@ static inline void AtomicDivide(word *Q, const word *A, const word *B)
 		// multiply quotient and divisor and add remainder, make sure it equals dividend
 		assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
 		word P[4];
-		Portable::Multiply2(P, Q, B);
+		s_pMul[0](P, Q, B);
 		Add(P, P, T, 4);
 		assert(memcmp(P, A, 4*WORD_SIZE)==0);
 	}
@@ -2503,21 +2361,7 @@ static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, si
 {
 	assert(N && N%2==0);
 
-	if (Q[1])
-	{
-		T[N] = T[N+1] = 0;
-		unsigned i;
-		for (i=0; i<N; i+=4)
-			LowLevel::Multiply2(T+i, Q, B+i);
-		for (i=2; i<N; i+=4)
-			if (LowLevel::Multiply2Add(T+i, Q, B+i))
-				T[i+5] += (++T[i+4]==0);
-	}
-	else
-	{
-		T[N] = LinearMultiply(T, B, Q[0], N);
-		T[N+1] = 0;
-	}
+	AsymmetricMultiply(T, T+N+2, Q, 2, B, N);
 
 	word borrow = Subtract(R, R, T, N+2);
 	assert(!borrow && !R[N+1]);
@@ -2532,7 +2376,7 @@ static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, si
 
 // R[NB] -------- remainder = A%B
 // Q[NA-NB+2] --- quotient	= A/B
-// T[NA+2*NB+4] - temp work space
+// T[NA+3*(NB+2)] - temp work space
 // A[NA] -------- dividend
 // B[NB] -------- divisor
 
@@ -2726,9 +2570,7 @@ InitializeInteger::InitializeInteger()
 {
 	if (!g_pAssignIntToInteger)
 	{
-#ifdef CRYPTOPP_X86ASM_AVAILABLE
-		SetPentiumFunctionPointers();
-#endif
+		SetFunctionPointers();
 		g_pAssignIntToInteger = AssignIntToInteger;
 	}
 }
@@ -2877,7 +2719,8 @@ Integer& Integer::operator=(const Integer& t)
 {
 	if (this != &t)
 	{
-		reg.New(RoundupSize(t.WordCount()));
+		if (reg.size() != t.reg.size() || t.reg[t.reg.size()/2] == 0)
+			reg.New(RoundupSize(t.WordCount()));
 		CopyWords(reg, t.reg, reg.size());
 		sign = t.sign;
 	}
@@ -3240,7 +3083,7 @@ public:
 
 	void GenerateBlock(byte *output, size_t size)
 	{
-		UnalignedPutWord(BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter);
+		PutWord(false, BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter);
 		++m_counter;
 		P1363_KDF2<SHA1>::DeriveKey(output, size, m_counterAndSeed, m_counterAndSeed.size(), NULL, 0);
 	}
@@ -3657,7 +3500,7 @@ void PositiveMultiply(Integer &product, const Integer &a, const Integer &b)
 	product.reg.CleanNew(RoundupSize(aSize+bSize));
 	product.sign = Integer::POSITIVE;
 
-	SecAlignedWordBlock workspace(aSize + bSize);
+	IntegerSecBlock workspace(aSize + bSize);
 	AsymmetricMultiply(product.reg, workspace, a.reg, aSize, b.reg, bSize);
 }
 
@@ -3723,7 +3566,7 @@ void PositiveDivide(Integer &remainder, Integer &quotient,
 	quotient.reg.CleanNew(RoundupSize(aSize-bSize+2));
 	quotient.sign = Integer::POSITIVE;
 
-	SecAlignedWordBlock T(aSize+2*bSize+4);
+	IntegerSecBlock T(aSize+3*(bSize+2));
 	Divide(remainder.reg, quotient.reg, T, a.reg, aSize, b.reg, bSize);
 }
 
diff --git a/integer.h b/integer.h
index 547e3778..4e93c3a1 100644
--- a/integer.h
+++ b/integer.h
@@ -11,44 +11,13 @@
 
 NAMESPACE_BEGIN(CryptoPP)
 
-#if defined(SSE2_INTRINSICS_AVAILABLE)
-	template <class T>
-	class AlignedAllocator : public AllocatorBase<T>
-	{
-	public:
-		CRYPTOPP_INHERIT_ALLOCATOR_TYPES
-
-		pointer allocate(size_type n, const void *);
-		void deallocate(void *p, size_type n);
-		pointer reallocate(T *p, size_type oldSize, size_type newSize, bool preserve)
-		{
-			return StandardReallocate(*this, p, oldSize, newSize, preserve);
-		}
-
-	#if !(defined(CRYPTOPP_MALLOC_ALIGNMENT_IS_16) || defined(CRYPTOPP_MEMALIGN_AVAILABLE) || defined(CRYPTOPP_MM_MALLOC_AVAILABLE))
-	#define CRYPTOPP_NO_ALIGNED_ALLOC
-		AlignedAllocator() : m_pBlock(NULL) {}
-	protected:
-		void *m_pBlock;
-	#endif
-	};
-
-	#ifdef CRYPTOPP_IMPORTS
-		CRYPTOPP_DLL_TEMPLATE_CLASS AlignedAllocator<word>;
-	#endif
-
-	typedef SecBlock<word, AlignedAllocator<word> > SecAlignedWordBlock;
-#else
-	typedef SecWordBlock SecAlignedWordBlock;
-#endif
-
-void CRYPTOPP_DLL CRYPTOPP_API DisableSSE2();
-
 struct InitializeInteger	// used to initialize static variables
 {
 	InitializeInteger();
 };
 
+typedef SecBlock<word, AllocatorWithCleanup<word, CRYPTOPP_BOOL_X86> > IntegerSecBlock;
+
 //! multiple precision integer and basic arithmetics
 /*! This class can represent positive and negative integers
 	with absolute value less than (256**sizeof(word)) ** (256**sizeof(int)).
@@ -406,7 +375,7 @@ private:
 	friend void PositiveMultiply(Integer &product, const Integer &a, const Integer &b);
 	friend void PositiveDivide(Integer &remainder, Integer &quotient, const Integer &dividend, const Integer &divisor);
 
-	SecAlignedWordBlock reg;
+	IntegerSecBlock reg;
 	Sign sign;
 };
 
diff --git a/rijndael.cpp b/rijndael.cpp
index 2a1a19ef..4a8572f2 100644
--- a/rijndael.cpp
+++ b/rijndael.cpp
@@ -51,10 +51,7 @@ being unloaded from L1 cache, until that round is finished.
 
 #include "rijndael.h"
 #include "misc.h"
-
-#ifdef CRYPTOPP_L1_CACHE_ALIGN_NOT_AVAILABLE
-#pragma message("Don't know how to align data on L1 cache boundary. Defense against AES timing attack may be affected.")
-#endif
+#include "cpu.h"
 
 NAMESPACE_BEGIN(CryptoPP)
 
@@ -122,25 +119,25 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
 		for (i = 1; i < m_rounds; i++) {
 			rk += 4;
 			rk[0] =
-				Td0[Se[GETBYTE(rk[0], 3)]] ^
-				Td1[Se[GETBYTE(rk[0], 2)]] ^
-				Td2[Se[GETBYTE(rk[0], 1)]] ^
-				Td3[Se[GETBYTE(rk[0], 0)]];
+				Td[0*256+Se[GETBYTE(rk[0], 3)]] ^
+				Td[1*256+Se[GETBYTE(rk[0], 2)]] ^
+				Td[2*256+Se[GETBYTE(rk[0], 1)]] ^
+				Td[3*256+Se[GETBYTE(rk[0], 0)]];
 			rk[1] =
-				Td0[Se[GETBYTE(rk[1], 3)]] ^
-				Td1[Se[GETBYTE(rk[1], 2)]] ^
-				Td2[Se[GETBYTE(rk[1], 1)]] ^
-				Td3[Se[GETBYTE(rk[1], 0)]];
+				Td[0*256+Se[GETBYTE(rk[1], 3)]] ^
+				Td[1*256+Se[GETBYTE(rk[1], 2)]] ^
+				Td[2*256+Se[GETBYTE(rk[1], 1)]] ^
+				Td[3*256+Se[GETBYTE(rk[1], 0)]];
 			rk[2] =
-				Td0[Se[GETBYTE(rk[2], 3)]] ^
-				Td1[Se[GETBYTE(rk[2], 2)]] ^
-				Td2[Se[GETBYTE(rk[2], 1)]] ^
-				Td3[Se[GETBYTE(rk[2], 0)]];
+				Td[0*256+Se[GETBYTE(rk[2], 3)]] ^
+				Td[1*256+Se[GETBYTE(rk[2], 2)]] ^
+				Td[2*256+Se[GETBYTE(rk[2], 1)]] ^
+				Td[3*256+Se[GETBYTE(rk[2], 0)]];
 			rk[3] =
-				Td0[Se[GETBYTE(rk[3], 3)]] ^
-				Td1[Se[GETBYTE(rk[3], 2)]] ^
-				Td2[Se[GETBYTE(rk[3], 1)]] ^
-				Td3[Se[GETBYTE(rk[3], 0)]];
+				Td[0*256+Se[GETBYTE(rk[3], 3)]] ^
+				Td[1*256+Se[GETBYTE(rk[3], 2)]] ^
+				Td[2*256+Se[GETBYTE(rk[3], 1)]] ^
+				Td[3*256+Se[GETBYTE(rk[3], 0)]];
 		}
 	}
 
@@ -148,15 +145,245 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
 	ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key + m_rounds*4, m_key + m_rounds*4, 16);
 }
 
-const static unsigned int s_lineSizeDiv4 = CRYPTOPP_L1_CACHE_LINE_SIZE/4;
-#ifdef IS_BIG_ENDIAN
-const static unsigned int s_i3=3, s_i2=2, s_i1=1, s_i0=0;
-#else
-const static unsigned int s_i3=0, s_i2=1, s_i1=2, s_i0=3;
-#endif
+#pragma warning(disable: 4731)	// frame pointer register 'ebp' modified by inline assembly code
 
 void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
 {
+#ifdef CRYPTOPP_X86_ASM_AVAILABLE
+	if (HasMMX())
+	{
+		const word32 *k = m_key;
+		const word32 *kLoopEnd = k + m_rounds*4;
+#ifdef __GNUC__
+		word32 t0, t1, t2, t3;
+		__asm__ __volatile__
+		(
+		".intel_syntax noprefix;"
+		AS1(	push	ebx)
+		AS1(	push	ebp)
+		AS2(	mov		ebp, eax)
+		AS2(	movd	mm5, ecx)
+#else
+		AS2(	mov		edx, g_cacheLineSize)
+		AS2(	mov		edi, inBlock)
+		AS2(	mov		esi, k)
+		AS2(	movd	mm5, kLoopEnd)
+		AS1(	push	ebp)
+		AS2(	lea		ebp, Te)
+#endif
+		AS2(	mov		eax, [esi+0*4])	// s0
+		AS2(	xor		eax, [edi+0*4])
+		AS2(	movd	mm0, eax)
+		AS2(	mov		ebx, [esi+1*4])
+		AS2(	xor		ebx, [edi+1*4])
+		AS2(	movd	mm1, ebx)
+		AS2(	and		ebx, eax)
+		AS2(	mov		eax, [esi+2*4])
+		AS2(	xor		eax, [edi+2*4])
+		AS2(	movd	mm2, eax)
+		AS2(	and		ebx, eax)
+		AS2(	mov		ecx, [esi+3*4])
+		AS2(	xor		ecx, [edi+3*4])
+		AS2(	and		ebx, ecx)
+
+		// read Te0 into L1 cache. this code could be simplified by using lfence, but that is an SSE2 instruction
+		AS2(	and		ebx, 0)
+		AS2(	mov		edi, ebx)	// make index depend on previous loads to simulate lfence
+		ASL(2)
+		AS2(	and		ebx, [ebp+edi])
+		AS2(	add		edi, edx)
+		AS2(	and		ebx, [ebp+edi])
+		AS2(	add		edi, edx)
+		AS2(	and		ebx, [ebp+edi])
+		AS2(	add		edi, edx)
+		AS2(	and		ebx, [ebp+edi])
+		AS2(	add		edi, edx)
+		AS2(	cmp		edi, 1024)
+		ASJ(	jl,		2, b)
+		AS2(	and		ebx, [ebp+1020])
+		AS2(	movd	mm6, ebx)
+		AS2(	pxor	mm2, mm6)
+		AS2(	pxor	mm1, mm6)
+		AS2(	pxor	mm0, mm6)
+		AS2(	xor		ecx, ebx)
+
+		AS2(	mov		edi, [esi+4*4])	// t0
+		AS2(	mov		eax, [esi+5*4])
+		AS2(	mov		ebx, [esi+6*4])
+		AS2(	mov		edx, [esi+7*4])
+		AS2(	add		esi, 8*4)
+		AS2(	movd	mm4, esi)
+
+#define QUARTER_ROUND(t, a, b, c, d)	\
+	AS2(movzx esi, t##l)\
+	AS2(d, [ebp+0*1024+4*esi])\
+	AS2(movzx esi, t##h)\
+	AS2(c, [ebp+1*1024+4*esi])\
+	AS2(shr e##t##x, 16)\
+	AS2(movzx esi, t##l)\
+	AS2(b, [ebp+2*1024+4*esi])\
+	AS2(movzx esi, t##h)\
+	AS2(a, [ebp+3*1024+4*esi])
+
+#define s0		xor edi
+#define s1		xor eax
+#define s2		xor ebx
+#define s3		xor ecx
+#define t0		xor edi
+#define t1		xor eax
+#define t2		xor ebx
+#define t3		xor edx
+
+		QUARTER_ROUND(c, t0, t1, t2, t3)
+		AS2(	movd	ecx, mm2)
+		QUARTER_ROUND(c, t3, t0, t1, t2)
+		AS2(	movd	ecx, mm1)
+		QUARTER_ROUND(c, t2, t3, t0, t1)
+		AS2(	movd	ecx, mm0)
+		QUARTER_ROUND(c, t1, t2, t3, t0)
+		AS2(	movd	mm2, ebx)
+		AS2(	movd	mm1, eax)
+		AS2(	movd	mm0, edi)
+#undef QUARTER_ROUND
+
+		AS2(	movd	esi, mm4)
+
+		ASL(0)
+		AS2(	mov		edi, [esi+0*4])
+		AS2(	mov		eax, [esi+1*4])
+		AS2(	mov		ebx, [esi+2*4])
+		AS2(	mov		ecx, [esi+3*4])
+
+#define QUARTER_ROUND(t, a, b, c, d)	\
+	AS2(movzx esi, t##l)\
+	AS2(a, [ebp+3*1024+4*esi])\
+	AS2(movzx esi, t##h)\
+	AS2(b, [ebp+2*1024+4*esi])\
+	AS2(shr e##t##x, 16)\
+	AS2(movzx esi, t##l)\
+	AS2(c, [ebp+1*1024+4*esi])\
+	AS2(movzx esi, t##h)\
+	AS2(d, [ebp+0*1024+4*esi])
+
+		QUARTER_ROUND(d, s0, s1, s2, s3)
+		AS2(	movd	edx, mm2)
+		QUARTER_ROUND(d, s3, s0, s1, s2)
+		AS2(	movd	edx, mm1)
+		QUARTER_ROUND(d, s2, s3, s0, s1)
+		AS2(	movd	edx, mm0)
+		QUARTER_ROUND(d, s1, s2, s3, s0)
+		AS2(	movd	esi, mm4)
+		AS2(	movd	mm2, ebx)
+		AS2(	movd	mm1, eax)
+		AS2(	movd	mm0, edi)
+
+		AS2(	mov		edi, [esi+4*4])
+		AS2(	mov		eax, [esi+5*4])
+		AS2(	mov		ebx, [esi+6*4])
+		AS2(	mov		edx, [esi+7*4])
+
+		QUARTER_ROUND(c, t0, t1, t2, t3)
+		AS2(	movd	ecx, mm2)
+		QUARTER_ROUND(c, t3, t0, t1, t2)
+		AS2(	movd	ecx, mm1)
+		QUARTER_ROUND(c, t2, t3, t0, t1)
+		AS2(	movd	ecx, mm0)
+		QUARTER_ROUND(c, t1, t2, t3, t0)
+		AS2(	movd	mm2, ebx)
+		AS2(	movd	mm1, eax)
+		AS2(	movd	mm0, edi)
+
+		AS2(	movd	esi, mm4)
+		AS2(	movd	edi, mm5)
+		AS2(	add		esi, 8*4)
+		AS2(	movd	mm4, esi)
+		AS2(	cmp		edi, esi)
+		ASJ(	jne,	0, b)
+
+#undef QUARTER_ROUND
+#undef s0
+#undef s1
+#undef s2
+#undef s3
+#undef t0
+#undef t1
+#undef t2
+#undef t3
+
+		AS2(	mov		eax, [edi+0*4])
+		AS2(	mov		ecx, [edi+1*4])
+		AS2(	mov		esi, [edi+2*4])
+		AS2(	mov		edi, [edi+3*4])
+
+#define QUARTER_ROUND(a, b, c, d)	\
+	AS2(	movzx	ebx, dl)\
+	AS2(	movzx	ebx, BYTE PTR [ebp+1+4*ebx])\
+	AS2(	shl		ebx, 3*8)\
+	AS2(	xor		a, ebx)\
+	AS2(	movzx	ebx, dh)\
+	AS2(	movzx	ebx, BYTE PTR [ebp+1+4*ebx])\
+	AS2(	shl		ebx, 2*8)\
+	AS2(	xor		b, ebx)\
+	AS2(	shr		edx, 16)\
+	AS2(	movzx	ebx, dl)\
+	AS2(	shr		edx, 8)\
+	AS2(	movzx	ebx, BYTE PTR [ebp+1+4*ebx])\
+	AS2(	shl		ebx, 1*8)\
+	AS2(	xor		c, ebx)\
+	AS2(	movzx	ebx, BYTE PTR [ebp+1+4*edx])\
+	AS2(	xor		d, ebx)
+
+		QUARTER_ROUND(eax, ecx, esi, edi)
+		AS2(	movd	edx, mm2)
+		QUARTER_ROUND(edi, eax, ecx, esi)
+		AS2(	movd	edx, mm1)
+		QUARTER_ROUND(esi, edi, eax, ecx)
+		AS2(	movd	edx, mm0)
+		QUARTER_ROUND(ecx, esi, edi, eax)
+
+#undef QUARTER_ROUND
+
+		AS1(	pop		ebp)
+		AS1(	emms)
+
+#ifdef __GNUC__
+		AS1(	pop		ebx)
+		".att_syntax prefix;"
+			: "=a" (t0), "=c" (t1), "=S" (t2), "=D" (t3)
+			: "a" (Te), "D" (inBlock), "S" (k), "c" (kLoopEnd), "d" (g_cacheLineSize)
+			: "memory", "cc"
+		);
+
+		if (xorBlock)
+		{
+			t0 ^= ((const word32 *)xorBlock)[0];
+			t1 ^= ((const word32 *)xorBlock)[1];
+			t2 ^= ((const word32 *)xorBlock)[2];
+			t3 ^= ((const word32 *)xorBlock)[3];
+		}
+		((word32 *)outBlock)[0] = t0;
+		((word32 *)outBlock)[1] = t1;
+		((word32 *)outBlock)[2] = t2;
+		((word32 *)outBlock)[3] = t3;
+#else
+		AS2(	mov		ebx, xorBlock)
+		AS2(	test	ebx, ebx)
+		ASJ(	jz,		1, f)
+		AS2(	xor		eax, [ebx+0*4])
+		AS2(	xor		ecx, [ebx+1*4])
+		AS2(	xor		esi, [ebx+2*4])
+		AS2(	xor		edi, [ebx+3*4])
+		ASL(1)
+		AS2(	mov		ebx, outBlock)
+		AS2(	mov		[ebx+0*4], eax)
+		AS2(	mov		[ebx+1*4], ecx)
+		AS2(	mov		[ebx+2*4], esi)
+		AS2(	mov		[ebx+3*4], edi)
+#endif
+	}
+	else
+#endif	// #ifdef CRYPTOPP_X86_ASM_AVAILABLE
+	{
 	word32 s0, s1, s2, s3, t0, t1, t2, t3;
 	const word32 *rk = m_key;
 
@@ -171,95 +398,68 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 	rk += 8;
 
 	// timing attack countermeasure. see comments at top for more details
+	const int cacheLineSize = GetCacheLineSize();
 	unsigned int i;
 	word32 u = 0;
-	for (i=0; i<sizeof(Te0)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE)
-		u &= (Te0[i+0*s_lineSizeDiv4] & Te0[i+2*s_lineSizeDiv4]) & (Te0[i+1*s_lineSizeDiv4] & Te0[i+3*s_lineSizeDiv4]);
+	for (i=0; i<1024; i+=cacheLineSize)
+		u &= *(const word32 *)(((const byte *)Te)+i);
+	u &= Te[255];
 	s0 |= u; s1 |= u; s2 |= u; s3 |= u;
 
 	// first round
-    t0 ^=
-        Te0[GETBYTE(s0, s_i3)] ^
-        rotrFixed(Te0[GETBYTE(s1, s_i2)], 8) ^
-        rotrFixed(Te0[GETBYTE(s2, s_i1)], 16) ^
-        rotrFixed(Te0[GETBYTE(s3, s_i0)], 24);
-    t1 ^=
-        Te0[GETBYTE(s1, s_i3)] ^
-        rotrFixed(Te0[GETBYTE(s2, s_i2)], 8) ^
-        rotrFixed(Te0[GETBYTE(s3, s_i1)], 16) ^
-        rotrFixed(Te0[GETBYTE(s0, s_i0)], 24);
-    t2 ^=
-        Te0[GETBYTE(s2, s_i3)] ^
-        rotrFixed(Te0[GETBYTE(s3, s_i2)], 8) ^
-        rotrFixed(Te0[GETBYTE(s0, s_i1)], 16) ^
-        rotrFixed(Te0[GETBYTE(s1, s_i0)], 24);
-    t3 ^=
-        Te0[GETBYTE(s3, s_i3)] ^
-        rotrFixed(Te0[GETBYTE(s0, s_i2)], 8) ^
-        rotrFixed(Te0[GETBYTE(s1, s_i1)], 16) ^
-        rotrFixed(Te0[GETBYTE(s2, s_i0)], 24);
+#ifdef IS_BIG_ENDIAN
+#define QUARTER_ROUND(t, a, b, c, d)	\
+		a ^= rotrFixed(Te[byte(t)], 24);	t >>= 8;\
+		b ^= rotrFixed(Te[byte(t)], 16);	t >>= 8;\
+		c ^= rotrFixed(Te[byte(t)], 8);	t >>= 8;\
+		d ^= Te[t];
+#else
+#define QUARTER_ROUND(t, a, b, c, d)	\
+		d ^= Te[byte(t)];					t >>= 8;\
+		c ^= rotrFixed(Te[byte(t)], 8);	t >>= 8;\
+		b ^= rotrFixed(Te[byte(t)], 16);	t >>= 8;\
+		a ^= rotrFixed(Te[t], 24);
+#endif
+
+	QUARTER_ROUND(s3, t0, t1, t2, t3)
+	QUARTER_ROUND(s2, t3, t0, t1, t2)
+	QUARTER_ROUND(s1, t2, t3, t0, t1)
+	QUARTER_ROUND(s0, t1, t2, t3, t0)
+#undef QUARTER_ROUND
 
 	// Nr - 2 full rounds:
     unsigned int r = m_rounds/2 - 1;
     do
 	{
-        s0 =
-            Te0[GETBYTE(t0, 3)] ^
-            Te1[GETBYTE(t1, 2)] ^
-            Te2[GETBYTE(t2, 1)] ^
-            Te3[GETBYTE(t3, 0)] ^
-            rk[0];
-        s1 =
-            Te0[GETBYTE(t1, 3)] ^
-            Te1[GETBYTE(t2, 2)] ^
-            Te2[GETBYTE(t3, 1)] ^
-            Te3[GETBYTE(t0, 0)] ^
-            rk[1];
-        s2 =
-            Te0[GETBYTE(t2, 3)] ^
-            Te1[GETBYTE(t3, 2)] ^
-            Te2[GETBYTE(t0, 1)] ^
-            Te3[GETBYTE(t1, 0)] ^
-            rk[2];
-        s3 =
-            Te0[GETBYTE(t3, 3)] ^
-            Te1[GETBYTE(t0, 2)] ^
-            Te2[GETBYTE(t1, 1)] ^
-            Te3[GETBYTE(t2, 0)] ^
-            rk[3];
-
-        t0 =
-            Te0[GETBYTE(s0, 3)] ^
-            Te1[GETBYTE(s1, 2)] ^
-            Te2[GETBYTE(s2, 1)] ^
-            Te3[GETBYTE(s3, 0)] ^
-            rk[4];
-        t1 =
-            Te0[GETBYTE(s1, 3)] ^
-            Te1[GETBYTE(s2, 2)] ^
-            Te2[GETBYTE(s3, 1)] ^
-            Te3[GETBYTE(s0, 0)] ^
-            rk[5];
-        t2 =
-            Te0[GETBYTE(s2, 3)] ^
-            Te1[GETBYTE(s3, 2)] ^
-            Te2[GETBYTE(s0, 1)] ^
-            Te3[GETBYTE(s1, 0)] ^
-            rk[6];
-        t3 =
-            Te0[GETBYTE(s3, 3)] ^
-            Te1[GETBYTE(s0, 2)] ^
-            Te2[GETBYTE(s1, 1)] ^
-            Te3[GETBYTE(s2, 0)] ^
-            rk[7];
+#define QUARTER_ROUND(t, a, b, c, d)	\
+		a ^= Te[3*256+byte(t)]; t >>= 8;\
+		b ^= Te[2*256+byte(t)]; t >>= 8;\
+		c ^= Te[1*256+byte(t)]; t >>= 8;\
+		d ^= Te[t];
+
+		s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
+
+		QUARTER_ROUND(t3, s0, s1, s2, s3)
+		QUARTER_ROUND(t2, s3, s0, s1, s2)
+		QUARTER_ROUND(t1, s2, s3, s0, s1)
+		QUARTER_ROUND(t0, s1, s2, s3, s0)
+
+		t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
+
+		QUARTER_ROUND(s3, t0, t1, t2, t3)
+		QUARTER_ROUND(s2, t3, t0, t1, t2)
+		QUARTER_ROUND(s1, t2, t3, t0, t1)
+		QUARTER_ROUND(s0, t1, t2, t3, t0)
+#undef QUARTER_ROUND
 
         rk += 8;
     } while (--r);
 
 	// timing attack countermeasure. see comments at top for more details
 	u = 0;
-	for (i=0; i<sizeof(Se)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE)
-		u &= (((word32*)Se)[i+0*s_lineSizeDiv4] & ((word32*)Se)[i+2*s_lineSizeDiv4]) & (((word32*)Se)[i+1*s_lineSizeDiv4] & ((word32*)Se)[i+3*s_lineSizeDiv4]);
+	for (i=0; i<256; i+=cacheLineSize)
+		u &= *(const word32 *)(Se+i);
+	u &= *(const word32 *)(Se+252);
 	t0 |= u; t1 |= u; t2 |= u; t3 |= u;
 
 	word32 tbw[4];
@@ -267,23 +467,17 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 	word32 *const obw = (word32 *)outBlock;
 	const word32 *const xbw = (const word32 *)xorBlock;
 
-	// last round
-	tempBlock[0] = Se[GETBYTE(t0, 3)];
-	tempBlock[1] = Se[GETBYTE(t1, 2)];
-	tempBlock[2] = Se[GETBYTE(t2, 1)];
-	tempBlock[3] = Se[GETBYTE(t3, 0)];
-	tempBlock[4] = Se[GETBYTE(t1, 3)];
-	tempBlock[5] = Se[GETBYTE(t2, 2)];
-	tempBlock[6] = Se[GETBYTE(t3, 1)];
-	tempBlock[7] = Se[GETBYTE(t0, 0)];
-	tempBlock[8] = Se[GETBYTE(t2, 3)];
-	tempBlock[9] = Se[GETBYTE(t3, 2)];
-	tempBlock[10] = Se[GETBYTE(t0, 1)];
-	tempBlock[11] = Se[GETBYTE(t1, 0)];
-	tempBlock[12] = Se[GETBYTE(t3, 3)];
-	tempBlock[13] = Se[GETBYTE(t0, 2)];
-	tempBlock[14] = Se[GETBYTE(t1, 1)];
-	tempBlock[15] = Se[GETBYTE(t2, 0)];
+#define QUARTER_ROUND(t, a, b, c, d)	\
+	tempBlock[a] = Se[byte(t)]; t >>= 8;\
+	tempBlock[b] = Se[byte(t)]; t >>= 8;\
+	tempBlock[c] = Se[byte(t)]; t >>= 8;\
+	tempBlock[d] = Se[t];
+
+	QUARTER_ROUND(t2, 15, 2, 5, 8)
+	QUARTER_ROUND(t1, 11, 14, 1, 4)
+	QUARTER_ROUND(t0, 7, 10, 13, 0)
+	QUARTER_ROUND(t3, 3, 6, 9, 12)
+#undef QUARTER_ROUND
 
 	if (xbw)
 	{
@@ -299,12 +493,13 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 		obw[2] = tbw[2] ^ rk[2];
 		obw[3] = tbw[3] ^ rk[3];
 	}
+	}
 }
 
 void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
 {
 	word32 s0, s1, s2, s3, t0, t1, t2, t3;
-    const word32 *rk = m_key;
+	const word32 *rk = m_key;
 
 	s0 = ((const word32 *)inBlock)[0] ^ rk[0];
 	s1 = ((const word32 *)inBlock)[1] ^ rk[1];
@@ -317,95 +512,68 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 	rk += 8;
 
 	// timing attack countermeasure. see comments at top for more details
+	const int cacheLineSize = GetCacheLineSize();
 	unsigned int i;
 	word32 u = 0;
-	for (i=0; i<sizeof(Td0)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE)
-		u &= (Td0[i+0*s_lineSizeDiv4] & Td0[i+2*s_lineSizeDiv4]) & (Td0[i+1*s_lineSizeDiv4] & Td0[i+3*s_lineSizeDiv4]);
+	for (i=0; i<1024; i+=cacheLineSize)
+		u &= *(const word32 *)(((const byte *)Td)+i);
+	u &= Td[255];
 	s0 |= u; s1 |= u; s2 |= u; s3 |= u;
 
 	// first round
-    t0 ^=
-        Td0[GETBYTE(s0, s_i3)] ^
-        rotrFixed(Td0[GETBYTE(s3, s_i2)], 8) ^
-        rotrFixed(Td0[GETBYTE(s2, s_i1)], 16) ^
-        rotrFixed(Td0[GETBYTE(s1, s_i0)], 24);
-    t1 ^=
-        Td0[GETBYTE(s1, s_i3)] ^
-        rotrFixed(Td0[GETBYTE(s0, s_i2)], 8) ^
-        rotrFixed(Td0[GETBYTE(s3, s_i1)], 16) ^
-        rotrFixed(Td0[GETBYTE(s2, s_i0)], 24);
-    t2 ^=
-        Td0[GETBYTE(s2, s_i3)] ^
-        rotrFixed(Td0[GETBYTE(s1, s_i2)], 8) ^
-        rotrFixed(Td0[GETBYTE(s0, s_i1)], 16) ^
-        rotrFixed(Td0[GETBYTE(s3, s_i0)], 24);
-    t3 ^=
-        Td0[GETBYTE(s3, s_i3)] ^
-        rotrFixed(Td0[GETBYTE(s2, s_i2)], 8) ^
-        rotrFixed(Td0[GETBYTE(s1, s_i1)], 16) ^
-        rotrFixed(Td0[GETBYTE(s0, s_i0)], 24);
+#ifdef IS_BIG_ENDIAN
+#define QUARTER_ROUND(t, a, b, c, d)	\
+		a ^= rotrFixed(Td[byte(t)], 24);	t >>= 8;\
+		b ^= rotrFixed(Td[byte(t)], 16);	t >>= 8;\
+		c ^= rotrFixed(Td[byte(t)], 8);		t >>= 8;\
+		d ^= Td[t];
+#else
+#define QUARTER_ROUND(t, a, b, c, d)	\
+		d ^= Td[byte(t)];					t >>= 8;\
+		c ^= rotrFixed(Td[byte(t)], 8);		t >>= 8;\
+		b ^= rotrFixed(Td[byte(t)], 16);	t >>= 8;\
+		a ^= rotrFixed(Td[t], 24);
+#endif
+
+	QUARTER_ROUND(s3, t2, t1, t0, t3)
+	QUARTER_ROUND(s2, t1, t0, t3, t2)
+	QUARTER_ROUND(s1, t0, t3, t2, t1)
+	QUARTER_ROUND(s0, t3, t2, t1, t0)
+#undef QUARTER_ROUND
 
 	// Nr - 2 full rounds:
     unsigned int r = m_rounds/2 - 1;
     do
 	{
-        s0 =
-            Td0[GETBYTE(t0, 3)] ^
-            Td1[GETBYTE(t3, 2)] ^
-            Td2[GETBYTE(t2, 1)] ^
-            Td3[GETBYTE(t1, 0)] ^
-            rk[0];
-        s1 =
-            Td0[GETBYTE(t1, 3)] ^
-            Td1[GETBYTE(t0, 2)] ^
-            Td2[GETBYTE(t3, 1)] ^
-            Td3[GETBYTE(t2, 0)] ^
-            rk[1];
-        s2 =
-            Td0[GETBYTE(t2, 3)] ^
-            Td1[GETBYTE(t1, 2)] ^
-            Td2[GETBYTE(t0, 1)] ^
-            Td3[GETBYTE(t3, 0)] ^
-            rk[2];
-        s3 =
-            Td0[GETBYTE(t3, 3)] ^
-            Td1[GETBYTE(t2, 2)] ^
-            Td2[GETBYTE(t1, 1)] ^
-            Td3[GETBYTE(t0, 0)] ^
-            rk[3];
-
-        t0 =
-            Td0[GETBYTE(s0, 3)] ^
-            Td1[GETBYTE(s3, 2)] ^
-            Td2[GETBYTE(s2, 1)] ^
-            Td3[GETBYTE(s1, 0)] ^
-            rk[4];
-        t1 =
-            Td0[GETBYTE(s1, 3)] ^
-            Td1[GETBYTE(s0, 2)] ^
-            Td2[GETBYTE(s3, 1)] ^
-            Td3[GETBYTE(s2, 0)] ^
-            rk[5];
-        t2 =
-            Td0[GETBYTE(s2, 3)] ^
-            Td1[GETBYTE(s1, 2)] ^
-            Td2[GETBYTE(s0, 1)] ^
-            Td3[GETBYTE(s3, 0)] ^
-            rk[6];
-        t3 =
-            Td0[GETBYTE(s3, 3)] ^
-            Td1[GETBYTE(s2, 2)] ^
-            Td2[GETBYTE(s1, 1)] ^
-            Td3[GETBYTE(s0, 0)] ^
-            rk[7];
+#define QUARTER_ROUND(t, a, b, c, d)	\
+		a ^= Td[3*256+byte(t)]; t >>= 8;\
+		b ^= Td[2*256+byte(t)]; t >>= 8;\
+		c ^= Td[1*256+byte(t)]; t >>= 8;\
+		d ^= Td[t];
+
+		s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
+
+		QUARTER_ROUND(t3, s2, s1, s0, s3)
+		QUARTER_ROUND(t2, s1, s0, s3, s2)
+		QUARTER_ROUND(t1, s0, s3, s2, s1)
+		QUARTER_ROUND(t0, s3, s2, s1, s0)
+
+		t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
+
+		QUARTER_ROUND(s3, t2, t1, t0, t3)
+		QUARTER_ROUND(s2, t1, t0, t3, t2)
+		QUARTER_ROUND(s1, t0, t3, t2, t1)
+		QUARTER_ROUND(s0, t3, t2, t1, t0)
+#undef QUARTER_ROUND
 
         rk += 8;
     } while (--r);
 
 	// timing attack countermeasure. see comments at top for more details
 	u = 0;
-	for (i=0; i<sizeof(Sd)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE)
-		u &= (((word32*)Sd)[i+0*s_lineSizeDiv4] & ((word32*)Sd)[i+2*s_lineSizeDiv4]) & (((word32*)Sd)[i+1*s_lineSizeDiv4] & ((word32*)Sd)[i+3*s_lineSizeDiv4]);
+	for (i=0; i<256; i+=cacheLineSize)
+		u &= *(const word32 *)(Sd+i);
+	u &= *(const word32 *)(Sd+252);
 	t0 |= u; t1 |= u; t2 |= u; t3 |= u;
 
 	word32 tbw[4];
@@ -413,23 +581,17 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 	word32 *const obw = (word32 *)outBlock;
 	const word32 *const xbw = (const word32 *)xorBlock;
 
-	// last round
-	tempBlock[0] = Sd[GETBYTE(t0, 3)];
-	tempBlock[1] = Sd[GETBYTE(t3, 2)];
-	tempBlock[2] = Sd[GETBYTE(t2, 1)];
-	tempBlock[3] = Sd[GETBYTE(t1, 0)];
-	tempBlock[4] = Sd[GETBYTE(t1, 3)];
-	tempBlock[5] = Sd[GETBYTE(t0, 2)];
-	tempBlock[6] = Sd[GETBYTE(t3, 1)];
-	tempBlock[7] = Sd[GETBYTE(t2, 0)];
-	tempBlock[8] = Sd[GETBYTE(t2, 3)];
-	tempBlock[9] = Sd[GETBYTE(t1, 2)];
-	tempBlock[10] = Sd[GETBYTE(t0, 1)];
-	tempBlock[11] = Sd[GETBYTE(t3, 0)];
-	tempBlock[12] = Sd[GETBYTE(t3, 3)];
-	tempBlock[13] = Sd[GETBYTE(t2, 2)];
-	tempBlock[14] = Sd[GETBYTE(t1, 1)];
-	tempBlock[15] = Sd[GETBYTE(t0, 0)];
+#define QUARTER_ROUND(t, a, b, c, d)	\
+	tempBlock[a] = Sd[byte(t)]; t >>= 8;\
+	tempBlock[b] = Sd[byte(t)]; t >>= 8;\
+	tempBlock[c] = Sd[byte(t)]; t >>= 8;\
+	tempBlock[d] = Sd[t];
+
+	QUARTER_ROUND(t2, 7, 2, 13, 8)
+	QUARTER_ROUND(t1, 3, 14, 9, 4)
+	QUARTER_ROUND(t0, 15, 10, 5, 0)
+	QUARTER_ROUND(t3, 11, 6, 1, 12)
+#undef QUARTER_ROUND
 
 	if (xbw)
 	{
diff --git a/rijndael.h b/rijndael.h
index a035da4c..a068d637 100644
--- a/rijndael.h
+++ b/rijndael.h
@@ -25,16 +25,10 @@ class CRYPTOPP_DLL Rijndael : public Rijndael_Info, public BlockCipherDocumentat
 
 	protected:
 		// VS2005 workaround: have to put these on seperate lines, or error C2487 is triggered in DLL build
-		CRYPTOPP_L1_CACHE_ALIGN(static const byte Se[256]);
-		CRYPTOPP_L1_CACHE_ALIGN(static const byte Sd[256]);
-		CRYPTOPP_L1_CACHE_ALIGN(static const word32 Te0[256]);
-		static const word32 Te1[256];
-		static const word32 Te2[256];
-		static const word32 Te3[256];
-		CRYPTOPP_L1_CACHE_ALIGN(static const word32 Td0[256]);
-		static const word32 Td1[256];
-		static const word32 Td2[256];
-		static const word32 Td3[256];
+		static const byte Se[256];
+		static const byte Sd[256];
+		static const word32 Te[4*256];
+		static const word32 Td[4*256];
 
 		static const word32 rcon[];
 
@@ -52,6 +46,7 @@ class CRYPTOPP_DLL Rijndael : public Rijndael_Info, public BlockCipherDocumentat
 	{
 	public:
 		void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+		void ProcessAndXorBlock_Old(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
 	};
 
 public:
diff --git a/sha.cpp b/sha.cpp
index 5355995a..127d1f99 100644
--- a/sha.cpp
+++ b/sha.cpp
@@ -9,6 +9,7 @@
 
 #include "sha.h"
 #include "misc.h"
+#include "cpu.h"
 
 NAMESPACE_BEGIN(CryptoPP)
 
@@ -74,27 +75,43 @@ void SHA1::Transform(word32 *state, const word32 *data)
     state[2] += c;
     state[3] += d;
     state[4] += e;
-    /* Wipe variables */
-    a = b = c = d = e = 0;
-	memset(W, 0, sizeof(W));
 }
 
 // end of Steve Reid's code
 
 // *************************************************************
 
+void SHA224::InitState(HashWordType *state)
+{
+	static const word32 s[8] = {0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939, 0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4};
+	memcpy(state, s, sizeof(s));
+}
+
 void SHA256::InitState(HashWordType *state)
 {
-	state[0] = 0x6a09e667;
-	state[1] = 0xbb67ae85;
-	state[2] = 0x3c6ef372;
-	state[3] = 0xa54ff53a;
-	state[4] = 0x510e527f;
-	state[5] = 0x9b05688c;
-	state[6] = 0x1f83d9ab;
-	state[7] = 0x5be0cd19;
+	static const word32 s[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
+	memcpy(state, s, sizeof(s));
 }
 
+static const word32 SHA256_K[64] = {
+	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
 #define blk2(i) (W[i&15]+=s1(W[(i-2)&15])+W[(i-7)&15]+s0(W[(i-15)&15]))
 
 #define Ch(x,y,z) (z^(x&(y^z)))
@@ -109,7 +126,7 @@ void SHA256::InitState(HashWordType *state)
 #define g(i) T[(6-i)&7]
 #define h(i) T[(7-i)&7]
 
-#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j]+(j?blk2(i):blk0(i));\
+#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA256_K[i+j]+(j?blk2(i):blk0(i));\
 	d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))
 
 // for SHA256
@@ -141,98 +158,114 @@ void SHA256::Transform(word32 *state, const word32 *data)
     state[5] += f(0);
     state[6] += g(0);
     state[7] += h(0);
-    /* Wipe variables */
-	memset(W, 0, sizeof(W));
-	memset(T, 0, sizeof(T));
 }
 
+/* 
+// smaller but slower
+void SHA256_Transform(word32 *state, const word32 *data)
+{
+	word32 T[20];
+	word32 W[32];
+	unsigned int i = 0, j = 0;
+	word32 *t = T+8;
+
+	memcpy(t, state, 8*4);
+	word32 e = t[4], a = t[0];
+
+	do 
+	{
+		word32 w = data[j];
+		W[j] = w;
+		w += SHA256_K[j];
+		w += t[7];
+		w += S1(e);
+		w += Ch(e, t[5], t[6]);
+		e = t[3] + w;
+		t[3] = t[3+8] = e;
+		w += S0(t[0]);
+		a = w + Maj(a, t[1], t[2]);
+		t[-1] = t[7] = a;
+		--t;
+		++j;
+		if (j%8 == 0)
+			t += 8;
+	} while (j<16);
+
+	do
+	{
+		i = j&0xf;
+		word32 w = s1(W[i+16-2]) + s0(W[i+16-15]) + W[i] + W[i+16-7];
+		W[i+16] = W[i] = w;
+		w += SHA256_K[j];
+		w += t[7];
+		w += S1(e);
+		w += Ch(e, t[5], t[6]);
+		e = t[3] + w;
+		t[3] = t[3+8] = e;
+		w += S0(t[0]);
+		a = w + Maj(a, t[1], t[2]);
+		t[-1] = t[7] = a;
+
+		w = s1(W[(i+1)+16-2]) + s0(W[(i+1)+16-15]) + W[(i+1)] + W[(i+1)+16-7];
+		W[(i+1)+16] = W[(i+1)] = w;
+		w += SHA256_K[j+1];
+		w += (t-1)[7];
+		w += S1(e);
+		w += Ch(e, (t-1)[5], (t-1)[6]);
+		e = (t-1)[3] + w;
+		(t-1)[3] = (t-1)[3+8] = e;
+		w += S0((t-1)[0]);
+		a = w + Maj(a, (t-1)[1], (t-1)[2]);
+		(t-1)[-1] = (t-1)[7] = a;
+
+		t-=2;
+		j+=2;
+		if (j%8 == 0)
+			t += 8;
+	} while (j<64);
+
+    state[0] += a;
+    state[1] += t[1];
+    state[2] += t[2];
+    state[3] += t[3];
+    state[4] += e;
+    state[5] += t[5];
+    state[6] += t[6];
+    state[7] += t[7];
+}
+*/
+
 #undef S0
 #undef S1
 #undef s0
 #undef s1
-
-const word32 SHA256::K[64] = {
-	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
-	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
-	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
-	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
-	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
-	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
-	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
-	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
-	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
-	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
-	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
-	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
-	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
-	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
-	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
-	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-};
-
-void SHA224::InitState(HashWordType *state)
-{
-	state[0] = 0xc1059ed8;
-	state[1] = 0x367cd507;
-	state[2] = 0x3070dd17;
-	state[3] = 0xf70e5939;
-	state[4] = 0xffc00b31;
-	state[5] = 0x68581511;
-	state[6] = 0x64f98fa7;
-	state[7] = 0xbefa4fa4;
-}
+#undef R
 
 // *************************************************************
 
 #ifdef WORD64_AVAILABLE
 
-void SHA512::InitState(HashWordType *state)
+void SHA384::InitState(HashWordType *state)
 {
-	state[0] = W64LIT(0x6a09e667f3bcc908);
-	state[1] = W64LIT(0xbb67ae8584caa73b);
-	state[2] = W64LIT(0x3c6ef372fe94f82b);
-	state[3] = W64LIT(0xa54ff53a5f1d36f1);
-	state[4] = W64LIT(0x510e527fade682d1);
-	state[5] = W64LIT(0x9b05688c2b3e6c1f);
-	state[6] = W64LIT(0x1f83d9abfb41bd6b);
-	state[7] = W64LIT(0x5be0cd19137e2179);
+	static const word64 s[8] = {
+		W64LIT(0xcbbb9d5dc1059ed8), W64LIT(0x629a292a367cd507),
+		W64LIT(0x9159015a3070dd17), W64LIT(0x152fecd8f70e5939),
+		W64LIT(0x67332667ffc00b31), W64LIT(0x8eb44a8768581511),
+		W64LIT(0xdb0c2e0d64f98fa7), W64LIT(0x47b5481dbefa4fa4)};
+	memcpy(state, s, sizeof(s));
 }
 
-// for SHA512
-#define S0(x) (rotrFixed(x,28)^rotrFixed(x,34)^rotrFixed(x,39))
-#define S1(x) (rotrFixed(x,14)^rotrFixed(x,18)^rotrFixed(x,41))
-#define s0(x) (rotrFixed(x,1)^rotrFixed(x,8)^(x>>7))
-#define s1(x) (rotrFixed(x,19)^rotrFixed(x,61)^(x>>6))
-
-void SHA512::Transform(word64 *state, const word64 *data)
+void SHA512::InitState(HashWordType *state)
 {
-	word64 W[16];
-	word64 T[8];
-    /* Copy context->state[] to working vars */
-	memcpy(T, state, sizeof(T));
-    /* 80 operations, partially loop unrolled */
-	for (unsigned int j=0; j<80; j+=16)
-	{
-		R( 0); R( 1); R( 2); R( 3);
-		R( 4); R( 5); R( 6); R( 7);
-		R( 8); R( 9); R(10); R(11);
-		R(12); R(13); R(14); R(15);
-	}
-    /* Add the working vars back into context.state[] */
-    state[0] += a(0);
-    state[1] += b(0);
-    state[2] += c(0);
-    state[3] += d(0);
-    state[4] += e(0);
-    state[5] += f(0);
-    state[6] += g(0);
-    state[7] += h(0);
-    /* Wipe variables */
-	memset(W, 0, sizeof(W));
-	memset(T, 0, sizeof(T));
+	static const word64 s[8] = {
+		W64LIT(0x6a09e667f3bcc908), W64LIT(0xbb67ae8584caa73b),
+		W64LIT(0x3c6ef372fe94f82b), W64LIT(0xa54ff53a5f1d36f1),
+		W64LIT(0x510e527fade682d1), W64LIT(0x9b05688c2b3e6c1f),
+		W64LIT(0x1f83d9abfb41bd6b), W64LIT(0x5be0cd19137e2179)};
+	memcpy(state, s, sizeof(s));
 }
 
-const word64 SHA512::K[80] = {
+CRYPTOPP_ALIGN_DATA(16) static const word64 SHA512_K[80] CRYPTOPP_SECTION_ALIGN16 = {	// file-local table replacing the class member SHA512::K; one constant per round, read by both the asm and the C++ rounds
 	W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
 	W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
 	W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
@@ -275,16 +308,231 @@ const word64 SHA512::K[80] = {
 	W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817)
 };
 
-void SHA384::InitState(HashWordType *state)
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+// put assembly version in separate function, otherwise MSVC 2005 SP1 doesn't generate correct code for the non-assembly version
+static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64 *data)	// state: 8 words updated in place; data: 16 message words. NOTE(review): the non-GCC path appears to rely on CRYPTOPP_FASTCALL placing them in ecx/edx - confirm
+{
+#ifdef __GNUC__
+	__asm__ __volatile__
+	(
+		".intel_syntax noprefix;"
+	AS1(	push	ebx)	// ebx is saved and restored by hand; it is not in the clobber list below
+	AS2(	mov		ebx, eax)	// eax arrives holding SHA512_K through the "a" input constraint
+#else
+	AS2(	lea		ebx, SHA512_K)	// non-GCC path: ebx = address of the round-constant table
+#endif
+
+	AS2(	mov		eax, esp)	// keep the incoming esp so it can be restored at the end
+	AS2(	and		esp, 0xfffffff0)	// round esp down to a multiple of 16
+	AS2(	sub		esp, 27*16)				// 17*16 for expanded data, 20*8 for state
+	AS1(	push	eax)	// saved esp now sits at [esp]; the scratch area is addressed from esp+4
+	AS2(	xor		eax, eax)	// eax = round index, starts at 0
+	AS2(	lea		edi, [esp+4+8*8])		// start at middle of state buffer. will decrement pointer each round to avoid copying
+	AS2(	lea		esi, [esp+4+20*8+8])	// 16-byte alignment, then add 8
+
+	AS2(	movq	mm4, [ecx+0*8])	// state[0] (a) is kept live in mm4
+	AS2(	movq	[edi+0*8], mm4)
+	AS2(	movq	mm0, [ecx+1*8])
+	AS2(	movq	[edi+1*8], mm0)
+	AS2(	movq	mm0, [ecx+2*8])
+	AS2(	movq	[edi+2*8], mm0)
+	AS2(	movq	mm0, [ecx+3*8])
+	AS2(	movq	[edi+3*8], mm0)
+	AS2(	movq	mm5, [ecx+4*8])	// state[4] (e) is kept live in mm5
+	AS2(	movq	[edi+4*8], mm5)
+	AS2(	movq	mm0, [ecx+5*8])
+	AS2(	movq	[edi+5*8], mm0)
+	AS2(	movq	mm0, [ecx+6*8])
+	AS2(	movq	[edi+6*8], mm0)
+	AS2(	movq	mm0, [ecx+7*8])
+	AS2(	movq	[edi+7*8], mm0)
+	ASJ(	jmp,	0, f)	// skip the SHA512_Round subroutine and go to the first loop (label 0)
+
+#define SSE2_S0_S1(r, a, b, c)	\
+	AS2(	movq	mm6, r)\
+	AS2(	psrlq	r, a)\
+	AS2(	movq	mm7, r)\
+	AS2(	psllq	mm6, 64-c)\
+	AS2(	pxor	mm7, mm6)\
+	AS2(	psrlq	r, b-a)\
+	AS2(	pxor	mm7, r)\
+	AS2(	psllq	mm6, c-b)\
+	AS2(	pxor	mm7, mm6)\
+	AS2(	psrlq	r, c-b)\
+	AS2(	pxor	r, mm7)\
+	AS2(	psllq	mm6, b-a)\
+	AS2(	pxor	r, mm6)
+
+#define SSE2_s0(r, a, b, c)	\
+	AS2(	movdqa	xmm6, r)\
+	AS2(	psrlq	r, a)\
+	AS2(	movdqa	xmm7, r)\
+	AS2(	psllq	xmm6, 64-c)\
+	AS2(	pxor	xmm7, xmm6)\
+	AS2(	psrlq	r, b-a)\
+	AS2(	pxor	xmm7, r)\
+	AS2(	psrlq	r, c-b)\
+	AS2(	pxor	r, xmm7)\
+	AS2(	psllq	xmm6, c-a)\
+	AS2(	pxor	r, xmm6)
+
+#define SSE2_s1(r, a, b, c)	\
+	AS2(	movdqa	xmm6, r)\
+	AS2(	psrlq	r, a)\
+	AS2(	movdqa	xmm7, r)\
+	AS2(	psllq	xmm6, 64-c)\
+	AS2(	pxor	xmm7, xmm6)\
+	AS2(	psrlq	r, b-a)\
+	AS2(	pxor	xmm7, r)\
+	AS2(	psllq	xmm6, c-b)\
+	AS2(	pxor	xmm7, xmm6)\
+	AS2(	psrlq	r, c-b)\
+	AS2(	pxor	r, xmm7)
+
+	ASL(SHA512_Round)	// subroutine: one round; entered with call from the loops below, left with ret
+	// k + w is in mm0, a is in mm4, e is in mm5
+	AS2(	paddq	mm0, [edi+7*8])		// h
+	AS2(	movq	mm2, [edi+5*8])		// f
+	AS2(	movq	mm3, [edi+6*8])		// g
+	AS2(	pxor	mm2, mm3)
+	AS2(	pand	mm2, mm5)
+	SSE2_S0_S1(mm5,14,18,41)
+	AS2(	pxor	mm2, mm3)
+	AS2(	paddq	mm0, mm2)			// h += Ch(e,f,g)
+	AS2(	paddq	mm5, mm0)			// h += S1(e)
+	AS2(	movq	mm2, [edi+1*8])		// b
+	AS2(	movq	mm1, mm2)
+	AS2(	por		mm2, mm4)
+	AS2(	pand	mm2, [edi+2*8])		// c
+	AS2(	pand	mm1, mm4)
+	AS2(	por		mm1, mm2)
+	AS2(	paddq	mm1, mm5)			// temp = h + Maj(a,b,c)
+	AS2(	paddq	mm5, [edi+3*8])		// e = d + h
+	AS2(	movq	[edi+3*8], mm5)
+	AS2(	movq	[edi+11*8], mm5)
+	SSE2_S0_S1(mm4,28,34,39)			// S0(a)
+	AS2(	paddq	mm4, mm1)			// a = temp + S0(a)
+	AS2(	movq	[edi-8], mm4)
+	AS2(	movq	[edi+7*8], mm4)
+	AS1(	ret)
+
+	// first 16 rounds
+	ASL(0)
+	AS2(	movq	mm0, [edx+eax*8])	// next message word from data[]
+	AS2(	movq	[esi+eax*8], mm0)
+	AS2(	movq	[esi+eax*8+16*8], mm0)	// second copy, 16 words further on
+	AS2(	paddq	mm0, [ebx+eax*8])	// add the round constant with the same index
+	ASC(	call,	SHA512_Round)
+	AS1(	inc		eax)
+	AS2(	sub		edi, 8)
+	AS2(	test	eax, 7)	// low three bits clear => another 8 rounds are done
+	ASJ(	jnz,	0, b)
+	AS2(	add		edi, 8*8)	// undo the eight per-round decrements of edi
+	AS2(	cmp		eax, 16)
+	ASJ(	jne,	0, b)
+
+	// rest of the rounds
+	AS2(	movdqu	xmm0, [esi+(16-2)*8])
+	ASL(1)
+	// data expansion, W[i-2] already in xmm0
+	AS2(	movdqu	xmm3, [esi])
+	AS2(	paddq	xmm3, [esi+(16-7)*8])
+	AS2(	movdqa	xmm2, [esi+(16-15)*8])
+	SSE2_s1(xmm0, 6, 19, 61)
+	AS2(	paddq	xmm0, xmm3)
+	SSE2_s0(xmm2, 1, 7, 8)
+	AS2(	paddq	xmm0, xmm2)
+	AS2(	movdq2q	mm0, xmm0)
+	AS2(	movhlps	xmm1, xmm0)
+	AS2(	paddq	mm0, [ebx+eax*8])
+	AS2(	movlps	[esi], xmm0)
+	AS2(	movlps	[esi+8], xmm1)
+	AS2(	movlps	[esi+8*16], xmm0)
+	AS2(	movlps	[esi+8*17], xmm1)
+	// 2 rounds
+	ASC(	call,	SHA512_Round)
+	AS2(	sub		edi, 8)
+	AS2(	movdq2q	mm0, xmm1)
+	AS2(	paddq	mm0, [ebx+eax*8+8])
+	ASC(	call,	SHA512_Round)
+	// update indices and loop
+	AS2(	add		esi, 16)
+	AS2(	add		eax, 2)
+	AS2(	sub		edi, 8)
+	AS2(	test	eax, 7)
+	ASJ(	jnz,	1, b)
+	// do housekeeping every 8 rounds
+	AS2(	mov		esi, 0xf)
+	AS2(	and		esi, eax)
+	AS2(	lea		esi, [esp+4+20*8+8+esi*8])	// re-derive esi from the round index modulo 16
+	AS2(	add		edi, 8*8)
+	AS2(	cmp		eax, 80)	// stop after round 80
+	ASJ(	jne,	1, b)
+
+#define SSE2_CombineState(i)	\
+	AS2(	movq	mm0, [edi+i*8])\
+	AS2(	paddq	mm0, [ecx+i*8])\
+	AS2(	movq	[ecx+i*8], mm0)
+
+	SSE2_CombineState(0)
+	SSE2_CombineState(1)
+	SSE2_CombineState(2)
+	SSE2_CombineState(3)
+	SSE2_CombineState(4)
+	SSE2_CombineState(5)
+	SSE2_CombineState(6)
+	SSE2_CombineState(7)
+
+	AS1(	pop		esp)	// restore the stack pointer pushed before the scratch area was used
+	AS1(	emms)	// reset MMX state before returning to C++ code
+
+#ifdef __GNUC__
+	AS1(	pop		ebx)
+	".att_syntax prefix;"
+		:
+		: "a" (SHA512_K), "c" (state), "d" (data)
+		: "%esi", "%edi", "memory", "cc"
+	);
+#endif
+}
+#endif	// #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+
+void SHA512::Transform(word64 *state, const word64 *data)	// updates state from one block: SSE2 asm when available, otherwise the unrolled C++ rounds below
 {
-	state[0] = W64LIT(0xcbbb9d5dc1059ed8);
-	state[1] = W64LIT(0x629a292a367cd507);
-	state[2] = W64LIT(0x9159015a3070dd17);
-	state[3] = W64LIT(0x152fecd8f70e5939);
-	state[4] = W64LIT(0x67332667ffc00b31);
-	state[5] = W64LIT(0x8eb44a8768581511);
-	state[6] = W64LIT(0xdb0c2e0d64f98fa7);
-	state[7] = W64LIT(0x47b5481dbefa4fa4);
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+	if (HasSSE2())	// decided at run time, so one binary serves CPUs with and without SSE2
+		return SHA512_SSE2_Transform(state, data);
+#endif
+
+#define S0(x) (rotrFixed(x,28)^rotrFixed(x,34)^rotrFixed(x,39))
+#define S1(x) (rotrFixed(x,14)^rotrFixed(x,18)^rotrFixed(x,41))
+#define s0(x) (rotrFixed(x,1)^rotrFixed(x,8)^(x>>7))
+#define s1(x) (rotrFixed(x,19)^rotrFixed(x,61)^(x>>6))
+
+#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA512_K[i+j]+(j?blk2(i):blk0(i));\
+	d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))
+
+	word64 W[16];	// presumably the window used by blk0/blk2 (defined outside this hunk). NOTE(review): the earlier Transform wiped W and T with memset before returning; this version does not
+	word64 T[8];	// working copy of state; presumably what the a(i)..h(i) macros index - confirm against their definitions
+    /* Copy context->state[] to working vars */
+	memcpy(T, state, sizeof(T));
+    /* 80 operations, partially loop unrolled */
+	for (unsigned int j=0; j<80; j+=16)	// 5 passes x 16 unrolled rounds = 80 rounds
+	{
+		R( 0); R( 1); R( 2); R( 3);
+		R( 4); R( 5); R( 6); R( 7);
+		R( 8); R( 9); R(10); R(11);
+		R(12); R(13); R(14); R(15);
+	}
+    /* Add the working vars back into context.state[] */
+    state[0] += a(0);
+    state[1] += b(0);
+    state[2] += c(0);
+    state[3] += d(0);
+    state[4] += e(0);
+    state[5] += f(0);
+    state[6] += g(0);
+    state[7] += h(0);
 }
 
 #endif
diff --git a/sha.h b/sha.h
index 69b02ff7..40eb6df6 100644
--- a/sha.h
+++ b/sha.h
@@ -23,9 +23,6 @@ public:
 	static void CRYPTOPP_API InitState(HashWordType *state);
 	static void CRYPTOPP_API Transform(word32 *digest, const word32 *data);
 	static const char * CRYPTOPP_API StaticAlgorithmName() {return "SHA-256";}
-
-protected:
-	static const word32 K[64];
 };
 
 //! implements the SHA-224 standard
@@ -46,9 +43,6 @@ public:
 	static void CRYPTOPP_API InitState(HashWordType *state);
 	static void CRYPTOPP_API Transform(word64 *digest, const word64 *data);
 	static const char * CRYPTOPP_API StaticAlgorithmName() {return "SHA-512";}
-
-protected:
-	static const word64 K[80];
 };
 
 //! implements the SHA-384 standard
diff --git a/tiger.cpp b/tiger.cpp
index b69e975a..332de2c6 100644
--- a/tiger.cpp
+++ b/tiger.cpp
@@ -3,6 +3,7 @@
 #include "pch.h"
 #include "tiger.h"
 #include "misc.h"
+#include "cpu.h"
 
 #ifdef WORD64_AVAILABLE
 
@@ -24,13 +25,187 @@ void Tiger::TruncatedFinal(byte *hash, size_t size)
 
 	m_data[7] = GetBitCountLo();
 
-	Transform(m_digest, m_data);
-	CorrectEndianess(m_digest, m_digest, DigestSize());
-	memcpy(hash, m_digest, size);
+	Transform(m_state, m_data);
+	CorrectEndianess(m_state, m_state, DigestSize());
+	memcpy(hash, m_state, size);
 
 	Restart();		// reinit for next use
 }
 
+void Tiger::Transform (word64 *digest, const word64 *X)	// digest: 3 words updated in place; X: 8 message words
+{
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+	if (HasSSE2())	// runtime check; the else branch further down is the portable C++ version
+	{
+#ifdef __GNUC__
+		__asm__ __volatile__
+		(
+		".intel_syntax noprefix;"
+		AS1(	push	ebx)	// ebx is the loop offset inside SSE2_pass; saved by hand since it is not in the clobber list
+#else
+		AS2(	mov		eax, digest)
+		AS2(	mov		esi, X)
+		AS2(	lea		edx, [table])
+#endif
+		AS2(	movq	mm0, [eax])	// mm0 = digest[0]
+		AS2(	movq	mm1, [eax+1*8])	// mm1 = digest[1]
+		AS2(	movq	mm5, mm1)	// keep the incoming digest[1] for the final subtraction
+		AS2(	movq	mm2, [eax+2*8])	// mm2 = digest[2]
+		AS2(	movq	mm7, [edx+4*2048+0*8])	// constant stored right after the four 256-entry tables (word index 1024)
+		AS2(	movq	mm6, [edx+4*2048+1*8])	// next constant (word index 1025)
+		AS2(	mov		ecx, esp)	// remember esp before aligning it
+		AS2(	and		esp, 0xfffffff0)
+		AS2(	sub		esp, 8*8)	// 8 words of scratch for the key schedule output
+		AS1(	push	ecx)	// saved esp at [esp]; the scratch is addressed as esp+4 from here on
+
+#define SSE2_round(a,b,c,x,mul) \
+		AS2(	pxor	c, [x])\
+		AS2(	movd	ecx, c)\
+		AS2(	movzx	edi, cl)\
+		AS2(	movq	mm3, [edx+0*2048+edi*8])\
+		AS2(	movzx	edi, ch)\
+		AS2(	movq	mm4, [edx+3*2048+edi*8])\
+		AS2(	shr		ecx, 16)\
+		AS2(	movzx	edi, cl)\
+		AS2(	pxor	mm3, [edx+1*2048+edi*8])\
+		AS2(	movzx	edi, ch)\
+		AS2(	pxor	mm4, [edx+2*2048+edi*8])\
+		AS3(	pextrw	ecx, c, 2)\
+		AS2(	movzx	edi, cl)\
+		AS2(	pxor	mm3, [edx+2*2048+edi*8])\
+		AS2(	movzx	edi, ch)\
+		AS2(	pxor	mm4, [edx+1*2048+edi*8])\
+		AS3(	pextrw	ecx, c, 3)\
+		AS2(	movzx	edi, cl)\
+		AS2(	pxor	mm3, [edx+3*2048+edi*8])\
+		AS2(	psubq	a, mm3)\
+		AS2(	movzx	edi, ch)\
+		AS2(	pxor	mm4, [edx+0*2048+edi*8])\
+		AS2(	paddq	b, mm4)\
+		SSE2_mul_##mul(b)
+
+#define SSE2_mul_5(b)	\
+		AS2(	movq	mm3, b)\
+		AS2(	psllq	b, 2)\
+		AS2(	paddq	b, mm3)
+
+#define SSE2_mul_7(b)	\
+		AS2(	movq	mm3, b)\
+		AS2(	psllq	b, 3)\
+		AS2(	psubq	b, mm3)
+
+#define SSE2_mul_9(b)	\
+		AS2(	movq	mm3, b)\
+		AS2(	psllq	b, 3)\
+		AS2(	paddq	b, mm3)
+
+#define label2_5 1
+#define label2_7 2
+#define label2_9 3
+
+#define SSE2_pass(A,B,C,mul,X)	\
+		AS2(	xor		ebx, ebx)\
+		ASL(mul)\
+		SSE2_round(A,B,C,X+0*8+ebx,mul)\
+		SSE2_round(B,C,A,X+1*8+ebx,mul)\
+		AS2(	cmp		ebx, 6*8)\
+		ASJ(	je,		label2_##mul, f)\
+		SSE2_round(C,A,B,X+2*8+ebx,mul)\
+		AS2(	add		ebx, 3*8)\
+		ASJ(	jmp,	mul, b)\
+		ASL(label2_##mul)
+
+#define SSE2_key_schedule(Y,X) \
+		AS2(	movq	mm3, [X+7*8])\
+		AS2(	pxor	mm3, mm6)\
+		AS2(	movq	mm4, [X+0*8])\
+		AS2(	psubq	mm4, mm3)\
+		AS2(	movq	[Y+0*8], mm4)\
+		AS2(	pxor	mm4, [X+1*8])\
+		AS2(	movq	mm3, mm4)\
+		AS2(	movq	[Y+1*8], mm4)\
+		AS2(	paddq	mm4, [X+2*8])\
+		AS2(	pxor	mm3, mm7)\
+		AS2(	psllq	mm3, 19)\
+		AS2(	movq	[Y+2*8], mm4)\
+		AS2(	pxor	mm3, mm4)\
+		AS2(	movq	mm4, [X+3*8])\
+		AS2(	psubq	mm4, mm3)\
+		AS2(	movq	[Y+3*8], mm4)\
+		AS2(	pxor	mm4, [X+4*8])\
+		AS2(	movq	mm3, mm4)\
+		AS2(	movq	[Y+4*8], mm4)\
+		AS2(	paddq	mm4, [X+5*8])\
+		AS2(	pxor	mm3, mm7)\
+		AS2(	psrlq	mm3, 23)\
+		AS2(	movq	[Y+5*8], mm4)\
+		AS2(	pxor	mm3, mm4)\
+		AS2(	movq	mm4, [X+6*8])\
+		AS2(	psubq	mm4, mm3)\
+		AS2(	movq	[Y+6*8], mm4)\
+		AS2(	pxor	mm4, [X+7*8])\
+		AS2(	movq	mm3, mm4)\
+		AS2(	movq	[Y+7*8], mm4)\
+		AS2(	paddq	mm4, [Y+0*8])\
+		AS2(	pxor	mm3, mm7)\
+		AS2(	psllq	mm3, 19)\
+		AS2(	movq	[Y+0*8], mm4)\
+		AS2(	pxor	mm3, mm4)\
+		AS2(	movq	mm4, [Y+1*8])\
+		AS2(	psubq	mm4, mm3)\
+		AS2(	movq	[Y+1*8], mm4)\
+		AS2(	pxor	mm4, [Y+2*8])\
+		AS2(	movq	mm3, mm4)\
+		AS2(	movq	[Y+2*8], mm4)\
+		AS2(	paddq	mm4, [Y+3*8])\
+		AS2(	pxor	mm3, mm7)\
+		AS2(	psrlq	mm3, 23)\
+		AS2(	movq	[Y+3*8], mm4)\
+		AS2(	pxor	mm3, mm4)\
+		AS2(	movq	mm4, [Y+4*8])\
+		AS2(	psubq	mm4, mm3)\
+		AS2(	movq	[Y+4*8], mm4)\
+		AS2(	pxor	mm4, [Y+5*8])\
+		AS2(	movq	[Y+5*8], mm4)\
+		AS2(	paddq	mm4, [Y+6*8])\
+		AS2(	movq	[Y+6*8], mm4)\
+		AS2(	pxor	mm4, [edx+4*2048+2*8])\
+		AS2(	movq	mm3, [Y+7*8])\
+		AS2(	psubq	mm3, mm4)\
+		AS2(	movq	[Y+7*8], mm3)
+
+		SSE2_pass(mm0, mm1, mm2, 5, esi)	// pass 1 reads the message words straight from X
+		SSE2_key_schedule(esp+4, esi)	// stack scratch = key schedule of X
+		SSE2_pass(mm2, mm0, mm1, 7, esp+4)	// pass 2 reads the scheduled words
+		SSE2_key_schedule(esp+4, esp+4)	// second schedule, computed in place
+		SSE2_pass(mm1, mm2, mm0, 9, esp+4)
+
+		AS2(	pxor	mm0, [eax+0*8])	// digest[0] = a ^ digest[0]
+		AS2(	movq	[eax+0*8], mm0)
+		AS2(	psubq	mm1, mm5)	// digest[1] = b - incoming digest[1] (saved in mm5 at the top)
+		AS2(	movq	[eax+1*8], mm1)
+		AS2(	paddq	mm2, [eax+2*8])	// digest[2] = c + digest[2]
+		AS2(	movq	[eax+2*8], mm2)
+
+		AS1(	pop		esp)	// restore the esp pushed before the scratch area was used
+		AS1(	emms)	// reset MMX state before returning to C++ code
+#ifdef __GNUC__
+		AS1(	pop		ebx)
+		".att_syntax prefix;"
+			:
+			: "a" (digest), "S" (X), "d" (table)
+			: "%ecx", "%edi", "memory", "cc"
+		);
+#endif
+	}
+	else
+#endif
+	{
+		word64 a = digest[0];
+		word64 b = digest[1];
+		word64 c = digest[2];
+		word64 Y[8];	// key schedule output. NOTE(review): the earlier version cleared Y with memset before returning; that wipe is not present here
+
 #define t1 (table)
 #define t2 (table+256)
 #define t3 (table+256*2)
@@ -42,15 +217,17 @@ void Tiger::TruncatedFinal(byte *hash, size_t size)
 	b += t4[GETBYTE(c,1)] ^ t3[GETBYTE(c,3)] ^ t2[GETBYTE(c,5)] ^ t1[GETBYTE(c,7)]; \
 	b *= mul
 
-#define pass(a,b,c,mul,X) \
-	round(a,b,c,X[0],mul); \
-	round(b,c,a,X[1],mul); \
-	round(c,a,b,X[2],mul); \
-	round(a,b,c,X[3],mul); \
-	round(b,c,a,X[4],mul); \
-	round(c,a,b,X[5],mul); \
-	round(a,b,c,X[6],mul); \
-	round(b,c,a,X[7],mul)
+#define pass(a,b,c,mul,X) {\
+	int i=0;\
+	while (true)\
+	{\
+		round(a,b,c,X[i+0],mul); \
+		round(b,c,a,X[i+1],mul); \
+		if (i==6)\
+			break;\
+		round(c,a,b,X[i+2],mul); \
+		i+=3;\
+	}}
 
 #define key_schedule(Y,X) \
 	Y[0] = X[0] - (X[7]^W64LIT(0xA5A5A5A5A5A5A5A5)); \
@@ -70,24 +247,16 @@ void Tiger::TruncatedFinal(byte *hash, size_t size)
 	Y[6] += Y[5]; \
 	Y[7] -= Y[6] ^ W64LIT(0x0123456789ABCDEF)
 
-void Tiger::Transform (word64 *digest, const word64 *X)
-{
-	word64 a = digest[0];
-	word64 b = digest[1];
-	word64 c = digest[2];
-	word64 Y[8];
-
-	pass(a,b,c,5,X);
-	key_schedule(Y,X);
-	pass(c,a,b,7,Y);
-	key_schedule(Y,Y);
-	pass(b,c,a,9,Y);
-
-	digest[0] = a ^ digest[0];
-	digest[1] = b - digest[1];
-	digest[2] = c + digest[2];
-
-	memset(Y, 0, sizeof(Y));
+		pass(a,b,c,5,X);	// same three passes as before; the pass macro is now a loop instead of eight unrolled rounds
+		key_schedule(Y,X);
+		pass(c,a,b,7,Y);
+		key_schedule(Y,Y);
+		pass(b,c,a,9,Y);
+
+		digest[0] = a ^ digest[0];
+		digest[1] = b - digest[1];
+		digest[2] = c + digest[2];
+	}
 }
 
 NAMESPACE_END
diff --git a/tiger.h b/tiger.h
index 66d1da2a..42bf1614 100644
--- a/tiger.h
+++ b/tiger.h
@@ -9,7 +9,7 @@
 
 NAMESPACE_BEGIN(CryptoPP)
 
-/// <a href="http://www.weidai.com/scan-mirror/md.html#Tiger">Tiger</a>
+/// <a href="http://www.cryptolounge.org/wiki/Tiger">Tiger</a>
 class Tiger : public IteratedHashWithStaticTransform<word64, LittleEndian, 64, 24, Tiger>
 {
 public:
@@ -19,7 +19,7 @@ public:
 	static const char * StaticAlgorithmName() {return "Tiger";}
 
 protected:
-	static const word64 table[4*256];
+	static const word64 table[4*256+3];
 };
 
 NAMESPACE_END
diff --git a/whrlpool.cpp b/whrlpool.cpp
index 989281a3..da19d7ff 100644
--- a/whrlpool.cpp
+++ b/whrlpool.cpp
@@ -1,7 +1,7 @@
-// Whrlpool.cpp - modified by Kevin Springle from
+// whrlpool.cpp - originally modified by Kevin Springle from
 // Paulo Barreto and Vincent Rijmen's public domain code, whirlpool.c.
+// Updated to Whirlpool version 3.0, optimized and MMX version added by Wei Dai
 // Any modifications are placed in the public domain
-// Updated to Whirlpool version 3.0 by Wei Dai
 
 // This is the original introductory comment:
 
@@ -69,6 +69,7 @@
 
 #include "whrlpool.h"
 #include "misc.h"
+#include "cpu.h"
 
 NAMESPACE_BEGIN(CryptoPP)
 
@@ -94,9 +95,9 @@ void Whirlpool::TruncatedFinal(byte *hash, size_t size)
 	m_data[m_data.size()-2] = GetBitCountHi();
 	m_data[m_data.size()-1] = GetBitCountLo();
 
-	Transform(m_digest, m_data);
-	CorrectEndianess(m_digest, m_digest, DigestSize());
-	memcpy(hash, m_digest, size);
+	Transform(m_state, m_data);
+	CorrectEndianess(m_state, m_state, DigestSize());
+	memcpy(hash, m_state, size);
 
 	Restart();		// reinit for next use
 }
@@ -113,7 +114,7 @@ void Whirlpool::TruncatedFinal(byte *hash, size_t size)
  * employed).
  */
 
-static const word64 C0[256] = {
+CRYPTOPP_ALIGN_DATA(16) static const word64 Whirlpool_C[4*256+R] CRYPTOPP_SECTION_ALIGN16 = {
     W64LIT(0x18186018c07830d8), W64LIT(0x23238c2305af4626), W64LIT(0xc6c63fc67ef991b8), W64LIT(0xe8e887e8136fcdfb),
     W64LIT(0x878726874ca113cb), W64LIT(0xb8b8dab8a9626d11), W64LIT(0x0101040108050209), W64LIT(0x4f4f214f426e9e0d),
     W64LIT(0x3636d836adee6c9b), W64LIT(0xa6a6a2a6590451ff), W64LIT(0xd2d26fd2debdb90c), W64LIT(0xf5f5f3f5fb06f70e),
@@ -177,11 +178,9 @@ static const word64 C0[256] = {
     W64LIT(0x16165816b04e2ca6), W64LIT(0x3a3ae83acdd274f7), W64LIT(0x6969b9696fd0d206), W64LIT(0x09092409482d1241),
     W64LIT(0x7070dd70a7ade0d7), W64LIT(0xb6b6e2b6d954716f), W64LIT(0xd0d067d0ceb7bd1e), W64LIT(0xeded93ed3b7ec7d6),
     W64LIT(0xcccc17cc2edb85e2), W64LIT(0x424215422a578468), W64LIT(0x98985a98b4c22d2c), W64LIT(0xa4a4aaa4490e55ed),
-    W64LIT(0x2828a0285d885075), W64LIT(0x5c5c6d5cda31b886), W64LIT(0xf8f8c7f8933fed6b), W64LIT(0x8686228644a411c2),
-};
+	W64LIT(0x2828a0285d885075), W64LIT(0x5c5c6d5cda31b886), W64LIT(0xf8f8c7f8933fed6b), W64LIT(0x8686228644a411c2),
 
-static const word64 C1[256] = {
-    W64LIT(0xd818186018c07830), W64LIT(0x2623238c2305af46), W64LIT(0xb8c6c63fc67ef991), W64LIT(0xfbe8e887e8136fcd),
+	W64LIT(0xd818186018c07830), W64LIT(0x2623238c2305af46), W64LIT(0xb8c6c63fc67ef991), W64LIT(0xfbe8e887e8136fcd),
     W64LIT(0xcb878726874ca113), W64LIT(0x11b8b8dab8a9626d), W64LIT(0x0901010401080502), W64LIT(0x0d4f4f214f426e9e),
     W64LIT(0x9b3636d836adee6c), W64LIT(0xffa6a6a2a6590451), W64LIT(0x0cd2d26fd2debdb9), W64LIT(0x0ef5f5f3f5fb06f7),
     W64LIT(0x967979f979ef80f2), W64LIT(0x306f6fa16f5fcede), W64LIT(0x6d91917e91fcef3f), W64LIT(0xf852525552aa07a4),
@@ -245,10 +244,8 @@ static const word64 C1[256] = {
     W64LIT(0xd77070dd70a7ade0), W64LIT(0x6fb6b6e2b6d95471), W64LIT(0x1ed0d067d0ceb7bd), W64LIT(0xd6eded93ed3b7ec7),
     W64LIT(0xe2cccc17cc2edb85), W64LIT(0x68424215422a5784), W64LIT(0x2c98985a98b4c22d), W64LIT(0xeda4a4aaa4490e55),
     W64LIT(0x752828a0285d8850), W64LIT(0x865c5c6d5cda31b8), W64LIT(0x6bf8f8c7f8933fed), W64LIT(0xc28686228644a411),
-};
 
-static const word64 C2[256] = {
-    W64LIT(0x30d818186018c078), W64LIT(0x462623238c2305af), W64LIT(0x91b8c6c63fc67ef9), W64LIT(0xcdfbe8e887e8136f),
+	W64LIT(0x30d818186018c078), W64LIT(0x462623238c2305af), W64LIT(0x91b8c6c63fc67ef9), W64LIT(0xcdfbe8e887e8136f),
     W64LIT(0x13cb878726874ca1), W64LIT(0x6d11b8b8dab8a962), W64LIT(0x0209010104010805), W64LIT(0x9e0d4f4f214f426e),
     W64LIT(0x6c9b3636d836adee), W64LIT(0x51ffa6a6a2a65904), W64LIT(0xb90cd2d26fd2debd), W64LIT(0xf70ef5f5f3f5fb06),
     W64LIT(0xf2967979f979ef80), W64LIT(0xde306f6fa16f5fce), W64LIT(0x3f6d91917e91fcef), W64LIT(0xa4f852525552aa07),
@@ -312,10 +309,8 @@ static const word64 C2[256] = {
     W64LIT(0xe0d77070dd70a7ad), W64LIT(0x716fb6b6e2b6d954), W64LIT(0xbd1ed0d067d0ceb7), W64LIT(0xc7d6eded93ed3b7e),
     W64LIT(0x85e2cccc17cc2edb), W64LIT(0x8468424215422a57), W64LIT(0x2d2c98985a98b4c2), W64LIT(0x55eda4a4aaa4490e),
     W64LIT(0x50752828a0285d88), W64LIT(0xb8865c5c6d5cda31), W64LIT(0xed6bf8f8c7f8933f), W64LIT(0x11c28686228644a4),
-};
 
-static const word64 C3[256] = {
-    W64LIT(0x7830d818186018c0), W64LIT(0xaf462623238c2305), W64LIT(0xf991b8c6c63fc67e), W64LIT(0x6fcdfbe8e887e813),
+	W64LIT(0x7830d818186018c0), W64LIT(0xaf462623238c2305), W64LIT(0xf991b8c6c63fc67e), W64LIT(0x6fcdfbe8e887e813),
     W64LIT(0xa113cb878726874c), W64LIT(0x626d11b8b8dab8a9), W64LIT(0x0502090101040108), W64LIT(0x6e9e0d4f4f214f42),
     W64LIT(0xee6c9b3636d836ad), W64LIT(0x0451ffa6a6a2a659), W64LIT(0xbdb90cd2d26fd2de), W64LIT(0x06f70ef5f5f3f5fb),
     W64LIT(0x80f2967979f979ef), W64LIT(0xcede306f6fa16f5f), W64LIT(0xef3f6d91917e91fc), W64LIT(0x07a4f852525552aa),
@@ -379,9 +374,7 @@ static const word64 C3[256] = {
     W64LIT(0xade0d77070dd70a7), W64LIT(0x54716fb6b6e2b6d9), W64LIT(0xb7bd1ed0d067d0ce), W64LIT(0x7ec7d6eded93ed3b),
     W64LIT(0xdb85e2cccc17cc2e), W64LIT(0x578468424215422a), W64LIT(0xc22d2c98985a98b4), W64LIT(0x0e55eda4a4aaa449),
     W64LIT(0x8850752828a0285d), W64LIT(0x31b8865c5c6d5cda), W64LIT(0x3fed6bf8f8c7f893), W64LIT(0xa411c28686228644),
-};
 
-static const word64 rc[R] = {
 	W64LIT(0x1823c6e887b8014f),
 	W64LIT(0x36a6d2f5796f9152),
 	W64LIT(0x60bc9b8ea30c7b35),
@@ -397,55 +390,292 @@ static const word64 rc[R] = {
 // Whirlpool basic transformation. Transforms state based on block.
 void Whirlpool::Transform(word64 *digest, const word64 *block)
 {
+#ifdef CRYPTOPP_X86_ASM_AVAILABLE
+	if (HasMMX())	// runtime check; the else branch further down is the portable C++ version
+	{
+		// MMX version has the same structure as C version below
+#ifdef __GNUC__
+	__asm__ __volatile__
+	(
+		".intel_syntax noprefix;"
+		AS1(	push	ebx)	// ebx is saved and restored by hand; it is not in the clobber list below
+		AS2(	mov		ebx, eax)	// eax arrives holding Whirlpool_C through the "a" input constraint
+#else
+		AS2(	lea		ebx, [Whirlpool_C])
+		AS2(	mov		ecx, digest)
+		AS2(	mov		edx, block)
+#endif
+		AS2(	mov		eax, esp)	// remember esp before aligning it
+		AS2(	and		esp, 0xfffffff0)
+		AS2(	sub		esp, 16*8)	// scratch for 16 words: k at esp+4, s at esp+4+64 (after the push below)
+		AS1(	push	eax)
+		AS2(	xor		esi, esi)	// esi = word index 0..7 for the set-up loop
+		ASL(0)
+		AS2(	movq	mm0, [ecx+8*esi])
+		AS2(	movq	[esp+4+8*esi], mm0)		// k
+		AS2(	pxor	mm0, [edx+8*esi])
+		AS2(	movq	[esp+4+64+8*esi], mm0)	// s
+		AS2(	movq	[ecx+8*esi], mm0)	// digest[i] = block[i] ^ digest[i], as in the C++ set-up loop
+		AS1(	inc		esi)
+		AS2(	cmp		esi, 8)
+		ASJ(	jne,	0, b)
+
+		AS2(	xor		esi, esi)	// esi is reused as the round counter
+		ASL(1)
+
+#define KSL0(a, b)	AS2(movq	mm##a, b)
+#define KSL1(a, b)	AS2(pxor	mm##a, b)
+
+#define KSL(op, i, a, b, c, d)	\
+	AS2(mov		eax, [esp+4+8*i])\
+	AS2(movzx	edi, al)\
+	KSL##op(a, [ebx+3*2048+8*edi])\
+	AS2(movzx	edi, ah)\
+	KSL##op(b, [ebx+2*2048+8*edi])\
+	AS2(shr		eax, 16)\
+	AS2(movzx	edi, al)\
+	AS2(shr		eax, 8)\
+	KSL##op(c, [ebx+1*2048+8*edi])\
+	KSL##op(d, [ebx+0*2048+8*eax])
+
+#define KSH0(a, b)	\
+	ASS(pshufw	mm##a, mm##a, 1, 0, 3, 2)\
+	AS2(pxor	mm##a, b)
+#define KSH1(a, b)	\
+	AS2(pxor	mm##a, b)
+#define KSH2(a, b)	\
+	AS2(pxor	mm##a, b)\
+	AS2(movq	[esp+4+8*a], mm##a)
+
+#define KSH(op, i, a, b, c, d)	\
+	AS2(mov		eax, [esp+4+8*((i+4)-8*((i+4)/8))+4])\
+	AS2(movzx	edi, al)\
+	KSH##op(a, [ebx+3*2048+8*edi])\
+	AS2(movzx	edi, ah)\
+	KSH##op(b, [ebx+2*2048+8*edi])\
+	AS2(shr		eax, 16)\
+	AS2(movzx	edi, al)\
+	AS2(shr		eax, 8)\
+	KSH##op(c, [ebx+1*2048+8*edi])\
+	KSH##op(d, [ebx+0*2048+8*eax])
+
+#define TSL(op, i, a, b, c, d)	\
+	AS2(mov		eax, [esp+4+64+8*i])\
+	AS2(movzx	edi, al)\
+	KSL##op(a, [ebx+3*2048+8*edi])\
+	AS2(movzx	edi, ah)\
+	KSL##op(b, [ebx+2*2048+8*edi])\
+	AS2(shr		eax, 16)\
+	AS2(movzx	edi, al)\
+	AS2(shr		eax, 8)\
+	KSL##op(c, [ebx+1*2048+8*edi])\
+	KSL##op(d, [ebx+0*2048+8*eax])
+
+#define TSH0(a, b)	\
+	ASS(pshufw	mm##a, mm##a, 1, 0, 3, 2)\
+	AS2(pxor	mm##a, [esp+4+8*a])\
+	AS2(pxor	mm##a, b)
+#define TSH1(a, b)	\
+	AS2(pxor	mm##a, b)
+#define TSH2(a, b)	\
+	AS2(pxor	mm##a, b)\
+	AS2(movq	[esp+4+64+8*a], mm##a)
+#define TSH3(a, b)	\
+	AS2(pxor	mm##a, b)\
+	AS2(pxor	mm##a, [ecx+8*a])\
+	AS2(movq	[ecx+8*a], mm##a)
+
+#define TSH(op, i, a, b, c, d)	\
+	AS2(mov		eax, [esp+4+64+8*((i+4)-8*((i+4)/8))+4])\
+	AS2(movzx	edi, al)\
+	TSH##op(a, [ebx+3*2048+8*edi])\
+	AS2(movzx	edi, ah)\
+	TSH##op(b, [ebx+2*2048+8*edi])\
+	AS2(shr		eax, 16)\
+	AS2(movzx	edi, al)\
+	AS2(shr		eax, 8)\
+	TSH##op(c, [ebx+1*2048+8*edi])\
+	TSH##op(d, [ebx+0*2048+8*eax])
+
+		KSL(0, 4, 3, 2, 1, 0)
+		KSL(0, 0, 7, 6, 5, 4)
+		KSL(1, 1, 0, 7, 6, 5)
+		KSL(1, 2, 1, 0, 7, 6)
+		KSL(1, 3, 2, 1, 0, 7)
+		KSL(1, 5, 4, 3, 2, 1)
+		KSL(1, 6, 5, 4, 3, 2)
+		KSL(1, 7, 6, 5, 4, 3)
+		KSH(0, 0, 7, 6, 5, 4)
+		KSH(0, 4, 3, 2, 1, 0)
+		KSH(1, 1, 0, 7, 6, 5)
+		KSH(1, 2, 1, 0, 7, 6)
+		KSH(1, 5, 4, 3, 2, 1)
+		KSH(1, 6, 5, 4, 3, 2)
+		KSH(2, 3, 2, 1, 0, 7)
+		KSH(2, 7, 6, 5, 4, 3)
+
+		AS2(	pxor	mm0, [ebx + 8*1024 + esi*8])	// xor in the constant for round esi; the constants follow the four 256-entry tables
+		AS2(	movq	[esp+4], mm0)	// store the result as k[0]
+
+		TSL(0, 4, 3, 2, 1, 0)
+		TSL(0, 0, 7, 6, 5, 4)
+		TSL(1, 1, 0, 7, 6, 5)
+		TSL(1, 2, 1, 0, 7, 6)
+		TSL(1, 3, 2, 1, 0, 7)
+		TSL(1, 5, 4, 3, 2, 1)
+		TSL(1, 6, 5, 4, 3, 2)
+		TSL(1, 7, 6, 5, 4, 3)
+		TSH(0, 0, 7, 6, 5, 4)
+		TSH(0, 4, 3, 2, 1, 0)
+		TSH(1, 1, 0, 7, 6, 5)
+		TSH(1, 2, 1, 0, 7, 6)
+		TSH(1, 5, 4, 3, 2, 1)
+		TSH(1, 6, 5, 4, 3, 2)
+
+		AS1(	inc		esi)
+		AS2(	cmp		esi, 10)	// NOTE(review): literal 10 here, while the C++ path compares against R - assumes R == 10, confirm
+		ASJ(	je,		2, f)
+
+		TSH(2, 3, 2, 1, 0, 7)
+		TSH(2, 7, 6, 5, 4, 3)
+
+		ASJ(	jmp,	1, b)
+		ASL(2)
+
+		TSH(3, 3, 2, 1, 0, 7)
+		TSH(3, 7, 6, 5, 4, 3)
+
+#undef KSL
+#undef KSH
+#undef TSL
+#undef TSH
+
+		AS1(	emms)	// reset MMX state before returning to C++ code
+		AS1(	pop		esp)	// restore the esp pushed before the scratch area was used
+
+#ifdef __GNUC__
+		AS1(	pop		ebx)
+		".att_syntax prefix;"
+			:
+			: "a" (Whirlpool_C), "c" (digest), "d" (block)
+			: "%esi", "%edi", "memory", "cc"
+		);
+#endif
+	}
+	else
+#endif		// #ifdef CRYPTOPP_X86_ASM_AVAILABLE
+	{
 	word64 s[8];	// the cipher state
 	word64 k[8];	// the round key
 
 	// Compute and apply K^0 to the cipher state
 	// Also apply part of the Miyaguchi-Preneel compression function
-	digest[0] = s[0] = block[0] ^ (k[0] = digest[0]);
-	digest[1] = s[1] = block[1] ^ (k[1] = digest[1]);
-	digest[2] = s[2] = block[2] ^ (k[2] = digest[2]);
-	digest[3] = s[3] = block[3] ^ (k[3] = digest[3]);
-	digest[4] = s[4] = block[4] ^ (k[4] = digest[4]);
-	digest[5] = s[5] = block[5] ^ (k[5] = digest[5]);
-	digest[6] = s[6] = block[6] ^ (k[6] = digest[6]);
-	digest[7] = s[7] = block[7] ^ (k[7] = digest[7]);
+	for (int i=0; i<8; i++)
+		digest[i] = s[i] = block[i] ^ (k[i] = digest[i]);	// k[i] takes the old digest word first, then s[i] and digest[i] get block ^ k
+
+#define KSL(op, i, a, b, c, d)	\
+	t = (word32)k[i];\
+	w##a = Whirlpool_C[3*256 + (byte)t] ^ (op ? w##a : 0);\
+	t >>= 8;\
+	w##b = Whirlpool_C[2*256 + (byte)t] ^ (op ? w##b : 0);\
+	t >>= 8;\
+	w##c = Whirlpool_C[1*256 + (byte)t] ^ (op ? w##c : 0);\
+	t >>= 8;\
+	w##d = Whirlpool_C[0*256 + t]       ^ (op ? w##d : 0);
+
+#define KSH(op, i, a, b, c, d)	\
+	t = (word32)(k[(i+4)%8]>>32);\
+	w##a = Whirlpool_C[3*256 + (byte)t] ^ (op ? w##a : rotrFixed(w##a, 32));\
+	if (op==2) k[a] = w##a;\
+	t >>= 8;\
+	w##b = Whirlpool_C[2*256 + (byte)t] ^ (op ? w##b : rotrFixed(w##b, 32));\
+	if (op==2) k[b] = w##b;\
+	t >>= 8;\
+	w##c = Whirlpool_C[1*256 + (byte)t] ^ (op ? w##c : rotrFixed(w##c, 32));\
+	if (op==2) k[c] = w##c;\
+	t >>= 8;\
+	w##d = Whirlpool_C[0*256 + t]       ^ (op ? w##d : rotrFixed(w##d, 32));\
+	if (op==2) k[d] = w##d;\
+
+#define TSL(op, i, a, b, c, d)	\
+	t = (word32)s[i];\
+	w##a = Whirlpool_C[3*256 + (byte)t] ^ (op ? w##a : 0);\
+	t >>= 8;\
+	w##b = Whirlpool_C[2*256 + (byte)t] ^ (op ? w##b : 0);\
+	t >>= 8;\
+	w##c = Whirlpool_C[1*256 + (byte)t] ^ (op ? w##c : 0);\
+	t >>= 8;\
+	w##d = Whirlpool_C[0*256 + t]       ^ (op ? w##d : 0);
+
+#define TSH_OP(op, a, b)	\
+	w##a = Whirlpool_C[b*256 + (byte)t] ^ (op ? w##a : rotrFixed(w##a, 32) ^ k[a]);\
+	if (op==2) s[a] = w##a;\
+	if (op==3) digest[a] ^= w##a;\
+
+#define TSH(op, i, a, b, c, d)	\
+	t = (word32)(s[(i+4)%8]>>32);\
+	TSH_OP(op, a, 3);\
+	t >>= 8;\
+	TSH_OP(op, b, 2);\
+	t >>= 8;\
+	TSH_OP(op, c, 1);\
+	t >>= 8;\
+	TSH_OP(op, d, 0);\
 
 	// Iterate over all rounds:
-	for (int r = 0; r < R; r++)
+	int r=0;
+	while (true)	// left through the break in the final-round branch below
 	{
 		word64 w0, w1, w2, w3, w4, w5, w6, w7;	// temporary storage
-		word64 t;
-
-		// Compute K^r from K^{r-1}:
-#define K(i,j) GETBYTE(k[(i+j+1)%8], j)
-#define KS(i) \
-	t = C0[K(i,3)] ^ C1[K(i,2)] ^ C2[K(i,1)] ^ C3[K(i,0)]; \
-	w##i = rotrFixed(t, 32) ^ C0[K(i,7)] ^ C1[K(i,6)] ^ C2[K(i,5)] ^ C3[K(i,4)];
-
-		KS(0); KS(1); KS(2); KS(3); KS(4); KS(5); KS(6); KS(7);
-		k[0] = w0 ^ rc[r];
-		k[1] = w1; k[2] = w2; k[3] = w3; k[4] = w4; k[5] = w5; k[6] = w6; k[7] = w7;
-
-		// Apply the r-th round transformation:
-#define S(i,j) GETBYTE(s[(i+j+1)%8], j)
-#define TS(i) \
-	t = C0[S(i,3)] ^ C1[S(i,2)] ^ C2[S(i,1)] ^ C3[S(i,0)]; \
-	w##i = rotrFixed(t, 32) ^ C0[S(i,7)] ^ C1[S(i,6)] ^ C2[S(i,5)] ^ C3[S(i,4)] ^ k[i];
-
-		TS(0); TS(1); TS(2); TS(3); TS(4); TS(5); TS(6); TS(7);
-		s[0] = w0; s[1] = w1; s[2] = w2; s[3] = w3; s[4] = w4; s[5] = w5; s[6] = w6; s[7] = w7;
-	}
+		word32 t;	// 32-bit half of a k/s word, consumed one byte at a time by the macros above
+
+		KSL(0, 4, 3, 2, 1, 0)
+		KSL(0, 0, 7, 6, 5, 4)
+		KSL(1, 1, 0, 7, 6, 5)
+		KSL(1, 2, 1, 0, 7, 6)
+		KSL(1, 3, 2, 1, 0, 7)
+		KSL(1, 5, 4, 3, 2, 1)
+		KSL(1, 6, 5, 4, 3, 2)
+		KSL(1, 7, 6, 5, 4, 3)
+		KSH(0, 0, 7, 6, 5, 4)
+		KSH(0, 4, 3, 2, 1, 0)
+		KSH(1, 1, 0, 7, 6, 5)
+		KSH(1, 2, 1, 0, 7, 6)
+		KSH(1, 5, 4, 3, 2, 1)
+		KSH(1, 6, 5, 4, 3, 2)
+		KSH(2, 3, 2, 1, 0, 7)
+		KSH(2, 7, 6, 5, 4, 3)
 
-	// Apply the rest of the Miyaguchi-Preneel compression function:
-	digest[0] ^= s[0];
-	digest[1] ^= s[1];
-	digest[2] ^= s[2];
-	digest[3] ^= s[3];
-	digest[4] ^= s[4];
-	digest[5] ^= s[5];
-	digest[6] ^= s[6];
-	digest[7] ^= s[7];
+		k[0] ^= Whirlpool_C[1024+r];	// constant for round r; the constants follow the 4*256 table entries in the merged array
+
+		TSL(0, 4, 3, 2, 1, 0)
+		TSL(0, 0, 7, 6, 5, 4)
+		TSL(1, 1, 0, 7, 6, 5)
+		TSL(1, 2, 1, 0, 7, 6)
+		TSL(1, 3, 2, 1, 0, 7)
+		TSL(1, 5, 4, 3, 2, 1)
+		TSL(1, 6, 5, 4, 3, 2)
+		TSL(1, 7, 6, 5, 4, 3)
+		TSH(0, 0, 7, 6, 5, 4)
+		TSH(0, 4, 3, 2, 1, 0)
+		TSH(1, 1, 0, 7, 6, 5)
+		TSH(1, 2, 1, 0, 7, 6)
+		TSH(1, 5, 4, 3, 2, 1)
+		TSH(1, 6, 5, 4, 3, 2)
+
+		if (++r < R)	// more rounds to go: op 2 stores the finished words back into s[]
+		{
+			TSH(2, 3, 2, 1, 0, 7)
+			TSH(2, 7, 6, 5, 4, 3)
+		}
+		else	// last round: op 3 xors the finished words straight into digest[]
+		{
+			TSH(3, 3, 2, 1, 0, 7)
+			TSH(3, 7, 6, 5, 4, 3)
+			break;
+		}
+	}
+	}
 }
 
 NAMESPACE_END
diff --git a/whrlpool.h b/whrlpool.h
index c6971f08..298850ab 100644
--- a/whrlpool.h
+++ b/whrlpool.h
@@ -9,8 +9,7 @@
 
 NAMESPACE_BEGIN(CryptoPP)
 
-//! <a href="http://www.weidai.com/scan-mirror/md.html#Whirlpool">Whirlpool</a>
-/*! 512 Bit Hash */
+//! <a href="http://www.cryptolounge.org/wiki/Whirlpool">Whirlpool</a>
 class Whirlpool : public IteratedHashWithStaticTransform<word64, BigEndian, 64, 64, Whirlpool>
 {
 public: