-rw-r--r--  integer.cpp   3267
-rw-r--r--  integer.h       37
-rw-r--r--  rijndael.cpp   576
-rw-r--r--  rijndael.h      15
-rw-r--r--  sha.cpp        440
-rw-r--r--  sha.h            6
-rw-r--r--  tiger.cpp      229
-rw-r--r--  tiger.h          4
-rw-r--r--  whrlpool.cpp   344
-rw-r--r--  whrlpool.h       3
10 files changed, 2765 insertions, 2156 deletions
diff --git a/integer.cpp b/integer.cpp
index 0c5018ee..515643ed 100644
--- a/integer.cpp
+++ b/integer.cpp
@@ -14,30 +14,20 @@
#include "algparam.h"
#include "pubkey.h" // for P1363_KDF2
#include "sha.h"
+#include "cpu.h"
#include <iostream>
-#ifdef _M_X64
-#include <Intrin.h>
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+ #include <intrin.h>
#endif
-#ifdef SSE2_INTRINSICS_AVAILABLE
- #ifdef __GNUC__
- #include <xmmintrin.h>
- #include <signal.h>
- #include <setjmp.h>
- #ifdef CRYPTOPP_MEMALIGN_AVAILABLE
- #include <malloc.h>
- #else
- #include <stdlib.h>
- #endif
- #else
- #include <emmintrin.h>
- #endif
-#elif defined(_MSC_VER) && defined(_M_IX86)
- #pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 intrinsics will be disabled.")
-#elif defined(__GNUC__) && defined(__i386__)
- #warning "You do not have GCC 3.3 or later, or did not specify -msse2 compiler option, so use of SSE2 intrinsics will be disabled."
+#ifdef __DECCXX
+ #include <c_asm.h>
+#endif
+
+#ifdef CRYPTOPP_MSVC6_NO_PP
+ #pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 instructions will be disabled.")
#endif
NAMESPACE_BEGIN(CryptoPP)
@@ -50,67 +40,7 @@ bool AssignIntToInteger(const std::type_info &valueType, void *pInteger, const v
return true;
}
-#ifdef SSE2_INTRINSICS_AVAILABLE
-template <class T>
-CPP_TYPENAME AlignedAllocator<T>::pointer AlignedAllocator<T>::allocate(size_type n, const void *)
-{
- CheckSize(n);
- if (n == 0)
- return NULL;
- if (n >= 4)
- {
- void *p;
- #ifdef CRYPTOPP_MM_MALLOC_AVAILABLE
- while (!(p = _mm_malloc(sizeof(T)*n, 16)))
- #elif defined(CRYPTOPP_MEMALIGN_AVAILABLE)
- while (!(p = memalign(16, sizeof(T)*n)))
- #elif defined(CRYPTOPP_MALLOC_ALIGNMENT_IS_16)
- while (!(p = malloc(sizeof(T)*n)))
- #else
- while (!(p = (byte *)malloc(sizeof(T)*n + 8))) // assume malloc alignment is at least 8
- #endif
- CallNewHandler();
-
- #ifdef CRYPTOPP_NO_ALIGNED_ALLOC
- assert(m_pBlock == NULL);
- m_pBlock = p;
- if (!IsAlignedOn(p, 16))
- {
- assert(IsAlignedOn(p, 8));
- p = (byte *)p + 8;
- }
- #endif
-
- assert(IsAlignedOn(p, 16));
- return (T*)p;
- }
- return new T[n];
-}
-
-template <class T>
-void AlignedAllocator<T>::deallocate(void *p, size_type n)
-{
- memset(p, 0, n*sizeof(T));
- if (n >= 4)
- {
- #ifdef CRYPTOPP_MM_MALLOC_AVAILABLE
- _mm_free(p);
- #elif defined(CRYPTOPP_NO_ALIGNED_ALLOC)
- assert(m_pBlock == p || (byte *)m_pBlock+8 == p);
- free(m_pBlock);
- m_pBlock = NULL;
- #else
- free(p);
- #endif
- }
- else
- delete [] (T *)p;
-}
-
-template class CRYPTOPP_DLL AlignedAllocator<word>;
-#endif
-
-static int Compare(const word *A, const word *B, size_t N)
+inline static int Compare(const word *A, const word *B, size_t N)
{
while (N--)
if (A[N] > B[N])
@@ -121,7 +51,7 @@ static int Compare(const word *A, const word *B, size_t N)
return 0;
}
-static int Increment(word *A, size_t N, word B=1)
+inline static int Increment(word *A, size_t N, word B=1)
{
assert(N);
word t = A[0];
@@ -134,7 +64,7 @@ static int Increment(word *A, size_t N, word B=1)
return 1;
}
-static int Decrement(word *A, size_t N, word B=1)
+inline static int Decrement(word *A, size_t N, word B=1)
{
assert(N);
word t = A[0];
@@ -169,6 +99,45 @@ static word AtomicInverseModPower2(word A)
// ********************************************************
+#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
+ #define Declare2Words(x) dword x;
+ #if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER)
+ #define MultiplyWords(p, a, b) p = __emulu(a, b);
+ #else
+ #define MultiplyWords(p, a, b) p = (dword)a*b;
+ #endif
+ #define AssignWord(a, b) a = b;
+ #define Add2WordsBy1(a, b, c) a = b + c;
+ #define Acc2WordsBy1(a, b) a += b;
+ #define Acc2WordsBy2(a, b) a += b;
+ #define LowWord(a) (word)a
+ #define HighWord(a) (word)(a>>WORD_BITS)
+ #define Double2Words(a) a += a;
+ #define AddWithCarry(u, a, b) u = dword(a) + b + GetCarry(u);
+ #define SubtractWithBorrow(u, a, b) u = dword(a) - b - GetBorrow(u);
+ #define GetCarry(u) HighWord(u)
+ #define GetBorrow(u) word(u>>(WORD_BITS*2-1))
+#else
+ #define Declare2Words(x) word x##0, x##1;
+ #define AssignWord(a, b) a##0 = b; a##1 = 0;
+ #define Add2WordsBy1(a, b, c) a##0 = b##0 + c; a##1 = b##1 + (a##0 < c);
+ #define Acc2WordsBy1(a, b) Add2WordsBy1(a, a, b)
+ #define Acc2WordsBy2(a, b) a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1;
+ #define LowWord(a) a##0
+ #define HighWord(a) a##1
+ #ifdef _MSC_VER
+ #define MultiplyWords(p, a, b) p##0 = _umul128(a, b, &p##1);
+ #define Double2Words(a) a##1 = __shiftleft128(a##0, a##1, 1); a##0 += a##0;
+ #elif defined(__DECCXX)
+ #define MultiplyWords(p, a, b) p##0 = a*b; p##1 = asm("umulh %a0, %a1, %v0", a, b);
+ #define Double2Words(a) a##1 = (a##1 + a##1) + (a##0 >> (WORD_BITS-1)); a##0 += a##0;
+ #endif
+ #define AddWithCarry(u, a, b) {word t = a+b; u##0 = t + u##1; u##1 = (t<a) + (u##0<t);}
+ #define SubtractWithBorrow(u, a, b) {word t = a-b; u##0 = t - u##1; u##1 = (t>a) + (u##0>t);}
+ #define GetCarry(u) u##1
+ #define GetBorrow(u) u##1
+#endif
+
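In the emulated branch above, a double word is the pair of words x##0 (low) and x##1 (high). A minimal sketch of what AddWithCarry computes there (u0/u1 stand for u##0/u##1; word is an unsigned type of WORD_BITS bits):

    word t = a + b;             // may wrap modulo 2^WORD_BITS
    u0 = t + u1;                // fold in the previous carry held in u1
    u1 = (t < a) + (u0 < t);    // carry out of either addition

The two tests cannot both fire: u0 can only wrap when t == 2^WORD_BITS-1, in which case t < a is impossible, so u1 stays 0 or 1.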
class DWord
{
public:
@@ -198,25 +167,8 @@ public:
DWord r;
#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
r.m_whole = (dword)a * b;
- #elif defined(__alpha__)
- r.m_halfs.low = a*b; __asm__("umulh %1,%2,%0" : "=r" (r.m_halfs.high) : "r" (a), "r" (b));
- #elif defined(__ia64__)
- r.m_halfs.low = a*b; __asm__("xmpy.hu %0=%1,%2" : "=f" (r.m_halfs.high) : "f" (a), "f" (b));
- #elif defined(_ARCH_PPC64)
- r.m_halfs.low = a*b; __asm__("mulhdu %0,%1,%2" : "=r" (r.m_halfs.high) : "r" (a), "r" (b) : "cc");
- #elif defined(__x86_64__)
- __asm__("mulq %3" : "=d" (r.m_halfs.high), "=a" (r.m_halfs.low) : "a" (a), "rm" (b) : "cc");
- #elif defined(__mips64)
- __asm__("dmultu %2,%3" : "=h" (r.m_halfs.high), "=l" (r.m_halfs.low) : "r" (a), "r" (b));
- #elif defined(_M_X64)
- r.m_halfs.low = _umul128(a, b, &r.m_halfs.high);
- #elif defined(_M_IX86)
- // for testing
- word64 t = (word64)a * b;
- r.m_halfs.high = ((word32 *)(&t))[1];
- r.m_halfs.low = (word32)t;
#else
- #error can not implement DWord
+ r.m_halfs.low = _umul128(a, b, &r.m_halfs.high);
#endif
return r;
}
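After this hunk DWord::Multiply has just two cases: platforms with a native double word use a plain widening multiply, and the remaining case (presumably MSVC on x64, which has no 128-bit integer type) uses the _umul128 intrinsic, which returns the low 64 bits of the product and writes the high 64 bits through its third argument:

    word64 hi;
    word64 lo = _umul128(a, b, &hi);   // 64x64 -> 128-bit product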
@@ -457,1529 +409,1449 @@ inline word DWord::operator%(word a)
// ********************************************************
-class Portable
-{
-public:
- static int Add(word *C, const word *A, const word *B, size_t N);
- static int Subtract(word *C, const word *A, const word *B, size_t N);
-
- static inline void Multiply2(word *C, const word *A, const word *B);
- static inline word Multiply2Add(word *C, const word *A, const word *B);
- static void Multiply4(word *C, const word *A, const word *B);
- static void Multiply8(word *C, const word *A, const word *B);
- static inline unsigned int MultiplyRecursionLimit() {return 8;}
-
- static inline void Multiply2Bottom(word *C, const word *A, const word *B);
- static void Multiply4Bottom(word *C, const word *A, const word *B);
- static void Multiply8Bottom(word *C, const word *A, const word *B);
- static inline unsigned int MultiplyBottomRecursionLimit() {return 8;}
-
- static void Square2(word *R, const word *A);
- static void Square4(word *R, const word *A);
- static void Square8(word *R, const word *A) {assert(false);}
- static inline unsigned int SquareRecursionLimit() {return 4;}
-};
+// use some tricks to share assembly code between MSVC and GCC
+#if defined(__GNUC__)
+ #define CRYPTOPP_NAKED
+ #define AddPrologue \
+ __asm__ __volatile__ \
+ ( \
+ "push %%ebx;" /* save this manually, in case of -fPIC */ \
+ "mov %2, %%ebx;" \
+ ".intel_syntax noprefix;"
+ #define AddEpilogue \
+ ".att_syntax prefix;" \
+ "pop %%ebx;" \
+ : \
+ : "d" (C), "a" (A), "m" (B), "c" (N) \
+ : "%esi", "memory", "cc" \
+ );
+ #define MulPrologue \
+ __asm__ __volatile__ \
+ ( \
+ ".intel_syntax noprefix;" \
+ AS1( push ebx) \
+ AS2( mov ebx, edx)
+ #define MulEpilogue \
+ AS1( pop ebx) \
+ ".att_syntax prefix;" \
+ : \
+ : "d" (s_maskLow16), "c" (C), "a" (A), "D" (B) \
+ : "%esi", "memory", "cc" \
+ );
+ #define SquPrologue MulPrologue
+ #define SquEpilogue \
+ AS1( pop ebx) \
+ ".att_syntax prefix;" \
+ : \
+ : "d" (s_maskLow16), "c" (C), "a" (A) \
+ : "%esi", "%edi", "memory", "cc" \
+ );
+ #define TopPrologue MulPrologue
+ #define TopEpilogue \
+ AS1( pop ebx) \
+ ".att_syntax prefix;" \
+ : \
+ : "d" (s_maskLow16), "c" (C), "a" (A), "D" (B), "S" (L) \
+ : "memory", "cc" \
+ );
+#else
+ #define CRYPTOPP_NAKED __declspec(naked)
+ #define AddPrologue \
+ __asm push ebx \
+ __asm push esi \
+ __asm mov eax, [esp+12] \
+ __asm mov ebx, [esp+16]
+ #define AddEpilogue \
+ __asm pop esi \
+ __asm pop ebx \
+ __asm ret 8
+ #define SquPrologue \
+ AS2( mov eax, A) \
+ AS2( mov ecx, C) \
+ AS2( lea ebx, s_maskLow16)
+ #define SquEpilogue
+ #define MulPrologue \
+ AS2( mov eax, A) \
+ AS2( mov edi, B) \
+ AS2( mov ecx, C) \
+ AS2( lea ebx, s_maskLow16)
+ #define MulEpilogue
+ #define TopPrologue \
+ AS2( mov eax, A) \
+ AS2( mov edi, B) \
+ AS2( mov ecx, C) \
+ AS2( mov esi, L) \
+ AS2( lea ebx, s_maskLow16)
+ #define TopEpilogue
+#endif
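The AS1/AS2/ASJ/ASL helpers (now defined elsewhere; the deleted block near the end of this file shows their previous inline definitions) stringize each instruction under GCC and emit __asm statements under MSVC, which is how one instruction list serves both compilers. A rough sketch of how Baseline_Add's body comes out under GCC:

    __asm__ __volatile__
    (
        "push %%ebx;" "mov %2, %%ebx;" ".intel_syntax noprefix;"  // AddPrologue
        "lea eax, [eax+4*ecx];"                                   // AS2( lea eax, [eax+4*ecx])
        /* ...the remaining instructions, one string each... */
        ".att_syntax prefix;" "pop %%ebx;"                        // AddEpilogue
        : : "d" (C), "a" (A), "m" (B), "c" (N)
        : "%esi", "memory", "cc"
    );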
-int Portable::Add(word *C, const word *A, const word *B, size_t N)
+#if defined(_MSC_VER) && defined(_M_X64)
+extern "C" {
+int Baseline_Add(size_t N, word *C, const word *A, const word *B);
+int Baseline_Sub(size_t N, word *C, const word *A, const word *B);
+}
+#elif defined(CRYPTOPP_X86_ASM_AVAILABLE)
+CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
{
- assert (N%2 == 0);
+ AddPrologue
- DWord u(0, 0);
- for (unsigned int i = 0; i < N; i+=2)
- {
- u = DWord(A[i]) + B[i] + u.GetHighHalf();
- C[i] = u.GetLowHalf();
- u = DWord(A[i+1]) + B[i+1] + u.GetHighHalf();
- C[i+1] = u.GetLowHalf();
- }
- return int(u.GetHighHalf());
+ // now: eax = A, ebx = B, edx = C, ecx = N
+ AS2( lea eax, [eax+4*ecx])
+ AS2( lea ebx, [ebx+4*ecx])
+ AS2( lea edx, [edx+4*ecx])
+
+ AS1( neg ecx) // ecx is negative index
+ AS2( test ecx, 2) // this clears carry flag
+ ASJ( jz, 0, f)
+ AS2( sub ecx, 2)
+ ASJ( jmp, 1, f)
+
+ ASL(0)
+	ASJ(	jecxz,	2, f)		// exit the loop once the negative index counts up to zero
+ AS2( mov esi,[eax+4*ecx])
+ AS2( adc esi,[ebx+4*ecx])
+ AS2( mov [edx+4*ecx],esi)
+ AS2( mov esi,[eax+4*ecx+4])
+ AS2( adc esi,[ebx+4*ecx+4])
+ AS2( mov [edx+4*ecx+4],esi)
+ ASL(1)
+ AS2( mov esi,[eax+4*ecx+8])
+ AS2( adc esi,[ebx+4*ecx+8])
+ AS2( mov [edx+4*ecx+8],esi)
+ AS2( mov esi,[eax+4*ecx+12])
+ AS2( adc esi,[ebx+4*ecx+12])
+ AS2( mov [edx+4*ecx+12],esi)
+
+	AS2(	lea	ecx,[ecx+4])	// advance index with lea: it preserves the carry flag for the adc chain and avoids inc, which causes a slowdown on Intel Core 2
+ ASJ( jmp, 0, b)
+
+ ASL(2)
+ AS2( mov eax, 0)
+ AS1( setc al) // store carry into eax (return result register)
+
+ AddEpilogue
}
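The control flow above, as plain C (a sketch only; the assembly keeps the carry live in the flags register across iterations and processes four words per pass):

    A += N; B += N; C += N;          // the three lea instructions
    ptrdiff_t i = -(ptrdiff_t)N;     // negative index counting up to zero
    int carry = 0;                   // lives in CF in the assembly
    while (i != 0)                   // jecxz
    {
        dword t = (dword)A[i] + B[i] + carry;   // adc, unrolled 4x in the assembly
        C[i] = (word)t;
        carry = (int)(t >> WORD_BITS);
        i++;                         // lea ecx,[ecx+4], which leaves CF intact
    }
    return carry;                    // setc al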
-int Portable::Subtract(word *C, const word *A, const word *B, size_t N)
+CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
{
- assert (N%2 == 0);
+ AddPrologue
- DWord u(0, 0);
- for (unsigned int i = 0; i < N; i+=2)
- {
- u = (DWord) A[i] - B[i] - u.GetHighHalfAsBorrow();
- C[i] = u.GetLowHalf();
- u = (DWord) A[i+1] - B[i+1] - u.GetHighHalfAsBorrow();
- C[i+1] = u.GetLowHalf();
- }
- return int(0-u.GetHighHalf());
+ // now: eax = A, ebx = B, edx = C, ecx = N
+ AS2( lea eax, [eax+4*ecx])
+ AS2( lea ebx, [ebx+4*ecx])
+ AS2( lea edx, [edx+4*ecx])
+
+ AS1( neg ecx) // ecx is negative index
+ AS2( test ecx, 2) // this clears carry flag
+ ASJ( jz, 0, f)
+ AS2( sub ecx, 2)
+ ASJ( jmp, 1, f)
+
+ ASL(0)
+	ASJ(	jecxz,	2, f)		// exit the loop once the negative index counts up to zero
+ AS2( mov esi,[eax+4*ecx])
+ AS2( sbb esi,[ebx+4*ecx])
+ AS2( mov [edx+4*ecx],esi)
+ AS2( mov esi,[eax+4*ecx+4])
+ AS2( sbb esi,[ebx+4*ecx+4])
+ AS2( mov [edx+4*ecx+4],esi)
+ ASL(1)
+ AS2( mov esi,[eax+4*ecx+8])
+ AS2( sbb esi,[ebx+4*ecx+8])
+ AS2( mov [edx+4*ecx+8],esi)
+ AS2( mov esi,[eax+4*ecx+12])
+ AS2( sbb esi,[ebx+4*ecx+12])
+ AS2( mov [edx+4*ecx+12],esi)
+
+	AS2(	lea	ecx,[ecx+4])	// advance index with lea: it preserves the carry flag for the sbb chain and avoids inc, which causes a slowdown on Intel Core 2
+ ASJ( jmp, 0, b)
+
+ ASL(2)
+ AS2( mov eax, 0)
+ AS1( setc al) // store carry into eax (return result register)
+
+ AddEpilogue
}
-void Portable::Multiply2(word *C, const word *A, const word *B)
+CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B)
{
-/*
- word s;
- dword d;
+ AddPrologue
- if (A1 >= A0)
- if (B0 >= B1)
- {
- s = 0;
- d = (dword)(A1-A0)*(B0-B1);
- }
- else
- {
- s = (A1-A0);
- d = (dword)s*(word)(B0-B1);
- }
- else
- if (B0 > B1)
- {
- s = (B0-B1);
- d = (word)(A1-A0)*(dword)s;
- }
- else
- {
- s = 0;
- d = (dword)(A0-A1)*(B1-B0);
- }
-*/
- // this segment is the branchless equivalent of above
- word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
- unsigned int ai = A[1] < A[0];
- unsigned int bi = B[0] < B[1];
- unsigned int di = ai & bi;
- DWord d = DWord::Multiply(D[di], D[di+2]);
- D[1] = D[3] = 0;
- unsigned int si = ai + !bi;
- word s = D[si];
-
- DWord A0B0 = DWord::Multiply(A[0], B[0]);
- C[0] = A0B0.GetLowHalf();
-
- DWord A1B1 = DWord::Multiply(A[1], B[1]);
- DWord t = (DWord) A0B0.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() + A1B1.GetLowHalf();
- C[1] = t.GetLowHalf();
-
- t = A1B1 + t.GetHighHalf() + A0B0.GetHighHalf() + d.GetHighHalf() + A1B1.GetHighHalf() - s;
- C[2] = t.GetLowHalf();
- C[3] = t.GetHighHalf();
-}
-
-inline void Portable::Multiply2Bottom(word *C, const word *A, const word *B)
-{
- DWord t = DWord::Multiply(A[0], B[0]);
- C[0] = t.GetLowHalf();
- C[1] = t.GetHighHalf() + A[0]*B[1] + A[1]*B[0];
-}
-
-word Portable::Multiply2Add(word *C, const word *A, const word *B)
-{
- word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
- unsigned int ai = A[1] < A[0];
- unsigned int bi = B[0] < B[1];
- unsigned int di = ai & bi;
- DWord d = DWord::Multiply(D[di], D[di+2]);
- D[1] = D[3] = 0;
- unsigned int si = ai + !bi;
- word s = D[si];
-
- DWord A0B0 = DWord::Multiply(A[0], B[0]);
- DWord t = A0B0 + C[0];
- C[0] = t.GetLowHalf();
-
- DWord A1B1 = DWord::Multiply(A[1], B[1]);
- t = (DWord) t.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() + A1B1.GetLowHalf() + C[1];
- C[1] = t.GetLowHalf();
-
- t = (DWord) t.GetHighHalf() + A1B1.GetLowHalf() + A0B0.GetHighHalf() + d.GetHighHalf() + A1B1.GetHighHalf() - s + C[2];
- C[2] = t.GetLowHalf();
-
- t = (DWord) t.GetHighHalf() + A1B1.GetHighHalf() + C[3];
- C[3] = t.GetLowHalf();
- return t.GetHighHalf();
-}
-
-#define MulAcc(x, y) \
- p = DWord::MultiplyAndAdd(A[x], B[y], c); \
- c = p.GetLowHalf(); \
- p = (DWord) d + p.GetHighHalf(); \
- d = p.GetLowHalf(); \
- e += p.GetHighHalf();
-
-#define SaveMulAcc(s, x, y) \
- R[s] = c; \
- p = DWord::MultiplyAndAdd(A[x], B[y], d); \
- c = p.GetLowHalf(); \
- p = (DWord) e + p.GetHighHalf(); \
- d = p.GetLowHalf(); \
- e = p.GetHighHalf();
-
-#define SquAcc(x, y) \
- q = DWord::Multiply(A[x], A[y]); \
- p = q + c; \
- c = p.GetLowHalf(); \
- p = (DWord) d + p.GetHighHalf(); \
- d = p.GetLowHalf(); \
- e += p.GetHighHalf(); \
- p = q + c; \
- c = p.GetLowHalf(); \
- p = (DWord) d + p.GetHighHalf(); \
- d = p.GetLowHalf(); \
- e += p.GetHighHalf();
-
-#define SaveSquAcc(s, x, y) \
- R[s] = c; \
- q = DWord::Multiply(A[x], A[y]); \
- p = q + d; \
- c = p.GetLowHalf(); \
- p = (DWord) e + p.GetHighHalf(); \
- d = p.GetLowHalf(); \
- e = p.GetHighHalf(); \
- p = q + c; \
- c = p.GetLowHalf(); \
- p = (DWord) d + p.GetHighHalf(); \
- d = p.GetLowHalf(); \
- e += p.GetHighHalf();
-
-void Portable::Multiply4(word *R, const word *A, const word *B)
-{
- DWord p;
- word c, d, e;
-
- p = DWord::Multiply(A[0], B[0]);
- R[0] = p.GetLowHalf();
- c = p.GetHighHalf();
- d = e = 0;
-
- MulAcc(0, 1);
- MulAcc(1, 0);
-
- SaveMulAcc(1, 2, 0);
- MulAcc(1, 1);
- MulAcc(0, 2);
-
- SaveMulAcc(2, 0, 3);
- MulAcc(1, 2);
- MulAcc(2, 1);
- MulAcc(3, 0);
-
- SaveMulAcc(3, 3, 1);
- MulAcc(2, 2);
- MulAcc(1, 3);
-
- SaveMulAcc(4, 2, 3);
- MulAcc(3, 2);
-
- R[5] = c;
- p = DWord::MultiplyAndAdd(A[3], B[3], d);
- R[6] = p.GetLowHalf();
- R[7] = e + p.GetHighHalf();
-}
-
-void Portable::Square2(word *R, const word *A)
-{
- DWord p, q;
- word c, d, e;
-
- p = DWord::Multiply(A[0], A[0]);
- R[0] = p.GetLowHalf();
- c = p.GetHighHalf();
- d = e = 0;
-
- SquAcc(0, 1);
-
- R[1] = c;
- p = DWord::MultiplyAndAdd(A[1], A[1], d);
- R[2] = p.GetLowHalf();
- R[3] = e + p.GetHighHalf();
-}
-
-void Portable::Square4(word *R, const word *A)
-{
-#ifdef _MSC_VER
- // VC60 workaround: MSVC 6.0 has an optimization bug that makes
- // (dword)A*B where either A or B has been cast to a dword before
- // very expensive. Revisit this function when this
- // bug is fixed.
- Multiply4(R, A, A);
-#else
- const word *B = A;
- DWord p, q;
- word c, d, e;
+ // now: eax = A, ebx = B, edx = C, ecx = N
+ AS2( lea eax, [eax+4*ecx])
+ AS2( lea ebx, [ebx+4*ecx])
+ AS2( lea edx, [edx+4*ecx])
+
+ AS1( neg ecx) // ecx is negative index
+ AS2( pxor mm2, mm2)
+ ASJ( jz, 2, f)
+ AS2( test ecx, 2) // this clears carry flag
+ ASJ( jz, 0, f)
+ AS2( sub ecx, 2)
+ ASJ( jmp, 1, f)
+
+ ASL(0)
+ AS2( movd mm0, DWORD PTR [eax+4*ecx])
+ AS2( movd mm1, DWORD PTR [ebx+4*ecx])
+ AS2( paddq mm0, mm1)
+ AS2( paddq mm2, mm0)
+ AS2( movd DWORD PTR [edx+4*ecx], mm2)
+ AS2( psrlq mm2, 32)
+
+ AS2( movd mm0, DWORD PTR [eax+4*ecx+4])
+ AS2( movd mm1, DWORD PTR [ebx+4*ecx+4])
+ AS2( paddq mm0, mm1)
+ AS2( paddq mm2, mm0)
+ AS2( movd DWORD PTR [edx+4*ecx+4], mm2)
+ AS2( psrlq mm2, 32)
+
+ ASL(1)
+ AS2( movd mm0, DWORD PTR [eax+4*ecx+8])
+ AS2( movd mm1, DWORD PTR [ebx+4*ecx+8])
+ AS2( paddq mm0, mm1)
+ AS2( paddq mm2, mm0)
+ AS2( movd DWORD PTR [edx+4*ecx+8], mm2)
+ AS2( psrlq mm2, 32)
+
+ AS2( movd mm0, DWORD PTR [eax+4*ecx+12])
+ AS2( movd mm1, DWORD PTR [ebx+4*ecx+12])
+ AS2( paddq mm0, mm1)
+ AS2( paddq mm2, mm0)
+ AS2( movd DWORD PTR [edx+4*ecx+12], mm2)
+ AS2( psrlq mm2, 32)
+
+ AS2( add ecx, 4)
+ ASJ( jnz, 0, b)
+
+ ASL(2)
+ AS2( movd eax, mm2)
+ AS1( emms)
- p = DWord::Multiply(A[0], A[0]);
- R[0] = p.GetLowHalf();
- c = p.GetHighHalf();
- d = e = 0;
+ AddEpilogue
+}
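SSE2_Add needs no flags register at all: each 32-bit word pair is widened into a 64-bit MMX lane, so the sum plus the incoming carry cannot wrap, and a shift exposes the carry bit for the next word. One step, as scalar C:

    word64 t = (word64)A[i] + B[i] + carry;   // paddq mm0, mm1 / paddq mm2, mm0
    C[i]  = (word32)t;                        // movd DWORD PTR [edx+4*ecx], mm2
    carry = (word32)(t >> 32);                // psrlq mm2, 32

SSE2_Sub below works the same way, except a borrow leaves the 64-bit difference negative, so the borrow bit sits in bit 63 and is extracted with psrlq by 63 instead of 32.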
- SquAcc(0, 1);
+CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B)
+{
+ AddPrologue
- SaveSquAcc(1, 2, 0);
- MulAcc(1, 1);
+ // now: eax = A, ebx = B, edx = C, ecx = N
+ AS2( lea eax, [eax+4*ecx])
+ AS2( lea ebx, [ebx+4*ecx])
+ AS2( lea edx, [edx+4*ecx])
+
+ AS1( neg ecx) // ecx is negative index
+ AS2( pxor mm2, mm2)
+ ASJ( jz, 2, f)
+ AS2( test ecx, 2) // this clears carry flag
+ ASJ( jz, 0, f)
+ AS2( sub ecx, 2)
+ ASJ( jmp, 1, f)
+
+ ASL(0)
+ AS2( movd mm0, DWORD PTR [eax+4*ecx])
+ AS2( movd mm1, DWORD PTR [ebx+4*ecx])
+ AS2( psubq mm0, mm1)
+ AS2( psubq mm0, mm2)
+ AS2( movd DWORD PTR [edx+4*ecx], mm0)
+ AS2( psrlq mm0, 63)
+
+ AS2( movd mm2, DWORD PTR [eax+4*ecx+4])
+ AS2( movd mm1, DWORD PTR [ebx+4*ecx+4])
+ AS2( psubq mm2, mm1)
+ AS2( psubq mm2, mm0)
+ AS2( movd DWORD PTR [edx+4*ecx+4], mm2)
+ AS2( psrlq mm2, 63)
+
+ ASL(1)
+ AS2( movd mm0, DWORD PTR [eax+4*ecx+8])
+ AS2( movd mm1, DWORD PTR [ebx+4*ecx+8])
+ AS2( psubq mm0, mm1)
+ AS2( psubq mm0, mm2)
+ AS2( movd DWORD PTR [edx+4*ecx+8], mm0)
+ AS2( psrlq mm0, 63)
+
+ AS2( movd mm2, DWORD PTR [eax+4*ecx+12])
+ AS2( movd mm1, DWORD PTR [ebx+4*ecx+12])
+ AS2( psubq mm2, mm1)
+ AS2( psubq mm2, mm0)
+ AS2( movd DWORD PTR [edx+4*ecx+12], mm2)
+ AS2( psrlq mm2, 63)
+
+ AS2( add ecx, 4)
+ ASJ( jnz, 0, b)
+
+ ASL(2)
+ AS2( movd eax, mm2)
+ AS1( emms)
- SaveSquAcc(2, 0, 3);
- SquAcc(1, 2);
+ AddEpilogue
+}
+#else
+int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
+{
+ assert (N%2 == 0);
- SaveSquAcc(3, 3, 1);
- MulAcc(2, 2);
+ Declare2Words(u);
+ for (size_t i=0; i<N; i+=2)
+ {
+ AddWithCarry(u, A[i], B[i]);
+ C[i] = LowWord(u);
+ AddWithCarry(u, A[i+1], B[i+1]);
+ C[i+1] = LowWord(u);
+ }
+ return int(GetCarry(u));
+}
- SaveSquAcc(4, 2, 3);
+int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
+{
+ assert (N%2 == 0);
- R[5] = c;
- p = DWord::MultiplyAndAdd(A[3], A[3], d);
- R[6] = p.GetLowHalf();
- R[7] = e + p.GetHighHalf();
+ Declare2Words(u);
+ for (size_t i=0; i<N; i+=2)
+ {
+ SubtractWithBorrow(u, A[i], B[i]);
+ C[i] = LowWord(u);
+ SubtractWithBorrow(u, A[i+1], B[i+1]);
+ C[i+1] = LowWord(u);
+ }
+ return int(GetBorrow(u));
+}
#endif
+
+static word LinearMultiply(word *C, const word *A, word B, size_t N)
+{
+ word carry=0;
+ for(unsigned i=0; i<N; i++)
+ {
+ Declare2Words(p);
+ MultiplyWords(p, A[i], B);
+ Acc2WordsBy1(p, carry);
+ C[i] = LowWord(p);
+ carry = HighWord(p);
+ }
+ return carry;
}
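LinearMultiply is the schoolbook single-word step: it stores the low N words of A * B in C and returns the top word as the carry. A usage sketch:

    word a[4] = {1, 2, 3, 4}, b = 7, prod[5];
    prod[4] = LinearMultiply(prod, a, b, 4);   // prod = a * b, 5 words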
-void Portable::Multiply8(word *R, const word *A, const word *B)
-{
- DWord p;
- word c, d, e;
-
- p = DWord::Multiply(A[0], B[0]);
- R[0] = p.GetLowHalf();
- c = p.GetHighHalf();
- d = e = 0;
-
- MulAcc(0, 1);
- MulAcc(1, 0);
-
- SaveMulAcc(1, 2, 0);
- MulAcc(1, 1);
- MulAcc(0, 2);
+#define Mul_2 \
+ Mul_Begin(2) \
+ Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
+ Mul_End(2)
+
+#define Mul_4 \
+ Mul_Begin(4) \
+ Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
+ Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
+ Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
+ Mul_SaveAcc(3, 1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) \
+ Mul_SaveAcc(4, 2, 3) Mul_Acc(3, 2) \
+ Mul_End(4)
+
+#define Mul_8 \
+ Mul_Begin(8) \
+ Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
+ Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
+ Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
+ Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
+ Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
+ Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
+ Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
+ Mul_SaveAcc(7, 1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
+ Mul_SaveAcc(8, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
+ Mul_SaveAcc(9, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
+ Mul_SaveAcc(10, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
+ Mul_SaveAcc(11, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
+ Mul_SaveAcc(12, 6, 7) Mul_Acc(7, 6) \
+ Mul_End(8)
+
+#define Mul_16 \
+ Mul_Begin(16) \
+ Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
+ Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
+ Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
+ Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
+ Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
+ Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
+ Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
+ Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
+ Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
+ Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
+ Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
+ Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
+ Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
+ Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
+ Mul_SaveAcc(14, 0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
+ Mul_SaveAcc(15, 1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
+ Mul_SaveAcc(16, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
+ Mul_SaveAcc(17, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
+ Mul_SaveAcc(18, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
+ Mul_SaveAcc(19, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
+ Mul_SaveAcc(20, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
+ Mul_SaveAcc(21, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
+ Mul_SaveAcc(22, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
+ Mul_SaveAcc(23, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
+ Mul_SaveAcc(24, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
+ Mul_SaveAcc(25, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
+ Mul_SaveAcc(26, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
+ Mul_SaveAcc(27, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
+ Mul_SaveAcc(28, 14, 15) Mul_Acc(15, 14) \
+ Mul_End(16)
+
+#define Squ_2 \
+ Squ_Begin(2) \
+ Squ_End(2)
+
+#define Squ_4 \
+ Squ_Begin(4) \
+ Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
+ Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
+ Squ_SaveAcc(3, 1, 3) Squ_Diag(2) \
+ Squ_SaveAcc(4, 2, 3) Squ_NonDiag \
+ Squ_End(4)
+
+#define Squ_8 \
+ Squ_Begin(8) \
+ Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
+ Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
+ Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
+ Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
+ Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
+ Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
+ Squ_SaveAcc(7, 1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
+ Squ_SaveAcc(8, 2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \
+ Squ_SaveAcc(9, 3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
+ Squ_SaveAcc(10, 4, 7) Squ_Acc(5, 6) Squ_NonDiag \
+ Squ_SaveAcc(11, 5, 7) Squ_Diag(6) \
+ Squ_SaveAcc(12, 6, 7) Squ_NonDiag \
+ Squ_End(8)
+
+#define Squ_16 \
+ Squ_Begin(16) \
+ Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
+ Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
+ Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
+ Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
+ Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
+ Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
+ Squ_SaveAcc(7, 0, 8) Squ_Acc(1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
+ Squ_SaveAcc(8, 0, 9) Squ_Acc(1, 8) Squ_Acc(2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \
+ Squ_SaveAcc(9, 0, 10) Squ_Acc(1, 9) Squ_Acc(2, 8) Squ_Acc(3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
+ Squ_SaveAcc(10, 0, 11) Squ_Acc(1, 10) Squ_Acc(2, 9) Squ_Acc(3, 8) Squ_Acc(4, 7) Squ_Acc(5, 6) Squ_NonDiag \
+ Squ_SaveAcc(11, 0, 12) Squ_Acc(1, 11) Squ_Acc(2, 10) Squ_Acc(3, 9) Squ_Acc(4, 8) Squ_Acc(5, 7) Squ_Diag(6) \
+ Squ_SaveAcc(12, 0, 13) Squ_Acc(1, 12) Squ_Acc(2, 11) Squ_Acc(3, 10) Squ_Acc(4, 9) Squ_Acc(5, 8) Squ_Acc(6, 7) Squ_NonDiag \
+ Squ_SaveAcc(13, 0, 14) Squ_Acc(1, 13) Squ_Acc(2, 12) Squ_Acc(3, 11) Squ_Acc(4, 10) Squ_Acc(5, 9) Squ_Acc(6, 8) Squ_Diag(7) \
+ Squ_SaveAcc(14, 0, 15) Squ_Acc(1, 14) Squ_Acc(2, 13) Squ_Acc(3, 12) Squ_Acc(4, 11) Squ_Acc(5, 10) Squ_Acc(6, 9) Squ_Acc(7, 8) Squ_NonDiag \
+ Squ_SaveAcc(15, 1, 15) Squ_Acc(2, 14) Squ_Acc(3, 13) Squ_Acc(4, 12) Squ_Acc(5, 11) Squ_Acc(6, 10) Squ_Acc(7, 9) Squ_Diag(8) \
+ Squ_SaveAcc(16, 2, 15) Squ_Acc(3, 14) Squ_Acc(4, 13) Squ_Acc(5, 12) Squ_Acc(6, 11) Squ_Acc(7, 10) Squ_Acc(8, 9) Squ_NonDiag \
+ Squ_SaveAcc(17, 3, 15) Squ_Acc(4, 14) Squ_Acc(5, 13) Squ_Acc(6, 12) Squ_Acc(7, 11) Squ_Acc(8, 10) Squ_Diag(9) \
+ Squ_SaveAcc(18, 4, 15) Squ_Acc(5, 14) Squ_Acc(6, 13) Squ_Acc(7, 12) Squ_Acc(8, 11) Squ_Acc(9, 10) Squ_NonDiag \
+ Squ_SaveAcc(19, 5, 15) Squ_Acc(6, 14) Squ_Acc(7, 13) Squ_Acc(8, 12) Squ_Acc(9, 11) Squ_Diag(10) \
+ Squ_SaveAcc(20, 6, 15) Squ_Acc(7, 14) Squ_Acc(8, 13) Squ_Acc(9, 12) Squ_Acc(10, 11) Squ_NonDiag \
+ Squ_SaveAcc(21, 7, 15) Squ_Acc(8, 14) Squ_Acc(9, 13) Squ_Acc(10, 12) Squ_Diag(11) \
+ Squ_SaveAcc(22, 8, 15) Squ_Acc(9, 14) Squ_Acc(10, 13) Squ_Acc(11, 12) Squ_NonDiag \
+ Squ_SaveAcc(23, 9, 15) Squ_Acc(10, 14) Squ_Acc(11, 13) Squ_Diag(12) \
+ Squ_SaveAcc(24, 10, 15) Squ_Acc(11, 14) Squ_Acc(12, 13) Squ_NonDiag \
+ Squ_SaveAcc(25, 11, 15) Squ_Acc(12, 14) Squ_Diag(13) \
+ Squ_SaveAcc(26, 12, 15) Squ_Acc(13, 14) Squ_NonDiag \
+ Squ_SaveAcc(27, 13, 15) Squ_Diag(14) \
+ Squ_SaveAcc(28, 14, 15) Squ_NonDiag \
+ Squ_End(16)
+
+#define Bot_2 \
+ Mul_Begin(2) \
+ Bot_SaveAcc(0, 0, 1) Bot_Acc(1, 0) \
+ Bot_End(2)
+
+#define Bot_4 \
+ Mul_Begin(4) \
+ Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
+ Mul_SaveAcc(1, 2, 0) Mul_Acc(1, 1) Mul_Acc(0, 2) \
+ Bot_SaveAcc(2, 0, 3) Bot_Acc(1, 2) Bot_Acc(2, 1) Bot_Acc(3, 0) \
+ Bot_End(4)
+
+#define Bot_8 \
+ Mul_Begin(8) \
+ Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
+ Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
+ Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
+ Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
+ Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
+ Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
+ Bot_SaveAcc(6, 0, 7) Bot_Acc(1, 6) Bot_Acc(2, 5) Bot_Acc(3, 4) Bot_Acc(4, 3) Bot_Acc(5, 2) Bot_Acc(6, 1) Bot_Acc(7, 0) \
+ Bot_End(8)
+
+#define Bot_16 \
+ Mul_Begin(16) \
+ Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
+ Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
+ Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
+ Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
+ Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
+ Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
+ Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
+ Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
+ Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
+ Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
+ Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
+ Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
+ Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
+ Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
+ Bot_SaveAcc(14, 0, 15) Bot_Acc(1, 14) Bot_Acc(2, 13) Bot_Acc(3, 12) Bot_Acc(4, 11) Bot_Acc(5, 10) Bot_Acc(6, 9) Bot_Acc(7, 8) Bot_Acc(8, 7) Bot_Acc(9, 6) Bot_Acc(10, 5) Bot_Acc(11, 4) Bot_Acc(12, 3) Bot_Acc(13, 2) Bot_Acc(14, 1) Bot_Acc(15, 0) \
+ Bot_End(16)
+
+#define Mul_Begin(n) \
+ Declare2Words(p) \
+ Declare2Words(c) \
+ Declare2Words(d) \
+ MultiplyWords(p, A[0], B[0]) \
+ AssignWord(c, LowWord(p)) \
+ AssignWord(d, HighWord(p))
+
+#define Mul_Acc(i, j) \
+ MultiplyWords(p, A[i], B[j]) \
+ Acc2WordsBy1(c, LowWord(p)) \
+ Acc2WordsBy1(d, HighWord(p))
+
+#define Mul_SaveAcc(k, i, j) \
+ R[k] = LowWord(c); \
+ Add2WordsBy1(c, d, HighWord(c)) \
+ MultiplyWords(p, A[i], B[j]) \
+ AssignWord(d, HighWord(p)) \
+ Acc2WordsBy1(c, LowWord(p))
+
+#define Mul_End(n) \
+ R[2*n-3] = LowWord(c); \
+ Acc2WordsBy1(d, HighWord(c)) \
+ MultiplyWords(p, A[n-1], B[n-1])\
+ Acc2WordsBy2(d, p) \
+ R[2*n-2] = LowWord(d); \
+ R[2*n-1] = HighWord(d);
+
+#define Bot_SaveAcc(k, i, j) \
+ R[k] = LowWord(c); \
+ word e = LowWord(d) + HighWord(c); \
+ e += A[i] * B[j];
+
+#define Bot_Acc(i, j) \
+ e += A[i] * B[j];
+
+#define Bot_End(n) \
+ R[n-1] = e;
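These macros implement Comba-style (column-wise) multiplication: every partial product A[i]*B[j] with i+j == k is summed into the running accumulator c, overflow spills into d, and then column k is stored. With a native dword, Mul_2 expands to roughly:

    dword p, c, d;
    p = (dword)A[0]*B[0]; c = LowWord(p); d = HighWord(p);    // Mul_Begin(2)
    R[0] = LowWord(c);                                        // Mul_SaveAcc(0, 0, 1)
    c = d + HighWord(c);
    p = (dword)A[0]*B[1]; d = HighWord(p); c += LowWord(p);
    p = (dword)A[1]*B[0]; c += LowWord(p); d += HighWord(p);  // Mul_Acc(1, 0)
    R[1] = LowWord(c);                                        // Mul_End(2)
    d += HighWord(c);
    d += (dword)A[1]*B[1];
    R[2] = LowWord(d); R[3] = HighWord(d);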
- SaveMulAcc(2, 0, 3);
- MulAcc(1, 2);
- MulAcc(2, 1);
- MulAcc(3, 0);
-
- SaveMulAcc(3, 0, 4);
- MulAcc(1, 3);
- MulAcc(2, 2);
- MulAcc(3, 1);
- MulAcc(4, 0);
-
- SaveMulAcc(4, 0, 5);
- MulAcc(1, 4);
- MulAcc(2, 3);
- MulAcc(3, 2);
- MulAcc(4, 1);
- MulAcc(5, 0);
-
- SaveMulAcc(5, 0, 6);
- MulAcc(1, 5);
- MulAcc(2, 4);
- MulAcc(3, 3);
- MulAcc(4, 2);
- MulAcc(5, 1);
- MulAcc(6, 0);
-
- SaveMulAcc(6, 0, 7);
- MulAcc(1, 6);
- MulAcc(2, 5);
- MulAcc(3, 4);
- MulAcc(4, 3);
- MulAcc(5, 2);
- MulAcc(6, 1);
- MulAcc(7, 0);
-
- SaveMulAcc(7, 1, 7);
- MulAcc(2, 6);
- MulAcc(3, 5);
- MulAcc(4, 4);
- MulAcc(5, 3);
- MulAcc(6, 2);
- MulAcc(7, 1);
-
- SaveMulAcc(8, 2, 7);
- MulAcc(3, 6);
- MulAcc(4, 5);
- MulAcc(5, 4);
- MulAcc(6, 3);
- MulAcc(7, 2);
-
- SaveMulAcc(9, 3, 7);
- MulAcc(4, 6);
- MulAcc(5, 5);
- MulAcc(6, 4);
- MulAcc(7, 3);
+/*
+// this is slower on MSVC 2005 Win32
+#define Mul_Begin(n) \
+ Declare2Words(p) \
+ word c; \
+ Declare2Words(d) \
+ MultiplyWords(p, A[0], B[0]) \
+ c = LowWord(p); \
+ AssignWord(d, HighWord(p))
+
+#define Mul_Acc(i, j) \
+ MultiplyWords(p, A[i], B[j]) \
+ Acc2WordsBy1(p, c) \
+ c = LowWord(p); \
+ Acc2WordsBy1(d, HighWord(p))
+
+#define Mul_SaveAcc(k, i, j) \
+ R[k] = c; \
+ MultiplyWords(p, A[i], B[j]) \
+ Acc2WordsBy1(p, LowWord(d)) \
+ c = LowWord(p); \
+ AssignWord(d, HighWord(d)) \
+ Acc2WordsBy1(d, HighWord(p))
+
+#define Mul_End(n) \
+ R[2*n-3] = c; \
+ MultiplyWords(p, A[n-1], B[n-1])\
+ Acc2WordsBy2(d, p) \
+ R[2*n-2] = LowWord(d); \
+ R[2*n-1] = HighWord(d);
+
+#define Bot_SaveAcc(k, i, j) \
+ R[k] = c; \
+ c = LowWord(d); \
+ c += A[i] * B[j];
+
+#define Bot_Acc(i, j) \
+ c += A[i] * B[j];
+
+#define Bot_End(n) \
+ R[n-1] = c;
+*/
- SaveMulAcc(10, 4, 7);
- MulAcc(5, 6);
- MulAcc(6, 5);
- MulAcc(7, 4);
-
- SaveMulAcc(11, 5, 7);
- MulAcc(6, 6);
- MulAcc(7, 5);
-
- SaveMulAcc(12, 6, 7);
- MulAcc(7, 6);
-
- R[13] = c;
- p = DWord::MultiplyAndAdd(A[7], B[7], d);
- R[14] = p.GetLowHalf();
- R[15] = e + p.GetHighHalf();
-}
-
-void Portable::Multiply4Bottom(word *R, const word *A, const word *B)
-{
- DWord p;
- word c, d, e;
-
- p = DWord::Multiply(A[0], B[0]);
- R[0] = p.GetLowHalf();
- c = p.GetHighHalf();
- d = e = 0;
-
- MulAcc(0, 1);
- MulAcc(1, 0);
-
- SaveMulAcc(1, 2, 0);
- MulAcc(1, 1);
- MulAcc(0, 2);
-
- R[2] = c;
- R[3] = d + A[0] * B[3] + A[1] * B[2] + A[2] * B[1] + A[3] * B[0];
-}
-
-void Portable::Multiply8Bottom(word *R, const word *A, const word *B)
-{
- DWord p;
- word c, d, e;
-
- p = DWord::Multiply(A[0], B[0]);
- R[0] = p.GetLowHalf();
- c = p.GetHighHalf();
- d = e = 0;
-
- MulAcc(0, 1);
- MulAcc(1, 0);
-
- SaveMulAcc(1, 2, 0);
- MulAcc(1, 1);
- MulAcc(0, 2);
-
- SaveMulAcc(2, 0, 3);
- MulAcc(1, 2);
- MulAcc(2, 1);
- MulAcc(3, 0);
-
- SaveMulAcc(3, 0, 4);
- MulAcc(1, 3);
- MulAcc(2, 2);
- MulAcc(3, 1);
- MulAcc(4, 0);
-
- SaveMulAcc(4, 0, 5);
- MulAcc(1, 4);
- MulAcc(2, 3);
- MulAcc(3, 2);
- MulAcc(4, 1);
- MulAcc(5, 0);
-
- SaveMulAcc(5, 0, 6);
- MulAcc(1, 5);
- MulAcc(2, 4);
- MulAcc(3, 3);
- MulAcc(4, 2);
- MulAcc(5, 1);
- MulAcc(6, 0);
+#define Squ_Begin(n) \
+ Declare2Words(p) \
+ Declare2Words(c) \
+ Declare2Words(d) \
+ Declare2Words(e) \
+ MultiplyWords(p, A[0], A[0]) \
+ R[0] = LowWord(p); \
+ AssignWord(e, HighWord(p)) \
+ MultiplyWords(p, A[0], A[1]) \
+ AssignWord(c, LowWord(p)) \
+ AssignWord(d, HighWord(p)) \
+ Squ_NonDiag \
- R[6] = c;
- R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] +
- A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0];
-}
+#define Squ_NonDiag \
+ Double2Words(c) \
+ Double2Words(d) \
-#undef MulAcc
-#undef SaveMulAcc
-#undef SquAcc
-#undef SaveSquAcc
+#define Squ_SaveAcc(k, i, j) \
+ Acc2WordsBy2(c, e) \
+ R[k] = LowWord(c); \
+ Add2WordsBy1(e, d, HighWord(c)) \
+ MultiplyWords(p, A[i], A[j]) \
+ AssignWord(c, LowWord(p)) \
+ AssignWord(d, HighWord(p)) \
-#ifdef CRYPTOPP_X86ASM_AVAILABLE
+#define Squ_Acc(i, j) \
+ MultiplyWords(p, A[i], A[j]) \
+ Acc2WordsBy1(c, LowWord(p)) \
+ Acc2WordsBy1(d, HighWord(p))
-// ************** x86 feature detection ***************
+#define Squ_Diag(i) \
+ Squ_NonDiag \
+ MultiplyWords(p, A[i], A[i]) \
+ Acc2WordsBy1(c, LowWord(p)) \
+ Acc2WordsBy1(d, HighWord(p)) \
-static bool s_sse2Enabled = true;
+#define Squ_End(n) \
+ Acc2WordsBy2(c, e) \
+ R[2*n-3] = LowWord(c); \
+ Acc2WordsBy1(d, HighWord(c)) \
+ MultiplyWords(p, A[n-1], A[n-1])\
+ Acc2WordsBy2(d, p) \
+ R[2*n-2] = LowWord(d); \
+ R[2*n-1] = HighWord(d);
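Squaring reuses the column scheme but exploits symmetry: each off-diagonal product A[i]*A[j] with i != j occurs twice in the square, so it is computed once and the column accumulators are doubled (Squ_NonDiag) before any diagonal term is added (Squ_Diag). In effect, for output column k:

    column(k) = 2 * sum of A[i]*A[j] over i < j with i+j == k
                + (A[k/2]*A[k/2] if k is even)

which cuts the number of word multiplications roughly in half compared with multiplying A by itself through the general routines.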
-static void CpuId(word32 input, word32 *output)
+void Baseline_Multiply2(word *R, const word *A, const word *B)
{
-#ifdef __GNUC__
- __asm__
- (
- // save ebx in case -fPIC is being used
- "push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx"
- : "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d" (output[3])
- : "a" (input)
- );
-#else
- __asm
- {
- mov eax, input
- cpuid
- mov edi, output
- mov [edi], eax
- mov [edi+4], ebx
- mov [edi+8], ecx
- mov [edi+12], edx
- }
-#endif
+ Mul_2
}
-#ifdef SSE2_INTRINSICS_AVAILABLE
-#ifndef _MSC_VER
-static jmp_buf s_env;
-static void SigIllHandler(int)
+void Baseline_Multiply4(word *R, const word *A, const word *B)
{
- longjmp(s_env, 1);
+ Mul_4
}
-#endif
-static bool HasSSE2()
+void Baseline_Multiply8(word *R, const word *A, const word *B)
{
- if (!s_sse2Enabled)
- return false;
-
- word32 cpuid[4];
- CpuId(1, cpuid);
- if ((cpuid[3] & (1 << 26)) == 0)
- return false;
-
-#ifdef _MSC_VER
- __try
- {
- __asm xorpd xmm0, xmm0 // executing SSE2 instruction
- }
- __except (1)
- {
- return false;
- }
- return true;
-#else
- typedef void (*SigHandler)(int);
+ Mul_8
+}
- SigHandler oldHandler = signal(SIGILL, SigIllHandler);
- if (oldHandler == SIG_ERR)
- return false;
+void Baseline_Square2(word *R, const word *A)
+{
+ Squ_2
+}
- bool result = true;
- if (setjmp(s_env))
- result = false;
- else
- __asm __volatile ("xorps %xmm0, %xmm0");
+void Baseline_Square4(word *R, const word *A)
+{
+ Squ_4
+}
- signal(SIGILL, oldHandler);
- return result;
-#endif
+void Baseline_Square8(word *R, const word *A)
+{
+ Squ_8
}
-#endif
-static bool IsP4()
+void Baseline_MultiplyBottom2(word *R, const word *A, const word *B)
{
- word32 cpuid[4];
+ Bot_2
+}
- CpuId(0, cpuid);
- std::swap(cpuid[2], cpuid[3]);
- if (memcmp(cpuid+1, "GenuineIntel", 12) != 0)
- return false;
+void Baseline_MultiplyBottom4(word *R, const word *A, const word *B)
+{
+ Bot_4
+}
- CpuId(1, cpuid);
- return ((cpuid[0] >> 8) & 0xf) == 0xf;
+void Baseline_MultiplyBottom8(word *R, const word *A, const word *B)
+{
+ Bot_8
}
-// ************** Pentium/P4 optimizations ***************
+/*
+void Baseline_Multiply16(word *R, const word *A, const word *B)
+{
+ Mul_16
+}
-class PentiumOptimized : public Portable
+void Baseline_Square16(word *R, const word *A)
{
-public:
- static int Add(word *C, const word *A, const word *B, size_t N);
- static int Subtract(word *C, const word *A, const word *B, size_t N);
- static void Multiply4(word *C, const word *A, const word *B);
- static void Multiply8(word *C, const word *A, const word *B);
- static void Multiply8Bottom(word *C, const word *A, const word *B);
-};
+ Squ_16
+}
-class P4Optimized
+void Baseline_MultiplyBottom16(word *R, const word *A, const word *B)
{
-public:
- static int Add(word *C, const word *A, const word *B, size_t N);
- static int Subtract(word *C, const word *A, const word *B, size_t N);
-#ifdef SSE2_INTRINSICS_AVAILABLE
- static void Multiply4(word *C, const word *A, const word *B);
- static void Multiply8(word *C, const word *A, const word *B);
- static void Multiply8Bottom(word *C, const word *A, const word *B);
-#endif
-};
+ Bot_16
+}
+*/
-typedef int (* PAddSub)(word *C, const word *A, const word *B, size_t N);
-typedef void (* PMul)(word *C, const word *A, const word *B);
+// ********************************************************
+
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+
+CRYPTOPP_ALIGN_DATA(16) static const word32 s_maskLow16[4] CRYPTOPP_SECTION_ALIGN16 = {0xffff,0xffff,0xffff,0xffff};
+
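s_maskLow16 holds 0x0000ffff in every 32-bit lane. The routines below appear to work on 16-bit half-words: pmuludq forms 64-bit products of which only 32 bits are significant, and pand/psrld split each partial product into low-16 and high-16 streams (accumulated in xmm4/xmm5 and xmm6/xmm7) so that many partials fit in 32-bit lanes before SSE2_SaveShift propagates carries. A scalar model of one partial product (a16 and b16 are hypothetical 16-bit halves):

    word32 p  = (word32)a16 * b16;   // pmuludq, 16x16 -> at most 32 bits
    word32 lo = p & 0xffff;          // pand with s_maskLow16
    word32 hi = p >> 16;             // psrld 16
    acc_lo += lo; acc_hi += hi;      // paddd, 16 bits of headroom per lane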
+#undef Mul_Begin
+#undef Mul_Acc
+#undef Squ_Acc
+#undef Squ_NonDiag
+#undef Squ_Diag
+#undef Squ_SaveAcc
+#undef Squ_Begin
+#undef Mul_SaveAcc
+#undef Bot_Acc
+#undef Bot_SaveAcc
+#undef Bot_End
+#undef Squ_End
+#undef Mul_End
+
+#define SSE2_FinalSave(k) \
+ AS2( psllq xmm5, 16) \
+ AS2( paddq xmm4, xmm5) \
+ AS2( movq QWORD PTR [ecx+8*(k)], xmm4)
+
+#define SSE2_SaveShift(k) \
+ AS2( movq xmm0, xmm6) \
+ AS2( punpckhqdq xmm6, xmm0) \
+ AS2( movq xmm1, xmm7) \
+ AS2( punpckhqdq xmm7, xmm1) \
+ AS2( paddd xmm6, xmm0) \
+ AS2( pslldq xmm6, 4) \
+ AS2( paddd xmm7, xmm1) \
+ AS2( paddd xmm4, xmm6) \
+ AS2( pslldq xmm7, 4) \
+ AS2( movq xmm6, xmm4) \
+ AS2( paddd xmm5, xmm7) \
+ AS2( movq xmm7, xmm5) \
+ AS2( movd DWORD PTR [ecx+8*(k)], xmm4) \
+ AS2( psrlq xmm6, 16) \
+ AS2( paddq xmm6, xmm7) \
+ AS2( punpckhqdq xmm4, xmm0) \
+ AS2( punpckhqdq xmm5, xmm0) \
+ AS2( movq QWORD PTR [ecx+8*(k)+2], xmm6) \
+ AS2( psrlq xmm6, 3*16) \
+ AS2( paddd xmm4, xmm6) \
+
+#define Squ_SSE2_SaveShift(k) \
+ AS2( movq xmm0, xmm6) \
+ AS2( punpckhqdq xmm6, xmm0) \
+ AS2( movq xmm1, xmm7) \
+ AS2( punpckhqdq xmm7, xmm1) \
+ AS2( paddd xmm6, xmm0) \
+ AS2( pslldq xmm6, 4) \
+ AS2( paddd xmm7, xmm1) \
+ AS2( paddd xmm4, xmm6) \
+ AS2( pslldq xmm7, 4) \
+ AS2( movhlps xmm6, xmm4) \
+ AS2( movd DWORD PTR [ecx+8*(k)], xmm4) \
+ AS2( paddd xmm5, xmm7) \
+ AS2( movhps QWORD PTR [esp+12], xmm5)\
+ AS2( psrlq xmm4, 16) \
+ AS2( paddq xmm4, xmm5) \
+ AS2( movq QWORD PTR [ecx+8*(k)+2], xmm4) \
+ AS2( psrlq xmm4, 3*16) \
+ AS2( paddd xmm4, xmm6) \
+ AS2( movq QWORD PTR [esp+4], xmm4)\
+
+#define SSE2_FirstMultiply(i) \
+ AS2( movdqa xmm7, [esi+(i)*16])\
+ AS2( movdqa xmm5, [edi-(i)*16])\
+ AS2( pmuludq xmm5, xmm7) \
+ AS2( movdqa xmm4, [ebx])\
+ AS2( movdqa xmm6, xmm4) \
+ AS2( pand xmm4, xmm5) \
+ AS2( psrld xmm5, 16) \
+ AS2( pmuludq xmm7, [edx-(i)*16])\
+ AS2( pand xmm6, xmm7) \
+ AS2( psrld xmm7, 16)
+
+#define Squ_Begin(n) \
+ SquPrologue \
+ AS2( mov esi, esp)\
+ AS2( and esp, 0xfffffff0)\
+ AS2( lea edi, [esp-32*n])\
+ AS2( sub esp, 32*n+16)\
+ AS1( push esi)\
+ AS2( mov esi, edi) \
+ AS2( xor edx, edx) \
+ ASL(1) \
+ ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \
+ ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \
+ AS2( movdqa [edi+2*edx], xmm0) \
+ AS2( psrlq xmm0, 32) \
+ AS2( movdqa [edi+2*edx+16], xmm0) \
+ AS2( movdqa [edi+16*n+2*edx], xmm1) \
+ AS2( psrlq xmm1, 32) \
+ AS2( movdqa [edi+16*n+2*edx+16], xmm1) \
+ AS2( add edx, 16) \
+ AS2( cmp edx, 8*(n)) \
+ ASJ( jne, 1, b) \
+ AS2( lea edx, [edi+16*n])\
+ SSE2_FirstMultiply(0) \
+
+#define Squ_Acc(i) \
+ ASL(LSqu##i) \
+ AS2( movdqa xmm1, [esi+(i)*16]) \
+ AS2( movdqa xmm0, [edi-(i)*16]) \
+ AS2( movdqa xmm2, [ebx]) \
+ AS2( pmuludq xmm0, xmm1) \
+ AS2( pmuludq xmm1, [edx-(i)*16]) \
+ AS2( movdqa xmm3, xmm2) \
+ AS2( pand xmm2, xmm0) \
+ AS2( psrld xmm0, 16) \
+ AS2( paddd xmm4, xmm2) \
+ AS2( paddd xmm5, xmm0) \
+ AS2( pand xmm3, xmm1) \
+ AS2( psrld xmm1, 16) \
+ AS2( paddd xmm6, xmm3) \
+ AS2( paddd xmm7, xmm1) \
+
+#define Squ_Acc1(i)
+#define Squ_Acc2(i) ASC(call, LSqu##i)
+#define Squ_Acc3(i) Squ_Acc2(i)
+#define Squ_Acc4(i) Squ_Acc2(i)
+#define Squ_Acc5(i) Squ_Acc2(i)
+#define Squ_Acc6(i) Squ_Acc2(i)
+#define Squ_Acc7(i) Squ_Acc2(i)
+#define Squ_Acc8(i) Squ_Acc2(i)
+
+#define SSE2_End(E, n) \
+ SSE2_SaveShift(2*(n)-3) \
+ AS2( movdqa xmm7, [esi+16]) \
+ AS2( movdqa xmm0, [edi]) \
+ AS2( pmuludq xmm0, xmm7) \
+ AS2( movdqa xmm2, [ebx]) \
+ AS2( pmuludq xmm7, [edx]) \
+ AS2( movdqa xmm6, xmm2) \
+ AS2( pand xmm2, xmm0) \
+ AS2( psrld xmm0, 16) \
+ AS2( paddd xmm4, xmm2) \
+ AS2( paddd xmm5, xmm0) \
+ AS2( pand xmm6, xmm7) \
+ AS2( psrld xmm7, 16) \
+ SSE2_SaveShift(2*(n)-2) \
+ SSE2_FinalSave(2*(n)-1) \
+ AS1( pop esp)\
+ E
+
+#define Squ_End(n) SSE2_End(SquEpilogue, n)
+#define Mul_End(n) SSE2_End(MulEpilogue, n)
+#define Top_End(n) SSE2_End(TopEpilogue, n)
+
+#define Squ_Column1(k, i) \
+ Squ_SSE2_SaveShift(k) \
+ AS2( add esi, 16) \
+ SSE2_FirstMultiply(1)\
+ Squ_Acc##i(i) \
+ AS2( paddd xmm4, xmm4) \
+ AS2( paddd xmm5, xmm5) \
+ AS2( movdqa xmm3, [esi]) \
+ AS2( movq xmm1, QWORD PTR [esi+8]) \
+ AS2( pmuludq xmm1, xmm3) \
+ AS2( pmuludq xmm3, xmm3) \
+ AS2( movdqa xmm0, [ebx])\
+ AS2( movdqa xmm2, xmm0) \
+ AS2( pand xmm0, xmm1) \
+ AS2( psrld xmm1, 16) \
+ AS2( paddd xmm6, xmm0) \
+ AS2( paddd xmm7, xmm1) \
+ AS2( pand xmm2, xmm3) \
+ AS2( psrld xmm3, 16) \
+ AS2( paddd xmm6, xmm6) \
+ AS2( paddd xmm7, xmm7) \
+ AS2( paddd xmm4, xmm2) \
+ AS2( paddd xmm5, xmm3) \
+ AS2( movq xmm0, QWORD PTR [esp+4])\
+ AS2( movq xmm1, QWORD PTR [esp+12])\
+ AS2( paddd xmm4, xmm0)\
+ AS2( paddd xmm5, xmm1)\
+
+#define Squ_Column0(k, i) \
+ Squ_SSE2_SaveShift(k) \
+ AS2( add edi, 16) \
+ AS2( add edx, 16) \
+ SSE2_FirstMultiply(1)\
+ Squ_Acc##i(i) \
+ AS2( paddd xmm6, xmm6) \
+ AS2( paddd xmm7, xmm7) \
+ AS2( paddd xmm4, xmm4) \
+ AS2( paddd xmm5, xmm5) \
+ AS2( movq xmm0, QWORD PTR [esp+4])\
+ AS2( movq xmm1, QWORD PTR [esp+12])\
+ AS2( paddd xmm4, xmm0)\
+ AS2( paddd xmm5, xmm1)\
+
+#define SSE2_MulAdd45 \
+ AS2( movdqa xmm7, [esi]) \
+ AS2( movdqa xmm0, [edi]) \
+ AS2( pmuludq xmm0, xmm7) \
+ AS2( movdqa xmm2, [ebx]) \
+ AS2( pmuludq xmm7, [edx]) \
+ AS2( movdqa xmm6, xmm2) \
+ AS2( pand xmm2, xmm0) \
+ AS2( psrld xmm0, 16) \
+ AS2( paddd xmm4, xmm2) \
+ AS2( paddd xmm5, xmm0) \
+ AS2( pand xmm6, xmm7) \
+ AS2( psrld xmm7, 16)
+
+#define Mul_Begin(n) \
+ MulPrologue \
+ AS2( mov esi, esp)\
+ AS2( and esp, 0xfffffff0)\
+ AS2( sub esp, 48*n+16)\
+ AS1( push esi)\
+ AS2( xor edx, edx) \
+ ASL(1) \
+ ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \
+ ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \
+ ASS( pshufd xmm2, [edi+edx], 3,1,2,0) \
+ AS2( movdqa [esp+20+2*edx], xmm0) \
+ AS2( psrlq xmm0, 32) \
+ AS2( movdqa [esp+20+2*edx+16], xmm0) \
+ AS2( movdqa [esp+20+16*n+2*edx], xmm1) \
+ AS2( psrlq xmm1, 32) \
+ AS2( movdqa [esp+20+16*n+2*edx+16], xmm1) \
+ AS2( movdqa [esp+20+32*n+2*edx], xmm2) \
+ AS2( psrlq xmm2, 32) \
+ AS2( movdqa [esp+20+32*n+2*edx+16], xmm2) \
+ AS2( add edx, 16) \
+ AS2( cmp edx, 8*(n)) \
+ ASJ( jne, 1, b) \
+ AS2( lea edi, [esp+20])\
+ AS2( lea edx, [esp+20+16*n])\
+ AS2( lea esi, [esp+20+32*n])\
+ SSE2_FirstMultiply(0) \
+
+#define Mul_Acc(i) \
+ ASL(LMul##i) \
+ AS2( movdqa xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16]) \
+ AS2( movdqa xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16]) \
+ AS2( movdqa xmm2, [ebx]) \
+ AS2( pmuludq xmm0, xmm1) \
+ AS2( pmuludq xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \
+ AS2( movdqa xmm3, xmm2) \
+ AS2( pand xmm2, xmm0) \
+ AS2( psrld xmm0, 16) \
+ AS2( paddd xmm4, xmm2) \
+ AS2( paddd xmm5, xmm0) \
+ AS2( pand xmm3, xmm1) \
+ AS2( psrld xmm1, 16) \
+ AS2( paddd xmm6, xmm3) \
+ AS2( paddd xmm7, xmm1) \
+
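The displacement i/2*(1-(i-2*(i/2))*2)*16 is a branch-free way to write an alternating offset using only the integer arithmetic available in a macro expansion: i-2*(i/2) is i%2, so the middle factor is +1 for even i and -1 for odd i, giving +(i/2)*16 or -(i/2)*16 and walking the two operand copies in opposite directions:

    i = 2 -> +16    i = 3 -> -16
    i = 4 -> +32    i = 5 -> -32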
+#define Mul_Acc1(i)
+#define Mul_Acc2(i) ASC(call, LMul##i)
+#define Mul_Acc3(i) Mul_Acc2(i)
+#define Mul_Acc4(i) Mul_Acc2(i)
+#define Mul_Acc5(i) Mul_Acc2(i)
+#define Mul_Acc6(i) Mul_Acc2(i)
+#define Mul_Acc7(i) Mul_Acc2(i)
+#define Mul_Acc8(i) Mul_Acc2(i)
+#define Mul_Acc9(i) Mul_Acc2(i)
+#define Mul_Acc10(i) Mul_Acc2(i)
+#define Mul_Acc11(i) Mul_Acc2(i)
+#define Mul_Acc12(i) Mul_Acc2(i)
+#define Mul_Acc13(i) Mul_Acc2(i)
+#define Mul_Acc14(i) Mul_Acc2(i)
+#define Mul_Acc15(i) Mul_Acc2(i)
+#define Mul_Acc16(i) Mul_Acc2(i)
+
+#define Mul_Column1(k, i) \
+ SSE2_SaveShift(k) \
+ AS2( add esi, 16) \
+ SSE2_MulAdd45\
+ Mul_Acc##i(i) \
+
+#define Mul_Column0(k, i) \
+ SSE2_SaveShift(k) \
+ AS2( add edi, 16) \
+ AS2( add edx, 16) \
+ SSE2_MulAdd45\
+ Mul_Acc##i(i) \
+
+#define Bot_Acc(i) \
+ AS2( movdqa xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16]) \
+ AS2( movdqa xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16]) \
+ AS2( pmuludq xmm0, xmm1) \
+ AS2( pmuludq xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \
+ AS2( paddq xmm4, xmm0) \
+ AS2( paddd xmm6, xmm1)
+
+#define Bot_SaveAcc(k) \
+ SSE2_SaveShift(k) \
+ AS2( add edi, 16) \
+ AS2( add edx, 16) \
+ AS2( movdqa xmm6, [esi]) \
+ AS2( movdqa xmm0, [edi]) \
+ AS2( pmuludq xmm0, xmm6) \
+ AS2( paddq xmm4, xmm0) \
+ AS2( psllq xmm5, 16) \
+ AS2( paddq xmm4, xmm5) \
+ AS2( pmuludq xmm6, [edx])
+
+#define Bot_End(n) \
+ AS2( movhlps xmm7, xmm6) \
+ AS2( paddd xmm6, xmm7) \
+ AS2( psllq xmm6, 32) \
+ AS2( paddd xmm4, xmm6) \
+ AS2( movq QWORD PTR [ecx+8*((n)-1)], xmm4) \
+ AS1( pop esp)\
+ MulEpilogue
-static PAddSub s_pAdd, s_pSub;
-#ifdef SSE2_INTRINSICS_AVAILABLE
-static PMul s_pMul4, s_pMul8, s_pMul8B;
+#define Top_Begin(n) \
+ TopPrologue \
+ AS2( mov edx, esp)\
+ AS2( and esp, 0xfffffff0)\
+ AS2( sub esp, 48*n+16)\
+ AS1( push edx)\
+ AS2( xor edx, edx) \
+ ASL(1) \
+ ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \
+ ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \
+ ASS( pshufd xmm2, [edi+edx], 3,1,2,0) \
+ AS2( movdqa [esp+20+2*edx], xmm0) \
+ AS2( psrlq xmm0, 32) \
+ AS2( movdqa [esp+20+2*edx+16], xmm0) \
+ AS2( movdqa [esp+20+16*n+2*edx], xmm1) \
+ AS2( psrlq xmm1, 32) \
+ AS2( movdqa [esp+20+16*n+2*edx+16], xmm1) \
+ AS2( movdqa [esp+20+32*n+2*edx], xmm2) \
+ AS2( psrlq xmm2, 32) \
+ AS2( movdqa [esp+20+32*n+2*edx+16], xmm2) \
+ AS2( add edx, 16) \
+ AS2( cmp edx, 8*(n)) \
+ ASJ( jne, 1, b) \
+ AS2( mov eax, esi) \
+ AS2( lea edi, [esp+20+00*n+16*(n/2-1)])\
+ AS2( lea edx, [esp+20+16*n+16*(n/2-1)])\
+ AS2( lea esi, [esp+20+32*n+16*(n/2-1)])\
+ AS2( pxor xmm4, xmm4)\
+ AS2( pxor xmm5, xmm5)
+
+#define Top_Acc(i) \
+ AS2( movq xmm0, QWORD PTR [esi+i/2*(1-(i-2*(i/2))*2)*16+8]) \
+ AS2( pmuludq xmm0, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \
+ AS2( psrlq xmm0, 48) \
+ AS2( paddd xmm5, xmm0)\
+
+#define Top_Column0(i) \
+ AS2( psllq xmm5, 32) \
+ AS2( add edi, 16) \
+ AS2( add edx, 16) \
+ SSE2_MulAdd45\
+ Mul_Acc##i(i) \
+
+#define Top_Column1(i) \
+ SSE2_SaveShift(0) \
+ AS2( add esi, 16) \
+ SSE2_MulAdd45\
+ Mul_Acc##i(i) \
+ AS2( shr eax, 16) \
+ AS2( movd xmm0, eax)\
+ AS2( movd xmm1, [ecx+4])\
+ AS2( psrld xmm1, 16)\
+ AS2( pcmpgtd xmm1, xmm0)\
+ AS2( psrld xmm1, 31)\
+ AS2( paddd xmm4, xmm1)\
+
+void SSE2_Square4(word *C, const word *A)
+{
+ Squ_Begin(2)
+ Squ_Column0(0, 1)
+ Squ_End(2)
+}
+
+void SSE2_Square8(word *C, const word *A)
+{
+ Squ_Begin(4)
+#ifndef __GNUC__
+ ASJ( jmp, 0, f)
+ Squ_Acc(2)
+ AS1( ret) ASL(0)
#endif
+ Squ_Column0(0, 1)
+ Squ_Column1(1, 1)
+ Squ_Column0(2, 2)
+ Squ_Column1(3, 1)
+ Squ_Column0(4, 1)
+ Squ_End(4)
+}
-static void SetPentiumFunctionPointers()
+void SSE2_Square16(word *C, const word *A)
{
- if (IsP4())
- {
- s_pAdd = &P4Optimized::Add;
- s_pSub = &P4Optimized::Subtract;
- }
- else
- {
- s_pAdd = &PentiumOptimized::Add;
- s_pSub = &PentiumOptimized::Subtract;
- }
-
-#ifdef SSE2_INTRINSICS_AVAILABLE
- if (HasSSE2())
- {
- s_pMul4 = &P4Optimized::Multiply4;
- s_pMul8 = &P4Optimized::Multiply8;
- s_pMul8B = &P4Optimized::Multiply8Bottom;
- }
- else
- {
- s_pMul4 = &PentiumOptimized::Multiply4;
- s_pMul8 = &PentiumOptimized::Multiply8;
- s_pMul8B = &PentiumOptimized::Multiply8Bottom;
- }
+ Squ_Begin(8)
+#ifndef __GNUC__
+ ASJ( jmp, 0, f)
+ Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
+ AS1( ret) ASL(0)
+#endif
+ Squ_Column0(0, 1)
+ Squ_Column1(1, 1)
+ Squ_Column0(2, 2)
+ Squ_Column1(3, 2)
+ Squ_Column0(4, 3)
+ Squ_Column1(5, 3)
+ Squ_Column0(6, 4)
+ Squ_Column1(7, 3)
+ Squ_Column0(8, 3)
+ Squ_Column1(9, 2)
+ Squ_Column0(10, 2)
+ Squ_Column1(11, 1)
+ Squ_Column0(12, 1)
+ Squ_End(8)
+}
+
+void SSE2_Square32(word *C, const word *A)
+{
+ Squ_Begin(16)
+ ASJ( jmp, 0, f)
+ Squ_Acc(8) Squ_Acc(7) Squ_Acc(6) Squ_Acc(5) Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
+ AS1( ret) ASL(0)
+ Squ_Column0(0, 1)
+ Squ_Column1(1, 1)
+ Squ_Column0(2, 2)
+ Squ_Column1(3, 2)
+ Squ_Column0(4, 3)
+ Squ_Column1(5, 3)
+ Squ_Column0(6, 4)
+ Squ_Column1(7, 4)
+ Squ_Column0(8, 5)
+ Squ_Column1(9, 5)
+ Squ_Column0(10, 6)
+ Squ_Column1(11, 6)
+ Squ_Column0(12, 7)
+ Squ_Column1(13, 7)
+ Squ_Column0(14, 8)
+ Squ_Column1(15, 7)
+ Squ_Column0(16, 7)
+ Squ_Column1(17, 6)
+ Squ_Column0(18, 6)
+ Squ_Column1(19, 5)
+ Squ_Column0(20, 5)
+ Squ_Column1(21, 4)
+ Squ_Column0(22, 4)
+ Squ_Column1(23, 3)
+ Squ_Column0(24, 3)
+ Squ_Column1(25, 2)
+ Squ_Column0(26, 2)
+ Squ_Column1(27, 1)
+ Squ_Column0(28, 1)
+ Squ_End(16)
+}
+
+void SSE2_Multiply4(word *C, const word *A, const word *B)
+{
+ Mul_Begin(2)
+#ifndef __GNUC__
+ ASJ( jmp, 0, f)
+ Mul_Acc(2)
+ AS1( ret) ASL(0)
#endif
+ Mul_Column0(0, 2)
+ Mul_End(2)
}
-void DisableSSE2()
+void SSE2_Multiply8(word *C, const word *A, const word *B)
{
- s_sse2Enabled = false;
- SetPentiumFunctionPointers();
+ Mul_Begin(4)
+#ifndef __GNUC__
+ ASJ( jmp, 0, f)
+ Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
+ AS1( ret) ASL(0)
+#endif
+ Mul_Column0(0, 2)
+ Mul_Column1(1, 3)
+ Mul_Column0(2, 4)
+ Mul_Column1(3, 3)
+ Mul_Column0(4, 2)
+ Mul_End(4)
}
-class LowLevel : public PentiumOptimized
+void SSE2_Multiply16(word *C, const word *A, const word *B)
{
-public:
- inline static int Add(word *C, const word *A, const word *B, size_t N)
- {return s_pAdd(C, A, B, N);}
- inline static int Subtract(word *C, const word *A, const word *B, size_t N)
- {return s_pSub(C, A, B, N);}
- inline static void Square4(word *R, const word *A)
- {Multiply4(R, A, A);}
-#ifdef SSE2_INTRINSICS_AVAILABLE
- inline static void Multiply4(word *C, const word *A, const word *B)
- {s_pMul4(C, A, B);}
- inline static void Multiply8(word *C, const word *A, const word *B)
- {s_pMul8(C, A, B);}
- inline static void Multiply8Bottom(word *C, const word *A, const word *B)
- {s_pMul8B(C, A, B);}
+ Mul_Begin(8)
+#ifndef __GNUC__
+ ASJ( jmp, 0, f)
+ Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
+ AS1( ret) ASL(0)
#endif
-};
-
-// use some tricks to share assembly code between MSVC and GCC
-#ifdef _MSC_VER
- #define CRYPTOPP_NAKED __declspec(naked)
- #define AS1(x) __asm x
- #define AS2(x, y) __asm x, y
- #define AddPrologue \
- __asm push ebp \
- __asm push ebx \
- __asm push esi \
- __asm push edi \
- __asm mov ecx, [esp+20] \
- __asm mov edx, [esp+24] \
- __asm mov ebx, [esp+28] \
- __asm mov esi, [esp+32]
- #define AddEpilogue \
- __asm pop edi \
- __asm pop esi \
- __asm pop ebx \
- __asm pop ebp \
- __asm ret
- #define MulPrologue \
- __asm push ebp \
- __asm push ebx \
- __asm push esi \
- __asm push edi \
- __asm mov ecx, [esp+28] \
- __asm mov esi, [esp+24] \
- __asm push [esp+20]
- #define MulEpilogue \
- __asm add esp, 4 \
- __asm pop edi \
- __asm pop esi \
- __asm pop ebx \
- __asm pop ebp \
- __asm ret
-#else
- #define CRYPTOPP_NAKED
- #define AS1(x) #x ";"
- #define AS2(x, y) #x ", " #y ";"
- #define AddPrologue \
- __asm__ __volatile__ \
- ( \
- "push %%ebx;" /* save this manually, in case of -fPIC */ \
- "mov %2, %%ebx;" \
- ".intel_syntax noprefix;" \
- "push ebp;"
- #define AddEpilogue \
- "pop ebp;" \
- ".att_syntax prefix;" \
- "pop %%ebx;" \
- : \
- : "c" (C), "d" (A), "m" (B), "S" (N) \
- : "%edi", "memory", "cc" \
- );
- #define MulPrologue \
- __asm__ __volatile__ \
- ( \
- "push %%ebx;" /* save this manually, in case of -fPIC */ \
- "push %%ebp;" \
- "push %0;" \
- ".intel_syntax noprefix;"
- #define MulEpilogue \
- "add esp, 4;" \
- "pop ebp;" \
- "pop ebx;" \
- ".att_syntax prefix;" \
- : \
- : "rm" (Z), "S" (X), "c" (Y) \
- : "%eax", "%edx", "%edi", "memory", "cc" \
- );
+ Mul_Column0(0, 2)
+ Mul_Column1(1, 3)
+ Mul_Column0(2, 4)
+ Mul_Column1(3, 5)
+ Mul_Column0(4, 6)
+ Mul_Column1(5, 7)
+ Mul_Column0(6, 8)
+ Mul_Column1(7, 7)
+ Mul_Column0(8, 6)
+ Mul_Column1(9, 5)
+ Mul_Column0(10, 4)
+ Mul_Column1(11, 3)
+ Mul_Column0(12, 2)
+ Mul_End(8)
+}
+
+void SSE2_Multiply32(word *C, const word *A, const word *B)
+{
+ Mul_Begin(16)
+ ASJ( jmp, 0, f)
+ Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
+ AS1( ret) ASL(0)
+ Mul_Column0(0, 2)
+ Mul_Column1(1, 3)
+ Mul_Column0(2, 4)
+ Mul_Column1(3, 5)
+ Mul_Column0(4, 6)
+ Mul_Column1(5, 7)
+ Mul_Column0(6, 8)
+ Mul_Column1(7, 9)
+ Mul_Column0(8, 10)
+ Mul_Column1(9, 11)
+ Mul_Column0(10, 12)
+ Mul_Column1(11, 13)
+ Mul_Column0(12, 14)
+ Mul_Column1(13, 15)
+ Mul_Column0(14, 16)
+ Mul_Column1(15, 15)
+ Mul_Column0(16, 14)
+ Mul_Column1(17, 13)
+ Mul_Column0(18, 12)
+ Mul_Column1(19, 11)
+ Mul_Column0(20, 10)
+ Mul_Column1(21, 9)
+ Mul_Column0(22, 8)
+ Mul_Column1(23, 7)
+ Mul_Column0(24, 6)
+ Mul_Column1(25, 5)
+ Mul_Column0(26, 4)
+ Mul_Column1(27, 3)
+ Mul_Column0(28, 2)
+ Mul_End(16)
+}
+
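The Mul_Begin/Mul_Acc/Mul_Column0/Mul_Column1/Mul_End sequences above unroll a column-wise (Comba) schoolbook multiply: each column step accumulates every a[i]*b[j] partial product with i+j equal to the column index, emits one output word, and carries the remainder forward, which is why the second macro argument (the per-column product count) climbs toward the operand length and falls off symmetrically. A portable sketch of the same pattern, using 8-bit words so the reference check fits in 64 bits (names and word size are illustrative only):

#include <cassert>
#include <cstddef>
#include <cstdint>

int main()
{
	const std::size_t N = 4;
	uint8_t a[N] = {0x12, 0x34, 0x56, 0x78};   // 0x78563412, little-endian
	uint8_t b[N] = {0x9A, 0xBC, 0xDE, 0xF0};   // 0xF0DEBC9A
	uint8_t r[2*N] = {0};
	uint32_t acc = 0;                          // wide enough for N products + carry
	for (std::size_t k = 0; k < 2*N - 1; ++k)  // one pass per output column
	{
		std::size_t lo = k < N ? 0 : k - N + 1;
		std::size_t hi = k < N ? k : N - 1;
		for (std::size_t i = lo; i <= hi; ++i) // all i+j == k partial products
			acc += (uint32_t)a[i] * b[k - i];
		r[k] = (uint8_t)acc;                   // emit one word per column
		acc >>= 8;                             // carry the rest forward
	}
	r[2*N - 1] = (uint8_t)acc;

	uint64_t prod = 0;
	for (int i = 2*N - 1; i >= 0; --i)
		prod = (prod << 8) | r[i];
	assert(prod == 0x78563412ull * 0xF0DEBC9A);
	return 0;
}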
+void SSE2_MultiplyBottom4(word *C, const word *A, const word *B)
+{
+ Mul_Begin(2)
+ Bot_SaveAcc(0) Bot_Acc(2)
+ Bot_End(2)
+}
+
+void SSE2_MultiplyBottom8(word *C, const word *A, const word *B)
+{
+ Mul_Begin(4)
+#ifndef __GNUC__
+ ASJ( jmp, 0, f)
+ Mul_Acc(3) Mul_Acc(2)
+ AS1( ret) ASL(0)
#endif
+ Mul_Column0(0, 2)
+ Mul_Column1(1, 3)
+ Bot_SaveAcc(2) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
+ Bot_End(4)
+}
-CRYPTOPP_NAKED int PentiumOptimized::Add(word *C, const word *A, const word *B, size_t N)
+void SSE2_MultiplyBottom16(word *C, const word *A, const word *B)
{
- AddPrologue
-
- // now: ebx = B, ecx = C, edx = A, esi = N
- AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C
- AS2( xor eax, eax) // clear eax
-
- AS2( sub eax, esi) // eax is a negative index from end of B
- AS2( lea ebx, [ebx+4*esi]) // ebx is end of B
-
- AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag
- AS1( jz loopendAdd) // if no dwords then nothing to do
-
- AS1(loopstartAdd:)
- AS2( mov esi,[edx]) // load lower word of A
- AS2( mov ebp,[edx+4]) // load higher word of A
-
- AS2( mov edi,[ebx+8*eax]) // load lower word of B
- AS2( lea edx,[edx+8]) // advance A and C
-
- AS2( adc esi,edi) // add lower words
- AS2( mov edi,[ebx+8*eax+4]) // load higher word of B
-
- AS2( adc ebp,edi) // add higher words
- AS1( inc eax) // advance B
-
- AS2( mov [edx+ecx-8],esi) // store lower word result
- AS2( mov [edx+ecx-4],ebp) // store higher word result
-
- AS1( jnz loopstartAdd) // loop until eax overflows and becomes zero
-
- AS1(loopendAdd:)
- AS2( adc eax, 0) // store carry into eax (return result register)
-
- AddEpilogue
+ Mul_Begin(8)
+#ifndef __GNUC__
+ ASJ( jmp, 0, f)
+ Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
+ AS1( ret) ASL(0)
+#endif
+ Mul_Column0(0, 2)
+ Mul_Column1(1, 3)
+ Mul_Column0(2, 4)
+ Mul_Column1(3, 5)
+ Mul_Column0(4, 6)
+ Mul_Column1(5, 7)
+ Bot_SaveAcc(6) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
+ Bot_End(8)
+}
+
+void SSE2_MultiplyBottom32(word *C, const word *A, const word *B)
+{
+ Mul_Begin(16)
+#ifndef __GNUC__
+ ASJ( jmp, 0, f)
+ Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
+ AS1( ret) ASL(0)
+#endif
+ Mul_Column0(0, 2)
+ Mul_Column1(1, 3)
+ Mul_Column0(2, 4)
+ Mul_Column1(3, 5)
+ Mul_Column0(4, 6)
+ Mul_Column1(5, 7)
+ Mul_Column0(6, 8)
+ Mul_Column1(7, 9)
+ Mul_Column0(8, 10)
+ Mul_Column1(9, 11)
+ Mul_Column0(10, 12)
+ Mul_Column1(11, 13)
+ Mul_Column0(12, 14)
+ Mul_Column1(13, 15)
+ Bot_SaveAcc(14) Bot_Acc(16) Bot_Acc(15) Bot_Acc(14) Bot_Acc(13) Bot_Acc(12) Bot_Acc(11) Bot_Acc(10) Bot_Acc(9) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
+ Bot_End(16)
+}
+
+void SSE2_MultiplyTop8(word *C, const word *A, const word *B, word L)
+{
+ Top_Begin(4)
+ Top_Acc(3) Top_Acc(2) Top_Acc(1)
+#ifndef __GNUC__
+ ASJ( jmp, 0, f)
+ Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
+ AS1( ret) ASL(0)
+#endif
+ Top_Column0(4)
+ Top_Column1(3)
+ Mul_Column0(0, 2)
+ Top_End(2)
}
-CRYPTOPP_NAKED int PentiumOptimized::Subtract(word *C, const word *A, const word *B, size_t N)
+void SSE2_MultiplyTop16(word *C, const word *A, const word *B, word L)
{
- AddPrologue
-
- // now: ebx = B, ecx = C, edx = A, esi = N
- AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C
- AS2( xor eax, eax) // clear eax
-
- AS2( sub eax, esi) // eax is a negative index from end of B
- AS2( lea ebx, [ebx+4*esi]) // ebx is end of B
-
- AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag
- AS1( jz loopendSub) // if no dwords then nothing to do
-
- AS1(loopstartSub:)
- AS2( mov esi,[edx]) // load lower word of A
- AS2( mov ebp,[edx+4]) // load higher word of A
-
- AS2( mov edi,[ebx+8*eax]) // load lower word of B
- AS2( lea edx,[edx+8]) // advance A and C
-
- AS2( sbb esi,edi) // subtract lower words
- AS2( mov edi,[ebx+8*eax+4]) // load higher word of B
-
- AS2( sbb ebp,edi) // subtract higher words
- AS1( inc eax) // advance B
-
- AS2( mov [edx+ecx-8],esi) // store lower word result
- AS2( mov [edx+ecx-4],ebp) // store higher word result
+ Top_Begin(8)
+ Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
+#ifndef __GNUC__
+ ASJ( jmp, 0, f)
+ Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
+ AS1( ret) ASL(0)
+#endif
+ Top_Column0(8)
+ Top_Column1(7)
+ Mul_Column0(0, 6)
+ Mul_Column1(1, 5)
+ Mul_Column0(2, 4)
+ Mul_Column1(3, 3)
+ Mul_Column0(4, 2)
+ Top_End(4)
+}
+
+void SSE2_MultiplyTop32(word *C, const word *A, const word *B, word L)
+{
+ Top_Begin(16)
+ Top_Acc(15) Top_Acc(14) Top_Acc(13) Top_Acc(12) Top_Acc(11) Top_Acc(10) Top_Acc(9) Top_Acc(8) Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
+#ifndef __GNUC__
+ ASJ( jmp, 0, f)
+ Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
+ AS1( ret) ASL(0)
+#endif
+ Top_Column0(16)
+ Top_Column1(15)
+ Mul_Column0(0, 14)
+ Mul_Column1(1, 13)
+ Mul_Column0(2, 12)
+ Mul_Column1(3, 11)
+ Mul_Column0(4, 10)
+ Mul_Column1(5, 9)
+ Mul_Column0(6, 8)
+ Mul_Column1(7, 7)
+ Mul_Column0(8, 6)
+ Mul_Column1(9, 5)
+ Mul_Column0(10, 4)
+ Mul_Column1(11, 3)
+ Mul_Column0(12, 2)
+ Top_End(8)
+}
+
+#endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
- AS1( jnz loopstartSub) // loop until eax overflows and becomes zero
+// ********************************************************
- AS1(loopendSub:)
- AS2( adc eax, 0) // store carry into eax (return result register)
+typedef int (CRYPTOPP_FASTCALL * PAdd)(size_t N, word *C, const word *A, const word *B);
+typedef void (* PMul)(word *C, const word *A, const word *B);
+typedef void (* PSqu)(word *C, const word *A);
+typedef void (* PMulTop)(word *C, const word *A, const word *B, word L);
- AddEpilogue
-}
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+static PAdd s_pAdd = &Baseline_Add, s_pSub = &Baseline_Sub;
+static PMulTop s_pTop[3];
+static size_t s_recursionLimit = 8;
+#else
+static const size_t s_recursionLimit = 8;
+#endif
-// On Pentium 4, the adc and sbb instructions are very expensive, so avoid them.
+static PMul s_pMul[9], s_pBot[9];
+static PSqu s_pSqu[9];
-CRYPTOPP_NAKED int P4Optimized::Add(word *C, const word *A, const word *B, size_t N)
+static void SetFunctionPointers()
{
- AddPrologue
+ s_pMul[0] = &Baseline_Multiply2;
+ s_pBot[0] = &Baseline_MultiplyBottom2;
+ s_pSqu[0] = &Baseline_Square2;
- // now: ebx = B, ecx = C, edx = A, esi = N
- AS2( xor eax, eax)
- AS1( neg esi)
- AS1( jz loopendAddP4) // if no dwords then nothing to do
-
- AS2( mov edi, [edx])
- AS2( mov ebp, [ebx])
- AS1( jmp carry1AddP4)
-
- AS1(loopstartAddP4:)
- AS2( mov edi, [edx+8])
- AS2( add ecx, 8)
- AS2( add edx, 8)
- AS2( mov ebp, [ebx])
- AS2( add edi, eax)
- AS1( jc carry1AddP4)
- AS2( xor eax, eax)
-
- AS1(carry1AddP4:)
- AS2( add edi, ebp)
- AS2( mov ebp, 1)
- AS2( mov [ecx], edi)
- AS2( mov edi, [edx+4])
- AS2( cmovc eax, ebp)
- AS2( mov ebp, [ebx+4])
- AS2( add ebx, 8)
- AS2( add edi, eax)
- AS1( jc carry2AddP4)
- AS2( xor eax, eax)
-
- AS1(carry2AddP4:)
- AS2( add edi, ebp)
- AS2( mov ebp, 1)
- AS2( cmovc eax, ebp)
- AS2( mov [ecx+4], edi)
- AS2( add esi, 2)
- AS1( jnz loopstartAddP4)
-
- AS1(loopendAddP4:)
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+ if (HasSSE2())
+ {
+ if (IsP4())
+ {
+ s_pAdd = &SSE2_Add;
+ s_pSub = &SSE2_Sub;
+ }
- AddEpilogue
-}
+ s_recursionLimit = 32;
-CRYPTOPP_NAKED int P4Optimized::Subtract(word *C, const word *A, const word *B, size_t N)
-{
- AddPrologue
+ s_pMul[1] = &SSE2_Multiply4;
+ s_pMul[2] = &SSE2_Multiply8;
+ s_pMul[4] = &SSE2_Multiply16;
+ s_pMul[8] = &SSE2_Multiply32;
- // now: ebx = B, ecx = C, edx = A, esi = N
- AS2( xor eax, eax)
- AS1( neg esi)
- AS1( jz loopendSubP4) // if no dwords then nothing to do
-
- AS2( mov edi, [edx])
- AS2( mov ebp, [ebx])
- AS1( jmp carry1SubP4)
-
- AS1(loopstartSubP4:)
- AS2( mov edi, [edx+8])
- AS2( add edx, 8)
- AS2( add ecx, 8)
- AS2( mov ebp, [ebx])
- AS2( sub edi, eax)
- AS1( jc carry1SubP4)
- AS2( xor eax, eax)
-
- AS1(carry1SubP4:)
- AS2( sub edi, ebp)
- AS2( mov ebp, 1)
- AS2( mov [ecx], edi)
- AS2( mov edi, [edx+4])
- AS2( cmovc eax, ebp)
- AS2( mov ebp, [ebx+4])
- AS2( add ebx, 8)
- AS2( sub edi, eax)
- AS1( jc carry2SubP4)
- AS2( xor eax, eax)
-
- AS1(carry2SubP4:)
- AS2( sub edi, ebp)
- AS2( mov ebp, 1)
- AS2( cmovc eax, ebp)
- AS2( mov [ecx+4], edi)
- AS2( add esi, 2)
- AS1( jnz loopstartSubP4)
-
- AS1(loopendSubP4:)
+ s_pBot[1] = &SSE2_MultiplyBottom4;
+ s_pBot[2] = &SSE2_MultiplyBottom8;
+ s_pBot[4] = &SSE2_MultiplyBottom16;
+ s_pBot[8] = &SSE2_MultiplyBottom32;
- AddEpilogue
-}
+ s_pSqu[1] = &SSE2_Square4;
+ s_pSqu[2] = &SSE2_Square8;
+ s_pSqu[4] = &SSE2_Square16;
+ s_pSqu[8] = &SSE2_Square32;
-// multiply assembly code originally contributed by Leonard Janke
-
-#define MulStartup \
- AS2(xor ebp, ebp) \
- AS2(xor edi, edi) \
- AS2(xor ebx, ebx)
-
-#define MulShiftCarry \
- AS2(mov ebp, edx) \
- AS2(mov edi, ebx) \
- AS2(xor ebx, ebx)
-
-#define MulAccumulateBottom(i,j) \
- AS2(mov eax, [ecx+4*j]) \
- AS2(imul eax, dword ptr [esi+4*i]) \
- AS2(add ebp, eax)
-
-#define MulAccumulate(i,j) \
- AS2(mov eax, [ecx+4*j]) \
- AS1(mul dword ptr [esi+4*i]) \
- AS2(add ebp, eax) \
- AS2(adc edi, edx) \
- AS2(adc bl, bh)
-
-#define MulStoreDigit(i) \
- AS2(mov edx, edi) \
- AS2(mov edi, [esp]) \
- AS2(mov [edi+4*i], ebp)
-
-#define MulLastDiagonal(digits) \
- AS2(mov eax, [ecx+4*(digits-1)]) \
- AS1(mul dword ptr [esi+4*(digits-1)]) \
- AS2(add ebp, eax) \
- AS2(adc edx, edi) \
- AS2(mov edi, [esp]) \
- AS2(mov [edi+4*(2*digits-2)], ebp) \
- AS2(mov [edi+4*(2*digits-1)], edx)
-
-CRYPTOPP_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
-{
- MulPrologue
- // now: [esp] = Z, esi = X, ecx = Y
- MulStartup
- MulAccumulate(0,0)
- MulStoreDigit(0)
- MulShiftCarry
-
- MulAccumulate(1,0)
- MulAccumulate(0,1)
- MulStoreDigit(1)
- MulShiftCarry
-
- MulAccumulate(2,0)
- MulAccumulate(1,1)
- MulAccumulate(0,2)
- MulStoreDigit(2)
- MulShiftCarry
-
- MulAccumulate(3,0)
- MulAccumulate(2,1)
- MulAccumulate(1,2)
- MulAccumulate(0,3)
- MulStoreDigit(3)
- MulShiftCarry
-
- MulAccumulate(3,1)
- MulAccumulate(2,2)
- MulAccumulate(1,3)
- MulStoreDigit(4)
- MulShiftCarry
-
- MulAccumulate(3,2)
- MulAccumulate(2,3)
- MulStoreDigit(5)
- MulShiftCarry
-
- MulLastDiagonal(4)
- MulEpilogue
-}
+ s_pTop[0] = &SSE2_MultiplyTop8;
+ s_pTop[1] = &SSE2_MultiplyTop16;
+ s_pTop[2] = &SSE2_MultiplyTop32;
+ }
+ else
+#endif
+ {
+ s_pMul[1] = &Baseline_Multiply4;
+ s_pMul[2] = &Baseline_Multiply8;
+// s_pMul[4] = &Baseline_Multiply16;
-CRYPTOPP_NAKED void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
-{
- MulPrologue
- // now: [esp] = Z, esi = X, ecx = Y
- MulStartup
- MulAccumulate(0,0)
- MulStoreDigit(0)
- MulShiftCarry
-
- MulAccumulate(1,0)
- MulAccumulate(0,1)
- MulStoreDigit(1)
- MulShiftCarry
-
- MulAccumulate(2,0)
- MulAccumulate(1,1)
- MulAccumulate(0,2)
- MulStoreDigit(2)
- MulShiftCarry
-
- MulAccumulate(3,0)
- MulAccumulate(2,1)
- MulAccumulate(1,2)
- MulAccumulate(0,3)
- MulStoreDigit(3)
- MulShiftCarry
-
- MulAccumulate(4,0)
- MulAccumulate(3,1)
- MulAccumulate(2,2)
- MulAccumulate(1,3)
- MulAccumulate(0,4)
- MulStoreDigit(4)
- MulShiftCarry
-
- MulAccumulate(5,0)
- MulAccumulate(4,1)
- MulAccumulate(3,2)
- MulAccumulate(2,3)
- MulAccumulate(1,4)
- MulAccumulate(0,5)
- MulStoreDigit(5)
- MulShiftCarry
-
- MulAccumulate(6,0)
- MulAccumulate(5,1)
- MulAccumulate(4,2)
- MulAccumulate(3,3)
- MulAccumulate(2,4)
- MulAccumulate(1,5)
- MulAccumulate(0,6)
- MulStoreDigit(6)
- MulShiftCarry
-
- MulAccumulate(7,0)
- MulAccumulate(6,1)
- MulAccumulate(5,2)
- MulAccumulate(4,3)
- MulAccumulate(3,4)
- MulAccumulate(2,5)
- MulAccumulate(1,6)
- MulAccumulate(0,7)
- MulStoreDigit(7)
- MulShiftCarry
-
- MulAccumulate(7,1)
- MulAccumulate(6,2)
- MulAccumulate(5,3)
- MulAccumulate(4,4)
- MulAccumulate(3,5)
- MulAccumulate(2,6)
- MulAccumulate(1,7)
- MulStoreDigit(8)
- MulShiftCarry
-
- MulAccumulate(7,2)
- MulAccumulate(6,3)
- MulAccumulate(5,4)
- MulAccumulate(4,5)
- MulAccumulate(3,6)
- MulAccumulate(2,7)
- MulStoreDigit(9)
- MulShiftCarry
-
- MulAccumulate(7,3)
- MulAccumulate(6,4)
- MulAccumulate(5,5)
- MulAccumulate(4,6)
- MulAccumulate(3,7)
- MulStoreDigit(10)
- MulShiftCarry
-
- MulAccumulate(7,4)
- MulAccumulate(6,5)
- MulAccumulate(5,6)
- MulAccumulate(4,7)
- MulStoreDigit(11)
- MulShiftCarry
-
- MulAccumulate(7,5)
- MulAccumulate(6,6)
- MulAccumulate(5,7)
- MulStoreDigit(12)
- MulShiftCarry
-
- MulAccumulate(7,6)
- MulAccumulate(6,7)
- MulStoreDigit(13)
- MulShiftCarry
-
- MulLastDiagonal(8)
- MulEpilogue
-}
+ s_pBot[1] = &Baseline_MultiplyBottom4;
+ s_pBot[2] = &Baseline_MultiplyBottom8;
+// s_pBot[4] = &Baseline_MultiplyBottom16;
-CRYPTOPP_NAKED void PentiumOptimized::Multiply8Bottom(word* Z, const word* X, const word* Y)
-{
- MulPrologue
- // now: [esp] = Z, esi = X, ecx = Y
- MulStartup
- MulAccumulate(0,0)
- MulStoreDigit(0)
- MulShiftCarry
-
- MulAccumulate(1,0)
- MulAccumulate(0,1)
- MulStoreDigit(1)
- MulShiftCarry
-
- MulAccumulate(2,0)
- MulAccumulate(1,1)
- MulAccumulate(0,2)
- MulStoreDigit(2)
- MulShiftCarry
-
- MulAccumulate(3,0)
- MulAccumulate(2,1)
- MulAccumulate(1,2)
- MulAccumulate(0,3)
- MulStoreDigit(3)
- MulShiftCarry
-
- MulAccumulate(4,0)
- MulAccumulate(3,1)
- MulAccumulate(2,2)
- MulAccumulate(1,3)
- MulAccumulate(0,4)
- MulStoreDigit(4)
- MulShiftCarry
-
- MulAccumulate(5,0)
- MulAccumulate(4,1)
- MulAccumulate(3,2)
- MulAccumulate(2,3)
- MulAccumulate(1,4)
- MulAccumulate(0,5)
- MulStoreDigit(5)
- MulShiftCarry
-
- MulAccumulate(6,0)
- MulAccumulate(5,1)
- MulAccumulate(4,2)
- MulAccumulate(3,3)
- MulAccumulate(2,4)
- MulAccumulate(1,5)
- MulAccumulate(0,6)
- MulStoreDigit(6)
- MulShiftCarry
-
- MulAccumulateBottom(7,0)
- MulAccumulateBottom(6,1)
- MulAccumulateBottom(5,2)
- MulAccumulateBottom(4,3)
- MulAccumulateBottom(3,4)
- MulAccumulateBottom(2,5)
- MulAccumulateBottom(1,6)
- MulAccumulateBottom(0,7)
- MulStoreDigit(7)
- MulEpilogue
+ s_pSqu[1] = &Baseline_Square4;
+ s_pSqu[2] = &Baseline_Square8;
+// s_pSqu[4] = &Baseline_Square16;
+ }
}
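SetFunctionPointers indexes every dispatch table by N/4, so the supported operand sizes N = 2, 4, 8, 16, 32 occupy slots 0, 1, 2, 4 and 8, and slots 3, 5, 6 and 7 are never written or read. A minimal sketch of the same scheme, with hypothetical stubs standing in for the Baseline_*/SSE2_* workers:

#include <cstddef>
#include <cstdio>

typedef unsigned int word;   // stand-in for the library's word type
typedef void (*PMul)(word *C, const word *A, const word *B);

static void Mul2(word *, const word *, const word *) { std::puts("N=2"); }
static void Mul4(word *, const word *, const word *) { std::puts("N=4"); }
static void Mul8(word *, const word *, const word *) { std::puts("N=8"); }

static PMul s_pMulSketch[9];

int main()
{
	s_pMulSketch[2/4] = &Mul2;   // slot 0
	s_pMulSketch[4/4] = &Mul4;   // slot 1
	s_pMulSketch[8/4] = &Mul8;   // slot 2; N=16 and 32 would fill slots 4 and 8
	word R[16] = {0}, A[8] = {0}, B[8] = {0};
	for (std::size_t N = 2; N <= 8; N *= 2)
		s_pMulSketch[N/4](R, A, B);   // same N/4 indexing as above
	return 0;
}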
-#undef AS1
-#undef AS2
-
-#else // not x86 - no processor specific code at this layer
-
-typedef Portable LowLevel;
-
-#endif
-
-#ifdef SSE2_INTRINSICS_AVAILABLE
-
-#ifdef __GNUC__
-#define CRYPTOPP_FASTCALL
+inline int Add(word *C, const word *A, const word *B, size_t N)
+{
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+ return s_pAdd(N, C, A, B);
#else
-#define CRYPTOPP_FASTCALL __fastcall
+ return Baseline_Add(N, C, A, B);
#endif
-
-static void CRYPTOPP_FASTCALL P4_Mul(__m128i *C, const __m128i *A, const __m128i *B)
-{
- __m128i a3210 = _mm_load_si128(A);
- __m128i b3210 = _mm_load_si128(B);
-
- __m128i sum;
-
- __m128i z = _mm_setzero_si128();
- __m128i a2b2_a0b0 = _mm_mul_epu32(a3210, b3210);
- C[0] = a2b2_a0b0;
-
- __m128i a3120 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(3, 1, 2, 0));
- __m128i b3021 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 2, 1));
- __m128i a1b0_a0b1 = _mm_mul_epu32(a3120, b3021);
- __m128i a1b0 = _mm_unpackhi_epi32(a1b0_a0b1, z);
- __m128i a0b1 = _mm_unpacklo_epi32(a1b0_a0b1, z);
- C[1] = _mm_add_epi64(a1b0, a0b1);
-
- __m128i a31 = _mm_srli_epi64(a3210, 32);
- __m128i b31 = _mm_srli_epi64(b3210, 32);
- __m128i a3b3_a1b1 = _mm_mul_epu32(a31, b31);
- C[6] = a3b3_a1b1;
-
- __m128i a1b1 = _mm_unpacklo_epi32(a3b3_a1b1, z);
- __m128i b3012 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 1, 2));
- __m128i a2b0_a0b2 = _mm_mul_epu32(a3210, b3012);
- __m128i a0b2 = _mm_unpacklo_epi32(a2b0_a0b2, z);
- __m128i a2b0 = _mm_unpackhi_epi32(a2b0_a0b2, z);
- sum = _mm_add_epi64(a1b1, a0b2);
- C[2] = _mm_add_epi64(sum, a2b0);
-
- __m128i a2301 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(2, 3, 0, 1));
- __m128i b2103 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(2, 1, 0, 3));
- __m128i a3b0_a1b2 = _mm_mul_epu32(a2301, b3012);
- __m128i a2b1_a0b3 = _mm_mul_epu32(a3210, b2103);
- __m128i a3b0 = _mm_unpackhi_epi32(a3b0_a1b2, z);
- __m128i a1b2 = _mm_unpacklo_epi32(a3b0_a1b2, z);
- __m128i a2b1 = _mm_unpackhi_epi32(a2b1_a0b3, z);
- __m128i a0b3 = _mm_unpacklo_epi32(a2b1_a0b3, z);
- __m128i sum1 = _mm_add_epi64(a3b0, a1b2);
- sum = _mm_add_epi64(a2b1, a0b3);
- C[3] = _mm_add_epi64(sum, sum1);
-
- __m128i a3b1_a1b3 = _mm_mul_epu32(a2301, b2103);
- __m128i a2b2 = _mm_unpackhi_epi32(a2b2_a0b0, z);
- __m128i a3b1 = _mm_unpackhi_epi32(a3b1_a1b3, z);
- __m128i a1b3 = _mm_unpacklo_epi32(a3b1_a1b3, z);
- sum = _mm_add_epi64(a2b2, a3b1);
- C[4] = _mm_add_epi64(sum, a1b3);
-
- __m128i a1302 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(1, 3, 0, 2));
- __m128i b1203 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(1, 2, 0, 3));
- __m128i a3b2_a2b3 = _mm_mul_epu32(a1302, b1203);
- __m128i a3b2 = _mm_unpackhi_epi32(a3b2_a2b3, z);
- __m128i a2b3 = _mm_unpacklo_epi32(a3b2_a2b3, z);
- C[5] = _mm_add_epi64(a3b2, a2b3);
-}
-
-void P4Optimized::Multiply4(word *C, const word *A, const word *B)
-{
- __m128i temp[7];
- const word *w = (word *)temp;
- const __m64 *mw = (__m64 *)w;
-
- P4_Mul(temp, (__m128i *)A, (__m128i *)B);
-
- C[0] = w[0];
-
- __m64 s1, s2;
-
- __m64 w1 = _mm_cvtsi32_si64(w[1]);
- __m64 w4 = mw[2];
- __m64 w6 = mw[3];
- __m64 w8 = mw[4];
- __m64 w10 = mw[5];
- __m64 w12 = mw[6];
- __m64 w14 = mw[7];
- __m64 w16 = mw[8];
- __m64 w18 = mw[9];
- __m64 w20 = mw[10];
- __m64 w22 = mw[11];
- __m64 w26 = _mm_cvtsi32_si64(w[26]);
-
- s1 = _mm_add_si64(w1, w4);
- C[1] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- s2 = _mm_add_si64(w6, w8);
- s1 = _mm_add_si64(s1, s2);
- C[2] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- s2 = _mm_add_si64(w10, w12);
- s1 = _mm_add_si64(s1, s2);
- C[3] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- s2 = _mm_add_si64(w14, w16);
- s1 = _mm_add_si64(s1, s2);
- C[4] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- s2 = _mm_add_si64(w18, w20);
- s1 = _mm_add_si64(s1, s2);
- C[5] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- s2 = _mm_add_si64(w22, w26);
- s1 = _mm_add_si64(s1, s2);
- C[6] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- C[7] = _mm_cvtsi64_si32(s1) + w[27];
- _mm_empty();
-}
-
-void P4Optimized::Multiply8(word *C, const word *A, const word *B)
-{
- __m128i temp[28];
- const word *w = (word *)temp;
- const __m64 *mw = (__m64 *)w;
- const word *x = (word *)temp+7*4;
- const __m64 *mx = (__m64 *)x;
- const word *y = (word *)temp+7*4*2;
- const __m64 *my = (__m64 *)y;
- const word *z = (word *)temp+7*4*3;
- const __m64 *mz = (__m64 *)z;
-
- P4_Mul(temp, (__m128i *)A, (__m128i *)B);
-
- P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
-
- P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
-
- P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1);
-
- C[0] = w[0];
-
- __m64 s1, s2, s3, s4;
-
- __m64 w1 = _mm_cvtsi32_si64(w[1]);
- __m64 w4 = mw[2];
- __m64 w6 = mw[3];
- __m64 w8 = mw[4];
- __m64 w10 = mw[5];
- __m64 w12 = mw[6];
- __m64 w14 = mw[7];
- __m64 w16 = mw[8];
- __m64 w18 = mw[9];
- __m64 w20 = mw[10];
- __m64 w22 = mw[11];
- __m64 w26 = _mm_cvtsi32_si64(w[26]);
- __m64 w27 = _mm_cvtsi32_si64(w[27]);
-
- __m64 x0 = _mm_cvtsi32_si64(x[0]);
- __m64 x1 = _mm_cvtsi32_si64(x[1]);
- __m64 x4 = mx[2];
- __m64 x6 = mx[3];
- __m64 x8 = mx[4];
- __m64 x10 = mx[5];
- __m64 x12 = mx[6];
- __m64 x14 = mx[7];
- __m64 x16 = mx[8];
- __m64 x18 = mx[9];
- __m64 x20 = mx[10];
- __m64 x22 = mx[11];
- __m64 x26 = _mm_cvtsi32_si64(x[26]);
- __m64 x27 = _mm_cvtsi32_si64(x[27]);
-
- __m64 y0 = _mm_cvtsi32_si64(y[0]);
- __m64 y1 = _mm_cvtsi32_si64(y[1]);
- __m64 y4 = my[2];
- __m64 y6 = my[3];
- __m64 y8 = my[4];
- __m64 y10 = my[5];
- __m64 y12 = my[6];
- __m64 y14 = my[7];
- __m64 y16 = my[8];
- __m64 y18 = my[9];
- __m64 y20 = my[10];
- __m64 y22 = my[11];
- __m64 y26 = _mm_cvtsi32_si64(y[26]);
- __m64 y27 = _mm_cvtsi32_si64(y[27]);
-
- __m64 z0 = _mm_cvtsi32_si64(z[0]);
- __m64 z1 = _mm_cvtsi32_si64(z[1]);
- __m64 z4 = mz[2];
- __m64 z6 = mz[3];
- __m64 z8 = mz[4];
- __m64 z10 = mz[5];
- __m64 z12 = mz[6];
- __m64 z14 = mz[7];
- __m64 z16 = mz[8];
- __m64 z18 = mz[9];
- __m64 z20 = mz[10];
- __m64 z22 = mz[11];
- __m64 z26 = _mm_cvtsi32_si64(z[26]);
-
- s1 = _mm_add_si64(w1, w4);
- C[1] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- s2 = _mm_add_si64(w6, w8);
- s1 = _mm_add_si64(s1, s2);
- C[2] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- s2 = _mm_add_si64(w10, w12);
- s1 = _mm_add_si64(s1, s2);
- C[3] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- s3 = _mm_add_si64(x0, y0);
- s2 = _mm_add_si64(w14, w16);
- s1 = _mm_add_si64(s1, s3);
- s1 = _mm_add_si64(s1, s2);
- C[4] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- s3 = _mm_add_si64(x1, y1);
- s4 = _mm_add_si64(x4, y4);
- s1 = _mm_add_si64(s1, w18);
- s3 = _mm_add_si64(s3, s4);
- s1 = _mm_add_si64(s1, w20);
- s1 = _mm_add_si64(s1, s3);
- C[5] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- s3 = _mm_add_si64(x6, y6);
- s4 = _mm_add_si64(x8, y8);
- s1 = _mm_add_si64(s1, w22);
- s3 = _mm_add_si64(s3, s4);
- s1 = _mm_add_si64(s1, w26);
- s1 = _mm_add_si64(s1, s3);
- C[6] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- s3 = _mm_add_si64(x10, y10);
- s4 = _mm_add_si64(x12, y12);
- s1 = _mm_add_si64(s1, w27);
- s3 = _mm_add_si64(s3, s4);
- s1 = _mm_add_si64(s1, s3);
- C[7] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- s3 = _mm_add_si64(x14, y14);
- s4 = _mm_add_si64(x16, y16);
- s1 = _mm_add_si64(s1, z0);
- s3 = _mm_add_si64(s3, s4);
- s1 = _mm_add_si64(s1, s3);
- C[8] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- s3 = _mm_add_si64(x18, y18);
- s4 = _mm_add_si64(x20, y20);
- s1 = _mm_add_si64(s1, z1);
- s3 = _mm_add_si64(s3, s4);
- s1 = _mm_add_si64(s1, z4);
- s1 = _mm_add_si64(s1, s3);
- C[9] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- s3 = _mm_add_si64(x22, y22);
- s4 = _mm_add_si64(x26, y26);
- s1 = _mm_add_si64(s1, z6);
- s3 = _mm_add_si64(s3, s4);
- s1 = _mm_add_si64(s1, z8);
- s1 = _mm_add_si64(s1, s3);
- C[10] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- s3 = _mm_add_si64(x27, y27);
- s1 = _mm_add_si64(s1, z10);
- s1 = _mm_add_si64(s1, z12);
- s1 = _mm_add_si64(s1, s3);
- C[11] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- s3 = _mm_add_si64(z14, z16);
- s1 = _mm_add_si64(s1, s3);
- C[12] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- s3 = _mm_add_si64(z18, z20);
- s1 = _mm_add_si64(s1, s3);
- C[13] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- s3 = _mm_add_si64(z22, z26);
- s1 = _mm_add_si64(s1, s3);
- C[14] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- C[15] = z[27] + _mm_cvtsi64_si32(s1);
- _mm_empty();
}
-void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B)
-{
- __m128i temp[21];
- const word *w = (word *)temp;
- const __m64 *mw = (__m64 *)w;
- const word *x = (word *)temp+7*4;
- const __m64 *mx = (__m64 *)x;
- const word *y = (word *)temp+7*4*2;
- const __m64 *my = (__m64 *)y;
-
- P4_Mul(temp, (__m128i *)A, (__m128i *)B);
-
- P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
-
- P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
-
- C[0] = w[0];
-
- __m64 s1, s2, s3, s4;
-
- __m64 w1 = _mm_cvtsi32_si64(w[1]);
- __m64 w4 = mw[2];
- __m64 w6 = mw[3];
- __m64 w8 = mw[4];
- __m64 w10 = mw[5];
- __m64 w12 = mw[6];
- __m64 w14 = mw[7];
- __m64 w16 = mw[8];
- __m64 w18 = mw[9];
- __m64 w20 = mw[10];
- __m64 w22 = mw[11];
- __m64 w26 = _mm_cvtsi32_si64(w[26]);
-
- __m64 x0 = _mm_cvtsi32_si64(x[0]);
- __m64 x1 = _mm_cvtsi32_si64(x[1]);
- __m64 x4 = mx[2];
- __m64 x6 = mx[3];
- __m64 x8 = mx[4];
-
- __m64 y0 = _mm_cvtsi32_si64(y[0]);
- __m64 y1 = _mm_cvtsi32_si64(y[1]);
- __m64 y4 = my[2];
- __m64 y6 = my[3];
- __m64 y8 = my[4];
-
- s1 = _mm_add_si64(w1, w4);
- C[1] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- s2 = _mm_add_si64(w6, w8);
- s1 = _mm_add_si64(s1, s2);
- C[2] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- s2 = _mm_add_si64(w10, w12);
- s1 = _mm_add_si64(s1, s2);
- C[3] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- s3 = _mm_add_si64(x0, y0);
- s2 = _mm_add_si64(w14, w16);
- s1 = _mm_add_si64(s1, s3);
- s1 = _mm_add_si64(s1, s2);
- C[4] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- s3 = _mm_add_si64(x1, y1);
- s4 = _mm_add_si64(x4, y4);
- s1 = _mm_add_si64(s1, w18);
- s3 = _mm_add_si64(s3, s4);
- s1 = _mm_add_si64(s1, w20);
- s1 = _mm_add_si64(s1, s3);
- C[5] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- s3 = _mm_add_si64(x6, y6);
- s4 = _mm_add_si64(x8, y8);
- s1 = _mm_add_si64(s1, w22);
- s3 = _mm_add_si64(s3, s4);
- s1 = _mm_add_si64(s1, w26);
- s1 = _mm_add_si64(s1, s3);
- C[6] = _mm_cvtsi64_si32(s1);
- s1 = _mm_srli_si64(s1, 32);
-
- C[7] = _mm_cvtsi64_si32(s1) + w[27] + x[10] + y[10] + x[12] + y[12];
- _mm_empty();
+inline int Subtract(word *C, const word *A, const word *B, size_t N)
+{
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+ return s_pSub(N, C, A, B);
+#else
+ return Baseline_Sub(N, C, A, B);
+#endif
}
-#endif // #ifdef SSE2_INTRINSICS_AVAILABLE
-
// ********************************************************
+
#define A0 A
#define A1 (A+N2)
#define B0 B
@@ -2004,64 +1876,37 @@ void RecursiveMultiply(word *R, word *T, const word *A, const word *B, size_t N)
{
assert(N>=2 && N%2==0);
- if (LowLevel::MultiplyRecursionLimit() >= 8 && N==8)
- LowLevel::Multiply8(R, A, B);
- else if (LowLevel::MultiplyRecursionLimit() >= 4 && N==4)
- LowLevel::Multiply4(R, A, B);
- else if (N==2)
- LowLevel::Multiply2(R, A, B);
+ if (N <= s_recursionLimit)
+ s_pMul[N/4](R, A, B);
else
{
const size_t N2 = N/2;
- int carry;
- int aComp = Compare(A0, A1, N2);
- int bComp = Compare(B0, B1, N2);
+ size_t AN2 = Compare(A0, A1, N2) > 0 ? 0 : N2;
+ Subtract(R0, A + AN2, A + (N2 ^ AN2), N2);
- switch (2*aComp + aComp + bComp)
- {
- case -4:
- LowLevel::Subtract(R0, A1, A0, N2);
- LowLevel::Subtract(R1, B0, B1, N2);
- RecursiveMultiply(T0, T2, R0, R1, N2);
- LowLevel::Subtract(T1, T1, R0, N2);
- carry = -1;
- break;
- case -2:
- LowLevel::Subtract(R0, A1, A0, N2);
- LowLevel::Subtract(R1, B0, B1, N2);
- RecursiveMultiply(T0, T2, R0, R1, N2);
- carry = 0;
- break;
- case 2:
- LowLevel::Subtract(R0, A0, A1, N2);
- LowLevel::Subtract(R1, B1, B0, N2);
- RecursiveMultiply(T0, T2, R0, R1, N2);
- carry = 0;
- break;
- case 4:
- LowLevel::Subtract(R0, A1, A0, N2);
- LowLevel::Subtract(R1, B0, B1, N2);
- RecursiveMultiply(T0, T2, R0, R1, N2);
- LowLevel::Subtract(T1, T1, R1, N2);
- carry = -1;
- break;
- default:
- SetWords(T0, 0, N);
- carry = 0;
- }
+ size_t BN2 = Compare(B0, B1, N2) > 0 ? 0 : N2;
+ Subtract(R1, B + BN2, B + (N2 ^ BN2), N2);
- RecursiveMultiply(R0, T2, A0, B0, N2);
RecursiveMultiply(R2, T2, A1, B1, N2);
+ RecursiveMultiply(T0, T2, R0, R1, N2);
+ RecursiveMultiply(R0, T2, A0, B0, N2);
// now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1
- carry += LowLevel::Add(T0, T0, R0, N);
- carry += LowLevel::Add(T0, T0, R2, N);
- carry += LowLevel::Add(R1, R1, T0, N);
+ int c2 = Add(R2, R2, R1, N2);
+ int c3 = c2;
+ c2 += Add(R1, R2, R0, N2);
+ c3 += Add(R2, R2, R3, N2);
- assert (carry >= 0 && carry <= 2);
- Increment(R3, N2, carry);
+ if (AN2 == BN2)
+ c3 -= Subtract(R1, R1, T0, N);
+ else
+ c3 += Add(R1, R1, T0, N);
+
+ c3 += Increment(R2, N2, c2);
+ assert (c3 >= 0 && c3 <= 2);
+ Increment(R3, N2, c3);
}
}
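The rewritten RecursiveMultiply replaces the old four-way switch with an offset trick: AN2 is 0 when A0 > A1 and N2 otherwise, so A + AN2 points at the larger half and A + (N2 ^ AN2) at the smaller, giving R0 = |A0 - A1| with no branch on subtraction order. When AN2 == BN2 the two differences have the same sign, so their product |A0-A1|*|B0-B1| is subtracted from the middle Karatsuba term; otherwise it is added. A one-limb illustration of the offset trick:

#include <cassert>
#include <cstddef>

int main()
{
	const std::size_t N2 = 1;
	unsigned A[2] = { 7, 3 };                  // A0 = 7, A1 = 3
	std::size_t AN2 = (A[0] > A[1]) ? 0 : N2;  // 0 here: A0 is the larger half
	unsigned diff = A[AN2] - A[N2 ^ AN2];      // |A0 - A1|, never negative
	assert(AN2 == 0 && diff == 4);
	return 0;
}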
@@ -2072,12 +1917,9 @@ void RecursiveMultiply(word *R, word *T, const word *A, const word *B, size_t N)
void RecursiveSquare(word *R, word *T, const word *A, size_t N)
{
assert(N && N%2==0);
- if (LowLevel::SquareRecursionLimit() >= 8 && N==8)
- LowLevel::Square8(R, A);
- if (LowLevel::SquareRecursionLimit() >= 4 && N==4)
- LowLevel::Square4(R, A);
- else if (N==2)
- LowLevel::Square2(R, A);
+
+ if (N <= s_recursionLimit)
+ s_pSqu[N/4](R, A);
else
{
const size_t N2 = N/2;
@@ -2086,35 +1928,32 @@ void RecursiveSquare(word *R, word *T, const word *A, size_t N)
RecursiveSquare(R2, T2, A1, N2);
RecursiveMultiply(T0, T2, A0, A1, N2);
- int carry = LowLevel::Add(R1, R1, T0, N);
- carry += LowLevel::Add(R1, R1, T0, N);
+ int carry = Add(R1, R1, T0, N);
+ carry += Add(R1, R1, T0, N);
Increment(R3, N2, carry);
}
}
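RecursiveSquare relies on the identity (A1*b + A0)^2 = A1^2*b^2 + 2*A0*A1*b + A0^2 with b = 2^(WORD_BITS*N2), so only three half-size products are needed, and the factor of two on the cross term is realized by calling Add(R1, R1, T0, N) twice. A quick single-value check of the identity:

#include <cassert>
#include <cstdint>

int main()
{
	const uint64_t b = 1u << 16;   // 16-bit halves keep everything in range
	uint64_t A0 = 0x1234, A1 = 0x00ab;
	uint64_t A = A1*b + A0;
	// the cross term is added twice, mirroring the two Add calls above
	uint64_t square = A1*A1*b*b + A0*A1*b + A0*A1*b + A0*A0;
	assert(square == A*A);
	return 0;
}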
// R[N] - bottom half of A*B
-// T[N] - temporary work space
+// T[3*N/2] - temporary work space
// A[N] - multiplier
// B[N] - multiplicand
void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N)
{
assert(N>=2 && N%2==0);
- if (LowLevel::MultiplyBottomRecursionLimit() >= 8 && N==8)
- LowLevel::Multiply8Bottom(R, A, B);
- else if (LowLevel::MultiplyBottomRecursionLimit() >= 4 && N==4)
- LowLevel::Multiply4Bottom(R, A, B);
- else if (N==2)
- LowLevel::Multiply2Bottom(R, A, B);
+
+ if (N <= s_recursionLimit)
+ s_pBot[N/4](R, A, B);
else
{
const size_t N2 = N/2;
RecursiveMultiply(R, T, A0, B0, N2);
RecursiveMultiplyBottom(T0, T1, A1, B0, N2);
- LowLevel::Add(R1, R1, T0, N2);
+ Add(R1, R1, T0, N2);
RecursiveMultiplyBottom(T0, T1, A0, B1, N2);
- LowLevel::Add(R1, R1, T0, N2);
+ Add(R1, R1, T0, N2);
}
}
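RecursiveMultiplyBottom keeps only the low N words, so the A1*B1 term vanishes entirely: low(A*B) = low(A0*B0 + (A1*B0 + A0*B1)*b) with b = 2^(WORD_BITS*N2), which costs one full half-size multiply plus two half-size bottom multiplies. A sketch of the recurrence where uint32_t wraparound plays the role of reducing mod 2^(WORD_BITS*N):

#include <cassert>
#include <cstdint>

int main()
{
	const uint32_t b = 1u << 16;   // 16-bit halves, so the low half is 32 bits
	uint32_t A0 = 0xBEEF, A1 = 0x1111, B0 = 0xCAFE, B1 = 0x2222;
	uint32_t full   = (A1*b + A0) * (B1*b + B0);    // wraps mod 2^32
	uint32_t bottom = A0*B0 + (A1*B0 + A0*B1) * b;  // A1*B1*b*b == 0 mod 2^32
	assert(full == bottom);
	return 0;
}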
@@ -2124,88 +1963,61 @@ void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, siz
// A[N] --- multiplier
// B[N] --- multiplicand
-void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N)
+void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N)
{
assert(N>=2 && N%2==0);
- if (N==4)
- {
- LowLevel::Multiply4(T, A, B);
- memcpy(R, T+4, 4*WORD_SIZE);
- }
- else if (N==2)
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+ if (HasSSE2() && ((N>=8) & (N<=32)))
+ s_pTop[N/16](R, A, B, L[N-1]);
+ else
+#endif
+ if (N<=4)
{
- LowLevel::Multiply2(T, A, B);
- memcpy(R, T+2, 2*WORD_SIZE);
+ s_pMul[N/4](T, A, B);
+ memcpy(R, T+N, N*WORD_SIZE);
}
else
{
const size_t N2 = N/2;
- int carry;
- int aComp = Compare(A0, A1, N2);
- int bComp = Compare(B0, B1, N2);
+ size_t AN2 = Compare(A0, A1, N2) > 0 ? 0 : N2;
+ Subtract(R0, A + AN2, A + (N2 ^ AN2), N2);
- switch (2*aComp + aComp + bComp)
- {
- case -4:
- LowLevel::Subtract(R0, A1, A0, N2);
- LowLevel::Subtract(R1, B0, B1, N2);
- RecursiveMultiply(T0, T2, R0, R1, N2);
- LowLevel::Subtract(T1, T1, R0, N2);
- carry = -1;
- break;
- case -2:
- LowLevel::Subtract(R0, A1, A0, N2);
- LowLevel::Subtract(R1, B0, B1, N2);
- RecursiveMultiply(T0, T2, R0, R1, N2);
- carry = 0;
- break;
- case 2:
- LowLevel::Subtract(R0, A0, A1, N2);
- LowLevel::Subtract(R1, B1, B0, N2);
- RecursiveMultiply(T0, T2, R0, R1, N2);
- carry = 0;
- break;
- case 4:
- LowLevel::Subtract(R0, A1, A0, N2);
- LowLevel::Subtract(R1, B0, B1, N2);
- RecursiveMultiply(T0, T2, R0, R1, N2);
- LowLevel::Subtract(T1, T1, R1, N2);
- carry = -1;
- break;
- default:
- SetWords(T0, 0, N);
- carry = 0;
- }
-
- RecursiveMultiply(T2, R0, A1, B1, N2);
+ size_t BN2 = Compare(B0, B1, N2) > 0 ? 0 : N2;
+ Subtract(R1, B + BN2, B + (N2 ^ BN2), N2);
- // now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1
+ RecursiveMultiply(T0, T2, R0, R1, N2);
+ RecursiveMultiply(R0, T2, A1, B1, N2);
- int c2 = LowLevel::Subtract(R0, L+N2, L, N2);
- c2 += LowLevel::Subtract(R0, R0, T0, N2);
- int t = (Compare(R0, T2, N2) == -1);
+ // now T[01] holds (A1-A0)*(B0-B1) = A1*B0+A0*B1-A1*B1-A0*B0, R[01] holds A1*B1
- carry += t;
- carry += Increment(R0, N2, c2+t);
- carry += LowLevel::Add(R0, R0, T1, N2);
- carry += LowLevel::Add(R0, R0, T3, N2);
- assert (carry >= 0 && carry <= 2);
+ int t, c3;
+ int c2 = Subtract(T2, L+N2, L, N2);
- CopyWords(R1, T3, N2);
- Increment(R1, N2, carry);
- }
-}
+ if (AN2 == BN2)
+ {
+ c2 -= Add(T2, T2, T0, N2);
+ t = (Compare(T2, R0, N2) == -1);
+ c3 = t - Subtract(T2, T2, T1, N2);
+ }
+ else
+ {
+ c2 += Subtract(T2, T2, T0, N2);
+ t = (Compare(T2, R0, N2) == -1);
+ c3 = t + Add(T2, T2, T1, N2);
+ }
-inline int Add(word *C, const word *A, const word *B, size_t N)
-{
- return LowLevel::Add(C, A, B, N);
-}
+ c2 += t;
+ if (c2 >= 0)
+ c3 += Increment(T2, N2, c2);
+ else
+ c3 -= Decrement(T2, N2, -c2);
+ c3 += Add(R0, T2, R1, N2);
-inline int Subtract(word *C, const word *A, const word *B, size_t N)
-{
- return LowLevel::Subtract(C, A, B, N);
+ assert (c3 >= 0 && c3 <= 2);
+ Increment(R1, N2, c3);
+ }
}
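MultiplyTop makes do with two half-size recursive products instead of three because the caller supplies L, the already-computed low half: the middle term obeys A0*B1 + A1*B0 = A0*B0 + A1*B1 - (A0-A1)*(B0-B1), and the A0*B0 information that would otherwise require a third multiply is recovered from L via the Subtract(T2, L+N2, L, N2) step. A one-value check of the identity, with both differences chosen positive so no sign bookkeeping is needed:

#include <cassert>
#include <cstdint>

int main()
{
	uint64_t A0 = 0xBEEF, A1 = 0x1111, B0 = 0xCAFE, B1 = 0x2222;
	uint64_t mid  = A0*B1 + A1*B0;
	uint64_t diff = (A0 - A1) * (B0 - B1);   // both differences positive here
	assert(mid == A0*B0 + A1*B1 - diff);
	return 0;
}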
inline void Multiply(word *R, word *T, const word *A, const word *B, size_t N)
@@ -2223,23 +2035,6 @@ inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, size_
RecursiveMultiplyBottom(R, T, A, B, N);
}
-inline void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N)
-{
- RecursiveMultiplyTop(R, T, L, A, B, N);
-}
-
-static word LinearMultiply(word *C, const word *A, word B, size_t N)
-{
- word carry=0;
- for(unsigned i=0; i<N; i++)
- {
- DWord p = DWord::MultiplyAndAdd(A[i], B, carry);
- C[i] = p.GetLowHalf();
- carry = p.GetHighHalf();
- }
- return carry;
-}
-
// R[NA+NB] - result = A*B
// T[NA+NB] - temporary work space
// A[NA] ---- multiplier
@@ -2264,7 +2059,6 @@ void AsymmetricMultiply(word *R, word *T, const word *A, size_t NA, const word *
}
assert(NB % NA == 0);
- assert((NB/NA)%2 == 0); // NB is an even multiple of NA
if (NA==2 && !A[1])
{
@@ -2284,15 +2078,24 @@ void AsymmetricMultiply(word *R, word *T, const word *A, size_t NA, const word *
}
}
- Multiply(R, T, A, B, NA);
- CopyWords(T+2*NA, R+NA, NA);
-
size_t i;
+ if ((NB/NA)%2 == 0)
+ {
+ Multiply(R, T, A, B, NA);
+ CopyWords(T+2*NA, R+NA, NA);
- for (i=2*NA; i<NB; i+=2*NA)
- Multiply(T+NA+i, T, A, B+i, NA);
- for (i=NA; i<NB; i+=2*NA)
- Multiply(R+i, T, A, B+i, NA);
+ for (i=2*NA; i<NB; i+=2*NA)
+ Multiply(T+NA+i, T, A, B+i, NA);
+ for (i=NA; i<NB; i+=2*NA)
+ Multiply(R+i, T, A, B+i, NA);
+ }
+ else
+ {
+ for (i=0; i<NB; i+=2*NA)
+ Multiply(R+i, T, A, B+i, NA);
+ for (i=NA; i<NB; i+=2*NA)
+ Multiply(T+NA+i, T, A, B+i, NA);
+ }
if (Add(R+NA, R+NA, T+2*NA, NB-NA))
Increment(R+NB, NA);
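The restructured loop writes the NA-by-NA partial products alternately into R and into T (shifted by NA), so products landing in the same buffer never overlap, and one length-(NB-NA) addition merges the two streams; the new even/odd branch only decides which buffer takes the first product. A toy version of the even case with 8-bit words, NA = 1 and NB = 4 (a sketch of the layout, not the library's code path):

#include <cassert>
#include <cstddef>
#include <cstdint>

static void Mul1x1(uint8_t *out, uint8_t a, uint8_t b)   // one-word product
{
	uint16_t p = (uint16_t)a * b;
	out[0] = (uint8_t)p;
	out[1] = (uint8_t)(p >> 8);
}

int main()
{
	const std::size_t NA = 1, NB = 4;
	uint8_t A = 0xAB, B[4] = { 0x12, 0x34, 0x56, 0x78 };  // B = 0x78563412
	uint8_t R[5] = {0}, T[5] = {0};

	Mul1x1(R, A, B[0]);                 // first product -> R[0..2)
	T[2*NA] = R[NA];                    // save its high word before overwrite
	for (std::size_t i = 2*NA; i < NB; i += 2*NA)
		Mul1x1(T+NA+i, A, B[i]);        // even-index chunks -> T, shifted by NA
	for (std::size_t i = NA; i < NB; i += 2*NA)
		Mul1x1(R+i, A, B[i]);           // odd-index chunks -> R

	unsigned carry = 0;                 // Add(R+NA, R+NA, T+2*NA, NB-NA)
	for (std::size_t i = 0; i < NB-NA; ++i)
	{
		unsigned s = R[NA+i] + T[2*NA+i] + carry;
		R[NA+i] = (uint8_t)s;
		carry = s >> 8;
	}
	R[NB] = (uint8_t)(R[NB] + carry);   // Increment(R+NB, NA)

	uint64_t prod = 0;                  // little-endian reassembly and check
	for (int i = (int)NB; i >= 0; --i)
		prod = (prod << 8) | R[i];
	assert(prod == (uint64_t)0xAB * 0x78563412u);
	return 0;
}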
@@ -2308,10 +2111,10 @@ void RecursiveInverseModPower2(word *R, word *T, const word *A, size_t N)
{
T[0] = AtomicInverseModPower2(A[0]);
T[1] = 0;
- LowLevel::Multiply2Bottom(T+2, T, A);
+ s_pBot[0](T+2, T, A);
TwosComplement(T+2, 2);
Increment(T+2, 2, 2);
- LowLevel::Multiply2Bottom(R, T, T+2);
+ s_pBot[0](R, T, T+2);
}
else
{
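RecursiveInverseModPower2 is Newton's iteration for an inverse modulo a power of two: if T*A == 1 (mod 2^k), then T*(2 - A*T) == A^-1 (mod 2^(2k)), and the TwosComplement plus Increment-by-2 pair above computes exactly the 2 - A*T factor. A single-word version of the iteration, assuming 32-bit words:

#include <cassert>
#include <cstdint>

uint32_t InverseMod2_32(uint32_t a)
{
	assert(a & 1);       // only odd values are invertible mod 2^32
	uint32_t x = a;      // a*a == 1 (mod 8), so x starts 3 bits correct
	for (int i = 0; i < 4; ++i)
		x *= 2 - a*x;    // 3 -> 6 -> 12 -> 24 -> 48 correct bits
	return x;
}

int main()
{
	uint32_t a = 0xDEADBEEFu;   // odd, so invertible
	assert((uint32_t)(a * InverseMod2_32(a)) == 1u);
	return 0;
}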
@@ -2333,8 +2136,9 @@ void RecursiveInverseModPower2(word *R, word *T, const word *A, size_t N)
// M[N] --- modulus
// U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N)
-void MontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, size_t N)
+void MontgomeryReduce(word *R, word *T, word *X, const word *M, const word *U, size_t N)
{
+#if 1
MultiplyBottom(R, T, X, U, N);
MultiplyTop(T, T+N, X, R, M, N);
word borrow = Subtract(T, X+N, T, N);
@@ -2342,6 +2146,60 @@ void MontgomeryReduce(word *R, word *T, const word *X, const word *M, const word
word carry = Add(T+N, T, M, N);
assert(carry || !borrow);
CopyWords(R, T + (borrow ? N : 0), N);
+#elif 0
+ const word u = 0-U[0];
+ Declare2Words(p)
+ for (size_t i=0; i<N; i++)
+ {
+ const word t = u * X[i];
+ word c = 0;
+ for (size_t j=0; j<N; j+=2)
+ {
+ MultiplyWords(p, t, M[j]);
+ Acc2WordsBy1(p, X[i+j]);
+ Acc2WordsBy1(p, c);
+ X[i+j] = LowWord(p);
+ c = HighWord(p);
+ MultiplyWords(p, t, M[j+1]);
+ Acc2WordsBy1(p, X[i+j+1]);
+ Acc2WordsBy1(p, c);
+ X[i+j+1] = LowWord(p);
+ c = HighWord(p);
+ }
+
+ if (Increment(X+N+i, N-i, c))
+ while (!Subtract(X+N, X+N, M, N)) {}
+ }
+
+ memcpy(R, X+N, N*WORD_SIZE);
+#else
+ __m64 u = _mm_cvtsi32_si64(0-U[0]), p;
+ for (size_t i=0; i<N; i++)
+ {
+ __m64 t = _mm_cvtsi32_si64(X[i]);
+ t = _mm_mul_su32(t, u);
+ __m64 c = _mm_setzero_si64();
+ for (size_t j=0; j<N; j+=2)
+ {
+ p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j]));
+ p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j]));
+ c = _mm_add_si64(c, p);
+ X[i+j] = _mm_cvtsi64_si32(c);
+ c = _mm_srli_si64(c, 32);
+ p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j+1]));
+ p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j+1]));
+ c = _mm_add_si64(c, p);
+ X[i+j+1] = _mm_cvtsi64_si32(c);
+ c = _mm_srli_si64(c, 32);
+ }
+
+ if (Increment(X+N+i, N-i, _mm_cvtsi64_si32(c)))
+ while (!Subtract(X+N, X+N, M, N)) {}
+ }
+
+ memcpy(R, X+N, N*WORD_SIZE);
+ _mm_empty();
+#endif
}
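The active #if 1 branch is standard Montgomery reduction with b = 2^(WORD_BITS*N): k = X*U mod b satisfies k*M == X (mod b), so X - k*M is divisible by b, (X - k*M)/b == X*b^-1 (mod M), and the final Add/CopyWords pair adds M back when the subtraction borrowed. A one-word sketch under the same U = M^-1 mod b convention documented above:

#include <cassert>
#include <cstdint>

// Returns x * 2^-32 mod m for odd m and x < m * 2^32, given u = m^-1 mod 2^32.
uint32_t MontReduce(uint64_t x, uint32_t m, uint32_t u)
{
	uint32_t k = (uint32_t)x * u;                    // k*m == x (mod 2^32)
	int64_t t = (int64_t)(x >> 32)                   // (x - k*m) / 2^32:
	          - (int64_t)(((uint64_t)k * m) >> 32);  // the low words cancel
	return (uint32_t)(t < 0 ? t + m : t);            // conditional fix-up
}

int main()
{
	// m = 7, u = 7^-1 mod 2^32 = 0xB6DB6DB7, and 10 * 2^-32 == 6 (mod 7)
	assert(MontReduce(10, 7, 0xB6DB6DB7u) == 6);
	return 0;
}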
// R[N] --- result = X/(2**(WORD_BITS*N/2)) mod M
@@ -2491,7 +2349,7 @@ static inline void AtomicDivide(word *Q, const word *A, const word *B)
// multiply quotient and divisor and add remainder, make sure it equals dividend
assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
word P[4];
- Portable::Multiply2(P, Q, B);
+ s_pMul[0](P, Q, B);
Add(P, P, T, 4);
assert(memcmp(P, A, 4*WORD_SIZE)==0);
}
@@ -2503,21 +2361,7 @@ static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, si
{
assert(N && N%2==0);
- if (Q[1])
- {
- T[N] = T[N+1] = 0;
- unsigned i;
- for (i=0; i<N; i+=4)
- LowLevel::Multiply2(T+i, Q, B+i);
- for (i=2; i<N; i+=4)
- if (LowLevel::Multiply2Add(T+i, Q, B+i))
- T[i+5] += (++T[i+4]==0);
- }
- else
- {
- T[N] = LinearMultiply(T, B, Q[0], N);
- T[N+1] = 0;
- }
+ AsymmetricMultiply(T, T+N+2, Q, 2, B, N);
word borrow = Subtract(R, R, T, N+2);
assert(!borrow && !R[N+1]);
@@ -2532,7 +2376,7 @@ static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, si
// R[NB] -------- remainder = A%B
// Q[NA-NB+2] --- quotient = A/B
-// T[NA+2*NB+4] - temp work space
+// T[NA+3*(NB+2)] - temp work space
// A[NA] -------- dividend
// B[NB] -------- divisor
@@ -2726,9 +2570,7 @@ InitializeInteger::InitializeInteger()
{
if (!g_pAssignIntToInteger)
{
-#ifdef CRYPTOPP_X86ASM_AVAILABLE
- SetPentiumFunctionPointers();
-#endif
+ SetFunctionPointers();
g_pAssignIntToInteger = AssignIntToInteger;
}
}
@@ -2877,7 +2719,8 @@ Integer& Integer::operator=(const Integer& t)
{
if (this != &t)
{
- reg.New(RoundupSize(t.WordCount()));
+ if (reg.size() != t.reg.size() || t.reg[t.reg.size()/2] == 0)
+ reg.New(RoundupSize(t.WordCount()));
CopyWords(reg, t.reg, reg.size());
sign = t.sign;
}
@@ -3240,7 +3083,7 @@ public:
void GenerateBlock(byte *output, size_t size)
{
- UnalignedPutWord(BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter);
+ PutWord(false, BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter);
++m_counter;
P1363_KDF2<SHA1>::DeriveKey(output, size, m_counterAndSeed, m_counterAndSeed.size(), NULL, 0);
}
@@ -3657,7 +3500,7 @@ void PositiveMultiply(Integer &product, const Integer &a, const Integer &b)
product.reg.CleanNew(RoundupSize(aSize+bSize));
product.sign = Integer::POSITIVE;
- SecAlignedWordBlock workspace(aSize + bSize);
+ IntegerSecBlock workspace(aSize + bSize);
AsymmetricMultiply(product.reg, workspace, a.reg, aSize, b.reg, bSize);
}
@@ -3723,7 +3566,7 @@ void PositiveDivide(Integer &remainder, Integer &quotient,
quotient.reg.CleanNew(RoundupSize(aSize-bSize+2));
quotient.sign = Integer::POSITIVE;
- SecAlignedWordBlock T(aSize+2*bSize+4);
+ IntegerSecBlock T(aSize+3*(bSize+2));
Divide(remainder.reg, quotient.reg, T, a.reg, aSize, b.reg, bSize);
}
diff --git a/integer.h b/integer.h
index 547e3778..4e93c3a1 100644
--- a/integer.h
+++ b/integer.h
@@ -11,44 +11,13 @@
NAMESPACE_BEGIN(CryptoPP)
-#if defined(SSE2_INTRINSICS_AVAILABLE)
- template <class T>
- class AlignedAllocator : public AllocatorBase<T>
- {
- public:
- CRYPTOPP_INHERIT_ALLOCATOR_TYPES
-
- pointer allocate(size_type n, const void *);
- void deallocate(void *p, size_type n);
- pointer reallocate(T *p, size_type oldSize, size_type newSize, bool preserve)
- {
- return StandardReallocate(*this, p, oldSize, newSize, preserve);
- }
-
- #if !(defined(CRYPTOPP_MALLOC_ALIGNMENT_IS_16) || defined(CRYPTOPP_MEMALIGN_AVAILABLE) || defined(CRYPTOPP_MM_MALLOC_AVAILABLE))
- #define CRYPTOPP_NO_ALIGNED_ALLOC
- AlignedAllocator() : m_pBlock(NULL) {}
- protected:
- void *m_pBlock;
- #endif
- };
-
- #ifdef CRYPTOPP_IMPORTS
- CRYPTOPP_DLL_TEMPLATE_CLASS AlignedAllocator<word>;
- #endif
-
- typedef SecBlock<word, AlignedAllocator<word> > SecAlignedWordBlock;
-#else
- typedef SecWordBlock SecAlignedWordBlock;
-#endif
-
-void CRYPTOPP_DLL CRYPTOPP_API DisableSSE2();
-
struct InitializeInteger // used to initialize static variables
{
InitializeInteger();
};
+typedef SecBlock<word, AllocatorWithCleanup<word, CRYPTOPP_BOOL_X86> > IntegerSecBlock;
+
//! multiple precision integer and basic arithmetics
/*! This class can represent positive and negative integers
with absolute value less than (256**sizeof(word)) ** (256**sizeof(int)).
@@ -406,7 +375,7 @@ private:
friend void PositiveMultiply(Integer &product, const Integer &a, const Integer &b);
friend void PositiveDivide(Integer &remainder, Integer &quotient, const Integer &dividend, const Integer &divisor);
- SecAlignedWordBlock reg;
+ IntegerSecBlock reg;
Sign sign;
};
diff --git a/rijndael.cpp b/rijndael.cpp
index 2a1a19ef..4a8572f2 100644
--- a/rijndael.cpp
+++ b/rijndael.cpp
@@ -51,10 +51,7 @@ being unloaded from L1 cache, until that round is finished.
#include "rijndael.h"
#include "misc.h"
-
-#ifdef CRYPTOPP_L1_CACHE_ALIGN_NOT_AVAILABLE
-#pragma message("Don't know how to align data on L1 cache boundary. Defense against AES timing attack may be affected.")
-#endif
+#include "cpu.h"
NAMESPACE_BEGIN(CryptoPP)
@@ -122,25 +119,25 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
for (i = 1; i < m_rounds; i++) {
rk += 4;
rk[0] =
- Td0[Se[GETBYTE(rk[0], 3)]] ^
- Td1[Se[GETBYTE(rk[0], 2)]] ^
- Td2[Se[GETBYTE(rk[0], 1)]] ^
- Td3[Se[GETBYTE(rk[0], 0)]];
+ Td[0*256+Se[GETBYTE(rk[0], 3)]] ^
+ Td[1*256+Se[GETBYTE(rk[0], 2)]] ^
+ Td[2*256+Se[GETBYTE(rk[0], 1)]] ^
+ Td[3*256+Se[GETBYTE(rk[0], 0)]];
rk[1] =
- Td0[Se[GETBYTE(rk[1], 3)]] ^
- Td1[Se[GETBYTE(rk[1], 2)]] ^
- Td2[Se[GETBYTE(rk[1], 1)]] ^
- Td3[Se[GETBYTE(rk[1], 0)]];
+ Td[0*256+Se[GETBYTE(rk[1], 3)]] ^
+ Td[1*256+Se[GETBYTE(rk[1], 2)]] ^
+ Td[2*256+Se[GETBYTE(rk[1], 1)]] ^
+ Td[3*256+Se[GETBYTE(rk[1], 0)]];
rk[2] =
- Td0[Se[GETBYTE(rk[2], 3)]] ^
- Td1[Se[GETBYTE(rk[2], 2)]] ^
- Td2[Se[GETBYTE(rk[2], 1)]] ^
- Td3[Se[GETBYTE(rk[2], 0)]];
+ Td[0*256+Se[GETBYTE(rk[2], 3)]] ^
+ Td[1*256+Se[GETBYTE(rk[2], 2)]] ^
+ Td[2*256+Se[GETBYTE(rk[2], 1)]] ^
+ Td[3*256+Se[GETBYTE(rk[2], 0)]];
rk[3] =
- Td0[Se[GETBYTE(rk[3], 3)]] ^
- Td1[Se[GETBYTE(rk[3], 2)]] ^
- Td2[Se[GETBYTE(rk[3], 1)]] ^
- Td3[Se[GETBYTE(rk[3], 0)]];
+ Td[0*256+Se[GETBYTE(rk[3], 3)]] ^
+ Td[1*256+Se[GETBYTE(rk[3], 2)]] ^
+ Td[2*256+Se[GETBYTE(rk[3], 1)]] ^
+ Td[3*256+Se[GETBYTE(rk[3], 0)]];
}
}
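The key-schedule rewrite above folds the four tables Td0..Td3 into a single Td[4*256], turning Tdn[i] into Td[n*256 + i] (Te gets the same treatment below); one contiguous table is presumably what lets the cache-timing countermeasures elsewhere in this diff preload the data with a single stride loop. The indexing equivalence, as a tiny check:

#include <cassert>
#include <cstdint>

static uint32_t Td[4*256];   // flattened stand-in for Td0..Td3

int main()
{
	uint32_t (*rows)[256] = (uint32_t (*)[256])Td;   // old four-table view
	rows[1][0x2a] = 0xDEADBEEF;
	assert(Td[1*256 + 0x2a] == 0xDEADBEEF);          // Td1[i] == Td[1*256 + i]
	return 0;
}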
@@ -148,15 +145,245 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key + m_rounds*4, m_key + m_rounds*4, 16);
}
-const static unsigned int s_lineSizeDiv4 = CRYPTOPP_L1_CACHE_LINE_SIZE/4;
-#ifdef IS_BIG_ENDIAN
-const static unsigned int s_i3=3, s_i2=2, s_i1=1, s_i0=0;
-#else
-const static unsigned int s_i3=0, s_i2=1, s_i1=2, s_i0=3;
-#endif
+#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
+#ifdef CRYPTOPP_X86_ASM_AVAILABLE
+ if (HasMMX())
+ {
+ const word32 *k = m_key;
+ const word32 *kLoopEnd = k + m_rounds*4;
+#ifdef __GNUC__
+ word32 t0, t1, t2, t3;
+ __asm__ __volatile__
+ (
+ ".intel_syntax noprefix;"
+ AS1( push ebx)
+ AS1( push ebp)
+ AS2( mov ebp, eax)
+ AS2( movd mm5, ecx)
+#else
+ AS2( mov edx, g_cacheLineSize)
+ AS2( mov edi, inBlock)
+ AS2( mov esi, k)
+ AS2( movd mm5, kLoopEnd)
+ AS1( push ebp)
+ AS2( lea ebp, Te)
+#endif
+ AS2( mov eax, [esi+0*4]) // s0
+ AS2( xor eax, [edi+0*4])
+ AS2( movd mm0, eax)
+ AS2( mov ebx, [esi+1*4])
+ AS2( xor ebx, [edi+1*4])
+ AS2( movd mm1, ebx)
+ AS2( and ebx, eax)
+ AS2( mov eax, [esi+2*4])
+ AS2( xor eax, [edi+2*4])
+ AS2( movd mm2, eax)
+ AS2( and ebx, eax)
+ AS2( mov ecx, [esi+3*4])
+ AS2( xor ecx, [edi+3*4])
+ AS2( and ebx, ecx)
+
+	// read Te0 into L1 cache. this code could be simplified by using lfence, but that is an SSE2 instruction
+ AS2( and ebx, 0)
+ AS2( mov edi, ebx) // make index depend on previous loads to simulate lfence
+ ASL(2)
+ AS2( and ebx, [ebp+edi])
+ AS2( add edi, edx)
+ AS2( and ebx, [ebp+edi])
+ AS2( add edi, edx)
+ AS2( and ebx, [ebp+edi])
+ AS2( add edi, edx)
+ AS2( and ebx, [ebp+edi])
+ AS2( add edi, edx)
+ AS2( cmp edi, 1024)
+ ASJ( jl, 2, b)
+ AS2( and ebx, [ebp+1020])
+ AS2( movd mm6, ebx)
+ AS2( pxor mm2, mm6)
+ AS2( pxor mm1, mm6)
+ AS2( pxor mm0, mm6)
+ AS2( xor ecx, ebx)
+
+ AS2( mov edi, [esi+4*4]) // t0
+ AS2( mov eax, [esi+5*4])
+ AS2( mov ebx, [esi+6*4])
+ AS2( mov edx, [esi+7*4])
+ AS2( add esi, 8*4)
+ AS2( movd mm4, esi)
+
+#define QUARTER_ROUND(t, a, b, c, d) \
+ AS2(movzx esi, t##l)\
+ AS2(d, [ebp+0*1024+4*esi])\
+ AS2(movzx esi, t##h)\
+ AS2(c, [ebp+1*1024+4*esi])\
+ AS2(shr e##t##x, 16)\
+ AS2(movzx esi, t##l)\
+ AS2(b, [ebp+2*1024+4*esi])\
+ AS2(movzx esi, t##h)\
+ AS2(a, [ebp+3*1024+4*esi])
+
+#define s0 xor edi
+#define s1 xor eax
+#define s2 xor ebx
+#define s3 xor ecx
+#define t0 xor edi
+#define t1 xor eax
+#define t2 xor ebx
+#define t3 xor edx
+
+ QUARTER_ROUND(c, t0, t1, t2, t3)
+ AS2( movd ecx, mm2)
+ QUARTER_ROUND(c, t3, t0, t1, t2)
+ AS2( movd ecx, mm1)
+ QUARTER_ROUND(c, t2, t3, t0, t1)
+ AS2( movd ecx, mm0)
+ QUARTER_ROUND(c, t1, t2, t3, t0)
+ AS2( movd mm2, ebx)
+ AS2( movd mm1, eax)
+ AS2( movd mm0, edi)
+#undef QUARTER_ROUND
+
+ AS2( movd esi, mm4)
+
+ ASL(0)
+ AS2( mov edi, [esi+0*4])
+ AS2( mov eax, [esi+1*4])
+ AS2( mov ebx, [esi+2*4])
+ AS2( mov ecx, [esi+3*4])
+
+#define QUARTER_ROUND(t, a, b, c, d) \
+ AS2(movzx esi, t##l)\
+ AS2(a, [ebp+3*1024+4*esi])\
+ AS2(movzx esi, t##h)\
+ AS2(b, [ebp+2*1024+4*esi])\
+ AS2(shr e##t##x, 16)\
+ AS2(movzx esi, t##l)\
+ AS2(c, [ebp+1*1024+4*esi])\
+ AS2(movzx esi, t##h)\
+ AS2(d, [ebp+0*1024+4*esi])
+
+ QUARTER_ROUND(d, s0, s1, s2, s3)
+ AS2( movd edx, mm2)
+ QUARTER_ROUND(d, s3, s0, s1, s2)
+ AS2( movd edx, mm1)
+ QUARTER_ROUND(d, s2, s3, s0, s1)
+ AS2( movd edx, mm0)
+ QUARTER_ROUND(d, s1, s2, s3, s0)
+ AS2( movd esi, mm4)
+ AS2( movd mm2, ebx)
+ AS2( movd mm1, eax)
+ AS2( movd mm0, edi)
+
+ AS2( mov edi, [esi+4*4])
+ AS2( mov eax, [esi+5*4])
+ AS2( mov ebx, [esi+6*4])
+ AS2( mov edx, [esi+7*4])
+
+ QUARTER_ROUND(c, t0, t1, t2, t3)
+ AS2( movd ecx, mm2)
+ QUARTER_ROUND(c, t3, t0, t1, t2)
+ AS2( movd ecx, mm1)
+ QUARTER_ROUND(c, t2, t3, t0, t1)
+ AS2( movd ecx, mm0)
+ QUARTER_ROUND(c, t1, t2, t3, t0)
+ AS2( movd mm2, ebx)
+ AS2( movd mm1, eax)
+ AS2( movd mm0, edi)
+
+ AS2( movd esi, mm4)
+ AS2( movd edi, mm5)
+ AS2( add esi, 8*4)
+ AS2( movd mm4, esi)
+ AS2( cmp edi, esi)
+ ASJ( jne, 0, b)
+
+#undef QUARTER_ROUND
+#undef s0
+#undef s1
+#undef s2
+#undef s3
+#undef t0
+#undef t1
+#undef t2
+#undef t3
+
+ AS2( mov eax, [edi+0*4])
+ AS2( mov ecx, [edi+1*4])
+ AS2( mov esi, [edi+2*4])
+ AS2( mov edi, [edi+3*4])
+
+#define QUARTER_ROUND(a, b, c, d) \
+ AS2( movzx ebx, dl)\
+ AS2( movzx ebx, BYTE PTR [ebp+1+4*ebx])\
+ AS2( shl ebx, 3*8)\
+ AS2( xor a, ebx)\
+ AS2( movzx ebx, dh)\
+ AS2( movzx ebx, BYTE PTR [ebp+1+4*ebx])\
+ AS2( shl ebx, 2*8)\
+ AS2( xor b, ebx)\
+ AS2( shr edx, 16)\
+ AS2( movzx ebx, dl)\
+ AS2( shr edx, 8)\
+ AS2( movzx ebx, BYTE PTR [ebp+1+4*ebx])\
+ AS2( shl ebx, 1*8)\
+ AS2( xor c, ebx)\
+ AS2( movzx ebx, BYTE PTR [ebp+1+4*edx])\
+ AS2( xor d, ebx)
+
+ QUARTER_ROUND(eax, ecx, esi, edi)
+ AS2( movd edx, mm2)
+ QUARTER_ROUND(edi, eax, ecx, esi)
+ AS2( movd edx, mm1)
+ QUARTER_ROUND(esi, edi, eax, ecx)
+ AS2( movd edx, mm0)
+ QUARTER_ROUND(ecx, esi, edi, eax)
+
+#undef QUARTER_ROUND
+
+ AS1( pop ebp)
+ AS1( emms)
+
+#ifdef __GNUC__
+ AS1( pop ebx)
+ ".att_syntax prefix;"
+ : "=a" (t0), "=c" (t1), "=S" (t2), "=D" (t3)
+ : "a" (Te), "D" (inBlock), "S" (k), "c" (kLoopEnd), "d" (g_cacheLineSize)
+ : "memory", "cc"
+ );
+
+ if (xorBlock)
+ {
+ t0 ^= ((const word32 *)xorBlock)[0];
+ t1 ^= ((const word32 *)xorBlock)[1];
+ t2 ^= ((const word32 *)xorBlock)[2];
+ t3 ^= ((const word32 *)xorBlock)[3];
+ }
+ ((word32 *)outBlock)[0] = t0;
+ ((word32 *)outBlock)[1] = t1;
+ ((word32 *)outBlock)[2] = t2;
+ ((word32 *)outBlock)[3] = t3;
+#else
+ AS2( mov ebx, xorBlock)
+ AS2( test ebx, ebx)
+ ASJ( jz, 1, f)
+ AS2( xor eax, [ebx+0*4])
+ AS2( xor ecx, [ebx+1*4])
+ AS2( xor esi, [ebx+2*4])
+ AS2( xor edi, [ebx+3*4])
+ ASL(1)
+ AS2( mov ebx, outBlock)
+ AS2( mov [ebx+0*4], eax)
+ AS2( mov [ebx+1*4], ecx)
+ AS2( mov [ebx+2*4], esi)
+ AS2( mov [ebx+3*4], edi)
+#endif
+ }
+ else
+#endif // #ifdef CRYPTOPP_X86_ASM_AVAILABLE
+ {
word32 s0, s1, s2, s3, t0, t1, t2, t3;
const word32 *rk = m_key;
@@ -171,95 +398,68 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
rk += 8;
// timing attack countermeasure. see comments at top for more details
+ const int cacheLineSize = GetCacheLineSize();
unsigned int i;
word32 u = 0;
- for (i=0; i<sizeof(Te0)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE)
- u &= (Te0[i+0*s_lineSizeDiv4] & Te0[i+2*s_lineSizeDiv4]) & (Te0[i+1*s_lineSizeDiv4] & Te0[i+3*s_lineSizeDiv4]);
+ for (i=0; i<1024; i+=cacheLineSize)
+ u &= *(const word32 *)(((const byte *)Te)+i);
+ u &= Te[255];
s0 |= u; s1 |= u; s2 |= u; s3 |= u;
// first round
- t0 ^=
- Te0[GETBYTE(s0, s_i3)] ^
- rotrFixed(Te0[GETBYTE(s1, s_i2)], 8) ^
- rotrFixed(Te0[GETBYTE(s2, s_i1)], 16) ^
- rotrFixed(Te0[GETBYTE(s3, s_i0)], 24);
- t1 ^=
- Te0[GETBYTE(s1, s_i3)] ^
- rotrFixed(Te0[GETBYTE(s2, s_i2)], 8) ^
- rotrFixed(Te0[GETBYTE(s3, s_i1)], 16) ^
- rotrFixed(Te0[GETBYTE(s0, s_i0)], 24);
- t2 ^=
- Te0[GETBYTE(s2, s_i3)] ^
- rotrFixed(Te0[GETBYTE(s3, s_i2)], 8) ^
- rotrFixed(Te0[GETBYTE(s0, s_i1)], 16) ^
- rotrFixed(Te0[GETBYTE(s1, s_i0)], 24);
- t3 ^=
- Te0[GETBYTE(s3, s_i3)] ^
- rotrFixed(Te0[GETBYTE(s0, s_i2)], 8) ^
- rotrFixed(Te0[GETBYTE(s1, s_i1)], 16) ^
- rotrFixed(Te0[GETBYTE(s2, s_i0)], 24);
+#ifdef IS_BIG_ENDIAN
+#define QUARTER_ROUND(t, a, b, c, d) \
+ a ^= rotrFixed(Te[byte(t)], 24); t >>= 8;\
+ b ^= rotrFixed(Te[byte(t)], 16); t >>= 8;\
+ c ^= rotrFixed(Te[byte(t)], 8); t >>= 8;\
+ d ^= Te[t];
+#else
+#define QUARTER_ROUND(t, a, b, c, d) \
+ d ^= Te[byte(t)]; t >>= 8;\
+ c ^= rotrFixed(Te[byte(t)], 8); t >>= 8;\
+ b ^= rotrFixed(Te[byte(t)], 16); t >>= 8;\
+ a ^= rotrFixed(Te[t], 24);
+#endif
+
+ QUARTER_ROUND(s3, t0, t1, t2, t3)
+ QUARTER_ROUND(s2, t3, t0, t1, t2)
+ QUARTER_ROUND(s1, t2, t3, t0, t1)
+ QUARTER_ROUND(s0, t1, t2, t3, t0)
+#undef QUARTER_ROUND
// Nr - 2 full rounds:
unsigned int r = m_rounds/2 - 1;
do
{
- s0 =
- Te0[GETBYTE(t0, 3)] ^
- Te1[GETBYTE(t1, 2)] ^
- Te2[GETBYTE(t2, 1)] ^
- Te3[GETBYTE(t3, 0)] ^
- rk[0];
- s1 =
- Te0[GETBYTE(t1, 3)] ^
- Te1[GETBYTE(t2, 2)] ^
- Te2[GETBYTE(t3, 1)] ^
- Te3[GETBYTE(t0, 0)] ^
- rk[1];
- s2 =
- Te0[GETBYTE(t2, 3)] ^
- Te1[GETBYTE(t3, 2)] ^
- Te2[GETBYTE(t0, 1)] ^
- Te3[GETBYTE(t1, 0)] ^
- rk[2];
- s3 =
- Te0[GETBYTE(t3, 3)] ^
- Te1[GETBYTE(t0, 2)] ^
- Te2[GETBYTE(t1, 1)] ^
- Te3[GETBYTE(t2, 0)] ^
- rk[3];
-
- t0 =
- Te0[GETBYTE(s0, 3)] ^
- Te1[GETBYTE(s1, 2)] ^
- Te2[GETBYTE(s2, 1)] ^
- Te3[GETBYTE(s3, 0)] ^
- rk[4];
- t1 =
- Te0[GETBYTE(s1, 3)] ^
- Te1[GETBYTE(s2, 2)] ^
- Te2[GETBYTE(s3, 1)] ^
- Te3[GETBYTE(s0, 0)] ^
- rk[5];
- t2 =
- Te0[GETBYTE(s2, 3)] ^
- Te1[GETBYTE(s3, 2)] ^
- Te2[GETBYTE(s0, 1)] ^
- Te3[GETBYTE(s1, 0)] ^
- rk[6];
- t3 =
- Te0[GETBYTE(s3, 3)] ^
- Te1[GETBYTE(s0, 2)] ^
- Te2[GETBYTE(s1, 1)] ^
- Te3[GETBYTE(s2, 0)] ^
- rk[7];
+#define QUARTER_ROUND(t, a, b, c, d) \
+ a ^= Te[3*256+byte(t)]; t >>= 8;\
+ b ^= Te[2*256+byte(t)]; t >>= 8;\
+ c ^= Te[1*256+byte(t)]; t >>= 8;\
+ d ^= Te[t];
+
+ s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
+
+ QUARTER_ROUND(t3, s0, s1, s2, s3)
+ QUARTER_ROUND(t2, s3, s0, s1, s2)
+ QUARTER_ROUND(t1, s2, s3, s0, s1)
+ QUARTER_ROUND(t0, s1, s2, s3, s0)
+
+ t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
+
+ QUARTER_ROUND(s3, t0, t1, t2, t3)
+ QUARTER_ROUND(s2, t3, t0, t1, t2)
+ QUARTER_ROUND(s1, t2, t3, t0, t1)
+ QUARTER_ROUND(s0, t1, t2, t3, t0)
+#undef QUARTER_ROUND
rk += 8;
} while (--r);
// timing attack countermeasure. see comments at top for more details
u = 0;
- for (i=0; i<sizeof(Se)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE)
- u &= (((word32*)Se)[i+0*s_lineSizeDiv4] & ((word32*)Se)[i+2*s_lineSizeDiv4]) & (((word32*)Se)[i+1*s_lineSizeDiv4] & ((word32*)Se)[i+3*s_lineSizeDiv4]);
+ for (i=0; i<256; i+=cacheLineSize)
+ u &= *(const word32 *)(Se+i);
+ u &= *(const word32 *)(Se+252);
t0 |= u; t1 |= u; t2 |= u; t3 |= u;
word32 tbw[4];
@@ -267,23 +467,17 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
word32 *const obw = (word32 *)outBlock;
const word32 *const xbw = (const word32 *)xorBlock;
- // last round
- tempBlock[0] = Se[GETBYTE(t0, 3)];
- tempBlock[1] = Se[GETBYTE(t1, 2)];
- tempBlock[2] = Se[GETBYTE(t2, 1)];
- tempBlock[3] = Se[GETBYTE(t3, 0)];
- tempBlock[4] = Se[GETBYTE(t1, 3)];
- tempBlock[5] = Se[GETBYTE(t2, 2)];
- tempBlock[6] = Se[GETBYTE(t3, 1)];
- tempBlock[7] = Se[GETBYTE(t0, 0)];
- tempBlock[8] = Se[GETBYTE(t2, 3)];
- tempBlock[9] = Se[GETBYTE(t3, 2)];
- tempBlock[10] = Se[GETBYTE(t0, 1)];
- tempBlock[11] = Se[GETBYTE(t1, 0)];
- tempBlock[12] = Se[GETBYTE(t3, 3)];
- tempBlock[13] = Se[GETBYTE(t0, 2)];
- tempBlock[14] = Se[GETBYTE(t1, 1)];
- tempBlock[15] = Se[GETBYTE(t2, 0)];
+#define QUARTER_ROUND(t, a, b, c, d) \
+ tempBlock[a] = Se[byte(t)]; t >>= 8;\
+ tempBlock[b] = Se[byte(t)]; t >>= 8;\
+ tempBlock[c] = Se[byte(t)]; t >>= 8;\
+ tempBlock[d] = Se[t];
+
+ QUARTER_ROUND(t2, 15, 2, 5, 8)
+ QUARTER_ROUND(t1, 11, 14, 1, 4)
+ QUARTER_ROUND(t0, 7, 10, 13, 0)
+ QUARTER_ROUND(t3, 3, 6, 9, 12)
+#undef QUARTER_ROUND
if (xbw)
{
@@ -299,12 +493,13 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
obw[2] = tbw[2] ^ rk[2];
obw[3] = tbw[3] ^ rk[3];
}
+ }
}
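Both the MMX path and the portable path above implement the cache-timing defense the same way: before any key-dependent index is formed, one word per cache line of the lookup table is read, and the loads are folded into a value u that is provably zero, so OR-ing it into the state changes nothing while keeping the reads live. A standalone sketch of the pattern (names are illustrative, not the library's):

#include <cstddef>
#include <cstdint>

// Touch one 32-bit word per cache line so the whole table sits in L1
// before any secret-dependent lookup; u stays zero throughout, so the
// caller's state word is unchanged.
static void PreloadTable(const uint32_t *table, std::size_t bytes,
                         std::size_t cacheLineSize, uint32_t &state)
{
	uint32_t u = 0;
	for (std::size_t i = 0; i < bytes; i += cacheLineSize)
		u &= *(const uint32_t *)((const uint8_t *)table + i);
	u &= table[bytes/4 - 1];   // last word too, as with Te[255]/Se+252 above
	state |= u;
}

int main()
{
	static uint32_t table[1024];
	uint32_t s0 = 0x12345678;
	PreloadTable(table, sizeof(table), 64, s0);   // assume 64-byte lines
	return (int)(s0 != 0x12345678);               // s0 is unchanged
}

In the code above the same merge is applied to all four state words (s0 |= u; s1 |= u; s2 |= u; s3 |= u;).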
void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
word32 s0, s1, s2, s3, t0, t1, t2, t3;
- const word32 *rk = m_key;
+ const word32 *rk = m_key;
s0 = ((const word32 *)inBlock)[0] ^ rk[0];
s1 = ((const word32 *)inBlock)[1] ^ rk[1];
@@ -317,95 +512,68 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
rk += 8;
// timing attack countermeasure. see comments at top for more details
+ const int cacheLineSize = GetCacheLineSize();
unsigned int i;
word32 u = 0;
- for (i=0; i<sizeof(Td0)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE)
- u &= (Td0[i+0*s_lineSizeDiv4] & Td0[i+2*s_lineSizeDiv4]) & (Td0[i+1*s_lineSizeDiv4] & Td0[i+3*s_lineSizeDiv4]);
+ for (i=0; i<1024; i+=cacheLineSize)
+ u &= *(const word32 *)(((const byte *)Td)+i);
+ u &= Td[255];
s0 |= u; s1 |= u; s2 |= u; s3 |= u;
// first round
- t0 ^=
- Td0[GETBYTE(s0, s_i3)] ^
- rotrFixed(Td0[GETBYTE(s3, s_i2)], 8) ^
- rotrFixed(Td0[GETBYTE(s2, s_i1)], 16) ^
- rotrFixed(Td0[GETBYTE(s1, s_i0)], 24);
- t1 ^=
- Td0[GETBYTE(s1, s_i3)] ^
- rotrFixed(Td0[GETBYTE(s0, s_i2)], 8) ^
- rotrFixed(Td0[GETBYTE(s3, s_i1)], 16) ^
- rotrFixed(Td0[GETBYTE(s2, s_i0)], 24);
- t2 ^=
- Td0[GETBYTE(s2, s_i3)] ^
- rotrFixed(Td0[GETBYTE(s1, s_i2)], 8) ^
- rotrFixed(Td0[GETBYTE(s0, s_i1)], 16) ^
- rotrFixed(Td0[GETBYTE(s3, s_i0)], 24);
- t3 ^=
- Td0[GETBYTE(s3, s_i3)] ^
- rotrFixed(Td0[GETBYTE(s2, s_i2)], 8) ^
- rotrFixed(Td0[GETBYTE(s1, s_i1)], 16) ^
- rotrFixed(Td0[GETBYTE(s0, s_i0)], 24);
+#ifdef IS_BIG_ENDIAN
+#define QUARTER_ROUND(t, a, b, c, d) \
+ a ^= rotrFixed(Td[byte(t)], 24); t >>= 8;\
+ b ^= rotrFixed(Td[byte(t)], 16); t >>= 8;\
+ c ^= rotrFixed(Td[byte(t)], 8); t >>= 8;\
+ d ^= Td[t];
+#else
+#define QUARTER_ROUND(t, a, b, c, d) \
+ d ^= Td[byte(t)]; t >>= 8;\
+ c ^= rotrFixed(Td[byte(t)], 8); t >>= 8;\
+ b ^= rotrFixed(Td[byte(t)], 16); t >>= 8;\
+ a ^= rotrFixed(Td[t], 24);
+#endif
+
+ QUARTER_ROUND(s3, t2, t1, t0, t3)
+ QUARTER_ROUND(s2, t1, t0, t3, t2)
+ QUARTER_ROUND(s1, t0, t3, t2, t1)
+ QUARTER_ROUND(s0, t3, t2, t1, t0)
+#undef QUARTER_ROUND
// Nr - 2 full rounds:
unsigned int r = m_rounds/2 - 1;
do
{
- s0 =
- Td0[GETBYTE(t0, 3)] ^
- Td1[GETBYTE(t3, 2)] ^
- Td2[GETBYTE(t2, 1)] ^
- Td3[GETBYTE(t1, 0)] ^
- rk[0];
- s1 =
- Td0[GETBYTE(t1, 3)] ^
- Td1[GETBYTE(t0, 2)] ^
- Td2[GETBYTE(t3, 1)] ^
- Td3[GETBYTE(t2, 0)] ^
- rk[1];
- s2 =
- Td0[GETBYTE(t2, 3)] ^
- Td1[GETBYTE(t1, 2)] ^
- Td2[GETBYTE(t0, 1)] ^
- Td3[GETBYTE(t3, 0)] ^
- rk[2];
- s3 =
- Td0[GETBYTE(t3, 3)] ^
- Td1[GETBYTE(t2, 2)] ^
- Td2[GETBYTE(t1, 1)] ^
- Td3[GETBYTE(t0, 0)] ^
- rk[3];
-
- t0 =
- Td0[GETBYTE(s0, 3)] ^
- Td1[GETBYTE(s3, 2)] ^
- Td2[GETBYTE(s2, 1)] ^
- Td3[GETBYTE(s1, 0)] ^
- rk[4];
- t1 =
- Td0[GETBYTE(s1, 3)] ^
- Td1[GETBYTE(s0, 2)] ^
- Td2[GETBYTE(s3, 1)] ^
- Td3[GETBYTE(s2, 0)] ^
- rk[5];
- t2 =
- Td0[GETBYTE(s2, 3)] ^
- Td1[GETBYTE(s1, 2)] ^
- Td2[GETBYTE(s0, 1)] ^
- Td3[GETBYTE(s3, 0)] ^
- rk[6];
- t3 =
- Td0[GETBYTE(s3, 3)] ^
- Td1[GETBYTE(s2, 2)] ^
- Td2[GETBYTE(s1, 1)] ^
- Td3[GETBYTE(s0, 0)] ^
- rk[7];
+#define QUARTER_ROUND(t, a, b, c, d) \
+ a ^= Td[3*256+byte(t)]; t >>= 8;\
+ b ^= Td[2*256+byte(t)]; t >>= 8;\
+ c ^= Td[1*256+byte(t)]; t >>= 8;\
+ d ^= Td[t];
+
+ s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
+
+ QUARTER_ROUND(t3, s2, s1, s0, s3)
+ QUARTER_ROUND(t2, s1, s0, s3, s2)
+ QUARTER_ROUND(t1, s0, s3, s2, s1)
+ QUARTER_ROUND(t0, s3, s2, s1, s0)
+
+ t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
+
+ QUARTER_ROUND(s3, t2, t1, t0, t3)
+ QUARTER_ROUND(s2, t1, t0, t3, t2)
+ QUARTER_ROUND(s1, t0, t3, t2, t1)
+ QUARTER_ROUND(s0, t3, t2, t1, t0)
+#undef QUARTER_ROUND
rk += 8;
} while (--r);
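The four decryption tables Td0..Td3 are now one Td[4*256] array laid out as four consecutive 256-entry quarters, so Td1[x] becomes Td[1*256 + x] and a single base pointer serves all four lookups (mirroring the Te merge on the encryption side). Roughly:

    // Sketch: fused-table indexing, with the old per-table form alongside.
    //   Td[0*256 + x] == Td0[x]      Td[2*256 + x] == Td2[x]
    //   Td[1*256 + x] == Td1[x]      Td[3*256 + x] == Td3[x]
    uint32_t lookup(const uint32_t Td[4 * 256], int quarter, uint8_t x)
    {
        return Td[quarter * 256 + x];
    }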
// timing attack countermeasure. see comments at top for more details
u = 0;
- for (i=0; i<sizeof(Sd)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE)
- u &= (((word32*)Sd)[i+0*s_lineSizeDiv4] & ((word32*)Sd)[i+2*s_lineSizeDiv4]) & (((word32*)Sd)[i+1*s_lineSizeDiv4] & ((word32*)Sd)[i+3*s_lineSizeDiv4]);
+ for (i=0; i<256; i+=cacheLineSize)
+ u &= *(const word32 *)(Sd+i);
+ u &= *(const word32 *)(Sd+252);
t0 |= u; t1 |= u; t2 |= u; t3 |= u;
word32 tbw[4];
@@ -413,23 +581,17 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
word32 *const obw = (word32 *)outBlock;
const word32 *const xbw = (const word32 *)xorBlock;
- // last round
- tempBlock[0] = Sd[GETBYTE(t0, 3)];
- tempBlock[1] = Sd[GETBYTE(t3, 2)];
- tempBlock[2] = Sd[GETBYTE(t2, 1)];
- tempBlock[3] = Sd[GETBYTE(t1, 0)];
- tempBlock[4] = Sd[GETBYTE(t1, 3)];
- tempBlock[5] = Sd[GETBYTE(t0, 2)];
- tempBlock[6] = Sd[GETBYTE(t3, 1)];
- tempBlock[7] = Sd[GETBYTE(t2, 0)];
- tempBlock[8] = Sd[GETBYTE(t2, 3)];
- tempBlock[9] = Sd[GETBYTE(t1, 2)];
- tempBlock[10] = Sd[GETBYTE(t0, 1)];
- tempBlock[11] = Sd[GETBYTE(t3, 0)];
- tempBlock[12] = Sd[GETBYTE(t3, 3)];
- tempBlock[13] = Sd[GETBYTE(t2, 2)];
- tempBlock[14] = Sd[GETBYTE(t1, 1)];
- tempBlock[15] = Sd[GETBYTE(t0, 0)];
+#define QUARTER_ROUND(t, a, b, c, d) \
+ tempBlock[a] = Sd[byte(t)]; t >>= 8;\
+ tempBlock[b] = Sd[byte(t)]; t >>= 8;\
+ tempBlock[c] = Sd[byte(t)]; t >>= 8;\
+ tempBlock[d] = Sd[t];
+
+ QUARTER_ROUND(t2, 7, 2, 13, 8)
+ QUARTER_ROUND(t1, 3, 14, 9, 4)
+ QUARTER_ROUND(t0, 15, 10, 5, 0)
+ QUARTER_ROUND(t3, 11, 6, 1, 12)
+#undef QUARTER_ROUND
if (xbw)
{
diff --git a/rijndael.h b/rijndael.h
index a035da4c..a068d637 100644
--- a/rijndael.h
+++ b/rijndael.h
@@ -25,16 +25,10 @@ class CRYPTOPP_DLL Rijndael : public Rijndael_Info, public BlockCipherDocumentat
protected:
// VS2005 workaround: have to put these on separate lines, or error C2487 is triggered in DLL build
- CRYPTOPP_L1_CACHE_ALIGN(static const byte Se[256]);
- CRYPTOPP_L1_CACHE_ALIGN(static const byte Sd[256]);
- CRYPTOPP_L1_CACHE_ALIGN(static const word32 Te0[256]);
- static const word32 Te1[256];
- static const word32 Te2[256];
- static const word32 Te3[256];
- CRYPTOPP_L1_CACHE_ALIGN(static const word32 Td0[256]);
- static const word32 Td1[256];
- static const word32 Td2[256];
- static const word32 Td3[256];
+ static const byte Se[256];
+ static const byte Sd[256];
+ static const word32 Te[4*256];
+ static const word32 Td[4*256];
static const word32 rcon[];
@@ -52,6 +46,7 @@ class CRYPTOPP_DLL Rijndael : public Rijndael_Info, public BlockCipherDocumentat
{
public:
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+ void ProcessAndXorBlock_Old(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
};
public:
diff --git a/sha.cpp b/sha.cpp
index 5355995a..127d1f99 100644
--- a/sha.cpp
+++ b/sha.cpp
@@ -9,6 +9,7 @@
#include "sha.h"
#include "misc.h"
+#include "cpu.h"
NAMESPACE_BEGIN(CryptoPP)
@@ -74,27 +75,43 @@ void SHA1::Transform(word32 *state, const word32 *data)
state[2] += c;
state[3] += d;
state[4] += e;
- /* Wipe variables */
- a = b = c = d = e = 0;
- memset(W, 0, sizeof(W));
}
// end of Steve Reid's code
// *************************************************************
+void SHA224::InitState(HashWordType *state)
+{
+ static const word32 s[8] = {0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939, 0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4};
+ memcpy(state, s, sizeof(s));
+}
+
void SHA256::InitState(HashWordType *state)
{
- state[0] = 0x6a09e667;
- state[1] = 0xbb67ae85;
- state[2] = 0x3c6ef372;
- state[3] = 0xa54ff53a;
- state[4] = 0x510e527f;
- state[5] = 0x9b05688c;
- state[6] = 0x1f83d9ab;
- state[7] = 0x5be0cd19;
+ static const word32 s[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
+ memcpy(state, s, sizeof(s));
}
+static const word32 SHA256_K[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
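For reference, these round constants are the first 32 bits of the fractional parts of the cube roots of the first 64 primes, just as the InitState values above derive from the square roots of the first 8 primes. A quick sanity check of the first entry:

    // Sketch: reproduce SHA256_K[0] from its definition. Prints 428a2f98.
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    int main()
    {
        double frac = std::cbrt(2.0) - 1.0;                      // frac(2^(1/3))
        std::printf("%08x\n", (uint32_t)(frac * 4294967296.0));  // frac * 2^32
        return 0;
    }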
+
#define blk2(i) (W[i&15]+=s1(W[(i-2)&15])+W[(i-7)&15]+s0(W[(i-15)&15]))
#define Ch(x,y,z) (z^(x&(y^z)))
@@ -109,7 +126,7 @@ void SHA256::InitState(HashWordType *state)
#define g(i) T[(6-i)&7]
#define h(i) T[(7-i)&7]
-#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j]+(j?blk2(i):blk0(i));\
+#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA256_K[i+j]+(j?blk2(i):blk0(i));\
d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))
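The a(i)..h(i) macros index the single working array T[8] with a round-dependent rotating offset (g(i) is T[(6-i)&7], h(i) is T[(7-i)&7], and so on), so instead of shuffling eight variables after every round, incrementing i re-labels which slot plays which role; after 8 rounds the labeling returns to the identity, which is why the main loop can run fixed indices 0..15 per 16-round block. Schematically:

    // Sketch: rotating-register indexing. Role r of round i (r = 0 for 'a'
    // ... 7 for 'h') lives at T[(r - i) & 7]; the slot holding round i's 'h',
    // (7 - i) & 7, is exactly round i+1's 'a', (0 - (i + 1)) & 7.
    uint32_t T[8];
    #define ROLE(r, i) T[((r) - (i)) & 7]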
// for SHA256
@@ -141,98 +158,114 @@ void SHA256::Transform(word32 *state, const word32 *data)
state[5] += f(0);
state[6] += g(0);
state[7] += h(0);
- /* Wipe variables */
- memset(W, 0, sizeof(W));
- memset(T, 0, sizeof(T));
}
+/*
+// smaller but slower
+void SHA256_Transform(word32 *state, const word32 *data)
+{
+ word32 T[20];
+ word32 W[32];
+ unsigned int i = 0, j = 0;
+ word32 *t = T+8;
+
+ memcpy(t, state, 8*4);
+ word32 e = t[4], a = t[0];
+
+ do
+ {
+ word32 w = data[j];
+ W[j] = w;
+	w += SHA256_K[j];
+ w += t[7];
+ w += S1(e);
+ w += Ch(e, t[5], t[6]);
+ e = t[3] + w;
+ t[3] = t[3+8] = e;
+ w += S0(t[0]);
+ a = w + Maj(a, t[1], t[2]);
+ t[-1] = t[7] = a;
+ --t;
+ ++j;
+ if (j%8 == 0)
+ t += 8;
+ } while (j<16);
+
+ do
+ {
+ i = j&0xf;
+ word32 w = s1(W[i+16-2]) + s0(W[i+16-15]) + W[i] + W[i+16-7];
+ W[i+16] = W[i] = w;
+	w += SHA256_K[j];
+ w += t[7];
+ w += S1(e);
+ w += Ch(e, t[5], t[6]);
+ e = t[3] + w;
+ t[3] = t[3+8] = e;
+ w += S0(t[0]);
+ a = w + Maj(a, t[1], t[2]);
+ t[-1] = t[7] = a;
+
+ w = s1(W[(i+1)+16-2]) + s0(W[(i+1)+16-15]) + W[(i+1)] + W[(i+1)+16-7];
+ W[(i+1)+16] = W[(i+1)] = w;
+	w += SHA256_K[j+1];
+ w += (t-1)[7];
+ w += S1(e);
+ w += Ch(e, (t-1)[5], (t-1)[6]);
+ e = (t-1)[3] + w;
+ (t-1)[3] = (t-1)[3+8] = e;
+ w += S0((t-1)[0]);
+ a = w + Maj(a, (t-1)[1], (t-1)[2]);
+ (t-1)[-1] = (t-1)[7] = a;
+
+ t-=2;
+ j+=2;
+ if (j%8 == 0)
+ t += 8;
+ } while (j<64);
+
+ state[0] += a;
+ state[1] += t[1];
+ state[2] += t[2];
+ state[3] += t[3];
+ state[4] += e;
+ state[5] += t[5];
+ state[6] += t[6];
+ state[7] += t[7];
+}
+*/
+
#undef S0
#undef S1
#undef s0
#undef s1
-
-const word32 SHA256::K[64] = {
- 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
- 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
- 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
- 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
- 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
- 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
- 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
- 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
- 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
- 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
- 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
- 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
- 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
- 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
- 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
- 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-};
-
-void SHA224::InitState(HashWordType *state)
-{
- state[0] = 0xc1059ed8;
- state[1] = 0x367cd507;
- state[2] = 0x3070dd17;
- state[3] = 0xf70e5939;
- state[4] = 0xffc00b31;
- state[5] = 0x68581511;
- state[6] = 0x64f98fa7;
- state[7] = 0xbefa4fa4;
-}
+#undef R
// *************************************************************
#ifdef WORD64_AVAILABLE
-void SHA512::InitState(HashWordType *state)
+void SHA384::InitState(HashWordType *state)
{
- state[0] = W64LIT(0x6a09e667f3bcc908);
- state[1] = W64LIT(0xbb67ae8584caa73b);
- state[2] = W64LIT(0x3c6ef372fe94f82b);
- state[3] = W64LIT(0xa54ff53a5f1d36f1);
- state[4] = W64LIT(0x510e527fade682d1);
- state[5] = W64LIT(0x9b05688c2b3e6c1f);
- state[6] = W64LIT(0x1f83d9abfb41bd6b);
- state[7] = W64LIT(0x5be0cd19137e2179);
+ static const word64 s[8] = {
+ W64LIT(0xcbbb9d5dc1059ed8), W64LIT(0x629a292a367cd507),
+ W64LIT(0x9159015a3070dd17), W64LIT(0x152fecd8f70e5939),
+ W64LIT(0x67332667ffc00b31), W64LIT(0x8eb44a8768581511),
+ W64LIT(0xdb0c2e0d64f98fa7), W64LIT(0x47b5481dbefa4fa4)};
+ memcpy(state, s, sizeof(s));
}
-// for SHA512
-#define S0(x) (rotrFixed(x,28)^rotrFixed(x,34)^rotrFixed(x,39))
-#define S1(x) (rotrFixed(x,14)^rotrFixed(x,18)^rotrFixed(x,41))
-#define s0(x) (rotrFixed(x,1)^rotrFixed(x,8)^(x>>7))
-#define s1(x) (rotrFixed(x,19)^rotrFixed(x,61)^(x>>6))
-
-void SHA512::Transform(word64 *state, const word64 *data)
+void SHA512::InitState(HashWordType *state)
{
- word64 W[16];
- word64 T[8];
- /* Copy context->state[] to working vars */
- memcpy(T, state, sizeof(T));
- /* 80 operations, partially loop unrolled */
- for (unsigned int j=0; j<80; j+=16)
- {
- R( 0); R( 1); R( 2); R( 3);
- R( 4); R( 5); R( 6); R( 7);
- R( 8); R( 9); R(10); R(11);
- R(12); R(13); R(14); R(15);
- }
- /* Add the working vars back into context.state[] */
- state[0] += a(0);
- state[1] += b(0);
- state[2] += c(0);
- state[3] += d(0);
- state[4] += e(0);
- state[5] += f(0);
- state[6] += g(0);
- state[7] += h(0);
- /* Wipe variables */
- memset(W, 0, sizeof(W));
- memset(T, 0, sizeof(T));
+ static const word64 s[8] = {
+ W64LIT(0x6a09e667f3bcc908), W64LIT(0xbb67ae8584caa73b),
+ W64LIT(0x3c6ef372fe94f82b), W64LIT(0xa54ff53a5f1d36f1),
+ W64LIT(0x510e527fade682d1), W64LIT(0x9b05688c2b3e6c1f),
+ W64LIT(0x1f83d9abfb41bd6b), W64LIT(0x5be0cd19137e2179)};
+ memcpy(state, s, sizeof(s));
}
-const word64 SHA512::K[80] = {
+CRYPTOPP_ALIGN_DATA(16) static const word64 SHA512_K[80] CRYPTOPP_SECTION_ALIGN16 = {
W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
@@ -275,16 +308,231 @@ const word64 SHA512::K[80] = {
W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817)
};
-void SHA384::InitState(HashWordType *state)
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+// put assembly version in separate function, otherwise MSVC 2005 SP1 doesn't generate correct code for the non-assembly version
+static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64 *data)
+{
+#ifdef __GNUC__
+ __asm__ __volatile__
+ (
+ ".intel_syntax noprefix;"
+ AS1( push ebx)
+ AS2( mov ebx, eax)
+#else
+ AS2( lea ebx, SHA512_K)
+#endif
+
+ AS2( mov eax, esp)
+ AS2( and esp, 0xfffffff0)
+ AS2( sub esp, 27*16) // 17*16 for expanded data, 20*8 for state
+ AS1( push eax)
+ AS2( xor eax, eax)
+ AS2( lea edi, [esp+4+8*8]) // start at middle of state buffer. will decrement pointer each round to avoid copying
+ AS2( lea esi, [esp+4+20*8+8]) // 16-byte alignment, then add 8
+
+ AS2( movq mm4, [ecx+0*8])
+ AS2( movq [edi+0*8], mm4)
+ AS2( movq mm0, [ecx+1*8])
+ AS2( movq [edi+1*8], mm0)
+ AS2( movq mm0, [ecx+2*8])
+ AS2( movq [edi+2*8], mm0)
+ AS2( movq mm0, [ecx+3*8])
+ AS2( movq [edi+3*8], mm0)
+ AS2( movq mm5, [ecx+4*8])
+ AS2( movq [edi+4*8], mm5)
+ AS2( movq mm0, [ecx+5*8])
+ AS2( movq [edi+5*8], mm0)
+ AS2( movq mm0, [ecx+6*8])
+ AS2( movq [edi+6*8], mm0)
+ AS2( movq mm0, [ecx+7*8])
+ AS2( movq [edi+7*8], mm0)
+ ASJ( jmp, 0, f)
+
+#define SSE2_S0_S1(r, a, b, c) \
+ AS2( movq mm6, r)\
+ AS2( psrlq r, a)\
+ AS2( movq mm7, r)\
+ AS2( psllq mm6, 64-c)\
+ AS2( pxor mm7, mm6)\
+ AS2( psrlq r, b-a)\
+ AS2( pxor mm7, r)\
+ AS2( psllq mm6, c-b)\
+ AS2( pxor mm7, mm6)\
+ AS2( psrlq r, c-b)\
+ AS2( pxor r, mm7)\
+ AS2( psllq mm6, b-a)\
+ AS2( pxor r, mm6)
+
+#define SSE2_s0(r, a, b, c) \
+ AS2( movdqa xmm6, r)\
+ AS2( psrlq r, a)\
+ AS2( movdqa xmm7, r)\
+ AS2( psllq xmm6, 64-c)\
+ AS2( pxor xmm7, xmm6)\
+ AS2( psrlq r, b-a)\
+ AS2( pxor xmm7, r)\
+ AS2( psrlq r, c-b)\
+ AS2( pxor r, xmm7)\
+ AS2( psllq xmm6, c-a)\
+ AS2( pxor r, xmm6)
+
+#define SSE2_s1(r, a, b, c) \
+ AS2( movdqa xmm6, r)\
+ AS2( psrlq r, a)\
+ AS2( movdqa xmm7, r)\
+ AS2( psllq xmm6, 64-c)\
+ AS2( pxor xmm7, xmm6)\
+ AS2( psrlq r, b-a)\
+ AS2( pxor xmm7, r)\
+ AS2( psllq xmm6, c-b)\
+ AS2( pxor xmm7, xmm6)\
+ AS2( psrlq r, c-b)\
+ AS2( pxor r, xmm7)
+
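MMX/SSE2 provide 64-bit shifts but no 64-bit rotate, so these macros assemble each rotation from the identity rotr(x,n) == (x>>n) | (x<<(64-n)), chaining psrlq/psllq by the differences b-a and c-b so the three rotations of each sigma function share partial results. The scalar equivalent of what SSE2_S0_S1 computes:

    // Sketch: 64-bit rotate-right from two shifts (assumes 0 < n < 64),
    // e.g. SHA-512 S1(x) = rotr64(x,14) ^ rotr64(x,18) ^ rotr64(x,41).
    static inline uint64_t rotr64(uint64_t x, unsigned n)
    {
        return (x >> n) | (x << (64 - n));
    }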
+ ASL(SHA512_Round)
+ // k + w is in mm0, a is in mm4, e is in mm5
+ AS2( paddq mm0, [edi+7*8]) // h
+ AS2( movq mm2, [edi+5*8]) // f
+ AS2( movq mm3, [edi+6*8]) // g
+ AS2( pxor mm2, mm3)
+ AS2( pand mm2, mm5)
+ SSE2_S0_S1(mm5,14,18,41)
+ AS2( pxor mm2, mm3)
+ AS2( paddq mm0, mm2) // h += Ch(e,f,g)
+ AS2( paddq mm5, mm0) // h += S1(e)
+ AS2( movq mm2, [edi+1*8]) // b
+ AS2( movq mm1, mm2)
+ AS2( por mm2, mm4)
+ AS2( pand mm2, [edi+2*8]) // c
+ AS2( pand mm1, mm4)
+ AS2( por mm1, mm2)
+ AS2( paddq mm1, mm5) // temp = h + Maj(a,b,c)
+ AS2( paddq mm5, [edi+3*8]) // e = d + h
+ AS2( movq [edi+3*8], mm5)
+ AS2( movq [edi+11*8], mm5)
+ SSE2_S0_S1(mm4,28,34,39) // S0(a)
+ AS2( paddq mm4, mm1) // a = temp + S0(a)
+ AS2( movq [edi-8], mm4)
+ AS2( movq [edi+7*8], mm4)
+ AS1( ret)
+
+ // first 16 rounds
+ ASL(0)
+ AS2( movq mm0, [edx+eax*8])
+ AS2( movq [esi+eax*8], mm0)
+ AS2( movq [esi+eax*8+16*8], mm0)
+ AS2( paddq mm0, [ebx+eax*8])
+ ASC( call, SHA512_Round)
+ AS1( inc eax)
+ AS2( sub edi, 8)
+ AS2( test eax, 7)
+ ASJ( jnz, 0, b)
+ AS2( add edi, 8*8)
+ AS2( cmp eax, 16)
+ ASJ( jne, 0, b)
+
+ // rest of the rounds
+ AS2( movdqu xmm0, [esi+(16-2)*8])
+ ASL(1)
+ // data expansion, W[i-2] already in xmm0
+ AS2( movdqu xmm3, [esi])
+ AS2( paddq xmm3, [esi+(16-7)*8])
+ AS2( movdqa xmm2, [esi+(16-15)*8])
+ SSE2_s1(xmm0, 6, 19, 61)
+ AS2( paddq xmm0, xmm3)
+ SSE2_s0(xmm2, 1, 7, 8)
+ AS2( paddq xmm0, xmm2)
+ AS2( movdq2q mm0, xmm0)
+ AS2( movhlps xmm1, xmm0)
+ AS2( paddq mm0, [ebx+eax*8])
+ AS2( movlps [esi], xmm0)
+ AS2( movlps [esi+8], xmm1)
+ AS2( movlps [esi+8*16], xmm0)
+ AS2( movlps [esi+8*17], xmm1)
+ // 2 rounds
+ ASC( call, SHA512_Round)
+ AS2( sub edi, 8)
+ AS2( movdq2q mm0, xmm1)
+ AS2( paddq mm0, [ebx+eax*8+8])
+ ASC( call, SHA512_Round)
+ // update indices and loop
+ AS2( add esi, 16)
+ AS2( add eax, 2)
+ AS2( sub edi, 8)
+ AS2( test eax, 7)
+ ASJ( jnz, 1, b)
+ // do housekeeping every 8 rounds
+ AS2( mov esi, 0xf)
+ AS2( and esi, eax)
+ AS2( lea esi, [esp+4+20*8+8+esi*8])
+ AS2( add edi, 8*8)
+ AS2( cmp eax, 80)
+ ASJ( jne, 1, b)
+
+#define SSE2_CombineState(i) \
+ AS2( movq mm0, [edi+i*8])\
+ AS2( paddq mm0, [ecx+i*8])\
+ AS2( movq [ecx+i*8], mm0)
+
+ SSE2_CombineState(0)
+ SSE2_CombineState(1)
+ SSE2_CombineState(2)
+ SSE2_CombineState(3)
+ SSE2_CombineState(4)
+ SSE2_CombineState(5)
+ SSE2_CombineState(6)
+ SSE2_CombineState(7)
+
+ AS1( pop esp)
+ AS1( emms)
+
+#ifdef __GNUC__
+ AS1( pop ebx)
+ ".att_syntax prefix;"
+ :
+ : "a" (SHA512_K), "c" (state), "d" (data)
+ : "%esi", "%edi", "memory", "cc"
+ );
+#endif
+}
+#endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+
+void SHA512::Transform(word64 *state, const word64 *data)
{
- state[0] = W64LIT(0xcbbb9d5dc1059ed8);
- state[1] = W64LIT(0x629a292a367cd507);
- state[2] = W64LIT(0x9159015a3070dd17);
- state[3] = W64LIT(0x152fecd8f70e5939);
- state[4] = W64LIT(0x67332667ffc00b31);
- state[5] = W64LIT(0x8eb44a8768581511);
- state[6] = W64LIT(0xdb0c2e0d64f98fa7);
- state[7] = W64LIT(0x47b5481dbefa4fa4);
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+ if (HasSSE2())
+ return SHA512_SSE2_Transform(state, data);
+#endif
+
+#define S0(x) (rotrFixed(x,28)^rotrFixed(x,34)^rotrFixed(x,39))
+#define S1(x) (rotrFixed(x,14)^rotrFixed(x,18)^rotrFixed(x,41))
+#define s0(x) (rotrFixed(x,1)^rotrFixed(x,8)^(x>>7))
+#define s1(x) (rotrFixed(x,19)^rotrFixed(x,61)^(x>>6))
+
+#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA512_K[i+j]+(j?blk2(i):blk0(i));\
+ d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))
+
+ word64 W[16];
+ word64 T[8];
+ /* Copy context->state[] to working vars */
+ memcpy(T, state, sizeof(T));
+ /* 80 operations, partially loop unrolled */
+ for (unsigned int j=0; j<80; j+=16)
+ {
+ R( 0); R( 1); R( 2); R( 3);
+ R( 4); R( 5); R( 6); R( 7);
+ R( 8); R( 9); R(10); R(11);
+ R(12); R(13); R(14); R(15);
+ }
+ /* Add the working vars back into context.state[] */
+ state[0] += a(0);
+ state[1] += b(0);
+ state[2] += c(0);
+ state[3] += d(0);
+ state[4] += e(0);
+ state[5] += f(0);
+ state[6] += g(0);
+ state[7] += h(0);
}
#endif
diff --git a/sha.h b/sha.h
index 69b02ff7..40eb6df6 100644
--- a/sha.h
+++ b/sha.h
@@ -23,9 +23,6 @@ public:
static void CRYPTOPP_API InitState(HashWordType *state);
static void CRYPTOPP_API Transform(word32 *digest, const word32 *data);
static const char * CRYPTOPP_API StaticAlgorithmName() {return "SHA-256";}
-
-protected:
- static const word32 K[64];
};
//! implements the SHA-224 standard
@@ -46,9 +43,6 @@ public:
static void CRYPTOPP_API InitState(HashWordType *state);
static void CRYPTOPP_API Transform(word64 *digest, const word64 *data);
static const char * CRYPTOPP_API StaticAlgorithmName() {return "SHA-512";}
-
-protected:
- static const word64 K[80];
};
//! implements the SHA-384 standard
diff --git a/tiger.cpp b/tiger.cpp
index b69e975a..332de2c6 100644
--- a/tiger.cpp
+++ b/tiger.cpp
@@ -3,6 +3,7 @@
#include "pch.h"
#include "tiger.h"
#include "misc.h"
+#include "cpu.h"
#ifdef WORD64_AVAILABLE
@@ -24,13 +25,187 @@ void Tiger::TruncatedFinal(byte *hash, size_t size)
m_data[7] = GetBitCountLo();
- Transform(m_digest, m_data);
- CorrectEndianess(m_digest, m_digest, DigestSize());
- memcpy(hash, m_digest, size);
+ Transform(m_state, m_data);
+ CorrectEndianess(m_state, m_state, DigestSize());
+ memcpy(hash, m_state, size);
Restart(); // reinit for next use
}
+void Tiger::Transform (word64 *digest, const word64 *X)
+{
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+ if (HasSSE2())
+ {
+#ifdef __GNUC__
+ __asm__ __volatile__
+ (
+ ".intel_syntax noprefix;"
+ AS1( push ebx)
+#else
+ AS2( mov eax, digest)
+ AS2( mov esi, X)
+ AS2( lea edx, [table])
+#endif
+ AS2( movq mm0, [eax])
+ AS2( movq mm1, [eax+1*8])
+ AS2( movq mm5, mm1)
+ AS2( movq mm2, [eax+2*8])
+ AS2( movq mm7, [edx+4*2048+0*8])
+ AS2( movq mm6, [edx+4*2048+1*8])
+ AS2( mov ecx, esp)
+ AS2( and esp, 0xfffffff0)
+ AS2( sub esp, 8*8)
+ AS1( push ecx)
+
+#define SSE2_round(a,b,c,x,mul) \
+ AS2( pxor c, [x])\
+ AS2( movd ecx, c)\
+ AS2( movzx edi, cl)\
+ AS2( movq mm3, [edx+0*2048+edi*8])\
+ AS2( movzx edi, ch)\
+ AS2( movq mm4, [edx+3*2048+edi*8])\
+ AS2( shr ecx, 16)\
+ AS2( movzx edi, cl)\
+ AS2( pxor mm3, [edx+1*2048+edi*8])\
+ AS2( movzx edi, ch)\
+ AS2( pxor mm4, [edx+2*2048+edi*8])\
+ AS3( pextrw ecx, c, 2)\
+ AS2( movzx edi, cl)\
+ AS2( pxor mm3, [edx+2*2048+edi*8])\
+ AS2( movzx edi, ch)\
+ AS2( pxor mm4, [edx+1*2048+edi*8])\
+ AS3( pextrw ecx, c, 3)\
+ AS2( movzx edi, cl)\
+ AS2( pxor mm3, [edx+3*2048+edi*8])\
+ AS2( psubq a, mm3)\
+ AS2( movzx edi, ch)\
+ AS2( pxor mm4, [edx+0*2048+edi*8])\
+ AS2( paddq b, mm4)\
+ SSE2_mul_##mul(b)
+
+#define SSE2_mul_5(b) \
+ AS2( movq mm3, b)\
+ AS2( psllq b, 2)\
+ AS2( paddq b, mm3)
+
+#define SSE2_mul_7(b) \
+ AS2( movq mm3, b)\
+ AS2( psllq b, 3)\
+ AS2( psubq b, mm3)
+
+#define SSE2_mul_9(b) \
+ AS2( movq mm3, b)\
+ AS2( psllq b, 3)\
+ AS2( paddq b, mm3)
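MMX has no 64-bit multiply, so the Tiger pass multipliers are strength-reduced to a shift plus an add or subtract. In scalar form:

    // Sketch: the shift-and-add multiplies behind SSE2_mul_5/7/9.
    static inline uint64_t mul5(uint64_t b) { return (b << 2) + b; }  // 4b + b
    static inline uint64_t mul7(uint64_t b) { return (b << 3) - b; }  // 8b - b
    static inline uint64_t mul9(uint64_t b) { return (b << 3) + b; }  // 8b + b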
+
+#define label2_5 1
+#define label2_7 2
+#define label2_9 3
+
+#define SSE2_pass(A,B,C,mul,X) \
+ AS2( xor ebx, ebx)\
+ ASL(mul)\
+ SSE2_round(A,B,C,X+0*8+ebx,mul)\
+ SSE2_round(B,C,A,X+1*8+ebx,mul)\
+ AS2( cmp ebx, 6*8)\
+ ASJ( je, label2_##mul, f)\
+ SSE2_round(C,A,B,X+2*8+ebx,mul)\
+ AS2( add ebx, 3*8)\
+ ASJ( jmp, mul, b)\
+ ASL(label2_##mul)
+
+#define SSE2_key_schedule(Y,X) \
+ AS2( movq mm3, [X+7*8])\
+ AS2( pxor mm3, mm6)\
+ AS2( movq mm4, [X+0*8])\
+ AS2( psubq mm4, mm3)\
+ AS2( movq [Y+0*8], mm4)\
+ AS2( pxor mm4, [X+1*8])\
+ AS2( movq mm3, mm4)\
+ AS2( movq [Y+1*8], mm4)\
+ AS2( paddq mm4, [X+2*8])\
+ AS2( pxor mm3, mm7)\
+ AS2( psllq mm3, 19)\
+ AS2( movq [Y+2*8], mm4)\
+ AS2( pxor mm3, mm4)\
+ AS2( movq mm4, [X+3*8])\
+ AS2( psubq mm4, mm3)\
+ AS2( movq [Y+3*8], mm4)\
+ AS2( pxor mm4, [X+4*8])\
+ AS2( movq mm3, mm4)\
+ AS2( movq [Y+4*8], mm4)\
+ AS2( paddq mm4, [X+5*8])\
+ AS2( pxor mm3, mm7)\
+ AS2( psrlq mm3, 23)\
+ AS2( movq [Y+5*8], mm4)\
+ AS2( pxor mm3, mm4)\
+ AS2( movq mm4, [X+6*8])\
+ AS2( psubq mm4, mm3)\
+ AS2( movq [Y+6*8], mm4)\
+ AS2( pxor mm4, [X+7*8])\
+ AS2( movq mm3, mm4)\
+ AS2( movq [Y+7*8], mm4)\
+ AS2( paddq mm4, [Y+0*8])\
+ AS2( pxor mm3, mm7)\
+ AS2( psllq mm3, 19)\
+ AS2( movq [Y+0*8], mm4)\
+ AS2( pxor mm3, mm4)\
+ AS2( movq mm4, [Y+1*8])\
+ AS2( psubq mm4, mm3)\
+ AS2( movq [Y+1*8], mm4)\
+ AS2( pxor mm4, [Y+2*8])\
+ AS2( movq mm3, mm4)\
+ AS2( movq [Y+2*8], mm4)\
+ AS2( paddq mm4, [Y+3*8])\
+ AS2( pxor mm3, mm7)\
+ AS2( psrlq mm3, 23)\
+ AS2( movq [Y+3*8], mm4)\
+ AS2( pxor mm3, mm4)\
+ AS2( movq mm4, [Y+4*8])\
+ AS2( psubq mm4, mm3)\
+ AS2( movq [Y+4*8], mm4)\
+ AS2( pxor mm4, [Y+5*8])\
+ AS2( movq [Y+5*8], mm4)\
+ AS2( paddq mm4, [Y+6*8])\
+ AS2( movq [Y+6*8], mm4)\
+ AS2( pxor mm4, [edx+4*2048+2*8])\
+ AS2( movq mm3, [Y+7*8])\
+ AS2( psubq mm3, mm4)\
+ AS2( movq [Y+7*8], mm3)
+
+ SSE2_pass(mm0, mm1, mm2, 5, esi)
+ SSE2_key_schedule(esp+4, esi)
+ SSE2_pass(mm2, mm0, mm1, 7, esp+4)
+ SSE2_key_schedule(esp+4, esp+4)
+ SSE2_pass(mm1, mm2, mm0, 9, esp+4)
+
+ AS2( pxor mm0, [eax+0*8])
+ AS2( movq [eax+0*8], mm0)
+ AS2( psubq mm1, mm5)
+ AS2( movq [eax+1*8], mm1)
+ AS2( paddq mm2, [eax+2*8])
+ AS2( movq [eax+2*8], mm2)
+
+ AS1( pop esp)
+ AS1( emms)
+#ifdef __GNUC__
+ AS1( pop ebx)
+ ".att_syntax prefix;"
+ :
+ : "a" (digest), "S" (X), "d" (table)
+ : "%ecx", "%edi", "memory", "cc"
+ );
+#endif
+ }
+ else
+#endif
+ {
+ word64 a = digest[0];
+ word64 b = digest[1];
+ word64 c = digest[2];
+ word64 Y[8];
+
#define t1 (table)
#define t2 (table+256)
#define t3 (table+256*2)
@@ -42,15 +217,17 @@ void Tiger::TruncatedFinal(byte *hash, size_t size)
b += t4[GETBYTE(c,1)] ^ t3[GETBYTE(c,3)] ^ t2[GETBYTE(c,5)] ^ t1[GETBYTE(c,7)]; \
b *= mul
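For orientation, the complete Tiger round that this macro implements, written schematically (c_k denotes byte k of c; t1..t4 are the four 256-entry S-boxes packed into table[]):

    // Tiger round function:
    //   c ^= x;
    //   a -= t1[c_0] ^ t2[c_2] ^ t3[c_4] ^ t4[c_6];
    //   b += t4[c_1] ^ t3[c_3] ^ t2[c_5] ^ t1[c_7];
    //   b *= mul;    // mul is 5, 7 or 9, one constant per pass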
-#define pass(a,b,c,mul,X) \
- round(a,b,c,X[0],mul); \
- round(b,c,a,X[1],mul); \
- round(c,a,b,X[2],mul); \
- round(a,b,c,X[3],mul); \
- round(b,c,a,X[4],mul); \
- round(c,a,b,X[5],mul); \
- round(a,b,c,X[6],mul); \
- round(b,c,a,X[7],mul)
+#define pass(a,b,c,mul,X) {\
+ int i=0;\
+ while (true)\
+ {\
+ round(a,b,c,X[i+0],mul); \
+ round(b,c,a,X[i+1],mul); \
+ if (i==6)\
+ break;\
+ round(c,a,b,X[i+2],mul); \
+ i+=3;\
+ }}
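The loop form runs the same eight rounds as the unrolled original: the role rotation (a,b,c) -> (b,c,a) -> (c,a,b) has period 3, so iterations i = 0, 3, 6 cover rounds 0..7, and breaking after the second round of the i == 6 iteration stops the pattern at exactly eight rounds:

    // Iteration trace of pass(a,b,c,mul,X):
    //   i == 0: round(a,b,c,X[0]); round(b,c,a,X[1]); round(c,a,b,X[2]);
    //   i == 3: round(a,b,c,X[3]); round(b,c,a,X[4]); round(c,a,b,X[5]);
    //   i == 6: round(a,b,c,X[6]); round(b,c,a,X[7]); break;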
#define key_schedule(Y,X) \
Y[0] = X[0] - (X[7]^W64LIT(0xA5A5A5A5A5A5A5A5)); \
@@ -70,24 +247,16 @@ void Tiger::TruncatedFinal(byte *hash, size_t size)
Y[6] += Y[5]; \
Y[7] -= Y[6] ^ W64LIT(0x0123456789ABCDEF)
-void Tiger::Transform (word64 *digest, const word64 *X)
-{
- word64 a = digest[0];
- word64 b = digest[1];
- word64 c = digest[2];
- word64 Y[8];
-
- pass(a,b,c,5,X);
- key_schedule(Y,X);
- pass(c,a,b,7,Y);
- key_schedule(Y,Y);
- pass(b,c,a,9,Y);
-
- digest[0] = a ^ digest[0];
- digest[1] = b - digest[1];
- digest[2] = c + digest[2];
-
- memset(Y, 0, sizeof(Y));
+ pass(a,b,c,5,X);
+ key_schedule(Y,X);
+ pass(c,a,b,7,Y);
+ key_schedule(Y,Y);
+ pass(b,c,a,9,Y);
+
+ digest[0] = a ^ digest[0];
+ digest[1] = b - digest[1];
+ digest[2] = c + digest[2];
+ }
}
NAMESPACE_END
diff --git a/tiger.h b/tiger.h
index 66d1da2a..42bf1614 100644
--- a/tiger.h
+++ b/tiger.h
@@ -9,7 +9,7 @@
NAMESPACE_BEGIN(CryptoPP)
-/// <a href="http://www.weidai.com/scan-mirror/md.html#Tiger">Tiger</a>
+/// <a href="http://www.cryptolounge.org/wiki/Tiger">Tiger</a>
class Tiger : public IteratedHashWithStaticTransform<word64, LittleEndian, 64, 24, Tiger>
{
public:
@@ -19,7 +19,7 @@ public:
static const char * StaticAlgorithmName() {return "Tiger";}
protected:
- static const word64 table[4*256];
+ static const word64 table[4*256+3];
};
NAMESPACE_END
diff --git a/whrlpool.cpp b/whrlpool.cpp
index 989281a3..da19d7ff 100644
--- a/whrlpool.cpp
+++ b/whrlpool.cpp
@@ -1,7 +1,7 @@
-// Whrlpool.cpp - modified by Kevin Springle from
+// whrlpool.cpp - originally modified by Kevin Springle from
// Paulo Barreto and Vincent Rijmen's public domain code, whirlpool.c.
+// Updated to Whirlpool version 3.0, optimized and MMX version added by Wei Dai
// Any modifications are placed in the public domain
-// Updated to Whirlpool version 3.0 by Wei Dai
// This is the original introductory comment:
@@ -69,6 +69,7 @@
#include "whrlpool.h"
#include "misc.h"
+#include "cpu.h"
NAMESPACE_BEGIN(CryptoPP)
@@ -94,9 +95,9 @@ void Whirlpool::TruncatedFinal(byte *hash, size_t size)
m_data[m_data.size()-2] = GetBitCountHi();
m_data[m_data.size()-1] = GetBitCountLo();
- Transform(m_digest, m_data);
- CorrectEndianess(m_digest, m_digest, DigestSize());
- memcpy(hash, m_digest, size);
+ Transform(m_state, m_data);
+ CorrectEndianess(m_state, m_state, DigestSize());
+ memcpy(hash, m_state, size);
Restart(); // reinit for next use
}
@@ -113,7 +114,7 @@ void Whirlpool::TruncatedFinal(byte *hash, size_t size)
* employed).
*/
-static const word64 C0[256] = {
+CRYPTOPP_ALIGN_DATA(16) static const word64 Whirlpool_C[4*256+R] CRYPTOPP_SECTION_ALIGN16 = {
W64LIT(0x18186018c07830d8), W64LIT(0x23238c2305af4626), W64LIT(0xc6c63fc67ef991b8), W64LIT(0xe8e887e8136fcdfb),
W64LIT(0x878726874ca113cb), W64LIT(0xb8b8dab8a9626d11), W64LIT(0x0101040108050209), W64LIT(0x4f4f214f426e9e0d),
W64LIT(0x3636d836adee6c9b), W64LIT(0xa6a6a2a6590451ff), W64LIT(0xd2d26fd2debdb90c), W64LIT(0xf5f5f3f5fb06f70e),
@@ -177,11 +178,9 @@ static const word64 C0[256] = {
W64LIT(0x16165816b04e2ca6), W64LIT(0x3a3ae83acdd274f7), W64LIT(0x6969b9696fd0d206), W64LIT(0x09092409482d1241),
W64LIT(0x7070dd70a7ade0d7), W64LIT(0xb6b6e2b6d954716f), W64LIT(0xd0d067d0ceb7bd1e), W64LIT(0xeded93ed3b7ec7d6),
W64LIT(0xcccc17cc2edb85e2), W64LIT(0x424215422a578468), W64LIT(0x98985a98b4c22d2c), W64LIT(0xa4a4aaa4490e55ed),
- W64LIT(0x2828a0285d885075), W64LIT(0x5c5c6d5cda31b886), W64LIT(0xf8f8c7f8933fed6b), W64LIT(0x8686228644a411c2),
-};
+ W64LIT(0x2828a0285d885075), W64LIT(0x5c5c6d5cda31b886), W64LIT(0xf8f8c7f8933fed6b), W64LIT(0x8686228644a411c2),
-static const word64 C1[256] = {
- W64LIT(0xd818186018c07830), W64LIT(0x2623238c2305af46), W64LIT(0xb8c6c63fc67ef991), W64LIT(0xfbe8e887e8136fcd),
+ W64LIT(0xd818186018c07830), W64LIT(0x2623238c2305af46), W64LIT(0xb8c6c63fc67ef991), W64LIT(0xfbe8e887e8136fcd),
W64LIT(0xcb878726874ca113), W64LIT(0x11b8b8dab8a9626d), W64LIT(0x0901010401080502), W64LIT(0x0d4f4f214f426e9e),
W64LIT(0x9b3636d836adee6c), W64LIT(0xffa6a6a2a6590451), W64LIT(0x0cd2d26fd2debdb9), W64LIT(0x0ef5f5f3f5fb06f7),
W64LIT(0x967979f979ef80f2), W64LIT(0x306f6fa16f5fcede), W64LIT(0x6d91917e91fcef3f), W64LIT(0xf852525552aa07a4),
@@ -245,10 +244,8 @@ static const word64 C1[256] = {
W64LIT(0xd77070dd70a7ade0), W64LIT(0x6fb6b6e2b6d95471), W64LIT(0x1ed0d067d0ceb7bd), W64LIT(0xd6eded93ed3b7ec7),
W64LIT(0xe2cccc17cc2edb85), W64LIT(0x68424215422a5784), W64LIT(0x2c98985a98b4c22d), W64LIT(0xeda4a4aaa4490e55),
W64LIT(0x752828a0285d8850), W64LIT(0x865c5c6d5cda31b8), W64LIT(0x6bf8f8c7f8933fed), W64LIT(0xc28686228644a411),
-};
-static const word64 C2[256] = {
- W64LIT(0x30d818186018c078), W64LIT(0x462623238c2305af), W64LIT(0x91b8c6c63fc67ef9), W64LIT(0xcdfbe8e887e8136f),
+ W64LIT(0x30d818186018c078), W64LIT(0x462623238c2305af), W64LIT(0x91b8c6c63fc67ef9), W64LIT(0xcdfbe8e887e8136f),
W64LIT(0x13cb878726874ca1), W64LIT(0x6d11b8b8dab8a962), W64LIT(0x0209010104010805), W64LIT(0x9e0d4f4f214f426e),
W64LIT(0x6c9b3636d836adee), W64LIT(0x51ffa6a6a2a65904), W64LIT(0xb90cd2d26fd2debd), W64LIT(0xf70ef5f5f3f5fb06),
W64LIT(0xf2967979f979ef80), W64LIT(0xde306f6fa16f5fce), W64LIT(0x3f6d91917e91fcef), W64LIT(0xa4f852525552aa07),
@@ -312,10 +309,8 @@ static const word64 C2[256] = {
W64LIT(0xe0d77070dd70a7ad), W64LIT(0x716fb6b6e2b6d954), W64LIT(0xbd1ed0d067d0ceb7), W64LIT(0xc7d6eded93ed3b7e),
W64LIT(0x85e2cccc17cc2edb), W64LIT(0x8468424215422a57), W64LIT(0x2d2c98985a98b4c2), W64LIT(0x55eda4a4aaa4490e),
W64LIT(0x50752828a0285d88), W64LIT(0xb8865c5c6d5cda31), W64LIT(0xed6bf8f8c7f8933f), W64LIT(0x11c28686228644a4),
-};
-static const word64 C3[256] = {
- W64LIT(0x7830d818186018c0), W64LIT(0xaf462623238c2305), W64LIT(0xf991b8c6c63fc67e), W64LIT(0x6fcdfbe8e887e813),
+ W64LIT(0x7830d818186018c0), W64LIT(0xaf462623238c2305), W64LIT(0xf991b8c6c63fc67e), W64LIT(0x6fcdfbe8e887e813),
W64LIT(0xa113cb878726874c), W64LIT(0x626d11b8b8dab8a9), W64LIT(0x0502090101040108), W64LIT(0x6e9e0d4f4f214f42),
W64LIT(0xee6c9b3636d836ad), W64LIT(0x0451ffa6a6a2a659), W64LIT(0xbdb90cd2d26fd2de), W64LIT(0x06f70ef5f5f3f5fb),
W64LIT(0x80f2967979f979ef), W64LIT(0xcede306f6fa16f5f), W64LIT(0xef3f6d91917e91fc), W64LIT(0x07a4f852525552aa),
@@ -379,9 +374,7 @@ static const word64 C3[256] = {
W64LIT(0xade0d77070dd70a7), W64LIT(0x54716fb6b6e2b6d9), W64LIT(0xb7bd1ed0d067d0ce), W64LIT(0x7ec7d6eded93ed3b),
W64LIT(0xdb85e2cccc17cc2e), W64LIT(0x578468424215422a), W64LIT(0xc22d2c98985a98b4), W64LIT(0x0e55eda4a4aaa449),
W64LIT(0x8850752828a0285d), W64LIT(0x31b8865c5c6d5cda), W64LIT(0x3fed6bf8f8c7f893), W64LIT(0xa411c28686228644),
-};
-static const word64 rc[R] = {
W64LIT(0x1823c6e887b8014f),
W64LIT(0x36a6d2f5796f9152),
W64LIT(0x60bc9b8ea30c7b35),
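As with Rijndael's Te/Td merge, the four circulant tables C0..C3 and the round constants rc[R] now share one array: quarter q of Whirlpool_C holds the old Cq, and the R round constants are appended at offset 4*256 = 1024, which is why the C code below XORs Whirlpool_C[1024+r] into k[0] and the MMX code reads [ebx + 8*1024 + esi*8]. Layout, schematically:

    // Sketch: layout of the fused table (R == 10 rounds for Whirlpool).
    //   Whirlpool_C[0*256 + x]  == old C0[x]
    //   Whirlpool_C[1*256 + x]  == old C1[x]
    //   Whirlpool_C[2*256 + x]  == old C2[x]
    //   Whirlpool_C[3*256 + x]  == old C3[x]
    //   Whirlpool_C[1024 + r]   == old rc[r]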
@@ -397,55 +390,292 @@ static const word64 rc[R] = {
// Whirlpool basic transformation. Transforms state based on block.
void Whirlpool::Transform(word64 *digest, const word64 *block)
{
+#ifdef CRYPTOPP_X86_ASM_AVAILABLE
+ if (HasMMX())
+ {
+ // MMX version has the same structure as C version below
+#ifdef __GNUC__
+ __asm__ __volatile__
+ (
+ ".intel_syntax noprefix;"
+ AS1( push ebx)
+ AS2( mov ebx, eax)
+#else
+ AS2( lea ebx, [Whirlpool_C])
+ AS2( mov ecx, digest)
+ AS2( mov edx, block)
+#endif
+ AS2( mov eax, esp)
+ AS2( and esp, 0xfffffff0)
+ AS2( sub esp, 16*8)
+ AS1( push eax)
+ AS2( xor esi, esi)
+ ASL(0)
+ AS2( movq mm0, [ecx+8*esi])
+ AS2( movq [esp+4+8*esi], mm0) // k
+ AS2( pxor mm0, [edx+8*esi])
+ AS2( movq [esp+4+64+8*esi], mm0) // s
+ AS2( movq [ecx+8*esi], mm0)
+ AS1( inc esi)
+ AS2( cmp esi, 8)
+ ASJ( jne, 0, b)
+
+ AS2( xor esi, esi)
+ ASL(1)
+
+#define KSL0(a, b) AS2(movq mm##a, b)
+#define KSL1(a, b) AS2(pxor mm##a, b)
+
+#define KSL(op, i, a, b, c, d) \
+ AS2(mov eax, [esp+4+8*i])\
+ AS2(movzx edi, al)\
+ KSL##op(a, [ebx+3*2048+8*edi])\
+ AS2(movzx edi, ah)\
+ KSL##op(b, [ebx+2*2048+8*edi])\
+ AS2(shr eax, 16)\
+ AS2(movzx edi, al)\
+ AS2(shr eax, 8)\
+ KSL##op(c, [ebx+1*2048+8*edi])\
+ KSL##op(d, [ebx+0*2048+8*eax])
+
+#define KSH0(a, b) \
+ ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\
+ AS2(pxor mm##a, b)
+#define KSH1(a, b) \
+ AS2(pxor mm##a, b)
+#define KSH2(a, b) \
+ AS2(pxor mm##a, b)\
+ AS2(movq [esp+4+8*a], mm##a)
+
+#define KSH(op, i, a, b, c, d) \
+ AS2(mov eax, [esp+4+8*((i+4)-8*((i+4)/8))+4])\
+ AS2(movzx edi, al)\
+ KSH##op(a, [ebx+3*2048+8*edi])\
+ AS2(movzx edi, ah)\
+ KSH##op(b, [ebx+2*2048+8*edi])\
+ AS2(shr eax, 16)\
+ AS2(movzx edi, al)\
+ AS2(shr eax, 8)\
+ KSH##op(c, [ebx+1*2048+8*edi])\
+ KSH##op(d, [ebx+0*2048+8*eax])
+
+#define TSL(op, i, a, b, c, d) \
+ AS2(mov eax, [esp+4+64+8*i])\
+ AS2(movzx edi, al)\
+ KSL##op(a, [ebx+3*2048+8*edi])\
+ AS2(movzx edi, ah)\
+ KSL##op(b, [ebx+2*2048+8*edi])\
+ AS2(shr eax, 16)\
+ AS2(movzx edi, al)\
+ AS2(shr eax, 8)\
+ KSL##op(c, [ebx+1*2048+8*edi])\
+ KSL##op(d, [ebx+0*2048+8*eax])
+
+#define TSH0(a, b) \
+ ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\
+ AS2(pxor mm##a, [esp+4+8*a])\
+ AS2(pxor mm##a, b)
+#define TSH1(a, b) \
+ AS2(pxor mm##a, b)
+#define TSH2(a, b) \
+ AS2(pxor mm##a, b)\
+ AS2(movq [esp+4+64+8*a], mm##a)
+#define TSH3(a, b) \
+ AS2(pxor mm##a, b)\
+ AS2(pxor mm##a, [ecx+8*a])\
+ AS2(movq [ecx+8*a], mm##a)
+
+#define TSH(op, i, a, b, c, d) \
+ AS2(mov eax, [esp+4+64+8*((i+4)-8*((i+4)/8))+4])\
+ AS2(movzx edi, al)\
+ TSH##op(a, [ebx+3*2048+8*edi])\
+ AS2(movzx edi, ah)\
+ TSH##op(b, [ebx+2*2048+8*edi])\
+ AS2(shr eax, 16)\
+ AS2(movzx edi, al)\
+ AS2(shr eax, 8)\
+ TSH##op(c, [ebx+1*2048+8*edi])\
+ TSH##op(d, [ebx+0*2048+8*eax])
+
+ KSL(0, 4, 3, 2, 1, 0)
+ KSL(0, 0, 7, 6, 5, 4)
+ KSL(1, 1, 0, 7, 6, 5)
+ KSL(1, 2, 1, 0, 7, 6)
+ KSL(1, 3, 2, 1, 0, 7)
+ KSL(1, 5, 4, 3, 2, 1)
+ KSL(1, 6, 5, 4, 3, 2)
+ KSL(1, 7, 6, 5, 4, 3)
+ KSH(0, 0, 7, 6, 5, 4)
+ KSH(0, 4, 3, 2, 1, 0)
+ KSH(1, 1, 0, 7, 6, 5)
+ KSH(1, 2, 1, 0, 7, 6)
+ KSH(1, 5, 4, 3, 2, 1)
+ KSH(1, 6, 5, 4, 3, 2)
+ KSH(2, 3, 2, 1, 0, 7)
+ KSH(2, 7, 6, 5, 4, 3)
+
+ AS2( pxor mm0, [ebx + 8*1024 + esi*8])
+ AS2( movq [esp+4], mm0)
+
+ TSL(0, 4, 3, 2, 1, 0)
+ TSL(0, 0, 7, 6, 5, 4)
+ TSL(1, 1, 0, 7, 6, 5)
+ TSL(1, 2, 1, 0, 7, 6)
+ TSL(1, 3, 2, 1, 0, 7)
+ TSL(1, 5, 4, 3, 2, 1)
+ TSL(1, 6, 5, 4, 3, 2)
+ TSL(1, 7, 6, 5, 4, 3)
+ TSH(0, 0, 7, 6, 5, 4)
+ TSH(0, 4, 3, 2, 1, 0)
+ TSH(1, 1, 0, 7, 6, 5)
+ TSH(1, 2, 1, 0, 7, 6)
+ TSH(1, 5, 4, 3, 2, 1)
+ TSH(1, 6, 5, 4, 3, 2)
+
+ AS1( inc esi)
+ AS2( cmp esi, 10)
+ ASJ( je, 2, f)
+
+ TSH(2, 3, 2, 1, 0, 7)
+ TSH(2, 7, 6, 5, 4, 3)
+
+ ASJ( jmp, 1, b)
+ ASL(2)
+
+ TSH(3, 3, 2, 1, 0, 7)
+ TSH(3, 7, 6, 5, 4, 3)
+
+#undef KSL
+#undef KSH
+#undef TSL
+#undef TSH
+
+ AS1( emms)
+ AS1( pop esp)
+
+#ifdef __GNUC__
+ AS1( pop ebx)
+ ".att_syntax prefix;"
+ :
+ : "a" (Whirlpool_C), "c" (digest), "d" (block)
+ : "%esi", "%edi", "memory", "cc"
+ );
+#endif
+ }
+ else
+#endif // #ifdef CRYPTOPP_X86_ASM_AVAILABLE
+ {
word64 s[8]; // the cipher state
word64 k[8]; // the round key
// Compute and apply K^0 to the cipher state
// Also apply part of the Miyaguchi-Preneel compression function
- digest[0] = s[0] = block[0] ^ (k[0] = digest[0]);
- digest[1] = s[1] = block[1] ^ (k[1] = digest[1]);
- digest[2] = s[2] = block[2] ^ (k[2] = digest[2]);
- digest[3] = s[3] = block[3] ^ (k[3] = digest[3]);
- digest[4] = s[4] = block[4] ^ (k[4] = digest[4]);
- digest[5] = s[5] = block[5] ^ (k[5] = digest[5]);
- digest[6] = s[6] = block[6] ^ (k[6] = digest[6]);
- digest[7] = s[7] = block[7] ^ (k[7] = digest[7]);
+ for (int i=0; i<8; i++)
+ digest[i] = s[i] = block[i] ^ (k[i] = digest[i]);
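This fold implements the Miyaguchi-Preneel construction: with W the 512-bit block cipher keyed by the previous digest, the new digest is H' = W_H(m) ^ m ^ H. The loop both seeds the cipher state with m ^ H and pre-stores m ^ H into digest[], so the final TSH(3,...) rounds only need one more XOR to complete the feed-forward:

    // Miyaguchi-Preneel feed-forward, as computed here:
    //   k = H                  (key schedule starts from the previous digest)
    //   s = m ^ k              (cipher state; m ^ H also parked in digest[])
    //   s = W_k(m)             (the R rounds below)
    //   H' = s ^ m ^ H         (TSH(3,...) XORs s into the parked m ^ H)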
+
+#define KSL(op, i, a, b, c, d) \
+ t = (word32)k[i];\
+ w##a = Whirlpool_C[3*256 + (byte)t] ^ (op ? w##a : 0);\
+ t >>= 8;\
+ w##b = Whirlpool_C[2*256 + (byte)t] ^ (op ? w##b : 0);\
+ t >>= 8;\
+ w##c = Whirlpool_C[1*256 + (byte)t] ^ (op ? w##c : 0);\
+ t >>= 8;\
+ w##d = Whirlpool_C[0*256 + t] ^ (op ? w##d : 0);
+
+#define KSH(op, i, a, b, c, d) \
+ t = (word32)(k[(i+4)%8]>>32);\
+ w##a = Whirlpool_C[3*256 + (byte)t] ^ (op ? w##a : rotrFixed(w##a, 32));\
+ if (op==2) k[a] = w##a;\
+ t >>= 8;\
+ w##b = Whirlpool_C[2*256 + (byte)t] ^ (op ? w##b : rotrFixed(w##b, 32));\
+ if (op==2) k[b] = w##b;\
+ t >>= 8;\
+ w##c = Whirlpool_C[1*256 + (byte)t] ^ (op ? w##c : rotrFixed(w##c, 32));\
+ if (op==2) k[c] = w##c;\
+ t >>= 8;\
+ w##d = Whirlpool_C[0*256 + t] ^ (op ? w##d : rotrFixed(w##d, 32));\
+ if (op==2) k[d] = w##d;\
+
+#define TSL(op, i, a, b, c, d) \
+ t = (word32)s[i];\
+ w##a = Whirlpool_C[3*256 + (byte)t] ^ (op ? w##a : 0);\
+ t >>= 8;\
+ w##b = Whirlpool_C[2*256 + (byte)t] ^ (op ? w##b : 0);\
+ t >>= 8;\
+ w##c = Whirlpool_C[1*256 + (byte)t] ^ (op ? w##c : 0);\
+ t >>= 8;\
+ w##d = Whirlpool_C[0*256 + t] ^ (op ? w##d : 0);
+
+#define TSH_OP(op, a, b) \
+ w##a = Whirlpool_C[b*256 + (byte)t] ^ (op ? w##a : rotrFixed(w##a, 32) ^ k[a]);\
+ if (op==2) s[a] = w##a;\
+ if (op==3) digest[a] ^= w##a;\
+
+#define TSH(op, i, a, b, c, d) \
+ t = (word32)(s[(i+4)%8]>>32);\
+ TSH_OP(op, a, 3);\
+ t >>= 8;\
+ TSH_OP(op, b, 2);\
+ t >>= 8;\
+ TSH_OP(op, c, 1);\
+ t >>= 8;\
+ TSH_OP(op, d, 0);\
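At every expansion site op is a literal 0..3, so conditionals such as (op ? w##a : 0) and if (op==2) k[a] = w##a; fold away at compile time; one macro body thus generates the first-lookup, accumulate, store, and final feed-forward variants, mirroring the KSL0/KSL1 and TSH0..TSH3 split in the MMX path. A minimal sketch of the folding trick:

    // Sketch: compile-time variant selection through a literal macro argument.
    #define ACC(op, w, v) ((w) = (v) ^ ((op) ? (w) : 0))
    // ACC(0, w, v)  ->  w = v ^ 0   (initialize)
    // ACC(1, w, v)  ->  w = v ^ w   (accumulate)
    // The constant test folds; no branch is emitted.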
// Iterate over all rounds:
- for (int r = 0; r < R; r++)
+ int r=0;
+ while (true)
{
word64 w0, w1, w2, w3, w4, w5, w6, w7; // temporary storage
- word64 t;
-
- // Compute K^r from K^{r-1}:
-#define K(i,j) GETBYTE(k[(i+j+1)%8], j)
-#define KS(i) \
- t = C0[K(i,3)] ^ C1[K(i,2)] ^ C2[K(i,1)] ^ C3[K(i,0)]; \
- w##i = rotrFixed(t, 32) ^ C0[K(i,7)] ^ C1[K(i,6)] ^ C2[K(i,5)] ^ C3[K(i,4)];
-
- KS(0); KS(1); KS(2); KS(3); KS(4); KS(5); KS(6); KS(7);
- k[0] = w0 ^ rc[r];
- k[1] = w1; k[2] = w2; k[3] = w3; k[4] = w4; k[5] = w5; k[6] = w6; k[7] = w7;
-
- // Apply the r-th round transformation:
-#define S(i,j) GETBYTE(s[(i+j+1)%8], j)
-#define TS(i) \
- t = C0[S(i,3)] ^ C1[S(i,2)] ^ C2[S(i,1)] ^ C3[S(i,0)]; \
- w##i = rotrFixed(t, 32) ^ C0[S(i,7)] ^ C1[S(i,6)] ^ C2[S(i,5)] ^ C3[S(i,4)] ^ k[i];
-
- TS(0); TS(1); TS(2); TS(3); TS(4); TS(5); TS(6); TS(7);
- s[0] = w0; s[1] = w1; s[2] = w2; s[3] = w3; s[4] = w4; s[5] = w5; s[6] = w6; s[7] = w7;
- }
+ word32 t;
+
+ KSL(0, 4, 3, 2, 1, 0)
+ KSL(0, 0, 7, 6, 5, 4)
+ KSL(1, 1, 0, 7, 6, 5)
+ KSL(1, 2, 1, 0, 7, 6)
+ KSL(1, 3, 2, 1, 0, 7)
+ KSL(1, 5, 4, 3, 2, 1)
+ KSL(1, 6, 5, 4, 3, 2)
+ KSL(1, 7, 6, 5, 4, 3)
+ KSH(0, 0, 7, 6, 5, 4)
+ KSH(0, 4, 3, 2, 1, 0)
+ KSH(1, 1, 0, 7, 6, 5)
+ KSH(1, 2, 1, 0, 7, 6)
+ KSH(1, 5, 4, 3, 2, 1)
+ KSH(1, 6, 5, 4, 3, 2)
+ KSH(2, 3, 2, 1, 0, 7)
+ KSH(2, 7, 6, 5, 4, 3)
- // Apply the rest of the Miyaguchi-Preneel compression function:
- digest[0] ^= s[0];
- digest[1] ^= s[1];
- digest[2] ^= s[2];
- digest[3] ^= s[3];
- digest[4] ^= s[4];
- digest[5] ^= s[5];
- digest[6] ^= s[6];
- digest[7] ^= s[7];
+ k[0] ^= Whirlpool_C[1024+r];
+
+ TSL(0, 4, 3, 2, 1, 0)
+ TSL(0, 0, 7, 6, 5, 4)
+ TSL(1, 1, 0, 7, 6, 5)
+ TSL(1, 2, 1, 0, 7, 6)
+ TSL(1, 3, 2, 1, 0, 7)
+ TSL(1, 5, 4, 3, 2, 1)
+ TSL(1, 6, 5, 4, 3, 2)
+ TSL(1, 7, 6, 5, 4, 3)
+ TSH(0, 0, 7, 6, 5, 4)
+ TSH(0, 4, 3, 2, 1, 0)
+ TSH(1, 1, 0, 7, 6, 5)
+ TSH(1, 2, 1, 0, 7, 6)
+ TSH(1, 5, 4, 3, 2, 1)
+ TSH(1, 6, 5, 4, 3, 2)
+
+ if (++r < R)
+ {
+ TSH(2, 3, 2, 1, 0, 7)
+ TSH(2, 7, 6, 5, 4, 3)
+ }
+ else
+ {
+ TSH(3, 3, 2, 1, 0, 7)
+ TSH(3, 7, 6, 5, 4, 3)
+ break;
+ }
+ }
+ }
}
NAMESPACE_END
diff --git a/whrlpool.h b/whrlpool.h
index c6971f08..298850ab 100644
--- a/whrlpool.h
+++ b/whrlpool.h
@@ -9,8 +9,7 @@
NAMESPACE_BEGIN(CryptoPP)
-//! <a href="http://www.weidai.com/scan-mirror/md.html#Whirlpool">Whirlpool</a>
-/*! 512 Bit Hash */
+//! <a href="http://www.cryptolounge.org/wiki/Whirlpool">Whirlpool</a>
class Whirlpool : public IteratedHashWithStaticTransform<word64, BigEndian, 64, 64, Whirlpool>
{
public: