diff options
-rw-r--r-- | integer.cpp | 3267 | ||||
-rw-r--r-- | integer.h | 37 | ||||
-rw-r--r-- | rijndael.cpp | 576 | ||||
-rw-r--r-- | rijndael.h | 15 | ||||
-rw-r--r-- | sha.cpp | 440 | ||||
-rw-r--r-- | sha.h | 6 | ||||
-rw-r--r-- | tiger.cpp | 229 | ||||
-rw-r--r-- | tiger.h | 4 | ||||
-rw-r--r-- | whrlpool.cpp | 344 | ||||
-rw-r--r-- | whrlpool.h | 3 |
10 files changed, 2765 insertions, 2156 deletions
diff --git a/integer.cpp b/integer.cpp index 0c5018ee..515643ed 100644 --- a/integer.cpp +++ b/integer.cpp @@ -14,30 +14,20 @@ #include "algparam.h" #include "pubkey.h" // for P1363_KDF2 #include "sha.h" +#include "cpu.h" #include <iostream> -#ifdef _M_X64 -#include <Intrin.h> +#if defined(_MSC_VER) && _MSC_VER >= 1400 + #include <intrin.h> #endif -#ifdef SSE2_INTRINSICS_AVAILABLE - #ifdef __GNUC__ - #include <xmmintrin.h> - #include <signal.h> - #include <setjmp.h> - #ifdef CRYPTOPP_MEMALIGN_AVAILABLE - #include <malloc.h> - #else - #include <stdlib.h> - #endif - #else - #include <emmintrin.h> - #endif -#elif defined(_MSC_VER) && defined(_M_IX86) - #pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 intrinsics will be disabled.") -#elif defined(__GNUC__) && defined(__i386__) - #warning "You do not have GCC 3.3 or later, or did not specify -msse2 compiler option, so use of SSE2 intrinsics will be disabled." +#ifdef __DECCXX + #include <c_asm.h> +#endif + +#ifdef CRYPTOPP_MSVC6_NO_PP + #pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 instructions will be disabled.") #endif NAMESPACE_BEGIN(CryptoPP) @@ -50,67 +40,7 @@ bool AssignIntToInteger(const std::type_info &valueType, void *pInteger, const v return true; } -#ifdef SSE2_INTRINSICS_AVAILABLE -template <class T> -CPP_TYPENAME AlignedAllocator<T>::pointer AlignedAllocator<T>::allocate(size_type n, const void *) -{ - CheckSize(n); - if (n == 0) - return NULL; - if (n >= 4) - { - void *p; - #ifdef CRYPTOPP_MM_MALLOC_AVAILABLE - while (!(p = _mm_malloc(sizeof(T)*n, 16))) - #elif defined(CRYPTOPP_MEMALIGN_AVAILABLE) - while (!(p = memalign(16, sizeof(T)*n))) - #elif defined(CRYPTOPP_MALLOC_ALIGNMENT_IS_16) - while (!(p = malloc(sizeof(T)*n))) - #else - while (!(p = (byte *)malloc(sizeof(T)*n + 8))) // assume malloc alignment is at least 8 - #endif - CallNewHandler(); - - #ifdef CRYPTOPP_NO_ALIGNED_ALLOC - assert(m_pBlock == 
NULL); - m_pBlock = p; - if (!IsAlignedOn(p, 16)) - { - assert(IsAlignedOn(p, 8)); - p = (byte *)p + 8; - } - #endif - - assert(IsAlignedOn(p, 16)); - return (T*)p; - } - return new T[n]; -} - -template <class T> -void AlignedAllocator<T>::deallocate(void *p, size_type n) -{ - memset(p, 0, n*sizeof(T)); - if (n >= 4) - { - #ifdef CRYPTOPP_MM_MALLOC_AVAILABLE - _mm_free(p); - #elif defined(CRYPTOPP_NO_ALIGNED_ALLOC) - assert(m_pBlock == p || (byte *)m_pBlock+8 == p); - free(m_pBlock); - m_pBlock = NULL; - #else - free(p); - #endif - } - else - delete [] (T *)p; -} - -template class CRYPTOPP_DLL AlignedAllocator<word>; -#endif - -static int Compare(const word *A, const word *B, size_t N) +inline static int Compare(const word *A, const word *B, size_t N) { while (N--) if (A[N] > B[N]) @@ -121,7 +51,7 @@ static int Compare(const word *A, const word *B, size_t N) return 0; } -static int Increment(word *A, size_t N, word B=1) +inline static int Increment(word *A, size_t N, word B=1) { assert(N); word t = A[0]; @@ -134,7 +64,7 @@ static int Increment(word *A, size_t N, word B=1) return 1; } -static int Decrement(word *A, size_t N, word B=1) +inline static int Decrement(word *A, size_t N, word B=1) { assert(N); word t = A[0]; @@ -169,6 +99,45 @@ static word AtomicInverseModPower2(word A) // ******************************************************** +#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE + #define Declare2Words(x) dword x; + #if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER) + #define MultiplyWords(p, a, b) p = __emulu(a, b); + #else + #define MultiplyWords(p, a, b) p = (dword)a*b; + #endif + #define AssignWord(a, b) a = b; + #define Add2WordsBy1(a, b, c) a = b + c; + #define Acc2WordsBy1(a, b) a += b; + #define Acc2WordsBy2(a, b) a += b; + #define LowWord(a) (word)a + #define HighWord(a) (word)(a>>WORD_BITS) + #define Double2Words(a) a += a; + #define AddWithCarry(u, a, b) u = dword(a) + b + GetCarry(u); + #define SubtractWithBorrow(u, a, b) u = dword(a) - b - GetBorrow(u); 
+ #define GetCarry(u) HighWord(u) + #define GetBorrow(u) word(u>>(WORD_BITS*2-1)) +#else + #define Declare2Words(x) word x##0, x##1; + #define AssignWord(a, b) a##0 = b; a##1 = 0; + #define Add2WordsBy1(a, b, c) a##0 = b##0 + c; a##1 = b##1 + (a##0 < c); + #define Acc2WordsBy1(a, b) Add2WordsBy1(a, a, b) + #define Acc2WordsBy2(a, b) a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1; + #define LowWord(a) a##0 + #define HighWord(a) a##1 + #ifdef _MSC_VER + #define MultiplyWords(p, a, b) p##0 = _umul128(a, b, &p##1); + #define Double2Words(a) a##1 = __shiftleft128(a##0, a##1, 1); a##0 += a##0; + #elif defined(__DECCXX) + #define MultiplyWords(p, a, b) p##0 = a*b; p##1 = asm("umulh %a0, %a1, %v0", a, b); + #define Double2Words(a) a##1 = (a##1 + a##1) + (a##0 >> (WORD_BITS-1)); a##0 += a##0; + #endif + #define AddWithCarry(u, a, b) {word t = a+b; u##0 = t + u##1; u##1 = (t<a) + (u##0<t);} + #define SubtractWithBorrow(u, a, b) {word t = a-b; u##0 = t - u##1; u##1 = (t>a) + (u##0>t);} + #define GetCarry(u) u##1 + #define GetBorrow(u) u##1 +#endif + class DWord { public: @@ -198,25 +167,8 @@ public: DWord r; #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE r.m_whole = (dword)a * b; - #elif defined(__alpha__) - r.m_halfs.low = a*b; __asm__("umulh %1,%2,%0" : "=r" (r.m_halfs.high) : "r" (a), "r" (b)); - #elif defined(__ia64__) - r.m_halfs.low = a*b; __asm__("xmpy.hu %0=%1,%2" : "=f" (r.m_halfs.high) : "f" (a), "f" (b)); - #elif defined(_ARCH_PPC64) - r.m_halfs.low = a*b; __asm__("mulhdu %0,%1,%2" : "=r" (r.m_halfs.high) : "r" (a), "r" (b) : "cc"); - #elif defined(__x86_64__) - __asm__("mulq %3" : "=d" (r.m_halfs.high), "=a" (r.m_halfs.low) : "a" (a), "rm" (b) : "cc"); - #elif defined(__mips64) - __asm__("dmultu %2,%3" : "=h" (r.m_halfs.high), "=l" (r.m_halfs.low) : "r" (a), "r" (b)); - #elif defined(_M_X64) - r.m_halfs.low = _umul128(a, b, &r.m_halfs.high); - #elif defined(_M_IX86) - // for testing - word64 t = (word64)a * b; - r.m_halfs.high = ((word32 *)(&t))[1]; - r.m_halfs.low = 
(word32)t; #else - #error can not implement DWord + r.m_halfs.low = _umul128(a, b, &r.m_halfs.high); #endif return r; } @@ -457,1529 +409,1449 @@ inline word DWord::operator%(word a) // ******************************************************** -class Portable -{ -public: - static int Add(word *C, const word *A, const word *B, size_t N); - static int Subtract(word *C, const word *A, const word *B, size_t N); - - static inline void Multiply2(word *C, const word *A, const word *B); - static inline word Multiply2Add(word *C, const word *A, const word *B); - static void Multiply4(word *C, const word *A, const word *B); - static void Multiply8(word *C, const word *A, const word *B); - static inline unsigned int MultiplyRecursionLimit() {return 8;} - - static inline void Multiply2Bottom(word *C, const word *A, const word *B); - static void Multiply4Bottom(word *C, const word *A, const word *B); - static void Multiply8Bottom(word *C, const word *A, const word *B); - static inline unsigned int MultiplyBottomRecursionLimit() {return 8;} - - static void Square2(word *R, const word *A); - static void Square4(word *R, const word *A); - static void Square8(word *R, const word *A) {assert(false);} - static inline unsigned int SquareRecursionLimit() {return 4;} -}; +// use some tricks to share assembly code between MSVC and GCC +#if defined(__GNUC__) + #define CRYPTOPP_NAKED + #define AddPrologue \ + __asm__ __volatile__ \ + ( \ + "push %%ebx;" /* save this manually, in case of -fPIC */ \ + "mov %2, %%ebx;" \ + ".intel_syntax noprefix;" + #define AddEpilogue \ + ".att_syntax prefix;" \ + "pop %%ebx;" \ + : \ + : "d" (C), "a" (A), "m" (B), "c" (N) \ + : "%esi", "memory", "cc" \ + ); + #define MulPrologue \ + __asm__ __volatile__ \ + ( \ + ".intel_syntax noprefix;" \ + AS1( push ebx) \ + AS2( mov ebx, edx) + #define MulEpilogue \ + AS1( pop ebx) \ + ".att_syntax prefix;" \ + : \ + : "d" (s_maskLow16), "c" (C), "a" (A), "D" (B) \ + : "%esi", "memory", "cc" \ + ); + #define SquPrologue 
MulPrologue + #define SquEpilogue \ + AS1( pop ebx) \ + ".att_syntax prefix;" \ + : \ + : "d" (s_maskLow16), "c" (C), "a" (A) \ + : "%esi", "%edi", "memory", "cc" \ + ); + #define TopPrologue MulPrologue + #define TopEpilogue \ + AS1( pop ebx) \ + ".att_syntax prefix;" \ + : \ + : "d" (s_maskLow16), "c" (C), "a" (A), "D" (B), "S" (L) \ + : "memory", "cc" \ + ); +#else + #define CRYPTOPP_NAKED __declspec(naked) + #define AddPrologue \ + __asm push ebx \ + __asm push esi \ + __asm mov eax, [esp+12] \ + __asm mov ebx, [esp+16] + #define AddEpilogue \ + __asm pop esi \ + __asm pop ebx \ + __asm ret 8 + #define SquPrologue \ + AS2( mov eax, A) \ + AS2( mov ecx, C) \ + AS2( lea ebx, s_maskLow16) + #define SquEpilogue + #define MulPrologue \ + AS2( mov eax, A) \ + AS2( mov edi, B) \ + AS2( mov ecx, C) \ + AS2( lea ebx, s_maskLow16) + #define MulEpilogue + #define TopPrologue \ + AS2( mov eax, A) \ + AS2( mov edi, B) \ + AS2( mov ecx, C) \ + AS2( mov esi, L) \ + AS2( lea ebx, s_maskLow16) + #define TopEpilogue +#endif -int Portable::Add(word *C, const word *A, const word *B, size_t N) +#if defined(_MSC_VER) && defined(_M_X64) +extern "C" { +int Baseline_Add(size_t N, word *C, const word *A, const word *B); +int Baseline_Sub(size_t N, word *C, const word *A, const word *B); +} +#elif defined(CRYPTOPP_X86_ASM_AVAILABLE) +CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B) { - assert (N%2 == 0); + AddPrologue - DWord u(0, 0); - for (unsigned int i = 0; i < N; i+=2) - { - u = DWord(A[i]) + B[i] + u.GetHighHalf(); - C[i] = u.GetLowHalf(); - u = DWord(A[i+1]) + B[i+1] + u.GetHighHalf(); - C[i+1] = u.GetLowHalf(); - } - return int(u.GetHighHalf()); + // now: eax = A, ebx = B, edx = C, ecx = N + AS2( lea eax, [eax+4*ecx]) + AS2( lea ebx, [ebx+4*ecx]) + AS2( lea edx, [edx+4*ecx]) + + AS1( neg ecx) // ecx is negative index + AS2( test ecx, 2) // this clears carry flag + ASJ( jz, 0, f) + AS2( sub ecx, 2) + ASJ( jmp, 1, f) + + ASL(0) + 
ASJ( jecxz, 2, f) // loop until ecx overflows and becomes zero + AS2( mov esi,[eax+4*ecx]) + AS2( adc esi,[ebx+4*ecx]) + AS2( mov [edx+4*ecx],esi) + AS2( mov esi,[eax+4*ecx+4]) + AS2( adc esi,[ebx+4*ecx+4]) + AS2( mov [edx+4*ecx+4],esi) + ASL(1) + AS2( mov esi,[eax+4*ecx+8]) + AS2( adc esi,[ebx+4*ecx+8]) + AS2( mov [edx+4*ecx+8],esi) + AS2( mov esi,[eax+4*ecx+12]) + AS2( adc esi,[ebx+4*ecx+12]) + AS2( mov [edx+4*ecx+12],esi) + + AS2( lea ecx,[ecx+4]) // advance index, avoid inc which causes slowdown on Intel Core 2 + ASJ( jmp, 0, b) + + ASL(2) + AS2( mov eax, 0) + AS1( setc al) // store carry into eax (return result register) + + AddEpilogue } -int Portable::Subtract(word *C, const word *A, const word *B, size_t N) +CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B) { - assert (N%2 == 0); + AddPrologue - DWord u(0, 0); - for (unsigned int i = 0; i < N; i+=2) - { - u = (DWord) A[i] - B[i] - u.GetHighHalfAsBorrow(); - C[i] = u.GetLowHalf(); - u = (DWord) A[i+1] - B[i+1] - u.GetHighHalfAsBorrow(); - C[i+1] = u.GetLowHalf(); - } - return int(0-u.GetHighHalf()); + // now: eax = A, ebx = B, edx = C, ecx = N + AS2( lea eax, [eax+4*ecx]) + AS2( lea ebx, [ebx+4*ecx]) + AS2( lea edx, [edx+4*ecx]) + + AS1( neg ecx) // ecx is negative index + AS2( test ecx, 2) // this clears carry flag + ASJ( jz, 0, f) + AS2( sub ecx, 2) + ASJ( jmp, 1, f) + + ASL(0) + ASJ( jecxz, 2, f) // loop until ecx overflows and becomes zero + AS2( mov esi,[eax+4*ecx]) + AS2( sbb esi,[ebx+4*ecx]) + AS2( mov [edx+4*ecx],esi) + AS2( mov esi,[eax+4*ecx+4]) + AS2( sbb esi,[ebx+4*ecx+4]) + AS2( mov [edx+4*ecx+4],esi) + ASL(1) + AS2( mov esi,[eax+4*ecx+8]) + AS2( sbb esi,[ebx+4*ecx+8]) + AS2( mov [edx+4*ecx+8],esi) + AS2( mov esi,[eax+4*ecx+12]) + AS2( sbb esi,[ebx+4*ecx+12]) + AS2( mov [edx+4*ecx+12],esi) + + AS2( lea ecx,[ecx+4]) // advance index, avoid inc which causes slowdown on Intel Core 2 + ASJ( jmp, 0, b) + + ASL(2) + AS2( mov eax, 0) + AS1( setc al) // 
store carry into eax (return result register) + + AddEpilogue } -void Portable::Multiply2(word *C, const word *A, const word *B) +CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B) { -/* - word s; - dword d; + AddPrologue - if (A1 >= A0) - if (B0 >= B1) - { - s = 0; - d = (dword)(A1-A0)*(B0-B1); - } - else - { - s = (A1-A0); - d = (dword)s*(word)(B0-B1); - } - else - if (B0 > B1) - { - s = (B0-B1); - d = (word)(A1-A0)*(dword)s; - } - else - { - s = 0; - d = (dword)(A0-A1)*(B1-B0); - } -*/ - // this segment is the branchless equivalent of above - word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]}; - unsigned int ai = A[1] < A[0]; - unsigned int bi = B[0] < B[1]; - unsigned int di = ai & bi; - DWord d = DWord::Multiply(D[di], D[di+2]); - D[1] = D[3] = 0; - unsigned int si = ai + !bi; - word s = D[si]; - - DWord A0B0 = DWord::Multiply(A[0], B[0]); - C[0] = A0B0.GetLowHalf(); - - DWord A1B1 = DWord::Multiply(A[1], B[1]); - DWord t = (DWord) A0B0.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() + A1B1.GetLowHalf(); - C[1] = t.GetLowHalf(); - - t = A1B1 + t.GetHighHalf() + A0B0.GetHighHalf() + d.GetHighHalf() + A1B1.GetHighHalf() - s; - C[2] = t.GetLowHalf(); - C[3] = t.GetHighHalf(); -} - -inline void Portable::Multiply2Bottom(word *C, const word *A, const word *B) -{ - DWord t = DWord::Multiply(A[0], B[0]); - C[0] = t.GetLowHalf(); - C[1] = t.GetHighHalf() + A[0]*B[1] + A[1]*B[0]; -} - -word Portable::Multiply2Add(word *C, const word *A, const word *B) -{ - word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]}; - unsigned int ai = A[1] < A[0]; - unsigned int bi = B[0] < B[1]; - unsigned int di = ai & bi; - DWord d = DWord::Multiply(D[di], D[di+2]); - D[1] = D[3] = 0; - unsigned int si = ai + !bi; - word s = D[si]; - - DWord A0B0 = DWord::Multiply(A[0], B[0]); - DWord t = A0B0 + C[0]; - C[0] = t.GetLowHalf(); - - DWord A1B1 = DWord::Multiply(A[1], B[1]); - t = (DWord) t.GetHighHalf() + A0B0.GetLowHalf() + 
d.GetLowHalf() + A1B1.GetLowHalf() + C[1]; - C[1] = t.GetLowHalf(); - - t = (DWord) t.GetHighHalf() + A1B1.GetLowHalf() + A0B0.GetHighHalf() + d.GetHighHalf() + A1B1.GetHighHalf() - s + C[2]; - C[2] = t.GetLowHalf(); - - t = (DWord) t.GetHighHalf() + A1B1.GetHighHalf() + C[3]; - C[3] = t.GetLowHalf(); - return t.GetHighHalf(); -} - -#define MulAcc(x, y) \ - p = DWord::MultiplyAndAdd(A[x], B[y], c); \ - c = p.GetLowHalf(); \ - p = (DWord) d + p.GetHighHalf(); \ - d = p.GetLowHalf(); \ - e += p.GetHighHalf(); - -#define SaveMulAcc(s, x, y) \ - R[s] = c; \ - p = DWord::MultiplyAndAdd(A[x], B[y], d); \ - c = p.GetLowHalf(); \ - p = (DWord) e + p.GetHighHalf(); \ - d = p.GetLowHalf(); \ - e = p.GetHighHalf(); - -#define SquAcc(x, y) \ - q = DWord::Multiply(A[x], A[y]); \ - p = q + c; \ - c = p.GetLowHalf(); \ - p = (DWord) d + p.GetHighHalf(); \ - d = p.GetLowHalf(); \ - e += p.GetHighHalf(); \ - p = q + c; \ - c = p.GetLowHalf(); \ - p = (DWord) d + p.GetHighHalf(); \ - d = p.GetLowHalf(); \ - e += p.GetHighHalf(); - -#define SaveSquAcc(s, x, y) \ - R[s] = c; \ - q = DWord::Multiply(A[x], A[y]); \ - p = q + d; \ - c = p.GetLowHalf(); \ - p = (DWord) e + p.GetHighHalf(); \ - d = p.GetLowHalf(); \ - e = p.GetHighHalf(); \ - p = q + c; \ - c = p.GetLowHalf(); \ - p = (DWord) d + p.GetHighHalf(); \ - d = p.GetLowHalf(); \ - e += p.GetHighHalf(); - -void Portable::Multiply4(word *R, const word *A, const word *B) -{ - DWord p; - word c, d, e; - - p = DWord::Multiply(A[0], B[0]); - R[0] = p.GetLowHalf(); - c = p.GetHighHalf(); - d = e = 0; - - MulAcc(0, 1); - MulAcc(1, 0); - - SaveMulAcc(1, 2, 0); - MulAcc(1, 1); - MulAcc(0, 2); - - SaveMulAcc(2, 0, 3); - MulAcc(1, 2); - MulAcc(2, 1); - MulAcc(3, 0); - - SaveMulAcc(3, 3, 1); - MulAcc(2, 2); - MulAcc(1, 3); - - SaveMulAcc(4, 2, 3); - MulAcc(3, 2); - - R[5] = c; - p = DWord::MultiplyAndAdd(A[3], B[3], d); - R[6] = p.GetLowHalf(); - R[7] = e + p.GetHighHalf(); -} - -void Portable::Square2(word *R, const word *A) -{ - DWord p, q; 
- word c, d, e; - - p = DWord::Multiply(A[0], A[0]); - R[0] = p.GetLowHalf(); - c = p.GetHighHalf(); - d = e = 0; - - SquAcc(0, 1); - - R[1] = c; - p = DWord::MultiplyAndAdd(A[1], A[1], d); - R[2] = p.GetLowHalf(); - R[3] = e + p.GetHighHalf(); -} - -void Portable::Square4(word *R, const word *A) -{ -#ifdef _MSC_VER - // VC60 workaround: MSVC 6.0 has an optimization bug that makes - // (dword)A*B where either A or B has been cast to a dword before - // very expensive. Revisit this function when this - // bug is fixed. - Multiply4(R, A, A); -#else - const word *B = A; - DWord p, q; - word c, d, e; + // now: eax = A, ebx = B, edx = C, ecx = N + AS2( lea eax, [eax+4*ecx]) + AS2( lea ebx, [ebx+4*ecx]) + AS2( lea edx, [edx+4*ecx]) + + AS1( neg ecx) // ecx is negative index + AS2( pxor mm2, mm2) + ASJ( jz, 2, f) + AS2( test ecx, 2) // this clears carry flag + ASJ( jz, 0, f) + AS2( sub ecx, 2) + ASJ( jmp, 1, f) + + ASL(0) + AS2( movd mm0, DWORD PTR [eax+4*ecx]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx]) + AS2( paddq mm0, mm1) + AS2( paddq mm2, mm0) + AS2( movd DWORD PTR [edx+4*ecx], mm2) + AS2( psrlq mm2, 32) + + AS2( movd mm0, DWORD PTR [eax+4*ecx+4]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx+4]) + AS2( paddq mm0, mm1) + AS2( paddq mm2, mm0) + AS2( movd DWORD PTR [edx+4*ecx+4], mm2) + AS2( psrlq mm2, 32) + + ASL(1) + AS2( movd mm0, DWORD PTR [eax+4*ecx+8]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx+8]) + AS2( paddq mm0, mm1) + AS2( paddq mm2, mm0) + AS2( movd DWORD PTR [edx+4*ecx+8], mm2) + AS2( psrlq mm2, 32) + + AS2( movd mm0, DWORD PTR [eax+4*ecx+12]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx+12]) + AS2( paddq mm0, mm1) + AS2( paddq mm2, mm0) + AS2( movd DWORD PTR [edx+4*ecx+12], mm2) + AS2( psrlq mm2, 32) + + AS2( add ecx, 4) + ASJ( jnz, 0, b) + + ASL(2) + AS2( movd eax, mm2) + AS1( emms) - p = DWord::Multiply(A[0], A[0]); - R[0] = p.GetLowHalf(); - c = p.GetHighHalf(); - d = e = 0; + AddEpilogue +} - SquAcc(0, 1); +CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, 
const word *A, const word *B) +{ + AddPrologue - SaveSquAcc(1, 2, 0); - MulAcc(1, 1); + // now: eax = A, ebx = B, edx = C, ecx = N + AS2( lea eax, [eax+4*ecx]) + AS2( lea ebx, [ebx+4*ecx]) + AS2( lea edx, [edx+4*ecx]) + + AS1( neg ecx) // ecx is negative index + AS2( pxor mm2, mm2) + ASJ( jz, 2, f) + AS2( test ecx, 2) // this clears carry flag + ASJ( jz, 0, f) + AS2( sub ecx, 2) + ASJ( jmp, 1, f) + + ASL(0) + AS2( movd mm0, DWORD PTR [eax+4*ecx]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx]) + AS2( psubq mm0, mm1) + AS2( psubq mm0, mm2) + AS2( movd DWORD PTR [edx+4*ecx], mm0) + AS2( psrlq mm0, 63) + + AS2( movd mm2, DWORD PTR [eax+4*ecx+4]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx+4]) + AS2( psubq mm2, mm1) + AS2( psubq mm2, mm0) + AS2( movd DWORD PTR [edx+4*ecx+4], mm2) + AS2( psrlq mm2, 63) + + ASL(1) + AS2( movd mm0, DWORD PTR [eax+4*ecx+8]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx+8]) + AS2( psubq mm0, mm1) + AS2( psubq mm0, mm2) + AS2( movd DWORD PTR [edx+4*ecx+8], mm0) + AS2( psrlq mm0, 63) + + AS2( movd mm2, DWORD PTR [eax+4*ecx+12]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx+12]) + AS2( psubq mm2, mm1) + AS2( psubq mm2, mm0) + AS2( movd DWORD PTR [edx+4*ecx+12], mm2) + AS2( psrlq mm2, 63) + + AS2( add ecx, 4) + ASJ( jnz, 0, b) + + ASL(2) + AS2( movd eax, mm2) + AS1( emms) - SaveSquAcc(2, 0, 3); - SquAcc(1, 2); + AddEpilogue +} +#else +int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B) +{ + assert (N%2 == 0); - SaveSquAcc(3, 3, 1); - MulAcc(2, 2); + Declare2Words(u); + for (size_t i=0; i<N; i+=2) + { + AddWithCarry(u, A[i], B[i]); + C[i] = LowWord(u); + AddWithCarry(u, A[i+1], B[i+1]); + C[i+1] = LowWord(u); + } + return int(GetCarry(u)); +} - SaveSquAcc(4, 2, 3); +int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B) +{ + assert (N%2 == 0); - R[5] = c; - p = DWord::MultiplyAndAdd(A[3], A[3], d); - R[6] = p.GetLowHalf(); - R[7] = e + p.GetHighHalf(); + Declare2Words(u); + for (size_t i=0; i<N; i+=2) + { + 
SubtractWithBorrow(u, A[i], B[i]); + C[i] = LowWord(u); + SubtractWithBorrow(u, A[i+1], B[i+1]); + C[i+1] = LowWord(u); + } + return int(GetBorrow(u)); +} #endif + +static word LinearMultiply(word *C, const word *A, word B, size_t N) +{ + word carry=0; + for(unsigned i=0; i<N; i++) + { + Declare2Words(p); + MultiplyWords(p, A[i], B); + Acc2WordsBy1(p, carry); + C[i] = LowWord(p); + carry = HighWord(p); + } + return carry; } -void Portable::Multiply8(word *R, const word *A, const word *B) -{ - DWord p; - word c, d, e; - - p = DWord::Multiply(A[0], B[0]); - R[0] = p.GetLowHalf(); - c = p.GetHighHalf(); - d = e = 0; - - MulAcc(0, 1); - MulAcc(1, 0); - - SaveMulAcc(1, 2, 0); - MulAcc(1, 1); - MulAcc(0, 2); +#define Mul_2 \ + Mul_Begin(2) \ + Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \ + Mul_End(2) + +#define Mul_4 \ + Mul_Begin(4) \ + Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \ + Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \ + Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \ + Mul_SaveAcc(3, 1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) \ + Mul_SaveAcc(4, 2, 3) Mul_Acc(3, 2) \ + Mul_End(4) + +#define Mul_8 \ + Mul_Begin(8) \ + Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \ + Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \ + Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \ + Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \ + Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \ + Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \ + Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \ + Mul_SaveAcc(7, 1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \ + Mul_SaveAcc(8, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \ + Mul_SaveAcc(9, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \ + 
Mul_SaveAcc(10, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \ + Mul_SaveAcc(11, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \ + Mul_SaveAcc(12, 6, 7) Mul_Acc(7, 6) \ + Mul_End(8) + +#define Mul_16 \ + Mul_Begin(16) \ + Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \ + Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \ + Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \ + Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \ + Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \ + Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \ + Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \ + Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \ + Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \ + Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \ + Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \ + Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \ + Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \ + Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 
4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \ + Mul_SaveAcc(14, 0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \ + Mul_SaveAcc(15, 1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \ + Mul_SaveAcc(16, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \ + Mul_SaveAcc(17, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \ + Mul_SaveAcc(18, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \ + Mul_SaveAcc(19, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \ + Mul_SaveAcc(20, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \ + Mul_SaveAcc(21, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \ + Mul_SaveAcc(22, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \ + Mul_SaveAcc(23, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \ + Mul_SaveAcc(24, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) 
Mul_Acc(14, 11) Mul_Acc(15, 10) \ + Mul_SaveAcc(25, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \ + Mul_SaveAcc(26, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \ + Mul_SaveAcc(27, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \ + Mul_SaveAcc(28, 14, 15) Mul_Acc(15, 14) \ + Mul_End(16) + +#define Squ_2 \ + Squ_Begin(2) \ + Squ_End(2) + +#define Squ_4 \ + Squ_Begin(4) \ + Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \ + Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \ + Squ_SaveAcc(3, 1, 3) Squ_Diag(2) \ + Squ_SaveAcc(4, 2, 3) Squ_NonDiag \ + Squ_End(4) + +#define Squ_8 \ + Squ_Begin(8) \ + Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \ + Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \ + Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \ + Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \ + Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \ + Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \ + Squ_SaveAcc(7, 1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \ + Squ_SaveAcc(8, 2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \ + Squ_SaveAcc(9, 3, 7) Squ_Acc(4, 6) Squ_Diag(5) \ + Squ_SaveAcc(10, 4, 7) Squ_Acc(5, 6) Squ_NonDiag \ + Squ_SaveAcc(11, 5, 7) Squ_Diag(6) \ + Squ_SaveAcc(12, 6, 7) Squ_NonDiag \ + Squ_End(8) + +#define Squ_16 \ + Squ_Begin(16) \ + Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \ + Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \ + Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \ + Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \ + Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \ + Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \ + Squ_SaveAcc(7, 0, 8) Squ_Acc(1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \ + Squ_SaveAcc(8, 0, 9) Squ_Acc(1, 8) Squ_Acc(2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \ + Squ_SaveAcc(9, 0, 10) Squ_Acc(1, 9) Squ_Acc(2, 8) Squ_Acc(3, 7) Squ_Acc(4, 6) Squ_Diag(5) \ + Squ_SaveAcc(10, 0, 11) Squ_Acc(1, 10) Squ_Acc(2, 9) 
Squ_Acc(3, 8) Squ_Acc(4, 7) Squ_Acc(5, 6) Squ_NonDiag \ + Squ_SaveAcc(11, 0, 12) Squ_Acc(1, 11) Squ_Acc(2, 10) Squ_Acc(3, 9) Squ_Acc(4, 8) Squ_Acc(5, 7) Squ_Diag(6) \ + Squ_SaveAcc(12, 0, 13) Squ_Acc(1, 12) Squ_Acc(2, 11) Squ_Acc(3, 10) Squ_Acc(4, 9) Squ_Acc(5, 8) Squ_Acc(6, 7) Squ_NonDiag \ + Squ_SaveAcc(13, 0, 14) Squ_Acc(1, 13) Squ_Acc(2, 12) Squ_Acc(3, 11) Squ_Acc(4, 10) Squ_Acc(5, 9) Squ_Acc(6, 8) Squ_Diag(7) \ + Squ_SaveAcc(14, 0, 15) Squ_Acc(1, 14) Squ_Acc(2, 13) Squ_Acc(3, 12) Squ_Acc(4, 11) Squ_Acc(5, 10) Squ_Acc(6, 9) Squ_Acc(7, 8) Squ_NonDiag \ + Squ_SaveAcc(15, 1, 15) Squ_Acc(2, 14) Squ_Acc(3, 13) Squ_Acc(4, 12) Squ_Acc(5, 11) Squ_Acc(6, 10) Squ_Acc(7, 9) Squ_Diag(8) \ + Squ_SaveAcc(16, 2, 15) Squ_Acc(3, 14) Squ_Acc(4, 13) Squ_Acc(5, 12) Squ_Acc(6, 11) Squ_Acc(7, 10) Squ_Acc(8, 9) Squ_NonDiag \ + Squ_SaveAcc(17, 3, 15) Squ_Acc(4, 14) Squ_Acc(5, 13) Squ_Acc(6, 12) Squ_Acc(7, 11) Squ_Acc(8, 10) Squ_Diag(9) \ + Squ_SaveAcc(18, 4, 15) Squ_Acc(5, 14) Squ_Acc(6, 13) Squ_Acc(7, 12) Squ_Acc(8, 11) Squ_Acc(9, 10) Squ_NonDiag \ + Squ_SaveAcc(19, 5, 15) Squ_Acc(6, 14) Squ_Acc(7, 13) Squ_Acc(8, 12) Squ_Acc(9, 11) Squ_Diag(10) \ + Squ_SaveAcc(20, 6, 15) Squ_Acc(7, 14) Squ_Acc(8, 13) Squ_Acc(9, 12) Squ_Acc(10, 11) Squ_NonDiag \ + Squ_SaveAcc(21, 7, 15) Squ_Acc(8, 14) Squ_Acc(9, 13) Squ_Acc(10, 12) Squ_Diag(11) \ + Squ_SaveAcc(22, 8, 15) Squ_Acc(9, 14) Squ_Acc(10, 13) Squ_Acc(11, 12) Squ_NonDiag \ + Squ_SaveAcc(23, 9, 15) Squ_Acc(10, 14) Squ_Acc(11, 13) Squ_Diag(12) \ + Squ_SaveAcc(24, 10, 15) Squ_Acc(11, 14) Squ_Acc(12, 13) Squ_NonDiag \ + Squ_SaveAcc(25, 11, 15) Squ_Acc(12, 14) Squ_Diag(13) \ + Squ_SaveAcc(26, 12, 15) Squ_Acc(13, 14) Squ_NonDiag \ + Squ_SaveAcc(27, 13, 15) Squ_Diag(14) \ + Squ_SaveAcc(28, 14, 15) Squ_NonDiag \ + Squ_End(16) + +#define Bot_2 \ + Mul_Begin(2) \ + Bot_SaveAcc(0, 0, 1) Bot_Acc(1, 0) \ + Bot_End(2) + +#define Bot_4 \ + Mul_Begin(4) \ + Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \ + Mul_SaveAcc(1, 2, 0) Mul_Acc(1, 1) Mul_Acc(0, 2) \ + 
Bot_SaveAcc(2, 0, 3) Bot_Acc(1, 2) Bot_Acc(2, 1) Bot_Acc(3, 0) \ + Bot_End(4) + +#define Bot_8 \ + Mul_Begin(8) \ + Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \ + Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \ + Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \ + Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \ + Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \ + Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \ + Bot_SaveAcc(6, 0, 7) Bot_Acc(1, 6) Bot_Acc(2, 5) Bot_Acc(3, 4) Bot_Acc(4, 3) Bot_Acc(5, 2) Bot_Acc(6, 1) Bot_Acc(7, 0) \ + Bot_End(8) + +#define Bot_16 \ + Mul_Begin(16) \ + Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \ + Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \ + Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \ + Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \ + Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \ + Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \ + Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \ + Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \ + Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \ + Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \ + Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \ + Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) 
Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \ + Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \ + Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \ + Bot_SaveAcc(14, 0, 15) Bot_Acc(1, 14) Bot_Acc(2, 13) Bot_Acc(3, 12) Bot_Acc(4, 11) Bot_Acc(5, 10) Bot_Acc(6, 9) Bot_Acc(7, 8) Bot_Acc(8, 7) Bot_Acc(9, 6) Bot_Acc(10, 5) Bot_Acc(11, 4) Bot_Acc(12, 3) Bot_Acc(13, 2) Bot_Acc(14, 1) Bot_Acc(15, 0) \ + Bot_End(16) + +#define Mul_Begin(n) \ + Declare2Words(p) \ + Declare2Words(c) \ + Declare2Words(d) \ + MultiplyWords(p, A[0], B[0]) \ + AssignWord(c, LowWord(p)) \ + AssignWord(d, HighWord(p)) + +#define Mul_Acc(i, j) \ + MultiplyWords(p, A[i], B[j]) \ + Acc2WordsBy1(c, LowWord(p)) \ + Acc2WordsBy1(d, HighWord(p)) + +#define Mul_SaveAcc(k, i, j) \ + R[k] = LowWord(c); \ + Add2WordsBy1(c, d, HighWord(c)) \ + MultiplyWords(p, A[i], B[j]) \ + AssignWord(d, HighWord(p)) \ + Acc2WordsBy1(c, LowWord(p)) + +#define Mul_End(n) \ + R[2*n-3] = LowWord(c); \ + Acc2WordsBy1(d, HighWord(c)) \ + MultiplyWords(p, A[n-1], B[n-1])\ + Acc2WordsBy2(d, p) \ + R[2*n-2] = LowWord(d); \ + R[2*n-1] = HighWord(d); + +#define Bot_SaveAcc(k, i, j) \ + R[k] = LowWord(c); \ + word e = LowWord(d) + HighWord(c); \ + e += A[i] * B[j]; + +#define Bot_Acc(i, j) \ + e += A[i] * B[j]; + +#define Bot_End(n) \ + R[n-1] = e; - SaveMulAcc(2, 0, 3); - MulAcc(1, 2); - MulAcc(2, 1); - MulAcc(3, 0); - - SaveMulAcc(3, 0, 4); - MulAcc(1, 3); - MulAcc(2, 2); - MulAcc(3, 1); - MulAcc(4, 0); - - SaveMulAcc(4, 0, 5); - MulAcc(1, 4); - MulAcc(2, 3); - MulAcc(3, 2); - MulAcc(4, 1); - 
MulAcc(5, 0); - - SaveMulAcc(5, 0, 6); - MulAcc(1, 5); - MulAcc(2, 4); - MulAcc(3, 3); - MulAcc(4, 2); - MulAcc(5, 1); - MulAcc(6, 0); - - SaveMulAcc(6, 0, 7); - MulAcc(1, 6); - MulAcc(2, 5); - MulAcc(3, 4); - MulAcc(4, 3); - MulAcc(5, 2); - MulAcc(6, 1); - MulAcc(7, 0); - - SaveMulAcc(7, 1, 7); - MulAcc(2, 6); - MulAcc(3, 5); - MulAcc(4, 4); - MulAcc(5, 3); - MulAcc(6, 2); - MulAcc(7, 1); - - SaveMulAcc(8, 2, 7); - MulAcc(3, 6); - MulAcc(4, 5); - MulAcc(5, 4); - MulAcc(6, 3); - MulAcc(7, 2); - - SaveMulAcc(9, 3, 7); - MulAcc(4, 6); - MulAcc(5, 5); - MulAcc(6, 4); - MulAcc(7, 3); +/* +// this is slower on MSVC 2005 Win32 +#define Mul_Begin(n) \ + Declare2Words(p) \ + word c; \ + Declare2Words(d) \ + MultiplyWords(p, A[0], B[0]) \ + c = LowWord(p); \ + AssignWord(d, HighWord(p)) + +#define Mul_Acc(i, j) \ + MultiplyWords(p, A[i], B[j]) \ + Acc2WordsBy1(p, c) \ + c = LowWord(p); \ + Acc2WordsBy1(d, HighWord(p)) + +#define Mul_SaveAcc(k, i, j) \ + R[k] = c; \ + MultiplyWords(p, A[i], B[j]) \ + Acc2WordsBy1(p, LowWord(d)) \ + c = LowWord(p); \ + AssignWord(d, HighWord(d)) \ + Acc2WordsBy1(d, HighWord(p)) + +#define Mul_End(n) \ + R[2*n-3] = c; \ + MultiplyWords(p, A[n-1], B[n-1])\ + Acc2WordsBy2(d, p) \ + R[2*n-2] = LowWord(d); \ + R[2*n-1] = HighWord(d); + +#define Bot_SaveAcc(k, i, j) \ + R[k] = c; \ + c = LowWord(d); \ + c += A[i] * B[j]; + +#define Bot_Acc(i, j) \ + c += A[i] * B[j]; + +#define Bot_End(n) \ + R[n-1] = c; +*/ - SaveMulAcc(10, 4, 7); - MulAcc(5, 6); - MulAcc(6, 5); - MulAcc(7, 4); - - SaveMulAcc(11, 5, 7); - MulAcc(6, 6); - MulAcc(7, 5); - - SaveMulAcc(12, 6, 7); - MulAcc(7, 6); - - R[13] = c; - p = DWord::MultiplyAndAdd(A[7], B[7], d); - R[14] = p.GetLowHalf(); - R[15] = e + p.GetHighHalf(); -} - -void Portable::Multiply4Bottom(word *R, const word *A, const word *B) -{ - DWord p; - word c, d, e; - - p = DWord::Multiply(A[0], B[0]); - R[0] = p.GetLowHalf(); - c = p.GetHighHalf(); - d = e = 0; - - MulAcc(0, 1); - MulAcc(1, 0); - - SaveMulAcc(1, 2, 0); 
- MulAcc(1, 1); - MulAcc(0, 2); - - R[2] = c; - R[3] = d + A[0] * B[3] + A[1] * B[2] + A[2] * B[1] + A[3] * B[0]; -} - -void Portable::Multiply8Bottom(word *R, const word *A, const word *B) -{ - DWord p; - word c, d, e; - - p = DWord::Multiply(A[0], B[0]); - R[0] = p.GetLowHalf(); - c = p.GetHighHalf(); - d = e = 0; - - MulAcc(0, 1); - MulAcc(1, 0); - - SaveMulAcc(1, 2, 0); - MulAcc(1, 1); - MulAcc(0, 2); - - SaveMulAcc(2, 0, 3); - MulAcc(1, 2); - MulAcc(2, 1); - MulAcc(3, 0); - - SaveMulAcc(3, 0, 4); - MulAcc(1, 3); - MulAcc(2, 2); - MulAcc(3, 1); - MulAcc(4, 0); - - SaveMulAcc(4, 0, 5); - MulAcc(1, 4); - MulAcc(2, 3); - MulAcc(3, 2); - MulAcc(4, 1); - MulAcc(5, 0); - - SaveMulAcc(5, 0, 6); - MulAcc(1, 5); - MulAcc(2, 4); - MulAcc(3, 3); - MulAcc(4, 2); - MulAcc(5, 1); - MulAcc(6, 0); +#define Squ_Begin(n) \ + Declare2Words(p) \ + Declare2Words(c) \ + Declare2Words(d) \ + Declare2Words(e) \ + MultiplyWords(p, A[0], A[0]) \ + R[0] = LowWord(p); \ + AssignWord(e, HighWord(p)) \ + MultiplyWords(p, A[0], A[1]) \ + AssignWord(c, LowWord(p)) \ + AssignWord(d, HighWord(p)) \ + Squ_NonDiag \ - R[6] = c; - R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] + - A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0]; -} +#define Squ_NonDiag \ + Double2Words(c) \ + Double2Words(d) \ -#undef MulAcc -#undef SaveMulAcc -#undef SquAcc -#undef SaveSquAcc +#define Squ_SaveAcc(k, i, j) \ + Acc2WordsBy2(c, e) \ + R[k] = LowWord(c); \ + Add2WordsBy1(e, d, HighWord(c)) \ + MultiplyWords(p, A[i], A[j]) \ + AssignWord(c, LowWord(p)) \ + AssignWord(d, HighWord(p)) \ -#ifdef CRYPTOPP_X86ASM_AVAILABLE +#define Squ_Acc(i, j) \ + MultiplyWords(p, A[i], A[j]) \ + Acc2WordsBy1(c, LowWord(p)) \ + Acc2WordsBy1(d, HighWord(p)) -// ************** x86 feature detection *************** +#define Squ_Diag(i) \ + Squ_NonDiag \ + MultiplyWords(p, A[i], A[i]) \ + Acc2WordsBy1(c, LowWord(p)) \ + Acc2WordsBy1(d, HighWord(p)) \ -static bool s_sse2Enabled = true; +#define Squ_End(n) \ + 
Acc2WordsBy2(c, e) \ + R[2*n-3] = LowWord(c); \ + Acc2WordsBy1(d, HighWord(c)) \ + MultiplyWords(p, A[n-1], A[n-1])\ + Acc2WordsBy2(d, p) \ + R[2*n-2] = LowWord(d); \ + R[2*n-1] = HighWord(d); -static void CpuId(word32 input, word32 *output) +void Baseline_Multiply2(word *R, const word *A, const word *B) { -#ifdef __GNUC__ - __asm__ - ( - // save ebx in case -fPIC is being used - "push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx" - : "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d" (output[3]) - : "a" (input) - ); -#else - __asm - { - mov eax, input - cpuid - mov edi, output - mov [edi], eax - mov [edi+4], ebx - mov [edi+8], ecx - mov [edi+12], edx - } -#endif + Mul_2 } -#ifdef SSE2_INTRINSICS_AVAILABLE -#ifndef _MSC_VER -static jmp_buf s_env; -static void SigIllHandler(int) +void Baseline_Multiply4(word *R, const word *A, const word *B) { - longjmp(s_env, 1); + Mul_4 } -#endif -static bool HasSSE2() +void Baseline_Multiply8(word *R, const word *A, const word *B) { - if (!s_sse2Enabled) - return false; - - word32 cpuid[4]; - CpuId(1, cpuid); - if ((cpuid[3] & (1 << 26)) == 0) - return false; - -#ifdef _MSC_VER - __try - { - __asm xorpd xmm0, xmm0 // executing SSE2 instruction - } - __except (1) - { - return false; - } - return true; -#else - typedef void (*SigHandler)(int); + Mul_8 +} - SigHandler oldHandler = signal(SIGILL, SigIllHandler); - if (oldHandler == SIG_ERR) - return false; +void Baseline_Square2(word *R, const word *A) +{ + Squ_2 +} - bool result = true; - if (setjmp(s_env)) - result = false; - else - __asm __volatile ("xorps %xmm0, %xmm0"); +void Baseline_Square4(word *R, const word *A) +{ + Squ_4 +} - signal(SIGILL, oldHandler); - return result; -#endif +void Baseline_Square8(word *R, const word *A) +{ + Squ_8 } -#endif -static bool IsP4() +void Baseline_MultiplyBottom2(word *R, const word *A, const word *B) { - word32 cpuid[4]; + Bot_2 +} - CpuId(0, cpuid); - std::swap(cpuid[2], cpuid[3]); - if (memcmp(cpuid+1, "GenuineIntel", 12) != 0) - 
return false; +void Baseline_MultiplyBottom4(word *R, const word *A, const word *B) +{ + Bot_4 +} - CpuId(1, cpuid); - return ((cpuid[0] >> 8) & 0xf) == 0xf; +void Baseline_MultiplyBottom8(word *R, const word *A, const word *B) +{ + Bot_8 } -// ************** Pentium/P4 optimizations *************** +/* +void Baseline_Multiply16(word *R, const word *A, const word *B) +{ + Mul_16 +} -class PentiumOptimized : public Portable +void Baseline_Square16(word *R, const word *A) { -public: - static int Add(word *C, const word *A, const word *B, size_t N); - static int Subtract(word *C, const word *A, const word *B, size_t N); - static void Multiply4(word *C, const word *A, const word *B); - static void Multiply8(word *C, const word *A, const word *B); - static void Multiply8Bottom(word *C, const word *A, const word *B); -}; + Squ_16 +} -class P4Optimized +void Baseline_MultiplyBottom16(word *R, const word *A, const word *B) { -public: - static int Add(word *C, const word *A, const word *B, size_t N); - static int Subtract(word *C, const word *A, const word *B, size_t N); -#ifdef SSE2_INTRINSICS_AVAILABLE - static void Multiply4(word *C, const word *A, const word *B); - static void Multiply8(word *C, const word *A, const word *B); - static void Multiply8Bottom(word *C, const word *A, const word *B); -#endif -}; + Bot_16 +} +*/ -typedef int (* PAddSub)(word *C, const word *A, const word *B, size_t N); -typedef void (* PMul)(word *C, const word *A, const word *B); +// ******************************************************** + +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + +CRYPTOPP_ALIGN_DATA(16) static const word32 s_maskLow16[4] CRYPTOPP_SECTION_ALIGN16 = {0xffff,0xffff,0xffff,0xffff}; + +#undef Mul_Begin +#undef Mul_Acc +#undef Squ_Acc +#undef Squ_NonDiag +#undef Squ_Diag +#undef Squ_SaveAcc +#undef Squ_Begin +#undef Mul_SaveAcc +#undef Bot_Acc +#undef Bot_SaveAcc +#undef Bot_End +#undef Squ_End +#undef Mul_End + +#define SSE2_FinalSave(k) \ + AS2( psllq xmm5, 16) \ + AS2( paddq 
xmm4, xmm5) \ + AS2( movq QWORD PTR [ecx+8*(k)], xmm4) + +#define SSE2_SaveShift(k) \ + AS2( movq xmm0, xmm6) \ + AS2( punpckhqdq xmm6, xmm0) \ + AS2( movq xmm1, xmm7) \ + AS2( punpckhqdq xmm7, xmm1) \ + AS2( paddd xmm6, xmm0) \ + AS2( pslldq xmm6, 4) \ + AS2( paddd xmm7, xmm1) \ + AS2( paddd xmm4, xmm6) \ + AS2( pslldq xmm7, 4) \ + AS2( movq xmm6, xmm4) \ + AS2( paddd xmm5, xmm7) \ + AS2( movq xmm7, xmm5) \ + AS2( movd DWORD PTR [ecx+8*(k)], xmm4) \ + AS2( psrlq xmm6, 16) \ + AS2( paddq xmm6, xmm7) \ + AS2( punpckhqdq xmm4, xmm0) \ + AS2( punpckhqdq xmm5, xmm0) \ + AS2( movq QWORD PTR [ecx+8*(k)+2], xmm6) \ + AS2( psrlq xmm6, 3*16) \ + AS2( paddd xmm4, xmm6) \ + +#define Squ_SSE2_SaveShift(k) \ + AS2( movq xmm0, xmm6) \ + AS2( punpckhqdq xmm6, xmm0) \ + AS2( movq xmm1, xmm7) \ + AS2( punpckhqdq xmm7, xmm1) \ + AS2( paddd xmm6, xmm0) \ + AS2( pslldq xmm6, 4) \ + AS2( paddd xmm7, xmm1) \ + AS2( paddd xmm4, xmm6) \ + AS2( pslldq xmm7, 4) \ + AS2( movhlps xmm6, xmm4) \ + AS2( movd DWORD PTR [ecx+8*(k)], xmm4) \ + AS2( paddd xmm5, xmm7) \ + AS2( movhps QWORD PTR [esp+12], xmm5)\ + AS2( psrlq xmm4, 16) \ + AS2( paddq xmm4, xmm5) \ + AS2( movq QWORD PTR [ecx+8*(k)+2], xmm4) \ + AS2( psrlq xmm4, 3*16) \ + AS2( paddd xmm4, xmm6) \ + AS2( movq QWORD PTR [esp+4], xmm4)\ + +#define SSE2_FirstMultiply(i) \ + AS2( movdqa xmm7, [esi+(i)*16])\ + AS2( movdqa xmm5, [edi-(i)*16])\ + AS2( pmuludq xmm5, xmm7) \ + AS2( movdqa xmm4, [ebx])\ + AS2( movdqa xmm6, xmm4) \ + AS2( pand xmm4, xmm5) \ + AS2( psrld xmm5, 16) \ + AS2( pmuludq xmm7, [edx-(i)*16])\ + AS2( pand xmm6, xmm7) \ + AS2( psrld xmm7, 16) + +#define Squ_Begin(n) \ + SquPrologue \ + AS2( mov esi, esp)\ + AS2( and esp, 0xfffffff0)\ + AS2( lea edi, [esp-32*n])\ + AS2( sub esp, 32*n+16)\ + AS1( push esi)\ + AS2( mov esi, edi) \ + AS2( xor edx, edx) \ + ASL(1) \ + ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \ + ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \ + AS2( movdqa [edi+2*edx], xmm0) \ + AS2( psrlq xmm0, 32) \ + AS2( movdqa 
[edi+2*edx+16], xmm0) \ + AS2( movdqa [edi+16*n+2*edx], xmm1) \ + AS2( psrlq xmm1, 32) \ + AS2( movdqa [edi+16*n+2*edx+16], xmm1) \ + AS2( add edx, 16) \ + AS2( cmp edx, 8*(n)) \ + ASJ( jne, 1, b) \ + AS2( lea edx, [edi+16*n])\ + SSE2_FirstMultiply(0) \ + +#define Squ_Acc(i) \ + ASL(LSqu##i) \ + AS2( movdqa xmm1, [esi+(i)*16]) \ + AS2( movdqa xmm0, [edi-(i)*16]) \ + AS2( movdqa xmm2, [ebx]) \ + AS2( pmuludq xmm0, xmm1) \ + AS2( pmuludq xmm1, [edx-(i)*16]) \ + AS2( movdqa xmm3, xmm2) \ + AS2( pand xmm2, xmm0) \ + AS2( psrld xmm0, 16) \ + AS2( paddd xmm4, xmm2) \ + AS2( paddd xmm5, xmm0) \ + AS2( pand xmm3, xmm1) \ + AS2( psrld xmm1, 16) \ + AS2( paddd xmm6, xmm3) \ + AS2( paddd xmm7, xmm1) \ + +#define Squ_Acc1(i) +#define Squ_Acc2(i) ASC(call, LSqu##i) +#define Squ_Acc3(i) Squ_Acc2(i) +#define Squ_Acc4(i) Squ_Acc2(i) +#define Squ_Acc5(i) Squ_Acc2(i) +#define Squ_Acc6(i) Squ_Acc2(i) +#define Squ_Acc7(i) Squ_Acc2(i) +#define Squ_Acc8(i) Squ_Acc2(i) + +#define SSE2_End(E, n) \ + SSE2_SaveShift(2*(n)-3) \ + AS2( movdqa xmm7, [esi+16]) \ + AS2( movdqa xmm0, [edi]) \ + AS2( pmuludq xmm0, xmm7) \ + AS2( movdqa xmm2, [ebx]) \ + AS2( pmuludq xmm7, [edx]) \ + AS2( movdqa xmm6, xmm2) \ + AS2( pand xmm2, xmm0) \ + AS2( psrld xmm0, 16) \ + AS2( paddd xmm4, xmm2) \ + AS2( paddd xmm5, xmm0) \ + AS2( pand xmm6, xmm7) \ + AS2( psrld xmm7, 16) \ + SSE2_SaveShift(2*(n)-2) \ + SSE2_FinalSave(2*(n)-1) \ + AS1( pop esp)\ + E + +#define Squ_End(n) SSE2_End(SquEpilogue, n) +#define Mul_End(n) SSE2_End(MulEpilogue, n) +#define Top_End(n) SSE2_End(TopEpilogue, n) + +#define Squ_Column1(k, i) \ + Squ_SSE2_SaveShift(k) \ + AS2( add esi, 16) \ + SSE2_FirstMultiply(1)\ + Squ_Acc##i(i) \ + AS2( paddd xmm4, xmm4) \ + AS2( paddd xmm5, xmm5) \ + AS2( movdqa xmm3, [esi]) \ + AS2( movq xmm1, QWORD PTR [esi+8]) \ + AS2( pmuludq xmm1, xmm3) \ + AS2( pmuludq xmm3, xmm3) \ + AS2( movdqa xmm0, [ebx])\ + AS2( movdqa xmm2, xmm0) \ + AS2( pand xmm0, xmm1) \ + AS2( psrld xmm1, 16) \ + AS2( paddd xmm6, xmm0) \ 
+ AS2( paddd xmm7, xmm1) \ + AS2( pand xmm2, xmm3) \ + AS2( psrld xmm3, 16) \ + AS2( paddd xmm6, xmm6) \ + AS2( paddd xmm7, xmm7) \ + AS2( paddd xmm4, xmm2) \ + AS2( paddd xmm5, xmm3) \ + AS2( movq xmm0, QWORD PTR [esp+4])\ + AS2( movq xmm1, QWORD PTR [esp+12])\ + AS2( paddd xmm4, xmm0)\ + AS2( paddd xmm5, xmm1)\ + +#define Squ_Column0(k, i) \ + Squ_SSE2_SaveShift(k) \ + AS2( add edi, 16) \ + AS2( add edx, 16) \ + SSE2_FirstMultiply(1)\ + Squ_Acc##i(i) \ + AS2( paddd xmm6, xmm6) \ + AS2( paddd xmm7, xmm7) \ + AS2( paddd xmm4, xmm4) \ + AS2( paddd xmm5, xmm5) \ + AS2( movq xmm0, QWORD PTR [esp+4])\ + AS2( movq xmm1, QWORD PTR [esp+12])\ + AS2( paddd xmm4, xmm0)\ + AS2( paddd xmm5, xmm1)\ + +#define SSE2_MulAdd45 \ + AS2( movdqa xmm7, [esi]) \ + AS2( movdqa xmm0, [edi]) \ + AS2( pmuludq xmm0, xmm7) \ + AS2( movdqa xmm2, [ebx]) \ + AS2( pmuludq xmm7, [edx]) \ + AS2( movdqa xmm6, xmm2) \ + AS2( pand xmm2, xmm0) \ + AS2( psrld xmm0, 16) \ + AS2( paddd xmm4, xmm2) \ + AS2( paddd xmm5, xmm0) \ + AS2( pand xmm6, xmm7) \ + AS2( psrld xmm7, 16) + +#define Mul_Begin(n) \ + MulPrologue \ + AS2( mov esi, esp)\ + AS2( and esp, 0xfffffff0)\ + AS2( sub esp, 48*n+16)\ + AS1( push esi)\ + AS2( xor edx, edx) \ + ASL(1) \ + ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \ + ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \ + ASS( pshufd xmm2, [edi+edx], 3,1,2,0) \ + AS2( movdqa [esp+20+2*edx], xmm0) \ + AS2( psrlq xmm0, 32) \ + AS2( movdqa [esp+20+2*edx+16], xmm0) \ + AS2( movdqa [esp+20+16*n+2*edx], xmm1) \ + AS2( psrlq xmm1, 32) \ + AS2( movdqa [esp+20+16*n+2*edx+16], xmm1) \ + AS2( movdqa [esp+20+32*n+2*edx], xmm2) \ + AS2( psrlq xmm2, 32) \ + AS2( movdqa [esp+20+32*n+2*edx+16], xmm2) \ + AS2( add edx, 16) \ + AS2( cmp edx, 8*(n)) \ + ASJ( jne, 1, b) \ + AS2( lea edi, [esp+20])\ + AS2( lea edx, [esp+20+16*n])\ + AS2( lea esi, [esp+20+32*n])\ + SSE2_FirstMultiply(0) \ + +#define Mul_Acc(i) \ + ASL(LMul##i) \ + AS2( movdqa xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16]) \ + AS2( movdqa xmm0, 
[edi-i/2*(1-(i-2*(i/2))*2)*16]) \ + AS2( movdqa xmm2, [ebx]) \ + AS2( pmuludq xmm0, xmm1) \ + AS2( pmuludq xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \ + AS2( movdqa xmm3, xmm2) \ + AS2( pand xmm2, xmm0) \ + AS2( psrld xmm0, 16) \ + AS2( paddd xmm4, xmm2) \ + AS2( paddd xmm5, xmm0) \ + AS2( pand xmm3, xmm1) \ + AS2( psrld xmm1, 16) \ + AS2( paddd xmm6, xmm3) \ + AS2( paddd xmm7, xmm1) \ + +#define Mul_Acc1(i) +#define Mul_Acc2(i) ASC(call, LMul##i) +#define Mul_Acc3(i) Mul_Acc2(i) +#define Mul_Acc4(i) Mul_Acc2(i) +#define Mul_Acc5(i) Mul_Acc2(i) +#define Mul_Acc6(i) Mul_Acc2(i) +#define Mul_Acc7(i) Mul_Acc2(i) +#define Mul_Acc8(i) Mul_Acc2(i) +#define Mul_Acc9(i) Mul_Acc2(i) +#define Mul_Acc10(i) Mul_Acc2(i) +#define Mul_Acc11(i) Mul_Acc2(i) +#define Mul_Acc12(i) Mul_Acc2(i) +#define Mul_Acc13(i) Mul_Acc2(i) +#define Mul_Acc14(i) Mul_Acc2(i) +#define Mul_Acc15(i) Mul_Acc2(i) +#define Mul_Acc16(i) Mul_Acc2(i) + +#define Mul_Column1(k, i) \ + SSE2_SaveShift(k) \ + AS2( add esi, 16) \ + SSE2_MulAdd45\ + Mul_Acc##i(i) \ + +#define Mul_Column0(k, i) \ + SSE2_SaveShift(k) \ + AS2( add edi, 16) \ + AS2( add edx, 16) \ + SSE2_MulAdd45\ + Mul_Acc##i(i) \ + +#define Bot_Acc(i) \ + AS2( movdqa xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16]) \ + AS2( movdqa xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16]) \ + AS2( pmuludq xmm0, xmm1) \ + AS2( pmuludq xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \ + AS2( paddq xmm4, xmm0) \ + AS2( paddd xmm6, xmm1) + +#define Bot_SaveAcc(k) \ + SSE2_SaveShift(k) \ + AS2( add edi, 16) \ + AS2( add edx, 16) \ + AS2( movdqa xmm6, [esi]) \ + AS2( movdqa xmm0, [edi]) \ + AS2( pmuludq xmm0, xmm6) \ + AS2( paddq xmm4, xmm0) \ + AS2( psllq xmm5, 16) \ + AS2( paddq xmm4, xmm5) \ + AS2( pmuludq xmm6, [edx]) + +#define Bot_End(n) \ + AS2( movhlps xmm7, xmm6) \ + AS2( paddd xmm6, xmm7) \ + AS2( psllq xmm6, 32) \ + AS2( paddd xmm4, xmm6) \ + AS2( movq QWORD PTR [ecx+8*((n)-1)], xmm4) \ + AS1( pop esp)\ + MulEpilogue -static PAddSub s_pAdd, s_pSub; -#ifdef SSE2_INTRINSICS_AVAILABLE -static 
PMul s_pMul4, s_pMul8, s_pMul8B; +#define Top_Begin(n) \ + TopPrologue \ + AS2( mov edx, esp)\ + AS2( and esp, 0xfffffff0)\ + AS2( sub esp, 48*n+16)\ + AS1( push edx)\ + AS2( xor edx, edx) \ + ASL(1) \ + ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \ + ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \ + ASS( pshufd xmm2, [edi+edx], 3,1,2,0) \ + AS2( movdqa [esp+20+2*edx], xmm0) \ + AS2( psrlq xmm0, 32) \ + AS2( movdqa [esp+20+2*edx+16], xmm0) \ + AS2( movdqa [esp+20+16*n+2*edx], xmm1) \ + AS2( psrlq xmm1, 32) \ + AS2( movdqa [esp+20+16*n+2*edx+16], xmm1) \ + AS2( movdqa [esp+20+32*n+2*edx], xmm2) \ + AS2( psrlq xmm2, 32) \ + AS2( movdqa [esp+20+32*n+2*edx+16], xmm2) \ + AS2( add edx, 16) \ + AS2( cmp edx, 8*(n)) \ + ASJ( jne, 1, b) \ + AS2( mov eax, esi) \ + AS2( lea edi, [esp+20+00*n+16*(n/2-1)])\ + AS2( lea edx, [esp+20+16*n+16*(n/2-1)])\ + AS2( lea esi, [esp+20+32*n+16*(n/2-1)])\ + AS2( pxor xmm4, xmm4)\ + AS2( pxor xmm5, xmm5) + +#define Top_Acc(i) \ + AS2( movq xmm0, QWORD PTR [esi+i/2*(1-(i-2*(i/2))*2)*16+8]) \ + AS2( pmuludq xmm0, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \ + AS2( psrlq xmm0, 48) \ + AS2( paddd xmm5, xmm0)\ + +#define Top_Column0(i) \ + AS2( psllq xmm5, 32) \ + AS2( add edi, 16) \ + AS2( add edx, 16) \ + SSE2_MulAdd45\ + Mul_Acc##i(i) \ + +#define Top_Column1(i) \ + SSE2_SaveShift(0) \ + AS2( add esi, 16) \ + SSE2_MulAdd45\ + Mul_Acc##i(i) \ + AS2( shr eax, 16) \ + AS2( movd xmm0, eax)\ + AS2( movd xmm1, [ecx+4])\ + AS2( psrld xmm1, 16)\ + AS2( pcmpgtd xmm1, xmm0)\ + AS2( psrld xmm1, 31)\ + AS2( paddd xmm4, xmm1)\ + +void SSE2_Square4(word *C, const word *A) +{ + Squ_Begin(2) + Squ_Column0(0, 1) + Squ_End(2) +} + +void SSE2_Square8(word *C, const word *A) +{ + Squ_Begin(4) +#ifndef __GNUC__ + ASJ( jmp, 0, f) + Squ_Acc(2) + AS1( ret) ASL(0) #endif + Squ_Column0(0, 1) + Squ_Column1(1, 1) + Squ_Column0(2, 2) + Squ_Column1(3, 1) + Squ_Column0(4, 1) + Squ_End(4) +} -static void SetPentiumFunctionPointers() +void SSE2_Square16(word *C, const word *A) { - if (IsP4()) - { - 
s_pAdd = &P4Optimized::Add; - s_pSub = &P4Optimized::Subtract; - } - else - { - s_pAdd = &PentiumOptimized::Add; - s_pSub = &PentiumOptimized::Subtract; - } - -#ifdef SSE2_INTRINSICS_AVAILABLE - if (HasSSE2()) - { - s_pMul4 = &P4Optimized::Multiply4; - s_pMul8 = &P4Optimized::Multiply8; - s_pMul8B = &P4Optimized::Multiply8Bottom; - } - else - { - s_pMul4 = &PentiumOptimized::Multiply4; - s_pMul8 = &PentiumOptimized::Multiply8; - s_pMul8B = &PentiumOptimized::Multiply8Bottom; - } + Squ_Begin(8) +#ifndef __GNUC__ + ASJ( jmp, 0, f) + Squ_Acc(4) Squ_Acc(3) Squ_Acc(2) + AS1( ret) ASL(0) +#endif + Squ_Column0(0, 1) + Squ_Column1(1, 1) + Squ_Column0(2, 2) + Squ_Column1(3, 2) + Squ_Column0(4, 3) + Squ_Column1(5, 3) + Squ_Column0(6, 4) + Squ_Column1(7, 3) + Squ_Column0(8, 3) + Squ_Column1(9, 2) + Squ_Column0(10, 2) + Squ_Column1(11, 1) + Squ_Column0(12, 1) + Squ_End(8) +} + +void SSE2_Square32(word *C, const word *A) +{ + Squ_Begin(16) + ASJ( jmp, 0, f) + Squ_Acc(8) Squ_Acc(7) Squ_Acc(6) Squ_Acc(5) Squ_Acc(4) Squ_Acc(3) Squ_Acc(2) + AS1( ret) ASL(0) + Squ_Column0(0, 1) + Squ_Column1(1, 1) + Squ_Column0(2, 2) + Squ_Column1(3, 2) + Squ_Column0(4, 3) + Squ_Column1(5, 3) + Squ_Column0(6, 4) + Squ_Column1(7, 4) + Squ_Column0(8, 5) + Squ_Column1(9, 5) + Squ_Column0(10, 6) + Squ_Column1(11, 6) + Squ_Column0(12, 7) + Squ_Column1(13, 7) + Squ_Column0(14, 8) + Squ_Column1(15, 7) + Squ_Column0(16, 7) + Squ_Column1(17, 6) + Squ_Column0(18, 6) + Squ_Column1(19, 5) + Squ_Column0(20, 5) + Squ_Column1(21, 4) + Squ_Column0(22, 4) + Squ_Column1(23, 3) + Squ_Column0(24, 3) + Squ_Column1(25, 2) + Squ_Column0(26, 2) + Squ_Column1(27, 1) + Squ_Column0(28, 1) + Squ_End(16) +} + +void SSE2_Multiply4(word *C, const word *A, const word *B) +{ + Mul_Begin(2) +#ifndef __GNUC__ + ASJ( jmp, 0, f) + Mul_Acc(2) + AS1( ret) ASL(0) #endif + Mul_Column0(0, 2) + Mul_End(2) } -void DisableSSE2() +void SSE2_Multiply8(word *C, const word *A, const word *B) { - s_sse2Enabled = false; - 
SetPentiumFunctionPointers(); + Mul_Begin(4) +#ifndef __GNUC__ + ASJ( jmp, 0, f) + Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) + AS1( ret) ASL(0) +#endif + Mul_Column0(0, 2) + Mul_Column1(1, 3) + Mul_Column0(2, 4) + Mul_Column1(3, 3) + Mul_Column0(4, 2) + Mul_End(4) } -class LowLevel : public PentiumOptimized +void SSE2_Multiply16(word *C, const word *A, const word *B) { -public: - inline static int Add(word *C, const word *A, const word *B, size_t N) - {return s_pAdd(C, A, B, N);} - inline static int Subtract(word *C, const word *A, const word *B, size_t N) - {return s_pSub(C, A, B, N);} - inline static void Square4(word *R, const word *A) - {Multiply4(R, A, A);} -#ifdef SSE2_INTRINSICS_AVAILABLE - inline static void Multiply4(word *C, const word *A, const word *B) - {s_pMul4(C, A, B);} - inline static void Multiply8(word *C, const word *A, const word *B) - {s_pMul8(C, A, B);} - inline static void Multiply8Bottom(word *C, const word *A, const word *B) - {s_pMul8B(C, A, B);} + Mul_Begin(8) +#ifndef __GNUC__ + ASJ( jmp, 0, f) + Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) + AS1( ret) ASL(0) #endif -}; - -// use some tricks to share assembly code between MSVC and GCC -#ifdef _MSC_VER - #define CRYPTOPP_NAKED __declspec(naked) - #define AS1(x) __asm x - #define AS2(x, y) __asm x, y - #define AddPrologue \ - __asm push ebp \ - __asm push ebx \ - __asm push esi \ - __asm push edi \ - __asm mov ecx, [esp+20] \ - __asm mov edx, [esp+24] \ - __asm mov ebx, [esp+28] \ - __asm mov esi, [esp+32] - #define AddEpilogue \ - __asm pop edi \ - __asm pop esi \ - __asm pop ebx \ - __asm pop ebp \ - __asm ret - #define MulPrologue \ - __asm push ebp \ - __asm push ebx \ - __asm push esi \ - __asm push edi \ - __asm mov ecx, [esp+28] \ - __asm mov esi, [esp+24] \ - __asm push [esp+20] - #define MulEpilogue \ - __asm add esp, 4 \ - __asm pop edi \ - __asm pop esi \ - __asm pop ebx \ - __asm pop ebp \ - __asm ret -#else - #define CRYPTOPP_NAKED - #define AS1(x) #x 
";" - #define AS2(x, y) #x ", " #y ";" - #define AddPrologue \ - __asm__ __volatile__ \ - ( \ - "push %%ebx;" /* save this manually, in case of -fPIC */ \ - "mov %2, %%ebx;" \ - ".intel_syntax noprefix;" \ - "push ebp;" - #define AddEpilogue \ - "pop ebp;" \ - ".att_syntax prefix;" \ - "pop %%ebx;" \ - : \ - : "c" (C), "d" (A), "m" (B), "S" (N) \ - : "%edi", "memory", "cc" \ - ); - #define MulPrologue \ - __asm__ __volatile__ \ - ( \ - "push %%ebx;" /* save this manually, in case of -fPIC */ \ - "push %%ebp;" \ - "push %0;" \ - ".intel_syntax noprefix;" - #define MulEpilogue \ - "add esp, 4;" \ - "pop ebp;" \ - "pop ebx;" \ - ".att_syntax prefix;" \ - : \ - : "rm" (Z), "S" (X), "c" (Y) \ - : "%eax", "%edx", "%edi", "memory", "cc" \ - ); + Mul_Column0(0, 2) + Mul_Column1(1, 3) + Mul_Column0(2, 4) + Mul_Column1(3, 5) + Mul_Column0(4, 6) + Mul_Column1(5, 7) + Mul_Column0(6, 8) + Mul_Column1(7, 7) + Mul_Column0(8, 6) + Mul_Column1(9, 5) + Mul_Column0(10, 4) + Mul_Column1(11, 3) + Mul_Column0(12, 2) + Mul_End(8) +} + +void SSE2_Multiply32(word *C, const word *A, const word *B) +{ + Mul_Begin(16) + ASJ( jmp, 0, f) + Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) + AS1( ret) ASL(0) + Mul_Column0(0, 2) + Mul_Column1(1, 3) + Mul_Column0(2, 4) + Mul_Column1(3, 5) + Mul_Column0(4, 6) + Mul_Column1(5, 7) + Mul_Column0(6, 8) + Mul_Column1(7, 9) + Mul_Column0(8, 10) + Mul_Column1(9, 11) + Mul_Column0(10, 12) + Mul_Column1(11, 13) + Mul_Column0(12, 14) + Mul_Column1(13, 15) + Mul_Column0(14, 16) + Mul_Column1(15, 15) + Mul_Column0(16, 14) + Mul_Column1(17, 13) + Mul_Column0(18, 12) + Mul_Column1(19, 11) + Mul_Column0(20, 10) + Mul_Column1(21, 9) + Mul_Column0(22, 8) + Mul_Column1(23, 7) + Mul_Column0(24, 6) + Mul_Column1(25, 5) + Mul_Column0(26, 4) + Mul_Column1(27, 3) + Mul_Column0(28, 2) + Mul_End(16) +} + +void SSE2_MultiplyBottom4(word *C, const word 
*A, const word *B) +{ + Mul_Begin(2) + Bot_SaveAcc(0) Bot_Acc(2) + Bot_End(2) +} + +void SSE2_MultiplyBottom8(word *C, const word *A, const word *B) +{ + Mul_Begin(4) +#ifndef __GNUC__ + ASJ( jmp, 0, f) + Mul_Acc(3) Mul_Acc(2) + AS1( ret) ASL(0) #endif + Mul_Column0(0, 2) + Mul_Column1(1, 3) + Bot_SaveAcc(2) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2) + Bot_End(4) +} -CRYPTOPP_NAKED int PentiumOptimized::Add(word *C, const word *A, const word *B, size_t N) +void SSE2_MultiplyBottom16(word *C, const word *A, const word *B) { - AddPrologue - - // now: ebx = B, ecx = C, edx = A, esi = N - AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C - AS2( xor eax, eax) // clear eax - - AS2( sub eax, esi) // eax is a negative index from end of B - AS2( lea ebx, [ebx+4*esi]) // ebx is end of B - - AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag - AS1( jz loopendAdd) // if no dwords then nothing to do - - AS1(loopstartAdd:) - AS2( mov esi,[edx]) // load lower word of A - AS2( mov ebp,[edx+4]) // load higher word of A - - AS2( mov edi,[ebx+8*eax]) // load lower word of B - AS2( lea edx,[edx+8]) // advance A and C - - AS2( adc esi,edi) // add lower words - AS2( mov edi,[ebx+8*eax+4]) // load higher word of B - - AS2( adc ebp,edi) // add higher words - AS1( inc eax) // advance B - - AS2( mov [edx+ecx-8],esi) // store lower word result - AS2( mov [edx+ecx-4],ebp) // store higher word result - - AS1( jnz loopstartAdd) // loop until eax overflows and becomes zero - - AS1(loopendAdd:) - AS2( adc eax, 0) // store carry into eax (return result register) - - AddEpilogue + Mul_Begin(8) +#ifndef __GNUC__ + ASJ( jmp, 0, f) + Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) + AS1( ret) ASL(0) +#endif + Mul_Column0(0, 2) + Mul_Column1(1, 3) + Mul_Column0(2, 4) + Mul_Column1(3, 5) + Mul_Column0(4, 6) + Mul_Column1(5, 7) + Bot_SaveAcc(6) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2) + Bot_End(8) 
+} + +void SSE2_MultiplyBottom32(word *C, const word *A, const word *B) +{ + Mul_Begin(16) +#ifndef __GNUC__ + ASJ( jmp, 0, f) + Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) + AS1( ret) ASL(0) +#endif + Mul_Column0(0, 2) + Mul_Column1(1, 3) + Mul_Column0(2, 4) + Mul_Column1(3, 5) + Mul_Column0(4, 6) + Mul_Column1(5, 7) + Mul_Column0(6, 8) + Mul_Column1(7, 9) + Mul_Column0(8, 10) + Mul_Column1(9, 11) + Mul_Column0(10, 12) + Mul_Column1(11, 13) + Mul_Column0(12, 14) + Mul_Column1(13, 15) + Bot_SaveAcc(14) Bot_Acc(16) Bot_Acc(15) Bot_Acc(14) Bot_Acc(13) Bot_Acc(12) Bot_Acc(11) Bot_Acc(10) Bot_Acc(9) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2) + Bot_End(16) +} + +void SSE2_MultiplyTop8(word *C, const word *A, const word *B, word L) +{ + Top_Begin(4) + Top_Acc(3) Top_Acc(2) Top_Acc(1) +#ifndef __GNUC__ + ASJ( jmp, 0, f) + Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) + AS1( ret) ASL(0) +#endif + Top_Column0(4) + Top_Column1(3) + Mul_Column0(0, 2) + Top_End(2) } -CRYPTOPP_NAKED int PentiumOptimized::Subtract(word *C, const word *A, const word *B, size_t N) +void SSE2_MultiplyTop16(word *C, const word *A, const word *B, word L) { - AddPrologue - - // now: ebx = B, ecx = C, edx = A, esi = N - AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C - AS2( xor eax, eax) // clear eax - - AS2( sub eax, esi) // eax is a negative index from end of B - AS2( lea ebx, [ebx+4*esi]) // ebx is end of B - - AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag - AS1( jz loopendSub) // if no dwords then nothing to do - - AS1(loopstartSub:) - AS2( mov esi,[edx]) // load lower word of A - AS2( mov ebp,[edx+4]) // load higher word of A - - AS2( mov edi,[ebx+8*eax]) // load lower word of B - AS2( lea edx,[edx+8]) // advance A and C - - AS2( sbb esi,edi) // subtract lower words - AS2( mov 
edi,[ebx+8*eax+4]) // load higher word of B - - AS2( sbb ebp,edi) // subtract higher words - AS1( inc eax) // advance B - - AS2( mov [edx+ecx-8],esi) // store lower word result - AS2( mov [edx+ecx-4],ebp) // store higher word result + Top_Begin(8) + Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1) +#ifndef __GNUC__ + ASJ( jmp, 0, f) + Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) + AS1( ret) ASL(0) +#endif + Top_Column0(8) + Top_Column1(7) + Mul_Column0(0, 6) + Mul_Column1(1, 5) + Mul_Column0(2, 4) + Mul_Column1(3, 3) + Mul_Column0(4, 2) + Top_End(4) +} + +void SSE2_MultiplyTop32(word *C, const word *A, const word *B, word L) +{ + Top_Begin(16) + Top_Acc(15) Top_Acc(14) Top_Acc(13) Top_Acc(12) Top_Acc(11) Top_Acc(10) Top_Acc(9) Top_Acc(8) Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1) +#ifndef __GNUC__ + ASJ( jmp, 0, f) + Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) + AS1( ret) ASL(0) +#endif + Top_Column0(16) + Top_Column1(15) + Mul_Column0(0, 14) + Mul_Column1(1, 13) + Mul_Column0(2, 12) + Mul_Column1(3, 11) + Mul_Column0(4, 10) + Mul_Column1(5, 9) + Mul_Column0(6, 8) + Mul_Column1(7, 7) + Mul_Column0(8, 6) + Mul_Column1(9, 5) + Mul_Column0(10, 4) + Mul_Column1(11, 3) + Mul_Column0(12, 2) + Top_End(8) +} + +#endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE - AS1( jnz loopstartSub) // loop until eax overflows and becomes zero +// ******************************************************** - AS1(loopendSub:) - AS2( adc eax, 0) // store carry into eax (return result register) +typedef int (CRYPTOPP_FASTCALL * PAdd)(size_t N, word *C, const word *A, const word *B); +typedef void (* PMul)(word *C, const word *A, const word *B); +typedef void (* PSqu)(word *C, const word *A); +typedef void (* PMulTop)(word *C, const word *A, const word *B, word L); - AddEpilogue 
-} +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE +static PAdd s_pAdd = &Baseline_Add, s_pSub = &Baseline_Sub; +static PMulTop s_pTop[3]; +static size_t s_recursionLimit = 8; +#else +static const size_t s_recursionLimit = 8; +#endif -// On Pentium 4, the adc and sbb instructions are very expensive, so avoid them. +static PMul s_pMul[9], s_pBot[9]; +static PSqu s_pSqu[9]; -CRYPTOPP_NAKED int P4Optimized::Add(word *C, const word *A, const word *B, size_t N) +static void SetFunctionPointers() { - AddPrologue + s_pMul[0] = &Baseline_Multiply2; + s_pBot[0] = &Baseline_MultiplyBottom2; + s_pSqu[0] = &Baseline_Square2; - // now: ebx = B, ecx = C, edx = A, esi = N - AS2( xor eax, eax) - AS1( neg esi) - AS1( jz loopendAddP4) // if no dwords then nothing to do - - AS2( mov edi, [edx]) - AS2( mov ebp, [ebx]) - AS1( jmp carry1AddP4) - - AS1(loopstartAddP4:) - AS2( mov edi, [edx+8]) - AS2( add ecx, 8) - AS2( add edx, 8) - AS2( mov ebp, [ebx]) - AS2( add edi, eax) - AS1( jc carry1AddP4) - AS2( xor eax, eax) - - AS1(carry1AddP4:) - AS2( add edi, ebp) - AS2( mov ebp, 1) - AS2( mov [ecx], edi) - AS2( mov edi, [edx+4]) - AS2( cmovc eax, ebp) - AS2( mov ebp, [ebx+4]) - AS2( add ebx, 8) - AS2( add edi, eax) - AS1( jc carry2AddP4) - AS2( xor eax, eax) - - AS1(carry2AddP4:) - AS2( add edi, ebp) - AS2( mov ebp, 1) - AS2( cmovc eax, ebp) - AS2( mov [ecx+4], edi) - AS2( add esi, 2) - AS1( jnz loopstartAddP4) - - AS1(loopendAddP4:) +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + if (HasSSE2()) + { + if (IsP4()) + { + s_pAdd = &SSE2_Add; + s_pSub = &SSE2_Sub; + } - AddEpilogue -} + s_recursionLimit = 32; -CRYPTOPP_NAKED int P4Optimized::Subtract(word *C, const word *A, const word *B, size_t N) -{ - AddPrologue + s_pMul[1] = &SSE2_Multiply4; + s_pMul[2] = &SSE2_Multiply8; + s_pMul[4] = &SSE2_Multiply16; + s_pMul[8] = &SSE2_Multiply32; - // now: ebx = B, ecx = C, edx = A, esi = N - AS2( xor eax, eax) - AS1( neg esi) - AS1( jz loopendSubP4) // if no dwords then nothing to do - - AS2( mov edi, [edx]) - AS2( mov 
ebp, [ebx]) - AS1( jmp carry1SubP4) - - AS1(loopstartSubP4:) - AS2( mov edi, [edx+8]) - AS2( add edx, 8) - AS2( add ecx, 8) - AS2( mov ebp, [ebx]) - AS2( sub edi, eax) - AS1( jc carry1SubP4) - AS2( xor eax, eax) - - AS1(carry1SubP4:) - AS2( sub edi, ebp) - AS2( mov ebp, 1) - AS2( mov [ecx], edi) - AS2( mov edi, [edx+4]) - AS2( cmovc eax, ebp) - AS2( mov ebp, [ebx+4]) - AS2( add ebx, 8) - AS2( sub edi, eax) - AS1( jc carry2SubP4) - AS2( xor eax, eax) - - AS1(carry2SubP4:) - AS2( sub edi, ebp) - AS2( mov ebp, 1) - AS2( cmovc eax, ebp) - AS2( mov [ecx+4], edi) - AS2( add esi, 2) - AS1( jnz loopstartSubP4) - - AS1(loopendSubP4:) + s_pBot[1] = &SSE2_MultiplyBottom4; + s_pBot[2] = &SSE2_MultiplyBottom8; + s_pBot[4] = &SSE2_MultiplyBottom16; + s_pBot[8] = &SSE2_MultiplyBottom32; - AddEpilogue -} + s_pSqu[1] = &SSE2_Square4; + s_pSqu[2] = &SSE2_Square8; + s_pSqu[4] = &SSE2_Square16; + s_pSqu[8] = &SSE2_Square32; -// multiply assembly code originally contributed by Leonard Janke - -#define MulStartup \ - AS2(xor ebp, ebp) \ - AS2(xor edi, edi) \ - AS2(xor ebx, ebx) - -#define MulShiftCarry \ - AS2(mov ebp, edx) \ - AS2(mov edi, ebx) \ - AS2(xor ebx, ebx) - -#define MulAccumulateBottom(i,j) \ - AS2(mov eax, [ecx+4*j]) \ - AS2(imul eax, dword ptr [esi+4*i]) \ - AS2(add ebp, eax) - -#define MulAccumulate(i,j) \ - AS2(mov eax, [ecx+4*j]) \ - AS1(mul dword ptr [esi+4*i]) \ - AS2(add ebp, eax) \ - AS2(adc edi, edx) \ - AS2(adc bl, bh) - -#define MulStoreDigit(i) \ - AS2(mov edx, edi) \ - AS2(mov edi, [esp]) \ - AS2(mov [edi+4*i], ebp) - -#define MulLastDiagonal(digits) \ - AS2(mov eax, [ecx+4*(digits-1)]) \ - AS1(mul dword ptr [esi+4*(digits-1)]) \ - AS2(add ebp, eax) \ - AS2(adc edx, edi) \ - AS2(mov edi, [esp]) \ - AS2(mov [edi+4*(2*digits-2)], ebp) \ - AS2(mov [edi+4*(2*digits-1)], edx) - -CRYPTOPP_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y) -{ - MulPrologue - // now: [esp] = Z, esi = X, ecx = Y - MulStartup - MulAccumulate(0,0) - 
MulStoreDigit(0) - MulShiftCarry - - MulAccumulate(1,0) - MulAccumulate(0,1) - MulStoreDigit(1) - MulShiftCarry - - MulAccumulate(2,0) - MulAccumulate(1,1) - MulAccumulate(0,2) - MulStoreDigit(2) - MulShiftCarry - - MulAccumulate(3,0) - MulAccumulate(2,1) - MulAccumulate(1,2) - MulAccumulate(0,3) - MulStoreDigit(3) - MulShiftCarry - - MulAccumulate(3,1) - MulAccumulate(2,2) - MulAccumulate(1,3) - MulStoreDigit(4) - MulShiftCarry - - MulAccumulate(3,2) - MulAccumulate(2,3) - MulStoreDigit(5) - MulShiftCarry - - MulLastDiagonal(4) - MulEpilogue -} + s_pTop[0] = &SSE2_MultiplyTop8; + s_pTop[1] = &SSE2_MultiplyTop16; + s_pTop[2] = &SSE2_MultiplyTop32; + } + else +#endif + { + s_pMul[1] = &Baseline_Multiply4; + s_pMul[2] = &Baseline_Multiply8; +// s_pMul[4] = &Baseline_Multiply16; -CRYPTOPP_NAKED void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y) -{ - MulPrologue - // now: [esp] = Z, esi = X, ecx = Y - MulStartup - MulAccumulate(0,0) - MulStoreDigit(0) - MulShiftCarry - - MulAccumulate(1,0) - MulAccumulate(0,1) - MulStoreDigit(1) - MulShiftCarry - - MulAccumulate(2,0) - MulAccumulate(1,1) - MulAccumulate(0,2) - MulStoreDigit(2) - MulShiftCarry - - MulAccumulate(3,0) - MulAccumulate(2,1) - MulAccumulate(1,2) - MulAccumulate(0,3) - MulStoreDigit(3) - MulShiftCarry - - MulAccumulate(4,0) - MulAccumulate(3,1) - MulAccumulate(2,2) - MulAccumulate(1,3) - MulAccumulate(0,4) - MulStoreDigit(4) - MulShiftCarry - - MulAccumulate(5,0) - MulAccumulate(4,1) - MulAccumulate(3,2) - MulAccumulate(2,3) - MulAccumulate(1,4) - MulAccumulate(0,5) - MulStoreDigit(5) - MulShiftCarry - - MulAccumulate(6,0) - MulAccumulate(5,1) - MulAccumulate(4,2) - MulAccumulate(3,3) - MulAccumulate(2,4) - MulAccumulate(1,5) - MulAccumulate(0,6) - MulStoreDigit(6) - MulShiftCarry - - MulAccumulate(7,0) - MulAccumulate(6,1) - MulAccumulate(5,2) - MulAccumulate(4,3) - MulAccumulate(3,4) - MulAccumulate(2,5) - MulAccumulate(1,6) - MulAccumulate(0,7) - MulStoreDigit(7) - MulShiftCarry - - 
MulAccumulate(7,1) - MulAccumulate(6,2) - MulAccumulate(5,3) - MulAccumulate(4,4) - MulAccumulate(3,5) - MulAccumulate(2,6) - MulAccumulate(1,7) - MulStoreDigit(8) - MulShiftCarry - - MulAccumulate(7,2) - MulAccumulate(6,3) - MulAccumulate(5,4) - MulAccumulate(4,5) - MulAccumulate(3,6) - MulAccumulate(2,7) - MulStoreDigit(9) - MulShiftCarry - - MulAccumulate(7,3) - MulAccumulate(6,4) - MulAccumulate(5,5) - MulAccumulate(4,6) - MulAccumulate(3,7) - MulStoreDigit(10) - MulShiftCarry - - MulAccumulate(7,4) - MulAccumulate(6,5) - MulAccumulate(5,6) - MulAccumulate(4,7) - MulStoreDigit(11) - MulShiftCarry - - MulAccumulate(7,5) - MulAccumulate(6,6) - MulAccumulate(5,7) - MulStoreDigit(12) - MulShiftCarry - - MulAccumulate(7,6) - MulAccumulate(6,7) - MulStoreDigit(13) - MulShiftCarry - - MulLastDiagonal(8) - MulEpilogue -} + s_pBot[1] = &Baseline_MultiplyBottom4; + s_pBot[2] = &Baseline_MultiplyBottom8; +// s_pBot[4] = &Baseline_MultiplyBottom16; -CRYPTOPP_NAKED void PentiumOptimized::Multiply8Bottom(word* Z, const word* X, const word* Y) -{ - MulPrologue - // now: [esp] = Z, esi = X, ecx = Y - MulStartup - MulAccumulate(0,0) - MulStoreDigit(0) - MulShiftCarry - - MulAccumulate(1,0) - MulAccumulate(0,1) - MulStoreDigit(1) - MulShiftCarry - - MulAccumulate(2,0) - MulAccumulate(1,1) - MulAccumulate(0,2) - MulStoreDigit(2) - MulShiftCarry - - MulAccumulate(3,0) - MulAccumulate(2,1) - MulAccumulate(1,2) - MulAccumulate(0,3) - MulStoreDigit(3) - MulShiftCarry - - MulAccumulate(4,0) - MulAccumulate(3,1) - MulAccumulate(2,2) - MulAccumulate(1,3) - MulAccumulate(0,4) - MulStoreDigit(4) - MulShiftCarry - - MulAccumulate(5,0) - MulAccumulate(4,1) - MulAccumulate(3,2) - MulAccumulate(2,3) - MulAccumulate(1,4) - MulAccumulate(0,5) - MulStoreDigit(5) - MulShiftCarry - - MulAccumulate(6,0) - MulAccumulate(5,1) - MulAccumulate(4,2) - MulAccumulate(3,3) - MulAccumulate(2,4) - MulAccumulate(1,5) - MulAccumulate(0,6) - MulStoreDigit(6) - MulShiftCarry - - MulAccumulateBottom(7,0) - 
MulAccumulateBottom(6,1) - MulAccumulateBottom(5,2) - MulAccumulateBottom(4,3) - MulAccumulateBottom(3,4) - MulAccumulateBottom(2,5) - MulAccumulateBottom(1,6) - MulAccumulateBottom(0,7) - MulStoreDigit(7) - MulEpilogue + s_pSqu[1] = &Baseline_Square4; + s_pSqu[2] = &Baseline_Square8; +// s_pSqu[4] = &Baseline_Square16; + } } -#undef AS1 -#undef AS2 - -#else // not x86 - no processor specific code at this layer - -typedef Portable LowLevel; - -#endif - -#ifdef SSE2_INTRINSICS_AVAILABLE - -#ifdef __GNUC__ -#define CRYPTOPP_FASTCALL +inline int Add(word *C, const word *A, const word *B, size_t N) +{ +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + return s_pAdd(N, C, A, B); #else -#define CRYPTOPP_FASTCALL __fastcall + return Baseline_Add(N, C, A, B); #endif - -static void CRYPTOPP_FASTCALL P4_Mul(__m128i *C, const __m128i *A, const __m128i *B) -{ - __m128i a3210 = _mm_load_si128(A); - __m128i b3210 = _mm_load_si128(B); - - __m128i sum; - - __m128i z = _mm_setzero_si128(); - __m128i a2b2_a0b0 = _mm_mul_epu32(a3210, b3210); - C[0] = a2b2_a0b0; - - __m128i a3120 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(3, 1, 2, 0)); - __m128i b3021 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 2, 1)); - __m128i a1b0_a0b1 = _mm_mul_epu32(a3120, b3021); - __m128i a1b0 = _mm_unpackhi_epi32(a1b0_a0b1, z); - __m128i a0b1 = _mm_unpacklo_epi32(a1b0_a0b1, z); - C[1] = _mm_add_epi64(a1b0, a0b1); - - __m128i a31 = _mm_srli_epi64(a3210, 32); - __m128i b31 = _mm_srli_epi64(b3210, 32); - __m128i a3b3_a1b1 = _mm_mul_epu32(a31, b31); - C[6] = a3b3_a1b1; - - __m128i a1b1 = _mm_unpacklo_epi32(a3b3_a1b1, z); - __m128i b3012 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 1, 2)); - __m128i a2b0_a0b2 = _mm_mul_epu32(a3210, b3012); - __m128i a0b2 = _mm_unpacklo_epi32(a2b0_a0b2, z); - __m128i a2b0 = _mm_unpackhi_epi32(a2b0_a0b2, z); - sum = _mm_add_epi64(a1b1, a0b2); - C[2] = _mm_add_epi64(sum, a2b0); - - __m128i a2301 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(2, 3, 0, 1)); - __m128i b2103 = _mm_shuffle_epi32(b3210, 
_MM_SHUFFLE(2, 1, 0, 3)); - __m128i a3b0_a1b2 = _mm_mul_epu32(a2301, b3012); - __m128i a2b1_a0b3 = _mm_mul_epu32(a3210, b2103); - __m128i a3b0 = _mm_unpackhi_epi32(a3b0_a1b2, z); - __m128i a1b2 = _mm_unpacklo_epi32(a3b0_a1b2, z); - __m128i a2b1 = _mm_unpackhi_epi32(a2b1_a0b3, z); - __m128i a0b3 = _mm_unpacklo_epi32(a2b1_a0b3, z); - __m128i sum1 = _mm_add_epi64(a3b0, a1b2); - sum = _mm_add_epi64(a2b1, a0b3); - C[3] = _mm_add_epi64(sum, sum1); - - __m128i a3b1_a1b3 = _mm_mul_epu32(a2301, b2103); - __m128i a2b2 = _mm_unpackhi_epi32(a2b2_a0b0, z); - __m128i a3b1 = _mm_unpackhi_epi32(a3b1_a1b3, z); - __m128i a1b3 = _mm_unpacklo_epi32(a3b1_a1b3, z); - sum = _mm_add_epi64(a2b2, a3b1); - C[4] = _mm_add_epi64(sum, a1b3); - - __m128i a1302 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(1, 3, 0, 2)); - __m128i b1203 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(1, 2, 0, 3)); - __m128i a3b2_a2b3 = _mm_mul_epu32(a1302, b1203); - __m128i a3b2 = _mm_unpackhi_epi32(a3b2_a2b3, z); - __m128i a2b3 = _mm_unpacklo_epi32(a3b2_a2b3, z); - C[5] = _mm_add_epi64(a3b2, a2b3); -} - -void P4Optimized::Multiply4(word *C, const word *A, const word *B) -{ - __m128i temp[7]; - const word *w = (word *)temp; - const __m64 *mw = (__m64 *)w; - - P4_Mul(temp, (__m128i *)A, (__m128i *)B); - - C[0] = w[0]; - - __m64 s1, s2; - - __m64 w1 = _mm_cvtsi32_si64(w[1]); - __m64 w4 = mw[2]; - __m64 w6 = mw[3]; - __m64 w8 = mw[4]; - __m64 w10 = mw[5]; - __m64 w12 = mw[6]; - __m64 w14 = mw[7]; - __m64 w16 = mw[8]; - __m64 w18 = mw[9]; - __m64 w20 = mw[10]; - __m64 w22 = mw[11]; - __m64 w26 = _mm_cvtsi32_si64(w[26]); - - s1 = _mm_add_si64(w1, w4); - C[1] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w6, w8); - s1 = _mm_add_si64(s1, s2); - C[2] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w10, w12); - s1 = _mm_add_si64(s1, s2); - C[3] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w14, w16); - s1 = _mm_add_si64(s1, s2); - C[4] = 
_mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w18, w20); - s1 = _mm_add_si64(s1, s2); - C[5] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w22, w26); - s1 = _mm_add_si64(s1, s2); - C[6] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - C[7] = _mm_cvtsi64_si32(s1) + w[27]; - _mm_empty(); -} - -void P4Optimized::Multiply8(word *C, const word *A, const word *B) -{ - __m128i temp[28]; - const word *w = (word *)temp; - const __m64 *mw = (__m64 *)w; - const word *x = (word *)temp+7*4; - const __m64 *mx = (__m64 *)x; - const word *y = (word *)temp+7*4*2; - const __m64 *my = (__m64 *)y; - const word *z = (word *)temp+7*4*3; - const __m64 *mz = (__m64 *)z; - - P4_Mul(temp, (__m128i *)A, (__m128i *)B); - - P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B); - - P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1); - - P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1); - - C[0] = w[0]; - - __m64 s1, s2, s3, s4; - - __m64 w1 = _mm_cvtsi32_si64(w[1]); - __m64 w4 = mw[2]; - __m64 w6 = mw[3]; - __m64 w8 = mw[4]; - __m64 w10 = mw[5]; - __m64 w12 = mw[6]; - __m64 w14 = mw[7]; - __m64 w16 = mw[8]; - __m64 w18 = mw[9]; - __m64 w20 = mw[10]; - __m64 w22 = mw[11]; - __m64 w26 = _mm_cvtsi32_si64(w[26]); - __m64 w27 = _mm_cvtsi32_si64(w[27]); - - __m64 x0 = _mm_cvtsi32_si64(x[0]); - __m64 x1 = _mm_cvtsi32_si64(x[1]); - __m64 x4 = mx[2]; - __m64 x6 = mx[3]; - __m64 x8 = mx[4]; - __m64 x10 = mx[5]; - __m64 x12 = mx[6]; - __m64 x14 = mx[7]; - __m64 x16 = mx[8]; - __m64 x18 = mx[9]; - __m64 x20 = mx[10]; - __m64 x22 = mx[11]; - __m64 x26 = _mm_cvtsi32_si64(x[26]); - __m64 x27 = _mm_cvtsi32_si64(x[27]); - - __m64 y0 = _mm_cvtsi32_si64(y[0]); - __m64 y1 = _mm_cvtsi32_si64(y[1]); - __m64 y4 = my[2]; - __m64 y6 = my[3]; - __m64 y8 = my[4]; - __m64 y10 = my[5]; - __m64 y12 = my[6]; - __m64 y14 = my[7]; - __m64 y16 = my[8]; - __m64 y18 = my[9]; - __m64 y20 = my[10]; - __m64 y22 = my[11]; - __m64 y26 = _mm_cvtsi32_si64(y[26]); - __m64 y27 = 
_mm_cvtsi32_si64(y[27]); - - __m64 z0 = _mm_cvtsi32_si64(z[0]); - __m64 z1 = _mm_cvtsi32_si64(z[1]); - __m64 z4 = mz[2]; - __m64 z6 = mz[3]; - __m64 z8 = mz[4]; - __m64 z10 = mz[5]; - __m64 z12 = mz[6]; - __m64 z14 = mz[7]; - __m64 z16 = mz[8]; - __m64 z18 = mz[9]; - __m64 z20 = mz[10]; - __m64 z22 = mz[11]; - __m64 z26 = _mm_cvtsi32_si64(z[26]); - - s1 = _mm_add_si64(w1, w4); - C[1] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w6, w8); - s1 = _mm_add_si64(s1, s2); - C[2] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w10, w12); - s1 = _mm_add_si64(s1, s2); - C[3] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x0, y0); - s2 = _mm_add_si64(w14, w16); - s1 = _mm_add_si64(s1, s3); - s1 = _mm_add_si64(s1, s2); - C[4] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x1, y1); - s4 = _mm_add_si64(x4, y4); - s1 = _mm_add_si64(s1, w18); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, w20); - s1 = _mm_add_si64(s1, s3); - C[5] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x6, y6); - s4 = _mm_add_si64(x8, y8); - s1 = _mm_add_si64(s1, w22); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, w26); - s1 = _mm_add_si64(s1, s3); - C[6] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x10, y10); - s4 = _mm_add_si64(x12, y12); - s1 = _mm_add_si64(s1, w27); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, s3); - C[7] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x14, y14); - s4 = _mm_add_si64(x16, y16); - s1 = _mm_add_si64(s1, z0); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, s3); - C[8] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x18, y18); - s4 = _mm_add_si64(x20, y20); - s1 = _mm_add_si64(s1, z1); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, z4); - s1 = _mm_add_si64(s1, s3); - C[9] = _mm_cvtsi64_si32(s1); - s1 = 
_mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x22, y22); - s4 = _mm_add_si64(x26, y26); - s1 = _mm_add_si64(s1, z6); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, z8); - s1 = _mm_add_si64(s1, s3); - C[10] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x27, y27); - s1 = _mm_add_si64(s1, z10); - s1 = _mm_add_si64(s1, z12); - s1 = _mm_add_si64(s1, s3); - C[11] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(z14, z16); - s1 = _mm_add_si64(s1, s3); - C[12] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(z18, z20); - s1 = _mm_add_si64(s1, s3); - C[13] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(z22, z26); - s1 = _mm_add_si64(s1, s3); - C[14] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - C[15] = z[27] + _mm_cvtsi64_si32(s1); - _mm_empty(); } -void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B) -{ - __m128i temp[21]; - const word *w = (word *)temp; - const __m64 *mw = (__m64 *)w; - const word *x = (word *)temp+7*4; - const __m64 *mx = (__m64 *)x; - const word *y = (word *)temp+7*4*2; - const __m64 *my = (__m64 *)y; - - P4_Mul(temp, (__m128i *)A, (__m128i *)B); - - P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B); - - P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1); - - C[0] = w[0]; - - __m64 s1, s2, s3, s4; - - __m64 w1 = _mm_cvtsi32_si64(w[1]); - __m64 w4 = mw[2]; - __m64 w6 = mw[3]; - __m64 w8 = mw[4]; - __m64 w10 = mw[5]; - __m64 w12 = mw[6]; - __m64 w14 = mw[7]; - __m64 w16 = mw[8]; - __m64 w18 = mw[9]; - __m64 w20 = mw[10]; - __m64 w22 = mw[11]; - __m64 w26 = _mm_cvtsi32_si64(w[26]); - - __m64 x0 = _mm_cvtsi32_si64(x[0]); - __m64 x1 = _mm_cvtsi32_si64(x[1]); - __m64 x4 = mx[2]; - __m64 x6 = mx[3]; - __m64 x8 = mx[4]; - - __m64 y0 = _mm_cvtsi32_si64(y[0]); - __m64 y1 = _mm_cvtsi32_si64(y[1]); - __m64 y4 = my[2]; - __m64 y6 = my[3]; - __m64 y8 = my[4]; - - s1 = _mm_add_si64(w1, w4); - C[1] = _mm_cvtsi64_si32(s1); - s1 = 
_mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w6, w8); - s1 = _mm_add_si64(s1, s2); - C[2] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w10, w12); - s1 = _mm_add_si64(s1, s2); - C[3] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x0, y0); - s2 = _mm_add_si64(w14, w16); - s1 = _mm_add_si64(s1, s3); - s1 = _mm_add_si64(s1, s2); - C[4] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x1, y1); - s4 = _mm_add_si64(x4, y4); - s1 = _mm_add_si64(s1, w18); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, w20); - s1 = _mm_add_si64(s1, s3); - C[5] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x6, y6); - s4 = _mm_add_si64(x8, y8); - s1 = _mm_add_si64(s1, w22); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, w26); - s1 = _mm_add_si64(s1, s3); - C[6] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - C[7] = _mm_cvtsi64_si32(s1) + w[27] + x[10] + y[10] + x[12] + y[12]; - _mm_empty(); +inline int Subtract(word *C, const word *A, const word *B, size_t N) +{ +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + return s_pSub(N, C, A, B); +#else + return Baseline_Sub(N, C, A, B); +#endif } -#endif // #ifdef SSE2_INTRINSICS_AVAILABLE - // ******************************************************** + #define A0 A #define A1 (A+N2) #define B0 B @@ -2004,64 +1876,37 @@ void RecursiveMultiply(word *R, word *T, const word *A, const word *B, size_t N) { assert(N>=2 && N%2==0); - if (LowLevel::MultiplyRecursionLimit() >= 8 && N==8) - LowLevel::Multiply8(R, A, B); - else if (LowLevel::MultiplyRecursionLimit() >= 4 && N==4) - LowLevel::Multiply4(R, A, B); - else if (N==2) - LowLevel::Multiply2(R, A, B); + if (N <= s_recursionLimit) + s_pMul[N/4](R, A, B); else { const size_t N2 = N/2; - int carry; - int aComp = Compare(A0, A1, N2); - int bComp = Compare(B0, B1, N2); + size_t AN2 = Compare(A0, A1, N2) > 0 ? 
0 : N2; + Subtract(R0, A + AN2, A + (N2 ^ AN2), N2); - switch (2*aComp + aComp + bComp) - { - case -4: - LowLevel::Subtract(R0, A1, A0, N2); - LowLevel::Subtract(R1, B0, B1, N2); - RecursiveMultiply(T0, T2, R0, R1, N2); - LowLevel::Subtract(T1, T1, R0, N2); - carry = -1; - break; - case -2: - LowLevel::Subtract(R0, A1, A0, N2); - LowLevel::Subtract(R1, B0, B1, N2); - RecursiveMultiply(T0, T2, R0, R1, N2); - carry = 0; - break; - case 2: - LowLevel::Subtract(R0, A0, A1, N2); - LowLevel::Subtract(R1, B1, B0, N2); - RecursiveMultiply(T0, T2, R0, R1, N2); - carry = 0; - break; - case 4: - LowLevel::Subtract(R0, A1, A0, N2); - LowLevel::Subtract(R1, B0, B1, N2); - RecursiveMultiply(T0, T2, R0, R1, N2); - LowLevel::Subtract(T1, T1, R1, N2); - carry = -1; - break; - default: - SetWords(T0, 0, N); - carry = 0; - } + size_t BN2 = Compare(B0, B1, N2) > 0 ? 0 : N2; + Subtract(R1, B + BN2, B + (N2 ^ BN2), N2); - RecursiveMultiply(R0, T2, A0, B0, N2); RecursiveMultiply(R2, T2, A1, B1, N2); + RecursiveMultiply(T0, T2, R0, R1, N2); + RecursiveMultiply(R0, T2, A0, B0, N2); // now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1 - carry += LowLevel::Add(T0, T0, R0, N); - carry += LowLevel::Add(T0, T0, R2, N); - carry += LowLevel::Add(R1, R1, T0, N); + int c2 = Add(R2, R2, R1, N2); + int c3 = c2; + c2 += Add(R1, R2, R0, N2); + c3 += Add(R2, R2, R3, N2); - assert (carry >= 0 && carry <= 2); - Increment(R3, N2, carry); + if (AN2 == BN2) + c3 -= Subtract(R1, R1, T0, N); + else + c3 += Add(R1, R1, T0, N); + + c3 += Increment(R2, N2, c2); + assert (c3 >= 0 && c3 <= 2); + Increment(R3, N2, c3); } } @@ -2072,12 +1917,9 @@ void RecursiveMultiply(word *R, word *T, const word *A, const word *B, size_t N) void RecursiveSquare(word *R, word *T, const word *A, size_t N) { assert(N && N%2==0); - if (LowLevel::SquareRecursionLimit() >= 8 && N==8) - LowLevel::Square8(R, A); - if (LowLevel::SquareRecursionLimit() >= 4 && N==4) - LowLevel::Square4(R, A); - else if (N==2) - 
LowLevel::Square2(R, A); + + if (N <= s_recursionLimit) + s_pSqu[N/4](R, A); else { const size_t N2 = N/2; @@ -2086,35 +1928,32 @@ void RecursiveSquare(word *R, word *T, const word *A, size_t N) RecursiveSquare(R2, T2, A1, N2); RecursiveMultiply(T0, T2, A0, A1, N2); - int carry = LowLevel::Add(R1, R1, T0, N); - carry += LowLevel::Add(R1, R1, T0, N); + int carry = Add(R1, R1, T0, N); + carry += Add(R1, R1, T0, N); Increment(R3, N2, carry); } } // R[N] - bottom half of A*B -// T[N] - temporary work space +// T[3*N/2] - temporary work space // A[N] - multiplier // B[N] - multiplicant void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N) { assert(N>=2 && N%2==0); - if (LowLevel::MultiplyBottomRecursionLimit() >= 8 && N==8) - LowLevel::Multiply8Bottom(R, A, B); - else if (LowLevel::MultiplyBottomRecursionLimit() >= 4 && N==4) - LowLevel::Multiply4Bottom(R, A, B); - else if (N==2) - LowLevel::Multiply2Bottom(R, A, B); + + if (N <= s_recursionLimit) + s_pBot[N/4](R, A, B); else { const size_t N2 = N/2; RecursiveMultiply(R, T, A0, B0, N2); RecursiveMultiplyBottom(T0, T1, A1, B0, N2); - LowLevel::Add(R1, R1, T0, N2); + Add(R1, R1, T0, N2); RecursiveMultiplyBottom(T0, T1, A0, B1, N2); - LowLevel::Add(R1, R1, T0, N2); + Add(R1, R1, T0, N2); } } @@ -2124,88 +1963,61 @@ void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, siz // A[N] --- multiplier // B[N] --- multiplicant -void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N) +void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N) { assert(N>=2 && N%2==0); - if (N==4) - { - LowLevel::Multiply4(T, A, B); - memcpy(R, T+4, 4*WORD_SIZE); - } - else if (N==2) +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + if (HasSSE2() && ((N>=8) & (N<=32))) + s_pTop[N/16](R, A, B, L[N-1]); + else +#endif + if (N<=4) { - LowLevel::Multiply2(T, A, B); - memcpy(R, T+2, 2*WORD_SIZE); + s_pMul[N/4](T, A, B); + memcpy(R, T+N, 
N*WORD_SIZE); } else { const size_t N2 = N/2; - int carry; - int aComp = Compare(A0, A1, N2); - int bComp = Compare(B0, B1, N2); + size_t AN2 = Compare(A0, A1, N2) > 0 ? 0 : N2; + Subtract(R0, A + AN2, A + (N2 ^ AN2), N2); - switch (2*aComp + aComp + bComp) - { - case -4: - LowLevel::Subtract(R0, A1, A0, N2); - LowLevel::Subtract(R1, B0, B1, N2); - RecursiveMultiply(T0, T2, R0, R1, N2); - LowLevel::Subtract(T1, T1, R0, N2); - carry = -1; - break; - case -2: - LowLevel::Subtract(R0, A1, A0, N2); - LowLevel::Subtract(R1, B0, B1, N2); - RecursiveMultiply(T0, T2, R0, R1, N2); - carry = 0; - break; - case 2: - LowLevel::Subtract(R0, A0, A1, N2); - LowLevel::Subtract(R1, B1, B0, N2); - RecursiveMultiply(T0, T2, R0, R1, N2); - carry = 0; - break; - case 4: - LowLevel::Subtract(R0, A1, A0, N2); - LowLevel::Subtract(R1, B0, B1, N2); - RecursiveMultiply(T0, T2, R0, R1, N2); - LowLevel::Subtract(T1, T1, R1, N2); - carry = -1; - break; - default: - SetWords(T0, 0, N); - carry = 0; - } - - RecursiveMultiply(T2, R0, A1, B1, N2); + size_t BN2 = Compare(B0, B1, N2) > 0 ? 
0 : N2; + Subtract(R1, B + BN2, B + (N2 ^ BN2), N2); - // now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1 + RecursiveMultiply(T0, T2, R0, R1, N2); + RecursiveMultiply(R0, T2, A1, B1, N2); - int c2 = LowLevel::Subtract(R0, L+N2, L, N2); - c2 += LowLevel::Subtract(R0, R0, T0, N2); - int t = (Compare(R0, T2, N2) == -1); + // now T[01] holds (A1-A0)*(B0-B1) = A1*B0+A0*B1-A1*B1-A0*B0, R[01] holds A1*B1 - carry += t; - carry += Increment(R0, N2, c2+t); - carry += LowLevel::Add(R0, R0, T1, N2); - carry += LowLevel::Add(R0, R0, T3, N2); - assert (carry >= 0 && carry <= 2); + int t, c3; + int c2 = Subtract(T2, L+N2, L, N2); - CopyWords(R1, T3, N2); - Increment(R1, N2, carry); - } -} + if (AN2 == BN2) + { + c2 -= Add(T2, T2, T0, N2); + t = (Compare(T2, R0, N2) == -1); + c3 = t - Subtract(T2, T2, T1, N2); + } + else + { + c2 += Subtract(T2, T2, T0, N2); + t = (Compare(T2, R0, N2) == -1); + c3 = t + Add(T2, T2, T1, N2); + } -inline int Add(word *C, const word *A, const word *B, size_t N) -{ - return LowLevel::Add(C, A, B, N); -} + c2 += t; + if (c2 >= 0) + c3 += Increment(T2, N2, c2); + else + c3 -= Decrement(T2, N2, -c2); + c3 += Add(R0, T2, R1, N2); -inline int Subtract(word *C, const word *A, const word *B, size_t N) -{ - return LowLevel::Subtract(C, A, B, N); + assert (c3 >= 0 && c3 <= 2); + Increment(R1, N2, c3); + } } inline void Multiply(word *R, word *T, const word *A, const word *B, size_t N) @@ -2223,23 +2035,6 @@ inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, size_ RecursiveMultiplyBottom(R, T, A, B, N); } -inline void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N) -{ - RecursiveMultiplyTop(R, T, L, A, B, N); -} - -static word LinearMultiply(word *C, const word *A, word B, size_t N) -{ - word carry=0; - for(unsigned i=0; i<N; i++) - { - DWord p = DWord::MultiplyAndAdd(A[i], B, carry); - C[i] = p.GetLowHalf(); - carry = p.GetHighHalf(); - } - return carry; -} - // R[NA+NB] - result = A*B // T[NA+NB] 
- temporary work space // A[NA] ---- multiplier @@ -2264,7 +2059,6 @@ void AsymmetricMultiply(word *R, word *T, const word *A, size_t NA, const word * } assert(NB % NA == 0); - assert((NB/NA)%2 == 0); // NB is an even multiple of NA if (NA==2 && !A[1]) { @@ -2284,15 +2078,24 @@ void AsymmetricMultiply(word *R, word *T, const word *A, size_t NA, const word * } } - Multiply(R, T, A, B, NA); - CopyWords(T+2*NA, R+NA, NA); - size_t i; + if ((NB/NA)%2 == 0) + { + Multiply(R, T, A, B, NA); + CopyWords(T+2*NA, R+NA, NA); - for (i=2*NA; i<NB; i+=2*NA) - Multiply(T+NA+i, T, A, B+i, NA); - for (i=NA; i<NB; i+=2*NA) - Multiply(R+i, T, A, B+i, NA); + for (i=2*NA; i<NB; i+=2*NA) + Multiply(T+NA+i, T, A, B+i, NA); + for (i=NA; i<NB; i+=2*NA) + Multiply(R+i, T, A, B+i, NA); + } + else + { + for (i=0; i<NB; i+=2*NA) + Multiply(R+i, T, A, B+i, NA); + for (i=NA; i<NB; i+=2*NA) + Multiply(T+NA+i, T, A, B+i, NA); + } if (Add(R+NA, R+NA, T+2*NA, NB-NA)) Increment(R+NB, NA); @@ -2308,10 +2111,10 @@ void RecursiveInverseModPower2(word *R, word *T, const word *A, size_t N) { T[0] = AtomicInverseModPower2(A[0]); T[1] = 0; - LowLevel::Multiply2Bottom(T+2, T, A); + s_pBot[0](T+2, T, A); TwosComplement(T+2, 2); Increment(T+2, 2, 2); - LowLevel::Multiply2Bottom(R, T, T+2); + s_pBot[0](R, T, T+2); } else { @@ -2333,8 +2136,9 @@ void RecursiveInverseModPower2(word *R, word *T, const word *A, size_t N) // M[N] --- modulus // U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N) -void MontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, size_t N) +void MontgomeryReduce(word *R, word *T, word *X, const word *M, const word *U, size_t N) { +#if 1 MultiplyBottom(R, T, X, U, N); MultiplyTop(T, T+N, X, R, M, N); word borrow = Subtract(T, X+N, T, N); @@ -2342,6 +2146,60 @@ void MontgomeryReduce(word *R, word *T, const word *X, const word *M, const word word carry = Add(T+N, T, M, N); assert(carry || !borrow); CopyWords(R, T + (borrow ? 
N : 0), N); +#elif 0 + const word u = 0-U[0]; + Declare2Words(p) + for (size_t i=0; i<N; i++) + { + const word t = u * X[i]; + word c = 0; + for (size_t j=0; j<N; j+=2) + { + MultiplyWords(p, t, M[j]); + Acc2WordsBy1(p, X[i+j]); + Acc2WordsBy1(p, c); + X[i+j] = LowWord(p); + c = HighWord(p); + MultiplyWords(p, t, M[j+1]); + Acc2WordsBy1(p, X[i+j+1]); + Acc2WordsBy1(p, c); + X[i+j+1] = LowWord(p); + c = HighWord(p); + } + + if (Increment(X+N+i, N-i, c)) + while (!Subtract(X+N, X+N, M, N)) {} + } + + memcpy(R, X+N, N*WORD_SIZE); +#else + __m64 u = _mm_cvtsi32_si64(0-U[0]), p; + for (size_t i=0; i<N; i++) + { + __m64 t = _mm_cvtsi32_si64(X[i]); + t = _mm_mul_su32(t, u); + __m64 c = _mm_setzero_si64(); + for (size_t j=0; j<N; j+=2) + { + p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j])); + p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j])); + c = _mm_add_si64(c, p); + X[i+j] = _mm_cvtsi64_si32(c); + c = _mm_srli_si64(c, 32); + p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j+1])); + p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j+1])); + c = _mm_add_si64(c, p); + X[i+j+1] = _mm_cvtsi64_si32(c); + c = _mm_srli_si64(c, 32); + } + + if (Increment(X+N+i, N-i, _mm_cvtsi64_si32(c))) + while (!Subtract(X+N, X+N, M, N)) {} + } + + memcpy(R, X+N, N*WORD_SIZE); + _mm_empty(); +#endif } // R[N] --- result = X/(2**(WORD_BITS*N/2)) mod M @@ -2491,7 +2349,7 @@ static inline void AtomicDivide(word *Q, const word *A, const word *B) // multiply quotient and divisor and add remainder, make sure it equals dividend assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0]))); word P[4]; - Portable::Multiply2(P, Q, B); + s_pMul[0](P, Q, B); Add(P, P, T, 4); assert(memcmp(P, A, 4*WORD_SIZE)==0); } @@ -2503,21 +2361,7 @@ static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, si { assert(N && N%2==0); - if (Q[1]) - { - T[N] = T[N+1] = 0; - unsigned i; - for (i=0; i<N; i+=4) - LowLevel::Multiply2(T+i, Q, B+i); - for (i=2; i<N; i+=4) - if (LowLevel::Multiply2Add(T+i, Q, B+i)) - T[i+5] += 
(++T[i+4]==0); - } - else - { - T[N] = LinearMultiply(T, B, Q[0], N); - T[N+1] = 0; - } + AsymmetricMultiply(T, T+N+2, Q, 2, B, N); word borrow = Subtract(R, R, T, N+2); assert(!borrow && !R[N+1]); @@ -2532,7 +2376,7 @@ static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, si // R[NB] -------- remainder = A%B // Q[NA-NB+2] --- quotient = A/B -// T[NA+2*NB+4] - temp work space +// T[NA+3*(NB+2)] - temp work space // A[NA] -------- dividend // B[NB] -------- divisor @@ -2726,9 +2570,7 @@ InitializeInteger::InitializeInteger() { if (!g_pAssignIntToInteger) { -#ifdef CRYPTOPP_X86ASM_AVAILABLE - SetPentiumFunctionPointers(); -#endif + SetFunctionPointers(); g_pAssignIntToInteger = AssignIntToInteger; } } @@ -2877,7 +2719,8 @@ Integer& Integer::operator=(const Integer& t) { if (this != &t) { - reg.New(RoundupSize(t.WordCount())); + if (reg.size() != t.reg.size() || t.reg[t.reg.size()/2] == 0) + reg.New(RoundupSize(t.WordCount())); CopyWords(reg, t.reg, reg.size()); sign = t.sign; } @@ -3240,7 +3083,7 @@ public: void GenerateBlock(byte *output, size_t size) { - UnalignedPutWord(BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter); + PutWord(false, BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter); ++m_counter; P1363_KDF2<SHA1>::DeriveKey(output, size, m_counterAndSeed, m_counterAndSeed.size(), NULL, 0); } @@ -3657,7 +3500,7 @@ void PositiveMultiply(Integer &product, const Integer &a, const Integer &b) product.reg.CleanNew(RoundupSize(aSize+bSize)); product.sign = Integer::POSITIVE; - SecAlignedWordBlock workspace(aSize + bSize); + IntegerSecBlock workspace(aSize + bSize); AsymmetricMultiply(product.reg, workspace, a.reg, aSize, b.reg, bSize); } @@ -3723,7 +3566,7 @@ void PositiveDivide(Integer &remainder, Integer &quotient, quotient.reg.CleanNew(RoundupSize(aSize-bSize+2)); quotient.sign = Integer::POSITIVE; - SecAlignedWordBlock T(aSize+2*bSize+4); + IntegerSecBlock T(aSize+3*(bSize+2)); Divide(remainder.reg, quotient.reg, T, a.reg, aSize, b.reg, bSize); } @@ 
-11,44 +11,13 @@ NAMESPACE_BEGIN(CryptoPP) -#if defined(SSE2_INTRINSICS_AVAILABLE) - template <class T> - class AlignedAllocator : public AllocatorBase<T> - { - public: - CRYPTOPP_INHERIT_ALLOCATOR_TYPES - - pointer allocate(size_type n, const void *); - void deallocate(void *p, size_type n); - pointer reallocate(T *p, size_type oldSize, size_type newSize, bool preserve) - { - return StandardReallocate(*this, p, oldSize, newSize, preserve); - } - - #if !(defined(CRYPTOPP_MALLOC_ALIGNMENT_IS_16) || defined(CRYPTOPP_MEMALIGN_AVAILABLE) || defined(CRYPTOPP_MM_MALLOC_AVAILABLE)) - #define CRYPTOPP_NO_ALIGNED_ALLOC - AlignedAllocator() : m_pBlock(NULL) {} - protected: - void *m_pBlock; - #endif - }; - - #ifdef CRYPTOPP_IMPORTS - CRYPTOPP_DLL_TEMPLATE_CLASS AlignedAllocator<word>; - #endif - - typedef SecBlock<word, AlignedAllocator<word> > SecAlignedWordBlock; -#else - typedef SecWordBlock SecAlignedWordBlock; -#endif - -void CRYPTOPP_DLL CRYPTOPP_API DisableSSE2(); - struct InitializeInteger // used to initialize static variables { InitializeInteger(); }; +typedef SecBlock<word, AllocatorWithCleanup<word, CRYPTOPP_BOOL_X86> > IntegerSecBlock; + //! multiple precision integer and basic arithmetics /*! This class can represent positive and negative integers with absolute value less than (256**sizeof(word)) ** (256**sizeof(int)). @@ -406,7 +375,7 @@ private: friend void PositiveMultiply(Integer &product, const Integer &a, const Integer &b); friend void PositiveDivide(Integer &remainder, Integer &quotient, const Integer &dividend, const Integer &divisor); - SecAlignedWordBlock reg; + IntegerSecBlock reg; Sign sign; }; diff --git a/rijndael.cpp b/rijndael.cpp index 2a1a19ef..4a8572f2 100644 --- a/rijndael.cpp +++ b/rijndael.cpp @@ -51,10 +51,7 @@ being unloaded from L1 cache, until that round is finished. #include "rijndael.h" #include "misc.h" - -#ifdef CRYPTOPP_L1_CACHE_ALIGN_NOT_AVAILABLE -#pragma message("Don't know how to align data on L1 cache boundary. 
Defense against AES timing attack may be affected.") -#endif +#include "cpu.h" NAMESPACE_BEGIN(CryptoPP) @@ -122,25 +119,25 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c for (i = 1; i < m_rounds; i++) { rk += 4; rk[0] = - Td0[Se[GETBYTE(rk[0], 3)]] ^ - Td1[Se[GETBYTE(rk[0], 2)]] ^ - Td2[Se[GETBYTE(rk[0], 1)]] ^ - Td3[Se[GETBYTE(rk[0], 0)]]; + Td[0*256+Se[GETBYTE(rk[0], 3)]] ^ + Td[1*256+Se[GETBYTE(rk[0], 2)]] ^ + Td[2*256+Se[GETBYTE(rk[0], 1)]] ^ + Td[3*256+Se[GETBYTE(rk[0], 0)]]; rk[1] = - Td0[Se[GETBYTE(rk[1], 3)]] ^ - Td1[Se[GETBYTE(rk[1], 2)]] ^ - Td2[Se[GETBYTE(rk[1], 1)]] ^ - Td3[Se[GETBYTE(rk[1], 0)]]; + Td[0*256+Se[GETBYTE(rk[1], 3)]] ^ + Td[1*256+Se[GETBYTE(rk[1], 2)]] ^ + Td[2*256+Se[GETBYTE(rk[1], 1)]] ^ + Td[3*256+Se[GETBYTE(rk[1], 0)]]; rk[2] = - Td0[Se[GETBYTE(rk[2], 3)]] ^ - Td1[Se[GETBYTE(rk[2], 2)]] ^ - Td2[Se[GETBYTE(rk[2], 1)]] ^ - Td3[Se[GETBYTE(rk[2], 0)]]; + Td[0*256+Se[GETBYTE(rk[2], 3)]] ^ + Td[1*256+Se[GETBYTE(rk[2], 2)]] ^ + Td[2*256+Se[GETBYTE(rk[2], 1)]] ^ + Td[3*256+Se[GETBYTE(rk[2], 0)]]; rk[3] = - Td0[Se[GETBYTE(rk[3], 3)]] ^ - Td1[Se[GETBYTE(rk[3], 2)]] ^ - Td2[Se[GETBYTE(rk[3], 1)]] ^ - Td3[Se[GETBYTE(rk[3], 0)]]; + Td[0*256+Se[GETBYTE(rk[3], 3)]] ^ + Td[1*256+Se[GETBYTE(rk[3], 2)]] ^ + Td[2*256+Se[GETBYTE(rk[3], 1)]] ^ + Td[3*256+Se[GETBYTE(rk[3], 0)]]; } } @@ -148,15 +145,245 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key + m_rounds*4, m_key + m_rounds*4, 16); } -const static unsigned int s_lineSizeDiv4 = CRYPTOPP_L1_CACHE_LINE_SIZE/4; -#ifdef IS_BIG_ENDIAN -const static unsigned int s_i3=3, s_i2=2, s_i1=1, s_i0=0; -#else -const static unsigned int s_i3=0, s_i2=1, s_i1=2, s_i0=3; -#endif +#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const { +#ifdef 
CRYPTOPP_X86_ASM_AVAILABLE + if (HasMMX()) + { + const word32 *k = m_key; + const word32 *kLoopEnd = k + m_rounds*4; +#ifdef __GNUC__ + word32 t0, t1, t2, t3; + __asm__ __volatile__ + ( + ".intel_syntax noprefix;" + AS1( push ebx) + AS1( push ebp) + AS2( mov ebp, eax) + AS2( movd mm5, ecx) +#else + AS2( mov edx, g_cacheLineSize) + AS2( mov edi, inBlock) + AS2( mov esi, k) + AS2( movd mm5, kLoopEnd) + AS1( push ebp) + AS2( lea ebp, Te) +#endif + AS2( mov eax, [esi+0*4]) // s0 + AS2( xor eax, [edi+0*4]) + AS2( movd mm0, eax) + AS2( mov ebx, [esi+1*4]) + AS2( xor ebx, [edi+1*4]) + AS2( movd mm1, ebx) + AS2( and ebx, eax) + AS2( mov eax, [esi+2*4]) + AS2( xor eax, [edi+2*4]) + AS2( movd mm2, eax) + AS2( and ebx, eax) + AS2( mov ecx, [esi+3*4]) + AS2( xor ecx, [edi+3*4]) + AS2( and ebx, ecx) + + // read Te0 into L1 cache. this code could be simplifed by using lfence, but that is an SSE2 instruction + AS2( and ebx, 0) + AS2( mov edi, ebx) // make index depend on previous loads to simulate lfence + ASL(2) + AS2( and ebx, [ebp+edi]) + AS2( add edi, edx) + AS2( and ebx, [ebp+edi]) + AS2( add edi, edx) + AS2( and ebx, [ebp+edi]) + AS2( add edi, edx) + AS2( and ebx, [ebp+edi]) + AS2( add edi, edx) + AS2( cmp edi, 1024) + ASJ( jl, 2, b) + AS2( and ebx, [ebp+1020]) + AS2( movd mm6, ebx) + AS2( pxor mm2, mm6) + AS2( pxor mm1, mm6) + AS2( pxor mm0, mm6) + AS2( xor ecx, ebx) + + AS2( mov edi, [esi+4*4]) // t0 + AS2( mov eax, [esi+5*4]) + AS2( mov ebx, [esi+6*4]) + AS2( mov edx, [esi+7*4]) + AS2( add esi, 8*4) + AS2( movd mm4, esi) + +#define QUARTER_ROUND(t, a, b, c, d) \ + AS2(movzx esi, t##l)\ + AS2(d, [ebp+0*1024+4*esi])\ + AS2(movzx esi, t##h)\ + AS2(c, [ebp+1*1024+4*esi])\ + AS2(shr e##t##x, 16)\ + AS2(movzx esi, t##l)\ + AS2(b, [ebp+2*1024+4*esi])\ + AS2(movzx esi, t##h)\ + AS2(a, [ebp+3*1024+4*esi]) + +#define s0 xor edi +#define s1 xor eax +#define s2 xor ebx +#define s3 xor ecx +#define t0 xor edi +#define t1 xor eax +#define t2 xor ebx +#define t3 xor edx + + 
QUARTER_ROUND(c, t0, t1, t2, t3) + AS2( movd ecx, mm2) + QUARTER_ROUND(c, t3, t0, t1, t2) + AS2( movd ecx, mm1) + QUARTER_ROUND(c, t2, t3, t0, t1) + AS2( movd ecx, mm0) + QUARTER_ROUND(c, t1, t2, t3, t0) + AS2( movd mm2, ebx) + AS2( movd mm1, eax) + AS2( movd mm0, edi) +#undef QUARTER_ROUND + + AS2( movd esi, mm4) + + ASL(0) + AS2( mov edi, [esi+0*4]) + AS2( mov eax, [esi+1*4]) + AS2( mov ebx, [esi+2*4]) + AS2( mov ecx, [esi+3*4]) + +#define QUARTER_ROUND(t, a, b, c, d) \ + AS2(movzx esi, t##l)\ + AS2(a, [ebp+3*1024+4*esi])\ + AS2(movzx esi, t##h)\ + AS2(b, [ebp+2*1024+4*esi])\ + AS2(shr e##t##x, 16)\ + AS2(movzx esi, t##l)\ + AS2(c, [ebp+1*1024+4*esi])\ + AS2(movzx esi, t##h)\ + AS2(d, [ebp+0*1024+4*esi]) + + QUARTER_ROUND(d, s0, s1, s2, s3) + AS2( movd edx, mm2) + QUARTER_ROUND(d, s3, s0, s1, s2) + AS2( movd edx, mm1) + QUARTER_ROUND(d, s2, s3, s0, s1) + AS2( movd edx, mm0) + QUARTER_ROUND(d, s1, s2, s3, s0) + AS2( movd esi, mm4) + AS2( movd mm2, ebx) + AS2( movd mm1, eax) + AS2( movd mm0, edi) + + AS2( mov edi, [esi+4*4]) + AS2( mov eax, [esi+5*4]) + AS2( mov ebx, [esi+6*4]) + AS2( mov edx, [esi+7*4]) + + QUARTER_ROUND(c, t0, t1, t2, t3) + AS2( movd ecx, mm2) + QUARTER_ROUND(c, t3, t0, t1, t2) + AS2( movd ecx, mm1) + QUARTER_ROUND(c, t2, t3, t0, t1) + AS2( movd ecx, mm0) + QUARTER_ROUND(c, t1, t2, t3, t0) + AS2( movd mm2, ebx) + AS2( movd mm1, eax) + AS2( movd mm0, edi) + + AS2( movd esi, mm4) + AS2( movd edi, mm5) + AS2( add esi, 8*4) + AS2( movd mm4, esi) + AS2( cmp edi, esi) + ASJ( jne, 0, b) + +#undef QUARTER_ROUND +#undef s0 +#undef s1 +#undef s2 +#undef s3 +#undef t0 +#undef t1 +#undef t2 +#undef t3 + + AS2( mov eax, [edi+0*4]) + AS2( mov ecx, [edi+1*4]) + AS2( mov esi, [edi+2*4]) + AS2( mov edi, [edi+3*4]) + +#define QUARTER_ROUND(a, b, c, d) \ + AS2( movzx ebx, dl)\ + AS2( movzx ebx, BYTE PTR [ebp+1+4*ebx])\ + AS2( shl ebx, 3*8)\ + AS2( xor a, ebx)\ + AS2( movzx ebx, dh)\ + AS2( movzx ebx, BYTE PTR [ebp+1+4*ebx])\ + AS2( shl ebx, 2*8)\ + AS2( xor b, 
ebx)\ + AS2( shr edx, 16)\ + AS2( movzx ebx, dl)\ + AS2( shr edx, 8)\ + AS2( movzx ebx, BYTE PTR [ebp+1+4*ebx])\ + AS2( shl ebx, 1*8)\ + AS2( xor c, ebx)\ + AS2( movzx ebx, BYTE PTR [ebp+1+4*edx])\ + AS2( xor d, ebx) + + QUARTER_ROUND(eax, ecx, esi, edi) + AS2( movd edx, mm2) + QUARTER_ROUND(edi, eax, ecx, esi) + AS2( movd edx, mm1) + QUARTER_ROUND(esi, edi, eax, ecx) + AS2( movd edx, mm0) + QUARTER_ROUND(ecx, esi, edi, eax) + +#undef QUARTER_ROUND + + AS1( pop ebp) + AS1( emms) + +#ifdef __GNUC__ + AS1( pop ebx) + ".att_syntax prefix;" + : "=a" (t0), "=c" (t1), "=S" (t2), "=D" (t3) + : "a" (Te), "D" (inBlock), "S" (k), "c" (kLoopEnd), "d" (g_cacheLineSize) + : "memory", "cc" + ); + + if (xorBlock) + { + t0 ^= ((const word32 *)xorBlock)[0]; + t1 ^= ((const word32 *)xorBlock)[1]; + t2 ^= ((const word32 *)xorBlock)[2]; + t3 ^= ((const word32 *)xorBlock)[3]; + } + ((word32 *)outBlock)[0] = t0; + ((word32 *)outBlock)[1] = t1; + ((word32 *)outBlock)[2] = t2; + ((word32 *)outBlock)[3] = t3; +#else + AS2( mov ebx, xorBlock) + AS2( test ebx, ebx) + ASJ( jz, 1, f) + AS2( xor eax, [ebx+0*4]) + AS2( xor ecx, [ebx+1*4]) + AS2( xor esi, [ebx+2*4]) + AS2( xor edi, [ebx+3*4]) + ASL(1) + AS2( mov ebx, outBlock) + AS2( mov [ebx+0*4], eax) + AS2( mov [ebx+1*4], ecx) + AS2( mov [ebx+2*4], esi) + AS2( mov [ebx+3*4], edi) +#endif + } + else +#endif // #ifdef CRYPTOPP_X86_ASM_AVAILABLE + { word32 s0, s1, s2, s3, t0, t1, t2, t3; const word32 *rk = m_key; @@ -171,95 +398,68 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock rk += 8; // timing attack countermeasure. 
see comments at top for more details + const int cacheLineSize = GetCacheLineSize(); unsigned int i; word32 u = 0; - for (i=0; i<sizeof(Te0)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE) - u &= (Te0[i+0*s_lineSizeDiv4] & Te0[i+2*s_lineSizeDiv4]) & (Te0[i+1*s_lineSizeDiv4] & Te0[i+3*s_lineSizeDiv4]); + for (i=0; i<1024; i+=cacheLineSize) + u &= *(const word32 *)(((const byte *)Te)+i); + u &= Te[255]; s0 |= u; s1 |= u; s2 |= u; s3 |= u; // first round - t0 ^= - Te0[GETBYTE(s0, s_i3)] ^ - rotrFixed(Te0[GETBYTE(s1, s_i2)], 8) ^ - rotrFixed(Te0[GETBYTE(s2, s_i1)], 16) ^ - rotrFixed(Te0[GETBYTE(s3, s_i0)], 24); - t1 ^= - Te0[GETBYTE(s1, s_i3)] ^ - rotrFixed(Te0[GETBYTE(s2, s_i2)], 8) ^ - rotrFixed(Te0[GETBYTE(s3, s_i1)], 16) ^ - rotrFixed(Te0[GETBYTE(s0, s_i0)], 24); - t2 ^= - Te0[GETBYTE(s2, s_i3)] ^ - rotrFixed(Te0[GETBYTE(s3, s_i2)], 8) ^ - rotrFixed(Te0[GETBYTE(s0, s_i1)], 16) ^ - rotrFixed(Te0[GETBYTE(s1, s_i0)], 24); - t3 ^= - Te0[GETBYTE(s3, s_i3)] ^ - rotrFixed(Te0[GETBYTE(s0, s_i2)], 8) ^ - rotrFixed(Te0[GETBYTE(s1, s_i1)], 16) ^ - rotrFixed(Te0[GETBYTE(s2, s_i0)], 24); +#ifdef IS_BIG_ENDIAN +#define QUARTER_ROUND(t, a, b, c, d) \ + a ^= rotrFixed(Te[byte(t)], 24); t >>= 8;\ + b ^= rotrFixed(Te[byte(t)], 16); t >>= 8;\ + c ^= rotrFixed(Te[byte(t)], 8); t >>= 8;\ + d ^= Te[t]; +#else +#define QUARTER_ROUND(t, a, b, c, d) \ + d ^= Te[byte(t)]; t >>= 8;\ + c ^= rotrFixed(Te[byte(t)], 8); t >>= 8;\ + b ^= rotrFixed(Te[byte(t)], 16); t >>= 8;\ + a ^= rotrFixed(Te[t], 24); +#endif + + QUARTER_ROUND(s3, t0, t1, t2, t3) + QUARTER_ROUND(s2, t3, t0, t1, t2) + QUARTER_ROUND(s1, t2, t3, t0, t1) + QUARTER_ROUND(s0, t1, t2, t3, t0) +#undef QUARTER_ROUND // Nr - 2 full rounds: unsigned int r = m_rounds/2 - 1; do { - s0 = - Te0[GETBYTE(t0, 3)] ^ - Te1[GETBYTE(t1, 2)] ^ - Te2[GETBYTE(t2, 1)] ^ - Te3[GETBYTE(t3, 0)] ^ - rk[0]; - s1 = - Te0[GETBYTE(t1, 3)] ^ - Te1[GETBYTE(t2, 2)] ^ - Te2[GETBYTE(t3, 1)] ^ - Te3[GETBYTE(t0, 0)] ^ - rk[1]; - s2 = - Te0[GETBYTE(t2, 3)] ^ - Te1[GETBYTE(t3, 2)] ^ 
- Te2[GETBYTE(t0, 1)] ^ - Te3[GETBYTE(t1, 0)] ^ - rk[2]; - s3 = - Te0[GETBYTE(t3, 3)] ^ - Te1[GETBYTE(t0, 2)] ^ - Te2[GETBYTE(t1, 1)] ^ - Te3[GETBYTE(t2, 0)] ^ - rk[3]; - - t0 = - Te0[GETBYTE(s0, 3)] ^ - Te1[GETBYTE(s1, 2)] ^ - Te2[GETBYTE(s2, 1)] ^ - Te3[GETBYTE(s3, 0)] ^ - rk[4]; - t1 = - Te0[GETBYTE(s1, 3)] ^ - Te1[GETBYTE(s2, 2)] ^ - Te2[GETBYTE(s3, 1)] ^ - Te3[GETBYTE(s0, 0)] ^ - rk[5]; - t2 = - Te0[GETBYTE(s2, 3)] ^ - Te1[GETBYTE(s3, 2)] ^ - Te2[GETBYTE(s0, 1)] ^ - Te3[GETBYTE(s1, 0)] ^ - rk[6]; - t3 = - Te0[GETBYTE(s3, 3)] ^ - Te1[GETBYTE(s0, 2)] ^ - Te2[GETBYTE(s1, 1)] ^ - Te3[GETBYTE(s2, 0)] ^ - rk[7]; +#define QUARTER_ROUND(t, a, b, c, d) \ + a ^= Te[3*256+byte(t)]; t >>= 8;\ + b ^= Te[2*256+byte(t)]; t >>= 8;\ + c ^= Te[1*256+byte(t)]; t >>= 8;\ + d ^= Te[t]; + + s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3]; + + QUARTER_ROUND(t3, s0, s1, s2, s3) + QUARTER_ROUND(t2, s3, s0, s1, s2) + QUARTER_ROUND(t1, s2, s3, s0, s1) + QUARTER_ROUND(t0, s1, s2, s3, s0) + + t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7]; + + QUARTER_ROUND(s3, t0, t1, t2, t3) + QUARTER_ROUND(s2, t3, t0, t1, t2) + QUARTER_ROUND(s1, t2, t3, t0, t1) + QUARTER_ROUND(s0, t1, t2, t3, t0) +#undef QUARTER_ROUND rk += 8; } while (--r); // timing attack countermeasure. 
see comments at top for more details u = 0; - for (i=0; i<sizeof(Se)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE) - u &= (((word32*)Se)[i+0*s_lineSizeDiv4] & ((word32*)Se)[i+2*s_lineSizeDiv4]) & (((word32*)Se)[i+1*s_lineSizeDiv4] & ((word32*)Se)[i+3*s_lineSizeDiv4]); + for (i=0; i<256; i+=cacheLineSize) + u &= *(const word32 *)(Se+i); + u &= *(const word32 *)(Se+252); t0 |= u; t1 |= u; t2 |= u; t3 |= u; word32 tbw[4]; @@ -267,23 +467,17 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock word32 *const obw = (word32 *)outBlock; const word32 *const xbw = (const word32 *)xorBlock; - // last round - tempBlock[0] = Se[GETBYTE(t0, 3)]; - tempBlock[1] = Se[GETBYTE(t1, 2)]; - tempBlock[2] = Se[GETBYTE(t2, 1)]; - tempBlock[3] = Se[GETBYTE(t3, 0)]; - tempBlock[4] = Se[GETBYTE(t1, 3)]; - tempBlock[5] = Se[GETBYTE(t2, 2)]; - tempBlock[6] = Se[GETBYTE(t3, 1)]; - tempBlock[7] = Se[GETBYTE(t0, 0)]; - tempBlock[8] = Se[GETBYTE(t2, 3)]; - tempBlock[9] = Se[GETBYTE(t3, 2)]; - tempBlock[10] = Se[GETBYTE(t0, 1)]; - tempBlock[11] = Se[GETBYTE(t1, 0)]; - tempBlock[12] = Se[GETBYTE(t3, 3)]; - tempBlock[13] = Se[GETBYTE(t0, 2)]; - tempBlock[14] = Se[GETBYTE(t1, 1)]; - tempBlock[15] = Se[GETBYTE(t2, 0)]; +#define QUARTER_ROUND(t, a, b, c, d) \ + tempBlock[a] = Se[byte(t)]; t >>= 8;\ + tempBlock[b] = Se[byte(t)]; t >>= 8;\ + tempBlock[c] = Se[byte(t)]; t >>= 8;\ + tempBlock[d] = Se[t]; + + QUARTER_ROUND(t2, 15, 2, 5, 8) + QUARTER_ROUND(t1, 11, 14, 1, 4) + QUARTER_ROUND(t0, 7, 10, 13, 0) + QUARTER_ROUND(t3, 3, 6, 9, 12) +#undef QUARTER_ROUND if (xbw) { @@ -299,12 +493,13 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock obw[2] = tbw[2] ^ rk[2]; obw[3] = tbw[3] ^ rk[3]; } + } } void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const { word32 s0, s1, s2, s3, t0, t1, t2, t3; - const word32 *rk = m_key; + const word32 *rk = m_key; s0 = ((const word32 *)inBlock)[0] ^ rk[0]; s1 = ((const word32 
*)inBlock)[1] ^ rk[1]; @@ -317,95 +512,68 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock rk += 8; // timing attack countermeasure. see comments at top for more details + const int cacheLineSize = GetCacheLineSize(); unsigned int i; word32 u = 0; - for (i=0; i<sizeof(Td0)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE) - u &= (Td0[i+0*s_lineSizeDiv4] & Td0[i+2*s_lineSizeDiv4]) & (Td0[i+1*s_lineSizeDiv4] & Td0[i+3*s_lineSizeDiv4]); + for (i=0; i<1024; i+=cacheLineSize) + u &= *(const word32 *)(((const byte *)Td)+i); + u &= Td[255]; s0 |= u; s1 |= u; s2 |= u; s3 |= u; // first round - t0 ^= - Td0[GETBYTE(s0, s_i3)] ^ - rotrFixed(Td0[GETBYTE(s3, s_i2)], 8) ^ - rotrFixed(Td0[GETBYTE(s2, s_i1)], 16) ^ - rotrFixed(Td0[GETBYTE(s1, s_i0)], 24); - t1 ^= - Td0[GETBYTE(s1, s_i3)] ^ - rotrFixed(Td0[GETBYTE(s0, s_i2)], 8) ^ - rotrFixed(Td0[GETBYTE(s3, s_i1)], 16) ^ - rotrFixed(Td0[GETBYTE(s2, s_i0)], 24); - t2 ^= - Td0[GETBYTE(s2, s_i3)] ^ - rotrFixed(Td0[GETBYTE(s1, s_i2)], 8) ^ - rotrFixed(Td0[GETBYTE(s0, s_i1)], 16) ^ - rotrFixed(Td0[GETBYTE(s3, s_i0)], 24); - t3 ^= - Td0[GETBYTE(s3, s_i3)] ^ - rotrFixed(Td0[GETBYTE(s2, s_i2)], 8) ^ - rotrFixed(Td0[GETBYTE(s1, s_i1)], 16) ^ - rotrFixed(Td0[GETBYTE(s0, s_i0)], 24); +#ifdef IS_BIG_ENDIAN +#define QUARTER_ROUND(t, a, b, c, d) \ + a ^= rotrFixed(Td[byte(t)], 24); t >>= 8;\ + b ^= rotrFixed(Td[byte(t)], 16); t >>= 8;\ + c ^= rotrFixed(Td[byte(t)], 8); t >>= 8;\ + d ^= Td[t]; +#else +#define QUARTER_ROUND(t, a, b, c, d) \ + d ^= Td[byte(t)]; t >>= 8;\ + c ^= rotrFixed(Td[byte(t)], 8); t >>= 8;\ + b ^= rotrFixed(Td[byte(t)], 16); t >>= 8;\ + a ^= rotrFixed(Td[t], 24); +#endif + + QUARTER_ROUND(s3, t2, t1, t0, t3) + QUARTER_ROUND(s2, t1, t0, t3, t2) + QUARTER_ROUND(s1, t0, t3, t2, t1) + QUARTER_ROUND(s0, t3, t2, t1, t0) +#undef QUARTER_ROUND // Nr - 2 full rounds: unsigned int r = m_rounds/2 - 1; do { - s0 = - Td0[GETBYTE(t0, 3)] ^ - Td1[GETBYTE(t3, 2)] ^ - Td2[GETBYTE(t2, 1)] ^ - Td3[GETBYTE(t1, 0)] ^ - 
rk[0]; - s1 = - Td0[GETBYTE(t1, 3)] ^ - Td1[GETBYTE(t0, 2)] ^ - Td2[GETBYTE(t3, 1)] ^ - Td3[GETBYTE(t2, 0)] ^ - rk[1]; - s2 = - Td0[GETBYTE(t2, 3)] ^ - Td1[GETBYTE(t1, 2)] ^ - Td2[GETBYTE(t0, 1)] ^ - Td3[GETBYTE(t3, 0)] ^ - rk[2]; - s3 = - Td0[GETBYTE(t3, 3)] ^ - Td1[GETBYTE(t2, 2)] ^ - Td2[GETBYTE(t1, 1)] ^ - Td3[GETBYTE(t0, 0)] ^ - rk[3]; - - t0 = - Td0[GETBYTE(s0, 3)] ^ - Td1[GETBYTE(s3, 2)] ^ - Td2[GETBYTE(s2, 1)] ^ - Td3[GETBYTE(s1, 0)] ^ - rk[4]; - t1 = - Td0[GETBYTE(s1, 3)] ^ - Td1[GETBYTE(s0, 2)] ^ - Td2[GETBYTE(s3, 1)] ^ - Td3[GETBYTE(s2, 0)] ^ - rk[5]; - t2 = - Td0[GETBYTE(s2, 3)] ^ - Td1[GETBYTE(s1, 2)] ^ - Td2[GETBYTE(s0, 1)] ^ - Td3[GETBYTE(s3, 0)] ^ - rk[6]; - t3 = - Td0[GETBYTE(s3, 3)] ^ - Td1[GETBYTE(s2, 2)] ^ - Td2[GETBYTE(s1, 1)] ^ - Td3[GETBYTE(s0, 0)] ^ - rk[7]; +#define QUARTER_ROUND(t, a, b, c, d) \ + a ^= Td[3*256+byte(t)]; t >>= 8;\ + b ^= Td[2*256+byte(t)]; t >>= 8;\ + c ^= Td[1*256+byte(t)]; t >>= 8;\ + d ^= Td[t]; + + s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3]; + + QUARTER_ROUND(t3, s2, s1, s0, s3) + QUARTER_ROUND(t2, s1, s0, s3, s2) + QUARTER_ROUND(t1, s0, s3, s2, s1) + QUARTER_ROUND(t0, s3, s2, s1, s0) + + t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7]; + + QUARTER_ROUND(s3, t2, t1, t0, t3) + QUARTER_ROUND(s2, t1, t0, t3, t2) + QUARTER_ROUND(s1, t0, t3, t2, t1) + QUARTER_ROUND(s0, t3, t2, t1, t0) +#undef QUARTER_ROUND rk += 8; } while (--r); // timing attack countermeasure. 
see comments at top for more details u = 0; - for (i=0; i<sizeof(Sd)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE) - u &= (((word32*)Sd)[i+0*s_lineSizeDiv4] & ((word32*)Sd)[i+2*s_lineSizeDiv4]) & (((word32*)Sd)[i+1*s_lineSizeDiv4] & ((word32*)Sd)[i+3*s_lineSizeDiv4]); + for (i=0; i<256; i+=cacheLineSize) + u &= *(const word32 *)(Sd+i); + u &= *(const word32 *)(Sd+252); t0 |= u; t1 |= u; t2 |= u; t3 |= u; word32 tbw[4]; @@ -413,23 +581,17 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock word32 *const obw = (word32 *)outBlock; const word32 *const xbw = (const word32 *)xorBlock; - // last round - tempBlock[0] = Sd[GETBYTE(t0, 3)]; - tempBlock[1] = Sd[GETBYTE(t3, 2)]; - tempBlock[2] = Sd[GETBYTE(t2, 1)]; - tempBlock[3] = Sd[GETBYTE(t1, 0)]; - tempBlock[4] = Sd[GETBYTE(t1, 3)]; - tempBlock[5] = Sd[GETBYTE(t0, 2)]; - tempBlock[6] = Sd[GETBYTE(t3, 1)]; - tempBlock[7] = Sd[GETBYTE(t2, 0)]; - tempBlock[8] = Sd[GETBYTE(t2, 3)]; - tempBlock[9] = Sd[GETBYTE(t1, 2)]; - tempBlock[10] = Sd[GETBYTE(t0, 1)]; - tempBlock[11] = Sd[GETBYTE(t3, 0)]; - tempBlock[12] = Sd[GETBYTE(t3, 3)]; - tempBlock[13] = Sd[GETBYTE(t2, 2)]; - tempBlock[14] = Sd[GETBYTE(t1, 1)]; - tempBlock[15] = Sd[GETBYTE(t0, 0)]; +#define QUARTER_ROUND(t, a, b, c, d) \ + tempBlock[a] = Sd[byte(t)]; t >>= 8;\ + tempBlock[b] = Sd[byte(t)]; t >>= 8;\ + tempBlock[c] = Sd[byte(t)]; t >>= 8;\ + tempBlock[d] = Sd[t]; + + QUARTER_ROUND(t2, 7, 2, 13, 8) + QUARTER_ROUND(t1, 3, 14, 9, 4) + QUARTER_ROUND(t0, 15, 10, 5, 0) + QUARTER_ROUND(t3, 11, 6, 1, 12) +#undef QUARTER_ROUND if (xbw) { @@ -25,16 +25,10 @@ class CRYPTOPP_DLL Rijndael : public Rijndael_Info, public BlockCipherDocumentat protected: // VS2005 workaround: have to put these on seperate lines, or error C2487 is triggered in DLL build - CRYPTOPP_L1_CACHE_ALIGN(static const byte Se[256]); - CRYPTOPP_L1_CACHE_ALIGN(static const byte Sd[256]); - CRYPTOPP_L1_CACHE_ALIGN(static const word32 Te0[256]); - static const word32 Te1[256]; - static const 
word32 Te2[256]; - static const word32 Te3[256]; - CRYPTOPP_L1_CACHE_ALIGN(static const word32 Td0[256]); - static const word32 Td1[256]; - static const word32 Td2[256]; - static const word32 Td3[256]; + static const byte Se[256]; + static const byte Sd[256]; + static const word32 Te[4*256]; + static const word32 Td[4*256]; static const word32 rcon[]; @@ -52,6 +46,7 @@ class CRYPTOPP_DLL Rijndael : public Rijndael_Info, public BlockCipherDocumentat { public: void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; + void ProcessAndXorBlock_Old(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; }; public: @@ -9,6 +9,7 @@ #include "sha.h" #include "misc.h" +#include "cpu.h" NAMESPACE_BEGIN(CryptoPP) @@ -74,27 +75,43 @@ void SHA1::Transform(word32 *state, const word32 *data) state[2] += c; state[3] += d; state[4] += e; - /* Wipe variables */ - a = b = c = d = e = 0; - memset(W, 0, sizeof(W)); } // end of Steve Reid's code // ************************************************************* +void SHA224::InitState(HashWordType *state) +{ + static const word32 s[8] = {0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939, 0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4}; + memcpy(state, s, sizeof(s)); +} + void SHA256::InitState(HashWordType *state) { - state[0] = 0x6a09e667; - state[1] = 0xbb67ae85; - state[2] = 0x3c6ef372; - state[3] = 0xa54ff53a; - state[4] = 0x510e527f; - state[5] = 0x9b05688c; - state[6] = 0x1f83d9ab; - state[7] = 0x5be0cd19; + static const word32 s[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19}; + memcpy(state, s, sizeof(s)); } +static const word32 SHA256_K[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 
0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + #define blk2(i) (W[i&15]+=s1(W[(i-2)&15])+W[(i-7)&15]+s0(W[(i-15)&15])) #define Ch(x,y,z) (z^(x&(y^z))) @@ -109,7 +126,7 @@ void SHA256::InitState(HashWordType *state) #define g(i) T[(6-i)&7] #define h(i) T[(7-i)&7] -#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j]+(j?blk2(i):blk0(i));\ +#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA256_K[i+j]+(j?blk2(i):blk0(i));\ d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i)) // for SHA256 @@ -141,98 +158,114 @@ void SHA256::Transform(word32 *state, const word32 *data) state[5] += f(0); state[6] += g(0); state[7] += h(0); - /* Wipe variables */ - memset(W, 0, sizeof(W)); - memset(T, 0, sizeof(T)); } +/* +// smaller but slower +void SHA256_Transform(word32 *state, const word32 *data) +{ + word32 T[20]; + word32 W[32]; + unsigned int i = 0, j = 0; + word32 *t = T+8; + + memcpy(t, state, 8*4); + word32 e = t[4], a = t[0]; + + do + { + word32 w = data[j]; + W[j] = w; + w += K[j]; + w += t[7]; + w += S1(e); + w += Ch(e, t[5], t[6]); + e = t[3] + w; + t[3] = t[3+8] = e; + w += S0(t[0]); + a = w + Maj(a, t[1], t[2]); + t[-1] = t[7] = a; + --t; + ++j; + if (j%8 == 0) + t += 8; + } while (j<16); + + do + { + i = j&0xf; + word32 w = s1(W[i+16-2]) + s0(W[i+16-15]) + W[i] + W[i+16-7]; + W[i+16] = W[i] = w; + w += K[j]; + w += t[7]; + w += S1(e); + w += Ch(e, t[5], t[6]); + e = t[3] + w; + t[3] = t[3+8] = e; + w += S0(t[0]); + a = w + Maj(a, t[1], t[2]); + t[-1] = t[7] = a; + + w = s1(W[(i+1)+16-2]) + s0(W[(i+1)+16-15]) + 
W[(i+1)] + W[(i+1)+16-7]; + W[(i+1)+16] = W[(i+1)] = w; + w += K[j+1]; + w += (t-1)[7]; + w += S1(e); + w += Ch(e, (t-1)[5], (t-1)[6]); + e = (t-1)[3] + w; + (t-1)[3] = (t-1)[3+8] = e; + w += S0((t-1)[0]); + a = w + Maj(a, (t-1)[1], (t-1)[2]); + (t-1)[-1] = (t-1)[7] = a; + + t-=2; + j+=2; + if (j%8 == 0) + t += 8; + } while (j<64); + + state[0] += a; + state[1] += t[1]; + state[2] += t[2]; + state[3] += t[3]; + state[4] += e; + state[5] += t[5]; + state[6] += t[6]; + state[7] += t[7]; +} +*/ + #undef S0 #undef S1 #undef s0 #undef s1 - -const word32 SHA256::K[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - -void SHA224::InitState(HashWordType *state) -{ - state[0] = 0xc1059ed8; - state[1] = 0x367cd507; - state[2] = 0x3070dd17; - state[3] = 0xf70e5939; - state[4] = 0xffc00b31; - state[5] = 0x68581511; - state[6] = 0x64f98fa7; - state[7] = 0xbefa4fa4; -} +#undef R // ************************************************************* #ifdef WORD64_AVAILABLE -void SHA512::InitState(HashWordType *state) +void SHA384::InitState(HashWordType *state) { - state[0] = W64LIT(0x6a09e667f3bcc908); - state[1] = W64LIT(0xbb67ae8584caa73b); - state[2] = W64LIT(0x3c6ef372fe94f82b); - state[3] = W64LIT(0xa54ff53a5f1d36f1); - state[4] = 
W64LIT(0x510e527fade682d1); - state[5] = W64LIT(0x9b05688c2b3e6c1f); - state[6] = W64LIT(0x1f83d9abfb41bd6b); - state[7] = W64LIT(0x5be0cd19137e2179); + static const word64 s[8] = { + W64LIT(0xcbbb9d5dc1059ed8), W64LIT(0x629a292a367cd507), + W64LIT(0x9159015a3070dd17), W64LIT(0x152fecd8f70e5939), + W64LIT(0x67332667ffc00b31), W64LIT(0x8eb44a8768581511), + W64LIT(0xdb0c2e0d64f98fa7), W64LIT(0x47b5481dbefa4fa4)}; + memcpy(state, s, sizeof(s)); } -// for SHA512 -#define S0(x) (rotrFixed(x,28)^rotrFixed(x,34)^rotrFixed(x,39)) -#define S1(x) (rotrFixed(x,14)^rotrFixed(x,18)^rotrFixed(x,41)) -#define s0(x) (rotrFixed(x,1)^rotrFixed(x,8)^(x>>7)) -#define s1(x) (rotrFixed(x,19)^rotrFixed(x,61)^(x>>6)) - -void SHA512::Transform(word64 *state, const word64 *data) +void SHA512::InitState(HashWordType *state) { - word64 W[16]; - word64 T[8]; - /* Copy context->state[] to working vars */ - memcpy(T, state, sizeof(T)); - /* 80 operations, partially loop unrolled */ - for (unsigned int j=0; j<80; j+=16) - { - R( 0); R( 1); R( 2); R( 3); - R( 4); R( 5); R( 6); R( 7); - R( 8); R( 9); R(10); R(11); - R(12); R(13); R(14); R(15); - } - /* Add the working vars back into context.state[] */ - state[0] += a(0); - state[1] += b(0); - state[2] += c(0); - state[3] += d(0); - state[4] += e(0); - state[5] += f(0); - state[6] += g(0); - state[7] += h(0); - /* Wipe variables */ - memset(W, 0, sizeof(W)); - memset(T, 0, sizeof(T)); + static const word64 s[8] = { + W64LIT(0x6a09e667f3bcc908), W64LIT(0xbb67ae8584caa73b), + W64LIT(0x3c6ef372fe94f82b), W64LIT(0xa54ff53a5f1d36f1), + W64LIT(0x510e527fade682d1), W64LIT(0x9b05688c2b3e6c1f), + W64LIT(0x1f83d9abfb41bd6b), W64LIT(0x5be0cd19137e2179)}; + memcpy(state, s, sizeof(s)); } -const word64 SHA512::K[80] = { +CRYPTOPP_ALIGN_DATA(16) static const word64 SHA512_K[80] CRYPTOPP_SECTION_ALIGN16 = { W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd), W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc), W64LIT(0x3956c25bf348b538), 
W64LIT(0x59f111f1b605d019), @@ -275,16 +308,231 @@ const word64 SHA512::K[80] = { W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817) }; -void SHA384::InitState(HashWordType *state) +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE +// put assembly version in separate function, otherwise MSVC 2005 SP1 doesn't generate correct code for the non-assembly version +static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64 *data) +{ +#ifdef __GNUC__ + __asm__ __volatile__ + ( + ".intel_syntax noprefix;" + AS1( push ebx) + AS2( mov ebx, eax) +#else + AS2( lea ebx, SHA512_K) +#endif + + AS2( mov eax, esp) + AS2( and esp, 0xfffffff0) + AS2( sub esp, 27*16) // 17*16 for expanded data, 20*8 for state + AS1( push eax) + AS2( xor eax, eax) + AS2( lea edi, [esp+4+8*8]) // start at middle of state buffer. will decrement pointer each round to avoid copying + AS2( lea esi, [esp+4+20*8+8]) // 16-byte alignment, then add 8 + + AS2( movq mm4, [ecx+0*8]) + AS2( movq [edi+0*8], mm4) + AS2( movq mm0, [ecx+1*8]) + AS2( movq [edi+1*8], mm0) + AS2( movq mm0, [ecx+2*8]) + AS2( movq [edi+2*8], mm0) + AS2( movq mm0, [ecx+3*8]) + AS2( movq [edi+3*8], mm0) + AS2( movq mm5, [ecx+4*8]) + AS2( movq [edi+4*8], mm5) + AS2( movq mm0, [ecx+5*8]) + AS2( movq [edi+5*8], mm0) + AS2( movq mm0, [ecx+6*8]) + AS2( movq [edi+6*8], mm0) + AS2( movq mm0, [ecx+7*8]) + AS2( movq [edi+7*8], mm0) + ASJ( jmp, 0, f) + +#define SSE2_S0_S1(r, a, b, c) \ + AS2( movq mm6, r)\ + AS2( psrlq r, a)\ + AS2( movq mm7, r)\ + AS2( psllq mm6, 64-c)\ + AS2( pxor mm7, mm6)\ + AS2( psrlq r, b-a)\ + AS2( pxor mm7, r)\ + AS2( psllq mm6, c-b)\ + AS2( pxor mm7, mm6)\ + AS2( psrlq r, c-b)\ + AS2( pxor r, mm7)\ + AS2( psllq mm6, b-a)\ + AS2( pxor r, mm6) + +#define SSE2_s0(r, a, b, c) \ + AS2( movdqa xmm6, r)\ + AS2( psrlq r, a)\ + AS2( movdqa xmm7, r)\ + AS2( psllq xmm6, 64-c)\ + AS2( pxor xmm7, xmm6)\ + AS2( psrlq r, b-a)\ + AS2( pxor xmm7, r)\ + AS2( psrlq r, c-b)\ + AS2( pxor r, xmm7)\ + AS2( psllq xmm6, c-a)\ + AS2( pxor 
r, xmm6) + +#define SSE2_s1(r, a, b, c) \ + AS2( movdqa xmm6, r)\ + AS2( psrlq r, a)\ + AS2( movdqa xmm7, r)\ + AS2( psllq xmm6, 64-c)\ + AS2( pxor xmm7, xmm6)\ + AS2( psrlq r, b-a)\ + AS2( pxor xmm7, r)\ + AS2( psllq xmm6, c-b)\ + AS2( pxor xmm7, xmm6)\ + AS2( psrlq r, c-b)\ + AS2( pxor r, xmm7) + + ASL(SHA512_Round) + // k + w is in mm0, a is in mm4, e is in mm5 + AS2( paddq mm0, [edi+7*8]) // h + AS2( movq mm2, [edi+5*8]) // f + AS2( movq mm3, [edi+6*8]) // g + AS2( pxor mm2, mm3) + AS2( pand mm2, mm5) + SSE2_S0_S1(mm5,14,18,41) + AS2( pxor mm2, mm3) + AS2( paddq mm0, mm2) // h += Ch(e,f,g) + AS2( paddq mm5, mm0) // h += S1(e) + AS2( movq mm2, [edi+1*8]) // b + AS2( movq mm1, mm2) + AS2( por mm2, mm4) + AS2( pand mm2, [edi+2*8]) // c + AS2( pand mm1, mm4) + AS2( por mm1, mm2) + AS2( paddq mm1, mm5) // temp = h + Maj(a,b,c) + AS2( paddq mm5, [edi+3*8]) // e = d + h + AS2( movq [edi+3*8], mm5) + AS2( movq [edi+11*8], mm5) + SSE2_S0_S1(mm4,28,34,39) // S0(a) + AS2( paddq mm4, mm1) // a = temp + S0(a) + AS2( movq [edi-8], mm4) + AS2( movq [edi+7*8], mm4) + AS1( ret) + + // first 16 rounds + ASL(0) + AS2( movq mm0, [edx+eax*8]) + AS2( movq [esi+eax*8], mm0) + AS2( movq [esi+eax*8+16*8], mm0) + AS2( paddq mm0, [ebx+eax*8]) + ASC( call, SHA512_Round) + AS1( inc eax) + AS2( sub edi, 8) + AS2( test eax, 7) + ASJ( jnz, 0, b) + AS2( add edi, 8*8) + AS2( cmp eax, 16) + ASJ( jne, 0, b) + + // rest of the rounds + AS2( movdqu xmm0, [esi+(16-2)*8]) + ASL(1) + // data expansion, W[i-2] already in xmm0 + AS2( movdqu xmm3, [esi]) + AS2( paddq xmm3, [esi+(16-7)*8]) + AS2( movdqa xmm2, [esi+(16-15)*8]) + SSE2_s1(xmm0, 6, 19, 61) + AS2( paddq xmm0, xmm3) + SSE2_s0(xmm2, 1, 7, 8) + AS2( paddq xmm0, xmm2) + AS2( movdq2q mm0, xmm0) + AS2( movhlps xmm1, xmm0) + AS2( paddq mm0, [ebx+eax*8]) + AS2( movlps [esi], xmm0) + AS2( movlps [esi+8], xmm1) + AS2( movlps [esi+8*16], xmm0) + AS2( movlps [esi+8*17], xmm1) + // 2 rounds + ASC( call, SHA512_Round) + AS2( sub edi, 8) + AS2( movdq2q mm0, 
xmm1) + AS2( paddq mm0, [ebx+eax*8+8]) + ASC( call, SHA512_Round) + // update indices and loop + AS2( add esi, 16) + AS2( add eax, 2) + AS2( sub edi, 8) + AS2( test eax, 7) + ASJ( jnz, 1, b) + // do housekeeping every 8 rounds + AS2( mov esi, 0xf) + AS2( and esi, eax) + AS2( lea esi, [esp+4+20*8+8+esi*8]) + AS2( add edi, 8*8) + AS2( cmp eax, 80) + ASJ( jne, 1, b) + +#define SSE2_CombineState(i) \ + AS2( movq mm0, [edi+i*8])\ + AS2( paddq mm0, [ecx+i*8])\ + AS2( movq [ecx+i*8], mm0) + + SSE2_CombineState(0) + SSE2_CombineState(1) + SSE2_CombineState(2) + SSE2_CombineState(3) + SSE2_CombineState(4) + SSE2_CombineState(5) + SSE2_CombineState(6) + SSE2_CombineState(7) + + AS1( pop esp) + AS1( emms) + +#ifdef __GNUC__ + AS1( pop ebx) + ".att_syntax prefix;" + : + : "a" (SHA512_K), "c" (state), "d" (data) + : "%esi", "%edi", "memory", "cc" + ); +#endif +} +#endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + +void SHA512::Transform(word64 *state, const word64 *data) { - state[0] = W64LIT(0xcbbb9d5dc1059ed8); - state[1] = W64LIT(0x629a292a367cd507); - state[2] = W64LIT(0x9159015a3070dd17); - state[3] = W64LIT(0x152fecd8f70e5939); - state[4] = W64LIT(0x67332667ffc00b31); - state[5] = W64LIT(0x8eb44a8768581511); - state[6] = W64LIT(0xdb0c2e0d64f98fa7); - state[7] = W64LIT(0x47b5481dbefa4fa4); +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + if (HasSSE2()) + return SHA512_SSE2_Transform(state, data); +#endif + +#define S0(x) (rotrFixed(x,28)^rotrFixed(x,34)^rotrFixed(x,39)) +#define S1(x) (rotrFixed(x,14)^rotrFixed(x,18)^rotrFixed(x,41)) +#define s0(x) (rotrFixed(x,1)^rotrFixed(x,8)^(x>>7)) +#define s1(x) (rotrFixed(x,19)^rotrFixed(x,61)^(x>>6)) + +#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA512_K[i+j]+(j?blk2(i):blk0(i));\ + d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i)) + + word64 W[16]; + word64 T[8]; + /* Copy context->state[] to working vars */ + memcpy(T, state, sizeof(T)); + /* 80 operations, partially loop unrolled */ + for (unsigned int j=0; j<80; j+=16) + { + R( 0); R( 1); 
R( 2); R( 3); + R( 4); R( 5); R( 6); R( 7); + R( 8); R( 9); R(10); R(11); + R(12); R(13); R(14); R(15); + } + /* Add the working vars back into context.state[] */ + state[0] += a(0); + state[1] += b(0); + state[2] += c(0); + state[3] += d(0); + state[4] += e(0); + state[5] += f(0); + state[6] += g(0); + state[7] += h(0); } #endif @@ -23,9 +23,6 @@ public: static void CRYPTOPP_API InitState(HashWordType *state); static void CRYPTOPP_API Transform(word32 *digest, const word32 *data); static const char * CRYPTOPP_API StaticAlgorithmName() {return "SHA-256";} - -protected: - static const word32 K[64]; }; //! implements the SHA-224 standard @@ -46,9 +43,6 @@ public: static void CRYPTOPP_API InitState(HashWordType *state); static void CRYPTOPP_API Transform(word64 *digest, const word64 *data); static const char * CRYPTOPP_API StaticAlgorithmName() {return "SHA-512";} - -protected: - static const word64 K[80]; }; //! implements the SHA-384 standard @@ -3,6 +3,7 @@ #include "pch.h" #include "tiger.h" #include "misc.h" +#include "cpu.h" #ifdef WORD64_AVAILABLE @@ -24,13 +25,187 @@ void Tiger::TruncatedFinal(byte *hash, size_t size) m_data[7] = GetBitCountLo(); - Transform(m_digest, m_data); - CorrectEndianess(m_digest, m_digest, DigestSize()); - memcpy(hash, m_digest, size); + Transform(m_state, m_data); + CorrectEndianess(m_state, m_state, DigestSize()); + memcpy(hash, m_state, size); Restart(); // reinit for next use } +void Tiger::Transform (word64 *digest, const word64 *X) +{ +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + if (HasSSE2()) + { +#ifdef __GNUC__ + __asm__ __volatile__ + ( + ".intel_syntax noprefix;" + AS1( push ebx) +#else + AS2( mov eax, digest) + AS2( mov esi, X) + AS2( lea edx, [table]) +#endif + AS2( movq mm0, [eax]) + AS2( movq mm1, [eax+1*8]) + AS2( movq mm5, mm1) + AS2( movq mm2, [eax+2*8]) + AS2( movq mm7, [edx+4*2048+0*8]) + AS2( movq mm6, [edx+4*2048+1*8]) + AS2( mov ecx, esp) + AS2( and esp, 0xfffffff0) + AS2( sub esp, 8*8) + AS1( push ecx) + +#define 
SSE2_round(a,b,c,x,mul) \ + AS2( pxor c, [x])\ + AS2( movd ecx, c)\ + AS2( movzx edi, cl)\ + AS2( movq mm3, [edx+0*2048+edi*8])\ + AS2( movzx edi, ch)\ + AS2( movq mm4, [edx+3*2048+edi*8])\ + AS2( shr ecx, 16)\ + AS2( movzx edi, cl)\ + AS2( pxor mm3, [edx+1*2048+edi*8])\ + AS2( movzx edi, ch)\ + AS2( pxor mm4, [edx+2*2048+edi*8])\ + AS3( pextrw ecx, c, 2)\ + AS2( movzx edi, cl)\ + AS2( pxor mm3, [edx+2*2048+edi*8])\ + AS2( movzx edi, ch)\ + AS2( pxor mm4, [edx+1*2048+edi*8])\ + AS3( pextrw ecx, c, 3)\ + AS2( movzx edi, cl)\ + AS2( pxor mm3, [edx+3*2048+edi*8])\ + AS2( psubq a, mm3)\ + AS2( movzx edi, ch)\ + AS2( pxor mm4, [edx+0*2048+edi*8])\ + AS2( paddq b, mm4)\ + SSE2_mul_##mul(b) + +#define SSE2_mul_5(b) \ + AS2( movq mm3, b)\ + AS2( psllq b, 2)\ + AS2( paddq b, mm3) + +#define SSE2_mul_7(b) \ + AS2( movq mm3, b)\ + AS2( psllq b, 3)\ + AS2( psubq b, mm3) + +#define SSE2_mul_9(b) \ + AS2( movq mm3, b)\ + AS2( psllq b, 3)\ + AS2( paddq b, mm3) + +#define label2_5 1 +#define label2_7 2 +#define label2_9 3 + +#define SSE2_pass(A,B,C,mul,X) \ + AS2( xor ebx, ebx)\ + ASL(mul)\ + SSE2_round(A,B,C,X+0*8+ebx,mul)\ + SSE2_round(B,C,A,X+1*8+ebx,mul)\ + AS2( cmp ebx, 6*8)\ + ASJ( je, label2_##mul, f)\ + SSE2_round(C,A,B,X+2*8+ebx,mul)\ + AS2( add ebx, 3*8)\ + ASJ( jmp, mul, b)\ + ASL(label2_##mul) + +#define SSE2_key_schedule(Y,X) \ + AS2( movq mm3, [X+7*8])\ + AS2( pxor mm3, mm6)\ + AS2( movq mm4, [X+0*8])\ + AS2( psubq mm4, mm3)\ + AS2( movq [Y+0*8], mm4)\ + AS2( pxor mm4, [X+1*8])\ + AS2( movq mm3, mm4)\ + AS2( movq [Y+1*8], mm4)\ + AS2( paddq mm4, [X+2*8])\ + AS2( pxor mm3, mm7)\ + AS2( psllq mm3, 19)\ + AS2( movq [Y+2*8], mm4)\ + AS2( pxor mm3, mm4)\ + AS2( movq mm4, [X+3*8])\ + AS2( psubq mm4, mm3)\ + AS2( movq [Y+3*8], mm4)\ + AS2( pxor mm4, [X+4*8])\ + AS2( movq mm3, mm4)\ + AS2( movq [Y+4*8], mm4)\ + AS2( paddq mm4, [X+5*8])\ + AS2( pxor mm3, mm7)\ + AS2( psrlq mm3, 23)\ + AS2( movq [Y+5*8], mm4)\ + AS2( pxor mm3, mm4)\ + AS2( movq mm4, [X+6*8])\ + AS2( psubq mm4, 
mm3)\ + AS2( movq [Y+6*8], mm4)\ + AS2( pxor mm4, [X+7*8])\ + AS2( movq mm3, mm4)\ + AS2( movq [Y+7*8], mm4)\ + AS2( paddq mm4, [Y+0*8])\ + AS2( pxor mm3, mm7)\ + AS2( psllq mm3, 19)\ + AS2( movq [Y+0*8], mm4)\ + AS2( pxor mm3, mm4)\ + AS2( movq mm4, [Y+1*8])\ + AS2( psubq mm4, mm3)\ + AS2( movq [Y+1*8], mm4)\ + AS2( pxor mm4, [Y+2*8])\ + AS2( movq mm3, mm4)\ + AS2( movq [Y+2*8], mm4)\ + AS2( paddq mm4, [Y+3*8])\ + AS2( pxor mm3, mm7)\ + AS2( psrlq mm3, 23)\ + AS2( movq [Y+3*8], mm4)\ + AS2( pxor mm3, mm4)\ + AS2( movq mm4, [Y+4*8])\ + AS2( psubq mm4, mm3)\ + AS2( movq [Y+4*8], mm4)\ + AS2( pxor mm4, [Y+5*8])\ + AS2( movq [Y+5*8], mm4)\ + AS2( paddq mm4, [Y+6*8])\ + AS2( movq [Y+6*8], mm4)\ + AS2( pxor mm4, [edx+4*2048+2*8])\ + AS2( movq mm3, [Y+7*8])\ + AS2( psubq mm3, mm4)\ + AS2( movq [Y+7*8], mm3) + + SSE2_pass(mm0, mm1, mm2, 5, esi) + SSE2_key_schedule(esp+4, esi) + SSE2_pass(mm2, mm0, mm1, 7, esp+4) + SSE2_key_schedule(esp+4, esp+4) + SSE2_pass(mm1, mm2, mm0, 9, esp+4) + + AS2( pxor mm0, [eax+0*8]) + AS2( movq [eax+0*8], mm0) + AS2( psubq mm1, mm5) + AS2( movq [eax+1*8], mm1) + AS2( paddq mm2, [eax+2*8]) + AS2( movq [eax+2*8], mm2) + + AS1( pop esp) + AS1( emms) +#ifdef __GNUC__ + AS1( pop ebx) + ".att_syntax prefix;" + : + : "a" (digest), "S" (X), "d" (table) + : "%ecx", "%edi", "memory", "cc" + ); +#endif + } + else +#endif + { + word64 a = digest[0]; + word64 b = digest[1]; + word64 c = digest[2]; + word64 Y[8]; + #define t1 (table) #define t2 (table+256) #define t3 (table+256*2) @@ -42,15 +217,17 @@ void Tiger::TruncatedFinal(byte *hash, size_t size) b += t4[GETBYTE(c,1)] ^ t3[GETBYTE(c,3)] ^ t2[GETBYTE(c,5)] ^ t1[GETBYTE(c,7)]; \ b *= mul -#define pass(a,b,c,mul,X) \ - round(a,b,c,X[0],mul); \ - round(b,c,a,X[1],mul); \ - round(c,a,b,X[2],mul); \ - round(a,b,c,X[3],mul); \ - round(b,c,a,X[4],mul); \ - round(c,a,b,X[5],mul); \ - round(a,b,c,X[6],mul); \ - round(b,c,a,X[7],mul) +#define pass(a,b,c,mul,X) {\ + int i=0;\ + while (true)\ + {\ + 
round(a,b,c,X[i+0],mul); \ + round(b,c,a,X[i+1],mul); \ + if (i==6)\ + break;\ + round(c,a,b,X[i+2],mul); \ + i+=3;\ + }} #define key_schedule(Y,X) \ Y[0] = X[0] - (X[7]^W64LIT(0xA5A5A5A5A5A5A5A5)); \ @@ -70,24 +247,16 @@ void Tiger::TruncatedFinal(byte *hash, size_t size) Y[6] += Y[5]; \ Y[7] -= Y[6] ^ W64LIT(0x0123456789ABCDEF) -void Tiger::Transform (word64 *digest, const word64 *X) -{ - word64 a = digest[0]; - word64 b = digest[1]; - word64 c = digest[2]; - word64 Y[8]; - - pass(a,b,c,5,X); - key_schedule(Y,X); - pass(c,a,b,7,Y); - key_schedule(Y,Y); - pass(b,c,a,9,Y); - - digest[0] = a ^ digest[0]; - digest[1] = b - digest[1]; - digest[2] = c + digest[2]; - - memset(Y, 0, sizeof(Y)); + pass(a,b,c,5,X); + key_schedule(Y,X); + pass(c,a,b,7,Y); + key_schedule(Y,Y); + pass(b,c,a,9,Y); + + digest[0] = a ^ digest[0]; + digest[1] = b - digest[1]; + digest[2] = c + digest[2]; + } } NAMESPACE_END @@ -9,7 +9,7 @@ NAMESPACE_BEGIN(CryptoPP) -/// <a href="http://www.weidai.com/scan-mirror/md.html#Tiger">Tiger</a> +/// <a href="http://www.cryptolounge.org/wiki/Tiger">Tiger</a> class Tiger : public IteratedHashWithStaticTransform<word64, LittleEndian, 64, 24, Tiger> { public: @@ -19,7 +19,7 @@ public: static const char * StaticAlgorithmName() {return "Tiger";} protected: - static const word64 table[4*256]; + static const word64 table[4*256+3]; }; NAMESPACE_END diff --git a/whrlpool.cpp b/whrlpool.cpp index 989281a3..da19d7ff 100644 --- a/whrlpool.cpp +++ b/whrlpool.cpp @@ -1,7 +1,7 @@ -// Whrlpool.cpp - modified by Kevin Springle from +// whrlpool.cpp - originally modified by Kevin Springle from // Paulo Barreto and Vincent Rijmen's public domain code, whirlpool.c. 
+// Updated to Whirlpool version 3.0, optimized and MMX version added by Wei Dai // Any modifications are placed in the public domain -// Updated to Whirlpool version 3.0 by Wei Dai // This is the original introductory comment: @@ -69,6 +69,7 @@ #include "whrlpool.h" #include "misc.h" +#include "cpu.h" NAMESPACE_BEGIN(CryptoPP) @@ -94,9 +95,9 @@ void Whirlpool::TruncatedFinal(byte *hash, size_t size) m_data[m_data.size()-2] = GetBitCountHi(); m_data[m_data.size()-1] = GetBitCountLo(); - Transform(m_digest, m_data); - CorrectEndianess(m_digest, m_digest, DigestSize()); - memcpy(hash, m_digest, size); + Transform(m_state, m_data); + CorrectEndianess(m_state, m_state, DigestSize()); + memcpy(hash, m_state, size); Restart(); // reinit for next use } @@ -113,7 +114,7 @@ void Whirlpool::TruncatedFinal(byte *hash, size_t size) * employed). */ -static const word64 C0[256] = { +CRYPTOPP_ALIGN_DATA(16) static const word64 Whirlpool_C[4*256+R] CRYPTOPP_SECTION_ALIGN16 = { W64LIT(0x18186018c07830d8), W64LIT(0x23238c2305af4626), W64LIT(0xc6c63fc67ef991b8), W64LIT(0xe8e887e8136fcdfb), W64LIT(0x878726874ca113cb), W64LIT(0xb8b8dab8a9626d11), W64LIT(0x0101040108050209), W64LIT(0x4f4f214f426e9e0d), W64LIT(0x3636d836adee6c9b), W64LIT(0xa6a6a2a6590451ff), W64LIT(0xd2d26fd2debdb90c), W64LIT(0xf5f5f3f5fb06f70e), @@ -177,11 +178,9 @@ static const word64 C0[256] = { W64LIT(0x16165816b04e2ca6), W64LIT(0x3a3ae83acdd274f7), W64LIT(0x6969b9696fd0d206), W64LIT(0x09092409482d1241), W64LIT(0x7070dd70a7ade0d7), W64LIT(0xb6b6e2b6d954716f), W64LIT(0xd0d067d0ceb7bd1e), W64LIT(0xeded93ed3b7ec7d6), W64LIT(0xcccc17cc2edb85e2), W64LIT(0x424215422a578468), W64LIT(0x98985a98b4c22d2c), W64LIT(0xa4a4aaa4490e55ed), - W64LIT(0x2828a0285d885075), W64LIT(0x5c5c6d5cda31b886), W64LIT(0xf8f8c7f8933fed6b), W64LIT(0x8686228644a411c2), -}; + W64LIT(0x2828a0285d885075), W64LIT(0x5c5c6d5cda31b886), W64LIT(0xf8f8c7f8933fed6b), W64LIT(0x8686228644a411c2), -static const word64 C1[256] = { - W64LIT(0xd818186018c07830), 
W64LIT(0x2623238c2305af46), W64LIT(0xb8c6c63fc67ef991), W64LIT(0xfbe8e887e8136fcd), + W64LIT(0xd818186018c07830), W64LIT(0x2623238c2305af46), W64LIT(0xb8c6c63fc67ef991), W64LIT(0xfbe8e887e8136fcd), W64LIT(0xcb878726874ca113), W64LIT(0x11b8b8dab8a9626d), W64LIT(0x0901010401080502), W64LIT(0x0d4f4f214f426e9e), W64LIT(0x9b3636d836adee6c), W64LIT(0xffa6a6a2a6590451), W64LIT(0x0cd2d26fd2debdb9), W64LIT(0x0ef5f5f3f5fb06f7), W64LIT(0x967979f979ef80f2), W64LIT(0x306f6fa16f5fcede), W64LIT(0x6d91917e91fcef3f), W64LIT(0xf852525552aa07a4), @@ -245,10 +244,8 @@ static const word64 C1[256] = { W64LIT(0xd77070dd70a7ade0), W64LIT(0x6fb6b6e2b6d95471), W64LIT(0x1ed0d067d0ceb7bd), W64LIT(0xd6eded93ed3b7ec7), W64LIT(0xe2cccc17cc2edb85), W64LIT(0x68424215422a5784), W64LIT(0x2c98985a98b4c22d), W64LIT(0xeda4a4aaa4490e55), W64LIT(0x752828a0285d8850), W64LIT(0x865c5c6d5cda31b8), W64LIT(0x6bf8f8c7f8933fed), W64LIT(0xc28686228644a411), -}; -static const word64 C2[256] = { - W64LIT(0x30d818186018c078), W64LIT(0x462623238c2305af), W64LIT(0x91b8c6c63fc67ef9), W64LIT(0xcdfbe8e887e8136f), + W64LIT(0x30d818186018c078), W64LIT(0x462623238c2305af), W64LIT(0x91b8c6c63fc67ef9), W64LIT(0xcdfbe8e887e8136f), W64LIT(0x13cb878726874ca1), W64LIT(0x6d11b8b8dab8a962), W64LIT(0x0209010104010805), W64LIT(0x9e0d4f4f214f426e), W64LIT(0x6c9b3636d836adee), W64LIT(0x51ffa6a6a2a65904), W64LIT(0xb90cd2d26fd2debd), W64LIT(0xf70ef5f5f3f5fb06), W64LIT(0xf2967979f979ef80), W64LIT(0xde306f6fa16f5fce), W64LIT(0x3f6d91917e91fcef), W64LIT(0xa4f852525552aa07), @@ -312,10 +309,8 @@ static const word64 C2[256] = { W64LIT(0xe0d77070dd70a7ad), W64LIT(0x716fb6b6e2b6d954), W64LIT(0xbd1ed0d067d0ceb7), W64LIT(0xc7d6eded93ed3b7e), W64LIT(0x85e2cccc17cc2edb), W64LIT(0x8468424215422a57), W64LIT(0x2d2c98985a98b4c2), W64LIT(0x55eda4a4aaa4490e), W64LIT(0x50752828a0285d88), W64LIT(0xb8865c5c6d5cda31), W64LIT(0xed6bf8f8c7f8933f), W64LIT(0x11c28686228644a4), -}; -static const word64 C3[256] = { - W64LIT(0x7830d818186018c0), 
W64LIT(0xaf462623238c2305), W64LIT(0xf991b8c6c63fc67e), W64LIT(0x6fcdfbe8e887e813), + W64LIT(0x7830d818186018c0), W64LIT(0xaf462623238c2305), W64LIT(0xf991b8c6c63fc67e), W64LIT(0x6fcdfbe8e887e813), W64LIT(0xa113cb878726874c), W64LIT(0x626d11b8b8dab8a9), W64LIT(0x0502090101040108), W64LIT(0x6e9e0d4f4f214f42), W64LIT(0xee6c9b3636d836ad), W64LIT(0x0451ffa6a6a2a659), W64LIT(0xbdb90cd2d26fd2de), W64LIT(0x06f70ef5f5f3f5fb), W64LIT(0x80f2967979f979ef), W64LIT(0xcede306f6fa16f5f), W64LIT(0xef3f6d91917e91fc), W64LIT(0x07a4f852525552aa), @@ -379,9 +374,7 @@ static const word64 C3[256] = { W64LIT(0xade0d77070dd70a7), W64LIT(0x54716fb6b6e2b6d9), W64LIT(0xb7bd1ed0d067d0ce), W64LIT(0x7ec7d6eded93ed3b), W64LIT(0xdb85e2cccc17cc2e), W64LIT(0x578468424215422a), W64LIT(0xc22d2c98985a98b4), W64LIT(0x0e55eda4a4aaa449), W64LIT(0x8850752828a0285d), W64LIT(0x31b8865c5c6d5cda), W64LIT(0x3fed6bf8f8c7f893), W64LIT(0xa411c28686228644), -}; -static const word64 rc[R] = { W64LIT(0x1823c6e887b8014f), W64LIT(0x36a6d2f5796f9152), W64LIT(0x60bc9b8ea30c7b35), @@ -397,55 +390,292 @@ static const word64 rc[R] = { // Whirlpool basic transformation. Transforms state based on block. 
void Whirlpool::Transform(word64 *digest, const word64 *block) { +#ifdef CRYPTOPP_X86_ASM_AVAILABLE + if (HasMMX()) + { + // MMX version has the same structure as C version below +#ifdef __GNUC__ + __asm__ __volatile__ + ( + ".intel_syntax noprefix;" + AS1( push ebx) + AS2( mov ebx, eax) +#else + AS2( lea ebx, [Whirlpool_C]) + AS2( mov ecx, digest) + AS2( mov edx, block) +#endif + AS2( mov eax, esp) + AS2( and esp, 0xfffffff0) + AS2( sub esp, 16*8) + AS1( push eax) + AS2( xor esi, esi) + ASL(0) + AS2( movq mm0, [ecx+8*esi]) + AS2( movq [esp+4+8*esi], mm0) // k + AS2( pxor mm0, [edx+8*esi]) + AS2( movq [esp+4+64+8*esi], mm0) // s + AS2( movq [ecx+8*esi], mm0) + AS1( inc esi) + AS2( cmp esi, 8) + ASJ( jne, 0, b) + + AS2( xor esi, esi) + ASL(1) + +#define KSL0(a, b) AS2(movq mm##a, b) +#define KSL1(a, b) AS2(pxor mm##a, b) + +#define KSL(op, i, a, b, c, d) \ + AS2(mov eax, [esp+4+8*i])\ + AS2(movzx edi, al)\ + KSL##op(a, [ebx+3*2048+8*edi])\ + AS2(movzx edi, ah)\ + KSL##op(b, [ebx+2*2048+8*edi])\ + AS2(shr eax, 16)\ + AS2(movzx edi, al)\ + AS2(shr eax, 8)\ + KSL##op(c, [ebx+1*2048+8*edi])\ + KSL##op(d, [ebx+0*2048+8*eax]) + +#define KSH0(a, b) \ + ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\ + AS2(pxor mm##a, b) +#define KSH1(a, b) \ + AS2(pxor mm##a, b) +#define KSH2(a, b) \ + AS2(pxor mm##a, b)\ + AS2(movq [esp+4+8*a], mm##a) + +#define KSH(op, i, a, b, c, d) \ + AS2(mov eax, [esp+4+8*((i+4)-8*((i+4)/8))+4])\ + AS2(movzx edi, al)\ + KSH##op(a, [ebx+3*2048+8*edi])\ + AS2(movzx edi, ah)\ + KSH##op(b, [ebx+2*2048+8*edi])\ + AS2(shr eax, 16)\ + AS2(movzx edi, al)\ + AS2(shr eax, 8)\ + KSH##op(c, [ebx+1*2048+8*edi])\ + KSH##op(d, [ebx+0*2048+8*eax]) + +#define TSL(op, i, a, b, c, d) \ + AS2(mov eax, [esp+4+64+8*i])\ + AS2(movzx edi, al)\ + KSL##op(a, [ebx+3*2048+8*edi])\ + AS2(movzx edi, ah)\ + KSL##op(b, [ebx+2*2048+8*edi])\ + AS2(shr eax, 16)\ + AS2(movzx edi, al)\ + AS2(shr eax, 8)\ + KSL##op(c, [ebx+1*2048+8*edi])\ + KSL##op(d, [ebx+0*2048+8*eax]) + +#define TSH0(a, b) \ + 
ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\ + AS2(pxor mm##a, [esp+4+8*a])\ + AS2(pxor mm##a, b) +#define TSH1(a, b) \ + AS2(pxor mm##a, b) +#define TSH2(a, b) \ + AS2(pxor mm##a, b)\ + AS2(movq [esp+4+64+8*a], mm##a) +#define TSH3(a, b) \ + AS2(pxor mm##a, b)\ + AS2(pxor mm##a, [ecx+8*a])\ + AS2(movq [ecx+8*a], mm##a) + +#define TSH(op, i, a, b, c, d) \ + AS2(mov eax, [esp+4+64+8*((i+4)-8*((i+4)/8))+4])\ + AS2(movzx edi, al)\ + TSH##op(a, [ebx+3*2048+8*edi])\ + AS2(movzx edi, ah)\ + TSH##op(b, [ebx+2*2048+8*edi])\ + AS2(shr eax, 16)\ + AS2(movzx edi, al)\ + AS2(shr eax, 8)\ + TSH##op(c, [ebx+1*2048+8*edi])\ + TSH##op(d, [ebx+0*2048+8*eax]) + + KSL(0, 4, 3, 2, 1, 0) + KSL(0, 0, 7, 6, 5, 4) + KSL(1, 1, 0, 7, 6, 5) + KSL(1, 2, 1, 0, 7, 6) + KSL(1, 3, 2, 1, 0, 7) + KSL(1, 5, 4, 3, 2, 1) + KSL(1, 6, 5, 4, 3, 2) + KSL(1, 7, 6, 5, 4, 3) + KSH(0, 0, 7, 6, 5, 4) + KSH(0, 4, 3, 2, 1, 0) + KSH(1, 1, 0, 7, 6, 5) + KSH(1, 2, 1, 0, 7, 6) + KSH(1, 5, 4, 3, 2, 1) + KSH(1, 6, 5, 4, 3, 2) + KSH(2, 3, 2, 1, 0, 7) + KSH(2, 7, 6, 5, 4, 3) + + AS2( pxor mm0, [ebx + 8*1024 + esi*8]) + AS2( movq [esp+4], mm0) + + TSL(0, 4, 3, 2, 1, 0) + TSL(0, 0, 7, 6, 5, 4) + TSL(1, 1, 0, 7, 6, 5) + TSL(1, 2, 1, 0, 7, 6) + TSL(1, 3, 2, 1, 0, 7) + TSL(1, 5, 4, 3, 2, 1) + TSL(1, 6, 5, 4, 3, 2) + TSL(1, 7, 6, 5, 4, 3) + TSH(0, 0, 7, 6, 5, 4) + TSH(0, 4, 3, 2, 1, 0) + TSH(1, 1, 0, 7, 6, 5) + TSH(1, 2, 1, 0, 7, 6) + TSH(1, 5, 4, 3, 2, 1) + TSH(1, 6, 5, 4, 3, 2) + + AS1( inc esi) + AS2( cmp esi, 10) + ASJ( je, 2, f) + + TSH(2, 3, 2, 1, 0, 7) + TSH(2, 7, 6, 5, 4, 3) + + ASJ( jmp, 1, b) + ASL(2) + + TSH(3, 3, 2, 1, 0, 7) + TSH(3, 7, 6, 5, 4, 3) + +#undef KSL +#undef KSH +#undef TSL +#undef TSH + + AS1( emms) + AS1( pop esp) + +#ifdef __GNUC__ + AS1( pop ebx) + ".att_syntax prefix;" + : + : "a" (Whirlpool_C), "c" (digest), "d" (block) + : "%esi", "%edi", "memory", "cc" + ); +#endif + } + else +#endif // #ifdef CRYPTOPP_X86_ASM_AVAILABLE + { word64 s[8]; // the cipher state word64 k[8]; // the round key // Compute and 
apply K^0 to the cipher state // Also apply part of the Miyaguchi-Preneel compression function - digest[0] = s[0] = block[0] ^ (k[0] = digest[0]); - digest[1] = s[1] = block[1] ^ (k[1] = digest[1]); - digest[2] = s[2] = block[2] ^ (k[2] = digest[2]); - digest[3] = s[3] = block[3] ^ (k[3] = digest[3]); - digest[4] = s[4] = block[4] ^ (k[4] = digest[4]); - digest[5] = s[5] = block[5] ^ (k[5] = digest[5]); - digest[6] = s[6] = block[6] ^ (k[6] = digest[6]); - digest[7] = s[7] = block[7] ^ (k[7] = digest[7]); + for (int i=0; i<8; i++) + digest[i] = s[i] = block[i] ^ (k[i] = digest[i]); + +#define KSL(op, i, a, b, c, d) \ + t = (word32)k[i];\ + w##a = Whirlpool_C[3*256 + (byte)t] ^ (op ? w##a : 0);\ + t >>= 8;\ + w##b = Whirlpool_C[2*256 + (byte)t] ^ (op ? w##b : 0);\ + t >>= 8;\ + w##c = Whirlpool_C[1*256 + (byte)t] ^ (op ? w##c : 0);\ + t >>= 8;\ + w##d = Whirlpool_C[0*256 + t] ^ (op ? w##d : 0); + +#define KSH(op, i, a, b, c, d) \ + t = (word32)(k[(i+4)%8]>>32);\ + w##a = Whirlpool_C[3*256 + (byte)t] ^ (op ? w##a : rotrFixed(w##a, 32));\ + if (op==2) k[a] = w##a;\ + t >>= 8;\ + w##b = Whirlpool_C[2*256 + (byte)t] ^ (op ? w##b : rotrFixed(w##b, 32));\ + if (op==2) k[b] = w##b;\ + t >>= 8;\ + w##c = Whirlpool_C[1*256 + (byte)t] ^ (op ? w##c : rotrFixed(w##c, 32));\ + if (op==2) k[c] = w##c;\ + t >>= 8;\ + w##d = Whirlpool_C[0*256 + t] ^ (op ? w##d : rotrFixed(w##d, 32));\ + if (op==2) k[d] = w##d;\ + +#define TSL(op, i, a, b, c, d) \ + t = (word32)s[i];\ + w##a = Whirlpool_C[3*256 + (byte)t] ^ (op ? w##a : 0);\ + t >>= 8;\ + w##b = Whirlpool_C[2*256 + (byte)t] ^ (op ? w##b : 0);\ + t >>= 8;\ + w##c = Whirlpool_C[1*256 + (byte)t] ^ (op ? w##c : 0);\ + t >>= 8;\ + w##d = Whirlpool_C[0*256 + t] ^ (op ? w##d : 0); + +#define TSH_OP(op, a, b) \ + w##a = Whirlpool_C[b*256 + (byte)t] ^ (op ? 
w##a : rotrFixed(w##a, 32) ^ k[a]);\ + if (op==2) s[a] = w##a;\ + if (op==3) digest[a] ^= w##a;\ + +#define TSH(op, i, a, b, c, d) \ + t = (word32)(s[(i+4)%8]>>32);\ + TSH_OP(op, a, 3);\ + t >>= 8;\ + TSH_OP(op, b, 2);\ + t >>= 8;\ + TSH_OP(op, c, 1);\ + t >>= 8;\ + TSH_OP(op, d, 0);\ // Iterate over all rounds: - for (int r = 0; r < R; r++) + int r=0; + while (true) { word64 w0, w1, w2, w3, w4, w5, w6, w7; // temporary storage - word64 t; - - // Compute K^r from K^{r-1}: -#define K(i,j) GETBYTE(k[(i+j+1)%8], j) -#define KS(i) \ - t = C0[K(i,3)] ^ C1[K(i,2)] ^ C2[K(i,1)] ^ C3[K(i,0)]; \ - w##i = rotrFixed(t, 32) ^ C0[K(i,7)] ^ C1[K(i,6)] ^ C2[K(i,5)] ^ C3[K(i,4)]; - - KS(0); KS(1); KS(2); KS(3); KS(4); KS(5); KS(6); KS(7); - k[0] = w0 ^ rc[r]; - k[1] = w1; k[2] = w2; k[3] = w3; k[4] = w4; k[5] = w5; k[6] = w6; k[7] = w7; - - // Apply the r-th round transformation: -#define S(i,j) GETBYTE(s[(i+j+1)%8], j) -#define TS(i) \ - t = C0[S(i,3)] ^ C1[S(i,2)] ^ C2[S(i,1)] ^ C3[S(i,0)]; \ - w##i = rotrFixed(t, 32) ^ C0[S(i,7)] ^ C1[S(i,6)] ^ C2[S(i,5)] ^ C3[S(i,4)] ^ k[i]; - - TS(0); TS(1); TS(2); TS(3); TS(4); TS(5); TS(6); TS(7); - s[0] = w0; s[1] = w1; s[2] = w2; s[3] = w3; s[4] = w4; s[5] = w5; s[6] = w6; s[7] = w7; - } + word32 t; + + KSL(0, 4, 3, 2, 1, 0) + KSL(0, 0, 7, 6, 5, 4) + KSL(1, 1, 0, 7, 6, 5) + KSL(1, 2, 1, 0, 7, 6) + KSL(1, 3, 2, 1, 0, 7) + KSL(1, 5, 4, 3, 2, 1) + KSL(1, 6, 5, 4, 3, 2) + KSL(1, 7, 6, 5, 4, 3) + KSH(0, 0, 7, 6, 5, 4) + KSH(0, 4, 3, 2, 1, 0) + KSH(1, 1, 0, 7, 6, 5) + KSH(1, 2, 1, 0, 7, 6) + KSH(1, 5, 4, 3, 2, 1) + KSH(1, 6, 5, 4, 3, 2) + KSH(2, 3, 2, 1, 0, 7) + KSH(2, 7, 6, 5, 4, 3) - // Apply the rest of the Miyaguchi-Preneel compression function: - digest[0] ^= s[0]; - digest[1] ^= s[1]; - digest[2] ^= s[2]; - digest[3] ^= s[3]; - digest[4] ^= s[4]; - digest[5] ^= s[5]; - digest[6] ^= s[6]; - digest[7] ^= s[7]; + k[0] ^= Whirlpool_C[1024+r]; + + TSL(0, 4, 3, 2, 1, 0) + TSL(0, 0, 7, 6, 5, 4) + TSL(1, 1, 0, 7, 6, 5) + TSL(1, 2, 1, 0, 7, 6) + 
TSL(1, 3, 2, 1, 0, 7) + TSL(1, 5, 4, 3, 2, 1) + TSL(1, 6, 5, 4, 3, 2) + TSL(1, 7, 6, 5, 4, 3) + TSH(0, 0, 7, 6, 5, 4) + TSH(0, 4, 3, 2, 1, 0) + TSH(1, 1, 0, 7, 6, 5) + TSH(1, 2, 1, 0, 7, 6) + TSH(1, 5, 4, 3, 2, 1) + TSH(1, 6, 5, 4, 3, 2) + + if (++r < R) + { + TSH(2, 3, 2, 1, 0, 7) + TSH(2, 7, 6, 5, 4, 3) + } + else + { + TSH(3, 3, 2, 1, 0, 7) + TSH(3, 7, 6, 5, 4, 3) + break; + } + } + } } NAMESPACE_END @@ -9,8 +9,7 @@ NAMESPACE_BEGIN(CryptoPP) -//! <a href="http://www.weidai.com/scan-mirror/md.html#Whirlpool">Whirlpool</a> -/*! 512 Bit Hash */ +//! <a href="http://www.cryptolounge.org/wiki/Whirlpool">Whirlpool</a> class Whirlpool : public IteratedHashWithStaticTransform<word64, BigEndian, 64, 64, Whirlpool> { public: |