Diffstat (limited to 'integer.cpp')
-rw-r--r-- | integer.cpp | 1400
1 file changed, 671 insertions, 729 deletions
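The hunks below add runtime SSE2 detection (GetSSE2Capability/HasSSE2/DisableSSE2) so that the P4Optimized routines can be selected at run time. A minimal sketch of that cpuid probe, assuming 32-bit x86 and GCC extended asm (DetectSSE2 is an illustrative standalone name, not Crypto++ API; GCC building PIC code may reject the %ebx clobber):

static bool DetectSSE2()
{
	unsigned int features;
	__asm__ volatile
	(
		"mov $1, %%eax\n\t"
		"cpuid\n\t"
		"mov %%edx, %0\n\t"
		: "=r" (features)
		:
		: "%eax", "%ebx", "%ecx", "%edx"	// cpuid writes EAX, EBX, ECX and EDX
	);
	return (features & (1u << 26)) != 0;	// EDX bit 26 indicates SSE2 support
}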
diff --git a/integer.cpp b/integer.cpp index f5b5fc4..93539dd 100644 --- a/integer.cpp +++ b/integer.cpp @@ -18,9 +18,16 @@ #include <iostream> #ifdef SSE2_INTRINSICS_AVAILABLE -#include <emmintrin.h> + #ifdef __GNUC__ + #include <xmmintrin.h> + #include <malloc.h> + #else + #include <emmintrin.h> + #endif #elif defined(_MSC_VER) && defined(_M_IX86) -#pragma message("You do no seem to have the Visual C++ Processor Pack installed, so use of SSE2 intrinsics will be disabled.") + #pragma message("You do no seem to have the Visual C++ Processor Pack installed, so use of SSE2 intrinsics will be disabled.") +#elif defined(__GNUC__) && defined(__i386__) + #pragma message("You do not have GCC 3.3 or later, or did not specify -msse2 compiler option, so use of SSE2 intrinsics will be disabled.") #endif NAMESPACE_BEGIN(CryptoPP) @@ -41,7 +48,11 @@ CPP_TYPENAME AllocatorBase<T>::pointer AlignedAllocator<T>::allocate(size_type n { #ifdef SSE2_INTRINSICS_AVAILABLE if (n >= 4) - return (T *)_mm_malloc(sizeof(T)*n, 16); + #ifdef __GNUC__ + return (T *)memalign(16, sizeof(T)*n); + #else + return (T *)_mm_malloc(sizeof(T)*n, 16); + #endif else #endif return new T[n]; @@ -53,10 +64,14 @@ void AlignedAllocator<T>::deallocate(void *p, size_type n) memset(p, 0, n*sizeof(T)); #ifdef SSE2_INTRINSICS_AVAILABLE if (n >= 4) - _mm_free(p); + #ifdef __GNUC__ + free(p); + #else + _mm_free(p); + #endif else #endif - delete [] p; + delete [] (T *)p; } #endif @@ -640,6 +655,13 @@ void Portable::Square2(word *R, const word *A) void Portable::Square4(word *R, const word *A) { +#ifdef _MSC_VER + // VC60 workaround: MSVC 6.0 has an optimization bug that makes + // (dword)A*B where either A or B has been cast to a dword before + // very expensive. Revisit this function when this + // bug is fixed. + Multiply4(R, A, A); +#else const word *B = A; DWord p, q; word c, d, e; @@ -666,6 +688,7 @@ void Portable::Square4(word *R, const word *A) p = DWord::MultiplyAndAdd(A[3], A[3], d); R[6] = p.GetLowHalf(); R[7] = e + p.GetHighHalf(); +#endif } void Portable::Multiply8(word *R, const word *A, const word *B) @@ -834,125 +857,600 @@ void Portable::Multiply8Bottom(word *R, const word *A, const word *B) #undef SaveSquAcc // CodeWarrior defines _MSC_VER -#if defined(_MSC_VER) && !defined(__MWERKS__) && defined(_M_IX86) && (_M_IX86<=700) +#if (defined(_MSC_VER) && !defined(__MWERKS__) && defined(_M_IX86)) || (defined(__GNUC__) && defined(__i386__)) class PentiumOptimized : public Portable { public: - static word __fastcall Add(word *C, const word *A, const word *B, unsigned int N); - static word __fastcall Subtract(word *C, const word *A, const word *B, unsigned int N); -// TODO test this with .NET #if _MSC_VER < 1300 - static inline void Square4(word *R, const word *A) - { - // VC60 workaround: MSVC 6.0 has an optimization bug that makes - // (dword)A*B where either A or B has been cast to a dword before - // very expensive. Revisit this function when this - // bug is fixed. 
- Multiply4(R, A, A); - } -//#endif + static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N); + static word CRYPTOPP_CDECL Subtract(word *C, const word *A, const word *B, unsigned int N); +#ifdef __GNUC__ + static void Square4(word *R, const word *A); + static void Multiply4(word *C, const word *A, const word *B); + static void Multiply8(word *C, const word *A, const word *B); +#endif }; typedef PentiumOptimized LowLevel; -__declspec(naked) word __fastcall PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N) +// this may be selected at run time +class P4Optimized : public PentiumOptimized { - __asm - { - push ebp - push ebx - push esi - push edi +public: + static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N); + static word CRYPTOPP_CDECL Subtract(word *C, const word *A, const word *B, unsigned int N); +#ifdef SSE2_INTRINSICS_AVAILABLE + static void Multiply4(word *C, const word *A, const word *B); + static void Multiply8(word *C, const word *A, const word *B); + static void Multiply8Bottom(word *C, const word *A, const word *B); + static inline void Square4(word *R, const word *A) {Multiply4(R, A, A);} +#endif +}; + +// use some tricks to share assembly code between MSVC and GCC +#ifdef _MSC_VER + #define CRYPTOPP_NAKED __declspec(naked) + #define AS1(x) __asm x + #define AS2(x, y) __asm x, y + #define PentiumPrologue \ + __asm push ebp \ + __asm push ebx \ + __asm push esi \ + __asm push edi \ + __asm mov ecx, [esp+20] \ + __asm mov edx, [esp+24] \ + __asm mov ebx, [esp+28] \ + __asm mov esi, [esp+32] + #define PentiumEpilogue \ + __asm pop edi \ + __asm pop esi \ + __asm pop ebx \ + __asm pop ebp \ + __asm ret + #define P4Prologue \ + __asm sub esp, 16 \ + __asm mov [esp], edi \ + __asm mov [esp+4], esi \ + __asm mov [esp+8], ebx \ + __asm mov [esp+12], ebp \ + __asm mov ecx, [esp+20] \ + __asm mov edx, [esp+24] \ + __asm mov ebx, [esp+28] \ + __asm mov esi, [esp+32] + #define P4Epilogue \ + __asm mov edi, [esp] \ + __asm mov esi, [esp+4] \ + __asm mov ebx, [esp+8] \ + __asm mov ebp, [esp+12] \ + __asm add esp, 16 \ + __asm ret +#else + #define CRYPTOPP_NAKED + #define AS1(x) #x ";" + #define AS2(x, y) #x ", " #y ";" + #define PentiumPrologue \ + __asm__ \ + ( \ + ".att_syntax prefix;" \ + "mov %0, %%ecx;" \ + "mov %1, %%edx;" \ + "mov %2, %%ebx;" \ + "mov %3, %%esi;" \ + ".intel_syntax noprefix;" + #define PentiumEpilogue \ + ".att_syntax prefix;" \ + : \ + : "m" (C), "m" (A), "m" (B), "m" (N) \ + : "%ecx", "%edx", "%ebx", "%esi", "%edi" \ + ); + #define P4Prologue PentiumPrologue + #define P4Epilogue PentiumEpilogue +#endif - mov esi, [esp+24] ; N - mov ebx, [esp+20] ; B +CRYPTOPP_NAKED word PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N) +{ + PentiumPrologue - // now: ebx = B, ecx = C, edx = A, esi = N + // now: ebx = B, ecx = C, edx = A, esi = N + AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C + AS2( xor eax, eax) // clear eax - sub ecx, edx // hold the distance between C & A so we can add this to A to get C - xor eax, eax // clear eax + AS2( sub eax, esi) // eax is a negative index from end of B + AS2( lea ebx, [ebx+4*esi]) // ebx is end of B - sub eax, esi // eax is a negative index from end of B - lea ebx, [ebx+4*esi] // ebx is end of B + AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag + AS1( jz loopendAdd) // if no dwords then nothing to do - sar eax, 1 // unit of eax is now dwords; this also 
clears the carry flag - jz loopend // if no dwords then nothing to do + AS1(loopstartAdd:) + AS2( mov esi,[edx]) // load lower word of A + AS2( mov ebp,[edx+4]) // load higher word of A -loopstart: - mov esi,[edx] // load lower word of A - mov ebp,[edx+4] // load higher word of A + AS2( mov edi,[ebx+8*eax]) // load lower word of B + AS2( lea edx,[edx+8]) // advance A and C - mov edi,[ebx+8*eax] // load lower word of B - lea edx,[edx+8] // advance A and C + AS2( adc esi,edi) // add lower words + AS2( mov edi,[ebx+8*eax+4]) // load higher word of B - adc esi,edi // add lower words - mov edi,[ebx+8*eax+4] // load higher word of B + AS2( adc ebp,edi) // add higher words + AS1( inc eax) // advance B - adc ebp,edi // add higher words - inc eax // advance B + AS2( mov [edx+ecx-8],esi) // store lower word result + AS2( mov [edx+ecx-4],ebp) // store higher word result - mov [edx+ecx-8],esi // store lower word result - mov [edx+ecx-4],ebp // store higher word result + AS1( jnz loopstartAdd) // loop until eax overflows and becomes zero - jnz loopstart // loop until eax overflows and becomes zero + AS1(loopendAdd:) + AS2( adc eax, 0) // store carry into eax (return result register) -loopend: - adc eax, 0 // store carry into eax (return result register) - pop edi - pop esi - pop ebx - pop ebp - ret 8 - } + PentiumEpilogue } -__declspec(naked) word __fastcall PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N) +CRYPTOPP_NAKED word PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N) { - __asm - { - push ebp - push ebx - push esi - push edi + PentiumPrologue - mov esi, [esp+24] ; N - mov ebx, [esp+20] ; B + // now: ebx = B, ecx = C, edx = A, esi = N + AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C + AS2( xor eax, eax) // clear eax - sub ecx, edx - xor eax, eax + AS2( sub eax, esi) // eax is a negative index from end of B + AS2( lea ebx, [ebx+4*esi]) // ebx is end of B - sub eax, esi - lea ebx, [ebx+4*esi] + AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag + AS1( jz loopendSub) // if no dwords then nothing to do - sar eax, 1 - jz loopend + AS1(loopstartSub:) + AS2( mov esi,[edx]) // load lower word of A + AS2( mov ebp,[edx+4]) // load higher word of A -loopstart: - mov esi,[edx] - mov ebp,[edx+4] + AS2( mov edi,[ebx+8*eax]) // load lower word of B + AS2( lea edx,[edx+8]) // advance A and C - mov edi,[ebx+8*eax] - lea edx,[edx+8] + AS2( sbb esi,edi) // subtract lower words + AS2( mov edi,[ebx+8*eax+4]) // load higher word of B - sbb esi,edi - mov edi,[ebx+8*eax+4] + AS2( sbb ebp,edi) // subtract higher words + AS1( inc eax) // advance B - sbb ebp,edi - inc eax + AS2( mov [edx+ecx-8],esi) // store lower word result + AS2( mov [edx+ecx-4],ebp) // store higher word result - mov [edx+ecx-8],esi - mov [edx+ecx-4],ebp + AS1( jnz loopstartSub) // loop until eax overflows and becomes zero - jnz loopstart + AS1(loopendSub:) + AS2( adc eax, 0) // store carry into eax (return result register) -loopend: - adc eax, 0 - pop edi - pop esi - pop ebx - pop ebp - ret 8 - } + PentiumEpilogue +} + +CRYPTOPP_NAKED word P4Optimized::Add(word *C, const word *A, const word *B, unsigned int N) +{ + P4Prologue + + // now: ebx = B, ecx = C, edx = A, esi = N + AS2( xor eax, eax) + AS1( neg esi) + AS1( jz loopendAddP4) // if no dwords then nothing to do + + AS2( mov edi, [edx]) + AS2( mov ebp, [ebx]) + + AS1(loopstartAddP4:) + AS2( add edi, eax) + AS1( jc carry1AddP4) + + AS2( xor eax, eax) + + 
AS1(carry1continueAddP4:) + AS2( add edi, ebp) + AS2( mov ebp, 1) + AS2( mov [ecx], edi) + AS2( mov edi, [edx+4]) + AS2( cmovc eax, ebp) + AS2( mov ebp, [ebx+4]) + AS2( lea ebx, [ebx+8]) + AS2( add edi, eax) + AS1( jc carry2AddP4) + + AS2( xor eax, eax) + + AS1(carry2continueAddP4:) + AS2( add edi, ebp) + AS2( mov ebp, 1) + AS2( cmovc eax, ebp) + AS2( mov [ecx+4], edi) + AS2( add ecx, 8) + AS2( mov edi, [edx+8]) + AS2( add edx, 8) + AS2( add esi, 2) + AS2( mov ebp, [ebx]) + AS1( jnz loopstartAddP4) + AS1( jmp loopendAddP4) + + AS1(carry1AddP4:) + AS2( mov eax, 1) + AS1( jmp carry1continueAddP4) + + AS1(carry2AddP4:) + AS2( mov eax, 1) + AS1( jmp carry2continueAddP4) + + AS1(loopendAddP4:) + + P4Epilogue +} + +CRYPTOPP_NAKED word P4Optimized::Subtract(word *C, const word *A, const word *B, unsigned int N) +{ + P4Prologue + + // now: ebx = B, ecx = C, edx = A, esi = N + AS2( xor eax, eax) + AS1( neg esi) + AS1( jz loopendSubP4) // if no dwords then nothing to do + + AS2( mov edi, [edx]) + AS2( mov ebp, [ebx]) + + AS1(loopstartSubP4:) + AS2( sub edi, eax) + AS1( jc carry1SubP4) + + AS2( xor eax, eax) + + AS1(carry1continueSubP4:) + AS2( sub edi, ebp) + AS2( mov ebp, 1) + AS2( mov [ecx], edi) + AS2( mov edi, [edx+4]) + AS2( cmovc eax, ebp) + AS2( mov ebp, [ebx+4]) + AS2( lea ebx, [ebx+8]) + AS2( sub edi, eax) + AS1( jc carry2SubP4) + + AS2( xor eax, eax) + + AS1(carry2continueSubP4:) + AS2( sub edi, ebp) + AS2( mov ebp, 1) + AS2( cmovc eax, ebp) + AS2( mov [ecx+4], edi) + AS2( add ecx, 8) + AS2( mov edi, [edx+8]) + AS2( add edx, 8) + AS2( add esi, 2) + AS2( mov ebp, [ebx]) + AS1( jnz loopstartSubP4) + AS1( jmp loopendSubP4) + + AS1(carry1SubP4:) + AS2( mov eax, 1) + AS1( jmp carry1continueSubP4) + + AS1(carry2SubP4:) + AS2( mov eax, 1) + AS1( jmp carry2continueSubP4) + + AS1(loopendSubP4:) + + P4Epilogue +} + +#if __GNUC__ +// Comba square and multiply assembly code originally contributed by Leonard Janke +// These are not needed with MSVC, which does a good job of optimizing the C++ multiply code. 
+ +#define SqrStartup \ + "push %%ebp\n\t" \ + "push %%esi\n\t" \ + "push %%ebx\n\t" \ + "xor %%ebp, %%ebp\n\t" \ + "xor %%ebx, %%ebx\n\t" \ + "xor %%ecx, %%ecx\n\t" + +#define SqrShiftCarry \ + "mov %%ebx, %%ebp\n\t" \ + "mov %%ecx, %%ebx\n\t" \ + "xor %%ecx, %%ecx\n\t" + +#define SqrAccumulate(i,j) \ + "mov 4*"#j"(%%esi), %%eax\n\t" \ + "mull 4*"#i"(%%esi)\n\t" \ + "add %%eax, %%ebp\n\t" \ + "adc %%edx, %%ebx\n\t" \ + "adc %%ch, %%cl\n\t" \ + "add %%eax, %%ebp\n\t" \ + "adc %%edx, %%ebx\n\t" \ + "adc %%ch, %%cl\n\t" + +#define SqrAccumulateCentre(i) \ + "mov 4*"#i"(%%esi), %%eax\n\t" \ + "mull 4*"#i"(%%esi)\n\t" \ + "add %%eax, %%ebp\n\t" \ + "adc %%edx, %%ebx\n\t" \ + "adc %%ch, %%cl\n\t" + +#define SqrStoreDigit(X) \ + "mov %%ebp, 4*"#X"(%%edi)\n\t" \ + +#define SqrLastDiagonal(digits) \ + "mov 4*("#digits"-1)(%%esi), %%eax\n\t" \ + "mull 4*("#digits"-1)(%%esi)\n\t" \ + "add %%eax, %%ebp\n\t" \ + "adc %%edx, %%ebx\n\t" \ + "mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \ + "mov %%ebx, 4*(2*"#digits"-1)(%%edi)\n\t" + +#define SqrCleanup \ + "pop %%ebx\n\t" \ + "pop %%esi\n\t" \ + "pop %%ebp\n\t" + +void PentiumOptimized::Square4(word* Y, const word* X) +{ + __asm__ __volatile__( + SqrStartup + + SqrAccumulateCentre(0) + SqrStoreDigit(0) + SqrShiftCarry + + SqrAccumulate(1,0) + SqrStoreDigit(1) + SqrShiftCarry + + SqrAccumulate(2,0) + SqrAccumulateCentre(1) + SqrStoreDigit(2) + SqrShiftCarry + + SqrAccumulate(3,0) + SqrAccumulate(2,1) + SqrStoreDigit(3) + SqrShiftCarry + + SqrAccumulate(3,1) + SqrAccumulateCentre(2) + SqrStoreDigit(4) + SqrShiftCarry + + SqrAccumulate(3,2) + SqrStoreDigit(5) + SqrShiftCarry + + SqrLastDiagonal(4) + + SqrCleanup + + : + : "D" (Y), "S" (X) + : "eax", "ecx", "edx", "ebp", "memory" + ); +} + +#define MulStartup \ + "push %%ebp\n\t" \ + "push %%esi\n\t" \ + "push %%ebx\n\t" \ + "push %%edi\n\t" \ + "mov %%eax, %%ebx \n\t" \ + "xor %%ebp, %%ebp\n\t" \ + "xor %%edi, %%edi\n\t" \ + "xor %%ecx, %%ecx\n\t" + +#define MulShiftCarry \ + "mov %%edx, %%ebp\n\t" \ + "mov %%ecx, %%edi\n\t" \ + "xor %%ecx, %%ecx\n\t" + +#define MulAccumulate(i,j) \ + "mov 4*"#j"(%%ebx), %%eax\n\t" \ + "mull 4*"#i"(%%esi)\n\t" \ + "add %%eax, %%ebp\n\t" \ + "adc %%edx, %%edi\n\t" \ + "adc %%ch, %%cl\n\t" + +#define MulStoreDigit(X) \ + "mov %%edi, %%edx \n\t" \ + "mov (%%esp), %%edi \n\t" \ + "mov %%ebp, 4*"#X"(%%edi)\n\t" \ + "mov %%edi, (%%esp)\n\t" + +#define MulLastDiagonal(digits) \ + "mov 4*("#digits"-1)(%%ebx), %%eax\n\t" \ + "mull 4*("#digits"-1)(%%esi)\n\t" \ + "add %%eax, %%ebp\n\t" \ + "adc %%edi, %%edx\n\t" \ + "mov (%%esp), %%edi\n\t" \ + "mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \ + "mov %%edx, 4*(2*"#digits"-1)(%%edi)\n\t" + +#define MulCleanup \ + "pop %%edi\n\t" \ + "pop %%ebx\n\t" \ + "pop %%esi\n\t" \ + "pop %%ebp\n\t" + +void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y) +{ + __asm__ __volatile__( + MulStartup + MulAccumulate(0,0) + MulStoreDigit(0) + MulShiftCarry + + MulAccumulate(1,0) + MulAccumulate(0,1) + MulStoreDigit(1) + MulShiftCarry + + MulAccumulate(2,0) + MulAccumulate(1,1) + MulAccumulate(0,2) + MulStoreDigit(2) + MulShiftCarry + + MulAccumulate(3,0) + MulAccumulate(2,1) + MulAccumulate(1,2) + MulAccumulate(0,3) + MulStoreDigit(3) + MulShiftCarry + + MulAccumulate(3,1) + MulAccumulate(2,2) + MulAccumulate(1,3) + MulStoreDigit(4) + MulShiftCarry + + MulAccumulate(3,2) + MulAccumulate(2,3) + MulStoreDigit(5) + MulShiftCarry + + MulLastDiagonal(4) + + MulCleanup + + : + : "D" (Z), "S" (X), "a" (Y) + : "%ecx", "%edx", "memory" + ); +} + +void 
PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y) +{ + __asm__ __volatile__( + MulStartup + MulAccumulate(0,0) + MulStoreDigit(0) + MulShiftCarry + + MulAccumulate(1,0) + MulAccumulate(0,1) + MulStoreDigit(1) + MulShiftCarry + + MulAccumulate(2,0) + MulAccumulate(1,1) + MulAccumulate(0,2) + MulStoreDigit(2) + MulShiftCarry + + MulAccumulate(3,0) + MulAccumulate(2,1) + MulAccumulate(1,2) + MulAccumulate(0,3) + MulStoreDigit(3) + MulShiftCarry + + MulAccumulate(4,0) + MulAccumulate(3,1) + MulAccumulate(2,2) + MulAccumulate(1,3) + MulAccumulate(0,4) + MulStoreDigit(4) + MulShiftCarry + + MulAccumulate(5,0) + MulAccumulate(4,1) + MulAccumulate(3,2) + MulAccumulate(2,3) + MulAccumulate(1,4) + MulAccumulate(0,5) + MulStoreDigit(5) + MulShiftCarry + + MulAccumulate(6,0) + MulAccumulate(5,1) + MulAccumulate(4,2) + MulAccumulate(3,3) + MulAccumulate(2,4) + MulAccumulate(1,5) + MulAccumulate(0,6) + MulStoreDigit(6) + MulShiftCarry + + MulAccumulate(7,0) + MulAccumulate(6,1) + MulAccumulate(5,2) + MulAccumulate(4,3) + MulAccumulate(3,4) + MulAccumulate(2,5) + MulAccumulate(1,6) + MulAccumulate(0,7) + MulStoreDigit(7) + MulShiftCarry + + MulAccumulate(7,1) + MulAccumulate(6,2) + MulAccumulate(5,3) + MulAccumulate(4,4) + MulAccumulate(3,5) + MulAccumulate(2,6) + MulAccumulate(1,7) + MulStoreDigit(8) + MulShiftCarry + + MulAccumulate(7,2) + MulAccumulate(6,3) + MulAccumulate(5,4) + MulAccumulate(4,5) + MulAccumulate(3,6) + MulAccumulate(2,7) + MulStoreDigit(9) + MulShiftCarry + + MulAccumulate(7,3) + MulAccumulate(6,4) + MulAccumulate(5,5) + MulAccumulate(4,6) + MulAccumulate(3,7) + MulStoreDigit(10) + MulShiftCarry + + MulAccumulate(7,4) + MulAccumulate(6,5) + MulAccumulate(5,6) + MulAccumulate(4,7) + MulStoreDigit(11) + MulShiftCarry + + MulAccumulate(7,5) + MulAccumulate(6,6) + MulAccumulate(5,7) + MulStoreDigit(12) + MulShiftCarry + + MulAccumulate(7,6) + MulAccumulate(6,7) + MulStoreDigit(13) + MulShiftCarry + + MulLastDiagonal(8) + + MulCleanup + + : + : "D" (Z), "S" (X), "a" (Y) + : "%ecx", "%edx", "memory" + ); +} + +#endif // __GNUC__ + +#else // not x86 - no processor specific code at this layer + +typedef Portable LowLevel; + +#endif + +bool g_sse2DetectionDone = false, g_sse2Detected, g_sse2Enabled = true; + +void DisableSSE2() +{ + g_sse2Enabled = false; } #ifdef SSE2_INTRINSICS_AVAILABLE @@ -961,23 +1459,20 @@ static bool GetSSE2Capability() { word32 b; +#ifdef __GNUC__ + __asm__("mov $1, %%eax; cpuid; mov %%edx, %0" : "=rm" (b) : : "%eax", "%edx"); +#else __asm { mov eax, 1 cpuid mov b, edx } +#endif return (b & (1 << 26)) != 0; } -bool g_sse2DetectionDone = false, g_sse2Detected, g_sse2Enabled = true; - -void DisableSSE2() -{ - g_sse2Enabled = false; -} - static inline bool HasSSE2() { if (g_sse2Enabled && !g_sse2DetectionDone) @@ -988,19 +1483,9 @@ static inline bool HasSSE2() return g_sse2Enabled && g_sse2Detected; } -class P4Optimized : public PentiumOptimized -{ -public: - static word __fastcall Add(word *C, const word *A, const word *B, unsigned int N); - static word __fastcall Subtract(word *C, const word *A, const word *B, unsigned int N); - static void Multiply4(word *C, const word *A, const word *B); - static void Multiply8(word *C, const word *A, const word *B); - static inline void Square4(word *R, const word *A) - { - Multiply4(R, A, A); - } - static void Multiply8Bottom(word *C, const word *A, const word *B); -}; +#ifdef __GNUC__ +#define __fastcall +#endif static void __fastcall P4_Mul(__m128i *C, const __m128i *A, const __m128i *B) { @@ -1072,7 +1557,7 @@ 
void P4Optimized::Multiply4(word *C, const word *A, const word *B) __m64 s1, s2; - __m64 w1 = _m_from_int(w[1]); + __m64 w1 = _mm_cvtsi32_si64(w[1]); __m64 w4 = mw[2]; __m64 w6 = mw[3]; __m64 w8 = mw[4]; @@ -1083,38 +1568,38 @@ void P4Optimized::Multiply4(word *C, const word *A, const word *B) __m64 w18 = mw[9]; __m64 w20 = mw[10]; __m64 w22 = mw[11]; - __m64 w26 = _m_from_int(w[26]); + __m64 w26 = _mm_cvtsi32_si64(w[26]); s1 = _mm_add_si64(w1, w4); - C[1] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[1] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s2 = _mm_add_si64(w6, w8); s1 = _mm_add_si64(s1, s2); - C[2] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[2] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s2 = _mm_add_si64(w10, w12); s1 = _mm_add_si64(s1, s2); - C[3] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[3] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s2 = _mm_add_si64(w14, w16); s1 = _mm_add_si64(s1, s2); - C[4] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[4] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s2 = _mm_add_si64(w18, w20); s1 = _mm_add_si64(s1, s2); - C[5] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[5] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s2 = _mm_add_si64(w22, w26); s1 = _mm_add_si64(s1, s2); - C[6] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[6] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); - C[7] = _m_to_int(s1) + w[27]; + C[7] = _mm_cvtsi64_si32(s1) + w[27]; _mm_empty(); } @@ -1142,7 +1627,7 @@ void P4Optimized::Multiply8(word *C, const word *A, const word *B) __m64 s1, s2, s3, s4; - __m64 w1 = _m_from_int(w[1]); + __m64 w1 = _mm_cvtsi32_si64(w[1]); __m64 w4 = mw[2]; __m64 w6 = mw[3]; __m64 w8 = mw[4]; @@ -1153,11 +1638,11 @@ void P4Optimized::Multiply8(word *C, const word *A, const word *B) __m64 w18 = mw[9]; __m64 w20 = mw[10]; __m64 w22 = mw[11]; - __m64 w26 = _m_from_int(w[26]); - __m64 w27 = _m_from_int(w[27]); + __m64 w26 = _mm_cvtsi32_si64(w[26]); + __m64 w27 = _mm_cvtsi32_si64(w[27]); - __m64 x0 = _m_from_int(x[0]); - __m64 x1 = _m_from_int(x[1]); + __m64 x0 = _mm_cvtsi32_si64(x[0]); + __m64 x1 = _mm_cvtsi32_si64(x[1]); __m64 x4 = mx[2]; __m64 x6 = mx[3]; __m64 x8 = mx[4]; @@ -1168,11 +1653,11 @@ void P4Optimized::Multiply8(word *C, const word *A, const word *B) __m64 x18 = mx[9]; __m64 x20 = mx[10]; __m64 x22 = mx[11]; - __m64 x26 = _m_from_int(x[26]); - __m64 x27 = _m_from_int(x[27]); + __m64 x26 = _mm_cvtsi32_si64(x[26]); + __m64 x27 = _mm_cvtsi32_si64(x[27]); - __m64 y0 = _m_from_int(y[0]); - __m64 y1 = _m_from_int(y[1]); + __m64 y0 = _mm_cvtsi32_si64(y[0]); + __m64 y1 = _mm_cvtsi32_si64(y[1]); __m64 y4 = my[2]; __m64 y6 = my[3]; __m64 y8 = my[4]; @@ -1183,11 +1668,11 @@ void P4Optimized::Multiply8(word *C, const word *A, const word *B) __m64 y18 = my[9]; __m64 y20 = my[10]; __m64 y22 = my[11]; - __m64 y26 = _m_from_int(y[26]); - __m64 y27 = _m_from_int(y[27]); + __m64 y26 = _mm_cvtsi32_si64(y[26]); + __m64 y27 = _mm_cvtsi32_si64(y[27]); - __m64 z0 = _m_from_int(z[0]); - __m64 z1 = _m_from_int(z[1]); + __m64 z0 = _mm_cvtsi32_si64(z[0]); + __m64 z1 = _mm_cvtsi32_si64(z[1]); __m64 z4 = mz[2]; __m64 z6 = mz[3]; __m64 z8 = mz[4]; @@ -1198,28 +1683,28 @@ void P4Optimized::Multiply8(word *C, const word *A, const word *B) __m64 z18 = mz[9]; __m64 z20 = mz[10]; __m64 z22 = mz[11]; - __m64 z26 = _m_from_int(z[26]); + __m64 z26 = _mm_cvtsi32_si64(z[26]); s1 = _mm_add_si64(w1, w4); - C[1] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[1] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s2 = 
_mm_add_si64(w6, w8); s1 = _mm_add_si64(s1, s2); - C[2] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[2] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s2 = _mm_add_si64(w10, w12); s1 = _mm_add_si64(s1, s2); - C[3] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[3] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s3 = _mm_add_si64(x0, y0); s2 = _mm_add_si64(w14, w16); s1 = _mm_add_si64(s1, s3); s1 = _mm_add_si64(s1, s2); - C[4] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[4] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s3 = _mm_add_si64(x1, y1); s4 = _mm_add_si64(x4, y4); @@ -1227,8 +1712,8 @@ void P4Optimized::Multiply8(word *C, const word *A, const word *B) s3 = _mm_add_si64(s3, s4); s1 = _mm_add_si64(s1, w20); s1 = _mm_add_si64(s1, s3); - C[5] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[5] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s3 = _mm_add_si64(x6, y6); s4 = _mm_add_si64(x8, y8); @@ -1236,24 +1721,24 @@ void P4Optimized::Multiply8(word *C, const word *A, const word *B) s3 = _mm_add_si64(s3, s4); s1 = _mm_add_si64(s1, w26); s1 = _mm_add_si64(s1, s3); - C[6] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[6] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s3 = _mm_add_si64(x10, y10); s4 = _mm_add_si64(x12, y12); s1 = _mm_add_si64(s1, w27); s3 = _mm_add_si64(s3, s4); s1 = _mm_add_si64(s1, s3); - C[7] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[7] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s3 = _mm_add_si64(x14, y14); s4 = _mm_add_si64(x16, y16); s1 = _mm_add_si64(s1, z0); s3 = _mm_add_si64(s3, s4); s1 = _mm_add_si64(s1, s3); - C[8] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[8] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s3 = _mm_add_si64(x18, y18); s4 = _mm_add_si64(x20, y20); @@ -1261,8 +1746,8 @@ void P4Optimized::Multiply8(word *C, const word *A, const word *B) s3 = _mm_add_si64(s3, s4); s1 = _mm_add_si64(s1, z4); s1 = _mm_add_si64(s1, s3); - C[9] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[9] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s3 = _mm_add_si64(x22, y22); s4 = _mm_add_si64(x26, y26); @@ -1270,32 +1755,32 @@ void P4Optimized::Multiply8(word *C, const word *A, const word *B) s3 = _mm_add_si64(s3, s4); s1 = _mm_add_si64(s1, z8); s1 = _mm_add_si64(s1, s3); - C[10] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[10] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s3 = _mm_add_si64(x27, y27); s1 = _mm_add_si64(s1, z10); s1 = _mm_add_si64(s1, z12); s1 = _mm_add_si64(s1, s3); - C[11] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[11] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s3 = _mm_add_si64(z14, z16); s1 = _mm_add_si64(s1, s3); - C[12] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[12] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s3 = _mm_add_si64(z18, z20); s1 = _mm_add_si64(s1, s3); - C[13] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[13] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s3 = _mm_add_si64(z22, z26); s1 = _mm_add_si64(s1, s3); - C[14] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[14] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); - C[15] = z[27] + _m_to_int(s1); + C[15] = z[27] + _mm_cvtsi64_si32(s1); _mm_empty(); } @@ -1319,7 +1804,7 @@ void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B) __m64 s1, s2, s3, s4; - __m64 w1 = _m_from_int(w[1]); + __m64 w1 = _mm_cvtsi32_si64(w[1]); __m64 w4 = mw[2]; __m64 w6 = mw[3]; __m64 w8 = mw[4]; @@ -1330,40 +1815,40 @@ void P4Optimized::Multiply8Bottom(word *C, const word *A, const 
word *B) __m64 w18 = mw[9]; __m64 w20 = mw[10]; __m64 w22 = mw[11]; - __m64 w26 = _m_from_int(w[26]); + __m64 w26 = _mm_cvtsi32_si64(w[26]); - __m64 x0 = _m_from_int(x[0]); - __m64 x1 = _m_from_int(x[1]); + __m64 x0 = _mm_cvtsi32_si64(x[0]); + __m64 x1 = _mm_cvtsi32_si64(x[1]); __m64 x4 = mx[2]; __m64 x6 = mx[3]; __m64 x8 = mx[4]; - __m64 y0 = _m_from_int(y[0]); - __m64 y1 = _m_from_int(y[1]); + __m64 y0 = _mm_cvtsi32_si64(y[0]); + __m64 y1 = _mm_cvtsi32_si64(y[1]); __m64 y4 = my[2]; __m64 y6 = my[3]; __m64 y8 = my[4]; s1 = _mm_add_si64(w1, w4); - C[1] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[1] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s2 = _mm_add_si64(w6, w8); s1 = _mm_add_si64(s1, s2); - C[2] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[2] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s2 = _mm_add_si64(w10, w12); s1 = _mm_add_si64(s1, s2); - C[3] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[3] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s3 = _mm_add_si64(x0, y0); s2 = _mm_add_si64(w14, w16); s1 = _mm_add_si64(s1, s3); s1 = _mm_add_si64(s1, s2); - C[4] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[4] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s3 = _mm_add_si64(x1, y1); s4 = _mm_add_si64(x4, y4); @@ -1371,8 +1856,8 @@ void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B) s3 = _mm_add_si64(s3, s4); s1 = _mm_add_si64(s1, w20); s1 = _mm_add_si64(s1, s3); - C[5] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[5] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s3 = _mm_add_si64(x6, y6); s4 = _mm_add_si64(x8, y8); @@ -1380,558 +1865,15 @@ void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B) s3 = _mm_add_si64(s3, s4); s1 = _mm_add_si64(s1, w26); s1 = _mm_add_si64(s1, s3); - C[6] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[6] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); - C[7] = _m_to_int(s1) + w[27] + x[10] + y[10] + x[12] + y[12]; + C[7] = _mm_cvtsi64_si32(s1) + w[27] + x[10] + y[10] + x[12] + y[12]; _mm_empty(); } -__declspec(naked) word __fastcall P4Optimized::Add(word *C, const word *A, const word *B, unsigned int N) -{ - __asm - { - sub esp, 16 - xor eax, eax - mov [esp], edi - mov [esp+4], esi - mov [esp+8], ebx - mov [esp+12], ebp - - mov ebx, [esp+20] // B - mov esi, [esp+24] // N - - // now: ebx = B, ecx = C, edx = A, esi = N - - neg esi - jz loopend // if no dwords then nothing to do - - mov edi, [edx] - mov ebp, [ebx] - -loopstart: - add edi, eax - jc carry1 - - xor eax, eax - -carry1continue: - add edi, ebp - mov ebp, 1 - mov [ecx], edi - mov edi, [edx+4] - cmovc eax, ebp - mov ebp, [ebx+4] - lea ebx, [ebx+8] - add edi, eax - jc carry2 - - xor eax, eax - -carry2continue: - add edi, ebp - mov ebp, 1 - cmovc eax, ebp - mov [ecx+4], edi - add ecx, 8 - mov edi, [edx+8] - add edx, 8 - add esi, 2 - mov ebp, [ebx] - jnz loopstart - -loopend: - mov edi, [esp] - mov esi, [esp+4] - mov ebx, [esp+8] - mov ebp, [esp+12] - add esp, 16 - ret 8 - -carry1: - mov eax, 1 - jmp carry1continue - -carry2: - mov eax, 1 - jmp carry2continue - } -} - -__declspec(naked) word __fastcall P4Optimized::Subtract(word *C, const word *A, const word *B, unsigned int N) -{ - __asm - { - sub esp, 16 - xor eax, eax - mov [esp], edi - mov [esp+4], esi - mov [esp+8], ebx - mov [esp+12], ebp - - mov ebx, [esp+20] // B - mov esi, [esp+24] // N - - // now: ebx = B, ecx = C, edx = A, esi = N - - neg esi - jz loopend // if no dwords then nothing to do - - mov edi, [edx] - mov ebp, [ebx] - -loopstart: - sub edi, eax - 
jc carry1 - - xor eax, eax - -carry1continue: - sub edi, ebp - mov ebp, 1 - mov [ecx], edi - mov edi, [edx+4] - cmovc eax, ebp - mov ebp, [ebx+4] - lea ebx, [ebx+8] - sub edi, eax - jc carry2 - - xor eax, eax - -carry2continue: - sub edi, ebp - mov ebp, 1 - cmovc eax, ebp - mov [ecx+4], edi - add ecx, 8 - mov edi, [edx+8] - add edx, 8 - add esi, 2 - mov ebp, [ebx] - jnz loopstart - -loopend: - mov edi, [esp] - mov esi, [esp+4] - mov ebx, [esp+8] - mov ebp, [esp+12] - add esp, 16 - ret 8 - -carry1: - mov eax, 1 - jmp carry1continue - -carry2: - mov eax, 1 - jmp carry2continue - } -} - #endif // #ifdef SSE2_INTRINSICS_AVAILABLE -#elif defined(__GNUC__) && defined(__i386__) - -class PentiumOptimized : public Portable -{ -public: -#ifndef __pic__ // -fpic uses up a register, leaving too few for the asm code - static word Add(word *C, const word *A, const word *B, unsigned int N); - static word Subtract(word *C, const word *A, const word *B, unsigned int N); -#endif - static void Square4(word *R, const word *A); - static void Multiply4(word *C, const word *A, const word *B); - static void Multiply8(word *C, const word *A, const word *B); -}; - -typedef PentiumOptimized LowLevel; - -// Add and Subtract assembly code originally contributed by Alister Lee - -#ifndef __pic__ -__attribute__((regparm(3))) word PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N) -{ - assert (N%2 == 0); - - register word carry, temp; - - __asm__ __volatile__( - "push %%ebp;" - "sub %3, %2;" - "xor %0, %0;" - "sub %4, %0;" - "lea (%1,%4,4), %1;" - "sar $1, %0;" - "jz 1f;" - - "0:;" - "mov 0(%3), %4;" - "mov 4(%3), %%ebp;" - "mov (%1,%0,8), %5;" - "lea 8(%3), %3;" - "adc %5, %4;" - "mov 4(%1,%0,8), %5;" - "adc %5, %%ebp;" - "inc %0;" - "mov %4, -8(%3, %2);" - "mov %%ebp, -4(%3, %2);" - "jnz 0b;" - - "1:;" - "adc $0, %0;" - "pop %%ebp;" - - : "=aSD" (carry), "+r" (B), "+r" (C), "+r" (A), "+r" (N), "=r" (temp) - : : "cc", "memory"); - - return carry; -} - -__attribute__((regparm(3))) word PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N) -{ - assert (N%2 == 0); - - register word carry, temp; - - __asm__ __volatile__( - "push %%ebp;" - "sub %3, %2;" - "xor %0, %0;" - "sub %4, %0;" - "lea (%1,%4,4), %1;" - "sar $1, %0;" - "jz 1f;" - - "0:;" - "mov 0(%3), %4;" - "mov 4(%3), %%ebp;" - "mov (%1,%0,8), %5;" - "lea 8(%3), %3;" - "sbb %5, %4;" - "mov 4(%1,%0,8), %5;" - "sbb %5, %%ebp;" - "inc %0;" - "mov %4, -8(%3, %2);" - "mov %%ebp, -4(%3, %2);" - "jnz 0b;" - - "1:;" - "adc $0, %0;" - "pop %%ebp;" - - : "=aSD" (carry), "+r" (B), "+r" (C), "+r" (A), "+r" (N), "=r" (temp) - : : "cc", "memory"); - - return carry; -} -#endif // __pic__ - -// Comba square and multiply assembly code originally contributed by Leonard Janke - -#define SqrStartup \ - "push %%ebp\n\t" \ - "push %%esi\n\t" \ - "push %%ebx\n\t" \ - "xor %%ebp, %%ebp\n\t" \ - "xor %%ebx, %%ebx\n\t" \ - "xor %%ecx, %%ecx\n\t" - -#define SqrShiftCarry \ - "mov %%ebx, %%ebp\n\t" \ - "mov %%ecx, %%ebx\n\t" \ - "xor %%ecx, %%ecx\n\t" - -#define SqrAccumulate(i,j) \ - "mov 4*"#j"(%%esi), %%eax\n\t" \ - "mull 4*"#i"(%%esi)\n\t" \ - "add %%eax, %%ebp\n\t" \ - "adc %%edx, %%ebx\n\t" \ - "adc %%ch, %%cl\n\t" \ - "add %%eax, %%ebp\n\t" \ - "adc %%edx, %%ebx\n\t" \ - "adc %%ch, %%cl\n\t" - -#define SqrAccumulateCentre(i) \ - "mov 4*"#i"(%%esi), %%eax\n\t" \ - "mull 4*"#i"(%%esi)\n\t" \ - "add %%eax, %%ebp\n\t" \ - "adc %%edx, %%ebx\n\t" \ - "adc %%ch, %%cl\n\t" - -#define SqrStoreDigit(X) \ - "mov %%ebp, 4*"#X"(%%edi)\n\t" \ - 
-#define SqrLastDiagonal(digits) \ - "mov 4*("#digits"-1)(%%esi), %%eax\n\t" \ - "mull 4*("#digits"-1)(%%esi)\n\t" \ - "add %%eax, %%ebp\n\t" \ - "adc %%edx, %%ebx\n\t" \ - "mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \ - "mov %%ebx, 4*(2*"#digits"-1)(%%edi)\n\t" - -#define SqrCleanup \ - "pop %%ebx\n\t" \ - "pop %%esi\n\t" \ - "pop %%ebp\n\t" - -void PentiumOptimized::Square4(word* Y, const word* X) -{ - __asm__ __volatile__( - SqrStartup - - SqrAccumulateCentre(0) - SqrStoreDigit(0) - SqrShiftCarry - - SqrAccumulate(1,0) - SqrStoreDigit(1) - SqrShiftCarry - - SqrAccumulate(2,0) - SqrAccumulateCentre(1) - SqrStoreDigit(2) - SqrShiftCarry - - SqrAccumulate(3,0) - SqrAccumulate(2,1) - SqrStoreDigit(3) - SqrShiftCarry - - SqrAccumulate(3,1) - SqrAccumulateCentre(2) - SqrStoreDigit(4) - SqrShiftCarry - - SqrAccumulate(3,2) - SqrStoreDigit(5) - SqrShiftCarry - - SqrLastDiagonal(4) - - SqrCleanup - - : - : "D" (Y), "S" (X) - : "eax", "ecx", "edx", "ebp", "memory" - ); -} - -#define MulStartup \ - "push %%ebp\n\t" \ - "push %%esi\n\t" \ - "push %%ebx\n\t" \ - "push %%edi\n\t" \ - "mov %%eax, %%ebx \n\t" \ - "xor %%ebp, %%ebp\n\t" \ - "xor %%edi, %%edi\n\t" \ - "xor %%ecx, %%ecx\n\t" - -#define MulShiftCarry \ - "mov %%edx, %%ebp\n\t" \ - "mov %%ecx, %%edi\n\t" \ - "xor %%ecx, %%ecx\n\t" - -#define MulAccumulate(i,j) \ - "mov 4*"#j"(%%ebx), %%eax\n\t" \ - "mull 4*"#i"(%%esi)\n\t" \ - "add %%eax, %%ebp\n\t" \ - "adc %%edx, %%edi\n\t" \ - "adc %%ch, %%cl\n\t" - -#define MulStoreDigit(X) \ - "mov %%edi, %%edx \n\t" \ - "mov (%%esp), %%edi \n\t" \ - "mov %%ebp, 4*"#X"(%%edi)\n\t" \ - "mov %%edi, (%%esp)\n\t" - -#define MulLastDiagonal(digits) \ - "mov 4*("#digits"-1)(%%ebx), %%eax\n\t" \ - "mull 4*("#digits"-1)(%%esi)\n\t" \ - "add %%eax, %%ebp\n\t" \ - "adc %%edi, %%edx\n\t" \ - "mov (%%esp), %%edi\n\t" \ - "mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \ - "mov %%edx, 4*(2*"#digits"-1)(%%edi)\n\t" - -#define MulCleanup \ - "pop %%edi\n\t" \ - "pop %%ebx\n\t" \ - "pop %%esi\n\t" \ - "pop %%ebp\n\t" - -void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y) -{ - __asm__ __volatile__( - MulStartup - MulAccumulate(0,0) - MulStoreDigit(0) - MulShiftCarry - - MulAccumulate(1,0) - MulAccumulate(0,1) - MulStoreDigit(1) - MulShiftCarry - - MulAccumulate(2,0) - MulAccumulate(1,1) - MulAccumulate(0,2) - MulStoreDigit(2) - MulShiftCarry - - MulAccumulate(3,0) - MulAccumulate(2,1) - MulAccumulate(1,2) - MulAccumulate(0,3) - MulStoreDigit(3) - MulShiftCarry - - MulAccumulate(3,1) - MulAccumulate(2,2) - MulAccumulate(1,3) - MulStoreDigit(4) - MulShiftCarry - - MulAccumulate(3,2) - MulAccumulate(2,3) - MulStoreDigit(5) - MulShiftCarry - - MulLastDiagonal(4) - - MulCleanup - - : - : "D" (Z), "S" (X), "a" (Y) - : "%ecx", "%edx", "memory" - ); -} - -void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y) -{ - __asm__ __volatile__( - MulStartup - MulAccumulate(0,0) - MulStoreDigit(0) - MulShiftCarry - - MulAccumulate(1,0) - MulAccumulate(0,1) - MulStoreDigit(1) - MulShiftCarry - - MulAccumulate(2,0) - MulAccumulate(1,1) - MulAccumulate(0,2) - MulStoreDigit(2) - MulShiftCarry - - MulAccumulate(3,0) - MulAccumulate(2,1) - MulAccumulate(1,2) - MulAccumulate(0,3) - MulStoreDigit(3) - MulShiftCarry - - MulAccumulate(4,0) - MulAccumulate(3,1) - MulAccumulate(2,2) - MulAccumulate(1,3) - MulAccumulate(0,4) - MulStoreDigit(4) - MulShiftCarry - - MulAccumulate(5,0) - MulAccumulate(4,1) - MulAccumulate(3,2) - MulAccumulate(2,3) - MulAccumulate(1,4) - MulAccumulate(0,5) - MulStoreDigit(5) - MulShiftCarry - 
- MulAccumulate(6,0) - MulAccumulate(5,1) - MulAccumulate(4,2) - MulAccumulate(3,3) - MulAccumulate(2,4) - MulAccumulate(1,5) - MulAccumulate(0,6) - MulStoreDigit(6) - MulShiftCarry - - MulAccumulate(7,0) - MulAccumulate(6,1) - MulAccumulate(5,2) - MulAccumulate(4,3) - MulAccumulate(3,4) - MulAccumulate(2,5) - MulAccumulate(1,6) - MulAccumulate(0,7) - MulStoreDigit(7) - MulShiftCarry - - MulAccumulate(7,1) - MulAccumulate(6,2) - MulAccumulate(5,3) - MulAccumulate(4,4) - MulAccumulate(3,5) - MulAccumulate(2,6) - MulAccumulate(1,7) - MulStoreDigit(8) - MulShiftCarry - - MulAccumulate(7,2) - MulAccumulate(6,3) - MulAccumulate(5,4) - MulAccumulate(4,5) - MulAccumulate(3,6) - MulAccumulate(2,7) - MulStoreDigit(9) - MulShiftCarry - - MulAccumulate(7,3) - MulAccumulate(6,4) - MulAccumulate(5,5) - MulAccumulate(4,6) - MulAccumulate(3,7) - MulStoreDigit(10) - MulShiftCarry - - MulAccumulate(7,4) - MulAccumulate(6,5) - MulAccumulate(5,6) - MulAccumulate(4,7) - MulStoreDigit(11) - MulShiftCarry - - MulAccumulate(7,5) - MulAccumulate(6,6) - MulAccumulate(5,7) - MulStoreDigit(12) - MulShiftCarry - - MulAccumulate(7,6) - MulAccumulate(6,7) - MulStoreDigit(13) - MulShiftCarry - - MulLastDiagonal(8) - - MulCleanup - - : - : "D" (Z), "S" (X), "a" (Y) - : "%ecx", "%edx", "memory" - ); -} - -#else // no processor specific code at this layer - -typedef Portable LowLevel; - -#endif - // ******************************************************** #define A0 A |
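The AS1/AS2 macros defined above are the mechanism that lets one instruction list expand either into MSVC __asm statements or into a GCC inline-assembly string, with the prologue switching GAS to Intel syntax. A minimal sketch of the same trick, assuming 32-bit x86 and MSVC-style inline asm being available (AddOne is a made-up example function, not part of integer.cpp):

#ifdef _MSC_VER
	#define AS1(x)		__asm x
	#define AS2(x, y)	__asm x, y
#else
	#define AS1(x)		#x ";"
	#define AS2(x, y)	#x ", " #y ";"
#endif

// The shared instruction is written once and compiled by either toolchain.
unsigned int AddOne(unsigned int a)
{
	unsigned int r;
#ifdef _MSC_VER
	__asm mov eax, a
	AS2(	add eax, 1)
	__asm mov r, eax
#else
	__asm__
	(
		".intel_syntax noprefix;"	// the stringized macros emit Intel syntax
		AS2(	add eax, 1)
		".att_syntax prefix;"
		: "=a" (r)
		: "a" (a)
		: "cc"
	);
#endif
	return r;
}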