diff options
author | weidai <weidai11@users.noreply.github.com> | 2007-04-15 23:00:27 +0000 |
---|---|---|
committer | weidai <weidai11@users.noreply.github.com> | 2007-04-15 23:00:27 +0000 |
commit | 643b3022278591c8a784f5a34f689548a4ad62b1 (patch) | |
tree | 6d117430c0ea76a7444fa3871d1898995d825ada /tiger.cpp | |
parent | 3b89824be3a8d3974ad0316f2067451c3072bffc (diff) | |
download | cryptopp-git-643b3022278591c8a784f5a34f689548a4ad62b1.tar.gz |
MMX/SSE2 optimizations
Diffstat (limited to 'tiger.cpp')
-rw-r--r-- | tiger.cpp | 229 |
1 files changed, 199 insertions, 30 deletions
@@ -3,6 +3,7 @@ #include "pch.h" #include "tiger.h" #include "misc.h" +#include "cpu.h" #ifdef WORD64_AVAILABLE @@ -24,13 +25,187 @@ void Tiger::TruncatedFinal(byte *hash, size_t size) m_data[7] = GetBitCountLo(); - Transform(m_digest, m_data); - CorrectEndianess(m_digest, m_digest, DigestSize()); - memcpy(hash, m_digest, size); + Transform(m_state, m_data); + CorrectEndianess(m_state, m_state, DigestSize()); + memcpy(hash, m_state, size); Restart(); // reinit for next use } +void Tiger::Transform (word64 *digest, const word64 *X) +{ +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + if (HasSSE2()) + { +#ifdef __GNUC__ + __asm__ __volatile__ + ( + ".intel_syntax noprefix;" + AS1( push ebx) +#else + AS2( mov eax, digest) + AS2( mov esi, X) + AS2( lea edx, [table]) +#endif + AS2( movq mm0, [eax]) + AS2( movq mm1, [eax+1*8]) + AS2( movq mm5, mm1) + AS2( movq mm2, [eax+2*8]) + AS2( movq mm7, [edx+4*2048+0*8]) + AS2( movq mm6, [edx+4*2048+1*8]) + AS2( mov ecx, esp) + AS2( and esp, 0xfffffff0) + AS2( sub esp, 8*8) + AS1( push ecx) + +#define SSE2_round(a,b,c,x,mul) \ + AS2( pxor c, [x])\ + AS2( movd ecx, c)\ + AS2( movzx edi, cl)\ + AS2( movq mm3, [edx+0*2048+edi*8])\ + AS2( movzx edi, ch)\ + AS2( movq mm4, [edx+3*2048+edi*8])\ + AS2( shr ecx, 16)\ + AS2( movzx edi, cl)\ + AS2( pxor mm3, [edx+1*2048+edi*8])\ + AS2( movzx edi, ch)\ + AS2( pxor mm4, [edx+2*2048+edi*8])\ + AS3( pextrw ecx, c, 2)\ + AS2( movzx edi, cl)\ + AS2( pxor mm3, [edx+2*2048+edi*8])\ + AS2( movzx edi, ch)\ + AS2( pxor mm4, [edx+1*2048+edi*8])\ + AS3( pextrw ecx, c, 3)\ + AS2( movzx edi, cl)\ + AS2( pxor mm3, [edx+3*2048+edi*8])\ + AS2( psubq a, mm3)\ + AS2( movzx edi, ch)\ + AS2( pxor mm4, [edx+0*2048+edi*8])\ + AS2( paddq b, mm4)\ + SSE2_mul_##mul(b) + +#define SSE2_mul_5(b) \ + AS2( movq mm3, b)\ + AS2( psllq b, 2)\ + AS2( paddq b, mm3) + +#define SSE2_mul_7(b) \ + AS2( movq mm3, b)\ + AS2( psllq b, 3)\ + AS2( psubq b, mm3) + +#define SSE2_mul_9(b) \ + AS2( movq mm3, b)\ + AS2( psllq b, 3)\ + AS2( paddq b, mm3) + +#define label2_5 1 +#define label2_7 2 +#define label2_9 3 + +#define SSE2_pass(A,B,C,mul,X) \ + AS2( xor ebx, ebx)\ + ASL(mul)\ + SSE2_round(A,B,C,X+0*8+ebx,mul)\ + SSE2_round(B,C,A,X+1*8+ebx,mul)\ + AS2( cmp ebx, 6*8)\ + ASJ( je, label2_##mul, f)\ + SSE2_round(C,A,B,X+2*8+ebx,mul)\ + AS2( add ebx, 3*8)\ + ASJ( jmp, mul, b)\ + ASL(label2_##mul) + +#define SSE2_key_schedule(Y,X) \ + AS2( movq mm3, [X+7*8])\ + AS2( pxor mm3, mm6)\ + AS2( movq mm4, [X+0*8])\ + AS2( psubq mm4, mm3)\ + AS2( movq [Y+0*8], mm4)\ + AS2( pxor mm4, [X+1*8])\ + AS2( movq mm3, mm4)\ + AS2( movq [Y+1*8], mm4)\ + AS2( paddq mm4, [X+2*8])\ + AS2( pxor mm3, mm7)\ + AS2( psllq mm3, 19)\ + AS2( movq [Y+2*8], mm4)\ + AS2( pxor mm3, mm4)\ + AS2( movq mm4, [X+3*8])\ + AS2( psubq mm4, mm3)\ + AS2( movq [Y+3*8], mm4)\ + AS2( pxor mm4, [X+4*8])\ + AS2( movq mm3, mm4)\ + AS2( movq [Y+4*8], mm4)\ + AS2( paddq mm4, [X+5*8])\ + AS2( pxor mm3, mm7)\ + AS2( psrlq mm3, 23)\ + AS2( movq [Y+5*8], mm4)\ + AS2( pxor mm3, mm4)\ + AS2( movq mm4, [X+6*8])\ + AS2( psubq mm4, mm3)\ + AS2( movq [Y+6*8], mm4)\ + AS2( pxor mm4, [X+7*8])\ + AS2( movq mm3, mm4)\ + AS2( movq [Y+7*8], mm4)\ + AS2( paddq mm4, [Y+0*8])\ + AS2( pxor mm3, mm7)\ + AS2( psllq mm3, 19)\ + AS2( movq [Y+0*8], mm4)\ + AS2( pxor mm3, mm4)\ + AS2( movq mm4, [Y+1*8])\ + AS2( psubq mm4, mm3)\ + AS2( movq [Y+1*8], mm4)\ + AS2( pxor mm4, [Y+2*8])\ + AS2( movq mm3, mm4)\ + AS2( movq [Y+2*8], mm4)\ + AS2( paddq mm4, [Y+3*8])\ + AS2( pxor mm3, mm7)\ + AS2( psrlq mm3, 23)\ + AS2( movq [Y+3*8], mm4)\ + AS2( pxor mm3, mm4)\ + AS2( movq mm4, [Y+4*8])\ + AS2( psubq mm4, mm3)\ + AS2( movq [Y+4*8], mm4)\ + AS2( pxor mm4, [Y+5*8])\ + AS2( movq [Y+5*8], mm4)\ + AS2( paddq mm4, [Y+6*8])\ + AS2( movq [Y+6*8], mm4)\ + AS2( pxor mm4, [edx+4*2048+2*8])\ + AS2( movq mm3, [Y+7*8])\ + AS2( psubq mm3, mm4)\ + AS2( movq [Y+7*8], mm3) + + SSE2_pass(mm0, mm1, mm2, 5, esi) + SSE2_key_schedule(esp+4, esi) + SSE2_pass(mm2, mm0, mm1, 7, esp+4) + SSE2_key_schedule(esp+4, esp+4) + SSE2_pass(mm1, mm2, mm0, 9, esp+4) + + AS2( pxor mm0, [eax+0*8]) + AS2( movq [eax+0*8], mm0) + AS2( psubq mm1, mm5) + AS2( movq [eax+1*8], mm1) + AS2( paddq mm2, [eax+2*8]) + AS2( movq [eax+2*8], mm2) + + AS1( pop esp) + AS1( emms) +#ifdef __GNUC__ + AS1( pop ebx) + ".att_syntax prefix;" + : + : "a" (digest), "S" (X), "d" (table) + : "%ecx", "%edi", "memory", "cc" + ); +#endif + } + else +#endif + { + word64 a = digest[0]; + word64 b = digest[1]; + word64 c = digest[2]; + word64 Y[8]; + #define t1 (table) #define t2 (table+256) #define t3 (table+256*2) @@ -42,15 +217,17 @@ void Tiger::TruncatedFinal(byte *hash, size_t size) b += t4[GETBYTE(c,1)] ^ t3[GETBYTE(c,3)] ^ t2[GETBYTE(c,5)] ^ t1[GETBYTE(c,7)]; \ b *= mul -#define pass(a,b,c,mul,X) \ - round(a,b,c,X[0],mul); \ - round(b,c,a,X[1],mul); \ - round(c,a,b,X[2],mul); \ - round(a,b,c,X[3],mul); \ - round(b,c,a,X[4],mul); \ - round(c,a,b,X[5],mul); \ - round(a,b,c,X[6],mul); \ - round(b,c,a,X[7],mul) +#define pass(a,b,c,mul,X) {\ + int i=0;\ + while (true)\ + {\ + round(a,b,c,X[i+0],mul); \ + round(b,c,a,X[i+1],mul); \ + if (i==6)\ + break;\ + round(c,a,b,X[i+2],mul); \ + i+=3;\ + }} #define key_schedule(Y,X) \ Y[0] = X[0] - (X[7]^W64LIT(0xA5A5A5A5A5A5A5A5)); \ @@ -70,24 +247,16 @@ void Tiger::TruncatedFinal(byte *hash, size_t size) Y[6] += Y[5]; \ Y[7] -= Y[6] ^ W64LIT(0x0123456789ABCDEF) -void Tiger::Transform (word64 *digest, const word64 *X) -{ - word64 a = digest[0]; - word64 b = digest[1]; - word64 c = digest[2]; - word64 Y[8]; - - pass(a,b,c,5,X); - key_schedule(Y,X); - pass(c,a,b,7,Y); - key_schedule(Y,Y); - pass(b,c,a,9,Y); - - digest[0] = a ^ digest[0]; - digest[1] = b - digest[1]; - digest[2] = c + digest[2]; - - memset(Y, 0, sizeof(Y)); + pass(a,b,c,5,X); + key_schedule(Y,X); + pass(c,a,b,7,Y); + key_schedule(Y,Y); + pass(b,c,a,9,Y); + + digest[0] = a ^ digest[0]; + digest[1] = b - digest[1]; + digest[2] = c + digest[2]; + } } NAMESPACE_END |