diff options
author | weidai <weidai@57ff6487-cd31-0410-9ec3-f628ee90f5f0> | 2007-04-15 22:54:31 +0000 |
---|---|---|
committer | weidai <weidai@57ff6487-cd31-0410-9ec3-f628ee90f5f0> | 2007-04-15 22:54:31 +0000 |
commit | 851f567904d2e6ef236fede79b02d638c71e4e2f (patch) | |
tree | d7c8c74edd548369be89d8f6e88d055995ecd95a | |
parent | 6da041704487c4c8d90b3caa5112ff1ecbb62fb2 (diff) | |
download | cryptopp-851f567904d2e6ef236fede79b02d638c71e4e2f.tar.gz |
SSE2 optimizations
git-svn-id: svn://svn.code.sf.net/p/cryptopp/code/trunk/c5@282 57ff6487-cd31-0410-9ec3-f628ee90f5f0
-rw-r--r-- | panama.cpp | 384 | ||||
-rw-r--r-- | panama.h | 20 | ||||
-rwxr-xr-x | salsa.cpp | 369 | ||||
-rwxr-xr-x | salsa.h | 10 | ||||
-rw-r--r-- | strciphr.cpp | 80 | ||||
-rw-r--r-- | strciphr.h | 96 |
6 files changed, 768 insertions, 191 deletions
@@ -3,37 +3,296 @@ #include "pch.h" #include "panama.h" #include "misc.h" +#include "cpu.h" NAMESPACE_BEGIN(CryptoPP) template <class B> void Panama<B>::Reset() { - m_bstart = 0; - memset(m_state, 0, m_state.size()*4); + memset(m_state, 0, m_state.SizeInBytes()); +#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE + m_state[17] = HasSSSE3(); +#endif } +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + +#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code + +void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y) +{ +#ifdef __GNUC__ + __asm__ __volatile__ + ( + ".intel_syntax noprefix;" + AS1( push ebx) +#else + AS2( mov ecx, count) + AS2( mov esi, state) + AS2( mov edi, z) + AS2( mov edx, y) +#endif + AS2( shl ecx, 5) + ASJ( jz, 5, f) + AS2( mov ebx, [esi+4*17]) + AS2( add ecx, ebx) + + AS1( push ebp) + AS1( push ecx) + + AS2( movdqa xmm0, [esi+0*16]) + AS2( movdqa xmm1, [esi+1*16]) + AS2( movdqa xmm2, [esi+2*16]) + AS2( movdqa xmm3, [esi+3*16]) + AS2( mov eax, [esi+4*16]) + + ASL(4) + // gamma and pi +#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE + AS2( test ebx, 1) + ASJ( jnz, 6, f) +#endif + AS2( movdqa xmm6, xmm2) + AS2( movss xmm6, xmm3) + ASS( pshufd xmm5, xmm6, 0, 3, 2, 1) + AS2( movd xmm6, eax) + AS2( movdqa xmm7, xmm3) + AS2( movss xmm7, xmm6) + ASS( pshufd xmm6, xmm7, 0, 3, 2, 1) +#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE + ASJ( jmp, 7, f) + ASL(6) + AS2( movdqa xmm5, xmm3) + AS3( palignr xmm5, xmm2, 4) + AS2( movd xmm6, eax) + AS3( palignr xmm6, xmm3, 4) + ASL(7) +#endif + + AS2( movd ecx, xmm2) + AS1( not ecx) + AS2( movd ebp, xmm3) + AS2( or ecx, ebp) + AS2( xor eax, ecx) + +#define SSE2_Index(i) ASM_MOD(((i)*13+16), 17) + +#define pi(i) \ + AS2( movd ecx, xmm7)\ + AS2( rol ecx, ASM_MOD((ASM_MOD(5*i,17)*(ASM_MOD(5*i,17)+1)/2), 32))\ + AS2( mov [esi+SSE2_Index(ASM_MOD(5*(i), 17))*4], ecx) + +#define pi4(x, y, z, a, b, c, d) \ + AS2( pcmpeqb xmm7, xmm7)\ + AS2( pxor xmm7, x)\ + AS2( por xmm7, y)\ + AS2( pxor xmm7, z)\ + 
pi(a)\ + ASS( pshuflw xmm7, xmm7, 1, 0, 3, 2)\ + pi(b)\ + AS2( punpckhqdq xmm7, xmm7)\ + pi(c)\ + ASS( pshuflw xmm7, xmm7, 1, 0, 3, 2)\ + pi(d) + + pi4(xmm1, xmm2, xmm3, 1, 5, 9, 13) + pi4(xmm0, xmm1, xmm2, 2, 6, 10, 14) + pi4(xmm6, xmm0, xmm1, 3, 7, 11, 15) + pi4(xmm5, xmm6, xmm0, 4, 8, 12, 16) + + // output keystream and update buffer here to hide partial memory stalls between pi and theta + AS2( movdqa xmm4, xmm3) + AS2( punpcklqdq xmm3, xmm2) // 1 5 2 6 + AS2( punpckhdq xmm4, xmm2) // 9 10 13 14 + AS2( movdqa xmm2, xmm1) + AS2( punpcklqdq xmm1, xmm0) // 3 7 4 8 + AS2( punpckhdq xmm2, xmm0) // 11 12 15 16 + + // keystream + AS2( test edi, edi) + ASJ( jz, 0, f) + AS2( movdqa xmm6, xmm4) + AS2( punpcklqdq xmm4, xmm2) + AS2( punpckhqdq xmm6, xmm2) + AS2( test edx, 0xf) + ASJ( jnz, 2, f) + AS2( test edx, edx) + ASJ( jz, 1, f) + AS2( pxor xmm4, [edx]) + AS2( pxor xmm6, [edx+16]) + AS2( add edx, 32) + ASJ( jmp, 1, f) + ASL(2) + AS2( movdqu xmm0, [edx]) + AS2( movdqu xmm2, [edx+16]) + AS2( pxor xmm4, xmm0) + AS2( pxor xmm6, xmm2) + AS2( add edx, 32) + ASL(1) + AS2( test edi, 0xf) + ASJ( jnz, 3, f) + AS2( movdqa [edi], xmm4) + AS2( movdqa [edi+16], xmm6) + AS2( add edi, 32) + ASJ( jmp, 0, f) + ASL(3) + AS2( movdqu [edi], xmm4) + AS2( movdqu [edi+16], xmm6) + AS2( add edi, 32) + ASL(0) + + // buffer update + AS2( lea ecx, [ebx + 32]) + AS2( and ecx, 31*32) + AS2( lea ebp, [ebx + (32-24)*32]) + AS2( and ebp, 31*32) + + AS2( movdqa xmm0, [esi+20*4+ecx+0*8]) + AS2( pxor xmm3, xmm0) + ASS( pshufd xmm0, xmm0, 2, 3, 0, 1) + AS2( movdqa [esi+20*4+ecx+0*8], xmm3) + AS2( pxor xmm0, [esi+20*4+ebp+2*8]) + AS2( movdqa [esi+20*4+ebp+2*8], xmm0) + + AS2( movdqa xmm4, [esi+20*4+ecx+2*8]) + AS2( pxor xmm1, xmm4) + AS2( movdqa [esi+20*4+ecx+2*8], xmm1) + AS2( pxor xmm4, [esi+20*4+ebp+0*8]) + AS2( movdqa [esi+20*4+ebp+0*8], xmm4) + + // theta + AS2( movdqa xmm3, [esi+3*16]) + AS2( movdqa xmm2, [esi+2*16]) + AS2( movdqa xmm1, [esi+1*16]) + AS2( movdqa xmm0, [esi+0*16]) + +#if 
CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE + AS2( test ebx, 1) + ASJ( jnz, 8, f) +#endif + AS2( movd xmm6, eax) + AS2( movdqa xmm7, xmm3) + AS2( movss xmm7, xmm6) + AS2( movdqa xmm6, xmm2) + AS2( movss xmm6, xmm3) + AS2( movdqa xmm5, xmm1) + AS2( movss xmm5, xmm2) + AS2( movdqa xmm4, xmm0) + AS2( movss xmm4, xmm1) + ASS( pshufd xmm7, xmm7, 0, 3, 2, 1) + ASS( pshufd xmm6, xmm6, 0, 3, 2, 1) + ASS( pshufd xmm5, xmm5, 0, 3, 2, 1) + ASS( pshufd xmm4, xmm4, 0, 3, 2, 1) +#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE + ASJ( jmp, 9, f) + ASL(8) + AS2( movd xmm7, eax) + AS3( palignr xmm7, xmm3, 4) + AS2( movq xmm6, xmm3) + AS3( palignr xmm6, xmm2, 4) + AS2( movq xmm5, xmm2) + AS3( palignr xmm5, xmm1, 4) + AS2( movq xmm4, xmm1) + AS3( palignr xmm4, xmm0, 4) + ASL(9) +#endif + + AS2( xor eax, 1) + AS2( movd ecx, xmm0) + AS2( xor eax, ecx) + AS2( movd ecx, xmm3) + AS2( xor eax, ecx) + + AS2( pxor xmm3, xmm2) + AS2( pxor xmm2, xmm1) + AS2( pxor xmm1, xmm0) + AS2( pxor xmm0, xmm7) + AS2( pxor xmm3, xmm7) + AS2( pxor xmm2, xmm6) + AS2( pxor xmm1, xmm5) + AS2( pxor xmm0, xmm4) + + // sigma + AS2( lea ecx, [ebx + (32-4)*32]) + AS2( and ecx, 31*32) + AS2( lea ebp, [ebx + 16*32]) + AS2( and ebp, 31*32) + + AS2( movdqa xmm4, [esi+20*4+ecx+0*16]) + AS2( movdqa xmm5, [esi+20*4+ebp+0*16]) + AS2( movdqa xmm6, xmm4) + AS2( punpcklqdq xmm4, xmm5) + AS2( punpckhqdq xmm6, xmm5) + AS2( pxor xmm3, xmm4) + AS2( pxor xmm2, xmm6) + + AS2( movdqa xmm4, [esi+20*4+ecx+1*16]) + AS2( movdqa xmm5, [esi+20*4+ebp+1*16]) + AS2( movdqa xmm6, xmm4) + AS2( punpcklqdq xmm4, xmm5) + AS2( punpckhqdq xmm6, xmm5) + AS2( pxor xmm1, xmm4) + AS2( pxor xmm0, xmm6) + + // loop + AS2( add ebx, 32) + AS2( cmp ebx, [esp]) + ASJ( jne, 4, b) + + // save state + AS2( mov ebp, [esp+4]) + AS2( add esp, 8) + AS2( mov [esi+4*17], ebx) + AS2( mov [esi+4*16], eax) + AS2( movdqa [esi+3*16], xmm3) + AS2( movdqa [esi+2*16], xmm2) + AS2( movdqa [esi+1*16], xmm1) + AS2( movdqa [esi+0*16], xmm0) + ASL(5) + +#ifdef __GNUC__ + AS1( pop ebx) + ".att_syntax 
prefix;" + : + : "c" (count), "S" (state), "D" (z), "d" (y) + : "%eax", "memory", "cc" + ); +#endif +} + +#endif + template <class B> void Panama<B>::Iterate(size_t count, const word32 *p, word32 *z, const word32 *y) { - unsigned int bstart = m_bstart; - word32 *const a = m_state; -#define c (a+17) -#define b ((Stage *)(a+34)) + word32 bstart = m_state[17]; + word32 *const aPtr = m_state; + word32 cPtr[17]; + +#define bPtr ((byte *)(aPtr+20)) + +// reorder the state for SSE2 +// a and c: 4 8 12 16 | 3 7 11 15 | 2 6 10 14 | 1 5 9 13 | 0 +// xmm0 xmm1 xmm2 xmm3 eax +#define a(i) aPtr[((i)*13+16) % 17] // 13 is inverse of 4 mod 17 +#define c(i) cPtr[((i)*13+16) % 17] +// b: 0 4 | 1 5 | 2 6 | 3 7 +#define b(i, j) b##i[(j)*2%8 + (j)/4] // output -#define OA(i) z[i] = ConditionalByteReverse(B::ToEnum(), a[i+9]) -#define OX(i) z[i] = y[i] ^ ConditionalByteReverse(B::ToEnum(), a[i+9]) +#define OA(i) z[i] = ConditionalByteReverse(B::ToEnum(), a(i+9)) +#define OX(i) z[i] = y[i] ^ ConditionalByteReverse(B::ToEnum(), a(i+9)) // buffer update -#define US(i) {word32 t=b0[i]; b0[i]=ConditionalByteReverse(B::ToEnum(), p[i])^t; b25[(i+6)%8]^=t;} -#define UL(i) {word32 t=b0[i]; b0[i]=a[i+1]^t; b25[(i+6)%8]^=t;} +#define US(i) {word32 t=b(0,i); b(0,i)=ConditionalByteReverse(B::ToEnum(), p[i])^t; b(25,(i+6)%8)^=t;} +#define UL(i) {word32 t=b(0,i); b(0,i)=a(i+1)^t; b(25,(i+6)%8)^=t;} // gamma and pi -#define GP(i) c[5*i%17] = rotlFixed(a[i] ^ (a[(i+1)%17] | ~a[(i+2)%17]), ((5*i%17)*((5*i%17)+1)/2)%32) +#define GP(i) c(5*i%17) = rotlFixed(a(i) ^ (a((i+1)%17) | ~a((i+2)%17)), ((5*i%17)*((5*i%17)+1)/2)%32) // theta and sigma -#define T(i,x) a[i] = c[i] ^ c[(i+1)%17] ^ c[(i+4)%17] ^ x +#define T(i,x) a(i) = c(i) ^ c((i+1)%17) ^ c((i+4)%17) ^ x #define TS1S(i) T(i+1, ConditionalByteReverse(B::ToEnum(), p[i])) -#define TS1L(i) T(i+1, b4[i]) -#define TS2(i) T(i+9, b16[i]) +#define TS1L(i) T(i+1, b(4,i)) +#define TS2(i) T(i+9, b(16,i)) while (count--) { @@ -51,12 +310,11 @@ void 
Panama<B>::Iterate(size_t count, const word32 *p, word32 *z, const word32 * z += 8; } - word32 *const b16 = b[(bstart+16) % STAGES]; - word32 *const b4 = b[(bstart+4) % STAGES]; - bstart = (bstart + STAGES - 1) % STAGES; - word32 *const b0 = b[bstart]; - word32 *const b25 = b[(bstart+25) % STAGES]; - + word32 *const b16 = (word32 *)(bPtr+((bstart+16*32) & 31*32)); + word32 *const b4 = (word32 *)(bPtr+((bstart+(32-4)*32) & 31*32)); + bstart += 32; + word32 *const b0 = (word32 *)(bPtr+((bstart) & 31*32)); + word32 *const b25 = (word32 *)(bPtr+((bstart+(32-25)*32) & 31*32)); if (p) { @@ -67,8 +325,23 @@ void Panama<B>::Iterate(size_t count, const word32 *p, word32 *z, const word32 * UL(0); UL(1); UL(2); UL(3); UL(4); UL(5); UL(6); UL(7); } - GP(0); GP(1); GP(2); GP(3); GP(4); GP(5); GP(6); GP(7); - GP(8); GP(9); GP(10); GP(11); GP(12); GP(13); GP(14); GP(15); GP(16); + GP(0); + GP(1); + GP(2); + GP(3); + GP(4); + GP(5); + GP(6); + GP(7); + GP(8); + GP(9); + GP(10); + GP(11); + GP(12); + GP(13); + GP(14); + GP(15); + GP(16); T(0,1); @@ -84,18 +357,18 @@ void Panama<B>::Iterate(size_t count, const word32 *p, word32 *z, const word32 * TS2(0); TS2(1); TS2(2); TS2(3); TS2(4); TS2(5); TS2(6); TS2(7); } - m_bstart = bstart; + m_state[17] = bstart; } template <class B> -size_t PanamaHash<B>::HashMultipleBlocks(const word32 *input, size_t length) +size_t Weak::PanamaHash<B>::HashMultipleBlocks(const word32 *input, size_t length) { this->Iterate(length / this->BLOCKSIZE, input); return length % this->BLOCKSIZE; } template <class B> -void PanamaHash<B>::TruncatedFinal(byte *hash, size_t size) +void Weak::PanamaHash<B>::TruncatedFinal(byte *hash, size_t size) { this->ThrowIfInvalidTruncatedSize(size); @@ -105,8 +378,10 @@ void PanamaHash<B>::TruncatedFinal(byte *hash, size_t size) this->Iterate(32); // pull - ConditionalByteReverse(B::ToEnum(), this->m_state+9, this->m_state+9, DIGESTSIZE); - memcpy(hash, this->m_state+9, size); + FixedSizeSecBlock<word32, 8> buf; + 
this->Iterate(1, NULL, buf, NULL); + + memcpy(hash, buf, size); this->Restart(); // reinit for next use } @@ -114,31 +389,64 @@ void PanamaHash<B>::TruncatedFinal(byte *hash, size_t size) template <class B> void PanamaCipherPolicy<B>::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length) { - FixedSizeSecBlock<word32, 8> buf; + assert(length==32); + memcpy(m_key, key, 32); +} +template <class B> +void PanamaCipherPolicy<B>::CipherResynchronize(byte *keystreamBuffer, const byte *iv) +{ this->Reset(); - memcpy(buf, key, 32); - this->Iterate(1, buf); - if (length == 64) - memcpy(buf, key+32, 32); + this->Iterate(1, m_key); + if (iv && IsAligned<word32>(iv)) + this->Iterate(1, (const word32 *)iv); + else + { + FixedSizeSecBlock<word32, 8> buf; + if (iv) + memcpy(buf, iv, 32); + else + memset(buf, 0, 32); + this->Iterate(1, buf); + } + +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2()) + Panama_SSE2_Pull(32, this->m_state, NULL, NULL); else - memset(buf, 0, 32); - this->Iterate(1, buf); +#endif + this->Iterate(32); +} - this->Iterate(32); +#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64 +template <class B> +unsigned int PanamaCipherPolicy<B>::GetAlignment() const +{ +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2()) + return 16; + else +#endif + return 1; } +#endif template <class B> void PanamaCipherPolicy<B>::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) { - this->Iterate(iterationCount, NULL, (word32 *)output, (const word32 *)input); +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2()) + Panama_SSE2_Pull(iterationCount, this->m_state, (word32 *)output, (const word32 *)input); + else +#endif + this->Iterate(iterationCount, NULL, (word32 *)output, (const word32 *)input); } template class Panama<BigEndian>; template class Panama<LittleEndian>; -template class PanamaHash<BigEndian>; 
-template class PanamaHash<LittleEndian>; +template class Weak::PanamaHash<BigEndian>; +template class Weak::PanamaHash<LittleEndian>; template class PanamaCipherPolicy<BigEndian>; template class PanamaCipherPolicy<LittleEndian>; @@ -1,8 +1,6 @@ #ifndef CRYPTOPP_PANAMA_H #define CRYPTOPP_PANAMA_H -#include "seckey.h" -#include "secblock.h" #include "strciphr.h" #include "iterhash.h" @@ -20,10 +18,10 @@ protected: typedef word32 Stage[8]; CRYPTOPP_CONSTANT(STAGES = 32) - FixedSizeSecBlock<word32, 17*2 + 32*sizeof(Stage)> m_state; - unsigned int m_bstart; + FixedSizeAlignedSecBlock<word32, 20 + 8*32> m_state; }; +namespace Weak { /// <a href="http://www.weidai.com/scan-mirror/md.html#Panama">Panama Hash</a> template <class B = LittleEndian> class PanamaHash : protected Panama<B>, public AlgorithmImpl<IteratedHash<word32, NativeByteOrder, 32>, PanamaHash<B> > @@ -39,7 +37,9 @@ protected: void Init() {Panama<B>::Reset();} void HashEndianCorrectedBlock(const word32 *data) {this->Iterate(1, data);} // push size_t HashMultipleBlocks(const word32 *input, size_t length); + word32* StateBuf() {return NULL;} }; +} //! MAC construction using a hermetic hash function template <class T_Hash, class T_Info = T_Hash> @@ -94,6 +94,7 @@ protected: SecByteBlock m_key; }; +namespace Weak { /// Panama MAC template <class B = LittleEndian> class PanamaMAC : public HermeticHashFunctionMAC<PanamaHash<B> > @@ -103,10 +104,11 @@ public: PanamaMAC(const byte *key, unsigned int length) {this->SetKey(key, length);} }; +} //! algorithm info template <class B> -struct PanamaCipherInfo : public VariableKeyLength<32, 32, 64, 32, SimpleKeyingInterface::NOT_RESYNCHRONIZABLE> +struct PanamaCipherInfo : public FixedKeyLength<32, SimpleKeyingInterface::UNIQUE_IV, 32> { static const char * StaticAlgorithmName() {return B::ToEnum() == BIG_ENDIAN_ORDER ? 
"Panama-BE" : "Panama-LE";} }; @@ -121,9 +123,15 @@ protected: void CipherSetKey(const NameValuePairs &params, const byte *key, size_t length); void OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount); bool IsRandomAccess() const {return false;} + void CipherResynchronize(byte *keystreamBuffer, const byte *iv); +#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64 + unsigned int GetAlignment() const; +#endif + + FixedSizeSecBlock<word32, 8> m_key; }; -//! <a href="http://www.weidai.com/scan-mirror/cs.html#Panama">Panama Stream Cipher</a> +//! <a href="http://www.cryptolounge.org/wiki/PANAMA">Panama Stream Cipher</a> template <class B = LittleEndian> struct PanamaCipher : public PanamaCipherInfo<B>, public SymmetricCipherDocumentation { @@ -4,6 +4,9 @@ #include "salsa.h" #include "misc.h" #include "argnames.h" +#include "cpu.h" + +#include <emmintrin.h> NAMESPACE_BEGIN(CryptoPP) @@ -14,11 +17,13 @@ void Salsa20_TestInstantiations() void Salsa20_Policy::CipherGetNextIV(byte *IV) { - word32 j6 = m_state[6] + 1; - word32 j7 = m_state[7] + (j6 == 0); + word32 j6, j7; + + j6 = m_state[14] + 1; + j7 = m_state[11] + (j6 == 0); - UnalignedPutWord(LITTLE_ENDIAN_ORDER, IV, j6); - UnalignedPutWord(LITTLE_ENDIAN_ORDER, IV+4, j7); + PutWord(false, LITTLE_ENDIAN_ORDER, IV, j6); + PutWord(false, LITTLE_ENDIAN_ORDER, IV+4, j7); } void Salsa20_Policy::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length) @@ -28,112 +33,304 @@ void Salsa20_Policy::CipherSetKey(const NameValuePairs &params, const byte *key, if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20)) throw InvalidRounds(StaticAlgorithmName(), m_rounds); - GetUserKey(LITTLE_ENDIAN_ORDER, m_state+1, 4, key, 16); - GetUserKey(LITTLE_ENDIAN_ORDER, m_state+11, 4, key + length - 16, 16); + // m_state is reordered for SSE2 + GetBlock<word32, LittleEndian, false> get1(key); + get1(m_state[13])(m_state[10])(m_state[7])(m_state[4]); + GetBlock<word32, LittleEndian, false> get2(key 
+ length - 16); + get2(m_state[15])(m_state[12])(m_state[9])(m_state[6]); - // m_state[0,5,10,15] forms "expand 16-byte k" or "expand 32-byte k" + // "expand 16-byte k" or "expand 32-byte k" m_state[0] = 0x61707865; - m_state[5] = (length == 16) ? 0x3120646e : 0x3320646e; - m_state[10] = (length == 16) ? 0x79622d36 : 0x79622d32; - m_state[15] = 0x6b206574; + m_state[1] = (length == 16) ? 0x3120646e : 0x3320646e; + m_state[2] = (length == 16) ? 0x79622d36 : 0x79622d32; + m_state[3] = 0x6b206574; } void Salsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV) { - GetUserKey(LITTLE_ENDIAN_ORDER, m_state+6, 4, IV, 8); + GetBlock<word32, LittleEndian, false> get(IV); + get(m_state[14])(m_state[11]); + m_state[8] = m_state[5] = 0; } void Salsa20_Policy::SeekToIteration(lword iterationCount) { m_state[8] = (word32)iterationCount; - m_state[9] = (word32)SafeRightShift<32>(iterationCount); + m_state[5] = (word32)SafeRightShift<32>(iterationCount); +} + +#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64 +unsigned int Salsa20_Policy::GetAlignment() const +{ +#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE + if (HasSSE2()) + return 16; + else +#endif + return 1; +} + +unsigned int Salsa20_Policy::GetOptimalBlockSize() const +{ +#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE + if (HasSSE2()) + return 4*BYTES_PER_ITERATION; + else +#endif + return BYTES_PER_ITERATION; } +#endif void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) { - KeystreamOutput<LittleEndian> keystreamOutput(operation, output, input); + int i; +#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE + if (HasSSE2()) + { + __m128i *s = (__m128i *)m_state.data(); + + if (iterationCount >= 4) + { + __m128i ss[16]; + ss[0] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(0, 0, 0, 0)); + ss[1] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(1, 1, 1, 1)); + ss[2] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(2, 2, 2, 2)); + ss[3] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(3, 3, 3, 3)); 
+ ss[4] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(0, 0, 0, 0)); + ss[6] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(2, 2, 2, 2)); + ss[7] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(3, 3, 3, 3)); + ss[9] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(1, 1, 1, 1)); + ss[10] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(2, 2, 2, 2)); + ss[11] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(3, 3, 3, 3)); + ss[12] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(0, 0, 0, 0)); + ss[13] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(1, 1, 1, 1)); + ss[14] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(2, 2, 2, 2)); + ss[15] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(3, 3, 3, 3)); + + do + { + word32 *countersLo = (word32*)&(ss[8]), *countersHi = (word32*)&(ss[5]); + for (i=0; i<4; i++) + { + countersLo[i] = m_state[8]; + countersHi[i] = m_state[5]; + if (++m_state[8] == 0) + ++m_state[5]; + } + + __m128i x0 = ss[0]; + __m128i x1 = ss[1]; + __m128i x2 = ss[2]; + __m128i x3 = ss[3]; + __m128i x4 = ss[4]; + __m128i x5 = ss[5]; + __m128i x6 = ss[6]; + __m128i x7 = ss[7]; + __m128i x8 = ss[8]; + __m128i x9 = ss[9]; + __m128i x10 = ss[10]; + __m128i x11 = ss[11]; + __m128i x12 = ss[12]; + __m128i x13 = ss[13]; + __m128i x14 = ss[14]; + __m128i x15 = ss[15]; + + for (i=m_rounds; i>0; i-=2) + { + #define SSE2_QUARTER_ROUND(a, b, d, i) {\ + __m128i t = _mm_add_epi32(a, d); \ + b = _mm_xor_si128(b, _mm_slli_epi32(t, i)); \ + b = _mm_xor_si128(b, _mm_srli_epi32(t, 32-i));} + + #define QUARTER_ROUND(a, b, c, d) \ + SSE2_QUARTER_ROUND(a, b, d, 7) \ + SSE2_QUARTER_ROUND(b, c, a, 9) \ + SSE2_QUARTER_ROUND(c, d, b, 13) \ + SSE2_QUARTER_ROUND(d, a, c, 18) + + QUARTER_ROUND(x0, x4, x8, x12) + QUARTER_ROUND(x1, x5, x9, x13) + QUARTER_ROUND(x2, x6, x10, x14) + QUARTER_ROUND(x3, x7, x11, x15) + + QUARTER_ROUND(x0, x13, x10, x7) + QUARTER_ROUND(x1, x14, x11, x4) + QUARTER_ROUND(x2, x15, x8, x5) + QUARTER_ROUND(x3, x12, x9, x6) + + #undef QUARTER_ROUND + } + + x0 = _mm_add_epi32(x0, ss[0]); + x1 = _mm_add_epi32(x1, ss[1]); + x2 = _mm_add_epi32(x2, ss[2]); + x3 = 
_mm_add_epi32(x3, ss[3]); + x4 = _mm_add_epi32(x4, ss[4]); + x5 = _mm_add_epi32(x5, ss[5]); + x6 = _mm_add_epi32(x6, ss[6]); + x7 = _mm_add_epi32(x7, ss[7]); + x8 = _mm_add_epi32(x8, ss[8]); + x9 = _mm_add_epi32(x9, ss[9]); + x10 = _mm_add_epi32(x10, ss[10]); + x11 = _mm_add_epi32(x11, ss[11]); + x12 = _mm_add_epi32(x12, ss[12]); + x13 = _mm_add_epi32(x13, ss[13]); + x14 = _mm_add_epi32(x14, ss[14]); + x15 = _mm_add_epi32(x15, ss[15]); + + #define OUTPUT_4(x, a, b, c, d, e, f, g, h) {\ + __m128i t0 = _mm_unpacklo_epi32(a, b);\ + __m128i t1 = _mm_unpacklo_epi32(c, d);\ + __m128i t2 = _mm_unpacklo_epi64(t0, t1);\ + CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, e, t2)\ + t2 = _mm_unpackhi_epi64(t0, t1);\ + CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, f, t2)\ + t0 = _mm_unpackhi_epi32(a, b);\ + t1 = _mm_unpackhi_epi32(c, d);\ + t2 = _mm_unpacklo_epi64(t0, t1);\ + CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, g, t2)\ + t2 = _mm_unpackhi_epi64(t0, t1);\ + CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, h, t2)} + + #define SALSA_OUTPUT(x) \ + OUTPUT_4(x, x0, x13, x10, x7, 0, 4, 8, 12)\ + OUTPUT_4(x, x4, x1, x14, x11, 1, 5, 9, 13)\ + OUTPUT_4(x, x8, x5, x2, x15, 2, 6, 10, 14)\ + OUTPUT_4(x, x12, x9, x6, x3, 3, 7, 11, 15) + + CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, 4*BYTES_PER_ITERATION) + + #undef SALSA_OUTPUT + } while ((iterationCount-=4) >= 4); + } + + if (!IsP4()) while (iterationCount) + { + --iterationCount; + __m128i x0 = s[0]; + __m128i x1 = s[1]; + __m128i x2 = s[2]; + __m128i x3 = s[3]; + + for (i=m_rounds; i>0; i-=2) + { + SSE2_QUARTER_ROUND(x0, x1, x3, 7) + SSE2_QUARTER_ROUND(x1, x2, x0, 9) + SSE2_QUARTER_ROUND(x2, x3, x1, 13) + SSE2_QUARTER_ROUND(x3, x0, x2, 18) + + x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(2, 1, 0, 3)); + x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2)); + x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(0, 3, 2, 1)); + + SSE2_QUARTER_ROUND(x0, x3, x1, 7) + SSE2_QUARTER_ROUND(x3, x2, x0, 9) + SSE2_QUARTER_ROUND(x2, x1, x3, 13) + SSE2_QUARTER_ROUND(x1, x0, x2, 18) + + x1 = _mm_shuffle_epi32(x1, 
_MM_SHUFFLE(0, 3, 2, 1)); + x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2)); + x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(2, 1, 0, 3)); + } + + x0 = _mm_add_epi32(x0, s[0]); + x1 = _mm_add_epi32(x1, s[1]); + x2 = _mm_add_epi32(x2, s[2]); + x3 = _mm_add_epi32(x3, s[3]); + + if (++m_state[8] == 0) + ++m_state[5]; + + CRYPTOPP_ALIGN_DATA(16) static const word32 masks[8] CRYPTOPP_SECTION_ALIGN16 = + {0, 0xffffffff, 0, 0xffffffff, 0xffffffff, 0, 0xffffffff, 0}; + + __m128i k02 = _mm_or_si128(_mm_slli_epi64(x0, 32), _mm_srli_epi64(x3, 32)); + k02 = _mm_shuffle_epi32(k02, _MM_SHUFFLE(0, 1, 2, 3)); + __m128i k13 = _mm_or_si128(_mm_slli_epi64(x1, 32), _mm_srli_epi64(x0, 32)); + k13 = _mm_shuffle_epi32(k13, _MM_SHUFFLE(0, 1, 2, 3)); + __m128i maskLo32 = ((__m128i*)masks)[1], maskHi32 = ((__m128i*)masks)[0]; + __m128i k20 = _mm_or_si128(_mm_and_si128(x2, maskLo32), _mm_and_si128(x1, maskHi32)); + __m128i k31 = _mm_or_si128(_mm_and_si128(x3, maskLo32), _mm_and_si128(x2, maskHi32)); + + __m128i k0 = _mm_unpackhi_epi64(k02, k20); + __m128i k1 = _mm_unpackhi_epi64(k13, k31); + __m128i k2 = _mm_unpacklo_epi64(k20, k02); + __m128i k3 = _mm_unpacklo_epi64(k31, k13); + + #define SSE2_OUTPUT(x) {\ + CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 0, k0)\ + CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 1, k1)\ + CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 2, k2)\ + CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 3, k3)} + + CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SSE2_OUTPUT, BYTES_PER_ITERATION); + } + } +#endif word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - word32 j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15; - - j0 = m_state[0]; - j1 = m_state[1]; - j2 = m_state[2]; - j3 = m_state[3]; - j4 = m_state[4]; - j5 = m_state[5]; - j6 = m_state[6]; - j7 = m_state[7]; - j8 = m_state[8]; - j9 = m_state[9]; - j10 = m_state[10]; - j11 = m_state[11]; - j12 = m_state[12]; - j13 = m_state[13]; - j14 = m_state[14]; - j15 = m_state[15]; - - for (size_t iteration = 0; iteration < iterationCount; 
++iteration) + + while (iterationCount--) { - x0 = j0; - x1 = j1; - x2 = j2; - x3 = j3; - x4 = j4; - x5 = j5; - x6 = j6; - x7 = j7; - x8 = j8; - x9 = j9; - x10 = j10; - x11 = j11; - x12 = j12; - x13 = j13; - x14 = j14; - x15 = j15; - - for (int i=m_rounds; i>0; i-=2) + x0 = m_state[0]; + x1 = m_state[1]; + x2 = m_state[2]; + x3 = m_state[3]; + x4 = m_state[4]; + x5 = m_state[5]; + x6 = m_state[6]; + x7 = m_state[7]; + x8 = m_state[8]; + x9 = m_state[9]; + x10 = m_state[10]; + x11 = m_state[11]; + x12 = m_state[12]; + x13 = m_state[13]; + x14 = m_state[14]; + x15 = m_state[15]; + + for (i=m_rounds; i>0; i-=2) { -#define QUARTER_ROUND(a, b, c, d) \ - b = b ^ rotlFixed(a + d, 7); \ - c = c ^ rotlFixed(b + a, 9); \ - d = d ^ rotlFixed(c + b, 13); \ - a = a ^ rotlFixed(d + c, 18); + #define QUARTER_ROUND(a, b, c, d) \ + b = b ^ rotlFixed(a + d, 7); \ + c = c ^ rotlFixed(b + a, 9); \ + d = d ^ rotlFixed(c + b, 13); \ + a = a ^ rotlFixed(d + c, 18); QUARTER_ROUND(x0, x4, x8, x12) - QUARTER_ROUND(x5, x9, x13, x1) - QUARTER_ROUND(x10, x14, x2, x6) - QUARTER_ROUND(x15, x3, x7, x11) - - QUARTER_ROUND(x0, x1, x2, x3) - QUARTER_ROUND(x5, x6, x7, x4) - QUARTER_ROUND(x10, x11, x8, x9) - QUARTER_ROUND(x15, x12, x13, x14) + QUARTER_ROUND(x1, x5, x9, x13) + QUARTER_ROUND(x2, x6, x10, x14) + QUARTER_ROUND(x3, x7, x11, x15) + + QUARTER_ROUND(x0, x13, x10, x7) + QUARTER_ROUND(x1, x14, x11, x4) + QUARTER_ROUND(x2, x15, x8, x5) + QUARTER_ROUND(x3, x12, x9, x6) } - keystreamOutput (x0 + j0) - (x1 + j1) - (x2 + j2) - (x3 + j3) - (x4 + j4) - (x5 + j5) - (x6 + j6) - (x7 + j7) - (x8 + j8) - (x9 + j9) - (x10 + j10) - (x11 + j11) - (x12 + j12) - (x13 + j13) - (x14 + j14) - (x15 + j15); - - if (++j8 == 0) - ++j9; - } + #define SALSA_OUTPUT(x) {\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\ + 
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);} - m_state[8] = j8; - m_state[9] = j9; + CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION); + + if (++m_state[8] == 0) + ++m_state[5]; + } } NAMESPACE_END @@ -8,7 +8,7 @@ NAMESPACE_BEGIN(CryptoPP) //! _ -struct Salsa20_Info : public VariableKeyLength<32, 16, 32, 16, SimpleKeyingInterface::STRUCTURED_IV, 8> +struct Salsa20_Info : public VariableKeyLength<32, 16, 32, 16, SimpleKeyingInterface::UNIQUE_IV, 8> { static const char *StaticAlgorithmName() {return "Salsa20";} }; @@ -22,13 +22,17 @@ protected: void CipherResynchronize(byte *keystreamBuffer, const byte *IV); bool IsRandomAccess() const {return true;} void SeekToIteration(lword iterationCount); +#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64 + unsigned int GetAlignment() const; + unsigned int GetOptimalBlockSize() const; +#endif private: + FixedSizeAlignedSecBlock<word32, 16> m_state; int m_rounds; - FixedSizeSecBlock<word32, 16> m_state; }; -//! 
Salsa20, variable rounds: 8, 12 or 20 (default 20) +/// <a href="http://www.cryptolounge.org/wiki/Salsa20">Salsa20</a>, variable rounds: 8, 12 or 20 (default 20) struct Salsa20 : public Salsa20_Info, public SymmetricCipherDocumentation { typedef SymmetricCipherFinal<ConcretePolicyHolder<Salsa20_Policy, AdditiveCipherTemplate<> >, Salsa20_Info> Encryption; diff --git a/strciphr.cpp b/strciphr.cpp index b25017e..6294785 100644 --- a/strciphr.cpp +++ b/strciphr.cpp @@ -38,6 +38,57 @@ byte AdditiveCipherTemplate<S>::GenerateByte() } template <class S> +void AdditiveCipherTemplate<S>::GenerateBlock(byte *outString, size_t length) +{ + if (m_leftOver > 0) + { + size_t len = STDMIN(m_leftOver, length); + memcpy(outString, KeystreamBufferEnd()-m_leftOver, len); + length -= len; + m_leftOver -= len; + outString += len; + + if (!length) + return; + } + assert(m_leftOver == 0); + + PolicyInterface &policy = this->AccessPolicy(); + unsigned int bytesPerIteration = policy.GetBytesPerIteration(); + + if (length >= bytesPerIteration) + { + size_t iterations = length / bytesPerIteration; + + policy.WriteKeystream(outString, iterations); + + outString += iterations * bytesPerIteration; + length -= iterations * bytesPerIteration; + + if (!length) + return; + } + + unsigned int bufferByteSize = GetBufferByteSize(policy); + unsigned int bufferIterations = policy.GetIterationsToBuffer(); + + while (length >= bufferByteSize) + { + policy.WriteKeystream(m_buffer, bufferIterations); + memcpy(outString, KeystreamBufferBegin(), bufferByteSize); + length -= bufferByteSize; + outString += bufferByteSize; + } + + if (length > 0) + { + policy.WriteKeystream(m_buffer, bufferIterations); + memcpy(outString, KeystreamBufferBegin(), length); + m_leftOver = bytesPerIteration - length; + } +} + +template <class S> void AdditiveCipherTemplate<S>::ProcessData(byte *outString, const byte *inString, size_t length) { if (m_leftOver > 0) @@ -48,29 +99,26 @@ void AdditiveCipherTemplate<S>::ProcessData(byte 
*outString, const byte *inStrin m_leftOver -= len; inString += len; outString += len; - } - - if (!length) - return; + if (!length) + return; + } assert(m_leftOver == 0); PolicyInterface &policy = this->AccessPolicy(); unsigned int bytesPerIteration = policy.GetBytesPerIteration(); - unsigned int alignment = policy.GetAlignment(); - if (policy.CanOperateKeystream() && length >= bytesPerIteration && IsAlignedOn(outString, alignment)) + if (policy.CanOperateKeystream() && length >= bytesPerIteration) { - if (IsAlignedOn(inString, alignment)) - policy.OperateKeystream(XOR_KEYSTREAM, outString, inString, length / bytesPerIteration); - else - { - memcpy(outString, inString, length); - policy.OperateKeystream(XOR_KEYSTREAM_INPLACE, outString, outString, length / bytesPerIteration); - } - inString += length - length % bytesPerIteration; - outString += length - length % bytesPerIteration; - length %= bytesPerIteration; + size_t iterations = length / bytesPerIteration; + unsigned int alignment = policy.GetAlignment(); + KeystreamOperation operation = KeystreamOperation((IsAlignedOn(inString, alignment) * 2) | (int)IsAlignedOn(outString, alignment)); + + policy.OperateKeystream(operation, outString, inString, iterations); + + inString += iterations * bytesPerIteration; + outString += iterations * bytesPerIteration; + length -= iterations * bytesPerIteration; if (!length) return; @@ -53,14 +53,23 @@ protected: POLICY_INTERFACE & AccessPolicy() {return *this;} }; -enum KeystreamOperation {WRITE_KEYSTREAM, XOR_KEYSTREAM, XOR_KEYSTREAM_INPLACE}; +enum KeystreamOperationFlags {OUTPUT_ALIGNED=1, INPUT_ALIGNED=2, INPUT_NULL = 4}; +enum KeystreamOperation { + WRITE_KEYSTREAM = INPUT_NULL, + WRITE_KEYSTREAM_ALIGNED = INPUT_NULL | OUTPUT_ALIGNED, + XOR_KEYSTREAM = 0, + XOR_KEYSTREAM_INPUT_ALIGNED = INPUT_ALIGNED, + XOR_KEYSTREAM_OUTPUT_ALIGNED= OUTPUT_ALIGNED, + XOR_KEYSTREAM_BOTH_ALIGNED = OUTPUT_ALIGNED | INPUT_ALIGNED}; struct CRYPTOPP_DLL CRYPTOPP_NO_VTABLE 
AdditiveCipherAbstractPolicy { - virtual unsigned int GetAlignment() const =0; + virtual unsigned int GetAlignment() const {return 1;} virtual unsigned int GetBytesPerIteration() const =0; + virtual unsigned int GetOptimalBlockSize() const {return GetBytesPerIteration();} virtual unsigned int GetIterationsToBuffer() const =0; - virtual void WriteKeystream(byte *keystreamBuffer, size_t iterationCount) =0; + virtual void WriteKeystream(byte *keystream, size_t iterationCount) + {OperateKeystream(KeystreamOperation(INPUT_NULL | (KeystreamOperationFlags)IsAlignedOn(keystream, GetAlignment())), keystream, NULL, iterationCount);} virtual bool CanOperateKeystream() const {return false;} virtual void OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) {assert(false);} virtual void CipherSetKey(const NameValuePairs &params, const byte *key, size_t length) =0; @@ -74,59 +83,62 @@ template <typename WT, unsigned int W, unsigned int X = 1, class BASE = Additive struct CRYPTOPP_NO_VTABLE AdditiveCipherConcretePolicy : public BASE { typedef WT WordType; + CRYPTOPP_CONSTANT(BYTES_PER_ITERATION = sizeof(WordType) * W); - unsigned int GetAlignment() const {return sizeof(WordType);} - unsigned int GetBytesPerIteration() const {return sizeof(WordType) * W;} +#if !(CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64) + unsigned int GetAlignment() const {return GetAlignmentOf<WordType>();} +#endif + unsigned int GetBytesPerIteration() const {return BYTES_PER_ITERATION;} unsigned int GetIterationsToBuffer() const {return X;} - void WriteKeystream(byte *buffer, size_t iterationCount) - {OperateKeystream(WRITE_KEYSTREAM, buffer, NULL, iterationCount);} bool CanOperateKeystream() const {return true;} virtual void OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) =0; - - template <class B> - struct KeystreamOutput - { - KeystreamOutput(KeystreamOperation operation, byte *output, const byte *input) - : 
m_operation(operation), m_output(output), m_input(input) {} - - inline KeystreamOutput & operator()(WordType keystreamWord) - { - assert(IsAligned<WordType>(m_input)); - assert(IsAligned<WordType>(m_output)); - - if (!NativeByteOrderIs(B::ToEnum())) - keystreamWord = ByteReverse(keystreamWord); - - if (m_operation == WRITE_KEYSTREAM) - *(WordType*)m_output = keystreamWord; - else if (m_operation == XOR_KEYSTREAM) - { - *(WordType*)m_output = keystreamWord ^ *(WordType*)m_input; - m_input += sizeof(WordType); - } - else if (m_operation == XOR_KEYSTREAM_INPLACE) - *(WordType*)m_output ^= keystreamWord; - - m_output += sizeof(WordType); - - return *this; - } - - KeystreamOperation m_operation; - byte *m_output; - const byte *m_input; - }; }; +// use these to implement OperateKeystream +#define CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, b, i, a) \ + PutWord(bool(x & OUTPUT_ALIGNED), b, output+i*sizeof(WordType), (x & INPUT_NULL) ? a : a ^ GetWord<WordType>(bool(x & INPUT_ALIGNED), b, input+i*sizeof(WordType))); +#define CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, i, a) {\ + __m128i t = (x & INPUT_NULL) ? a : _mm_xor_si128(a, (x & INPUT_ALIGNED) ? 
_mm_load_si128((__m128i *)input+i) : _mm_loadu_si128((__m128i *)input+i));\ + if (x & OUTPUT_ALIGNED) _mm_store_si128((__m128i *)output+i, t);\ + else _mm_storeu_si128((__m128i *)output+i, t);} +#define CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(x, y) \ + switch (operation) \ + { \ + case WRITE_KEYSTREAM: \ + x(WRITE_KEYSTREAM) \ + break; \ + case XOR_KEYSTREAM: \ + x(XOR_KEYSTREAM) \ + input += y; \ + break; \ + case XOR_KEYSTREAM_INPUT_ALIGNED: \ + x(XOR_KEYSTREAM_INPUT_ALIGNED) \ + input += y; \ + break; \ + case XOR_KEYSTREAM_OUTPUT_ALIGNED: \ + x(XOR_KEYSTREAM_OUTPUT_ALIGNED) \ + input += y; \ + break; \ + case WRITE_KEYSTREAM_ALIGNED: \ + x(WRITE_KEYSTREAM_ALIGNED) \ + break; \ + case XOR_KEYSTREAM_BOTH_ALIGNED: \ + x(XOR_KEYSTREAM_BOTH_ALIGNED) \ + input += y; \ + break; \ + } \ + output += y; + template <class BASE = AbstractPolicyHolder<AdditiveCipherAbstractPolicy, TwoBases<SymmetricCipher, RandomNumberGenerator> > > class CRYPTOPP_NO_VTABLE AdditiveCipherTemplate : public BASE { public: byte GenerateByte(); + void GenerateBlock(byte *output, size_t size); void ProcessData(byte *outString, const byte *inString, size_t length); void GetNextIV(byte *iv) {this->AccessPolicy().CipherGetNextIV(iv);} void Resynchronize(const byte *iv); - unsigned int OptimalBlockSize() const {return this->GetPolicy().GetBytesPerIteration();} + unsigned int OptimalBlockSize() const {return this->GetPolicy().GetOptimalBlockSize();} unsigned int GetOptimalNextBlockSize() const {return (unsigned int)this->m_leftOver;} unsigned int OptimalDataAlignment() const {return this->GetPolicy().GetAlignment();} bool IsSelfInverting() const {return true;} |