From 5cad605559f973f2ce7d5ede1e6b67151f8e1d51 Mon Sep 17 00:00:00 2001
From: weidai
Date: Mon, 16 Apr 2007 00:21:07 +0000
Subject: optimizations

---
 gf2_32.cpp |   2 +-
 misc.h     | 244 +++++++++++++++++++++++++++++++++++++++++++++++++------------
 seal.cpp   |  14 ++--
 seal.h     |   2 +-
 trunhash.h |   2 +
 words.h    |   3 +-
 6 files changed, 209 insertions(+), 58 deletions(-)

diff --git a/gf2_32.cpp b/gf2_32.cpp
index fd2661b5..ae4874a4 100644
--- a/gf2_32.cpp
+++ b/gf2_32.cpp
@@ -22,7 +22,7 @@ GF2_32::Element GF2_32::Multiply(Element a, Element b) const
 		table[3] = m_modulus ^ (a<<1);
 	}
 
-#ifdef FAST_ROTATE
+#if CRYPTOPP_FAST_ROTATE(32)
 	b = rotrFixed(b, 30U);
 	word32 result = table[b&2];
diff --git a/misc.h b/misc.h
index 2c71deb4..c898040f 100644
--- a/misc.h
+++ b/misc.h
@@ -4,14 +4,39 @@
 #include "cryptlib.h"
 #include "smartptr.h"
 
-#ifdef INTEL_INTRINSICS
-#include <stdlib.h>
+#ifdef _MSC_VER
+	#include <stdlib.h>
+	#if _MSC_VER >= 1400
+		// VC2005 workaround: disable declarations that conflict with winnt.h
+		#define _interlockedbittestandset CRYPTOPP_DISABLED_INTRINSIC_1
+		#define _interlockedbittestandreset CRYPTOPP_DISABLED_INTRINSIC_2
+		#include <intrin.h>
+		#undef _interlockedbittestandset
+		#undef _interlockedbittestandreset
+		#define CRYPTOPP_FAST_ROTATE(x) 1
+	#elif _MSC_VER >= 1300
+		#define CRYPTOPP_FAST_ROTATE(x) ((x) == 32 || (x) == 64)
+	#else
+		#define CRYPTOPP_FAST_ROTATE(x) ((x) == 32)
+	#endif
+#elif (defined(__MWERKS__) && TARGET_CPU_PPC) || \
+	(defined(__GNUC__) && (defined(_ARCH_PWR2) || defined(_ARCH_PWR) || defined(_ARCH_PPC) || defined(_ARCH_PPC64) || defined(_ARCH_COM)))
+	#define CRYPTOPP_FAST_ROTATE(x) ((x) == 32)
+#elif defined(__GNUC__) && (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X86)	// depend on GCC's peephole optimization to generate rotate instructions
+	#define CRYPTOPP_FAST_ROTATE(x) 1
+#else
+	#define CRYPTOPP_FAST_ROTATE(x) 0
 #endif
 
 #ifdef __BORLANDC__
 #include <stdlib.h>
 #endif
 
+#if defined(__GNUC__) && !defined(__sun__) && !defined(__MINGW32__)
+#define CRYPTOPP_BYTESWAP_AVAILABLE
+#include <byteswap.h>
+#endif
+
 NAMESPACE_BEGIN(CryptoPP)
 
 // ************** compile-time assertion ***************
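The last __GNUC__ branch above names no intrinsic at all: it relies on GCC's peephole optimizer to turn the plain shift/or rotate idiom into a single rotate instruction on x86/x64. A minimal sketch of that idiom, assuming a 32-bit unsigned int (the function name is illustrative, not part of the patch):

    typedef unsigned int word32;

    // For 0 < y < 32, GCC compiles this whole expression to one "rol"
    // instruction on x86/x64, which is what the __GNUC__ branch of
    // CRYPTOPP_FAST_ROTATE counts on.
    inline word32 rotl32(word32 x, unsigned int y)
    {
        return (x << y) | (x >> (32 - y));  // precondition: 0 < y < 32
    }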
@@ -289,8 +314,13 @@ inline T1 RoundUpToMultipleOf(const T1 &n, const T2 &m)
 }
 
 template <class T>
-inline unsigned int GetAlignment(T *dummy=NULL)	// VC60 workaround
+inline unsigned int GetAlignmentOf(T *dummy=NULL)	// VC60 workaround
 {
+#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X86
+	if (sizeof(T) < 16)
+		return 1;	// alignment not needed on x86 and x64
+#endif
+
 #if (_MSC_VER >= 1300)
 	return __alignof(T);
 #elif defined(__GNUC__)
@@ -304,13 +334,13 @@ inline unsigned int GetAlignment(T *dummy=NULL)	// VC60 workaround
 
 inline bool IsAlignedOn(const void *p, unsigned int alignment)
 {
-	return IsPowerOf2(alignment) ? ModPowerOf2((size_t)p, alignment) == 0 : (size_t)p % alignment == 0;
+	return alignment==1 || (IsPowerOf2(alignment) ? ModPowerOf2((size_t)p, alignment) == 0 : (size_t)p % alignment == 0);
 }
 
 template <class T>
 inline bool IsAligned(const void *p, T *dummy=NULL)	// VC60 workaround
 {
-	return IsAlignedOn(p, GetAlignment<T>());
+	return IsAlignedOn(p, GetAlignmentOf<T>());
 }
 
 #ifdef IS_LITTLE_ENDIAN
@@ -418,31 +448,29 @@ template <class T> inline T rotrMod(T x, unsigned int y)
 	return T((x>>y) | (x<<(sizeof(T)*8-y)));
 }
 
-#ifdef INTEL_INTRINSICS
-
-#pragma intrinsic(_lrotl, _lrotr)
+#ifdef _MSC_VER
 
 template<> inline word32 rotlFixed(word32 x, unsigned int y)
 {
-	assert(y < 32);
+	assert(y < 8*sizeof(x));
 	return y ? _lrotl(x, y) : x;
 }
 
 template<> inline word32 rotrFixed(word32 x, unsigned int y)
 {
-	assert(y < 32);
+	assert(y < 8*sizeof(x));
 	return y ? _lrotr(x, y) : x;
 }
 
 template<> inline word32 rotlVariable(word32 x, unsigned int y)
 {
-	assert(y < 32);
+	assert(y < 8*sizeof(x));
 	return _lrotl(x, y);
 }
 
 template<> inline word32 rotrVariable(word32 x, unsigned int y)
 {
-	assert(y < 32);
+	assert(y < 8*sizeof(x));
 	return _lrotr(x, y);
 }
 
@@ -456,9 +484,119 @@ template<> inline word32 rotrMod(word32 x, unsigned int y)
 	return _lrotr(x, y);
 }
 
-#endif // #ifdef INTEL_INTRINSICS
+#if _MSC_VER >= 1300
+
+template<> inline word64 rotlFixed(word64 x, unsigned int y)
+{
+	assert(y < 8*sizeof(x));
+	return y ? _rotl64(x, y) : x;
+}
+
+template<> inline word64 rotrFixed(word64 x, unsigned int y)
+{
+	assert(y < 8*sizeof(x));
+	return y ? _rotr64(x, y) : x;
+}
+
+template<> inline word64 rotlVariable(word64 x, unsigned int y)
+{
+	assert(y < 8*sizeof(x));
+	return _rotl64(x, y);
+}
+
+template<> inline word64 rotrVariable(word64 x, unsigned int y)
+{
+	assert(y < 8*sizeof(x));
+	return _rotr64(x, y);
+}
+
+template<> inline word64 rotlMod(word64 x, unsigned int y)
+{
+	return _rotl64(x, y);
+}
+
+template<> inline word64 rotrMod(word64 x, unsigned int y)
+{
+	return _rotr64(x, y);
+}
+
+#endif // #if _MSC_VER >= 1300
+
+#if _MSC_VER >= 1400 && (!defined(__INTEL_COMPILER) || __INTEL_COMPILER >= 1000)
+
+template<> inline word16 rotlFixed(word16 x, unsigned int y)
+{
+	assert(y < 8*sizeof(x));
+	return y ? _rotl16(x, y) : x;
+}
+
+template<> inline word16 rotrFixed(word16 x, unsigned int y)
+{
+	assert(y < 8*sizeof(x));
+	return y ? _rotr16(x, y) : x;
+}
-#ifdef PPC_INTRINSICS
+
+template<> inline word16 rotlVariable(word16 x, unsigned int y)
+{
+	assert(y < 8*sizeof(x));
+	return _rotl16(x, y);
+}
+
+template<> inline word16 rotrVariable(word16 x, unsigned int y)
+{
+	assert(y < 8*sizeof(x));
+	return _rotr16(x, y);
+}
+
+template<> inline word16 rotlMod(word16 x, unsigned int y)
+{
+	return _rotl16(x, y);
+}
+
+template<> inline word16 rotrMod(word16 x, unsigned int y)
+{
+	return _rotr16(x, y);
+}
+
+template<> inline byte rotlFixed(byte x, unsigned int y)
+{
+	assert(y < 8*sizeof(x));
+	return y ? _rotl8(x, y) : x;
+}
+
+template<> inline byte rotrFixed(byte x, unsigned int y)
+{
+	assert(y < 8*sizeof(x));
+	return y ? _rotr8(x, y) : x;
+}
+
+template<> inline byte rotlVariable(byte x, unsigned int y)
+{
+	assert(y < 8*sizeof(x));
+	return _rotl8(x, y);
+}
+
+template<> inline byte rotrVariable(byte x, unsigned int y)
+{
+	assert(y < 8*sizeof(x));
+	return _rotr8(x, y);
+}
+
+template<> inline byte rotlMod(byte x, unsigned int y)
+{
+	return _rotl8(x, y);
+}
+
+template<> inline byte rotrMod(byte x, unsigned int y)
+{
+	return _rotr8(x, y);
+}
+
+#endif // #if _MSC_VER >= 1400
+
+#endif // #ifdef _MSC_VER
+
+#if (defined(__MWERKS__) && TARGET_CPU_PPC)
 
 template<> inline word32 rotlFixed(word32 x, unsigned int y)
 {
@@ -494,7 +632,7 @@ template<> inline word32 rotrMod(word32 x, unsigned int y)
 	return (__rlwnm(x,32-y,0,31));
 }
 
-#endif // #ifdef PPC_INTRINSICS
+#endif // #if (defined(__MWERKS__) && TARGET_CPU_PPC)
 
 // ************** endian reversal ***************
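The generic rotlFixed/rotrFixed templates shift by sizeof(T)*8 - y, which is undefined behavior when y is 0, so the specializations above keep a `y ? intrinsic : x` guard. A branch-free formulation that is also well defined at y == 0, shown only as a sketch for comparison (not what the patch uses):

    typedef unsigned int word32;

    // Masking the complement count keeps both shift amounts in [0, 31],
    // so y == 0 needs no special case; compilers still recognize the
    // pattern and emit a single rotate instruction.
    inline word32 rotl32_nobranch(word32 x, unsigned int y)
    {
        return (x << (y & 31)) | (x >> ((32 - y) & 31));  // precondition: y < 32
    }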
@@ -514,15 +652,27 @@ inline byte ByteReverse(byte value)
 
 inline word16 ByteReverse(word16 value)
 {
+#ifdef CRYPTOPP_BYTESWAP_AVAILABLE
+	return bswap_16(value);
+#elif defined(_MSC_VER) && _MSC_VER >= 1300
+	return _byteswap_ushort(value);
+#else
 	return rotlFixed(value, 8U);
+#endif
 }
 
 inline word32 ByteReverse(word32 value)
 {
-#ifdef PPC_INTRINSICS
-	// PPC: load reverse indexed instruction
+#if defined(__GNUC__) && defined(CRYPTOPP_X86_ASM_AVAILABLE)
+	__asm__ ("bswap %0" : "=r" (value) : "0" (value));
+	return value;
+#elif defined(CRYPTOPP_BYTESWAP_AVAILABLE)
+	return bswap_32(value);
+#elif defined(__MWERKS__) && TARGET_CPU_PPC
 	return (word32)__lwbrx(&value,0);
-#elif defined(FAST_ROTATE)
+#elif defined(_MSC_VER) && _MSC_VER >= 1300
+	return _byteswap_ulong(value);
+#elif CRYPTOPP_FAST_ROTATE(32)
 	// 5 instructions with rotate instruction, 9 without
 	return (rotrFixed(value, 8U) & 0xff00ff00) | (rotlFixed(value, 8U) & 0x00ff00ff);
 #else
@@ -535,7 +685,14 @@ inline word32 ByteReverse(word32 value)
 #ifdef WORD64_AVAILABLE
 inline word64 ByteReverse(word64 value)
 {
-#ifdef CRYPTOPP_SLOW_WORD64
+#if defined(__GNUC__) && defined(CRYPTOPP_X86_ASM_AVAILABLE) && defined(__x86_64__)
+	__asm__ ("bswap %0" : "=r" (value) : "0" (value));
+	return value;
+#elif defined(CRYPTOPP_BYTESWAP_AVAILABLE)
+	return bswap_64(value);
+#elif defined(_MSC_VER) && _MSC_VER >= 1300
+	return _byteswap_uint64(value);
+#elif defined(CRYPTOPP_SLOW_WORD64)
 	return (word64(ByteReverse(word32(value))) << 32) | ByteReverse(word32(value>>32));
 #else
 	value = ((value & W64LIT(0xFF00FF00FF00FF00)) >> 8) | ((value & W64LIT(0x00FF00FF00FF00FF)) << 8);
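Note the comment in the word32 path: with a native rotate, the fallback byte swap costs 5 instructions. The identity it uses places two bytes with each rotate and selects them with masks. A self-contained sketch of the same identity, with an assertion as a quick check (names are illustrative, not part of the patch):

    #include <cassert>
    typedef unsigned int word32;

    inline word32 rotl32(word32 x, unsigned int y) { return (x << y) | (x >> (32 - y)); }
    inline word32 rotr32(word32 x, unsigned int y) { return (x >> y) | (x << (32 - y)); }

    // Same identity as the CRYPTOPP_FAST_ROTATE(32) branch of ByteReverse:
    // rotating right by 8 lands bytes 0 and 2 in their swapped positions
    // (kept by 0xff00ff00); rotating left by 8 lands bytes 1 and 3
    // (kept by 0x00ff00ff).
    inline word32 ByteSwap32(word32 x)
    {
        return (rotr32(x, 8) & 0xff00ff00) | (rotl32(x, 8) & 0x00ff00ff);
    }

    int main()
    {
        assert(ByteSwap32(0x01020304) == 0x04030201);
        return 0;
    }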
@@ -637,6 +794,7 @@ inline void GetUserKey(ByteOrder order, T *out, size_t outlen, const byte *in, size_t inlen)
 	ConditionalByteReverse(order, out, out, RoundUpToMultipleOf(inlen, U));
 }
 
+#ifndef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
 inline byte UnalignedGetWordNonTemplate(ByteOrder order, const byte *block, byte*)
 {
 	return block[0];
@@ -681,18 +839,12 @@ inline word64 UnalignedGetWordNonTemplate(ByteOrder order, const byte *block, word64*)
 }
 #endif
 
-template <class T>
-inline T UnalignedGetWord(ByteOrder order, const byte *block, T *dummy=NULL)
-{
-	return UnalignedGetWordNonTemplate(order, block, dummy);
-}
-
-inline void UnalignedPutWord(ByteOrder order, byte *block, byte value, const byte *xorBlock = NULL)
+inline void UnalignedPutWordNonTemplate(ByteOrder order, byte *block, byte value, const byte *xorBlock)
 {
 	block[0] = xorBlock ? (value ^ xorBlock[0]) : value;
 }
 
-inline void UnalignedPutWord(ByteOrder order, byte *block, word16 value, const byte *xorBlock = NULL)
+inline void UnalignedPutWordNonTemplate(ByteOrder order, byte *block, word16 value, const byte *xorBlock)
 {
 	if (order == BIG_ENDIAN_ORDER)
 	{
@@ -712,7 +864,7 @@ inline void UnalignedPutWord(ByteOrder order, byte *block, word16 value, const byte *xorBlock = NULL)
 	}
 }
 
-inline void UnalignedPutWord(ByteOrder order, byte *block, word32 value, const byte *xorBlock = NULL)
+inline void UnalignedPutWordNonTemplate(ByteOrder order, byte *block, word32 value, const byte *xorBlock)
 {
 	if (order == BIG_ENDIAN_ORDER)
 	{
@@ -739,7 +891,7 @@ inline void UnalignedPutWord(ByteOrder order, byte *block, word32 value, const byte *xorBlock = NULL)
 }
 
 #ifdef WORD64_AVAILABLE
-inline void UnalignedPutWord(ByteOrder order, byte *block, word64 value, const byte *xorBlock = NULL)
+inline void UnalignedPutWordNonTemplate(ByteOrder order, byte *block, word64 value, const byte *xorBlock)
 {
 	if (order == BIG_ENDIAN_ORDER)
 	{
@@ -777,17 +929,17 @@ inline void UnalignedPutWord(ByteOrder order, byte *block, word64 value, const byte *xorBlock = NULL)
 	}
 }
 #endif
+#endif	// #ifndef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
 
 template <class T>
 inline T GetWord(bool assumeAligned, ByteOrder order, const byte *block)
 {
-	if (assumeAligned)
-	{
-		assert(IsAligned<T>(block));
-		return ConditionalByteReverse(order, *reinterpret_cast<const T *>(block));
-	}
-	else
-		return UnalignedGetWord<T>(order, block);
+#ifndef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
+	if (!assumeAligned)
+		return UnalignedGetWordNonTemplate(order, block, (T*)NULL);
+	assert(IsAligned<T>(block));
+#endif
+	return ConditionalByteReverse(order, *reinterpret_cast<const T *>(block));
 }
 
 template <class T>
@@ -799,17 +951,13 @@ inline void GetWord(bool assumeAligned, ByteOrder order, T &result, const byte *block)
 
 template <class T>
 inline void PutWord(bool assumeAligned, ByteOrder order, byte *block, T value, const byte *xorBlock = NULL)
 {
-	if (assumeAligned)
-	{
-		assert(IsAligned<T>(block));
-		assert(IsAligned<T>(xorBlock));
-		if (xorBlock)
-			*reinterpret_cast<T *>(block) = ConditionalByteReverse(order, value) ^ *reinterpret_cast<const T *>(xorBlock);
-		else
-			*reinterpret_cast<T *>(block) = ConditionalByteReverse(order, value);
-	}
-	else
-		UnalignedPutWord(order, block, value, xorBlock);
+#ifndef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
+	if (!assumeAligned)
+		return UnalignedPutWordNonTemplate(order, block, value, xorBlock);
+	assert(IsAligned<T>(block));
+	assert(IsAligned<T>(xorBlock));
+#endif
+	*reinterpret_cast<T *>(block) = ConditionalByteReverse(order, value) ^ (xorBlock ? *reinterpret_cast<const T *>(xorBlock) : 0);
 }
 
 template <class T>
@@ -927,4 +1075,4 @@ inline T SafeLeftShift(T value)
 
 NAMESPACE_END
 
-#endif // MISC_H
+#endif
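When CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined, GetWord and PutWord above type-pun through reinterpret_cast, which presumes the platform tolerates unaligned word access. A sketch of the standards-clean alternative for such reads (memcpy-based; the helper name is illustrative, not part of the patch):

    #include <cstring>
    typedef unsigned int word32;

    // memcpy is defined behavior for any alignment, and compilers lower a
    // fixed-size copy like this to a single load on x86/x64.
    inline word32 LoadWord32(const unsigned char *p)
    {
        word32 v;
        std::memcpy(&v, p, sizeof(v));  // no alignment assumption
        return v;  // native byte order; byte-reverse afterwards if needed
    }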
diff --git a/seal.cpp b/seal.cpp
index f240b50a..196965f8 100644
--- a/seal.cpp
+++ b/seal.cpp
@@ -71,7 +71,7 @@ void SEAL_Policy<B>::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
 template <class B>
 void SEAL_Policy<B>::CipherResynchronize(byte *keystreamBuffer, const byte *IV)
 {
-	m_outsideCounter = IV ? UnalignedGetWord<word32>(BIG_ENDIAN_ORDER, IV) : 0;
+	m_outsideCounter = IV ? GetWord<word32>(false, BIG_ENDIAN_ORDER, IV) : 0;
 	m_startCount = m_outsideCounter;
 	m_insideCounter = 0;
 }
@@ -86,7 +86,6 @@ void SEAL_Policy<B>::SeekToIteration(lword iterationCount)
 template <class B>
 void SEAL_Policy<B>::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
 {
-	KeystreamOutput<B> keystreamOutput(operation, output, input);
 	word32 a, b, c, d, n1, n2, n3, n4;
 	unsigned int p, q;
 
@@ -175,10 +174,13 @@ void SEAL_Policy<B>::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
 		d = rotrFixed(d, 9U);
 		a += Ttab(q);
 
-		keystreamOutput	(b + m_S[4*i+0])
-						(c ^ m_S[4*i+1])
-						(d + m_S[4*i+2])
-						(a ^ m_S[4*i+3]);
+#define SEAL_OUTPUT(x)	\
+	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, B::ToEnum(), 0, b + m_S[4*i+0]);\
+	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, B::ToEnum(), 1, c ^ m_S[4*i+1]);\
+	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, B::ToEnum(), 2, d + m_S[4*i+2]);\
+	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, B::ToEnum(), 3, a ^ m_S[4*i+3]);
+
+		CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SEAL_OUTPUT, 4*4);
 
 		if (i & 1)
 		{
diff --git a/seal.h b/seal.h
index fc53c2f0..551a3382 100644
--- a/seal.h
+++ b/seal.h
@@ -18,7 +18,7 @@
 class CRYPTOPP_NO_VTABLE SEAL_Policy : public AdditiveCipherConcretePolicy<word32, 256>
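The CipherResynchronize change above replaces the removed UnalignedGetWord<word32> with GetWord<word32>(false, BIG_ENDIAN_ORDER, IV), i.e. an unaligned big-endian read of the IV. A self-contained sketch of what that read computes (the stand-in function is illustrative, not Crypto++ API):

    #include <cassert>
    typedef unsigned char byte;
    typedef unsigned int word32;

    // Equivalent of GetWord<word32>(false, BIG_ENDIAN_ORDER, iv): assemble a
    // big-endian word32 from four bytes, independent of host endianness.
    inline word32 GetBigEndianWord32(const byte *p)
    {
        return (word32(p[0]) << 24) | (word32(p[1]) << 16) | (word32(p[2]) << 8) | word32(p[3]);
    }

    int main()
    {
        const byte iv[4] = {0x01, 0x02, 0x03, 0x04};
        assert(GetBigEndianWord32(iv) == 0x01020304);
        return 0;
    }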