author     Jeffrey Walton <noloader@gmail.com>    2017-04-12 23:28:41 -0400
committer  Jeffrey Walton <noloader@gmail.com>    2017-04-12 23:28:41 -0400
commit     59767be52e2537bf8460f211f57b4739c9b02dcb (patch)
tree       26b64adaaf58e9a676e3a8165f3005c2c9eb26d9 /aria.cpp
parent     f44e705c16265f89264491756677758866e165dc (diff)
download   cryptopp-git-59767be52e2537bf8460f211f57b4739c9b02dcb.tar.gz
Add Intel and ARM intrinsics
Win32 and Win64 benefited from the Intel intrinsics, and A32 and Aarch64 benefited from the ARM intrinsics. The intrinsics shaved 150 to 350 cycles from key setup.

The intrinsics slowed modern GCC down a small bit and did not appear to affect old GCC, so the Intel intrinsics were enabled only for Microsoft compilers.

We were not able to improve encryption and decryption. In fact, some of the macro conversions and intrinsic attempts slowed things down considerably. For example, GCC 5.4 on x86_64 dropped from 120 MB/s to about 70 MB/s when we tried to improve the code around the Key XOR Layer (ARIA_KXL).
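For illustration only (not part of this commit): a minimal, self-contained sketch of the SSSE3 trick the key setup uses below. A single aligned 128-bit load followed by _mm_shuffle_epi8 with the patch's byte mask reverses the bytes of each 32-bit lane, which matches the four big-endian LoadWord<true> calls it replaces. The key bytes and the scalar reference check are arbitrary test values, not taken from the patch. Build with an SSSE3-capable compiler (for example, -mssse3 with GCC).

    // Illustration only; not part of this patch. Verifies that the pshufb mask
    // used in the key setup performs a big-endian load of four 32-bit words,
    // i.e. it matches the scalar LoadWord<true> path it replaces.
    #include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        alignas(16) const uint8_t mk[16] = {
            0x00,0x01,0x02,0x03, 0x04,0x05,0x06,0x07,
            0x08,0x09,0x0a,0x0b, 0x0c,0x0d,0x0e,0x0f };

        // Mask from the patch: reverses the bytes within each 32-bit lane.
        const __m128i m = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
        const __m128i w = _mm_shuffle_epi8(_mm_load_si128((const __m128i*)mk), m);

        alignas(16) uint32_t w0[4];
        _mm_store_si128((__m128i*)w0, w);

        for (int i = 0; i < 4; ++i) {
            // Scalar big-endian load, the equivalent of LoadWord<true>(mk, i).
            const uint32_t be = (uint32_t(mk[4*i+0]) << 24) | (uint32_t(mk[4*i+1]) << 16) |
                                (uint32_t(mk[4*i+2]) <<  8) |  uint32_t(mk[4*i+3]);
            std::printf("%08x %08x %s\n", w0[i], be, w0[i] == be ? "ok" : "mismatch");
        }
        return 0;
    }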
Diffstat (limited to 'aria.cpp')
-rw-r--r--  aria.cpp  264
1 file changed, 187 insertions, 77 deletions
diff --git a/aria.cpp b/aria.cpp
index 53b39a9f..764cfea7 100644
--- a/aria.cpp
+++ b/aria.cpp
@@ -14,7 +14,11 @@
#include "misc.h"
#include "cpu.h"
-#include <stdio.h>
+// Enable SSE intrinsics for Visual Studio. They reduce key schedule setup by 150
+// to 200 cycles. GCC does fine on its own; the intrinsics slow it down a small bit.
+#if CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE && _MSC_VER
+# define CRYPTOPP_ENABLE_ARIA_INTRINSICS 1
+#endif
ANONYMOUS_NAMESPACE_BEGIN
@@ -313,46 +317,148 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
// w0 has room for 32 bytes. w1-w3 each has room for 16 bytes. t is a 16 byte temp area.
word32 *w0 = m_w.data(), *w1 = m_w.data()+8, *w2 = m_w.data()+12, *w3 = m_w.data()+16, *t = m_w.data()+20;
- w0[0] = LoadWord<true>(mk,0); w0[1] = LoadWord<true>(mk,1);
- w0[2] = LoadWord<true>(mk,2); w0[3] = LoadWord<true>(mk,3);
+#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
+ if (HasSSSE3())
+ {
+ // 7 SSE instructions
+ const __m128i m = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
+ const __m128i w = _mm_shuffle_epi8(_mm_load_si128((const __m128i*)mk), m);
+ _mm_store_si128((__m128i*)w0, w);
- t[0]=w0[0]^KRK[q][0]; t[1]=w0[1]^KRK[q][1];
- t[2]=w0[2]^KRK[q][2]; t[3]=w0[3]^KRK[q][3];
+ _mm_store_si128((__m128i*)t, _mm_xor_si128(w,
+ _mm_load_si128((const __m128i*)KRK[q])));
+ }
+ else
+#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
+ {
+ // 27 integer instructions
+ w0[0] = LoadWord<true>(mk,0); w0[1] = LoadWord<true>(mk,1);
+ w0[2] = LoadWord<true>(mk,2); w0[3] = LoadWord<true>(mk,3);
+
+ t[0]=w0[0]^KRK[q][0]; t[1]=w0[1]^KRK[q][1];
+ t[2]=w0[2]^KRK[q][2]; t[3]=w0[3]^KRK[q][3];
+ }
+
+ // 24 integer instructions
ARIA_FO;
- if (keyBits > 128)
+ if (keyBits == 256)
{
- w1[0] = LoadWord<true>(mk,4);
- w1[1] = LoadWord<true>(mk,5);
-
- if (keyBits > 192)
+#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
+ if (HasSSSE3())
+ {
+ // 3 SSE instructions
+ const __m128i m = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
+ _mm_store_si128(reinterpret_cast<__m128i*>(w1),
+ _mm_shuffle_epi8(_mm_load_si128((const __m128i*)(mk+16)), m));
+ }
+#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
{
+ // 14 integer instructions
+ w1[0] = LoadWord<true>(mk,4);
+ w1[1] = LoadWord<true>(mk,5);
w1[2] = LoadWord<true>(mk,6);
w1[3] = LoadWord<true>(mk,7);
}
+ }
+ else if (keyBits == 192)
+ {
+ w1[0] = LoadWord<true>(mk,4);
+ w1[1] = LoadWord<true>(mk,5);
+ w1[2] = w1[3] = 0;
+ }
+ else
+ {
+#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
+ if (HasSSE2())
+ {
+ _mm_store_si128(reinterpret_cast<__m128i*>(w1), _mm_setzero_si128());
+ }
else
+#endif // CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
{
- w1[2]=w1[3]=0;
+ w1[0]=w1[1]=w1[2]=w1[3]=0;
}
}
- else
+
+#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
+ if (HasSSSE3())
{
- w1[0]=w1[1]=w1[2]=w1[3]=0;
+ // 4 integer, 7 SSE instructions
+ const __m128i x = _mm_xor_si128(
+ _mm_load_si128((const __m128i*)(w1)),
+ _mm_load_si128((const __m128i*)(t)));
+
+ _mm_store_si128((__m128i*)(w1), x);
+ _mm_store_si128((__m128i*)(t), x);
+
+ q = (q==2) ? 0 : (q+1);
+ const __m128i y = _mm_xor_si128(
+ _mm_load_si128((const __m128i*)(t)),
+ _mm_load_si128((const __m128i*)(KRK[q])));
+
+ _mm_store_si128((__m128i*)(t), y);
}
+ else
+#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
+ {
+ // 23 integer instructions
+ w1[0]^=t[0]; w1[1]^=t[1]; w1[2]^=t[2]; w1[3]^=t[3];
+ t[0]=w1[0]; t[1]=w1[1]; t[2]=w1[2]; t[3]=w1[3];
- w1[0]^=t[0]; w1[1]^=t[1]; w1[2]^=t[2]; w1[3]^=t[3];
- t[0]=w1[0]; t[1]=w1[1]; t[2]=w1[2]; t[3]=w1[3];
+ q = (q==2) ? 0 : (q+1);
+ t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
+ }
- q = (q==2) ? 0 : (q+1);
- t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
ARIA_FE;
- t[0]^=w0[0]; t[1]^=w0[1]; t[2]^=w0[2]; t[3]^=w0[3];
- w2[0]=t[0]; w2[1]=t[1]; w2[2]=t[2]; w2[3]=t[3];
- q = (q==2) ? 0 : (q+1);
- t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
+#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
+ if (HasSSSE3())
+ {
+ // 4 integer, 7 SSE instructions
+ const __m128i x = _mm_xor_si128(
+ _mm_load_si128((const __m128i*)(w0)),
+ _mm_load_si128((const __m128i*)(t)));
+
+ _mm_store_si128((__m128i*)(w2), x);
+ _mm_store_si128((__m128i*)(t), x);
+
+ q = (q==2) ? 0 : (q+1);
+ const __m128i y = _mm_xor_si128(
+ _mm_load_si128((const __m128i*)(t)),
+ _mm_load_si128((const __m128i*)(KRK[q])));
+
+ _mm_store_si128((__m128i*)(t), y);
+ }
+ else
+#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
+ {
+ // 23 integer instructions
+ t[0]^=w0[0]; t[1]^=w0[1]; t[2]^=w0[2]; t[3]^=w0[3];
+ w2[0]=t[0]; w2[1]=t[1]; w2[2]=t[2]; w2[3]=t[3];
+
+ q = (q==2) ? 0 : (q+1);
+ t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
+ }
+
ARIA_FO;
- w3[0]=t[0]^w1[0]; w3[1]=t[1]^w1[1]; w3[2]=t[2]^w1[2]; w3[3]=t[3]^w1[3];
+
+#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
+ if (HasSSSE3())
+ {
+ // 3 SSE instructions
+ const __m128i x = _mm_xor_si128(
+ _mm_load_si128((const __m128i*)(w1)),
+ _mm_load_si128((const __m128i*)(t)));
+
+ _mm_store_si128((__m128i*)(w3), x);
+ }
+ else
+#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
+ {
+ // 14 integer instructions
+ w3[0]=t[0]^w1[0]; w3[1]=t[1]^w1[1]; w3[2]=t[2]^w1[2]; w3[3]=t[3]^w1[3];
+ }
#if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
if (HasNEON())
@@ -384,7 +490,7 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
}
}
else
-#endif
+#endif // CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
{
ARIA_GSRK<19>(w0, w1, rk + 0);
ARIA_GSRK<19>(w1, w2, rk + 16);
@@ -423,6 +529,7 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
rk = m_rk.data();
r = R; q = Q;
+ // 32 integer instructions
a=reinterpret_cast<word32*>(rk); z=a+r*4;
t[0]=a[0]; t[1]=a[1]; t[2]=a[2]; t[3]=a[3];
a[0]=z[0]; a[1]=z[1]; a[2]=z[2]; a[3]=z[3];
@@ -432,89 +539,92 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
for (; a<z; a+=4, z-=4)
{
ARIA_M1(a[0],t[0]); ARIA_M1(a[1],t[1]); ARIA_M1(a[2],t[2]); ARIA_M1(a[3],t[3]);
- ARIA_MM(t[0],t[1],t[2],t[3]) ARIA_P(t[0],t[1],t[2],t[3]) ARIA_MM(t[0],t[1],t[2],t[3])
+ ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
s0=t[0]; s1=t[1]; s2=t[2]; s3=t[3];
ARIA_M1(z[0],t[0]); ARIA_M1(z[1],t[1]); ARIA_M1(z[2],t[2]); ARIA_M1(z[3],t[3]);
- ARIA_MM(t[0],t[1],t[2],t[3]) ARIA_P(t[0],t[1],t[2],t[3]) ARIA_MM(t[0],t[1],t[2],t[3])
+ ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
a[0]=t[0]; a[1]=t[1]; a[2]=t[2]; a[3]=t[3];
z[0]=s0; z[1]=s1; z[2]=s2; z[3]=s3;
}
ARIA_M1(a[0],t[0]); ARIA_M1(a[1],t[1]); ARIA_M1(a[2],t[2]); ARIA_M1(a[3],t[3]);
- ARIA_MM(t[0],t[1],t[2],t[3]) ARIA_P(t[0],t[1],t[2],t[3]) ARIA_MM(t[0],t[1],t[2],t[3])
+ ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
z[0]=t[0]; z[1]=t[1]; z[2]=t[2]; z[3]=t[3];
}
}
void ARIA::Base::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
- const byte *i=inBlock, *x=xorBlock;
- byte *o = outBlock;
-
const byte *rk = reinterpret_cast<const byte*>(m_rk.data());
word32 *t = const_cast<word32*>(m_w.data()+20);
- t[0] = LoadWord<true>(i,0); t[1] = LoadWord<true>(i,1);
- t[2] = LoadWord<true>(i,2); t[3] = LoadWord<true>(i,3);
+ // Visual Studio generates bad code within the SSSE3 code block when the block
+ // pointers are cached in locals: it supplies a NULL pointer or a pointer set to
+ // a constant like 0x1000, apparently leftover garbage in an XMM register rather
+ // than the pointer loaded into the integer register on the non-SSE code path.
+ // As a workaround, use inBlock and outBlock directly.
+ t[0] = LoadWord<true>(inBlock,0); t[1] = LoadWord<true>(inBlock,1);
+ t[2] = LoadWord<true>(inBlock,2); t[3] = LoadWord<true>(inBlock,3);
if (m_rounds > 12) {
- ARIA_KXL rk+= 16; ARIA_FO
- ARIA_KXL rk+= 16; ARIA_FE
+ ARIA_KXL; rk+= 16; ARIA_FO;
+ ARIA_KXL; rk+= 16; ARIA_FE;
}
if (m_rounds > 14) {
- ARIA_KXL rk+= 16; ARIA_FO
- ARIA_KXL rk+= 16; ARIA_FE
+ ARIA_KXL; rk+= 16; ARIA_FO;
+ ARIA_KXL; rk+= 16; ARIA_FE;
}
- ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE
- ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE
- ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE
- ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE
- ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE
- ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16;
+ ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE;
+ ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE;
+ ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE;
+ ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE;
+ ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE;
+ ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16;
#ifdef IS_LITTLE_ENDIAN
- o[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] ) ^ rk[ 3];
- o[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8) ^ rk[ 2];
- o[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] ) ^ rk[ 1];
- o[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] ) ^ rk[ 0];
- o[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] ) ^ rk[ 7];
- o[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8) ^ rk[ 6];
- o[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] ) ^ rk[ 5];
- o[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] ) ^ rk[ 4];
- o[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] ) ^ rk[11];
- o[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8) ^ rk[10];
- o[10] = (byte)(S1[ARIA_BRF(t[2],1)] ) ^ rk[ 9];
- o[11] = (byte)(S2[ARIA_BRF(t[2],0)] ) ^ rk[ 8];
- o[12] = (byte)(X1[ARIA_BRF(t[3],3)] ) ^ rk[15];
- o[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8) ^ rk[14];
- o[14] = (byte)(S1[ARIA_BRF(t[3],1)] ) ^ rk[13];
- o[15] = (byte)(S2[ARIA_BRF(t[3],0)] ) ^ rk[12];
+ outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] ) ^ rk[ 3];
+ outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8) ^ rk[ 2];
+ outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] ) ^ rk[ 1];
+ outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] ) ^ rk[ 0];
+ outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] ) ^ rk[ 7];
+ outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8) ^ rk[ 6];
+ outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] ) ^ rk[ 5];
+ outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] ) ^ rk[ 4];
+ outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] ) ^ rk[11];
+ outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8) ^ rk[10];
+ outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)] ) ^ rk[ 9];
+ outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)] ) ^ rk[ 8];
+ outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)] ) ^ rk[15];
+ outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8) ^ rk[14];
+ outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)] ) ^ rk[13];
+ outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)] ) ^ rk[12];
#else
#define ARIA_WORD(X,Y) (((word32 *)(X))[Y])
- o[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] );
- o[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8);
- o[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] );
- o[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] );
- o[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] );
- o[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8);
- o[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] );
- o[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] );
- o[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] );
- o[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8);
- o[10] = (byte)(S1[ARIA_BRF(t[2],1)] );
- o[11] = (byte)(S2[ARIA_BRF(t[2],0)] );
- o[12] = (byte)(X1[ARIA_BRF(t[3],3)] );
- o[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8);
- o[14] = (byte)(S1[ARIA_BRF(t[3],1)] );
- o[15] = (byte)(S2[ARIA_BRF(t[3],0)] );
- ARIA_WORD(o,0)^=LoadWord<true>(rk,0); ARIA_WORD(o,1)^=LoadWord<true>(rk,1);
- ARIA_WORD(o,2)^=LoadWord<true>(rk,2); ARIA_WORD(o,3)^=LoadWord<true>(rk,3);
+ outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] );
+ outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8);
+ outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] );
+ outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] );
+ outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] );
+ outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8);
+ outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] );
+ outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] );
+ outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] );
+ outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8);
+ outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)] );
+ outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)] );
+ outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)] );
+ outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8);
+ outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)] );
+ outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)] );
+ ARIA_WORD(outBlock,0)^=LoadWord<true>(rk,0);
+ ARIA_WORD(outBlock,1)^=LoadWord<true>(rk,1);
+ ARIA_WORD(outBlock,2)^=LoadWord<true>(rk,2);
+ ARIA_WORD(outBlock,3)^=LoadWord<true>(rk,3);
#endif
- if (x)
+ if (xorBlock)
for (unsigned int n=0; n<16; ++n)
- o[n] ^= x[n];
+ outBlock[n] ^= xorBlock[n];
}
NAMESPACE_END
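For context only (also not part of the patch): a minimal usage sketch that exercises the key schedule path modified above through the public block cipher interface. It assumes the ARIA::Encryption/Decryption classes declared in aria.h and the AutoSeededRandomPool/SecByteBlock helpers from this source tree; the round trip is a sanity check, not a benchmark of the throughput numbers quoted in the commit message.

    // Usage sketch only; assumes aria.h, osrng.h and secblock.h from this tree.
    #include "aria.h"
    #include "osrng.h"
    #include "secblock.h"
    #include <cstring>
    #include <iostream>

    int main()
    {
        using namespace CryptoPP;

        AutoSeededRandomPool prng;
        SecByteBlock key(ARIA::MAX_KEYLENGTH);          // 32 bytes: 256-bit key schedule
        prng.GenerateBlock(key, key.size());

        byte in[ARIA::BLOCKSIZE] = {0};
        byte out[ARIA::BLOCKSIZE], back[ARIA::BLOCKSIZE];

        ARIA::Encryption enc;
        enc.SetKey(key, key.size());                    // drives UncheckedSetKey above
        enc.ProcessBlock(in, out);

        ARIA::Decryption dec;
        dec.SetKey(key, key.size());
        dec.ProcessBlock(out, back);

        std::cout << (std::memcmp(in, back, ARIA::BLOCKSIZE) == 0 ? "round trip ok"
                                                                  : "round trip failed")
                  << std::endl;
        return 0;
    }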