summaryrefslogtreecommitdiff
path: root/chacha.cpp
diff options
context:
space:
mode:
authorJeffrey Walton <noloader@gmail.com>2018-11-04 14:49:26 -0500
committerJeffrey Walton <noloader@gmail.com>2018-11-04 14:49:26 -0500
commit29be6ed97a86380dc1cb472afcdf2cc9b5706509 (patch)
treedf3f0337290015d0baba5a2c54102cf772d26cc1 /chacha.cpp
parentd7a3562c0b288d8f35dd2365dcf17376c381b36c (diff)
downloadcryptopp-git-29be6ed97a86380dc1cb472afcdf2cc9b5706509.tar.gz
Work-around potential counter increment problem in ChaCha20 (GH #732)
This is only a work-around for the moment. The issue only affects SIMD code. The problem is, the algorithm we use performs a 32-bit add as an intermediate result, but we really need a 64-bit add. We are running 4 transforms in parallel, and we can't add and carry the way we need to. The workaround is, whenever we could cross the 32-bit counter boundary we use the C version of the transform. We determine the cross-over point by 'bool safe = 0xffffffff - state.low > 4'. When not safe we skip the SIMD version of the algorithm and use the C version. Once we are safe again we use the SIMD version again. The work-around costs us about 0.1 to 0.2 cpb. At 1.10 or 1.15 cpb that equates to about 200 MB/s on a Skylake. We'd like to get it back eventually.
Diffstat (limited to 'chacha.cpp')
-rw-r--r--chacha.cpp193
1 files changed, 103 insertions, 90 deletions
diff --git a/chacha.cpp b/chacha.cpp
index 1de1e89e..6dac12a1 100644
--- a/chacha.cpp
+++ b/chacha.cpp
@@ -1,6 +1,7 @@
// chacha.cpp - written and placed in the public domain by Jeffrey Walton.
-// Based on Wei Dai's Salsa20 and Bernstein's reference ChaCha
-// family implementation at http://cr.yp.to/chacha.html.
+// Based on Wei Dai's Salsa20, Botan's SSE2 implementation,
+// and Bernstein's reference ChaCha family implementation at
+// http://cr.yp.to/chacha.html.
#include "pch.h"
#include "config.h"
@@ -29,6 +30,24 @@ extern void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* inpu
a += b; d ^= a; d = rotlConstant<8,word32>(d); \
c += d; b ^= c; b = rotlConstant<7,word32>(b);
+#define CHACHA_OUTPUT(x){\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x1 + m_state[1]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x2 + m_state[2]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x3 + m_state[3]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x5 + m_state[5]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x6 + m_state[6]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x7 + m_state[7]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x9 + m_state[9]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x10 + m_state[10]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x11 + m_state[11]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x13 + m_state[13]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x14 + m_state[14]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x15 + m_state[15]);}
+
#if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING)
void ChaCha_TestInstantiations()
{
@@ -92,8 +111,8 @@ void ChaCha_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, s
void ChaCha_Policy::SeekToIteration(lword iterationCount)
{
- m_state[13] = (word32)iterationCount;
- m_state[12] = (word32)SafeRightShift<32>(iterationCount);
+ m_state[12] = (word32)iterationCount; // low word
+ m_state[13] = (word32)SafeRightShift<32>(iterationCount);
}
unsigned int ChaCha_Policy::GetAlignment() const
@@ -102,6 +121,10 @@ unsigned int ChaCha_Policy::GetAlignment() const
if (HasSSE2())
return 16;
else
+#elif (CRYPTOPP_POWER8_AVAILABLE)
+ if (HasPower8())
+ return 16;
+ else
#endif
return GetAlignmentOf<word32>();
}
@@ -126,117 +149,107 @@ unsigned int ChaCha_Policy::GetOptimalBlockSize() const
return BYTES_PER_ITERATION;
}
+bool ChaCha_Policy::MultiBlockSafe() const
+{
+ const word32 c = m_state[12];
+ return 0xffffffff - c > 4;
+}
+
// OperateKeystream always produces a key stream. The key stream is written
// to output. Optionally a message may be supplied to xor with the key stream.
// The message is input, and output = output ^ input.
void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
byte *output, const byte *input, size_t iterationCount)
{
-#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
- if (HasSSE2())
+ do
{
- while (iterationCount >= 4)
+#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
+ if (HasSSE2())
{
- const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
- ChaCha_OperateKeystream_SSE2(m_state, xorInput ? input : NULLPTR, output, m_rounds);
-
- m_state[12] += 4;
- if (m_state[12] < 4)
- m_state[13]++;
-
- input += (!!xorInput)*4*BYTES_PER_ITERATION;
- output += 4*BYTES_PER_ITERATION;
- iterationCount -= 4;
+ while (iterationCount >= 4 && MultiBlockSafe())
+ {
+ const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
+ ChaCha_OperateKeystream_SSE2(m_state, xorInput ? input : NULLPTR, output, m_rounds);
+
+ m_state[12] += 4;
+ if (m_state[12] < 4)
+ m_state[13]++;
+
+ input += (!!xorInput)*4*BYTES_PER_ITERATION;
+ output += 4*BYTES_PER_ITERATION;
+ iterationCount -= 4;
+ }
}
- }
#endif
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
- if (HasNEON())
- {
- while (iterationCount >= 4)
+ if (HasNEON())
{
- const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
- ChaCha_OperateKeystream_NEON(m_state, xorInput ? input : NULLPTR, output, m_rounds);
-
- m_state[12] += 4;
- if (m_state[12] < 4)
- m_state[13]++;
-
- input += (!!xorInput)*4*BYTES_PER_ITERATION;
- output += 4*BYTES_PER_ITERATION;
- iterationCount -= 4;
+ while (iterationCount >= 4 && MultiBlockSafe())
+ {
+ const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
+ ChaCha_OperateKeystream_NEON(m_state, xorInput ? input : NULLPTR, output, m_rounds);
+
+ m_state[12] += 4;
+ if (m_state[12] < 4)
+ m_state[13]++;
+
+ input += (!!xorInput)*4*BYTES_PER_ITERATION;
+ output += 4*BYTES_PER_ITERATION;
+ iterationCount -= 4;
+ }
}
- }
#endif
#if (CRYPTOPP_POWER8_AVAILABLE)
- if (HasPower8())
- {
- while (iterationCount >= 4)
+ if (HasPower8())
{
- const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
- ChaCha_OperateKeystream_POWER8(m_state, xorInput ? input : NULLPTR, output, m_rounds);
-
- m_state[12] += 4;
- if (m_state[12] < 4)
- m_state[13]++;
-
- input += (!!xorInput)*4*BYTES_PER_ITERATION;
- output += 4*BYTES_PER_ITERATION;
- iterationCount -= 4;
+ while (iterationCount >= 4 && MultiBlockSafe())
+ {
+ const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
+ ChaCha_OperateKeystream_POWER8(m_state, xorInput ? input : NULLPTR, output, m_rounds);
+
+ m_state[12] += 4;
+ if (m_state[12] < 4)
+ m_state[13]++;
+
+ input += (!!xorInput)*4*BYTES_PER_ITERATION;
+ output += 4*BYTES_PER_ITERATION;
+ iterationCount -= 4;
+ }
}
- }
#endif
- while (iterationCount--)
- {
- word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+ if (iterationCount)
+ {
+ word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
- x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
- x4 = m_state[4]; x5 = m_state[5]; x6 = m_state[6]; x7 = m_state[7];
- x8 = m_state[8]; x9 = m_state[9]; x10 = m_state[10]; x11 = m_state[11];
- x12 = m_state[12]; x13 = m_state[13]; x14 = m_state[14]; x15 = m_state[15];
+ x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
+ x4 = m_state[4]; x5 = m_state[5]; x6 = m_state[6]; x7 = m_state[7];
+ x8 = m_state[8]; x9 = m_state[9]; x10 = m_state[10]; x11 = m_state[11];
+ x12 = m_state[12]; x13 = m_state[13]; x14 = m_state[14]; x15 = m_state[15];
- for (int i = static_cast<int>(m_rounds); i > 0; i -= 2)
- {
- CHACHA_QUARTER_ROUND(x0, x4, x8, x12);
- CHACHA_QUARTER_ROUND(x1, x5, x9, x13);
- CHACHA_QUARTER_ROUND(x2, x6, x10, x14);
- CHACHA_QUARTER_ROUND(x3, x7, x11, x15);
-
- CHACHA_QUARTER_ROUND(x0, x5, x10, x15);
- CHACHA_QUARTER_ROUND(x1, x6, x11, x12);
- CHACHA_QUARTER_ROUND(x2, x7, x8, x13);
- CHACHA_QUARTER_ROUND(x3, x4, x9, x14);
- }
+ for (int i = static_cast<int>(m_rounds); i > 0; i -= 2)
+ {
+ CHACHA_QUARTER_ROUND(x0, x4, x8, x12);
+ CHACHA_QUARTER_ROUND(x1, x5, x9, x13);
+ CHACHA_QUARTER_ROUND(x2, x6, x10, x14);
+ CHACHA_QUARTER_ROUND(x3, x7, x11, x15);
-#ifndef CRYPTOPP_DOXYGEN_PROCESSING
- #define CHACHA_OUTPUT(x){\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x1 + m_state[1]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x2 + m_state[2]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x3 + m_state[3]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x5 + m_state[5]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x6 + m_state[6]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x7 + m_state[7]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x9 + m_state[9]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x10 + m_state[10]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x11 + m_state[11]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x13 + m_state[13]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x14 + m_state[14]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x15 + m_state[15]);}
-
- CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(CHACHA_OUTPUT, BYTES_PER_ITERATION);
- #undef CHACHA_OUTPUT
-#endif
+ CHACHA_QUARTER_ROUND(x0, x5, x10, x15);
+ CHACHA_QUARTER_ROUND(x1, x6, x11, x12);
+ CHACHA_QUARTER_ROUND(x2, x7, x8, x13);
+ CHACHA_QUARTER_ROUND(x3, x4, x9, x14);
+ }
+
+ CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(CHACHA_OUTPUT, BYTES_PER_ITERATION);
+
+ if (++m_state[12] == 0)
+ m_state[13]++;
+ }
- if (++m_state[12] == 0)
- m_state[13]++;
- }
+ // We may re-enter a SIMD keystream operation from here.
+ } while (iterationCount--);
}
NAMESPACE_END