summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- donna.h 12
-rw-r--r-- donna_sse.cpp 16
2 files changed, 18 insertions, 10 deletions
diff --git a/donna.h b/donna.h
index e80b2666..2c3d4722 100644
--- a/donna.h
+++ b/donna.h
@@ -54,15 +54,23 @@ int curve25519(byte sharedKey[32], const byte secretKey[32], const byte othersKe
# define CRYPTOPP_CURVE25519_32BIT 1
#endif
-// Benchmarking on a modern 64-bit Core i5-6400 shows SSE2 on Linux
+// Benchmarking on a modern 64-bit Core i5-6400 @2.7 GHz shows SSE2 on Linux
// is not profitable. Here are the numbers in milliseconds/operation:
//
// * Langley, C++, 0.050
// * Moon, C++: 0.040
// * Moon, SSE2: 0.061
// * Moon, native: 0.045
+//
+// However, a modern 64-bit Core i5-3200 @2.3 GHz shows SSE2 is profitable
+// for MS compilers. Here are the numbers in milliseconds/operation:
+//
+// * x86, no SSE2, 0.294
+// * x86, SSE2, 0.097
+// * x64, no SSE2, 0.081
+// * x64, SSE2, 0.071
-#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE) && 0
+#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE) && defined(_MSC_VER)
# define CRYPTOPP_CURVE25519_SSE2 1
#endif
diff --git a/donna_sse.cpp b/donna_sse.cpp
index 2f6a68ac..3ab13076 100644
--- a/donna_sse.cpp
+++ b/donna_sse.cpp
@@ -214,7 +214,7 @@ curve25519_contract(byte out[32], const bignum25519 in) {
*/
inline void
curve25519_swap_conditional(bignum25519 a, bignum25519 b, word32 iswap) {
- const word32 swap = (word32)(-(int32_t)iswap);
+ const word32 swap = (word32)(-(sword32)iswap);
xmmi a0,a1,a2,b0,b1,b2,x0,x1,x2;
xmmi mask = _mm_cvtsi32_si128(swap);
mask = _mm_shuffle_epi32(mask, 0);
@@ -1114,16 +1114,16 @@ int curve25519_SSE2(byte sharedKey[32], const byte secretKey[32], const byte oth
packed32bignum25519 qx, qz, pqz, pqx;
packed64bignum25519 nq, sq, sqscalar, prime, primex, primez, nqpq;
bignum25519mulprecomp preq;
- size_t bit, lastbit;
+ size_t i=0, bit=0, lastbit=0;
curve25519_expand(nqpqx, othersKey);
curve25519_mul_precompute(&preq, nqpqx);
/* do bits 254..3 */
- for (int i = 254, lastbit = 0; i >= 3; i--) {
+ for (i = 254, lastbit=0; i >= 3; i--) {
bit = (e[i/8] >> (i & 7)) & 1;
- curve25519_swap_conditional(nqx, nqpqx, bit ^ lastbit);
- curve25519_swap_conditional(nqz, nqpqz, bit ^ lastbit);
+ curve25519_swap_conditional(nqx, nqpqx, (word32)(bit ^ lastbit));
+ curve25519_swap_conditional(nqz, nqpqz, (word32)(bit ^ lastbit));
lastbit = bit;
curve25519_tangle32(qx, nqx, nqpqx); /* qx = [nqx,nqpqx] */
@@ -1149,11 +1149,11 @@ int curve25519_SSE2(byte sharedKey[32], const byte secretKey[32], const byte oth
/* it's possible to get rid of this swap with the swap in the above loop
at the bottom instead of the top, but compilers seem to optimize better this way */
- curve25519_swap_conditional(nqx, nqpqx, bit);
- curve25519_swap_conditional(nqz, nqpqz, bit);
+ curve25519_swap_conditional(nqx, nqpqx, (word32)bit);
+ curve25519_swap_conditional(nqz, nqpqz, (word32)bit);
/* do bits 2..0 */
- for (size_t i = 0; i < 3; i++) {
+ for (i = 0; i < 3; i++) {
curve25519_compute_nq(nq, nqx, nqz);
curve25519_square_packed64(sq, nq); /* sq = nq^2 */
curve25519_121665_packed64(sqscalar, sq); /* sqscalar = sq * [121666,121665] */