summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Niels Möller <nisse@lysator.liu.se> 2022-01-25 20:06:14 +0100
committer: Niels Möller <nisse@lysator.liu.se> 2022-01-25 20:06:14 +0100
commit: 2c0d307d6126c8ff5a6e1e1a2e533d7d492d6ed0 (patch)
tree: 6471ca142a8d22c323999552ffd0c491179b1020
parent: b7268727a11bce0a350345c2671493d2ddd28b45 (diff)
download: nettle-poly1305-radix32.tar.gz
Complete rewrite of C implementation of poly1305. [poly1305-radix32]

Radix 32 rather than radix 26, and use that certain key bits are always zero.

* poly1305-internal.c (_nettle_poly1305_set_key): Rewritten.
(_nettle_poly1305_block): Rewritten.
(_nettle_poly1305_digest): Rewritten.
* poly1305.h (struct poly1305_ctx): Rearrange internal unions, but with size and alignment unchanged.
-rw-r--r--  ChangeLog            8
-rw-r--r--  poly1305-internal.c  203
-rw-r--r--  poly1305.h           11
3 files changed, 103 insertions, 119 deletions
diff --git a/ChangeLog b/ChangeLog
index dc1b999d..e3972f8e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,13 @@
2022-01-25 Niels Möller <nisse@lysator.liu.se>
+ Complete rewrite of C implementation of poly1305. Radix 32 rather
+ than radix 26, and use that certain key bits are always zero.
+ * poly1305-internal.c (_nettle_poly1305_set_key): Rewritten.
+ (_nettle_poly1305_block): Rewritten.
+ (_nettle_poly1305_digest): Rewritten.
+ * poly1305.h (struct poly1305_ctx): Rearrange internal unions, but
+ with size and alignment unchanged.
+
Chacha implementation for arm64, contributed by Mamone Tarsha.
* arm64/chacha-core-internal.asm: New file.
* arm64/chacha-2core.asm: New file.
diff --git a/poly1305-internal.c b/poly1305-internal.c
index 490fdf71..17ad7998 100644
--- a/poly1305-internal.c
+++ b/poly1305-internal.c
@@ -1,8 +1,7 @@
/* poly1305-internal.c
- Copyright: 2012-2013 Andrew M. (floodyberry)
Copyright: 2013 Nikos Mavrogiannopoulos
- Copyright: 2013 Niels Möller
+ Copyright: 2013, 2022 Niels Möller
This file is part of GNU Nettle.
@@ -31,30 +30,6 @@
not, see http://www.gnu.org/licenses/.
*/
-/* Based on https://github.com/floodyberry/poly1305-donna.
- * Modified for nettle by Nikos Mavrogiannopoulos and Niels Möller.
- * Original license notice:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
#if HAVE_CONFIG_H
#include "config.h"
#endif
@@ -67,44 +42,41 @@
#include "macros.h"
-#define mul32x32_64(a,b) ((uint64_t)(a) * (b))
+#define M32(a,b) ((uint64_t)(a) * (b))
#define r0 r.r32[0]
#define r1 r.r32[1]
#define r2 r.r32[2]
#define r3 r.r32[3]
-#define r4 r.r32[4]
+#define s0 r.r32[4]
#define s1 r.r32[5]
-#define s2 s32[0]
-#define s3 s32[1]
-#define s4 s32[2]
+#define s2 r.r32[6]
+#define s3 r.r32[7]
#define h0 h.h32[0]
#define h1 h.h32[1]
#define h2 h.h32[2]
#define h3 h.h32[3]
-#define h4 hh
+#define h4 h.h32[4]
void
_nettle_poly1305_set_key(struct poly1305_ctx *ctx, const uint8_t key[16])
{
- uint32_t t0,t1,t2,t3;
-
- t0 = LE_READ_UINT32(key);
- t1 = LE_READ_UINT32(key+4);
- t2 = LE_READ_UINT32(key+8);
- t3 = LE_READ_UINT32(key+12);
-
- ctx->r0 = t0 & 0x3ffffff; t0 >>= 26; t0 |= t1 << 6;
- ctx->r1 = t0 & 0x3ffff03; t1 >>= 20; t1 |= t2 << 12;
- ctx->r2 = t1 & 0x3ffc0ff; t2 >>= 14; t2 |= t3 << 18;
- ctx->r3 = t2 & 0x3f03fff; t3 >>= 8;
- ctx->r4 = t3 & 0x00fffff;
-
- ctx->s1 = ctx->r1 * 5;
- ctx->s2 = ctx->r2 * 5;
- ctx->s3 = ctx->r3 * 5;
- ctx->s4 = ctx->r4 * 5;
+ uint32_t t0, t1, t2, t3;
+ t0 = LE_READ_UINT32 (key);
+ t1 = LE_READ_UINT32 (key+4);
+ t2 = LE_READ_UINT32 (key+8);
+ t3 = LE_READ_UINT32 (key+12);
+
+ ctx->r0 = t0 & 0x0fffffff;
+ ctx->r1 = t1 & 0x0ffffffc;
+ ctx->r2 = t2 & 0x0ffffffc;
+ ctx->r3 = t3 & 0x0ffffffc;
+
+ ctx->s0 = 5*ctx->r0;
+ ctx->s1 = 5*(ctx->r1 >> 2);
+ ctx->s2 = 5*(ctx->r2 >> 2);
+ ctx->s3 = 5*(ctx->r3 >> 2);
ctx->h0 = 0;
ctx->h1 = 0;
@@ -114,82 +86,89 @@ _nettle_poly1305_set_key(struct poly1305_ctx *ctx, const uint8_t key[16])
}
void
-_nettle_poly1305_block (struct poly1305_ctx *ctx, const uint8_t *m, unsigned t4)
+_nettle_poly1305_block (struct poly1305_ctx *ctx, const uint8_t *m, unsigned m128)
{
- uint32_t t0,t1,t2,t3;
- uint32_t b;
- uint64_t t[5];
- uint64_t c;
-
- t0 = LE_READ_UINT32(m);
- t1 = LE_READ_UINT32(m+4);
- t2 = LE_READ_UINT32(m+8);
- t3 = LE_READ_UINT32(m+12);
-
- ctx->h0 += t0 & 0x3ffffff;
- ctx->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff;
- ctx->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff;
- ctx->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff;
- ctx->h4 += (t3 >> 8) | ((uint32_t) t4 << 24);
-
- /* poly1305_donna_mul: */
- t[0] = mul32x32_64(ctx->h0,ctx->r0) + mul32x32_64(ctx->h1,ctx->s4) + mul32x32_64(ctx->h2,ctx->s3) + mul32x32_64(ctx->h3,ctx->s2) + mul32x32_64(ctx->h4,ctx->s1);
- t[1] = mul32x32_64(ctx->h0,ctx->r1) + mul32x32_64(ctx->h1,ctx->r0) + mul32x32_64(ctx->h2,ctx->s4) + mul32x32_64(ctx->h3,ctx->s3) + mul32x32_64(ctx->h4,ctx->s2);
- t[2] = mul32x32_64(ctx->h0,ctx->r2) + mul32x32_64(ctx->h1,ctx->r1) + mul32x32_64(ctx->h2,ctx->r0) + mul32x32_64(ctx->h3,ctx->s4) + mul32x32_64(ctx->h4,ctx->s3);
- t[3] = mul32x32_64(ctx->h0,ctx->r3) + mul32x32_64(ctx->h1,ctx->r2) + mul32x32_64(ctx->h2,ctx->r1) + mul32x32_64(ctx->h3,ctx->r0) + mul32x32_64(ctx->h4,ctx->s4);
- t[4] = mul32x32_64(ctx->h0,ctx->r4) + mul32x32_64(ctx->h1,ctx->r3) + mul32x32_64(ctx->h2,ctx->r2) + mul32x32_64(ctx->h3,ctx->r1) + mul32x32_64(ctx->h4,ctx->r0);
-
- ctx->h0 = (uint32_t)t[0] & 0x3ffffff; c = (t[0] >> 26);
- t[1] += c; ctx->h1 = (uint32_t)t[1] & 0x3ffffff; b = (uint32_t)(t[1] >> 26);
- t[2] += b; ctx->h2 = (uint32_t)t[2] & 0x3ffffff; b = (uint32_t)(t[2] >> 26);
- t[3] += b; ctx->h3 = (uint32_t)t[3] & 0x3ffffff; b = (uint32_t)(t[3] >> 26);
- t[4] += b; ctx->h4 = (uint32_t)t[4] & 0x3ffffff; b = (uint32_t)(t[4] >> 26);
- ctx->h0 += b * 5;
+ uint32_t t0, t1, t2, t3, t4;
+ uint64_t s, f0, f1, f2, f3;
+
+ /* Add in message block */
+ t0 = ctx->h0 + LE_READ_UINT32(m);
+ s = (uint64_t) ctx->h1 + (t0 < ctx->h0) + LE_READ_UINT32(m+4);
+ t1 = s;
+ s = ctx->h2 + (s >> 32) + LE_READ_UINT32(m+8);
+ t2 = s;
+ s = ctx->h3 + (s >> 32) + LE_READ_UINT32(m+12);
+ t3 = s;
+ t4 = ctx->h4 + (s >> 32) + m128;
+
+ /* Key constants are bounded by rk < 2^28, sk < 5*2^26, therefore
+ all the fk sums fit in 64 bits without overflow, with at least
+ one bit margin. */
+ f0 = M32(t0, ctx->r0) + M32(t1, ctx->s3) + M32(t2, ctx->s2) + M32(t3, ctx->s1)
+ + M32(t4 >> 2, ctx->s0);
+ f1 = M32(t0, ctx->r1) + M32(t1, ctx->r0) + M32(t2, ctx->s3) + M32(t3, ctx->s2)
+ + M32(t4, ctx->s1);
+ f2 = M32(t0, ctx->r2) + M32(t1, ctx->r1) + M32(t2, ctx->r0) + M32(t3, ctx->s3)
+ + M32(t4, ctx->s2);
+ f3 = M32(t0, ctx->r3) + M32(t1, ctx->r2) + M32(t2, ctx->r1) + M32(t3, ctx->r0)
+ + M32(t4, ctx->s3) + ((uint64_t)((t4 & 3)*ctx->r0) << 32);
+
+ ctx->h0 = f0;
+ f1 += f0 >> 32;
+ ctx->h1 = f1;
+ f2 += f1 >> 32;
+ ctx->h2 = f2;
+ f3 += f2 >> 32;
+ ctx->h3 = f3;
+ ctx->h4 = f3 >> 32;
}
/* Adds digest to the nonce */
void
_nettle_poly1305_digest (struct poly1305_ctx *ctx, union nettle_block16 *s)
{
- uint32_t b, nb;
- uint64_t f0,f1,f2,f3;
- uint32_t g0,g1,g2,g3,g4;
-
- b = ctx->h0 >> 26; ctx->h0 = ctx->h0 & 0x3ffffff;
- ctx->h1 += b; b = ctx->h1 >> 26; ctx->h1 = ctx->h1 & 0x3ffffff;
- ctx->h2 += b; b = ctx->h2 >> 26; ctx->h2 = ctx->h2 & 0x3ffffff;
- ctx->h3 += b; b = ctx->h3 >> 26; ctx->h3 = ctx->h3 & 0x3ffffff;
- ctx->h4 += b; b = ctx->h4 >> 26; ctx->h4 = ctx->h4 & 0x3ffffff;
- ctx->h0 += b * 5; b = ctx->h0 >> 26; ctx->h0 = ctx->h0 & 0x3ffffff;
- ctx->h1 += b;
-
- g0 = ctx->h0 + 5; b = g0 >> 26; g0 &= 0x3ffffff;
- g1 = ctx->h1 + b; b = g1 >> 26; g1 &= 0x3ffffff;
- g2 = ctx->h2 + b; b = g2 >> 26; g2 &= 0x3ffffff;
- g3 = ctx->h3 + b; b = g3 >> 26; g3 &= 0x3ffffff;
- g4 = ctx->h4 + b - (1 << 26);
-
- b = (g4 >> 31) - 1;
- nb = ~b;
- ctx->h0 = (ctx->h0 & nb) | (g0 & b);
- ctx->h1 = (ctx->h1 & nb) | (g1 & b);
- ctx->h2 = (ctx->h2 & nb) | (g2 & b);
- ctx->h3 = (ctx->h3 & nb) | (g3 & b);
- ctx->h4 = (ctx->h4 & nb) | (g4 & b);
+ uint32_t t0, t1, t2, t3, t4, c0, c1, c2, c3, mask;
+ uint64_t f0, f1, f2;
+
+ t0 = ctx->h0;
+ t1 = ctx->h1;
+ t2 = ctx->h2;
+ t3 = ctx->h3;
+ t4 = ctx->h4;
+
+ /* Fold high part of t4 */
+ c0 = 5 * (t4 >> 2);
+ t4 &= 3;
+ t0 += c0; c1 = (t0 < c0);
+ t1 += c1; c2 = (t1 < c1);
+ t2 += c2; c3 = (t2 < c2);
+ t3 += c3;
+ t4 += (t3 < c3);
+
+ /* Compute resulting carries when adding 5. */
+ c1 = (t0 >= 0xfffffffb);
+ c2 = (t1 + c1 < c1);
+ c3 = (t2 + c2 < t2);
+ t4 += (t3 + c3 < t3);
+
+ /* Set if H >= 2^130 - 5 */
+ mask = - (t4 >> 2);
+
+ t0 += mask & 5;
+ t1 += mask & c1;
+ t2 += mask & c2;
+ t3 += mask & c3;
/* FIXME: Take advantage of s being aligned as an unsigned long. */
- f0 = ((ctx->h0 )|(ctx->h1<<26)) + (uint64_t)LE_READ_UINT32(s->b);
- f1 = ((ctx->h1>> 6)|(ctx->h2<<20)) + (uint64_t)LE_READ_UINT32(s->b+4);
- f2 = ((ctx->h2>>12)|(ctx->h3<<14)) + (uint64_t)LE_READ_UINT32(s->b+8);
- f3 = ((ctx->h3>>18)|(ctx->h4<< 8)) + (uint64_t)LE_READ_UINT32(s->b+12);
+ f0 = (uint64_t) t0 + LE_READ_UINT32(s->b);
+ f1 = t1 + (f0 >> 32) + LE_READ_UINT32(s->b+4);
+ f2 = t2 + (f1 >> 32) + LE_READ_UINT32(s->b+8);
+ t3 += (f2 >> 32) + LE_READ_UINT32(s->b+12);
LE_WRITE_UINT32(s->b, f0);
- f1 += (f0 >> 32);
LE_WRITE_UINT32(s->b+4, f1);
- f2 += (f1 >> 32);
LE_WRITE_UINT32(s->b+8, f2);
- f3 += (f2 >> 32);
- LE_WRITE_UINT32(s->b+12, f3);
+ LE_WRITE_UINT32(s->b+12, t3);
ctx->h0 = 0;
ctx->h1 = 0;
diff --git a/poly1305.h b/poly1305.h
index 99c63c8a..6c13a590 100644
--- a/poly1305.h
+++ b/poly1305.h
@@ -55,18 +55,15 @@ struct poly1305_ctx {
/* Key, 128-bit value and some cached multiples. */
union
{
- uint32_t r32[6];
- uint64_t r64[3];
+ uint32_t r32[8];
+ uint64_t r64[4];
} r;
- uint32_t s32[3];
/* State, represented as words of 26, 32 or 64 bits, depending on
implementation. */
- /* High bits first, to maintain alignment. */
- uint32_t hh;
union
{
- uint32_t h32[4];
- uint64_t h64[2];
+ uint32_t h32[6];
+ uint64_t h64[3];
} h;
};