diff options
author | Niels Möller <nisse@lysator.liu.se> | 2022-10-07 20:37:58 +0200 |
---|---|---|
committer | Niels Möller <nisse@lysator.liu.se> | 2022-11-28 19:30:04 +0100 |
commit | 39ed186b89bae0db3fe8a443b09ef8b493c483b3 (patch) | |
tree | 42b537c84f52f7823826779ccc22f1aa659e4bbe | |
parent | 3fbce8821d1d6c1c7a2a25c8e5ad9b186125bcdb (diff) | |
download | nettle-39ed186b89bae0db3fe8a443b09ef8b493c483b3.tar.gz |
Reimplement modular inversion.
New implementation based on the algorithm by Daniel J. Bernstein and
Bo-Yin Yang, see https://eprint.iacr.org/2019/266.pdf.
-rw-r--r-- | ChangeLog | 22 | ||||
-rw-r--r-- | curve25519-eh-to-x.c | 2 | ||||
-rw-r--r-- | curve25519-mul-g.c | 5 | ||||
-rw-r--r-- | curve25519-mul.c | 2 | ||||
-rw-r--r-- | curve448-eh-to-x.c | 2 | ||||
-rw-r--r-- | curve448-mul-g.c | 2 | ||||
-rw-r--r-- | curve448-mul.c | 2 | ||||
-rw-r--r-- | ecc-curve25519.c | 35 | ||||
-rw-r--r-- | ecc-curve448.c | 26 | ||||
-rw-r--r-- | ecc-gost-gc256b.c | 8 | ||||
-rw-r--r-- | ecc-gost-gc512a.c | 8 | ||||
-rw-r--r-- | ecc-internal.h | 15 | ||||
-rw-r--r-- | ecc-mod-inv.c | 355 | ||||
-rw-r--r-- | ecc-secp192r1.c | 8 | ||||
-rw-r--r-- | ecc-secp224r1.c | 40 | ||||
-rw-r--r-- | ecc-secp256r1.c | 73 | ||||
-rw-r--r-- | ecc-secp384r1.c | 53 | ||||
-rw-r--r-- | ecc-secp521r1.c | 64 | ||||
-rw-r--r-- | eccdata.c | 38 | ||||
-rw-r--r-- | testsuite/ecc-modinv-test.c | 13 |
20 files changed, 420 insertions, 353 deletions
@@ -1,3 +1,25 @@ +2022-10-07 Niels Möller <nisse@lysator.liu.se> + + * ecc-mod-inv.c: Implement Bernstein-Yang algorithm. It + outperforms older inversion code, except except secp192r1 mod p + inversion. + * eccdata.c (output_modulo): Output iteration count for new + modular inversion algorithm, and needed power of two for redc + inversion. + * ecc-curve25519.c (ecc_curve25519_inv): Deleted. + * ecc-curve448.c (ecc_curve448_inv): Deleted. + * ecc-secp224r1.c (ecc_secp224r1_inv): Deleted. + * ecc-secp256r1.c (ecc_secp256r1_inv): Deleted. + * ecc-secp384r1.c (ecc_secp384r1_inv): Deleted. + * ecc-secp521r1.c (ecc_secp521r1_inv): Deleted. + +2022-10-03 Niels Möller <nisse@lysator.liu.se> + + * ecc-internal.h (struct ecc_modulo): Delete (m+1)/2 constant, and + update all curve structs. + * eccdata.c (output_curve): Delete corresponding code to generate + ecc_pp1h and ecc_qp1h. + 2022-11-09 Niels Möller <nisse@lysator.liu.se> From Mamone Tarsha: diff --git a/curve25519-eh-to-x.c b/curve25519-eh-to-x.c index d90defda..8b6e8d1a 100644 --- a/curve25519-eh-to-x.c +++ b/curve25519-eh-to-x.c @@ -64,7 +64,7 @@ curve25519_eh_to_x (mp_limb_t *xp, const mp_limb_t *p, x = 0, and we should be fine, since ecc_mod_inv for ecc->p returns 0 in this case. */ ecc_mod_sub (&ecc->p, t0, wp, vp); - /* Needs a total of 6*size storage. */ + /* Needs a total of 2*size + scratch for inversion. */ ecc->p.invert (&ecc->p, t1, t0, tp); ecc_mod_add (&ecc->p, t0, wp, vp); diff --git a/curve25519-mul-g.c b/curve25519-mul-g.c index 000e0987..3f12af73 100644 --- a/curve25519-mul-g.c +++ b/curve25519-mul-g.c @@ -33,6 +33,7 @@ # include "config.h" #endif +#include <assert.h> #include <string.h> #include "curve25519.h" @@ -57,7 +58,9 @@ curve25519_mul_g (uint8_t *r, const uint8_t *n) t[0] &= ~7; t[CURVE25519_SIZE-1] = (t[CURVE25519_SIZE-1] & 0x3f) | 0x40; - itch = 4*ecc->p.size + ecc->mul_g_itch; + assert (ecc->mul_g_itch <= 2*ecc->p.size + ecc->p.invert_itch); + itch = 6*ecc->p.size + ecc->p.invert_itch; + scratch = gmp_alloc_limbs (itch); mpn_set_base256_le (x, ecc->p.size, t, CURVE25519_SIZE); diff --git a/curve25519-mul.c b/curve25519-mul.c index aff2b293..d8023fc3 100644 --- a/curve25519-mul.c +++ b/curve25519-mul.c @@ -48,7 +48,7 @@ curve25519_mul (uint8_t *q, const uint8_t *n, const uint8_t *p) mp_size_t itch; mp_limb_t *x; - itch = m->size + ECC_MUL_M_ITCH(m->size); + itch = m->size + ECC_MUL_M_ITCH(m->size, m->invert_itch); x = gmp_alloc_limbs (itch); /* Note that 255 % GMP_NUMB_BITS == 0 isn't supported, so x always diff --git a/curve448-eh-to-x.c b/curve448-eh-to-x.c index 3b9bf3ec..29c38b27 100644 --- a/curve448-eh-to-x.c +++ b/curve448-eh-to-x.c @@ -58,7 +58,7 @@ curve448_eh_to_x (mp_limb_t *xp, const mp_limb_t *p, mp_limb_t *scratch) x = v^2 / u^2 = (V/W)^2 / (U/W)^2 = (V/U)^2 */ - /* Needs a total of 5*size storage. */ + /* Needs a total of size + scratch for inversion. */ ecc->p.invert (&ecc->p, t0, up, tp); ecc_mod_mul (&ecc->p, t0, t0, vp, tp); ecc_mod_sqr_canonical (&ecc->p, xp, t0, tp); diff --git a/curve448-mul-g.c b/curve448-mul-g.c index a396595a..71af4547 100644 --- a/curve448-mul-g.c +++ b/curve448-mul-g.c @@ -58,7 +58,7 @@ curve448_mul_g (uint8_t *r, const uint8_t *n) t[0] &= ~3; t[CURVE448_SIZE-1] = (t[CURVE448_SIZE-1] & 0x7f) | 0x80; - itch = 5*ecc->p.size + ecc->mul_g_itch; + itch = 4*ecc->p.size + ecc->mul_g_itch; scratch = gmp_alloc_limbs (itch); mpn_set_base256_le (x, ecc->p.size, t, CURVE448_SIZE); diff --git a/curve448-mul.c b/curve448-mul.c index 2090c8a5..c8b13e90 100644 --- a/curve448-mul.c +++ b/curve448-mul.c @@ -50,7 +50,7 @@ curve448_mul (uint8_t *q, const uint8_t *n, const uint8_t *p) mp_size_t itch; mp_limb_t *x; - itch = m->size + ECC_MUL_M_ITCH(m->size); + itch = m->size + ECC_MUL_M_ITCH(m->size, m->invert_itch); x = gmp_alloc_limbs (itch); mpn_set_base256_le (x, m->size, p, CURVE448_SIZE); diff --git a/ecc-curve25519.c b/ecc-curve25519.c index 539bff22..bb419ad2 100644 --- a/ecc-curve25519.c +++ b/ecc-curve25519.c @@ -149,27 +149,6 @@ ecc_mod_pow_252m3 (const struct ecc_modulo *m, #undef tp } -/* Scratch as for ecc_mod_pow_252m3 above. */ -#define ECC_25519_INV_ITCH (4*ECC_LIMB_SIZE) - -static void -ecc_curve25519_inv (const struct ecc_modulo *p, - mp_limb_t *rp, const mp_limb_t *ap, - mp_limb_t *scratch) -{ - /* Addition chain - - p - 2 = 2^{255} - 21 - = 1 + 2 (1 + 4 (2^{252}-3)) - */ - ecc_mod_pow_252m3 (p, rp, ap, scratch); - ecc_mod_sqr (p, rp, rp, scratch); - ecc_mod_sqr (p, rp, rp, scratch); - ecc_mod_mul (p, rp, ap, rp, scratch); - ecc_mod_sqr (p, rp, rp, scratch); - ecc_mod_mul (p, rp, ap, rp, scratch); -} - static int ecc_curve25519_zero_p (const struct ecc_modulo *p, mp_limb_t *xp) { @@ -259,20 +238,22 @@ const struct ecc_curve _nettle_curve25519 = ECC_LIMB_SIZE, ECC_BMODP_SIZE, 0, - ECC_25519_INV_ITCH, + ECC_MOD_INV_ITCH(ECC_LIMB_SIZE), 0, ECC_25519_SQRT_RATIO_ITCH, + ECC_INVP_COUNT, + ECC_BINVP, ecc_p, ecc_Bmodp, ecc_Bmodp_shifted, ecc_Bm2p, NULL, - ecc_pp1h, + NULL, ecc_curve25519_modp, ecc_curve25519_modp, - ecc_curve25519_inv, + ecc_mod_inv, NULL, ecc_curve25519_sqrt_ratio, }, @@ -284,13 +265,15 @@ const struct ecc_curve _nettle_curve25519 = ECC_MOD_INV_ITCH (ECC_LIMB_SIZE), 0, 0, + ECC_INVQ_COUNT, + ECC_BINVQ, ecc_q, ecc_Bmodq, ecc_mBmodq_shifted, /* Use q - 2^{252} instead. */ ecc_Bm2q, NULL, - ecc_qp1h, + NULL, ecc_curve25519_modq, ecc_curve25519_modq, @@ -308,7 +291,7 @@ const struct ecc_curve _nettle_curve25519 = ECC_DUP_TH_ITCH (ECC_LIMB_SIZE), ECC_MUL_A_EH_ITCH (ECC_LIMB_SIZE), ECC_MUL_G_EH_ITCH (ECC_LIMB_SIZE), - ECC_EH_TO_A_ITCH (ECC_LIMB_SIZE, ECC_25519_INV_ITCH), + ECC_EH_TO_A_ITCH (ECC_LIMB_SIZE, ECC_MOD_INV_ITCH(ECC_LIMB_SIZE)), ecc_add_th, ecc_add_thh, diff --git a/ecc-curve448.c b/ecc-curve448.c index daef56cc..8d1258fe 100644 --- a/ecc-curve448.c +++ b/ecc-curve448.c @@ -142,18 +142,6 @@ ecc_mod_pow_446m224m1 (const struct ecc_modulo *p, #undef tp } -#define ECC_CURVE448_INV_ITCH (4*ECC_LIMB_SIZE) - -static void ecc_curve448_inv (const struct ecc_modulo *p, - mp_limb_t *rp, const mp_limb_t *ap, - mp_limb_t *tp) -{ - ecc_mod_pow_446m224m1 (p, rp, ap, tp);/* a^{2^446-2^222-1} */ - ecc_mod_sqr (p, rp, rp, tp); /* a^{2^447-2^223-2} */ - ecc_mod_sqr (p, rp, rp, tp); /* a^{2^448-2^224-4} */ - ecc_mod_mul (p, rp, ap, rp, tp); /* a^{2^448-2^224-3} */ -} - /* To guarantee that inputs to ecc_mod_zero_p are in the required range. */ #if ECC_LIMB_SIZE * GMP_NUMB_BITS != 448 #error Unsupported limb size @@ -213,20 +201,22 @@ const struct ecc_curve _nettle_curve448 = ECC_LIMB_SIZE, ECC_BMODP_SIZE, 0, - ECC_CURVE448_INV_ITCH, + ECC_MOD_INV_ITCH(ECC_LIMB_SIZE), 0, ECC_CURVE448_SQRT_RATIO_ITCH, + ECC_INVP_COUNT, + ECC_BINVP, ecc_p, ecc_Bmodp, ecc_Bmodp_shifted, ecc_Bm2p, NULL, - ecc_pp1h, + NULL, ecc_curve448_modp, ecc_curve448_modp, - ecc_curve448_inv, + ecc_mod_inv, NULL, ecc_curve448_sqrt_ratio, }, @@ -238,13 +228,15 @@ const struct ecc_curve _nettle_curve448 = ECC_MOD_INV_ITCH (ECC_LIMB_SIZE), 0, 0, + ECC_INVQ_COUNT, + ECC_BINVQ, ecc_q, ecc_Bmodq, ecc_Bmodq_shifted, ecc_Bm2q, NULL, - ecc_qp1h, + NULL, ecc_mod, ecc_mod, @@ -262,7 +254,7 @@ const struct ecc_curve _nettle_curve448 = ECC_DUP_EH_ITCH (ECC_LIMB_SIZE), ECC_MUL_A_EH_ITCH (ECC_LIMB_SIZE), ECC_MUL_G_EH_ITCH (ECC_LIMB_SIZE), - ECC_EH_TO_A_ITCH (ECC_LIMB_SIZE, ECC_CURVE448_INV_ITCH), + ECC_EH_TO_A_ITCH (ECC_LIMB_SIZE, ECC_MOD_INV_ITCH (ECC_LIMB_SIZE)), ecc_add_eh, ecc_add_ehh, diff --git a/ecc-gost-gc256b.c b/ecc-gost-gc256b.c index df9cbb58..10c7ae8c 100644 --- a/ecc-gost-gc256b.c +++ b/ecc-gost-gc256b.c @@ -67,14 +67,16 @@ const struct ecc_curve _nettle_gost_gc256b = ECC_MOD_INV_ITCH (ECC_LIMB_SIZE), 0, 0, + ECC_INVP_COUNT, + ECC_BINVP, ecc_p, ecc_Bmodp, ecc_Bmodp_shifted, ecc_Bm2p, ecc_redc_ppm1, + NULL, - ecc_pp1h, ecc_gost_gc256b_modp, ecc_gost_gc256b_modp, ecc_mod_inv, @@ -89,13 +91,15 @@ const struct ecc_curve _nettle_gost_gc256b = ECC_MOD_INV_ITCH (ECC_LIMB_SIZE), 0, 0, + ECC_INVQ_COUNT, + ECC_BINVQ, ecc_q, ecc_Bmodq, ecc_Bmodq_shifted, ecc_Bm2q, NULL, - ecc_qp1h, + NULL, ecc_gost_gc256b_modq, ecc_gost_gc256b_modq, diff --git a/ecc-gost-gc512a.c b/ecc-gost-gc512a.c index 3807b57e..0ba58d64 100644 --- a/ecc-gost-gc512a.c +++ b/ecc-gost-gc512a.c @@ -67,14 +67,16 @@ const struct ecc_curve _nettle_gost_gc512a = ECC_MOD_INV_ITCH (ECC_LIMB_SIZE), 0, 0, + ECC_INVP_COUNT, + ECC_BINVP, ecc_p, ecc_Bmodp, ecc_Bmodp_shifted, ecc_Bm2p, ecc_redc_ppm1, + NULL, - ecc_pp1h, ecc_gost_gc512a_modp, ecc_gost_gc512a_modp, ecc_mod_inv, @@ -89,13 +91,15 @@ const struct ecc_curve _nettle_gost_gc512a = ECC_MOD_INV_ITCH (ECC_LIMB_SIZE), 0, 0, + ECC_INVQ_COUNT, + ECC_BINVQ, ecc_q, ecc_Bmodq, ecc_Bmodq_shifted, ecc_Bm2q, NULL, - ecc_qp1h, + NULL, ecc_gost_gc512a_modq, ecc_gost_gc512a_modq, diff --git a/ecc-internal.h b/ecc-internal.h index be02de5f..cd4d5ea1 100644 --- a/ecc-internal.h +++ b/ecc-internal.h @@ -170,6 +170,13 @@ struct ecc_modulo unsigned short sqrt_itch; unsigned short sqrt_ratio_itch; + /* Needed iterations for the Bernstein-Yang inversion algorithm, for + input size bit_size, or bit_size + 1 in case the most significant + bit of the most significant modulo limb is zero. */ + unsigned short invert_count; + /* m^{-1} mod 2^{GMP_NUMB_BITS} */ + mp_limb_t binv; + const mp_limb_t *m; /* B^size mod m. Expected to have at least 32 leading zeros (equality for secp_256r1). */ @@ -184,8 +191,8 @@ struct ecc_modulo /* m +/- 1, for redc, excluding redc_size low limbs. */ const mp_limb_t *redc_mpm1; - /* (m+1)/2 */ - const mp_limb_t *mp1h; + + const mp_limb_t *invert_power; ecc_mod_func *mod; ecc_mod_func *reduce; @@ -482,7 +489,7 @@ curve448_eh_to_x (mp_limb_t *xp, const mp_limb_t *p, mp_limb_t *scratch); /* Current scratch needs: */ -#define ECC_MOD_INV_ITCH(size) (3*(size)) +#define ECC_MOD_INV_ITCH(size) (5*(size)+4) #define ECC_J_TO_A_ITCH(size, inv) ((size)+(inv)) #define ECC_EH_TO_A_ITCH(size, inv) ((size)+(inv)) #define ECC_DUP_JJ_ITCH(size) (4*(size)) @@ -508,7 +515,7 @@ curve448_eh_to_x (mp_limb_t *xp, const mp_limb_t *p, #define ECC_MUL_A_EH_ITCH(size) \ (((3 << ECC_MUL_A_EH_WBITS) + 7) * (size)) #endif -#define ECC_MUL_M_ITCH(size) (8*(size)) +#define ECC_MUL_M_ITCH(size, inv) (4*(size) + (inv)) #define ECC_ECDSA_SIGN_ITCH(size) (11*(size)) #define ECC_GOSTDSA_SIGN_ITCH(size) (11*(size)) #define ECC_MOD_RANDOM_ITCH(size) (size) diff --git a/ecc-mod-inv.c b/ecc-mod-inv.c index 254fb697..68d6c339 100644 --- a/ecc-mod-inv.c +++ b/ecc-mod-inv.c @@ -1,6 +1,6 @@ /* ecc-mod-inv.c - Copyright (C) 2013, 2014 Niels Möller + Copyright (C) 2013, 2014, 2022 Niels Möller This file is part of GNU Nettle. @@ -29,8 +29,6 @@ not, see http://www.gnu.org/licenses/. */ -/* Development of Nettle's ECC support was funded by the .SE Internet Fund. */ - #if HAVE_CONFIG_H # include "config.h" #endif @@ -39,7 +37,11 @@ #include "ecc-internal.h" -static void +#if GMP_LIMB_BITS != GMP_NUMB_BITS +#error Unsupported configuration +#endif + +static mp_limb_t cnd_neg (int cnd, mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n) { mp_limb_t cy = (cnd != 0); @@ -52,109 +54,282 @@ cnd_neg (int cnd, mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n) cy = r < cy; rp[i] = r; } + /* Return cnd except R = A = 0. */ + return cnd & (1-cy); } -/* Compute a^{-1} mod m, with running time depending only on the size. - Returns zero if a == 0 (mod m), to be consistent with a^{phi(m)-1}. - Also needs (m+1)/2, and m must be odd. +#define BITCNT_BITS (sizeof(mp_bitcnt_t) * CHAR_BIT) + +#define STEPS (GMP_NUMB_BITS - 2) + +struct matrix { + /* Matrix elements interpreted as signed two's complement. Absolute + value of elements is at most 2^STEPS. */ + mp_limb_t a[2][2]; +}; + +/* Conditionally set (a, b) <-- (b, -a) */ +#define CND_NEG_SWAP_LIMB(cnd, a, b) do {\ + mp_limb_t __cnd_sum = (a) + (b); \ + mp_limb_t __cnd_diff = (a) - (b); \ + (a) -= __cnd_diff & -(cnd); \ + (b) -= __cnd_sum & -(cnd); \ + } while (0) + +/* Perform K elementary reduction steps on (delta, f, g). Only the + least significant K bits of f, g matter. Note that delta has an + unsigned type, but is used as two's complement. */ +static mp_bitcnt_t +steps(struct matrix *M, unsigned k, mp_bitcnt_t delta, mp_limb_t f, mp_limb_t g) +{ + mp_limb_t a00, a01, a10, a11; + assert (f & 1); + + /* Identity matrix. */ + a00 = a11 = 1; + a01 = a10 = 0; + + /* Preserve invariant (f ; g) = 2^{-i} M (f_orig, g_orig) */ + for (; k-- > 0; delta++) + { + mp_limb_t odd = g & 1; + mp_limb_t swap = odd & ~(delta >> (BITCNT_BITS-1)); + + /* Swap; (f'; g') = (g; -f) = (0,1;-1,0) (g;f) */ + CND_NEG_SWAP_LIMB(swap, f, g); + CND_NEG_SWAP_LIMB(swap, a00, a10); + CND_NEG_SWAP_LIMB(swap, a01, a11); + + /* Conditional negation. */ + delta = (delta ^ - (mp_bitcnt_t) swap) + swap; + + /* Cancel low bit and shift. */ + g += f & -odd; + a10 += a00 & -odd; + a11 += a01 & -odd; + + g >>= 1; + a00 <<= 1; + a01 <<= 1; + } + M->a[0][0] = a00; M->a[0][1] = a01; + M->a[1][0] = a10; M->a[1][1] = a11; + return delta; +} + +/* Set R = (u * F + v * G), treating all numbers as two's complement. + No overlap allowed. */ +static mp_limb_t +add_add_mul (mp_ptr rp, const mp_limb_t *fp, const mp_limb_t *gp, mp_size_t n, + mp_limb_t u, mp_limb_t v) { + mp_limb_t f_sign = fp[n-1] >> (GMP_LIMB_BITS - 1); + mp_limb_t g_sign = gp[n-1] >> (GMP_LIMB_BITS - 1); + mp_limb_t u_sign = u >> (GMP_LIMB_BITS - 1); + mp_limb_t v_sign = v >> (GMP_LIMB_BITS - 1); + mp_limb_t hi = mpn_mul_1 (rp, fp, n, u) - ((-f_sign) & u); + hi -= ((-u_sign) & fp[n-1]) + mpn_cnd_sub_n (u_sign, rp + 1, rp + 1, fp, n-1); + + hi += mpn_addmul_1 (rp, gp, n, v) - ((-g_sign) & v); + hi -= ((-v_sign) & gp[n-1]) + mpn_cnd_sub_n (v_sign, rp + 1, rp + 1, gp, n-1); + + return hi; +} + +/* Update (f'; g') = M (f; g) / 2^{shift}, where all numbers are two's complement. */ +static void +matrix_vector_mul (const struct matrix *M, unsigned shift, + mp_size_t n, mp_limb_t *fp, mp_limb_t *gp, mp_limb_t *tp) +{ + mp_limb_t f_hi = add_add_mul (tp, fp, gp, n, M->a[0][0], M->a[0][1]); + mp_limb_t g_hi = add_add_mul (tp + n, fp, gp, n, M->a[1][0], M->a[1][1]); + mp_limb_t lo = mpn_rshift (fp, tp, n, shift); + assert (lo == 0); + fp[n-1] += (f_hi << (GMP_LIMB_BITS - shift)); + lo = mpn_rshift (gp, tp + n, n, shift); + assert (lo == 0); + gp[n-1] += (g_hi << (GMP_LIMB_BITS - shift)); +} + +/* Set R = (u * F + v * G) (mod M), treating u, v as two's complement, + but F, G, R unsigned. No overlap allowed. n limb inputs, n+1 limb + output. + + Input can be allowed in the range 0 <= F, G < min (2 M, B^n), + output always in the range 0 <= R < B M. +*/ +static void +add_add_mul_mod (mp_ptr rp, const mp_limb_t *fp, const mp_limb_t *gp, + const mp_limb_t *mp, mp_size_t n, + mp_limb_t u, mp_limb_t v) { + mp_limb_t u_sign = u >> (GMP_LIMB_BITS - 1); + mp_limb_t v_sign = v >> (GMP_LIMB_BITS - 1); + mp_limb_t r_sign; + + assert ((fp[n-1] >> 1) <= mp[n-1]); + assert ((gp[n-1] >> 1) <= mp[n-1]); + + rp[n] = mpn_mul_1 (rp, fp, n, u); + mpn_cnd_sub_n (u_sign, rp + 1, rp + 1, fp, n); + + rp[n] += mpn_addmul_1 (rp, gp, n, v); + mpn_cnd_sub_n (v_sign, rp + 1, rp + 1, gp, n); + + /* Row sums of the matrix have absolute value <= B/4. With inputs F, + G < 2 M, at this point we have + + |R| < B M/2. + + If R < 0, then R + B M is positive negative, adding B M makes the + result positive. + */ + r_sign = rp[n] >> (GMP_LIMB_BITS - 1); + r_sign -= mpn_cnd_add_n (r_sign, rp + 1, rp + 1, mp, n); + assert (r_sign == 0); + assert (rp[n] <= mp[n-1]); +} + +/* Input in range 0 <= T < B M, output in range 0 <= R < M. */ +static void +redc_1 (mp_limb_t *rp, mp_limb_t *tp, + const mp_limb_t *mp, mp_size_t n, mp_limb_t m_binv) +{ + mp_limb_t cy, hi; + /* If we knew that 2M < B^n, we could allow result value in the + range 0 <= R < 2M, and use mpn_add_mul_1 without any adjustment + step. */ + hi = mpn_submul_1 (tp, mp, n, m_binv * tp[0]); + assert (tp[0] == 0); + cy = tp[n] < hi; + tp[n] -= hi; + + cy -= mpn_cnd_add_n (cy, rp, tp + 1, mp, n); + assert (cy == 0); +} + +/* Update (u'; v') = M (u; v) B^{-1} (mod m), where u, v, m are + unsigned n limbs, M has signed elements, and B is the bignum + base. - Needs 3n limbs of scratch space. + Output are canonically reduced, 0 <= U, V < M, but + inputs are only required to be < 2 M. */ +static void +matrix_vector_mul_mod (const struct matrix *M, const mp_limb_t *mp, + mp_limb_t m_binv, + mp_size_t n, mp_limb_t *up, mp_limb_t *vp, mp_limb_t *tp) +{ + add_add_mul_mod (tp, up, vp, mp, n, M->a[0][0], M->a[0][1]); + add_add_mul_mod (tp + n + 1, up, vp, mp, n, M->a[1][0], M->a[1][1]); + + /* Reduce to n limbs, by multiplying with B^-1 (mod m) */ + redc_1 (up, tp, mp, n, m_binv); + redc_1 (vp, tp + n + 1, mp, n, m_binv); +} + +/* Input in range 0 <= T < 2M, output in range 0 <= R < 2M. */ +static void +redc_bits (mp_limb_t *rp, mp_limb_t *tp, const mp_limb_t *mp, + mp_size_t n, mp_limb_t m_binv, mp_bitcnt_t k) +{ + mp_limb_t hi; + + for (; k >= GMP_NUMB_BITS; k -= GMP_NUMB_BITS) + { + hi = mpn_addmul_1 (tp, mp, n, -m_binv * tp[0]); + mpn_copyi (tp, tp + 1, n - 1); + tp[n-1] = hi; + } + if (k > 0) + { + mp_limb_t mask = ((mp_limb_t) 2 << (k-1)) - 1; + mp_limb_t q = (-m_binv * tp[0]) & mask; + hi = mpn_addmul_1 (tp, mp, n, q); + mpn_rshift (rp, tp, n, k); + rp[n-1] |= hi << (GMP_NUMB_BITS - k); + } +} + +static int +one_p (const mp_limb_t *p, mp_size_t n) +{ + mp_limb_t diff = p[0] ^ 1; + mp_size_t i; + for (i = 1; i < n; i++) + diff |= p[i]; + + return diff == 0; +} -/* FIXME: Could use mpn_sec_invert (in GMP-6), but with a bit more - scratch need since it doesn't precompute (m+1)/2. */ +/* Compute a^{-1} mod m, with running time depending only on the size. + Returns zero if a == 0 (mod m), to be consistent with a^{phi(m)-1}. + Based on the algorithm in https://eprint.iacr.org/2019/266.pdf */ void ecc_mod_inv (const struct ecc_modulo *m, - mp_limb_t *vp, const mp_limb_t *in_ap, + mp_limb_t *up, const mp_limb_t *ap, mp_limb_t *scratch) { -#define ap scratch -#define bp (scratch + n) -#define up (scratch + 2*n) - mp_size_t n = m->size; - /* Avoid the mp_bitcnt_t type for compatibility with older GMP - versions. */ - unsigned i; + mp_bitcnt_t shift, delta, count; + mp_limb_t cy; + struct matrix M; + + /* Total scratch need 4*(n+1) for fp, gp, tp, n for v, total 5*n+4 */ +#define fp scratch +#define gp (scratch + n + 1) +#define vp (scratch + 2*n + 2) +#define tp (scratch + 3*n + 2) + +#define mp (m->m) + + /* Input should satisfy a < 2m, since that is what the iteration + count is based on. */ + assert ((ap[n-1] >> 1) <= mp[n-1]); - /* Maintain + mpn_copyi (fp, mp, n); fp[n] = 0; gp[n] = 0; - a = u * orig_a (mod m) - b = v * orig_a (mod m) + if (m->invert_power) + /* Multiply by suitable pre-computed power to get inverse in redc + representation. */ + ecc_mod_mul (m, gp, ap, m->invert_power, tp); + else { + mpn_copyi (gp, ap, n); - and b odd at all times. Initially, + shift = GMP_NUMB_BITS*((m->invert_count + STEPS - 1) / STEPS) - m->invert_count; - a = a_orig, u = 1 - b = m, v = 0 - */ + /* Premultiply a by 2^{- shift} mod m */ + redc_bits (gp, gp, mp, n, m->binv, shift); + } - assert (ap != vp); + /* Maintain invariant - up[0] = 1; - mpn_zero (up+1, n - 1); - mpn_copyi (bp, m->m, n); - mpn_zero (vp, n); - mpn_copyi (ap, in_ap, n); + a * u = 2^{-count} B^{ceil(count/STEPS)} f (mod m) + a * v = 2^{-count} B^{ceil(count/STEPS)} g (mod m) + */ + mpn_zero (up, n); + mpn_zero (vp+1, n-1); vp[0] = 1; - for (i = m->bit_size + GMP_NUMB_BITS * n; i-- > 0; ) + for (delta = 1, count = m->invert_count; count > STEPS; count -= STEPS) { - mp_limb_t odd, swap, cy; - - /* Always maintain b odd. The logic of the iteration is as - follows. For a, b: - - odd = a & 1 - a -= odd * b - if (underflow from a-b) - { - b += a, assigns old a - a = B^n-a - } - - a /= 2 - - For u, v: - - if (underflow from a - b) - swap u, v - u -= odd * v - if (underflow from u - v) - u += m - - u /= 2 - if (a one bit was shifted out) - u += (m+1)/2 - - As long as a > 0, the quantity - - (bitsize of a) + (bitsize of b) - - is reduced by at least one bit per iteration, hence after - (bit_size of orig_a) + (bit_size of m) - 1 iterations we - surely have a = 0. Then b = gcd(orig_a, m) and if b = 1 then - also v = orig_a^{-1} (mod m) - */ - - assert (bp[0] & 1); - odd = ap[0] & 1; - - swap = mpn_cnd_sub_n (odd, ap, ap, bp, n); - mpn_cnd_add_n (swap, bp, bp, ap, n); - cnd_neg (swap, ap, ap, n); - - mpn_cnd_swap (swap, up, vp, n); - cy = mpn_cnd_sub_n (odd, up, up, vp, n); - cy -= mpn_cnd_add_n (cy, up, up, m->m, n); - assert (cy == 0); - - cy = mpn_rshift (ap, ap, n, 1); - assert (cy == 0); - cy = mpn_rshift (up, up, n, 1); - cy = mpn_cnd_add_n (cy, up, up, m->mp1h, n); - assert (cy == 0); + delta = steps (&M, STEPS, delta, fp[0], gp[0]); + matrix_vector_mul (&M, STEPS, n+1, fp, gp, tp); + matrix_vector_mul_mod (&M, mp, m->binv, n, up, vp, tp); } - assert ( (ap[0] | ap[n-1]) == 0); -#undef ap -#undef bp -#undef up + delta = steps (&M, count, delta, fp[0], gp[0]); + matrix_vector_mul (&M, count, n+1, fp, gp, tp); + /* Only compute u, we don't need v. */ + add_add_mul_mod (tp, up, vp, mp, n, M.a[0][0], M.a[0][1]); + redc_1 (up, tp, mp, n, m->binv); + + assert (sec_zero_p (gp, n+1)); + + /* Now f = ±1 (if the inverse exists), and a * u = f (mod m) */ + cy = cnd_neg (fp[n] >> (GMP_LIMB_BITS - 1), up, up, n); + /* Make u non-negative */ + cy -= mpn_cnd_add_n (cy, up, up, mp, n); + + cnd_neg (fp[n] >> (GMP_LIMB_BITS - 1), fp, fp, n + 1); + cy = one_p (fp, n+1); + /* Set to zero if invert doesn't exist. */ + while (n > 0) + up[--n] &= - cy; } diff --git a/ecc-secp192r1.c b/ecc-secp192r1.c index 4a07bca3..00ff87b1 100644 --- a/ecc-secp192r1.c +++ b/ecc-secp192r1.c @@ -244,13 +244,15 @@ const struct ecc_curve _nettle_secp_192r1 = ECC_SECP192R1_INV_ITCH, ECC_SECP192R1_SQRT_ITCH, 0, + ECC_INVP_COUNT, + ECC_BINVP, ecc_p, ecc_Bmodp, ecc_Bmodp_shifted, ecc_Bm2p, ecc_redc_ppm1, - ecc_pp1h, + NULL, ecc_secp192r1_modp, ecc_secp192r1_modp, @@ -266,13 +268,15 @@ const struct ecc_curve _nettle_secp_192r1 = ECC_MOD_INV_ITCH (ECC_LIMB_SIZE), 0, 0, + ECC_INVQ_COUNT, + ECC_BINVQ, ecc_q, ecc_Bmodq, ecc_Bmodq_shifted, ecc_Bm2q, NULL, - ecc_qp1h, + NULL, ecc_mod, ecc_mod, diff --git a/ecc-secp224r1.c b/ecc-secp224r1.c index b2a335ec..341aee7b 100644 --- a/ecc-secp224r1.c +++ b/ecc-secp224r1.c @@ -111,32 +111,6 @@ ecc_mod_pow_127m1 (const struct ecc_modulo *m, #undef tp } -#define ECC_SECP224R1_INV_ITCH (4*ECC_LIMB_SIZE) - -static void -ecc_secp224r1_inv (const struct ecc_modulo *p, - mp_limb_t *rp, const mp_limb_t *ap, - mp_limb_t *scratch) -{ -#define a96m1 scratch -#define tp (scratch + ECC_LIMB_SIZE) - - /* Compute a^{p - 2}, with - - p-2 = 2^{224} - 2^{96} - 1 - = 2^{97}(2^{127} - 1) + 2^{96} - 1 - - This addition chain needs 97 squarings and one multiply in - addition to ecc_mod_pow_127m1, for a total of 223 squarings and - 13 multiplies. - */ - ecc_mod_pow_127m1 (p, rp, a96m1, ap, tp); - ecc_mod_pow_2k_mul (p, rp, rp, 97, a96m1, tp); /* a^{2^{224} - 2^{96} - 1 */ - -#undef a96m1 -#undef tp -} - #define ECC_SECP224R1_SQRT_ITCH (5*ECC_LIMB_SIZE) static int @@ -216,20 +190,22 @@ const struct ecc_curve _nettle_secp_224r1 = ECC_LIMB_SIZE, ECC_BMODP_SIZE, -ECC_REDC_SIZE, - ECC_SECP224R1_INV_ITCH, + ECC_MOD_INV_ITCH(ECC_LIMB_SIZE), ECC_SECP224R1_SQRT_ITCH, 0, + ECC_INVP_COUNT, + ECC_BINVP, ecc_p, ecc_Bmodp, ecc_Bmodp_shifted, ecc_Bm2p, ecc_redc_ppm1, - ecc_pp1h, + USE_REDC ? ecc_invp_redc_power : NULL, ecc_secp224r1_modp, USE_REDC ? ecc_secp224r1_redc : ecc_secp224r1_modp, - ecc_secp224r1_inv, + ecc_mod_inv, ecc_secp224r1_sqrt, NULL, }, @@ -241,13 +217,15 @@ const struct ecc_curve _nettle_secp_224r1 = ECC_MOD_INV_ITCH (ECC_LIMB_SIZE), 0, 0, + ECC_INVQ_COUNT, + ECC_BINVQ, ecc_q, ecc_Bmodq, ecc_Bmodq_shifted, ecc_Bm2q, NULL, - ecc_qp1h, + NULL, ecc_mod, ecc_mod, @@ -265,7 +243,7 @@ const struct ecc_curve _nettle_secp_224r1 = ECC_DUP_JJ_ITCH (ECC_LIMB_SIZE), ECC_MUL_A_ITCH (ECC_LIMB_SIZE), ECC_MUL_G_ITCH (ECC_LIMB_SIZE), - ECC_J_TO_A_ITCH(ECC_LIMB_SIZE, ECC_SECP224R1_INV_ITCH), + ECC_J_TO_A_ITCH(ECC_LIMB_SIZE, ECC_MOD_INV_ITCH (ECC_LIMB_SIZE)), ecc_add_jja, ecc_add_jjj, diff --git a/ecc-secp256r1.c b/ecc-secp256r1.c index 4848dfe3..8c852aaa 100644 --- a/ecc-secp256r1.c +++ b/ecc-secp256r1.c @@ -213,64 +213,6 @@ ecc_secp256r1_modq (const struct ecc_modulo *q, mp_limb_t *rp, mp_limb_t *xp) #error Unsupported parameters #endif -#define ECC_SECP256R1_INV_ITCH (4*ECC_LIMB_SIZE) - -static void -ecc_secp256r1_inv (const struct ecc_modulo *p, - mp_limb_t *rp, const mp_limb_t *ap, - mp_limb_t *scratch) -{ -#define a5m1 scratch -#define t0 (scratch + ECC_LIMB_SIZE) -#define a15m1 t0 -#define a32m1 a5m1 -#define tp (scratch + 2*ECC_LIMB_SIZE) -/* - Addition chain for p - 2 = 2^{256} - 2^{224} + 2^{192} + 2^{96} - 3 - - 2^5 - 1 = 1 + 2 (2^4 - 1) = 1 + 2 (2^2+1)(2 + 1) 4 S + 3 M - 2^{15} - 1 = (2^5 - 1) (1 + 2^5 (1 + 2^5) 10 S + 2 M - 2^{16} - 1 = 1 + 2 (2^{15} - 1) S + M - 2^{32} - 1 = (2^{16} + 1) (2^{16} - 1) 16 S + M - 2^{64} - 2^{32} + 1 = 2^{32} (2^{32} - 1) + 1 32 S + M - 2^{192} - 2^{160} + 2^{128} + 2^{32} - 1 - = 2^{128} (2^{64} - 2^{32} + 1) + 2^{32} - 1 128 S + M - 2^{224} - 2^{192} + 2^{160} + 2^{64} - 1 - = 2^{32} (...) + 2^{32} - 1 32 S + M - 2^{239} - 2^{207} + 2^{175} + 2^{79} - 1 - = 2^{15} (...) + 2^{15} - 1 15 S + M - 2^{254} - 2^{222} + 2^{190} + 2^{94} - 1 - = 2^{15} (...) + 2^{15} - 1 15 S + M - p - 2 = 2^2 (...) + 1 2 S M - --------------- - 255 S + 13 M - */ - ecc_mod_sqr (p, rp, ap, tp); /* a^2 */ - ecc_mod_mul (p, rp, rp, ap, tp); /* a^3 */ - ecc_mod_pow_2kp1 (p, t0, rp, 2, tp); /* a^{2^4 - 1} */ - ecc_mod_sqr (p, rp, t0, tp); /* a^{2^5 - 2} */ - ecc_mod_mul (p, a5m1, rp, ap, tp); /* a^{2^5 - 1}, a5m1 */ - - ecc_mod_pow_2kp1 (p, rp, a5m1, 5, tp); /* a^{2^{10} - 1, a5m1*/ - ecc_mod_pow_2k_mul (p, a15m1, rp, 5, a5m1, tp); /* a^{2^{15} - 1}, a5m1 a15m1 */ - ecc_mod_sqr (p, rp, a15m1, tp); /* a^{2^{16} - 2}, a15m1 */ - ecc_mod_mul (p, rp, rp, ap, tp); /* a^{2^{16} - 1}, a15m1 */ - ecc_mod_pow_2kp1 (p, a32m1, rp, 16, tp); /* a^{2^{32} - 1}, a15m1, a32m1 */ - - ecc_mod_pow_2k_mul (p, rp, a32m1, 32, ap, tp);/* a^{2^{64} - 2^{32} + 1 */ - ecc_mod_pow_2k_mul (p, rp, rp, 128, a32m1, tp); /* a^{2^{192} - 2^{160} + 2^{128} + 2^{32} - 1} */ - ecc_mod_pow_2k_mul (p, rp, rp, 32, a32m1, tp);/* a^{2^{224} - 2^{192} + 2^{160} + 2^{64} - 1} */ - ecc_mod_pow_2k_mul (p, rp, rp, 15, a15m1, tp);/* a^{2^{239} - 2^{207} + 2^{175} + 2^{79} - 1} */ - ecc_mod_pow_2k_mul (p, rp, rp, 15, a15m1, tp);/* a^{2^{254} - 2^{222} + 2^{190} + 2^{94} - 1} */ - ecc_mod_pow_2k_mul (p, rp, rp, 2, ap, tp); /* a^{2^{256} - 2^{224} + 2^{192} + 2^{96} - 3} */ - -#undef a5m1 -#undef t0 -#undef a15m1 -#undef a32m1 -#undef tp -} - /* To guarantee that inputs to ecc_mod_zero_p are in the required range. */ #if ECC_LIMB_SIZE * GMP_NUMB_BITS != 256 #error Unsupported limb size @@ -336,20 +278,22 @@ const struct ecc_curve _nettle_secp_256r1 = ECC_LIMB_SIZE, ECC_BMODP_SIZE, ECC_REDC_SIZE, - ECC_SECP256R1_INV_ITCH, + ECC_MOD_INV_ITCH (ECC_LIMB_SIZE), ECC_SECP256R1_SQRT_ITCH, 0, + ECC_INVP_COUNT, + ECC_BINVP, ecc_p, ecc_Bmodp, ecc_Bmodp_shifted, ecc_Bm2p, ecc_redc_ppm1, - ecc_pp1h, + USE_REDC ? ecc_invp_redc_power : NULL, ecc_secp256r1_modp, USE_REDC ? ecc_secp256r1_redc : ecc_secp256r1_modp, - ecc_secp256r1_inv, + ecc_mod_inv, ecc_secp256r1_sqrt, NULL, }, @@ -361,14 +305,15 @@ const struct ecc_curve _nettle_secp_256r1 = ECC_MOD_INV_ITCH (ECC_LIMB_SIZE), 0, 0, + ECC_INVQ_COUNT, + ECC_BINVQ, ecc_q, ecc_Bmodq, ecc_Bmodq_shifted, ecc_Bm2q, NULL, - ecc_qp1h, - + NULL, ecc_secp256r1_modq, ecc_secp256r1_modq, ecc_mod_inv, @@ -385,7 +330,7 @@ const struct ecc_curve _nettle_secp_256r1 = ECC_DUP_JJ_ITCH (ECC_LIMB_SIZE), ECC_MUL_A_ITCH (ECC_LIMB_SIZE), ECC_MUL_G_ITCH (ECC_LIMB_SIZE), - ECC_J_TO_A_ITCH(ECC_LIMB_SIZE, ECC_SECP256R1_INV_ITCH), + ECC_J_TO_A_ITCH(ECC_LIMB_SIZE, ECC_MOD_INV_ITCH (ECC_LIMB_SIZE)), ecc_add_jja, ecc_add_jjj, diff --git a/ecc-secp384r1.c b/ecc-secp384r1.c index abac5e6d..9c015297 100644 --- a/ecc-secp384r1.c +++ b/ecc-secp384r1.c @@ -210,45 +210,6 @@ ecc_mod_pow_288m32m1 (const struct ecc_modulo *m, #undef tp } -#define ECC_SECP384R1_INV_ITCH (6*ECC_LIMB_SIZE) - -static void -ecc_secp384r1_inv (const struct ecc_modulo *p, - mp_limb_t *rp, const mp_limb_t *ap, - mp_limb_t *scratch) -{ - /* - Addition chain for - - p - 2 = 2^{384} - 2^{128} - 2^{96} + 2^{32} - 3 - - Start with - - a^{2^{288} - 2^{32} - 1} - - and then use - - 2^{382} - 2^{126} - 2^{94} + 2^{30} - 1 - = 2^{94} (2^{288} - 2^{32} - 1) + 2^{30} - 1 - - 2^{384} - 2^{128} - 2^{96} + 2^{32} - 3 - = 2^2 (2^{382} - 2^{126} - 2^{94} + 2^{30} - 1) + 1 - - This addition chain needs 96 additional squarings and 2 - multiplies, for a total of 383 squarings and 14 multiplies. - */ - -#define a30m1 scratch -#define tp (scratch + ECC_LIMB_SIZE) - - ecc_mod_pow_288m32m1 (p, rp, a30m1, ap, tp); - ecc_mod_pow_2k_mul (p, rp, rp, 94, a30m1, tp); /* a^{2^{382} - 2^{126} - 2^{94} + 2^{30} - 1 */ - ecc_mod_pow_2k_mul (p, rp, rp, 2, ap, tp); - -#undef a30m1 -#undef tp -} - /* To guarantee that inputs to ecc_mod_zero_p are in the required range. */ #if ECC_LIMB_SIZE * GMP_NUMB_BITS != 384 #error Unsupported limb size @@ -307,20 +268,22 @@ const struct ecc_curve _nettle_secp_384r1 = ECC_LIMB_SIZE, ECC_BMODP_SIZE, ECC_REDC_SIZE, - ECC_SECP384R1_INV_ITCH, + ECC_MOD_INV_ITCH(ECC_LIMB_SIZE), ECC_SECP384R1_SQRT_ITCH, 0, + ECC_INVP_COUNT, + ECC_BINVP, ecc_p, ecc_Bmodp, ecc_Bmodp_shifted, ecc_Bm2p, ecc_redc_ppm1, - ecc_pp1h, + NULL, ecc_secp384r1_modp, ecc_secp384r1_modp, - ecc_secp384r1_inv, + ecc_mod_inv, ecc_secp384r1_sqrt, NULL, }, @@ -332,13 +295,15 @@ const struct ecc_curve _nettle_secp_384r1 = ECC_MOD_INV_ITCH (ECC_LIMB_SIZE), 0, 0, + ECC_INVQ_COUNT, + ECC_BINVQ, ecc_q, ecc_Bmodq, ecc_Bmodq_shifted, ecc_Bm2q, NULL, - ecc_qp1h, + NULL, ecc_mod, ecc_mod, @@ -356,7 +321,7 @@ const struct ecc_curve _nettle_secp_384r1 = ECC_DUP_JJ_ITCH (ECC_LIMB_SIZE), ECC_MUL_A_ITCH (ECC_LIMB_SIZE), ECC_MUL_G_ITCH (ECC_LIMB_SIZE), - ECC_J_TO_A_ITCH(ECC_LIMB_SIZE, ECC_SECP384R1_INV_ITCH), + ECC_J_TO_A_ITCH(ECC_LIMB_SIZE, ECC_MOD_INV_ITCH (ECC_LIMB_SIZE)), ecc_add_jja, ecc_add_jjj, diff --git a/ecc-secp521r1.c b/ecc-secp521r1.c index 8ab7b4bf..c47b7b81 100644 --- a/ecc-secp521r1.c +++ b/ecc-secp521r1.c @@ -75,53 +75,6 @@ ecc_secp521r1_modp (const struct ecc_modulo *m UNUSED, mp_limb_t *rp, mp_limb_t } #endif -#define ECC_SECP521R1_INV_ITCH (3*ECC_LIMB_SIZE) - -static void -ecc_secp521r1_inv (const struct ecc_modulo *p, - mp_limb_t *rp, const mp_limb_t *ap, - mp_limb_t *scratch) -{ -#define t0 scratch -#define tp (scratch + ECC_LIMB_SIZE) - - /* Addition chain for p - 2: - - 2^{521} - 3 - = 1 + 2^2(2^519 - 1) - = 1 + 2^2(1 + 2 (2^518 - 1) - = 1 + 2^2(1 + 2 (2^259 + 1) (1 + 2(2^258 - 1))) - = 1 + 2^2(1 + 2 (2^259 + 1) (1 + 2(2^129 + 1) (2^129 - 1))) - = 1 + 2^2(1 + 2 (2^259 + 1) (1 + 2(2^129 + 1) (1 + 2 (2^128 - 1)))) - - where - - 2^{128} - 1 = (2^64 + 1) (2^32+1) (2^16 + 1) (2^8 + 1) (2^4 + 1) (2^2 + 1) (2 + 1) - - This addition chain needs 520 squarings and 13 multiplies. - */ - - ecc_mod_sqr (p, rp, ap, tp); /* a^2 */ - ecc_mod_mul (p, rp, ap, rp, tp); /* a^3 = a^{2^2 - 1} */ - ecc_mod_pow_2kp1 (p, t0, rp, 2, tp); /* a^15 = a^{2^4 - 1} */ - ecc_mod_pow_2kp1 (p, rp, t0, 4, tp); /* a^{2^8 - 1} */ - ecc_mod_pow_2kp1 (p, t0, rp, 8, tp); /* a^{2^16 - 1} */ - ecc_mod_pow_2kp1 (p, rp, t0, 16, tp); /* a^{2^32 - 1} */ - ecc_mod_pow_2kp1 (p, t0, rp, 32, tp); /* a^{2^64 - 1} */ - ecc_mod_pow_2kp1 (p, rp, t0, 64, tp); /* a^{2^128 - 1} */ - ecc_mod_sqr (p, rp, rp, tp); /* a^{2^129 - 2} */ - ecc_mod_mul (p, rp, rp, ap, tp); /* a^{2^129 - 1} */ - ecc_mod_pow_2kp1 (p, t0, rp, 129, tp);/* a^{2^258 - 1} */ - ecc_mod_sqr (p, rp, t0, tp); /* a^{2^259 - 2} */ - ecc_mod_mul (p, rp, rp, ap, tp); /* a^{2^259 - 1} */ - ecc_mod_pow_2kp1 (p, t0, rp, 259, tp);/* a^{2^518 - 1} */ - ecc_mod_sqr (p, rp, t0, tp); /* a^{2^519 - 2} */ - ecc_mod_mul (p, rp, rp, ap, tp); /* a^{2^519 - 1} */ - ecc_mod_sqr (p, rp, rp, tp); /* a^{2^520 - 2} */ - ecc_mod_sqr (p, rp, rp, tp); /* a^{2^521 - 4} */ - ecc_mod_mul (p, rp, rp, ap, tp); /* a^{2^521 - 3} */ -} - #define ECC_SECP521R1_SQRT_ITCH (2*ECC_LIMB_SIZE) static int @@ -162,20 +115,22 @@ const struct ecc_curve _nettle_secp_521r1 = ECC_LIMB_SIZE, ECC_BMODP_SIZE, ECC_REDC_SIZE, - ECC_SECP521R1_INV_ITCH, + ECC_MOD_INV_ITCH(ECC_LIMB_SIZE), ECC_SECP521R1_SQRT_ITCH, 0, + ECC_INVP_COUNT, + ECC_BINVP, ecc_p, ecc_Bmodp, ecc_Bmodp_shifted, ecc_Bm2p, ecc_redc_ppm1, - ecc_pp1h, + NULL, ecc_secp521r1_modp, ecc_secp521r1_modp, - ecc_secp521r1_inv, + ecc_mod_inv, ecc_secp521r1_sqrt, NULL, }, @@ -187,13 +142,15 @@ const struct ecc_curve _nettle_secp_521r1 = ECC_MOD_INV_ITCH (ECC_LIMB_SIZE), 0, 0, + ECC_INVQ_COUNT, + ECC_BINVQ, ecc_q, ecc_Bmodq, ecc_Bmodq_shifted, ecc_Bm2q, NULL, - ecc_qp1h, + NULL, ecc_mod, ecc_mod, @@ -211,8 +168,11 @@ const struct ecc_curve _nettle_secp_521r1 = ECC_DUP_JJ_ITCH (ECC_LIMB_SIZE), ECC_MUL_A_ITCH (ECC_LIMB_SIZE), ECC_MUL_G_ITCH (ECC_LIMB_SIZE), +#if 0 ECC_J_TO_A_ITCH(ECC_LIMB_SIZE, ECC_SECP521R1_INV_ITCH), - +#else + ECC_J_TO_A_ITCH(ECC_LIMB_SIZE, ECC_MOD_INV_ITCH(ECC_LIMB_SIZE)), +#endif ecc_add_jja, ecc_add_jjj, ecc_dup_jj, @@ -1167,11 +1167,19 @@ string_toupper (char *buf, size_t size, const char *s) abort(); } +static unsigned +invert_count (unsigned bit_size) { + /* See https://eprint.iacr.org/2019/266.pdf, Theorem 11.2 */ + return (49 * bit_size + 57) / 17; +} + static void output_modulo (const char *name, const mpz_t x, unsigned size, unsigned bits_per_limb) { unsigned bit_size; + unsigned count; + unsigned invert_iterations; int shift; char buf[20]; mpz_t t; @@ -1193,9 +1201,29 @@ output_modulo (const char *name, const mpz_t x, / bits_per_limb)); bit_size = mpz_sizeinbase (x, 2); - shift = size * bits_per_limb - bit_size; assert (shift >= 0); + + /* If most significant bit of top limb is zero, input may be one bit + larger than the modulo. */ + count = invert_count (bit_size + (shift > 0)); + printf("#define ECC_INV%s_COUNT %u\n", buf, count); + + mpz_set_ui (t, 0); + mpz_setbit (t, bits_per_limb); + mpz_invert (t, x, t); + + printf ("#define ECC_BINV%s 0x", buf); + mpz_out_str (stdout, 16, t); + printf("%s\n", bits_per_limb > 32 ? "ULL" : "UL"); + + invert_iterations = (count + bits_per_limb - 3) / (bits_per_limb - 2); + mpz_set_ui (t, 2); + mpz_powm_ui (t, t, bits_per_limb * (invert_iterations + size) - count, x); + mpz_invert (t, t, x); + snprintf (buf, sizeof(buf), "ecc_inv%s_redc_power", name); + output_bignum (buf, t, size, bits_per_limb); + if (shift > 0) { mpz_set_ui (t, 0); @@ -1292,14 +1320,6 @@ output_curve (const struct ecc_curve *ecc, unsigned bits_per_limb) output_bignum ("ecc_b", ecc->b, limb_size, bits_per_limb); - mpz_add_ui (t, ecc->p, 1); - mpz_fdiv_q_2exp (t, t, 1); - output_bignum ("ecc_pp1h", t, limb_size, bits_per_limb); - - mpz_add_ui (t, ecc->q, 1); - mpz_fdiv_q_2exp (t, t, 1); - output_bignum ("ecc_qp1h", t, limb_size, bits_per_limb); - /* Trailing zeros in p+1 correspond to trailing ones in p. */ redc_limbs = mpz_scan0 (ecc->p, 0) / bits_per_limb; if (redc_limbs > 0) diff --git a/testsuite/ecc-modinv-test.c b/testsuite/ecc-modinv-test.c index aecfb43b..147896a6 100644 --- a/testsuite/ecc-modinv-test.c +++ b/testsuite/ecc-modinv-test.c @@ -51,9 +51,12 @@ test_modulo (gmp_randstate_t rands, const char *name, mp_limb_t *scratch; unsigned j; mpz_t r; + mp_bitcnt_t bits; mpz_init (r); + bits = m->bit_size + (m->size * GMP_NUMB_BITS > m->bit_size); + a = xalloc_limbs (m->size); ai = xalloc_limbs (m->size); ref = xalloc_limbs (m->size);; @@ -90,15 +93,17 @@ test_modulo (gmp_randstate_t rands, const char *name, fprintf (stderr, " (bad)\n"); abort (); } - + for (j = 0; j < COUNT; j++) { if (j & 1) - mpz_rrandomb (r, rands, m->size * GMP_NUMB_BITS); + mpz_rrandomb (r, rands, bits); else - mpz_urandomb (r, rands, m->size * GMP_NUMB_BITS); + mpz_urandomb (r, rands, bits); mpz_limbs_copy (a, r, m->size); + if ((a[m->size - 1] >> 1) > m->m[m->size - 1]) + a[m->size - 1] = 2*m->m[m->size - 1] + 1; if (!ref_modinv (ref, a, m->m, m->size, use_redc)) { @@ -113,7 +118,7 @@ test_modulo (gmp_randstate_t rands, const char *name, fprintf (stderr, "%s->invert failed (test %u, bit size %u):\n", name, j, m->bit_size); fprintf (stderr, "a = "); - mpz_out_str (stderr, 16, r); + mpn_out_str (stderr, 16, a, m->size); fprintf (stderr, "\np = "); mpn_out_str (stderr, 16, m->m, m->size); fprintf (stderr, "\nt = "); |