Reimplement modular inversion.

New implementation based on the algorithm by Daniel J. Bernstein and Bo-Yin Yang, see https://eprint.iacr.org/2019/266.pdf.
author: Niels Möller <nisse@lysator.liu.se> 2022-10-07 20:37:58 +0200
committer: Niels Möller <nisse@lysator.liu.se> 2022-11-28 19:30:04 +0100
commit: 39ed186b89bae0db3fe8a443b09ef8b493c483b3 (patch)
tree: 42b537c84f52f7823826779ccc22f1aa659e4bbe
parent: 3fbce8821d1d6c1c7a2a25c8e5ad9b186125bcdb (diff)
download: nettle-39ed186b89bae0db3fe8a443b09ef8b493c483b3.tar.gz
20 files changed, 420 insertions, 353 deletions
diff --git a/ChangeLog b/ChangeLog
index f1e5537d..0727f8d1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,25 @@
+2022-10-07  Niels Möller  <nisse@lysator.liu.se>
+
+	* ecc-mod-inv.c: Implement Bernstein-Yang algorithm. It
+	outperforms older inversion code, except except secp192r1 mod p
+	inversion.
+	* eccdata.c (output_modulo): Output iteration count for new
+	modular inversion algorithm, and needed power of two for redc
+	inversion.
+	* ecc-curve25519.c (ecc_curve25519_inv): Deleted.
+	* ecc-curve448.c (ecc_curve448_inv): Deleted.
+	* ecc-secp224r1.c (ecc_secp224r1_inv): Deleted.
+	* ecc-secp256r1.c (ecc_secp256r1_inv): Deleted.
+	* ecc-secp384r1.c (ecc_secp384r1_inv): Deleted.
+	* ecc-secp521r1.c (ecc_secp521r1_inv): Deleted.
+
+2022-10-03  Niels Möller  <nisse@lysator.liu.se>
+
+	* ecc-internal.h (struct ecc_modulo): Delete (m+1)/2 constant, and
+	update all curve structs.
+	* eccdata.c (output_curve): Delete corresponding code to generate
+	ecc_pp1h and ecc_qp1h.
+
 2022-11-09  Niels Möller  <nisse@lysator.liu.se>
 
 	From Mamone Tarsha:
diff --git a/curve25519-eh-to-x.c b/curve25519-eh-to-x.c
index d90defda..8b6e8d1a 100644
--- a/curve25519-eh-to-x.c
+++ b/curve25519-eh-to-x.c
@@ -64,7 +64,7 @@ curve25519_eh_to_x (mp_limb_t *xp, const mp_limb_t *p,
      x = 0, and we should be fine, since ecc_mod_inv for ecc->p returns 0
      in this case. */
   ecc_mod_sub (&ecc->p, t0, wp, vp);
-  /* Needs a total of 6*size storage. */
+  /* Needs a total of 2*size + scratch for inversion. */
   ecc->p.invert (&ecc->p, t1, t0, tp);
   
   ecc_mod_add (&ecc->p, t0, wp, vp);
diff --git a/curve25519-mul-g.c b/curve25519-mul-g.c
index 000e0987..3f12af73 100644
--- a/curve25519-mul-g.c
+++ b/curve25519-mul-g.c
@@ -33,6 +33,7 @@
 # include "config.h"
 #endif
 
+#include <assert.h>
 #include <string.h>
 
 #include "curve25519.h"
@@ -57,7 +58,9 @@ curve25519_mul_g (uint8_t *r, const uint8_t *n)
   t[0] &= ~7;
   t[CURVE25519_SIZE-1] = (t[CURVE25519_SIZE-1] & 0x3f) | 0x40;
 
-  itch = 4*ecc->p.size + ecc->mul_g_itch;
+  assert (ecc->mul_g_itch <= 2*ecc->p.size + ecc->p.invert_itch); 
+  itch = 6*ecc->p.size + ecc->p.invert_itch;
+
   scratch = gmp_alloc_limbs (itch);
 
   mpn_set_base256_le (x, ecc->p.size, t, CURVE25519_SIZE);
diff --git a/curve25519-mul.c b/curve25519-mul.c
index aff2b293..d8023fc3 100644
--- a/curve25519-mul.c
+++ b/curve25519-mul.c
@@ -48,7 +48,7 @@ curve25519_mul (uint8_t *q, const uint8_t *n, const uint8_t *p)
   mp_size_t itch;
   mp_limb_t *x;
 
-  itch = m->size + ECC_MUL_M_ITCH(m->size);
+  itch = m->size + ECC_MUL_M_ITCH(m->size, m->invert_itch);
   x = gmp_alloc_limbs (itch);
 
   /* Note that 255 % GMP_NUMB_BITS == 0 isn't supported, so x always
diff --git a/curve448-eh-to-x.c b/curve448-eh-to-x.c
index 3b9bf3ec..29c38b27 100644
--- a/curve448-eh-to-x.c
+++ b/curve448-eh-to-x.c
@@ -58,7 +58,7 @@ curve448_eh_to_x (mp_limb_t *xp, const mp_limb_t *p, mp_limb_t *scratch)
 
      x = v^2 / u^2 = (V/W)^2 / (U/W)^2 = (V/U)^2
   */
-  /* Needs a total of 5*size storage. */
+  /* Needs a total of size + scratch for inversion. */
   ecc->p.invert (&ecc->p, t0, up, tp);
   ecc_mod_mul (&ecc->p, t0, t0, vp, tp);
   ecc_mod_sqr_canonical (&ecc->p, xp, t0, tp);
diff --git a/curve448-mul-g.c b/curve448-mul-g.c
index a396595a..71af4547 100644
--- a/curve448-mul-g.c
+++ b/curve448-mul-g.c
@@ -58,7 +58,7 @@ curve448_mul_g (uint8_t *r, const uint8_t *n)
   t[0] &= ~3;
   t[CURVE448_SIZE-1] = (t[CURVE448_SIZE-1] & 0x7f) | 0x80;
 
-  itch = 5*ecc->p.size + ecc->mul_g_itch;
+  itch = 4*ecc->p.size + ecc->mul_g_itch;
   scratch = gmp_alloc_limbs (itch);
 
   mpn_set_base256_le (x, ecc->p.size, t, CURVE448_SIZE);
diff --git a/curve448-mul.c b/curve448-mul.c
index 2090c8a5..c8b13e90 100644
--- a/curve448-mul.c
+++ b/curve448-mul.c
@@ -50,7 +50,7 @@ curve448_mul (uint8_t *q, const uint8_t *n, const uint8_t *p)
   mp_size_t itch;
   mp_limb_t *x;
 
-  itch = m->size + ECC_MUL_M_ITCH(m->size);
+  itch = m->size + ECC_MUL_M_ITCH(m->size, m->invert_itch);
   x = gmp_alloc_limbs (itch);
 
   mpn_set_base256_le (x, m->size, p, CURVE448_SIZE);
diff --git a/ecc-curve25519.c b/ecc-curve25519.c
index 539bff22..bb419ad2 100644
--- a/ecc-curve25519.c
+++ b/ecc-curve25519.c
@@ -149,27 +149,6 @@ ecc_mod_pow_252m3 (const struct ecc_modulo *m,
 #undef tp
 }
 
-/* Scratch as for ecc_mod_pow_252m3 above. */
-#define ECC_25519_INV_ITCH (4*ECC_LIMB_SIZE)
-
-static void
-ecc_curve25519_inv (const struct ecc_modulo *p,
-		    mp_limb_t *rp, const mp_limb_t *ap,
-		    mp_limb_t *scratch)
-{
-  /* Addition chain
-
-       p - 2 = 2^{255} - 21
-             = 1 + 2 (1 + 4 (2^{252}-3))
-  */
-  ecc_mod_pow_252m3 (p, rp, ap, scratch);
-  ecc_mod_sqr (p, rp, rp, scratch);
-  ecc_mod_sqr (p, rp, rp, scratch);
-  ecc_mod_mul (p, rp, ap, rp, scratch);
-  ecc_mod_sqr (p, rp, rp, scratch);
-  ecc_mod_mul (p, rp, ap, rp, scratch);
-}
-
 static int
 ecc_curve25519_zero_p (const struct ecc_modulo *p, mp_limb_t *xp)
 {
@@ -259,20 +238,22 @@ const struct ecc_curve _nettle_curve25519 =
     ECC_LIMB_SIZE,
     ECC_BMODP_SIZE,
     0,
-    ECC_25519_INV_ITCH,
+    ECC_MOD_INV_ITCH(ECC_LIMB_SIZE),
     0,
     ECC_25519_SQRT_RATIO_ITCH,
+    ECC_INVP_COUNT,
+    ECC_BINVP,
 
     ecc_p,
     ecc_Bmodp,
     ecc_Bmodp_shifted,
     ecc_Bm2p,
     NULL,
-    ecc_pp1h,
+    NULL,
 
     ecc_curve25519_modp,
     ecc_curve25519_modp,
-    ecc_curve25519_inv,
+    ecc_mod_inv,
     NULL,
     ecc_curve25519_sqrt_ratio,
   },
@@ -284,13 +265,15 @@ const struct ecc_curve _nettle_curve25519 =
     ECC_MOD_INV_ITCH (ECC_LIMB_SIZE),
     0,
     0,
+    ECC_INVQ_COUNT,
+    ECC_BINVQ,
 
     ecc_q,
     ecc_Bmodq,  
     ecc_mBmodq_shifted, /* Use q - 2^{252} instead. */
     ecc_Bm2q,
     NULL,
-    ecc_qp1h,
+    NULL,
 
     ecc_curve25519_modq,
     ecc_curve25519_modq,
@@ -308,7 +291,7 @@ const struct ecc_curve _nettle_curve25519 =
   ECC_DUP_TH_ITCH (ECC_LIMB_SIZE),
   ECC_MUL_A_EH_ITCH (ECC_LIMB_SIZE),
   ECC_MUL_G_EH_ITCH (ECC_LIMB_SIZE),
-  ECC_EH_TO_A_ITCH (ECC_LIMB_SIZE, ECC_25519_INV_ITCH),
+  ECC_EH_TO_A_ITCH (ECC_LIMB_SIZE, ECC_MOD_INV_ITCH(ECC_LIMB_SIZE)),
 
   ecc_add_th,
   ecc_add_thh,
diff --git a/ecc-curve448.c b/ecc-curve448.c
index daef56cc..8d1258fe 100644
--- a/ecc-curve448.c
+++ b/ecc-curve448.c
@@ -142,18 +142,6 @@ ecc_mod_pow_446m224m1 (const struct ecc_modulo *p,
 #undef tp
 }
 
-#define ECC_CURVE448_INV_ITCH (4*ECC_LIMB_SIZE)
-
-static void ecc_curve448_inv (const struct ecc_modulo *p,
-			 mp_limb_t *rp, const mp_limb_t *ap,
-			 mp_limb_t *tp)
-{
-  ecc_mod_pow_446m224m1 (p, rp, ap, tp);/* a^{2^446-2^222-1} */
-  ecc_mod_sqr (p, rp, rp, tp);		/* a^{2^447-2^223-2} */
-  ecc_mod_sqr (p, rp, rp, tp);		/* a^{2^448-2^224-4} */
-  ecc_mod_mul (p, rp, ap, rp, tp);	/* a^{2^448-2^224-3} */
-}
-
 /* To guarantee that inputs to ecc_mod_zero_p are in the required range. */
 #if ECC_LIMB_SIZE * GMP_NUMB_BITS != 448
 #error Unsupported limb size
@@ -213,20 +201,22 @@ const struct ecc_curve _nettle_curve448 =
     ECC_LIMB_SIZE,
     ECC_BMODP_SIZE,
     0,
-    ECC_CURVE448_INV_ITCH,
+    ECC_MOD_INV_ITCH(ECC_LIMB_SIZE),
     0,
     ECC_CURVE448_SQRT_RATIO_ITCH,
+    ECC_INVP_COUNT,
+    ECC_BINVP,
 
     ecc_p,
     ecc_Bmodp,
     ecc_Bmodp_shifted,
     ecc_Bm2p,
     NULL,
-    ecc_pp1h,
+    NULL,
 
     ecc_curve448_modp,
     ecc_curve448_modp,
-    ecc_curve448_inv,
+    ecc_mod_inv,
     NULL,
     ecc_curve448_sqrt_ratio,
   },
@@ -238,13 +228,15 @@ const struct ecc_curve _nettle_curve448 =
     ECC_MOD_INV_ITCH (ECC_LIMB_SIZE),
     0,
     0,
+    ECC_INVQ_COUNT,
+    ECC_BINVQ,
 
     ecc_q,
     ecc_Bmodq,
     ecc_Bmodq_shifted,
     ecc_Bm2q,
     NULL,
-    ecc_qp1h,
+    NULL,
 
     ecc_mod,
     ecc_mod,
@@ -262,7 +254,7 @@ const struct ecc_curve _nettle_curve448 =
   ECC_DUP_EH_ITCH (ECC_LIMB_SIZE),
   ECC_MUL_A_EH_ITCH (ECC_LIMB_SIZE),
   ECC_MUL_G_EH_ITCH (ECC_LIMB_SIZE),
-  ECC_EH_TO_A_ITCH (ECC_LIMB_SIZE, ECC_CURVE448_INV_ITCH),
+  ECC_EH_TO_A_ITCH (ECC_LIMB_SIZE, ECC_MOD_INV_ITCH (ECC_LIMB_SIZE)),
 
   ecc_add_eh,
   ecc_add_ehh,
diff --git a/ecc-gost-gc256b.c b/ecc-gost-gc256b.c
index df9cbb58..10c7ae8c 100644
--- a/ecc-gost-gc256b.c
+++ b/ecc-gost-gc256b.c
@@ -67,14 +67,16 @@ const struct ecc_curve _nettle_gost_gc256b =
     ECC_MOD_INV_ITCH (ECC_LIMB_SIZE),
     0,
     0,
+    ECC_INVP_COUNT,
+    ECC_BINVP,
 
     ecc_p,
     ecc_Bmodp,
     ecc_Bmodp_shifted,
     ecc_Bm2p,
     ecc_redc_ppm1,
+    NULL,
 
-    ecc_pp1h,
     ecc_gost_gc256b_modp,
     ecc_gost_gc256b_modp,
     ecc_mod_inv,
@@ -89,13 +91,15 @@ const struct ecc_curve _nettle_gost_gc256b =
     ECC_MOD_INV_ITCH (ECC_LIMB_SIZE),
     0,
     0,
+    ECC_INVQ_COUNT,
+    ECC_BINVQ,
 
     ecc_q,
     ecc_Bmodq,
     ecc_Bmodq_shifted,
     ecc_Bm2q,
     NULL,
-    ecc_qp1h,
+    NULL,
 
     ecc_gost_gc256b_modq,
     ecc_gost_gc256b_modq,
diff --git a/ecc-gost-gc512a.c b/ecc-gost-gc512a.c
index 3807b57e..0ba58d64 100644
--- a/ecc-gost-gc512a.c
+++ b/ecc-gost-gc512a.c
@@ -67,14 +67,16 @@ const struct ecc_curve _nettle_gost_gc512a =
     ECC_MOD_INV_ITCH (ECC_LIMB_SIZE),
     0,
     0,
+    ECC_INVP_COUNT,
+    ECC_BINVP,
 
     ecc_p,
     ecc_Bmodp,
     ecc_Bmodp_shifted,
     ecc_Bm2p,
     ecc_redc_ppm1,
+    NULL,
 
-    ecc_pp1h,
     ecc_gost_gc512a_modp,
     ecc_gost_gc512a_modp,
     ecc_mod_inv,
@@ -89,13 +91,15 @@ const struct ecc_curve _nettle_gost_gc512a =
     ECC_MOD_INV_ITCH (ECC_LIMB_SIZE),
     0,
     0,
+    ECC_INVQ_COUNT,
+    ECC_BINVQ,
 
     ecc_q,
     ecc_Bmodq,
     ecc_Bmodq_shifted,
     ecc_Bm2q,
     NULL,
-    ecc_qp1h,
+    NULL,
 
     ecc_gost_gc512a_modq,
     ecc_gost_gc512a_modq,
diff --git a/ecc-internal.h b/ecc-internal.h
index be02de5f..cd4d5ea1 100644
--- a/ecc-internal.h
+++ b/ecc-internal.h
@@ -170,6 +170,13 @@ struct ecc_modulo
   unsigned short sqrt_itch;
   unsigned short sqrt_ratio_itch;
 
+  /* Needed iterations for the Bernstein-Yang inversion algorithm, for
+     input size bit_size, or bit_size + 1 in case the most significant
+     bit of the most significant modulo limb is zero. */
+  unsigned short invert_count;
+  /* m^{-1} mod 2^{GMP_NUMB_BITS} */
+  mp_limb_t binv;
+
   const mp_limb_t *m;
   /* B^size mod m. Expected to have at least 32 leading zeros
      (equality for secp_256r1). */
@@ -184,8 +191,8 @@ struct ecc_modulo
 
   /* m +/- 1, for redc, excluding redc_size low limbs. */
   const mp_limb_t *redc_mpm1;
-  /* (m+1)/2 */
-  const mp_limb_t *mp1h;
+
+  const mp_limb_t *invert_power;
 
   ecc_mod_func *mod;
   ecc_mod_func *reduce;
@@ -482,7 +489,7 @@ curve448_eh_to_x (mp_limb_t *xp, const mp_limb_t *p,
 		  mp_limb_t *scratch);
 
 /* Current scratch needs: */
-#define ECC_MOD_INV_ITCH(size) (3*(size))
+#define ECC_MOD_INV_ITCH(size) (5*(size)+4)
 #define ECC_J_TO_A_ITCH(size, inv) ((size)+(inv))
 #define ECC_EH_TO_A_ITCH(size, inv) ((size)+(inv))
 #define ECC_DUP_JJ_ITCH(size) (4*(size))
@@ -508,7 +515,7 @@ curve448_eh_to_x (mp_limb_t *xp, const mp_limb_t *p,
 #define ECC_MUL_A_EH_ITCH(size) \
   (((3 << ECC_MUL_A_EH_WBITS) + 7) * (size))
 #endif
-#define ECC_MUL_M_ITCH(size) (8*(size))
+#define ECC_MUL_M_ITCH(size, inv) (4*(size) + (inv))
 #define ECC_ECDSA_SIGN_ITCH(size) (11*(size))
 #define ECC_GOSTDSA_SIGN_ITCH(size) (11*(size))
 #define ECC_MOD_RANDOM_ITCH(size) (size)
diff --git a/ecc-mod-inv.c b/ecc-mod-inv.c
index 254fb697..68d6c339 100644
--- a/ecc-mod-inv.c
+++ b/ecc-mod-inv.c
@@ -1,6 +1,6 @@
 /* ecc-mod-inv.c
 
-   Copyright (C) 2013, 2014 Niels Möller
+   Copyright (C) 2013, 2014, 2022 Niels Möller
 
    This file is part of GNU Nettle.
 
@@ -29,8 +29,6 @@
    not, see http://www.gnu.org/licenses/.
 */
 
-/* Development of Nettle's ECC support was funded by the .SE Internet Fund. */
-
 #if HAVE_CONFIG_H
 # include "config.h"
 #endif
@@ -39,7 +37,11 @@
 
 #include "ecc-internal.h"
 
-static void
+#if GMP_LIMB_BITS != GMP_NUMB_BITS
+#error Unsupported configuration
+#endif
+
+static mp_limb_t
 cnd_neg (int cnd, mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n)
 {
   mp_limb_t cy = (cnd != 0);
@@ -52,109 +54,282 @@ cnd_neg (int cnd, mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n)
       cy = r < cy;
       rp[i] = r;
     }
+  /* Return cnd except R = A = 0. */
+  return cnd & (1-cy);
 }
 
-/* Compute a^{-1} mod m, with running time depending only on the size.
-   Returns zero if a == 0 (mod m), to be consistent with a^{phi(m)-1}.
-   Also needs (m+1)/2, and m must be odd.
+#define BITCNT_BITS (sizeof(mp_bitcnt_t) * CHAR_BIT)
+
+#define STEPS (GMP_NUMB_BITS - 2)
+
+struct matrix {
+  /* Matrix elements interpreted as signed two's complement. Absolute
+     value of elements is at most 2^STEPS. */
+  mp_limb_t a[2][2];
+};
+
+/* Conditionally set (a, b) <-- (b, -a) */
+#define CND_NEG_SWAP_LIMB(cnd, a, b) do {\
+    mp_limb_t __cnd_sum = (a) + (b);	 \
+    mp_limb_t __cnd_diff = (a) - (b);	 \
+    (a) -= __cnd_diff & -(cnd);		 \
+    (b) -= __cnd_sum & -(cnd);		 \
+  } while (0)
+
+/* Perform K elementary reduction steps on (delta, f, g). Only the
+   least significant K bits of f, g matter. Note that delta has an
+   unsigned type, but is used as two's complement. */
+static mp_bitcnt_t
+steps(struct matrix *M, unsigned k, mp_bitcnt_t delta, mp_limb_t f, mp_limb_t g)
+{
+  mp_limb_t a00, a01, a10, a11;
+  assert (f & 1);
+
+  /* Identity matrix. */
+  a00 = a11 = 1;
+  a01 = a10 = 0;
+
+  /* Preserve invariant (f ; g) = 2^{-i} M (f_orig, g_orig) */
+  for (; k-- > 0; delta++)
+    {
+      mp_limb_t odd = g & 1;
+      mp_limb_t swap = odd & ~(delta >> (BITCNT_BITS-1));
+
+      /* Swap; (f'; g') = (g; -f) = (0,1;-1,0) (g;f) */
+      CND_NEG_SWAP_LIMB(swap, f, g);
+      CND_NEG_SWAP_LIMB(swap, a00, a10);
+      CND_NEG_SWAP_LIMB(swap, a01, a11);
+
+      /* Conditional negation. */
+      delta = (delta ^ - (mp_bitcnt_t) swap) + swap;
+
+      /* Cancel low bit and shift. */
+      g += f & -odd;
+      a10 += a00 & -odd;
+      a11 += a01 & -odd;
+
+      g >>= 1;
+      a00 <<= 1;
+      a01 <<= 1;
+    }
+  M->a[0][0] = a00; M->a[0][1] = a01;
+  M->a[1][0] = a10; M->a[1][1] = a11;
+  return delta;
+}
+
+/* Set R = (u * F + v * G), treating all numbers as two's complement.
+   No overlap allowed. */
+static mp_limb_t
+add_add_mul (mp_ptr rp, const mp_limb_t *fp, const mp_limb_t *gp, mp_size_t n,
+	     mp_limb_t u, mp_limb_t v) {
+  mp_limb_t f_sign = fp[n-1] >> (GMP_LIMB_BITS - 1);
+  mp_limb_t g_sign = gp[n-1] >> (GMP_LIMB_BITS - 1);
+  mp_limb_t u_sign = u >> (GMP_LIMB_BITS - 1);
+  mp_limb_t v_sign = v >> (GMP_LIMB_BITS - 1);
+  mp_limb_t hi = mpn_mul_1 (rp, fp, n, u) - ((-f_sign) & u);
+  hi -= ((-u_sign) & fp[n-1]) + mpn_cnd_sub_n (u_sign, rp + 1, rp + 1, fp, n-1);
+
+  hi += mpn_addmul_1 (rp, gp, n, v) - ((-g_sign) & v);
+  hi -= ((-v_sign) & gp[n-1]) + mpn_cnd_sub_n (v_sign, rp + 1, rp + 1, gp, n-1);
+
+  return hi;
+}
+
+/* Update (f'; g') = M (f; g) / 2^{shift}, where all numbers are two's complement. */
+static void
+matrix_vector_mul (const struct matrix *M, unsigned shift,
+		   mp_size_t n, mp_limb_t *fp, mp_limb_t *gp, mp_limb_t *tp)
+{
+  mp_limb_t f_hi = add_add_mul (tp, fp, gp, n, M->a[0][0], M->a[0][1]);
+  mp_limb_t g_hi = add_add_mul (tp + n, fp, gp, n, M->a[1][0], M->a[1][1]);
+  mp_limb_t lo = mpn_rshift (fp, tp, n, shift);
+  assert (lo == 0);
+  fp[n-1] += (f_hi << (GMP_LIMB_BITS - shift));
+  lo = mpn_rshift (gp, tp + n, n, shift);
+  assert (lo == 0);
+  gp[n-1] += (g_hi << (GMP_LIMB_BITS - shift));
+}
+
+/* Set R = (u * F + v * G) (mod M), treating u, v as two's complement,
+   but F, G, R unsigned. No overlap allowed. n limb inputs, n+1 limb
+   output.
+
+   Input can be allowed in the range 0 <= F, G < min (2 M, B^n),
+   output always in the range 0 <= R < B M.
+*/
+static void
+add_add_mul_mod (mp_ptr rp, const mp_limb_t *fp, const mp_limb_t *gp,
+		 const mp_limb_t *mp, mp_size_t n,
+		 mp_limb_t u, mp_limb_t v) {
+  mp_limb_t u_sign = u >> (GMP_LIMB_BITS - 1);
+  mp_limb_t v_sign = v >> (GMP_LIMB_BITS - 1);
+  mp_limb_t r_sign;
+
+  assert ((fp[n-1] >> 1) <= mp[n-1]);
+  assert ((gp[n-1] >> 1) <= mp[n-1]);
+
+  rp[n] = mpn_mul_1 (rp, fp, n, u);
+  mpn_cnd_sub_n (u_sign, rp + 1, rp + 1, fp, n);
+
+  rp[n] += mpn_addmul_1 (rp, gp, n, v);
+  mpn_cnd_sub_n (v_sign, rp + 1, rp + 1, gp, n);
+
+  /* Row sums of the matrix have absolute value <= B/4. With inputs F,
+     G < 2 M, at this point we have
+
+       |R| < B M/2.
+
+     If R < 0, then R + B M is positive negative, adding B M makes the
+     result positive.
+  */
+  r_sign = rp[n] >> (GMP_LIMB_BITS - 1);
+  r_sign -= mpn_cnd_add_n (r_sign, rp + 1, rp + 1, mp, n);
+  assert (r_sign == 0);
+  assert (rp[n] <= mp[n-1]);
+}
+
+/* Input in range 0 <= T < B M, output in range 0 <= R < M. */
+static void
+redc_1 (mp_limb_t *rp, mp_limb_t *tp,
+	const mp_limb_t *mp, mp_size_t n, mp_limb_t m_binv)
+{
+  mp_limb_t cy, hi;
+  /* If we knew that 2M < B^n, we could allow result value in the
+     range 0 <= R < 2M, and use mpn_add_mul_1 without any adjustment
+     step. */
+  hi = mpn_submul_1 (tp, mp, n, m_binv * tp[0]);
+  assert (tp[0] == 0);
+  cy = tp[n] < hi;
+  tp[n] -= hi;
+
+  cy -= mpn_cnd_add_n (cy, rp, tp + 1, mp, n);
+  assert (cy == 0);
+}
+
+/* Update (u'; v') = M (u; v) B^{-1} (mod m), where u, v, m are
+   unsigned n limbs, M has signed elements, and B is the bignum
+   base.
 
-   Needs 3n limbs of scratch space.
+   Output are canonically reduced, 0 <= U, V < M, but
+   inputs are only required to be < 2 M.
 */
+static void
+matrix_vector_mul_mod (const struct matrix *M, const mp_limb_t *mp,
+		       mp_limb_t m_binv,
+		       mp_size_t n, mp_limb_t *up, mp_limb_t *vp, mp_limb_t *tp)
+{
+  add_add_mul_mod (tp, up, vp, mp, n, M->a[0][0], M->a[0][1]);
+  add_add_mul_mod (tp + n + 1, up, vp, mp, n, M->a[1][0], M->a[1][1]);
+
+  /* Reduce to n limbs, by multiplying with B^-1 (mod m) */
+  redc_1 (up, tp, mp, n, m_binv);
+  redc_1 (vp, tp + n + 1, mp, n, m_binv);
+}
+
+/* Input in range 0 <= T < 2M, output in range 0 <= R < 2M. */
+static void
+redc_bits (mp_limb_t *rp, mp_limb_t *tp, const mp_limb_t *mp,
+	   mp_size_t n, mp_limb_t m_binv, mp_bitcnt_t k)
+{
+  mp_limb_t hi;
+
+  for (; k >= GMP_NUMB_BITS; k -= GMP_NUMB_BITS)
+    {
+      hi = mpn_addmul_1 (tp, mp, n, -m_binv * tp[0]);
+      mpn_copyi (tp, tp + 1, n - 1);
+      tp[n-1] = hi;
+    }
+  if (k > 0)
+    {
+      mp_limb_t mask = ((mp_limb_t) 2 << (k-1)) - 1;
+      mp_limb_t q = (-m_binv * tp[0]) & mask;
+      hi = mpn_addmul_1 (tp, mp, n, q);
+      mpn_rshift (rp, tp, n, k);
+      rp[n-1] |= hi << (GMP_NUMB_BITS - k);
+    }
+}
+
+static int
+one_p (const mp_limb_t *p, mp_size_t n)
+{
+  mp_limb_t diff = p[0] ^ 1;
+  mp_size_t i;
+  for (i = 1; i < n; i++)
+    diff |= p[i];
+
+  return diff == 0;
+}
 
-/* FIXME: Could use mpn_sec_invert (in GMP-6), but with a bit more
-   scratch need since it doesn't precompute (m+1)/2. */
+/* Compute a^{-1} mod m, with running time depending only on the size.
+   Returns zero if a == 0 (mod m), to be consistent with a^{phi(m)-1}.
+   Based on the algorithm in https://eprint.iacr.org/2019/266.pdf */
 void
 ecc_mod_inv (const struct ecc_modulo *m,
-	     mp_limb_t *vp, const mp_limb_t *in_ap,
+	     mp_limb_t *up, const mp_limb_t *ap,
 	     mp_limb_t *scratch)
 {
-#define ap scratch
-#define bp (scratch + n)
-#define up (scratch + 2*n)
-
   mp_size_t n = m->size;
-  /* Avoid the mp_bitcnt_t type for compatibility with older GMP
-     versions. */  
-  unsigned i;
+  mp_bitcnt_t shift, delta, count;
+  mp_limb_t cy;
+  struct matrix M;
+
+  /* Total scratch need 4*(n+1) for fp, gp, tp, n for v, total 5*n+4 */
+#define fp scratch
+#define gp (scratch + n + 1)
+#define vp (scratch + 2*n + 2)
+#define tp (scratch + 3*n + 2)
+
+#define mp (m->m)
+
+  /* Input should satisfy a < 2m, since that is what the iteration
+     count is based on. */
+  assert ((ap[n-1] >> 1) <= mp[n-1]);
 
-  /* Maintain
+  mpn_copyi (fp, mp, n); fp[n] = 0; gp[n] = 0;
 
-       a = u * orig_a (mod m)
-       b = v * orig_a (mod m)
+  if (m->invert_power)
+    /* Multiply by suitable pre-computed power to get inverse in redc
+       representation. */
+    ecc_mod_mul (m, gp, ap, m->invert_power, tp);
+  else {
+    mpn_copyi (gp, ap, n);
 
-     and b odd at all times. Initially,
+    shift = GMP_NUMB_BITS*((m->invert_count + STEPS - 1) / STEPS) - m->invert_count;
 
-       a = a_orig, u = 1
-       b = m,      v = 0
-     */
+    /* Premultiply a by 2^{- shift} mod m */
+    redc_bits (gp, gp, mp, n, m->binv, shift);
+  }
 
-  assert (ap != vp);
+  /* Maintain invariant
 
-  up[0] = 1;
-  mpn_zero (up+1, n - 1);
-  mpn_copyi (bp, m->m, n);
-  mpn_zero (vp, n);
-  mpn_copyi (ap, in_ap, n);
+       a * u = 2^{-count} B^{ceil(count/STEPS)} f (mod m)
+       a * v = 2^{-count} B^{ceil(count/STEPS)} g (mod m)
+  */
+  mpn_zero (up, n);
+  mpn_zero (vp+1, n-1); vp[0] = 1;
 
-  for (i = m->bit_size + GMP_NUMB_BITS * n; i-- > 0; )
+  for (delta = 1, count = m->invert_count; count > STEPS; count -= STEPS)
     {
-      mp_limb_t odd, swap, cy;
-      
-      /* Always maintain b odd. The logic of the iteration is as
-	 follows. For a, b:
-
-	   odd = a & 1
-	   a -= odd * b
-	   if (underflow from a-b)
-	     {
-	       b += a, assigns old a
-	       a = B^n-a
-	     }
-	   
-	   a /= 2
-
-	 For u, v:
-
-	   if (underflow from a - b)
-	     swap u, v
-	   u -= odd * v
-	   if (underflow from u - v)
-	     u += m
-
-	   u /= 2
-	   if (a one bit was shifted out)
-	     u += (m+1)/2
-
-	 As long as a > 0, the quantity
-
-	   (bitsize of a) + (bitsize of b)
-
-	 is reduced by at least one bit per iteration, hence after
-         (bit_size of orig_a) + (bit_size of m) - 1 iterations we
-         surely have a = 0. Then b = gcd(orig_a, m) and if b = 1 then
-         also v = orig_a^{-1} (mod m)
-      */
-
-      assert (bp[0] & 1);
-      odd = ap[0] & 1;
-
-      swap = mpn_cnd_sub_n (odd, ap, ap, bp, n);
-      mpn_cnd_add_n (swap, bp, bp, ap, n);
-      cnd_neg (swap, ap, ap, n);
-
-      mpn_cnd_swap (swap, up, vp, n);
-      cy = mpn_cnd_sub_n (odd, up, up, vp, n);
-      cy -= mpn_cnd_add_n (cy, up, up, m->m, n);
-      assert (cy == 0);
-
-      cy = mpn_rshift (ap, ap, n, 1);
-      assert (cy == 0);
-      cy = mpn_rshift (up, up, n, 1);
-      cy = mpn_cnd_add_n (cy, up, up, m->mp1h, n);
-      assert (cy == 0);
+      delta = steps (&M, STEPS, delta, fp[0], gp[0]);
+      matrix_vector_mul (&M, STEPS, n+1, fp, gp, tp);
+      matrix_vector_mul_mod (&M, mp, m->binv, n, up, vp, tp);
     }
-  assert ( (ap[0] | ap[n-1]) == 0);
-#undef ap
-#undef bp
-#undef up
+  delta = steps (&M, count, delta, fp[0], gp[0]);
+  matrix_vector_mul (&M, count, n+1, fp, gp, tp);
+  /* Only compute u, we don't need v. */
+  add_add_mul_mod (tp, up, vp, mp, n, M.a[0][0], M.a[0][1]);
+  redc_1 (up, tp, mp, n, m->binv);
+
+  assert (sec_zero_p (gp, n+1));
+
+  /* Now f = ±1 (if the inverse exists), and a * u = f (mod m) */
+  cy = cnd_neg (fp[n] >> (GMP_LIMB_BITS - 1), up, up, n);
+  /* Make u non-negative */
+  cy -= mpn_cnd_add_n (cy, up, up, mp, n);
+
+  cnd_neg (fp[n] >> (GMP_LIMB_BITS - 1), fp, fp, n + 1);
+  cy = one_p (fp, n+1);
+  /* Set to zero if invert doesn't exist. */
+  while (n > 0)
+    up[--n] &= - cy;
 }
diff --git a/ecc-secp192r1.c b/ecc-secp192r1.c
index 4a07bca3..00ff87b1 100644
--- a/ecc-secp192r1.c
+++ b/ecc-secp192r1.c
@@ -244,13 +244,15 @@ const struct ecc_curve _nettle_secp_192r1 =
     ECC_SECP192R1_INV_ITCH,
     ECC_SECP192R1_SQRT_ITCH,
     0,
+    ECC_INVP_COUNT,
+    ECC_BINVP,
 
     ecc_p,
     ecc_Bmodp,
     ecc_Bmodp_shifted,
     ecc_Bm2p,
     ecc_redc_ppm1,
-    ecc_pp1h,
+    NULL,
 
     ecc_secp192r1_modp,
     ecc_secp192r1_modp,
@@ -266,13 +268,15 @@ const struct ecc_curve _nettle_secp_192r1 =
     ECC_MOD_INV_ITCH (ECC_LIMB_SIZE),
     0,
     0,
+    ECC_INVQ_COUNT,
+    ECC_BINVQ,
 
     ecc_q,
     ecc_Bmodq,
     ecc_Bmodq_shifted,
     ecc_Bm2q,
     NULL,
-    ecc_qp1h,
+    NULL,
 
     ecc_mod,
     ecc_mod,
diff --git a/ecc-secp224r1.c b/ecc-secp224r1.c
index b2a335ec..341aee7b 100644
--- a/ecc-secp224r1.c
+++ b/ecc-secp224r1.c
@@ -111,32 +111,6 @@ ecc_mod_pow_127m1 (const struct ecc_modulo *m,
 #undef tp
 }
 
-#define ECC_SECP224R1_INV_ITCH (4*ECC_LIMB_SIZE)
-
-static void
-ecc_secp224r1_inv (const struct ecc_modulo *p,
-		   mp_limb_t *rp, const mp_limb_t *ap,
-		   mp_limb_t *scratch)
-{
-#define a96m1 scratch
-#define tp (scratch + ECC_LIMB_SIZE)
-
-  /* Compute a^{p - 2}, with
-
-       p-2 = 2^{224} - 2^{96} - 1
-                   = 2^{97}(2^{127} - 1) + 2^{96} - 1
-
-     This addition chain needs 97 squarings and one multiply in
-     addition to ecc_mod_pow_127m1, for a total of 223 squarings and
-     13 multiplies.
-  */
-  ecc_mod_pow_127m1 (p, rp, a96m1, ap, tp);
-  ecc_mod_pow_2k_mul (p, rp, rp, 97, a96m1, tp); /* a^{2^{224} - 2^{96} - 1 */
-
-#undef a96m1
-#undef tp
-}
-
 #define ECC_SECP224R1_SQRT_ITCH (5*ECC_LIMB_SIZE)
 
 static int
@@ -216,20 +190,22 @@ const struct ecc_curve _nettle_secp_224r1 =
     ECC_LIMB_SIZE,    
     ECC_BMODP_SIZE,
     -ECC_REDC_SIZE,
-    ECC_SECP224R1_INV_ITCH,
+    ECC_MOD_INV_ITCH(ECC_LIMB_SIZE),
     ECC_SECP224R1_SQRT_ITCH,
     0,
+    ECC_INVP_COUNT,
+    ECC_BINVP,
 
     ecc_p,
     ecc_Bmodp,
     ecc_Bmodp_shifted,
     ecc_Bm2p,
     ecc_redc_ppm1,
-    ecc_pp1h,
+    USE_REDC ? ecc_invp_redc_power : NULL,
 
     ecc_secp224r1_modp,
     USE_REDC ? ecc_secp224r1_redc : ecc_secp224r1_modp,
-    ecc_secp224r1_inv,
+    ecc_mod_inv,
     ecc_secp224r1_sqrt,
     NULL,
   },
@@ -241,13 +217,15 @@ const struct ecc_curve _nettle_secp_224r1 =
     ECC_MOD_INV_ITCH (ECC_LIMB_SIZE),
     0,
     0,
+    ECC_INVQ_COUNT,
+    ECC_BINVQ,
 
     ecc_q,
     ecc_Bmodq,
     ecc_Bmodq_shifted,
     ecc_Bm2q,
     NULL,
-    ecc_qp1h,
+    NULL,
 
     ecc_mod,
     ecc_mod,
@@ -265,7 +243,7 @@ const struct ecc_curve _nettle_secp_224r1 =
   ECC_DUP_JJ_ITCH (ECC_LIMB_SIZE),
   ECC_MUL_A_ITCH (ECC_LIMB_SIZE),
   ECC_MUL_G_ITCH (ECC_LIMB_SIZE),
-  ECC_J_TO_A_ITCH(ECC_LIMB_SIZE, ECC_SECP224R1_INV_ITCH),
+  ECC_J_TO_A_ITCH(ECC_LIMB_SIZE, ECC_MOD_INV_ITCH (ECC_LIMB_SIZE)),
 
   ecc_add_jja,
   ecc_add_jjj,
diff --git a/ecc-secp256r1.c b/ecc-secp256r1.c
index 4848dfe3..8c852aaa 100644
--- a/ecc-secp256r1.c
+++ b/ecc-secp256r1.c
@@ -213,64 +213,6 @@ ecc_secp256r1_modq (const struct ecc_modulo *q, mp_limb_t *rp, mp_limb_t *xp)
 #error Unsupported parameters
 #endif
 
-#define ECC_SECP256R1_INV_ITCH (4*ECC_LIMB_SIZE)
-
-static void
-ecc_secp256r1_inv (const struct ecc_modulo *p,
-		   mp_limb_t *rp, const mp_limb_t *ap,
-		   mp_limb_t *scratch)
-{
-#define a5m1 scratch
-#define t0 (scratch + ECC_LIMB_SIZE)
-#define a15m1 t0
-#define a32m1 a5m1
-#define tp (scratch + 2*ECC_LIMB_SIZE)
-/*
-   Addition chain for p - 2 = 2^{256} - 2^{224} + 2^{192} + 2^{96} - 3
-
-    2^5 - 1 = 1 + 2 (2^4 - 1) = 1 + 2 (2^2+1)(2 + 1)    4 S + 3 M
-    2^{15} - 1 = (2^5 - 1) (1 + 2^5 (1 + 2^5)          10 S + 2 M
-    2^{16} - 1 = 1 + 2 (2^{15} - 1)                       S +   M
-    2^{32} - 1 = (2^{16} + 1) (2^{16} - 1)             16 S +   M
-    2^{64} - 2^{32} + 1 = 2^{32} (2^{32} - 1) + 1      32 S +   M
-    2^{192} - 2^{160} + 2^{128} + 2^{32} - 1
-        = 2^{128} (2^{64} - 2^{32} + 1) + 2^{32} - 1  128 S +   M
-    2^{224} - 2^{192} + 2^{160} + 2^{64} - 1
-        = 2^{32} (...) + 2^{32} - 1                    32 S +   M
-    2^{239} - 2^{207} + 2^{175} + 2^{79} - 1
-        = 2^{15} (...) + 2^{15} - 1                    15 S +   M
-    2^{254} - 2^{222} + 2^{190} + 2^{94} - 1
-        = 2^{15} (...) + 2^{15} - 1                    15 S +   M
-    p - 2 = 2^2 (...) + 1                               2 S     M
-                                                   ---------------
-						      255 S + 13 M
- */
-  ecc_mod_sqr (p, rp, ap, tp);			/* a^2 */
-  ecc_mod_mul (p, rp, rp, ap, tp);		/* a^3 */
-  ecc_mod_pow_2kp1 (p, t0, rp, 2, tp);		/* a^{2^4 - 1} */
-  ecc_mod_sqr (p, rp, t0, tp);			/* a^{2^5 - 2} */
-  ecc_mod_mul (p, a5m1, rp, ap, tp);		/* a^{2^5 - 1}, a5m1 */
-
-  ecc_mod_pow_2kp1 (p, rp, a5m1, 5, tp);	/* a^{2^{10} - 1, a5m1*/
-  ecc_mod_pow_2k_mul (p, a15m1, rp, 5, a5m1, tp); /* a^{2^{15} - 1}, a5m1 a15m1 */
-  ecc_mod_sqr (p, rp, a15m1, tp);		/* a^{2^{16} - 2}, a15m1 */
-  ecc_mod_mul (p, rp, rp, ap, tp);		/* a^{2^{16} - 1}, a15m1 */
-  ecc_mod_pow_2kp1 (p, a32m1, rp, 16, tp);	/* a^{2^{32} - 1}, a15m1, a32m1 */
-
-  ecc_mod_pow_2k_mul (p, rp, a32m1, 32, ap, tp);/* a^{2^{64} - 2^{32} + 1 */
-  ecc_mod_pow_2k_mul (p, rp, rp, 128, a32m1, tp); /* a^{2^{192} - 2^{160} + 2^{128} + 2^{32} - 1} */
-  ecc_mod_pow_2k_mul (p, rp, rp, 32, a32m1, tp);/* a^{2^{224} - 2^{192} + 2^{160} + 2^{64} - 1} */
-  ecc_mod_pow_2k_mul (p, rp, rp, 15, a15m1, tp);/* a^{2^{239} - 2^{207} + 2^{175} + 2^{79} - 1} */
-  ecc_mod_pow_2k_mul (p, rp, rp, 15, a15m1, tp);/* a^{2^{254} - 2^{222} + 2^{190} + 2^{94} - 1} */
-  ecc_mod_pow_2k_mul (p, rp, rp, 2, ap, tp); 	/* a^{2^{256} - 2^{224} + 2^{192} + 2^{96} - 3} */
-
-#undef a5m1
-#undef t0
-#undef a15m1
-#undef a32m1
-#undef tp
-}
-
 /* To guarantee that inputs to ecc_mod_zero_p are in the required range. */
 #if ECC_LIMB_SIZE * GMP_NUMB_BITS != 256
 #error Unsupported limb size
@@ -336,20 +278,22 @@ const struct ecc_curve _nettle_secp_256r1 =
     ECC_LIMB_SIZE,
     ECC_BMODP_SIZE,
     ECC_REDC_SIZE,
-    ECC_SECP256R1_INV_ITCH,
+    ECC_MOD_INV_ITCH (ECC_LIMB_SIZE),
     ECC_SECP256R1_SQRT_ITCH,
     0,
+    ECC_INVP_COUNT,
+    ECC_BINVP,
 
     ecc_p,
     ecc_Bmodp,
     ecc_Bmodp_shifted,
     ecc_Bm2p,
     ecc_redc_ppm1,
-    ecc_pp1h,
+    USE_REDC ? ecc_invp_redc_power : NULL,
 
     ecc_secp256r1_modp,
     USE_REDC ? ecc_secp256r1_redc : ecc_secp256r1_modp,
-    ecc_secp256r1_inv,
+    ecc_mod_inv,
     ecc_secp256r1_sqrt,
     NULL,
   },
@@ -361,14 +305,15 @@ const struct ecc_curve _nettle_secp_256r1 =
     ECC_MOD_INV_ITCH (ECC_LIMB_SIZE),
     0,
     0,
+    ECC_INVQ_COUNT,
+    ECC_BINVQ,
 
     ecc_q,
     ecc_Bmodq,
     ecc_Bmodq_shifted,
     ecc_Bm2q,
     NULL,
-    ecc_qp1h,
-
+    NULL,
     ecc_secp256r1_modq,
     ecc_secp256r1_modq,
     ecc_mod_inv,
@@ -385,7 +330,7 @@ const struct ecc_curve _nettle_secp_256r1 =
   ECC_DUP_JJ_ITCH (ECC_LIMB_SIZE),
   ECC_MUL_A_ITCH (ECC_LIMB_SIZE),
   ECC_MUL_G_ITCH (ECC_LIMB_SIZE),
-  ECC_J_TO_A_ITCH(ECC_LIMB_SIZE, ECC_SECP256R1_INV_ITCH),
+  ECC_J_TO_A_ITCH(ECC_LIMB_SIZE, ECC_MOD_INV_ITCH (ECC_LIMB_SIZE)),
 
   ecc_add_jja,
   ecc_add_jjj,
diff --git a/ecc-secp384r1.c b/ecc-secp384r1.c
index abac5e6d..9c015297 100644
--- a/ecc-secp384r1.c
+++ b/ecc-secp384r1.c
@@ -210,45 +210,6 @@ ecc_mod_pow_288m32m1 (const struct ecc_modulo *m,
 #undef tp
 }
 
-#define ECC_SECP384R1_INV_ITCH (6*ECC_LIMB_SIZE)
-
-static void
-ecc_secp384r1_inv (const struct ecc_modulo *p,
-		   mp_limb_t *rp, const mp_limb_t *ap,
-		   mp_limb_t *scratch)
-{
-  /*
-    Addition chain for
-
-    p - 2 = 2^{384} - 2^{128} - 2^{96} + 2^{32} - 3
-
-    Start with
-
-      a^{2^{288} - 2^{32} - 1}
-
-    and then use
-
-      2^{382} - 2^{126} - 2^{94} + 2^{30} - 1
-         = 2^{94} (2^{288} - 2^{32} - 1) + 2^{30} - 1
-
-      2^{384} - 2^{128} - 2^{96} + 2^{32} - 3
-         = 2^2 (2^{382} - 2^{126} - 2^{94} + 2^{30} - 1) + 1
-
-    This addition chain needs 96 additional squarings and 2
-    multiplies, for a total of 383 squarings and 14 multiplies.
-  */
-
-#define a30m1 scratch
-#define tp (scratch + ECC_LIMB_SIZE)
-
-  ecc_mod_pow_288m32m1 (p, rp, a30m1, ap, tp);
-  ecc_mod_pow_2k_mul (p, rp, rp, 94, a30m1, tp); /* a^{2^{382} - 2^{126} - 2^{94} + 2^{30} - 1 */
-  ecc_mod_pow_2k_mul (p, rp, rp, 2, ap, tp);
-
-#undef a30m1
-#undef tp
-}
-
 /* To guarantee that inputs to ecc_mod_zero_p are in the required range. */
 #if ECC_LIMB_SIZE * GMP_NUMB_BITS != 384
 #error Unsupported limb size
@@ -307,20 +268,22 @@ const struct ecc_curve _nettle_secp_384r1 =
     ECC_LIMB_SIZE,    
     ECC_BMODP_SIZE,
     ECC_REDC_SIZE,
-    ECC_SECP384R1_INV_ITCH,
+    ECC_MOD_INV_ITCH(ECC_LIMB_SIZE),
     ECC_SECP384R1_SQRT_ITCH,
     0,
+    ECC_INVP_COUNT,
+    ECC_BINVP,
 
     ecc_p,
     ecc_Bmodp,
     ecc_Bmodp_shifted,
     ecc_Bm2p,
     ecc_redc_ppm1,
-    ecc_pp1h,
+    NULL,
 
     ecc_secp384r1_modp,
     ecc_secp384r1_modp,
-    ecc_secp384r1_inv,
+    ecc_mod_inv,
     ecc_secp384r1_sqrt,
     NULL,
   },
@@ -332,13 +295,15 @@ const struct ecc_curve _nettle_secp_384r1 =
     ECC_MOD_INV_ITCH (ECC_LIMB_SIZE),
     0,
     0,
+    ECC_INVQ_COUNT,
+    ECC_BINVQ,
 
     ecc_q,
     ecc_Bmodq,
     ecc_Bmodq_shifted,
     ecc_Bm2q,
     NULL,
-    ecc_qp1h,
+    NULL,
 
     ecc_mod,
     ecc_mod,
@@ -356,7 +321,7 @@ const struct ecc_curve _nettle_secp_384r1 =
   ECC_DUP_JJ_ITCH (ECC_LIMB_SIZE),
   ECC_MUL_A_ITCH (ECC_LIMB_SIZE),
   ECC_MUL_G_ITCH (ECC_LIMB_SIZE),
-  ECC_J_TO_A_ITCH(ECC_LIMB_SIZE, ECC_SECP384R1_INV_ITCH),
+  ECC_J_TO_A_ITCH(ECC_LIMB_SIZE, ECC_MOD_INV_ITCH (ECC_LIMB_SIZE)),
 
   ecc_add_jja,
   ecc_add_jjj,
diff --git a/ecc-secp521r1.c b/ecc-secp521r1.c
index 8ab7b4bf..c47b7b81 100644
--- a/ecc-secp521r1.c
+++ b/ecc-secp521r1.c
@@ -75,53 +75,6 @@ ecc_secp521r1_modp (const struct ecc_modulo *m UNUSED, mp_limb_t *rp, mp_limb_t
 }
 #endif
 
-#define ECC_SECP521R1_INV_ITCH (3*ECC_LIMB_SIZE)
-
-static void
-ecc_secp521r1_inv (const struct ecc_modulo *p,
-		   mp_limb_t *rp, const mp_limb_t *ap,
-		   mp_limb_t *scratch)
-{
-#define t0 scratch
-#define tp (scratch + ECC_LIMB_SIZE)
-
-  /* Addition chain for p - 2:
-
-     2^{521} - 3
-     = 1 + 2^2(2^519 - 1)
-     = 1 + 2^2(1 + 2 (2^518 - 1)
-     = 1 + 2^2(1 + 2 (2^259 + 1) (1 + 2(2^258 - 1)))
-     = 1 + 2^2(1 + 2 (2^259 + 1) (1 + 2(2^129 + 1) (2^129 - 1)))
-     = 1 + 2^2(1 + 2 (2^259 + 1) (1 + 2(2^129 + 1) (1 + 2 (2^128 - 1))))
-
-     where
-
-     2^{128} - 1 = (2^64 + 1) (2^32+1) (2^16 + 1) (2^8 + 1) (2^4 + 1) (2^2 + 1) (2 + 1)
-
-     This addition chain needs 520 squarings and 13 multiplies.
-  */
-
-  ecc_mod_sqr (p, rp, ap, tp);	        /* a^2 */
-  ecc_mod_mul (p, rp, ap, rp, tp);	/* a^3 = a^{2^2 - 1} */
-  ecc_mod_pow_2kp1 (p, t0, rp, 2, tp);	/* a^15 = a^{2^4 - 1} */
-  ecc_mod_pow_2kp1 (p, rp, t0, 4, tp);	/* a^{2^8 - 1} */
-  ecc_mod_pow_2kp1 (p, t0, rp, 8, tp);	/* a^{2^16 - 1} */
-  ecc_mod_pow_2kp1 (p, rp, t0, 16, tp);	/* a^{2^32 - 1} */
-  ecc_mod_pow_2kp1 (p, t0, rp, 32, tp);	/* a^{2^64 - 1} */
-  ecc_mod_pow_2kp1 (p, rp, t0, 64, tp);	/* a^{2^128 - 1} */
-  ecc_mod_sqr (p, rp, rp, tp);		/* a^{2^129 - 2} */
-  ecc_mod_mul (p, rp, rp, ap, tp);	/* a^{2^129 - 1} */
-  ecc_mod_pow_2kp1 (p, t0, rp, 129, tp);/* a^{2^258 - 1} */
-  ecc_mod_sqr (p, rp, t0, tp);		/* a^{2^259 - 2} */
-  ecc_mod_mul (p, rp, rp, ap, tp);	/* a^{2^259 - 1} */
-  ecc_mod_pow_2kp1 (p, t0, rp, 259, tp);/* a^{2^518 - 1} */
-  ecc_mod_sqr (p, rp, t0, tp);		/* a^{2^519 - 2} */
-  ecc_mod_mul (p, rp, rp, ap, tp);	/* a^{2^519 - 1} */
-  ecc_mod_sqr (p, rp, rp, tp);		/* a^{2^520 - 2} */
-  ecc_mod_sqr (p, rp, rp, tp);		/* a^{2^521 - 4} */
-  ecc_mod_mul (p, rp, rp, ap, tp);	/* a^{2^521 - 3} */
-}
-
 #define ECC_SECP521R1_SQRT_ITCH (2*ECC_LIMB_SIZE)
 
 static int
@@ -162,20 +115,22 @@ const struct ecc_curve _nettle_secp_521r1 =
     ECC_LIMB_SIZE,    
     ECC_BMODP_SIZE,
     ECC_REDC_SIZE,
-    ECC_SECP521R1_INV_ITCH,
+    ECC_MOD_INV_ITCH(ECC_LIMB_SIZE),
     ECC_SECP521R1_SQRT_ITCH,
     0,
+    ECC_INVP_COUNT,
+    ECC_BINVP,
 
     ecc_p,
     ecc_Bmodp,
     ecc_Bmodp_shifted,
     ecc_Bm2p,
     ecc_redc_ppm1,
-    ecc_pp1h,
+    NULL,
 
     ecc_secp521r1_modp,
     ecc_secp521r1_modp,
-    ecc_secp521r1_inv,
+    ecc_mod_inv,
     ecc_secp521r1_sqrt,
     NULL,
   },
@@ -187,13 +142,15 @@ const struct ecc_curve _nettle_secp_521r1 =
     ECC_MOD_INV_ITCH (ECC_LIMB_SIZE),
     0,
     0,
+    ECC_INVQ_COUNT,
+    ECC_BINVQ,
 
     ecc_q,
     ecc_Bmodq,
     ecc_Bmodq_shifted,
     ecc_Bm2q,
     NULL,
-    ecc_qp1h,
+    NULL,
 
     ecc_mod,
     ecc_mod,
@@ -211,8 +168,11 @@ const struct ecc_curve _nettle_secp_521r1 =
   ECC_DUP_JJ_ITCH (ECC_LIMB_SIZE),
   ECC_MUL_A_ITCH (ECC_LIMB_SIZE),
   ECC_MUL_G_ITCH (ECC_LIMB_SIZE),
+#if 0
   ECC_J_TO_A_ITCH(ECC_LIMB_SIZE, ECC_SECP521R1_INV_ITCH),
-
+#else
+  ECC_J_TO_A_ITCH(ECC_LIMB_SIZE, ECC_MOD_INV_ITCH(ECC_LIMB_SIZE)),
+#endif
   ecc_add_jja,
   ecc_add_jjj,
   ecc_dup_jj,
diff --git a/eccdata.c b/eccdata.c
index e0726e8d..1d2e1993 100644
--- a/eccdata.c
+++ b/eccdata.c
@@ -1167,11 +1167,19 @@ string_toupper (char *buf, size_t size, const char *s)
   abort();
 }
 
+static unsigned
+invert_count (unsigned bit_size) {
+  /* See https://eprint.iacr.org/2019/266.pdf, Theorem 11.2 */
+  return (49 * bit_size + 57) / 17;
+}
+
 static void
 output_modulo (const char *name, const mpz_t x,
 	       unsigned size, unsigned bits_per_limb)
 {
   unsigned bit_size;
+  unsigned count;
+  unsigned invert_iterations;
   int shift;
   char buf[20];
   mpz_t t;
@@ -1193,9 +1201,29 @@ output_modulo (const char *name, const mpz_t x,
 		      / bits_per_limb));
 
   bit_size = mpz_sizeinbase (x, 2);
-
   shift = size * bits_per_limb - bit_size;
   assert (shift >= 0);
+
+  /* If most significant bit of top limb is zero, input may be one bit
+     larger than the modulo. */
+  count = invert_count (bit_size + (shift > 0));
+  printf("#define ECC_INV%s_COUNT %u\n", buf, count);
+
+  mpz_set_ui (t, 0);
+  mpz_setbit (t, bits_per_limb);
+  mpz_invert (t, x, t);
+
+  printf ("#define ECC_BINV%s 0x", buf);
+  mpz_out_str (stdout, 16, t);
+  printf("%s\n", bits_per_limb > 32 ? "ULL" : "UL");
+
+  invert_iterations = (count + bits_per_limb - 3) / (bits_per_limb - 2);
+  mpz_set_ui (t, 2);
+  mpz_powm_ui (t, t, bits_per_limb * (invert_iterations + size) - count, x);
+  mpz_invert (t, t, x);
+  snprintf (buf, sizeof(buf), "ecc_inv%s_redc_power", name);
+  output_bignum (buf, t, size, bits_per_limb);
+
   if (shift > 0)
     {
       mpz_set_ui (t, 0);
@@ -1292,14 +1320,6 @@ output_curve (const struct ecc_curve *ecc, unsigned bits_per_limb)
 
   output_bignum ("ecc_b", ecc->b, limb_size, bits_per_limb);
 
-  mpz_add_ui (t, ecc->p, 1);
-  mpz_fdiv_q_2exp (t, t, 1);
-  output_bignum ("ecc_pp1h", t, limb_size, bits_per_limb);      
-
-  mpz_add_ui (t, ecc->q, 1);
-  mpz_fdiv_q_2exp (t, t, 1);
-  output_bignum ("ecc_qp1h", t, limb_size, bits_per_limb);  
-
   /* Trailing zeros in p+1 correspond to trailing ones in p. */
   redc_limbs = mpz_scan0 (ecc->p, 0) / bits_per_limb;
   if (redc_limbs > 0)
diff --git a/testsuite/ecc-modinv-test.c b/testsuite/ecc-modinv-test.c
index aecfb43b..147896a6 100644
--- a/testsuite/ecc-modinv-test.c
+++ b/testsuite/ecc-modinv-test.c
@@ -51,9 +51,12 @@ test_modulo (gmp_randstate_t rands, const char *name,
   mp_limb_t *scratch;
   unsigned j;
   mpz_t r;
+  mp_bitcnt_t bits;
 
   mpz_init (r);
 
+  bits = m->bit_size + (m->size * GMP_NUMB_BITS > m->bit_size);
+
   a = xalloc_limbs (m->size);
   ai = xalloc_limbs (m->size);
   ref = xalloc_limbs (m->size);;
@@ -90,15 +93,17 @@ test_modulo (gmp_randstate_t rands, const char *name,
       fprintf (stderr, " (bad)\n");
       abort ();
     }
-	
+
   for (j = 0; j < COUNT; j++)
     {
       if (j & 1)
-	mpz_rrandomb (r, rands, m->size * GMP_NUMB_BITS);
+	mpz_rrandomb (r, rands, bits);
       else
-	mpz_urandomb (r, rands, m->size * GMP_NUMB_BITS);
+	mpz_urandomb (r, rands, bits);
 
       mpz_limbs_copy (a, r, m->size);
+      if ((a[m->size - 1] >> 1) > m->m[m->size - 1])
+	a[m->size - 1] = 2*m->m[m->size - 1] + 1;
 
       if (!ref_modinv (ref, a, m->m, m->size, use_redc))
 	{
@@ -113,7 +118,7 @@ test_modulo (gmp_randstate_t rands, const char *name,
 	  fprintf (stderr, "%s->invert failed (test %u, bit size %u):\n",
 		   name, j, m->bit_size);
 	  fprintf (stderr, "a = ");
-	  mpz_out_str (stderr, 16, r);
+	  mpn_out_str (stderr, 16, a, m->size);
 	  fprintf (stderr, "\np = ");
 	  mpn_out_str (stderr, 16, m->m, m->size);
 	  fprintf (stderr, "\nt = ");
author	Niels Möller <nisse@lysator.liu.se>	2022-10-07 20:37:58 +0200
committer	Niels Möller <nisse@lysator.liu.se>	2022-11-28 19:30:04 +0100
commit	39ed186b89bae0db3fe8a443b09ef8b493c483b3 (patch)
tree	42b537c84f52f7823826779ccc22f1aa659e4bbe
parent	3fbce8821d1d6c1c7a2a25c8e5ad9b186125bcdb (diff)
download	nettle-39ed186b89bae0db3fe8a443b09ef8b493c483b3.tar.gz