diff options
-rw-r--r-- | ChangeLog | 12 | ||||
-rw-r--r-- | Makefile.in | 2 | ||||
-rw-r--r-- | curve25519-mul.c | 111 | ||||
-rw-r--r-- | curve448-mul.c | 111 | ||||
-rw-r--r-- | ecc-internal.h | 9 | ||||
-rw-r--r-- | ecc-mul-m.c | 135 |
6 files changed, 182 insertions, 198 deletions
@@ -1,3 +1,15 @@ +2019-12-14 Niels Möller <nisse@lysator.liu.se> + + * ecc-mul-m.c (ecc_mul_m): New file and function. Implements + multiplication for curves in Montgomery representation, as used for + curve25519 and curve448. Extracted from curve25519_mul. + * ecc-internal.h (ecc_mul_m): Declare. + (ECC_MUL_M_ITCH): New macro. + * Makefile.in (hogweed_SOURCES): Add ecc-mul-m.c. + + * curve25519-mul.c (curve25519_mul): Use ecc_mul_m. + * curve448-mul.c (curve448_mul): Likewise. + 2019-12-13 Niels Möller <nisse@lysator.liu.se> * Merge curve448 implementation. diff --git a/Makefile.in b/Makefile.in index 036a3a1d..8d06149f 100644 --- a/Makefile.in +++ b/Makefile.in @@ -180,7 +180,7 @@ hogweed_SOURCES = sexp.c sexp-format.c \ ecc-dup-jj.c ecc-add-jja.c ecc-add-jjj.c \ ecc-eh-to-a.c \ ecc-dup-eh.c ecc-add-eh.c ecc-add-ehh.c \ - ecc-mul-g-eh.c ecc-mul-a-eh.c \ + ecc-mul-g-eh.c ecc-mul-a-eh.c ecc-mul-m.c \ ecc-mul-g.c ecc-mul-a.c ecc-hash.c ecc-random.c \ ecc-point.c ecc-scalar.c ecc-point-mul.c ecc-point-mul-g.c \ ecc-ecdsa-sign.c ecdsa-sign.c \ diff --git a/curve25519-mul.c b/curve25519-mul.c index ba76bc0b..aff2b293 100644 --- a/curve25519-mul.c +++ b/curve25519-mul.c @@ -44,104 +44,21 @@ void curve25519_mul (uint8_t *q, const uint8_t *n, const uint8_t *p) { - const struct ecc_curve *ecc = &_nettle_curve25519; + const struct ecc_modulo *m = &_nettle_curve25519.p; mp_size_t itch; - mp_limb_t *scratch; - int i; - mp_limb_t cy; - - /* FIXME: Could save some more scratch space, e.g., by letting BB - overlap C, D, and CB overlap A, D. And possibly reusing some of - x2, z2, x3, z3. 
*/ -#define x1 scratch -#define x2 (scratch + ecc->p.size) -#define z2 (scratch + 2*ecc->p.size) -#define x3 (scratch + 3*ecc->p.size) -#define z3 (scratch + 4*ecc->p.size) - -#define A (scratch + 5*ecc->p.size) -#define B (scratch + 6*ecc->p.size) -#define C (scratch + 7*ecc->p.size) -#define D (scratch + 8*ecc->p.size) -#define AA (scratch + 9*ecc->p.size) -#define BB (scratch +10*ecc->p.size) -#define E (scratch + 10*ecc->p.size) /* Overlap BB */ -#define DA (scratch + 9*ecc->p.size) /* Overlap AA */ -#define CB (scratch + 10*ecc->p.size) /* Overlap BB */ - - itch = ecc->p.size * 12; - scratch = gmp_alloc_limbs (itch); - - /* Note that 255 % GMP_NUMB_BITS == 0 isn't supported, so x1 always + mp_limb_t *x; + + itch = m->size + ECC_MUL_M_ITCH(m->size); + x = gmp_alloc_limbs (itch); + + /* Note that 255 % GMP_NUMB_BITS == 0 isn't supported, so x always holds at least 256 bits. */ - mpn_set_base256_le (x1, ecc->p.size, p, CURVE25519_SIZE); + mpn_set_base256_le (x, m->size, p, CURVE25519_SIZE); /* Clear bit 255, as required by RFC 7748. */ - x1[255/GMP_NUMB_BITS] &= ~((mp_limb_t) 1 << (255 % GMP_NUMB_BITS)); - - /* Initialize, x2 = x1, z2 = 1 */ - mpn_copyi (x2, x1, ecc->p.size); - z2[0] = 1; - mpn_zero (z2+1, ecc->p.size - 1); - - /* Get x3, z3 from doubling. Since bit 254 is forced to 1. */ - ecc_modp_add (ecc, A, x2, z2); - ecc_modp_sub (ecc, B, x2, z2); - ecc_modp_sqr (ecc, AA, A); - ecc_modp_sqr (ecc, BB, B); - ecc_modp_mul (ecc, x3, AA, BB); - ecc_modp_sub (ecc, E, AA, BB); - ecc_modp_addmul_1 (ecc, AA, E, 121665); - ecc_modp_mul (ecc, z3, E, AA); - - for (i = 253; i >= 3; i--) - { - int bit = (n[i/8] >> (i & 7)) & 1; - - cnd_swap (bit, x2, x3, 2*ecc->p.size); - - /* Formulas from draft-turner-thecurve25519function-00-Mont. We - compute new coordinates in memory-address order, since mul - and sqr clobbers higher limbs. 
*/ - ecc_modp_add (ecc, A, x2, z2); - ecc_modp_sub (ecc, B, x2, z2); - ecc_modp_sqr (ecc, AA, A); - ecc_modp_sqr (ecc, BB, B); - ecc_modp_mul (ecc, x2, AA, BB); /* Last use of BB */ - ecc_modp_sub (ecc, E, AA, BB); - ecc_modp_addmul_1 (ecc, AA, E, 121665); - ecc_modp_add (ecc, C, x3, z3); - ecc_modp_sub (ecc, D, x3, z3); - ecc_modp_mul (ecc, z2, E, AA); /* Last use of E and AA */ - ecc_modp_mul (ecc, DA, D, A); /* Last use of D, A. FIXME: could - let CB overlap. */ - ecc_modp_mul (ecc, CB, C, B); - - ecc_modp_add (ecc, C, DA, CB); - ecc_modp_sqr (ecc, x3, C); - ecc_modp_sub (ecc, C, DA, CB); - ecc_modp_sqr (ecc, DA, C); - ecc_modp_mul (ecc, z3, DA, x1); - - /* FIXME: Could be combined with the loop's initial cnd_swap. */ - cnd_swap (bit, x2, x3, 2*ecc->p.size); - } - /* Do the 3 low zero bits, just duplicating x2 */ - for ( ; i >= 0; i--) - { - ecc_modp_add (ecc, A, x2, z2); - ecc_modp_sub (ecc, B, x2, z2); - ecc_modp_sqr (ecc, AA, A); - ecc_modp_sqr (ecc, BB, B); - ecc_modp_mul (ecc, x2, AA, BB); - ecc_modp_sub (ecc, E, AA, BB); - ecc_modp_addmul_1 (ecc, AA, E, 121665); - ecc_modp_mul (ecc, z2, E, AA); - } - ecc->p.invert (&ecc->p, x3, z2, z3 + ecc->p.size); - ecc_modp_mul (ecc, z3, x2, x3); - cy = mpn_sub_n (x2, z3, ecc->p.m, ecc->p.size); - cnd_copy (cy, x2, z3, ecc->p.size); - mpn_get_base256_le (q, CURVE25519_SIZE, x2, ecc->p.size); - - gmp_free_limbs (scratch, itch); + x[255/GMP_NUMB_BITS] &= ~((mp_limb_t) 1 << (255 % GMP_NUMB_BITS)); + + ecc_mul_m (m, 121665, 3, 253, x, n, x, x + m->size); + mpn_get_base256_le (q, CURVE25519_SIZE, x, m->size); + + gmp_free_limbs (x, itch); } diff --git a/curve448-mul.c b/curve448-mul.c index 59cf7664..2090c8a5 100644 --- a/curve448-mul.c +++ b/curve448-mul.c @@ -46,105 +46,16 @@ void curve448_mul (uint8_t *q, const uint8_t *n, const uint8_t *p) { - const struct ecc_curve *ecc = &_nettle_curve448; + const struct ecc_modulo *m = &_nettle_curve448.p; mp_size_t itch; - mp_limb_t *scratch; - int i; - mp_limb_t cy; - - /* FIXME: 
Could save some more scratch space, e.g., by letting BB - overlap C, D, and CB overlap A, D. And possibly reusing some of - x2, z2, x3, z3. */ -#define x1 scratch -#define x2 (scratch + ecc->p.size) -#define z2 (scratch + 2*ecc->p.size) -#define x3 (scratch + 3*ecc->p.size) -#define z3 (scratch + 4*ecc->p.size) - -#define A (scratch + 5*ecc->p.size) -#define B (scratch + 6*ecc->p.size) -#define C (scratch + 7*ecc->p.size) -#define D (scratch + 8*ecc->p.size) -#define AA (scratch + 9*ecc->p.size) -#define BB (scratch + 10*ecc->p.size) -#define E (scratch + 10*ecc->p.size) /* Overlap BB */ -#define DA (scratch + 9*ecc->p.size) /* Overlap AA */ -#define CB (scratch + 10*ecc->p.size) /* Overlap BB */ - -#define a24 39081 - - itch = ecc->p.size * 12; - assert (ecc->p.invert_itch + 5*ecc->p.size <= itch); - scratch = gmp_alloc_limbs (itch); - - /* Note that 255 % GMP_NUMB_BITS == 0 isn't supported, so x1 always - holds at least 256 bits. */ - mpn_set_base256_le (x1, ecc->p.size, p, CURVE448_SIZE); - - /* Initialize, x2 = x1, z2 = 1 */ - mpn_copyi (x2, x1, ecc->p.size); - z2[0] = 1; - mpn_zero (z2+1, ecc->p.size - 1); - - /* Get x3, z3 from doubling. Since bit 447 is forced to 1. */ - ecc_modp_add (ecc, A, x2, z2); - ecc_modp_sub (ecc, B, x2, z2); - ecc_modp_sqr (ecc, AA, A); - ecc_modp_sqr (ecc, BB, B); - ecc_modp_mul (ecc, x3, AA, BB); - ecc_modp_sub (ecc, E, AA, BB); - ecc_modp_addmul_1 (ecc, AA, E, a24); - ecc_modp_mul (ecc, z3, E, AA); - - for (i = 446; i >= 2; i--) - { - int bit = (n[i/8] >> (i & 7)) & 1; - - cnd_swap (bit, x2, x3, 2*ecc->p.size); - - /* Formulas from RFC 7748. We compute new coordinates in - memory-address order, since mul and sqr clobbers higher - limbs. 
*/ - ecc_modp_add (ecc, A, x2, z2); - ecc_modp_sub (ecc, B, x2, z2); - ecc_modp_sqr (ecc, AA, A); - ecc_modp_sqr (ecc, BB, B); - ecc_modp_mul (ecc, x2, AA, BB); - ecc_modp_sub (ecc, E, AA, BB); /* Last use of BB */ - ecc_modp_addmul_1 (ecc, AA, E, a24); - ecc_modp_add (ecc, C, x3, z3); - ecc_modp_sub (ecc, D, x3, z3); - ecc_modp_mul (ecc, z2, E, AA); /* Last use of E and AA */ - ecc_modp_mul (ecc, DA, D, A); /* Last use of D, A. FIXME: could - let CB overlap. */ - ecc_modp_mul (ecc, CB, C, B); - - ecc_modp_add (ecc, C, DA, CB); - ecc_modp_sqr (ecc, x3, C); - ecc_modp_sub (ecc, C, DA, CB); - ecc_modp_sqr (ecc, DA, C); - ecc_modp_mul (ecc, z3, DA, x1); - - /* FIXME: Could be combined with the loop's initial cnd_swap. */ - cnd_swap (bit, x2, x3, 2*ecc->p.size); - } - /* Do the 2 low zero bits, just duplicating x2 */ - for ( ; i >= 0; i--) - { - ecc_modp_add (ecc, A, x2, z2); - ecc_modp_sub (ecc, B, x2, z2); - ecc_modp_sqr (ecc, AA, A); - ecc_modp_sqr (ecc, BB, B); - ecc_modp_mul (ecc, x2, AA, BB); - ecc_modp_sub (ecc, E, AA, BB); - ecc_modp_addmul_1 (ecc, AA, E, a24); - ecc_modp_mul (ecc, z2, E, AA); - } - ecc->p.invert (&ecc->p, x3, z2, z3 + ecc->p.size); - ecc_modp_mul (ecc, z3, x2, x3); - cy = mpn_sub_n (x2, z3, ecc->p.m, ecc->p.size); - cnd_copy (cy, x2, z3, ecc->p.size); - mpn_get_base256_le (q, CURVE448_SIZE, x2, ecc->p.size); - - gmp_free_limbs (scratch, itch); + mp_limb_t *x; + + itch = m->size + ECC_MUL_M_ITCH(m->size); + x = gmp_alloc_limbs (itch); + + mpn_set_base256_le (x, m->size, p, CURVE448_SIZE); + ecc_mul_m (m, 39081, 2, 446, x, n, x, x + m->size); + mpn_get_base256_le (q, CURVE448_SIZE, x, m->size); + + gmp_free_limbs (x, itch); } diff --git a/ecc-internal.h b/ecc-internal.h index a7c7fa15..cd1a1573 100644 --- a/ecc-internal.h +++ b/ecc-internal.h @@ -69,6 +69,7 @@ #define ecc_mul_a _nettle_ecc_mul_a #define ecc_mul_g_eh _nettle_ecc_mul_g_eh #define ecc_mul_a_eh _nettle_ecc_mul_a_eh +#define ecc_mul_m _nettle_ecc_mul_m #define cnd_copy 
_nettle_cnd_copy #define sec_add_1 _nettle_sec_add_1 #define sec_sub_1 _nettle_sec_sub_1 @@ -394,6 +395,13 @@ ecc_mul_a_eh (const struct ecc_curve *ecc, mp_limb_t *scratch); void +ecc_mul_m (const struct ecc_modulo *m, + mp_limb_t a24, + unsigned bit_low, unsigned bit_high, + mp_limb_t *qx, const uint8_t *n, const mp_limb_t *px, + mp_limb_t *scratch); + +void cnd_copy (int cnd, mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n); mp_limb_t @@ -439,6 +447,7 @@ curve448_eh_to_x (mp_limb_t *xp, const mp_limb_t *p, #define ECC_MUL_A_EH_ITCH(size) \ (((3 << ECC_MUL_A_EH_WBITS) + 10) * (size)) #endif +#define ECC_MUL_M_ITCH(size) (11*(size)) #define ECC_ECDSA_SIGN_ITCH(size) (12*(size)) #define ECC_MOD_RANDOM_ITCH(size) (size) #define ECC_HASH_ITCH(size) (1+(size)) diff --git a/ecc-mul-m.c b/ecc-mul-m.c new file mode 100644 index 00000000..68bdd16e --- /dev/null +++ b/ecc-mul-m.c @@ -0,0 +1,135 @@ +/* ecc-mul-m.c + + Point multiplication using Montgomery curve representation. + + Copyright (C) 2014 Niels Möller + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+*/ + +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#include <assert.h> + +#include "ecc.h" +#include "ecc-internal.h" + +void +ecc_mul_m (const struct ecc_modulo *m, + mp_limb_t a24, + unsigned bit_low, unsigned bit_high, + mp_limb_t *qx, const uint8_t *n, const mp_limb_t *px, + mp_limb_t *scratch) +{ + unsigned i; + mp_limb_t cy; + + /* FIXME: Could save some more scratch space, e.g., by letting BB + overlap C, D, and CB overlap A, D. And possibly reusing some of + x2, z2, x3, z3. */ +#define x2 (scratch) +#define z2 (scratch + m->size) +#define x3 (scratch + 2*m->size) +#define z3 (scratch + 3*m->size) + +#define A (scratch + 4*m->size) +#define B (scratch + 5*m->size) +#define C (scratch + 6*m->size) +#define D (scratch + 7*m->size) +#define AA (scratch + 8*m->size) +#define BB (scratch +9*m->size) +#define E (scratch + 9*m->size) /* Overlap BB */ +#define DA (scratch + 8*m->size) /* Overlap AA */ +#define CB (scratch + 9*m->size) /* Overlap BB */ + + /* Initialize, x2 = px, z2 = 1 */ + mpn_copyi (x2, px, m->size); + z2[0] = 1; + mpn_zero (z2+1, m->size - 1); + + /* Get x3, z3 from doubling. Since most significant bit is forced to 1. */ + ecc_mod_add (m, A, x2, z2); + ecc_mod_sub (m, B, x2, z2); + ecc_mod_sqr (m, AA, A); + ecc_mod_sqr (m, BB, B); + ecc_mod_mul (m, x3, AA, BB); + ecc_mod_sub (m, E, AA, BB); + ecc_mod_addmul_1 (m, AA, E, a24); + ecc_mod_mul (m, z3, E, AA); + + for (i = bit_high; i >= bit_low; i--) + { + int bit = (n[i/8] >> (i & 7)) & 1; + + cnd_swap (bit, x2, x3, 2*m->size); + + /* Formulas from RFC 7748. We compute new coordinates in + memory-address order, since mul and sqr clobbers higher + limbs. 
*/ + ecc_mod_add (m, A, x2, z2); + ecc_mod_sub (m, B, x2, z2); + ecc_mod_sqr (m, AA, A); + ecc_mod_sqr (m, BB, B); + ecc_mod_mul (m, x2, AA, BB); /* Last use of BB */ + ecc_mod_sub (m, E, AA, BB); + ecc_mod_addmul_1 (m, AA, E, a24); + ecc_mod_add (m, C, x3, z3); + ecc_mod_sub (m, D, x3, z3); + ecc_mod_mul (m, z2, E, AA); /* Last use of E and AA */ + ecc_mod_mul (m, DA, D, A); /* Last use of D, A. FIXME: could + let CB overlap. */ + ecc_mod_mul (m, CB, C, B); + + ecc_mod_add (m, C, DA, CB); + ecc_mod_sqr (m, x3, C); + ecc_mod_sub (m, C, DA, CB); + ecc_mod_sqr (m, DA, C); + ecc_mod_mul (m, z3, DA, px); + + /* FIXME: Could be combined with the loop's initial cnd_swap. */ + cnd_swap (bit, x2, x3, 2*m->size); + } + /* Do the low zero bits, just duplicating x2 */ + for (i = 0; i < bit_low; i++) + { + ecc_mod_add (m, A, x2, z2); + ecc_mod_sub (m, B, x2, z2); + ecc_mod_sqr (m, AA, A); + ecc_mod_sqr (m, BB, B); + ecc_mod_mul (m, x2, AA, BB); + ecc_mod_sub (m, E, AA, BB); + ecc_mod_addmul_1 (m, AA, E, a24); + ecc_mod_mul (m, z2, E, AA); + } + assert (m->invert_itch <= 7 * m->size); + m->invert (m, x3, z2, z3 + m->size); + ecc_mod_mul (m, z3, x2, x3); + cy = mpn_sub_n (qx, z3, m->m, m->size); + cnd_copy (cy, qx, z3, m->size); +} |