summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authortege <tege@gmplib.org>2000-04-15 22:24:34 +0200
committertege <tege@gmplib.org>2000-04-15 22:24:34 +0200
commit7b9a3c35feeb183c3f89ae8430d97ed3813db2ef (patch)
treeeb16400131397ad7ac45072dd46c4a7c20165a73
parent95d0da0adb5f39c7e6a9772000b47b0b96d38fdb (diff)
downloadgmp-7b9a3c35feeb183c3f89ae8430d97ed3813db2ef.tar.gz
Replacement from Paul Zimmermann.
-rw-r--r--mpz/powm.c479
1 files changed, 282 insertions, 197 deletions
diff --git a/mpz/powm.c b/mpz/powm.c
index f596436e2..8d670cd8c 100644
--- a/mpz/powm.c
+++ b/mpz/powm.c
@@ -1,6 +1,7 @@
/* mpz_powm(res,base,exp,mod) -- Set RES to (base**exp) mod MOD.
-Copyright (C) 1991, 1993, 1994, 1996, 1997 Free Software Foundation, Inc.
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation, Inc.
+Contributed by Paul Zimmermann.
This file is part of the GNU MP Library.
@@ -23,256 +24,340 @@ MA 02111-1307, USA. */
#include "gmp-impl.h"
#include "longlong.h"
+/* returns -1/m mod 2^BITS_PER_MP_LIMB, suppose m odd */
+static mp_limb_t
+mpz_dmprepare (m)
+ mpz_t m;
+{
+ mp_limb_t m0, x; int k;
+
+ m0 = PTR (m)[0];
+ ASSERT_ALWAYS (m0 % 2 != 0);
+
+ /* quadratic Hensel lifting (or Newton iteration) */
+ x = 1; k = 1; /* invariant: x * m0 = 1 mod 2^k */
+ while (k < BITS_PER_MP_LIMB)
+ {
+ x += x * (1 - x * m0);
+ k <<= 1;
+ }
+ return -x;
+}
+
+/* set c <- (a*b)/R^n mod m c has to have at least (2n) allocated limbs */
+static void
+mpz_redc (c, a, b, m, Nprim)
+ mpz_t c, a, b, m;
+ mp_limb_t Nprim;
+{
+ mp_ptr cp, mp = PTR (m);
+ mp_limb_t cy, cout = 0;
+ mp_limb_t q;
+ size_t j, n = SIZ (m);
+
+ ASSERT (ALLOC (c) >= 2 * n);
+
+ mpz_mul (c, a, b);
+ cp = PTR (c);
+ j = ABSIZ (c);
+ MPN_ZERO (cp + j, 2 * n - j);
+ for (j = 0; j < n; j++)
+ {
+ q = cp[0] * Nprim;
+ cy = mpn_addmul_1 (cp, mp, n, q);
+ cout += mpn_add_1 (cp + n, cp + n, n - j, cy);
+ cp++;
+ }
+ cp -= n;
+ if (cout)
+ {
+ cy = cout - mpn_sub_n (cp, cp + n, mp, n);
+ while (cy)
+ cy -= mpn_sub_n (cp, cp, mp, n);
+ }
+ else
+ MPN_COPY (cp, cp + n, n);
+ MPN_NORMALIZE (cp, n);
+ SIZ (c) = SIZ (c) < 0 ? -n : n;
+}
+
+/* average number of calls to redc for an exponent of n bits
+ with the sliding window algorithm of base 2^k: the optimal is
+ obtained for the value of k which minimizes 2^(k-1)+n/(k+1):
+
+ n\k 4 5 6 7 8
+ 128 156* 159 171 200 261
+ 256 309 307* 316 343 403
+ 512 617 607* 610 632 688
+ 1024 1231 1204 1195* 1207 1256
+ 2048 2461 2399 2366 2360* 2396
+ 4096 4918 4787 4707 4665* 4670
+*/
+
#ifndef BERKELEY_MP
void
#if __STDC__
-mpz_powm (mpz_ptr res, mpz_srcptr base, mpz_srcptr exp, mpz_srcptr mod)
+mpz_powm (mpz_ptr res, mpz_srcptr base, mpz_srcptr e, mpz_srcptr mod)
#else
-mpz_powm (res, base, exp, mod)
+mpz_powm (res, base, e, mod)
mpz_ptr res;
mpz_srcptr base;
- mpz_srcptr exp;
+ mpz_srcptr e;
mpz_srcptr mod;
#endif
#else /* BERKELEY_MP */
void
#if __STDC__
-pow (mpz_srcptr base, mpz_srcptr exp, mpz_srcptr mod, mpz_ptr res)
+pow (mpz_srcptr base, mpz_srcptr e, mpz_srcptr mod, mpz_ptr res)
#else
-pow (base, exp, mod, res)
+pow (base, e, mod, res)
mpz_srcptr base;
- mpz_srcptr exp;
+ mpz_srcptr e;
mpz_srcptr mod;
mpz_ptr res;
#endif
#endif /* BERKELEY_MP */
{
- mp_ptr rp, ep, mp, bp;
- mp_size_t esize, msize, bsize, rsize;
- mp_size_t size;
- int mod_shift_cnt;
- int negative_result;
- mp_limb_t *free_me = NULL;
- size_t free_me_size;
- TMP_DECL (marker);
-
- esize = ABS (exp->_mp_size);
- msize = ABS (mod->_mp_size);
- size = 2 * msize;
+ mp_limb_t invm, *ep, c, mask;
+ mpz_t xx, *g;
+ mp_size_t n, i, K, j, l, k;
+ int sh;
+ int use_redc;
+
+#ifdef DEBUG
+ mpz_t exp;
+ mpz_init (exp);
+#endif
- rp = res->_mp_d;
- ep = exp->_mp_d;
+ n = SIZ (mod);
- if (msize == 0)
- msize = 1 / msize; /* provoke a signal */
+ if (n == 0)
+ DIVIDE_BY_ZERO;
- if (esize == 0)
+ if (SIZ (e) == 0)
{
/* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0
- depending on if MOD equals 1. */
- rp[0] = 1;
- res->_mp_size = (msize == 1 && (mod->_mp_d)[0] == 1) ? 0 : 1;
+ depending on if MOD equals 1. */
+ PTR(res)[0] = 1;
+ SIZ(res) = (ABSIZ (mod) == 1 && (PTR(mod))[0] == 1) ? 0 : 1;
return;
}
- TMP_MARK (marker);
-
- /* Normalize MOD (i.e. make its most significant bit set) as required by
- mpn_divmod. This will make the intermediate values in the calculation
- slightly larger, but the correct result is obtained after a final
- reduction using the original MOD value. */
+ /* Use REDC instead of usual reduction for small sizes, more precisely using
+ REDC each modular multiplication cost about 2*n^2 limbs operations,
+ whereas using usual reduction it costs 3*K(n), where K(n) is the cost of a
+ multiplication using Karatsuba, and a division is assumed to cost 2*K(n),
+ for example using Burnikel-Ziegler's algorithm. This gives a theoretical
+ threshold of a*KARATSUBA_SQR_THRESHOLD, with a=(3/2)^(1/(2-ln(3)/ln(2))) ~
+ 2.66. */
+ /* For now, also disable REDC when MOD is even, as mpz_dmprepare cannot cope
+ with that. */
+
+ use_redc = (3 * n < 8 * KARATSUBA_SQR_THRESHOLD && PTR(mod)[0] % 2 != 0);
+ if (use_redc)
+ invm = mpz_dmprepare (mod);
+
+ /* determines optimal value of k */
+ l = ABSIZ (e) * BITS_PER_MP_LIMB; /* number of bits of exponent */
+ k = 1;
+ K = 2;
+ while (2 * l > K * (2 + k * (3 + k)))
+ {
+ k++;
+ K *= 2;
+ }
- mp = (mp_ptr) TMP_ALLOC (msize * BYTES_PER_MP_LIMB);
- count_leading_zeros (mod_shift_cnt, mod->_mp_d[msize - 1]);
- if (mod_shift_cnt != 0)
- mpn_lshift (mp, mod->_mp_d, msize, mod_shift_cnt);
+ g = (*_mp_allocate_func) (K / 2 * sizeof (mpz_t));
+ /* compute x*R^n where R=2^BITS_PER_MP_LIMB */
+ mpz_init (g[0]);
+ if (use_redc)
+ {
+ mpz_mul_2exp (g[0], base, n * BITS_PER_MP_LIMB);
+ mpz_mod (g[0], g[0], mod);
+ }
else
- MPN_COPY (mp, mod->_mp_d, msize);
+ mpz_mod (g[0], base, mod);
- bsize = ABS (base->_mp_size);
- if (bsize > msize)
+ /* compute xx^g for odd g < 2^k */
+ mpz_init (xx);
+ if (use_redc)
{
- /* The base is larger than the module. Reduce it. */
-
- /* Allocate (BSIZE + 1) with space for remainder and quotient.
- (The quotient is (bsize - msize + 1) limbs.) */
- bp = (mp_ptr) TMP_ALLOC ((bsize + 1) * BYTES_PER_MP_LIMB);
- MPN_COPY (bp, base->_mp_d, bsize);
- /* We don't care about the quotient, store it above the remainder,
- at BP + MSIZE. */
- mpn_divmod (bp + msize, bp, bsize, mp, msize);
- bsize = msize;
- /* Canonicalize the base, since we are going to multiply with it
- quite a few times. */
- MPN_NORMALIZE (bp, bsize);
+ _mpz_realloc (xx, 2 * n);
+ mpz_redc (xx, g[0], g[0], mod, invm); /* xx = x^2*R^n */
}
else
- bp = base->_mp_d;
-
- if (bsize == 0)
{
- res->_mp_size = 0;
- TMP_FREE (marker);
- return;
+ mpz_mul (xx, g[0], g[0]);
+ mpz_mod (xx, xx, mod);
}
-
- if (res->_mp_alloc < size)
+ for (i = 1; i < K / 2; i++)
{
- /* We have to allocate more space for RES. If any of the input
- parameters are identical to RES, defer deallocation of the old
- space. */
-
- if (rp == ep || rp == mp || rp == bp)
+ mpz_init (g[i]);
+ if (use_redc)
{
- free_me = rp;
- free_me_size = res->_mp_alloc;
+ _mpz_realloc (g[i], 2 * n);
+ mpz_redc (g[i], g[i - 1], xx, mod, invm); /* g[i] = x^(2i+1)*R^n */
}
else
- (*_mp_free_func) (rp, res->_mp_alloc * BYTES_PER_MP_LIMB);
-
- rp = (mp_ptr) (*_mp_allocate_func) (size * BYTES_PER_MP_LIMB);
- res->_mp_alloc = size;
- res->_mp_d = rp;
- }
- else
- {
- /* Make BASE, EXP and MOD not overlap with RES. */
- if (rp == bp)
{
- /* RES and BASE are identical. Allocate temp. space for BASE. */
- bp = (mp_ptr) TMP_ALLOC (bsize * BYTES_PER_MP_LIMB);
- MPN_COPY (bp, rp, bsize);
+ mpz_mul (g[i], g[i - 1], xx);
+ mpz_mod (g[i], g[i], mod);
}
- if (rp == ep)
+ }
+
+ /* now starts the real stuff */
+ mask = (mp_limb_t) ((1<<k) - 1);
+ ep = PTR (e);
+ i = SIZ (e) - 1; /* current index */
+ c = ep[i]; /* current limb */
+ count_leading_zeros (sh, c);
+ sh = BITS_PER_MP_LIMB - sh; /* significant bits in ep[i] */
+ sh -= k; /* index of lower bit of ep[i] to take into account */
+ if (sh < 0)
+ { /* k-sh extra bits are needed */
+ if (i > 0)
{
- /* RES and EXP are identical. Allocate temp. space for EXP. */
- ep = (mp_ptr) TMP_ALLOC (esize * BYTES_PER_MP_LIMB);
- MPN_COPY (ep, rp, esize);
+ i--;
+ c = (c << (-sh)) | (ep[i] >> (BITS_PER_MP_LIMB + sh));
+ sh += BITS_PER_MP_LIMB;
}
- if (rp == mp)
+ }
+ else
+ c = c >> sh;
+#ifdef DEBUG
+ printf ("-1/m mod 2^%u = %lu\n", BITS_PER_MP_LIMB, invm);
+ mpz_set_ui (exp, c);
+#endif
+ j=0;
+ while (c % 2 == 0)
+ {
+ j++;
+ c = (c >> 1);
+ }
+ mpz_set (xx, g[c >> 1]);
+ while (j--)
+ {
+ if (use_redc)
+ mpz_redc (xx, xx, xx, mod, invm);
+ else
{
- /* RES and MOD are identical. Allocate temporary space for MOD. */
- mp = (mp_ptr) TMP_ALLOC (msize * BYTES_PER_MP_LIMB);
- MPN_COPY (mp, rp, msize);
+ mpz_mul (xx, xx, xx);
+ mpz_mod (xx, xx, mod);
}
}
- MPN_COPY (rp, bp, bsize);
- rsize = bsize;
-
- {
- mp_size_t i;
- mp_ptr xp = (mp_ptr) TMP_ALLOC (2 * (msize + 1) * BYTES_PER_MP_LIMB);
- int c;
- mp_limb_t e;
- mp_limb_t carry_limb;
-
- negative_result = (ep[0] & 1) && base->_mp_size < 0;
-
- i = esize - 1;
- e = ep[i];
- count_leading_zeros (c, e);
- e = (e << c) << 1; /* shift the exp bits to the left, lose msb */
- c = BITS_PER_MP_LIMB - 1 - c;
-
- /* Main loop.
-
- Make the result be pointed to alternately by XP and RP. This
- helps us avoid block copying, which would otherwise be necessary
- with the overlap restrictions of mpn_divmod. With 50% probability
- the result after this loop will be in the area originally pointed
- by RP (==RES->_mp_d), and with 50% probability in the area originally
- pointed to by XP. */
+#ifdef DEBUG
+ printf ("x^"); mpz_out_str (0, 10, exp);
+ printf ("*2^%u mod m = ", n * BITS_PER_MP_LIMB); mpz_out_str (0, 10, xx);
+ putchar ('\n');
+#endif
- for (;;)
- {
- while (c != 0)
- {
- mp_ptr tp;
- mp_size_t xsize;
+ while (i > 0 || sh > 0)
+ {
+ c = ep[i];
+ sh -= k;
+ l = k; /* number of bits treated */
+ if (sh < 0)
+ {
+ if (i > 0)
+ {
+ i--;
+ c = (c << (-sh)) | (ep[i] >> (BITS_PER_MP_LIMB + sh));
+ sh += BITS_PER_MP_LIMB;
+ }
+ else
+ {
+ l += sh; /* may be less bits than k here */
+ c = c & ((1<<l) - 1);
+ }
+ }
+ else
+ c = c >> sh;
+ c = c & mask;
- mpn_mul_n (xp, rp, rp, rsize);
- xsize = 2 * rsize;
- xsize -= xp[xsize - 1] == 0;
- if (xsize > msize)
- {
- mpn_divmod (xp + msize, xp, xsize, mp, msize);
- xsize = msize;
- }
+ /* this while loop implements the sliding window improvement */
+ while ((c & (1 << (k - 1))) == 0 && (i > 0 || sh > 0))
+ {
+ if (use_redc) mpz_redc (xx, xx, xx, mod, invm);
+ else
+ {
+ mpz_mul (xx, xx, xx);
+ mpz_mod (xx, xx, mod);
+ }
+ if (sh)
+ {
+ sh--;
+ c = (c<<1) + ((ep[i]>>sh) & 1);
+ }
+ else
+ {
+ i--;
+ sh = BITS_PER_MP_LIMB - 1;
+ c = (c<<1) + (ep[i]>>sh);
+ }
+ }
- tp = rp; rp = xp; xp = tp;
- rsize = xsize;
+#ifdef DEBUG
+ printf ("l=%u c=%lu\n", l, c);
+ mpz_mul_2exp (exp, exp, k);
+ mpz_add_ui (exp, exp, c);
+#endif
- if ((mp_limb_signed_t) e < 0)
+ /* now replace xx by xx^(2^k)*x^c */
+ if (c != 0)
+ {
+ j = 0;
+ while (c % 2 == 0)
+ {
+ j++;
+ c = c >> 1;
+ }
+ /* c0 = c * 2^j, i.e. xx^(2^k)*x^c = (A^(2^(k - j))*c)^(2^j) */
+ l -= j;
+ while (l--)
+ if (use_redc) mpz_redc (xx, xx, xx, mod, invm);
+ else
{
- mpn_mul (xp, rp, rsize, bp, bsize);
- xsize = rsize + bsize;
- xsize -= xp[xsize - 1] == 0;
- if (xsize > msize)
- {
- mpn_divmod (xp + msize, xp, xsize, mp, msize);
- xsize = msize;
- }
-
- tp = rp; rp = xp; xp = tp;
- rsize = xsize;
+ mpz_mul (xx, xx, xx);
+ mpz_mod (xx, xx, mod);
}
- e <<= 1;
- c--;
- }
-
- i--;
- if (i < 0)
- break;
- e = ep[i];
- c = BITS_PER_MP_LIMB;
- }
-
- /* We shifted MOD, the modulo reduction argument, left MOD_SHIFT_CNT
- steps. Adjust the result by reducing it with the original MOD.
-
- Also make sure the result is put in RES->_mp_d (where it already
- might be, see above). */
-
- if (mod_shift_cnt != 0)
- {
- carry_limb = mpn_lshift (res->_mp_d, rp, rsize, mod_shift_cnt);
- rp = res->_mp_d;
- if (carry_limb != 0)
- {
- rp[rsize] = carry_limb;
- rsize++;
- }
- }
- else
- {
- MPN_COPY (res->_mp_d, rp, rsize);
- rp = res->_mp_d;
- }
-
- if (rsize >= msize)
- {
- mpn_divmod (rp + msize, rp, rsize, mp, msize);
- rsize = msize;
- }
-
- /* Remove any leading zero words from the result. */
- if (mod_shift_cnt != 0)
- mpn_rshift (rp, rp, rsize, mod_shift_cnt);
- MPN_NORMALIZE (rp, rsize);
- }
+ if (use_redc)
+ mpz_redc (xx, xx, g[c >> 1], mod, invm);
+ else
+ {
+ mpz_mul (xx, xx, g[c >> 1]);
+ mpz_mod (xx, xx, mod);
+ }
+ }
+ else
+ j = l; /* case c=0 */
+ while (j--)
+ {
+ if (use_redc)
+ mpz_redc (xx, xx, xx, mod, invm);
+ else
+ {
+ mpz_mul (xx, xx, xx);
+ mpz_mod (xx, xx, mod);
+ }
+ }
+#ifdef DEBUG
+ printf ("x^"); mpz_out_str (0, 10, exp);
+ printf ("*2^%u mod m = ", n * BITS_PER_MP_LIMB); mpz_out_str (0, 10, xx);
+ putchar ('\n');
+#endif
+ }
- if (negative_result && rsize != 0)
+ /* now convert back xx to xx/R^n */
+ if (use_redc)
{
- if (mod_shift_cnt != 0)
- mpn_rshift (mp, mp, msize, mod_shift_cnt);
- mpn_sub (rp, mp, msize, rp, rsize);
- rsize = msize;
- MPN_NORMALIZE (rp, rsize);
+ mpz_set_ui (g[0], 1);
+ mpz_redc (xx, xx, g[0], mod, invm);
}
- res->_mp_size = rsize;
+ mpz_set (res, xx);
- if (free_me != NULL)
- (*_mp_free_func) (free_me, free_me_size * BYTES_PER_MP_LIMB);
- TMP_FREE (marker);
+ mpz_clear (xx);
+ for (i = 0; i < K / 2; i++)
+ mpz_clear (g[i]);
+ (*_mp_free_func) (g, K / 2 * sizeof (mpz_t));
}