diff options
author | Marco Bodrato <bodrato@mail.dm.unipi.it> | 2018-01-12 12:51:19 +0100 |
---|---|---|
committer | Marco Bodrato <bodrato@mail.dm.unipi.it> | 2018-01-12 12:51:19 +0100 |
commit | 2ac167397efa92b8d6433f69f8095c6d3da0bcc7 (patch) | |
tree | 09d1ca8590a904d042858096955efec7d7fb2576 /mpz | |
parent | b4530a49d42f695abdf2457c2667df303a2a317f (diff) | |
download | gmp-2ac167397efa92b8d6433f69f8095c6d3da0bcc7.tar.gz |
mpz/bin_ui.c: better precomp-update approach suggested by Niels'observations
Diffstat (limited to 'mpz')
-rw-r--r-- | mpz/bin_ui.c | 230 |
1 files changed, 210 insertions, 20 deletions
diff --git a/mpz/bin_ui.c b/mpz/bin_ui.c index a6781e367..f06577925 100644 --- a/mpz/bin_ui.c +++ b/mpz/bin_ui.c @@ -1,6 +1,6 @@ /* mpz_bin_ui(RESULT, N, K) -- Set RESULT to N over K. -Copyright 1998-2002, 2012, 2013, 2015, 2017 Free Software Foundation, Inc. +Copyright 1998-2002, 2012, 2013, 2015, 2017-2018 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -33,13 +33,22 @@ see https://www.gnu.org/licenses/. */ /* How many special cases? Minimum is 2: 0 and 1; * also 3 {0,1,2} and 5 {0,1,2,3,4} are implemented. */ -#define APARTA_KALKULOJ 2 +#define APARTAJ_KALKULOJ 2 /* Whether to use (1) or not (0) the function mpz_bin_uiui whenever * the operands fit. */ #define UZU_BIN_UIUI 0 +/* Whether to use a shortcut to precompute the product of four + * elements (1), or precompute only the product of a couple (0). + * + * In both cases the precomputed product is then updated with some + * linear operations to obtain the product of the next four (1) + * [or two (0)] operands. + */ +#define KVAROPE 1 + static void posmpz_init (mpz_ptr r) { @@ -53,13 +62,188 @@ posmpz_init (mpz_ptr r) /* Equivalent to mpz_add_ui (r, r, in), but faster when 0 < SIZ (r) < ALLOC (r) and limbs above SIZ (r) contain 0. */ static void -posmpz_inc_ui (mpz_ptr r, mp_limb_t in) +posmpz_inc_ui (mpz_ptr r, unsigned long in) { +#if BITS_PER_ULONG > GMP_NUMB_BITS + mpz_add_ui (r, r, in); +#else ASSERT (SIZ (r) > 0); MPN_INCR_U (PTR (r), SIZ (r) + 1, in); SIZ (r) += (PTR (r)[SIZ (r)] != 0); +#endif +} + +/* Equivalent to mpz_sub_ui (r, r, in), but faster when + 0 < SIZ (r) and we know in advance that the result is positive. */ +static void +posmpz_dec_ui (mpz_ptr r, unsigned long in) +{ +#if BITS_PER_ULONG > GMP_NUMB_BITS + mpz_sub_ui (r, r, in); +#else + ASSERT (mpz_cmp_ui (r, in) > 0); + MPN_DECR_U (PTR (r), SIZ (r), in); + SIZ (r) -= (PTR (r)[SIZ (r)-1] == 0); +#endif +} + +/* Equivalent to mpz_tdiv_q_2exp (r, r, 1), but faster when + 0 < SIZ (r) and we know in advance that the result is positive. */ +static void +posmpz_rsh1 (mpz_ptr r) +{ + mp_ptr rp; + mp_size_t rn; + + rn = SIZ (r); + rp = PTR (r); + ASSERT (rn > 0); + mpn_rshift (rp, rp, rn, 1); + SIZ (r) -= rp[rn - 1] == 0; +} + +/* Computes r = n(n+(2*k-1))/2 + It uses a sqare instead of a product, computing + r = ((n+k-1)^2 + n - (k-1)^2)/2 + As a side effect, sets t = n+k + */ +static void +mpz_hmul_nbnpk (mpz_ptr r, mpz_srcptr n, unsigned long int k, mpz_ptr t) +{ + --k; + mpz_add_ui (t, n, k); + mpz_mul (r, t, t); + mpz_add (r, r, n); + posmpz_rsh1 (r); + if (LIKELY (k <= (1UL << (BITS_PER_ULONG / 2)))) + posmpz_dec_ui (r, (k + (k & 1))*(k >> 1)); + else + { + mpz_t tmp; + mpz_init_set_ui (tmp, (k + (k & 1))); + mpz_mul_ui (tmp, tmp, k >> 1); + mpz_sub (r, r, tmp); + mpz_clear (tmp); + } +} + +#if KVAROPE +static void +rek_raising_fac4 (mpz_ptr r, mpz_ptr p, mpz_ptr P, unsigned long int k, unsigned long int lk, mpz_ptr t) +{ + if (k - lk < 5) + { + do { + posmpz_inc_ui (p, 4*k+2); + mpz_addmul_ui (P, p, 4*k); + posmpz_dec_ui (P, k); + mpz_mul (r, r, P); + } while (--k > lk); + } + else + { + mpz_t lt; + unsigned long int m; + + m = ((k + lk) >> 1) + 1; + rek_raising_fac4 (r, p, P, k, m, t); + + posmpz_inc_ui (p, 4*m+2); + mpz_addmul_ui (P, p, 4*m); + posmpz_dec_ui (P, m); + if (t == NULL) + { + mpz_init_set (lt, P); + t = lt; + } + else + { + ALLOC (lt) = 0; + mpz_set (t, P); + } + rek_raising_fac4 (t, p, P, m - 1, lk, NULL); + + mpz_mul (r, r, t); + mpz_clear (lt); + } +} + +/* Computes (n+1)(n+2)...(n+k)/2^(k/2 +k/4) using the helper function + rek_raising_fac4, and exploiting an idea inspired by a piece of + code that Fredrik Johansson wrote and by a comment by Niels Möller. + + Assume k = 4i then compute: + p = (n+1)(n+4i)/2 - i + (n+1+1)(n+4i)/2 = p + i + (n+4i)/2 + (n+1+1)(n+4i-1)/2 = p + i + ((n+4i)-(n+1+1))/2 = p + i + (n-n+4i-2)/2 = p + 3i-1 + P = (p + i)*(p+3i-1)/2 = (n+1)(n+2)(n+4i-1)(n+4i)/8 + n' = n + 2 + i' = i - 1 + (n'-1)(n')(n'+4i'+1)(n'+4i'+2)/8 = P + (n'-1)(n'+4i'+2)/2 - i' - 1 = p + (n'-1+2)(n'+4i'+2)/2 - i' - 1 = p + (n'+4i'+2) + (n'-1+2)(n'+4i'+2-2)/2 - i' - 1 = p + (n'+4i'+2) - (n'-1+2) = p + 4i' + 1 + (n'-1+2)(n'+4i'+2-2)/2 - i' = p + 4i' + 2 + p' = p + 4i' + 2 = (n'+1)(n'+4i')/2 - i' + p' - 4i' - 2 = p + (p' - 4i' - 2 + i)*(p' - 4i' - 2+3i-1)/2 = P + (p' - 4i' - 2 + i' + 1)*(p' - 4i' - 2 + 3i' + 3 - 1)/2 = P + (p' - 3i' - 1)*(p' - i')/2 = P + (p' - 3i' - 1 + 4i' + 1)*(p' - i' + 4i' - 1)/2 = P + (4i' + 1)*(p' - i')/2 + (p' - 3i' - 1 + 4i' + 1)*(4i' - 1)/2 + (p' + i')*(p' + 3i' - 1)/2 = P + (4i')*(p' + p')/2 + (p' - i' - (p' + i'))/2 + (p' + i')*(p' + 3i' - 1)/2 = P + 4i'p' + (p' - i' - p' - i')/2 + (p' + i')*(p' + 3i' - 1)/2 = P + 4i'p' - i' + P' = P + 4i'p' - i' + + And compute the product P * P' * P" ... + */ + +static void +mpz_raising_fac4 (mpz_ptr r, mpz_ptr n, unsigned long int k, mpz_ptr t, mpz_ptr p) +{ + ASSERT ((k >= APARTAJ_KALKULOJ) && (APARTAJ_KALKULOJ > 0)); + posmpz_init (n); + posmpz_inc_ui (n, 1); + SIZ (r) = 0; + if (k & 1) + { + mpz_set (r, n); + posmpz_inc_ui (n, 1); + } + k >>= 1; + if (APARTAJ_KALKULOJ < 2 && k == 0) + return; + + mpz_hmul_nbnpk (p, n, k, t); + posmpz_init (p); + + if (k & 1) + { + if (SIZ (r)) + mpz_mul (r, r, p); + else + mpz_set (r, p); + posmpz_inc_ui (p, k - 1); + } + k >>= 1; + if (APARTAJ_KALKULOJ < 4 && k == 0) + return; + + mpz_hmul_nbnpk (t, p, k, n); + if (SIZ (r)) + mpz_mul (r, r, t); + else + mpz_set (r, t); + + if (APARTAJ_KALKULOJ > 8 || k > 1) + { + posmpz_dec_ui (p, k); + rek_raising_fac4 (r, p, t, k - 1, 0, n); + } } +#else /* KVAROPE */ + static void rek_raising_fac (mpz_ptr r, mpz_ptr n, unsigned long int k, unsigned long int lk, mpz_ptr t1, mpz_ptr t2) { @@ -77,7 +261,7 @@ rek_raising_fac (mpz_ptr r, mpz_ptr n, unsigned long int k, unsigned long int lk mpz_t t3; unsigned long int m; - m = (k + lk) >> 1; + m = ((k + lk) >> 1) + 1; rek_raising_fac (r, n, k, m, t1, t2); posmpz_inc_ui (n, m); @@ -111,34 +295,34 @@ rek_raising_fac (mpz_ptr r, mpz_ptr n, unsigned long int k, unsigned long int lk p'== (n'+1)(n'+2i')/2 == (n+1 +1)(n+2i -1)/2 And compute the product p * p' * p" ... - */ +*/ + static void mpz_raising_fac (mpz_ptr r, mpz_ptr n, unsigned long int k, mpz_ptr t, mpz_ptr p) { - ASSERT (k > 1); - mpz_add_ui (t, n, k); + unsigned long int hk; + ASSERT ((k >= APARTAJ_KALKULOJ) && (APARTAJ_KALKULOJ > 1)); mpz_add_ui (n, n, 1); - mpz_mul (p, n, t); + hk = k >> 1; + mpz_hmul_nbnpk (p, n, hk, t); if ((k & 1) != 0) { - mpz_sub (p, p, n); - mpz_fdiv_q_2exp (p, p, 1); + mpz_add_ui (t, t, hk + 1); mpz_mul (r, t, p); } else { - mpz_fdiv_q_2exp (p, p, 1); mpz_set (r, p); } - if ((APARTA_KALKULOJ > 3) || (k > 3)) + if ((APARTAJ_KALKULOJ > 3) || (hk > 1)) { - k = (k >> 1) - 1; posmpz_init (p); - rek_raising_fac (r, p, k, 0, t, n); + rek_raising_fac (r, p, hk - 1, 0, t, n); } } +#endif /* KVAROPE */ /* This is a poor implementation. Look at bin_uiui.c for improvement ideas. In fact consider calling mpz_bin_uiui() when the arguments fit, leaving @@ -151,7 +335,6 @@ void mpz_bin_ui (mpz_ptr r, mpz_srcptr n, unsigned long int k) { mpz_t ni; - mp_limb_t i; mp_size_t negate; if (SIZ (n) < 0) @@ -192,23 +375,23 @@ mpz_bin_ui (mpz_ptr r, mpz_srcptr n, unsigned long int k) mpz_set_ui (ni, tmp); } - if (k < APARTA_KALKULOJ) + if (k < APARTAJ_KALKULOJ) { if (k == 0) { SIZ (r) = 1; MPZ_NEWALLOC (r, 1)[0] = 1; } -#if APARTA_KALKULOJ == 3 || APARTA_KALKULOJ == 5 +#if APARTAJ_KALKULOJ > 2 else if (k == 2) { mpz_add_ui (ni, ni, 1); mpz_mul (r, ni, ni); mpz_add (r, r, ni); - mpz_fdiv_q_2exp (r, r, 1); + posmpz_rsh1 (r); } #endif -#if APARTA_KALKULOJ == 5 +#if APARTAJ_KALKULOJ > 3 else if (k > 2) { /* k = 3, 4 */ mpz_add_ui (ni, ni, 2); /* n+1 */ @@ -251,10 +434,17 @@ mpz_bin_ui (mpz_ptr r, mpz_srcptr n, unsigned long int k) mpz_init (num); mpz_init (den); +#if KVAROPE + mpz_raising_fac4 (num, ni, k, den, r); + popc_limb (count, k); + ASSERT (k - (k >> 1) - (k >> 2) - count >= 0); + mpz_tdiv_q_2exp (num, num, k - (k >> 1) - (k >> 2) - count); +#else mpz_raising_fac (num, ni, k, den, r); popc_limb (count, k); ASSERT (k - (k >> 1) - count >= 0); - mpz_fdiv_q_2exp (num, num, k - (k >> 1) - count); + mpz_tdiv_q_2exp (num, num, k - (k >> 1) - count); +#endif mpz_oddfac_1(den, k, 0); |