summaryrefslogtreecommitdiff
path: root/mpz
diff options
context:
space:
mode:
authorMarco Bodrato <bodrato@mail.dm.unipi.it>2018-01-12 12:51:19 +0100
committerMarco Bodrato <bodrato@mail.dm.unipi.it>2018-01-12 12:51:19 +0100
commit2ac167397efa92b8d6433f69f8095c6d3da0bcc7 (patch)
tree09d1ca8590a904d042858096955efec7d7fb2576 /mpz
parentb4530a49d42f695abdf2457c2667df303a2a317f (diff)
downloadgmp-2ac167397efa92b8d6433f69f8095c6d3da0bcc7.tar.gz
mpz/bin_ui.c: better precomp-update approach suggested by Niels'observations
Diffstat (limited to 'mpz')
-rw-r--r--mpz/bin_ui.c230
1 files changed, 210 insertions, 20 deletions
diff --git a/mpz/bin_ui.c b/mpz/bin_ui.c
index a6781e367..f06577925 100644
--- a/mpz/bin_ui.c
+++ b/mpz/bin_ui.c
@@ -1,6 +1,6 @@
/* mpz_bin_ui(RESULT, N, K) -- Set RESULT to N over K.
-Copyright 1998-2002, 2012, 2013, 2015, 2017 Free Software Foundation, Inc.
+Copyright 1998-2002, 2012, 2013, 2015, 2017-2018 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@@ -33,13 +33,22 @@ see https://www.gnu.org/licenses/. */
/* How many special cases? Minimum is 2: 0 and 1;
* also 3 {0,1,2} and 5 {0,1,2,3,4} are implemented.
*/
-#define APARTA_KALKULOJ 2
+#define APARTAJ_KALKULOJ 2
/* Whether to use (1) or not (0) the function mpz_bin_uiui whenever
* the operands fit.
*/
#define UZU_BIN_UIUI 0
+/* Whether to use a shortcut to precompute the product of four
+ * elements (1), or precompute only the product of a couple (0).
+ *
+ * In both cases the precomputed product is then updated with some
+ * linear operations to obtain the product of the next four (1)
+ * [or two (0)] operands.
+ */
+#define KVAROPE 1
+
static void
posmpz_init (mpz_ptr r)
{
@@ -53,13 +62,188 @@ posmpz_init (mpz_ptr r)
/* Equivalent to mpz_add_ui (r, r, in), but faster when
0 < SIZ (r) < ALLOC (r) and limbs above SIZ (r) contain 0. */
static void
-posmpz_inc_ui (mpz_ptr r, mp_limb_t in)
+posmpz_inc_ui (mpz_ptr r, unsigned long in)
{
+#if BITS_PER_ULONG > GMP_NUMB_BITS
+ mpz_add_ui (r, r, in);
+#else
ASSERT (SIZ (r) > 0);
MPN_INCR_U (PTR (r), SIZ (r) + 1, in);
SIZ (r) += (PTR (r)[SIZ (r)] != 0);
+#endif
+}
+
+/* Equivalent to mpz_sub_ui (r, r, in), but faster when
+ 0 < SIZ (r) and we know in advance that the result is positive. */
+static void
+posmpz_dec_ui (mpz_ptr r, unsigned long in)
+{
+#if BITS_PER_ULONG > GMP_NUMB_BITS
+ mpz_sub_ui (r, r, in);
+#else
+ ASSERT (mpz_cmp_ui (r, in) > 0);
+ MPN_DECR_U (PTR (r), SIZ (r), in);
+ SIZ (r) -= (PTR (r)[SIZ (r)-1] == 0);
+#endif
+}
+
+/* Equivalent to mpz_tdiv_q_2exp (r, r, 1), but faster when
+ 0 < SIZ (r) and we know in advance that the result is positive. */
+static void
+posmpz_rsh1 (mpz_ptr r)
+{
+ mp_ptr rp;
+ mp_size_t rn;
+
+ rn = SIZ (r);
+ rp = PTR (r);
+ ASSERT (rn > 0);
+ mpn_rshift (rp, rp, rn, 1);
+ SIZ (r) -= rp[rn - 1] == 0;
+}
+
+/* Computes r = n(n+(2*k-1))/2
+ It uses a sqare instead of a product, computing
+ r = ((n+k-1)^2 + n - (k-1)^2)/2
+ As a side effect, sets t = n+k
+ */
+static void
+mpz_hmul_nbnpk (mpz_ptr r, mpz_srcptr n, unsigned long int k, mpz_ptr t)
+{
+ --k;
+ mpz_add_ui (t, n, k);
+ mpz_mul (r, t, t);
+ mpz_add (r, r, n);
+ posmpz_rsh1 (r);
+ if (LIKELY (k <= (1UL << (BITS_PER_ULONG / 2))))
+ posmpz_dec_ui (r, (k + (k & 1))*(k >> 1));
+ else
+ {
+ mpz_t tmp;
+ mpz_init_set_ui (tmp, (k + (k & 1)));
+ mpz_mul_ui (tmp, tmp, k >> 1);
+ mpz_sub (r, r, tmp);
+ mpz_clear (tmp);
+ }
+}
+
+#if KVAROPE
+static void
+rek_raising_fac4 (mpz_ptr r, mpz_ptr p, mpz_ptr P, unsigned long int k, unsigned long int lk, mpz_ptr t)
+{
+ if (k - lk < 5)
+ {
+ do {
+ posmpz_inc_ui (p, 4*k+2);
+ mpz_addmul_ui (P, p, 4*k);
+ posmpz_dec_ui (P, k);
+ mpz_mul (r, r, P);
+ } while (--k > lk);
+ }
+ else
+ {
+ mpz_t lt;
+ unsigned long int m;
+
+ m = ((k + lk) >> 1) + 1;
+ rek_raising_fac4 (r, p, P, k, m, t);
+
+ posmpz_inc_ui (p, 4*m+2);
+ mpz_addmul_ui (P, p, 4*m);
+ posmpz_dec_ui (P, m);
+ if (t == NULL)
+ {
+ mpz_init_set (lt, P);
+ t = lt;
+ }
+ else
+ {
+ ALLOC (lt) = 0;
+ mpz_set (t, P);
+ }
+ rek_raising_fac4 (t, p, P, m - 1, lk, NULL);
+
+ mpz_mul (r, r, t);
+ mpz_clear (lt);
+ }
+}
+
+/* Computes (n+1)(n+2)...(n+k)/2^(k/2 +k/4) using the helper function
+ rek_raising_fac4, and exploiting an idea inspired by a piece of
+ code that Fredrik Johansson wrote and by a comment by Niels Möller.
+
+ Assume k = 4i then compute:
+ p = (n+1)(n+4i)/2 - i
+ (n+1+1)(n+4i)/2 = p + i + (n+4i)/2
+ (n+1+1)(n+4i-1)/2 = p + i + ((n+4i)-(n+1+1))/2 = p + i + (n-n+4i-2)/2 = p + 3i-1
+ P = (p + i)*(p+3i-1)/2 = (n+1)(n+2)(n+4i-1)(n+4i)/8
+ n' = n + 2
+ i' = i - 1
+ (n'-1)(n')(n'+4i'+1)(n'+4i'+2)/8 = P
+ (n'-1)(n'+4i'+2)/2 - i' - 1 = p
+ (n'-1+2)(n'+4i'+2)/2 - i' - 1 = p + (n'+4i'+2)
+ (n'-1+2)(n'+4i'+2-2)/2 - i' - 1 = p + (n'+4i'+2) - (n'-1+2) = p + 4i' + 1
+ (n'-1+2)(n'+4i'+2-2)/2 - i' = p + 4i' + 2
+ p' = p + 4i' + 2 = (n'+1)(n'+4i')/2 - i'
+ p' - 4i' - 2 = p
+ (p' - 4i' - 2 + i)*(p' - 4i' - 2+3i-1)/2 = P
+ (p' - 4i' - 2 + i' + 1)*(p' - 4i' - 2 + 3i' + 3 - 1)/2 = P
+ (p' - 3i' - 1)*(p' - i')/2 = P
+ (p' - 3i' - 1 + 4i' + 1)*(p' - i' + 4i' - 1)/2 = P + (4i' + 1)*(p' - i')/2 + (p' - 3i' - 1 + 4i' + 1)*(4i' - 1)/2
+ (p' + i')*(p' + 3i' - 1)/2 = P + (4i')*(p' + p')/2 + (p' - i' - (p' + i'))/2
+ (p' + i')*(p' + 3i' - 1)/2 = P + 4i'p' + (p' - i' - p' - i')/2
+ (p' + i')*(p' + 3i' - 1)/2 = P + 4i'p' - i'
+ P' = P + 4i'p' - i'
+
+ And compute the product P * P' * P" ...
+ */
+
+static void
+mpz_raising_fac4 (mpz_ptr r, mpz_ptr n, unsigned long int k, mpz_ptr t, mpz_ptr p)
+{
+ ASSERT ((k >= APARTAJ_KALKULOJ) && (APARTAJ_KALKULOJ > 0));
+ posmpz_init (n);
+ posmpz_inc_ui (n, 1);
+ SIZ (r) = 0;
+ if (k & 1)
+ {
+ mpz_set (r, n);
+ posmpz_inc_ui (n, 1);
+ }
+ k >>= 1;
+ if (APARTAJ_KALKULOJ < 2 && k == 0)
+ return;
+
+ mpz_hmul_nbnpk (p, n, k, t);
+ posmpz_init (p);
+
+ if (k & 1)
+ {
+ if (SIZ (r))
+ mpz_mul (r, r, p);
+ else
+ mpz_set (r, p);
+ posmpz_inc_ui (p, k - 1);
+ }
+ k >>= 1;
+ if (APARTAJ_KALKULOJ < 4 && k == 0)
+ return;
+
+ mpz_hmul_nbnpk (t, p, k, n);
+ if (SIZ (r))
+ mpz_mul (r, r, t);
+ else
+ mpz_set (r, t);
+
+ if (APARTAJ_KALKULOJ > 8 || k > 1)
+ {
+ posmpz_dec_ui (p, k);
+ rek_raising_fac4 (r, p, t, k - 1, 0, n);
+ }
}
+#else /* KVAROPE */
+
static void
rek_raising_fac (mpz_ptr r, mpz_ptr n, unsigned long int k, unsigned long int lk, mpz_ptr t1, mpz_ptr t2)
{
@@ -77,7 +261,7 @@ rek_raising_fac (mpz_ptr r, mpz_ptr n, unsigned long int k, unsigned long int lk
mpz_t t3;
unsigned long int m;
- m = (k + lk) >> 1;
+ m = ((k + lk) >> 1) + 1;
rek_raising_fac (r, n, k, m, t1, t2);
posmpz_inc_ui (n, m);
@@ -111,34 +295,34 @@ rek_raising_fac (mpz_ptr r, mpz_ptr n, unsigned long int k, unsigned long int lk
p'== (n'+1)(n'+2i')/2 == (n+1 +1)(n+2i -1)/2
And compute the product p * p' * p" ...
- */
+*/
+
static void
mpz_raising_fac (mpz_ptr r, mpz_ptr n, unsigned long int k, mpz_ptr t, mpz_ptr p)
{
- ASSERT (k > 1);
- mpz_add_ui (t, n, k);
+ unsigned long int hk;
+ ASSERT ((k >= APARTAJ_KALKULOJ) && (APARTAJ_KALKULOJ > 1));
mpz_add_ui (n, n, 1);
- mpz_mul (p, n, t);
+ hk = k >> 1;
+ mpz_hmul_nbnpk (p, n, hk, t);
if ((k & 1) != 0)
{
- mpz_sub (p, p, n);
- mpz_fdiv_q_2exp (p, p, 1);
+ mpz_add_ui (t, t, hk + 1);
mpz_mul (r, t, p);
}
else
{
- mpz_fdiv_q_2exp (p, p, 1);
mpz_set (r, p);
}
- if ((APARTA_KALKULOJ > 3) || (k > 3))
+ if ((APARTAJ_KALKULOJ > 3) || (hk > 1))
{
- k = (k >> 1) - 1;
posmpz_init (p);
- rek_raising_fac (r, p, k, 0, t, n);
+ rek_raising_fac (r, p, hk - 1, 0, t, n);
}
}
+#endif /* KVAROPE */
/* This is a poor implementation. Look at bin_uiui.c for improvement ideas.
In fact consider calling mpz_bin_uiui() when the arguments fit, leaving
@@ -151,7 +335,6 @@ void
mpz_bin_ui (mpz_ptr r, mpz_srcptr n, unsigned long int k)
{
mpz_t ni;
- mp_limb_t i;
mp_size_t negate;
if (SIZ (n) < 0)
@@ -192,23 +375,23 @@ mpz_bin_ui (mpz_ptr r, mpz_srcptr n, unsigned long int k)
mpz_set_ui (ni, tmp);
}
- if (k < APARTA_KALKULOJ)
+ if (k < APARTAJ_KALKULOJ)
{
if (k == 0)
{
SIZ (r) = 1;
MPZ_NEWALLOC (r, 1)[0] = 1;
}
-#if APARTA_KALKULOJ == 3 || APARTA_KALKULOJ == 5
+#if APARTAJ_KALKULOJ > 2
else if (k == 2)
{
mpz_add_ui (ni, ni, 1);
mpz_mul (r, ni, ni);
mpz_add (r, r, ni);
- mpz_fdiv_q_2exp (r, r, 1);
+ posmpz_rsh1 (r);
}
#endif
-#if APARTA_KALKULOJ == 5
+#if APARTAJ_KALKULOJ > 3
else if (k > 2)
{ /* k = 3, 4 */
mpz_add_ui (ni, ni, 2); /* n+1 */
@@ -251,10 +434,17 @@ mpz_bin_ui (mpz_ptr r, mpz_srcptr n, unsigned long int k)
mpz_init (num);
mpz_init (den);
+#if KVAROPE
+ mpz_raising_fac4 (num, ni, k, den, r);
+ popc_limb (count, k);
+ ASSERT (k - (k >> 1) - (k >> 2) - count >= 0);
+ mpz_tdiv_q_2exp (num, num, k - (k >> 1) - (k >> 2) - count);
+#else
mpz_raising_fac (num, ni, k, den, r);
popc_limb (count, k);
ASSERT (k - (k >> 1) - count >= 0);
- mpz_fdiv_q_2exp (num, num, k - (k >> 1) - count);
+ mpz_tdiv_q_2exp (num, num, k - (k >> 1) - count);
+#endif
mpz_oddfac_1(den, k, 0);