summaryrefslogtreecommitdiff
path: root/gmp/mpn/generic/mul.c
diff options
context:
space:
mode:
Diffstat (limited to 'gmp/mpn/generic/mul.c')
-rw-r--r--gmp/mpn/generic/mul.c428
1 files changed, 428 insertions, 0 deletions
diff --git a/gmp/mpn/generic/mul.c b/gmp/mpn/generic/mul.c
new file mode 100644
index 0000000000..2d72df3d4d
--- /dev/null
+++ b/gmp/mpn/generic/mul.c
@@ -0,0 +1,428 @@
+/* mpn_mul -- Multiply two natural numbers.
+
+ Contributed to the GNU project by Torbjorn Granlund.
+
+Copyright 1991, 1993, 1994, 1996, 1997, 1999-2003, 2005-2007, 2009, 2010, 2012
+Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+#ifndef MUL_BASECASE_MAX_UN
+#define MUL_BASECASE_MAX_UN 500
+#endif
+
+/* Areas where the different toom algorithms can be called (extracted
+ from the t-toom*.c files, and ignoring small constant offsets):
+
+ 1/6 1/5 1/4 4/13 1/3 3/8 2/5 5/11 1/2 3/5 2/3 3/4 4/5 1 vn/un
+ 4/7 6/7
+ 6/11
+ |--------------------| toom22 (small)
+ || toom22 (large)
+ |xxxx| toom22 called
+ |-------------------------------------| toom32
+ |xxxxxxxxxxxxxxxx| | toom32 called
+ |------------| toom33
+ |x| toom33 called
+ |---------------------------------| | toom42
+ |xxxxxxxxxxxxxxxxxxxxxxxx| | toom42 called
+ |--------------------| toom43
+ |xxxxxxxxxx| toom43 called
+ |-----------------------------| toom52 (unused)
+ |--------| toom44
+ |xxxxxxxx| toom44 called
+ |--------------------| | toom53
+ |xxxxxx| toom53 called
+ |-------------------------| toom62 (unused)
+ |----------------| toom54 (unused)
+ |--------------------| toom63
+ |xxxxxxxxx| | toom63 called
+ |---------------------------------| toom6h
+ |xxxxxxxx| toom6h called
+ |-------------------------| toom8h (32 bit)
+ |------------------------------------------| toom8h (64 bit)
+ |xxxxxxxx| toom8h called
+*/
+
+#define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn)
+#define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn)
+
+/* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
+ (pointed to by VP, with VN limbs), and store the result at PRODP. The
+ result is UN + VN limbs. Return the most significant limb of the result.
+
+ NOTE: The space pointed to by PRODP is overwritten before finished with U
+ and V, so overlap is an error.
+
+ Argument constraints:
+ 1. UN >= VN.
+ 2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
+ the multiplier and the multiplicand. */
+
+/*
+ * The cutoff lines in the toomX2 and toomX3 code are now exactly between the
+ ideal lines of the surrounding algorithms. Is that optimal?
+
+ * The toomX3 code now uses a structure similar to the one of toomX2, except
+ that it loops longer in the unbalanced case. The result is that the
+ remaining area might have un < vn. Should we fix the toomX2 code in a
+ similar way?
+
+ * The toomX3 code is used for the largest non-FFT unbalanced operands. It
+ therefore calls mpn_mul recursively for certain cases.
+
+ * Allocate static temp space using THRESHOLD variables (except for toom44
+ when !WANT_FFT). That way, we can typically have no TMP_ALLOC at all.
+
+ * We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42
+ have the same vn threshold. This is not true, we should actually use
+ mul_basecase for slightly larger operands for toom32 than for toom22, and
+ even larger for toom42.
+
+ * That problem is even more prevalent for toomX3. We therefore use special
+ THRESHOLD variables there.
+
+ * Is our ITCH allocation correct?
+*/
+
+#define ITCH (16*vn + 100)
+
+mp_limb_t
+mpn_mul (mp_ptr prodp,
+ mp_srcptr up, mp_size_t un,
+ mp_srcptr vp, mp_size_t vn)
+{
+ ASSERT (un >= vn);
+ ASSERT (vn >= 1);
+ ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
+ ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));
+
+ if (un == vn)
+ {
+ if (up == vp)
+ mpn_sqr (prodp, up, un);
+ else
+ mpn_mul_n (prodp, up, vp, un);
+ }
+ else if (vn < MUL_TOOM22_THRESHOLD)
+ { /* plain schoolbook multiplication */
+
+ /* Unless un is very large, or else if have an applicable mpn_mul_N,
+ perform basecase multiply directly. */
+ if (un <= MUL_BASECASE_MAX_UN
+#if HAVE_NATIVE_mpn_mul_2
+ || vn <= 2
+#else
+ || vn == 1
+#endif
+ )
+ mpn_mul_basecase (prodp, up, un, vp, vn);
+ else
+ {
+ /* We have un >> MUL_BASECASE_MAX_UN > vn. For better memory
+ locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
+ these pieces with the vp[] operand. After each such partial
+ multiplication (but the last) we copy the most significant vn
+ limbs into a temporary buffer since that part would otherwise be
+ overwritten by the next multiplication. After the next
+ multiplication, we add it back. This illustrates the situation:
+
+ -->vn<--
+ | |<------- un ------->|
+ _____________________|
+ X /|
+ /XX__________________/ |
+ _____________________ |
+ X / |
+ /XX__________________/ |
+ _____________________ |
+ / / |
+ /____________________/ |
+ ==================================================================
+
+ The parts marked with X are the parts whose sums are copied into
+ the temporary buffer. */
+
+ mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT];
+ mp_limb_t cy;
+ ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT);
+
+ mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
+ prodp += MUL_BASECASE_MAX_UN;
+ MPN_COPY (tp, prodp, vn); /* preserve high triangle */
+ up += MUL_BASECASE_MAX_UN;
+ un -= MUL_BASECASE_MAX_UN;
+ while (un > MUL_BASECASE_MAX_UN)
+ {
+ mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
+ cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
+ mpn_incr_u (prodp + vn, cy);
+ prodp += MUL_BASECASE_MAX_UN;
+ MPN_COPY (tp, prodp, vn); /* preserve high triangle */
+ up += MUL_BASECASE_MAX_UN;
+ un -= MUL_BASECASE_MAX_UN;
+ }
+ if (un > vn)
+ {
+ mpn_mul_basecase (prodp, up, un, vp, vn);
+ }
+ else
+ {
+ ASSERT (un > 0);
+ mpn_mul_basecase (prodp, vp, vn, up, un);
+ }
+ cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
+ mpn_incr_u (prodp + vn, cy);
+ }
+ }
+ else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD))
+ {
+ /* Use ToomX2 variants */
+ mp_ptr scratch;
+ TMP_SDECL; TMP_SMARK;
+
+ scratch = TMP_SALLOC_LIMBS (ITCH);
+
+ /* FIXME: This condition (repeated in the loop below) leaves from a vn*vn
+ square to a (3vn-1)*vn rectangle. Leaving such a rectangle is hardly
+ wise; we would get better balance by slightly moving the bound. We
+ will sometimes end up with un < vn, like in the X3 arm below. */
+ if (un >= 3 * vn)
+ {
+ mp_limb_t cy;
+ mp_ptr ws;
+
+ /* The maximum ws usage is for the mpn_mul result. */
+ ws = TMP_SALLOC_LIMBS (4 * vn);
+
+ mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
+ un -= 2 * vn;
+ up += 2 * vn;
+ prodp += 2 * vn;
+
+ while (un >= 3 * vn)
+ {
+ mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
+ un -= 2 * vn;
+ up += 2 * vn;
+ cy = mpn_add_n (prodp, prodp, ws, vn);
+ MPN_COPY (prodp + vn, ws + vn, 2 * vn);
+ mpn_incr_u (prodp + vn, cy);
+ prodp += 2 * vn;
+ }
+
+ /* vn <= un < 3vn */
+
+ if (4 * un < 5 * vn)
+ mpn_toom22_mul (ws, up, un, vp, vn, scratch);
+ else if (4 * un < 7 * vn)
+ mpn_toom32_mul (ws, up, un, vp, vn, scratch);
+ else
+ mpn_toom42_mul (ws, up, un, vp, vn, scratch);
+
+ cy = mpn_add_n (prodp, prodp, ws, vn);
+ MPN_COPY (prodp + vn, ws + vn, un);
+ mpn_incr_u (prodp + vn, cy);
+ }
+ else
+ {
+ if (4 * un < 5 * vn)
+ mpn_toom22_mul (prodp, up, un, vp, vn, scratch);
+ else if (4 * un < 7 * vn)
+ mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
+ else
+ mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
+ }
+ TMP_SFREE;
+ }
+ else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) ||
+ BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD))
+ {
+ /* Handle the largest operands that are not in the FFT range. The 2nd
+ condition makes very unbalanced operands avoid the FFT code (except
+ perhaps as coefficient products of the Toom code. */
+
+ if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn))
+ {
+ /* Use ToomX3 variants */
+ mp_ptr scratch;
+ TMP_SDECL; TMP_SMARK;
+
+ scratch = TMP_SALLOC_LIMBS (ITCH);
+
+ if (2 * un >= 5 * vn)
+ {
+ mp_limb_t cy;
+ mp_ptr ws;
+
+ /* The maximum ws usage is for the mpn_mul result. */
+ ws = TMP_SALLOC_LIMBS (7 * vn >> 1);
+
+ if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
+ mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
+ else
+ mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch);
+ un -= 2 * vn;
+ up += 2 * vn;
+ prodp += 2 * vn;
+
+ while (2 * un >= 5 * vn) /* un >= 2.5vn */
+ {
+ if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
+ mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
+ else
+ mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch);
+ un -= 2 * vn;
+ up += 2 * vn;
+ cy = mpn_add_n (prodp, prodp, ws, vn);
+ MPN_COPY (prodp + vn, ws + vn, 2 * vn);
+ mpn_incr_u (prodp + vn, cy);
+ prodp += 2 * vn;
+ }
+
+ /* vn / 2 <= un < 2.5vn */
+
+ if (un < vn)
+ mpn_mul (ws, vp, vn, up, un);
+ else
+ mpn_mul (ws, up, un, vp, vn);
+
+ cy = mpn_add_n (prodp, prodp, ws, vn);
+ MPN_COPY (prodp + vn, ws + vn, un);
+ mpn_incr_u (prodp + vn, cy);
+ }
+ else
+ {
+ if (6 * un < 7 * vn)
+ mpn_toom33_mul (prodp, up, un, vp, vn, scratch);
+ else if (2 * un < 3 * vn)
+ {
+ if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD))
+ mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
+ else
+ mpn_toom43_mul (prodp, up, un, vp, vn, scratch);
+ }
+ else if (6 * un < 11 * vn)
+ {
+ if (4 * un < 7 * vn)
+ {
+ if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD))
+ mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
+ else
+ mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
+ }
+ else
+ {
+ if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD))
+ mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
+ else
+ mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
+ }
+ }
+ else
+ {
+ if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
+ mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
+ else
+ mpn_toom63_mul (prodp, up, un, vp, vn, scratch);
+ }
+ }
+ TMP_SFREE;
+ }
+ else
+ {
+ mp_ptr scratch;
+ TMP_DECL; TMP_MARK;
+
+ if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD))
+ {
+ scratch = TMP_ALLOC_LIMBS (mpn_toom44_mul_itch (un, vn));
+ mpn_toom44_mul (prodp, up, un, vp, vn, scratch);
+ }
+ else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD))
+ {
+ scratch = TMP_ALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn));
+ mpn_toom6h_mul (prodp, up, un, vp, vn, scratch);
+ }
+ else
+ {
+ scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn));
+ mpn_toom8h_mul (prodp, up, un, vp, vn, scratch);
+ }
+ TMP_FREE;
+ }
+ }
+ else
+ {
+ if (un >= 8 * vn)
+ {
+ mp_limb_t cy;
+ mp_ptr ws;
+ TMP_DECL; TMP_MARK;
+
+ /* The maximum ws usage is for the mpn_mul result. */
+ ws = TMP_BALLOC_LIMBS (9 * vn >> 1);
+
+ mpn_fft_mul (prodp, up, 3 * vn, vp, vn);
+ un -= 3 * vn;
+ up += 3 * vn;
+ prodp += 3 * vn;
+
+ while (2 * un >= 7 * vn) /* un >= 3.5vn */
+ {
+ mpn_fft_mul (ws, up, 3 * vn, vp, vn);
+ un -= 3 * vn;
+ up += 3 * vn;
+ cy = mpn_add_n (prodp, prodp, ws, vn);
+ MPN_COPY (prodp + vn, ws + vn, 3 * vn);
+ mpn_incr_u (prodp + vn, cy);
+ prodp += 3 * vn;
+ }
+
+ /* vn / 2 <= un < 3.5vn */
+
+ if (un < vn)
+ mpn_mul (ws, vp, vn, up, un);
+ else
+ mpn_mul (ws, up, un, vp, vn);
+
+ cy = mpn_add_n (prodp, prodp, ws, vn);
+ MPN_COPY (prodp + vn, ws + vn, un);
+ mpn_incr_u (prodp + vn, cy);
+
+ TMP_FREE;
+ }
+ else
+ mpn_fft_mul (prodp, up, un, vp, vn);
+ }
+
+ return prodp[un + vn - 1]; /* historic */
+}