From f81af6353b6692f092d66b24573d91b3ba22d509 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Mon, 1 Jun 2009 01:00:20 +0200 Subject: Rewrite. --- mpn/generic/mul.c | 245 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 149 insertions(+), 96 deletions(-) (limited to 'mpn/generic/mul.c') diff --git a/mpn/generic/mul.c b/mpn/generic/mul.c index 489e1f524..770557118 100644 --- a/mpn/generic/mul.c +++ b/mpn/generic/mul.c @@ -3,7 +3,7 @@ Contributed to the GNU project by Torbjorn Granlund. Copyright 1991, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003, 2005, -2006, 2007 Free Software Foundation, Inc. +2006, 2007, 2009 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -50,16 +50,11 @@ mpn_mul (mp_ptr prodp, ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un)); ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn)); - if (un == vn) + if (up == vp && un == vn) { - if (up == vp) - mpn_sqr_n (prodp, up, un); - else - mpn_mul_n (prodp, up, vp, un); - return prodp[2 * un - 1]; + mpn_sqr_n (prodp, up, un); } - - if (vn < MUL_KARATSUBA_THRESHOLD) + else if (vn < MUL_KARATSUBA_THRESHOLD) { /* plain schoolbook multiplication */ /* Unless un is very large, or else if have an applicable mpn_mul_N, @@ -111,7 +106,7 @@ mpn_mul (mp_ptr prodp, { mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn); cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */ - mpn_incr_u (prodp + vn, cy); /* safe? */ + mpn_incr_u (prodp + vn, cy); prodp += MUL_BASECASE_MAX_UN; MPN_COPY (tp, prodp, vn); /* preserve high triangle */ up += MUL_BASECASE_MAX_UN; @@ -127,96 +122,154 @@ mpn_mul (mp_ptr prodp, mpn_mul_basecase (prodp, vp, vn, up, un); } cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */ - mpn_incr_u (prodp + vn, cy); /* safe? */ + mpn_incr_u (prodp + vn, cy); } - return prodp[un + vn - 1]; } + else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD) || + /* Also do larger unbalanced here, up to a (somewhat arbitrarily) + larger vn limit, unless toom33 can do this product directly. */ + (3 * un >= 4 * vn && BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD * 3 / 2))) + { + /* Loop over toom42, then choose toom42, toom32, or toom22 */ + mp_ptr ws; + mp_ptr scratch; + TMP_DECL; TMP_MARK; + + #define WSALL (4 * vn) + ws = TMP_SALLOC_LIMBS (WSALL + 1); + + #define ITCH ((un + vn) * 4 + 100) + scratch = TMP_SALLOC_LIMBS (ITCH + 1); + + if (un >= 3 * vn) + { + mp_limb_t cy; + + mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch); + un -= 2 * vn; + up += 2 * vn; + prodp += 2 * vn; + + while (un >= 3 * vn) + { + mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch); + un -= 2 * vn; + up += 2 * vn; + cy = mpn_add_n (prodp, prodp, ws, vn); + MPN_COPY (prodp + vn, ws + vn, 2 * vn); + mpn_incr_u (prodp + vn, cy); + prodp += 2 * vn; + } - if (ABOVE_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) && - ABOVE_THRESHOLD (vn, MUL_FFT_THRESHOLD / 3)) /* FIXME */ + /* FIXME: Test these in opposite order, following the philosophy of + minimizing the relative overhead. */ + if (5 * un > 9 * vn) + { + mpn_toom42_mul (ws, up, un, vp, vn, scratch); + cy = mpn_add_n (prodp, prodp, ws, vn); + MPN_COPY (prodp + vn, ws + vn, un); + mpn_incr_u (prodp + vn, cy); + } + else if (9 * un > 10 * vn) + { + mpn_toom32_mul (ws, up, un, vp, vn, scratch); + cy = mpn_add_n (prodp, prodp, ws, vn); + MPN_COPY (prodp + vn, ws + vn, un); + mpn_incr_u (prodp + vn, cy); + } + else + { + mpn_toom22_mul (ws, up, un, vp, vn, scratch); + cy = mpn_add_n (prodp, prodp, ws, vn); + MPN_COPY (prodp + vn, ws + vn, un); + mpn_incr_u (prodp + vn, cy); + } + } + else + { + /* FIXME: Test these in opposite order, following the philosophy of + minimizing the relative overhead. */ + if (5 * un > 9 * vn) + mpn_toom42_mul (prodp, up, un, vp, vn, scratch); + else if (9 * un > 10 * vn) + mpn_toom32_mul (prodp, up, un, vp, vn, scratch); + else + mpn_toom22_mul (prodp, up, un, vp, vn, scratch); + } + TMP_FREE; + } + else if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD)) + { + TMP_DECL; TMP_MARK; + if (3 * un < 4 * vn) + { + /* Apply toom33 directly, since operands are balanced enough. */ + mp_ptr scratch; + scratch = TMP_SALLOC_LIMBS (mpn_toom33_mul_itch (un, vn)); + mpn_toom33_mul (prodp, up, un, vp, vn, scratch); + } + else + { + /* Apply toom33, recurse. FUTURE: toom63, toom53, toom43, toom33 */ + mp_ptr scratch, pp; /* FIXME: use same area for these */ + scratch = TMP_SALLOC_LIMBS (mpn_toom33_mul_itch (vn, vn)); + mpn_toom33_mul (prodp, up, vn, vp, vn, scratch); + prodp += vn; + up += vn; + un -= vn; + if (un != 0) + { + mp_limb_t cy; + pp = TMP_SALLOC_LIMBS (un + vn); + if (un > vn) + mpn_mul (pp, up, un, vp, vn); + else + mpn_mul (pp, vp, vn, up, un); + cy = mpn_add_n (prodp, prodp, pp, vn); + MPN_COPY (prodp + vn, pp + vn, un); + mpn_incr_u (prodp + vn, cy); + } + } + TMP_FREE; + } + else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) || + BELOW_THRESHOLD (vn, MUL_FFT_THRESHOLD / 3)) /* FIXME */ + { + TMP_DECL; TMP_MARK; + if (4 * un < 5 * vn) + { + /* Apply toom44 directly, since operands are balanced enough. */ + mp_ptr scratch; + scratch = TMP_ALLOC_LIMBS (mpn_toom44_mul_itch (un, vn)); + mpn_toom44_mul (prodp, up, un, vp, vn, scratch); + } + else + { + /* Apply toom44, recurse. FUTURE: toom84, toom74, toom64, toom54, toom44 */ + mp_ptr scratch, pp; /* FIXME: use same area for these */ + scratch = TMP_ALLOC_LIMBS (mpn_toom44_mul_itch (vn, vn)); + mpn_toom44_mul (prodp, up, vn, vp, vn, scratch); + prodp += vn; + up += vn; + un -= vn; + if (un != 0) + { + mp_limb_t cy; + pp = TMP_ALLOC_LIMBS (un + vn); + if (un > vn) + mpn_mul (pp, up, un, vp, vn); + else + mpn_mul (pp, vp, vn, up, un); + cy = mpn_add_n (prodp, prodp, pp, vn); + MPN_COPY (prodp + vn, pp + vn, un); + mpn_incr_u (prodp + vn, cy); + } + } + TMP_FREE; + } + else { mpn_mul_fft_full (prodp, up, un, vp, vn); - return prodp[un + vn - 1]; } - - { - mp_ptr ws; - mp_ptr scratch; -#if WANT_ASSERT - mp_ptr ssssp; -#endif - TMP_DECL; - TMP_MARK; - -#define WSALL (4 * vn) - ws = TMP_SALLOC_LIMBS (WSALL + 1); - -#define ITCH ((un + vn) * 4 + 100) - scratch = TMP_ALLOC_LIMBS (ITCH + 1); -#if WANT_ASSERT - ssssp = scratch + ITCH; - ws[WSALL] = 0xbabecafe; - ssssp[0] = 0xbeef; -#endif - - if (un >= 3 * vn) - { - mp_limb_t cy; - - mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch); - un -= 2 * vn; - up += 2 * vn; - prodp += 2 * vn; - - while (un >= 3 * vn) - { - mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch); - un -= 2 * vn; - up += 2 * vn; - cy = mpn_add_n (prodp, prodp, ws, vn); - MPN_COPY (prodp + vn, ws + vn, 2 * vn); - mpn_incr_u (prodp + vn, cy); - prodp += 2 * vn; - } - - if (5 * un > 9 * vn) - { - mpn_toom42_mul (ws, up, un, vp, vn, scratch); - cy = mpn_add_n (prodp, prodp, ws, vn); - MPN_COPY (prodp + vn, ws + vn, un); - mpn_incr_u (prodp + vn, cy); - } - else if (9 * un > 10 * vn) - { - mpn_toom32_mul (ws, up, un, vp, vn, scratch); - cy = mpn_add_n (prodp, prodp, ws, vn); - MPN_COPY (prodp + vn, ws + vn, un); - mpn_incr_u (prodp + vn, cy); - } - else - { - mpn_toom22_mul (ws, up, un, vp, vn, scratch); - cy = mpn_add_n (prodp, prodp, ws, vn); - MPN_COPY (prodp + vn, ws + vn, un); - mpn_incr_u (prodp + vn, cy); - } - - ASSERT (ws[WSALL] == 0xbabecafe); - ASSERT (ssssp[0] == 0xbeef); - TMP_FREE; - return prodp[un + vn - 1]; - } - - if (un * 5 > vn * 9) - mpn_toom42_mul (prodp, up, un, vp, vn, scratch); - else if (9 * un > 10 * vn) - mpn_toom32_mul (prodp, up, un, vp, vn, scratch); - else - mpn_toom22_mul (prodp, up, un, vp, vn, scratch); - - ASSERT (ws[WSALL] == 0xbabecafe); - ASSERT (ssssp[0] == 0xbeef); - TMP_FREE; - return prodp[un + vn - 1]; - } + return prodp[un + vn - 1]; } -- cgit v1.2.1