summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRichard Maw <richard.maw@codethink.co.uk>2012-01-19 10:33:31 +0000
committerRichard Maw <richard.maw@codethink.co.uk>2012-01-19 10:33:31 +0000
commit29137c6ff7a9e370e2332d855ab46616ad4e9cc9 (patch)
treefbca7aa7cfa645df1b059aeba7e81739620b013c
parent962de8d4b353178d38c2c70e952944686b9fd47b (diff)
parent2c033efc02631f22e6e180ce737a2faf81b09ccc (diff)
downloadgmp-29137c6ff7a9e370e2332d855ab46616ad4e9cc9.tar.gz
Merge branch 'master' into baserock/morph
-rw-r--r--AUTHORS2
-rw-r--r--ChangeLog237
-rw-r--r--configure.in338
-rw-r--r--doc/gmp.texi7
-rw-r--r--dumbmp.c2
-rw-r--r--gmp-h.in4
-rw-r--r--gmp-impl.h5
-rw-r--r--gmpxx.h2
-rw-r--r--mpn/alpha/ev5/gmp-mparam.h40
-rw-r--r--mpn/alpha/ev6/gmp-mparam.h76
-rw-r--r--mpn/asm-defs.m417
-rw-r--r--mpn/generic/gcd_subdiv_step.c2
-rw-r--r--mpn/generic/hgcd_appr.c181
-rw-r--r--mpn/generic/hgcd_jacobi.c4
-rw-r--r--mpn/generic/hgcd_reduce.c14
-rw-r--r--mpn/generic/hgcd_step.c2
-rw-r--r--mpn/generic/powm.c44
-rw-r--r--mpn/generic/powm_sec.c126
-rw-r--r--mpn/generic/redc_1.c5
-rw-r--r--mpn/generic/tabselect.c (renamed from mpn/generic/redc_1_sec.c)35
-rw-r--r--mpn/generic/udiv_w_sdiv.c6
-rw-r--r--mpn/ia64/gmp-mparam.h148
-rw-r--r--mpn/ia64/tabselect.asm139
-rw-r--r--mpn/pa64/gmp-mparam.h62
-rw-r--r--mpn/powerpc32/aors_n.asm19
-rw-r--r--mpn/powerpc32/p3-p7/aors_n.asm176
-rw-r--r--mpn/powerpc32/p5/gmp-mparam.h137
-rw-r--r--mpn/powerpc32/p6/gmp-mparam.h206
-rw-r--r--mpn/powerpc32/p7/gmp-mparam.h149
-rw-r--r--mpn/powerpc32/tabselect.asm98
-rw-r--r--mpn/powerpc64/com.asm9
-rw-r--r--mpn/powerpc64/copyd.asm9
-rw-r--r--mpn/powerpc64/copyi.asm9
-rw-r--r--mpn/powerpc64/logops_n.asm9
-rw-r--r--mpn/powerpc64/lshift.asm11
-rw-r--r--mpn/powerpc64/lshiftc.asm (renamed from mpn/powerpc64/mode64/lshiftc.asm)16
-rw-r--r--mpn/powerpc64/mode64/aors_n.asm14
-rw-r--r--mpn/powerpc64/mode64/aorscnd_n.asm185
-rw-r--r--mpn/powerpc64/mode64/aorslshC_n.asm11
-rw-r--r--mpn/powerpc64/mode64/aorsmul_1.asm15
-rw-r--r--mpn/powerpc64/mode64/bdiv_dbm1c.asm4
-rw-r--r--mpn/powerpc64/mode64/dive_1.asm11
-rw-r--r--mpn/powerpc64/mode64/divrem_1.asm13
-rw-r--r--mpn/powerpc64/mode64/divrem_2.asm11
-rw-r--r--mpn/powerpc64/mode64/invert_limb.asm11
-rw-r--r--mpn/powerpc64/mode64/mod_1_1.asm11
-rw-r--r--mpn/powerpc64/mode64/mod_1_4.asm11
-rw-r--r--mpn/powerpc64/mode64/mod_34lsub1.asm11
-rw-r--r--mpn/powerpc64/mode64/mode1o.asm10
-rw-r--r--mpn/powerpc64/mode64/mul_1.asm11
-rw-r--r--mpn/powerpc64/mode64/mul_basecase.asm12
-rw-r--r--mpn/powerpc64/mode64/p3/gmp-mparam.h73
-rw-r--r--mpn/powerpc64/mode64/p4/gmp-mparam.h31
-rw-r--r--mpn/powerpc64/mode64/p5/gmp-mparam.h41
-rw-r--r--mpn/powerpc64/mode64/p6/aorsmul_1.asm172
-rw-r--r--mpn/powerpc64/mode64/p6/gmp-mparam.h85
-rw-r--r--mpn/powerpc64/mode64/p6/mul_basecase.asm2
-rw-r--r--mpn/powerpc64/mode64/p7/gmp-mparam.h159
-rw-r--r--mpn/powerpc64/mode64/rsh1add_n.asm11
-rw-r--r--mpn/powerpc64/mode64/rsh1sub_n.asm11
-rw-r--r--mpn/powerpc64/mode64/sqr_basecase.asm852
-rw-r--r--mpn/powerpc64/mode64/sqr_diag_addlsh1.asm238
-rw-r--r--mpn/powerpc64/rshift.asm11
-rw-r--r--mpn/powerpc64/tabselect.asm96
-rw-r--r--mpn/s390_32/esame/gmp-mparam.h86
-rw-r--r--mpn/s390_32/lshift.asm2
-rw-r--r--mpn/s390_32/lshiftc.asm2
-rw-r--r--mpn/s390_32/rshift.asm2
-rw-r--r--mpn/s390_64/README77
-rw-r--r--mpn/s390_64/gmp-mparam.h24
-rw-r--r--mpn/sparc64/ultrasparc34/gmp-mparam.h29
-rw-r--r--mpn/sparc64/ultrasparct1/gmp-mparam.h36
-rw-r--r--mpn/x86/atom/gmp-mparam.h41
-rw-r--r--mpn/x86/atom/lshift.asm4
-rw-r--r--mpn/x86/atom/sse2/mul_1.asm2
-rw-r--r--mpn/x86/bdiv_dbm1c.asm4
-rw-r--r--mpn/x86/bdiv_q_1.asm2
-rw-r--r--mpn/x86/bobcat/gmp-mparam.h142
-rw-r--r--mpn/x86/core2/gmp-mparam.h141
-rw-r--r--mpn/x86/coreinhm/gmp-mparam.h141
-rw-r--r--mpn/x86/coreisbr/gmp-mparam.h140
-rw-r--r--mpn/x86/k10/gmp-mparam.h142
-rw-r--r--mpn/x86/k7/addlsh1_n.asm6
-rw-r--r--mpn/x86/k7/gmp-mparam.h45
-rw-r--r--mpn/x86/k7/invert_limb.asm2
-rw-r--r--mpn/x86/k7/sublsh1_n.asm8
-rw-r--r--mpn/x86/k8/gmp-mparam.h144
-rw-r--r--mpn/x86/nano/gmp-mparam.h152
-rw-r--r--mpn/x86/p6/bdiv_q_1.asm4
-rw-r--r--mpn/x86/p6/sse2/gmp-mparam.h61
-rw-r--r--mpn/x86/pentium/bdiv_q_1.asm2
-rw-r--r--mpn/x86/pentium4/sse2/gmp-mparam.h85
-rw-r--r--mpn/x86/tabselect.asm104
-rw-r--r--mpn/x86_64/addmul_2.asm7
-rw-r--r--mpn/x86_64/aorrlsh1_n.asm8
-rw-r--r--mpn/x86_64/aorrlsh2_n.asm5
-rw-r--r--mpn/x86_64/aorrlshC_n.asm7
-rw-r--r--mpn/x86_64/aorrlsh_n.asm14
-rw-r--r--mpn/x86_64/aors_n.asm34
-rw-r--r--mpn/x86_64/aorscnd_n.asm178
-rw-r--r--mpn/x86_64/aorsmul_1.asm51
-rw-r--r--mpn/x86_64/atom/gmp-mparam.h17
-rw-r--r--mpn/x86_64/bdiv_dbm1c.asm16
-rw-r--r--mpn/x86_64/bdiv_q_1.asm21
-rw-r--r--mpn/x86_64/bobcat/gmp-mparam.h10
-rw-r--r--mpn/x86_64/com.asm8
-rw-r--r--mpn/x86_64/copyd.asm9
-rw-r--r--mpn/x86_64/copyi.asm9
-rw-r--r--mpn/x86_64/core2/aorrlsh1_n.asm5
-rw-r--r--mpn/x86_64/core2/aorrlsh2_n.asm5
-rw-r--r--mpn/x86_64/core2/aorrlsh_n.asm4
-rw-r--r--mpn/x86_64/core2/aors_n.asm19
-rw-r--r--mpn/x86_64/core2/aorsmul_1.asm8
-rw-r--r--mpn/x86_64/core2/gmp-mparam.h23
-rw-r--r--mpn/x86_64/core2/lshift.asm39
-rw-r--r--mpn/x86_64/core2/lshiftc.asm39
-rw-r--r--mpn/x86_64/core2/rsh1aors_n.asm17
-rw-r--r--mpn/x86_64/core2/rshift.asm39
-rw-r--r--mpn/x86_64/core2/sublsh1_n.asm5
-rw-r--r--mpn/x86_64/core2/sublsh2_n.asm5
-rw-r--r--mpn/x86_64/core2/sublshC_n.asm4
-rw-r--r--mpn/x86_64/coreinhm/aorrlsh_n.asm17
-rw-r--r--mpn/x86_64/coreinhm/gmp-mparam.h113
-rw-r--r--mpn/x86_64/coreisbr/aors_n.asm14
-rw-r--r--mpn/x86_64/coreisbr/gmp-mparam.h166
-rw-r--r--mpn/x86_64/div_qr_2n_pi1.asm6
-rw-r--r--mpn/x86_64/div_qr_2u_pi1.asm6
-rw-r--r--mpn/x86_64/dos64.m439
-rw-r--r--mpn/x86_64/gmp-mparam.h13
-rw-r--r--mpn/x86_64/invert_limb.asm6
-rw-r--r--mpn/x86_64/invert_limb_table.asm3
-rw-r--r--mpn/x86_64/logops_n.asm16
-rw-r--r--mpn/x86_64/lshift.asm11
-rw-r--r--mpn/x86_64/lshiftc.asm7
-rw-r--r--mpn/x86_64/lshsub_n.asm16
-rw-r--r--mpn/x86_64/mod_1_1.asm11
-rw-r--r--mpn/x86_64/mod_1_2.asm9
-rw-r--r--mpn/x86_64/mod_1_4.asm15
-rw-r--r--mpn/x86_64/mod_34lsub1.asm12
-rw-r--r--mpn/x86_64/mul_1.asm55
-rw-r--r--mpn/x86_64/mul_2.asm7
-rw-r--r--mpn/x86_64/mul_basecase.asm14
-rw-r--r--mpn/x86_64/mulmid_basecase.asm14
-rw-r--r--mpn/x86_64/nano/gmp-mparam.h33
-rw-r--r--mpn/x86_64/pentium4/gmp-mparam.h51
-rw-r--r--mpn/x86_64/popham.asm12
-rw-r--r--mpn/x86_64/redc_1.asm73
-rw-r--r--mpn/x86_64/rsh1aors_n.asm17
-rw-r--r--mpn/x86_64/rshift.asm7
-rw-r--r--mpn/x86_64/sqr_basecase.asm9
-rw-r--r--mpn/x86_64/sublsh1_n.asm7
-rw-r--r--mpn/x86_64/tabselect.asm123
-rw-r--r--mpn/x86_64/x86_64-defs.m47
-rw-r--r--mpz/jacobi.c8
-rw-r--r--tests/cxx/t-ops2.cc40
-rw-r--r--tests/devel/try.c18
-rw-r--r--tests/mpn/t-hgcd_appr.c14
-rw-r--r--tests/mpn/t-mod_1.c2
-rw-r--r--tests/mpn/t-mulmid.c2
-rw-r--r--tests/mpz/t-jac.c4
-rw-r--r--tests/refmpn.c30
-rw-r--r--tests/tests.h5
-rw-r--r--tune/Makefile.am7
-rw-r--r--tune/common.c40
-rw-r--r--tune/hgcd_appr_lehmer.c29
-rw-r--r--tune/hgcd_reduce_1.c30
-rw-r--r--tune/hgcd_reduce_2.c29
-rw-r--r--tune/speed.c9
-rw-r--r--tune/speed.h89
-rw-r--r--tune/tune-gcd-p.c4
-rw-r--r--tune/tuneup.c173
171 files changed, 6367 insertions, 1780 deletions
diff --git a/AUTHORS b/AUTHORS
index 170c766e1..f399ce345 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -58,5 +58,5 @@ David Harvey mpn/generic/add_err1_n.c, add_err2_n.c,
aors_err2_n.asm, aors_err3_n.asm,
mulmid_basecase.asm,
mpn/x86_64/core2/aors_err1_n.asm.
-
+
Martin Boij mpn/generic/perfpow.c
diff --git a/ChangeLog b/ChangeLog
index 939030555..01c275bd2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,240 @@
+2011-11-29 Torbjorn Granlund <tege@gmplib.org>
+
+ * mpn/x86_64: Add DOS64 ABI support to most files.
+
+2011-11-28 Torbjorn Granlund <tege@gmplib.org>
+
+ * mpn/x86_64/mul_basecase.asm: Support ABI DOS64.
+ * mpn/x86_64/sqr_basecase.asm: Support ABI DOS64.
+ * mpn/x86_64/aorsmul_1.asm: Support ABI DOS64.
+ * mpn/x86_64/mul_1.asm: Support ABI DOS64.
+
+ * mpn/x86_64/x86_64-defs.m4 (DOS64_ENTRY, DOS64_EXIT): New, empty defs.
+
+ * mpn/x86_64/dos64.m4: New file.
+
+ * mpn/asm-defs.m4 (ABI_SUPPORT): New dummy macro.
+
+ * configure.in (64-bit mingw/cygwin): Define HOST_DOS64,GMP_NONSTD_ABI.
+ No longer clear out path_64.
+ (mpn code selection loop): Handle GMP_NONSTD_ABI.
+
+ * mpn/generic/udiv_w_sdiv.c: Use CNST_LIMB for some constants.
+
+2011-11-25 Torbjorn Granlund <tege@gmplib.org>
+
+ * x86/*: Many new gmp-mparam.h files for 64-bit CPUs in 32-bit mode.
+
+ * configure.in: Overhaul x86/x86_64 support, merging three case
+ statements into one.
+
+2011-11-24 Torbjorn Granlund <tege@gmplib.org>
+
+ * doc/gmp.texi (Formatted Output Strings): Clarify rules for mpf_t
+ precision.
+
+ * mpn/powerpc32/p7/gmp-mparam.h: New file.
+
+ * tune/tuneup.c (tune_mu_div, tune_mu_bdiv): Up min_size to karatsuba's
+ threshold.
+
+2011-11-22 Torbjorn Granlund <tege@gmplib.org>
+
+ * mpn/powerpc64/mode64/p6/aorsmul_1.asm: New file.
+
+ * configure.in: Don't fail fat builds under 64-bit DOS.
+
+ * mpn/powerpc64/mode64/aors_n.asm: Align loop for slightly better
+ power5 performance.
+
+2011-11-21 Torbjorn Granlund <tege@gmplib.org>
+
+ * gmp-h.in (__GNU_MP_RELEASE): Renamed from typo name.
+
+2011-11-20 Torbjorn Granlund <tege@gmplib.org>
+
+ * configure.in: Split x86 CPUs into more subtypes for more accurate
+ passing of gcc flags.
+
+ * mpn/powerpc32/p3-p7/aors_n.asm: New file.
+
+ * configure.in: Pass -m32 for powerpc64 with abi=32, via the _maybe
+ mechanism.
+
+ * configure.in: Support powerpc32/p3-p7 directory for affected CPUs.
+
+2011-11-17 Torbjorn Granlund <tege@gmplib.org>
+
+ * tune/speed.c (routine): Add mpn_tabselect.
+ * tune/common.c (speed_mpn_tabselect): New function.
+ * tune/speed.h (SPEED_ROUTINE_MPN_COPY_CALL): New macro, made from
+ old SPEED_ROUTINE_MPN_COPY.
+ (SPEED_ROUTINE_MPN_COPY): Just invoke SPEED_ROUTINE_MPN_COPY_CALL.
+ (SPEED_ROUTINE_MPN_TABSELECT): New macro.
+
+2011-11-17 Niels Möller <nisse@lysator.liu.se>
+
+ * tune/tuneup.c (tune_hgcd_appr): Increase stop_since_change.
+
+2011-11-16 Torbjorn Granlund <tege@gmplib.org>
+
+ * mpn/powerpc32/tabselect.asm: New file.
+
+ * mpn/powerpc64/mode64/aorscnd_n.asm: New file.
+
+2011-11-15 Niels Möller <nisse@lysator.liu.se>
+
+ * tune/speed.h (speed_mpn_hgcd_appr_lehmer): New prototype.
+ (mpn_hgcd_lehmer_itch): Likewise.
+ (mpn_hgcd_appr_lehmer): Likewise.
+ (mpn_hgcd_appr_lehmer_itch): Likewise.
+ (MPN_HGCD_LEHMER_ITCH): Deleted macro.
+
+ * tune/speed.c (routine): Added mpn_hgcd_appr_lehmer.
+
+ * tune/common.c (speed_mpn_hgcd_lehmer): Use mpn_hgcd_lehmer_itch
+ rather than similarly named macro.
+ (speed_mpn_hgcd_appr_lehmer): New function.
+
+ * tune/Makefile.am (libspeed_la_SOURCES): Added
+ hgcd_appr_lehmer.c.
+
+ * tune/hgcd_appr_lehmer.c: New file.
+
+ * tune/tuneup.c (tune_hgcd_appr): Increased min_size to 50; some
+ machines got small thresholds which appear to be bogus.
+
+2011-11-15 Torbjorn Granlund <tege@gmplib.org>
+
+ * mpn/generic/powm_sec.c (mpn_local_sqr): Remove forgotten TMP_* calls.
+ (redcify): Likewise.
+ (mpn_powm_sec): Likewise.
+
+ * mpn/generic/powm_sec.c (mpn_powm_sec): Rework scratch usage
+ (mpn_powm_sec_itch): Rewrite.
+
+ * mpn/generic/powm_sec.c (mpn_powm_sec): Use mpn_tabselect also in
+ initialisation.
+
+ * configure.in: Amend 2011-11-03 gcc_cflags change.
+
+ * mpn/powerpc64/tabselect.asm: New file.
+ * mpn/x86_64/tabselect.asm: New file.
+ * mpn/x86/tabselect.asm: New file.
+ * mpn/ia64/tabselect.asm: New file.
+
+ * mpn/asm-defs.m4 (define_mpn): Add tabselect.
+
+ * configure.in (gmp_mpn_functions): Add tabselect.
+ (HAVE_NATIVE): Add entries for addcnd_n, subcnd_n, tabselect.
+
+ * mpn/generic/powm_sec.c: Remove mpn_tabselect implementation.
+ * mpn/generic/tabselect.c: New file with removed code.
+
+2011-11-13 Torbjorn Granlund <tege@gmplib.org>
+
+ * tune/Makefile.am (TUNE_MPN_SRCS_BASIC): Add powm_sec.c.
+
+ * mpn/generic/powm_sec.c (win_size): Use POWM_SEC_TABLE
+ (POWM_SEC_TABLE): Define default.
+
+ * tune/tuneup.c (tune_powm_sec): New function computing POWM_SEC_TABLE.
+ (all): Call new function.
+
+ * mpn/generic/powm_sec.c (win_size): Define only when
+ TUNE_PROGRAM_BUILD is not set.
+
+2011-11-13 Niels Möller <nisse@lysator.liu.se>
+
+ * tune/tuneup.c (tune_hgcd_appr): Use default min_size.
+ (tune_hgcd_reduce): Increase max_size and step_factor, to 7000
+ and 0.04, respectively.
+
+2011-11-11 Torbjorn Granlund <tege@gmplib.org>
+
+ * mpn/powerpc64/mode64/sqr_diag_addlsh1.asm: Remove.
+
+2011-11-11 Niels Möller <nisse@lysator.liu.se>
+
+ * tune/hgcd_reduce_2.c: New file.
+ * tune/hgcd_reduce_1.c: New file.
+
+ * tune/tuneup.c (hgcd_appr_threshold): New threshold variable.
+ (hgcd_reduce_threshold): Likewise.
+ (tune_hgcd_appr): New function.
+ (tune_hgcd_reduce): New function.
+ (all): Call tune_hgcd_appr and tune_hgcd_reduce.
+
+ * tune/speed.h (speed_mpn_hgcd_reduce): Declaration.
+ (speed_mpn_hgcd_reduce_[12]): Likewise.
+ (mpn_hgcd_reduce_[12]): Likewise.
+ (SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL): New macro.
+
+ * tune/speed.c (routine): Added mpn_hgcd_reduce,
+ mpn_hgcd_reduce_1, and mpn_hgcd_reduce_2.
+
+ * tune/common.c (speed_mpn_hgcd_reduce): New function.
+ (speed_mpn_hgcd_reduce_[12]): Likewise.
+
+ * tune/Makefile.am (libspeed_la_SOURCES): Added hgcd_reduce_1.c
+ hgcd_reduce_2.c.
+ (TUNE_MPN_SRCS_BASIC): Added hgcd_appr.c and hgcd_reduce.c.
+
+ * mpn/generic/hgcd_appr.c (submul, hgcd_matrix_apply): Deleted
+ functions, earlier copied to hgcd_reduce.c.
+ (mpn_hgcd_appr): Use hgcd_reduce.
+
+2011-11-09 Torbjorn Granlund <tege@gmplib.org>
+
+ * mpn/powerpc64/mode64/sqr_basecase.asm: New file.
+
+ * mpn/x86_64/aorscnd_n.asm: New file.
+
+ * tune/speed.c (routine): Add measuring of mpn_addcnd_n, mpn_subcnd_n.
+ * tune/common.c (speed_mpn_addcnd_n,speed_mpn_subcnd_n): New functions.
+ * tune/speed.h: Declare them.
+
+ * tests/devel/try.c: Add tests for mpn_addcnd_n and mpn_subcnd_n.
+ * tests/refmpn.c (refmpn_addcnd_n, refmpn_subcnd_n): New functions.
+ * tests/tests.h: Declare them.
+
+ * configure.in (gmp_mpn_functions): Add addcnd_n and subcnd_n.
+
+2011-11-07 Torbjorn Granlund <tege@gmplib.org>
+
+ * mpn/generic/redc_1.c: Just reduce U operand using Hensel norm, but
+ not fully canonically; leave add_n and conditional sub_n to caller.
+ Therefore omit R argument.
+
+ * mpn/generic/redc_1_sec.c: Remove.
+
+ * gmp-impl.h (mpn_redc_1): Update declaration.
+ (mpn_redc_1_sec): Remove declaration.
+
+ * configure.in (gmp_mpn_functions): Remove redc_1_sec.
+
+ * mpn/x86_64/redc_1.asm: Adapt to newly defined functionality/interface.
+ * tune/speed.h (SPEED_ROUTINE_REDC_1): Likewise.
+
+ * tests/refmpn.c (refmpn_redc_1): Likewise; also call refmpn_addmul_1
+ instead of mpn_addmul_1.
+
+ * mpn/generic/powm.c (MPN_REDC_1): New macro, use for mpn_redc_1.
+ * mpn/generic/powm_sec.c (MPN_REDC_1_SEC): New macro, use for
+ mpn_redc_1_sec.
+
+2011-11-03 Torbjorn Granlund <tege@gmplib.org>
+
+ * dumbmp.c (mpz_sub): Abort for non-handled case.
+
+ * mpn/powerpc64/mode64/lshiftc.asm: Move file from here...
+ * mpn/powerpc64/lshiftc.asm: ...to here, with trivial modifications.
+
+ * configure.in: Pass -m32 in more cases, via the _maybe mechanism.
+ Inherit default gcc_cflags in more places.
+
+ * mpn/powerpc64/mode64/p7/gmp-mparam.h: New file.
+
2011-11-02 Torbjorn Granlund <tege@gmplib.org>
* mpn/s390_64/invert_limb.asm: Slight optimisation.
diff --git a/configure.in b/configure.in
index 21defe968..601d6348f 100644
--- a/configure.in
+++ b/configure.in
@@ -648,7 +648,7 @@ case $host in
# -mpa-risc-2-0 is only an optional flag, in case an old gcc is
# used. Assembler support for 2.0 is essential though, for our asm
# files.
- gcc_20n_cflags="-O2"
+ gcc_20n_cflags="$gcc_cflags"
gcc_20n_cflags_optlist="arch"
gcc_20n_cflags_arch="-mpa-risc-2-0 -mpa-risc-1-1"
gcc_20n_testlist="sizeof-long-4 hppa-level-2.0"
@@ -671,7 +671,7 @@ case $host in
esac
cclist_20w="gcc cc"
- gcc_20w_cflags="-O2 -mpa-risc-2-0"
+ gcc_20w_cflags="$gcc_cflags -mpa-risc-2-0"
cc_20w_cflags="+DD64 +O2"
cc_20w_testlist="hpc-hppa-2-0"
path_20w="pa64"
@@ -735,7 +735,7 @@ case $host in
cc_32_cflags=""
cc_32_cflags_optlist="opt"
cc_32_cflags_opt="+O3 +O2 +O1"
- gcc_32_cflags="-milp32 -O2"
+ gcc_32_cflags="$gcc_cflags -milp32"
limb_32=longlong
SPEED_CYCLECOUNTER_OBJ_32=ia64.lo
cyclecounter_size_32=2
@@ -750,7 +750,7 @@ case $host in
cc_64_cppflags="+DD64"
cc_64_cflags_optlist="opt"
cc_64_cflags_opt="+O3 +O2 +O1"
- gcc_64_cflags="$gcc_64_cflags -mlp64"
+ gcc_64_cflags="$gcc_cflags -mlp64"
;;
esac
;;
@@ -831,13 +831,13 @@ case $host in
abilist="n32 64 o32"
cclist_n32="gcc cc"
- gcc_n32_cflags="-O2 -mabi=n32"
+ gcc_n32_cflags="$gcc_cflags -mabi=n32"
cc_n32_cflags="-O2 -n32" # no -g, it disables all optimizations
limb_n32=longlong
path_n32="mips64"
cclist_64="gcc cc"
- gcc_64_cflags="$gcc_64_cflags -mabi=64"
+ gcc_64_cflags="$gcc_cflags -mabi=64"
gcc_64_ldflags="-Wc,-mabi=64"
cc_64_cflags="-O2 -64" # no -g, it disables all optimizations
cc_64_ldflags="-Wc,-64"
@@ -875,7 +875,7 @@ case $host in
abilist="32"
cclist="gcc cc"
cc_cflags="-O2"
- gcc_cflags="$gcc_cflags -mpowerpc"
+ gcc_32_cflags="$gcc_cflags -mpowerpc"
gcc_cflags_optlist="precomp subtype asm cpu"
gcc_cflags_precomp="-no-cpp-precomp"
gcc_cflags_subtype="-force_cpusubtype_ALL" # for vmx on darwin
@@ -919,7 +919,7 @@ case $host in
powerpc620) gcc_cflags_cpu="-mcpu=620" ;;
powerpc630) gcc_cflags_cpu="-mcpu=630"
xlc_cflags_arch="-qarch=pwr3"
- cpu_path="p3" ;;
+ cpu_path="p3 p3-p7" ;;
powerpc740) gcc_cflags_cpu="-mcpu=740" ;;
powerpc7400 | powerpc7410)
gcc_cflags_asm="-Wa,-maltivec"
@@ -935,19 +935,19 @@ case $host in
powerpc970) gcc_cflags_cpu="-mtune=970"
xlc_cflags_arch="-qarch=970 -qarch=pwr3"
vmx_path="powerpc64/vmx"
- cpu_path="p4" ;;
+ cpu_path="p4 p3-p7" ;;
power4) gcc_cflags_cpu="-mtune=power4"
xlc_cflags_arch="-qarch=pwr4"
- cpu_path="p4" ;;
+ cpu_path="p4 p3-p7" ;;
power5) gcc_cflags_cpu="-mtune=power5 -mtune=power4"
xlc_cflags_arch="-qarch=pwr5"
- cpu_path="p5 p4" ;;
+ cpu_path="p5 p4 p3-p7" ;;
power6) gcc_cflags_cpu="-mtune=power6"
xlc_cflags_arch="-qarch=pwr6"
- cpu_path="p6" ;;
+ cpu_path="p6 p3-p7" ;;
power7) gcc_cflags_cpu="-mtune=power7 -mtune=power5"
xlc_cflags_arch="-qarch=pwr7 -qarch=pwr5"
- cpu_path="p7 p5 p4" ;;
+ cpu_path="p7 p5 p4 p3-p7" ;;
esac
case $host in
@@ -969,7 +969,7 @@ case $host in
# Need -Wc to pass object type flags through to the linker.
abilist="mode64 $abilist"
cclist_mode64="gcc xlc"
- gcc_mode64_cflags="-O2 -maix64 -mpowerpc64"
+ gcc_mode64_cflags="$gcc_cflags -maix64 -mpowerpc64"
gcc_mode64_cflags_optlist="cpu"
gcc_mode64_ldflags="-Wc,-maix64"
xlc_mode64_cflags="-O2 -q64 -qmaxmem=20000"
@@ -1012,8 +1012,10 @@ case $host in
# incompatible with a shared library.
#
abilist="mode64 mode32 $abilist"
+ gcc_32_cflags_maybe="-m32"
gcc_cflags_opt="-O3 -O2 -O1" # will this become used?
cclist_mode32="gcc"
+ gcc_mode32_cflags_maybe="-m32"
gcc_mode32_cflags="-mpowerpc64"
gcc_mode32_cflags_optlist="subtype cpu opt"
gcc_mode32_cflags_subtype="-force_cpusubtype_ALL"
@@ -1056,7 +1058,9 @@ case $host in
# 64-bits.
#
abilist="mode64 mode32 $abilist"
+ gcc_32_cflags_maybe="-m32"
cclist_mode32="gcc"
+ gcc_mode32_cflags_maybe="-m32"
gcc_mode32_cflags="-mpowerpc64"
gcc_mode32_cflags_optlist="cpu opt"
gcc_mode32_cflags_opt="-O3 -O2 -O1"
@@ -1241,9 +1245,7 @@ case $host in
#
case $host_cpu in
sparc64 | sparcv9* | ultrasparc*)
- gcc_cflags="$gcc_cflags -Wa,-xarch=v8plus" ;;
- *)
- gcc_cflags="$gcc_cflags" ;;
+ gcc_32_cflags="$gcc_cflags -Wa,-xarch=v8plus" ;;
esac
gcc_32_cflags_maybe="-m32"
gcc_cflags_optlist="cpu"
@@ -1358,7 +1360,7 @@ case $host in
# it until we're sure. (Might want -xarch=v9a or -xarch=v9b for the
# higher cpu types instead.)
#
- gcc_64_cflags="$gcc_64_cflags -m64 -mptr64"
+ gcc_64_cflags="$gcc_cflags -m64 -mptr64"
gcc_64_ldflags="-Wc,-m64"
gcc_64_cflags_optlist="cpu"
@@ -1464,156 +1466,172 @@ case $host in
gcc_cflags_optlist="cpu arch"
case $host_cpu in
i386*)
- gcc_cflags_cpu="-mtune=i386 -mcpu=i386 -m386"
- gcc_cflags_arch="-march=i386"
- ;;
+ gcc_cflags_cpu="-mtune=i386 -mcpu=i386 -m386"
+ gcc_cflags_arch="-march=i386"
+ path="x86"
+ ;;
i486*)
- gcc_cflags_cpu="-mtune=i486 -mcpu=i486 -m486"
- gcc_cflags_arch="-march=i486"
- ;;
+ gcc_cflags_cpu="-mtune=i486 -mcpu=i486 -m486"
+ gcc_cflags_arch="-march=i486"
+ path="x86/i486 x86"
+ ;;
i586 | pentium)
- gcc_cflags_cpu="-mtune=pentium -mcpu=pentium -m486"
- gcc_cflags_arch="-march=pentium"
- ;;
+ gcc_cflags_cpu="-mtune=pentium -mcpu=pentium -m486"
+ gcc_cflags_arch="-march=pentium"
+ path="x86/pentium x86"
+ ;;
pentiummmx)
- gcc_cflags_cpu="-mtune=pentium-mmx -mcpu=pentium-mmx -mcpu=pentium -m486"
- gcc_cflags_arch="-march=pentium-mmx -march=pentium"
- ;;
+ gcc_cflags_cpu="-mtune=pentium-mmx -mcpu=pentium-mmx -mcpu=pentium -m486"
+ gcc_cflags_arch="-march=pentium-mmx -march=pentium"
+ path="x86/pentium/mmx x86/pentium x86"
+ ;;
i686 | pentiumpro)
- gcc_cflags_cpu="-mtune=pentiumpro -mcpu=pentiumpro -mcpu=i486 -m486"
- gcc_cflags_arch="-march=pentiumpro -march=pentium"
- ;;
+ gcc_cflags_cpu="-mtune=pentiumpro -mcpu=pentiumpro -mcpu=i486 -m486"
+ gcc_cflags_arch="-march=pentiumpro -march=pentium"
+ path="x86/p6 x86"
+ ;;
pentium2)
- gcc_cflags_cpu="-mtune=pentium2 -mcpu=pentium2 -mcpu=pentiumpro -mcpu=i486 -m486"
- gcc_cflags_arch="-march=pentium2 -march=pentiumpro -march=pentium"
- ;;
- pentium3 | pentiumm)
- gcc_cflags_cpu="-mtune=pentium3 -mcpu=pentium3 -mcpu=pentiumpro -mcpu=i486 -m486"
- gcc_cflags_arch="-march=pentium3 -march=pentiumpro -march=pentium"
- ;;
+ gcc_cflags_cpu="-mtune=pentium2 -mcpu=pentium2 -mcpu=pentiumpro -mcpu=i486 -m486"
+ gcc_cflags_arch="-march=pentium2 -march=pentiumpro -march=pentium"
+ path="x86/p6/mmx x86/p6 x86"
+ ;;
+ pentium3)
+ gcc_cflags_cpu="-mtune=pentium3 -mcpu=pentium3 -mcpu=pentiumpro -mcpu=i486 -m486"
+ gcc_cflags_arch="-march=pentium3 -march=pentiumpro -march=pentium"
+ path="x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
+ ;;
+ pentiumm)
+ gcc_cflags_cpu="-mtune=pentium3 -mcpu=pentium3 -mcpu=pentiumpro -mcpu=i486 -m486"
+ gcc_cflags_arch="-march=pentium3 -march=pentiumpro -march=pentium"
+ path="x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
+ ;;
k6)
- gcc_cflags_cpu="-mtune=k6 -mcpu=k6 -mcpu=i486 -m486"
- gcc_cflags_arch="-march=k6"
- ;;
+ gcc_cflags_cpu="-mtune=k6 -mcpu=k6 -mcpu=i486 -m486"
+ gcc_cflags_arch="-march=k6"
+ path="x86/k6/mmx x86/k6 x86"
+ ;;
k62)
- gcc_cflags_cpu="-mtune=k6-2 -mcpu=k6-2 -mcpu=k6 -mcpu=i486 -m486"
- gcc_cflags_arch="-march=k6-2 -march=k6"
- ;;
+ gcc_cflags_cpu="-mtune=k6-2 -mcpu=k6-2 -mcpu=k6 -mcpu=i486 -m486"
+ gcc_cflags_arch="-march=k6-2 -march=k6"
+ path="x86/k6/k62mmx x86/k6/mmx x86/k6 x86"
+ ;;
k63)
- gcc_cflags_cpu="-mtune=k6-3 -mcpu=k6-3 -mcpu=k6 -mcpu=i486 -m486"
- gcc_cflags_arch="-march=k6-3 -march=k6"
- ;;
+ gcc_cflags_cpu="-mtune=k6-3 -mcpu=k6-3 -mcpu=k6 -mcpu=i486 -m486"
+ gcc_cflags_arch="-march=k6-3 -march=k6"
+ path="x86/k6/k62mmx x86/k6/mmx x86/k6 x86"
+ ;;
geode)
- gcc_cflags_cpu="-mtune=k6-3 -mcpu=k6-3 -mcpu=k6 -mcpu=i486 -m486"
- gcc_cflags_arch="-march=k6-3 -march=k6"
- ;;
+ gcc_cflags_cpu="-mtune=k6-3 -mcpu=k6-3 -mcpu=k6 -mcpu=i486 -m486"
+ gcc_cflags_arch="-march=k6-3 -march=k6"
+ path="x86/geode x86/k6/k62mmx x86/k6/mmx x86/k6 x86"
+ ;;
athlon)
- # Athlon instruction costs are close to P6 (3 cycle load latency,
- # 4-6 cycle mul, 40 cycle div, pairable adc, etc) so if gcc doesn't
- # know athlon (eg. 2.95.2 doesn't) then fall back on pentiumpro.
- gcc_cflags_cpu="-mtune=athlon -mcpu=athlon -mcpu=pentiumpro -mcpu=i486 -m486"
- gcc_cflags_arch="-march=athlon -march=pentiumpro -march=pentium"
- ;;
+ # Athlon instruction costs are close to P6 (3 cycle load latency,
+ # 4-6 cycle mul, 40 cycle div, pairable adc, etc) so if gcc doesn't
+ # know athlon (eg. 2.95.2 doesn't) then fall back on pentiumpro.
+ gcc_cflags_cpu="-mtune=athlon -mcpu=athlon -mcpu=pentiumpro -mcpu=i486 -m486"
+ gcc_cflags_arch="-march=athlon -march=pentiumpro -march=pentium"
+ path="x86/k7/mmx x86/k7 x86"
+ ;;
i786 | pentium4)
- # pentiumpro is the primary fallback when gcc doesn't know pentium4.
- # This gets us cmov to eliminate branches. Maybe "athlon" would be
- # a possibility on gcc 3.0.
- #
- gcc_cflags_cpu="-mtune=pentium4 -mcpu=pentium4 -mcpu=pentiumpro -mcpu=i486 -m486"
- gcc_cflags_arch="-march=pentium4 -march=pentium4~-mno-sse2 -march=pentiumpro -march=pentium"
- gcc_64_cflags_cpu="-mtune=nocona"
- ;;
+ # pentiumpro is the primary fallback when gcc doesn't know pentium4.
+ # This gets us cmov to eliminate branches. Maybe "athlon" would be
+ # a possibility on gcc 3.0.
+ #
+ gcc_cflags_cpu="-mtune=pentium4 -mcpu=pentium4 -mcpu=pentiumpro -mcpu=i486 -m486"
+ gcc_cflags_arch="-march=pentium4 -march=pentium4~-mno-sse2 -march=pentiumpro -march=pentium"
+ gcc_64_cflags_cpu="-mtune=nocona"
+ path="x86/pentium4/sse2 x86/pentium4/mmx x86/pentium4 x86"
+ path_64="x86_64/pentium4 x86_64"
+ ;;
viac32)
- # Not sure of the best fallbacks here for -mcpu.
- # c3-2 has sse and mmx, so pentium3 is good for -march.
- gcc_cflags_cpu="-mtune=c3-2 -mcpu=c3-2 -mcpu=i486 -m486"
- gcc_cflags_arch="-march=c3-2 -march=pentium3 -march=pentiumpro -march=pentium"
- ;;
+ # Not sure of the best fallbacks here for -mcpu.
+ # c3-2 has sse and mmx, so pentium3 is good for -march.
+ gcc_cflags_cpu="-mtune=c3-2 -mcpu=c3-2 -mcpu=i486 -m486"
+ gcc_cflags_arch="-march=c3-2 -march=pentium3 -march=pentiumpro -march=pentium"
+ path="x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
+ ;;
viac3*)
- # Not sure of the best fallbacks here.
- gcc_cflags_cpu="-mtune=c3 -mcpu=c3 -mcpu=i486 -m486"
- gcc_cflags_arch="-march=c3 -march=pentium-mmx -march=pentium"
- ;;
- athlon64 | k8 | k10 | bobcat | bulldozer | x86_64)
- gcc_cflags_cpu="-mtune=k8 -mcpu=athlon -mcpu=pentiumpro -mcpu=i486 -m486"
- gcc_cflags_arch="-march=k8 -march=k8~-mno-sse2 -march=athlon -march=pentiumpro -march=pentium"
- ;;
- core2 | corei | coreinhm | coreiwsm | coreisbr)
- gcc_cflags_cpu="-mtune=core2 -mtune=k8"
- gcc_cflags_arch="-march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2"
- ;;
+ # Not sure of the best fallbacks here.
+ gcc_cflags_cpu="-mtune=c3 -mcpu=c3 -mcpu=i486 -m486"
+ gcc_cflags_arch="-march=c3 -march=pentium-mmx -march=pentium"
+ path="x86/pentium/mmx x86/pentium x86"
+ ;;
+ athlon64 | k8 | x86_64)
+ gcc_cflags_cpu="-mtune=k8 -mcpu=athlon -mcpu=pentiumpro -mcpu=i486 -m486"
+ gcc_cflags_arch="-march=k8 -march=k8~-mno-sse2 -march=athlon -march=pentiumpro -march=pentium"
+ path="x86/k8 x86/k7/mmx x86/k7 x86"
+ path_64="x86_64/k8 x86_64"
+ ;;
+ k10)
+ gcc_cflags_cpu="-mtune=amdfam10 -mtune=k8"
+ gcc_cflags_arch="-march=amdfam10 -march=k8 -march=k8~-mno-sse2"
+ path="x86/k10 x86/k8 x86/k7/mmx x86/k7 x86"
+ path_64="x86_64/k10 x86_64/k8 x86_64"
+ ;;
+ bobcat)
+ gcc_cflags_cpu="-mtune=btver1 -mtune=amdfam10 -mtune=k8"
+ gcc_cflags_arch="-march=btver1 -march=amdfam10 -march=k8 -march=k8~-mno-sse2"
+ path="x86/bobcat x86/k7/mmx x86/k7 x86"
+ path_64="x86_64/bobcat x86_64/k10 x86_64/k8 x86_64"
+ ;;
+ bulldozer | bd1)
+ gcc_cflags_cpu="-mtune=bdver1 -mtune=amdfam10 -mtune=k8"
+ gcc_cflags_arch="-march=bdver1 -march=amdfam10 -march=k8 -march=k8~-mno-sse2"
+ path="x86/bd1 x86/k7/mmx x86/k7 x86"
+ path_64="x86_64/bd1 x86_64"
+ ;;
+ core2)
+ gcc_cflags_cpu="-mtune=core2 -mtune=k8"
+ gcc_cflags_arch="-march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2"
+ path="x86/core2 x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
+ path_64="x86_64/core2 x86_64"
+ ;;
+ corei | coreinhm | coreiwsm)
+ gcc_cflags_cpu="-mtune=corei7 -mtune=core2 -mtune=k8"
+ gcc_cflags_arch="-march=corei7 -march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2"
+ path="x86/coreinhm x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
+ path_64="x86_64/coreinhm x86_64/core2 x86_64"
+ ;;
+ coreisbr)
+ gcc_cflags_cpu="-mtune=corei7 -mtune=core2 -mtune=k8"
+ gcc_cflags_arch="-march=corei7 -march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2"
+ path="x86/coreisbr x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
+ path_64="x86_64/coreisbr x86_64/coreinhm x86_64/core2 x86_64"
+ ;;
atom)
- gcc_cflags_cpu="-mtune=atom -mtune=pentium3"
- gcc_cflags_arch="-march=atom -march=pentium3"
- ;;
+ gcc_cflags_cpu="-mtune=atom -mtune=pentium3"
+ gcc_cflags_arch="-march=atom -march=pentium3"
+ path="x86/atom/sse2 x86/atom/mmx x86/atom x86"
+ path_64="x86_64/atom x86_64"
+ ;;
+ nano)
+ gcc_cflags_cpu="-mtune=nano"
+ gcc_cflags_arch="-march=nano"
+ path="x86/nano x86"
+ path_64="x86_64/nano x86_64"
+ ;;
*)
- gcc_cflags_cpu="-mtune=i486 -mcpu=i486 -m486"
- gcc_cflags_arch="-march=i486"
- ;;
- esac
-
- case $host_cpu in
- i386*) path="x86" ;;
- i486*) path="x86/i486 x86" ;;
- i586 | pentium) path="x86/pentium x86" ;;
- pentiummmx) path="x86/pentium/mmx x86/pentium x86" ;;
- i686 | pentiumpro) path="x86/p6 x86" ;;
- pentium2) path="x86/p6/mmx x86/p6 x86" ;;
- pentium3) path="x86/p6/p3mmx x86/p6/mmx x86/p6 x86";;
- pentiumm | core2 | corei | coreinhm | coreiwsm | coreisbr)
- path="x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86";;
- [k6[23]]) path="x86/k6/k62mmx x86/k6/mmx x86/k6 x86" ;;
- k6) path="x86/k6/mmx x86/k6 x86" ;;
- geode) path="x86/geode x86/k6/k62mmx x86/k6/mmx x86/k6 x86" ;;
- # we don't have any specific 32-bit code for athlon64/opteron, the
- # athlon code should be reasonable
- athlon | athlon64 | k8 | k10 | bobcat | bulldozer)
- path="x86/k7/mmx x86/k7 x86" ;;
- i786 | pentium4) path="x86/pentium4/sse2 x86/pentium4/mmx x86/pentium4 x86" ;;
- # VIA/Centaur processors, sold as CyrixIII and C3.
- viac32) path="x86/p6/p3mmx x86/p6/mmx x86/p6 x86";;
- viac3*) path="x86/pentium/mmx x86/pentium x86";;
- atom) path="x86/atom/sse2 x86/atom/mmx x86/atom x86" ;;
- *) path="x86" ;;
+ gcc_cflags_cpu="-mtune=i486 -mcpu=i486 -m486"
+ gcc_cflags_arch="-march=i486"
+ path="x86"
+ path_64="x86_64"
+ ;;
esac
case $host in
X86_64_PATTERN)
cclist_64="gcc"
- gcc_64_cflags="$gcc_64_cflags -m64"
+ gcc_64_cflags="$gcc_cflags -m64"
gcc_64_cflags_optlist="cpu arch"
CALLING_CONVENTIONS_OBJS_64='amd64call.lo amd64check$U.lo'
SPEED_CYCLECOUNTER_OBJ_64=x86_64.lo
cyclecounter_size_64=2
abilist="64 32"
- path_64="x86_64"
if test "$enable_assembly" = "yes" ; then
extra_functions_64="invert_limb_table"
fi
- case $host_cpu in
- x86_64)
- ;;
- k10 | bulldozer)
- path_64="x86_64/k10 x86_64/k8 $path_64" ;;
- athlon64 | k8)
- path_64="x86_64/k8 $path_64" ;;
- bobcat)
- path_64="x86_64/bobcat x86_64/k10 x86_64/k8 $path_64" ;;
- pentium4)
- path_64="x86_64/pentium4 $path_64" ;;
- core2)
- path_64="x86_64/core2 $path_64" ;;
- corei | coreinhm | coreiwsm)
- path_64="x86_64/coreinhm x86_64/core2 $path_64" ;;
- coreisbr)
- path_64="x86_64/coreisbr x86_64/coreinhm x86_64/core2 $path_64" ;;
- atom)
- path_64="x86_64/atom $path_64" ;;
- nano)
- path_64="x86_64/nano $path_64" ;;
- esac
-
case $host in
*-*-solaris*)
# Sun cc.
@@ -1622,10 +1640,9 @@ case $host in
;;
*-*-mingw* | *-*-cygwin)
limb_64=longlong
- path_64="" # Windows amd64 calling conventions are *different*
- extra_functions_64=""
- # Silence many pedantic warnings for w64. FIXME.
- gcc_64_cflags="$gcc_64_cflags -std=gnu99"
+ CALLING_CONVENTIONS_OBJS_64=""
+ AC_DEFINE(HOST_DOS64,1,[Define to 1 for Windos/64])
+ AC_SUBST(GMP_NONSTD_ABI,DOS64)
;;
esac
;;
@@ -1912,9 +1929,17 @@ case $host in
if test "$abi" = 64; then
gcc_64_cflags=""
- extra_functions_64="$extra_functions_64 fat fat_entry"
- path_64="x86_64/fat x86_64"
- fat_path="x86_64 x86_64/fat x86_64/pentium4 x86_64/core2 x86_64/coreinhm x86_64/coreisbr x86_64/atom x86_64/nano"
+ case $host in
+ *-*-mingw* | *-*-cygwin)
+ path_64="" # Windows amd64 calling conventions are *different*
+ fat_path=""
+ ;;
+ *)
+ extra_functions_64="$extra_functions_64 fat fat_entry"
+ path_64="x86_64/fat x86_64"
+ fat_path="x86_64 x86_64/fat x86_64/pentium4 x86_64/core2 x86_64/coreinhm x86_64/coreisbr x86_64/atom x86_64/nano"
+ ;;
+ esac
fi
fat_functions="add_n addmul_1 copyd copyi
@@ -2636,9 +2661,9 @@ gmp_mpn_functions="$extra_functions \
mu_bdiv_q mu_bdiv_qr \
bdiv_q bdiv_qr \
divexact bdiv_dbm1c redc_1 redc_2 redc_n powm powlo powm_sec \
- redc_1_sec trialdiv remove \
+ trialdiv remove \
and_n andn_n nand_n ior_n iorn_n nior_n xor_n xnor_n \
- copyi copyd zero \
+ copyi copyd zero tabselect \
$gmp_mpn_functions_optional"
define(GMP_MULFUNC_CHOICES,
@@ -2652,6 +2677,7 @@ case $tmp_fn in
tmp_mulfunc="aors_err2_n" ;;
add_err3_n|sub_err3_n)
tmp_mulfunc="aors_err3_n" ;;
+ addcnd_n|subcnd_n) tmp_mulfunc="aorscnd_n" ;;
addmul_1|submul_1) tmp_mulfunc="aorsmul_1" ;;
popcount|hamdist) tmp_mulfunc="popham" ;;
and_n|andn_n|nand_n | ior_n|iorn_n|nior_n | xor_n|xnor_n)
@@ -3034,6 +3060,17 @@ for tmp_fn in $gmp_mpn_functions; do
esac
fi
+ # If the host uses a non-standard ABI, check if tmp_file supports it
+ #
+ if test -n "$GMP_NONSTD_ABI" && test $tmp_dir != generic; then
+ abi=[`sed -n 's/^[ ]*ABI_SUPPORT(\(.*\))/\1/p' $tmp_file `]
+ if echo "$abi" | grep -q "\\b${GMP_NONSTD_ABI}\\b"; then
+ true
+ else
+ continue
+ fi
+ fi
+
found=yes
eval found_$tmp_ext=yes
@@ -3100,6 +3137,7 @@ AH_VERBATIM([HAVE_NATIVE],
#undef HAVE_NATIVE_mpn_add_n_sub_n
#undef HAVE_NATIVE_mpn_add_nc
#undef HAVE_NATIVE_mpn_addaddmul_1msb0
+#undef HAVE_NATIVE_mpn_addcnd_n
#undef HAVE_NATIVE_mpn_addlsh1_n
#undef HAVE_NATIVE_mpn_addlsh2_n
#undef HAVE_NATIVE_mpn_addlsh_n
@@ -3188,6 +3226,7 @@ AH_VERBATIM([HAVE_NATIVE],
#undef HAVE_NATIVE_mpn_sqr_diag_addlsh1
#undef HAVE_NATIVE_mpn_sub_n
#undef HAVE_NATIVE_mpn_sub_nc
+#undef HAVE_NATIVE_mpn_subcnd_n
#undef HAVE_NATIVE_mpn_sublsh1_n
#undef HAVE_NATIVE_mpn_sublsh2_n
#undef HAVE_NATIVE_mpn_sublsh_n
@@ -3201,6 +3240,7 @@ AH_VERBATIM([HAVE_NATIVE],
#undef HAVE_NATIVE_mpn_sublsh2_nc_ip1
#undef HAVE_NATIVE_mpn_sublsh_nc_ip1
#undef HAVE_NATIVE_mpn_submul_1c
+#undef HAVE_NATIVE_mpn_tabselect
#undef HAVE_NATIVE_mpn_udiv_qrnnd
#undef HAVE_NATIVE_mpn_udiv_qrnnd_r
#undef HAVE_NATIVE_mpn_umul_ppmm
@@ -3314,6 +3354,8 @@ if test "$gmp_asm_syntax_testing" != no; then
case $host in
*-*-darwin*)
GMP_INCLUDE_MPN(x86_64/darwin.m4) ;;
+ *-*-mingw* | *-*-cygwin)
+ GMP_INCLUDE_MPN(x86_64/dos64.m4) ;;
esac
;;
esac
diff --git a/doc/gmp.texi b/doc/gmp.texi
index 1d6538165..9e77abe7f 100644
--- a/doc/gmp.texi
+++ b/doc/gmp.texi
@@ -5909,7 +5909,7 @@ instance extensions registered with GLIBC @code{register_printf_function}.
Also currently there's no support for POSIX @samp{$} style numbered arguments
(perhaps this will be added in the future).
-The precision field has it's usual meaning for integer @samp{Z} and float
+The precision field has its usual meaning for integer @samp{Z} and float
@samp{F} types, but is currently undefined for @samp{Q} and should not be used
with that.
@@ -5920,7 +5920,10 @@ happens even for an @samp{f} conversion of an @code{mpf_t} which is an
integer, for instance @math{2^@W{1024}} in an @code{mpf_t} of 128 bits
precision will only produce about 40 digits, then pad with zeros to the
decimal point. An empty precision field like @samp{%.Fe} or @samp{%.Ff} can
-be used to specifically request just the significant digits.
+be used to specifically request just the significant digits. Without any dot
+and thus no precision field, a precision value of 6 will be used. Note that
+these rules mean that @samp{%Ff}, @samp{%.Ff}, and @samp{%.0Ff} all behave
+differently.
The decimal point character (or string) is taken from the current locale
settings on systems which provide @code{localeconv} (@pxref{Locales,, Locales
diff --git a/dumbmp.c b/dumbmp.c
index 293580228..3292d6eec 100644
--- a/dumbmp.c
+++ b/dumbmp.c
@@ -421,6 +421,8 @@ mpz_sub (mpz_t r, mpz_t a, mpz_t b)
mp_limb_t *tp; int tn;
tn = an; an = bn; bn = tn;
tp = ap; ap = bp; bp = tp;
+ /* This case needs a sign change, which is not implemented, so abort. */
+ abort ();
}
cy = 0;
diff --git a/gmp-h.in b/gmp-h.in
index 7d6b22926..fa3438041 100644
--- a/gmp-h.in
+++ b/gmp-h.in
@@ -1535,7 +1535,7 @@ __GMP_DECLSPEC mp_limb_t mpn_divrem_2 __GMP_PROTO ((mp_ptr, mp_size_t, mp_ptr, m
#define mpn_div_qr_2 __MPN(div_qr_2)
__GMP_DECLSPEC mp_limb_t mpn_div_qr_2 __GMP_PROTO ((mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr));
-
+
#define mpn_gcd __MPN(gcd)
__GMP_DECLSPEC mp_size_t mpn_gcd __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
@@ -2275,7 +2275,7 @@ enum
#define __GNU_MP_VERSION 5
#define __GNU_MP_VERSION_MINOR 0
#define __GNU_MP_VERSION_PATCHLEVEL 90
-#define __GMP_MP_RELEASE (__GNU_MP_VERSION * 10000 + __GNU_MP_VERSION_MINOR * 100 + __GNU_MP_VERSION_PATCHLEVEL)
+#define __GNU_MP_RELEASE (__GNU_MP_VERSION * 10000 + __GNU_MP_VERSION_MINOR * 100 + __GNU_MP_VERSION_PATCHLEVEL)
#define __GMP_H__
#endif /* __GMP_H__ */
diff --git a/gmp-impl.h b/gmp-impl.h
index e918c31ed..c0ed63791 100644
--- a/gmp-impl.h
+++ b/gmp-impl.h
@@ -1063,7 +1063,7 @@ __GMP_DECLSPEC void mpn_mulmid __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_sr
__GMP_DECLSPEC mp_limb_t mpn_submul_1c __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t));
#define mpn_redc_1 __MPN(redc_1)
-__GMP_DECLSPEC void mpn_redc_1 __GMP_PROTO ((mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
+__GMP_DECLSPEC void mpn_redc_1 __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
#define mpn_redc_2 __MPN(redc_2)
__GMP_DECLSPEC void mpn_redc_2 __GMP_PROTO ((mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr));
@@ -1471,9 +1471,6 @@ __GMP_DECLSPEC void mpn_powm_sec __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t
__GMP_DECLSPEC mp_size_t mpn_powm_sec_itch __GMP_PROTO ((mp_size_t, mp_size_t, mp_size_t));
#define mpn_tabselect __MPN(tabselect)
__GMP_DECLSPEC void mpn_tabselect __GMP_PROTO ((volatile mp_limb_t *, volatile mp_limb_t *, mp_size_t, mp_size_t, mp_size_t));
-#define mpn_redc_1_sec __MPN(redc_1_sec)
-__GMP_DECLSPEC void mpn_redc_1_sec __GMP_PROTO ((mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
-
#define mpn_addcnd_n __MPN(addcnd_n)
__GMP_DECLSPEC mp_limb_t mpn_addcnd_n __GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_limb_t));
#define mpn_subcnd_n __MPN(subcnd_n)
diff --git a/gmpxx.h b/gmpxx.h
index e7ef16266..fb4865466 100644
--- a/gmpxx.h
+++ b/gmpxx.h
@@ -616,7 +616,7 @@ struct __gmp_binary_divides
}
else
#endif
- mpz_tdiv_q_ui(z, w, l);
+ mpz_tdiv_q_ui(z, w, l);
}
static void eval(mpz_ptr z, unsigned long int l, mpz_srcptr w)
{
diff --git a/mpn/alpha/ev5/gmp-mparam.h b/mpn/alpha/ev5/gmp-mparam.h
index a4c794838..395353a46 100644
--- a/mpn/alpha/ev5/gmp-mparam.h
+++ b/mpn/alpha/ev5/gmp-mparam.h
@@ -26,38 +26,44 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */
#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1P_METHOD 2
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 29
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 6
#define MOD_1U_TO_MOD_1_1_THRESHOLD 2
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 /* never mpn_mod_1_1p */
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 4
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 14
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 75
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 15
#define USE_PREINV_DIVREM_1 1 /* preinv always */
+#define DIV_QR_2_PI2_THRESHOLD 21
#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 80
+#define BMOD_1_TO_MOD_1_THRESHOLD 78
-#define MUL_TOOM22_THRESHOLD 18
-#define MUL_TOOM33_THRESHOLD 61
-#define MUL_TOOM44_THRESHOLD 88
+#define MUL_TOOM22_THRESHOLD 14
+#define MUL_TOOM33_THRESHOLD 57
+#define MUL_TOOM44_THRESHOLD 118
#define MUL_TOOM6H_THRESHOLD 173
-#define MUL_TOOM8H_THRESHOLD 0
+#define MUL_TOOM8H_THRESHOLD 240
#define MUL_TOOM32_TO_TOOM43_THRESHOLD 57
#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 60
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 81
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 56
#define SQR_BASECASE_THRESHOLD 4
#define SQR_TOOM2_THRESHOLD 28
-#define SQR_TOOM3_THRESHOLD 65
+#define SQR_TOOM3_THRESHOLD 77
#define SQR_TOOM4_THRESHOLD 136
-#define SQR_TOOM6_THRESHOLD 180
-#define SQR_TOOM8_THRESHOLD 248
+#define SQR_TOOM6_THRESHOLD 173
+#define SQR_TOOM8_THRESHOLD 260
+
+#define MULMID_TOOM42_THRESHOLD 20
#define MULMOD_BNM1_THRESHOLD 11
#define SQRMOD_BNM1_THRESHOLD 13
+#define POWM_SEC_TABLE 2,17,322,387
+
#define MUL_FFT_MODF_THRESHOLD 244 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 244, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \
@@ -161,9 +167,11 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_Q_THRESHOLD 942
#define MATRIX22_STRASSEN_THRESHOLD 13
-#define HGCD_THRESHOLD 101
-#define GCD_DC_THRESHOLD 330
-#define GCDEXT_DC_THRESHOLD 222
+#define HGCD_THRESHOLD 105
+#define HGCD_APPR_THRESHOLD 111
+#define HGCD_REDUCE_THRESHOLD 1437
+#define GCD_DC_THRESHOLD 318
+#define GCDEXT_DC_THRESHOLD 214
#define JACOBI_BASE_METHOD 2
#define GET_STR_DC_THRESHOLD 16
diff --git a/mpn/alpha/ev6/gmp-mparam.h b/mpn/alpha/ev6/gmp-mparam.h
index 12c3891d7..ce865f4cc 100644
--- a/mpn/alpha/ev6/gmp-mparam.h
+++ b/mpn/alpha/ev6/gmp-mparam.h
@@ -29,38 +29,44 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */
#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1P_METHOD 2
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
#define MOD_1U_TO_MOD_1_1_THRESHOLD 2
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 6
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 30
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 4
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 16
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
#define USE_PREINV_DIVREM_1 1 /* preinv always */
+#define DIV_QR_2_PI2_THRESHOLD 8
#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 16
+#define BMOD_1_TO_MOD_1_THRESHOLD 20
#define MUL_TOOM22_THRESHOLD 35
-#define MUL_TOOM33_THRESHOLD 74
-#define MUL_TOOM44_THRESHOLD 178
-#define MUL_TOOM6H_THRESHOLD 288
-#define MUL_TOOM8H_THRESHOLD 333
+#define MUL_TOOM33_THRESHOLD 77
+#define MUL_TOOM44_THRESHOLD 184
+#define MUL_TOOM6H_THRESHOLD 228
+#define MUL_TOOM8H_THRESHOLD 288
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 75
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 101
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 110
#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 105
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 73
-#define SQR_BASECASE_THRESHOLD 5
-#define SQR_TOOM2_THRESHOLD 61
-#define SQR_TOOM3_THRESHOLD 107
-#define SQR_TOOM4_THRESHOLD 170
-#define SQR_TOOM6_THRESHOLD 309
-#define SQR_TOOM8_THRESHOLD 360
+#define SQR_BASECASE_THRESHOLD 0 /* always */
+#define SQR_TOOM2_THRESHOLD 58
+#define SQR_TOOM3_THRESHOLD 103
+#define SQR_TOOM4_THRESHOLD 172
+#define SQR_TOOM6_THRESHOLD 264
+#define SQR_TOOM8_THRESHOLD 333
+
+#define MULMID_TOOM42_THRESHOLD 52
#define MULMOD_BNM1_THRESHOLD 20
#define SQRMOD_BNM1_THRESHOLD 23
+#define POWM_SEC_TABLE 4,17,246,2388
+
#define MUL_FFT_MODF_THRESHOLD 480 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 480, 5}, { 18, 6}, { 10, 5}, { 21, 6}, \
@@ -148,19 +154,19 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_THRESHOLD 3136
#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 130
-#define MULLO_MUL_N_THRESHOLD 15604
+#define MULLO_DC_THRESHOLD 173
+#define MULLO_MUL_N_THRESHOLD 11355
-#define DC_DIV_QR_THRESHOLD 119
-#define DC_DIVAPPR_Q_THRESHOLD 390
+#define DC_DIV_QR_THRESHOLD 112
+#define DC_DIVAPPR_Q_THRESHOLD 422
#define DC_BDIV_QR_THRESHOLD 110
-#define DC_BDIV_Q_THRESHOLD 318
+#define DC_BDIV_Q_THRESHOLD 348
-#define INV_MULMOD_BNM1_THRESHOLD 75
-#define INV_NEWTON_THRESHOLD 390
-#define INV_APPR_THRESHOLD 372
+#define INV_MULMOD_BNM1_THRESHOLD 68
+#define INV_NEWTON_THRESHOLD 402
+#define INV_APPR_THRESHOLD 396
-#define BINV_NEWTON_THRESHOLD 393
+#define BINV_NEWTON_THRESHOLD 399
#define REDC_1_TO_REDC_N_THRESHOLD 110
#define MU_DIV_QR_THRESHOLD 1718
@@ -170,12 +176,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_Q_THRESHOLD 1652
#define MATRIX22_STRASSEN_THRESHOLD 17
-#define HGCD_THRESHOLD 282
-#define GCD_DC_THRESHOLD 1138
-#define GCDEXT_DC_THRESHOLD 773
+#define HGCD_THRESHOLD 278
+#define HGCD_APPR_THRESHOLD 366
+#define HGCD_REDUCE_THRESHOLD 2681
+#define GCD_DC_THRESHOLD 1258
+#define GCDEXT_DC_THRESHOLD 777
#define JACOBI_BASE_METHOD 3
-#define GET_STR_DC_THRESHOLD 14
-#define GET_STR_PRECOMPUTE_THRESHOLD 19
-#define SET_STR_DC_THRESHOLD 3754
-#define SET_STR_PRECOMPUTE_THRESHOLD 8097
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 25
+#define SET_STR_DC_THRESHOLD 3539
+#define SET_STR_PRECOMPUTE_THRESHOLD 7784
diff --git a/mpn/asm-defs.m4 b/mpn/asm-defs.m4
index 4f049b21b..b95cad7c0 100644
--- a/mpn/asm-defs.m4
+++ b/mpn/asm-defs.m4
@@ -1471,6 +1471,7 @@ define_mpn(sub_n)
define_mpn(sub_nc)
define_mpn(submul_1)
define_mpn(submul_1c)
+define_mpn(tabselect)
define_mpn(umul_ppmm)
define_mpn(umul_ppmm_r)
define_mpn(udiv_qrnnd)
@@ -1712,6 +1713,22 @@ m4_assert_numargs(1)
)
+dnl Usage: ABI_SUPPORT(abi)
+dnl
+dnl A dummy macro which is grepped for by ./configure to know what ABIs
+dnl are supported in an asm file.
+dnl
+dnl If multiple non-standard ABIs are supported, several ABI_SUPPORT
+dnl declarations should be used:
+dnl
+dnl ABI_SUPPORT(FOOABI)
+dnl ABI_SUPPORT(BARABI)
+
+define(ABI_SUPPORT,
+m4_assert_numargs(1)
+)
+
+
dnl Usage: GMP_NUMB_MASK
dnl
dnl A bit mask for the number part of a limb. Eg. with 6 bit nails in a
diff --git a/mpn/generic/gcd_subdiv_step.c b/mpn/generic/gcd_subdiv_step.c
index 11c00bb6a..3db34073c 100644
--- a/mpn/generic/gcd_subdiv_step.c
+++ b/mpn/generic/gcd_subdiv_step.c
@@ -185,7 +185,7 @@ mpn_gcd_subdiv_step (mp_ptr ap, mp_ptr bp, mp_size_t n, mp_size_t s,
}
else
MPN_COPY (bp, ap, an);
-
+
MPN_DECR_U (tp, qn, 1);
}
diff --git a/mpn/generic/hgcd_appr.c b/mpn/generic/hgcd_appr.c
index 963eaea47..f7c7eb2c9 100644
--- a/mpn/generic/hgcd_appr.c
+++ b/mpn/generic/hgcd_appr.c
@@ -25,172 +25,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp-impl.h"
#include "longlong.h"
-/* Computes R -= A * B. Result must be non-negative. Normalized down
- to size an, and resulting size is returned. */
-static mp_size_t
-submul (mp_ptr rp, mp_size_t rn,
- mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn)
-{
- mp_ptr tp;
- TMP_DECL;
-
- ASSERT (bn > 0);
- ASSERT (an >= bn);
- ASSERT (rn >= an);
- ASSERT (an + bn <= rn + 1);
-
- TMP_MARK;
- tp = TMP_ALLOC_LIMBS (an + bn);
-
- mpn_mul (tp, ap, an, bp, bn);
- if (an + bn > rn)
- {
- ASSERT (tp[rn] == 0);
- bn--;
- }
- ASSERT_NOCARRY (mpn_sub (rp, rp, rn, tp, an + bn));
- TMP_FREE;
-
- while (rn > an && (rp[rn-1] == 0))
- rn--;
-
- return rn;
-}
-
-/* Computes (a, b) <-- M^{-1} (a; b) */
-/* FIXME:
- x Take scratch parameter, and figure out scratch need.
-
- x Use some fallback for small M->n?
-*/
-static mp_size_t
-hgcd_matrix_apply (const struct hgcd_matrix *M,
- mp_ptr ap, mp_ptr bp,
- mp_size_t n)
-{
- mp_size_t an, bn, un, vn, nn;
- mp_size_t mn[2][2];
- mp_size_t modn;
- mp_ptr tp, sp, scratch;
- mp_limb_t cy;
- unsigned i, j;
-
- TMP_DECL;
-
- ASSERT ( (ap[n-1] | bp[n-1]) > 0);
-
- an = n;
- MPN_NORMALIZE (ap, an);
- bn = n;
- MPN_NORMALIZE (bp, bn);
-
- for (i = 0; i < 2; i++)
- for (j = 0; j < 2; j++)
- {
- mp_size_t k;
- k = M->n;
- MPN_NORMALIZE (M->p[i][j], k);
- mn[i][j] = k;
- }
-
- ASSERT (mn[0][0] > 0);
- ASSERT (mn[1][1] > 0);
- ASSERT ( (mn[0][1] | mn[1][0]) > 0);
-
- TMP_MARK;
-
- if (mn[0][1] == 0)
- {
- mp_size_t qn;
-
- /* A unchanged, M = (1, 0; q, 1) */
- ASSERT (mn[0][0] == 1);
- ASSERT (M->p[0][0][0] == 1);
- ASSERT (mn[1][1] == 1);
- ASSERT (M->p[1][1][0] == 1);
-
- /* Put B <-- B - q A */
- nn = submul (bp, bn, ap, an, M->p[1][0], mn[1][0]);
- }
- else if (mn[1][0] == 0)
- {
- /* B unchanged, M = (1, q; 0, 1) */
- ASSERT (mn[0][0] == 1);
- ASSERT (M->p[0][0][0] == 1);
- ASSERT (mn[1][1] == 1);
- ASSERT (M->p[1][1][0] == 1);
-
- /* Put A <-- A - q * B */
- nn = submul (ap, an, bp, bn, M->p[0][1], mn[0][1]);
- }
- else
- {
- /* A = m00 a + m01 b ==> a <= A / m00, b <= A / m01.
- B = m10 a + m11 b ==> a <= B / m10, b <= B / m11. */
- un = MIN (an - mn[0][0], bn - mn[1][0]) + 1;
- vn = MIN (an - mn[0][1], bn - mn[1][1]) + 1;
-
- nn = MAX (un, vn);
- /* In the range of interest, mulmod_bnm1 should always beat mullo. */
- modn = mpn_mulmod_bnm1_next_size (nn + 1);
-
- scratch = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (modn, modn, M->n));
- tp = TMP_ALLOC_LIMBS (modn);
- sp = TMP_ALLOC_LIMBS (modn);
-
- ASSERT (n <= 2*modn);
-
- if (n > modn)
- {
- cy = mpn_add (ap, ap, modn, ap + modn, n - modn);
- MPN_INCR_U (ap, modn, cy);
-
- cy = mpn_add (bp, bp, modn, bp + modn, n - modn);
- MPN_INCR_U (bp, modn, cy);
-
- n = modn;
- }
-
- mpn_mulmod_bnm1 (tp, modn, ap, n, M->p[1][1], mn[1][1], scratch);
- mpn_mulmod_bnm1 (sp, modn, bp, n, M->p[0][1], mn[0][1], scratch);
-
- /* FIXME: Handle the small n case in some better way. */
- if (n + mn[1][1] < modn)
- MPN_ZERO (tp + n + mn[1][1], modn - n - mn[1][1]);
- if (n + mn[0][1] < modn)
- MPN_ZERO (sp + n + mn[0][1], modn - n - mn[0][1]);
-
- cy = mpn_sub_n (tp, tp, sp, modn);
- MPN_DECR_U (tp, modn, cy);
-
- ASSERT (mpn_zero_p (tp + nn, modn - nn));
-
- mpn_mulmod_bnm1 (sp, modn, ap, n, M->p[1][0], mn[1][0], scratch);
- MPN_COPY (ap, tp, nn);
- mpn_mulmod_bnm1 (tp, modn, bp, n, M->p[0][0], mn[0][0], scratch);
-
- if (n + mn[1][0] < modn)
- MPN_ZERO (sp + n + mn[1][0], modn - n - mn[1][0]);
- if (n + mn[0][0] < modn)
- MPN_ZERO (tp + n + mn[0][0], modn - n - mn[0][0]);
-
- cy = mpn_sub_n (tp, tp, sp, modn);
- MPN_DECR_U (tp, modn, cy);
-
- ASSERT (mpn_zero_p (tp + nn, modn - nn));
- MPN_COPY (bp, tp, nn);
-
- while ( (ap[nn-1] | bp[nn-1]) == 0)
- {
- nn--;
- ASSERT (nn > 0);
- }
- }
- TMP_FREE;
-
- return nn;
-}
-
/* Identical to mpn_hgcd_itch. FIXME: Do we really need to add
HGCD_THRESHOLD at the end? */
mp_size_t
@@ -238,7 +72,7 @@ mpn_hgcd_appr (mp_ptr ap, mp_ptr bp, mp_size_t n,
we discard some of the least significant limbs, we must keep one
additional bit to account for the truncation error. We maintain
the GMP_NUMB_BITS * s - extra_bits as the current target size. */
-
+
s = n/2 + 1;
if (BELOW_THRESHOLD (n, HGCD_APPR_THRESHOLD))
{
@@ -321,7 +155,7 @@ mpn_hgcd_appr (mp_ptr ap, mp_ptr bp, mp_size_t n,
ASSERT (n <= 2*s);
nn = mpn_hgcd_step (n, ap, bp, s, M, tp);
-
+
if (!nn)
return 1;
@@ -347,13 +181,12 @@ mpn_hgcd_appr (mp_ptr ap, mp_ptr bp, mp_size_t n,
{
mp_size_t n2 = (3*n)/4 + 1;
mp_size_t p = n/2;
- mp_size_t input_n = n;
+ mp_size_t nn;
- MPN_COPY (tp, ap + p, n - p);
- MPN_COPY (tp + n - p, bp + p, n - p);
- if (mpn_hgcd_appr (tp, tp + n - p, n - p, M, tp + 2*(n-p)))
+ nn = mpn_hgcd_reduce (M, ap, bp, n, p, tp);
+ if (nn)
{
- n = hgcd_matrix_apply (M, ap, bp, n);
+ n = nn;
/* FIXME: Discard some of the low limbs immediately? */
success = 1;
}
@@ -416,7 +249,7 @@ mpn_hgcd_appr (mp_ptr ap, mp_ptr bp, mp_size_t n,
ASSERT (n <= 2*s);
nn = mpn_hgcd_step (n, ap, bp, s, M, tp);
-
+
if (!nn)
return success;
diff --git a/mpn/generic/hgcd_jacobi.c b/mpn/generic/hgcd_jacobi.c
index 2dce43b99..0d4cb021c 100644
--- a/mpn/generic/hgcd_jacobi.c
+++ b/mpn/generic/hgcd_jacobi.c
@@ -26,7 +26,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "longlong.h"
/* This file is almost a copy of hgcd.c, with some added calls to
- mpn_jacobi_update */
+ mpn_jacobi_update */
struct hgcd_jacobi_ctx
{
@@ -127,7 +127,7 @@ hgcd_jacobi_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s,
struct hgcd_jacobi_ctx ctx;
ctx.M = M;
ctx.bitsp = bitsp;
-
+
return mpn_gcd_subdiv_step (ap, bp, n, s, hgcd_jacobi_hook, &ctx, tp);
}
}
diff --git a/mpn/generic/hgcd_reduce.c b/mpn/generic/hgcd_reduce.c
index 142d44a30..89240af4d 100644
--- a/mpn/generic/hgcd_reduce.c
+++ b/mpn/generic/hgcd_reduce.c
@@ -38,7 +38,7 @@ submul (mp_ptr rp, mp_size_t rn,
ASSERT (an >= bn);
ASSERT (rn >= an);
ASSERT (an + bn <= rn + 1);
-
+
TMP_MARK;
tp = TMP_ALLOC_LIMBS (an + bn);
@@ -61,7 +61,7 @@ submul (mp_ptr rp, mp_size_t rn,
/* FIXME:
x Take scratch parameter, and figure out scratch need.
- x Use some fallback for small M->n?
+ x Use some fallback for small M->n?
*/
static mp_size_t
hgcd_matrix_apply (const struct hgcd_matrix *M,
@@ -83,7 +83,7 @@ hgcd_matrix_apply (const struct hgcd_matrix *M,
MPN_NORMALIZE (ap, an);
bn = n;
MPN_NORMALIZE (bp, bn);
-
+
for (i = 0; i < 2; i++)
for (j = 0; j < 2; j++)
{
@@ -102,7 +102,7 @@ hgcd_matrix_apply (const struct hgcd_matrix *M,
if (mn[0][1] == 0)
{
mp_size_t qn;
-
+
/* A unchanged, M = (1, 0; q, 1) */
ASSERT (mn[0][0] == 1);
ASSERT (M->p[0][0][0] == 1);
@@ -121,7 +121,7 @@ hgcd_matrix_apply (const struct hgcd_matrix *M,
ASSERT (M->p[1][1][0] == 1);
/* Put A <-- A - q * B */
- nn = submul (ap, an, bp, bn, M->p[0][1], mn[0][1]);
+ nn = submul (ap, an, bp, bn, M->p[0][1], mn[0][1]);
}
else
{
@@ -159,7 +159,7 @@ hgcd_matrix_apply (const struct hgcd_matrix *M,
MPN_ZERO (tp + n + mn[1][1], modn - n - mn[1][1]);
if (n + mn[0][1] < modn)
MPN_ZERO (sp + n + mn[0][1], modn - n - mn[0][1]);
-
+
cy = mpn_sub_n (tp, tp, sp, modn);
MPN_DECR_U (tp, modn, cy);
@@ -209,7 +209,7 @@ mpn_hgcd_reduce_itch (mp_size_t n, mp_size_t p)
itch = 2*(n-p) + mpn_hgcd_itch (n-p);
/* Currently, hgcd_matrix_apply allocates its own storage. */
}
- return itch;
+ return itch;
}
/* FIXME: Document storage need. */
diff --git a/mpn/generic/hgcd_step.c b/mpn/generic/hgcd_step.c
index 0e56be39e..dbc757935 100644
--- a/mpn/generic/hgcd_step.c
+++ b/mpn/generic/hgcd_step.c
@@ -112,7 +112,7 @@ mpn_hgcd_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s,
/* Multiply M1^{-1} (a;b) */
return mpn_matrix22_mul1_inverse_vector (&M1, ap, tp, bp, n);
}
-
+
subtract:
return mpn_gcd_subdiv_step (ap, bp, n, s, hgcd_hook, M, tp);
diff --git a/mpn/generic/powm.c b/mpn/generic/powm.c
index 57edfd4f6..fa92362ad 100644
--- a/mpn/generic/powm.c
+++ b/mpn/generic/powm.c
@@ -6,7 +6,7 @@
SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
+Copyright 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@@ -74,6 +74,16 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp-impl.h"
#include "longlong.h"
+#undef MPN_REDC_1
+#define MPN_REDC_1(rp, up, mp, n, invm) \
+ do { \
+ mp_limb_t cy; \
+ mpn_redc_1 (up, mp, n, invm); \
+ cy = mpn_add_n (rp, up + n, up, n); \
+ if (cy != 0) \
+ mpn_sub_n (rp, rp, mp, n); \
+ } while (0)
+
#if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
#define WANT_REDC_2 1
#endif
@@ -212,12 +222,12 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
mpn_sqr (tp, this_pp, n);
#if WANT_REDC_2
if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
- mpn_redc_1 (rp, tp, mp, n, mip[0]);
+ MPN_REDC_1 (rp, tp, mp, n, mip[0]);
else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
mpn_redc_2 (rp, tp, mp, n, mip);
#else
if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
- mpn_redc_1 (rp, tp, mp, n, mip[0]);
+ MPN_REDC_1 (rp, tp, mp, n, mip[0]);
#endif
else
mpn_redc_n (rp, tp, mp, n, mip);
@@ -229,12 +239,12 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
this_pp += n;
#if WANT_REDC_2
if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
- mpn_redc_1 (this_pp, tp, mp, n, mip[0]);
+ MPN_REDC_1 (this_pp, tp, mp, n, mip[0]);
else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
mpn_redc_2 (this_pp, tp, mp, n, mip);
#else
if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
- mpn_redc_1 (this_pp, tp, mp, n, mip[0]);
+ MPN_REDC_1 (this_pp, tp, mp, n, mip[0]);
#endif
else
mpn_redc_n (this_pp, tp, mp, n, mip);
@@ -309,7 +319,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
INNERLOOP;
}
else
@@ -319,7 +329,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
INNERLOOP;
}
}
@@ -380,7 +390,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
INNERLOOP;
}
else
@@ -390,7 +400,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
INNERLOOP;
}
}
@@ -401,7 +411,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n)
#define MPN_SQR(r,a,n) mpn_sqr (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
INNERLOOP;
}
else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
@@ -440,7 +450,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
INNERLOOP;
}
else
@@ -450,7 +460,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
INNERLOOP;
}
}
@@ -501,7 +511,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
INNERLOOP;
}
else
@@ -511,7 +521,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
INNERLOOP;
}
}
@@ -522,7 +532,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n)
#define MPN_SQR(r,a,n) mpn_sqr (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
INNERLOOP;
}
else
@@ -545,12 +555,12 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#if WANT_REDC_2
if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
- mpn_redc_1 (rp, tp, mp, n, mip[0]);
+ MPN_REDC_1 (rp, tp, mp, n, mip[0]);
else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
mpn_redc_2 (rp, tp, mp, n, mip);
#else
if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
- mpn_redc_1 (rp, tp, mp, n, mip[0]);
+ MPN_REDC_1 (rp, tp, mp, n, mip[0]);
#endif
else
mpn_redc_n (rp, tp, mp, n, mip);
diff --git a/mpn/generic/powm_sec.c b/mpn/generic/powm_sec.c
index 315ae6e5e..24bb83de3 100644
--- a/mpn/generic/powm_sec.c
+++ b/mpn/generic/powm_sec.c
@@ -7,7 +7,7 @@
SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2007, 2008, 2009 Free Software Foundation, Inc.
+Copyright 2007, 2008, 2009, 2011 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@@ -56,6 +56,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define WANT_CACHE_SECURITY 1
+#undef MPN_REDC_1_SEC
+#define MPN_REDC_1_SEC(rp, up, mp, n, invm) \
+ do { \
+ mp_limb_t cy; \
+ mpn_redc_1 (up, mp, n, invm); \
+ cy = mpn_add_n (rp, up + n, up, n); \
+ mpn_subcnd_n (rp, rp, mp, n, cy); \
+ } while (0)
/* Define our own mpn squaring function. We do this since we cannot use a
native mpn_sqr_basecase over TUNE_SQR_TOOM2_MAX, or a non-native one over
@@ -125,8 +133,6 @@ mpn_local_sqr (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr tp)
if (n > 1)
{
mp_limb_t cy;
- TMP_DECL;
- TMP_MARK;
cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]);
tp[n - 1] = cy;
@@ -148,8 +154,6 @@ mpn_local_sqr (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr tp)
#endif
rp[2 * n - 1] += cy;
}
-
- TMP_FREE;
}
}
#endif
@@ -181,36 +185,46 @@ getbits (const mp_limb_t *p, mp_bitcnt_t bi, int nbits)
}
}
+#ifndef POWM_SEC_TABLE
+#if GMP_NUMB_BITS < 50
+#define POWM_SEC_TABLE 2,33,96,780,2741
+#else
+#define POWM_SEC_TABLE 2,130,524,2578
+#endif
+#endif
+
+#if TUNE_PROGRAM_BUILD
+extern int win_size (mp_bitcnt_t);
+#else
static inline int
win_size (mp_bitcnt_t eb)
{
int k;
- static mp_bitcnt_t x[] = {0,4,27,100,325,1026,2905,7848,20457,51670,~(mp_bitcnt_t)0};
+ static mp_bitcnt_t x[] = {0,POWM_SEC_TABLE,~(mp_bitcnt_t)0};
for (k = 1; eb > x[k]; k++)
;
return k;
}
+#endif
-/* Convert U to REDC form, U_r = B^n * U mod M */
+/* Convert U to REDC form, U_r = B^n * U mod M.
+ Uses scratch space at tp of 2un + n + 1 limbs. */
static void
redcify (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr mp, mp_size_t n, mp_ptr tp)
{
mp_ptr qp;
- TMP_DECL;
- TMP_MARK;
- qp = tp + un + n;
+ qp = tp + un + n; /* un + n - n + 1 = un + 1 limbs */
MPN_ZERO (tp, n);
MPN_COPY (tp + n, up, un);
mpn_tdiv_qr (qp, rp, 0L, tp, un + n, mp, n);
- TMP_FREE;
}
/* rp[n-1..0] = bp[bn-1..0] ^ ep[en-1..0] mod mp[n-1..0]
- Requires that mp[n-1..0] is odd. FIXME: is this true?
- Requires that ep[en-1..0] is > 1.
- Uses scratch space at tp of 3n+1 limbs. */
+ Requires that mp[n-1..0] is odd.
+ Requires that ep[en-1..0] > 1.
+ Uses scratch space at tp as defined by mpn_powm_sec_itch. */
void
mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
mp_srcptr ep, mp_size_t en,
@@ -224,13 +238,10 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
mp_ptr pp, this_pp;
long i;
int cnd;
- TMP_DECL;
ASSERT (en > 1 || (en == 1 && ep[0] > 0));
ASSERT (n >= 1 && ((mp[0] & 1) != 0));
- TMP_MARK;
-
count_leading_zeros (cnt, ep[en - 1]);
ebi = (mp_bitcnt_t) en * GMP_LIMB_BITS - cnt;
@@ -239,20 +250,32 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
binvert_limb (minv, mp[0]);
minv = -minv;
- pp = tp + 4 * n;
+ pp = tp;
+ tp += (n << windowsize); /* put tp after power table */
+ /* Compute pp[0] table entry */
+ /* scratch: | n | 1 | n+3 | */
+ /* | pp[0] | 1 | redcify | */
this_pp = pp;
this_pp[n] = 1;
- redcify (this_pp, this_pp + n, 1, mp, n, tp + 6 * n);
+ redcify (this_pp, this_pp + n, 1, mp, n, this_pp + n + 1);
this_pp += n;
- redcify (this_pp, bp, bn, mp, n, tp + 6 * n);
+
+ /* Compute pp[1] table entry. To avoid excessive scratch usage in the
+ degenerate situation where B >> M, we let redcify use scratch space which
+ will later be used by the pp table (element 2 and up). */
+ /* scratch: | n | n | 2bn + n + 1 | */
+ /* | pp[0] | pp[1] | redcify | */
+ redcify (this_pp, bp, bn, mp, n, this_pp + n);
/* Precompute powers of b and put them in the temporary area at pp. */
+ /* scratch: | n | n | ... | | 2n | */
+ /* | pp[0] | pp[1] | ... | pp[2^windowsize-1] | product | */
for (i = (1 << windowsize) - 2; i > 0; i--)
{
mpn_mul_basecase (tp, this_pp, n, pp + n, n);
this_pp += n;
- mpn_redc_1_sec (this_pp, tp, mp, n, minv);
+ MPN_REDC_1_SEC (this_pp, tp, mp, n, minv);
}
expbits = getbits (ep, ebi, windowsize);
@@ -261,8 +284,15 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
else
ebi -= windowsize;
+#if WANT_CACHE_SECURITY
+ mpn_tabselect (rp, pp, n, 1 << windowsize, expbits);
+#else
MPN_COPY (rp, pp + n * expbits, n);
+#endif
+ /* Main exponentiation loop. */
+ /* scratch: | n | n | ... | | 3n-4n | */
+ /* | pp[0] | pp[1] | ... | pp[2^windowsize-1] | loop scratch | */
while (ebi != 0)
{
expbits = getbits (ep, ebi, windowsize);
@@ -278,7 +308,7 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
do
{
mpn_local_sqr (tp, rp, n, tp + 2 * n);
- mpn_redc_1_sec (rp, tp, mp, n, minv);
+ MPN_REDC_1_SEC (rp, tp, mp, n, minv);
this_windowsize--;
}
while (this_windowsize != 0);
@@ -289,52 +319,36 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#else
mpn_mul_basecase (tp, rp, n, pp + n * expbits, n);
#endif
- mpn_redc_1_sec (rp, tp, mp, n, minv);
+ MPN_REDC_1_SEC (rp, tp, mp, n, minv);
}
MPN_COPY (tp, rp, n);
MPN_ZERO (tp + n, n);
- mpn_redc_1_sec (rp, tp, mp, n, minv);
+ MPN_REDC_1_SEC (rp, tp, mp, n, minv);
cnd = mpn_sub_n (tp, rp, mp, n); /* we need just retval */
mpn_subcnd_n (rp, rp, mp, n, !cnd);
- TMP_FREE;
}
-#if ! HAVE_NATIVE_mpn_tabselect
-/* Select entry `which' from table `tab', which has nents entries, each `n'
- limbs. Store the selected entry at rp. Reads entire table to avoid
- side-channel information leaks. O(n*nents).
- FIXME: Move to its own file. */
-void
-mpn_tabselect (volatile mp_limb_t *rp, volatile mp_limb_t *tab, mp_size_t n,
- mp_size_t nents, mp_size_t which)
-{
- mp_size_t k, i;
- mp_limb_t mask;
- volatile mp_limb_t *tp;
-
- for (k = 0; k < nents; k++)
- {
- mask = -(mp_limb_t) (which == k);
- tp = tab + n * k;
- for (i = 0; i < n; i++)
- {
- rp[i] = (rp[i] & ~mask) | (tp[i] & mask);
- }
- }
-}
-#endif
-
mp_size_t
mpn_powm_sec_itch (mp_size_t bn, mp_size_t en, mp_size_t n)
{
int windowsize;
mp_size_t redcify_itch, itch;
- windowsize = win_size (en * GMP_NUMB_BITS); /* slight over-estimate of exp */
- itch = 4 * n + (n << windowsize);
- redcify_itch = 2 * bn + n + 1;
- /* The 6n is due to the placement of reduce scratch 6n into the start of the
- scratch area. */
- return MAX (itch, redcify_itch + 6 * n);
+ /* The top scratch usage will either be when reducing B in the 2nd redcify
+ call, or more typically n*2^windowsize + 3n or 4n, in the main loop. (It
+ is 3n or 4n depending on whether we use mpn_local_sqr or a native
+ mpn_sqr_basecase. We assume 4n always for now.) */
+
+ windowsize = win_size (en * GMP_LIMB_BITS); /* slight over-estimate of exp */
+
+ /* The 2n term is due to pp[0] and pp[1] at the time of the 2nd redcify call,
+ the 2bn + n + 1 term is due to redcify's own usage. */
+ redcify_itch = (2 * n) + (2 * bn + n + 1);
+
+ /* The n * 2^windowsize term is due to the power table, the 4n term is due to
+ scratch needs of squaring/multiplication in the exponentiation loop. */
+ itch = (n << windowsize) + (4 * n);
+
+ return MAX (itch, redcify_itch);
}
diff --git a/mpn/generic/redc_1.c b/mpn/generic/redc_1.c
index 177f3932f..3567414eb 100644
--- a/mpn/generic/redc_1.c
+++ b/mpn/generic/redc_1.c
@@ -25,7 +25,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp-impl.h"
void
-mpn_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm)
+mpn_redc_1 (mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm)
{
mp_size_t j;
mp_limb_t cy;
@@ -40,7 +40,4 @@ mpn_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm)
up[0] = cy;
up++;
}
- cy = mpn_add_n (rp, up, up - n, n);
- if (cy != 0)
- mpn_sub_n (rp, rp, mp, n);
}
diff --git a/mpn/generic/redc_1_sec.c b/mpn/generic/tabselect.c
index 3d914381c..02e52fdc0 100644
--- a/mpn/generic/redc_1_sec.c
+++ b/mpn/generic/tabselect.c
@@ -1,10 +1,9 @@
-/* mpn_redc_1_sec. Set cp[] <- up[]/R^n mod mp[]. Clobber up[].
- mp[] is n limbs; up[] is 2n limbs.
+/* mpn_tabselect.
THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY
SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
-Copyright (C) 2000, 2001, 2002, 2004, 2008, 2009 Free Software Foundation, Inc.
+Copyright 2007, 2008, 2009, 2011 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@@ -24,22 +23,26 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
+
+/* Select entry `which' from table `tab', which has nents entries, each `n'
+ limbs. Store the selected entry at rp. Reads entire table to avoid
+ side-channel information leaks. O(n*nents). */
void
-mpn_redc_1_sec (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm)
+mpn_tabselect (volatile mp_limb_t *rp, volatile mp_limb_t *tab, mp_size_t n,
+ mp_size_t nents, mp_size_t which)
{
- mp_size_t j;
- mp_limb_t cy;
-
- ASSERT (n > 0);
- ASSERT_MPN (up, 2*n);
+ mp_size_t k, i;
+ mp_limb_t mask;
+ volatile mp_limb_t *tp;
- for (j = n - 1; j >= 0; j--)
+ for (k = 0; k < nents; k++)
{
- cy = mpn_addmul_1 (up, mp, n, (up[0] * invm) & GMP_NUMB_MASK);
- ASSERT (up[0] == 0);
- up[0] = cy;
- up++;
+ mask = -(mp_limb_t) (which == k);
+ tp = tab + n * k;
+ for (i = 0; i < n; i++)
+ {
+ rp[i] = (rp[i] & ~mask) | (tp[i] & mask);
+ }
}
- cy = mpn_add_n (rp, up, up - n, n);
- mpn_subcnd_n (rp, rp, mp, n, cy);
}
diff --git a/mpn/generic/udiv_w_sdiv.c b/mpn/generic/udiv_w_sdiv.c
index c01f95847..ceefd1b5f 100644
--- a/mpn/generic/udiv_w_sdiv.c
+++ b/mpn/generic/udiv_w_sdiv.c
@@ -9,7 +9,7 @@
GNU MP RELEASE.
-Copyright 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+Copyright 1992, 1994, 1996, 2000, 2011 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@@ -116,12 +116,12 @@ mpn_udiv_w_sdiv (rp, a1, a0, d)
{ /* Hence a1 = d - 1 = 2*b1 - 1 */
if (a0 >= -d)
{
- q = -1;
+ q = -CNST_LIMB(1);
r = a0 + d;
}
else
{
- q = -2;
+ q = -CNST_LIMB(2);
r = a0 + 2*d;
}
}
diff --git a/mpn/ia64/gmp-mparam.h b/mpn/ia64/gmp-mparam.h
index 0841c82aa..77e02f518 100644
--- a/mpn/ia64/gmp-mparam.h
+++ b/mpn/ia64/gmp-mparam.h
@@ -1,6 +1,6 @@
/* gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2009, 2010 Free Software
+Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2009, 2010, 2011 Free Software
Foundation, Inc.
This file is part of the GNU MP Library.
@@ -21,70 +21,94 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define GMP_LIMB_BITS 64
#define BYTES_PER_MP_LIMB 8
-/* 1300MHz Itanium2 (babe.fsffrance.org) */
-
+/* 900MHz Itanium2 (titanic.gmplib.org) */
+#define MOD_1_1P_METHOD 2
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 8
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 21
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 22
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 26
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD 12
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
#define MUL_TOOM22_THRESHOLD 40
-#define MUL_TOOM33_THRESHOLD 122
-#define MUL_TOOM44_THRESHOLD 212
+#define MUL_TOOM33_THRESHOLD 129
+#define MUL_TOOM44_THRESHOLD 214
#define MUL_TOOM6H_THRESHOLD 318
#define MUL_TOOM8H_THRESHOLD 430
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 93
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 146
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 129
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 145
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 126
#define MUL_TOOM42_TO_TOOM63_THRESHOLD 151
#define SQR_BASECASE_THRESHOLD 11
#define SQR_TOOM2_THRESHOLD 84
-#define SQR_TOOM3_THRESHOLD 125
+#define SQR_TOOM3_THRESHOLD 135
#define SQR_TOOM4_THRESHOLD 494
-#define SQR_TOOM6_THRESHOLD 0 /* never toom4 */
-#define SQR_TOOM8_THRESHOLD 0 /* never toom6 */
+#define SQR_TOOM6_THRESHOLD 0 /* always */
+#define SQR_TOOM8_THRESHOLD 0 /* always */
+
+#define MULMID_TOOM42_THRESHOLD 98
#define MULMOD_BNM1_THRESHOLD 23
-#define SQRMOD_BNM1_THRESHOLD 25
+#define SQRMOD_BNM1_THRESHOLD 28
+
+#define POWM_SEC_TABLE 2,29,130,905
-#define MUL_FFT_MODF_THRESHOLD 444 /* k = 5 */
+#define MUL_FFT_MODF_THRESHOLD 476 /* k = 5 */
#define MUL_FFT_TABLE3 \
- { { 444, 5}, { 27, 6}, { 14, 5}, { 29, 6}, \
- { 35, 7}, { 18, 6}, { 37, 7}, { 19, 6}, \
+ { { 476, 5}, { 27, 6}, { 14, 5}, { 29, 6}, \
+ { 33, 7}, { 17, 6}, { 37, 7}, { 19, 6}, \
{ 39, 7}, { 21, 6}, { 43, 7}, { 33, 8}, \
{ 17, 7}, { 37, 8}, { 19, 7}, { 39, 8}, \
- { 21, 7}, { 43, 8}, { 29, 9}, { 15, 8}, \
- { 37, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \
- { 49, 9}, { 27, 8}, { 57, 9}, { 31, 8}, \
- { 63, 9}, { 35, 8}, { 71, 9}, { 43,10}, \
+ { 21, 7}, { 43, 8}, { 37, 9}, { 19, 8}, \
+ { 43, 9}, { 23, 8}, { 51, 9}, { 27, 8}, \
+ { 57, 9}, { 31, 8}, { 63, 9}, { 43,10}, \
{ 23, 9}, { 59,10}, { 31, 9}, { 71,10}, \
- { 39, 9}, { 87,10}, { 47, 9}, { 99,10}, \
+ { 39, 9}, { 83,10}, { 47, 9}, { 99,10}, \
{ 55,11}, { 31,10}, { 87,11}, { 47,10}, \
{ 111,12}, { 31,11}, { 63,10}, { 143,11}, \
{ 79,10}, { 167,11}, { 95,10}, { 191,11}, \
{ 111,12}, { 63,11}, { 143,10}, { 287, 9}, \
{ 575,10}, { 303,11}, { 159,10}, { 319,12}, \
{ 95,11}, { 191,10}, { 399,11}, { 207,10}, \
- { 431,13}, { 8192,14}, { 16384,15}, { 32768,16}, \
- { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
- {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 76
+ { 431,13}, { 63,12}, { 127,11}, { 271,10}, \
+ { 543,11}, { 287,10}, { 575,11}, { 303,12}, \
+ { 159,11}, { 335,10}, { 671,11}, { 367,12}, \
+ { 191,11}, { 399,10}, { 799,11}, { 431,12}, \
+ { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
+ { 543,12}, { 287,11}, { 607,12}, { 319,11}, \
+ { 671,12}, { 351,11}, { 703,13}, { 191,12}, \
+ { 415,11}, { 863,12}, { 447,14}, { 127,13}, \
+ { 255,12}, { 607,13}, { 319,12}, { 735,13}, \
+ { 383,12}, { 799,11}, { 1599,12}, { 863,13}, \
+ { 447,12}, { 927,11}, { 1855,14}, { 255,13}, \
+ { 511,12}, { 1055,13}, { 575,12}, { 1215,13}, \
+ { 639,12}, { 1279,13}, { 703,14}, { 383,13}, \
+ { 767,12}, { 1535,13}, { 831,12}, { 1663,13}, \
+ { 895,12}, { 1791,15}, { 255,14}, { 511,13}, \
+ { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1471,14}, { 767,13}, \
+ { 1599,12}, { 3199,13}, { 1663,14}, { 895,13}, \
+ { 1855,15}, { 511,14}, { 1023,13}, { 2175,14}, \
+ { 1151,13}, { 2431,14}, { 1279,13}, { 2687,14}, \
+ { 1407,15}, { 767,14}, { 1535,13}, { 3199,14}, \
+ { 1663,13}, { 3455,14}, { 1791,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 155
#define MUL_FFT_THRESHOLD 5760
-#define SQR_FFT_MODF_THRESHOLD 440 /* k = 5 */
+#define SQR_FFT_MODF_THRESHOLD 436 /* k = 5 */
#define SQR_FFT_TABLE3 \
- { { 440, 5}, { 14, 4}, { 29, 5}, { 29, 6}, \
- { 15, 5}, { 31, 6}, { 35, 7}, { 18, 6}, \
- { 37, 7}, { 33, 8}, { 17, 7}, { 37, 8}, \
+ { { 436, 5}, { 14, 4}, { 29, 5}, { 31, 6}, \
+ { 35, 7}, { 18, 6}, { 37, 7}, { 37, 8}, \
{ 19, 7}, { 40, 8}, { 37, 9}, { 19, 8}, \
{ 43, 9}, { 23, 8}, { 49, 9}, { 27, 8}, \
{ 57, 9}, { 43,10}, { 23, 9}, { 55,10}, \
@@ -93,45 +117,69 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
{ 87,11}, { 47,10}, { 111,12}, { 31,11}, \
{ 63,10}, { 135,11}, { 79,10}, { 167,11}, \
{ 95,10}, { 191,11}, { 111,12}, { 63,11}, \
- { 127,10}, { 255,11}, { 143,10}, { 303,11}, \
- { 159,10}, { 319,12}, { 95,11}, { 191,10}, \
- { 399,11}, { 207,10}, { 431,13}, { 8192,14}, \
- { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
- { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
- {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 66
+ { 127,10}, { 255,11}, { 143,10}, { 287, 9}, \
+ { 575,10}, { 303,11}, { 159,10}, { 319,12}, \
+ { 95,11}, { 191,10}, { 399,11}, { 207,10}, \
+ { 431,13}, { 63,12}, { 127,11}, { 271,10}, \
+ { 543,11}, { 303,12}, { 159,11}, { 335,10}, \
+ { 671,11}, { 367,10}, { 735,12}, { 191,11}, \
+ { 399,10}, { 799,11}, { 431,12}, { 223,11}, \
+ { 463,13}, { 127,12}, { 255,11}, { 543,12}, \
+ { 287,11}, { 607,12}, { 319,11}, { 671,12}, \
+ { 351,11}, { 735,13}, { 191,12}, { 383,11}, \
+ { 799,12}, { 415,11}, { 863,12}, { 447,11}, \
+ { 895,14}, { 127,13}, { 255,12}, { 543,11}, \
+ { 1087,12}, { 607,13}, { 319,12}, { 735,13}, \
+ { 383,12}, { 863,13}, { 447,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \
+ { 1183,13}, { 639,12}, { 1279,13}, { 703,12}, \
+ { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \
+ { 831,12}, { 1663,13}, { 895,12}, { 1791,13}, \
+ { 959,15}, { 255,14}, { 511,13}, { 1087,12}, \
+ { 2175,13}, { 1215,14}, { 639,13}, { 1343,12}, \
+ { 2687,13}, { 1471,14}, { 767,13}, { 1663,14}, \
+ { 895,13}, { 1919,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2431,14}, { 1279,13}, \
+ { 2687,14}, { 1407,15}, { 767,14}, { 1535,13}, \
+ { 3199,14}, { 1663,13}, { 3455,14}, { 1791,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 151
#define SQR_FFT_THRESHOLD 4032
#define MULLO_BASECASE_THRESHOLD 29
#define MULLO_DC_THRESHOLD 57
#define MULLO_MUL_N_THRESHOLD 11278
-#define DC_DIV_QR_THRESHOLD 59
+#define DC_DIV_QR_THRESHOLD 64
#define DC_DIVAPPR_Q_THRESHOLD 222
#define DC_BDIV_QR_THRESHOLD 95
#define DC_BDIV_Q_THRESHOLD 264
-#define INV_MULMOD_BNM1_THRESHOLD 82
-#define INV_NEWTON_THRESHOLD 11
-#define INV_APPR_THRESHOLD 18
+#define INV_MULMOD_BNM1_THRESHOLD 86
+#define INV_NEWTON_THRESHOLD 139
+#define INV_APPR_THRESHOLD 147
#define BINV_NEWTON_THRESHOLD 252
-#define REDC_1_TO_REDC_2_THRESHOLD 0
+#define REDC_1_TO_REDC_2_THRESHOLD 0 /* always */
#define REDC_2_TO_REDC_N_THRESHOLD 147
#define MU_DIV_QR_THRESHOLD 1142
-#define MU_DIVAPPR_Q_THRESHOLD 998
+#define MU_DIVAPPR_Q_THRESHOLD 1142
#define MUPI_DIV_QR_THRESHOLD 0 /* always */
-#define MU_BDIV_QR_THRESHOLD 1187
+#define MU_BDIV_QR_THRESHOLD 1210
#define MU_BDIV_Q_THRESHOLD 1470
#define MATRIX22_STRASSEN_THRESHOLD 23
#define HGCD_THRESHOLD 117
-#define GCD_DC_THRESHOLD 469
+#define HGCD_APPR_THRESHOLD 111
+#define HGCD_REDUCE_THRESHOLD 3014
+#define GCD_DC_THRESHOLD 555
#define GCDEXT_DC_THRESHOLD 368
#define JACOBI_BASE_METHOD 4
#define GET_STR_DC_THRESHOLD 13
-#define GET_STR_PRECOMPUTE_THRESHOLD 21
-#define SET_STR_DC_THRESHOLD 1204
-#define SET_STR_PRECOMPUTE_THRESHOLD 3266
+#define GET_STR_PRECOMPUTE_THRESHOLD 22
+#define SET_STR_DC_THRESHOLD 1474
+#define SET_STR_PRECOMPUTE_THRESHOLD 3168
diff --git a/mpn/ia64/tabselect.asm b/mpn/ia64/tabselect.asm
new file mode 100644
index 000000000..cc5b49b04
--- /dev/null
+++ b/mpn/ia64/tabselect.asm
@@ -0,0 +1,139 @@
+dnl IA-64 mpn_tabselect.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: ?
+C Itanium 2: 2.5
+
+C NOTES
+C * Using software pipelining could trivially yield 2 c/l without unrolling,
+C or 1+epsilon with unrolling. (This code was modelled after the powerpc64
+C code, for simplicity.)
+
+C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp', `r32')
+define(`tp', `r33')
+define(`n', `r34')
+define(`nents', `r35')
+define(`which', `r36')
+
+define(`mask', `r8')
+
+define(`rp1', `r32')
+define(`tp1', `r33')
+define(`rp2', `r14')
+define(`tp2', `r15')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_tabselect)
+ .prologue
+ .save ar.lc, r2
+ .body
+ifdef(`HAVE_ABI_32',`
+.mmi; addp4 rp = 0, rp C M I
+ addp4 tp = 0, tp C M I
+ zxt4 n = n C I
+.mii; nop 0
+ zxt4 nents = nents C I
+ zxt4 which = which C I
+ ;;
+')
+.mmi; add rp2 = 8, rp1
+ add tp2 = 8, tp1
+ add r6 = -2, n
+ ;;
+.mmi; cmp.eq p10, p0 = 1, n
+ and r9 = 1, n C r9 = n & 1 (parity of n), tested below to set p8
+ shr.u r6 = r6, 1 C inner loop count
+ ;;
+.mmi; cmp.eq p8, p0 = 0, r9
+ sub which = nents, which
+ shl n = n, 3
+ ;;
+
+L(outer):
+.mmi cmp.eq p6, p7 = which, nents C are we at the selected table entry?
+ nop 0
+ mov ar.lc = r6 C I0
+ ;;
+.mmb;
+ (p6) mov mask = -1
+ (p7) mov mask = 0
+ (p8) br.dptk L(top) C branch to loop entry if n even
+ ;;
+
+.mmi; ld8 r16 = [tp1], 8
+ add tp2 = 8, tp2
+ nop 0
+ ;;
+.mmi; ld8 r18 = [rp1]
+ and r16 = r16, mask
+ nop 0
+ ;;
+.mmi; andcm r18 = r18, mask
+ ;;
+ or r16 = r16, r18
+ nop 0
+ ;;
+.mmb; st8 [rp1] = r16, 8
+ add rp2 = 8, rp2
+ (p10) br.dpnt L(end)
+
+ ALIGN(32)
+L(top):
+.mmi; ld8 r16 = [tp1], 16
+ ld8 r17 = [tp2], 16
+ nop 0
+ ;;
+.mmi; ld8 r18 = [rp1]
+ and r16 = r16, mask
+ nop 0
+.mmi; ld8 r19 = [rp2]
+ and r17 = r17, mask
+ nop 0
+ ;;
+.mmi; andcm r18 = r18, mask
+ andcm r19 = r19, mask
+ nop 0
+ ;;
+.mmi; or r16 = r16, r18
+ or r17 = r17, r19
+ nop 0
+ ;;
+.mmb; st8 [rp1] = r16, 16
+ st8 [rp2] = r17, 16
+ br.cloop.dptk L(top)
+ ;;
+L(end):
+.mmi; sub rp1 = rp1, n C move rp back to beginning
+ sub rp2 = rp2, n C move rp back to beginning
+ cmp.ne p9, p0 = 1, nents
+.mmb; add nents = -1, nents
+ nop 0
+ (p9) br.dptk L(outer)
+ ;;
+
+.mib; nop 0
+ nop 0
+ br.ret.sptk.many b0
+EPILOGUE()
diff --git a/mpn/pa64/gmp-mparam.h b/mpn/pa64/gmp-mparam.h
index d0e86d856..081757aca 100644
--- a/mpn/pa64/gmp-mparam.h
+++ b/mpn/pa64/gmp-mparam.h
@@ -25,14 +25,16 @@ with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIVREM_1_NORM_THRESHOLD 0 /* always */
#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1P_METHOD 2
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
#define MOD_1U_TO_MOD_1_1_THRESHOLD 10
#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 14
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11
#define USE_PREINV_DIVREM_1 1
+#define DIV_QR_2_PI2_THRESHOLD 21
#define DIVEXACT_1_THRESHOLD 0 /* always */
#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
@@ -47,16 +49,20 @@ with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MUL_TOOM42_TO_TOOM53_THRESHOLD 129
#define MUL_TOOM42_TO_TOOM63_THRESHOLD 54
-#define SQR_BASECASE_THRESHOLD 0 /* always */
-#define SQR_TOOM2_THRESHOLD 56
-#define SQR_TOOM3_THRESHOLD 169
-#define SQR_TOOM4_THRESHOLD 280
-#define SQR_TOOM6_THRESHOLD 0
-#define SQR_TOOM8_THRESHOLD 309
+#define SQR_BASECASE_THRESHOLD 5
+#define SQR_TOOM2_THRESHOLD 58
+#define SQR_TOOM3_THRESHOLD 153
+#define SQR_TOOM4_THRESHOLD 278
+#define SQR_TOOM6_THRESHOLD 0 /* always */
+#define SQR_TOOM8_THRESHOLD 0 /* always */
-#define MULMOD_BNM1_THRESHOLD 16
+#define MULMID_TOOM42_THRESHOLD 56
+
+#define MULMOD_BNM1_THRESHOLD 15
#define SQRMOD_BNM1_THRESHOLD 19
+#define POWM_SEC_TABLE 2,23,228,1084
+
#define MUL_FFT_MODF_THRESHOLD 336 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 336, 5}, { 11, 4}, { 23, 5}, { 21, 6}, \
@@ -196,34 +202,36 @@ with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_THRESHOLD 1856
#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 133
-#define MULLO_MUL_N_THRESHOLD 4292
+#define MULLO_DC_THRESHOLD 113
+#define MULLO_MUL_N_THRESHOLD 4658
-#define DC_DIV_QR_THRESHOLD 140
-#define DC_DIVAPPR_Q_THRESHOLD 422
-#define DC_BDIV_QR_THRESHOLD 150
-#define DC_BDIV_Q_THRESHOLD 351
+#define DC_DIV_QR_THRESHOLD 123
+#define DC_DIVAPPR_Q_THRESHOLD 372
+#define DC_BDIV_QR_THRESHOLD 142
+#define DC_BDIV_Q_THRESHOLD 312
-#define INV_MULMOD_BNM1_THRESHOLD 60
-#define INV_NEWTON_THRESHOLD 348
-#define INV_APPR_THRESHOLD 324
+#define INV_MULMOD_BNM1_THRESHOLD 58
+#define INV_NEWTON_THRESHOLD 315
+#define INV_APPR_THRESHOLD 315
-#define BINV_NEWTON_THRESHOLD 363
+#define BINV_NEWTON_THRESHOLD 360
#define REDC_1_TO_REDC_N_THRESHOLD 101
-#define MU_DIV_QR_THRESHOLD 998
+#define MU_DIV_QR_THRESHOLD 979
#define MU_DIVAPPR_Q_THRESHOLD 1142
-#define MUPI_DIV_QR_THRESHOLD 110
+#define MUPI_DIV_QR_THRESHOLD 93
#define MU_BDIV_QR_THRESHOLD 889
-#define MU_BDIV_Q_THRESHOLD 1334
+#define MU_BDIV_Q_THRESHOLD 1187
#define MATRIX22_STRASSEN_THRESHOLD 9
-#define HGCD_THRESHOLD 242
-#define GCD_DC_THRESHOLD 752
-#define GCDEXT_DC_THRESHOLD 545
+#define HGCD_THRESHOLD 234
+#define HGCD_APPR_THRESHOLD 300
+#define HGCD_REDUCE_THRESHOLD 1553
+#define GCD_DC_THRESHOLD 684
+#define GCDEXT_DC_THRESHOLD 525
#define JACOBI_BASE_METHOD 2
#define GET_STR_DC_THRESHOLD 21
#define GET_STR_PRECOMPUTE_THRESHOLD 24
-#define SET_STR_DC_THRESHOLD 2008
-#define SET_STR_PRECOMPUTE_THRESHOLD 4066
+#define SET_STR_DC_THRESHOLD 1951
+#define SET_STR_PRECOMPUTE_THRESHOLD 4034
diff --git a/mpn/powerpc32/aors_n.asm b/mpn/powerpc32/aors_n.asm
index f9e9b50d5..12115a9e9 100644
--- a/mpn/powerpc32/aors_n.asm
+++ b/mpn/powerpc32/aors_n.asm
@@ -19,14 +19,17 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C 603e: ?
-C 604e: ? old: 3.25
-C 75x (G3): ? old: 3.5
-C 7400,7410 (G4): 3.25
-C 744x,745x (G4+): 4
-C power4/ppc970: ? old: 2.0
-C power5: ? old: 2.5
+C cycles/limb
+C 603e: ?
+C 604e: ? old: 3.25
+C 75x (G3): ? old: 3.5
+C 7400,7410 (G4): 3.25
+C 744x,745x (G4+): 4
+C POWER3/PPC630 2
+C POWER4/PPC970 2.4
+C POWER5 2.75
+C POWER6 40-140
+C POWER7 3
C INPUT PARAMETERS
define(`rp', `r3')
diff --git a/mpn/powerpc32/p3-p7/aors_n.asm b/mpn/powerpc32/p3-p7/aors_n.asm
new file mode 100644
index 000000000..6999182a8
--- /dev/null
+++ b/mpn/powerpc32/p3-p7/aors_n.asm
@@ -0,0 +1,176 @@
+dnl PowerPC-32 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+
+dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007, 2011 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 1.5
+C POWER4/PPC970 2
+C POWER5 2
+C POWER6 2.78
+C POWER7 2.15-2.87
+
+C This code is based on powerpc64/aors_n.asm.
+
+C INPUT PARAMETERS
+C rp r3
+C up r4
+C vp r5
+C n r6
+
+ifdef(`OPERATION_add_n',`
+ define(ADDSUBC, adde)
+ define(ADDSUB, addc)
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)
+ define(GENRVAL, `addi r3, r3, 1')
+ define(SETCBR, `addic r0, $1, -1')
+ define(CLRCB, `addic r0, r0, 0')
+')
+ifdef(`OPERATION_sub_n',`
+ define(ADDSUBC, subfe)
+ define(ADDSUB, subfc)
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)
+ define(GENRVAL, `neg r3, r3')
+ define(SETCBR, `subfic r0, $1, 0')
+ define(CLRCB, `addic r0, r1, -1')
+')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+PROLOGUE(func_nc)
+ SETCBR(r7)
+ b L(ent)
+EPILOGUE()
+
+PROLOGUE(func)
+ CLRCB
+L(ent): stw r31, -4(r1)
+ stw r30, -8(r1)
+ stw r29, -12(r1)
+ stw r28, -16(r1)
+
+ rlwinm. r0, r6, 0,30,31 C r0 = n & 3, set cr0
+ cmpwi cr6, r0, 2
+ addi r6, r6, 3 C compute count...
+ srwi r6, r6, 2 C ...for ctr
+ mtctr r6 C copy count into ctr
+ beq cr0, L(b00)
+ blt cr6, L(b01)
+ beq cr6, L(b10)
+
+L(b11): lwz r8, 0(r4) C load s1 limb
+ lwz r9, 0(r5) C load s2 limb
+ lwz r10, 4(r4) C load s1 limb
+ lwz r11, 4(r5) C load s2 limb
+ lwz r12, 8(r4) C load s1 limb
+ addi r4, r4, 12
+ lwz r0, 8(r5) C load s2 limb
+ addi r5, r5, 12
+ ADDSUBC r29, r9, r8
+ ADDSUBC r30, r11, r10
+ ADDSUBC r31, r0, r12
+ stw r29, 0(r3)
+ stw r30, 4(r3)
+ stw r31, 8(r3)
+ addi r3, r3, 12
+ bdnz L(go)
+ b L(ret)
+
+L(b01): lwz r12, 0(r4) C load s1 limb
+ addi r4, r4, 4
+ lwz r0, 0(r5) C load s2 limb
+ addi r5, r5, 4
+ ADDSUBC r31, r0, r12 C add
+ stw r31, 0(r3)
+ addi r3, r3, 4
+ bdnz L(go)
+ b L(ret)
+
+L(b10): lwz r10, 0(r4) C load s1 limb
+ lwz r11, 0(r5) C load s2 limb
+ lwz r12, 4(r4) C load s1 limb
+ addi r4, r4, 8
+ lwz r0, 4(r5) C load s2 limb
+ addi r5, r5, 8
+ ADDSUBC r30, r11, r10 C add
+ ADDSUBC r31, r0, r12 C add
+ stw r30, 0(r3)
+ stw r31, 4(r3)
+ addi r3, r3, 8
+ bdnz L(go)
+ b L(ret)
+
+L(b00): C INITCY C clear/set cy
+L(go): lwz r6, 0(r4) C load s1 limb
+ lwz r7, 0(r5) C load s2 limb
+ lwz r8, 4(r4) C load s1 limb
+ lwz r9, 4(r5) C load s2 limb
+ lwz r10, 8(r4) C load s1 limb
+ lwz r11, 8(r5) C load s2 limb
+ lwz r12, 12(r4) C load s1 limb
+ lwz r0, 12(r5) C load s2 limb
+ bdz L(end)
+
+ addi r4, r4, 16
+ addi r5, r5, 16
+
+ ALIGN(16)
+L(top): ADDSUBC r28, r7, r6
+ lwz r6, 0(r4) C load s1 limb
+ lwz r7, 0(r5) C load s2 limb
+ ADDSUBC r29, r9, r8
+ lwz r8, 4(r4) C load s1 limb
+ lwz r9, 4(r5) C load s2 limb
+ ADDSUBC r30, r11, r10
+ lwz r10, 8(r4) C load s1 limb
+ lwz r11, 8(r5) C load s2 limb
+ ADDSUBC r31, r0, r12
+ lwz r12, 12(r4) C load s1 limb
+ lwz r0, 12(r5) C load s2 limb
+ stw r28, 0(r3)
+ addi r4, r4, 16
+ stw r29, 4(r3)
+ addi r5, r5, 16
+ stw r30, 8(r3)
+ stw r31, 12(r3)
+ addi r3, r3, 16
+ bdnz L(top) C decrement ctr and loop back
+
+L(end): ADDSUBC r28, r7, r6
+ ADDSUBC r29, r9, r8
+ ADDSUBC r30, r11, r10
+ ADDSUBC r31, r0, r12
+ stw r28, 0(r3)
+ stw r29, 4(r3)
+ stw r30, 8(r3)
+ stw r31, 12(r3)
+
+L(ret): lwz r31, -4(r1)
+ lwz r30, -8(r1)
+ lwz r29, -12(r1)
+ lwz r28, -16(r1)
+
+ subfe r3, r0, r0 C -cy
+ GENRVAL
+ blr
+EPILOGUE()
diff --git a/mpn/powerpc32/p5/gmp-mparam.h b/mpn/powerpc32/p5/gmp-mparam.h
index a8400ce65..ba210ecc4 100644
--- a/mpn/powerpc32/p5/gmp-mparam.h
+++ b/mpn/powerpc32/p5/gmp-mparam.h
@@ -30,114 +30,117 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
#define MOD_1N_TO_MOD_1_1_THRESHOLD 8
#define MOD_1U_TO_MOD_1_1_THRESHOLD 6
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 46
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 15
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 50
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 18
#define USE_PREINV_DIVREM_1 1
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 62
+#define BMOD_1_TO_MOD_1_THRESHOLD 61
#define MUL_TOOM22_THRESHOLD 22
-#define MUL_TOOM33_THRESHOLD 78
+#define MUL_TOOM33_THRESHOLD 57
#define MUL_TOOM44_THRESHOLD 130
-#define MUL_TOOM6H_THRESHOLD 206
-#define MUL_TOOM8H_THRESHOLD 260
+#define MUL_TOOM6H_THRESHOLD 189
+#define MUL_TOOM8H_THRESHOLD 309
#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89
#define MUL_TOOM32_TO_TOOM53_THRESHOLD 99
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 85
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 83
#define MUL_TOOM42_TO_TOOM63_THRESHOLD 88
-#define SQR_BASECASE_THRESHOLD 0 /* always */
-#define SQR_TOOM2_THRESHOLD 42
+#define SQR_BASECASE_THRESHOLD 6
+#define SQR_TOOM2_THRESHOLD 40
#define SQR_TOOM3_THRESHOLD 77
-#define SQR_TOOM4_THRESHOLD 169
-#define SQR_TOOM6_THRESHOLD 246
-#define SQR_TOOM8_THRESHOLD 381
+#define SQR_TOOM4_THRESHOLD 124
+#define SQR_TOOM6_THRESHOLD 140
+#define SQR_TOOM8_THRESHOLD 238
+
+#define MULMID_TOOM42_THRESHOLD 40
#define MULMOD_BNM1_THRESHOLD 15
-#define SQRMOD_BNM1_THRESHOLD 18
+#define SQRMOD_BNM1_THRESHOLD 16
+
+#define POWM_SEC_TABLE 4,29,252,840,2080
-#define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */
+#define MUL_FFT_MODF_THRESHOLD 412 /* k = 5 */
#define MUL_FFT_TABLE3 \
- { { 380, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
- { 13, 5}, { 27, 6}, { 21, 7}, { 11, 6}, \
- { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
- { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \
- { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \
- { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
- { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
- { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
- { 47,10}, { 31, 9}, { 79,10}, { 47,11}, \
- { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
- { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \
- { 127, 9}, { 255,10}, { 143, 9}, { 287, 8}, \
- { 575,10}, { 159,11}, { 95, 9}, { 383,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
- { 271, 9}, { 543,10}, { 287, 9}, { 575,11}, \
- { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \
- { 671,10}, { 351,11}, { 191,10}, { 383, 9}, \
- { 767,10}, { 415, 9}, { 831,11}, { 223,12}, \
- { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 76
+ { { 412, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 12, 5}, { 25, 6}, { 21, 7}, { 11, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \
+ { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \
+ { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \
+ { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
+ { 39, 8}, { 79, 9}, { 55,10}, { 31, 9}, \
+ { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \
+ { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \
+ { 95,11}, { 63,10}, { 127, 9}, { 255,10}, \
+ { 143, 9}, { 287,10}, { 159,11}, { 95,10}, \
+ { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \
+ { 159,10}, { 335, 9}, { 671,10}, { 351, 9}, \
+ { 703,11}, { 191,10}, { 383, 9}, { 767,10}, \
+ { 415, 9}, { 831,11}, { 223,12}, { 4096,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 71
#define MUL_FFT_THRESHOLD 4736
-#define SQR_FFT_MODF_THRESHOLD 316 /* k = 5 */
+#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */
#define SQR_FFT_TABLE3 \
- { { 316, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { { 340, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
{ 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \
- { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \
- { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \
- { 19, 6}, { 77, 7}, { 39, 8}, { 23, 7}, \
- { 47, 8}, { 27, 9}, { 15, 8}, { 39, 9}, \
- { 23, 8}, { 47,10}, { 15, 7}, { 121, 9}, \
- { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
- { 47,10}, { 31, 9}, { 79,10}, { 47,11}, \
- { 31,10}, { 63, 9}, { 127, 8}, { 255,10}, \
- { 79, 9}, { 159, 8}, { 319, 9}, { 175,10}, \
- { 95, 9}, { 191, 8}, { 383,11}, { 63,10}, \
+ { 27, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \
+ { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \
+ { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
+ { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \
+ { 31, 8}, { 67, 9}, { 47,10}, { 31, 9}, \
+ { 71,10}, { 47,11}, { 31,10}, { 63, 9}, \
+ { 127, 8}, { 255, 9}, { 135,10}, { 79, 9}, \
+ { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \
{ 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \
{ 143, 9}, { 287, 8}, { 575, 9}, { 303,10}, \
- { 159, 9}, { 319,10}, { 175,11}, { 95,10}, \
- { 191, 9}, { 383,10}, { 207,12}, { 63,11}, \
+ { 159,11}, { 95,10}, { 191,12}, { 63,11}, \
{ 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
{ 543,10}, { 287, 9}, { 575,10}, { 303,11}, \
{ 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \
{ 671,10}, { 351,11}, { 191,10}, { 383, 9}, \
{ 767,10}, { 415,11}, { 223,10}, { 447,12}, \
{ 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 88
+#define SQR_FFT_TABLE3_SIZE 76
#define SQR_FFT_THRESHOLD 3712
#define MULLO_BASECASE_THRESHOLD 2
#define MULLO_DC_THRESHOLD 68
#define MULLO_MUL_N_THRESHOLD 9236
-#define DC_DIV_QR_THRESHOLD 70
-#define DC_DIVAPPR_Q_THRESHOLD 238
+#define DC_DIV_QR_THRESHOLD 69
+#define DC_DIVAPPR_Q_THRESHOLD 220
#define DC_BDIV_QR_THRESHOLD 75
#define DC_BDIV_Q_THRESHOLD 188
#define INV_MULMOD_BNM1_THRESHOLD 54
-#define INV_NEWTON_THRESHOLD 250
-#define INV_APPR_THRESHOLD 246
+#define INV_NEWTON_THRESHOLD 230
+#define INV_APPR_THRESHOLD 230
-#define BINV_NEWTON_THRESHOLD 375
+#define BINV_NEWTON_THRESHOLD 278
#define REDC_1_TO_REDC_N_THRESHOLD 87
-#define MU_DIV_QR_THRESHOLD 1334
-#define MU_DIVAPPR_Q_THRESHOLD 1387
-#define MUPI_DIV_QR_THRESHOLD 114
-#define MU_BDIV_QR_THRESHOLD 1078
-#define MU_BDIV_Q_THRESHOLD 1334
+#define MU_DIV_QR_THRESHOLD 1210
+#define MU_DIVAPPR_Q_THRESHOLD 1308
+#define MUPI_DIV_QR_THRESHOLD 106
+#define MU_BDIV_QR_THRESHOLD 1017
+#define MU_BDIV_Q_THRESHOLD 1210
#define MATRIX22_STRASSEN_THRESHOLD 14
-#define HGCD_THRESHOLD 104
-#define GCD_DC_THRESHOLD 424
-#define GCDEXT_DC_THRESHOLD 321
+#define HGCD_THRESHOLD 110
+#define HGCD_APPR_THRESHOLD 138
+#define HGCD_REDUCE_THRESHOLD 2578
+#define GCD_DC_THRESHOLD 408
+#define GCDEXT_DC_THRESHOLD 298
#define JACOBI_BASE_METHOD 4
-#define GET_STR_DC_THRESHOLD 12
-#define GET_STR_PRECOMPUTE_THRESHOLD 23
-#define SET_STR_DC_THRESHOLD 454
-#define SET_STR_PRECOMPUTE_THRESHOLD 1074
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 24
+#define SET_STR_DC_THRESHOLD 527
+#define SET_STR_PRECOMPUTE_THRESHOLD 1090
diff --git a/mpn/powerpc32/p6/gmp-mparam.h b/mpn/powerpc32/p6/gmp-mparam.h
index 73951d0ae..529a66d19 100644
--- a/mpn/powerpc32/p6/gmp-mparam.h
+++ b/mpn/powerpc32/p6/gmp-mparam.h
@@ -29,115 +29,127 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_NORM_THRESHOLD 3
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 8
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 15
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD MP_SIZE_T_MAX
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
#define USE_PREINV_DIVREM_1 1
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always */
#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
-#define MUL_TOOM22_THRESHOLD 34
-#define MUL_TOOM33_THRESHOLD 70
-#define MUL_TOOM44_THRESHOLD 187
-#define MUL_TOOM6H_THRESHOLD 286
-#define MUL_TOOM8H_THRESHOLD 321
+#define MUL_TOOM22_THRESHOLD 19
+#define MUL_TOOM33_THRESHOLD 55
+#define MUL_TOOM44_THRESHOLD 88
+#define MUL_TOOM6H_THRESHOLD 137
+#define MUL_TOOM8H_THRESHOLD 181
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 110
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 118
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 107
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 145
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 57
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 56
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 57
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 56
#define SQR_BASECASE_THRESHOLD 0 /* always */
-#define SQR_TOOM2_THRESHOLD 68
-#define SQR_TOOM3_THRESHOLD 113
-#define SQR_TOOM4_THRESHOLD 312
-#define SQR_TOOM6_THRESHOLD 330
-#define SQR_TOOM8_THRESHOLD 357
+#define SQR_TOOM2_THRESHOLD 30
+#define SQR_TOOM3_THRESHOLD 56
+#define SQR_TOOM4_THRESHOLD 130
+#define SQR_TOOM6_THRESHOLD 189
+#define SQR_TOOM8_THRESHOLD 296
-#define MULMOD_BNM1_THRESHOLD 19
-#define SQRMOD_BNM1_THRESHOLD 20
+#define MULMID_TOOM42_THRESHOLD 26
-#define MUL_FFT_MODF_THRESHOLD 304 /* k = 5 */
+#define MULMOD_BNM1_THRESHOLD 7
+#define SQRMOD_BNM1_THRESHOLD 12
+
+#define POWM_SEC_TABLE 2,26,127,453,1068
+
+#define MUL_FFT_MODF_THRESHOLD 212 /* k = 5 */
#define MUL_FFT_TABLE3 \
- { { 304, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 10, 5}, { 21, 6}, { 17, 7}, { 9, 6}, \
- { 20, 7}, { 11, 6}, { 24, 7}, { 13, 8}, \
- { 7, 7}, { 21, 8}, { 11, 7}, { 27, 9}, \
- { 7, 8}, { 15, 7}, { 33, 8}, { 19, 7}, \
- { 41, 8}, { 23, 7}, { 47, 8}, { 27, 9}, \
+ { { 212, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \
+ { 13, 7}, { 7, 6}, { 16, 7}, { 9, 6}, \
+ { 19, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \
+ { 11, 7}, { 25, 9}, { 7, 8}, { 15, 7}, \
+ { 31, 8}, { 19, 7}, { 39, 8}, { 23, 9}, \
{ 15, 8}, { 39, 9}, { 23, 8}, { 47,10}, \
- { 15, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
- { 79, 9}, { 47, 8}, { 95,10}, { 31, 9}, \
- { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \
- { 63, 9}, { 127, 8}, { 255, 9}, { 135,10}, \
- { 79, 9}, { 159, 8}, { 319,10}, { 95, 9}, \
- { 191, 8}, { 383,11}, { 63,10}, { 127, 9}, \
- { 255, 8}, { 511, 9}, { 271,10}, { 143, 9}, \
- { 287,10}, { 159, 9}, { 319,11}, { 95,10}, \
- { 191, 9}, { 383,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \
- { 287,11}, { 159,10}, { 319, 9}, { 639,10}, \
- { 351,11}, { 191,10}, { 383, 9}, { 767,10}, \
- { 415,11}, { 223,10}, { 447,12}, { 4096,13}, \
- { 8192,14}, { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 83
-#define MUL_FFT_THRESHOLD 4736
-
-#define SQR_FFT_MODF_THRESHOLD 312 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 312, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
- { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \
- { 27, 7}, { 17, 6}, { 35, 7}, { 21, 8}, \
- { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \
- { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \
- { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \
- { 47,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
- { 39, 8}, { 79, 9}, { 47,10}, { 31, 9}, \
+ { 15, 9}, { 31, 8}, { 63, 9}, { 39, 8}, \
+ { 79, 9}, { 47,10}, { 31, 9}, { 63, 8}, \
+ { 127, 9}, { 71, 8}, { 143, 7}, { 287, 9}, \
{ 79,10}, { 47,11}, { 31,10}, { 63, 9}, \
- { 127, 8}, { 255,10}, { 79, 9}, { 159, 8}, \
- { 319,10}, { 95, 9}, { 191,11}, { 63,10}, \
- { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \
+ { 127, 8}, { 255, 7}, { 511, 9}, { 143, 8}, \
+ { 287,10}, { 79, 9}, { 159, 8}, { 319, 9}, \
+ { 175, 8}, { 351,10}, { 95, 9}, { 191, 8}, \
+ { 383, 9}, { 207,10}, { 111,11}, { 63,10}, \
+ { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \
+ { 287, 8}, { 575,10}, { 159, 9}, { 319,10}, \
+ { 175, 9}, { 351,11}, { 95,10}, { 191, 9}, \
+ { 383,10}, { 207, 9}, { 415,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 287, 9}, \
+ { 575,11}, { 159,10}, { 351, 9}, { 703,11}, \
+ { 191,10}, { 415, 9}, { 831,11}, { 223,10}, \
+ { 447,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
+ { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 89
+#define MUL_FFT_THRESHOLD 1728
+
+#define SQR_FFT_MODF_THRESHOLD 184 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 184, 5}, { 6, 4}, { 13, 5}, { 13, 6}, \
+ { 7, 5}, { 15, 6}, { 13, 7}, { 7, 6}, \
+ { 16, 7}, { 9, 6}, { 19, 7}, { 11, 6}, \
+ { 23, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \
+ { 11, 7}, { 23, 9}, { 7, 8}, { 23, 9}, \
+ { 15, 8}, { 39, 9}, { 23,10}, { 15, 9}, \
+ { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \
+ { 47,10}, { 31, 9}, { 63, 8}, { 127, 7}, \
+ { 255, 9}, { 71, 8}, { 143, 7}, { 287, 6}, \
+ { 575, 9}, { 79,10}, { 47,11}, { 31,10}, \
+ { 63, 9}, { 127, 8}, { 255, 9}, { 143, 8}, \
+ { 287, 7}, { 575,10}, { 79, 9}, { 159, 8}, \
+ { 319, 9}, { 175, 8}, { 351,10}, { 95, 9}, \
+ { 191, 8}, { 383, 9}, { 207,10}, { 111, 9}, \
+ { 223,11}, { 63,10}, { 127, 9}, { 255,10}, \
{ 143, 9}, { 287, 8}, { 575,10}, { 159, 9}, \
- { 319,11}, { 95,10}, { 191, 9}, { 383,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
- { 271, 9}, { 543,10}, { 287, 9}, { 575,11}, \
- { 159,10}, { 319, 9}, { 639,10}, { 351,11}, \
- { 191,10}, { 383, 9}, { 767,10}, { 415,11}, \
- { 223,10}, { 447,12}, { 4096,13}, { 8192,14}, \
- { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 78
-#define SQR_FFT_THRESHOLD 2752
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 151
-#define MULLO_MUL_N_THRESHOLD 1175
-
-#define DC_DIV_QR_THRESHOLD 133
-#define DC_DIVAPPR_Q_THRESHOLD 442
-#define DC_BDIV_QR_THRESHOLD 130
-#define DC_BDIV_Q_THRESHOLD 324
-
-#define INV_MULMOD_BNM1_THRESHOLD 116
-#define INV_NEWTON_THRESHOLD 507
-#define INV_APPR_THRESHOLD 454
-
-#define BINV_NEWTON_THRESHOLD 507
-#define REDC_1_TO_REDC_N_THRESHOLD 118
-
-#define MU_DIV_QR_THRESHOLD 1652
-#define MU_DIVAPPR_Q_THRESHOLD 1752
-#define MUPI_DIV_QR_THRESHOLD 225
-#define MU_BDIV_QR_THRESHOLD 762
-#define MU_BDIV_Q_THRESHOLD 1017
-
-#define MATRIX22_STRASSEN_THRESHOLD 28
-#define HGCD_THRESHOLD 76
-#define GCD_DC_THRESHOLD 333
-#define GCDEXT_DC_THRESHOLD 245
+ { 319,10}, { 175, 9}, { 351,11}, { 95,10}, \
+ { 191, 9}, { 383,10}, { 207, 9}, { 415,10}, \
+ { 223,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 287, 9}, { 575,11}, { 159,10}, \
+ { 351, 9}, { 703, 8}, { 1407,11}, { 191,10}, \
+ { 415,11}, { 223,10}, { 447, 9}, { 895,12}, \
+ { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 92
+#define SQR_FFT_THRESHOLD 1600
+
+#define MULLO_BASECASE_THRESHOLD 2
+#define MULLO_DC_THRESHOLD 57
+#define MULLO_MUL_N_THRESHOLD 3176
+
+#define DC_DIV_QR_THRESHOLD 52
+#define DC_DIVAPPR_Q_THRESHOLD 187
+#define DC_BDIV_QR_THRESHOLD 64
+#define DC_BDIV_Q_THRESHOLD 146
+
+#define INV_MULMOD_BNM1_THRESHOLD 68
+#define INV_NEWTON_THRESHOLD 182
+#define INV_APPR_THRESHOLD 182
+
+#define BINV_NEWTON_THRESHOLD 186
+#define REDC_1_TO_REDC_N_THRESHOLD 60
+
+#define MU_DIV_QR_THRESHOLD 924
+#define MU_DIVAPPR_Q_THRESHOLD 807
+#define MUPI_DIV_QR_THRESHOLD 73
+#define MU_BDIV_QR_THRESHOLD 667
+#define MU_BDIV_Q_THRESHOLD 823
+
+#define MATRIX22_STRASSEN_THRESHOLD 8
+#define HGCD_THRESHOLD 61
+#define HGCD_APPR_THRESHOLD 50
+#define HGCD_REDUCE_THRESHOLD 974
+#define GCD_DC_THRESHOLD 195
+#define GCDEXT_DC_THRESHOLD 134
#define JACOBI_BASE_METHOD 4
-#define GET_STR_DC_THRESHOLD 10
-#define GET_STR_PRECOMPUTE_THRESHOLD 20
-#define SET_STR_DC_THRESHOLD 199
-#define SET_STR_PRECOMPUTE_THRESHOLD 478
+#define GET_STR_DC_THRESHOLD 9
+#define GET_STR_PRECOMPUTE_THRESHOLD 21
+#define SET_STR_DC_THRESHOLD 190
+#define SET_STR_PRECOMPUTE_THRESHOLD 411
diff --git a/mpn/powerpc32/p7/gmp-mparam.h b/mpn/powerpc32/p7/gmp-mparam.h
new file mode 100644
index 000000000..bd18d4042
--- /dev/null
+++ b/mpn/powerpc32/p7/gmp-mparam.h
@@ -0,0 +1,149 @@
+/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2004, 2008, 2009,
+2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* 3550 MHz POWER7/T4 */
+
+#define DIVREM_1_NORM_THRESHOLD 0 /* always */
+#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1P_METHOD 1
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 34
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 15
+#define USE_PREINV_DIVREM_1 1
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD 34
+
+#define MUL_TOOM22_THRESHOLD 20
+#define MUL_TOOM33_THRESHOLD 89
+#define MUL_TOOM44_THRESHOLD 130
+#define MUL_TOOM6H_THRESHOLD 286
+#define MUL_TOOM8H_THRESHOLD 363
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 121
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113
+
+#define SQR_BASECASE_THRESHOLD 4
+#define SQR_TOOM2_THRESHOLD 50
+#define SQR_TOOM3_THRESHOLD 89
+#define SQR_TOOM4_THRESHOLD 154
+#define SQR_TOOM6_THRESHOLD 222
+#define SQR_TOOM8_THRESHOLD 381
+
+#define MULMID_TOOM42_THRESHOLD 40
+
+#define MULMOD_BNM1_THRESHOLD 18
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define POWM_SEC_TABLE 4,35,225,780,2212
+
+#define MUL_FFT_MODF_THRESHOLD 476 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 476, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 14, 5}, { 29, 6}, { 21, 7}, { 11, 6}, \
+ { 25, 7}, { 13, 6}, { 29, 7}, { 15, 6}, \
+ { 31, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \
+ { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
+ { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \
+ { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \
+ { 159,11}, { 95,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \
+ { 1087,11}, { 159,10}, { 319, 9}, { 639,10}, \
+ { 335, 9}, { 671, 8}, { 1343,10}, { 351,11}, \
+ { 191,10}, { 415, 9}, { 831,10}, { 431,11}, \
+ { 223,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
+ { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 77
+#define MUL_FFT_THRESHOLD 5312
+
+#define SQR_FFT_MODF_THRESHOLD 344 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 344, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \
+ { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \
+ { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \
+ { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \
+ { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \
+ { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \
+ { 47,10}, { 31, 9}, { 79,10}, { 47,11}, \
+ { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
+ { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \
+ { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \
+ { 143, 9}, { 287, 8}, { 575, 9}, { 303,10}, \
+ { 159,11}, { 95,10}, { 191,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
+ { 543, 8}, { 1087,10}, { 287, 9}, { 575,10}, \
+ { 303,11}, { 159,10}, { 319, 9}, { 639,10}, \
+ { 335, 9}, { 671,10}, { 351, 9}, { 703,11}, \
+ { 191,10}, { 383, 9}, { 767,10}, { 415, 9}, \
+ { 831,11}, { 223,10}, { 447,12}, { 4096,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 79
+#define SQR_FFT_THRESHOLD 3712
+
+#define MULLO_BASECASE_THRESHOLD 2
+#define MULLO_DC_THRESHOLD 34
+#define MULLO_MUL_N_THRESHOLD 10323
+
+#define DC_DIV_QR_THRESHOLD 52
+#define DC_DIVAPPR_Q_THRESHOLD 202
+#define DC_BDIV_QR_THRESHOLD 68
+#define DC_BDIV_Q_THRESHOLD 152
+
+#define INV_MULMOD_BNM1_THRESHOLD 66
+#define INV_NEWTON_THRESHOLD 226
+#define INV_APPR_THRESHOLD 189
+
+#define BINV_NEWTON_THRESHOLD 292
+#define REDC_1_TO_REDC_N_THRESHOLD 79
+
+#define MU_DIV_QR_THRESHOLD 1442
+#define MU_DIVAPPR_Q_THRESHOLD 1442
+#define MUPI_DIV_QR_THRESHOLD 91
+#define MU_BDIV_QR_THRESHOLD 1308
+#define MU_BDIV_Q_THRESHOLD 1442
+
+#define MATRIX22_STRASSEN_THRESHOLD 16
+#define HGCD_THRESHOLD 126
+#define HGCD_APPR_THRESHOLD 139
+#define HGCD_REDUCE_THRESHOLD 2681
+#define GCD_DC_THRESHOLD 573
+#define GCDEXT_DC_THRESHOLD 448
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 9
+#define GET_STR_PRECOMPUTE_THRESHOLD 20
+#define SET_STR_DC_THRESHOLD 834
+#define SET_STR_PRECOMPUTE_THRESHOLD 1888
diff --git a/mpn/powerpc32/tabselect.asm b/mpn/powerpc32/tabselect.asm
new file mode 100644
index 000000000..155a7b495
--- /dev/null
+++ b/mpn/powerpc32/tabselect.asm
@@ -0,0 +1,98 @@
+dnl PowerPC-32 mpn_tabselect.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C 603e: ?
+C 604e: ?
+C 75x (G3): ?
+C 7400,7410 (G4): ?
+C 744x,745x (G4+): ?
+C power4/ppc970: 3.3
+C power5: ?
+
+C NOTES
+C * This has not been tuned for any specific processor. Its speed should not
+C be too bad, though.
+C * Using VMX could result in significant speedup for certain CPUs.
+
+C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp', `r3')
+define(`tp', `r4')
+define(`n', `r5')
+define(`nents', `r6')
+define(`which', `r7')
+
+define(`mask', `r8')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_tabselect)
+ addi r0, n, 1
+ srwi r0, r0, 1 C inner loop count
+ andi. r9, n, 1 C set cr0 for use in inner loop
+ subf which, nents, which
+ slwi n, n, 2
+
+L(outer):
+ mtctr r0 C put inner loop count in ctr
+
+ add r9, which, nents C are we at the selected table entry?
+ addic r9, r9, -1 C set CF iff not selected entry
+ subfe mask, r0, r0
+
+ beq cr0, L(top) C branch to loop entry if n even
+
+ lwz r9, 0(tp)
+ addi tp, tp, 4
+ and r9, r9, mask
+ lwz r11, 0(rp)
+ andc r11, r11, mask
+ or r9, r9, r11
+ stw r9, 0(rp)
+ addi rp, rp, 4
+ bdz L(end)
+
+ ALIGN(16)
+L(top): lwz r9, 0(tp)
+ lwz r10, 4(tp)
+ addi tp, tp, 8
+ nop
+ and r9, r9, mask
+ and r10, r10, mask
+ lwz r11, 0(rp)
+ lwz r12, 4(rp)
+ andc r11, r11, mask
+ andc r12, r12, mask
+ or r9, r9, r11
+ or r10, r10, r12
+ stw r9, 0(rp)
+ stw r10, 4(rp)
+ addi rp, rp, 8
+ bdnz L(top)
+
+L(end): subf rp, n, rp C move rp back to beginning
+ cmpwi cr6, nents, 1
+ addi nents, nents, -1
+ bne cr6, L(outer)
+
+ blr
+EPILOGUE()
diff --git a/mpn/powerpc64/com.asm b/mpn/powerpc64/com.asm
index 4fb2e65d7..cb89bade2 100644
--- a/mpn/powerpc64/com.asm
+++ b/mpn/powerpc64/com.asm
@@ -19,9 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630: 1?
-C POWER4/PPC970: 1.6
+C cycles/limb
+C POWER3/PPC630 1?
+C POWER4/PPC970 1.6
+C POWER5 ?
+C POWER6 ?
+C POWER7 1.45
C TODO
C * 8-way unrolling brings timing down to about 1.3 cycles/limb.
diff --git a/mpn/powerpc64/copyd.asm b/mpn/powerpc64/copyd.asm
index 6a46a433c..256e7dc12 100644
--- a/mpn/powerpc64/copyd.asm
+++ b/mpn/powerpc64/copyd.asm
@@ -19,9 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630: 1
-C POWER4/PPC970: 1
+C cycles/limb
+C POWER3/PPC630 1
+C POWER4/PPC970 1
+C POWER5 ?
+C POWER6 ?
+C POWER7 1.4
C INPUT PARAMETERS
C rp r3
diff --git a/mpn/powerpc64/copyi.asm b/mpn/powerpc64/copyi.asm
index 5cb7e4856..31d1fc2e7 100644
--- a/mpn/powerpc64/copyi.asm
+++ b/mpn/powerpc64/copyi.asm
@@ -19,9 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630: 1
-C POWER4/PPC970: 1
+C cycles/limb
+C POWER3/PPC630 1
+C POWER4/PPC970 1
+C POWER5 ?
+C POWER6 ?
+C POWER7 1.4
C INPUT PARAMETERS
C rp r3
diff --git a/mpn/powerpc64/logops_n.asm b/mpn/powerpc64/logops_n.asm
index 917b59f45..2caa2c7c6 100644
--- a/mpn/powerpc64/logops_n.asm
+++ b/mpn/powerpc64/logops_n.asm
@@ -20,9 +20,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630: 1.75
-C POWER4/PPC970: 2.10
+C cycles/limb
+C POWER3/PPC630 1.75
+C POWER4/PPC970 2.10
+C POWER5 ?
+C POWER6 ?
+C POWER7 1.75
C n POWER3/PPC630 POWER4/PPC970
C 1 15.00 15.33
diff --git a/mpn/powerpc64/lshift.asm b/mpn/powerpc64/lshift.asm
index f97661ae7..eb70c4983 100644
--- a/mpn/powerpc64/lshift.asm
+++ b/mpn/powerpc64/lshift.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 ?
-C POWER5 2.25
-C POWER6 9.75
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 2.25
+C POWER6 9.75
+C POWER7 2.15
C TODO
C * Try to reduce the number of needed live registers
diff --git a/mpn/powerpc64/mode64/lshiftc.asm b/mpn/powerpc64/lshiftc.asm
index 647244d1f..8f470a5f4 100644
--- a/mpn/powerpc64/mode64/lshiftc.asm
+++ b/mpn/powerpc64/lshiftc.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 ?
-C POWER5 2.25
-C POWER6 9.5
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 2.25
+C POWER6 9.5
+C POWER7 2.15
C TODO
C * Try to reduce the number of needed live registers
@@ -189,6 +190,9 @@ L(cj2): std r10, -32(rp)
L(ret): ld r31, -8(r1)
ld r30, -16(r1)
- mr r3, retval
+ifdef(`HAVE_ABI_mode32',
+` srdi r3, retval, 32
+ mr r4, retval
+',` mr r3, retval')
blr
EPILOGUE()
diff --git a/mpn/powerpc64/mode64/aors_n.asm b/mpn/powerpc64/mode64/aors_n.asm
index 980525f67..8c30871c2 100644
--- a/mpn/powerpc64/mode64/aors_n.asm
+++ b/mpn/powerpc64/mode64/aors_n.asm
@@ -1,6 +1,6 @@
dnl PowerPC-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
-dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007 Free Software
+dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007, 2011 Free Software
dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -20,11 +20,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 1.5
-C POWER4/PPC970 2
-C POWER5 2.25
-C POWER6 2.63
+C cycles/limb
+C POWER3/PPC630 1.5
+C POWER4/PPC970 2
+C POWER5 2
+C POWER6 2.63
+C POWER7 2.25-2.87
C This code is a little bit slower for POWER3/PPC630 than the simple code used
C previously, but it is much faster for POWER4/PPC970. The reason for the
@@ -136,6 +137,7 @@ L(go): ld r6, 0(r4) C load s1 limb
addi r4, r4, 32
addi r5, r5, 32
+ ALIGN(16)
L(top): ADDSUBC r28, r7, r6
ld r6, 0(r4) C load s1 limb
ld r7, 0(r5) C load s2 limb
diff --git a/mpn/powerpc64/mode64/aorscnd_n.asm b/mpn/powerpc64/mode64/aorscnd_n.asm
new file mode 100644
index 000000000..47aa6fb39
--- /dev/null
+++ b/mpn/powerpc64/mode64/aorscnd_n.asm
@@ -0,0 +1,185 @@
+dnl PowerPC-64 mpn_addcnd_n/mpn_subcnd_n.
+
+dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007, 2011 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 2.25
+C POWER5 ?
+C POWER6 ?
+C POWER7 ?
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`vp', `r5')
+define(`n', `r6')
+define(`cnd', `r7')
+
+ifdef(`OPERATION_addcnd_n',`
+ define(ADDSUBC, adde)
+ define(ADDSUB, addc)
+ define(func, mpn_addcnd_n)
+ define(GENRVAL, `addi r3, r3, 1')
+ define(SETCBR, `addic r0, $1, -1')
+ define(CLRCB, `addic r0, r0, 0')
+')
+ifdef(`OPERATION_subcnd_n',`
+ define(ADDSUBC, subfe)
+ define(ADDSUB, subfc)
+ define(func, mpn_subcnd_n)
+ define(GENRVAL, `neg r3, r3')
+ define(SETCBR, `subfic r0, $1, 0')
+ define(CLRCB, `addic r0, r1, -1')
+')
+
+MULFUNC_PROLOGUE(mpn_addcnd_n mpn_subcnd_n)
+
+ASM_START()
+PROLOGUE(func)
+ std r31, -8(r1)
+ std r30, -16(r1)
+ std r29, -24(r1)
+ std r28, -32(r1)
+ std r27, -40(r1)
+
+ subfic cnd, cnd, 0
+ subfe cnd, cnd, cnd
+
+ rldicl. r0, r6, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2
+ addi r6, r6, 3 C compute count...
+ srdi r6, r6, 2 C ...for ctr
+ mtctr r6 C copy count into ctr
+ beq cr0, L(b00)
+ blt cr6, L(b01)
+ beq cr6, L(b10)
+
+L(b11): ld r8, 0(up) C load s1 limb
+ ld r9, 0(vp) C load s2 limb
+ ld r10, 8(up) C load s1 limb
+ ld r11, 8(vp) C load s2 limb
+ ld r12, 16(up) C load s1 limb
+ addi up, up, 24
+ ld r0, 16(vp) C load s2 limb
+ addi vp, vp, 24
+ and r9, r9, cnd
+ and r11, r11, cnd
+ and r0, r0, cnd
+ ADDSUB r29, r9, r8
+ ADDSUBC r30, r11, r10
+ ADDSUBC r31, r0, r12
+ std r29, 0(rp)
+ std r30, 8(rp)
+ std r31, 16(rp)
+ addi rp, rp, 24
+ bdnz L(go)
+ b L(ret)
+
+L(b01): ld r12, 0(up) C load s1 limb
+ addi up, up, 8
+ ld r0, 0(vp) C load s2 limb
+ addi vp, vp, 8
+ and r0, r0, cnd
+ ADDSUB r31, r0, r12 C add
+ std r31, 0(rp)
+ addi rp, rp, 8
+ bdnz L(go)
+ b L(ret)
+
+L(b10): ld r10, 0(up) C load s1 limb
+ ld r11, 0(vp) C load s2 limb
+ ld r12, 8(up) C load s1 limb
+ addi up, up, 16
+ ld r0, 8(vp) C load s2 limb
+ addi vp, vp, 16
+ and r11, r11, cnd
+ and r0, r0, cnd
+ ADDSUB r30, r11, r10 C add
+ ADDSUBC r31, r0, r12 C add
+ std r30, 0(rp)
+ std r31, 8(rp)
+ addi rp, rp, 16
+ bdnz L(go)
+ b L(ret)
+
+L(b00): CLRCB C clear/set cy
+L(go): ld r6, 0(up) C load s1 limb
+ ld r27, 0(vp) C load s2 limb
+ ld r8, 8(up) C load s1 limb
+ ld r9, 8(vp) C load s2 limb
+ ld r10, 16(up) C load s1 limb
+ ld r11, 16(vp) C load s2 limb
+ ld r12, 24(up) C load s1 limb
+ ld r0, 24(vp) C load s2 limb
+ and r27, r27, cnd
+ and r9, r9, cnd
+ and r11, r11, cnd
+ and r0, r0, cnd
+ bdz L(end)
+
+ addi up, up, 32
+ addi vp, vp, 32
+
+L(top): ADDSUBC r28, r27, r6
+ ld r6, 0(up) C load s1 limb
+ ld r27, 0(vp) C load s2 limb
+ ADDSUBC r29, r9, r8
+ ld r8, 8(up) C load s1 limb
+ ld r9, 8(vp) C load s2 limb
+ ADDSUBC r30, r11, r10
+ ld r10, 16(up) C load s1 limb
+ ld r11, 16(vp) C load s2 limb
+ ADDSUBC r31, r0, r12
+ ld r12, 24(up) C load s1 limb
+ ld r0, 24(vp) C load s2 limb
+ std r28, 0(rp)
+ addi up, up, 32
+ std r29, 8(rp)
+ addi vp, vp, 32
+ std r30, 16(rp)
+ std r31, 24(rp)
+ addi rp, rp, 32
+ and r27, r27, cnd
+ and r9, r9, cnd
+ and r11, r11, cnd
+ and r0, r0, cnd
+ bdnz L(top) C decrement ctr and loop back
+
+L(end): ADDSUBC r28, r27, r6
+ ADDSUBC r29, r9, r8
+ ADDSUBC r30, r11, r10
+ ADDSUBC r31, r0, r12
+ std r28, 0(rp)
+ std r29, 8(rp)
+ std r30, 16(rp)
+ std r31, 24(rp)
+
+L(ret): ld r31, -8(r1)
+ ld r30, -16(r1)
+ ld r29, -24(r1)
+ ld r28, -32(r1)
+ ld r27, -40(r1)
+
+ subfe r3, r0, r0 C -cy
+ GENRVAL
+ blr
+EPILOGUE()
diff --git a/mpn/powerpc64/mode64/aorslshC_n.asm b/mpn/powerpc64/mode64/aorslshC_n.asm
index 4622cd946..3776d3e59 100644
--- a/mpn/powerpc64/mode64/aorslshC_n.asm
+++ b/mpn/powerpc64/mode64/aorslshC_n.asm
@@ -17,11 +17,12 @@ dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-C cycles/limb
-C POWER3/PPC630 1.83 (1.5 c/l should be possible)
-C POWER4/PPC970 3 (2.0 c/l should be possible)
-C POWER5 3
-C POWER6 3.5-47
+C cycles/limb
+C POWER3/PPC630 1.83 (1.5 c/l should be possible)
+C POWER4/PPC970 3 (2.0 c/l should be possible)
+C POWER5 3
+C POWER6 3.5-47
+C POWER7 3
C STATUS
C * Try combining upx+up, and vpx+vp.
diff --git a/mpn/powerpc64/mode64/aorsmul_1.asm b/mpn/powerpc64/mode64/aorsmul_1.asm
index b1a3315b6..4b843a044 100644
--- a/mpn/powerpc64/mode64/aorsmul_1.asm
+++ b/mpn/powerpc64/mode64/aorsmul_1.asm
@@ -20,12 +20,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C mpn_addmul_1 mpn_submul_1
-C cycles/limb cycles/limb
-C POWER3/PPC630 6-18 6-18
-C POWER4/PPC970 8 8.3
-C POWER5 8 8.25
-C POWER6 16.25 16.75
+C mpn_addmul_1 mpn_submul_1
+C cycles/limb cycles/limb
+C POWER3/PPC630 6-18 6-18
+C POWER4/PPC970 8 8.3
+C POWER5 8 8.25
+C POWER6 16.25 16.75
+C POWER7 3.77 4.9
C TODO
C * Try to reduce the number of needed live registers
@@ -53,7 +54,7 @@ ifdef(`OPERATION_submul_1',`
')
MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
-
+
ASM_START()
PROLOGUE(func_nc)
EPILOGUE()
diff --git a/mpn/powerpc64/mode64/bdiv_dbm1c.asm b/mpn/powerpc64/mode64/bdiv_dbm1c.asm
index 40f3d4ec7..e88fc4440 100644
--- a/mpn/powerpc64/mode64/bdiv_dbm1c.asm
+++ b/mpn/powerpc64/mode64/bdiv_dbm1c.asm
@@ -19,11 +19,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
+C cycles/limb
C POWER3/PPC630 6-18
C POWER4/PPC970 8.5?
C POWER5 8.5 fluctuating as function of n % 3
C POWER6 15
+C POWER6 15
+C POWER7 4.75
C TODO
C * Nothing to do...
diff --git a/mpn/powerpc64/mode64/dive_1.asm b/mpn/powerpc64/mode64/dive_1.asm
index d457d65e9..0f94154bf 100644
--- a/mpn/powerpc64/mode64/dive_1.asm
+++ b/mpn/powerpc64/mode64/dive_1.asm
@@ -19,12 +19,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C norm unorm
+C cycles/limb
+C norm unorm
C POWER3/PPC630 13-19
-C POWER4/PPC970 16
-C POWER5 16 16
-C POWER6 37 46
+C POWER4/PPC970 16
+C POWER5 16 16
+C POWER6 37 46
+C POWER7 12 12
C TODO
C * Check if n=1 code is really an improvement. It probably isn't.
diff --git a/mpn/powerpc64/mode64/divrem_1.asm b/mpn/powerpc64/mode64/divrem_1.asm
index 9d065b728..c0e7b2a9f 100644
--- a/mpn/powerpc64/mode64/divrem_1.asm
+++ b/mpn/powerpc64/mode64/divrem_1.asm
@@ -20,12 +20,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C norm unorm frac
-C POWER3/PPC630 16-34 16-34 ~11
-C POWER4/PPC970 29 19
-C POWER5 29 29 ~20
-C POWER6 50 59 ~42
+C cycles/limb
+C norm unorm frac
+C POWER3/PPC630 16-34 16-34 ~11
+C POWER4/PPC970 29 19
+C POWER5 29 29 ~20
+C POWER6 50 59 ~42
+C POWER7 25 25 ~14
C INPUT PARAMETERS
C qp = r3
diff --git a/mpn/powerpc64/mode64/divrem_2.asm b/mpn/powerpc64/mode64/divrem_2.asm
index 53ef1c708..18f549357 100644
--- a/mpn/powerpc64/mode64/divrem_2.asm
+++ b/mpn/powerpc64/mode64/divrem_2.asm
@@ -19,12 +19,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C norm frac
+C cycles/limb
+C norm frac
C POWER3/PPC630
-C POWER4/PPC970 ? ?
-C POWER5 37 ?
-C POWER6 62 ?
+C POWER4/PPC970 ? ?
+C POWER5 37 ?
+C POWER6 62 ?
+C POWER7 30.5 ?
C INPUT PARAMETERS
C qp = r3
diff --git a/mpn/powerpc64/mode64/invert_limb.asm b/mpn/powerpc64/mode64/invert_limb.asm
index aed0a32ab..31b243001 100644
--- a/mpn/powerpc64/mode64/invert_limb.asm
+++ b/mpn/powerpc64/mode64/invert_limb.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb (approximate)
-C POWER3/PPC630 80
-C POWER4/PPC970 86
-C POWER5 86
-C POWER6 170
+C cycles/limb (approximate)
+C POWER3/PPC630 80
+C POWER4/PPC970 86
+C POWER5 86
+C POWER6 170
+C POWER7 66
ASM_START()
PROLOGUE(mpn_invert_limb)
diff --git a/mpn/powerpc64/mode64/mod_1_1.asm b/mpn/powerpc64/mode64/mod_1_1.asm
index 61e39310a..f24ceb2c8 100644
--- a/mpn/powerpc64/mode64/mod_1_1.asm
+++ b/mpn/powerpc64/mode64/mod_1_1.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 17
-C POWER5 16
-C POWER6 30
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 17
+C POWER5 16
+C POWER6 30
+C POWER7 10.2
C TODO
C * Optimise, in particular the cps function. This was compiler-generated and
diff --git a/mpn/powerpc64/mode64/mod_1_4.asm b/mpn/powerpc64/mode64/mod_1_4.asm
index e0f26da96..b6163c5e7 100644
--- a/mpn/powerpc64/mode64/mod_1_4.asm
+++ b/mpn/powerpc64/mode64/mod_1_4.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 9
-C POWER5 9
-C POWER6 13
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 9
+C POWER5 9
+C POWER6 13
+C POWER7 3.5
C TODO
C * Optimise, in particular the cps function. This was compiler-generated and
diff --git a/mpn/powerpc64/mode64/mod_34lsub1.asm b/mpn/powerpc64/mode64/mod_34lsub1.asm
index 62ba17a3c..30b9f98be 100644
--- a/mpn/powerpc64/mode64/mod_34lsub1.asm
+++ b/mpn/powerpc64/mode64/mod_34lsub1.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 1.33
-C POWER4/PPC970 1.5
-C POWER5 1.32
-C POWER6 2.35
+C cycles/limb
+C POWER3/PPC630 1.33
+C POWER4/PPC970 1.5
+C POWER5 1.32
+C POWER6 2.35
+C POWER7 1
C INPUT PARAMETERS
define(`up',`r3')
diff --git a/mpn/powerpc64/mode64/mode1o.asm b/mpn/powerpc64/mode64/mode1o.asm
index 489ca8551..37e4028d8 100644
--- a/mpn/powerpc64/mode64/mode1o.asm
+++ b/mpn/powerpc64/mode64/mode1o.asm
@@ -19,10 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630: 13-19
-C POWER4/PPC970: 16
-C POWER5: 16
+C cycles/limb
+C POWER3/PPC630 13-19
+C POWER4/PPC970 16
+C POWER5 16
+C POWER6 ?
+C POWER7 12
C TODO
C * Check if n=1 code is really an improvement. It probably isn't.
diff --git a/mpn/powerpc64/mode64/mul_1.asm b/mpn/powerpc64/mode64/mul_1.asm
index 12bff2fb6..e911cf551 100644
--- a/mpn/powerpc64/mode64/mul_1.asm
+++ b/mpn/powerpc64/mode64/mul_1.asm
@@ -21,11 +21,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 6-18
-C POWER4/PPC970 7.25? not updated for last file revision
-C POWER5 7.25
-C POWER6 14
+C cycles/limb
+C POWER3/PPC630 6-18
+C POWER4/PPC970 7.25? not updated for last file revision
+C POWER5 7.25
+C POWER6 14
+C POWER7 2.9
C TODO
C * Try to reduce the number of needed live registers (at least r5 and r10
diff --git a/mpn/powerpc64/mode64/mul_basecase.asm b/mpn/powerpc64/mode64/mul_basecase.asm
index fd7ff9aa1..9a3957f94 100644
--- a/mpn/powerpc64/mode64/mul_basecase.asm
+++ b/mpn/powerpc64/mode64/mul_basecase.asm
@@ -1,4 +1,4 @@
-dnl PowerPC-64 mpn_basecase.
+dnl PowerPC-64 mpn_mul_basecase.
dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008 Free Software
dnl Foundation, Inc.
@@ -20,11 +20,11 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 6-18
-C POWER4/PPC970 8
-C POWER5 8
-C POWER6 24
+C cycles/limb
+C POWER3/PPC630 6-18
+C POWER4/PPC970 8
+C POWER5 8
+C POWER6 24
C INPUT PARAMETERS
define(`rp', `r3')
diff --git a/mpn/powerpc64/mode64/p3/gmp-mparam.h b/mpn/powerpc64/mode64/p3/gmp-mparam.h
index 221b0e1d8..cf1d8ca47 100644
--- a/mpn/powerpc64/mode64/p3/gmp-mparam.h
+++ b/mpn/powerpc64/mode64/p3/gmp-mparam.h
@@ -23,12 +23,13 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* never */
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 16
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 17
#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
@@ -36,22 +37,26 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MUL_TOOM33_THRESHOLD 33
#define MUL_TOOM44_THRESHOLD 46
#define MUL_TOOM6H_THRESHOLD 77
-#define MUL_TOOM8H_THRESHOLD 115
+#define MUL_TOOM8H_THRESHOLD 139
#define MUL_TOOM32_TO_TOOM43_THRESHOLD 49
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 38
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 33
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 32
-
-#define SQR_BASECASE_THRESHOLD 0 /* always */
-#define SQR_TOOM2_THRESHOLD 16
-#define SQR_TOOM3_THRESHOLD 49
-#define SQR_TOOM4_THRESHOLD 70
-#define SQR_TOOM6_THRESHOLD 93
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 48
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 49
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 49
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 14
+#define SQR_TOOM3_THRESHOLD 45
+#define SQR_TOOM4_THRESHOLD 64
+#define SQR_TOOM6_THRESHOLD 85
#define SQR_TOOM8_THRESHOLD 139
+#define MULMID_TOOM42_THRESHOLD 22
+
#define MULMOD_BNM1_THRESHOLD 8
-#define SQRMOD_BNM1_THRESHOLD 9
+#define SQRMOD_BNM1_THRESHOLD 10
+
+#define POWM_SEC_TABLE 2,23,127,502,1421
#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */
#define MUL_FFT_TABLE3 \
@@ -123,35 +128,37 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_TABLE3_SIZE 118
#define SQR_FFT_THRESHOLD 1728
-#define MULLO_BASECASE_THRESHOLD 3
-#define MULLO_DC_THRESHOLD 28
-#define MULLO_MUL_N_THRESHOLD 4940
+#define MULLO_BASECASE_THRESHOLD 2
+#define MULLO_DC_THRESHOLD 27
+#define MULLO_MUL_N_THRESHOLD 2367
-#define DC_DIV_QR_THRESHOLD 27
-#define DC_DIVAPPR_Q_THRESHOLD 95
-#define DC_BDIV_QR_THRESHOLD 28
+#define DC_DIV_QR_THRESHOLD 26
+#define DC_DIVAPPR_Q_THRESHOLD 87
+#define DC_BDIV_QR_THRESHOLD 27
#define DC_BDIV_Q_THRESHOLD 62
-#define INV_MULMOD_BNM1_THRESHOLD 29
-#define INV_NEWTON_THRESHOLD 92
-#define INV_APPR_THRESHOLD 94
+#define INV_MULMOD_BNM1_THRESHOLD 34
+#define INV_NEWTON_THRESHOLD 91
+#define INV_APPR_THRESHOLD 91
#define BINV_NEWTON_THRESHOLD 115
-#define REDC_1_TO_REDC_N_THRESHOLD 30
+#define REDC_1_TO_REDC_N_THRESHOLD 31
#define MU_DIV_QR_THRESHOLD 551
#define MU_DIVAPPR_Q_THRESHOLD 551
-#define MUPI_DIV_QR_THRESHOLD 49
-#define MU_BDIV_QR_THRESHOLD 492
+#define MUPI_DIV_QR_THRESHOLD 50
+#define MU_BDIV_QR_THRESHOLD 474
#define MU_BDIV_Q_THRESHOLD 492
-#define MATRIX22_STRASSEN_THRESHOLD 9
-#define HGCD_THRESHOLD 55
-#define GCD_DC_THRESHOLD 150
-#define GCDEXT_DC_THRESHOLD 124
+#define MATRIX22_STRASSEN_THRESHOLD 8
+#define HGCD_THRESHOLD 53
+#define HGCD_APPR_THRESHOLD 55
+#define HGCD_REDUCE_THRESHOLD 688
+#define GCD_DC_THRESHOLD 148
+#define GCDEXT_DC_THRESHOLD 118
#define JACOBI_BASE_METHOD 1
-#define GET_STR_DC_THRESHOLD 17
+#define GET_STR_DC_THRESHOLD 16
#define GET_STR_PRECOMPUTE_THRESHOLD 27
-#define SET_STR_DC_THRESHOLD 354
+#define SET_STR_DC_THRESHOLD 375
#define SET_STR_PRECOMPUTE_THRESHOLD 812
diff --git a/mpn/powerpc64/mode64/p4/gmp-mparam.h b/mpn/powerpc64/mode64/p4/gmp-mparam.h
index 9a0932654..317bc94d6 100644
--- a/mpn/powerpc64/mode64/p4/gmp-mparam.h
+++ b/mpn/powerpc64/mode64/p4/gmp-mparam.h
@@ -29,6 +29,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 20
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 16
#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 37
@@ -43,16 +44,20 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73
#define MUL_TOOM42_TO_TOOM63_THRESHOLD 62
-#define SQR_BASECASE_THRESHOLD 5
-#define SQR_TOOM2_THRESHOLD 28
-#define SQR_TOOM3_THRESHOLD 57
-#define SQR_TOOM4_THRESHOLD 136
-#define SQR_TOOM6_THRESHOLD 181
-#define SQR_TOOM8_THRESHOLD 272
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 24
+#define SQR_TOOM3_THRESHOLD 73
+#define SQR_TOOM4_THRESHOLD 214
+#define SQR_TOOM6_THRESHOLD 254
+#define SQR_TOOM8_THRESHOLD 430
-#define MULMOD_BNM1_THRESHOLD 13
+#define MULMID_TOOM42_THRESHOLD 32
+
+#define MULMOD_BNM1_THRESHOLD 12
#define SQRMOD_BNM1_THRESHOLD 16
+#define POWM_SEC_TABLE 6,47,347,1036,2826
+
#define MUL_FFT_MODF_THRESHOLD 372 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 372, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \
@@ -116,9 +121,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_TABLE3_SIZE 103
#define SQR_FFT_THRESHOLD 2752
-#define MULLO_BASECASE_THRESHOLD 5
+#define MULLO_BASECASE_THRESHOLD 3
#define MULLO_DC_THRESHOLD 36
-#define MULLO_MUL_N_THRESHOLD 12691
+#define MULLO_MUL_N_THRESHOLD 13463
#define DC_DIV_QR_THRESHOLD 43
#define DC_DIVAPPR_Q_THRESHOLD 158
@@ -139,12 +144,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_Q_THRESHOLD 998
#define MATRIX22_STRASSEN_THRESHOLD 11
-#define HGCD_THRESHOLD 105
+#define HGCD_THRESHOLD 103
+#define HGCD_APPR_THRESHOLD 110
+#define HGCD_REDUCE_THRESHOLD 1962
#define GCD_DC_THRESHOLD 318
#define GCDEXT_DC_THRESHOLD 242
#define JACOBI_BASE_METHOD 4
#define GET_STR_DC_THRESHOLD 12
#define GET_STR_PRECOMPUTE_THRESHOLD 23
-#define SET_STR_DC_THRESHOLD 858
-#define SET_STR_PRECOMPUTE_THRESHOLD 1864
+#define SET_STR_DC_THRESHOLD 650
+#define SET_STR_PRECOMPUTE_THRESHOLD 1781
diff --git a/mpn/powerpc64/mode64/p5/gmp-mparam.h b/mpn/powerpc64/mode64/p5/gmp-mparam.h
index 827b555c8..9220f99d5 100644
--- a/mpn/powerpc64/mode64/p5/gmp-mparam.h
+++ b/mpn/powerpc64/mode64/p5/gmp-mparam.h
@@ -1,4 +1,4 @@
-/* gmp-mparam.h -- Compiler/machine parameter header file.
+/* POWER5 gmp-mparam.h -- Compiler/machine parameter header file.
Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010 Free
Software Foundation, Inc.
@@ -31,6 +31,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11
#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 40
@@ -38,22 +39,26 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MUL_TOOM33_THRESHOLD 50
#define MUL_TOOM44_THRESHOLD 121
#define MUL_TOOM6H_THRESHOLD 202
-#define MUL_TOOM8H_THRESHOLD 303
+#define MUL_TOOM8H_THRESHOLD 260
#define MUL_TOOM32_TO_TOOM43_THRESHOLD 82
#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91
#define MUL_TOOM42_TO_TOOM53_THRESHOLD 81
#define MUL_TOOM42_TO_TOOM63_THRESHOLD 88
-#define SQR_BASECASE_THRESHOLD 9
-#define SQR_TOOM2_THRESHOLD 36
-#define SQR_TOOM3_THRESHOLD 59
-#define SQR_TOOM4_THRESHOLD 147
-#define SQR_TOOM6_THRESHOLD 204
-#define SQR_TOOM8_THRESHOLD 288
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 24
+#define SQR_TOOM3_THRESHOLD 73
+#define SQR_TOOM4_THRESHOLD 142
+#define SQR_TOOM6_THRESHOLD 191
+#define SQR_TOOM8_THRESHOLD 284
-#define MULMOD_BNM1_THRESHOLD 14
-#define SQRMOD_BNM1_THRESHOLD 16
+#define MULMID_TOOM42_THRESHOLD 32
+
+#define MULMOD_BNM1_THRESHOLD 12
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define POWM_SEC_TABLE 4,35,387,1068,2699
#define MUL_FFT_MODF_THRESHOLD 348 /* k = 5 */
#define MUL_FFT_TABLE3 \
@@ -166,15 +171,15 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_THRESHOLD 2752
#define MULLO_BASECASE_THRESHOLD 0
-#define MULLO_DC_THRESHOLD 31
+#define MULLO_DC_THRESHOLD 42
#define MULLO_MUL_N_THRESHOLD 6633
-#define DC_DIV_QR_THRESHOLD 37
+#define DC_DIV_QR_THRESHOLD 43
#define DC_DIVAPPR_Q_THRESHOLD 155
#define DC_BDIV_QR_THRESHOLD 46
-#define DC_BDIV_Q_THRESHOLD 112
+#define DC_BDIV_Q_THRESHOLD 120
-#define INV_MULMOD_BNM1_THRESHOLD 26
+#define INV_MULMOD_BNM1_THRESHOLD 52
#define INV_NEWTON_THRESHOLD 177
#define INV_APPR_THRESHOLD 165
@@ -189,11 +194,13 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MATRIX22_STRASSEN_THRESHOLD 15
#define HGCD_THRESHOLD 108
-#define GCD_DC_THRESHOLD 303
+#define HGCD_APPR_THRESHOLD 113
+#define HGCD_REDUCE_THRESHOLD 2121
+#define GCD_DC_THRESHOLD 315
#define GCDEXT_DC_THRESHOLD 237
#define JACOBI_BASE_METHOD 4
#define GET_STR_DC_THRESHOLD 13
#define GET_STR_PRECOMPUTE_THRESHOLD 23
-#define SET_STR_DC_THRESHOLD 532
-#define SET_STR_PRECOMPUTE_THRESHOLD 1639
+#define SET_STR_DC_THRESHOLD 650
+#define SET_STR_PRECOMPUTE_THRESHOLD 1585
diff --git a/mpn/powerpc64/mode64/p6/aorsmul_1.asm b/mpn/powerpc64/mode64/p6/aorsmul_1.asm
new file mode 100644
index 000000000..4bd508488
--- /dev/null
+++ b/mpn/powerpc64/mode64/p6/aorsmul_1.asm
@@ -0,0 +1,172 @@
+dnl PowerPC-64 mpn_addmul_1 and mpn_submul_1 optimised for power6.
+
+dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010, 2011
+dnl Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mpn_addmul_1 mpn_submul_1
+C cycles/limb cycles/limb
+C POWER3/PPC630 ? ?
+C POWER4/PPC970 ? ?
+C POWER5 ? ?
+C POWER6 12.25 12.8
+C POWER7 ? ?
+
+C TODO
+C * Reduce register usage.
+C * Schedule function entry code.
+C * Unroll more. 8-way unrolling would bring us to 10 c/l, 16-way unrolling
+C would bring us to 9 c/l.
+C * Handle n = 1 and perhaps n = 2 separately, without saving any registers.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`v0', `r6')
+
+ifdef(`OPERATION_addmul_1',`
+ define(ADDSUBC, adde)
+ define(ADDSUB, addc)
+ define(func, mpn_addmul_1)
+ define(func_nc, mpn_addmul_1c) C FIXME: not really supported
+ define(AM, `$1')
+ define(SM, `')
+ define(CLRRSC, `addic $1, r0, 0')
+')
+ifdef(`OPERATION_submul_1',`
+ define(ADDSUBC, subfe)
+ define(ADDSUB, subfc)
+ define(func, mpn_submul_1)
+ define(func_nc, mpn_submul_1c) C FIXME: not really supported
+ define(AM, `')
+ define(SM, `$1')
+ define(CLRRSC, `subfc $1, r0, r0')
+')
+
+ASM_START()
+PROLOGUE(func)
+ std r31, -8(r1)
+ std r30, -16(r1)
+ std r29, -24(r1)
+ std r28, -32(r1)
+ std r27, -40(r1)
+
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2
+ addi n, n, 3 C compute count...
+ srdi n, n, 2 C ...for ctr
+ mtctr n C copy loop count into ctr
+ beq cr0, L(b0)
+ blt cr6, L(b1)
+ beq cr6, L(b2)
+
+L(b3): ld r8, 0(up)
+ ld r7, 8(up)
+ ld r27, 16(up)
+ addi up, up, 16
+ addi rp, rp, 16
+ mulld r5, r8, v0
+ mulhdu r8, r8, v0
+ mulld r9, r7, v0
+ mulhdu r7, r7, v0
+ mulld r11, r27, v0
+ mulhdu r27, r27, v0
+ ld r29, -16(rp)
+ ld r30, -8(rp)
+ ld r31, 0(rp)
+ addc r9, r9, r8
+ adde r11, r11, r7
+ addze r12, r27
+ ADDSUB r5, r5, r29
+ b L(l3)
+
+L(b2): ld r7, 0(up)
+ ld r27, 8(up)
+ addi up, up, 8
+ addi rp, rp, 8
+ mulld r9, r7, v0
+ mulhdu r7, r7, v0
+ mulld r11, r27, v0
+ mulhdu r27, r27, v0
+ ld r30, -8(rp)
+ ld r31, 0(rp)
+ addc r11, r11, r7
+ addze r12, r27
+ ADDSUB r9, r9, r30
+ b L(l2)
+
+L(b1): ld r27, 0(up)
+ ld r31, 0(rp)
+ mulld r11, r27, v0
+ mulhdu r12, r27, v0
+ ADDSUB r11, r11, r31
+ b L(l1)
+
+L(b0): addi up, up, -8
+ addi rp, rp, -8
+ CLRRSC( r12) C clear r12 and clr/set cy
+
+ ALIGN(32)
+L(top):
+SM(` subfe r11, r0, r0') C complement...
+SM(` addic r11, r11, 1') C ...carry flag
+ ld r10, 8(up)
+ ld r8, 16(up)
+ ld r7, 24(up)
+ ld r27, 32(up)
+ addi up, up, 32
+ addi rp, rp, 32
+ mulld r0, r10, v0
+ mulhdu r10, r10, v0
+ mulld r5, r8, v0
+ mulhdu r8, r8, v0
+ mulld r9, r7, v0
+ mulhdu r7, r7, v0
+ mulld r11, r27, v0
+ mulhdu r27, r27, v0
+ ld r28, -24(rp)
+ adde r0, r0, r12
+ ld r29, -16(rp)
+ adde r5, r5, r10
+ ld r30, -8(rp)
+ ld r31, 0(rp)
+ adde r9, r9, r8
+ adde r11, r11, r7
+ addze r12, r27
+ ADDSUB r0, r0, r28
+ std r0, -24(rp)
+ ADDSUBC r5, r5, r29
+L(l3): std r5, -16(rp)
+ ADDSUBC r9, r9, r30
+L(l2): std r9, -8(rp)
+ ADDSUBC r11, r11, r31
+L(l1): std r11, 0(rp)
+ bdnz L(top)
+
+AM(` addze r3, r12')
+SM(` subfe r11, r0, r0') C complement...
+ ld r31, -8(r1)
+SM(` subf r3, r11, r12')
+ ld r30, -16(r1)
+ ld r29, -24(r1)
+ ld r28, -32(r1)
+ ld r27, -40(r1)
+ blr
+EPILOGUE()
diff --git a/mpn/powerpc64/mode64/p6/gmp-mparam.h b/mpn/powerpc64/mode64/p6/gmp-mparam.h
index d447b56d9..5392138f1 100644
--- a/mpn/powerpc64/mode64/p6/gmp-mparam.h
+++ b/mpn/powerpc64/mode64/p6/gmp-mparam.h
@@ -1,7 +1,7 @@
-/* gmp-mparam.h -- Compiler/machine parameter header file.
+/* POWER6 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010 Free
-Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010, 2011
+Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@@ -31,6 +31,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 5
#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 21
@@ -38,23 +39,27 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MUL_TOOM33_THRESHOLD 50
#define MUL_TOOM44_THRESHOLD 112
#define MUL_TOOM6H_THRESHOLD 274
-#define MUL_TOOM8H_THRESHOLD 430
+#define MUL_TOOM8H_THRESHOLD 339
#define MUL_TOOM32_TO_TOOM43_THRESHOLD 62
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 84
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 76
#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 66
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 78
-#define SQR_BASECASE_THRESHOLD 9
-#define SQR_TOOM2_THRESHOLD 30
-#define SQR_TOOM3_THRESHOLD 53
-#define SQR_TOOM4_THRESHOLD 148
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 24
+#define SQR_TOOM3_THRESHOLD 49
+#define SQR_TOOM4_THRESHOLD 136
#define SQR_TOOM6_THRESHOLD 226
-#define SQR_TOOM8_THRESHOLD 430
+#define SQR_TOOM8_THRESHOLD 393
+
+#define MULMID_TOOM42_THRESHOLD 36
#define MULMOD_BNM1_THRESHOLD 14
#define SQRMOD_BNM1_THRESHOLD 14
+#define POWM_SEC_TABLE 4,23,213,840,2618
+
#define MUL_FFT_MODF_THRESHOLD 340 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
@@ -106,34 +111,36 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_THRESHOLD 2368
#define MULLO_BASECASE_THRESHOLD 5
-#define MULLO_DC_THRESHOLD 28
-#define MULLO_MUL_N_THRESHOLD 6633
-
-#define DC_DIV_QR_THRESHOLD 27
-#define DC_DIVAPPR_Q_THRESHOLD 112
-#define DC_BDIV_QR_THRESHOLD 29
-#define DC_BDIV_Q_THRESHOLD 86
-
-#define INV_MULMOD_BNM1_THRESHOLD 47
-#define INV_NEWTON_THRESHOLD 93
-#define INV_APPR_THRESHOLD 91
-
-#define BINV_NEWTON_THRESHOLD 132
-#define REDC_1_TO_REDC_N_THRESHOLD 39
-
-#define MU_DIV_QR_THRESHOLD 855
-#define MU_DIVAPPR_Q_THRESHOLD 807
-#define MUPI_DIV_QR_THRESHOLD 33
-#define MU_BDIV_QR_THRESHOLD 807
-#define MU_BDIV_Q_THRESHOLD 872
-
-#define MATRIX22_STRASSEN_THRESHOLD 11
-#define HGCD_THRESHOLD 64
-#define GCD_DC_THRESHOLD 237
-#define GCDEXT_DC_THRESHOLD 183
+#define MULLO_DC_THRESHOLD 61
+#define MULLO_MUL_N_THRESHOLD 3271
+
+#define DC_DIV_QR_THRESHOLD 59
+#define DC_DIVAPPR_Q_THRESHOLD 200
+#define DC_BDIV_QR_THRESHOLD 70
+#define DC_BDIV_Q_THRESHOLD 168
+
+#define INV_MULMOD_BNM1_THRESHOLD 61
+#define INV_NEWTON_THRESHOLD 166
+#define INV_APPR_THRESHOLD 166
+
+#define BINV_NEWTON_THRESHOLD 222
+#define REDC_1_TO_REDC_N_THRESHOLD 63
+
+#define MU_DIV_QR_THRESHOLD 998
+#define MU_DIVAPPR_Q_THRESHOLD 979
+#define MUPI_DIV_QR_THRESHOLD 59
+#define MU_BDIV_QR_THRESHOLD 889
+#define MU_BDIV_Q_THRESHOLD 1078
+
+#define MATRIX22_STRASSEN_THRESHOLD 13
+#define HGCD_THRESHOLD 109
+#define HGCD_APPR_THRESHOLD 108
+#define HGCD_REDUCE_THRESHOLD 1052
+#define GCD_DC_THRESHOLD 501
+#define GCDEXT_DC_THRESHOLD 249
#define JACOBI_BASE_METHOD 4
-#define GET_STR_DC_THRESHOLD 17
-#define GET_STR_PRECOMPUTE_THRESHOLD 27
+#define GET_STR_DC_THRESHOLD 16
+#define GET_STR_PRECOMPUTE_THRESHOLD 29
#define SET_STR_DC_THRESHOLD 532
-#define SET_STR_PRECOMPUTE_THRESHOLD 1648
+#define SET_STR_PRECOMPUTE_THRESHOLD 1639
diff --git a/mpn/powerpc64/mode64/p6/mul_basecase.asm b/mpn/powerpc64/mode64/p6/mul_basecase.asm
index 427d6081a..52c5af8ff 100644
--- a/mpn/powerpc64/mode64/p6/mul_basecase.asm
+++ b/mpn/powerpc64/mode64/p6/mul_basecase.asm
@@ -1,4 +1,4 @@
-dnl PowerPC-64 mpn_basecase.
+dnl PowerPC-64 mpn_mul_basecase.
dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010 Free
dnl Software Foundation, Inc.
diff --git a/mpn/powerpc64/mode64/p7/gmp-mparam.h b/mpn/powerpc64/mode64/p7/gmp-mparam.h
new file mode 100644
index 000000000..02603c525
--- /dev/null
+++ b/mpn/powerpc64/mode64/p7/gmp-mparam.h
@@ -0,0 +1,159 @@
+/* POWER7 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010, 2011
+Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define BYTES_PER_MP_LIMB 8
+
+/* 3550 MHz POWER7 (gcc110.fsffrance.org) */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 6
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 7
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13
+#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 28
+
+#define MUL_TOOM22_THRESHOLD 22
+#define MUL_TOOM33_THRESHOLD 73
+#define MUL_TOOM44_THRESHOLD 202
+#define MUL_TOOM6H_THRESHOLD 298
+#define MUL_TOOM8H_THRESHOLD 406
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 143
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 135
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 141
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 36
+#define SQR_TOOM3_THRESHOLD 109
+#define SQR_TOOM4_THRESHOLD 202
+#define SQR_TOOM6_THRESHOLD 303
+#define SQR_TOOM8_THRESHOLD 399
+
+#define MULMID_TOOM42_THRESHOLD 62
+
+#define MULMOD_BNM1_THRESHOLD 15
+#define SQRMOD_BNM1_THRESHOLD 16
+
+#define POWM_SEC_TABLE 6,65,342,1465
+
+#define MUL_FFT_MODF_THRESHOLD 436 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 436, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 31, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \
+ { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 21, 9}, { 11, 8}, { 29, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \
+ { 63, 9}, { 43,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
+ { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
+ { 31,11}, { 63,10}, { 135,11}, { 79,10}, \
+ { 159,11}, { 95,10}, { 191,11}, { 111,12}, \
+ { 63,11}, { 127,10}, { 255,11}, { 143,10}, \
+ { 287, 9}, { 575,10}, { 303,11}, { 159,12}, \
+ { 95,11}, { 191,10}, { 383,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
+ { 543, 9}, { 1087,11}, { 287,10}, { 575,11}, \
+ { 303,12}, { 159,11}, { 319,10}, { 639,11}, \
+ { 335,10}, { 671,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
+ { 831,12}, { 223,11}, { 447,13}, { 8192,14}, \
+ { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 106
+#define MUL_FFT_THRESHOLD 4736
+
+#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 308, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \
+ { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \
+ { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \
+ { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \
+ { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \
+ { 639,11}, { 175,12}, { 95,11}, { 191,10}, \
+ { 383, 9}, { 767,11}, { 207,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
+ { 543,11}, { 287,10}, { 575,11}, { 303,12}, \
+ { 159,11}, { 319,10}, { 639, 9}, { 1279,10}, \
+ { 671,11}, { 351,10}, { 703,12}, { 191,11}, \
+ { 383,10}, { 767,11}, { 415,10}, { 831,12}, \
+ { 223,11}, { 447,10}, { 895,11}, { 479,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 103
+#define SQR_FFT_THRESHOLD 3264
+
+#define MULLO_BASECASE_THRESHOLD 3
+#define MULLO_DC_THRESHOLD 23
+#define MULLO_MUL_N_THRESHOLD 9174
+
+#define DC_DIV_QR_THRESHOLD 30
+#define DC_DIVAPPR_Q_THRESHOLD 124
+#define DC_BDIV_QR_THRESHOLD 66
+#define DC_BDIV_Q_THRESHOLD 160
+
+#define INV_MULMOD_BNM1_THRESHOLD 81
+#define INV_NEWTON_THRESHOLD 165
+#define INV_APPR_THRESHOLD 133
+
+#define BINV_NEWTON_THRESHOLD 300
+#define REDC_1_TO_REDC_N_THRESHOLD 76
+
+#define MU_DIV_QR_THRESHOLD 1470
+#define MU_DIVAPPR_Q_THRESHOLD 1442
+#define MUPI_DIV_QR_THRESHOLD 58
+#define MU_BDIV_QR_THRESHOLD 1470
+#define MU_BDIV_Q_THRESHOLD 1499
+
+#define MATRIX22_STRASSEN_THRESHOLD 15
+#define HGCD_THRESHOLD 124
+#define HGCD_APPR_THRESHOLD 155
+#define HGCD_REDUCE_THRESHOLD 3134
+#define GCD_DC_THRESHOLD 492
+#define GCDEXT_DC_THRESHOLD 333
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 11
+#define GET_STR_PRECOMPUTE_THRESHOLD 17
+#define SET_STR_DC_THRESHOLD 1517
+#define SET_STR_PRECOMPUTE_THRESHOLD 3421
diff --git a/mpn/powerpc64/mode64/rsh1add_n.asm b/mpn/powerpc64/mode64/rsh1add_n.asm
index 8af3ca774..2a5ef3060 100644
--- a/mpn/powerpc64/mode64/rsh1add_n.asm
+++ b/mpn/powerpc64/mode64/rsh1add_n.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 2 (1.5 c/l should be possible)
-C POWER4/PPC970 4 (2.0 c/l should be possible)
-C POWER5 3.5 (2.0 c/l should be possible)
-C POWER6 4.5
+C cycles/limb
+C POWER3/PPC630 2 (1.5 c/l should be possible)
+C POWER4/PPC970 4 (2.0 c/l should be possible)
+C POWER5 3.5 (2.0 c/l should be possible)
+C POWER6 4.5
+C POWER7 3.5
define(`rp',`r3')
define(`up',`r4')
diff --git a/mpn/powerpc64/mode64/rsh1sub_n.asm b/mpn/powerpc64/mode64/rsh1sub_n.asm
index 1faa03379..b10eb8ab7 100644
--- a/mpn/powerpc64/mode64/rsh1sub_n.asm
+++ b/mpn/powerpc64/mode64/rsh1sub_n.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 2 (1.5 c/l should be possible)
-C POWER4/PPC970 4 (2.0 c/l should be possible)
-C POWER5 3.5 (2.0 c/l should be possible)
-C POWER6 4.5
+C cycles/limb
+C POWER3/PPC630 2 (1.5 c/l should be possible)
+C POWER4/PPC970 4 (2.0 c/l should be possible)
+C POWER5 3.5 (2.0 c/l should be possible)
+C POWER6 4.5
+C POWER7 3.5
define(`rp',`r3')
define(`up',`r4')
diff --git a/mpn/powerpc64/mode64/sqr_basecase.asm b/mpn/powerpc64/mode64/sqr_basecase.asm
new file mode 100644
index 000000000..72ac2d318
--- /dev/null
+++ b/mpn/powerpc64/mode64/sqr_basecase.asm
@@ -0,0 +1,852 @@
+dnl PowerPC-64 mpn_sqr_basecase.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010, 2011 Free
+dnl Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 6-18
+C POWER4/PPC970 8
+C POWER5 8
+C POWER6 16.25
+C POWER7 3.77
+
+C NOTES
+C * This is very crude, cleanup!
+C * Try to reduce the number of needed live registers.
+C * Rewrite for POWER6 to use 8 consecutive muls, not 2 groups of 4. The
+C cost will be more live registers.
+C * Rewrite for POWER7 to use addmul_2 building blocks; this will reduce code
+C size a lot and speed things up perhaps 25%.
+C * Use computed goto in order to compress the code.
+C * Implement a larger final corner.
+C * Schedule callee-saves register saves into other insns. This could save
+C about 5 cycles/call. (We cannot analogously optimise the restores, since
+C the sqr_diag_addlsh1 loop has no wind-down code as currently written.)
+C * Should the alternating std/adde sequences be split? Some pipelines handle
+C adde poorly, and might sequentialise all these instructions.
+C * The sqr_diag_addlsh1 loop was written for POWER6 and its preferences for
+C adjacent integer multiply insns. Except for the multiply insns, the code
+C was not carefully optimised for POWER6 or any other CPU.
+C * Perform cross-jumping in sqr_diag_addlsh1's feed-in code, into the loop.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+
+define(`rp_outer', `r25')
+define(`up_outer', `r21')
+define(`rp_saved', `r22')
+define(`up_saved', `r23')
+define(`n_saved', `r24')
+
+ASM_START()
+PROLOGUE(mpn_sqr_basecase)
+ cmpdi cr0, n, 2 C cr0 is tested here and again at L(ge2)
+ bge cr0, L(ge2) C n >= 2 is handled further down
+ ld r5, 0(up) C n = 1 (r5 held n, which is no longer needed)
+ nop
+ mulld r8, r5, r5 C weight 0
+ mulhdu r9, r5, r5 C weight 1
+ std r8, 0(rp)
+ std r9, 8(rp)
+ blr
+ ALIGN(16)
+L(ge2): bgt cr0, L(gt2) C n > 2 takes the general path
+ ld r0, 0(up) C n = 2
+ nop
+ mulld r8, r0, r0 C u0 * u0, low limb
+ mulhdu r9, r0, r0 C u0 * u0, high limb
+ ld r6, 8(up)
+ mulld r10, r6, r6 C u1 * u1, low limb
+ mulhdu r11, r6, r6 C u1 * u1, high limb
+ mulld r4, r6, r0 C u1 * u0, low limb
+ mulhdu r5, r6, r0 C u1 * u0, high limb
+ addc r4, r4, r4 C double the cross product...
+ adde r5, r5, r5
+ addze r11, r11 C ...carry out goes into the top limb
+ addc r9, r9, r4 C add doubled cross product at weight 1
+ adde r10, r10, r5 C ...and at weight 2
+ addze r11, r11 C propagate carry into weight 3
+ std r8, 0(rp)
+ std r9, 8(rp)
+ std r10, 16(rp)
+ std r11, 24(rp)
+ blr
+
+ ALIGN(16)
+L(gt2): std r31, -8(r1) C n > 2: save callee-saves registers r21-r31
+ std r30, -16(r1)
+ std r29, -24(r1)
+ std r28, -32(r1)
+ std r27, -40(r1)
+ std r26, -48(r1)
+ std r25, -56(r1)
+ std r24, -64(r1)
+ std r23, -72(r1)
+ std r22, -80(r1)
+ std r21, -88(r1)
+
+ mr rp_saved, rp C original rp, up and n are read back after L(outer_end)
+ mr up_saved, up
+ mr n_saved, n
+ mr rp_outer, rp C rp_outer and up_outer are stepped at each outer loop entry
+ mr up_outer, up
+
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2 C cr6 tells n & 3 = 1, 2 and 3 apart
+ addic r7, n, 2 C compute count...
+ srdi r7, r7, 2 C ...for ctr
+ mtctr r7 C copy count into ctr
+ beq- cr0, L(b0) C n & 3 = 0
+ blt- cr6, L(b1) C n & 3 = 1
+ beq- cr6, L(b2) C n & 3 = 2, while n & 3 = 3 falls through to L(b3)
+
+L(b3): ld r6, 0(up)
+ ld r9, 8(up)
+ ld r27, 16(up)
+ addi up, up, 24
+ li r12, 0 C carry limb
+ bdz L(em3)
+
+ ALIGN(16)
+L(tm3): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 0(up)
+ ld r27, 8(up)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6
+ ld r9, 16(up)
+ ld r27, 24(up)
+ std r0, 8(rp)
+ adde r26, r26, r8
+ std r7, 16(rp)
+ adde r11, r11, r10
+ std r26, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ bdnz L(tm3)
+
+L(em3): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ adde r0, r0, r12
+ adde r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addze r8, r8
+ std r8, 24(rp)
+ addi n, n, 2
+ b L(outer_loop)
+
+L(b0): ld r6, 0(up)
+ ld r27, 8(up)
+ mulld r7, r27, r6
+ mulhdu r12, r27, r6
+ std r7, 8(rp)
+ addi rp, rp, 8
+ ld r9, 16(up)
+ ld r27, 24(up)
+ addi up, up, 32
+ bdz L(em0)
+
+ ALIGN(16)
+L(tm0): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 0(up)
+ ld r27, 8(up)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6
+ ld r9, 16(up)
+ ld r27, 24(up)
+ std r0, 8(rp)
+ adde r26, r26, r8
+ std r7, 16(rp)
+ adde r11, r11, r10
+ std r26, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ bdnz L(tm0)
+
+L(em0): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ adde r0, r0, r12
+ adde r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addze r8, r8
+ std r8, 24(rp)
+ addi n, n, 2
+ b L(outer_loop_ent_2)
+
+L(b1): ld r6, 0(up)
+ ld r9, 8(up)
+ ld r27, 16(up)
+ mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r12, r27, r6
+ addc r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addi rp, rp, 16
+ ld r9, 24(up)
+ ld r27, 32(up)
+ addi up, up, 40
+ bdz L(em1)
+
+ ALIGN(16)
+L(tm1): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 0(up)
+ ld r27, 8(up)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6
+ ld r9, 16(up)
+ ld r27, 24(up)
+ std r0, 8(rp)
+ adde r26, r26, r8
+ std r7, 16(rp)
+ adde r11, r11, r10
+ std r26, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ bdnz L(tm1)
+
+L(em1): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ adde r0, r0, r12
+ adde r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addze r8, r8
+ std r8, 24(rp)
+ addi n, n, 2
+ b L(outer_loop_ent_3)
+
+L(b2): addi r7, r7, -1 C FIXME
+ mtctr r7 C FIXME
+ ld r6, 0(up)
+ ld r9, 8(up)
+ ld r27, 16(up)
+ mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 24(up)
+ mulld r11, r9, r6
+ mulhdu r10, r9, r6
+ addc r7, r7, r26
+ adde r11, r11, r8
+ addze r12, r10
+ std r0, 8(rp)
+ std r7, 16(rp)
+ std r11, 24(rp)
+ addi rp, rp, 24
+ ld r9, 32(up)
+ ld r27, 40(up)
+ addi up, up, 48
+ bdz L(em2)
+
+ ALIGN(16)
+L(tm2): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 0(up)
+ ld r27, 8(up)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6
+ ld r9, 16(up)
+ ld r27, 24(up)
+ std r0, 8(rp)
+ adde r26, r26, r8
+ std r7, 16(rp)
+ adde r11, r11, r10
+ std r26, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ bdnz L(tm2)
+
+L(em2): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ adde r0, r0, r12
+ adde r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addze r8, r8
+ std r8, 24(rp)
+ addi n, n, 2
+ b L(outer_loop_ent_0)
+
+
+L(outer_loop):
+ addi n, n, -1
+ addi up_outer, up_outer, 8
+ addi rp_outer, rp_outer, 16
+
+ mr up, up_outer
+ addi rp, rp_outer, 8
+
+ srdi r0, n, 2
+ mtctr r0
+
+ bdz L(outer_end)
+
+ ld r6, 0(up)
+ ld r9, 8(up)
+ ld r27, 16(up)
+ mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 24(up)
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ ld r30, 16(rp)
+ mulld r11, r9, r6
+ mulhdu r10, r9, r6
+ addc r7, r7, r26
+ adde r11, r11, r8
+ addze r12, r10
+ addc r0, r0, r28
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ adde r11, r11, r30
+ std r11, 16(rp)
+ addi rp, rp, 24
+ ld r9, 32(up)
+ ld r27, 40(up)
+ addi up, up, 48
+ bdz L(ea1)
+
+ ALIGN(16)
+L(ta1): mulld r0, r9, r6
+ mulhdu r26, r9, r6 C 9
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
+ ld r27, 8(up)
+ ld r29, 8(rp)
+ adde r0, r0, r12 C 0 12
+ adde r7, r7, r26 C 5 7
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6 C 9
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
+ ld r27, 24(up)
+ ld r31, 24(rp)
+ adde r26, r26, r8 C 8 5
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 0(rp) C 0
+ adde r7, r7, r29 C 7 29
+ std r7, 8(rp) C 7
+ adde r26, r26, r30 C 5 30
+ std r26, 16(rp) C 5
+ adde r11, r11, r31 C 11 31
+ std r11, 24(rp) C 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(ta1)
+
+L(ea1): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ addze r8, r8
+ addc r0, r0, r28
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addze r8, r8
+ std r8, 16(rp)
+
+L(outer_loop_ent_0):
+ addi n, n, -1
+ addi up_outer, up_outer, 8
+ addi rp_outer, rp_outer, 16
+
+ mr up, up_outer
+ addi rp, rp_outer, 8
+
+ srdi r0, n, 2
+ mtctr r0
+
+ ld r6, 0(up)
+ ld r9, 8(up)
+ ld r27, 16(up)
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ addc r0, r0, r28
+ adde r7, r7, r26
+ addze r12, r8
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addi rp, rp, 16
+ ld r9, 24(up)
+ ld r27, 32(up)
+ addi up, up, 40
+ bdz L(ea0)
+
+ ALIGN(16)
+L(ta0): mulld r0, r9, r6
+ mulhdu r26, r9, r6 C 9
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
+ ld r27, 8(up)
+ ld r29, 8(rp)
+ adde r0, r0, r12 C 0 12
+ adde r7, r7, r26 C 5 7
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6 C 9
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
+ ld r27, 24(up)
+ ld r31, 24(rp)
+ adde r26, r26, r8 C 8 5
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 0(rp) C 0
+ adde r7, r7, r29 C 7 29
+ std r7, 8(rp) C 7
+ adde r26, r26, r30 C 5 30
+ std r26, 16(rp) C 5
+ adde r11, r11, r31 C 11 31
+ std r11, 24(rp) C 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(ta0)
+
+L(ea0): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ addze r8, r8
+ addc r0, r0, r28
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addze r8, r8
+ std r8, 16(rp)
+
+L(outer_loop_ent_3):
+ addi n, n, -1
+ addi up_outer, up_outer, 8
+ addi rp_outer, rp_outer, 16
+
+ mr up, up_outer
+ addi rp, rp_outer, 8
+
+ srdi r0, n, 2
+ mtctr r0
+
+ ld r6, 0(up)
+ ld r9, 8(up)
+ ld r28, 0(rp)
+ mulld r0, r9, r6
+ mulhdu r12, r9, r6
+ addc r0, r0, r28
+ std r0, 0(rp)
+ addi rp, rp, 8
+ ld r9, 16(up)
+ ld r27, 24(up)
+ addi up, up, 32
+ bdz L(ea3)
+
+ ALIGN(16)
+L(ta3): mulld r0, r9, r6
+ mulhdu r26, r9, r6 C 9
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
+ ld r27, 8(up)
+ ld r29, 8(rp)
+ adde r0, r0, r12 C 0 12
+ adde r7, r7, r26 C 5 7
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6 C 9
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
+ ld r27, 24(up)
+ ld r31, 24(rp)
+ adde r26, r26, r8 C 8 5
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 0(rp) C 0
+ adde r7, r7, r29 C 7 29
+ std r7, 8(rp) C 7
+ adde r26, r26, r30 C 5 30
+ std r26, 16(rp) C 5
+ adde r11, r11, r31 C 11 31
+ std r11, 24(rp) C 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(ta3)
+
+L(ea3): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ addze r8, r8
+ addc r0, r0, r28
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addze r8, r8
+ std r8, 16(rp)
+
+
+L(outer_loop_ent_2):
+ addi n, n, -1
+ addi up_outer, up_outer, 8
+ addi rp_outer, rp_outer, 16
+
+ mr up, up_outer
+ addi rp, rp_outer, 8
+
+ srdi r0, n, 2
+ mtctr r0
+
+ addic r0, r0, 0
+ li r12, 0 C cy_limb = 0
+ ld r6, 0(up)
+ ld r9, 8(up)
+ ld r27, 16(up)
+ bdz L(ea2)
+ addi up, up, 24
+
+ ALIGN(16)
+L(ta2): mulld r0, r9, r6
+ mulhdu r26, r9, r6 C 9
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
+ ld r27, 8(up)
+ ld r29, 8(rp)
+ adde r0, r0, r12 C 0 12
+ adde r7, r7, r26 C 5 7
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6 C 9
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
+ ld r27, 24(up)
+ ld r31, 24(rp)
+ adde r26, r26, r8 C 8 5
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 0(rp) C 0
+ adde r7, r7, r29 C 7 29
+ std r7, 8(rp) C 7
+ adde r26, r26, r30 C 5 30
+ std r26, 16(rp) C 5
+ adde r11, r11, r31 C 11 31
+ std r11, 24(rp) C 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(ta2)
+
+L(ea2): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ addze r8, r8
+ addc r0, r0, r28
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addze r8, r8
+ std r8, 16(rp)
+
+ b L(outer_loop)
+
+L(outer_end):
+ ld r6, 0(up) C final pass is a single product of 0(up) and 8(up)
+ ld r9, 8(up)
+ ld r11, 0(rp)
+ mulld r0, r9, r6
+ mulhdu r8, r9, r6
+ addc r0, r0, r11 C low limb is accumulated into 0(rp)
+ std r0, 0(rp)
+ addze r8, r8 C high limb plus carry
+ std r8, 8(rp) C stored without reading the old 8(rp)
+
+define(`rp', `rp_saved')
+define(`up', `r5')
+define(`n', `r6')
+define(`climb', `r0')
+
+ addi r4, rp_saved, 8 C NOTE(review): r4 is not read again below, looks like a leftover of the tp argument of the removed mpn_sqr_diag_addlsh1, confirm
+ mr r5, up_saved C up is now r5 and restarts at the original up
+ mr r6, n_saved C n is now r6 and holds the original n
+
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2 C cr6 tells n & 3 = 1, 2 and 3 apart
+ addi n, n, 2 C compute count...
+ srdi n, n, 2 C ...for ctr
+ mtctr n C put loop count into ctr
+ beq cr0, L(xb0) C n & 3 = 0
+ blt cr6, L(xb1) C n & 3 = 1
+ beq cr6, L(xb2) C n & 3 = 2, while n & 3 = 3 falls through to L(xb3)
+
+L(xb3): ld r6, 0(up)
+ ld r7, 8(up)
+ ld r12, 16(up)
+ addi up, up, 24
+ mulld r24, r6, r6
+ mulhdu r25, r6, r6
+ mulld r26, r7, r7
+ mulhdu r27, r7, r7
+ mulld r28, r12, r12
+ mulhdu r29, r12, r12
+ ld r10, 8(rp)
+ ld r11, 16(rp)
+ ld r6, 24(rp)
+ ld r7, 32(rp)
+ addc r10, r10, r10
+ adde r11, r11, r11
+ adde r6, r6, r6
+ adde r7, r7, r7
+ addze climb, r29
+ addc r10, r10, r25
+ adde r11, r11, r26
+ adde r6, r6, r27
+ adde r7, r7, r28
+ std r24, 0(rp)
+ std r10, 8(rp)
+ std r11, 16(rp)
+ std r6, 24(rp)
+ std r7, 32(rp)
+ addi rp, rp, 40
+ bdnz L(top)
+ b L(end)
+
+L(xb2): ld r6, 0(up)
+ ld r7, 8(up)
+ addi up, up, 16
+ mulld r24, r6, r6
+ mulhdu r25, r6, r6
+ mulld r26, r7, r7
+ mulhdu r27, r7, r7
+ ld r10, 8(rp)
+ ld r11, 16(rp)
+ addc r10, r10, r10
+ adde r11, r11, r11
+ addze climb, r27
+ addc r10, r10, r25
+ adde r11, r11, r26
+ std r24, 0(rp)
+ std r10, 8(rp)
+ std r11, 16(rp)
+ addi rp, rp, 24
+ bdnz L(top)
+ b L(end)
+
+L(xb0): ld r6, 0(up)
+ ld r7, 8(up)
+ ld r12, 16(up)
+ ld r23, 24(up)
+ addi up, up, 32
+ mulld r24, r6, r6
+ mulhdu r25, r6, r6
+ mulld r26, r7, r7
+ mulhdu r27, r7, r7
+ mulld r28, r12, r12
+ mulhdu r29, r12, r12
+ mulld r30, r23, r23
+ mulhdu r31, r23, r23
+ ld r10, 8(rp)
+ ld r11, 16(rp)
+ ld r6, 24(rp)
+ ld r7, 32(rp)
+ ld r12, 40(rp)
+ ld r23, 48(rp)
+ addc r10, r10, r10
+ adde r11, r11, r11
+ adde r6, r6, r6
+ adde r7, r7, r7
+ adde r12, r12, r12
+ adde r23, r23, r23
+ addze climb, r31
+ std r24, 0(rp)
+ addc r10, r10, r25
+ std r10, 8(rp)
+ adde r11, r11, r26
+ std r11, 16(rp)
+ adde r6, r6, r27
+ std r6, 24(rp)
+ adde r7, r7, r28
+ std r7, 32(rp)
+ adde r12, r12, r29
+ std r12, 40(rp)
+ adde r23, r23, r30
+ std r23, 48(rp)
+ addi rp, rp, 56
+ bdnz L(top)
+ b L(end)
+
+L(xb1): ld r6, 0(up) C n & 3 = 1: square one limb, then fall into L(top)
+ addi up, up, 8
+ mulld r24, r6, r6
+ mulhdu climb, r6, r6 C high limb is handed to L(top) in climb
+ std r24, 0(rp)
+ addic rp, rp, 8 C clear carry as side-effect
+
+ ALIGN(32)
+L(top): ld r6, 0(up) C four source limbs per iteration
+ ld r7, 8(up)
+ ld r12, 16(up)
+ ld r23, 24(up)
+ addi up, up, 32
+ mulld r24, r6, r6 C squares: low limbs in r24, r26, r28, r30
+ mulhdu r25, r6, r6 C ...high limbs in r25, r27, r29, r31
+ mulld r26, r7, r7
+ mulhdu r27, r7, r7
+ mulld r28, r12, r12
+ mulhdu r29, r12, r12
+ mulld r30, r23, r23
+ mulhdu r31, r23, r23
+ ld r8, 0(rp) C eight result limbs are read, doubled and rewritten
+ ld r9, 8(rp)
+ adde r8, r8, r8 C doubling chain starts from the incoming carry
+ adde r9, r9, r9
+ ld r10, 16(rp)
+ ld r11, 24(rp)
+ adde r10, r10, r10
+ adde r11, r11, r11
+ ld r6, 32(rp)
+ ld r7, 40(rp)
+ adde r6, r6, r6
+ adde r7, r7, r7
+ ld r12, 48(rp)
+ ld r23, 56(rp)
+ adde r12, r12, r12
+ adde r23, r23, r23
+ addze r31, r31 C carry out of the doubling goes into the top square limb
+ addc r8, r8, climb C add chain: top limb left by the previous step first
+ std r8, 0(rp)
+ adde r9, r9, r24
+ std r9, 8(rp)
+ adde r10, r10, r25
+ std r10, 16(rp)
+ adde r11, r11, r26
+ std r11, 24(rp)
+ adde r6, r6, r27
+ std r6, 32(rp)
+ adde r7, r7, r28
+ std r7, 40(rp)
+ adde r12, r12, r29
+ std r12, 48(rp)
+ adde r23, r23, r30
+ std r23, 56(rp)
+ mr climb, r31 C top limb is deferred to the next iteration or L(end)
+ addi rp, rp, 64 C addi leaves the carry bit alone for the next adde
+ bdnz L(top)
+
+L(end): addze climb, climb C fold the pending carry into the deferred top limb
+ std climb, 0(rp) C last result limb written by this routine
+
+ ld r31, -8(r1) C restore the registers saved at L(gt2)
+ ld r30, -16(r1)
+ ld r29, -24(r1)
+ ld r28, -32(r1)
+ ld r27, -40(r1)
+ ld r26, -48(r1)
+ ld r25, -56(r1)
+ ld r24, -64(r1)
+ ld r23, -72(r1)
+ ld r22, -80(r1)
+ ld r21, -88(r1)
+ blr
+EPILOGUE()
diff --git a/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm b/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm
deleted file mode 100644
index 663f04c14..000000000
--- a/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm
+++ /dev/null
@@ -1,238 +0,0 @@
-dnl PowerPC-64 mpn_sqr_diag_addlsh1
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of the GNU Lesser General Public License as published
-dnl by the Free Software Foundation; either version 3 of the License, or (at
-dnl your option) any later version.
-
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-dnl License for more details.
-
-dnl You should have received a copy of the GNU Lesser General Public License
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C POWER3/PPC630 10
-C POWER4/PPC970 6
-C POWER5 5.375
-C POWER6 8.5
-
-C NOTES
-C * This was written for POWER6 and its preferences for adjacent integer
-C multiply insns. The cost is that we get a large set of live registers,
-C and therefore need to save 9 callee-saves registers. Except for the
-C multiply insns, the code was not carefully optimised for POWER6 or any
-C other CPU.
-C * Perform some cross-jumping in the feed-in code, into the loop's tail.
-
-C refmpn_sqr_diag_addlsh1 (mp_ptr rp, mp_srcptr tp, mp_srcptr up, mp_size_t n)
-
-C INPUT PARAMETERS
-define(`rp', `r3')
-define(`tp', `r4')
-define(`up', `r5')
-define(`n', `r6')
-
-define(`climb', `r0')
-
-ASM_START()
-PROLOGUE(mpn_sqr_diag_addlsh1)
- std r31, -8(r1)
- std r30, -16(r1)
- std r29, -24(r1)
- std r28, -32(r1)
- std r27, -40(r1)
- std r26, -48(r1)
- std r25, -56(r1)
- std r24, -64(r1)
- std r23, -72(r1)
-
- rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
- cmpdi cr6, r0, 2
- addi n, n, 2 C compute count...
- srdi n, n, 2 C ...for ctr
- mtctr n C put loop count into ctr
- beq cr0, L(b0)
- blt cr6, L(b1)
- beq cr6, L(b2)
-
-L(b3): ld r6, 0(up)
- ld r7, 8(up)
- ld r12, 16(up)
- addi up, up, 24
- mulld r24, r6, r6
- mulhdu r25, r6, r6
- mulld r26, r7, r7
- mulhdu r27, r7, r7
- mulld r28, r12, r12
- mulhdu r29, r12, r12
- ld r10, 0(tp)
- ld r11, 8(tp)
- ld r6, 16(tp)
- ld r7, 24(tp)
- addi tp, tp, 32
- addc r10, r10, r10
- adde r11, r11, r11
- adde r6, r6, r6
- adde r7, r7, r7
- addze climb, r29
- addc r10, r10, r25
- adde r11, r11, r26
- adde r6, r6, r27
- adde r7, r7, r28
- std r24, 0(rp)
- std r10, 8(rp)
- std r11, 16(rp)
- std r6, 24(rp)
- std r7, 32(rp)
- addi rp, rp, 40
- bdnz L(top)
- b L(end)
-
-L(b2): ld r6, 0(up)
- ld r7, 8(up)
- addi up, up, 16
- mulld r24, r6, r6
- mulhdu r25, r6, r6
- mulld r26, r7, r7
- mulhdu r27, r7, r7
- ld r10, 0(tp)
- ld r11, 8(tp)
- addi tp, tp, 16
- addc r10, r10, r10
- adde r11, r11, r11
- addze climb, r27
- addc r10, r10, r25
- adde r11, r11, r26
- std r24, 0(rp)
- std r10, 8(rp)
- std r11, 16(rp)
- addi rp, rp, 24
- bdnz L(top)
- b L(end)
-
-L(b0): ld r6, 0(up)
- ld r7, 8(up)
- ld r12, 16(up)
- ld r23, 24(up)
- addi up, up, 32
- mulld r24, r6, r6
- mulhdu r25, r6, r6
- mulld r26, r7, r7
- mulhdu r27, r7, r7
- mulld r28, r12, r12
- mulhdu r29, r12, r12
- mulld r30, r23, r23
- mulhdu r31, r23, r23
- ld r10, 0(tp)
- ld r11, 8(tp)
- ld r6, 16(tp)
- ld r7, 24(tp)
- ld r12, 32(tp)
- ld r23, 40(tp)
- addi tp, tp, 48
- addc r10, r10, r10
- adde r11, r11, r11
- adde r6, r6, r6
- adde r7, r7, r7
- adde r12, r12, r12
- adde r23, r23, r23
- addze climb, r31
- std r24, 0(rp)
- addc r10, r10, r25
- std r10, 8(rp)
- adde r11, r11, r26
- std r11, 16(rp)
- adde r6, r6, r27
- std r6, 24(rp)
- adde r7, r7, r28
- std r7, 32(rp)
- adde r12, r12, r29
- std r12, 40(rp)
- adde r23, r23, r30
- std r23, 48(rp)
- addi rp, rp, 56
- bdnz L(top)
- b L(end)
-
-L(b1): ld r6, 0(up)
- addi up, up, 8
- mulld r24, r6, r6
- mulhdu climb, r6, r6
- std r24, 0(rp)
- addic rp, rp, 8 C clear carry as side-effect
-
- ALIGN(32)
-L(top): ld r6, 0(up)
- ld r7, 8(up)
- ld r12, 16(up)
- ld r23, 24(up)
- addi up, up, 32
- mulld r24, r6, r6
- mulhdu r25, r6, r6
- mulld r26, r7, r7
- mulhdu r27, r7, r7
- mulld r28, r12, r12
- mulhdu r29, r12, r12
- mulld r30, r23, r23
- mulhdu r31, r23, r23
- ld r8, 0(tp)
- ld r9, 8(tp)
- adde r8, r8, r8
- adde r9, r9, r9
- ld r10, 16(tp)
- ld r11, 24(tp)
- adde r10, r10, r10
- adde r11, r11, r11
- ld r6, 32(tp)
- ld r7, 40(tp)
- adde r6, r6, r6
- adde r7, r7, r7
- ld r12, 48(tp)
- ld r23, 56(tp)
- adde r12, r12, r12
- adde r23, r23, r23
- addi tp, tp, 64
- addze r31, r31
- addc r8, r8, climb
- std r8, 0(rp)
- adde r9, r9, r24
- std r9, 8(rp)
- adde r10, r10, r25
- std r10, 16(rp)
- adde r11, r11, r26
- std r11, 24(rp)
- adde r6, r6, r27
- std r6, 32(rp)
- adde r7, r7, r28
- std r7, 40(rp)
- adde r12, r12, r29
- std r12, 48(rp)
- adde r23, r23, r30
- std r23, 56(rp)
- mr climb, r31
- addi rp, rp, 64
- bdnz L(top)
-
-L(end): addze climb, climb
- std climb, 0(rp)
-
-L(ret): ld r31, -8(r1)
- ld r30, -16(r1)
- ld r29, -24(r1)
- ld r28, -32(r1)
- ld r27, -40(r1)
- ld r26, -48(r1)
- ld r25, -56(r1)
- ld r24, -64(r1)
- ld r23, -72(r1)
- blr
-EPILOGUE()
diff --git a/mpn/powerpc64/rshift.asm b/mpn/powerpc64/rshift.asm
index 6545af769..18406c57e 100644
--- a/mpn/powerpc64/rshift.asm
+++ b/mpn/powerpc64/rshift.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 ?
-C POWER5 2.25
-C POWER6 9.75
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 2.25
+C POWER6 9.75
+C POWER7 2.15
C TODO
C * Try to reduce the number of needed live registers
diff --git a/mpn/powerpc64/tabselect.asm b/mpn/powerpc64/tabselect.asm
new file mode 100644
index 000000000..7d189388b
--- /dev/null
+++ b/mpn/powerpc64/tabselect.asm
@@ -0,0 +1,96 @@
+dnl PowerPC-64 mpn_tabselect.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 3.3
+C POWER5 ?
+C POWER6 ?
+C POWER7 2.5
+
+C NOTES
+C * This has not been tuned for any specific processor. Its speed should not
+C be too bad, though.
+C * Using VMX could result in significant speedup for certain CPUs.
+
+C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp', `r3')
+define(`tp', `r4')
+define(`n', `r5')
+define(`nents', `r6')
+define(`which', `r7')
+
+define(`mask', `r8')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_tabselect)
+ addi r0, n, 1 C round n up before halving
+ srdi r0, r0, 1 C inner loop count
+ andi. r9, n, 1 C set cr0 for use in inner loop
+ subf which, nents, which C which -= nents, so which + nents is 0 at the selected entry
+ sldi n, n, 3 C n in bytes, used at L(end) to rewind rp
+
+L(outer):
+ mtctr r0 C put inner loop count in ctr
+
+ add r9, which, nents C are we at the selected table entry?
+ addic r9, r9, -1 C set CF iff not selected entry
+ subfe mask, r0, r0 C mask = 0 if CF is set, all ones otherwise
+
+ beq cr0, L(top) C branch to loop entry if n even
+
+ ld r9, 0(tp) C n odd: handle one limb before the 2-limb loop
+ addi tp, tp, 8
+ and r9, r9, mask C table limb survives only for the selected entry
+ ld r11, 0(rp)
+ andc r11, r11, mask C current rp limb survives otherwise
+ or r9, r9, r11
+ std r9, 0(rp)
+ addi rp, rp, 8
+ bdz L(end) C no limb pairs left
+
+ ALIGN(16)
+L(top): ld r9, 0(tp) C two limbs per iteration
+ ld r10, 8(tp)
+ addi tp, tp, 16
+ nop
+ and r9, r9, mask C table limbs survive only for the selected entry
+ and r10, r10, mask
+ ld r11, 0(rp)
+ ld r12, 8(rp)
+ andc r11, r11, mask C current rp limbs survive otherwise
+ andc r12, r12, mask
+ or r9, r9, r11
+ or r10, r10, r12
+ std r9, 0(rp)
+ std r10, 8(rp)
+ addi rp, rp, 16
+ bdnz L(top)
+
+L(end): subf rp, n, rp C move rp back to beginning
+ cmpdi cr6, nents, 1 C was this the last table entry?
+ addi nents, nents, -1
+ bne cr6, L(outer) C cr6 is used so that cr0 keeps the parity of n
+
+ blr
+EPILOGUE()
diff --git a/mpn/s390_32/esame/gmp-mparam.h b/mpn/s390_32/esame/gmp-mparam.h
index 5dedeeb81..a6508be1a 100644
--- a/mpn/s390_32/esame/gmp-mparam.h
+++ b/mpn/s390_32/esame/gmp-mparam.h
@@ -24,43 +24,45 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* 1200 MHz IBM z990 running in 32-bit mode */
#define DIVREM_1_NORM_THRESHOLD 0 /* always */
-#define DIVREM_1_UNNORM_THRESHOLD 3
-#define MOD_1_1P_METHOD 1
+#define DIVREM_1_UNNORM_THRESHOLD 4
+#define MOD_1_1P_METHOD 2
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 3
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 12
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 7
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 15
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 21
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 17
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 8
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 34
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 42
#define USE_PREINV_DIVREM_1 1
#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 50
+#define BMOD_1_TO_MOD_1_THRESHOLD 30
#define MUL_TOOM22_THRESHOLD 16
-#define MUL_TOOM33_THRESHOLD 66
-#define MUL_TOOM44_THRESHOLD 169
-#define MUL_TOOM6H_THRESHOLD 369
-#define MUL_TOOM8H_THRESHOLD 517
+#define MUL_TOOM33_THRESHOLD 57
+#define MUL_TOOM44_THRESHOLD 147
+#define MUL_TOOM6H_THRESHOLD 226
+#define MUL_TOOM8H_THRESHOLD 333
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 106
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 114
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 187
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 65
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 100
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 102
#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 28
-#define SQR_TOOM3_THRESHOLD 93
-#define SQR_TOOM4_THRESHOLD 387
-#define SQR_TOOM6_THRESHOLD 552
-#define SQR_TOOM8_THRESHOLD 0 /* always */
+#define SQR_TOOM2_THRESHOLD 26
+#define SQR_TOOM3_THRESHOLD 81
+#define SQR_TOOM4_THRESHOLD 154
+#define SQR_TOOM6_THRESHOLD 318
+#define SQR_TOOM8_THRESHOLD 478
#define MULMID_TOOM42_THRESHOLD 38
#define MULMOD_BNM1_THRESHOLD 13
#define SQRMOD_BNM1_THRESHOLD 15
+#define POWM_SEC_TABLE 4,23,262,892,2500
+
#define MUL_FFT_MODF_THRESHOLD 336 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 336, 5}, { 19, 6}, { 11, 5}, { 23, 6}, \
@@ -91,37 +93,37 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_TABLE3_SIZE 35
#define SQR_FFT_THRESHOLD 2368
-#define MULLO_BASECASE_THRESHOLD 6
-#define MULLO_DC_THRESHOLD 45
+#define MULLO_BASECASE_THRESHOLD 5
+#define MULLO_DC_THRESHOLD 49
#define MULLO_MUL_N_THRESHOLD 5397
-#define DC_DIV_QR_THRESHOLD 40
-#define DC_DIVAPPR_Q_THRESHOLD 152
+#define DC_DIV_QR_THRESHOLD 42
+#define DC_DIVAPPR_Q_THRESHOLD 146
#define DC_BDIV_QR_THRESHOLD 51
-#define DC_BDIV_Q_THRESHOLD 136
+#define DC_BDIV_Q_THRESHOLD 124
#define INV_MULMOD_BNM1_THRESHOLD 46
-#define INV_NEWTON_THRESHOLD 197
-#define INV_APPR_THRESHOLD 157
+#define INV_NEWTON_THRESHOLD 179
+#define INV_APPR_THRESHOLD 153
-#define BINV_NEWTON_THRESHOLD 114
+#define BINV_NEWTON_THRESHOLD 214
#define REDC_1_TO_REDC_N_THRESHOLD 55
-#define MU_DIV_QR_THRESHOLD 1210
-#define MU_DIVAPPR_Q_THRESHOLD 1334
-#define MUPI_DIV_QR_THRESHOLD 81
-#define MU_BDIV_QR_THRESHOLD 942
-#define MU_BDIV_Q_THRESHOLD 1258
-
-#define MATRIX22_STRASSEN_THRESHOLD 17
-#define HGCD_THRESHOLD 104
-#define GCD_DC_THRESHOLD 278
+#define MU_DIV_QR_THRESHOLD 1078
+#define MU_DIVAPPR_Q_THRESHOLD 1078
+#define MUPI_DIV_QR_THRESHOLD 74
+#define MU_BDIV_QR_THRESHOLD 872
+#define MU_BDIV_Q_THRESHOLD 1078
+
+#define MATRIX22_STRASSEN_THRESHOLD 14
+#define HGCD_THRESHOLD 90
+#define HGCD_APPR_THRESHOLD 111
+#define HGCD_REDUCE_THRESHOLD 1962
+#define GCD_DC_THRESHOLD 225
#define GCDEXT_DC_THRESHOLD 217
#define JACOBI_BASE_METHOD 2
-#define GET_STR_DC_THRESHOLD 16
-#define GET_STR_PRECOMPUTE_THRESHOLD 30
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 27
#define SET_STR_DC_THRESHOLD 274
#define SET_STR_PRECOMPUTE_THRESHOLD 824
-
-/* Tuneup completed successfully, took 108 seconds */
diff --git a/mpn/s390_32/lshift.asm b/mpn/s390_32/lshift.asm
index 335a5f77a..17e52655f 100644
--- a/mpn/s390_32/lshift.asm
+++ b/mpn/s390_32/lshift.asm
@@ -126,7 +126,7 @@ L(top): l %r10, 16(up)
L(end): l %r10, 16(up)
sll %r10, 0(cnt)
st %r10, 12(rp)
-
+
lr %r2, %r12
lm %r6, %r12, 24(%r15)
br %r14
diff --git a/mpn/s390_32/lshiftc.asm b/mpn/s390_32/lshiftc.asm
index b42bc715b..9bdd0d798 100644
--- a/mpn/s390_32/lshiftc.asm
+++ b/mpn/s390_32/lshiftc.asm
@@ -138,7 +138,7 @@ L(end): l %r10, 16(up)
sll %r10, 0(cnt)
xr %r10, %r13
st %r10, 12(rp)
-
+
lr %r2, %r12
lm %r6, %r13, 24(%r15)
br %r14
diff --git a/mpn/s390_32/rshift.asm b/mpn/s390_32/rshift.asm
index ec32fa764..becbe1893 100644
--- a/mpn/s390_32/rshift.asm
+++ b/mpn/s390_32/rshift.asm
@@ -120,7 +120,7 @@ L(top): l %r11, 0(up)
L(end): l %r11, 0(up)
srl %r11, 0(cnt)
st %r11, 0(rp)
-
+
lr %r2, %r12
lm %r6, %r12, 24(%r15)
br %r14
diff --git a/mpn/s390_64/README b/mpn/s390_64/README
new file mode 100644
index 000000000..82b68a080
--- /dev/null
+++ b/mpn/s390_64/README
@@ -0,0 +1,77 @@
+Copyright 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+
+There are 5 generations of 64-bit s390 processors, z900, z990, z9,
+z10, and z196. The current GMP code was optimised for the two oldest,
+z900 and z990.
+
+
+mpn_copyi
+
+This code makes use of a loop around MVC. It almost surely runs very
+close to optimally. A small improvement could be done by using one
+MVC for size 256 bytes, now we use two (we use an extra MVC when
+copying any multiple of 256 bytes).
+
+
+mpn_copyd
+
+We have tried several feed-in variants here, branch tree, jump table
+and computed goto. The fastest (on z990) turned out to be computed
+goto.
+
+An approach not tried is EX of LMG and STMG, modifying the register set
+on-the-fly. Using that trick, we could completely avoid using
+separate feed-in paths.
+
+
+mpn_lshift, mpn_rshift
+
+The current code runs at pipeline decode bandwidth on z990.
+
+
+mpn_add_n, mpn_sub_n
+
+The current code is 4-way unrolled. It should be unrolled more, at
+least 8x, in order to reach 2.5 c/l.
+
+
+mpn_mul_1, mpn_addmul_1, mpn_submul_1
+
+The current code is very naive, but due to the non-pipelined nature of
+MLGR on z900 and z990, more sophisticated code would not gain much.
+
+On z10 one would need to cluster at least 4 MLGR together, in order to
+reduce stalling.
+
+On z196, one surely wants to use unrolling and pipelining, to perhaps
+reach around 12 c/l. A major issue here and on z10 is ALCGR's 3 cycle
+stalling.
+
+
+mpn_mul_2, mpn_addmul_2
+
+At least for older machines (z900, z990) with very slow MLGR, we
+should use Karatsuba's algorithm on 2-limb units, making mul_2 and
+addmul_2 the main multiplication primitives.  The newer machines might
+benefit less from this approach, perhaps in particular z10, where MLGR
+clustering is more important.
+
+With Karatsuba, one could hope for around 16 cycles per accumulated
+128 cross product, on z990.
diff --git a/mpn/s390_64/gmp-mparam.h b/mpn/s390_64/gmp-mparam.h
index c4960254e..c0ade71c2 100644
--- a/mpn/s390_64/gmp-mparam.h
+++ b/mpn/s390_64/gmp-mparam.h
@@ -28,19 +28,19 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_1P_METHOD 2
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 9
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 8
#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 58
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 38
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 19
#define USE_PREINV_DIVREM_1 1
#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 47
+#define BMOD_1_TO_MOD_1_THRESHOLD 88
#define MUL_TOOM22_THRESHOLD 10
#define MUL_TOOM33_THRESHOLD 41
-#define MUL_TOOM44_THRESHOLD 99
+#define MUL_TOOM44_THRESHOLD 104
#define MUL_TOOM6H_THRESHOLD 149
#define MUL_TOOM8H_THRESHOLD 212
@@ -61,6 +61,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MULMOD_BNM1_THRESHOLD 9
#define SQRMOD_BNM1_THRESHOLD 11
+#define POWM_SEC_TABLE 4,23,128,598
+
#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 220, 5}, { 7, 4}, { 15, 5}, { 8, 4}, \
@@ -131,7 +133,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MULLO_BASECASE_THRESHOLD 3
#define MULLO_DC_THRESHOLD 33
-#define MULLO_MUL_N_THRESHOLD 4392
+#define MULLO_MUL_N_THRESHOLD 5240
#define DC_DIV_QR_THRESHOLD 28
#define DC_DIVAPPR_Q_THRESHOLD 106
@@ -152,12 +154,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_Q_THRESHOLD 680
#define MATRIX22_STRASSEN_THRESHOLD 11
-#define HGCD_THRESHOLD 71
-#define GCD_DC_THRESHOLD 177
-#define GCDEXT_DC_THRESHOLD 142
-#define JACOBI_BASE_METHOD 2
+#define HGCD_THRESHOLD 75
+#define HGCD_APPR_THRESHOLD 59
+#define HGCD_REDUCE_THRESHOLD 901
+#define GCD_DC_THRESHOLD 186
+#define GCDEXT_DC_THRESHOLD 150
+#define JACOBI_BASE_METHOD 3
#define GET_STR_DC_THRESHOLD 27
#define GET_STR_PRECOMPUTE_THRESHOLD 40
-#define SET_STR_DC_THRESHOLD 363
+#define SET_STR_DC_THRESHOLD 418
#define SET_STR_PRECOMPUTE_THRESHOLD 1111
diff --git a/mpn/sparc64/ultrasparc34/gmp-mparam.h b/mpn/sparc64/ultrasparc34/gmp-mparam.h
index faed8efa3..8fe8ddc54 100644
--- a/mpn/sparc64/ultrasparc34/gmp-mparam.h
+++ b/mpn/sparc64/ultrasparc34/gmp-mparam.h
@@ -28,12 +28,13 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_1P_METHOD 2
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 38
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 9
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 24
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 33
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 22
#define USE_PREINV_DIVREM_1 1
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always */
#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
@@ -55,8 +56,12 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_TOOM6_THRESHOLD 191
#define SQR_TOOM8_THRESHOLD 339
-#define MULMOD_BNM1_THRESHOLD 14
-#define SQRMOD_BNM1_THRESHOLD 13
+#define MULMID_TOOM42_THRESHOLD 42
+
+#define MULMOD_BNM1_THRESHOLD 16
+#define SQRMOD_BNM1_THRESHOLD 9
+
+#define POWM_SEC_TABLE 4,23,130,780,1812,1926
#define MUL_FFT_MODF_THRESHOLD 212 /* k = 5 */
#define MUL_FFT_TABLE3 \
@@ -157,7 +162,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_TABLE3_SIZE 182
#define SQR_FFT_THRESHOLD 1984
-#define MULLO_BASECASE_THRESHOLD 8
+#define MULLO_BASECASE_THRESHOLD 14
#define MULLO_DC_THRESHOLD 0 /* never mpn_mullo_basecase */
#define MULLO_MUL_N_THRESHOLD 3791
@@ -170,7 +175,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define INV_NEWTON_THRESHOLD 17
#define INV_APPR_THRESHOLD 17
-#define BINV_NEWTON_THRESHOLD 134
+#define BINV_NEWTON_THRESHOLD 92
#define REDC_1_TO_REDC_2_THRESHOLD 2
#define REDC_2_TO_REDC_N_THRESHOLD 117
@@ -181,12 +186,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_Q_THRESHOLD 748
#define MATRIX22_STRASSEN_THRESHOLD 12
-#define HGCD_THRESHOLD 46
-#define GCD_DC_THRESHOLD 130
+#define HGCD_THRESHOLD 45
+#define HGCD_APPR_THRESHOLD 50
+#define HGCD_REDUCE_THRESHOLD 1094
+#define GCD_DC_THRESHOLD 126
#define GCDEXT_DC_THRESHOLD 134
#define JACOBI_BASE_METHOD 2
#define GET_STR_DC_THRESHOLD 18
#define GET_STR_PRECOMPUTE_THRESHOLD 27
-#define SET_STR_DC_THRESHOLD 315
+#define SET_STR_DC_THRESHOLD 286
#define SET_STR_PRECOMPUTE_THRESHOLD 1037
diff --git a/mpn/sparc64/ultrasparct1/gmp-mparam.h b/mpn/sparc64/ultrasparct1/gmp-mparam.h
index 744f7e17c..34c8027f5 100644
--- a/mpn/sparc64/ultrasparct1/gmp-mparam.h
+++ b/mpn/sparc64/ultrasparct1/gmp-mparam.h
@@ -25,14 +25,16 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIVREM_1_NORM_THRESHOLD 0 /* always */
#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1P_METHOD 2
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 10
#define MOD_1U_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX
#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 35
#define USE_PREINV_DIVREM_1 1
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always */
#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
@@ -50,13 +52,17 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_BASECASE_THRESHOLD 0 /* always */
#define SQR_TOOM2_THRESHOLD 16
#define SQR_TOOM3_THRESHOLD 57
-#define SQR_TOOM4_THRESHOLD 133
-#define SQR_TOOM6_THRESHOLD 156
+#define SQR_TOOM4_THRESHOLD 135
+#define SQR_TOOM6_THRESHOLD 160
#define SQR_TOOM8_THRESHOLD 260
+#define MULMID_TOOM42_THRESHOLD 12
+
#define MULMOD_BNM1_THRESHOLD 7
#define SQRMOD_BNM1_THRESHOLD 7
+#define POWM_SEC_TABLE 2,23,176,625,2783
+
#define MUL_FFT_MODF_THRESHOLD 176 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 176, 5}, { 7, 6}, { 4, 5}, { 9, 6}, \
@@ -102,30 +108,32 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MULLO_MUL_N_THRESHOLD 3176
#define DC_DIV_QR_THRESHOLD 27
-#define DC_DIVAPPR_Q_THRESHOLD 107
+#define DC_DIVAPPR_Q_THRESHOLD 108
#define DC_BDIV_QR_THRESHOLD 27
#define DC_BDIV_Q_THRESHOLD 62
-#define INV_MULMOD_BNM1_THRESHOLD 22
+#define INV_MULMOD_BNM1_THRESHOLD 14
#define INV_NEWTON_THRESHOLD 163
#define INV_APPR_THRESHOLD 117
#define BINV_NEWTON_THRESHOLD 166
#define REDC_1_TO_REDC_N_THRESHOLD 32
-#define MU_DIV_QR_THRESHOLD 720
-#define MU_DIVAPPR_Q_THRESHOLD 734
-#define MUPI_DIV_QR_THRESHOLD 67
+#define MU_DIV_QR_THRESHOLD 734
+#define MU_DIVAPPR_Q_THRESHOLD 748
+#define MUPI_DIV_QR_THRESHOLD 68
#define MU_BDIV_QR_THRESHOLD 562
#define MU_BDIV_Q_THRESHOLD 734
-#define MATRIX22_STRASSEN_THRESHOLD 11
-#define HGCD_THRESHOLD 53
+#define MATRIX22_STRASSEN_THRESHOLD 9
+#define HGCD_THRESHOLD 66
+#define HGCD_APPR_THRESHOLD 47
+#define HGCD_REDUCE_THRESHOLD 834
#define GCD_DC_THRESHOLD 183
-#define GCDEXT_DC_THRESHOLD 144
+#define GCDEXT_DC_THRESHOLD 142
#define JACOBI_BASE_METHOD 3
#define GET_STR_DC_THRESHOLD 20
-#define GET_STR_PRECOMPUTE_THRESHOLD 39
+#define GET_STR_PRECOMPUTE_THRESHOLD 36
#define SET_STR_DC_THRESHOLD 458
-#define SET_STR_PRECOMPUTE_THRESHOLD 964
+#define SET_STR_PRECOMPUTE_THRESHOLD 963
diff --git a/mpn/x86/atom/gmp-mparam.h b/mpn/x86/atom/gmp-mparam.h
index 8c2595230..391a0ac4a 100644
--- a/mpn/x86/atom/gmp-mparam.h
+++ b/mpn/x86/atom/gmp-mparam.h
@@ -24,26 +24,27 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* Generated by tuneup.c */
#define MOD_1_NORM_THRESHOLD 3
-#define MOD_1_UNNORM_THRESHOLD 6
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 9
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1_UNNORM_THRESHOLD 5
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 33
#define MUL_TOOM22_THRESHOLD 20
#define MUL_TOOM33_THRESHOLD 78
-#define MUL_TOOM44_THRESHOLD 184
+#define MUL_TOOM44_THRESHOLD 168
#define MUL_TOOM6H_THRESHOLD 270
#define MUL_TOOM8H_THRESHOLD 406
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 79
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 126
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 121
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 127
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 96
#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
#define SQR_TOOM2_THRESHOLD 34
@@ -52,8 +53,12 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_TOOM6_THRESHOLD 303
#define SQR_TOOM8_THRESHOLD 547
-#define MULMOD_BNM1_THRESHOLD 14
-#define SQRMOD_BNM1_THRESHOLD 18
+#define MULMID_TOOM42_THRESHOLD 54
+
+#define MULMOD_BNM1_THRESHOLD 16
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define POWM_SEC_TABLE 2,35,262,1168
#define MUL_FFT_MODF_THRESHOLD 376 /* k = 5 */
#define MUL_FFT_TABLE3 \
@@ -108,9 +113,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_TABLE3_SIZE 82
#define SQR_FFT_THRESHOLD 2752
-#define MULLO_BASECASE_THRESHOLD 4
+#define MULLO_BASECASE_THRESHOLD 5
#define MULLO_DC_THRESHOLD 51
-#define MULLO_MUL_N_THRESHOLD 8907
+#define MULLO_MUL_N_THRESHOLD 6633
#define DC_DIV_QR_THRESHOLD 63
#define DC_DIVAPPR_Q_THRESHOLD 252
@@ -131,12 +136,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_Q_THRESHOLD 1334
#define MATRIX22_STRASSEN_THRESHOLD 15
-#define HGCD_THRESHOLD 126
-#define GCD_DC_THRESHOLD 483
-#define GCDEXT_DC_THRESHOLD 351
+#define HGCD_THRESHOLD 129
+#define HGCD_APPR_THRESHOLD 163
+#define HGCD_REDUCE_THRESHOLD 2121
+#define GCD_DC_THRESHOLD 469
+#define GCDEXT_DC_THRESHOLD 348
#define JACOBI_BASE_METHOD 3
#define GET_STR_DC_THRESHOLD 13
#define GET_STR_PRECOMPUTE_THRESHOLD 24
-#define SET_STR_DC_THRESHOLD 272
-#define SET_STR_PRECOMPUTE_THRESHOLD 1116
+#define SET_STR_DC_THRESHOLD 262
+#define SET_STR_PRECOMPUTE_THRESHOLD 902
diff --git a/mpn/x86/atom/lshift.asm b/mpn/x86/atom/lshift.asm
index d8cb8b505..1005cce59 100644
--- a/mpn/x86/atom/lshift.asm
+++ b/mpn/x86/atom/lshift.asm
@@ -160,7 +160,7 @@ deflit(`FRAME',4)
shr $2, %eax C (size + 3) / 4
and $3, %edx C (size - 1) % 4
jz L(goloop) C jmp if size == 1 (mod 4)
- shr %edx
+ shr %edx
jnc L(odd) C jum if size == 3 (mod 4)
add %ecx, %ecx
@@ -173,7 +173,7 @@ deflit(`FRAME',4)
jnz L(goloop) C jump if size == 0 (mod 4)
L(odd): lea -8(up), up
lea -8(rp), rp
- jmp L(sentry) C reached if size == 2 or 3 (mod 4)
+ jmp L(sentry) C reached if size == 2 or 3 (mod 4)
L(sloop):
adc %ecx, %ecx
diff --git a/mpn/x86/atom/sse2/mul_1.asm b/mpn/x86/atom/sse2/mul_1.asm
index dd9b95366..5cd86caec 100644
--- a/mpn/x86/atom/sse2/mul_1.asm
+++ b/mpn/x86/atom/sse2/mul_1.asm
@@ -62,7 +62,7 @@ EPILOGUE()
PROLOGUE(mpn_mul_1)
pxor %mm6, %mm6
L(ent): push %esi FRAME_pushl()
- mov PARAM_SRC, up
+ mov PARAM_SRC, up
mov PARAM_SIZE, %eax C size
movd PARAM_MUL, %mm7
movd (up), %mm0
diff --git a/mpn/x86/bdiv_dbm1c.asm b/mpn/x86/bdiv_dbm1c.asm
index 201ef173d..ac9faf270 100644
--- a/mpn/x86/bdiv_dbm1c.asm
+++ b/mpn/x86/bdiv_dbm1c.asm
@@ -24,10 +24,10 @@ C P5
C P6 model 0-8,10-12)
C P6 model 9 (Banias)
C P6 model 13 (Dothan) 5.1
-C P4 model 0 (Willamette)
+C P4 model 0 (Willamette)
C P4 model 1 (?)
C P4 model 2 (Northwood) 13.67
-C P4 model 3 (Prescott)
+C P4 model 3 (Prescott)
C P4 model 4 (Nocona)
C Intel Atom
C AMD K6
diff --git a/mpn/x86/bdiv_q_1.asm b/mpn/x86/bdiv_q_1.asm
index 2528d01f7..7f344ab57 100644
--- a/mpn/x86/bdiv_q_1.asm
+++ b/mpn/x86/bdiv_q_1.asm
@@ -30,7 +30,7 @@ C K6 14.0
C K7 12.0
C P4 42.0
-MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
defframe(PARAM_SHIFT, 24)
defframe(PARAM_INVERSE,20)
diff --git a/mpn/x86/bobcat/gmp-mparam.h b/mpn/x86/bobcat/gmp-mparam.h
new file mode 100644
index 000000000..e14ba39f5
--- /dev/null
+++ b/mpn/x86/bobcat/gmp-mparam.h
@@ -0,0 +1,142 @@
+/* x86/bobcat gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 9
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 23
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 42
+
+#define MUL_TOOM22_THRESHOLD 28
+#define MUL_TOOM33_THRESHOLD 90
+#define MUL_TOOM44_THRESHOLD 147
+#define MUL_TOOM6H_THRESHOLD 274
+#define MUL_TOOM8H_THRESHOLD 454
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 93
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 38
+#define SQR_TOOM3_THRESHOLD 89
+#define SQR_TOOM4_THRESHOLD 220
+#define SQR_TOOM6_THRESHOLD 303
+#define SQR_TOOM8_THRESHOLD 454
+
+#define MULMID_TOOM42_THRESHOLD 76
+
+#define MULMOD_BNM1_THRESHOLD 19
+#define SQRMOD_BNM1_THRESHOLD 23
+
+#define POWM_SEC_TABLE 4,14,290,357,2178
+
+#define MUL_FFT_MODF_THRESHOLD 888 /* k = 6 */
+#define MUL_FFT_TABLE3 \
+ { { 888, 6}, { 25, 7}, { 13, 6}, { 27, 7}, \
+ { 15, 6}, { 33, 7}, { 17, 6}, { 35, 7}, \
+ { 19, 6}, { 39, 7}, { 23, 6}, { 47, 7}, \
+ { 27, 8}, { 15, 7}, { 31, 6}, { 63, 7}, \
+ { 35, 8}, { 19, 7}, { 41, 8}, { 23, 7}, \
+ { 49, 8}, { 31, 7}, { 63, 8}, { 39, 7}, \
+ { 79, 8}, { 43, 9}, { 23, 8}, { 51, 9}, \
+ { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
+ { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \
+ { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 159,10}, { 95, 9}, { 191,11}, \
+ { 63,10}, { 127, 9}, { 255,10}, { 159,11}, \
+ { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271, 9}, { 543,11}, \
+ { 159,10}, { 319, 9}, { 671,11}, { 191,10}, \
+ { 383, 9}, { 767,12}, { 4096,13}, { 8192,14}, \
+ { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 70
+#define MUL_FFT_THRESHOLD 7552
+
+#define SQR_FFT_MODF_THRESHOLD 723 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 723, 5}, { 25, 6}, { 13, 5}, { 28, 6}, \
+ { 15, 5}, { 31, 6}, { 27, 7}, { 15, 6}, \
+ { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 23, 6}, { 47, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 6}, { 63, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \
+ { 47, 7}, { 95, 8}, { 51, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 95,10}, { 31, 9}, { 63, 8}, { 127, 9}, \
+ { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \
+ { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \
+ { 95, 9}, { 191,11}, { 63,10}, { 127, 9}, \
+ { 255,10}, { 159,11}, { 95,10}, { 191,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 543,11}, \
+ { 159, 9}, { 671,11}, { 191,10}, { 383, 9}, \
+ { 799,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
+ { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 69
+#define SQR_FFT_THRESHOLD 5760
+
+#define MULLO_BASECASE_THRESHOLD 5
+#define MULLO_DC_THRESHOLD 45
+#define MULLO_MUL_N_THRESHOLD 13463
+
+#define DC_DIV_QR_THRESHOLD 75
+#define DC_DIVAPPR_Q_THRESHOLD 216
+#define DC_BDIV_QR_THRESHOLD 67
+#define DC_BDIV_Q_THRESHOLD 143
+
+#define INV_MULMOD_BNM1_THRESHOLD 75
+#define INV_NEWTON_THRESHOLD 244
+#define INV_APPR_THRESHOLD 228
+
+#define BINV_NEWTON_THRESHOLD 276
+#define REDC_1_TO_REDC_N_THRESHOLD 71
+
+#define MU_DIV_QR_THRESHOLD 1858
+#define MU_DIVAPPR_Q_THRESHOLD 1822
+#define MUPI_DIV_QR_THRESHOLD 122
+#define MU_BDIV_QR_THRESHOLD 1787
+#define MU_BDIV_Q_THRESHOLD 1787
+
+#define MATRIX22_STRASSEN_THRESHOLD 19
+#define HGCD_THRESHOLD 78
+#define HGCD_APPR_THRESHOLD 55
+#define HGCD_REDUCE_THRESHOLD 4633
+#define GCD_DC_THRESHOLD 474
+#define GCDEXT_DC_THRESHOLD 345
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 14
+#define GET_STR_PRECOMPUTE_THRESHOLD 31
+#define SET_STR_DC_THRESHOLD 270
+#define SET_STR_PRECOMPUTE_THRESHOLD 812
diff --git a/mpn/x86/core2/gmp-mparam.h b/mpn/x86/core2/gmp-mparam.h
new file mode 100644
index 000000000..feb0f281f
--- /dev/null
+++ b/mpn/x86/core2/gmp-mparam.h
@@ -0,0 +1,141 @@
+/* x86/core2 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */
+
+#define MOD_1_NORM_THRESHOLD 4
+#define MOD_1_UNNORM_THRESHOLD 4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 19
+
+#define MUL_TOOM22_THRESHOLD 24
+#define MUL_TOOM33_THRESHOLD 93
+#define MUL_TOOM44_THRESHOLD 228
+#define MUL_TOOM6H_THRESHOLD 294
+#define MUL_TOOM8H_THRESHOLD 458
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 90
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 96
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 34
+#define SQR_TOOM3_THRESHOLD 116
+#define SQR_TOOM4_THRESHOLD 178
+#define SQR_TOOM6_THRESHOLD 262
+#define SQR_TOOM8_THRESHOLD 597
+
+#define MULMID_TOOM42_THRESHOLD 70
+
+#define MULMOD_BNM1_THRESHOLD 20
+#define SQRMOD_BNM1_THRESHOLD 19
+
+#define POWM_SEC_TABLE 6,26,262,991,2212
+
+#define MUL_FFT_MODF_THRESHOLD 690 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 690, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 15, 5}, { 31, 6}, { 25, 7}, { 13, 6}, \
+ { 27, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
+ { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
+ { 47, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \
+ { 51, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
+ { 79, 9}, { 47, 8}, { 95,10}, { 31, 9}, \
+ { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 159,10}, { 95, 9}, { 191,11}, \
+ { 63,10}, { 127, 9}, { 255,10}, { 159,11}, \
+ { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 271, 9}, { 543,10}, { 287,11}, { 159,10}, \
+ { 319, 9}, { 639,11}, { 191,10}, { 383, 9}, \
+ { 799,11}, { 223,12}, { 4096,13}, { 8192,14}, \
+ { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 70
+#define MUL_FFT_THRESHOLD 7552
+
+#define SQR_FFT_MODF_THRESHOLD 630 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 630, 5}, { 25, 6}, { 13, 5}, { 28, 6}, \
+ { 15, 5}, { 31, 6}, { 25, 7}, { 13, 6}, \
+ { 27, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
+ { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
+ { 47, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \
+ { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 51, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 127,10}, { 79, 9}, { 159,10}, { 95,11}, \
+ { 63,10}, { 159,11}, { 95,10}, { 191,12}, \
+ { 63,11}, { 127,10}, { 271, 9}, { 543,11}, \
+ { 159,10}, { 319, 9}, { 671, 8}, { 1343,11}, \
+ { 191,10}, { 383, 9}, { 799,12}, { 4096,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 67
+#define SQR_FFT_THRESHOLD 5760
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 30
+#define MULLO_MUL_N_THRESHOLD 13463
+
+#define DC_DIV_QR_THRESHOLD 15
+#define DC_DIVAPPR_Q_THRESHOLD 49
+#define DC_BDIV_QR_THRESHOLD 76
+#define DC_BDIV_Q_THRESHOLD 190
+
+#define INV_MULMOD_BNM1_THRESHOLD 46
+#define INV_NEWTON_THRESHOLD 35
+#define INV_APPR_THRESHOLD 35
+
+#define BINV_NEWTON_THRESHOLD 324
+#define REDC_1_TO_REDC_N_THRESHOLD 83
+
+#define MU_DIV_QR_THRESHOLD 1442
+#define MU_DIVAPPR_Q_THRESHOLD 1099
+#define MUPI_DIV_QR_THRESHOLD 0 /* always */
+#define MU_BDIV_QR_THRESHOLD 1589
+#define MU_BDIV_Q_THRESHOLD 1718
+
+#define MATRIX22_STRASSEN_THRESHOLD 31
+#define HGCD_THRESHOLD 118
+#define HGCD_APPR_THRESHOLD 149
+#define HGCD_REDUCE_THRESHOLD 3524
+#define GCD_DC_THRESHOLD 351
+#define GCDEXT_DC_THRESHOLD 309
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 26
+#define SET_STR_DC_THRESHOLD 517
+#define SET_STR_PRECOMPUTE_THRESHOLD 1402
diff --git a/mpn/x86/coreinhm/gmp-mparam.h b/mpn/x86/coreinhm/gmp-mparam.h
new file mode 100644
index 000000000..21afeb619
--- /dev/null
+++ b/mpn/x86/coreinhm/gmp-mparam.h
@@ -0,0 +1,141 @@
+/* x86/coreinhm gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2011-11-25, gcc 4.5 */
+
+#define MOD_1_NORM_THRESHOLD 24
+#define MOD_1_UNNORM_THRESHOLD 15
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 8
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 5
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 16
+
+#define MUL_TOOM22_THRESHOLD 28
+#define MUL_TOOM33_THRESHOLD 81
+#define MUL_TOOM44_THRESHOLD 214
+#define MUL_TOOM6H_THRESHOLD 306
+#define MUL_TOOM8H_THRESHOLD 454
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 137
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 148
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 132
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 131
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 42
+#define SQR_TOOM3_THRESHOLD 149
+#define SQR_TOOM4_THRESHOLD 226
+#define SQR_TOOM6_THRESHOLD 333
+#define SQR_TOOM8_THRESHOLD 494
+
+#define MULMID_TOOM42_THRESHOLD 78
+
+#define MULMOD_BNM1_THRESHOLD 17
+#define SQRMOD_BNM1_THRESHOLD 21
+
+#define POWM_SEC_TABLE 2,33,294,1298,2870
+
+#define MUL_FFT_MODF_THRESHOLD 606 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 606, 5}, { 28, 6}, { 15, 5}, { 33, 6}, \
+ { 29, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
+ { 36, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
+ { 47, 7}, { 29, 8}, { 15, 7}, { 37, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \
+ { 51, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
+ { 79, 9}, { 47, 8}, { 95,10}, { 31, 9}, \
+ { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \
+ { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \
+ { 95, 9}, { 191,11}, { 63,10}, { 159,11}, \
+ { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271,11}, { 159,10}, \
+ { 319, 9}, { 639,10}, { 335,11}, { 191,10}, \
+ { 383, 9}, { 767,10}, { 399,12}, { 4096,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 63
+#define MUL_FFT_THRESHOLD 6784
+
+#define SQR_FFT_MODF_THRESHOLD 505 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 505, 5}, { 28, 6}, { 15, 5}, { 33, 6}, \
+ { 17, 5}, { 35, 6}, { 29, 7}, { 15, 6}, \
+ { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \
+ { 39, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 47, 8}, { 27, 7}, { 55, 8}, \
+ { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \
+ { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
+ { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
+ { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \
+ { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
+ { 159,10}, { 95,11}, { 63,10}, { 143, 9}, \
+ { 287,10}, { 159,11}, { 95,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
+ { 543,10}, { 287,11}, { 159,10}, { 319, 9}, \
+ { 639,10}, { 335, 9}, { 671,10}, { 351,11}, \
+ { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \
+ { 799,10}, { 415,12}, { 4096,13}, { 8192,14}, \
+ { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 74
+#define SQR_FFT_THRESHOLD 4800
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 35
+#define MULLO_MUL_N_THRESHOLD 13463
+
+#define DC_DIV_QR_THRESHOLD 21
+#define DC_DIVAPPR_Q_THRESHOLD 42
+#define DC_BDIV_QR_THRESHOLD 84
+#define DC_BDIV_Q_THRESHOLD 156
+
+#define INV_MULMOD_BNM1_THRESHOLD 54
+#define INV_NEWTON_THRESHOLD 17
+#define INV_APPR_THRESHOLD 17
+
+#define BINV_NEWTON_THRESHOLD 348
+#define REDC_1_TO_REDC_N_THRESHOLD 83
+
+#define MU_DIV_QR_THRESHOLD 979
+#define MU_DIVAPPR_Q_THRESHOLD 501
+#define MUPI_DIV_QR_THRESHOLD 0 /* always */
+#define MU_BDIV_QR_THRESHOLD 1589
+#define MU_BDIV_Q_THRESHOLD 1787
+
+#define MATRIX22_STRASSEN_THRESHOLD 20
+#define HGCD_THRESHOLD 57
+#define HGCD_APPR_THRESHOLD 50
+#define HGCD_REDUCE_THRESHOLD 3524
+#define GCD_DC_THRESHOLD 253
+#define GCDEXT_DC_THRESHOLD 233
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 20
+#define SET_STR_DC_THRESHOLD 127
+#define SET_STR_PRECOMPUTE_THRESHOLD 646
diff --git a/mpn/x86/coreisbr/gmp-mparam.h b/mpn/x86/coreisbr/gmp-mparam.h
new file mode 100644
index 000000000..16ef958ad
--- /dev/null
+++ b/mpn/x86/coreisbr/gmp-mparam.h
@@ -0,0 +1,140 @@
+/* x86/coreisbr gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2011-11-24, gcc 4.2 */
+
+#define MOD_1_NORM_THRESHOLD 24
+#define MOD_1_UNNORM_THRESHOLD 25
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 3
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 18
+
+#define MUL_TOOM22_THRESHOLD 28
+#define MUL_TOOM33_THRESHOLD 101
+#define MUL_TOOM44_THRESHOLD 244
+#define MUL_TOOM6H_THRESHOLD 351
+#define MUL_TOOM8H_THRESHOLD 547
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 109
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 183
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 109
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 109
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 48
+#define SQR_TOOM3_THRESHOLD 165
+#define SQR_TOOM4_THRESHOLD 276
+#define SQR_TOOM6_THRESHOLD 366
+#define SQR_TOOM8_THRESHOLD 572
+
+#define MULMID_TOOM42_THRESHOLD 98
+
+#define MULMOD_BNM1_THRESHOLD 20
+#define SQRMOD_BNM1_THRESHOLD 23
+
+#define POWM_SEC_TABLE 2,27,258,1052
+
+#define MUL_FFT_MODF_THRESHOLD 716 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 716, 5}, { 27, 6}, { 15, 5}, { 31, 6}, \
+ { 27, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
+ { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
+ { 47, 7}, { 27, 8}, { 15, 7}, { 31, 6}, \
+ { 63, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 51, 8}, { 31, 7}, { 63, 8}, \
+ { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \
+ { 71, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 95, 9}, { 55,10}, { 31, 9}, { 63, 8}, \
+ { 127, 9}, { 79,10}, { 47, 9}, { 95,11}, \
+ { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
+ { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \
+ { 127, 9}, { 255,10}, { 159,11}, { 95,10}, \
+ { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271,11}, { 159,10}, { 319, 9}, \
+ { 639,11}, { 191,10}, { 383, 9}, { 767,11}, \
+ { 223,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
+ { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 69
+#define MUL_FFT_THRESHOLD 7552
+
+#define SQR_FFT_MODF_THRESHOLD 595 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 595, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \
+ { 29, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
+ { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
+ { 47, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \
+ { 23, 7}, { 49, 8}, { 31, 7}, { 63, 8}, \
+ { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 95, 9}, { 55,10}, { 31, 9}, { 63, 8}, \
+ { 127, 9}, { 79,10}, { 47, 9}, { 95,11}, \
+ { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
+ { 159,10}, { 95,11}, { 63,10}, { 159,11}, \
+ { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271, 9}, { 543,11}, \
+ { 159,10}, { 319, 9}, { 671,11}, { 191,10}, \
+ { 383, 9}, { 767,10}, { 399,12}, { 4096,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 63
+#define SQR_FFT_THRESHOLD 5760
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 100
+#define MULLO_MUL_N_THRESHOLD 14379
+
+#define DC_DIV_QR_THRESHOLD 22
+#define DC_DIVAPPR_Q_THRESHOLD 30
+#define DC_BDIV_QR_THRESHOLD 120
+#define DC_BDIV_Q_THRESHOLD 268
+
+#define INV_MULMOD_BNM1_THRESHOLD 54
+#define INV_NEWTON_THRESHOLD 12
+#define INV_APPR_THRESHOLD 13
+
+#define BINV_NEWTON_THRESHOLD 410
+#define REDC_1_TO_REDC_N_THRESHOLD 100
+
+#define MU_DIV_QR_THRESHOLD 1037
+#define MU_DIVAPPR_Q_THRESHOLD 889
+#define MUPI_DIV_QR_THRESHOLD 0 /* always */
+#define MU_BDIV_QR_THRESHOLD 1858
+#define MU_BDIV_Q_THRESHOLD 2172
+
+#define MATRIX22_STRASSEN_THRESHOLD 21
+#define HGCD_THRESHOLD 59
+#define HGCD_APPR_THRESHOLD 56
+#define HGCD_REDUCE_THRESHOLD 4818
+#define GCD_DC_THRESHOLD 278
+#define GCDEXT_DC_THRESHOLD 298
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 11
+#define GET_STR_PRECOMPUTE_THRESHOLD 23
+#define SET_STR_DC_THRESHOLD 438
+#define SET_STR_PRECOMPUTE_THRESHOLD 1206
diff --git a/mpn/x86/k10/gmp-mparam.h b/mpn/x86/k10/gmp-mparam.h
new file mode 100644
index 000000000..5c036223c
--- /dev/null
+++ b/mpn/x86/k10/gmp-mparam.h
@@ -0,0 +1,142 @@
+/* x86/k10 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 12
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 6
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 16
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 32
+
+#define MUL_TOOM22_THRESHOLD 24
+#define MUL_TOOM33_THRESHOLD 77
+#define MUL_TOOM44_THRESHOLD 127
+#define MUL_TOOM6H_THRESHOLD 270
+#define MUL_TOOM8H_THRESHOLD 357
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 77
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 99
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 85
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 90
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 32
+#define SQR_TOOM3_THRESHOLD 97
+#define SQR_TOOM4_THRESHOLD 154
+#define SQR_TOOM6_THRESHOLD 336
+#define SQR_TOOM8_THRESHOLD 527
+
+#define MULMID_TOOM42_THRESHOLD 54
+
+#define MULMOD_BNM1_THRESHOLD 15
+#define SQRMOD_BNM1_THRESHOLD 19
+
+#define POWM_SEC_TABLE 4,32,164,879,2178
+
+#define MUL_FFT_MODF_THRESHOLD 786 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 786, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 23, 6}, { 47, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 6}, { 63, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 51, 9}, { 31, 8}, \
+ { 63, 9}, { 39, 8}, { 83, 9}, { 47,10}, \
+ { 31, 9}, { 63, 8}, { 127, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \
+ { 191,10}, { 111,11}, { 63,10}, { 127, 9}, \
+ { 255, 7}, { 1023, 8}, { 543, 9}, { 279,10}, \
+ { 159,11}, { 95,10}, { 191,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
+ { 543, 8}, { 1087,10}, { 287,11}, { 159, 9}, \
+ { 671,11}, { 191,10}, { 399, 9}, { 799,12}, \
+ { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 76
+#define MUL_FFT_THRESHOLD 7424
+
+#define SQR_FFT_MODF_THRESHOLD 660 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 660, 5}, { 25, 6}, { 13, 5}, { 28, 6}, \
+ { 25, 7}, { 13, 6}, { 28, 7}, { 15, 6}, \
+ { 31, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 23, 6}, { 47, 7}, { 27, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 23, 7}, { 47, 8}, { 31, 7}, { 63, 8}, \
+ { 35, 7}, { 71, 8}, { 39, 9}, { 23, 8}, \
+ { 55,10}, { 15, 9}, { 31, 8}, { 63, 9}, \
+ { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \
+ { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 167,10}, { 95,11}, { 63,10}, \
+ { 159,11}, { 95, 8}, { 799,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 543,11}, { 159, 9}, \
+ { 639,10}, { 367,11}, { 191,10}, { 383, 9}, \
+ { 799,10}, { 415,11}, { 223,12}, { 4096,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 67
+#define SQR_FFT_THRESHOLD 5760
+
+#define MULLO_BASECASE_THRESHOLD 6
+#define MULLO_DC_THRESHOLD 42
+#define MULLO_MUL_N_THRESHOLD 13463
+
+#define DC_DIV_QR_THRESHOLD 56
+#define DC_DIVAPPR_Q_THRESHOLD 248
+#define DC_BDIV_QR_THRESHOLD 55
+#define DC_BDIV_Q_THRESHOLD 160
+
+#define INV_MULMOD_BNM1_THRESHOLD 54
+#define INV_NEWTON_THRESHOLD 250
+#define INV_APPR_THRESHOLD 250
+
+#define BINV_NEWTON_THRESHOLD 276
+#define REDC_1_TO_REDC_N_THRESHOLD 67
+
+#define MU_DIV_QR_THRESHOLD 1718
+#define MU_DIVAPPR_Q_THRESHOLD 1652
+#define MUPI_DIV_QR_THRESHOLD 114
+#define MU_BDIV_QR_THRESHOLD 1470
+#define MU_BDIV_Q_THRESHOLD 1589
+
+#define MATRIX22_STRASSEN_THRESHOLD 16
+#define HGCD_THRESHOLD 131
+#define HGCD_APPR_THRESHOLD 163
+#define HGCD_REDUCE_THRESHOLD 3810
+#define GCD_DC_THRESHOLD 555
+#define GCDEXT_DC_THRESHOLD 389
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 28
+#define SET_STR_DC_THRESHOLD 140
+#define SET_STR_PRECOMPUTE_THRESHOLD 1334
diff --git a/mpn/x86/k7/addlsh1_n.asm b/mpn/x86/k7/addlsh1_n.asm
index e5163b676..05df4a740 100644
--- a/mpn/x86/k7/addlsh1_n.asm
+++ b/mpn/x86/k7/addlsh1_n.asm
@@ -44,14 +44,14 @@ C AMD K8
C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32
C processors. It uses 2*3-way unrolling, for good reasons. Unfortunately,
C that means we need an initial magic multiply.
-C
+C
C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern. We
C cannot do rsblsh1_n since we feed carry from the shift blocks to the
C add/subtract blocks, which is right for addition but reversed for
C subtraction. We could perhaps do sublsh1_n, with some extra move insns,
C without losing any time, since we're limited not by issue rate but by carry
C recurrency latency.
-C
+C
C Breaking carry recurrency might be a good idea. We would then need separate
C registers for the shift carry and add/subtract carry, which in turn would
C force us to 2*2-way unrolling.
@@ -120,7 +120,7 @@ ifdef(`CPU_P6',`
L(exact):
incl VAR_COUNT
jz L(end)
-
+
ALIGN(16)
L(top):
ifdef(`CPU_P6',`
diff --git a/mpn/x86/k7/gmp-mparam.h b/mpn/x86/k7/gmp-mparam.h
index 84238c4e0..9cc6798af 100644
--- a/mpn/x86/k7/gmp-mparam.h
+++ b/mpn/x86/k7/gmp-mparam.h
@@ -30,6 +30,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 26
@@ -40,19 +41,23 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MUL_TOOM8H_THRESHOLD 454
#define MUL_TOOM32_TO_TOOM43_THRESHOLD 85
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 93
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 95
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97
#define MUL_TOOM42_TO_TOOM63_THRESHOLD 101
#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
#define SQR_TOOM2_THRESHOLD 50
-#define SQR_TOOM3_THRESHOLD 87
+#define SQR_TOOM3_THRESHOLD 81
#define SQR_TOOM4_THRESHOLD 148
-#define SQR_TOOM6_THRESHOLD 306
+#define SQR_TOOM6_THRESHOLD 274
#define SQR_TOOM8_THRESHOLD 430
+#define MULMID_TOOM42_THRESHOLD 88
+
#define MULMOD_BNM1_THRESHOLD 18
-#define SQRMOD_BNM1_THRESHOLD 19
+#define SQRMOD_BNM1_THRESHOLD 18
+
+#define POWM_SEC_TABLE 2,17,225,961,1604
#define MUL_FFT_MODF_THRESHOLD 888 /* k = 6 */
#define MUL_FFT_TABLE3 \
@@ -155,28 +160,30 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MULLO_DC_THRESHOLD 42
#define MULLO_MUL_N_THRESHOLD 13463
-#define DC_DIV_QR_THRESHOLD 89
-#define DC_DIVAPPR_Q_THRESHOLD 315
+#define DC_DIV_QR_THRESHOLD 60
+#define DC_DIVAPPR_Q_THRESHOLD 336
#define DC_BDIV_QR_THRESHOLD 91
-#define DC_BDIV_Q_THRESHOLD 274
+#define DC_BDIV_Q_THRESHOLD 268
#define INV_MULMOD_BNM1_THRESHOLD 66
-#define INV_NEWTON_THRESHOLD 300
-#define INV_APPR_THRESHOLD 303
+#define INV_NEWTON_THRESHOLD 284
+#define INV_APPR_THRESHOLD 284
-#define BINV_NEWTON_THRESHOLD 303
-#define REDC_1_TO_REDC_N_THRESHOLD 95
+#define BINV_NEWTON_THRESHOLD 270
+#define REDC_1_TO_REDC_N_THRESHOLD 87
-#define MU_DIV_QR_THRESHOLD 1858
-#define MU_DIVAPPR_Q_THRESHOLD 1718
-#define MUPI_DIV_QR_THRESHOLD 132
-#define MU_BDIV_QR_THRESHOLD 1387
+#define MU_DIV_QR_THRESHOLD 1752
+#define MU_DIVAPPR_Q_THRESHOLD 1652
+#define MUPI_DIV_QR_THRESHOLD 97
+#define MU_BDIV_QR_THRESHOLD 1470
#define MU_BDIV_Q_THRESHOLD 1470
#define MATRIX22_STRASSEN_THRESHOLD 15
-#define HGCD_THRESHOLD 154
-#define GCD_DC_THRESHOLD 599
-#define GCDEXT_DC_THRESHOLD 443
+#define HGCD_THRESHOLD 173
+#define HGCD_APPR_THRESHOLD 226
+#define HGCD_REDUCE_THRESHOLD 4633
+#define GCD_DC_THRESHOLD 580
+#define GCDEXT_DC_THRESHOLD 414
#define JACOBI_BASE_METHOD 4
#define GET_STR_DC_THRESHOLD 17
diff --git a/mpn/x86/k7/invert_limb.asm b/mpn/x86/k7/invert_limb.asm
index da6f28397..435fa96d0 100644
--- a/mpn/x86/k7/invert_limb.asm
+++ b/mpn/x86/k7/invert_limb.asm
@@ -60,7 +60,7 @@ ifdef(`DARWIN',`
PROLOGUE(mpn_invert_limb)
deflit(`FRAME', 0)
mov PARAM_DIVISOR, %eax
- C Avoid push/pop on k7.
+ C Avoid push/pop on k7.
sub $8, %esp FRAME_subl_esp(8)
mov %ebx, (%esp)
mov %edi, 4(%esp)
diff --git a/mpn/x86/k7/sublsh1_n.asm b/mpn/x86/k7/sublsh1_n.asm
index 41993f99a..965348586 100644
--- a/mpn/x86/k7/sublsh1_n.asm
+++ b/mpn/x86/k7/sublsh1_n.asm
@@ -30,7 +30,7 @@ C cycles/limb
C P5
C P6 model 0-8,10-12
C P6 model 9 (Banias)
-C P6 model 13 (Dothan)
+C P6 model 13 (Dothan)
C P4 model 0 (Willamette)
C P4 model 1 (?)
C P4 model 2 (Northwood)
@@ -38,12 +38,12 @@ C P4 model 3 (Prescott)
C P4 model 4 (Nocona)
C Intel Atom 6.75
C AMD K6
-C AMD K7
+C AMD K7
C AMD K8
C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32
C processors. It uses 2*4-way unrolling, for good reasons.
-C
+C
C Breaking carry recurrency might be a good idea. We would then need separate
C registers for the shift carry and add/subtract carry, which in turn would
C force us to 2*2-way unrolling.
@@ -114,7 +114,7 @@ ifdef(`CPU_P6',`
adc %ebp, %ebp
rcr %edx C restore 1st saved carry bit
-
+
sbb %eax, (rp)
sbb %ebx, 4(rp)
sbb %ecx, 8(rp)
diff --git a/mpn/x86/k8/gmp-mparam.h b/mpn/x86/k8/gmp-mparam.h
new file mode 100644
index 000000000..727a381f1
--- /dev/null
+++ b/mpn/x86/k8/gmp-mparam.h
@@ -0,0 +1,144 @@
+/* x86/k8 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 3
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 42
+
+#define MUL_TOOM22_THRESHOLD 26
+#define MUL_TOOM33_THRESHOLD 81
+#define MUL_TOOM44_THRESHOLD 136
+#define MUL_TOOM6H_THRESHOLD 286
+#define MUL_TOOM8H_THRESHOLD 430
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 93
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 96
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 46
+#define SQR_TOOM3_THRESHOLD 77
+#define SQR_TOOM4_THRESHOLD 202
+#define SQR_TOOM6_THRESHOLD 294
+#define SQR_TOOM8_THRESHOLD 430
+
+#define MULMID_TOOM42_THRESHOLD 74
+
+#define MULMOD_BNM1_THRESHOLD 17
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define POWM_SEC_TABLE 2,14,216,991,2658
+
+#define MUL_FFT_MODF_THRESHOLD 888 /* k = 6 */
+#define MUL_FFT_TABLE3 \
+ { { 888, 6}, { 15, 5}, { 31, 6}, { 25, 7}, \
+ { 13, 6}, { 27, 7}, { 15, 6}, { 33, 7}, \
+ { 17, 6}, { 35, 7}, { 19, 6}, { 39, 7}, \
+ { 23, 6}, { 47, 7}, { 27, 8}, { 15, 7}, \
+ { 31, 6}, { 63, 7}, { 35, 8}, { 19, 7}, \
+ { 41, 8}, { 23, 7}, { 47, 8}, { 31, 7}, \
+ { 63, 8}, { 39, 7}, { 79, 9}, { 23, 8}, \
+ { 51, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
+ { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
+ { 31, 9}, { 63, 8}, { 127, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 167,10}, { 95, 9}, \
+ { 191,10}, { 111,11}, { 63,10}, { 127, 9}, \
+ { 255,10}, { 159,11}, { 95,10}, { 191,12}, \
+ { 63,11}, { 127,10}, { 271, 9}, { 543,10}, \
+ { 287,11}, { 159,10}, { 335,11}, { 191,10}, \
+ { 383, 9}, { 767,10}, { 399, 9}, { 799,11}, \
+ { 223,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
+ { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 73
+#define MUL_FFT_THRESHOLD 7552
+
+#define SQR_FFT_MODF_THRESHOLD 758 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 758, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 23, 6}, { 47, 7}, { 27, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 47, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 51, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 95, 9}, { 55,10}, { 31, 9}, { 63, 8}, \
+ { 127, 9}, { 79,10}, { 47, 9}, { 95,11}, \
+ { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
+ { 159,10}, { 95, 9}, { 191,10}, { 111,11}, \
+ { 63,10}, { 127, 9}, { 255,10}, { 159,11}, \
+ { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \
+ { 1087,10}, { 287,11}, { 159,10}, { 319, 9}, \
+ { 671,11}, { 191,10}, { 383, 9}, { 767,10}, \
+ { 399, 9}, { 799,12}, { 4096,13}, { 8192,14}, \
+ { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 74
+#define SQR_FFT_THRESHOLD 7296
+
+#define MULLO_BASECASE_THRESHOLD 8
+#define MULLO_DC_THRESHOLD 35
+#define MULLO_MUL_N_THRESHOLD 13463
+
+#define DC_DIV_QR_THRESHOLD 91
+#define DC_DIVAPPR_Q_THRESHOLD 278
+#define DC_BDIV_QR_THRESHOLD 87
+#define DC_BDIV_Q_THRESHOLD 216
+
+#define INV_MULMOD_BNM1_THRESHOLD 62
+#define INV_NEWTON_THRESHOLD 262
+#define INV_APPR_THRESHOLD 262
+
+#define BINV_NEWTON_THRESHOLD 278
+#define REDC_1_TO_REDC_N_THRESHOLD 79
+
+#define MU_DIV_QR_THRESHOLD 1787
+#define MU_DIVAPPR_Q_THRESHOLD 1718
+#define MUPI_DIV_QR_THRESHOLD 106
+#define MU_BDIV_QR_THRESHOLD 1470
+#define MU_BDIV_Q_THRESHOLD 1589
+
+#define MATRIX22_STRASSEN_THRESHOLD 19
+#define HGCD_THRESHOLD 139
+#define HGCD_APPR_THRESHOLD 176
+#define HGCD_REDUCE_THRESHOLD 4633
+#define GCD_DC_THRESHOLD 610
+#define GCDEXT_DC_THRESHOLD 419
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 14
+#define GET_STR_PRECOMPUTE_THRESHOLD 29
+#define SET_STR_DC_THRESHOLD 450
+#define SET_STR_PRECOMPUTE_THRESHOLD 1366
diff --git a/mpn/x86/nano/gmp-mparam.h b/mpn/x86/nano/gmp-mparam.h
new file mode 100644
index 000000000..5fa509372
--- /dev/null
+++ b/mpn/x86/nano/gmp-mparam.h
@@ -0,0 +1,152 @@
+/* x86/nano gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */
+
+#define MOD_1_1P_METHOD 1
+#define MOD_1_NORM_THRESHOLD 3
+#define MOD_1_UNNORM_THRESHOLD 3
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 9
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 53
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12
+#define USE_PREINV_DIVREM_1 1
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 32
+
+#define MUL_TOOM22_THRESHOLD 16
+#define MUL_TOOM33_THRESHOLD 132
+#define MUL_TOOM44_THRESHOLD 195
+#define MUL_TOOM6H_THRESHOLD 270
+#define MUL_TOOM8H_THRESHOLD 478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 129
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 130
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 135
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 28
+#define SQR_TOOM3_THRESHOLD 194
+#define SQR_TOOM4_THRESHOLD 502
+#define SQR_TOOM6_THRESHOLD 746
+#define SQR_TOOM8_THRESHOLD 1005
+
+#define MULMID_TOOM42_THRESHOLD 40
+
+#define MULMOD_BNM1_THRESHOLD 14
+#define SQRMOD_BNM1_THRESHOLD 19
+
+#define POWM_SEC_TABLE 4,23,258,828,2246
+
+#define MUL_FFT_MODF_THRESHOLD 308 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 308, 5}, { 13, 6}, { 7, 5}, { 17, 6}, \
+ { 9, 5}, { 19, 6}, { 11, 5}, { 23, 6}, \
+ { 13, 7}, { 7, 6}, { 17, 7}, { 9, 6}, \
+ { 19, 7}, { 11, 6}, { 24, 7}, { 15, 6}, \
+ { 31, 7}, { 19, 8}, { 11, 7}, { 25, 8}, \
+ { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \
+ { 23, 7}, { 47, 9}, { 15, 8}, { 31, 7}, \
+ { 63, 8}, { 39, 9}, { 23, 8}, { 47,10}, \
+ { 15, 9}, { 31, 8}, { 63, 9}, { 47,10}, \
+ { 31, 9}, { 71,10}, { 47, 9}, { 95,11}, \
+ { 31,10}, { 63, 9}, { 127, 8}, { 255,10}, \
+ { 79, 9}, { 159,10}, { 95, 9}, { 191,11}, \
+ { 63,10}, { 127, 9}, { 255, 8}, { 543, 9}, \
+ { 287, 8}, { 575, 7}, { 1215,10}, { 159,11}, \
+ { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 543, 8}, { 1087,10}, { 287, 9}, \
+ { 607, 8}, { 1215,11}, { 159,10}, { 319, 9}, \
+ { 639,10}, { 351, 9}, { 703, 8}, { 1407, 9}, \
+ { 735, 8}, { 1471,11}, { 191,10}, { 383, 9}, \
+ { 767,10}, { 415, 9}, { 831,11}, { 223,10}, \
+ { 447, 9}, { 895,10}, { 479, 9}, { 959, 8}, \
+ { 1919,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
+ { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 89
+#define MUL_FFT_THRESHOLD 1856
+
+#define SQR_FFT_MODF_THRESHOLD 396 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 396, 5}, { 13, 6}, { 7, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
+ { 25, 7}, { 15, 6}, { 31, 7}, { 19, 6}, \
+ { 39, 7}, { 21, 8}, { 11, 7}, { 23, 6}, \
+ { 47, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \
+ { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \
+ { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \
+ { 47,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 127,10}, \
+ { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \
+ { 127, 9}, { 255, 8}, { 543,10}, { 143, 9}, \
+ { 287, 8}, { 607, 7}, { 1215, 6}, { 2431,10}, \
+ { 159, 8}, { 639,11}, { 95,10}, { 191,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 543, 8}, \
+ { 1087,10}, { 287, 9}, { 607, 8}, { 1215,11}, \
+ { 159,10}, { 319, 9}, { 671,10}, { 351, 9}, \
+ { 703, 8}, { 1407, 9}, { 735, 8}, { 1471, 7}, \
+ { 2943,11}, { 191,10}, { 383, 9}, { 799,10}, \
+ { 415, 9}, { 895,10}, { 479,12}, { 4096,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 87
+#define SQR_FFT_THRESHOLD 2368
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 51
+#define MULLO_MUL_N_THRESHOLD 3369
+
+#define DC_DIV_QR_THRESHOLD 56
+#define DC_DIVAPPR_Q_THRESHOLD 183
+#define DC_BDIV_QR_THRESHOLD 55
+#define DC_BDIV_Q_THRESHOLD 118
+
+#define INV_MULMOD_BNM1_THRESHOLD 30
+#define INV_NEWTON_THRESHOLD 266
+#define INV_APPR_THRESHOLD 218
+
+#define BINV_NEWTON_THRESHOLD 268
+#define REDC_1_TO_REDC_N_THRESHOLD 56
+
+#define MU_DIV_QR_THRESHOLD 1308
+#define MU_DIVAPPR_Q_THRESHOLD 1528
+#define MUPI_DIV_QR_THRESHOLD 124
+#define MU_BDIV_QR_THRESHOLD 855
+#define MU_BDIV_Q_THRESHOLD 1334
+
+#define MATRIX22_STRASSEN_THRESHOLD 14
+#define HGCD_THRESHOLD 104
+#define HGCD_APPR_THRESHOLD 139
+#define HGCD_REDUCE_THRESHOLD 2121
+#define GCD_DC_THRESHOLD 456
+#define GCDEXT_DC_THRESHOLD 321
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 11
+#define GET_STR_PRECOMPUTE_THRESHOLD 25
+#define SET_STR_DC_THRESHOLD 542
+#define SET_STR_PRECOMPUTE_THRESHOLD 840
diff --git a/mpn/x86/p6/bdiv_q_1.asm b/mpn/x86/p6/bdiv_q_1.asm
index 3a8733a0d..0ffbc78e4 100644
--- a/mpn/x86/p6/bdiv_q_1.asm
+++ b/mpn/x86/p6/bdiv_q_1.asm
@@ -25,7 +25,7 @@ include(`../config.m4')
C odd even divisor
C P6: 10.0 12.0 cycles/limb
-C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
C The odd case is basically the same as mpn_modexact_1_odd, just with an
C extra store, and it runs at the same 10 cycles which is the dependent
@@ -269,7 +269,7 @@ ifdef(`PIC',`
imull %edx, %eax C inv*inv*d
subl %eax, %ebp C inv = 2*inv - inv*inv*d
-
+
jmp L(common)
EPILOGUE()
diff --git a/mpn/x86/p6/sse2/gmp-mparam.h b/mpn/x86/p6/sse2/gmp-mparam.h
index 2735b9c63..b163c58cc 100644
--- a/mpn/x86/p6/sse2/gmp-mparam.h
+++ b/mpn/x86/p6/sse2/gmp-mparam.h
@@ -31,37 +31,42 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* 1867 MHz P6 model 13 */
#define MOD_1_NORM_THRESHOLD 4
-#define MOD_1_UNNORM_THRESHOLD 3
+#define MOD_1_UNNORM_THRESHOLD 4
#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 22
+#define BMOD_1_TO_MOD_1_THRESHOLD 21
#define MUL_TOOM22_THRESHOLD 20
-#define MUL_TOOM33_THRESHOLD 77
-#define MUL_TOOM44_THRESHOLD 182
+#define MUL_TOOM33_THRESHOLD 74
+#define MUL_TOOM44_THRESHOLD 181
#define MUL_TOOM6H_THRESHOLD 252
-#define MUL_TOOM8H_THRESHOLD 381
+#define MUL_TOOM8H_THRESHOLD 363
#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 79
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 115
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 80
#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
#define SQR_TOOM2_THRESHOLD 30
#define SQR_TOOM3_THRESHOLD 101
#define SQR_TOOM4_THRESHOLD 154
#define SQR_TOOM6_THRESHOLD 222
-#define SQR_TOOM8_THRESHOLD 547
+#define SQR_TOOM8_THRESHOLD 527
+
+#define MULMID_TOOM42_THRESHOLD 58
#define MULMOD_BNM1_THRESHOLD 13
#define SQRMOD_BNM1_THRESHOLD 17
+#define POWM_SEC_TABLE 4,23,258,768,2388
+
#define MUL_FFT_MODF_THRESHOLD 565 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 565, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
@@ -143,34 +148,36 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_THRESHOLD 5760
#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 34
+#define MULLO_DC_THRESHOLD 33
#define MULLO_MUL_N_THRESHOLD 13463
-#define DC_DIV_QR_THRESHOLD 22
+#define DC_DIV_QR_THRESHOLD 20
#define DC_DIVAPPR_Q_THRESHOLD 56
#define DC_BDIV_QR_THRESHOLD 60
-#define DC_BDIV_Q_THRESHOLD 132
+#define DC_BDIV_Q_THRESHOLD 134
#define INV_MULMOD_BNM1_THRESHOLD 38
-#define INV_NEWTON_THRESHOLD 71
+#define INV_NEWTON_THRESHOLD 66
#define INV_APPR_THRESHOLD 63
-#define BINV_NEWTON_THRESHOLD 252
-#define REDC_1_TO_REDC_N_THRESHOLD 62
+#define BINV_NEWTON_THRESHOLD 250
+#define REDC_1_TO_REDC_N_THRESHOLD 63
-#define MU_DIV_QR_THRESHOLD 1142
-#define MU_DIVAPPR_Q_THRESHOLD 889
-#define MUPI_DIV_QR_THRESHOLD 39
-#define MU_BDIV_QR_THRESHOLD 1308
-#define MU_BDIV_Q_THRESHOLD 1442
+#define MU_DIV_QR_THRESHOLD 1164
+#define MU_DIVAPPR_Q_THRESHOLD 979
+#define MUPI_DIV_QR_THRESHOLD 38
+#define MU_BDIV_QR_THRESHOLD 1442
+#define MU_BDIV_Q_THRESHOLD 1470
#define MATRIX22_STRASSEN_THRESHOLD 17
-#define HGCD_THRESHOLD 61
-#define GCD_DC_THRESHOLD 379
-#define GCDEXT_DC_THRESHOLD 298
-#define JACOBI_BASE_METHOD 4
+#define HGCD_THRESHOLD 64
+#define HGCD_APPR_THRESHOLD 105
+#define HGCD_REDUCE_THRESHOLD 3524
+#define GCD_DC_THRESHOLD 386
+#define GCDEXT_DC_THRESHOLD 309
+#define JACOBI_BASE_METHOD 1
#define GET_STR_DC_THRESHOLD 13
-#define GET_STR_PRECOMPUTE_THRESHOLD 20
-#define SET_STR_DC_THRESHOLD 582
-#define SET_STR_PRECOMPUTE_THRESHOLD 1055
+#define GET_STR_PRECOMPUTE_THRESHOLD 26
+#define SET_STR_DC_THRESHOLD 587
+#define SET_STR_PRECOMPUTE_THRESHOLD 1104
diff --git a/mpn/x86/pentium/bdiv_q_1.asm b/mpn/x86/pentium/bdiv_q_1.asm
index 965173d1c..7e84fc817 100644
--- a/mpn/x86/pentium/bdiv_q_1.asm
+++ b/mpn/x86/pentium/bdiv_q_1.asm
@@ -27,7 +27,7 @@ C odd even
C P54: 24.5 30.5 cycles/limb
C P55: 23.0 28.0
-MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
C expected. On P54 in the even case the shrdl pairing nonsense (see
diff --git a/mpn/x86/pentium4/sse2/gmp-mparam.h b/mpn/x86/pentium4/sse2/gmp-mparam.h
index b1e56b5e2..8a198ad96 100644
--- a/mpn/x86/pentium4/sse2/gmp-mparam.h
+++ b/mpn/x86/pentium4/sse2/gmp-mparam.h
@@ -22,37 +22,42 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define BYTES_PER_MP_LIMB 4
-#define MOD_1_NORM_THRESHOLD 9
-#define MOD_1_UNNORM_THRESHOLD 20
+#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
#define MOD_1N_TO_MOD_1_1_THRESHOLD 6
#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 6
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 20
#define MUL_TOOM22_THRESHOLD 31
-#define MUL_TOOM33_THRESHOLD 120
-#define MUL_TOOM44_THRESHOLD 286
+#define MUL_TOOM33_THRESHOLD 114
+#define MUL_TOOM44_THRESHOLD 300
#define MUL_TOOM6H_THRESHOLD 426
-#define MUL_TOOM8H_THRESHOLD 592
+#define MUL_TOOM8H_THRESHOLD 620
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 195
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 216
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 193
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 187
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 184
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 207
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 181
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 209
#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 48
-#define SQR_TOOM3_THRESHOLD 174
-#define SQR_TOOM4_THRESHOLD 390
-#define SQR_TOOM6_THRESHOLD 0
-#define SQR_TOOM8_THRESHOLD 507
+#define SQR_TOOM2_THRESHOLD 49
+#define SQR_TOOM3_THRESHOLD 173
+#define SQR_TOOM4_THRESHOLD 264
+#define SQR_TOOM6_THRESHOLD 354
+#define SQR_TOOM8_THRESHOLD 810
-#define MULMOD_BNM1_THRESHOLD 17
-#define SQRMOD_BNM1_THRESHOLD 21
+#define MULMID_TOOM42_THRESHOLD 68
+
+#define MULMOD_BNM1_THRESHOLD 19
+#define SQRMOD_BNM1_THRESHOLD 23
+
+#define POWM_SEC_TABLE 2,33,246,1052,2178
#define MUL_FFT_MODF_THRESHOLD 904 /* k = 6 */
#define MUL_FFT_TABLE3 \
@@ -102,35 +107,37 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_TABLE3_SIZE 72
#define SQR_FFT_THRESHOLD 6784
-#define MULLO_BASECASE_THRESHOLD 12
-#define MULLO_DC_THRESHOLD 49
-#define MULLO_MUL_N_THRESHOLD 13866
+#define MULLO_BASECASE_THRESHOLD 13
+#define MULLO_DC_THRESHOLD 52
+#define MULLO_MUL_N_THRESHOLD 13463
-#define DC_DIV_QR_THRESHOLD 37
-#define DC_DIVAPPR_Q_THRESHOLD 81
-#define DC_BDIV_QR_THRESHOLD 51
-#define DC_BDIV_Q_THRESHOLD 80
+#define DC_DIV_QR_THRESHOLD 39
+#define DC_DIVAPPR_Q_THRESHOLD 77
+#define DC_BDIV_QR_THRESHOLD 54
+#define DC_BDIV_Q_THRESHOLD 94
#define INV_MULMOD_BNM1_THRESHOLD 60
-#define INV_NEWTON_THRESHOLD 244
-#define INV_APPR_THRESHOLD 98
+#define INV_NEWTON_THRESHOLD 182
+#define INV_APPR_THRESHOLD 93
-#define BINV_NEWTON_THRESHOLD 276
-#define REDC_1_TO_REDC_N_THRESHOLD 63
+#define BINV_NEWTON_THRESHOLD 296
+#define REDC_1_TO_REDC_N_THRESHOLD 66
#define MU_DIV_QR_THRESHOLD 2350
-#define MU_DIVAPPR_Q_THRESHOLD 2172
-#define MUPI_DIV_QR_THRESHOLD 48
-#define MU_BDIV_QR_THRESHOLD 1858
-#define MU_BDIV_Q_THRESHOLD 2172
-
-#define MATRIX22_STRASSEN_THRESHOLD 29
-#define HGCD_THRESHOLD 81
-#define GCD_DC_THRESHOLD 309
+#define MU_DIVAPPR_Q_THRESHOLD 2130
+#define MUPI_DIV_QR_THRESHOLD 71
+#define MU_BDIV_QR_THRESHOLD 2130
+#define MU_BDIV_Q_THRESHOLD 2130
+
+#define MATRIX22_STRASSEN_THRESHOLD 24
+#define HGCD_THRESHOLD 77
+#define HGCD_APPR_THRESHOLD 91
+#define HGCD_REDUCE_THRESHOLD 5010
+#define GCD_DC_THRESHOLD 327
#define GCDEXT_DC_THRESHOLD 253
#define JACOBI_BASE_METHOD 4
-#define GET_STR_DC_THRESHOLD 10
-#define GET_STR_PRECOMPUTE_THRESHOLD 25
-#define SET_STR_DC_THRESHOLD 118
-#define SET_STR_PRECOMPUTE_THRESHOLD 1099
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 26
+#define SET_STR_DC_THRESHOLD 144
+#define SET_STR_PRECOMPUTE_THRESHOLD 979
diff --git a/mpn/x86/tabselect.asm b/mpn/x86/tabselect.asm
new file mode 100644
index 000000000..7c8c2601f
--- /dev/null
+++ b/mpn/x86/tabselect.asm
@@ -0,0 +1,104 @@
+dnl x86 mpn_tabselect.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P5 ?
+C P6 model 0-8,10-12 ?
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) ?
+C P4 model 0 (Willamette) ?
+C P4 model 1 (?) ?
+C P4 model 2 (Northwood) 4.5
+C P4 model 3 (Prescott) ?
+C P4 model 4 (Nocona) ?
+C Intel Atom ?
+C AMD K6 ?
+C AMD K7 3.4
+C AMD K8 ?
+C AMD K10 ?
+
+C NOTES
+C * This has not been tuned for any specific processor. Its speed should not
+C be too bad, though.
+C * Using SSE2 could result in many-fold speedup.
+
+C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp', `%edi')
+define(`tp', `%esi')
+define(`n', `%ebx')
+define(`nents', `%ecx')
+define(`which', `36(%esp)')
+
+define(`i', `%ebp')
+define(`maskp', `20(%esp)')
+define(`maskn', `32(%esp)')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_tabselect)
+ push %edi
+ push %esi
+ push %ebx
+ push %ebp
+ mov 20(%esp), rp
+ mov 24(%esp), tp
+ mov 28(%esp), n
+ mov 32(%esp), nents
+
+ lea (rp,n,4), rp
+ lea (tp,n,4), tp
+ sub nents, which
+L(outer):
+ mov which, %eax
+ add nents, %eax
+ neg %eax C set CF iff 'which' != k
+ sbb %eax, %eax
+ mov %eax, maskn
+ not %eax
+ mov %eax, maskp
+
+ mov n, i
+ neg i
+
+ ALIGN(16)
+L(top): mov (tp,i,4), %eax
+ and maskp, %eax
+ mov (rp,i,4), %edx
+ and maskn, %edx
+ or %edx, %eax
+ mov %eax, (rp,i,4)
+ inc i
+ js L(top)
+
+L(end): mov n, %eax
+ lea (tp,%eax,4), tp
+ dec nents
+ jne L(outer)
+
+L(outer_end):
+ pop %ebp
+ pop %ebx
+ pop %esi
+ pop %edi
+ ret
+EPILOGUE()
diff --git a/mpn/x86_64/addmul_2.asm b/mpn/x86_64/addmul_2.asm
index 107c3dafe..5c6647888 100644
--- a/mpn/x86_64/addmul_2.asm
+++ b/mpn/x86_64/addmul_2.asm
@@ -50,10 +50,14 @@ define(`w2', `%rbp')
define(`w3', `%r10')
define(`n', `%r11')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
+ASM_START()
TEXT
ALIGN(16)
-ASM_START()
PROLOGUE(mpn_addmul_2)
+ DOS64_ENTRY(4)
mov n_param, n
push %rbx
push %rbp
@@ -164,6 +168,7 @@ L(end): xor R32(w1), R32(w1)
pop %rbp
pop %rbx
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/aorrlsh1_n.asm b/mpn/x86_64/aorrlsh1_n.asm
index 2ea556b73..dda7d590e 100644
--- a/mpn/x86_64/aorrlsh1_n.asm
+++ b/mpn/x86_64/aorrlsh1_n.asm
@@ -1,7 +1,8 @@
dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
-dnl Copyright 2003, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005, 2006, 2007, 2008, 2009, 2011 Free Software
+dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -54,10 +55,14 @@ ifdef(`OPERATION_rsblsh1_n', `
MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(func)
+ DOS64_ENTRY(4)
push %rbp
mov (vp), %r8
@@ -147,5 +152,6 @@ ifdef(`OPERATION_rsblsh1_n',`
movslq R32(%rbp), %rax')
pop %rbp
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/aorrlsh2_n.asm b/mpn/x86_64/aorrlsh2_n.asm
index 6d55cfd10..8c427a674 100644
--- a/mpn/x86_64/aorrlsh2_n.asm
+++ b/mpn/x86_64/aorrlsh2_n.asm
@@ -3,7 +3,7 @@ dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
dnl Contributed to the GNU project by Torbjorn Granlund.
-dnl Copyright 2009, 2010 Free Software Foundation, Inc.
+dnl Copyright 2009, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -36,4 +36,7 @@ ifdef(`OPERATION_rsblsh2_n',`
MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n)
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
include_mpn(`x86_64/aorrlshC_n.asm')
diff --git a/mpn/x86_64/aorrlshC_n.asm b/mpn/x86_64/aorrlshC_n.asm
index cab0b07f4..ae9a9d952 100644
--- a/mpn/x86_64/aorrlshC_n.asm
+++ b/mpn/x86_64/aorrlshC_n.asm
@@ -1,7 +1,7 @@
dnl AMD64 mpn_addlshC_n -- rp[] = up[] + (vp[] << C)
dnl AMD64 mpn_rsblshC_n -- rp[] = (vp[] << C) - up[]
-dnl Copyright 2009, 2010 Free Software Foundation, Inc.
+dnl Copyright 2009, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -37,10 +37,14 @@ define(`n', `%rcx')
define(M, eval(m4_lshift(1,LSH)))
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(func)
+ DOS64_ENTRY(4)
push %r12
push %r13
push %r14
@@ -140,5 +144,6 @@ ifelse(ADDSUB,add,`
pop %r14
pop %r13
pop %r12
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/aorrlsh_n.asm b/mpn/x86_64/aorrlsh_n.asm
index d19dea535..8ab3688d2 100644
--- a/mpn/x86_64/aorrlsh_n.asm
+++ b/mpn/x86_64/aorrlsh_n.asm
@@ -56,10 +56,23 @@ ifdef(`OPERATION_rsblsh_n',`
MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(func)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
push %r12
push %r13
push %r14
@@ -155,5 +168,6 @@ L(end): add R32(%rbx), R32(%rbx)
pop %r14
pop %r13
pop %r12
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/aors_n.asm b/mpn/x86_64/aors_n.asm
index 916e9b664..eadde641b 100644
--- a/mpn/x86_64/aors_n.asm
+++ b/mpn/x86_64/aors_n.asm
@@ -1,7 +1,7 @@
dnl AMD64 mpn_add_n, mpn_sub_n
-dnl Copyright 2003, 2004, 2005, 2007, 2008, 2010 Free Software Foundation,
-dnl Inc.
+dnl Copyright 2003, 2004, 2005, 2007, 2008, 2010, 2011 Free Software
+dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -30,15 +30,15 @@ C Intel SBR 1.59
C Intel atom 4
C VIA nano 3.25
-C The inner loop of this code is the result of running a code generation and
+C The loop of this code is the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.
C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp', `%rdx')
-define(`n', `%rcx')
-define(`cy', `%r8') C (only for mpn_add_nc)
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`vp', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`cy', `%r8') C rsp+40 (only for mpn_add_nc)
ifdef(`OPERATION_add_n', `
define(ADCSBB, adc)
@@ -51,10 +51,23 @@ ifdef(`OPERATION_sub_n', `
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(func_nc)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
mov R32(n), R32(%rax)
shr $2, n
and $3, R32(%rax)
@@ -69,6 +82,7 @@ PROLOGUE(func_nc)
EPILOGUE()
ALIGN(16)
PROLOGUE(func)
+ DOS64_ENTRY(4)
mov R32(n), R32(%rax)
shr $2, n
and $3, R32(%rax)
@@ -85,6 +99,7 @@ L(lt4): dec R32(%rax)
ADCSBB (vp), %r8
mov %r8, (rp)
adc R32(%rax), R32(%rax)
+ DOS64_EXIT()
ret
L(2): dec R32(%rax)
@@ -95,6 +110,7 @@ L(2): dec R32(%rax)
mov %r8, (rp)
mov %r9, 8(rp)
adc R32(%rax), R32(%rax)
+ DOS64_EXIT()
ret
L(3): mov 16(up), %r10
@@ -105,6 +121,7 @@ L(3): mov 16(up), %r10
mov %r9, 8(rp)
mov %r10, 16(rp)
setc R8(%rax)
+ DOS64_EXIT()
ret
ALIGN(16)
@@ -142,5 +159,6 @@ L(end): lea 32(up), up
dec R32(%rax)
jnz L(lt4)
adc R32(%rax), R32(%rax)
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/aorscnd_n.asm b/mpn/x86_64/aorscnd_n.asm
new file mode 100644
index 000000000..d22a2a218
--- /dev/null
+++ b/mpn/x86_64/aorscnd_n.asm
@@ -0,0 +1,178 @@
+dnl AMD64 mpn_addcnd_n, mpn_subcnd_n
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 2.25
+C AMD K10 2
+C Intel P4 13
+C Intel core2 2.9
+C Intel NHM 2.9
+C Intel SBR 2.4
+C Intel atom 6.5
+C VIA nano 3
+
+C NOTES
+C * It might seem natural to use the cmov insn here, but since this function
+C is supposed to have the exact same execution pattern for cnd true and
+C false, and since cmov's documentation is not clear about whether it
+C actually reads both source operands and writes the register for a false
+C condition, we cannot use it.
+C * Two cases could be optimised: (1) addcnd_n could use ADCSBB-from-memory
+C to save one insn/limb, and (2) when up=rp addcnd_n and subcnd_n could use
+C ADCSBB-to-memory, again saving 1 insn/limb.
+C * This runs optimally at decoder bandwidth on K10. It has not been tuned
+C for any other processor.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cnd', `%r8')
+
+ifdef(`OPERATION_addcnd_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func, mpn_addcnd_n)')
+ifdef(`OPERATION_subcnd_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func, mpn_subcnd_n)')
+
+MULFUNC_PROLOGUE(mpn_addcnd_n mpn_subcnd_n)
+
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+
+ neg cnd
+ sbb cnd, cnd C make cnd mask
+
+ lea (vp,n,8), vp
+ lea (up,n,8), up
+ lea (rp,n,8), rp
+
+ mov R32(n), R32(%rax)
+ neg n
+ and $3, R32(%rax)
+ jz L(top) C carry-save reg rax = 0 in this arc
+ cmp $2, R32(%rax)
+ jc L(b1)
+ jz L(b2)
+
+L(b3): mov (vp,n,8), %r12
+ mov 8(vp,n,8), %r13
+ mov 16(vp,n,8), %r14
+ mov (up,n,8), %r10
+ mov 8(up,n,8), %rbx
+ mov 16(up,n,8), %rbp
+ and cnd, %r12
+ and cnd, %r13
+ and cnd, %r14
+ ADDSUB %r12, %r10
+ ADCSBB %r13, %rbx
+ ADCSBB %r14, %rbp
+ sbb R32(%rax), R32(%rax) C save carry
+ mov %r10, (rp,n,8)
+ mov %rbx, 8(rp,n,8)
+ mov %rbp, 16(rp,n,8)
+ add $3, n
+ js L(top)
+ jmp L(end)
+
+L(b2): mov (vp,n,8), %r12
+ mov 8(vp,n,8), %r13
+ mov (up,n,8), %r10
+ mov 8(up,n,8), %rbx
+ and cnd, %r12
+ and cnd, %r13
+ ADDSUB %r12, %r10
+ ADCSBB %r13, %rbx
+ sbb R32(%rax), R32(%rax) C save carry
+ mov %r10, (rp,n,8)
+ mov %rbx, 8(rp,n,8)
+ add $2, n
+ js L(top)
+ jmp L(end)
+
+L(b1): mov (vp,n,8), %r12
+ mov (up,n,8), %r10
+ and cnd, %r12
+ ADDSUB %r12, %r10
+ sbb R32(%rax), R32(%rax) C save carry
+ mov %r10, (rp,n,8)
+ add $1, n
+ jns L(end)
+
+ ALIGN(16)
+L(top): mov (vp,n,8), %r12
+ mov 8(vp,n,8), %r13
+ mov 16(vp,n,8), %r14
+ mov 24(vp,n,8), %r11
+ mov (up,n,8), %r10
+ mov 8(up,n,8), %rbx
+ mov 16(up,n,8), %rbp
+ mov 24(up,n,8), %r9
+ and cnd, %r12
+ and cnd, %r13
+ and cnd, %r14
+ and cnd, %r11
+ add R32(%rax), R32(%rax) C restore carry
+ ADCSBB %r12, %r10
+ ADCSBB %r13, %rbx
+ ADCSBB %r14, %rbp
+ ADCSBB %r11, %r9
+ sbb R32(%rax), R32(%rax) C save carry
+ mov %r10, (rp,n,8)
+ mov %rbx, 8(rp,n,8)
+ mov %rbp, 16(rp,n,8)
+ mov %r9, 24(rp,n,8)
+ add $4, n
+ js L(top)
+
+L(end): neg R32(%rax)
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ DOS64_EXIT()
+ ret
+EPILOGUE()
diff --git a/mpn/x86_64/aorsmul_1.asm b/mpn/x86_64/aorsmul_1.asm
index 9c64d56fc..a406bc9e8 100644
--- a/mpn/x86_64/aorsmul_1.asm
+++ b/mpn/x86_64/aorsmul_1.asm
@@ -1,6 +1,6 @@
dnl AMD64 mpn_addmul_1 and mpn_submul_1.
-dnl Copyright 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
+dnl Copyright 2003, 2004, 2005, 2007, 2008, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -28,20 +28,27 @@ C Intel corei ?
C Intel atom 21.3
C VIA nano 5.5
-C The inner loop of this code is the result of running a code generation and
+C The loop of this code is the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.
-C TODO:
-C * The inner loop is great, but the prologue and epilogue code was
-C quickly written. Tune it!
+C TODO
+C * The loop is great, but the prologue and epilogue code was quickly written.
+C Tune it!
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n_param',`%rdx')
-define(`vl', `%rcx')
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`vl', `%rcx') C r9
-define(`n', `%r11')
+define(`n', `%r11')
+
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
ifdef(`OPERATION_addmul_1',`
define(`ADDSUB', `add')
@@ -52,17 +59,33 @@ ifdef(`OPERATION_submul_1',`
define(`func', `mpn_submul_1')
')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+IFDOS(` define(`up', ``%rsi'') ') dnl
+IFDOS(` define(`rp', ``%rcx'') ') dnl
+IFDOS(` define(`vl', ``%r9'') ') dnl
+IFDOS(` define(`r9', ``rdi'') ') dnl
+IFDOS(` define(`n', ``%r8'') ') dnl
+IFDOS(` define(`r8', ``r11'') ') dnl
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(func)
+
+IFDOS(``push %rsi '')
+IFDOS(``push %rdi '')
+IFDOS(``mov %rdx, %rsi '')
+
mov (up), %rax C read first u limb early
push %rbx
- mov n_param, %rbx C move away n from rdx, mul uses it
+IFELF(` mov n_param, %rbx ') C move away n from rdx, mul uses it
+IFDOS(` mov n, %rbx ')
mul vl
- mov %rbx, n
+IFELF(` mov %rbx, n ')
and $3, R32(%rbx)
jz L(b0)
@@ -145,5 +168,7 @@ L(ret): adc $0, %rdx
mov %rdx, %rax
pop %rbx
+IFDOS(``pop %rdi '')
+IFDOS(``pop %rsi '')
ret
EPILOGUE()
diff --git a/mpn/x86_64/atom/gmp-mparam.h b/mpn/x86_64/atom/gmp-mparam.h
index 37ddcebc2..380f36f25 100644
--- a/mpn/x86_64/atom/gmp-mparam.h
+++ b/mpn/x86_64/atom/gmp-mparam.h
@@ -31,14 +31,15 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
#define MOD_1_1_TO_MOD_1_2_THRESHOLD MP_SIZE_T_MAX
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 17
+#define BMOD_1_TO_MOD_1_THRESHOLD 16
#define MUL_TOOM22_THRESHOLD 10
#define MUL_TOOM33_THRESHOLD 65
@@ -58,9 +59,13 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_TOOM6_THRESHOLD 222
#define SQR_TOOM8_THRESHOLD 333
+#define MULMID_TOOM42_THRESHOLD 14
+
#define MULMOD_BNM1_THRESHOLD 7
#define SQRMOD_BNM1_THRESHOLD 12
+#define POWM_SEC_TABLE 2,31,213,724,2112
+
#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 220, 5}, { 7, 4}, { 15, 5}, { 13, 6}, \
@@ -145,9 +150,11 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_Q_THRESHOLD 748
#define MATRIX22_STRASSEN_THRESHOLD 13
-#define HGCD_THRESHOLD 82
+#define HGCD_THRESHOLD 79
+#define HGCD_APPR_THRESHOLD 83
+#define HGCD_REDUCE_THRESHOLD 1137
#define GCD_DC_THRESHOLD 186
-#define GCDEXT_DC_THRESHOLD 186
+#define GCDEXT_DC_THRESHOLD 189
#define JACOBI_BASE_METHOD 4
#define GET_STR_DC_THRESHOLD 15
diff --git a/mpn/x86_64/bdiv_dbm1c.asm b/mpn/x86_64/bdiv_dbm1c.asm
index f6a77507d..0fef478d9 100644
--- a/mpn/x86_64/bdiv_dbm1c.asm
+++ b/mpn/x86_64/bdiv_dbm1c.asm
@@ -41,10 +41,23 @@ define(`cy', `%r8')
define(`n', `%r9')
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_bdiv_dbm1c)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
mov (up), %rax
mov n_param, n
mov R32(n_param), R32(%r11)
@@ -84,6 +97,7 @@ L(lo1): sub %rax, %r8
add $4, n
jnz L(top)
-L(end): mov %r8, %rax
+ mov %r8, %rax
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/bdiv_q_1.asm b/mpn/x86_64/bdiv_q_1.asm
index 01624a52a..e1e1db5a5 100644
--- a/mpn/x86_64/bdiv_q_1.asm
+++ b/mpn/x86_64/bdiv_q_1.asm
@@ -1,8 +1,8 @@
dnl AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- schoolbook Hensel division by
dnl 1-limb divisor, returning quotient only.
-dnl Copyright 2001, 2002, 2004, 2005, 2006, 2009 Free Software Foundation,
-dnl Inc.
+dnl Copyright 2001, 2002, 2004, 2005, 2006, 2009, 2011 Free Software
+dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -41,10 +41,22 @@ C di r8 just mpn_pi1_bdiv_q_1
C shift r9 just mpn_pi1_bdiv_q_1
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_bdiv_q_1)
+ DOS64_ENTRY(4)
push %rbx
mov %rcx, %rax
@@ -91,6 +103,9 @@ L(evn): bsf %rax, %rcx
EPILOGUE()
PROLOGUE(mpn_pi1_bdiv_q_1)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+IFDOS(` mov 64(%rsp), %r9 ')
push %rbx
mov %rcx, %r11 C d
@@ -144,11 +159,13 @@ L(ent): imul %r8, %rax
imul %r8, %rax
mov %rax, (%rdi)
pop %rbx
+ DOS64_EXIT()
ret
L(one): shr R8(%rcx), %rax
imul %r8, %rax
mov %rax, (%rdi)
pop %rbx
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/bobcat/gmp-mparam.h b/mpn/x86_64/bobcat/gmp-mparam.h
index f1edb1d36..5acb78a62 100644
--- a/mpn/x86_64/bobcat/gmp-mparam.h
+++ b/mpn/x86_64/bobcat/gmp-mparam.h
@@ -58,6 +58,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MULMOD_BNM1_THRESHOLD 11
#define SQRMOD_BNM1_THRESHOLD 15
+#define POWM_SEC_TABLE 2,23,322,840
+
#define MUL_FFT_MODF_THRESHOLD 376 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 376, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
@@ -145,9 +147,11 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_Q_THRESHOLD 1308
#define MATRIX22_STRASSEN_THRESHOLD 14
-#define HGCD_THRESHOLD 103
-#define GCD_DC_THRESHOLD 469
-#define GCDEXT_DC_THRESHOLD 290
+#define HGCD_THRESHOLD 105
+#define HGCD_APPR_THRESHOLD 113
+#define HGCD_REDUCE_THRESHOLD 2479
+#define GCD_DC_THRESHOLD 330
+#define GCDEXT_DC_THRESHOLD 306
#define JACOBI_BASE_METHOD 4
#define GET_STR_DC_THRESHOLD 17
diff --git a/mpn/x86_64/com.asm b/mpn/x86_64/com.asm
index 6ff62eeac..3a232fc20 100644
--- a/mpn/x86_64/com.asm
+++ b/mpn/x86_64/com.asm
@@ -1,6 +1,6 @@
dnl AMD64 mpn_com.
-dnl Copyright 2004, 2005, 2006 Free Software Foundation, Inc.
+dnl Copyright 2004, 2005, 2006, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -34,11 +34,14 @@ define(`rp',`%rdi')
define(`up',`%rsi')
define(`n',`%rdx')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(mpn_com)
+ DOS64_ENTRY(3)
movq (up), %r8
movl R32(%rdx), R32(%rax)
leaq (up,n,8), up
@@ -76,5 +79,6 @@ L(e10): movq 24(up,n,8), %r9
movq %r9, 24(rp,n,8)
addq $4, n
jnc L(oop)
-L(ret): ret
+L(ret): DOS64_EXIT()
+ ret
EPILOGUE()
diff --git a/mpn/x86_64/copyd.asm b/mpn/x86_64/copyd.asm
index 13210217b..15e929f4e 100644
--- a/mpn/x86_64/copyd.asm
+++ b/mpn/x86_64/copyd.asm
@@ -1,6 +1,6 @@
dnl AMD64 mpn_copyd -- copy limb vector, decrementing.
-dnl Copyright 2003, 2005, 2007 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005, 2007, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -39,10 +39,14 @@ define(`rp',`%rdi')
define(`up',`%rsi')
define(`n',`%rdx')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_copyd)
+ DOS64_ENTRY(3)
leaq -8(up,n,8), up
leaq (rp,n,8), rp
subq $4, n
@@ -73,5 +77,6 @@ L(end): shrl R32(%rdx) C edx = lowpart(n)
movq -8(up), %r9
movq %r8, -8(rp)
movq %r9, -16(rp)
-1: ret
+1: DOS64_EXIT()
+ ret
EPILOGUE()
diff --git a/mpn/x86_64/copyi.asm b/mpn/x86_64/copyi.asm
index d5cbdd644..1dd6c3168 100644
--- a/mpn/x86_64/copyi.asm
+++ b/mpn/x86_64/copyi.asm
@@ -1,6 +1,6 @@
dnl AMD64 mpn_copyi -- copy limb vector, incrementing.
-dnl Copyright 2003, 2005, 2007 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005, 2007, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -39,10 +39,14 @@ define(`rp',`%rdi')
define(`up',`%rsi')
define(`n',`%rdx')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_copyi)
+ DOS64_ENTRY(3)
leaq -8(rp), rp
subq $4, n
jc L(end)
@@ -72,5 +76,6 @@ L(end): shrl R32(%rdx) C edx = lowpart(n)
movq 8(up), %r9
movq %r8, 8(rp)
movq %r9, 16(rp)
-1: ret
+1: DOS64_EXIT()
+ ret
EPILOGUE()
diff --git a/mpn/x86_64/core2/aorrlsh1_n.asm b/mpn/x86_64/core2/aorrlsh1_n.asm
index 346c21f33..e44e718a6 100644
--- a/mpn/x86_64/core2/aorrlsh1_n.asm
+++ b/mpn/x86_64/core2/aorrlsh1_n.asm
@@ -3,7 +3,7 @@ dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
dnl Contributed to the GNU project by Torbjorn Granlund.
-dnl Copyright 2008, 2010 Free Software Foundation, Inc.
+dnl Copyright 2008, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -36,4 +36,7 @@ ifdef(`OPERATION_rsblsh1_n', `
MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
include_mpn(`x86_64/aorrlshC_n.asm')
diff --git a/mpn/x86_64/core2/aorrlsh2_n.asm b/mpn/x86_64/core2/aorrlsh2_n.asm
index 1da0c527f..2d9c89553 100644
--- a/mpn/x86_64/core2/aorrlsh2_n.asm
+++ b/mpn/x86_64/core2/aorrlsh2_n.asm
@@ -3,7 +3,7 @@ dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
dnl Contributed to the GNU project by Torbjorn Granlund.
-dnl Copyright 2008, 2010 Free Software Foundation, Inc.
+dnl Copyright 2008, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -36,4 +36,7 @@ ifdef(`OPERATION_rsblsh2_n', `
MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n)
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
include_mpn(`x86_64/aorrlshC_n.asm')
diff --git a/mpn/x86_64/core2/aorrlsh_n.asm b/mpn/x86_64/core2/aorrlsh_n.asm
index 8d03970ca..a8f5c051a 100644
--- a/mpn/x86_64/core2/aorrlsh_n.asm
+++ b/mpn/x86_64/core2/aorrlsh_n.asm
@@ -20,4 +20,8 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
include_mpn(`x86_64/coreinhm/aorrlsh_n.asm')
diff --git a/mpn/x86_64/core2/aors_n.asm b/mpn/x86_64/core2/aors_n.asm
index 75807c79a..bc109cc22 100644
--- a/mpn/x86_64/core2/aors_n.asm
+++ b/mpn/x86_64/core2/aors_n.asm
@@ -1,6 +1,6 @@
dnl Intel P6-15 mpn_add_n/mpn_sub_n -- mpn add or subtract.
-dnl Copyright 2006, 2007 Free Software Foundation, Inc.
+dnl Copyright 2006, 2007, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -48,16 +48,28 @@ ifdef(`OPERATION_sub_n', `
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
-ASM_START()
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+ASM_START()
TEXT
ALIGN(16)
-
PROLOGUE(func_nc)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
jmp L(start)
EPILOGUE()
PROLOGUE(func)
+ DOS64_ENTRY(4)
xor %r8, %r8
L(start):
mov (up), %r10
@@ -96,6 +108,7 @@ L(end): ADCSBB %r11, %r10
mov %r10, 8(rp)
mov R32(%rcx), R32(%rax) C clear eax, ecx contains 0
adc R32(%rax), R32(%rax)
+ DOS64_EXIT()
ret
ALIGN(16)
diff --git a/mpn/x86_64/core2/aorsmul_1.asm b/mpn/x86_64/core2/aorsmul_1.asm
index bb4f663c4..aeda30159 100644
--- a/mpn/x86_64/core2/aorsmul_1.asm
+++ b/mpn/x86_64/core2/aorsmul_1.asm
@@ -1,6 +1,7 @@
dnl x86-64 mpn_addmul_1 and mpn_submul_1, optimized for "Core 2".
-dnl Copyright 2003, 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc.
+dnl Copyright 2003, 2004, 2005, 2007, 2008, 2009, 2011 Free Software
+dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -45,10 +46,14 @@ ifdef(`OPERATION_submul_1',`
MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(func)
+ DOS64_ENTRY(4)
push %rbx
push %rbp
lea (%rdx), %rbx
@@ -127,5 +132,6 @@ L(n1): mov 8(rp), %r10
adc %rdx, %rax
pop %rbp
pop %rbx
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/core2/gmp-mparam.h b/mpn/x86_64/core2/gmp-mparam.h
index 43adaa078..0752688fd 100644
--- a/mpn/x86_64/core2/gmp-mparam.h
+++ b/mpn/x86_64/core2/gmp-mparam.h
@@ -31,14 +31,15 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 16
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 26
#define MUL_TOOM22_THRESHOLD 23
#define MUL_TOOM33_THRESHOLD 65
-#define MUL_TOOM44_THRESHOLD 178
-#define MUL_TOOM6H_THRESHOLD 222
-#define MUL_TOOM8H_THRESHOLD 0
+#define MUL_TOOM44_THRESHOLD 169
+#define MUL_TOOM6H_THRESHOLD 254
+#define MUL_TOOM8H_THRESHOLD 357
#define MUL_TOOM32_TO_TOOM43_THRESHOLD 69
#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107
@@ -48,15 +49,17 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
#define SQR_TOOM2_THRESHOLD 26
#define SQR_TOOM3_THRESHOLD 85
-#define SQR_TOOM4_THRESHOLD 160
-#define SQR_TOOM6_THRESHOLD 218
-#define SQR_TOOM8_THRESHOLD 296
+#define SQR_TOOM4_THRESHOLD 226
+#define SQR_TOOM6_THRESHOLD 0 /* always */
+#define SQR_TOOM8_THRESHOLD 454
#define MULMID_TOOM42_THRESHOLD 24
#define MULMOD_BNM1_THRESHOLD 15
#define SQRMOD_BNM1_THRESHOLD 15
+#define POWM_SEC_TABLE 2,41,322,840,1100,1556
+
#define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 380, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \
@@ -156,8 +159,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_THRESHOLD 2752
#define MULLO_BASECASE_THRESHOLD 3
-#define MULLO_DC_THRESHOLD 20
-#define MULLO_MUL_N_THRESHOLD 10950
+#define MULLO_DC_THRESHOLD 18
+#define MULLO_MUL_N_THRESHOLD 9174
#define DC_DIV_QR_THRESHOLD 47
#define DC_DIVAPPR_Q_THRESHOLD 179
@@ -180,11 +183,13 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MATRIX22_STRASSEN_THRESHOLD 18
#define HGCD_THRESHOLD 135
+#define HGCD_APPR_THRESHOLD 169
+#define HGCD_REDUCE_THRESHOLD 2121
#define GCD_DC_THRESHOLD 330
#define GCDEXT_DC_THRESHOLD 361
#define JACOBI_BASE_METHOD 4
#define GET_STR_DC_THRESHOLD 13
#define GET_STR_PRECOMPUTE_THRESHOLD 23
-#define SET_STR_DC_THRESHOLD 746
+#define SET_STR_DC_THRESHOLD 552
#define SET_STR_PRECOMPUTE_THRESHOLD 1893
diff --git a/mpn/x86_64/core2/lshift.asm b/mpn/x86_64/core2/lshift.asm
index 3b17e8315..2e175de76 100644
--- a/mpn/x86_64/core2/lshift.asm
+++ b/mpn/x86_64/core2/lshift.asm
@@ -1,6 +1,6 @@
dnl x86-64 mpn_lshift optimized for "Core 2".
-dnl Copyright 2007, 2009 Free Software Foundation, Inc.
+dnl Copyright 2007, 2009, 2011 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
@@ -35,12 +35,16 @@ C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n', `%rdx')
-define(`cnt', `%cl')
+define(`cnt', `%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_lshift)
+ DOS64_ENTRY(4)
lea -8(rp,n,8), rp
lea -8(up,n,8), up
@@ -51,7 +55,7 @@ L(b00): C n = 4, 8, 12, ...
mov (up), %r10
mov -8(up), %r11
xor R32(%rax), R32(%rax)
- shld R8(%rcx), %r10, %rax
+ shld R8(cnt), %r10, %rax
mov -16(up), %r8
lea 24(rp), rp
sub $4, n
@@ -62,7 +66,7 @@ L(nb00):C n = 1, 5, 9, ...
jae L(nb01)
L(b01): mov (up), %r9
xor R32(%rax), R32(%rax)
- shld R8(%rcx), %r9, %rax
+ shld R8(cnt), %r9, %rax
sub $2, n
jb L(le1)
mov -8(up), %r10
@@ -70,8 +74,9 @@ L(b01): mov (up), %r9
lea -8(up), up
lea 16(rp), rp
jmp L(01)
-L(le1): shl R8(%rcx), %r9
+L(le1): shl R8(cnt), %r9
mov %r9, (rp)
+ DOS64_EXIT()
ret
L(nb01):C n = 2, 6, 10, ...
@@ -79,17 +84,18 @@ L(nb01):C n = 2, 6, 10, ...
L(b10): mov (up), %r8
mov -8(up), %r9
xor R32(%rax), R32(%rax)
- shld R8(%rcx), %r8, %rax
+ shld R8(cnt), %r8, %rax
sub $3, n
jb L(le2)
mov -16(up), %r10
lea -16(up), up
lea 8(rp), rp
jmp L(10)
-L(le2): shld R8(%rcx), %r9, %r8
+L(le2): shld R8(cnt), %r9, %r8
mov %r8, (rp)
- shl R8(%rcx), %r9
+ shl R8(cnt), %r9
mov %r9, -8(rp)
+ DOS64_EXIT()
ret
ALIGN(16) C performance critical!
@@ -97,23 +103,23 @@ L(b11): C n = 3, 7, 11, ...
mov (up), %r11
mov -8(up), %r8
xor R32(%rax), R32(%rax)
- shld R8(%rcx), %r11, %rax
+ shld R8(cnt), %r11, %rax
mov -16(up), %r9
lea -24(up), up
sub $4, n
jb L(end)
ALIGN(16)
-L(top): shld R8(%rcx), %r8, %r11
+L(top): shld R8(cnt), %r8, %r11
mov (up), %r10
mov %r11, (rp)
-L(10): shld R8(%rcx), %r9, %r8
+L(10): shld R8(cnt), %r9, %r8
mov -8(up), %r11
mov %r8, -8(rp)
-L(01): shld R8(%rcx), %r10, %r9
+L(01): shld R8(cnt), %r10, %r9
mov -16(up), %r8
mov %r9, -16(rp)
-L(00): shld R8(%rcx), %r11, %r10
+L(00): shld R8(cnt), %r11, %r10
mov -24(up), %r9
mov %r10, -24(rp)
add $-32, up
@@ -121,11 +127,12 @@ L(00): shld R8(%rcx), %r11, %r10
sub $4, n
jnc L(top)
-L(end): shld R8(%rcx), %r8, %r11
+L(end): shld R8(cnt), %r8, %r11
mov %r11, (rp)
- shld R8(%rcx), %r9, %r8
+ shld R8(cnt), %r9, %r8
mov %r8, -8(rp)
- shl R8(%rcx), %r9
+ shl R8(cnt), %r9
mov %r9, -16(rp)
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/core2/lshiftc.asm b/mpn/x86_64/core2/lshiftc.asm
index a19f72297..31a08f7ae 100644
--- a/mpn/x86_64/core2/lshiftc.asm
+++ b/mpn/x86_64/core2/lshiftc.asm
@@ -1,6 +1,6 @@
dnl x86-64 mpn_lshiftc optimized for "Core 2".
-dnl Copyright 2007, 2009 Free Software Foundation, Inc.
+dnl Copyright 2007, 2009, 2011 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
@@ -35,12 +35,16 @@ C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n', `%rdx')
-define(`cnt', `%cl')
+define(`cnt', `%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_lshiftc)
+ DOS64_ENTRY(4)
lea -8(rp,n,8), rp
lea -8(up,n,8), up
@@ -51,7 +55,7 @@ L(b00): C n = 4, 8, 12, ...
mov (up), %r10
mov -8(up), %r11
xor R32(%rax), R32(%rax)
- shld R8(%rcx), %r10, %rax
+ shld R8(cnt), %r10, %rax
mov -16(up), %r8
lea 24(rp), rp
sub $4, n
@@ -62,7 +66,7 @@ L(nb00):C n = 1, 5, 9, ...
jae L(nb01)
L(b01): mov (up), %r9
xor R32(%rax), R32(%rax)
- shld R8(%rcx), %r9, %rax
+ shld R8(cnt), %r9, %rax
sub $2, n
jb L(le1)
mov -8(up), %r10
@@ -70,9 +74,10 @@ L(b01): mov (up), %r9
lea -8(up), up
lea 16(rp), rp
jmp L(01)
-L(le1): shl R8(%rcx), %r9
+L(le1): shl R8(cnt), %r9
not %r9
mov %r9, (rp)
+ DOS64_EXIT()
ret
L(nb01):C n = 2, 6, 10, ...
@@ -80,19 +85,20 @@ L(nb01):C n = 2, 6, 10, ...
L(b10): mov (up), %r8
mov -8(up), %r9
xor R32(%rax), R32(%rax)
- shld R8(%rcx), %r8, %rax
+ shld R8(cnt), %r8, %rax
sub $3, n
jb L(le2)
mov -16(up), %r10
lea -16(up), up
lea 8(rp), rp
jmp L(10)
-L(le2): shld R8(%rcx), %r9, %r8
+L(le2): shld R8(cnt), %r9, %r8
not %r8
mov %r8, (rp)
- shl R8(%rcx), %r9
+ shl R8(cnt), %r9
not %r9
mov %r9, -8(rp)
+ DOS64_EXIT()
ret
ALIGN(16) C performance critical!
@@ -100,26 +106,26 @@ L(b11): C n = 3, 7, 11, ...
mov (up), %r11
mov -8(up), %r8
xor R32(%rax), R32(%rax)
- shld R8(%rcx), %r11, %rax
+ shld R8(cnt), %r11, %rax
mov -16(up), %r9
lea -24(up), up
sub $4, n
jb L(end)
ALIGN(16)
-L(top): shld R8(%rcx), %r8, %r11
+L(top): shld R8(cnt), %r8, %r11
mov (up), %r10
not %r11
mov %r11, (rp)
-L(10): shld R8(%rcx), %r9, %r8
+L(10): shld R8(cnt), %r9, %r8
mov -8(up), %r11
not %r8
mov %r8, -8(rp)
-L(01): shld R8(%rcx), %r10, %r9
+L(01): shld R8(cnt), %r10, %r9
mov -16(up), %r8
not %r9
mov %r9, -16(rp)
-L(00): shld R8(%rcx), %r11, %r10
+L(00): shld R8(cnt), %r11, %r10
mov -24(up), %r9
not %r10
mov %r10, -24(rp)
@@ -128,14 +134,15 @@ L(00): shld R8(%rcx), %r11, %r10
sub $4, n
jnc L(top)
-L(end): shld R8(%rcx), %r8, %r11
+L(end): shld R8(cnt), %r8, %r11
not %r11
mov %r11, (rp)
- shld R8(%rcx), %r9, %r8
+ shld R8(cnt), %r9, %r8
not %r8
mov %r8, -8(rp)
- shl R8(%rcx), %r9
+ shl R8(cnt), %r9
not %r9
mov %r9, -16(rp)
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/core2/rsh1aors_n.asm b/mpn/x86_64/core2/rsh1aors_n.asm
index eb52efc08..b350e4a43 100644
--- a/mpn/x86_64/core2/rsh1aors_n.asm
+++ b/mpn/x86_64/core2/rsh1aors_n.asm
@@ -1,6 +1,6 @@
dnl Intel P6/64 mpn_rsh1add_n and mpn_rsh1sub_n -- rp[] = (up[] +- vp[]) >> 1
-dnl Copyright 2003, 2005, 2009, 2010 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005, 2009, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -49,11 +49,24 @@ ifdef(`OPERATION_rsh1sub_n', `
MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(func_nc)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
push %rbx
push %rbp
@@ -66,6 +79,7 @@ EPILOGUE()
ALIGN(16)
PROLOGUE(func_n)
+ DOS64_ENTRY(4)
push %rbx
push %rbp
@@ -171,5 +185,6 @@ L(end): shrd $1, %rbx, %rbp
mov %rbp, (rp)
pop %rbp
pop %rbx
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/core2/rshift.asm b/mpn/x86_64/core2/rshift.asm
index 38a77364f..68306881c 100644
--- a/mpn/x86_64/core2/rshift.asm
+++ b/mpn/x86_64/core2/rshift.asm
@@ -1,6 +1,6 @@
dnl x86-64 mpn_rshift optimized for "Core 2".
-dnl Copyright 2007, 2009 Free Software Foundation, Inc.
+dnl Copyright 2007, 2009, 2011 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
@@ -35,12 +35,16 @@ C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n', `%rdx')
-define(`cnt', `%cl')
+define(`cnt', `%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_rshift)
+ DOS64_ENTRY(4)
mov R32(%rdx), R32(%rax)
and $3, R32(%rax)
jne L(nb00)
@@ -48,7 +52,7 @@ L(b00): C n = 4, 8, 12, ...
mov (up), %r10
mov 8(up), %r11
xor R32(%rax), R32(%rax)
- shrd R8(%rcx), %r10, %rax
+ shrd R8(cnt), %r10, %rax
mov 16(up), %r8
lea 8(up), up
lea -24(rp), rp
@@ -60,7 +64,7 @@ L(nb00):C n = 1, 5, 9, ...
jae L(nb01)
L(b01): mov (up), %r9
xor R32(%rax), R32(%rax)
- shrd R8(%rcx), %r9, %rax
+ shrd R8(cnt), %r9, %rax
sub $2, n
jb L(le1)
mov 8(up), %r10
@@ -68,8 +72,9 @@ L(b01): mov (up), %r9
lea 16(up), up
lea -16(rp), rp
jmp L(01)
-L(le1): shr R8(%rcx), %r9
+L(le1): shr R8(cnt), %r9
mov %r9, (rp)
+ DOS64_EXIT()
ret
L(nb01):C n = 2, 6, 10, ...
@@ -77,17 +82,18 @@ L(nb01):C n = 2, 6, 10, ...
L(b10): mov (up), %r8
mov 8(up), %r9
xor R32(%rax), R32(%rax)
- shrd R8(%rcx), %r8, %rax
+ shrd R8(cnt), %r8, %rax
sub $3, n
jb L(le2)
mov 16(up), %r10
lea 24(up), up
lea -8(rp), rp
jmp L(10)
-L(le2): shrd R8(%rcx), %r9, %r8
+L(le2): shrd R8(cnt), %r9, %r8
mov %r8, (rp)
- shr R8(%rcx), %r9
+ shr R8(cnt), %r9
mov %r9, 8(rp)
+ DOS64_EXIT()
ret
ALIGN(16)
@@ -95,23 +101,23 @@ L(b11): C n = 3, 7, 11, ...
mov (up), %r11
mov 8(up), %r8
xor R32(%rax), R32(%rax)
- shrd R8(%rcx), %r11, %rax
+ shrd R8(cnt), %r11, %rax
mov 16(up), %r9
lea 32(up), up
sub $4, n
jb L(end)
ALIGN(16)
-L(top): shrd R8(%rcx), %r8, %r11
+L(top): shrd R8(cnt), %r8, %r11
mov -8(up), %r10
mov %r11, (rp)
-L(10): shrd R8(%rcx), %r9, %r8
+L(10): shrd R8(cnt), %r9, %r8
mov (up), %r11
mov %r8, 8(rp)
-L(01): shrd R8(%rcx), %r10, %r9
+L(01): shrd R8(cnt), %r10, %r9
mov 8(up), %r8
mov %r9, 16(rp)
-L(00): shrd R8(%rcx), %r11, %r10
+L(00): shrd R8(cnt), %r11, %r10
mov 16(up), %r9
mov %r10, 24(rp)
add $32, up
@@ -119,11 +125,12 @@ L(00): shrd R8(%rcx), %r11, %r10
sub $4, n
jnc L(top)
-L(end): shrd R8(%rcx), %r8, %r11
+L(end): shrd R8(cnt), %r8, %r11
mov %r11, (rp)
- shrd R8(%rcx), %r9, %r8
+ shrd R8(cnt), %r9, %r8
mov %r8, 8(rp)
- shr R8(%rcx), %r9
+ shr R8(cnt), %r9
mov %r9, 16(rp)
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/core2/sublsh1_n.asm b/mpn/x86_64/core2/sublsh1_n.asm
index 7522b429f..50411d7d0 100644
--- a/mpn/x86_64/core2/sublsh1_n.asm
+++ b/mpn/x86_64/core2/sublsh1_n.asm
@@ -2,7 +2,7 @@ dnl AMD64 mpn_sublsh1_n optimised for Core 2 and Core iN.
dnl Contributed to the GNU project by Torbjorn Granlund.
-dnl Copyright 2008, 2010 Free Software Foundation, Inc.
+dnl Copyright 2008, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -30,4 +30,7 @@ define(func, mpn_sublsh1_n)
MULFUNC_PROLOGUE(mpn_sublsh1_n)
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
include_mpn(`x86_64/core2/sublshC_n.asm')
diff --git a/mpn/x86_64/core2/sublsh2_n.asm b/mpn/x86_64/core2/sublsh2_n.asm
index 036d2c859..affc87177 100644
--- a/mpn/x86_64/core2/sublsh2_n.asm
+++ b/mpn/x86_64/core2/sublsh2_n.asm
@@ -2,7 +2,7 @@ dnl AMD64 mpn_sublsh2_n optimised for Core 2 and Core iN.
dnl Contributed to the GNU project by Torbjorn Granlund.
-dnl Copyright 2008, 2010 Free Software Foundation, Inc.
+dnl Copyright 2008, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -30,4 +30,7 @@ define(func, mpn_sublsh2_n)
MULFUNC_PROLOGUE(mpn_sublsh2_n)
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
include_mpn(`x86_64/core2/sublshC_n.asm')
diff --git a/mpn/x86_64/core2/sublshC_n.asm b/mpn/x86_64/core2/sublshC_n.asm
index 2f89c35e3..7c4545f5a 100644
--- a/mpn/x86_64/core2/sublshC_n.asm
+++ b/mpn/x86_64/core2/sublshC_n.asm
@@ -3,7 +3,7 @@ dnl Core iN.
dnl Contributed to the GNU project by Torbjorn Granlund.
-dnl Copyright 2008, 2010 Free Software Foundation, Inc.
+dnl Copyright 2008, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -40,6 +40,7 @@ ASM_START()
TEXT
ALIGN(8)
PROLOGUE(func)
+ DOS64_ENTRY(4)
push %rbx
push %r12
@@ -141,5 +142,6 @@ L(end): shr $RSH, %r11
pop %rbx
sub R32(%r11), R32(%rax)
neg R32(%rax)
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/coreinhm/aorrlsh_n.asm b/mpn/x86_64/coreinhm/aorrlsh_n.asm
index a4afae69d..e22cc065d 100644
--- a/mpn/x86_64/coreinhm/aorrlsh_n.asm
+++ b/mpn/x86_64/coreinhm/aorrlsh_n.asm
@@ -62,10 +62,23 @@ C mpn_rsblsh_nc removed below, its idea of carry-in is inconsistent with
C refmpn_rsblsh_nc
MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(func_n)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ') C cnt
push %rbx
xor R32(%rbx), R32(%rbx) C clear CF save register
L(ent): push %rbp
@@ -170,9 +183,13 @@ L(wd1): shrd %cl, %r8, %r11
IFRSB( neg %rax)
pop %rbp
pop %rbx
+ DOS64_EXIT()
ret
EPILOGUE()
PROLOGUE(func_nc)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ') C cnt
+IFDOS(` mov 64(%rsp), %r9 ') C cy
push %rbx
neg cy
sbb R32(%rbx), R32(%rbx) C initialise CF save register
diff --git a/mpn/x86_64/coreinhm/gmp-mparam.h b/mpn/x86_64/coreinhm/gmp-mparam.h
index eec17787d..0a0ada3c5 100644
--- a/mpn/x86_64/coreinhm/gmp-mparam.h
+++ b/mpn/x86_64/coreinhm/gmp-mparam.h
@@ -31,6 +31,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 15
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 17
@@ -52,56 +53,92 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_TOOM6_THRESHOLD 318
#define SQR_TOOM8_THRESHOLD 502
+#define MULMID_TOOM42_THRESHOLD 22
+
#define MULMOD_BNM1_THRESHOLD 13
#define SQRMOD_BNM1_THRESHOLD 13
+#define POWM_SEC_TABLE 3,42,83,643,2080
+
#define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 380, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
- { 23, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
+ { 10, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \
+ { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
{ 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
{ 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
{ 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
- { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
{ 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
{ 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
{ 31,10}, { 79,11}, { 47,10}, { 95,12}, \
{ 31,11}, { 63,10}, { 135,11}, { 79,10}, \
- { 159, 9}, { 319, 8}, { 639,10}, { 167,11}, \
- { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \
- { 575,11}, { 159,10}, { 319,12}, { 95,11}, \
- { 191,10}, { 383,11}, { 207,13}, { 8192,14}, \
- { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
- { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
- {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 74
+ { 159,11}, { 95,10}, { 191, 9}, { 383,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 511,11}, \
+ { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \
+ { 159,10}, { 319,12}, { 95,11}, { 191,10}, \
+ { 383,11}, { 207,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 271,10}, { 543,11}, \
+ { 287,10}, { 575,11}, { 303,12}, { 159,11}, \
+ { 319,10}, { 639,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
+ { 831,12}, { 223,11}, { 447,10}, { 895,13}, \
+ { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \
+ { 543,12}, { 287,11}, { 607,12}, { 319,11}, \
+ { 639,12}, { 351,11}, { 703,10}, { 1407,13}, \
+ { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
+ { 831,10}, { 1663,12}, { 447,11}, { 895,12}, \
+ { 479,14}, { 127,13}, { 255,12}, { 511,11}, \
+ { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \
+ { 1151,12}, { 607,13}, { 319,12}, { 703,11}, \
+ { 1407,13}, { 383,12}, { 831,11}, { 1663,13}, \
+ { 447,12}, { 959,11}, { 1919,14}, { 16384,15}, \
+ { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
+ { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+ {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 137
#define MUL_FFT_THRESHOLD 3712
-#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */
+#define SQR_FFT_MODF_THRESHOLD 304 /* k = 5 */
#define SQR_FFT_TABLE3 \
- { { 308, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { { 304, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
{ 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \
{ 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
{ 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
{ 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
{ 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
- { 15, 9}, { 43,10}, { 23, 9}, { 47,11}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
{ 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
- { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
- { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
- { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \
- { 511,10}, { 135,11}, { 79,10}, { 159, 9}, \
- { 319,11}, { 95,10}, { 191, 9}, { 383, 8}, \
- { 767,12}, { 63,10}, { 255,11}, { 143, 9}, \
- { 575, 8}, { 1151,11}, { 159,10}, { 319, 9}, \
- { 639,11}, { 175,12}, { 95,11}, { 191,10}, \
- { 383,13}, { 8192,14}, { 16384,15}, { 32768,16}, \
- { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
- {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 76
-#define SQR_FFT_THRESHOLD 3200
+ { 79,10}, { 47,11}, { 31,10}, { 79,11}, \
+ { 47,12}, { 31,11}, { 63,10}, { 127, 9}, \
+ { 255,11}, { 79,10}, { 159, 9}, { 319,11}, \
+ { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
+ { 543,11}, { 143,10}, { 287, 9}, { 575,11}, \
+ { 159,10}, { 319,11}, { 175,12}, { 95,11}, \
+ { 191,10}, { 383,11}, { 207,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
+ { 543,11}, { 287,10}, { 575,12}, { 159,11}, \
+ { 319,10}, { 639,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
+ { 831,12}, { 223,11}, { 447,10}, { 895,11}, \
+ { 479,13}, { 127,12}, { 255,11}, { 511,10}, \
+ { 1023,11}, { 543,12}, { 287,11}, { 575,10}, \
+ { 1151,12}, { 319,11}, { 639,12}, { 351,11}, \
+ { 703,13}, { 191,12}, { 383,11}, { 767,12}, \
+ { 415,11}, { 831,12}, { 447,11}, { 895,12}, \
+ { 479,11}, { 959,14}, { 127,13}, { 255,12}, \
+ { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \
+ { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \
+ { 639,11}, { 1279,12}, { 703,11}, { 1407,13}, \
+ { 383,12}, { 767,11}, { 1535,12}, { 831,13}, \
+ { 447,12}, { 959,11}, { 1919,14}, { 16384,15}, \
+ { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
+ { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+ {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 137
+#define SQR_FFT_THRESHOLD 2752
#define MULLO_BASECASE_THRESHOLD 4
#define MULLO_DC_THRESHOLD 21
@@ -112,8 +149,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DC_BDIV_QR_THRESHOLD 32
#define DC_BDIV_Q_THRESHOLD 70
-#define INV_MULMOD_BNM1_THRESHOLD 46
-#define INV_NEWTON_THRESHOLD 195
+#define INV_MULMOD_BNM1_THRESHOLD 34
+#define INV_NEWTON_THRESHOLD 177
#define INV_APPR_THRESHOLD 147
#define BINV_NEWTON_THRESHOLD 252
@@ -126,13 +163,15 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_QR_THRESHOLD 1120
#define MU_BDIV_Q_THRESHOLD 1187
-#define MATRIX22_STRASSEN_THRESHOLD 17
-#define HGCD_THRESHOLD 117
-#define GCD_DC_THRESHOLD 330
-#define GCDEXT_DC_THRESHOLD 382
+#define MATRIX22_STRASSEN_THRESHOLD 15
+#define HGCD_THRESHOLD 126
+#define HGCD_APPR_THRESHOLD 171
+#define HGCD_REDUCE_THRESHOLD 2205
+#define GCD_DC_THRESHOLD 345
+#define GCDEXT_DC_THRESHOLD 386
#define JACOBI_BASE_METHOD 4
-#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_DC_THRESHOLD 15
#define GET_STR_PRECOMPUTE_THRESHOLD 20
-#define SET_STR_DC_THRESHOLD 552
-#define SET_STR_PRECOMPUTE_THRESHOLD 1655
+#define SET_STR_DC_THRESHOLD 232
+#define SET_STR_PRECOMPUTE_THRESHOLD 1585
diff --git a/mpn/x86_64/coreisbr/aors_n.asm b/mpn/x86_64/coreisbr/aors_n.asm
index 66a5e3b60..4d8d1cccf 100644
--- a/mpn/x86_64/coreisbr/aors_n.asm
+++ b/mpn/x86_64/coreisbr/aors_n.asm
@@ -49,10 +49,22 @@ ifdef(`OPERATION_sub_n', `
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(func)
+ DOS64_ENTRY(4)
xor %r8, %r8
L(ent): mov R32(n), R32(%rax)
shr $2, n
@@ -144,5 +156,7 @@ L(e1): ADCSBB 16(vp), %r10
ret
EPILOGUE()
PROLOGUE(func_nc)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
jmp L(ent)
EPILOGUE()
diff --git a/mpn/x86_64/coreisbr/gmp-mparam.h b/mpn/x86_64/coreisbr/gmp-mparam.h
index e4727116b..c30c64ec8 100644
--- a/mpn/x86_64/coreisbr/gmp-mparam.h
+++ b/mpn/x86_64/coreisbr/gmp-mparam.h
@@ -29,8 +29,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 20
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 6
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 30
@@ -52,58 +53,123 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_TOOM6_THRESHOLD 0
#define SQR_TOOM8_THRESHOLD 458
-#define MULMOD_BNM1_THRESHOLD 11
-#define SQRMOD_BNM1_THRESHOLD 16
+#define MULMID_TOOM42_THRESHOLD 24
-#define MUL_FFT_MODF_THRESHOLD 376 /* k = 5 */
+#define MULMOD_BNM1_THRESHOLD 14
+#define SQRMOD_BNM1_THRESHOLD 14
+
+#define POWM_SEC_TABLE 4,35,130,713,2080
+
+#define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */
#define MUL_FFT_TABLE3 \
- { { 376, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 10, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
- { 21, 7}, { 11, 6}, { 23, 7}, { 13, 6}, \
- { 27, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \
+ { { 380, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
+ { 23, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \
{ 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
{ 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
{ 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
{ 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
{ 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \
{ 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
- { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \
+ { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \
{ 95,10}, { 55,11}, { 31,10}, { 79,11}, \
{ 47,10}, { 95,12}, { 31,11}, { 63,10}, \
- { 135,11}, { 79,10}, { 167,11}, { 95,10}, \
- { 191, 9}, { 383,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 511,10}, { 271,11}, { 143,10}, \
- { 287, 9}, { 575,11}, { 159,10}, { 319,12}, \
- { 95,11}, { 191,10}, { 383,11}, { 207,13}, \
- { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
+ { 135,11}, { 79,10}, { 159, 9}, { 319,10}, \
+ { 167,11}, { 95,10}, { 191, 9}, { 383,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
+ { 271,11}, { 143,10}, { 287, 9}, { 575,10}, \
+ { 303,11}, { 159,10}, { 319,12}, { 95,11}, \
+ { 191,10}, { 383,11}, { 207,10}, { 415,13}, \
+ { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 271,10}, { 543,11}, { 287,10}, { 575,11}, \
+ { 303,10}, { 607,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 351,10}, { 703, 9}, { 1407,11}, \
+ { 367,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 415,10}, { 831,12}, { 223,11}, { 447,10}, \
+ { 895,13}, { 127,12}, { 255,11}, { 543,10}, \
+ { 1087,12}, { 287,11}, { 575,10}, { 1151,11}, \
+ { 607,12}, { 319,11}, { 639,12}, { 351,11}, \
+ { 703,10}, { 1407,11}, { 735,13}, { 191,12}, \
+ { 383,11}, { 767,12}, { 415,11}, { 831,10}, \
+ { 1663,12}, { 447,11}, { 895,14}, { 127,13}, \
+ { 255,12}, { 511,11}, { 1023,12}, { 543,11}, \
+ { 1087,12}, { 575,11}, { 1151,12}, { 607,11}, \
+ { 1215,13}, { 319,12}, { 639,11}, { 1279,12}, \
+ { 703,11}, { 1407,13}, { 383,12}, { 767,11}, \
+ { 1535,12}, { 831,11}, { 1663,13}, { 447,12}, \
+ { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \
+ { 1087,13}, { 575,12}, { 1215,11}, { 2431,13}, \
+ { 639,12}, { 1279,13}, { 703,12}, { 1407,14}, \
+ { 383,13}, { 831,12}, { 1663,13}, { 959,12}, \
+ { 1919,14}, { 511,13}, { 1087,12}, { 2175,13}, \
+ { 1215,12}, { 2431,14}, { 639,13}, { 1343,12}, \
+ { 2687,13}, { 1407,12}, { 2815,13}, { 1471,14}, \
+ { 767,13}, { 1663,14}, { 895,13}, { 1919,15}, \
+ { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
+ { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \
+ { 1407,13}, { 2815,15}, { 767,14}, { 1663,13}, \
+ { 3455,14}, { 1919,13}, { 3839,16}, { 511,15}, \
+ { 1023,14}, { 2431,13}, { 4863,15}, { 1279,14}, \
+ { 2943,13}, { 5887,15}, { 32768,16}, { 65536,17}, \
{ 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
{2097152,22}, {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 83
-#define MUL_FFT_THRESHOLD 3712
+#define MUL_FFT_TABLE3_SIZE 203
+#define MUL_FFT_THRESHOLD 4736
-#define SQR_FFT_MODF_THRESHOLD 316 /* k = 5 */
+#define SQR_FFT_MODF_THRESHOLD 304 /* k = 5 */
#define SQR_FFT_TABLE3 \
- { { 316, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { { 304, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
{ 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \
- { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \
+ { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
{ 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
- { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
{ 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
- { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
- { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
+ { 15,10}, { 31, 9}, { 63,10}, { 39, 9}, \
{ 79,10}, { 47,11}, { 31,10}, { 79,11}, \
- { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
- { 127, 9}, { 255,11}, { 79,10}, { 159, 9}, \
- { 319,11}, { 95,10}, { 191, 9}, { 383,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
- { 271, 9}, { 543,11}, { 143,10}, { 287, 9}, \
- { 575,10}, { 303,11}, { 159,10}, { 319, 9}, \
- { 639,12}, { 95,11}, { 191,10}, { 383,11}, \
- { 207,13}, { 8192,14}, { 16384,15}, { 32768,16}, \
- { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
- {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 76
-#define SQR_FFT_THRESHOLD 3264
+ { 47,12}, { 31,11}, { 63,10}, { 127, 9}, \
+ { 255, 8}, { 511,10}, { 135,11}, { 79,10}, \
+ { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \
+ { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \
+ { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \
+ { 639,12}, { 95,11}, { 191,10}, { 383, 9}, \
+ { 767,11}, { 207,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 271,10}, { 543,11}, \
+ { 287,10}, { 575,11}, { 303,12}, { 159,11}, \
+ { 319,10}, { 639,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
+ { 831,12}, { 223,11}, { 447,10}, { 895,11}, \
+ { 479,10}, { 959,13}, { 127,12}, { 255,11}, \
+ { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \
+ { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \
+ { 639,10}, { 1279,12}, { 351,11}, { 703,13}, \
+ { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
+ { 831,12}, { 447,11}, { 895,12}, { 479,11}, \
+ { 959,10}, { 1919,14}, { 127,13}, { 255,12}, \
+ { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \
+ { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \
+ { 639,11}, { 1279,12}, { 703,11}, { 1407,13}, \
+ { 383,12}, { 767,11}, { 1535,12}, { 831,13}, \
+ { 447,12}, { 959,11}, { 1919,14}, { 255,13}, \
+ { 511,12}, { 1087,13}, { 575,12}, { 1215,11}, \
+ { 2431,13}, { 639,12}, { 1279,13}, { 703,12}, \
+ { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \
+ { 831,12}, { 1663,13}, { 959,12}, { 1919,15}, \
+ { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \
+ { 1215,12}, { 2431,14}, { 639,13}, { 1343,12}, \
+ { 2687,13}, { 1407,12}, { 2815,13}, { 1471,14}, \
+ { 767,13}, { 1663,14}, { 895,13}, { 1919,15}, \
+ { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
+ { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \
+ { 1407,13}, { 2815,15}, { 767,14}, { 1663,13}, \
+ { 3455,14}, { 1919,16}, { 511,15}, { 1023,14}, \
+ { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \
+ { 5887,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 198
+#define SQR_FFT_THRESHOLD 2752
#define MULLO_BASECASE_THRESHOLD 5
#define MULLO_DC_THRESHOLD 33
@@ -114,27 +180,29 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DC_BDIV_QR_THRESHOLD 31
#define DC_BDIV_Q_THRESHOLD 71
-#define INV_MULMOD_BNM1_THRESHOLD 38
-#define INV_NEWTON_THRESHOLD 127
-#define INV_APPR_THRESHOLD 123
+#define INV_MULMOD_BNM1_THRESHOLD 50
+#define INV_NEWTON_THRESHOLD 123
+#define INV_APPR_THRESHOLD 122
-#define BINV_NEWTON_THRESHOLD 181
-#define REDC_1_TO_REDC_2_THRESHOLD 17
-#define REDC_2_TO_REDC_N_THRESHOLD 51
+#define BINV_NEWTON_THRESHOLD 197
+#define REDC_1_TO_REDC_2_THRESHOLD 20
+#define REDC_2_TO_REDC_N_THRESHOLD 54
#define MU_DIV_QR_THRESHOLD 1334
#define MU_DIVAPPR_Q_THRESHOLD 1387
-#define MUPI_DIV_QR_THRESHOLD 57
+#define MUPI_DIV_QR_THRESHOLD 46
#define MU_BDIV_QR_THRESHOLD 1142
#define MU_BDIV_Q_THRESHOLD 1308
#define MATRIX22_STRASSEN_THRESHOLD 15
-#define HGCD_THRESHOLD 90
-#define GCD_DC_THRESHOLD 400
-#define GCDEXT_DC_THRESHOLD 372
+#define HGCD_THRESHOLD 91
+#define HGCD_APPR_THRESHOLD 105
+#define HGCD_REDUCE_THRESHOLD 2681
+#define GCD_DC_THRESHOLD 358
+#define GCDEXT_DC_THRESHOLD 351
#define JACOBI_BASE_METHOD 4
-#define GET_STR_DC_THRESHOLD 12
-#define GET_STR_PRECOMPUTE_THRESHOLD 21
-#define SET_STR_DC_THRESHOLD 802
-#define SET_STR_PRECOMPUTE_THRESHOLD 1712
+#define GET_STR_DC_THRESHOLD 14
+#define GET_STR_PRECOMPUTE_THRESHOLD 27
+#define SET_STR_DC_THRESHOLD 781
+#define SET_STR_PRECOMPUTE_THRESHOLD 1940
diff --git a/mpn/x86_64/div_qr_2n_pi1.asm b/mpn/x86_64/div_qr_2n_pi1.asm
index 9f23012da..c28d0a02c 100644
--- a/mpn/x86_64/div_qr_2n_pi1.asm
+++ b/mpn/x86_64/div_qr_2n_pi1.asm
@@ -44,7 +44,7 @@ C TODO
C * Store qh in the same stack slot as di_param, instead of pushing
C it. (we could put it in register %rbp, but then we would need to
C save and restore that instead, which doesn't seem like a win).
-
+
ASM_START()
TEXT
ALIGN(16)
@@ -56,7 +56,7 @@ PROLOGUE(mpn_div_qr_2n_pi1)
push %r13
push %r12
push %rbx
-
+
mov -16(up, un, 8), u1
mov -8(up, un, 8), u2
@@ -135,5 +135,5 @@ L(fix): C Unlikely update. u2 >= d1
inc t1
sub d0, u1
sbb d1, u2
- jmp L(bck)
+ jmp L(bck)
EPILOGUE()
diff --git a/mpn/x86_64/div_qr_2u_pi1.asm b/mpn/x86_64/div_qr_2u_pi1.asm
index cfc7712d5..bdb64c148 100644
--- a/mpn/x86_64/div_qr_2u_pi1.asm
+++ b/mpn/x86_64/div_qr_2u_pi1.asm
@@ -66,7 +66,7 @@ deflit(`FRAME', 56)
movl shift_param, R32(%rcx)
C FIXME: Different code for SHLD_SLOW
-
+
xor R32(u2), R32(u2)
mov 8(up, un, 8), u1
shld %cl, u1, u2
@@ -173,7 +173,7 @@ L(fix): C Unlikely update. u2 >= d1
inc t1
sub d0, u1
sbb d1, u2
- jmp L(bck)
+ jmp L(bck)
C Duplicated, just jumping back to a different address.
L(fix_qh): C Unlikely update. u2 >= d1
@@ -185,5 +185,5 @@ L(fix_qh): C Unlikely update. u2 >= d1
inc t1
sub d0, u1
sbb d1, u2
- jmp L(bck_qh)
+ jmp L(bck_qh)
EPILOGUE()
diff --git a/mpn/x86_64/dos64.m4 b/mpn/x86_64/dos64.m4
new file mode 100644
index 000000000..ef60834ec
--- /dev/null
+++ b/mpn/x86_64/dos64.m4
@@ -0,0 +1,39 @@
+divert(-1)
+dnl Copyright 2011 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+dnl HOST_DOS64: flag macro with an empty body; the .asm files test it with ifdef to choose between their IFDOS and IFELF variants.
+define(`HOST_DOS64')
+dnl JUMPTABSECT: expands to the assembler directive that switches to the .rdata section (presumably for jump tables, going by the name; users are not in this file).
+define(`JUMPTABSECT', `.section .rdata,"dr"')
+dnl DOS64_ENTRY(n): push rdi and rsi, then copy rcx to rdi and, for n>=2, rdx to rsi, for n>=3, r8 to rdx, for n>=4, r9 to rcx (in that order, so no source is overwritten before it is read).
+define(`DOS64_ENTRY',
+ `push %rdi
+ push %rsi
+ mov %rcx, %rdi
+ifelse(eval($1>=2),1,`dnl
+ mov %rdx, %rsi
+ifelse(eval($1>=3),1,`dnl
+ mov %r8, %rdx
+ifelse(eval($1>=4),1,`dnl
+ mov %r9, %rcx
+')')')')
+dnl DOS64_EXIT(): pop rsi then rdi, undoing the two pushes made by DOS64_ENTRY; it must run before every ret of a function that used DOS64_ENTRY.
+define(`DOS64_EXIT',
+ `pop %rsi
+ pop %rdi')
+
+divert`'dnl
diff --git a/mpn/x86_64/gmp-mparam.h b/mpn/x86_64/gmp-mparam.h
index 99499da2b..aca6853f0 100644
--- a/mpn/x86_64/gmp-mparam.h
+++ b/mpn/x86_64/gmp-mparam.h
@@ -30,6 +30,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 28
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 15
@@ -56,6 +57,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MULMOD_BNM1_THRESHOLD 17
#define SQRMOD_BNM1_THRESHOLD 17
+#define POWM_SEC_TABLE 2,67,322,991
+
#define MUL_FFT_MODF_THRESHOLD 570 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 570, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
@@ -187,10 +190,12 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_QR_THRESHOLD 1589
#define MU_BDIV_Q_THRESHOLD 1718
-#define MATRIX22_STRASSEN_THRESHOLD 17
-#define HGCD_THRESHOLD 139
-#define GCD_DC_THRESHOLD 606
-#define GCDEXT_DC_THRESHOLD 474
+#define MATRIX22_STRASSEN_THRESHOLD 16
+#define HGCD_THRESHOLD 125
+#define HGCD_APPR_THRESHOLD 173
+#define HGCD_REDUCE_THRESHOLD 3524
+#define GCD_DC_THRESHOLD 555
+#define GCDEXT_DC_THRESHOLD 478
#define JACOBI_BASE_METHOD 4
#define GET_STR_DC_THRESHOLD 12
diff --git a/mpn/x86_64/invert_limb.asm b/mpn/x86_64/invert_limb.asm
index 8c6aa68b6..06cf1414a 100644
--- a/mpn/x86_64/invert_limb.asm
+++ b/mpn/x86_64/invert_limb.asm
@@ -2,7 +2,7 @@ dnl AMD64 mpn_invert_limb -- Invert a normalized limb.
dnl Contributed to the GNU project by Torbjorn Granlund and Niels Möller.
-dnl Copyright 2004, 2007, 2008, 2009 Free Software Foundation, Inc.
+dnl Copyright 2004, 2007, 2008, 2009, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -33,11 +33,14 @@ C VIA nano 79 157
C rax rcx rdx rdi rsi r8
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_invert_limb) C Kn C2 Ci
+ DOS64_ENTRY(1)
mov %rdi, %rax C 0 0 0
shr $55, %rax C 1 1 1
ifdef(`PIC',`
@@ -94,6 +97,7 @@ ifdef(`DARWIN',`
adc %rdi, %rdx
sub %rdx, %rax
+ DOS64_EXIT()
ret
EPILOGUE()
ASM_END()
diff --git a/mpn/x86_64/invert_limb_table.asm b/mpn/x86_64/invert_limb_table.asm
index 98a331372..86d75b8ce 100644
--- a/mpn/x86_64/invert_limb_table.asm
+++ b/mpn/x86_64/invert_limb_table.asm
@@ -21,6 +21,9 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
C Table entry X contains floor (0x7fd00 / (0x100 + X))
diff --git a/mpn/x86_64/logops_n.asm b/mpn/x86_64/logops_n.asm
index 1df564a8f..02b9da549 100644
--- a/mpn/x86_64/logops_n.asm
+++ b/mpn/x86_64/logops_n.asm
@@ -1,6 +1,6 @@
dnl AMD64 logops.
-dnl Copyright 2004, 2005, 2006 Free Software Foundation, Inc.
+dnl Copyright 2004, 2005, 2006, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -72,6 +72,8 @@ define(`up',`%rsi')
define(`vp',`%rdx')
define(`n',`%rcx')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
ASM_START()
@@ -79,6 +81,7 @@ ifdef(`VARIANT_1',`
TEXT
ALIGN(32)
PROLOGUE(func)
+ DOS64_ENTRY(4)
movq (vp), %r8
movl R32(%rcx), R32(%rax)
leaq (vp,n,8), vp
@@ -117,7 +120,8 @@ L(e10): movq 24(vp,n,8), %r9
movq %r9, 24(rp,n,8)
addq $4, n
jnc L(oop)
-L(ret): ret
+L(ret): DOS64_EXIT()
+ ret
EPILOGUE()
')
@@ -125,6 +129,7 @@ ifdef(`VARIANT_2',`
TEXT
ALIGN(32)
PROLOGUE(func)
+ DOS64_ENTRY(4)
movq (vp), %r8
notq %r8
movl R32(%rcx), R32(%rax)
@@ -168,7 +173,8 @@ L(e10): movq 24(vp,n,8), %r9
movq %r9, 24(rp,n,8)
addq $4, n
jnc L(oop)
-L(ret): ret
+L(ret): DOS64_EXIT()
+ ret
EPILOGUE()
')
@@ -176,6 +182,7 @@ ifdef(`VARIANT_3',`
TEXT
ALIGN(32)
PROLOGUE(func)
+ DOS64_ENTRY(4)
movq (vp), %r8
movl R32(%rcx), R32(%rax)
leaq (vp,n,8), vp
@@ -220,6 +227,7 @@ L(e10): movq 24(vp,n,8), %r9
movq %r9, 24(rp,n,8)
addq $4, n
jnc L(oop)
-L(ret): ret
+L(ret): DOS64_EXIT()
+ ret
EPILOGUE()
')
diff --git a/mpn/x86_64/lshift.asm b/mpn/x86_64/lshift.asm
index 2f3d5c94d..5852ba9f9 100644
--- a/mpn/x86_64/lshift.asm
+++ b/mpn/x86_64/lshift.asm
@@ -1,6 +1,6 @@
dnl AMD64 mpn_lshift -- mpn left shift.
-dnl Copyright 2003, 2005, 2007, 2009 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005, 2007, 2009, 2011 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
@@ -36,10 +36,14 @@ define(`up', `%rsi')
define(`n', `%rdx')
define(`cnt', `%rcx')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(mpn_lshift)
+ DOS64_ENTRY(4)
cmp $1, R8(%rcx)
jne L(gen)
@@ -83,6 +87,7 @@ L(t1): mov (up), %r8
dec R32(%rax)
jne L(n00)
adc R32(%rax), R32(%rax)
+ DOS64_EXIT()
ret
L(e1): test R32(%rax), R32(%rax) C clear cy
L(n00): mov (up), %r8
@@ -91,6 +96,7 @@ L(n00): mov (up), %r8
adc %r8, %r8
mov %r8, (rp)
L(ret): adc R32(%rax), R32(%rax)
+ DOS64_EXIT()
ret
L(n01): dec R32(%rax)
mov 8(up), %r9
@@ -100,6 +106,7 @@ L(n01): dec R32(%rax)
mov %r8, (rp)
mov %r9, 8(rp)
adc R32(%rax), R32(%rax)
+ DOS64_EXIT()
ret
L(n10): mov 16(up), %r10
adc %r8, %r8
@@ -109,6 +116,7 @@ L(n10): mov 16(up), %r10
mov %r9, 8(rp)
mov %r10, 16(rp)
adc $-1, R32(%rax)
+ DOS64_EXIT()
ret
L(gen): neg R32(%rcx) C put rsh count in cl
@@ -222,5 +230,6 @@ L(end):
L(ast): mov (up), %r10
shl R8(%rcx), %r10
mov %r10, (rp)
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/lshiftc.asm b/mpn/x86_64/lshiftc.asm
index 93bb614d3..b4124b037 100644
--- a/mpn/x86_64/lshiftc.asm
+++ b/mpn/x86_64/lshiftc.asm
@@ -1,6 +1,6 @@
dnl AMD64 mpn_lshiftc -- mpn left shift with complement.
-dnl Copyright 2003, 2005, 2006, 2009 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005, 2006, 2009, 2011 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
@@ -36,10 +36,14 @@ define(`up', `%rsi')
define(`n', `%rdx')
define(`cnt', `%rcx')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(mpn_lshiftc)
+ DOS64_ENTRY(4)
neg R32(%rcx) C put rsh count in cl
mov -8(up,n,8), %rax
shr R8(%rcx), %rax C function return value
@@ -162,5 +166,6 @@ L(ast): mov (up), %r10
shl R8(%rcx), %r10
not %r10
mov %r10, (rp)
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/lshsub_n.asm b/mpn/x86_64/lshsub_n.asm
index 3a42863ad..6e5816b1c 100644
--- a/mpn/x86_64/lshsub_n.asm
+++ b/mpn/x86_64/lshsub_n.asm
@@ -1,6 +1,6 @@
dnl AMD64 mpn_lshsub_n. R = 2^k(U - V).
-dnl Copyright 2006 Free Software Foundation, Inc.
+dnl Copyright 2006, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -44,10 +44,23 @@ define(`vp', `%rdx')
define(`n', `%rcx')
define(`cnt', `%r8')
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_lshsub_n)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
push %r12
push %r13
@@ -151,5 +164,6 @@ L(end):
pop %r13
pop %r12
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/mod_1_1.asm b/mpn/x86_64/mod_1_1.asm
index 6b233e074..8afa96e05 100644
--- a/mpn/x86_64/mod_1_1.asm
+++ b/mpn/x86_64/mod_1_1.asm
@@ -51,7 +51,7 @@ C Note: This implementation needs B1modb only when cnt > 0
C The iteration is almost as follows,
C
C r_2 B^3 + r_1 B^2 + r_0 B + u = r_1 B2modb + (r_0 + r_2 B2mod) B + u
-C
+C
C where r2 is a single bit represented as a mask. But to make sure that the
C result fits in two limbs and a bit, carry from the addition
C
@@ -67,10 +67,14 @@ C the source of the cmov in the loop.
C
C We have the invariant that r_2 B^2 + r_1 B + r_0 < B^2 + B b
+C ABI_SUPPORT(DOS64)
+C ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_mod_1_1p)
+ DOS64_ENTRY(4)
push %rbp
push %rbx
mov %rdx, b
@@ -163,6 +167,7 @@ L(ok): shr R8(%rcx), %rax
pop %rbx
pop %rbp
+ DOS64_EXIT()
ret
L(fix): sub b, %rax
jmp L(ok)
@@ -170,6 +175,7 @@ EPILOGUE()
ALIGN(16)
PROLOGUE(mpn_mod_1_1p_cps)
+ DOS64_ENTRY(2)
push %rbp
bsr %rsi, %rcx
push %rbx
@@ -206,11 +212,12 @@ ifdef(`SHLD_SLOW',`
')
imul %rdx, %r8
shr R8(%rcx), %r8
- mov %r8, 16(%rbx) C store B1modb
+ mov %r8, 16(%rbx) C store B1modb
L(z):
pop %r12
pop %rbx
pop %rbp
+ DOS64_EXIT()
ret
EPILOGUE()
ASM_END()
diff --git a/mpn/x86_64/mod_1_2.asm b/mpn/x86_64/mod_1_2.asm
index a0ecb6855..b09f24bc0 100644
--- a/mpn/x86_64/mod_1_2.asm
+++ b/mpn/x86_64/mod_1_2.asm
@@ -2,7 +2,7 @@ dnl AMD64 mpn_mod_1s_2p
dnl Contributed to the GNU project by Torbjorn Granlund.
-dnl Copyright 2009, 2010 Free Software Foundation, Inc.
+dnl Copyright 2009, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -31,10 +31,14 @@ C Intel SBR 4.5
C Intel atom 28
C VIA nano 8
+C ABI_SUPPORT(DOS64)
+C ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_mod_1s_2p)
+ DOS64_ENTRY(4)
push %r14
test $1, R8(%rsi)
mov %rdx, %r14
@@ -145,6 +149,7 @@ L(1): xor R32(%rcx), R32(%rcx)
pop %r12
pop %r13
pop %r14
+ DOS64_EXIT()
ret
L(one):
mov (%rdi), %r8
@@ -154,6 +159,7 @@ L(one):
EPILOGUE()
PROLOGUE(mpn_mod_1s_2p_cps)
+ DOS64_ENTRY(2)
push %rbp
bsr %rsi, %rcx
push %rbx
@@ -214,5 +220,6 @@ ifdef(`SHLD_SLOW',`
pop %r12
pop %rbx
pop %rbp
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/mod_1_4.asm b/mpn/x86_64/mod_1_4.asm
index d99080d7f..629520877 100644
--- a/mpn/x86_64/mod_1_4.asm
+++ b/mpn/x86_64/mod_1_4.asm
@@ -2,7 +2,7 @@ dnl AMD64 mpn_mod_1s_4p
dnl Contributed to the GNU project by Torbjorn Granlund.
-dnl Copyright 2009, 2010 Free Software Foundation, Inc.
+dnl Copyright 2009, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -30,17 +30,22 @@ C Intel corei 4
C Intel atom 23
C VIA nano 4.75
+C ABI_SUPPORT(DOS64)
+C ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_mod_1s_4p)
+ DOS64_ENTRY(4)
+ push %r15
push %r14
push %r13
push %r12
push %rbp
push %rbx
- mov %rdx, -16(%rsp)
+ mov %rdx, %r15
mov %rcx, %r14
mov 16(%rcx), %r11 C B1modb
mov 24(%rcx), %rbx C B2modb
@@ -135,7 +140,7 @@ L(end): mov 8(%r14), R32(%rsi)
or %rdx, %rdi
mov %rdi, %rax
mulq (%r14)
- mov -16(%rsp), %rbx
+ mov %r15, %rbx
mov %rax, %r9
sal R8(%rcx), %r8
inc %rdi
@@ -155,11 +160,14 @@ L(end): mov 8(%r14), R32(%rsi)
pop %r12
pop %r13
pop %r14
+ pop %r15
+ DOS64_EXIT()
ret
EPILOGUE()
ALIGN(16)
PROLOGUE(mpn_mod_1s_4p_cps)
+ DOS64_ENTRY(2)
push %rbp
bsr %rsi, %rcx
push %rbx
@@ -244,5 +252,6 @@ ifdef(`SHLD_SLOW',`
pop %r12
pop %rbx
pop %rbp
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/mod_34lsub1.asm b/mpn/x86_64/mod_34lsub1.asm
index 08cd7d939..ee4d0d347 100644
--- a/mpn/x86_64/mod_34lsub1.asm
+++ b/mpn/x86_64/mod_34lsub1.asm
@@ -1,7 +1,7 @@
dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
-dnl Copyright 2000, 2001, 2002, 2004, 2005, 2007, 2009, 2010 Free Software
-dnl Foundation, Inc.
+dnl Copyright 2000, 2001, 2002, 2004, 2005, 2007, 2009, 2010, 2011 Free
+dnl Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -39,10 +39,14 @@ C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
C TODO
C * Review feed-in and wind-down code.
+C ABI_SUPPORT(DOS64)
+C ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(mpn_mod_34lsub1)
+ DOS64_ENTRY(2)
mov $0x0000FFFFFFFFFFFF, %r11
@@ -66,7 +70,8 @@ PROLOGUE(mpn_mod_34lsub1)
shl $16, %rdx C src[1] low
add %rdx, %rax
-L(one): ret
+L(one): DOS64_EXIT()
+ ret
C Don't change this, the wind-down code is not able to handle greater values
@@ -176,5 +181,6 @@ L(0): add %r9, %rax
add %rdx, %rax C apply 2mod3 high
add %rdi, %rax C apply 2mod3 low
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/mul_1.asm b/mpn/x86_64/mul_1.asm
index 5f8dc4c9c..3b87bbf01 100644
--- a/mpn/x86_64/mul_1.asm
+++ b/mpn/x86_64/mul_1.asm
@@ -28,38 +28,65 @@ C Intel corei 3.8
C Intel atom 19.8
C VIA nano ?
-C The inner loop of this code is the result of running a code generation and
+C The loop of this code is the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.
-C TODO:
-C * The inner loop is great, but the prologue and epilogue code was
-C quickly written. Tune it!
+C TODO
+C * The loop is great, but the prologue and epilogue code was quickly written.
+C Tune it!
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n_param',`%rdx')
-define(`vl', `%rcx')
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`vl', `%rcx') C r9
-define(`n', `%r11')
+define(`n', `%r11')
+
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
+IFDOS(` define(`up', ``%rsi'') ') dnl
+IFDOS(` define(`rp', ``%rcx'') ') dnl
+IFDOS(` define(`vl', ``%r9'') ') dnl
+IFDOS(` define(`r9', ``rdi'') ') dnl
+IFDOS(` define(`n', ``%r8'') ') dnl
+IFDOS(` define(`r8', ``r11'') ') dnl
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_mul_1c)
+IFDOS(``push %rsi '')
+IFDOS(``push %rdi '')
+IFDOS(``mov %rdx, %rsi '')
push %rbx
- mov %r8, %r10
+IFELF(` mov %r8, %r10')
+IFDOS(` mov 64(%rsp), %r10') C 40 + 3*8 (3 push insns)
jmp L(common)
EPILOGUE()
PROLOGUE(mpn_mul_1)
+
+IFDOS(``push %rsi '')
+IFDOS(``push %rdi '')
+IFDOS(``mov %rdx, %rsi '')
+
push %rbx
xor %r10, %r10
L(common):
mov (up), %rax C read first u limb early
- mov n_param, %rbx C move away n from rdx, mul uses it
+IFELF(` mov n_param, %rbx ') C move away n from rdx, mul uses it
+IFDOS(` mov n, %rbx ')
mul vl
- mov %rbx, %r11
+IFELF(` mov %rbx, n ')
add %r10, %rax
adc $0, %rdx
@@ -145,5 +172,7 @@ L(L2): mul vl
L(ret): mov %rdx, %rax
pop %rbx
+IFDOS(``pop %rdi '')
+IFDOS(``pop %rsi '')
ret
EPILOGUE()
diff --git a/mpn/x86_64/mul_2.asm b/mpn/x86_64/mul_2.asm
index 206a4ea2c..35deefa8b 100644
--- a/mpn/x86_64/mul_2.asm
+++ b/mpn/x86_64/mul_2.asm
@@ -1,7 +1,7 @@
dnl AMD64 mpn_mul_2 -- Multiply an n-limb vector with a 2-limb vector and
dnl store the result in a third limb vector.
-dnl Copyright 2008 Free Software Foundation, Inc.
+dnl Copyright 2008, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -53,10 +53,14 @@ define(`w2', `%rbp')
define(`w3', `%r10')
define(`n', `%r11')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_mul_2)
+ DOS64_ENTRY(4)
push %rbx
push %rbp
@@ -172,5 +176,6 @@ L(m22): mul v1
pop %rbp
pop %rbx
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/mul_basecase.asm b/mpn/x86_64/mul_basecase.asm
index fdba9a6e3..5fede9234 100644
--- a/mpn/x86_64/mul_basecase.asm
+++ b/mpn/x86_64/mul_basecase.asm
@@ -59,10 +59,23 @@ define(`n', `%r11')
define(`outer_addr', `%r14')
define(`un', `%r13')
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_mul_basecase)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
push %rbx
push %rbp
push %r12
@@ -448,6 +461,7 @@ L(ret): pop %r15
pop %r12
pop %rbp
pop %rbx
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/mulmid_basecase.asm b/mpn/x86_64/mulmid_basecase.asm
index 375e7f70e..d2d56d4a4 100644
--- a/mpn/x86_64/mulmid_basecase.asm
+++ b/mpn/x86_64/mulmid_basecase.asm
@@ -50,11 +50,23 @@ define(`vp', `%r15')
define(`vp_inner', `%r10')
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_mulmid_basecase)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
push %rbx
push %rbp
push %r12
@@ -539,6 +551,6 @@ L(ret): pop %r15
pop %r12
pop %rbp
pop %rbx
+ DOS64_EXIT()
ret
-
EPILOGUE()
diff --git a/mpn/x86_64/nano/gmp-mparam.h b/mpn/x86_64/nano/gmp-mparam.h
index a1c556937..7ee41927b 100644
--- a/mpn/x86_64/nano/gmp-mparam.h
+++ b/mpn/x86_64/nano/gmp-mparam.h
@@ -34,6 +34,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 22
@@ -50,13 +51,17 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
#define SQR_TOOM2_THRESHOLD 34
-#define SQR_TOOM3_THRESHOLD 74
-#define SQR_TOOM4_THRESHOLD 620
-#define SQR_TOOM6_THRESHOLD 960
-#define SQR_TOOM8_THRESHOLD 1065
+#define SQR_TOOM3_THRESHOLD 97
+#define SQR_TOOM4_THRESHOLD 592
+#define SQR_TOOM6_THRESHOLD 978
+#define SQR_TOOM8_THRESHOLD 1193
-#define MULMOD_BNM1_THRESHOLD 15
-#define SQRMOD_BNM1_THRESHOLD 17
+#define MULMID_TOOM42_THRESHOLD 28
+
+#define MULMOD_BNM1_THRESHOLD 16
+#define SQRMOD_BNM1_THRESHOLD 20
+
+#define POWM_SEC_TABLE 2,29,387,1421
#define MUL_FFT_MODF_THRESHOLD 376 /* k = 5 */
#define MUL_FFT_TABLE3 \
@@ -176,7 +181,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_TABLE3_SIZE 215
#define SQR_FFT_THRESHOLD 3264
-#define MULLO_BASECASE_THRESHOLD 17
+#define MULLO_BASECASE_THRESHOLD 8
#define MULLO_DC_THRESHOLD 0 /* never mpn_mullo_basecase */
#define MULLO_MUL_N_THRESHOLD 6633
@@ -190,7 +195,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define INV_APPR_THRESHOLD 153
#define BINV_NEWTON_THRESHOLD 182
-#define REDC_1_TO_REDC_2_THRESHOLD 14
+#define REDC_1_TO_REDC_2_THRESHOLD 20
#define REDC_2_TO_REDC_N_THRESHOLD 75
#define MU_DIV_QR_THRESHOLD 1589
@@ -200,12 +205,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_Q_THRESHOLD 1528
#define MATRIX22_STRASSEN_THRESHOLD 17
-#define HGCD_THRESHOLD 84
-#define GCD_DC_THRESHOLD 465
-#define GCDEXT_DC_THRESHOLD 456
+#define HGCD_THRESHOLD 102
+#define HGCD_APPR_THRESHOLD 113
+#define HGCD_REDUCE_THRESHOLD 3389
+#define GCD_DC_THRESHOLD 706
+#define GCDEXT_DC_THRESHOLD 465
#define JACOBI_BASE_METHOD 4
#define GET_STR_DC_THRESHOLD 12
#define GET_STR_PRECOMPUTE_THRESHOLD 24
-#define SET_STR_DC_THRESHOLD 537
-#define SET_STR_PRECOMPUTE_THRESHOLD 1639
+#define SET_STR_DC_THRESHOLD 381
+#define SET_STR_PRECOMPUTE_THRESHOLD 1794
diff --git a/mpn/x86_64/pentium4/gmp-mparam.h b/mpn/x86_64/pentium4/gmp-mparam.h
index 8983304c2..4d49fc2cf 100644
--- a/mpn/x86_64/pentium4/gmp-mparam.h
+++ b/mpn/x86_64/pentium4/gmp-mparam.h
@@ -33,34 +33,39 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 32
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 15
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 38
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 20
#define MUL_TOOM22_THRESHOLD 12
-#define MUL_TOOM33_THRESHOLD 66
+#define MUL_TOOM33_THRESHOLD 74
#define MUL_TOOM44_THRESHOLD 118
#define MUL_TOOM6H_THRESHOLD 157
-#define MUL_TOOM8H_THRESHOLD 242
+#define MUL_TOOM8H_THRESHOLD 430
#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 145
#define MUL_TOOM42_TO_TOOM63_THRESHOLD 80
#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
#define SQR_TOOM2_THRESHOLD 20
-#define SQR_TOOM3_THRESHOLD 77
-#define SQR_TOOM4_THRESHOLD 214
+#define SQR_TOOM3_THRESHOLD 69
+#define SQR_TOOM4_THRESHOLD 202
#define SQR_TOOM6_THRESHOLD 254
-#define SQR_TOOM8_THRESHOLD 454
+#define SQR_TOOM8_THRESHOLD 418
+
+#define MULMID_TOOM42_THRESHOLD 19
#define MULMOD_BNM1_THRESHOLD 10
-#define SQRMOD_BNM1_THRESHOLD 11
+#define SQRMOD_BNM1_THRESHOLD 9
+
+#define POWM_SEC_TABLE 3,130,140,724,2316
#define MUL_FFT_MODF_THRESHOLD 236 /* k = 5 */
#define MUL_FFT_TABLE3 \
@@ -121,11 +126,11 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MULLO_BASECASE_THRESHOLD 0 /* always */
#define MULLO_DC_THRESHOLD 32
-#define MULLO_MUL_N_THRESHOLD 5397
+#define MULLO_MUL_N_THRESHOLD 6253
-#define DC_DIV_QR_THRESHOLD 28
-#define DC_DIVAPPR_Q_THRESHOLD 67
-#define DC_BDIV_QR_THRESHOLD 27
+#define DC_DIV_QR_THRESHOLD 32
+#define DC_DIVAPPR_Q_THRESHOLD 60
+#define DC_BDIV_QR_THRESHOLD 26
#define DC_BDIV_Q_THRESHOLD 49
#define INV_MULMOD_BNM1_THRESHOLD 22
@@ -133,8 +138,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define INV_APPR_THRESHOLD 101
#define BINV_NEWTON_THRESHOLD 199
-#define REDC_1_TO_REDC_2_THRESHOLD 13
-#define REDC_2_TO_REDC_N_THRESHOLD 44
+#define REDC_1_TO_REDC_2_THRESHOLD 23
+#define REDC_2_TO_REDC_N_THRESHOLD 42
#define MU_DIV_QR_THRESHOLD 979
#define MU_DIVAPPR_Q_THRESHOLD 979
@@ -143,12 +148,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_Q_THRESHOLD 979
#define MATRIX22_STRASSEN_THRESHOLD 17
-#define HGCD_THRESHOLD 101
-#define GCD_DC_THRESHOLD 222
-#define GCDEXT_DC_THRESHOLD 222
+#define HGCD_THRESHOLD 99
+#define HGCD_APPR_THRESHOLD 117
+#define HGCD_REDUCE_THRESHOLD 1679
+#define GCD_DC_THRESHOLD 198
+#define GCDEXT_DC_THRESHOLD 233
#define JACOBI_BASE_METHOD 4
#define GET_STR_DC_THRESHOLD 12
#define GET_STR_PRECOMPUTE_THRESHOLD 26
-#define SET_STR_DC_THRESHOLD 248
+#define SET_STR_DC_THRESHOLD 422
#define SET_STR_PRECOMPUTE_THRESHOLD 1438
diff --git a/mpn/x86_64/popham.asm b/mpn/x86_64/popham.asm
index 9db368106..999452328 100644
--- a/mpn/x86_64/popham.asm
+++ b/mpn/x86_64/popham.asm
@@ -1,6 +1,6 @@
dnl AMD64 mpn_popcount, mpn_hamdist -- population count and hamming distance.
-dnl Copyright 2004, 2005, 2007, 2010 Free Software Foundation, Inc.
+dnl Copyright 2004, 2005, 2007, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -44,6 +44,7 @@ ifdef(`OPERATION_popcount',`
define(`h33333333', `%r11')
define(`h0f0f0f0f', `%rcx')
define(`h01010101', `%rdx')
+ define(`POP', `$1')
define(`HAM', `dnl')
')
ifdef(`OPERATION_hamdist',`
@@ -55,17 +56,22 @@ ifdef(`OPERATION_hamdist',`
define(`h33333333', `%r11')
define(`h0f0f0f0f', `%rcx')
define(`h01010101', `%r14')
+ define(`POP', `dnl')
define(`HAM', `$1')
')
MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(func)
-
+ POP(` DOS64_ENTRY(2) ')
+ HAM(` DOS64_ENTRY(3) ')
push %r12
push %r13
HAM(` push %r14 ')
@@ -155,6 +161,6 @@ L(end):
HAM(` pop %r14 ')
pop %r13
pop %r12
+ DOS64_EXIT()
ret
-
EPILOGUE()
diff --git a/mpn/x86_64/redc_1.asm b/mpn/x86_64/redc_1.asm
index 976cab2bc..53b5641a0 100644
--- a/mpn/x86_64/redc_1.asm
+++ b/mpn/x86_64/redc_1.asm
@@ -1,6 +1,6 @@
dnl AMD64 mpn_redc_1 -- Montgomery reduction with a one-limb modular inverse.
-dnl Copyright 2004, 2008 Free Software Foundation, Inc.
+dnl Copyright 2004, 2008, 2011 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
@@ -34,41 +34,40 @@ C TODO
C * Handle certain sizes, e.g., 1, 2, 3, 4, 8, with single-loop code.
C The code for 1, 2, 3, 4 should perhaps be completely register based.
C * Perhaps align outer loops.
-C * The sub_n at the end leaks side-channel data. How do we fix that?
-C * Write mpn_add_n_sub_n computing R = A + B - C. It should run at 2 c/l.
C * We could software pipeline the IMUL stuff, by putting it before the
C outer loops and before the end of the outer loops. The last outer
C loop iteration would then compute an unneeded product, but it is at
C least not a stray read from up[], since it is at up[n].
-C * Can we combine both the add_n and sub_n into the loops, somehow?
C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`param_mp',`%rdx')
-define(`n', `%rcx')
-define(`invm', `%r8')
+define(`up', `%rdi')
+define(`mp', `%rsi')
+define(`n_param', `%rdx')
+define(`invm', `%rcx')
-define(`mp', `%r13')
+define(`n', `%r13')
define(`i', `%r11')
define(`nneg', `%r12')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(mpn_redc_1)
+ DOS64_ENTRY(4)
push %rbp
push %rbx
push %r12
push %r13
push %r14
- push n
- sub $8, %rsp C maintain ABI required rsp alignment
- lea (param_mp,n,8), mp C mp += n
- lea (up,n,8), up C up += n
+ lea (mp,n_param,8), mp C mp += n
+ lea (up,n_param,8), up C up += n
- mov n, nneg
+ mov n_param, nneg
+ mov n_param, n
neg nneg
mov R32(n), R32(%rax)
@@ -136,9 +135,7 @@ L(n1): mov %r14, 16(up,nneg,8) C up[0]
add $8, up
dec n
jnz L(o1)
-C lea (mp), mp
- lea 16(up), up
- jmp L(common)
+ jmp L(ret)
L(b0): C lea (mp), mp
lea -16(up), up
@@ -190,10 +187,7 @@ L(ed0): add %r10, (up)
add $8, up
dec n
jnz L(o0)
-C lea (mp), mp
- lea 16(up), up
- jmp L(common)
-
+ jmp L(ret)
L(b3): lea -8(mp), mp
lea -24(up), up
@@ -244,9 +238,7 @@ L(ed3): add %r10, 8(up)
add $8, up
dec n
jnz L(o3)
- lea 8(mp), mp
- lea 24(up), up
- jmp L(common)
+ jmp L(ret)
L(b2): lea -16(mp), mp
lea -32(up), up
@@ -299,39 +291,12 @@ L(ed2): add %r10, 16(up)
add $8, up
dec n
jnz L(o2)
- lea 16(mp), mp
- lea 32(up), up
-
-
-L(common):
- lea (mp,nneg,8), mp C restore entry mp
-
-C cy = mpn_add_n (rp, up, up - n, n);
-C rdi rsi rdx rcx
- lea (up,nneg,8), up C up -= n
- lea (up,nneg,8), %rdx C rdx = up - n [up entry value]
- mov rp, nneg C preserve rp over first call
- mov 8(%rsp), %rcx C pass entry n
-C mov rp, %rdi
- CALL( mpn_add_n)
- test R32(%rax), R32(%rax)
- jz L(ret)
-
-C mpn_sub_n (rp, rp, mp, n);
-C rdi rsi rdx rcx
- mov nneg, %rdi
- mov nneg, %rsi
- mov mp, %rdx
- mov 8(%rsp), %rcx C pass entry n
- CALL( mpn_sub_n)
-L(ret):
- add $8, %rsp
- pop n C just increment rsp
- pop %r14
+L(ret): pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/rsh1aors_n.asm b/mpn/x86_64/rsh1aors_n.asm
index c4a336446..1b6a103f1 100644
--- a/mpn/x86_64/rsh1aors_n.asm
+++ b/mpn/x86_64/rsh1aors_n.asm
@@ -1,7 +1,7 @@
dnl AMD64 mpn_rsh1add_n -- rp[] = (up[] + vp[]) >> 1
dnl AMD64 mpn_rsh1sub_n -- rp[] = (up[] - vp[]) >> 1
-dnl Copyright 2003, 2005, 2009 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005, 2009, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -53,11 +53,24 @@ ifdef(`OPERATION_rsh1sub_n', `
MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(func_nc)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
push %rbx
xor R32(%rax), R32(%rax)
@@ -69,6 +82,7 @@ EPILOGUE()
ALIGN(16)
PROLOGUE(func_n)
+ DOS64_ENTRY(4)
push %rbx
xor R32(%rax), R32(%rax)
@@ -169,5 +183,6 @@ L(top): add %rbx, %rbx C rotate carry limb, restore acy
L(end): mov %rbx, (rp)
pop %rbx
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/rshift.asm b/mpn/x86_64/rshift.asm
index 0f822a4a0..57a4ab093 100644
--- a/mpn/x86_64/rshift.asm
+++ b/mpn/x86_64/rshift.asm
@@ -1,6 +1,6 @@
dnl AMD64 mpn_rshift -- mpn right shift.
-dnl Copyright 2003, 2005, 2009 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005, 2009, 2011 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
@@ -36,10 +36,14 @@ define(`up', `%rsi')
define(`n', `%rdx')
define(`cnt', `%rcx')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(mpn_rshift)
+ DOS64_ENTRY(4)
neg R32(%rcx) C put rsh count in cl
mov (up), %rax
shl R8(%rcx), %rax C function return value
@@ -156,5 +160,6 @@ L(end):
L(ast): mov (up), %r10
shr R8(%rcx), %r10
mov %r10, (rp)
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/sqr_basecase.asm b/mpn/x86_64/sqr_basecase.asm
index 311daab8a..71195d7ae 100644
--- a/mpn/x86_64/sqr_basecase.asm
+++ b/mpn/x86_64/sqr_basecase.asm
@@ -75,12 +75,14 @@ define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
ASM_START()
TEXT
ALIGN(16)
-
PROLOGUE(mpn_sqr_basecase)
+ DOS64_ENTRY(3)
add $-40, %rsp
mov %rbx, 32(%rsp)
mov %rbp, 24(%rsp)
@@ -115,6 +117,7 @@ L(1): mov (up), %rax
mov %rdx, 8(rp)
add $32, %rsp
pop %rbx
+ DOS64_EXIT()
ret
L(2): mov (up), %rax
@@ -139,6 +142,7 @@ L(2): mov (up), %rax
mov %r11, 24(rp)
add $32, %rsp
pop %rbx
+ DOS64_EXIT()
ret
L(3): mov (up), %rax
@@ -184,6 +188,7 @@ L(3): mov (up), %rax
adc %rbx, 40(rp)
add $32, %rsp
pop %rbx
+ DOS64_EXIT()
ret
L(4): mov (up), %rax
@@ -256,6 +261,7 @@ L(4): mov (up), %rax
pop %r12
pop %rbp
pop %rbx
+ DOS64_EXIT()
ret
@@ -780,5 +786,6 @@ L(d1): mov %r11, 24(rp,j,8)
pop %r12
pop %rbp
pop %rbx
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/sublsh1_n.asm b/mpn/x86_64/sublsh1_n.asm
index a2f48c007..a0515cf18 100644
--- a/mpn/x86_64/sublsh1_n.asm
+++ b/mpn/x86_64/sublsh1_n.asm
@@ -1,6 +1,6 @@
dnl AMD64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
-dnl Copyright 2003, 2005, 2006, 2007 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005, 2006, 2007, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -41,10 +41,14 @@ define(`up',`%rsi')
define(`vp',`%rdx')
define(`n', `%rcx')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_sublsh1_n)
+ DOS64_ENTRY(4)
push %rbx
push %rbp
@@ -140,5 +144,6 @@ L(end): add R32(%rbp), R32(%rax)
pop %rbp
pop %rbx
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/tabselect.asm b/mpn/x86_64/tabselect.asm
new file mode 100644
index 000000000..a6699a9a4
--- /dev/null
+++ b/mpn/x86_64/tabselect.asm
@@ -0,0 +1,123 @@
+dnl AMD64 mpn_tabselect.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C AMD K8,K9 2.5
+C AMD K10 2.5
+C AMD bobcat 3.5
+C Intel P4 4
+C Intel core2 2.33
+C Intel NHM 2.5
+C Intel SBR 2.2
+C Intel atom 5
+C VIA nano 3.5
+
+C NOTES
+C * This has not been tuned for any specific processor. Its speed should not
+C be too bad, though.
+C * Using SSE2/AVX2 could result in many-fold speedup.
+
+C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp', `%rdi')
+define(`tp', `%rsi')
+define(`n', `%rdx')
+define(`nents', `%rcx')
+define(`which', `%r8')
+
+define(`i', `%rbp')
+define(`maskp', `%r11')
+define(`maskn', `%r12')
+
+C rax  rbx  rcx    rdx  rdi  rsi  rbp  (rsp)  r8     r9  r10  r11  r12  r13  r14  r15
+C           nents  n    rp   tab              which
+
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_tabselect)
+ DOS64_ENTRY(4) C DOS64 only: save rdi/rsi and move the 4 register args; expands to nothing on ELF64
+IFDOS(` mov 56(%rsp), %r8d ') C DOS64 only: 5th arg (which) is read from the stack, low 32 bits
+ push %rbx C second data register of the 2-limb loop
+ push %rbp C holds i
+ push %r12 C holds maskn
+
+ lea (rp,n,8), rp C rp += n, limbs are then addressed with negative i
+ lea (tp,n,8), tp C tp += n, same for the first table entry
+ sub nents, which C bias which by -nents for the test below
+L(outer): C once per table entry, nents counts down
+ lea (which,nents), %rax C zero when the number of entries already processed equals which
+ neg %rax C set CF iff 'which' != k
+ sbb maskn, maskn C maskn = -1 if CF (keep rp limbs), 0 otherwise
+ mov maskn, maskp
+ not maskp C maskp = ~maskn (take tp limbs)
+
+ mov n, i
+ neg i C i = -n
+ test $1, R32(n)
+ je L(top) C even n: go straight to the 2-limb loop
+ mov (tp,i,8), %rax C odd n: merge a single limb first
+ and maskp, %rax
+ mov (rp,i,8), %r9
+ and maskn, %r9
+ or %r9, %rax
+ mov %rax, (rp,i,8) C rp[i] = (tp[i] & maskp) | (rp[i] & maskn)
+ add $1, i
+ jns L(end) C n was 1, nothing left for the loop
+
+ ALIGN(16)
+L(top): mov (tp,i,8), %rax C two limbs per iteration, same merge as above
+ mov 8(tp,i,8), %rbx
+ and maskp, %rax
+ and maskp, %rbx
+ mov (rp,i,8), %r9
+ mov 8(rp,i,8), %r10
+ and maskn, %r9
+ and maskn, %r10
+ or %r9, %rax
+ or %r10, %rbx
+ mov %rax, (rp,i,8)
+ mov %rbx, 8(rp,i,8)
+ add $2, i
+ js L(top) C repeat while i is still negative
+
+L(end): lea (tp,n,8), tp C step tp to the next table entry
+ dec nents
+ jne L(outer) C all nents entries are read, whichever one is selected
+
+L(outer_end): C label is not referenced anywhere in this file
+ pop %r12
+ pop %rbp
+ pop %rbx
+ DOS64_EXIT() C DOS64 only: restore rsi/rdi saved by DOS64_ENTRY
+ ret
+EPILOGUE()
diff --git a/mpn/x86_64/x86_64-defs.m4 b/mpn/x86_64/x86_64-defs.m4
index 6942a7882..79d7b3cf2 100644
--- a/mpn/x86_64/x86_64-defs.m4
+++ b/mpn/x86_64/x86_64-defs.m4
@@ -2,8 +2,8 @@ divert(-1)
dnl m4 macros for amd64 assembler.
-dnl Copyright 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009 Free
-dnl Software Foundation, Inc.
+dnl Copyright 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009, 2011
+dnl Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
@@ -169,4 +169,7 @@ ifdef(`PIC',
define(`JUMPTABSECT', `.section .data.rel.ro.local,"aw",@progbits')
+define(`DOS64_ENTRY',`')
+define(`DOS64_EXIT',`')
+
divert`'dnl
diff --git a/mpz/jacobi.c b/mpz/jacobi.c
index afd9a49b4..8bfb2e92b 100644
--- a/mpz/jacobi.c
+++ b/mpz/jacobi.c
@@ -110,7 +110,7 @@ mpz_jacobi (mpz_srcptr a, mpz_srcptr b)
result_bit1 ^= JACOBI_N1B_BIT1(blow);
asize = -asize;
}
-
+
JACOBI_STRIP_LOW_ZEROS (result_bit1, blow, asrcp, asize, alow);
/* Ensure asize >= bsize. Take advantage of the generalized
@@ -147,7 +147,7 @@ mpz_jacobi (mpz_srcptr a, mpz_srcptr b)
result_bit1 ^= JACOBI_RECIP_UU_BIT1 (alow, blow);
}
-
+
if (bsize == 1)
{
result_bit1 ^= JACOBI_TWOS_U_BIT1(btwos, alow);
@@ -165,7 +165,7 @@ mpz_jacobi (mpz_srcptr a, mpz_srcptr b)
% B, but when A is much larger than B, we have to allocate space
for the large quotient. We use the same area, pointed to by bp,
for both the quotient A/B and the working copy of B. */
-
+
TMP_MARK;
if (asize >= 2*bsize)
@@ -189,7 +189,7 @@ mpz_jacobi (mpz_srcptr a, mpz_srcptr b)
result_bit1 ^= JACOBI_TWOS_U_BIT1(btwos, alow);
ASSERT_NOCARRY (mpn_rshift (bp, bsrcp, bsize, btwos));
- bsize -= (ap[bsize-1] | bp[bsize-1]) == 0;
+ bsize -= (ap[bsize-1] | bp[bsize-1]) == 0;
}
else
MPN_COPY (bp, bsrcp, bsize);
diff --git a/tests/cxx/t-ops2.cc b/tests/cxx/t-ops2.cc
index 9a6e7e020..4967ed208 100644
--- a/tests/cxx/t-ops2.cc
+++ b/tests/cxx/t-ops2.cc
@@ -148,18 +148,18 @@ void checkqf (){
CHECK_SI(T,0,3,*);
CHECK_ALL_COMPARISONS(T,5.,2);
CHECK_ALL_SIGNS_COMPARISONS(T,11.,3);
- CHECK_MPZ(T,5,-2,<);
- CHECK_MPZ(T,5,-2,>);
+ CHECK_MPZ(T,5,-2,<);
+ CHECK_MPZ(T,5,-2,>);
CHECK_MPZ(T,5,-2,<=);
CHECK_MPZ(T,5,-2,>=);
CHECK_MPZ(T,5,-2,==);
CHECK_MPZ(T,5,-2,!=);
- CHECK_MPZ(T,0,0,<);
- CHECK_MPZ(T,0,0,>);
- CHECK_MPZ(T,0,0,<=);
- CHECK_MPZ(T,0,0,>=);
- CHECK_MPZ(T,0,0,==);
- CHECK_MPZ(T,0,0,!=);
+ CHECK_MPZ(T,0,0,<);
+ CHECK_MPZ(T,0,0,>);
+ CHECK_MPZ(T,0,0,<=);
+ CHECK_MPZ(T,0,0,>=);
+ CHECK_MPZ(T,0,0,==);
+ CHECK_MPZ(T,0,0,!=);
ASSERT_ALWAYS(T(6)<<2==6.*4);
ASSERT_ALWAYS(T(6)>>2==6./4);
ASSERT_ALWAYS(T(-13)<<2==-13.*4);
@@ -217,18 +217,18 @@ void checkf (){
CHECK_MPQ(mpf_class,-5.5,-2.25,-);
CHECK_MPQ(mpf_class,-5.5,-2.25,*);
CHECK_MPQ(mpf_class,-5.25,-0.5,/);
- CHECK_MPQ(mpf_class,5,-2,<);
- CHECK_MPQ(mpf_class,5,-2,>);
- CHECK_MPQ(mpf_class,5,-2,<=);
- CHECK_MPQ(mpf_class,5,-2,>=);
- CHECK_MPQ(mpf_class,5,-2,==);
- CHECK_MPQ(mpf_class,5,-2,!=);
- CHECK_MPQ(mpf_class,0,0,<);
- CHECK_MPQ(mpf_class,0,0,>);
- CHECK_MPQ(mpf_class,0,0,<=);
- CHECK_MPQ(mpf_class,0,0,>=);
- CHECK_MPQ(mpf_class,0,0,==);
- CHECK_MPQ(mpf_class,0,0,!=);
+ CHECK_MPQ(mpf_class,5,-2,<);
+ CHECK_MPQ(mpf_class,5,-2,>);
+ CHECK_MPQ(mpf_class,5,-2,<=);
+ CHECK_MPQ(mpf_class,5,-2,>=);
+ CHECK_MPQ(mpf_class,5,-2,==);
+ CHECK_MPQ(mpf_class,5,-2,!=);
+ CHECK_MPQ(mpf_class,0,0,<);
+ CHECK_MPQ(mpf_class,0,0,>);
+ CHECK_MPQ(mpf_class,0,0,<=);
+ CHECK_MPQ(mpf_class,0,0,>=);
+ CHECK_MPQ(mpf_class,0,0,==);
+ CHECK_MPQ(mpf_class,0,0,!=);
}
int
diff --git a/tests/devel/try.c b/tests/devel/try.c
index 5619ec26d..7ccb9de0b 100644
--- a/tests/devel/try.c
+++ b/tests/devel/try.c
@@ -459,7 +459,7 @@ validate_bdiv_q_1
refmpn_mul_1 (tp, dst, size, divisor);
/* Set ignored low bits */
- tp[0] |= (src[0] & LOW_ZEROS_MASK (divisor));
+ tp[0] |= (src[0] & LOW_ZEROS_MASK (divisor));
if (! refmpn_equal_anynail (tp, src, size))
{
printf ("Bdiv wrong: res * divisor != src (mod B^size)\n");
@@ -622,6 +622,8 @@ enum {
TYPE_SUBLSH1_NC, TYPE_SUBLSH2_NC, TYPE_SUBLSH_NC,
TYPE_RSBLSH1_NC, TYPE_RSBLSH2_NC, TYPE_RSBLSH_NC,
+ TYPE_ADDCND_N, TYPE_SUBCND_N,
+
TYPE_MOD_1, TYPE_MOD_1C, TYPE_DIVMOD_1, TYPE_DIVMOD_1C, TYPE_DIVREM_1,
TYPE_DIVREM_1C, TYPE_PREINV_DIVREM_1, TYPE_DIVREM_2, TYPE_PREINV_MOD_1,
TYPE_MOD_34LSUB1, TYPE_UDIV_QRNND, TYPE_UDIV_QRNND_R,
@@ -742,6 +744,16 @@ param_init (void)
COPY (TYPE_ADD_ERR3_N);
REFERENCE (refmpn_sub_err3_n);
+ p = &param[TYPE_ADDCND_N];
+ COPY (TYPE_ADD_N);
+ p->carry = CARRY_BIT;
+ REFERENCE (refmpn_addcnd_n);
+
+ p = &param[TYPE_SUBCND_N];
+ COPY (TYPE_ADD_N);
+ p->carry = CARRY_BIT;
+ REFERENCE (refmpn_subcnd_n);
+
p = &param[TYPE_MUL_1];
p->retval = 1;
@@ -1704,6 +1716,8 @@ const struct choice_t choice_array[] = {
{ TRY(mpn_copyd), TYPE_COPYD },
#endif
+ { TRY(mpn_addcnd_n), TYPE_ADDCND_N },
+ { TRY(mpn_subcnd_n), TYPE_SUBCND_N },
#if HAVE_NATIVE_mpn_addlsh1_n
{ TRY(mpn_addlsh1_n), TYPE_ADDLSH1_N },
#endif
@@ -2395,6 +2409,8 @@ call (struct each_t *e, tryfun_t function)
case TYPE_RSBLSH2_NC:
case TYPE_ADD_NC:
case TYPE_SUB_NC:
+ case TYPE_ADDCND_N:
+ case TYPE_SUBCND_N:
e->retval = CALLING_CONVENTIONS (function)
(e->d[0].p, e->s[0].p, e->s[1].p, size, carry);
break;
diff --git a/tests/mpn/t-hgcd_appr.c b/tests/mpn/t-hgcd_appr.c
index 912a1fde0..486b13061 100644
--- a/tests/mpn/t-hgcd_appr.c
+++ b/tests/mpn/t-hgcd_appr.c
@@ -261,7 +261,7 @@ one_test (mpz_t a, mpz_t b, int i)
"after tp: %Mx\n"
"expected: %Mx\n",
hgcd_tp[hgcd_scratch], marker[3]);
-
+
abort ();
}
@@ -424,7 +424,7 @@ hgcd_appr_valid_p (mpz_t a, mpz_t b, mp_size_t res0,
mp_bitcnt_t dbits, abits, margin;
mpz_t appr_r0, appr_r1, t, q;
struct hgcd_ref appr;
-
+
if (!res0)
{
if (!res1)
@@ -433,7 +433,7 @@ hgcd_appr_valid_p (mpz_t a, mpz_t b, mp_size_t res0,
fprintf (stderr, "mpn_hgcd_appr returned 1 when no reduction possible.\n");
return 0;
}
-
+
/* NOTE: No *_clear calls on error return, since we're going to
abort anyway. */
mpz_init (t);
@@ -441,7 +441,7 @@ hgcd_appr_valid_p (mpz_t a, mpz_t b, mp_size_t res0,
hgcd_ref_init (&appr);
mpz_init (appr_r0);
mpz_init (appr_r1);
-
+
if (mpz_size (ref_r0) <= s)
{
fprintf (stderr, "ref_r0 too small!!!: "); debug_mp (ref_r0, 16);
@@ -460,7 +460,7 @@ hgcd_appr_valid_p (mpz_t a, mpz_t b, mp_size_t res0,
fprintf (stderr, "ref |r0 - r1| too large!!!: "); debug_mp (t, 16);
return 0;
}
-
+
if (!res1)
{
mpz_set (appr_r0, a);
@@ -473,7 +473,7 @@ hgcd_appr_valid_p (mpz_t a, mpz_t b, mp_size_t res0,
for (i = 0; i<2; i++)
{
unsigned j;
-
+
for (j = 0; j<2; j++)
{
mp_size_t mn = hgcd->n;
@@ -567,7 +567,7 @@ hgcd_appr_valid_p (mpz_t a, mpz_t b, mp_size_t res0,
fprintf (stderr, "appr_r1: "); debug_mp (appr_r1, 16);
fprintf (stderr, "ref_r1: "); debug_mp (ref_r1, 16);
-
+
return 0;
}
mpz_clear (t);
diff --git a/tests/mpn/t-mod_1.c b/tests/mpn/t-mod_1.c
index f1966154d..2f86ba277 100644
--- a/tests/mpn/t-mod_1.c
+++ b/tests/mpn/t-mod_1.c
@@ -90,7 +90,7 @@ main (int argc, char **argv)
rands = RANDS;
mpz_init (a);
mpz_init (b);
-
+
for (i = 0; i < 300; i++)
{
mp_size_t asize;
diff --git a/tests/mpn/t-mulmid.c b/tests/mpn/t-mulmid.c
index ab224acea..a946aefe8 100644
--- a/tests/mpn/t-mulmid.c
+++ b/tests/mpn/t-mulmid.c
@@ -52,7 +52,7 @@ main (int argc, char **argv)
bp = TMP_ALLOC_LIMBS (MAX_N);
rp = TMP_ALLOC_LIMBS (MAX_N + 2);
refp = TMP_ALLOC_LIMBS (MAX_N + 2);
-
+
for (test = 0; test < COUNT; test++)
{
mp_size_t an, bn, rn;
diff --git a/tests/mpz/t-jac.c b/tests/mpz/t-jac.c
index 5d8cad177..34cd82e78 100644
--- a/tests/mpz/t-jac.c
+++ b/tests/mpz/t-jac.c
@@ -921,7 +921,7 @@ mpz_nextprime_step (mpz_ptr p, mpz_srcptr n, mpz_srcptr step_in)
mpz_gcd (gcd, p, step);
ASSERT_ALWAYS (mpz_cmp_ui (gcd, 1) == 0);
mpz_clear (gcd);
-
+
pn = SIZ(p);
count_leading_zeros (cnt, PTR(p)[pn - 1]);
nbits = pn * GMP_NUMB_BITS - (cnt - GMP_NAIL_BITS);
@@ -1016,7 +1016,7 @@ check_large_quotients (void)
mpz_set_ui (op1, 0);
mpz_urandomb (bs, rands, 32);
mpz_urandomb (bs, rands, mpz_get_ui (bs) % 10 + 1);
-
+
gcd_size = 1 + mpz_get_ui (bs);
if (gcd_size & 1)
{
diff --git a/tests/refmpn.c b/tests/refmpn.c
index fbcc602d6..b31804ef9 100644
--- a/tests/refmpn.c
+++ b/tests/refmpn.c
@@ -2,7 +2,7 @@
of the normal gmp code. Speed isn't a consideration.
Copyright 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
-2007, 2008, 2009 Free Software Foundation, Inc.
+2007, 2008, 2009, 2011 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@@ -596,6 +596,29 @@ refmpn_sub_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t size)
return refmpn_sub_nc (rp, s1p, s2p, size, CNST_LIMB(0));
}
+/* Reference conditional add.  With CND zero, {s1p,size} is copied to
+   {rp,size} and 0 is returned.  Otherwise the work is handed to
+   refmpn_add_n (rp, s1p, s2p, size) and its return value is passed
+   back to the caller.  */
+mp_limb_t
+refmpn_addcnd_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t size, mp_limb_t cnd)
+{
+  if (cnd == 0)
+    {
+      /* Condition false: plain copy, no carry.  */
+      refmpn_copyi (rp, s1p, size);
+      return 0;
+    }
+
+  return refmpn_add_n (rp, s1p, s2p, size);
+}
+/* Reference conditional subtract.  With CND zero, {s1p,size} is copied
+   to {rp,size} and 0 is returned.  Otherwise the work is handed to
+   refmpn_sub_n (rp, s1p, s2p, size) and its return value is passed
+   back to the caller.  */
+mp_limb_t
+refmpn_subcnd_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t size, mp_limb_t cnd)
+{
+  if (cnd == 0)
+    {
+      /* Condition false: plain copy, no borrow.  */
+      refmpn_copyi (rp, s1p, size);
+      return 0;
+    }
+
+  return refmpn_sub_n (rp, s1p, s2p, size);
+}
+
#define AORS_ERR1_N(operation) \
{ \
@@ -2303,12 +2326,9 @@ refmpn_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm)
for (j = n - 1; j >= 0; j--)
{
- up[0] = mpn_addmul_1 (up, mp, n, (up[0] * invm) & GMP_NUMB_MASK);
+ up[0] = refmpn_addmul_1 (up, mp, n, (up[0] * invm) & GMP_NUMB_MASK);
up++;
}
- cy = mpn_add_n (rp, up, up - n, n);
- if (cy != 0)
- mpn_sub_n (rp, rp, mp, n);
}
size_t
diff --git a/tests/tests.h b/tests/tests.h
index 4086e5c5d..75b546319 100644
--- a/tests/tests.h
+++ b/tests/tests.h
@@ -172,6 +172,11 @@ int refmpf_validate_division __GMP_PROTO ((const char *name, mpf_srcptr got,
mpf_srcptr n, mpf_srcptr d));
+mp_limb_t refmpn_addcnd_n __GMP_PROTO ((mp_ptr wp, mp_srcptr xp, mp_srcptr yp,
+ mp_size_t size, mp_limb_t cnd));
+mp_limb_t refmpn_subcnd_n __GMP_PROTO ((mp_ptr wp, mp_srcptr xp, mp_srcptr yp,
+ mp_size_t size, mp_limb_t cnd));
+
mp_limb_t refmpn_add __GMP_PROTO ((mp_ptr rp,
mp_srcptr s1p, mp_size_t s1size,
mp_srcptr s2p, mp_size_t s2size));
diff --git a/tune/Makefile.am b/tune/Makefile.am
index e54c020d4..646a1f4af 100644
--- a/tune/Makefile.am
+++ b/tune/Makefile.am
@@ -43,7 +43,8 @@ libspeed_la_SOURCES = \
common.c divrem1div.c divrem1inv.c divrem2div.c divrem2inv.c \
freq.c \
gcdext_single.c gcdext_double.c gcdextod.c gcdextos.c \
- hgcd_lehmer.c jacbase1.c jacbase2.c jacbase3.c jacbase4.c \
+ hgcd_lehmer.c hgcd_appr_lehmer.c hgcd_reduce_1.c hgcd_reduce_2.c \
+ jacbase1.c jacbase2.c jacbase3.c jacbase4.c \
mod_1_div.c mod_1_inv.c mod_1_1-1.c mod_1_1-2.c modlinv.c \
noop.c powm_mod.c powm_redc.c pre_divrem_1.c \
set_strb.c set_strs.c set_strp.c time.c
@@ -129,7 +130,9 @@ TUNE_MPN_SRCS = $(TUNE_MPN_SRCS_BASIC) divrem_1.c mod_1.c
TUNE_MPN_SRCS_BASIC = div_qr_2.c bdiv_q.c bdiv_qr.c \
dcpi1_div_qr.c dcpi1_divappr_q.c dcpi1_bdiv_qr.c dcpi1_bdiv_q.c \
invertappr.c invert.c binvert.c divrem_2.c gcd.c gcdext.c \
- get_str.c set_str.c matrix22_mul.c hgcd.c mul_n.c sqr.c \
+ get_str.c set_str.c matrix22_mul.c \
+ hgcd.c hgcd_appr.c hgcd_reduce.c \
+ mul_n.c sqr.c powm_sec.c \
mullo_n.c mul_fft.c mul.c tdiv_qr.c mulmod_bnm1.c sqrmod_bnm1.c \
mulmid.c mulmid_n.c toom42_mulmid.c \
nussbaumer_mul.c toom6h_mul.c toom8h_mul.c toom6_sqr.c toom8_sqr.c \
diff --git a/tune/common.c b/tune/common.c
index dbcc5ce90..88f0099e8 100644
--- a/tune/common.c
+++ b/tune/common.c
@@ -461,6 +461,11 @@ speed_mpn_com (struct speed_params *s)
{
SPEED_ROUTINE_MPN_COPY (mpn_com);
}
+/* Speed measurement entry point for mpn_tabselect; the whole body is
+   supplied by SPEED_ROUTINE_MPN_TABSELECT.  */
+double
+speed_mpn_tabselect (struct speed_params *s)
+{
+ SPEED_ROUTINE_MPN_TABSELECT (mpn_tabselect);
+}
double
@@ -1107,6 +1112,17 @@ speed_mpn_rsh1sub_n (struct speed_params *s)
}
#endif
+/* Speed measurement entry point for mpn_addcnd_n.  The condition argument
+   is fixed at 1 for every timed call.  */
+double
+speed_mpn_addcnd_n (struct speed_params *s)
+{
+ SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_addcnd_n (wp, xp, yp, s->size, 1));
+}
+/* Speed measurement entry point for mpn_subcnd_n.  The condition argument
+   is fixed at 1 for every timed call.  */
+double
+speed_mpn_subcnd_n (struct speed_params *s)
+{
+ SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_subcnd_n (wp, xp, yp, s->size, 1));
+}
+
/* mpn_and_n etc can be macros and so have to be handled with
SPEED_ROUTINE_MPN_BINARY_N_CALL forms */
double
@@ -1518,7 +1534,7 @@ speed_mpn_hgcd (struct speed_params *s)
double
speed_mpn_hgcd_lehmer (struct speed_params *s)
{
- SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_lehmer, MPN_HGCD_LEHMER_ITCH);
+ SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_lehmer, mpn_hgcd_lehmer_itch);
}
double
@@ -1528,6 +1544,28 @@ speed_mpn_hgcd_appr (struct speed_params *s)
}
+/* Speed measurement entry point for mpn_hgcd_appr_lehmer, using
+   mpn_hgcd_appr_lehmer_itch as the scratch-size function handed to
+   SPEED_ROUTINE_MPN_HGCD_CALL.  */
double
+speed_mpn_hgcd_appr_lehmer (struct speed_params *s)
+{
+ SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_appr_lehmer, mpn_hgcd_appr_lehmer_itch);
+}
+
+/* Speed measurement entry point for mpn_hgcd_reduce, paired with
+   mpn_hgcd_reduce_itch for scratch sizing.  */
+double
+speed_mpn_hgcd_reduce (struct speed_params *s)
+{
+ SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce, mpn_hgcd_reduce_itch);
+}
+/* Speed measurement entry point for mpn_hgcd_reduce_1, paired with
+   mpn_hgcd_reduce_1_itch for scratch sizing.  */
+double
+speed_mpn_hgcd_reduce_1 (struct speed_params *s)
+{
+ SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce_1, mpn_hgcd_reduce_1_itch);
+}
+/* Speed measurement entry point for mpn_hgcd_reduce_2, paired with
+   mpn_hgcd_reduce_2_itch for scratch sizing.  */
+double
+speed_mpn_hgcd_reduce_2 (struct speed_params *s)
+{
+ SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce_2, mpn_hgcd_reduce_2_itch);
+}
+
+double
speed_mpn_gcd (struct speed_params *s)
{
SPEED_ROUTINE_MPN_GCD (mpn_gcd);
diff --git a/tune/hgcd_appr_lehmer.c b/tune/hgcd_appr_lehmer.c
new file mode 100644
index 000000000..18123e951
--- /dev/null
+++ b/tune/hgcd_appr_lehmer.c
@@ -0,0 +1,29 @@
+/* mpn/generic/hgcd_appr.c forced to use Lehmer's quadratic algorithm. */
+
+/*
+Copyright 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Override the threshold with MP_SIZE_T_MAX and map the names
+   __gmpn_hgcd_appr and __gmpn_hgcd_appr_itch to mpn_hgcd_appr_lehmer and
+   mpn_hgcd_appr_lehmer_itch.  These must precede the #include below so
+   that they apply to the generic source when it is compiled here.  */
+#undef HGCD_APPR_THRESHOLD
+#define HGCD_APPR_THRESHOLD MP_SIZE_T_MAX
+#define __gmpn_hgcd_appr mpn_hgcd_appr_lehmer
+#define __gmpn_hgcd_appr_itch mpn_hgcd_appr_lehmer_itch
+
+#include "../mpn/generic/hgcd_appr.c"
diff --git a/tune/hgcd_reduce_1.c b/tune/hgcd_reduce_1.c
new file mode 100644
index 000000000..996362414
--- /dev/null
+++ b/tune/hgcd_reduce_1.c
@@ -0,0 +1,30 @@
+/* mpn/generic/hgcd_reduce.c forced to use hgcd. */
+
+/*
+Copyright 2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Override the threshold with MP_SIZE_T_MAX and map the names
+   __gmpn_hgcd_reduce and __gmpn_hgcd_reduce_itch to mpn_hgcd_reduce_1 and
+   mpn_hgcd_reduce_1_itch.  These must precede the #include below so that
+   they apply to the generic source when it is compiled here.  */
+#undef HGCD_REDUCE_THRESHOLD
+#define HGCD_REDUCE_THRESHOLD MP_SIZE_T_MAX
+#define __gmpn_hgcd_reduce mpn_hgcd_reduce_1
+#define __gmpn_hgcd_reduce_itch mpn_hgcd_reduce_1_itch
+
+
+#include "../mpn/generic/hgcd_reduce.c"
diff --git a/tune/hgcd_reduce_2.c b/tune/hgcd_reduce_2.c
new file mode 100644
index 000000000..1eed4ba11
--- /dev/null
+++ b/tune/hgcd_reduce_2.c
@@ -0,0 +1,29 @@
+/* mpn/generic/hgcd_reduce.c forced to use hgcd_appr. */
+
+/*
+Copyright 2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Override the threshold with 0 and map the names __gmpn_hgcd_reduce and
+   __gmpn_hgcd_reduce_itch to mpn_hgcd_reduce_2 and mpn_hgcd_reduce_2_itch.
+   These must precede the #include below so that they apply to the generic
+   source when it is compiled here.  */
+#undef HGCD_REDUCE_THRESHOLD
+#define HGCD_REDUCE_THRESHOLD 0
+#define __gmpn_hgcd_reduce mpn_hgcd_reduce_2
+#define __gmpn_hgcd_reduce_itch mpn_hgcd_reduce_2_itch
+
+#include "../mpn/generic/hgcd_reduce.c"
diff --git a/tune/speed.c b/tune/speed.c
index 0604edded..704d82772 100644
--- a/tune/speed.c
+++ b/tune/speed.c
@@ -278,6 +278,11 @@ const struct routine_t {
{ "mpn_hgcd", speed_mpn_hgcd },
{ "mpn_hgcd_lehmer", speed_mpn_hgcd_lehmer },
{ "mpn_hgcd_appr", speed_mpn_hgcd_appr },
+ { "mpn_hgcd_appr_lehmer", speed_mpn_hgcd_appr_lehmer },
+
+ { "mpn_hgcd_reduce", speed_mpn_hgcd_reduce },
+ { "mpn_hgcd_reduce_1", speed_mpn_hgcd_reduce_1 },
+ { "mpn_hgcd_reduce_2", speed_mpn_hgcd_reduce_2 },
{ "mpn_gcd_1", speed_mpn_gcd_1, FLAG_R_OPTIONAL },
{ "mpn_gcd_1N", speed_mpn_gcd_1N, FLAG_R_OPTIONAL },
@@ -407,6 +412,7 @@ const struct routine_t {
#if HAVE_NATIVE_mpn_copyd
{ "mpn_copyd", speed_mpn_copyd },
#endif
+ { "mpn_tabselect", speed_mpn_tabselect, FLAG_R_OPTIONAL },
#if HAVE_NATIVE_mpn_addlsh1_n
{ "mpn_addlsh1_n", speed_mpn_addlsh1_n, FLAG_R_OPTIONAL },
#endif
@@ -468,6 +474,9 @@ const struct routine_t {
{ "mpn_rsh1sub_n", speed_mpn_rsh1sub_n, FLAG_R_OPTIONAL },
#endif
+ { "mpn_addcnd_n", speed_mpn_addcnd_n, FLAG_R_OPTIONAL },
+ { "mpn_subcnd_n", speed_mpn_subcnd_n, FLAG_R_OPTIONAL },
+
{ "MPN_ZERO", speed_MPN_ZERO },
{ "binvert_limb", speed_binvert_limb, FLAG_NODATA },
diff --git a/tune/speed.h b/tune/speed.h
index c017a8ec2..20daad2dd 100644
--- a/tune/speed.h
+++ b/tune/speed.h
@@ -148,6 +148,7 @@ double speed_mpn_add_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_add_err1_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_add_err2_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_add_err3_n __GMP_PROTO ((struct speed_params *s));
+double speed_mpn_addcnd_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_addlsh_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_addlsh1_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_addlsh2_n __GMP_PROTO ((struct speed_params *s));
@@ -174,6 +175,7 @@ double speed_mpn_copyi __GMP_PROTO ((struct speed_params *s));
double speed_MPN_COPY __GMP_PROTO ((struct speed_params *s));
double speed_MPN_COPY_DECR __GMP_PROTO ((struct speed_params *s));
double speed_MPN_COPY_INCR __GMP_PROTO ((struct speed_params *s));
+double speed_mpn_tabselect __GMP_PROTO ((struct speed_params *s));
double speed_mpn_divexact_1 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_divexact_by3 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_bdiv_q_1 __GMP_PROTO ((struct speed_params *));
@@ -197,6 +199,10 @@ double speed_mpn_matrix22_mul __GMP_PROTO ((struct speed_params *s));
double speed_mpn_hgcd __GMP_PROTO ((struct speed_params *s));
double speed_mpn_hgcd_lehmer __GMP_PROTO ((struct speed_params *s));
double speed_mpn_hgcd_appr __GMP_PROTO ((struct speed_params *s));
+double speed_mpn_hgcd_appr_lehmer __GMP_PROTO ((struct speed_params *s));
+double speed_mpn_hgcd_reduce __GMP_PROTO ((struct speed_params *s));
+double speed_mpn_hgcd_reduce_1 __GMP_PROTO ((struct speed_params *s));
+double speed_mpn_hgcd_reduce_2 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_gcd __GMP_PROTO ((struct speed_params *s));
double speed_mpn_gcd_1 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_gcd_1N __GMP_PROTO ((struct speed_params *s));
@@ -305,6 +311,7 @@ double speed_mpn_sub_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_sub_err1_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_sub_err2_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_sub_err3_n __GMP_PROTO ((struct speed_params *s));
+double speed_mpn_subcnd_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_sublsh_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_sublsh1_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_sublsh2_n __GMP_PROTO ((struct speed_params *s));
@@ -484,7 +491,21 @@ mp_size_t mpn_gcdext_double
__GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
mp_size_t mpn_hgcd_lehmer
__GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr));
-#define MPN_HGCD_LEHMER_ITCH(n) (n)
+mp_size_t mpn_hgcd_lehmer_itch __GMP_PROTO ((mp_size_t));
+
+mp_size_t mpn_hgcd_appr_lehmer
+ __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr));
+mp_size_t mpn_hgcd_appr_lehmer_itch __GMP_PROTO ((mp_size_t));
+
+mp_size_t mpn_hgcd_reduce_1
+ __GMP_PROTO ((struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr));
+mp_size_t mpn_hgcd_reduce_1_itch
+ __GMP_PROTO ((mp_size_t, mp_size_t));
+
+mp_size_t mpn_hgcd_reduce_2
+ __GMP_PROTO ((struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr));
+mp_size_t mpn_hgcd_reduce_2_itch
+ __GMP_PROTO ((mp_size_t, mp_size_t));
mp_limb_t mpn_sb_divrem_mn_div __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t));
mp_limb_t mpn_sb_divrem_mn_inv __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t));
@@ -593,7 +614,7 @@ int speed_routine_count_zeros_setup
#define SPEED_RESTRICT_COND(cond) if (!(cond)) return -1.0;
/* For mpn_copy or similar. */
-#define SPEED_ROUTINE_MPN_COPY(function) \
+#define SPEED_ROUTINE_MPN_COPY_CALL(call) \
{ \
mp_ptr wp; \
unsigned i; \
@@ -612,13 +633,18 @@ int speed_routine_count_zeros_setup
speed_starttime (); \
i = s->reps; \
do \
- function (wp, s->xp, s->size); \
+ call; \
while (--i != 0); \
t = speed_endtime (); \
\
TMP_FREE; \
return t; \
}
+/* Time FUNCTION called as FUNCTION (wp, s->xp, s->size), using the
+   setup and timing loop of SPEED_ROUTINE_MPN_COPY_CALL.  */
+#define SPEED_ROUTINE_MPN_COPY(function) \
+ SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size))
+
+/* Time FUNCTION called as FUNCTION (wp, s->xp, s->size, 1, s->r), using
+   the setup and timing loop of SPEED_ROUTINE_MPN_COPY_CALL.  The fourth
+   argument is the constant 1 and the fifth is taken from s->r.  */
+#define SPEED_ROUTINE_MPN_TABSELECT(function) \
+ SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size, 1, s->r))
#define SPEED_ROUTINE_MPN_COPYC(function) \
{ \
@@ -2193,7 +2219,7 @@ int speed_routine_count_zeros_setup
#define SPEED_ROUTINE_REDC_1(function) \
{ \
unsigned i; \
- mp_ptr cp, mp, tp, ap; \
+ mp_ptr mp, tp, ap; \
mp_limb_t inv; \
double t; \
TMP_DECL; \
@@ -2203,7 +2229,6 @@ int speed_routine_count_zeros_setup
TMP_MARK; \
SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \
SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \
- SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \
SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \
\
MPN_COPY (ap, s->xp, s->size); \
@@ -2218,14 +2243,13 @@ int speed_routine_count_zeros_setup
speed_operand_src (s, ap, 2*s->size+1); \
speed_operand_dst (s, tp, 2*s->size+1); \
speed_operand_src (s, mp, s->size); \
- speed_operand_dst (s, cp, s->size); \
speed_cache_fill (s); \
\
speed_starttime (); \
i = s->reps; \
do { \
MPN_COPY (tp, ap, 2*s->size); \
- function (cp, tp, mp, s->size, inv); \
+ function (tp, mp, s->size, inv); \
} while (--i != 0); \
t = speed_endtime (); \
\
@@ -2706,6 +2730,57 @@ int speed_routine_count_zeros_setup
return t; \
}
+/* Time FUNC, called as FUNC (&hgcd, ap, bp, s->size, p, wp) with
+   p = s->size/2 and ITCHFUNC (s->size, p) limbs of scratch at wp.
+
+   Sizes below 2 are rejected by returning -1.  The low bit of the high
+   limb of both s->xp and s->yp is set before measuring.  Every repetition
+   starts from fresh copies of the operands and a freshly initialised
+   matrix, so that work is part of the reported time.  The value returned
+   by FUNC is not used.  */
+#define SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL(func, itchfunc) \
+  { \
+    mp_size_t hgcd_init_itch, hgcd_step_itch; \
+    mp_ptr ap, bp, wp, tmp1; \
+    struct hgcd_matrix hgcd; \
+    mp_size_t p = s->size/2; \
+    unsigned i; \
+    double t; \
+    TMP_DECL; \
+ \
+    if (s->size < 2) \
+      return -1; \
+ \
+    TMP_MARK; \
+ \
+    SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp); \
+    SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp); \
+ \
+    s->xp[s->size - 1] |= 1; \
+    s->yp[s->size - 1] |= 1; \
+ \
+    hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size); \
+    hgcd_step_itch = itchfunc (s->size, p); \
+ \
+    SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp); \
+    SPEED_TMP_ALLOC_LIMBS (wp, hgcd_step_itch, s->align_wp); \
+ \
+    speed_operand_src (s, s->xp, s->size); \
+    speed_operand_src (s, s->yp, s->size); \
+    speed_operand_dst (s, ap, s->size + 1); \
+    speed_operand_dst (s, bp, s->size + 1); \
+    speed_operand_dst (s, wp, hgcd_step_itch); \
+    speed_operand_dst (s, tmp1, hgcd_init_itch); \
+    speed_cache_fill (s); \
+ \
+    speed_starttime (); \
+    i = s->reps; \
+    do \
+      { \
+        /* FUNC overwrites its operands, so restore them each time.  */ \
+        MPN_COPY (ap, s->xp, s->size); \
+        MPN_COPY (bp, s->yp, s->size); \
+        mpn_hgcd_matrix_init (&hgcd, s->size, tmp1); \
+        func (&hgcd, ap, bp, s->size, p, wp); \
+      } \
+    while (--i != 0); \
+    t = speed_endtime (); \
+    TMP_FREE; \
+    return t; \
+  }
+
/* Run some GCDs of s->size limbs each. The number of different data values
is decreased as s->size**2, since GCD is a quadratic algorithm.
SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT
diff --git a/tune/tune-gcd-p.c b/tune/tune-gcd-p.c
index 3c3815bd2..6d8863178 100644
--- a/tune/tune-gcd-p.c
+++ b/tune/tune-gcd-p.c
@@ -39,7 +39,7 @@ search (double *minp, double (*f)(void *, int), void *ctx, int start, int end)
double y[4];
int best_i;
-
+
x[0] = start;
x[3] = end;
@@ -60,7 +60,7 @@ search (double *minp, double (*f)(void *, int), void *ctx, int start, int end)
#if 0
printf("%d: %f, %d: %f, %d:, %f %d: %f\n",
x[0], y[0], x[1], y[1], x[2], y[2], x[3], y[3]);
-#endif
+#endif
for (best_i = 0, i = 1; i < 4; i++)
if (y[i] < y[best_i])
best_i = i;
diff --git a/tune/tuneup.c b/tune/tuneup.c
index 4f53c979c..bc7e8cc3d 100644
--- a/tune/tuneup.c
+++ b/tune/tuneup.c
@@ -192,9 +192,10 @@ mp_size_t binv_newton_threshold = MP_SIZE_T_MAX;
mp_size_t redc_1_to_redc_2_threshold = MP_SIZE_T_MAX;
mp_size_t redc_1_to_redc_n_threshold = MP_SIZE_T_MAX;
mp_size_t redc_2_to_redc_n_threshold = MP_SIZE_T_MAX;
-mp_size_t powm_threshold = MP_SIZE_T_MAX;
mp_size_t matrix22_strassen_threshold = MP_SIZE_T_MAX;
mp_size_t hgcd_threshold = MP_SIZE_T_MAX;
+mp_size_t hgcd_appr_threshold = MP_SIZE_T_MAX;
+mp_size_t hgcd_reduce_threshold = MP_SIZE_T_MAX;
mp_size_t gcd_accel_threshold = MP_SIZE_T_MAX;
mp_size_t gcd_dc_threshold = MP_SIZE_T_MAX;
mp_size_t gcdext_dc_threshold = MP_SIZE_T_MAX;
@@ -202,7 +203,7 @@ mp_size_t divrem_1_norm_threshold = MP_SIZE_T_MAX;
mp_size_t divrem_1_unnorm_threshold = MP_SIZE_T_MAX;
mp_size_t mod_1_norm_threshold = MP_SIZE_T_MAX;
mp_size_t mod_1_unnorm_threshold = MP_SIZE_T_MAX;
-int mod_1_1p_method = 0;
+int mod_1_1p_method = 0;
mp_size_t mod_1n_to_mod_1_1_threshold = MP_SIZE_T_MAX;
mp_size_t mod_1u_to_mod_1_1_threshold = MP_SIZE_T_MAX;
mp_size_t mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX;
@@ -1567,7 +1568,7 @@ tune_mu_div (void)
param.name = "MU_DIV_QR_THRESHOLD";
param.function = speed_mpn_dcpi1_div_qr;
param.function2 = speed_mpn_mu_div_qr;
- param.min_size = 6;
+ param.min_size = mul_toom22_threshold;
param.max_size = 5000;
param.step_factor = 0.02;
one (&mu_div_qr_threshold, &param);
@@ -1577,7 +1578,7 @@ tune_mu_div (void)
param.name = "MU_DIVAPPR_Q_THRESHOLD";
param.function = speed_mpn_dcpi1_divappr_q;
param.function2 = speed_mpn_mu_divappr_q;
- param.min_size = 6;
+ param.min_size = mul_toom22_threshold;
param.max_size = 5000;
param.step_factor = 0.02;
one (&mu_divappr_q_threshold, &param);
@@ -1626,7 +1627,7 @@ tune_mu_bdiv (void)
param.name = "MU_BDIV_QR_THRESHOLD";
param.function = speed_mpn_dcpi1_bdiv_qr;
param.function2 = speed_mpn_mu_bdiv_qr;
- param.min_size = 4;
+ param.min_size = mul_toom22_threshold;
param.max_size = 5000;
param.step_factor = 0.02;
one (&mu_bdiv_qr_threshold, &param);
@@ -1636,7 +1637,7 @@ tune_mu_bdiv (void)
param.name = "MU_BDIV_Q_THRESHOLD";
param.function = speed_mpn_dcpi1_bdiv_q;
param.function2 = speed_mpn_mu_bdiv_q;
- param.min_size = 4;
+ param.min_size = mul_toom22_threshold;
param.max_size = 5000;
param.step_factor = 0.02;
one (&mu_bdiv_q_threshold, &param);
@@ -1755,6 +1756,30 @@ tune_hgcd (void)
}
+/* Determine HGCD_APPR_THRESHOLD: describe the measurement in a param_t
+   and let one() store the result in hgcd_appr_threshold.  */
void
+tune_hgcd_appr (void)
+{
+  static struct param_t param;
+
+  /* What is being tuned, and the routine that is timed.  */
+  param.function = speed_mpn_hgcd_appr;
+  param.name = "HGCD_APPR_THRESHOLD";
+
+  /* Search limits.  Small sizes seem to give strange results, hence the
+     lower bound of 50.  */
+  param.stop_since_change = 150;
+  param.min_size = 50;
+
+  one (&hgcd_appr_threshold, &param);
+}
+
+/* Determine HGCD_REDUCE_THRESHOLD: describe the measurement in a param_t
+   and let one() store the result in hgcd_reduce_threshold.  */
+void
+tune_hgcd_reduce (void)
+{
+  static struct param_t param;
+
+  /* What is being tuned, and the routine that is timed.  */
+  param.function = speed_mpn_hgcd_reduce;
+  param.name = "HGCD_REDUCE_THRESHOLD";
+
+  /* Size range examined and the step factor used for the search.  */
+  param.step_factor = 0.04;
+  param.max_size = 7000;
+  param.min_size = 30;
+
+  one (&hgcd_reduce_threshold, &param);
+}
+
+void
tune_gcd_dc (void)
{
static struct param_t param;
@@ -1778,6 +1803,134 @@ tune_gcdext_dc (void)
one (&gcdext_dc_threshold, &param);
}
+/* In tune_powm_sec we compute the table used by the win_size function.  The
+   cutoff points are in exponent bits, disregarding other operand sizes.  It is
+   not possible to use the one() framework since it currently uses a
+   granularity of full limbs.
+*/
+
+/* This win_size replaces the variant in the powm code, allowing us to
+   control k in the k-ary algorithms.  The exponent bit count EB is
+   ignored; the value returned is whatever the global winsize currently
+   holds, which tune_powm_sec sets before each measurement.  */
+int winsize;
+int
+win_size (mp_bitcnt_t eb)
+{
+ return winsize;
+}
+
+/* Print a "#define POWM_SEC_TABLE" line listing the exponent bit counts at
+   which mpn_powm_sec was measured to be faster with window size k+1 than
+   with window size k.
+
+   The exponent size nbits starts at 1 and grows by a factor 65/64 (but by
+   at least 1) per step, up to n_max limbs.  For each size an all-ones
+   exponent of exactly nbits bits is built, and mpn_powm_sec is timed with
+   winsize = k and with winsize = k + 1, each time taking the median of
+   n_measurements runs.  A single size favouring k + 1 is only remembered;
+   when the next size favours k + 1 as well, the remembered size is printed
+   and k is incremented.  */
+void
+tune_powm_sec (void)
+{
+  mp_size_t n;
+  int k, i;
+  mp_size_t itch;
+  mp_bitcnt_t nbits, nbits_next, possible_nbits_cutoff;
+  const int n_max = 3000 / GMP_NUMB_BITS;
+  const int n_measurements = 5;
+  mp_ptr rp, bp, ep, mp, tp;
+  double ttab[n_measurements], tk, tkp1;
+  TMP_DECL;
+  TMP_MARK;
+
+  possible_nbits_cutoff = 0;
+
+  k = 1;
+
+  /* NOTE(review): the scratch space is sized with winsize == 10; this
+     presumably assumes k + 1 never exceeds 10 below -- confirm.  */
+  winsize = 10;			/* the itch function needs this */
+  itch = mpn_powm_sec_itch (n_max, n_max, n_max);
+
+  rp = TMP_ALLOC_LIMBS (n_max);
+  bp = TMP_ALLOC_LIMBS (n_max);
+  ep = TMP_ALLOC_LIMBS (n_max);
+  mp = TMP_ALLOC_LIMBS (n_max);
+  tp = TMP_ALLOC_LIMBS (itch);
+
+  mpn_random (bp, n_max);
+  mpn_random (mp, n_max);
+  mp[0] |= 1;			/* force the low limb of mp to be odd */
+
+  /* How about taking the M operand size into account?
+
+     An operation R=powm(B,E,N) will take time O(log(E)*M(log(N))) (assuming
+     B = O(M)).
+
+     Using k-ary and no sliding window, the precomputation will need time
+     O(2^(k-1)*M(log(N))) and the main computation will need O(log(E)*S(N)) +
+     O(log(E)/k*M(N)), for the squarings, multiplications, respectively.
+
+     An operation R=powm_sec(B,E,N) will take time like powm.
+
+     Using k-ary, the precomputation will need time O(2^k*M(log(N))) and the
+     main computation will need O(log(E)*S(N)) + O(log(E)/k*M(N)) +
+     O(log(E)/k*2^k*log(N)), for the squarings, multiplications, and full
+     table reads, respectively.  */
+
+  printf ("#define POWM_SEC_TABLE ");
+
+  for (nbits = 1; nbits <= n_max * GMP_NUMB_BITS; )
+    {
+      n = (nbits - 1) / GMP_NUMB_BITS + 1;
+
+      /* Generate E such that sliding-window for k and k+1 works equally
+	 well/poorly (but sliding is not used in powm_sec, of course).  */
+      for (i = 0; i < n; i++)
+	ep[i] = ~CNST_LIMB(0);
+
+      /* Truncate E to be exactly nbits large.  */
+      if (nbits % GMP_NUMB_BITS != 0)
+	mpn_rshift (ep, ep, n, GMP_NUMB_BITS - nbits % GMP_NUMB_BITS);
+      ep[n - 1] |= CNST_LIMB(1) << (nbits - 1) % GMP_NUMB_BITS;
+
+      /* Median time with the current window size.  */
+      winsize = k;
+      for (i = 0; i < n_measurements; i++)
+	{
+	  speed_starttime ();
+	  mpn_powm_sec (rp, bp, n, ep, n, mp, n, tp);
+	  ttab[i] = speed_endtime ();
+	}
+      tk = median (ttab, n_measurements);
+
+      /* Median time with the next larger window size.  The timer is
+	 started inside the loop for each run, so no start is needed
+	 ahead of it.  */
+      winsize = k + 1;
+      for (i = 0; i < n_measurements; i++)
+	{
+	  speed_starttime ();
+	  mpn_powm_sec (rp, bp, n, ep, n, mp, n, tp);
+	  ttab[i] = speed_endtime ();
+	}
+      tkp1 = median (ttab, n_measurements);
+/*
+      printf ("testing: %lu, %d", (unsigned long) nbits, k);
+      printf ("   %10.5f  %10.5f\n", tk, tkp1);
+*/
+      if (tkp1 < tk)
+	{
+	  if (possible_nbits_cutoff)
+	    {
+	      /* Two consecutive sizes indicate a k increase, obey.  */
+	      if (k > 1)
+		printf (",");
+	      printf ("%ld", (long) possible_nbits_cutoff);
+	      k++;
+	      possible_nbits_cutoff = 0;
+	    }
+	  else
+	    {
+	      /* One measurement indicates a k increase, save nbits for
+		 further consideration.  */
+	      possible_nbits_cutoff = nbits;
+	    }
+	}
+      else
+	possible_nbits_cutoff = 0;
+
+      /* Grow by 1/64, but always advance by at least one bit.  */
+      nbits_next = nbits * 65 / 64;
+      nbits = nbits_next + (nbits_next == nbits);
+    }
+  printf ("\n");
+  TMP_FREE;
+}
+
/* size_extra==1 reflects the fact that with high<divisor one division is
always skipped. Forcing high<divisor while testing ensures consistency
@@ -1873,7 +2026,6 @@ tune_mod_1 (void)
{
static struct param_t param;
double t1, t2;
- int method;
s.size = 10;
s.r = randlimb_half ();
@@ -2552,6 +2704,11 @@ all (void)
tune_sqrmod_bnm1 ();
printf("\n");
+#if 1
+ tune_powm_sec ();
+ printf("\n");
+#endif
+
tune_fft_mul ();
printf("\n");
@@ -2579,6 +2736,8 @@ all (void)
tune_matrix22_mul ();
tune_hgcd ();
+ tune_hgcd_appr ();
+ tune_hgcd_reduce();
tune_gcd_dc ();
tune_gcdext_dc ();
tune_jacobi_base ();