From f2112c9a69c750288e0a4b032bcd0ebb004b92eb Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Thu, 3 Nov 2011 01:02:27 +0100 Subject: Provide gmp-mparam.h for POWER7. --- mpn/powerpc64/mode64/p7/gmp-mparam.h | 155 +++++++++++++++++++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 mpn/powerpc64/mode64/p7/gmp-mparam.h diff --git a/mpn/powerpc64/mode64/p7/gmp-mparam.h b/mpn/powerpc64/mode64/p7/gmp-mparam.h new file mode 100644 index 000000000..57b888637 --- /dev/null +++ b/mpn/powerpc64/mode64/p7/gmp-mparam.h @@ -0,0 +1,155 @@ +/* POWER7 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010, 2011 +Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define BYTES_PER_MP_LIMB 8 + +/* 3550 MHz POWER7 (gcc110.fsffrance.org) */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 7 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 22 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13 +#define USE_PREINV_DIVREM_1 0 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 28 + +#define MUL_TOOM22_THRESHOLD 22 +#define MUL_TOOM33_THRESHOLD 73 +#define MUL_TOOM44_THRESHOLD 202 +#define MUL_TOOM6H_THRESHOLD 298 +#define MUL_TOOM8H_THRESHOLD 406 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 143 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 135 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 141 + +#define SQR_BASECASE_THRESHOLD 10 +#define SQR_TOOM2_THRESHOLD 50 +#define SQR_TOOM3_THRESHOLD 84 +#define SQR_TOOM4_THRESHOLD 160 +#define SQR_TOOM6_THRESHOLD 246 +#define SQR_TOOM8_THRESHOLD 296 + +#define MULMID_TOOM42_THRESHOLD 62 + +#define MULMOD_BNM1_THRESHOLD 15 +#define SQRMOD_BNM1_THRESHOLD 16 + +#define MUL_FFT_MODF_THRESHOLD 436 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 436, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 21, 9}, { 11, 8}, { 29, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \ + { 63, 9}, { 43,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 135,11}, { 79,10}, \ + { 159,11}, { 95,10}, { 191,11}, { 111,12}, \ + { 63,11}, { 
127,10}, { 255,11}, { 143,10}, \ + { 287, 9}, { 575,10}, { 303,11}, { 159,12}, \ + { 95,11}, { 191,10}, { 383,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543, 9}, { 1087,11}, { 287,10}, { 575,11}, \ + { 303,12}, { 159,11}, { 319,10}, { 639,11}, \ + { 335,10}, { 671,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 447,13}, { 8192,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 106 +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 308, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \ + { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \ + { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \ + { 639,11}, { 175,12}, { 95,11}, { 191,10}, \ + { 383, 9}, { 767,11}, { 207,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543,11}, { 287,10}, { 575,11}, { 303,12}, \ + { 159,11}, { 319,10}, { 639, 9}, { 1279,10}, \ + { 671,11}, { 351,10}, { 703,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,10}, { 831,12}, \ + { 223,11}, { 447,10}, { 895,11}, { 479,13}, \ + { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 103 +#define SQR_FFT_THRESHOLD 3264 + 
+#define MULLO_BASECASE_THRESHOLD 4 +#define MULLO_DC_THRESHOLD 34 +#define MULLO_MUL_N_THRESHOLD 9174 + +#define DC_DIV_QR_THRESHOLD 30 +#define DC_DIVAPPR_Q_THRESHOLD 124 +#define DC_BDIV_QR_THRESHOLD 66 +#define DC_BDIV_Q_THRESHOLD 160 + +#define INV_MULMOD_BNM1_THRESHOLD 81 +#define INV_NEWTON_THRESHOLD 165 +#define INV_APPR_THRESHOLD 133 + +#define BINV_NEWTON_THRESHOLD 300 +#define REDC_1_TO_REDC_N_THRESHOLD 76 + +#define MU_DIV_QR_THRESHOLD 1470 +#define MU_DIVAPPR_Q_THRESHOLD 1442 +#define MUPI_DIV_QR_THRESHOLD 58 +#define MU_BDIV_QR_THRESHOLD 1470 +#define MU_BDIV_Q_THRESHOLD 1499 + +#define MATRIX22_STRASSEN_THRESHOLD 15 +#define HGCD_THRESHOLD 121 +#define GCD_DC_THRESHOLD 443 +#define GCDEXT_DC_THRESHOLD 396 +#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 22 +#define SET_STR_DC_THRESHOLD 1517 +#define SET_STR_PRECOMPUTE_THRESHOLD 4040 -- cgit v1.2.1 From 7efbd396826cff03514bdc27356fa34fcd323f58 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Thu, 3 Nov 2011 01:19:16 +0100 Subject: Add POWER7 cycle counts. 
--- mpn/powerpc64/com.asm | 9 ++++++--- mpn/powerpc64/copyd.asm | 9 ++++++--- mpn/powerpc64/copyi.asm | 9 ++++++--- mpn/powerpc64/logops_n.asm | 9 ++++++--- mpn/powerpc64/lshift.asm | 11 ++++++----- mpn/powerpc64/mode64/aors_n.asm | 11 ++++++----- mpn/powerpc64/mode64/aorslshC_n.asm | 11 ++++++----- mpn/powerpc64/mode64/aorsmul_1.asm | 13 +++++++------ mpn/powerpc64/mode64/bdiv_dbm1c.asm | 4 +++- mpn/powerpc64/mode64/dive_1.asm | 11 ++++++----- mpn/powerpc64/mode64/divrem_1.asm | 13 +++++++------ mpn/powerpc64/mode64/divrem_2.asm | 11 ++++++----- mpn/powerpc64/mode64/invert_limb.asm | 11 ++++++----- mpn/powerpc64/mode64/lshiftc.asm | 11 ++++++----- mpn/powerpc64/mode64/mod_1_1.asm | 11 ++++++----- mpn/powerpc64/mode64/mod_1_4.asm | 11 ++++++----- mpn/powerpc64/mode64/mod_34lsub1.asm | 11 ++++++----- mpn/powerpc64/mode64/mode1o.asm | 10 ++++++---- mpn/powerpc64/mode64/mul_1.asm | 11 ++++++----- mpn/powerpc64/mode64/mul_basecase.asm | 10 +++++----- mpn/powerpc64/mode64/p5/gmp-mparam.h | 2 +- mpn/powerpc64/mode64/p6/gmp-mparam.h | 2 +- mpn/powerpc64/mode64/rsh1add_n.asm | 11 ++++++----- mpn/powerpc64/mode64/rsh1sub_n.asm | 11 ++++++----- mpn/powerpc64/mode64/sqr_diag_addlsh1.asm | 11 ++++++----- mpn/powerpc64/rshift.asm | 11 ++++++----- 26 files changed, 144 insertions(+), 111 deletions(-) diff --git a/mpn/powerpc64/com.asm b/mpn/powerpc64/com.asm index 4fb2e65d7..cb89bade2 100644 --- a/mpn/powerpc64/com.asm +++ b/mpn/powerpc64/com.asm @@ -19,9 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630: 1? -C POWER4/PPC970: 1.6 +C cycles/limb +C POWER3/PPC630 1? +C POWER4/PPC970 1.6 +C POWER5 ? +C POWER6 ? +C POWER7 1.45 C TODO C * 8-way unrolling brings timing down to about 1.3 cycles/limb. 
diff --git a/mpn/powerpc64/copyd.asm b/mpn/powerpc64/copyd.asm index 6a46a433c..256e7dc12 100644 --- a/mpn/powerpc64/copyd.asm +++ b/mpn/powerpc64/copyd.asm @@ -19,9 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630: 1 -C POWER4/PPC970: 1 +C cycles/limb +C POWER3/PPC630 1 +C POWER4/PPC970 1 +C POWER5 ? +C POWER6 ? +C POWER7 1.4 C INPUT PARAMETERS C rp r3 diff --git a/mpn/powerpc64/copyi.asm b/mpn/powerpc64/copyi.asm index 5cb7e4856..31d1fc2e7 100644 --- a/mpn/powerpc64/copyi.asm +++ b/mpn/powerpc64/copyi.asm @@ -19,9 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630: 1 -C POWER4/PPC970: 1 +C cycles/limb +C POWER3/PPC630 1 +C POWER4/PPC970 1 +C POWER5 ? +C POWER6 ? +C POWER7 1.4 C INPUT PARAMETERS C rp r3 diff --git a/mpn/powerpc64/logops_n.asm b/mpn/powerpc64/logops_n.asm index 917b59f45..2caa2c7c6 100644 --- a/mpn/powerpc64/logops_n.asm +++ b/mpn/powerpc64/logops_n.asm @@ -20,9 +20,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630: 1.75 -C POWER4/PPC970: 2.10 +C cycles/limb +C POWER3/PPC630 1.75 +C POWER4/PPC970 2.10 +C POWER5 ? +C POWER6 ? +C POWER7 1.75 C n POWER3/PPC630 POWER4/PPC970 C 1 15.00 15.33 diff --git a/mpn/powerpc64/lshift.asm b/mpn/powerpc64/lshift.asm index f97661ae7..eb70c4983 100644 --- a/mpn/powerpc64/lshift.asm +++ b/mpn/powerpc64/lshift.asm @@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630 ? -C POWER4/PPC970 ? -C POWER5 2.25 -C POWER6 9.75 +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? 
+C POWER5 2.25 +C POWER6 9.75 +C POWER7 2.15 C TODO C * Try to reduce the number of needed live registers diff --git a/mpn/powerpc64/mode64/aors_n.asm b/mpn/powerpc64/mode64/aors_n.asm index 980525f67..c6ea35089 100644 --- a/mpn/powerpc64/mode64/aors_n.asm +++ b/mpn/powerpc64/mode64/aors_n.asm @@ -20,11 +20,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630 1.5 -C POWER4/PPC970 2 -C POWER5 2.25 -C POWER6 2.63 +C cycles/limb +C POWER3/PPC630 1.5 +C POWER4/PPC970 2 +C POWER5 2.25 +C POWER6 2.63 +C POWER7 2.25-2.87 C This code is a little bit slower for POWER3/PPC630 than the simple code used C previously, but it is much faster for POWER4/PPC970. The reason for the diff --git a/mpn/powerpc64/mode64/aorslshC_n.asm b/mpn/powerpc64/mode64/aorslshC_n.asm index 4622cd946..3776d3e59 100644 --- a/mpn/powerpc64/mode64/aorslshC_n.asm +++ b/mpn/powerpc64/mode64/aorslshC_n.asm @@ -17,11 +17,12 @@ dnl License for more details. dnl You should have received a copy of the GNU Lesser General Public License dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. -C cycles/limb -C POWER3/PPC630 1.83 (1.5 c/l should be possible) -C POWER4/PPC970 3 (2.0 c/l should be possible) -C POWER5 3 -C POWER6 3.5-47 +C cycles/limb +C POWER3/PPC630 1.83 (1.5 c/l should be possible) +C POWER4/PPC970 3 (2.0 c/l should be possible) +C POWER5 3 +C POWER6 3.5-47 +C POWER7 3 C STATUS C * Try combining upx+up, and vpx+vp. diff --git a/mpn/powerpc64/mode64/aorsmul_1.asm b/mpn/powerpc64/mode64/aorsmul_1.asm index b1a3315b6..658a2d941 100644 --- a/mpn/powerpc64/mode64/aorsmul_1.asm +++ b/mpn/powerpc64/mode64/aorsmul_1.asm @@ -20,12 +20,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
include(`../config.m4') -C mpn_addmul_1 mpn_submul_1 -C cycles/limb cycles/limb -C POWER3/PPC630 6-18 6-18 -C POWER4/PPC970 8 8.3 -C POWER5 8 8.25 -C POWER6 16.25 16.75 +C mpn_addmul_1 mpn_submul_1 +C cycles/limb cycles/limb +C POWER3/PPC630 6-18 6-18 +C POWER4/PPC970 8 8.3 +C POWER5 8 8.25 +C POWER6 16.25 16.75 +C POWER7 3.77 4.9 C TODO C * Try to reduce the number of needed live registers diff --git a/mpn/powerpc64/mode64/bdiv_dbm1c.asm b/mpn/powerpc64/mode64/bdiv_dbm1c.asm index 40f3d4ec7..e88fc4440 100644 --- a/mpn/powerpc64/mode64/bdiv_dbm1c.asm +++ b/mpn/powerpc64/mode64/bdiv_dbm1c.asm @@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb +C cycles/limb C POWER3/PPC630 6-18 C POWER4/PPC970 8.5? C POWER5 8.5 fluctuating as function of n % 3 C POWER6 15 +C POWER7 4.75 C TODO C * Nothing to do... diff --git a/mpn/powerpc64/mode64/dive_1.asm b/mpn/powerpc64/mode64/dive_1.asm index d457d65e9..0f94154bf 100644 --- a/mpn/powerpc64/mode64/dive_1.asm +++ b/mpn/powerpc64/mode64/dive_1.asm @@ -19,12 +19,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C norm unorm +C cycles/limb +C norm unorm C POWER3/PPC630 13-19 -C POWER4/PPC970 16 -C POWER5 16 16 -C POWER6 37 46 +C POWER4/PPC970 16 +C POWER5 16 16 +C POWER6 37 46 +C POWER7 12 12 C TODO C * Check if n=1 code is really an improvement. It probably isn't. diff --git a/mpn/powerpc64/mode64/divrem_1.asm b/mpn/powerpc64/mode64/divrem_1.asm index 9d065b728..c0e7b2a9f 100644 --- a/mpn/powerpc64/mode64/divrem_1.asm +++ b/mpn/powerpc64/mode64/divrem_1.asm @@ -20,12 +20,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
include(`../config.m4') -C cycles/limb -C norm unorm frac -C POWER3/PPC630 16-34 16-34 ~11 -C POWER4/PPC970 29 19 -C POWER5 29 29 ~20 -C POWER6 50 59 ~42 +C cycles/limb +C norm unorm frac +C POWER3/PPC630 16-34 16-34 ~11 +C POWER4/PPC970 29 19 +C POWER5 29 29 ~20 +C POWER6 50 59 ~42 +C POWER7 25 25 ~14 C INPUT PARAMETERS C qp = r3 diff --git a/mpn/powerpc64/mode64/divrem_2.asm b/mpn/powerpc64/mode64/divrem_2.asm index 53ef1c708..18f549357 100644 --- a/mpn/powerpc64/mode64/divrem_2.asm +++ b/mpn/powerpc64/mode64/divrem_2.asm @@ -19,12 +19,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C norm frac +C cycles/limb +C norm frac C POWER3/PPC630 -C POWER4/PPC970 ? ? -C POWER5 37 ? -C POWER6 62 ? +C POWER4/PPC970 ? ? +C POWER5 37 ? +C POWER6 62 ? +C POWER7 30.5 ? C INPUT PARAMETERS C qp = r3 diff --git a/mpn/powerpc64/mode64/invert_limb.asm b/mpn/powerpc64/mode64/invert_limb.asm index aed0a32ab..31b243001 100644 --- a/mpn/powerpc64/mode64/invert_limb.asm +++ b/mpn/powerpc64/mode64/invert_limb.asm @@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb (approximate) -C POWER3/PPC630 80 -C POWER4/PPC970 86 -C POWER5 86 -C POWER6 170 +C cycles/limb (approximate) +C POWER3/PPC630 80 +C POWER4/PPC970 86 +C POWER5 86 +C POWER6 170 +C POWER7 66 ASM_START() PROLOGUE(mpn_invert_limb) diff --git a/mpn/powerpc64/mode64/lshiftc.asm b/mpn/powerpc64/mode64/lshiftc.asm index 647244d1f..bca55638f 100644 --- a/mpn/powerpc64/mode64/lshiftc.asm +++ b/mpn/powerpc64/mode64/lshiftc.asm @@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630 ? -C POWER4/PPC970 ? 
+C POWER5 2.25 +C POWER6 9.5 +C POWER7 2.15 C TODO C * Try to reduce the number of needed live registers diff --git a/mpn/powerpc64/mode64/mod_1_1.asm b/mpn/powerpc64/mode64/mod_1_1.asm index 61e39310a..f24ceb2c8 100644 --- a/mpn/powerpc64/mode64/mod_1_1.asm +++ b/mpn/powerpc64/mode64/mod_1_1.asm @@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630 ? -C POWER4/PPC970 17 -C POWER5 16 -C POWER6 30 +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 17 +C POWER5 16 +C POWER6 30 +C POWER7 10.2 C TODO C * Optimise, in particular the cps function. This was compiler-generated and diff --git a/mpn/powerpc64/mode64/mod_1_4.asm b/mpn/powerpc64/mode64/mod_1_4.asm index e0f26da96..b6163c5e7 100644 --- a/mpn/powerpc64/mode64/mod_1_4.asm +++ b/mpn/powerpc64/mode64/mod_1_4.asm @@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630 ? -C POWER4/PPC970 9 -C POWER5 9 -C POWER6 13 +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 9 +C POWER5 9 +C POWER6 13 +C POWER7 3.5 C TODO C * Optimise, in particular the cps function. This was compiler-generated and diff --git a/mpn/powerpc64/mode64/mod_34lsub1.asm b/mpn/powerpc64/mode64/mod_34lsub1.asm index 62ba17a3c..30b9f98be 100644 --- a/mpn/powerpc64/mode64/mod_34lsub1.asm +++ b/mpn/powerpc64/mode64/mod_34lsub1.asm @@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
include(`../config.m4') -C cycles/limb -C POWER3/PPC630 1.33 -C POWER4/PPC970 1.5 -C POWER5 1.32 -C POWER6 2.35 +C cycles/limb +C POWER3/PPC630 1.33 +C POWER4/PPC970 1.5 +C POWER5 1.32 +C POWER6 2.35 +C POWER7 1 C INPUT PARAMETERS define(`up',`r3') diff --git a/mpn/powerpc64/mode64/mode1o.asm b/mpn/powerpc64/mode64/mode1o.asm index 489ca8551..37e4028d8 100644 --- a/mpn/powerpc64/mode64/mode1o.asm +++ b/mpn/powerpc64/mode64/mode1o.asm @@ -19,10 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630: 13-19 -C POWER4/PPC970: 16 -C POWER5: 16 +C cycles/limb +C POWER3/PPC630 13-19 +C POWER4/PPC970 16 +C POWER5 16 +C POWER6 ? +C POWER7 12 C TODO C * Check if n=1 code is really an improvement. It probably isn't. diff --git a/mpn/powerpc64/mode64/mul_1.asm b/mpn/powerpc64/mode64/mul_1.asm index 12bff2fb6..e911cf551 100644 --- a/mpn/powerpc64/mode64/mul_1.asm +++ b/mpn/powerpc64/mode64/mul_1.asm @@ -21,11 +21,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630 6-18 -C POWER4/PPC970 7.25? not updated for last file revision -C POWER5 7.25 -C POWER6 14 +C cycles/limb +C POWER3/PPC630 6-18 +C POWER4/PPC970 7.25? not updated for last file revision +C POWER5 7.25 +C POWER6 14 +C POWER7 2.9 C TODO C * Try to reduce the number of needed live registers (at least r5 and r10 diff --git a/mpn/powerpc64/mode64/mul_basecase.asm b/mpn/powerpc64/mode64/mul_basecase.asm index fd7ff9aa1..a34f75962 100644 --- a/mpn/powerpc64/mode64/mul_basecase.asm +++ b/mpn/powerpc64/mode64/mul_basecase.asm @@ -20,11 +20,11 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
include(`../config.m4') -C cycles/limb -C POWER3/PPC630 6-18 -C POWER4/PPC970 8 -C POWER5 8 -C POWER6 24 +C cycles/limb +C POWER3/PPC630 6-18 +C POWER4/PPC970 8 +C POWER5 8 +C POWER6 24 C INPUT PARAMETERS define(`rp', `r3') diff --git a/mpn/powerpc64/mode64/p5/gmp-mparam.h b/mpn/powerpc64/mode64/p5/gmp-mparam.h index 827b555c8..d177da94e 100644 --- a/mpn/powerpc64/mode64/p5/gmp-mparam.h +++ b/mpn/powerpc64/mode64/p5/gmp-mparam.h @@ -1,4 +1,4 @@ -/* gmp-mparam.h -- Compiler/machine parameter header file. +/* POWER5 gmp-mparam.h -- Compiler/machine parameter header file. Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010 Free Software Foundation, Inc. diff --git a/mpn/powerpc64/mode64/p6/gmp-mparam.h b/mpn/powerpc64/mode64/p6/gmp-mparam.h index d447b56d9..88cac3e72 100644 --- a/mpn/powerpc64/mode64/p6/gmp-mparam.h +++ b/mpn/powerpc64/mode64/p6/gmp-mparam.h @@ -1,4 +1,4 @@ -/* gmp-mparam.h -- Compiler/machine parameter header file. +/* POWER6 gmp-mparam.h -- Compiler/machine parameter header file. Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010 Free Software Foundation, Inc. diff --git a/mpn/powerpc64/mode64/rsh1add_n.asm b/mpn/powerpc64/mode64/rsh1add_n.asm index 8af3ca774..2a5ef3060 100644 --- a/mpn/powerpc64/mode64/rsh1add_n.asm +++ b/mpn/powerpc64/mode64/rsh1add_n.asm @@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
include(`../config.m4') -C cycles/limb -C POWER3/PPC630 2 (1.5 c/l should be possible) -C POWER4/PPC970 4 (2.0 c/l should be possible) -C POWER5 3.5 (2.0 c/l should be possible) -C POWER6 4.5 +C cycles/limb +C POWER3/PPC630 2 (1.5 c/l should be possible) +C POWER4/PPC970 4 (2.0 c/l should be possible) +C POWER5 3.5 (2.0 c/l should be possible) +C POWER6 4.5 +C POWER7 3.5 define(`rp',`r3') define(`up',`r4') diff --git a/mpn/powerpc64/mode64/rsh1sub_n.asm b/mpn/powerpc64/mode64/rsh1sub_n.asm index 1faa03379..b10eb8ab7 100644 --- a/mpn/powerpc64/mode64/rsh1sub_n.asm +++ b/mpn/powerpc64/mode64/rsh1sub_n.asm @@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630 2 (1.5 c/l should be possible) -C POWER4/PPC970 4 (2.0 c/l should be possible) -C POWER5 3.5 (2.0 c/l should be possible) -C POWER6 4.5 +C cycles/limb +C POWER3/PPC630 2 (1.5 c/l should be possible) +C POWER4/PPC970 4 (2.0 c/l should be possible) +C POWER5 3.5 (2.0 c/l should be possible) +C POWER6 4.5 +C POWER7 3.5 define(`rp',`r3') define(`up',`r4') diff --git a/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm b/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm index 663f04c14..a1903cb6e 100644 --- a/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm +++ b/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm @@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630 10 -C POWER4/PPC970 6 -C POWER5 5.375 -C POWER6 8.5 +C cycles/limb +C POWER3/PPC630 10 +C POWER4/PPC970 6 +C POWER5 5.375 +C POWER6 8.5 +C POWER7 3.4 C NOTES C * This was written for POWER6 and its preferences for adjacent integer diff --git a/mpn/powerpc64/rshift.asm b/mpn/powerpc64/rshift.asm index 6545af769..18406c57e 100644 --- a/mpn/powerpc64/rshift.asm +++ b/mpn/powerpc64/rshift.asm @@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
include(`../config.m4') -C cycles/limb -C POWER3/PPC630 ? -C POWER4/PPC970 ? -C POWER5 2.25 -C POWER6 9.75 +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? +C POWER5 2.25 +C POWER6 9.75 +C POWER7 2.15 C TODO C * Try to reduce the number of needed live registers -- cgit v1.2.1 From bd877338537856ee48e44fc80c92e58bfc68809f Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Thu, 3 Nov 2011 01:20:56 +0100 Subject: *** empty log message *** --- ChangeLog | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ChangeLog b/ChangeLog index 939030555..9bff5fdde 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2011-11-03 Torbjorn Granlund + + * mpn/powerpc64/mode64/p7/gmp-mparam.h: New file. + 2011-11-02 Torbjorn Granlund * mpn/s390_64/invert_limb.asm: Slight optimisation. -- cgit v1.2.1 From 0254462dd44a2a730978ca1ca4d5c749ac51902a Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Thu, 3 Nov 2011 19:39:30 +0100 Subject: Pass -m32 in more cases, using via _maybe mechanism. Inherit default gcc_cflags in more places. --- configure.in | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/configure.in b/configure.in index 21defe968..9c0092427 100644 --- a/configure.in +++ b/configure.in @@ -648,7 +648,7 @@ case $host in # -mpa-risc-2-0 is only an optional flag, in case an old gcc is # used. Assembler support for 2.0 is essential though, for our asm # files. 
- gcc_20n_cflags="-O2" + gcc_20n_cflags="$gcc_cflags" gcc_20n_cflags_optlist="arch" gcc_20n_cflags_arch="-mpa-risc-2-0 -mpa-risc-1-1" gcc_20n_testlist="sizeof-long-4 hppa-level-2.0" @@ -671,7 +671,7 @@ case $host in esac cclist_20w="gcc cc" - gcc_20w_cflags="-O2 -mpa-risc-2-0" + gcc_20w_cflags="$gcc_cflags -mpa-risc-2-0" cc_20w_cflags="+DD64 +O2" cc_20w_testlist="hpc-hppa-2-0" path_20w="pa64" @@ -735,7 +735,7 @@ case $host in cc_32_cflags="" cc_32_cflags_optlist="opt" cc_32_cflags_opt="+O3 +O2 +O1" - gcc_32_cflags="-milp32 -O2" + gcc_32_cflags="$gcc_cflags -milp32" limb_32=longlong SPEED_CYCLECOUNTER_OBJ_32=ia64.lo cyclecounter_size_32=2 @@ -750,7 +750,7 @@ case $host in cc_64_cppflags="+DD64" cc_64_cflags_optlist="opt" cc_64_cflags_opt="+O3 +O2 +O1" - gcc_64_cflags="$gcc_64_cflags -mlp64" + gcc_64_cflags="$gcc_cflags -mlp64" ;; esac ;; @@ -831,13 +831,13 @@ case $host in abilist="n32 64 o32" cclist_n32="gcc cc" - gcc_n32_cflags="-O2 -mabi=n32" + gcc_n32_cflags="$gcc_cflags -mabi=n32" cc_n32_cflags="-O2 -n32" # no -g, it disables all optimizations limb_n32=longlong path_n32="mips64" cclist_64="gcc cc" - gcc_64_cflags="$gcc_64_cflags -mabi=64" + gcc_64_cflags="$gcc_cflags -mabi=64" gcc_64_ldflags="-Wc,-mabi=64" cc_64_cflags="-O2 -64" # no -g, it disables all optimizations cc_64_ldflags="-Wc,-64" @@ -969,7 +969,7 @@ case $host in # Need -Wc to pass object type flags through to the linker. abilist="mode64 $abilist" cclist_mode64="gcc xlc" - gcc_mode64_cflags="-O2 -maix64 -mpowerpc64" + gcc_mode64_cflags="$gcc_cflags -maix64 -mpowerpc64" gcc_mode64_cflags_optlist="cpu" gcc_mode64_ldflags="-Wc,-maix64" xlc_mode64_cflags="-O2 -q64 -qmaxmem=20000" @@ -1014,6 +1014,7 @@ case $host in abilist="mode64 mode32 $abilist" gcc_cflags_opt="-O3 -O2 -O1" # will this become used? 
cclist_mode32="gcc" + gcc_mode32_cflags_maybe="-m32" gcc_mode32_cflags="-mpowerpc64" gcc_mode32_cflags_optlist="subtype cpu opt" gcc_mode32_cflags_subtype="-force_cpusubtype_ALL" @@ -1057,6 +1058,7 @@ case $host in # abilist="mode64 mode32 $abilist" cclist_mode32="gcc" + gcc_mode32_cflags_maybe="-m32" gcc_mode32_cflags="-mpowerpc64" gcc_mode32_cflags_optlist="cpu opt" gcc_mode32_cflags_opt="-O3 -O2 -O1" @@ -1358,7 +1360,7 @@ case $host in # it until we're sure. (Might want -xarch=v9a or -xarch=v9b for the # higher cpu types instead.) # - gcc_64_cflags="$gcc_64_cflags -m64 -mptr64" + gcc_64_cflags="$gcc_cflags -m64 -mptr64" gcc_64_ldflags="-Wc,-m64" gcc_64_cflags_optlist="cpu" @@ -1580,7 +1582,7 @@ case $host in case $host in X86_64_PATTERN) cclist_64="gcc" - gcc_64_cflags="$gcc_64_cflags -m64" + gcc_64_cflags="$gcc_cflags -m64" gcc_64_cflags_optlist="cpu arch" CALLING_CONVENTIONS_OBJS_64='amd64call.lo amd64check$U.lo' SPEED_CYCLECOUNTER_OBJ_64=x86_64.lo @@ -1625,7 +1627,7 @@ case $host in path_64="" # Windows amd64 calling conventions are *different* extra_functions_64="" # Silence many pedantic warnings for w64. FIXME. - gcc_64_cflags="$gcc_64_cflags -std=gnu99" + gcc_64_cflags="$gcc_cflags -std=gnu99" ;; esac ;; -- cgit v1.2.1 From ac8a2270a9a1c3596bc8abc2c3785ef324b85d5d Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Thu, 3 Nov 2011 19:39:45 +0100 Subject: *** empty log message *** --- ChangeLog | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ChangeLog b/ChangeLog index 9bff5fdde..0b26e4664 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,8 @@ 2011-11-03 Torbjorn Granlund + * configure.in: Pass -m32 in more cases, using via _maybe mechanism. + Inherit default gcc_cflags in more places. + * mpn/powerpc64/mode64/p7/gmp-mparam.h: New file. 2011-11-02 Torbjorn Granlund -- cgit v1.2.1 From f64a1e744b5e0c511dd012a46bc5a845e901836e Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Thu, 3 Nov 2011 22:42:23 +0100 Subject: Move file up from mode64. 
--- mpn/powerpc64/lshiftc.asm | 198 +++++++++++++++++++++++++++++++++++++++ mpn/powerpc64/mode64/lshiftc.asm | 195 -------------------------------------- 2 files changed, 198 insertions(+), 195 deletions(-) create mode 100644 mpn/powerpc64/lshiftc.asm delete mode 100644 mpn/powerpc64/mode64/lshiftc.asm diff --git a/mpn/powerpc64/lshiftc.asm b/mpn/powerpc64/lshiftc.asm new file mode 100644 index 000000000..8f470a5f4 --- /dev/null +++ b/mpn/powerpc64/lshiftc.asm @@ -0,0 +1,198 @@ +dnl PowerPC-64 mpn_lshiftc -- rp[] = ~up[] << cnt + +dnl Copyright 2003, 2005, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? 
+C POWER5 2.25 +C POWER6 9.5 +C POWER7 2.15 + +C TODO +C * Try to reduce the number of needed live registers +C * Micro-optimise header code +C * Keep in synch with lshift.asm and rshift.asm + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`cnt', `r6') + +define(`tnc',`r0') +define(`u0',`r30') +define(`u1',`r31') +define(`retval',`r5') + +ASM_START() +PROLOGUE(mpn_lshiftc) + std r31, -8(r1) + std r30, -16(r1) + subfic tnc, cnt, 64 + sldi r7, n, 3 C byte count corresponding to n + add up, up, r7 C up = up + n + add rp, rp, r7 C rp = rp + n + rldicl. r30, n, 0,62 C r30 = n & 3, set cr0 + cmpdi cr6, r30, 2 + addi r31, n, 3 C compute count... + ld r10, -8(up) C load 1st limb for b00...b11 + srd retval, r10, tnc + srdi r31, r31, 2 C ...for ctr + mtctr r31 C copy count into ctr + beq cr0, L(b00) + blt cr6, L(b01) + ld r11, -16(up) C load 2nd limb for b10 and b11 + beq cr6, L(b10) + + ALIGN(16) +L(b11): sld r8, r10, cnt + srd r9, r11, tnc + ld u1, -24(up) + addi up, up, -24 + sld r12, r11, cnt + srd r7, u1, tnc + addi rp, rp, 16 + bdnz L(gt3) + + nor r11, r8, r9 + sld r8, u1, cnt + nor r8, r8, r8 + b L(cj3) + + ALIGN(16) +L(gt3): ld u0, -8(up) + nor r11, r8, r9 + sld r8, u1, cnt + srd r9, u0, tnc + ld u1, -16(up) + nor r10, r12, r7 + b L(L11) + + ALIGN(32) +L(b10): sld r12, r10, cnt + addi rp, rp, 24 + srd r7, r11, tnc + bdnz L(gt2) + + sld r8, r11, cnt + nor r10, r12, r7 + nor r8, r8, r8 + b L(cj2) + +L(gt2): ld u0, -24(up) + sld r8, r11, cnt + srd r9, u0, tnc + ld u1, -32(up) + nor r10, r12, r7 + sld r12, u0, cnt + srd r7, u1, tnc + ld u0, -40(up) + nor r11, r8, r9 + addi up, up, -16 + b L(L10) + + ALIGN(16) +L(b00): ld u1, -16(up) + sld r12, r10, cnt + srd r7, u1, tnc + ld u0, -24(up) + sld r8, u1, cnt + srd r9, u0, tnc + ld u1, -32(up) + nor r10, r12, r7 + sld r12, u0, cnt + srd r7, u1, tnc + addi rp, rp, 8 + bdz L(cj4) + +L(gt4): addi up, up, -32 + ld u0, -8(up) + nor r11, r8, r9 + b L(L00) + + ALIGN(16) +L(b01): bdnz L(gt1) + 
sld r8, r10, cnt + nor r8, r8, r8 + std r8, -8(rp) + b L(ret) + +L(gt1): ld u0, -16(up) + sld r8, r10, cnt + srd r9, u0, tnc + ld u1, -24(up) + sld r12, u0, cnt + srd r7, u1, tnc + ld u0, -32(up) + nor r11, r8, r9 + sld r8, u1, cnt + srd r9, u0, tnc + ld u1, -40(up) + addi up, up, -40 + nor r10, r12, r7 + bdz L(end) + + ALIGN(32) +L(top): sld r12, u0, cnt + srd r7, u1, tnc + ld u0, -8(up) + std r11, -8(rp) + nor r11, r8, r9 +L(L00): sld r8, u1, cnt + srd r9, u0, tnc + ld u1, -16(up) + std r10, -16(rp) + nor r10, r12, r7 +L(L11): sld r12, u0, cnt + srd r7, u1, tnc + ld u0, -24(up) + std r11, -24(rp) + nor r11, r8, r9 +L(L10): sld r8, u1, cnt + srd r9, u0, tnc + ld u1, -32(up) + addi up, up, -32 + std r10, -32(rp) + addi rp, rp, -32 + nor r10, r12, r7 + bdnz L(top) + + ALIGN(32) +L(end): sld r12, u0, cnt + srd r7, u1, tnc + std r11, -8(rp) +L(cj4): nor r11, r8, r9 + sld r8, u1, cnt + std r10, -16(rp) + nor r8, r8, r8 +L(cj3): nor r10, r12, r7 + std r11, -24(rp) +L(cj2): std r10, -32(rp) + std r8, -40(rp) + +L(ret): ld r31, -8(r1) + ld r30, -16(r1) +ifdef(`HAVE_ABI_mode32', +` srdi r3, retval, 32 + mr r4, retval +',` mr r3, retval') + blr +EPILOGUE() diff --git a/mpn/powerpc64/mode64/lshiftc.asm b/mpn/powerpc64/mode64/lshiftc.asm deleted file mode 100644 index bca55638f..000000000 --- a/mpn/powerpc64/mode64/lshiftc.asm +++ /dev/null @@ -1,195 +0,0 @@ -dnl PowerPC-64 mpn_lshiftc -- rp[] = ~up[] << cnt - -dnl Copyright 2003, 2005, 2010 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. - -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published -dnl by the Free Software Foundation; either version 3 of the License, or (at -dnl your option) any later version. 
- -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. - -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb -C POWER3/PPC630 ? -C POWER4/PPC970 ? -C POWER5 2.25 -C POWER6 9.5 -C POWER7 2.15 - -C TODO -C * Try to reduce the number of needed live registers -C * Micro-optimise header code -C * Keep in synch with lshift.asm and rshift.asm - -C INPUT PARAMETERS -define(`rp', `r3') -define(`up', `r4') -define(`n', `r5') -define(`cnt', `r6') - -define(`tnc',`r0') -define(`u0',`r30') -define(`u1',`r31') -define(`retval',`r5') - -ASM_START() -PROLOGUE(mpn_lshiftc) - std r31, -8(r1) - std r30, -16(r1) - subfic tnc, cnt, 64 - sldi r7, n, 3 C byte count corresponding to n - add up, up, r7 C up = up + n - add rp, rp, r7 C rp = rp + n - rldicl. r30, n, 0,62 C r30 = n & 3, set cr0 - cmpdi cr6, r30, 2 - addi r31, n, 3 C compute count... 
- ld r10, -8(up) C load 1st limb for b00...b11 - srd retval, r10, tnc - srdi r31, r31, 2 C ...for ctr - mtctr r31 C copy count into ctr - beq cr0, L(b00) - blt cr6, L(b01) - ld r11, -16(up) C load 2nd limb for b10 and b11 - beq cr6, L(b10) - - ALIGN(16) -L(b11): sld r8, r10, cnt - srd r9, r11, tnc - ld u1, -24(up) - addi up, up, -24 - sld r12, r11, cnt - srd r7, u1, tnc - addi rp, rp, 16 - bdnz L(gt3) - - nor r11, r8, r9 - sld r8, u1, cnt - nor r8, r8, r8 - b L(cj3) - - ALIGN(16) -L(gt3): ld u0, -8(up) - nor r11, r8, r9 - sld r8, u1, cnt - srd r9, u0, tnc - ld u1, -16(up) - nor r10, r12, r7 - b L(L11) - - ALIGN(32) -L(b10): sld r12, r10, cnt - addi rp, rp, 24 - srd r7, r11, tnc - bdnz L(gt2) - - sld r8, r11, cnt - nor r10, r12, r7 - nor r8, r8, r8 - b L(cj2) - -L(gt2): ld u0, -24(up) - sld r8, r11, cnt - srd r9, u0, tnc - ld u1, -32(up) - nor r10, r12, r7 - sld r12, u0, cnt - srd r7, u1, tnc - ld u0, -40(up) - nor r11, r8, r9 - addi up, up, -16 - b L(L10) - - ALIGN(16) -L(b00): ld u1, -16(up) - sld r12, r10, cnt - srd r7, u1, tnc - ld u0, -24(up) - sld r8, u1, cnt - srd r9, u0, tnc - ld u1, -32(up) - nor r10, r12, r7 - sld r12, u0, cnt - srd r7, u1, tnc - addi rp, rp, 8 - bdz L(cj4) - -L(gt4): addi up, up, -32 - ld u0, -8(up) - nor r11, r8, r9 - b L(L00) - - ALIGN(16) -L(b01): bdnz L(gt1) - sld r8, r10, cnt - nor r8, r8, r8 - std r8, -8(rp) - b L(ret) - -L(gt1): ld u0, -16(up) - sld r8, r10, cnt - srd r9, u0, tnc - ld u1, -24(up) - sld r12, u0, cnt - srd r7, u1, tnc - ld u0, -32(up) - nor r11, r8, r9 - sld r8, u1, cnt - srd r9, u0, tnc - ld u1, -40(up) - addi up, up, -40 - nor r10, r12, r7 - bdz L(end) - - ALIGN(32) -L(top): sld r12, u0, cnt - srd r7, u1, tnc - ld u0, -8(up) - std r11, -8(rp) - nor r11, r8, r9 -L(L00): sld r8, u1, cnt - srd r9, u0, tnc - ld u1, -16(up) - std r10, -16(rp) - nor r10, r12, r7 -L(L11): sld r12, u0, cnt - srd r7, u1, tnc - ld u0, -24(up) - std r11, -24(rp) - nor r11, r8, r9 -L(L10): sld r8, u1, cnt - srd r9, u0, tnc - ld u1, -32(up) - 
addi up, up, -32 - std r10, -32(rp) - addi rp, rp, -32 - nor r10, r12, r7 - bdnz L(top) - - ALIGN(32) -L(end): sld r12, u0, cnt - srd r7, u1, tnc - std r11, -8(rp) -L(cj4): nor r11, r8, r9 - sld r8, u1, cnt - std r10, -16(rp) - nor r8, r8, r8 -L(cj3): nor r10, r12, r7 - std r11, -24(rp) -L(cj2): std r10, -32(rp) - std r8, -40(rp) - -L(ret): ld r31, -8(r1) - ld r30, -16(r1) - mr r3, retval - blr -EPILOGUE() -- cgit v1.2.1 From c7aa2d66f1403def399929e97347f16b4386550a Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Thu, 3 Nov 2011 22:44:45 +0100 Subject: (mpz_sub): Abort for non-handled case. --- dumbmp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dumbmp.c b/dumbmp.c index 293580228..3292d6eec 100644 --- a/dumbmp.c +++ b/dumbmp.c @@ -421,6 +421,8 @@ mpz_sub (mpz_t r, mpz_t a, mpz_t b) mp_limb_t *tp; int tn; tn = an; an = bn; bn = tn; tp = ap; ap = bp; bp = tp; + /* This needs sign change, not done so abort. */ + abort (); } cy = 0; -- cgit v1.2.1 From 9e346a7777b9c5576e5a4758a1701ed114a9e977 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Fri, 4 Nov 2011 00:15:09 +0100 Subject: *** empty log message *** --- ChangeLog | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ChangeLog b/ChangeLog index 0b26e4664..803bcb543 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,10 @@ 2011-11-03 Torbjorn Granlund + * dumbmp.c (mpz_sub): Abort for non-handled case. + + * mpn/powerpc64/mode64/lshiftc.asm: Move file from here... + * mpn/powerpc64/lshiftc.asm: ...to here, with trivial modifications. + * configure.in: Pass -m32 in more cases, using via _maybe mechanism. Inherit default gcc_cflags in more places. -- cgit v1.2.1 From 98ec919fbc24e85c20818b472131687ba42ae6ab Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Mon, 7 Nov 2011 18:42:27 +0100 Subject: Change how mpn_redc_1 works, use more broadly. 
--- configure.in | 2 +- gmp-impl.h | 5 +--- mpn/generic/powm.c | 44 +++++++++++++++++++------------ mpn/generic/powm_sec.c | 18 +++++++++---- mpn/generic/redc_1.c | 5 +--- mpn/generic/redc_1_sec.c | 45 -------------------------------- mpn/x86_64/redc_1.asm | 68 ++++++++++-------------------------------------- tests/refmpn.c | 7 ++--- tune/speed.h | 6 ++--- 9 files changed, 61 insertions(+), 139 deletions(-) delete mode 100644 mpn/generic/redc_1_sec.c diff --git a/configure.in b/configure.in index 9c0092427..79367c210 100644 --- a/configure.in +++ b/configure.in @@ -2638,7 +2638,7 @@ gmp_mpn_functions="$extra_functions \ mu_bdiv_q mu_bdiv_qr \ bdiv_q bdiv_qr \ divexact bdiv_dbm1c redc_1 redc_2 redc_n powm powlo powm_sec \ - redc_1_sec trialdiv remove \ + trialdiv remove \ and_n andn_n nand_n ior_n iorn_n nior_n xor_n xnor_n \ copyi copyd zero \ $gmp_mpn_functions_optional" diff --git a/gmp-impl.h b/gmp-impl.h index e918c31ed..c0ed63791 100644 --- a/gmp-impl.h +++ b/gmp-impl.h @@ -1063,7 +1063,7 @@ __GMP_DECLSPEC void mpn_mulmid __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_sr __GMP_DECLSPEC mp_limb_t mpn_submul_1c __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t)); #define mpn_redc_1 __MPN(redc_1) -__GMP_DECLSPEC void mpn_redc_1 __GMP_PROTO ((mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_limb_t)); +__GMP_DECLSPEC void mpn_redc_1 __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t)); #define mpn_redc_2 __MPN(redc_2) __GMP_DECLSPEC void mpn_redc_2 __GMP_PROTO ((mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr)); @@ -1471,9 +1471,6 @@ __GMP_DECLSPEC void mpn_powm_sec __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t __GMP_DECLSPEC mp_size_t mpn_powm_sec_itch __GMP_PROTO ((mp_size_t, mp_size_t, mp_size_t)); #define mpn_tabselect __MPN(tabselect) __GMP_DECLSPEC void mpn_tabselect __GMP_PROTO ((volatile mp_limb_t *, volatile mp_limb_t *, mp_size_t, mp_size_t, mp_size_t)); -#define mpn_redc_1_sec __MPN(redc_1_sec) -__GMP_DECLSPEC void mpn_redc_1_sec __GMP_PROTO 
((mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_limb_t)); - #define mpn_addcnd_n __MPN(addcnd_n) __GMP_DECLSPEC mp_limb_t mpn_addcnd_n __GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_limb_t)); #define mpn_subcnd_n __MPN(subcnd_n) diff --git a/mpn/generic/powm.c b/mpn/generic/powm.c index 57edfd4f6..fa92362ad 100644 --- a/mpn/generic/powm.c +++ b/mpn/generic/powm.c @@ -6,7 +6,7 @@ SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. -Copyright 2007, 2008, 2009, 2010 Free Software Foundation, Inc. +Copyright 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -74,6 +74,16 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #include "gmp-impl.h" #include "longlong.h" +#undef MPN_REDC_1 +#define MPN_REDC_1(rp, up, mp, n, invm) \ + do { \ + mp_limb_t cy; \ + mpn_redc_1 (up, mp, n, invm); \ + cy = mpn_add_n (rp, up + n, up, n); \ + if (cy != 0) \ + mpn_sub_n (rp, rp, mp, n); \ + } while (0) + #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2 #define WANT_REDC_2 1 #endif @@ -212,12 +222,12 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn, mpn_sqr (tp, this_pp, n); #if WANT_REDC_2 if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) - mpn_redc_1 (rp, tp, mp, n, mip[0]); + MPN_REDC_1 (rp, tp, mp, n, mip[0]); else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD)) mpn_redc_2 (rp, tp, mp, n, mip); #else if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD)) - mpn_redc_1 (rp, tp, mp, n, mip[0]); + MPN_REDC_1 (rp, tp, mp, n, mip[0]); #endif else mpn_redc_n (rp, tp, mp, n, mip); @@ -229,12 +239,12 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn, this_pp += n; #if WANT_REDC_2 if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) - mpn_redc_1 (this_pp, tp, mp, n, mip[0]); + MPN_REDC_1 (this_pp, tp, mp, n, mip[0]); else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD)) mpn_redc_2 (this_pp, tp, 
mp, n, mip); #else if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD)) - mpn_redc_1 (this_pp, tp, mp, n, mip[0]); + MPN_REDC_1 (this_pp, tp, mp, n, mip[0]); #endif else mpn_redc_n (this_pp, tp, mp, n, mip); @@ -309,7 +319,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn, #undef MPN_REDUCE #define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) #define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) -#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0]) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) INNERLOOP; } else @@ -319,7 +329,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn, #undef MPN_REDUCE #define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) #define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) -#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0]) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) INNERLOOP; } } @@ -380,7 +390,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn, #undef MPN_REDUCE #define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) #define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) -#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0]) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) INNERLOOP; } else @@ -390,7 +400,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn, #undef MPN_REDUCE #define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) #define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) -#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0]) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) INNERLOOP; } } @@ -401,7 +411,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn, #undef MPN_REDUCE #define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n) #define MPN_SQR(r,a,n) mpn_sqr (r,a,n) -#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0]) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) INNERLOOP; } else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD)) 
@@ -440,7 +450,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn, #undef MPN_REDUCE #define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) #define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) -#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0]) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) INNERLOOP; } else @@ -450,7 +460,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn, #undef MPN_REDUCE #define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) #define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) -#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0]) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) INNERLOOP; } } @@ -501,7 +511,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn, #undef MPN_REDUCE #define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) #define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) -#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0]) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) INNERLOOP; } else @@ -511,7 +521,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn, #undef MPN_REDUCE #define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) #define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) -#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0]) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) INNERLOOP; } } @@ -522,7 +532,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn, #undef MPN_REDUCE #define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n) #define MPN_SQR(r,a,n) mpn_sqr (r,a,n) -#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0]) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) INNERLOOP; } else @@ -545,12 +555,12 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn, #if WANT_REDC_2 if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) - mpn_redc_1 (rp, tp, mp, n, mip[0]); + MPN_REDC_1 (rp, tp, mp, n, mip[0]); else if (BELOW_THRESHOLD (n, 
REDC_2_TO_REDC_N_THRESHOLD)) mpn_redc_2 (rp, tp, mp, n, mip); #else if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD)) - mpn_redc_1 (rp, tp, mp, n, mip[0]); + MPN_REDC_1 (rp, tp, mp, n, mip[0]); #endif else mpn_redc_n (rp, tp, mp, n, mip); diff --git a/mpn/generic/powm_sec.c b/mpn/generic/powm_sec.c index 315ae6e5e..3a6f55403 100644 --- a/mpn/generic/powm_sec.c +++ b/mpn/generic/powm_sec.c @@ -7,7 +7,7 @@ SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. -Copyright 2007, 2008, 2009 Free Software Foundation, Inc. +Copyright 2007, 2008, 2009, 2011 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -56,6 +56,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define WANT_CACHE_SECURITY 1 +#undef MPN_REDC_1_SEC +#define MPN_REDC_1_SEC(rp, up, mp, n, invm) \ + do { \ + mp_limb_t cy; \ + mpn_redc_1 (up, mp, n, invm); \ + cy = mpn_add_n (rp, up + n, up, n); \ + mpn_subcnd_n (rp, rp, mp, n, cy); \ + } while (0) /* Define our own mpn squaring function. 
We do this since we cannot use a native mpn_sqr_basecase over TUNE_SQR_TOOM2_MAX, or a non-native one over @@ -252,7 +260,7 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn, { mpn_mul_basecase (tp, this_pp, n, pp + n, n); this_pp += n; - mpn_redc_1_sec (this_pp, tp, mp, n, minv); + MPN_REDC_1_SEC (this_pp, tp, mp, n, minv); } expbits = getbits (ep, ebi, windowsize); @@ -278,7 +286,7 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn, do { mpn_local_sqr (tp, rp, n, tp + 2 * n); - mpn_redc_1_sec (rp, tp, mp, n, minv); + MPN_REDC_1_SEC (rp, tp, mp, n, minv); this_windowsize--; } while (this_windowsize != 0); @@ -289,12 +297,12 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn, #else mpn_mul_basecase (tp, rp, n, pp + n * expbits, n); #endif - mpn_redc_1_sec (rp, tp, mp, n, minv); + MPN_REDC_1_SEC (rp, tp, mp, n, minv); } MPN_COPY (tp, rp, n); MPN_ZERO (tp + n, n); - mpn_redc_1_sec (rp, tp, mp, n, minv); + MPN_REDC_1_SEC (rp, tp, mp, n, minv); cnd = mpn_sub_n (tp, rp, mp, n); /* we need just retval */ mpn_subcnd_n (rp, rp, mp, n, !cnd); TMP_FREE; diff --git a/mpn/generic/redc_1.c b/mpn/generic/redc_1.c index 177f3932f..3567414eb 100644 --- a/mpn/generic/redc_1.c +++ b/mpn/generic/redc_1.c @@ -25,7 +25,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #include "gmp-impl.h" void -mpn_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm) +mpn_redc_1 (mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm) { mp_size_t j; mp_limb_t cy; @@ -40,7 +40,4 @@ mpn_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm) up[0] = cy; up++; } - cy = mpn_add_n (rp, up, up - n, n); - if (cy != 0) - mpn_sub_n (rp, rp, mp, n); } diff --git a/mpn/generic/redc_1_sec.c b/mpn/generic/redc_1_sec.c deleted file mode 100644 index 3d914381c..000000000 --- a/mpn/generic/redc_1_sec.c +++ /dev/null @@ -1,45 +0,0 @@ -/* mpn_redc_1_sec. Set cp[] <- up[]/R^n mod mp[]. Clobber up[]. - mp[] is n limbs; up[] is 2n limbs. 
- - THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY - SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. - -Copyright (C) 2000, 2001, 2002, 2004, 2008, 2009 Free Software Foundation, Inc. - -This file is part of the GNU MP Library. - -The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Lesser General Public License as published by -the Free Software Foundation; either version 3 of the License, or (at your -option) any later version. - -The GNU MP Library is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -License for more details. - -You should have received a copy of the GNU Lesser General Public License -along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ - -#include "gmp.h" -#include "gmp-impl.h" - -void -mpn_redc_1_sec (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm) -{ - mp_size_t j; - mp_limb_t cy; - - ASSERT (n > 0); - ASSERT_MPN (up, 2*n); - - for (j = n - 1; j >= 0; j--) - { - cy = mpn_addmul_1 (up, mp, n, (up[0] * invm) & GMP_NUMB_MASK); - ASSERT (up[0] == 0); - up[0] = cy; - up++; - } - cy = mpn_add_n (rp, up, up - n, n); - mpn_subcnd_n (rp, rp, mp, n, cy); -} diff --git a/mpn/x86_64/redc_1.asm b/mpn/x86_64/redc_1.asm index 976cab2bc..8d731c68c 100644 --- a/mpn/x86_64/redc_1.asm +++ b/mpn/x86_64/redc_1.asm @@ -1,6 +1,6 @@ dnl AMD64 mpn_redc_1 -- Montgomery reduction with a one-limb modular inverse. -dnl Copyright 2004, 2008 Free Software Foundation, Inc. +dnl Copyright 2004, 2008, 2011 Free Software Foundation, Inc. dnl dnl This file is part of the GNU MP Library. dnl @@ -34,22 +34,18 @@ C TODO C * Handle certain sizes, e.g., 1, 2, 3, 4, 8, with single-loop code. C The code for 1, 2, 3, 4 should perhaps be completely register based. C * Perhaps align outer loops. 
-C * The sub_n at the end leaks side-channel data. How do we fix that? -C * Write mpn_add_n_sub_n computing R = A + B - C. It should run at 2 c/l. C * We could software pipeline the IMUL stuff, by putting it before the C outer loops and before the end of the outer loops. The last outer C loop iteration would then compute an unneeded product, but it is at C least not a stray read from up[], since it is at up[n]. -C * Can we combine both the add_n and sub_n into the loops, somehow? C INPUT PARAMETERS -define(`rp', `%rdi') -define(`up', `%rsi') -define(`param_mp',`%rdx') -define(`n', `%rcx') -define(`invm', `%r8') +define(`up', `%rdi') +define(`mp', `%rsi') +define(`n_param', `%rdx') +define(`invm', `%rcx') -define(`mp', `%r13') +define(`n', `%r13') define(`i', `%r11') define(`nneg', `%r12') @@ -62,13 +58,12 @@ PROLOGUE(mpn_redc_1) push %r12 push %r13 push %r14 - push n - sub $8, %rsp C maintain ABI required rsp alignment - lea (param_mp,n,8), mp C mp += n - lea (up,n,8), up C up += n + lea (mp,n_param,8), mp C mp += n + lea (up,n_param,8), up C up += n - mov n, nneg + mov n_param, nneg + mov n_param, n neg nneg mov R32(n), R32(%rax) @@ -136,9 +131,7 @@ L(n1): mov %r14, 16(up,nneg,8) C up[0] add $8, up dec n jnz L(o1) -C lea (mp), mp - lea 16(up), up - jmp L(common) + jmp L(ret) L(b0): C lea (mp), mp lea -16(up), up @@ -190,10 +183,7 @@ L(ed0): add %r10, (up) add $8, up dec n jnz L(o0) -C lea (mp), mp - lea 16(up), up - jmp L(common) - + jmp L(ret) L(b3): lea -8(mp), mp lea -24(up), up @@ -244,9 +234,7 @@ L(ed3): add %r10, 8(up) add $8, up dec n jnz L(o3) - lea 8(mp), mp - lea 24(up), up - jmp L(common) + jmp L(ret) L(b2): lea -16(mp), mp lea -32(up), up @@ -299,36 +287,8 @@ L(ed2): add %r10, 16(up) add $8, up dec n jnz L(o2) - lea 16(mp), mp - lea 32(up), up - - -L(common): - lea (mp,nneg,8), mp C restore entry mp - -C cy = mpn_add_n (rp, up, up - n, n); -C rdi rsi rdx rcx - lea (up,nneg,8), up C up -= n - lea (up,nneg,8), %rdx C rdx = up - n [up entry value] - mov 
rp, nneg C preserve rp over first call - mov 8(%rsp), %rcx C pass entry n -C mov rp, %rdi - CALL( mpn_add_n) - test R32(%rax), R32(%rax) - jz L(ret) - -C mpn_sub_n (rp, rp, mp, n); -C rdi rsi rdx rcx - mov nneg, %rdi - mov nneg, %rsi - mov mp, %rdx - mov 8(%rsp), %rcx C pass entry n - CALL( mpn_sub_n) -L(ret): - add $8, %rsp - pop n C just increment rsp - pop %r14 +L(ret): pop %r14 pop %r13 pop %r12 pop %rbx diff --git a/tests/refmpn.c b/tests/refmpn.c index fbcc602d6..7ace7ebce 100644 --- a/tests/refmpn.c +++ b/tests/refmpn.c @@ -2,7 +2,7 @@ of the normal gmp code. Speed isn't a consideration. Copyright 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, -2007, 2008, 2009 Free Software Foundation, Inc. +2007, 2008, 2009, 2011 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -2303,12 +2303,9 @@ refmpn_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm) for (j = n - 1; j >= 0; j--) { - up[0] = mpn_addmul_1 (up, mp, n, (up[0] * invm) & GMP_NUMB_MASK); + up[0] = refmpn_addmul_1 (up, mp, n, (up[0] * invm) & GMP_NUMB_MASK); up++; } - cy = mpn_add_n (rp, up, up - n, n); - if (cy != 0) - mpn_sub_n (rp, rp, mp, n); } size_t diff --git a/tune/speed.h b/tune/speed.h index c017a8ec2..08c01a5dc 100644 --- a/tune/speed.h +++ b/tune/speed.h @@ -2193,7 +2193,7 @@ int speed_routine_count_zeros_setup #define SPEED_ROUTINE_REDC_1(function) \ { \ unsigned i; \ - mp_ptr cp, mp, tp, ap; \ + mp_ptr mp, tp, ap; \ mp_limb_t inv; \ double t; \ TMP_DECL; \ @@ -2203,7 +2203,6 @@ int speed_routine_count_zeros_setup TMP_MARK; \ SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \ SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \ - SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \ SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \ \ MPN_COPY (ap, s->xp, s->size); \ @@ -2218,14 +2217,13 @@ int speed_routine_count_zeros_setup speed_operand_src (s, ap, 2*s->size+1); \ speed_operand_dst (s, tp, 2*s->size+1); \ 
speed_operand_src (s, mp, s->size); \ - speed_operand_dst (s, cp, s->size); \ speed_cache_fill (s); \ \ speed_starttime (); \ i = s->reps; \ do { \ MPN_COPY (tp, ap, 2*s->size); \ - function (cp, tp, mp, s->size, inv); \ + function (tp, mp, s->size, inv); \ } while (--i != 0); \ t = speed_endtime (); \ \ -- cgit v1.2.1 From ac2c5637c823e21c9fd3aa8cf3d52fedb70519e5 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Mon, 7 Nov 2011 18:42:54 +0100 Subject: *** empty log message *** --- ChangeLog | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/ChangeLog b/ChangeLog index 803bcb543..6d92c7d2a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,26 @@ +2011-11-07 Torbjorn Granlund + + * mpn/generic/redc_1.c: Just reduce U operand using Hensel norm, but + not fully canonically; leave add_n and conditional sub_n to caller. + Therefore omit R argument. + + * mpn/generic/redc_1_sec.c: Remove. + + * gmp-impl.h (mpn_redc_1): Update declaration. + (mpn_redc_1_sec): Remove declaration. + + * configure.in (gmp_mpn_functions): Remove redc_1_sec. + + * mpn/x86_64/redc_1.asm: Adapt to newly defined functionality/interface. + * tune/speed.h (SPEED_ROUTINE_REDC_1): Likewise. + + * tests/refmpn.c (refmpn_redc_1): Likewise; also call refmpn_addmul_1 + instead of mpn_addmul_1. + + * mpn/generic/powm.c (MPN_REDC_1): New macro, use for mpn_redc_1. + * mpn/generic/powm_sec.c (MPN_REDC_1_SEC): New macro, use for + mpn_redc_1_sec. + 2011-11-03 Torbjorn Granlund * dumbmp.c (mpz_sub): Abort for non-handled case. -- cgit v1.2.1 From eb4ffad7f3a72c693a161ecf544e3d7cb9a1c0ec Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Wed, 9 Nov 2011 16:34:21 +0100 Subject: Fix comment typo.
--- mpn/powerpc64/mode64/mul_basecase.asm | 2 +- mpn/powerpc64/mode64/p6/mul_basecase.asm | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mpn/powerpc64/mode64/mul_basecase.asm b/mpn/powerpc64/mode64/mul_basecase.asm index a34f75962..9a3957f94 100644 --- a/mpn/powerpc64/mode64/mul_basecase.asm +++ b/mpn/powerpc64/mode64/mul_basecase.asm @@ -1,4 +1,4 @@ -dnl PowerPC-64 mpn_basecase. +dnl PowerPC-64 mpn_mul_basecase. dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008 Free Software dnl Foundation, Inc. diff --git a/mpn/powerpc64/mode64/p6/mul_basecase.asm b/mpn/powerpc64/mode64/p6/mul_basecase.asm index 427d6081a..52c5af8ff 100644 --- a/mpn/powerpc64/mode64/p6/mul_basecase.asm +++ b/mpn/powerpc64/mode64/p6/mul_basecase.asm @@ -1,4 +1,4 @@ -dnl PowerPC-64 mpn_basecase. +dnl PowerPC-64 mpn_mul_basecase. dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010 Free dnl Software Foundation, Inc. -- cgit v1.2.1 From 91ea899257061155b12c5bfab949561117a70a4b Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Wed, 9 Nov 2011 22:26:07 +0100 Subject: (gmp_mpn_functions): Add addcnd_n and subcnd_n. --- configure.in | 1 + 1 file changed, 1 insertion(+) diff --git a/configure.in b/configure.in index 79367c210..6c9a313c3 100644 --- a/configure.in +++ b/configure.in @@ -2654,6 +2654,7 @@ case $tmp_fn in tmp_mulfunc="aors_err2_n" ;; add_err3_n|sub_err3_n) tmp_mulfunc="aors_err3_n" ;; + addcnd_n|subcnd_n) tmp_mulfunc="aorscnd_n" ;; addmul_1|submul_1) tmp_mulfunc="aorsmul_1" ;; popcount|hamdist) tmp_mulfunc="popham" ;; and_n|andn_n|nand_n | ior_n|iorn_n|nior_n | xor_n|xnor_n) -- cgit v1.2.1 From 0247111bce9444a966b57323f42fdd3e5a754b22 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Wed, 9 Nov 2011 22:29:36 +0100 Subject: New file. 
--- mpn/x86_64/aorscnd_n.asm | 164 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 mpn/x86_64/aorscnd_n.asm diff --git a/mpn/x86_64/aorscnd_n.asm b/mpn/x86_64/aorscnd_n.asm new file mode 100644 index 000000000..19ea42f2a --- /dev/null +++ b/mpn/x86_64/aorscnd_n.asm @@ -0,0 +1,164 @@ +dnl AMD64 mpn_addcnd_n, mpn_subcnd_n + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2.25 +C AMD K10 2 +C Intel P4 13 +C Intel core2 2.9 +C Intel NHM 2.9 +C Intel SBR 2.4 +C Intel atom 6.5 +C VIA nano 3 + +C NOTES +C * It might seem natural to use the cmov insn here, but since this function +C is supposed to have the exact same execution pattern for cnd true and +C false, and since cmov's documentation is not clear about whether it +C actually reads both source operands and writes the register for a false +C condition, we cannot use it. +C * Two cases could be optimised: (1) addcnd_n could use ADCSBB-from-memory +C to save one insn/limb, and (2) when up=rp addcnd_n and subcnd_n could use +C ADCSBB-to-memory, again saving 1 insn/limb. +C * This runs optimally at decoder bandwidth on K10. It has not been tuned +C for any other processor.
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cnd', `%r8') + +ifdef(`OPERATION_addcnd_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_addcnd_n)') +ifdef(`OPERATION_subcnd_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_subcnd_n)') + +MULFUNC_PROLOGUE(mpn_addcnd_n mpn_subcnd_n) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + + neg cnd + sbb cnd, cnd C make cnd mask + + lea (vp,n,8), vp + lea (up,n,8), up + lea (rp,n,8), rp + + mov R32(n), R32(%rax) + neg n + and $3, R32(%rax) + jz L(top) C carry-save reg rax = 0 in this arc + cmp $2, R32(%rax) + jc L(b1) + jz L(b2) + +L(b3): mov (vp,n,8), %r12 + mov 8(vp,n,8), %r13 + mov 16(vp,n,8), %r14 + mov (up,n,8), %r10 + mov 8(up,n,8), %rbx + mov 16(up,n,8), %rbp + and cnd, %r12 + and cnd, %r13 + and cnd, %r14 + ADDSUB %r12, %r10 + ADCSBB %r13, %rbx + ADCSBB %r14, %rbp + sbb R32(%rax), R32(%rax) C save carry + mov %r10, (rp,n,8) + mov %rbx, 8(rp,n,8) + mov %rbp, 16(rp,n,8) + add $3, n + js L(top) + jmp L(end) + +L(b2): mov (vp,n,8), %r12 + mov 8(vp,n,8), %r13 + mov (up,n,8), %r10 + mov 8(up,n,8), %rbx + and cnd, %r12 + and cnd, %r13 + ADDSUB %r12, %r10 + ADCSBB %r13, %rbx + sbb R32(%rax), R32(%rax) C save carry + mov %r10, (rp,n,8) + mov %rbx, 8(rp,n,8) + add $2, n + js L(top) + jmp L(end) + +L(b1): mov (vp,n,8), %r12 + mov (up,n,8), %r10 + and cnd, %r12 + ADDSUB %r12, %r10 + sbb R32(%rax), R32(%rax) C save carry + mov %r10, (rp,n,8) + add $1, n + jns L(end) + + ALIGN(16) +L(top): mov (vp,n,8), %r12 + mov 8(vp,n,8), %r13 + mov 16(vp,n,8), %r14 + mov 24(vp,n,8), %r11 + mov (up,n,8), %r10 + mov 8(up,n,8), %rbx + mov 16(up,n,8), %rbp + mov 24(up,n,8), %r9 + and cnd, %r12 + and cnd, %r13 + and cnd, %r14 + and cnd, %r11 + add R32(%rax), R32(%rax) C restore carry + ADCSBB %r12, %r10 + ADCSBB %r13, %rbx + ADCSBB %r14, %rbp + ADCSBB %r11, %r9 + sbb R32(%rax), 
R32(%rax) C save carry + mov %r10, (rp,n,8) + mov %rbx, 8(rp,n,8) + mov %rbp, 16(rp,n,8) + mov %r9, 24(rp,n,8) + add $4, n + js L(top) + +L(end): neg R32(%rax) + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret +EPILOGUE() -- cgit v1.2.1 From 76dbb3ab764f748395af063c5b58f188ccbdb163 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Wed, 9 Nov 2011 22:31:09 +0100 Subject: Add measuring of mpn_addcnd_n, mpn_subcnd_n. --- tune/common.c | 11 +++++++++++ tune/speed.c | 3 +++ tune/speed.h | 2 ++ 3 files changed, 16 insertions(+) diff --git a/tune/common.c b/tune/common.c index dbcc5ce90..eb2d4ba1a 100644 --- a/tune/common.c +++ b/tune/common.c @@ -1107,6 +1107,17 @@ speed_mpn_rsh1sub_n (struct speed_params *s) } #endif +double +speed_mpn_addcnd_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_addcnd_n (wp, xp, yp, s->size, 1)); +} +double +speed_mpn_subcnd_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_subcnd_n (wp, xp, yp, s->size, 1)); +} + /* mpn_and_n etc can be macros and so have to be handled with SPEED_ROUTINE_MPN_BINARY_N_CALL forms */ double diff --git a/tune/speed.c b/tune/speed.c index 0604edded..061517e28 100644 --- a/tune/speed.c +++ b/tune/speed.c @@ -468,6 +468,9 @@ const struct routine_t { { "mpn_rsh1sub_n", speed_mpn_rsh1sub_n, FLAG_R_OPTIONAL }, #endif + { "mpn_addcnd_n", speed_mpn_addcnd_n, FLAG_R_OPTIONAL }, + { "mpn_subcnd_n", speed_mpn_subcnd_n, FLAG_R_OPTIONAL }, + { "MPN_ZERO", speed_MPN_ZERO }, { "binvert_limb", speed_binvert_limb, FLAG_NODATA }, diff --git a/tune/speed.h b/tune/speed.h index 08c01a5dc..70484d391 100644 --- a/tune/speed.h +++ b/tune/speed.h @@ -148,6 +148,7 @@ double speed_mpn_add_n __GMP_PROTO ((struct speed_params *s)); double speed_mpn_add_err1_n __GMP_PROTO ((struct speed_params *s)); double speed_mpn_add_err2_n __GMP_PROTO ((struct speed_params *s)); double speed_mpn_add_err3_n __GMP_PROTO ((struct speed_params *s)); +double speed_mpn_addcnd_n __GMP_PROTO ((struct 
speed_params *s)); double speed_mpn_addlsh_n __GMP_PROTO ((struct speed_params *s)); double speed_mpn_addlsh1_n __GMP_PROTO ((struct speed_params *s)); double speed_mpn_addlsh2_n __GMP_PROTO ((struct speed_params *s)); @@ -305,6 +306,7 @@ double speed_mpn_sub_n __GMP_PROTO ((struct speed_params *s)); double speed_mpn_sub_err1_n __GMP_PROTO ((struct speed_params *s)); double speed_mpn_sub_err2_n __GMP_PROTO ((struct speed_params *s)); double speed_mpn_sub_err3_n __GMP_PROTO ((struct speed_params *s)); +double speed_mpn_subcnd_n __GMP_PROTO ((struct speed_params *s)); double speed_mpn_sublsh_n __GMP_PROTO ((struct speed_params *s)); double speed_mpn_sublsh1_n __GMP_PROTO ((struct speed_params *s)); double speed_mpn_sublsh2_n __GMP_PROTO ((struct speed_params *s)); -- cgit v1.2.1 From 2a071bbcca683a848366e451963e451e8d4c0d23 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Wed, 9 Nov 2011 22:31:54 +0100 Subject: Add testing of mpn_addcnd_n, mpn_subcnd_n. --- tests/devel/try.c | 16 ++++++++++++++++ tests/refmpn.c | 23 +++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/tests/devel/try.c b/tests/devel/try.c index 5619ec26d..bf09dd829 100644 --- a/tests/devel/try.c +++ b/tests/devel/try.c @@ -622,6 +622,8 @@ enum { TYPE_SUBLSH1_NC, TYPE_SUBLSH2_NC, TYPE_SUBLSH_NC, TYPE_RSBLSH1_NC, TYPE_RSBLSH2_NC, TYPE_RSBLSH_NC, + TYPE_ADDCND_N, TYPE_SUBCND_N, + TYPE_MOD_1, TYPE_MOD_1C, TYPE_DIVMOD_1, TYPE_DIVMOD_1C, TYPE_DIVREM_1, TYPE_DIVREM_1C, TYPE_PREINV_DIVREM_1, TYPE_DIVREM_2, TYPE_PREINV_MOD_1, TYPE_MOD_34LSUB1, TYPE_UDIV_QRNND, TYPE_UDIV_QRNND_R, @@ -742,6 +744,16 @@ param_init (void) COPY (TYPE_ADD_ERR3_N); REFERENCE (refmpn_sub_err3_n); + p = ¶m[TYPE_ADDCND_N]; + COPY (TYPE_ADD_N); + p->carry = CARRY_BIT; + REFERENCE (refmpn_addcnd_n); + + p = ¶m[TYPE_SUBCND_N]; + COPY (TYPE_ADD_N); + p->carry = CARRY_BIT; + REFERENCE (refmpn_subcnd_n); + p = ¶m[TYPE_MUL_1]; p->retval = 1; @@ -1704,6 +1716,8 @@ const struct choice_t choice_array[] = { { 
TRY(mpn_copyd), TYPE_COPYD }, #endif + { TRY(mpn_addcnd_n), TYPE_ADDCND_N }, + { TRY(mpn_subcnd_n), TYPE_SUBCND_N }, #if HAVE_NATIVE_mpn_addlsh1_n { TRY(mpn_addlsh1_n), TYPE_ADDLSH1_N }, #endif @@ -2395,6 +2409,8 @@ call (struct each_t *e, tryfun_t function) case TYPE_RSBLSH2_NC: case TYPE_ADD_NC: case TYPE_SUB_NC: + case TYPE_ADDCND_N: + case TYPE_SUBCND_N: e->retval = CALLING_CONVENTIONS (function) (e->d[0].p, e->s[0].p, e->s[1].p, size, carry); break; diff --git a/tests/refmpn.c b/tests/refmpn.c index 7ace7ebce..b31804ef9 100644 --- a/tests/refmpn.c +++ b/tests/refmpn.c @@ -596,6 +596,29 @@ refmpn_sub_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t size) return refmpn_sub_nc (rp, s1p, s2p, size, CNST_LIMB(0)); } +mp_limb_t +refmpn_addcnd_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t size, mp_limb_t cnd) +{ + if (cnd != 0) + return refmpn_add_n (rp, s1p, s2p, size); + else + { + refmpn_copyi (rp, s1p, size); + return 0; + } +} +mp_limb_t +refmpn_subcnd_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t size, mp_limb_t cnd) +{ + if (cnd != 0) + return refmpn_sub_n (rp, s1p, s2p, size); + else + { + refmpn_copyi (rp, s1p, size); + return 0; + } +} + #define AORS_ERR1_N(operation) \ { \ -- cgit v1.2.1 From 61f85141acc5225219d9be7f38738af7258aac9a Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Wed, 9 Nov 2011 22:32:44 +0100 Subject: Declare just added refmpn_addcnd_n, refmpn_subcnd_n. 
--- tests/tests.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/tests.h b/tests/tests.h index 4086e5c5d..75b546319 100644 --- a/tests/tests.h +++ b/tests/tests.h @@ -172,6 +172,11 @@ int refmpf_validate_division __GMP_PROTO ((const char *name, mpf_srcptr got, mpf_srcptr n, mpf_srcptr d)); +mp_limb_t refmpn_addcnd_n __GMP_PROTO ((mp_ptr wp, mp_srcptr xp, mp_srcptr yp, + mp_size_t size, mp_limb_t cnd)); +mp_limb_t refmpn_subcnd_n __GMP_PROTO ((mp_ptr wp, mp_srcptr xp, mp_srcptr yp, + mp_size_t size, mp_limb_t cnd)); + mp_limb_t refmpn_add __GMP_PROTO ((mp_ptr rp, mp_srcptr s1p, mp_size_t s1size, mp_srcptr s2p, mp_size_t s2size)); -- cgit v1.2.1 From ecb644d44d69f619e469bd22383f9c8558ef2afb Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Wed, 9 Nov 2011 23:41:38 +0100 Subject: New file. --- mpn/powerpc64/mode64/sqr_basecase.asm | 852 ++++++++++++++++++++++++++++++++++ 1 file changed, 852 insertions(+) create mode 100644 mpn/powerpc64/mode64/sqr_basecase.asm diff --git a/mpn/powerpc64/mode64/sqr_basecase.asm b/mpn/powerpc64/mode64/sqr_basecase.asm new file mode 100644 index 000000000..72ac2d318 --- /dev/null +++ b/mpn/powerpc64/mode64/sqr_basecase.asm @@ -0,0 +1,852 @@ +dnl PowerPC-64 mpn_sqr_basecase. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010, 2011 Free +dnl Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 6-18 +C POWER4/PPC970 8 +C POWER5 8 +C POWER6 16.25 +C POWER7 3.77 + +C NOTES +C * This is very crude, cleanup! +C * Try to reduce the number of needed live registers. +C * Rewrite for POWER6 to use 8 consecutive muls, not 2 groups of 4. The +C cost will be more live registers. +C * Rewrite for POWER7 to use addmul_2 building blocks; this will reduce code +C size a lot and speed things up perhaps 25%. +C * Use computed goto in order to compress the code. +C * Implement a larger final corner. +C * Schedule callee-saves register saves into other insns. This could save +C about 5 cycles/call. (We cannot analogously optimise the restores, since +C the sqr_diag_addlsh1 loop has no wind-down code as currently written.) +C * Should the alternating std/adde sequences be split? Some pipelines handle +C adde poorly, and might sequentialise all these instructions. +C * The sqr_diag_addlsh1 loop was written for POWER6 and its preferences for +C adjacent integer multiply insns. Except for the multiply insns, the code +C was not carefully optimised for POWER6 or any other CPU. +C * Perform cross-jumping in sqr_diag_addlsh1's feed-in code, into the loop. 
+ +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') + +define(`rp_outer', `r25') +define(`up_outer', `r21') +define(`rp_saved', `r22') +define(`up_saved', `r23') +define(`n_saved', `r24') + +ASM_START() +PROLOGUE(mpn_sqr_basecase) + cmpdi cr0, n, 2 + bge cr0, L(ge2) + ld r5, 0(up) C n = 1 + nop + mulld r8, r5, r5 C weight 0 + mulhdu r9, r5, r5 C weight 1 + std r8, 0(rp) + std r9, 8(rp) + blr + ALIGN(16) +L(ge2): bgt cr0, L(gt2) + ld r0, 0(up) C n = 2 + nop + mulld r8, r0, r0 C u0 * u0 + mulhdu r9, r0, r0 C u0 * u0 + ld r6, 8(up) + mulld r10, r6, r6 C u1 * u1 + mulhdu r11, r6, r6 C u1 * u1 + mulld r4, r6, r0 C u1 * u0 + mulhdu r5, r6, r0 C u1 * u0 + addc r4, r4, r4 + adde r5, r5, r5 + addze r11, r11 + addc r9, r9, r4 + adde r10, r10, r5 + addze r11, r11 + std r8, 0(rp) + std r9, 8(rp) + std r10, 16(rp) + std r11, 24(rp) + blr + + ALIGN(16) +L(gt2): std r31, -8(r1) + std r30, -16(r1) + std r29, -24(r1) + std r28, -32(r1) + std r27, -40(r1) + std r26, -48(r1) + std r25, -56(r1) + std r24, -64(r1) + std r23, -72(r1) + std r22, -80(r1) + std r21, -88(r1) + + mr rp_saved, rp + mr up_saved, up + mr n_saved, n + mr rp_outer, rp + mr up_outer, up + + rldicl. r0, n, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addic r7, n, 2 C compute count... 
+ srdi r7, r7, 2 C ...for ctr + mtctr r7 C copy count into ctr + beq- cr0, L(b0) + blt- cr6, L(b1) + beq- cr6, L(b2) + +L(b3): ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + addi up, up, 24 + li r12, 0 C carry limb + bdz L(em3) + + ALIGN(16) +L(tm3): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 0(up) + ld r27, 8(up) + adde r0, r0, r12 + adde r7, r7, r26 + mulld r26, r9, r6 + mulhdu r10, r9, r6 + mulld r11, r27, r6 + mulhdu r12, r27, r6 + ld r9, 16(up) + ld r27, 24(up) + std r0, 8(rp) + adde r26, r26, r8 + std r7, 16(rp) + adde r11, r11, r10 + std r26, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + bdnz L(tm3) + +L(em3): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + adde r0, r0, r12 + adde r7, r7, r26 + std r0, 8(rp) + std r7, 16(rp) + addze r8, r8 + std r8, 24(rp) + addi n, n, 2 + b L(outer_loop) + +L(b0): ld r6, 0(up) + ld r27, 8(up) + mulld r7, r27, r6 + mulhdu r12, r27, r6 + std r7, 8(rp) + addi rp, rp, 8 + ld r9, 16(up) + ld r27, 24(up) + addi up, up, 32 + bdz L(em0) + + ALIGN(16) +L(tm0): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 0(up) + ld r27, 8(up) + adde r0, r0, r12 + adde r7, r7, r26 + mulld r26, r9, r6 + mulhdu r10, r9, r6 + mulld r11, r27, r6 + mulhdu r12, r27, r6 + ld r9, 16(up) + ld r27, 24(up) + std r0, 8(rp) + adde r26, r26, r8 + std r7, 16(rp) + adde r11, r11, r10 + std r26, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + bdnz L(tm0) + +L(em0): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + adde r0, r0, r12 + adde r7, r7, r26 + std r0, 8(rp) + std r7, 16(rp) + addze r8, r8 + std r8, 24(rp) + addi n, n, 2 + b L(outer_loop_ent_2) + +L(b1): ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r12, r27, r6 + addc r7, r7, r26 + std r0, 8(rp) + std r7, 16(rp) + addi rp, rp, 16 + ld r9, 24(up) + ld r27, 32(up) 
+ addi up, up, 40 + bdz L(em1) + + ALIGN(16) +L(tm1): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 0(up) + ld r27, 8(up) + adde r0, r0, r12 + adde r7, r7, r26 + mulld r26, r9, r6 + mulhdu r10, r9, r6 + mulld r11, r27, r6 + mulhdu r12, r27, r6 + ld r9, 16(up) + ld r27, 24(up) + std r0, 8(rp) + adde r26, r26, r8 + std r7, 16(rp) + adde r11, r11, r10 + std r26, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + bdnz L(tm1) + +L(em1): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + adde r0, r0, r12 + adde r7, r7, r26 + std r0, 8(rp) + std r7, 16(rp) + addze r8, r8 + std r8, 24(rp) + addi n, n, 2 + b L(outer_loop_ent_3) + +L(b2): addi r7, r7, -1 C FIXME + mtctr r7 C FIXME + ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 24(up) + mulld r11, r9, r6 + mulhdu r10, r9, r6 + addc r7, r7, r26 + adde r11, r11, r8 + addze r12, r10 + std r0, 8(rp) + std r7, 16(rp) + std r11, 24(rp) + addi rp, rp, 24 + ld r9, 32(up) + ld r27, 40(up) + addi up, up, 48 + bdz L(em2) + + ALIGN(16) +L(tm2): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 0(up) + ld r27, 8(up) + adde r0, r0, r12 + adde r7, r7, r26 + mulld r26, r9, r6 + mulhdu r10, r9, r6 + mulld r11, r27, r6 + mulhdu r12, r27, r6 + ld r9, 16(up) + ld r27, 24(up) + std r0, 8(rp) + adde r26, r26, r8 + std r7, 16(rp) + adde r11, r11, r10 + std r26, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + bdnz L(tm2) + +L(em2): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + adde r0, r0, r12 + adde r7, r7, r26 + std r0, 8(rp) + std r7, 16(rp) + addze r8, r8 + std r8, 24(rp) + addi n, n, 2 + b L(outer_loop_ent_0) + + +L(outer_loop): + addi n, n, -1 + addi up_outer, up_outer, 8 + addi rp_outer, rp_outer, 16 + + mr up, up_outer + addi rp, rp_outer, 8 + + srdi r0, n, 2 + mtctr r0 + + bdz 
L(outer_end) + + ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 24(up) + ld r28, 0(rp) + ld r29, 8(rp) + ld r30, 16(rp) + mulld r11, r9, r6 + mulhdu r10, r9, r6 + addc r7, r7, r26 + adde r11, r11, r8 + addze r12, r10 + addc r0, r0, r28 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + adde r11, r11, r30 + std r11, 16(rp) + addi rp, rp, 24 + ld r9, 32(up) + ld r27, 40(up) + addi up, up, 48 + bdz L(ea1) + + ALIGN(16) +L(ta1): mulld r0, r9, r6 + mulhdu r26, r9, r6 C 9 + mulld r7, r27, r6 + mulhdu r8, r27, r6 C 27 + ld r9, 0(up) + ld r28, 0(rp) + ld r27, 8(up) + ld r29, 8(rp) + adde r0, r0, r12 C 0 12 + adde r7, r7, r26 C 5 7 + mulld r26, r9, r6 + mulhdu r10, r9, r6 C 9 + mulld r11, r27, r6 + mulhdu r12, r27, r6 C 27 + ld r9, 16(up) + ld r30, 16(rp) + ld r27, 24(up) + ld r31, 24(rp) + adde r26, r26, r8 C 8 5 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 0(rp) C 0 + adde r7, r7, r29 C 7 29 + std r7, 8(rp) C 7 + adde r26, r26, r30 C 5 30 + std r26, 16(rp) C 5 + adde r11, r11, r31 C 11 31 + std r11, 24(rp) C 11 + addi up, up, 32 + addi rp, rp, 32 + bdnz L(ta1) + +L(ea1): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r28, 0(rp) + ld r29, 8(rp) + adde r0, r0, r12 + adde r7, r7, r26 + addze r8, r8 + addc r0, r0, r28 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + addze r8, r8 + std r8, 16(rp) + +L(outer_loop_ent_0): + addi n, n, -1 + addi up_outer, up_outer, 8 + addi rp_outer, rp_outer, 16 + + mr up, up_outer + addi rp, rp_outer, 8 + + srdi r0, n, 2 + mtctr r0 + + ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + ld r28, 0(rp) + ld r29, 8(rp) + mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + addc r0, r0, r28 + adde r7, r7, r26 + addze r12, r8 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + addi rp, rp, 16 + ld r9, 24(up) + ld r27, 32(up) + addi up, up, 40 + bdz L(ea0) 
+ + ALIGN(16) +L(ta0): mulld r0, r9, r6 + mulhdu r26, r9, r6 C 9 + mulld r7, r27, r6 + mulhdu r8, r27, r6 C 27 + ld r9, 0(up) + ld r28, 0(rp) + ld r27, 8(up) + ld r29, 8(rp) + adde r0, r0, r12 C 0 12 + adde r7, r7, r26 C 5 7 + mulld r26, r9, r6 + mulhdu r10, r9, r6 C 9 + mulld r11, r27, r6 + mulhdu r12, r27, r6 C 27 + ld r9, 16(up) + ld r30, 16(rp) + ld r27, 24(up) + ld r31, 24(rp) + adde r26, r26, r8 C 8 5 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 0(rp) C 0 + adde r7, r7, r29 C 7 29 + std r7, 8(rp) C 7 + adde r26, r26, r30 C 5 30 + std r26, 16(rp) C 5 + adde r11, r11, r31 C 11 31 + std r11, 24(rp) C 11 + addi up, up, 32 + addi rp, rp, 32 + bdnz L(ta0) + +L(ea0): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r28, 0(rp) + ld r29, 8(rp) + adde r0, r0, r12 + adde r7, r7, r26 + addze r8, r8 + addc r0, r0, r28 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + addze r8, r8 + std r8, 16(rp) + +L(outer_loop_ent_3): + addi n, n, -1 + addi up_outer, up_outer, 8 + addi rp_outer, rp_outer, 16 + + mr up, up_outer + addi rp, rp_outer, 8 + + srdi r0, n, 2 + mtctr r0 + + ld r6, 0(up) + ld r9, 8(up) + ld r28, 0(rp) + mulld r0, r9, r6 + mulhdu r12, r9, r6 + addc r0, r0, r28 + std r0, 0(rp) + addi rp, rp, 8 + ld r9, 16(up) + ld r27, 24(up) + addi up, up, 32 + bdz L(ea3) + + ALIGN(16) +L(ta3): mulld r0, r9, r6 + mulhdu r26, r9, r6 C 9 + mulld r7, r27, r6 + mulhdu r8, r27, r6 C 27 + ld r9, 0(up) + ld r28, 0(rp) + ld r27, 8(up) + ld r29, 8(rp) + adde r0, r0, r12 C 0 12 + adde r7, r7, r26 C 5 7 + mulld r26, r9, r6 + mulhdu r10, r9, r6 C 9 + mulld r11, r27, r6 + mulhdu r12, r27, r6 C 27 + ld r9, 16(up) + ld r30, 16(rp) + ld r27, 24(up) + ld r31, 24(rp) + adde r26, r26, r8 C 8 5 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 0(rp) C 0 + adde r7, r7, r29 C 7 29 + std r7, 8(rp) C 7 + adde r26, r26, r30 C 5 30 + std r26, 16(rp) C 5 + adde r11, r11, r31 C 11 31 + std 
r11, 24(rp) C 11 + addi up, up, 32 + addi rp, rp, 32 + bdnz L(ta3) + +L(ea3): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r28, 0(rp) + ld r29, 8(rp) + adde r0, r0, r12 + adde r7, r7, r26 + addze r8, r8 + addc r0, r0, r28 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + addze r8, r8 + std r8, 16(rp) + + +L(outer_loop_ent_2): + addi n, n, -1 + addi up_outer, up_outer, 8 + addi rp_outer, rp_outer, 16 + + mr up, up_outer + addi rp, rp_outer, 8 + + srdi r0, n, 2 + mtctr r0 + + addic r0, r0, 0 + li r12, 0 C cy_limb = 0 + ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + bdz L(ea2) + addi up, up, 24 + + ALIGN(16) +L(ta2): mulld r0, r9, r6 + mulhdu r26, r9, r6 C 9 + mulld r7, r27, r6 + mulhdu r8, r27, r6 C 27 + ld r9, 0(up) + ld r28, 0(rp) + ld r27, 8(up) + ld r29, 8(rp) + adde r0, r0, r12 C 0 12 + adde r7, r7, r26 C 5 7 + mulld r26, r9, r6 + mulhdu r10, r9, r6 C 9 + mulld r11, r27, r6 + mulhdu r12, r27, r6 C 27 + ld r9, 16(up) + ld r30, 16(rp) + ld r27, 24(up) + ld r31, 24(rp) + adde r26, r26, r8 C 8 5 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 0(rp) C 0 + adde r7, r7, r29 C 7 29 + std r7, 8(rp) C 7 + adde r26, r26, r30 C 5 30 + std r26, 16(rp) C 5 + adde r11, r11, r31 C 11 31 + std r11, 24(rp) C 11 + addi up, up, 32 + addi rp, rp, 32 + bdnz L(ta2) + +L(ea2): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r28, 0(rp) + ld r29, 8(rp) + adde r0, r0, r12 + adde r7, r7, r26 + addze r8, r8 + addc r0, r0, r28 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + addze r8, r8 + std r8, 16(rp) + + b L(outer_loop) + +L(outer_end): + ld r6, 0(up) + ld r9, 8(up) + ld r11, 0(rp) + mulld r0, r9, r6 + mulhdu r8, r9, r6 + addc r0, r0, r11 + std r0, 0(rp) + addze r8, r8 + std r8, 8(rp) + +define(`rp', `rp_saved') +define(`up', `r5') +define(`n', `r6') +define(`climb', `r0') + + addi r4, rp_saved, 8 + mr r5, up_saved + mr r6, n_saved + + rldicl. 
r0, n, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addi n, n, 2 C compute count... + srdi n, n, 2 C ...for ctr + mtctr n C put loop count into ctr + beq cr0, L(xb0) + blt cr6, L(xb1) + beq cr6, L(xb2) + +L(xb3): ld r6, 0(up) + ld r7, 8(up) + ld r12, 16(up) + addi up, up, 24 + mulld r24, r6, r6 + mulhdu r25, r6, r6 + mulld r26, r7, r7 + mulhdu r27, r7, r7 + mulld r28, r12, r12 + mulhdu r29, r12, r12 + ld r10, 8(rp) + ld r11, 16(rp) + ld r6, 24(rp) + ld r7, 32(rp) + addc r10, r10, r10 + adde r11, r11, r11 + adde r6, r6, r6 + adde r7, r7, r7 + addze climb, r29 + addc r10, r10, r25 + adde r11, r11, r26 + adde r6, r6, r27 + adde r7, r7, r28 + std r24, 0(rp) + std r10, 8(rp) + std r11, 16(rp) + std r6, 24(rp) + std r7, 32(rp) + addi rp, rp, 40 + bdnz L(top) + b L(end) + +L(xb2): ld r6, 0(up) + ld r7, 8(up) + addi up, up, 16 + mulld r24, r6, r6 + mulhdu r25, r6, r6 + mulld r26, r7, r7 + mulhdu r27, r7, r7 + ld r10, 8(rp) + ld r11, 16(rp) + addc r10, r10, r10 + adde r11, r11, r11 + addze climb, r27 + addc r10, r10, r25 + adde r11, r11, r26 + std r24, 0(rp) + std r10, 8(rp) + std r11, 16(rp) + addi rp, rp, 24 + bdnz L(top) + b L(end) + +L(xb0): ld r6, 0(up) + ld r7, 8(up) + ld r12, 16(up) + ld r23, 24(up) + addi up, up, 32 + mulld r24, r6, r6 + mulhdu r25, r6, r6 + mulld r26, r7, r7 + mulhdu r27, r7, r7 + mulld r28, r12, r12 + mulhdu r29, r12, r12 + mulld r30, r23, r23 + mulhdu r31, r23, r23 + ld r10, 8(rp) + ld r11, 16(rp) + ld r6, 24(rp) + ld r7, 32(rp) + ld r12, 40(rp) + ld r23, 48(rp) + addc r10, r10, r10 + adde r11, r11, r11 + adde r6, r6, r6 + adde r7, r7, r7 + adde r12, r12, r12 + adde r23, r23, r23 + addze climb, r31 + std r24, 0(rp) + addc r10, r10, r25 + std r10, 8(rp) + adde r11, r11, r26 + std r11, 16(rp) + adde r6, r6, r27 + std r6, 24(rp) + adde r7, r7, r28 + std r7, 32(rp) + adde r12, r12, r29 + std r12, 40(rp) + adde r23, r23, r30 + std r23, 48(rp) + addi rp, rp, 56 + bdnz L(top) + b L(end) + +L(xb1): ld r6, 0(up) + addi up, up, 8 + mulld r24, r6, r6 + 
mulhdu climb, r6, r6 + std r24, 0(rp) + addic rp, rp, 8 C clear carry as side-effect + + ALIGN(32) +L(top): ld r6, 0(up) + ld r7, 8(up) + ld r12, 16(up) + ld r23, 24(up) + addi up, up, 32 + mulld r24, r6, r6 + mulhdu r25, r6, r6 + mulld r26, r7, r7 + mulhdu r27, r7, r7 + mulld r28, r12, r12 + mulhdu r29, r12, r12 + mulld r30, r23, r23 + mulhdu r31, r23, r23 + ld r8, 0(rp) + ld r9, 8(rp) + adde r8, r8, r8 + adde r9, r9, r9 + ld r10, 16(rp) + ld r11, 24(rp) + adde r10, r10, r10 + adde r11, r11, r11 + ld r6, 32(rp) + ld r7, 40(rp) + adde r6, r6, r6 + adde r7, r7, r7 + ld r12, 48(rp) + ld r23, 56(rp) + adde r12, r12, r12 + adde r23, r23, r23 + addze r31, r31 + addc r8, r8, climb + std r8, 0(rp) + adde r9, r9, r24 + std r9, 8(rp) + adde r10, r10, r25 + std r10, 16(rp) + adde r11, r11, r26 + std r11, 24(rp) + adde r6, r6, r27 + std r6, 32(rp) + adde r7, r7, r28 + std r7, 40(rp) + adde r12, r12, r29 + std r12, 48(rp) + adde r23, r23, r30 + std r23, 56(rp) + mr climb, r31 + addi rp, rp, 64 + bdnz L(top) + +L(end): addze climb, climb + std climb, 0(rp) + + ld r31, -8(r1) + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + ld r27, -40(r1) + ld r26, -48(r1) + ld r25, -56(r1) + ld r24, -64(r1) + ld r23, -72(r1) + ld r22, -80(r1) + ld r21, -88(r1) + blr +EPILOGUE() -- cgit v1.2.1 From e958b3fb3edcec522f3c374a122f80a2d3d9207a Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Wed, 9 Nov 2011 23:41:46 +0100 Subject: *** empty log message *** --- ChangeLog | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/ChangeLog b/ChangeLog index 6d92c7d2a..4ec18c5ac 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,19 @@ +2011-11-09 Torbjorn Granlund + + * mpn/powerpc64/mode64/sqr_basecase.asm: New file. + + * mpn/x86_64/aorscnd_n.asm: New file. + + * tune/speed.c (routine): Add measuring of mpn_addcnd_n, mpn_subcnd_n. + * tune/common.c (speed_mpn_addcnd_n,speed_mpn_subcnd_n): New functions. + * tune/speed.h: Declare them. 
+ + * tests/devel/try.c: Add tests for mpn_addcnd_n and mpn_subcnd_n. + * tests/refmpn.c (refmpn_addcnd_n, refmpn_subcnd_n): New functions. + * tests/tests.h: Declare them. + + * configure.in (gmp_mpn_functions): Add addcnd_n and subcnd_n. + 2011-11-07 Torbjorn Granlund * mpn/generic/redc_1.c: Just reduce U uperand using Hensel norm, but -- cgit v1.2.1 From eb453fbaa0a498d2b1bfd05c9a51310da203fd33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niels=20M=C3=B6ller?= Date: Fri, 11 Nov 2011 14:13:49 +0100 Subject: Make mpn_hgcd_appr use mpn_hgcd_reduce. --- ChangeLog | 6 ++ mpn/generic/hgcd_appr.c | 175 ++---------------------------------------------- 2 files changed, 10 insertions(+), 171 deletions(-) diff --git a/ChangeLog b/ChangeLog index 4ec18c5ac..7bd87d97e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2011-11-11 Niels Möller + + * mpn/generic/hgcd_appr.c (submul, hgcd_matrix_apply): Deleted + functions, earlier copied to hgcd_reduce.c. + (mpn_hgcd_appr): Use hgcd_reduce. + 2011-11-09 Torbjorn Granlund * mpn/powerpc64/mode64/sqr_basecase.asm: New file. diff --git a/mpn/generic/hgcd_appr.c b/mpn/generic/hgcd_appr.c index 963eaea47..8454f9da5 100644 --- a/mpn/generic/hgcd_appr.c +++ b/mpn/generic/hgcd_appr.c @@ -25,172 +25,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #include "gmp-impl.h" #include "longlong.h" -/* Computes R -= A * B. Result must be non-negative. Normalized down - to size an, and resulting size is returned. 
*/ -static mp_size_t -submul (mp_ptr rp, mp_size_t rn, - mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn) -{ - mp_ptr tp; - TMP_DECL; - - ASSERT (bn > 0); - ASSERT (an >= bn); - ASSERT (rn >= an); - ASSERT (an + bn <= rn + 1); - - TMP_MARK; - tp = TMP_ALLOC_LIMBS (an + bn); - - mpn_mul (tp, ap, an, bp, bn); - if (an + bn > rn) - { - ASSERT (tp[rn] == 0); - bn--; - } - ASSERT_NOCARRY (mpn_sub (rp, rp, rn, tp, an + bn)); - TMP_FREE; - - while (rn > an && (rp[rn-1] == 0)) - rn--; - - return rn; -} - -/* Computes (a, b) <-- M^{-1} (a; b) */ -/* FIXME: - x Take scratch parameter, and figure out scratch need. - - x Use some fallback for small M->n? -*/ -static mp_size_t -hgcd_matrix_apply (const struct hgcd_matrix *M, - mp_ptr ap, mp_ptr bp, - mp_size_t n) -{ - mp_size_t an, bn, un, vn, nn; - mp_size_t mn[2][2]; - mp_size_t modn; - mp_ptr tp, sp, scratch; - mp_limb_t cy; - unsigned i, j; - - TMP_DECL; - - ASSERT ( (ap[n-1] | bp[n-1]) > 0); - - an = n; - MPN_NORMALIZE (ap, an); - bn = n; - MPN_NORMALIZE (bp, bn); - - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - { - mp_size_t k; - k = M->n; - MPN_NORMALIZE (M->p[i][j], k); - mn[i][j] = k; - } - - ASSERT (mn[0][0] > 0); - ASSERT (mn[1][1] > 0); - ASSERT ( (mn[0][1] | mn[1][0]) > 0); - - TMP_MARK; - - if (mn[0][1] == 0) - { - mp_size_t qn; - - /* A unchanged, M = (1, 0; q, 1) */ - ASSERT (mn[0][0] == 1); - ASSERT (M->p[0][0][0] == 1); - ASSERT (mn[1][1] == 1); - ASSERT (M->p[1][1][0] == 1); - - /* Put B <-- B - q A */ - nn = submul (bp, bn, ap, an, M->p[1][0], mn[1][0]); - } - else if (mn[1][0] == 0) - { - /* B unchanged, M = (1, q; 0, 1) */ - ASSERT (mn[0][0] == 1); - ASSERT (M->p[0][0][0] == 1); - ASSERT (mn[1][1] == 1); - ASSERT (M->p[1][1][0] == 1); - - /* Put A <-- A - q * B */ - nn = submul (ap, an, bp, bn, M->p[0][1], mn[0][1]); - } - else - { - /* A = m00 a + m01 b ==> a <= A / m00, b <= A / m01. - B = m10 a + m11 b ==> a <= B / m10, b <= B / m11. 
*/ - un = MIN (an - mn[0][0], bn - mn[1][0]) + 1; - vn = MIN (an - mn[0][1], bn - mn[1][1]) + 1; - - nn = MAX (un, vn); - /* In the range of interest, mulmod_bnm1 should always beat mullo. */ - modn = mpn_mulmod_bnm1_next_size (nn + 1); - - scratch = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (modn, modn, M->n)); - tp = TMP_ALLOC_LIMBS (modn); - sp = TMP_ALLOC_LIMBS (modn); - - ASSERT (n <= 2*modn); - - if (n > modn) - { - cy = mpn_add (ap, ap, modn, ap + modn, n - modn); - MPN_INCR_U (ap, modn, cy); - - cy = mpn_add (bp, bp, modn, bp + modn, n - modn); - MPN_INCR_U (bp, modn, cy); - - n = modn; - } - - mpn_mulmod_bnm1 (tp, modn, ap, n, M->p[1][1], mn[1][1], scratch); - mpn_mulmod_bnm1 (sp, modn, bp, n, M->p[0][1], mn[0][1], scratch); - - /* FIXME: Handle the small n case in some better way. */ - if (n + mn[1][1] < modn) - MPN_ZERO (tp + n + mn[1][1], modn - n - mn[1][1]); - if (n + mn[0][1] < modn) - MPN_ZERO (sp + n + mn[0][1], modn - n - mn[0][1]); - - cy = mpn_sub_n (tp, tp, sp, modn); - MPN_DECR_U (tp, modn, cy); - - ASSERT (mpn_zero_p (tp + nn, modn - nn)); - - mpn_mulmod_bnm1 (sp, modn, ap, n, M->p[1][0], mn[1][0], scratch); - MPN_COPY (ap, tp, nn); - mpn_mulmod_bnm1 (tp, modn, bp, n, M->p[0][0], mn[0][0], scratch); - - if (n + mn[1][0] < modn) - MPN_ZERO (sp + n + mn[1][0], modn - n - mn[1][0]); - if (n + mn[0][0] < modn) - MPN_ZERO (tp + n + mn[0][0], modn - n - mn[0][0]); - - cy = mpn_sub_n (tp, tp, sp, modn); - MPN_DECR_U (tp, modn, cy); - - ASSERT (mpn_zero_p (tp + nn, modn - nn)); - MPN_COPY (bp, tp, nn); - - while ( (ap[nn-1] | bp[nn-1]) == 0) - { - nn--; - ASSERT (nn > 0); - } - } - TMP_FREE; - - return nn; -} - /* Identical to mpn_hgcd_itch. FIXME: Do we really need to add HGCD_THRESHOLD at the end? 
*/ mp_size_t @@ -347,13 +181,12 @@ mpn_hgcd_appr (mp_ptr ap, mp_ptr bp, mp_size_t n, { mp_size_t n2 = (3*n)/4 + 1; mp_size_t p = n/2; - mp_size_t input_n = n; + mp_size_t nn; - MPN_COPY (tp, ap + p, n - p); - MPN_COPY (tp + n - p, bp + p, n - p); - if (mpn_hgcd_appr (tp, tp + n - p, n - p, M, tp + 2*(n-p))) + nn = mpn_hgcd_reduce (M, ap, bp, n, p, tp); + if (nn) { - n = hgcd_matrix_apply (M, ap, bp, n); + n = nn; /* FIXME: Discard some of the low limbs immediately? */ success = 1; } -- cgit v1.2.1 From 5b0e8651a493b9128594851eff5387bde8081526 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niels=20M=C3=B6ller?= Date: Fri, 11 Nov 2011 14:59:14 +0100 Subject: Tuning of mpn_hgcd_appr and mpn_hgcd_reduce. --- ChangeLog | 24 ++++++++++++++++++++ tune/Makefile.am | 7 ++++-- tune/common.c | 16 +++++++++++++ tune/hgcd_reduce_1.c | 30 ++++++++++++++++++++++++ tune/hgcd_reduce_2.c | 29 ++++++++++++++++++++++++ tune/speed.c | 4 ++++ tune/speed.h | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++ tune/tuneup.c | 25 ++++++++++++++++++++ 8 files changed, 197 insertions(+), 2 deletions(-) create mode 100644 tune/hgcd_reduce_1.c create mode 100644 tune/hgcd_reduce_2.c diff --git a/ChangeLog b/ChangeLog index 7bd87d97e..3187619a5 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,29 @@ 2011-11-11 Niels Möller + * tune/hgcd_reduce_2.c: New file. + * tune/hgcd_reduce_1.c: New file. + + * tune/tuneup.c (hgcd_appr_threshold): New threshold variable. + (hgcd_reduce_threshold): Likewise. + (tune_hgcd_appr): New function. + (tune_hgcd_reduce): New function. + (all): Call tune_hgcd_appr and tune_hgcd_reduce. + + * tune/speed.h (speed_mpn_hgcd_reduce): Declaration. + (speed_mpn_hgcd_reduce_[12]): Likewise. + (mpn_hgcd_reduce_[12]): Likewise. + (SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL): New macro. + + * tune/speed.c (routine): Added mpn_hgcd_reduce, + mpn_hgcd_reduce_1, and mpn_hgcd_reduce_2. + + * tune/common.c (speed_mpn_hgcd_reduce): New function. + (speed_mpn_hgcd_reduce_[12]): Likewise. 
+ + * tune/Makefile.am (libspeed_la_SOURCES): Added hgcd_reduce_1.c + hgcd_reduce_2.c. + (TUNE_MPN_SRCS_BASIC): Added hgcd_appr.c and hgcd_reduce.c. + * mpn/generic/hgcd_appr.c (submul, hgcd_matrix_apply): Deleted functions, earlier copied to hgcd_reduce.c. (mpn_hgcd_appr): Use hgcd_reduce. diff --git a/tune/Makefile.am b/tune/Makefile.am index e54c020d4..117e5ca2c 100644 --- a/tune/Makefile.am +++ b/tune/Makefile.am @@ -43,7 +43,8 @@ libspeed_la_SOURCES = \ common.c divrem1div.c divrem1inv.c divrem2div.c divrem2inv.c \ freq.c \ gcdext_single.c gcdext_double.c gcdextod.c gcdextos.c \ - hgcd_lehmer.c jacbase1.c jacbase2.c jacbase3.c jacbase4.c \ + hgcd_lehmer.c hgcd_reduce_1.c hgcd_reduce_2.c \ + jacbase1.c jacbase2.c jacbase3.c jacbase4.c \ mod_1_div.c mod_1_inv.c mod_1_1-1.c mod_1_1-2.c modlinv.c \ noop.c powm_mod.c powm_redc.c pre_divrem_1.c \ set_strb.c set_strs.c set_strp.c time.c @@ -129,7 +130,9 @@ TUNE_MPN_SRCS = $(TUNE_MPN_SRCS_BASIC) divrem_1.c mod_1.c TUNE_MPN_SRCS_BASIC = div_qr_2.c bdiv_q.c bdiv_qr.c \ dcpi1_div_qr.c dcpi1_divappr_q.c dcpi1_bdiv_qr.c dcpi1_bdiv_q.c \ invertappr.c invert.c binvert.c divrem_2.c gcd.c gcdext.c \ - get_str.c set_str.c matrix22_mul.c hgcd.c mul_n.c sqr.c \ + get_str.c set_str.c matrix22_mul.c \ + hgcd.c hgcd_appr.c hgcd_reduce.c \ + mul_n.c sqr.c \ mullo_n.c mul_fft.c mul.c tdiv_qr.c mulmod_bnm1.c sqrmod_bnm1.c \ mulmid.c mulmid_n.c toom42_mulmid.c \ nussbaumer_mul.c toom6h_mul.c toom8h_mul.c toom6_sqr.c toom8_sqr.c \ diff --git a/tune/common.c b/tune/common.c index eb2d4ba1a..cc333a470 100644 --- a/tune/common.c +++ b/tune/common.c @@ -1538,6 +1538,22 @@ speed_mpn_hgcd_appr (struct speed_params *s) SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_appr, mpn_hgcd_appr_itch); } +double +speed_mpn_hgcd_reduce (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce, mpn_hgcd_reduce_itch); +} +double +speed_mpn_hgcd_reduce_1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce_1, 
mpn_hgcd_reduce_1_itch); +} +double +speed_mpn_hgcd_reduce_2 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce_2, mpn_hgcd_reduce_2_itch); +} + double speed_mpn_gcd (struct speed_params *s) { diff --git a/tune/hgcd_reduce_1.c b/tune/hgcd_reduce_1.c new file mode 100644 index 000000000..996362414 --- /dev/null +++ b/tune/hgcd_reduce_1.c @@ -0,0 +1,30 @@ +/* mpn/generic/hgcd_reduce.c forced to use hgcd. */ + +/* +Copyright 2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + +#include "gmp.h" +#include "gmp-impl.h" + +#undef HGCD_REDUCE_THRESHOLD +#define HGCD_REDUCE_THRESHOLD MP_SIZE_T_MAX +#define __gmpn_hgcd_reduce mpn_hgcd_reduce_1 +#define __gmpn_hgcd_reduce_itch mpn_hgcd_reduce_1_itch + + +#include "../mpn/generic/hgcd_reduce.c" diff --git a/tune/hgcd_reduce_2.c b/tune/hgcd_reduce_2.c new file mode 100644 index 000000000..1eed4ba11 --- /dev/null +++ b/tune/hgcd_reduce_2.c @@ -0,0 +1,29 @@ +/* mpn/generic/hgcd_reduce.c forced to use hgcd_appr. */ + +/* +Copyright 2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + +#include "gmp.h" +#include "gmp-impl.h" + +#undef HGCD_REDUCE_THRESHOLD +#define HGCD_REDUCE_THRESHOLD 0 +#define __gmpn_hgcd_reduce mpn_hgcd_reduce_2 +#define __gmpn_hgcd_reduce_itch mpn_hgcd_reduce_2_itch + +#include "../mpn/generic/hgcd_reduce.c" diff --git a/tune/speed.c b/tune/speed.c index 061517e28..08c13e776 100644 --- a/tune/speed.c +++ b/tune/speed.c @@ -279,6 +279,10 @@ const struct routine_t { { "mpn_hgcd_lehmer", speed_mpn_hgcd_lehmer }, { "mpn_hgcd_appr", speed_mpn_hgcd_appr }, + { "mpn_hgcd_reduce", speed_mpn_hgcd_reduce }, + { "mpn_hgcd_reduce_1", speed_mpn_hgcd_reduce_1 }, + { "mpn_hgcd_reduce_2", speed_mpn_hgcd_reduce_2 }, + { "mpn_gcd_1", speed_mpn_gcd_1, FLAG_R_OPTIONAL }, { "mpn_gcd_1N", speed_mpn_gcd_1N, FLAG_R_OPTIONAL }, diff --git a/tune/speed.h b/tune/speed.h index 70484d391..5add58720 100644 --- a/tune/speed.h +++ b/tune/speed.h @@ -198,6 +198,9 @@ double speed_mpn_matrix22_mul __GMP_PROTO ((struct speed_params *s)); double speed_mpn_hgcd __GMP_PROTO ((struct speed_params *s)); double speed_mpn_hgcd_lehmer __GMP_PROTO ((struct speed_params *s)); double speed_mpn_hgcd_appr __GMP_PROTO ((struct speed_params *s)); +double speed_mpn_hgcd_reduce __GMP_PROTO ((struct speed_params *s)); +double speed_mpn_hgcd_reduce_1 __GMP_PROTO ((struct speed_params *s)); +double speed_mpn_hgcd_reduce_2 
__GMP_PROTO ((struct speed_params *s)); double speed_mpn_gcd __GMP_PROTO ((struct speed_params *s)); double speed_mpn_gcd_1 __GMP_PROTO ((struct speed_params *s)); double speed_mpn_gcd_1N __GMP_PROTO ((struct speed_params *s)); @@ -488,6 +491,16 @@ mp_size_t mpn_hgcd_lehmer __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr)); #define MPN_HGCD_LEHMER_ITCH(n) (n) +mp_size_t mpn_hgcd_reduce_1 + __GMP_PROTO ((struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr)); +mp_size_t mpn_hgcd_reduce_1_itch + __GMP_PROTO ((mp_size_t, mp_size_t)); + +mp_size_t mpn_hgcd_reduce_2 + __GMP_PROTO ((struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr)); +mp_size_t mpn_hgcd_reduce_2_itch + __GMP_PROTO ((mp_size_t, mp_size_t)); + mp_limb_t mpn_sb_divrem_mn_div __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t)); mp_limb_t mpn_sb_divrem_mn_inv __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t)); @@ -2706,6 +2719,57 @@ int speed_routine_count_zeros_setup return t; \ } +#define SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL(func, itchfunc) \ + { \ + mp_size_t hgcd_init_itch, hgcd_step_itch; \ + mp_ptr ap, bp, wp, tmp1; \ + struct hgcd_matrix hgcd; \ + mp_size_t p = s->size/2; \ + int res; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + if (s->size < 2) \ + return -1; \ + \ + TMP_MARK; \ + \ + SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp); \ + \ + s->xp[s->size - 1] |= 1; \ + s->yp[s->size - 1] |= 1; \ + \ + hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size); \ + hgcd_step_itch = itchfunc (s->size, p); \ + \ + SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (wp, hgcd_step_itch, s->align_wp); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_src (s, s->yp, s->size); \ + speed_operand_dst (s, ap, s->size + 1); \ + speed_operand_dst (s, bp, s->size + 1); \ + speed_operand_dst (s, wp, hgcd_step_itch); \ + 
speed_operand_dst (s, tmp1, hgcd_init_itch); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + { \ + MPN_COPY (ap, s->xp, s->size); \ + MPN_COPY (bp, s->yp, s->size); \ + mpn_hgcd_matrix_init (&hgcd, s->size, tmp1); \ + res = func (&hgcd, ap, bp, s->size, p, wp); \ + } \ + while (--i != 0); \ + t = speed_endtime (); \ + TMP_FREE; \ + return t; \ + } + /* Run some GCDs of s->size limbs each. The number of different data values is decreased as s->size**2, since GCD is a quadratic algorithm. SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT diff --git a/tune/tuneup.c b/tune/tuneup.c index 4f53c979c..c62a25362 100644 --- a/tune/tuneup.c +++ b/tune/tuneup.c @@ -195,6 +195,8 @@ mp_size_t redc_2_to_redc_n_threshold = MP_SIZE_T_MAX; mp_size_t powm_threshold = MP_SIZE_T_MAX; mp_size_t matrix22_strassen_threshold = MP_SIZE_T_MAX; mp_size_t hgcd_threshold = MP_SIZE_T_MAX; +mp_size_t hgcd_appr_threshold = MP_SIZE_T_MAX; +mp_size_t hgcd_reduce_threshold = MP_SIZE_T_MAX; mp_size_t gcd_accel_threshold = MP_SIZE_T_MAX; mp_size_t gcd_dc_threshold = MP_SIZE_T_MAX; mp_size_t gcdext_dc_threshold = MP_SIZE_T_MAX; @@ -1754,6 +1756,27 @@ tune_hgcd (void) one (&hgcd_threshold, &param); } +void +tune_hgcd_appr (void) +{ + static struct param_t param; + param.name = "HGCD_APPR_THRESHOLD"; + param.function = speed_mpn_hgcd_appr; + /* We seem to get strange results for small sizes */ + param.min_size = 30; + one (&hgcd_appr_threshold, &param); +} + +void +tune_hgcd_reduce (void) +{ + static struct param_t param; + param.name = "HGCD_REDUCE_THRESHOLD"; + param.function = speed_mpn_hgcd_reduce; + param.min_size = 30; + one (&hgcd_reduce_threshold, &param); +} + void tune_gcd_dc (void) { @@ -2579,6 +2602,8 @@ all (void) tune_matrix22_mul (); tune_hgcd (); + tune_hgcd_appr (); + tune_hgcd_reduce(); tune_gcd_dc (); tune_gcdext_dc (); tune_jacobi_base (); -- cgit v1.2.1 From 11d8c9b34ef96cb653f6af6124af9f54767805a8 Mon Sep 17 00:00:00 2001 From: Torbjorn
Granlund Date: Fri, 11 Nov 2011 23:33:52 +0100 Subject: Remove file, now part of sqr_basecase.asm. --- mpn/powerpc64/mode64/sqr_diag_addlsh1.asm | 239 ------------------------------ 1 file changed, 239 deletions(-) delete mode 100644 mpn/powerpc64/mode64/sqr_diag_addlsh1.asm diff --git a/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm b/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm deleted file mode 100644 index a1903cb6e..000000000 --- a/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm +++ /dev/null @@ -1,239 +0,0 @@ -dnl PowerPC-64 mpn_sqr_diag_addlsh1 - -dnl Copyright 2011 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. - -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published -dnl by the Free Software Foundation; either version 3 of the License, or (at -dnl your option) any later version. - -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. - -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb -C POWER3/PPC630 10 -C POWER4/PPC970 6 -C POWER5 5.375 -C POWER6 8.5 -C POWER7 3.4 - -C NOTES -C * This was written for POWER6 and its preferences for adjacent integer -C multiply insns. The cost is that we get a large set of live registers, -C and therefore need to save 9 callee-saves registers. Except for the -C multiply insns, the code was not carefully optimised for POWER6 or any -C other CPU. -C * Perform some cross-jumping in the feed-in code, into the loop's tail. 
- -C refmpn_sqr_diag_addlsh1 (mp_ptr rp, mp_srcptr tp, mp_srcptr up, mp_size_t n) - -C INPUT PARAMETERS -define(`rp', `r3') -define(`tp', `r4') -define(`up', `r5') -define(`n', `r6') - -define(`climb', `r0') - -ASM_START() -PROLOGUE(mpn_sqr_diag_addlsh1) - std r31, -8(r1) - std r30, -16(r1) - std r29, -24(r1) - std r28, -32(r1) - std r27, -40(r1) - std r26, -48(r1) - std r25, -56(r1) - std r24, -64(r1) - std r23, -72(r1) - - rldicl. r0, n, 0,62 C r0 = n & 3, set cr0 - cmpdi cr6, r0, 2 - addi n, n, 2 C compute count... - srdi n, n, 2 C ...for ctr - mtctr n C put loop count into ctr - beq cr0, L(b0) - blt cr6, L(b1) - beq cr6, L(b2) - -L(b3): ld r6, 0(up) - ld r7, 8(up) - ld r12, 16(up) - addi up, up, 24 - mulld r24, r6, r6 - mulhdu r25, r6, r6 - mulld r26, r7, r7 - mulhdu r27, r7, r7 - mulld r28, r12, r12 - mulhdu r29, r12, r12 - ld r10, 0(tp) - ld r11, 8(tp) - ld r6, 16(tp) - ld r7, 24(tp) - addi tp, tp, 32 - addc r10, r10, r10 - adde r11, r11, r11 - adde r6, r6, r6 - adde r7, r7, r7 - addze climb, r29 - addc r10, r10, r25 - adde r11, r11, r26 - adde r6, r6, r27 - adde r7, r7, r28 - std r24, 0(rp) - std r10, 8(rp) - std r11, 16(rp) - std r6, 24(rp) - std r7, 32(rp) - addi rp, rp, 40 - bdnz L(top) - b L(end) - -L(b2): ld r6, 0(up) - ld r7, 8(up) - addi up, up, 16 - mulld r24, r6, r6 - mulhdu r25, r6, r6 - mulld r26, r7, r7 - mulhdu r27, r7, r7 - ld r10, 0(tp) - ld r11, 8(tp) - addi tp, tp, 16 - addc r10, r10, r10 - adde r11, r11, r11 - addze climb, r27 - addc r10, r10, r25 - adde r11, r11, r26 - std r24, 0(rp) - std r10, 8(rp) - std r11, 16(rp) - addi rp, rp, 24 - bdnz L(top) - b L(end) - -L(b0): ld r6, 0(up) - ld r7, 8(up) - ld r12, 16(up) - ld r23, 24(up) - addi up, up, 32 - mulld r24, r6, r6 - mulhdu r25, r6, r6 - mulld r26, r7, r7 - mulhdu r27, r7, r7 - mulld r28, r12, r12 - mulhdu r29, r12, r12 - mulld r30, r23, r23 - mulhdu r31, r23, r23 - ld r10, 0(tp) - ld r11, 8(tp) - ld r6, 16(tp) - ld r7, 24(tp) - ld r12, 32(tp) - ld r23, 40(tp) - addi tp, tp, 48 - addc 
r10, r10, r10 - adde r11, r11, r11 - adde r6, r6, r6 - adde r7, r7, r7 - adde r12, r12, r12 - adde r23, r23, r23 - addze climb, r31 - std r24, 0(rp) - addc r10, r10, r25 - std r10, 8(rp) - adde r11, r11, r26 - std r11, 16(rp) - adde r6, r6, r27 - std r6, 24(rp) - adde r7, r7, r28 - std r7, 32(rp) - adde r12, r12, r29 - std r12, 40(rp) - adde r23, r23, r30 - std r23, 48(rp) - addi rp, rp, 56 - bdnz L(top) - b L(end) - -L(b1): ld r6, 0(up) - addi up, up, 8 - mulld r24, r6, r6 - mulhdu climb, r6, r6 - std r24, 0(rp) - addic rp, rp, 8 C clear carry as side-effect - - ALIGN(32) -L(top): ld r6, 0(up) - ld r7, 8(up) - ld r12, 16(up) - ld r23, 24(up) - addi up, up, 32 - mulld r24, r6, r6 - mulhdu r25, r6, r6 - mulld r26, r7, r7 - mulhdu r27, r7, r7 - mulld r28, r12, r12 - mulhdu r29, r12, r12 - mulld r30, r23, r23 - mulhdu r31, r23, r23 - ld r8, 0(tp) - ld r9, 8(tp) - adde r8, r8, r8 - adde r9, r9, r9 - ld r10, 16(tp) - ld r11, 24(tp) - adde r10, r10, r10 - adde r11, r11, r11 - ld r6, 32(tp) - ld r7, 40(tp) - adde r6, r6, r6 - adde r7, r7, r7 - ld r12, 48(tp) - ld r23, 56(tp) - adde r12, r12, r12 - adde r23, r23, r23 - addi tp, tp, 64 - addze r31, r31 - addc r8, r8, climb - std r8, 0(rp) - adde r9, r9, r24 - std r9, 8(rp) - adde r10, r10, r25 - std r10, 16(rp) - adde r11, r11, r26 - std r11, 24(rp) - adde r6, r6, r27 - std r6, 32(rp) - adde r7, r7, r28 - std r7, 40(rp) - adde r12, r12, r29 - std r12, 48(rp) - adde r23, r23, r30 - std r23, 56(rp) - mr climb, r31 - addi rp, rp, 64 - bdnz L(top) - -L(end): addze climb, climb - std climb, 0(rp) - -L(ret): ld r31, -8(r1) - ld r30, -16(r1) - ld r29, -24(r1) - ld r28, -32(r1) - ld r27, -40(r1) - ld r26, -48(r1) - ld r25, -56(r1) - ld r24, -64(r1) - ld r23, -72(r1) - blr -EPILOGUE() -- cgit v1.2.1 From 4b6d13c7c4cac584b8f8391eeaa87f335417ceec Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Fri, 11 Nov 2011 23:37:19 +0100 Subject: *** empty log message *** --- ChangeLog | 4 ++++ 1 file changed, 4 insertions(+) diff --git 
a/ChangeLog b/ChangeLog index 3187619a5..e13e336e0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2011-11-11 Torbjorn Granlund + + * mpn/powerpc64/mode64/sqr_diag_addlsh1.asm: Remove. + 2011-11-11 Niels Möller * tune/hgcd_reduce_2.c: New file. -- cgit v1.2.1 From 890e8c8008d6518223533612dfe95b07db2c696d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niels=20M=C3=B6ller?= Date: Sun, 13 Nov 2011 20:24:47 +0100 Subject: Tweaked tuning setup for hgcd_appr. --- ChangeLog | 6 ++++++ tune/tuneup.c | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index e13e336e0..71c2427e1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2011-11-13 Niels Möller + + * tune/tuneup.c (tune_hgcd_appr): Use default min_size. + (tune_hgcd_reduce): Increase max_size and step_factor, to 7000 + and 0.04, respectively. + 2011-11-11 Torbjorn Granlund * mpn/powerpc64/mode64/sqr_diag_addlsh1.asm: Remove. diff --git a/tune/tuneup.c b/tune/tuneup.c index c62a25362..ce1db103d 100644 --- a/tune/tuneup.c +++ b/tune/tuneup.c @@ -1762,8 +1762,6 @@ tune_hgcd_appr (void) static struct param_t param; param.name = "HGCD_APPR_THRESHOLD"; param.function = speed_mpn_hgcd_appr; - /* We seem to get strange results for small sizes */ - param.min_size = 30; one (&hgcd_appr_threshold, &param); } @@ -1774,6 +1772,8 @@ tune_hgcd_reduce (void) param.name = "HGCD_REDUCE_THRESHOLD"; param.function = speed_mpn_hgcd_reduce; param.min_size = 30; + param.max_size = 7000; + param.step_factor = 0.04; one (&hgcd_reduce_threshold, &param); } -- cgit v1.2.1 From e037315eefee1b249bbe052bfd84c1a1c01c6f72 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Sun, 13 Nov 2011 21:31:57 +0100 Subject: Add support for POWM_SEC_TABLE table.
--- mpn/generic/powm_sec.c | 14 ++++- tune/Makefile.am | 2 +- tune/tuneup.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 147 insertions(+), 4 deletions(-) diff --git a/mpn/generic/powm_sec.c b/mpn/generic/powm_sec.c index 3a6f55403..c6358947b 100644 --- a/mpn/generic/powm_sec.c +++ b/mpn/generic/powm_sec.c @@ -189,15 +189,27 @@ getbits (const mp_limb_t *p, mp_bitcnt_t bi, int nbits) } } +#ifndef POWM_SEC_TABLE +#if GMP_NUMB_BITS < 50 +#define POWM_SEC_TABLE 2,33,96,780,2741 +#else +#define POWM_SEC_TABLE 2,130,524,2578 +#endif +#endif + +#if TUNE_PROGRAM_BUILD +extern int win_size (mp_bitcnt_t); +#else static inline int win_size (mp_bitcnt_t eb) { int k; - static mp_bitcnt_t x[] = {0,4,27,100,325,1026,2905,7848,20457,51670,~(mp_bitcnt_t)0}; + static mp_bitcnt_t x[] = {0,POWM_SEC_TABLE,~(mp_bitcnt_t)0}; for (k = 1; eb > x[k]; k++) ; return k; } +#endif /* Convert U to REDC form, U_r = B^n * U mod M */ static void diff --git a/tune/Makefile.am b/tune/Makefile.am index 117e5ca2c..38b1fe9d2 100644 --- a/tune/Makefile.am +++ b/tune/Makefile.am @@ -132,7 +132,7 @@ TUNE_MPN_SRCS_BASIC = div_qr_2.c bdiv_q.c bdiv_qr.c \ invertappr.c invert.c binvert.c divrem_2.c gcd.c gcdext.c \ get_str.c set_str.c matrix22_mul.c \ hgcd.c hgcd_appr.c hgcd_reduce.c \ - mul_n.c sqr.c \ + mul_n.c sqr.c powm_sec.c \ mullo_n.c mul_fft.c mul.c tdiv_qr.c mulmod_bnm1.c sqrmod_bnm1.c \ mulmid.c mulmid_n.c toom42_mulmid.c \ nussbaumer_mul.c toom6h_mul.c toom8h_mul.c toom6_sqr.c toom8_sqr.c \ diff --git a/tune/tuneup.c b/tune/tuneup.c index ce1db103d..c30d19d6b 100644 --- a/tune/tuneup.c +++ b/tune/tuneup.c @@ -192,7 +192,6 @@ mp_size_t binv_newton_threshold = MP_SIZE_T_MAX; mp_size_t redc_1_to_redc_2_threshold = MP_SIZE_T_MAX; mp_size_t redc_1_to_redc_n_threshold = MP_SIZE_T_MAX; mp_size_t redc_2_to_redc_n_threshold = MP_SIZE_T_MAX; -mp_size_t powm_threshold = MP_SIZE_T_MAX; mp_size_t matrix22_strassen_threshold = MP_SIZE_T_MAX; mp_size_t hgcd_threshold = MP_SIZE_T_MAX; 
mp_size_t hgcd_appr_threshold = MP_SIZE_T_MAX; @@ -1801,6 +1800,134 @@ tune_gcdext_dc (void) one (&gcdext_dc_threshold, &param); } +/* In tune_powm_sec we compute the table used by the win_size function. The + cutoff points are in exponent bits, disregarding other operand sizes. It is + not possible to use the one framework since it currently uses a granularity + of full limbs. +*/ + +/* This win_size replaces the variant in the powm code, allowing us to + control k in the k-ary algorithms. */ +int winsize; +int +win_size (mp_bitcnt_t eb) +{ + return winsize; +} + +void +tune_powm_sec (void) +{ + mp_size_t n; + int k, i; + mp_size_t itch; + mp_bitcnt_t nbits, nbits_next, possible_nbits_cutoff; + const int n_max = 3000 / GMP_NUMB_BITS; + const int n_measurements = 5; + mp_ptr rp, bp, ep, mp, tp; + double ttab[n_measurements], tk, tkp1; + TMP_DECL; + TMP_MARK; + + possible_nbits_cutoff = 0; + + k = 1; + + winsize = 10; /* the itch function needs this */ + itch = mpn_powm_sec_itch (n_max, n_max, n_max); + + rp = TMP_ALLOC_LIMBS (n_max); + bp = TMP_ALLOC_LIMBS (n_max); + ep = TMP_ALLOC_LIMBS (n_max); + mp = TMP_ALLOC_LIMBS (n_max); + tp = TMP_ALLOC_LIMBS (itch); + + mpn_random (bp, n_max); + mpn_random (mp, n_max); + mp[0] |= 1; + +/* How about taking the M operand size into account? + + An operation R=powm(B,E,N) will take time O(log(E)*M(log(N))) (assuming + B = O(M)). + + Using k-ary and no sliding window, the precomputation will need time + O(2^(k-1)*M(log(N))) and the main computation will need O(log(E)*S(N)) + + O(log(E)/k*M(N)), for the squarings, multiplications, respectively. + + An operation R=powm_sec(B,E,N) will take time like powm. + + Using k-ary, the precomputation will need time O(2^k*M(log(N))) and the + main computation will need O(log(E)*S(N)) + O(log(E)/k*M(N)) + + O(log(E)/k*2^k*log(N)), for the squarings, multiplications, and full + table reads, respectively.
*/ + + printf ("#define POWM_SEC_TABLE "); + + for (nbits = 1; nbits <= n_max * GMP_NUMB_BITS; ) + { + n = (nbits - 1) / GMP_NUMB_BITS + 1; + + /* Generate E such that sliding-window for k and k+1 works equally + well/poorly (but sliding is not used in powm_sec, of course). */ + for (i = 0; i < n; i++) + ep[i] = ~CNST_LIMB(0); + + /* Truncate E to be exactly nbits large. */ + if (nbits % GMP_NUMB_BITS != 0) + mpn_rshift (ep, ep, n, GMP_NUMB_BITS - nbits % GMP_NUMB_BITS); + ep[n - 1] |= CNST_LIMB(1) << (nbits - 1) % GMP_NUMB_BITS; + + winsize = k; + for (i = 0; i < n_measurements; i++) + { + speed_starttime (); + mpn_powm_sec (rp, bp, n, ep, n, mp, n, tp); + ttab[i] = speed_endtime (); + } + tk = median (ttab, n_measurements); + + winsize = k + 1; + speed_starttime (); + for (i = 0; i < n_measurements; i++) + { + speed_starttime (); + mpn_powm_sec (rp, bp, n, ep, n, mp, n, tp); + ttab[i] = speed_endtime (); + } + tkp1 = median (ttab, n_measurements); +/* + printf ("testing: %ld, %d", nbits, k, ep[n-1]); + printf (" %10.5f %10.5f\n", tk, tkp1); +*/ + if (tkp1 < tk) + { + if (possible_nbits_cutoff) + { + /* Two consecutive sizes indicate k increase, obey. */ + if (k > 1) + printf (","); + printf ("%ld", (long) possible_nbits_cutoff); + k++; + possible_nbits_cutoff = 0; + } + else + { + /* One measurement indicate k increase, save nbits for further + consideration. */ + possible_nbits_cutoff = nbits; + } + } + else + possible_nbits_cutoff = 0; + + nbits_next = nbits * 65 / 64; + nbits = nbits_next + (nbits_next == nbits); + } + printf ("\n"); + TMP_FREE; +} + /* size_extra==1 reflects the fact that with high Date: Sun, 13 Nov 2011 21:33:48 +0100 Subject: *** empty log message *** --- ChangeLog | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ChangeLog b/ChangeLog index 71c2427e1..262ff7215 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +2011-11-13 Torbjorn Granlund + + * tune/Makefile.am (TUNE_MPN_SRCS_BASIC): Add powm_sec.c. 
+ + * mpn/generic/powm_sec.c (win_size): Use POWM_SEC_TABLE + (POWM_SEC_TABLE): Define default. + + * tune/tuneup.c (tune_powm_sec): New function computing POWM_SEC_TABLE. + (all): Call new function. + + * mpn/generic/powm_sec.c (win_size): Define only when + TUNE_PROGRAM_BUILD is not set. + 2011-11-13 Niels Möller * tune/tuneup.c (tune_hgcd_appr): Use default min_size. -- cgit v1.2.1 From e1d8e2b8173bbd8e9b034722206979eef782df2c Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Tue, 15 Nov 2011 00:49:29 +0100 Subject: Separate mpn_tabselect from mpn_powm_sec and prepare for asm support. --- ChangeLog | 10 ++++++++++ configure.in | 5 ++++- mpn/asm-defs.m4 | 1 + mpn/generic/powm_sec.c | 25 ------------------------- mpn/generic/tabselect.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 63 insertions(+), 26 deletions(-) create mode 100644 mpn/generic/tabselect.c diff --git a/ChangeLog b/ChangeLog index 262ff7215..0491b1574 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +2011-11-15 Torbjorn Granlund + + * mpn/asm-defs.m4 (define_mpn): Add tabselect. + + * configure.in (gmp_mpn_functions): Add tabselect. + (HAVE_NATIVE): Add entries for addncd_n, subcnd_n, tabselect. + + * mpn/generic/powm_sec.c: Remove mpn_tabselect implementation. + * mpn/generic/tabselect.c: New file with removed code. + 2011-11-13 Torbjorn Granlund * tune/Makefile.am (TUNE_MPN_SRCS_BASIC): Add powm_sec.c. 
diff --git a/configure.in b/configure.in index 6c9a313c3..eedab0eca 100644 --- a/configure.in +++ b/configure.in @@ -2640,7 +2640,7 @@ gmp_mpn_functions="$extra_functions \ divexact bdiv_dbm1c redc_1 redc_2 redc_n powm powlo powm_sec \ trialdiv remove \ and_n andn_n nand_n ior_n iorn_n nior_n xor_n xnor_n \ - copyi copyd zero \ + copyi copyd zero tabselect \ $gmp_mpn_functions_optional" define(GMP_MULFUNC_CHOICES, @@ -3103,6 +3103,7 @@ AH_VERBATIM([HAVE_NATIVE], #undef HAVE_NATIVE_mpn_add_n_sub_n #undef HAVE_NATIVE_mpn_add_nc #undef HAVE_NATIVE_mpn_addaddmul_1msb0 +#undef HAVE_NATIVE_mpn_addcnd_n #undef HAVE_NATIVE_mpn_addlsh1_n #undef HAVE_NATIVE_mpn_addlsh2_n #undef HAVE_NATIVE_mpn_addlsh_n @@ -3191,6 +3192,7 @@ AH_VERBATIM([HAVE_NATIVE], #undef HAVE_NATIVE_mpn_sqr_diag_addlsh1 #undef HAVE_NATIVE_mpn_sub_n #undef HAVE_NATIVE_mpn_sub_nc +#undef HAVE_NATIVE_mpn_subcnd_n #undef HAVE_NATIVE_mpn_sublsh1_n #undef HAVE_NATIVE_mpn_sublsh2_n #undef HAVE_NATIVE_mpn_sublsh_n @@ -3204,6 +3206,7 @@ AH_VERBATIM([HAVE_NATIVE], #undef HAVE_NATIVE_mpn_sublsh2_nc_ip1 #undef HAVE_NATIVE_mpn_sublsh_nc_ip1 #undef HAVE_NATIVE_mpn_submul_1c +#undef HAVE_NATIVE_mpn_tabselect #undef HAVE_NATIVE_mpn_udiv_qrnnd #undef HAVE_NATIVE_mpn_udiv_qrnnd_r #undef HAVE_NATIVE_mpn_umul_ppmm diff --git a/mpn/asm-defs.m4 b/mpn/asm-defs.m4 index 4f049b21b..7a5639fbe 100644 --- a/mpn/asm-defs.m4 +++ b/mpn/asm-defs.m4 @@ -1471,6 +1471,7 @@ define_mpn(sub_n) define_mpn(sub_nc) define_mpn(submul_1) define_mpn(submul_1c) +define_mpn(tabselect) define_mpn(umul_ppmm) define_mpn(umul_ppmm_r) define_mpn(udiv_qrnnd) diff --git a/mpn/generic/powm_sec.c b/mpn/generic/powm_sec.c index c6358947b..d7ed2b486 100644 --- a/mpn/generic/powm_sec.c +++ b/mpn/generic/powm_sec.c @@ -320,31 +320,6 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn, TMP_FREE; } -#if ! HAVE_NATIVE_mpn_tabselect -/* Select entry `which' from table `tab', which has nents entries, each `n' - limbs. Store the selected entry at rp. 
Reads entire table to avoid - side-channel information leaks. O(n*nents). - FIXME: Move to its own file. */ -void -mpn_tabselect (volatile mp_limb_t *rp, volatile mp_limb_t *tab, mp_size_t n, - mp_size_t nents, mp_size_t which) -{ - mp_size_t k, i; - mp_limb_t mask; - volatile mp_limb_t *tp; - - for (k = 0; k < nents; k++) - { - mask = -(mp_limb_t) (which == k); - tp = tab + n * k; - for (i = 0; i < n; i++) - { - rp[i] = (rp[i] & ~mask) | (tp[i] & mask); - } - } -} -#endif - mp_size_t mpn_powm_sec_itch (mp_size_t bn, mp_size_t en, mp_size_t n) { diff --git a/mpn/generic/tabselect.c b/mpn/generic/tabselect.c new file mode 100644 index 000000000..02e52fdc0 --- /dev/null +++ b/mpn/generic/tabselect.c @@ -0,0 +1,48 @@ +/* mpn_tabselect. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + +Copyright 2007, 2008, 2009, 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + +#include "gmp.h" +#include "gmp-impl.h" + + +/* Select entry `which' from table `tab', which has nents entries, each `n' + limbs. Store the selected entry at rp. Reads entire table to avoid + side-channel information leaks. O(n*nents). + FIXME: Move to its own file. 
*/ +void +mpn_tabselect (volatile mp_limb_t *rp, volatile mp_limb_t *tab, mp_size_t n, + mp_size_t nents, mp_size_t which) +{ + mp_size_t k, i; + mp_limb_t mask; + volatile mp_limb_t *tp; + + for (k = 0; k < nents; k++) + { + mask = -(mp_limb_t) (which == k); + tp = tab + n * k; + for (i = 0; i < n; i++) + { + rp[i] = (rp[i] & ~mask) | (tp[i] & mask); + } + } +} -- cgit v1.2.1 From aebd2151218bded6e4278834b9f082808eef6590 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Tue, 15 Nov 2011 00:53:06 +0100 Subject: Add mpn_tabselect assembly support for powerpc64, x86, x86_64, ia64. --- ChangeLog | 7 ++- mpn/ia64/tabselect.asm | 139 ++++++++++++++++++++++++++++++++++++++++++++ mpn/powerpc64/tabselect.asm | 95 ++++++++++++++++++++++++++++++ mpn/x86/tabselect.asm | 104 +++++++++++++++++++++++++++++++++ mpn/x86_64/tabselect.asm | 108 ++++++++++++++++++++++++++++++++++ 5 files changed, 452 insertions(+), 1 deletion(-) create mode 100644 mpn/ia64/tabselect.asm create mode 100644 mpn/powerpc64/tabselect.asm create mode 100644 mpn/x86/tabselect.asm create mode 100644 mpn/x86_64/tabselect.asm diff --git a/ChangeLog b/ChangeLog index 0491b1574..b14d2a8da 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,10 +1,15 @@ 2011-11-15 Torbjorn Granlund + * mpn/powerpc64/tabselect.asm: New file. + * mpn/x86_64/tabselect.asm: New file. + * mpn/x86/tabselect.asm: New file. + * mpn/ia64/tabselect.asm: New file. + * mpn/asm-defs.m4 (define_mpn): Add tabselect. * configure.in (gmp_mpn_functions): Add tabselect. (HAVE_NATIVE): Add entries for addncd_n, subcnd_n, tabselect. - + * mpn/generic/powm_sec.c: Remove mpn_tabselect implementation. * mpn/generic/tabselect.c: New file with removed code. diff --git a/mpn/ia64/tabselect.asm b/mpn/ia64/tabselect.asm new file mode 100644 index 000000000..0ae3fdcfe --- /dev/null +++ b/mpn/ia64/tabselect.asm @@ -0,0 +1,139 @@ +dnl IA-64 mpn_tabselect. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: ? +C Itanium 2: 5 (estimated) + +C NOTES +C * Using software pipelining could trivially yield 3 c/l even without +C unrolling. (This code was modelled after the powerpc64 code, for +C simplicity.) + +C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which) +define(`rp', `r32') +define(`tp', `r33') +define(`n', `r34') +define(`nents', `r35') +define(`which', `r36') + +define(`mask', `r8') + +define(`rp1', `r32') +define(`tp1', `r33') +define(`rp2', `r14') +define(`tp2', `r15') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_tabselect) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32',` +.mmi; addp4 rp = 0, rp C M I + addp4 tp = 0, tp C M I + zxt4 n = n C I +.mii; nop 0 + zxt4 nents = nents C I + zxt4 which = which C I + ;; +') +.mmi; add rp2 = 8, rp1 + add tp2 = 8, tp1 + add r6 = -2, n + ;; +.mmi; cmp.eq p10, p0 = 1, n + and r9 = 1, n C set cr0 for use in inner loop + shr.u r6 = r6, 1 C inner loop count + ;; +.mmi; cmp.eq p8, p0 = 0, r9 + sub which = nents, which + shl n = n, 3 + ;; + +L(outer): +.mmi cmp.eq p6, p7 = which, nents C are we at the selected table entry? 
+ nop 0 + mov ar.lc = r6 C I0 + ;; +.mmb; + (p6) mov mask = -1 + (p7) mov mask = 0 + (p8) br.dptk L(top) C branch to loop entry if n even + ;; + +.mmi; ld8 r16 = [tp1], 8 + add tp2 = 8, tp2 + nop 0 + ;; +.mmi; ld8 r18 = [rp1] + and r16 = r16, mask + nop 0 + ;; +.mmi; andcm r18 = r18, mask + ;; + or r16 = r16, r18 + nop 0 + ;; +.mmb; st8 [rp1] = r16, 8 + add rp2 = 8, rp2 + (p10) br.dpnt L(end) + + ALIGN(32) +L(top): +.mmi; ld8 r16 = [tp1], 16 + ld8 r17 = [tp2], 16 + nop 0 + ;; +.mmi; ld8 r18 = [rp1] + and r16 = r16, mask + nop 0 +.mmi; ld8 r19 = [rp2] + and r17 = r17, mask + nop 0 + ;; +.mmi; andcm r18 = r18, mask + andcm r19 = r19, mask + nop 0 + ;; +.mmi; or r16 = r16, r18 + or r17 = r17, r19 + nop 0 + ;; +.mmb; st8 [rp1] = r16, 16 + st8 [rp2] = r17, 16 + br.cloop.dptk L(top) + ;; +L(end): +.mmi; sub rp1 = rp1, n C move rp back to beginning + sub rp2 = rp2, n C move rp back to beginning + cmp.ne p9, p0 = 1, nents +.mmb; add nents = -1, nents + nop 0 + (p9) br.dptk L(outer) + ;; + +.mib; nop 0 + nop 0 + br.ret.sptk.many b0 +EPILOGUE() diff --git a/mpn/powerpc64/tabselect.asm b/mpn/powerpc64/tabselect.asm new file mode 100644 index 000000000..0ac2e9ba0 --- /dev/null +++ b/mpn/powerpc64/tabselect.asm @@ -0,0 +1,95 @@ +dnl PowerPC-64 mpn_tabselect. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. 
+ +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? +C POWER5 ? +C POWER6 ? +C POWER7 ? + +C NOTES +C * This has not been tuned for any specific processor. Its speed should not +C be too bad, though. +C * Using VMX could result in significant speedup for certain CPUs. + +C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which) +define(`rp', `r3') +define(`tp', `r4') +define(`n', `r5') +define(`nents', `r6') +define(`which', `r7') + +define(`mask', `r8') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_tabselect) + addi r0, n, 1 + srdi r0, r0, 1 C inner loop count + andi. r9, n, 1 C set cr0 for use in inner loop + subf which, nents, which + sldi n, n, 3 + +L(outer): + mtctr r0 C put inner loop count in ctr + + add r9, which, nents C are we at the selected table entry? + addic r9, r9, -1 C set CF iff not selected entry + subfe mask, r0, r0 + + beq cr0, L(top) C branch to loop entry if n even + + ld r9, 0(tp) + and r9, r9, mask + ld r11, 0(rp) + andc r11, r11, mask + or r9, r9, r11 + std r9, 0(rp) + addi tp, tp, 8 + addi rp, rp, 8 + bdz L(end) + + ALIGN(16) +L(top): ld r9, 0(tp) + ld r10, 8(tp) + and r9, r9, mask + and r10, r10, mask + ld r11, 0(rp) + ld r12, 8(rp) + andc r11, r11, mask + andc r12, r12, mask + or r9, r9, r11 + or r10, r10, r12 + std r9, 0(rp) + std r10, 8(rp) + addi tp, tp, 16 + addi rp, rp, 16 + bdnz L(top) + +L(end): subf rp, n, rp C move rp back to beginning + addi nents, nents, -1 + cmpdi cr6, nents, 0 + bne cr6, L(outer) + + blr +EPILOGUE() diff --git a/mpn/x86/tabselect.asm b/mpn/x86/tabselect.asm new file mode 100644 index 000000000..ab646dac3 --- /dev/null +++ b/mpn/x86/tabselect.asm @@ -0,0 +1,104 @@ +dnl x86 mpn_tabselect. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P5 ? +C P6 model 0-8,10-12 ? +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) ? +C P4 model 0 (Willamette) ? +C P4 model 1 (?) ? +C P4 model 2 (Northwood) ? +C P4 model 3 (Prescott) ? +C P4 model 4 (Nocona) ? +C Intel Atom ? +C AMD K6 ? +C AMD K7 ? +C AMD K8 ? +C AMD K10 ? + +C NOTES +C * This has not been tuned for any specific processor. Its speed should not +C be too bad, though. +C * Using SSE2 could result in many-fold speedup. 
+ +C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which) +define(`rp', `%edi') +define(`tp', `%esi') +define(`n', `%ebx') +define(`nents', `%ecx') +define(`which', `36(%esp)') + +define(`i', `%ebp') +define(`maskp', `20(%esp)') +define(`maskn', `32(%esp)') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_tabselect) + push %edi + push %esi + push %ebx + push %ebp + mov 20(%esp), rp + mov 24(%esp), tp + mov 28(%esp), n + mov 32(%esp), nents + + lea (rp,n,4), rp + lea (tp,n,4), tp + sub nents, which +L(outer): + mov which, %eax + add nents, %eax + neg %eax C set CF iff 'which' != k + sbb %eax, %eax + mov %eax, maskn + not %eax + mov %eax, maskp + + mov n, i + neg i + + ALIGN(16) +L(top): mov (tp,i,4), %eax + and maskp, %eax + mov (rp,i,4), %edx + and maskn, %edx + or %edx, %eax + mov %eax, (rp,i,4) + inc i + js L(top) + +L(end): mov n, %eax + lea (tp,%eax,4), tp + dec nents + jne L(outer) + +L(outer_end): + pop %ebp + pop %ebx + pop %esi + pop %edi + ret +EPILOGUE() diff --git a/mpn/x86_64/tabselect.asm b/mpn/x86_64/tabselect.asm new file mode 100644 index 000000000..f7de6a85b --- /dev/null +++ b/mpn/x86_64/tabselect.asm @@ -0,0 +1,108 @@ +dnl AMD64 mpn_tabselect. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. 
If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 ? +C Intel NHM ? +C Intel SBR ? +C Intel atom ? +C VIA nano ? + +C NOTES +C * This has not been tuned for any specific processor. Its speed should not +C be too bad, though. +C * Using SSE2/AVX2 could result in many-fold speedup. + +C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which) +define(`rp', `%rdi') +define(`tp', `%rsi') +define(`n', `%rdx') +define(`nents', `%rcx') +define(`which', `%r8') + +define(`i', `%rbp') +define(`maskp', `%r11') +define(`maskn', `%r12') + +C rax rbx rcx rdx rdi rsi rbp (rsp) r8 r9 r10 r11 r12 r13 r14 r15 +C nents n rp tab which + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_tabselect) + push %rbx + push %rbp + push %r12 + + lea (rp,n,8), rp + lea (tp,n,8), tp + sub nents, which +L(outer): + lea (which,nents), %rax + neg %rax C set CF iff 'which' != k + sbb maskn, maskn + mov maskn, maskp + not maskp + + mov n, i + neg i + test $1, R32(n) + je L(top) + mov (tp,i,8), %rax + and maskp, %rax + mov (rp,i,8), %r9 + and maskn, %r9 + or %r9, %rax + mov %rax, (rp,i,8) + add $1, i + jns L(end) + + ALIGN(16) +L(top): mov (tp,i,8), %rax + mov 8(tp,i,8), %rbx + and maskp, %rax + and maskp, %rbx + mov (rp,i,8), %r9 + mov 8(rp,i,8), %r10 + and maskn, %r9 + and maskn, %r10 + or %r9, %rax + or %r10, %rbx + mov %rax, (rp,i,8) + mov %rbx, 8(rp,i,8) + add $2, i + js L(top) + +L(end): lea (tp,n,8), tp + dec nents + jne L(outer) + +L(outer_end): + pop %r12 + pop %rbp + pop %rbx + ret +EPILOGUE() -- cgit v1.2.1 From 4fc9dd5d4647c86ba4d5b08b0a2589f6f6079796 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Tue, 15 Nov 2011 01:33:25 +0100 Subject: Amend 2011-11-03 gcc_cflags change. 
--- ChangeLog | 2 ++ configure.in | 6 ++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index b14d2a8da..64b394b03 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,7 @@ 2011-11-15 Torbjorn Granlund + * configure.in: Amend 2011-11-03 gcc_cflags change. + * mpn/powerpc64/tabselect.asm: New file. * mpn/x86_64/tabselect.asm: New file. * mpn/x86/tabselect.asm: New file. diff --git a/configure.in b/configure.in index eedab0eca..887975c40 100644 --- a/configure.in +++ b/configure.in @@ -875,7 +875,7 @@ case $host in abilist="32" cclist="gcc cc" cc_cflags="-O2" - gcc_cflags="$gcc_cflags -mpowerpc" + gcc_32_cflags="$gcc_cflags -mpowerpc" gcc_cflags_optlist="precomp subtype asm cpu" gcc_cflags_precomp="-no-cpp-precomp" gcc_cflags_subtype="-force_cpusubtype_ALL" # for vmx on darwin @@ -1243,9 +1243,7 @@ case $host in # case $host_cpu in sparc64 | sparcv9* | ultrasparc*) - gcc_cflags="$gcc_cflags -Wa,-xarch=v8plus" ;; - *) - gcc_cflags="$gcc_cflags" ;; + gcc_32_cflags="$gcc_cflags -Wa,-xarch=v8plus" ;; esac gcc_32_cflags_maybe="-m32" gcc_cflags_optlist="cpu" -- cgit v1.2.1 From 8467dfae35b0349c306be952466a6382818d4188 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niels=20M=C3=B6ller?= Date: Tue, 15 Nov 2011 14:01:48 +0100 Subject: Further tweak for HGCD_APPR_THRESHOLD tuning. --- ChangeLog | 5 +++++ tune/tuneup.c | 2 ++ 2 files changed, 7 insertions(+) diff --git a/ChangeLog b/ChangeLog index 64b394b03..9af90086e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2011-11-15 Niels Möller + + * tune/tuneup.c (tune_hgcd_appr): Increased min_size to 50; some + machines got small thresholds which appear to be bogus. + 2011-11-15 Torbjorn Granlund * configure.in: Amend 2011-11-03 gcc_cflags change. 
diff --git a/tune/tuneup.c b/tune/tuneup.c index c30d19d6b..b61729119 100644 --- a/tune/tuneup.c +++ b/tune/tuneup.c @@ -1761,6 +1761,8 @@ tune_hgcd_appr (void) static struct param_t param; param.name = "HGCD_APPR_THRESHOLD"; param.function = speed_mpn_hgcd_appr; + /* We seem to get strange results for small sizes */ + param.min_size = 50; one (&hgcd_appr_threshold, &param); } -- cgit v1.2.1 From 5153cf91d11990dbb6dc0291eb9eee2a6796b089 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niels=20M=C3=B6ller?= Date: Tue, 15 Nov 2011 14:07:40 +0100 Subject: speed support for mpn_hgcd_appr_lehmer. --- ChangeLog | 17 +++++++++++++++++ tune/Makefile.am | 2 +- tune/common.c | 8 +++++++- tune/hgcd_appr_lehmer.c | 29 +++++++++++++++++++++++++++++ tune/speed.c | 1 + tune/speed.h | 7 ++++++- 6 files changed, 61 insertions(+), 3 deletions(-) create mode 100644 tune/hgcd_appr_lehmer.c diff --git a/ChangeLog b/ChangeLog index 9af90086e..7efdb424d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,22 @@ 2011-11-15 Niels Möller + * tune/speed.h (speed_mpn_hgcd_appr_lehmer): New prototype. + (mpn_hgcd_lehmer_itch): Likewise. + (mpn_hgcd_appr_lehmer): Likewise. + (mpn_hgcd_appr_lehmer_itch): Likewise. + (MPN_HGCD_LEHMER_ITCH): Deleted macro. + + * tune/speed.c (routine): Added mpn_hgcd_appr_lehmer. + + * tune/common.c (speed_mpn_hgcd_lehmer): Use mpn_hgcd_lehmer_itch + rather than similarly named macro. + (speed_mpn_hgcd_appr_lehmer): New function. + + * tune/Makefile.am (libspeed_la_SOURCES): Added + hgcd_appr_lehmer.c. + + * tune/hgcd_appr_lehmer.c: New file. + * tune/tuneup.c (tune_hgcd_appr): Increased min_size to 50; some machines got small thresholds which appear to be bogus. 
diff --git a/tune/Makefile.am b/tune/Makefile.am index 38b1fe9d2..646a1f4af 100644 --- a/tune/Makefile.am +++ b/tune/Makefile.am @@ -43,7 +43,7 @@ libspeed_la_SOURCES = \ common.c divrem1div.c divrem1inv.c divrem2div.c divrem2inv.c \ freq.c \ gcdext_single.c gcdext_double.c gcdextod.c gcdextos.c \ - hgcd_lehmer.c hgcd_reduce_1.c hgcd_reduce_2.c \ + hgcd_lehmer.c hgcd_appr_lehmer.c hgcd_reduce_1.c hgcd_reduce_2.c \ jacbase1.c jacbase2.c jacbase3.c jacbase4.c \ mod_1_div.c mod_1_inv.c mod_1_1-1.c mod_1_1-2.c modlinv.c \ noop.c powm_mod.c powm_redc.c pre_divrem_1.c \ diff --git a/tune/common.c b/tune/common.c index cc333a470..9855e8845 100644 --- a/tune/common.c +++ b/tune/common.c @@ -1529,7 +1529,7 @@ speed_mpn_hgcd (struct speed_params *s) double speed_mpn_hgcd_lehmer (struct speed_params *s) { - SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_lehmer, MPN_HGCD_LEHMER_ITCH); + SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_lehmer, mpn_hgcd_lehmer_itch); } double @@ -1538,6 +1538,12 @@ speed_mpn_hgcd_appr (struct speed_params *s) SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_appr, mpn_hgcd_appr_itch); } +double +speed_mpn_hgcd_appr_lehmer (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_appr_lehmer, mpn_hgcd_appr_lehmer_itch); +} + double speed_mpn_hgcd_reduce (struct speed_params *s) { diff --git a/tune/hgcd_appr_lehmer.c b/tune/hgcd_appr_lehmer.c new file mode 100644 index 000000000..18123e951 --- /dev/null +++ b/tune/hgcd_appr_lehmer.c @@ -0,0 +1,29 @@ +/* mpn/generic/hgcd_appr.c forced to use Lehmer's quadratic algorithm. */ + +/* +Copyright 2010, 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + +#include "gmp.h" +#include "gmp-impl.h" + +#undef HGCD_APPR_THRESHOLD +#define HGCD_APPR_THRESHOLD MP_SIZE_T_MAX +#define __gmpn_hgcd_appr mpn_hgcd_appr_lehmer +#define __gmpn_hgcd_appr_itch mpn_hgcd_appr_lehmer_itch + +#include "../mpn/generic/hgcd_appr.c" diff --git a/tune/speed.c b/tune/speed.c index 08c13e776..cffed35b6 100644 --- a/tune/speed.c +++ b/tune/speed.c @@ -278,6 +278,7 @@ const struct routine_t { { "mpn_hgcd", speed_mpn_hgcd }, { "mpn_hgcd_lehmer", speed_mpn_hgcd_lehmer }, { "mpn_hgcd_appr", speed_mpn_hgcd_appr }, + { "mpn_hgcd_appr_lehmer", speed_mpn_hgcd_appr_lehmer }, { "mpn_hgcd_reduce", speed_mpn_hgcd_reduce }, { "mpn_hgcd_reduce_1", speed_mpn_hgcd_reduce_1 }, diff --git a/tune/speed.h b/tune/speed.h index 5add58720..329c09783 100644 --- a/tune/speed.h +++ b/tune/speed.h @@ -198,6 +198,7 @@ double speed_mpn_matrix22_mul __GMP_PROTO ((struct speed_params *s)); double speed_mpn_hgcd __GMP_PROTO ((struct speed_params *s)); double speed_mpn_hgcd_lehmer __GMP_PROTO ((struct speed_params *s)); double speed_mpn_hgcd_appr __GMP_PROTO ((struct speed_params *s)); +double speed_mpn_hgcd_appr_lehmer __GMP_PROTO ((struct speed_params *s)); double speed_mpn_hgcd_reduce __GMP_PROTO ((struct speed_params *s)); double speed_mpn_hgcd_reduce_1 __GMP_PROTO ((struct speed_params *s)); double speed_mpn_hgcd_reduce_2 __GMP_PROTO ((struct speed_params *s)); @@ -489,7 +490,11 @@ mp_size_t mpn_gcdext_double __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t)); mp_size_t mpn_hgcd_lehmer __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, struct 
hgcd_matrix *, mp_ptr)); -#define MPN_HGCD_LEHMER_ITCH(n) (n) +mp_size_t mpn_hgcd_lehmer_itch __GMP_PROTO ((mp_size_t)); + +mp_size_t mpn_hgcd_appr_lehmer + __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr)); +mp_size_t mpn_hgcd_appr_lehmer_itch __GMP_PROTO ((mp_size_t)); mp_size_t mpn_hgcd_reduce_1 __GMP_PROTO ((struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr)); -- cgit v1.2.1 From 1d309e7a2dbaa08258bd681c59cd19b73d67e03a Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Tue, 15 Nov 2011 20:36:09 +0100 Subject: Rewrite mpn/generic/powm_sec.c. --- ChangeLog | 10 ++++++++ mpn/generic/powm_sec.c | 69 ++++++++++++++++++++++++++++++++------------------ 2 files changed, 54 insertions(+), 25 deletions(-) diff --git a/ChangeLog b/ChangeLog index 64b394b03..d329c7233 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,15 @@ 2011-11-15 Torbjorn Granlund + * mpn/generic/powm_sec.c (mpn_local_sqr): Remove forgotten TMP_* calls. + (redcify): Likewise. + (mpn_powm_sec): Likewise. + + * mpn/generic/powm_sec.c (mpn_powm_sec): Rework scratch usage + (mpn_powm_sec_itch): Rewrite. + + * mpn/generic/powm_sec.c (mpn_powm_sec): Use mpn_tabselect also in + initialisation. + * configure.in: Amend 2011-11-03 gcc_cflags change. * mpn/powerpc64/tabselect.asm: New file. diff --git a/mpn/generic/powm_sec.c b/mpn/generic/powm_sec.c index d7ed2b486..24bb83de3 100644 --- a/mpn/generic/powm_sec.c +++ b/mpn/generic/powm_sec.c @@ -133,8 +133,6 @@ mpn_local_sqr (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr tp) if (n > 1) { mp_limb_t cy; - TMP_DECL; - TMP_MARK; cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]); tp[n - 1] = cy; @@ -156,8 +154,6 @@ mpn_local_sqr (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr tp) #endif rp[2 * n - 1] += cy; } - - TMP_FREE; } } #endif @@ -211,26 +207,24 @@ win_size (mp_bitcnt_t eb) } #endif -/* Convert U to REDC form, U_r = B^n * U mod M */ +/* Convert U to REDC form, U_r = B^n * U mod M. + Uses scratch space at tp of size 2un + n + 1. 
*/ static void redcify (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr mp, mp_size_t n, mp_ptr tp) { mp_ptr qp; - TMP_DECL; - TMP_MARK; - qp = tp + un + n; + qp = tp + un + n; /* un + n - n + 1 = un + 1 limbs */ MPN_ZERO (tp, n); MPN_COPY (tp + n, up, un); mpn_tdiv_qr (qp, rp, 0L, tp, un + n, mp, n); - TMP_FREE; } /* rp[n-1..0] = bp[bn-1..0] ^ ep[en-1..0] mod mp[n-1..0] - Requires that mp[n-1..0] is odd. FIXME: is this true? - Requires that ep[en-1..0] is > 1. - Uses scratch space at tp of 3n+1 limbs. */ + Requires that mp[n-1..0] is odd. + Requires that ep[en-1..0] > 1. + Uses scratch space at tp as defined by mpn_powm_sec_itch. */ void mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn, mp_srcptr ep, mp_size_t en, @@ -244,13 +238,10 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn, mp_ptr pp, this_pp; long i; int cnd; - TMP_DECL; ASSERT (en > 1 || (en == 1 && ep[0] > 0)); ASSERT (n >= 1 && ((mp[0] & 1) != 0)); - TMP_MARK; - count_leading_zeros (cnt, ep[en - 1]); ebi = (mp_bitcnt_t) en * GMP_LIMB_BITS - cnt; @@ -259,15 +250,27 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn, binvert_limb (minv, mp[0]); minv = -minv; - pp = tp + 4 * n; + pp = tp; + tp += (n << windowsize); /* put tp after power table */ + /* Compute pp[0] table entry */ + /* scratch: | n | 1 | n+2 | */ + /* | pp[0] | 1 | redcify | */ this_pp = pp; this_pp[n] = 1; - redcify (this_pp, this_pp + n, 1, mp, n, tp + 6 * n); + redcify (this_pp, this_pp + n, 1, mp, n, this_pp + n + 1); this_pp += n; - redcify (this_pp, bp, bn, mp, n, tp + 6 * n); + + /* Compute pp[1] table entry. To avoid excessive scratch usage in the + degenerate situation where B >> M, we let redcify use scratch space which + will later be used by the pp table (element 2 and up). */ + /* scratch: | n | n | bn + n + 1 | */ + /* | pp[0] | pp[1] | redcify | */ + redcify (this_pp, bp, bn, mp, n, this_pp + n); /* Precompute powers of b and put them in the temporary area at pp. */ + /* scratch: | n | n | ... 
| | 2n | */ + /* | pp[0] | pp[1] | ... | pp[2^windowsize-1] | product | */ for (i = (1 << windowsize) - 2; i > 0; i--) { mpn_mul_basecase (tp, this_pp, n, pp + n, n); @@ -281,8 +284,15 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn, else ebi -= windowsize; +#if WANT_CACHE_SECURITY + mpn_tabselect (rp, pp, n, 1 << windowsize, expbits); +#else MPN_COPY (rp, pp + n * expbits, n); +#endif + /* Main exponentiation loop. */ + /* scratch: | n | n | ... | | 3n-4n | */ + /* | pp[0] | pp[1] | ... | pp[2^windowsize-1] | loop scratch | */ while (ebi != 0) { expbits = getbits (ep, ebi, windowsize); @@ -317,7 +327,6 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn, MPN_REDC_1_SEC (rp, tp, mp, n, minv); cnd = mpn_sub_n (tp, rp, mp, n); /* we need just retval */ mpn_subcnd_n (rp, rp, mp, n, !cnd); - TMP_FREE; } mp_size_t @@ -326,10 +335,20 @@ mpn_powm_sec_itch (mp_size_t bn, mp_size_t en, mp_size_t n) int windowsize; mp_size_t redcify_itch, itch; - windowsize = win_size (en * GMP_NUMB_BITS); /* slight over-estimate of exp */ - itch = 4 * n + (n << windowsize); - redcify_itch = 2 * bn + n + 1; - /* The 6n is due to the placement of reduce scratch 6n into the start of the - scratch area. */ - return MAX (itch, redcify_itch + 6 * n); + /* The top scratch usage will either be when reducing B in the 2nd redcify + call, or more typically n*2^windowsize + 3n or 4n, in the main loop. (It + is 3n or 4n depending on if we use mpn_local_sqr or a native + mpn_sqr_basecase. We assume 4n always for now.) */ + + windowsize = win_size (en * GMP_LIMB_BITS); /* slight over-estimate of exp */ + + /* The 2n term is due to pp[0] and pp[1] at the time of the 2nd redcify call, + the 2bn + n + 1 term is due to redcify's own usage. */ + redcify_itch = (2 * n) + (2 * bn + n + 1); + + /* The n * 2^windowsize term is due to the power table, the 4n term is due to + scratch needs of squaring/multiplication in the exponentiation loop. 
*/ + itch = (n << windowsize) + (4 * n); + + return MAX (itch, redcify_itch); } -- cgit v1.2.1 From 4187da90eab6dd83437babffd6c845501de64a1d Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Wed, 16 Nov 2011 21:46:58 +0100 Subject: New file. --- ChangeLog | 4 + mpn/powerpc64/mode64/aorscnd_n.asm | 185 +++++++++++++++++++++++++++++++++++++ 2 files changed, 189 insertions(+) create mode 100644 mpn/powerpc64/mode64/aorscnd_n.asm diff --git a/ChangeLog b/ChangeLog index 89d9ebd45..6cfc78d1c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2011-11-16 Torbjorn Granlund + + * mpn/powerpc64/mode64/aorscnd_n.asm: New file. + 2011-11-15 Niels Möller * tune/speed.h (speed_mpn_hgcd_appr_lehmer): New prototype. diff --git a/mpn/powerpc64/mode64/aorscnd_n.asm b/mpn/powerpc64/mode64/aorscnd_n.asm new file mode 100644 index 000000000..47aa6fb39 --- /dev/null +++ b/mpn/powerpc64/mode64/aorscnd_n.asm @@ -0,0 +1,185 @@ +dnl PowerPC-64 mpn_addcnd_n/mpn_subcnd_n. + +dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007, 2011 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 2.25 +C POWER5 ? +C POWER6 ? +C POWER7 ? 
+ +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`vp', `r5') +define(`n', `r6') +define(`cnd', `r7') + +ifdef(`OPERATION_addcnd_n',` + define(ADDSUBC, adde) + define(ADDSUB, addc) + define(func, mpn_addcnd_n) + define(GENRVAL, `addi r3, r3, 1') + define(SETCBR, `addic r0, $1, -1') + define(CLRCB, `addic r0, r0, 0') +') +ifdef(`OPERATION_subcnd_n',` + define(ADDSUBC, subfe) + define(ADDSUB, subfc) + define(func, mpn_subcnd_n) + define(GENRVAL, `neg r3, r3') + define(SETCBR, `subfic r0, $1, 0') + define(CLRCB, `addic r0, r1, -1') +') + +MULFUNC_PROLOGUE(mpn_addcnd_n mpn_subcnd_n) + +ASM_START() +PROLOGUE(func) + std r31, -8(r1) + std r30, -16(r1) + std r29, -24(r1) + std r28, -32(r1) + std r27, -40(r1) + + subfic cnd, cnd, 0 + subfe cnd, cnd, cnd + + rldicl. r0, r6, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addi r6, r6, 3 C compute count... + srdi r6, r6, 2 C ...for ctr + mtctr r6 C copy count into ctr + beq cr0, L(b00) + blt cr6, L(b01) + beq cr6, L(b10) + +L(b11): ld r8, 0(up) C load s1 limb + ld r9, 0(vp) C load s2 limb + ld r10, 8(up) C load s1 limb + ld r11, 8(vp) C load s2 limb + ld r12, 16(up) C load s1 limb + addi up, up, 24 + ld r0, 16(vp) C load s2 limb + addi vp, vp, 24 + and r9, r9, cnd + and r11, r11, cnd + and r0, r0, cnd + ADDSUB r29, r9, r8 + ADDSUBC r30, r11, r10 + ADDSUBC r31, r0, r12 + std r29, 0(rp) + std r30, 8(rp) + std r31, 16(rp) + addi rp, rp, 24 + bdnz L(go) + b L(ret) + +L(b01): ld r12, 0(up) C load s1 limb + addi up, up, 8 + ld r0, 0(vp) C load s2 limb + addi vp, vp, 8 + and r0, r0, cnd + ADDSUB r31, r0, r12 C add + std r31, 0(rp) + addi rp, rp, 8 + bdnz L(go) + b L(ret) + +L(b10): ld r10, 0(up) C load s1 limb + ld r11, 0(vp) C load s2 limb + ld r12, 8(up) C load s1 limb + addi up, up, 16 + ld r0, 8(vp) C load s2 limb + addi vp, vp, 16 + and r11, r11, cnd + and r0, r0, cnd + ADDSUB r30, r11, r10 C add + ADDSUBC r31, r0, r12 C add + std r30, 0(rp) + std r31, 8(rp) + addi rp, rp, 16 + bdnz L(go) + b L(ret) + 
+L(b00): CLRCB C clear/set cy +L(go): ld r6, 0(up) C load s1 limb + ld r27, 0(vp) C load s2 limb + ld r8, 8(up) C load s1 limb + ld r9, 8(vp) C load s2 limb + ld r10, 16(up) C load s1 limb + ld r11, 16(vp) C load s2 limb + ld r12, 24(up) C load s1 limb + ld r0, 24(vp) C load s2 limb + and r27, r27, cnd + and r9, r9, cnd + and r11, r11, cnd + and r0, r0, cnd + bdz L(end) + + addi up, up, 32 + addi vp, vp, 32 + +L(top): ADDSUBC r28, r27, r6 + ld r6, 0(up) C load s1 limb + ld r27, 0(vp) C load s2 limb + ADDSUBC r29, r9, r8 + ld r8, 8(up) C load s1 limb + ld r9, 8(vp) C load s2 limb + ADDSUBC r30, r11, r10 + ld r10, 16(up) C load s1 limb + ld r11, 16(vp) C load s2 limb + ADDSUBC r31, r0, r12 + ld r12, 24(up) C load s1 limb + ld r0, 24(vp) C load s2 limb + std r28, 0(rp) + addi up, up, 32 + std r29, 8(rp) + addi vp, vp, 32 + std r30, 16(rp) + std r31, 24(rp) + addi rp, rp, 32 + and r27, r27, cnd + and r9, r9, cnd + and r11, r11, cnd + and r0, r0, cnd + bdnz L(top) C decrement ctr and loop back + +L(end): ADDSUBC r28, r27, r6 + ADDSUBC r29, r9, r8 + ADDSUBC r30, r11, r10 + ADDSUBC r31, r0, r12 + std r28, 0(rp) + std r29, 8(rp) + std r30, 16(rp) + std r31, 24(rp) + +L(ret): ld r31, -8(r1) + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + ld r27, -40(r1) + + subfe r3, r0, r0 C -cy + GENRVAL + blr +EPILOGUE() -- cgit v1.2.1 From e143b1a779b0a2f13627758436f3ee6d3103f39d Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Wed, 16 Nov 2011 21:49:38 +0100 Subject: Slight tweak of new code. --- mpn/powerpc64/tabselect.asm | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mpn/powerpc64/tabselect.asm b/mpn/powerpc64/tabselect.asm index 0ac2e9ba0..7d189388b 100644 --- a/mpn/powerpc64/tabselect.asm +++ b/mpn/powerpc64/tabselect.asm @@ -21,10 +21,10 @@ include(`../config.m4') C cycles/limb C POWER3/PPC630 ? -C POWER4/PPC970 ? +C POWER4/PPC970 3.3 C POWER5 ? C POWER6 ? -C POWER7 ? 
+C POWER7 2.5 C NOTES C * This has not been tuned for any specific processor. Its speed should not @@ -60,18 +60,20 @@ L(outer): beq cr0, L(top) C branch to loop entry if n even ld r9, 0(tp) + addi tp, tp, 8 and r9, r9, mask ld r11, 0(rp) andc r11, r11, mask or r9, r9, r11 std r9, 0(rp) - addi tp, tp, 8 addi rp, rp, 8 bdz L(end) ALIGN(16) L(top): ld r9, 0(tp) ld r10, 8(tp) + addi tp, tp, 16 + nop and r9, r9, mask and r10, r10, mask ld r11, 0(rp) @@ -82,13 +84,12 @@ L(top): ld r9, 0(tp) or r10, r10, r12 std r9, 0(rp) std r10, 8(rp) - addi tp, tp, 16 addi rp, rp, 16 bdnz L(top) L(end): subf rp, n, rp C move rp back to beginning + cmpdi cr6, nents, 1 addi nents, nents, -1 - cmpdi cr6, nents, 0 bne cr6, L(outer) blr -- cgit v1.2.1 From 380a7c946b6837264f7a34c2cb20b1ed0cc7f967 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Wed, 16 Nov 2011 21:50:51 +0100 Subject: Add cycle counts. --- mpn/ia64/tabselect.asm | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mpn/ia64/tabselect.asm b/mpn/ia64/tabselect.asm index 0ae3fdcfe..cc5b49b04 100644 --- a/mpn/ia64/tabselect.asm +++ b/mpn/ia64/tabselect.asm @@ -21,12 +21,12 @@ include(`../config.m4') C cycles/limb C Itanium: ? -C Itanium 2: 5 (estimated) +C Itanium 2: 2.5 C NOTES -C * Using software pipelining could trivially yield 3 c/l even without -C unrolling. (This code was modelled after the powerpc64 code, for -C simplicity.) +C * Using software pipelining could trivially yield 2 c/l without unrolling, +C or 1+epsilon with unrolling. (This code was modelled after the powerpc64 +C code, for simplicity.) C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which) define(`rp', `r32') -- cgit v1.2.1 From 3b86e6c687af5ab4aeb153bc5ca1bb4a0ccd7759 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Wed, 16 Nov 2011 21:51:17 +0100 Subject: Add cycle counts. 
--- mpn/x86_64/tabselect.asm | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mpn/x86_64/tabselect.asm b/mpn/x86_64/tabselect.asm index f7de6a85b..ca475942b 100644 --- a/mpn/x86_64/tabselect.asm +++ b/mpn/x86_64/tabselect.asm @@ -21,14 +21,14 @@ include(`../config.m4') C cycles/limb -C AMD K8,K9 ? -C AMD K10 ? -C Intel P4 ? -C Intel core2 ? -C Intel NHM ? -C Intel SBR ? +C AMD K8,K9 2.5 +C AMD K10 2.5 +C Intel P4 4 +C Intel core2 2.3 +C Intel NHM 2.5 +C Intel SBR 2.2 C Intel atom ? -C VIA nano ? +C VIA nano 3.5 C NOTES C * This has not been tuned for any specific processor. Its speed should not -- cgit v1.2.1 From 27057444042708cc07e7f2959af63076c042065b Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Wed, 16 Nov 2011 21:55:23 +0100 Subject: New file. --- ChangeLog | 2 + mpn/powerpc32/tabselect.asm | 98 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 mpn/powerpc32/tabselect.asm diff --git a/ChangeLog b/ChangeLog index 6cfc78d1c..9d2e0c041 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,7 @@ 2011-11-16 Torbjorn Granlund + * mpn/powerpc32/tabselect.asm: New file. + * mpn/powerpc64/mode64/aorscnd_n.asm: New file. 2011-11-15 Niels Möller diff --git a/mpn/powerpc32/tabselect.asm b/mpn/powerpc32/tabselect.asm new file mode 100644 index 000000000..b12fecd12 --- /dev/null +++ b/mpn/powerpc32/tabselect.asm @@ -0,0 +1,98 @@ +dnl PowerPC-32 mpn_tabselect. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. 
+ +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 603e: ? +C 604e: ? +C 75x (G3): ? +C 7400,7410 (G4): ? +C 744x,745x (G4+): ? +C power4/ppc970: 3.3 +C power5: ? + +C NOTES +C * This has not been tuned for any specific processor. Its speed should not +C be too bad, though. +C * Using VMX could result in significant speedup for certain CPUs. + +C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which) +define(`rp', `r3') +define(`tp', `r4') +define(`n', `r5') +define(`nents', `r6') +define(`which', `r7') + +define(`mask', `r8') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_tabselect) + addi r0, n, 1 + srdi r0, r0, 1 C inner loop count + andi. r9, n, 1 C set cr0 for use in inner loop + subf which, nents, which + sldi n, n, 2 + +L(outer): + mtctr r0 C put inner loop count in ctr + + add r9, which, nents C are we at the selected table entry? 
+ addic r9, r9, -1 C set CF iff not selected entry + subfe mask, r0, r0 + + beq cr0, L(top) C branch to loop entry if n even + + lwz r9, 0(tp) + addi tp, tp, 4 + and r9, r9, mask + lwz r11, 0(rp) + andc r11, r11, mask + or r9, r9, r11 + stw r9, 0(rp) + addi rp, rp, 4 + bdz L(end) + + ALIGN(16) +L(top): lwz r9, 0(tp) + lwz r10, 4(tp) + addi tp, tp, 8 + nop + and r9, r9, mask + and r10, r10, mask + lwz r11, 0(rp) + lwz r12, 4(rp) + andc r11, r11, mask + andc r12, r12, mask + or r9, r9, r11 + or r10, r10, r12 + stw r9, 0(rp) + stw r10, 4(rp) + addi rp, rp, 8 + bdnz L(top) + +L(end): subf rp, n, rp C move rp back to beginning + cmpdi cr6, nents, 1 + addi nents, nents, -1 + bne cr6, L(outer) + + blr +EPILOGUE() -- cgit v1.2.1 From 4aa30987572b144b7606af6394aadf1efdc7a65c Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Thu, 17 Nov 2011 09:02:17 +0100 Subject: Use 32-bit insn forms. --- mpn/powerpc32/tabselect.asm | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mpn/powerpc32/tabselect.asm b/mpn/powerpc32/tabselect.asm index b12fecd12..155a7b495 100644 --- a/mpn/powerpc32/tabselect.asm +++ b/mpn/powerpc32/tabselect.asm @@ -47,10 +47,10 @@ ASM_START() ALIGN(16) PROLOGUE(mpn_tabselect) addi r0, n, 1 - srdi r0, r0, 1 C inner loop count + srwi r0, r0, 1 C inner loop count andi. 
r9, n, 1 C set cr0 for use in inner loop subf which, nents, which - sldi n, n, 2 + slwi n, n, 2 L(outer): mtctr r0 C put inner loop count in ctr @@ -90,7 +90,7 @@ L(top): lwz r9, 0(tp) bdnz L(top) L(end): subf rp, n, rp C move rp back to beginning - cmpdi cr6, nents, 1 + cmpwi cr6, nents, 1 addi nents, nents, -1 bne cr6, L(outer) -- cgit v1.2.1 From 63f97805b47041bbd20aca91e30542de25276c39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niels=20M=C3=B6ller?= Date: Thu, 17 Nov 2011 15:18:45 +0100 Subject: Another tweak to mpn_hgcd_appr tuning --- ChangeLog | 4 ++++ tune/tuneup.c | 1 + 2 files changed, 5 insertions(+) diff --git a/ChangeLog b/ChangeLog index 9d2e0c041..758ec78ae 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2011-11-17 Niels Möller + + * tune/tuneup.c (tune_hgcd_appr): Increase stop_since_change. + 2011-11-16 Torbjorn Granlund * mpn/powerpc32/tabselect.asm: New file. diff --git a/tune/tuneup.c b/tune/tuneup.c index b61729119..444e5e429 100644 --- a/tune/tuneup.c +++ b/tune/tuneup.c @@ -1763,6 +1763,7 @@ tune_hgcd_appr (void) param.function = speed_mpn_hgcd_appr; /* We seem to get strange results for small sizes */ param.min_size = 50; + param.stop_since_change = 150; one (&hgcd_appr_threshold, &param); } -- cgit v1.2.1 From ca20b2f018c660e83322ff0bd1a3a3a2f9874bb1 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Thu, 17 Nov 2011 21:24:47 +0100 Subject: Add speed measurement for mpn_tabselect. --- ChangeLog | 9 +++++++++ tune/common.c | 5 +++++ tune/speed.c | 3 ++- tune/speed.h | 10 ++++++++-- 4 files changed, 24 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index 758ec78ae..658930906 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +2011-11-17 Torbjorn Granlund + + * tune/speed.c (routine): Add mpn_tabselect. + * tune/common.c (speed_mpn_tabselect): New function. + * tune/speed.h (SPEED_ROUTINE_MPN_COPY_CALL): New macro, made from + old SPEED_ROUTINE_MPN_COPY. 
+ (SPEED_ROUTINE_MPN_COPY): Just invoke SPEED_ROUTINE_MPN_COPY_CALL. + (SPEED_ROUTINE_MPN_TABSELECT): New macro. + 2011-11-17 Niels Möller * tune/tuneup.c (tune_hgcd_appr): Increase stop_since_change. diff --git a/tune/common.c b/tune/common.c index 9855e8845..88f0099e8 100644 --- a/tune/common.c +++ b/tune/common.c @@ -461,6 +461,11 @@ speed_mpn_com (struct speed_params *s) { SPEED_ROUTINE_MPN_COPY (mpn_com); } +double +speed_mpn_tabselect (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TABSELECT (mpn_tabselect); +} double diff --git a/tune/speed.c b/tune/speed.c index cffed35b6..704d82772 100644 --- a/tune/speed.c +++ b/tune/speed.c @@ -283,7 +283,7 @@ const struct routine_t { { "mpn_hgcd_reduce", speed_mpn_hgcd_reduce }, { "mpn_hgcd_reduce_1", speed_mpn_hgcd_reduce_1 }, { "mpn_hgcd_reduce_2", speed_mpn_hgcd_reduce_2 }, - + { "mpn_gcd_1", speed_mpn_gcd_1, FLAG_R_OPTIONAL }, { "mpn_gcd_1N", speed_mpn_gcd_1N, FLAG_R_OPTIONAL }, @@ -412,6 +412,7 @@ const struct routine_t { #if HAVE_NATIVE_mpn_copyd { "mpn_copyd", speed_mpn_copyd }, #endif + { "mpn_tabselect", speed_mpn_tabselect, FLAG_R_OPTIONAL }, #if HAVE_NATIVE_mpn_addlsh1_n { "mpn_addlsh1_n", speed_mpn_addlsh1_n, FLAG_R_OPTIONAL }, #endif diff --git a/tune/speed.h b/tune/speed.h index 329c09783..20daad2dd 100644 --- a/tune/speed.h +++ b/tune/speed.h @@ -175,6 +175,7 @@ double speed_mpn_copyi __GMP_PROTO ((struct speed_params *s)); double speed_MPN_COPY __GMP_PROTO ((struct speed_params *s)); double speed_MPN_COPY_DECR __GMP_PROTO ((struct speed_params *s)); double speed_MPN_COPY_INCR __GMP_PROTO ((struct speed_params *s)); +double speed_mpn_tabselect __GMP_PROTO ((struct speed_params *s)); double speed_mpn_divexact_1 __GMP_PROTO ((struct speed_params *s)); double speed_mpn_divexact_by3 __GMP_PROTO ((struct speed_params *s)); double speed_mpn_bdiv_q_1 __GMP_PROTO ((struct speed_params *)); @@ -613,7 +614,7 @@ int speed_routine_count_zeros_setup #define SPEED_RESTRICT_COND(cond) if (!(cond)) return -1.0; /* For 
mpn_copy or similar. */ -#define SPEED_ROUTINE_MPN_COPY(function) \ +#define SPEED_ROUTINE_MPN_COPY_CALL(call) \ { \ mp_ptr wp; \ unsigned i; \ @@ -632,13 +633,18 @@ int speed_routine_count_zeros_setup speed_starttime (); \ i = s->reps; \ do \ - function (wp, s->xp, s->size); \ + call; \ while (--i != 0); \ t = speed_endtime (); \ \ TMP_FREE; \ return t; \ } +#define SPEED_ROUTINE_MPN_COPY(function) \ + SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size)) + +#define SPEED_ROUTINE_MPN_TABSELECT(function) \ + SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size, 1, s->r)) #define SPEED_ROUTINE_MPN_COPYC(function) \ { \ -- cgit v1.2.1 From 342bc281b1d2e59520f99be86120b177606f43f3 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Thu, 17 Nov 2011 22:19:28 +0100 Subject: Retune, adding several new THRESHOLDs. --- mpn/alpha/ev5/gmp-mparam.h | 40 ++++++---- mpn/alpha/ev6/gmp-mparam.h | 76 ++++++++++-------- mpn/ia64/gmp-mparam.h | 142 ++++++++++++++++++++++------------ mpn/pa64/gmp-mparam.h | 62 ++++++++------- mpn/powerpc64/mode64/p4/gmp-mparam.h | 31 +++++--- mpn/powerpc64/mode64/p5/gmp-mparam.h | 39 ++++++---- mpn/powerpc64/mode64/p6/gmp-mparam.h | 37 +++++---- mpn/powerpc64/mode64/p7/gmp-mparam.h | 32 ++++---- mpn/s390_64/gmp-mparam.h | 16 ++-- mpn/sparc64/ultrasparc34/gmp-mparam.h | 29 ++++--- mpn/sparc64/ultrasparct1/gmp-mparam.h | 36 +++++---- mpn/x86/atom/gmp-mparam.h | 41 ++++++---- mpn/x86/k7/gmp-mparam.h | 45 ++++++----- mpn/x86/p6/sse2/gmp-mparam.h | 61 ++++++++------- mpn/x86/pentium4/sse2/gmp-mparam.h | 85 ++++++++++---------- mpn/x86_64/atom/gmp-mparam.h | 17 ++-- mpn/x86_64/bobcat/gmp-mparam.h | 10 ++- mpn/x86_64/core2/gmp-mparam.h | 23 +++--- mpn/x86_64/coreinhm/gmp-mparam.h | 23 +++--- mpn/x86_64/coreisbr/gmp-mparam.h | 132 ++++++++++++++++++++++--------- mpn/x86_64/gmp-mparam.h | 13 +++- mpn/x86_64/nano/gmp-mparam.h | 33 ++++---- mpn/x86_64/pentium4/gmp-mparam.h | 51 ++++++------ 23 files changed, 659 insertions(+), 415 deletions(-) 
diff --git a/mpn/alpha/ev5/gmp-mparam.h b/mpn/alpha/ev5/gmp-mparam.h index a4c794838..395353a46 100644 --- a/mpn/alpha/ev5/gmp-mparam.h +++ b/mpn/alpha/ev5/gmp-mparam.h @@ -26,38 +26,44 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ #define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 2 #define MOD_1_NORM_THRESHOLD 0 /* always */ #define MOD_1_UNNORM_THRESHOLD 0 /* always */ -#define MOD_1N_TO_MOD_1_1_THRESHOLD 29 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 6 #define MOD_1U_TO_MOD_1_1_THRESHOLD 2 -#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 /* never mpn_mod_1_1p */ +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 4 #define MOD_1_2_TO_MOD_1_4_THRESHOLD 14 -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 75 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 15 #define USE_PREINV_DIVREM_1 1 /* preinv always */ +#define DIV_QR_2_PI2_THRESHOLD 21 #define DIVEXACT_1_THRESHOLD 0 /* always */ -#define BMOD_1_TO_MOD_1_THRESHOLD 80 +#define BMOD_1_TO_MOD_1_THRESHOLD 78 -#define MUL_TOOM22_THRESHOLD 18 -#define MUL_TOOM33_THRESHOLD 61 -#define MUL_TOOM44_THRESHOLD 88 +#define MUL_TOOM22_THRESHOLD 14 +#define MUL_TOOM33_THRESHOLD 57 +#define MUL_TOOM44_THRESHOLD 118 #define MUL_TOOM6H_THRESHOLD 173 -#define MUL_TOOM8H_THRESHOLD 0 +#define MUL_TOOM8H_THRESHOLD 240 #define MUL_TOOM32_TO_TOOM43_THRESHOLD 57 #define MUL_TOOM32_TO_TOOM53_THRESHOLD 91 -#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89 -#define MUL_TOOM42_TO_TOOM63_THRESHOLD 60 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 81 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 56 #define SQR_BASECASE_THRESHOLD 4 #define SQR_TOOM2_THRESHOLD 28 -#define SQR_TOOM3_THRESHOLD 65 +#define SQR_TOOM3_THRESHOLD 77 #define SQR_TOOM4_THRESHOLD 136 -#define SQR_TOOM6_THRESHOLD 180 -#define SQR_TOOM8_THRESHOLD 248 +#define SQR_TOOM6_THRESHOLD 173 +#define SQR_TOOM8_THRESHOLD 260 + +#define MULMID_TOOM42_THRESHOLD 20 #define MULMOD_BNM1_THRESHOLD 11 #define SQRMOD_BNM1_THRESHOLD 13 +#define 
POWM_SEC_TABLE 2,17,322,387 + #define MUL_FFT_MODF_THRESHOLD 244 /* k = 5 */ #define MUL_FFT_TABLE3 \ { { 244, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \ @@ -161,9 +167,11 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MU_BDIV_Q_THRESHOLD 942 #define MATRIX22_STRASSEN_THRESHOLD 13 -#define HGCD_THRESHOLD 101 -#define GCD_DC_THRESHOLD 330 -#define GCDEXT_DC_THRESHOLD 222 +#define HGCD_THRESHOLD 105 +#define HGCD_APPR_THRESHOLD 111 +#define HGCD_REDUCE_THRESHOLD 1437 +#define GCD_DC_THRESHOLD 318 +#define GCDEXT_DC_THRESHOLD 214 #define JACOBI_BASE_METHOD 2 #define GET_STR_DC_THRESHOLD 16 diff --git a/mpn/alpha/ev6/gmp-mparam.h b/mpn/alpha/ev6/gmp-mparam.h index 12c3891d7..ce865f4cc 100644 --- a/mpn/alpha/ev6/gmp-mparam.h +++ b/mpn/alpha/ev6/gmp-mparam.h @@ -29,38 +29,44 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ #define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 2 #define MOD_1_NORM_THRESHOLD 0 /* always */ #define MOD_1_UNNORM_THRESHOLD 0 /* always */ -#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 3 #define MOD_1U_TO_MOD_1_1_THRESHOLD 2 -#define MOD_1_1_TO_MOD_1_2_THRESHOLD 6 -#define MOD_1_2_TO_MOD_1_4_THRESHOLD 30 -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 4 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 16 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 #define USE_PREINV_DIVREM_1 1 /* preinv always */ +#define DIV_QR_2_PI2_THRESHOLD 8 #define DIVEXACT_1_THRESHOLD 0 /* always */ -#define BMOD_1_TO_MOD_1_THRESHOLD 16 +#define BMOD_1_TO_MOD_1_THRESHOLD 20 #define MUL_TOOM22_THRESHOLD 35 -#define MUL_TOOM33_THRESHOLD 74 -#define MUL_TOOM44_THRESHOLD 178 -#define MUL_TOOM6H_THRESHOLD 288 -#define MUL_TOOM8H_THRESHOLD 333 +#define MUL_TOOM33_THRESHOLD 77 +#define MUL_TOOM44_THRESHOLD 184 +#define MUL_TOOM6H_THRESHOLD 228 +#define MUL_TOOM8H_THRESHOLD 288 -#define 
MUL_TOOM32_TO_TOOM43_THRESHOLD 75 -#define MUL_TOOM32_TO_TOOM53_THRESHOLD 101 +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 110 #define MUL_TOOM42_TO_TOOM53_THRESHOLD 105 -#define MUL_TOOM42_TO_TOOM63_THRESHOLD 105 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 73 -#define SQR_BASECASE_THRESHOLD 5 -#define SQR_TOOM2_THRESHOLD 61 -#define SQR_TOOM3_THRESHOLD 107 -#define SQR_TOOM4_THRESHOLD 170 -#define SQR_TOOM6_THRESHOLD 309 -#define SQR_TOOM8_THRESHOLD 360 +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 58 +#define SQR_TOOM3_THRESHOLD 103 +#define SQR_TOOM4_THRESHOLD 172 +#define SQR_TOOM6_THRESHOLD 264 +#define SQR_TOOM8_THRESHOLD 333 + +#define MULMID_TOOM42_THRESHOLD 52 #define MULMOD_BNM1_THRESHOLD 20 #define SQRMOD_BNM1_THRESHOLD 23 +#define POWM_SEC_TABLE 4,17,246,2388 + #define MUL_FFT_MODF_THRESHOLD 480 /* k = 5 */ #define MUL_FFT_TABLE3 \ { { 480, 5}, { 18, 6}, { 10, 5}, { 21, 6}, \ @@ -148,19 +154,19 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define SQR_FFT_THRESHOLD 3136 #define MULLO_BASECASE_THRESHOLD 0 /* always */ -#define MULLO_DC_THRESHOLD 130 -#define MULLO_MUL_N_THRESHOLD 15604 +#define MULLO_DC_THRESHOLD 173 +#define MULLO_MUL_N_THRESHOLD 11355 -#define DC_DIV_QR_THRESHOLD 119 -#define DC_DIVAPPR_Q_THRESHOLD 390 +#define DC_DIV_QR_THRESHOLD 112 +#define DC_DIVAPPR_Q_THRESHOLD 422 #define DC_BDIV_QR_THRESHOLD 110 -#define DC_BDIV_Q_THRESHOLD 318 +#define DC_BDIV_Q_THRESHOLD 348 -#define INV_MULMOD_BNM1_THRESHOLD 75 -#define INV_NEWTON_THRESHOLD 390 -#define INV_APPR_THRESHOLD 372 +#define INV_MULMOD_BNM1_THRESHOLD 68 +#define INV_NEWTON_THRESHOLD 402 +#define INV_APPR_THRESHOLD 396 -#define BINV_NEWTON_THRESHOLD 393 +#define BINV_NEWTON_THRESHOLD 399 #define REDC_1_TO_REDC_N_THRESHOLD 110 #define MU_DIV_QR_THRESHOLD 1718 @@ -170,12 +176,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MU_BDIV_Q_THRESHOLD 1652 #define MATRIX22_STRASSEN_THRESHOLD 17 -#define HGCD_THRESHOLD 282 -#define GCD_DC_THRESHOLD 1138 -#define GCDEXT_DC_THRESHOLD 773 +#define HGCD_THRESHOLD 278 +#define HGCD_APPR_THRESHOLD 366 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 1258 +#define GCDEXT_DC_THRESHOLD 777 #define JACOBI_BASE_METHOD 3 -#define GET_STR_DC_THRESHOLD 14 -#define GET_STR_PRECOMPUTE_THRESHOLD 19 -#define SET_STR_DC_THRESHOLD 3754 -#define SET_STR_PRECOMPUTE_THRESHOLD 8097 +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 25 +#define SET_STR_DC_THRESHOLD 3539 +#define SET_STR_PRECOMPUTE_THRESHOLD 7784 diff --git a/mpn/ia64/gmp-mparam.h b/mpn/ia64/gmp-mparam.h index 0841c82aa..f080b876e 100644 --- a/mpn/ia64/gmp-mparam.h +++ b/mpn/ia64/gmp-mparam.h @@ -1,6 +1,6 @@ /* gmp-mparam.h -- Compiler/machine parameter header file. -Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2009, 2010 Free Software +Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2009, 2010, 2011 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -21,70 +21,92 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define GMP_LIMB_BITS 64 #define BYTES_PER_MP_LIMB 8 -/* 1300MHz Itanium2 (babe.fsffrance.org) */ - +/* 900MHz Itanium2 (titanic.gmplib.org) */ +#define MOD_1_1P_METHOD 2 #define MOD_1_NORM_THRESHOLD 0 /* always */ #define MOD_1_UNNORM_THRESHOLD 0 /* always */ -#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 3 #define MOD_1U_TO_MOD_1_1_THRESHOLD 8 #define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ #define MOD_1_2_TO_MOD_1_4_THRESHOLD 21 -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 22 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10 #define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD 12 #define DIVEXACT_1_THRESHOLD 0 /* always (native) */ #define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ #define MUL_TOOM22_THRESHOLD 40 -#define MUL_TOOM33_THRESHOLD 122 -#define MUL_TOOM44_THRESHOLD 212 +#define MUL_TOOM33_THRESHOLD 129 +#define MUL_TOOM44_THRESHOLD 214 #define MUL_TOOM6H_THRESHOLD 318 #define MUL_TOOM8H_THRESHOLD 430 -#define MUL_TOOM32_TO_TOOM43_THRESHOLD 93 -#define MUL_TOOM32_TO_TOOM53_THRESHOLD 146 -#define MUL_TOOM42_TO_TOOM53_THRESHOLD 129 +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 145 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 126 #define MUL_TOOM42_TO_TOOM63_THRESHOLD 151 #define SQR_BASECASE_THRESHOLD 11 #define SQR_TOOM2_THRESHOLD 84 -#define SQR_TOOM3_THRESHOLD 125 +#define SQR_TOOM3_THRESHOLD 135 #define SQR_TOOM4_THRESHOLD 494 -#define SQR_TOOM6_THRESHOLD 0 /* never toom4 */ -#define SQR_TOOM8_THRESHOLD 0 /* never toom6 */ +#define SQR_TOOM6_THRESHOLD 0 /* always */ +#define SQR_TOOM8_THRESHOLD 0 /* always */ #define MULMOD_BNM1_THRESHOLD 23 -#define SQRMOD_BNM1_THRESHOLD 25 +#define SQRMOD_BNM1_THRESHOLD 28 + +#define POWM_SEC_TABLE 2,29,130,905 -#define MUL_FFT_MODF_THRESHOLD 444 /* k = 5 */ +#define MUL_FFT_MODF_THRESHOLD 476 /* k = 5 */ #define MUL_FFT_TABLE3 \ - { { 444, 5}, { 27, 6}, { 14, 5}, { 29, 6}, \ - { 35, 7}, { 18, 6}, { 37, 7}, { 19, 
6}, \ + { { 476, 5}, { 27, 6}, { 14, 5}, { 29, 6}, \ + { 33, 7}, { 17, 6}, { 37, 7}, { 19, 6}, \ { 39, 7}, { 21, 6}, { 43, 7}, { 33, 8}, \ { 17, 7}, { 37, 8}, { 19, 7}, { 39, 8}, \ - { 21, 7}, { 43, 8}, { 29, 9}, { 15, 8}, \ - { 37, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \ - { 49, 9}, { 27, 8}, { 57, 9}, { 31, 8}, \ - { 63, 9}, { 35, 8}, { 71, 9}, { 43,10}, \ + { 21, 7}, { 43, 8}, { 37, 9}, { 19, 8}, \ + { 43, 9}, { 23, 8}, { 51, 9}, { 27, 8}, \ + { 57, 9}, { 31, 8}, { 63, 9}, { 43,10}, \ { 23, 9}, { 59,10}, { 31, 9}, { 71,10}, \ - { 39, 9}, { 87,10}, { 47, 9}, { 99,10}, \ + { 39, 9}, { 83,10}, { 47, 9}, { 99,10}, \ { 55,11}, { 31,10}, { 87,11}, { 47,10}, \ { 111,12}, { 31,11}, { 63,10}, { 143,11}, \ { 79,10}, { 167,11}, { 95,10}, { 191,11}, \ { 111,12}, { 63,11}, { 143,10}, { 287, 9}, \ { 575,10}, { 303,11}, { 159,10}, { 319,12}, \ { 95,11}, { 191,10}, { 399,11}, { 207,10}, \ - { 431,13}, { 8192,14}, { 16384,15}, { 32768,16}, \ - { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ - {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } -#define MUL_FFT_TABLE3_SIZE 76 + { 431,13}, { 63,12}, { 127,11}, { 271,10}, \ + { 543,11}, { 287,10}, { 575,11}, { 303,12}, \ + { 159,11}, { 335,10}, { 671,11}, { 367,12}, \ + { 191,11}, { 399,10}, { 799,11}, { 431,12}, \ + { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 543,12}, { 287,11}, { 607,12}, { 319,11}, \ + { 671,12}, { 351,11}, { 703,13}, { 191,12}, \ + { 415,11}, { 863,12}, { 447,14}, { 127,13}, \ + { 255,12}, { 607,13}, { 319,12}, { 735,13}, \ + { 383,12}, { 799,11}, { 1599,12}, { 863,13}, \ + { 447,12}, { 927,11}, { 1855,14}, { 255,13}, \ + { 511,12}, { 1055,13}, { 575,12}, { 1215,13}, \ + { 639,12}, { 1279,13}, { 703,14}, { 383,13}, \ + { 767,12}, { 1535,13}, { 831,12}, { 1663,13}, \ + { 895,12}, { 1791,15}, { 255,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1471,14}, { 767,13}, \ + { 1599,12}, { 3199,13}, { 1663,14}, { 895,13}, \ + { 1855,15}, { 511,14}, { 
1023,13}, { 2175,14}, \ + { 1151,13}, { 2431,14}, { 1279,13}, { 2687,14}, \ + { 1407,15}, { 767,14}, { 1535,13}, { 3199,14}, \ + { 1663,13}, { 3455,14}, { 1791,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 155 #define MUL_FFT_THRESHOLD 5760 -#define SQR_FFT_MODF_THRESHOLD 440 /* k = 5 */ +#define SQR_FFT_MODF_THRESHOLD 436 /* k = 5 */ #define SQR_FFT_TABLE3 \ - { { 440, 5}, { 14, 4}, { 29, 5}, { 29, 6}, \ - { 15, 5}, { 31, 6}, { 35, 7}, { 18, 6}, \ - { 37, 7}, { 33, 8}, { 17, 7}, { 37, 8}, \ + { { 436, 5}, { 14, 4}, { 29, 5}, { 31, 6}, \ + { 35, 7}, { 18, 6}, { 37, 7}, { 37, 8}, \ { 19, 7}, { 40, 8}, { 37, 9}, { 19, 8}, \ { 43, 9}, { 23, 8}, { 49, 9}, { 27, 8}, \ { 57, 9}, { 43,10}, { 23, 9}, { 55,10}, \ @@ -93,45 +115,69 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ { 87,11}, { 47,10}, { 111,12}, { 31,11}, \ { 63,10}, { 135,11}, { 79,10}, { 167,11}, \ { 95,10}, { 191,11}, { 111,12}, { 63,11}, \ - { 127,10}, { 255,11}, { 143,10}, { 303,11}, \ - { 159,10}, { 319,12}, { 95,11}, { 191,10}, \ - { 399,11}, { 207,10}, { 431,13}, { 8192,14}, \ - { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ - { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ - {4194304,23}, {8388608,24} } -#define SQR_FFT_TABLE3_SIZE 66 + { 127,10}, { 255,11}, { 143,10}, { 287, 9}, \ + { 575,10}, { 303,11}, { 159,10}, { 319,12}, \ + { 95,11}, { 191,10}, { 399,11}, { 207,10}, \ + { 431,13}, { 63,12}, { 127,11}, { 271,10}, \ + { 543,11}, { 303,12}, { 159,11}, { 335,10}, \ + { 671,11}, { 367,10}, { 735,12}, { 191,11}, \ + { 399,10}, { 799,11}, { 431,12}, { 223,11}, \ + { 463,13}, { 127,12}, { 255,11}, { 543,12}, \ + { 287,11}, { 607,12}, { 319,11}, { 671,12}, \ + { 351,11}, { 735,13}, { 191,12}, { 383,11}, \ + { 799,12}, { 415,11}, { 863,12}, { 447,11}, \ + { 895,14}, { 127,13}, { 255,12}, { 543,11}, \ + { 1087,12}, { 607,13}, { 319,12}, { 735,13}, \ + 
{ 383,12}, { 863,13}, { 447,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \ + { 1183,13}, { 639,12}, { 1279,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 895,12}, { 1791,13}, \ + { 959,15}, { 255,14}, { 511,13}, { 1087,12}, \ + { 2175,13}, { 1215,14}, { 639,13}, { 1343,12}, \ + { 2687,13}, { 1471,14}, { 767,13}, { 1663,14}, \ + { 895,13}, { 1919,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,14}, { 1279,13}, \ + { 2687,14}, { 1407,15}, { 767,14}, { 1535,13}, \ + { 3199,14}, { 1663,13}, { 3455,14}, { 1791,13}, \ + { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 151 #define SQR_FFT_THRESHOLD 4032 #define MULLO_BASECASE_THRESHOLD 29 #define MULLO_DC_THRESHOLD 57 #define MULLO_MUL_N_THRESHOLD 11278 -#define DC_DIV_QR_THRESHOLD 59 +#define DC_DIV_QR_THRESHOLD 64 #define DC_DIVAPPR_Q_THRESHOLD 222 #define DC_BDIV_QR_THRESHOLD 95 #define DC_BDIV_Q_THRESHOLD 264 -#define INV_MULMOD_BNM1_THRESHOLD 82 -#define INV_NEWTON_THRESHOLD 11 -#define INV_APPR_THRESHOLD 18 +#define INV_MULMOD_BNM1_THRESHOLD 86 +#define INV_NEWTON_THRESHOLD 139 +#define INV_APPR_THRESHOLD 147 #define BINV_NEWTON_THRESHOLD 252 -#define REDC_1_TO_REDC_2_THRESHOLD 0 +#define REDC_1_TO_REDC_2_THRESHOLD 0 /* always */ #define REDC_2_TO_REDC_N_THRESHOLD 147 #define MU_DIV_QR_THRESHOLD 1142 -#define MU_DIVAPPR_Q_THRESHOLD 998 +#define MU_DIVAPPR_Q_THRESHOLD 1142 #define MUPI_DIV_QR_THRESHOLD 0 /* always */ -#define MU_BDIV_QR_THRESHOLD 1187 +#define MU_BDIV_QR_THRESHOLD 1210 #define MU_BDIV_Q_THRESHOLD 1470 #define MATRIX22_STRASSEN_THRESHOLD 23 #define HGCD_THRESHOLD 117 -#define GCD_DC_THRESHOLD 469 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 496 #define GCDEXT_DC_THRESHOLD 368 #define JACOBI_BASE_METHOD 4 #define GET_STR_DC_THRESHOLD 13 
-#define GET_STR_PRECOMPUTE_THRESHOLD 21 -#define SET_STR_DC_THRESHOLD 1204 -#define SET_STR_PRECOMPUTE_THRESHOLD 3266 +#define GET_STR_PRECOMPUTE_THRESHOLD 22 +#define SET_STR_DC_THRESHOLD 1474 +#define SET_STR_PRECOMPUTE_THRESHOLD 3168 diff --git a/mpn/pa64/gmp-mparam.h b/mpn/pa64/gmp-mparam.h index d0e86d856..081757aca 100644 --- a/mpn/pa64/gmp-mparam.h +++ b/mpn/pa64/gmp-mparam.h @@ -25,14 +25,16 @@ with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIVREM_1_NORM_THRESHOLD 0 /* always */ #define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 2 #define MOD_1_NORM_THRESHOLD 0 /* always */ #define MOD_1_UNNORM_THRESHOLD 0 /* always */ -#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 #define MOD_1U_TO_MOD_1_1_THRESHOLD 10 #define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ #define MOD_1_2_TO_MOD_1_4_THRESHOLD 14 -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11 #define USE_PREINV_DIVREM_1 1 +#define DIV_QR_2_PI2_THRESHOLD 21 #define DIVEXACT_1_THRESHOLD 0 /* always */ #define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ @@ -47,16 +49,20 @@ with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MUL_TOOM42_TO_TOOM53_THRESHOLD 129 #define MUL_TOOM42_TO_TOOM63_THRESHOLD 54 -#define SQR_BASECASE_THRESHOLD 0 /* always */ -#define SQR_TOOM2_THRESHOLD 56 -#define SQR_TOOM3_THRESHOLD 169 -#define SQR_TOOM4_THRESHOLD 280 -#define SQR_TOOM6_THRESHOLD 0 -#define SQR_TOOM8_THRESHOLD 309 +#define SQR_BASECASE_THRESHOLD 5 +#define SQR_TOOM2_THRESHOLD 58 +#define SQR_TOOM3_THRESHOLD 153 +#define SQR_TOOM4_THRESHOLD 278 +#define SQR_TOOM6_THRESHOLD 0 /* always */ +#define SQR_TOOM8_THRESHOLD 0 /* always */ -#define MULMOD_BNM1_THRESHOLD 16 +#define MULMID_TOOM42_THRESHOLD 56 + +#define MULMOD_BNM1_THRESHOLD 15 #define SQRMOD_BNM1_THRESHOLD 19 +#define POWM_SEC_TABLE 2,23,228,1084 + #define MUL_FFT_MODF_THRESHOLD 336 /* k = 5 */ #define MUL_FFT_TABLE3 \ { { 336, 5}, { 11, 4}, { 23, 5}, { 21, 6}, \ @@ -196,34 +202,36 @@ with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define SQR_FFT_THRESHOLD 1856 #define MULLO_BASECASE_THRESHOLD 0 /* always */ -#define MULLO_DC_THRESHOLD 133 -#define MULLO_MUL_N_THRESHOLD 4292 +#define MULLO_DC_THRESHOLD 113 +#define MULLO_MUL_N_THRESHOLD 4658 -#define DC_DIV_QR_THRESHOLD 140 -#define DC_DIVAPPR_Q_THRESHOLD 422 -#define DC_BDIV_QR_THRESHOLD 150 -#define DC_BDIV_Q_THRESHOLD 351 +#define DC_DIV_QR_THRESHOLD 123 +#define DC_DIVAPPR_Q_THRESHOLD 372 +#define DC_BDIV_QR_THRESHOLD 142 +#define DC_BDIV_Q_THRESHOLD 312 -#define INV_MULMOD_BNM1_THRESHOLD 60 -#define INV_NEWTON_THRESHOLD 348 -#define INV_APPR_THRESHOLD 324 +#define INV_MULMOD_BNM1_THRESHOLD 58 +#define INV_NEWTON_THRESHOLD 315 +#define INV_APPR_THRESHOLD 315 -#define BINV_NEWTON_THRESHOLD 363 +#define BINV_NEWTON_THRESHOLD 360 #define REDC_1_TO_REDC_N_THRESHOLD 101 -#define MU_DIV_QR_THRESHOLD 998 +#define MU_DIV_QR_THRESHOLD 979 #define MU_DIVAPPR_Q_THRESHOLD 1142 -#define MUPI_DIV_QR_THRESHOLD 110 +#define MUPI_DIV_QR_THRESHOLD 93 #define MU_BDIV_QR_THRESHOLD 889 -#define MU_BDIV_Q_THRESHOLD 1334 +#define MU_BDIV_Q_THRESHOLD 1187 #define 
MATRIX22_STRASSEN_THRESHOLD 9 -#define HGCD_THRESHOLD 242 -#define GCD_DC_THRESHOLD 752 -#define GCDEXT_DC_THRESHOLD 545 +#define HGCD_THRESHOLD 234 +#define HGCD_APPR_THRESHOLD 300 +#define HGCD_REDUCE_THRESHOLD 1553 +#define GCD_DC_THRESHOLD 684 +#define GCDEXT_DC_THRESHOLD 525 #define JACOBI_BASE_METHOD 2 #define GET_STR_DC_THRESHOLD 21 #define GET_STR_PRECOMPUTE_THRESHOLD 24 -#define SET_STR_DC_THRESHOLD 2008 -#define SET_STR_PRECOMPUTE_THRESHOLD 4066 +#define SET_STR_DC_THRESHOLD 1951 +#define SET_STR_PRECOMPUTE_THRESHOLD 4034 diff --git a/mpn/powerpc64/mode64/p4/gmp-mparam.h b/mpn/powerpc64/mode64/p4/gmp-mparam.h index 9a0932654..317bc94d6 100644 --- a/mpn/powerpc64/mode64/p4/gmp-mparam.h +++ b/mpn/powerpc64/mode64/p4/gmp-mparam.h @@ -29,6 +29,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MOD_1_2_TO_MOD_1_4_THRESHOLD 20 #define PREINV_MOD_1_TO_MOD_1_THRESHOLD 16 #define USE_PREINV_DIVREM_1 0 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always (native) */ #define BMOD_1_TO_MOD_1_THRESHOLD 37 @@ -43,16 +44,20 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MUL_TOOM42_TO_TOOM53_THRESHOLD 73 #define MUL_TOOM42_TO_TOOM63_THRESHOLD 62 -#define SQR_BASECASE_THRESHOLD 5 -#define SQR_TOOM2_THRESHOLD 28 -#define SQR_TOOM3_THRESHOLD 57 -#define SQR_TOOM4_THRESHOLD 136 -#define SQR_TOOM6_THRESHOLD 181 -#define SQR_TOOM8_THRESHOLD 272 +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 24 +#define SQR_TOOM3_THRESHOLD 73 +#define SQR_TOOM4_THRESHOLD 214 +#define SQR_TOOM6_THRESHOLD 254 +#define SQR_TOOM8_THRESHOLD 430 -#define MULMOD_BNM1_THRESHOLD 13 +#define MULMID_TOOM42_THRESHOLD 32 + +#define MULMOD_BNM1_THRESHOLD 12 #define SQRMOD_BNM1_THRESHOLD 16 +#define POWM_SEC_TABLE 6,47,347,1036,2826 + #define MUL_FFT_MODF_THRESHOLD 372 /* k = 5 */ #define MUL_FFT_TABLE3 \ { { 372, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ @@ -116,9 +121,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define SQR_FFT_TABLE3_SIZE 103 #define SQR_FFT_THRESHOLD 2752 -#define MULLO_BASECASE_THRESHOLD 5 +#define MULLO_BASECASE_THRESHOLD 3 #define MULLO_DC_THRESHOLD 36 -#define MULLO_MUL_N_THRESHOLD 12691 +#define MULLO_MUL_N_THRESHOLD 13463 #define DC_DIV_QR_THRESHOLD 43 #define DC_DIVAPPR_Q_THRESHOLD 158 @@ -139,12 +144,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MU_BDIV_Q_THRESHOLD 998 #define MATRIX22_STRASSEN_THRESHOLD 11 -#define HGCD_THRESHOLD 105 +#define HGCD_THRESHOLD 103 +#define HGCD_APPR_THRESHOLD 110 +#define HGCD_REDUCE_THRESHOLD 1962 #define GCD_DC_THRESHOLD 318 #define GCDEXT_DC_THRESHOLD 242 #define JACOBI_BASE_METHOD 4 #define GET_STR_DC_THRESHOLD 12 #define GET_STR_PRECOMPUTE_THRESHOLD 23 -#define SET_STR_DC_THRESHOLD 858 -#define SET_STR_PRECOMPUTE_THRESHOLD 1864 +#define SET_STR_DC_THRESHOLD 650 +#define SET_STR_PRECOMPUTE_THRESHOLD 1781 diff --git a/mpn/powerpc64/mode64/p5/gmp-mparam.h b/mpn/powerpc64/mode64/p5/gmp-mparam.h index d177da94e..9220f99d5 100644 --- a/mpn/powerpc64/mode64/p5/gmp-mparam.h +++ b/mpn/powerpc64/mode64/p5/gmp-mparam.h @@ -31,6 +31,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ #define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11 #define USE_PREINV_DIVREM_1 0 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always (native) */ #define BMOD_1_TO_MOD_1_THRESHOLD 40 @@ -38,22 +39,26 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MUL_TOOM33_THRESHOLD 50 #define MUL_TOOM44_THRESHOLD 121 #define MUL_TOOM6H_THRESHOLD 202 -#define MUL_TOOM8H_THRESHOLD 303 +#define MUL_TOOM8H_THRESHOLD 260 #define MUL_TOOM32_TO_TOOM43_THRESHOLD 82 #define MUL_TOOM32_TO_TOOM53_THRESHOLD 91 #define MUL_TOOM42_TO_TOOM53_THRESHOLD 81 #define MUL_TOOM42_TO_TOOM63_THRESHOLD 88 -#define SQR_BASECASE_THRESHOLD 9 -#define SQR_TOOM2_THRESHOLD 36 -#define SQR_TOOM3_THRESHOLD 59 -#define SQR_TOOM4_THRESHOLD 147 -#define SQR_TOOM6_THRESHOLD 204 -#define SQR_TOOM8_THRESHOLD 288 +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 24 +#define SQR_TOOM3_THRESHOLD 73 +#define SQR_TOOM4_THRESHOLD 142 +#define SQR_TOOM6_THRESHOLD 191 +#define SQR_TOOM8_THRESHOLD 284 -#define MULMOD_BNM1_THRESHOLD 14 -#define SQRMOD_BNM1_THRESHOLD 16 +#define MULMID_TOOM42_THRESHOLD 32 + +#define MULMOD_BNM1_THRESHOLD 12 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define POWM_SEC_TABLE 4,35,387,1068,2699 #define MUL_FFT_MODF_THRESHOLD 348 /* k = 5 */ #define MUL_FFT_TABLE3 \ @@ -166,15 +171,15 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define SQR_FFT_THRESHOLD 2752 #define MULLO_BASECASE_THRESHOLD 0 -#define MULLO_DC_THRESHOLD 31 +#define MULLO_DC_THRESHOLD 42 #define MULLO_MUL_N_THRESHOLD 6633 -#define DC_DIV_QR_THRESHOLD 37 +#define DC_DIV_QR_THRESHOLD 43 #define DC_DIVAPPR_Q_THRESHOLD 155 #define DC_BDIV_QR_THRESHOLD 46 -#define DC_BDIV_Q_THRESHOLD 112 +#define DC_BDIV_Q_THRESHOLD 120 -#define INV_MULMOD_BNM1_THRESHOLD 26 +#define INV_MULMOD_BNM1_THRESHOLD 52 #define INV_NEWTON_THRESHOLD 177 #define INV_APPR_THRESHOLD 165 @@ -189,11 +194,13 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MATRIX22_STRASSEN_THRESHOLD 15 #define HGCD_THRESHOLD 108 -#define GCD_DC_THRESHOLD 303 +#define HGCD_APPR_THRESHOLD 113 +#define HGCD_REDUCE_THRESHOLD 2121 +#define GCD_DC_THRESHOLD 315 #define GCDEXT_DC_THRESHOLD 237 #define JACOBI_BASE_METHOD 4 #define GET_STR_DC_THRESHOLD 13 #define GET_STR_PRECOMPUTE_THRESHOLD 23 -#define SET_STR_DC_THRESHOLD 532 -#define SET_STR_PRECOMPUTE_THRESHOLD 1639 +#define SET_STR_DC_THRESHOLD 650 +#define SET_STR_PRECOMPUTE_THRESHOLD 1585 diff --git a/mpn/powerpc64/mode64/p6/gmp-mparam.h b/mpn/powerpc64/mode64/p6/gmp-mparam.h index 88cac3e72..5ec334089 100644 --- a/mpn/powerpc64/mode64/p6/gmp-mparam.h +++ b/mpn/powerpc64/mode64/p6/gmp-mparam.h @@ -31,6 +31,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ #define PREINV_MOD_1_TO_MOD_1_THRESHOLD 5 #define USE_PREINV_DIVREM_1 0 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always (native) */ #define BMOD_1_TO_MOD_1_THRESHOLD 21 @@ -45,16 +46,20 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MUL_TOOM42_TO_TOOM53_THRESHOLD 73 #define MUL_TOOM42_TO_TOOM63_THRESHOLD 66 -#define SQR_BASECASE_THRESHOLD 9 -#define SQR_TOOM2_THRESHOLD 30 -#define SQR_TOOM3_THRESHOLD 53 -#define SQR_TOOM4_THRESHOLD 148 -#define SQR_TOOM6_THRESHOLD 226 -#define SQR_TOOM8_THRESHOLD 430 +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 24 +#define SQR_TOOM3_THRESHOLD 49 +#define SQR_TOOM4_THRESHOLD 136 +#define SQR_TOOM6_THRESHOLD 274 +#define SQR_TOOM8_THRESHOLD 410 + +#define MULMID_TOOM42_THRESHOLD 24 #define MULMOD_BNM1_THRESHOLD 14 #define SQRMOD_BNM1_THRESHOLD 14 +#define POWM_SEC_TABLE 4,19,228,713,919 + #define MUL_FFT_MODF_THRESHOLD 340 /* k = 5 */ #define MUL_FFT_TABLE3 \ { { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ @@ -107,14 +112,14 @@ along with the GNU MP Library. 
If not, see http://www.gnu.org/licenses/. */ #define MULLO_BASECASE_THRESHOLD 5 #define MULLO_DC_THRESHOLD 28 -#define MULLO_MUL_N_THRESHOLD 6633 +#define MULLO_MUL_N_THRESHOLD 3084 -#define DC_DIV_QR_THRESHOLD 27 +#define DC_DIV_QR_THRESHOLD 23 #define DC_DIVAPPR_Q_THRESHOLD 112 #define DC_BDIV_QR_THRESHOLD 29 -#define DC_BDIV_Q_THRESHOLD 86 +#define DC_BDIV_Q_THRESHOLD 79 -#define INV_MULMOD_BNM1_THRESHOLD 47 +#define INV_MULMOD_BNM1_THRESHOLD 51 #define INV_NEWTON_THRESHOLD 93 #define INV_APPR_THRESHOLD 91 @@ -123,14 +128,16 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MU_DIV_QR_THRESHOLD 855 #define MU_DIVAPPR_Q_THRESHOLD 807 -#define MUPI_DIV_QR_THRESHOLD 33 +#define MUPI_DIV_QR_THRESHOLD 23 #define MU_BDIV_QR_THRESHOLD 807 #define MU_BDIV_Q_THRESHOLD 872 -#define MATRIX22_STRASSEN_THRESHOLD 11 -#define HGCD_THRESHOLD 64 -#define GCD_DC_THRESHOLD 237 -#define GCDEXT_DC_THRESHOLD 183 +#define MATRIX22_STRASSEN_THRESHOLD 13 +#define HGCD_THRESHOLD 69 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 2121 +#define GCD_DC_THRESHOLD 268 +#define GCDEXT_DC_THRESHOLD 209 #define JACOBI_BASE_METHOD 4 #define GET_STR_DC_THRESHOLD 17 diff --git a/mpn/powerpc64/mode64/p7/gmp-mparam.h b/mpn/powerpc64/mode64/p7/gmp-mparam.h index 57b888637..02603c525 100644 --- a/mpn/powerpc64/mode64/p7/gmp-mparam.h +++ b/mpn/powerpc64/mode64/p7/gmp-mparam.h @@ -28,7 +28,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MOD_1N_TO_MOD_1_1_THRESHOLD 6 #define MOD_1U_TO_MOD_1_1_THRESHOLD 5 #define MOD_1_1_TO_MOD_1_2_THRESHOLD 7 -#define MOD_1_2_TO_MOD_1_4_THRESHOLD 22 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18 #define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13 #define USE_PREINV_DIVREM_1 0 #define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ @@ -46,18 +46,20 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MUL_TOOM42_TO_TOOM53_THRESHOLD 135 #define MUL_TOOM42_TO_TOOM63_THRESHOLD 141 -#define SQR_BASECASE_THRESHOLD 10 -#define SQR_TOOM2_THRESHOLD 50 -#define SQR_TOOM3_THRESHOLD 84 -#define SQR_TOOM4_THRESHOLD 160 -#define SQR_TOOM6_THRESHOLD 246 -#define SQR_TOOM8_THRESHOLD 296 +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 36 +#define SQR_TOOM3_THRESHOLD 109 +#define SQR_TOOM4_THRESHOLD 202 +#define SQR_TOOM6_THRESHOLD 303 +#define SQR_TOOM8_THRESHOLD 399 #define MULMID_TOOM42_THRESHOLD 62 #define MULMOD_BNM1_THRESHOLD 15 #define SQRMOD_BNM1_THRESHOLD 16 +#define POWM_SEC_TABLE 6,65,342,1465 + #define MUL_FFT_MODF_THRESHOLD 436 /* k = 5 */ #define MUL_FFT_TABLE3 \ { { 436, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ @@ -121,8 +123,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define SQR_FFT_TABLE3_SIZE 103 #define SQR_FFT_THRESHOLD 3264 -#define MULLO_BASECASE_THRESHOLD 4 -#define MULLO_DC_THRESHOLD 34 +#define MULLO_BASECASE_THRESHOLD 3 +#define MULLO_DC_THRESHOLD 23 #define MULLO_MUL_N_THRESHOLD 9174 #define DC_DIV_QR_THRESHOLD 30 @@ -144,12 +146,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MU_BDIV_Q_THRESHOLD 1499 #define MATRIX22_STRASSEN_THRESHOLD 15 -#define HGCD_THRESHOLD 121 -#define GCD_DC_THRESHOLD 443 -#define GCDEXT_DC_THRESHOLD 396 +#define HGCD_THRESHOLD 124 +#define HGCD_APPR_THRESHOLD 155 +#define HGCD_REDUCE_THRESHOLD 3134 +#define GCD_DC_THRESHOLD 492 +#define GCDEXT_DC_THRESHOLD 333 #define JACOBI_BASE_METHOD 4 #define GET_STR_DC_THRESHOLD 11 -#define GET_STR_PRECOMPUTE_THRESHOLD 22 +#define GET_STR_PRECOMPUTE_THRESHOLD 17 #define SET_STR_DC_THRESHOLD 1517 -#define SET_STR_PRECOMPUTE_THRESHOLD 4040 +#define SET_STR_PRECOMPUTE_THRESHOLD 3421 diff --git a/mpn/s390_64/gmp-mparam.h b/mpn/s390_64/gmp-mparam.h index c4960254e..46ca86726 100644 --- a/mpn/s390_64/gmp-mparam.h +++ b/mpn/s390_64/gmp-mparam.h @@ -61,6 +61,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MULMOD_BNM1_THRESHOLD 9 #define SQRMOD_BNM1_THRESHOLD 11 +#define POWM_SEC_TABLE 4,23,128,598 + #define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */ #define MUL_FFT_TABLE3 \ { { 220, 5}, { 7, 4}, { 15, 5}, { 8, 4}, \ @@ -131,7 +133,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MULLO_BASECASE_THRESHOLD 3 #define MULLO_DC_THRESHOLD 33 -#define MULLO_MUL_N_THRESHOLD 4392 +#define MULLO_MUL_N_THRESHOLD 5240 #define DC_DIV_QR_THRESHOLD 28 #define DC_DIVAPPR_Q_THRESHOLD 106 @@ -152,12 +154,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MU_BDIV_Q_THRESHOLD 680 #define MATRIX22_STRASSEN_THRESHOLD 11 -#define HGCD_THRESHOLD 71 -#define GCD_DC_THRESHOLD 177 -#define GCDEXT_DC_THRESHOLD 142 -#define JACOBI_BASE_METHOD 2 +#define HGCD_THRESHOLD 75 +#define HGCD_APPR_THRESHOLD 59 +#define HGCD_REDUCE_THRESHOLD 901 +#define GCD_DC_THRESHOLD 186 +#define GCDEXT_DC_THRESHOLD 150 +#define JACOBI_BASE_METHOD 3 #define GET_STR_DC_THRESHOLD 27 #define GET_STR_PRECOMPUTE_THRESHOLD 40 -#define SET_STR_DC_THRESHOLD 363 +#define SET_STR_DC_THRESHOLD 418 #define SET_STR_PRECOMPUTE_THRESHOLD 1111 diff --git a/mpn/sparc64/ultrasparc34/gmp-mparam.h b/mpn/sparc64/ultrasparc34/gmp-mparam.h index faed8efa3..8fe8ddc54 100644 --- a/mpn/sparc64/ultrasparc34/gmp-mparam.h +++ b/mpn/sparc64/ultrasparc34/gmp-mparam.h @@ -28,12 +28,13 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MOD_1_1P_METHOD 2 #define MOD_1_NORM_THRESHOLD 0 /* always */ #define MOD_1_UNNORM_THRESHOLD 0 /* always */ -#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 -#define MOD_1U_TO_MOD_1_1_THRESHOLD 38 -#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 9 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 24 #define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 33 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 22 #define USE_PREINV_DIVREM_1 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always */ #define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ @@ -55,8 +56,12 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define SQR_TOOM6_THRESHOLD 191 #define SQR_TOOM8_THRESHOLD 339 -#define MULMOD_BNM1_THRESHOLD 14 -#define SQRMOD_BNM1_THRESHOLD 13 +#define MULMID_TOOM42_THRESHOLD 42 + +#define MULMOD_BNM1_THRESHOLD 16 +#define SQRMOD_BNM1_THRESHOLD 9 + +#define POWM_SEC_TABLE 4,23,130,780,1812,1926 #define MUL_FFT_MODF_THRESHOLD 212 /* k = 5 */ #define MUL_FFT_TABLE3 \ @@ -157,7 +162,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define SQR_FFT_TABLE3_SIZE 182 #define SQR_FFT_THRESHOLD 1984 -#define MULLO_BASECASE_THRESHOLD 8 +#define MULLO_BASECASE_THRESHOLD 14 #define MULLO_DC_THRESHOLD 0 /* never mpn_mullo_basecase */ #define MULLO_MUL_N_THRESHOLD 3791 @@ -170,7 +175,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define INV_NEWTON_THRESHOLD 17 #define INV_APPR_THRESHOLD 17 -#define BINV_NEWTON_THRESHOLD 134 +#define BINV_NEWTON_THRESHOLD 92 #define REDC_1_TO_REDC_2_THRESHOLD 2 #define REDC_2_TO_REDC_N_THRESHOLD 117 @@ -181,12 +186,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MU_BDIV_Q_THRESHOLD 748 #define MATRIX22_STRASSEN_THRESHOLD 12 -#define HGCD_THRESHOLD 46 -#define GCD_DC_THRESHOLD 130 +#define HGCD_THRESHOLD 45 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 1094 +#define GCD_DC_THRESHOLD 126 #define GCDEXT_DC_THRESHOLD 134 #define JACOBI_BASE_METHOD 2 #define GET_STR_DC_THRESHOLD 18 #define GET_STR_PRECOMPUTE_THRESHOLD 27 -#define SET_STR_DC_THRESHOLD 315 +#define SET_STR_DC_THRESHOLD 286 #define SET_STR_PRECOMPUTE_THRESHOLD 1037 diff --git a/mpn/sparc64/ultrasparct1/gmp-mparam.h b/mpn/sparc64/ultrasparct1/gmp-mparam.h index 744f7e17c..34c8027f5 100644 --- a/mpn/sparc64/ultrasparct1/gmp-mparam.h +++ b/mpn/sparc64/ultrasparct1/gmp-mparam.h @@ -25,14 +25,16 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define DIVREM_1_NORM_THRESHOLD 0 /* always */ #define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 2 #define MOD_1_NORM_THRESHOLD 0 /* always */ #define MOD_1_UNNORM_THRESHOLD 0 /* always */ -#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 10 #define MOD_1U_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX #define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ #define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 35 #define USE_PREINV_DIVREM_1 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always */ #define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ @@ -50,13 +52,17 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define SQR_BASECASE_THRESHOLD 0 /* always */ #define SQR_TOOM2_THRESHOLD 16 #define SQR_TOOM3_THRESHOLD 57 -#define SQR_TOOM4_THRESHOLD 133 -#define SQR_TOOM6_THRESHOLD 156 +#define SQR_TOOM4_THRESHOLD 135 +#define SQR_TOOM6_THRESHOLD 160 #define SQR_TOOM8_THRESHOLD 260 +#define MULMID_TOOM42_THRESHOLD 12 + #define MULMOD_BNM1_THRESHOLD 7 #define SQRMOD_BNM1_THRESHOLD 7 +#define POWM_SEC_TABLE 2,23,176,625,2783 + #define MUL_FFT_MODF_THRESHOLD 176 /* k = 5 */ #define MUL_FFT_TABLE3 \ { { 176, 5}, { 7, 6}, { 4, 5}, { 9, 6}, \ @@ -102,30 +108,32 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MULLO_MUL_N_THRESHOLD 3176 #define DC_DIV_QR_THRESHOLD 27 -#define DC_DIVAPPR_Q_THRESHOLD 107 +#define DC_DIVAPPR_Q_THRESHOLD 108 #define DC_BDIV_QR_THRESHOLD 27 #define DC_BDIV_Q_THRESHOLD 62 -#define INV_MULMOD_BNM1_THRESHOLD 22 +#define INV_MULMOD_BNM1_THRESHOLD 14 #define INV_NEWTON_THRESHOLD 163 #define INV_APPR_THRESHOLD 117 #define BINV_NEWTON_THRESHOLD 166 #define REDC_1_TO_REDC_N_THRESHOLD 32 -#define MU_DIV_QR_THRESHOLD 720 -#define MU_DIVAPPR_Q_THRESHOLD 734 -#define MUPI_DIV_QR_THRESHOLD 67 +#define MU_DIV_QR_THRESHOLD 734 +#define MU_DIVAPPR_Q_THRESHOLD 748 +#define MUPI_DIV_QR_THRESHOLD 68 #define MU_BDIV_QR_THRESHOLD 562 #define MU_BDIV_Q_THRESHOLD 734 -#define MATRIX22_STRASSEN_THRESHOLD 11 -#define HGCD_THRESHOLD 53 +#define MATRIX22_STRASSEN_THRESHOLD 9 +#define HGCD_THRESHOLD 66 +#define HGCD_APPR_THRESHOLD 47 +#define HGCD_REDUCE_THRESHOLD 834 #define GCD_DC_THRESHOLD 183 -#define GCDEXT_DC_THRESHOLD 144 +#define GCDEXT_DC_THRESHOLD 142 #define JACOBI_BASE_METHOD 3 #define GET_STR_DC_THRESHOLD 20 -#define GET_STR_PRECOMPUTE_THRESHOLD 39 +#define GET_STR_PRECOMPUTE_THRESHOLD 36 #define SET_STR_DC_THRESHOLD 458 -#define SET_STR_PRECOMPUTE_THRESHOLD 964 +#define SET_STR_PRECOMPUTE_THRESHOLD 963 diff --git a/mpn/x86/atom/gmp-mparam.h b/mpn/x86/atom/gmp-mparam.h index 8c2595230..391a0ac4a 100644 --- a/mpn/x86/atom/gmp-mparam.h +++ b/mpn/x86/atom/gmp-mparam.h @@ -24,26 +24,27 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ /* Generated by tuneup.c */ #define MOD_1_NORM_THRESHOLD 3 -#define MOD_1_UNNORM_THRESHOLD 6 -#define MOD_1N_TO_MOD_1_1_THRESHOLD 9 -#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_UNNORM_THRESHOLD 5 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 10 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 #define MOD_1_1_TO_MOD_1_2_THRESHOLD 10 #define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ #define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13 #define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always (native) */ #define BMOD_1_TO_MOD_1_THRESHOLD 33 #define MUL_TOOM22_THRESHOLD 20 #define MUL_TOOM33_THRESHOLD 78 -#define MUL_TOOM44_THRESHOLD 184 +#define MUL_TOOM44_THRESHOLD 168 #define MUL_TOOM6H_THRESHOLD 270 #define MUL_TOOM8H_THRESHOLD 406 -#define MUL_TOOM32_TO_TOOM43_THRESHOLD 79 -#define MUL_TOOM32_TO_TOOM53_THRESHOLD 126 -#define MUL_TOOM42_TO_TOOM53_THRESHOLD 121 -#define MUL_TOOM42_TO_TOOM63_THRESHOLD 127 +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 96 #define SQR_BASECASE_THRESHOLD 0 /* always (native) */ #define SQR_TOOM2_THRESHOLD 34 @@ -52,8 +53,12 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define SQR_TOOM6_THRESHOLD 303 #define SQR_TOOM8_THRESHOLD 547 -#define MULMOD_BNM1_THRESHOLD 14 -#define SQRMOD_BNM1_THRESHOLD 18 +#define MULMID_TOOM42_THRESHOLD 54 + +#define MULMOD_BNM1_THRESHOLD 16 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define POWM_SEC_TABLE 2,35,262,1168 #define MUL_FFT_MODF_THRESHOLD 376 /* k = 5 */ #define MUL_FFT_TABLE3 \ @@ -108,9 +113,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define SQR_FFT_TABLE3_SIZE 82 #define SQR_FFT_THRESHOLD 2752 -#define MULLO_BASECASE_THRESHOLD 4 +#define MULLO_BASECASE_THRESHOLD 5 #define MULLO_DC_THRESHOLD 51 -#define MULLO_MUL_N_THRESHOLD 8907 +#define MULLO_MUL_N_THRESHOLD 6633 #define DC_DIV_QR_THRESHOLD 63 #define DC_DIVAPPR_Q_THRESHOLD 252 @@ -131,12 +136,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MU_BDIV_Q_THRESHOLD 1334 #define MATRIX22_STRASSEN_THRESHOLD 15 -#define HGCD_THRESHOLD 126 -#define GCD_DC_THRESHOLD 483 -#define GCDEXT_DC_THRESHOLD 351 +#define HGCD_THRESHOLD 129 +#define HGCD_APPR_THRESHOLD 163 +#define HGCD_REDUCE_THRESHOLD 2121 +#define GCD_DC_THRESHOLD 469 +#define GCDEXT_DC_THRESHOLD 348 #define JACOBI_BASE_METHOD 3 #define GET_STR_DC_THRESHOLD 13 #define GET_STR_PRECOMPUTE_THRESHOLD 24 -#define SET_STR_DC_THRESHOLD 272 -#define SET_STR_PRECOMPUTE_THRESHOLD 1116 +#define SET_STR_DC_THRESHOLD 262 +#define SET_STR_PRECOMPUTE_THRESHOLD 902 diff --git a/mpn/x86/k7/gmp-mparam.h b/mpn/x86/k7/gmp-mparam.h index 84238c4e0..9cc6798af 100644 --- a/mpn/x86/k7/gmp-mparam.h +++ b/mpn/x86/k7/gmp-mparam.h @@ -30,6 +30,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ #define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11 #define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always (native) */ #define BMOD_1_TO_MOD_1_THRESHOLD 26 @@ -40,19 +41,23 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MUL_TOOM8H_THRESHOLD 454 #define MUL_TOOM32_TO_TOOM43_THRESHOLD 85 -#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122 -#define MUL_TOOM42_TO_TOOM53_THRESHOLD 93 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 95 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97 #define MUL_TOOM42_TO_TOOM63_THRESHOLD 101 #define SQR_BASECASE_THRESHOLD 0 /* always (native) */ #define SQR_TOOM2_THRESHOLD 50 -#define SQR_TOOM3_THRESHOLD 87 +#define SQR_TOOM3_THRESHOLD 81 #define SQR_TOOM4_THRESHOLD 148 -#define SQR_TOOM6_THRESHOLD 306 +#define SQR_TOOM6_THRESHOLD 274 #define SQR_TOOM8_THRESHOLD 430 +#define MULMID_TOOM42_THRESHOLD 88 + #define MULMOD_BNM1_THRESHOLD 18 -#define SQRMOD_BNM1_THRESHOLD 19 +#define SQRMOD_BNM1_THRESHOLD 18 + +#define POWM_SEC_TABLE 2,17,225,961,1604 #define MUL_FFT_MODF_THRESHOLD 888 /* k = 6 */ #define MUL_FFT_TABLE3 \ @@ -155,28 +160,30 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MULLO_DC_THRESHOLD 42 #define MULLO_MUL_N_THRESHOLD 13463 -#define DC_DIV_QR_THRESHOLD 89 -#define DC_DIVAPPR_Q_THRESHOLD 315 +#define DC_DIV_QR_THRESHOLD 60 +#define DC_DIVAPPR_Q_THRESHOLD 336 #define DC_BDIV_QR_THRESHOLD 91 -#define DC_BDIV_Q_THRESHOLD 274 +#define DC_BDIV_Q_THRESHOLD 268 #define INV_MULMOD_BNM1_THRESHOLD 66 -#define INV_NEWTON_THRESHOLD 300 -#define INV_APPR_THRESHOLD 303 +#define INV_NEWTON_THRESHOLD 284 +#define INV_APPR_THRESHOLD 284 -#define BINV_NEWTON_THRESHOLD 303 -#define REDC_1_TO_REDC_N_THRESHOLD 95 +#define BINV_NEWTON_THRESHOLD 270 +#define REDC_1_TO_REDC_N_THRESHOLD 87 -#define MU_DIV_QR_THRESHOLD 1858 -#define MU_DIVAPPR_Q_THRESHOLD 1718 -#define MUPI_DIV_QR_THRESHOLD 132 -#define MU_BDIV_QR_THRESHOLD 1387 +#define MU_DIV_QR_THRESHOLD 1752 +#define MU_DIVAPPR_Q_THRESHOLD 1652 +#define MUPI_DIV_QR_THRESHOLD 97 +#define MU_BDIV_QR_THRESHOLD 1470 #define MU_BDIV_Q_THRESHOLD 1470 #define MATRIX22_STRASSEN_THRESHOLD 15 -#define HGCD_THRESHOLD 154 -#define GCD_DC_THRESHOLD 599 -#define GCDEXT_DC_THRESHOLD 443 +#define 
HGCD_THRESHOLD 173 +#define HGCD_APPR_THRESHOLD 226 +#define HGCD_REDUCE_THRESHOLD 4633 +#define GCD_DC_THRESHOLD 580 +#define GCDEXT_DC_THRESHOLD 414 #define JACOBI_BASE_METHOD 4 #define GET_STR_DC_THRESHOLD 17 diff --git a/mpn/x86/p6/sse2/gmp-mparam.h b/mpn/x86/p6/sse2/gmp-mparam.h index 2735b9c63..b163c58cc 100644 --- a/mpn/x86/p6/sse2/gmp-mparam.h +++ b/mpn/x86/p6/sse2/gmp-mparam.h @@ -31,37 +31,42 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ /* 1867 MHz P6 model 13 */ #define MOD_1_NORM_THRESHOLD 4 -#define MOD_1_UNNORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 4 #define MOD_1N_TO_MOD_1_1_THRESHOLD 5 #define MOD_1U_TO_MOD_1_1_THRESHOLD 4 #define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 #define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 #define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always (native) */ -#define BMOD_1_TO_MOD_1_THRESHOLD 22 +#define BMOD_1_TO_MOD_1_THRESHOLD 21 #define MUL_TOOM22_THRESHOLD 20 -#define MUL_TOOM33_THRESHOLD 77 -#define MUL_TOOM44_THRESHOLD 182 +#define MUL_TOOM33_THRESHOLD 74 +#define MUL_TOOM44_THRESHOLD 181 #define MUL_TOOM6H_THRESHOLD 252 -#define MUL_TOOM8H_THRESHOLD 381 +#define MUL_TOOM8H_THRESHOLD 363 #define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 #define MUL_TOOM32_TO_TOOM53_THRESHOLD 114 -#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89 -#define MUL_TOOM42_TO_TOOM63_THRESHOLD 79 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 115 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 80 #define SQR_BASECASE_THRESHOLD 0 /* always (native) */ #define SQR_TOOM2_THRESHOLD 30 #define SQR_TOOM3_THRESHOLD 101 #define SQR_TOOM4_THRESHOLD 154 #define SQR_TOOM6_THRESHOLD 222 -#define SQR_TOOM8_THRESHOLD 547 +#define SQR_TOOM8_THRESHOLD 527 + +#define MULMID_TOOM42_THRESHOLD 58 #define MULMOD_BNM1_THRESHOLD 13 #define SQRMOD_BNM1_THRESHOLD 17 +#define 
POWM_SEC_TABLE 4,23,258,768,2388 + #define MUL_FFT_MODF_THRESHOLD 565 /* k = 5 */ #define MUL_FFT_TABLE3 \ { { 565, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ @@ -143,34 +148,36 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define SQR_FFT_THRESHOLD 5760 #define MULLO_BASECASE_THRESHOLD 0 /* always */ -#define MULLO_DC_THRESHOLD 34 +#define MULLO_DC_THRESHOLD 33 #define MULLO_MUL_N_THRESHOLD 13463 -#define DC_DIV_QR_THRESHOLD 22 +#define DC_DIV_QR_THRESHOLD 20 #define DC_DIVAPPR_Q_THRESHOLD 56 #define DC_BDIV_QR_THRESHOLD 60 -#define DC_BDIV_Q_THRESHOLD 132 +#define DC_BDIV_Q_THRESHOLD 134 #define INV_MULMOD_BNM1_THRESHOLD 38 -#define INV_NEWTON_THRESHOLD 71 +#define INV_NEWTON_THRESHOLD 66 #define INV_APPR_THRESHOLD 63 -#define BINV_NEWTON_THRESHOLD 252 -#define REDC_1_TO_REDC_N_THRESHOLD 62 +#define BINV_NEWTON_THRESHOLD 250 +#define REDC_1_TO_REDC_N_THRESHOLD 63 -#define MU_DIV_QR_THRESHOLD 1142 -#define MU_DIVAPPR_Q_THRESHOLD 889 -#define MUPI_DIV_QR_THRESHOLD 39 -#define MU_BDIV_QR_THRESHOLD 1308 -#define MU_BDIV_Q_THRESHOLD 1442 +#define MU_DIV_QR_THRESHOLD 1164 +#define MU_DIVAPPR_Q_THRESHOLD 979 +#define MUPI_DIV_QR_THRESHOLD 38 +#define MU_BDIV_QR_THRESHOLD 1442 +#define MU_BDIV_Q_THRESHOLD 1470 #define MATRIX22_STRASSEN_THRESHOLD 17 -#define HGCD_THRESHOLD 61 -#define GCD_DC_THRESHOLD 379 -#define GCDEXT_DC_THRESHOLD 298 -#define JACOBI_BASE_METHOD 4 +#define HGCD_THRESHOLD 64 +#define HGCD_APPR_THRESHOLD 105 +#define HGCD_REDUCE_THRESHOLD 3524 +#define GCD_DC_THRESHOLD 386 +#define GCDEXT_DC_THRESHOLD 309 +#define JACOBI_BASE_METHOD 1 #define GET_STR_DC_THRESHOLD 13 -#define GET_STR_PRECOMPUTE_THRESHOLD 20 -#define SET_STR_DC_THRESHOLD 582 -#define SET_STR_PRECOMPUTE_THRESHOLD 1055 +#define GET_STR_PRECOMPUTE_THRESHOLD 26 +#define SET_STR_DC_THRESHOLD 587 +#define SET_STR_PRECOMPUTE_THRESHOLD 1104 diff --git a/mpn/x86/pentium4/sse2/gmp-mparam.h b/mpn/x86/pentium4/sse2/gmp-mparam.h index b1e56b5e2..8a198ad96 100644 --- 
a/mpn/x86/pentium4/sse2/gmp-mparam.h +++ b/mpn/x86/pentium4/sse2/gmp-mparam.h @@ -22,37 +22,42 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define BYTES_PER_MP_LIMB 4 -#define MOD_1_NORM_THRESHOLD 9 -#define MOD_1_UNNORM_THRESHOLD 20 +#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ #define MOD_1N_TO_MOD_1_1_THRESHOLD 6 #define MOD_1U_TO_MOD_1_1_THRESHOLD 5 #define MOD_1_1_TO_MOD_1_2_THRESHOLD 13 #define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ #define PREINV_MOD_1_TO_MOD_1_THRESHOLD 6 #define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always (native) */ #define BMOD_1_TO_MOD_1_THRESHOLD 20 #define MUL_TOOM22_THRESHOLD 31 -#define MUL_TOOM33_THRESHOLD 120 -#define MUL_TOOM44_THRESHOLD 286 +#define MUL_TOOM33_THRESHOLD 114 +#define MUL_TOOM44_THRESHOLD 300 #define MUL_TOOM6H_THRESHOLD 426 -#define MUL_TOOM8H_THRESHOLD 592 +#define MUL_TOOM8H_THRESHOLD 620 -#define MUL_TOOM32_TO_TOOM43_THRESHOLD 195 -#define MUL_TOOM32_TO_TOOM53_THRESHOLD 216 -#define MUL_TOOM42_TO_TOOM53_THRESHOLD 193 -#define MUL_TOOM42_TO_TOOM63_THRESHOLD 187 +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 184 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 207 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 181 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 209 #define SQR_BASECASE_THRESHOLD 0 /* always (native) */ -#define SQR_TOOM2_THRESHOLD 48 -#define SQR_TOOM3_THRESHOLD 174 -#define SQR_TOOM4_THRESHOLD 390 -#define SQR_TOOM6_THRESHOLD 0 -#define SQR_TOOM8_THRESHOLD 507 +#define SQR_TOOM2_THRESHOLD 49 +#define SQR_TOOM3_THRESHOLD 173 +#define SQR_TOOM4_THRESHOLD 264 +#define SQR_TOOM6_THRESHOLD 354 +#define SQR_TOOM8_THRESHOLD 810 -#define MULMOD_BNM1_THRESHOLD 17 -#define SQRMOD_BNM1_THRESHOLD 21 +#define MULMID_TOOM42_THRESHOLD 68 + +#define MULMOD_BNM1_THRESHOLD 19 +#define SQRMOD_BNM1_THRESHOLD 23 + +#define POWM_SEC_TABLE 
2,33,246,1052,2178 #define MUL_FFT_MODF_THRESHOLD 904 /* k = 6 */ #define MUL_FFT_TABLE3 \ @@ -102,35 +107,37 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define SQR_FFT_TABLE3_SIZE 72 #define SQR_FFT_THRESHOLD 6784 -#define MULLO_BASECASE_THRESHOLD 12 -#define MULLO_DC_THRESHOLD 49 -#define MULLO_MUL_N_THRESHOLD 13866 +#define MULLO_BASECASE_THRESHOLD 13 +#define MULLO_DC_THRESHOLD 52 +#define MULLO_MUL_N_THRESHOLD 13463 -#define DC_DIV_QR_THRESHOLD 37 -#define DC_DIVAPPR_Q_THRESHOLD 81 -#define DC_BDIV_QR_THRESHOLD 51 -#define DC_BDIV_Q_THRESHOLD 80 +#define DC_DIV_QR_THRESHOLD 39 +#define DC_DIVAPPR_Q_THRESHOLD 77 +#define DC_BDIV_QR_THRESHOLD 54 +#define DC_BDIV_Q_THRESHOLD 94 #define INV_MULMOD_BNM1_THRESHOLD 60 -#define INV_NEWTON_THRESHOLD 244 -#define INV_APPR_THRESHOLD 98 +#define INV_NEWTON_THRESHOLD 182 +#define INV_APPR_THRESHOLD 93 -#define BINV_NEWTON_THRESHOLD 276 -#define REDC_1_TO_REDC_N_THRESHOLD 63 +#define BINV_NEWTON_THRESHOLD 296 +#define REDC_1_TO_REDC_N_THRESHOLD 66 #define MU_DIV_QR_THRESHOLD 2350 -#define MU_DIVAPPR_Q_THRESHOLD 2172 -#define MUPI_DIV_QR_THRESHOLD 48 -#define MU_BDIV_QR_THRESHOLD 1858 -#define MU_BDIV_Q_THRESHOLD 2172 - -#define MATRIX22_STRASSEN_THRESHOLD 29 -#define HGCD_THRESHOLD 81 -#define GCD_DC_THRESHOLD 309 +#define MU_DIVAPPR_Q_THRESHOLD 2130 +#define MUPI_DIV_QR_THRESHOLD 71 +#define MU_BDIV_QR_THRESHOLD 2130 +#define MU_BDIV_Q_THRESHOLD 2130 + +#define MATRIX22_STRASSEN_THRESHOLD 24 +#define HGCD_THRESHOLD 77 +#define HGCD_APPR_THRESHOLD 91 +#define HGCD_REDUCE_THRESHOLD 5010 +#define GCD_DC_THRESHOLD 327 #define GCDEXT_DC_THRESHOLD 253 #define JACOBI_BASE_METHOD 4 -#define GET_STR_DC_THRESHOLD 10 -#define GET_STR_PRECOMPUTE_THRESHOLD 25 -#define SET_STR_DC_THRESHOLD 118 -#define SET_STR_PRECOMPUTE_THRESHOLD 1099 +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 26 +#define SET_STR_DC_THRESHOLD 144 +#define SET_STR_PRECOMPUTE_THRESHOLD 979 diff --git 
a/mpn/x86_64/atom/gmp-mparam.h b/mpn/x86_64/atom/gmp-mparam.h index 37ddcebc2..380f36f25 100644 --- a/mpn/x86_64/atom/gmp-mparam.h +++ b/mpn/x86_64/atom/gmp-mparam.h @@ -31,14 +31,15 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MOD_1_NORM_THRESHOLD 0 /* always */ #define MOD_1_UNNORM_THRESHOLD 0 /* always */ -#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 #define MOD_1U_TO_MOD_1_1_THRESHOLD 3 #define MOD_1_1_TO_MOD_1_2_THRESHOLD MP_SIZE_T_MAX #define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11 #define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always (native) */ -#define BMOD_1_TO_MOD_1_THRESHOLD 17 +#define BMOD_1_TO_MOD_1_THRESHOLD 16 #define MUL_TOOM22_THRESHOLD 10 #define MUL_TOOM33_THRESHOLD 65 @@ -58,9 +59,13 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define SQR_TOOM6_THRESHOLD 222 #define SQR_TOOM8_THRESHOLD 333 +#define MULMID_TOOM42_THRESHOLD 14 + #define MULMOD_BNM1_THRESHOLD 7 #define SQRMOD_BNM1_THRESHOLD 12 +#define POWM_SEC_TABLE 2,31,213,724,2112 + #define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */ #define MUL_FFT_TABLE3 \ { { 220, 5}, { 7, 4}, { 15, 5}, { 13, 6}, \ @@ -145,9 +150,11 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MU_BDIV_Q_THRESHOLD 748 #define MATRIX22_STRASSEN_THRESHOLD 13 -#define HGCD_THRESHOLD 82 +#define HGCD_THRESHOLD 79 +#define HGCD_APPR_THRESHOLD 83 +#define HGCD_REDUCE_THRESHOLD 1137 #define GCD_DC_THRESHOLD 186 -#define GCDEXT_DC_THRESHOLD 186 +#define GCDEXT_DC_THRESHOLD 189 #define JACOBI_BASE_METHOD 4 #define GET_STR_DC_THRESHOLD 15 diff --git a/mpn/x86_64/bobcat/gmp-mparam.h b/mpn/x86_64/bobcat/gmp-mparam.h index f1edb1d36..5acb78a62 100644 --- a/mpn/x86_64/bobcat/gmp-mparam.h +++ b/mpn/x86_64/bobcat/gmp-mparam.h @@ -58,6 +58,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MULMOD_BNM1_THRESHOLD 11 #define SQRMOD_BNM1_THRESHOLD 15 +#define POWM_SEC_TABLE 2,23,322,840 + #define MUL_FFT_MODF_THRESHOLD 376 /* k = 5 */ #define MUL_FFT_TABLE3 \ { { 376, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ @@ -145,9 +147,11 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MU_BDIV_Q_THRESHOLD 1308 #define MATRIX22_STRASSEN_THRESHOLD 14 -#define HGCD_THRESHOLD 103 -#define GCD_DC_THRESHOLD 469 -#define GCDEXT_DC_THRESHOLD 290 +#define HGCD_THRESHOLD 105 +#define HGCD_APPR_THRESHOLD 113 +#define HGCD_REDUCE_THRESHOLD 2479 +#define GCD_DC_THRESHOLD 330 +#define GCDEXT_DC_THRESHOLD 306 #define JACOBI_BASE_METHOD 4 #define GET_STR_DC_THRESHOLD 17 diff --git a/mpn/x86_64/core2/gmp-mparam.h b/mpn/x86_64/core2/gmp-mparam.h index 43adaa078..0752688fd 100644 --- a/mpn/x86_64/core2/gmp-mparam.h +++ b/mpn/x86_64/core2/gmp-mparam.h @@ -31,14 +31,15 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MOD_1_2_TO_MOD_1_4_THRESHOLD 16 #define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7 #define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always (native) */ #define BMOD_1_TO_MOD_1_THRESHOLD 26 #define MUL_TOOM22_THRESHOLD 23 #define MUL_TOOM33_THRESHOLD 65 -#define MUL_TOOM44_THRESHOLD 178 -#define MUL_TOOM6H_THRESHOLD 222 -#define MUL_TOOM8H_THRESHOLD 0 +#define MUL_TOOM44_THRESHOLD 169 +#define MUL_TOOM6H_THRESHOLD 254 +#define MUL_TOOM8H_THRESHOLD 357 #define MUL_TOOM32_TO_TOOM43_THRESHOLD 69 #define MUL_TOOM32_TO_TOOM53_THRESHOLD 107 @@ -48,15 +49,17 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define SQR_BASECASE_THRESHOLD 0 /* always (native) */ #define SQR_TOOM2_THRESHOLD 26 #define SQR_TOOM3_THRESHOLD 85 -#define SQR_TOOM4_THRESHOLD 160 -#define SQR_TOOM6_THRESHOLD 218 -#define SQR_TOOM8_THRESHOLD 296 +#define SQR_TOOM4_THRESHOLD 226 +#define SQR_TOOM6_THRESHOLD 0 /* always */ +#define SQR_TOOM8_THRESHOLD 454 #define MULMID_TOOM42_THRESHOLD 24 #define MULMOD_BNM1_THRESHOLD 15 #define SQRMOD_BNM1_THRESHOLD 15 +#define POWM_SEC_TABLE 2,41,322,840,1100,1556 + #define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */ #define MUL_FFT_TABLE3 \ { { 380, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ @@ -156,8 +159,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define SQR_FFT_THRESHOLD 2752 #define MULLO_BASECASE_THRESHOLD 3 -#define MULLO_DC_THRESHOLD 20 -#define MULLO_MUL_N_THRESHOLD 10950 +#define MULLO_DC_THRESHOLD 18 +#define MULLO_MUL_N_THRESHOLD 9174 #define DC_DIV_QR_THRESHOLD 47 #define DC_DIVAPPR_Q_THRESHOLD 179 @@ -180,11 +183,13 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MATRIX22_STRASSEN_THRESHOLD 18 #define HGCD_THRESHOLD 135 +#define HGCD_APPR_THRESHOLD 169 +#define HGCD_REDUCE_THRESHOLD 2121 #define GCD_DC_THRESHOLD 330 #define GCDEXT_DC_THRESHOLD 361 #define JACOBI_BASE_METHOD 4 #define GET_STR_DC_THRESHOLD 13 #define GET_STR_PRECOMPUTE_THRESHOLD 23 -#define SET_STR_DC_THRESHOLD 746 +#define SET_STR_DC_THRESHOLD 552 #define SET_STR_PRECOMPUTE_THRESHOLD 1893 diff --git a/mpn/x86_64/coreinhm/gmp-mparam.h b/mpn/x86_64/coreinhm/gmp-mparam.h index eec17787d..90cfa2be4 100644 --- a/mpn/x86_64/coreinhm/gmp-mparam.h +++ b/mpn/x86_64/coreinhm/gmp-mparam.h @@ -31,6 +31,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MOD_1_2_TO_MOD_1_4_THRESHOLD 15 #define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7 #define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always (native) */ #define BMOD_1_TO_MOD_1_THRESHOLD 17 @@ -55,6 +56,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MULMOD_BNM1_THRESHOLD 13 #define SQRMOD_BNM1_THRESHOLD 13 +#define POWM_SEC_TABLE 2,65,322,1084 + #define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */ #define MUL_FFT_TABLE3 \ { { 380, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ @@ -112,8 +115,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DC_BDIV_QR_THRESHOLD 32 #define DC_BDIV_Q_THRESHOLD 70 -#define INV_MULMOD_BNM1_THRESHOLD 46 -#define INV_NEWTON_THRESHOLD 195 +#define INV_MULMOD_BNM1_THRESHOLD 34 +#define INV_NEWTON_THRESHOLD 177 #define INV_APPR_THRESHOLD 147 #define BINV_NEWTON_THRESHOLD 252 @@ -126,13 +129,15 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MU_BDIV_QR_THRESHOLD 1120 #define MU_BDIV_Q_THRESHOLD 1187 -#define MATRIX22_STRASSEN_THRESHOLD 17 -#define HGCD_THRESHOLD 117 -#define GCD_DC_THRESHOLD 330 -#define GCDEXT_DC_THRESHOLD 382 +#define MATRIX22_STRASSEN_THRESHOLD 15 +#define HGCD_THRESHOLD 126 +#define HGCD_APPR_THRESHOLD 171 +#define HGCD_REDUCE_THRESHOLD 2205 +#define GCD_DC_THRESHOLD 345 +#define GCDEXT_DC_THRESHOLD 386 #define JACOBI_BASE_METHOD 4 -#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_DC_THRESHOLD 15 #define GET_STR_PRECOMPUTE_THRESHOLD 20 -#define SET_STR_DC_THRESHOLD 552 -#define SET_STR_PRECOMPUTE_THRESHOLD 1655 +#define SET_STR_DC_THRESHOLD 232 +#define SET_STR_PRECOMPUTE_THRESHOLD 1585 diff --git a/mpn/x86_64/coreisbr/gmp-mparam.h b/mpn/x86_64/coreisbr/gmp-mparam.h index e4727116b..dab35f174 100644 --- a/mpn/x86_64/coreisbr/gmp-mparam.h +++ b/mpn/x86_64/coreisbr/gmp-mparam.h @@ -29,8 +29,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MOD_1U_TO_MOD_1_1_THRESHOLD 3 #define MOD_1_1_TO_MOD_1_2_THRESHOLD 9 #define MOD_1_2_TO_MOD_1_4_THRESHOLD 20 -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 6 #define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always (native) */ #define BMOD_1_TO_MOD_1_THRESHOLD 30 @@ -52,58 +53,119 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define SQR_TOOM6_THRESHOLD 0 #define SQR_TOOM8_THRESHOLD 458 -#define MULMOD_BNM1_THRESHOLD 11 -#define SQRMOD_BNM1_THRESHOLD 16 +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 14 -#define MUL_FFT_MODF_THRESHOLD 376 /* k = 5 */ +#define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */ #define MUL_FFT_TABLE3 \ - { { 376, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ - { 10, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ - { 21, 7}, { 11, 6}, { 23, 7}, { 13, 6}, \ - { 27, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ + { { 380, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \ { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ { 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ - { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \ + { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \ { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ - { 135,11}, { 79,10}, { 167,11}, { 95,10}, \ - { 191, 9}, { 383,12}, { 63,11}, { 127,10}, \ - { 255, 9}, { 511,10}, { 271,11}, { 143,10}, \ - { 287, 9}, { 575,11}, { 159,10}, { 319,12}, \ - { 95,11}, { 191,10}, { 383,11}, { 207,13}, \ - { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 135,11}, { 79,10}, { 159, 9}, { 319,10}, \ + { 167,11}, { 95,10}, { 191, 9}, { 383,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271,11}, { 143,10}, { 287, 9}, { 575,10}, \ + { 303,11}, { 159,10}, { 319,12}, { 95,11}, \ + { 191,10}, { 383,11}, { 207,10}, { 415,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,11}, \ + { 303,10}, { 607,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 351,10}, { 703, 9}, { 1407,11}, \ + { 367,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,10}, { 831,12}, { 223,11}, { 447,10}, \ + { 895,13}, { 127,12}, { 255,11}, { 543,10}, \ + { 1087,12}, { 287,11}, { 
575,10}, { 1151,11}, \ + { 607,12}, { 319,11}, { 639,12}, { 351,11}, \ + { 703,10}, { 1407,11}, { 735,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,10}, \ + { 1663,12}, { 447,11}, { 895,14}, { 127,13}, \ + { 255,12}, { 511,11}, { 1023,12}, { 543,11}, \ + { 1087,12}, { 575,11}, { 1151,12}, { 607,11}, \ + { 1215,13}, { 319,12}, { 639,11}, { 1279,12}, \ + { 703,11}, { 1407,13}, { 383,12}, { 767,11}, \ + { 1535,12}, { 831,11}, { 1663,13}, { 447,12}, \ + { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \ + { 1087,13}, { 575,12}, { 1215,11}, { 2431,13}, \ + { 639,12}, { 1279,13}, { 703,12}, { 1407,14}, \ + { 383,13}, { 831,12}, { 1663,13}, { 959,12}, \ + { 1919,14}, { 511,13}, { 1087,12}, { 2175,13}, \ + { 1215,12}, { 2431,14}, { 639,13}, { 1343,12}, \ + { 2687,13}, { 1407,12}, { 2815,13}, { 1471,14}, \ + { 767,13}, { 1663,14}, { 895,13}, { 1919,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \ + { 1407,13}, { 2815,15}, { 767,14}, { 1663,13}, \ + { 3455,14}, { 1919,13}, { 3839,16}, { 511,15}, \ + { 1023,14}, { 2431,13}, { 4863,15}, { 1279,14}, \ + { 2943,13}, { 5887,15}, { 32768,16}, { 65536,17}, \ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ {2097152,22}, {4194304,23}, {8388608,24} } -#define MUL_FFT_TABLE3_SIZE 83 -#define MUL_FFT_THRESHOLD 3712 +#define MUL_FFT_TABLE3_SIZE 203 +#define MUL_FFT_THRESHOLD 4736 -#define SQR_FFT_MODF_THRESHOLD 316 /* k = 5 */ +#define SQR_FFT_MODF_THRESHOLD 304 /* k = 5 */ #define SQR_FFT_TABLE3 \ - { { 316, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { { 304, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \ - { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \ + { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ - { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ - { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ - { 15,10}, { 31, 9}, { 67,10}, 
{ 39, 9}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \ + { 15,10}, { 31, 9}, { 63,10}, { 39, 9}, \ { 79,10}, { 47,11}, { 31,10}, { 79,11}, \ - { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ - { 127, 9}, { 255,11}, { 79,10}, { 159, 9}, \ - { 319,11}, { 95,10}, { 191, 9}, { 383,12}, \ - { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ - { 271, 9}, { 543,11}, { 143,10}, { 287, 9}, \ - { 575,10}, { 303,11}, { 159,10}, { 319, 9}, \ - { 639,12}, { 95,11}, { 191,10}, { 383,11}, \ - { 207,13}, { 8192,14}, { 16384,15}, { 32768,16}, \ - { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ - {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } -#define SQR_FFT_TABLE3_SIZE 76 -#define SQR_FFT_THRESHOLD 3264 + { 47,12}, { 31,11}, { 63,10}, { 127, 9}, \ + { 255, 8}, { 511,10}, { 135,11}, { 79,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \ + { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \ + { 639,12}, { 95,11}, { 191,10}, { 383, 9}, \ + { 767,11}, { 207,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 271,10}, { 543,11}, \ + { 287,10}, { 575,11}, { 303,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 447,10}, { 895,11}, \ + { 479,10}, { 959,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \ + { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \ + { 639,10}, { 1279,12}, { 351,11}, { 703,13}, \ + { 191,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 831,12}, { 447,11}, { 895,12}, { 479,11}, \ + { 959,10}, { 1919,14}, { 127,13}, { 255,12}, \ + { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \ + { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \ + { 639,11}, { 1279,12}, { 703,11}, { 1407,13}, \ + { 383,12}, { 767,11}, { 1535,12}, { 831,13}, \ + { 447,12}, { 959,11}, { 1919,14}, { 255,13}, \ + { 511,12}, { 1087,13}, { 575,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1279,13}, { 
703,12}, \ + { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 959,12}, { 1919,15}, \ + { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \ + { 1215,12}, { 2431,14}, { 639,13}, { 1343,12}, \ + { 2687,13}, { 1407,12}, { 2815,13}, { 1471,14}, \ + { 767,13}, { 1663,14}, { 895,13}, { 1919,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \ + { 1407,13}, { 2815,15}, { 767,14}, { 1663,13}, \ + { 3455,14}, { 1919,16}, { 511,15}, { 1023,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \ + { 5887,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 198 +#define SQR_FFT_THRESHOLD 2752 #define MULLO_BASECASE_THRESHOLD 5 #define MULLO_DC_THRESHOLD 33 diff --git a/mpn/x86_64/gmp-mparam.h b/mpn/x86_64/gmp-mparam.h index 99499da2b..b16ff5a6b 100644 --- a/mpn/x86_64/gmp-mparam.h +++ b/mpn/x86_64/gmp-mparam.h @@ -30,6 +30,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MOD_1_2_TO_MOD_1_4_THRESHOLD 28 #define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7 #define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always (native) */ #define BMOD_1_TO_MOD_1_THRESHOLD 15 @@ -56,6 +57,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MULMOD_BNM1_THRESHOLD 17 #define SQRMOD_BNM1_THRESHOLD 17 +#define POWM_SEC_TABLE 2,67,322,991 + #define MUL_FFT_MODF_THRESHOLD 570 /* k = 5 */ #define MUL_FFT_TABLE3 \ { { 570, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ @@ -187,10 +190,12 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MU_BDIV_QR_THRESHOLD 1589 #define MU_BDIV_Q_THRESHOLD 1718 -#define MATRIX22_STRASSEN_THRESHOLD 17 -#define HGCD_THRESHOLD 139 -#define GCD_DC_THRESHOLD 606 -#define GCDEXT_DC_THRESHOLD 474 +#define MATRIX22_STRASSEN_THRESHOLD 16 +#define HGCD_THRESHOLD 125 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 3524 +#define GCD_DC_THRESHOLD 555 +#define GCDEXT_DC_THRESHOLD 478 #define JACOBI_BASE_METHOD 4 #define GET_STR_DC_THRESHOLD 12 diff --git a/mpn/x86_64/nano/gmp-mparam.h b/mpn/x86_64/nano/gmp-mparam.h index a1c556937..7ee41927b 100644 --- a/mpn/x86_64/nano/gmp-mparam.h +++ b/mpn/x86_64/nano/gmp-mparam.h @@ -34,6 +34,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 #define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 #define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always (native) */ #define BMOD_1_TO_MOD_1_THRESHOLD 22 @@ -50,13 +51,17 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define SQR_BASECASE_THRESHOLD 0 /* always (native) */ #define SQR_TOOM2_THRESHOLD 34 -#define SQR_TOOM3_THRESHOLD 74 -#define SQR_TOOM4_THRESHOLD 620 -#define SQR_TOOM6_THRESHOLD 960 -#define SQR_TOOM8_THRESHOLD 1065 +#define SQR_TOOM3_THRESHOLD 97 +#define SQR_TOOM4_THRESHOLD 592 +#define SQR_TOOM6_THRESHOLD 978 +#define SQR_TOOM8_THRESHOLD 1193 -#define MULMOD_BNM1_THRESHOLD 15 -#define SQRMOD_BNM1_THRESHOLD 17 +#define MULMID_TOOM42_THRESHOLD 28 + +#define MULMOD_BNM1_THRESHOLD 16 +#define SQRMOD_BNM1_THRESHOLD 20 + +#define POWM_SEC_TABLE 2,29,387,1421 #define MUL_FFT_MODF_THRESHOLD 376 /* k = 5 */ #define MUL_FFT_TABLE3 \ @@ -176,7 +181,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define SQR_FFT_TABLE3_SIZE 215 #define SQR_FFT_THRESHOLD 3264 -#define MULLO_BASECASE_THRESHOLD 17 +#define MULLO_BASECASE_THRESHOLD 8 #define MULLO_DC_THRESHOLD 0 /* never mpn_mullo_basecase */ #define MULLO_MUL_N_THRESHOLD 6633 @@ -190,7 +195,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define INV_APPR_THRESHOLD 153 #define BINV_NEWTON_THRESHOLD 182 -#define REDC_1_TO_REDC_2_THRESHOLD 14 +#define REDC_1_TO_REDC_2_THRESHOLD 20 #define REDC_2_TO_REDC_N_THRESHOLD 75 #define MU_DIV_QR_THRESHOLD 1589 @@ -200,12 +205,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MU_BDIV_Q_THRESHOLD 1528 #define MATRIX22_STRASSEN_THRESHOLD 17 -#define HGCD_THRESHOLD 84 -#define GCD_DC_THRESHOLD 465 -#define GCDEXT_DC_THRESHOLD 456 +#define HGCD_THRESHOLD 102 +#define HGCD_APPR_THRESHOLD 113 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 706 +#define GCDEXT_DC_THRESHOLD 465 #define JACOBI_BASE_METHOD 4 #define GET_STR_DC_THRESHOLD 12 #define GET_STR_PRECOMPUTE_THRESHOLD 24 -#define SET_STR_DC_THRESHOLD 537 -#define SET_STR_PRECOMPUTE_THRESHOLD 1639 +#define SET_STR_DC_THRESHOLD 381 +#define SET_STR_PRECOMPUTE_THRESHOLD 1794 diff --git a/mpn/x86_64/pentium4/gmp-mparam.h b/mpn/x86_64/pentium4/gmp-mparam.h index 8983304c2..4d49fc2cf 100644 --- a/mpn/x86_64/pentium4/gmp-mparam.h +++ b/mpn/x86_64/pentium4/gmp-mparam.h @@ -33,34 +33,39 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MOD_1_NORM_THRESHOLD 0 /* always */ #define MOD_1_UNNORM_THRESHOLD 0 /* always */ #define MOD_1N_TO_MOD_1_1_THRESHOLD 4 -#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 -#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14 -#define MOD_1_2_TO_MOD_1_4_THRESHOLD 32 -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 2 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 15 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 38 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 #define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always (native) */ #define BMOD_1_TO_MOD_1_THRESHOLD 20 #define MUL_TOOM22_THRESHOLD 12 -#define MUL_TOOM33_THRESHOLD 66 +#define MUL_TOOM33_THRESHOLD 74 #define MUL_TOOM44_THRESHOLD 118 #define MUL_TOOM6H_THRESHOLD 157 -#define MUL_TOOM8H_THRESHOLD 242 +#define MUL_TOOM8H_THRESHOLD 430 #define MUL_TOOM32_TO_TOOM43_THRESHOLD 81 -#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91 -#define MUL_TOOM42_TO_TOOM53_THRESHOLD 81 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 145 #define MUL_TOOM42_TO_TOOM63_THRESHOLD 80 #define SQR_BASECASE_THRESHOLD 0 /* always (native) */ #define SQR_TOOM2_THRESHOLD 20 -#define SQR_TOOM3_THRESHOLD 77 -#define SQR_TOOM4_THRESHOLD 214 +#define SQR_TOOM3_THRESHOLD 69 +#define SQR_TOOM4_THRESHOLD 202 #define SQR_TOOM6_THRESHOLD 254 -#define SQR_TOOM8_THRESHOLD 454 +#define SQR_TOOM8_THRESHOLD 418 + +#define MULMID_TOOM42_THRESHOLD 19 #define MULMOD_BNM1_THRESHOLD 10 -#define SQRMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 9 + +#define POWM_SEC_TABLE 3,130,140,724,2316 #define MUL_FFT_MODF_THRESHOLD 236 /* k = 5 */ #define MUL_FFT_TABLE3 \ @@ -121,11 +126,11 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MULLO_BASECASE_THRESHOLD 0 /* always */ #define MULLO_DC_THRESHOLD 32 -#define MULLO_MUL_N_THRESHOLD 5397 +#define MULLO_MUL_N_THRESHOLD 6253 -#define DC_DIV_QR_THRESHOLD 28 -#define DC_DIVAPPR_Q_THRESHOLD 67 -#define DC_BDIV_QR_THRESHOLD 27 +#define DC_DIV_QR_THRESHOLD 32 +#define DC_DIVAPPR_Q_THRESHOLD 60 +#define DC_BDIV_QR_THRESHOLD 26 #define DC_BDIV_Q_THRESHOLD 49 #define INV_MULMOD_BNM1_THRESHOLD 22 @@ -133,8 +138,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define INV_APPR_THRESHOLD 101 #define BINV_NEWTON_THRESHOLD 199 -#define REDC_1_TO_REDC_2_THRESHOLD 13 -#define REDC_2_TO_REDC_N_THRESHOLD 44 +#define REDC_1_TO_REDC_2_THRESHOLD 23 +#define REDC_2_TO_REDC_N_THRESHOLD 42 #define MU_DIV_QR_THRESHOLD 979 #define MU_DIVAPPR_Q_THRESHOLD 979 @@ -143,12 +148,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MU_BDIV_Q_THRESHOLD 979 #define MATRIX22_STRASSEN_THRESHOLD 17 -#define HGCD_THRESHOLD 101 -#define GCD_DC_THRESHOLD 222 -#define GCDEXT_DC_THRESHOLD 222 +#define HGCD_THRESHOLD 99 +#define HGCD_APPR_THRESHOLD 117 +#define HGCD_REDUCE_THRESHOLD 1679 +#define GCD_DC_THRESHOLD 198 +#define GCDEXT_DC_THRESHOLD 233 #define JACOBI_BASE_METHOD 4 #define GET_STR_DC_THRESHOLD 12 #define GET_STR_PRECOMPUTE_THRESHOLD 26 -#define SET_STR_DC_THRESHOLD 248 +#define SET_STR_DC_THRESHOLD 422 #define SET_STR_PRECOMPUTE_THRESHOLD 1438 -- cgit v1.2.1 From a7466d9e0e147ffcb964e987d207562306da48b5 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Sun, 20 Nov 2011 21:47:49 +0100 Subject: Configure improvements powerpc64 with abi=32. --- ChangeLog | 7 +++++++ configure.in | 14 ++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/ChangeLog b/ChangeLog index 658930906..2e4b53904 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2011-11-20 Torbjorn Granlund + + * configure.in: Pass -m32 for powerpc64 with abi=32, using via _maybe + mechanism. 
+ + * configure.in: Support powerpc32/p3-p7 directory for affected CPUs. + 2011-11-17 Torbjorn Granlund * tune/speed.c (routine): Add mpn_tabselect. diff --git a/configure.in b/configure.in index 887975c40..6427ec3dd 100644 --- a/configure.in +++ b/configure.in @@ -919,7 +919,7 @@ case $host in powerpc620) gcc_cflags_cpu="-mcpu=620" ;; powerpc630) gcc_cflags_cpu="-mcpu=630" xlc_cflags_arch="-qarch=pwr3" - cpu_path="p3" ;; + cpu_path="p3 p3-p7" ;; powerpc740) gcc_cflags_cpu="-mcpu=740" ;; powerpc7400 | powerpc7410) gcc_cflags_asm="-Wa,-maltivec" @@ -935,19 +935,19 @@ case $host in powerpc970) gcc_cflags_cpu="-mtune=970" xlc_cflags_arch="-qarch=970 -qarch=pwr3" vmx_path="powerpc64/vmx" - cpu_path="p4" ;; + cpu_path="p4 p3-p7" ;; power4) gcc_cflags_cpu="-mtune=power4" xlc_cflags_arch="-qarch=pwr4" - cpu_path="p4" ;; + cpu_path="p4 p3-p7" ;; power5) gcc_cflags_cpu="-mtune=power5 -mtune=power4" xlc_cflags_arch="-qarch=pwr5" - cpu_path="p5 p4" ;; + cpu_path="p5 p4 p3-p7" ;; power6) gcc_cflags_cpu="-mtune=power6" xlc_cflags_arch="-qarch=pwr6" - cpu_path="p6" ;; + cpu_path="p6 p3-p7" ;; power7) gcc_cflags_cpu="-mtune=power7 -mtune=power5" xlc_cflags_arch="-qarch=pwr7 -qarch=pwr5" - cpu_path="p7 p5 p4" ;; + cpu_path="p7 p5 p4 p3-p7" ;; esac case $host in @@ -1012,6 +1012,7 @@ case $host in # incompatible with a shared library. # abilist="mode64 mode32 $abilist" + gcc_32_cflags_maybe="-m32" gcc_cflags_opt="-O3 -O2 -O1" # will this become used? cclist_mode32="gcc" gcc_mode32_cflags_maybe="-m32" @@ -1057,6 +1058,7 @@ case $host in # 64-bits. # abilist="mode64 mode32 $abilist" + gcc_32_cflags_maybe="-m32" cclist_mode32="gcc" gcc_mode32_cflags_maybe="-m32" gcc_mode32_cflags="-mpowerpc64" -- cgit v1.2.1 From 952803d3c43dcacfbd001d5fed37b32316b529dd Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Sun, 20 Nov 2011 21:49:10 +0100 Subject: Provide special powerpc64 add_n/sub_n abi=32 code. 
--- ChangeLog | 2 + mpn/powerpc32/p3-p7/aors_n.asm | 176 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 178 insertions(+) create mode 100644 mpn/powerpc32/p3-p7/aors_n.asm diff --git a/ChangeLog b/ChangeLog index 2e4b53904..420ae5f4e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,7 @@ 2011-11-20 Torbjorn Granlund + * mpn/powerpc32/p3-p7/aors_n.asm: New file. + * configure.in: Pass -m32 for powerpc64 with abi=32, using via _maybe mechanism. diff --git a/mpn/powerpc32/p3-p7/aors_n.asm b/mpn/powerpc32/p3-p7/aors_n.asm new file mode 100644 index 000000000..6999182a8 --- /dev/null +++ b/mpn/powerpc32/p3-p7/aors_n.asm @@ -0,0 +1,176 @@ +dnl PowerPC-32 mpn_add_n/mpn_sub_n -- mpn addition and subtraction. + +dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007, 2011 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 1.5 +C POWER4/PPC970 2 +C POWER5 2 +C POWER6 2.78 +C POWER7 2.15-2.87 + +C This code is based on powerpc64/aors_n.asm. 
+ +C INPUT PARAMETERS +C rp r3 +C up r4 +C vp r5 +C n r6 + +ifdef(`OPERATION_add_n',` + define(ADDSUBC, adde) + define(ADDSUB, addc) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc) + define(GENRVAL, `addi r3, r3, 1') + define(SETCBR, `addic r0, $1, -1') + define(CLRCB, `addic r0, r0, 0') +') +ifdef(`OPERATION_sub_n',` + define(ADDSUBC, subfe) + define(ADDSUB, subfc) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc) + define(GENRVAL, `neg r3, r3') + define(SETCBR, `subfic r0, $1, 0') + define(CLRCB, `addic r0, r1, -1') +') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ASM_START() +PROLOGUE(func_nc) + SETCBR(r7) + b L(ent) +EPILOGUE() + +PROLOGUE(func) + CLRCB +L(ent): stw r31, -4(r1) + stw r30, -8(r1) + stw r29, -12(r1) + stw r28, -16(r1) + + rlwinm. r0, r6, 0,30,31 C r0 = n & 3, set cr0 + cmpwi cr6, r0, 2 + addi r6, r6, 3 C compute count... + srwi r6, r6, 2 C ...for ctr + mtctr r6 C copy count into ctr + beq cr0, L(b00) + blt cr6, L(b01) + beq cr6, L(b10) + +L(b11): lwz r8, 0(r4) C load s1 limb + lwz r9, 0(r5) C load s2 limb + lwz r10, 4(r4) C load s1 limb + lwz r11, 4(r5) C load s2 limb + lwz r12, 8(r4) C load s1 limb + addi r4, r4, 12 + lwz r0, 8(r5) C load s2 limb + addi r5, r5, 12 + ADDSUBC r29, r9, r8 + ADDSUBC r30, r11, r10 + ADDSUBC r31, r0, r12 + stw r29, 0(r3) + stw r30, 4(r3) + stw r31, 8(r3) + addi r3, r3, 12 + bdnz L(go) + b L(ret) + +L(b01): lwz r12, 0(r4) C load s1 limb + addi r4, r4, 4 + lwz r0, 0(r5) C load s2 limb + addi r5, r5, 4 + ADDSUBC r31, r0, r12 C add + stw r31, 0(r3) + addi r3, r3, 4 + bdnz L(go) + b L(ret) + +L(b10): lwz r10, 0(r4) C load s1 limb + lwz r11, 0(r5) C load s2 limb + lwz r12, 4(r4) C load s1 limb + addi r4, r4, 8 + lwz r0, 4(r5) C load s2 limb + addi r5, r5, 8 + ADDSUBC r30, r11, r10 C add + ADDSUBC r31, r0, r12 C add + stw r30, 0(r3) + stw r31, 4(r3) + addi r3, r3, 8 + bdnz L(go) + b L(ret) + +L(b00): C INITCY C clear/set cy +L(go): lwz r6, 0(r4) C load s1 limb + lwz r7, 0(r5) C load s2 limb 
+ lwz r8, 4(r4) C load s1 limb + lwz r9, 4(r5) C load s2 limb + lwz r10, 8(r4) C load s1 limb + lwz r11, 8(r5) C load s2 limb + lwz r12, 12(r4) C load s1 limb + lwz r0, 12(r5) C load s2 limb + bdz L(end) + + addi r4, r4, 16 + addi r5, r5, 16 + + ALIGN(16) +L(top): ADDSUBC r28, r7, r6 + lwz r6, 0(r4) C load s1 limb + lwz r7, 0(r5) C load s2 limb + ADDSUBC r29, r9, r8 + lwz r8, 4(r4) C load s1 limb + lwz r9, 4(r5) C load s2 limb + ADDSUBC r30, r11, r10 + lwz r10, 8(r4) C load s1 limb + lwz r11, 8(r5) C load s2 limb + ADDSUBC r31, r0, r12 + lwz r12, 12(r4) C load s1 limb + lwz r0, 12(r5) C load s2 limb + stw r28, 0(r3) + addi r4, r4, 16 + stw r29, 4(r3) + addi r5, r5, 16 + stw r30, 8(r3) + stw r31, 12(r3) + addi r3, r3, 16 + bdnz L(top) C decrement ctr and loop back + +L(end): ADDSUBC r28, r7, r6 + ADDSUBC r29, r9, r8 + ADDSUBC r30, r11, r10 + ADDSUBC r31, r0, r12 + stw r28, 0(r3) + stw r29, 4(r3) + stw r30, 8(r3) + stw r31, 12(r3) + +L(ret): lwz r31, -4(r1) + lwz r30, -8(r1) + lwz r29, -12(r1) + lwz r28, -16(r1) + + subfe r3, r0, r0 C -cy + GENRVAL + blr +EPILOGUE() -- cgit v1.2.1 From 1c9f3475308f9c3ae0b811566c4c88650128b772 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Sun, 20 Nov 2011 22:55:07 +0100 Subject: Split x86 CPUs into more subtypes for more accurate passing of gcc flags. --- ChangeLog | 3 +++ configure.in | 20 ++++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index 420ae5f4e..bca740a7f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,8 @@ 2011-11-20 Torbjorn Granlund + * configure.in: Split x86 CPUs into more subtypes for more accurate + passing of gcc flags. + * mpn/powerpc32/p3-p7/aors_n.asm: New file. 
* configure.in: Pass -m32 for powerpc64 with abi=32, using via _maybe diff --git a/configure.in b/configure.in index 6427ec3dd..186d4b576 100644 --- a/configure.in +++ b/configure.in @@ -1536,14 +1536,30 @@ case $host in gcc_cflags_cpu="-mtune=c3 -mcpu=c3 -mcpu=i486 -m486" gcc_cflags_arch="-march=c3 -march=pentium-mmx -march=pentium" ;; - athlon64 | k8 | k10 | bobcat | bulldozer | x86_64) + athlon64 | k8 | x86_64) gcc_cflags_cpu="-mtune=k8 -mcpu=athlon -mcpu=pentiumpro -mcpu=i486 -m486" gcc_cflags_arch="-march=k8 -march=k8~-mno-sse2 -march=athlon -march=pentiumpro -march=pentium" ;; - core2 | corei | coreinhm | coreiwsm | coreisbr) + k10) + gcc_cflags_cpu="-mtune=amdfam10 -mtune=k8" + gcc_cflags_arch="-march=amdfam10 -mtune=k8 -march=k8~-mno-sse2" + ;; + bobcat) + gcc_cflags_cpu="-mtune=btver1 -mtune=amdfam10 -mtune=k8" + gcc_cflags_arch="-march=btver1 -march=amdfam10 -mtune=k8 -march=k8~-mno-sse2" + ;; + bulldozer) + gcc_cflags_cpu="-mtune=bdver1 -mtune=amdfam10 -mtune=k8" + gcc_cflags_arch="-march=bdver1 -march=amdfam10 -mtune=k8 -march=k8~-mno-sse2" + ;; + core2) gcc_cflags_cpu="-mtune=core2 -mtune=k8" gcc_cflags_arch="-march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2" ;; + corei | coreinhm | coreiwsm | coreisbr) + gcc_cflags_cpu="-mtune=corei7 -mtune=core2 -mtune=k8" + gcc_cflags_arch="-march=corei7 -march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2" + ;; atom) gcc_cflags_cpu="-mtune=atom -mtune=pentium3" gcc_cflags_arch="-march=atom -march=pentium3" -- cgit v1.2.1 From cda511a97523ac223432c6767e11b40e95e157e4 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Sun, 20 Nov 2011 22:56:02 +0100 Subject: Retune. 
--- mpn/x86_64/coreisbr/gmp-mparam.h | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/mpn/x86_64/coreisbr/gmp-mparam.h b/mpn/x86_64/coreisbr/gmp-mparam.h index dab35f174..c30c64ec8 100644 --- a/mpn/x86_64/coreisbr/gmp-mparam.h +++ b/mpn/x86_64/coreisbr/gmp-mparam.h @@ -53,9 +53,13 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define SQR_TOOM6_THRESHOLD 0 #define SQR_TOOM8_THRESHOLD 458 -#define MULMOD_BNM1_THRESHOLD 13 +#define MULMID_TOOM42_THRESHOLD 24 + +#define MULMOD_BNM1_THRESHOLD 14 #define SQRMOD_BNM1_THRESHOLD 14 +#define POWM_SEC_TABLE 4,35,130,713,2080 + #define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */ #define MUL_FFT_TABLE3 \ { { 380, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ @@ -176,27 +180,29 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DC_BDIV_QR_THRESHOLD 31 #define DC_BDIV_Q_THRESHOLD 71 -#define INV_MULMOD_BNM1_THRESHOLD 38 -#define INV_NEWTON_THRESHOLD 127 -#define INV_APPR_THRESHOLD 123 +#define INV_MULMOD_BNM1_THRESHOLD 50 +#define INV_NEWTON_THRESHOLD 123 +#define INV_APPR_THRESHOLD 122 -#define BINV_NEWTON_THRESHOLD 181 -#define REDC_1_TO_REDC_2_THRESHOLD 17 -#define REDC_2_TO_REDC_N_THRESHOLD 51 +#define BINV_NEWTON_THRESHOLD 197 +#define REDC_1_TO_REDC_2_THRESHOLD 20 +#define REDC_2_TO_REDC_N_THRESHOLD 54 #define MU_DIV_QR_THRESHOLD 1334 #define MU_DIVAPPR_Q_THRESHOLD 1387 -#define MUPI_DIV_QR_THRESHOLD 57 +#define MUPI_DIV_QR_THRESHOLD 46 #define MU_BDIV_QR_THRESHOLD 1142 #define MU_BDIV_Q_THRESHOLD 1308 #define MATRIX22_STRASSEN_THRESHOLD 15 -#define HGCD_THRESHOLD 90 -#define GCD_DC_THRESHOLD 400 -#define GCDEXT_DC_THRESHOLD 372 +#define HGCD_THRESHOLD 91 +#define HGCD_APPR_THRESHOLD 105 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 358 +#define GCDEXT_DC_THRESHOLD 351 #define JACOBI_BASE_METHOD 4 -#define GET_STR_DC_THRESHOLD 12 -#define GET_STR_PRECOMPUTE_THRESHOLD 21 -#define SET_STR_DC_THRESHOLD 
802 -#define SET_STR_PRECOMPUTE_THRESHOLD 1712 +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 27 +#define SET_STR_DC_THRESHOLD 781 +#define SET_STR_PRECOMPUTE_THRESHOLD 1940 -- cgit v1.2.1 From 15a7619b6229dea0d8d895aaa5506e40304dcb3f Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Mon, 21 Nov 2011 21:03:39 +0100 Subject: (__GNU_MP_RELEASE): Renamed from typo name. --- ChangeLog | 4 ++++ gmp-h.in | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index bca740a7f..4d031a239 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2011-11-21 Torbjorn Granlund + + * gmp-h.in (__GNU_MP_RELEASE): Renamed from typo name. + 2011-11-20 Torbjorn Granlund * configure.in: Split x86 CPUs into more subtypes for more accurate diff --git a/gmp-h.in b/gmp-h.in index 7d6b22926..ba732f5e3 100644 --- a/gmp-h.in +++ b/gmp-h.in @@ -2275,7 +2275,7 @@ enum #define __GNU_MP_VERSION 5 #define __GNU_MP_VERSION_MINOR 0 #define __GNU_MP_VERSION_PATCHLEVEL 90 -#define __GMP_MP_RELEASE (__GNU_MP_VERSION * 10000 + __GNU_MP_VERSION_MINOR * 100 + __GNU_MP_VERSION_PATCHLEVEL) +#define __GNU_MP_RELEASE (__GNU_MP_VERSION * 10000 + __GNU_MP_VERSION_MINOR * 100 + __GNU_MP_VERSION_PATCHLEVEL) #define __GMP_H__ #endif /* __GMP_H__ */ -- cgit v1.2.1 From f24a8deaf598267ea9c57ba93e9e6a94038bc8f3 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Mon, 21 Nov 2011 21:15:18 +0100 Subject: Spacing cleanup. 
--- AUTHORS | 2 +- gmp-h.in | 2 +- gmpxx.h | 2 +- mpn/generic/gcd_subdiv_step.c | 2 +- mpn/generic/hgcd_appr.c | 6 +++--- mpn/generic/hgcd_jacobi.c | 4 ++-- mpn/generic/hgcd_reduce.c | 14 ++++++------- mpn/generic/hgcd_step.c | 2 +- mpn/powerpc64/mode64/aorsmul_1.asm | 2 +- mpn/s390_32/lshift.asm | 2 +- mpn/s390_32/lshiftc.asm | 2 +- mpn/s390_32/rshift.asm | 2 +- mpn/x86/atom/lshift.asm | 4 ++-- mpn/x86/atom/sse2/mul_1.asm | 2 +- mpn/x86/bdiv_dbm1c.asm | 4 ++-- mpn/x86/bdiv_q_1.asm | 2 +- mpn/x86/k7/addlsh1_n.asm | 6 +++--- mpn/x86/k7/invert_limb.asm | 2 +- mpn/x86/k7/sublsh1_n.asm | 8 ++++---- mpn/x86/p6/bdiv_q_1.asm | 4 ++-- mpn/x86/pentium/bdiv_q_1.asm | 2 +- mpn/x86_64/div_qr_2n_pi1.asm | 6 +++--- mpn/x86_64/div_qr_2u_pi1.asm | 6 +++--- mpn/x86_64/mod_1_1.asm | 4 ++-- mpz/jacobi.c | 8 ++++---- tests/cxx/t-ops2.cc | 40 +++++++++++++++++++------------------- tests/devel/try.c | 2 +- tests/mpn/t-hgcd_appr.c | 14 ++++++------- tests/mpn/t-mod_1.c | 2 +- tests/mpn/t-mulmid.c | 2 +- tests/mpz/t-jac.c | 4 ++-- tune/tune-gcd-p.c | 4 ++-- tune/tuneup.c | 2 +- 33 files changed, 85 insertions(+), 85 deletions(-) diff --git a/AUTHORS b/AUTHORS index 170c766e1..f399ce345 100644 --- a/AUTHORS +++ b/AUTHORS @@ -58,5 +58,5 @@ David Harvey mpn/generic/add_err1_n.c, add_err2_n.c, aors_err2_n.asm, aors_err3_n.asm, mulmid_basecase.asm, mpn/x86_64/core2/aors_err1_n.asm. 
- + Martin Boij mpn/generic/perfpow.c diff --git a/gmp-h.in b/gmp-h.in index ba732f5e3..fa3438041 100644 --- a/gmp-h.in +++ b/gmp-h.in @@ -1535,7 +1535,7 @@ __GMP_DECLSPEC mp_limb_t mpn_divrem_2 __GMP_PROTO ((mp_ptr, mp_size_t, mp_ptr, m #define mpn_div_qr_2 __MPN(div_qr_2) __GMP_DECLSPEC mp_limb_t mpn_div_qr_2 __GMP_PROTO ((mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr)); - + #define mpn_gcd __MPN(gcd) __GMP_DECLSPEC mp_size_t mpn_gcd __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_ptr, mp_size_t)); diff --git a/gmpxx.h b/gmpxx.h index e7ef16266..fb4865466 100644 --- a/gmpxx.h +++ b/gmpxx.h @@ -616,7 +616,7 @@ struct __gmp_binary_divides } else #endif - mpz_tdiv_q_ui(z, w, l); + mpz_tdiv_q_ui(z, w, l); } static void eval(mpz_ptr z, unsigned long int l, mpz_srcptr w) { diff --git a/mpn/generic/gcd_subdiv_step.c b/mpn/generic/gcd_subdiv_step.c index 11c00bb6a..3db34073c 100644 --- a/mpn/generic/gcd_subdiv_step.c +++ b/mpn/generic/gcd_subdiv_step.c @@ -185,7 +185,7 @@ mpn_gcd_subdiv_step (mp_ptr ap, mp_ptr bp, mp_size_t n, mp_size_t s, } else MPN_COPY (bp, ap, an); - + MPN_DECR_U (tp, qn, 1); } diff --git a/mpn/generic/hgcd_appr.c b/mpn/generic/hgcd_appr.c index 8454f9da5..f7c7eb2c9 100644 --- a/mpn/generic/hgcd_appr.c +++ b/mpn/generic/hgcd_appr.c @@ -72,7 +72,7 @@ mpn_hgcd_appr (mp_ptr ap, mp_ptr bp, mp_size_t n, we discard some of the least significant limbs, we must keep one additional bit to account for the truncation error. We maintain the GMP_NUMB_BITS * s - extra_bits as the current target size. 
*/ - + s = n/2 + 1; if (BELOW_THRESHOLD (n, HGCD_APPR_THRESHOLD)) { @@ -155,7 +155,7 @@ mpn_hgcd_appr (mp_ptr ap, mp_ptr bp, mp_size_t n, ASSERT (n <= 2*s); nn = mpn_hgcd_step (n, ap, bp, s, M, tp); - + if (!nn) return 1; @@ -249,7 +249,7 @@ mpn_hgcd_appr (mp_ptr ap, mp_ptr bp, mp_size_t n, ASSERT (n <= 2*s); nn = mpn_hgcd_step (n, ap, bp, s, M, tp); - + if (!nn) return success; diff --git a/mpn/generic/hgcd_jacobi.c b/mpn/generic/hgcd_jacobi.c index 2dce43b99..0d4cb021c 100644 --- a/mpn/generic/hgcd_jacobi.c +++ b/mpn/generic/hgcd_jacobi.c @@ -26,7 +26,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #include "longlong.h" /* This file is almost a copy of hgcd.c, with some added calls to - mpn_jacobi_update */ + mpn_jacobi_update */ struct hgcd_jacobi_ctx { @@ -127,7 +127,7 @@ hgcd_jacobi_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s, struct hgcd_jacobi_ctx ctx; ctx.M = M; ctx.bitsp = bitsp; - + return mpn_gcd_subdiv_step (ap, bp, n, s, hgcd_jacobi_hook, &ctx, tp); } } diff --git a/mpn/generic/hgcd_reduce.c b/mpn/generic/hgcd_reduce.c index 142d44a30..89240af4d 100644 --- a/mpn/generic/hgcd_reduce.c +++ b/mpn/generic/hgcd_reduce.c @@ -38,7 +38,7 @@ submul (mp_ptr rp, mp_size_t rn, ASSERT (an >= bn); ASSERT (rn >= an); ASSERT (an + bn <= rn + 1); - + TMP_MARK; tp = TMP_ALLOC_LIMBS (an + bn); @@ -61,7 +61,7 @@ submul (mp_ptr rp, mp_size_t rn, /* FIXME: x Take scratch parameter, and figure out scratch need. - x Use some fallback for small M->n? + x Use some fallback for small M->n? 
*/ static mp_size_t hgcd_matrix_apply (const struct hgcd_matrix *M, @@ -83,7 +83,7 @@ hgcd_matrix_apply (const struct hgcd_matrix *M, MPN_NORMALIZE (ap, an); bn = n; MPN_NORMALIZE (bp, bn); - + for (i = 0; i < 2; i++) for (j = 0; j < 2; j++) { @@ -102,7 +102,7 @@ hgcd_matrix_apply (const struct hgcd_matrix *M, if (mn[0][1] == 0) { mp_size_t qn; - + /* A unchanged, M = (1, 0; q, 1) */ ASSERT (mn[0][0] == 1); ASSERT (M->p[0][0][0] == 1); @@ -121,7 +121,7 @@ hgcd_matrix_apply (const struct hgcd_matrix *M, ASSERT (M->p[1][1][0] == 1); /* Put A <-- A - q * B */ - nn = submul (ap, an, bp, bn, M->p[0][1], mn[0][1]); + nn = submul (ap, an, bp, bn, M->p[0][1], mn[0][1]); } else { @@ -159,7 +159,7 @@ hgcd_matrix_apply (const struct hgcd_matrix *M, MPN_ZERO (tp + n + mn[1][1], modn - n - mn[1][1]); if (n + mn[0][1] < modn) MPN_ZERO (sp + n + mn[0][1], modn - n - mn[0][1]); - + cy = mpn_sub_n (tp, tp, sp, modn); MPN_DECR_U (tp, modn, cy); @@ -209,7 +209,7 @@ mpn_hgcd_reduce_itch (mp_size_t n, mp_size_t p) itch = 2*(n-p) + mpn_hgcd_itch (n-p); /* Currently, hgcd_matrix_apply allocates its own storage. */ } - return itch; + return itch; } /* FIXME: Document storage need. 
*/ diff --git a/mpn/generic/hgcd_step.c b/mpn/generic/hgcd_step.c index 0e56be39e..dbc757935 100644 --- a/mpn/generic/hgcd_step.c +++ b/mpn/generic/hgcd_step.c @@ -112,7 +112,7 @@ mpn_hgcd_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s, /* Multiply M1^{-1} (a;b) */ return mpn_matrix22_mul1_inverse_vector (&M1, ap, tp, bp, n); } - + subtract: return mpn_gcd_subdiv_step (ap, bp, n, s, hgcd_hook, M, tp); diff --git a/mpn/powerpc64/mode64/aorsmul_1.asm b/mpn/powerpc64/mode64/aorsmul_1.asm index 658a2d941..4b843a044 100644 --- a/mpn/powerpc64/mode64/aorsmul_1.asm +++ b/mpn/powerpc64/mode64/aorsmul_1.asm @@ -54,7 +54,7 @@ ifdef(`OPERATION_submul_1',` ') MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) - + ASM_START() PROLOGUE(func_nc) EPILOGUE() diff --git a/mpn/s390_32/lshift.asm b/mpn/s390_32/lshift.asm index 335a5f77a..17e52655f 100644 --- a/mpn/s390_32/lshift.asm +++ b/mpn/s390_32/lshift.asm @@ -126,7 +126,7 @@ L(top): l %r10, 16(up) L(end): l %r10, 16(up) sll %r10, 0(cnt) st %r10, 12(rp) - + lr %r2, %r12 lm %r6, %r12, 24(%r15) br %r14 diff --git a/mpn/s390_32/lshiftc.asm b/mpn/s390_32/lshiftc.asm index b42bc715b..9bdd0d798 100644 --- a/mpn/s390_32/lshiftc.asm +++ b/mpn/s390_32/lshiftc.asm @@ -138,7 +138,7 @@ L(end): l %r10, 16(up) sll %r10, 0(cnt) xr %r10, %r13 st %r10, 12(rp) - + lr %r2, %r12 lm %r6, %r13, 24(%r15) br %r14 diff --git a/mpn/s390_32/rshift.asm b/mpn/s390_32/rshift.asm index ec32fa764..becbe1893 100644 --- a/mpn/s390_32/rshift.asm +++ b/mpn/s390_32/rshift.asm @@ -120,7 +120,7 @@ L(top): l %r11, 0(up) L(end): l %r11, 0(up) srl %r11, 0(cnt) st %r11, 0(rp) - + lr %r2, %r12 lm %r6, %r12, 24(%r15) br %r14 diff --git a/mpn/x86/atom/lshift.asm b/mpn/x86/atom/lshift.asm index d8cb8b505..1005cce59 100644 --- a/mpn/x86/atom/lshift.asm +++ b/mpn/x86/atom/lshift.asm @@ -160,7 +160,7 @@ deflit(`FRAME',4) shr $2, %eax C (size + 3) / 4 and $3, %edx C (size - 1) % 4 jz L(goloop) C jmp if size == 1 (mod 4) - shr %edx + shr %edx jnc L(odd) C jum if size == 3 (mod 4) 
add %ecx, %ecx @@ -173,7 +173,7 @@ deflit(`FRAME',4) jnz L(goloop) C jump if size == 0 (mod 4) L(odd): lea -8(up), up lea -8(rp), rp - jmp L(sentry) C reached if size == 2 or 3 (mod 4) + jmp L(sentry) C reached if size == 2 or 3 (mod 4) L(sloop): adc %ecx, %ecx diff --git a/mpn/x86/atom/sse2/mul_1.asm b/mpn/x86/atom/sse2/mul_1.asm index dd9b95366..5cd86caec 100644 --- a/mpn/x86/atom/sse2/mul_1.asm +++ b/mpn/x86/atom/sse2/mul_1.asm @@ -62,7 +62,7 @@ EPILOGUE() PROLOGUE(mpn_mul_1) pxor %mm6, %mm6 L(ent): push %esi FRAME_pushl() - mov PARAM_SRC, up + mov PARAM_SRC, up mov PARAM_SIZE, %eax C size movd PARAM_MUL, %mm7 movd (up), %mm0 diff --git a/mpn/x86/bdiv_dbm1c.asm b/mpn/x86/bdiv_dbm1c.asm index 201ef173d..ac9faf270 100644 --- a/mpn/x86/bdiv_dbm1c.asm +++ b/mpn/x86/bdiv_dbm1c.asm @@ -24,10 +24,10 @@ C P5 C P6 model 0-8,10-12) C P6 model 9 (Banias) C P6 model 13 (Dothan) 5.1 -C P4 model 0 (Willamette) +C P4 model 0 (Willamette) C P4 model 1 (?) C P4 model 2 (Northwood) 13.67 -C P4 model 3 (Prescott) +C P4 model 3 (Prescott) C P4 model 4 (Nocona) C Intel Atom C AMD K6 diff --git a/mpn/x86/bdiv_q_1.asm b/mpn/x86/bdiv_q_1.asm index 2528d01f7..7f344ab57 100644 --- a/mpn/x86/bdiv_q_1.asm +++ b/mpn/x86/bdiv_q_1.asm @@ -30,7 +30,7 @@ C K6 14.0 C K7 12.0 C P4 42.0 -MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) +MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) defframe(PARAM_SHIFT, 24) defframe(PARAM_INVERSE,20) diff --git a/mpn/x86/k7/addlsh1_n.asm b/mpn/x86/k7/addlsh1_n.asm index e5163b676..05df4a740 100644 --- a/mpn/x86/k7/addlsh1_n.asm +++ b/mpn/x86/k7/addlsh1_n.asm @@ -44,14 +44,14 @@ C AMD K8 C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32 C processors. It uses 2*3-way unrolling, for good reasons. Unfortunately, C that means we need an initial magic multiply. -C +C C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern. 
We C cannot do rsblsh1_n since we feed carry from the shift blocks to the C add/subtract blocks, which is right for addition but reversed for C subtraction. We could perhaps do sublsh1_n, with some extra move insns, C without losing any time, since we're not issue limited but carry recurrency C latency. -C +C C Breaking carry recurrency might be a good idea. We would then need separate C registers for the shift carry and add/subtract carry, which in turn would C force is to 2*2-way unrolling. @@ -120,7 +120,7 @@ ifdef(`CPU_P6',` L(exact): incl VAR_COUNT jz L(end) - + ALIGN(16) L(top): ifdef(`CPU_P6',` diff --git a/mpn/x86/k7/invert_limb.asm b/mpn/x86/k7/invert_limb.asm index da6f28397..435fa96d0 100644 --- a/mpn/x86/k7/invert_limb.asm +++ b/mpn/x86/k7/invert_limb.asm @@ -60,7 +60,7 @@ ifdef(`DARWIN',` PROLOGUE(mpn_invert_limb) deflit(`FRAME', 0) mov PARAM_DIVISOR, %eax - C Avoid push/pop on k7. + C Avoid push/pop on k7. sub $8, %esp FRAME_subl_esp(8) mov %ebx, (%esp) mov %edi, 4(%esp) diff --git a/mpn/x86/k7/sublsh1_n.asm b/mpn/x86/k7/sublsh1_n.asm index 41993f99a..965348586 100644 --- a/mpn/x86/k7/sublsh1_n.asm +++ b/mpn/x86/k7/sublsh1_n.asm @@ -30,7 +30,7 @@ C cycles/limb C P5 C P6 model 0-8,10-12 C P6 model 9 (Banias) -C P6 model 13 (Dothan) +C P6 model 13 (Dothan) C P4 model 0 (Willamette) C P4 model 1 (?) C P4 model 2 (Northwood) @@ -38,12 +38,12 @@ C P4 model 3 (Prescott) C P4 model 4 (Nocona) C Intel Atom 6.75 C AMD K6 -C AMD K7 +C AMD K7 C AMD K8 C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32 C processors. It uses 2*4-way unrolling, for good reasons. -C +C C Breaking carry recurrency might be a good idea. We would then need separate C registers for the shift carry and add/subtract carry, which in turn would C force is to 2*2-way unrolling. 
@@ -114,7 +114,7 @@ ifdef(`CPU_P6',` adc %ebp, %ebp rcr %edx C restore 1st saved carry bit - + sbb %eax, (rp) sbb %ebx, 4(rp) sbb %ecx, 8(rp) diff --git a/mpn/x86/p6/bdiv_q_1.asm b/mpn/x86/p6/bdiv_q_1.asm index 3a8733a0d..0ffbc78e4 100644 --- a/mpn/x86/p6/bdiv_q_1.asm +++ b/mpn/x86/p6/bdiv_q_1.asm @@ -25,7 +25,7 @@ include(`../config.m4') C odd even divisor C P6: 10.0 12.0 cycles/limb -C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) +C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) C The odd case is basically the same as mpn_modexact_1_odd, just with an C extra store, and it runs at the same 10 cycles which is the dependent @@ -269,7 +269,7 @@ ifdef(`PIC',` imull %edx, %eax C inv*inv*d subl %eax, %ebp C inv = 2*inv - inv*inv*d - + jmp L(common) EPILOGUE() diff --git a/mpn/x86/pentium/bdiv_q_1.asm b/mpn/x86/pentium/bdiv_q_1.asm index 965173d1c..7e84fc817 100644 --- a/mpn/x86/pentium/bdiv_q_1.asm +++ b/mpn/x86/pentium/bdiv_q_1.asm @@ -27,7 +27,7 @@ C odd even C P54: 24.5 30.5 cycles/limb C P55: 23.0 28.0 -MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) +MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as C expected. On P54 in the even case the shrdl pairing nonsense (see diff --git a/mpn/x86_64/div_qr_2n_pi1.asm b/mpn/x86_64/div_qr_2n_pi1.asm index 9f23012da..c28d0a02c 100644 --- a/mpn/x86_64/div_qr_2n_pi1.asm +++ b/mpn/x86_64/div_qr_2n_pi1.asm @@ -44,7 +44,7 @@ C TODO C * Store qh in the same stack slot as di_param, instead of pushing C it. (we could put it in register %rbp, but then we would need to C save and restore that instead, which doesn't seem like a win). - + ASM_START() TEXT ALIGN(16) @@ -56,7 +56,7 @@ PROLOGUE(mpn_div_qr_2n_pi1) push %r13 push %r12 push %rbx - + mov -16(up, un, 8), u1 mov -8(up, un, 8), u2 @@ -135,5 +135,5 @@ L(fix): C Unlikely update. 
u2 >= d1 inc t1 sub d0, u1 sbb d1, u2 - jmp L(bck) + jmp L(bck) EPILOGUE() diff --git a/mpn/x86_64/div_qr_2u_pi1.asm b/mpn/x86_64/div_qr_2u_pi1.asm index cfc7712d5..bdb64c148 100644 --- a/mpn/x86_64/div_qr_2u_pi1.asm +++ b/mpn/x86_64/div_qr_2u_pi1.asm @@ -66,7 +66,7 @@ deflit(`FRAME', 56) movl shift_param, R32(%rcx) C FIXME: Different code for SHLD_SLOW - + xor R32(u2), R32(u2) mov 8(up, un, 8), u1 shld %cl, u1, u2 @@ -173,7 +173,7 @@ L(fix): C Unlikely update. u2 >= d1 inc t1 sub d0, u1 sbb d1, u2 - jmp L(bck) + jmp L(bck) C Duplicated, just jumping back to a different address. L(fix_qh): C Unlikely update. u2 >= d1 @@ -185,5 +185,5 @@ L(fix_qh): C Unlikely update. u2 >= d1 inc t1 sub d0, u1 sbb d1, u2 - jmp L(bck_qh) + jmp L(bck_qh) EPILOGUE() diff --git a/mpn/x86_64/mod_1_1.asm b/mpn/x86_64/mod_1_1.asm index 6b233e074..56f708a75 100644 --- a/mpn/x86_64/mod_1_1.asm +++ b/mpn/x86_64/mod_1_1.asm @@ -51,7 +51,7 @@ C Note: This implementation needs B1modb only when cnt > 0 C The iteration is almost as follows, C C r_2 B^3 + r_1 B^2 + r_0 B + u = r_1 B2modb + (r_0 + r_2 B2mod) B + u -C +C C where r2 is a single bit represented as a mask. But to make sure that the C result fits in two limbs and a bit, carry from the addition C @@ -206,7 +206,7 @@ ifdef(`SHLD_SLOW',` ') imul %rdx, %r8 shr R8(%rcx), %r8 - mov %r8, 16(%rbx) C store B1modb + mov %r8, 16(%rbx) C store B1modb L(z): pop %r12 pop %rbx diff --git a/mpz/jacobi.c b/mpz/jacobi.c index afd9a49b4..8bfb2e92b 100644 --- a/mpz/jacobi.c +++ b/mpz/jacobi.c @@ -110,7 +110,7 @@ mpz_jacobi (mpz_srcptr a, mpz_srcptr b) result_bit1 ^= JACOBI_N1B_BIT1(blow); asize = -asize; } - + JACOBI_STRIP_LOW_ZEROS (result_bit1, blow, asrcp, asize, alow); /* Ensure asize >= bsize. 
Take advantage of the generalized @@ -147,7 +147,7 @@ mpz_jacobi (mpz_srcptr a, mpz_srcptr b) result_bit1 ^= JACOBI_RECIP_UU_BIT1 (alow, blow); } - + if (bsize == 1) { result_bit1 ^= JACOBI_TWOS_U_BIT1(btwos, alow); @@ -165,7 +165,7 @@ mpz_jacobi (mpz_srcptr a, mpz_srcptr b) % B, but when A is much larger than B, we have to allocate space for the large quotient. We use the same area, pointed to by bp, for both the quotient A/B and the working copy of B. */ - + TMP_MARK; if (asize >= 2*bsize) @@ -189,7 +189,7 @@ mpz_jacobi (mpz_srcptr a, mpz_srcptr b) result_bit1 ^= JACOBI_TWOS_U_BIT1(btwos, alow); ASSERT_NOCARRY (mpn_rshift (bp, bsrcp, bsize, btwos)); - bsize -= (ap[bsize-1] | bp[bsize-1]) == 0; + bsize -= (ap[bsize-1] | bp[bsize-1]) == 0; } else MPN_COPY (bp, bsrcp, bsize); diff --git a/tests/cxx/t-ops2.cc b/tests/cxx/t-ops2.cc index 9a6e7e020..4967ed208 100644 --- a/tests/cxx/t-ops2.cc +++ b/tests/cxx/t-ops2.cc @@ -148,18 +148,18 @@ void checkqf (){ CHECK_SI(T,0,3,*); CHECK_ALL_COMPARISONS(T,5.,2); CHECK_ALL_SIGNS_COMPARISONS(T,11.,3); - CHECK_MPZ(T,5,-2,<); - CHECK_MPZ(T,5,-2,>); + CHECK_MPZ(T,5,-2,<); + CHECK_MPZ(T,5,-2,>); CHECK_MPZ(T,5,-2,<=); CHECK_MPZ(T,5,-2,>=); CHECK_MPZ(T,5,-2,==); CHECK_MPZ(T,5,-2,!=); - CHECK_MPZ(T,0,0,<); - CHECK_MPZ(T,0,0,>); - CHECK_MPZ(T,0,0,<=); - CHECK_MPZ(T,0,0,>=); - CHECK_MPZ(T,0,0,==); - CHECK_MPZ(T,0,0,!=); + CHECK_MPZ(T,0,0,<); + CHECK_MPZ(T,0,0,>); + CHECK_MPZ(T,0,0,<=); + CHECK_MPZ(T,0,0,>=); + CHECK_MPZ(T,0,0,==); + CHECK_MPZ(T,0,0,!=); ASSERT_ALWAYS(T(6)<<2==6.*4); ASSERT_ALWAYS(T(6)>>2==6./4); ASSERT_ALWAYS(T(-13)<<2==-13.*4); @@ -217,18 +217,18 @@ void checkf (){ CHECK_MPQ(mpf_class,-5.5,-2.25,-); CHECK_MPQ(mpf_class,-5.5,-2.25,*); CHECK_MPQ(mpf_class,-5.25,-0.5,/); - CHECK_MPQ(mpf_class,5,-2,<); - CHECK_MPQ(mpf_class,5,-2,>); - CHECK_MPQ(mpf_class,5,-2,<=); - CHECK_MPQ(mpf_class,5,-2,>=); - CHECK_MPQ(mpf_class,5,-2,==); - CHECK_MPQ(mpf_class,5,-2,!=); - CHECK_MPQ(mpf_class,0,0,<); - CHECK_MPQ(mpf_class,0,0,>); - 
CHECK_MPQ(mpf_class,0,0,<=); - CHECK_MPQ(mpf_class,0,0,>=); - CHECK_MPQ(mpf_class,0,0,==); - CHECK_MPQ(mpf_class,0,0,!=); + CHECK_MPQ(mpf_class,5,-2,<); + CHECK_MPQ(mpf_class,5,-2,>); + CHECK_MPQ(mpf_class,5,-2,<=); + CHECK_MPQ(mpf_class,5,-2,>=); + CHECK_MPQ(mpf_class,5,-2,==); + CHECK_MPQ(mpf_class,5,-2,!=); + CHECK_MPQ(mpf_class,0,0,<); + CHECK_MPQ(mpf_class,0,0,>); + CHECK_MPQ(mpf_class,0,0,<=); + CHECK_MPQ(mpf_class,0,0,>=); + CHECK_MPQ(mpf_class,0,0,==); + CHECK_MPQ(mpf_class,0,0,!=); } int diff --git a/tests/devel/try.c b/tests/devel/try.c index bf09dd829..7ccb9de0b 100644 --- a/tests/devel/try.c +++ b/tests/devel/try.c @@ -459,7 +459,7 @@ validate_bdiv_q_1 refmpn_mul_1 (tp, dst, size, divisor); /* Set ignored low bits */ - tp[0] |= (src[0] & LOW_ZEROS_MASK (divisor)); + tp[0] |= (src[0] & LOW_ZEROS_MASK (divisor)); if (! refmpn_equal_anynail (tp, src, size)) { printf ("Bdiv wrong: res * divisor != src (mod B^size)\n"); diff --git a/tests/mpn/t-hgcd_appr.c b/tests/mpn/t-hgcd_appr.c index 912a1fde0..486b13061 100644 --- a/tests/mpn/t-hgcd_appr.c +++ b/tests/mpn/t-hgcd_appr.c @@ -261,7 +261,7 @@ one_test (mpz_t a, mpz_t b, int i) "after tp: %Mx\n" "expected: %Mx\n", hgcd_tp[hgcd_scratch], marker[3]); - + abort (); } @@ -424,7 +424,7 @@ hgcd_appr_valid_p (mpz_t a, mpz_t b, mp_size_t res0, mp_bitcnt_t dbits, abits, margin; mpz_t appr_r0, appr_r1, t, q; struct hgcd_ref appr; - + if (!res0) { if (!res1) @@ -433,7 +433,7 @@ hgcd_appr_valid_p (mpz_t a, mpz_t b, mp_size_t res0, fprintf (stderr, "mpn_hgcd_appr returned 1 when no reduction possible.\n"); return 0; } - + /* NOTE: No *_clear calls on error return, since we're going to abort anyway. 
*/ mpz_init (t); @@ -441,7 +441,7 @@ hgcd_appr_valid_p (mpz_t a, mpz_t b, mp_size_t res0, hgcd_ref_init (&appr); mpz_init (appr_r0); mpz_init (appr_r1); - + if (mpz_size (ref_r0) <= s) { fprintf (stderr, "ref_r0 too small!!!: "); debug_mp (ref_r0, 16); @@ -460,7 +460,7 @@ hgcd_appr_valid_p (mpz_t a, mpz_t b, mp_size_t res0, fprintf (stderr, "ref |r0 - r1| too large!!!: "); debug_mp (t, 16); return 0; } - + if (!res1) { mpz_set (appr_r0, a); @@ -473,7 +473,7 @@ hgcd_appr_valid_p (mpz_t a, mpz_t b, mp_size_t res0, for (i = 0; i<2; i++) { unsigned j; - + for (j = 0; j<2; j++) { mp_size_t mn = hgcd->n; @@ -567,7 +567,7 @@ hgcd_appr_valid_p (mpz_t a, mpz_t b, mp_size_t res0, fprintf (stderr, "appr_r1: "); debug_mp (appr_r1, 16); fprintf (stderr, "ref_r1: "); debug_mp (ref_r1, 16); - + return 0; } mpz_clear (t); diff --git a/tests/mpn/t-mod_1.c b/tests/mpn/t-mod_1.c index f1966154d..2f86ba277 100644 --- a/tests/mpn/t-mod_1.c +++ b/tests/mpn/t-mod_1.c @@ -90,7 +90,7 @@ main (int argc, char **argv) rands = RANDS; mpz_init (a); mpz_init (b); - + for (i = 0; i < 300; i++) { mp_size_t asize; diff --git a/tests/mpn/t-mulmid.c b/tests/mpn/t-mulmid.c index ab224acea..a946aefe8 100644 --- a/tests/mpn/t-mulmid.c +++ b/tests/mpn/t-mulmid.c @@ -52,7 +52,7 @@ main (int argc, char **argv) bp = TMP_ALLOC_LIMBS (MAX_N); rp = TMP_ALLOC_LIMBS (MAX_N + 2); refp = TMP_ALLOC_LIMBS (MAX_N + 2); - + for (test = 0; test < COUNT; test++) { mp_size_t an, bn, rn; diff --git a/tests/mpz/t-jac.c b/tests/mpz/t-jac.c index 5d8cad177..34cd82e78 100644 --- a/tests/mpz/t-jac.c +++ b/tests/mpz/t-jac.c @@ -921,7 +921,7 @@ mpz_nextprime_step (mpz_ptr p, mpz_srcptr n, mpz_srcptr step_in) mpz_gcd (gcd, p, step); ASSERT_ALWAYS (mpz_cmp_ui (gcd, 1) == 0); mpz_clear (gcd); - + pn = SIZ(p); count_leading_zeros (cnt, PTR(p)[pn - 1]); nbits = pn * GMP_NUMB_BITS - (cnt - GMP_NAIL_BITS); @@ -1016,7 +1016,7 @@ check_large_quotients (void) mpz_set_ui (op1, 0); mpz_urandomb (bs, rands, 32); mpz_urandomb (bs, rands, 
mpz_get_ui (bs) % 10 + 1); - + gcd_size = 1 + mpz_get_ui (bs); if (gcd_size & 1) { diff --git a/tune/tune-gcd-p.c b/tune/tune-gcd-p.c index 3c3815bd2..6d8863178 100644 --- a/tune/tune-gcd-p.c +++ b/tune/tune-gcd-p.c @@ -39,7 +39,7 @@ search (double *minp, double (*f)(void *, int), void *ctx, int start, int end) double y[4]; int best_i; - + x[0] = start; x[3] = end; @@ -60,7 +60,7 @@ search (double *minp, double (*f)(void *, int), void *ctx, int start, int end) #if 0 printf("%d: %f, %d: %f, %d:, %f %d: %f\n", x[0], y[0], x[1], y[1], x[2], y[2], x[3], y[3]); -#endif +#endif for (best_i = 0, i = 1; i < 4; i++) if (y[i] < y[best_i]) best_i = i; diff --git a/tune/tuneup.c b/tune/tuneup.c index 444e5e429..4cc75eed1 100644 --- a/tune/tuneup.c +++ b/tune/tuneup.c @@ -203,7 +203,7 @@ mp_size_t divrem_1_norm_threshold = MP_SIZE_T_MAX; mp_size_t divrem_1_unnorm_threshold = MP_SIZE_T_MAX; mp_size_t mod_1_norm_threshold = MP_SIZE_T_MAX; mp_size_t mod_1_unnorm_threshold = MP_SIZE_T_MAX; -int mod_1_1p_method = 0; +int mod_1_1p_method = 0; mp_size_t mod_1n_to_mod_1_1_threshold = MP_SIZE_T_MAX; mp_size_t mod_1u_to_mod_1_1_threshold = MP_SIZE_T_MAX; mp_size_t mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX; -- cgit v1.2.1 From 4c05211240a87d8eaa562b7c842ceac0a521bc02 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Tue, 22 Nov 2011 00:11:49 +0100 Subject: Add power6-specific addmul_1. --- ChangeLog | 4 + mpn/powerpc64/mode64/p6/addmul_1.asm | 147 +++++++++++++++++++++++++++++++++++ 2 files changed, 151 insertions(+) create mode 100644 mpn/powerpc64/mode64/p6/addmul_1.asm diff --git a/ChangeLog b/ChangeLog index 4d031a239..61631cce8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2011-11-22 Torbjorn Granlund + + * mpn/powerpc64/mode64/p6/addmul_1.asm: New file. + 2011-11-21 Torbjorn Granlund * gmp-h.in (__GNU_MP_RELEASE): Renamed from typo name. 
diff --git a/mpn/powerpc64/mode64/p6/addmul_1.asm b/mpn/powerpc64/mode64/p6/addmul_1.asm new file mode 100644 index 000000000..bffa6f308 --- /dev/null +++ b/mpn/powerpc64/mode64/p6/addmul_1.asm @@ -0,0 +1,147 @@ +dnl PowerPC-64 mpn_addmul_1 optimised for power6. + +dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010, 2011 +dnl Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? +C POWER5 ? +C POWER6 12.25 +C POWER7 ? + +C TODO +C * Reduce register usage. +C * Schedule function entry code. +C * Unroll more. 8-way unrolling would bring us to 10 c/l, 16-way unrolling +C would bring us to 9 c/l. +C * Generalise to handle submul_1. + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`v0', `r6') + +ASM_START() +PROLOGUE(mpn_addmul_1) + std r31, -8(r1) + std r30, -16(r1) + std r29, -24(r1) + std r28, -32(r1) + std r27, -40(r1) + + rldicl. r0, n, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addi n, n, 3 C compute count... 
+ srdi n, n, 2 C ...for ctr + mtctr n C copy loop count into ctr + beq cr0, L(b0) + blt cr6, L(b1) + beq cr6, L(b2) + +L(b3): addi up, up, 16 + addi rp, rp, 16 + ld r8, -16(up) + ld r7, -8(up) + ld r27, 0(up) + mulld r5, r8, v0 + mulhdu r8, r8, v0 + mulld r9, r7, v0 + mulhdu r7, r7, v0 + mulld r11, r27, v0 + mulhdu r27, r27, v0 + ld r29, -16(rp) + ld r30, -8(rp) + ld r31, 0(rp) + addc r9, r9, r8 + adde r11, r11, r7 + addze r12, r27 + addc r5, r5, r29 + b L(l3) + +L(b2): addi up, up, 8 + addi rp, rp, 8 + ld r7, -8(up) + ld r27, 0(up) + mulld r9, r7, v0 + mulhdu r7, r7, v0 + mulld r11, r27, v0 + mulhdu r27, r27, v0 + ld r30, -8(rp) + ld r31, 0(rp) + addc r11, r11, r7 + addze r12, r27 + addc r9, r9, r30 + b L(l2) + +L(b1): ld r27, 0(up) + ld r31, 0(rp) + mulld r11, r27, v0 + mulhdu r12, r27, v0 + addc r11, r11, r31 + b L(l1) + +L(b0): addi up, up, -8 + addi rp, rp, -8 + addic r12, r0, 0 C clear r12 and cy (use that r0 = 0) + + ALIGN(32) +L(top): ld r10, 8(up) + ld r8, 16(up) + ld r7, 24(up) + ld r27, 32(up) + addi up, up, 32 + addi rp, rp, 32 + mulld r0, r10, v0 + mulhdu r10, r10, v0 + mulld r5, r8, v0 + mulhdu r8, r8, v0 + mulld r9, r7, v0 + mulhdu r7, r7, v0 + mulld r11, r27, v0 + mulhdu r27, r27, v0 + ld r28, -24(rp) + adde r0, r0, r12 + ld r29, -16(rp) + adde r5, r5, r10 + ld r30, -8(rp) + ld r31, 0(rp) + adde r9, r9, r8 + adde r11, r11, r7 + addze r12, r27 + addc r0, r0, r28 + std r0, -24(rp) + adde r5, r5, r29 +L(l3): std r5, -16(rp) + adde r9, r9, r30 +L(l2): std r9, -8(rp) + adde r11, r11, r31 +L(l1): std r11, 0(rp) + bdnz L(top) + + addze r3, r12 + ld r31, -8(r1) + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + ld r27, -40(r1) + blr +EPILOGUE() -- cgit v1.2.1 From 8ba30f40072e06e46ce109592fb3df2c9087e5d1 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Tue, 22 Nov 2011 14:30:39 +0100 Subject: Retune. 
--- mpn/powerpc64/mode64/p6/gmp-mparam.h | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/mpn/powerpc64/mode64/p6/gmp-mparam.h b/mpn/powerpc64/mode64/p6/gmp-mparam.h index 5ec334089..bf7f0fd0c 100644 --- a/mpn/powerpc64/mode64/p6/gmp-mparam.h +++ b/mpn/powerpc64/mode64/p6/gmp-mparam.h @@ -1,7 +1,7 @@ /* POWER6 gmp-mparam.h -- Compiler/machine parameter header file. -Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010 Free -Software Foundation, Inc. +Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010, 2011 +Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -53,7 +53,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define SQR_TOOM6_THRESHOLD 274 #define SQR_TOOM8_THRESHOLD 410 -#define MULMID_TOOM42_THRESHOLD 24 +#define MULMID_TOOM42_THRESHOLD 36 #define MULMOD_BNM1_THRESHOLD 14 #define SQRMOD_BNM1_THRESHOLD 14 @@ -111,36 +111,36 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define SQR_FFT_THRESHOLD 2368 #define MULLO_BASECASE_THRESHOLD 5 -#define MULLO_DC_THRESHOLD 28 -#define MULLO_MUL_N_THRESHOLD 3084 +#define MULLO_DC_THRESHOLD 61 +#define MULLO_MUL_N_THRESHOLD 5558 -#define DC_DIV_QR_THRESHOLD 23 +#define DC_DIV_QR_THRESHOLD 29 #define DC_DIVAPPR_Q_THRESHOLD 112 -#define DC_BDIV_QR_THRESHOLD 29 -#define DC_BDIV_Q_THRESHOLD 79 +#define DC_BDIV_QR_THRESHOLD 70 +#define DC_BDIV_Q_THRESHOLD 168 -#define INV_MULMOD_BNM1_THRESHOLD 51 +#define INV_MULMOD_BNM1_THRESHOLD 61 #define INV_NEWTON_THRESHOLD 93 #define INV_APPR_THRESHOLD 91 -#define BINV_NEWTON_THRESHOLD 132 -#define REDC_1_TO_REDC_N_THRESHOLD 39 +#define BINV_NEWTON_THRESHOLD 222 +#define REDC_1_TO_REDC_N_THRESHOLD 63 -#define MU_DIV_QR_THRESHOLD 855 +#define MU_DIV_QR_THRESHOLD 807 #define MU_DIVAPPR_Q_THRESHOLD 807 -#define MUPI_DIV_QR_THRESHOLD 23 -#define MU_BDIV_QR_THRESHOLD 807 -#define MU_BDIV_Q_THRESHOLD 872 +#define MUPI_DIV_QR_THRESHOLD 27 +#define MU_BDIV_QR_THRESHOLD 872 +#define MU_BDIV_Q_THRESHOLD 1078 #define MATRIX22_STRASSEN_THRESHOLD 13 -#define HGCD_THRESHOLD 69 -#define HGCD_APPR_THRESHOLD 50 +#define HGCD_THRESHOLD 94 +#define HGCD_APPR_THRESHOLD 55 #define HGCD_REDUCE_THRESHOLD 2121 -#define GCD_DC_THRESHOLD 268 -#define GCDEXT_DC_THRESHOLD 209 +#define GCD_DC_THRESHOLD 253 +#define GCDEXT_DC_THRESHOLD 217 #define JACOBI_BASE_METHOD 4 -#define GET_STR_DC_THRESHOLD 17 -#define GET_STR_PRECOMPUTE_THRESHOLD 27 +#define GET_STR_DC_THRESHOLD 16 +#define GET_STR_PRECOMPUTE_THRESHOLD 29 #define SET_STR_DC_THRESHOLD 532 -#define SET_STR_PRECOMPUTE_THRESHOLD 1648 +#define SET_STR_PRECOMPUTE_THRESHOLD 1561 -- cgit v1.2.1 From 0e7ee006721d05c6a652b5ebb3feda42ca44c68b Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Tue, 22 Nov 2011 16:57:06 +0100 Subject: Add more cycle numbers. 
--- mpn/x86/tabselect.asm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mpn/x86/tabselect.asm b/mpn/x86/tabselect.asm index ab646dac3..7c8c2601f 100644 --- a/mpn/x86/tabselect.asm +++ b/mpn/x86/tabselect.asm @@ -27,12 +27,12 @@ C P6 model 9 (Banias) ? C P6 model 13 (Dothan) ? C P4 model 0 (Willamette) ? C P4 model 1 (?) ? -C P4 model 2 (Northwood) ? +C P4 model 2 (Northwood) 4.5 C P4 model 3 (Prescott) ? C P4 model 4 (Nocona) ? C Intel Atom ? C AMD K6 ? -C AMD K7 ? +C AMD K7 3.4 C AMD K8 ? C AMD K10 ? -- cgit v1.2.1 From c1be217f4a744da94162daa00293255d16f61cac Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Tue, 22 Nov 2011 16:58:46 +0100 Subject: Align loop for slightly better power5 performance. --- mpn/powerpc64/mode64/aors_n.asm | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mpn/powerpc64/mode64/aors_n.asm b/mpn/powerpc64/mode64/aors_n.asm index c6ea35089..8c30871c2 100644 --- a/mpn/powerpc64/mode64/aors_n.asm +++ b/mpn/powerpc64/mode64/aors_n.asm @@ -1,6 +1,6 @@ dnl PowerPC-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction. -dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007 Free Software +dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007, 2011 Free Software dnl Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -23,7 +23,7 @@ include(`../config.m4') C cycles/limb C POWER3/PPC630 1.5 C POWER4/PPC970 2 -C POWER5 2.25 +C POWER5 2 C POWER6 2.63 C POWER7 2.25-2.87 @@ -137,6 +137,7 @@ L(go): ld r6, 0(r4) C load s1 limb addi r4, r4, 32 addi r5, r5, 32 + ALIGN(16) L(top): ADDSUBC r28, r7, r6 ld r6, 0(r4) C load s1 limb ld r7, 0(r5) C load s2 limb -- cgit v1.2.1 From 10688cef0b5361ffebd094967fdcd7ebb3b63d83 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Tue, 22 Nov 2011 17:01:22 +0100 Subject: Add more cycle numbers. 
--- mpn/powerpc32/aors_n.asm | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/mpn/powerpc32/aors_n.asm b/mpn/powerpc32/aors_n.asm index f9e9b50d5..12115a9e9 100644 --- a/mpn/powerpc32/aors_n.asm +++ b/mpn/powerpc32/aors_n.asm @@ -19,14 +19,17 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C 603e: ? -C 604e: ? old: 3.25 -C 75x (G3): ? old: 3.5 -C 7400,7410 (G4): 3.25 -C 744x,745x (G4+): 4 -C power4/ppc970: ? old: 2.0 -C power5: ? old: 2.5 +C cycles/limb +C 603e: ? +C 604e: ? old: 3.25 +C 75x (G3): ? old: 3.5 +C 7400,7410 (G4): 3.25 +C 744x,745x (G4+): 4 +C POWER3/PPC630 2 +C POWER4/PPC970 2.4 +C POWER5 2.75 +C POWER6 40-140 +C POWER7 3 C INPUT PARAMETERS define(`rp', `r3') -- cgit v1.2.1 From d8b2d9eabb0faff2d12c3f2b3ab5c9e36fb21701 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Tue, 22 Nov 2011 17:03:17 +0100 Subject: Add more cycle numbers. --- mpn/x86_64/tabselect.asm | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mpn/x86_64/tabselect.asm b/mpn/x86_64/tabselect.asm index ca475942b..2611b3212 100644 --- a/mpn/x86_64/tabselect.asm +++ b/mpn/x86_64/tabselect.asm @@ -23,11 +23,12 @@ include(`../config.m4') C cycles/limb C AMD K8,K9 2.5 C AMD K10 2.5 +C AMD bobcat 3.5 C Intel P4 4 -C Intel core2 2.3 +C Intel core2 2.33 C Intel NHM 2.5 C Intel SBR 2.2 -C Intel atom ? +C Intel atom 5 C VIA nano 3.5 C NOTES -- cgit v1.2.1 From e7f9942cc24335135f4bd92e53787fd619efa69d Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Tue, 22 Nov 2011 17:14:32 +0100 Subject: Don't fail fat builds under 64-bit DOS. 
--- configure.in | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/configure.in b/configure.in index 186d4b576..1d1ebd10f 100644 --- a/configure.in +++ b/configure.in @@ -1930,9 +1930,17 @@ case $host in if test "$abi" = 64; then gcc_64_cflags="" - extra_functions_64="$extra_functions_64 fat fat_entry" - path_64="x86_64/fat x86_64" - fat_path="x86_64 x86_64/fat x86_64/pentium4 x86_64/core2 x86_64/coreinhm x86_64/coreisbr x86_64/atom x86_64/nano" + case $host in + *-*-mingw* | *-*-cygwin) + path_64="" # Windows amd64 calling conventions are *different* + fat_path="" + ;; + *) + extra_functions_64="$extra_functions_64 fat fat_entry" + path_64="x86_64/fat x86_64" + fat_path="x86_64 x86_64/fat x86_64/pentium4 x86_64/core2 x86_64/coreinhm x86_64/coreisbr x86_64/atom x86_64/nano" + ;; + esac fi fat_functions="add_n addmul_1 copyd copyi -- cgit v1.2.1 From 042073d276059b723232c6db58c005645131d167 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Tue, 22 Nov 2011 17:14:35 +0100 Subject: *** empty log message *** --- ChangeLog | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ChangeLog b/ChangeLog index 61631cce8..1d6a44512 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,10 @@ 2011-11-22 Torbjorn Granlund + * configure.in: Don't fail fat builds under 64-bit DOS. + + * mpn/powerpc64/mode64/aors_n.asm: Align loop for slightly better + power5 performance. + * mpn/powerpc64/mode64/p6/addmul_1.asm: New file. 2011-11-21 Torbjorn Granlund -- cgit v1.2.1 From 17a8a01f86586cbe7436565a7d22764f8f5988ea Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Tue, 22 Nov 2011 22:05:25 +0100 Subject: Generalise new power6 addmul_1 to support also submul_1. 
--- ChangeLog | 4 +- mpn/powerpc64/mode64/p6/addmul_1.asm | 147 ----------------------------- mpn/powerpc64/mode64/p6/aorsmul_1.asm | 172 ++++++++++++++++++++++++++++++++++ 3 files changed, 174 insertions(+), 149 deletions(-) delete mode 100644 mpn/powerpc64/mode64/p6/addmul_1.asm create mode 100644 mpn/powerpc64/mode64/p6/aorsmul_1.asm diff --git a/ChangeLog b/ChangeLog index 1d6a44512..80e0f7a32 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,12 +1,12 @@ 2011-11-22 Torbjorn Granlund + * mpn/powerpc64/mode64/p6/aorsmul_1.asm: New file. + * configure.in: Don't fail fat builds under 64-bit DOS. * mpn/powerpc64/mode64/aors_n.asm: Align loop for slightly better power5 performance. - * mpn/powerpc64/mode64/p6/addmul_1.asm: New file. - 2011-11-21 Torbjorn Granlund * gmp-h.in (__GNU_MP_RELEASE): Renamed from typo name. diff --git a/mpn/powerpc64/mode64/p6/addmul_1.asm b/mpn/powerpc64/mode64/p6/addmul_1.asm deleted file mode 100644 index bffa6f308..000000000 --- a/mpn/powerpc64/mode64/p6/addmul_1.asm +++ /dev/null @@ -1,147 +0,0 @@ -dnl PowerPC-64 mpn_addmul_1 optimised for power6. - -dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010, 2011 -dnl Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. - -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published -dnl by the Free Software Foundation; either version 3 of the License, or (at -dnl your option) any later version. - -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. - -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
- -include(`../config.m4') - -C cycles/limb -C POWER3/PPC630 ? -C POWER4/PPC970 ? -C POWER5 ? -C POWER6 12.25 -C POWER7 ? - -C TODO -C * Reduce register usage. -C * Schedule function entry code. -C * Unroll more. 8-way unrolling would bring us to 10 c/l, 16-way unrolling -C would bring us to 9 c/l. -C * Generalise to handle submul_1. - -C INPUT PARAMETERS -define(`rp', `r3') -define(`up', `r4') -define(`n', `r5') -define(`v0', `r6') - -ASM_START() -PROLOGUE(mpn_addmul_1) - std r31, -8(r1) - std r30, -16(r1) - std r29, -24(r1) - std r28, -32(r1) - std r27, -40(r1) - - rldicl. r0, n, 0,62 C r0 = n & 3, set cr0 - cmpdi cr6, r0, 2 - addi n, n, 3 C compute count... - srdi n, n, 2 C ...for ctr - mtctr n C copy loop count into ctr - beq cr0, L(b0) - blt cr6, L(b1) - beq cr6, L(b2) - -L(b3): addi up, up, 16 - addi rp, rp, 16 - ld r8, -16(up) - ld r7, -8(up) - ld r27, 0(up) - mulld r5, r8, v0 - mulhdu r8, r8, v0 - mulld r9, r7, v0 - mulhdu r7, r7, v0 - mulld r11, r27, v0 - mulhdu r27, r27, v0 - ld r29, -16(rp) - ld r30, -8(rp) - ld r31, 0(rp) - addc r9, r9, r8 - adde r11, r11, r7 - addze r12, r27 - addc r5, r5, r29 - b L(l3) - -L(b2): addi up, up, 8 - addi rp, rp, 8 - ld r7, -8(up) - ld r27, 0(up) - mulld r9, r7, v0 - mulhdu r7, r7, v0 - mulld r11, r27, v0 - mulhdu r27, r27, v0 - ld r30, -8(rp) - ld r31, 0(rp) - addc r11, r11, r7 - addze r12, r27 - addc r9, r9, r30 - b L(l2) - -L(b1): ld r27, 0(up) - ld r31, 0(rp) - mulld r11, r27, v0 - mulhdu r12, r27, v0 - addc r11, r11, r31 - b L(l1) - -L(b0): addi up, up, -8 - addi rp, rp, -8 - addic r12, r0, 0 C clear r12 and cy (use that r0 = 0) - - ALIGN(32) -L(top): ld r10, 8(up) - ld r8, 16(up) - ld r7, 24(up) - ld r27, 32(up) - addi up, up, 32 - addi rp, rp, 32 - mulld r0, r10, v0 - mulhdu r10, r10, v0 - mulld r5, r8, v0 - mulhdu r8, r8, v0 - mulld r9, r7, v0 - mulhdu r7, r7, v0 - mulld r11, r27, v0 - mulhdu r27, r27, v0 - ld r28, -24(rp) - adde r0, r0, r12 - ld r29, -16(rp) - adde r5, r5, r10 - ld r30, -8(rp) - ld r31, 0(rp) - 
adde r9, r9, r8 - adde r11, r11, r7 - addze r12, r27 - addc r0, r0, r28 - std r0, -24(rp) - adde r5, r5, r29 -L(l3): std r5, -16(rp) - adde r9, r9, r30 -L(l2): std r9, -8(rp) - adde r11, r11, r31 -L(l1): std r11, 0(rp) - bdnz L(top) - - addze r3, r12 - ld r31, -8(r1) - ld r30, -16(r1) - ld r29, -24(r1) - ld r28, -32(r1) - ld r27, -40(r1) - blr -EPILOGUE() diff --git a/mpn/powerpc64/mode64/p6/aorsmul_1.asm b/mpn/powerpc64/mode64/p6/aorsmul_1.asm new file mode 100644 index 000000000..4bd508488 --- /dev/null +++ b/mpn/powerpc64/mode64/p6/aorsmul_1.asm @@ -0,0 +1,172 @@ +dnl PowerPC-64 mpn_addmul_1 and mpn_submul_1 optimised for power6. + +dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010, 2011 +dnl Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C mpn_addmul_1 mpn_submul_1 +C cycles/limb cycles/limb +C POWER3/PPC630 ? ? +C POWER4/PPC970 ? ? +C POWER5 ? ? +C POWER6 12.25 12.8 +C POWER7 ? ? + +C TODO +C * Reduce register usage. +C * Schedule function entry code. +C * Unroll more. 8-way unrolling would bring us to 10 c/l, 16-way unrolling +C would bring us to 9 c/l. +C * Handle n = 1 and perhaps n = 2 separately, without saving any registers.
+ +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`v0', `r6') + +ifdef(`OPERATION_addmul_1',` + define(ADDSUBC, adde) + define(ADDSUB, addc) + define(func, mpn_addmul_1) + define(func_nc, mpn_addmul_1c) C FIXME: not really supported + define(AM, `$1') + define(SM, `') + define(CLRRSC, `addic $1, r0, 0') +') +ifdef(`OPERATION_submul_1',` + define(ADDSUBC, subfe) + define(ADDSUB, subfc) + define(func, mpn_submul_1) + define(func_nc, mpn_submul_1c) C FIXME: not really supported + define(AM, `') + define(SM, `$1') + define(CLRRSC, `subfc $1, r0, r0') +') + +ASM_START() +PROLOGUE(func) + std r31, -8(r1) + std r30, -16(r1) + std r29, -24(r1) + std r28, -32(r1) + std r27, -40(r1) + + rldicl. r0, n, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addi n, n, 3 C compute count... + srdi n, n, 2 C ...for ctr + mtctr n C copy loop count into ctr + beq cr0, L(b0) + blt cr6, L(b1) + beq cr6, L(b2) + +L(b3): ld r8, 0(up) + ld r7, 8(up) + ld r27, 16(up) + addi up, up, 16 + addi rp, rp, 16 + mulld r5, r8, v0 + mulhdu r8, r8, v0 + mulld r9, r7, v0 + mulhdu r7, r7, v0 + mulld r11, r27, v0 + mulhdu r27, r27, v0 + ld r29, -16(rp) + ld r30, -8(rp) + ld r31, 0(rp) + addc r9, r9, r8 + adde r11, r11, r7 + addze r12, r27 + ADDSUB r5, r5, r29 + b L(l3) + +L(b2): ld r7, 0(up) + ld r27, 8(up) + addi up, up, 8 + addi rp, rp, 8 + mulld r9, r7, v0 + mulhdu r7, r7, v0 + mulld r11, r27, v0 + mulhdu r27, r27, v0 + ld r30, -8(rp) + ld r31, 0(rp) + addc r11, r11, r7 + addze r12, r27 + ADDSUB r9, r9, r30 + b L(l2) + +L(b1): ld r27, 0(up) + ld r31, 0(rp) + mulld r11, r27, v0 + mulhdu r12, r27, v0 + ADDSUB r11, r11, r31 + b L(l1) + +L(b0): addi up, up, -8 + addi rp, rp, -8 + CLRRSC( r12) C clear r12 and clr/set cy + + ALIGN(32) +L(top): +SM(` subfe r11, r0, r0') C complement... 
+SM(` addic r11, r11, 1') C ...carry flag + ld r10, 8(up) + ld r8, 16(up) + ld r7, 24(up) + ld r27, 32(up) + addi up, up, 32 + addi rp, rp, 32 + mulld r0, r10, v0 + mulhdu r10, r10, v0 + mulld r5, r8, v0 + mulhdu r8, r8, v0 + mulld r9, r7, v0 + mulhdu r7, r7, v0 + mulld r11, r27, v0 + mulhdu r27, r27, v0 + ld r28, -24(rp) + adde r0, r0, r12 + ld r29, -16(rp) + adde r5, r5, r10 + ld r30, -8(rp) + ld r31, 0(rp) + adde r9, r9, r8 + adde r11, r11, r7 + addze r12, r27 + ADDSUB r0, r0, r28 + std r0, -24(rp) + ADDSUBC r5, r5, r29 +L(l3): std r5, -16(rp) + ADDSUBC r9, r9, r30 +L(l2): std r9, -8(rp) + ADDSUBC r11, r11, r31 +L(l1): std r11, 0(rp) + bdnz L(top) + +AM(` addze r3, r12') +SM(` subfe r11, r0, r0') C complement... + ld r31, -8(r1) +SM(` subf r3, r11, r12') + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + ld r27, -40(r1) + blr +EPILOGUE() -- cgit v1.2.1 From 682827871b8ddf4674d2233c852b516cbcd9c2a1 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Thu, 24 Nov 2011 12:13:26 +0100 Subject: (tune_mu_div, tune_mu_bdiv): Up min_size to karatsuba's threshold. 
--- tune/tuneup.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tune/tuneup.c b/tune/tuneup.c index 4cc75eed1..bc7e8cc3d 100644 --- a/tune/tuneup.c +++ b/tune/tuneup.c @@ -1568,7 +1568,7 @@ tune_mu_div (void) param.name = "MU_DIV_QR_THRESHOLD"; param.function = speed_mpn_dcpi1_div_qr; param.function2 = speed_mpn_mu_div_qr; - param.min_size = 6; + param.min_size = mul_toom22_threshold; param.max_size = 5000; param.step_factor = 0.02; one (&mu_div_qr_threshold, &param); @@ -1578,7 +1578,7 @@ tune_mu_div (void) param.name = "MU_DIVAPPR_Q_THRESHOLD"; param.function = speed_mpn_dcpi1_divappr_q; param.function2 = speed_mpn_mu_divappr_q; - param.min_size = 6; + param.min_size = mul_toom22_threshold; param.max_size = 5000; param.step_factor = 0.02; one (&mu_divappr_q_threshold, &param); @@ -1627,7 +1627,7 @@ tune_mu_bdiv (void) param.name = "MU_BDIV_QR_THRESHOLD"; param.function = speed_mpn_dcpi1_bdiv_qr; param.function2 = speed_mpn_mu_bdiv_qr; - param.min_size = 4; + param.min_size = mul_toom22_threshold; param.max_size = 5000; param.step_factor = 0.02; one (&mu_bdiv_qr_threshold, &param); @@ -1637,7 +1637,7 @@ tune_mu_bdiv (void) param.name = "MU_BDIV_Q_THRESHOLD"; param.function = speed_mpn_dcpi1_bdiv_q; param.function2 = speed_mpn_mu_bdiv_q; - param.min_size = 4; + param.min_size = mul_toom22_threshold; param.max_size = 5000; param.step_factor = 0.02; one (&mu_bdiv_q_threshold, &param); -- cgit v1.2.1 From faeba6f2f2dfe18c15702387f1c2267f341a7783 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Thu, 24 Nov 2011 12:17:47 +0100 Subject: Add power7/32 tuning file.
--- mpn/powerpc32/p7/gmp-mparam.h | 149 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 mpn/powerpc32/p7/gmp-mparam.h diff --git a/mpn/powerpc32/p7/gmp-mparam.h b/mpn/powerpc32/p7/gmp-mparam.h new file mode 100644 index 000000000..bd18d4042 --- /dev/null +++ b/mpn/powerpc32/p7/gmp-mparam.h @@ -0,0 +1,149 @@ +/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2004, 2008, 2009, +2010, 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define BYTES_PER_MP_LIMB 4 + +/* 3550 MHz POWER7/T4 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 1 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 34 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 15 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 34 + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 89 +#define MUL_TOOM44_THRESHOLD 130 +#define MUL_TOOM6H_THRESHOLD 286 +#define MUL_TOOM8H_THRESHOLD 363 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 121 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113 + +#define SQR_BASECASE_THRESHOLD 4 +#define SQR_TOOM2_THRESHOLD 50 +#define SQR_TOOM3_THRESHOLD 89 +#define SQR_TOOM4_THRESHOLD 154 +#define SQR_TOOM6_THRESHOLD 222 +#define SQR_TOOM8_THRESHOLD 381 + +#define MULMID_TOOM42_THRESHOLD 40 + +#define MULMOD_BNM1_THRESHOLD 18 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define POWM_SEC_TABLE 4,35,225,780,2212 + +#define MUL_FFT_MODF_THRESHOLD 476 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 476, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 14, 5}, { 29, 6}, { 21, 7}, { 11, 6}, \ + { 25, 7}, { 13, 6}, { 29, 7}, { 15, 6}, \ + { 31, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \ + { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, 
{ 135,10}, \ + { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \ + { 159,11}, { 95,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \ + { 1087,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 335, 9}, { 671, 8}, { 1343,10}, { 351,11}, \ + { 191,10}, { 415, 9}, { 831,10}, { 431,11}, \ + { 223,12}, { 4096,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 77 +#define MUL_FFT_THRESHOLD 5312 + +#define SQR_FFT_MODF_THRESHOLD 344 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 344, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \ + { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \ + { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \ + { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \ + { 47,10}, { 31, 9}, { 79,10}, { 47,11}, \ + { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \ + { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \ + { 143, 9}, { 287, 8}, { 575, 9}, { 303,10}, \ + { 159,11}, { 95,10}, { 191,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543, 8}, { 1087,10}, { 287, 9}, { 575,10}, \ + { 303,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 335, 9}, { 671,10}, { 351, 9}, { 703,11}, \ + { 191,10}, { 383, 9}, { 767,10}, { 415, 9}, \ + { 831,11}, { 223,10}, { 447,12}, { 4096,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 79 +#define SQR_FFT_THRESHOLD 3712 + +#define MULLO_BASECASE_THRESHOLD 2 +#define MULLO_DC_THRESHOLD 34 +#define MULLO_MUL_N_THRESHOLD 10323 + +#define DC_DIV_QR_THRESHOLD 52 +#define DC_DIVAPPR_Q_THRESHOLD 202 +#define DC_BDIV_QR_THRESHOLD 68 +#define DC_BDIV_Q_THRESHOLD 152 + +#define INV_MULMOD_BNM1_THRESHOLD 66 +#define INV_NEWTON_THRESHOLD 226 +#define INV_APPR_THRESHOLD 189 + +#define BINV_NEWTON_THRESHOLD 292 +#define REDC_1_TO_REDC_N_THRESHOLD 79 + +#define MU_DIV_QR_THRESHOLD 1442 +#define MU_DIVAPPR_Q_THRESHOLD 1442 +#define 
MUPI_DIV_QR_THRESHOLD 91 +#define MU_BDIV_QR_THRESHOLD 1308 +#define MU_BDIV_Q_THRESHOLD 1442 + +#define MATRIX22_STRASSEN_THRESHOLD 16 +#define HGCD_THRESHOLD 126 +#define HGCD_APPR_THRESHOLD 139 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 573 +#define GCDEXT_DC_THRESHOLD 448 +#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 9 +#define GET_STR_PRECOMPUTE_THRESHOLD 20 +#define SET_STR_DC_THRESHOLD 834 +#define SET_STR_PRECOMPUTE_THRESHOLD 1888 -- cgit v1.2.1 From 161de004453f214c6030bb9e9babd2d0048a7337 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Thu, 24 Nov 2011 12:19:09 +0100 Subject: Retune. --- mpn/powerpc32/p5/gmp-mparam.h | 137 +++++++++++------------ mpn/powerpc32/p6/gmp-mparam.h | 206 ++++++++++++++++++----------------- mpn/powerpc64/mode64/p3/gmp-mparam.h | 73 +++++++------ mpn/powerpc64/mode64/p6/gmp-mparam.h | 42 +++---- 4 files changed, 240 insertions(+), 218 deletions(-) diff --git a/mpn/powerpc32/p5/gmp-mparam.h b/mpn/powerpc32/p5/gmp-mparam.h index a8400ce65..ba210ecc4 100644 --- a/mpn/powerpc32/p5/gmp-mparam.h +++ b/mpn/powerpc32/p5/gmp-mparam.h @@ -30,114 +30,117 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MOD_1_UNNORM_THRESHOLD 0 /* always */ #define MOD_1N_TO_MOD_1_1_THRESHOLD 8 #define MOD_1U_TO_MOD_1_1_THRESHOLD 6 -#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 -#define MOD_1_2_TO_MOD_1_4_THRESHOLD 46 -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 15 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 50 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 18 #define USE_PREINV_DIVREM_1 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always */ -#define BMOD_1_TO_MOD_1_THRESHOLD 62 +#define BMOD_1_TO_MOD_1_THRESHOLD 61 #define MUL_TOOM22_THRESHOLD 22 -#define MUL_TOOM33_THRESHOLD 78 +#define MUL_TOOM33_THRESHOLD 57 #define MUL_TOOM44_THRESHOLD 130 -#define MUL_TOOM6H_THRESHOLD 206 -#define MUL_TOOM8H_THRESHOLD 260 +#define MUL_TOOM6H_THRESHOLD 189 +#define MUL_TOOM8H_THRESHOLD 309 #define MUL_TOOM32_TO_TOOM43_THRESHOLD 89 #define MUL_TOOM32_TO_TOOM53_THRESHOLD 99 -#define MUL_TOOM42_TO_TOOM53_THRESHOLD 85 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 83 #define MUL_TOOM42_TO_TOOM63_THRESHOLD 88 -#define SQR_BASECASE_THRESHOLD 0 /* always */ -#define SQR_TOOM2_THRESHOLD 42 +#define SQR_BASECASE_THRESHOLD 6 +#define SQR_TOOM2_THRESHOLD 40 #define SQR_TOOM3_THRESHOLD 77 -#define SQR_TOOM4_THRESHOLD 169 -#define SQR_TOOM6_THRESHOLD 246 -#define SQR_TOOM8_THRESHOLD 381 +#define SQR_TOOM4_THRESHOLD 124 +#define SQR_TOOM6_THRESHOLD 140 +#define SQR_TOOM8_THRESHOLD 238 + +#define MULMID_TOOM42_THRESHOLD 40 #define MULMOD_BNM1_THRESHOLD 15 -#define SQRMOD_BNM1_THRESHOLD 18 +#define SQRMOD_BNM1_THRESHOLD 16 + +#define POWM_SEC_TABLE 4,29,252,840,2080 -#define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */ +#define MUL_FFT_MODF_THRESHOLD 412 /* k = 5 */ #define MUL_FFT_TABLE3 \ - { { 380, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ - { 13, 5}, { 27, 6}, { 21, 7}, { 11, 6}, \ - { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ - { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \ - { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \ - { 23, 7}, { 47, 8}, { 27, 9}, { 
15, 8}, \ - { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \ - { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ - { 47,10}, { 31, 9}, { 79,10}, { 47,11}, \ - { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \ - { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \ - { 127, 9}, { 255,10}, { 143, 9}, { 287, 8}, \ - { 575,10}, { 159,11}, { 95, 9}, { 383,12}, \ - { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ - { 271, 9}, { 543,10}, { 287, 9}, { 575,11}, \ - { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ - { 671,10}, { 351,11}, { 191,10}, { 383, 9}, \ - { 767,10}, { 415, 9}, { 831,11}, { 223,12}, \ - { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} } -#define MUL_FFT_TABLE3_SIZE 76 + { { 412, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 21, 7}, { 11, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \ + { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \ + { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \ + { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 79, 9}, { 55,10}, { 31, 9}, \ + { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \ + { 95,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 143, 9}, { 287,10}, { 159,11}, { 95,10}, \ + { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \ + { 159,10}, { 335, 9}, { 671,10}, { 351, 9}, \ + { 703,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 415, 9}, { 831,11}, { 223,12}, { 4096,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 71 #define MUL_FFT_THRESHOLD 4736 -#define SQR_FFT_MODF_THRESHOLD 316 /* k = 5 */ +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ #define SQR_FFT_TABLE3 \ - { { 316, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { { 340, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \ - { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \ - { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \ - { 19, 6}, { 77, 7}, { 39, 8}, { 23, 7}, \ - { 47, 8}, { 27, 9}, { 15, 8}, { 39, 9}, \ - { 23, 8}, { 47,10}, { 15, 7}, 
{ 121, 9}, \ - { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ - { 47,10}, { 31, 9}, { 79,10}, { 47,11}, \ - { 31,10}, { 63, 9}, { 127, 8}, { 255,10}, \ - { 79, 9}, { 159, 8}, { 319, 9}, { 175,10}, \ - { 95, 9}, { 191, 8}, { 383,11}, { 63,10}, \ + { 27, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \ + { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 47,10}, { 31, 9}, \ + { 71,10}, { 47,11}, { 31,10}, { 63, 9}, \ + { 127, 8}, { 255, 9}, { 135,10}, { 79, 9}, \ + { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \ { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \ { 143, 9}, { 287, 8}, { 575, 9}, { 303,10}, \ - { 159, 9}, { 319,10}, { 175,11}, { 95,10}, \ - { 191, 9}, { 383,10}, { 207,12}, { 63,11}, \ + { 159,11}, { 95,10}, { 191,12}, { 63,11}, \ { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ { 543,10}, { 287, 9}, { 575,10}, { 303,11}, \ { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ { 671,10}, { 351,11}, { 191,10}, { 383, 9}, \ { 767,10}, { 415,11}, { 223,10}, { 447,12}, \ { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} } -#define SQR_FFT_TABLE3_SIZE 88 +#define SQR_FFT_TABLE3_SIZE 76 #define SQR_FFT_THRESHOLD 3712 #define MULLO_BASECASE_THRESHOLD 2 #define MULLO_DC_THRESHOLD 68 #define MULLO_MUL_N_THRESHOLD 9236 -#define DC_DIV_QR_THRESHOLD 70 -#define DC_DIVAPPR_Q_THRESHOLD 238 +#define DC_DIV_QR_THRESHOLD 69 +#define DC_DIVAPPR_Q_THRESHOLD 220 #define DC_BDIV_QR_THRESHOLD 75 #define DC_BDIV_Q_THRESHOLD 188 #define INV_MULMOD_BNM1_THRESHOLD 54 -#define INV_NEWTON_THRESHOLD 250 -#define INV_APPR_THRESHOLD 246 +#define INV_NEWTON_THRESHOLD 230 +#define INV_APPR_THRESHOLD 230 -#define BINV_NEWTON_THRESHOLD 375 +#define BINV_NEWTON_THRESHOLD 278 #define REDC_1_TO_REDC_N_THRESHOLD 87 -#define MU_DIV_QR_THRESHOLD 1334 -#define MU_DIVAPPR_Q_THRESHOLD 1387 -#define MUPI_DIV_QR_THRESHOLD 114 -#define MU_BDIV_QR_THRESHOLD 1078 -#define MU_BDIV_Q_THRESHOLD 1334 +#define MU_DIV_QR_THRESHOLD 1210 +#define 
MU_DIVAPPR_Q_THRESHOLD 1308 +#define MUPI_DIV_QR_THRESHOLD 106 +#define MU_BDIV_QR_THRESHOLD 1017 +#define MU_BDIV_Q_THRESHOLD 1210 #define MATRIX22_STRASSEN_THRESHOLD 14 -#define HGCD_THRESHOLD 104 -#define GCD_DC_THRESHOLD 424 -#define GCDEXT_DC_THRESHOLD 321 +#define HGCD_THRESHOLD 110 +#define HGCD_APPR_THRESHOLD 138 +#define HGCD_REDUCE_THRESHOLD 2578 +#define GCD_DC_THRESHOLD 408 +#define GCDEXT_DC_THRESHOLD 298 #define JACOBI_BASE_METHOD 4 -#define GET_STR_DC_THRESHOLD 12 -#define GET_STR_PRECOMPUTE_THRESHOLD 23 -#define SET_STR_DC_THRESHOLD 454 -#define SET_STR_PRECOMPUTE_THRESHOLD 1074 +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 24 +#define SET_STR_DC_THRESHOLD 527 +#define SET_STR_PRECOMPUTE_THRESHOLD 1090 diff --git a/mpn/powerpc32/p6/gmp-mparam.h b/mpn/powerpc32/p6/gmp-mparam.h index 73951d0ae..529a66d19 100644 --- a/mpn/powerpc32/p6/gmp-mparam.h +++ b/mpn/powerpc32/p6/gmp-mparam.h @@ -29,115 +29,127 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MOD_1_NORM_THRESHOLD 3 #define MOD_1_UNNORM_THRESHOLD 0 /* always */ #define MOD_1N_TO_MOD_1_1_THRESHOLD 3 -#define MOD_1U_TO_MOD_1_1_THRESHOLD 8 -#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ -#define MOD_1_2_TO_MOD_1_4_THRESHOLD 15 -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD MP_SIZE_T_MAX +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 #define USE_PREINV_DIVREM_1 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always */ #define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ -#define MUL_TOOM22_THRESHOLD 34 -#define MUL_TOOM33_THRESHOLD 70 -#define MUL_TOOM44_THRESHOLD 187 -#define MUL_TOOM6H_THRESHOLD 286 -#define MUL_TOOM8H_THRESHOLD 321 +#define MUL_TOOM22_THRESHOLD 19 +#define MUL_TOOM33_THRESHOLD 55 +#define MUL_TOOM44_THRESHOLD 88 +#define MUL_TOOM6H_THRESHOLD 137 +#define MUL_TOOM8H_THRESHOLD 181 -#define MUL_TOOM32_TO_TOOM43_THRESHOLD 110 -#define MUL_TOOM32_TO_TOOM53_THRESHOLD 118 -#define MUL_TOOM42_TO_TOOM53_THRESHOLD 107 -#define MUL_TOOM42_TO_TOOM63_THRESHOLD 145 +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 57 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 56 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 57 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 56 #define SQR_BASECASE_THRESHOLD 0 /* always */ -#define SQR_TOOM2_THRESHOLD 68 -#define SQR_TOOM3_THRESHOLD 113 -#define SQR_TOOM4_THRESHOLD 312 -#define SQR_TOOM6_THRESHOLD 330 -#define SQR_TOOM8_THRESHOLD 357 +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 56 +#define SQR_TOOM4_THRESHOLD 130 +#define SQR_TOOM6_THRESHOLD 189 +#define SQR_TOOM8_THRESHOLD 296 -#define MULMOD_BNM1_THRESHOLD 19 -#define SQRMOD_BNM1_THRESHOLD 20 +#define MULMID_TOOM42_THRESHOLD 26 -#define MUL_FFT_MODF_THRESHOLD 304 /* k = 5 */ +#define MULMOD_BNM1_THRESHOLD 7 +#define SQRMOD_BNM1_THRESHOLD 12 + +#define POWM_SEC_TABLE 
2,26,127,453,1068 + +#define MUL_FFT_MODF_THRESHOLD 212 /* k = 5 */ #define MUL_FFT_TABLE3 \ - { { 304, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ - { 10, 5}, { 21, 6}, { 17, 7}, { 9, 6}, \ - { 20, 7}, { 11, 6}, { 24, 7}, { 13, 8}, \ - { 7, 7}, { 21, 8}, { 11, 7}, { 27, 9}, \ - { 7, 8}, { 15, 7}, { 33, 8}, { 19, 7}, \ - { 41, 8}, { 23, 7}, { 47, 8}, { 27, 9}, \ + { { 212, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 13, 7}, { 7, 6}, { 16, 7}, { 9, 6}, \ + { 19, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \ + { 11, 7}, { 25, 9}, { 7, 8}, { 15, 7}, \ + { 31, 8}, { 19, 7}, { 39, 8}, { 23, 9}, \ { 15, 8}, { 39, 9}, { 23, 8}, { 47,10}, \ - { 15, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ - { 79, 9}, { 47, 8}, { 95,10}, { 31, 9}, \ - { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \ - { 63, 9}, { 127, 8}, { 255, 9}, { 135,10}, \ - { 79, 9}, { 159, 8}, { 319,10}, { 95, 9}, \ - { 191, 8}, { 383,11}, { 63,10}, { 127, 9}, \ - { 255, 8}, { 511, 9}, { 271,10}, { 143, 9}, \ - { 287,10}, { 159, 9}, { 319,11}, { 95,10}, \ - { 191, 9}, { 383,12}, { 63,11}, { 127,10}, \ - { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \ - { 287,11}, { 159,10}, { 319, 9}, { 639,10}, \ - { 351,11}, { 191,10}, { 383, 9}, { 767,10}, \ - { 415,11}, { 223,10}, { 447,12}, { 4096,13}, \ - { 8192,14}, { 16384,15}, { 32768,16} } -#define MUL_FFT_TABLE3_SIZE 83 -#define MUL_FFT_THRESHOLD 4736 - -#define SQR_FFT_MODF_THRESHOLD 312 /* k = 5 */ -#define SQR_FFT_TABLE3 \ - { { 312, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ - { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \ - { 27, 7}, { 17, 6}, { 35, 7}, { 21, 8}, \ - { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \ - { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \ - { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \ - { 47,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ - { 39, 8}, { 79, 9}, { 47,10}, { 31, 9}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 39, 8}, \ + { 79, 9}, { 47,10}, { 31, 9}, { 63, 8}, \ + { 127, 9}, { 71, 8}, { 143, 7}, { 287, 9}, \ { 79,10}, { 47,11}, { 31,10}, { 63, 9}, \ - { 127, 8}, { 255,10}, { 79, 9}, { 159, 8}, \ - { 319,10}, { 
95, 9}, { 191,11}, { 63,10}, \ - { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \ + { 127, 8}, { 255, 7}, { 511, 9}, { 143, 8}, \ + { 287,10}, { 79, 9}, { 159, 8}, { 319, 9}, \ + { 175, 8}, { 351,10}, { 95, 9}, { 191, 8}, \ + { 383, 9}, { 207,10}, { 111,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \ + { 287, 8}, { 575,10}, { 159, 9}, { 319,10}, \ + { 175, 9}, { 351,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 207, 9}, { 415,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 287, 9}, \ + { 575,11}, { 159,10}, { 351, 9}, { 703,11}, \ + { 191,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447,12}, { 4096,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 89 +#define MUL_FFT_THRESHOLD 1728 + +#define SQR_FFT_MODF_THRESHOLD 184 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 184, 5}, { 6, 4}, { 13, 5}, { 13, 6}, \ + { 7, 5}, { 15, 6}, { 13, 7}, { 7, 6}, \ + { 16, 7}, { 9, 6}, { 19, 7}, { 11, 6}, \ + { 23, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \ + { 11, 7}, { 23, 9}, { 7, 8}, { 23, 9}, \ + { 15, 8}, { 39, 9}, { 23,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \ + { 47,10}, { 31, 9}, { 63, 8}, { 127, 7}, \ + { 255, 9}, { 71, 8}, { 143, 7}, { 287, 6}, \ + { 575, 9}, { 79,10}, { 47,11}, { 31,10}, \ + { 63, 9}, { 127, 8}, { 255, 9}, { 143, 8}, \ + { 287, 7}, { 575,10}, { 79, 9}, { 159, 8}, \ + { 319, 9}, { 175, 8}, { 351,10}, { 95, 9}, \ + { 191, 8}, { 383, 9}, { 207,10}, { 111, 9}, \ + { 223,11}, { 63,10}, { 127, 9}, { 255,10}, \ { 143, 9}, { 287, 8}, { 575,10}, { 159, 9}, \ - { 319,11}, { 95,10}, { 191, 9}, { 383,12}, \ - { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ - { 271, 9}, { 543,10}, { 287, 9}, { 575,11}, \ - { 159,10}, { 319, 9}, { 639,10}, { 351,11}, \ - { 191,10}, { 383, 9}, { 767,10}, { 415,11}, \ - { 223,10}, { 447,12}, { 4096,13}, { 8192,14}, \ - { 16384,15}, { 32768,16} } -#define SQR_FFT_TABLE3_SIZE 78 -#define SQR_FFT_THRESHOLD 2752 - -#define MULLO_BASECASE_THRESHOLD 0 /* always */ -#define MULLO_DC_THRESHOLD 151 
-#define MULLO_MUL_N_THRESHOLD 1175 - -#define DC_DIV_QR_THRESHOLD 133 -#define DC_DIVAPPR_Q_THRESHOLD 442 -#define DC_BDIV_QR_THRESHOLD 130 -#define DC_BDIV_Q_THRESHOLD 324 - -#define INV_MULMOD_BNM1_THRESHOLD 116 -#define INV_NEWTON_THRESHOLD 507 -#define INV_APPR_THRESHOLD 454 - -#define BINV_NEWTON_THRESHOLD 507 -#define REDC_1_TO_REDC_N_THRESHOLD 118 - -#define MU_DIV_QR_THRESHOLD 1652 -#define MU_DIVAPPR_Q_THRESHOLD 1752 -#define MUPI_DIV_QR_THRESHOLD 225 -#define MU_BDIV_QR_THRESHOLD 762 -#define MU_BDIV_Q_THRESHOLD 1017 - -#define MATRIX22_STRASSEN_THRESHOLD 28 -#define HGCD_THRESHOLD 76 -#define GCD_DC_THRESHOLD 333 -#define GCDEXT_DC_THRESHOLD 245 + { 319,10}, { 175, 9}, { 351,11}, { 95,10}, \ + { 191, 9}, { 383,10}, { 207, 9}, { 415,10}, \ + { 223,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 351, 9}, { 703, 8}, { 1407,11}, { 191,10}, \ + { 415,11}, { 223,10}, { 447, 9}, { 895,12}, \ + { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 92 +#define SQR_FFT_THRESHOLD 1600 + +#define MULLO_BASECASE_THRESHOLD 2 +#define MULLO_DC_THRESHOLD 57 +#define MULLO_MUL_N_THRESHOLD 3176 + +#define DC_DIV_QR_THRESHOLD 52 +#define DC_DIVAPPR_Q_THRESHOLD 187 +#define DC_BDIV_QR_THRESHOLD 64 +#define DC_BDIV_Q_THRESHOLD 146 + +#define INV_MULMOD_BNM1_THRESHOLD 68 +#define INV_NEWTON_THRESHOLD 182 +#define INV_APPR_THRESHOLD 182 + +#define BINV_NEWTON_THRESHOLD 186 +#define REDC_1_TO_REDC_N_THRESHOLD 60 + +#define MU_DIV_QR_THRESHOLD 924 +#define MU_DIVAPPR_Q_THRESHOLD 807 +#define MUPI_DIV_QR_THRESHOLD 73 +#define MU_BDIV_QR_THRESHOLD 667 +#define MU_BDIV_Q_THRESHOLD 823 + +#define MATRIX22_STRASSEN_THRESHOLD 8 +#define HGCD_THRESHOLD 61 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 974 +#define GCD_DC_THRESHOLD 195 +#define GCDEXT_DC_THRESHOLD 134 #define JACOBI_BASE_METHOD 4 -#define GET_STR_DC_THRESHOLD 10 -#define GET_STR_PRECOMPUTE_THRESHOLD 20 -#define SET_STR_DC_THRESHOLD 199 
-#define SET_STR_PRECOMPUTE_THRESHOLD 478 +#define GET_STR_DC_THRESHOLD 9 +#define GET_STR_PRECOMPUTE_THRESHOLD 21 +#define SET_STR_DC_THRESHOLD 190 +#define SET_STR_PRECOMPUTE_THRESHOLD 411 diff --git a/mpn/powerpc64/mode64/p3/gmp-mparam.h b/mpn/powerpc64/mode64/p3/gmp-mparam.h index 221b0e1d8..cf1d8ca47 100644 --- a/mpn/powerpc64/mode64/p3/gmp-mparam.h +++ b/mpn/powerpc64/mode64/p3/gmp-mparam.h @@ -23,12 +23,13 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MOD_1_NORM_THRESHOLD 0 /* always */ #define MOD_1_UNNORM_THRESHOLD 0 /* always */ -#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* never */ -#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 -#define MOD_1_1_TO_MOD_1_2_THRESHOLD 16 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14 #define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 17 #define USE_PREINV_DIVREM_1 0 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always (native) */ #define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ @@ -36,22 +37,26 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MUL_TOOM33_THRESHOLD 33 #define MUL_TOOM44_THRESHOLD 46 #define MUL_TOOM6H_THRESHOLD 77 -#define MUL_TOOM8H_THRESHOLD 115 +#define MUL_TOOM8H_THRESHOLD 139 #define MUL_TOOM32_TO_TOOM43_THRESHOLD 49 -#define MUL_TOOM32_TO_TOOM53_THRESHOLD 38 -#define MUL_TOOM42_TO_TOOM53_THRESHOLD 33 -#define MUL_TOOM42_TO_TOOM63_THRESHOLD 32 - -#define SQR_BASECASE_THRESHOLD 0 /* always */ -#define SQR_TOOM2_THRESHOLD 16 -#define SQR_TOOM3_THRESHOLD 49 -#define SQR_TOOM4_THRESHOLD 70 -#define SQR_TOOM6_THRESHOLD 93 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 48 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 49 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 49 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 14 +#define SQR_TOOM3_THRESHOLD 45 +#define SQR_TOOM4_THRESHOLD 64 +#define SQR_TOOM6_THRESHOLD 85 #define SQR_TOOM8_THRESHOLD 139 +#define MULMID_TOOM42_THRESHOLD 22 + #define MULMOD_BNM1_THRESHOLD 8 -#define SQRMOD_BNM1_THRESHOLD 9 +#define SQRMOD_BNM1_THRESHOLD 10 + +#define POWM_SEC_TABLE 2,23,127,502,1421 #define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */ #define MUL_FFT_TABLE3 \ @@ -123,35 +128,37 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define SQR_FFT_TABLE3_SIZE 118 #define SQR_FFT_THRESHOLD 1728 -#define MULLO_BASECASE_THRESHOLD 3 -#define MULLO_DC_THRESHOLD 28 -#define MULLO_MUL_N_THRESHOLD 4940 +#define MULLO_BASECASE_THRESHOLD 2 +#define MULLO_DC_THRESHOLD 27 +#define MULLO_MUL_N_THRESHOLD 2367 -#define DC_DIV_QR_THRESHOLD 27 -#define DC_DIVAPPR_Q_THRESHOLD 95 -#define DC_BDIV_QR_THRESHOLD 28 +#define DC_DIV_QR_THRESHOLD 26 +#define DC_DIVAPPR_Q_THRESHOLD 87 +#define DC_BDIV_QR_THRESHOLD 27 #define DC_BDIV_Q_THRESHOLD 62 -#define INV_MULMOD_BNM1_THRESHOLD 29 -#define INV_NEWTON_THRESHOLD 92 -#define INV_APPR_THRESHOLD 94 +#define INV_MULMOD_BNM1_THRESHOLD 34 +#define INV_NEWTON_THRESHOLD 91 +#define INV_APPR_THRESHOLD 91 #define BINV_NEWTON_THRESHOLD 115 -#define REDC_1_TO_REDC_N_THRESHOLD 30 +#define REDC_1_TO_REDC_N_THRESHOLD 31 #define MU_DIV_QR_THRESHOLD 551 #define MU_DIVAPPR_Q_THRESHOLD 551 -#define MUPI_DIV_QR_THRESHOLD 49 -#define MU_BDIV_QR_THRESHOLD 492 +#define MUPI_DIV_QR_THRESHOLD 50 +#define MU_BDIV_QR_THRESHOLD 474 #define MU_BDIV_Q_THRESHOLD 492 -#define MATRIX22_STRASSEN_THRESHOLD 9 -#define HGCD_THRESHOLD 55 -#define GCD_DC_THRESHOLD 150 -#define GCDEXT_DC_THRESHOLD 124 +#define MATRIX22_STRASSEN_THRESHOLD 8 +#define HGCD_THRESHOLD 53 +#define HGCD_APPR_THRESHOLD 55 +#define HGCD_REDUCE_THRESHOLD 688 +#define GCD_DC_THRESHOLD 148 +#define GCDEXT_DC_THRESHOLD 118 #define JACOBI_BASE_METHOD 1 -#define GET_STR_DC_THRESHOLD 17 +#define GET_STR_DC_THRESHOLD 16 #define GET_STR_PRECOMPUTE_THRESHOLD 27 -#define SET_STR_DC_THRESHOLD 354 +#define SET_STR_DC_THRESHOLD 375 #define SET_STR_PRECOMPUTE_THRESHOLD 812 diff --git a/mpn/powerpc64/mode64/p6/gmp-mparam.h b/mpn/powerpc64/mode64/p6/gmp-mparam.h index bf7f0fd0c..5392138f1 100644 --- a/mpn/powerpc64/mode64/p6/gmp-mparam.h +++ b/mpn/powerpc64/mode64/p6/gmp-mparam.h @@ -39,26 +39,26 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MUL_TOOM33_THRESHOLD 50 #define MUL_TOOM44_THRESHOLD 112 #define MUL_TOOM6H_THRESHOLD 274 -#define MUL_TOOM8H_THRESHOLD 430 +#define MUL_TOOM8H_THRESHOLD 339 #define MUL_TOOM32_TO_TOOM43_THRESHOLD 62 -#define MUL_TOOM32_TO_TOOM53_THRESHOLD 84 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 76 #define MUL_TOOM42_TO_TOOM53_THRESHOLD 73 -#define MUL_TOOM42_TO_TOOM63_THRESHOLD 66 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 78 #define SQR_BASECASE_THRESHOLD 0 /* always (native) */ #define SQR_TOOM2_THRESHOLD 24 #define SQR_TOOM3_THRESHOLD 49 #define SQR_TOOM4_THRESHOLD 136 -#define SQR_TOOM6_THRESHOLD 274 -#define SQR_TOOM8_THRESHOLD 410 +#define SQR_TOOM6_THRESHOLD 226 +#define SQR_TOOM8_THRESHOLD 393 #define MULMID_TOOM42_THRESHOLD 36 #define MULMOD_BNM1_THRESHOLD 14 #define SQRMOD_BNM1_THRESHOLD 14 -#define POWM_SEC_TABLE 4,19,228,713,919 +#define POWM_SEC_TABLE 4,23,213,840,2618 #define MUL_FFT_MODF_THRESHOLD 340 /* k = 5 */ #define MUL_FFT_TABLE3 \ @@ -112,35 +112,35 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MULLO_BASECASE_THRESHOLD 5 #define MULLO_DC_THRESHOLD 61 -#define MULLO_MUL_N_THRESHOLD 5558 +#define MULLO_MUL_N_THRESHOLD 3271 -#define DC_DIV_QR_THRESHOLD 29 -#define DC_DIVAPPR_Q_THRESHOLD 112 +#define DC_DIV_QR_THRESHOLD 59 +#define DC_DIVAPPR_Q_THRESHOLD 200 #define DC_BDIV_QR_THRESHOLD 70 #define DC_BDIV_Q_THRESHOLD 168 #define INV_MULMOD_BNM1_THRESHOLD 61 -#define INV_NEWTON_THRESHOLD 93 -#define INV_APPR_THRESHOLD 91 +#define INV_NEWTON_THRESHOLD 166 +#define INV_APPR_THRESHOLD 166 #define BINV_NEWTON_THRESHOLD 222 #define REDC_1_TO_REDC_N_THRESHOLD 63 -#define MU_DIV_QR_THRESHOLD 807 -#define MU_DIVAPPR_Q_THRESHOLD 807 -#define MUPI_DIV_QR_THRESHOLD 27 -#define MU_BDIV_QR_THRESHOLD 872 +#define MU_DIV_QR_THRESHOLD 998 +#define MU_DIVAPPR_Q_THRESHOLD 979 +#define MUPI_DIV_QR_THRESHOLD 59 +#define MU_BDIV_QR_THRESHOLD 889 #define MU_BDIV_Q_THRESHOLD 1078 #define MATRIX22_STRASSEN_THRESHOLD 13 -#define HGCD_THRESHOLD 94 -#define HGCD_APPR_THRESHOLD 55 -#define HGCD_REDUCE_THRESHOLD 2121 -#define GCD_DC_THRESHOLD 253 -#define GCDEXT_DC_THRESHOLD 217 +#define HGCD_THRESHOLD 109 +#define HGCD_APPR_THRESHOLD 108 +#define HGCD_REDUCE_THRESHOLD 1052 +#define GCD_DC_THRESHOLD 501 +#define GCDEXT_DC_THRESHOLD 249 #define JACOBI_BASE_METHOD 4 #define GET_STR_DC_THRESHOLD 16 #define GET_STR_PRECOMPUTE_THRESHOLD 29 #define SET_STR_DC_THRESHOLD 532 -#define SET_STR_PRECOMPUTE_THRESHOLD 1561 +#define SET_STR_PRECOMPUTE_THRESHOLD 1639 -- cgit v1.2.1 From 52b003cc5830e7fde5e3dca4338c170f3cdd9fe5 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Thu, 24 Nov 2011 12:39:37 +0100 Subject: *** empty log message *** --- mpn/s390_64/README | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 mpn/s390_64/README diff --git a/mpn/s390_64/README b/mpn/s390_64/README new file mode 100644 index 000000000..82b68a080 --- /dev/null +++ b/mpn/s390_64/README @@ -0,0 +1,77 @@ +Copyright 2011 Free Software Foundation, 
Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + + + +There are 5 generations of 64-bit s390 processors, z900, z990, z9, +z10, and z196. The current GMP code was optimised for the two oldest, +z900 and z990. + + +mpn_copyi + +This code makes use of a loop around MVC. It almost surely runs very +close to optimally. A small improvement could be done by using one +MVC for size 256 bytes, now we use two (we use an extra MVC when +copying any multiple of 256 bytes). + + +mpn_copyd + +We have tried several feed-in variants here, branch tree, jump table +and computed goto. The fastest (on z990) turned out to be computed +goto. + +An approach not tried is EX of LMG and STMG, modifying the register set +on-the-fly. Using that trick, we could completely avoid using +separate feed-in paths. + + +mpn_lshift, mpn_rshift + +The current code runs at pipeline decode bandwidth on z990. + + +mpn_add_n, mpn_sub_n + +The current code is 4-way unrolled. It should be unrolled more, at +least 8x, in order to reach 2.5 c/l. + + +mpn_mul_1, mpn_addmul_1, mpn_submul_1 + +The current code is very naive, but due to the non-pipelined nature of +MLGR on z900 and z990, more sophisticated code would not gain much. + +On z10 one would need to cluster at least 4 MLGR together, in order to +reduce stalling.
+ +On z196, one surely wants to use unrolling and pipelining, to perhaps +reach around 12 c/l. A major issue here and on z10 is ALCGR's 3 cycle +stalling. + + +mpn_mul_2, mpn_addmul_2 + +At least for older machines (z900, z990) with very slow MLGR, we +should use Karatsuba's algorithm on 2-limb units, making mul_2 and +addmul_2 the main multiplication primitives. The newer machines might +benefit less from this approach, perhaps in particular z10, where MLGR +clustering is more important. + +With Karatsuba, one could hope for around 16 cycles per accumulated +128 cross product, on z990. -- cgit v1.2.1 From 5c345ce60c939a92a920e984d36b4d4d52d4bae9 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Thu, 24 Nov 2011 12:41:46 +0100 Subject: *** empty log message *** --- ChangeLog | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ChangeLog b/ChangeLog index 80e0f7a32..ba4f47ede 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2011-11-24 Torbjorn Granlund + + * mpn/powerpc32/p7/gmp-mparam.h: New file. + + * tune/tuneup.c (tune_mu_div, tune_mu_bdiv): Up min_size to karatsuba's + threshold. + 2011-11-22 Torbjorn Granlund * mpn/powerpc64/mode64/p6/aorsmul_1.asm: New file. -- cgit v1.2.1 From cbc96e61b041e6ff713adf3885c610fdefa2023f Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Thu, 24 Nov 2011 22:05:28 +0100 Subject: (Formatted Output Strings): Clarify rules for mpf_t precision. --- doc/gmp.texi | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/gmp.texi b/doc/gmp.texi index 1d6538165..9e77abe7f 100644 --- a/doc/gmp.texi +++ b/doc/gmp.texi @@ -5909,7 +5909,7 @@ instance extensions registered with GLIBC @code{register_printf_function}. Also currently there's no support for POSIX @samp{$} style numbered arguments (perhaps this will be added in the future).
-The precision field has it's usual meaning for integer @samp{Z} and float +The precision field has its usual meaning for integer @samp{Z} and float @samp{F} types, but is currently undefined for @samp{Q} and should not be used with that. @@ -5920,7 +5920,10 @@ happens even for an @samp{f} conversion of an @code{mpf_t} which is an integer, for instance @math{2^@W{1024}} in an @code{mpf_t} of 128 bits precision will only produce about 40 digits, then pad with zeros to the decimal point. An empty precision field like @samp{%.Fe} or @samp{%.Ff} can -be used to specifically request just the significant digits. +be used to specifically request just the significant digits. Without any dot +and thus no precision field, a precision value of 6 will be used. Note that +these rules mean that @samp{%Ff}, @samp{%.Ff}, and @samp{%.0Ff} will all be +different. The decimal point character (or string) is taken from the current locale settings on systems which provide @code{localeconv} (@pxref{Locales,, Locales -- cgit v1.2.1 From 853e7d21ab5471b137ac4f80258dd779d54061ba Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Thu, 24 Nov 2011 22:11:22 +0100 Subject: *** empty log message *** --- ChangeLog | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ChangeLog b/ChangeLog index ba4f47ede..761f9161b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,8 @@ 2011-11-24 Torbjorn Granlund + * doc/gmp.texi (Formatted Output Strings): Clarify rules for mpf_t + precision. + * mpn/powerpc32/p7/gmp-mparam.h: New file. * tune/tuneup.c (tune_mu_div, tune_mu_bdiv): Up min_size to karatsuba's -- cgit v1.2.1 From df16fd175d4cfcbd2d60cab0ca927c992e3185a4 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Fri, 25 Nov 2011 23:55:30 +0100 Subject: Overhaul x86/x86_64 support, merging three case statements into one. 
--- configure.in | 110 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/configure.in b/configure.in index 1d1ebd10f..28df31214 100644 --- a/configure.in +++ b/configure.in @@ -1468,46 +1468,62 @@ case $host in i386*) gcc_cflags_cpu="-mtune=i386 -mcpu=i386 -m386" gcc_cflags_arch="-march=i386" + path="x86" ;; i486*) gcc_cflags_cpu="-mtune=i486 -mcpu=i486 -m486" gcc_cflags_arch="-march=i486" + path="x86/i486 x86" ;; i586 | pentium) gcc_cflags_cpu="-mtune=pentium -mcpu=pentium -m486" gcc_cflags_arch="-march=pentium" + path="x86/pentium x86" ;; pentiummmx) gcc_cflags_cpu="-mtune=pentium-mmx -mcpu=pentium-mmx -mcpu=pentium -m486" gcc_cflags_arch="-march=pentium-mmx -march=pentium" + path="x86/pentium/mmx x86/pentium x86" ;; i686 | pentiumpro) gcc_cflags_cpu="-mtune=pentiumpro -mcpu=pentiumpro -mcpu=i486 -m486" gcc_cflags_arch="-march=pentiumpro -march=pentium" + path="x86/p6 x86" ;; pentium2) gcc_cflags_cpu="-mtune=pentium2 -mcpu=pentium2 -mcpu=pentiumpro -mcpu=i486 -m486" gcc_cflags_arch="-march=pentium2 -march=pentiumpro -march=pentium" + path="x86/p6/mmx x86/p6 x86" ;; - pentium3 | pentiumm) + pentium3) gcc_cflags_cpu="-mtune=pentium3 -mcpu=pentium3 -mcpu=pentiumpro -mcpu=i486 -m486" gcc_cflags_arch="-march=pentium3 -march=pentiumpro -march=pentium" + path="x86/p6/p3mmx x86/p6/mmx x86/p6 x86" + ;; + pentiumm) + gcc_cflags_cpu="-mtune=pentium3 -mcpu=pentium3 -mcpu=pentiumpro -mcpu=i486 -m486" + gcc_cflags_arch="-march=pentium3 -march=pentiumpro -march=pentium" + path="x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86" ;; k6) gcc_cflags_cpu="-mtune=k6 -mcpu=k6 -mcpu=i486 -m486" gcc_cflags_arch="-march=k6" + path="x86/k6/mmx x86/k6 x86" ;; k62) gcc_cflags_cpu="-mtune=k6-2 -mcpu=k6-2 -mcpu=k6 -mcpu=i486 -m486" gcc_cflags_arch="-march=k6-2 -march=k6" + path="x86/k6/k62mmx x86/k6/mmx x86/k6 x86" ;; k63) gcc_cflags_cpu="-mtune=k6-3 -mcpu=k6-3 -mcpu=k6 -mcpu=i486 -m486" gcc_cflags_arch="-march=k6-3 
-march=k6" + path="x86/k6/k62mmx x86/k6/mmx x86/k6 x86" ;; geode) gcc_cflags_cpu="-mtune=k6-3 -mcpu=k6-3 -mcpu=k6 -mcpu=i486 -m486" gcc_cflags_arch="-march=k6-3 -march=k6" + path="x86/geode x86/k6/k62mmx x86/k6/mmx x86/k6 x86" ;; athlon) # Athlon instruction costs are close to P6 (3 cycle load latency, @@ -1515,6 +1531,7 @@ case $host in # know athlon (eg. 2.95.2 doesn't) then fall back on pentiumpro. gcc_cflags_cpu="-mtune=athlon -mcpu=athlon -mcpu=pentiumpro -mcpu=i486 -m486" gcc_cflags_arch="-march=athlon -march=pentiumpro -march=pentium" + path="x86/k7/mmx x86/k7 x86" ;; i786 | pentium4) # pentiumpro is the primary fallback when gcc doesn't know pentium4. @@ -1524,77 +1541,84 @@ case $host in gcc_cflags_cpu="-mtune=pentium4 -mcpu=pentium4 -mcpu=pentiumpro -mcpu=i486 -m486" gcc_cflags_arch="-march=pentium4 -march=pentium4~-mno-sse2 -march=pentiumpro -march=pentium" gcc_64_cflags_cpu="-mtune=nocona" + path="x86/pentium4/sse2 x86/pentium4/mmx x86/pentium4 x86" + path_64="x86_64/pentium4 x86_64" ;; viac32) # Not sure of the best fallbacks here for -mcpu. # c3-2 has sse and mmx, so pentium3 is good for -march. gcc_cflags_cpu="-mtune=c3-2 -mcpu=c3-2 -mcpu=i486 -m486" gcc_cflags_arch="-march=c3-2 -march=pentium3 -march=pentiumpro -march=pentium" + path="x86/p6/p3mmx x86/p6/mmx x86/p6 x86" ;; viac3*) # Not sure of the best fallbacks here. 
gcc_cflags_cpu="-mtune=c3 -mcpu=c3 -mcpu=i486 -m486" gcc_cflags_arch="-march=c3 -march=pentium-mmx -march=pentium" + path="x86/pentium/mmx x86/pentium x86" ;; athlon64 | k8 | x86_64) gcc_cflags_cpu="-mtune=k8 -mcpu=athlon -mcpu=pentiumpro -mcpu=i486 -m486" gcc_cflags_arch="-march=k8 -march=k8~-mno-sse2 -march=athlon -march=pentiumpro -march=pentium" + path="x86/k8 x86" + path_64="x86_64/k8 x86_64" ;; k10) gcc_cflags_cpu="-mtune=amdfam10 -mtune=k8" - gcc_cflags_arch="-march=amdfam10 -mtune=k8 -march=k8~-mno-sse2" + gcc_cflags_arch="-march=amdfam10 -march=k8 -march=k8~-mno-sse2" + path="x86/k10 x86/k8 x86" + path_64="x86_64/k10 x86_64/k8 x86_64" ;; bobcat) gcc_cflags_cpu="-mtune=btver1 -mtune=amdfam10 -mtune=k8" - gcc_cflags_arch="-march=btver1 -march=amdfam10 -mtune=k8 -march=k8~-mno-sse2" + gcc_cflags_arch="-march=btver1 -march=amdfam10 -march=k8 -march=k8~-mno-sse2" + path="x86/bobcat x86" + path_64="x86_64/bobcat x86_64/k10 x86_64/k8 x86_64" ;; - bulldozer) + bulldozer | bd1) gcc_cflags_cpu="-mtune=bdver1 -mtune=amdfam10 -mtune=k8" - gcc_cflags_arch="-march=bdver1 -march=amdfam10 -mtune=k8 -march=k8~-mno-sse2" + gcc_cflags_arch="-march=bdver1 -march=amdfam10 -march=k8 -march=k8~-mno-sse2" + path="x86/bd1 x86" + path_64="x86_64/bd1 x86_64" ;; core2) gcc_cflags_cpu="-mtune=core2 -mtune=k8" gcc_cflags_arch="-march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2" + path="x86/core2 x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86" + path_64="x86_64/core2 x86_64" + ;; + corei | coreinhm | coreiwsm) + gcc_cflags_cpu="-mtune=corei7 -mtune=core2 -mtune=k8" + gcc_cflags_arch="-march=corei7 -march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2" + path="x86/coreinhm x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86" + path_64="x86_64/coreinhm x86_64/core2 x86_64" ;; - corei | coreinhm | coreiwsm | coreisbr) + coreisbr) gcc_cflags_cpu="-mtune=corei7 -mtune=core2 -mtune=k8" gcc_cflags_arch="-march=corei7 -march=core2 -march=core2~-mno-sse2 -march=k8 
-march=k8~-mno-sse2" + path="x86/coreisbr x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86" + path_64="x86_64/coreisbr x86_64/coreinhm x86_64/core2 x86_64" ;; atom) gcc_cflags_cpu="-mtune=atom -mtune=pentium3" gcc_cflags_arch="-march=atom -march=pentium3" + path="x86/atom/sse2 x86/atom/mmx x86/atom x86" + path_64="x86_64/atom x86_64" + ;; + nano) + gcc_cflags_cpu="-mtune=nano" + gcc_cflags_arch="-march=nano" + path="x86/nano x86" + path_64="x86_64/nano x86_64" ;; *) gcc_cflags_cpu="-mtune=i486 -mcpu=i486 -m486" gcc_cflags_arch="-march=i486" + path="x86" + path_64="x86_64" ;; esac - case $host_cpu in - i386*) path="x86" ;; - i486*) path="x86/i486 x86" ;; - i586 | pentium) path="x86/pentium x86" ;; - pentiummmx) path="x86/pentium/mmx x86/pentium x86" ;; - i686 | pentiumpro) path="x86/p6 x86" ;; - pentium2) path="x86/p6/mmx x86/p6 x86" ;; - pentium3) path="x86/p6/p3mmx x86/p6/mmx x86/p6 x86";; - pentiumm | core2 | corei | coreinhm | coreiwsm | coreisbr) - path="x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86";; - [k6[23]]) path="x86/k6/k62mmx x86/k6/mmx x86/k6 x86" ;; - k6) path="x86/k6/mmx x86/k6 x86" ;; - geode) path="x86/geode x86/k6/k62mmx x86/k6/mmx x86/k6 x86" ;; - # we don't have any specific 32-bit code for athlon64/opteron, the - # athlon code should be reasonable - athlon | athlon64 | k8 | k10 | bobcat | bulldozer) - path="x86/k7/mmx x86/k7 x86" ;; - i786 | pentium4) path="x86/pentium4/sse2 x86/pentium4/mmx x86/pentium4 x86" ;; - # VIA/Centaur processors, sold as CyrixIII and C3. 
- viac32) path="x86/p6/p3mmx x86/p6/mmx x86/p6 x86";; - viac3*) path="x86/pentium/mmx x86/pentium x86";; - atom) path="x86/atom/sse2 x86/atom/mmx x86/atom x86" ;; - *) path="x86" ;; - esac - case $host in X86_64_PATTERN) cclist_64="gcc" @@ -1604,34 +1628,10 @@ case $host in SPEED_CYCLECOUNTER_OBJ_64=x86_64.lo cyclecounter_size_64=2 abilist="64 32" - path_64="x86_64" if test "$enable_assembly" = "yes" ; then extra_functions_64="invert_limb_table" fi - case $host_cpu in - x86_64) - ;; - k10 | bulldozer) - path_64="x86_64/k10 x86_64/k8 $path_64" ;; - athlon64 | k8) - path_64="x86_64/k8 $path_64" ;; - bobcat) - path_64="x86_64/bobcat x86_64/k10 x86_64/k8 $path_64" ;; - pentium4) - path_64="x86_64/pentium4 $path_64" ;; - core2) - path_64="x86_64/core2 $path_64" ;; - corei | coreinhm | coreiwsm) - path_64="x86_64/coreinhm x86_64/core2 $path_64" ;; - coreisbr) - path_64="x86_64/coreisbr x86_64/coreinhm x86_64/core2 $path_64" ;; - atom) - path_64="x86_64/atom $path_64" ;; - nano) - path_64="x86_64/nano $path_64" ;; - esac - case $host in *-*-solaris*) # Sun cc. -- cgit v1.2.1 From d0600ffcd15e1baaadee4838ef966ae28eb6e695 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Fri, 25 Nov 2011 23:57:06 +0100 Subject: Many new gmp-mparam.h file for 64-bit CPUs in 32-bit mode. 
--- mpn/x86/bobcat/gmp-mparam.h | 141 +++++++++++++++++++++++++++++++++++++++ mpn/x86/core2/gmp-mparam.h | 141 +++++++++++++++++++++++++++++++++++++++ mpn/x86/coreinhm/gmp-mparam.h | 141 +++++++++++++++++++++++++++++++++++++++ mpn/x86/coreisbr/gmp-mparam.h | 140 ++++++++++++++++++++++++++++++++++++++ mpn/x86/k10/gmp-mparam.h | 142 +++++++++++++++++++++++++++++++++++++++ mpn/x86/k8/gmp-mparam.h | 144 +++++++++++++++++++++++++++++++++++++++ mpn/x86/nano/gmp-mparam.h | 152 ++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 1001 insertions(+) create mode 100644 mpn/x86/bobcat/gmp-mparam.h create mode 100644 mpn/x86/core2/gmp-mparam.h create mode 100644 mpn/x86/coreinhm/gmp-mparam.h create mode 100644 mpn/x86/coreisbr/gmp-mparam.h create mode 100644 mpn/x86/k10/gmp-mparam.h create mode 100644 mpn/x86/k8/gmp-mparam.h create mode 100644 mpn/x86/nano/gmp-mparam.h diff --git a/mpn/x86/bobcat/gmp-mparam.h b/mpn/x86/bobcat/gmp-mparam.h new file mode 100644 index 000000000..58dfee1cf --- /dev/null +++ b/mpn/x86/bobcat/gmp-mparam.h @@ -0,0 +1,141 @@ +/* x86/bobcat gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, +2008, 2009, 2010, 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define BYTES_PER_MP_LIMB 4 + +/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 12 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 18 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 40 + +#define MUL_TOOM22_THRESHOLD 28 +#define MUL_TOOM33_THRESHOLD 85 +#define MUL_TOOM44_THRESHOLD 147 +#define MUL_TOOM6H_THRESHOLD 270 +#define MUL_TOOM8H_THRESHOLD 454 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 93 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 111 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 38 +#define SQR_TOOM3_THRESHOLD 101 +#define SQR_TOOM4_THRESHOLD 220 +#define SQR_TOOM6_THRESHOLD 303 +#define SQR_TOOM8_THRESHOLD 454 + +#define MULMID_TOOM42_THRESHOLD 76 + +#define MULMOD_BNM1_THRESHOLD 19 +#define SQRMOD_BNM1_THRESHOLD 23 + +#define POWM_SEC_TABLE 2,17,225,357,2212 + +#define MUL_FFT_MODF_THRESHOLD 888 /* k = 6 */ +#define MUL_FFT_TABLE3 \ + { { 888, 6}, { 27, 7}, { 15, 6}, { 33, 7}, \ + { 17, 6}, { 35, 7}, { 19, 6}, { 39, 7}, \ + { 23, 6}, { 47, 7}, { 27, 8}, { 15, 7}, \ + { 31, 6}, { 63, 7}, { 35, 8}, { 19, 7}, \ + { 41, 8}, { 23, 7}, { 49, 8}, { 31, 7}, \ + { 63, 8}, { 39, 7}, { 79, 9}, { 23, 8}, \ + { 51, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \ + { 31, 9}, { 63, 8}, { 127, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \ + { 191,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 159,11}, { 95,10}, { 191,12}, { 
63,11}, \ + { 127,10}, { 271,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 335,11}, { 191,10}, { 383, 9}, \ + { 767,11}, { 223,12}, { 4096,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 66 +#define MUL_FFT_THRESHOLD 7552 + +#define SQR_FFT_MODF_THRESHOLD 730 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 730, 5}, { 25, 6}, { 13, 5}, { 28, 6}, \ + { 15, 5}, { 31, 6}, { 27, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 23, 6}, { 47, 7}, { 27, 8}, \ + { 15, 7}, { 31, 6}, { 63, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \ + { 31, 7}, { 63, 8}, { 39, 7}, { 79, 8}, \ + { 43, 9}, { 23, 8}, { 51, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ + { 95,10}, { 31, 9}, { 63, 8}, { 127, 9}, \ + { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \ + { 95, 9}, { 191,11}, { 63,10}, { 127, 9}, \ + { 255,10}, { 159,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271,11}, { 159,10}, { 319, 9}, { 671,11}, \ + { 191, 9}, { 767,12}, { 4096,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 70 +#define SQR_FFT_THRESHOLD 7296 + +#define MULLO_BASECASE_THRESHOLD 5 +#define MULLO_DC_THRESHOLD 45 +#define MULLO_MUL_N_THRESHOLD 13463 + +#define DC_DIV_QR_THRESHOLD 72 +#define DC_DIVAPPR_Q_THRESHOLD 214 +#define DC_BDIV_QR_THRESHOLD 67 +#define DC_BDIV_Q_THRESHOLD 142 + +#define INV_MULMOD_BNM1_THRESHOLD 71 +#define INV_NEWTON_THRESHOLD 250 +#define INV_APPR_THRESHOLD 228 + +#define BINV_NEWTON_THRESHOLD 270 +#define REDC_1_TO_REDC_N_THRESHOLD 71 + +#define MU_DIV_QR_THRESHOLD 2089 +#define MU_DIVAPPR_Q_THRESHOLD 1822 +#define MUPI_DIV_QR_THRESHOLD 122 +#define MU_BDIV_QR_THRESHOLD 1787 +#define MU_BDIV_Q_THRESHOLD 1787 + +#define MATRIX22_STRASSEN_THRESHOLD 21 +#define HGCD_THRESHOLD 81 +#define HGCD_APPR_THRESHOLD 128 +#define HGCD_REDUCE_THRESHOLD 4455 +#define GCD_DC_THRESHOLD 465 +#define GCDEXT_DC_THRESHOLD 345 
+#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 32 +#define SET_STR_DC_THRESHOLD 270 +#define SET_STR_PRECOMPUTE_THRESHOLD 812 diff --git a/mpn/x86/core2/gmp-mparam.h b/mpn/x86/core2/gmp-mparam.h new file mode 100644 index 000000000..feb0f281f --- /dev/null +++ b/mpn/x86/core2/gmp-mparam.h @@ -0,0 +1,141 @@ +/* x86/core2 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, +2008, 2009, 2010, 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define BYTES_PER_MP_LIMB 4 + +/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */ + +#define MOD_1_NORM_THRESHOLD 4 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 19 + +#define MUL_TOOM22_THRESHOLD 24 +#define MUL_TOOM33_THRESHOLD 93 +#define MUL_TOOM44_THRESHOLD 228 +#define MUL_TOOM6H_THRESHOLD 294 +#define MUL_TOOM8H_THRESHOLD 458 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 90 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 96 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 34 +#define SQR_TOOM3_THRESHOLD 116 +#define SQR_TOOM4_THRESHOLD 178 +#define SQR_TOOM6_THRESHOLD 262 +#define SQR_TOOM8_THRESHOLD 597 + +#define MULMID_TOOM42_THRESHOLD 70 + +#define MULMOD_BNM1_THRESHOLD 20 +#define SQRMOD_BNM1_THRESHOLD 19 + +#define POWM_SEC_TABLE 6,26,262,991,2212 + +#define MUL_FFT_MODF_THRESHOLD 690 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 690, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 15, 5}, { 31, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \ + { 47, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \ + { 51, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47, 8}, { 95,10}, { 31, 9}, \ + { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 159,10}, { 95, 9}, { 191,11}, \ + { 63,10}, { 127, 9}, { 255,10}, { 159,11}, \ + { 95,10}, { 
191,12}, { 63,11}, { 127,10}, \ + { 271, 9}, { 543,10}, { 287,11}, { 159,10}, \ + { 319, 9}, { 639,11}, { 191,10}, { 383, 9}, \ + { 799,11}, { 223,12}, { 4096,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 70 +#define MUL_FFT_THRESHOLD 7552 + +#define SQR_FFT_MODF_THRESHOLD 630 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 630, 5}, { 25, 6}, { 13, 5}, { 28, 6}, \ + { 15, 5}, { 31, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \ + { 47, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 51, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ + { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 127,10}, { 79, 9}, { 159,10}, { 95,11}, \ + { 63,10}, { 159,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 127,10}, { 271, 9}, { 543,11}, \ + { 159,10}, { 319, 9}, { 671, 8}, { 1343,11}, \ + { 191,10}, { 383, 9}, { 799,12}, { 4096,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 67 +#define SQR_FFT_THRESHOLD 5760 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 30 +#define MULLO_MUL_N_THRESHOLD 13463 + +#define DC_DIV_QR_THRESHOLD 15 +#define DC_DIVAPPR_Q_THRESHOLD 49 +#define DC_BDIV_QR_THRESHOLD 76 +#define DC_BDIV_Q_THRESHOLD 190 + +#define INV_MULMOD_BNM1_THRESHOLD 46 +#define INV_NEWTON_THRESHOLD 35 +#define INV_APPR_THRESHOLD 35 + +#define BINV_NEWTON_THRESHOLD 324 +#define REDC_1_TO_REDC_N_THRESHOLD 83 + +#define MU_DIV_QR_THRESHOLD 1442 +#define MU_DIVAPPR_Q_THRESHOLD 1099 +#define MUPI_DIV_QR_THRESHOLD 0 /* always */ +#define MU_BDIV_QR_THRESHOLD 1589 +#define MU_BDIV_Q_THRESHOLD 1718 + +#define MATRIX22_STRASSEN_THRESHOLD 31 +#define HGCD_THRESHOLD 118 +#define HGCD_APPR_THRESHOLD 149 +#define HGCD_REDUCE_THRESHOLD 3524 +#define GCD_DC_THRESHOLD 351 +#define GCDEXT_DC_THRESHOLD 309 
+#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 26 +#define SET_STR_DC_THRESHOLD 517 +#define SET_STR_PRECOMPUTE_THRESHOLD 1402 diff --git a/mpn/x86/coreinhm/gmp-mparam.h b/mpn/x86/coreinhm/gmp-mparam.h new file mode 100644 index 000000000..21afeb619 --- /dev/null +++ b/mpn/x86/coreinhm/gmp-mparam.h @@ -0,0 +1,141 @@ +/* x86/coreinhm gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, +2008, 2009, 2010, 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define BYTES_PER_MP_LIMB 4 + +/* Generated by tuneup.c, 2011-11-25, gcc 4.5 */ + +#define MOD_1_NORM_THRESHOLD 24 +#define MOD_1_UNNORM_THRESHOLD 15 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 5 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 16 + +#define MUL_TOOM22_THRESHOLD 28 +#define MUL_TOOM33_THRESHOLD 81 +#define MUL_TOOM44_THRESHOLD 214 +#define MUL_TOOM6H_THRESHOLD 306 +#define MUL_TOOM8H_THRESHOLD 454 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 137 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 148 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 132 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 131 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 42 +#define SQR_TOOM3_THRESHOLD 149 +#define SQR_TOOM4_THRESHOLD 226 +#define SQR_TOOM6_THRESHOLD 333 +#define SQR_TOOM8_THRESHOLD 494 + +#define MULMID_TOOM42_THRESHOLD 78 + +#define MULMOD_BNM1_THRESHOLD 17 +#define SQRMOD_BNM1_THRESHOLD 21 + +#define POWM_SEC_TABLE 2,33,294,1298,2870 + +#define MUL_FFT_MODF_THRESHOLD 606 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 606, 5}, { 28, 6}, { 15, 5}, { 33, 6}, \ + { 29, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \ + { 36, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \ + { 47, 7}, { 29, 8}, { 15, 7}, { 37, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \ + { 51, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47, 8}, { 95,10}, { 31, 9}, \ + { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \ + { 95, 9}, { 191,11}, { 63,10}, { 159,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271,11}, { 159,10}, \ + { 319, 
9}, { 639,10}, { 335,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 399,12}, { 4096,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 63 +#define MUL_FFT_THRESHOLD 6784 + +#define SQR_FFT_MODF_THRESHOLD 505 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 505, 5}, { 28, 6}, { 15, 5}, { 33, 6}, \ + { 17, 5}, { 35, 6}, { 29, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \ + { 39, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 47, 8}, { 27, 7}, { 55, 8}, \ + { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \ + { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \ + { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \ + { 159,10}, { 95,11}, { 63,10}, { 143, 9}, \ + { 287,10}, { 159,11}, { 95,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,10}, { 287,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 335, 9}, { 671,10}, { 351,11}, \ + { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \ + { 799,10}, { 415,12}, { 4096,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 74 +#define SQR_FFT_THRESHOLD 4800 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 35 +#define MULLO_MUL_N_THRESHOLD 13463 + +#define DC_DIV_QR_THRESHOLD 21 +#define DC_DIVAPPR_Q_THRESHOLD 42 +#define DC_BDIV_QR_THRESHOLD 84 +#define DC_BDIV_Q_THRESHOLD 156 + +#define INV_MULMOD_BNM1_THRESHOLD 54 +#define INV_NEWTON_THRESHOLD 17 +#define INV_APPR_THRESHOLD 17 + +#define BINV_NEWTON_THRESHOLD 348 +#define REDC_1_TO_REDC_N_THRESHOLD 83 + +#define MU_DIV_QR_THRESHOLD 979 +#define MU_DIVAPPR_Q_THRESHOLD 501 +#define MUPI_DIV_QR_THRESHOLD 0 /* always */ +#define MU_BDIV_QR_THRESHOLD 1589 +#define MU_BDIV_Q_THRESHOLD 1787 + +#define MATRIX22_STRASSEN_THRESHOLD 20 +#define HGCD_THRESHOLD 57 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 3524 +#define GCD_DC_THRESHOLD 253 +#define GCDEXT_DC_THRESHOLD 
233 +#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 20 +#define SET_STR_DC_THRESHOLD 127 +#define SET_STR_PRECOMPUTE_THRESHOLD 646 diff --git a/mpn/x86/coreisbr/gmp-mparam.h b/mpn/x86/coreisbr/gmp-mparam.h new file mode 100644 index 000000000..16ef958ad --- /dev/null +++ b/mpn/x86/coreisbr/gmp-mparam.h @@ -0,0 +1,140 @@ +/* x86/coreisbr gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, +2008, 2009, 2010, 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define BYTES_PER_MP_LIMB 4 + +/* Generated by tuneup.c, 2011-11-24, gcc 4.2 */ + +#define MOD_1_NORM_THRESHOLD 24 +#define MOD_1_UNNORM_THRESHOLD 25 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 3 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 18 + +#define MUL_TOOM22_THRESHOLD 28 +#define MUL_TOOM33_THRESHOLD 101 +#define MUL_TOOM44_THRESHOLD 244 +#define MUL_TOOM6H_THRESHOLD 351 +#define MUL_TOOM8H_THRESHOLD 547 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 109 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 183 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 109 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 109 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 48 +#define SQR_TOOM3_THRESHOLD 165 +#define SQR_TOOM4_THRESHOLD 276 +#define SQR_TOOM6_THRESHOLD 366 +#define SQR_TOOM8_THRESHOLD 572 + +#define MULMID_TOOM42_THRESHOLD 98 + +#define MULMOD_BNM1_THRESHOLD 20 +#define SQRMOD_BNM1_THRESHOLD 23 + +#define POWM_SEC_TABLE 2,27,258,1052 + +#define MUL_FFT_MODF_THRESHOLD 716 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 716, 5}, { 27, 6}, { 15, 5}, { 31, 6}, \ + { 27, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \ + { 47, 7}, { 27, 8}, { 15, 7}, { 31, 6}, \ + { 63, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 51, 8}, { 31, 7}, { 63, 8}, \ + { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \ + { 71, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ + { 95, 9}, { 55,10}, { 31, 9}, { 63, 8}, \ + { 127, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \ + { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 159,11}, { 95,10}, \ + { 191,12}, { 
63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271,11}, { 159,10}, { 319, 9}, \ + { 639,11}, { 191,10}, { 383, 9}, { 767,11}, \ + { 223,12}, { 4096,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 69 +#define MUL_FFT_THRESHOLD 7552 + +#define SQR_FFT_MODF_THRESHOLD 595 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 595, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \ + { 29, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \ + { 47, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \ + { 23, 7}, { 49, 8}, { 31, 7}, { 63, 8}, \ + { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ + { 95, 9}, { 55,10}, { 31, 9}, { 63, 8}, \ + { 127, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \ + { 159,10}, { 95,11}, { 63,10}, { 159,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,11}, \ + { 159,10}, { 319, 9}, { 671,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 399,12}, { 4096,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 63 +#define SQR_FFT_THRESHOLD 5760 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 100 +#define MULLO_MUL_N_THRESHOLD 14379 + +#define DC_DIV_QR_THRESHOLD 22 +#define DC_DIVAPPR_Q_THRESHOLD 30 +#define DC_BDIV_QR_THRESHOLD 120 +#define DC_BDIV_Q_THRESHOLD 268 + +#define INV_MULMOD_BNM1_THRESHOLD 54 +#define INV_NEWTON_THRESHOLD 12 +#define INV_APPR_THRESHOLD 13 + +#define BINV_NEWTON_THRESHOLD 410 +#define REDC_1_TO_REDC_N_THRESHOLD 100 + +#define MU_DIV_QR_THRESHOLD 1037 +#define MU_DIVAPPR_Q_THRESHOLD 889 +#define MUPI_DIV_QR_THRESHOLD 0 /* always */ +#define MU_BDIV_QR_THRESHOLD 1858 +#define MU_BDIV_Q_THRESHOLD 2172 + +#define MATRIX22_STRASSEN_THRESHOLD 21 +#define HGCD_THRESHOLD 59 +#define HGCD_APPR_THRESHOLD 56 +#define HGCD_REDUCE_THRESHOLD 4818 +#define GCD_DC_THRESHOLD 278 +#define GCDEXT_DC_THRESHOLD 298 +#define JACOBI_BASE_METHOD 4 + +#define 
GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 23 +#define SET_STR_DC_THRESHOLD 438 +#define SET_STR_PRECOMPUTE_THRESHOLD 1206 diff --git a/mpn/x86/k10/gmp-mparam.h b/mpn/x86/k10/gmp-mparam.h new file mode 100644 index 000000000..5c036223c --- /dev/null +++ b/mpn/x86/k10/gmp-mparam.h @@ -0,0 +1,142 @@ +/* x86/k10 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, +2008, 2009, 2010, 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define BYTES_PER_MP_LIMB 4 + +/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 12 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 16 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 32 + +#define MUL_TOOM22_THRESHOLD 24 +#define MUL_TOOM33_THRESHOLD 77 +#define MUL_TOOM44_THRESHOLD 127 +#define MUL_TOOM6H_THRESHOLD 270 +#define MUL_TOOM8H_THRESHOLD 357 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 77 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 99 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 85 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 90 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 32 +#define SQR_TOOM3_THRESHOLD 97 +#define SQR_TOOM4_THRESHOLD 154 +#define SQR_TOOM6_THRESHOLD 336 +#define SQR_TOOM8_THRESHOLD 527 + +#define MULMID_TOOM42_THRESHOLD 54 + +#define MULMOD_BNM1_THRESHOLD 15 +#define SQRMOD_BNM1_THRESHOLD 19 + +#define POWM_SEC_TABLE 4,32,164,879,2178 + +#define MUL_FFT_MODF_THRESHOLD 786 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 786, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 23, 6}, { 47, 7}, { 27, 8}, \ + { 15, 7}, { 31, 6}, { 63, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 51, 9}, { 31, 8}, \ + { 63, 9}, { 39, 8}, { 83, 9}, { 47,10}, \ + { 31, 9}, { 63, 8}, { 127, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \ + { 191,10}, { 111,11}, { 63,10}, { 127, 
9}, \ + { 255, 7}, { 1023, 8}, { 543, 9}, { 279,10}, \ + { 159,11}, { 95,10}, { 191,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543, 8}, { 1087,10}, { 287,11}, { 159, 9}, \ + { 671,11}, { 191,10}, { 399, 9}, { 799,12}, \ + { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 76 +#define MUL_FFT_THRESHOLD 7424 + +#define SQR_FFT_MODF_THRESHOLD 660 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 660, 5}, { 25, 6}, { 13, 5}, { 28, 6}, \ + { 25, 7}, { 13, 6}, { 28, 7}, { 15, 6}, \ + { 31, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 23, 6}, { 47, 7}, { 27, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 23, 7}, { 47, 8}, { 31, 7}, { 63, 8}, \ + { 35, 7}, { 71, 8}, { 39, 9}, { 23, 8}, \ + { 55,10}, { 15, 9}, { 31, 8}, { 63, 9}, \ + { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 167,10}, { 95,11}, { 63,10}, \ + { 159,11}, { 95, 8}, { 799,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 543,11}, { 159, 9}, \ + { 639,10}, { 367,11}, { 191,10}, { 383, 9}, \ + { 799,10}, { 415,11}, { 223,12}, { 4096,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 67 +#define SQR_FFT_THRESHOLD 5760 + +#define MULLO_BASECASE_THRESHOLD 6 +#define MULLO_DC_THRESHOLD 42 +#define MULLO_MUL_N_THRESHOLD 13463 + +#define DC_DIV_QR_THRESHOLD 56 +#define DC_DIVAPPR_Q_THRESHOLD 248 +#define DC_BDIV_QR_THRESHOLD 55 +#define DC_BDIV_Q_THRESHOLD 160 + +#define INV_MULMOD_BNM1_THRESHOLD 54 +#define INV_NEWTON_THRESHOLD 250 +#define INV_APPR_THRESHOLD 250 + +#define BINV_NEWTON_THRESHOLD 276 +#define REDC_1_TO_REDC_N_THRESHOLD 67 + +#define MU_DIV_QR_THRESHOLD 1718 +#define MU_DIVAPPR_Q_THRESHOLD 1652 +#define MUPI_DIV_QR_THRESHOLD 114 +#define MU_BDIV_QR_THRESHOLD 1470 +#define MU_BDIV_Q_THRESHOLD 1589 + +#define MATRIX22_STRASSEN_THRESHOLD 16 +#define HGCD_THRESHOLD 131 +#define HGCD_APPR_THRESHOLD 163 +#define 
HGCD_REDUCE_THRESHOLD 3810 +#define GCD_DC_THRESHOLD 555 +#define GCDEXT_DC_THRESHOLD 389 +#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 28 +#define SET_STR_DC_THRESHOLD 140 +#define SET_STR_PRECOMPUTE_THRESHOLD 1334 diff --git a/mpn/x86/k8/gmp-mparam.h b/mpn/x86/k8/gmp-mparam.h new file mode 100644 index 000000000..727a381f1 --- /dev/null +++ b/mpn/x86/k8/gmp-mparam.h @@ -0,0 +1,144 @@ +/* x86/k8 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, +2008, 2009, 2010, 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define BYTES_PER_MP_LIMB 4 + +/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 3 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 10 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 42 + +#define MUL_TOOM22_THRESHOLD 26 +#define MUL_TOOM33_THRESHOLD 81 +#define MUL_TOOM44_THRESHOLD 136 +#define MUL_TOOM6H_THRESHOLD 286 +#define MUL_TOOM8H_THRESHOLD 430 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 93 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 96 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 46 +#define SQR_TOOM3_THRESHOLD 77 +#define SQR_TOOM4_THRESHOLD 202 +#define SQR_TOOM6_THRESHOLD 294 +#define SQR_TOOM8_THRESHOLD 430 + +#define MULMID_TOOM42_THRESHOLD 74 + +#define MULMOD_BNM1_THRESHOLD 17 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define POWM_SEC_TABLE 2,14,216,991,2658 + +#define MUL_FFT_MODF_THRESHOLD 888 /* k = 6 */ +#define MUL_FFT_TABLE3 \ + { { 888, 6}, { 15, 5}, { 31, 6}, { 25, 7}, \ + { 13, 6}, { 27, 7}, { 15, 6}, { 33, 7}, \ + { 17, 6}, { 35, 7}, { 19, 6}, { 39, 7}, \ + { 23, 6}, { 47, 7}, { 27, 8}, { 15, 7}, \ + { 31, 6}, { 63, 7}, { 35, 8}, { 19, 7}, \ + { 41, 8}, { 23, 7}, { 47, 8}, { 31, 7}, \ + { 63, 8}, { 39, 7}, { 79, 9}, { 23, 8}, \ + { 51, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \ + { 31, 9}, { 63, 8}, { 127, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 167,10}, { 95, 9}, \ + { 191,10}, { 111,11}, { 63,10}, { 127, 9}, \ + { 
255,10}, { 159,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 127,10}, { 271, 9}, { 543,10}, \ + { 287,11}, { 159,10}, { 335,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 399, 9}, { 799,11}, \ + { 223,12}, { 4096,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 73 +#define MUL_FFT_THRESHOLD 7552 + +#define SQR_FFT_MODF_THRESHOLD 758 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 758, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 23, 6}, { 47, 7}, { 27, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 47, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 51, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ + { 95, 9}, { 55,10}, { 31, 9}, { 63, 8}, \ + { 127, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \ + { 159,10}, { 95, 9}, { 191,10}, { 111,11}, \ + { 63,10}, { 127, 9}, { 255,10}, { 159,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \ + { 1087,10}, { 287,11}, { 159,10}, { 319, 9}, \ + { 671,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 399, 9}, { 799,12}, { 4096,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 74 +#define SQR_FFT_THRESHOLD 7296 + +#define MULLO_BASECASE_THRESHOLD 8 +#define MULLO_DC_THRESHOLD 35 +#define MULLO_MUL_N_THRESHOLD 13463 + +#define DC_DIV_QR_THRESHOLD 91 +#define DC_DIVAPPR_Q_THRESHOLD 278 +#define DC_BDIV_QR_THRESHOLD 87 +#define DC_BDIV_Q_THRESHOLD 216 + +#define INV_MULMOD_BNM1_THRESHOLD 62 +#define INV_NEWTON_THRESHOLD 262 +#define INV_APPR_THRESHOLD 262 + +#define BINV_NEWTON_THRESHOLD 278 +#define REDC_1_TO_REDC_N_THRESHOLD 79 + +#define MU_DIV_QR_THRESHOLD 1787 +#define MU_DIVAPPR_Q_THRESHOLD 1718 +#define MUPI_DIV_QR_THRESHOLD 106 +#define MU_BDIV_QR_THRESHOLD 1470 +#define MU_BDIV_Q_THRESHOLD 1589 + +#define MATRIX22_STRASSEN_THRESHOLD 19 +#define HGCD_THRESHOLD 139 +#define 
HGCD_APPR_THRESHOLD 176 +#define HGCD_REDUCE_THRESHOLD 4633 +#define GCD_DC_THRESHOLD 610 +#define GCDEXT_DC_THRESHOLD 419 +#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 29 +#define SET_STR_DC_THRESHOLD 450 +#define SET_STR_PRECOMPUTE_THRESHOLD 1366 diff --git a/mpn/x86/nano/gmp-mparam.h b/mpn/x86/nano/gmp-mparam.h new file mode 100644 index 000000000..5fa509372 --- /dev/null +++ b/mpn/x86/nano/gmp-mparam.h @@ -0,0 +1,152 @@ +/* x86/nano gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, +2008, 2009, 2010, 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define BYTES_PER_MP_LIMB 4 + +/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */ + +#define MOD_1_1P_METHOD 1 +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 3 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 10 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 9 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 53 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 32 + +#define MUL_TOOM22_THRESHOLD 16 +#define MUL_TOOM33_THRESHOLD 132 +#define MUL_TOOM44_THRESHOLD 195 +#define MUL_TOOM6H_THRESHOLD 270 +#define MUL_TOOM8H_THRESHOLD 478 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 129 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 130 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 135 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 28 +#define SQR_TOOM3_THRESHOLD 194 +#define SQR_TOOM4_THRESHOLD 502 +#define SQR_TOOM6_THRESHOLD 746 +#define SQR_TOOM8_THRESHOLD 1005 + +#define MULMID_TOOM42_THRESHOLD 40 + +#define MULMOD_BNM1_THRESHOLD 14 +#define SQRMOD_BNM1_THRESHOLD 19 + +#define POWM_SEC_TABLE 4,23,258,828,2246 + +#define MUL_FFT_MODF_THRESHOLD 308 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 308, 5}, { 13, 6}, { 7, 5}, { 17, 6}, \ + { 9, 5}, { 19, 6}, { 11, 5}, { 23, 6}, \ + { 13, 7}, { 7, 6}, { 17, 7}, { 9, 6}, \ + { 19, 7}, { 11, 6}, { 24, 7}, { 15, 6}, \ + { 31, 7}, { 19, 8}, { 11, 7}, { 25, 8}, \ + { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \ + { 23, 7}, { 47, 9}, { 15, 8}, { 31, 7}, \ + { 63, 8}, { 39, 9}, { 23, 8}, { 47,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 47,10}, \ + { 31, 9}, { 71,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 127, 8}, { 255,10}, \ + { 79, 9}, { 159,10}, { 95, 9}, { 191,11}, \ + { 63,10}, { 127, 9}, { 255, 8}, { 543, 9}, \ + { 
287, 8}, { 575, 7}, { 1215,10}, { 159,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 543, 8}, { 1087,10}, { 287, 9}, \ + { 607, 8}, { 1215,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 351, 9}, { 703, 8}, { 1407, 9}, \ + { 735, 8}, { 1471,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447, 9}, { 895,10}, { 479, 9}, { 959, 8}, \ + { 1919,12}, { 4096,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 89 +#define MUL_FFT_THRESHOLD 1856 + +#define SQR_FFT_MODF_THRESHOLD 396 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 396, 5}, { 13, 6}, { 7, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ + { 25, 7}, { 15, 6}, { 31, 7}, { 19, 6}, \ + { 39, 7}, { 21, 8}, { 11, 7}, { 23, 6}, \ + { 47, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \ + { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \ + { 47,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 127,10}, \ + { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 543,10}, { 143, 9}, \ + { 287, 8}, { 607, 7}, { 1215, 6}, { 2431,10}, \ + { 159, 8}, { 639,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 543, 8}, \ + { 1087,10}, { 287, 9}, { 607, 8}, { 1215,11}, \ + { 159,10}, { 319, 9}, { 671,10}, { 351, 9}, \ + { 703, 8}, { 1407, 9}, { 735, 8}, { 1471, 7}, \ + { 2943,11}, { 191,10}, { 383, 9}, { 799,10}, \ + { 415, 9}, { 895,10}, { 479,12}, { 4096,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 87 +#define SQR_FFT_THRESHOLD 2368 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 51 +#define MULLO_MUL_N_THRESHOLD 3369 + +#define DC_DIV_QR_THRESHOLD 56 +#define DC_DIVAPPR_Q_THRESHOLD 183 +#define DC_BDIV_QR_THRESHOLD 55 +#define DC_BDIV_Q_THRESHOLD 118 + +#define INV_MULMOD_BNM1_THRESHOLD 30 +#define INV_NEWTON_THRESHOLD 266 
+#define INV_APPR_THRESHOLD 218 + +#define BINV_NEWTON_THRESHOLD 268 +#define REDC_1_TO_REDC_N_THRESHOLD 56 + +#define MU_DIV_QR_THRESHOLD 1308 +#define MU_DIVAPPR_Q_THRESHOLD 1528 +#define MUPI_DIV_QR_THRESHOLD 124 +#define MU_BDIV_QR_THRESHOLD 855 +#define MU_BDIV_Q_THRESHOLD 1334 + +#define MATRIX22_STRASSEN_THRESHOLD 14 +#define HGCD_THRESHOLD 104 +#define HGCD_APPR_THRESHOLD 139 +#define HGCD_REDUCE_THRESHOLD 2121 +#define GCD_DC_THRESHOLD 456 +#define GCDEXT_DC_THRESHOLD 321 +#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 25 +#define SET_STR_DC_THRESHOLD 542 +#define SET_STR_PRECOMPUTE_THRESHOLD 840 -- cgit v1.2.1 From 60d13bded47cd6c09d6ce761347a675b91ecead5 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Fri, 25 Nov 2011 23:57:13 +0100 Subject: *** empty log message *** --- ChangeLog | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ChangeLog b/ChangeLog index 761f9161b..5f69c758f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2011-11-25 Torbjorn Granlund + + * x86/*: Many new gmp-mparam.h file for 64-bit CPUs in 32-bit mode. + + * configure.in: Overhaul x86/x86_64 support, merging three case + statements into one. + 2011-11-24 Torbjorn Granlund * doc/gmp.texi (Formatted Output Strings): Clarify rules for mpf_t -- cgit v1.2.1 From 3d1e2a383827ce817e2dca7c2eabf61f8ebf2245 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Sat, 26 Nov 2011 10:56:14 +0100 Subject: Reinsert x86 path components accidentally lost in major edit. 
--- configure.in | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/configure.in b/configure.in index 28df31214..580b3e427 100644 --- a/configure.in +++ b/configure.in @@ -1560,25 +1560,25 @@ case $host in athlon64 | k8 | x86_64) gcc_cflags_cpu="-mtune=k8 -mcpu=athlon -mcpu=pentiumpro -mcpu=i486 -m486" gcc_cflags_arch="-march=k8 -march=k8~-mno-sse2 -march=athlon -march=pentiumpro -march=pentium" - path="x86/k8 x86" + path="x86/k8 x86/k7/mmx x86/k7 x86" path_64="x86_64/k8 x86_64" ;; k10) gcc_cflags_cpu="-mtune=amdfam10 -mtune=k8" gcc_cflags_arch="-march=amdfam10 -march=k8 -march=k8~-mno-sse2" - path="x86/k10 x86/k8 x86" + path="x86/k10 x86/k8 x86/k7/mmx x86/k7 x86" path_64="x86_64/k10 x86_64/k8 x86_64" ;; bobcat) gcc_cflags_cpu="-mtune=btver1 -mtune=amdfam10 -mtune=k8" gcc_cflags_arch="-march=btver1 -march=amdfam10 -march=k8 -march=k8~-mno-sse2" - path="x86/bobcat x86" + path="x86/bobcat x86/k7/mmx x86/k7 x86" path_64="x86_64/bobcat x86_64/k10 x86_64/k8 x86_64" ;; bulldozer | bd1) gcc_cflags_cpu="-mtune=bdver1 -mtune=amdfam10 -mtune=k8" gcc_cflags_arch="-march=bdver1 -march=amdfam10 -march=k8 -march=k8~-mno-sse2" - path="x86/bd1 x86" + path="x86/bd1 x86/k7/mmx x86/k7 x86" path_64="x86_64/bd1 x86_64" ;; core2) -- cgit v1.2.1 From ff7620568f0f51316d73501a764ed229bd0d7923 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Sat, 26 Nov 2011 10:57:47 +0100 Subject: Partially tabify. 
--- configure.in | 176 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 88 insertions(+), 88 deletions(-) diff --git a/configure.in b/configure.in index 580b3e427..728529c13 100644 --- a/configure.in +++ b/configure.in @@ -1466,157 +1466,157 @@ case $host in gcc_cflags_optlist="cpu arch" case $host_cpu in i386*) - gcc_cflags_cpu="-mtune=i386 -mcpu=i386 -m386" - gcc_cflags_arch="-march=i386" + gcc_cflags_cpu="-mtune=i386 -mcpu=i386 -m386" + gcc_cflags_arch="-march=i386" path="x86" - ;; + ;; i486*) - gcc_cflags_cpu="-mtune=i486 -mcpu=i486 -m486" - gcc_cflags_arch="-march=i486" + gcc_cflags_cpu="-mtune=i486 -mcpu=i486 -m486" + gcc_cflags_arch="-march=i486" path="x86/i486 x86" - ;; + ;; i586 | pentium) - gcc_cflags_cpu="-mtune=pentium -mcpu=pentium -m486" - gcc_cflags_arch="-march=pentium" + gcc_cflags_cpu="-mtune=pentium -mcpu=pentium -m486" + gcc_cflags_arch="-march=pentium" path="x86/pentium x86" - ;; + ;; pentiummmx) - gcc_cflags_cpu="-mtune=pentium-mmx -mcpu=pentium-mmx -mcpu=pentium -m486" - gcc_cflags_arch="-march=pentium-mmx -march=pentium" + gcc_cflags_cpu="-mtune=pentium-mmx -mcpu=pentium-mmx -mcpu=pentium -m486" + gcc_cflags_arch="-march=pentium-mmx -march=pentium" path="x86/pentium/mmx x86/pentium x86" - ;; + ;; i686 | pentiumpro) - gcc_cflags_cpu="-mtune=pentiumpro -mcpu=pentiumpro -mcpu=i486 -m486" - gcc_cflags_arch="-march=pentiumpro -march=pentium" + gcc_cflags_cpu="-mtune=pentiumpro -mcpu=pentiumpro -mcpu=i486 -m486" + gcc_cflags_arch="-march=pentiumpro -march=pentium" path="x86/p6 x86" - ;; + ;; pentium2) - gcc_cflags_cpu="-mtune=pentium2 -mcpu=pentium2 -mcpu=pentiumpro -mcpu=i486 -m486" - gcc_cflags_arch="-march=pentium2 -march=pentiumpro -march=pentium" + gcc_cflags_cpu="-mtune=pentium2 -mcpu=pentium2 -mcpu=pentiumpro -mcpu=i486 -m486" + gcc_cflags_arch="-march=pentium2 -march=pentiumpro -march=pentium" path="x86/p6/mmx x86/p6 x86" - ;; + ;; pentium3) - gcc_cflags_cpu="-mtune=pentium3 -mcpu=pentium3 -mcpu=pentiumpro 
-mcpu=i486 -m486" - gcc_cflags_arch="-march=pentium3 -march=pentiumpro -march=pentium" + gcc_cflags_cpu="-mtune=pentium3 -mcpu=pentium3 -mcpu=pentiumpro -mcpu=i486 -m486" + gcc_cflags_arch="-march=pentium3 -march=pentiumpro -march=pentium" path="x86/p6/p3mmx x86/p6/mmx x86/p6 x86" - ;; + ;; pentiumm) - gcc_cflags_cpu="-mtune=pentium3 -mcpu=pentium3 -mcpu=pentiumpro -mcpu=i486 -m486" - gcc_cflags_arch="-march=pentium3 -march=pentiumpro -march=pentium" + gcc_cflags_cpu="-mtune=pentium3 -mcpu=pentium3 -mcpu=pentiumpro -mcpu=i486 -m486" + gcc_cflags_arch="-march=pentium3 -march=pentiumpro -march=pentium" path="x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86" - ;; + ;; k6) - gcc_cflags_cpu="-mtune=k6 -mcpu=k6 -mcpu=i486 -m486" - gcc_cflags_arch="-march=k6" + gcc_cflags_cpu="-mtune=k6 -mcpu=k6 -mcpu=i486 -m486" + gcc_cflags_arch="-march=k6" path="x86/k6/mmx x86/k6 x86" - ;; + ;; k62) - gcc_cflags_cpu="-mtune=k6-2 -mcpu=k6-2 -mcpu=k6 -mcpu=i486 -m486" - gcc_cflags_arch="-march=k6-2 -march=k6" + gcc_cflags_cpu="-mtune=k6-2 -mcpu=k6-2 -mcpu=k6 -mcpu=i486 -m486" + gcc_cflags_arch="-march=k6-2 -march=k6" path="x86/k6/k62mmx x86/k6/mmx x86/k6 x86" - ;; + ;; k63) - gcc_cflags_cpu="-mtune=k6-3 -mcpu=k6-3 -mcpu=k6 -mcpu=i486 -m486" - gcc_cflags_arch="-march=k6-3 -march=k6" + gcc_cflags_cpu="-mtune=k6-3 -mcpu=k6-3 -mcpu=k6 -mcpu=i486 -m486" + gcc_cflags_arch="-march=k6-3 -march=k6" path="x86/k6/k62mmx x86/k6/mmx x86/k6 x86" - ;; + ;; geode) - gcc_cflags_cpu="-mtune=k6-3 -mcpu=k6-3 -mcpu=k6 -mcpu=i486 -m486" - gcc_cflags_arch="-march=k6-3 -march=k6" + gcc_cflags_cpu="-mtune=k6-3 -mcpu=k6-3 -mcpu=k6 -mcpu=i486 -m486" + gcc_cflags_arch="-march=k6-3 -march=k6" path="x86/geode x86/k6/k62mmx x86/k6/mmx x86/k6 x86" - ;; + ;; athlon) - # Athlon instruction costs are close to P6 (3 cycle load latency, - # 4-6 cycle mul, 40 cycle div, pairable adc, etc) so if gcc doesn't - # know athlon (eg. 2.95.2 doesn't) then fall back on pentiumpro. 
- gcc_cflags_cpu="-mtune=athlon -mcpu=athlon -mcpu=pentiumpro -mcpu=i486 -m486" - gcc_cflags_arch="-march=athlon -march=pentiumpro -march=pentium" + # Athlon instruction costs are close to P6 (3 cycle load latency, + # 4-6 cycle mul, 40 cycle div, pairable adc, etc) so if gcc doesn't + # know athlon (eg. 2.95.2 doesn't) then fall back on pentiumpro. + gcc_cflags_cpu="-mtune=athlon -mcpu=athlon -mcpu=pentiumpro -mcpu=i486 -m486" + gcc_cflags_arch="-march=athlon -march=pentiumpro -march=pentium" path="x86/k7/mmx x86/k7 x86" - ;; + ;; i786 | pentium4) - # pentiumpro is the primary fallback when gcc doesn't know pentium4. - # This gets us cmov to eliminate branches. Maybe "athlon" would be - # a possibility on gcc 3.0. - # - gcc_cflags_cpu="-mtune=pentium4 -mcpu=pentium4 -mcpu=pentiumpro -mcpu=i486 -m486" - gcc_cflags_arch="-march=pentium4 -march=pentium4~-mno-sse2 -march=pentiumpro -march=pentium" - gcc_64_cflags_cpu="-mtune=nocona" + # pentiumpro is the primary fallback when gcc doesn't know pentium4. + # This gets us cmov to eliminate branches. Maybe "athlon" would be + # a possibility on gcc 3.0. + # + gcc_cflags_cpu="-mtune=pentium4 -mcpu=pentium4 -mcpu=pentiumpro -mcpu=i486 -m486" + gcc_cflags_arch="-march=pentium4 -march=pentium4~-mno-sse2 -march=pentiumpro -march=pentium" + gcc_64_cflags_cpu="-mtune=nocona" path="x86/pentium4/sse2 x86/pentium4/mmx x86/pentium4 x86" path_64="x86_64/pentium4 x86_64" - ;; + ;; viac32) - # Not sure of the best fallbacks here for -mcpu. - # c3-2 has sse and mmx, so pentium3 is good for -march. - gcc_cflags_cpu="-mtune=c3-2 -mcpu=c3-2 -mcpu=i486 -m486" - gcc_cflags_arch="-march=c3-2 -march=pentium3 -march=pentiumpro -march=pentium" + # Not sure of the best fallbacks here for -mcpu. + # c3-2 has sse and mmx, so pentium3 is good for -march. 
+ gcc_cflags_cpu="-mtune=c3-2 -mcpu=c3-2 -mcpu=i486 -m486" + gcc_cflags_arch="-march=c3-2 -march=pentium3 -march=pentiumpro -march=pentium" path="x86/p6/p3mmx x86/p6/mmx x86/p6 x86" - ;; + ;; viac3*) - # Not sure of the best fallbacks here. - gcc_cflags_cpu="-mtune=c3 -mcpu=c3 -mcpu=i486 -m486" - gcc_cflags_arch="-march=c3 -march=pentium-mmx -march=pentium" + # Not sure of the best fallbacks here. + gcc_cflags_cpu="-mtune=c3 -mcpu=c3 -mcpu=i486 -m486" + gcc_cflags_arch="-march=c3 -march=pentium-mmx -march=pentium" path="x86/pentium/mmx x86/pentium x86" - ;; + ;; athlon64 | k8 | x86_64) - gcc_cflags_cpu="-mtune=k8 -mcpu=athlon -mcpu=pentiumpro -mcpu=i486 -m486" - gcc_cflags_arch="-march=k8 -march=k8~-mno-sse2 -march=athlon -march=pentiumpro -march=pentium" + gcc_cflags_cpu="-mtune=k8 -mcpu=athlon -mcpu=pentiumpro -mcpu=i486 -m486" + gcc_cflags_arch="-march=k8 -march=k8~-mno-sse2 -march=athlon -march=pentiumpro -march=pentium" path="x86/k8 x86/k7/mmx x86/k7 x86" path_64="x86_64/k8 x86_64" - ;; + ;; k10) - gcc_cflags_cpu="-mtune=amdfam10 -mtune=k8" - gcc_cflags_arch="-march=amdfam10 -march=k8 -march=k8~-mno-sse2" + gcc_cflags_cpu="-mtune=amdfam10 -mtune=k8" + gcc_cflags_arch="-march=amdfam10 -march=k8 -march=k8~-mno-sse2" path="x86/k10 x86/k8 x86/k7/mmx x86/k7 x86" path_64="x86_64/k10 x86_64/k8 x86_64" - ;; + ;; bobcat) - gcc_cflags_cpu="-mtune=btver1 -mtune=amdfam10 -mtune=k8" - gcc_cflags_arch="-march=btver1 -march=amdfam10 -march=k8 -march=k8~-mno-sse2" + gcc_cflags_cpu="-mtune=btver1 -mtune=amdfam10 -mtune=k8" + gcc_cflags_arch="-march=btver1 -march=amdfam10 -march=k8 -march=k8~-mno-sse2" path="x86/bobcat x86/k7/mmx x86/k7 x86" path_64="x86_64/bobcat x86_64/k10 x86_64/k8 x86_64" - ;; + ;; bulldozer | bd1) - gcc_cflags_cpu="-mtune=bdver1 -mtune=amdfam10 -mtune=k8" - gcc_cflags_arch="-march=bdver1 -march=amdfam10 -march=k8 -march=k8~-mno-sse2" + gcc_cflags_cpu="-mtune=bdver1 -mtune=amdfam10 -mtune=k8" + gcc_cflags_arch="-march=bdver1 -march=amdfam10 -march=k8 
-march=k8~-mno-sse2" path="x86/bd1 x86/k7/mmx x86/k7 x86" path_64="x86_64/bd1 x86_64" - ;; + ;; core2) - gcc_cflags_cpu="-mtune=core2 -mtune=k8" - gcc_cflags_arch="-march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2" + gcc_cflags_cpu="-mtune=core2 -mtune=k8" + gcc_cflags_arch="-march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2" path="x86/core2 x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86" path_64="x86_64/core2 x86_64" ;; corei | coreinhm | coreiwsm) - gcc_cflags_cpu="-mtune=corei7 -mtune=core2 -mtune=k8" - gcc_cflags_arch="-march=corei7 -march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2" + gcc_cflags_cpu="-mtune=corei7 -mtune=core2 -mtune=k8" + gcc_cflags_arch="-march=corei7 -march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2" path="x86/coreinhm x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86" path_64="x86_64/coreinhm x86_64/core2 x86_64" - ;; + ;; coreisbr) - gcc_cflags_cpu="-mtune=corei7 -mtune=core2 -mtune=k8" - gcc_cflags_arch="-march=corei7 -march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2" + gcc_cflags_cpu="-mtune=corei7 -mtune=core2 -mtune=k8" + gcc_cflags_arch="-march=corei7 -march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2" path="x86/coreisbr x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86" path_64="x86_64/coreisbr x86_64/coreinhm x86_64/core2 x86_64" - ;; + ;; atom) - gcc_cflags_cpu="-mtune=atom -mtune=pentium3" - gcc_cflags_arch="-march=atom -march=pentium3" + gcc_cflags_cpu="-mtune=atom -mtune=pentium3" + gcc_cflags_arch="-march=atom -march=pentium3" path="x86/atom/sse2 x86/atom/mmx x86/atom x86" path_64="x86_64/atom x86_64" - ;; + ;; nano) - gcc_cflags_cpu="-mtune=nano" - gcc_cflags_arch="-march=nano" + gcc_cflags_cpu="-mtune=nano" + gcc_cflags_arch="-march=nano" path="x86/nano x86" path_64="x86_64/nano x86_64" - ;; + ;; *) - gcc_cflags_cpu="-mtune=i486 -mcpu=i486 -m486" - gcc_cflags_arch="-march=i486" + gcc_cflags_cpu="-mtune=i486 -mcpu=i486 -m486" + 
gcc_cflags_arch="-march=i486" path="x86" path_64="x86_64" - ;; + ;; esac case $host in -- cgit v1.2.1 From 84e1db5df779cff97b211a773ae2577abb99fa32 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Mon, 28 Nov 2011 16:35:07 +0100 Subject: Use CNST_LIMB for some constants. --- mpn/generic/udiv_w_sdiv.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mpn/generic/udiv_w_sdiv.c b/mpn/generic/udiv_w_sdiv.c index c01f95847..ceefd1b5f 100644 --- a/mpn/generic/udiv_w_sdiv.c +++ b/mpn/generic/udiv_w_sdiv.c @@ -9,7 +9,7 @@ GNU MP RELEASE. -Copyright 1992, 1994, 1996, 2000 Free Software Foundation, Inc. +Copyright 1992, 1994, 1996, 2000, 2011 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -116,12 +116,12 @@ mpn_udiv_w_sdiv (rp, a1, a0, d) { /* Hence a1 = d - 1 = 2*b1 - 1 */ if (a0 >= -d) { - q = -1; + q = -CNST_LIMB(1); r = a0 + d; } else { - q = -2; + q = -CNST_LIMB(2); r = a0 + 2*d; } } -- cgit v1.2.1 From a18078c77e2cc3fc4919acb9bf18a330b3268366 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Mon, 28 Nov 2011 18:53:54 +0100 Subject: Changes to support non-standard ABIs in a coherent way. --- configure.in | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/configure.in b/configure.in index 728529c13..c9f023bbf 100644 --- a/configure.in +++ b/configure.in @@ -1640,10 +1640,9 @@ case $host in ;; *-*-mingw* | *-*-cygwin) limb_64=longlong - path_64="" # Windows amd64 calling conventions are *different* - extra_functions_64="" - # Silence many pedantic warnings for w64. FIXME. 
- gcc_64_cflags="$gcc_cflags -std=gnu99" + extra_functions_64=""# FIXME: remove when invert_limb is ported + AC_DEFINE(HOST_DOS64,1,[Define to 1 for Windos/64]) + AC_DEFINE(GMP_NONSTD_ABI,1,[Define to 1 if ABI is non-standard]) ;; esac ;; @@ -3061,6 +3060,17 @@ for tmp_fn in $gmp_mpn_functions; do esac fi + # If the host uses a non-standard ABI, check if tmp_file supports it + # + if test -n "$GMP_NONSTD_ABI" && test $tmp_dir != generic; then + abi=[`sed -n 's/^[ ]*ABI_SUPPORT(\(.*\))/\1/p' $tmp_file `] + if echo "$abi" | grep -q "\\b${GMP_NONSTD_ABI}\\b"; then + true + else + continue + fi + fi + found=yes eval found_$tmp_ext=yes @@ -3344,6 +3354,8 @@ if test "$gmp_asm_syntax_testing" != no; then case $host in *-*-darwin*) GMP_INCLUDE_MPN(x86_64/darwin.m4) ;; + *-*-mingw* | *-*-cygwin) + GMP_INCLUDE_MPN(x86_64/dos64.m4) ;; esac ;; esac -- cgit v1.2.1 From 4fc5b919efe2d88b2ba5f8066ebac6ae6b326019 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Mon, 28 Nov 2011 18:54:52 +0100 Subject: (ABI_SUPPORT): New dummy macro. --- mpn/asm-defs.m4 | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/mpn/asm-defs.m4 b/mpn/asm-defs.m4 index 7a5639fbe..b95cad7c0 100644 --- a/mpn/asm-defs.m4 +++ b/mpn/asm-defs.m4 @@ -1713,6 +1713,22 @@ m4_assert_numargs(1) ) +dnl Usage: ABI_SUPPORT(abi) +dnl +dnl A dummy macro which is grepped for by ./configure to know what ABIs +dnl are supported in an asm file. +dnl +dnl If multiple non-standard ABIs are supported, several ABI_SUPPORT +dnl declarations should be used: +dnl +dnl ABI_SUPPORT(FOOABI) +dnl ABI_SUPPORT(BARABI) + +define(ABI_SUPPORT, +m4_assert_numargs(1) +) + + dnl Usage: GMP_NUMB_MASK dnl dnl A bit mask for the number part of a limb. Eg. with 6 bit nails in a -- cgit v1.2.1 From 5db131d3587a358bb093d29a90ff922828a4e1bd Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Mon, 28 Nov 2011 23:06:35 +0100 Subject: More DOS64 configure changes. 
--- configure.in | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/configure.in b/configure.in index c9f023bbf..5b7cf188a 100644 --- a/configure.in +++ b/configure.in @@ -1640,9 +1640,10 @@ case $host in ;; *-*-mingw* | *-*-cygwin) limb_64=longlong - extra_functions_64=""# FIXME: remove when invert_limb is ported + extra_functions_64="" # FIXME: remove when invert_limb is ported + CALLING_CONVENTIONS_OBJS_64="" AC_DEFINE(HOST_DOS64,1,[Define to 1 for Windos/64]) - AC_DEFINE(GMP_NONSTD_ABI,1,[Define to 1 if ABI is non-standard]) + AC_SUBST(GMP_NONSTD_ABI,DOS64) ;; esac ;; -- cgit v1.2.1 From 3ef9c90709d4a4e1a6a5052cea8e17c9feb76728 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Mon, 28 Nov 2011 23:06:51 +0100 Subject: New file. --- mpn/x86_64/dos64.m4 | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 mpn/x86_64/dos64.m4 diff --git a/mpn/x86_64/dos64.m4 b/mpn/x86_64/dos64.m4 new file mode 100644 index 000000000..ef60834ec --- /dev/null +++ b/mpn/x86_64/dos64.m4 @@ -0,0 +1,39 @@ +divert(-1) +dnl Copyright 2011 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
+ +define(`HOST_DOS64') + +define(`JUMPTABSECT', `.section .rdata,"dr"') + +define(`DOS64_ENTRY', + `push %rdi + push %rsi + mov %rcx, %rdi +ifelse(eval($1>=2),1,`dnl + mov %rdx, %rsi +ifelse(eval($1>=3),1,`dnl + mov %r8, %rdx +ifelse(eval($1>=4),1,`dnl + mov %r9, %rcx +')')')') + +define(`DOS64_EXIT', + `pop %rsi + pop %rdi') + +divert`'dnl -- cgit v1.2.1 From d220c40fa205a350318faf2257a87480fce910df Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Mon, 28 Nov 2011 23:08:16 +0100 Subject: (DOS64_ENTRY, DOS64_EXIT): New, empty defs. --- mpn/x86_64/x86_64-defs.m4 | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mpn/x86_64/x86_64-defs.m4 b/mpn/x86_64/x86_64-defs.m4 index 6942a7882..79d7b3cf2 100644 --- a/mpn/x86_64/x86_64-defs.m4 +++ b/mpn/x86_64/x86_64-defs.m4 @@ -2,8 +2,8 @@ divert(-1) dnl m4 macros for amd64 assembler. -dnl Copyright 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009 Free -dnl Software Foundation, Inc. +dnl Copyright 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009, 2011 +dnl Free Software Foundation, Inc. dnl dnl This file is part of the GNU MP Library. dnl @@ -169,4 +169,7 @@ ifdef(`PIC', define(`JUMPTABSECT', `.section .data.rel.ro.local,"aw",@progbits') +define(`DOS64_ENTRY',`') +define(`DOS64_EXIT',`') + divert`'dnl -- cgit v1.2.1 From dc3473c8e0d2c4e84a698901cc82327b7df24286 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Mon, 28 Nov 2011 23:09:54 +0100 Subject: Retune. --- mpn/x86_64/gmp-mparam.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mpn/x86_64/gmp-mparam.h b/mpn/x86_64/gmp-mparam.h index b16ff5a6b..aca6853f0 100644 --- a/mpn/x86_64/gmp-mparam.h +++ b/mpn/x86_64/gmp-mparam.h @@ -192,7 +192,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MATRIX22_STRASSEN_THRESHOLD 16 #define HGCD_THRESHOLD 125 -#define HGCD_APPR_THRESHOLD 50 +#define HGCD_APPR_THRESHOLD 173 #define HGCD_REDUCE_THRESHOLD 3524 #define GCD_DC_THRESHOLD 555 #define GCDEXT_DC_THRESHOLD 478 -- cgit v1.2.1 From d4f5eddea43f682b380e85a9db69b4e8fd8ea54a Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Mon, 28 Nov 2011 23:11:55 +0100 Subject: Retune. --- mpn/ia64/gmp-mparam.h | 16 +++--- mpn/s390_32/esame/gmp-mparam.h | 86 ++++++++++++++++--------------- mpn/s390_64/gmp-mparam.h | 8 +-- mpn/x86/bobcat/gmp-mparam.h | 113 +++++++++++++++++++++-------------------- 4 files changed, 114 insertions(+), 109 deletions(-) diff --git a/mpn/ia64/gmp-mparam.h b/mpn/ia64/gmp-mparam.h index f080b876e..77e02f518 100644 --- a/mpn/ia64/gmp-mparam.h +++ b/mpn/ia64/gmp-mparam.h @@ -26,10 +26,10 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MOD_1_1P_METHOD 2 #define MOD_1_NORM_THRESHOLD 0 /* always */ #define MOD_1_UNNORM_THRESHOLD 0 /* always */ -#define MOD_1N_TO_MOD_1_1_THRESHOLD 3 -#define MOD_1U_TO_MOD_1_1_THRESHOLD 8 -#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ -#define MOD_1_2_TO_MOD_1_4_THRESHOLD 21 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 26 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ #define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10 #define USE_PREINV_DIVREM_1 1 /* native */ #define DIV_QR_2_PI2_THRESHOLD 12 @@ -54,6 +54,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define SQR_TOOM6_THRESHOLD 0 /* always */ #define SQR_TOOM8_THRESHOLD 0 /* always */ +#define MULMID_TOOM42_THRESHOLD 98 + #define MULMOD_BNM1_THRESHOLD 23 #define SQRMOD_BNM1_THRESHOLD 28 @@ -171,9 +173,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MATRIX22_STRASSEN_THRESHOLD 23 #define HGCD_THRESHOLD 117 -#define HGCD_APPR_THRESHOLD 50 -#define HGCD_REDUCE_THRESHOLD 3389 -#define GCD_DC_THRESHOLD 496 +#define HGCD_APPR_THRESHOLD 111 +#define HGCD_REDUCE_THRESHOLD 3014 +#define GCD_DC_THRESHOLD 555 #define GCDEXT_DC_THRESHOLD 368 #define JACOBI_BASE_METHOD 4 diff --git a/mpn/s390_32/esame/gmp-mparam.h b/mpn/s390_32/esame/gmp-mparam.h index 5dedeeb81..a6508be1a 100644 --- a/mpn/s390_32/esame/gmp-mparam.h +++ b/mpn/s390_32/esame/gmp-mparam.h @@ -24,43 +24,45 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ /* 1200 MHz IBM z990 running in 32-bit mode */ #define DIVREM_1_NORM_THRESHOLD 0 /* always */ -#define DIVREM_1_UNNORM_THRESHOLD 3 -#define MOD_1_1P_METHOD 1 +#define DIVREM_1_UNNORM_THRESHOLD 4 +#define MOD_1_1P_METHOD 2 #define MOD_1_NORM_THRESHOLD 0 /* always */ #define MOD_1_UNNORM_THRESHOLD 3 -#define MOD_1N_TO_MOD_1_1_THRESHOLD 12 -#define MOD_1U_TO_MOD_1_1_THRESHOLD 7 -#define MOD_1_1_TO_MOD_1_2_THRESHOLD 15 -#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 21 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 17 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 34 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 42 #define USE_PREINV_DIVREM_1 1 #define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always */ -#define BMOD_1_TO_MOD_1_THRESHOLD 50 +#define BMOD_1_TO_MOD_1_THRESHOLD 30 #define MUL_TOOM22_THRESHOLD 16 -#define MUL_TOOM33_THRESHOLD 66 -#define MUL_TOOM44_THRESHOLD 169 -#define MUL_TOOM6H_THRESHOLD 369 -#define MUL_TOOM8H_THRESHOLD 517 +#define MUL_TOOM33_THRESHOLD 57 +#define MUL_TOOM44_THRESHOLD 147 +#define MUL_TOOM6H_THRESHOLD 226 +#define MUL_TOOM8H_THRESHOLD 333 -#define MUL_TOOM32_TO_TOOM43_THRESHOLD 106 -#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114 -#define 
MUL_TOOM42_TO_TOOM53_THRESHOLD 114 -#define MUL_TOOM42_TO_TOOM63_THRESHOLD 187 +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 65 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 100 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 102 #define SQR_BASECASE_THRESHOLD 0 /* always (native) */ -#define SQR_TOOM2_THRESHOLD 28 -#define SQR_TOOM3_THRESHOLD 93 -#define SQR_TOOM4_THRESHOLD 387 -#define SQR_TOOM6_THRESHOLD 552 -#define SQR_TOOM8_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 26 +#define SQR_TOOM3_THRESHOLD 81 +#define SQR_TOOM4_THRESHOLD 154 +#define SQR_TOOM6_THRESHOLD 318 +#define SQR_TOOM8_THRESHOLD 478 #define MULMID_TOOM42_THRESHOLD 38 #define MULMOD_BNM1_THRESHOLD 13 #define SQRMOD_BNM1_THRESHOLD 15 +#define POWM_SEC_TABLE 4,23,262,892,2500 + #define MUL_FFT_MODF_THRESHOLD 336 /* k = 5 */ #define MUL_FFT_TABLE3 \ { { 336, 5}, { 19, 6}, { 11, 5}, { 23, 6}, \ @@ -91,37 +93,37 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define SQR_FFT_TABLE3_SIZE 35 #define SQR_FFT_THRESHOLD 2368 -#define MULLO_BASECASE_THRESHOLD 6 -#define MULLO_DC_THRESHOLD 45 +#define MULLO_BASECASE_THRESHOLD 5 +#define MULLO_DC_THRESHOLD 49 #define MULLO_MUL_N_THRESHOLD 5397 -#define DC_DIV_QR_THRESHOLD 40 -#define DC_DIVAPPR_Q_THRESHOLD 152 +#define DC_DIV_QR_THRESHOLD 42 +#define DC_DIVAPPR_Q_THRESHOLD 146 #define DC_BDIV_QR_THRESHOLD 51 -#define DC_BDIV_Q_THRESHOLD 136 +#define DC_BDIV_Q_THRESHOLD 124 #define INV_MULMOD_BNM1_THRESHOLD 46 -#define INV_NEWTON_THRESHOLD 197 -#define INV_APPR_THRESHOLD 157 +#define INV_NEWTON_THRESHOLD 179 +#define INV_APPR_THRESHOLD 153 -#define BINV_NEWTON_THRESHOLD 114 +#define BINV_NEWTON_THRESHOLD 214 #define REDC_1_TO_REDC_N_THRESHOLD 55 -#define MU_DIV_QR_THRESHOLD 1210 -#define MU_DIVAPPR_Q_THRESHOLD 1334 -#define MUPI_DIV_QR_THRESHOLD 81 -#define MU_BDIV_QR_THRESHOLD 942 -#define MU_BDIV_Q_THRESHOLD 1258 - -#define MATRIX22_STRASSEN_THRESHOLD 17 -#define HGCD_THRESHOLD 104 -#define 
GCD_DC_THRESHOLD 278 +#define MU_DIV_QR_THRESHOLD 1078 +#define MU_DIVAPPR_Q_THRESHOLD 1078 +#define MUPI_DIV_QR_THRESHOLD 74 +#define MU_BDIV_QR_THRESHOLD 872 +#define MU_BDIV_Q_THRESHOLD 1078 + +#define MATRIX22_STRASSEN_THRESHOLD 14 +#define HGCD_THRESHOLD 90 +#define HGCD_APPR_THRESHOLD 111 +#define HGCD_REDUCE_THRESHOLD 1962 +#define GCD_DC_THRESHOLD 225 #define GCDEXT_DC_THRESHOLD 217 #define JACOBI_BASE_METHOD 2 -#define GET_STR_DC_THRESHOLD 16 -#define GET_STR_PRECOMPUTE_THRESHOLD 30 +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 27 #define SET_STR_DC_THRESHOLD 274 #define SET_STR_PRECOMPUTE_THRESHOLD 824 - -/* Tuneup completed successfully, took 108 seconds */ diff --git a/mpn/s390_64/gmp-mparam.h b/mpn/s390_64/gmp-mparam.h index 46ca86726..c0ade71c2 100644 --- a/mpn/s390_64/gmp-mparam.h +++ b/mpn/s390_64/gmp-mparam.h @@ -28,19 +28,19 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MOD_1_1P_METHOD 2 #define MOD_1_NORM_THRESHOLD 0 /* always */ #define MOD_1_UNNORM_THRESHOLD 0 /* always */ -#define MOD_1N_TO_MOD_1_1_THRESHOLD 9 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 8 #define MOD_1U_TO_MOD_1_1_THRESHOLD 5 -#define MOD_1_1_TO_MOD_1_2_THRESHOLD 58 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 38 #define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 #define PREINV_MOD_1_TO_MOD_1_THRESHOLD 19 #define USE_PREINV_DIVREM_1 1 #define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always */ -#define BMOD_1_TO_MOD_1_THRESHOLD 47 +#define BMOD_1_TO_MOD_1_THRESHOLD 88 #define MUL_TOOM22_THRESHOLD 10 #define MUL_TOOM33_THRESHOLD 41 -#define MUL_TOOM44_THRESHOLD 99 +#define MUL_TOOM44_THRESHOLD 104 #define MUL_TOOM6H_THRESHOLD 149 #define MUL_TOOM8H_THRESHOLD 212 diff --git a/mpn/x86/bobcat/gmp-mparam.h b/mpn/x86/bobcat/gmp-mparam.h index 58dfee1cf..e14ba39f5 100644 --- a/mpn/x86/bobcat/gmp-mparam.h +++ b/mpn/x86/bobcat/gmp-mparam.h @@ -25,30 +25,30 @@ along with the GNU MP Library. 
If not, see http://www.gnu.org/licenses/. */ #define MOD_1_NORM_THRESHOLD 0 /* always */ #define MOD_1_UNNORM_THRESHOLD 0 /* always */ -#define MOD_1N_TO_MOD_1_1_THRESHOLD 12 -#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 -#define MOD_1_1_TO_MOD_1_2_THRESHOLD 18 -#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 9 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 23 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13 #define USE_PREINV_DIVREM_1 1 /* native */ #define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always (native) */ -#define BMOD_1_TO_MOD_1_THRESHOLD 40 +#define BMOD_1_TO_MOD_1_THRESHOLD 42 #define MUL_TOOM22_THRESHOLD 28 -#define MUL_TOOM33_THRESHOLD 85 +#define MUL_TOOM33_THRESHOLD 90 #define MUL_TOOM44_THRESHOLD 147 -#define MUL_TOOM6H_THRESHOLD 270 +#define MUL_TOOM6H_THRESHOLD 274 #define MUL_TOOM8H_THRESHOLD 454 -#define MUL_TOOM32_TO_TOOM43_THRESHOLD 93 -#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107 -#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97 -#define MUL_TOOM42_TO_TOOM63_THRESHOLD 111 +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 93 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113 #define SQR_BASECASE_THRESHOLD 0 /* always (native) */ #define SQR_TOOM2_THRESHOLD 38 -#define SQR_TOOM3_THRESHOLD 101 +#define SQR_TOOM3_THRESHOLD 89 #define SQR_TOOM4_THRESHOLD 220 #define SQR_TOOM6_THRESHOLD 303 #define SQR_TOOM8_THRESHOLD 454 @@ -58,84 +58,85 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MULMOD_BNM1_THRESHOLD 19 #define SQRMOD_BNM1_THRESHOLD 23 -#define POWM_SEC_TABLE 2,17,225,357,2212 +#define POWM_SEC_TABLE 4,14,290,357,2178 #define MUL_FFT_MODF_THRESHOLD 888 /* k = 6 */ #define MUL_FFT_TABLE3 \ - { { 888, 6}, { 27, 7}, { 15, 6}, { 33, 7}, \ - { 17, 6}, { 35, 7}, { 19, 6}, { 39, 7}, \ - { 23, 6}, { 47, 7}, { 27, 8}, { 15, 7}, \ - { 31, 6}, { 63, 7}, { 35, 8}, { 19, 7}, \ - { 41, 8}, { 23, 7}, { 49, 8}, { 31, 7}, \ - { 63, 8}, { 39, 7}, { 79, 9}, { 23, 8}, \ - { 51, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ - { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \ - { 31, 9}, { 63, 8}, { 127, 9}, { 79,10}, \ - { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ - { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \ - { 191,11}, { 63,10}, { 127, 9}, { 255,10}, \ - { 159,11}, { 95,10}, { 191,12}, { 63,11}, \ - { 127,10}, { 271,11}, { 159,10}, { 319, 9}, \ - { 639,10}, { 335,11}, { 191,10}, { 383, 9}, \ - { 767,11}, { 223,12}, { 4096,13}, { 8192,14}, \ + { { 888, 6}, { 25, 7}, { 13, 6}, { 27, 7}, \ + { 15, 6}, { 33, 7}, { 17, 6}, { 35, 7}, \ + { 19, 6}, { 39, 7}, { 23, 6}, { 47, 7}, \ + { 27, 8}, { 15, 7}, { 31, 6}, { 63, 7}, \ + { 35, 8}, { 19, 7}, { 41, 8}, { 23, 7}, \ + { 49, 8}, { 31, 7}, { 63, 8}, { 39, 7}, \ + { 79, 8}, { 43, 9}, { 23, 8}, { 51, 9}, \ + { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ + { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \ + { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 159,10}, { 95, 9}, { 191,11}, \ + { 63,10}, { 127, 9}, { 255,10}, { 159,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,11}, \ + { 159,10}, { 319, 9}, { 671,11}, { 191,10}, \ + { 383, 9}, { 767,12}, { 4096,13}, { 8192,14}, \ { 16384,15}, { 32768,16} } -#define MUL_FFT_TABLE3_SIZE 66 +#define MUL_FFT_TABLE3_SIZE 70 #define MUL_FFT_THRESHOLD 7552 -#define SQR_FFT_MODF_THRESHOLD 730 /* k = 5 */ +#define SQR_FFT_MODF_THRESHOLD 723 /* k = 5 */ #define SQR_FFT_TABLE3 \ - { { 730, 5}, { 25, 6}, { 
13, 5}, { 28, 6}, \ + { { 723, 5}, { 25, 6}, { 13, 5}, { 28, 6}, \ { 15, 5}, { 31, 6}, { 27, 7}, { 15, 6}, \ { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ { 39, 7}, { 23, 6}, { 47, 7}, { 27, 8}, \ { 15, 7}, { 31, 6}, { 63, 7}, { 35, 8}, \ - { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \ - { 31, 7}, { 63, 8}, { 39, 7}, { 79, 8}, \ - { 43, 9}, { 23, 8}, { 51, 9}, { 31, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \ + { 47, 7}, { 95, 8}, { 51, 9}, { 31, 8}, \ { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ { 95,10}, { 31, 9}, { 63, 8}, { 127, 9}, \ { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \ { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \ { 95, 9}, { 191,11}, { 63,10}, { 127, 9}, \ { 255,10}, { 159,11}, { 95,10}, { 191,12}, \ - { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ - { 271,11}, { 159,10}, { 319, 9}, { 671,11}, \ - { 191, 9}, { 767,12}, { 4096,13}, { 8192,14}, \ - { 16384,15}, { 32768,16} } -#define SQR_FFT_TABLE3_SIZE 70 -#define SQR_FFT_THRESHOLD 7296 + { 63,11}, { 127,10}, { 255, 9}, { 543,11}, \ + { 159, 9}, { 671,11}, { 191,10}, { 383, 9}, \ + { 799,12}, { 4096,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 69 +#define SQR_FFT_THRESHOLD 5760 #define MULLO_BASECASE_THRESHOLD 5 #define MULLO_DC_THRESHOLD 45 #define MULLO_MUL_N_THRESHOLD 13463 -#define DC_DIV_QR_THRESHOLD 72 -#define DC_DIVAPPR_Q_THRESHOLD 214 +#define DC_DIV_QR_THRESHOLD 75 +#define DC_DIVAPPR_Q_THRESHOLD 216 #define DC_BDIV_QR_THRESHOLD 67 -#define DC_BDIV_Q_THRESHOLD 142 +#define DC_BDIV_Q_THRESHOLD 143 -#define INV_MULMOD_BNM1_THRESHOLD 71 -#define INV_NEWTON_THRESHOLD 250 +#define INV_MULMOD_BNM1_THRESHOLD 75 +#define INV_NEWTON_THRESHOLD 244 #define INV_APPR_THRESHOLD 228 -#define BINV_NEWTON_THRESHOLD 270 +#define BINV_NEWTON_THRESHOLD 276 #define REDC_1_TO_REDC_N_THRESHOLD 71 -#define MU_DIV_QR_THRESHOLD 2089 +#define MU_DIV_QR_THRESHOLD 1858 #define MU_DIVAPPR_Q_THRESHOLD 1822 #define MUPI_DIV_QR_THRESHOLD 122 #define 
MU_BDIV_QR_THRESHOLD 1787 #define MU_BDIV_Q_THRESHOLD 1787 -#define MATRIX22_STRASSEN_THRESHOLD 21 -#define HGCD_THRESHOLD 81 -#define HGCD_APPR_THRESHOLD 128 -#define HGCD_REDUCE_THRESHOLD 4455 -#define GCD_DC_THRESHOLD 465 +#define MATRIX22_STRASSEN_THRESHOLD 19 +#define HGCD_THRESHOLD 78 +#define HGCD_APPR_THRESHOLD 55 +#define HGCD_REDUCE_THRESHOLD 4633 +#define GCD_DC_THRESHOLD 474 #define GCDEXT_DC_THRESHOLD 345 #define JACOBI_BASE_METHOD 4 -#define GET_STR_DC_THRESHOLD 11 -#define GET_STR_PRECOMPUTE_THRESHOLD 32 +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 31 #define SET_STR_DC_THRESHOLD 270 #define SET_STR_PRECOMPUTE_THRESHOLD 812 -- cgit v1.2.1 From 68afbfbde8fb3e1bc9bb31d53ce5d81f438262a1 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Mon, 28 Nov 2011 23:13:37 +0100 Subject: Support ABI DOS64. --- mpn/x86_64/aorsmul_1.asm | 51 ++++++++++++++++++++++++++++++----------- mpn/x86_64/mul_1.asm | 55 ++++++++++++++++++++++++++++++++++----------- mpn/x86_64/mul_basecase.asm | 14 ++++++++++++ mpn/x86_64/sqr_basecase.asm | 17 +++++++++++++- 4 files changed, 110 insertions(+), 27 deletions(-) diff --git a/mpn/x86_64/aorsmul_1.asm b/mpn/x86_64/aorsmul_1.asm index 9c64d56fc..a406bc9e8 100644 --- a/mpn/x86_64/aorsmul_1.asm +++ b/mpn/x86_64/aorsmul_1.asm @@ -1,6 +1,6 @@ dnl AMD64 mpn_addmul_1 and mpn_submul_1. -dnl Copyright 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc. +dnl Copyright 2003, 2004, 2005, 2007, 2008, 2011 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -28,20 +28,27 @@ C Intel corei ? C Intel atom 21.3 C VIA nano 5.5 -C The inner loop of this code is the result of running a code generation and +C The loop of this code is the result of running a code generation and C optimization tool suite written by David Harvey and Torbjorn Granlund. -C TODO: -C * The inner loop is great, but the prologue and epilogue code was -C quickly written. Tune it! 
+C TODO +C * The loop is great, but the prologue and epilogue code was quickly written. +C Tune it! -C INPUT PARAMETERS -define(`rp', `%rdi') -define(`up', `%rsi') -define(`n_param',`%rdx') -define(`vl', `%rcx') +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vl', `%rcx') C r9 -define(`n', `%r11') +define(`n', `%r11') + +ifdef(`HOST_DOS64',` + define(`IFDOS', `$1') + define(`IFELF', `') +',` + define(`IFDOS', `') + define(`IFELF', `$1') +') ifdef(`OPERATION_addmul_1',` define(`ADDSUB', `add') @@ -52,17 +59,33 @@ ifdef(`OPERATION_submul_1',` define(`func', `mpn_submul_1') ') +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) +IFDOS(` define(`up', ``%rsi'') ') dnl +IFDOS(` define(`rp', ``%rcx'') ') dnl +IFDOS(` define(`vl', ``%r9'') ') dnl +IFDOS(` define(`r9', ``rdi'') ') dnl +IFDOS(` define(`n', ``%r8'') ') dnl +IFDOS(` define(`r8', ``r11'') ') dnl + ASM_START() TEXT ALIGN(16) PROLOGUE(func) + +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') + mov (up), %rax C read first u limb early push %rbx - mov n_param, %rbx C move away n from rdx, mul uses it +IFELF(` mov n_param, %rbx ') C move away n from rdx, mul uses it +IFDOS(` mov n, %rbx ') mul vl - mov %rbx, n +IFELF(` mov %rbx, n ') and $3, R32(%rbx) jz L(b0) @@ -145,5 +168,7 @@ L(ret): adc $0, %rdx mov %rdx, %rax pop %rbx +IFDOS(``pop %rdi '') +IFDOS(``pop %rsi '') ret EPILOGUE() diff --git a/mpn/x86_64/mul_1.asm b/mpn/x86_64/mul_1.asm index 5f8dc4c9c..3b87bbf01 100644 --- a/mpn/x86_64/mul_1.asm +++ b/mpn/x86_64/mul_1.asm @@ -28,38 +28,65 @@ C Intel corei 3.8 C Intel atom 19.8 C VIA nano ? -C The inner loop of this code is the result of running a code generation and +C The loop of this code is the result of running a code generation and C optimization tool suite written by David Harvey and Torbjorn Granlund. -C TODO: -C * The inner loop is great, but the prologue and epilogue code was -C quickly written. 
Tune it! +C TODO +C * The loop is great, but the prologue and epilogue code was quickly written. +C Tune it! -C INPUT PARAMETERS -define(`rp', `%rdi') -define(`up', `%rsi') -define(`n_param',`%rdx') -define(`vl', `%rcx') +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vl', `%rcx') C r9 -define(`n', `%r11') +define(`n', `%r11') + +ifdef(`HOST_DOS64',` + define(`IFDOS', `$1') + define(`IFELF', `') +',` + define(`IFDOS', `') + define(`IFELF', `$1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + +IFDOS(` define(`up', ``%rsi'') ') dnl +IFDOS(` define(`rp', ``%rcx'') ') dnl +IFDOS(` define(`vl', ``%r9'') ') dnl +IFDOS(` define(`r9', ``rdi'') ') dnl +IFDOS(` define(`n', ``%r8'') ') dnl +IFDOS(` define(`r8', ``r11'') ') dnl ASM_START() TEXT ALIGN(16) PROLOGUE(mpn_mul_1c) +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') push %rbx - mov %r8, %r10 +IFELF(` mov %r8, %r10') +IFDOS(` mov 64(%rsp), %r10') C 40 + 3*8 (3 push insns) jmp L(common) EPILOGUE() PROLOGUE(mpn_mul_1) + +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') + push %rbx xor %r10, %r10 L(common): mov (up), %rax C read first u limb early - mov n_param, %rbx C move away n from rdx, mul uses it +IFELF(` mov n_param, %rbx ') C move away n from rdx, mul uses it +IFDOS(` mov n, %rbx ') mul vl - mov %rbx, %r11 +IFELF(` mov %rbx, n ') add %r10, %rax adc $0, %rdx @@ -145,5 +172,7 @@ L(L2): mul vl L(ret): mov %rdx, %rax pop %rbx +IFDOS(``pop %rdi '') +IFDOS(``pop %rsi '') ret EPILOGUE() diff --git a/mpn/x86_64/mul_basecase.asm b/mpn/x86_64/mul_basecase.asm index fdba9a6e3..5fede9234 100644 --- a/mpn/x86_64/mul_basecase.asm +++ b/mpn/x86_64/mul_basecase.asm @@ -59,10 +59,23 @@ define(`n', `%r11') define(`outer_addr', `%r14') define(`un', `%r13') +ifdef(`HOST_DOS64',` + define(`IFDOS', `$1') + define(`IFELF', `') +',` + define(`IFDOS', `') + define(`IFELF', `$1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT 
ALIGN(16) PROLOGUE(mpn_mul_basecase) + DOS64_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') push %rbx push %rbp push %r12 @@ -448,6 +461,7 @@ L(ret): pop %r15 pop %r12 pop %rbp pop %rbx + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/sqr_basecase.asm b/mpn/x86_64/sqr_basecase.asm index 311daab8a..f71627ab9 100644 --- a/mpn/x86_64/sqr_basecase.asm +++ b/mpn/x86_64/sqr_basecase.asm @@ -75,12 +75,22 @@ define(`w1', `%rcx') define(`w2', `%rbp') define(`w3', `%r10') +ifdef(`HOST_DOS64',` + define(`IFDOS', `$1') + define(`IFELF', `') +',` + define(`IFDOS', `') + define(`IFELF', `$1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) ASM_START() TEXT ALIGN(16) - PROLOGUE(mpn_sqr_basecase) + DOS64_ENTRY(3) add $-40, %rsp mov %rbx, 32(%rsp) mov %rbp, 24(%rsp) @@ -115,6 +125,7 @@ L(1): mov (up), %rax mov %rdx, 8(rp) add $32, %rsp pop %rbx + DOS64_EXIT() ret L(2): mov (up), %rax @@ -139,6 +150,7 @@ L(2): mov (up), %rax mov %r11, 24(rp) add $32, %rsp pop %rbx + DOS64_EXIT() ret L(3): mov (up), %rax @@ -184,6 +196,7 @@ L(3): mov (up), %rax adc %rbx, 40(rp) add $32, %rsp pop %rbx + DOS64_EXIT() ret L(4): mov (up), %rax @@ -256,6 +269,7 @@ L(4): mov (up), %rax pop %r12 pop %rbp pop %rbx + DOS64_EXIT() ret @@ -780,5 +794,6 @@ L(d1): mov %r11, 24(rp,j,8) pop %r12 pop %rbp pop %rbx + DOS64_EXIT() ret EPILOGUE() -- cgit v1.2.1 From b69304467882ffb2ef4e4ffe6c6876f877dc1d40 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Mon, 28 Nov 2011 23:13:59 +0100 Subject: *** empty log message *** --- ChangeLog | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/ChangeLog b/ChangeLog index 5f69c758f..6967ce66a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,22 @@ +2011-11-28 Torbjorn Granlund + + * mpn/x86_64/mul_basecase.asm: Support ABI DOS64. + * mpn/x86_64/sqr_basecase.asm: Support ABI DOS64. + * mpn/x86_64/aorsmul_1.asm: Support ABI DOS64. + * mpn/x86_64/mul_1.asm: Support ABI DOS64. + + * mpn/x86_64/x86_64-defs.m4 (DOS64_ENTRY, DOS64_EXIT): New, empty defs. 
+ + * mpn/x86_64/dos64.m4: New file. + + * mpn/asm-defs.m4 (ABI_SUPPORT): New dummy macro. + + * configure.in (64-bit mingw/cygwin): Define HOST_DOS64,GMP_NONSTD_ABI. + No longer clear out path_64. + (mpn code selection loop): Handle GMP_NONSTD_ABI. + + * mpn/generic/udiv_w_sdiv.c: Use CNST_LIMB for some constants. + 2011-11-25 Torbjorn Granlund * x86/*: Many new gmp-mparam.h file for 64-bit CPUs in 32-bit mode. -- cgit v1.2.1 From cdaf5d1a1e737e7db82e6509571468fa660c043b Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Tue, 29 Nov 2011 21:46:37 +0100 Subject: Retune. --- mpn/x86_64/coreinhm/gmp-mparam.h | 92 +++++++++++++++++++++++++++------------- 1 file changed, 63 insertions(+), 29 deletions(-) diff --git a/mpn/x86_64/coreinhm/gmp-mparam.h b/mpn/x86_64/coreinhm/gmp-mparam.h index 90cfa2be4..0a0ada3c5 100644 --- a/mpn/x86_64/coreinhm/gmp-mparam.h +++ b/mpn/x86_64/coreinhm/gmp-mparam.h @@ -53,58 +53,92 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define SQR_TOOM6_THRESHOLD 318 #define SQR_TOOM8_THRESHOLD 502 +#define MULMID_TOOM42_THRESHOLD 22 + #define MULMOD_BNM1_THRESHOLD 13 #define SQRMOD_BNM1_THRESHOLD 13 -#define POWM_SEC_TABLE 2,65,322,1084 +#define POWM_SEC_TABLE 3,42,83,643,2080 #define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */ #define MUL_FFT_TABLE3 \ { { 380, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ - { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ - { 23, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \ + { 10, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \ + { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ - { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ { 31,11}, { 63,10}, { 135,11}, { 79,10}, \ - { 159, 9}, { 319, 8}, { 
639,10}, { 167,11}, \ - { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ - { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \ - { 575,11}, { 159,10}, { 319,12}, { 95,11}, \ - { 191,10}, { 383,11}, { 207,13}, { 8192,14}, \ - { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ - { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ - {4194304,23}, {8388608,24} } -#define MUL_FFT_TABLE3_SIZE 74 + { 159,11}, { 95,10}, { 191, 9}, { 383,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 319,12}, { 95,11}, { 191,10}, \ + { 383,11}, { 207,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 271,10}, { 543,11}, \ + { 287,10}, { 575,11}, { 303,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 447,10}, { 895,13}, \ + { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \ + { 543,12}, { 287,11}, { 607,12}, { 319,11}, \ + { 639,12}, { 351,11}, { 703,10}, { 1407,13}, \ + { 191,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 831,10}, { 1663,12}, { 447,11}, { 895,12}, \ + { 479,14}, { 127,13}, { 255,12}, { 511,11}, \ + { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \ + { 1151,12}, { 607,13}, { 319,12}, { 703,11}, \ + { 1407,13}, { 383,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 959,11}, { 1919,14}, { 16384,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 137 #define MUL_FFT_THRESHOLD 3712 -#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */ +#define SQR_FFT_MODF_THRESHOLD 304 /* k = 5 */ #define SQR_FFT_TABLE3 \ - { { 308, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { { 304, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \ { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ - { 15, 9}, { 
43,10}, { 23, 9}, { 47,11}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ - { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ - { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ - { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \ - { 511,10}, { 135,11}, { 79,10}, { 159, 9}, \ - { 319,11}, { 95,10}, { 191, 9}, { 383, 8}, \ - { 767,12}, { 63,10}, { 255,11}, { 143, 9}, \ - { 575, 8}, { 1151,11}, { 159,10}, { 319, 9}, \ - { 639,11}, { 175,12}, { 95,11}, { 191,10}, \ - { 383,13}, { 8192,14}, { 16384,15}, { 32768,16}, \ - { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ - {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } -#define SQR_FFT_TABLE3_SIZE 76 -#define SQR_FFT_THRESHOLD 3200 + { 79,10}, { 47,11}, { 31,10}, { 79,11}, \ + { 47,12}, { 31,11}, { 63,10}, { 127, 9}, \ + { 255,11}, { 79,10}, { 159, 9}, { 319,11}, \ + { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,11}, { 143,10}, { 287, 9}, { 575,11}, \ + { 159,10}, { 319,11}, { 175,12}, { 95,11}, \ + { 191,10}, { 383,11}, { 207,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543,11}, { 287,10}, { 575,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 447,10}, { 895,11}, \ + { 479,13}, { 127,12}, { 255,11}, { 511,10}, \ + { 1023,11}, { 543,12}, { 287,11}, { 575,10}, \ + { 1151,12}, { 319,11}, { 639,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 447,11}, { 895,12}, \ + { 479,11}, { 959,14}, { 127,13}, { 255,12}, \ + { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \ + { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \ + { 639,11}, { 1279,12}, { 703,11}, { 1407,13}, \ + { 383,12}, { 767,11}, { 1535,12}, { 831,13}, \ + { 447,12}, { 959,11}, { 1919,14}, { 16384,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + 
{8388608,24} } +#define SQR_FFT_TABLE3_SIZE 137 +#define SQR_FFT_THRESHOLD 2752 #define MULLO_BASECASE_THRESHOLD 4 #define MULLO_DC_THRESHOLD 21 -- cgit v1.2.1 From 23df1f61b5f28b4bf4953acd2b069d1f09d6450f Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Tue, 29 Nov 2011 21:59:39 +0100 Subject: Add DOS64 ABI support to most files. --- ChangeLog | 4 ++++ configure.in | 1 - mpn/x86_64/addmul_2.asm | 7 ++++++- mpn/x86_64/aorrlsh1_n.asm | 8 +++++++- mpn/x86_64/aorrlsh2_n.asm | 5 ++++- mpn/x86_64/aorrlshC_n.asm | 7 ++++++- mpn/x86_64/aorrlsh_n.asm | 14 ++++++++++++++ mpn/x86_64/aors_n.asm | 34 ++++++++++++++++++++++++++-------- mpn/x86_64/aorscnd_n.asm | 14 ++++++++++++++ mpn/x86_64/bdiv_dbm1c.asm | 16 +++++++++++++++- mpn/x86_64/bdiv_q_1.asm | 21 +++++++++++++++++++-- mpn/x86_64/com.asm | 8 ++++++-- mpn/x86_64/copyd.asm | 9 +++++++-- mpn/x86_64/copyi.asm | 9 +++++++-- mpn/x86_64/core2/aorrlsh1_n.asm | 5 ++++- mpn/x86_64/core2/aorrlsh2_n.asm | 5 ++++- mpn/x86_64/core2/aorrlsh_n.asm | 4 ++++ mpn/x86_64/core2/aors_n.asm | 19 ++++++++++++++++--- mpn/x86_64/core2/aorsmul_1.asm | 8 +++++++- mpn/x86_64/core2/lshift.asm | 39 +++++++++++++++++++++++---------------- mpn/x86_64/core2/lshiftc.asm | 39 +++++++++++++++++++++++---------------- mpn/x86_64/core2/rsh1aors_n.asm | 17 ++++++++++++++++- mpn/x86_64/core2/rshift.asm | 39 +++++++++++++++++++++++---------------- mpn/x86_64/core2/sublsh1_n.asm | 5 ++++- mpn/x86_64/core2/sublsh2_n.asm | 5 ++++- mpn/x86_64/core2/sublshC_n.asm | 4 +++- mpn/x86_64/coreinhm/aorrlsh_n.asm | 17 +++++++++++++++++ mpn/x86_64/coreisbr/aors_n.asm | 14 ++++++++++++++ mpn/x86_64/invert_limb.asm | 6 +++++- mpn/x86_64/invert_limb_table.asm | 3 +++ mpn/x86_64/logops_n.asm | 16 ++++++++++++---- mpn/x86_64/lshift.asm | 11 ++++++++++- mpn/x86_64/lshiftc.asm | 7 ++++++- mpn/x86_64/lshsub_n.asm | 16 +++++++++++++++- mpn/x86_64/mod_1_1.asm | 7 +++++++ mpn/x86_64/mod_1_2.asm | 9 ++++++++- mpn/x86_64/mod_1_4.asm | 14 +++++++++++--- mpn/x86_64/mod_34lsub1.asm | 
12 +++++++++--- mpn/x86_64/mul_2.asm | 7 ++++++- mpn/x86_64/mulmid_basecase.asm | 14 +++++++++++++- mpn/x86_64/popham.asm | 12 +++++++++--- mpn/x86_64/redc_1.asm | 5 +++++ mpn/x86_64/rsh1aors_n.asm | 17 ++++++++++++++++- mpn/x86_64/rshift.asm | 7 ++++++- mpn/x86_64/sqr_basecase.asm | 8 -------- mpn/x86_64/sublsh1_n.asm | 7 ++++++- mpn/x86_64/tabselect.asm | 14 ++++++++++++++ 47 files changed, 458 insertions(+), 111 deletions(-) diff --git a/ChangeLog b/ChangeLog index 6967ce66a..01c275bd2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2011-11-29 Torbjorn Granlund + + * mpn/x86_64: Add DOS64 ABI support to most files. + 2011-11-28 Torbjorn Granlund * mpn/x86_64/mul_basecase.asm: Support ABI DOS64. diff --git a/configure.in b/configure.in index 5b7cf188a..601d6348f 100644 --- a/configure.in +++ b/configure.in @@ -1640,7 +1640,6 @@ case $host in ;; *-*-mingw* | *-*-cygwin) limb_64=longlong - extra_functions_64="" # FIXME: remove when invert_limb is ported CALLING_CONVENTIONS_OBJS_64="" AC_DEFINE(HOST_DOS64,1,[Define to 1 for Windos/64]) AC_SUBST(GMP_NONSTD_ABI,DOS64) diff --git a/mpn/x86_64/addmul_2.asm b/mpn/x86_64/addmul_2.asm index 107c3dafe..5c6647888 100644 --- a/mpn/x86_64/addmul_2.asm +++ b/mpn/x86_64/addmul_2.asm @@ -50,10 +50,14 @@ define(`w2', `%rbp') define(`w3', `%r10') define(`n', `%r11') +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + +ASM_START() TEXT ALIGN(16) -ASM_START() PROLOGUE(mpn_addmul_2) + DOS64_ENTRY(4) mov n_param, n push %rbx push %rbp @@ -164,6 +168,7 @@ L(end): xor R32(w1), R32(w1) pop %rbp pop %rbx + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/aorrlsh1_n.asm b/mpn/x86_64/aorrlsh1_n.asm index 2ea556b73..dda7d590e 100644 --- a/mpn/x86_64/aorrlsh1_n.asm +++ b/mpn/x86_64/aorrlsh1_n.asm @@ -1,7 +1,8 @@ dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1) dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[] -dnl Copyright 2003, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc. 
+dnl Copyright 2003, 2005, 2006, 2007, 2008, 2009, 2011 Free Software +dnl Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -54,10 +55,14 @@ ifdef(`OPERATION_rsblsh1_n', ` MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n) +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(16) PROLOGUE(func) + DOS64_ENTRY(4) push %rbp mov (vp), %r8 @@ -147,5 +152,6 @@ ifdef(`OPERATION_rsblsh1_n',` movslq R32(%rbp), %rax') pop %rbp + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/aorrlsh2_n.asm b/mpn/x86_64/aorrlsh2_n.asm index 6d55cfd10..8c427a674 100644 --- a/mpn/x86_64/aorrlsh2_n.asm +++ b/mpn/x86_64/aorrlsh2_n.asm @@ -3,7 +3,7 @@ dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[] dnl Contributed to the GNU project by Torbjorn Granlund. -dnl Copyright 2009, 2010 Free Software Foundation, Inc. +dnl Copyright 2009, 2010, 2011 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -36,4 +36,7 @@ ifdef(`OPERATION_rsblsh2_n',` MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n) +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + include_mpn(`x86_64/aorrlshC_n.asm') diff --git a/mpn/x86_64/aorrlshC_n.asm b/mpn/x86_64/aorrlshC_n.asm index cab0b07f4..ae9a9d952 100644 --- a/mpn/x86_64/aorrlshC_n.asm +++ b/mpn/x86_64/aorrlshC_n.asm @@ -1,7 +1,7 @@ dnl AMD64 mpn_addlshC_n -- rp[] = up[] + (vp[] << C) dnl AMD64 mpn_rsblshC_n -- rp[] = (vp[] << C) - up[] -dnl Copyright 2009, 2010 Free Software Foundation, Inc. +dnl Copyright 2009, 2010, 2011 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. 
@@ -37,10 +37,14 @@ define(`n', `%rcx') define(M, eval(m4_lshift(1,LSH))) +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(16) PROLOGUE(func) + DOS64_ENTRY(4) push %r12 push %r13 push %r14 @@ -140,5 +144,6 @@ ifelse(ADDSUB,add,` pop %r14 pop %r13 pop %r12 + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/aorrlsh_n.asm b/mpn/x86_64/aorrlsh_n.asm index d19dea535..8ab3688d2 100644 --- a/mpn/x86_64/aorrlsh_n.asm +++ b/mpn/x86_64/aorrlsh_n.asm @@ -56,10 +56,23 @@ ifdef(`OPERATION_rsblsh_n',` MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n) +ifdef(`HOST_DOS64',` + define(`IFDOS', `$1') + define(`IFELF', `') +',` + define(`IFDOS', `') + define(`IFELF', `$1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(16) PROLOGUE(func) + DOS64_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') push %r12 push %r13 push %r14 @@ -155,5 +168,6 @@ L(end): add R32(%rbx), R32(%rbx) pop %r14 pop %r13 pop %r12 + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/aors_n.asm b/mpn/x86_64/aors_n.asm index 916e9b664..eadde641b 100644 --- a/mpn/x86_64/aors_n.asm +++ b/mpn/x86_64/aors_n.asm @@ -1,7 +1,7 @@ dnl AMD64 mpn_add_n, mpn_sub_n -dnl Copyright 2003, 2004, 2005, 2007, 2008, 2010 Free Software Foundation, -dnl Inc. +dnl Copyright 2003, 2004, 2005, 2007, 2008, 2010, 2011 Free Software +dnl Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -30,15 +30,15 @@ C Intel SBR 1.59 C Intel atom 4 C VIA nano 3.25 -C The inner loop of this code is the result of running a code generation and +C The loop of this code is the result of running a code generation and C optimization tool suite written by David Harvey and Torbjorn Granlund. 
C INPUT PARAMETERS -define(`rp', `%rdi') -define(`up', `%rsi') -define(`vp', `%rdx') -define(`n', `%rcx') -define(`cy', `%r8') C (only for mpn_add_nc) +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`vp', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`cy', `%r8') C rsp+40 (only for mpn_add_nc) ifdef(`OPERATION_add_n', ` define(ADCSBB, adc) @@ -51,10 +51,23 @@ ifdef(`OPERATION_sub_n', ` MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) +ifdef(`HOST_DOS64',` + define(`IFDOS', `$1') + define(`IFELF', `') +',` + define(`IFDOS', `') + define(`IFELF', `$1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(16) PROLOGUE(func_nc) + DOS64_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') mov R32(n), R32(%rax) shr $2, n and $3, R32(%rax) @@ -69,6 +82,7 @@ PROLOGUE(func_nc) EPILOGUE() ALIGN(16) PROLOGUE(func) + DOS64_ENTRY(4) mov R32(n), R32(%rax) shr $2, n and $3, R32(%rax) @@ -85,6 +99,7 @@ L(lt4): dec R32(%rax) ADCSBB (vp), %r8 mov %r8, (rp) adc R32(%rax), R32(%rax) + DOS64_EXIT() ret L(2): dec R32(%rax) @@ -95,6 +110,7 @@ L(2): dec R32(%rax) mov %r8, (rp) mov %r9, 8(rp) adc R32(%rax), R32(%rax) + DOS64_EXIT() ret L(3): mov 16(up), %r10 @@ -105,6 +121,7 @@ L(3): mov 16(up), %r10 mov %r9, 8(rp) mov %r10, 16(rp) setc R8(%rax) + DOS64_EXIT() ret ALIGN(16) @@ -142,5 +159,6 @@ L(end): lea 32(up), up dec R32(%rax) jnz L(lt4) adc R32(%rax), R32(%rax) + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/aorscnd_n.asm b/mpn/x86_64/aorscnd_n.asm index 19ea42f2a..d22a2a218 100644 --- a/mpn/x86_64/aorscnd_n.asm +++ b/mpn/x86_64/aorscnd_n.asm @@ -59,10 +59,23 @@ ifdef(`OPERATION_subcnd_n', ` MULFUNC_PROLOGUE(mpn_addcnd_n mpn_subcnd_n) +ifdef(`HOST_DOS64',` + define(`IFDOS', `$1') + define(`IFELF', `') +',` + define(`IFDOS', `') + define(`IFELF', `$1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(16) PROLOGUE(func) + DOS64_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') push %rbx push %rbp push %r12 @@ -160,5 +173,6 @@ L(end): neg 
R32(%rax) pop %r12 pop %rbp pop %rbx + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/bdiv_dbm1c.asm b/mpn/x86_64/bdiv_dbm1c.asm index f6a77507d..0fef478d9 100644 --- a/mpn/x86_64/bdiv_dbm1c.asm +++ b/mpn/x86_64/bdiv_dbm1c.asm @@ -41,10 +41,23 @@ define(`cy', `%r8') define(`n', `%r9') +ifdef(`HOST_DOS64',` + define(`IFDOS', `$1') + define(`IFELF', `') +',` + define(`IFDOS', `') + define(`IFELF', `$1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(16) PROLOGUE(mpn_bdiv_dbm1c) + DOS64_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') mov (up), %rax mov n_param, n mov R32(n_param), R32(%r11) @@ -84,6 +97,7 @@ L(lo1): sub %rax, %r8 add $4, n jnz L(top) -L(end): mov %r8, %rax + mov %r8, %rax + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/bdiv_q_1.asm b/mpn/x86_64/bdiv_q_1.asm index 01624a52a..e1e1db5a5 100644 --- a/mpn/x86_64/bdiv_q_1.asm +++ b/mpn/x86_64/bdiv_q_1.asm @@ -1,8 +1,8 @@ dnl AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- schoolbook Hensel division by dnl 1-limb divisor, returning quotient only. -dnl Copyright 2001, 2002, 2004, 2005, 2006, 2009 Free Software Foundation, -dnl Inc. +dnl Copyright 2001, 2002, 2004, 2005, 2006, 2009, 2011 Free Software +dnl Foundation, Inc. dnl This file is part of the GNU MP Library. 
@@ -41,10 +41,22 @@ C di r8 just mpn_pi1_bdiv_q_1 C shift r9 just mpn_pi1_bdiv_q_1 +ifdef(`HOST_DOS64',` + define(`IFDOS', `$1') + define(`IFELF', `') +',` + define(`IFDOS', `') + define(`IFELF', `$1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(16) PROLOGUE(mpn_bdiv_q_1) + DOS64_ENTRY(4) push %rbx mov %rcx, %rax @@ -91,6 +103,9 @@ L(evn): bsf %rax, %rcx EPILOGUE() PROLOGUE(mpn_pi1_bdiv_q_1) + DOS64_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +IFDOS(` mov 64(%rsp), %r9 ') push %rbx mov %rcx, %r11 C d @@ -144,11 +159,13 @@ L(ent): imul %r8, %rax imul %r8, %rax mov %rax, (%rdi) pop %rbx + DOS64_EXIT() ret L(one): shr R8(%rcx), %rax imul %r8, %rax mov %rax, (%rdi) pop %rbx + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/com.asm b/mpn/x86_64/com.asm index 6ff62eeac..3a232fc20 100644 --- a/mpn/x86_64/com.asm +++ b/mpn/x86_64/com.asm @@ -1,6 +1,6 @@ dnl AMD64 mpn_com. -dnl Copyright 2004, 2005, 2006 Free Software Foundation, Inc. +dnl Copyright 2004, 2005, 2006, 2011 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -34,11 +34,14 @@ define(`rp',`%rdi') define(`up',`%rsi') define(`n',`%rdx') +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) ASM_START() TEXT ALIGN(32) PROLOGUE(mpn_com) + DOS64_ENTRY(3) movq (up), %r8 movl R32(%rdx), R32(%rax) leaq (up,n,8), up @@ -76,5 +79,6 @@ L(e10): movq 24(up,n,8), %r9 movq %r9, 24(rp,n,8) addq $4, n jnc L(oop) -L(ret): ret +L(ret): DOS64_EXIT() + ret EPILOGUE() diff --git a/mpn/x86_64/copyd.asm b/mpn/x86_64/copyd.asm index 13210217b..15e929f4e 100644 --- a/mpn/x86_64/copyd.asm +++ b/mpn/x86_64/copyd.asm @@ -1,6 +1,6 @@ dnl AMD64 mpn_copyd -- copy limb vector, decrementing. -dnl Copyright 2003, 2005, 2007 Free Software Foundation, Inc. +dnl Copyright 2003, 2005, 2007, 2011 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. 
@@ -39,10 +39,14 @@ define(`rp',`%rdi') define(`up',`%rsi') define(`n',`%rdx') +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(16) PROLOGUE(mpn_copyd) + DOS64_ENTRY(3) leaq -8(up,n,8), up leaq (rp,n,8), rp subq $4, n @@ -73,5 +77,6 @@ L(end): shrl R32(%rdx) C edx = lowpart(n) movq -8(up), %r9 movq %r8, -8(rp) movq %r9, -16(rp) -1: ret +1: DOS64_EXIT() + ret EPILOGUE() diff --git a/mpn/x86_64/copyi.asm b/mpn/x86_64/copyi.asm index d5cbdd644..1dd6c3168 100644 --- a/mpn/x86_64/copyi.asm +++ b/mpn/x86_64/copyi.asm @@ -1,6 +1,6 @@ dnl AMD64 mpn_copyi -- copy limb vector, incrementing. -dnl Copyright 2003, 2005, 2007 Free Software Foundation, Inc. +dnl Copyright 2003, 2005, 2007, 2011 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -39,10 +39,14 @@ define(`rp',`%rdi') define(`up',`%rsi') define(`n',`%rdx') +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(16) PROLOGUE(mpn_copyi) + DOS64_ENTRY(3) leaq -8(rp), rp subq $4, n jc L(end) @@ -72,5 +76,6 @@ L(end): shrl R32(%rdx) C edx = lowpart(n) movq 8(up), %r9 movq %r8, 8(rp) movq %r9, 16(rp) -1: ret +1: DOS64_EXIT() + ret EPILOGUE() diff --git a/mpn/x86_64/core2/aorrlsh1_n.asm b/mpn/x86_64/core2/aorrlsh1_n.asm index 346c21f33..e44e718a6 100644 --- a/mpn/x86_64/core2/aorrlsh1_n.asm +++ b/mpn/x86_64/core2/aorrlsh1_n.asm @@ -3,7 +3,7 @@ dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[] dnl Contributed to the GNU project by Torbjorn Granlund. -dnl Copyright 2008, 2010 Free Software Foundation, Inc. +dnl Copyright 2008, 2010, 2011 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. 
@@ -36,4 +36,7 @@ ifdef(`OPERATION_rsblsh1_n', ` MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n) +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + include_mpn(`x86_64/aorrlshC_n.asm') diff --git a/mpn/x86_64/core2/aorrlsh2_n.asm b/mpn/x86_64/core2/aorrlsh2_n.asm index 1da0c527f..2d9c89553 100644 --- a/mpn/x86_64/core2/aorrlsh2_n.asm +++ b/mpn/x86_64/core2/aorrlsh2_n.asm @@ -3,7 +3,7 @@ dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[] dnl Contributed to the GNU project by Torbjorn Granlund. -dnl Copyright 2008, 2010 Free Software Foundation, Inc. +dnl Copyright 2008, 2010, 2011 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -36,4 +36,7 @@ ifdef(`OPERATION_rsblsh2_n', ` MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n) +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + include_mpn(`x86_64/aorrlshC_n.asm') diff --git a/mpn/x86_64/core2/aorrlsh_n.asm b/mpn/x86_64/core2/aorrlsh_n.asm index 8d03970ca..a8f5c051a 100644 --- a/mpn/x86_64/core2/aorrlsh_n.asm +++ b/mpn/x86_64/core2/aorrlsh_n.asm @@ -20,4 +20,8 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + include_mpn(`x86_64/coreinhm/aorrlsh_n.asm') diff --git a/mpn/x86_64/core2/aors_n.asm b/mpn/x86_64/core2/aors_n.asm index 75807c79a..bc109cc22 100644 --- a/mpn/x86_64/core2/aors_n.asm +++ b/mpn/x86_64/core2/aors_n.asm @@ -1,6 +1,6 @@ dnl Intel P6-15 mpn_add_n/mpn_sub_n -- mpn add or subtract. -dnl Copyright 2006, 2007 Free Software Foundation, Inc. +dnl Copyright 2006, 2007, 2011 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. 
@@ -48,16 +48,28 @@ ifdef(`OPERATION_sub_n', ` MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) -ASM_START() +ifdef(`HOST_DOS64',` + define(`IFDOS', `$1') + define(`IFELF', `') +',` + define(`IFDOS', `') + define(`IFELF', `$1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) +ASM_START() TEXT ALIGN(16) - PROLOGUE(func_nc) + DOS64_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') jmp L(start) EPILOGUE() PROLOGUE(func) + DOS64_ENTRY(4) xor %r8, %r8 L(start): mov (up), %r10 @@ -96,6 +108,7 @@ L(end): ADCSBB %r11, %r10 mov %r10, 8(rp) mov R32(%rcx), R32(%rax) C clear eax, ecx contains 0 adc R32(%rax), R32(%rax) + DOS64_EXIT() ret ALIGN(16) diff --git a/mpn/x86_64/core2/aorsmul_1.asm b/mpn/x86_64/core2/aorsmul_1.asm index bb4f663c4..aeda30159 100644 --- a/mpn/x86_64/core2/aorsmul_1.asm +++ b/mpn/x86_64/core2/aorsmul_1.asm @@ -1,6 +1,7 @@ dnl x86-64 mpn_addmul_1 and mpn_submul_1, optimized for "Core 2". -dnl Copyright 2003, 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc. +dnl Copyright 2003, 2004, 2005, 2007, 2008, 2009, 2011 Free Software +dnl Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -45,10 +46,14 @@ ifdef(`OPERATION_submul_1',` MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(16) PROLOGUE(func) + DOS64_ENTRY(4) push %rbx push %rbp lea (%rdx), %rbx @@ -127,5 +132,6 @@ L(n1): mov 8(rp), %r10 adc %rdx, %rax pop %rbp pop %rbx + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/core2/lshift.asm b/mpn/x86_64/core2/lshift.asm index 3b17e8315..2e175de76 100644 --- a/mpn/x86_64/core2/lshift.asm +++ b/mpn/x86_64/core2/lshift.asm @@ -1,6 +1,6 @@ dnl x86-64 mpn_lshift optimized for "Core 2". -dnl Copyright 2007, 2009 Free Software Foundation, Inc. +dnl Copyright 2007, 2009, 2011 Free Software Foundation, Inc. dnl dnl This file is part of the GNU MP Library. 
dnl @@ -35,12 +35,16 @@ C INPUT PARAMETERS define(`rp', `%rdi') define(`up', `%rsi') define(`n', `%rdx') -define(`cnt', `%cl') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) ASM_START() TEXT ALIGN(16) PROLOGUE(mpn_lshift) + DOS64_ENTRY(4) lea -8(rp,n,8), rp lea -8(up,n,8), up @@ -51,7 +55,7 @@ L(b00): C n = 4, 8, 12, ... mov (up), %r10 mov -8(up), %r11 xor R32(%rax), R32(%rax) - shld R8(%rcx), %r10, %rax + shld R8(cnt), %r10, %rax mov -16(up), %r8 lea 24(rp), rp sub $4, n @@ -62,7 +66,7 @@ L(nb00):C n = 1, 5, 9, ... jae L(nb01) L(b01): mov (up), %r9 xor R32(%rax), R32(%rax) - shld R8(%rcx), %r9, %rax + shld R8(cnt), %r9, %rax sub $2, n jb L(le1) mov -8(up), %r10 @@ -70,8 +74,9 @@ L(b01): mov (up), %r9 lea -8(up), up lea 16(rp), rp jmp L(01) -L(le1): shl R8(%rcx), %r9 +L(le1): shl R8(cnt), %r9 mov %r9, (rp) + DOS64_EXIT() ret L(nb01):C n = 2, 6, 10, ... @@ -79,17 +84,18 @@ L(nb01):C n = 2, 6, 10, ... L(b10): mov (up), %r8 mov -8(up), %r9 xor R32(%rax), R32(%rax) - shld R8(%rcx), %r8, %rax + shld R8(cnt), %r8, %rax sub $3, n jb L(le2) mov -16(up), %r10 lea -16(up), up lea 8(rp), rp jmp L(10) -L(le2): shld R8(%rcx), %r9, %r8 +L(le2): shld R8(cnt), %r9, %r8 mov %r8, (rp) - shl R8(%rcx), %r9 + shl R8(cnt), %r9 mov %r9, -8(rp) + DOS64_EXIT() ret ALIGN(16) C performance critical! @@ -97,23 +103,23 @@ L(b11): C n = 3, 7, 11, ... 
mov (up), %r11 mov -8(up), %r8 xor R32(%rax), R32(%rax) - shld R8(%rcx), %r11, %rax + shld R8(cnt), %r11, %rax mov -16(up), %r9 lea -24(up), up sub $4, n jb L(end) ALIGN(16) -L(top): shld R8(%rcx), %r8, %r11 +L(top): shld R8(cnt), %r8, %r11 mov (up), %r10 mov %r11, (rp) -L(10): shld R8(%rcx), %r9, %r8 +L(10): shld R8(cnt), %r9, %r8 mov -8(up), %r11 mov %r8, -8(rp) -L(01): shld R8(%rcx), %r10, %r9 +L(01): shld R8(cnt), %r10, %r9 mov -16(up), %r8 mov %r9, -16(rp) -L(00): shld R8(%rcx), %r11, %r10 +L(00): shld R8(cnt), %r11, %r10 mov -24(up), %r9 mov %r10, -24(rp) add $-32, up @@ -121,11 +127,12 @@ L(00): shld R8(%rcx), %r11, %r10 sub $4, n jnc L(top) -L(end): shld R8(%rcx), %r8, %r11 +L(end): shld R8(cnt), %r8, %r11 mov %r11, (rp) - shld R8(%rcx), %r9, %r8 + shld R8(cnt), %r9, %r8 mov %r8, -8(rp) - shl R8(%rcx), %r9 + shl R8(cnt), %r9 mov %r9, -16(rp) + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/core2/lshiftc.asm b/mpn/x86_64/core2/lshiftc.asm index a19f72297..31a08f7ae 100644 --- a/mpn/x86_64/core2/lshiftc.asm +++ b/mpn/x86_64/core2/lshiftc.asm @@ -1,6 +1,6 @@ dnl x86-64 mpn_lshiftc optimized for "Core 2". -dnl Copyright 2007, 2009 Free Software Foundation, Inc. +dnl Copyright 2007, 2009, 2011 Free Software Foundation, Inc. dnl dnl This file is part of the GNU MP Library. dnl @@ -35,12 +35,16 @@ C INPUT PARAMETERS define(`rp', `%rdi') define(`up', `%rsi') define(`n', `%rdx') -define(`cnt', `%cl') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) ASM_START() TEXT ALIGN(16) PROLOGUE(mpn_lshiftc) + DOS64_ENTRY(4) lea -8(rp,n,8), rp lea -8(up,n,8), up @@ -51,7 +55,7 @@ L(b00): C n = 4, 8, 12, ... mov (up), %r10 mov -8(up), %r11 xor R32(%rax), R32(%rax) - shld R8(%rcx), %r10, %rax + shld R8(cnt), %r10, %rax mov -16(up), %r8 lea 24(rp), rp sub $4, n @@ -62,7 +66,7 @@ L(nb00):C n = 1, 5, 9, ... 
jae L(nb01) L(b01): mov (up), %r9 xor R32(%rax), R32(%rax) - shld R8(%rcx), %r9, %rax + shld R8(cnt), %r9, %rax sub $2, n jb L(le1) mov -8(up), %r10 @@ -70,9 +74,10 @@ L(b01): mov (up), %r9 lea -8(up), up lea 16(rp), rp jmp L(01) -L(le1): shl R8(%rcx), %r9 +L(le1): shl R8(cnt), %r9 not %r9 mov %r9, (rp) + DOS64_EXIT() ret L(nb01):C n = 2, 6, 10, ... @@ -80,19 +85,20 @@ L(nb01):C n = 2, 6, 10, ... L(b10): mov (up), %r8 mov -8(up), %r9 xor R32(%rax), R32(%rax) - shld R8(%rcx), %r8, %rax + shld R8(cnt), %r8, %rax sub $3, n jb L(le2) mov -16(up), %r10 lea -16(up), up lea 8(rp), rp jmp L(10) -L(le2): shld R8(%rcx), %r9, %r8 +L(le2): shld R8(cnt), %r9, %r8 not %r8 mov %r8, (rp) - shl R8(%rcx), %r9 + shl R8(cnt), %r9 not %r9 mov %r9, -8(rp) + DOS64_EXIT() ret ALIGN(16) C performance critical! @@ -100,26 +106,26 @@ L(b11): C n = 3, 7, 11, ... mov (up), %r11 mov -8(up), %r8 xor R32(%rax), R32(%rax) - shld R8(%rcx), %r11, %rax + shld R8(cnt), %r11, %rax mov -16(up), %r9 lea -24(up), up sub $4, n jb L(end) ALIGN(16) -L(top): shld R8(%rcx), %r8, %r11 +L(top): shld R8(cnt), %r8, %r11 mov (up), %r10 not %r11 mov %r11, (rp) -L(10): shld R8(%rcx), %r9, %r8 +L(10): shld R8(cnt), %r9, %r8 mov -8(up), %r11 not %r8 mov %r8, -8(rp) -L(01): shld R8(%rcx), %r10, %r9 +L(01): shld R8(cnt), %r10, %r9 mov -16(up), %r8 not %r9 mov %r9, -16(rp) -L(00): shld R8(%rcx), %r11, %r10 +L(00): shld R8(cnt), %r11, %r10 mov -24(up), %r9 not %r10 mov %r10, -24(rp) @@ -128,14 +134,15 @@ L(00): shld R8(%rcx), %r11, %r10 sub $4, n jnc L(top) -L(end): shld R8(%rcx), %r8, %r11 +L(end): shld R8(cnt), %r8, %r11 not %r11 mov %r11, (rp) - shld R8(%rcx), %r9, %r8 + shld R8(cnt), %r9, %r8 not %r8 mov %r8, -8(rp) - shl R8(%rcx), %r9 + shl R8(cnt), %r9 not %r9 mov %r9, -16(rp) + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/core2/rsh1aors_n.asm b/mpn/x86_64/core2/rsh1aors_n.asm index eb52efc08..b350e4a43 100644 --- a/mpn/x86_64/core2/rsh1aors_n.asm +++ b/mpn/x86_64/core2/rsh1aors_n.asm @@ -1,6 +1,6 @@ dnl Intel 
P6/64 mpn_rsh1add_n and mpn_rsh1sub_n -- rp[] = (up[] +- vp[]) >> 1 -dnl Copyright 2003, 2005, 2009, 2010 Free Software Foundation, Inc. +dnl Copyright 2003, 2005, 2009, 2010, 2011 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -49,11 +49,24 @@ ifdef(`OPERATION_rsh1sub_n', ` MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc) +ifdef(`HOST_DOS64',` + define(`IFDOS', `$1') + define(`IFELF', `') +',` + define(`IFDOS', `') + define(`IFELF', `$1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(16) PROLOGUE(func_nc) + DOS64_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') push %rbx push %rbp @@ -66,6 +79,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(func_n) + DOS64_ENTRY(4) push %rbx push %rbp @@ -171,5 +185,6 @@ L(end): shrd $1, %rbx, %rbp mov %rbp, (rp) pop %rbp pop %rbx + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/core2/rshift.asm b/mpn/x86_64/core2/rshift.asm index 38a77364f..68306881c 100644 --- a/mpn/x86_64/core2/rshift.asm +++ b/mpn/x86_64/core2/rshift.asm @@ -1,6 +1,6 @@ dnl x86-64 mpn_rshift optimized for "Core 2". -dnl Copyright 2007, 2009 Free Software Foundation, Inc. +dnl Copyright 2007, 2009, 2011 Free Software Foundation, Inc. dnl dnl This file is part of the GNU MP Library. dnl @@ -35,12 +35,16 @@ C INPUT PARAMETERS define(`rp', `%rdi') define(`up', `%rsi') define(`n', `%rdx') -define(`cnt', `%cl') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) ASM_START() TEXT ALIGN(16) PROLOGUE(mpn_rshift) + DOS64_ENTRY(4) mov R32(%rdx), R32(%rax) and $3, R32(%rax) jne L(nb00) @@ -48,7 +52,7 @@ L(b00): C n = 4, 8, 12, ... mov (up), %r10 mov 8(up), %r11 xor R32(%rax), R32(%rax) - shrd R8(%rcx), %r10, %rax + shrd R8(cnt), %r10, %rax mov 16(up), %r8 lea 8(up), up lea -24(rp), rp @@ -60,7 +64,7 @@ L(nb00):C n = 1, 5, 9, ... 
jae L(nb01) L(b01): mov (up), %r9 xor R32(%rax), R32(%rax) - shrd R8(%rcx), %r9, %rax + shrd R8(cnt), %r9, %rax sub $2, n jb L(le1) mov 8(up), %r10 @@ -68,8 +72,9 @@ L(b01): mov (up), %r9 lea 16(up), up lea -16(rp), rp jmp L(01) -L(le1): shr R8(%rcx), %r9 +L(le1): shr R8(cnt), %r9 mov %r9, (rp) + DOS64_EXIT() ret L(nb01):C n = 2, 6, 10, ... @@ -77,17 +82,18 @@ L(nb01):C n = 2, 6, 10, ... L(b10): mov (up), %r8 mov 8(up), %r9 xor R32(%rax), R32(%rax) - shrd R8(%rcx), %r8, %rax + shrd R8(cnt), %r8, %rax sub $3, n jb L(le2) mov 16(up), %r10 lea 24(up), up lea -8(rp), rp jmp L(10) -L(le2): shrd R8(%rcx), %r9, %r8 +L(le2): shrd R8(cnt), %r9, %r8 mov %r8, (rp) - shr R8(%rcx), %r9 + shr R8(cnt), %r9 mov %r9, 8(rp) + DOS64_EXIT() ret ALIGN(16) @@ -95,23 +101,23 @@ L(b11): C n = 3, 7, 11, ... mov (up), %r11 mov 8(up), %r8 xor R32(%rax), R32(%rax) - shrd R8(%rcx), %r11, %rax + shrd R8(cnt), %r11, %rax mov 16(up), %r9 lea 32(up), up sub $4, n jb L(end) ALIGN(16) -L(top): shrd R8(%rcx), %r8, %r11 +L(top): shrd R8(cnt), %r8, %r11 mov -8(up), %r10 mov %r11, (rp) -L(10): shrd R8(%rcx), %r9, %r8 +L(10): shrd R8(cnt), %r9, %r8 mov (up), %r11 mov %r8, 8(rp) -L(01): shrd R8(%rcx), %r10, %r9 +L(01): shrd R8(cnt), %r10, %r9 mov 8(up), %r8 mov %r9, 16(rp) -L(00): shrd R8(%rcx), %r11, %r10 +L(00): shrd R8(cnt), %r11, %r10 mov 16(up), %r9 mov %r10, 24(rp) add $32, up @@ -119,11 +125,12 @@ L(00): shrd R8(%rcx), %r11, %r10 sub $4, n jnc L(top) -L(end): shrd R8(%rcx), %r8, %r11 +L(end): shrd R8(cnt), %r8, %r11 mov %r11, (rp) - shrd R8(%rcx), %r9, %r8 + shrd R8(cnt), %r9, %r8 mov %r8, 8(rp) - shr R8(%rcx), %r9 + shr R8(cnt), %r9 mov %r9, 16(rp) + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/core2/sublsh1_n.asm b/mpn/x86_64/core2/sublsh1_n.asm index 7522b429f..50411d7d0 100644 --- a/mpn/x86_64/core2/sublsh1_n.asm +++ b/mpn/x86_64/core2/sublsh1_n.asm @@ -2,7 +2,7 @@ dnl AMD64 mpn_sublsh1_n optimised for Core 2 and Core iN. dnl Contributed to the GNU project by Torbjorn Granlund. 
-dnl Copyright 2008, 2010 Free Software Foundation, Inc. +dnl Copyright 2008, 2010, 2011 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -30,4 +30,7 @@ define(func, mpn_sublsh1_n) MULFUNC_PROLOGUE(mpn_sublsh1_n) +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + include_mpn(`x86_64/core2/sublshC_n.asm') diff --git a/mpn/x86_64/core2/sublsh2_n.asm b/mpn/x86_64/core2/sublsh2_n.asm index 036d2c859..affc87177 100644 --- a/mpn/x86_64/core2/sublsh2_n.asm +++ b/mpn/x86_64/core2/sublsh2_n.asm @@ -2,7 +2,7 @@ dnl AMD64 mpn_sublsh2_n optimised for Core 2 and Core iN. dnl Contributed to the GNU project by Torbjorn Granlund. -dnl Copyright 2008, 2010 Free Software Foundation, Inc. +dnl Copyright 2008, 2010, 2011 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -30,4 +30,7 @@ define(func, mpn_sublsh2_n) MULFUNC_PROLOGUE(mpn_sublsh2_n) +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + include_mpn(`x86_64/core2/sublshC_n.asm') diff --git a/mpn/x86_64/core2/sublshC_n.asm b/mpn/x86_64/core2/sublshC_n.asm index 2f89c35e3..7c4545f5a 100644 --- a/mpn/x86_64/core2/sublshC_n.asm +++ b/mpn/x86_64/core2/sublshC_n.asm @@ -3,7 +3,7 @@ dnl Core iN. dnl Contributed to the GNU project by Torbjorn Granlund. -dnl Copyright 2008, 2010 Free Software Foundation, Inc. +dnl Copyright 2008, 2010, 2011 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. 
@@ -40,6 +40,7 @@ ASM_START() TEXT ALIGN(8) PROLOGUE(func) + DOS64_ENTRY(4) push %rbx push %r12 @@ -141,5 +142,6 @@ L(end): shr $RSH, %r11 pop %rbx sub R32(%r11), R32(%rax) neg R32(%rax) + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/coreinhm/aorrlsh_n.asm b/mpn/x86_64/coreinhm/aorrlsh_n.asm index a4afae69d..e22cc065d 100644 --- a/mpn/x86_64/coreinhm/aorrlsh_n.asm +++ b/mpn/x86_64/coreinhm/aorrlsh_n.asm @@ -62,10 +62,23 @@ C mpn_rsblsh_nc removed below, its idea of carry-in is inconsistent with C refmpn_rsblsh_nc MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n) +ifdef(`HOST_DOS64',` + define(`IFDOS', `$1') + define(`IFELF', `') +',` + define(`IFDOS', `') + define(`IFELF', `$1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(32) PROLOGUE(func_n) + DOS64_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') C cnt push %rbx xor R32(%rbx), R32(%rbx) C clear CF save register L(ent): push %rbp @@ -170,9 +183,13 @@ L(wd1): shrd %cl, %r8, %r11 IFRSB( neg %rax) pop %rbp pop %rbx + DOS64_EXIT() ret EPILOGUE() PROLOGUE(func_nc) + DOS64_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') C cnt +IFDOS(` mov 64(%rsp), %r9 ') C cy push %rbx neg cy sbb R32(%rbx), R32(%rbx) C initialise CF save register diff --git a/mpn/x86_64/coreisbr/aors_n.asm b/mpn/x86_64/coreisbr/aors_n.asm index 66a5e3b60..4d8d1cccf 100644 --- a/mpn/x86_64/coreisbr/aors_n.asm +++ b/mpn/x86_64/coreisbr/aors_n.asm @@ -49,10 +49,22 @@ ifdef(`OPERATION_sub_n', ` MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) +ifdef(`HOST_DOS64',` + define(`IFDOS', `$1') + define(`IFELF', `') +',` + define(`IFDOS', `') + define(`IFELF', `$1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(16) PROLOGUE(func) + DOS64_ENTRY(4) xor %r8, %r8 L(ent): mov R32(n), R32(%rax) shr $2, n @@ -144,5 +156,7 @@ L(e1): ADCSBB 16(vp), %r10 ret EPILOGUE() PROLOGUE(func_nc) + DOS64_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') jmp L(ent) EPILOGUE() diff --git a/mpn/x86_64/invert_limb.asm 
b/mpn/x86_64/invert_limb.asm index 8c6aa68b6..06cf1414a 100644 --- a/mpn/x86_64/invert_limb.asm +++ b/mpn/x86_64/invert_limb.asm @@ -2,7 +2,7 @@ dnl AMD64 mpn_invert_limb -- Invert a normalized limb. dnl Contributed to the GNU project by Torbjorn Granlund and Niels Möller. -dnl Copyright 2004, 2007, 2008, 2009 Free Software Foundation, Inc. +dnl Copyright 2004, 2007, 2008, 2009, 2011 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -33,11 +33,14 @@ C VIA nano 79 157 C rax rcx rdx rdi rsi r8 +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) ASM_START() TEXT ALIGN(16) PROLOGUE(mpn_invert_limb) C Kn C2 Ci + DOS64_ENTRY(1) mov %rdi, %rax C 0 0 0 shr $55, %rax C 1 1 1 ifdef(`PIC',` @@ -94,6 +97,7 @@ ifdef(`DARWIN',` adc %rdi, %rdx sub %rdx, %rax + DOS64_EXIT() ret EPILOGUE() ASM_END() diff --git a/mpn/x86_64/invert_limb_table.asm b/mpn/x86_64/invert_limb_table.asm index 98a331372..86d75b8ce 100644 --- a/mpn/x86_64/invert_limb_table.asm +++ b/mpn/x86_64/invert_limb_table.asm @@ -21,6 +21,9 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() C Table entry X contains floor (0x7fd00 / (0x100 + X)) diff --git a/mpn/x86_64/logops_n.asm b/mpn/x86_64/logops_n.asm index 1df564a8f..02b9da549 100644 --- a/mpn/x86_64/logops_n.asm +++ b/mpn/x86_64/logops_n.asm @@ -1,6 +1,6 @@ dnl AMD64 logops. -dnl Copyright 2004, 2005, 2006 Free Software Foundation, Inc. +dnl Copyright 2004, 2005, 2006, 2011 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. 
@@ -72,6 +72,8 @@ define(`up',`%rsi') define(`vp',`%rdx') define(`n',`%rcx') +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) ASM_START() @@ -79,6 +81,7 @@ ifdef(`VARIANT_1',` TEXT ALIGN(32) PROLOGUE(func) + DOS64_ENTRY(4) movq (vp), %r8 movl R32(%rcx), R32(%rax) leaq (vp,n,8), vp @@ -117,7 +120,8 @@ L(e10): movq 24(vp,n,8), %r9 movq %r9, 24(rp,n,8) addq $4, n jnc L(oop) -L(ret): ret +L(ret): DOS64_EXIT() + ret EPILOGUE() ') @@ -125,6 +129,7 @@ ifdef(`VARIANT_2',` TEXT ALIGN(32) PROLOGUE(func) + DOS64_ENTRY(4) movq (vp), %r8 notq %r8 movl R32(%rcx), R32(%rax) @@ -168,7 +173,8 @@ L(e10): movq 24(vp,n,8), %r9 movq %r9, 24(rp,n,8) addq $4, n jnc L(oop) -L(ret): ret +L(ret): DOS64_EXIT() + ret EPILOGUE() ') @@ -176,6 +182,7 @@ ifdef(`VARIANT_3',` TEXT ALIGN(32) PROLOGUE(func) + DOS64_ENTRY(4) movq (vp), %r8 movl R32(%rcx), R32(%rax) leaq (vp,n,8), vp @@ -220,6 +227,7 @@ L(e10): movq 24(vp,n,8), %r9 movq %r9, 24(rp,n,8) addq $4, n jnc L(oop) -L(ret): ret +L(ret): DOS64_EXIT() + ret EPILOGUE() ') diff --git a/mpn/x86_64/lshift.asm b/mpn/x86_64/lshift.asm index 2f3d5c94d..5852ba9f9 100644 --- a/mpn/x86_64/lshift.asm +++ b/mpn/x86_64/lshift.asm @@ -1,6 +1,6 @@ dnl AMD64 mpn_lshift -- mpn left shift. -dnl Copyright 2003, 2005, 2007, 2009 Free Software Foundation, Inc. +dnl Copyright 2003, 2005, 2007, 2009, 2011 Free Software Foundation, Inc. dnl dnl This file is part of the GNU MP Library. 
dnl @@ -36,10 +36,14 @@ define(`up', `%rsi') define(`n', `%rdx') define(`cnt', `%rcx') +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(32) PROLOGUE(mpn_lshift) + DOS64_ENTRY(4) cmp $1, R8(%rcx) jne L(gen) @@ -83,6 +87,7 @@ L(t1): mov (up), %r8 dec R32(%rax) jne L(n00) adc R32(%rax), R32(%rax) + DOS64_EXIT() ret L(e1): test R32(%rax), R32(%rax) C clear cy L(n00): mov (up), %r8 @@ -91,6 +96,7 @@ L(n00): mov (up), %r8 adc %r8, %r8 mov %r8, (rp) L(ret): adc R32(%rax), R32(%rax) + DOS64_EXIT() ret L(n01): dec R32(%rax) mov 8(up), %r9 @@ -100,6 +106,7 @@ L(n01): dec R32(%rax) mov %r8, (rp) mov %r9, 8(rp) adc R32(%rax), R32(%rax) + DOS64_EXIT() ret L(n10): mov 16(up), %r10 adc %r8, %r8 @@ -109,6 +116,7 @@ L(n10): mov 16(up), %r10 mov %r9, 8(rp) mov %r10, 16(rp) adc $-1, R32(%rax) + DOS64_EXIT() ret L(gen): neg R32(%rcx) C put rsh count in cl @@ -222,5 +230,6 @@ L(end): L(ast): mov (up), %r10 shl R8(%rcx), %r10 mov %r10, (rp) + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/lshiftc.asm b/mpn/x86_64/lshiftc.asm index 93bb614d3..b4124b037 100644 --- a/mpn/x86_64/lshiftc.asm +++ b/mpn/x86_64/lshiftc.asm @@ -1,6 +1,6 @@ dnl AMD64 mpn_lshiftc -- mpn left shift with complement. -dnl Copyright 2003, 2005, 2006, 2009 Free Software Foundation, Inc. +dnl Copyright 2003, 2005, 2006, 2009, 2011 Free Software Foundation, Inc. dnl dnl This file is part of the GNU MP Library. 
dnl @@ -36,10 +36,14 @@ define(`up', `%rsi') define(`n', `%rdx') define(`cnt', `%rcx') +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(32) PROLOGUE(mpn_lshiftc) + DOS64_ENTRY(4) neg R32(%rcx) C put rsh count in cl mov -8(up,n,8), %rax shr R8(%rcx), %rax C function return value @@ -162,5 +166,6 @@ L(ast): mov (up), %r10 shl R8(%rcx), %r10 not %r10 mov %r10, (rp) + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/lshsub_n.asm b/mpn/x86_64/lshsub_n.asm index 3a42863ad..6e5816b1c 100644 --- a/mpn/x86_64/lshsub_n.asm +++ b/mpn/x86_64/lshsub_n.asm @@ -1,6 +1,6 @@ dnl AMD64 mpn_lshsub_n. R = 2^k(U - V). -dnl Copyright 2006 Free Software Foundation, Inc. +dnl Copyright 2006, 2011 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -44,10 +44,23 @@ define(`vp', `%rdx') define(`n', `%rcx') define(`cnt', `%r8') +ifdef(`HOST_DOS64',` + define(`IFDOS', `$1') + define(`IFELF', `') +',` + define(`IFDOS', `') + define(`IFELF', `$1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(16) PROLOGUE(mpn_lshsub_n) + DOS64_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') push %r12 push %r13 @@ -151,5 +164,6 @@ L(end): pop %r13 pop %r12 + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/mod_1_1.asm b/mpn/x86_64/mod_1_1.asm index 56f708a75..8afa96e05 100644 --- a/mpn/x86_64/mod_1_1.asm +++ b/mpn/x86_64/mod_1_1.asm @@ -67,10 +67,14 @@ C the source of the cmov in the loop. 
C C We have the invariant that r_2 B^2 + r_1 B + r_0 < B^2 + B b +C ABI_SUPPORT(DOS64) +C ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(16) PROLOGUE(mpn_mod_1_1p) + DOS64_ENTRY(4) push %rbp push %rbx mov %rdx, b @@ -163,6 +167,7 @@ L(ok): shr R8(%rcx), %rax pop %rbx pop %rbp + DOS64_EXIT() ret L(fix): sub b, %rax jmp L(ok) @@ -170,6 +175,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(mpn_mod_1_1p_cps) + DOS64_ENTRY(2) push %rbp bsr %rsi, %rcx push %rbx @@ -211,6 +217,7 @@ L(z): pop %r12 pop %rbx pop %rbp + DOS64_EXIT() ret EPILOGUE() ASM_END() diff --git a/mpn/x86_64/mod_1_2.asm b/mpn/x86_64/mod_1_2.asm index a0ecb6855..b09f24bc0 100644 --- a/mpn/x86_64/mod_1_2.asm +++ b/mpn/x86_64/mod_1_2.asm @@ -2,7 +2,7 @@ dnl AMD64 mpn_mod_1s_2p dnl Contributed to the GNU project by Torbjorn Granlund. -dnl Copyright 2009, 2010 Free Software Foundation, Inc. +dnl Copyright 2009, 2010, 2011 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -31,10 +31,14 @@ C Intel SBR 4.5 C Intel atom 28 C VIA nano 8 +C ABI_SUPPORT(DOS64) +C ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(16) PROLOGUE(mpn_mod_1s_2p) + DOS64_ENTRY(4) push %r14 test $1, R8(%rsi) mov %rdx, %r14 @@ -145,6 +149,7 @@ L(1): xor R32(%rcx), R32(%rcx) pop %r12 pop %r13 pop %r14 + DOS64_EXIT() ret L(one): mov (%rdi), %r8 @@ -154,6 +159,7 @@ L(one): EPILOGUE() PROLOGUE(mpn_mod_1s_2p_cps) + DOS64_ENTRY(2) push %rbp bsr %rsi, %rcx push %rbx @@ -214,5 +220,6 @@ ifdef(`SHLD_SLOW',` pop %r12 pop %rbx pop %rbp + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/mod_1_4.asm b/mpn/x86_64/mod_1_4.asm index d99080d7f..3068e3def 100644 --- a/mpn/x86_64/mod_1_4.asm +++ b/mpn/x86_64/mod_1_4.asm @@ -2,7 +2,7 @@ dnl AMD64 mpn_mod_1s_4p dnl Contributed to the GNU project by Torbjorn Granlund. -dnl Copyright 2009, 2010 Free Software Foundation, Inc. +dnl Copyright 2009, 2010, 2011 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. 
@@ -30,17 +30,22 @@ C Intel corei 4 C Intel atom 23 C VIA nano 4.75 +C ABI_SUPPORT(DOS64) +C ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(16) PROLOGUE(mpn_mod_1s_4p) + DOS64_ENTRY(4) + push %r15 push %r14 push %r13 push %r12 push %rbp push %rbx - mov %rdx, -16(%rsp) + mov %rdx, %r15 mov %rcx, %r14 mov 16(%rcx), %r11 C B1modb mov 24(%rcx), %rbx C B2modb @@ -135,7 +140,7 @@ L(end): mov 8(%r14), R32(%rsi) or %rdx, %rdi mov %rdi, %rax mulq (%r14) - mov -16(%rsp), %rbx + mov %r15, %rbx mov %rax, %r9 sal R8(%rcx), %r8 inc %rdi @@ -155,11 +160,13 @@ L(end): mov 8(%r14), R32(%rsi) pop %r12 pop %r13 pop %r14 + DOS64_EXIT() ret EPILOGUE() ALIGN(16) PROLOGUE(mpn_mod_1s_4p_cps) + DOS64_ENTRY(2) push %rbp bsr %rsi, %rcx push %rbx @@ -244,5 +251,6 @@ ifdef(`SHLD_SLOW',` pop %r12 pop %rbx pop %rbp + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/mod_34lsub1.asm b/mpn/x86_64/mod_34lsub1.asm index 08cd7d939..ee4d0d347 100644 --- a/mpn/x86_64/mod_34lsub1.asm +++ b/mpn/x86_64/mod_34lsub1.asm @@ -1,7 +1,7 @@ dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1. -dnl Copyright 2000, 2001, 2002, 2004, 2005, 2007, 2009, 2010 Free Software -dnl Foundation, Inc. +dnl Copyright 2000, 2001, 2002, 2004, 2005, 2007, 2009, 2010, 2011 Free +dnl Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -39,10 +39,14 @@ C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n) C TODO C * Review feed-in and wind-down code. 
+C ABI_SUPPORT(DOS64) +C ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(32) PROLOGUE(mpn_mod_34lsub1) + DOS64_ENTRY(2) mov $0x0000FFFFFFFFFFFF, %r11 @@ -66,7 +70,8 @@ PROLOGUE(mpn_mod_34lsub1) shl $16, %rdx C src[1] low add %rdx, %rax -L(one): ret +L(one): DOS64_EXIT() + ret C Don't change this, the wind-down code is not able to handle greater values @@ -176,5 +181,6 @@ L(0): add %r9, %rax add %rdx, %rax C apply 2mod3 high add %rdi, %rax C apply 2mod3 low + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/mul_2.asm b/mpn/x86_64/mul_2.asm index 206a4ea2c..35deefa8b 100644 --- a/mpn/x86_64/mul_2.asm +++ b/mpn/x86_64/mul_2.asm @@ -1,7 +1,7 @@ dnl AMD64 mpn_mul_2 -- Multiply an n-limb vector with a 2-limb vector and dnl store the result in a third limb vector. -dnl Copyright 2008 Free Software Foundation, Inc. +dnl Copyright 2008, 2011 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -53,10 +53,14 @@ define(`w2', `%rbp') define(`w3', `%r10') define(`n', `%r11') +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(16) PROLOGUE(mpn_mul_2) + DOS64_ENTRY(4) push %rbx push %rbp @@ -172,5 +176,6 @@ L(m22): mul v1 pop %rbp pop %rbx + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/mulmid_basecase.asm b/mpn/x86_64/mulmid_basecase.asm index 375e7f70e..d2d56d4a4 100644 --- a/mpn/x86_64/mulmid_basecase.asm +++ b/mpn/x86_64/mulmid_basecase.asm @@ -50,11 +50,23 @@ define(`vp', `%r15') define(`vp_inner', `%r10') +ifdef(`HOST_DOS64',` + define(`IFDOS', `$1') + define(`IFELF', `') +',` + define(`IFDOS', `') + define(`IFELF', `$1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) ASM_START() TEXT ALIGN(16) PROLOGUE(mpn_mulmid_basecase) + DOS64_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') push %rbx push %rbp push %r12 @@ -539,6 +551,6 @@ L(ret): pop %r15 pop %r12 pop %rbp pop %rbx + DOS64_EXIT() ret - EPILOGUE() diff --git a/mpn/x86_64/popham.asm b/mpn/x86_64/popham.asm index 9db368106..999452328 100644 --- a/mpn/x86_64/popham.asm +++ 
b/mpn/x86_64/popham.asm @@ -1,6 +1,6 @@ dnl AMD64 mpn_popcount, mpn_hamdist -- population count and hamming distance. -dnl Copyright 2004, 2005, 2007, 2010 Free Software Foundation, Inc. +dnl Copyright 2004, 2005, 2007, 2010, 2011 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -44,6 +44,7 @@ ifdef(`OPERATION_popcount',` define(`h33333333', `%r11') define(`h0f0f0f0f', `%rcx') define(`h01010101', `%rdx') + define(`POP', `$1') define(`HAM', `dnl') ') ifdef(`OPERATION_hamdist',` @@ -55,17 +56,22 @@ ifdef(`OPERATION_hamdist',` define(`h33333333', `%r11') define(`h0f0f0f0f', `%rcx') define(`h01010101', `%r14') + define(`POP', `dnl') define(`HAM', `$1') ') MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(32) PROLOGUE(func) - + POP(` DOS64_ENTRY(2) ') + HAM(` DOS64_ENTRY(3) ') push %r12 push %r13 HAM(` push %r14 ') @@ -155,6 +161,6 @@ L(end): HAM(` pop %r14 ') pop %r13 pop %r12 + DOS64_EXIT() ret - EPILOGUE() diff --git a/mpn/x86_64/redc_1.asm b/mpn/x86_64/redc_1.asm index 8d731c68c..53b5641a0 100644 --- a/mpn/x86_64/redc_1.asm +++ b/mpn/x86_64/redc_1.asm @@ -49,10 +49,14 @@ define(`n', `%r13') define(`i', `%r11') define(`nneg', `%r12') +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(32) PROLOGUE(mpn_redc_1) + DOS64_ENTRY(4) push %rbp push %rbx push %r12 @@ -293,5 +297,6 @@ L(ret): pop %r14 pop %r12 pop %rbx pop %rbp + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/rsh1aors_n.asm b/mpn/x86_64/rsh1aors_n.asm index c4a336446..1b6a103f1 100644 --- a/mpn/x86_64/rsh1aors_n.asm +++ b/mpn/x86_64/rsh1aors_n.asm @@ -1,7 +1,7 @@ dnl AMD64 mpn_rsh1add_n -- rp[] = (up[] + vp[]) >> 1 dnl AMD64 mpn_rsh1sub_n -- rp[] = (up[] - vp[]) >> 1 -dnl Copyright 2003, 2005, 2009 Free Software Foundation, Inc. +dnl Copyright 2003, 2005, 2009, 2011 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. 
@@ -53,11 +53,24 @@ ifdef(`OPERATION_rsh1sub_n', ` MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc) +ifdef(`HOST_DOS64',` + define(`IFDOS', `$1') + define(`IFELF', `') +',` + define(`IFDOS', `') + define(`IFELF', `$1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(16) PROLOGUE(func_nc) + DOS64_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') push %rbx xor R32(%rax), R32(%rax) @@ -69,6 +82,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(func_n) + DOS64_ENTRY(4) push %rbx xor R32(%rax), R32(%rax) @@ -169,5 +183,6 @@ L(top): add %rbx, %rbx C rotate carry limb, restore acy L(end): mov %rbx, (rp) pop %rbx + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/rshift.asm b/mpn/x86_64/rshift.asm index 0f822a4a0..57a4ab093 100644 --- a/mpn/x86_64/rshift.asm +++ b/mpn/x86_64/rshift.asm @@ -1,6 +1,6 @@ dnl AMD64 mpn_rshift -- mpn right shift. -dnl Copyright 2003, 2005, 2009 Free Software Foundation, Inc. +dnl Copyright 2003, 2005, 2009, 2011 Free Software Foundation, Inc. dnl dnl This file is part of the GNU MP Library. 
dnl @@ -36,10 +36,14 @@ define(`up', `%rsi') define(`n', `%rdx') define(`cnt', `%rcx') +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(32) PROLOGUE(mpn_rshift) + DOS64_ENTRY(4) neg R32(%rcx) C put rsh count in cl mov (up), %rax shl R8(%rcx), %rax C function return value @@ -156,5 +160,6 @@ L(end): L(ast): mov (up), %r10 shr R8(%rcx), %r10 mov %r10, (rp) + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/sqr_basecase.asm b/mpn/x86_64/sqr_basecase.asm index f71627ab9..71195d7ae 100644 --- a/mpn/x86_64/sqr_basecase.asm +++ b/mpn/x86_64/sqr_basecase.asm @@ -75,14 +75,6 @@ define(`w1', `%rcx') define(`w2', `%rbp') define(`w3', `%r10') -ifdef(`HOST_DOS64',` - define(`IFDOS', `$1') - define(`IFELF', `') -',` - define(`IFDOS', `') - define(`IFELF', `$1') -') - ABI_SUPPORT(DOS64) ABI_SUPPORT(ELF64) diff --git a/mpn/x86_64/sublsh1_n.asm b/mpn/x86_64/sublsh1_n.asm index a2f48c007..a0515cf18 100644 --- a/mpn/x86_64/sublsh1_n.asm +++ b/mpn/x86_64/sublsh1_n.asm @@ -1,6 +1,6 @@ dnl AMD64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1) -dnl Copyright 2003, 2005, 2006, 2007 Free Software Foundation, Inc. +dnl Copyright 2003, 2005, 2006, 2007, 2011 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. 
@@ -41,10 +41,14 @@ define(`up',`%rsi') define(`vp',`%rdx') define(`n', `%rcx') +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(16) PROLOGUE(mpn_sublsh1_n) + DOS64_ENTRY(4) push %rbx push %rbp @@ -140,5 +144,6 @@ L(end): add R32(%rbp), R32(%rax) pop %rbp pop %rbx + DOS64_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/tabselect.asm b/mpn/x86_64/tabselect.asm index 2611b3212..a6699a9a4 100644 --- a/mpn/x86_64/tabselect.asm +++ b/mpn/x86_64/tabselect.asm @@ -50,10 +50,23 @@ define(`maskn', `%r12') C rax rbx rcx rdx rdi rsi rbp (rsp) r8 r9 r10 r11 r12 r13 r14 r15 C nents n rp tab which +ifdef(`HOST_DOS64',` + define(`IFDOS', `$1') + define(`IFELF', `') +',` + define(`IFDOS', `') + define(`IFELF', `$1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(ELF64) + ASM_START() TEXT ALIGN(16) PROLOGUE(mpn_tabselect) + DOS64_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') push %rbx push %rbp push %r12 @@ -105,5 +118,6 @@ L(outer_end): pop %r12 pop %rbp pop %rbx + DOS64_EXIT() ret EPILOGUE() -- cgit v1.2.1 From 2c033efc02631f22e6e180ce737a2faf81b09ccc Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Tue, 29 Nov 2011 23:28:07 +0100 Subject: Fix typo in last change (thanks Marco!). --- mpn/x86_64/mod_1_4.asm | 1 + 1 file changed, 1 insertion(+) diff --git a/mpn/x86_64/mod_1_4.asm b/mpn/x86_64/mod_1_4.asm index 3068e3def..629520877 100644 --- a/mpn/x86_64/mod_1_4.asm +++ b/mpn/x86_64/mod_1_4.asm @@ -160,6 +160,7 @@ L(end): mov 8(%r14), R32(%rsi) pop %r12 pop %r13 pop %r14 + pop %r15 DOS64_EXIT() ret EPILOGUE() -- cgit v1.2.1