From f2112c9a69c750288e0a4b032bcd0ebb004b92eb Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Thu, 3 Nov 2011 01:02:27 +0100
Subject: Provide gmp-mparam.h for POWER7.

---
 mpn/powerpc64/mode64/p7/gmp-mparam.h | 155 +++++++++++++++++++++++++++++++++++
 1 file changed, 155 insertions(+)
 create mode 100644 mpn/powerpc64/mode64/p7/gmp-mparam.h

diff --git a/mpn/powerpc64/mode64/p7/gmp-mparam.h b/mpn/powerpc64/mode64/p7/gmp-mparam.h
new file mode 100644
index 000000000..57b888637
--- /dev/null
+++ b/mpn/powerpc64/mode64/p7/gmp-mparam.h
@@ -0,0 +1,155 @@
+/* POWER7 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010, 2011
+Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define BYTES_PER_MP_LIMB 8
+
+/* 3550 MHz POWER7 (gcc110.fsffrance.org) */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          6
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         7
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        22
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     13
+#define USE_PREINV_DIVREM_1                  0
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           28
+
+#define MUL_TOOM22_THRESHOLD                22
+#define MUL_TOOM33_THRESHOLD                73
+#define MUL_TOOM44_THRESHOLD               202
+#define MUL_TOOM6H_THRESHOLD               298
+#define MUL_TOOM8H_THRESHOLD               406
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     143
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     135
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     141
+
+#define SQR_BASECASE_THRESHOLD              10
+#define SQR_TOOM2_THRESHOLD                 50
+#define SQR_TOOM3_THRESHOLD                 84
+#define SQR_TOOM4_THRESHOLD                160
+#define SQR_TOOM6_THRESHOLD                246
+#define SQR_TOOM8_THRESHOLD                296
+
+#define MULMID_TOOM42_THRESHOLD             62
+
+#define MULMOD_BNM1_THRESHOLD               15
+#define SQRMOD_BNM1_THRESHOLD               16
+
+#define MUL_FFT_MODF_THRESHOLD             436  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    436, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     21, 7}, {     11, 6}, {     23, 7}, {     12, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     31, 7}, {     21, 8}, {     11, 7}, {     25, 8}, \
+    {     13, 7}, {     28, 8}, {     15, 7}, {     32, 8}, \
+    {     17, 7}, {     35, 8}, {     19, 7}, {     39, 8}, \
+    {     21, 9}, {     11, 8}, {     29, 9}, {     15, 8}, \
+    {     35, 9}, {     19, 8}, {     41, 9}, {     23, 8}, \
+    {     47, 9}, {     27,10}, {     15, 9}, {     31, 8}, \
+    {     63, 9}, {     43,10}, {     23, 9}, {     51,11}, \
+    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     79,10}, {     47, 9}, {     95,10}, {     55,11}, \
+    {     31,10}, {     79,11}, {     47,10}, {     95,12}, \
+    {     31,11}, {     63,10}, {    135,11}, {     79,10}, \
+    {    159,11}, {     95,10}, {    191,11}, {    111,12}, \
+    {     63,11}, {    127,10}, {    255,11}, {    143,10}, \
+    {    287, 9}, {    575,10}, {    303,11}, {    159,12}, \
+    {     95,11}, {    191,10}, {    383,13}, {     63,12}, \
+    {    127,11}, {    255,10}, {    511,11}, {    271,10}, \
+    {    543, 9}, {   1087,11}, {    287,10}, {    575,11}, \
+    {    303,12}, {    159,11}, {    319,10}, {    639,11}, \
+    {    335,10}, {    671,11}, {    351,10}, {    703,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,10}, \
+    {    831,12}, {    223,11}, {    447,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16}, {  65536,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 106
+#define MUL_FFT_THRESHOLD                 4736
+
+#define SQR_FFT_MODF_THRESHOLD             308  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    308, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     21, 7}, {     11, 6}, {     23, 7}, {     21, 8}, \
+    {     11, 7}, {     24, 8}, {     13, 7}, {     27, 8}, \
+    {     15, 7}, {     31, 8}, {     21, 9}, {     11, 8}, \
+    {     27, 9}, {     15, 8}, {     33, 9}, {     19, 8}, \
+    {     41, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     47,11}, \
+    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     79,10}, {     47,11}, {     31,10}, {     79,11}, \
+    {     47,10}, {     95,12}, {     31,11}, {     63,10}, \
+    {    127, 9}, {    255,10}, {    135,11}, {     79,10}, \
+    {    159, 9}, {    319,11}, {     95,10}, {    191, 9}, \
+    {    383,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,11}, {    143,10}, \
+    {    287, 9}, {    575,11}, {    159,10}, {    319, 9}, \
+    {    639,11}, {    175,12}, {     95,11}, {    191,10}, \
+    {    383, 9}, {    767,11}, {    207,13}, {     63,12}, \
+    {    127,11}, {    255,10}, {    511,11}, {    271,10}, \
+    {    543,11}, {    287,10}, {    575,11}, {    303,12}, \
+    {    159,11}, {    319,10}, {    639, 9}, {   1279,10}, \
+    {    671,11}, {    351,10}, {    703,12}, {    191,11}, \
+    {    383,10}, {    767,11}, {    415,10}, {    831,12}, \
+    {    223,11}, {    447,10}, {    895,11}, {    479,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 103
+#define SQR_FFT_THRESHOLD                 3264
+
+#define MULLO_BASECASE_THRESHOLD             4
+#define MULLO_DC_THRESHOLD                  34
+#define MULLO_MUL_N_THRESHOLD             9174
+
+#define DC_DIV_QR_THRESHOLD                 30
+#define DC_DIVAPPR_Q_THRESHOLD             124
+#define DC_BDIV_QR_THRESHOLD                66
+#define DC_BDIV_Q_THRESHOLD                160
+
+#define INV_MULMOD_BNM1_THRESHOLD           81
+#define INV_NEWTON_THRESHOLD               165
+#define INV_APPR_THRESHOLD                 133
+
+#define BINV_NEWTON_THRESHOLD              300
+#define REDC_1_TO_REDC_N_THRESHOLD          76
+
+#define MU_DIV_QR_THRESHOLD               1470
+#define MU_DIVAPPR_Q_THRESHOLD            1442
+#define MUPI_DIV_QR_THRESHOLD               58
+#define MU_BDIV_QR_THRESHOLD              1470
+#define MU_BDIV_Q_THRESHOLD               1499
+
+#define MATRIX22_STRASSEN_THRESHOLD         15
+#define HGCD_THRESHOLD                     121
+#define GCD_DC_THRESHOLD                   443
+#define GCDEXT_DC_THRESHOLD                396
+#define JACOBI_BASE_METHOD                   4
+
+#define GET_STR_DC_THRESHOLD                11
+#define GET_STR_PRECOMPUTE_THRESHOLD        22
+#define SET_STR_DC_THRESHOLD              1517
+#define SET_STR_PRECOMPUTE_THRESHOLD      4040
-- 
cgit v1.2.1


From 7efbd396826cff03514bdc27356fa34fcd323f58 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Thu, 3 Nov 2011 01:19:16 +0100
Subject: Add POWER7 cycle counts.

---
 mpn/powerpc64/com.asm                     |  9 ++++++---
 mpn/powerpc64/copyd.asm                   |  9 ++++++---
 mpn/powerpc64/copyi.asm                   |  9 ++++++---
 mpn/powerpc64/logops_n.asm                |  9 ++++++---
 mpn/powerpc64/lshift.asm                  | 11 ++++++-----
 mpn/powerpc64/mode64/aors_n.asm           | 11 ++++++-----
 mpn/powerpc64/mode64/aorslshC_n.asm       | 11 ++++++-----
 mpn/powerpc64/mode64/aorsmul_1.asm        | 13 +++++++------
 mpn/powerpc64/mode64/bdiv_dbm1c.asm       |  4 +++-
 mpn/powerpc64/mode64/dive_1.asm           | 11 ++++++-----
 mpn/powerpc64/mode64/divrem_1.asm         | 13 +++++++------
 mpn/powerpc64/mode64/divrem_2.asm         | 11 ++++++-----
 mpn/powerpc64/mode64/invert_limb.asm      | 11 ++++++-----
 mpn/powerpc64/mode64/lshiftc.asm          | 11 ++++++-----
 mpn/powerpc64/mode64/mod_1_1.asm          | 11 ++++++-----
 mpn/powerpc64/mode64/mod_1_4.asm          | 11 ++++++-----
 mpn/powerpc64/mode64/mod_34lsub1.asm      | 11 ++++++-----
 mpn/powerpc64/mode64/mode1o.asm           | 10 ++++++----
 mpn/powerpc64/mode64/mul_1.asm            | 11 ++++++-----
 mpn/powerpc64/mode64/mul_basecase.asm     | 10 +++++-----
 mpn/powerpc64/mode64/p5/gmp-mparam.h      |  2 +-
 mpn/powerpc64/mode64/p6/gmp-mparam.h      |  2 +-
 mpn/powerpc64/mode64/rsh1add_n.asm        | 11 ++++++-----
 mpn/powerpc64/mode64/rsh1sub_n.asm        | 11 ++++++-----
 mpn/powerpc64/mode64/sqr_diag_addlsh1.asm | 11 ++++++-----
 mpn/powerpc64/rshift.asm                  | 11 ++++++-----
 26 files changed, 144 insertions(+), 111 deletions(-)

diff --git a/mpn/powerpc64/com.asm b/mpn/powerpc64/com.asm
index 4fb2e65d7..cb89bade2 100644
--- a/mpn/powerpc64/com.asm
+++ b/mpn/powerpc64/com.asm
@@ -19,9 +19,12 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C		cycles/limb
-C POWER3/PPC630:     1?
-C POWER4/PPC970:     1.6
+C                  cycles/limb
+C POWER3/PPC630          1?
+C POWER4/PPC970          1.6
+C POWER5                 ?
+C POWER6                 ?
+C POWER7                 1.45
 
 C TODO
 C  * 8-way unrolling brings timing down to about 1.3 cycles/limb.
diff --git a/mpn/powerpc64/copyd.asm b/mpn/powerpc64/copyd.asm
index 6a46a433c..256e7dc12 100644
--- a/mpn/powerpc64/copyd.asm
+++ b/mpn/powerpc64/copyd.asm
@@ -19,9 +19,12 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C		cycles/limb
-C POWER3/PPC630:     1
-C POWER4/PPC970:     1
+C                  cycles/limb
+C POWER3/PPC630          1
+C POWER4/PPC970          1
+C POWER5                 ?
+C POWER6                 ?
+C POWER7                 1.4
 
 C INPUT PARAMETERS
 C rp	r3
diff --git a/mpn/powerpc64/copyi.asm b/mpn/powerpc64/copyi.asm
index 5cb7e4856..31d1fc2e7 100644
--- a/mpn/powerpc64/copyi.asm
+++ b/mpn/powerpc64/copyi.asm
@@ -19,9 +19,12 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C		cycles/limb
-C POWER3/PPC630:     1
-C POWER4/PPC970:     1
+C                  cycles/limb
+C POWER3/PPC630          1
+C POWER4/PPC970          1
+C POWER5                 ?
+C POWER6                 ?
+C POWER7                 1.4
 
 C INPUT PARAMETERS
 C rp	r3
diff --git a/mpn/powerpc64/logops_n.asm b/mpn/powerpc64/logops_n.asm
index 917b59f45..2caa2c7c6 100644
--- a/mpn/powerpc64/logops_n.asm
+++ b/mpn/powerpc64/logops_n.asm
@@ -20,9 +20,12 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C		cycles/limb
-C POWER3/PPC630:     1.75
-C POWER4/PPC970:     2.10
+C                  cycles/limb
+C POWER3/PPC630          1.75
+C POWER4/PPC970          2.10
+C POWER5                 ?
+C POWER6                 ?
+C POWER7                 1.75
 
 C   n	   POWER3/PPC630   POWER4/PPC970
 C     1	       15.00	       15.33
diff --git a/mpn/powerpc64/lshift.asm b/mpn/powerpc64/lshift.asm
index f97661ae7..eb70c4983 100644
--- a/mpn/powerpc64/lshift.asm
+++ b/mpn/powerpc64/lshift.asm
@@ -19,11 +19,12 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C		    cycles/limb
-C POWER3/PPC630		 ?
-C POWER4/PPC970		 ?
-C POWER5		 2.25
-C POWER6		 9.75
+C                   cycles/limb
+C POWER3/PPC630          ?
+C POWER4/PPC970          ?
+C POWER5                 2.25
+C POWER6                 9.75
+C POWER7                 2.15
 
 C TODO
 C  * Try to reduce the number of needed live registers
diff --git a/mpn/powerpc64/mode64/aors_n.asm b/mpn/powerpc64/mode64/aors_n.asm
index 980525f67..c6ea35089 100644
--- a/mpn/powerpc64/mode64/aors_n.asm
+++ b/mpn/powerpc64/mode64/aors_n.asm
@@ -20,11 +20,12 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C		    cycles/limb
-C POWER3/PPC630		 1.5
-C POWER4/PPC970		 2
-C POWER5		 2.25
-C POWER6		 2.63
+C                   cycles/limb
+C POWER3/PPC630          1.5
+C POWER4/PPC970          2
+C POWER5                 2.25
+C POWER6                 2.63
+C POWER7               2.25-2.87
 
 C This code is a little bit slower for POWER3/PPC630 than the simple code used
 C previously, but it is much faster for POWER4/PPC970.  The reason for the
diff --git a/mpn/powerpc64/mode64/aorslshC_n.asm b/mpn/powerpc64/mode64/aorslshC_n.asm
index 4622cd946..3776d3e59 100644
--- a/mpn/powerpc64/mode64/aorslshC_n.asm
+++ b/mpn/powerpc64/mode64/aorslshC_n.asm
@@ -17,11 +17,12 @@ dnl  License for more details.
 dnl  You should have received a copy of the GNU Lesser General Public License
 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
-C		   cycles/limb
-C POWER3/PPC630		 1.83	(1.5 c/l should be possible)
-C POWER4/PPC970		 3	(2.0 c/l should be possible)
-C POWER5		 3
-C POWER6	      3.5-47
+C                  cycles/limb
+C POWER3/PPC630          1.83   (1.5 c/l should be possible)
+C POWER4/PPC970          3      (2.0 c/l should be possible)
+C POWER5                 3
+C POWER6              3.5-47
+C POWER7                 3
 
 C STATUS
 C  * Try combining upx+up, and vpx+vp.
diff --git a/mpn/powerpc64/mode64/aorsmul_1.asm b/mpn/powerpc64/mode64/aorsmul_1.asm
index b1a3315b6..658a2d941 100644
--- a/mpn/powerpc64/mode64/aorsmul_1.asm
+++ b/mpn/powerpc64/mode64/aorsmul_1.asm
@@ -20,12 +20,13 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C		mpn_addmul_1	mpn_submul_1
-C		cycles/limb	cycles/limb
-C POWER3/PPC630   6-18		   6-18
-C POWER4/PPC970	   8		    8.3
-C POWER5	   8		    8.25
-C POWER6	  16.25		   16.75
+C               mpn_addmul_1    mpn_submul_1
+C               cycles/limb     cycles/limb
+C POWER3/PPC630   6-18             6-18
+C POWER4/PPC970    8                8.3
+C POWER5           8                8.25
+C POWER6          16.25            16.75
+C POWER7           3.77             4.9
 
 C TODO
 C  * Try to reduce the number of needed live registers
diff --git a/mpn/powerpc64/mode64/bdiv_dbm1c.asm b/mpn/powerpc64/mode64/bdiv_dbm1c.asm
index 40f3d4ec7..e88fc4440 100644
--- a/mpn/powerpc64/mode64/bdiv_dbm1c.asm
+++ b/mpn/powerpc64/mode64/bdiv_dbm1c.asm
@@ -19,11 +19,12 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C		  cycles/limb
+C                 cycles/limb
 C POWER3/PPC630       6-18
 C POWER4/PPC970       8.5?
 C POWER5              8.5  fluctuating as function of n % 3
 C POWER6             15
+C POWER7              4.75
 
 C TODO
 C  * Nothing to do...
diff --git a/mpn/powerpc64/mode64/dive_1.asm b/mpn/powerpc64/mode64/dive_1.asm
index d457d65e9..0f94154bf 100644
--- a/mpn/powerpc64/mode64/dive_1.asm
+++ b/mpn/powerpc64/mode64/dive_1.asm
@@ -19,12 +19,13 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C			cycles/limb
-C			norm	unorm
+C                       cycles/limb
+C                       norm    unorm
 C POWER3/PPC630        13-19
-C POWER4/PPC970		16
-C POWER5		16	16
-C POWER6		37	46
+C POWER4/PPC970         16
+C POWER5                16      16
+C POWER6                37      46
+C POWER7                12      12
 
 C TODO
 C  * Check if n=1 code is really an improvement.  It probably isn't.
diff --git a/mpn/powerpc64/mode64/divrem_1.asm b/mpn/powerpc64/mode64/divrem_1.asm
index 9d065b728..c0e7b2a9f 100644
--- a/mpn/powerpc64/mode64/divrem_1.asm
+++ b/mpn/powerpc64/mode64/divrem_1.asm
@@ -20,12 +20,13 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C			    cycles/limb
-C			norm	unorm	frac
-C POWER3/PPC630		16-34	16-34	~11
-C POWER4/PPC970		 29		 19
-C POWER5		 29	 29	~20
-C POWER6		 50	 59	~42
+C                           cycles/limb
+C                       norm    unorm   frac
+C POWER3/PPC630         16-34   16-34   ~11
+C POWER4/PPC970          29              19
+C POWER5                 29      29     ~20
+C POWER6                 50      59     ~42
+C POWER7                 25      25     ~14
 
 C INPUT PARAMETERS
 C qp  = r3
diff --git a/mpn/powerpc64/mode64/divrem_2.asm b/mpn/powerpc64/mode64/divrem_2.asm
index 53ef1c708..18f549357 100644
--- a/mpn/powerpc64/mode64/divrem_2.asm
+++ b/mpn/powerpc64/mode64/divrem_2.asm
@@ -19,12 +19,13 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C			cycles/limb
-C			norm	frac
+C                       cycles/limb
+C                       norm    frac
 C POWER3/PPC630
-C POWER4/PPC970		?	?
-C POWER5		37	?
-C POWER6		62	?
+C POWER4/PPC970         ?       ?
+C POWER5                37      ?
+C POWER6                62      ?
+C POWER7                30.5    ?
 
 C INPUT PARAMETERS
 C qp  = r3
diff --git a/mpn/powerpc64/mode64/invert_limb.asm b/mpn/powerpc64/mode64/invert_limb.asm
index aed0a32ab..31b243001 100644
--- a/mpn/powerpc64/mode64/invert_limb.asm
+++ b/mpn/powerpc64/mode64/invert_limb.asm
@@ -19,11 +19,12 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C		   cycles/limb (approximate)
-C POWER3/PPC630		80
-C POWER4/PPC970		86
-C POWER5		86
-C POWER6	       170
+C                  cycles/limb (approximate)
+C POWER3/PPC630         80
+C POWER4/PPC970         86
+C POWER5                86
+C POWER6               170
+C POWER7                66
 
 ASM_START()
 PROLOGUE(mpn_invert_limb)
diff --git a/mpn/powerpc64/mode64/lshiftc.asm b/mpn/powerpc64/mode64/lshiftc.asm
index 647244d1f..bca55638f 100644
--- a/mpn/powerpc64/mode64/lshiftc.asm
+++ b/mpn/powerpc64/mode64/lshiftc.asm
@@ -19,11 +19,12 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C		    cycles/limb
-C POWER3/PPC630		 ?
-C POWER4/PPC970		 ?
-C POWER5		 2.25
-C POWER6		 9.5
+C                   cycles/limb
+C POWER3/PPC630          ?
+C POWER4/PPC970          ?
+C POWER5                 2.25
+C POWER6                 9.5
+C POWER7                 2.15
 
 C TODO
 C  * Try to reduce the number of needed live registers
diff --git a/mpn/powerpc64/mode64/mod_1_1.asm b/mpn/powerpc64/mode64/mod_1_1.asm
index 61e39310a..f24ceb2c8 100644
--- a/mpn/powerpc64/mode64/mod_1_1.asm
+++ b/mpn/powerpc64/mode64/mod_1_1.asm
@@ -19,11 +19,12 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C		    cycles/limb
-C POWER3/PPC630		 ?
-C POWER4/PPC970		17
-C POWER5		16
-C POWER6		30
+C                   cycles/limb
+C POWER3/PPC630          ?
+C POWER4/PPC970         17
+C POWER5                16
+C POWER6                30
+C POWER7                10.2
 
 C TODO
 C  * Optimise, in particular the cps function.  This was compiler-generated and
diff --git a/mpn/powerpc64/mode64/mod_1_4.asm b/mpn/powerpc64/mode64/mod_1_4.asm
index e0f26da96..b6163c5e7 100644
--- a/mpn/powerpc64/mode64/mod_1_4.asm
+++ b/mpn/powerpc64/mode64/mod_1_4.asm
@@ -19,11 +19,12 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C		    cycles/limb
-C POWER3/PPC630		 ?
-C POWER4/PPC970		 9
-C POWER5		 9
-C POWER6		13
+C                   cycles/limb
+C POWER3/PPC630          ?
+C POWER4/PPC970          9
+C POWER5                 9
+C POWER6                13
+C POWER7                3.5
 
 C TODO
 C  * Optimise, in particular the cps function.  This was compiler-generated and
diff --git a/mpn/powerpc64/mode64/mod_34lsub1.asm b/mpn/powerpc64/mode64/mod_34lsub1.asm
index 62ba17a3c..30b9f98be 100644
--- a/mpn/powerpc64/mode64/mod_34lsub1.asm
+++ b/mpn/powerpc64/mode64/mod_34lsub1.asm
@@ -19,11 +19,12 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C		    cycles/limb
-C POWER3/PPC630		 1.33
-C POWER4/PPC970		 1.5
-C POWER5		 1.32
-C POWER6		 2.35
+C                   cycles/limb
+C POWER3/PPC630          1.33
+C POWER4/PPC970          1.5
+C POWER5                 1.32
+C POWER6                 2.35
+C POWER7                 1
 
 C INPUT PARAMETERS
 define(`up',`r3')
diff --git a/mpn/powerpc64/mode64/mode1o.asm b/mpn/powerpc64/mode64/mode1o.asm
index 489ca8551..37e4028d8 100644
--- a/mpn/powerpc64/mode64/mode1o.asm
+++ b/mpn/powerpc64/mode64/mode1o.asm
@@ -19,10 +19,12 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C               cycles/limb
-C POWER3/PPC630:    13-19
-C POWER4/PPC970:     16
-C POWER5:            16
+C                  cycles/limb
+C POWER3/PPC630        13-19
+C POWER4/PPC970         16
+C POWER5                16
+C POWER6                 ?
+C POWER7                12
 
 C TODO
 C  * Check if n=1 code is really an improvement.  It probably isn't.
diff --git a/mpn/powerpc64/mode64/mul_1.asm b/mpn/powerpc64/mode64/mul_1.asm
index 12bff2fb6..e911cf551 100644
--- a/mpn/powerpc64/mode64/mul_1.asm
+++ b/mpn/powerpc64/mode64/mul_1.asm
@@ -21,11 +21,12 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C		cycles/limb
-C POWER3/PPC630	    6-18
-C POWER4/PPC970	    7.25?  not updated for last file revision
-C POWER5	    7.25
-C POWER6	   14
+C               cycles/limb
+C POWER3/PPC630     6-18
+C POWER4/PPC970     7.25?  not updated for last file revision
+C POWER5            7.25
+C POWER6           14
+C POWER7            2.9
 
 C TODO
 C  * Try to reduce the number of needed live registers (at least r5 and r10
diff --git a/mpn/powerpc64/mode64/mul_basecase.asm b/mpn/powerpc64/mode64/mul_basecase.asm
index fd7ff9aa1..a34f75962 100644
--- a/mpn/powerpc64/mode64/mul_basecase.asm
+++ b/mpn/powerpc64/mode64/mul_basecase.asm
@@ -20,11 +20,11 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C		   cycles/limb
-C POWER3/PPC630		6-18
-C POWER4/PPC970		 8
-C POWER5		 8
-C POWER6		24
+C                  cycles/limb
+C POWER3/PPC630         6-18
+C POWER4/PPC970          8
+C POWER5                 8
+C POWER6                24
 
 C INPUT PARAMETERS
 define(`rp', `r3')
diff --git a/mpn/powerpc64/mode64/p5/gmp-mparam.h b/mpn/powerpc64/mode64/p5/gmp-mparam.h
index 827b555c8..d177da94e 100644
--- a/mpn/powerpc64/mode64/p5/gmp-mparam.h
+++ b/mpn/powerpc64/mode64/p5/gmp-mparam.h
@@ -1,4 +1,4 @@
-/* gmp-mparam.h -- Compiler/machine parameter header file.
+/* POWER5 gmp-mparam.h -- Compiler/machine parameter header file.
 
 Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010 Free
 Software Foundation, Inc.
diff --git a/mpn/powerpc64/mode64/p6/gmp-mparam.h b/mpn/powerpc64/mode64/p6/gmp-mparam.h
index d447b56d9..88cac3e72 100644
--- a/mpn/powerpc64/mode64/p6/gmp-mparam.h
+++ b/mpn/powerpc64/mode64/p6/gmp-mparam.h
@@ -1,4 +1,4 @@
-/* gmp-mparam.h -- Compiler/machine parameter header file.
+/* POWER6 gmp-mparam.h -- Compiler/machine parameter header file.
 
 Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010 Free
 Software Foundation, Inc.
diff --git a/mpn/powerpc64/mode64/rsh1add_n.asm b/mpn/powerpc64/mode64/rsh1add_n.asm
index 8af3ca774..2a5ef3060 100644
--- a/mpn/powerpc64/mode64/rsh1add_n.asm
+++ b/mpn/powerpc64/mode64/rsh1add_n.asm
@@ -19,11 +19,12 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C		   cycles/limb
-C POWER3/PPC630		 2		(1.5 c/l should be possible)
-C POWER4/PPC970		 4		(2.0 c/l should be possible)
-C POWER5		 3.5		(2.0 c/l should be possible)
-C POWER6		 4.5
+C                  cycles/limb
+C POWER3/PPC630          2              (1.5 c/l should be possible)
+C POWER4/PPC970          4              (2.0 c/l should be possible)
+C POWER5                 3.5            (2.0 c/l should be possible)
+C POWER6                 4.5
+C POWER7                 3.5
 
 define(`rp',`r3')
 define(`up',`r4')
diff --git a/mpn/powerpc64/mode64/rsh1sub_n.asm b/mpn/powerpc64/mode64/rsh1sub_n.asm
index 1faa03379..b10eb8ab7 100644
--- a/mpn/powerpc64/mode64/rsh1sub_n.asm
+++ b/mpn/powerpc64/mode64/rsh1sub_n.asm
@@ -19,11 +19,12 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C		   cycles/limb
-C POWER3/PPC630		 2		(1.5 c/l should be possible)
-C POWER4/PPC970		 4		(2.0 c/l should be possible)
-C POWER5		 3.5		(2.0 c/l should be possible)
-C POWER6		 4.5
+C                  cycles/limb
+C POWER3/PPC630          2              (1.5 c/l should be possible)
+C POWER4/PPC970          4              (2.0 c/l should be possible)
+C POWER5                 3.5            (2.0 c/l should be possible)
+C POWER6                 4.5
+C POWER7                 3.5
 
 define(`rp',`r3')
 define(`up',`r4')
diff --git a/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm b/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm
index 663f04c14..a1903cb6e 100644
--- a/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm
+++ b/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm
@@ -19,11 +19,12 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C		cycles/limb
-C POWER3/PPC630   10
-C POWER4/PPC970	   6
-C POWER5	   5.375
-C POWER6	   8.5
+C               cycles/limb
+C POWER3/PPC630      10
+C POWER4/PPC970       6
+C POWER5              5.375
+C POWER6              8.5
+C POWER7              3.4
 
 C NOTES
 C  * This was written for POWER6 and its preferences for adjacent integer
diff --git a/mpn/powerpc64/rshift.asm b/mpn/powerpc64/rshift.asm
index 6545af769..18406c57e 100644
--- a/mpn/powerpc64/rshift.asm
+++ b/mpn/powerpc64/rshift.asm
@@ -19,11 +19,12 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C		    cycles/limb
-C POWER3/PPC630		 ?
-C POWER4/PPC970		 ?
-C POWER5		 2.25
-C POWER6		 9.75
+C                   cycles/limb
+C POWER3/PPC630          ?
+C POWER4/PPC970          ?
+C POWER5                 2.25
+C POWER6                 9.75
+C POWER7                 2.15
 
 C TODO
 C  * Try to reduce the number of needed live registers
-- 
cgit v1.2.1


From bd877338537856ee48e44fc80c92e58bfc68809f Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Thu, 3 Nov 2011 01:20:56 +0100
Subject: *** empty log message ***

---
 ChangeLog | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index 939030555..9bff5fdde 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2011-11-03  Torbjorn Granlund  <tege@gmplib.org>
+
+	* mpn/powerpc64/mode64/p7/gmp-mparam.h: New file.
+
 2011-11-02  Torbjorn Granlund  <tege@gmplib.org>
 
 	* mpn/s390_64/invert_limb.asm: Slight optimisation.
-- 
cgit v1.2.1


From 0254462dd44a2a730978ca1ca4d5c749ac51902a Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Thu, 3 Nov 2011 19:39:30 +0100
Subject: Pass -m32 in more cases via the _maybe mechanism. Inherit default
 gcc_cflags in more places.

---
 configure.in | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/configure.in b/configure.in
index 21defe968..9c0092427 100644
--- a/configure.in
+++ b/configure.in
@@ -648,7 +648,7 @@ case $host in
         # -mpa-risc-2-0 is only an optional flag, in case an old gcc is
         # used.  Assembler support for 2.0 is essential though, for our asm
         # files.
-	gcc_20n_cflags="-O2"
+	gcc_20n_cflags="$gcc_cflags"
 	gcc_20n_cflags_optlist="arch"
         gcc_20n_cflags_arch="-mpa-risc-2-0 -mpa-risc-1-1"
         gcc_20n_testlist="sizeof-long-4 hppa-level-2.0"
@@ -671,7 +671,7 @@ case $host in
           esac
 
           cclist_20w="gcc cc"
-	  gcc_20w_cflags="-O2 -mpa-risc-2-0"
+	  gcc_20w_cflags="$gcc_cflags -mpa-risc-2-0"
           cc_20w_cflags="+DD64 +O2"
           cc_20w_testlist="hpc-hppa-2-0"
           path_20w="pa64"
@@ -735,7 +735,7 @@ case $host in
         cc_32_cflags=""
         cc_32_cflags_optlist="opt"
         cc_32_cflags_opt="+O3 +O2 +O1"
-        gcc_32_cflags="-milp32 -O2"
+        gcc_32_cflags="$gcc_cflags -milp32"
         limb_32=longlong
         SPEED_CYCLECOUNTER_OBJ_32=ia64.lo
         cyclecounter_size_32=2
@@ -750,7 +750,7 @@ case $host in
         cc_64_cppflags="+DD64"
         cc_64_cflags_optlist="opt"
         cc_64_cflags_opt="+O3 +O2 +O1"
-        gcc_64_cflags="$gcc_64_cflags -mlp64"
+        gcc_64_cflags="$gcc_cflags -mlp64"
         ;;
     esac
     ;;
@@ -831,13 +831,13 @@ case $host in
         abilist="n32 64 o32"
 
         cclist_n32="gcc cc"
-        gcc_n32_cflags="-O2 -mabi=n32"
+        gcc_n32_cflags="$gcc_cflags -mabi=n32"
         cc_n32_cflags="-O2 -n32"	# no -g, it disables all optimizations
         limb_n32=longlong
         path_n32="mips64"
 
         cclist_64="gcc cc"
-        gcc_64_cflags="$gcc_64_cflags -mabi=64"
+        gcc_64_cflags="$gcc_cflags -mabi=64"
         gcc_64_ldflags="-Wc,-mabi=64"
         cc_64_cflags="-O2 -64"		# no -g, it disables all optimizations
         cc_64_ldflags="-Wc,-64"
@@ -969,7 +969,7 @@ case $host in
 	    # Need -Wc to pass object type flags through to the linker.
 	    abilist="mode64 $abilist"
 	    cclist_mode64="gcc xlc"
-	    gcc_mode64_cflags="-O2 -maix64 -mpowerpc64"
+	    gcc_mode64_cflags="$gcc_cflags -maix64 -mpowerpc64"
 	    gcc_mode64_cflags_optlist="cpu"
 	    gcc_mode64_ldflags="-Wc,-maix64"
 	    xlc_mode64_cflags="-O2 -q64 -qmaxmem=20000"
@@ -1014,6 +1014,7 @@ case $host in
 	    abilist="mode64 mode32 $abilist"
 	    gcc_cflags_opt="-O3 -O2 -O1"	# will this become used?
 	    cclist_mode32="gcc"
+	    gcc_mode32_cflags_maybe="-m32"
 	    gcc_mode32_cflags="-mpowerpc64"
 	    gcc_mode32_cflags_optlist="subtype cpu opt"
 	    gcc_mode32_cflags_subtype="-force_cpusubtype_ALL"
@@ -1057,6 +1058,7 @@ case $host in
 	    #
 	    abilist="mode64 mode32 $abilist"
 	    cclist_mode32="gcc"
+	    gcc_mode32_cflags_maybe="-m32"
 	    gcc_mode32_cflags="-mpowerpc64"
 	    gcc_mode32_cflags_optlist="cpu opt"
 	    gcc_mode32_cflags_opt="-O3 -O2 -O1"
@@ -1358,7 +1360,7 @@ case $host in
         # it until we're sure.  (Might want -xarch=v9a or -xarch=v9b for the
         # higher cpu types instead.)
         #
-        gcc_64_cflags="$gcc_64_cflags -m64 -mptr64"
+        gcc_64_cflags="$gcc_cflags -m64 -mptr64"
         gcc_64_ldflags="-Wc,-m64"
         gcc_64_cflags_optlist="cpu"
 
@@ -1580,7 +1582,7 @@ case $host in
     case $host in
       X86_64_PATTERN)
 	cclist_64="gcc"
-	gcc_64_cflags="$gcc_64_cflags -m64"
+	gcc_64_cflags="$gcc_cflags -m64"
 	gcc_64_cflags_optlist="cpu arch"
 	CALLING_CONVENTIONS_OBJS_64='amd64call.lo amd64check$U.lo'
 	SPEED_CYCLECOUNTER_OBJ_64=x86_64.lo
@@ -1625,7 +1627,7 @@ case $host in
 	    path_64=""	# Windows amd64 calling conventions are *different*
 	    extra_functions_64=""
 	    # Silence many pedantic warnings for w64.  FIXME.
-	    gcc_64_cflags="$gcc_64_cflags -std=gnu99"
+	    gcc_64_cflags="$gcc_cflags -std=gnu99"
 	    ;;
 	esac
 	;;
-- 
cgit v1.2.1


From ac8a2270a9a1c3596bc8abc2c3785ef324b85d5d Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Thu, 3 Nov 2011 19:39:45 +0100
Subject: *** empty log message ***

---
 ChangeLog | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index 9bff5fdde..0b26e4664 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,8 @@
 2011-11-03  Torbjorn Granlund  <tege@gmplib.org>
 
+	* configure.in: Pass -m32 in more cases, using via _maybe mechanism.
+	Inherit default gcc_cflags in more places.
+
 	* mpn/powerpc64/mode64/p7/gmp-mparam.h: New file.
 
 2011-11-02  Torbjorn Granlund  <tege@gmplib.org>
-- 
cgit v1.2.1


From f64a1e744b5e0c511dd012a46bc5a845e901836e Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Thu, 3 Nov 2011 22:42:23 +0100
Subject: Move file up from mode64.

---
 mpn/powerpc64/lshiftc.asm        | 198 +++++++++++++++++++++++++++++++++++++++
 mpn/powerpc64/mode64/lshiftc.asm | 195 --------------------------------------
 2 files changed, 198 insertions(+), 195 deletions(-)
 create mode 100644 mpn/powerpc64/lshiftc.asm
 delete mode 100644 mpn/powerpc64/mode64/lshiftc.asm

diff --git a/mpn/powerpc64/lshiftc.asm b/mpn/powerpc64/lshiftc.asm
new file mode 100644
index 000000000..8f470a5f4
--- /dev/null
+++ b/mpn/powerpc64/lshiftc.asm
@@ -0,0 +1,198 @@
+dnl  PowerPC-64 mpn_lshiftc -- rp[] = ~up[] << cnt
+
+dnl  Copyright 2003, 2005, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb
+C POWER3/PPC630          ?
+C POWER4/PPC970          ?
+C POWER5                 2.25
+C POWER6                 9.5
+C POWER7                 2.15
+
+C TODO
+C  * Try to reduce the number of needed live registers
+C  * Micro-optimise header code
+C  * Keep in synch with lshift.asm and rshift.asm
+
+C INPUT PARAMETERS
+define(`rp',  `r3')
+define(`up',  `r4')
+define(`n',   `r5')
+define(`cnt', `r6')
+
+define(`tnc',`r0')
+define(`u0',`r30')
+define(`u1',`r31')
+define(`retval',`r5')
+
+ASM_START()
+PROLOGUE(mpn_lshiftc)
+	std	r31, -8(r1)
+	std	r30, -16(r1)
+	subfic	tnc, cnt, 64
+	sldi	r7, n, 3	C byte count corresponding to n
+	add	up, up, r7	C up = up + n
+	add	rp, rp, r7	C rp = rp + n
+	rldicl.	r30, n, 0,62	C r30 = n & 3, set cr0
+	cmpdi	cr6, r30, 2
+	addi	r31, n, 3	C compute count...
+	ld	r10, -8(up)	C load 1st limb for b00...b11
+	srd	retval, r10, tnc
+	srdi	r31, r31, 2	C ...for ctr
+	mtctr	r31		C copy count into ctr
+	beq	cr0, L(b00)
+	blt	cr6, L(b01)
+	ld	r11, -16(up)	C load 2nd limb for b10 and b11
+	beq	cr6, L(b10)
+
+	ALIGN(16)
+L(b11):	sld	r8, r10, cnt
+	srd	r9, r11, tnc
+	ld	u1, -24(up)
+	addi	up, up, -24
+	sld	r12, r11, cnt
+	srd	r7, u1, tnc
+	addi	rp, rp, 16
+	bdnz	L(gt3)
+
+	nor	r11, r8, r9
+	sld	r8, u1, cnt
+	nor	r8, r8, r8
+	b	L(cj3)
+
+	ALIGN(16)
+L(gt3):	ld	u0, -8(up)
+	nor	r11, r8, r9
+	sld	r8, u1, cnt
+	srd	r9, u0, tnc
+	ld	u1, -16(up)
+	nor	r10, r12, r7
+	b	L(L11)
+
+	ALIGN(32)
+L(b10):	sld	r12, r10, cnt
+	addi	rp, rp, 24
+	srd	r7, r11, tnc
+	bdnz	L(gt2)
+
+	sld	r8, r11, cnt
+	nor	r10, r12, r7
+	nor	r8, r8, r8
+	b	L(cj2)
+
+L(gt2):	ld	u0, -24(up)
+	sld	r8, r11, cnt
+	srd	r9, u0, tnc
+	ld	u1, -32(up)
+	nor	r10, r12, r7
+	sld	r12, u0, cnt
+	srd	r7, u1, tnc
+	ld	u0, -40(up)
+	nor	r11, r8, r9
+	addi	up, up, -16
+	b	L(L10)
+
+	ALIGN(16)
+L(b00):	ld	u1, -16(up)
+	sld	r12, r10, cnt
+	srd	r7, u1, tnc
+	ld	u0, -24(up)
+	sld	r8, u1, cnt
+	srd	r9, u0, tnc
+	ld	u1, -32(up)
+	nor	r10, r12, r7
+	sld	r12, u0, cnt
+	srd	r7, u1, tnc
+	addi	rp, rp, 8
+	bdz	L(cj4)
+
+L(gt4):	addi	up, up, -32
+	ld	u0, -8(up)
+	nor	r11, r8, r9
+	b	L(L00)
+
+	ALIGN(16)
+L(b01):	bdnz	L(gt1)
+	sld	r8, r10, cnt
+	nor	r8, r8, r8
+	std	r8, -8(rp)
+	b	L(ret)
+
+L(gt1):	ld	u0, -16(up)
+	sld	r8, r10, cnt
+	srd	r9, u0, tnc
+	ld	u1, -24(up)
+	sld	r12, u0, cnt
+	srd	r7, u1, tnc
+	ld	u0, -32(up)
+	nor	r11, r8, r9
+	sld	r8, u1, cnt
+	srd	r9, u0, tnc
+	ld	u1, -40(up)
+	addi	up, up, -40
+	nor	r10, r12, r7
+	bdz	L(end)
+
+	ALIGN(32)
+L(top):	sld	r12, u0, cnt
+	srd	r7, u1, tnc
+	ld	u0, -8(up)
+	std	r11, -8(rp)
+	nor	r11, r8, r9
+L(L00):	sld	r8, u1, cnt
+	srd	r9, u0, tnc
+	ld	u1, -16(up)
+	std	r10, -16(rp)
+	nor	r10, r12, r7
+L(L11):	sld	r12, u0, cnt
+	srd	r7, u1, tnc
+	ld	u0, -24(up)
+	std	r11, -24(rp)
+	nor	r11, r8, r9
+L(L10):	sld	r8, u1, cnt
+	srd	r9, u0, tnc
+	ld	u1, -32(up)
+	addi	up, up, -32
+	std	r10, -32(rp)
+	addi	rp, rp, -32
+	nor	r10, r12, r7
+	bdnz	L(top)
+
+	ALIGN(32)
+L(end):	sld	r12, u0, cnt
+	srd	r7, u1, tnc
+	std	r11, -8(rp)
+L(cj4):	nor	r11, r8, r9
+	sld	r8, u1, cnt
+	std	r10, -16(rp)
+	nor	r8, r8, r8
+L(cj3):	nor	r10, r12, r7
+	std	r11, -24(rp)
+L(cj2):	std	r10, -32(rp)
+	std	r8, -40(rp)
+
+L(ret):	ld	r31, -8(r1)
+	ld	r30, -16(r1)
+ifdef(`HAVE_ABI_mode32',
+`	srdi	r3, retval, 32
+	mr	r4, retval
+',`	mr	r3, retval')
+	blr
+EPILOGUE()
diff --git a/mpn/powerpc64/mode64/lshiftc.asm b/mpn/powerpc64/mode64/lshiftc.asm
deleted file mode 100644
index bca55638f..000000000
--- a/mpn/powerpc64/mode64/lshiftc.asm
+++ /dev/null
@@ -1,195 +0,0 @@
-dnl  PowerPC-64 mpn_lshiftc -- rp[] = ~up[] << cnt
-
-dnl  Copyright 2003, 2005, 2010 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of the GNU Lesser General Public License as published
-dnl  by the Free Software Foundation; either version 3 of the License, or (at
-dnl  your option) any later version.
-
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
-dnl  License for more details.
-
-dnl  You should have received a copy of the GNU Lesser General Public License
-dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C                   cycles/limb
-C POWER3/PPC630          ?
-C POWER4/PPC970          ?
-C POWER5                 2.25
-C POWER6                 9.5
-C POWER7                 2.15
-
-C TODO
-C  * Try to reduce the number of needed live registers
-C  * Micro-optimise header code
-C  * Keep in synch with lshift.asm and rshift.asm
-
-C INPUT PARAMETERS
-define(`rp',  `r3')
-define(`up',  `r4')
-define(`n',   `r5')
-define(`cnt', `r6')
-
-define(`tnc',`r0')
-define(`u0',`r30')
-define(`u1',`r31')
-define(`retval',`r5')
-
-ASM_START()
-PROLOGUE(mpn_lshiftc)
-	std	r31, -8(r1)
-	std	r30, -16(r1)
-	subfic	tnc, cnt, 64
-	sldi	r7, n, 3	C byte count corresponding to n
-	add	up, up, r7	C up = up + n
-	add	rp, rp, r7	C rp = rp + n
-	rldicl.	r30, n, 0,62	C r30 = n & 3, set cr0
-	cmpdi	cr6, r30, 2
-	addi	r31, n, 3	C compute count...
-	ld	r10, -8(up)	C load 1st limb for b00...b11
-	srd	retval, r10, tnc
-	srdi	r31, r31, 2	C ...for ctr
-	mtctr	r31		C copy count into ctr
-	beq	cr0, L(b00)
-	blt	cr6, L(b01)
-	ld	r11, -16(up)	C load 2nd limb for b10 and b11
-	beq	cr6, L(b10)
-
-	ALIGN(16)
-L(b11):	sld	r8, r10, cnt
-	srd	r9, r11, tnc
-	ld	u1, -24(up)
-	addi	up, up, -24
-	sld	r12, r11, cnt
-	srd	r7, u1, tnc
-	addi	rp, rp, 16
-	bdnz	L(gt3)
-
-	nor	r11, r8, r9
-	sld	r8, u1, cnt
-	nor	r8, r8, r8
-	b	L(cj3)
-
-	ALIGN(16)
-L(gt3):	ld	u0, -8(up)
-	nor	r11, r8, r9
-	sld	r8, u1, cnt
-	srd	r9, u0, tnc
-	ld	u1, -16(up)
-	nor	r10, r12, r7
-	b	L(L11)
-
-	ALIGN(32)
-L(b10):	sld	r12, r10, cnt
-	addi	rp, rp, 24
-	srd	r7, r11, tnc
-	bdnz	L(gt2)
-
-	sld	r8, r11, cnt
-	nor	r10, r12, r7
-	nor	r8, r8, r8
-	b	L(cj2)
-
-L(gt2):	ld	u0, -24(up)
-	sld	r8, r11, cnt
-	srd	r9, u0, tnc
-	ld	u1, -32(up)
-	nor	r10, r12, r7
-	sld	r12, u0, cnt
-	srd	r7, u1, tnc
-	ld	u0, -40(up)
-	nor	r11, r8, r9
-	addi	up, up, -16
-	b	L(L10)
-
-	ALIGN(16)
-L(b00):	ld	u1, -16(up)
-	sld	r12, r10, cnt
-	srd	r7, u1, tnc
-	ld	u0, -24(up)
-	sld	r8, u1, cnt
-	srd	r9, u0, tnc
-	ld	u1, -32(up)
-	nor	r10, r12, r7
-	sld	r12, u0, cnt
-	srd	r7, u1, tnc
-	addi	rp, rp, 8
-	bdz	L(cj4)
-
-L(gt4):	addi	up, up, -32
-	ld	u0, -8(up)
-	nor	r11, r8, r9
-	b	L(L00)
-
-	ALIGN(16)
-L(b01):	bdnz	L(gt1)
-	sld	r8, r10, cnt
-	nor	r8, r8, r8
-	std	r8, -8(rp)
-	b	L(ret)
-
-L(gt1):	ld	u0, -16(up)
-	sld	r8, r10, cnt
-	srd	r9, u0, tnc
-	ld	u1, -24(up)
-	sld	r12, u0, cnt
-	srd	r7, u1, tnc
-	ld	u0, -32(up)
-	nor	r11, r8, r9
-	sld	r8, u1, cnt
-	srd	r9, u0, tnc
-	ld	u1, -40(up)
-	addi	up, up, -40
-	nor	r10, r12, r7
-	bdz	L(end)
-
-	ALIGN(32)
-L(top):	sld	r12, u0, cnt
-	srd	r7, u1, tnc
-	ld	u0, -8(up)
-	std	r11, -8(rp)
-	nor	r11, r8, r9
-L(L00):	sld	r8, u1, cnt
-	srd	r9, u0, tnc
-	ld	u1, -16(up)
-	std	r10, -16(rp)
-	nor	r10, r12, r7
-L(L11):	sld	r12, u0, cnt
-	srd	r7, u1, tnc
-	ld	u0, -24(up)
-	std	r11, -24(rp)
-	nor	r11, r8, r9
-L(L10):	sld	r8, u1, cnt
-	srd	r9, u0, tnc
-	ld	u1, -32(up)
-	addi	up, up, -32
-	std	r10, -32(rp)
-	addi	rp, rp, -32
-	nor	r10, r12, r7
-	bdnz	L(top)
-
-	ALIGN(32)
-L(end):	sld	r12, u0, cnt
-	srd	r7, u1, tnc
-	std	r11, -8(rp)
-L(cj4):	nor	r11, r8, r9
-	sld	r8, u1, cnt
-	std	r10, -16(rp)
-	nor	r8, r8, r8
-L(cj3):	nor	r10, r12, r7
-	std	r11, -24(rp)
-L(cj2):	std	r10, -32(rp)
-	std	r8, -40(rp)
-
-L(ret):	ld	r31, -8(r1)
-	ld	r30, -16(r1)
-	mr	r3, retval
-	blr
-EPILOGUE()
-- 
cgit v1.2.1


From c7aa2d66f1403def399929e97347f16b4386550a Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Thu, 3 Nov 2011 22:44:45 +0100
Subject: (mpz_sub): Abort for non-handled case.

---
 dumbmp.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dumbmp.c b/dumbmp.c
index 293580228..3292d6eec 100644
--- a/dumbmp.c
+++ b/dumbmp.c
@@ -421,6 +421,8 @@ mpz_sub (mpz_t r, mpz_t a, mpz_t b)
       mp_limb_t *tp;  int tn;
       tn = an; an = bn; bn = tn;
       tp = ap; ap = bp; bp = tp;
+      /* This needs sign change, not done so abort.  */
+      abort ();
     }
 
   cy = 0;
-- 
cgit v1.2.1


From 9e346a7777b9c5576e5a4758a1701ed114a9e977 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Fri, 4 Nov 2011 00:15:09 +0100
Subject: *** empty log message ***

---
 ChangeLog | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index 0b26e4664..803bcb543 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,10 @@
 2011-11-03  Torbjorn Granlund  <tege@gmplib.org>
 
+	* dumbmp.c (mpz_sub): Abort for non-handled case.
+
+	* mpn/powerpc64/mode64/lshiftc.asm: Move file from here...
+	* mpn/powerpc64/lshiftc.asm: ...to here, with trivial modifications.
+
 	* configure.in: Pass -m32 in more cases, using via _maybe mechanism.
 	Inherit default gcc_cflags in more places.
 
-- 
cgit v1.2.1


From 98ec919fbc24e85c20818b472131687ba42ae6ab Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Mon, 7 Nov 2011 18:42:27 +0100
Subject: Change how mpn_redc_1 works, use more broadly.

---
 configure.in             |  2 +-
 gmp-impl.h               |  5 +---
 mpn/generic/powm.c       | 44 +++++++++++++++++++------------
 mpn/generic/powm_sec.c   | 18 +++++++++----
 mpn/generic/redc_1.c     |  5 +---
 mpn/generic/redc_1_sec.c | 45 --------------------------------
 mpn/x86_64/redc_1.asm    | 68 ++++++++++--------------------------------------
 tests/refmpn.c           |  7 ++---
 tune/speed.h             |  6 ++---
 9 files changed, 61 insertions(+), 139 deletions(-)
 delete mode 100644 mpn/generic/redc_1_sec.c

diff --git a/configure.in b/configure.in
index 9c0092427..79367c210 100644
--- a/configure.in
+++ b/configure.in
@@ -2638,7 +2638,7 @@ gmp_mpn_functions="$extra_functions					   \
   mu_bdiv_q mu_bdiv_qr							   \
   bdiv_q bdiv_qr							   \
   divexact bdiv_dbm1c redc_1 redc_2 redc_n powm powlo powm_sec		   \
-  redc_1_sec trialdiv remove						   \
+  trialdiv remove							   \
   and_n andn_n nand_n ior_n iorn_n nior_n xor_n xnor_n			   \
   copyi copyd zero							   \
   $gmp_mpn_functions_optional"
diff --git a/gmp-impl.h b/gmp-impl.h
index e918c31ed..c0ed63791 100644
--- a/gmp-impl.h
+++ b/gmp-impl.h
@@ -1063,7 +1063,7 @@ __GMP_DECLSPEC void mpn_mulmid __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_sr
 __GMP_DECLSPEC mp_limb_t mpn_submul_1c __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t));
 
 #define mpn_redc_1 __MPN(redc_1)
-__GMP_DECLSPEC void mpn_redc_1 __GMP_PROTO ((mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
+__GMP_DECLSPEC void mpn_redc_1 __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
 
 #define mpn_redc_2 __MPN(redc_2)
 __GMP_DECLSPEC void mpn_redc_2 __GMP_PROTO ((mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr));
@@ -1471,9 +1471,6 @@ __GMP_DECLSPEC void      mpn_powm_sec __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t
 __GMP_DECLSPEC mp_size_t mpn_powm_sec_itch __GMP_PROTO ((mp_size_t, mp_size_t, mp_size_t));
 #define   mpn_tabselect __MPN(tabselect)
 __GMP_DECLSPEC void      mpn_tabselect __GMP_PROTO ((volatile mp_limb_t *, volatile mp_limb_t *, mp_size_t, mp_size_t, mp_size_t));
-#define mpn_redc_1_sec __MPN(redc_1_sec)
-__GMP_DECLSPEC void mpn_redc_1_sec __GMP_PROTO ((mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
-
 #define   mpn_addcnd_n __MPN(addcnd_n)
 __GMP_DECLSPEC mp_limb_t mpn_addcnd_n __GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_limb_t));
 #define   mpn_subcnd_n __MPN(subcnd_n)
diff --git a/mpn/generic/powm.c b/mpn/generic/powm.c
index 57edfd4f6..fa92362ad 100644
--- a/mpn/generic/powm.c
+++ b/mpn/generic/powm.c
@@ -6,7 +6,7 @@
    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
    GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
 
-Copyright 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
+Copyright 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -74,6 +74,16 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #include "gmp-impl.h"
 #include "longlong.h"
 
+#undef MPN_REDC_1
+#define MPN_REDC_1(rp, up, mp, n, invm)					\
+  do {									\
+    mp_limb_t cy;							\
+    mpn_redc_1 (up, mp, n, invm);					\
+    cy = mpn_add_n (rp, up + n, up, n);					\
+    if (cy != 0)							\
+      mpn_sub_n (rp, rp, mp, n);					\
+  } while (0)
+
 #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
 #define WANT_REDC_2 1
 #endif
@@ -212,12 +222,12 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
   mpn_sqr (tp, this_pp, n);
 #if WANT_REDC_2
   if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
-    mpn_redc_1 (rp, tp, mp, n, mip[0]);
+    MPN_REDC_1 (rp, tp, mp, n, mip[0]);
   else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
     mpn_redc_2 (rp, tp, mp, n, mip);
 #else
   if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
-    mpn_redc_1 (rp, tp, mp, n, mip[0]);
+    MPN_REDC_1 (rp, tp, mp, n, mip[0]);
 #endif
   else
     mpn_redc_n (rp, tp, mp, n, mip);
@@ -229,12 +239,12 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
       this_pp += n;
 #if WANT_REDC_2
       if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
-	mpn_redc_1 (this_pp, tp, mp, n, mip[0]);
+	MPN_REDC_1 (this_pp, tp, mp, n, mip[0]);
       else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
 	mpn_redc_2 (this_pp, tp, mp, n, mip);
 #else
       if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
-	mpn_redc_1 (this_pp, tp, mp, n, mip[0]);
+	MPN_REDC_1 (this_pp, tp, mp, n, mip[0]);
 #endif
       else
 	mpn_redc_n (this_pp, tp, mp, n, mip);
@@ -309,7 +319,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
 #undef MPN_REDUCE
 #define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
 #define MPN_SQR(r,a,n)			mpn_mul_basecase (r,a,n,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
 	      INNERLOOP;
 	    }
 	  else
@@ -319,7 +329,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
 #undef MPN_REDUCE
 #define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
 #define MPN_SQR(r,a,n)			mpn_sqr_basecase (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
 	      INNERLOOP;
 	    }
 	}
@@ -380,7 +390,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
 #undef MPN_REDUCE
 #define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
 #define MPN_SQR(r,a,n)			mpn_mul_basecase (r,a,n,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
 	      INNERLOOP;
 	    }
 	  else
@@ -390,7 +400,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
 #undef MPN_REDUCE
 #define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
 #define MPN_SQR(r,a,n)			mpn_sqr_basecase (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
 	      INNERLOOP;
 	    }
 	}
@@ -401,7 +411,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
 #undef MPN_REDUCE
 #define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
 #define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
 	  INNERLOOP;
 	}
       else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
@@ -440,7 +450,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
 #undef MPN_REDUCE
 #define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
 #define MPN_SQR(r,a,n)			mpn_mul_basecase (r,a,n,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
 	      INNERLOOP;
 	    }
 	  else
@@ -450,7 +460,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
 #undef MPN_REDUCE
 #define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
 #define MPN_SQR(r,a,n)			mpn_sqr_basecase (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
 	      INNERLOOP;
 	    }
 	}
@@ -501,7 +511,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
 #undef MPN_REDUCE
 #define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
 #define MPN_SQR(r,a,n)			mpn_mul_basecase (r,a,n,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
 	      INNERLOOP;
 	    }
 	  else
@@ -511,7 +521,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
 #undef MPN_REDUCE
 #define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
 #define MPN_SQR(r,a,n)			mpn_sqr_basecase (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
 	      INNERLOOP;
 	    }
 	}
@@ -522,7 +532,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
 #undef MPN_REDUCE
 #define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
 #define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
 	  INNERLOOP;
 	}
       else
@@ -545,12 +555,12 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
 
 #if WANT_REDC_2
   if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
-    mpn_redc_1 (rp, tp, mp, n, mip[0]);
+    MPN_REDC_1 (rp, tp, mp, n, mip[0]);
   else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
     mpn_redc_2 (rp, tp, mp, n, mip);
 #else
   if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
-    mpn_redc_1 (rp, tp, mp, n, mip[0]);
+    MPN_REDC_1 (rp, tp, mp, n, mip[0]);
 #endif
   else
     mpn_redc_n (rp, tp, mp, n, mip);
diff --git a/mpn/generic/powm_sec.c b/mpn/generic/powm_sec.c
index 315ae6e5e..3a6f55403 100644
--- a/mpn/generic/powm_sec.c
+++ b/mpn/generic/powm_sec.c
@@ -7,7 +7,7 @@
    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
    GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
 
-Copyright 2007, 2008, 2009 Free Software Foundation, Inc.
+Copyright 2007, 2008, 2009, 2011 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -56,6 +56,14 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 
 #define WANT_CACHE_SECURITY 1
 
+#undef MPN_REDC_1_SEC
+#define MPN_REDC_1_SEC(rp, up, mp, n, invm)				\
+  do {									\
+    mp_limb_t cy;							\
+    mpn_redc_1 (up, mp, n, invm);					\
+    cy = mpn_add_n (rp, up + n, up, n);					\
+    mpn_subcnd_n (rp, rp, mp, n, cy);					\
+  } while (0)
 
 /* Define our own mpn squaring function.  We do this since we cannot use a
    native mpn_sqr_basecase over TUNE_SQR_TOOM2_MAX, or a non-native one over
@@ -252,7 +260,7 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
     {
       mpn_mul_basecase (tp, this_pp, n, pp + n, n);
       this_pp += n;
-      mpn_redc_1_sec (this_pp, tp, mp, n, minv);
+      MPN_REDC_1_SEC (this_pp, tp, mp, n, minv);
     }
 
   expbits = getbits (ep, ebi, windowsize);
@@ -278,7 +286,7 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
       do
 	{
 	  mpn_local_sqr (tp, rp, n, tp + 2 * n);
-	  mpn_redc_1_sec (rp, tp, mp, n, minv);
+	  MPN_REDC_1_SEC (rp, tp, mp, n, minv);
 	  this_windowsize--;
 	}
       while (this_windowsize != 0);
@@ -289,12 +297,12 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
 #else
       mpn_mul_basecase (tp, rp, n, pp + n * expbits, n);
 #endif
-      mpn_redc_1_sec (rp, tp, mp, n, minv);
+      MPN_REDC_1_SEC (rp, tp, mp, n, minv);
     }
 
   MPN_COPY (tp, rp, n);
   MPN_ZERO (tp + n, n);
-  mpn_redc_1_sec (rp, tp, mp, n, minv);
+  MPN_REDC_1_SEC (rp, tp, mp, n, minv);
   cnd = mpn_sub_n (tp, rp, mp, n);	/* we need just retval */
   mpn_subcnd_n (rp, rp, mp, n, !cnd);
   TMP_FREE;
diff --git a/mpn/generic/redc_1.c b/mpn/generic/redc_1.c
index 177f3932f..3567414eb 100644
--- a/mpn/generic/redc_1.c
+++ b/mpn/generic/redc_1.c
@@ -25,7 +25,7 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #include "gmp-impl.h"
 
 void
-mpn_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm)
+mpn_redc_1 (mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm)
 {
   mp_size_t j;
   mp_limb_t cy;
@@ -40,7 +40,4 @@ mpn_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm)
       up[0] = cy;
       up++;
     }
-  cy = mpn_add_n (rp, up, up - n, n);
-  if (cy != 0)
-    mpn_sub_n (rp, rp, mp, n);
 }
diff --git a/mpn/generic/redc_1_sec.c b/mpn/generic/redc_1_sec.c
deleted file mode 100644
index 3d914381c..000000000
--- a/mpn/generic/redc_1_sec.c
+++ /dev/null
@@ -1,45 +0,0 @@
-/* mpn_redc_1_sec.  Set cp[] <- up[]/R^n mod mp[].  Clobber up[].
-   mp[] is n limbs; up[] is 2n limbs.
-
-   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
-   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
-
-Copyright (C) 2000, 2001, 2002, 2004, 2008, 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of the GNU Lesser General Public License as published by
-the Free Software Foundation; either version 3 of the License, or (at your
-option) any later version.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
-License for more details.
-
-You should have received a copy of the GNU Lesser General Public License
-along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-void
-mpn_redc_1_sec (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm)
-{
-  mp_size_t j;
-  mp_limb_t cy;
-
-  ASSERT (n > 0);
-  ASSERT_MPN (up, 2*n);
-
-  for (j = n - 1; j >= 0; j--)
-    {
-      cy = mpn_addmul_1 (up, mp, n, (up[0] * invm) & GMP_NUMB_MASK);
-      ASSERT (up[0] == 0);
-      up[0] = cy;
-      up++;
-    }
-  cy = mpn_add_n (rp, up, up - n, n);
-  mpn_subcnd_n (rp, rp, mp, n, cy);
-}
diff --git a/mpn/x86_64/redc_1.asm b/mpn/x86_64/redc_1.asm
index 976cab2bc..8d731c68c 100644
--- a/mpn/x86_64/redc_1.asm
+++ b/mpn/x86_64/redc_1.asm
@@ -1,6 +1,6 @@
 dnl  AMD64 mpn_redc_1 -- Montgomery reduction with a one-limb modular inverse.
 
-dnl  Copyright 2004, 2008 Free Software Foundation, Inc.
+dnl  Copyright 2004, 2008, 2011 Free Software Foundation, Inc.
 dnl
 dnl  This file is part of the GNU MP Library.
 dnl
@@ -34,22 +34,18 @@ C TODO
 C  * Handle certain sizes, e.g., 1, 2, 3, 4, 8, with single-loop code.
 C    The code for 1, 2, 3, 4 should perhaps be completely register based.
 C  * Perhaps align outer loops.
-C  * The sub_n at the end leaks side-channel data.  How do we fix that?
-C  * Write mpn_add_n_sub_n computing R = A + B - C.  It should run at 2 c/l.
 C  * We could software pipeline the IMUL stuff, by putting it before the
 C    outer loops and before the end of the outer loops.  The last outer
 C    loop iteration would then compute an unneeded product, but it is at
 C    least not a stray read from up[], since it is at up[n].
-C  * Can we combine both the add_n and sub_n into the loops, somehow?
 
 C INPUT PARAMETERS
-define(`rp',	  `%rdi')
-define(`up',	  `%rsi')
-define(`param_mp',`%rdx')
-define(`n',	  `%rcx')
-define(`invm',	  `%r8')
+define(`up',	  `%rdi')
+define(`mp',	  `%rsi')
+define(`n_param', `%rdx')
+define(`invm',	  `%rcx')
 
-define(`mp',	  `%r13')
+define(`n',	  `%r13')
 define(`i',	  `%r11')
 define(`nneg',	  `%r12')
 
@@ -62,13 +58,12 @@ PROLOGUE(mpn_redc_1)
 	push	%r12
 	push	%r13
 	push	%r14
-	push	n
-	sub	$8, %rsp		C maintain ABI required rsp alignment
 
-	lea	(param_mp,n,8), mp	C mp += n
-	lea	(up,n,8), up		C up += n
+	lea	(mp,n_param,8), mp	C mp += n
+	lea	(up,n_param,8), up	C up += n
 
-	mov	n, nneg
+	mov	n_param, nneg
+	mov	n_param, n
 	neg	nneg
 
 	mov	R32(n), R32(%rax)
@@ -136,9 +131,7 @@ L(n1):	mov	%r14, 16(up,nneg,8)	C up[0]
 	add	$8, up
 	dec	n
 	jnz	L(o1)
-C	lea	(mp), mp
-	lea	16(up), up
-	jmp	L(common)
+	jmp	L(ret)
 
 L(b0):	C lea	(mp), mp
 	lea	-16(up), up
@@ -190,10 +183,7 @@ L(ed0):	add	%r10, (up)
 	add	$8, up
 	dec	n
 	jnz	L(o0)
-C	lea	(mp), mp
-	lea	16(up), up
-	jmp	L(common)
-
+	jmp	L(ret)
 
 L(b3):	lea	-8(mp), mp
 	lea	-24(up), up
@@ -244,9 +234,7 @@ L(ed3):	add	%r10, 8(up)
 	add	$8, up
 	dec	n
 	jnz	L(o3)
-	lea	8(mp), mp
-	lea	24(up), up
-	jmp	L(common)
+	jmp	L(ret)
 
 L(b2):	lea	-16(mp), mp
 	lea	-32(up), up
@@ -299,36 +287,8 @@ L(ed2):	add	%r10, 16(up)
 	add	$8, up
 	dec	n
 	jnz	L(o2)
-	lea	16(mp), mp
-	lea	32(up), up
-
-
-L(common):
-	lea	(mp,nneg,8), mp		C restore entry mp
-
-C   cy = mpn_add_n (rp, up, up - n, n);
-C		    rdi rsi  rdx    rcx
-	lea	(up,nneg,8), up		C up -= n
-	lea	(up,nneg,8), %rdx	C rdx = up - n [up entry value]
-	mov	rp, nneg		C preserve rp over first call
-	mov	8(%rsp), %rcx		C pass entry n
-C	mov	rp, %rdi
-	CALL(	mpn_add_n)
-	test	R32(%rax), R32(%rax)
-	jz	L(ret)
-
-C     mpn_sub_n (rp, rp, mp, n);
-C		 rdi rsi rdx rcx
-	mov	nneg, %rdi
-	mov	nneg, %rsi
-	mov	mp, %rdx
-	mov	8(%rsp), %rcx		C pass entry n
-	CALL(	mpn_sub_n)
 
-L(ret):
-	add	$8, %rsp
-	pop	n			C just increment rsp
-	pop	%r14
+L(ret):	pop	%r14
 	pop	%r13
 	pop	%r12
 	pop	%rbx
diff --git a/tests/refmpn.c b/tests/refmpn.c
index fbcc602d6..7ace7ebce 100644
--- a/tests/refmpn.c
+++ b/tests/refmpn.c
@@ -2,7 +2,7 @@
    of the normal gmp code.  Speed isn't a consideration.
 
 Copyright 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
-2007, 2008, 2009 Free Software Foundation, Inc.
+2007, 2008, 2009, 2011 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -2303,12 +2303,9 @@ refmpn_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm)
 
   for (j = n - 1; j >= 0; j--)
     {
-      up[0] = mpn_addmul_1 (up, mp, n, (up[0] * invm) & GMP_NUMB_MASK);
+      up[0] = refmpn_addmul_1 (up, mp, n, (up[0] * invm) & GMP_NUMB_MASK);
       up++;
     }
-  cy = mpn_add_n (rp, up, up - n, n);
-  if (cy != 0)
-    mpn_sub_n (rp, rp, mp, n);
 }
 
 size_t
diff --git a/tune/speed.h b/tune/speed.h
index c017a8ec2..08c01a5dc 100644
--- a/tune/speed.h
+++ b/tune/speed.h
@@ -2193,7 +2193,7 @@ int speed_routine_count_zeros_setup
 #define SPEED_ROUTINE_REDC_1(function)					\
   {									\
     unsigned   i;							\
-    mp_ptr     cp, mp, tp, ap;						\
+    mp_ptr     mp, tp, ap;						\
     mp_limb_t  inv;							\
     double     t;							\
     TMP_DECL;								\
@@ -2203,7 +2203,6 @@ int speed_routine_count_zeros_setup
     TMP_MARK;								\
     SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp);		\
     SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp);		\
-    SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp);		\
     SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2);		\
 									\
     MPN_COPY (ap,         s->xp, s->size);				\
@@ -2218,14 +2217,13 @@ int speed_routine_count_zeros_setup
     speed_operand_src (s, ap, 2*s->size+1);				\
     speed_operand_dst (s, tp, 2*s->size+1);				\
     speed_operand_src (s, mp, s->size);					\
-    speed_operand_dst (s, cp, s->size);					\
     speed_cache_fill (s);						\
 									\
     speed_starttime ();							\
     i = s->reps;							\
     do {								\
       MPN_COPY (tp, ap, 2*s->size);					\
-      function (cp, tp, mp, s->size, inv);				\
+      function (tp, mp, s->size, inv);					\
     } while (--i != 0);							\
     t = speed_endtime ();						\
 									\
-- 
cgit v1.2.1


From ac2c5637c823e21c9fd3aa8cf3d52fedb70519e5 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Mon, 7 Nov 2011 18:42:54 +0100
Subject: *** empty log message ***

---
 ChangeLog | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index 803bcb543..6d92c7d2a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,26 @@
+2011-11-07  Torbjorn Granlund  <tege@gmplib.org>
+
+	* mpn/generic/redc_1.c: Just reduce U uperand using Hensel norm, but
+	not fully canonically; leave add_n and conditional sub_n to caller.
+	Therefore omit R argument.
+
+	* mpn/generic/redc_1_sec.c: Remove.
+
+	* gmp-impl.h (mpn_redc_1): Update declaration.
+	(mpn_redc_1_sec): Remove declaration.
+
+	* configure.in (gmp_mpn_functions): Remove redc_1_sec.
+
+	* mpn/x86_64/redc_1.asm: Adapt to newly defined functionality/interface.
+	* tune/speed.h (SPEED_ROUTINE_REDC_1): Likewise.
+
+	* tests/refmpn.c (refmpn_redc_1): Likewise; also call refmpn_addmul_1
+	instead of mpn_addmul_1.
+
+	* mpn/generic/powm.c (MPN_REDC_1): New macro, use for mpn_redc_1.
+	* mpn/generic/powm_sec.c (MPN_REDC_1_SEC): New macro, use for
+	mpn_redc_1_sec.
+
 2011-11-03  Torbjorn Granlund  <tege@gmplib.org>
 
 	* dumbmp.c (mpz_sub): Abort for non-handled case.
-- 
cgit v1.2.1


From eb4ffad7f3a72c693a161ecf544e3d7cb9a1c0ec Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Wed, 9 Nov 2011 16:34:21 +0100
Subject: Fix comment typo.

---
 mpn/powerpc64/mode64/mul_basecase.asm    | 2 +-
 mpn/powerpc64/mode64/p6/mul_basecase.asm | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mpn/powerpc64/mode64/mul_basecase.asm b/mpn/powerpc64/mode64/mul_basecase.asm
index a34f75962..9a3957f94 100644
--- a/mpn/powerpc64/mode64/mul_basecase.asm
+++ b/mpn/powerpc64/mode64/mul_basecase.asm
@@ -1,4 +1,4 @@
-dnl  PowerPC-64 mpn_basecase.
+dnl  PowerPC-64 mpn_mul_basecase.
 
 dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008 Free Software
 dnl  Foundation, Inc.
diff --git a/mpn/powerpc64/mode64/p6/mul_basecase.asm b/mpn/powerpc64/mode64/p6/mul_basecase.asm
index 427d6081a..52c5af8ff 100644
--- a/mpn/powerpc64/mode64/p6/mul_basecase.asm
+++ b/mpn/powerpc64/mode64/p6/mul_basecase.asm
@@ -1,4 +1,4 @@
-dnl  PowerPC-64 mpn_basecase.
+dnl  PowerPC-64 mpn_mul_basecase.
 
 dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010 Free
 dnl  Software Foundation, Inc.
-- 
cgit v1.2.1


From 91ea899257061155b12c5bfab949561117a70a4b Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Wed, 9 Nov 2011 22:26:07 +0100
Subject: (gmp_mpn_functions): Add addcnd_n and subcnd_n.

---
 configure.in | 1 +
 1 file changed, 1 insertion(+)

diff --git a/configure.in b/configure.in
index 79367c210..6c9a313c3 100644
--- a/configure.in
+++ b/configure.in
@@ -2654,6 +2654,7 @@ case $tmp_fn in
 		     tmp_mulfunc="aors_err2_n" ;;
   add_err3_n|sub_err3_n)
 		     tmp_mulfunc="aors_err3_n" ;;
+  addcnd_n|subcnd_n) tmp_mulfunc="aorscnd_n"   ;;
   addmul_1|submul_1) tmp_mulfunc="aorsmul_1" ;;
   popcount|hamdist)  tmp_mulfunc="popham"    ;;
   and_n|andn_n|nand_n | ior_n|iorn_n|nior_n | xor_n|xnor_n)
-- 
cgit v1.2.1


From 0247111bce9444a966b57323f42fdd3e5a754b22 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Wed, 9 Nov 2011 22:29:36 +0100
Subject: New file.

---
 mpn/x86_64/aorscnd_n.asm | 164 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 mpn/x86_64/aorscnd_n.asm

diff --git a/mpn/x86_64/aorscnd_n.asm b/mpn/x86_64/aorscnd_n.asm
new file mode 100644
index 000000000..19ea42f2a
--- /dev/null
+++ b/mpn/x86_64/aorscnd_n.asm
@@ -0,0 +1,164 @@
+dnl  AMD64 mpn_addcnd_n, mpn_subcnd_n
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 2.25
+C AMD K10	 2
+C Intel P4	13
+C Intel core2	 2.9
+C Intel NHM	 2.9
+C Intel SBR	 2.4
+C Intel atom	 6.5
+C VIA nano	 3
+
+C NOTES
+C  * It might seem natural to use the cmov insn here, but since this function
+C    is supposed to have the exact same execution pattern for cnd true and
+C    false, and since cmov's documentation is not clear about whether it
+C    actually reads both source operands and writes the register for a false
+C    condition, we cannot use it.
+C  * Two cases could be optimised: (1) addcnd_n could use ADCSBB-from-memory
+C    to save one insn/limb, and (2) when up=rp addcnd_n and subcnd_n could use
+C    ADCSBB-to-memory, again saving 1 insn/limb.
+C  * This runs optimally at decoder bandwidth on K10.  It has not been tuned
+C    for any other processor.
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`vp',	`%rdx')
+define(`n',	`%rcx')
+define(`cnd',	`%r8')
+
+ifdef(`OPERATION_addcnd_n', `
+	define(ADDSUB,	      add)
+	define(ADCSBB,	      adc)
+	define(func,	      mpn_addcnd_n)')
+ifdef(`OPERATION_subcnd_n', `
+	define(ADDSUB,	      sub)
+	define(ADCSBB,	      sbb)
+	define(func,	      mpn_subcnd_n)')
+
+MULFUNC_PROLOGUE(mpn_addcnd_n mpn_subcnd_n)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+	push	%rbx			C save the registers popped again at L(end)
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+
+	neg	cnd			C carry flag = (cnd != 0)
+	sbb	cnd, cnd		C make cnd mask
+
+	lea	(vp,n,8), vp		C point vp, up, rp just past their n limbs
+	lea	(up,n,8), up
+	lea	(rp,n,8), rp
+
+	mov	R32(n), R32(%rax)
+	neg	n			C negative index, counts up towards zero
+	and	$3, R32(%rax)		C rax = original n mod 4
+	jz	L(top)			C carry-save reg rax = 0 in this arc
+	cmp	$2, R32(%rax)
+	jc	L(b1)			C n mod 4 = 1
+	jz	L(b2)			C n mod 4 = 2
+
+L(b3):	mov	(vp,n,8), %r12		C n mod 4 = 3: do three limbs first
+	mov	8(vp,n,8), %r13
+	mov	16(vp,n,8), %r14
+	mov	(up,n,8), %r10
+	mov	8(up,n,8), %rbx
+	mov	16(up,n,8), %rbp
+	and	cnd, %r12		C mask the vp limbs with cnd
+	and	cnd, %r13
+	and	cnd, %r14
+	ADDSUB	%r12, %r10
+	ADCSBB	%r13, %rbx
+	ADCSBB	%r14, %rbp
+	sbb	R32(%rax), R32(%rax)	C save carry
+	mov	%r10, (rp,n,8)
+	mov	%rbx, 8(rp,n,8)
+	mov	%rbp, 16(rp,n,8)
+	add	$3, n
+	js	L(top)			C index still negative: more limbs remain
+	jmp	L(end)
+
+L(b2):	mov	(vp,n,8), %r12		C n mod 4 = 2: do two limbs first
+	mov	8(vp,n,8), %r13
+	mov	(up,n,8), %r10
+	mov	8(up,n,8), %rbx
+	and	cnd, %r12
+	and	cnd, %r13
+	ADDSUB	%r12, %r10
+	ADCSBB	%r13, %rbx
+	sbb	R32(%rax), R32(%rax)	C save carry
+	mov	%r10, (rp,n,8)
+	mov	%rbx, 8(rp,n,8)
+	add	$2, n
+	js	L(top)			C index still negative: more limbs remain
+	jmp	L(end)
+
+L(b1):	mov	(vp,n,8), %r12		C n mod 4 = 1: do one limb first
+	mov	(up,n,8), %r10
+	and	cnd, %r12
+	ADDSUB	%r12, %r10
+	sbb	R32(%rax), R32(%rax)	C save carry
+	mov	%r10, (rp,n,8)
+	add	$1, n
+	jns	L(end)			C index reached zero: done, else fall into L(top)
+
+	ALIGN(16)
+L(top):	mov	(vp,n,8), %r12		C main loop, four limbs per iteration
+	mov	8(vp,n,8), %r13
+	mov	16(vp,n,8), %r14
+	mov	24(vp,n,8), %r11
+	mov	(up,n,8), %r10
+	mov	8(up,n,8), %rbx
+	mov	16(up,n,8), %rbp
+	mov	24(up,n,8), %r9
+	and	cnd, %r12		C mask the vp limbs with cnd
+	and	cnd, %r13
+	and	cnd, %r14
+	and	cnd, %r11
+	add	R32(%rax), R32(%rax)	C restore carry
+	ADCSBB	%r12, %r10
+	ADCSBB	%r13, %rbx
+	ADCSBB	%r14, %rbp
+	ADCSBB	%r11, %r9
+	sbb	R32(%rax), R32(%rax)	C save carry
+	mov	%r10, (rp,n,8)
+	mov	%rbx, 8(rp,n,8)
+	mov	%rbp, 16(rp,n,8)
+	mov	%r9, 24(rp,n,8)
+	add	$4, n
+	js	L(top)
+
+L(end):	neg	R32(%rax)		C saved carry 0 or -1 becomes return value 0 or 1
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	ret
+EPILOGUE()
-- 
cgit v1.2.1


From 76dbb3ab764f748395af063c5b58f188ccbdb163 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Wed, 9 Nov 2011 22:31:09 +0100
Subject: Add measuring of mpn_addcnd_n, mpn_subcnd_n.

---
 tune/common.c | 11 +++++++++++
 tune/speed.c  |  3 +++
 tune/speed.h  |  2 ++
 3 files changed, 16 insertions(+)

diff --git a/tune/common.c b/tune/common.c
index dbcc5ce90..eb2d4ba1a 100644
--- a/tune/common.c
+++ b/tune/common.c
@@ -1107,6 +1107,17 @@ speed_mpn_rsh1sub_n (struct speed_params *s)
 }
 #endif
 
+double
+speed_mpn_addcnd_n (struct speed_params *s)
+{  /* Measure mpn_addcnd_n on s->size limbs with the condition argument fixed at 1.  */
+  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_addcnd_n (wp, xp, yp, s->size, 1));
+}
+double
+speed_mpn_subcnd_n (struct speed_params *s)
+{  /* Measure mpn_subcnd_n on s->size limbs with the condition argument fixed at 1.  */
+  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_subcnd_n (wp, xp, yp, s->size, 1));
+}
+
 /* mpn_and_n etc can be macros and so have to be handled with
    SPEED_ROUTINE_MPN_BINARY_N_CALL forms */
 double
diff --git a/tune/speed.c b/tune/speed.c
index 0604edded..061517e28 100644
--- a/tune/speed.c
+++ b/tune/speed.c
@@ -468,6 +468,9 @@ const struct routine_t {
   { "mpn_rsh1sub_n",     speed_mpn_rsh1sub_n, FLAG_R_OPTIONAL },
 #endif
 
+  { "mpn_addcnd_n",     speed_mpn_addcnd_n, FLAG_R_OPTIONAL },
+  { "mpn_subcnd_n",     speed_mpn_subcnd_n, FLAG_R_OPTIONAL },
+
   { "MPN_ZERO",          speed_MPN_ZERO             },
 
   { "binvert_limb",       speed_binvert_limb,       FLAG_NODATA },
diff --git a/tune/speed.h b/tune/speed.h
index 08c01a5dc..70484d391 100644
--- a/tune/speed.h
+++ b/tune/speed.h
@@ -148,6 +148,7 @@ double speed_mpn_add_n __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_add_err1_n __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_add_err2_n __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_add_err3_n __GMP_PROTO ((struct speed_params *s));
+double speed_mpn_addcnd_n __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_addlsh_n __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_addlsh1_n __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_addlsh2_n __GMP_PROTO ((struct speed_params *s));
@@ -305,6 +306,7 @@ double speed_mpn_sub_n __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_sub_err1_n __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_sub_err2_n __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_sub_err3_n __GMP_PROTO ((struct speed_params *s));
+double speed_mpn_subcnd_n __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_sublsh_n __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_sublsh1_n __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_sublsh2_n __GMP_PROTO ((struct speed_params *s));
-- 
cgit v1.2.1


From 2a071bbcca683a848366e451963e451e8d4c0d23 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Wed, 9 Nov 2011 22:31:54 +0100
Subject: Add testing of mpn_addcnd_n, mpn_subcnd_n.

---
 tests/devel/try.c | 16 ++++++++++++++++
 tests/refmpn.c    | 23 +++++++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/tests/devel/try.c b/tests/devel/try.c
index 5619ec26d..bf09dd829 100644
--- a/tests/devel/try.c
+++ b/tests/devel/try.c
@@ -622,6 +622,8 @@ enum {
   TYPE_SUBLSH1_NC, TYPE_SUBLSH2_NC, TYPE_SUBLSH_NC,
   TYPE_RSBLSH1_NC, TYPE_RSBLSH2_NC, TYPE_RSBLSH_NC,
 
+  TYPE_ADDCND_N, TYPE_SUBCND_N,
+
   TYPE_MOD_1, TYPE_MOD_1C, TYPE_DIVMOD_1, TYPE_DIVMOD_1C, TYPE_DIVREM_1,
   TYPE_DIVREM_1C, TYPE_PREINV_DIVREM_1, TYPE_DIVREM_2, TYPE_PREINV_MOD_1,
   TYPE_MOD_34LSUB1, TYPE_UDIV_QRNND, TYPE_UDIV_QRNND_R,
@@ -742,6 +744,16 @@ param_init (void)
   COPY (TYPE_ADD_ERR3_N);
   REFERENCE (refmpn_sub_err3_n);
 
+  p = &param[TYPE_ADDCND_N];
+  COPY (TYPE_ADD_N);
+  p->carry = CARRY_BIT;
+  REFERENCE (refmpn_addcnd_n);
+
+  p = &param[TYPE_SUBCND_N];
+  COPY (TYPE_ADD_N);
+  p->carry = CARRY_BIT;
+  REFERENCE (refmpn_subcnd_n);
+
 
   p = &param[TYPE_MUL_1];
   p->retval = 1;
@@ -1704,6 +1716,8 @@ const struct choice_t choice_array[] = {
   { TRY(mpn_copyd), TYPE_COPYD },
 #endif
 
+  { TRY(mpn_addcnd_n), TYPE_ADDCND_N },
+  { TRY(mpn_subcnd_n), TYPE_SUBCND_N },
 #if HAVE_NATIVE_mpn_addlsh1_n
   { TRY(mpn_addlsh1_n), TYPE_ADDLSH1_N },
 #endif
@@ -2395,6 +2409,8 @@ call (struct each_t *e, tryfun_t function)
   case TYPE_RSBLSH2_NC:
   case TYPE_ADD_NC:
   case TYPE_SUB_NC:
+  case TYPE_ADDCND_N:
+  case TYPE_SUBCND_N:
     e->retval = CALLING_CONVENTIONS (function)
       (e->d[0].p, e->s[0].p, e->s[1].p, size, carry);
     break;
diff --git a/tests/refmpn.c b/tests/refmpn.c
index 7ace7ebce..b31804ef9 100644
--- a/tests/refmpn.c
+++ b/tests/refmpn.c
@@ -596,6 +596,29 @@ refmpn_sub_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t size)
   return refmpn_sub_nc (rp, s1p, s2p, size, CNST_LIMB(0));
 }
 
+mp_limb_t
+refmpn_addcnd_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t size, mp_limb_t cnd)
+{  /* Conditional add: {rp,size} = {s1p,size} + {s2p,size} when cnd is non-zero.  */
+  if (cnd != 0)
+    return refmpn_add_n (rp, s1p, s2p, size);  /* pass through the result of refmpn_add_n */
+  else
+    {
+      refmpn_copyi (rp, s1p, size);  /* cnd == 0: the result is a plain copy of s1p */
+      return 0;
+    }
+}
+mp_limb_t
+refmpn_subcnd_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t size, mp_limb_t cnd)
+{  /* Conditional subtract: {rp,size} = {s1p,size} - {s2p,size} when cnd is non-zero.  */
+  if (cnd != 0)
+    return refmpn_sub_n (rp, s1p, s2p, size);  /* pass through the result of refmpn_sub_n */
+  else
+    {
+      refmpn_copyi (rp, s1p, size);  /* cnd == 0: the result is a plain copy of s1p */
+      return 0;
+    }
+}
+
 
 #define AORS_ERR1_N(operation)						\
   {                                                                     \
-- 
cgit v1.2.1


From 61f85141acc5225219d9be7f38738af7258aac9a Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Wed, 9 Nov 2011 22:32:44 +0100
Subject: Declare just added refmpn_addcnd_n, refmpn_subcnd_n.

---
 tests/tests.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/tests.h b/tests/tests.h
index 4086e5c5d..75b546319 100644
--- a/tests/tests.h
+++ b/tests/tests.h
@@ -172,6 +172,11 @@ int refmpf_validate_division __GMP_PROTO ((const char *name, mpf_srcptr got,
                                            mpf_srcptr n, mpf_srcptr d));
 
 
+mp_limb_t refmpn_addcnd_n __GMP_PROTO ((mp_ptr wp, mp_srcptr xp, mp_srcptr yp,
+					mp_size_t size, mp_limb_t cnd));
+mp_limb_t refmpn_subcnd_n __GMP_PROTO ((mp_ptr wp, mp_srcptr xp, mp_srcptr yp,
+					mp_size_t size, mp_limb_t cnd));
+
 mp_limb_t refmpn_add __GMP_PROTO ((mp_ptr rp,
                               mp_srcptr s1p, mp_size_t s1size,
                               mp_srcptr s2p, mp_size_t s2size));
-- 
cgit v1.2.1


From ecb644d44d69f619e469bd22383f9c8558ef2afb Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Wed, 9 Nov 2011 23:41:38 +0100
Subject: New file.

---
 mpn/powerpc64/mode64/sqr_basecase.asm | 852 ++++++++++++++++++++++++++++++++++
 1 file changed, 852 insertions(+)
 create mode 100644 mpn/powerpc64/mode64/sqr_basecase.asm

diff --git a/mpn/powerpc64/mode64/sqr_basecase.asm b/mpn/powerpc64/mode64/sqr_basecase.asm
new file mode 100644
index 000000000..72ac2d318
--- /dev/null
+++ b/mpn/powerpc64/mode64/sqr_basecase.asm
@@ -0,0 +1,852 @@
+dnl  PowerPC-64 mpn_sqr_basecase.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010, 2011 Free
+dnl  Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C POWER3/PPC630         6-18
+C POWER4/PPC970          8
+C POWER5                 8
+C POWER6                16.25
+C POWER7                 3.77
+
+C NOTES
+C  * This is very crude, cleanup!
+C  * Try to reduce the number of needed live registers.
+C  * Rewrite for POWER6 to use 8 consecutive muls, not 2 groups of 4.  The
+C    cost will be more live registers.
+C  * Rewrite for POWER7 to use addmul_2 building blocks; this will reduce code
+C    size a lot and speed things up perhaps 25%.
+C  * Use computed goto in order to compress the code.
+C  * Implement a larger final corner.
+C  * Schedule callee-saves register saves into other insns.  This could save
+C    about 5 cycles/call.  (We cannot analogously optimise the restores, since
+C    the sqr_diag_addlsh1 loop has no wind-down code as currently written.)
+C  * Should the alternating std/adde sequences be split?  Some pipelines handle
+C    adde poorly, and might sequentialise all these instructions.
+C  * The sqr_diag_addlsh1 loop was written for POWER6 and its preferences for
+C    adjacent integer multiply insns.  Except for the multiply insns, the code
+C    was not carefully optimised for POWER6 or any other CPU.
+C  * Perform cross-jumping in sqr_diag_addlsh1's feed-in code, into the loop.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n',  `r5')
+
+define(`rp_outer', `r25')
+define(`up_outer', `r21')
+define(`rp_saved', `r22')
+define(`up_saved', `r23')
+define(`n_saved',  `r24')
+
+ASM_START()
+PROLOGUE(mpn_sqr_basecase)
+	cmpdi	cr0, n, 2
+	bge	cr0, L(ge2)	C n >= 2
+	ld	r5, 0(up)	C n = 1
+	nop
+	mulld	r8, r5, r5	C weight 0
+	mulhdu	r9, r5, r5	C weight 1
+	std	r8, 0(rp)
+	std	r9, 8(rp)
+	blr
+	ALIGN(16)
+L(ge2):	bgt	cr0, L(gt2)	C n > 2
+	ld	r0, 0(up)	C n = 2
+	nop
+	mulld	r8, r0, r0	C u0 * u0
+	mulhdu	r9, r0, r0	C u0 * u0
+	ld	r6, 8(up)
+	mulld	r10, r6, r6	C u1 * u1
+	mulhdu	r11, r6, r6	C u1 * u1
+	mulld	r4, r6, r0	C u1 * u0
+	mulhdu	r5, r6, r0	C u1 * u0
+	addc	r4, r4, r4	C 2 * u1 * u0, low limb
+	adde	r5, r5, r5	C 2 * u1 * u0, high limb
+	addze	r11, r11	C carry out of the doubling goes to the top limb
+	addc	r9, r9, r4	C add the doubled cross product at weight 1
+	adde	r10, r10, r5
+	addze	r11, r11
+	std	r8, 0(rp)
+	std	r9, 8(rp)
+	std	r10, 16(rp)
+	std	r11, 24(rp)
+	blr
+
+	ALIGN(16)
+L(gt2):	std	r31,  -8(r1)	C save r21-r31 below the stack pointer
+	std	r30, -16(r1)
+	std	r29, -24(r1)
+	std	r28, -32(r1)
+	std	r27, -40(r1)
+	std	r26, -48(r1)
+	std	r25, -56(r1)
+	std	r24, -64(r1)
+	std	r23, -72(r1)
+	std	r22, -80(r1)
+	std	r21, -88(r1)
+
+	mr	rp_saved, rp	C rp_saved, up_saved, n_saved are read again after L(outer_end)
+	mr	up_saved, up
+	mr	n_saved, n
+	mr	rp_outer, rp	C rp_outer, up_outer are advanced once per outer pass
+	mr	up_outer, up
+
+	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
+	cmpdi	cr6, r0, 2
+	addic	r7, n, 2	C compute count...
+	srdi	r7, r7, 2	C ...for ctr
+	mtctr	r7		C copy count into ctr
+	beq-	cr0, L(b0)	C n mod 4 = 0
+	blt-	cr6, L(b1)	C n mod 4 = 1
+	beq-	cr6, L(b2)	C n mod 4 = 2
+
+L(b3):	ld	r6, 0(up)	C n mod 4 = 3; r6 = u0, the multiplier of the first pass
+	ld	r9, 8(up)
+	ld	r27, 16(up)
+	addi	up, up, 24
+	li	r12, 0		C carry limb
+	bdz	L(em3)
+
+	ALIGN(16)
+L(tm3):	mulld	r0, r9, r6	C first pass: store products by r6, 4 limbs per iteration
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r9, 0(up)
+	ld	r27, 8(up)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6
+	ld	r9, 16(up)
+	ld	r27, 24(up)
+	std	r0, 8(rp)
+	adde	r26, r26, r8
+	std	r7, 16(rp)
+	adde	r11, r11, r10
+	std	r26, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+	addi	rp, rp, 32
+	bdnz	L(tm3)
+
+L(em3):	mulld	r0, r9, r6	C wind-down: last two products plus the carry limb
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	std	r0, 8(rp)
+	std	r7, 16(rp)
+	addze	r8, r8
+	std	r8, 24(rp)
+	addi	n, n, 2
+	b	L(outer_loop)
+
+L(b0):	ld	r6, 0(up)	C n mod 4 = 0; r6 = u0, the multiplier of the first pass
+	ld	r27, 8(up)
+	mulld	r7, r27, r6
+	mulhdu	r12, r27, r6
+	std	r7, 8(rp)
+	addi	rp, rp, 8
+	ld	r9, 16(up)
+	ld	r27, 24(up)
+	addi	up, up, 32
+	bdz	L(em0)
+
+	ALIGN(16)
+L(tm0):	mulld	r0, r9, r6	C first pass: store products by r6, 4 limbs per iteration
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r9, 0(up)
+	ld	r27, 8(up)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6
+	ld	r9, 16(up)
+	ld	r27, 24(up)
+	std	r0, 8(rp)
+	adde	r26, r26, r8
+	std	r7, 16(rp)
+	adde	r11, r11, r10
+	std	r26, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+	addi	rp, rp, 32
+	bdnz	L(tm0)
+
+L(em0):	mulld	r0, r9, r6	C wind-down: last two products plus the carry limb
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	std	r0, 8(rp)
+	std	r7, 16(rp)
+	addze	r8, r8
+	std	r8, 24(rp)
+	addi	n, n, 2
+	b	L(outer_loop_ent_2)
+
+L(b1):	ld	r6, 0(up)	C n mod 4 = 1; r6 = u0, the multiplier of the first pass
+	ld	r9, 8(up)
+	ld	r27, 16(up)
+	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r12, r27, r6
+	addc	r7, r7, r26
+	std	r0, 8(rp)
+	std	r7, 16(rp)
+	addi	rp, rp, 16
+	ld	r9, 24(up)
+	ld	r27, 32(up)
+	addi	up, up, 40
+	bdz	L(em1)
+
+	ALIGN(16)
+L(tm1):	mulld	r0, r9, r6	C first pass: store products by r6, 4 limbs per iteration
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r9, 0(up)
+	ld	r27, 8(up)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6
+	ld	r9, 16(up)
+	ld	r27, 24(up)
+	std	r0, 8(rp)
+	adde	r26, r26, r8
+	std	r7, 16(rp)
+	adde	r11, r11, r10
+	std	r26, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+	addi	rp, rp, 32
+	bdnz	L(tm1)
+
+L(em1):	mulld	r0, r9, r6	C wind-down: last two products plus the carry limb
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	std	r0, 8(rp)
+	std	r7, 16(rp)
+	addze	r8, r8
+	std	r8, 24(rp)
+	addi	n, n, 2
+	b	L(outer_loop_ent_3)
+
+L(b2):	addi	r7, r7, -1	C FIXME
+	mtctr	r7		C FIXME
+	ld	r6, 0(up)
+	ld	r9, 8(up)
+	ld	r27, 16(up)
+	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r9, 24(up)
+	mulld	r11, r9, r6
+	mulhdu	r10, r9, r6
+	addc	r7, r7, r26
+	adde	r11, r11, r8
+	addze	r12, r10
+	std	r0, 8(rp)
+	std	r7, 16(rp)
+	std	r11, 24(rp)
+	addi	rp, rp, 24
+	ld	r9, 32(up)
+	ld	r27, 40(up)
+	addi	up, up, 48
+	bdz	L(em2)
+
+	ALIGN(16)
+L(tm2):	mulld	r0, r9, r6	C first pass: store products by r6, 4 limbs per iteration
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r9, 0(up)
+	ld	r27, 8(up)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6
+	ld	r9, 16(up)
+	ld	r27, 24(up)
+	std	r0, 8(rp)
+	adde	r26, r26, r8
+	std	r7, 16(rp)
+	adde	r11, r11, r10
+	std	r26, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+	addi	rp, rp, 32
+	bdnz	L(tm2)
+
+L(em2):	mulld	r0, r9, r6	C wind-down: last two products plus the carry limb
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	std	r0, 8(rp)
+	std	r7, 16(rp)
+	addze	r8, r8
+	std	r8, 24(rp)
+	addi	n, n, 2
+	b	L(outer_loop_ent_0)
+
+
+L(outer_loop):
+	addi	n, n, -1
+	addi	up_outer, up_outer, 8	C step to the next multiplier limb
+	addi	rp_outer, rp_outer, 16	C result position moves two limbs per pass
+
+	mr	up, up_outer
+	addi	rp, rp_outer, 8
+
+	srdi	r0, n, 2
+	mtctr	r0
+
+	bdz	L(outer_end)	C decremented count is zero: only the last product remains
+
+	ld	r6, 0(up)
+	ld	r9, 8(up)
+	ld	r27, 16(up)
+	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r9, 24(up)
+	ld	r28, 0(rp)
+	ld	r29, 8(rp)
+	ld	r30, 16(rp)
+	mulld	r11, r9, r6
+	mulhdu	r10, r9, r6
+	addc	r7, r7, r26
+	adde	r11, r11, r8
+	addze	r12, r10
+	addc	r0, r0, r28
+	std	r0, 0(rp)
+	adde	r7, r7, r29
+	std	r7, 8(rp)
+	adde	r11, r11, r30
+	std	r11, 16(rp)
+	addi	rp, rp, 24
+	ld	r9, 32(up)
+	ld	r27, 40(up)
+	addi	up, up, 48
+	bdz	L(ea1)
+
+	ALIGN(16)
+L(ta1):	mulld	r0, r9, r6	C later passes: add products by r6 into rp, 4 limbs per iteration
+	mulhdu	r26, r9, r6	C 9
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6	C 27
+	ld	r9, 0(up)
+	ld	r28, 0(rp)
+	ld	r27, 8(up)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12	C 0 12
+	adde	r7, r7, r26	C 5 7
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6	C 9
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6	C 27
+	ld	r9, 16(up)
+	ld	r30, 16(rp)
+	ld	r27, 24(up)
+	ld	r31, 24(rp)
+	adde	r26, r26, r8	C 8 5
+	adde	r11, r11, r10	C 10 11
+	addze	r12, r12	C 12
+	addc	r0, r0, r28	C 0 28
+	std	r0, 0(rp)	C 0
+	adde	r7, r7, r29	C 7 29
+	std	r7, 8(rp)	C 7
+	adde	r26, r26, r30	C 5 30
+	std	r26, 16(rp)	C 5
+	adde	r11, r11, r31	C 11 31
+	std	r11, 24(rp)	C 11
+	addi	up, up, 32
+	addi	rp, rp, 32
+	bdnz	L(ta1)
+
+L(ea1):	mulld	r0, r9, r6	C wind-down: last two products, added into rp
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r28, 0(rp)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	addze	r8, r8
+	addc	r0, r0, r28
+	std	r0, 0(rp)
+	adde	r7, r7, r29
+	std	r7, 8(rp)
+	addze	r8, r8
+	std	r8, 16(rp)
+
+L(outer_loop_ent_0):
+	addi	n, n, -1
+	addi	up_outer, up_outer, 8	C step to the next multiplier limb
+	addi	rp_outer, rp_outer, 16	C result position moves two limbs per pass
+
+	mr	up, up_outer
+	addi	rp, rp_outer, 8
+
+	srdi	r0, n, 2
+	mtctr	r0
+
+	ld	r6, 0(up)
+	ld	r9, 8(up)
+	ld	r27, 16(up)
+	ld	r28, 0(rp)
+	ld	r29, 8(rp)
+	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	addc	r0, r0, r28
+	adde	r7, r7, r26
+	addze	r12, r8
+	std	r0, 0(rp)
+	adde	r7, r7, r29
+	std	r7, 8(rp)
+	addi	rp, rp, 16
+	ld	r9, 24(up)
+	ld	r27, 32(up)
+	addi	up, up, 40
+	bdz	L(ea0)
+
+	ALIGN(16)
+L(ta0):	mulld	r0, r9, r6	C later passes: add products by r6 into rp, 4 limbs per iteration
+	mulhdu	r26, r9, r6	C 9
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6	C 27
+	ld	r9, 0(up)
+	ld	r28, 0(rp)
+	ld	r27, 8(up)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12	C 0 12
+	adde	r7, r7, r26	C 5 7
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6	C 9
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6	C 27
+	ld	r9, 16(up)
+	ld	r30, 16(rp)
+	ld	r27, 24(up)
+	ld	r31, 24(rp)
+	adde	r26, r26, r8	C 8 5
+	adde	r11, r11, r10	C 10 11
+	addze	r12, r12	C 12
+	addc	r0, r0, r28	C 0 28
+	std	r0, 0(rp)	C 0
+	adde	r7, r7, r29	C 7 29
+	std	r7, 8(rp)	C 7
+	adde	r26, r26, r30	C 5 30
+	std	r26, 16(rp)	C 5
+	adde	r11, r11, r31	C 11 31
+	std	r11, 24(rp)	C 11
+	addi	up, up, 32
+	addi	rp, rp, 32
+	bdnz	L(ta0)
+
+L(ea0):	mulld	r0, r9, r6	C wind-down: last two products, added into rp
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r28, 0(rp)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	addze	r8, r8
+	addc	r0, r0, r28
+	std	r0, 0(rp)
+	adde	r7, r7, r29
+	std	r7, 8(rp)
+	addze	r8, r8
+	std	r8, 16(rp)
+
+L(outer_loop_ent_3):
+	addi	n, n, -1
+	addi	up_outer, up_outer, 8	C step to the next multiplier limb
+	addi	rp_outer, rp_outer, 16	C result position moves two limbs per pass
+
+	mr	up, up_outer
+	addi	rp, rp_outer, 8
+
+	srdi	r0, n, 2
+	mtctr	r0
+
+	ld	r6, 0(up)
+	ld	r9, 8(up)
+	ld	r28, 0(rp)
+	mulld	r0, r9, r6
+	mulhdu	r12, r9, r6
+	addc	r0, r0, r28
+	std	r0, 0(rp)
+	addi	rp, rp, 8
+	ld	r9, 16(up)
+	ld	r27, 24(up)
+	addi	up, up, 32
+	bdz	L(ea3)
+
+	ALIGN(16)
+L(ta3):	mulld	r0, r9, r6	C later passes: add products by r6 into rp, 4 limbs per iteration
+	mulhdu	r26, r9, r6	C 9
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6	C 27
+	ld	r9, 0(up)
+	ld	r28, 0(rp)
+	ld	r27, 8(up)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12	C 0 12
+	adde	r7, r7, r26	C 5 7
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6	C 9
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6	C 27
+	ld	r9, 16(up)
+	ld	r30, 16(rp)
+	ld	r27, 24(up)
+	ld	r31, 24(rp)
+	adde	r26, r26, r8	C 8 5
+	adde	r11, r11, r10	C 10 11
+	addze	r12, r12	C 12
+	addc	r0, r0, r28	C 0 28
+	std	r0, 0(rp)	C 0
+	adde	r7, r7, r29	C 7 29
+	std	r7, 8(rp)	C 7
+	adde	r26, r26, r30	C 5 30
+	std	r26, 16(rp)	C 5
+	adde	r11, r11, r31	C 11 31
+	std	r11, 24(rp)	C 11
+	addi	up, up, 32
+	addi	rp, rp, 32
+	bdnz	L(ta3)
+
+L(ea3):	mulld	r0, r9, r6	C wind-down: last two products, added into rp
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r28, 0(rp)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	addze	r8, r8
+	addc	r0, r0, r28
+	std	r0, 0(rp)
+	adde	r7, r7, r29
+	std	r7, 8(rp)
+	addze	r8, r8
+	std	r8, 16(rp)
+
+
+L(outer_loop_ent_2):
+	addi	n, n, -1
+	addi	up_outer, up_outer, 8	C step to the next multiplier limb
+	addi	rp_outer, rp_outer, 16	C result position moves two limbs per pass
+
+	mr	up, up_outer
+	addi	rp, rp_outer, 8
+
+	srdi	r0, n, 2
+	mtctr	r0
+
+	addic	r0, r0, 0	C adding zero leaves the carry bit clear
+	li	r12, 0		C cy_limb = 0
+	ld	r6, 0(up)
+	ld	r9, 8(up)
+	ld	r27, 16(up)
+	bdz	L(ea2)
+	addi	up, up, 24
+
+	ALIGN(16)
+L(ta2):	mulld	r0, r9, r6	C later passes: add products by r6 into rp, 4 limbs per iteration
+	mulhdu	r26, r9, r6	C 9
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6	C 27
+	ld	r9, 0(up)
+	ld	r28, 0(rp)
+	ld	r27, 8(up)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12	C 0 12
+	adde	r7, r7, r26	C 5 7
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6	C 9
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6	C 27
+	ld	r9, 16(up)
+	ld	r30, 16(rp)
+	ld	r27, 24(up)
+	ld	r31, 24(rp)
+	adde	r26, r26, r8	C 8 5
+	adde	r11, r11, r10	C 10 11
+	addze	r12, r12	C 12
+	addc	r0, r0, r28	C 0 28
+	std	r0, 0(rp)	C 0
+	adde	r7, r7, r29	C 7 29
+	std	r7, 8(rp)	C 7
+	adde	r26, r26, r30	C 5 30
+	std	r26, 16(rp)	C 5
+	adde	r11, r11, r31	C 11 31
+	std	r11, 24(rp)	C 11
+	addi	up, up, 32
+	addi	rp, rp, 32
+	bdnz	L(ta2)
+
+L(ea2):	mulld	r0, r9, r6	C wind-down: last two products, added into rp
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r28, 0(rp)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	addze	r8, r8
+	addc	r0, r0, r28
+	std	r0, 0(rp)
+	adde	r7, r7, r29
+	std	r7, 8(rp)
+	addze	r8, r8
+	std	r8, 16(rp)
+
+	b	L(outer_loop)
+
+L(outer_end):
+	ld	r6, 0(up)	C last pass: one product, added into rp
+	ld	r9, 8(up)
+	ld	r11, 0(rp)
+	mulld	r0, r9, r6
+	mulhdu	r8, r9, r6
+	addc	r0, r0, r11
+	std	r0, 0(rp)
+	addze	r8, r8
+	std	r8, 8(rp)
+
+define(`rp',  `rp_saved')
+define(`up',  `r5')
+define(`n',   `r6')
+define(`climb',	`r0')
+
+	addi	r4, rp_saved, 8	C NOTE(review): r4 does not appear to be read again below -- confirm
+	mr	r5, up_saved	C up now names r5
+	mr	r6, n_saved	C n now names r6
+
+	rldicl.	r0, n, 0,62		C r0 = n & 3, set cr0
+	cmpdi	cr6, r0, 2
+	addi	n, n, 2	 		C compute count...
+	srdi	n, n, 2			C ...for ctr
+	mtctr	n			C put loop count into ctr
+	beq	cr0, L(xb0)	C n mod 4 = 0
+	blt	cr6, L(xb1)	C n mod 4 = 1
+	beq	cr6, L(xb2)	C n mod 4 = 2
+
+L(xb3):	ld	r6,   0(up)	C n mod 4 = 3: square the first three limbs
+	ld	r7,   8(up)
+	ld	r12, 16(up)
+	addi	up, up, 24
+	mulld	r24, r6, r6
+	mulhdu	r25, r6, r6
+	mulld	r26, r7, r7
+	mulhdu	r27, r7, r7
+	mulld	r28, r12, r12
+	mulhdu	r29, r12, r12
+	ld	r10,  8(rp)
+	ld	r11, 16(rp)
+	ld	r6,  24(rp)
+	ld	r7,  32(rp)
+	addc	r10, r10, r10	C double the limbs already in rp
+	adde	r11, r11, r11
+	adde	r6, r6, r6
+	adde	r7, r7, r7
+	addze	climb, r29	C carry limb = top square limb + doubling carry
+	addc	r10, r10, r25	C add the square limbs
+	adde	r11, r11, r26
+	adde	r6, r6, r27
+	adde	r7, r7, r28
+	std	r24,  0(rp)
+	std	r10,  8(rp)
+	std	r11, 16(rp)
+	std	r6,  24(rp)
+	std	r7,  32(rp)
+	addi	rp, rp, 40
+	bdnz	L(top)
+	b	L(end)
+
+L(xb2):	ld	r6,  0(up)	C n mod 4 = 2: square the first two limbs
+	ld	r7,  8(up)
+	addi	up, up, 16
+	mulld	r24, r6, r6
+	mulhdu	r25, r6, r6
+	mulld	r26, r7, r7
+	mulhdu	r27, r7, r7
+	ld	r10,  8(rp)
+	ld	r11, 16(rp)
+	addc	r10, r10, r10	C double the limbs already in rp
+	adde	r11, r11, r11
+	addze	climb, r27	C carry limb = top square limb + doubling carry
+	addc	r10, r10, r25	C add the square limbs
+	adde	r11, r11, r26
+	std	r24,  0(rp)
+	std	r10,  8(rp)
+	std	r11, 16(rp)
+	addi	rp, rp, 24
+	bdnz	L(top)
+	b	L(end)
+
+L(xb0):	ld	r6,   0(up)	C n mod 4 = 0: square the first four limbs
+	ld	r7,   8(up)
+	ld	r12, 16(up)
+	ld	r23, 24(up)
+	addi	up, up, 32
+	mulld	r24, r6, r6
+	mulhdu	r25, r6, r6
+	mulld	r26, r7, r7
+	mulhdu	r27, r7, r7
+	mulld	r28, r12, r12
+	mulhdu	r29, r12, r12
+	mulld	r30, r23, r23
+	mulhdu	r31, r23, r23
+	ld	r10,  8(rp)
+	ld	r11, 16(rp)
+	ld	r6,  24(rp)
+	ld	r7,  32(rp)
+	ld	r12, 40(rp)
+	ld	r23, 48(rp)
+	addc	r10, r10, r10	C double the limbs already in rp
+	adde	r11, r11, r11
+	adde	r6, r6, r6
+	adde	r7, r7, r7
+	adde	r12, r12, r12
+	adde	r23, r23, r23
+	addze	climb, r31	C carry limb = top square limb + doubling carry
+	std	r24,  0(rp)
+	addc	r10, r10, r25	C add the square limbs
+	std	r10,  8(rp)
+	adde	r11, r11, r26
+	std	r11, 16(rp)
+	adde	r6, r6, r27
+	std	r6,  24(rp)
+	adde	r7, r7, r28
+	std	r7,  32(rp)
+	adde	r12, r12, r29
+	std	r12, 40(rp)
+	adde	r23, r23, r30
+	std	r23, 48(rp)
+	addi	rp, rp, 56
+	bdnz	L(top)
+	b	L(end)
+
+L(xb1):	ld	r6,  0(up)	C n mod 4 = 1: square the first limb, then fall into L(top)
+	addi	up, up, 8
+	mulld	r24, r6, r6
+	mulhdu	climb, r6, r6
+	std	r24, 0(rp)
+	addic	rp, rp, 8		C clear carry as side-effect
+
+	ALIGN(32)
+L(top):	ld	r6,   0(up)	C main loop: 4 squares and 8 result limbs per iteration
+	ld	r7,   8(up)
+	ld	r12, 16(up)
+	ld	r23, 24(up)
+	addi	up, up, 32
+	mulld	r24, r6, r6
+	mulhdu	r25, r6, r6
+	mulld	r26, r7, r7
+	mulhdu	r27, r7, r7
+	mulld	r28, r12, r12
+	mulhdu	r29, r12, r12
+	mulld	r30, r23, r23
+	mulhdu	r31, r23, r23
+	ld	r8,   0(rp)
+	ld	r9,   8(rp)
+	adde	r8, r8, r8	C double, taking the carry left by the previous add chain
+	adde	r9, r9, r9
+	ld	r10, 16(rp)
+	ld	r11, 24(rp)
+	adde	r10, r10, r10
+	adde	r11, r11, r11
+	ld	r6,  32(rp)
+	ld	r7,  40(rp)
+	adde	r6, r6, r6
+	adde	r7, r7, r7
+	ld	r12, 48(rp)
+	ld	r23, 56(rp)
+	adde	r12, r12, r12
+	adde	r23, r23, r23
+	addze	r31, r31	C carry out of the doubling chain
+	addc	r8, r8, climb	C start the add chain with the carry limb
+	std	r8,   0(rp)
+	adde	r9, r9, r24
+	std	r9,   8(rp)
+	adde	r10, r10, r25
+	std	r10, 16(rp)
+	adde	r11, r11, r26
+	std	r11, 24(rp)
+	adde	r6, r6, r27
+	std	r6,  32(rp)
+	adde	r7, r7, r28
+	std	r7,  40(rp)
+	adde	r12, r12, r29
+	std	r12, 48(rp)
+	adde	r23, r23, r30
+	std	r23, 56(rp)
+	mr	climb, r31	C carry limb for the next iteration
+	addi	rp, rp, 64
+	bdnz	L(top)
+
+L(end):	addze	climb, climb	C add the pending carry into the carry limb
+	std	climb,  0(rp)
+
+	ld	r31,  -8(r1)	C restore r21-r31
+	ld	r30, -16(r1)
+	ld	r29, -24(r1)
+	ld	r28, -32(r1)
+	ld	r27, -40(r1)
+	ld	r26, -48(r1)
+	ld	r25, -56(r1)
+	ld	r24, -64(r1)
+	ld	r23, -72(r1)
+	ld	r22, -80(r1)
+	ld	r21, -88(r1)
+	blr
+EPILOGUE()
-- 
cgit v1.2.1


From e958b3fb3edcec522f3c374a122f80a2d3d9207a Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Wed, 9 Nov 2011 23:41:46 +0100
Subject: *** empty log message ***

---
 ChangeLog | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index 6d92c7d2a..4ec18c5ac 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,19 @@
+2011-11-09  Torbjorn Granlund  <tege@gmplib.org>
+
+	* mpn/powerpc64/mode64/sqr_basecase.asm: New file.
+
+	* mpn/x86_64/aorscnd_n.asm: New file.
+
+	* tune/speed.c (routine): Add measuring of mpn_addcnd_n, mpn_subcnd_n.
+	* tune/common.c (speed_mpn_addcnd_n,speed_mpn_subcnd_n): New functions.
+	* tune/speed.h: Declare them.
+
+	* tests/devel/try.c: Add tests for mpn_addcnd_n and mpn_subcnd_n.
+	* tests/refmpn.c (refmpn_addcnd_n, refmpn_subcnd_n): New functions.
+	* tests/tests.h: Declare them.
+
+	* configure.in (gmp_mpn_functions): Add addcnd_n and subcnd_n.
+
 2011-11-07  Torbjorn Granlund  <tege@gmplib.org>
 
 	* mpn/generic/redc_1.c: Just reduce U uperand using Hensel norm, but
-- 
cgit v1.2.1


From eb453fbaa0a498d2b1bfd05c9a51310da203fd33 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Fri, 11 Nov 2011 14:13:49 +0100
Subject: Make mpn_hgcd_appr use mpn_hgcd_reduce.

---
 ChangeLog               |   6 ++
 mpn/generic/hgcd_appr.c | 175 ++----------------------------------------------
 2 files changed, 10 insertions(+), 171 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 4ec18c5ac..7bd87d97e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2011-11-11  Niels Möller  <nisse@lysator.liu.se>
+
+	* mpn/generic/hgcd_appr.c (submul, hgcd_matrix_apply): Deleted
+	functions, earlier copied to hgcd_reduce.c.
+	(mpn_hgcd_appr): Use hgcd_reduce.
+
 2011-11-09  Torbjorn Granlund  <tege@gmplib.org>
 
 	* mpn/powerpc64/mode64/sqr_basecase.asm: New file.
diff --git a/mpn/generic/hgcd_appr.c b/mpn/generic/hgcd_appr.c
index 963eaea47..8454f9da5 100644
--- a/mpn/generic/hgcd_appr.c
+++ b/mpn/generic/hgcd_appr.c
@@ -25,172 +25,6 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #include "gmp-impl.h"
 #include "longlong.h"
 
-/* Computes R -= A * B. Result must be non-negative. Normalized down
-   to size an, and resulting size is returned. */
-static mp_size_t
-submul (mp_ptr rp, mp_size_t rn,
-	mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn)
-{
-  mp_ptr tp;
-  TMP_DECL;
-
-  ASSERT (bn > 0);
-  ASSERT (an >= bn);
-  ASSERT (rn >= an);
-  ASSERT (an + bn <= rn + 1);
-  
-  TMP_MARK;
-  tp = TMP_ALLOC_LIMBS (an + bn);
-
-  mpn_mul (tp, ap, an, bp, bn);
-  if (an + bn > rn)
-    {
-      ASSERT (tp[rn] == 0);
-      bn--;
-    }
-  ASSERT_NOCARRY (mpn_sub (rp, rp, rn, tp, an + bn));
-  TMP_FREE;
-
-  while (rn > an && (rp[rn-1] == 0))
-    rn--;
-
-  return rn;
-}
-
-/* Computes (a, b)  <--  M^{-1} (a; b) */
-/* FIXME:
-    x Take scratch parameter, and figure out scratch need.
-
-    x Use some fallback for small M->n?    
-*/
-static mp_size_t
-hgcd_matrix_apply (const struct hgcd_matrix *M,
-		   mp_ptr ap, mp_ptr bp,
-		   mp_size_t n)
-{
-  mp_size_t an, bn, un, vn, nn;
-  mp_size_t mn[2][2];
-  mp_size_t modn;
-  mp_ptr tp, sp, scratch;
-  mp_limb_t cy;
-  unsigned i, j;
-
-  TMP_DECL;
-
-  ASSERT ( (ap[n-1] | bp[n-1]) > 0);
-
-  an = n;
-  MPN_NORMALIZE (ap, an);
-  bn = n;
-  MPN_NORMALIZE (bp, bn);
-  
-  for (i = 0; i < 2; i++)
-    for (j = 0; j < 2; j++)
-      {
-	mp_size_t k;
-	k = M->n;
-	MPN_NORMALIZE (M->p[i][j], k);
-	mn[i][j] = k;
-      }
-
-  ASSERT (mn[0][0] > 0);
-  ASSERT (mn[1][1] > 0);
-  ASSERT ( (mn[0][1] | mn[1][0]) > 0);
-
-  TMP_MARK;
-
-  if (mn[0][1] == 0)
-    {
-      mp_size_t qn;
-      
-      /* A unchanged, M = (1, 0; q, 1) */
-      ASSERT (mn[0][0] == 1);
-      ASSERT (M->p[0][0][0] == 1);
-      ASSERT (mn[1][1] == 1);
-      ASSERT (M->p[1][1][0] == 1);
-
-      /* Put B <-- B - q A */
-      nn = submul (bp, bn, ap, an, M->p[1][0], mn[1][0]);
-    }
-  else if (mn[1][0] == 0)
-    {
-      /* B unchanged, M = (1, q; 0, 1) */
-      ASSERT (mn[0][0] == 1);
-      ASSERT (M->p[0][0][0] == 1);
-      ASSERT (mn[1][1] == 1);
-      ASSERT (M->p[1][1][0] == 1);
-
-      /* Put A  <-- A - q * B */
-      nn = submul (ap, an, bp, bn, M->p[0][1], mn[0][1]);      
-    }
-  else
-    {
-      /* A = m00 a + m01 b  ==> a <= A / m00, b <= A / m01.
-	 B = m10 a + m11 b  ==> a <= B / m10, b <= B / m11. */
-      un = MIN (an - mn[0][0], bn - mn[1][0]) + 1;
-      vn = MIN (an - mn[0][1], bn - mn[1][1]) + 1;
-
-      nn = MAX (un, vn);
-      /* In the range of interest, mulmod_bnm1 should always beat mullo. */
-      modn = mpn_mulmod_bnm1_next_size (nn + 1);
-
-      scratch = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (modn, modn, M->n));
-      tp = TMP_ALLOC_LIMBS (modn);
-      sp = TMP_ALLOC_LIMBS (modn);
-
-      ASSERT (n <= 2*modn);
-
-      if (n > modn)
-	{
-	  cy = mpn_add (ap, ap, modn, ap + modn, n - modn);
-	  MPN_INCR_U (ap, modn, cy);
-
-	  cy = mpn_add (bp, bp, modn, bp + modn, n - modn);
-	  MPN_INCR_U (bp, modn, cy);
-
-	  n = modn;
-	}
-
-      mpn_mulmod_bnm1 (tp, modn, ap, n, M->p[1][1], mn[1][1], scratch);
-      mpn_mulmod_bnm1 (sp, modn, bp, n, M->p[0][1], mn[0][1], scratch);
-
-      /* FIXME: Handle the small n case in some better way. */
-      if (n + mn[1][1] < modn)
-	MPN_ZERO (tp + n + mn[1][1], modn - n - mn[1][1]);
-      if (n + mn[0][1] < modn)
-	MPN_ZERO (sp + n + mn[0][1], modn - n - mn[0][1]);
-  
-      cy = mpn_sub_n (tp, tp, sp, modn);
-      MPN_DECR_U (tp, modn, cy);
-
-      ASSERT (mpn_zero_p (tp + nn, modn - nn));
-
-      mpn_mulmod_bnm1 (sp, modn, ap, n, M->p[1][0], mn[1][0], scratch);
-      MPN_COPY (ap, tp, nn);
-      mpn_mulmod_bnm1 (tp, modn, bp, n, M->p[0][0], mn[0][0], scratch);
-
-      if (n + mn[1][0] < modn)
-	MPN_ZERO (sp + n + mn[1][0], modn - n - mn[1][0]);
-      if (n + mn[0][0] < modn)
-	MPN_ZERO (tp + n + mn[0][0], modn - n - mn[0][0]);
-
-      cy = mpn_sub_n (tp, tp, sp, modn);
-      MPN_DECR_U (tp, modn, cy);
-
-      ASSERT (mpn_zero_p (tp + nn, modn - nn));
-      MPN_COPY (bp, tp, nn);
-
-      while ( (ap[nn-1] | bp[nn-1]) == 0)
-	{
-	  nn--;
-	  ASSERT (nn > 0);
-	}
-    }
-  TMP_FREE;
-
-  return nn;
-}
-
 /* Identical to mpn_hgcd_itch. FIXME: Do we really need to add
    HGCD_THRESHOLD at the end? */
 mp_size_t
@@ -347,13 +181,12 @@ mpn_hgcd_appr (mp_ptr ap, mp_ptr bp, mp_size_t n,
     {
       mp_size_t n2 = (3*n)/4 + 1;
       mp_size_t p = n/2;
-      mp_size_t input_n = n;
+      mp_size_t nn;
 
-      MPN_COPY (tp, ap + p, n - p);
-      MPN_COPY (tp + n - p, bp + p, n - p);
-      if (mpn_hgcd_appr (tp, tp + n - p, n - p, M, tp + 2*(n-p)))
+      nn = mpn_hgcd_reduce (M, ap, bp, n, p, tp);
+      if (nn)
 	{
-	  n = hgcd_matrix_apply (M, ap, bp, n);
+	  n = nn;
 	  /* FIXME: Discard some of the low limbs immediately? */
 	  success = 1;
 	}
-- 
cgit v1.2.1


From 5b0e8651a493b9128594851eff5387bde8081526 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Fri, 11 Nov 2011 14:59:14 +0100
Subject: Tuning of mpn_hgcd_appr and mpn_hgcd_reduce.

---
 ChangeLog            | 24 ++++++++++++++++++++
 tune/Makefile.am     |  7 ++++--
 tune/common.c        | 16 +++++++++++++
 tune/hgcd_reduce_1.c | 30 ++++++++++++++++++++++++
 tune/hgcd_reduce_2.c | 29 ++++++++++++++++++++++++
 tune/speed.c         |  4 ++++
 tune/speed.h         | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 tune/tuneup.c        | 25 ++++++++++++++++++++
 8 files changed, 197 insertions(+), 2 deletions(-)
 create mode 100644 tune/hgcd_reduce_1.c
 create mode 100644 tune/hgcd_reduce_2.c

diff --git a/ChangeLog b/ChangeLog
index 7bd87d97e..3187619a5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,29 @@
 2011-11-11  Niels Möller  <nisse@lysator.liu.se>
 
+	* tune/hgcd_reduce_2.c: New file.
+	* tune/hgcd_reduce_1.c: New file.
+
+	* tune/tuneup.c (hgcd_appr_threshold): New threshold variable.
+	(hgcd_reduce_threshold): Likewise.
+	(tune_hgcd_appr): New function.
+	(tune_hgcd_reduce): New function.
+	(all): Call tune_hgcd_appr and tune_hgcd_reduce.
+
+	* tune/speed.h (speed_mpn_hgcd_reduce): Declaration.
+	(speed_mpn_hgcd_reduce_[12]): Likewise.
+	(mpn_hgcd_reduce_[12]): Likewise.
+	(SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL): New macro.
+
+	* tune/speed.c (routine): Added mpn_hgcd_reduce,
+	mpn_hgcd_reduce_1, and mpn_hgcd_reduce_2.
+
+	* tune/common.c (speed_mpn_hgcd_reduce): New function.
+	(speed_mpn_hgcd_reduce_[12]): Likewise.
+
+	* tune/Makefile.am (libspeed_la_SOURCES): Added hgcd_reduce_1.c
+	hgcd_reduce_2.c.
+	(TUNE_MPN_SRCS_BASIC): Added hgcd_appr.c and hgcd_reduce.c.
+
 	* mpn/generic/hgcd_appr.c (submul, hgcd_matrix_apply): Deleted
 	functions, earlier copied to hgcd_reduce.c.
 	(mpn_hgcd_appr): Use hgcd_reduce.
diff --git a/tune/Makefile.am b/tune/Makefile.am
index e54c020d4..117e5ca2c 100644
--- a/tune/Makefile.am
+++ b/tune/Makefile.am
@@ -43,7 +43,8 @@ libspeed_la_SOURCES =							\
   common.c divrem1div.c divrem1inv.c divrem2div.c divrem2inv.c		\
   freq.c								\
   gcdext_single.c gcdext_double.c gcdextod.c gcdextos.c			\
-  hgcd_lehmer.c jacbase1.c jacbase2.c jacbase3.c jacbase4.c		\
+  hgcd_lehmer.c hgcd_reduce_1.c hgcd_reduce_2.c				\
+  jacbase1.c jacbase2.c jacbase3.c jacbase4.c				\
   mod_1_div.c mod_1_inv.c mod_1_1-1.c mod_1_1-2.c modlinv.c		\
   noop.c powm_mod.c powm_redc.c pre_divrem_1.c				\
   set_strb.c set_strs.c set_strp.c time.c
@@ -129,7 +130,9 @@ TUNE_MPN_SRCS = $(TUNE_MPN_SRCS_BASIC) divrem_1.c mod_1.c
 TUNE_MPN_SRCS_BASIC = div_qr_2.c bdiv_q.c bdiv_qr.c			\
   dcpi1_div_qr.c dcpi1_divappr_q.c dcpi1_bdiv_qr.c dcpi1_bdiv_q.c	\
   invertappr.c invert.c binvert.c divrem_2.c gcd.c gcdext.c		\
-  get_str.c set_str.c matrix22_mul.c hgcd.c mul_n.c sqr.c		\
+  get_str.c set_str.c matrix22_mul.c					\
+  hgcd.c hgcd_appr.c hgcd_reduce.c					\
+  mul_n.c sqr.c								\
   mullo_n.c mul_fft.c mul.c tdiv_qr.c mulmod_bnm1.c sqrmod_bnm1.c	\
   mulmid.c mulmid_n.c toom42_mulmid.c					\
   nussbaumer_mul.c toom6h_mul.c toom8h_mul.c toom6_sqr.c toom8_sqr.c	\
diff --git a/tune/common.c b/tune/common.c
index eb2d4ba1a..cc333a470 100644
--- a/tune/common.c
+++ b/tune/common.c
@@ -1538,6 +1538,22 @@ speed_mpn_hgcd_appr (struct speed_params *s)
   SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_appr, mpn_hgcd_appr_itch);
 }
 
+double
+speed_mpn_hgcd_reduce (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce, mpn_hgcd_reduce_itch);
+}
+double
+speed_mpn_hgcd_reduce_1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce_1, mpn_hgcd_reduce_1_itch);
+}
+double
+speed_mpn_hgcd_reduce_2 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce_2, mpn_hgcd_reduce_2_itch);
+}
+
 double
 speed_mpn_gcd (struct speed_params *s)
 {
diff --git a/tune/hgcd_reduce_1.c b/tune/hgcd_reduce_1.c
new file mode 100644
index 000000000..996362414
--- /dev/null
+++ b/tune/hgcd_reduce_1.c
@@ -0,0 +1,30 @@
+/* mpn/generic/hgcd_reduce.c forced to use hgcd. */
+
+/*
+Copyright 2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#undef  HGCD_REDUCE_THRESHOLD
+#define HGCD_REDUCE_THRESHOLD MP_SIZE_T_MAX
+#define __gmpn_hgcd_reduce  mpn_hgcd_reduce_1
+#define __gmpn_hgcd_reduce_itch  mpn_hgcd_reduce_1_itch
+
+
+#include "../mpn/generic/hgcd_reduce.c"
diff --git a/tune/hgcd_reduce_2.c b/tune/hgcd_reduce_2.c
new file mode 100644
index 000000000..1eed4ba11
--- /dev/null
+++ b/tune/hgcd_reduce_2.c
@@ -0,0 +1,29 @@
+/* mpn/generic/hgcd_reduce.c forced to use hgcd_appr. */
+
+/*
+Copyright 2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#undef  HGCD_REDUCE_THRESHOLD
+#define HGCD_REDUCE_THRESHOLD 0
+#define __gmpn_hgcd_reduce mpn_hgcd_reduce_2
+#define __gmpn_hgcd_reduce_itch mpn_hgcd_reduce_2_itch
+
+#include "../mpn/generic/hgcd_reduce.c"
diff --git a/tune/speed.c b/tune/speed.c
index 061517e28..08c13e776 100644
--- a/tune/speed.c
+++ b/tune/speed.c
@@ -279,6 +279,10 @@ const struct routine_t {
   { "mpn_hgcd_lehmer",   speed_mpn_hgcd_lehmer      },
   { "mpn_hgcd_appr",     speed_mpn_hgcd_appr        },
 
+  { "mpn_hgcd_reduce",   speed_mpn_hgcd_reduce      },
+  { "mpn_hgcd_reduce_1", speed_mpn_hgcd_reduce_1    },
+  { "mpn_hgcd_reduce_2", speed_mpn_hgcd_reduce_2    },
+  
   { "mpn_gcd_1",         speed_mpn_gcd_1,  FLAG_R_OPTIONAL },
   { "mpn_gcd_1N",        speed_mpn_gcd_1N, FLAG_R_OPTIONAL },
 
diff --git a/tune/speed.h b/tune/speed.h
index 70484d391..5add58720 100644
--- a/tune/speed.h
+++ b/tune/speed.h
@@ -198,6 +198,9 @@ double speed_mpn_matrix22_mul __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_hgcd __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_hgcd_lehmer __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_hgcd_appr __GMP_PROTO ((struct speed_params *s));
+double speed_mpn_hgcd_reduce __GMP_PROTO ((struct speed_params *s));
+double speed_mpn_hgcd_reduce_1 __GMP_PROTO ((struct speed_params *s));
+double speed_mpn_hgcd_reduce_2 __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_gcd __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_gcd_1 __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_gcd_1N __GMP_PROTO ((struct speed_params *s));
@@ -488,6 +491,16 @@ mp_size_t mpn_hgcd_lehmer
   __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr));
 #define MPN_HGCD_LEHMER_ITCH(n) (n)
 
+mp_size_t mpn_hgcd_reduce_1
+  __GMP_PROTO ((struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr));
+mp_size_t mpn_hgcd_reduce_1_itch
+  __GMP_PROTO ((mp_size_t, mp_size_t));
+
+mp_size_t mpn_hgcd_reduce_2
+  __GMP_PROTO ((struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr));
+mp_size_t mpn_hgcd_reduce_2_itch
+  __GMP_PROTO ((mp_size_t, mp_size_t));
+
 mp_limb_t mpn_sb_divrem_mn_div __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t));
 mp_limb_t mpn_sb_divrem_mn_inv __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t));
 
@@ -2706,6 +2719,57 @@ int speed_routine_count_zeros_setup
     return t;								\
   }
 
+#define SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL(func, itchfunc)		\
+  {									\
+    mp_size_t hgcd_init_itch, hgcd_step_itch;				\
+    mp_ptr ap, bp, wp, tmp1;						\
+    struct hgcd_matrix hgcd;						\
+    mp_size_t p = s->size/2;						\
+    int res;								\
+    unsigned i;								\
+    double t;								\
+    TMP_DECL;								\
+    									\
+    if (s->size < 2)							\
+      return -1;							\
+    									\
+    TMP_MARK;								\
+    									\
+    SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp);		\
+    SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp);		\
+    									\
+    s->xp[s->size - 1] |= 1;						\
+    s->yp[s->size - 1] |= 1;						\
+    									\
+    hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size);		\
+    hgcd_step_itch = itchfunc (s->size, p);				\
+    									\
+    SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (wp, hgcd_step_itch, s->align_wp);			\
+    									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, s->yp, s->size);				\
+    speed_operand_dst (s, ap, s->size + 1);				\
+    speed_operand_dst (s, bp, s->size + 1);				\
+    speed_operand_dst (s, wp, hgcd_step_itch);				\
+    speed_operand_dst (s, tmp1, hgcd_init_itch);			\
+    speed_cache_fill (s);						\
+    									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+	MPN_COPY (ap, s->xp, s->size);					\
+	MPN_COPY (bp, s->yp, s->size);					\
+	mpn_hgcd_matrix_init (&hgcd, s->size, tmp1);			\
+	res = func (&hgcd, ap, bp, s->size, p, wp);			\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+    TMP_FREE;								\
+    return t;								\
+  }
+
 /* Run some GCDs of s->size limbs each.  The number of different data values
    is decreased as s->size**2, since GCD is a quadratic algorithm.
    SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT
diff --git a/tune/tuneup.c b/tune/tuneup.c
index 4f53c979c..c62a25362 100644
--- a/tune/tuneup.c
+++ b/tune/tuneup.c
@@ -195,6 +195,8 @@ mp_size_t  redc_2_to_redc_n_threshold   = MP_SIZE_T_MAX;
 mp_size_t  powm_threshold               = MP_SIZE_T_MAX;
 mp_size_t  matrix22_strassen_threshold  = MP_SIZE_T_MAX;
 mp_size_t  hgcd_threshold               = MP_SIZE_T_MAX;
+mp_size_t  hgcd_appr_threshold          = MP_SIZE_T_MAX;
+mp_size_t  hgcd_reduce_threshold        = MP_SIZE_T_MAX;
 mp_size_t  gcd_accel_threshold          = MP_SIZE_T_MAX;
 mp_size_t  gcd_dc_threshold             = MP_SIZE_T_MAX;
 mp_size_t  gcdext_dc_threshold          = MP_SIZE_T_MAX;
@@ -1754,6 +1756,27 @@ tune_hgcd (void)
   one (&hgcd_threshold, &param);
 }
 
+void
+tune_hgcd_appr (void)
+{
+  static struct param_t  param;
+  param.name = "HGCD_APPR_THRESHOLD";
+  param.function = speed_mpn_hgcd_appr;
+  /* We seem to get strange results for small sizes */
+  param.min_size = 30;
+  one (&hgcd_appr_threshold, &param);
+}
+
+void
+tune_hgcd_reduce (void)
+{
+  static struct param_t  param;
+  param.name = "HGCD_REDUCE_THRESHOLD";
+  param.function = speed_mpn_hgcd_reduce;
+  param.min_size = 30;
+  one (&hgcd_reduce_threshold, &param);
+}
+
 void
 tune_gcd_dc (void)
 {
@@ -2579,6 +2602,8 @@ all (void)
 
   tune_matrix22_mul ();
   tune_hgcd ();
+  tune_hgcd_appr ();
+  tune_hgcd_reduce();
   tune_gcd_dc ();
   tune_gcdext_dc ();
   tune_jacobi_base ();
-- 
cgit v1.2.1


From 11d8c9b34ef96cb653f6af6124af9f54767805a8 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Fri, 11 Nov 2011 23:33:52 +0100
Subject: Remove file, now part of sqr_basecase.asm.

---
 mpn/powerpc64/mode64/sqr_diag_addlsh1.asm | 239 ------------------------------
 1 file changed, 239 deletions(-)
 delete mode 100644 mpn/powerpc64/mode64/sqr_diag_addlsh1.asm

diff --git a/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm b/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm
deleted file mode 100644
index a1903cb6e..000000000
--- a/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm
+++ /dev/null
@@ -1,239 +0,0 @@
-dnl  PowerPC-64 mpn_sqr_diag_addlsh1
-
-dnl  Copyright 2011 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of the GNU Lesser General Public License as published
-dnl  by the Free Software Foundation; either version 3 of the License, or (at
-dnl  your option) any later version.
-
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
-dnl  License for more details.
-
-dnl  You should have received a copy of the GNU Lesser General Public License
-dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C               cycles/limb
-C POWER3/PPC630      10
-C POWER4/PPC970       6
-C POWER5              5.375
-C POWER6              8.5
-C POWER7              3.4
-
-C NOTES
-C  * This was written for POWER6 and its preferences for adjacent integer
-C    multiply insns.  The cost is that we get a large set of live registers,
-C    and therefore need to save 9 callee-saves registers.  Except for the
-C    multiply insns, the code was not carefully optimised for POWER6 or any
-C    other CPU.
-C  * Perform some cross-jumping in the feed-in code, into the loop's tail.
-
-C refmpn_sqr_diag_addlsh1 (mp_ptr rp, mp_srcptr tp, mp_srcptr up, mp_size_t n)
-
-C INPUT PARAMETERS
-define(`rp',  `r3')
-define(`tp',  `r4')
-define(`up',  `r5')
-define(`n',   `r6')
-
-define(`climb',	`r0')
-
-ASM_START()
-PROLOGUE(mpn_sqr_diag_addlsh1)
-	std	r31,  -8(r1)
-	std	r30, -16(r1)
-	std	r29, -24(r1)
-	std	r28, -32(r1)
-	std	r27, -40(r1)
-	std	r26, -48(r1)
-	std	r25, -56(r1)
-	std	r24, -64(r1)
-	std	r23, -72(r1)
-
-	rldicl.	r0, n, 0,62		C r0 = n & 3, set cr0
-	cmpdi	cr6, r0, 2
-	addi	n, n, 2	 		C compute count...
-	srdi	n, n, 2			C ...for ctr
-	mtctr	n			C put loop count into ctr
-	beq	cr0, L(b0)
-	blt	cr6, L(b1)
-	beq	cr6, L(b2)
-
-L(b3):	ld	r6,   0(up)
-	ld	r7,   8(up)
-	ld	r12, 16(up)
-	addi	up, up, 24
-	mulld	r24, r6, r6
-	mulhdu	r25, r6, r6
-	mulld	r26, r7, r7
-	mulhdu	r27, r7, r7
-	mulld	r28, r12, r12
-	mulhdu	r29, r12, r12
-	ld	r10, 0(tp)
-	ld	r11, 8(tp)
-	ld	r6, 16(tp)
-	ld	r7, 24(tp)
-	addi	tp, tp, 32
-	addc	r10, r10, r10
-	adde	r11, r11, r11
-	adde	r6, r6, r6
-	adde	r7, r7, r7
-	addze	climb, r29
-	addc	r10, r10, r25
-	adde	r11, r11, r26
-	adde	r6, r6, r27
-	adde	r7, r7, r28
-	std	r24,  0(rp)
-	std	r10,  8(rp)
-	std	r11, 16(rp)
-	std	r6,  24(rp)
-	std	r7,  32(rp)
-	addi	rp, rp, 40
-	bdnz	L(top)
-	b	L(end)
-
-L(b2):	ld	r6,  0(up)
-	ld	r7,  8(up)
-	addi	up, up, 16
-	mulld	r24, r6, r6
-	mulhdu	r25, r6, r6
-	mulld	r26, r7, r7
-	mulhdu	r27, r7, r7
-	ld	r10,  0(tp)
-	ld	r11,  8(tp)
-	addi	tp, tp, 16
-	addc	r10, r10, r10
-	adde	r11, r11, r11
-	addze	climb, r27
-	addc	r10, r10, r25
-	adde	r11, r11, r26
-	std	r24,  0(rp)
-	std	r10,  8(rp)
-	std	r11, 16(rp)
-	addi	rp, rp, 24
-	bdnz	L(top)
-	b	L(end)
-
-L(b0):	ld	r6,   0(up)
-	ld	r7,   8(up)
-	ld	r12, 16(up)
-	ld	r23, 24(up)
-	addi	up, up, 32
-	mulld	r24, r6, r6
-	mulhdu	r25, r6, r6
-	mulld	r26, r7, r7
-	mulhdu	r27, r7, r7
-	mulld	r28, r12, r12
-	mulhdu	r29, r12, r12
-	mulld	r30, r23, r23
-	mulhdu	r31, r23, r23
-	ld	r10,  0(tp)
-	ld	r11,  8(tp)
-	ld	r6,  16(tp)
-	ld	r7,  24(tp)
-	ld	r12, 32(tp)
-	ld	r23, 40(tp)
-	addi	tp, tp, 48
-	addc	r10, r10, r10
-	adde	r11, r11, r11
-	adde	r6, r6, r6
-	adde	r7, r7, r7
-	adde	r12, r12, r12
-	adde	r23, r23, r23
-	addze	climb, r31
-	std	r24,  0(rp)
-	addc	r10, r10, r25
-	std	r10,  8(rp)
-	adde	r11, r11, r26
-	std	r11, 16(rp)
-	adde	r6, r6, r27
-	std	r6,  24(rp)
-	adde	r7, r7, r28
-	std	r7,  32(rp)
-	adde	r12, r12, r29
-	std	r12, 40(rp)
-	adde	r23, r23, r30
-	std	r23, 48(rp)
-	addi	rp, rp, 56
-	bdnz	L(top)
-	b	L(end)
-
-L(b1):	ld	r6, 0(up)
-	addi	up, up, 8
-	mulld	r24, r6, r6
-	mulhdu	climb, r6, r6
-	std	r24, 0(rp)
-	addic	rp, rp, 8		C clear carry as side-effect
-
-	ALIGN(32)
-L(top):	ld	r6,   0(up)
-	ld	r7,   8(up)
-	ld	r12, 16(up)
-	ld	r23, 24(up)
-	addi	up, up, 32
-	mulld	r24, r6, r6
-	mulhdu	r25, r6, r6
-	mulld	r26, r7, r7
-	mulhdu	r27, r7, r7
-	mulld	r28, r12, r12
-	mulhdu	r29, r12, r12
-	mulld	r30, r23, r23
-	mulhdu	r31, r23, r23
-	ld	r8,   0(tp)
-	ld	r9,   8(tp)
-	adde	r8, r8, r8
-	adde	r9, r9, r9
-	ld	r10, 16(tp)
-	ld	r11, 24(tp)
-	adde	r10, r10, r10
-	adde	r11, r11, r11
-	ld	r6,  32(tp)
-	ld	r7,  40(tp)
-	adde	r6, r6, r6
-	adde	r7, r7, r7
-	ld	r12, 48(tp)
-	ld	r23, 56(tp)
-	adde	r12, r12, r12
-	adde	r23, r23, r23
-	addi	tp, tp, 64
-	addze	r31, r31
-	addc	r8, r8, climb
-	std	r8,   0(rp)
-	adde	r9, r9, r24
-	std	r9,   8(rp)
-	adde	r10, r10, r25
-	std	r10, 16(rp)
-	adde	r11, r11, r26
-	std	r11, 24(rp)
-	adde	r6, r6, r27
-	std	r6,  32(rp)
-	adde	r7, r7, r28
-	std	r7,  40(rp)
-	adde	r12, r12, r29
-	std	r12, 48(rp)
-	adde	r23, r23, r30
-	std	r23, 56(rp)
-	mr	climb, r31
-	addi	rp, rp, 64
-	bdnz	L(top)
-
-L(end):	addze	climb, climb
-	std	climb,  0(rp)
-
-L(ret):	ld	r31, -8(r1)
-	ld	r30, -16(r1)
-	ld	r29, -24(r1)
-	ld	r28, -32(r1)
-	ld	r27, -40(r1)
-	ld	r26, -48(r1)
-	ld	r25, -56(r1)
-	ld	r24, -64(r1)
-	ld	r23, -72(r1)
-	blr
-EPILOGUE()
-- 
cgit v1.2.1


From 4b6d13c7c4cac584b8f8391eeaa87f335417ceec Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Fri, 11 Nov 2011 23:37:19 +0100
Subject: *** empty log message ***

---
 ChangeLog | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index 3187619a5..e13e336e0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2011-11-11  Torbjorn Granlund  <tege@gmplib.org>
+
+	* mpn/powerpc64/mode64/sqr_diag_addlsh1.asm: Remove.
+
 2011-11-11  Niels Möller  <nisse@lysator.liu.se>
 
 	* tune/hgcd_reduce_2.c: New file.
-- 
cgit v1.2.1


From 890e8c8008d6518223533612dfe95b07db2c696d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Sun, 13 Nov 2011 20:24:47 +0100
Subject: Tweaked tuning setup for hgcd_appr.

---
 ChangeLog     | 6 ++++++
 tune/tuneup.c | 4 ++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index e13e336e0..71c2427e1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2011-11-13  Niels Möller  <nisse@lysator.liu.se>
+
+	* tune/tuneup.c (tune_hgcd_appr): Use default min_size.
+	(tune_hgcd_reduce): Increase max_size and step_factor, to 7000
+	and 0.04, respectively.
+
 2011-11-11  Torbjorn Granlund  <tege@gmplib.org>
 
 	* mpn/powerpc64/mode64/sqr_diag_addlsh1.asm: Remove.
diff --git a/tune/tuneup.c b/tune/tuneup.c
index c62a25362..ce1db103d 100644
--- a/tune/tuneup.c
+++ b/tune/tuneup.c
@@ -1762,8 +1762,6 @@ tune_hgcd_appr (void)
   static struct param_t  param;
   param.name = "HGCD_APPR_THRESHOLD";
   param.function = speed_mpn_hgcd_appr;
-  /* We seem to get strange results for small sizes */
-  param.min_size = 30;
   one (&hgcd_appr_threshold, &param);
 }
 
@@ -1774,6 +1772,8 @@ tune_hgcd_reduce (void)
   param.name = "HGCD_REDUCE_THRESHOLD";
   param.function = speed_mpn_hgcd_reduce;
   param.min_size = 30;
+  param.max_size = 7000;
+  param.step_factor = 0.04;
   one (&hgcd_reduce_threshold, &param);
 }
 
-- 
cgit v1.2.1


From e037315eefee1b249bbe052bfd84c1a1c01c6f72 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Sun, 13 Nov 2011 21:31:57 +0100
Subject: Add support for POWM_SEC_TABLE table.

---
 mpn/generic/powm_sec.c |  14 ++++-
 tune/Makefile.am       |   2 +-
 tune/tuneup.c          | 135 ++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 147 insertions(+), 4 deletions(-)

diff --git a/mpn/generic/powm_sec.c b/mpn/generic/powm_sec.c
index 3a6f55403..c6358947b 100644
--- a/mpn/generic/powm_sec.c
+++ b/mpn/generic/powm_sec.c
@@ -189,15 +189,27 @@ getbits (const mp_limb_t *p, mp_bitcnt_t bi, int nbits)
     }
 }
 
+#ifndef POWM_SEC_TABLE
+#if GMP_NUMB_BITS < 50
+#define POWM_SEC_TABLE  2,33,96,780,2741
+#else
+#define POWM_SEC_TABLE  2,130,524,2578
+#endif
+#endif
+
+#if TUNE_PROGRAM_BUILD
+extern int win_size (mp_bitcnt_t);
+#else
 static inline int
 win_size (mp_bitcnt_t eb)
 {
   int k;
-  static mp_bitcnt_t x[] = {0,4,27,100,325,1026,2905,7848,20457,51670,~(mp_bitcnt_t)0};
+  static mp_bitcnt_t x[] = {0,POWM_SEC_TABLE,~(mp_bitcnt_t)0};
   for (k = 1; eb > x[k]; k++)
     ;
   return k;
 }
+#endif
 
 /* Convert U to REDC form, U_r = B^n * U mod M */
 static void
diff --git a/tune/Makefile.am b/tune/Makefile.am
index 117e5ca2c..38b1fe9d2 100644
--- a/tune/Makefile.am
+++ b/tune/Makefile.am
@@ -132,7 +132,7 @@ TUNE_MPN_SRCS_BASIC = div_qr_2.c bdiv_q.c bdiv_qr.c			\
   invertappr.c invert.c binvert.c divrem_2.c gcd.c gcdext.c		\
   get_str.c set_str.c matrix22_mul.c					\
   hgcd.c hgcd_appr.c hgcd_reduce.c					\
-  mul_n.c sqr.c								\
+  mul_n.c sqr.c powm_sec.c						\
   mullo_n.c mul_fft.c mul.c tdiv_qr.c mulmod_bnm1.c sqrmod_bnm1.c	\
   mulmid.c mulmid_n.c toom42_mulmid.c					\
   nussbaumer_mul.c toom6h_mul.c toom8h_mul.c toom6_sqr.c toom8_sqr.c	\
diff --git a/tune/tuneup.c b/tune/tuneup.c
index ce1db103d..c30d19d6b 100644
--- a/tune/tuneup.c
+++ b/tune/tuneup.c
@@ -192,7 +192,6 @@ mp_size_t  binv_newton_threshold        = MP_SIZE_T_MAX;
 mp_size_t  redc_1_to_redc_2_threshold   = MP_SIZE_T_MAX;
 mp_size_t  redc_1_to_redc_n_threshold   = MP_SIZE_T_MAX;
 mp_size_t  redc_2_to_redc_n_threshold   = MP_SIZE_T_MAX;
-mp_size_t  powm_threshold               = MP_SIZE_T_MAX;
 mp_size_t  matrix22_strassen_threshold  = MP_SIZE_T_MAX;
 mp_size_t  hgcd_threshold               = MP_SIZE_T_MAX;
 mp_size_t  hgcd_appr_threshold          = MP_SIZE_T_MAX;
@@ -1801,6 +1800,134 @@ tune_gcdext_dc (void)
   one (&gcdext_dc_threshold, &param);
 }
 
+/* In tune_powm_sec we compute the table used by the win_size function.  The
+   cutoff points are in exponent bits, disregarding other operand sizes.  It is
+   not possible to use the one framework since it currently uses a granularity
+   of full limbs.
+*/
+
+/* This win_size replaces the variant in the powm code, allowing us to
+   control k in the k-ary algorithms.  */
+int winsize;
+int
+win_size (mp_bitcnt_t eb)
+{
+  return winsize;
+}
+
+void
+tune_powm_sec (void)
+{
+  mp_size_t n;
+  int k, i;
+  mp_size_t itch;
+  mp_bitcnt_t nbits, nbits_next, possible_nbits_cutoff;
+  const int n_max = 3000 / GMP_NUMB_BITS;
+  const int n_measurements = 5;
+  mp_ptr rp, bp, ep, mp, tp;
+  double ttab[n_measurements], tk, tkp1;
+  TMP_DECL;
+  TMP_MARK;
+
+  possible_nbits_cutoff = 0;
+
+  k = 1;
+
+  winsize = 10;			/* the itch function needs this */
+  itch = mpn_powm_sec_itch (n_max, n_max, n_max);
+
+  rp = TMP_ALLOC_LIMBS (n_max);
+  bp = TMP_ALLOC_LIMBS (n_max);
+  ep = TMP_ALLOC_LIMBS (n_max);
+  mp = TMP_ALLOC_LIMBS (n_max);
+  tp = TMP_ALLOC_LIMBS (itch);
+
+  mpn_random (bp, n_max);
+  mpn_random (mp, n_max);
+  mp[0] |= 1;
+
+/* How about taking the M operand size into account?
+
+   An operation R=powm(B,E,N) will take time O(log(E)*M(log(N))) (assuming
+   B = O(M)).
+
+   Using k-ary and no sliding window, the precomputation will need time
+   O(2^(k-1)*M(log(N))) and the main computation will need O(log(E)*S(N)) +
+   O(log(E)/k*M(N)), for the squarings, multiplications, respectively.
+
+   An operation R=powm_sec(B,E,N) will take time like powm.
+
+   Using k-ary, the precomputation will need time O(2^k*M(log(N))) and the
+   main computation will need O(log(E)*S(N)) + O(log(E)/k*M(N)) +
+   O(log(E)/k*2^k*log(N)), for the squarings, multiplications, and full
+   table reads, respectively.  */
+
+  printf ("#define POWM_SEC_TABLE  ");
+
+  for (nbits = 1; nbits <= n_max * GMP_NUMB_BITS; )
+    {
+      n = (nbits - 1) / GMP_NUMB_BITS + 1;
+
+      /* Generate E such that sliding-window for k and k+1 works equally
+	 well/poorly (but sliding is not used in powm_sec, of course). */
+      for (i = 0; i < n; i++)
+	ep[i] = ~CNST_LIMB(0);
+
+      /* Truncate E to be exactly nbits large.  */
+      if (nbits % GMP_NUMB_BITS != 0)
+	mpn_rshift (ep, ep, n, GMP_NUMB_BITS - nbits % GMP_NUMB_BITS);
+      ep[n - 1] |= CNST_LIMB(1) << (nbits - 1) % GMP_NUMB_BITS;
+
+      winsize = k;
+      for (i = 0; i < n_measurements; i++)
+	{
+	  speed_starttime ();
+	  mpn_powm_sec (rp, bp, n, ep, n, mp, n, tp);
+	  ttab[i] = speed_endtime ();
+	}
+      tk = median (ttab, n_measurements);
+
+      winsize = k + 1;
+      speed_starttime ();
+      for (i = 0; i < n_measurements; i++)
+	{
+	  speed_starttime ();
+	  mpn_powm_sec (rp, bp, n, ep, n, mp, n, tp);
+	  ttab[i] = speed_endtime ();
+	}
+      tkp1 = median (ttab, n_measurements);
+/*
+      printf ("testing: %ld, %d", nbits, k, ep[n-1]);
+      printf ("   %10.5f  %10.5f\n", tk, tkp1);
+*/
+      if (tkp1 < tk)
+	{
+	  if (possible_nbits_cutoff)
+	    {
+	      /* Two consecutive sizes indicate k increase, obey.  */
+	      if (k > 1)
+		printf (",");
+	      printf ("%ld", (long) possible_nbits_cutoff);
+	      k++;
+	      possible_nbits_cutoff = 0;
+	    }
+	  else
+	    {
+	      /* One measurement indicates k increase, save nbits for further
+		 consideration.  */
+	      possible_nbits_cutoff = nbits;
+	    }
+	}
+      else
+	possible_nbits_cutoff = 0;
+
+      nbits_next = nbits * 65 / 64;
+      nbits = nbits_next + (nbits_next == nbits);
+    }
+  printf ("\n");
+  TMP_FREE;
+}
+
 
 /* size_extra==1 reflects the fact that with high<divisor one division is
    always skipped.  Forcing high<divisor while testing ensures consistency
@@ -1896,7 +2023,6 @@ tune_mod_1 (void)
     {
       static struct param_t  param;
       double   t1, t2;
-      int      method;
 
       s.size = 10;
       s.r = randlimb_half ();
@@ -2575,6 +2701,11 @@ all (void)
   tune_sqrmod_bnm1 ();
   printf("\n");
 
+#if 1
+  tune_powm_sec ();
+  printf("\n");
+#endif
+
   tune_fft_mul ();
   printf("\n");
 
-- 
cgit v1.2.1


From 305da91c761535cf2b6fcdf4239aae04fa72e5da Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Sun, 13 Nov 2011 21:33:48 +0100
Subject: *** empty log message ***

---
 ChangeLog | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index 71c2427e1..262ff7215 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,16 @@
+2011-11-13  Torbjorn Granlund  <tege@gmplib.org>
+
+	* tune/Makefile.am (TUNE_MPN_SRCS_BASIC): Add powm_sec.c.
+
+	* mpn/generic/powm_sec.c (win_size): Use POWM_SEC_TABLE
+	(POWM_SEC_TABLE): Define default.
+
+	* tune/tuneup.c (tune_powm_sec): New function computing POWM_SEC_TABLE.
+	(all): Call new function.
+
+	* mpn/generic/powm_sec.c (win_size): Define only when
+	TUNE_PROGRAM_BUILD is not set.
+
 2011-11-13  Niels M�ller  <nisse@lysator.liu.se>
 
 	* tune/tuneup.c (tune_hgcd_appr): Use default min_size.
-- 
cgit v1.2.1


From e1d8e2b8173bbd8e9b034722206979eef782df2c Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Tue, 15 Nov 2011 00:49:29 +0100
Subject: Separate mpn_tabselect from mpn_powm_sec and prepare for asm support.

---
 ChangeLog               | 10 ++++++++++
 configure.in            |  5 ++++-
 mpn/asm-defs.m4         |  1 +
 mpn/generic/powm_sec.c  | 25 -------------------------
 mpn/generic/tabselect.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 63 insertions(+), 26 deletions(-)
 create mode 100644 mpn/generic/tabselect.c

diff --git a/ChangeLog b/ChangeLog
index 262ff7215..0491b1574 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+2011-11-15  Torbjorn Granlund  <tege@gmplib.org>
+
+	* mpn/asm-defs.m4 (define_mpn): Add tabselect.
+
+	* configure.in (gmp_mpn_functions): Add tabselect.
+	(HAVE_NATIVE): Add entries for addncd_n, subcnd_n, tabselect.
+	
+	* mpn/generic/powm_sec.c: Remove mpn_tabselect implementation.
+	* mpn/generic/tabselect.c: New file with removed code.
+
 2011-11-13  Torbjorn Granlund  <tege@gmplib.org>
 
 	* tune/Makefile.am (TUNE_MPN_SRCS_BASIC): Add powm_sec.c.
diff --git a/configure.in b/configure.in
index 6c9a313c3..eedab0eca 100644
--- a/configure.in
+++ b/configure.in
@@ -2640,7 +2640,7 @@ gmp_mpn_functions="$extra_functions					   \
   divexact bdiv_dbm1c redc_1 redc_2 redc_n powm powlo powm_sec		   \
   trialdiv remove							   \
   and_n andn_n nand_n ior_n iorn_n nior_n xor_n xnor_n			   \
-  copyi copyd zero							   \
+  copyi copyd zero tabselect						   \
   $gmp_mpn_functions_optional"
 
 define(GMP_MULFUNC_CHOICES,
@@ -3103,6 +3103,7 @@ AH_VERBATIM([HAVE_NATIVE],
 #undef HAVE_NATIVE_mpn_add_n_sub_n
 #undef HAVE_NATIVE_mpn_add_nc
 #undef HAVE_NATIVE_mpn_addaddmul_1msb0
+#undef HAVE_NATIVE_mpn_addcnd_n
 #undef HAVE_NATIVE_mpn_addlsh1_n
 #undef HAVE_NATIVE_mpn_addlsh2_n
 #undef HAVE_NATIVE_mpn_addlsh_n
@@ -3191,6 +3192,7 @@ AH_VERBATIM([HAVE_NATIVE],
 #undef HAVE_NATIVE_mpn_sqr_diag_addlsh1
 #undef HAVE_NATIVE_mpn_sub_n
 #undef HAVE_NATIVE_mpn_sub_nc
+#undef HAVE_NATIVE_mpn_subcnd_n
 #undef HAVE_NATIVE_mpn_sublsh1_n
 #undef HAVE_NATIVE_mpn_sublsh2_n
 #undef HAVE_NATIVE_mpn_sublsh_n
@@ -3204,6 +3206,7 @@ AH_VERBATIM([HAVE_NATIVE],
 #undef HAVE_NATIVE_mpn_sublsh2_nc_ip1
 #undef HAVE_NATIVE_mpn_sublsh_nc_ip1
 #undef HAVE_NATIVE_mpn_submul_1c
+#undef HAVE_NATIVE_mpn_tabselect
 #undef HAVE_NATIVE_mpn_udiv_qrnnd
 #undef HAVE_NATIVE_mpn_udiv_qrnnd_r
 #undef HAVE_NATIVE_mpn_umul_ppmm
diff --git a/mpn/asm-defs.m4 b/mpn/asm-defs.m4
index 4f049b21b..7a5639fbe 100644
--- a/mpn/asm-defs.m4
+++ b/mpn/asm-defs.m4
@@ -1471,6 +1471,7 @@ define_mpn(sub_n)
 define_mpn(sub_nc)
 define_mpn(submul_1)
 define_mpn(submul_1c)
+define_mpn(tabselect)
 define_mpn(umul_ppmm)
 define_mpn(umul_ppmm_r)
 define_mpn(udiv_qrnnd)
diff --git a/mpn/generic/powm_sec.c b/mpn/generic/powm_sec.c
index c6358947b..d7ed2b486 100644
--- a/mpn/generic/powm_sec.c
+++ b/mpn/generic/powm_sec.c
@@ -320,31 +320,6 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
   TMP_FREE;
 }
 
-#if ! HAVE_NATIVE_mpn_tabselect
-/* Select entry `which' from table `tab', which has nents entries, each `n'
-   limbs.  Store the selected entry at rp.  Reads entire table to avoid
-   side-channel information leaks.  O(n*nents).
-   FIXME: Move to its own file.  */
-void
-mpn_tabselect (volatile mp_limb_t *rp, volatile mp_limb_t *tab, mp_size_t n,
-	       mp_size_t nents, mp_size_t which)
-{
-  mp_size_t k, i;
-  mp_limb_t mask;
-  volatile mp_limb_t *tp;
-
-  for (k = 0; k < nents; k++)
-    {
-      mask = -(mp_limb_t) (which == k);
-      tp = tab + n * k;
-      for (i = 0; i < n; i++)
-	{
-	  rp[i] = (rp[i] & ~mask) | (tp[i] & mask);
-	}
-    }
-}
-#endif
-
 mp_size_t
 mpn_powm_sec_itch (mp_size_t bn, mp_size_t en, mp_size_t n)
 {
diff --git a/mpn/generic/tabselect.c b/mpn/generic/tabselect.c
new file mode 100644
index 000000000..02e52fdc0
--- /dev/null
+++ b/mpn/generic/tabselect.c
@@ -0,0 +1,48 @@
+/* mpn_tabselect.
+
+   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+
+Copyright 2007, 2008, 2009, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+/* Select entry `which' from table `tab', which has nents entries, each `n'
+   limbs.  Store the selected entry at rp.  Reads entire table to avoid
+   side-channel information leaks.  O(n*nents).  If `which' is not in
+   [0, nents), every mask is zero and rp is rewritten with its own value.  */
+void
+mpn_tabselect (volatile mp_limb_t *rp, volatile mp_limb_t *tab, mp_size_t n,
+	       mp_size_t nents, mp_size_t which)
+{
+  mp_size_t k, i;
+  mp_limb_t mask;
+  volatile mp_limb_t *tp;
+
+  for (k = 0; k < nents; k++)
+    {
+      mask = -(mp_limb_t) (which == k);	/* all ones iff k is the wanted entry */
+      tp = tab + n * k;			/* start of table entry k */
+      for (i = 0; i < n; i++)
+	{
+	  rp[i] = (rp[i] & ~mask) | (tp[i] & mask);  /* branch-free select */
+	}
+    }
+}
-- 
cgit v1.2.1


From aebd2151218bded6e4278834b9f082808eef6590 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Tue, 15 Nov 2011 00:53:06 +0100
Subject: Add mpn_tabselect assembly support for powerpc64, x86, x86_64, ia64.

---
 ChangeLog                   |   7 ++-
 mpn/ia64/tabselect.asm      | 139 ++++++++++++++++++++++++++++++++++++++++++++
 mpn/powerpc64/tabselect.asm |  95 ++++++++++++++++++++++++++++++
 mpn/x86/tabselect.asm       | 104 +++++++++++++++++++++++++++++++++
 mpn/x86_64/tabselect.asm    | 108 ++++++++++++++++++++++++++++++++++
 5 files changed, 452 insertions(+), 1 deletion(-)
 create mode 100644 mpn/ia64/tabselect.asm
 create mode 100644 mpn/powerpc64/tabselect.asm
 create mode 100644 mpn/x86/tabselect.asm
 create mode 100644 mpn/x86_64/tabselect.asm

diff --git a/ChangeLog b/ChangeLog
index 0491b1574..b14d2a8da 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,10 +1,15 @@
 2011-11-15  Torbjorn Granlund  <tege@gmplib.org>
 
+	* mpn/powerpc64/tabselect.asm: New file.
+	* mpn/x86_64/tabselect.asm: New file.
+	* mpn/x86/tabselect.asm: New file.
+	* mpn/ia64/tabselect.asm: New file.
+
 	* mpn/asm-defs.m4 (define_mpn): Add tabselect.
 
 	* configure.in (gmp_mpn_functions): Add tabselect.
 	(HAVE_NATIVE): Add entries for addncd_n, subcnd_n, tabselect.
-	
+
 	* mpn/generic/powm_sec.c: Remove mpn_tabselect implementation.
 	* mpn/generic/tabselect.c: New file with removed code.
 
diff --git a/mpn/ia64/tabselect.asm b/mpn/ia64/tabselect.asm
new file mode 100644
index 000000000..0ae3fdcfe
--- /dev/null
+++ b/mpn/ia64/tabselect.asm
@@ -0,0 +1,139 @@
+dnl  IA-64 mpn_tabselect.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:       ?
+C Itanium 2:     5  (estimated)
+
+C NOTES
+C  * Using software pipelining could trivially yield 3 c/l even without
+C    unrolling.  (This code was modelled after the powerpc64 code, for
+C    simplicity.)
+
+C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp',     `r32')
+define(`tp',     `r33')
+define(`n',      `r34')
+define(`nents',  `r35')
+define(`which',  `r36')
+
+define(`mask',   `r8')
+
+define(`rp1',     `r32')
+define(`tp1',     `r33')
+define(`rp2',     `r14')
+define(`tp2',     `r15')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_tabselect)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',`
+.mmi;	addp4	rp = 0, rp		C			M I
+	addp4	tp = 0, tp		C			M I
+	zxt4	n = n			C			I
+.mii;	nop	0
+	zxt4	nents = nents		C			I
+	zxt4	which = which		C			I
+	;;
+')
+.mmi;	add	rp2 = 8, rp1		C rp2/tp2 address the odd-numbered limbs
+	add	tp2 = 8, tp1
+	add	r6 = -2, n
+	;;
+.mii;	and	r9 = 1, n		C r9 = n mod 2, tested for p8 below
+	mov.i	r2 = ar.lc		C save ar.lc, it is a preserved register
+	shr.u	r6 = r6, 1		C inner loop count
+	;;
+.mmi;	cmp.eq	p8, p0 = 0, r9		C p8 = n is even
+	sub	which = nents, which	C equals nents at the selected entry
+	shl	n = n, 3		C n = entry size in bytes from here on
+	;;
+
+L(outer):
+.mmi	cmp.eq	p6, p7 = which, nents	C are we at the selected table entry?
+	cmp.eq	p10, p0 = 8, n		C p10 = entry is one limb (n is bytes)
+	mov	ar.lc = r6		C			I0
+	;;
+.mmb;
+  (p6)	mov	mask = -1
+  (p7)	mov	mask = 0
+  (p8)	br.dptk	L(top)			C branch to loop entry if n even
+	;;
+
+.mmi;	ld8	r16 = [tp1], 8		C n odd: handle one limb before the loop
+	add	tp2 = 8, tp2
+	nop	0
+	;;
+.mmi;	ld8	r18 = [rp1]
+	and	r16 = r16, mask
+	nop	0
+	;;
+.mmi;	andcm	r18 = r18, mask
+	;;
+	or	r16 = r16, r18
+	nop	0
+	;;
+.mmb;	st8	[rp1] = r16, 8
+	add	rp2 = 8, rp2
+  (p10)	br.dpnt	L(end)
+
+	ALIGN(32)
+L(top):
+.mmi;	ld8	r16 = [tp1], 16		C two limbs per iteration
+	ld8	r17 = [tp2], 16
+	nop	0
+	;;
+.mmi;	ld8	r18 = [rp1]
+	and	r16 = r16, mask		C table limb, kept only at selected entry
+	nop	0
+.mmi;	ld8	r19 = [rp2]
+	and	r17 = r17, mask
+	nop	0
+	;;
+.mmi;	andcm	r18 = r18, mask		C current rp limb, kept at other entries
+	andcm	r19 = r19, mask
+	nop	0
+	;;
+.mmi;	or	r16 = r16, r18
+	or	r17 = r17, r19
+	nop	0
+	;;
+.mmb;	st8	[rp1] = r16, 16
+	st8	[rp2] = r17, 16
+	br.cloop.dptk	L(top)
+	;;
+L(end):
+.mmi;	sub	rp1 = rp1, n		C move rp back to beginning
+	sub	rp2 = rp2, n		C move rp back to beginning
+	cmp.ne	p9, p0 = 1, nents
+.mmb;	add	nents = -1, nents
+	nop	0
+  (p9)	br.dptk	L(outer)
+	;;
+
+.mib;	nop	0
+	mov.i	ar.lc = r2		C restore caller's loop counter
+	br.ret.sptk.many b0
+EPILOGUE()
diff --git a/mpn/powerpc64/tabselect.asm b/mpn/powerpc64/tabselect.asm
new file mode 100644
index 000000000..0ac2e9ba0
--- /dev/null
+++ b/mpn/powerpc64/tabselect.asm
@@ -0,0 +1,95 @@
+dnl  PowerPC-64 mpn_tabselect.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C POWER3/PPC630          ?
+C POWER4/PPC970          ?
+C POWER5                 ?
+C POWER6                 ?
+C POWER7                 ?
+
+C NOTES
+C  * This has not been tuned for any specific processor.  Its speed should not
+C    be too bad, though.
+C  * Using VMX could result in significant speedup for certain CPUs.
+
+C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp',     `r3')
+define(`tp',     `r4')
+define(`n',      `r5')
+define(`nents',  `r6')
+define(`which',  `r7')
+
+define(`mask',   `r8')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_tabselect)
+	addi	r0, n, 1
+	srdi	r0, r0, 1		C inner loop count
+	andi.	r9, n, 1		C set cr0 for use in inner loop
+	subf	which, nents, which	C which + nents is zero at selected entry
+	sldi	n, n, 3			C n = entry size in bytes from here on
+
+L(outer):
+	mtctr	r0			C put inner loop count in ctr
+
+	add	r9, which, nents	C are we at the selected table entry?
+	addic	r9, r9, -1		C set CF iff not selected entry
+	subfe	mask, r0, r0		C mask = CF - 1: all ones iff selected
+
+	beq	cr0, L(top)		C branch to loop entry if n even
+
+	ld	r9, 0(tp)		C n odd: handle one limb before the loop
+	and	r9, r9, mask
+	ld	r11, 0(rp)
+	andc	r11, r11, mask
+	or	r9, r9, r11
+	std	r9, 0(rp)
+	addi	tp, tp, 8
+	addi	rp, rp, 8
+	bdz	L(end)			C that limb used up the whole count
+
+	ALIGN(16)
+L(top):	ld	r9, 0(tp)		C two limbs per iteration
+	ld	r10, 8(tp)
+	and	r9, r9, mask		C table limb, kept only at selected entry
+	and	r10, r10, mask
+	ld	r11, 0(rp)
+	ld	r12, 8(rp)
+	andc	r11, r11, mask		C current rp limb, kept at other entries
+	andc	r12, r12, mask
+	or	r9, r9, r11
+	or	r10, r10, r12
+	std	r9, 0(rp)
+	std	r10, 8(rp)
+	addi	tp, tp, 16
+	addi	rp, rp, 16
+	bdnz	L(top)
+
+L(end):	subf	rp, n, rp		C move rp back to beginning
+	addi	nents, nents, -1	C tp is not rewound: it is at next entry
+	cmpdi	cr6, nents, 0		C cr6, so the n parity in cr0 survives
+	bne	cr6, L(outer)
+
+	blr
+EPILOGUE()
diff --git a/mpn/x86/tabselect.asm b/mpn/x86/tabselect.asm
new file mode 100644
index 000000000..ab646dac3
--- /dev/null
+++ b/mpn/x86/tabselect.asm
@@ -0,0 +1,104 @@
+dnl  x86 mpn_tabselect.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			    cycles/limb
+C P5				 ?
+C P6 model 0-8,10-12		 ?
+C P6 model 9  (Banias)		 ?
+C P6 model 13 (Dothan)		 ?
+C P4 model 0  (Willamette)	 ?
+C P4 model 1  (?)		 ?
+C P4 model 2  (Northwood)	 ?
+C P4 model 3  (Prescott)	 ?
+C P4 model 4  (Nocona)		 ?
+C Intel Atom			 ?
+C AMD K6			 ?
+C AMD K7			 ?
+C AMD K8			 ?
+C AMD K10			 ?
+
+C NOTES
+C  * This has not been tuned for any specific processor.  Its speed should not
+C    be too bad, though.
+C  * Using SSE2 could result in many-fold speedup.
+
+C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp',     `%edi')
+define(`tp',     `%esi')
+define(`n',      `%ebx')
+define(`nents',  `%ecx')
+define(`which',  `36(%esp)')
+
+define(`i',      `%ebp')
+define(`maskp',  `20(%esp)')
+define(`maskn',  `32(%esp)')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_tabselect)
+	push	%edi
+	push	%esi
+	push	%ebx
+	push	%ebp			C four pushes: stack args start at 20(%esp)
+	mov	20(%esp), rp
+	mov	24(%esp), tp
+	mov	28(%esp), n
+	mov	32(%esp), nents		C slots 20 and 32 later hold the masks
+
+	lea	(rp,n,4), rp		C point past the entry, index is negative
+	lea	(tp,n,4), tp
+	sub	nents, which		C which + nents is zero at selected entry
+L(outer):
+	mov	which, %eax
+	add	nents, %eax
+	neg	%eax			C set CF iff 'which' != k
+	sbb	%eax, %eax		C all ones unless this entry is selected
+	mov	%eax, maskn
+	not	%eax
+	mov	%eax, maskp		C maskp = complement of maskn
+
+	mov	n, i
+	neg	i			C index runs from -n up to 0
+
+	ALIGN(16)
+L(top):	mov	(tp,i,4), %eax
+	and	maskp, %eax		C table limb, kept only at selected entry
+	mov	(rp,i,4), %edx
+	and	maskn, %edx		C current rp limb, kept at other entries
+	or	%edx, %eax
+	mov	%eax, (rp,i,4)
+	inc	i
+	js	L(top)
+
+L(end):	mov	n, %eax
+	lea	(tp,%eax,4), tp		C advance tp to the next table entry
+	dec	nents
+	jne	L(outer)
+
+L(outer_end):
+	pop	%ebp
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	ret
+EPILOGUE()
diff --git a/mpn/x86_64/tabselect.asm b/mpn/x86_64/tabselect.asm
new file mode 100644
index 000000000..f7de6a85b
--- /dev/null
+++ b/mpn/x86_64/tabselect.asm
@@ -0,0 +1,108 @@
+dnl  AMD64 mpn_tabselect.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C Intel P4	 ?
+C Intel core2	 ?
+C Intel NHM	 ?
+C Intel SBR	 ?
+C Intel atom	 ?
+C VIA nano	 ?
+
+C NOTES
+C  * This has not been tuned for any specific processor.  Its speed should not
+C    be too bad, though.
+C  * Using SSE2/AVX2 could result in many-fold speedup.
+
+C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp',     `%rdi')
+define(`tp',     `%rsi')
+define(`n',      `%rdx')
+define(`nents',  `%rcx')
+define(`which',  `%r8')
+
+define(`i',      `%rbp')
+define(`maskp',  `%r11')
+define(`maskn',  `%r12')
+
+C rax rbx  rcx  rdx rdi rsi rbp (rsp)  r8   r9 r10 r11 r12 r13 r14 r15
+C         nents  n  rp  tab           which
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_tabselect)
+	push	%rbx			C rbx, rbp and r12 are all written below
+	push	%rbp
+	push	%r12
+
+	lea	(rp,n,8), rp		C point past the entry, index is negative
+	lea	(tp,n,8), tp
+	sub	nents, which		C which + nents is zero at selected entry
+L(outer):
+	lea	(which,nents), %rax
+	neg	%rax			C set CF iff 'which' != k
+	sbb	maskn, maskn		C all ones unless this entry is selected
+	mov	maskn, maskp
+	not	maskp			C maskp = complement of maskn
+
+	mov	n, i
+	neg	i			C index runs from -n up to 0
+	test	$1, R32(n)
+	je	L(top)			C n even: straight to the two-limb loop
+	mov	(tp,i,8), %rax		C n odd: handle one limb before the loop
+	and	maskp, %rax
+	mov	(rp,i,8), %r9
+	and	maskn, %r9
+	or	%r9, %rax
+	mov	%rax, (rp,i,8)
+	add	$1, i
+	jns	L(end)			C index reached 0: entry was one limb
+
+	ALIGN(16)
+L(top):	mov	(tp,i,8), %rax		C two limbs per iteration
+	mov	8(tp,i,8), %rbx
+	and	maskp, %rax		C table limb, kept only at selected entry
+	and	maskp, %rbx
+	mov	(rp,i,8), %r9
+	mov	8(rp,i,8), %r10
+	and	maskn, %r9		C current rp limb, kept at other entries
+	and	maskn, %r10
+	or	%r9, %rax
+	or	%r10, %rbx
+	mov	%rax, (rp,i,8)
+	mov	%rbx, 8(rp,i,8)
+	add	$2, i
+	js	L(top)
+
+L(end):	lea	(tp,n,8), tp		C advance tp to the next table entry
+	dec	nents
+	jne	L(outer)
+
+L(outer_end):
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	ret
+EPILOGUE()
-- 
cgit v1.2.1


From 4fc9dd5d4647c86ba4d5b08b0a2589f6f6079796 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Tue, 15 Nov 2011 01:33:25 +0100
Subject: Amend 2011-11-03 gcc_cflags change.

---
 ChangeLog    | 2 ++
 configure.in | 6 ++----
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index b14d2a8da..64b394b03 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,7 @@
 2011-11-15  Torbjorn Granlund  <tege@gmplib.org>
 
+	* configure.in: Amend 2011-11-03 gcc_cflags change.
+
 	* mpn/powerpc64/tabselect.asm: New file.
 	* mpn/x86_64/tabselect.asm: New file.
 	* mpn/x86/tabselect.asm: New file.
diff --git a/configure.in b/configure.in
index eedab0eca..887975c40 100644
--- a/configure.in
+++ b/configure.in
@@ -875,7 +875,7 @@ case $host in
     abilist="32"
     cclist="gcc cc"
     cc_cflags="-O2"
-    gcc_cflags="$gcc_cflags -mpowerpc"
+    gcc_32_cflags="$gcc_cflags -mpowerpc"
     gcc_cflags_optlist="precomp subtype asm cpu"
     gcc_cflags_precomp="-no-cpp-precomp"
     gcc_cflags_subtype="-force_cpusubtype_ALL"	# for vmx on darwin
@@ -1243,9 +1243,7 @@ case $host in
     #
     case $host_cpu in
       sparc64 | sparcv9* | ultrasparc*)
-        gcc_cflags="$gcc_cflags -Wa,-xarch=v8plus" ;;
-      *)
-        gcc_cflags="$gcc_cflags" ;;
+        gcc_32_cflags="$gcc_cflags -Wa,-xarch=v8plus" ;;
     esac
     gcc_32_cflags_maybe="-m32"
     gcc_cflags_optlist="cpu"
-- 
cgit v1.2.1


From 8467dfae35b0349c306be952466a6382818d4188 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Tue, 15 Nov 2011 14:01:48 +0100
Subject: Further tweak for HGCD_APPR_THRESHOLD tuning.

---
 ChangeLog     | 5 +++++
 tune/tuneup.c | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index 64b394b03..9af90086e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2011-11-15  Niels M�ller  <nisse@lysator.liu.se>
+
+	* tune/tuneup.c (tune_hgcd_appr): Increased min_size to 50; some
+	machines got small thresholds which appear to be bogus.
+
 2011-11-15  Torbjorn Granlund  <tege@gmplib.org>
 
 	* configure.in: Amend 2011-11-03 gcc_cflags change.
diff --git a/tune/tuneup.c b/tune/tuneup.c
index c30d19d6b..b61729119 100644
--- a/tune/tuneup.c
+++ b/tune/tuneup.c
@@ -1761,6 +1761,8 @@ tune_hgcd_appr (void)
   static struct param_t  param;
   param.name = "HGCD_APPR_THRESHOLD";
   param.function = speed_mpn_hgcd_appr;
+  /* We seem to get strange results for small sizes */
+  param.min_size = 50;
   one (&hgcd_appr_threshold, &param);
 }
 
-- 
cgit v1.2.1


From 5153cf91d11990dbb6dc0291eb9eee2a6796b089 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Tue, 15 Nov 2011 14:07:40 +0100
Subject: speed support for mpn_hgcd_appr_lehmer.

---
 ChangeLog               | 17 +++++++++++++++++
 tune/Makefile.am        |  2 +-
 tune/common.c           |  8 +++++++-
 tune/hgcd_appr_lehmer.c | 29 +++++++++++++++++++++++++++++
 tune/speed.c            |  1 +
 tune/speed.h            |  7 ++++++-
 6 files changed, 61 insertions(+), 3 deletions(-)
 create mode 100644 tune/hgcd_appr_lehmer.c

diff --git a/ChangeLog b/ChangeLog
index 9af90086e..7efdb424d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,22 @@
 2011-11-15  Niels M�ller  <nisse@lysator.liu.se>
 
+	* tune/speed.h (speed_mpn_hgcd_appr_lehmer): New prototype.
+	(mpn_hgcd_lehmer_itch): Likewise.
+	(mpn_hgcd_appr_lehmer): Likewise.
+	(mpn_hgcd_appr_lehmer_itch): Likewise.
+	(MPN_HGCD_LEHMER_ITCH): Deleted macro.
+
+	* tune/speed.c (routine): Added mpn_hgcd_appr_lehmer.
+
+	* tune/common.c (speed_mpn_hgcd_lehmer): Use mpn_hgcd_lehmer_itch
+	rather than similarly named macro.
+	(speed_mpn_hgcd_appr_lehmer): New function.
+
+	* tune/Makefile.am (libspeed_la_SOURCES): Added
+	hgcd_appr_lehmer.c.
+
+	* tune/hgcd_appr_lehmer.c: New file.
+
 	* tune/tuneup.c (tune_hgcd_appr): Increased min_size to 50; some
 	machines got small thresholds which appear to be bogus.
 
diff --git a/tune/Makefile.am b/tune/Makefile.am
index 38b1fe9d2..646a1f4af 100644
--- a/tune/Makefile.am
+++ b/tune/Makefile.am
@@ -43,7 +43,7 @@ libspeed_la_SOURCES =							\
   common.c divrem1div.c divrem1inv.c divrem2div.c divrem2inv.c		\
   freq.c								\
   gcdext_single.c gcdext_double.c gcdextod.c gcdextos.c			\
-  hgcd_lehmer.c hgcd_reduce_1.c hgcd_reduce_2.c				\
+  hgcd_lehmer.c hgcd_appr_lehmer.c hgcd_reduce_1.c hgcd_reduce_2.c	\
   jacbase1.c jacbase2.c jacbase3.c jacbase4.c				\
   mod_1_div.c mod_1_inv.c mod_1_1-1.c mod_1_1-2.c modlinv.c		\
   noop.c powm_mod.c powm_redc.c pre_divrem_1.c				\
diff --git a/tune/common.c b/tune/common.c
index cc333a470..9855e8845 100644
--- a/tune/common.c
+++ b/tune/common.c
@@ -1529,7 +1529,7 @@ speed_mpn_hgcd (struct speed_params *s)
 double
 speed_mpn_hgcd_lehmer (struct speed_params *s)
 {
-  SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_lehmer, MPN_HGCD_LEHMER_ITCH);
+  SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_lehmer, mpn_hgcd_lehmer_itch);
 }
 
 double
@@ -1538,6 +1538,12 @@ speed_mpn_hgcd_appr (struct speed_params *s)
   SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_appr, mpn_hgcd_appr_itch);
 }
 
+double
+speed_mpn_hgcd_appr_lehmer (struct speed_params *s)
+{  /* Same measuring macro as speed_mpn_hgcd_appr, on the _lehmer variant.  */
+  SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_appr_lehmer, mpn_hgcd_appr_lehmer_itch);
+}
+
 double
 speed_mpn_hgcd_reduce (struct speed_params *s)
 {
diff --git a/tune/hgcd_appr_lehmer.c b/tune/hgcd_appr_lehmer.c
new file mode 100644
index 000000000..18123e951
--- /dev/null
+++ b/tune/hgcd_appr_lehmer.c
@@ -0,0 +1,29 @@
+/* mpn/generic/hgcd_appr.c forced to use Lehmer's quadratic algorithm. */
+
+/*
+Copyright 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#undef  HGCD_APPR_THRESHOLD	/* drop the value seen via gmp-impl.h */
+#define HGCD_APPR_THRESHOLD MP_SIZE_T_MAX  /* presumably never reached */
+#define __gmpn_hgcd_appr  mpn_hgcd_appr_lehmer	/* rename the function */
+#define __gmpn_hgcd_appr_itch mpn_hgcd_appr_lehmer_itch  /* and its itch */
+
+#include "../mpn/generic/hgcd_appr.c"	/* compile it under the above */
diff --git a/tune/speed.c b/tune/speed.c
index 08c13e776..cffed35b6 100644
--- a/tune/speed.c
+++ b/tune/speed.c
@@ -278,6 +278,7 @@ const struct routine_t {
   { "mpn_hgcd",          speed_mpn_hgcd             },
   { "mpn_hgcd_lehmer",   speed_mpn_hgcd_lehmer      },
   { "mpn_hgcd_appr",     speed_mpn_hgcd_appr        },
+  { "mpn_hgcd_appr_lehmer", speed_mpn_hgcd_appr_lehmer },
 
   { "mpn_hgcd_reduce",   speed_mpn_hgcd_reduce      },
   { "mpn_hgcd_reduce_1", speed_mpn_hgcd_reduce_1    },
diff --git a/tune/speed.h b/tune/speed.h
index 5add58720..329c09783 100644
--- a/tune/speed.h
+++ b/tune/speed.h
@@ -198,6 +198,7 @@ double speed_mpn_matrix22_mul __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_hgcd __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_hgcd_lehmer __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_hgcd_appr __GMP_PROTO ((struct speed_params *s));
+double speed_mpn_hgcd_appr_lehmer __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_hgcd_reduce __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_hgcd_reduce_1 __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_hgcd_reduce_2 __GMP_PROTO ((struct speed_params *s));
@@ -489,7 +490,11 @@ mp_size_t mpn_gcdext_double
   __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
 mp_size_t mpn_hgcd_lehmer
   __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr));
-#define MPN_HGCD_LEHMER_ITCH(n) (n)
+mp_size_t mpn_hgcd_lehmer_itch __GMP_PROTO ((mp_size_t));
+
+mp_size_t mpn_hgcd_appr_lehmer
+  __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr));
+mp_size_t mpn_hgcd_appr_lehmer_itch __GMP_PROTO ((mp_size_t));
 
 mp_size_t mpn_hgcd_reduce_1
   __GMP_PROTO ((struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr));
-- 
cgit v1.2.1


From 1d309e7a2dbaa08258bd681c59cd19b73d67e03a Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Tue, 15 Nov 2011 20:36:09 +0100
Subject: Rewrite mpn/generic/powm_sec.c.

---
 ChangeLog              | 10 ++++++++
 mpn/generic/powm_sec.c | 69 ++++++++++++++++++++++++++++++++------------------
 2 files changed, 54 insertions(+), 25 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 64b394b03..d329c7233 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,15 @@
 2011-11-15  Torbjorn Granlund  <tege@gmplib.org>
 
+	* mpn/generic/powm_sec.c (mpn_local_sqr): Remove forgotten TMP_* calls.
+	(redcify): Likewise.
+	(mpn_powm_sec): Likewise.
+
+	* mpn/generic/powm_sec.c (mpn_powm_sec): Rework scratch usage
+	(mpn_powm_sec_itch): Rewrite.
+
+	* mpn/generic/powm_sec.c (mpn_powm_sec): Use mpn_tabselect also in
+	initialisation.
+
 	* configure.in: Amend 2011-11-03 gcc_cflags change.
 
 	* mpn/powerpc64/tabselect.asm: New file.
diff --git a/mpn/generic/powm_sec.c b/mpn/generic/powm_sec.c
index d7ed2b486..24bb83de3 100644
--- a/mpn/generic/powm_sec.c
+++ b/mpn/generic/powm_sec.c
@@ -133,8 +133,6 @@ mpn_local_sqr (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr tp)
   if (n > 1)
     {
       mp_limb_t cy;
-      TMP_DECL;
-      TMP_MARK;
 
       cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]);
       tp[n - 1] = cy;
@@ -156,8 +154,6 @@ mpn_local_sqr (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr tp)
 #endif
 	rp[2 * n - 1] += cy;
       }
-
-      TMP_FREE;
     }
 }
 #endif
@@ -211,26 +207,24 @@ win_size (mp_bitcnt_t eb)
 }
 #endif
 
-/* Convert U to REDC form, U_r = B^n * U mod M */
+/* Convert U to REDC form, U_r = B^n * U mod M.
+   Uses scratch space at tp of size 2un + n + 1.  */
 static void
 redcify (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr mp, mp_size_t n, mp_ptr tp)
 {
   mp_ptr qp;
-  TMP_DECL;
-  TMP_MARK;
 
-  qp = tp + un + n;
+  qp = tp + un + n;		/* un + n - n + 1 = un + 1 limbs */
 
   MPN_ZERO (tp, n);
   MPN_COPY (tp + n, up, un);
   mpn_tdiv_qr (qp, rp, 0L, tp, un + n, mp, n);
-  TMP_FREE;
 }
 
 /* rp[n-1..0] = bp[bn-1..0] ^ ep[en-1..0] mod mp[n-1..0]
-   Requires that mp[n-1..0] is odd.  FIXME: is this true?
-   Requires that ep[en-1..0] is > 1.
-   Uses scratch space at tp of 3n+1 limbs.  */
+   Requires that mp[n-1..0] is odd.
+   Requires that ep[en-1..0] > 1.
+   Uses scratch space at tp as defined by mpn_powm_sec_itch.  */
 void
 mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
 	      mp_srcptr ep, mp_size_t en,
@@ -244,13 +238,10 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
   mp_ptr pp, this_pp;
   long i;
   int cnd;
-  TMP_DECL;
 
   ASSERT (en > 1 || (en == 1 && ep[0] > 0));
   ASSERT (n >= 1 && ((mp[0] & 1) != 0));
 
-  TMP_MARK;
-
   count_leading_zeros (cnt, ep[en - 1]);
   ebi = (mp_bitcnt_t) en * GMP_LIMB_BITS - cnt;
 
@@ -259,15 +250,27 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
   binvert_limb (minv, mp[0]);
   minv = -minv;
 
-  pp = tp + 4 * n;
+  pp = tp;
+  tp += (n << windowsize);	/* put tp after power table */
 
+  /* Compute pp[0] table entry */
+  /* scratch: |   n   | 1 |   n+2    |  */
+  /*          | pp[0] | 1 | redcify  |  */
   this_pp = pp;
   this_pp[n] = 1;
-  redcify (this_pp, this_pp + n, 1, mp, n, tp + 6 * n);
+  redcify (this_pp, this_pp + n, 1, mp, n, this_pp + n + 1);
   this_pp += n;
-  redcify (this_pp, bp, bn, mp, n, tp + 6 * n);
+
+  /* Compute pp[1] table entry.  To avoid excessive scratch usage in the
+     degenerate situation where B >> M, we let redcify use scratch space which
+     will later be used by the pp table (element 2 and up).  */
+  /* scratch: |   n   |   n   |  bn + n + 1  |  */
+  /*          | pp[0] | pp[1] |   redcify    |  */
+  redcify (this_pp, bp, bn, mp, n, this_pp + n);
 
   /* Precompute powers of b and put them in the temporary area at pp.  */
+  /* scratch: |   n   |   n   | ...  |                    |   2n      |  */
+  /*          | pp[0] | pp[1] | ...  | pp[2^windowsize-1] |  product  |  */
   for (i = (1 << windowsize) - 2; i > 0; i--)
     {
       mpn_mul_basecase (tp, this_pp, n, pp + n, n);
@@ -281,8 +284,15 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
   else
     ebi -= windowsize;
 
+#if WANT_CACHE_SECURITY
+  mpn_tabselect (rp, pp, n, 1 << windowsize, expbits);
+#else
   MPN_COPY (rp, pp + n * expbits, n);
+#endif
 
+  /* Main exponentiation loop.  */
+  /* scratch: |   n   |   n   | ...  |                    |     3n-4n     |  */
+  /*          | pp[0] | pp[1] | ...  | pp[2^windowsize-1] |  loop scratch |  */
   while (ebi != 0)
     {
       expbits = getbits (ep, ebi, windowsize);
@@ -317,7 +327,6 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
   MPN_REDC_1_SEC (rp, tp, mp, n, minv);
   cnd = mpn_sub_n (tp, rp, mp, n);	/* we need just retval */
   mpn_subcnd_n (rp, rp, mp, n, !cnd);
-  TMP_FREE;
 }
 
 mp_size_t
@@ -326,10 +335,20 @@ mpn_powm_sec_itch (mp_size_t bn, mp_size_t en, mp_size_t n)
   int windowsize;
   mp_size_t redcify_itch, itch;
 
-  windowsize = win_size (en * GMP_NUMB_BITS); /* slight over-estimate of exp */
-  itch = 4 * n + (n << windowsize);
-  redcify_itch = 2 * bn + n + 1;
-  /* The 6n is due to the placement of reduce scratch 6n into the start of the
-     scratch area.  */
-  return MAX (itch, redcify_itch + 6 * n);
+  /* The top scratch usage will either be when reducing B in the 2nd redcify
+     call, or more typically n*2^windowsize + 3n or 4n, in the main loop.  (It
+     is 3n or 4n depending on if we use mpn_local_sqr or a native
+     mpn_sqr_basecase.  We assume 4n always for now.) */
+
+  windowsize = win_size (en * GMP_LIMB_BITS); /* slight over-estimate of exp */
+
+  /* The 2n term is due to pp[0] and pp[1] at the time of the 2nd redcify call,
+     the 2bn + n + 1 term is due to redcify's own usage.  */
+  redcify_itch = (2 * n) + (2 * bn + n + 1);
+
+  /* The n * 2^windowsize term is due to the power table, the 4n term is due to
+     scratch needs of squaring/multiplication in the exponentiation loop.  */
+  itch = (n << windowsize) + (4 * n);
+
+  return MAX (itch, redcify_itch);
 }
-- 
cgit v1.2.1


From 4187da90eab6dd83437babffd6c845501de64a1d Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Wed, 16 Nov 2011 21:46:58 +0100
Subject: New file.

---
 ChangeLog                          |   4 +
 mpn/powerpc64/mode64/aorscnd_n.asm | 185 +++++++++++++++++++++++++++++++++++++
 2 files changed, 189 insertions(+)
 create mode 100644 mpn/powerpc64/mode64/aorscnd_n.asm

diff --git a/ChangeLog b/ChangeLog
index 89d9ebd45..6cfc78d1c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2011-11-16  Torbjorn Granlund  <tege@gmplib.org>
+
+	* mpn/powerpc64/mode64/aorscnd_n.asm: New file.
+
 2011-11-15  Niels Möller  <nisse@lysator.liu.se>
 
 	* tune/speed.h (speed_mpn_hgcd_appr_lehmer): New prototype.
diff --git a/mpn/powerpc64/mode64/aorscnd_n.asm b/mpn/powerpc64/mode64/aorscnd_n.asm
new file mode 100644
index 000000000..47aa6fb39
--- /dev/null
+++ b/mpn/powerpc64/mode64/aorscnd_n.asm
@@ -0,0 +1,185 @@
+dnl  PowerPC-64 mpn_addcnd_n/mpn_subcnd_n.
+
+dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007, 2011 Free Software
+dnl  Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb
+C POWER3/PPC630          ?
+C POWER4/PPC970          2.25
+C POWER5                 ?
+C POWER6                 ?
+C POWER7                 ?
+
+C INPUT PARAMETERS
+define(`rp',   `r3')
+define(`up',   `r4')
+define(`vp',   `r5')
+define(`n',    `r6')
+define(`cnd',  `r7')
+
+ifdef(`OPERATION_addcnd_n',`
+  define(ADDSUBC,	adde)
+  define(ADDSUB,	addc)
+  define(func,		mpn_addcnd_n)
+  define(GENRVAL,	`addi	r3, r3, 1')
+  define(SETCBR,	`addic	r0, $1, -1')
+  define(CLRCB,		`addic	r0, r0, 0')
+')
+ifdef(`OPERATION_subcnd_n',`
+  define(ADDSUBC,	subfe)
+  define(ADDSUB,	subfc)
+  define(func,		mpn_subcnd_n)
+  define(GENRVAL,	`neg	r3, r3')
+  define(SETCBR,	`subfic	r0, $1, 0')
+  define(CLRCB,		`addic	r0, r1, -1')
+')
+
+MULFUNC_PROLOGUE(mpn_addcnd_n mpn_subcnd_n)
+
+ASM_START()
+PROLOGUE(func)
+	std	r31, -8(r1)	C save r27-r31 below r1; r1 itself is not moved
+	std	r30, -16(r1)
+	std	r29, -24(r1)
+	std	r28, -32(r1)
+	std	r27, -40(r1)
+
+	subfic	cnd, cnd, 0	C CA = 1 iff cnd == 0
+	subfe	cnd, cnd, cnd	C cnd = CA - 1: all ones if cnd was nonzero, else 0
+
+	rldicl.	r0, r6, 0,62	C r0 = n & 3, set cr0
+	cmpdi	cr6, r0, 2	C cr6 = (n & 3) compared with 2
+	addi	r6, r6, 3	C compute count...
+	srdi	r6, r6, 2	C ...for ctr
+	mtctr	r6		C copy count into ctr
+	beq	cr0, L(b00)	C n & 3 == 0
+	blt	cr6, L(b01)	C n & 3 == 1
+	beq	cr6, L(b10)	C n & 3 == 2, else fall through with n & 3 == 3
+
+L(b11):	ld	r8, 0(up)	C load s1 limb
+	ld	r9, 0(vp)	C load s2 limb
+	ld	r10, 8(up)	C load s1 limb
+	ld	r11, 8(vp)	C load s2 limb
+	ld	r12, 16(up)	C load s1 limb
+	addi	up, up, 24
+	ld	r0, 16(vp)	C load s2 limb
+	addi	vp, vp, 24
+	and	r9, r9, cnd	C s2 limbs become 0 when the mask is 0
+	and	r11, r11, cnd
+	and	r0, r0, cnd
+	ADDSUB	r29, r9, r8	C s1 limb +/- masked s2 limb, starts carry chain
+	ADDSUBC	r30, r11, r10
+	ADDSUBC	r31, r0, r12
+	std	r29, 0(rp)
+	std	r30, 8(rp)
+	std	r31, 16(rp)
+	addi	rp, rp, 24
+	bdnz	L(go)	C 4-limb groups remain, CA carries over
+	b	L(ret)
+
+L(b01):	ld	r12, 0(up)	C load s1 limb
+	addi	up, up, 8
+	ld	r0, 0(vp)	C load s2 limb
+	addi	vp, vp, 8
+	and	r0, r0, cnd	C s2 limb becomes 0 when the mask is 0
+	ADDSUB	r31, r0, r12	C s1 limb +/- masked s2 limb, starts carry chain
+	std	r31, 0(rp)
+	addi	rp, rp, 8
+	bdnz	L(go)	C 4-limb groups remain, CA carries over
+	b	L(ret)
+
+L(b10):	ld	r10, 0(up)	C load s1 limb
+	ld	r11, 0(vp)	C load s2 limb
+	ld	r12, 8(up)	C load s1 limb
+	addi	up, up, 16
+	ld	r0, 8(vp)	C load s2 limb
+	addi	vp, vp, 16
+	and	r11, r11, cnd	C s2 limbs become 0 when the mask is 0
+	and	r0, r0, cnd
+	ADDSUB	r30, r11, r10	C s1 limb +/- masked s2 limb, starts carry chain
+	ADDSUBC	r31, r0, r12	C next limb, uses and updates CA
+	std	r30, 0(rp)
+	std	r31, 8(rp)
+	addi	rp, rp, 16
+	bdnz	L(go)	C 4-limb groups remain, CA carries over
+	b	L(ret)
+
+L(b00):	CLRCB			C init CA: cleared for add, set for sub
+L(go):	ld	r6, 0(up)	C load s1 limb
+	ld	r27, 0(vp)	C load s2 limb
+	ld	r8, 8(up)	C load s1 limb
+	ld	r9, 8(vp)	C load s2 limb
+	ld	r10, 16(up)	C load s1 limb
+	ld	r11, 16(vp)	C load s2 limb
+	ld	r12, 24(up)	C load s1 limb
+	ld	r0, 24(vp)	C load s2 limb
+	and	r27, r27, cnd	C mask the four s2 limbs
+	and	r9, r9, cnd
+	and	r11, r11, cnd
+	and	r0, r0, cnd
+	bdz	L(end)	C last group: skip the loop
+
+	addi	up, up, 32
+	addi	vp, vp, 32
+
+L(top):	ADDSUBC	r28, r27, r6	C combine limbs loaded on the previous pass
+	ld	r6, 0(up)	C load s1 limb
+	ld	r27, 0(vp)	C load s2 limb
+	ADDSUBC	r29, r9, r8
+	ld	r8, 8(up)	C load s1 limb
+	ld	r9, 8(vp)	C load s2 limb
+	ADDSUBC	r30, r11, r10
+	ld	r10, 16(up)	C load s1 limb
+	ld	r11, 16(vp)	C load s2 limb
+	ADDSUBC	r31, r0, r12
+	ld	r12, 24(up)	C load s1 limb
+	ld	r0, 24(vp)	C load s2 limb
+	std	r28, 0(rp)
+	addi	up, up, 32
+	std	r29, 8(rp)
+	addi	vp, vp, 32
+	std	r30, 16(rp)
+	std	r31, 24(rp)
+	addi	rp, rp, 32
+	and	r27, r27, cnd	C mask the freshly loaded s2 limbs
+	and	r9, r9, cnd
+	and	r11, r11, cnd
+	and	r0, r0, cnd
+	bdnz	L(top)		C decrement ctr and loop back
+
+L(end):	ADDSUBC	r28, r27, r6	C final four limbs, nothing more to load
+	ADDSUBC	r29, r9, r8
+	ADDSUBC	r30, r11, r10
+	ADDSUBC	r31, r0, r12
+	std	r28, 0(rp)
+	std	r29, 8(rp)
+	std	r30, 16(rp)
+	std	r31, 24(rp)
+
+L(ret):	ld	r31, -8(r1)	C restore r27-r31
+	ld	r30, -16(r1)
+	ld	r29, -24(r1)
+	ld	r28, -32(r1)
+	ld	r27, -40(r1)
+
+	subfe	r3, r0, r0	C r3 = CA - 1, whatever r0 holds
+	GENRVAL		C map CA - 1 to the 0/1 carry (add) or borrow (sub)
+	blr
+EPILOGUE()
-- 
cgit v1.2.1


From e143b1a779b0a2f13627758436f3ee6d3103f39d Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Wed, 16 Nov 2011 21:49:38 +0100
Subject: Slight tweak of new code.

---
 mpn/powerpc64/tabselect.asm | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/mpn/powerpc64/tabselect.asm b/mpn/powerpc64/tabselect.asm
index 0ac2e9ba0..7d189388b 100644
--- a/mpn/powerpc64/tabselect.asm
+++ b/mpn/powerpc64/tabselect.asm
@@ -21,10 +21,10 @@ include(`../config.m4')
 
 C                  cycles/limb
 C POWER3/PPC630          ?
-C POWER4/PPC970          ?
+C POWER4/PPC970          3.3
 C POWER5                 ?
 C POWER6                 ?
-C POWER7                 ?
+C POWER7                 2.5
 
 C NOTES
 C  * This has not been tuned for any specific processor.  Its speed should not
@@ -60,18 +60,20 @@ L(outer):
 	beq	cr0, L(top)		C branch to loop entry if n even
 
 	ld	r9, 0(tp)
+	addi	tp, tp, 8
 	and	r9, r9, mask
 	ld	r11, 0(rp)
 	andc	r11, r11, mask
 	or	r9, r9, r11
 	std	r9, 0(rp)
-	addi	tp, tp, 8
 	addi	rp, rp, 8
 	bdz	L(end)
 
 	ALIGN(16)
 L(top):	ld	r9, 0(tp)
 	ld	r10, 8(tp)
+	addi	tp, tp, 16
+	nop
 	and	r9, r9, mask
 	and	r10, r10, mask
 	ld	r11, 0(rp)
@@ -82,13 +84,12 @@ L(top):	ld	r9, 0(tp)
 	or	r10, r10, r12
 	std	r9, 0(rp)
 	std	r10, 8(rp)
-	addi	tp, tp, 16
 	addi	rp, rp, 16
 	bdnz	L(top)
 
 L(end):	subf	rp, n, rp		C move rp back to beginning
+	cmpdi	cr6, nents, 1
 	addi	nents, nents, -1
-	cmpdi	cr6, nents, 0
 	bne	cr6, L(outer)
 
 	blr
-- 
cgit v1.2.1


From 380a7c946b6837264f7a34c2cb20b1ed0cc7f967 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Wed, 16 Nov 2011 21:50:51 +0100
Subject: Add cycle counts.

---
 mpn/ia64/tabselect.asm | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mpn/ia64/tabselect.asm b/mpn/ia64/tabselect.asm
index 0ae3fdcfe..cc5b49b04 100644
--- a/mpn/ia64/tabselect.asm
+++ b/mpn/ia64/tabselect.asm
@@ -21,12 +21,12 @@ include(`../config.m4')
 
 C           cycles/limb
 C Itanium:       ?
-C Itanium 2:     5  (estimated)
+C Itanium 2:     2.5
 
 C NOTES
-C  * Using software pipelining could trivially yield 3 c/l even without
-C    unrolling.  (This code was modelled after the powerpc64 code, for
-C    simplicity.)
+C  * Using software pipelining could trivially yield 2 c/l without unrolling,
+C    or 1+epsilon with unrolling.  (This code was modelled after the powerpc64
+C    code, for simplicity.)
 
 C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
 define(`rp',     `r32')
-- 
cgit v1.2.1


From 3b86e6c687af5ab4aeb153bc5ca1bb4a0ccd7759 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Wed, 16 Nov 2011 21:51:17 +0100
Subject: Add cycle counts.

---
 mpn/x86_64/tabselect.asm | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/mpn/x86_64/tabselect.asm b/mpn/x86_64/tabselect.asm
index f7de6a85b..ca475942b 100644
--- a/mpn/x86_64/tabselect.asm
+++ b/mpn/x86_64/tabselect.asm
@@ -21,14 +21,14 @@ include(`../config.m4')
 
 
 C	     cycles/limb
-C AMD K8,K9	 ?
-C AMD K10	 ?
-C Intel P4	 ?
-C Intel core2	 ?
-C Intel NHM	 ?
-C Intel SBR	 ?
+C AMD K8,K9	 2.5
+C AMD K10	 2.5
+C Intel P4	 4
+C Intel core2	 2.3
+C Intel NHM	 2.5
+C Intel SBR	 2.2
 C Intel atom	 ?
-C VIA nano	 ?
+C VIA nano	 3.5
 
 C NOTES
 C  * This has not been tuned for any specific processor.  Its speed should not
-- 
cgit v1.2.1


From 27057444042708cc07e7f2959af63076c042065b Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Wed, 16 Nov 2011 21:55:23 +0100
Subject: New file.

---
 ChangeLog                   |  2 +
 mpn/powerpc32/tabselect.asm | 98 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 100 insertions(+)
 create mode 100644 mpn/powerpc32/tabselect.asm

diff --git a/ChangeLog b/ChangeLog
index 6cfc78d1c..9d2e0c041 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,7 @@
 2011-11-16  Torbjorn Granlund  <tege@gmplib.org>
 
+	* mpn/powerpc32/tabselect.asm: New file.
+
 	* mpn/powerpc64/mode64/aorscnd_n.asm: New file.
 
 2011-11-15  Niels Möller  <nisse@lysator.liu.se>
diff --git a/mpn/powerpc32/tabselect.asm b/mpn/powerpc32/tabselect.asm
new file mode 100644
index 000000000..b12fecd12
--- /dev/null
+++ b/mpn/powerpc32/tabselect.asm
@@ -0,0 +1,98 @@
+dnl  PowerPC-32 mpn_tabselect.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C 603e:              ?
+C 604e:              ?
+C 75x (G3):          ?
+C 7400,7410 (G4):    ?
+C 744x,745x (G4+):   ?
+C power4/ppc970:     3.3
+C power5:            ?
+
+C NOTES
+C  * This has not been tuned for any specific processor.  Its speed should not
+C    be too bad, though.
+C  * Using VMX could result in significant speedup for certain CPUs.
+
+C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp',     `r3')
+define(`tp',     `r4')
+define(`n',      `r5')
+define(`nents',  `r6')
+define(`which',  `r7')
+
+define(`mask',   `r8')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_tabselect)
+	addi	r0, n, 1
+	srdi	r0, r0, 1		C inner loop count
+	andi.	r9, n, 1		C set cr0 for use in inner loop
+	subf	which, nents, which
+	sldi	n, n, 2
+
+L(outer):
+	mtctr	r0			C put inner loop count in ctr
+
+	add	r9, which, nents	C r9 = 0 iff this is the selected table entry
+	addic	r9, r9, -1		C set CF iff not selected entry
+	subfe	mask, r0, r0		C mask = CF - 1: all ones iff selected entry
+
+	beq	cr0, L(top)		C branch to loop entry if n even
+
+	lwz	r9, 0(tp)		C odd n: handle one limb before the 2-limb loop
+	addi	tp, tp, 4
+	and	r9, r9, mask		C table limb, kept only for the selected entry
+	lwz	r11, 0(rp)
+	andc	r11, r11, mask		C old result limb, kept for all other entries
+	or	r9, r9, r11
+	stw	r9, 0(rp)
+	addi	rp, rp, 4
+	bdz	L(end)			C that was the only limb of this entry
+
+	ALIGN(16)
+L(top):	lwz	r9, 0(tp)		C two table limbs per iteration
+	lwz	r10, 4(tp)
+	addi	tp, tp, 8
+	nop
+	and	r9, r9, mask		C table limbs, kept only for the selected entry
+	and	r10, r10, mask
+	lwz	r11, 0(rp)
+	lwz	r12, 4(rp)
+	andc	r11, r11, mask		C old result limbs, kept for all other entries
+	andc	r12, r12, mask
+	or	r9, r9, r11
+	or	r10, r10, r12
+	stw	r9, 0(rp)
+	stw	r10, 4(rp)
+	addi	rp, rp, 8
+	bdnz	L(top)
+
+L(end):	subf	rp, n, rp		C move rp back to beginning
+	cmpdi	cr6, nents, 1
+	addi	nents, nents, -1
+	bne	cr6, L(outer)
+
+	blr
+EPILOGUE()
-- 
cgit v1.2.1


From 4aa30987572b144b7606af6394aadf1efdc7a65c Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Thu, 17 Nov 2011 09:02:17 +0100
Subject: Use 32-bit insn forms.

---
 mpn/powerpc32/tabselect.asm | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mpn/powerpc32/tabselect.asm b/mpn/powerpc32/tabselect.asm
index b12fecd12..155a7b495 100644
--- a/mpn/powerpc32/tabselect.asm
+++ b/mpn/powerpc32/tabselect.asm
@@ -47,10 +47,10 @@ ASM_START()
 	ALIGN(16)
 PROLOGUE(mpn_tabselect)
 	addi	r0, n, 1
-	srdi	r0, r0, 1		C inner loop count
+	srwi	r0, r0, 1		C inner loop count
 	andi.	r9, n, 1		C set cr0 for use in inner loop
 	subf	which, nents, which
-	sldi	n, n, 2
+	slwi	n, n, 2
 
 L(outer):
 	mtctr	r0			C put inner loop count in ctr
@@ -90,7 +90,7 @@ L(top):	lwz	r9, 0(tp)
 	bdnz	L(top)
 
 L(end):	subf	rp, n, rp		C move rp back to beginning
-	cmpdi	cr6, nents, 1
+	cmpwi	cr6, nents, 1
 	addi	nents, nents, -1
 	bne	cr6, L(outer)
 
-- 
cgit v1.2.1


From 63f97805b47041bbd20aca91e30542de25276c39 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Thu, 17 Nov 2011 15:18:45 +0100
Subject: Another tweak to mpn_hgcd_appr tuning

---
 ChangeLog     | 4 ++++
 tune/tuneup.c | 1 +
 2 files changed, 5 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index 9d2e0c041..758ec78ae 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2011-11-17  Niels Möller  <nisse@lysator.liu.se>
+
+	* tune/tuneup.c (tune_hgcd_appr): Increase stop_since_change.
+
 2011-11-16  Torbjorn Granlund  <tege@gmplib.org>
 
 	* mpn/powerpc32/tabselect.asm: New file.
diff --git a/tune/tuneup.c b/tune/tuneup.c
index b61729119..444e5e429 100644
--- a/tune/tuneup.c
+++ b/tune/tuneup.c
@@ -1763,6 +1763,7 @@ tune_hgcd_appr (void)
   param.function = speed_mpn_hgcd_appr;
   /* We seem to get strange results for small sizes */
   param.min_size = 50;
+  param.stop_since_change = 150;
   one (&hgcd_appr_threshold, &param);
 }
 
-- 
cgit v1.2.1


From ca20b2f018c660e83322ff0bd1a3a3a2f9874bb1 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Thu, 17 Nov 2011 21:24:47 +0100
Subject: Add speed measurement for mpn_tabselect.

---
 ChangeLog     |  9 +++++++++
 tune/common.c |  5 +++++
 tune/speed.c  |  3 ++-
 tune/speed.h  | 10 ++++++++--
 4 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 758ec78ae..658930906 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2011-11-17  Torbjorn Granlund  <tege@gmplib.org>
+
+	* tune/speed.c (routine): Add mpn_tabselect.
+	* tune/common.c (speed_mpn_tabselect): New function.
+	* tune/speed.h (SPEED_ROUTINE_MPN_COPY_CALL): New macro, made from
+	old SPEED_ROUTINE_MPN_COPY.
+	(SPEED_ROUTINE_MPN_COPY): Just invoke SPEED_ROUTINE_MPN_COPY_CALL.
+	(SPEED_ROUTINE_MPN_TABSELECT): New macro.
+
 2011-11-17  Niels Möller  <nisse@lysator.liu.se>
 
 	* tune/tuneup.c (tune_hgcd_appr): Increase stop_since_change.
diff --git a/tune/common.c b/tune/common.c
index 9855e8845..88f0099e8 100644
--- a/tune/common.c
+++ b/tune/common.c
@@ -461,6 +461,11 @@ speed_mpn_com (struct speed_params *s)
 {
   SPEED_ROUTINE_MPN_COPY (mpn_com);
 }
+double
+speed_mpn_tabselect (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TABSELECT (mpn_tabselect);  /* times mpn_tabselect (wp, s->xp, s->size, 1, s->r) */
+}
 
 
 double
diff --git a/tune/speed.c b/tune/speed.c
index cffed35b6..704d82772 100644
--- a/tune/speed.c
+++ b/tune/speed.c
@@ -283,7 +283,7 @@ const struct routine_t {
   { "mpn_hgcd_reduce",   speed_mpn_hgcd_reduce      },
   { "mpn_hgcd_reduce_1", speed_mpn_hgcd_reduce_1    },
   { "mpn_hgcd_reduce_2", speed_mpn_hgcd_reduce_2    },
-  
+
   { "mpn_gcd_1",         speed_mpn_gcd_1,  FLAG_R_OPTIONAL },
   { "mpn_gcd_1N",        speed_mpn_gcd_1N, FLAG_R_OPTIONAL },
 
@@ -412,6 +412,7 @@ const struct routine_t {
 #if HAVE_NATIVE_mpn_copyd
   { "mpn_copyd",         speed_mpn_copyd            },
 #endif
+  { "mpn_tabselect",     speed_mpn_tabselect, FLAG_R_OPTIONAL },
 #if HAVE_NATIVE_mpn_addlsh1_n
   { "mpn_addlsh1_n",     speed_mpn_addlsh1_n, FLAG_R_OPTIONAL },
 #endif
diff --git a/tune/speed.h b/tune/speed.h
index 329c09783..20daad2dd 100644
--- a/tune/speed.h
+++ b/tune/speed.h
@@ -175,6 +175,7 @@ double speed_mpn_copyi __GMP_PROTO ((struct speed_params *s));
 double speed_MPN_COPY __GMP_PROTO ((struct speed_params *s));
 double speed_MPN_COPY_DECR __GMP_PROTO ((struct speed_params *s));
 double speed_MPN_COPY_INCR __GMP_PROTO ((struct speed_params *s));
+double speed_mpn_tabselect __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_divexact_1 __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_divexact_by3 __GMP_PROTO ((struct speed_params *s));
 double speed_mpn_bdiv_q_1 __GMP_PROTO ((struct speed_params *));
@@ -613,7 +614,7 @@ int speed_routine_count_zeros_setup
 #define SPEED_RESTRICT_COND(cond)   if (!(cond)) return -1.0;
 
 /* For mpn_copy or similar. */
-#define SPEED_ROUTINE_MPN_COPY(function)				\
+#define SPEED_ROUTINE_MPN_COPY_CALL(call)				\
   {									\
     mp_ptr    wp;							\
     unsigned  i;							\
@@ -632,13 +633,18 @@ int speed_routine_count_zeros_setup
     speed_starttime ();							\
     i = s->reps;							\
     do									\
-      function (wp, s->xp, s->size);					\
+      call;								\
     while (--i != 0);							\
     t = speed_endtime ();						\
 									\
     TMP_FREE;								\
     return t;								\
   }
+#define SPEED_ROUTINE_MPN_COPY(function)				\
+  SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size))
+
+#define SPEED_ROUTINE_MPN_TABSELECT(function)				\
+  SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size, 1, s->r))  /* nents = 1, which = s->r */
 
 #define SPEED_ROUTINE_MPN_COPYC(function)				\
   {									\
-- 
cgit v1.2.1


From 342bc281b1d2e59520f99be86120b177606f43f3 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Thu, 17 Nov 2011 22:19:28 +0100
Subject: Retune, adding several new THRESHOLDs.

---
 mpn/alpha/ev5/gmp-mparam.h            |  40 ++++++----
 mpn/alpha/ev6/gmp-mparam.h            |  76 ++++++++++--------
 mpn/ia64/gmp-mparam.h                 | 142 ++++++++++++++++++++++------------
 mpn/pa64/gmp-mparam.h                 |  62 ++++++++-------
 mpn/powerpc64/mode64/p4/gmp-mparam.h  |  31 +++++---
 mpn/powerpc64/mode64/p5/gmp-mparam.h  |  39 ++++++----
 mpn/powerpc64/mode64/p6/gmp-mparam.h  |  37 +++++----
 mpn/powerpc64/mode64/p7/gmp-mparam.h  |  32 ++++----
 mpn/s390_64/gmp-mparam.h              |  16 ++--
 mpn/sparc64/ultrasparc34/gmp-mparam.h |  29 ++++---
 mpn/sparc64/ultrasparct1/gmp-mparam.h |  36 +++++----
 mpn/x86/atom/gmp-mparam.h             |  41 ++++++----
 mpn/x86/k7/gmp-mparam.h               |  45 ++++++-----
 mpn/x86/p6/sse2/gmp-mparam.h          |  61 ++++++++-------
 mpn/x86/pentium4/sse2/gmp-mparam.h    |  85 ++++++++++----------
 mpn/x86_64/atom/gmp-mparam.h          |  17 ++--
 mpn/x86_64/bobcat/gmp-mparam.h        |  10 ++-
 mpn/x86_64/core2/gmp-mparam.h         |  23 +++---
 mpn/x86_64/coreinhm/gmp-mparam.h      |  23 +++---
 mpn/x86_64/coreisbr/gmp-mparam.h      | 132 ++++++++++++++++++++++---------
 mpn/x86_64/gmp-mparam.h               |  13 +++-
 mpn/x86_64/nano/gmp-mparam.h          |  33 ++++----
 mpn/x86_64/pentium4/gmp-mparam.h      |  51 ++++++------
 23 files changed, 659 insertions(+), 415 deletions(-)

diff --git a/mpn/alpha/ev5/gmp-mparam.h b/mpn/alpha/ev5/gmp-mparam.h
index a4c794838..395353a46 100644
--- a/mpn/alpha/ev5/gmp-mparam.h
+++ b/mpn/alpha/ev5/gmp-mparam.h
@@ -26,38 +26,44 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 
 #define DIVREM_1_NORM_THRESHOLD              0  /* preinv always */
 #define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1P_METHOD                      2
 #define MOD_1_NORM_THRESHOLD                 0  /* always */
 #define MOD_1_UNNORM_THRESHOLD               0  /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD         29
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          6
 #define MOD_1U_TO_MOD_1_1_THRESHOLD          2
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD         8  /* never mpn_mod_1_1p */
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         4
 #define MOD_1_2_TO_MOD_1_4_THRESHOLD        14
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     75
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     15
 #define USE_PREINV_DIVREM_1                  1  /* preinv always */
+#define DIV_QR_2_PI2_THRESHOLD              21
 #define DIVEXACT_1_THRESHOLD                 0  /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD           80
+#define BMOD_1_TO_MOD_1_THRESHOLD           78
 
-#define MUL_TOOM22_THRESHOLD                18
-#define MUL_TOOM33_THRESHOLD                61
-#define MUL_TOOM44_THRESHOLD                88
+#define MUL_TOOM22_THRESHOLD                14
+#define MUL_TOOM33_THRESHOLD                57
+#define MUL_TOOM44_THRESHOLD               118
 #define MUL_TOOM6H_THRESHOLD               173
-#define MUL_TOOM8H_THRESHOLD                 0
+#define MUL_TOOM8H_THRESHOLD               240
 
 #define MUL_TOOM32_TO_TOOM43_THRESHOLD      57
 #define MUL_TOOM32_TO_TOOM53_THRESHOLD      91
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD      89
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD      60
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      81
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      56
 
 #define SQR_BASECASE_THRESHOLD               4
 #define SQR_TOOM2_THRESHOLD                 28
-#define SQR_TOOM3_THRESHOLD                 65
+#define SQR_TOOM3_THRESHOLD                 77
 #define SQR_TOOM4_THRESHOLD                136
-#define SQR_TOOM6_THRESHOLD                180
-#define SQR_TOOM8_THRESHOLD                248
+#define SQR_TOOM6_THRESHOLD                173
+#define SQR_TOOM8_THRESHOLD                260
+
+#define MULMID_TOOM42_THRESHOLD             20
 
 #define MULMOD_BNM1_THRESHOLD               11
 #define SQRMOD_BNM1_THRESHOLD               13
 
+#define POWM_SEC_TABLE  2,17,322,387
+
 #define MUL_FFT_MODF_THRESHOLD             244  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
   { {    244, 5}, {     11, 6}, {      6, 5}, {     13, 6}, \
@@ -161,9 +167,11 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MU_BDIV_Q_THRESHOLD                942
 
 #define MATRIX22_STRASSEN_THRESHOLD         13
-#define HGCD_THRESHOLD                     101
-#define GCD_DC_THRESHOLD                   330
-#define GCDEXT_DC_THRESHOLD                222
+#define HGCD_THRESHOLD                     105
+#define HGCD_APPR_THRESHOLD                111
+#define HGCD_REDUCE_THRESHOLD             1437
+#define GCD_DC_THRESHOLD                   318
+#define GCDEXT_DC_THRESHOLD                214
 #define JACOBI_BASE_METHOD                   2
 
 #define GET_STR_DC_THRESHOLD                16
diff --git a/mpn/alpha/ev6/gmp-mparam.h b/mpn/alpha/ev6/gmp-mparam.h
index 12c3891d7..ce865f4cc 100644
--- a/mpn/alpha/ev6/gmp-mparam.h
+++ b/mpn/alpha/ev6/gmp-mparam.h
@@ -29,38 +29,44 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 
 #define DIVREM_1_NORM_THRESHOLD              0  /* preinv always */
 #define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1P_METHOD                      2
 #define MOD_1_NORM_THRESHOLD                 0  /* always */
 #define MOD_1_UNNORM_THRESHOLD               0  /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          3
 #define MOD_1U_TO_MOD_1_1_THRESHOLD          2
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD         6
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD        30
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     10
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         4
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        16
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      8
 #define USE_PREINV_DIVREM_1                  1  /* preinv always */
+#define DIV_QR_2_PI2_THRESHOLD               8
 #define DIVEXACT_1_THRESHOLD                 0  /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD           16
+#define BMOD_1_TO_MOD_1_THRESHOLD           20
 
 #define MUL_TOOM22_THRESHOLD                35
-#define MUL_TOOM33_THRESHOLD                74
-#define MUL_TOOM44_THRESHOLD               178
-#define MUL_TOOM6H_THRESHOLD               288
-#define MUL_TOOM8H_THRESHOLD               333
+#define MUL_TOOM33_THRESHOLD                77
+#define MUL_TOOM44_THRESHOLD               184
+#define MUL_TOOM6H_THRESHOLD               228
+#define MUL_TOOM8H_THRESHOLD               288
 
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD      75
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD     101
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      89
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     110
 #define MUL_TOOM42_TO_TOOM53_THRESHOLD     105
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD     105
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      73
 
-#define SQR_BASECASE_THRESHOLD               5
-#define SQR_TOOM2_THRESHOLD                 61
-#define SQR_TOOM3_THRESHOLD                107
-#define SQR_TOOM4_THRESHOLD                170
-#define SQR_TOOM6_THRESHOLD                309
-#define SQR_TOOM8_THRESHOLD                360
+#define SQR_BASECASE_THRESHOLD               0  /* always */
+#define SQR_TOOM2_THRESHOLD                 58
+#define SQR_TOOM3_THRESHOLD                103
+#define SQR_TOOM4_THRESHOLD                172
+#define SQR_TOOM6_THRESHOLD                264
+#define SQR_TOOM8_THRESHOLD                333
+
+#define MULMID_TOOM42_THRESHOLD             52
 
 #define MULMOD_BNM1_THRESHOLD               20
 #define SQRMOD_BNM1_THRESHOLD               23
 
+#define POWM_SEC_TABLE  4,17,246,2388
+
 #define MUL_FFT_MODF_THRESHOLD             480  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
   { {    480, 5}, {     18, 6}, {     10, 5}, {     21, 6}, \
@@ -148,19 +154,19 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_FFT_THRESHOLD                 3136
 
 #define MULLO_BASECASE_THRESHOLD             0  /* always */
-#define MULLO_DC_THRESHOLD                 130
-#define MULLO_MUL_N_THRESHOLD            15604
+#define MULLO_DC_THRESHOLD                 173
+#define MULLO_MUL_N_THRESHOLD            11355
 
-#define DC_DIV_QR_THRESHOLD                119
-#define DC_DIVAPPR_Q_THRESHOLD             390
+#define DC_DIV_QR_THRESHOLD                112
+#define DC_DIVAPPR_Q_THRESHOLD             422
 #define DC_BDIV_QR_THRESHOLD               110
-#define DC_BDIV_Q_THRESHOLD                318
+#define DC_BDIV_Q_THRESHOLD                348
 
-#define INV_MULMOD_BNM1_THRESHOLD           75
-#define INV_NEWTON_THRESHOLD               390
-#define INV_APPR_THRESHOLD                 372
+#define INV_MULMOD_BNM1_THRESHOLD           68
+#define INV_NEWTON_THRESHOLD               402
+#define INV_APPR_THRESHOLD                 396
 
-#define BINV_NEWTON_THRESHOLD              393
+#define BINV_NEWTON_THRESHOLD              399
 #define REDC_1_TO_REDC_N_THRESHOLD         110
 
 #define MU_DIV_QR_THRESHOLD               1718
@@ -170,12 +176,14 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MU_BDIV_Q_THRESHOLD               1652
 
 #define MATRIX22_STRASSEN_THRESHOLD         17
-#define HGCD_THRESHOLD                     282
-#define GCD_DC_THRESHOLD                  1138
-#define GCDEXT_DC_THRESHOLD                773
+#define HGCD_THRESHOLD                     278
+#define HGCD_APPR_THRESHOLD                366
+#define HGCD_REDUCE_THRESHOLD             2681
+#define GCD_DC_THRESHOLD                  1258
+#define GCDEXT_DC_THRESHOLD                777
 #define JACOBI_BASE_METHOD                   3
 
-#define GET_STR_DC_THRESHOLD                14
-#define GET_STR_PRECOMPUTE_THRESHOLD        19
-#define SET_STR_DC_THRESHOLD              3754
-#define SET_STR_PRECOMPUTE_THRESHOLD      8097
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        25
+#define SET_STR_DC_THRESHOLD              3539
+#define SET_STR_PRECOMPUTE_THRESHOLD      7784
diff --git a/mpn/ia64/gmp-mparam.h b/mpn/ia64/gmp-mparam.h
index 0841c82aa..f080b876e 100644
--- a/mpn/ia64/gmp-mparam.h
+++ b/mpn/ia64/gmp-mparam.h
@@ -1,6 +1,6 @@
 /* gmp-mparam.h -- Compiler/machine parameter header file.
 
-Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2009, 2010 Free Software
+Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2009, 2010, 2011 Free Software
 Foundation, Inc.
 
 This file is part of the GNU MP Library.
@@ -21,70 +21,92 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define GMP_LIMB_BITS 64
 #define BYTES_PER_MP_LIMB 8
 
-/* 1300MHz Itanium2 (babe.fsffrance.org) */
-
+/* 900MHz Itanium2 (titanic.gmplib.org) */
 
+#define MOD_1_1P_METHOD                      2
 #define MOD_1_NORM_THRESHOLD                 0  /* always */
 #define MOD_1_UNNORM_THRESHOLD               0  /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          3
 #define MOD_1U_TO_MOD_1_1_THRESHOLD          8
 #define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
 #define MOD_1_2_TO_MOD_1_4_THRESHOLD        21
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     22
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     10
 #define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD              12
 #define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
 #define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
 
 #define MUL_TOOM22_THRESHOLD                40
-#define MUL_TOOM33_THRESHOLD               122
-#define MUL_TOOM44_THRESHOLD               212
+#define MUL_TOOM33_THRESHOLD               129
+#define MUL_TOOM44_THRESHOLD               214
 #define MUL_TOOM6H_THRESHOLD               318
 #define MUL_TOOM8H_THRESHOLD               430
 
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD      93
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD     146
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD     129
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     145
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     126
 #define MUL_TOOM42_TO_TOOM63_THRESHOLD     151
 
 #define SQR_BASECASE_THRESHOLD              11
 #define SQR_TOOM2_THRESHOLD                 84
-#define SQR_TOOM3_THRESHOLD                125
+#define SQR_TOOM3_THRESHOLD                135
 #define SQR_TOOM4_THRESHOLD                494
-#define SQR_TOOM6_THRESHOLD                  0  /* never toom4 */
-#define SQR_TOOM8_THRESHOLD                  0  /* never toom6 */
+#define SQR_TOOM6_THRESHOLD                  0  /* always */
+#define SQR_TOOM8_THRESHOLD                  0  /* always */
 
 #define MULMOD_BNM1_THRESHOLD               23
-#define SQRMOD_BNM1_THRESHOLD               25
+#define SQRMOD_BNM1_THRESHOLD               28
+
+#define POWM_SEC_TABLE  2,29,130,905
 
-#define MUL_FFT_MODF_THRESHOLD             444  /* k = 5 */
+#define MUL_FFT_MODF_THRESHOLD             476  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
-  { {    444, 5}, {     27, 6}, {     14, 5}, {     29, 6}, \
-    {     35, 7}, {     18, 6}, {     37, 7}, {     19, 6}, \
+  { {    476, 5}, {     27, 6}, {     14, 5}, {     29, 6}, \
+    {     33, 7}, {     17, 6}, {     37, 7}, {     19, 6}, \
     {     39, 7}, {     21, 6}, {     43, 7}, {     33, 8}, \
     {     17, 7}, {     37, 8}, {     19, 7}, {     39, 8}, \
-    {     21, 7}, {     43, 8}, {     29, 9}, {     15, 8}, \
-    {     37, 9}, {     19, 8}, {     43, 9}, {     23, 8}, \
-    {     49, 9}, {     27, 8}, {     57, 9}, {     31, 8}, \
-    {     63, 9}, {     35, 8}, {     71, 9}, {     43,10}, \
+    {     21, 7}, {     43, 8}, {     37, 9}, {     19, 8}, \
+    {     43, 9}, {     23, 8}, {     51, 9}, {     27, 8}, \
+    {     57, 9}, {     31, 8}, {     63, 9}, {     43,10}, \
     {     23, 9}, {     59,10}, {     31, 9}, {     71,10}, \
-    {     39, 9}, {     87,10}, {     47, 9}, {     99,10}, \
+    {     39, 9}, {     83,10}, {     47, 9}, {     99,10}, \
     {     55,11}, {     31,10}, {     87,11}, {     47,10}, \
     {    111,12}, {     31,11}, {     63,10}, {    143,11}, \
     {     79,10}, {    167,11}, {     95,10}, {    191,11}, \
     {    111,12}, {     63,11}, {    143,10}, {    287, 9}, \
     {    575,10}, {    303,11}, {    159,10}, {    319,12}, \
     {     95,11}, {    191,10}, {    399,11}, {    207,10}, \
-    {    431,13}, {   8192,14}, {  16384,15}, {  32768,16}, \
-    {  65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
-    {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 76
+    {    431,13}, {     63,12}, {    127,11}, {    271,10}, \
+    {    543,11}, {    287,10}, {    575,11}, {    303,12}, \
+    {    159,11}, {    335,10}, {    671,11}, {    367,12}, \
+    {    191,11}, {    399,10}, {    799,11}, {    431,12}, \
+    {    223,11}, {    447,13}, {    127,12}, {    255,11}, \
+    {    543,12}, {    287,11}, {    607,12}, {    319,11}, \
+    {    671,12}, {    351,11}, {    703,13}, {    191,12}, \
+    {    415,11}, {    863,12}, {    447,14}, {    127,13}, \
+    {    255,12}, {    607,13}, {    319,12}, {    735,13}, \
+    {    383,12}, {    799,11}, {   1599,12}, {    863,13}, \
+    {    447,12}, {    927,11}, {   1855,14}, {    255,13}, \
+    {    511,12}, {   1055,13}, {    575,12}, {   1215,13}, \
+    {    639,12}, {   1279,13}, {    703,14}, {    383,13}, \
+    {    767,12}, {   1535,13}, {    831,12}, {   1663,13}, \
+    {    895,12}, {   1791,15}, {    255,14}, {    511,13}, \
+    {   1087,12}, {   2175,13}, {   1215,14}, {    639,13}, \
+    {   1343,12}, {   2687,13}, {   1471,14}, {    767,13}, \
+    {   1599,12}, {   3199,13}, {   1663,14}, {    895,13}, \
+    {   1855,15}, {    511,14}, {   1023,13}, {   2175,14}, \
+    {   1151,13}, {   2431,14}, {   1279,13}, {   2687,14}, \
+    {   1407,15}, {    767,14}, {   1535,13}, {   3199,14}, \
+    {   1663,13}, {   3455,14}, {   1791,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 155
 #define MUL_FFT_THRESHOLD                 5760
 
-#define SQR_FFT_MODF_THRESHOLD             440  /* k = 5 */
+#define SQR_FFT_MODF_THRESHOLD             436  /* k = 5 */
 #define SQR_FFT_TABLE3                                      \
-  { {    440, 5}, {     14, 4}, {     29, 5}, {     29, 6}, \
-    {     15, 5}, {     31, 6}, {     35, 7}, {     18, 6}, \
-    {     37, 7}, {     33, 8}, {     17, 7}, {     37, 8}, \
+  { {    436, 5}, {     14, 4}, {     29, 5}, {     31, 6}, \
+    {     35, 7}, {     18, 6}, {     37, 7}, {     37, 8}, \
     {     19, 7}, {     40, 8}, {     37, 9}, {     19, 8}, \
     {     43, 9}, {     23, 8}, {     49, 9}, {     27, 8}, \
     {     57, 9}, {     43,10}, {     23, 9}, {     55,10}, \
@@ -93,45 +115,69 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
     {     87,11}, {     47,10}, {    111,12}, {     31,11}, \
     {     63,10}, {    135,11}, {     79,10}, {    167,11}, \
     {     95,10}, {    191,11}, {    111,12}, {     63,11}, \
-    {    127,10}, {    255,11}, {    143,10}, {    303,11}, \
-    {    159,10}, {    319,12}, {     95,11}, {    191,10}, \
-    {    399,11}, {    207,10}, {    431,13}, {   8192,14}, \
-    {  16384,15}, {  32768,16}, {  65536,17}, { 131072,18}, \
-    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
-    {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 66
+    {    127,10}, {    255,11}, {    143,10}, {    287, 9}, \
+    {    575,10}, {    303,11}, {    159,10}, {    319,12}, \
+    {     95,11}, {    191,10}, {    399,11}, {    207,10}, \
+    {    431,13}, {     63,12}, {    127,11}, {    271,10}, \
+    {    543,11}, {    303,12}, {    159,11}, {    335,10}, \
+    {    671,11}, {    367,10}, {    735,12}, {    191,11}, \
+    {    399,10}, {    799,11}, {    431,12}, {    223,11}, \
+    {    463,13}, {    127,12}, {    255,11}, {    543,12}, \
+    {    287,11}, {    607,12}, {    319,11}, {    671,12}, \
+    {    351,11}, {    735,13}, {    191,12}, {    383,11}, \
+    {    799,12}, {    415,11}, {    863,12}, {    447,11}, \
+    {    895,14}, {    127,13}, {    255,12}, {    543,11}, \
+    {   1087,12}, {    607,13}, {    319,12}, {    735,13}, \
+    {    383,12}, {    863,13}, {    447,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1087,13}, {    575,12}, \
+    {   1183,13}, {    639,12}, {   1279,13}, {    703,12}, \
+    {   1407,14}, {    383,13}, {    767,12}, {   1535,13}, \
+    {    831,12}, {   1663,13}, {    895,12}, {   1791,13}, \
+    {    959,15}, {    255,14}, {    511,13}, {   1087,12}, \
+    {   2175,13}, {   1215,14}, {    639,13}, {   1343,12}, \
+    {   2687,13}, {   1471,14}, {    767,13}, {   1663,14}, \
+    {    895,13}, {   1919,15}, {    511,14}, {   1023,13}, \
+    {   2175,14}, {   1151,13}, {   2431,14}, {   1279,13}, \
+    {   2687,14}, {   1407,15}, {    767,14}, {   1535,13}, \
+    {   3199,14}, {   1663,13}, {   3455,14}, {   1791,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 151
 #define SQR_FFT_THRESHOLD                 4032
 
 #define MULLO_BASECASE_THRESHOLD            29
 #define MULLO_DC_THRESHOLD                  57
 #define MULLO_MUL_N_THRESHOLD            11278
 
-#define DC_DIV_QR_THRESHOLD                 59
+#define DC_DIV_QR_THRESHOLD                 64
 #define DC_DIVAPPR_Q_THRESHOLD             222
 #define DC_BDIV_QR_THRESHOLD                95
 #define DC_BDIV_Q_THRESHOLD                264
 
-#define INV_MULMOD_BNM1_THRESHOLD           82
-#define INV_NEWTON_THRESHOLD                11
-#define INV_APPR_THRESHOLD                  18
+#define INV_MULMOD_BNM1_THRESHOLD           86
+#define INV_NEWTON_THRESHOLD               139
+#define INV_APPR_THRESHOLD                 147
 
 #define BINV_NEWTON_THRESHOLD              252
-#define REDC_1_TO_REDC_2_THRESHOLD           0
+#define REDC_1_TO_REDC_2_THRESHOLD           0  /* always */
 #define REDC_2_TO_REDC_N_THRESHOLD         147
 
 #define MU_DIV_QR_THRESHOLD               1142
-#define MU_DIVAPPR_Q_THRESHOLD             998
+#define MU_DIVAPPR_Q_THRESHOLD            1142
 #define MUPI_DIV_QR_THRESHOLD                0  /* always */
-#define MU_BDIV_QR_THRESHOLD              1187
+#define MU_BDIV_QR_THRESHOLD              1210
 #define MU_BDIV_Q_THRESHOLD               1470
 
 #define MATRIX22_STRASSEN_THRESHOLD         23
 #define HGCD_THRESHOLD                     117
-#define GCD_DC_THRESHOLD                   469
+#define HGCD_APPR_THRESHOLD                 50
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   496
 #define GCDEXT_DC_THRESHOLD                368
 #define JACOBI_BASE_METHOD                   4
 
 #define GET_STR_DC_THRESHOLD                13
-#define GET_STR_PRECOMPUTE_THRESHOLD        21
-#define SET_STR_DC_THRESHOLD              1204
-#define SET_STR_PRECOMPUTE_THRESHOLD      3266
+#define GET_STR_PRECOMPUTE_THRESHOLD        22
+#define SET_STR_DC_THRESHOLD              1474
+#define SET_STR_PRECOMPUTE_THRESHOLD      3168
diff --git a/mpn/pa64/gmp-mparam.h b/mpn/pa64/gmp-mparam.h
index d0e86d856..081757aca 100644
--- a/mpn/pa64/gmp-mparam.h
+++ b/mpn/pa64/gmp-mparam.h
@@ -25,14 +25,16 @@ with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 
 #define DIVREM_1_NORM_THRESHOLD              0  /* always */
 #define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1P_METHOD                      2
 #define MOD_1_NORM_THRESHOLD                 0  /* always */
 #define MOD_1_UNNORM_THRESHOLD               0  /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD      MP_SIZE_T_MAX  /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
 #define MOD_1U_TO_MOD_1_1_THRESHOLD         10
 #define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
 #define MOD_1_2_TO_MOD_1_4_THRESHOLD        14
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD  MP_SIZE_T_MAX  /* never */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     11
 #define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_2_PI2_THRESHOLD              21
 #define DIVEXACT_1_THRESHOLD                 0  /* always */
 #define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
 
@@ -47,16 +49,20 @@ with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MUL_TOOM42_TO_TOOM53_THRESHOLD     129
 #define MUL_TOOM42_TO_TOOM63_THRESHOLD      54
 
-#define SQR_BASECASE_THRESHOLD               0  /* always */
-#define SQR_TOOM2_THRESHOLD                 56
-#define SQR_TOOM3_THRESHOLD                169
-#define SQR_TOOM4_THRESHOLD                280
-#define SQR_TOOM6_THRESHOLD                  0
-#define SQR_TOOM8_THRESHOLD                309
+#define SQR_BASECASE_THRESHOLD               5
+#define SQR_TOOM2_THRESHOLD                 58
+#define SQR_TOOM3_THRESHOLD                153
+#define SQR_TOOM4_THRESHOLD                278
+#define SQR_TOOM6_THRESHOLD                  0  /* always */
+#define SQR_TOOM8_THRESHOLD                  0  /* always */
 
-#define MULMOD_BNM1_THRESHOLD               16
+#define MULMID_TOOM42_THRESHOLD             56
+
+#define MULMOD_BNM1_THRESHOLD               15
 #define SQRMOD_BNM1_THRESHOLD               19
 
+#define POWM_SEC_TABLE  2,23,228,1084
+
 #define MUL_FFT_MODF_THRESHOLD             336  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
   { {    336, 5}, {     11, 4}, {     23, 5}, {     21, 6}, \
@@ -196,34 +202,36 @@ with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_FFT_THRESHOLD                 1856
 
 #define MULLO_BASECASE_THRESHOLD             0  /* always */
-#define MULLO_DC_THRESHOLD                 133
-#define MULLO_MUL_N_THRESHOLD             4292
+#define MULLO_DC_THRESHOLD                 113
+#define MULLO_MUL_N_THRESHOLD             4658
 
-#define DC_DIV_QR_THRESHOLD                140
-#define DC_DIVAPPR_Q_THRESHOLD             422
-#define DC_BDIV_QR_THRESHOLD               150
-#define DC_BDIV_Q_THRESHOLD                351
+#define DC_DIV_QR_THRESHOLD                123
+#define DC_DIVAPPR_Q_THRESHOLD             372
+#define DC_BDIV_QR_THRESHOLD               142
+#define DC_BDIV_Q_THRESHOLD                312
 
-#define INV_MULMOD_BNM1_THRESHOLD           60
-#define INV_NEWTON_THRESHOLD               348
-#define INV_APPR_THRESHOLD                 324
+#define INV_MULMOD_BNM1_THRESHOLD           58
+#define INV_NEWTON_THRESHOLD               315
+#define INV_APPR_THRESHOLD                 315
 
-#define BINV_NEWTON_THRESHOLD              363
+#define BINV_NEWTON_THRESHOLD              360
 #define REDC_1_TO_REDC_N_THRESHOLD         101
 
-#define MU_DIV_QR_THRESHOLD                998
+#define MU_DIV_QR_THRESHOLD                979
 #define MU_DIVAPPR_Q_THRESHOLD            1142
-#define MUPI_DIV_QR_THRESHOLD              110
+#define MUPI_DIV_QR_THRESHOLD               93
 #define MU_BDIV_QR_THRESHOLD               889
-#define MU_BDIV_Q_THRESHOLD               1334
+#define MU_BDIV_Q_THRESHOLD               1187
 
 #define MATRIX22_STRASSEN_THRESHOLD          9
-#define HGCD_THRESHOLD                     242
-#define GCD_DC_THRESHOLD                   752
-#define GCDEXT_DC_THRESHOLD                545
+#define HGCD_THRESHOLD                     234
+#define HGCD_APPR_THRESHOLD                300
+#define HGCD_REDUCE_THRESHOLD             1553
+#define GCD_DC_THRESHOLD                   684
+#define GCDEXT_DC_THRESHOLD                525
 #define JACOBI_BASE_METHOD                   2
 
 #define GET_STR_DC_THRESHOLD                21
 #define GET_STR_PRECOMPUTE_THRESHOLD        24
-#define SET_STR_DC_THRESHOLD              2008
-#define SET_STR_PRECOMPUTE_THRESHOLD      4066
+#define SET_STR_DC_THRESHOLD              1951
+#define SET_STR_PRECOMPUTE_THRESHOLD      4034
diff --git a/mpn/powerpc64/mode64/p4/gmp-mparam.h b/mpn/powerpc64/mode64/p4/gmp-mparam.h
index 9a0932654..317bc94d6 100644
--- a/mpn/powerpc64/mode64/p4/gmp-mparam.h
+++ b/mpn/powerpc64/mode64/p4/gmp-mparam.h
@@ -29,6 +29,7 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MOD_1_2_TO_MOD_1_4_THRESHOLD        20
 #define PREINV_MOD_1_TO_MOD_1_THRESHOLD     16
 #define USE_PREINV_DIVREM_1                  0
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
 #define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
 #define BMOD_1_TO_MOD_1_THRESHOLD           37
 
@@ -43,16 +44,20 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MUL_TOOM42_TO_TOOM53_THRESHOLD      73
 #define MUL_TOOM42_TO_TOOM63_THRESHOLD      62
 
-#define SQR_BASECASE_THRESHOLD               5
-#define SQR_TOOM2_THRESHOLD                 28
-#define SQR_TOOM3_THRESHOLD                 57
-#define SQR_TOOM4_THRESHOLD                136
-#define SQR_TOOM6_THRESHOLD                181
-#define SQR_TOOM8_THRESHOLD                272
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 24
+#define SQR_TOOM3_THRESHOLD                 73
+#define SQR_TOOM4_THRESHOLD                214
+#define SQR_TOOM6_THRESHOLD                254
+#define SQR_TOOM8_THRESHOLD                430
 
-#define MULMOD_BNM1_THRESHOLD               13
+#define MULMID_TOOM42_THRESHOLD             32
+
+#define MULMOD_BNM1_THRESHOLD               12
 #define SQRMOD_BNM1_THRESHOLD               16
 
+#define POWM_SEC_TABLE  6,47,347,1036,2826
+
 #define MUL_FFT_MODF_THRESHOLD             372  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
   { {    372, 5}, {     13, 6}, {      7, 5}, {     15, 6}, \
@@ -116,9 +121,9 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_FFT_TABLE3_SIZE 103
 #define SQR_FFT_THRESHOLD                 2752
 
-#define MULLO_BASECASE_THRESHOLD             5
+#define MULLO_BASECASE_THRESHOLD             3
 #define MULLO_DC_THRESHOLD                  36
-#define MULLO_MUL_N_THRESHOLD            12691
+#define MULLO_MUL_N_THRESHOLD            13463
 
 #define DC_DIV_QR_THRESHOLD                 43
 #define DC_DIVAPPR_Q_THRESHOLD             158
@@ -139,12 +144,14 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MU_BDIV_Q_THRESHOLD                998
 
 #define MATRIX22_STRASSEN_THRESHOLD         11
-#define HGCD_THRESHOLD                     105
+#define HGCD_THRESHOLD                     103
+#define HGCD_APPR_THRESHOLD                110
+#define HGCD_REDUCE_THRESHOLD             1962
 #define GCD_DC_THRESHOLD                   318
 #define GCDEXT_DC_THRESHOLD                242
 #define JACOBI_BASE_METHOD                   4
 
 #define GET_STR_DC_THRESHOLD                12
 #define GET_STR_PRECOMPUTE_THRESHOLD        23
-#define SET_STR_DC_THRESHOLD               858
-#define SET_STR_PRECOMPUTE_THRESHOLD      1864
+#define SET_STR_DC_THRESHOLD               650
+#define SET_STR_PRECOMPUTE_THRESHOLD      1781
diff --git a/mpn/powerpc64/mode64/p5/gmp-mparam.h b/mpn/powerpc64/mode64/p5/gmp-mparam.h
index d177da94e..9220f99d5 100644
--- a/mpn/powerpc64/mode64/p5/gmp-mparam.h
+++ b/mpn/powerpc64/mode64/p5/gmp-mparam.h
@@ -31,6 +31,7 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
 #define PREINV_MOD_1_TO_MOD_1_THRESHOLD     11
 #define USE_PREINV_DIVREM_1                  0
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
 #define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
 #define BMOD_1_TO_MOD_1_THRESHOLD           40
 
@@ -38,22 +39,26 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MUL_TOOM33_THRESHOLD                50
 #define MUL_TOOM44_THRESHOLD               121
 #define MUL_TOOM6H_THRESHOLD               202
-#define MUL_TOOM8H_THRESHOLD               303
+#define MUL_TOOM8H_THRESHOLD               260
 
 #define MUL_TOOM32_TO_TOOM43_THRESHOLD      82
 #define MUL_TOOM32_TO_TOOM53_THRESHOLD      91
 #define MUL_TOOM42_TO_TOOM53_THRESHOLD      81
 #define MUL_TOOM42_TO_TOOM63_THRESHOLD      88
 
-#define SQR_BASECASE_THRESHOLD               9
-#define SQR_TOOM2_THRESHOLD                 36
-#define SQR_TOOM3_THRESHOLD                 59
-#define SQR_TOOM4_THRESHOLD                147
-#define SQR_TOOM6_THRESHOLD                204
-#define SQR_TOOM8_THRESHOLD                288
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 24
+#define SQR_TOOM3_THRESHOLD                 73
+#define SQR_TOOM4_THRESHOLD                142
+#define SQR_TOOM6_THRESHOLD                191
+#define SQR_TOOM8_THRESHOLD                284
 
-#define MULMOD_BNM1_THRESHOLD               14
-#define SQRMOD_BNM1_THRESHOLD               16
+#define MULMID_TOOM42_THRESHOLD             32
+
+#define MULMOD_BNM1_THRESHOLD               12
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define POWM_SEC_TABLE  4,35,387,1068,2699
 
 #define MUL_FFT_MODF_THRESHOLD             348  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
@@ -166,15 +171,15 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_FFT_THRESHOLD                 2752
 
 #define MULLO_BASECASE_THRESHOLD             0
-#define MULLO_DC_THRESHOLD                  31
+#define MULLO_DC_THRESHOLD                  42
 #define MULLO_MUL_N_THRESHOLD             6633
 
-#define DC_DIV_QR_THRESHOLD                 37
+#define DC_DIV_QR_THRESHOLD                 43
 #define DC_DIVAPPR_Q_THRESHOLD             155
 #define DC_BDIV_QR_THRESHOLD                46
-#define DC_BDIV_Q_THRESHOLD                112
+#define DC_BDIV_Q_THRESHOLD                120
 
-#define INV_MULMOD_BNM1_THRESHOLD           26
+#define INV_MULMOD_BNM1_THRESHOLD           52
 #define INV_NEWTON_THRESHOLD               177
 #define INV_APPR_THRESHOLD                 165
 
@@ -189,11 +194,13 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 
 #define MATRIX22_STRASSEN_THRESHOLD         15
 #define HGCD_THRESHOLD                     108
-#define GCD_DC_THRESHOLD                   303
+#define HGCD_APPR_THRESHOLD                113
+#define HGCD_REDUCE_THRESHOLD             2121
+#define GCD_DC_THRESHOLD                   315
 #define GCDEXT_DC_THRESHOLD                237
 #define JACOBI_BASE_METHOD                   4
 
 #define GET_STR_DC_THRESHOLD                13
 #define GET_STR_PRECOMPUTE_THRESHOLD        23
-#define SET_STR_DC_THRESHOLD               532
-#define SET_STR_PRECOMPUTE_THRESHOLD      1639
+#define SET_STR_DC_THRESHOLD               650
+#define SET_STR_PRECOMPUTE_THRESHOLD      1585
diff --git a/mpn/powerpc64/mode64/p6/gmp-mparam.h b/mpn/powerpc64/mode64/p6/gmp-mparam.h
index 88cac3e72..5ec334089 100644
--- a/mpn/powerpc64/mode64/p6/gmp-mparam.h
+++ b/mpn/powerpc64/mode64/p6/gmp-mparam.h
@@ -31,6 +31,7 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
 #define PREINV_MOD_1_TO_MOD_1_THRESHOLD      5
 #define USE_PREINV_DIVREM_1                  0
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
 #define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
 #define BMOD_1_TO_MOD_1_THRESHOLD           21
 
@@ -45,16 +46,20 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MUL_TOOM42_TO_TOOM53_THRESHOLD      73
 #define MUL_TOOM42_TO_TOOM63_THRESHOLD      66
 
-#define SQR_BASECASE_THRESHOLD               9
-#define SQR_TOOM2_THRESHOLD                 30
-#define SQR_TOOM3_THRESHOLD                 53
-#define SQR_TOOM4_THRESHOLD                148
-#define SQR_TOOM6_THRESHOLD                226
-#define SQR_TOOM8_THRESHOLD                430
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 24
+#define SQR_TOOM3_THRESHOLD                 49
+#define SQR_TOOM4_THRESHOLD                136
+#define SQR_TOOM6_THRESHOLD                274
+#define SQR_TOOM8_THRESHOLD                410
+
+#define MULMID_TOOM42_THRESHOLD             24
 
 #define MULMOD_BNM1_THRESHOLD               14
 #define SQRMOD_BNM1_THRESHOLD               14
 
+#define POWM_SEC_TABLE  4,19,228,713,919
+
 #define MUL_FFT_MODF_THRESHOLD             340  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
   { {    340, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
@@ -107,14 +112,14 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 
 #define MULLO_BASECASE_THRESHOLD             5
 #define MULLO_DC_THRESHOLD                  28
-#define MULLO_MUL_N_THRESHOLD             6633
+#define MULLO_MUL_N_THRESHOLD             3084
 
-#define DC_DIV_QR_THRESHOLD                 27
+#define DC_DIV_QR_THRESHOLD                 23
 #define DC_DIVAPPR_Q_THRESHOLD             112
 #define DC_BDIV_QR_THRESHOLD                29
-#define DC_BDIV_Q_THRESHOLD                 86
+#define DC_BDIV_Q_THRESHOLD                 79
 
-#define INV_MULMOD_BNM1_THRESHOLD           47
+#define INV_MULMOD_BNM1_THRESHOLD           51
 #define INV_NEWTON_THRESHOLD                93
 #define INV_APPR_THRESHOLD                  91
 
@@ -123,14 +128,16 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 
 #define MU_DIV_QR_THRESHOLD                855
 #define MU_DIVAPPR_Q_THRESHOLD             807
-#define MUPI_DIV_QR_THRESHOLD               33
+#define MUPI_DIV_QR_THRESHOLD               23
 #define MU_BDIV_QR_THRESHOLD               807
 #define MU_BDIV_Q_THRESHOLD                872
 
-#define MATRIX22_STRASSEN_THRESHOLD         11
-#define HGCD_THRESHOLD                      64
-#define GCD_DC_THRESHOLD                   237
-#define GCDEXT_DC_THRESHOLD                183
+#define MATRIX22_STRASSEN_THRESHOLD         13
+#define HGCD_THRESHOLD                      69
+#define HGCD_APPR_THRESHOLD                 50
+#define HGCD_REDUCE_THRESHOLD             2121
+#define GCD_DC_THRESHOLD                   268
+#define GCDEXT_DC_THRESHOLD                209
 #define JACOBI_BASE_METHOD                   4
 
 #define GET_STR_DC_THRESHOLD                17
diff --git a/mpn/powerpc64/mode64/p7/gmp-mparam.h b/mpn/powerpc64/mode64/p7/gmp-mparam.h
index 57b888637..02603c525 100644
--- a/mpn/powerpc64/mode64/p7/gmp-mparam.h
+++ b/mpn/powerpc64/mode64/p7/gmp-mparam.h
@@ -28,7 +28,7 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MOD_1N_TO_MOD_1_1_THRESHOLD          6
 #define MOD_1U_TO_MOD_1_1_THRESHOLD          5
 #define MOD_1_1_TO_MOD_1_2_THRESHOLD         7
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD        22
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        18
 #define PREINV_MOD_1_TO_MOD_1_THRESHOLD     13
 #define USE_PREINV_DIVREM_1                  0
 #define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
@@ -46,18 +46,20 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MUL_TOOM42_TO_TOOM53_THRESHOLD     135
 #define MUL_TOOM42_TO_TOOM63_THRESHOLD     141
 
-#define SQR_BASECASE_THRESHOLD              10
-#define SQR_TOOM2_THRESHOLD                 50
-#define SQR_TOOM3_THRESHOLD                 84
-#define SQR_TOOM4_THRESHOLD                160
-#define SQR_TOOM6_THRESHOLD                246
-#define SQR_TOOM8_THRESHOLD                296
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 36
+#define SQR_TOOM3_THRESHOLD                109
+#define SQR_TOOM4_THRESHOLD                202
+#define SQR_TOOM6_THRESHOLD                303
+#define SQR_TOOM8_THRESHOLD                399
 
 #define MULMID_TOOM42_THRESHOLD             62
 
 #define MULMOD_BNM1_THRESHOLD               15
 #define SQRMOD_BNM1_THRESHOLD               16
 
+#define POWM_SEC_TABLE  6,65,342,1465
+
 #define MUL_FFT_MODF_THRESHOLD             436  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
   { {    436, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
@@ -121,8 +123,8 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_FFT_TABLE3_SIZE 103
 #define SQR_FFT_THRESHOLD                 3264
 
-#define MULLO_BASECASE_THRESHOLD             4
-#define MULLO_DC_THRESHOLD                  34
+#define MULLO_BASECASE_THRESHOLD             3
+#define MULLO_DC_THRESHOLD                  23
 #define MULLO_MUL_N_THRESHOLD             9174
 
 #define DC_DIV_QR_THRESHOLD                 30
@@ -144,12 +146,14 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MU_BDIV_Q_THRESHOLD               1499
 
 #define MATRIX22_STRASSEN_THRESHOLD         15
-#define HGCD_THRESHOLD                     121
-#define GCD_DC_THRESHOLD                   443
-#define GCDEXT_DC_THRESHOLD                396
+#define HGCD_THRESHOLD                     124
+#define HGCD_APPR_THRESHOLD                155
+#define HGCD_REDUCE_THRESHOLD             3134
+#define GCD_DC_THRESHOLD                   492
+#define GCDEXT_DC_THRESHOLD                333
 #define JACOBI_BASE_METHOD                   4
 
 #define GET_STR_DC_THRESHOLD                11
-#define GET_STR_PRECOMPUTE_THRESHOLD        22
+#define GET_STR_PRECOMPUTE_THRESHOLD        17
 #define SET_STR_DC_THRESHOLD              1517
-#define SET_STR_PRECOMPUTE_THRESHOLD      4040
+#define SET_STR_PRECOMPUTE_THRESHOLD      3421
diff --git a/mpn/s390_64/gmp-mparam.h b/mpn/s390_64/gmp-mparam.h
index c4960254e..46ca86726 100644
--- a/mpn/s390_64/gmp-mparam.h
+++ b/mpn/s390_64/gmp-mparam.h
@@ -61,6 +61,8 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MULMOD_BNM1_THRESHOLD                9
 #define SQRMOD_BNM1_THRESHOLD               11
 
+#define POWM_SEC_TABLE  4,23,128,598
+
 #define MUL_FFT_MODF_THRESHOLD             220  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
   { {    220, 5}, {      7, 4}, {     15, 5}, {      8, 4}, \
@@ -131,7 +133,7 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 
 #define MULLO_BASECASE_THRESHOLD             3
 #define MULLO_DC_THRESHOLD                  33
-#define MULLO_MUL_N_THRESHOLD             4392
+#define MULLO_MUL_N_THRESHOLD             5240
 
 #define DC_DIV_QR_THRESHOLD                 28
 #define DC_DIVAPPR_Q_THRESHOLD             106
@@ -152,12 +154,14 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MU_BDIV_Q_THRESHOLD                680
 
 #define MATRIX22_STRASSEN_THRESHOLD         11
-#define HGCD_THRESHOLD                      71
-#define GCD_DC_THRESHOLD                   177
-#define GCDEXT_DC_THRESHOLD                142
-#define JACOBI_BASE_METHOD                   2
+#define HGCD_THRESHOLD                      75
+#define HGCD_APPR_THRESHOLD                 59
+#define HGCD_REDUCE_THRESHOLD              901
+#define GCD_DC_THRESHOLD                   186
+#define GCDEXT_DC_THRESHOLD                150
+#define JACOBI_BASE_METHOD                   3
 
 #define GET_STR_DC_THRESHOLD                27
 #define GET_STR_PRECOMPUTE_THRESHOLD        40
-#define SET_STR_DC_THRESHOLD               363
+#define SET_STR_DC_THRESHOLD               418
 #define SET_STR_PRECOMPUTE_THRESHOLD      1111
diff --git a/mpn/sparc64/ultrasparc34/gmp-mparam.h b/mpn/sparc64/ultrasparc34/gmp-mparam.h
index faed8efa3..8fe8ddc54 100644
--- a/mpn/sparc64/ultrasparc34/gmp-mparam.h
+++ b/mpn/sparc64/ultrasparc34/gmp-mparam.h
@@ -28,12 +28,13 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MOD_1_1P_METHOD                      2
 #define MOD_1_NORM_THRESHOLD                 0  /* always */
 #define MOD_1_UNNORM_THRESHOLD               0  /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD          7
-#define MOD_1U_TO_MOD_1_1_THRESHOLD         38
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          9
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        24
 #define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     33
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     22
 #define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
 #define DIVEXACT_1_THRESHOLD                 0  /* always */
 #define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
 
@@ -55,8 +56,12 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_TOOM6_THRESHOLD                191
 #define SQR_TOOM8_THRESHOLD                339
 
-#define MULMOD_BNM1_THRESHOLD               14
-#define SQRMOD_BNM1_THRESHOLD               13
+#define MULMID_TOOM42_THRESHOLD             42
+
+#define MULMOD_BNM1_THRESHOLD               16
+#define SQRMOD_BNM1_THRESHOLD                9
+
+#define POWM_SEC_TABLE  4,23,130,780,1812,1926
 
 #define MUL_FFT_MODF_THRESHOLD             212  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
@@ -157,7 +162,7 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_FFT_TABLE3_SIZE 182
 #define SQR_FFT_THRESHOLD                 1984
 
-#define MULLO_BASECASE_THRESHOLD             8
+#define MULLO_BASECASE_THRESHOLD            14
 #define MULLO_DC_THRESHOLD                   0  /* never mpn_mullo_basecase */
 #define MULLO_MUL_N_THRESHOLD             3791
 
@@ -170,7 +175,7 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define INV_NEWTON_THRESHOLD                17
 #define INV_APPR_THRESHOLD                  17
 
-#define BINV_NEWTON_THRESHOLD              134
+#define BINV_NEWTON_THRESHOLD               92
 #define REDC_1_TO_REDC_2_THRESHOLD           2
 #define REDC_2_TO_REDC_N_THRESHOLD         117
 
@@ -181,12 +186,14 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MU_BDIV_Q_THRESHOLD                748
 
 #define MATRIX22_STRASSEN_THRESHOLD         12
-#define HGCD_THRESHOLD                      46
-#define GCD_DC_THRESHOLD                   130
+#define HGCD_THRESHOLD                      45
+#define HGCD_APPR_THRESHOLD                 50
+#define HGCD_REDUCE_THRESHOLD             1094
+#define GCD_DC_THRESHOLD                   126
 #define GCDEXT_DC_THRESHOLD                134
 #define JACOBI_BASE_METHOD                   2
 
 #define GET_STR_DC_THRESHOLD                18
 #define GET_STR_PRECOMPUTE_THRESHOLD        27
-#define SET_STR_DC_THRESHOLD               315
+#define SET_STR_DC_THRESHOLD               286
 #define SET_STR_PRECOMPUTE_THRESHOLD      1037
diff --git a/mpn/sparc64/ultrasparct1/gmp-mparam.h b/mpn/sparc64/ultrasparct1/gmp-mparam.h
index 744f7e17c..34c8027f5 100644
--- a/mpn/sparc64/ultrasparct1/gmp-mparam.h
+++ b/mpn/sparc64/ultrasparct1/gmp-mparam.h
@@ -25,14 +25,16 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 
 #define DIVREM_1_NORM_THRESHOLD              0  /* always */
 #define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1P_METHOD                      2
 #define MOD_1_NORM_THRESHOLD                 0  /* always */
 #define MOD_1_UNNORM_THRESHOLD               0  /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD      MP_SIZE_T_MAX  /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         10
 #define MOD_1U_TO_MOD_1_1_THRESHOLD      MP_SIZE_T_MAX
 #define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
 #define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD  MP_SIZE_T_MAX  /* never */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     35
 #define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
 #define DIVEXACT_1_THRESHOLD                 0  /* always */
 #define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
 
@@ -50,13 +52,17 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_BASECASE_THRESHOLD               0  /* always */
 #define SQR_TOOM2_THRESHOLD                 16
 #define SQR_TOOM3_THRESHOLD                 57
-#define SQR_TOOM4_THRESHOLD                133
-#define SQR_TOOM6_THRESHOLD                156
+#define SQR_TOOM4_THRESHOLD                135
+#define SQR_TOOM6_THRESHOLD                160
 #define SQR_TOOM8_THRESHOLD                260
 
+#define MULMID_TOOM42_THRESHOLD             12
+
 #define MULMOD_BNM1_THRESHOLD                7
 #define SQRMOD_BNM1_THRESHOLD                7
 
+#define POWM_SEC_TABLE  2,23,176,625,2783
+
 #define MUL_FFT_MODF_THRESHOLD             176  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
   { {    176, 5}, {      7, 6}, {      4, 5}, {      9, 6}, \
@@ -102,30 +108,32 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MULLO_MUL_N_THRESHOLD             3176
 
 #define DC_DIV_QR_THRESHOLD                 27
-#define DC_DIVAPPR_Q_THRESHOLD             107
+#define DC_DIVAPPR_Q_THRESHOLD             108
 #define DC_BDIV_QR_THRESHOLD                27
 #define DC_BDIV_Q_THRESHOLD                 62
 
-#define INV_MULMOD_BNM1_THRESHOLD           22
+#define INV_MULMOD_BNM1_THRESHOLD           14
 #define INV_NEWTON_THRESHOLD               163
 #define INV_APPR_THRESHOLD                 117
 
 #define BINV_NEWTON_THRESHOLD              166
 #define REDC_1_TO_REDC_N_THRESHOLD          32
 
-#define MU_DIV_QR_THRESHOLD                720
-#define MU_DIVAPPR_Q_THRESHOLD             734
-#define MUPI_DIV_QR_THRESHOLD               67
+#define MU_DIV_QR_THRESHOLD                734
+#define MU_DIVAPPR_Q_THRESHOLD             748
+#define MUPI_DIV_QR_THRESHOLD               68
 #define MU_BDIV_QR_THRESHOLD               562
 #define MU_BDIV_Q_THRESHOLD                734
 
-#define MATRIX22_STRASSEN_THRESHOLD         11
-#define HGCD_THRESHOLD                      53
+#define MATRIX22_STRASSEN_THRESHOLD          9
+#define HGCD_THRESHOLD                      66
+#define HGCD_APPR_THRESHOLD                 47
+#define HGCD_REDUCE_THRESHOLD              834
 #define GCD_DC_THRESHOLD                   183
-#define GCDEXT_DC_THRESHOLD                144
+#define GCDEXT_DC_THRESHOLD                142
 #define JACOBI_BASE_METHOD                   3
 
 #define GET_STR_DC_THRESHOLD                20
-#define GET_STR_PRECOMPUTE_THRESHOLD        39
+#define GET_STR_PRECOMPUTE_THRESHOLD        36
 #define SET_STR_DC_THRESHOLD               458
-#define SET_STR_PRECOMPUTE_THRESHOLD       964
+#define SET_STR_PRECOMPUTE_THRESHOLD       963
diff --git a/mpn/x86/atom/gmp-mparam.h b/mpn/x86/atom/gmp-mparam.h
index 8c2595230..391a0ac4a 100644
--- a/mpn/x86/atom/gmp-mparam.h
+++ b/mpn/x86/atom/gmp-mparam.h
@@ -24,26 +24,27 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 /* Generated by tuneup.c */
 
 #define MOD_1_NORM_THRESHOLD                 3
-#define MOD_1_UNNORM_THRESHOLD               6
-#define MOD_1N_TO_MOD_1_1_THRESHOLD          9
-#define MOD_1U_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1_UNNORM_THRESHOLD               5
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
 #define MOD_1_1_TO_MOD_1_2_THRESHOLD        10
 #define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
 #define PREINV_MOD_1_TO_MOD_1_THRESHOLD     13
 #define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
 #define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
 #define BMOD_1_TO_MOD_1_THRESHOLD           33
 
 #define MUL_TOOM22_THRESHOLD                20
 #define MUL_TOOM33_THRESHOLD                78
-#define MUL_TOOM44_THRESHOLD               184
+#define MUL_TOOM44_THRESHOLD               168
 #define MUL_TOOM6H_THRESHOLD               270
 #define MUL_TOOM8H_THRESHOLD               406
 
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD      79
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD     126
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD     121
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD     127
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     107
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      73
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      96
 
 #define SQR_BASECASE_THRESHOLD               0  /* always (native) */
 #define SQR_TOOM2_THRESHOLD                 34
@@ -52,8 +53,12 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_TOOM6_THRESHOLD                303
 #define SQR_TOOM8_THRESHOLD                547
 
-#define MULMOD_BNM1_THRESHOLD               14
-#define SQRMOD_BNM1_THRESHOLD               18
+#define MULMID_TOOM42_THRESHOLD             54
+
+#define MULMOD_BNM1_THRESHOLD               16
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define POWM_SEC_TABLE  2,35,262,1168
 
 #define MUL_FFT_MODF_THRESHOLD             376  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
@@ -108,9 +113,9 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_FFT_TABLE3_SIZE 82
 #define SQR_FFT_THRESHOLD                 2752
 
-#define MULLO_BASECASE_THRESHOLD             4
+#define MULLO_BASECASE_THRESHOLD             5
 #define MULLO_DC_THRESHOLD                  51
-#define MULLO_MUL_N_THRESHOLD             8907
+#define MULLO_MUL_N_THRESHOLD             6633
 
 #define DC_DIV_QR_THRESHOLD                 63
 #define DC_DIVAPPR_Q_THRESHOLD             252
@@ -131,12 +136,14 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MU_BDIV_Q_THRESHOLD               1334
 
 #define MATRIX22_STRASSEN_THRESHOLD         15
-#define HGCD_THRESHOLD                     126
-#define GCD_DC_THRESHOLD                   483
-#define GCDEXT_DC_THRESHOLD                351
+#define HGCD_THRESHOLD                     129
+#define HGCD_APPR_THRESHOLD                163
+#define HGCD_REDUCE_THRESHOLD             2121
+#define GCD_DC_THRESHOLD                   469
+#define GCDEXT_DC_THRESHOLD                348
 #define JACOBI_BASE_METHOD                   3
 
 #define GET_STR_DC_THRESHOLD                13
 #define GET_STR_PRECOMPUTE_THRESHOLD        24
-#define SET_STR_DC_THRESHOLD               272
-#define SET_STR_PRECOMPUTE_THRESHOLD      1116
+#define SET_STR_DC_THRESHOLD               262
+#define SET_STR_PRECOMPUTE_THRESHOLD       902
diff --git a/mpn/x86/k7/gmp-mparam.h b/mpn/x86/k7/gmp-mparam.h
index 84238c4e0..9cc6798af 100644
--- a/mpn/x86/k7/gmp-mparam.h
+++ b/mpn/x86/k7/gmp-mparam.h
@@ -30,6 +30,7 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
 #define PREINV_MOD_1_TO_MOD_1_THRESHOLD     11
 #define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
 #define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
 #define BMOD_1_TO_MOD_1_THRESHOLD           26
 
@@ -40,19 +41,23 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MUL_TOOM8H_THRESHOLD               454
 
 #define MUL_TOOM32_TO_TOOM43_THRESHOLD      85
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD     122
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD      93
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      95
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      97
 #define MUL_TOOM42_TO_TOOM63_THRESHOLD     101
 
 #define SQR_BASECASE_THRESHOLD               0  /* always (native) */
 #define SQR_TOOM2_THRESHOLD                 50
-#define SQR_TOOM3_THRESHOLD                 87
+#define SQR_TOOM3_THRESHOLD                 81
 #define SQR_TOOM4_THRESHOLD                148
-#define SQR_TOOM6_THRESHOLD                306
+#define SQR_TOOM6_THRESHOLD                274
 #define SQR_TOOM8_THRESHOLD                430
 
+#define MULMID_TOOM42_THRESHOLD             88
+
 #define MULMOD_BNM1_THRESHOLD               18
-#define SQRMOD_BNM1_THRESHOLD               19
+#define SQRMOD_BNM1_THRESHOLD               18
+
+#define POWM_SEC_TABLE  2,17,225,961,1604
 
 #define MUL_FFT_MODF_THRESHOLD             888  /* k = 6 */
 #define MUL_FFT_TABLE3                                      \
@@ -155,28 +160,30 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MULLO_DC_THRESHOLD                  42
 #define MULLO_MUL_N_THRESHOLD            13463
 
-#define DC_DIV_QR_THRESHOLD                 89
-#define DC_DIVAPPR_Q_THRESHOLD             315
+#define DC_DIV_QR_THRESHOLD                 60
+#define DC_DIVAPPR_Q_THRESHOLD             336
 #define DC_BDIV_QR_THRESHOLD                91
-#define DC_BDIV_Q_THRESHOLD                274
+#define DC_BDIV_Q_THRESHOLD                268
 
 #define INV_MULMOD_BNM1_THRESHOLD           66
-#define INV_NEWTON_THRESHOLD               300
-#define INV_APPR_THRESHOLD                 303
+#define INV_NEWTON_THRESHOLD               284
+#define INV_APPR_THRESHOLD                 284
 
-#define BINV_NEWTON_THRESHOLD              303
-#define REDC_1_TO_REDC_N_THRESHOLD          95
+#define BINV_NEWTON_THRESHOLD              270
+#define REDC_1_TO_REDC_N_THRESHOLD          87
 
-#define MU_DIV_QR_THRESHOLD               1858
-#define MU_DIVAPPR_Q_THRESHOLD            1718
-#define MUPI_DIV_QR_THRESHOLD              132
-#define MU_BDIV_QR_THRESHOLD              1387
+#define MU_DIV_QR_THRESHOLD               1752
+#define MU_DIVAPPR_Q_THRESHOLD            1652
+#define MUPI_DIV_QR_THRESHOLD               97
+#define MU_BDIV_QR_THRESHOLD              1470
 #define MU_BDIV_Q_THRESHOLD               1470
 
 #define MATRIX22_STRASSEN_THRESHOLD         15
-#define HGCD_THRESHOLD                     154
-#define GCD_DC_THRESHOLD                   599
-#define GCDEXT_DC_THRESHOLD                443
+#define HGCD_THRESHOLD                     173
+#define HGCD_APPR_THRESHOLD                226
+#define HGCD_REDUCE_THRESHOLD             4633
+#define GCD_DC_THRESHOLD                   580
+#define GCDEXT_DC_THRESHOLD                414
 #define JACOBI_BASE_METHOD                   4
 
 #define GET_STR_DC_THRESHOLD                17
diff --git a/mpn/x86/p6/sse2/gmp-mparam.h b/mpn/x86/p6/sse2/gmp-mparam.h
index 2735b9c63..b163c58cc 100644
--- a/mpn/x86/p6/sse2/gmp-mparam.h
+++ b/mpn/x86/p6/sse2/gmp-mparam.h
@@ -31,37 +31,42 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 /* 1867 MHz P6 model 13 */
 
 #define MOD_1_NORM_THRESHOLD                 4
-#define MOD_1_UNNORM_THRESHOLD               3
+#define MOD_1_UNNORM_THRESHOLD               4
 #define MOD_1N_TO_MOD_1_1_THRESHOLD          5
 #define MOD_1U_TO_MOD_1_1_THRESHOLD          4
 #define MOD_1_1_TO_MOD_1_2_THRESHOLD        11
 #define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     10
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      8
 #define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
 #define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD           22
+#define BMOD_1_TO_MOD_1_THRESHOLD           21
 
 #define MUL_TOOM22_THRESHOLD                20
-#define MUL_TOOM33_THRESHOLD                77
-#define MUL_TOOM44_THRESHOLD               182
+#define MUL_TOOM33_THRESHOLD                74
+#define MUL_TOOM44_THRESHOLD               181
 #define MUL_TOOM6H_THRESHOLD               252
-#define MUL_TOOM8H_THRESHOLD               381
+#define MUL_TOOM8H_THRESHOLD               363
 
 #define MUL_TOOM32_TO_TOOM43_THRESHOLD      73
 #define MUL_TOOM32_TO_TOOM53_THRESHOLD     114
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD      89
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD      79
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     115
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      80
 
 #define SQR_BASECASE_THRESHOLD               0  /* always (native) */
 #define SQR_TOOM2_THRESHOLD                 30
 #define SQR_TOOM3_THRESHOLD                101
 #define SQR_TOOM4_THRESHOLD                154
 #define SQR_TOOM6_THRESHOLD                222
-#define SQR_TOOM8_THRESHOLD                547
+#define SQR_TOOM8_THRESHOLD                527
+
+#define MULMID_TOOM42_THRESHOLD             58
 
 #define MULMOD_BNM1_THRESHOLD               13
 #define SQRMOD_BNM1_THRESHOLD               17
 
+#define POWM_SEC_TABLE  4,23,258,768,2388
+
 #define MUL_FFT_MODF_THRESHOLD             565  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
   { {    565, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
@@ -143,34 +148,36 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_FFT_THRESHOLD                 5760
 
 #define MULLO_BASECASE_THRESHOLD             0  /* always */
-#define MULLO_DC_THRESHOLD                  34
+#define MULLO_DC_THRESHOLD                  33
 #define MULLO_MUL_N_THRESHOLD            13463
 
-#define DC_DIV_QR_THRESHOLD                 22
+#define DC_DIV_QR_THRESHOLD                 20
 #define DC_DIVAPPR_Q_THRESHOLD              56
 #define DC_BDIV_QR_THRESHOLD                60
-#define DC_BDIV_Q_THRESHOLD                132
+#define DC_BDIV_Q_THRESHOLD                134
 
 #define INV_MULMOD_BNM1_THRESHOLD           38
-#define INV_NEWTON_THRESHOLD                71
+#define INV_NEWTON_THRESHOLD                66
 #define INV_APPR_THRESHOLD                  63
 
-#define BINV_NEWTON_THRESHOLD              252
-#define REDC_1_TO_REDC_N_THRESHOLD          62
+#define BINV_NEWTON_THRESHOLD              250
+#define REDC_1_TO_REDC_N_THRESHOLD          63
 
-#define MU_DIV_QR_THRESHOLD               1142
-#define MU_DIVAPPR_Q_THRESHOLD             889
-#define MUPI_DIV_QR_THRESHOLD               39
-#define MU_BDIV_QR_THRESHOLD              1308
-#define MU_BDIV_Q_THRESHOLD               1442
+#define MU_DIV_QR_THRESHOLD               1164
+#define MU_DIVAPPR_Q_THRESHOLD             979
+#define MUPI_DIV_QR_THRESHOLD               38
+#define MU_BDIV_QR_THRESHOLD              1442
+#define MU_BDIV_Q_THRESHOLD               1470
 
 #define MATRIX22_STRASSEN_THRESHOLD         17
-#define HGCD_THRESHOLD                      61
-#define GCD_DC_THRESHOLD                   379
-#define GCDEXT_DC_THRESHOLD                298
-#define JACOBI_BASE_METHOD                   4
+#define HGCD_THRESHOLD                      64
+#define HGCD_APPR_THRESHOLD                105
+#define HGCD_REDUCE_THRESHOLD             3524
+#define GCD_DC_THRESHOLD                   386
+#define GCDEXT_DC_THRESHOLD                309
+#define JACOBI_BASE_METHOD                   1
 
 #define GET_STR_DC_THRESHOLD                13
-#define GET_STR_PRECOMPUTE_THRESHOLD        20
-#define SET_STR_DC_THRESHOLD               582
-#define SET_STR_PRECOMPUTE_THRESHOLD      1055
+#define GET_STR_PRECOMPUTE_THRESHOLD        26
+#define SET_STR_DC_THRESHOLD               587
+#define SET_STR_PRECOMPUTE_THRESHOLD      1104
diff --git a/mpn/x86/pentium4/sse2/gmp-mparam.h b/mpn/x86/pentium4/sse2/gmp-mparam.h
index b1e56b5e2..8a198ad96 100644
--- a/mpn/x86/pentium4/sse2/gmp-mparam.h
+++ b/mpn/x86/pentium4/sse2/gmp-mparam.h
@@ -22,37 +22,42 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define BYTES_PER_MP_LIMB 4
 
 
-#define MOD_1_NORM_THRESHOLD                 9
-#define MOD_1_UNNORM_THRESHOLD              20
+#define MOD_1_NORM_THRESHOLD             MP_SIZE_T_MAX  /* never */
+#define MOD_1_UNNORM_THRESHOLD           MP_SIZE_T_MAX  /* never */
 #define MOD_1N_TO_MOD_1_1_THRESHOLD          6
 #define MOD_1U_TO_MOD_1_1_THRESHOLD          5
 #define MOD_1_1_TO_MOD_1_2_THRESHOLD        13
 #define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
 #define PREINV_MOD_1_TO_MOD_1_THRESHOLD      6
 #define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
 #define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
 #define BMOD_1_TO_MOD_1_THRESHOLD           20
 
 #define MUL_TOOM22_THRESHOLD                31
-#define MUL_TOOM33_THRESHOLD               120
-#define MUL_TOOM44_THRESHOLD               286
+#define MUL_TOOM33_THRESHOLD               114
+#define MUL_TOOM44_THRESHOLD               300
 #define MUL_TOOM6H_THRESHOLD               426
-#define MUL_TOOM8H_THRESHOLD               592
+#define MUL_TOOM8H_THRESHOLD               620
 
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD     195
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD     216
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD     193
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD     187
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     184
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     207
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     181
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     209
 
 #define SQR_BASECASE_THRESHOLD               0  /* always (native) */
-#define SQR_TOOM2_THRESHOLD                 48
-#define SQR_TOOM3_THRESHOLD                174
-#define SQR_TOOM4_THRESHOLD                390
-#define SQR_TOOM6_THRESHOLD                  0
-#define SQR_TOOM8_THRESHOLD                507
+#define SQR_TOOM2_THRESHOLD                 49
+#define SQR_TOOM3_THRESHOLD                173
+#define SQR_TOOM4_THRESHOLD                264
+#define SQR_TOOM6_THRESHOLD                354
+#define SQR_TOOM8_THRESHOLD                810
 
-#define MULMOD_BNM1_THRESHOLD               17
-#define SQRMOD_BNM1_THRESHOLD               21
+#define MULMID_TOOM42_THRESHOLD             68
+
+#define MULMOD_BNM1_THRESHOLD               19
+#define SQRMOD_BNM1_THRESHOLD               23
+
+#define POWM_SEC_TABLE  2,33,246,1052,2178
 
 #define MUL_FFT_MODF_THRESHOLD             904  /* k = 6 */
 #define MUL_FFT_TABLE3                                      \
@@ -102,35 +107,37 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_FFT_TABLE3_SIZE 72
 #define SQR_FFT_THRESHOLD                 6784
 
-#define MULLO_BASECASE_THRESHOLD            12
-#define MULLO_DC_THRESHOLD                  49
-#define MULLO_MUL_N_THRESHOLD            13866
+#define MULLO_BASECASE_THRESHOLD            13
+#define MULLO_DC_THRESHOLD                  52
+#define MULLO_MUL_N_THRESHOLD            13463
 
-#define DC_DIV_QR_THRESHOLD                 37
-#define DC_DIVAPPR_Q_THRESHOLD              81
-#define DC_BDIV_QR_THRESHOLD                51
-#define DC_BDIV_Q_THRESHOLD                 80
+#define DC_DIV_QR_THRESHOLD                 39
+#define DC_DIVAPPR_Q_THRESHOLD              77
+#define DC_BDIV_QR_THRESHOLD                54
+#define DC_BDIV_Q_THRESHOLD                 94
 
 #define INV_MULMOD_BNM1_THRESHOLD           60
-#define INV_NEWTON_THRESHOLD               244
-#define INV_APPR_THRESHOLD                  98
+#define INV_NEWTON_THRESHOLD               182
+#define INV_APPR_THRESHOLD                  93
 
-#define BINV_NEWTON_THRESHOLD              276
-#define REDC_1_TO_REDC_N_THRESHOLD          63
+#define BINV_NEWTON_THRESHOLD              296
+#define REDC_1_TO_REDC_N_THRESHOLD          66
 
 #define MU_DIV_QR_THRESHOLD               2350
-#define MU_DIVAPPR_Q_THRESHOLD            2172
-#define MUPI_DIV_QR_THRESHOLD               48
-#define MU_BDIV_QR_THRESHOLD              1858
-#define MU_BDIV_Q_THRESHOLD               2172
-
-#define MATRIX22_STRASSEN_THRESHOLD         29
-#define HGCD_THRESHOLD                      81
-#define GCD_DC_THRESHOLD                   309
+#define MU_DIVAPPR_Q_THRESHOLD            2130
+#define MUPI_DIV_QR_THRESHOLD               71
+#define MU_BDIV_QR_THRESHOLD              2130
+#define MU_BDIV_Q_THRESHOLD               2130
+
+#define MATRIX22_STRASSEN_THRESHOLD         24
+#define HGCD_THRESHOLD                      77
+#define HGCD_APPR_THRESHOLD                 91
+#define HGCD_REDUCE_THRESHOLD             5010
+#define GCD_DC_THRESHOLD                   327
 #define GCDEXT_DC_THRESHOLD                253
 #define JACOBI_BASE_METHOD                   4
 
-#define GET_STR_DC_THRESHOLD                10
-#define GET_STR_PRECOMPUTE_THRESHOLD        25
-#define SET_STR_DC_THRESHOLD               118
-#define SET_STR_PRECOMPUTE_THRESHOLD      1099
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        26
+#define SET_STR_DC_THRESHOLD               144
+#define SET_STR_PRECOMPUTE_THRESHOLD       979
diff --git a/mpn/x86_64/atom/gmp-mparam.h b/mpn/x86_64/atom/gmp-mparam.h
index 37ddcebc2..380f36f25 100644
--- a/mpn/x86_64/atom/gmp-mparam.h
+++ b/mpn/x86_64/atom/gmp-mparam.h
@@ -31,14 +31,15 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 
 #define MOD_1_NORM_THRESHOLD                 0  /* always */
 #define MOD_1_UNNORM_THRESHOLD               0  /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          5
 #define MOD_1U_TO_MOD_1_1_THRESHOLD          3
 #define MOD_1_1_TO_MOD_1_2_THRESHOLD     MP_SIZE_T_MAX
 #define MOD_1_2_TO_MOD_1_4_THRESHOLD         0
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     10
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     11
 #define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
 #define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD           17
+#define BMOD_1_TO_MOD_1_THRESHOLD           16
 
 #define MUL_TOOM22_THRESHOLD                10
 #define MUL_TOOM33_THRESHOLD                65
@@ -58,9 +59,13 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_TOOM6_THRESHOLD                222
 #define SQR_TOOM8_THRESHOLD                333
 
+#define MULMID_TOOM42_THRESHOLD             14
+
 #define MULMOD_BNM1_THRESHOLD                7
 #define SQRMOD_BNM1_THRESHOLD               12
 
+#define POWM_SEC_TABLE  2,31,213,724,2112
+
 #define MUL_FFT_MODF_THRESHOLD             220  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
   { {    220, 5}, {      7, 4}, {     15, 5}, {     13, 6}, \
@@ -145,9 +150,11 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MU_BDIV_Q_THRESHOLD                748
 
 #define MATRIX22_STRASSEN_THRESHOLD         13
-#define HGCD_THRESHOLD                      82
+#define HGCD_THRESHOLD                      79
+#define HGCD_APPR_THRESHOLD                 83
+#define HGCD_REDUCE_THRESHOLD             1137
 #define GCD_DC_THRESHOLD                   186
-#define GCDEXT_DC_THRESHOLD                186
+#define GCDEXT_DC_THRESHOLD                189
 #define JACOBI_BASE_METHOD                   4
 
 #define GET_STR_DC_THRESHOLD                15
diff --git a/mpn/x86_64/bobcat/gmp-mparam.h b/mpn/x86_64/bobcat/gmp-mparam.h
index f1edb1d36..5acb78a62 100644
--- a/mpn/x86_64/bobcat/gmp-mparam.h
+++ b/mpn/x86_64/bobcat/gmp-mparam.h
@@ -58,6 +58,8 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MULMOD_BNM1_THRESHOLD               11
 #define SQRMOD_BNM1_THRESHOLD               15
 
+#define POWM_SEC_TABLE  2,23,322,840
+
 #define MUL_FFT_MODF_THRESHOLD             376  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
   { {    376, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
@@ -145,9 +147,11 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MU_BDIV_Q_THRESHOLD               1308
 
 #define MATRIX22_STRASSEN_THRESHOLD         14
-#define HGCD_THRESHOLD                     103
-#define GCD_DC_THRESHOLD                   469
-#define GCDEXT_DC_THRESHOLD                290
+#define HGCD_THRESHOLD                     105
+#define HGCD_APPR_THRESHOLD                113
+#define HGCD_REDUCE_THRESHOLD             2479
+#define GCD_DC_THRESHOLD                   330
+#define GCDEXT_DC_THRESHOLD                306
 #define JACOBI_BASE_METHOD                   4
 
 #define GET_STR_DC_THRESHOLD                17
diff --git a/mpn/x86_64/core2/gmp-mparam.h b/mpn/x86_64/core2/gmp-mparam.h
index 43adaa078..0752688fd 100644
--- a/mpn/x86_64/core2/gmp-mparam.h
+++ b/mpn/x86_64/core2/gmp-mparam.h
@@ -31,14 +31,15 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MOD_1_2_TO_MOD_1_4_THRESHOLD        16
 #define PREINV_MOD_1_TO_MOD_1_THRESHOLD      7
 #define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
 #define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
 #define BMOD_1_TO_MOD_1_THRESHOLD           26
 
 #define MUL_TOOM22_THRESHOLD                23
 #define MUL_TOOM33_THRESHOLD                65
-#define MUL_TOOM44_THRESHOLD               178
-#define MUL_TOOM6H_THRESHOLD               222
-#define MUL_TOOM8H_THRESHOLD                 0
+#define MUL_TOOM44_THRESHOLD               169
+#define MUL_TOOM6H_THRESHOLD               254
+#define MUL_TOOM8H_THRESHOLD               357
 
 #define MUL_TOOM32_TO_TOOM43_THRESHOLD      69
 #define MUL_TOOM32_TO_TOOM53_THRESHOLD     107
@@ -48,15 +49,17 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_BASECASE_THRESHOLD               0  /* always (native) */
 #define SQR_TOOM2_THRESHOLD                 26
 #define SQR_TOOM3_THRESHOLD                 85
-#define SQR_TOOM4_THRESHOLD                160
-#define SQR_TOOM6_THRESHOLD                218
-#define SQR_TOOM8_THRESHOLD                296
+#define SQR_TOOM4_THRESHOLD                226
+#define SQR_TOOM6_THRESHOLD                  0  /* always */
+#define SQR_TOOM8_THRESHOLD                454
 
 #define MULMID_TOOM42_THRESHOLD             24
 
 #define MULMOD_BNM1_THRESHOLD               15
 #define SQRMOD_BNM1_THRESHOLD               15
 
+#define POWM_SEC_TABLE  2,41,322,840,1100,1556
+
 #define MUL_FFT_MODF_THRESHOLD             380  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
   { {    380, 5}, {     15, 6}, {      8, 5}, {     17, 6}, \
@@ -156,8 +159,8 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_FFT_THRESHOLD                 2752
 
 #define MULLO_BASECASE_THRESHOLD             3
-#define MULLO_DC_THRESHOLD                  20
-#define MULLO_MUL_N_THRESHOLD            10950
+#define MULLO_DC_THRESHOLD                  18
+#define MULLO_MUL_N_THRESHOLD             9174
 
 #define DC_DIV_QR_THRESHOLD                 47
 #define DC_DIVAPPR_Q_THRESHOLD             179
@@ -180,11 +183,13 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 
 #define MATRIX22_STRASSEN_THRESHOLD         18
 #define HGCD_THRESHOLD                     135
+#define HGCD_APPR_THRESHOLD                169
+#define HGCD_REDUCE_THRESHOLD             2121
 #define GCD_DC_THRESHOLD                   330
 #define GCDEXT_DC_THRESHOLD                361
 #define JACOBI_BASE_METHOD                   4
 
 #define GET_STR_DC_THRESHOLD                13
 #define GET_STR_PRECOMPUTE_THRESHOLD        23
-#define SET_STR_DC_THRESHOLD               746
+#define SET_STR_DC_THRESHOLD               552
 #define SET_STR_PRECOMPUTE_THRESHOLD      1893
diff --git a/mpn/x86_64/coreinhm/gmp-mparam.h b/mpn/x86_64/coreinhm/gmp-mparam.h
index eec17787d..90cfa2be4 100644
--- a/mpn/x86_64/coreinhm/gmp-mparam.h
+++ b/mpn/x86_64/coreinhm/gmp-mparam.h
@@ -31,6 +31,7 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MOD_1_2_TO_MOD_1_4_THRESHOLD        15
 #define PREINV_MOD_1_TO_MOD_1_THRESHOLD      7
 #define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
 #define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
 #define BMOD_1_TO_MOD_1_THRESHOLD           17
 
@@ -55,6 +56,8 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MULMOD_BNM1_THRESHOLD               13
 #define SQRMOD_BNM1_THRESHOLD               13
 
+#define POWM_SEC_TABLE  2,65,322,1084
+
 #define MUL_FFT_MODF_THRESHOLD             380  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
   { {    380, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
@@ -112,8 +115,8 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define DC_BDIV_QR_THRESHOLD                32
 #define DC_BDIV_Q_THRESHOLD                 70
 
-#define INV_MULMOD_BNM1_THRESHOLD           46
-#define INV_NEWTON_THRESHOLD               195
+#define INV_MULMOD_BNM1_THRESHOLD           34
+#define INV_NEWTON_THRESHOLD               177
 #define INV_APPR_THRESHOLD                 147
 
 #define BINV_NEWTON_THRESHOLD              252
@@ -126,13 +129,15 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MU_BDIV_QR_THRESHOLD              1120
 #define MU_BDIV_Q_THRESHOLD               1187
 
-#define MATRIX22_STRASSEN_THRESHOLD         17
-#define HGCD_THRESHOLD                     117
-#define GCD_DC_THRESHOLD                   330
-#define GCDEXT_DC_THRESHOLD                382
+#define MATRIX22_STRASSEN_THRESHOLD         15
+#define HGCD_THRESHOLD                     126
+#define HGCD_APPR_THRESHOLD                171
+#define HGCD_REDUCE_THRESHOLD             2205
+#define GCD_DC_THRESHOLD                   345
+#define GCDEXT_DC_THRESHOLD                386
 #define JACOBI_BASE_METHOD                   4
 
-#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_DC_THRESHOLD                15
 #define GET_STR_PRECOMPUTE_THRESHOLD        20
-#define SET_STR_DC_THRESHOLD               552
-#define SET_STR_PRECOMPUTE_THRESHOLD      1655
+#define SET_STR_DC_THRESHOLD               232
+#define SET_STR_PRECOMPUTE_THRESHOLD      1585
diff --git a/mpn/x86_64/coreisbr/gmp-mparam.h b/mpn/x86_64/coreisbr/gmp-mparam.h
index e4727116b..dab35f174 100644
--- a/mpn/x86_64/coreisbr/gmp-mparam.h
+++ b/mpn/x86_64/coreisbr/gmp-mparam.h
@@ -29,8 +29,9 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MOD_1U_TO_MOD_1_1_THRESHOLD          3
 #define MOD_1_1_TO_MOD_1_2_THRESHOLD         9
 #define MOD_1_2_TO_MOD_1_4_THRESHOLD        20
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     10
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      6
 #define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
 #define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
 #define BMOD_1_TO_MOD_1_THRESHOLD           30
 
@@ -52,58 +53,119 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_TOOM6_THRESHOLD                  0
 #define SQR_TOOM8_THRESHOLD                458
 
-#define MULMOD_BNM1_THRESHOLD               11
-#define SQRMOD_BNM1_THRESHOLD               16
+#define MULMOD_BNM1_THRESHOLD               13
+#define SQRMOD_BNM1_THRESHOLD               14
 
-#define MUL_FFT_MODF_THRESHOLD             376  /* k = 5 */
+#define MUL_FFT_MODF_THRESHOLD             380  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
-  { {    376, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
-    {     10, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
-    {     21, 7}, {     11, 6}, {     23, 7}, {     13, 6}, \
-    {     27, 7}, {     21, 8}, {     11, 7}, {     25, 8}, \
+  { {    380, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     11, 5}, {     23, 6}, {     21, 7}, {     11, 6}, \
+    {     23, 7}, {     21, 8}, {     11, 7}, {     24, 8}, \
     {     13, 7}, {     27, 8}, {     15, 7}, {     31, 8}, \
     {     17, 7}, {     35, 8}, {     19, 7}, {     39, 8}, \
     {     21, 9}, {     11, 8}, {     27, 9}, {     15, 8}, \
     {     35, 9}, {     19, 8}, {     41, 9}, {     23, 8}, \
     {     49, 9}, {     27,10}, {     15, 9}, {     39,10}, \
     {     23, 9}, {     51,11}, {     15,10}, {     31, 9}, \
-    {     67,10}, {     39, 9}, {     79,10}, {     47, 9}, \
+    {     67,10}, {     39, 9}, {     83,10}, {     47, 9}, \
     {     95,10}, {     55,11}, {     31,10}, {     79,11}, \
     {     47,10}, {     95,12}, {     31,11}, {     63,10}, \
-    {    135,11}, {     79,10}, {    167,11}, {     95,10}, \
-    {    191, 9}, {    383,12}, {     63,11}, {    127,10}, \
-    {    255, 9}, {    511,10}, {    271,11}, {    143,10}, \
-    {    287, 9}, {    575,11}, {    159,10}, {    319,12}, \
-    {     95,11}, {    191,10}, {    383,11}, {    207,13}, \
-    {   8192,14}, {  16384,15}, {  32768,16}, {  65536,17}, \
+    {    135,11}, {     79,10}, {    159, 9}, {    319,10}, \
+    {    167,11}, {     95,10}, {    191, 9}, {    383,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271,11}, {    143,10}, {    287, 9}, {    575,10}, \
+    {    303,11}, {    159,10}, {    319,12}, {     95,11}, \
+    {    191,10}, {    383,11}, {    207,10}, {    415,13}, \
+    {     63,12}, {    127,11}, {    255,10}, {    511,11}, \
+    {    271,10}, {    543,11}, {    287,10}, {    575,11}, \
+    {    303,10}, {    607,12}, {    159,11}, {    319,10}, \
+    {    639,11}, {    351,10}, {    703, 9}, {   1407,11}, \
+    {    367,12}, {    191,11}, {    383,10}, {    767,11}, \
+    {    415,10}, {    831,12}, {    223,11}, {    447,10}, \
+    {    895,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,12}, {    287,11}, {    575,10}, {   1151,11}, \
+    {    607,12}, {    319,11}, {    639,12}, {    351,11}, \
+    {    703,10}, {   1407,11}, {    735,13}, {    191,12}, \
+    {    383,11}, {    767,12}, {    415,11}, {    831,10}, \
+    {   1663,12}, {    447,11}, {    895,14}, {    127,13}, \
+    {    255,12}, {    511,11}, {   1023,12}, {    543,11}, \
+    {   1087,12}, {    575,11}, {   1151,12}, {    607,11}, \
+    {   1215,13}, {    319,12}, {    639,11}, {   1279,12}, \
+    {    703,11}, {   1407,13}, {    383,12}, {    767,11}, \
+    {   1535,12}, {    831,11}, {   1663,13}, {    447,12}, \
+    {    959,11}, {   1919,14}, {    255,13}, {    511,12}, \
+    {   1087,13}, {    575,12}, {   1215,11}, {   2431,13}, \
+    {    639,12}, {   1279,13}, {    703,12}, {   1407,14}, \
+    {    383,13}, {    831,12}, {   1663,13}, {    959,12}, \
+    {   1919,14}, {    511,13}, {   1087,12}, {   2175,13}, \
+    {   1215,12}, {   2431,14}, {    639,13}, {   1343,12}, \
+    {   2687,13}, {   1407,12}, {   2815,13}, {   1471,14}, \
+    {    767,13}, {   1663,14}, {    895,13}, {   1919,15}, \
+    {    511,14}, {   1023,13}, {   2175,14}, {   1151,13}, \
+    {   2431,12}, {   4863,14}, {   1279,13}, {   2687,14}, \
+    {   1407,13}, {   2815,15}, {    767,14}, {   1663,13}, \
+    {   3455,14}, {   1919,13}, {   3839,16}, {    511,15}, \
+    {   1023,14}, {   2431,13}, {   4863,15}, {   1279,14}, \
+    {   2943,13}, {   5887,15}, {  32768,16}, {  65536,17}, \
     { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
     {2097152,22}, {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 83
-#define MUL_FFT_THRESHOLD                 3712
+#define MUL_FFT_TABLE3_SIZE 203
+#define MUL_FFT_THRESHOLD                 4736
 
-#define SQR_FFT_MODF_THRESHOLD             316  /* k = 5 */
+#define SQR_FFT_MODF_THRESHOLD             304  /* k = 5 */
 #define SQR_FFT_TABLE3                                      \
-  { {    316, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+  { {    304, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
     {     21, 7}, {     11, 6}, {     23, 7}, {     21, 8}, \
-    {     11, 7}, {     25, 8}, {     13, 7}, {     27, 8}, \
+    {     11, 7}, {     24, 8}, {     13, 7}, {     27, 8}, \
     {     15, 7}, {     31, 8}, {     21, 9}, {     11, 8}, \
-    {     27, 9}, {     15, 8}, {     35, 9}, {     19, 8}, \
+    {     27, 9}, {     15, 8}, {     33, 9}, {     19, 8}, \
     {     41, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
-    {     15, 9}, {     39,10}, {     23, 9}, {     51,11}, \
-    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     47,11}, \
+    {     15,10}, {     31, 9}, {     63,10}, {     39, 9}, \
     {     79,10}, {     47,11}, {     31,10}, {     79,11}, \
-    {     47,10}, {     95,12}, {     31,11}, {     63,10}, \
-    {    127, 9}, {    255,11}, {     79,10}, {    159, 9}, \
-    {    319,11}, {     95,10}, {    191, 9}, {    383,12}, \
-    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
-    {    271, 9}, {    543,11}, {    143,10}, {    287, 9}, \
-    {    575,10}, {    303,11}, {    159,10}, {    319, 9}, \
-    {    639,12}, {     95,11}, {    191,10}, {    383,11}, \
-    {    207,13}, {   8192,14}, {  16384,15}, {  32768,16}, \
-    {  65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
-    {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 76
-#define SQR_FFT_THRESHOLD                 3264
+    {     47,12}, {     31,11}, {     63,10}, {    127, 9}, \
+    {    255, 8}, {    511,10}, {    135,11}, {     79,10}, \
+    {    159, 9}, {    319,11}, {     95,10}, {    191, 9}, \
+    {    383,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,11}, {    143,10}, \
+    {    287, 9}, {    575,11}, {    159,10}, {    319, 9}, \
+    {    639,12}, {     95,11}, {    191,10}, {    383, 9}, \
+    {    767,11}, {    207,13}, {     63,12}, {    127,11}, \
+    {    255,10}, {    511,11}, {    271,10}, {    543,11}, \
+    {    287,10}, {    575,11}, {    303,12}, {    159,11}, \
+    {    319,10}, {    639,11}, {    351,10}, {    703,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,10}, \
+    {    831,12}, {    223,11}, {    447,10}, {    895,11}, \
+    {    479,10}, {    959,13}, {    127,12}, {    255,11}, \
+    {    511,10}, {   1023,11}, {    543,12}, {    287,11}, \
+    {    575,10}, {   1151,11}, {    607,12}, {    319,11}, \
+    {    639,10}, {   1279,12}, {    351,11}, {    703,13}, \
+    {    191,12}, {    383,11}, {    767,12}, {    415,11}, \
+    {    831,12}, {    447,11}, {    895,12}, {    479,11}, \
+    {    959,10}, {   1919,14}, {    127,13}, {    255,12}, \
+    {    511,11}, {   1023,12}, {    543,11}, {   1087,12}, \
+    {    575,11}, {   1151,12}, {    607,13}, {    319,12}, \
+    {    639,11}, {   1279,12}, {    703,11}, {   1407,13}, \
+    {    383,12}, {    767,11}, {   1535,12}, {    831,13}, \
+    {    447,12}, {    959,11}, {   1919,14}, {    255,13}, \
+    {    511,12}, {   1087,13}, {    575,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1279,13}, {    703,12}, \
+    {   1407,14}, {    383,13}, {    767,12}, {   1535,13}, \
+    {    831,12}, {   1663,13}, {    959,12}, {   1919,15}, \
+    {    255,14}, {    511,13}, {   1087,12}, {   2175,13}, \
+    {   1215,12}, {   2431,14}, {    639,13}, {   1343,12}, \
+    {   2687,13}, {   1407,12}, {   2815,13}, {   1471,14}, \
+    {    767,13}, {   1663,14}, {    895,13}, {   1919,15}, \
+    {    511,14}, {   1023,13}, {   2175,14}, {   1151,13}, \
+    {   2431,12}, {   4863,14}, {   1279,13}, {   2687,14}, \
+    {   1407,13}, {   2815,15}, {    767,14}, {   1663,13}, \
+    {   3455,14}, {   1919,16}, {    511,15}, {   1023,14}, \
+    {   2431,13}, {   4863,15}, {   1279,14}, {   2943,13}, \
+    {   5887,15}, {  32768,16}, {  65536,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 198
+#define SQR_FFT_THRESHOLD                 2752
 
 #define MULLO_BASECASE_THRESHOLD             5
 #define MULLO_DC_THRESHOLD                  33
diff --git a/mpn/x86_64/gmp-mparam.h b/mpn/x86_64/gmp-mparam.h
index 99499da2b..b16ff5a6b 100644
--- a/mpn/x86_64/gmp-mparam.h
+++ b/mpn/x86_64/gmp-mparam.h
@@ -30,6 +30,7 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MOD_1_2_TO_MOD_1_4_THRESHOLD        28
 #define PREINV_MOD_1_TO_MOD_1_THRESHOLD      7
 #define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
 #define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
 #define BMOD_1_TO_MOD_1_THRESHOLD           15
 
@@ -56,6 +57,8 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MULMOD_BNM1_THRESHOLD               17
 #define SQRMOD_BNM1_THRESHOLD               17
 
+#define POWM_SEC_TABLE  2,67,322,991
+
 #define MUL_FFT_MODF_THRESHOLD             570  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
   { {    570, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
@@ -187,10 +190,12 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MU_BDIV_QR_THRESHOLD              1589
 #define MU_BDIV_Q_THRESHOLD               1718
 
-#define MATRIX22_STRASSEN_THRESHOLD         17
-#define HGCD_THRESHOLD                     139
-#define GCD_DC_THRESHOLD                   606
-#define GCDEXT_DC_THRESHOLD                474
+#define MATRIX22_STRASSEN_THRESHOLD         16
+#define HGCD_THRESHOLD                     125
+#define HGCD_APPR_THRESHOLD                 50
+#define HGCD_REDUCE_THRESHOLD             3524
+#define GCD_DC_THRESHOLD                   555
+#define GCDEXT_DC_THRESHOLD                478
 #define JACOBI_BASE_METHOD                   4
 
 #define GET_STR_DC_THRESHOLD                12
diff --git a/mpn/x86_64/nano/gmp-mparam.h b/mpn/x86_64/nano/gmp-mparam.h
index a1c556937..7ee41927b 100644
--- a/mpn/x86_64/nano/gmp-mparam.h
+++ b/mpn/x86_64/nano/gmp-mparam.h
@@ -34,6 +34,7 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MOD_1_2_TO_MOD_1_4_THRESHOLD         0
 #define PREINV_MOD_1_TO_MOD_1_THRESHOLD      8
 #define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
 #define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
 #define BMOD_1_TO_MOD_1_THRESHOLD           22
 
@@ -50,13 +51,17 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 
 #define SQR_BASECASE_THRESHOLD               0  /* always (native) */
 #define SQR_TOOM2_THRESHOLD                 34
-#define SQR_TOOM3_THRESHOLD                 74
-#define SQR_TOOM4_THRESHOLD                620
-#define SQR_TOOM6_THRESHOLD                960
-#define SQR_TOOM8_THRESHOLD               1065
+#define SQR_TOOM3_THRESHOLD                 97
+#define SQR_TOOM4_THRESHOLD                592
+#define SQR_TOOM6_THRESHOLD                978
+#define SQR_TOOM8_THRESHOLD               1193
 
-#define MULMOD_BNM1_THRESHOLD               15
-#define SQRMOD_BNM1_THRESHOLD               17
+#define MULMID_TOOM42_THRESHOLD             28
+
+#define MULMOD_BNM1_THRESHOLD               16
+#define SQRMOD_BNM1_THRESHOLD               20
+
+#define POWM_SEC_TABLE  2,29,387,1421
 
 #define MUL_FFT_MODF_THRESHOLD             376  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
@@ -176,7 +181,7 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_FFT_TABLE3_SIZE 215
 #define SQR_FFT_THRESHOLD                 3264
 
-#define MULLO_BASECASE_THRESHOLD            17
+#define MULLO_BASECASE_THRESHOLD             8
 #define MULLO_DC_THRESHOLD                   0  /* never mpn_mullo_basecase */
 #define MULLO_MUL_N_THRESHOLD             6633
 
@@ -190,7 +195,7 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define INV_APPR_THRESHOLD                 153
 
 #define BINV_NEWTON_THRESHOLD              182
-#define REDC_1_TO_REDC_2_THRESHOLD          14
+#define REDC_1_TO_REDC_2_THRESHOLD          20
 #define REDC_2_TO_REDC_N_THRESHOLD          75
 
 #define MU_DIV_QR_THRESHOLD               1589
@@ -200,12 +205,14 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MU_BDIV_Q_THRESHOLD               1528
 
 #define MATRIX22_STRASSEN_THRESHOLD         17
-#define HGCD_THRESHOLD                      84
-#define GCD_DC_THRESHOLD                   465
-#define GCDEXT_DC_THRESHOLD                456
+#define HGCD_THRESHOLD                     102
+#define HGCD_APPR_THRESHOLD                113
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   706
+#define GCDEXT_DC_THRESHOLD                465
 #define JACOBI_BASE_METHOD                   4
 
 #define GET_STR_DC_THRESHOLD                12
 #define GET_STR_PRECOMPUTE_THRESHOLD        24
-#define SET_STR_DC_THRESHOLD               537
-#define SET_STR_PRECOMPUTE_THRESHOLD      1639
+#define SET_STR_DC_THRESHOLD               381
+#define SET_STR_PRECOMPUTE_THRESHOLD      1794
diff --git a/mpn/x86_64/pentium4/gmp-mparam.h b/mpn/x86_64/pentium4/gmp-mparam.h
index 8983304c2..4d49fc2cf 100644
--- a/mpn/x86_64/pentium4/gmp-mparam.h
+++ b/mpn/x86_64/pentium4/gmp-mparam.h
@@ -33,34 +33,39 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MOD_1_NORM_THRESHOLD                 0  /* always */
 #define MOD_1_UNNORM_THRESHOLD               0  /* always */
 #define MOD_1N_TO_MOD_1_1_THRESHOLD          4
-#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD        14
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD        32
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        15
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        38
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      8
 #define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
 #define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
 #define BMOD_1_TO_MOD_1_THRESHOLD           20
 
 #define MUL_TOOM22_THRESHOLD                12
-#define MUL_TOOM33_THRESHOLD                66
+#define MUL_TOOM33_THRESHOLD                74
 #define MUL_TOOM44_THRESHOLD               118
 #define MUL_TOOM6H_THRESHOLD               157
-#define MUL_TOOM8H_THRESHOLD               242
+#define MUL_TOOM8H_THRESHOLD               430
 
 #define MUL_TOOM32_TO_TOOM43_THRESHOLD      81
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD      91
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD      81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     138
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     145
 #define MUL_TOOM42_TO_TOOM63_THRESHOLD      80
 
 #define SQR_BASECASE_THRESHOLD               0  /* always (native) */
 #define SQR_TOOM2_THRESHOLD                 20
-#define SQR_TOOM3_THRESHOLD                 77
-#define SQR_TOOM4_THRESHOLD                214
+#define SQR_TOOM3_THRESHOLD                 69
+#define SQR_TOOM4_THRESHOLD                202
 #define SQR_TOOM6_THRESHOLD                254
-#define SQR_TOOM8_THRESHOLD                454
+#define SQR_TOOM8_THRESHOLD                418
+
+#define MULMID_TOOM42_THRESHOLD             19
 
 #define MULMOD_BNM1_THRESHOLD               10
-#define SQRMOD_BNM1_THRESHOLD               11
+#define SQRMOD_BNM1_THRESHOLD                9
+
+#define POWM_SEC_TABLE  3,130,140,724,2316
 
 #define MUL_FFT_MODF_THRESHOLD             236  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
@@ -121,11 +126,11 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 
 #define MULLO_BASECASE_THRESHOLD             0  /* always */
 #define MULLO_DC_THRESHOLD                  32
-#define MULLO_MUL_N_THRESHOLD             5397
+#define MULLO_MUL_N_THRESHOLD             6253
 
-#define DC_DIV_QR_THRESHOLD                 28
-#define DC_DIVAPPR_Q_THRESHOLD              67
-#define DC_BDIV_QR_THRESHOLD                27
+#define DC_DIV_QR_THRESHOLD                 32
+#define DC_DIVAPPR_Q_THRESHOLD              60
+#define DC_BDIV_QR_THRESHOLD                26
 #define DC_BDIV_Q_THRESHOLD                 49
 
 #define INV_MULMOD_BNM1_THRESHOLD           22
@@ -133,8 +138,8 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define INV_APPR_THRESHOLD                 101
 
 #define BINV_NEWTON_THRESHOLD              199
-#define REDC_1_TO_REDC_2_THRESHOLD          13
-#define REDC_2_TO_REDC_N_THRESHOLD          44
+#define REDC_1_TO_REDC_2_THRESHOLD          23
+#define REDC_2_TO_REDC_N_THRESHOLD          42
 
 #define MU_DIV_QR_THRESHOLD                979
 #define MU_DIVAPPR_Q_THRESHOLD             979
@@ -143,12 +148,14 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MU_BDIV_Q_THRESHOLD                979
 
 #define MATRIX22_STRASSEN_THRESHOLD         17
-#define HGCD_THRESHOLD                     101
-#define GCD_DC_THRESHOLD                   222
-#define GCDEXT_DC_THRESHOLD                222
+#define HGCD_THRESHOLD                      99
+#define HGCD_APPR_THRESHOLD                117
+#define HGCD_REDUCE_THRESHOLD             1679
+#define GCD_DC_THRESHOLD                   198
+#define GCDEXT_DC_THRESHOLD                233
 #define JACOBI_BASE_METHOD                   4
 
 #define GET_STR_DC_THRESHOLD                12
 #define GET_STR_PRECOMPUTE_THRESHOLD        26
-#define SET_STR_DC_THRESHOLD               248
+#define SET_STR_DC_THRESHOLD               422
 #define SET_STR_PRECOMPUTE_THRESHOLD      1438
-- 
cgit v1.2.1


From a7466d9e0e147ffcb964e987d207562306da48b5 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Sun, 20 Nov 2011 21:47:49 +0100
Subject: Configure improvements powerpc64 with abi=32.

---
 ChangeLog    |  7 +++++++
 configure.in | 14 ++++++++------
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 658930906..2e4b53904 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2011-11-20  Torbjorn Granlund  <tege@gmplib.org>
+
+	* configure.in: Pass -m32 for powerpc64 with abi=32, using via _maybe
+	mechanism.
+
+	* configure.in: Support powerpc32/p3-p7 directory for affected CPUs.
+
 2011-11-17  Torbjorn Granlund  <tege@gmplib.org>
 
 	* tune/speed.c (routine): Add mpn_tabselect.
diff --git a/configure.in b/configure.in
index 887975c40..6427ec3dd 100644
--- a/configure.in
+++ b/configure.in
@@ -919,7 +919,7 @@ case $host in
       powerpc620)   gcc_cflags_cpu="-mcpu=620" ;;
       powerpc630)   gcc_cflags_cpu="-mcpu=630"
 		    xlc_cflags_arch="-qarch=pwr3"
-		    cpu_path="p3" ;;
+		    cpu_path="p3 p3-p7" ;;
       powerpc740)   gcc_cflags_cpu="-mcpu=740" ;;
       powerpc7400 | powerpc7410)
 		    gcc_cflags_asm="-Wa,-maltivec"
@@ -935,19 +935,19 @@ case $host in
       powerpc970)   gcc_cflags_cpu="-mtune=970"
 		    xlc_cflags_arch="-qarch=970 -qarch=pwr3"
 		    vmx_path="powerpc64/vmx"
-		    cpu_path="p4" ;;
+		    cpu_path="p4 p3-p7" ;;
       power4)	    gcc_cflags_cpu="-mtune=power4"
 		    xlc_cflags_arch="-qarch=pwr4"
-		    cpu_path="p4" ;;
+		    cpu_path="p4 p3-p7" ;;
       power5)	    gcc_cflags_cpu="-mtune=power5 -mtune=power4"
 		    xlc_cflags_arch="-qarch=pwr5"
-		    cpu_path="p5 p4" ;;
+		    cpu_path="p5 p4 p3-p7" ;;
       power6)	    gcc_cflags_cpu="-mtune=power6"
 		    xlc_cflags_arch="-qarch=pwr6"
-		    cpu_path="p6" ;;
+		    cpu_path="p6 p3-p7" ;;
       power7)	    gcc_cflags_cpu="-mtune=power7 -mtune=power5"
 		    xlc_cflags_arch="-qarch=pwr7 -qarch=pwr5"
-		    cpu_path="p7 p5 p4" ;;
+		    cpu_path="p7 p5 p4 p3-p7" ;;
     esac
 
     case $host in
@@ -1012,6 +1012,7 @@ case $host in
 	    # incompatible with a shared library.
 	    #
 	    abilist="mode64 mode32 $abilist"
+	    gcc_32_cflags_maybe="-m32"
 	    gcc_cflags_opt="-O3 -O2 -O1"	# will this become used?
 	    cclist_mode32="gcc"
 	    gcc_mode32_cflags_maybe="-m32"
@@ -1057,6 +1058,7 @@ case $host in
 	    # 64-bits.
 	    #
 	    abilist="mode64 mode32 $abilist"
+	    gcc_32_cflags_maybe="-m32"
 	    cclist_mode32="gcc"
 	    gcc_mode32_cflags_maybe="-m32"
 	    gcc_mode32_cflags="-mpowerpc64"
-- 
cgit v1.2.1


From 952803d3c43dcacfbd001d5fed37b32316b529dd Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Sun, 20 Nov 2011 21:49:10 +0100
Subject: Provide special powerpc64 add_n/sub_n abi=32 code.

---
 ChangeLog                      |   2 +
 mpn/powerpc32/p3-p7/aors_n.asm | 176 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 178 insertions(+)
 create mode 100644 mpn/powerpc32/p3-p7/aors_n.asm

diff --git a/ChangeLog b/ChangeLog
index 2e4b53904..420ae5f4e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,7 @@
 2011-11-20  Torbjorn Granlund  <tege@gmplib.org>
 
+	* mpn/powerpc32/p3-p7/aors_n.asm: New file.
+
 	* configure.in: Pass -m32 for powerpc64 with abi=32, using via _maybe
 	mechanism.
 
diff --git a/mpn/powerpc32/p3-p7/aors_n.asm b/mpn/powerpc32/p3-p7/aors_n.asm
new file mode 100644
index 000000000..6999182a8
--- /dev/null
+++ b/mpn/powerpc32/p3-p7/aors_n.asm
@@ -0,0 +1,176 @@
+dnl  PowerPC-32 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+
+dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007, 2011 Free Software
+dnl  Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb
+C POWER3/PPC630          1.5
+C POWER4/PPC970          2
+C POWER5                 2
+C POWER6                 2.78
+C POWER7               2.15-2.87
+
+C This code is based on powerpc64/aors_n.asm.
+
+C INPUT PARAMETERS
+C rp	r3
+C up	r4
+C vp	r5
+C n	r6
+
+ifdef(`OPERATION_add_n',`
+  define(ADDSUBC,	adde)
+  define(ADDSUB,	addc)
+  define(func,		mpn_add_n)
+  define(func_nc,	mpn_add_nc)
+  define(GENRVAL,	`addi	r3, r3, 1')
+  define(SETCBR,	`addic	r0, $1, -1')
+  define(CLRCB,		`addic	r0, r0, 0')
+')
+ifdef(`OPERATION_sub_n',`
+  define(ADDSUBC,	subfe)
+  define(ADDSUB,	subfc)
+  define(func,		mpn_sub_n)
+  define(func_nc,	mpn_sub_nc)
+  define(GENRVAL,	`neg	r3, r3')
+  define(SETCBR,	`subfic	r0, $1, 0')
+  define(CLRCB,		`addic	r0, r1, -1')
+')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+PROLOGUE(func_nc)
+	SETCBR(r7)		C CA = incoming carry, resp. borrow, taken from r7
+	b	L(ent)		C join the shared body at its entry label
+EPILOGUE()
+
+PROLOGUE(func)
+	CLRCB			C CA = no carry-in, resp. no borrow-in
+L(ent):	stw	r31, -4(r1)	C save r28-r31 at negative offsets from r1
+	stw	r30, -8(r1)
+	stw	r29, -12(r1)
+	stw	r28, -16(r1)
+
+	rlwinm.	r0, r6, 0,30,31	C r0 = n & 3, set cr0
+	cmpwi	cr6, r0, 2	C cr6 = n mod 4 compared with 2
+	addi	r6, r6, 3	C compute count...
+	srwi	r6, r6, 2	C ...for ctr
+	mtctr	r6		C copy count into ctr
+	beq	cr0, L(b00)	C n mod 4 = 0
+	blt	cr6, L(b01)	C n mod 4 = 1
+	beq	cr6, L(b10)	C n mod 4 = 2
+
+L(b11):	lwz	r8, 0(r4)	C load s1 limb
+	lwz	r9, 0(r5)	C load s2 limb
+	lwz	r10, 4(r4)	C load s1 limb
+	lwz	r11, 4(r5)	C load s2 limb
+	lwz	r12, 8(r4)	C load s1 limb
+	addi	r4, r4, 12
+	lwz	r0, 8(r5)	C load s2 limb
+	addi	r5, r5, 12
+	ADDSUBC	r29, r9, r8	C 3 leading limbs, carry chained through CA
+	ADDSUBC	r30, r11, r10
+	ADDSUBC	r31, r0, r12
+	stw	r29, 0(r3)
+	stw	r30, 4(r3)
+	stw	r31, 8(r3)
+	addi	r3, r3, 12
+	bdnz	L(go)		C more limbs remain: continue in groups of 4
+	b	L(ret)		C n was 3
+
+L(b01):	lwz	r12, 0(r4)	C load s1 limb
+	addi	r4, r4, 4
+	lwz	r0, 0(r5)	C load s2 limb
+	addi	r5, r5, 4
+	ADDSUBC	r31, r0, r12	C add
+	stw	r31, 0(r3)
+	addi	r3, r3, 4
+	bdnz	L(go)		C more limbs remain: continue in groups of 4
+	b	L(ret)		C n was 1
+
+L(b10):	lwz	r10, 0(r4)	C load s1 limb
+	lwz	r11, 0(r5)	C load s2 limb
+	lwz	r12, 4(r4)	C load s1 limb
+	addi	r4, r4, 8
+	lwz	r0, 4(r5)	C load s2 limb
+	addi	r5, r5, 8
+	ADDSUBC	r30, r11, r10	C add
+	ADDSUBC	r31, r0, r12	C add
+	stw	r30, 0(r3)
+	stw	r31, 4(r3)
+	addi	r3, r3, 8
+	bdnz	L(go)		C more limbs remain: continue in groups of 4
+	b	L(ret)		C n was 2
+
+L(b00):	C n mod 4 = 0, CA was already set up at entry
+L(go):	lwz	r6, 0(r4)	C load s1 limb
+	lwz	r7, 0(r5)	C load s2 limb
+	lwz	r8, 4(r4)	C load s1 limb
+	lwz	r9, 4(r5)	C load s2 limb
+	lwz	r10, 8(r4)	C load s1 limb
+	lwz	r11, 8(r5)	C load s2 limb
+	lwz	r12, 12(r4)	C load s1 limb
+	lwz	r0, 12(r5)	C load s2 limb
+	bdz	L(end)		C last group of 4: nothing further to load
+
+	addi	r4, r4, 16
+	addi	r5, r5, 16
+
+	ALIGN(16)
+L(top):	ADDSUBC	r28, r7, r6	C operate on the limbs loaded earlier
+	lwz	r6, 0(r4)	C load s1 limb
+	lwz	r7, 0(r5)	C load s2 limb
+	ADDSUBC	r29, r9, r8
+	lwz	r8, 4(r4)	C load s1 limb
+	lwz	r9, 4(r5)	C load s2 limb
+	ADDSUBC	r30, r11, r10
+	lwz	r10, 8(r4)	C load s1 limb
+	lwz	r11, 8(r5)	C load s2 limb
+	ADDSUBC	r31, r0, r12
+	lwz	r12, 12(r4)	C load s1 limb
+	lwz	r0, 12(r5)	C load s2 limb
+	stw	r28, 0(r3)
+	addi	r4, r4, 16
+	stw	r29, 4(r3)
+	addi	r5, r5, 16
+	stw	r30, 8(r3)
+	stw	r31, 12(r3)
+	addi	r3, r3, 16
+	bdnz	L(top)		C decrement ctr and loop back
+
+L(end):	ADDSUBC	r28, r7, r6	C final 4 limbs, already in registers
+	ADDSUBC	r29, r9, r8
+	ADDSUBC	r30, r11, r10
+	ADDSUBC	r31, r0, r12
+	stw	r28, 0(r3)
+	stw	r29, 4(r3)
+	stw	r30, 8(r3)
+	stw	r31, 12(r3)
+
+L(ret):	lwz	r31, -4(r1)	C restore r28-r31
+	lwz	r30, -8(r1)
+	lwz	r29, -12(r1)
+	lwz	r28, -16(r1)
+
+	subfe	r3, r0, r0	C r3 = CA - 1, i.e. 0 or -1
+	GENRVAL			C r3 = carry-out, resp. borrow-out
+	blr
+EPILOGUE()
-- 
cgit v1.2.1


From 1c9f3475308f9c3ae0b811566c4c88650128b772 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Sun, 20 Nov 2011 22:55:07 +0100
Subject: Split x86 CPUs into more subtypes for more accurate passing of gcc
 flags.

---
 ChangeLog    |  3 +++
 configure.in | 20 ++++++++++++++++++--
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 420ae5f4e..bca740a7f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,8 @@
 2011-11-20  Torbjorn Granlund  <tege@gmplib.org>
 
+	* configure.in: Split x86 CPUs into more subtypes for more accurate
+	passing of gcc flags.
+
 	* mpn/powerpc32/p3-p7/aors_n.asm: New file.
 
 	* configure.in: Pass -m32 for powerpc64 with abi=32, using via _maybe
diff --git a/configure.in b/configure.in
index 6427ec3dd..186d4b576 100644
--- a/configure.in
+++ b/configure.in
@@ -1536,14 +1536,30 @@ case $host in
         gcc_cflags_cpu="-mtune=c3 -mcpu=c3 -mcpu=i486 -m486"
         gcc_cflags_arch="-march=c3 -march=pentium-mmx -march=pentium"
         ;;
-      athlon64 | k8 | k10 | bobcat | bulldozer | x86_64)
+      athlon64 | k8 | x86_64)
         gcc_cflags_cpu="-mtune=k8 -mcpu=athlon -mcpu=pentiumpro -mcpu=i486 -m486"
         gcc_cflags_arch="-march=k8 -march=k8~-mno-sse2 -march=athlon -march=pentiumpro -march=pentium"
         ;;
-      core2 | corei | coreinhm | coreiwsm | coreisbr)
+      k10)
+        gcc_cflags_cpu="-mtune=amdfam10 -mtune=k8"
+        gcc_cflags_arch="-march=amdfam10 -march=k8 -march=k8~-mno-sse2"
+        ;;
+      bobcat)
+        gcc_cflags_cpu="-mtune=btver1 -mtune=amdfam10 -mtune=k8"
+        gcc_cflags_arch="-march=btver1 -march=amdfam10 -march=k8 -march=k8~-mno-sse2"
+        ;;
+      bulldozer)
+        gcc_cflags_cpu="-mtune=bdver1 -mtune=amdfam10 -mtune=k8"
+        gcc_cflags_arch="-march=bdver1 -march=amdfam10 -march=k8 -march=k8~-mno-sse2"
+        ;;
+      core2)
         gcc_cflags_cpu="-mtune=core2 -mtune=k8"
         gcc_cflags_arch="-march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2"
         ;;
+      corei | coreinhm | coreiwsm | coreisbr)
+        gcc_cflags_cpu="-mtune=corei7 -mtune=core2 -mtune=k8"
+        gcc_cflags_arch="-march=corei7 -march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2"
+        ;;
       atom)
         gcc_cflags_cpu="-mtune=atom -mtune=pentium3"
         gcc_cflags_arch="-march=atom -march=pentium3"
-- 
cgit v1.2.1


From cda511a97523ac223432c6767e11b40e95e157e4 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Sun, 20 Nov 2011 22:56:02 +0100
Subject: Retune.

---
 mpn/x86_64/coreisbr/gmp-mparam.h | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/mpn/x86_64/coreisbr/gmp-mparam.h b/mpn/x86_64/coreisbr/gmp-mparam.h
index dab35f174..c30c64ec8 100644
--- a/mpn/x86_64/coreisbr/gmp-mparam.h
+++ b/mpn/x86_64/coreisbr/gmp-mparam.h
@@ -53,9 +53,13 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_TOOM6_THRESHOLD                  0
 #define SQR_TOOM8_THRESHOLD                458
 
-#define MULMOD_BNM1_THRESHOLD               13
+#define MULMID_TOOM42_THRESHOLD             24
+
+#define MULMOD_BNM1_THRESHOLD               14
 #define SQRMOD_BNM1_THRESHOLD               14
 
+#define POWM_SEC_TABLE  4,35,130,713,2080
+
 #define MUL_FFT_MODF_THRESHOLD             380  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
   { {    380, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
@@ -176,27 +180,29 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define DC_BDIV_QR_THRESHOLD                31
 #define DC_BDIV_Q_THRESHOLD                 71
 
-#define INV_MULMOD_BNM1_THRESHOLD           38
-#define INV_NEWTON_THRESHOLD               127
-#define INV_APPR_THRESHOLD                 123
+#define INV_MULMOD_BNM1_THRESHOLD           50
+#define INV_NEWTON_THRESHOLD               123
+#define INV_APPR_THRESHOLD                 122
 
-#define BINV_NEWTON_THRESHOLD              181
-#define REDC_1_TO_REDC_2_THRESHOLD          17
-#define REDC_2_TO_REDC_N_THRESHOLD          51
+#define BINV_NEWTON_THRESHOLD              197
+#define REDC_1_TO_REDC_2_THRESHOLD          20
+#define REDC_2_TO_REDC_N_THRESHOLD          54
 
 #define MU_DIV_QR_THRESHOLD               1334
 #define MU_DIVAPPR_Q_THRESHOLD            1387
-#define MUPI_DIV_QR_THRESHOLD               57
+#define MUPI_DIV_QR_THRESHOLD               46
 #define MU_BDIV_QR_THRESHOLD              1142
 #define MU_BDIV_Q_THRESHOLD               1308
 
 #define MATRIX22_STRASSEN_THRESHOLD         15
-#define HGCD_THRESHOLD                      90
-#define GCD_DC_THRESHOLD                   400
-#define GCDEXT_DC_THRESHOLD                372
+#define HGCD_THRESHOLD                      91
+#define HGCD_APPR_THRESHOLD                105
+#define HGCD_REDUCE_THRESHOLD             2681
+#define GCD_DC_THRESHOLD                   358
+#define GCDEXT_DC_THRESHOLD                351
 #define JACOBI_BASE_METHOD                   4
 
-#define GET_STR_DC_THRESHOLD                12
-#define GET_STR_PRECOMPUTE_THRESHOLD        21
-#define SET_STR_DC_THRESHOLD               802
-#define SET_STR_PRECOMPUTE_THRESHOLD      1712
+#define GET_STR_DC_THRESHOLD                14
+#define GET_STR_PRECOMPUTE_THRESHOLD        27
+#define SET_STR_DC_THRESHOLD               781
+#define SET_STR_PRECOMPUTE_THRESHOLD      1940
-- 
cgit v1.2.1


From 15a7619b6229dea0d8d895aaa5506e40304dcb3f Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Mon, 21 Nov 2011 21:03:39 +0100
Subject: (__GNU_MP_RELEASE): Renamed from typo name.

---
 ChangeLog | 4 ++++
 gmp-h.in  | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/ChangeLog b/ChangeLog
index bca740a7f..4d031a239 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2011-11-21  Torbjorn Granlund  <tege@gmplib.org>
+
+	* gmp-h.in (__GNU_MP_RELEASE): Renamed from typo name.
+
 2011-11-20  Torbjorn Granlund  <tege@gmplib.org>
 
 	* configure.in: Split x86 CPUs into more subtypes for more accurate
diff --git a/gmp-h.in b/gmp-h.in
index 7d6b22926..ba732f5e3 100644
--- a/gmp-h.in
+++ b/gmp-h.in
@@ -2275,7 +2275,7 @@ enum
 #define __GNU_MP_VERSION 5
 #define __GNU_MP_VERSION_MINOR 0
 #define __GNU_MP_VERSION_PATCHLEVEL 90
-#define __GMP_MP_RELEASE (__GNU_MP_VERSION * 10000 + __GNU_MP_VERSION_MINOR * 100 + __GNU_MP_VERSION_PATCHLEVEL)
+#define __GNU_MP_RELEASE (__GNU_MP_VERSION * 10000 + __GNU_MP_VERSION_MINOR * 100 + __GNU_MP_VERSION_PATCHLEVEL)
 
 #define __GMP_H__
 #endif /* __GMP_H__ */
-- 
cgit v1.2.1


From f24a8deaf598267ea9c57ba93e9e6a94038bc8f3 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Mon, 21 Nov 2011 21:15:18 +0100
Subject: Spacing cleanup.

---
 AUTHORS                            |  2 +-
 gmp-h.in                           |  2 +-
 gmpxx.h                            |  2 +-
 mpn/generic/gcd_subdiv_step.c      |  2 +-
 mpn/generic/hgcd_appr.c            |  6 +++---
 mpn/generic/hgcd_jacobi.c          |  4 ++--
 mpn/generic/hgcd_reduce.c          | 14 ++++++-------
 mpn/generic/hgcd_step.c            |  2 +-
 mpn/powerpc64/mode64/aorsmul_1.asm |  2 +-
 mpn/s390_32/lshift.asm             |  2 +-
 mpn/s390_32/lshiftc.asm            |  2 +-
 mpn/s390_32/rshift.asm             |  2 +-
 mpn/x86/atom/lshift.asm            |  4 ++--
 mpn/x86/atom/sse2/mul_1.asm        |  2 +-
 mpn/x86/bdiv_dbm1c.asm             |  4 ++--
 mpn/x86/bdiv_q_1.asm               |  2 +-
 mpn/x86/k7/addlsh1_n.asm           |  6 +++---
 mpn/x86/k7/invert_limb.asm         |  2 +-
 mpn/x86/k7/sublsh1_n.asm           |  8 ++++----
 mpn/x86/p6/bdiv_q_1.asm            |  4 ++--
 mpn/x86/pentium/bdiv_q_1.asm       |  2 +-
 mpn/x86_64/div_qr_2n_pi1.asm       |  6 +++---
 mpn/x86_64/div_qr_2u_pi1.asm       |  6 +++---
 mpn/x86_64/mod_1_1.asm             |  4 ++--
 mpz/jacobi.c                       |  8 ++++----
 tests/cxx/t-ops2.cc                | 40 +++++++++++++++++++-------------------
 tests/devel/try.c                  |  2 +-
 tests/mpn/t-hgcd_appr.c            | 14 ++++++-------
 tests/mpn/t-mod_1.c                |  2 +-
 tests/mpn/t-mulmid.c               |  2 +-
 tests/mpz/t-jac.c                  |  4 ++--
 tune/tune-gcd-p.c                  |  4 ++--
 tune/tuneup.c                      |  2 +-
 33 files changed, 85 insertions(+), 85 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index 170c766e1..f399ce345 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -58,5 +58,5 @@ David Harvey		mpn/generic/add_err1_n.c, add_err2_n.c,
       			aors_err2_n.asm, aors_err3_n.asm,
 			mulmid_basecase.asm,
 			mpn/x86_64/core2/aors_err1_n.asm.
-			
+
 Martin Boij		mpn/generic/perfpow.c
diff --git a/gmp-h.in b/gmp-h.in
index ba732f5e3..fa3438041 100644
--- a/gmp-h.in
+++ b/gmp-h.in
@@ -1535,7 +1535,7 @@ __GMP_DECLSPEC mp_limb_t mpn_divrem_2 __GMP_PROTO ((mp_ptr, mp_size_t, mp_ptr, m
 
 #define mpn_div_qr_2 __MPN(div_qr_2)
 __GMP_DECLSPEC mp_limb_t mpn_div_qr_2 __GMP_PROTO ((mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr));
-  
+
 #define mpn_gcd __MPN(gcd)
 __GMP_DECLSPEC mp_size_t mpn_gcd __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
 
diff --git a/gmpxx.h b/gmpxx.h
index e7ef16266..fb4865466 100644
--- a/gmpxx.h
+++ b/gmpxx.h
@@ -616,7 +616,7 @@ struct __gmp_binary_divides
     }
     else
 #endif
-      mpz_tdiv_q_ui(z, w, l); 
+      mpz_tdiv_q_ui(z, w, l);
   }
   static void eval(mpz_ptr z, unsigned long int l, mpz_srcptr w)
   {
diff --git a/mpn/generic/gcd_subdiv_step.c b/mpn/generic/gcd_subdiv_step.c
index 11c00bb6a..3db34073c 100644
--- a/mpn/generic/gcd_subdiv_step.c
+++ b/mpn/generic/gcd_subdiv_step.c
@@ -185,7 +185,7 @@ mpn_gcd_subdiv_step (mp_ptr ap, mp_ptr bp, mp_size_t n, mp_size_t s,
 	}
       else
 	MPN_COPY (bp, ap, an);
-      
+
       MPN_DECR_U (tp, qn, 1);
     }
 
diff --git a/mpn/generic/hgcd_appr.c b/mpn/generic/hgcd_appr.c
index 8454f9da5..f7c7eb2c9 100644
--- a/mpn/generic/hgcd_appr.c
+++ b/mpn/generic/hgcd_appr.c
@@ -72,7 +72,7 @@ mpn_hgcd_appr (mp_ptr ap, mp_ptr bp, mp_size_t n,
      we discard some of the least significant limbs, we must keep one
      additional bit to account for the truncation error. We maintain
      the GMP_NUMB_BITS * s - extra_bits as the current target size. */
-     
+
   s = n/2 + 1;
   if (BELOW_THRESHOLD (n, HGCD_APPR_THRESHOLD))
     {
@@ -155,7 +155,7 @@ mpn_hgcd_appr (mp_ptr ap, mp_ptr bp, mp_size_t n,
 	      ASSERT (n <= 2*s);
 
 	      nn = mpn_hgcd_step (n, ap, bp, s, M, tp);
-	  
+
 	      if (!nn)
 		return 1;
 
@@ -249,7 +249,7 @@ mpn_hgcd_appr (mp_ptr ap, mp_ptr bp, mp_size_t n,
 	  ASSERT (n <= 2*s);
 
 	  nn = mpn_hgcd_step (n, ap, bp, s, M, tp);
-	  
+
 	  if (!nn)
 	    return success;
 
diff --git a/mpn/generic/hgcd_jacobi.c b/mpn/generic/hgcd_jacobi.c
index 2dce43b99..0d4cb021c 100644
--- a/mpn/generic/hgcd_jacobi.c
+++ b/mpn/generic/hgcd_jacobi.c
@@ -26,7 +26,7 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #include "longlong.h"
 
 /* This file is almost a copy of hgcd.c, with some added calls to
-   mpn_jacobi_update */ 
+   mpn_jacobi_update */
 
 struct hgcd_jacobi_ctx
 {
@@ -127,7 +127,7 @@ hgcd_jacobi_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s,
     struct hgcd_jacobi_ctx ctx;
     ctx.M = M;
     ctx.bitsp = bitsp;
-    
+
     return mpn_gcd_subdiv_step (ap, bp, n, s, hgcd_jacobi_hook, &ctx, tp);
   }
 }
diff --git a/mpn/generic/hgcd_reduce.c b/mpn/generic/hgcd_reduce.c
index 142d44a30..89240af4d 100644
--- a/mpn/generic/hgcd_reduce.c
+++ b/mpn/generic/hgcd_reduce.c
@@ -38,7 +38,7 @@ submul (mp_ptr rp, mp_size_t rn,
   ASSERT (an >= bn);
   ASSERT (rn >= an);
   ASSERT (an + bn <= rn + 1);
-  
+
   TMP_MARK;
   tp = TMP_ALLOC_LIMBS (an + bn);
 
@@ -61,7 +61,7 @@ submul (mp_ptr rp, mp_size_t rn,
 /* FIXME:
     x Take scratch parameter, and figure out scratch need.
 
-    x Use some fallback for small M->n?    
+    x Use some fallback for small M->n?
 */
 static mp_size_t
 hgcd_matrix_apply (const struct hgcd_matrix *M,
@@ -83,7 +83,7 @@ hgcd_matrix_apply (const struct hgcd_matrix *M,
   MPN_NORMALIZE (ap, an);
   bn = n;
   MPN_NORMALIZE (bp, bn);
-  
+
   for (i = 0; i < 2; i++)
     for (j = 0; j < 2; j++)
       {
@@ -102,7 +102,7 @@ hgcd_matrix_apply (const struct hgcd_matrix *M,
   if (mn[0][1] == 0)
     {
       mp_size_t qn;
-      
+
       /* A unchanged, M = (1, 0; q, 1) */
       ASSERT (mn[0][0] == 1);
       ASSERT (M->p[0][0][0] == 1);
@@ -121,7 +121,7 @@ hgcd_matrix_apply (const struct hgcd_matrix *M,
       ASSERT (M->p[1][1][0] == 1);
 
       /* Put A  <-- A - q * B */
-      nn = submul (ap, an, bp, bn, M->p[0][1], mn[0][1]);      
+      nn = submul (ap, an, bp, bn, M->p[0][1], mn[0][1]);
     }
   else
     {
@@ -159,7 +159,7 @@ hgcd_matrix_apply (const struct hgcd_matrix *M,
 	MPN_ZERO (tp + n + mn[1][1], modn - n - mn[1][1]);
       if (n + mn[0][1] < modn)
 	MPN_ZERO (sp + n + mn[0][1], modn - n - mn[0][1]);
-  
+
       cy = mpn_sub_n (tp, tp, sp, modn);
       MPN_DECR_U (tp, modn, cy);
 
@@ -209,7 +209,7 @@ mpn_hgcd_reduce_itch (mp_size_t n, mp_size_t p)
       itch = 2*(n-p) + mpn_hgcd_itch (n-p);
       /* Currently, hgcd_matrix_apply allocates its own storage. */
     }
-  return itch;      
+  return itch;
 }
 
 /* FIXME: Document storage need. */
diff --git a/mpn/generic/hgcd_step.c b/mpn/generic/hgcd_step.c
index 0e56be39e..dbc757935 100644
--- a/mpn/generic/hgcd_step.c
+++ b/mpn/generic/hgcd_step.c
@@ -112,7 +112,7 @@ mpn_hgcd_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s,
       /* Multiply M1^{-1} (a;b) */
       return mpn_matrix22_mul1_inverse_vector (&M1, ap, tp, bp, n);
     }
- 
+
  subtract:
 
   return mpn_gcd_subdiv_step (ap, bp, n, s, hgcd_hook, M, tp);
diff --git a/mpn/powerpc64/mode64/aorsmul_1.asm b/mpn/powerpc64/mode64/aorsmul_1.asm
index 658a2d941..4b843a044 100644
--- a/mpn/powerpc64/mode64/aorsmul_1.asm
+++ b/mpn/powerpc64/mode64/aorsmul_1.asm
@@ -54,7 +54,7 @@ ifdef(`OPERATION_submul_1',`
 ')
 
 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
-	
+
 ASM_START()
 PROLOGUE(func_nc)
 EPILOGUE()
diff --git a/mpn/s390_32/lshift.asm b/mpn/s390_32/lshift.asm
index 335a5f77a..17e52655f 100644
--- a/mpn/s390_32/lshift.asm
+++ b/mpn/s390_32/lshift.asm
@@ -126,7 +126,7 @@ L(top):	l	%r10, 16(up)
 L(end):	l	%r10, 16(up)
 	sll	%r10, 0(cnt)
 	st	%r10, 12(rp)
-	
+
 	lr	%r2, %r12
 	lm	%r6, %r12, 24(%r15)
 	br	%r14
diff --git a/mpn/s390_32/lshiftc.asm b/mpn/s390_32/lshiftc.asm
index b42bc715b..9bdd0d798 100644
--- a/mpn/s390_32/lshiftc.asm
+++ b/mpn/s390_32/lshiftc.asm
@@ -138,7 +138,7 @@ L(end):	l	%r10, 16(up)
 	sll	%r10, 0(cnt)
 	xr	%r10, %r13
 	st	%r10, 12(rp)
-	
+
 	lr	%r2, %r12
 	lm	%r6, %r13, 24(%r15)
 	br	%r14
diff --git a/mpn/s390_32/rshift.asm b/mpn/s390_32/rshift.asm
index ec32fa764..becbe1893 100644
--- a/mpn/s390_32/rshift.asm
+++ b/mpn/s390_32/rshift.asm
@@ -120,7 +120,7 @@ L(top):	l	%r11, 0(up)
 L(end):	l	%r11, 0(up)
 	srl	%r11, 0(cnt)
 	st	%r11, 0(rp)
-	
+
 	lr	%r2, %r12
 	lm	%r6, %r12, 24(%r15)
 	br	%r14
diff --git a/mpn/x86/atom/lshift.asm b/mpn/x86/atom/lshift.asm
index d8cb8b505..1005cce59 100644
--- a/mpn/x86/atom/lshift.asm
+++ b/mpn/x86/atom/lshift.asm
@@ -160,7 +160,7 @@ deflit(`FRAME',4)
 	shr	$2, %eax		C (size + 3) / 4
 	and	$3, %edx		C (size - 1) % 4
 	jz	L(goloop)		C jmp if  size == 1 (mod 4)
-	shr	%edx			
+	shr	%edx
 	jnc	L(odd)			C jum if  size == 3 (mod 4)
 
 	add	%ecx, %ecx
@@ -173,7 +173,7 @@ deflit(`FRAME',4)
 	jnz	L(goloop)		C jump if  size == 0 (mod 4)
 L(odd):	lea	-8(up), up
 	lea	-8(rp), rp
-	jmp	L(sentry)		C reached if size == 2 or 3 (mod 4) 
+	jmp	L(sentry)		C reached if size == 2 or 3 (mod 4)
 
 L(sloop):
 	adc	%ecx, %ecx
diff --git a/mpn/x86/atom/sse2/mul_1.asm b/mpn/x86/atom/sse2/mul_1.asm
index dd9b95366..5cd86caec 100644
--- a/mpn/x86/atom/sse2/mul_1.asm
+++ b/mpn/x86/atom/sse2/mul_1.asm
@@ -62,7 +62,7 @@ EPILOGUE()
 PROLOGUE(mpn_mul_1)
 	pxor	%mm6, %mm6
 L(ent):	push	%esi			FRAME_pushl()
-	mov	PARAM_SRC, up		
+	mov	PARAM_SRC, up
 	mov	PARAM_SIZE, %eax	C size
 	movd	PARAM_MUL, %mm7
 	movd	(up), %mm0
diff --git a/mpn/x86/bdiv_dbm1c.asm b/mpn/x86/bdiv_dbm1c.asm
index 201ef173d..ac9faf270 100644
--- a/mpn/x86/bdiv_dbm1c.asm
+++ b/mpn/x86/bdiv_dbm1c.asm
@@ -24,10 +24,10 @@ C P5
 C P6 model 0-8,10-12)
 C P6 model 9  (Banias)
 C P6 model 13 (Dothan)		 5.1
-C P4 model 0  (Willamette)	 
+C P4 model 0  (Willamette)
 C P4 model 1  (?)
 C P4 model 2  (Northwood)	13.67
-C P4 model 3  (Prescott)	 
+C P4 model 3  (Prescott)
 C P4 model 4  (Nocona)
 C Intel Atom
 C AMD K6
diff --git a/mpn/x86/bdiv_q_1.asm b/mpn/x86/bdiv_q_1.asm
index 2528d01f7..7f344ab57 100644
--- a/mpn/x86/bdiv_q_1.asm
+++ b/mpn/x86/bdiv_q_1.asm
@@ -30,7 +30,7 @@ C K6     14.0
 C K7     12.0
 C P4     42.0
 
-MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)	
+MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
 
 defframe(PARAM_SHIFT,  24)
 defframe(PARAM_INVERSE,20)
diff --git a/mpn/x86/k7/addlsh1_n.asm b/mpn/x86/k7/addlsh1_n.asm
index e5163b676..05df4a740 100644
--- a/mpn/x86/k7/addlsh1_n.asm
+++ b/mpn/x86/k7/addlsh1_n.asm
@@ -44,14 +44,14 @@ C AMD K8
 C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32
 C processors.  It uses 2*3-way unrolling, for good reasons.  Unfortunately,
 C that means we need an initial magic multiply.
-C 
+C
 C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern.  We
 C cannot do rsblsh1_n since we feed carry from the shift blocks to the
 C add/subtract blocks, which is right for addition but reversed for
 C subtraction.  We could perhaps do sublsh1_n, with some extra move insns,
 C without losing any time, since we're not issue limited but carry recurrency
 C latency.
-C 
+C
 C Breaking carry recurrency might be a good idea.  We would then need separate
 C registers for the shift carry and add/subtract carry, which in turn would
 C force is to 2*2-way unrolling.
@@ -120,7 +120,7 @@ ifdef(`CPU_P6',`
 L(exact):
 	incl	VAR_COUNT
 	jz	L(end)
-	
+
 	ALIGN(16)
 L(top):
 ifdef(`CPU_P6',`
diff --git a/mpn/x86/k7/invert_limb.asm b/mpn/x86/k7/invert_limb.asm
index da6f28397..435fa96d0 100644
--- a/mpn/x86/k7/invert_limb.asm
+++ b/mpn/x86/k7/invert_limb.asm
@@ -60,7 +60,7 @@ ifdef(`DARWIN',`
 PROLOGUE(mpn_invert_limb)
 deflit(`FRAME', 0)
 	mov	PARAM_DIVISOR, %eax
-	C Avoid push/pop on k7.	
+	C Avoid push/pop on k7.
 	sub	$8, %esp	FRAME_subl_esp(8)
 	mov	%ebx, (%esp)
 	mov	%edi, 4(%esp)
diff --git a/mpn/x86/k7/sublsh1_n.asm b/mpn/x86/k7/sublsh1_n.asm
index 41993f99a..965348586 100644
--- a/mpn/x86/k7/sublsh1_n.asm
+++ b/mpn/x86/k7/sublsh1_n.asm
@@ -30,7 +30,7 @@ C			    cycles/limb
 C P5
 C P6 model 0-8,10-12
 C P6 model 9  (Banias)
-C P6 model 13 (Dothan)		 
+C P6 model 13 (Dothan)
 C P4 model 0  (Willamette)
 C P4 model 1  (?)
 C P4 model 2  (Northwood)
@@ -38,12 +38,12 @@ C P4 model 3  (Prescott)
 C P4 model 4  (Nocona)
 C Intel Atom			 6.75
 C AMD K6
-C AMD K7			 
+C AMD K7
 C AMD K8
 
 C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32
 C processors.  It uses 2*4-way unrolling, for good reasons.
-C 
+C
 C Breaking carry recurrency might be a good idea.  We would then need separate
 C registers for the shift carry and add/subtract carry, which in turn would
 C force is to 2*2-way unrolling.
@@ -114,7 +114,7 @@ ifdef(`CPU_P6',`
 	adc	%ebp, %ebp
 
 	rcr	%edx			C restore 1st saved carry bit
-	
+
 	sbb	%eax, (rp)
 	sbb	%ebx, 4(rp)
 	sbb	%ecx, 8(rp)
diff --git a/mpn/x86/p6/bdiv_q_1.asm b/mpn/x86/p6/bdiv_q_1.asm
index 3a8733a0d..0ffbc78e4 100644
--- a/mpn/x86/p6/bdiv_q_1.asm
+++ b/mpn/x86/p6/bdiv_q_1.asm
@@ -25,7 +25,7 @@ include(`../config.m4')
 C       odd  even  divisor
 C P6:  10.0  12.0  cycles/limb
 
-C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)	
+C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
 
 C The odd case is basically the same as mpn_modexact_1_odd, just with an
 C extra store, and it runs at the same 10 cycles which is the dependent
@@ -269,7 +269,7 @@ ifdef(`PIC',`
 	imull	%edx, %eax	C inv*inv*d
 
 	subl	%eax, %ebp		C inv = 2*inv - inv*inv*d
-	
+
 	jmp	L(common)
 
 EPILOGUE()
diff --git a/mpn/x86/pentium/bdiv_q_1.asm b/mpn/x86/pentium/bdiv_q_1.asm
index 965173d1c..7e84fc817 100644
--- a/mpn/x86/pentium/bdiv_q_1.asm
+++ b/mpn/x86/pentium/bdiv_q_1.asm
@@ -27,7 +27,7 @@ C       odd   even
 C P54:  24.5  30.5   cycles/limb
 C P55:  23.0  28.0
 
-MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)	
+MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
 
 C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
 C expected.  On P54 in the even case the shrdl pairing nonsense (see
diff --git a/mpn/x86_64/div_qr_2n_pi1.asm b/mpn/x86_64/div_qr_2n_pi1.asm
index 9f23012da..c28d0a02c 100644
--- a/mpn/x86_64/div_qr_2n_pi1.asm
+++ b/mpn/x86_64/div_qr_2n_pi1.asm
@@ -44,7 +44,7 @@ C TODO
 C * Store qh in the same stack slot as di_param, instead of pushing
 C   it. (we could put it in register %rbp, but then we would need to
 C   save and restore that instead, which doesn't seem like a win).
-	
+
 ASM_START()
 	TEXT
 	ALIGN(16)
@@ -56,7 +56,7 @@ PROLOGUE(mpn_div_qr_2n_pi1)
 	push	%r13
 	push	%r12
 	push	%rbx
-	
+
 	mov	-16(up, un, 8), u1
 	mov	-8(up, un, 8), u2
 
@@ -135,5 +135,5 @@ L(fix):	C Unlikely update. u2 >= d1
 	inc	t1
 	sub	d0, u1
 	sbb	d1, u2
-	jmp	L(bck)	
+	jmp	L(bck)
 EPILOGUE()
diff --git a/mpn/x86_64/div_qr_2u_pi1.asm b/mpn/x86_64/div_qr_2u_pi1.asm
index cfc7712d5..bdb64c148 100644
--- a/mpn/x86_64/div_qr_2u_pi1.asm
+++ b/mpn/x86_64/div_qr_2u_pi1.asm
@@ -66,7 +66,7 @@ deflit(`FRAME', 56)
 	movl	shift_param, R32(%rcx)
 
 	C FIXME: Different code for SHLD_SLOW
-	
+
 	xor	R32(u2), R32(u2)
 	mov	8(up, un, 8), u1
 	shld	%cl, u1, u2
@@ -173,7 +173,7 @@ L(fix):	C Unlikely update. u2 >= d1
 	inc	t1
 	sub	d0, u1
 	sbb	d1, u2
-	jmp	L(bck)	
+	jmp	L(bck)
 
 C Duplicated, just jumping back to a different address.
 L(fix_qh):	C Unlikely update. u2 >= d1
@@ -185,5 +185,5 @@ L(fix_qh):	C Unlikely update. u2 >= d1
 	inc	t1
 	sub	d0, u1
 	sbb	d1, u2
-	jmp	L(bck_qh)	
+	jmp	L(bck_qh)
 EPILOGUE()
diff --git a/mpn/x86_64/mod_1_1.asm b/mpn/x86_64/mod_1_1.asm
index 6b233e074..56f708a75 100644
--- a/mpn/x86_64/mod_1_1.asm
+++ b/mpn/x86_64/mod_1_1.asm
@@ -51,7 +51,7 @@ C Note: This implementation needs B1modb only when cnt > 0
 C The iteration is almost as follows,
 C
 C   r_2 B^3 + r_1 B^2 + r_0 B + u = r_1 B2modb + (r_0 + r_2 B2mod) B + u
-C                                                
+C
 C where r2 is a single bit represented as a mask. But to make sure that the
 C result fits in two limbs and a bit, carry from the addition
 C
@@ -206,7 +206,7 @@ ifdef(`SHLD_SLOW',`
 ')
 	imul	%rdx, %r8
 	shr	R8(%rcx), %r8
-	mov	%r8, 16(%rbx)		C store B1modb	
+	mov	%r8, 16(%rbx)		C store B1modb
 L(z):
 	pop	%r12
 	pop	%rbx
diff --git a/mpz/jacobi.c b/mpz/jacobi.c
index afd9a49b4..8bfb2e92b 100644
--- a/mpz/jacobi.c
+++ b/mpz/jacobi.c
@@ -110,7 +110,7 @@ mpz_jacobi (mpz_srcptr a, mpz_srcptr b)
       result_bit1 ^= JACOBI_N1B_BIT1(blow);
       asize = -asize;
     }
-  
+
   JACOBI_STRIP_LOW_ZEROS (result_bit1, blow, asrcp, asize, alow);
 
   /* Ensure asize >= bsize. Take advantage of the generalized
@@ -147,7 +147,7 @@ mpz_jacobi (mpz_srcptr a, mpz_srcptr b)
 
       result_bit1 ^= JACOBI_RECIP_UU_BIT1 (alow, blow);
     }
-  
+
   if (bsize == 1)
     {
       result_bit1 ^= JACOBI_TWOS_U_BIT1(btwos, alow);
@@ -165,7 +165,7 @@ mpz_jacobi (mpz_srcptr a, mpz_srcptr b)
      % B, but when A is much larger than B, we have to allocate space
      for the large quotient. We use the same area, pointed to by bp,
      for both the quotient A/B and the working copy of B. */
-  
+
   TMP_MARK;
 
   if (asize >= 2*bsize)
@@ -189,7 +189,7 @@ mpz_jacobi (mpz_srcptr a, mpz_srcptr b)
       result_bit1 ^= JACOBI_TWOS_U_BIT1(btwos, alow);
 
       ASSERT_NOCARRY (mpn_rshift (bp, bsrcp, bsize, btwos));
-      bsize -= (ap[bsize-1] | bp[bsize-1]) == 0;      
+      bsize -= (ap[bsize-1] | bp[bsize-1]) == 0;
     }
   else
     MPN_COPY (bp, bsrcp, bsize);
diff --git a/tests/cxx/t-ops2.cc b/tests/cxx/t-ops2.cc
index 9a6e7e020..4967ed208 100644
--- a/tests/cxx/t-ops2.cc
+++ b/tests/cxx/t-ops2.cc
@@ -148,18 +148,18 @@ void checkqf (){
   CHECK_SI(T,0,3,*);
   CHECK_ALL_COMPARISONS(T,5.,2);
   CHECK_ALL_SIGNS_COMPARISONS(T,11.,3);
-  CHECK_MPZ(T,5,-2,<); 
-  CHECK_MPZ(T,5,-2,>); 
+  CHECK_MPZ(T,5,-2,<);
+  CHECK_MPZ(T,5,-2,>);
   CHECK_MPZ(T,5,-2,<=);
   CHECK_MPZ(T,5,-2,>=);
   CHECK_MPZ(T,5,-2,==);
   CHECK_MPZ(T,5,-2,!=);
-  CHECK_MPZ(T,0,0,<);  
-  CHECK_MPZ(T,0,0,>);  
-  CHECK_MPZ(T,0,0,<=); 
-  CHECK_MPZ(T,0,0,>=); 
-  CHECK_MPZ(T,0,0,==); 
-  CHECK_MPZ(T,0,0,!=); 
+  CHECK_MPZ(T,0,0,<);
+  CHECK_MPZ(T,0,0,>);
+  CHECK_MPZ(T,0,0,<=);
+  CHECK_MPZ(T,0,0,>=);
+  CHECK_MPZ(T,0,0,==);
+  CHECK_MPZ(T,0,0,!=);
   ASSERT_ALWAYS(T(6)<<2==6.*4);
   ASSERT_ALWAYS(T(6)>>2==6./4);
   ASSERT_ALWAYS(T(-13)<<2==-13.*4);
@@ -217,18 +217,18 @@ void checkf (){
   CHECK_MPQ(mpf_class,-5.5,-2.25,-);
   CHECK_MPQ(mpf_class,-5.5,-2.25,*);
   CHECK_MPQ(mpf_class,-5.25,-0.5,/);
-  CHECK_MPQ(mpf_class,5,-2,<);  
-  CHECK_MPQ(mpf_class,5,-2,>);  
-  CHECK_MPQ(mpf_class,5,-2,<=); 
-  CHECK_MPQ(mpf_class,5,-2,>=); 
-  CHECK_MPQ(mpf_class,5,-2,==); 
-  CHECK_MPQ(mpf_class,5,-2,!=); 
-  CHECK_MPQ(mpf_class,0,0,<);  
-  CHECK_MPQ(mpf_class,0,0,>);  
-  CHECK_MPQ(mpf_class,0,0,<=); 
-  CHECK_MPQ(mpf_class,0,0,>=); 
-  CHECK_MPQ(mpf_class,0,0,==); 
-  CHECK_MPQ(mpf_class,0,0,!=); 
+  CHECK_MPQ(mpf_class,5,-2,<);
+  CHECK_MPQ(mpf_class,5,-2,>);
+  CHECK_MPQ(mpf_class,5,-2,<=);
+  CHECK_MPQ(mpf_class,5,-2,>=);
+  CHECK_MPQ(mpf_class,5,-2,==);
+  CHECK_MPQ(mpf_class,5,-2,!=);
+  CHECK_MPQ(mpf_class,0,0,<);
+  CHECK_MPQ(mpf_class,0,0,>);
+  CHECK_MPQ(mpf_class,0,0,<=);
+  CHECK_MPQ(mpf_class,0,0,>=);
+  CHECK_MPQ(mpf_class,0,0,==);
+  CHECK_MPQ(mpf_class,0,0,!=);
 }
 
 int
diff --git a/tests/devel/try.c b/tests/devel/try.c
index bf09dd829..7ccb9de0b 100644
--- a/tests/devel/try.c
+++ b/tests/devel/try.c
@@ -459,7 +459,7 @@ validate_bdiv_q_1
 
     refmpn_mul_1 (tp, dst, size, divisor);
     /* Set ignored low bits */
-    tp[0] |= (src[0] & LOW_ZEROS_MASK (divisor)); 
+    tp[0] |= (src[0] & LOW_ZEROS_MASK (divisor));
     if (! refmpn_equal_anynail (tp, src, size))
       {
 	printf ("Bdiv wrong: res * divisor != src (mod B^size)\n");
diff --git a/tests/mpn/t-hgcd_appr.c b/tests/mpn/t-hgcd_appr.c
index 912a1fde0..486b13061 100644
--- a/tests/mpn/t-hgcd_appr.c
+++ b/tests/mpn/t-hgcd_appr.c
@@ -261,7 +261,7 @@ one_test (mpz_t a, mpz_t b, int i)
 		     "after tp: %Mx\n"
 		     "expected: %Mx\n",
 		     hgcd_tp[hgcd_scratch], marker[3]);
-      
+
       abort ();
     }
 
@@ -424,7 +424,7 @@ hgcd_appr_valid_p (mpz_t a, mpz_t b, mp_size_t res0,
   mp_bitcnt_t dbits, abits, margin;
   mpz_t appr_r0, appr_r1, t, q;
   struct hgcd_ref appr;
-  
+
   if (!res0)
     {
       if (!res1)
@@ -433,7 +433,7 @@ hgcd_appr_valid_p (mpz_t a, mpz_t b, mp_size_t res0,
       fprintf (stderr, "mpn_hgcd_appr returned 1 when no reduction possible.\n");
       return 0;
     }
-      
+
   /* NOTE: No *_clear calls on error return, since we're going to
      abort anyway. */
   mpz_init (t);
@@ -441,7 +441,7 @@ hgcd_appr_valid_p (mpz_t a, mpz_t b, mp_size_t res0,
   hgcd_ref_init (&appr);
   mpz_init (appr_r0);
   mpz_init (appr_r1);
- 
+
   if (mpz_size (ref_r0) <= s)
     {
       fprintf (stderr, "ref_r0 too small!!!: "); debug_mp (ref_r0, 16);
@@ -460,7 +460,7 @@ hgcd_appr_valid_p (mpz_t a, mpz_t b, mp_size_t res0,
       fprintf (stderr, "ref |r0 - r1| too large!!!: "); debug_mp (t, 16);
       return 0;
     }
-   
+
   if (!res1)
     {
       mpz_set (appr_r0, a);
@@ -473,7 +473,7 @@ hgcd_appr_valid_p (mpz_t a, mpz_t b, mp_size_t res0,
       for (i = 0; i<2; i++)
 	{
 	  unsigned j;
-	  
+
 	  for (j = 0; j<2; j++)
 	    {
 	      mp_size_t mn = hgcd->n;
@@ -567,7 +567,7 @@ hgcd_appr_valid_p (mpz_t a, mpz_t b, mp_size_t res0,
 
       fprintf (stderr, "appr_r1: "); debug_mp (appr_r1, 16);
       fprintf (stderr, "ref_r1: "); debug_mp (ref_r1, 16);
-      
+
       return 0;
     }
   mpz_clear (t);
diff --git a/tests/mpn/t-mod_1.c b/tests/mpn/t-mod_1.c
index f1966154d..2f86ba277 100644
--- a/tests/mpn/t-mod_1.c
+++ b/tests/mpn/t-mod_1.c
@@ -90,7 +90,7 @@ main (int argc, char **argv)
   rands = RANDS;
   mpz_init (a);
   mpz_init (b);
-  
+
   for (i = 0; i < 300; i++)
     {
       mp_size_t asize;
diff --git a/tests/mpn/t-mulmid.c b/tests/mpn/t-mulmid.c
index ab224acea..a946aefe8 100644
--- a/tests/mpn/t-mulmid.c
+++ b/tests/mpn/t-mulmid.c
@@ -52,7 +52,7 @@ main (int argc, char **argv)
   bp = TMP_ALLOC_LIMBS (MAX_N);
   rp = TMP_ALLOC_LIMBS (MAX_N + 2);
   refp = TMP_ALLOC_LIMBS (MAX_N + 2);
-  
+
   for (test = 0; test < COUNT; test++)
     {
       mp_size_t an, bn, rn;
diff --git a/tests/mpz/t-jac.c b/tests/mpz/t-jac.c
index 5d8cad177..34cd82e78 100644
--- a/tests/mpz/t-jac.c
+++ b/tests/mpz/t-jac.c
@@ -921,7 +921,7 @@ mpz_nextprime_step (mpz_ptr p, mpz_srcptr n, mpz_srcptr step_in)
   mpz_gcd (gcd, p, step);
   ASSERT_ALWAYS (mpz_cmp_ui (gcd, 1) == 0);
   mpz_clear (gcd);
-    
+
   pn = SIZ(p);
   count_leading_zeros (cnt, PTR(p)[pn - 1]);
   nbits = pn * GMP_NUMB_BITS - (cnt - GMP_NAIL_BITS);
@@ -1016,7 +1016,7 @@ check_large_quotients (void)
       mpz_set_ui (op1, 0);
       mpz_urandomb (bs, rands, 32);
       mpz_urandomb (bs, rands, mpz_get_ui (bs) % 10 + 1);
-      
+
       gcd_size = 1 + mpz_get_ui (bs);
       if (gcd_size & 1)
 	{
diff --git a/tune/tune-gcd-p.c b/tune/tune-gcd-p.c
index 3c3815bd2..6d8863178 100644
--- a/tune/tune-gcd-p.c
+++ b/tune/tune-gcd-p.c
@@ -39,7 +39,7 @@ search (double *minp, double (*f)(void *, int), void *ctx, int start, int end)
   double y[4];
 
   int best_i;
-  
+
   x[0] = start;
   x[3] = end;
 
@@ -60,7 +60,7 @@ search (double *minp, double (*f)(void *, int), void *ctx, int start, int end)
 #if 0
       printf("%d: %f, %d: %f, %d:, %f %d: %f\n",
 	     x[0], y[0], x[1], y[1], x[2], y[2], x[3], y[3]);
-#endif    
+#endif
       for (best_i = 0, i = 1; i < 4; i++)
 	if (y[i] < y[best_i])
 	  best_i = i;
diff --git a/tune/tuneup.c b/tune/tuneup.c
index 444e5e429..4cc75eed1 100644
--- a/tune/tuneup.c
+++ b/tune/tuneup.c
@@ -203,7 +203,7 @@ mp_size_t  divrem_1_norm_threshold      = MP_SIZE_T_MAX;
 mp_size_t  divrem_1_unnorm_threshold    = MP_SIZE_T_MAX;
 mp_size_t  mod_1_norm_threshold         = MP_SIZE_T_MAX;
 mp_size_t  mod_1_unnorm_threshold       = MP_SIZE_T_MAX;
-int	   mod_1_1p_method		= 0; 
+int	   mod_1_1p_method		= 0;
 mp_size_t  mod_1n_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
 mp_size_t  mod_1u_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
 mp_size_t  mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX;
-- 
cgit v1.2.1


From 4c05211240a87d8eaa562b7c842ceac0a521bc02 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Tue, 22 Nov 2011 00:11:49 +0100
Subject: Add power6-specific addmul_1.

---
 ChangeLog                            |   4 +
 mpn/powerpc64/mode64/p6/addmul_1.asm | 147 +++++++++++++++++++++++++++++++++++
 2 files changed, 151 insertions(+)
 create mode 100644 mpn/powerpc64/mode64/p6/addmul_1.asm

diff --git a/ChangeLog b/ChangeLog
index 4d031a239..61631cce8 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2011-11-22  Torbjorn Granlund  <tege@gmplib.org>
+
+	* mpn/powerpc64/mode64/p6/addmul_1.asm: New file.
+
 2011-11-21  Torbjorn Granlund  <tege@gmplib.org>
 
 	* gmp-h.in (__GNU_MP_RELEASE): Renamed from typo name.
diff --git a/mpn/powerpc64/mode64/p6/addmul_1.asm b/mpn/powerpc64/mode64/p6/addmul_1.asm
new file mode 100644
index 000000000..bffa6f308
--- /dev/null
+++ b/mpn/powerpc64/mode64/p6/addmul_1.asm
@@ -0,0 +1,147 @@
+dnl  PowerPC-64 mpn_addmul_1 optimised for power6.
+
+dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010, 2011
+dnl  Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/limb
+C POWER3/PPC630		 ?
+C POWER4/PPC970		 ?
+C POWER5		 ?
+C POWER6		12.25
+C POWER7		 ?
+
+C TODO
+C  * Reduce register usage.
+C  * Schedule function entry code.
+C  * Unroll more.  8-way unrolling would bring us to 10 c/l, 16-way unrolling
+C    would bring us to 9 c/l.
+C  * Generalise to handle submul_1.
+
+C INPUT PARAMETERS
+define(`rp',  `r3')
+define(`up',  `r4')
+define(`n',   `r5')
+define(`v0',  `r6')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	std	r31, -8(r1)
+	std	r30, -16(r1)
+	std	r29, -24(r1)
+	std	r28, -32(r1)
+	std	r27, -40(r1)
+
+	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
+	cmpdi	cr6, r0, 2
+	addi	n, n, 3		C compute count...
+	srdi	n, n, 2		C ...for ctr
+	mtctr	n		C copy loop count into ctr
+	beq	cr0, L(b0)
+	blt	cr6, L(b1)
+	beq	cr6, L(b2)
+
+L(b3):	addi	up, up, 16
+	addi	rp, rp, 16
+	ld	r8, -16(up)
+	ld	r7, -8(up)
+	ld	r27, 0(up)
+	mulld	r5,  r8, v0
+	mulhdu	r8,  r8, v0
+	mulld	r9,  r7, v0
+	mulhdu	r7,  r7, v0
+	mulld	r11, r27, v0
+	mulhdu	r27, r27, v0
+	ld	r29, -16(rp)
+	ld	r30, -8(rp)
+	ld	r31, 0(rp)
+	addc	r9, r9, r8
+	adde	r11, r11, r7
+	addze	r12, r27
+	addc	r5, r5, r29
+	b	L(l3)
+
+L(b2):	addi	up, up, 8
+	addi	rp, rp, 8
+	ld	r7, -8(up)
+	ld	r27, 0(up)
+	mulld	r9,  r7, v0
+	mulhdu	r7,  r7, v0
+	mulld	r11, r27, v0
+	mulhdu	r27, r27, v0
+	ld	r30, -8(rp)
+	ld	r31, 0(rp)
+	addc	r11, r11, r7
+	addze	r12, r27
+	addc	r9, r9, r30
+	b	L(l2)
+
+L(b1):	ld	r27, 0(up)
+	ld	r31, 0(rp)
+	mulld	r11, r27, v0
+	mulhdu	r12, r27, v0
+	addc	r11, r11, r31
+	b	L(l1)
+
+L(b0):	addi	up, up, -8
+	addi	rp, rp, -8
+	addic	r12, r0, 0	C clear r12 and cy (use that r0 = 0)
+
+	ALIGN(32)
+L(top):	ld	r10, 8(up)
+	ld	r8, 16(up)
+	ld	r7, 24(up)
+	ld	r27, 32(up)
+	addi	up, up, 32
+	addi	rp, rp, 32
+	mulld	r0,  r10, v0
+	mulhdu	r10, r10, v0
+	mulld	r5,  r8, v0
+	mulhdu	r8,  r8, v0
+	mulld	r9,  r7, v0
+	mulhdu	r7,  r7, v0
+	mulld	r11, r27, v0
+	mulhdu	r27, r27, v0
+	ld	r28, -24(rp)
+	adde	r0, r0, r12
+	ld	r29, -16(rp)
+	adde	r5, r5, r10
+	ld	r30, -8(rp)
+	ld	r31, 0(rp)
+	adde	r9, r9, r8
+	adde	r11, r11, r7
+	addze	r12, r27
+	addc	r0, r0, r28
+	std	r0, -24(rp)
+	adde	r5, r5, r29
+L(l3):	std	r5, -16(rp)
+	adde	r9, r9, r30
+L(l2):	std	r9, -8(rp)
+	adde	r11, r11, r31
+L(l1):	std	r11, 0(rp)
+	bdnz	L(top)
+
+	addze	r3, r12
+	ld	r31, -8(r1)
+	ld	r30, -16(r1)
+	ld	r29, -24(r1)
+	ld	r28, -32(r1)
+	ld	r27, -40(r1)
+	blr
+EPILOGUE()
-- 
cgit v1.2.1


From 8ba30f40072e06e46ce109592fb3df2c9087e5d1 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Tue, 22 Nov 2011 14:30:39 +0100
Subject: Retune.

---
 mpn/powerpc64/mode64/p6/gmp-mparam.h | 44 ++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/mpn/powerpc64/mode64/p6/gmp-mparam.h b/mpn/powerpc64/mode64/p6/gmp-mparam.h
index 5ec334089..bf7f0fd0c 100644
--- a/mpn/powerpc64/mode64/p6/gmp-mparam.h
+++ b/mpn/powerpc64/mode64/p6/gmp-mparam.h
@@ -1,7 +1,7 @@
 /* POWER6 gmp-mparam.h -- Compiler/machine parameter header file.
 
-Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010 Free
-Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010, 2011
+Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -53,7 +53,7 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_TOOM6_THRESHOLD                274
 #define SQR_TOOM8_THRESHOLD                410
 
-#define MULMID_TOOM42_THRESHOLD             24
+#define MULMID_TOOM42_THRESHOLD             36
 
 #define MULMOD_BNM1_THRESHOLD               14
 #define SQRMOD_BNM1_THRESHOLD               14
@@ -111,36 +111,36 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_FFT_THRESHOLD                 2368
 
 #define MULLO_BASECASE_THRESHOLD             5
-#define MULLO_DC_THRESHOLD                  28
-#define MULLO_MUL_N_THRESHOLD             3084
+#define MULLO_DC_THRESHOLD                  61
+#define MULLO_MUL_N_THRESHOLD             5558
 
-#define DC_DIV_QR_THRESHOLD                 23
+#define DC_DIV_QR_THRESHOLD                 29
 #define DC_DIVAPPR_Q_THRESHOLD             112
-#define DC_BDIV_QR_THRESHOLD                29
-#define DC_BDIV_Q_THRESHOLD                 79
+#define DC_BDIV_QR_THRESHOLD                70
+#define DC_BDIV_Q_THRESHOLD                168
 
-#define INV_MULMOD_BNM1_THRESHOLD           51
+#define INV_MULMOD_BNM1_THRESHOLD           61
 #define INV_NEWTON_THRESHOLD                93
 #define INV_APPR_THRESHOLD                  91
 
-#define BINV_NEWTON_THRESHOLD              132
-#define REDC_1_TO_REDC_N_THRESHOLD          39
+#define BINV_NEWTON_THRESHOLD              222
+#define REDC_1_TO_REDC_N_THRESHOLD          63
 
-#define MU_DIV_QR_THRESHOLD                855
+#define MU_DIV_QR_THRESHOLD                807
 #define MU_DIVAPPR_Q_THRESHOLD             807
-#define MUPI_DIV_QR_THRESHOLD               23
-#define MU_BDIV_QR_THRESHOLD               807
-#define MU_BDIV_Q_THRESHOLD                872
+#define MUPI_DIV_QR_THRESHOLD               27
+#define MU_BDIV_QR_THRESHOLD               872
+#define MU_BDIV_Q_THRESHOLD               1078
 
 #define MATRIX22_STRASSEN_THRESHOLD         13
-#define HGCD_THRESHOLD                      69
-#define HGCD_APPR_THRESHOLD                 50
+#define HGCD_THRESHOLD                      94
+#define HGCD_APPR_THRESHOLD                 55
 #define HGCD_REDUCE_THRESHOLD             2121
-#define GCD_DC_THRESHOLD                   268
-#define GCDEXT_DC_THRESHOLD                209
+#define GCD_DC_THRESHOLD                   253
+#define GCDEXT_DC_THRESHOLD                217
 #define JACOBI_BASE_METHOD                   4
 
-#define GET_STR_DC_THRESHOLD                17
-#define GET_STR_PRECOMPUTE_THRESHOLD        27
+#define GET_STR_DC_THRESHOLD                16
+#define GET_STR_PRECOMPUTE_THRESHOLD        29
 #define SET_STR_DC_THRESHOLD               532
-#define SET_STR_PRECOMPUTE_THRESHOLD      1648
+#define SET_STR_PRECOMPUTE_THRESHOLD      1561
-- 
cgit v1.2.1


From 0e7ee006721d05c6a652b5ebb3feda42ca44c68b Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Tue, 22 Nov 2011 16:57:06 +0100
Subject: Add more cycle numbers.

---
 mpn/x86/tabselect.asm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mpn/x86/tabselect.asm b/mpn/x86/tabselect.asm
index ab646dac3..7c8c2601f 100644
--- a/mpn/x86/tabselect.asm
+++ b/mpn/x86/tabselect.asm
@@ -27,12 +27,12 @@ C P6 model 9  (Banias)		 ?
 C P6 model 13 (Dothan)		 ?
 C P4 model 0  (Willamette)	 ?
 C P4 model 1  (?)		 ?
-C P4 model 2  (Northwood)	 ?
+C P4 model 2  (Northwood)	 4.5
 C P4 model 3  (Prescott)	 ?
 C P4 model 4  (Nocona)		 ?
 C Intel Atom			 ?
 C AMD K6			 ?
-C AMD K7			 ?
+C AMD K7			 3.4
 C AMD K8			 ?
 C AMD K10			 ?
 
-- 
cgit v1.2.1


From c1be217f4a744da94162daa00293255d16f61cac Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Tue, 22 Nov 2011 16:58:46 +0100
Subject: Align loop for slightly better power5 performance.

---
 mpn/powerpc64/mode64/aors_n.asm | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mpn/powerpc64/mode64/aors_n.asm b/mpn/powerpc64/mode64/aors_n.asm
index c6ea35089..8c30871c2 100644
--- a/mpn/powerpc64/mode64/aors_n.asm
+++ b/mpn/powerpc64/mode64/aors_n.asm
@@ -1,6 +1,6 @@
 dnl  PowerPC-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
 
-dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007 Free Software
+dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007, 2011 Free Software
 dnl  Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
@@ -23,7 +23,7 @@ include(`../config.m4')
 C                   cycles/limb
 C POWER3/PPC630          1.5
 C POWER4/PPC970          2
-C POWER5                 2.25
+C POWER5                 2
 C POWER6                 2.63
 C POWER7               2.25-2.87
 
@@ -137,6 +137,7 @@ L(go):	ld	r6, 0(r4)	C load s1 limb
 	addi	r4, r4, 32
 	addi	r5, r5, 32
 
+	ALIGN(16)
 L(top):	ADDSUBC	r28, r7, r6
 	ld	r6, 0(r4)	C load s1 limb
 	ld	r7, 0(r5)	C load s2 limb
-- 
cgit v1.2.1


From 10688cef0b5361ffebd094967fdcd7ebb3b63d83 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Tue, 22 Nov 2011 17:01:22 +0100
Subject: Add more cycle numbers.

---
 mpn/powerpc32/aors_n.asm | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/mpn/powerpc32/aors_n.asm b/mpn/powerpc32/aors_n.asm
index f9e9b50d5..12115a9e9 100644
--- a/mpn/powerpc32/aors_n.asm
+++ b/mpn/powerpc32/aors_n.asm
@@ -19,14 +19,17 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C                cycles/limb
-C 603e:              ?
-C 604e:              ?		old: 3.25
-C 75x (G3):          ?		old: 3.5
-C 7400,7410 (G4):    3.25
-C 744x,745x (G4+):   4
-C power4/ppc970:     ?		old: 2.0
-C power5:            ?		old: 2.5
+C                   cycles/limb
+C 603e:                  ?
+C 604e:                  ?		old: 3.25
+C 75x (G3):              ?		old: 3.5
+C 7400,7410 (G4):        3.25
+C 744x,745x (G4+):       4
+C POWER3/PPC630          2
+C POWER4/PPC970          2.4
+C POWER5                 2.75
+C POWER6               40-140
+C POWER7                 3
 
 C INPUT PARAMETERS
 define(`rp',	`r3')
-- 
cgit v1.2.1


From d8b2d9eabb0faff2d12c3f2b3ab5c9e36fb21701 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Tue, 22 Nov 2011 17:03:17 +0100
Subject: Add more cycle numbers.

---
 mpn/x86_64/tabselect.asm | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mpn/x86_64/tabselect.asm b/mpn/x86_64/tabselect.asm
index ca475942b..2611b3212 100644
--- a/mpn/x86_64/tabselect.asm
+++ b/mpn/x86_64/tabselect.asm
@@ -23,11 +23,12 @@ include(`../config.m4')
 C	     cycles/limb
 C AMD K8,K9	 2.5
 C AMD K10	 2.5
+C AMD bobcat	 3.5
 C Intel P4	 4
-C Intel core2	 2.3
+C Intel core2	 2.33
 C Intel NHM	 2.5
 C Intel SBR	 2.2
-C Intel atom	 ?
+C Intel atom	 5
 C VIA nano	 3.5
 
 C NOTES
-- 
cgit v1.2.1


From e7f9942cc24335135f4bd92e53787fd619efa69d Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Tue, 22 Nov 2011 17:14:32 +0100
Subject: Don't fail fat builds under 64-bit DOS.

---
 configure.in | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/configure.in b/configure.in
index 186d4b576..1d1ebd10f 100644
--- a/configure.in
+++ b/configure.in
@@ -1930,9 +1930,17 @@ case $host in
 
       if test "$abi" = 64; then
 	gcc_64_cflags=""
-	extra_functions_64="$extra_functions_64 fat fat_entry"
-	path_64="x86_64/fat x86_64"
-	fat_path="x86_64 x86_64/fat x86_64/pentium4 x86_64/core2 x86_64/coreinhm x86_64/coreisbr x86_64/atom x86_64/nano"
+	case $host in
+	  *-*-mingw* | *-*-cygwin)
+	    path_64=""	# Windows amd64 calling conventions are *different*
+	    fat_path=""
+	    ;;
+	  *)
+	    extra_functions_64="$extra_functions_64 fat fat_entry"
+	    path_64="x86_64/fat x86_64"
+	    fat_path="x86_64 x86_64/fat x86_64/pentium4 x86_64/core2 x86_64/coreinhm x86_64/coreisbr x86_64/atom x86_64/nano"
+	    ;;
+	esac
       fi
 
       fat_functions="add_n addmul_1 copyd copyi
-- 
cgit v1.2.1


From 042073d276059b723232c6db58c005645131d167 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Tue, 22 Nov 2011 17:14:35 +0100
Subject: *** empty log message ***

---
 ChangeLog | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index 61631cce8..1d6a44512 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,10 @@
 2011-11-22  Torbjorn Granlund  <tege@gmplib.org>
 
+	* configure.in: Don't fail fat builds under 64-bit DOS.
+
+	* mpn/powerpc64/mode64/aors_n.asm: Align loop for slightly better
+	power5 performance.
+
 	* mpn/powerpc64/mode64/p6/addmul_1.asm: New file.
 
 2011-11-21  Torbjorn Granlund  <tege@gmplib.org>
-- 
cgit v1.2.1


From 17a8a01f86586cbe7436565a7d22764f8f5988ea Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Tue, 22 Nov 2011 22:05:25 +0100
Subject: Generalise new power6 addmul_1 to support also submul_1.

---
 ChangeLog                             |   4 +-
 mpn/powerpc64/mode64/p6/addmul_1.asm  | 147 -----------------------------
 mpn/powerpc64/mode64/p6/aorsmul_1.asm | 172 ++++++++++++++++++++++++++++++++++
 3 files changed, 174 insertions(+), 149 deletions(-)
 delete mode 100644 mpn/powerpc64/mode64/p6/addmul_1.asm
 create mode 100644 mpn/powerpc64/mode64/p6/aorsmul_1.asm

diff --git a/ChangeLog b/ChangeLog
index 1d6a44512..80e0f7a32 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,12 +1,12 @@
 2011-11-22  Torbjorn Granlund  <tege@gmplib.org>
 
+	* mpn/powerpc64/mode64/p6/aorsmul_1.asm: New file.
+
 	* configure.in: Don't fail fat builds under 64-bit DOS.
 
 	* mpn/powerpc64/mode64/aors_n.asm: Align loop for slightly better
 	power5 performance.
 
-	* mpn/powerpc64/mode64/p6/addmul_1.asm: New file.
-
 2011-11-21  Torbjorn Granlund  <tege@gmplib.org>
 
 	* gmp-h.in (__GNU_MP_RELEASE): Renamed from typo name.
diff --git a/mpn/powerpc64/mode64/p6/addmul_1.asm b/mpn/powerpc64/mode64/p6/addmul_1.asm
deleted file mode 100644
index bffa6f308..000000000
--- a/mpn/powerpc64/mode64/p6/addmul_1.asm
+++ /dev/null
@@ -1,147 +0,0 @@
-dnl  PowerPC-64 mpn_addmul_1 optimised for power6.
-
-dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010, 2011
-dnl  Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of the GNU Lesser General Public License as published
-dnl  by the Free Software Foundation; either version 3 of the License, or (at
-dnl  your option) any later version.
-
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
-dnl  License for more details.
-
-dnl  You should have received a copy of the GNU Lesser General Public License
-dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C		    cycles/limb
-C POWER3/PPC630		 ?
-C POWER4/PPC970		 ?
-C POWER5		 ?
-C POWER6		12.25
-C POWER7		 ?
-
-C TODO
-C  * Reduce register usage.
-C  * Schedule function entry code.
-C  * Unroll more.  8-way unrolling would bring us to 10 c/l, 16-way unrolling
-C    would bring us to 9 c/l.
-C  * Generalise to handle submul_1.
-
-C INPUT PARAMETERS
-define(`rp',  `r3')
-define(`up',  `r4')
-define(`n',   `r5')
-define(`v0',  `r6')
-
-ASM_START()
-PROLOGUE(mpn_addmul_1)
-	std	r31, -8(r1)
-	std	r30, -16(r1)
-	std	r29, -24(r1)
-	std	r28, -32(r1)
-	std	r27, -40(r1)
-
-	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
-	cmpdi	cr6, r0, 2
-	addi	n, n, 3		C compute count...
-	srdi	n, n, 2		C ...for ctr
-	mtctr	n		C copy loop count into ctr
-	beq	cr0, L(b0)
-	blt	cr6, L(b1)
-	beq	cr6, L(b2)
-
-L(b3):	addi	up, up, 16
-	addi	rp, rp, 16
-	ld	r8, -16(up)
-	ld	r7, -8(up)
-	ld	r27, 0(up)
-	mulld	r5,  r8, v0
-	mulhdu	r8,  r8, v0
-	mulld	r9,  r7, v0
-	mulhdu	r7,  r7, v0
-	mulld	r11, r27, v0
-	mulhdu	r27, r27, v0
-	ld	r29, -16(rp)
-	ld	r30, -8(rp)
-	ld	r31, 0(rp)
-	addc	r9, r9, r8
-	adde	r11, r11, r7
-	addze	r12, r27
-	addc	r5, r5, r29
-	b	L(l3)
-
-L(b2):	addi	up, up, 8
-	addi	rp, rp, 8
-	ld	r7, -8(up)
-	ld	r27, 0(up)
-	mulld	r9,  r7, v0
-	mulhdu	r7,  r7, v0
-	mulld	r11, r27, v0
-	mulhdu	r27, r27, v0
-	ld	r30, -8(rp)
-	ld	r31, 0(rp)
-	addc	r11, r11, r7
-	addze	r12, r27
-	addc	r9, r9, r30
-	b	L(l2)
-
-L(b1):	ld	r27, 0(up)
-	ld	r31, 0(rp)
-	mulld	r11, r27, v0
-	mulhdu	r12, r27, v0
-	addc	r11, r11, r31
-	b	L(l1)
-
-L(b0):	addi	up, up, -8
-	addi	rp, rp, -8
-	addic	r12, r0, 0	C clear r12 and cy (use that r0 = 0)
-
-	ALIGN(32)
-L(top):	ld	r10, 8(up)
-	ld	r8, 16(up)
-	ld	r7, 24(up)
-	ld	r27, 32(up)
-	addi	up, up, 32
-	addi	rp, rp, 32
-	mulld	r0,  r10, v0
-	mulhdu	r10, r10, v0
-	mulld	r5,  r8, v0
-	mulhdu	r8,  r8, v0
-	mulld	r9,  r7, v0
-	mulhdu	r7,  r7, v0
-	mulld	r11, r27, v0
-	mulhdu	r27, r27, v0
-	ld	r28, -24(rp)
-	adde	r0, r0, r12
-	ld	r29, -16(rp)
-	adde	r5, r5, r10
-	ld	r30, -8(rp)
-	ld	r31, 0(rp)
-	adde	r9, r9, r8
-	adde	r11, r11, r7
-	addze	r12, r27
-	addc	r0, r0, r28
-	std	r0, -24(rp)
-	adde	r5, r5, r29
-L(l3):	std	r5, -16(rp)
-	adde	r9, r9, r30
-L(l2):	std	r9, -8(rp)
-	adde	r11, r11, r31
-L(l1):	std	r11, 0(rp)
-	bdnz	L(top)
-
-	addze	r3, r12
-	ld	r31, -8(r1)
-	ld	r30, -16(r1)
-	ld	r29, -24(r1)
-	ld	r28, -32(r1)
-	ld	r27, -40(r1)
-	blr
-EPILOGUE()
diff --git a/mpn/powerpc64/mode64/p6/aorsmul_1.asm b/mpn/powerpc64/mode64/p6/aorsmul_1.asm
new file mode 100644
index 000000000..4bd508488
--- /dev/null
+++ b/mpn/powerpc64/mode64/p6/aorsmul_1.asm
@@ -0,0 +1,172 @@
+dnl  PowerPC-64 mpn_addmul_1 and mpn_submul_1 optimised for power6.
+
+dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010, 2011
+dnl  Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C               mpn_addmul_1    mpn_submul_1
+C               cycles/limb     cycles/limb
+C POWER3/PPC630     ?               ?
+C POWER4/PPC970     ?               ?
+C POWER5            ?               ?
+C POWER6           12.25           12.8
+C POWER7            ?               ?
+
+C TODO
+C  * Reduce register usage.
+C  * Schedule function entry code.
+C  * Unroll more.  8-way unrolling would bring us to 10 c/l, 16-way unrolling
+C    would bring us to 9 c/l.
+C  * Handle n = 1 and perhaps n = 2 separately, without saving any registers.
+
+C INPUT PARAMETERS
+define(`rp',  `r3')
+define(`up',  `r4')
+define(`n',   `r5')
+define(`v0',  `r6')
+
+ifdef(`OPERATION_addmul_1',`
+  define(ADDSUBC,	adde)
+  define(ADDSUB,	addc)
+  define(func,		mpn_addmul_1)
+  define(func_nc,	mpn_addmul_1c)	C FIXME: not really supported
+  define(AM,		`$1')
+  define(SM,		`')
+  define(CLRRSC,	`addic	$1, r0, 0')
+')
+ifdef(`OPERATION_submul_1',`
+  define(ADDSUBC,	subfe)
+  define(ADDSUB,	subfc)
+  define(func,		mpn_submul_1)
+  define(func_nc,	mpn_submul_1c)	C FIXME: not really supported
+  define(AM,		`')
+  define(SM,		`$1')
+  define(CLRRSC,	`subfc	$1, r0, r0')
+')
+
+ASM_START()
+PROLOGUE(func)
+	std	r31, -8(r1)
+	std	r30, -16(r1)
+	std	r29, -24(r1)
+	std	r28, -32(r1)
+	std	r27, -40(r1)
+
+	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
+	cmpdi	cr6, r0, 2
+	addi	n, n, 3		C compute count...
+	srdi	n, n, 2		C ...for ctr
+	mtctr	n		C copy loop count into ctr
+	beq	cr0, L(b0)
+	blt	cr6, L(b1)
+	beq	cr6, L(b2)
+
+L(b3):	ld	r8, 0(up)
+	ld	r7, 8(up)
+	ld	r27, 16(up)
+	addi	up, up, 16
+	addi	rp, rp, 16
+	mulld	r5,  r8, v0
+	mulhdu	r8,  r8, v0
+	mulld	r9,  r7, v0
+	mulhdu	r7,  r7, v0
+	mulld	r11, r27, v0
+	mulhdu	r27, r27, v0
+	ld	r29, -16(rp)
+	ld	r30, -8(rp)
+	ld	r31, 0(rp)
+	addc	r9, r9, r8
+	adde	r11, r11, r7
+	addze	r12, r27
+	ADDSUB	r5, r5, r29
+	b	L(l3)
+
+L(b2):	ld	r7, 0(up)
+	ld	r27, 8(up)
+	addi	up, up, 8
+	addi	rp, rp, 8
+	mulld	r9,  r7, v0
+	mulhdu	r7,  r7, v0
+	mulld	r11, r27, v0
+	mulhdu	r27, r27, v0
+	ld	r30, -8(rp)
+	ld	r31, 0(rp)
+	addc	r11, r11, r7
+	addze	r12, r27
+	ADDSUB	r9, r9, r30
+	b	L(l2)
+
+L(b1):	ld	r27, 0(up)
+	ld	r31, 0(rp)
+	mulld	r11, r27, v0
+	mulhdu	r12, r27, v0
+	ADDSUB	r11, r11, r31
+	b	L(l1)
+
+L(b0):	addi	up, up, -8
+	addi	rp, rp, -8
+	CLRRSC(	r12)		C clear r12 and clr/set cy
+
+	ALIGN(32)
+L(top):
+SM(`	subfe	r11, r0, r0')	C complement...
+SM(`	addic	r11, r11, 1')	C ...carry flag
+	ld	r10, 8(up)
+	ld	r8, 16(up)
+	ld	r7, 24(up)
+	ld	r27, 32(up)
+	addi	up, up, 32
+	addi	rp, rp, 32
+	mulld	r0,  r10, v0
+	mulhdu	r10, r10, v0
+	mulld	r5,  r8, v0
+	mulhdu	r8,  r8, v0
+	mulld	r9,  r7, v0
+	mulhdu	r7,  r7, v0
+	mulld	r11, r27, v0
+	mulhdu	r27, r27, v0
+	ld	r28, -24(rp)
+	adde	r0, r0, r12
+	ld	r29, -16(rp)
+	adde	r5, r5, r10
+	ld	r30, -8(rp)
+	ld	r31, 0(rp)
+	adde	r9, r9, r8
+	adde	r11, r11, r7
+	addze	r12, r27
+	ADDSUB	r0, r0, r28
+	std	r0, -24(rp)
+	ADDSUBC	r5, r5, r29
+L(l3):	std	r5, -16(rp)
+	ADDSUBC	r9, r9, r30
+L(l2):	std	r9, -8(rp)
+	ADDSUBC	r11, r11, r31
+L(l1):	std	r11, 0(rp)
+	bdnz	L(top)
+
+AM(`	addze	r3, r12')
+SM(`	subfe	r11, r0, r0')		C complement...
+	ld	r31, -8(r1)
+SM(`	subf	r3, r11, r12')
+	ld	r30, -16(r1)
+	ld	r29, -24(r1)
+	ld	r28, -32(r1)
+	ld	r27, -40(r1)
+	blr
+EPILOGUE()
-- 
cgit v1.2.1


From 682827871b8ddf4674d2233c852b516cbcd9c2a1 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Thu, 24 Nov 2011 12:13:26 +0100
Subject: (tune_mu_div, tune_mu_bdiv): Up min_size to karatsuba's threshold.

---
 tune/tuneup.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tune/tuneup.c b/tune/tuneup.c
index 4cc75eed1..bc7e8cc3d 100644
--- a/tune/tuneup.c
+++ b/tune/tuneup.c
@@ -1568,7 +1568,7 @@ tune_mu_div (void)
     param.name = "MU_DIV_QR_THRESHOLD";
     param.function = speed_mpn_dcpi1_div_qr;
     param.function2 = speed_mpn_mu_div_qr;
-    param.min_size = 6;
+    param.min_size = mul_toom22_threshold;
     param.max_size = 5000;
     param.step_factor = 0.02;
     one (&mu_div_qr_threshold, &param);
@@ -1578,7 +1578,7 @@ tune_mu_div (void)
     param.name = "MU_DIVAPPR_Q_THRESHOLD";
     param.function = speed_mpn_dcpi1_divappr_q;
     param.function2 = speed_mpn_mu_divappr_q;
-    param.min_size = 6;
+    param.min_size = mul_toom22_threshold;
     param.max_size = 5000;
     param.step_factor = 0.02;
     one (&mu_divappr_q_threshold, &param);
@@ -1627,7 +1627,7 @@ tune_mu_bdiv (void)
     param.name = "MU_BDIV_QR_THRESHOLD";
     param.function = speed_mpn_dcpi1_bdiv_qr;
     param.function2 = speed_mpn_mu_bdiv_qr;
-    param.min_size = 4;
+    param.min_size = mul_toom22_threshold;
     param.max_size = 5000;
     param.step_factor = 0.02;
     one (&mu_bdiv_qr_threshold, &param);
@@ -1637,7 +1637,7 @@ tune_mu_bdiv (void)
     param.name = "MU_BDIV_Q_THRESHOLD";
     param.function = speed_mpn_dcpi1_bdiv_q;
     param.function2 = speed_mpn_mu_bdiv_q;
-    param.min_size = 4;
+    param.min_size = mul_toom22_threshold;
     param.max_size = 5000;
     param.step_factor = 0.02;
     one (&mu_bdiv_q_threshold, &param);
-- 
cgit v1.2.1


From faeba6f2f2dfe18c15702387f1c2267f341a7783 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Thu, 24 Nov 2011 12:17:47 +0100
Subject: Add power7/32 tuning file.

---
 mpn/powerpc32/p7/gmp-mparam.h | 149 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 149 insertions(+)
 create mode 100644 mpn/powerpc32/p7/gmp-mparam.h

diff --git a/mpn/powerpc32/p7/gmp-mparam.h b/mpn/powerpc32/p7/gmp-mparam.h
new file mode 100644
index 000000000..bd18d4042
--- /dev/null
+++ b/mpn/powerpc32/p7/gmp-mparam.h
@@ -0,0 +1,149 @@
+/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2004, 2008, 2009,
+2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* 3550 MHz POWER7/T4 */
+
+#define DIVREM_1_NORM_THRESHOLD              0  /* always */
+#define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1P_METHOD                      1
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        34
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     15
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD           34
+
+#define MUL_TOOM22_THRESHOLD                20
+#define MUL_TOOM33_THRESHOLD                89
+#define MUL_TOOM44_THRESHOLD               130
+#define MUL_TOOM6H_THRESHOLD               286
+#define MUL_TOOM8H_THRESHOLD               363
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     121
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     114
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      89
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     113
+
+#define SQR_BASECASE_THRESHOLD               4
+#define SQR_TOOM2_THRESHOLD                 50
+#define SQR_TOOM3_THRESHOLD                 89
+#define SQR_TOOM4_THRESHOLD                154
+#define SQR_TOOM6_THRESHOLD                222
+#define SQR_TOOM8_THRESHOLD                381
+
+#define MULMID_TOOM42_THRESHOLD             40
+
+#define MULMOD_BNM1_THRESHOLD               18
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define POWM_SEC_TABLE  4,35,225,780,2212
+
+#define MUL_FFT_MODF_THRESHOLD             476  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    476, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     12, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     14, 5}, {     29, 6}, {     21, 7}, {     11, 6}, \
+    {     25, 7}, {     13, 6}, {     29, 7}, {     15, 6}, \
+    {     31, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     21, 8}, {     11, 7}, {     27, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     27, 9}, {     15, 8}, {     39, 9}, {     23, 8}, \
+    {     51,10}, {     15, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     79, 9}, {     47, 8}, {     95, 9}, \
+    {     55,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    159,10}, {     95,11}, {     63,10}, \
+    {    159,11}, {     95,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543, 8}, \
+    {   1087,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    335, 9}, {    671, 8}, {   1343,10}, {    351,11}, \
+    {    191,10}, {    415, 9}, {    831,10}, {    431,11}, \
+    {    223,12}, {   4096,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 77
+#define MUL_FFT_THRESHOLD                 5312
+
+#define SQR_FFT_MODF_THRESHOLD             344  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    344, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     21, 7}, {     11, 6}, {     24, 7}, {     13, 6}, \
+    {     27, 7}, {     15, 6}, {     31, 7}, {     21, 8}, \
+    {     11, 7}, {     27, 8}, {     15, 7}, {     33, 8}, \
+    {     19, 7}, {     39, 8}, {     27, 9}, {     15, 8}, \
+    {     39, 9}, {     23, 8}, {     47,10}, {     15, 9}, \
+    {     31, 8}, {     63, 9}, {     39, 8}, {     79, 9}, \
+    {     47,10}, {     31, 9}, {     79,10}, {     47,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
+    {    159,10}, {     95, 9}, {    191,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    511, 9}, {    271,10}, \
+    {    143, 9}, {    287, 8}, {    575, 9}, {    303,10}, \
+    {    159,11}, {     95,10}, {    191,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543, 8}, {   1087,10}, {    287, 9}, {    575,10}, \
+    {    303,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    335, 9}, {    671,10}, {    351, 9}, {    703,11}, \
+    {    191,10}, {    383, 9}, {    767,10}, {    415, 9}, \
+    {    831,11}, {    223,10}, {    447,12}, {   4096,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 79
+#define SQR_FFT_THRESHOLD                 3712
+
+#define MULLO_BASECASE_THRESHOLD             2
+#define MULLO_DC_THRESHOLD                  34
+#define MULLO_MUL_N_THRESHOLD            10323
+
+#define DC_DIV_QR_THRESHOLD                 52
+#define DC_DIVAPPR_Q_THRESHOLD             202
+#define DC_BDIV_QR_THRESHOLD                68
+#define DC_BDIV_Q_THRESHOLD                152
+
+#define INV_MULMOD_BNM1_THRESHOLD           66
+#define INV_NEWTON_THRESHOLD               226
+#define INV_APPR_THRESHOLD                 189
+
+#define BINV_NEWTON_THRESHOLD              292
+#define REDC_1_TO_REDC_N_THRESHOLD          79
+
+#define MU_DIV_QR_THRESHOLD               1442
+#define MU_DIVAPPR_Q_THRESHOLD            1442
+#define MUPI_DIV_QR_THRESHOLD               91
+#define MU_BDIV_QR_THRESHOLD              1308
+#define MU_BDIV_Q_THRESHOLD               1442
+
+#define MATRIX22_STRASSEN_THRESHOLD         16
+#define HGCD_THRESHOLD                     126
+#define HGCD_APPR_THRESHOLD                139
+#define HGCD_REDUCE_THRESHOLD             2681
+#define GCD_DC_THRESHOLD                   573
+#define GCDEXT_DC_THRESHOLD                448
+#define JACOBI_BASE_METHOD                   4
+
+#define GET_STR_DC_THRESHOLD                 9
+#define GET_STR_PRECOMPUTE_THRESHOLD        20
+#define SET_STR_DC_THRESHOLD               834
+#define SET_STR_PRECOMPUTE_THRESHOLD      1888
-- 
cgit v1.2.1


From 161de004453f214c6030bb9e9babd2d0048a7337 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Thu, 24 Nov 2011 12:19:09 +0100
Subject: Retune.

---
 mpn/powerpc32/p5/gmp-mparam.h        | 137 +++++++++++------------
 mpn/powerpc32/p6/gmp-mparam.h        | 206 ++++++++++++++++++-----------------
 mpn/powerpc64/mode64/p3/gmp-mparam.h |  73 +++++++------
 mpn/powerpc64/mode64/p6/gmp-mparam.h |  42 +++----
 4 files changed, 240 insertions(+), 218 deletions(-)

diff --git a/mpn/powerpc32/p5/gmp-mparam.h b/mpn/powerpc32/p5/gmp-mparam.h
index a8400ce65..ba210ecc4 100644
--- a/mpn/powerpc32/p5/gmp-mparam.h
+++ b/mpn/powerpc32/p5/gmp-mparam.h
@@ -30,114 +30,117 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MOD_1_UNNORM_THRESHOLD               0  /* always */
 #define MOD_1N_TO_MOD_1_1_THRESHOLD          8
 #define MOD_1U_TO_MOD_1_1_THRESHOLD          6
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD         8
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD        46
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     15
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         9
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        50
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     18
 #define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
 #define DIVEXACT_1_THRESHOLD                 0  /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD           62
+#define BMOD_1_TO_MOD_1_THRESHOLD           61
 
 #define MUL_TOOM22_THRESHOLD                22
-#define MUL_TOOM33_THRESHOLD                78
+#define MUL_TOOM33_THRESHOLD                57
 #define MUL_TOOM44_THRESHOLD               130
-#define MUL_TOOM6H_THRESHOLD               206
-#define MUL_TOOM8H_THRESHOLD               260
+#define MUL_TOOM6H_THRESHOLD               189
+#define MUL_TOOM8H_THRESHOLD               309
 
 #define MUL_TOOM32_TO_TOOM43_THRESHOLD      89
 #define MUL_TOOM32_TO_TOOM53_THRESHOLD      99
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD      85
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      83
 #define MUL_TOOM42_TO_TOOM63_THRESHOLD      88
 
-#define SQR_BASECASE_THRESHOLD               0  /* always */
-#define SQR_TOOM2_THRESHOLD                 42
+#define SQR_BASECASE_THRESHOLD               6
+#define SQR_TOOM2_THRESHOLD                 40
 #define SQR_TOOM3_THRESHOLD                 77
-#define SQR_TOOM4_THRESHOLD                169
-#define SQR_TOOM6_THRESHOLD                246
-#define SQR_TOOM8_THRESHOLD                381
+#define SQR_TOOM4_THRESHOLD                124
+#define SQR_TOOM6_THRESHOLD                140
+#define SQR_TOOM8_THRESHOLD                238
+
+#define MULMID_TOOM42_THRESHOLD             40
 
 #define MULMOD_BNM1_THRESHOLD               15
-#define SQRMOD_BNM1_THRESHOLD               18
+#define SQRMOD_BNM1_THRESHOLD               16
+
+#define POWM_SEC_TABLE  4,29,252,840,2080
 
-#define MUL_FFT_MODF_THRESHOLD             380  /* k = 5 */
+#define MUL_FFT_MODF_THRESHOLD             412  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
-  { {    380, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
-    {     13, 5}, {     27, 6}, {     21, 7}, {     11, 6}, \
-    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
-    {     31, 7}, {     21, 8}, {     11, 7}, {     27, 8}, \
-    {     15, 7}, {     33, 8}, {     19, 7}, {     39, 8}, \
-    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
-    {     39, 9}, {     23, 8}, {     51,10}, {     15, 9}, \
-    {     31, 8}, {     67, 9}, {     39, 8}, {     79, 9}, \
-    {     47,10}, {     31, 9}, {     79,10}, {     47,11}, \
-    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
-    {    159,10}, {     95, 9}, {    191,11}, {     63,10}, \
-    {    127, 9}, {    255,10}, {    143, 9}, {    287, 8}, \
-    {    575,10}, {    159,11}, {     95, 9}, {    383,12}, \
-    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
-    {    271, 9}, {    543,10}, {    287, 9}, {    575,11}, \
-    {    159,10}, {    319, 9}, {    639,10}, {    335, 9}, \
-    {    671,10}, {    351,11}, {    191,10}, {    383, 9}, \
-    {    767,10}, {    415, 9}, {    831,11}, {    223,12}, \
-    {   4096,13}, {   8192,14}, {  16384,15}, {  32768,16} }
-#define MUL_FFT_TABLE3_SIZE 76
+  { {    412, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     12, 5}, {     25, 6}, {     21, 7}, {     11, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     21, 8}, \
+    {     11, 7}, {     27, 8}, {     15, 7}, {     33, 8}, \
+    {     19, 7}, {     39, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     39, 9}, {     23, 8}, \
+    {     51,10}, {     15, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     79, 9}, {     55,10}, {     31, 9}, \
+    {     79,10}, {     47, 9}, {     95,11}, {     31,10}, \
+    {     63, 9}, {    135,10}, {     79, 9}, {    159,10}, \
+    {     95,11}, {     63,10}, {    127, 9}, {    255,10}, \
+    {    143, 9}, {    287,10}, {    159,11}, {     95,10}, \
+    {    191,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,10}, {    287,11}, \
+    {    159,10}, {    335, 9}, {    671,10}, {    351, 9}, \
+    {    703,11}, {    191,10}, {    383, 9}, {    767,10}, \
+    {    415, 9}, {    831,11}, {    223,12}, {   4096,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 71
 #define MUL_FFT_THRESHOLD                 4736
 
-#define SQR_FFT_MODF_THRESHOLD             316  /* k = 5 */
+#define SQR_FFT_MODF_THRESHOLD             340  /* k = 5 */
 #define SQR_FFT_TABLE3                                      \
-  { {    316, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+  { {    340, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
     {     21, 7}, {     11, 6}, {     24, 7}, {     13, 6}, \
-    {     27, 7}, {     15, 6}, {     31, 7}, {     21, 8}, \
-    {     11, 7}, {     27, 8}, {     15, 7}, {     33, 8}, \
-    {     19, 6}, {     77, 7}, {     39, 8}, {     23, 7}, \
-    {     47, 8}, {     27, 9}, {     15, 8}, {     39, 9}, \
-    {     23, 8}, {     47,10}, {     15, 7}, {    121, 9}, \
-    {     31, 8}, {     67, 9}, {     39, 8}, {     79, 9}, \
-    {     47,10}, {     31, 9}, {     79,10}, {     47,11}, \
-    {     31,10}, {     63, 9}, {    127, 8}, {    255,10}, \
-    {     79, 9}, {    159, 8}, {    319, 9}, {    175,10}, \
-    {     95, 9}, {    191, 8}, {    383,11}, {     63,10}, \
+    {     27, 7}, {     21, 8}, {     11, 7}, {     27, 8}, \
+    {     15, 7}, {     33, 8}, {     19, 7}, {     39, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
+    {     39, 9}, {     23, 8}, {     47,10}, {     15, 9}, \
+    {     31, 8}, {     67, 9}, {     47,10}, {     31, 9}, \
+    {     71,10}, {     47,11}, {     31,10}, {     63, 9}, \
+    {    127, 8}, {    255, 9}, {    135,10}, {     79, 9}, \
+    {    159,10}, {     95, 9}, {    191,11}, {     63,10}, \
     {    127, 9}, {    255, 8}, {    511, 9}, {    271,10}, \
     {    143, 9}, {    287, 8}, {    575, 9}, {    303,10}, \
-    {    159, 9}, {    319,10}, {    175,11}, {     95,10}, \
-    {    191, 9}, {    383,10}, {    207,12}, {     63,11}, \
+    {    159,11}, {     95,10}, {    191,12}, {     63,11}, \
     {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
     {    543,10}, {    287, 9}, {    575,10}, {    303,11}, \
     {    159,10}, {    319, 9}, {    639,10}, {    335, 9}, \
     {    671,10}, {    351,11}, {    191,10}, {    383, 9}, \
     {    767,10}, {    415,11}, {    223,10}, {    447,12}, \
     {   4096,13}, {   8192,14}, {  16384,15}, {  32768,16} }
-#define SQR_FFT_TABLE3_SIZE 88
+#define SQR_FFT_TABLE3_SIZE 76
 #define SQR_FFT_THRESHOLD                 3712
 
 #define MULLO_BASECASE_THRESHOLD             2
 #define MULLO_DC_THRESHOLD                  68
 #define MULLO_MUL_N_THRESHOLD             9236
 
-#define DC_DIV_QR_THRESHOLD                 70
-#define DC_DIVAPPR_Q_THRESHOLD             238
+#define DC_DIV_QR_THRESHOLD                 69
+#define DC_DIVAPPR_Q_THRESHOLD             220
 #define DC_BDIV_QR_THRESHOLD                75
 #define DC_BDIV_Q_THRESHOLD                188
 
 #define INV_MULMOD_BNM1_THRESHOLD           54
-#define INV_NEWTON_THRESHOLD               250
-#define INV_APPR_THRESHOLD                 246
+#define INV_NEWTON_THRESHOLD               230
+#define INV_APPR_THRESHOLD                 230
 
-#define BINV_NEWTON_THRESHOLD              375
+#define BINV_NEWTON_THRESHOLD              278
 #define REDC_1_TO_REDC_N_THRESHOLD          87
 
-#define MU_DIV_QR_THRESHOLD               1334
-#define MU_DIVAPPR_Q_THRESHOLD            1387
-#define MUPI_DIV_QR_THRESHOLD              114
-#define MU_BDIV_QR_THRESHOLD              1078
-#define MU_BDIV_Q_THRESHOLD               1334
+#define MU_DIV_QR_THRESHOLD               1210
+#define MU_DIVAPPR_Q_THRESHOLD            1308
+#define MUPI_DIV_QR_THRESHOLD              106
+#define MU_BDIV_QR_THRESHOLD              1017
+#define MU_BDIV_Q_THRESHOLD               1210
 
 #define MATRIX22_STRASSEN_THRESHOLD         14
-#define HGCD_THRESHOLD                     104
-#define GCD_DC_THRESHOLD                   424
-#define GCDEXT_DC_THRESHOLD                321
+#define HGCD_THRESHOLD                     110
+#define HGCD_APPR_THRESHOLD                138
+#define HGCD_REDUCE_THRESHOLD             2578
+#define GCD_DC_THRESHOLD                   408
+#define GCDEXT_DC_THRESHOLD                298
 #define JACOBI_BASE_METHOD                   4
 
-#define GET_STR_DC_THRESHOLD                12
-#define GET_STR_PRECOMPUTE_THRESHOLD        23
-#define SET_STR_DC_THRESHOLD               454
-#define SET_STR_PRECOMPUTE_THRESHOLD      1074
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        24
+#define SET_STR_DC_THRESHOLD               527
+#define SET_STR_PRECOMPUTE_THRESHOLD      1090
diff --git a/mpn/powerpc32/p6/gmp-mparam.h b/mpn/powerpc32/p6/gmp-mparam.h
index 73951d0ae..529a66d19 100644
--- a/mpn/powerpc32/p6/gmp-mparam.h
+++ b/mpn/powerpc32/p6/gmp-mparam.h
@@ -29,115 +29,127 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MOD_1_NORM_THRESHOLD                 3
 #define MOD_1_UNNORM_THRESHOLD               0  /* always */
 #define MOD_1N_TO_MOD_1_1_THRESHOLD          3
-#define MOD_1U_TO_MOD_1_1_THRESHOLD          8
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD        15
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      9
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD     MP_SIZE_T_MAX
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      8
 #define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
 #define DIVEXACT_1_THRESHOLD                 0  /* always */
 #define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
 
-#define MUL_TOOM22_THRESHOLD                34
-#define MUL_TOOM33_THRESHOLD                70
-#define MUL_TOOM44_THRESHOLD               187
-#define MUL_TOOM6H_THRESHOLD               286
-#define MUL_TOOM8H_THRESHOLD               321
+#define MUL_TOOM22_THRESHOLD                19
+#define MUL_TOOM33_THRESHOLD                55
+#define MUL_TOOM44_THRESHOLD                88
+#define MUL_TOOM6H_THRESHOLD               137
+#define MUL_TOOM8H_THRESHOLD               181
 
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD     110
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD     118
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD     107
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD     145
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      57
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      56
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      57
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      56
 
 #define SQR_BASECASE_THRESHOLD               0  /* always */
-#define SQR_TOOM2_THRESHOLD                 68
-#define SQR_TOOM3_THRESHOLD                113
-#define SQR_TOOM4_THRESHOLD                312
-#define SQR_TOOM6_THRESHOLD                330
-#define SQR_TOOM8_THRESHOLD                357
+#define SQR_TOOM2_THRESHOLD                 30
+#define SQR_TOOM3_THRESHOLD                 56
+#define SQR_TOOM4_THRESHOLD                130
+#define SQR_TOOM6_THRESHOLD                189
+#define SQR_TOOM8_THRESHOLD                296
 
-#define MULMOD_BNM1_THRESHOLD               19
-#define SQRMOD_BNM1_THRESHOLD               20
+#define MULMID_TOOM42_THRESHOLD             26
 
-#define MUL_FFT_MODF_THRESHOLD             304  /* k = 5 */
+#define MULMOD_BNM1_THRESHOLD                7
+#define SQRMOD_BNM1_THRESHOLD               12
+
+#define POWM_SEC_TABLE  2,26,127,453,1068
+
+#define MUL_FFT_MODF_THRESHOLD             212  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
-  { {    304, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
-    {     10, 5}, {     21, 6}, {     17, 7}, {      9, 6}, \
-    {     20, 7}, {     11, 6}, {     24, 7}, {     13, 8}, \
-    {      7, 7}, {     21, 8}, {     11, 7}, {     27, 9}, \
-    {      7, 8}, {     15, 7}, {     33, 8}, {     19, 7}, \
-    {     41, 8}, {     23, 7}, {     47, 8}, {     27, 9}, \
+  { {    212, 5}, {     13, 6}, {      7, 5}, {     15, 6}, \
+    {     13, 7}, {      7, 6}, {     16, 7}, {      9, 6}, \
+    {     19, 7}, {     13, 8}, {      7, 7}, {     19, 8}, \
+    {     11, 7}, {     25, 9}, {      7, 8}, {     15, 7}, \
+    {     31, 8}, {     19, 7}, {     39, 8}, {     23, 9}, \
     {     15, 8}, {     39, 9}, {     23, 8}, {     47,10}, \
-    {     15, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
-    {     79, 9}, {     47, 8}, {     95,10}, {     31, 9}, \
-    {     79,10}, {     47, 9}, {     95,11}, {     31,10}, \
-    {     63, 9}, {    127, 8}, {    255, 9}, {    135,10}, \
-    {     79, 9}, {    159, 8}, {    319,10}, {     95, 9}, \
-    {    191, 8}, {    383,11}, {     63,10}, {    127, 9}, \
-    {    255, 8}, {    511, 9}, {    271,10}, {    143, 9}, \
-    {    287,10}, {    159, 9}, {    319,11}, {     95,10}, \
-    {    191, 9}, {    383,12}, {     63,11}, {    127,10}, \
-    {    255, 9}, {    511,10}, {    271, 9}, {    543,10}, \
-    {    287,11}, {    159,10}, {    319, 9}, {    639,10}, \
-    {    351,11}, {    191,10}, {    383, 9}, {    767,10}, \
-    {    415,11}, {    223,10}, {    447,12}, {   4096,13}, \
-    {   8192,14}, {  16384,15}, {  32768,16} }
-#define MUL_FFT_TABLE3_SIZE 83
-#define MUL_FFT_THRESHOLD                 4736
-
-#define SQR_FFT_MODF_THRESHOLD             312  /* k = 5 */
-#define SQR_FFT_TABLE3                                      \
-  { {    312, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
-    {     21, 7}, {     11, 6}, {     24, 7}, {     13, 6}, \
-    {     27, 7}, {     17, 6}, {     35, 7}, {     21, 8}, \
-    {     11, 7}, {     27, 8}, {     15, 7}, {     33, 8}, \
-    {     19, 7}, {     39, 8}, {     23, 7}, {     47, 8}, \
-    {     27, 9}, {     15, 8}, {     39, 9}, {     23, 8}, \
-    {     47,10}, {     15, 9}, {     31, 8}, {     67, 9}, \
-    {     39, 8}, {     79, 9}, {     47,10}, {     31, 9}, \
+    {     15, 9}, {     31, 8}, {     63, 9}, {     39, 8}, \
+    {     79, 9}, {     47,10}, {     31, 9}, {     63, 8}, \
+    {    127, 9}, {     71, 8}, {    143, 7}, {    287, 9}, \
     {     79,10}, {     47,11}, {     31,10}, {     63, 9}, \
-    {    127, 8}, {    255,10}, {     79, 9}, {    159, 8}, \
-    {    319,10}, {     95, 9}, {    191,11}, {     63,10}, \
-    {    127, 9}, {    255, 8}, {    511, 9}, {    271,10}, \
+    {    127, 8}, {    255, 7}, {    511, 9}, {    143, 8}, \
+    {    287,10}, {     79, 9}, {    159, 8}, {    319, 9}, \
+    {    175, 8}, {    351,10}, {     95, 9}, {    191, 8}, \
+    {    383, 9}, {    207,10}, {    111,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    511,10}, {    143, 9}, \
+    {    287, 8}, {    575,10}, {    159, 9}, {    319,10}, \
+    {    175, 9}, {    351,11}, {     95,10}, {    191, 9}, \
+    {    383,10}, {    207, 9}, {    415,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    287, 9}, \
+    {    575,11}, {    159,10}, {    351, 9}, {    703,11}, \
+    {    191,10}, {    415, 9}, {    831,11}, {    223,10}, \
+    {    447,12}, {   4096,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 89
+#define MUL_FFT_THRESHOLD                 1728
+
+#define SQR_FFT_MODF_THRESHOLD             184  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    184, 5}, {      6, 4}, {     13, 5}, {     13, 6}, \
+    {      7, 5}, {     15, 6}, {     13, 7}, {      7, 6}, \
+    {     16, 7}, {      9, 6}, {     19, 7}, {     11, 6}, \
+    {     23, 7}, {     13, 8}, {      7, 7}, {     19, 8}, \
+    {     11, 7}, {     23, 9}, {      7, 8}, {     23, 9}, \
+    {     15, 8}, {     39, 9}, {     23,10}, {     15, 9}, \
+    {     31, 8}, {     63, 9}, {     39, 8}, {     79, 9}, \
+    {     47,10}, {     31, 9}, {     63, 8}, {    127, 7}, \
+    {    255, 9}, {     71, 8}, {    143, 7}, {    287, 6}, \
+    {    575, 9}, {     79,10}, {     47,11}, {     31,10}, \
+    {     63, 9}, {    127, 8}, {    255, 9}, {    143, 8}, \
+    {    287, 7}, {    575,10}, {     79, 9}, {    159, 8}, \
+    {    319, 9}, {    175, 8}, {    351,10}, {     95, 9}, \
+    {    191, 8}, {    383, 9}, {    207,10}, {    111, 9}, \
+    {    223,11}, {     63,10}, {    127, 9}, {    255,10}, \
     {    143, 9}, {    287, 8}, {    575,10}, {    159, 9}, \
-    {    319,11}, {     95,10}, {    191, 9}, {    383,12}, \
-    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
-    {    271, 9}, {    543,10}, {    287, 9}, {    575,11}, \
-    {    159,10}, {    319, 9}, {    639,10}, {    351,11}, \
-    {    191,10}, {    383, 9}, {    767,10}, {    415,11}, \
-    {    223,10}, {    447,12}, {   4096,13}, {   8192,14}, \
-    {  16384,15}, {  32768,16} }
-#define SQR_FFT_TABLE3_SIZE 78
-#define SQR_FFT_THRESHOLD                 2752
-
-#define MULLO_BASECASE_THRESHOLD             0  /* always */
-#define MULLO_DC_THRESHOLD                 151
-#define MULLO_MUL_N_THRESHOLD             1175
-
-#define DC_DIV_QR_THRESHOLD                133
-#define DC_DIVAPPR_Q_THRESHOLD             442
-#define DC_BDIV_QR_THRESHOLD               130
-#define DC_BDIV_Q_THRESHOLD                324
-
-#define INV_MULMOD_BNM1_THRESHOLD          116
-#define INV_NEWTON_THRESHOLD               507
-#define INV_APPR_THRESHOLD                 454
-
-#define BINV_NEWTON_THRESHOLD              507
-#define REDC_1_TO_REDC_N_THRESHOLD         118
-
-#define MU_DIV_QR_THRESHOLD               1652
-#define MU_DIVAPPR_Q_THRESHOLD            1752
-#define MUPI_DIV_QR_THRESHOLD              225
-#define MU_BDIV_QR_THRESHOLD               762
-#define MU_BDIV_Q_THRESHOLD               1017
-
-#define MATRIX22_STRASSEN_THRESHOLD         28
-#define HGCD_THRESHOLD                      76
-#define GCD_DC_THRESHOLD                   333
-#define GCDEXT_DC_THRESHOLD                245
+    {    319,10}, {    175, 9}, {    351,11}, {     95,10}, \
+    {    191, 9}, {    383,10}, {    207, 9}, {    415,10}, \
+    {    223,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    287, 9}, {    575,11}, {    159,10}, \
+    {    351, 9}, {    703, 8}, {   1407,11}, {    191,10}, \
+    {    415,11}, {    223,10}, {    447, 9}, {    895,12}, \
+    {   4096,13}, {   8192,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 92
+#define SQR_FFT_THRESHOLD                 1600
+
+#define MULLO_BASECASE_THRESHOLD             2
+#define MULLO_DC_THRESHOLD                  57
+#define MULLO_MUL_N_THRESHOLD             3176
+
+#define DC_DIV_QR_THRESHOLD                 52
+#define DC_DIVAPPR_Q_THRESHOLD             187
+#define DC_BDIV_QR_THRESHOLD                64
+#define DC_BDIV_Q_THRESHOLD                146
+
+#define INV_MULMOD_BNM1_THRESHOLD           68
+#define INV_NEWTON_THRESHOLD               182
+#define INV_APPR_THRESHOLD                 182
+
+#define BINV_NEWTON_THRESHOLD              186
+#define REDC_1_TO_REDC_N_THRESHOLD          60
+
+#define MU_DIV_QR_THRESHOLD                924
+#define MU_DIVAPPR_Q_THRESHOLD             807
+#define MUPI_DIV_QR_THRESHOLD               73
+#define MU_BDIV_QR_THRESHOLD               667
+#define MU_BDIV_Q_THRESHOLD                823
+
+#define MATRIX22_STRASSEN_THRESHOLD          8
+#define HGCD_THRESHOLD                      61
+#define HGCD_APPR_THRESHOLD                 50
+#define HGCD_REDUCE_THRESHOLD              974
+#define GCD_DC_THRESHOLD                   195
+#define GCDEXT_DC_THRESHOLD                134
 #define JACOBI_BASE_METHOD                   4
 
-#define GET_STR_DC_THRESHOLD                10
-#define GET_STR_PRECOMPUTE_THRESHOLD        20
-#define SET_STR_DC_THRESHOLD               199
-#define SET_STR_PRECOMPUTE_THRESHOLD       478
+#define GET_STR_DC_THRESHOLD                 9
+#define GET_STR_PRECOMPUTE_THRESHOLD        21
+#define SET_STR_DC_THRESHOLD               190
+#define SET_STR_PRECOMPUTE_THRESHOLD       411
diff --git a/mpn/powerpc64/mode64/p3/gmp-mparam.h b/mpn/powerpc64/mode64/p3/gmp-mparam.h
index 221b0e1d8..cf1d8ca47 100644
--- a/mpn/powerpc64/mode64/p3/gmp-mparam.h
+++ b/mpn/powerpc64/mode64/p3/gmp-mparam.h
@@ -23,12 +23,13 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 
 #define MOD_1_NORM_THRESHOLD                 0  /* always */
 #define MOD_1_UNNORM_THRESHOLD               0  /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD      MP_SIZE_T_MAX  /* never */
-#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD        16
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        14
 #define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD  MP_SIZE_T_MAX  /* never */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     17
 #define USE_PREINV_DIVREM_1                  0
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
 #define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
 #define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
 
@@ -36,22 +37,26 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MUL_TOOM33_THRESHOLD                33
 #define MUL_TOOM44_THRESHOLD                46
 #define MUL_TOOM6H_THRESHOLD                77
-#define MUL_TOOM8H_THRESHOLD               115
+#define MUL_TOOM8H_THRESHOLD               139
 
 #define MUL_TOOM32_TO_TOOM43_THRESHOLD      49
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD      38
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD      33
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD      32
-
-#define SQR_BASECASE_THRESHOLD               0  /* always */
-#define SQR_TOOM2_THRESHOLD                 16
-#define SQR_TOOM3_THRESHOLD                 49
-#define SQR_TOOM4_THRESHOLD                 70
-#define SQR_TOOM6_THRESHOLD                 93
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      48
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      49
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      49
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 14
+#define SQR_TOOM3_THRESHOLD                 45
+#define SQR_TOOM4_THRESHOLD                 64
+#define SQR_TOOM6_THRESHOLD                 85
 #define SQR_TOOM8_THRESHOLD                139
 
+#define MULMID_TOOM42_THRESHOLD             22
+
 #define MULMOD_BNM1_THRESHOLD                8
-#define SQRMOD_BNM1_THRESHOLD                9
+#define SQRMOD_BNM1_THRESHOLD               10
+
+#define POWM_SEC_TABLE  2,23,127,502,1421
 
 #define MUL_FFT_MODF_THRESHOLD             220  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
@@ -123,35 +128,37 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_FFT_TABLE3_SIZE 118
 #define SQR_FFT_THRESHOLD                 1728
 
-#define MULLO_BASECASE_THRESHOLD             3
-#define MULLO_DC_THRESHOLD                  28
-#define MULLO_MUL_N_THRESHOLD             4940
+#define MULLO_BASECASE_THRESHOLD             2
+#define MULLO_DC_THRESHOLD                  27
+#define MULLO_MUL_N_THRESHOLD             2367
 
-#define DC_DIV_QR_THRESHOLD                 27
-#define DC_DIVAPPR_Q_THRESHOLD              95
-#define DC_BDIV_QR_THRESHOLD                28
+#define DC_DIV_QR_THRESHOLD                 26
+#define DC_DIVAPPR_Q_THRESHOLD              87
+#define DC_BDIV_QR_THRESHOLD                27
 #define DC_BDIV_Q_THRESHOLD                 62
 
-#define INV_MULMOD_BNM1_THRESHOLD           29
-#define INV_NEWTON_THRESHOLD                92
-#define INV_APPR_THRESHOLD                  94
+#define INV_MULMOD_BNM1_THRESHOLD           34
+#define INV_NEWTON_THRESHOLD                91
+#define INV_APPR_THRESHOLD                  91
 
 #define BINV_NEWTON_THRESHOLD              115
-#define REDC_1_TO_REDC_N_THRESHOLD          30
+#define REDC_1_TO_REDC_N_THRESHOLD          31
 
 #define MU_DIV_QR_THRESHOLD                551
 #define MU_DIVAPPR_Q_THRESHOLD             551
-#define MUPI_DIV_QR_THRESHOLD               49
-#define MU_BDIV_QR_THRESHOLD               492
+#define MUPI_DIV_QR_THRESHOLD               50
+#define MU_BDIV_QR_THRESHOLD               474
 #define MU_BDIV_Q_THRESHOLD                492
 
-#define MATRIX22_STRASSEN_THRESHOLD          9
-#define HGCD_THRESHOLD                      55
-#define GCD_DC_THRESHOLD                   150
-#define GCDEXT_DC_THRESHOLD                124
+#define MATRIX22_STRASSEN_THRESHOLD          8
+#define HGCD_THRESHOLD                      53
+#define HGCD_APPR_THRESHOLD                 55
+#define HGCD_REDUCE_THRESHOLD              688
+#define GCD_DC_THRESHOLD                   148
+#define GCDEXT_DC_THRESHOLD                118
 #define JACOBI_BASE_METHOD                   1
 
-#define GET_STR_DC_THRESHOLD                17
+#define GET_STR_DC_THRESHOLD                16
 #define GET_STR_PRECOMPUTE_THRESHOLD        27
-#define SET_STR_DC_THRESHOLD               354
+#define SET_STR_DC_THRESHOLD               375
 #define SET_STR_PRECOMPUTE_THRESHOLD       812
diff --git a/mpn/powerpc64/mode64/p6/gmp-mparam.h b/mpn/powerpc64/mode64/p6/gmp-mparam.h
index bf7f0fd0c..5392138f1 100644
--- a/mpn/powerpc64/mode64/p6/gmp-mparam.h
+++ b/mpn/powerpc64/mode64/p6/gmp-mparam.h
@@ -39,26 +39,26 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MUL_TOOM33_THRESHOLD                50
 #define MUL_TOOM44_THRESHOLD               112
 #define MUL_TOOM6H_THRESHOLD               274
-#define MUL_TOOM8H_THRESHOLD               430
+#define MUL_TOOM8H_THRESHOLD               339
 
 #define MUL_TOOM32_TO_TOOM43_THRESHOLD      62
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD      84
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      76
 #define MUL_TOOM42_TO_TOOM53_THRESHOLD      73
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD      66
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      78
 
 #define SQR_BASECASE_THRESHOLD               0  /* always (native) */
 #define SQR_TOOM2_THRESHOLD                 24
 #define SQR_TOOM3_THRESHOLD                 49
 #define SQR_TOOM4_THRESHOLD                136
-#define SQR_TOOM6_THRESHOLD                274
-#define SQR_TOOM8_THRESHOLD                410
+#define SQR_TOOM6_THRESHOLD                226
+#define SQR_TOOM8_THRESHOLD                393
 
 #define MULMID_TOOM42_THRESHOLD             36
 
 #define MULMOD_BNM1_THRESHOLD               14
 #define SQRMOD_BNM1_THRESHOLD               14
 
-#define POWM_SEC_TABLE  4,19,228,713,919
+#define POWM_SEC_TABLE  4,23,213,840,2618
 
 #define MUL_FFT_MODF_THRESHOLD             340  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
@@ -112,35 +112,35 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 
 #define MULLO_BASECASE_THRESHOLD             5
 #define MULLO_DC_THRESHOLD                  61
-#define MULLO_MUL_N_THRESHOLD             5558
+#define MULLO_MUL_N_THRESHOLD             3271
 
-#define DC_DIV_QR_THRESHOLD                 29
-#define DC_DIVAPPR_Q_THRESHOLD             112
+#define DC_DIV_QR_THRESHOLD                 59
+#define DC_DIVAPPR_Q_THRESHOLD             200
 #define DC_BDIV_QR_THRESHOLD                70
 #define DC_BDIV_Q_THRESHOLD                168
 
 #define INV_MULMOD_BNM1_THRESHOLD           61
-#define INV_NEWTON_THRESHOLD                93
-#define INV_APPR_THRESHOLD                  91
+#define INV_NEWTON_THRESHOLD               166
+#define INV_APPR_THRESHOLD                 166
 
 #define BINV_NEWTON_THRESHOLD              222
 #define REDC_1_TO_REDC_N_THRESHOLD          63
 
-#define MU_DIV_QR_THRESHOLD                807
-#define MU_DIVAPPR_Q_THRESHOLD             807
-#define MUPI_DIV_QR_THRESHOLD               27
-#define MU_BDIV_QR_THRESHOLD               872
+#define MU_DIV_QR_THRESHOLD                998
+#define MU_DIVAPPR_Q_THRESHOLD             979
+#define MUPI_DIV_QR_THRESHOLD               59
+#define MU_BDIV_QR_THRESHOLD               889
 #define MU_BDIV_Q_THRESHOLD               1078
 
 #define MATRIX22_STRASSEN_THRESHOLD         13
-#define HGCD_THRESHOLD                      94
-#define HGCD_APPR_THRESHOLD                 55
-#define HGCD_REDUCE_THRESHOLD             2121
-#define GCD_DC_THRESHOLD                   253
-#define GCDEXT_DC_THRESHOLD                217
+#define HGCD_THRESHOLD                     109
+#define HGCD_APPR_THRESHOLD                108
+#define HGCD_REDUCE_THRESHOLD             1052
+#define GCD_DC_THRESHOLD                   501
+#define GCDEXT_DC_THRESHOLD                249
 #define JACOBI_BASE_METHOD                   4
 
 #define GET_STR_DC_THRESHOLD                16
 #define GET_STR_PRECOMPUTE_THRESHOLD        29
 #define SET_STR_DC_THRESHOLD               532
-#define SET_STR_PRECOMPUTE_THRESHOLD      1561
+#define SET_STR_PRECOMPUTE_THRESHOLD      1639
-- 
cgit v1.2.1


From 52b003cc5830e7fde5e3dca4338c170f3cdd9fe5 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Thu, 24 Nov 2011 12:39:37 +0100
Subject: *** empty log message ***

---
 mpn/s390_64/README | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 mpn/s390_64/README

diff --git a/mpn/s390_64/README b/mpn/s390_64/README
new file mode 100644
index 000000000..82b68a080
--- /dev/null
+++ b/mpn/s390_64/README
@@ -0,0 +1,77 @@
+Copyright 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+
+
+There are 5 generations of 64-bit s390 processors, z900, z990, z9,
+z10, and z196.  The current GMP code was optimised for the two oldest,
+z900 and z990.
+
+
+mpn_copyi
+
+This code makes use of a loop around MVC.  It almost surely runs very
+close to optimally.  A small improvement could be done by using one
+MVC for size 256 bytes, now we use two (we use an extra MVC when
+copying any multiple of 256 bytes).
+
+
+mpn_copyd
+
+We have tried several feed-in variants here, branch tree, jump table
+and computed goto.  The fastest (on z990) turned out to be computed
+goto.
+
+An approach not tried is EX of LMG and STMG, modifying the register set
+on-the-fly.  Using that trick, we could completely avoid using
+separate feed-in paths.
+
+
+mpn_lshift, mpn_rshift
+
+The current code runs at pipeline decode bandwidth on z990.
+
+
+mpn_add_n, mpn_sub_n
+
+The current code is 4-way unrolled.  It should be unrolled more, at
+least 8x, in order to reach 2.5 c/l.
+
+
+mpn_mul_1, mpn_addmul_1, mpn_submul_1
+
+The current code is very naive, but due to the non-pipelined nature of
+MLGR on z900 and z990, more sophisticated code would not gain much.
+
+On z10 one would need to cluster at least 4 MLGR together, in order to
+reduce stalling.
+
+On z196, one surely wants to use unrolling and pipelining, to perhaps
+reach around 12 c/l.  A major issue here and on z10 is ALCGR's 3 cycle
+stalling.
+
+
+mpn_mul_2, mpn_addmul_2
+
+At least for older machines (z900, z990) with very slow MLGR, we
+should use Karatsuba's algorithm on 2-limb units, making mul_2 and
+addmul_2 the main multiplication primitives.  The newer machines might
+benefit less from this approach, perhaps in particular z10, where MLGR
+clustering is more important.
+
+With Karatsuba, one could hope for around 16 cycles per accumulated
+128 cross product, on z990.
-- 
cgit v1.2.1


From 5c345ce60c939a92a920e984d36b4d4d52d4bae9 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Thu, 24 Nov 2011 12:41:46 +0100
Subject: *** empty log message ***

---
 ChangeLog | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index 80e0f7a32..ba4f47ede 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2011-11-24  Torbjorn Granlund  <tege@gmplib.org>
+
+	* mpn/powerpc32/p7/gmp-mparam.h: New file.
+
+	* tune/tuneup.c (tune_mu_div, tune_mu_bdiv): Up min_size to karatsuba's
+	threshold.
+
 2011-11-22  Torbjorn Granlund  <tege@gmplib.org>
 
 	* mpn/powerpc64/mode64/p6/aorsmul_1.asm: New file.
-- 
cgit v1.2.1


From cbc96e61b041e6ff713adf3885c610fdefa2023f Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Thu, 24 Nov 2011 22:05:28 +0100
Subject: (Formatted Output Strings): Clarify rules for mpf_t precision.

---
 doc/gmp.texi | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/doc/gmp.texi b/doc/gmp.texi
index 1d6538165..9e77abe7f 100644
--- a/doc/gmp.texi
+++ b/doc/gmp.texi
@@ -5909,7 +5909,7 @@ instance extensions registered with GLIBC @code{register_printf_function}.
 Also currently there's no support for POSIX @samp{$} style numbered arguments
 (perhaps this will be added in the future).
 
-The precision field has it's usual meaning for integer @samp{Z} and float
+The precision field has its usual meaning for integer @samp{Z} and float
 @samp{F} types, but is currently undefined for @samp{Q} and should not be used
 with that.
 
@@ -5920,7 +5920,10 @@ happens even for an @samp{f} conversion of an @code{mpf_t} which is an
 integer, for instance @math{2^@W{1024}} in an @code{mpf_t} of 128 bits
 precision will only produce about 40 digits, then pad with zeros to the
 decimal point.  An empty precision field like @samp{%.Fe} or @samp{%.Ff} can
-be used to specifically request just the significant digits.
+be used to specifically request just the significant digits.  Without any dot
+and thus no precision field, a precision value of 6 will be used.  Note that
+these rules mean that @samp{%Ff}, @samp{%.Ff}, and @samp{%.0Ff} will all be
+different.
 
 The decimal point character (or string) is taken from the current locale
 settings on systems which provide @code{localeconv} (@pxref{Locales,, Locales
-- 
cgit v1.2.1


From 853e7d21ab5471b137ac4f80258dd779d54061ba Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Thu, 24 Nov 2011 22:11:22 +0100
Subject: *** empty log message ***

---
 ChangeLog | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index ba4f47ede..761f9161b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,8 @@
 2011-11-24  Torbjorn Granlund  <tege@gmplib.org>
 
+	* doc/gmp.texi (Formatted Output Strings): Clarify rules for mpf_t
+	precision.
+
 	* mpn/powerpc32/p7/gmp-mparam.h: New file.
 
 	* tune/tuneup.c (tune_mu_div, tune_mu_bdiv): Up min_size to karatsuba's
-- 
cgit v1.2.1


From df16fd175d4cfcbd2d60cab0ca927c992e3185a4 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Fri, 25 Nov 2011 23:55:30 +0100
Subject: Overhaul x86/x86_64 support, merging three case statements into one.

---
 configure.in | 110 +++++++++++++++++++++++++++++------------------------------
 1 file changed, 55 insertions(+), 55 deletions(-)

diff --git a/configure.in b/configure.in
index 1d1ebd10f..28df31214 100644
--- a/configure.in
+++ b/configure.in
@@ -1468,46 +1468,62 @@ case $host in
       i386*)
         gcc_cflags_cpu="-mtune=i386 -mcpu=i386 -m386"
         gcc_cflags_arch="-march=i386"
+	path="x86"
         ;;
       i486*)
         gcc_cflags_cpu="-mtune=i486 -mcpu=i486 -m486"
         gcc_cflags_arch="-march=i486"
+	path="x86/i486 x86"
         ;;
       i586 | pentium)
         gcc_cflags_cpu="-mtune=pentium -mcpu=pentium -m486"
         gcc_cflags_arch="-march=pentium"
+	path="x86/pentium x86"
         ;;
       pentiummmx)
         gcc_cflags_cpu="-mtune=pentium-mmx -mcpu=pentium-mmx -mcpu=pentium -m486"
         gcc_cflags_arch="-march=pentium-mmx -march=pentium"
+	path="x86/pentium/mmx x86/pentium x86"
         ;;
       i686 | pentiumpro)
         gcc_cflags_cpu="-mtune=pentiumpro -mcpu=pentiumpro -mcpu=i486 -m486"
         gcc_cflags_arch="-march=pentiumpro -march=pentium"
+	path="x86/p6 x86"
         ;;
       pentium2)
         gcc_cflags_cpu="-mtune=pentium2 -mcpu=pentium2 -mcpu=pentiumpro -mcpu=i486 -m486"
         gcc_cflags_arch="-march=pentium2 -march=pentiumpro -march=pentium"
+	path="x86/p6/mmx x86/p6 x86"
         ;;
-      pentium3 | pentiumm)
+      pentium3)
         gcc_cflags_cpu="-mtune=pentium3 -mcpu=pentium3 -mcpu=pentiumpro -mcpu=i486 -m486"
         gcc_cflags_arch="-march=pentium3 -march=pentiumpro -march=pentium"
+	path="x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
+        ;;
+      pentiumm)
+        gcc_cflags_cpu="-mtune=pentium3 -mcpu=pentium3 -mcpu=pentiumpro -mcpu=i486 -m486"
+        gcc_cflags_arch="-march=pentium3 -march=pentiumpro -march=pentium"
+	path="x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
         ;;
       k6)
         gcc_cflags_cpu="-mtune=k6 -mcpu=k6 -mcpu=i486 -m486"
         gcc_cflags_arch="-march=k6"
+	path="x86/k6/mmx x86/k6 x86"
         ;;
       k62)
         gcc_cflags_cpu="-mtune=k6-2 -mcpu=k6-2 -mcpu=k6 -mcpu=i486 -m486"
         gcc_cflags_arch="-march=k6-2 -march=k6"
+	path="x86/k6/k62mmx x86/k6/mmx x86/k6 x86"
         ;;
       k63)
         gcc_cflags_cpu="-mtune=k6-3 -mcpu=k6-3 -mcpu=k6 -mcpu=i486 -m486"
         gcc_cflags_arch="-march=k6-3 -march=k6"
+	path="x86/k6/k62mmx x86/k6/mmx x86/k6 x86"
         ;;
       geode)
         gcc_cflags_cpu="-mtune=k6-3 -mcpu=k6-3 -mcpu=k6 -mcpu=i486 -m486"
         gcc_cflags_arch="-march=k6-3 -march=k6"
+	path="x86/geode x86/k6/k62mmx x86/k6/mmx x86/k6 x86"
         ;;
       athlon)
         # Athlon instruction costs are close to P6 (3 cycle load latency,
@@ -1515,6 +1531,7 @@ case $host in
         # know athlon (eg. 2.95.2 doesn't) then fall back on pentiumpro.
         gcc_cflags_cpu="-mtune=athlon -mcpu=athlon -mcpu=pentiumpro -mcpu=i486 -m486"
         gcc_cflags_arch="-march=athlon -march=pentiumpro -march=pentium"
+	path="x86/k7/mmx x86/k7 x86"
         ;;
       i786 | pentium4)
         # pentiumpro is the primary fallback when gcc doesn't know pentium4.
@@ -1524,77 +1541,84 @@ case $host in
         gcc_cflags_cpu="-mtune=pentium4 -mcpu=pentium4 -mcpu=pentiumpro -mcpu=i486 -m486"
         gcc_cflags_arch="-march=pentium4 -march=pentium4~-mno-sse2 -march=pentiumpro -march=pentium"
         gcc_64_cflags_cpu="-mtune=nocona"
+	path="x86/pentium4/sse2 x86/pentium4/mmx x86/pentium4 x86"
+	path_64="x86_64/pentium4 x86_64"
         ;;
       viac32)
         # Not sure of the best fallbacks here for -mcpu.
         # c3-2 has sse and mmx, so pentium3 is good for -march.
         gcc_cflags_cpu="-mtune=c3-2 -mcpu=c3-2 -mcpu=i486 -m486"
         gcc_cflags_arch="-march=c3-2 -march=pentium3 -march=pentiumpro -march=pentium"
+	path="x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
         ;;
       viac3*)
         # Not sure of the best fallbacks here.
         gcc_cflags_cpu="-mtune=c3 -mcpu=c3 -mcpu=i486 -m486"
         gcc_cflags_arch="-march=c3 -march=pentium-mmx -march=pentium"
+	path="x86/pentium/mmx x86/pentium x86"
         ;;
       athlon64 | k8 | x86_64)
         gcc_cflags_cpu="-mtune=k8 -mcpu=athlon -mcpu=pentiumpro -mcpu=i486 -m486"
         gcc_cflags_arch="-march=k8 -march=k8~-mno-sse2 -march=athlon -march=pentiumpro -march=pentium"
+	path="x86/k8 x86"
+	path_64="x86_64/k8 x86_64"
         ;;
       k10)
         gcc_cflags_cpu="-mtune=amdfam10 -mtune=k8"
-        gcc_cflags_arch="-march=amdfam10 -mtune=k8 -march=k8~-mno-sse2"
+        gcc_cflags_arch="-march=amdfam10 -march=k8 -march=k8~-mno-sse2"
+	path="x86/k10 x86/k8 x86"
+	path_64="x86_64/k10 x86_64/k8 x86_64"
         ;;
       bobcat)
         gcc_cflags_cpu="-mtune=btver1 -mtune=amdfam10 -mtune=k8"
-        gcc_cflags_arch="-march=btver1 -march=amdfam10 -mtune=k8 -march=k8~-mno-sse2"
+        gcc_cflags_arch="-march=btver1 -march=amdfam10 -march=k8 -march=k8~-mno-sse2"
+	path="x86/bobcat x86"
+	path_64="x86_64/bobcat x86_64/k10 x86_64/k8 x86_64"
         ;;
-      bulldozer)
+      bulldozer | bd1)
         gcc_cflags_cpu="-mtune=bdver1 -mtune=amdfam10 -mtune=k8"
-        gcc_cflags_arch="-march=bdver1 -march=amdfam10 -mtune=k8 -march=k8~-mno-sse2"
+        gcc_cflags_arch="-march=bdver1 -march=amdfam10 -march=k8 -march=k8~-mno-sse2"
+	path="x86/bd1 x86"
+	path_64="x86_64/bd1 x86_64"
         ;;
       core2)
         gcc_cflags_cpu="-mtune=core2 -mtune=k8"
         gcc_cflags_arch="-march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2"
+	path="x86/core2 x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
+	path_64="x86_64/core2 x86_64"
+       ;;
+      corei | coreinhm | coreiwsm)
+        gcc_cflags_cpu="-mtune=corei7 -mtune=core2 -mtune=k8"
+        gcc_cflags_arch="-march=corei7 -march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2"
+	path="x86/coreinhm x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
+	path_64="x86_64/coreinhm x86_64/core2 x86_64"
         ;;
-      corei | coreinhm | coreiwsm | coreisbr)
+      coreisbr)
         gcc_cflags_cpu="-mtune=corei7 -mtune=core2 -mtune=k8"
         gcc_cflags_arch="-march=corei7 -march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2"
+	path="x86/coreisbr x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
+	path_64="x86_64/coreisbr x86_64/coreinhm x86_64/core2 x86_64"
         ;;
       atom)
         gcc_cflags_cpu="-mtune=atom -mtune=pentium3"
         gcc_cflags_arch="-march=atom -march=pentium3"
+	path="x86/atom/sse2 x86/atom/mmx x86/atom x86"
+	path_64="x86_64/atom x86_64"
+        ;;
+      nano)
+        gcc_cflags_cpu="-mtune=nano"
+        gcc_cflags_arch="-march=nano"
+	path="x86/nano x86"
+	path_64="x86_64/nano x86_64"
         ;;
       *)
         gcc_cflags_cpu="-mtune=i486 -mcpu=i486 -m486"
         gcc_cflags_arch="-march=i486"
+	path="x86"
+	path_64="x86_64"
         ;;
     esac
 
-    case $host_cpu in
-      i386*)                path="x86" ;;
-      i486*)                path="x86/i486 x86" ;;
-      i586 | pentium)       path="x86/pentium x86" ;;
-      pentiummmx)           path="x86/pentium/mmx x86/pentium x86" ;;
-      i686 | pentiumpro)    path="x86/p6 x86" ;;
-      pentium2)             path="x86/p6/mmx x86/p6 x86" ;;
-      pentium3)             path="x86/p6/p3mmx x86/p6/mmx x86/p6 x86";;
-      pentiumm | core2 | corei | coreinhm | coreiwsm | coreisbr)
-                            path="x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86";;
-      [k6[23]])             path="x86/k6/k62mmx x86/k6/mmx x86/k6 x86" ;;
-      k6)                   path="x86/k6/mmx x86/k6 x86" ;;
-      geode)                path="x86/geode x86/k6/k62mmx x86/k6/mmx x86/k6 x86" ;;
-      # we don't have any specific 32-bit code for athlon64/opteron, the
-      # athlon code should be reasonable
-      athlon | athlon64 | k8 | k10 | bobcat | bulldozer)
-                            path="x86/k7/mmx x86/k7 x86" ;;
-      i786 | pentium4)      path="x86/pentium4/sse2 x86/pentium4/mmx x86/pentium4 x86" ;;
-      # VIA/Centaur processors, sold as CyrixIII and C3.
-      viac32)               path="x86/p6/p3mmx x86/p6/mmx x86/p6 x86";;
-      viac3*)               path="x86/pentium/mmx x86/pentium x86";;
-      atom)                 path="x86/atom/sse2 x86/atom/mmx x86/atom x86" ;;
-      *)                    path="x86" ;;
-    esac
-
     case $host in
       X86_64_PATTERN)
 	cclist_64="gcc"
@@ -1604,34 +1628,10 @@ case $host in
 	SPEED_CYCLECOUNTER_OBJ_64=x86_64.lo
 	cyclecounter_size_64=2
 	abilist="64 32"
-	path_64="x86_64"
 	if test "$enable_assembly" = "yes" ; then
 	    extra_functions_64="invert_limb_table"
 	fi
 
-	case $host_cpu in
-	  x86_64)
-	    ;;
-	  k10 | bulldozer)
-	    path_64="x86_64/k10 x86_64/k8 $path_64" ;;
-	  athlon64 | k8)
-	    path_64="x86_64/k8 $path_64" ;;
-	  bobcat)
-	    path_64="x86_64/bobcat x86_64/k10 x86_64/k8 $path_64" ;;
-	  pentium4)
-	    path_64="x86_64/pentium4 $path_64" ;;
-	  core2)
-	    path_64="x86_64/core2 $path_64" ;;
-	  corei | coreinhm | coreiwsm)
-	    path_64="x86_64/coreinhm x86_64/core2 $path_64" ;;
-	  coreisbr)
-	    path_64="x86_64/coreisbr x86_64/coreinhm x86_64/core2 $path_64" ;;
-	  atom)
-	    path_64="x86_64/atom $path_64" ;;
-	  nano)
-	    path_64="x86_64/nano $path_64" ;;
-	esac
-
 	case $host in
 	  *-*-solaris*)
 	    # Sun cc.
-- 
cgit v1.2.1


From d0600ffcd15e1baaadee4838ef966ae28eb6e695 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Fri, 25 Nov 2011 23:57:06 +0100
Subject: Many new gmp-mparam.h file for 64-bit CPUs in 32-bit mode.

---
 mpn/x86/bobcat/gmp-mparam.h   | 141 +++++++++++++++++++++++++++++++++++++++
 mpn/x86/core2/gmp-mparam.h    | 141 +++++++++++++++++++++++++++++++++++++++
 mpn/x86/coreinhm/gmp-mparam.h | 141 +++++++++++++++++++++++++++++++++++++++
 mpn/x86/coreisbr/gmp-mparam.h | 140 ++++++++++++++++++++++++++++++++++++++
 mpn/x86/k10/gmp-mparam.h      | 142 +++++++++++++++++++++++++++++++++++++++
 mpn/x86/k8/gmp-mparam.h       | 144 +++++++++++++++++++++++++++++++++++++++
 mpn/x86/nano/gmp-mparam.h     | 152 ++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 1001 insertions(+)
 create mode 100644 mpn/x86/bobcat/gmp-mparam.h
 create mode 100644 mpn/x86/core2/gmp-mparam.h
 create mode 100644 mpn/x86/coreinhm/gmp-mparam.h
 create mode 100644 mpn/x86/coreisbr/gmp-mparam.h
 create mode 100644 mpn/x86/k10/gmp-mparam.h
 create mode 100644 mpn/x86/k8/gmp-mparam.h
 create mode 100644 mpn/x86/nano/gmp-mparam.h

diff --git a/mpn/x86/bobcat/gmp-mparam.h b/mpn/x86/bobcat/gmp-mparam.h
new file mode 100644
index 000000000..58dfee1cf
--- /dev/null
+++ b/mpn/x86/bobcat/gmp-mparam.h
@@ -0,0 +1,141 @@
+/* x86/bobcat gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         12
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        18
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      7
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           40
+
+#define MUL_TOOM22_THRESHOLD                28
+#define MUL_TOOM33_THRESHOLD                85
+#define MUL_TOOM44_THRESHOLD               147
+#define MUL_TOOM6H_THRESHOLD               270
+#define MUL_TOOM8H_THRESHOLD               454
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      93
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     107
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      97
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     111
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 38
+#define SQR_TOOM3_THRESHOLD                101
+#define SQR_TOOM4_THRESHOLD                220
+#define SQR_TOOM6_THRESHOLD                303
+#define SQR_TOOM8_THRESHOLD                454
+
+#define MULMID_TOOM42_THRESHOLD             76
+
+#define MULMOD_BNM1_THRESHOLD               19
+#define SQRMOD_BNM1_THRESHOLD               23
+
+#define POWM_SEC_TABLE  2,17,225,357,2212
+
+#define MUL_FFT_MODF_THRESHOLD             888  /* k = 6 */
+#define MUL_FFT_TABLE3                                      \
+  { {    888, 6}, {     27, 7}, {     15, 6}, {     33, 7}, \
+    {     17, 6}, {     35, 7}, {     19, 6}, {     39, 7}, \
+    {     23, 6}, {     47, 7}, {     27, 8}, {     15, 7}, \
+    {     31, 6}, {     63, 7}, {     35, 8}, {     19, 7}, \
+    {     41, 8}, {     23, 7}, {     49, 8}, {     31, 7}, \
+    {     63, 8}, {     39, 7}, {     79, 9}, {     23, 8}, \
+    {     51, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     95, 9}, {     55,10}, \
+    {     31, 9}, {     63, 8}, {    127, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95, 9}, \
+    {    191,11}, {     63,10}, {    127, 9}, {    255,10}, \
+    {    159,11}, {     95,10}, {    191,12}, {     63,11}, \
+    {    127,10}, {    271,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    335,11}, {    191,10}, {    383, 9}, \
+    {    767,11}, {    223,12}, {   4096,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 66
+#define MUL_FFT_THRESHOLD                 7552
+
+#define SQR_FFT_MODF_THRESHOLD             730  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    730, 5}, {     25, 6}, {     13, 5}, {     28, 6}, \
+    {     15, 5}, {     31, 6}, {     27, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     23, 6}, {     47, 7}, {     27, 8}, \
+    {     15, 7}, {     31, 6}, {     63, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     49, 8}, \
+    {     31, 7}, {     63, 8}, {     39, 7}, {     79, 8}, \
+    {     43, 9}, {     23, 8}, {     51, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95,10}, {     31, 9}, {     63, 8}, {    127, 9}, \
+    {     79,10}, {     47, 9}, {     95,11}, {     31,10}, \
+    {     63, 9}, {    135,10}, {     79, 9}, {    159,10}, \
+    {     95, 9}, {    191,11}, {     63,10}, {    127, 9}, \
+    {    255,10}, {    159,11}, {     95,10}, {    191,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271,11}, {    159,10}, {    319, 9}, {    671,11}, \
+    {    191, 9}, {    767,12}, {   4096,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 70
+#define SQR_FFT_THRESHOLD                 7296
+
+#define MULLO_BASECASE_THRESHOLD             5
+#define MULLO_DC_THRESHOLD                  45
+#define MULLO_MUL_N_THRESHOLD            13463
+
+#define DC_DIV_QR_THRESHOLD                 72
+#define DC_DIVAPPR_Q_THRESHOLD             214
+#define DC_BDIV_QR_THRESHOLD                67
+#define DC_BDIV_Q_THRESHOLD                142
+
+#define INV_MULMOD_BNM1_THRESHOLD           71
+#define INV_NEWTON_THRESHOLD               250
+#define INV_APPR_THRESHOLD                 228
+
+#define BINV_NEWTON_THRESHOLD              270
+#define REDC_1_TO_REDC_N_THRESHOLD          71
+
+#define MU_DIV_QR_THRESHOLD               2089
+#define MU_DIVAPPR_Q_THRESHOLD            1822
+#define MUPI_DIV_QR_THRESHOLD              122
+#define MU_BDIV_QR_THRESHOLD              1787
+#define MU_BDIV_Q_THRESHOLD               1787
+
+#define MATRIX22_STRASSEN_THRESHOLD         21
+#define HGCD_THRESHOLD                      81
+#define HGCD_APPR_THRESHOLD                128
+#define HGCD_REDUCE_THRESHOLD             4455
+#define GCD_DC_THRESHOLD                   465
+#define GCDEXT_DC_THRESHOLD                345
+#define JACOBI_BASE_METHOD                   4
+
+#define GET_STR_DC_THRESHOLD                11
+#define GET_STR_PRECOMPUTE_THRESHOLD        32
+#define SET_STR_DC_THRESHOLD               270
+#define SET_STR_PRECOMPUTE_THRESHOLD       812
diff --git a/mpn/x86/core2/gmp-mparam.h b/mpn/x86/core2/gmp-mparam.h
new file mode 100644
index 000000000..feb0f281f
--- /dev/null
+++ b/mpn/x86/core2/gmp-mparam.h
@@ -0,0 +1,141 @@
+/* x86/core2 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */
+
+#define MOD_1_NORM_THRESHOLD                 4
+#define MOD_1_UNNORM_THRESHOLD               4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         9
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      7
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           19
+
+#define MUL_TOOM22_THRESHOLD                24
+#define MUL_TOOM33_THRESHOLD                93
+#define MUL_TOOM44_THRESHOLD               228
+#define MUL_TOOM6H_THRESHOLD               294
+#define MUL_TOOM8H_THRESHOLD               458
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      90
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     114
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      89
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      96
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 34
+#define SQR_TOOM3_THRESHOLD                116
+#define SQR_TOOM4_THRESHOLD                178
+#define SQR_TOOM6_THRESHOLD                262
+#define SQR_TOOM8_THRESHOLD                597
+
+#define MULMID_TOOM42_THRESHOLD             70
+
+#define MULMOD_BNM1_THRESHOLD               20
+#define SQRMOD_BNM1_THRESHOLD               19
+
+#define POWM_SEC_TABLE  6,26,262,991,2212
+
+#define MUL_FFT_MODF_THRESHOLD             690  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    690, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     15, 5}, {     31, 6}, {     25, 7}, {     13, 6}, \
+    {     27, 7}, {     15, 6}, {     33, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     23, 6}, \
+    {     47, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     31, 7}, {     63, 8}, {     39, 9}, {     23, 8}, \
+    {     51, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     95,10}, {     31, 9}, \
+    {     63, 8}, {    127, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    159,10}, {     95, 9}, {    191,11}, \
+    {     63,10}, {    127, 9}, {    255,10}, {    159,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    271, 9}, {    543,10}, {    287,11}, {    159,10}, \
+    {    319, 9}, {    639,11}, {    191,10}, {    383, 9}, \
+    {    799,11}, {    223,12}, {   4096,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 70
+#define MUL_FFT_THRESHOLD                 7552
+
+#define SQR_FFT_MODF_THRESHOLD             630  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    630, 5}, {     25, 6}, {     13, 5}, {     28, 6}, \
+    {     15, 5}, {     31, 6}, {     25, 7}, {     13, 6}, \
+    {     27, 7}, {     15, 6}, {     33, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     23, 6}, \
+    {     47, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     49, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     51, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    127,10}, {     79, 9}, {    159,10}, {     95,11}, \
+    {     63,10}, {    159,11}, {     95,10}, {    191,12}, \
+    {     63,11}, {    127,10}, {    271, 9}, {    543,11}, \
+    {    159,10}, {    319, 9}, {    671, 8}, {   1343,11}, \
+    {    191,10}, {    383, 9}, {    799,12}, {   4096,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 67
+#define SQR_FFT_THRESHOLD                 5760
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  30
+#define MULLO_MUL_N_THRESHOLD            13463
+
+#define DC_DIV_QR_THRESHOLD                 15
+#define DC_DIVAPPR_Q_THRESHOLD              49
+#define DC_BDIV_QR_THRESHOLD                76
+#define DC_BDIV_Q_THRESHOLD                190
+
+#define INV_MULMOD_BNM1_THRESHOLD           46
+#define INV_NEWTON_THRESHOLD                35
+#define INV_APPR_THRESHOLD                  35
+
+#define BINV_NEWTON_THRESHOLD              324
+#define REDC_1_TO_REDC_N_THRESHOLD          83
+
+#define MU_DIV_QR_THRESHOLD               1442
+#define MU_DIVAPPR_Q_THRESHOLD            1099
+#define MUPI_DIV_QR_THRESHOLD                0  /* always */
+#define MU_BDIV_QR_THRESHOLD              1589
+#define MU_BDIV_Q_THRESHOLD               1718
+
+#define MATRIX22_STRASSEN_THRESHOLD         31
+#define HGCD_THRESHOLD                     118
+#define HGCD_APPR_THRESHOLD                149
+#define HGCD_REDUCE_THRESHOLD             3524
+#define GCD_DC_THRESHOLD                   351
+#define GCDEXT_DC_THRESHOLD                309
+#define JACOBI_BASE_METHOD                   4
+
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        26
+#define SET_STR_DC_THRESHOLD               517
+#define SET_STR_PRECOMPUTE_THRESHOLD      1402
diff --git a/mpn/x86/coreinhm/gmp-mparam.h b/mpn/x86/coreinhm/gmp-mparam.h
new file mode 100644
index 000000000..21afeb619
--- /dev/null
+++ b/mpn/x86/coreinhm/gmp-mparam.h
@@ -0,0 +1,141 @@
+/* x86/coreinhm gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2011-11-25, gcc 4.5 */
+
+#define MOD_1_NORM_THRESHOLD                24
+#define MOD_1_UNNORM_THRESHOLD              15
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      5
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           16
+
+#define MUL_TOOM22_THRESHOLD                28
+#define MUL_TOOM33_THRESHOLD                81
+#define MUL_TOOM44_THRESHOLD               214
+#define MUL_TOOM6H_THRESHOLD               306
+#define MUL_TOOM8H_THRESHOLD               454
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     137
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     148
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     132
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     131
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 42
+#define SQR_TOOM3_THRESHOLD                149
+#define SQR_TOOM4_THRESHOLD                226
+#define SQR_TOOM6_THRESHOLD                333
+#define SQR_TOOM8_THRESHOLD                494
+
+#define MULMID_TOOM42_THRESHOLD             78
+
+#define MULMOD_BNM1_THRESHOLD               17
+#define SQRMOD_BNM1_THRESHOLD               21
+
+#define POWM_SEC_TABLE  2,33,294,1298,2870
+
+#define MUL_FFT_MODF_THRESHOLD             606  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    606, 5}, {     28, 6}, {     15, 5}, {     33, 6}, \
+    {     29, 7}, {     15, 6}, {     33, 7}, {     17, 6}, \
+    {     36, 7}, {     19, 6}, {     39, 7}, {     23, 6}, \
+    {     47, 7}, {     29, 8}, {     15, 7}, {     37, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     31, 7}, {     63, 8}, {     43, 9}, {     23, 8}, \
+    {     51, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     95,10}, {     31, 9}, \
+    {     79,10}, {     47, 9}, {     95,11}, {     31,10}, \
+    {     63, 9}, {    135,10}, {     79, 9}, {    159,10}, \
+    {     95, 9}, {    191,11}, {     63,10}, {    159,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335,11}, {    191,10}, \
+    {    383, 9}, {    767,10}, {    399,12}, {   4096,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 63
+#define MUL_FFT_THRESHOLD                 6784
+
+#define SQR_FFT_MODF_THRESHOLD             505  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    505, 5}, {     28, 6}, {     15, 5}, {     33, 6}, \
+    {     17, 5}, {     35, 6}, {     29, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     36, 7}, {     19, 6}, \
+    {     39, 7}, {     23, 6}, {     47, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 7}, {     55, 8}, \
+    {     31, 7}, {     63, 8}, {     43, 9}, {     23, 8}, \
+    {     55, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     95, 9}, {     55,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
+    {    159,10}, {     95,11}, {     63,10}, {    143, 9}, \
+    {    287,10}, {    159,11}, {     95,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,10}, {    287,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    335, 9}, {    671,10}, {    351,11}, \
+    {    191,10}, {    383, 9}, {    767,10}, {    399, 9}, \
+    {    799,10}, {    415,12}, {   4096,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 74
+#define SQR_FFT_THRESHOLD                 4800
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  35
+#define MULLO_MUL_N_THRESHOLD            13463
+
+#define DC_DIV_QR_THRESHOLD                 21
+#define DC_DIVAPPR_Q_THRESHOLD              42
+#define DC_BDIV_QR_THRESHOLD                84
+#define DC_BDIV_Q_THRESHOLD                156
+
+#define INV_MULMOD_BNM1_THRESHOLD           54
+#define INV_NEWTON_THRESHOLD                17
+#define INV_APPR_THRESHOLD                  17
+
+#define BINV_NEWTON_THRESHOLD              348
+#define REDC_1_TO_REDC_N_THRESHOLD          83
+
+#define MU_DIV_QR_THRESHOLD                979
+#define MU_DIVAPPR_Q_THRESHOLD             501
+#define MUPI_DIV_QR_THRESHOLD                0  /* always */
+#define MU_BDIV_QR_THRESHOLD              1589
+#define MU_BDIV_Q_THRESHOLD               1787
+
+#define MATRIX22_STRASSEN_THRESHOLD         20
+#define HGCD_THRESHOLD                      57
+#define HGCD_APPR_THRESHOLD                 50
+#define HGCD_REDUCE_THRESHOLD             3524
+#define GCD_DC_THRESHOLD                   253
+#define GCDEXT_DC_THRESHOLD                233
+#define JACOBI_BASE_METHOD                   4
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        20
+#define SET_STR_DC_THRESHOLD               127
+#define SET_STR_PRECOMPUTE_THRESHOLD       646
diff --git a/mpn/x86/coreisbr/gmp-mparam.h b/mpn/x86/coreisbr/gmp-mparam.h
new file mode 100644
index 000000000..16ef958ad
--- /dev/null
+++ b/mpn/x86/coreisbr/gmp-mparam.h
@@ -0,0 +1,140 @@
+/* x86/coreisbr gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2011-11-24, gcc 4.2 */
+
+#define MOD_1_NORM_THRESHOLD                24
+#define MOD_1_UNNORM_THRESHOLD              25
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      3
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           18
+
+#define MUL_TOOM22_THRESHOLD                28
+#define MUL_TOOM33_THRESHOLD               101
+#define MUL_TOOM44_THRESHOLD               244
+#define MUL_TOOM6H_THRESHOLD               351
+#define MUL_TOOM8H_THRESHOLD               547
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     109
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     183
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     109
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     109
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 48
+#define SQR_TOOM3_THRESHOLD                165
+#define SQR_TOOM4_THRESHOLD                276
+#define SQR_TOOM6_THRESHOLD                366
+#define SQR_TOOM8_THRESHOLD                572
+
+#define MULMID_TOOM42_THRESHOLD             98
+
+#define MULMOD_BNM1_THRESHOLD               20
+#define SQRMOD_BNM1_THRESHOLD               23
+
+#define POWM_SEC_TABLE  2,27,258,1052
+
+#define MUL_FFT_MODF_THRESHOLD             716  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    716, 5}, {     27, 6}, {     15, 5}, {     31, 6}, \
+    {     27, 7}, {     15, 6}, {     33, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     23, 6}, \
+    {     47, 7}, {     27, 8}, {     15, 7}, {     31, 6}, \
+    {     63, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     51, 8}, {     31, 7}, {     63, 8}, \
+    {     43, 9}, {     23, 8}, {     55, 9}, {     31, 8}, \
+    {     71, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     63, 8}, \
+    {    127, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
+    {    159,10}, {     95, 9}, {    191,11}, {     63,10}, \
+    {    127, 9}, {    255,10}, {    159,11}, {     95,10}, \
+    {    191,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271,11}, {    159,10}, {    319, 9}, \
+    {    639,11}, {    191,10}, {    383, 9}, {    767,11}, \
+    {    223,12}, {   4096,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 69
+#define MUL_FFT_THRESHOLD                 7552
+
+#define SQR_FFT_MODF_THRESHOLD             595  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    595, 5}, {     28, 6}, {     15, 5}, {     31, 6}, \
+    {     29, 7}, {     15, 6}, {     33, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     23, 6}, \
+    {     47, 7}, {     35, 8}, {     19, 7}, {     43, 8}, \
+    {     23, 7}, {     49, 8}, {     31, 7}, {     63, 8}, \
+    {     43, 9}, {     23, 8}, {     55, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     63, 8}, \
+    {    127, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
+    {    159,10}, {     95,11}, {     63,10}, {    159,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543,11}, \
+    {    159,10}, {    319, 9}, {    671,11}, {    191,10}, \
+    {    383, 9}, {    767,10}, {    399,12}, {   4096,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 63
+#define SQR_FFT_THRESHOLD                 5760
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                 100
+#define MULLO_MUL_N_THRESHOLD            14379
+
+#define DC_DIV_QR_THRESHOLD                 22
+#define DC_DIVAPPR_Q_THRESHOLD              30
+#define DC_BDIV_QR_THRESHOLD               120
+#define DC_BDIV_Q_THRESHOLD                268
+
+#define INV_MULMOD_BNM1_THRESHOLD           54
+#define INV_NEWTON_THRESHOLD                12
+#define INV_APPR_THRESHOLD                  13
+
+#define BINV_NEWTON_THRESHOLD              410
+#define REDC_1_TO_REDC_N_THRESHOLD         100
+
+#define MU_DIV_QR_THRESHOLD               1037
+#define MU_DIVAPPR_Q_THRESHOLD             889
+#define MUPI_DIV_QR_THRESHOLD                0  /* always */
+#define MU_BDIV_QR_THRESHOLD              1858
+#define MU_BDIV_Q_THRESHOLD               2172
+
+#define MATRIX22_STRASSEN_THRESHOLD         21
+#define HGCD_THRESHOLD                      59
+#define HGCD_APPR_THRESHOLD                 56
+#define HGCD_REDUCE_THRESHOLD             4818
+#define GCD_DC_THRESHOLD                   278
+#define GCDEXT_DC_THRESHOLD                298
+#define JACOBI_BASE_METHOD                   4
+
+#define GET_STR_DC_THRESHOLD                11
+#define GET_STR_PRECOMPUTE_THRESHOLD        23
+#define SET_STR_DC_THRESHOLD               438
+#define SET_STR_PRECOMPUTE_THRESHOLD      1206
diff --git a/mpn/x86/k10/gmp-mparam.h b/mpn/x86/k10/gmp-mparam.h
new file mode 100644
index 000000000..5c036223c
--- /dev/null
+++ b/mpn/x86/k10/gmp-mparam.h
@@ -0,0 +1,142 @@
+/* x86/k10 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         12
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          6
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        16
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     10
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           32
+
+#define MUL_TOOM22_THRESHOLD                24
+#define MUL_TOOM33_THRESHOLD                77
+#define MUL_TOOM44_THRESHOLD               127
+#define MUL_TOOM6H_THRESHOLD               270
+#define MUL_TOOM8H_THRESHOLD               357
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      77
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      99
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      85
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      90
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 32
+#define SQR_TOOM3_THRESHOLD                 97
+#define SQR_TOOM4_THRESHOLD                154
+#define SQR_TOOM6_THRESHOLD                336
+#define SQR_TOOM8_THRESHOLD                527
+
+#define MULMID_TOOM42_THRESHOLD             54
+
+#define MULMOD_BNM1_THRESHOLD               15
+#define SQRMOD_BNM1_THRESHOLD               19
+
+#define POWM_SEC_TABLE  4,32,164,879,2178
+
+#define MUL_FFT_MODF_THRESHOLD             786  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    786, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     23, 6}, {     47, 7}, {     27, 8}, \
+    {     15, 7}, {     31, 6}, {     63, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     51, 9}, {     31, 8}, \
+    {     63, 9}, {     39, 8}, {     83, 9}, {     47,10}, \
+    {     31, 9}, {     63, 8}, {    127, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95, 9}, \
+    {    191,10}, {    111,11}, {     63,10}, {    127, 9}, \
+    {    255, 7}, {   1023, 8}, {    543, 9}, {    279,10}, \
+    {    159,11}, {     95,10}, {    191,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543, 8}, {   1087,10}, {    287,11}, {    159, 9}, \
+    {    671,11}, {    191,10}, {    399, 9}, {    799,12}, \
+    {   4096,13}, {   8192,14}, {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 76
+#define MUL_FFT_THRESHOLD                 7424
+
+#define SQR_FFT_MODF_THRESHOLD             660  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    660, 5}, {     25, 6}, {     13, 5}, {     28, 6}, \
+    {     25, 7}, {     13, 6}, {     28, 7}, {     15, 6}, \
+    {     31, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     23, 6}, {     47, 7}, {     27, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     39, 8}, \
+    {     23, 7}, {     47, 8}, {     31, 7}, {     63, 8}, \
+    {     35, 7}, {     71, 8}, {     39, 9}, {     23, 8}, \
+    {     55,10}, {     15, 9}, {     31, 8}, {     63, 9}, \
+    {     39, 8}, {     79, 9}, {     47, 8}, {     95, 9}, \
+    {     55,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    167,10}, {     95,11}, {     63,10}, \
+    {    159,11}, {     95, 8}, {    799,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    543,11}, {    159, 9}, \
+    {    639,10}, {    367,11}, {    191,10}, {    383, 9}, \
+    {    799,10}, {    415,11}, {    223,12}, {   4096,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 67
+#define SQR_FFT_THRESHOLD                 5760
+
+#define MULLO_BASECASE_THRESHOLD             6
+#define MULLO_DC_THRESHOLD                  42
+#define MULLO_MUL_N_THRESHOLD            13463
+
+#define DC_DIV_QR_THRESHOLD                 56
+#define DC_DIVAPPR_Q_THRESHOLD             248
+#define DC_BDIV_QR_THRESHOLD                55
+#define DC_BDIV_Q_THRESHOLD                160
+
+#define INV_MULMOD_BNM1_THRESHOLD           54
+#define INV_NEWTON_THRESHOLD               250
+#define INV_APPR_THRESHOLD                 250
+
+#define BINV_NEWTON_THRESHOLD              276
+#define REDC_1_TO_REDC_N_THRESHOLD          67
+
+#define MU_DIV_QR_THRESHOLD               1718
+#define MU_DIVAPPR_Q_THRESHOLD            1652
+#define MUPI_DIV_QR_THRESHOLD              114
+#define MU_BDIV_QR_THRESHOLD              1470
+#define MU_BDIV_Q_THRESHOLD               1589
+
+#define MATRIX22_STRASSEN_THRESHOLD         16
+#define HGCD_THRESHOLD                     131
+#define HGCD_APPR_THRESHOLD                163
+#define HGCD_REDUCE_THRESHOLD             3810
+#define GCD_DC_THRESHOLD                   555
+#define GCDEXT_DC_THRESHOLD                389
+#define JACOBI_BASE_METHOD                   4
+
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        28
+#define SET_STR_DC_THRESHOLD               140
+#define SET_STR_PRECOMPUTE_THRESHOLD      1334
diff --git a/mpn/x86/k8/gmp-mparam.h b/mpn/x86/k8/gmp-mparam.h
new file mode 100644
index 000000000..727a381f1
--- /dev/null
+++ b/mpn/x86/k8/gmp-mparam.h
@@ -0,0 +1,144 @@
+/* x86/k8 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               3
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        12
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     12
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           42
+
+#define MUL_TOOM22_THRESHOLD                26
+#define MUL_TOOM33_THRESHOLD                81
+#define MUL_TOOM44_THRESHOLD               136
+#define MUL_TOOM6H_THRESHOLD               286
+#define MUL_TOOM8H_THRESHOLD               430
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      91
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      93
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      96
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 46
+#define SQR_TOOM3_THRESHOLD                 77
+#define SQR_TOOM4_THRESHOLD                202
+#define SQR_TOOM6_THRESHOLD                294
+#define SQR_TOOM8_THRESHOLD                430
+
+#define MULMID_TOOM42_THRESHOLD             74
+
+#define MULMOD_BNM1_THRESHOLD               17
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define POWM_SEC_TABLE  2,14,216,991,2658
+
+#define MUL_FFT_MODF_THRESHOLD             888  /* k = 6 */
+#define MUL_FFT_TABLE3                                      \
+  { {    888, 6}, {     15, 5}, {     31, 6}, {     25, 7}, \
+    {     13, 6}, {     27, 7}, {     15, 6}, {     33, 7}, \
+    {     17, 6}, {     35, 7}, {     19, 6}, {     39, 7}, \
+    {     23, 6}, {     47, 7}, {     27, 8}, {     15, 7}, \
+    {     31, 6}, {     63, 7}, {     35, 8}, {     19, 7}, \
+    {     41, 8}, {     23, 7}, {     47, 8}, {     31, 7}, \
+    {     63, 8}, {     39, 7}, {     79, 9}, {     23, 8}, \
+    {     51, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     95, 9}, {     55,10}, \
+    {     31, 9}, {     63, 8}, {    127, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    167,10}, {     95, 9}, \
+    {    191,10}, {    111,11}, {     63,10}, {    127, 9}, \
+    {    255,10}, {    159,11}, {     95,10}, {    191,12}, \
+    {     63,11}, {    127,10}, {    271, 9}, {    543,10}, \
+    {    287,11}, {    159,10}, {    335,11}, {    191,10}, \
+    {    383, 9}, {    767,10}, {    399, 9}, {    799,11}, \
+    {    223,12}, {   4096,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 73
+#define MUL_FFT_THRESHOLD                 7552
+
+#define SQR_FFT_MODF_THRESHOLD             758  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    758, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     32, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     23, 6}, {     47, 7}, {     27, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     47, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     51, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     63, 8}, \
+    {    127, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
+    {    159,10}, {     95, 9}, {    191,10}, {    111,11}, \
+    {     63,10}, {    127, 9}, {    255,10}, {    159,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543, 8}, \
+    {   1087,10}, {    287,11}, {    159,10}, {    319, 9}, \
+    {    671,11}, {    191,10}, {    383, 9}, {    767,10}, \
+    {    399, 9}, {    799,12}, {   4096,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 74
+#define SQR_FFT_THRESHOLD                 7296
+
+#define MULLO_BASECASE_THRESHOLD             8
+#define MULLO_DC_THRESHOLD                  35
+#define MULLO_MUL_N_THRESHOLD            13463
+
+#define DC_DIV_QR_THRESHOLD                 91
+#define DC_DIVAPPR_Q_THRESHOLD             278
+#define DC_BDIV_QR_THRESHOLD                87
+#define DC_BDIV_Q_THRESHOLD                216
+
+#define INV_MULMOD_BNM1_THRESHOLD           62
+#define INV_NEWTON_THRESHOLD               262
+#define INV_APPR_THRESHOLD                 262
+
+#define BINV_NEWTON_THRESHOLD              278
+#define REDC_1_TO_REDC_N_THRESHOLD          79
+
+#define MU_DIV_QR_THRESHOLD               1787
+#define MU_DIVAPPR_Q_THRESHOLD            1718
+#define MUPI_DIV_QR_THRESHOLD              106
+#define MU_BDIV_QR_THRESHOLD              1470
+#define MU_BDIV_Q_THRESHOLD               1589
+
+#define MATRIX22_STRASSEN_THRESHOLD         19
+#define HGCD_THRESHOLD                     139
+#define HGCD_APPR_THRESHOLD                176
+#define HGCD_REDUCE_THRESHOLD             4633
+#define GCD_DC_THRESHOLD                   610
+#define GCDEXT_DC_THRESHOLD                419
+#define JACOBI_BASE_METHOD                   4
+
+#define GET_STR_DC_THRESHOLD                14
+#define GET_STR_PRECOMPUTE_THRESHOLD        29
+#define SET_STR_DC_THRESHOLD               450
+#define SET_STR_PRECOMPUTE_THRESHOLD      1366
diff --git a/mpn/x86/nano/gmp-mparam.h b/mpn/x86/nano/gmp-mparam.h
new file mode 100644
index 000000000..5fa509372
--- /dev/null
+++ b/mpn/x86/nano/gmp-mparam.h
@@ -0,0 +1,152 @@
+/* x86/nano gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */
+
+#define MOD_1_1P_METHOD                      1
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               3
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          9
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        53
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     12
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           32
+
+#define MUL_TOOM22_THRESHOLD                16
+#define MUL_TOOM33_THRESHOLD               132
+#define MUL_TOOM44_THRESHOLD               195
+#define MUL_TOOM6H_THRESHOLD               270
+#define MUL_TOOM8H_THRESHOLD               478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     129
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     138
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     130
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     135
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 28
+#define SQR_TOOM3_THRESHOLD                194
+#define SQR_TOOM4_THRESHOLD                502
+#define SQR_TOOM6_THRESHOLD                746
+#define SQR_TOOM8_THRESHOLD               1005
+
+#define MULMID_TOOM42_THRESHOLD             40
+
+#define MULMOD_BNM1_THRESHOLD               14
+#define SQRMOD_BNM1_THRESHOLD               19
+
+#define POWM_SEC_TABLE  4,23,258,828,2246
+
+#define MUL_FFT_MODF_THRESHOLD             308  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    308, 5}, {     13, 6}, {      7, 5}, {     17, 6}, \
+    {      9, 5}, {     19, 6}, {     11, 5}, {     23, 6}, \
+    {     13, 7}, {      7, 6}, {     17, 7}, {      9, 6}, \
+    {     19, 7}, {     11, 6}, {     24, 7}, {     15, 6}, \
+    {     31, 7}, {     19, 8}, {     11, 7}, {     25, 8}, \
+    {     15, 7}, {     33, 8}, {     19, 7}, {     39, 8}, \
+    {     23, 7}, {     47, 9}, {     15, 8}, {     31, 7}, \
+    {     63, 8}, {     39, 9}, {     23, 8}, {     47,10}, \
+    {     15, 9}, {     31, 8}, {     63, 9}, {     47,10}, \
+    {     31, 9}, {     71,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    127, 8}, {    255,10}, \
+    {     79, 9}, {    159,10}, {     95, 9}, {    191,11}, \
+    {     63,10}, {    127, 9}, {    255, 8}, {    543, 9}, \
+    {    287, 8}, {    575, 7}, {   1215,10}, {    159,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    543, 8}, {   1087,10}, {    287, 9}, \
+    {    607, 8}, {   1215,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    351, 9}, {    703, 8}, {   1407, 9}, \
+    {    735, 8}, {   1471,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    415, 9}, {    831,11}, {    223,10}, \
+    {    447, 9}, {    895,10}, {    479, 9}, {    959, 8}, \
+    {   1919,12}, {   4096,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 89
+#define MUL_FFT_THRESHOLD                 1856
+
+#define SQR_FFT_MODF_THRESHOLD             396  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    396, 5}, {     13, 6}, {      7, 5}, {     21, 6}, \
+    {     11, 5}, {     23, 6}, {     21, 7}, {     11, 6}, \
+    {     25, 7}, {     15, 6}, {     31, 7}, {     19, 6}, \
+    {     39, 7}, {     21, 8}, {     11, 7}, {     23, 6}, \
+    {     47, 7}, {     27, 8}, {     15, 7}, {     33, 8}, \
+    {     19, 7}, {     39, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     47,10}, {     15, 9}, \
+    {     31, 8}, {     63, 9}, {     39, 8}, {     79, 9}, \
+    {     47,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    127,10}, \
+    {     79, 9}, {    159,10}, {     95,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    543,10}, {    143, 9}, \
+    {    287, 8}, {    607, 7}, {   1215, 6}, {   2431,10}, \
+    {    159, 8}, {    639,11}, {     95,10}, {    191,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    543, 8}, \
+    {   1087,10}, {    287, 9}, {    607, 8}, {   1215,11}, \
+    {    159,10}, {    319, 9}, {    671,10}, {    351, 9}, \
+    {    703, 8}, {   1407, 9}, {    735, 8}, {   1471, 7}, \
+    {   2943,11}, {    191,10}, {    383, 9}, {    799,10}, \
+    {    415, 9}, {    895,10}, {    479,12}, {   4096,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 87
+#define SQR_FFT_THRESHOLD                 2368
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  51
+#define MULLO_MUL_N_THRESHOLD             3369
+
+#define DC_DIV_QR_THRESHOLD                 56
+#define DC_DIVAPPR_Q_THRESHOLD             183
+#define DC_BDIV_QR_THRESHOLD                55
+#define DC_BDIV_Q_THRESHOLD                118
+
+#define INV_MULMOD_BNM1_THRESHOLD           30
+#define INV_NEWTON_THRESHOLD               266
+#define INV_APPR_THRESHOLD                 218
+
+#define BINV_NEWTON_THRESHOLD              268
+#define REDC_1_TO_REDC_N_THRESHOLD          56
+
+#define MU_DIV_QR_THRESHOLD               1308
+#define MU_DIVAPPR_Q_THRESHOLD            1528
+#define MUPI_DIV_QR_THRESHOLD              124
+#define MU_BDIV_QR_THRESHOLD               855
+#define MU_BDIV_Q_THRESHOLD               1334
+
+#define MATRIX22_STRASSEN_THRESHOLD         14
+#define HGCD_THRESHOLD                     104
+#define HGCD_APPR_THRESHOLD                139
+#define HGCD_REDUCE_THRESHOLD             2121
+#define GCD_DC_THRESHOLD                   456
+#define GCDEXT_DC_THRESHOLD                321
+#define JACOBI_BASE_METHOD                   4
+
+#define GET_STR_DC_THRESHOLD                11
+#define GET_STR_PRECOMPUTE_THRESHOLD        25
+#define SET_STR_DC_THRESHOLD               542
+#define SET_STR_PRECOMPUTE_THRESHOLD       840
-- 
cgit v1.2.1


From 60d13bded47cd6c09d6ce761347a675b91ecead5 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Fri, 25 Nov 2011 23:57:13 +0100
Subject: *** empty log message ***

---
 ChangeLog | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index 761f9161b..5f69c758f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2011-11-25  Torbjorn Granlund  <tege@gmplib.org>
+
+	* x86/*: Many new gmp-mparam.h files for 64-bit CPUs in 32-bit mode.
+
+	* configure.in: Overhaul x86/x86_64 support, merging three case
+	statements into one.
+
 2011-11-24  Torbjorn Granlund  <tege@gmplib.org>
 
 	* doc/gmp.texi (Formatted Output Strings): Clarify rules for mpf_t
-- 
cgit v1.2.1


From 3d1e2a383827ce817e2dca7c2eabf61f8ebf2245 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Sat, 26 Nov 2011 10:56:14 +0100
Subject: Reinsert x86 path components accidentally lost in major edit.

---
 configure.in | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/configure.in b/configure.in
index 28df31214..580b3e427 100644
--- a/configure.in
+++ b/configure.in
@@ -1560,25 +1560,25 @@ case $host in
       athlon64 | k8 | x86_64)
         gcc_cflags_cpu="-mtune=k8 -mcpu=athlon -mcpu=pentiumpro -mcpu=i486 -m486"
         gcc_cflags_arch="-march=k8 -march=k8~-mno-sse2 -march=athlon -march=pentiumpro -march=pentium"
-	path="x86/k8 x86"
+	path="x86/k8 x86/k7/mmx x86/k7 x86"
 	path_64="x86_64/k8 x86_64"
         ;;
       k10)
         gcc_cflags_cpu="-mtune=amdfam10 -mtune=k8"
         gcc_cflags_arch="-march=amdfam10 -march=k8 -march=k8~-mno-sse2"
-	path="x86/k10 x86/k8 x86"
+	path="x86/k10 x86/k8 x86/k7/mmx x86/k7 x86"
 	path_64="x86_64/k10 x86_64/k8 x86_64"
         ;;
       bobcat)
         gcc_cflags_cpu="-mtune=btver1 -mtune=amdfam10 -mtune=k8"
         gcc_cflags_arch="-march=btver1 -march=amdfam10 -march=k8 -march=k8~-mno-sse2"
-	path="x86/bobcat x86"
+	path="x86/bobcat x86/k7/mmx x86/k7 x86"
 	path_64="x86_64/bobcat x86_64/k10 x86_64/k8 x86_64"
         ;;
       bulldozer | bd1)
         gcc_cflags_cpu="-mtune=bdver1 -mtune=amdfam10 -mtune=k8"
         gcc_cflags_arch="-march=bdver1 -march=amdfam10 -march=k8 -march=k8~-mno-sse2"
-	path="x86/bd1 x86"
+	path="x86/bd1 x86/k7/mmx x86/k7 x86"
 	path_64="x86_64/bd1 x86_64"
         ;;
       core2)
-- 
cgit v1.2.1


From ff7620568f0f51316d73501a764ed229bd0d7923 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Sat, 26 Nov 2011 10:57:47 +0100
Subject: Partially tabify.

---
 configure.in | 176 +++++++++++++++++++++++++++++------------------------------
 1 file changed, 88 insertions(+), 88 deletions(-)

diff --git a/configure.in b/configure.in
index 580b3e427..728529c13 100644
--- a/configure.in
+++ b/configure.in
@@ -1466,157 +1466,157 @@ case $host in
     gcc_cflags_optlist="cpu arch"
     case $host_cpu in
       i386*)
-        gcc_cflags_cpu="-mtune=i386 -mcpu=i386 -m386"
-        gcc_cflags_arch="-march=i386"
+	gcc_cflags_cpu="-mtune=i386 -mcpu=i386 -m386"
+	gcc_cflags_arch="-march=i386"
 	path="x86"
-        ;;
+	;;
       i486*)
-        gcc_cflags_cpu="-mtune=i486 -mcpu=i486 -m486"
-        gcc_cflags_arch="-march=i486"
+	gcc_cflags_cpu="-mtune=i486 -mcpu=i486 -m486"
+	gcc_cflags_arch="-march=i486"
 	path="x86/i486 x86"
-        ;;
+	;;
       i586 | pentium)
-        gcc_cflags_cpu="-mtune=pentium -mcpu=pentium -m486"
-        gcc_cflags_arch="-march=pentium"
+	gcc_cflags_cpu="-mtune=pentium -mcpu=pentium -m486"
+	gcc_cflags_arch="-march=pentium"
 	path="x86/pentium x86"
-        ;;
+	;;
       pentiummmx)
-        gcc_cflags_cpu="-mtune=pentium-mmx -mcpu=pentium-mmx -mcpu=pentium -m486"
-        gcc_cflags_arch="-march=pentium-mmx -march=pentium"
+	gcc_cflags_cpu="-mtune=pentium-mmx -mcpu=pentium-mmx -mcpu=pentium -m486"
+	gcc_cflags_arch="-march=pentium-mmx -march=pentium"
 	path="x86/pentium/mmx x86/pentium x86"
-        ;;
+	;;
       i686 | pentiumpro)
-        gcc_cflags_cpu="-mtune=pentiumpro -mcpu=pentiumpro -mcpu=i486 -m486"
-        gcc_cflags_arch="-march=pentiumpro -march=pentium"
+	gcc_cflags_cpu="-mtune=pentiumpro -mcpu=pentiumpro -mcpu=i486 -m486"
+	gcc_cflags_arch="-march=pentiumpro -march=pentium"
 	path="x86/p6 x86"
-        ;;
+	;;
       pentium2)
-        gcc_cflags_cpu="-mtune=pentium2 -mcpu=pentium2 -mcpu=pentiumpro -mcpu=i486 -m486"
-        gcc_cflags_arch="-march=pentium2 -march=pentiumpro -march=pentium"
+	gcc_cflags_cpu="-mtune=pentium2 -mcpu=pentium2 -mcpu=pentiumpro -mcpu=i486 -m486"
+	gcc_cflags_arch="-march=pentium2 -march=pentiumpro -march=pentium"
 	path="x86/p6/mmx x86/p6 x86"
-        ;;
+	;;
       pentium3)
-        gcc_cflags_cpu="-mtune=pentium3 -mcpu=pentium3 -mcpu=pentiumpro -mcpu=i486 -m486"
-        gcc_cflags_arch="-march=pentium3 -march=pentiumpro -march=pentium"
+	gcc_cflags_cpu="-mtune=pentium3 -mcpu=pentium3 -mcpu=pentiumpro -mcpu=i486 -m486"
+	gcc_cflags_arch="-march=pentium3 -march=pentiumpro -march=pentium"
 	path="x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
-        ;;
+	;;
       pentiumm)
-        gcc_cflags_cpu="-mtune=pentium3 -mcpu=pentium3 -mcpu=pentiumpro -mcpu=i486 -m486"
-        gcc_cflags_arch="-march=pentium3 -march=pentiumpro -march=pentium"
+	gcc_cflags_cpu="-mtune=pentium3 -mcpu=pentium3 -mcpu=pentiumpro -mcpu=i486 -m486"
+	gcc_cflags_arch="-march=pentium3 -march=pentiumpro -march=pentium"
 	path="x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
-        ;;
+	;;
       k6)
-        gcc_cflags_cpu="-mtune=k6 -mcpu=k6 -mcpu=i486 -m486"
-        gcc_cflags_arch="-march=k6"
+	gcc_cflags_cpu="-mtune=k6 -mcpu=k6 -mcpu=i486 -m486"
+	gcc_cflags_arch="-march=k6"
 	path="x86/k6/mmx x86/k6 x86"
-        ;;
+	;;
       k62)
-        gcc_cflags_cpu="-mtune=k6-2 -mcpu=k6-2 -mcpu=k6 -mcpu=i486 -m486"
-        gcc_cflags_arch="-march=k6-2 -march=k6"
+	gcc_cflags_cpu="-mtune=k6-2 -mcpu=k6-2 -mcpu=k6 -mcpu=i486 -m486"
+	gcc_cflags_arch="-march=k6-2 -march=k6"
 	path="x86/k6/k62mmx x86/k6/mmx x86/k6 x86"
-        ;;
+	;;
       k63)
-        gcc_cflags_cpu="-mtune=k6-3 -mcpu=k6-3 -mcpu=k6 -mcpu=i486 -m486"
-        gcc_cflags_arch="-march=k6-3 -march=k6"
+	gcc_cflags_cpu="-mtune=k6-3 -mcpu=k6-3 -mcpu=k6 -mcpu=i486 -m486"
+	gcc_cflags_arch="-march=k6-3 -march=k6"
 	path="x86/k6/k62mmx x86/k6/mmx x86/k6 x86"
-        ;;
+	;;
       geode)
-        gcc_cflags_cpu="-mtune=k6-3 -mcpu=k6-3 -mcpu=k6 -mcpu=i486 -m486"
-        gcc_cflags_arch="-march=k6-3 -march=k6"
+	gcc_cflags_cpu="-mtune=k6-3 -mcpu=k6-3 -mcpu=k6 -mcpu=i486 -m486"
+	gcc_cflags_arch="-march=k6-3 -march=k6"
 	path="x86/geode x86/k6/k62mmx x86/k6/mmx x86/k6 x86"
-        ;;
+	;;
       athlon)
-        # Athlon instruction costs are close to P6 (3 cycle load latency,
-        # 4-6 cycle mul, 40 cycle div, pairable adc, etc) so if gcc doesn't
-        # know athlon (eg. 2.95.2 doesn't) then fall back on pentiumpro.
-        gcc_cflags_cpu="-mtune=athlon -mcpu=athlon -mcpu=pentiumpro -mcpu=i486 -m486"
-        gcc_cflags_arch="-march=athlon -march=pentiumpro -march=pentium"
+	# Athlon instruction costs are close to P6 (3 cycle load latency,
+	# 4-6 cycle mul, 40 cycle div, pairable adc, etc) so if gcc doesn't
+	# know athlon (eg. 2.95.2 doesn't) then fall back on pentiumpro.
+	gcc_cflags_cpu="-mtune=athlon -mcpu=athlon -mcpu=pentiumpro -mcpu=i486 -m486"
+	gcc_cflags_arch="-march=athlon -march=pentiumpro -march=pentium"
 	path="x86/k7/mmx x86/k7 x86"
-        ;;
+	;;
       i786 | pentium4)
-        # pentiumpro is the primary fallback when gcc doesn't know pentium4.
-        # This gets us cmov to eliminate branches.  Maybe "athlon" would be
-        # a possibility on gcc 3.0.
-        #
-        gcc_cflags_cpu="-mtune=pentium4 -mcpu=pentium4 -mcpu=pentiumpro -mcpu=i486 -m486"
-        gcc_cflags_arch="-march=pentium4 -march=pentium4~-mno-sse2 -march=pentiumpro -march=pentium"
-        gcc_64_cflags_cpu="-mtune=nocona"
+	# pentiumpro is the primary fallback when gcc doesn't know pentium4.
+	# This gets us cmov to eliminate branches.  Maybe "athlon" would be
+	# a possibility on gcc 3.0.
+	#
+	gcc_cflags_cpu="-mtune=pentium4 -mcpu=pentium4 -mcpu=pentiumpro -mcpu=i486 -m486"
+	gcc_cflags_arch="-march=pentium4 -march=pentium4~-mno-sse2 -march=pentiumpro -march=pentium"
+	gcc_64_cflags_cpu="-mtune=nocona"
 	path="x86/pentium4/sse2 x86/pentium4/mmx x86/pentium4 x86"
 	path_64="x86_64/pentium4 x86_64"
-        ;;
+	;;
       viac32)
-        # Not sure of the best fallbacks here for -mcpu.
-        # c3-2 has sse and mmx, so pentium3 is good for -march.
-        gcc_cflags_cpu="-mtune=c3-2 -mcpu=c3-2 -mcpu=i486 -m486"
-        gcc_cflags_arch="-march=c3-2 -march=pentium3 -march=pentiumpro -march=pentium"
+	# Not sure of the best fallbacks here for -mcpu.
+	# c3-2 has sse and mmx, so pentium3 is good for -march.
+	gcc_cflags_cpu="-mtune=c3-2 -mcpu=c3-2 -mcpu=i486 -m486"
+	gcc_cflags_arch="-march=c3-2 -march=pentium3 -march=pentiumpro -march=pentium"
 	path="x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
-        ;;
+	;;
       viac3*)
-        # Not sure of the best fallbacks here.
-        gcc_cflags_cpu="-mtune=c3 -mcpu=c3 -mcpu=i486 -m486"
-        gcc_cflags_arch="-march=c3 -march=pentium-mmx -march=pentium"
+	# Not sure of the best fallbacks here.
+	gcc_cflags_cpu="-mtune=c3 -mcpu=c3 -mcpu=i486 -m486"
+	gcc_cflags_arch="-march=c3 -march=pentium-mmx -march=pentium"
 	path="x86/pentium/mmx x86/pentium x86"
-        ;;
+	;;
       athlon64 | k8 | x86_64)
-        gcc_cflags_cpu="-mtune=k8 -mcpu=athlon -mcpu=pentiumpro -mcpu=i486 -m486"
-        gcc_cflags_arch="-march=k8 -march=k8~-mno-sse2 -march=athlon -march=pentiumpro -march=pentium"
+	gcc_cflags_cpu="-mtune=k8 -mcpu=athlon -mcpu=pentiumpro -mcpu=i486 -m486"
+	gcc_cflags_arch="-march=k8 -march=k8~-mno-sse2 -march=athlon -march=pentiumpro -march=pentium"
 	path="x86/k8 x86/k7/mmx x86/k7 x86"
 	path_64="x86_64/k8 x86_64"
-        ;;
+	;;
       k10)
-        gcc_cflags_cpu="-mtune=amdfam10 -mtune=k8"
-        gcc_cflags_arch="-march=amdfam10 -march=k8 -march=k8~-mno-sse2"
+	gcc_cflags_cpu="-mtune=amdfam10 -mtune=k8"
+	gcc_cflags_arch="-march=amdfam10 -march=k8 -march=k8~-mno-sse2"
 	path="x86/k10 x86/k8 x86/k7/mmx x86/k7 x86"
 	path_64="x86_64/k10 x86_64/k8 x86_64"
-        ;;
+	;;
       bobcat)
-        gcc_cflags_cpu="-mtune=btver1 -mtune=amdfam10 -mtune=k8"
-        gcc_cflags_arch="-march=btver1 -march=amdfam10 -march=k8 -march=k8~-mno-sse2"
+	gcc_cflags_cpu="-mtune=btver1 -mtune=amdfam10 -mtune=k8"
+	gcc_cflags_arch="-march=btver1 -march=amdfam10 -march=k8 -march=k8~-mno-sse2"
 	path="x86/bobcat x86/k7/mmx x86/k7 x86"
 	path_64="x86_64/bobcat x86_64/k10 x86_64/k8 x86_64"
-        ;;
+	;;
       bulldozer | bd1)
-        gcc_cflags_cpu="-mtune=bdver1 -mtune=amdfam10 -mtune=k8"
-        gcc_cflags_arch="-march=bdver1 -march=amdfam10 -march=k8 -march=k8~-mno-sse2"
+	gcc_cflags_cpu="-mtune=bdver1 -mtune=amdfam10 -mtune=k8"
+	gcc_cflags_arch="-march=bdver1 -march=amdfam10 -march=k8 -march=k8~-mno-sse2"
 	path="x86/bd1 x86/k7/mmx x86/k7 x86"
 	path_64="x86_64/bd1 x86_64"
-        ;;
+	;;
       core2)
-        gcc_cflags_cpu="-mtune=core2 -mtune=k8"
-        gcc_cflags_arch="-march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2"
+	gcc_cflags_cpu="-mtune=core2 -mtune=k8"
+	gcc_cflags_arch="-march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2"
 	path="x86/core2 x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
 	path_64="x86_64/core2 x86_64"
        ;;
       corei | coreinhm | coreiwsm)
-        gcc_cflags_cpu="-mtune=corei7 -mtune=core2 -mtune=k8"
-        gcc_cflags_arch="-march=corei7 -march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2"
+	gcc_cflags_cpu="-mtune=corei7 -mtune=core2 -mtune=k8"
+	gcc_cflags_arch="-march=corei7 -march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2"
 	path="x86/coreinhm x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
 	path_64="x86_64/coreinhm x86_64/core2 x86_64"
-        ;;
+	;;
       coreisbr)
-        gcc_cflags_cpu="-mtune=corei7 -mtune=core2 -mtune=k8"
-        gcc_cflags_arch="-march=corei7 -march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2"
+	gcc_cflags_cpu="-mtune=corei7 -mtune=core2 -mtune=k8"
+	gcc_cflags_arch="-march=corei7 -march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2"
 	path="x86/coreisbr x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
 	path_64="x86_64/coreisbr x86_64/coreinhm x86_64/core2 x86_64"
-        ;;
+	;;
       atom)
-        gcc_cflags_cpu="-mtune=atom -mtune=pentium3"
-        gcc_cflags_arch="-march=atom -march=pentium3"
+	gcc_cflags_cpu="-mtune=atom -mtune=pentium3"
+	gcc_cflags_arch="-march=atom -march=pentium3"
 	path="x86/atom/sse2 x86/atom/mmx x86/atom x86"
 	path_64="x86_64/atom x86_64"
-        ;;
+	;;
       nano)
-        gcc_cflags_cpu="-mtune=nano"
-        gcc_cflags_arch="-march=nano"
+	gcc_cflags_cpu="-mtune=nano"
+	gcc_cflags_arch="-march=nano"
 	path="x86/nano x86"
 	path_64="x86_64/nano x86_64"
-        ;;
+	;;
       *)
-        gcc_cflags_cpu="-mtune=i486 -mcpu=i486 -m486"
-        gcc_cflags_arch="-march=i486"
+	gcc_cflags_cpu="-mtune=i486 -mcpu=i486 -m486"
+	gcc_cflags_arch="-march=i486"
 	path="x86"
 	path_64="x86_64"
-        ;;
+	;;
     esac
 
     case $host in
-- 
cgit v1.2.1


From 84e1db5df779cff97b211a773ae2577abb99fa32 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Mon, 28 Nov 2011 16:35:07 +0100
Subject: Use CNST_LIMB for some constants.

---
 mpn/generic/udiv_w_sdiv.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mpn/generic/udiv_w_sdiv.c b/mpn/generic/udiv_w_sdiv.c
index c01f95847..ceefd1b5f 100644
--- a/mpn/generic/udiv_w_sdiv.c
+++ b/mpn/generic/udiv_w_sdiv.c
@@ -9,7 +9,7 @@
    GNU MP RELEASE.
 
 
-Copyright 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+Copyright 1992, 1994, 1996, 2000, 2011 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -116,12 +116,12 @@ mpn_udiv_w_sdiv (rp, a1, a0, d)
 	{				/* Hence a1 = d - 1 = 2*b1 - 1 */
 	  if (a0 >= -d)
 	    {
-	      q = -1;
+	      q = -CNST_LIMB(1);
 	      r = a0 + d;
 	    }
 	  else
 	    {
-	      q = -2;
+	      q = -CNST_LIMB(2);
 	      r = a0 + 2*d;
 	    }
 	}
-- 
cgit v1.2.1


From a18078c77e2cc3fc4919acb9bf18a330b3268366 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Mon, 28 Nov 2011 18:53:54 +0100
Subject: Changes to support non-standard ABIs in a coherent way.

---
 configure.in | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/configure.in b/configure.in
index 728529c13..c9f023bbf 100644
--- a/configure.in
+++ b/configure.in
@@ -1640,10 +1640,9 @@ case $host in
 	    ;;
 	  *-*-mingw* | *-*-cygwin)
 	    limb_64=longlong
-	    path_64=""	# Windows amd64 calling conventions are *different*
-	    extra_functions_64=""
-	    # Silence many pedantic warnings for w64.  FIXME.
-	    gcc_64_cflags="$gcc_cflags -std=gnu99"
+	    extra_functions_64=""# FIXME: remove when invert_limb is ported
+	    AC_DEFINE(HOST_DOS64,1,[Define to 1 for Windos/64])
+	    AC_DEFINE(GMP_NONSTD_ABI,1,[Define to 1 if ABI is non-standard])
 	    ;;
 	esac
 	;;
@@ -3061,6 +3060,17 @@ for tmp_fn in $gmp_mpn_functions; do
             esac
           fi
 
+	  # If the host uses a non-standard ABI, check if tmp_file supports it
+	  #
+	  if test -n "$GMP_NONSTD_ABI" && test $tmp_dir != generic; then
+	    abi=[`sed -n 's/^[ 	]*ABI_SUPPORT(\(.*\))/\1/p' $tmp_file `]
+	    if echo "$abi" | grep -q "\\b${GMP_NONSTD_ABI}\\b"; then
+	      true
+	    else
+	      continue
+	    fi
+	  fi
+
           found=yes
           eval found_$tmp_ext=yes
 
@@ -3344,6 +3354,8 @@ if test "$gmp_asm_syntax_testing" != no; then
 	  case $host in
 	    *-*-darwin*)
 	      GMP_INCLUDE_MPN(x86_64/darwin.m4) ;;
+	    *-*-mingw* | *-*-cygwin)
+	      GMP_INCLUDE_MPN(x86_64/dos64.m4) ;;
 	  esac
           ;;
       esac
-- 
cgit v1.2.1


From 4fc5b919efe2d88b2ba5f8066ebac6ae6b326019 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Mon, 28 Nov 2011 18:54:52 +0100
Subject: (ABI_SUPPORT): New dummy macro.

---
 mpn/asm-defs.m4 | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/mpn/asm-defs.m4 b/mpn/asm-defs.m4
index 7a5639fbe..b95cad7c0 100644
--- a/mpn/asm-defs.m4
+++ b/mpn/asm-defs.m4
@@ -1713,6 +1713,22 @@ m4_assert_numargs(1)
 )
 
 
+dnl  Usage: ABI_SUPPORT(abi)
+dnl
+dnl  A dummy macro which is grepped for by ./configure to know what ABIs
+dnl  are supported in an asm file.
+dnl
+dnl  If multiple non-standard ABIs are supported, several ABI_SUPPORT
+dnl  declarations should be used:
+dnl
+dnl         ABI_SUPPORT(FOOABI)
+dnl         ABI_SUPPORT(BARABI)
+
+define(ABI_SUPPORT,
+m4_assert_numargs(1)
+)
+
+
 dnl  Usage: GMP_NUMB_MASK
 dnl
 dnl  A bit mask for the number part of a limb.  Eg. with 6 bit nails in a
-- 
cgit v1.2.1


From 5db131d3587a358bb093d29a90ff922828a4e1bd Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Mon, 28 Nov 2011 23:06:35 +0100
Subject: More DOS64 configure changes.

---
 configure.in | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/configure.in b/configure.in
index c9f023bbf..5b7cf188a 100644
--- a/configure.in
+++ b/configure.in
@@ -1640,9 +1640,10 @@ case $host in
 	    ;;
 	  *-*-mingw* | *-*-cygwin)
 	    limb_64=longlong
-	    extra_functions_64=""# FIXME: remove when invert_limb is ported
+	    extra_functions_64="" # FIXME: remove when invert_limb is ported
+	    CALLING_CONVENTIONS_OBJS_64=""
 	    AC_DEFINE(HOST_DOS64,1,[Define to 1 for Windos/64])
-	    AC_DEFINE(GMP_NONSTD_ABI,1,[Define to 1 if ABI is non-standard])
+	    AC_SUBST(GMP_NONSTD_ABI,DOS64)
 	    ;;
 	esac
 	;;
-- 
cgit v1.2.1


From 3ef9c90709d4a4e1a6a5052cea8e17c9feb76728 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Mon, 28 Nov 2011 23:06:51 +0100
Subject: New file.

---
 mpn/x86_64/dos64.m4 | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 mpn/x86_64/dos64.m4

diff --git a/mpn/x86_64/dos64.m4 b/mpn/x86_64/dos64.m4
new file mode 100644
index 000000000..ef60834ec
--- /dev/null
+++ b/mpn/x86_64/dos64.m4
@@ -0,0 +1,39 @@
+divert(-1)
+dnl  Copyright 2011 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+define(`HOST_DOS64')
+
+define(`JUMPTABSECT', `.section .rdata,"dr"')
+
+define(`DOS64_ENTRY',
+	`push	%rdi
+	push	%rsi
+	mov	%rcx, %rdi
+ifelse(eval($1>=2),1,`dnl
+	mov	%rdx, %rsi
+ifelse(eval($1>=3),1,`dnl
+	mov	%r8, %rdx
+ifelse(eval($1>=4),1,`dnl
+	mov	%r9, %rcx
+')')')')
+
+define(`DOS64_EXIT',
+	`pop	%rsi
+	pop	%rdi')
+
+divert`'dnl
-- 
cgit v1.2.1


From d220c40fa205a350318faf2257a87480fce910df Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Mon, 28 Nov 2011 23:08:16 +0100
Subject: (DOS64_ENTRY, DOS64_EXIT): New, empty defs.

---
 mpn/x86_64/x86_64-defs.m4 | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/mpn/x86_64/x86_64-defs.m4 b/mpn/x86_64/x86_64-defs.m4
index 6942a7882..79d7b3cf2 100644
--- a/mpn/x86_64/x86_64-defs.m4
+++ b/mpn/x86_64/x86_64-defs.m4
@@ -2,8 +2,8 @@ divert(-1)
 
 dnl  m4 macros for amd64 assembler.
 
-dnl  Copyright 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009 Free
-dnl  Software Foundation, Inc.
+dnl  Copyright 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009, 2011
+dnl  Free Software Foundation, Inc.
 dnl
 dnl  This file is part of the GNU MP Library.
 dnl
@@ -169,4 +169,7 @@ ifdef(`PIC',
 
 define(`JUMPTABSECT', `.section	.data.rel.ro.local,"aw",@progbits')
 
+define(`DOS64_ENTRY',`')
+define(`DOS64_EXIT',`')
+
 divert`'dnl
-- 
cgit v1.2.1


From dc3473c8e0d2c4e84a698901cc82327b7df24286 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Mon, 28 Nov 2011 23:09:54 +0100
Subject: Retune.

---
 mpn/x86_64/gmp-mparam.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mpn/x86_64/gmp-mparam.h b/mpn/x86_64/gmp-mparam.h
index b16ff5a6b..aca6853f0 100644
--- a/mpn/x86_64/gmp-mparam.h
+++ b/mpn/x86_64/gmp-mparam.h
@@ -192,7 +192,7 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 
 #define MATRIX22_STRASSEN_THRESHOLD         16
 #define HGCD_THRESHOLD                     125
-#define HGCD_APPR_THRESHOLD                 50
+#define HGCD_APPR_THRESHOLD                173
 #define HGCD_REDUCE_THRESHOLD             3524
 #define GCD_DC_THRESHOLD                   555
 #define GCDEXT_DC_THRESHOLD                478
-- 
cgit v1.2.1


From d4f5eddea43f682b380e85a9db69b4e8fd8ea54a Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Mon, 28 Nov 2011 23:11:55 +0100
Subject: Retune.

---
 mpn/ia64/gmp-mparam.h          |  16 +++---
 mpn/s390_32/esame/gmp-mparam.h |  86 ++++++++++++++++---------------
 mpn/s390_64/gmp-mparam.h       |   8 +--
 mpn/x86/bobcat/gmp-mparam.h    | 113 +++++++++++++++++++++--------------------
 4 files changed, 114 insertions(+), 109 deletions(-)

diff --git a/mpn/ia64/gmp-mparam.h b/mpn/ia64/gmp-mparam.h
index f080b876e..77e02f518 100644
--- a/mpn/ia64/gmp-mparam.h
+++ b/mpn/ia64/gmp-mparam.h
@@ -26,10 +26,10 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MOD_1_1P_METHOD                      2
 #define MOD_1_NORM_THRESHOLD                 0  /* always */
 #define MOD_1_UNNORM_THRESHOLD               0  /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD          3
-#define MOD_1U_TO_MOD_1_1_THRESHOLD          8
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD        21
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        26
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
 #define PREINV_MOD_1_TO_MOD_1_THRESHOLD     10
 #define USE_PREINV_DIVREM_1                  1  /* native */
 #define DIV_QR_2_PI2_THRESHOLD              12
@@ -54,6 +54,8 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_TOOM6_THRESHOLD                  0  /* always */
 #define SQR_TOOM8_THRESHOLD                  0  /* always */
 
+#define MULMID_TOOM42_THRESHOLD             98
+
 #define MULMOD_BNM1_THRESHOLD               23
 #define SQRMOD_BNM1_THRESHOLD               28
 
@@ -171,9 +173,9 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 
 #define MATRIX22_STRASSEN_THRESHOLD         23
 #define HGCD_THRESHOLD                     117
-#define HGCD_APPR_THRESHOLD                 50
-#define HGCD_REDUCE_THRESHOLD             3389
-#define GCD_DC_THRESHOLD                   496
+#define HGCD_APPR_THRESHOLD                111
+#define HGCD_REDUCE_THRESHOLD             3014
+#define GCD_DC_THRESHOLD                   555
 #define GCDEXT_DC_THRESHOLD                368
 #define JACOBI_BASE_METHOD                   4
 
diff --git a/mpn/s390_32/esame/gmp-mparam.h b/mpn/s390_32/esame/gmp-mparam.h
index 5dedeeb81..a6508be1a 100644
--- a/mpn/s390_32/esame/gmp-mparam.h
+++ b/mpn/s390_32/esame/gmp-mparam.h
@@ -24,43 +24,45 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 /* 1200 MHz IBM z990 running in 32-bit mode */
 
 #define DIVREM_1_NORM_THRESHOLD              0  /* always */
-#define DIVREM_1_UNNORM_THRESHOLD            3
-#define MOD_1_1P_METHOD                      1
+#define DIVREM_1_UNNORM_THRESHOLD            4
+#define MOD_1_1P_METHOD                      2
 #define MOD_1_NORM_THRESHOLD                 0  /* always */
 #define MOD_1_UNNORM_THRESHOLD               3
-#define MOD_1N_TO_MOD_1_1_THRESHOLD         12
-#define MOD_1U_TO_MOD_1_1_THRESHOLD          7
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD        15
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     21
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         17
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        34
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     42
 #define USE_PREINV_DIVREM_1                  1
 #define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
 #define DIVEXACT_1_THRESHOLD                 0  /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD           50
+#define BMOD_1_TO_MOD_1_THRESHOLD           30
 
 #define MUL_TOOM22_THRESHOLD                16
-#define MUL_TOOM33_THRESHOLD                66
-#define MUL_TOOM44_THRESHOLD               169
-#define MUL_TOOM6H_THRESHOLD               369
-#define MUL_TOOM8H_THRESHOLD               517
+#define MUL_TOOM33_THRESHOLD                57
+#define MUL_TOOM44_THRESHOLD               147
+#define MUL_TOOM6H_THRESHOLD               226
+#define MUL_TOOM8H_THRESHOLD               333
 
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD     106
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD     114
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD     114
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD     187
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      65
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     100
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      97
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     102
 
 #define SQR_BASECASE_THRESHOLD               0  /* always (native) */
-#define SQR_TOOM2_THRESHOLD                 28
-#define SQR_TOOM3_THRESHOLD                 93
-#define SQR_TOOM4_THRESHOLD                387
-#define SQR_TOOM6_THRESHOLD                552
-#define SQR_TOOM8_THRESHOLD                  0  /* always */
+#define SQR_TOOM2_THRESHOLD                 26
+#define SQR_TOOM3_THRESHOLD                 81
+#define SQR_TOOM4_THRESHOLD                154
+#define SQR_TOOM6_THRESHOLD                318
+#define SQR_TOOM8_THRESHOLD                478
 
 #define MULMID_TOOM42_THRESHOLD             38
 
 #define MULMOD_BNM1_THRESHOLD               13
 #define SQRMOD_BNM1_THRESHOLD               15
 
+#define POWM_SEC_TABLE  4,23,262,892,2500
+
 #define MUL_FFT_MODF_THRESHOLD             336  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
   { {    336, 5}, {     19, 6}, {     11, 5}, {     23, 6}, \
@@ -91,37 +93,37 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_FFT_TABLE3_SIZE 35
 #define SQR_FFT_THRESHOLD                 2368
 
-#define MULLO_BASECASE_THRESHOLD             6
-#define MULLO_DC_THRESHOLD                  45
+#define MULLO_BASECASE_THRESHOLD             5
+#define MULLO_DC_THRESHOLD                  49
 #define MULLO_MUL_N_THRESHOLD             5397
 
-#define DC_DIV_QR_THRESHOLD                 40
-#define DC_DIVAPPR_Q_THRESHOLD             152
+#define DC_DIV_QR_THRESHOLD                 42
+#define DC_DIVAPPR_Q_THRESHOLD             146
 #define DC_BDIV_QR_THRESHOLD                51
-#define DC_BDIV_Q_THRESHOLD                136
+#define DC_BDIV_Q_THRESHOLD                124
 
 #define INV_MULMOD_BNM1_THRESHOLD           46
-#define INV_NEWTON_THRESHOLD               197
-#define INV_APPR_THRESHOLD                 157
+#define INV_NEWTON_THRESHOLD               179
+#define INV_APPR_THRESHOLD                 153
 
-#define BINV_NEWTON_THRESHOLD              114
+#define BINV_NEWTON_THRESHOLD              214
 #define REDC_1_TO_REDC_N_THRESHOLD          55
 
-#define MU_DIV_QR_THRESHOLD               1210
-#define MU_DIVAPPR_Q_THRESHOLD            1334
-#define MUPI_DIV_QR_THRESHOLD               81
-#define MU_BDIV_QR_THRESHOLD               942
-#define MU_BDIV_Q_THRESHOLD               1258
-
-#define MATRIX22_STRASSEN_THRESHOLD         17
-#define HGCD_THRESHOLD                     104
-#define GCD_DC_THRESHOLD                   278
+#define MU_DIV_QR_THRESHOLD               1078
+#define MU_DIVAPPR_Q_THRESHOLD            1078
+#define MUPI_DIV_QR_THRESHOLD               74
+#define MU_BDIV_QR_THRESHOLD               872
+#define MU_BDIV_Q_THRESHOLD               1078
+
+#define MATRIX22_STRASSEN_THRESHOLD         14
+#define HGCD_THRESHOLD                      90
+#define HGCD_APPR_THRESHOLD                111
+#define HGCD_REDUCE_THRESHOLD             1962
+#define GCD_DC_THRESHOLD                   225
 #define GCDEXT_DC_THRESHOLD                217
 #define JACOBI_BASE_METHOD                   2
 
-#define GET_STR_DC_THRESHOLD                16
-#define GET_STR_PRECOMPUTE_THRESHOLD        30
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        27
 #define SET_STR_DC_THRESHOLD               274
 #define SET_STR_PRECOMPUTE_THRESHOLD       824
-
-/* Tuneup completed successfully, took 108 seconds */
diff --git a/mpn/s390_64/gmp-mparam.h b/mpn/s390_64/gmp-mparam.h
index 46ca86726..c0ade71c2 100644
--- a/mpn/s390_64/gmp-mparam.h
+++ b/mpn/s390_64/gmp-mparam.h
@@ -28,19 +28,19 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MOD_1_1P_METHOD                      2
 #define MOD_1_NORM_THRESHOLD                 0  /* always */
 #define MOD_1_UNNORM_THRESHOLD               0  /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD          9
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          8
 #define MOD_1U_TO_MOD_1_1_THRESHOLD          5
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD        58
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        38
 #define MOD_1_2_TO_MOD_1_4_THRESHOLD         0
 #define PREINV_MOD_1_TO_MOD_1_THRESHOLD     19
 #define USE_PREINV_DIVREM_1                  1
 #define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
 #define DIVEXACT_1_THRESHOLD                 0  /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD           47
+#define BMOD_1_TO_MOD_1_THRESHOLD           88
 
 #define MUL_TOOM22_THRESHOLD                10
 #define MUL_TOOM33_THRESHOLD                41
-#define MUL_TOOM44_THRESHOLD                99
+#define MUL_TOOM44_THRESHOLD               104
 #define MUL_TOOM6H_THRESHOLD               149
 #define MUL_TOOM8H_THRESHOLD               212
 
diff --git a/mpn/x86/bobcat/gmp-mparam.h b/mpn/x86/bobcat/gmp-mparam.h
index 58dfee1cf..e14ba39f5 100644
--- a/mpn/x86/bobcat/gmp-mparam.h
+++ b/mpn/x86/bobcat/gmp-mparam.h
@@ -25,30 +25,30 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 
 #define MOD_1_NORM_THRESHOLD                 0  /* always */
 #define MOD_1_UNNORM_THRESHOLD               0  /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD         12
-#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD        18
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      7
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          9
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        12
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        23
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     13
 #define USE_PREINV_DIVREM_1                  1  /* native */
 #define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
 #define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD           40
+#define BMOD_1_TO_MOD_1_THRESHOLD           42
 
 #define MUL_TOOM22_THRESHOLD                28
-#define MUL_TOOM33_THRESHOLD                85
+#define MUL_TOOM33_THRESHOLD                90
 #define MUL_TOOM44_THRESHOLD               147
-#define MUL_TOOM6H_THRESHOLD               270
+#define MUL_TOOM6H_THRESHOLD               274
 #define MUL_TOOM8H_THRESHOLD               454
 
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD      93
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD     107
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD      97
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD     111
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      89
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     122
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      93
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     113
 
 #define SQR_BASECASE_THRESHOLD               0  /* always (native) */
 #define SQR_TOOM2_THRESHOLD                 38
-#define SQR_TOOM3_THRESHOLD                101
+#define SQR_TOOM3_THRESHOLD                 89
 #define SQR_TOOM4_THRESHOLD                220
 #define SQR_TOOM6_THRESHOLD                303
 #define SQR_TOOM8_THRESHOLD                454
@@ -58,84 +58,85 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define MULMOD_BNM1_THRESHOLD               19
 #define SQRMOD_BNM1_THRESHOLD               23
 
-#define POWM_SEC_TABLE  2,17,225,357,2212
+#define POWM_SEC_TABLE  4,14,290,357,2178
 
 #define MUL_FFT_MODF_THRESHOLD             888  /* k = 6 */
 #define MUL_FFT_TABLE3                                      \
-  { {    888, 6}, {     27, 7}, {     15, 6}, {     33, 7}, \
-    {     17, 6}, {     35, 7}, {     19, 6}, {     39, 7}, \
-    {     23, 6}, {     47, 7}, {     27, 8}, {     15, 7}, \
-    {     31, 6}, {     63, 7}, {     35, 8}, {     19, 7}, \
-    {     41, 8}, {     23, 7}, {     49, 8}, {     31, 7}, \
-    {     63, 8}, {     39, 7}, {     79, 9}, {     23, 8}, \
-    {     51, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
-    {     79, 9}, {     47, 8}, {     95, 9}, {     55,10}, \
-    {     31, 9}, {     63, 8}, {    127, 9}, {     79,10}, \
-    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
-    {    135,10}, {     79, 9}, {    159,10}, {     95, 9}, \
-    {    191,11}, {     63,10}, {    127, 9}, {    255,10}, \
-    {    159,11}, {     95,10}, {    191,12}, {     63,11}, \
-    {    127,10}, {    271,11}, {    159,10}, {    319, 9}, \
-    {    639,10}, {    335,11}, {    191,10}, {    383, 9}, \
-    {    767,11}, {    223,12}, {   4096,13}, {   8192,14}, \
+  { {    888, 6}, {     25, 7}, {     13, 6}, {     27, 7}, \
+    {     15, 6}, {     33, 7}, {     17, 6}, {     35, 7}, \
+    {     19, 6}, {     39, 7}, {     23, 6}, {     47, 7}, \
+    {     27, 8}, {     15, 7}, {     31, 6}, {     63, 7}, \
+    {     35, 8}, {     19, 7}, {     41, 8}, {     23, 7}, \
+    {     49, 8}, {     31, 7}, {     63, 8}, {     39, 7}, \
+    {     79, 8}, {     43, 9}, {     23, 8}, {     51, 9}, \
+    {     31, 8}, {     67, 9}, {     39, 8}, {     79, 9}, \
+    {     47, 8}, {     95, 9}, {     55,10}, {     31, 9}, \
+    {     63, 8}, {    127, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    159,10}, {     95, 9}, {    191,11}, \
+    {     63,10}, {    127, 9}, {    255,10}, {    159,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543,11}, \
+    {    159,10}, {    319, 9}, {    671,11}, {    191,10}, \
+    {    383, 9}, {    767,12}, {   4096,13}, {   8192,14}, \
     {  16384,15}, {  32768,16} }
-#define MUL_FFT_TABLE3_SIZE 66
+#define MUL_FFT_TABLE3_SIZE 70
 #define MUL_FFT_THRESHOLD                 7552
 
-#define SQR_FFT_MODF_THRESHOLD             730  /* k = 5 */
+#define SQR_FFT_MODF_THRESHOLD             723  /* k = 5 */
 #define SQR_FFT_TABLE3                                      \
-  { {    730, 5}, {     25, 6}, {     13, 5}, {     28, 6}, \
+  { {    723, 5}, {     25, 6}, {     13, 5}, {     28, 6}, \
     {     15, 5}, {     31, 6}, {     27, 7}, {     15, 6}, \
     {     33, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
     {     39, 7}, {     23, 6}, {     47, 7}, {     27, 8}, \
     {     15, 7}, {     31, 6}, {     63, 7}, {     35, 8}, \
-    {     19, 7}, {     41, 8}, {     23, 7}, {     49, 8}, \
-    {     31, 7}, {     63, 8}, {     39, 7}, {     79, 8}, \
-    {     43, 9}, {     23, 8}, {     51, 9}, {     31, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     31, 7}, {     63, 8}, {     43, 9}, {     23, 8}, \
+    {     47, 7}, {     95, 8}, {     51, 9}, {     31, 8}, \
     {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
     {     95,10}, {     31, 9}, {     63, 8}, {    127, 9}, \
     {     79,10}, {     47, 9}, {     95,11}, {     31,10}, \
     {     63, 9}, {    135,10}, {     79, 9}, {    159,10}, \
     {     95, 9}, {    191,11}, {     63,10}, {    127, 9}, \
     {    255,10}, {    159,11}, {     95,10}, {    191,12}, \
-    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
-    {    271,11}, {    159,10}, {    319, 9}, {    671,11}, \
-    {    191, 9}, {    767,12}, {   4096,13}, {   8192,14}, \
-    {  16384,15}, {  32768,16} }
-#define SQR_FFT_TABLE3_SIZE 70
-#define SQR_FFT_THRESHOLD                 7296
+    {     63,11}, {    127,10}, {    255, 9}, {    543,11}, \
+    {    159, 9}, {    671,11}, {    191,10}, {    383, 9}, \
+    {    799,12}, {   4096,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 69
+#define SQR_FFT_THRESHOLD                 5760
 
 #define MULLO_BASECASE_THRESHOLD             5
 #define MULLO_DC_THRESHOLD                  45
 #define MULLO_MUL_N_THRESHOLD            13463
 
-#define DC_DIV_QR_THRESHOLD                 72
-#define DC_DIVAPPR_Q_THRESHOLD             214
+#define DC_DIV_QR_THRESHOLD                 75
+#define DC_DIVAPPR_Q_THRESHOLD             216
 #define DC_BDIV_QR_THRESHOLD                67
-#define DC_BDIV_Q_THRESHOLD                142
+#define DC_BDIV_Q_THRESHOLD                143
 
-#define INV_MULMOD_BNM1_THRESHOLD           71
-#define INV_NEWTON_THRESHOLD               250
+#define INV_MULMOD_BNM1_THRESHOLD           75
+#define INV_NEWTON_THRESHOLD               244
 #define INV_APPR_THRESHOLD                 228
 
-#define BINV_NEWTON_THRESHOLD              270
+#define BINV_NEWTON_THRESHOLD              276
 #define REDC_1_TO_REDC_N_THRESHOLD          71
 
-#define MU_DIV_QR_THRESHOLD               2089
+#define MU_DIV_QR_THRESHOLD               1858
 #define MU_DIVAPPR_Q_THRESHOLD            1822
 #define MUPI_DIV_QR_THRESHOLD              122
 #define MU_BDIV_QR_THRESHOLD              1787
 #define MU_BDIV_Q_THRESHOLD               1787
 
-#define MATRIX22_STRASSEN_THRESHOLD         21
-#define HGCD_THRESHOLD                      81
-#define HGCD_APPR_THRESHOLD                128
-#define HGCD_REDUCE_THRESHOLD             4455
-#define GCD_DC_THRESHOLD                   465
+#define MATRIX22_STRASSEN_THRESHOLD         19
+#define HGCD_THRESHOLD                      78
+#define HGCD_APPR_THRESHOLD                 55
+#define HGCD_REDUCE_THRESHOLD             4633
+#define GCD_DC_THRESHOLD                   474
 #define GCDEXT_DC_THRESHOLD                345
 #define JACOBI_BASE_METHOD                   4
 
-#define GET_STR_DC_THRESHOLD                11
-#define GET_STR_PRECOMPUTE_THRESHOLD        32
+#define GET_STR_DC_THRESHOLD                14
+#define GET_STR_PRECOMPUTE_THRESHOLD        31
 #define SET_STR_DC_THRESHOLD               270
 #define SET_STR_PRECOMPUTE_THRESHOLD       812
-- 
cgit v1.2.1


From 68afbfbde8fb3e1bc9bb31d53ce5d81f438262a1 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Mon, 28 Nov 2011 23:13:37 +0100
Subject: Support ABI DOS64.

---
 mpn/x86_64/aorsmul_1.asm    | 51 ++++++++++++++++++++++++++++++-----------
 mpn/x86_64/mul_1.asm        | 55 ++++++++++++++++++++++++++++++++++-----------
 mpn/x86_64/mul_basecase.asm | 14 ++++++++++++
 mpn/x86_64/sqr_basecase.asm | 17 +++++++++++++-
 4 files changed, 110 insertions(+), 27 deletions(-)

diff --git a/mpn/x86_64/aorsmul_1.asm b/mpn/x86_64/aorsmul_1.asm
index 9c64d56fc..a406bc9e8 100644
--- a/mpn/x86_64/aorsmul_1.asm
+++ b/mpn/x86_64/aorsmul_1.asm
@@ -1,6 +1,6 @@
 dnl  AMD64 mpn_addmul_1 and mpn_submul_1.
 
-dnl  Copyright 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
+dnl  Copyright 2003, 2004, 2005, 2007, 2008, 2011 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -28,20 +28,27 @@ C Intel corei	 ?
 C Intel atom	21.3
 C VIA nano	 5.5
 
-C The inner loop of this code is the result of running a code generation and
+C The loop of this code is the result of running a code generation and
 C optimization tool suite written by David Harvey and Torbjorn Granlund.
 
-C TODO:
-C  * The inner loop is great, but the prologue and epilogue code was
-C    quickly written.  Tune it!
+C TODO
+C  * The loop is great, but the prologue and epilogue code was quickly written.
+C    Tune it!
 
-C INPUT PARAMETERS
-define(`rp',	 `%rdi')
-define(`up',	 `%rsi')
-define(`n_param',`%rdx')
-define(`vl',	 `%rcx')
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`vl',      `%rcx')   C r9
 
-define(`n',	`%r11')
+define(`n',       `%r11')
+
+ifdef(`HOST_DOS64',`
+  define(`IFDOS',   `$1')
+  define(`IFELF',   `')
+',`
+  define(`IFDOS',   `')
+  define(`IFELF',   `$1')
+')
 
 ifdef(`OPERATION_addmul_1',`
       define(`ADDSUB',        `add')
@@ -52,17 +59,33 @@ ifdef(`OPERATION_submul_1',`
       define(`func',  `mpn_submul_1')
 ')
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
 
+IFDOS(`	define(`up', ``%rsi'')	') dnl
+IFDOS(`	define(`rp', ``%rcx'')	') dnl
+IFDOS(`	define(`vl', ``%r9'')	') dnl
+IFDOS(`	define(`r9', ``rdi'')	') dnl
+IFDOS(`	define(`n',  ``%r8'')	') dnl
+IFDOS(`	define(`r8', ``r11'')	') dnl
+
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(func)
+
+IFDOS(``push	%rsi		'')
+IFDOS(``push	%rdi		'')
+IFDOS(``mov	%rdx, %rsi	'')
+
 	mov	(up), %rax		C read first u limb early
 	push	%rbx
-	mov	n_param, %rbx		C move away n from rdx, mul uses it
+IFELF(`	mov	n_param, %rbx   ')	C move away n  from rdx, mul uses it
+IFDOS(`	mov	n, %rbx         ')
 	mul	vl
-	mov	%rbx, n
+IFELF(`	mov	%rbx, n         ')
 
 	and	$3, R32(%rbx)
 	jz	L(b0)
@@ -145,5 +168,7 @@ L(ret):	adc	$0, %rdx
 	mov	%rdx, %rax
 
 	pop	%rbx
+IFDOS(``pop	%rdi		'')
+IFDOS(``pop	%rsi		'')
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/mul_1.asm b/mpn/x86_64/mul_1.asm
index 5f8dc4c9c..3b87bbf01 100644
--- a/mpn/x86_64/mul_1.asm
+++ b/mpn/x86_64/mul_1.asm
@@ -28,38 +28,65 @@ C Intel corei	 3.8
 C Intel atom	19.8
 C VIA nano	 ?
 
-C The inner loop of this code is the result of running a code generation and
+C The loop of this code is the result of running a code generation and
 C optimization tool suite written by David Harvey and Torbjorn Granlund.
 
-C TODO:
-C  * The inner loop is great, but the prologue and epilogue code was
-C    quickly written.  Tune it!
+C TODO
+C  * The loop is great, but the prologue and epilogue code was quickly written.
+C    Tune it!
 
-C INPUT PARAMETERS
-define(`rp',	 `%rdi')
-define(`up',	 `%rsi')
-define(`n_param',`%rdx')
-define(`vl',	 `%rcx')
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`vl',      `%rcx')   C r9
 
-define(`n',	`%r11')
+define(`n',       `%r11')
+
+ifdef(`HOST_DOS64',`
+  define(`IFDOS',   `$1')
+  define(`IFELF',   `')
+',`
+  define(`IFDOS',   `')
+  define(`IFELF',   `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
+IFDOS(`	define(`up', ``%rsi'')	') dnl
+IFDOS(`	define(`rp', ``%rcx'')	') dnl
+IFDOS(`	define(`vl', ``%r9'')	') dnl
+IFDOS(`	define(`r9', ``rdi'')	') dnl
+IFDOS(`	define(`n',  ``%r8'')	') dnl
+IFDOS(`	define(`r8', ``r11'')	') dnl
 
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_mul_1c)
+IFDOS(``push	%rsi		'')
+IFDOS(``push	%rdi		'')
+IFDOS(``mov	%rdx, %rsi	'')
 	push	%rbx
-	mov	%r8, %r10
+IFELF(`	mov	%r8, %r10')
+IFDOS(`	mov	64(%rsp), %r10')	C 40 + 3*8  (3 push insns)
 	jmp	L(common)
 EPILOGUE()
 
 PROLOGUE(mpn_mul_1)
+
+IFDOS(``push	%rsi		'')
+IFDOS(``push	%rdi		'')
+IFDOS(``mov	%rdx, %rsi	'')
+
 	push	%rbx
 	xor	%r10, %r10
 L(common):
 	mov	(up), %rax		C read first u limb early
-	mov	n_param, %rbx		C move away n from rdx, mul uses it
+IFELF(`	mov	n_param, %rbx   ')	C move away n  from rdx, mul uses it
+IFDOS(`	mov	n, %rbx         ')
 	mul	vl
-	mov	%rbx, %r11
+IFELF(`	mov	%rbx, n         ')
 
 	add	%r10, %rax
 	adc	$0, %rdx
@@ -145,5 +172,7 @@ L(L2):	mul	vl
 L(ret):	mov	%rdx, %rax
 
 	pop	%rbx
+IFDOS(``pop	%rdi		'')
+IFDOS(``pop	%rsi		'')
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/mul_basecase.asm b/mpn/x86_64/mul_basecase.asm
index fdba9a6e3..5fede9234 100644
--- a/mpn/x86_64/mul_basecase.asm
+++ b/mpn/x86_64/mul_basecase.asm
@@ -59,10 +59,23 @@ define(`n',  `%r11')
 define(`outer_addr', `%r14')
 define(`un',  `%r13')
 
+ifdef(`HOST_DOS64',`
+  define(`IFDOS',   `$1')
+  define(`IFELF',   `')
+',`
+  define(`IFDOS',   `')
+  define(`IFELF',   `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_mul_basecase)
+	DOS64_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
 	push	%rbx
 	push	%rbp
 	push	%r12
@@ -448,6 +461,7 @@ L(ret):	pop	%r15
 	pop	%r12
 	pop	%rbp
 	pop	%rbx
+	DOS64_EXIT()
 	ret
 
 EPILOGUE()
diff --git a/mpn/x86_64/sqr_basecase.asm b/mpn/x86_64/sqr_basecase.asm
index 311daab8a..f71627ab9 100644
--- a/mpn/x86_64/sqr_basecase.asm
+++ b/mpn/x86_64/sqr_basecase.asm
@@ -75,12 +75,22 @@ define(`w1',	`%rcx')
 define(`w2',	`%rbp')
 define(`w3',	`%r10')
 
+ifdef(`HOST_DOS64',`
+  define(`IFDOS',   `$1')
+  define(`IFELF',   `')
+',`
+  define(`IFDOS',   `')
+  define(`IFELF',   `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
 
 ASM_START()
 	TEXT
 	ALIGN(16)
-
 PROLOGUE(mpn_sqr_basecase)
+	DOS64_ENTRY(3)
 	add	$-40, %rsp
 	mov	%rbx, 32(%rsp)
 	mov	%rbp, 24(%rsp)
@@ -115,6 +125,7 @@ L(1):	mov	(up), %rax
 	mov	%rdx, 8(rp)
 	add	$32, %rsp
 	pop	%rbx
+	DOS64_EXIT()
 	ret
 
 L(2):	mov	(up), %rax
@@ -139,6 +150,7 @@ L(2):	mov	(up), %rax
 	mov	%r11, 24(rp)
 	add	$32, %rsp
 	pop	%rbx
+	DOS64_EXIT()
 	ret
 
 L(3):	mov	(up), %rax
@@ -184,6 +196,7 @@ L(3):	mov	(up), %rax
 	adc	%rbx, 40(rp)
 	add	$32, %rsp
 	pop	%rbx
+	DOS64_EXIT()
 	ret
 
 L(4):	mov	(up), %rax
@@ -256,6 +269,7 @@ L(4):	mov	(up), %rax
 	pop	%r12
 	pop	%rbp
 	pop	%rbx
+	DOS64_EXIT()
 	ret
 
 
@@ -780,5 +794,6 @@ L(d1):	mov	%r11, 24(rp,j,8)
 	pop	%r12
 	pop	%rbp
 	pop	%rbx
+	DOS64_EXIT()
 	ret
 EPILOGUE()
-- 
cgit v1.2.1


From b69304467882ffb2ef4e4ffe6c6876f877dc1d40 Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Mon, 28 Nov 2011 23:13:59 +0100
Subject: *** empty log message ***

---
 ChangeLog | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index 5f69c758f..6967ce66a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,22 @@
+2011-11-28  Torbjorn Granlund  <tege@gmplib.org>
+
+	* mpn/x86_64/mul_basecase.asm: Support ABI DOS64.
+	* mpn/x86_64/sqr_basecase.asm: Support ABI DOS64.
+	* mpn/x86_64/aorsmul_1.asm: Support ABI DOS64.
+	* mpn/x86_64/mul_1.asm: Support ABI DOS64.
+
+	* mpn/x86_64/x86_64-defs.m4 (DOS64_ENTRY, DOS64_EXIT): New, empty defs.
+
+	* mpn/x86_64/dos64.m4: New file.
+
+	* mpn/asm-defs.m4 (ABI_SUPPORT): New dummy macro.
+
+	* configure.in (64-bit mingw/cygwin): Define HOST_DOS64,GMP_NONSTD_ABI.
+	No longer clear out path_64.
+	(mpn code selection loop): Handle GMP_NONSTD_ABI.
+
+	* mpn/generic/udiv_w_sdiv.c: Use CNST_LIMB for some constants.
+
 2011-11-25  Torbjorn Granlund  <tege@gmplib.org>
 
 	* x86/*: Many new gmp-mparam.h file for 64-bit CPUs in 32-bit mode.
-- 
cgit v1.2.1


From cdaf5d1a1e737e7db82e6509571468fa660c043b Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Tue, 29 Nov 2011 21:46:37 +0100
Subject: Retune.

---
 mpn/x86_64/coreinhm/gmp-mparam.h | 92 +++++++++++++++++++++++++++-------------
 1 file changed, 63 insertions(+), 29 deletions(-)

diff --git a/mpn/x86_64/coreinhm/gmp-mparam.h b/mpn/x86_64/coreinhm/gmp-mparam.h
index 90cfa2be4..0a0ada3c5 100644
--- a/mpn/x86_64/coreinhm/gmp-mparam.h
+++ b/mpn/x86_64/coreinhm/gmp-mparam.h
@@ -53,58 +53,92 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #define SQR_TOOM6_THRESHOLD                318
 #define SQR_TOOM8_THRESHOLD                502
 
+#define MULMID_TOOM42_THRESHOLD             22
+
 #define MULMOD_BNM1_THRESHOLD               13
 #define SQRMOD_BNM1_THRESHOLD               13
 
-#define POWM_SEC_TABLE  2,65,322,1084
+#define POWM_SEC_TABLE  3,42,83,643,2080
 
 #define MUL_FFT_MODF_THRESHOLD             380  /* k = 5 */
 #define MUL_FFT_TABLE3                                      \
   { {    380, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
-    {     11, 5}, {     23, 6}, {     21, 7}, {     11, 6}, \
-    {     23, 7}, {     24, 8}, {     13, 7}, {     27, 8}, \
+    {     10, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     21, 7}, {     11, 6}, {     23, 7}, {     21, 8}, \
+    {     11, 7}, {     24, 8}, {     13, 7}, {     27, 8}, \
     {     15, 7}, {     31, 8}, {     21, 9}, {     11, 8}, \
     {     27, 9}, {     15, 8}, {     33, 9}, {     19, 8}, \
     {     39, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
-    {     15, 9}, {     39,10}, {     23, 9}, {     51,11}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     47,11}, \
     {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
     {     79,10}, {     47, 9}, {     95,10}, {     55,11}, \
     {     31,10}, {     79,11}, {     47,10}, {     95,12}, \
     {     31,11}, {     63,10}, {    135,11}, {     79,10}, \
-    {    159, 9}, {    319, 8}, {    639,10}, {    167,11}, \
-    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
-    {    255, 9}, {    511,11}, {    143,10}, {    287, 9}, \
-    {    575,11}, {    159,10}, {    319,12}, {     95,11}, \
-    {    191,10}, {    383,11}, {    207,13}, {   8192,14}, \
-    {  16384,15}, {  32768,16}, {  65536,17}, { 131072,18}, \
-    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
-    {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 74
+    {    159,11}, {     95,10}, {    191, 9}, {    383,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,11}, \
+    {    143,10}, {    287, 9}, {    575,10}, {    303,11}, \
+    {    159,10}, {    319,12}, {     95,11}, {    191,10}, \
+    {    383,11}, {    207,13}, {     63,12}, {    127,11}, \
+    {    255,10}, {    511,11}, {    271,10}, {    543,11}, \
+    {    287,10}, {    575,11}, {    303,12}, {    159,11}, \
+    {    319,10}, {    639,11}, {    351,10}, {    703,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,10}, \
+    {    831,12}, {    223,11}, {    447,10}, {    895,13}, \
+    {    127,12}, {    255,11}, {    511,10}, {   1023,11}, \
+    {    543,12}, {    287,11}, {    607,12}, {    319,11}, \
+    {    639,12}, {    351,11}, {    703,10}, {   1407,13}, \
+    {    191,12}, {    383,11}, {    767,12}, {    415,11}, \
+    {    831,10}, {   1663,12}, {    447,11}, {    895,12}, \
+    {    479,14}, {    127,13}, {    255,12}, {    511,11}, \
+    {   1023,12}, {    543,11}, {   1087,12}, {    575,11}, \
+    {   1151,12}, {    607,13}, {    319,12}, {    703,11}, \
+    {   1407,13}, {    383,12}, {    831,11}, {   1663,13}, \
+    {    447,12}, {    959,11}, {   1919,14}, {  16384,15}, \
+    {  32768,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 137
 #define MUL_FFT_THRESHOLD                 3712
 
-#define SQR_FFT_MODF_THRESHOLD             308  /* k = 5 */
+#define SQR_FFT_MODF_THRESHOLD             304  /* k = 5 */
 #define SQR_FFT_TABLE3                                      \
-  { {    308, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+  { {    304, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
     {     21, 7}, {     11, 6}, {     23, 7}, {     21, 8}, \
     {     11, 7}, {     24, 8}, {     13, 7}, {     27, 8}, \
     {     15, 7}, {     31, 8}, {     21, 9}, {     11, 8}, \
     {     27, 9}, {     15, 8}, {     33, 9}, {     19, 8}, \
     {     41, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
-    {     15, 9}, {     43,10}, {     23, 9}, {     47,11}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     47,11}, \
     {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
-    {     79,10}, {     47, 9}, {     95,10}, {     55,11}, \
-    {     31,10}, {     79,11}, {     47,10}, {     95,12}, \
-    {     31,11}, {     63,10}, {    127, 9}, {    255, 8}, \
-    {    511,10}, {    135,11}, {     79,10}, {    159, 9}, \
-    {    319,11}, {     95,10}, {    191, 9}, {    383, 8}, \
-    {    767,12}, {     63,10}, {    255,11}, {    143, 9}, \
-    {    575, 8}, {   1151,11}, {    159,10}, {    319, 9}, \
-    {    639,11}, {    175,12}, {     95,11}, {    191,10}, \
-    {    383,13}, {   8192,14}, {  16384,15}, {  32768,16}, \
-    {  65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
-    {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 76
-#define SQR_FFT_THRESHOLD                 3200
+    {     79,10}, {     47,11}, {     31,10}, {     79,11}, \
+    {     47,12}, {     31,11}, {     63,10}, {    127, 9}, \
+    {    255,11}, {     79,10}, {    159, 9}, {    319,11}, \
+    {     95,10}, {    191, 9}, {    383,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,11}, {    143,10}, {    287, 9}, {    575,11}, \
+    {    159,10}, {    319,11}, {    175,12}, {     95,11}, \
+    {    191,10}, {    383,11}, {    207,13}, {     63,12}, \
+    {    127,11}, {    255,10}, {    511,11}, {    271,10}, \
+    {    543,11}, {    287,10}, {    575,12}, {    159,11}, \
+    {    319,10}, {    639,11}, {    351,10}, {    703,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,10}, \
+    {    831,12}, {    223,11}, {    447,10}, {    895,11}, \
+    {    479,13}, {    127,12}, {    255,11}, {    511,10}, \
+    {   1023,11}, {    543,12}, {    287,11}, {    575,10}, \
+    {   1151,12}, {    319,11}, {    639,12}, {    351,11}, \
+    {    703,13}, {    191,12}, {    383,11}, {    767,12}, \
+    {    415,11}, {    831,12}, {    447,11}, {    895,12}, \
+    {    479,11}, {    959,14}, {    127,13}, {    255,12}, \
+    {    511,11}, {   1023,12}, {    543,11}, {   1087,12}, \
+    {    575,11}, {   1151,12}, {    607,13}, {    319,12}, \
+    {    639,11}, {   1279,12}, {    703,11}, {   1407,13}, \
+    {    383,12}, {    767,11}, {   1535,12}, {    831,13}, \
+    {    447,12}, {    959,11}, {   1919,14}, {  16384,15}, \
+    {  32768,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 137
+#define SQR_FFT_THRESHOLD                 2752
 
 #define MULLO_BASECASE_THRESHOLD             4
 #define MULLO_DC_THRESHOLD                  21
-- 
cgit v1.2.1


From 23df1f61b5f28b4bf4953acd2b069d1f09d6450f Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Tue, 29 Nov 2011 21:59:39 +0100
Subject: Add DOS64 ABI support to most files.

---
 ChangeLog                         |  4 ++++
 configure.in                      |  1 -
 mpn/x86_64/addmul_2.asm           |  7 ++++++-
 mpn/x86_64/aorrlsh1_n.asm         |  8 +++++++-
 mpn/x86_64/aorrlsh2_n.asm         |  5 ++++-
 mpn/x86_64/aorrlshC_n.asm         |  7 ++++++-
 mpn/x86_64/aorrlsh_n.asm          | 14 ++++++++++++++
 mpn/x86_64/aors_n.asm             | 34 ++++++++++++++++++++++++++--------
 mpn/x86_64/aorscnd_n.asm          | 14 ++++++++++++++
 mpn/x86_64/bdiv_dbm1c.asm         | 16 +++++++++++++++-
 mpn/x86_64/bdiv_q_1.asm           | 21 +++++++++++++++++++--
 mpn/x86_64/com.asm                |  8 ++++++--
 mpn/x86_64/copyd.asm              |  9 +++++++--
 mpn/x86_64/copyi.asm              |  9 +++++++--
 mpn/x86_64/core2/aorrlsh1_n.asm   |  5 ++++-
 mpn/x86_64/core2/aorrlsh2_n.asm   |  5 ++++-
 mpn/x86_64/core2/aorrlsh_n.asm    |  4 ++++
 mpn/x86_64/core2/aors_n.asm       | 19 ++++++++++++++++---
 mpn/x86_64/core2/aorsmul_1.asm    |  8 +++++++-
 mpn/x86_64/core2/lshift.asm       | 39 +++++++++++++++++++++++----------------
 mpn/x86_64/core2/lshiftc.asm      | 39 +++++++++++++++++++++++----------------
 mpn/x86_64/core2/rsh1aors_n.asm   | 17 ++++++++++++++++-
 mpn/x86_64/core2/rshift.asm       | 39 +++++++++++++++++++++++----------------
 mpn/x86_64/core2/sublsh1_n.asm    |  5 ++++-
 mpn/x86_64/core2/sublsh2_n.asm    |  5 ++++-
 mpn/x86_64/core2/sublshC_n.asm    |  4 +++-
 mpn/x86_64/coreinhm/aorrlsh_n.asm | 17 +++++++++++++++++
 mpn/x86_64/coreisbr/aors_n.asm    | 14 ++++++++++++++
 mpn/x86_64/invert_limb.asm        |  6 +++++-
 mpn/x86_64/invert_limb_table.asm  |  3 +++
 mpn/x86_64/logops_n.asm           | 16 ++++++++++++----
 mpn/x86_64/lshift.asm             | 11 ++++++++++-
 mpn/x86_64/lshiftc.asm            |  7 ++++++-
 mpn/x86_64/lshsub_n.asm           | 16 +++++++++++++++-
 mpn/x86_64/mod_1_1.asm            |  7 +++++++
 mpn/x86_64/mod_1_2.asm            |  9 ++++++++-
 mpn/x86_64/mod_1_4.asm            | 14 +++++++++++---
 mpn/x86_64/mod_34lsub1.asm        | 12 +++++++++---
 mpn/x86_64/mul_2.asm              |  7 ++++++-
 mpn/x86_64/mulmid_basecase.asm    | 14 +++++++++++++-
 mpn/x86_64/popham.asm             | 12 +++++++++---
 mpn/x86_64/redc_1.asm             |  5 +++++
 mpn/x86_64/rsh1aors_n.asm         | 17 ++++++++++++++++-
 mpn/x86_64/rshift.asm             |  7 ++++++-
 mpn/x86_64/sqr_basecase.asm       |  8 --------
 mpn/x86_64/sublsh1_n.asm          |  7 ++++++-
 mpn/x86_64/tabselect.asm          | 14 ++++++++++++++
 47 files changed, 458 insertions(+), 111 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 6967ce66a..01c275bd2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2011-11-29  Torbjorn Granlund  <tege@gmplib.org>
+
+	* mpn/x86_64: Add DOS64 ABI support to most files.
+
 2011-11-28  Torbjorn Granlund  <tege@gmplib.org>
 
 	* mpn/x86_64/mul_basecase.asm: Support ABI DOS64.
diff --git a/configure.in b/configure.in
index 5b7cf188a..601d6348f 100644
--- a/configure.in
+++ b/configure.in
@@ -1640,7 +1640,6 @@ case $host in
 	    ;;
 	  *-*-mingw* | *-*-cygwin)
 	    limb_64=longlong
-	    extra_functions_64="" # FIXME: remove when invert_limb is ported
 	    CALLING_CONVENTIONS_OBJS_64=""
 	    AC_DEFINE(HOST_DOS64,1,[Define to 1 for Windos/64])
 	    AC_SUBST(GMP_NONSTD_ABI,DOS64)
diff --git a/mpn/x86_64/addmul_2.asm b/mpn/x86_64/addmul_2.asm
index 107c3dafe..5c6647888 100644
--- a/mpn/x86_64/addmul_2.asm
+++ b/mpn/x86_64/addmul_2.asm
@@ -50,10 +50,14 @@ define(`w2', `%rbp')
 define(`w3', `%r10')
 define(`n',  `%r11')
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
+ASM_START()
 	TEXT
 	ALIGN(16)
-ASM_START()
 PROLOGUE(mpn_addmul_2)
+	DOS64_ENTRY(4)
 	mov	n_param, n
 	push	%rbx
 	push	%rbp
@@ -164,6 +168,7 @@ L(end):	xor	R32(w1), R32(w1)
 
 	pop	%rbp
 	pop	%rbx
+	DOS64_EXIT()
 	ret
 EPILOGUE()
 
diff --git a/mpn/x86_64/aorrlsh1_n.asm b/mpn/x86_64/aorrlsh1_n.asm
index 2ea556b73..dda7d590e 100644
--- a/mpn/x86_64/aorrlsh1_n.asm
+++ b/mpn/x86_64/aorrlsh1_n.asm
@@ -1,7 +1,8 @@
 dnl  AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
 dnl  AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
 
-dnl  Copyright 2003, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc.
+dnl  Copyright 2003, 2005, 2006, 2007, 2008, 2009, 2011 Free Software
+dnl  Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -54,10 +55,14 @@ ifdef(`OPERATION_rsblsh1_n', `
 
 MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(func)
+	DOS64_ENTRY(4)
 	push	%rbp
 
 	mov	(vp), %r8
@@ -147,5 +152,6 @@ ifdef(`OPERATION_rsblsh1_n',`
 	movslq	R32(%rbp), %rax')
 
 	pop	%rbp
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/aorrlsh2_n.asm b/mpn/x86_64/aorrlsh2_n.asm
index 6d55cfd10..8c427a674 100644
--- a/mpn/x86_64/aorrlsh2_n.asm
+++ b/mpn/x86_64/aorrlsh2_n.asm
@@ -3,7 +3,7 @@ dnl  AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
 
 dnl  Contributed to the GNU project by Torbjorn Granlund.
 
-dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
+dnl  Copyright 2009, 2010, 2011 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -36,4 +36,7 @@ ifdef(`OPERATION_rsblsh2_n',`
 
 MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n)
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 include_mpn(`x86_64/aorrlshC_n.asm')
diff --git a/mpn/x86_64/aorrlshC_n.asm b/mpn/x86_64/aorrlshC_n.asm
index cab0b07f4..ae9a9d952 100644
--- a/mpn/x86_64/aorrlshC_n.asm
+++ b/mpn/x86_64/aorrlshC_n.asm
@@ -1,7 +1,7 @@
 dnl  AMD64 mpn_addlshC_n -- rp[] = up[] + (vp[] << C)
 dnl  AMD64 mpn_rsblshC_n -- rp[] = (vp[] << C) - up[]
 
-dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
+dnl  Copyright 2009, 2010, 2011 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -37,10 +37,14 @@ define(`n',	`%rcx')
 
 define(M, eval(m4_lshift(1,LSH)))
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(func)
+	DOS64_ENTRY(4)
 	push	%r12
 	push	%r13
 	push	%r14
@@ -140,5 +144,6 @@ ifelse(ADDSUB,add,`
 	pop	%r14
 	pop	%r13
 	pop	%r12
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/aorrlsh_n.asm b/mpn/x86_64/aorrlsh_n.asm
index d19dea535..8ab3688d2 100644
--- a/mpn/x86_64/aorrlsh_n.asm
+++ b/mpn/x86_64/aorrlsh_n.asm
@@ -56,10 +56,23 @@ ifdef(`OPERATION_rsblsh_n',`
 
 MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
 
+ifdef(`HOST_DOS64',`
+  define(`IFDOS',   `$1')
+  define(`IFELF',   `')
+',`
+  define(`IFDOS',   `')
+  define(`IFELF',   `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(func)
+	DOS64_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
 	push	%r12
 	push	%r13
 	push	%r14
@@ -155,5 +168,6 @@ L(end):	add	R32(%rbx), R32(%rbx)
 	pop	%r14
 	pop	%r13
 	pop	%r12
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/aors_n.asm b/mpn/x86_64/aors_n.asm
index 916e9b664..eadde641b 100644
--- a/mpn/x86_64/aors_n.asm
+++ b/mpn/x86_64/aors_n.asm
@@ -1,7 +1,7 @@
 dnl  AMD64 mpn_add_n, mpn_sub_n
 
-dnl  Copyright 2003, 2004, 2005, 2007, 2008, 2010 Free Software Foundation,
-dnl  Inc.
+dnl  Copyright 2003, 2004, 2005, 2007, 2008, 2010, 2011 Free Software
+dnl  Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -30,15 +30,15 @@ C Intel SBR	 1.59
 C Intel atom	 4
 C VIA nano	 3.25
 
-C The inner loop of this code is the result of running a code generation and
+C The loop of this code is the result of running a code generation and
 C optimization tool suite written by David Harvey and Torbjorn Granlund.
 
 C INPUT PARAMETERS
-define(`rp',	`%rdi')
-define(`up',	`%rsi')
-define(`vp',	`%rdx')
-define(`n',	`%rcx')
-define(`cy',	`%r8')		C (only for mpn_add_nc)
+define(`rp',	`%rdi')	C rcx
+define(`up',	`%rsi')	C rdx
+define(`vp',	`%rdx')	C r8
+define(`n',	`%rcx')	C r9
+define(`cy',	`%r8')	C rsp+40    (only for mpn_add_nc)
 
 ifdef(`OPERATION_add_n', `
 	define(ADCSBB,	      adc)
@@ -51,10 +51,23 @@ ifdef(`OPERATION_sub_n', `
 
 MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
 
+ifdef(`HOST_DOS64',`
+  define(`IFDOS',   `$1')
+  define(`IFELF',   `')
+',`
+  define(`IFDOS',   `')
+  define(`IFELF',   `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(func_nc)
+	DOS64_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
 	mov	R32(n), R32(%rax)
 	shr	$2, n
 	and	$3, R32(%rax)
@@ -69,6 +82,7 @@ PROLOGUE(func_nc)
 EPILOGUE()
 	ALIGN(16)
 PROLOGUE(func)
+	DOS64_ENTRY(4)
 	mov	R32(n), R32(%rax)
 	shr	$2, n
 	and	$3, R32(%rax)
@@ -85,6 +99,7 @@ L(lt4):	dec	R32(%rax)
 	ADCSBB	(vp), %r8
 	mov	%r8, (rp)
 	adc	R32(%rax), R32(%rax)
+	DOS64_EXIT()
 	ret
 
 L(2):	dec	R32(%rax)
@@ -95,6 +110,7 @@ L(2):	dec	R32(%rax)
 	mov	%r8, (rp)
 	mov	%r9, 8(rp)
 	adc	R32(%rax), R32(%rax)
+	DOS64_EXIT()
 	ret
 
 L(3):	mov	16(up), %r10
@@ -105,6 +121,7 @@ L(3):	mov	16(up), %r10
 	mov	%r9, 8(rp)
 	mov	%r10, 16(rp)
 	setc	R8(%rax)
+	DOS64_EXIT()
 	ret
 
 	ALIGN(16)
@@ -142,5 +159,6 @@ L(end):	lea	32(up), up
 	dec	R32(%rax)
 	jnz	L(lt4)
 	adc	R32(%rax), R32(%rax)
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/aorscnd_n.asm b/mpn/x86_64/aorscnd_n.asm
index 19ea42f2a..d22a2a218 100644
--- a/mpn/x86_64/aorscnd_n.asm
+++ b/mpn/x86_64/aorscnd_n.asm
@@ -59,10 +59,23 @@ ifdef(`OPERATION_subcnd_n', `
 
 MULFUNC_PROLOGUE(mpn_addcnd_n mpn_subcnd_n)
 
+ifdef(`HOST_DOS64',`
+  define(`IFDOS',   `$1')
+  define(`IFELF',   `')
+',`
+  define(`IFDOS',   `')
+  define(`IFELF',   `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(func)
+	DOS64_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
 	push	%rbx
 	push	%rbp
 	push	%r12
@@ -160,5 +173,6 @@ L(end):	neg	R32(%rax)
 	pop	%r12
 	pop	%rbp
 	pop	%rbx
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/bdiv_dbm1c.asm b/mpn/x86_64/bdiv_dbm1c.asm
index f6a77507d..0fef478d9 100644
--- a/mpn/x86_64/bdiv_dbm1c.asm
+++ b/mpn/x86_64/bdiv_dbm1c.asm
@@ -41,10 +41,23 @@ define(`cy',	  `%r8')
 
 define(`n',       `%r9')
 
+ifdef(`HOST_DOS64',`
+  define(`IFDOS',   `$1')
+  define(`IFELF',   `')
+',`
+  define(`IFDOS',   `')
+  define(`IFELF',   `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_bdiv_dbm1c)
+	DOS64_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
 	mov	(up), %rax
 	mov	n_param, n
 	mov	R32(n_param), R32(%r11)
@@ -84,6 +97,7 @@ L(lo1):	sub	%rax, %r8
 	add	$4, n
 	jnz	L(top)
 
-L(end):	mov	%r8, %rax
+	mov	%r8, %rax
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/bdiv_q_1.asm b/mpn/x86_64/bdiv_q_1.asm
index 01624a52a..e1e1db5a5 100644
--- a/mpn/x86_64/bdiv_q_1.asm
+++ b/mpn/x86_64/bdiv_q_1.asm
@@ -1,8 +1,8 @@
 dnl  AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- schoolbook Hensel division by
 dnl  1-limb divisor, returning quotient only.
 
-dnl  Copyright 2001, 2002, 2004, 2005, 2006, 2009 Free Software Foundation,
-dnl  Inc.
+dnl  Copyright 2001, 2002, 2004, 2005, 2006, 2009, 2011 Free Software
+dnl  Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -41,10 +41,22 @@ C di		r8	just mpn_pi1_bdiv_q_1
 C shift		r9	just mpn_pi1_bdiv_q_1
 
 
+ifdef(`HOST_DOS64',`
+  define(`IFDOS',   `$1')
+  define(`IFELF',   `')
+',`
+  define(`IFDOS',   `')
+  define(`IFELF',   `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_bdiv_q_1)
+	DOS64_ENTRY(4)
 	push	%rbx
 
 	mov	%rcx, %rax
@@ -91,6 +103,9 @@ L(evn):	bsf	%rax, %rcx
 EPILOGUE()
 
 PROLOGUE(mpn_pi1_bdiv_q_1)
+	DOS64_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+IFDOS(`	mov	64(%rsp), %r9	')
 	push	%rbx
 
 	mov	%rcx, %r11		C d
@@ -144,11 +159,13 @@ L(ent):	imul	%r8, %rax
 	imul	%r8, %rax
 	mov	%rax, (%rdi)
 	pop	%rbx
+	DOS64_EXIT()
 	ret
 
 L(one):	shr	R8(%rcx), %rax
 	imul	%r8, %rax
 	mov	%rax, (%rdi)
 	pop	%rbx
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/com.asm b/mpn/x86_64/com.asm
index 6ff62eeac..3a232fc20 100644
--- a/mpn/x86_64/com.asm
+++ b/mpn/x86_64/com.asm
@@ -1,6 +1,6 @@
 dnl  AMD64 mpn_com.
 
-dnl  Copyright 2004, 2005, 2006 Free Software Foundation, Inc.
+dnl  Copyright 2004, 2005, 2006, 2011 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -34,11 +34,14 @@ define(`rp',`%rdi')
 define(`up',`%rsi')
 define(`n',`%rdx')
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
 
 ASM_START()
 	TEXT
 	ALIGN(32)
 PROLOGUE(mpn_com)
+	DOS64_ENTRY(3)
 	movq	(up), %r8
 	movl	R32(%rdx), R32(%rax)
 	leaq	(up,n,8), up
@@ -76,5 +79,6 @@ L(e10):	movq	24(up,n,8), %r9
 	movq	%r9, 24(rp,n,8)
 	addq	$4, n
 	jnc	L(oop)
-L(ret):	ret
+L(ret):	DOS64_EXIT()
+	ret
 EPILOGUE()
diff --git a/mpn/x86_64/copyd.asm b/mpn/x86_64/copyd.asm
index 13210217b..15e929f4e 100644
--- a/mpn/x86_64/copyd.asm
+++ b/mpn/x86_64/copyd.asm
@@ -1,6 +1,6 @@
 dnl  AMD64 mpn_copyd -- copy limb vector, decrementing.
 
-dnl  Copyright 2003, 2005, 2007 Free Software Foundation, Inc.
+dnl  Copyright 2003, 2005, 2007, 2011 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -39,10 +39,14 @@ define(`rp',`%rdi')
 define(`up',`%rsi')
 define(`n',`%rdx')
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_copyd)
+	DOS64_ENTRY(3)
 	leaq	-8(up,n,8), up
 	leaq	(rp,n,8), rp
 	subq	$4, n
@@ -73,5 +77,6 @@ L(end):	shrl	R32(%rdx)		C edx = lowpart(n)
 	movq	-8(up), %r9
 	movq	%r8, -8(rp)
 	movq	%r9, -16(rp)
-1:	ret
+1:	DOS64_EXIT()
+	ret
 EPILOGUE()
diff --git a/mpn/x86_64/copyi.asm b/mpn/x86_64/copyi.asm
index d5cbdd644..1dd6c3168 100644
--- a/mpn/x86_64/copyi.asm
+++ b/mpn/x86_64/copyi.asm
@@ -1,6 +1,6 @@
 dnl  AMD64 mpn_copyi -- copy limb vector, incrementing.
 
-dnl  Copyright 2003, 2005, 2007 Free Software Foundation, Inc.
+dnl  Copyright 2003, 2005, 2007, 2011 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -39,10 +39,14 @@ define(`rp',`%rdi')
 define(`up',`%rsi')
 define(`n',`%rdx')
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_copyi)
+	DOS64_ENTRY(3)
 	leaq	-8(rp), rp
 	subq	$4, n
 	jc	L(end)
@@ -72,5 +76,6 @@ L(end):	shrl	R32(%rdx)		C edx = lowpart(n)
 	movq	8(up), %r9
 	movq	%r8, 8(rp)
 	movq	%r9, 16(rp)
-1:	ret
+1:	DOS64_EXIT()
+	ret
 EPILOGUE()
diff --git a/mpn/x86_64/core2/aorrlsh1_n.asm b/mpn/x86_64/core2/aorrlsh1_n.asm
index 346c21f33..e44e718a6 100644
--- a/mpn/x86_64/core2/aorrlsh1_n.asm
+++ b/mpn/x86_64/core2/aorrlsh1_n.asm
@@ -3,7 +3,7 @@ dnl  AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
 
 dnl  Contributed to the GNU project by Torbjorn Granlund.
 
-dnl  Copyright 2008, 2010 Free Software Foundation, Inc.
+dnl  Copyright 2008, 2010, 2011 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -36,4 +36,7 @@ ifdef(`OPERATION_rsblsh1_n', `
 
 MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 include_mpn(`x86_64/aorrlshC_n.asm')
diff --git a/mpn/x86_64/core2/aorrlsh2_n.asm b/mpn/x86_64/core2/aorrlsh2_n.asm
index 1da0c527f..2d9c89553 100644
--- a/mpn/x86_64/core2/aorrlsh2_n.asm
+++ b/mpn/x86_64/core2/aorrlsh2_n.asm
@@ -3,7 +3,7 @@ dnl  AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
 
 dnl  Contributed to the GNU project by Torbjorn Granlund.
 
-dnl  Copyright 2008, 2010 Free Software Foundation, Inc.
+dnl  Copyright 2008, 2010, 2011 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -36,4 +36,7 @@ ifdef(`OPERATION_rsblsh2_n', `
 
 MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n)
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 include_mpn(`x86_64/aorrlshC_n.asm')
diff --git a/mpn/x86_64/core2/aorrlsh_n.asm b/mpn/x86_64/core2/aorrlsh_n.asm
index 8d03970ca..a8f5c051a 100644
--- a/mpn/x86_64/core2/aorrlsh_n.asm
+++ b/mpn/x86_64/core2/aorrlsh_n.asm
@@ -20,4 +20,8 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 include(`../config.m4')
 
 MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 include_mpn(`x86_64/coreinhm/aorrlsh_n.asm')
diff --git a/mpn/x86_64/core2/aors_n.asm b/mpn/x86_64/core2/aors_n.asm
index 75807c79a..bc109cc22 100644
--- a/mpn/x86_64/core2/aors_n.asm
+++ b/mpn/x86_64/core2/aors_n.asm
@@ -1,6 +1,6 @@
 dnl  Intel P6-15 mpn_add_n/mpn_sub_n -- mpn add or subtract.
 
-dnl  Copyright 2006, 2007 Free Software Foundation, Inc.
+dnl  Copyright 2006, 2007, 2011 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -48,16 +48,28 @@ ifdef(`OPERATION_sub_n', `
 
 MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
 
-ASM_START()
+ifdef(`HOST_DOS64',`
+  define(`IFDOS',   `$1')
+  define(`IFELF',   `')
+',`
+  define(`IFDOS',   `')
+  define(`IFELF',   `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
 
+ASM_START()
 	TEXT
 	ALIGN(16)
-
 PROLOGUE(func_nc)
+	DOS64_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
 	jmp	L(start)
 EPILOGUE()
 
 PROLOGUE(func)
+	DOS64_ENTRY(4)
 	xor	%r8, %r8
 L(start):
 	mov	(up), %r10
@@ -96,6 +108,7 @@ L(end):	ADCSBB	%r11, %r10
 	mov	%r10, 8(rp)
 	mov	R32(%rcx), R32(%rax)	C clear eax, ecx contains 0
 	adc	R32(%rax), R32(%rax)
+	DOS64_EXIT()
 	ret
 
 	ALIGN(16)
diff --git a/mpn/x86_64/core2/aorsmul_1.asm b/mpn/x86_64/core2/aorsmul_1.asm
index bb4f663c4..aeda30159 100644
--- a/mpn/x86_64/core2/aorsmul_1.asm
+++ b/mpn/x86_64/core2/aorsmul_1.asm
@@ -1,6 +1,7 @@
 dnl  x86-64 mpn_addmul_1 and mpn_submul_1, optimized for "Core 2".
 
-dnl  Copyright 2003, 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc.
+dnl  Copyright 2003, 2004, 2005, 2007, 2008, 2009, 2011 Free Software
+dnl  Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -45,10 +46,14 @@ ifdef(`OPERATION_submul_1',`
 
 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(func)
+	DOS64_ENTRY(4)
 	push	%rbx
 	push	%rbp
 	lea	(%rdx), %rbx
@@ -127,5 +132,6 @@ L(n1):	mov	8(rp), %r10
 	adc	%rdx, %rax
 	pop	%rbp
 	pop	%rbx
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/core2/lshift.asm b/mpn/x86_64/core2/lshift.asm
index 3b17e8315..2e175de76 100644
--- a/mpn/x86_64/core2/lshift.asm
+++ b/mpn/x86_64/core2/lshift.asm
@@ -1,6 +1,6 @@
 dnl  x86-64 mpn_lshift optimized for "Core 2".
 
-dnl  Copyright 2007, 2009 Free Software Foundation, Inc.
+dnl  Copyright 2007, 2009, 2011 Free Software Foundation, Inc.
 dnl
 dnl  This file is part of the GNU MP Library.
 dnl
@@ -35,12 +35,16 @@ C INPUT PARAMETERS
 define(`rp',	`%rdi')
 define(`up',	`%rsi')
 define(`n',	`%rdx')
-define(`cnt',	`%cl')
+define(`cnt',	`%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
 
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_lshift)
+	DOS64_ENTRY(4)
 	lea	-8(rp,n,8), rp
 	lea	-8(up,n,8), up
 
@@ -51,7 +55,7 @@ L(b00):	C n = 4, 8, 12, ...
 	mov	(up), %r10
 	mov	-8(up), %r11
 	xor	R32(%rax), R32(%rax)
-	shld	R8(%rcx), %r10, %rax
+	shld	R8(cnt), %r10, %rax
 	mov	-16(up), %r8
 	lea	24(rp), rp
 	sub	$4, n
@@ -62,7 +66,7 @@ L(nb00):C n = 1, 5, 9, ...
 	jae	L(nb01)
 L(b01):	mov	(up), %r9
 	xor	R32(%rax), R32(%rax)
-	shld	R8(%rcx), %r9, %rax
+	shld	R8(cnt), %r9, %rax
 	sub	$2, n
 	jb	L(le1)
 	mov	-8(up), %r10
@@ -70,8 +74,9 @@ L(b01):	mov	(up), %r9
 	lea	-8(up), up
 	lea	16(rp), rp
 	jmp	L(01)
-L(le1):	shl	R8(%rcx), %r9
+L(le1):	shl	R8(cnt), %r9
 	mov	%r9, (rp)
+	DOS64_EXIT()
 	ret
 
 L(nb01):C n = 2, 6, 10, ...
@@ -79,17 +84,18 @@ L(nb01):C n = 2, 6, 10, ...
 L(b10):	mov	(up), %r8
 	mov	-8(up), %r9
 	xor	R32(%rax), R32(%rax)
-	shld	R8(%rcx), %r8, %rax
+	shld	R8(cnt), %r8, %rax
 	sub	$3, n
 	jb	L(le2)
 	mov	-16(up), %r10
 	lea	-16(up), up
 	lea	8(rp), rp
 	jmp	L(10)
-L(le2):	shld	R8(%rcx), %r9, %r8
+L(le2):	shld	R8(cnt), %r9, %r8
 	mov	%r8, (rp)
-	shl	R8(%rcx), %r9
+	shl	R8(cnt), %r9
 	mov	%r9, -8(rp)
+	DOS64_EXIT()
 	ret
 
 	ALIGN(16)			C performance critical!
@@ -97,23 +103,23 @@ L(b11):	C n = 3, 7, 11, ...
 	mov	(up), %r11
 	mov	-8(up), %r8
 	xor	R32(%rax), R32(%rax)
-	shld	R8(%rcx), %r11, %rax
+	shld	R8(cnt), %r11, %rax
 	mov	-16(up), %r9
 	lea	-24(up), up
 	sub	$4, n
 	jb	L(end)
 
 	ALIGN(16)
-L(top):	shld	R8(%rcx), %r8, %r11
+L(top):	shld	R8(cnt), %r8, %r11
 	mov	(up), %r10
 	mov	%r11, (rp)
-L(10):	shld	R8(%rcx), %r9, %r8
+L(10):	shld	R8(cnt), %r9, %r8
 	mov	-8(up), %r11
 	mov	%r8, -8(rp)
-L(01):	shld	R8(%rcx), %r10, %r9
+L(01):	shld	R8(cnt), %r10, %r9
 	mov	-16(up), %r8
 	mov	%r9, -16(rp)
-L(00):	shld	R8(%rcx), %r11, %r10
+L(00):	shld	R8(cnt), %r11, %r10
 	mov	-24(up), %r9
 	mov	%r10, -24(rp)
 	add	$-32, up
@@ -121,11 +127,12 @@ L(00):	shld	R8(%rcx), %r11, %r10
 	sub	$4, n
 	jnc	L(top)
 
-L(end):	shld	R8(%rcx), %r8, %r11
+L(end):	shld	R8(cnt), %r8, %r11
 	mov	%r11, (rp)
-	shld	R8(%rcx), %r9, %r8
+	shld	R8(cnt), %r9, %r8
 	mov	%r8, -8(rp)
-	shl	R8(%rcx), %r9
+	shl	R8(cnt), %r9
 	mov	%r9, -16(rp)
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/core2/lshiftc.asm b/mpn/x86_64/core2/lshiftc.asm
index a19f72297..31a08f7ae 100644
--- a/mpn/x86_64/core2/lshiftc.asm
+++ b/mpn/x86_64/core2/lshiftc.asm
@@ -1,6 +1,6 @@
 dnl  x86-64 mpn_lshiftc optimized for "Core 2".
 
-dnl  Copyright 2007, 2009 Free Software Foundation, Inc.
+dnl  Copyright 2007, 2009, 2011 Free Software Foundation, Inc.
 dnl
 dnl  This file is part of the GNU MP Library.
 dnl
@@ -35,12 +35,16 @@ C INPUT PARAMETERS
 define(`rp',	`%rdi')
 define(`up',	`%rsi')
 define(`n',	`%rdx')
-define(`cnt',	`%cl')
+define(`cnt',	`%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
 
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_lshiftc)
+	DOS64_ENTRY(4)
 	lea	-8(rp,n,8), rp
 	lea	-8(up,n,8), up
 
@@ -51,7 +55,7 @@ L(b00):	C n = 4, 8, 12, ...
 	mov	(up), %r10
 	mov	-8(up), %r11
 	xor	R32(%rax), R32(%rax)
-	shld	R8(%rcx), %r10, %rax
+	shld	R8(cnt), %r10, %rax
 	mov	-16(up), %r8
 	lea	24(rp), rp
 	sub	$4, n
@@ -62,7 +66,7 @@ L(nb00):C n = 1, 5, 9, ...
 	jae	L(nb01)
 L(b01):	mov	(up), %r9
 	xor	R32(%rax), R32(%rax)
-	shld	R8(%rcx), %r9, %rax
+	shld	R8(cnt), %r9, %rax
 	sub	$2, n
 	jb	L(le1)
 	mov	-8(up), %r10
@@ -70,9 +74,10 @@ L(b01):	mov	(up), %r9
 	lea	-8(up), up
 	lea	16(rp), rp
 	jmp	L(01)
-L(le1):	shl	R8(%rcx), %r9
+L(le1):	shl	R8(cnt), %r9
 	not	%r9
 	mov	%r9, (rp)
+	DOS64_EXIT()
 	ret
 
 L(nb01):C n = 2, 6, 10, ...
@@ -80,19 +85,20 @@ L(nb01):C n = 2, 6, 10, ...
 L(b10):	mov	(up), %r8
 	mov	-8(up), %r9
 	xor	R32(%rax), R32(%rax)
-	shld	R8(%rcx), %r8, %rax
+	shld	R8(cnt), %r8, %rax
 	sub	$3, n
 	jb	L(le2)
 	mov	-16(up), %r10
 	lea	-16(up), up
 	lea	8(rp), rp
 	jmp	L(10)
-L(le2):	shld	R8(%rcx), %r9, %r8
+L(le2):	shld	R8(cnt), %r9, %r8
 	not	%r8
 	mov	%r8, (rp)
-	shl	R8(%rcx), %r9
+	shl	R8(cnt), %r9
 	not	%r9
 	mov	%r9, -8(rp)
+	DOS64_EXIT()
 	ret
 
 	ALIGN(16)			C performance critical!
@@ -100,26 +106,26 @@ L(b11):	C n = 3, 7, 11, ...
 	mov	(up), %r11
 	mov	-8(up), %r8
 	xor	R32(%rax), R32(%rax)
-	shld	R8(%rcx), %r11, %rax
+	shld	R8(cnt), %r11, %rax
 	mov	-16(up), %r9
 	lea	-24(up), up
 	sub	$4, n
 	jb	L(end)
 
 	ALIGN(16)
-L(top):	shld	R8(%rcx), %r8, %r11
+L(top):	shld	R8(cnt), %r8, %r11
 	mov	(up), %r10
 	not	%r11
 	mov	%r11, (rp)
-L(10):	shld	R8(%rcx), %r9, %r8
+L(10):	shld	R8(cnt), %r9, %r8
 	mov	-8(up), %r11
 	not	%r8
 	mov	%r8, -8(rp)
-L(01):	shld	R8(%rcx), %r10, %r9
+L(01):	shld	R8(cnt), %r10, %r9
 	mov	-16(up), %r8
 	not	%r9
 	mov	%r9, -16(rp)
-L(00):	shld	R8(%rcx), %r11, %r10
+L(00):	shld	R8(cnt), %r11, %r10
 	mov	-24(up), %r9
 	not	%r10
 	mov	%r10, -24(rp)
@@ -128,14 +134,15 @@ L(00):	shld	R8(%rcx), %r11, %r10
 	sub	$4, n
 	jnc	L(top)
 
-L(end):	shld	R8(%rcx), %r8, %r11
+L(end):	shld	R8(cnt), %r8, %r11
 	not	%r11
 	mov	%r11, (rp)
-	shld	R8(%rcx), %r9, %r8
+	shld	R8(cnt), %r9, %r8
 	not	%r8
 	mov	%r8, -8(rp)
-	shl	R8(%rcx), %r9
+	shl	R8(cnt), %r9
 	not	%r9
 	mov	%r9, -16(rp)
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/core2/rsh1aors_n.asm b/mpn/x86_64/core2/rsh1aors_n.asm
index eb52efc08..b350e4a43 100644
--- a/mpn/x86_64/core2/rsh1aors_n.asm
+++ b/mpn/x86_64/core2/rsh1aors_n.asm
@@ -1,6 +1,6 @@
 dnl  Intel P6/64 mpn_rsh1add_n and mpn_rsh1sub_n -- rp[] = (up[] +- vp[]) >> 1
 
-dnl  Copyright 2003, 2005, 2009, 2010 Free Software Foundation, Inc.
+dnl  Copyright 2003, 2005, 2009, 2010, 2011 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -49,11 +49,24 @@ ifdef(`OPERATION_rsh1sub_n', `
 
 MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
 
+ifdef(`HOST_DOS64',`
+  define(`IFDOS',   `$1')
+  define(`IFELF',   `')
+',`
+  define(`IFDOS',   `')
+  define(`IFELF',   `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 
 	ALIGN(16)
 PROLOGUE(func_nc)
+	DOS64_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
 	push	%rbx
 	push	%rbp
 
@@ -66,6 +79,7 @@ EPILOGUE()
 
 	ALIGN(16)
 PROLOGUE(func_n)
+	DOS64_ENTRY(4)
 	push	%rbx
 	push	%rbp
 
@@ -171,5 +185,6 @@ L(end):	shrd	$1, %rbx, %rbp
 	mov	%rbp, (rp)
 	pop	%rbp
 	pop	%rbx
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/core2/rshift.asm b/mpn/x86_64/core2/rshift.asm
index 38a77364f..68306881c 100644
--- a/mpn/x86_64/core2/rshift.asm
+++ b/mpn/x86_64/core2/rshift.asm
@@ -1,6 +1,6 @@
 dnl  x86-64 mpn_rshift optimized for "Core 2".
 
-dnl  Copyright 2007, 2009 Free Software Foundation, Inc.
+dnl  Copyright 2007, 2009, 2011 Free Software Foundation, Inc.
 dnl
 dnl  This file is part of the GNU MP Library.
 dnl
@@ -35,12 +35,16 @@ C INPUT PARAMETERS
 define(`rp',	`%rdi')
 define(`up',	`%rsi')
 define(`n',	`%rdx')
-define(`cnt',	`%cl')
+define(`cnt',	`%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
 
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_rshift)
+	DOS64_ENTRY(4)
 	mov	R32(%rdx), R32(%rax)
 	and	$3, R32(%rax)
 	jne	L(nb00)
@@ -48,7 +52,7 @@ L(b00):	C n = 4, 8, 12, ...
 	mov	(up), %r10
 	mov	8(up), %r11
 	xor	R32(%rax), R32(%rax)
-	shrd	R8(%rcx), %r10, %rax
+	shrd	R8(cnt), %r10, %rax
 	mov	16(up), %r8
 	lea	8(up), up
 	lea	-24(rp), rp
@@ -60,7 +64,7 @@ L(nb00):C n = 1, 5, 9, ...
 	jae	L(nb01)
 L(b01):	mov	(up), %r9
 	xor	R32(%rax), R32(%rax)
-	shrd	R8(%rcx), %r9, %rax
+	shrd	R8(cnt), %r9, %rax
 	sub	$2, n
 	jb	L(le1)
 	mov	8(up), %r10
@@ -68,8 +72,9 @@ L(b01):	mov	(up), %r9
 	lea	16(up), up
 	lea	-16(rp), rp
 	jmp	L(01)
-L(le1):	shr	R8(%rcx), %r9
+L(le1):	shr	R8(cnt), %r9
 	mov	%r9, (rp)
+	DOS64_EXIT()
 	ret
 
 L(nb01):C n = 2, 6, 10, ...
@@ -77,17 +82,18 @@ L(nb01):C n = 2, 6, 10, ...
 L(b10):	mov	(up), %r8
 	mov	8(up), %r9
 	xor	R32(%rax), R32(%rax)
-	shrd	R8(%rcx), %r8, %rax
+	shrd	R8(cnt), %r8, %rax
 	sub	$3, n
 	jb	L(le2)
 	mov	16(up), %r10
 	lea	24(up), up
 	lea	-8(rp), rp
 	jmp	L(10)
-L(le2):	shrd	R8(%rcx), %r9, %r8
+L(le2):	shrd	R8(cnt), %r9, %r8
 	mov	%r8, (rp)
-	shr	R8(%rcx), %r9
+	shr	R8(cnt), %r9
 	mov	%r9, 8(rp)
+	DOS64_EXIT()
 	ret
 
 	ALIGN(16)
@@ -95,23 +101,23 @@ L(b11):	C n = 3, 7, 11, ...
 	mov	(up), %r11
 	mov	8(up), %r8
 	xor	R32(%rax), R32(%rax)
-	shrd	R8(%rcx), %r11, %rax
+	shrd	R8(cnt), %r11, %rax
 	mov	16(up), %r9
 	lea	32(up), up
 	sub	$4, n
 	jb	L(end)
 
 	ALIGN(16)
-L(top):	shrd	R8(%rcx), %r8, %r11
+L(top):	shrd	R8(cnt), %r8, %r11
 	mov	-8(up), %r10
 	mov	%r11, (rp)
-L(10):	shrd	R8(%rcx), %r9, %r8
+L(10):	shrd	R8(cnt), %r9, %r8
 	mov	(up), %r11
 	mov	%r8, 8(rp)
-L(01):	shrd	R8(%rcx), %r10, %r9
+L(01):	shrd	R8(cnt), %r10, %r9
 	mov	8(up), %r8
 	mov	%r9, 16(rp)
-L(00):	shrd	R8(%rcx), %r11, %r10
+L(00):	shrd	R8(cnt), %r11, %r10
 	mov	16(up), %r9
 	mov	%r10, 24(rp)
 	add	$32, up
@@ -119,11 +125,12 @@ L(00):	shrd	R8(%rcx), %r11, %r10
 	sub	$4, n
 	jnc	L(top)
 
-L(end):	shrd	R8(%rcx), %r8, %r11
+L(end):	shrd	R8(cnt), %r8, %r11
 	mov	%r11, (rp)
-	shrd	R8(%rcx), %r9, %r8
+	shrd	R8(cnt), %r9, %r8
 	mov	%r8, 8(rp)
-	shr	R8(%rcx), %r9
+	shr	R8(cnt), %r9
 	mov	%r9, 16(rp)
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/core2/sublsh1_n.asm b/mpn/x86_64/core2/sublsh1_n.asm
index 7522b429f..50411d7d0 100644
--- a/mpn/x86_64/core2/sublsh1_n.asm
+++ b/mpn/x86_64/core2/sublsh1_n.asm
@@ -2,7 +2,7 @@ dnl  AMD64 mpn_sublsh1_n optimised for Core 2 and Core iN.
 
 dnl  Contributed to the GNU project by Torbjorn Granlund.
 
-dnl  Copyright 2008, 2010 Free Software Foundation, Inc.
+dnl  Copyright 2008, 2010, 2011 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -30,4 +30,7 @@ define(func,	mpn_sublsh1_n)
 
 MULFUNC_PROLOGUE(mpn_sublsh1_n)
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 include_mpn(`x86_64/core2/sublshC_n.asm')
diff --git a/mpn/x86_64/core2/sublsh2_n.asm b/mpn/x86_64/core2/sublsh2_n.asm
index 036d2c859..affc87177 100644
--- a/mpn/x86_64/core2/sublsh2_n.asm
+++ b/mpn/x86_64/core2/sublsh2_n.asm
@@ -2,7 +2,7 @@ dnl  AMD64 mpn_sublsh2_n optimised for Core 2 and Core iN.
 
 dnl  Contributed to the GNU project by Torbjorn Granlund.
 
-dnl  Copyright 2008, 2010 Free Software Foundation, Inc.
+dnl  Copyright 2008, 2010, 2011 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -30,4 +30,7 @@ define(func,	mpn_sublsh2_n)
 
 MULFUNC_PROLOGUE(mpn_sublsh2_n)
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 include_mpn(`x86_64/core2/sublshC_n.asm')
diff --git a/mpn/x86_64/core2/sublshC_n.asm b/mpn/x86_64/core2/sublshC_n.asm
index 2f89c35e3..7c4545f5a 100644
--- a/mpn/x86_64/core2/sublshC_n.asm
+++ b/mpn/x86_64/core2/sublshC_n.asm
@@ -3,7 +3,7 @@ dnl  Core iN.
 
 dnl  Contributed to the GNU project by Torbjorn Granlund.
 
-dnl  Copyright 2008, 2010 Free Software Foundation, Inc.
+dnl  Copyright 2008, 2010, 2011 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -40,6 +40,7 @@ ASM_START()
 	TEXT
 	ALIGN(8)
 PROLOGUE(func)
+	DOS64_ENTRY(4)
 	push	%rbx
 	push	%r12
 
@@ -141,5 +142,6 @@ L(end):	shr	$RSH, %r11
 	pop	%rbx
 	sub	R32(%r11), R32(%rax)
 	neg	R32(%rax)
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/coreinhm/aorrlsh_n.asm b/mpn/x86_64/coreinhm/aorrlsh_n.asm
index a4afae69d..e22cc065d 100644
--- a/mpn/x86_64/coreinhm/aorrlsh_n.asm
+++ b/mpn/x86_64/coreinhm/aorrlsh_n.asm
@@ -62,10 +62,23 @@ C mpn_rsblsh_nc removed below, its idea of carry-in is inconsistent with
 C refmpn_rsblsh_nc
 MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
 
+ifdef(`HOST_DOS64',`
+  define(`IFDOS',   `$1')
+  define(`IFELF',   `')
+',`
+  define(`IFDOS',   `')
+  define(`IFELF',   `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(32)
 PROLOGUE(func_n)
+	DOS64_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')	C cnt
 	push	%rbx
 	xor	R32(%rbx), R32(%rbx)	C clear CF save register
 L(ent):	push	%rbp
@@ -170,9 +183,13 @@ L(wd1):	shrd	%cl, %r8, %r11
 IFRSB(	neg	%rax)
 	pop	%rbp
 	pop	%rbx
+	DOS64_EXIT()
 	ret
 EPILOGUE()
 PROLOGUE(func_nc)
+	DOS64_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')	C cnt
+IFDOS(`	mov	64(%rsp), %r9	')	C cy
 	push	%rbx
 	neg	cy
 	sbb	R32(%rbx), R32(%rbx)	C initialise CF save register
diff --git a/mpn/x86_64/coreisbr/aors_n.asm b/mpn/x86_64/coreisbr/aors_n.asm
index 66a5e3b60..4d8d1cccf 100644
--- a/mpn/x86_64/coreisbr/aors_n.asm
+++ b/mpn/x86_64/coreisbr/aors_n.asm
@@ -49,10 +49,22 @@ ifdef(`OPERATION_sub_n', `
 
 MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
 
+ifdef(`HOST_DOS64',`
+  define(`IFDOS',   `$1')
+  define(`IFELF',   `')
+',`
+  define(`IFDOS',   `')
+  define(`IFELF',   `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(func)
+	DOS64_ENTRY(4)
 	xor	%r8, %r8
 L(ent):	mov	R32(n), R32(%rax)
 	shr	$2, n
@@ -144,5 +156,8 @@ L(e1):	ADCSBB	16(vp), %r10
+	DOS64_EXIT()			C pairs with DOS64_ENTRY(4) at func/func_nc entry
 	ret
 EPILOGUE()
 PROLOGUE(func_nc)
+	DOS64_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
 	jmp	L(ent)
 EPILOGUE()
diff --git a/mpn/x86_64/invert_limb.asm b/mpn/x86_64/invert_limb.asm
index 8c6aa68b6..06cf1414a 100644
--- a/mpn/x86_64/invert_limb.asm
+++ b/mpn/x86_64/invert_limb.asm
@@ -2,7 +2,7 @@ dnl  AMD64 mpn_invert_limb -- Invert a normalized limb.
 
 dnl  Contributed to the GNU project by Torbjorn Granlund and Niels M�ller.
 
-dnl  Copyright 2004, 2007, 2008, 2009 Free Software Foundation, Inc.
+dnl  Copyright 2004, 2007, 2008, 2009, 2011 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -33,11 +33,14 @@ C VIA nano	 79			157
 
 C rax rcx rdx rdi rsi r8
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
 
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_invert_limb)		C			Kn	C2	Ci
+	DOS64_ENTRY(1)
 	mov	%rdi, %rax		C			 0	 0	 0
 	shr	$55, %rax		C			 1	 1	 1
 ifdef(`PIC',`
@@ -94,6 +97,7 @@ ifdef(`DARWIN',`
 	adc	%rdi, %rdx
 	sub	%rdx, %rax
 
+	DOS64_EXIT()
 	ret
 EPILOGUE()
 ASM_END()
diff --git a/mpn/x86_64/invert_limb_table.asm b/mpn/x86_64/invert_limb_table.asm
index 98a331372..86d75b8ce 100644
--- a/mpn/x86_64/invert_limb_table.asm
+++ b/mpn/x86_64/invert_limb_table.asm
@@ -21,6 +21,9 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 C Table entry X contains floor (0x7fd00 / (0x100 + X))
 
diff --git a/mpn/x86_64/logops_n.asm b/mpn/x86_64/logops_n.asm
index 1df564a8f..02b9da549 100644
--- a/mpn/x86_64/logops_n.asm
+++ b/mpn/x86_64/logops_n.asm
@@ -1,6 +1,6 @@
 dnl  AMD64 logops.
 
-dnl  Copyright 2004, 2005, 2006 Free Software Foundation, Inc.
+dnl  Copyright 2004, 2005, 2006, 2011 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -72,6 +72,8 @@ define(`up',`%rsi')
 define(`vp',`%rdx')
 define(`n',`%rcx')
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
 
 ASM_START()
 
@@ -79,6 +81,7 @@ ifdef(`VARIANT_1',`
 	TEXT
 	ALIGN(32)
 PROLOGUE(func)
+	DOS64_ENTRY(4)
 	movq	(vp), %r8
 	movl	R32(%rcx), R32(%rax)
 	leaq	(vp,n,8), vp
@@ -117,7 +120,8 @@ L(e10):	movq	24(vp,n,8), %r9
 	movq	%r9, 24(rp,n,8)
 	addq	$4, n
 	jnc	L(oop)
-L(ret):	ret
+L(ret):	DOS64_EXIT()
+	ret
 EPILOGUE()
 ')
 
@@ -125,6 +129,7 @@ ifdef(`VARIANT_2',`
 	TEXT
 	ALIGN(32)
 PROLOGUE(func)
+	DOS64_ENTRY(4)
 	movq	(vp), %r8
 	notq	%r8
 	movl	R32(%rcx), R32(%rax)
@@ -168,7 +173,8 @@ L(e10):	movq	24(vp,n,8), %r9
 	movq	%r9, 24(rp,n,8)
 	addq	$4, n
 	jnc	L(oop)
-L(ret):	ret
+L(ret):	DOS64_EXIT()
+	ret
 EPILOGUE()
 ')
 
@@ -176,6 +182,7 @@ ifdef(`VARIANT_3',`
 	TEXT
 	ALIGN(32)
 PROLOGUE(func)
+	DOS64_ENTRY(4)
 	movq	(vp), %r8
 	movl	R32(%rcx), R32(%rax)
 	leaq	(vp,n,8), vp
@@ -220,6 +227,7 @@ L(e10):	movq	24(vp,n,8), %r9
 	movq	%r9, 24(rp,n,8)
 	addq	$4, n
 	jnc	L(oop)
-L(ret):	ret
+L(ret):	DOS64_EXIT()
+	ret
 EPILOGUE()
 ')
diff --git a/mpn/x86_64/lshift.asm b/mpn/x86_64/lshift.asm
index 2f3d5c94d..5852ba9f9 100644
--- a/mpn/x86_64/lshift.asm
+++ b/mpn/x86_64/lshift.asm
@@ -1,6 +1,6 @@
 dnl  AMD64 mpn_lshift -- mpn left shift.
 
-dnl  Copyright 2003, 2005, 2007, 2009 Free Software Foundation, Inc.
+dnl  Copyright 2003, 2005, 2007, 2009, 2011 Free Software Foundation, Inc.
 dnl
 dnl  This file is part of the GNU MP Library.
 dnl
@@ -36,10 +36,14 @@ define(`up',	`%rsi')
 define(`n',	`%rdx')
 define(`cnt',	`%rcx')
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(32)
 PROLOGUE(mpn_lshift)
+	DOS64_ENTRY(4)
 	cmp	$1, R8(%rcx)
 	jne	L(gen)
 
@@ -83,6 +87,7 @@ L(t1):	mov	(up), %r8
 	dec	R32(%rax)
 	jne	L(n00)
 	adc	R32(%rax), R32(%rax)
+	DOS64_EXIT()
 	ret
 L(e1):	test	R32(%rax), R32(%rax)	C clear cy
 L(n00):	mov	(up), %r8
@@ -91,6 +96,7 @@ L(n00):	mov	(up), %r8
 	adc	%r8, %r8
 	mov	%r8, (rp)
 L(ret):	adc	R32(%rax), R32(%rax)
+	DOS64_EXIT()
 	ret
 L(n01):	dec	R32(%rax)
 	mov	8(up), %r9
@@ -100,6 +106,7 @@ L(n01):	dec	R32(%rax)
 	mov	%r8, (rp)
 	mov	%r9, 8(rp)
 	adc	R32(%rax), R32(%rax)
+	DOS64_EXIT()
 	ret
 L(n10):	mov	16(up), %r10
 	adc	%r8, %r8
@@ -109,6 +116,7 @@ L(n10):	mov	16(up), %r10
 	mov	%r9, 8(rp)
 	mov	%r10, 16(rp)
 	adc	$-1, R32(%rax)
+	DOS64_EXIT()
 	ret
 
 L(gen):	neg	R32(%rcx)		C put rsh count in cl
@@ -222,5 +230,6 @@ L(end):
 L(ast):	mov	(up), %r10
 	shl	R8(%rcx), %r10
 	mov	%r10, (rp)
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/lshiftc.asm b/mpn/x86_64/lshiftc.asm
index 93bb614d3..b4124b037 100644
--- a/mpn/x86_64/lshiftc.asm
+++ b/mpn/x86_64/lshiftc.asm
@@ -1,6 +1,6 @@
 dnl  AMD64 mpn_lshiftc -- mpn left shift with complement.
 
-dnl  Copyright 2003, 2005, 2006, 2009 Free Software Foundation, Inc.
+dnl  Copyright 2003, 2005, 2006, 2009, 2011 Free Software Foundation, Inc.
 dnl
 dnl  This file is part of the GNU MP Library.
 dnl
@@ -36,10 +36,14 @@ define(`up',	`%rsi')
 define(`n',	`%rdx')
 define(`cnt',	`%rcx')
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(32)
 PROLOGUE(mpn_lshiftc)
+	DOS64_ENTRY(4)
 	neg	R32(%rcx)		C put rsh count in cl
 	mov	-8(up,n,8), %rax
 	shr	R8(%rcx), %rax		C function return value
@@ -162,5 +166,6 @@ L(ast):	mov	(up), %r10
 	shl	R8(%rcx), %r10
 	not	%r10
 	mov	%r10, (rp)
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/lshsub_n.asm b/mpn/x86_64/lshsub_n.asm
index 3a42863ad..6e5816b1c 100644
--- a/mpn/x86_64/lshsub_n.asm
+++ b/mpn/x86_64/lshsub_n.asm
@@ -1,6 +1,6 @@
 dnl  AMD64 mpn_lshsub_n.  R = 2^k(U - V).
 
-dnl  Copyright 2006 Free Software Foundation, Inc.
+dnl  Copyright 2006, 2011 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -44,10 +44,23 @@ define(`vp',	`%rdx')
 define(`n',	`%rcx')
 define(`cnt',	`%r8')
 
+ifdef(`HOST_DOS64',`
+  define(`IFDOS',   `$1')
+  define(`IFELF',   `')
+',`
+  define(`IFDOS',   `')
+  define(`IFELF',   `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_lshsub_n)
+	DOS64_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
 
 	push	%r12
 	push	%r13
@@ -151,5 +164,6 @@ L(end):
 	pop	%r13
 	pop	%r12
 
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/mod_1_1.asm b/mpn/x86_64/mod_1_1.asm
index 56f708a75..8afa96e05 100644
--- a/mpn/x86_64/mod_1_1.asm
+++ b/mpn/x86_64/mod_1_1.asm
@@ -67,10 +67,14 @@ C the source of the cmov in the loop.
 C
 C We have the invariant that r_2 B^2 + r_1 B + r_0 < B^2 + B b
 
+C ABI_SUPPORT(DOS64)
+C ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_mod_1_1p)
+	DOS64_ENTRY(4)
 	push	%rbp
 	push	%rbx
 	mov	%rdx, b
@@ -163,6 +167,7 @@ L(ok):	shr	R8(%rcx), %rax
 
 	pop	%rbx
 	pop	%rbp
+	DOS64_EXIT()
 	ret
 L(fix):	sub	b, %rax
 	jmp	L(ok)
@@ -170,6 +175,7 @@ EPILOGUE()
 
 	ALIGN(16)
 PROLOGUE(mpn_mod_1_1p_cps)
+	DOS64_ENTRY(2)
 	push	%rbp
 	bsr	%rsi, %rcx
 	push	%rbx
@@ -211,6 +217,7 @@ L(z):
 	pop	%r12
 	pop	%rbx
 	pop	%rbp
+	DOS64_EXIT()
 	ret
 EPILOGUE()
 ASM_END()
diff --git a/mpn/x86_64/mod_1_2.asm b/mpn/x86_64/mod_1_2.asm
index a0ecb6855..b09f24bc0 100644
--- a/mpn/x86_64/mod_1_2.asm
+++ b/mpn/x86_64/mod_1_2.asm
@@ -2,7 +2,7 @@ dnl  AMD64 mpn_mod_1s_2p
 
 dnl  Contributed to the GNU project by Torbjorn Granlund.
 
-dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
+dnl  Copyright 2009, 2010, 2011 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -31,10 +31,14 @@ C Intel SBR	 4.5
 C Intel atom	28
 C VIA nano	 8
 
+C ABI_SUPPORT(DOS64)
+C ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_mod_1s_2p)
+	DOS64_ENTRY(4)
 	push	%r14
 	test	$1, R8(%rsi)
 	mov	%rdx, %r14
@@ -145,6 +149,7 @@ L(1):	xor	R32(%rcx), R32(%rcx)
 	pop	%r12
 	pop	%r13
 	pop	%r14
+	DOS64_EXIT()
 	ret
 L(one):
 	mov	(%rdi), %r8
@@ -154,6 +159,7 @@ L(one):
 EPILOGUE()
 
 PROLOGUE(mpn_mod_1s_2p_cps)
+	DOS64_ENTRY(2)
 	push	%rbp
 	bsr	%rsi, %rcx
 	push	%rbx
@@ -214,5 +220,6 @@ ifdef(`SHLD_SLOW',`
 	pop	%r12
 	pop	%rbx
 	pop	%rbp
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/mod_1_4.asm b/mpn/x86_64/mod_1_4.asm
index d99080d7f..3068e3def 100644
--- a/mpn/x86_64/mod_1_4.asm
+++ b/mpn/x86_64/mod_1_4.asm
@@ -2,7 +2,7 @@ dnl  AMD64 mpn_mod_1s_4p
 
 dnl  Contributed to the GNU project by Torbjorn Granlund.
 
-dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
+dnl  Copyright 2009, 2010, 2011 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -30,17 +30,22 @@ C Intel corei	 4
 C Intel atom	23
 C VIA nano	 4.75
 
+C ABI_SUPPORT(DOS64)
+C ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_mod_1s_4p)
+	DOS64_ENTRY(4)
+	push	%r15			C callee-saved; used below to keep incoming %rdx
 	push	%r14
 	push	%r13
 	push	%r12
 	push	%rbp
 	push	%rbx
 
-	mov	%rdx, -16(%rsp)
+	mov	%rdx, %r15		C was stored below %rsp; reloaded into %rbx near the end
 	mov	%rcx, %r14
 	mov	16(%rcx), %r11		C B1modb
 	mov	24(%rcx), %rbx		C B2modb
@@ -135,7 +140,7 @@ L(end):	mov	8(%r14), R32(%rsi)
 	or	%rdx, %rdi
 	mov	%rdi, %rax
 	mulq	(%r14)
-	mov	-16(%rsp), %rbx
+	mov	%r15, %rbx
 	mov	%rax, %r9
 	sal	R8(%rcx), %r8
 	inc	%rdi
@@ -155,11 +160,14 @@ L(end):	mov	8(%r14), R32(%rsi)
 	pop	%r12
 	pop	%r13
 	pop	%r14
+	pop	%r15			C undo the push %r15 made on entry
+	DOS64_EXIT()
 	ret
 EPILOGUE()
 
 	ALIGN(16)
 PROLOGUE(mpn_mod_1s_4p_cps)
+	DOS64_ENTRY(2)
 	push	%rbp
 	bsr	%rsi, %rcx
 	push	%rbx
@@ -244,5 +252,6 @@ ifdef(`SHLD_SLOW',`
 	pop	%r12
 	pop	%rbx
 	pop	%rbp
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/mod_34lsub1.asm b/mpn/x86_64/mod_34lsub1.asm
index 08cd7d939..ee4d0d347 100644
--- a/mpn/x86_64/mod_34lsub1.asm
+++ b/mpn/x86_64/mod_34lsub1.asm
@@ -1,7 +1,7 @@
 dnl  AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
 
-dnl  Copyright 2000, 2001, 2002, 2004, 2005, 2007, 2009, 2010 Free Software
-dnl  Foundation, Inc.
+dnl  Copyright 2000, 2001, 2002, 2004, 2005, 2007, 2009, 2010, 2011 Free
+dnl  Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -39,10 +39,14 @@ C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
 C TODO
 C  * Review feed-in and wind-down code.
 
+C ABI_SUPPORT(DOS64)
+C ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(32)
 PROLOGUE(mpn_mod_34lsub1)
+	DOS64_ENTRY(2)
 
 	mov	$0x0000FFFFFFFFFFFF, %r11
 
@@ -66,7 +70,8 @@ PROLOGUE(mpn_mod_34lsub1)
 
 	shl	$16, %rdx		C src[1] low
 	add	%rdx, %rax
-L(one):	ret
+L(one):	DOS64_EXIT()
+	ret
 
 
 C Don't change this, the wind-down code is not able to handle greater values
@@ -176,5 +181,6 @@ L(0):	add	%r9, %rax
 	add	%rdx, %rax		C apply 2mod3 high
 	add	%rdi, %rax		C apply 2mod3 low
 
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/mul_2.asm b/mpn/x86_64/mul_2.asm
index 206a4ea2c..35deefa8b 100644
--- a/mpn/x86_64/mul_2.asm
+++ b/mpn/x86_64/mul_2.asm
@@ -1,7 +1,7 @@
 dnl  AMD64 mpn_mul_2 -- Multiply an n-limb vector with a 2-limb vector and
 dnl  store the result in a third limb vector.
 
-dnl  Copyright 2008 Free Software Foundation, Inc.
+dnl  Copyright 2008, 2011 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -53,10 +53,14 @@ define(`w2', `%rbp')
 define(`w3', `%r10')
 define(`n',  `%r11')
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_mul_2)
+	DOS64_ENTRY(4)
 	push	%rbx
 	push	%rbp
 
@@ -172,5 +176,6 @@ L(m22):	mul	v1
 
 	pop	%rbp
 	pop	%rbx
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/mulmid_basecase.asm b/mpn/x86_64/mulmid_basecase.asm
index 375e7f70e..d2d56d4a4 100644
--- a/mpn/x86_64/mulmid_basecase.asm
+++ b/mpn/x86_64/mulmid_basecase.asm
@@ -50,11 +50,23 @@ define(`vp',  `%r15')
 
 define(`vp_inner', `%r10')
 
+ifdef(`HOST_DOS64',`
+  define(`IFDOS',   `$1')
+  define(`IFELF',   `')
+',`
+  define(`IFDOS',   `')
+  define(`IFELF',   `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
 
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_mulmid_basecase)
+	DOS64_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
 	push	%rbx
 	push	%rbp
 	push	%r12
@@ -539,6 +551,6 @@ L(ret):	pop	%r15
 	pop	%r12
 	pop	%rbp
 	pop	%rbx
+	DOS64_EXIT()
 	ret
-
 EPILOGUE()
diff --git a/mpn/x86_64/popham.asm b/mpn/x86_64/popham.asm
index 9db368106..999452328 100644
--- a/mpn/x86_64/popham.asm
+++ b/mpn/x86_64/popham.asm
@@ -1,6 +1,6 @@
 dnl  AMD64 mpn_popcount, mpn_hamdist -- population count and hamming distance.
 
-dnl  Copyright 2004, 2005, 2007, 2010 Free Software Foundation, Inc.
+dnl  Copyright 2004, 2005, 2007, 2010, 2011 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -44,6 +44,7 @@ ifdef(`OPERATION_popcount',`
   define(`h33333333',	`%r11')
   define(`h0f0f0f0f',	`%rcx')
   define(`h01010101',	`%rdx')
+  define(`POP',		`$1')
   define(`HAM',		`dnl')
 ')
 ifdef(`OPERATION_hamdist',`
@@ -55,17 +56,22 @@ ifdef(`OPERATION_hamdist',`
   define(`h33333333',	`%r11')
   define(`h0f0f0f0f',	`%rcx')
   define(`h01010101',	`%r14')
+  define(`POP',		`dnl')
   define(`HAM',		`$1')
 ')
 
 
 MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(32)
 PROLOGUE(func)
-
+ POP(`	DOS64_ENTRY(2)		')
+ HAM(`	DOS64_ENTRY(3)		')
 	push	%r12
 	push	%r13
  HAM(`	push	%r14		')
@@ -155,6 +161,6 @@ L(end):
  HAM(`	pop	%r14		')
 	pop	%r13
 	pop	%r12
+	DOS64_EXIT()
 	ret
-
 EPILOGUE()
diff --git a/mpn/x86_64/redc_1.asm b/mpn/x86_64/redc_1.asm
index 8d731c68c..53b5641a0 100644
--- a/mpn/x86_64/redc_1.asm
+++ b/mpn/x86_64/redc_1.asm
@@ -49,10 +49,14 @@ define(`n',	  `%r13')
 define(`i',	  `%r11')
 define(`nneg',	  `%r12')
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(32)
 PROLOGUE(mpn_redc_1)
+	DOS64_ENTRY(4)
 	push	%rbp
 	push	%rbx
 	push	%r12
@@ -293,5 +297,6 @@ L(ret):	pop	%r14
 	pop	%r12
 	pop	%rbx
 	pop	%rbp
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/rsh1aors_n.asm b/mpn/x86_64/rsh1aors_n.asm
index c4a336446..1b6a103f1 100644
--- a/mpn/x86_64/rsh1aors_n.asm
+++ b/mpn/x86_64/rsh1aors_n.asm
@@ -1,7 +1,7 @@
 dnl  AMD64 mpn_rsh1add_n -- rp[] = (up[] + vp[]) >> 1
 dnl  AMD64 mpn_rsh1sub_n -- rp[] = (up[] - vp[]) >> 1
 
-dnl  Copyright 2003, 2005, 2009 Free Software Foundation, Inc.
+dnl  Copyright 2003, 2005, 2009, 2011 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -53,11 +53,24 @@ ifdef(`OPERATION_rsh1sub_n', `
 
 MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
 
+ifdef(`HOST_DOS64',`
+  define(`IFDOS',   `$1')
+  define(`IFELF',   `')
+',`
+  define(`IFDOS',   `')
+  define(`IFELF',   `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 
 	ALIGN(16)
 PROLOGUE(func_nc)
+	DOS64_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
 	push	%rbx
 
 	xor	R32(%rax), R32(%rax)
@@ -69,6 +82,7 @@ EPILOGUE()
 
 	ALIGN(16)
 PROLOGUE(func_n)
+	DOS64_ENTRY(4)
 	push	%rbx
 
 	xor	R32(%rax), R32(%rax)
@@ -169,5 +183,6 @@ L(top):	add	%rbx, %rbx		C rotate carry limb, restore acy
 
 L(end):	mov	%rbx, (rp)
 	pop	%rbx
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/rshift.asm b/mpn/x86_64/rshift.asm
index 0f822a4a0..57a4ab093 100644
--- a/mpn/x86_64/rshift.asm
+++ b/mpn/x86_64/rshift.asm
@@ -1,6 +1,6 @@
 dnl  AMD64 mpn_rshift -- mpn right shift.
 
-dnl  Copyright 2003, 2005, 2009 Free Software Foundation, Inc.
+dnl  Copyright 2003, 2005, 2009, 2011 Free Software Foundation, Inc.
 dnl
 dnl  This file is part of the GNU MP Library.
 dnl
@@ -36,10 +36,14 @@ define(`up',	`%rsi')
 define(`n',	`%rdx')
 define(`cnt',	`%rcx')
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(32)
 PROLOGUE(mpn_rshift)
+	DOS64_ENTRY(4)
 	neg	R32(%rcx)		C put rsh count in cl
 	mov	(up), %rax
 	shl	R8(%rcx), %rax		C function return value
@@ -156,5 +160,6 @@ L(end):
 L(ast):	mov	(up), %r10
 	shr	R8(%rcx), %r10
 	mov	%r10, (rp)
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/sqr_basecase.asm b/mpn/x86_64/sqr_basecase.asm
index f71627ab9..71195d7ae 100644
--- a/mpn/x86_64/sqr_basecase.asm
+++ b/mpn/x86_64/sqr_basecase.asm
@@ -75,14 +75,6 @@ define(`w1',	`%rcx')
 define(`w2',	`%rbp')
 define(`w3',	`%r10')
 
-ifdef(`HOST_DOS64',`
-  define(`IFDOS',   `$1')
-  define(`IFELF',   `')
-',`
-  define(`IFDOS',   `')
-  define(`IFELF',   `$1')
-')
-
 ABI_SUPPORT(DOS64)
 ABI_SUPPORT(ELF64)
 
diff --git a/mpn/x86_64/sublsh1_n.asm b/mpn/x86_64/sublsh1_n.asm
index a2f48c007..a0515cf18 100644
--- a/mpn/x86_64/sublsh1_n.asm
+++ b/mpn/x86_64/sublsh1_n.asm
@@ -1,6 +1,6 @@
 dnl  AMD64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
 
-dnl  Copyright 2003, 2005, 2006, 2007 Free Software Foundation, Inc.
+dnl  Copyright 2003, 2005, 2006, 2007, 2011 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -41,10 +41,14 @@ define(`up',`%rsi')
 define(`vp',`%rdx')
 define(`n', `%rcx')
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_sublsh1_n)
+	DOS64_ENTRY(4)
 	push	%rbx
 	push	%rbp
 
@@ -140,5 +144,6 @@ L(end):	add	R32(%rbp), R32(%rax)
 
 	pop	%rbp
 	pop	%rbx
+	DOS64_EXIT()
 	ret
 EPILOGUE()
diff --git a/mpn/x86_64/tabselect.asm b/mpn/x86_64/tabselect.asm
index 2611b3212..a6699a9a4 100644
--- a/mpn/x86_64/tabselect.asm
+++ b/mpn/x86_64/tabselect.asm
@@ -50,10 +50,23 @@ define(`maskn',  `%r12')
 C rax rbx  rcx  rdx rdi rsi rbp (rsp)  r8   r9 r10 r11 r12 r13 r14 r15
 C         nents  n  rp  tab           which
 
+ifdef(`HOST_DOS64',`
+  define(`IFDOS',   `$1')
+  define(`IFELF',   `')
+',`
+  define(`IFDOS',   `')
+  define(`IFELF',   `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_tabselect)
+	DOS64_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
 	push	%rbx
 	push	%rbp
 	push	%r12
@@ -105,5 +118,6 @@ L(outer_end):
 	pop	%r12
 	pop	%rbp
 	pop	%rbx
+	DOS64_EXIT()
 	ret
 EPILOGUE()
-- 
cgit v1.2.1


From 2c033efc02631f22e6e180ce737a2faf81b09ccc Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tege@gmplib.org>
Date: Tue, 29 Nov 2011 23:28:07 +0100
Subject: Fix typo in last change (thanks Marco!).

---
 mpn/x86_64/mod_1_4.asm | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mpn/x86_64/mod_1_4.asm b/mpn/x86_64/mod_1_4.asm
index 3068e3def..629520877 100644
--- a/mpn/x86_64/mod_1_4.asm
+++ b/mpn/x86_64/mod_1_4.asm
@@ -160,6 +160,7 @@ L(end):	mov	8(%r14), R32(%rsi)
 	pop	%r12
 	pop	%r13
 	pop	%r14
+	pop	%r15
 	DOS64_EXIT()
 	ret
 EPILOGUE()
-- 
cgit v1.2.1