go to gmp 4.3.2  (baserock/pedroalvarez/gcc-5.3.0-gmp432)

author: Pedro Alvarez <pedro.alvarez@codethink.co.uk> 2016-05-27 17:39:31 +0100
committer: Pedro Alvarez <pedro.alvarez@codethink.co.uk> 2016-05-27 17:53:32 +0100
commit: 26c75cf8267919f81a1759c9c965a52c660233f9 (patch)
tree: cf2a39cf56c2c8ac45760854413ab233e6263974 /gmp/mpn/x86/k7
parent: 56892c1d217baea02092b51a09bbc924130ca84c (diff)
download: gcc-tarball-26c75cf8267919f81a1759c9c965a52c660233f9.tar.gz
25 files changed, 1120 insertions, 2056 deletions
diff --git a/gmp/mpn/x86/k7/README b/gmp/mpn/x86/k7/README
index 5711b612c5..e2c5e0c18d 100644
--- a/gmp/mpn/x86/k7/README
+++ b/gmp/mpn/x86/k7/README
@@ -3,28 +3,17 @@ Copyright 2000, 2001 Free Software Foundation, Inc.
 This file is part of the GNU MP Library.
 
 The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
-  * the GNU Lesser General Public License as published by the Free
-    Software Foundation; either version 3 of the License, or (at your
-    option) any later version.
-
-or
-
-  * the GNU General Public License as published by the Free Software
-    Foundation; either version 2 of the License, or (at your option) any
-    later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
 
 The GNU MP Library is distributed in the hope that it will be useful, but
 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
 
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library.  If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 
 
diff --git a/gmp/mpn/x86/k7/addlsh1_n.asm b/gmp/mpn/x86/k7/addlsh1_n.asm
deleted file mode 100644
index a957b6f78e..0000000000
--- a/gmp/mpn/x86/k7/addlsh1_n.asm
+++ /dev/null
@@ -1,196 +0,0 @@
-dnl  AMD K7 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
-
-dnl  Copyright 2011 Free Software Foundation, Inc.
-
-dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C This is an attempt at an addlsh1_n for x86-32, not relying on sse2 insns.
-C The innerloop is 2*3-way unrolled, which is best we can do with the available
-C registers.  It seems tricky to use the same structure for rsblsh1_n, since we
-C cannot feed carry between operations there.
-
-C			    cycles/limb
-C P5
-C P6 model 0-8,10-12
-C P6 model 9  (Banias)
-C P6 model 13 (Dothan)		 5.4	(worse than add_n + lshift)
-C P4 model 0  (Willamette)
-C P4 model 1  (?)
-C P4 model 2  (Northwood)
-C P4 model 3  (Prescott)
-C P4 model 4  (Nocona)
-C Intel Atom			 6
-C AMD K6			 ?
-C AMD K7			 2.5
-C AMD K8
-
-C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32
-C processors.  It uses 2*3-way unrolling, for good reasons.  Unfortunately,
-C that means we need an initial magic multiply.
-C
-C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern.  We
-C cannot do rsblsh1_n since we feed carry from the shift blocks to the
-C add/subtract blocks, which is right for addition but reversed for
-C subtraction.  We could perhaps do sublsh1_n, with some extra move insns,
-C without losing any time, since we're not issue limited but carry recurrency
-C latency.
-C
-C Breaking carry recurrency might be a good idea.  We would then need separate
-C registers for the shift carry and add/subtract carry, which in turn would
-C force is to 2*2-way unrolling.
-
-defframe(PARAM_SIZE,	16)
-defframe(PARAM_DBLD,	12)
-defframe(PARAM_SRC,	 8)
-defframe(PARAM_DST,	 4)
-
-dnl  re-use parameter space
-define(VAR_COUNT,`PARAM_DST')
-define(VAR_TMP,`PARAM_DBLD')
-
-ASM_START()
-	TEXT
-	ALIGN(8)
-PROLOGUE(mpn_addlsh1_n)
-deflit(`FRAME',0)
-
-define(`rp',  `%edi')
-define(`up',  `%esi')
-define(`vp',  `%ebp')
-
-	mov	$0x2aaaaaab, %eax
-
-	push	%ebx			FRAME_pushl()
-	mov	PARAM_SIZE, %ebx	C size
-
-	push	rp			FRAME_pushl()
-	mov	PARAM_DST, rp
-
-	mul	%ebx
-
-	push	up			FRAME_pushl()
-	mov	PARAM_SRC, up
-
-	not	%edx			C count = -(size\8)-1
-	mov	%edx, VAR_COUNT
-
-	push	vp			FRAME_pushl()
-	mov	PARAM_DBLD, vp
-
-	lea	3(%edx,%edx,2), %ecx	C count*3+3 = -(size\6)*3
-	xor	%edx, %edx
-	lea	(%ebx,%ecx,2), %ebx	C size + (count*3+3)*2 = size % 6
-	or	%ebx, %ebx
-	jz	L(exact)
-
-L(oop):
-ifdef(`CPU_P6',`
-	shr	%edx ')			C restore 2nd saved carry bit
-	mov	(vp), %eax
-	adc	%eax, %eax
-	rcr	%edx			C restore 1st saved carry bit
-	lea	4(vp), vp
-	adc	(up), %eax
-	lea	4(up), up
-	adc	%edx, %edx		C save a carry bit in edx
-ifdef(`CPU_P6',`
-	adc	%edx, %edx ')		C save another carry bit in edx
-	dec	%ebx
-	mov	%eax, (rp)
-	lea	4(rp), rp
-	jnz	L(oop)
-	mov	vp, VAR_TMP
-L(exact):
-	incl	VAR_COUNT
-	jz	L(end)
-
-	ALIGN(16)
-L(top):
-ifdef(`CPU_P6',`
-	shr	%edx ')			C restore 2nd saved carry bit
-	mov	(vp), %eax
-	adc	%eax, %eax
-	mov	4(vp), %ebx
-	adc	%ebx, %ebx
-	mov	8(vp), %ecx
-	adc	%ecx, %ecx
-
-	rcr	%edx			C restore 1st saved carry bit
-
-	adc	(up), %eax
-	mov	%eax, (rp)
-	adc	4(up), %ebx
-	mov	%ebx, 4(rp)
-	adc	8(up), %ecx
-	mov	%ecx, 8(rp)
-
-	mov	12(vp), %eax
-	adc	%eax, %eax
-	mov	16(vp), %ebx
-	adc	%ebx, %ebx
-	mov	20(vp), %ecx
-	adc	%ecx, %ecx
-
-	lea	24(vp), vp
-	adc	%edx, %edx		C save a carry bit in edx
-
-	adc	12(up), %eax
-	mov	%eax, 12(rp)
-	adc	16(up), %ebx
-	mov	%ebx, 16(rp)
-	adc	20(up), %ecx
-
-	lea	24(up), up
-
-ifdef(`CPU_P6',`
-	adc	%edx, %edx ')		C save another carry bit in edx
-	mov	%ecx, 20(rp)
-	incl	VAR_COUNT
-	lea	24(rp), rp
-	jne	L(top)
-
-L(end):
-	pop	vp			FRAME_popl()
-	pop	up			FRAME_popl()
-
-ifdef(`CPU_P6',`
-	xor	%eax, %eax
-	shr	$1, %edx
-	adc	%edx, %eax
-',`
-	adc	$0, %edx
-	mov	%edx, %eax
-')
-	pop	rp			FRAME_popl()
-	pop	%ebx			FRAME_popl()
-	ret
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86/k7/aors_n.asm b/gmp/mpn/x86/k7/aors_n.asm
index 1a08072029..d84de3ee98 100644
--- a/gmp/mpn/x86/k7/aors_n.asm
+++ b/gmp/mpn/x86/k7/aors_n.asm
@@ -1,32 +1,21 @@
 dnl  AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
 
-dnl  Copyright 1999-2003 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
+dnl  Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
 dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
+dnl  This file is part of the GNU MP Library.
 dnl
-dnl  or both in parallel, as here.
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
 dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
 dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
diff --git a/gmp/mpn/x86/k7/aorsmul_1.asm b/gmp/mpn/x86/k7/aorsmul_1.asm
index eec8df6de2..b247c29131 100644
--- a/gmp/mpn/x86/k7/aorsmul_1.asm
+++ b/gmp/mpn/x86/k7/aorsmul_1.asm
@@ -1,49 +1,39 @@
 dnl  AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
 
-dnl  Copyright 1999-2002, 2005, 2008 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
+dnl  Copyright 1999, 2000, 2001, 2002, 2005, 2008 Free Software Foundation,
+dnl  Inc.
 dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
+dnl  This file is part of the GNU MP Library.
 dnl
-dnl  or both in parallel, as here.
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
 dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
 dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
 
-C			    cycles/limb
-C P5
-C P6 model 0-8,10-12
-C P6 model 9  (Banias)		 6.5
+C                           cycles/limb
+C P5:
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
 C P6 model 13 (Dothan)
 C P4 model 0  (Willamette)
 C P4 model 1  (?)
 C P4 model 2  (Northwood)
 C P4 model 3  (Prescott)
 C P4 model 4  (Nocona)
-C AMD K6
-C AMD K7			 3.75
-C AMD K8
+C K6:
+C K7:                            3.75
+C K8:
 
 C TODO
 C  * Improve feed-in and wind-down code.  We beat the old code for all n != 1,
diff --git a/gmp/mpn/x86/k7/bdiv_q_1.asm b/gmp/mpn/x86/k7/bdiv_q_1.asm
deleted file mode 100644
index df3477f539..0000000000
--- a/gmp/mpn/x86/k7/bdiv_q_1.asm
+++ /dev/null
@@ -1,244 +0,0 @@
-dnl  AMD K7 mpn_bdiv_q_1 -- mpn by limb exact division.
-
-dnl  Rearranged from mpn/x86/k7/dive_1.asm by Marco Bodrato.
-
-dnl  Copyright 2001, 2002, 2004, 2007, 2011 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C          cycles/limb
-C Athlon:     11.0
-C Hammer:      9.0
-
-
-C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C                      mp_limb_t divisor);
-C
-C The dependent chain is mul+imul+sub for 11 cycles and that speed is
-C achieved with no special effort.  The load and shrld latencies are hidden
-C by out of order execution.
-C
-C It's a touch faster on size==1 to use the mul-by-inverse than divl.
-
-defframe(PARAM_SHIFT,  24)
-defframe(PARAM_INVERSE,20)
-defframe(PARAM_DIVISOR,16)
-defframe(PARAM_SIZE,   12)
-defframe(PARAM_SRC,    8)
-defframe(PARAM_DST,    4)
-
-defframe(SAVE_EBX,     -4)
-defframe(SAVE_ESI,     -8)
-defframe(SAVE_EDI,    -12)
-defframe(SAVE_EBP,    -16)
-defframe(VAR_INVERSE, -20)
-defframe(VAR_DST_END, -24)
-
-deflit(STACK_SPACE, 24)
-
-	TEXT
-
-C mp_limb_t
-C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
-C		    mp_limb_t inverse, int shift)
-	ALIGN(16)
-PROLOGUE(mpn_pi1_bdiv_q_1)
-deflit(`FRAME',0)
-
-	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
-	movl	PARAM_SHIFT, %ecx	C shift count
-
-	movl	%ebp, SAVE_EBP
-	movl	PARAM_SIZE, %ebp
-
-	movl	%esi, SAVE_ESI
-	movl	PARAM_SRC, %esi
-
-	movl	%edi, SAVE_EDI
-	movl	PARAM_DST, %edi
-
-	movl	%ebx, SAVE_EBX
-
-	leal	(%esi,%ebp,4), %esi	C src end
-	leal	(%edi,%ebp,4), %edi	C dst end
-	negl	%ebp			C -size
-
-	movl	PARAM_INVERSE, %eax	C inv
-
-L(common):
-	movl	%eax, VAR_INVERSE
-	movl	(%esi,%ebp,4), %eax	C src[0]
-
-	incl	%ebp
-	jz	L(one)
-
-	movl	(%esi,%ebp,4), %edx	C src[1]
-
-	shrdl(	%cl, %edx, %eax)
-
-	movl	%edi, VAR_DST_END
-	xorl	%ebx, %ebx
-	jmp	L(entry)
-
-	ALIGN(8)
-L(top):
-	C eax	q
-	C ebx	carry bit, 0 or 1
-	C ecx	shift
-	C edx
-	C esi	src end
-	C edi	dst end
-	C ebp	counter, limbs, negative
-
-	mull	PARAM_DIVISOR		C carry limb in edx
-
-	movl	-4(%esi,%ebp,4), %eax
-	movl	(%esi,%ebp,4), %edi
-
-	shrdl(	%cl, %edi, %eax)
-
-	subl	%ebx, %eax		C apply carry bit
-	setc	%bl
-	movl	VAR_DST_END, %edi
-
-	subl	%edx, %eax		C apply carry limb
-	adcl	$0, %ebx
-
-L(entry):
-	imull	VAR_INVERSE, %eax
-
-	movl	%eax, -4(%edi,%ebp,4)
-	incl	%ebp
-	jnz	L(top)
-
-
-	mull	PARAM_DIVISOR		C carry limb in edx
-
-	movl	-4(%esi), %eax		C src high limb
-	shrl	%cl, %eax
-	movl	SAVE_ESI, %esi
-
-	subl	%ebx, %eax		C apply carry bit
-	movl	SAVE_EBX, %ebx
-	movl	SAVE_EBP, %ebp
-
-	subl	%edx, %eax		C apply carry limb
-
-	imull	VAR_INVERSE, %eax
-
-	movl	%eax, -4(%edi)
-	movl	SAVE_EDI, %edi
-	addl	$STACK_SPACE, %esp
-
-	ret
-
-L(one):
-	shrl	%cl, %eax
-	movl	SAVE_ESI, %esi
-	movl	SAVE_EBX, %ebx
-
-	imull	VAR_INVERSE, %eax
-
-	movl	SAVE_EBP, %ebp
-
-	movl	%eax, -4(%edi)
-	movl	SAVE_EDI, %edi
-	addl	$STACK_SPACE, %esp
-
-	ret
-EPILOGUE()
-
-C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C                           mp_limb_t divisor);
-C
-
-	ALIGN(16)
-PROLOGUE(mpn_bdiv_q_1)
-deflit(`FRAME',0)
-
-	movl	PARAM_DIVISOR, %eax
-	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
-	movl	$-1, %ecx		C shift count
-
-	movl	%ebp, SAVE_EBP
-	movl	PARAM_SIZE, %ebp
-
-	movl	%esi, SAVE_ESI
-	movl	%edi, SAVE_EDI
-
-	C If there's usually only one or two trailing zero bits then this
-	C should be faster than bsfl.
-L(strip_twos):
-	incl	%ecx
-	shrl	%eax
-	jnc	L(strip_twos)
-
-	movl	%ebx, SAVE_EBX
-	leal	1(%eax,%eax), %ebx	C d without twos
-	andl	$127, %eax		C d/2, 7 bits
-
-ifdef(`PIC',`
-	LEA(	binvert_limb_table, %edx)
-	movzbl	(%eax,%edx), %eax		C inv 8 bits
-',`
-	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
-')
-
-	leal	(%eax,%eax), %edx	C 2*inv
-	movl	%ebx, PARAM_DIVISOR	C d without twos
-
-	imull	%eax, %eax		C inv*inv
-
-	movl	PARAM_SRC, %esi
-	movl	PARAM_DST, %edi
-
-	imull	%ebx, %eax		C inv*inv*d
-
-	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
-	leal	(%edx,%edx), %eax	C 2*inv
-
-	imull	%edx, %edx		C inv*inv
-
-	leal	(%esi,%ebp,4), %esi	C src end
-	leal	(%edi,%ebp,4), %edi	C dst end
-	negl	%ebp			C -size
-
-	imull	%ebx, %edx		C inv*inv*d
-
-	subl	%edx, %eax		C inv = 2*inv - inv*inv*d
-
-	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
-	pushl	%eax	FRAME_pushl()
-	imull	PARAM_DIVISOR, %eax
-	cmpl	$1, %eax
-	popl	%eax	FRAME_popl()')
-
-	jmp	L(common)
-EPILOGUE()
diff --git a/gmp/mpn/x86/k7/dive_1.asm b/gmp/mpn/x86/k7/dive_1.asm
index 8eb4f45ac0..c994e0fb06 100644
--- a/gmp/mpn/x86/k7/dive_1.asm
+++ b/gmp/mpn/x86/k7/dive_1.asm
@@ -1,32 +1,21 @@
 dnl  AMD K7 mpn_divexact_1 -- mpn by limb exact division.
 
 dnl  Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
 dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
+dnl  This file is part of the GNU MP Library.
 dnl
-dnl  or both in parallel, as here.
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
 dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
 dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
@@ -116,7 +105,7 @@ ifdef(`PIC',`
 
 	subl	%edx, %eax		C inv = 2*inv - inv*inv*d
 
-	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	ASSERT(e,`	C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
 	pushl	%eax	FRAME_pushl()
 	imull	PARAM_DIVISOR, %eax
 	cmpl	$1, %eax
diff --git a/gmp/mpn/x86/k7/gcd_1.asm b/gmp/mpn/x86/k7/gcd_1.asm
index c7d12c83c0..f912f43730 100644
--- a/gmp/mpn/x86/k7/gcd_1.asm
+++ b/gmp/mpn/x86/k7/gcd_1.asm
@@ -1,186 +1,369 @@
-dnl  x86 mpn_gcd_1 optimised for AMD K7.
+dnl  AMD K7 mpn_gcd_1 -- mpn by 1 gcd.
 
-dnl  Contributed to the GNU project by by Kevin Ryde.  Rehacked by Torbjorn
-dnl  Granlund.
-
-dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
+dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
 dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
+dnl  This file is part of the GNU MP Library.
 dnl
-dnl  or both in parallel, as here.
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
 dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
 dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
 
-C	     cycles/bit (approx)
-C AMD K7	 5.31
-C AMD K8,K9	 5.33
-C AMD K10	 5.30
-C AMD bd1	 ?
-C AMD bobcat	 7.02
-C Intel P4-2	10.1
-C Intel P4-3/4	10.0
-C Intel P6/13	 5.88
-C Intel core2	 6.26
-C Intel NHM	 6.83
-C Intel SBR	 8.50
-C Intel atom	 8.90
-C VIA nano	 ?
-C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
-
-C TODO
-C  * Tune overhead, this takes 2-3 cycles more than old code when v0 is tiny.
-C  * Stream things better through registers, avoiding some copying.
-
-C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
+C K7: 6.75 cycles/bit (approx)  1x1 gcd
+C     11.0 cycles/limb          Nx1 reduction (modexact_1_odd)
+
+
+dnl  Reduce using x%y if x is more than DIV_THRESHOLD bits bigger than y,
+dnl  where x is the larger of the two.  See tune/README for more.
+dnl
+dnl  divl at 40 cycles compared to the gcd at about 7 cycles/bitpair
+dnl  suggests 40/7*2=11.4 but 7 seems to be about right.
+
+deflit(DIV_THRESHOLD, 7)
+
 
+C table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
+C
+C This is mixed in with the code, but as per the k7 optimization manual it's
+C a full cache line and suitably aligned so it won't get swapped between
+C code and data.  Having it in TEXT rather than RODATA saves needing a GOT
+C entry when PIC.
+C
+C Actually, there doesn't seem to be a measurable difference between this in
+C its own cache line or plonked in the middle of the code.  Presumably
+C since TEXT is read-only there are no worries about coherency.
+
+deflit(MASK, 63)
 deflit(MAXSHIFT, 6)
-deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
 
-DEF_OBJECT(ctz_table,64)
+	TEXT
+	ALIGN(64)
+L(table):
 	.byte	MAXSHIFT
 forloop(i,1,MASK,
 `	.byte	m4_count_trailing_zeros(i)
 ')
-END_OBJECT(ctz_table)
 
-C Threshold of when to call bmod when U is one limb.  Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`DIV_THRES_LOG2', 7)
 
+C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t limb);
+C
+
+defframe(PARAM_LIMB,   12)
+defframe(PARAM_SIZE,    8)
+defframe(PARAM_SRC,     4)
 
-define(`up',    `%edi')
-define(`n',     `%esi')
-define(`v0',    `%edx')
+defframe(SAVE_EBX,     -4)
+defframe(SAVE_ESI,     -8)
+defframe(SAVE_EDI,    -12)
+defframe(SAVE_EBP,    -16)
+defframe(CALL_DIVISOR,-20)
+defframe(CALL_SIZE,   -24)
+defframe(CALL_SRC,    -28)
 
+deflit(STACK_SPACE, 28)
 
-ASM_START()
 	TEXT
 	ALIGN(16)
+
 PROLOGUE(mpn_gcd_1)
-	push	%edi
-	push	%esi
+deflit(`FRAME',0)
+
+	ASSERT(ne, `cmpl $0, PARAM_LIMB')	C y!=0
+	ASSERT(ae, `cmpl $1, PARAM_SIZE')	C size>=1
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_LIMB, %edx
+	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
 
-	mov	12(%esp), up
-	mov	16(%esp), n
-	mov	20(%esp), v0
+	movl	%esi, SAVE_ESI
+	movl	%ebx, SAVE_EBX
 
-	mov	(up), %eax		C U low limb
-	or	v0, %eax		C x | y
-	mov	$-1, %ecx
+	movl	(%eax), %esi		C src low limb
+
+ifdef(`PIC',`
+	movl	%edi, SAVE_EDI
+	call	L(movl_eip_to_edi)
+L(here):
+	addl	$L(table)-L(here), %edi
+')
+
+	movl	%esi, %ebx
+	orl	%edx, %esi	C x|y
+	movl	$-1, %ecx
 
 L(twos):
-	inc	%ecx
-	shr	%eax
-	jnc	L(twos)
+	incl	%ecx
+	shrl	%esi
+	jnc	L(twos)		C 3/4 chance of x or y odd already
 
-	shr	%cl, v0
-	mov	%ecx, %eax		C common twos
+	shrl	%cl, %ebx
+	shrl	%cl, %edx
+	movl	%ecx, %esi	C common twos
 
-L(divide_strip_y):
-	shr	v0
-	jnc	L(divide_strip_y)
-	adc	v0, v0
-
-	push	%eax
-	push	v0
-
-	cmp	$1, n
-	jnz	L(reduce_nby1)
-
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
-	mov	(up), %ecx
-	mov	%ecx, %eax
-	shr	$DIV_THRES_LOG2, %ecx
-	cmp	%ecx, v0
-	ja	L(reduced)
-
-	mov	v0, %esi
-	xor	%edx, %edx
-	div	%esi
-	mov	%edx, %eax
-	jmp	L(reduced)
-
-L(reduce_nby1):
-ifdef(`PIC_WITH_EBX',`
-	push	%ebx
-	call	L(movl_eip_to_ebx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	movl	PARAM_SIZE, %ecx
+	cmpl	$1, %ecx
+	ja	L(divide)
+
+
+	C eax
+	C ebx	x
+	C ecx
+	C edx	y
+	C esi	common twos
+	C edi	[PIC] L(table)
+	C ebp
+
+	movl	%edx, %eax
+	cmpl	%ebx, %edx
+
+	cmovb(	%ebx, %eax)	C swap to make x bigger than y
+	cmovb(	%edx, %ebx)
+
+
+L(strip_y):
+	C eax	x
+	C ebx	y
+	C ecx
+	C edx
+	C esi	common twos
+	C edi	[PIC] L(table)
+	C ebp
+
+	ASSERT(nz,`orl %ebx,%ebx')
+	shrl	%ebx
+	jnc	L(strip_y)
+	rcll	%ebx
+
+
+	C eax	x
+	C ebx	y (odd)
+	C ecx
+	C edx
+	C esi	common twos
+	C edi	[PIC] L(table)
+	C ebp
+
+	movl	%eax, %ecx
+	movl	%ebx, %edx
+	shrl	$DIV_THRESHOLD, %eax
+
+	cmpl	%eax, %ebx
+	movl	%ecx, %eax
+	ja	L(strip_x_entry)	C do x%y if x much bigger than y
+
+
+	xorl	%edx, %edx
+
+	divl	%ebx
+
+	orl	%edx, %edx
+	movl	%edx, %eax		C remainder -> x
+	movl	%ebx, %edx		C y
+
+	jz	L(done_ebx)
+	jmp	L(strip_x)
+
+
+	C Offset 0x9D here for non-PIC.  About 0.4 cycles/bit is saved by
+	C ensuring the end of the jnz at the end of this loop doesn't cross
+	C into the next cache line at 0xC0.
+	C
+	C PIC on the other hand is offset 0xAC here and extends to 0xC9, so
+	C it crosses but doesn't suffer any measurable slowdown.
+
+L(top):
+	C eax	x
+	C ebx	y-x
+	C ecx	x-y
+	C edx	y
+	C esi	twos, for use at end
+	C edi	[PIC] L(table)
+
+	cmovc(	%ebx, %ecx)		C if x-y gave carry, use x and y-x
+	cmovc(	%eax, %edx)
+
+L(strip_x):
+	movl	%ecx, %eax
+L(strip_x_entry):
+	andl	$MASK, %ecx
+
+	ASSERT(nz, `orl %eax, %eax')
+
+ifdef(`PIC',`
+	movb	(%ecx,%edi), %cl
+',`
+	movb	L(table) (%ecx), %cl
 ')
-	push	v0			C param 3
-	push	n			C param 2
-	push	up			C param 1
-	cmp	$BMOD_1_TO_MOD_1_THRESHOLD, n
-	jl	L(bmod)
-	CALL(	mpn_mod_1)
-	jmp	L(called)
-L(bmod):
-	CALL(	mpn_modexact_1_odd)
-
-L(called):
-	add	$12, %esp		C deallocate params
-ifdef(`PIC_WITH_EBX',`
-	pop	%ebx
+
+	shrl	%cl, %eax
+	cmpb	$MAXSHIFT, %cl
+
+	movl	%eax, %ecx
+	movl	%edx, %ebx
+	je	L(strip_x)
+
+	ASSERT(nz, `testl $1, %eax')	C both odd
+	ASSERT(nz, `testl $1, %edx')
+
+	subl	%eax, %ebx
+	subl	%edx, %ecx
+	jnz	L(top)
+
+
+L(done):
+	movl	%esi, %ecx
+	movl	SAVE_ESI, %esi
+ifdef(`PIC',`
+	movl	SAVE_EDI, %edi
 ')
-L(reduced):
-	pop	%edx
-
-	LEA(	ctz_table, %esi)
-	test	%eax, %eax
-	mov	%eax, %ecx
-	jnz	L(mid)
-	jmp	L(end)
-
-	ALIGN(16)			C               K8    BC    P4    NHM   SBR
-L(top):	cmovc(	%ecx, %eax)		C if x-y < 0	0
-	cmovc(	%edi, %edx)		C use x,y-x	0
-L(mid):	and	$MASK, %ecx		C		0
-	movzbl	(%esi,%ecx), %ecx	C		1
-	jz	L(shift_alot)		C		1
-	shr	%cl, %eax		C		3
-	mov	%eax, %edi		C		4
-	mov	%edx, %ecx		C		3
-	sub	%eax, %ecx		C		4
-	sub	%edx, %eax		C		4
-	jnz	L(top)			C		5
-
-L(end):	pop	%ecx
-	mov	%edx, %eax
-	shl	%cl, %eax
-	pop	%esi
-	pop	%edi
-	ret
 
-L(shift_alot):
-	shr	$MAXSHIFT, %eax
-	mov	%eax, %ecx
-	jmp	L(mid)
+	shll	%cl, %eax
+	movl	SAVE_EBX, %ebx
+	addl	$FRAME, %esp
 
-ifdef(`PIC_WITH_EBX',`
-L(movl_eip_to_ebx):
-	mov	(%esp), %ebx
 	ret
+
+
+
+C -----------------------------------------------------------------------------
+C two or more limbs
+
+dnl  MODEXACT_THRESHOLD is the size at which it's better to call
+dnl  mpn_modexact_1_odd than do an inline loop.
+
+deflit(MODEXACT_THRESHOLD, ifdef(`PIC',6,5))
+
+L(divide):
+	C eax	src
+	C ebx
+	C ecx	size
+	C edx	y
+	C esi	common twos
+	C edi	[PIC] L(table)
+	C ebp
+
+L(divide_strip_y):
+	ASSERT(nz,`orl %edx,%edx')
+	shrl	%edx
+	jnc	L(divide_strip_y)
+	leal	1(%edx,%edx), %ebx		C y now odd
+
+	movl	%ebp, SAVE_EBP
+	movl	%eax, %ebp
+	movl	-4(%eax,%ecx,4), %eax		C src high limb
+
+	cmp	$MODEXACT_THRESHOLD, %ecx
+	jae	L(modexact)
+
+	cmpl	%ebx, %eax			C high cmp divisor
+	movl	$0, %edx
+
+	cmovc(	%eax, %edx)			C skip a div if high<divisor
+	sbbl	$0, %ecx
+
+
+L(divide_top):
+	C eax	scratch (quotient)
+	C ebx	y
+	C ecx	counter (size to 1, inclusive)
+	C edx	carry (remainder)
+	C esi	common twos
+	C edi	[PIC] L(table)
+	C ebp	src
+
+	movl	-4(%ebp,%ecx,4), %eax
+
+	divl	%ebx
+
+	decl	%ecx
+	jnz	L(divide_top)
+
+
+	C eax
+	C ebx	y (odd)
+	C ecx
+	C edx	x
+	C esi	common twos
+	C edi	[PIC] L(table)
+	C ebp
+
+	orl	%edx, %edx
+	movl	SAVE_EBP, %ebp
+	movl	%edx, %eax
+
+	movl	%edx, %ecx
+	movl	%ebx, %edx
+	jnz	L(strip_x_entry)
+
+
+L(done_ebx):
+	movl	%ebx, %eax
+	jmp	L(done)
+
+
+
+L(modexact):
+	C eax
+	C ebx	y
+	C ecx	size
+	C edx
+	C esi	common twos
+	C edi	[PIC] L(table)
+	C ebp	src
+
+ifdef(`PIC',`
+	movl	%ebp, CALL_SRC
+	movl	%ebx, %ebp		C y
+	movl	%edi, %ebx		C L(table)
+
+	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(table)], %ebx
+	movl	%ebp, CALL_DIVISOR
+	movl	%ecx, CALL_SIZE
+
+	call	GSYM_PREFIX`'mpn_modexact_1_odd@PLT
+',`
+dnl non-PIC
+	movl	%ebx, CALL_DIVISOR
+	movl	%ebp, CALL_SRC
+	movl	%ecx, CALL_SIZE
+
+	call	GSYM_PREFIX`'mpn_modexact_1_odd
 ')
+
+	C eax	x
+	C ebx	[non-PIC] y
+	C ecx
+	C edx
+	C esi	common twos
+	C edi	[PIC] L(table)
+	C ebp	[PIC] y
+
+	orl	%eax, %eax
+	movl	ifdef(`PIC',`%ebp',`%ebx'), %edx
+	movl	SAVE_EBP, %ebp
+
+	movl	%eax, %ecx
+	jnz	L(strip_x_entry)
+
+	movl	%edx, %eax
+	jmp	L(done)
+
+
+ifdef(`PIC', `
+L(movl_eip_to_edi):
+	movl	(%esp), %edi
+	ret_internal
+')
+
 EPILOGUE()
diff --git a/gmp/mpn/x86/k7/gmp-mparam.h b/gmp/mpn/x86/k7/gmp-mparam.h
index 9977a113e2..ced0c020f7 100644
--- a/gmp/mpn/x86/k7/gmp-mparam.h
+++ b/gmp/mpn/x86/k7/gmp-mparam.h
@@ -1,241 +1,73 @@
 /* AMD K7 gmp-mparam.h -- Compiler/machine parameter header file.
 
-Copyright 1991, 1993, 1994, 2000-2005, 2008-2010, 2014 Free Software
-Foundation, Inc.
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2008 Free
+Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
 The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
 
-  * the GNU Lesser General Public License as published by the Free
-    Software Foundation; either version 3 of the License, or (at your
-    option) any later version.
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
 
-or
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 
-  * the GNU General Public License as published by the Free Software
-    Foundation; either version 2 of the License, or (at your option) any
-    later version.
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
 
-or both in parallel, as here.
 
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library.  If not,
-see https://www.gnu.org/licenses/.  */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 2083 MHz K7 Barton */
-/* FFT tuning limit = 25000000 */
-/* Generated by tuneup.c, 2014-03-13, gcc 4.2 */
-
-#define MOD_1_NORM_THRESHOLD                 0  /* always */
-#define MOD_1_UNNORM_THRESHOLD               3
-#define MOD_1N_TO_MOD_1_1_THRESHOLD          7
-#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD        24
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     10
-#define USE_PREINV_DIVREM_1                  1  /* native */
-#define DIV_QR_1N_PI1_METHOD                 1
-#define DIV_QR_1_NORM_THRESHOLD              3
-#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
-#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
-#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD           24
-
-#define MUL_TOOM22_THRESHOLD                28
-#define MUL_TOOM33_THRESHOLD                85
-#define MUL_TOOM44_THRESHOLD               147
-#define MUL_TOOM6H_THRESHOLD               216
-#define MUL_TOOM8H_THRESHOLD               309
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD      85
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD      99
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD      98
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD     102
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD     124
-
-#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
-#define SQR_TOOM2_THRESHOLD                 50
-#define SQR_TOOM3_THRESHOLD                 81
-#define SQR_TOOM4_THRESHOLD                216
-#define SQR_TOOM6_THRESHOLD                306
-#define SQR_TOOM8_THRESHOLD                446
-
-#define MULMID_TOOM42_THRESHOLD             56
-
-#define MULMOD_BNM1_THRESHOLD               17
-#define SQRMOD_BNM1_THRESHOLD               17
-
-#define MUL_FFT_MODF_THRESHOLD             904  /* k = 6 */
-#define MUL_FFT_TABLE3                                      \
-  { {    904, 6}, {     21, 7}, {     11, 6}, {     25, 7}, \
-    {     13, 6}, {     27, 7}, {     15, 6}, {     31, 7}, \
-    {     17, 6}, {     35, 7}, {     19, 6}, {     39, 7}, \
-    {     23, 6}, {     47, 7}, {     27, 8}, {     15, 7}, \
-    {     31, 6}, {     63, 7}, {     35, 8}, {     19, 7}, \
-    {     39, 8}, {     23, 7}, {     47, 8}, {     31, 7}, \
-    {     63, 8}, {     39, 7}, {     79, 9}, {     23, 8}, \
-    {     47, 7}, {     95, 8}, {     51, 9}, {     31, 8}, \
-    {     71, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
-    {     95, 9}, {     55,10}, {     31, 9}, {     63, 8}, \
-    {    127, 9}, {     71, 8}, {    143, 9}, {     79, 8}, \
-    {    159,10}, {     47, 9}, {     95, 8}, {    191, 9}, \
-    {    103,11}, {     31,10}, {     63, 9}, {    127, 8}, \
-    {    255, 9}, {    143,10}, {     79, 9}, {    167,10}, \
-    {     95, 9}, {    199,10}, {    111,11}, {     63,10}, \
-    {    127, 9}, {    255,10}, {    143, 9}, {    287,10}, \
-    {    159, 9}, {    319,11}, {     95,10}, {    191, 9}, \
-    {    383,10}, {    207,12}, {     63,11}, {    127,10}, \
-    {    255, 9}, {    511,10}, {    271, 8}, {   1087,10}, \
-    {    287,11}, {    159,10}, {    319, 9}, {    639,11}, \
-    {    191,10}, {    383, 9}, {    767, 8}, {   1535, 9}, \
-    {    799, 8}, {   1599,11}, {    223,12}, {    127,11}, \
-    {    255,10}, {    511, 9}, {   1023,10}, {    543, 9}, \
-    {   1087,11}, {    287,10}, {    575, 9}, {   1151,10}, \
-    {    607, 9}, {   1215, 8}, {   2431,11}, {    319,10}, \
-    {    639, 9}, {   1279,10}, {    671, 9}, {   1343,12}, \
-    {    191,11}, {    383,10}, {    767, 9}, {   1535,10}, \
-    {    799, 9}, {   1599,10}, {    831, 9}, {   1663,10}, \
-    {    863,13}, {    127,12}, {    255,11}, {    511,10}, \
-    {   1023,11}, {    543,10}, {   1087,11}, {    575,10}, \
-    {   1151,11}, {    607,10}, {   1215, 9}, {   2431,12}, \
-    {    319,11}, {    639,10}, {   1407,11}, {    735,10}, \
-    {   1471, 9}, {   2943,12}, {    383,11}, {    767,10}, \
-    {   1535,11}, {    799,10}, {   1599,11}, {    831,10}, \
-    {   1663,11}, {    895,10}, {   1791,11}, {    959,10}, \
-    {   1919,13}, {    255,12}, {    511,11}, {   1023,10}, \
-    {   2047,11}, {   1087,12}, {    575,11}, {   1151,10}, \
-    {   2303,11}, {   1215,10}, {   2431,12}, {    639,11}, \
-    {   1279,10}, {   2559,11}, {   1407,10}, {   2815,11}, \
-    {   1471,10}, {   2943,13}, {    383,12}, {    767,11}, \
-    {   1599,12}, {    831,11}, {   1663,12}, {    895,11}, \
-    {   1791,10}, {   3583,12}, {    959,11}, {   1919,10}, \
-    {   3839,14}, {    255,13}, {    511,12}, {   1023,11}, \
-    {   2047,12}, {   1087,11}, {   2175,12}, {   1151,11}, \
-    {   2303,12}, {   1215,11}, {   2431,13}, {    639,12}, \
-    {   1407,11}, {   2815,12}, {   1471,11}, {   2943,13}, \
-    {    767,12}, {   1663,11}, {   3327,13}, {    895,12}, \
-    {   1791,11}, {   3583,12}, {   1919,11}, {   3839,12}, \
-    {   1983,11}, {   3967,14}, {    511,13}, {   1023,12}, \
-    {   2239,13}, {   1151,12}, {   2495,13}, {   1279,12}, \
-    {   2559,13}, {   1407,12}, {   2943,11}, {   5887,14}, \
-    {    767,13}, {   1535,12}, {   3071,13}, {   1663,12}, \
-    {   3327,13}, {   1791,12}, {   3583,13}, {   1919,12}, \
-    {   3967,15}, {    511,14}, {   1023,13}, {   2047,12}, \
-    {   4095,13}, {   2175,12}, {   4351,13}, {   2431,12}, \
-    {   4863,14}, {   1279,13}, {   2559,12}, {   5119,13}, \
-    {   2943,12}, {   5887,14}, {  16384,15}, {  32768,16} }
-#define MUL_FFT_TABLE3_SIZE 228
-#define MUL_FFT_THRESHOLD                 7808
-
-#define SQR_FFT_MODF_THRESHOLD             888  /* k = 6 */
-#define SQR_FFT_TABLE3                                      \
-  { {    888, 6}, {     21, 7}, {     11, 6}, {     25, 7}, \
-    {     13, 6}, {     27, 7}, {     15, 6}, {     31, 7}, \
-    {     17, 6}, {     35, 7}, {     19, 6}, {     39, 7}, \
-    {     23, 6}, {     47, 7}, {     27, 8}, {     15, 7}, \
-    {     31, 6}, {     63, 7}, {     35, 8}, {     19, 7}, \
-    {     39, 8}, {     23, 7}, {     47, 8}, {     31, 7}, \
-    {     63, 8}, {     39, 9}, {     23, 8}, {     47, 7}, \
-    {     95, 8}, {     51, 9}, {     31, 8}, {     67, 9}, \
-    {     39, 8}, {     79, 9}, {     47, 8}, {     95, 9}, \
-    {     55,10}, {     31, 9}, {     63, 8}, {    127, 9}, \
-    {     79,10}, {     47, 9}, {     95, 8}, {    191,11}, \
-    {     31,10}, {     63, 9}, {    127, 8}, {    255, 9}, \
-    {    143,10}, {     79, 9}, {    167,10}, {     95, 9}, \
-    {    191,10}, {    111,11}, {     63,10}, {    127, 9}, \
-    {    255, 8}, {    511,10}, {    143, 9}, {    287, 8}, \
-    {    575,10}, {    159,11}, {     95,10}, {    191, 9}, \
-    {    383,12}, {     63,11}, {    127,10}, {    255, 9}, \
-    {    511,10}, {    271, 9}, {    543, 8}, {   1087,10}, \
-    {    287, 9}, {    575,11}, {    159,10}, {    319, 9}, \
-    {    639, 8}, {   1279, 9}, {    671,11}, {    191,10}, \
-    {    383, 9}, {    799, 8}, {   1599, 9}, {    831,11}, \
-    {    223,12}, {    127,11}, {    255,10}, {    543, 9}, \
-    {   1087,11}, {    287,10}, {    575, 9}, {   1215, 8}, \
-    {   2431,11}, {    319,10}, {    639, 9}, {   1279,10}, \
-    {    671, 9}, {   1407,12}, {    191,10}, {    799, 9}, \
-    {   1599,10}, {    831, 9}, {   1663,10}, {    863, 9}, \
-    {   1727,11}, {    447,13}, {    127,12}, {    255,11}, \
-    {    511,10}, {   1023,11}, {    543,10}, {   1087, 9}, \
-    {   2175,10}, {   1119,11}, {    575,10}, {   1151,11}, \
-    {    607,10}, {   1215, 9}, {   2431,12}, {    319,11}, \
-    {    639,10}, {   1279,11}, {    671,10}, {   1343, 9}, \
-    {   2687,11}, {    703,10}, {   1407,11}, {    735,10}, \
-    {   1471, 9}, {   2943,10}, {   1503,12}, {    383,11}, \
-    {    767,10}, {   1535,11}, {    799,10}, {   1599,11}, \
-    {    863,10}, {   1727,12}, {    447,11}, {    895,10}, \
-    {   1791,11}, {    959,10}, {   1919,13}, {    255,12}, \
-    {    511,11}, {   1023,10}, {   2047,11}, {   1087,10}, \
-    {   2175,11}, {   1119,12}, {    575,11}, {   1151,10}, \
-    {   2303,11}, {   1215,10}, {   2431,12}, {    639,11}, \
-    {   1407,10}, {   2815,11}, {   1471,10}, {   2943,12}, \
-    {    767,11}, {   1599,12}, {    831,11}, {   1663,10}, \
-    {   3327,12}, {    895,11}, {   1791,10}, {   3583,12}, \
-    {    959,11}, {   1919,10}, {   3839,11}, {   1983,14}, \
-    {    255,13}, {    511,12}, {   1023,11}, {   2047,12}, \
-    {   1087,11}, {   2175,12}, {   1151,11}, {   2303,12}, \
-    {   1215,11}, {   2431,13}, {    639,12}, {   1407,11}, \
-    {   2815,12}, {   1471,11}, {   2943,13}, {    767,12}, \
-    {   1663,11}, {   3327,12}, {   1727,13}, {    895,12}, \
-    {   1791,11}, {   3583,12}, {   1919,11}, {   3839,12}, \
-    {   1983,11}, {   3967,14}, {    511,13}, {   1023,12}, \
-    {   2175,13}, {   1151,12}, {   2495,13}, {   1279,12}, \
-    {   2559,13}, {   1407,12}, {   2943,11}, {   5887,14}, \
-    {    767,13}, {   1535,12}, {   3071,13}, {   1663,12}, \
-    {   3327,13}, {   1791,12}, {   3583,13}, {   1919,12}, \
-    {   3967,15}, {    511,14}, {   1023,13}, {   2047,12}, \
-    {   4095,13}, {   2175,12}, {   4351,13}, {   2431,14}, \
-    {   1279,13}, {   2943,12}, {   5887,14}, {  16384,15}, \
-    {  32768,16} }
-#define SQR_FFT_TABLE3_SIZE 229
-#define SQR_FFT_THRESHOLD                 7552
-
-#define MULLO_BASECASE_THRESHOLD             8
-#define MULLO_DC_THRESHOLD                  36
-#define MULLO_MUL_N_THRESHOLD            13463
-
-#define DC_DIV_QR_THRESHOLD                 45
-#define DC_DIVAPPR_Q_THRESHOLD             208
-#define DC_BDIV_QR_THRESHOLD                43
-#define DC_BDIV_Q_THRESHOLD                140
-
-#define INV_MULMOD_BNM1_THRESHOLD           62
-#define INV_NEWTON_THRESHOLD               204
-#define INV_APPR_THRESHOLD                 204
-
-#define BINV_NEWTON_THRESHOLD              230
-#define REDC_1_TO_REDC_N_THRESHOLD          59
-
-#define MU_DIV_QR_THRESHOLD               1752
-#define MU_DIVAPPR_Q_THRESHOLD            1528
-#define MUPI_DIV_QR_THRESHOLD               82
-#define MU_BDIV_QR_THRESHOLD              1360
-#define MU_BDIV_Q_THRESHOLD               1470
-
-#define POWM_SEC_TABLE  1,16,102,336,1221
-
-#define MATRIX22_STRASSEN_THRESHOLD         16
-#define HGCD_THRESHOLD                     120
-#define HGCD_APPR_THRESHOLD                143
-#define HGCD_REDUCE_THRESHOLD             4818
-#define GCD_DC_THRESHOLD                   474
-#define GCDEXT_DC_THRESHOLD                345
-#define JACOBI_BASE_METHOD                   4
-
-#define GET_STR_DC_THRESHOLD                15
-#define GET_STR_PRECOMPUTE_THRESHOLD        33
-#define SET_STR_DC_THRESHOLD               298
-#define SET_STR_PRECOMPUTE_THRESHOLD      1187
-
-#define FAC_DSC_THRESHOLD                  602
-#define FAC_ODD_THRESHOLD                   29
+/* 2083 MHz Athlon */
+
+/* Generated by tuneup.c, 2008-12-23, gcc 3.4 */
+
+#define MUL_KARATSUBA_THRESHOLD          28
+#define MUL_TOOM3_THRESHOLD              89
+#define MUL_TOOM44_THRESHOLD            130
+
+#define SQR_BASECASE_THRESHOLD            0  /* always (native) */
+#define SQR_KARATSUBA_THRESHOLD          52
+#define SQR_TOOM3_THRESHOLD              89
+#define SQR_TOOM4_THRESHOLD             196
+
+#define MULLOW_BASECASE_THRESHOLD        10
+#define MULLOW_DC_THRESHOLD              96
+#define MULLOW_MUL_N_THRESHOLD          234
+
+#define DIV_SB_PREINV_THRESHOLD           0  /* always */
+#define DIV_DC_THRESHOLD                 86
+#define POWM_THRESHOLD                  134
+#define MATRIX22_STRASSEN_THRESHOLD      18
+#define HGCD_THRESHOLD                  163
+#define GCD_DC_THRESHOLD                665
+#define GCDEXT_DC_THRESHOLD             605
+#define JACOBI_BASE_METHOD                1
+
+#define USE_PREINV_DIVREM_1               1  /* native */
+#define USE_PREINV_MOD_1                  1  /* native */
+#define DIVEXACT_1_THRESHOLD              0  /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD          0  /* always (native) */
+
+#define GET_STR_DC_THRESHOLD             19
+#define GET_STR_PRECOMPUTE_THRESHOLD     35
+#define SET_STR_DC_THRESHOLD            826
+#define SET_STR_PRECOMPUTE_THRESHOLD   1691
+
+#define MUL_FFT_TABLE  { 432, 864, 1664, 4608, 10240, 40960, 163840, 655360, 0 }
+#define MUL_FFT_MODF_THRESHOLD          496
+#define MUL_FFT_THRESHOLD              4864
+
+#define SQR_FFT_TABLE  { 432, 864, 1664, 4608, 10240, 40960, 98304, 655360, 0 }
+#define SQR_FFT_MODF_THRESHOLD          432
+#define SQR_FFT_THRESHOLD              3840
+
+/* These tables need to be updated.  */
+
+#define MUL_FFT_TABLE2 {{1, 4}, {401, 5}, {801, 6}, {817, 5}, {865, 6}, {1025, 5}, {1057, 6}, {1601, 7}, {1633, 6}, {1729, 7}, {1921, 6}, {2113, 7}, {2177, 6}, {2241, 7}, {2433, 6}, {2497, 7}, {2945, 6}, {3009, 7}, {3457, 8}, {3521, 7}, {4481, 8}, {4865, 7}, {5249, 8}, {5889, 7}, {6017, 8}, {7553, 9}, {7681, 8}, {9985, 9}, {11777, 8}, {13057, 9}, {13825, 8}, {14081, 9}, {15873, 8}, {16641, 9}, {16897, 8}, {17153, 9}, {19969, 8}, {20225, 9}, {20737, 8}, {20993, 9}, {24065, 8}, {24577, 9}, {25089, 8}, {25345, 9}, {27393, 10}, {27649, 9}, {28161, 10}, {31745, 9}, {38913, 10}, {39425, 9}, {40449, 10}, {48129, 9}, {48641, 11}, {63489, 10}, {98305, 11}, {99329, 10}, {100353, 11}, {101377, 10}, {103425, 11}, {104449, 10}, {110593, 11}, {112641, 10}, {113665, 11}, {129025, 10}, {162817, 11}, {194561, 10}, {195585, 12}, {258049, 11}, {391169, 12}, {520193, 11}, {718849, 12}, {782337, 11}, {849921, 13}, {1040385, 12}, {2879489, 13}, {3137537, 12}, {3928065, 13}, {4186113, 12}, {4976641, 13}, {5234689, 12}, {6025217, 13}, {6283265, 12}, {MP_SIZE_T_MAX,0}}
+
+#define SQR_FFT_TABLE2 {{1, 4}, {401, 5}, {417, 4}, {433, 5}, {881, 6}, {961, 5}, {993, 6}, {1857, 7}, {1921, 6}, {2049, 7}, {2177, 6}, {2241, 7}, {2433, 6}, {2497, 7}, {3457, 8}, {3841, 7}, {4481, 8}, {4609, 7}, {4737, 8}, {4865, 7}, {5249, 8}, {5889, 7}, {6273, 8}, {7041, 9}, {7681, 8}, {9985, 9}, {10241, 8}, {10497, 9}, {11777, 8}, {13057, 9}, {15873, 8}, {16385, 9}, {16897, 8}, {17153, 9}, {19969, 8}, {20225, 9}, {20737, 8}, {20993, 9}, {24065, 8}, {24321, 9}, {24577, 10}, {24833, 9}, {25601, 10}, {27137, 9}, {27649, 10}, {31745, 9}, {38401, 10}, {38913, 9}, {40449, 10}, {48129, 9}, {48641, 11}, {63489, 10}, {99329, 11}, {101377, 10}, {103425, 11}, {104449, 10}, {107521, 11}, {110593, 10}, {113665, 11}, {129025, 10}, {154625, 11}, {155649, 10}, {162817, 11}, {194561, 12}, {258049, 11}, {391169, 12}, {520193, 11}, {718849, 12}, {727041, 11}, {729089, 12}, {782337, 11}, {849921, 13}, {1040385, 12}, {2879489, 13}, {3137537, 12}, {3928065, 13}, {4186113, 12}, {4714497, 13}, {5234689, 12}, {6025217, 13}, {6283265, 12}, {7073793, 13}, {7331841, 12}, {MP_SIZE_T_MAX,0}}
diff --git a/gmp/mpn/x86/k7/invert_limb.asm b/gmp/mpn/x86/k7/invert_limb.asm
deleted file mode 100644
index 6cce455a9d..0000000000
--- a/gmp/mpn/x86/k7/invert_limb.asm
+++ /dev/null
@@ -1,193 +0,0 @@
-dnl  x86 mpn_invert_limb
-
-dnl  Contributed to the GNU project by Niels Möller
-
-dnl  Copyright 2009, 2011 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C			    cycles (approx)	div
-C P5				 ?
-C P6 model 0-8,10-12		 ?
-C P6 model 9  (Banias)		 ?
-C P6 model 13 (Dothan)		 ?
-C P4 model 0  (Willamette)	 ?
-C P4 model 1  (?)		 ?
-C P4 model 2  (Northwood)	 ?
-C P4 model 3  (Prescott)	 ?
-C P4 model 4  (Nocona)		 ?
-C AMD K6			 ?
-C AMD K7			41		53
-C AMD K8			 ?
-
-C TODO
-C  * These c/l numbers are for a non-PIC build.  Consider falling back to using
-C    the 'div' instruction for PIC builds.
-C  * Perhaps use this file--or at least the algorithm--for more machines than k7.
-
-C Register usage:
-C   Input D in %edi
-C   Current approximation is in %eax and/or %ecx
-C   %ebx and %edx are temporaries
-C   %esi and %ebp are unused
-
-defframe(PARAM_DIVISOR,4)
-
-ASM_START()
-
-C Make approx_tab global to work around Apple relocation bug.
-ifdef(`DARWIN',`
-	deflit(`approx_tab', MPN(invert_limb_tab))
-	GLOBL	approx_tab')
-
-	TEXT
-	ALIGN(16)
-PROLOGUE(mpn_invert_limb)
-deflit(`FRAME', 0)
-	mov	PARAM_DIVISOR, %eax
-	C Avoid push/pop on k7.
-	sub	$8, %esp	FRAME_subl_esp(8)
-	mov	%ebx, (%esp)
-	mov	%edi, 4(%esp)
-
-	mov	%eax, %edi
-	shr	$22, %eax
-ifdef(`PIC',`
-	LEA(	approx_tab, %ebx)
-	movzwl	-1024(%ebx, %eax, 2), %eax
-',`
-	movzwl	-1024+approx_tab(%eax, %eax), %eax	C %eax = v0
-')
-
-	C v1 = (v0 << 4) - ((v0*v0*d_21) >> 32) - 1
-	mov	%eax, %ecx
-	imul	%eax, %eax
-	mov	%edi, %ebx
-	shr	$11, %ebx
-	inc	%ebx
-	mul	%ebx
-	mov	%edi, %ebx				C Prepare
-	shr	%ebx
-	sbb	%eax, %eax
-	sub	%eax, %ebx				C %ebx = d_31, %eax = mask
-	shl	$4, %ecx
-	dec	%ecx
-	sub	%edx, %ecx				C %ecx = v1
-
-	C v_2 = (v1 << 15) + ((v1 *(2^48 - v1 * d31 + (v1 >> 1) & mask)) >> 33)
-	imul	%ecx, %ebx
-	and	%ecx, %eax
-	shr	%eax
-	sub	%ebx, %eax
-	mul	%ecx
-	mov	%edi, %eax				C Prepare for next mul
-	shl	$15, %ecx
-	shr	%edx
-	add	%edx, %ecx				C %ecx = v2
-
-	mul	%ecx
-	add	%edi, %eax
-	mov	%ecx, %eax
-	adc	%edi, %edx
-	sub	%edx, %eax				C %eax = v3
-
-	mov	(%esp), %ebx
-	mov	4(%esp), %edi
-	add	$8, %esp
-
-	ret
-
-EPILOGUE()
-
-DEF_OBJECT(approx_tab,2)
-	.value	0x7fe1,0x7fa1,0x7f61,0x7f22,0x7ee3,0x7ea4,0x7e65,0x7e27
-	.value	0x7de9,0x7dab,0x7d6d,0x7d30,0x7cf3,0x7cb6,0x7c79,0x7c3d
-	.value	0x7c00,0x7bc4,0x7b89,0x7b4d,0x7b12,0x7ad7,0x7a9c,0x7a61
-	.value	0x7a27,0x79ec,0x79b2,0x7979,0x793f,0x7906,0x78cc,0x7894
-	.value	0x785b,0x7822,0x77ea,0x77b2,0x777a,0x7742,0x770b,0x76d3
-	.value	0x769c,0x7665,0x762f,0x75f8,0x75c2,0x758c,0x7556,0x7520
-	.value	0x74ea,0x74b5,0x7480,0x744b,0x7416,0x73e2,0x73ad,0x7379
-	.value	0x7345,0x7311,0x72dd,0x72aa,0x7277,0x7243,0x7210,0x71de
-	.value	0x71ab,0x7179,0x7146,0x7114,0x70e2,0x70b1,0x707f,0x704e
-	.value	0x701c,0x6feb,0x6fba,0x6f8a,0x6f59,0x6f29,0x6ef9,0x6ec8
-	.value	0x6e99,0x6e69,0x6e39,0x6e0a,0x6ddb,0x6dab,0x6d7d,0x6d4e
-	.value	0x6d1f,0x6cf1,0x6cc2,0x6c94,0x6c66,0x6c38,0x6c0a,0x6bdd
-	.value	0x6bb0,0x6b82,0x6b55,0x6b28,0x6afb,0x6acf,0x6aa2,0x6a76
-	.value	0x6a49,0x6a1d,0x69f1,0x69c6,0x699a,0x696e,0x6943,0x6918
-	.value	0x68ed,0x68c2,0x6897,0x686c,0x6842,0x6817,0x67ed,0x67c3
-	.value	0x6799,0x676f,0x6745,0x671b,0x66f2,0x66c8,0x669f,0x6676
-	.value	0x664d,0x6624,0x65fc,0x65d3,0x65aa,0x6582,0x655a,0x6532
-	.value	0x650a,0x64e2,0x64ba,0x6493,0x646b,0x6444,0x641c,0x63f5
-	.value	0x63ce,0x63a7,0x6381,0x635a,0x6333,0x630d,0x62e7,0x62c1
-	.value	0x629a,0x6275,0x624f,0x6229,0x6203,0x61de,0x61b8,0x6193
-	.value	0x616e,0x6149,0x6124,0x60ff,0x60da,0x60b6,0x6091,0x606d
-	.value	0x6049,0x6024,0x6000,0x5fdc,0x5fb8,0x5f95,0x5f71,0x5f4d
-	.value	0x5f2a,0x5f07,0x5ee3,0x5ec0,0x5e9d,0x5e7a,0x5e57,0x5e35
-	.value	0x5e12,0x5def,0x5dcd,0x5dab,0x5d88,0x5d66,0x5d44,0x5d22
-	.value	0x5d00,0x5cde,0x5cbd,0x5c9b,0x5c7a,0x5c58,0x5c37,0x5c16
-	.value	0x5bf5,0x5bd4,0x5bb3,0x5b92,0x5b71,0x5b51,0x5b30,0x5b10
-	.value	0x5aef,0x5acf,0x5aaf,0x5a8f,0x5a6f,0x5a4f,0x5a2f,0x5a0f
-	.value	0x59ef,0x59d0,0x59b0,0x5991,0x5972,0x5952,0x5933,0x5914
-	.value	0x58f5,0x58d6,0x58b7,0x5899,0x587a,0x585b,0x583d,0x581f
-	.value	0x5800,0x57e2,0x57c4,0x57a6,0x5788,0x576a,0x574c,0x572e
-	.value	0x5711,0x56f3,0x56d5,0x56b8,0x569b,0x567d,0x5660,0x5643
-	.value	0x5626,0x5609,0x55ec,0x55cf,0x55b2,0x5596,0x5579,0x555d
-	.value	0x5540,0x5524,0x5507,0x54eb,0x54cf,0x54b3,0x5497,0x547b
-	.value	0x545f,0x5443,0x5428,0x540c,0x53f0,0x53d5,0x53b9,0x539e
-	.value	0x5383,0x5368,0x534c,0x5331,0x5316,0x52fb,0x52e0,0x52c6
-	.value	0x52ab,0x5290,0x5276,0x525b,0x5240,0x5226,0x520c,0x51f1
-	.value	0x51d7,0x51bd,0x51a3,0x5189,0x516f,0x5155,0x513b,0x5121
-	.value	0x5108,0x50ee,0x50d5,0x50bb,0x50a2,0x5088,0x506f,0x5056
-	.value	0x503c,0x5023,0x500a,0x4ff1,0x4fd8,0x4fbf,0x4fa6,0x4f8e
-	.value	0x4f75,0x4f5c,0x4f44,0x4f2b,0x4f13,0x4efa,0x4ee2,0x4eca
-	.value	0x4eb1,0x4e99,0x4e81,0x4e69,0x4e51,0x4e39,0x4e21,0x4e09
-	.value	0x4df1,0x4dda,0x4dc2,0x4daa,0x4d93,0x4d7b,0x4d64,0x4d4d
-	.value	0x4d35,0x4d1e,0x4d07,0x4cf0,0x4cd8,0x4cc1,0x4caa,0x4c93
-	.value	0x4c7d,0x4c66,0x4c4f,0x4c38,0x4c21,0x4c0b,0x4bf4,0x4bde
-	.value	0x4bc7,0x4bb1,0x4b9a,0x4b84,0x4b6e,0x4b58,0x4b41,0x4b2b
-	.value	0x4b15,0x4aff,0x4ae9,0x4ad3,0x4abd,0x4aa8,0x4a92,0x4a7c
-	.value	0x4a66,0x4a51,0x4a3b,0x4a26,0x4a10,0x49fb,0x49e5,0x49d0
-	.value	0x49bb,0x49a6,0x4990,0x497b,0x4966,0x4951,0x493c,0x4927
-	.value	0x4912,0x48fe,0x48e9,0x48d4,0x48bf,0x48ab,0x4896,0x4881
-	.value	0x486d,0x4858,0x4844,0x482f,0x481b,0x4807,0x47f3,0x47de
-	.value	0x47ca,0x47b6,0x47a2,0x478e,0x477a,0x4766,0x4752,0x473e
-	.value	0x472a,0x4717,0x4703,0x46ef,0x46db,0x46c8,0x46b4,0x46a1
-	.value	0x468d,0x467a,0x4666,0x4653,0x4640,0x462c,0x4619,0x4606
-	.value	0x45f3,0x45e0,0x45cd,0x45ba,0x45a7,0x4594,0x4581,0x456e
-	.value	0x455b,0x4548,0x4536,0x4523,0x4510,0x44fe,0x44eb,0x44d8
-	.value	0x44c6,0x44b3,0x44a1,0x448f,0x447c,0x446a,0x4458,0x4445
-	.value	0x4433,0x4421,0x440f,0x43fd,0x43eb,0x43d9,0x43c7,0x43b5
-	.value	0x43a3,0x4391,0x437f,0x436d,0x435c,0x434a,0x4338,0x4327
-	.value	0x4315,0x4303,0x42f2,0x42e0,0x42cf,0x42bd,0x42ac,0x429b
-	.value	0x4289,0x4278,0x4267,0x4256,0x4244,0x4233,0x4222,0x4211
-	.value	0x4200,0x41ef,0x41de,0x41cd,0x41bc,0x41ab,0x419a,0x418a
-	.value	0x4179,0x4168,0x4157,0x4147,0x4136,0x4125,0x4115,0x4104
-	.value	0x40f4,0x40e3,0x40d3,0x40c2,0x40b2,0x40a2,0x4091,0x4081
-	.value	0x4071,0x4061,0x4050,0x4040,0x4030,0x4020,0x4010,0x4000
-END_OBJECT(approx_tab)
diff --git a/gmp/mpn/x86/k7/mmx/com.asm b/gmp/mpn/x86/k7/mmx/com_n.asm
index a258c224f1..068c01f076 100644
--- a/gmp/mpn/x86/k7/mmx/com.asm
+++ b/gmp/mpn/x86/k7/mmx/com_n.asm
@@ -1,32 +1,21 @@
-dnl  AMD Athlon mpn_com -- mpn bitwise one's complement.
+dnl  AMD Athlon mpn_com_n -- mpn bitwise one's complement.
 
 dnl  Copyright 2002 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
 dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
+dnl  This file is part of the GNU MP Library.
 dnl
-dnl  or both in parallel, as here.
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
 dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
 dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
@@ -34,7 +23,7 @@ include(`../config.m4')
 C K7: 1.0 cycles/limb
 
 
-C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C void mpn_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size);
 C
 C The loop form below is necessary for the claimed speed.  It needs to be
 C aligned to a 16 byte boundary and only 16 bytes long.  Maybe that's so it
@@ -62,7 +51,7 @@ defframe(PARAM_DST, 4)
 	TEXT
 	ALIGN(16)
 
-PROLOGUE(mpn_com)
+PROLOGUE(mpn_com_n)
 deflit(`FRAME',0)
 
 	movl	PARAM_DST, %edx
diff --git a/gmp/mpn/x86/k7/mmx/copyd.asm b/gmp/mpn/x86/k7/mmx/copyd.asm
index 59ece40920..4601fcd75a 100644
--- a/gmp/mpn/x86/k7/mmx/copyd.asm
+++ b/gmp/mpn/x86/k7/mmx/copyd.asm
@@ -1,32 +1,21 @@
 dnl  AMD K7 mpn_copyd -- copy limb vector, decrementing.
 
 dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
 dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
+dnl  This file is part of the GNU MP Library.
 dnl
-dnl  or both in parallel, as here.
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
 dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
 dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
diff --git a/gmp/mpn/x86/k7/mmx/copyi.asm b/gmp/mpn/x86/k7/mmx/copyi.asm
index 9a28f927ec..a17d575ff4 100644
--- a/gmp/mpn/x86/k7/mmx/copyi.asm
+++ b/gmp/mpn/x86/k7/mmx/copyi.asm
@@ -1,32 +1,21 @@
 dnl  AMD K7 mpn_copyi -- copy limb vector, incrementing.
 
 dnl  Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
 dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
+dnl  This file is part of the GNU MP Library.
 dnl
-dnl  or both in parallel, as here.
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
 dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
 dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
diff --git a/gmp/mpn/x86/k7/mmx/divrem_1.asm b/gmp/mpn/x86/k7/mmx/divrem_1.asm
index cf343280bb..fa5824c7b9 100644
--- a/gmp/mpn/x86/k7/mmx/divrem_1.asm
+++ b/gmp/mpn/x86/k7/mmx/divrem_1.asm
@@ -1,33 +1,22 @@
 dnl  AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb
 dnl  division.
 
-dnl  Copyright 1999-2002, 2004 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
+dnl  Copyright 1999, 2000, 2001, 2002, 2004 Free Software Foundation, Inc.
 dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
+dnl  This file is part of the GNU MP Library.
 dnl
-dnl  or both in parallel, as here.
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
 dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
 dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
@@ -456,7 +445,7 @@ C chain, and nothing better than 18 cycles has been found when using it.
 C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
 C be an extremely rare event.
 C
-C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
+C Branch mispredictions will hit random occurrances of q1==0xFFFFFFFF, but
 C if some special data is coming out with this always, the q1_ff special
 C case actually runs at 15 c/l.  0x2FFF...FFFD divided by 3 is a good way to
 C induce the q1_ff case, for speed measurements or testing.  Note that
@@ -735,12 +724,12 @@ C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
 C rnd() means rounding down to a multiple of d.
 C
 C	m*n2 + b*n2 <= m*(d-1) + b*(d-1)
-C		     = m*d + b*d - m - b
-C		     = floor((b(b-d)-1)/d)*d + b*d - m - b
-C		     = rnd(b(b-d)-1) + b*d - m - b
-C		     = rnd(b(b-d)-1 + b*d) - m - b
-C		     = rnd(b*b-1) - m - b
-C		     <= (b-2)*b
+C	             = m*d + b*d - m - b
+C	             = floor((b(b-d)-1)/d)*d + b*d - m - b
+C	             = rnd(b(b-d)-1) + b*d - m - b
+C	             = rnd(b(b-d)-1 + b*d) - m - b
+C	             = rnd(b*b-1) - m - b
+C	             <= (b-2)*b
 C
 C Unchanged from the general case is that the final quotient limb q can be
 C either q1 or q1+1, and the q1+1 case occurs often.  This can be seen from
diff --git a/gmp/mpn/x86/k7/mmx/lshift.asm b/gmp/mpn/x86/k7/mmx/lshift.asm
index b3383cf2c3..b3bff8ffd1 100644
--- a/gmp/mpn/x86/k7/mmx/lshift.asm
+++ b/gmp/mpn/x86/k7/mmx/lshift.asm
@@ -1,32 +1,21 @@
 dnl  AMD K7 mpn_lshift -- mpn left shift.
 
-dnl  Copyright 1999-2002 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
+dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
 dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
+dnl  This file is part of the GNU MP Library.
 dnl
-dnl  or both in parallel, as here.
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
 dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
 dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
diff --git a/gmp/mpn/x86/k7/mmx/mod_1.asm b/gmp/mpn/x86/k7/mmx/mod_1.asm
new file mode 100644
index 0000000000..2b42e55caf
--- /dev/null
+++ b/gmp/mpn/x86/k7/mmx/mod_1.asm
@@ -0,0 +1,509 @@
+dnl  AMD K7 mpn_mod_1 -- mpn by limb remainder.
+
+dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 17.0 cycles/limb.
+
+
+C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C                       mp_limb_t carry);
+C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C                             mp_limb_t inverse);
+C
+C The code here is the same as mpn_divrem_1, but with the quotient
+C discarded.  See mpn/x86/k7/mmx/divrem_1.asm for some comments.
+
+
+dnl  MUL_THRESHOLD is the size at which the multiply by inverse method is
+dnl  used, rather than plain "divl"s.  Minimum value 2.
+dnl
+dnl  The inverse takes about 50 cycles to calculate, but after that the
+dnl  multiply is 17 c/l versus division at 41 c/l.
+dnl
+dnl  Using mul or div is about the same speed at 3 limbs, so the threshold
+dnl  is set to 4 to get the smaller div code used at 3.
+
+deflit(MUL_THRESHOLD, 4)
+
+
+defframe(PARAM_INVERSE,16)  dnl mpn_preinv_mod_1
+defframe(PARAM_CARRY,  16)  dnl mpn_mod_1c
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE,    8)
+defframe(PARAM_SRC,     4)
+
+defframe(SAVE_EBX,    -4)
+defframe(SAVE_ESI,    -8)
+defframe(SAVE_EDI,    -12)
+defframe(SAVE_EBP,    -16)
+
+defframe(VAR_NORM,    -20)
+defframe(VAR_INVERSE, -24)
+defframe(VAR_SRC_STOP,-28)
+
+deflit(STACK_SPACE, 28)
+
+	TEXT
+
+	ALIGN(32)
+PROLOGUE(mpn_preinv_mod_1)
+deflit(`FRAME',0)
+	movl	PARAM_SRC, %ecx
+	movl	PARAM_SIZE, %eax
+	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_DIVISOR, %ebp
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_INVERSE, %edx
+
+	movl	%esi, SAVE_ESI
+	movl	-4(%ecx,%eax,4), %edi		C src high limb
+	leal	-16(%ecx,%eax,4), %ecx		C &src[size-4]
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_INVERSE, %edx
+
+	movl	$0, VAR_NORM			C l==0
+
+	movl	%edi, %esi
+	subl	%ebp, %edi			C high-divisor
+
+	cmovc(	%esi, %edi)			C restore if underflow
+	decl	%eax
+	jz	L(done_edi)			C size==1, high-divisor only
+
+	movl	8(%ecx), %esi			C src second high limb
+	movl	%edx, VAR_INVERSE
+
+	movl	$32, %ebx			C 32-l
+	decl	%eax
+	jz	L(inverse_one_left)		C size==2, one divide
+
+	movd	%ebx, %mm7			C 32-l
+	decl	%eax
+	jz	L(inverse_two_left)		C size==3, two divides
+
+	jmp	L(inverse_top)			C size>=4
+
+
+L(done_edi):
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EBP, %ebp
+	movl	%edi, %eax
+
+	movl	SAVE_EDI, %edi
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+EPILOGUE()
+
+
+	ALIGN(32)
+PROLOGUE(mpn_mod_1c)
+deflit(`FRAME',0)
+	movl	PARAM_CARRY, %edx
+	movl	PARAM_SIZE, %ecx
+	subl	$STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_DIVISOR, %ebp
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+
+	ALIGN(32)
+PROLOGUE(mpn_mod_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	$0, %edx		C initial carry (if can't skip a div)
+	subl	$STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_DIVISOR, %ebp
+
+	orl	%ecx, %ecx
+	jz	L(divide_done)
+
+	movl	-4(%esi,%ecx,4), %eax	C src high limb
+
+	cmpl	%ebp, %eax		C carry flag if high<divisor
+
+	cmovc(	%eax, %edx)		C src high limb as initial carry
+	sbbl	$0, %ecx		C size-1 to skip one div
+	jz	L(divide_done)
+
+
+	ALIGN(16)
+L(start_1c):
+	C eax
+	C ebx
+	C ecx	size
+	C edx	carry
+	C esi	src
+	C edi
+	C ebp	divisor
+
+	cmpl	$MUL_THRESHOLD, %ecx
+	jae	L(mul_by_inverse)
+
+
+
+C With a MUL_THRESHOLD of 4, this "loop" only ever does 1 to 3 iterations,
+C but it's already fast and compact, and there's nothing to gain by
+C expanding it out.
+C
+C Using PARAM_DIVISOR in the divl is a couple of cycles faster than %ebp.
+
+	orl	%ecx, %ecx
+	jz	L(divide_done)
+
+
+L(divide_top):
+	C eax	scratch (quotient)
+	C ebx
+	C ecx	counter, limbs, decrementing
+	C edx	scratch (remainder)
+	C esi	src
+	C edi
+	C ebp
+
+	movl	-4(%esi,%ecx,4), %eax
+
+	divl	PARAM_DIVISOR
+
+	decl	%ecx
+	jnz	L(divide_top)
+
+
+L(divide_done):
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EBP, %ebp
+	addl	$STACK_SPACE, %esp
+
+	movl	%edx, %eax
+
+	ret
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+	C eax
+	C ebx
+	C ecx	size
+	C edx	carry
+	C esi	src
+	C edi
+	C ebp	divisor
+
+	bsrl	%ebp, %eax		C 31-l
+
+	movl	%ebx, SAVE_EBX
+	movl	%ecx, %ebx		C size
+
+	movl	%edi, SAVE_EDI
+	movl	$31, %ecx
+
+	movl	%edx, %edi		C carry
+	movl	$-1, %edx
+
+	C
+
+	xorl	%eax, %ecx		C l
+	incl	%eax			C 32-l
+
+	shll	%cl, %ebp		C d normalized
+	movl	%ecx, VAR_NORM
+
+	movd	%eax, %mm7		C 32-l
+
+	movl	$-1, %eax
+	subl	%ebp, %edx		C (b-d)-1 so  edx:eax = b*(b-d)-1
+
+	divl	%ebp			C floor (b*(b-d)-1) / d
+
+	C
+
+	movl	%eax, VAR_INVERSE
+	leal	-12(%esi,%ebx,4), %eax	C &src[size-3]
+
+	movl	8(%eax), %esi		C src high limb
+	movl	4(%eax), %edx		C src second highest limb
+
+	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l
+
+	shldl(	%cl, %edx, %esi)	C n10 = high,second << l
+
+	movl	%eax, %ecx		C &src[size-3]
+
+
+ifelse(MUL_THRESHOLD,2,`
+	cmpl	$2, %ebx
+	je	L(inverse_two_left)
+')
+
+
+C The dependent chain here is the same as in mpn_divrem_1, but a few
+C instructions are saved by not needing to store the quotient limbs.
+C Unfortunately this doesn't get the code down to the theoretical 16 c/l.
+C
+C There are four dummy instructions in the loop, all of which are necessary
+C for the claimed 17 c/l.  It's a 1 to 3 cycle slowdown if any are removed,
+C or changed from load to store or vice versa.  They're not completely
+C random, since they correspond to what mpn_divrem_1 has, but there's no
+C obvious reason why they're necessary.  Presumably they induce something
+C good in the out of order execution, perhaps through some load/store
+C ordering and/or decoding effects.
+C
+C The q1==0xFFFFFFFF case is handled here the same as in mpn_divrem_1.  On
+C special data that comes out as q1==0xFFFFFFFF always, the loop runs at
+C about 13.5 c/l.
+
+	ALIGN(32)
+L(inverse_top):
+	C eax	scratch
+	C ebx	scratch (nadj, q1)
+	C ecx	src pointer, decrementing
+	C edx	scratch
+	C esi	n10
+	C edi	n2
+	C ebp	divisor
+	C
+	C mm0	scratch (src qword)
+	C mm7	rshift for normalization
+
+	cmpl	$0x80000000, %esi  C n1 as 0=c, 1=nc
+	movl	%edi, %eax         C n2
+	movl	PARAM_SIZE, %ebx   C dummy
+
+	leal	(%ebp,%esi), %ebx
+	cmovc(	%esi, %ebx)	   C nadj = n10 + (-n1 & d), ignoring overflow
+	sbbl	$-1, %eax          C n2+n1
+
+	mull	VAR_INVERSE        C m*(n2+n1)
+
+	movq	(%ecx), %mm0       C next src limb and the one below it
+	subl	$4, %ecx
+
+	movl	%ecx, PARAM_SIZE   C dummy
+
+	C
+
+	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx      C n2+1
+	movl	%ebp, %eax	   C d
+
+	C
+
+	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+	jz	L(q1_ff)
+	nop			   C dummy
+
+	mull	%ebx		   C (q1+1)*d
+
+	psrlq	%mm7, %mm0
+	leal	(%ecx), %ecx	   C dummy
+
+	C
+
+	C
+
+	subl	%eax, %esi	   C low  n - (q1+1)*d
+	movl	PARAM_SRC, %eax
+
+	C
+
+	sbbl	%edx, %edi	   C high n - (q1+1)*d, 0 or -1
+	movl	%esi, %edi	   C remainder -> n2
+	leal	(%ebp,%esi), %edx
+
+	movd	%mm0, %esi
+
+	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
+	cmpl	%eax, %ecx
+	jae	L(inverse_top)
+
+
+L(inverse_loop_done):
+
+
+C -----------------------------------------------------------------------------
+
+L(inverse_two_left):
+	C eax	scratch
+	C ebx	scratch (nadj, q1)
+	C ecx	&src[-1]
+	C edx	scratch
+	C esi	n10
+	C edi	n2
+	C ebp	divisor
+	C
+	C mm0	scratch (src dword)
+	C mm7	rshift
+
+	cmpl	$0x80000000, %esi  C n1 as 0=c, 1=nc
+	movl	%edi, %eax         C n2
+
+	leal	(%ebp,%esi), %ebx
+	cmovc(	%esi, %ebx)	   C nadj = n10 + (-n1 & d), ignoring overflow
+	sbbl	$-1, %eax          C n2+n1
+
+	mull	VAR_INVERSE        C m*(n2+n1)
+
+	movd	4(%ecx), %mm0	   C src low limb
+
+	C
+
+	C
+
+	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx      C n2+1
+	movl	%ebp, %eax	   C d
+
+	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+	sbbl	$0, %ebx
+
+	mull	%ebx		   C (q1+1)*d
+
+	psllq	$32, %mm0
+
+	psrlq	%mm7, %mm0
+
+	C
+
+	subl	%eax, %esi
+
+	C
+
+	sbbl	%edx, %edi	   C n - (q1+1)*d
+	movl	%esi, %edi	   C remainder -> n2
+	leal	(%ebp,%esi), %edx
+
+	movd	%mm0, %esi
+
+	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
+
+
+L(inverse_one_left):
+	C eax	scratch
+	C ebx	scratch (nadj, q1)
+	C ecx
+	C edx	scratch
+	C esi	n10
+	C edi	n2
+	C ebp	divisor
+	C
+	C mm0	src limb, shifted
+	C mm7	rshift
+
+	cmpl	$0x80000000, %esi  C n1 as 0=c, 1=nc
+	movl	%edi, %eax         C n2
+
+	leal	(%ebp,%esi), %ebx
+	cmovc(	%esi, %ebx)	   C nadj = n10 + (-n1 & d), ignoring overflow
+	sbbl	$-1, %eax          C n2+n1
+
+	mull	VAR_INVERSE        C m*(n2+n1)
+
+	movl	VAR_NORM, %ecx     C for final denorm
+
+	C
+
+	C
+
+	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx      C n2+1
+	movl	%ebp, %eax	   C d
+
+	C
+
+	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+	sbbl	$0, %ebx
+
+	mull	%ebx		   C (q1+1)*d
+
+	movl	SAVE_EBX, %ebx
+
+	C
+
+	C
+
+	subl	%eax, %esi
+
+	movl	%esi, %eax	   C remainder
+	movl	SAVE_ESI, %esi
+
+	sbbl	%edx, %edi	   C n - (q1+1)*d
+	leal	(%ebp,%eax), %edx
+	movl	SAVE_EBP, %ebp
+
+	cmovc(	%edx, %eax)	   C n - q1*d if underflow from using q1+1
+	movl	SAVE_EDI, %edi
+
+	shrl	%cl, %eax	   C denorm remainder
+	addl	$STACK_SPACE, %esp
+	emms
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d
+
+L(q1_ff):
+	C eax	(divisor)
+	C ebx	(q1+1 == 0)
+	C ecx	src pointer
+	C edx
+	C esi	n10
+	C edi	(n2)
+	C ebp	divisor
+
+	movl	PARAM_SRC, %edx
+	leal	(%ebp,%esi), %edi	C n-q*d remainder -> next n2
+	psrlq	%mm7, %mm0
+
+	movd	%mm0, %esi		C next n10
+
+	cmpl	%edx, %ecx
+	jae	L(inverse_top)
+	jmp	L(inverse_loop_done)
+
+EPILOGUE()
diff --git a/gmp/mpn/x86/k7/mmx/popham.asm b/gmp/mpn/x86/k7/mmx/popham.asm
index 95965b74d4..5dc0a78c42 100644
--- a/gmp/mpn/x86/k7/mmx/popham.asm
+++ b/gmp/mpn/x86/k7/mmx/popham.asm
@@ -1,40 +1,29 @@
 dnl  AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
 dnl  distance.
 
-dnl  Copyright 2000-2002 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
+dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
 dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
+dnl  This file is part of the GNU MP Library.
 dnl
-dnl  or both in parallel, as here.
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
 dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
 dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
 
 C			     popcount	     hamdist
 C P3 generic			6.5		7
-C P3 model 9  (Banias)          5.7		6.1
+C P3 model 9  (Banias)          ?		?
 C P3 model 13 (Dothan)		5.75		6
 C K7				5		6
 
diff --git a/gmp/mpn/x86/k7/mmx/rshift.asm b/gmp/mpn/x86/k7/mmx/rshift.asm
index 345d23a25e..3566ce85d7 100644
--- a/gmp/mpn/x86/k7/mmx/rshift.asm
+++ b/gmp/mpn/x86/k7/mmx/rshift.asm
@@ -1,32 +1,21 @@
 dnl  AMD K7 mpn_rshift -- mpn right shift.
 
-dnl  Copyright 1999-2002 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
+dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
 dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
+dnl  This file is part of the GNU MP Library.
 dnl
-dnl  or both in parallel, as here.
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
 dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
 dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
diff --git a/gmp/mpn/x86/k7/mod_1_1.asm b/gmp/mpn/x86/k7/mod_1_1.asm
deleted file mode 100644
index 1bbe6f92d7..0000000000
--- a/gmp/mpn/x86/k7/mod_1_1.asm
+++ /dev/null
@@ -1,221 +0,0 @@
-dnl  x86-32 mpn_mod_1_1p, requiring cmov.
-
-dnl  Contributed to the GNU project by Niels Möller and Torbjorn Granlund.
-
-dnl  Copyright 2010, 2011 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C			    cycles/limb
-C P5				 ?
-C P6 model 0-8,10-12		 ?
-C P6 model 9  (Banias)		 ?
-C P6 model 13 (Dothan)		 ?
-C P4 model 0  (Willamette)	 ?
-C P4 model 1  (?)		 ?
-C P4 model 2  (Northwood)	 ?
-C P4 model 3  (Prescott)	 ?
-C P4 model 4  (Nocona)		 ?
-C AMD K6			 ?
-C AMD K7			 7
-C AMD K8			 ?
-
-define(`B2mb', `%ebx')
-define(`r0', `%esi')
-define(`r2', `%ebp')
-define(`t0', `%edi')
-define(`ap', `%ecx')  C Also shift count
-
-C Stack frame
-C	pre	36(%esp)
-C	b	32(%esp)
-C	n	28(%esp)
-C	ap	24(%esp)
-C	return	20(%esp)
-C	%ebp	16(%esp)
-C	%edi	12(%esp)
-C	%esi	8(%esp)
-C	%ebx	4(%esp)
-C	B2mod	(%esp)
-
-define(`B2modb', `(%esp)')
-define(`n', `28(%esp)')
-define(`b', `32(%esp)')
-define(`pre', `36(%esp)')
-
-C mp_limb_t
-C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t pre[4])
-C
-C The pre array contains bi, cnt, B1modb, B2modb
-C Note: This implementation needs B1modb only when cnt > 0
-
-ASM_START()
-	TEXT
-	ALIGN(8)
-PROLOGUE(mpn_mod_1_1p)
-	push	%ebp
-	push	%edi
-	push	%esi
-	push	%ebx
-	mov	32(%esp), %ebp		C pre[]
-
-	mov	12(%ebp), %eax		C B2modb
-	push	%eax			C Put it on stack
-
-	mov	n, %edx
-	mov	24(%esp), ap
-
-	lea	(ap, %edx, 4), ap
-	mov	-4(ap), %eax
-	cmp	$3, %edx
-	jnc	L(first)
-	mov	-8(ap), r0
-	jmp	L(reduce_two)
-
-L(first):
-	C First iteration, no r2
-	mull	B2modb
-	mov	-12(ap), r0
-	add	%eax, r0
-	mov	-8(ap), %eax
-	adc	%edx, %eax
-	sbb	r2, r2
-	subl	$3, n
-	lea	-16(ap), ap
-	jz	L(reduce_three)
-
-	mov	B2modb, B2mb
-	sub	b, B2mb
-	lea	(B2mb, r0), t0
-	jmp	L(mid)
-
-	ALIGN(16)
-L(top): C Loopmixed to 7 c/l on k7
-	add	%eax, r0
-	lea	(B2mb, r0), t0
-	mov	r2, %eax
-	adc	%edx, %eax
-	sbb	r2, r2
-L(mid):	mull	B2modb
-	and	B2modb, r2
-	add	r0, r2
-	decl	n
-	mov	(ap), r0
-	cmovc(	t0, r2)
-	lea	-4(ap), ap
-	jnz	L(top)
-
-	add	%eax, r0
-	mov	r2, %eax
-	adc	%edx, %eax
-	sbb	r2, r2
-
-L(reduce_three):
-	C Eliminate r2
-	and	b, r2
-	sub	r2, %eax
-
-L(reduce_two):
-	mov	pre, %ebp
-	movb	4(%ebp), %cl
-	test	%cl, %cl
-	jz	L(normalized)
-
-	C Unnormalized, use B1modb to reduce to size < B b
-	mull	8(%ebp)
-	xor	t0, t0
-	add	%eax, r0
-	adc	%edx, t0
-	mov	t0, %eax
-
-	C Left-shift to normalize
-	shld	%cl, r0, %eax C Always use shld?
-
-	shl	%cl, r0
-	jmp	L(udiv)
-
-L(normalized):
-	mov	%eax, t0
-	sub	b, t0
-	cmovnc(	t0, %eax)
-
-L(udiv):
-	lea	1(%eax), t0
-	mull	(%ebp)
-	mov	b, %ebx		C Needed in register for lea
-	add	r0, %eax
-	adc	t0, %edx
-	imul	%ebx, %edx
-	sub	%edx, r0
-	cmp	r0, %eax
-	lea	(%ebx, r0), %eax
-	cmovnc(	r0, %eax)
-	cmp	%ebx, %eax
-	jnc	L(fix)
-L(ok):	shr	%cl, %eax
-
-	add	$4, %esp
-	pop	%ebx
-	pop	%esi
-	pop	%edi
-	pop	%ebp
-
-	ret
-L(fix):	sub	%ebx, %eax
-	jmp	L(ok)
-EPILOGUE()
-
-PROLOGUE(mpn_mod_1_1p_cps)
-	push	%ebp
-	mov	12(%esp), %ebp
-	push	%esi
-	bsr	%ebp, %ecx
-	push	%ebx
-	xor	$31, %ecx
-	mov	16(%esp), %esi
-	sal	%cl, %ebp
-	mov	%ebp, %edx
-	not	%edx
-	mov	$-1, %eax
-	div	%ebp			C On K7, invert_limb would be a few cycles faster.
-	mov	%eax, (%esi)		C store bi
-	mov	%ecx, 4(%esi)		C store cnt
-	neg	%ebp
-	mov	$1, %edx
-	shld	%cl, %eax, %edx
-	imul	%ebp, %edx
-	shr	%cl, %edx
-	imul	%ebp, %eax
-	mov	%edx, 8(%esi)		C store B1modb
-	mov	%eax, 12(%esi)		C store B2modb
-	pop	%ebx
-	pop	%esi
-	pop	%ebp
-	ret
-EPILOGUE()
diff --git a/gmp/mpn/x86/k7/mod_1_4.asm b/gmp/mpn/x86/k7/mod_1_4.asm
deleted file mode 100644
index bb7597edd2..0000000000
--- a/gmp/mpn/x86/k7/mod_1_4.asm
+++ /dev/null
@@ -1,260 +0,0 @@
-dnl  x86-32 mpn_mod_1s_4p, requiring cmov.
-
-dnl  Contributed to the GNU project by Torbjorn Granlund.
-
-dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C			    cycles/limb
-C P5				 ?
-C P6 model 0-8,10-12		 ?
-C P6 model 9  (Banias)		 ?
-C P6 model 13 (Dothan)		 6
-C P4 model 0  (Willamette)	 ?
-C P4 model 1  (?)		 ?
-C P4 model 2  (Northwood)	15.5
-C P4 model 3  (Prescott)	 ?
-C P4 model 4  (Nocona)		 ?
-C AMD K6			 ?
-C AMD K7			 4.75
-C AMD K8			 ?
-
-ASM_START()
-	TEXT
-	ALIGN(16)
-PROLOGUE(mpn_mod_1s_4p)
-	push	%ebp
-	push	%edi
-	push	%esi
-	push	%ebx
-	sub	$28, %esp
-	mov	60(%esp), %edi		C cps[]
-	mov	8(%edi), %eax
-	mov	12(%edi), %edx
-	mov	16(%edi), %ecx
-	mov	20(%edi), %esi
-	mov	24(%edi), %edi
-	mov	%eax, 4(%esp)
-	mov	%edx, 8(%esp)
-	mov	%ecx, 12(%esp)
-	mov	%esi, 16(%esp)
-	mov	%edi, 20(%esp)
-	mov	52(%esp), %eax		C n
-	xor	%edi, %edi
-	mov	48(%esp), %esi		C up
-	lea	-12(%esi,%eax,4), %esi
-	and	$3, %eax
-	je	L(b0)
-	cmp	$2, %eax
-	jc	L(b1)
-	je	L(b2)
-
-L(b3):	mov	4(%esi), %eax
-	mull	4(%esp)
-	mov	(%esi), %ebp
-	add	%eax, %ebp
-	adc	%edx, %edi
-	mov	8(%esi), %eax
-	mull	8(%esp)
-	lea	-12(%esi), %esi
-	jmp	L(m0)
-
-L(b0):	mov	(%esi), %eax
-	mull	4(%esp)
-	mov	-4(%esi), %ebp
-	add	%eax, %ebp
-	adc	%edx, %edi
-	mov	4(%esi), %eax
-	mull	8(%esp)
-	add	%eax, %ebp
-	adc	%edx, %edi
-	mov	8(%esi), %eax
-	mull	12(%esp)
-	lea	-16(%esi), %esi
-	jmp	L(m0)
-
-L(b1):	mov	8(%esi), %ebp
-	lea	-4(%esi), %esi
-	jmp	L(m1)
-
-L(b2):	mov	8(%esi), %edi
-	mov	4(%esi), %ebp
-	lea	-8(%esi), %esi
-	jmp	L(m1)
-
-	ALIGN(16)
-L(top):	mov	(%esi), %eax
-	mull	4(%esp)
-	mov	-4(%esi), %ebx
-	xor	%ecx, %ecx
-	add	%eax, %ebx
-	adc	%edx, %ecx
-	mov	4(%esi), %eax
-	mull	8(%esp)
-	add	%eax, %ebx
-	adc	%edx, %ecx
-	mov	8(%esi), %eax
-	mull	12(%esp)
-	add	%eax, %ebx
-	adc	%edx, %ecx
-	lea	-16(%esi), %esi
-	mov	16(%esp), %eax
-	mul	%ebp
-	add	%eax, %ebx
-	adc	%edx, %ecx
-	mov	20(%esp), %eax
-	mul	%edi
-	mov	%ebx, %ebp
-	mov	%ecx, %edi
-L(m0):	add	%eax, %ebp
-	adc	%edx, %edi
-L(m1):	subl	$4, 52(%esp)
-	ja	L(top)
-
-L(end):	mov	4(%esp), %eax
-	mul	%edi
-	mov	60(%esp), %edi
-	add	%eax, %ebp
-	adc	$0, %edx
-	mov	4(%edi), %ecx
-	mov	%edx, %esi
-	mov	%ebp, %eax
-	sal	%cl, %esi
-	mov	%ecx, %ebx
-	neg	%ecx
-	shr	%cl, %eax
-	or	%esi, %eax
-	lea	1(%eax), %esi
-	mull	(%edi)
-	mov	%ebx, %ecx
-	mov	%eax, %ebx
-	mov	%ebp, %eax
-	mov	56(%esp), %ebp
-	sal	%cl, %eax
-	add	%eax, %ebx
-	adc	%esi, %edx
-	imul	%ebp, %edx
-	sub	%edx, %eax
-	lea	(%eax,%ebp), %edx
-	cmp	%eax, %ebx
-	cmovc(	%edx, %eax)
-	mov	%eax, %edx
-	sub	%ebp, %eax
-	cmovc(	%edx, %eax)
-	add	$28, %esp
-	pop	%ebx
-	pop	%esi
-	pop	%edi
-	pop	%ebp
-	shr	%cl, %eax
-	ret
-EPILOGUE()
-
-	ALIGN(16)
-PROLOGUE(mpn_mod_1s_4p_cps)
-C CAUTION: This is the same code as in pentium4/sse2/mod_1_4.asm
-	push	%ebp
-	push	%edi
-	push	%esi
-	push	%ebx
-	mov	20(%esp), %ebp		C FIXME: avoid bp for 0-idx
-	mov	24(%esp), %ebx
-	bsr	%ebx, %ecx
-	xor	$31, %ecx
-	sal	%cl, %ebx		C b << cnt
-	mov	%ebx, %edx
-	not	%edx
-	mov	$-1, %eax
-	div	%ebx
-	xor	%edi, %edi
-	sub	%ebx, %edi
-	mov	$1, %esi
-	mov	%eax, (%ebp)		C store bi
-	mov	%ecx, 4(%ebp)		C store cnt
-	shld	%cl, %eax, %esi
-	imul	%edi, %esi
-	mov	%eax, %edi
-	mul	%esi
-
-	add	%esi, %edx
-	shr	%cl, %esi
-	mov	%esi, 8(%ebp)		C store B1modb
-
-	not	%edx
-	imul	%ebx, %edx
-	lea	(%edx,%ebx), %esi
-	cmp	%edx, %eax
-	cmovnc(	%edx, %esi)
-	mov	%edi, %eax
-	mul	%esi
-
-	add	%esi, %edx
-	shr	%cl, %esi
-	mov	%esi, 12(%ebp)		C store B2modb
-
-	not	%edx
-	imul	%ebx, %edx
-	lea	(%edx,%ebx), %esi
-	cmp	%edx, %eax
-	cmovnc(	%edx, %esi)
-	mov	%edi, %eax
-	mul	%esi
-
-	add	%esi, %edx
-	shr	%cl, %esi
-	mov	%esi, 16(%ebp)		C store B3modb
-
-	not	%edx
-	imul	%ebx, %edx
-	lea	(%edx,%ebx), %esi
-	cmp	%edx, %eax
-	cmovnc(	%edx, %esi)
-	mov	%edi, %eax
-	mul	%esi
-
-	add	%esi, %edx
-	shr	%cl, %esi
-	mov	%esi, 20(%ebp)		C store B4modb
-
-	not	%edx
-	imul	%ebx, %edx
-	add	%edx, %ebx
-	cmp	%edx, %eax
-	cmovnc(	%edx, %ebx)
-
-	shr	%cl, %ebx
-	mov	%ebx, 24(%ebp)		C store B5modb
-
-	pop	%ebx
-	pop	%esi
-	pop	%edi
-	pop	%ebp
-	ret
-EPILOGUE()
diff --git a/gmp/mpn/x86/k7/mod_34lsub1.asm b/gmp/mpn/x86/k7/mod_34lsub1.asm
index ee3ad04099..f00e84dc42 100644
--- a/gmp/mpn/x86/k7/mod_34lsub1.asm
+++ b/gmp/mpn/x86/k7/mod_34lsub1.asm
@@ -1,32 +1,22 @@
 dnl  AMD K7 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
 
-dnl  Copyright 2000-2002, 2004, 2005, 2008 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
+dnl  Copyright 2000, 2001, 2002, 2004, 2005, 2008 Free Software Foundation,
+dnl  Inc.
 dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
+dnl  This file is part of the GNU MP Library.
 dnl
-dnl  or both in parallel, as here.
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
 dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
 dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
diff --git a/gmp/mpn/x86/k7/mode1o.asm b/gmp/mpn/x86/k7/mode1o.asm
index 6472ec5949..ef858049a6 100644
--- a/gmp/mpn/x86/k7/mode1o.asm
+++ b/gmp/mpn/x86/k7/mode1o.asm
@@ -1,32 +1,21 @@
 dnl  AMD K7 mpn_modexact_1_odd -- exact division style remainder.
 
-dnl  Copyright 2000-2002, 2004, 2007 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
+dnl  Copyright 2000, 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
 dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
+dnl  This file is part of the GNU MP Library.
 dnl
-dnl  or both in parallel, as here.
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
 dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
 dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
@@ -122,7 +111,7 @@ ifdef(`PIC',`
 
 	subl	%eax, %edi		C inv = 2*inv - inv*inv*d
 
-	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
+	ASSERT(e,`	C d*inv == 1 mod 2^BITS_PER_MP_LIMB
 	movl	%esi, %eax
 	imull	%edi, %eax
 	cmpl	$1, %eax')
diff --git a/gmp/mpn/x86/k7/mul_1.asm b/gmp/mpn/x86/k7/mul_1.asm
index 755cd2ed50..016262d594 100644
--- a/gmp/mpn/x86/k7/mul_1.asm
+++ b/gmp/mpn/x86/k7/mul_1.asm
@@ -1,38 +1,28 @@
 dnl  AMD K7 mpn_mul_1.
 
-dnl  Copyright 1999-2002, 2005, 2008 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
+dnl  Copyright 1999, 2000, 2001, 2002, 2005, 2008 Free Software Foundation,
+dnl  Inc.
 dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
+dnl  This file is part of the GNU MP Library.
 dnl
-dnl  or both in parallel, as here.
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
 dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
 dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
 
-C			    cycles/limb
-C P5
+C                           cycles/limb
+C P5:
 C P6 model 0-8,10-12
 C P6 model 9  (Banias)
 C P6 model 13 (Dothan)
@@ -41,9 +31,9 @@ C P4 model 1  (?)
 C P4 model 2  (Northwood)
 C P4 model 3  (Prescott)
 C P4 model 4  (Nocona)
-C AMD K6
-C AMD K7			 3.25
-C AMD K8
+C K6:
+C K7:                            3.25
+C K8:
 
 C TODO
 C  * Improve feed-in and wind-down code.  We beat the old code for all n != 1,
diff --git a/gmp/mpn/x86/k7/mul_basecase.asm b/gmp/mpn/x86/k7/mul_basecase.asm
index 4dfb500885..7f4c0002f7 100644
--- a/gmp/mpn/x86/k7/mul_basecase.asm
+++ b/gmp/mpn/x86/k7/mul_basecase.asm
@@ -1,32 +1,21 @@
 dnl  AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
 
-dnl  Copyright 1999-2002 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
+dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
 dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
+dnl  This file is part of the GNU MP Library.
 dnl
-dnl  or both in parallel, as here.
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
 dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
 dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
diff --git a/gmp/mpn/x86/k7/sqr_basecase.asm b/gmp/mpn/x86/k7/sqr_basecase.asm
index 7b6a97e0df..408a13dc9b 100644
--- a/gmp/mpn/x86/k7/sqr_basecase.asm
+++ b/gmp/mpn/x86/k7/sqr_basecase.asm
@@ -1,32 +1,21 @@
 dnl  AMD K7 mpn_sqr_basecase -- square an mpn number.
 
-dnl  Copyright 1999-2002 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
+dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
 dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
+dnl  This file is part of the GNU MP Library.
 dnl
-dnl  or both in parallel, as here.
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
 dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
 dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
@@ -39,18 +28,18 @@ C     roughly the Karatsuba recursing range).
 dnl  These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for
 dnl  some comments.
 
-deflit(SQR_TOOM2_THRESHOLD_MAX, 66)
+deflit(SQR_KARATSUBA_THRESHOLD_MAX, 66)
 
-ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
-`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
+ifdef(`SQR_KARATSUBA_THRESHOLD_OVERRIDE',
+`define(`SQR_KARATSUBA_THRESHOLD',SQR_KARATSUBA_THRESHOLD_OVERRIDE)')
 
-m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
-deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
+m4_config_gmp_mparam(`SQR_KARATSUBA_THRESHOLD')
+deflit(UNROLL_COUNT, eval(SQR_KARATSUBA_THRESHOLD-3))
 
 
 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
 C
-C With a SQR_TOOM2_THRESHOLD around 50 this code is about 1500 bytes,
+C With a SQR_KARATSUBA_THRESHOLD around 50 this code is about 1500 bytes,
 C which is quite a bit, but is considered good value since squares big
 C enough to use most of the code will be spending quite a few cycles in it.
 
diff --git a/gmp/mpn/x86/k7/sublsh1_n.asm b/gmp/mpn/x86/k7/sublsh1_n.asm
deleted file mode 100644
index 523b01218d..0000000000
--- a/gmp/mpn/x86/k7/sublsh1_n.asm
+++ /dev/null
@@ -1,173 +0,0 @@
-dnl  AMD K7 mpn_sublsh1_n_ip1 -- rp[] = rp[] - (up[] << 1)
-
-dnl  Copyright 2011 Free Software Foundation, Inc.
-
-dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C This is an attempt at a sublsh1_n for x86-32, not relying on sse2 insns.  The
-C innerloop is 2*4-way unrolled, which is best we can do with the available
-C registers.  It seems tricky to use the same structure for rsblsh1_n, since we
-C cannot feed carry between operations there.
-
-C			    cycles/limb
-C P5
-C P6 model 0-8,10-12
-C P6 model 9  (Banias)
-C P6 model 13 (Dothan)
-C P4 model 0  (Willamette)
-C P4 model 1  (?)
-C P4 model 2  (Northwood)
-C P4 model 3  (Prescott)
-C P4 model 4  (Nocona)
-C Intel Atom			 6.75
-C AMD K6
-C AMD K7
-C AMD K8
-
-C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32
-C processors.  It uses 2*4-way unrolling, for good reasons.
-C
-C Breaking carry recurrency might be a good idea.  We would then need separate
-C registers for the shift carry and add/subtract carry, which in turn would
-C force us to 2*2-way unrolling.
-
-defframe(PARAM_SIZE,	12)
-defframe(PARAM_SRC,	 8)
-defframe(PARAM_DST,	 4)
-
-dnl  re-use parameter space
-define(VAR_COUNT,`PARAM_SIZE')
-define(SAVE_EBX,`PARAM_SRC')
-define(SAVE_EBP,`PARAM_DST')
-
-ASM_START()
-	TEXT
-	ALIGN(8)
-PROLOGUE(mpn_sublsh1_n_ip1)
-deflit(`FRAME',0)
-
-define(`rp',  `%edi')
-define(`up',  `%esi')
-
-	mov	PARAM_SIZE, %eax	C size
-	push	up			FRAME_pushl()
-	push	rp			FRAME_pushl()
-	xor	%edx, %edx		C edx keeps the saved carry bits between steps, none yet
-	mov	PARAM_SRC, up		C up = source limb pointer
-	mov	PARAM_DST, rp		C rp = destination limb pointer, updated in place
-	mov	%ebx, SAVE_EBX		C SAVE_EBX reuses the PARAM_SRC slot, which is already loaded
-	mov	%eax, %ebx
-	shr	$3, %eax		C eax = size / 8, number of full 8-limb groups
-
-	not	%eax			C count = -(size\8)-i
-	and	$7, %ebx		C size % 8
-	jz	L(exact)		C no leftover limbs, skip the one-limb loop
-
-L(oop):
-ifdef(`CPU_P6',`
-	shr	%edx ')			C restore 2nd saved carry bit
-	mov	(up), %ecx
-	adc	%ecx, %ecx		C ecx = 2 * limb + incoming shift carry
-	rcr	%edx			C restore 1st saved carry bit
-	lea	4(up), up		C lea advances the pointer without touching flags
-	sbb	%ecx, (rp)		C rp[0] -= ecx + incoming borrow
-	lea	4(rp), rp
-	adc	%edx, %edx		C save a carry bit in edx
-ifdef(`CPU_P6',`
-	adc	%edx, %edx ')		C save another carry bit in edx
-	dec	%ebx			C one leftover limb done, dec leaves the carry flag alone
-	jnz	L(oop)
-L(exact):
-	inc	%eax			C eax = -(size / 8), zero when there is no full group
-	jz	L(end)
-	mov	%eax, VAR_COUNT		C VAR_COUNT reuses the PARAM_SIZE slot
-	mov	%ebp, SAVE_EBP		C SAVE_EBP reuses the PARAM_DST slot, ebp is a scratch limb below
-
-	ALIGN(16)
-L(top):
-ifdef(`CPU_P6',`
-	shr	%edx ')			C restore 2nd saved carry bit
-	mov	(up), %eax		C limbs 0-3: double each with the carry chained through adc
-	adc	%eax, %eax
-	mov	4(up), %ebx
-	adc	%ebx, %ebx
-	mov	8(up), %ecx
-	adc	%ecx, %ecx
-	mov	12(up), %ebp
-	adc	%ebp, %ebp
-
-	rcr	%edx			C restore 1st saved carry bit
-
-	sbb	%eax, (rp)		C subtract the doubled limbs 0-3 from rp with borrow
-	sbb	%ebx, 4(rp)
-	sbb	%ecx, 8(rp)
-	sbb	%ebp, 12(rp)
-
-	mov	16(up), %eax		C limbs 4-7: same doubling chain
-	adc	%eax, %eax
-	mov	20(up), %ebx
-	adc	%ebx, %ebx
-	mov	24(up), %ecx
-	adc	%ecx, %ecx
-	mov	28(up), %ebp
-	adc	%ebp, %ebp
-
-	lea	32(up), up
-	adc	%edx, %edx		C save a carry bit in edx
-
-	sbb	%eax, 16(rp)		C subtract the doubled limbs 4-7 from rp with borrow
-	sbb	%ebx, 20(rp)
-	sbb	%ecx, 24(rp)
-	sbb	%ebp, 28(rp)
-
-ifdef(`CPU_P6',`
-	adc	%edx, %edx ')		C save another carry bit in edx
-	incl	VAR_COUNT		C negative group count steps up toward zero
-	lea	32(rp), rp		C lea keeps the zero flag from incl for the jne
-	jne	L(top)
-
-	mov	SAVE_EBP, %ebp
-L(end):
-	mov	SAVE_EBX, %ebx
-
-ifdef(`CPU_P6',`
-	xor	%eax, %eax
-	shr	$1, %edx
-	adc	%edx, %eax
-',`
-	adc	$0, %edx
-	mov	%edx, %eax
-')
-	pop	rp			FRAME_popl()
-	pop	up			FRAME_popl()
-	ret				C result is whatever the block above left in eax
-EPILOGUE()
-ASM_END()
author	Pedro Alvarez <pedro.alvarez@codethink.co.uk>	2016-05-27 17:39:31 +0100
committer	Pedro Alvarez <pedro.alvarez@codethink.co.uk>	2016-05-27 17:53:32 +0100
commit	26c75cf8267919f81a1759c9c965a52c660233f9 (patch)
tree	cf2a39cf56c2c8ac45760854413ab233e6263974 /gmp/mpn/x86/k7
parent	56892c1d217baea02092b51a09bbc924130ca84c (diff)
download	gcc-tarball-26c75cf8267919f81a1759c9c965a52c660233f9.tar.gz