24 files changed, 330 insertions, 4679 deletions
diff --git a/gmp/mpn/x86_64/core2/aorrlsh1_n.asm b/gmp/mpn/x86_64/core2/aorrlsh1_n.asm
deleted file mode 100644
index 7066bb4372..0000000000
--- a/gmp/mpn/x86_64/core2/aorrlsh1_n.asm
+++ /dev/null
@@ -1,53 +0,0 @@
-dnl  AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
-dnl  AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
-
-dnl  Contributed to the GNU project by Torbjorn Granlund.
-
-dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-define(LSH, 1)
-define(RSH, 63)
-
-ifdef(`OPERATION_addlsh1_n', `
-	define(ADDSUB,	add)
-	define(ADCSBB,	adc)
-	define(func,	mpn_addlsh1_n)')
-ifdef(`OPERATION_rsblsh1_n', `
-	define(ADDSUB,	sub)
-	define(ADCSBB,	sbb)
-	define(func,	mpn_rsblsh1_n)')
-
-MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-include_mpn(`x86_64/aorrlshC_n.asm')
diff --git a/gmp/mpn/x86_64/core2/aorrlsh2_n.asm b/gmp/mpn/x86_64/core2/aorrlsh2_n.asm
deleted file mode 100644
index 5065120857..0000000000
--- a/gmp/mpn/x86_64/core2/aorrlsh2_n.asm
+++ /dev/null
@@ -1,53 +0,0 @@
-dnl  AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2)
-dnl  AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
-
-dnl  Contributed to the GNU project by Torbjorn Granlund.
-
-dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-define(LSH, 2)
-define(RSH, 62)
-
-ifdef(`OPERATION_addlsh2_n', `
-	define(ADDSUB,	add)
-	define(ADCSBB,	adc)
-	define(func,	mpn_addlsh2_n)')
-ifdef(`OPERATION_rsblsh2_n', `
-	define(ADDSUB,	sub)
-	define(ADCSBB,	sbb)
-	define(func,	mpn_rsblsh2_n)')
-
-MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-include_mpn(`x86_64/aorrlshC_n.asm')
diff --git a/gmp/mpn/x86_64/core2/aorrlsh_n.asm b/gmp/mpn/x86_64/core2/aorrlsh_n.asm
deleted file mode 100644
index 57abf31579..0000000000
--- a/gmp/mpn/x86_64/core2/aorrlsh_n.asm
+++ /dev/null
@@ -1,38 +0,0 @@
-dnl  AMD64 mpn_addlsh_n and mpn_rsblsh_n.  R = V2^k +- U.
-
-dnl  Copyright 2011, 2012 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-include_mpn(`x86_64/coreinhm/aorrlsh_n.asm')
diff --git a/gmp/mpn/x86_64/core2/aors_err1_n.asm b/gmp/mpn/x86_64/core2/aors_err1_n.asm
deleted file mode 100644
index 3f875aefa4..0000000000
--- a/gmp/mpn/x86_64/core2/aors_err1_n.asm
+++ /dev/null
@@ -1,225 +0,0 @@
-dnl  Core 2 mpn_add_err1_n, mpn_sub_err1_n
-
-dnl  Contributed by David Harvey.
-
-dnl  Copyright 2011 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C	     cycles/limb
-C AMD K8,K9	 ?
-C AMD K10	 ?
-C Intel P4	 ?
-C Intel core2	 4.14
-C Intel corei	 ?
-C Intel atom	 ?
-C VIA nano	 ?
-
-
-C INPUT PARAMETERS
-define(`rp',	`%rdi')
-define(`up',	`%rsi')
-define(`vp',	`%rdx')
-define(`ep',	`%rcx')
-define(`yp',	`%r8')
-define(`n',	`%r9')
-define(`cy_param',	`8(%rsp)')
-
-define(`el',	`%rbx')
-define(`eh',	`%rbp')
-define(`t0',	`%r10')
-define(`t1',	`%r11')
-define(`t2',	`%r12')
-define(`t3',	`%r13')
-define(`w0',	`%r14')
-define(`w1',	`%r15')
-
-ifdef(`OPERATION_add_err1_n', `
-	define(ADCSBB,	      adc)
-	define(func,	      mpn_add_err1_n)')
-ifdef(`OPERATION_sub_err1_n', `
-	define(ADCSBB,	      sbb)
-	define(func,	      mpn_sub_err1_n)')
-
-MULFUNC_PROLOGUE(mpn_add_err1_n mpn_sub_err1_n)
-
-
-ASM_START()
-	TEXT
-	ALIGN(16)
-PROLOGUE(func)
-	mov	cy_param, %rax
-
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-
-	lea	(up,n,8), up
-	lea	(vp,n,8), vp
-	lea	(rp,n,8), rp
-
-	mov	R32(n), R32(%r10)
-	and	$3, R32(%r10)
-	jz	L(0mod4)
-	cmp	$2, R32(%r10)
-	jc	L(1mod4)
-	jz	L(2mod4)
-L(3mod4):
-	xor	R32(el), R32(el)
-	xor	R32(eh), R32(eh)
-	xor	R32(t0), R32(t0)
-	xor	R32(t1), R32(t1)
-	lea	-24(yp,n,8), yp
-	neg	n
-
-	shr	$1, %al		   C restore carry
-	mov	(up,n,8), w0
-	mov	8(up,n,8), w1
-	ADCSBB	(vp,n,8), w0
-	mov	w0, (rp,n,8)
-	cmovc	16(yp), el
-	ADCSBB	8(vp,n,8), w1
-	mov	w1, 8(rp,n,8)
-	cmovc	8(yp), t0
-	mov	16(up,n,8), w0
-	ADCSBB	16(vp,n,8), w0
-	mov	w0, 16(rp,n,8)
-	cmovc	(yp), t1
-	setc	%al		   C save carry
-	add	t0, el
-	adc	$0, eh
-	add	t1, el
-	adc	$0, eh
-
-	add	$3, n
-	jnz	L(loop)
-	jmp	L(end)
-
-	ALIGN(16)
-L(0mod4):
-	xor	R32(el), R32(el)
-	xor	R32(eh), R32(eh)
-	lea	(yp,n,8), yp
-	neg	n
-	jmp	L(loop)
-
-	ALIGN(16)
-L(1mod4):
-	xor	R32(el), R32(el)
-	xor	R32(eh), R32(eh)
-	lea	-8(yp,n,8), yp
-	neg	n
-
-	shr	$1, %al		   C restore carry
-	mov	(up,n,8), w0
-	ADCSBB	(vp,n,8), w0
-	mov	w0, (rp,n,8)
-	cmovc	(yp), el
-	setc	%al		   C save carry
-
-	add	$1, n
-	jnz	L(loop)
-	jmp	L(end)
-
-	ALIGN(16)
-L(2mod4):
-	xor	R32(el), R32(el)
-	xor	R32(eh), R32(eh)
-	xor	R32(t0), R32(t0)
-	lea	-16(yp,n,8), yp
-	neg	n
-
-	shr	$1, %al		   C restore carry
-	mov	(up,n,8), w0
-	mov	8(up,n,8), w1
-	ADCSBB	(vp,n,8), w0
-	mov	w0, (rp,n,8)
-	cmovc	8(yp), el
-	ADCSBB	8(vp,n,8), w1
-	mov	w1, 8(rp,n,8)
-	cmovc	(yp), t0
-	setc	%al		   C save carry
-	add	t0, el
-	adc	$0, eh
-
-	add	$2, n
-	jnz	L(loop)
-	jmp	L(end)
-
-	ALIGN(32)
-L(loop):
-	mov	(up,n,8), w0
-	shr	$1, %al		   C restore carry
-	mov	-8(yp), t0
-	mov	$0, R32(t3)
-	ADCSBB	(vp,n,8), w0
-	cmovnc	t3, t0
-	mov	w0, (rp,n,8)
-	mov	8(up,n,8), w1
-	mov	16(up,n,8), w0
-	ADCSBB	8(vp,n,8), w1
-	mov	-16(yp), t1
-	cmovnc	t3, t1
-	mov	-24(yp), t2
-	mov	w1, 8(rp,n,8)
-	ADCSBB	16(vp,n,8), w0
-	cmovnc	t3, t2
-	mov	24(up,n,8), w1
-	ADCSBB	24(vp,n,8), w1
-	cmovc	-32(yp), t3
-	setc	%al		   C save carry
-	add	t0, el
-	adc	$0, eh
-	add	t1, el
-	adc	$0, eh
-	add	t2, el
-	adc	$0, eh
-	lea	-32(yp), yp
-	mov	w0, 16(rp,n,8)
-	add	t3, el
-	adc	$0, eh
-	add	$4, n
-	mov	w1, -8(rp,n,8)
-	jnz	L(loop)
-
-L(end):
-	mov	el, (ep)
-	mov	eh, 8(ep)
-
-	pop	%r15
-	pop	%r14
-	pop	%r13
-	pop	%r12
-	pop	%rbp
-	pop	%rbx
-	ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/aors_n.asm b/gmp/mpn/x86_64/core2/aors_n.asm
index 74a1bce48a..d26af866f9 100644
--- a/gmp/mpn/x86_64/core2/aors_n.asm
+++ b/gmp/mpn/x86_64/core2/aors_n.asm
@@ -1,45 +1,30 @@
-dnl  Intel mpn_add_n/mpn_sub_n optimised for Conroe, Nehalem.
+dnl  Intel P6-15 mpn_add_n/mpn_sub_n -- mpn add or subtract.
 
-dnl  Copyright 2006, 2007, 2011-2013 Free Software Foundation, Inc.
+dnl  Copyright 2006, 2007 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
-dnl
+
 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
 
 C	     cycles/limb
-C AMD K8,K9	 2
-C AMD K10	 2
-C Intel P4	10
-C Intel core2	 2
-C Intel NHM	 2
-C Intel SBR	 2
-C Intel atom	 9
-C VIA nano	 3
+C K8,K9:	 2.25
+C K10:		 2
+C P4:		10
+C P6-15:	 2.05
 
 C INPUT PARAMETERS
 define(`rp',	`%rdi')
@@ -59,83 +44,80 @@ ifdef(`OPERATION_sub_n', `
 
 MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
 
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
 ASM_START()
+
 	TEXT
 	ALIGN(16)
+
+PROLOGUE(func_nc)		C entry point that takes a carry-in in r8
+	jmp	L(start)	C join func after its xor, so r8 is used as passed in
+EPILOGUE()
+
 PROLOGUE(func)
-	FUNC_ENTRY(4)
 	xor	%r8, %r8
 L(start):
 	mov	(up), %r10
 	mov	(vp), %r11
 
-	lea	(up,n,8), up
-	lea	(vp,n,8), vp
-	lea	(rp,n,8), rp
-	mov	R32(n), R32(%rax)
+	lea	-8(up,n,8), up		C up = &up[n-1]
+	lea	-8(vp,n,8), vp		C vp = &vp[n-1]
+	lea	-16(rp,n,8), rp		C rp = &rp[n-2]
+	mov	%ecx, %eax		C eax = low 32 bits of n (rcx), taken before the neg
 	neg	n
-	and	$3, R32(%rax)
+	and	$3, %eax		C eax = n mod 4, selects the entry point below
 	je	L(b00)
-	add	%rax, n			C clear low rcx bits for jrcxz
-	cmp	$2, R32(%rax)
+	add	%rax, n		C clear low rcx bits for jrcxz
+	cmp	$2, %eax		C n mod 4: 1 -> b01, 2 -> b10, 3 -> b11
 	jl	L(b01)
 	je	L(b10)
 
-L(b11):	neg	%r8			C set cy
+L(b11):	shr	%r8			C set cy: CF = bit 0 of r8
 	jmp	L(e11)
 
-L(b00):	neg	%r8			C set cy
+L(b00):	shr	%r8			C set cy: CF = bit 0 of r8
 	mov	%r10, %r8
 	mov	%r11, %r9
 	lea	4(n), n
 	jmp	L(e00)
 
-	nop
-	nop
-	nop
-L(b01):	neg	%r8			C set cy
-	jmp	L(top)
+L(b01):	shr	%r8			C set cy: CF = bit 0 of r8
+	jmp	L(e01)			C limb 0 is already in r10/r11; go straight to the exit test
 
-L(b10):	neg	%r8			C set cy
+L(b10):	shr	%r8			C set cy: CF = bit 0 of r8
 	mov	%r10, %r8
 	mov	%r11, %r9
 	jmp	L(e10)
 
 L(end):	ADCSBB	%r11, %r10
-	mov	%r10, -8(rp)
-	mov	R32(%rcx), R32(%rax)	C clear eax, ecx contains 0
-	adc	R32(%rax), R32(%rax)
-	FUNC_EXIT()
+	mov	%r10, 8(rp)		C rp was biased by -16 at entry, so this is rp[n-1]
+	mov	%ecx, %eax		C clear eax, ecx contains 0
+	adc	%eax, %eax		C eax = CF left by the ADCSBB at L(end)
 	ret
 
 	ALIGN(16)
-L(top):	jrcxz	L(end)
-	mov	(up,n,8), %r8
-	mov	(vp,n,8), %r9
-	lea	4(n), n
-	ADCSBB	%r11, %r10
-	mov	%r10, -40(rp,n,8)
-L(e00):	mov	-24(up,n,8), %r10
-	mov	-24(vp,n,8), %r11
-	ADCSBB	%r9, %r8
-	mov	%r8, -32(rp,n,8)
-L(e11):	mov	-16(up,n,8), %r8
-	mov	-16(vp,n,8), %r9
+L(top):				C 4-way unrolled loop indexed by the negated counter n
+	mov	-24(up,n,8), %r8
+	mov	-24(vp,n,8), %r9
 	ADCSBB	%r11, %r10
 	mov	%r10, -24(rp,n,8)
-L(e10):	mov	-8(up,n,8), %r10
-	mov	-8(vp,n,8), %r11
+L(e00):
+	mov	-16(up,n,8), %r10
+	mov	-16(vp,n,8), %r11
 	ADCSBB	%r9, %r8
 	mov	%r8, -16(rp,n,8)
+L(e11):
+	mov	-8(up,n,8), %r8
+	mov	-8(vp,n,8), %r9
+	ADCSBB	%r11, %r10
+	mov	%r10, -8(rp,n,8)
+L(e10):
+	mov	(up,n,8), %r10
+	mov	(vp,n,8), %r11
+	ADCSBB	%r9, %r8
+	mov	%r8, (rp,n,8)
+L(e01):
+	jrcxz	L(end)			C n == 0: only the pair in r10/r11 remains
+	lea	4(n), n			C lea leaves CF intact for the next ADCSBB
 	jmp	L(top)
-EPILOGUE()
 
-PROLOGUE(func_nc)
-	FUNC_ENTRY(4)
-IFDOS(`	mov	56(%rsp), %r8	')
-	jmp	L(start)
 EPILOGUE()
-
diff --git a/gmp/mpn/x86_64/core2/sublshC_n.asm b/gmp/mpn/x86_64/core2/aorslsh1_n.asm
index 5acc46b032..18db7c96f8 100644
--- a/gmp/mpn/x86_64/core2/sublshC_n.asm
+++ b/gmp/mpn/x86_64/core2/aorslsh1_n.asm
@@ -1,45 +1,29 @@
-dnl  AMD64 mpn_sublshC_n -- rp[] = up[] - (vp[] << 1), optimised for Core 2 and
-dnl  Core iN.
+dnl  x86-64 mpn_addlsh1_n and mpn_sublsh1_n, optimized for "Core 2".
 
-dnl  Contributed to the GNU project by Torbjorn Granlund.
-
-dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+dnl  Copyright 2008 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
-dnl
+
 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
 
 C	     cycles/limb
-C AMD K8,K9	 4.25
-C AMD K10	 ?
-C Intel P4	 ?
-C Intel core2	 3
-C Intel NHM	 3.1
-C Intel SBR	 2.47
-C Intel atom	 ?
-C VIA nano	 ?
+C K8,K9:	 4.25
+C K10:		 ?
+C P4:		 ?
+C P6-15:	 3
 
 C INPUT PARAMETERS
 define(`rp',`%rdi')
@@ -47,11 +31,21 @@ define(`up',`%rsi')
 define(`vp',`%rdx')
 define(`n', `%rcx')
 
+ifdef(`OPERATION_addlsh1_n', `
+	define(ADDSUB,	add)
+	define(ADCSBB,	adc)
+	define(func,	mpn_addlsh1_n)')
+ifdef(`OPERATION_sublsh1_n', `
+	define(ADDSUB,	sub)
+	define(ADCSBB,	sbb)
+	define(func,	mpn_sublsh1_n)')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
+
 ASM_START()
 	TEXT
 	ALIGN(8)
 PROLOGUE(func)
-	FUNC_ENTRY(4)
 	push	%rbx
 	push	%r12
 
@@ -64,7 +58,7 @@ PROLOGUE(func)
 	xor	R32(%r11), R32(%r11)
 
 	mov	-24(vp,n,8), %r8	C do first limb early
-	shrd	$RSH, %r8, %r11
+	shrd	$63, %r8, %r11		C r11 was zeroed above, so r11 = r8 << 1
 
 	and	$3, R32(%rax)
 	je	L(b0)
@@ -73,9 +67,9 @@ PROLOGUE(func)
 	je	L(b2)
 
 L(b3):	mov	-16(vp,n,8), %r9
-	shrd	$RSH, %r9, %r8
+	shrd	$63, %r9, %r8		C r8 = (r9 << 1) | (r8 >> 63)
 	mov	-8(vp,n,8), %r10
-	shrd	$RSH, %r10, %r9
+	shrd	$63, %r10, %r9		C r9 = (r10 << 1) | (r9 >> 63)
 	mov	-24(up,n,8), %r12
 	ADDSUB	%r11, %r12
 	mov	%r12, -24(rp,n,8)
@@ -101,7 +95,7 @@ L(b1):	mov	-24(up,n,8), %r12
 	jmp	L(end)
 
 L(b2):	mov	-16(vp,n,8), %r9
-	shrd	$RSH, %r9, %r8
+	shrd	$63, %r9, %r8		C r8 = (r9 << 1) | (r8 >> 63)
 	mov	-24(up,n,8), %r12
 	ADDSUB	%r11, %r12
 	mov	%r12, -24(rp,n,8)
@@ -116,13 +110,13 @@ L(b2):	mov	-16(vp,n,8), %r9
 
 	ALIGN(16)
 L(top):	mov	-24(vp,n,8), %r8
-	shrd	$RSH, %r8, %r11
+	shrd	$63, %r8, %r11		C r11 = (r8 << 1) | (r11 >> 63)
 L(b0):	mov	-16(vp,n,8), %r9
-	shrd	$RSH, %r9, %r8
+	shrd	$63, %r9, %r8		C r8 = (r9 << 1) | (r8 >> 63)
 	mov	-8(vp,n,8), %r10
-	shrd	$RSH, %r10, %r9
+	shrd	$63, %r10, %r9		C r9 = (r10 << 1) | (r9 >> 63)
 	mov	(vp,n,8), %rbx
-	shrd	$RSH, %rbx, %r10
+	shrd	$63, %rbx, %r10		C r10 = (rbx << 1) | (r10 >> 63)
 
 	add	R32(%rax), R32(%rax)	C restore cy
 
@@ -148,11 +142,10 @@ L(b0):	mov	-16(vp,n,8), %r9
 	add	$4, n
 	js	L(top)
 
-L(end):	shr	$RSH, %r11
+L(end):	add	%r11, %r11		C CF = bit 63 of r11; the pops below leave flags alone
 	pop	%r12
 	pop	%rbx
-	sub	R32(%r11), R32(%rax)
+	sbb	$0, R32(%rax)		C eax -= CF from the add at L(end)
 	neg	R32(%rax)
-	FUNC_EXIT()
 	ret
 EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/aorsmul_1.asm b/gmp/mpn/x86_64/core2/aorsmul_1.asm
index 6b313dd836..1d05b30b59 100644
--- a/gmp/mpn/x86_64/core2/aorsmul_1.asm
+++ b/gmp/mpn/x86_64/core2/aorsmul_1.asm
@@ -1,46 +1,29 @@
 dnl  x86-64 mpn_addmul_1 and mpn_submul_1, optimized for "Core 2".
 
-dnl  Copyright 2003-2005, 2007-2009, 2011, 2012 Free Software Foundation, Inc.
+dnl  Copyright 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
-dnl
+
 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
 C	     cycles/limb
-C AMD K8,K9	 4
-C AMD K10	 4
-C AMD bd1	 5.1
-C AMD bobcat
-C Intel P4	 ?
-C Intel core2	 4.3-4.5 (fluctuating)
-C Intel NHM	 5.0
-C Intel SBR	 4.1
-C Intel atom	 ?
-C VIA nano	 5.25
+C K8,K9:	 4
+C K10:		 4
+C P4:		 ?
+C P6-15:	 4.3-4.7 (fluctuating)
 
 C INPUT PARAMETERS
 define(`rp',	`%rdi')
@@ -50,129 +33,111 @@ define(`v0',	`%rcx')
 
 ifdef(`OPERATION_addmul_1',`
       define(`ADDSUB',        `add')
-      define(`func',     `mpn_addmul_1')
-      define(`func_1c',  `mpn_addmul_1c')
+      define(`func',  `mpn_addmul_1')
 ')
 ifdef(`OPERATION_submul_1',`
       define(`ADDSUB',        `sub')
-      define(`func',     `mpn_submul_1')
-      define(`func_1c',  `mpn_submul_1c')
+      define(`func',  `mpn_submul_1')
 ')
 
-MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-	C For DOS, on the stack we have four saved registers, return address,
-	C space for four register arguments, and finally the carry input.
-
-IFDOS(` define(`carry_in', `72(%rsp)')') dnl
-IFSTD(` define(`carry_in', `%r8')') dnl
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
 
 ASM_START()
 	TEXT
 	ALIGN(16)
-PROLOGUE(func_1c)
-	FUNC_ENTRY(4)
-	push	%rbx
-	push	%rbp
-	lea	(%rdx), %rbx
-	neg	%rbx
-
-	mov	(up), %rax
-	mov	(rp), %r10
-
-	lea	-16(rp,%rdx,8), rp
-	lea	(up,%rdx,8), up
-	mul	%rcx
-	add	carry_in, %rax
-	adc	$0, %rdx
-	jmp	L(start_nc)
-EPILOGUE()
-
-	ALIGN(16)
 PROLOGUE(func)
-	FUNC_ENTRY(4)
-	push	%rbx
-	push	%rbp
-	lea	(%rdx), %rbx
-	neg	%rbx
+	push	%r15			C r15, r12, r13 are saved here and restored at L(ret)
+	push	%r12
+	push	%r13
+	lea	(%rdx), %r15		C r15 = rdx (lea used as a register copy)
+	neg	%r15			C r15 = -rdx, a negative index stepped up until non-negative
 
 	mov	(up), %rax
-	mov	(rp), %r10
 
-	lea	-16(rp,%rdx,8), rp
+	bt	$0, %r15		C CF = bit 0 of r15, same parity as the count in rdx
+	jc	L(odd)			C odd count: separate lead-in
+
+	lea	(rp,%rdx,8), rp		C rp += 8*rdx, so (rp,r15,8) is limb 0 while r15 = -rdx
 	lea	(up,%rdx,8), up
 	mul	%rcx
 
-L(start_nc):
-	bt	$0, R32(%rbx)
-	jc	L(odd)
-
 	lea	(%rax), %r11
-	mov	8(up,%rbx,8), %rax
-	lea	(%rdx), %rbp
-	mul	%rcx
-	add	$2, %rbx
+	mov	8(up,%r15,8), %rax
+	mov	(rp,%r15,8), %r13	C r13 = rp[0] (r15 is still -count here)
+	lea	(%rdx), %r12		C r12 = high half of the product just computed
+
+	add	$2, %r15
 	jns	L(n2)
 
+	mul	%rcx
 	lea	(%rax), %r8
-	mov	(up,%rbx,8), %rax
+	mov	(up,%r15,8), %rax
+	mov	-8(rp,%r15,8), %r10
 	lea	(%rdx), %r9
-	jmp	L(mid)
+	jmp	L(m)
 
-L(odd):	add	$1, %rbx
+L(odd):	lea	(rp,%rdx,8), rp		C same pointer bias as the even path, before mul clobbers rdx
+	lea	(up,%rdx,8), up
+	mul	%rcx
+	add	$1, %r15
 	jns	L(n1)
 
-	lea	(%rax), %r8
-	mov	(up,%rbx,8), %rax
+L(gt1):	lea	(%rax), %r8
+	mov	(up,%r15,8), %rax
+	mov	-8(rp,%r15,8), %r10
 	lea	(%rdx), %r9
 	mul	%rcx
 	lea	(%rax), %r11
-	mov	8(up,%rbx,8), %rax
-	lea	(%rdx), %rbp
-	jmp	L(e)
+	mov	8(up,%r15,8), %rax
+	mov	(rp,%r15,8), %r13
+	lea	(%rdx), %r12
+	add	$2, %r15
+	jns	L(end)			C index no longer negative: skip the loop
 
 	ALIGN(16)
 L(top):	mul	%rcx
 	ADDSUB	%r8, %r10
 	lea	(%rax), %r8
-	mov	(up,%rbx,8), %rax
+	mov	0(up,%r15,8), %rax
 	adc	%r9, %r11
-	mov	%r10, -8(rp,%rbx,8)
-	mov	(rp,%rbx,8), %r10
+	mov	%r10, -24(rp,%r15,8)
+	mov	-8(rp,%r15,8), %r10
 	lea	(%rdx), %r9
-	adc	$0, %rbp
-L(mid):	mul	%rcx
-	ADDSUB	%r11, %r10
+	adc	$0, %r12
+L(m):	mul	%rcx
+	ADDSUB	%r11, %r13
 	lea	(%rax), %r11
-	mov	8(up,%rbx,8), %rax
-	adc	%rbp, %r8
-	mov	%r10, (rp,%rbx,8)
-	mov	8(rp,%rbx,8), %r10
-	lea	(%rdx), %rbp
+	mov	8(up,%r15,8), %rax
+	adc	%r12, %r8
+	mov	%r13, -16(rp,%r15,8)
+	mov	0(rp,%r15,8), %r13
+	lea	(%rdx), %r12
 	adc	$0, %r9
-L(e):	add	$2, %rbx
+
+	add	$2, %r15		C two limbs per iteration
 	js	L(top)
 
-	mul	%rcx
+L(end):	mul	%rcx
 	ADDSUB	%r8, %r10
 	adc	%r9, %r11
-	mov	%r10, -8(rp)
-	adc	$0, %rbp
-L(n2):	mov	(rp), %r10
-	ADDSUB	%r11, %r10
-	adc	%rbp, %rax
-	mov	%r10, (rp)
+	mov	%r10, -24(rp,%r15,8)
+	mov	-8(rp,%r15,8), %r10
+	adc	$0, %r12
+L(r):	ADDSUB	%r11, %r13
+	adc	%r12, %rax
+	mov	%r13, -16(rp,%r15,8)
 	adc	$0, %rdx
-L(n1):	mov	8(rp), %r10
-	ADDSUB	%rax, %r10
-	mov	%r10, 8(rp)
-	mov	R32(%rbx), R32(%rax)	C zero rax
+L(x):	ADDSUB	%rax, %r10
+	mov	%r10, -8(rp,%r15,8)
+	mov	$0, %eax		C zero rax with mov: unlike xor it keeps CF for the adc
 	adc	%rdx, %rax
-	pop	%rbp
-	pop	%rbx
-	FUNC_EXIT()
+L(ret):	pop	%r13
+	pop	%r12
+	pop	%r15
 	ret
+L(n2):	mul	%rcx
+	mov	-8(rp,%r15,8), %r10
+	jmp	L(r)			C join the shared tail at L(r)
+L(n1):	mov	-8(rp,%r15,8), %r10
+	jmp	L(x)			C join the shared tail at L(x)
 EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/copyd.asm b/gmp/mpn/x86_64/core2/copyd.asm
deleted file mode 100644
index f0dc54a55e..0000000000
--- a/gmp/mpn/x86_64/core2/copyd.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl  X86-64 mpn_copyd optimised for Intel Sandy Bridge.
-
-dnl  Copyright 2012 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_copyd)
-include_mpn(`x86_64/fastsse/copyd-palignr.asm')
diff --git a/gmp/mpn/x86_64/core2/copyi.asm b/gmp/mpn/x86_64/core2/copyi.asm
deleted file mode 100644
index 9c26e00c52..0000000000
--- a/gmp/mpn/x86_64/core2/copyi.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl  X86-64 mpn_copyi optimised for Intel Sandy Bridge.
-
-dnl  Copyright 2012 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_copyi)
-include_mpn(`x86_64/fastsse/copyi-palignr.asm')
diff --git a/gmp/mpn/x86_64/core2/divrem_1.asm b/gmp/mpn/x86_64/core2/divrem_1.asm
deleted file mode 100644
index 623bea386c..0000000000
--- a/gmp/mpn/x86_64/core2/divrem_1.asm
+++ /dev/null
@@ -1,237 +0,0 @@
-dnl  x86-64 mpn_divrem_1 -- mpn by limb division.
-
-dnl  Copyright 2004, 2005, 2007-2010, 2012 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C		norm	unorm	frac
-C AMD K8,K9	15	15	12
-C AMD K10	15	15	12
-C Intel P4	44	44	43
-C Intel core2	24	24	19.5
-C Intel corei	19	19	18
-C Intel atom	51	51	36
-C VIA nano	46	44	22.5
-
-C mp_limb_t
-C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
-C               mp_srcptr np, mp_size_t nn, mp_limb_t d)
-
-C mp_limb_t
-C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
-C                      mp_srcptr np, mp_size_t nn, mp_limb_t d,
-C                      mp_limb_t dinv, int cnt)
-
-C INPUT PARAMETERS
-define(`qp',		`%rdi')
-define(`fn_param',	`%rsi')
-define(`up_param',	`%rdx')
-define(`un_param',	`%rcx')
-define(`d',		`%r8')
-define(`dinv',		`%r9')		C only for mpn_preinv_divrem_1
-C       shift passed on stack		C only for mpn_preinv_divrem_1
-
-define(`cnt',		`%rcx')
-define(`up',		`%rsi')
-define(`fn',		`%r12')
-define(`un',		`%rbx')
-
-
-C rax rbx rcx rdx rsi rdi rbp r8  r9  r10 r11 r12 r13 r14 r15
-C         cnt         qp      d  dinv
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-IFSTD(`define(`CNTOFF',		`40($1)')')
-IFDOS(`define(`CNTOFF',		`104($1)')')
-
-ASM_START()
-	TEXT
-	ALIGN(16)
-PROLOGUE(mpn_preinv_divrem_1)
-	FUNC_ENTRY(4)
-IFDOS(`	mov	56(%rsp), %r8	')
-IFDOS(`	mov	64(%rsp), %r9	')
-	xor	R32(%rax), R32(%rax)
-	push	%r13
-	push	%r12
-	push	%rbp
-	push	%rbx
-
-	mov	fn_param, fn
-	mov	un_param, un
-	add	fn_param, un_param
-	mov	up_param, up
-
-	lea	-8(qp,un_param,8), qp
-
-	mov	CNTOFF(%rsp), R8(cnt)
-	shl	R8(cnt), d
-	jmp	L(ent)
-EPILOGUE()
-
-	ALIGN(16)
-PROLOGUE(mpn_divrem_1)
-	FUNC_ENTRY(4)
-IFDOS(`	mov	56(%rsp), %r8	')
-	xor	R32(%rax), R32(%rax)
-	push	%r13
-	push	%r12
-	push	%rbp
-	push	%rbx
-
-	mov	fn_param, fn
-	mov	un_param, un
-	add	fn_param, un_param
-	mov	up_param, up
-	je	L(ret)
-
-	lea	-8(qp,un_param,8), qp
-	xor	R32(%rbp), R32(%rbp)
-
-L(unnormalized):
-	test	un, un
-	je	L(44)
-	mov	-8(up,un,8), %rax
-	cmp	d, %rax
-	jae	L(44)
-	mov	%rbp, (qp)
-	mov	%rax, %rbp
-	lea	-8(qp), qp
-	je	L(ret)
-	dec	un
-L(44):
-	bsr	d, %rcx
-	not	R32(%rcx)
-	sal	R8(%rcx), d
-	sal	R8(%rcx), %rbp
-
-	push	%rcx
-IFSTD(`	push	%rdi		')
-IFSTD(`	push	%rsi		')
-	push	%r8
-IFSTD(`	mov	d, %rdi		')
-IFDOS(`	mov	d, %rcx		')
-	CALL(	mpn_invert_limb)
-	pop	%r8
-IFSTD(`	pop	%rsi		')
-IFSTD(`	pop	%rdi		')
-	pop	%rcx
-
-	mov	%rax, dinv
-	mov	%rbp, %rax
-	test	un, un
-	je	L(frac)
-L(ent):	mov	-8(up,un,8), %rbp
-	shr	R8(%rcx), %rax
-	shld	R8(%rcx), %rbp, %rax
-	sub	$2, un
-	js	L(end)
-
-	ALIGN(16)
-L(top):	lea	1(%rax), %r11
-	mul	dinv
-	mov	(up,un,8), %r10
-	shld	R8(%rcx), %r10, %rbp
-	mov	%rbp, %r13
-	add	%rax, %r13
-	adc	%r11, %rdx
-	mov	%rdx, %r11
-	imul	d, %rdx
-	sub	%rdx, %rbp
-	lea	(d,%rbp), %rax
-	sub	$8, qp
-	cmp	%r13, %rbp
-	cmovc	%rbp, %rax
-	adc	$-1, %r11
-	cmp	d, %rax
-	jae	L(ufx)
-L(uok):	dec	un
-	mov	%r11, 8(qp)
-	mov	%r10, %rbp
-	jns	L(top)
-
-L(end):	lea	1(%rax), %r11
-	sal	R8(%rcx), %rbp
-	mul	dinv
-	add	%rbp, %rax
-	adc	%r11, %rdx
-	mov	%rax, %r11
-	mov	%rdx, %r13
-	imul	d, %rdx
-	sub	%rdx, %rbp
-	mov	d, %rax
-	add	%rbp, %rax
-	cmp	%r11, %rbp
-	cmovc	%rbp, %rax
-	adc	$-1, %r13
-	cmp	d, %rax
-	jae	L(efx)
-L(eok):	mov	%r13, (qp)
-	sub	$8, qp
-	jmp	L(frac)
-
-L(ufx):	sub	d, %rax
-	inc	%r11
-	jmp	L(uok)
-L(efx):	sub	d, %rax
-	inc	%r13
-	jmp	L(eok)
-
-L(frac):mov	d, %rbp
-	neg	%rbp
-	jmp	L(fent)
-
-	ALIGN(16)			C	    K8-K10  P6-CNR P6-NHM  P4
-L(ftop):mul	dinv			C	      0,12   0,17   0,17
-	add	%r11, %rdx		C	      5      8     10
-	mov	%rax, %r11		C	      4      8      3
-	mov	%rdx, %r13		C	      6      9     11
-	imul	%rbp, %rdx		C	      6      9     11
-	mov	d, %rax			C
-	add	%rdx, %rax		C	     10     14     14
-	cmp	%r11, %rdx		C	     10     14     14
-	cmovc	%rdx, %rax		C	     11     15     15
-	adc	$-1, %r13		C
-	mov	%r13, (qp)		C
-	sub	$8, qp			C
-L(fent):lea	1(%rax), %r11		C
-	dec	fn			C
-	jns	L(ftop)			C
-
-	shr	R8(%rcx), %rax
-L(ret):	pop	%rbx
-	pop	%rbp
-	pop	%r12
-	pop	%r13
-	FUNC_EXIT()
-	ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/gcd_1.asm b/gmp/mpn/x86_64/core2/gcd_1.asm
deleted file mode 100644
index e0cab9b4e4..0000000000
--- a/gmp/mpn/x86_64/core2/gcd_1.asm
+++ /dev/null
@@ -1,144 +0,0 @@
-dnl  AMD64 mpn_gcd_1 optimised for Intel C2, NHM, SBR and AMD K10, BD.
-
-dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for AMD64 by Torbjorn
-dnl  Granlund.
-
-dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C	     cycles/bit (approx)
-C AMD K8,K9	 8.50
-C AMD K10	 4.30
-C AMD bd1	 5.00
-C AMD bobcat	10.0
-C Intel P4	18.6
-C Intel core2	 3.83
-C Intel NHM	 5.17
-C Intel SBR	 4.69
-C Intel atom	17.0
-C VIA nano	 5.44
-C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
-
-C TODO
-C  * Optimise inner-loop for specific CPUs.
-C  * Use DIV for 1-by-1 reductions, at least for some CPUs.
-
-C Threshold of when to call bmod when U is one limb.  Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 6)
-
-C INPUT PARAMETERS
-define(`up',    `%rdi')
-define(`n',     `%rsi')
-define(`v0',    `%rdx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-IFDOS(`define(`STACK_ALLOC', 40)')
-IFSTD(`define(`STACK_ALLOC', 8)')
-
-C Undo some configure cleverness.
-C The problem is that C only defines the '1c' variant, and that configure
-C therefore considers modexact_1c to be the base function.  It then adds a
-C special fat rule for mpn_modexact_1_odd, messing up things when a cpudep
-C gcd_1 exists without a corresponding cpudep mode1o.
-ifdef(`WANT_FAT_BINARY', `
-  define(`mpn_modexact_1_odd', `MPN_PREFIX`modexact_1_odd_x86_64'')')
-
-
-ASM_START()
-	TEXT
-	ALIGN(16)
-PROLOGUE(mpn_gcd_1)
-	FUNC_ENTRY(3)
-	mov	(up), %rax	C U low limb
-	or	v0, %rax
-	bsf	%rax, %rax	C min(ctz(u0),ctz(v0))
-
-	bsf	v0, %rcx
-	shr	R8(%rcx), v0
-
-	push	%rax		C preserve common twos over call
-	push	v0		C preserve v0 argument over call
-	sub	$STACK_ALLOC, %rsp	C maintain ABI required rsp alignment
-
-	cmp	$1, n
-	jnz	L(reduce_nby1)
-
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
-	mov	(up), %r8
-	mov	%r8, %rax
-	shr	$BMOD_THRES_LOG2, %r8
-	cmp	%r8, v0
-	ja	L(reduced)
-	jmp	L(bmod)
-
-L(reduce_nby1):
-	cmp	$BMOD_1_TO_MOD_1_THRESHOLD, n
-	jl	L(bmod)
-IFDOS(`	mov	%rdx, %r8	')
-IFDOS(`	mov	%rsi, %rdx	')
-IFDOS(`	mov	%rdi, %rcx	')
-	CALL(	mpn_mod_1)
-	jmp	L(reduced)
-L(bmod):
-IFDOS(`	mov	%rdx, %r8	')
-IFDOS(`	mov	%rsi, %rdx	')
-IFDOS(`	mov	%rdi, %rcx	')
-	CALL(	mpn_modexact_1_odd)
-L(reduced):
-
-	add	$STACK_ALLOC, %rsp
-	pop	%rdx
-
-	bsf	%rax, %rcx
-C	test	%rax, %rax	C FIXME: does this lower latency?
-	jnz	L(mid)
-	jmp	L(end)
-
-	ALIGN(16)		C               K10   BD    C2    NHM   SBR
-L(top):	cmovc	%r10, %rax	C if x-y < 0    0,3   0,3   0,6   0,5   0,5
-	cmovc	%r9, %rdx	C use x,y-x     0,3   0,3   2,8   1,7   1,7
-L(mid):	shr	R8(%rcx), %rax	C               1,7   1,6   2,8   2,8   2,8
-	mov	%rdx, %r10	C               1     1     4     3     3
-	sub	%rax, %r10	C               2     2     5     4     4
-	bsf	%r10, %rcx	C               3     3     6     5     5
-	mov	%rax, %r9	C               2     2     3     3     4
-	sub	%rdx, %rax	C               2     2     4     3     4
-	jnz	L(top)		C
-
-L(end):	pop	%rcx
-	mov	%rdx, %rax
-	shl	R8(%rcx), %rax
-	FUNC_EXIT()
-	ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/gmp-mparam.h b/gmp/mpn/x86_64/core2/gmp-mparam.h
index 0f4f88f780..8207da4895 100644
--- a/gmp/mpn/x86_64/core2/gmp-mparam.h
+++ b/gmp/mpn/x86_64/core2/gmp-mparam.h
@@ -1,217 +1,78 @@
-/* Core 2 gmp-mparam.h -- Compiler/machine parameter header file.
+/* "Core 2" gmp-mparam.h -- Compiler/machine parameter header file.
 
-Copyright 1991, 1993, 1994, 2000-2010, 2012, 2014 Free Software Foundation,
-Inc.
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
 The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
 
-  * the GNU Lesser General Public License as published by the Free
-    Software Foundation; either version 3 of the License, or (at your
-    option) any later version.
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
 
-or
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 
-  * the GNU General Public License as published by the Free Software
-    Foundation; either version 2 of the License, or (at your option) any
-    later version.
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
 
-or both in parallel, as here.
+/* 2133 MHz "Core 2" / 65nm / 4096 Kibyte cache / socket 775 */
 
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library.  If not,
-see https://www.gnu.org/licenses/.  */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-/* 2133 MHz Core 2 (65nm) */
-/* FFT tuning limit = 60000000 */
-/* Generated by tuneup.c, 2014-03-13, gcc 4.5 */
-
-#define MOD_1_NORM_THRESHOLD                 0  /* always */
-#define MOD_1_UNNORM_THRESHOLD               0  /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
-#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD        11
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD        16
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      9
-#define USE_PREINV_DIVREM_1                  1  /* native */
-#define DIV_QR_1_NORM_THRESHOLD              1
-#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
-#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
-#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD           24
-
-#define MUL_TOOM22_THRESHOLD                23
-#define MUL_TOOM33_THRESHOLD                65
-#define MUL_TOOM44_THRESHOLD               179
-#define MUL_TOOM6H_THRESHOLD               268
-#define MUL_TOOM8H_THRESHOLD               357
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD      69
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD     114
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD      73
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD      78
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD     100
-
-#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
-#define SQR_TOOM2_THRESHOLD                 28
-#define SQR_TOOM3_THRESHOLD                102
-#define SQR_TOOM4_THRESHOLD                160
-#define SQR_TOOM6_THRESHOLD                222
-#define SQR_TOOM8_THRESHOLD                296
-
-#define MULMID_TOOM42_THRESHOLD             28
-
-#define MULMOD_BNM1_THRESHOLD               12
-#define SQRMOD_BNM1_THRESHOLD               13
-
-#define MUL_FFT_MODF_THRESHOLD             372  /* k = 5 */
-#define MUL_FFT_TABLE3                                      \
-  { {    372, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
-    {     21, 7}, {     11, 6}, {     23, 7}, {     12, 6}, \
-    {     25, 7}, {     21, 8}, {     11, 7}, {     25, 8}, \
-    {     13, 7}, {     27, 8}, {     15, 7}, {     32, 8}, \
-    {     17, 7}, {     36, 8}, {     19, 7}, {     40, 8}, \
-    {     21, 9}, {     11, 8}, {     27, 9}, {     15, 8}, \
-    {     35, 9}, {     19, 8}, {     41, 9}, {     23, 8}, \
-    {     47, 9}, {     27,10}, {     15, 9}, {     43,10}, \
-    {     23, 9}, {     51,11}, {     15,10}, {     31, 9}, \
-    {     63,10}, {     39, 9}, {     83,10}, {     47, 9}, \
-    {     95,10}, {     55,11}, {     31,10}, {     79,11}, \
-    {     47,10}, {     95,12}, {     31,11}, {     63,10}, \
-    {    127, 9}, {    255,10}, {    135, 9}, {    271,11}, \
-    {     79,10}, {    159, 9}, {    319,10}, {    167,11}, \
-    {     95,10}, {    191, 9}, {    383,10}, {    207,11}, \
-    {    111,12}, {     63,11}, {    127,10}, {    271,11}, \
-    {    143,10}, {    287, 9}, {    575,10}, {    303,11}, \
-    {    159,10}, {    319,12}, {     95,11}, {    191,10}, \
-    {    383,11}, {    207,10}, {    415,11}, {    223,13}, \
-    {     63,12}, {    127,11}, {    271,10}, {    543,11}, \
-    {    287,10}, {    575,11}, {    303,10}, {    607,12}, \
-    {    159,11}, {    319,10}, {    639,11}, {    351,12}, \
-    {    191,11}, {    415,12}, {    223,11}, {    479,13}, \
-    {    127,12}, {    255,11}, {    543,12}, {    287,11}, \
-    {    607,12}, {    319,11}, {    639,12}, {    351,11}, \
-    {    703,13}, {    191,12}, {    415,11}, {    831,12}, \
-    {    479,14}, {    127,13}, {    255,12}, {    607,13}, \
-    {    319,12}, {    703,13}, {    383,12}, {    831,13}, \
-    {    447,12}, {    959,14}, {    255,13}, {    511,12}, \
-    {   1023,13}, {    575,12}, {   1215,13}, {    639,12}, \
-    {   1279,13}, {    703,14}, {    383,13}, {    831,12}, \
-    {   1663,13}, {    895,15}, {    255,14}, {    511,13}, \
-    {   1151,14}, {    639,13}, {   1343,14}, {    767,13}, \
-    {   1599,14}, {    895,15}, {    511,14}, {   1279,13}, \
-    {   2687,14}, {   1407,13}, {   2815,15}, {    767,14}, \
-    {   1535,13}, {   3199,14}, {   1663,13}, {   3455,16}, \
-    {    511,15}, {   1023,14}, {   2047,13}, {   4095,14}, \
-    {   2175,12}, {   8959,14}, {   2303,13}, {   4607,12}, \
-    {   9471,14}, {   2431,13}, {   4863,12}, {   9983,15}, \
-    {   1279,14}, {   2559,12}, {  10239,14}, {   2687,12}, \
-    {  11775,15}, {   1535,14}, {   3327,13}, {   6655,14}, \
-    {   3455,13}, {   6911,14}, {   3583,12}, {  14335,11}, \
-    {  28671,10}, {  57343,11}, {   2048,12}, {   4096,13}, \
-    {   8192,14}, {  16384,15}, {  32768,16}, {  65536,17}, \
-    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
-    {2097152,22}, {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 183
-#define MUL_FFT_THRESHOLD                 4736
-
-#define SQR_FFT_MODF_THRESHOLD             340  /* k = 5 */
-#define SQR_FFT_TABLE3                                      \
-  { {    340, 5}, {     15, 6}, {      8, 5}, {     17, 6}, \
-    {      9, 5}, {     19, 6}, {     23, 7}, {     12, 6}, \
-    {     25, 7}, {     21, 8}, {     11, 7}, {     25, 8}, \
-    {     13, 7}, {     27, 8}, {     15, 7}, {     31, 8}, \
-    {     21, 9}, {     11, 8}, {     27, 9}, {     15, 8}, \
-    {     33, 9}, {     19, 8}, {     41, 9}, {     23, 8}, \
-    {     47, 9}, {     27,10}, {     15, 9}, {     43,10}, \
-    {     23, 9}, {     55,10}, {     31, 9}, {     67,10}, \
-    {     39, 9}, {     79,10}, {     47,11}, {     31,10}, \
-    {     79,11}, {     47,10}, {     95,12}, {     31,11}, \
-    {     63,10}, {    127, 9}, {    255, 8}, {    511, 9}, \
-    {    271, 8}, {    543,11}, {     79, 9}, {    319, 8}, \
-    {    639,11}, {     95,10}, {    191, 9}, {    383,10}, \
-    {    207, 9}, {    415,12}, {     63,11}, {    127,10}, \
-    {    271, 9}, {    543,10}, {    287, 9}, {    575,10}, \
-    {    303, 9}, {    607,10}, {    319, 9}, {    639,11}, \
-    {    175,12}, {     95,11}, {    191,10}, {    383,11}, \
-    {    207,10}, {    415,13}, {     63,12}, {    127,11}, \
-    {    271,10}, {    543,11}, {    287,10}, {    575,11}, \
-    {    303,10}, {    607,11}, {    319,10}, {    639,11}, \
-    {    351,12}, {    191,11}, {    415,10}, {    831,12}, \
-    {    223,11}, {    447,10}, {    895,11}, {    479,13}, \
-    {    127,12}, {    255,11}, {    543,12}, {    287,11}, \
-    {    607,12}, {    319,11}, {    639,12}, {    351,13}, \
-    {    191,12}, {    415,11}, {    831,12}, {    479,14}, \
-    {    127,13}, {    255,12}, {    607,13}, {    319,12}, \
-    {    703,13}, {    383,12}, {    831,13}, {    447,12}, \
-    {    959,14}, {    255,13}, {    511,12}, {   1023,13}, \
-    {    575,12}, {   1215,13}, {    639,12}, {   1279,13}, \
-    {    703,14}, {    383,13}, {    831,12}, {   1663,13}, \
-    {    959,15}, {    255,14}, {    511,13}, {   1087,12}, \
-    {   2175,13}, {   1215,14}, {    639,13}, {   1343,12}, \
-    {   2687,13}, {   1407,12}, {   2815,14}, {    767,13}, \
-    {   1663,14}, {    895,15}, {    511,14}, {   1023,13}, \
-    {   2175,14}, {   1151,13}, {   2303,12}, {   4607,13}, \
-    {   2431,12}, {   4863,14}, {   1279,13}, {   2687,14}, \
-    {   1407,15}, {    767,14}, {   1535,13}, {   3071,14}, \
-    {   1663,13}, {   3455,12}, {   6911,14}, {   1791,13}, \
-    {   3583,16}, {    511,15}, {   1023,14}, {   2175,13}, \
-    {   4351,14}, {   2303,13}, {   4607,14}, {   2431,13}, \
-    {   4863,15}, {   1279,14}, {   2815,13}, {   5631,14}, \
-    {   2943,13}, {   5887,15}, {   1535,14}, {   3455,13}, \
-    {   6911,14}, {  16384,15}, {  32768,16}, {  65536,17}, \
-    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
-    {2097152,22}, {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 179
-#define SQR_FFT_THRESHOLD                 3008
-
-#define MULLO_BASECASE_THRESHOLD             0  /* always */
-#define MULLO_DC_THRESHOLD                  62
-#define MULLO_MUL_N_THRESHOLD             9174
-
-#define DC_DIV_QR_THRESHOLD                 46
-#define DC_DIVAPPR_Q_THRESHOLD             155
-#define DC_BDIV_QR_THRESHOLD                50
-#define DC_BDIV_Q_THRESHOLD                 94
-
-#define INV_MULMOD_BNM1_THRESHOLD           48
-#define INV_NEWTON_THRESHOLD               156
-#define INV_APPR_THRESHOLD                 155
-
-#define BINV_NEWTON_THRESHOLD              234
-#define REDC_1_TO_REDC_2_THRESHOLD          22
-#define REDC_2_TO_REDC_N_THRESHOLD          48
-
-#define MU_DIV_QR_THRESHOLD               1187
-#define MU_DIVAPPR_Q_THRESHOLD            1142
-#define MUPI_DIV_QR_THRESHOLD               74
-#define MU_BDIV_QR_THRESHOLD              1017
-#define MU_BDIV_Q_THRESHOLD               1187
-
-#define POWM_SEC_TABLE  1,64,131,269,466
-
-#define MATRIX22_STRASSEN_THRESHOLD         19
-#define HGCD_THRESHOLD                     117
-#define HGCD_APPR_THRESHOLD                151
-#define HGCD_REDUCE_THRESHOLD             2121
-#define GCD_DC_THRESHOLD                   427
-#define GCDEXT_DC_THRESHOLD                342
-#define JACOBI_BASE_METHOD                   4
-
-#define GET_STR_DC_THRESHOLD                11
-#define GET_STR_PRECOMPUTE_THRESHOLD        18
-#define SET_STR_DC_THRESHOLD               552
-#define SET_STR_PRECOMPUTE_THRESHOLD      1561
-
-#define FAC_DSC_THRESHOLD                  656
-#define FAC_ODD_THRESHOLD                   23
+/* Generated by tuneup.c, 2009-01-14, gcc 4.2 */
+
+#define MUL_KARATSUBA_THRESHOLD          18
+#define MUL_TOOM3_THRESHOLD              65
+#define MUL_TOOM44_THRESHOLD            166
+
+#define SQR_BASECASE_THRESHOLD            0  /* always (native) */
+#define SQR_KARATSUBA_THRESHOLD          32
+#define SQR_TOOM3_THRESHOLD              97
+#define SQR_TOOM4_THRESHOLD             163
+
+#define MULLOW_BASECASE_THRESHOLD         0  /* always */
+#define MULLOW_DC_THRESHOLD              20
+#define MULLOW_MUL_N_THRESHOLD          232
+
+#define DIV_SB_PREINV_THRESHOLD           0  /* always */
+#define DIV_DC_THRESHOLD                 60
+#define POWM_THRESHOLD                   77
+
+#define MATRIX22_STRASSEN_THRESHOLD      25
+#define HGCD_THRESHOLD                  140
+#define GCD_DC_THRESHOLD                691
+#define GCDEXT_DC_THRESHOLD             760
+#define JACOBI_BASE_METHOD                1
+
+#define MOD_1_NORM_THRESHOLD              0  /* always */
+#define MOD_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1_THRESHOLD                 3
+#define MOD_1_2_THRESHOLD                 5
+#define MOD_1_4_THRESHOLD                20
+#define USE_PREINV_DIVREM_1               1  /* native */
+#define USE_PREINV_MOD_1                  1
+#define DIVEXACT_1_THRESHOLD              0  /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD          0  /* always (native) */
+
+#define GET_STR_DC_THRESHOLD             10
+#define GET_STR_PRECOMPUTE_THRESHOLD     16
+#define SET_STR_DC_THRESHOLD            668
+#define SET_STR_PRECOMPUTE_THRESHOLD   2052
+
+#define MUL_FFT_TABLE  { 336, 672, 1600, 2816, 7168, 20480, 81920, 327680, 786432, 0 }
+#define MUL_FFT_MODF_THRESHOLD          352
+#define MUL_FFT_THRESHOLD              3456
+
+#define SQR_FFT_TABLE  { 336, 736, 1728, 3328, 7168, 20480, 81920, 327680, 0 }
+#define SQR_FFT_MODF_THRESHOLD          352
+#define SQR_FFT_THRESHOLD              2432
+
+/* Generated 2009-01-12, gcc 4.2 */
+
+#define MUL_FFT_TABLE2 {{1,4}, {273,5}, {545,6}, {1217,7}, {3201,8}, {6913,9}, {7681,8}, {8449,9}, {9729,8}, {10497,9}, {13825,10}, {15361,9}, {19969,10}, {23553,9}, {28161,11}, {30721,10}, {31745,9}, {34305,10}, {39937,9}, {42497,10}, {56321,11}, {63489,10}, {81409,11}, {92161,10}, {93185,11}, {96257,12}, {126977,11}, {131073,10}, {138241,11}, {167937,10}, {169473,11}, {169985,10}, {172033,11}, {195585,9}, {196097,11}, {198657,10}, {208897,11}, {217089,12}, {258049,11}, {261121,9}, {262657,10}, {275457,11}, {302081,10}, {307201,11}, {331777,12}, {389121,11}, {425985,13}, {516097,12}, {520193,11}, {598017,12}, {610305,11}, {614401,12}, {651265,11}, {653313,10}, {654337,11}, {673793,10}, {674817,11}, {677889,10}, {679937,11}, {718849,10}, {719873,12}, {782337,11}, {850945,12}, {913409,11}, {925697,13}, {1040385,12}, {1044481,11}, {1112065,12}, {1175553,11}, {1244161,12}, {1306625,11}, {1310721,12}, {1327105,11}, {1347585,12}, {1355777,11}, {1366017,12}, {1439745,13}, {1564673,12}, {1835009,14}, {1900545,12}, {1904641,14}, {2080769,13}, {2088961,12}, {2488321,13}, {2613249,12}, {2879489,13}, {2932737,12}, {2940929,13}, {3137537,12}, {3403777,13}, {3661825,12}, {3928065,14}, {4177921,13}, {4186113,12}, {4452353,13}, {4710401,12}, {4978689,13}, {5234689,12}, {5500929,13}, {5758977,14}, {6275073,13}, {7856129,15}, {8355841,14}, {8372225,13}, {9957377,14}, {MP_SIZE_T_MAX, 0}}
+
+#define SQR_FFT_TABLE2 {{1,4}, {241,5}, {545,6}, {1345,7}, {3201,8}, {6913,9}, {7681,8}, {8961,9}, {9729,8}, {10497,9}, {13825,10}, {15361,9}, {19969,10}, {23553,9}, {28161,11}, {30721,10}, {31745,9}, {34305,10}, {55297,11}, {63489,10}, {80897,11}, {94209,10}, {97281,12}, {126977,11}, {129025,9}, {130049,10}, {138753,11}, {162817,9}, {164353,11}, {170497,10}, {178177,11}, {183297,10}, {184321,11}, {194561,10}, {208897,12}, {219137,11}, {221185,12}, {258049,11}, {261121,9}, {261633,10}, {267777,9}, {268289,11}, {270337,10}, {274945,9}, {276481,10}, {278529,11}, {292865,9}, {293377,10}, {295937,9}, {296449,10}, {306177,9}, {309249,10}, {310273,11}, {328705,12}, {331777,11}, {335873,12}, {344065,11}, {346113,12}, {352257,11}, {356353,12}, {389121,11}, {395265,10}, {398337,11}, {419841,10}, {421889,11}, {423937,13}, {516097,12}, {520193,11}, {546817,10}, {550913,11}, {561153,10}, {563201,11}, {579585,10}, {585729,11}, {621569,12}, {636929,11}, {638977,12}, {651265,11}, {714753,10}, {716801,11}, {718849,12}, {782337,11}, {849921,12}, {913409,11}, {954369,13}, {1040385,12}, {1044481,11}, {1112065,12}, {1175553,11}, {1243137,12}, {1306625,11}, {1374209,12}, {1437697,13}, {1564673,12}, {1961985,14}, {2080769,13}, {2088961,12}, {2486273,13}, {2613249,12}, {2879489,13}, {3137537,12}, {3272705,13}, {3661825,12}, {3928065,14}, {4177921,13}, {4186113,12}, {4452353,13}, {4710401,12}, {4976641,13}, {5234689,12}, {5320705,13}, {5324801,12}, {5447681,13}, {5455873,12}, {5500929,13}, {5758977,14}, {6275073,13}, {6283265,12}, {6549505,13}, {7856129,15}, {8355841,14}, {8372225,13}, {9953281,14}, {MP_SIZE_T_MAX, 0}}
diff --git a/gmp/mpn/x86_64/core2/lshift.asm b/gmp/mpn/x86_64/core2/lshift.asm
index 8ccafeca6c..60518901eb 100644
--- a/gmp/mpn/x86_64/core2/lshift.asm
+++ b/gmp/mpn/x86_64/core2/lshift.asm
@@ -1,83 +1,64 @@
 dnl  x86-64 mpn_lshift optimized for "Core 2".
 
-dnl  Copyright 2007, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
+dnl  Copyright 2007 Free Software Foundation, Inc.
 dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
+dnl  This file is part of the GNU MP Library.
 dnl
-dnl  or both in parallel, as here.
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
 dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
 dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
 
 C	     cycles/limb
-C AMD K8,K9	 4.25
-C AMD K10	 4.25
-C Intel P4	14.7
-C Intel core2	 1.27
-C Intel NHM	 1.375	(up to about n = 260, then 1.5)
-C Intel SBR	 1.87
-C Intel atom	 ?
-C VIA nano	 ?
+C K8,K9:	 4.25
+C K10:		 4.25
+C P4:		14.7
+C P6-15:	 1.27
 
 
 C INPUT PARAMETERS
 define(`rp',	`%rdi')
 define(`up',	`%rsi')
 define(`n',	`%rdx')
-define(`cnt',	`%rcx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
+define(`cnt',	`%cl')
 
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_lshift)
-	FUNC_ENTRY(4)
 	lea	-8(rp,n,8), rp
 	lea	-8(up,n,8), up
 
-	mov	R32(%rdx), R32(%rax)
-	and	$3, R32(%rax)
+	mov	%edx, %eax
+	and	$3, %eax
 	jne	L(nb00)
 L(b00):	C n = 4, 8, 12, ...
 	mov	(up), %r10
 	mov	-8(up), %r11
-	xor	R32(%rax), R32(%rax)
-	shld	R8(cnt), %r10, %rax
+	xor	%eax, %eax
+	shld	%cl, %r10, %rax
 	mov	-16(up), %r8
 	lea	24(rp), rp
 	sub	$4, n
 	jmp	L(00)
 
 L(nb00):C n = 1, 5, 9, ...
-	cmp	$2, R32(%rax)
+	cmp	$2, %eax
 	jae	L(nb01)
 L(b01):	mov	(up), %r9
-	xor	R32(%rax), R32(%rax)
-	shld	R8(cnt), %r9, %rax
+	xor	%eax, %eax
+	shld	%cl, %r9, %rax
 	sub	$2, n
 	jb	L(le1)
 	mov	-8(up), %r10
@@ -85,65 +66,62 @@ L(b01):	mov	(up), %r9
 	lea	-8(up), up
 	lea	16(rp), rp
 	jmp	L(01)
-L(le1):	shl	R8(cnt), %r9
+L(le1):	shl	%cl, %r9
 	mov	%r9, (rp)
-	FUNC_EXIT()
 	ret
 
 L(nb01):C n = 2, 6, 10, ...
 	jne	L(b11)
 L(b10):	mov	(up), %r8
 	mov	-8(up), %r9
-	xor	R32(%rax), R32(%rax)
-	shld	R8(cnt), %r8, %rax
+	xor	%eax, %eax
+	shld	%cl, %r8, %rax
 	sub	$3, n
 	jb	L(le2)
 	mov	-16(up), %r10
 	lea	-16(up), up
 	lea	8(rp), rp
 	jmp	L(10)
-L(le2):	shld	R8(cnt), %r9, %r8
+L(le2):	shld	%cl, %r9, %r8
 	mov	%r8, (rp)
-	shl	R8(cnt), %r9
+	shl	%cl, %r9
 	mov	%r9, -8(rp)
-	FUNC_EXIT()
 	ret
 
 	ALIGN(16)			C performance critical!
 L(b11):	C n = 3, 7, 11, ...
 	mov	(up), %r11
 	mov	-8(up), %r8
-	xor	R32(%rax), R32(%rax)
-	shld	R8(cnt), %r11, %rax
+	xor	%eax, %eax
+	shld	%cl, %r11, %rax
 	mov	-16(up), %r9
 	lea	-24(up), up
 	sub	$4, n
 	jb	L(end)
 
 	ALIGN(16)
-L(top):	shld	R8(cnt), %r8, %r11
+L(top):	shld	%cl, %r8, %r11
 	mov	(up), %r10
 	mov	%r11, (rp)
-L(10):	shld	R8(cnt), %r9, %r8
+L(10):	shld	%cl, %r9, %r8
 	mov	-8(up), %r11
 	mov	%r8, -8(rp)
-L(01):	shld	R8(cnt), %r10, %r9
+L(01):	shld	%cl, %r10, %r9
 	mov	-16(up), %r8
 	mov	%r9, -16(rp)
-L(00):	shld	R8(cnt), %r11, %r10
+L(00):	shld	%cl, %r11, %r10
 	mov	-24(up), %r9
+	lea	-32(up), up
 	mov	%r10, -24(rp)
-	add	$-32, up
 	lea	-32(rp), rp
 	sub	$4, n
 	jnc	L(top)
 
-L(end):	shld	R8(cnt), %r8, %r11
+L(end):	shld	%cl, %r8, %r11
 	mov	%r11, (rp)
-	shld	R8(cnt), %r9, %r8
+	shld	%cl, %r9, %r8
 	mov	%r8, -8(rp)
-	shl	R8(cnt), %r9
+	shl	%cl, %r9
 	mov	%r9, -16(rp)
-	FUNC_EXIT()
 	ret
 EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/lshiftc.asm b/gmp/mpn/x86_64/core2/lshiftc.asm
deleted file mode 100644
index 65c7b2f1b8..0000000000
--- a/gmp/mpn/x86_64/core2/lshiftc.asm
+++ /dev/null
@@ -1,159 +0,0 @@
-dnl  x86-64 mpn_lshiftc optimized for "Core 2".
-
-dnl  Copyright 2007, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C	     cycles/limb
-C AMD K8,K9	 ?
-C AMD K10	 ?
-C Intel P4	 ?
-C Intel core2	 1.5
-C Intel NHM	 2.25	(up to about n = 260, then 1.875)
-C Intel SBR	 2.25
-C Intel atom	 ?
-C VIA nano	 ?
-
-
-C INPUT PARAMETERS
-define(`rp',	`%rdi')
-define(`up',	`%rsi')
-define(`n',	`%rdx')
-define(`cnt',	`%rcx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
-	TEXT
-	ALIGN(16)
-PROLOGUE(mpn_lshiftc)
-	FUNC_ENTRY(4)
-	lea	-8(rp,n,8), rp
-	lea	-8(up,n,8), up
-
-	mov	R32(%rdx), R32(%rax)
-	and	$3, R32(%rax)
-	jne	L(nb00)
-L(b00):	C n = 4, 8, 12, ...
-	mov	(up), %r10
-	mov	-8(up), %r11
-	xor	R32(%rax), R32(%rax)
-	shld	R8(cnt), %r10, %rax
-	mov	-16(up), %r8
-	lea	24(rp), rp
-	sub	$4, n
-	jmp	L(00)
-
-L(nb00):C n = 1, 5, 9, ...
-	cmp	$2, R32(%rax)
-	jae	L(nb01)
-L(b01):	mov	(up), %r9
-	xor	R32(%rax), R32(%rax)
-	shld	R8(cnt), %r9, %rax
-	sub	$2, n
-	jb	L(le1)
-	mov	-8(up), %r10
-	mov	-16(up), %r11
-	lea	-8(up), up
-	lea	16(rp), rp
-	jmp	L(01)
-L(le1):	shl	R8(cnt), %r9
-	not	%r9
-	mov	%r9, (rp)
-	FUNC_EXIT()
-	ret
-
-L(nb01):C n = 2, 6, 10, ...
-	jne	L(b11)
-L(b10):	mov	(up), %r8
-	mov	-8(up), %r9
-	xor	R32(%rax), R32(%rax)
-	shld	R8(cnt), %r8, %rax
-	sub	$3, n
-	jb	L(le2)
-	mov	-16(up), %r10
-	lea	-16(up), up
-	lea	8(rp), rp
-	jmp	L(10)
-L(le2):	shld	R8(cnt), %r9, %r8
-	not	%r8
-	mov	%r8, (rp)
-	shl	R8(cnt), %r9
-	not	%r9
-	mov	%r9, -8(rp)
-	FUNC_EXIT()
-	ret
-
-	ALIGN(16)			C performance critical!
-L(b11):	C n = 3, 7, 11, ...
-	mov	(up), %r11
-	mov	-8(up), %r8
-	xor	R32(%rax), R32(%rax)
-	shld	R8(cnt), %r11, %rax
-	mov	-16(up), %r9
-	lea	-24(up), up
-	sub	$4, n
-	jb	L(end)
-
-	ALIGN(16)
-L(top):	shld	R8(cnt), %r8, %r11
-	mov	(up), %r10
-	not	%r11
-	mov	%r11, (rp)
-L(10):	shld	R8(cnt), %r9, %r8
-	mov	-8(up), %r11
-	not	%r8
-	mov	%r8, -8(rp)
-L(01):	shld	R8(cnt), %r10, %r9
-	mov	-16(up), %r8
-	not	%r9
-	mov	%r9, -16(rp)
-L(00):	shld	R8(cnt), %r11, %r10
-	mov	-24(up), %r9
-	not	%r10
-	mov	%r10, -24(rp)
-	add	$-32, up
-	lea	-32(rp), rp
-	sub	$4, n
-	jnc	L(top)
-
-L(end):	shld	R8(cnt), %r8, %r11
-	not	%r11
-	mov	%r11, (rp)
-	shld	R8(cnt), %r9, %r8
-	not	%r8
-	mov	%r8, -8(rp)
-	shl	R8(cnt), %r9
-	not	%r9
-	mov	%r9, -16(rp)
-	FUNC_EXIT()
-	ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/mul_basecase.asm b/gmp/mpn/x86_64/core2/mul_basecase.asm
deleted file mode 100644
index d16be852f7..0000000000
--- a/gmp/mpn/x86_64/core2/mul_basecase.asm
+++ /dev/null
@@ -1,975 +0,0 @@
-dnl  X86-64 mpn_mul_basecase optimised for Intel Nehalem/Westmere.
-dnl  It also seems good for Conroe/Wolfdale.
-
-dnl  Contributed to the GNU project by Torbjörn Granlund.
-
-dnl  Copyright 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb	mul_1		mul_2		mul_3		addmul_2
-C AMD K8,K9
-C AMD K10
-C AMD bull
-C AMD pile
-C AMD steam
-C AMD bobcat
-C AMD jaguar
-C Intel P4
-C Intel core	 4.0		 4.0		 -		4.18-4.25
-C Intel NHM	 3.75		 3.8		 -		4.06-4.2
-C Intel SBR
-C Intel IBR
-C Intel HWL
-C Intel BWL
-C Intel atom
-C VIA nano
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C Code structure:
-C
-C
-C               m_1(0m4)        m_1(1m4)        m_1(2m4)        m_1(3m4)
-C                  |               |               |               |
-C        m_2(0m4)  |     m_2(1m4)  |     m_2(2m4)  |     m_2(3m4)  |
-C           |      /        |      /        |      /        |      /
-C           |     /         |     /         |     /         |     /
-C           |    /          |    /          |    /          |    /
-C          \|/ |/_         \|/ |/_         \|/ |/_         \|/ |/_
-C             _____           _____           _____           _____
-C            /     \         /     \         /     \         /     \
-C          \|/      |      \|/      |      \|/      |      \|/      |
-C        am_2(0m4)  |    am_2(1m4)  |    am_2(2m4)  |    am_2(3m4)  |
-C           \      /|\      \      /|\      \      /|\      \      /|\
-C            \_____/         \_____/         \_____/         \_____/
-
-C TODO
-C  * Tune.  None done so far.
-C  * Currently 2687 bytes, making it smaller would be nice.
-C  * Implement some basecases, say for un < 4.
-C  * Try zeroing with xor in m2 loops.
-C  * Try re-rolling the m2 loops to avoid the current 9 insn code duplication
-C    between loop header and wind-down code.
-C  * Consider adc reg,reg instead of adc $0,reg in m2 loops.  This save a byte.
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-C Define this to $1 to use late loop index variable as zero, $2 to use an
-C explicit $0.
-define(`Z',`$1')
-
-define(`rp',       `%rdi')
-define(`up',       `%rsi')
-define(`un_param', `%rdx')
-define(`vp_param', `%rcx')	C FIXME reallocate vp to rcx but watch performance!
-define(`vn_param', `%r8')
-
-define(`un',       `%r9')
-define(`vn',       `(%rsp)')
-
-define(`v0',       `%r10')
-define(`v1',       `%r11')
-define(`w0',       `%rbx')
-define(`w1',       `%rcx')
-define(`w2',       `%rbp')
-define(`w3',       `%r12')
-define(`i',        `%r13')
-define(`vp',       `%r14')
-
-define(`X0',       `%r8')
-define(`X1',       `%r15')
-
-C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-define(`ALIGNx', `ALIGN(16)')
-
-define(`N', 85)
-ifdef(`N',,`define(`N',0)')
-define(`MOV', `ifelse(eval(N & $3),0,`mov	$1, $2',`lea	($1), $2')')
-
-ASM_START()
-	TEXT
-	ALIGN(32)
-PROLOGUE(mpn_mul_basecase)
-	FUNC_ENTRY(4)
-IFDOS(`	mov	56(%rsp), %r8d	')
-	mov	(up), %rax		C shared for mul_1 and mul_2
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-
-	mov	(vp_param), v0		C shared for mul_1 and mul_2
-
-	xor	un, un
-	sub	un_param, un		C un = -un_param
-
-	lea	(up,un_param,8), up
-	lea	(rp,un_param,8), rp
-
-	mul	v0			C shared for mul_1 and mul_2
-
-	test	$1, R8(vn_param)
-	jz	L(m2)
-
-	lea	8(vp_param), vp		C FIXME: delay until known needed
-
-	test	$1, R8(un)
-	jnz	L(m1x1)
-
-L(m1x0):test	$2, R8(un)
-	jnz	L(m1s2)
-
-L(m1s0):
-	lea	(un), i
-	mov	%rax, (rp,un,8)
-	mov	8(up,un,8), %rax
-	mov	%rdx, w0		C FIXME: Use lea?
-	lea	L(do_am0)(%rip), %rbp
-	jmp	L(m1e0)
-
-L(m1s2):
-	lea	2(un), i
-	mov	%rax, (rp,un,8)
-	mov	8(up,un,8), %rax
-	mov	%rdx, w0		C FIXME: Use lea?
-	mul	v0
-	lea	L(do_am2)(%rip), %rbp
-	test	i, i
-	jnz	L(m1e2)
-	add	%rax, w0
-	adc	$0, %rdx
-	mov	w0, I(-8(rp),8(rp,un,8))
-	mov	%rdx, I((rp),16(rp,un,8))
-	jmp	L(ret2)
-
-L(m1x1):test	$2, R8(un)
-	jz	L(m1s3)
-
-L(m1s1):
-	lea	1(un), i
-	mov	%rax, (rp,un,8)
-	test	i, i
-	jz	L(1)
-	mov	8(up,un,8), %rax
-	mov	%rdx, w1		C FIXME: Use lea?
-	lea	L(do_am1)(%rip), %rbp
-	jmp	L(m1e1)
-L(1):	mov	%rdx, I((rp),8(rp,un,8))
-	jmp	L(ret2)
-
-L(m1s3):
-	lea	-1(un), i
-	mov	%rax, (rp,un,8)
-	mov	8(up,un,8), %rax
-	mov	%rdx, w1		C FIXME: Use lea?
-	lea	L(do_am3)(%rip), %rbp
-	jmp	L(m1e3)
-
-	ALIGNx
-L(m1top):
-	mul	v0
-	mov	w1, -16(rp,i,8)
-L(m1e2):xor	R32(w1), R32(w1)
-	add	%rax, w0
-	mov	(up,i,8), %rax
-	adc	%rdx, w1
-	mov	w0, -8(rp,i,8)
-L(m1e1):xor	R32(w0), R32(w0)
-	mul	v0
-	add	%rax, w1
-	mov	8(up,i,8), %rax
-	adc	%rdx, w0
-	mov	w1, (rp,i,8)
-L(m1e0):xor	R32(w1), R32(w1)
-	mul	v0
-	add	%rax, w0
-	mov	16(up,i,8), %rax
-	adc	%rdx, w1
-	mov	w0, 8(rp,i,8)
-L(m1e3):xor	R32(w0), R32(w0)
-	mul	v0
-	add	%rax, w1
-	mov	24(up,i,8), %rax
-	adc	%rdx, w0
-	add	$4, i
-	js	L(m1top)
-
-	mul	v0
-	mov	w1, I(-16(rp),-16(rp,i,8))
-	add	%rax, w0
-	adc	$0, %rdx
-	mov	w0, I(-8(rp),-8(rp,i,8))
-	mov	%rdx, I((rp),(rp,i,8))
-
-	dec	vn_param
-	jz	L(ret2)
-	lea	-8(rp), rp
-	jmp	*%rbp
-
-L(m2):
-	mov	8(vp_param), v1
-	lea	16(vp_param), vp	C FIXME: delay until known needed
-
-	test	$1, R8(un)
-	jnz	L(bx1)
-
-L(bx0):	test	$2, R8(un)
-	jnz	L(b10)
-
-L(b00):	lea	(un), i
-	mov	%rax, (rp,un,8)
-	mov	%rdx, w1		C FIXME: Use lea?
-	mov	(up,un,8), %rax
-	mov	$0, R32(w2)
-	jmp	L(m2e0)
-
-L(b10):	lea	-2(un), i
-	mov	%rax, w2		C FIXME: Use lea?
-	mov	(up,un,8), %rax
-	mov	%rdx, w3		C FIXME: Use lea?
-	mov	$0, R32(w0)
-	jmp	L(m2e2)
-
-L(bx1):	test	$2, R8(un)
-	jz	L(b11)
-
-L(b01):	lea	1(un), i
-	mov	%rax, (rp,un,8)
-	mov	(up,un,8), %rax
-	mov	%rdx, w0		C FIXME: Use lea?
-	mov	$0, R32(w1)
-	jmp	L(m2e1)
-
-L(b11):	lea	-1(un), i
-	mov	%rax, w1		C FIXME: Use lea?
-	mov	(up,un,8), %rax
-	mov	%rdx, w2		C FIXME: Use lea?
-	mov	$0, R32(w3)
-	jmp	L(m2e3)
-
-	ALIGNx
-L(m2top0):
-	mul	v0
-	add	%rax, w3
-	mov	-8(up,i,8), %rax
-	mov	w3, -8(rp,i,8)
-	adc	%rdx, w0
-	adc	$0, R32(w1)
-	mul	v1
-	add	%rax, w0
-	adc	%rdx, w1
-	mov	$0, R32(w2)
-	mov	(up,i,8), %rax
-	mul	v0
-	add	%rax, w0
-	mov	w0, (rp,i,8)
-	adc	%rdx, w1
-	mov	(up,i,8), %rax
-	adc	$0, R32(w2)
-L(m2e0):mul	v1
-	add	%rax, w1
-	adc	%rdx, w2
-	mov	8(up,i,8), %rax
-	mul	v0
-	mov	$0, R32(w3)
-	add	%rax, w1
-	adc	%rdx, w2
-	adc	$0, R32(w3)
-	mov	8(up,i,8), %rax
-	mul	v1
-	add	%rax, w2
-	mov	w1, 8(rp,i,8)
-	adc	%rdx, w3
-	mov	$0, R32(w0)
-	mov	16(up,i,8), %rax
-	mul	v0
-	add	%rax, w2
-	mov	16(up,i,8), %rax
-	adc	%rdx, w3
-	adc	$0, R32(w0)
-	mul	v1
-	mov	$0, R32(w1)
-	add	%rax, w3
-	mov	24(up,i,8), %rax
-	mov	w2, 16(rp,i,8)
-	adc	%rdx, w0
-	add	$4, i
-	js	L(m2top0)
-
-	mul	v0
-	add	%rax, w3
-	mov	I(-8(up),-8(up,i,8)), %rax
-	mov	w3, I(-8(rp),-8(rp,i,8))
-	adc	%rdx, w0
-	adc	R32(w1), R32(w1)
-	mul	v1
-	add	%rax, w0
-	adc	%rdx, w1
-	mov	w0, I((rp),(rp,i,8))
-	mov	w1, I(8(rp),8(rp,i,8))
-
-	add	$-2, vn_param
-	jz	L(ret2)
-
-L(do_am0):
-	push	%r15
-	push	vn_param
-
-L(olo0):
-	mov	(vp), v0
-	mov	8(vp), v1
-	lea	16(vp), vp
-	lea	16(rp), rp
-	mov	(up,un,8), %rax
-C	lea	0(un), i
-	mov	un, i
-	mul	v0
-	mov	%rax, X0
-	mov	(up,un,8), %rax
-	MOV(	%rdx, X1, 2)
-	mul	v1
-	MOV(	%rdx, w0, 4)
-	mov	(rp,un,8), w2
-	mov	%rax, w3
-	jmp	L(lo0)
-
-	ALIGNx
-L(am2top0):
-	mul	v1
-	add	w0, w1
-	adc	%rax, w2
-	mov	(up,i,8), %rax
-	MOV(	%rdx, w3, 1)
-	adc	$0, w3
-	mul	v0
-	add	w1, X1
-	mov	X1, -8(rp,i,8)
-	adc	%rax, X0
-	MOV(	%rdx, X1, 2)
-	adc	$0, X1
-	mov	(up,i,8), %rax
-	mul	v1
-	MOV(	%rdx, w0, 4)
-	mov	(rp,i,8), w1
-	add	w1, w2
-	adc	%rax, w3
-	adc	$0, w0
-L(lo0):	mov	8(up,i,8), %rax
-	mul	v0
-	add	w2, X0
-	adc	%rax, X1
-	mov	X0, (rp,i,8)
-	MOV(	%rdx, X0, 8)
-	adc	$0, X0
-	mov	8(up,i,8), %rax
-	mov	8(rp,i,8), w2
-	mul	v1
-	add	w2, w3
-	adc	%rax, w0
-	MOV(	%rdx, w1, 16)
-	adc	$0, w1
-	mov	16(up,i,8), %rax
-	mul	v0
-	add	w3, X1
-	mov	X1, 8(rp,i,8)
-	adc	%rax, X0
-	MOV(	%rdx, X1, 32)
-	mov	16(rp,i,8), w3
-	adc	$0, X1
-	mov	16(up,i,8), %rax
-	mul	v1
-	add	w3, w0
-	MOV(	%rdx, w2, 64)
-	adc	%rax, w1
-	mov	24(up,i,8), %rax
-	adc	$0, w2
-	mul	v0
-	add	w0, X0
-	mov	X0, 16(rp,i,8)
-	MOV(	%rdx, X0, 128)
-	adc	%rax, X1
-	mov	24(up,i,8), %rax
-	mov	24(rp,i,8), w0
-	adc	$0, X0
-	add	$4, i
-	jnc	L(am2top0)
-
-	mul	v1
-	add	w0, w1
-	adc	%rax, w2
-	adc	Z(i,$0), %rdx
-	add	w1, X1
-	adc	Z(i,$0), X0
-	mov	X1, I(-8(rp),-8(rp,i,8))
-	add	w2, X0
-	mov	X0, I((rp),(rp,i,8))
-	adc	Z(i,$0), %rdx
-	mov	%rdx, I(8(rp),8(rp,i,8))
-
-	addl	$-2, vn
-	jnz	L(olo0)
-
-L(ret):	pop	%rax
-	pop	%r15
-L(ret2):pop	%r14
-	pop	%r13
-	pop	%r12
-	pop	%rbp
-	pop	%rbx
-	FUNC_EXIT()
-	ret
-
-
-	ALIGNx
-L(m2top1):
-	mul	v0
-	add	%rax, w3
-	mov	-8(up,i,8), %rax
-	mov	w3, -8(rp,i,8)
-	adc	%rdx, w0
-	adc	$0, R32(w1)
-L(m2e1):mul	v1
-	add	%rax, w0
-	adc	%rdx, w1
-	mov	$0, R32(w2)
-	mov	(up,i,8), %rax
-	mul	v0
-	add	%rax, w0
-	mov	w0, (rp,i,8)
-	adc	%rdx, w1
-	mov	(up,i,8), %rax
-	adc	$0, R32(w2)
-	mul	v1
-	add	%rax, w1
-	adc	%rdx, w2
-	mov	8(up,i,8), %rax
-	mul	v0
-	mov	$0, R32(w3)
-	add	%rax, w1
-	adc	%rdx, w2
-	adc	$0, R32(w3)
-	mov	8(up,i,8), %rax
-	mul	v1
-	add	%rax, w2
-	mov	w1, 8(rp,i,8)
-	adc	%rdx, w3
-	mov	$0, R32(w0)
-	mov	16(up,i,8), %rax
-	mul	v0
-	add	%rax, w2
-	mov	16(up,i,8), %rax
-	adc	%rdx, w3
-	adc	$0, R32(w0)
-	mul	v1
-	mov	$0, R32(w1)
-	add	%rax, w3
-	mov	24(up,i,8), %rax
-	mov	w2, 16(rp,i,8)
-	adc	%rdx, w0
-	add	$4, i
-	js	L(m2top1)
-
-	mul	v0
-	add	%rax, w3
-	mov	I(-8(up),-8(up,i,8)), %rax
-	mov	w3, I(-8(rp),-8(rp,i,8))
-	adc	%rdx, w0
-	adc	R32(w1), R32(w1)
-	mul	v1
-	add	%rax, w0
-	adc	%rdx, w1
-	mov	w0, I((rp),(rp,i,8))
-	mov	w1, I(8(rp),8(rp,i,8))
-
-	add	$-2, vn_param
-	jz	L(ret2)
-
-L(do_am1):
-	push	%r15
-	push	vn_param
-
-L(olo1):
-	mov	(vp), v0
-	mov	8(vp), v1
-	lea	16(vp), vp
-	lea	16(rp), rp
-	mov	(up,un,8), %rax
-	lea	1(un), i
-	mul	v0
-	mov	%rax, X1
-	MOV(	%rdx, X0, 128)
-	mov	(up,un,8), %rax
-	mov	(rp,un,8), w1
-	mul	v1
-	mov	%rax, w2
-	mov	8(up,un,8), %rax
-	MOV(	%rdx, w3, 1)
-	jmp	L(lo1)
-
-	ALIGNx
-L(am2top1):
-	mul	v1
-	add	w0, w1
-	adc	%rax, w2
-	mov	(up,i,8), %rax
-	MOV(	%rdx, w3, 1)
-	adc	$0, w3
-L(lo1):	mul	v0
-	add	w1, X1
-	mov	X1, -8(rp,i,8)
-	adc	%rax, X0
-	MOV(	%rdx, X1, 2)
-	adc	$0, X1
-	mov	(up,i,8), %rax
-	mul	v1
-	MOV(	%rdx, w0, 4)
-	mov	(rp,i,8), w1
-	add	w1, w2
-	adc	%rax, w3
-	adc	$0, w0
-	mov	8(up,i,8), %rax
-	mul	v0
-	add	w2, X0
-	adc	%rax, X1
-	mov	X0, (rp,i,8)
-	MOV(	%rdx, X0, 8)
-	adc	$0, X0
-	mov	8(up,i,8), %rax
-	mov	8(rp,i,8), w2
-	mul	v1
-	add	w2, w3
-	adc	%rax, w0
-	MOV(	%rdx, w1, 16)
-	adc	$0, w1
-	mov	16(up,i,8), %rax
-	mul	v0
-	add	w3, X1
-	mov	X1, 8(rp,i,8)
-	adc	%rax, X0
-	MOV(	%rdx, X1, 32)
-	mov	16(rp,i,8), w3
-	adc	$0, X1
-	mov	16(up,i,8), %rax
-	mul	v1
-	add	w3, w0
-	MOV(	%rdx, w2, 64)
-	adc	%rax, w1
-	mov	24(up,i,8), %rax
-	adc	$0, w2
-	mul	v0
-	add	w0, X0
-	mov	X0, 16(rp,i,8)
-	MOV(	%rdx, X0, 128)
-	adc	%rax, X1
-	mov	24(up,i,8), %rax
-	mov	24(rp,i,8), w0
-	adc	$0, X0
-	add	$4, i
-	jnc	L(am2top1)
-
-	mul	v1
-	add	w0, w1
-	adc	%rax, w2
-	adc	Z(i,$0), %rdx
-	add	w1, X1
-	adc	Z(i,$0), X0
-	mov	X1, I(-8(rp),-8(rp,i,8))
-	add	w2, X0
-	mov	X0, I((rp),(rp,i,8))
-	adc	Z(i,$0), %rdx
-	mov	%rdx, I(8(rp),8(rp,i,8))
-
-	addl	$-2, vn
-	jnz	L(olo1)
-
-	pop	%rax
-	pop	%r15
-	pop	%r14
-	pop	%r13
-	pop	%r12
-	pop	%rbp
-	pop	%rbx
-	FUNC_EXIT()
-	ret
-
-
-	ALIGNx
-L(m2top2):
-	mul	v0
-	add	%rax, w3
-	mov	-8(up,i,8), %rax
-	mov	w3, -8(rp,i,8)
-	adc	%rdx, w0
-	adc	$0, R32(w1)
-	mul	v1
-	add	%rax, w0
-	adc	%rdx, w1
-	mov	$0, R32(w2)
-	mov	(up,i,8), %rax
-	mul	v0
-	add	%rax, w0
-	mov	w0, (rp,i,8)
-	adc	%rdx, w1
-	mov	(up,i,8), %rax
-	adc	$0, R32(w2)
-	mul	v1
-	add	%rax, w1
-	adc	%rdx, w2
-	mov	8(up,i,8), %rax
-	mul	v0
-	mov	$0, R32(w3)
-	add	%rax, w1
-	adc	%rdx, w2
-	adc	$0, R32(w3)
-	mov	8(up,i,8), %rax
-	mul	v1
-	add	%rax, w2
-	mov	w1, 8(rp,i,8)
-	adc	%rdx, w3
-	mov	$0, R32(w0)
-	mov	16(up,i,8), %rax
-	mul	v0
-	add	%rax, w2
-	mov	16(up,i,8), %rax
-	adc	%rdx, w3
-	adc	$0, R32(w0)
-L(m2e2):mul	v1
-	mov	$0, R32(w1)
-	add	%rax, w3
-	mov	24(up,i,8), %rax
-	mov	w2, 16(rp,i,8)
-	adc	%rdx, w0
-	add	$4, i
-	js	L(m2top2)
-
-	mul	v0
-	add	%rax, w3
-	mov	I(-8(up),-8(up,i,8)), %rax
-	mov	w3, I(-8(rp),-8(rp,i,8))
-	adc	%rdx, w0
-	adc	R32(w1), R32(w1)
-	mul	v1
-	add	%rax, w0
-	adc	%rdx, w1
-	mov	w0, I((rp),(rp,i,8))
-	mov	w1, I(8(rp),8(rp,i,8))
-
-	add	$-2, vn_param
-	jz	L(ret2)
-
-L(do_am2):
-	push	%r15
-	push	vn_param
-
-L(olo2):
-	mov	(vp), v0
-	mov	8(vp), v1
-	lea	16(vp), vp
-	lea	16(rp), rp
-	mov	(up,un,8), %rax
-	lea	-2(un), i
-	mul	v0
-	mov	%rax, X0
-	MOV(	%rdx, X1, 32)
-	mov	(up,un,8), %rax
-	mov	(rp,un,8), w0
-	mul	v1
-	mov	%rax, w1
-	lea	(%rdx), w2
-	mov	8(up,un,8), %rax
-	jmp	L(lo2)
-
-	ALIGNx
-L(am2top2):
-	mul	v1
-	add	w0, w1
-	adc	%rax, w2
-	mov	(up,i,8), %rax
-	MOV(	%rdx, w3, 1)
-	adc	$0, w3
-	mul	v0
-	add	w1, X1
-	mov	X1, -8(rp,i,8)
-	adc	%rax, X0
-	MOV(	%rdx, X1, 2)
-	adc	$0, X1
-	mov	(up,i,8), %rax
-	mul	v1
-	MOV(	%rdx, w0, 4)
-	mov	(rp,i,8), w1
-	add	w1, w2
-	adc	%rax, w3
-	adc	$0, w0
-	mov	8(up,i,8), %rax
-	mul	v0
-	add	w2, X0
-	adc	%rax, X1
-	mov	X0, (rp,i,8)
-	MOV(	%rdx, X0, 8)
-	adc	$0, X0
-	mov	8(up,i,8), %rax
-	mov	8(rp,i,8), w2
-	mul	v1
-	add	w2, w3
-	adc	%rax, w0
-	MOV(	%rdx, w1, 16)
-	adc	$0, w1
-	mov	16(up,i,8), %rax
-	mul	v0
-	add	w3, X1
-	mov	X1, 8(rp,i,8)
-	adc	%rax, X0
-	MOV(	%rdx, X1, 32)
-	mov	16(rp,i,8), w3
-	adc	$0, X1
-	mov	16(up,i,8), %rax
-	mul	v1
-	add	w3, w0
-	MOV(	%rdx, w2, 64)
-	adc	%rax, w1
-	mov	24(up,i,8), %rax
-	adc	$0, w2
-L(lo2):	mul	v0
-	add	w0, X0
-	mov	X0, 16(rp,i,8)
-	MOV(	%rdx, X0, 128)
-	adc	%rax, X1
-	mov	24(up,i,8), %rax
-	mov	24(rp,i,8), w0
-	adc	$0, X0
-	add	$4, i
-	jnc	L(am2top2)
-
-	mul	v1
-	add	w0, w1
-	adc	%rax, w2
-	adc	Z(i,$0), %rdx
-	add	w1, X1
-	adc	Z(i,$0), X0
-	mov	X1, I(-8(rp),-8(rp,i,8))
-	add	w2, X0
-	mov	X0, I((rp),(rp,i,8))
-	adc	Z(i,$0), %rdx
-	mov	%rdx, I(8(rp),8(rp,i,8))
-
-	addl	$-2, vn
-	jnz	L(olo2)
-
-	pop	%rax
-	pop	%r15
-	pop	%r14
-	pop	%r13
-	pop	%r12
-	pop	%rbp
-	pop	%rbx
-	FUNC_EXIT()
-	ret
-
-
-	ALIGNx
-L(m2top3):
-	mul	v0
-	add	%rax, w3
-	mov	-8(up,i,8), %rax
-	mov	w3, -8(rp,i,8)
-	adc	%rdx, w0
-	adc	$0, R32(w1)
-	mul	v1
-	add	%rax, w0
-	adc	%rdx, w1
-	mov	$0, R32(w2)
-	mov	(up,i,8), %rax
-	mul	v0
-	add	%rax, w0
-	mov	w0, (rp,i,8)
-	adc	%rdx, w1
-	mov	(up,i,8), %rax
-	adc	$0, R32(w2)
-	mul	v1
-	add	%rax, w1
-	adc	%rdx, w2
-	mov	8(up,i,8), %rax
-	mul	v0
-	mov	$0, R32(w3)
-	add	%rax, w1
-	adc	%rdx, w2
-	adc	$0, R32(w3)
-	mov	8(up,i,8), %rax
-L(m2e3):mul	v1
-	add	%rax, w2
-	mov	w1, 8(rp,i,8)
-	adc	%rdx, w3
-	mov	$0, R32(w0)
-	mov	16(up,i,8), %rax
-	mul	v0
-	add	%rax, w2
-	mov	16(up,i,8), %rax
-	adc	%rdx, w3
-	adc	$0, R32(w0)
-	mul	v1
-	mov	$0, R32(w1)
-	add	%rax, w3
-	mov	24(up,i,8), %rax
-	mov	w2, 16(rp,i,8)
-	adc	%rdx, w0
-	add	$4, i
-	js	L(m2top3)
-
-	mul	v0
-	add	%rax, w3
-	mov	I(-8(up),-8(up,i,8)), %rax
-	mov	w3, I(-8(rp),-8(rp,i,8))
-	adc	%rdx, w0
-	adc	$0, R32(w1)
-	mul	v1
-	add	%rax, w0
-	adc	%rdx, w1
-	mov	w0, I((rp),(rp,i,8))
-	mov	w1, I(8(rp),8(rp,i,8))
-
-	add	$-2, vn_param
-	jz	L(ret2)
-
-L(do_am3):
-	push	%r15
-	push	vn_param
-
-L(olo3):
-	mov	(vp), v0
-	mov	8(vp), v1
-	lea	16(vp), vp
-	lea	16(rp), rp
-	mov	(up,un,8), %rax
-	lea	-1(un), i
-	mul	v0
-	mov	%rax, X1
-	MOV(	%rdx, X0, 8)
-	mov	(up,un,8), %rax
-	mov	(rp,un,8), w3
-	mul	v1
-	mov	%rax, w0
-	MOV(	%rdx, w1, 16)
-	mov	8(up,un,8), %rax
-	jmp	L(lo3)
-
-	ALIGNx
-L(am2top3):
-	mul	v1
-	add	w0, w1
-	adc	%rax, w2
-	mov	(up,i,8), %rax
-	MOV(	%rdx, w3, 1)
-	adc	$0, w3
-	mul	v0
-	add	w1, X1
-	mov	X1, -8(rp,i,8)
-	adc	%rax, X0
-	MOV(	%rdx, X1, 2)
-	adc	$0, X1
-	mov	(up,i,8), %rax
-	mul	v1
-	MOV(	%rdx, w0, 4)
-	mov	(rp,i,8), w1
-	add	w1, w2
-	adc	%rax, w3
-	adc	$0, w0
-	mov	8(up,i,8), %rax
-	mul	v0
-	add	w2, X0
-	adc	%rax, X1
-	mov	X0, (rp,i,8)
-	MOV(	%rdx, X0, 8)
-	adc	$0, X0
-	mov	8(up,i,8), %rax
-	mov	8(rp,i,8), w2
-	mul	v1
-	add	w2, w3
-	adc	%rax, w0
-	MOV(	%rdx, w1, 16)
-	adc	$0, w1
-	mov	16(up,i,8), %rax
-L(lo3):	mul	v0
-	add	w3, X1
-	mov	X1, 8(rp,i,8)
-	adc	%rax, X0
-	MOV(	%rdx, X1, 32)
-	mov	16(rp,i,8), w3
-	adc	$0, X1
-	mov	16(up,i,8), %rax
-	mul	v1
-	add	w3, w0
-	MOV(	%rdx, w2, 64)
-	adc	%rax, w1
-	mov	24(up,i,8), %rax
-	adc	$0, w2
-	mul	v0
-	add	w0, X0
-	mov	X0, 16(rp,i,8)
-	MOV(	%rdx, X0, 128)
-	adc	%rax, X1
-	mov	24(up,i,8), %rax
-	mov	24(rp,i,8), w0
-	adc	$0, X0
-	add	$4, i
-	jnc	L(am2top3)
-
-	mul	v1
-	add	w0, w1
-	adc	%rax, w2
-	adc	Z(i,$0), %rdx
-	add	w1, X1
-	adc	Z(i,$0), X0
-	mov	X1, I(-8(rp),-8(rp,i,8))
-	add	w2, X0
-	mov	X0, I((rp),(rp,i,8))
-	adc	Z(i,$0), %rdx
-	mov	%rdx, I(8(rp),8(rp,i,8))
-
-	addl	$-2, vn
-	jnz	L(olo3)
-
-	pop	%rax
-	pop	%r15
-	pop	%r14
-	pop	%r13
-	pop	%r12
-	pop	%rbp
-	pop	%rbx
-	FUNC_EXIT()
-	ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/mullo_basecase.asm b/gmp/mpn/x86_64/core2/mullo_basecase.asm
deleted file mode 100644
index 0f03d867f6..0000000000
--- a/gmp/mpn/x86_64/core2/mullo_basecase.asm
+++ /dev/null
@@ -1,427 +0,0 @@
-dnl  AMD64 mpn_mullo_basecase optimised for Conroe/Wolfdale/Nehalem/Westmere.
-
-dnl  Contributed to the GNU project by Torbjörn Granlund.
-
-dnl  Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb	mul_2		addmul_2
-C AMD K8,K9
-C AMD K10
-C AMD bull
-C AMD pile
-C AMD steam
-C AMD bobcat
-C AMD jaguar
-C Intel P4
-C Intel core	 4.0		4.18-4.25
-C Intel NHM	 3.75		4.06-4.2
-C Intel SBR
-C Intel IBR
-C Intel HWL
-C Intel BWL
-C Intel atom
-C VIA nano
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C TODO
-C   * Implement proper cor2, replacing current cor0.
-C   * Offset n by 2 in order to avoid the outer loop cmp.  (And sqr_basecase?)
-C   * Micro-optimise.
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-define(`rp',       `%rdi')
-define(`up',       `%rsi')
-define(`vp_param', `%rdx')
-define(`n_param',  `%rcx')
-
-define(`v0',       `%r10')
-define(`v1',       `%r11')
-define(`w0',       `%rbx')
-define(`w1',       `%rcx')
-define(`w2',       `%rbp')
-define(`w3',       `%r12')
-define(`n',        `%r9')
-define(`i',        `%r13')
-define(`vp',       `%r8')
-
-define(`X0',       `%r14')
-define(`X1',       `%r15')
-
-C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-define(`ALIGNx', `ALIGN(16)')
-
-define(`N', 85)
-ifdef(`N',,`define(`N',0)')
-define(`MOV', `ifelse(eval(N & $3),0,`mov	$1, $2',`lea	($1), $2')')
-
-ASM_START()
-	TEXT
-	ALIGN(32)
-PROLOGUE(mpn_mullo_basecase)
-	FUNC_ENTRY(4)
-
-	mov	(up), %rax
-	mov	vp_param, vp
-
-	cmp	$4, n_param
-	jb	L(small)
-
-	mov	(vp_param), v0
-	push	%rbx
-	lea	(rp,n_param,8), rp	C point rp at R[un]
-	push	%rbp
-	lea	(up,n_param,8), up	C point up right after U's end
-	push	%r12
-	mov	$0, R32(n)		C FIXME
-	sub	n_param, n
-	push	%r13
-	mul	v0
-	mov	8(vp), v1
-
-	test	$1, R8(n_param)
-	jnz	L(m2x1)
-
-L(m2x0):test	$2, R8(n_param)
-	jnz	L(m2b2)
-
-L(m2b0):lea	(n), i
-	mov	%rax, (rp,n,8)
-	mov	%rdx, w1
-	mov	(up,n,8), %rax
-	xor	R32(w2), R32(w2)
-	jmp	L(m2e0)
-
-L(m2b2):lea	-2(n), i
-	mov	%rax, w2
-	mov	(up,n,8), %rax
-	mov	%rdx, w3
-	xor	R32(w0), R32(w0)
-	jmp	L(m2e2)
-
-L(m2x1):test	$2, R8(n_param)
-	jnz	L(m2b3)
-
-L(m2b1):lea	1(n), i
-	mov	%rax, (rp,n,8)
-	mov	(up,n,8), %rax
-	mov	%rdx, w0
-	xor	R32(w1), R32(w1)
-	jmp	L(m2e1)
-
-L(m2b3):lea	-1(n), i
-	xor	R32(w3), R32(w3)
-	mov	%rax, w1
-	mov	%rdx, w2
-	mov	(up,n,8), %rax
-	jmp	L(m2e3)
-
-	ALIGNx
-L(m2tp):mul	v0
-	add	%rax, w3
-	mov	-8(up,i,8), %rax
-	mov	w3, -8(rp,i,8)
-	adc	%rdx, w0
-	adc	$0, R32(w1)
-L(m2e1):mul	v1
-	add	%rax, w0
-	adc	%rdx, w1
-	mov	$0, R32(w2)
-	mov	(up,i,8), %rax
-	mul	v0
-	add	%rax, w0
-	mov	w0, (rp,i,8)
-	adc	%rdx, w1
-	mov	(up,i,8), %rax
-	adc	$0, R32(w2)
-L(m2e0):mul	v1
-	add	%rax, w1
-	adc	%rdx, w2
-	mov	8(up,i,8), %rax
-	mul	v0
-	mov	$0, R32(w3)
-	add	%rax, w1
-	adc	%rdx, w2
-	adc	$0, R32(w3)
-	mov	8(up,i,8), %rax
-L(m2e3):mul	v1
-	add	%rax, w2
-	mov	w1, 8(rp,i,8)
-	adc	%rdx, w3
-	mov	$0, R32(w0)
-	mov	16(up,i,8), %rax
-	mul	v0
-	add	%rax, w2
-	mov	16(up,i,8), %rax
-	adc	%rdx, w3
-	adc	$0, R32(w0)
-L(m2e2):mul	v1
-	mov	$0, R32(w1)		C FIXME: dead in last iteration
-	add	%rax, w3
-	mov	24(up,i,8), %rax
-	mov	w2, 16(rp,i,8)
-	adc	%rdx, w0		C FIXME: dead in last iteration
-	add	$4, i
-	js	L(m2tp)
-
-L(m2ed):imul	v0, %rax
-	add	w3, %rax
-	mov	%rax, I(-8(rp),-8(rp,i,8))
-
-	add	$2, n
-	lea	16(vp), vp
-	lea	-16(up), up
-	cmp	$-2, n
-	jge	L(cor1)
-
-	push	%r14
-	push	%r15
-
-L(outer):
-	mov	(vp), v0
-	mov	8(vp), v1
-	mov	(up,n,8), %rax
-	mul	v0
-	test	$1, R8(n)
-	jnz	L(a1x1)
-
-L(a1x0):mov	%rax, X1
-	MOV(	%rdx, X0, 8)
-	mov	(up,n,8), %rax
-	mul	v1
-	test	$2, R8(n)
-	jnz	L(a110)
-
-L(a100):lea	(n), i
-	mov	(rp,n,8), w3
-	mov	%rax, w0
-	MOV(	%rdx, w1, 16)
-	jmp	L(lo0)
-
-L(a110):lea	2(n), i
-	mov	(rp,n,8), w1
-	mov	%rax, w2
-	mov	8(up,n,8), %rax
-	MOV(	%rdx, w3, 1)
-	jmp	L(lo2)
-
-L(a1x1):mov	%rax, X0
-	MOV(	%rdx, X1, 2)
-	mov	(up,n,8), %rax
-	mul	v1
-	test	$2, R8(n)
-	jz	L(a111)
-
-L(a101):lea	1(n), i
-	MOV(	%rdx, w0, 4)
-	mov	(rp,n,8), w2
-	mov	%rax, w3
-	jmp	L(lo1)
-
-L(a111):lea	-1(n), i
-	MOV(	%rdx, w2, 64)
-	mov	%rax, w1
-	mov	(rp,n,8), w0
-	mov	8(up,n,8), %rax
-	jmp	L(lo3)
-
-	ALIGNx
-L(top):	mul	v1
-	add	w0, w1
-	adc	%rax, w2
-	mov	-8(up,i,8), %rax
-	MOV(	%rdx, w3, 1)
-	adc	$0, w3
-L(lo2):	mul	v0
-	add	w1, X1
-	mov	X1, -16(rp,i,8)
-	adc	%rax, X0
-	MOV(	%rdx, X1, 2)
-	adc	$0, X1
-	mov	-8(up,i,8), %rax
-	mul	v1
-	MOV(	%rdx, w0, 4)
-	mov	-8(rp,i,8), w1
-	add	w1, w2
-	adc	%rax, w3
-	adc	$0, w0
-L(lo1):	mov	(up,i,8), %rax
-	mul	v0
-	add	w2, X0
-	adc	%rax, X1
-	mov	X0, -8(rp,i,8)
-	MOV(	%rdx, X0, 8)
-	adc	$0, X0
-	mov	(up,i,8), %rax
-	mov	(rp,i,8), w2
-	mul	v1
-	add	w2, w3
-	adc	%rax, w0
-	MOV(	%rdx, w1, 16)
-	adc	$0, w1
-L(lo0):	mov	8(up,i,8), %rax
-	mul	v0
-	add	w3, X1
-	mov	X1, (rp,i,8)
-	adc	%rax, X0
-	MOV(	%rdx, X1, 32)
-	mov	8(rp,i,8), w3
-	adc	$0, X1
-	mov	8(up,i,8), %rax
-	mul	v1
-	add	w3, w0
-	MOV(	%rdx, w2, 64)
-	adc	%rax, w1
-	mov	16(up,i,8), %rax
-	adc	$0, w2
-L(lo3):	mul	v0
-	add	w0, X0
-	mov	X0, 8(rp,i,8)
-	MOV(	%rdx, X0, 128)
-	adc	%rax, X1
-	mov	16(up,i,8), %rax
-	mov	16(rp,i,8), w0
-	adc	$0, X0
-	add	$4, i
-	jnc	L(top)
-
-L(end):	imul	v1, %rax
-	add	w0, w1
-	adc	%rax, w2
-	mov	I(-8(up),-8(up,i,8)), %rax
-	imul	v0, %rax
-	add	w1, X1
-	mov	X1, I(-16(rp),-16(rp,i,8))
-	adc	X0, %rax
-	mov	I(-8(rp),-8(rp,i,8)), w1
-	add	w1, w2
-	add	w2, %rax
-	mov	%rax, I(-8(rp),-8(rp,i,8))
-
-	add	$2, n
-	lea	16(vp), vp
-	lea	-16(up), up
-	cmp	$-2, n
-	jl	L(outer)
-
-	pop	%r15
-	pop	%r14
-
-	jnz	L(cor0)
-
-L(cor1):mov	(vp), v0
-	mov	8(vp), v1
-	mov	-16(up), %rax
-	mul	v0			C u0 x v2
-	add	-16(rp), %rax		C FIXME: rp[0] still available in reg?
-	adc	-8(rp), %rdx		C FIXME: rp[1] still available in reg?
-	mov	-8(up), %rbx
-	imul	v0, %rbx
-	mov	-16(up), %rcx
-	imul	v1, %rcx
-	mov	%rax, -16(rp)
-	add	%rbx, %rcx
-	add	%rdx, %rcx
-	mov	%rcx, -8(rp)
-	pop	%r13
-	pop	%r12
-	pop	%rbp
-	pop	%rbx
-	FUNC_EXIT()
-	ret
-
-L(cor0):mov	(vp), %r11
-	imul	-8(up), %r11
-	add	%rax, %r11
-	mov	%r11, -8(rp)
-	pop	%r13
-	pop	%r12
-	pop	%rbp
-	pop	%rbx
-	FUNC_EXIT()
-	ret
-
-	ALIGN(16)
-L(small):
-	cmp	$2, n_param
-	jae	L(gt1)
-L(n1):	imul	(vp_param), %rax
-	mov	%rax, (rp)
-	FUNC_EXIT()
-	ret
-L(gt1):	ja	L(gt2)
-L(n2):	mov	(vp_param), %r9
-	mul	%r9
-	mov	%rax, (rp)
-	mov	8(up), %rax
-	imul	%r9, %rax
-	add	%rax, %rdx
-	mov	8(vp), %r9
-	mov	(up), %rcx
-	imul	%r9, %rcx
-	add	%rcx, %rdx
-	mov	%rdx, 8(rp)
-	FUNC_EXIT()
-	ret
-L(gt2):
-L(n3):	mov	(vp_param), %r9
-	mul	%r9		C u0 x v0
-	mov	%rax, (rp)
-	mov	%rdx, %r10
-	mov	8(up), %rax
-	mul	%r9		C u1 x v0
-	imul	16(up), %r9	C u2 x v0
-	add	%rax, %r10
-	adc	%rdx, %r9
-	mov	8(vp), %r11
-	mov	(up), %rax
-	mul	%r11		C u0 x v1
-	add	%rax, %r10
-	adc	%rdx, %r9
-	imul	8(up), %r11	C u1 x v1
-	add	%r11, %r9
-	mov	%r10, 8(rp)
-	mov	16(vp), %r10
-	mov	(up), %rax
-	imul	%rax, %r10	C u0 x v2
-	add	%r10, %r9
-	mov	%r9, 16(rp)
-	FUNC_EXIT()
-	ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/popcount.asm b/gmp/mpn/x86_64/core2/popcount.asm
index e935cf1892..6c22999ff4 100644
--- a/gmp/mpn/x86_64/core2/popcount.asm
+++ b/gmp/mpn/x86_64/core2/popcount.asm
@@ -3,33 +3,21 @@ dnl  x86-64 mpn_popcount optimized for "Core 2".
 dnl  Copyright 2007 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
-dnl
+
 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 
 include(`../config.m4')
 
-MULFUNC_PROLOGUE(mpn_popcount)
 include_mpn(`x86/pentium4/sse2/popcount.asm')
diff --git a/gmp/mpn/x86_64/core2/redc_1.asm b/gmp/mpn/x86_64/core2/redc_1.asm
deleted file mode 100644
index d0e96ef1cb..0000000000
--- a/gmp/mpn/x86_64/core2/redc_1.asm
+++ /dev/null
@@ -1,425 +0,0 @@
-dnl  X86-64 mpn_redc_1 optimised for Intel Conroe and Wolfdale.
-
-dnl  Contributed to the GNU project by Torbjörn Granlund.
-
-dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C	     cycles/limb
-C AMD K8,K9	 ?
-C AMD K10	 ?
-C AMD bull	 ?
-C AMD pile	 ?
-C AMD steam	 ?
-C AMD bobcat	 ?
-C AMD jaguar	 ?
-C Intel P4	 ?
-C Intel core	 4.5  (fluctuating)
-C Intel NHM	 ?
-C Intel SBR	 ?
-C Intel IBR	 ?
-C Intel HWL	 ?
-C Intel BWL	 ?
-C Intel atom	 ?
-C VIA nano	 ?
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C TODO
-C  * Micro-optimise, none performed thus far.
-C  * Consider inlining mpn_add_n.
-C  * Single basecases out before the pushes.
-C  * Keep up[i] in registers for basecases (might require pushes).
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-define(`rp',          `%rdi')   C rcx
-define(`up',          `%rsi')   C rdx
-define(`mp_param',    `%rdx')   C r8
-define(`n',           `%rcx')   C r9
-define(`u0inv',       `%r8')    C stack
-
-define(`i',           `%r14')
-define(`j',           `%r15')
-define(`mp',          `%r12')
-define(`q0',          `%r13')
-
-C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
-C  X  q0'  n   X  rp  up      u0i           mp   q0 i   j
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-define(`ALIGNx', `ALIGN(16)')
-
-ASM_START()
-	TEXT
-	ALIGN(32)
-PROLOGUE(mpn_redc_1)
-	FUNC_ENTRY(4)
-IFDOS(`	mov	56(%rsp), %r8	')
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-
-	mov	(up), q0
-	mov	n, j			C outer loop induction var
-	lea	(mp_param,n,8), mp
-	lea	-16(up,n,8), up
-	neg	n
-	imul	u0inv, q0		C first iteration q0
-
-	test	$1, R8(n)
-	jz	L(b0)
-
-L(b1):	cmp	$-1, R32(n)
-	jz	L(n1)
-	cmp	$-3, R32(n)
-	jz	L(n3)
-
-	push	rp
-
-L(otp1):lea	3(n), i
-	mov	(mp,n,8), %rax
-	mul	q0
-	lea	(%rax), %rbp
-	mov	8(mp,n,8), %rax
-	lea	(%rdx), %r9
-	mul	q0
-	lea	(%rax), %r11
-	mov	16(mp,n,8), %rax
-	mov	16(up,n,8), %r10
-	lea	(%rdx), %rdi
-	mul	q0
-	add	%rbp, %r10
-	lea	(%rax), %rbp
-	mov	24(mp,n,8), %rax
-	adc	%r9, %r11
-	mov	24(up,n,8), %rbx
-	lea	(%rdx), %r9
-	adc	$0, %rdi
-	mul	q0
-	add	%r11, %rbx
-	lea	(%rax), %r11
-	mov	32(mp,n,8), %rax
-	adc	%rdi, %rbp
-	mov	%rbx, 24(up,n,8)
-	mov	32(up,n,8), %r10
-	lea	(%rdx), %rdi
-	adc	$0, %r9
-	imul	u0inv, %rbx		C next q limb
-	add	$2, i
-	jns	L(ed1)
-
-	ALIGNx
-L(tp1):	mul	q0
-	add	%rbp, %r10
-	lea	(%rax), %rbp
-	mov	(mp,i,8), %rax
-	adc	%r9, %r11
-	mov	%r10, -8(up,i,8)
-	mov	(up,i,8), %r10
-	lea	(%rdx), %r9
-	adc	$0, %rdi
-	mul	q0
-	add	%r11, %r10
-	lea	(%rax), %r11
-	mov	8(mp,i,8), %rax
-	adc	%rdi, %rbp
-	mov	%r10, (up,i,8)
-	mov	8(up,i,8), %r10
-	lea	(%rdx), %rdi
-	adc	$0, %r9
-	add	$2, i
-	js	L(tp1)
-
-L(ed1):	mul	q0
-	add	%rbp, %r10
-	adc	%r9, %r11
-	mov	%r10, I(-8(up),-8(up,i,8))
-	mov	I((up),(up,i,8)), %r10
-	adc	$0, %rdi
-	add	%r11, %r10
-	adc	%rdi, %rax
-	mov	%r10, I((up),(up,i,8))
-	mov	I(8(up),8(up,i,8)), %r10
-	adc	$0, %rdx
-	add	%rax, %r10
-	mov	%r10, I(8(up),8(up,i,8))
-	adc	$0, %rdx
-	mov	%rdx, 16(up,n,8)	C up[0]
-	mov	%rbx, q0		C previously computed q limb -> q0
-	lea	8(up), up		C up++
-	dec	j
-	jnz	L(otp1)
-	jmp	L(cj)
-
-L(b0):	cmp	$-2, R32(n)
-	jz	L(n2)
-	cmp	$-4, R32(n)
-	jz	L(n4)
-
-	push	rp
-
-L(otp0):lea	4(n), i
-	mov	(mp,n,8), %rax
-	mul	q0
-	lea	(%rax), %r11
-	mov	8(mp,n,8), %rax
-	lea	(%rdx), %rdi
-	mul	q0
-	lea	(%rax), %rbp
-	mov	16(mp,n,8), %rax
-	mov	16(up,n,8), %r10
-	lea	(%rdx), %r9
-	mul	q0
-	add	%r11, %r10
-	lea	(%rax), %r11
-	mov	24(mp,n,8), %rax
-	adc	%rdi, %rbp
-	mov	24(up,n,8), %rbx
-	lea	(%rdx), %rdi
-	adc	$0, %r9
-	mul	q0
-	add	%rbp, %rbx
-	lea	(%rax), %rbp
-	mov	32(mp,n,8), %rax
-	adc	%r9, %r11
-	mov	%rbx, 24(up,n,8)
-	mov	32(up,n,8), %r10
-	lea	(%rdx), %r9
-	adc	$0, %rdi
-	imul	u0inv, %rbx		C next q limb
-	jmp	L(e0)
-
-	ALIGNx
-L(tp0):	mul	q0
-	add	%rbp, %r10
-	lea	(%rax), %rbp
-	mov	(mp,i,8), %rax
-	adc	%r9, %r11
-	mov	%r10, -8(up,i,8)
-	mov	(up,i,8), %r10
-	lea	(%rdx), %r9
-	adc	$0, %rdi
-L(e0):	mul	q0
-	add	%r11, %r10
-	lea	(%rax), %r11
-	mov	8(mp,i,8), %rax
-	adc	%rdi, %rbp
-	mov	%r10, (up,i,8)
-	mov	8(up,i,8), %r10
-	lea	(%rdx), %rdi
-	adc	$0, %r9
-	add	$2, i
-	js	L(tp0)
-
-L(ed0):	mul	q0
-	add	%rbp, %r10
-	adc	%r9, %r11
-	mov	%r10, I(-8(up),-8(up,i,8))
-	mov	I((up),(up,i,8)), %r10
-	adc	$0, %rdi
-	add	%r11, %r10
-	adc	%rdi, %rax
-	mov	%r10, I((up),(up,i,8))
-	mov	I(8(up),8(up,i,8)), %r10
-	adc	$0, %rdx
-	add	%rax, %r10
-	mov	%r10, I(8(up),8(up,i,8))
-	adc	$0, %rdx
-	mov	%rdx, 16(up,n,8)	C up[0]
-	mov	%rbx, q0		C previously computed q limb -> q0
-	lea	8(up), up		C up++
-	dec	j
-	jnz	L(otp0)
-
-L(cj):	lea	16(up), up		C FIXME
-	pop	rp
-L(add_n):
-IFSTD(`	lea	(up,n,8), up		C param 2: up
-	lea	(up,n,8), %rdx		C param 3: up - n
-	neg	R32(n)		')	C param 4: n
-
-IFDOS(`	lea	(up,n,8), %rdx		C param 2: up
-	lea	(%rdx,n,8), %r8		C param 3: up - n
-	neg	R32(n)
-	mov	n, %r9			C param 4: n
-	mov	rp, %rcx	')	C param 1: rp
-
-	CALL(	mpn_add_n)
-
-L(ret):	pop	%r15
-	pop	%r14
-	pop	%r13
-	pop	%r12
-	pop	%rbp
-	pop	%rbx
-	FUNC_EXIT()
-	ret
-
-L(n1):	mov	(mp_param), %rax
-	mul	q0
-	add	8(up), %rax
-	adc	16(up), %rdx
-	mov	%rdx, (rp)
-	mov	$0, R32(%rax)
-	adc	R32(%rax), R32(%rax)
-	jmp	L(ret)
-
-L(n2):	mov	(mp_param), %rax
-	mov	(up), %rbp
-	mul	q0
-	add	%rax, %rbp
-	mov	%rdx, %r9
-	adc	$0, %r9
-	mov	-8(mp), %rax
-	mov	8(up), %r10
-	mul	q0
-	add	%rax, %r10
-	mov	%rdx, %r11
-	adc	$0, %r11
-	add	%r9, %r10
-	adc	$0, %r11
-	mov	%r10, q0
-	imul	u0inv, q0		C next q0
-	mov	-16(mp), %rax
-	mul	q0
-	add	%rax, %r10
-	mov	%rdx, %r9
-	adc	$0, %r9
-	mov	-8(mp), %rax
-	mov	16(up), %r14
-	mul	q0
-	add	%rax, %r14
-	adc	$0, %rdx
-	add	%r9, %r14
-	adc	$0, %rdx
-	xor	R32(%rax), R32(%rax)
-	add	%r11, %r14
-	adc	24(up), %rdx
-	mov	%r14, (rp)
-	mov	%rdx, 8(rp)
-	adc	R32(%rax), R32(%rax)
-	jmp	L(ret)
-
-	ALIGNx
-L(n3):	mov	-24(mp), %rax
-	mov	-8(up), %r10
-	mul	q0
-	add	%rax, %r10
-	mov	-16(mp), %rax
-	mov	%rdx, %r11
-	adc	$0, %r11
-	mov	(up), %rbp
-	mul	q0
-	add	%rax, %rbp
-	mov	%rdx, %r9
-	adc	$0, %r9
-	mov	-8(mp), %rax
-	add	%r11, %rbp
-	mov	8(up), %r10
-	adc	$0, %r9
-	mul	q0
-	mov	%rbp, q0
-	imul	u0inv, q0		C next q0
-	add	%rax, %r10
-	mov	%rdx, %r11
-	adc	$0, %r11
-	mov	%rbp, (up)
-	add	%r9, %r10
-	adc	$0, %r11
-	mov	%r10, 8(up)
-	mov	%r11, -8(up)		C up[0]
-	lea	8(up), up		C up++
-	dec	j
-	jnz	L(n3)
-
-	mov	-32(up), %rdx
-	mov	-24(up), %rbx
-	xor	R32(%rax), R32(%rax)
-	add	%rbp, %rdx
-	adc	%r10, %rbx
-	adc	8(up), %r11
-	mov	%rdx, (rp)
-	mov	%rbx, 8(rp)
-	mov	%r11, 16(rp)
-	adc	R32(%rax), R32(%rax)
-	jmp	L(ret)
-
-	ALIGNx
-L(n4):	mov	-32(mp), %rax
-	mul	q0
-	lea	(%rax), %r11
-	mov	-24(mp), %rax
-	lea	(%rdx), %r14
-	mul	q0
-	lea	(%rax), %rbp
-	mov	-16(mp), %rax
-	mov	-16(up), %r10
-	lea	(%rdx), %r9
-	mul	q0
-	add	%r11, %r10
-	lea	(%rax), %r11
-	mov	-8(mp), %rax
-	adc	%r14, %rbp
-	mov	-8(up), %rbx
-	lea	(%rdx), %r14
-	adc	$0, %r9
-	mul	q0
-	add	%rbp, %rbx
-	adc	%r9, %r11
-	mov	%rbx, -8(up)
-	mov	(up), %r10
-	adc	$0, %r14
-	imul	u0inv, %rbx		C next q limb
-	add	%r11, %r10
-	adc	%r14, %rax
-	mov	%r10, (up)
-	mov	8(up), %r10
-	adc	$0, %rdx
-	add	%rax, %r10
-	mov	%r10, 8(up)
-	adc	$0, %rdx
-	mov	%rdx, -16(up)		C up[0]
-	mov	%rbx, q0		C previously computed q limb -> q0
-	lea	8(up), up		C up++
-	dec	j
-	jnz	L(n4)
-	lea	16(up), up
-	jmp	L(add_n)
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86_64/core2/rsh1aors_n.asm b/gmp/mpn/x86_64/core2/rsh1aors_n.asm
deleted file mode 100644
index 27eed3712d..0000000000
--- a/gmp/mpn/x86_64/core2/rsh1aors_n.asm
+++ /dev/null
@@ -1,169 +0,0 @@
-dnl  X86-64 mpn_rsh1add_n, mpn_rsh1sub_n optimised for Intel Conroe/Penryn.
-
-dnl  Copyright 2003, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C	     cycles/limb
-C AMD K8,K9	 ?
-C AMD K10	 ?
-C Intel P4	 ?
-C Intel core2	 3.05
-C Intel NHM	 3.3
-C Intel SBR	 2.5
-C Intel atom	 ?
-C VIA nano	 ?
-
-C TODO
-C  * Loopmix to approach 2.5 c/l on NHM.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp', `%rdx')
-define(`n',  `%rcx')
-
-ifdef(`OPERATION_rsh1add_n', `
-	define(ADDSUB,	      add)
-	define(ADCSBB,	      adc)
-	define(func_n,	      mpn_rsh1add_n)
-	define(func_nc,	      mpn_rsh1add_nc)')
-ifdef(`OPERATION_rsh1sub_n', `
-	define(ADDSUB,	      sub)
-	define(ADCSBB,	      sbb)
-	define(func_n,	      mpn_rsh1sub_n)
-	define(func_nc,	      mpn_rsh1sub_nc)')
-
-MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
-	TEXT
-	ALIGN(16)
-PROLOGUE(func_nc)
-	FUNC_ENTRY(4)
-IFDOS(`	mov	56(%rsp), %r8	')
-	push	%rbx
-	push	%rbp
-
-	neg	%r8			C set C flag from parameter
-	mov	(up), %r8
-	ADCSBB	(vp), %r8
-	jmp	L(ent)
-EPILOGUE()
-
-	ALIGN(16)
-PROLOGUE(func_n)
-	FUNC_ENTRY(4)
-	push	%rbx
-	push	%rbp
-
-	mov	(up), %r8
-	ADDSUB	(vp), %r8
-L(ent):	sbb	R32(%rbx), R32(%rbx)	C save cy
-	mov	%r8, %rax
-	and	$1, R32(%rax)		C return value
-
-	lea	(up,n,8), up
-	lea	(vp,n,8), vp
-	lea	(rp,n,8), rp
-	mov	R32(n), R32(%rbp)
-	neg	n
-	and	$3, R32(%rbp)
-	jz	L(b0)
-	cmp	$2, R32(%rbp)
-	jae	L(n1)
-
-L(b1):	mov	%r8, %rbp
-	inc	n
-	js	L(top)
-	jmp	L(end)
-
-L(n1):	jnz	L(b3)
-	add	R32(%rbx), R32(%rbx)	C restore cy
-	mov	8(up,n,8), %r11
-	ADCSBB	8(vp,n,8), %r11
-	sbb	R32(%rbx), R32(%rbx)	C save cy
-	mov	%r8, %r10
-	add	$-2, n
-	jmp	L(2)
-
-L(b3):	add	R32(%rbx), R32(%rbx)	C restore cy
-	mov	8(up,n,8), %r10
-	mov	16(up,n,8), %r11
-	ADCSBB	8(vp,n,8), %r10
-	ADCSBB	16(vp,n,8), %r11
-	sbb	R32(%rbx), R32(%rbx)	C save cy
-	mov	%r8, %r9
-	dec	n
-	jmp	L(3)
-
-L(b0):	add	R32(%rbx), R32(%rbx)	C restore cy
-	mov	8(up,n,8), %r9
-	mov	16(up,n,8), %r10
-	mov	24(up,n,8), %r11
-	ADCSBB	8(vp,n,8), %r9
-	ADCSBB	16(vp,n,8), %r10
-	ADCSBB	24(vp,n,8), %r11
-	sbb	R32(%rbx), R32(%rbx)	C save cy
-	jmp	L(4)
-
-	ALIGN(16)
-
-L(top):	add	R32(%rbx), R32(%rbx)	C restore cy
-	mov	(up,n,8), %r8
-	mov	8(up,n,8), %r9
-	mov	16(up,n,8), %r10
-	mov	24(up,n,8), %r11
-	ADCSBB	(vp,n,8), %r8
-	ADCSBB	8(vp,n,8), %r9
-	ADCSBB	16(vp,n,8), %r10
-	ADCSBB	24(vp,n,8), %r11
-	sbb	R32(%rbx), R32(%rbx)	C save cy
-	shrd	$1, %r8, %rbp
-	mov	%rbp, -8(rp,n,8)
-L(4):	shrd	$1, %r9, %r8
-	mov	%r8, (rp,n,8)
-L(3):	shrd	$1, %r10, %r9
-	mov	%r9, 8(rp,n,8)
-L(2):	shrd	$1, %r11, %r10
-	mov	%r10, 16(rp,n,8)
-L(1):	add	$4, n
-	mov	%r11, %rbp
-	js	L(top)
-
-L(end):	shrd	$1, %rbx, %rbp
-	mov	%rbp, -8(rp)
-	pop	%rbp
-	pop	%rbx
-	FUNC_EXIT()
-	ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/rshift.asm b/gmp/mpn/x86_64/core2/rshift.asm
index ab32ec85df..9a3fc46f9a 100644
--- a/gmp/mpn/x86_64/core2/rshift.asm
+++ b/gmp/mpn/x86_64/core2/rshift.asm
@@ -1,69 +1,50 @@
 dnl  x86-64 mpn_rshift optimized for "Core 2".
 
-dnl  Copyright 2007, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
+dnl  Copyright 2007 Free Software Foundation, Inc.
 dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
+dnl  This file is part of the GNU MP Library.
 dnl
-dnl  or both in parallel, as here.
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
 dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
 dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
 
 C	     cycles/limb
-C AMD K8,K9	 4.25
-C AMD K10	 4.25
-C Intel P4	14.7
-C Intel core2	 1.27
-C Intel NHM	 1.375	(up to about n = 260, then 1.5)
-C Intel SBR	 1.77
-C Intel atom	 ?
-C VIA nano	 ?
+C K8,K9:	 4.25
+C K10:		 4.25
+C P4:		14.7
+C P6-15:	 1.27
 
 
 C INPUT PARAMETERS
 define(`rp',	`%rdi')
 define(`up',	`%rsi')
 define(`n',	`%rdx')
-define(`cnt',	`%rcx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
+define(`cnt',	`%cl')
 
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_rshift)
-	FUNC_ENTRY(4)
-	mov	R32(%rdx), R32(%rax)
-	and	$3, R32(%rax)
+	mov	%edx, %eax
+	and	$3, %eax
 	jne	L(nb00)
 L(b00):	C n = 4, 8, 12, ...
 	mov	(up), %r10
 	mov	8(up), %r11
-	xor	R32(%rax), R32(%rax)
-	shrd	R8(cnt), %r10, %rax
+	xor	%eax, %eax
+	shrd	%cl, %r10, %rax
 	mov	16(up), %r8
 	lea	8(up), up
 	lea	-24(rp), rp
@@ -71,11 +52,11 @@ L(b00):	C n = 4, 8, 12, ...
 	jmp	L(00)
 
 L(nb00):C n = 1, 5, 9, ...
-	cmp	$2, R32(%rax)
+	cmp	$2, %eax
 	jae	L(nb01)
 L(b01):	mov	(up), %r9
-	xor	R32(%rax), R32(%rax)
-	shrd	R8(cnt), %r9, %rax
+	xor	%eax, %eax
+	shrd	%cl, %r9, %rax
 	sub	$2, n
 	jb	L(le1)
 	mov	8(up), %r10
@@ -83,65 +64,62 @@ L(b01):	mov	(up), %r9
 	lea	16(up), up
 	lea	-16(rp), rp
 	jmp	L(01)
-L(le1):	shr	R8(cnt), %r9
+L(le1):	shr	%cl, %r9
 	mov	%r9, (rp)
-	FUNC_EXIT()
 	ret
 
 L(nb01):C n = 2, 6, 10, ...
 	jne	L(b11)
 L(b10):	mov	(up), %r8
 	mov	8(up), %r9
-	xor	R32(%rax), R32(%rax)
-	shrd	R8(cnt), %r8, %rax
+	xor	%eax, %eax
+	shrd	%cl, %r8, %rax
 	sub	$3, n
 	jb	L(le2)
 	mov	16(up), %r10
 	lea	24(up), up
 	lea	-8(rp), rp
 	jmp	L(10)
-L(le2):	shrd	R8(cnt), %r9, %r8
+L(le2):	shrd	%cl, %r9, %r8
 	mov	%r8, (rp)
-	shr	R8(cnt), %r9
+	shr	%cl, %r9
 	mov	%r9, 8(rp)
-	FUNC_EXIT()
 	ret
 
 	ALIGN(16)
 L(b11):	C n = 3, 7, 11, ...
 	mov	(up), %r11
 	mov	8(up), %r8
-	xor	R32(%rax), R32(%rax)
-	shrd	R8(cnt), %r11, %rax
+	xor	%eax, %eax
+	shrd	%cl, %r11, %rax
 	mov	16(up), %r9
 	lea	32(up), up
 	sub	$4, n
 	jb	L(end)
 
 	ALIGN(16)
-L(top):	shrd	R8(cnt), %r8, %r11
+L(top):	shrd	%cl, %r8, %r11
 	mov	-8(up), %r10
 	mov	%r11, (rp)
-L(10):	shrd	R8(cnt), %r9, %r8
+L(10):	shrd	%cl, %r9, %r8
 	mov	(up), %r11
 	mov	%r8, 8(rp)
-L(01):	shrd	R8(cnt), %r10, %r9
+L(01):	shrd	%cl, %r10, %r9
 	mov	8(up), %r8
 	mov	%r9, 16(rp)
-L(00):	shrd	R8(cnt), %r11, %r10
+L(00):	shrd	%cl, %r11, %r10
 	mov	16(up), %r9
+	lea	32(up), up
 	mov	%r10, 24(rp)
-	add	$32, up
 	lea	32(rp), rp
 	sub	$4, n
 	jnc	L(top)
 
-L(end):	shrd	R8(cnt), %r8, %r11
+L(end):	shrd	%cl, %r8, %r11
 	mov	%r11, (rp)
-	shrd	R8(cnt), %r9, %r8
+	shrd	%cl, %r9, %r8
 	mov	%r8, 8(rp)
-	shr	R8(cnt), %r9
+	shr	%cl, %r9
 	mov	%r9, 16(rp)
-	FUNC_EXIT()
 	ret
 EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/sec_tabselect.asm b/gmp/mpn/x86_64/core2/sec_tabselect.asm
deleted file mode 100644
index e4360341d9..0000000000
--- a/gmp/mpn/x86_64/core2/sec_tabselect.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl  X86-64 mpn_sec_tabselect.
-
-dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_sec_tabselect)
-include_mpn(`x86_64/fastsse/sec_tabselect.asm')
diff --git a/gmp/mpn/x86_64/core2/sqr_basecase.asm b/gmp/mpn/x86_64/core2/sqr_basecase.asm
deleted file mode 100644
index a112c1b52e..0000000000
--- a/gmp/mpn/x86_64/core2/sqr_basecase.asm
+++ /dev/null
@@ -1,984 +0,0 @@
-dnl  X86-64 mpn_sqr_basecase optimised for Intel Nehalem/Westmere.
-dnl  It also seems good for Conroe/Wolfdale.
-
-dnl  Contributed to the GNU project by Torbjörn Granlund.
-
-dnl  Copyright 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb	mul_2		addmul_2	sqr_diag_addlsh1
-C AMD K8,K9
-C AMD K10
-C AMD bull
-C AMD pile
-C AMD steam
-C AMD bobcat
-C AMD jaguar
-C Intel P4
-C Intel core	 4.9		4.18-4.25		 3.87
-C Intel NHM	 3.8		4.06-4.2		 3.5
-C Intel SBR
-C Intel IBR
-C Intel HWL
-C Intel BWL
-C Intel atom
-C VIA nano
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C Code structure:
-C
-C
-C        m_2(0m4)        m_2(2m4)        m_2(1m4)        m_2(3m4)
-C           |               |               |               |
-C           |               |               |               |
-C           |               |               |               |
-C          \|/             \|/             \|/             \|/
-C              ____________                   ____________
-C             /            \                 /            \
-C            \|/            \               \|/            \
-C         am_2(3m4)       am_2(1m4)       am_2(0m4)       am_2(2m4)
-C            \            /|\                \            /|\
-C             \____________/                  \____________/
-C                       \                        /
-C                        \                      /
-C                         \                    /
-C                       tail(0m2)          tail(1m2)
-C                            \              /
-C                             \            /
-C                            sqr_diag_addlsh1
-
-C TODO
-C  * Tune.  None done so far.
-C  * Currently 2761 bytes, making it smaller would be nice.
-C  * Consider using a jumptab-based entry sequence.  One might even use a mask-
-C    less sequence, if the table is large enough to support tuneup's needs.
-C    The code would be, using non-PIC code,
-C        lea tab(%rip),%rax; jmp *(n,%rax)
-C    or,
-C        lea tab(%rip),%rax; lea (%rip),%rbx; add (n,%rax),%rbx; jmp *%rbx
-C    using PIC code.  The table entries would be Ln1,Ln2,Ln3,Lm0,Lm1,Lm2,Lm3,..
-C    with the last four entries repeated a safe number of times.
-C  * Consider expanding feed-in code in order to avoid zeroing registers.
-C  * Zero consistently with xor.
-C  * Check if using "lea (reg),reg" should be done in more places; we have some
-C    explicit "mov %rax,reg" now.
-C  * Try zeroing with xor in m2 loops.
-C  * Try re-rolling the m2 loops to avoid the current 9 insn code duplication
-C    between loop header and wind-down code.
-C  * Consider adc reg,reg instead of adc $0,reg in m2 loops.  This save a byte.
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-C Define this to $1 to use late loop index variable as zero, $2 to use an
-C explicit $0.
-define(`Z',`$1')
-
-define(`rp',       `%rdi')
-define(`up',       `%rsi')
-define(`n_param',  `%rdx')
-
-define(`n',        `%r8')
-
-define(`v0',       `%r10')
-define(`v1',       `%r11')
-define(`w0',       `%rbx')
-define(`w1',       `%rcx')
-define(`w2',       `%rbp')
-define(`w3',       `%r9')
-define(`i',        `%r13')
-
-define(`X0',       `%r12')
-define(`X1',       `%r14')
-
-C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-define(`ALIGNx', `ALIGN(16)')
-
-define(`N', 85)
-ifdef(`N',,`define(`N',0)')
-define(`MOV', `ifelse(eval(N & $3),0,`mov	$1, $2',`lea	($1), $2')')
-
-ASM_START()
-	TEXT
-	ALIGN(32)
-PROLOGUE(mpn_sqr_basecase)
-	FUNC_ENTRY(3)
-
-	cmp	$4, n_param
-	jl	L(small)
-
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-
-	mov	(up), v0
-	mov	8(up), %rax
-	mov	%rax, v1
-
-	mov	$1, R32(n)
-	sub	n_param, n		C n = -n_param+1
-	push	n
-
-	lea	(up,n_param,8), up
-	lea	(rp,n_param,8), rp
-
-	mul	v0
-
-	test	$1, R8(n)
-	jnz	L(bx1)
-
-L(bx0):	test	$2, R8(n)
-	mov	%rax, (rp,n,8)
-	jnz	L(b10)
-
-L(b00):	lea	(n), i			C n = 5, 9, ...
-	mov	%rdx, w1		C FIXME: Use lea?
-	xor	R32(w2), R32(w2)
-	jmp	L(m2e0)
-
-L(b10):	lea	2(n), i			C n = 7, 11, ...
-	mov	8(up,n,8), %rax
-	mov	%rdx, w3		C FIXME: Use lea?
-	xor	R32(w0), R32(w0)
-	xor	R32(w1), R32(w1)
-	jmp	L(m2e2)
-
-L(bx1):	test	$2, R8(n)
-	mov	%rax, (rp,n,8)
-	jz	L(b11)
-
-L(b01):	lea	1(n), i			C n = 6, 10, ...
-	mov	%rdx, w0		C FIXME: Use lea?
-	xor	R32(w1), R32(w1)
-	jmp	L(m2e1)
-
-L(b11):	lea	-1(n), i		C n = 4, 8, 12, ...
-	mov	%rdx, w2		C FIXME: Use lea?
-	xor	R32(w3), R32(w3)
-	jmp	L(m2e3)
-
-
-	ALIGNx
-L(m2top1):
-	mul	v0
-	add	%rax, w3
-	mov	-8(up,i,8), %rax
-	mov	w3, -8(rp,i,8)
-	adc	%rdx, w0
-	adc	$0, R32(w1)
-	mul	v1
-	add	%rax, w0
-	adc	%rdx, w1
-L(m2e1):mov	$0, R32(w2)
-	mov	(up,i,8), %rax
-	mul	v0
-	add	%rax, w0
-	mov	w0, (rp,i,8)
-	adc	%rdx, w1
-	mov	(up,i,8), %rax
-	adc	$0, R32(w2)
-	mul	v1
-	add	%rax, w1
-	adc	%rdx, w2
-	mov	8(up,i,8), %rax
-	mul	v0
-	mov	$0, R32(w3)
-	add	%rax, w1
-	adc	%rdx, w2
-	adc	$0, R32(w3)
-	mov	8(up,i,8), %rax
-	mul	v1
-	add	%rax, w2
-	mov	w1, 8(rp,i,8)
-	adc	%rdx, w3
-	mov	$0, R32(w0)
-	mov	16(up,i,8), %rax
-	mul	v0
-	add	%rax, w2
-	mov	16(up,i,8), %rax
-	adc	%rdx, w3
-	adc	$0, R32(w0)
-	mul	v1
-	mov	$0, R32(w1)
-	add	%rax, w3
-	mov	24(up,i,8), %rax
-	mov	w2, 16(rp,i,8)
-	adc	%rdx, w0
-	add	$4, i
-	js	L(m2top1)
-
-	mul	v0
-	add	%rax, w3
-	mov	I(-8(up),-8(up,i,8)), %rax
-	mov	w3, I(-8(rp),-8(rp,i,8))
-	adc	%rdx, w0
-	adc	R32(w1), R32(w1)
-	mul	v1
-	add	w0, %rax
-	adc	w1, %rdx
-	mov	%rax, I((rp),(rp,i,8))
-	mov	%rdx, I(8(rp),8(rp,i,8))
-
-	lea	16(rp), rp
-	add	$2, n			C decrease |n|
-	jmp	L(am2o3)
-
-	ALIGNx
-L(m2top3):
-	mul	v0
-	add	%rax, w3
-	mov	-8(up,i,8), %rax
-	mov	w3, -8(rp,i,8)
-	adc	%rdx, w0
-	adc	$0, R32(w1)
-	mul	v1
-	add	%rax, w0
-	adc	%rdx, w1
-	mov	$0, R32(w2)
-	mov	(up,i,8), %rax
-	mul	v0
-	add	%rax, w0
-	mov	w0, (rp,i,8)
-	adc	%rdx, w1
-	mov	(up,i,8), %rax
-	adc	$0, R32(w2)
-	mul	v1
-	add	%rax, w1
-	adc	%rdx, w2
-	mov	8(up,i,8), %rax
-	mul	v0
-	mov	$0, R32(w3)
-	add	%rax, w1
-	adc	%rdx, w2
-	adc	$0, R32(w3)
-	mov	8(up,i,8), %rax
-	mul	v1
-	add	%rax, w2
-	mov	w1, 8(rp,i,8)
-	adc	%rdx, w3
-L(m2e3):mov	$0, R32(w0)
-	mov	16(up,i,8), %rax
-	mul	v0
-	add	%rax, w2
-	mov	16(up,i,8), %rax
-	adc	%rdx, w3
-	adc	$0, R32(w0)
-	mul	v1
-	mov	$0, R32(w1)
-	add	%rax, w3
-	mov	24(up,i,8), %rax
-	mov	w2, 16(rp,i,8)
-	adc	%rdx, w0
-	add	$4, i
-	js	L(m2top3)
-
-	mul	v0
-	add	%rax, w3
-	mov	I(-8(up),-8(up,i,8)), %rax
-	mov	w3, I(-8(rp),-8(rp,i,8))
-	adc	%rdx, w0
-	adc	R32(w1), R32(w1)
-	mul	v1
-	add	w0, %rax
-	adc	w1, %rdx
-	mov	%rax, I((rp),(rp,i,8))
-	mov	%rdx, I(8(rp),8(rp,i,8))
-
-	lea	16(rp), rp
-	add	$2, n			C decrease |n|
-	cmp	$-1, n
-	jz	L(cor1)			C jumps iff entry n = 4
-
-L(am2o1):
-	mov	-8(up,n,8), v0
-	mov	(up,n,8), %rax
-	mov	%rax, v1
-	lea	1(n), i
-	mul	v0
-	mov	%rax, X1
-	MOV(	%rdx, X0, 128)
-	mov	(rp,n,8), w1
-	xor	R32(w2), R32(w2)
-	mov	8(up,n,8), %rax
-	xor	R32(w3), R32(w3)
-	jmp	L(lo1)
-
-	ALIGNx
-L(am2top1):
-	mul	v1
-	add	w0, w1
-	adc	%rax, w2
-	mov	(up,i,8), %rax
-	MOV(	%rdx, w3, 1)
-	adc	$0, w3
-L(lo1):	mul	v0
-	add	w1, X1
-	mov	X1, -8(rp,i,8)
-	adc	%rax, X0
-	MOV(	%rdx, X1, 2)
-	adc	$0, X1
-	mov	(up,i,8), %rax
-	mul	v1
-	MOV(	%rdx, w0, 4)
-	mov	(rp,i,8), w1
-	add	w1, w2
-	adc	%rax, w3
-	adc	$0, w0
-	mov	8(up,i,8), %rax
-	mul	v0
-	add	w2, X0
-	adc	%rax, X1
-	mov	X0, (rp,i,8)
-	MOV(	%rdx, X0, 8)
-	adc	$0, X0
-	mov	8(up,i,8), %rax
-	mov	8(rp,i,8), w2
-	mul	v1
-	add	w2, w3
-	adc	%rax, w0
-	MOV(	%rdx, w1, 16)
-	adc	$0, w1
-	mov	16(up,i,8), %rax
-	mul	v0
-	add	w3, X1
-	mov	X1, 8(rp,i,8)
-	adc	%rax, X0
-	MOV(	%rdx, X1, 32)
-	mov	16(rp,i,8), w3
-	adc	$0, X1
-	mov	16(up,i,8), %rax
-	mul	v1
-	add	w3, w0
-	MOV(	%rdx, w2, 64)
-	adc	%rax, w1
-	mov	24(up,i,8), %rax
-	adc	$0, w2
-	mul	v0
-	add	w0, X0
-	mov	X0, 16(rp,i,8)
-	MOV(	%rdx, X0, 128)
-	adc	%rax, X1
-	mov	24(up,i,8), %rax
-	mov	24(rp,i,8), w0
-	adc	$0, X0
-	add	$4, i
-	jnc	L(am2top1)
-
-	mul	v1
-	add	w0, w1
-	adc	w2, %rax
-	adc	Z(i,$0), %rdx
-	add	w1, X1
-	adc	Z(i,$0), X0
-	mov	X1, I(-8(rp),-8(rp,i,8))
-	add	X0, %rax
-	mov	%rax, I((rp),(rp,i,8))
-	adc	Z(i,$0), %rdx
-	mov	%rdx, I(8(rp),8(rp,i,8))
-
-	lea	16(rp), rp
-	add	$2, n
-
-L(am2o3):
-	mov	-8(up,n,8), v0
-	mov	(up,n,8), %rax
-	mov	%rax, v1
-	lea	-1(n), i
-	mul	v0
-	mov	%rax, X1
-	MOV(	%rdx, X0, 8)
-	mov	(rp,n,8), w3
-	xor	R32(w0), R32(w0)
-	xor	R32(w1), R32(w1)
-	mov	8(up,n,8), %rax
-	jmp	L(lo3)
-
-	ALIGNx
-L(am2top3):
-	mul	v1
-	add	w0, w1
-	adc	%rax, w2
-	mov	(up,i,8), %rax
-	MOV(	%rdx, w3, 1)
-	adc	$0, w3
-	mul	v0
-	add	w1, X1
-	mov	X1, -8(rp,i,8)
-	adc	%rax, X0
-	MOV(	%rdx, X1, 2)
-	adc	$0, X1
-	mov	(up,i,8), %rax
-	mul	v1
-	MOV(	%rdx, w0, 4)
-	mov	(rp,i,8), w1
-	add	w1, w2
-	adc	%rax, w3
-	adc	$0, w0
-	mov	8(up,i,8), %rax
-	mul	v0
-	add	w2, X0
-	adc	%rax, X1
-	mov	X0, (rp,i,8)
-	MOV(	%rdx, X0, 8)
-	adc	$0, X0
-	mov	8(up,i,8), %rax
-	mov	8(rp,i,8), w2
-	mul	v1
-	add	w2, w3
-	adc	%rax, w0
-	MOV(	%rdx, w1, 16)
-	adc	$0, w1
-	mov	16(up,i,8), %rax
-L(lo3):	mul	v0
-	add	w3, X1
-	mov	X1, 8(rp,i,8)
-	adc	%rax, X0
-	MOV(	%rdx, X1, 32)
-	mov	16(rp,i,8), w3
-	adc	$0, X1
-	mov	16(up,i,8), %rax
-	mul	v1
-	add	w3, w0
-	MOV(	%rdx, w2, 64)
-	adc	%rax, w1
-	mov	24(up,i,8), %rax
-	adc	$0, w2
-	mul	v0
-	add	w0, X0
-	mov	X0, 16(rp,i,8)
-	MOV(	%rdx, X0, 128)
-	adc	%rax, X1
-	mov	24(up,i,8), %rax
-	mov	24(rp,i,8), w0
-	adc	$0, X0
-	add	$4, i
-	jnc	L(am2top3)
-
-	mul	v1
-	add	w0, w1
-	adc	w2, %rax
-	adc	Z(i,$0), %rdx
-	add	w1, X1
-	adc	Z(i,$0), X0
-	mov	X1, I(-8(rp),-8(rp,i,8))
-	add	X0, %rax
-	mov	%rax, I((rp),(rp,i,8))
-	adc	Z(i,$0), %rdx
-	mov	%rdx, I(8(rp),8(rp,i,8))
-
-	lea	16(rp), rp
-	add	$2, n
-	cmp	$-1, n
-	jnz	L(am2o1)
-
-L(cor1):pop	n
-	mov	%rdx, w3
-	mov	-16(up), v0
-	mov	-8(up), %rax
-	mul	v0
-	add	w3, %rax
-	adc	$0, %rdx
-	mov	%rax, -8(rp)
-	mov	%rdx, (rp)
-	jmp	L(sqr_diag_addlsh1)
-
-	ALIGNx
-L(m2top2):
-L(m2e2):mul	v0
-	add	%rax, w3
-	mov	-8(up,i,8), %rax
-	mov	w3, -8(rp,i,8)
-	adc	%rdx, w0
-	adc	$0, R32(w1)
-	mul	v1
-	add	%rax, w0
-	adc	%rdx, w1
-	mov	$0, R32(w2)
-	mov	(up,i,8), %rax
-	mul	v0
-	add	%rax, w0
-	mov	w0, (rp,i,8)
-	adc	%rdx, w1
-	mov	(up,i,8), %rax
-	adc	$0, R32(w2)
-	mul	v1
-	add	%rax, w1
-	adc	%rdx, w2
-	mov	8(up,i,8), %rax
-	mul	v0
-	mov	$0, R32(w3)
-	add	%rax, w1
-	adc	%rdx, w2
-	adc	$0, R32(w3)
-	mov	8(up,i,8), %rax
-	mul	v1
-	add	%rax, w2
-	mov	w1, 8(rp,i,8)
-	adc	%rdx, w3
-	mov	$0, R32(w0)
-	mov	16(up,i,8), %rax
-	mul	v0
-	add	%rax, w2
-	mov	16(up,i,8), %rax
-	adc	%rdx, w3
-	adc	$0, R32(w0)
-	mul	v1
-	mov	$0, R32(w1)
-	add	%rax, w3
-	mov	24(up,i,8), %rax
-	mov	w2, 16(rp,i,8)
-	adc	%rdx, w0
-	add	$4, i
-	js	L(m2top2)
-
-	mul	v0
-	add	%rax, w3
-	mov	I(-8(up),-8(up,i,8)), %rax
-	mov	w3, I(-8(rp),-8(rp,i,8))
-	adc	%rdx, w0
-	adc	R32(w1), R32(w1)
-	mul	v1
-	add	w0, %rax
-	adc	w1, %rdx
-	mov	%rax, I((rp),(rp,i,8))
-	mov	%rdx, I(8(rp),8(rp,i,8))
-
-	lea	16(rp), rp
-	add	$2, n			C decrease |n|
-	jmp	L(am2o0)
-
-	ALIGNx
-L(m2top0):
-	mul	v0
-	add	%rax, w3
-	mov	-8(up,i,8), %rax
-	mov	w3, -8(rp,i,8)
-	adc	%rdx, w0
-	adc	$0, R32(w1)
-	mul	v1
-	add	%rax, w0
-	adc	%rdx, w1
-	mov	$0, R32(w2)
-	mov	(up,i,8), %rax
-	mul	v0
-	add	%rax, w0
-	mov	w0, (rp,i,8)
-	adc	%rdx, w1
-	mov	(up,i,8), %rax
-	adc	$0, R32(w2)
-	mul	v1
-	add	%rax, w1
-	adc	%rdx, w2
-L(m2e0):mov	8(up,i,8), %rax
-	mul	v0
-	mov	$0, R32(w3)
-	add	%rax, w1
-	adc	%rdx, w2
-	adc	$0, R32(w3)
-	mov	8(up,i,8), %rax
-	mul	v1
-	add	%rax, w2
-	mov	w1, 8(rp,i,8)
-	adc	%rdx, w3
-	mov	$0, R32(w0)
-	mov	16(up,i,8), %rax
-	mul	v0
-	add	%rax, w2
-	mov	16(up,i,8), %rax
-	adc	%rdx, w3
-	adc	$0, R32(w0)
-	mul	v1
-	mov	$0, R32(w1)
-	add	%rax, w3
-	mov	24(up,i,8), %rax
-	mov	w2, 16(rp,i,8)
-	adc	%rdx, w0
-	add	$4, i
-	js	L(m2top0)
-
-	mul	v0
-	add	%rax, w3
-	mov	I(-8(up),-8(up,i,8)), %rax
-	mov	w3, I(-8(rp),-8(rp,i,8))
-	adc	%rdx, w0
-	adc	R32(w1), R32(w1)
-	mul	v1
-	add	w0, %rax
-	adc	w1, %rdx
-	mov	%rax, I((rp),(rp,i,8))
-	mov	%rdx, I(8(rp),8(rp,i,8))
-
-	lea	16(rp), rp
-	add	$2, n			C decrease |n|
-	cmp	$-2, n
-	jz	L(cor2)			C jumps iff entry n = 5
-
-L(am2o2):
-	mov	-8(up,n,8), v0
-	mov	(up,n,8), %rax
-	mov	%rax, v1
-	lea	-2(n), i
-	mul	v0
-	mov	%rax, X0
-	MOV(	%rdx, X1, 32)
-	mov	(rp,n,8), w0
-	xor	R32(w1), R32(w1)
-	xor	R32(w2), R32(w2)
-	mov	8(up,n,8), %rax
-	jmp	L(lo2)
-
-	ALIGNx
-L(am2top2):
-	mul	v1
-	add	w0, w1
-	adc	%rax, w2
-	mov	(up,i,8), %rax
-	MOV(	%rdx, w3, 1)
-	adc	$0, w3
-	mul	v0
-	add	w1, X1
-	mov	X1, -8(rp,i,8)
-	adc	%rax, X0
-	MOV(	%rdx, X1, 2)
-	adc	$0, X1
-	mov	(up,i,8), %rax
-	mul	v1
-	MOV(	%rdx, w0, 4)
-	mov	(rp,i,8), w1
-	add	w1, w2
-	adc	%rax, w3
-	adc	$0, w0
-	mov	8(up,i,8), %rax
-	mul	v0
-	add	w2, X0
-	adc	%rax, X1
-	mov	X0, (rp,i,8)
-	MOV(	%rdx, X0, 8)
-	adc	$0, X0
-	mov	8(up,i,8), %rax
-	mov	8(rp,i,8), w2
-	mul	v1
-	add	w2, w3
-	adc	%rax, w0
-	MOV(	%rdx, w1, 16)
-	adc	$0, w1
-	mov	16(up,i,8), %rax
-	mul	v0
-	add	w3, X1
-	mov	X1, 8(rp,i,8)
-	adc	%rax, X0
-	MOV(	%rdx, X1, 32)
-	mov	16(rp,i,8), w3
-	adc	$0, X1
-	mov	16(up,i,8), %rax
-	mul	v1
-	add	w3, w0
-	MOV(	%rdx, w2, 64)
-	adc	%rax, w1
-	mov	24(up,i,8), %rax
-	adc	$0, w2
-L(lo2):	mul	v0
-	add	w0, X0
-	mov	X0, 16(rp,i,8)
-	MOV(	%rdx, X0, 128)
-	adc	%rax, X1
-	mov	24(up,i,8), %rax
-	mov	24(rp,i,8), w0
-	adc	$0, X0
-	add	$4, i
-	jnc	L(am2top2)
-
-	mul	v1
-	add	w0, w1
-	adc	w2, %rax
-	adc	Z(i,$0), %rdx
-	add	w1, X1
-	adc	Z(i,$0), X0
-	mov	X1, I(-8(rp),-8(rp,i,8))
-	add	X0, %rax
-	mov	%rax, I((rp),(rp,i,8))
-	adc	Z(i,$0), %rdx
-	mov	%rdx, I(8(rp),8(rp,i,8))
-
-	lea	16(rp), rp
-	add	$2, n
-
-L(am2o0):
-	mov	-8(up,n,8), v0
-	mov	(up,n,8), %rax
-	mov	%rax, v1
-	lea	0(n), i
-	mul	v0
-	mov	%rax, X0
-	MOV(	%rdx, X1, 2)
-	xor	R32(w0), R32(w0)
-	mov	(rp,n,8), w2
-	xor	R32(w3), R32(w3)
-	jmp	L(lo0)
-
-	ALIGNx
-L(am2top0):
-	mul	v1
-	add	w0, w1
-	adc	%rax, w2
-	mov	(up,i,8), %rax
-	MOV(	%rdx, w3, 1)
-	adc	$0, w3
-	mul	v0
-	add	w1, X1
-	mov	X1, -8(rp,i,8)
-	adc	%rax, X0
-	MOV(	%rdx, X1, 2)
-	adc	$0, X1
-	mov	(up,i,8), %rax
-	mul	v1
-	MOV(	%rdx, w0, 4)
-	mov	(rp,i,8), w1
-	add	w1, w2
-	adc	%rax, w3
-	adc	$0, w0
-L(lo0):	mov	8(up,i,8), %rax
-	mul	v0
-	add	w2, X0
-	adc	%rax, X1
-	mov	X0, (rp,i,8)
-	MOV(	%rdx, X0, 8)
-	adc	$0, X0
-	mov	8(up,i,8), %rax
-	mov	8(rp,i,8), w2
-	mul	v1
-	add	w2, w3
-	adc	%rax, w0
-	MOV(	%rdx, w1, 16)
-	adc	$0, w1
-	mov	16(up,i,8), %rax
-	mul	v0
-	add	w3, X1
-	mov	X1, 8(rp,i,8)
-	adc	%rax, X0
-	MOV(	%rdx, X1, 32)
-	mov	16(rp,i,8), w3
-	adc	$0, X1
-	mov	16(up,i,8), %rax
-	mul	v1
-	add	w3, w0
-	MOV(	%rdx, w2, 64)
-	adc	%rax, w1
-	mov	24(up,i,8), %rax
-	adc	$0, w2
-	mul	v0
-	add	w0, X0
-	mov	X0, 16(rp,i,8)
-	MOV(	%rdx, X0, 128)
-	adc	%rax, X1
-	mov	24(up,i,8), %rax
-	mov	24(rp,i,8), w0
-	adc	$0, X0
-	add	$4, i
-	jnc	L(am2top0)
-
-	mul	v1
-	add	w0, w1
-	adc	w2, %rax
-	adc	Z(i,$0), %rdx
-	add	w1, X1
-	adc	Z(i,$0), X0
-	mov	X1, I(-8(rp),-8(rp,i,8))
-	add	X0, %rax
-	mov	%rax, I((rp),(rp,i,8))
-	adc	Z(i,$0), %rdx
-	mov	%rdx, I(8(rp),8(rp,i,8))
-
-	lea	16(rp), rp
-	add	$2, n
-	cmp	$-2, n
-	jnz	L(am2o2)
-
-L(cor2):pop	n
-	mov	-24(up), v0
-	mov	%rax, w2
-	mov	%rdx, w0
-	mov	-16(up), %rax
-	mov	%rax, v1
-	mul	v0
-	mov	%rax, X0
-	MOV(	%rdx, X1, 32)
-	mov	-8(up), %rax
-	mul	v0
-	add	w2, X0
-	mov	X0, -16(rp)
-	MOV(	%rdx, X0, 128)
-	adc	%rax, X1
-	mov	-8(up), %rax
-	adc	$0, X0
-	mul	v1
-	add	w0, X1
-	adc	$0, X0
-	mov	X1, -8(rp)
-	add	X0, %rax
-	mov	%rax, (rp)
-	adc	$0, %rdx
-	mov	%rdx, 8(rp)
-	lea	8(rp), rp
-
-L(sqr_diag_addlsh1):
-	mov	-8(up,n,8), %rax
-	shl	n
-	xor	R32(%rbx), R32(%rbx)
-	mul	%rax
-	mov	8(rp,n,8), %r11
-	lea	(%rdx), %r10
-	mov	16(rp,n,8), %r9
-	add	%r11, %r11
-	jmp	L(dm)
-
-	ALIGNx
-L(dtop):mul	%rax
-	add	%r11, %r10
-	mov	8(rp,n,8), %r11
-	mov	%r10, -8(rp,n,8)
-	adc	%r9, %rax
-	lea	(%rdx,%rbx), %r10
-	mov	16(rp,n,8), %r9
-	adc	%r11, %r11
-L(dm):	mov	%rax, (rp,n,8)
-	mov	(up,n,4), %rax
-	adc	%r9, %r9
-	setc	R8(%rbx)
-	add	$2, n
-	js	L(dtop)
-
-	mul	%rax
-	add	%r11, %r10
-	mov	%r10, -8(rp)
-	adc	%r9, %rax
-	lea	(%rdx,%rbx), %r10
-	mov	%rax, (rp)
-	adc	$0, %r10
-	mov	%r10, 8(rp)
-
-	pop	%r14
-	pop	%r13
-	pop	%r12
-	pop	%rbp
-	pop	%rbx
-	FUNC_EXIT()
-	ret
-
-	ALIGN(16)
-L(small):
-	mov	(up), %rax
-	cmp	$2, n_param
-	jae	L(gt1)
-L(n1):
-	mul	%rax
-	mov	%rax, (rp)
-	mov	%rdx, 8(rp)
-	FUNC_EXIT()
-	ret
-
-L(gt1):	jne	L(gt2)
-L(n2):	mov	%rax, %r8
-	mul	%rax
-	mov	8(up), %r11
-	mov	%rax, (rp)
-	mov	%r11, %rax
-	mov	%rdx, %r9
-	mul	%rax
-	mov	%rax, %r10
-	mov	%r11, %rax
-	mov	%rdx, %r11
-	mul	%r8
-	xor	%r8, %r8
-	add	%rax, %r9
-	adc	%rdx, %r10
-	adc	%r8, %r11
-	add	%rax, %r9
-	mov	%r9, 8(rp)
-	adc	%rdx, %r10
-	mov	%r10, 16(rp)
-	adc	%r8, %r11
-	mov	%r11, 24(rp)
-	FUNC_EXIT()
-	ret
-
-L(gt2):
-L(n3):	mov	%rax, %r10
-	mul	%rax
-	mov	8(up), %r11
-	mov	%rax, (rp)
-	mov	%r11, %rax
-	mov	%rdx, 8(rp)
-	mul	%rax
-	mov	16(up), %rcx
-	mov	%rax, 16(rp)
-	mov	%rcx, %rax
-	mov	%rdx, 24(rp)
-	mul	%rax
-	mov	%rax, 32(rp)
-	mov	%rdx, 40(rp)
-
-	mov	%r11, %rax
-	mul	%r10
-	mov	%rax, %r8
-	mov	%rcx, %rax
-	mov	%rdx, %r9
-	mul	%r10
-	xor	%r10, %r10
-	add	%rax, %r9
-	mov	%r11, %rax
-	mov	%r10, %r11
-	adc	%rdx, %r10
-
-	mul	%rcx
-	add	%rax, %r10
-	adc	%r11, %rdx
-	add	%r8, %r8
-	adc	%r9, %r9
-	adc	%r10, %r10
-	adc	%rdx, %rdx
-	adc	%r11, %r11
-	add	%r8, 8(rp)
-	adc	%r9, 16(rp)
-	adc	%r10, 24(rp)
-	adc	%rdx, 32(rp)
-	adc	%r11, 40(rp)
-	FUNC_EXIT()
-	ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/sublsh1_n.asm b/gmp/mpn/x86_64/core2/sublsh1_n.asm
deleted file mode 100644
index 46488fcafe..0000000000
--- a/gmp/mpn/x86_64/core2/sublsh1_n.asm
+++ /dev/null
@@ -1,47 +0,0 @@
-dnl  AMD64 mpn_sublsh1_n optimised for Core 2 and Core iN.
-
-dnl  Contributed to the GNU project by Torbjorn Granlund.
-
-dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-define(LSH, 1)
-define(RSH, 63)
-
-define(ADDSUB,	sub)
-define(ADCSBB,	sbb)
-define(func,	mpn_sublsh1_n)
-
-MULFUNC_PROLOGUE(mpn_sublsh1_n)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-include_mpn(`x86_64/core2/sublshC_n.asm')
diff --git a/gmp/mpn/x86_64/core2/sublsh2_n.asm b/gmp/mpn/x86_64/core2/sublsh2_n.asm
deleted file mode 100644
index f3b1e28464..0000000000
--- a/gmp/mpn/x86_64/core2/sublsh2_n.asm
+++ /dev/null
@@ -1,47 +0,0 @@
-dnl  AMD64 mpn_sublsh2_n optimised for Core 2 and Core iN.
-
-dnl  Contributed to the GNU project by Torbjorn Granlund.
-
-dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-define(LSH, 2)
-define(RSH, 62)
-
-define(ADDSUB,	sub)
-define(ADCSBB,	sbb)
-define(func,	mpn_sublsh2_n)
-
-MULFUNC_PROLOGUE(mpn_sublsh2_n)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-include_mpn(`x86_64/core2/sublshC_n.asm')