17 files changed, 4547 insertions, 0 deletions
diff --git a/gmp/mpn/alpha/ev6/add_n.asm b/gmp/mpn/alpha/ev6/add_n.asm
new file mode 100644
index 0000000000..9261f31b8a
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/add_n.asm
@@ -0,0 +1,283 @@
+dnl  Alpha ev6 mpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl  store sum in a third limb vector.
+
+dnl  Copyright 2000, 2003, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:     ?
+C EV5:     5.4
+C EV6:     2.125
+
+C  INPUT PARAMETERS
+C  rp	r16
+C  up	r17
+C  vp	r18
+C  n	r19
+C  cy	r20   (for mpn_add_nc)
+
+C TODO
+C   Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
+C   Use multi-pronged feed-in.
+C   Perform additional micro-tuning
+
+C  This code was written in cooperation with ev6 pipeline expert Steve Root.
+
+C  Pair loads and stores where possible
+C  Store pairs oct-aligned where possible (didn't need it here)
+C  Stores are delayed every third cycle
+C  Loads and stores are delayed by fills
+C  U stays still, put code there where possible (note alternation of U1 and U0)
+C  L moves because of loads and stores
+C  Note dampers in L to limit damage
+
+C  This odd-looking optimization expects that we are having random bits in our
+C  data, so that a pure zero result is unlikely, so we penalize the unlikely
+C  case to help the common case.
+
+define(`u0', `r0')  define(`u1', `r3')	C alternating limbs loaded through r17
+define(`v0', `r1')  define(`v1', `r4')	C alternating limbs loaded through r18
+
+define(`cy0', `r20')  define(`cy1', `r21')	C carry flags, cy0 is also the r20 carry-in
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc)
+
+ASM_START()
+PROLOGUE(mpn_add_nc)
+	br	r31,	$entry		C carry-in already sits in cy0, skip the clear
+EPILOGUE()
+PROLOGUE(mpn_add_n)
+	bis	r31,	r31,	cy0	C clear carry in
+$entry:	cmpult	r19,	5,	r22	C L1 is n below 5
+	ldq	u1,	0(r17)		C L0 get next ones
+	ldq	v1,	0(r18)		C L1
+	bne	r22,	$Lsmall		C yes, use only the one-limb loop
+
+	ldq	u0,	8(r17)		C L0 get next ones
+	ldq	v0,	8(r18)		C L1
+	addq	u1,	v1,	r5	C U0 add two data
+
+	cmpult	r5,	v1,	r23	C U0 did it carry
+	ldq	u1,	16(r17)		C L0 get next ones
+	ldq	v1,	16(r18)		C L1
+
+	addq	u0,	v0,	r8	C U1 add two data
+	addq	r5,	cy0,	r5	C U0 carry in
+
+	cmpult	r8,	v0,	r22	C U1 did it carry
+	beq	r5,	$fix5f		C U0 fix exact zero
+$ret5f:	ldq	u0,	24(r17)		C L0 get next ones
+	ldq	v0,	24(r18)		C L1
+
+	addq	r8,	r23,	r8	C U1 carry from last
+	addq	u1,	v1,	r7	C U0 add two data
+
+	beq	r8,	$fix6f		C U1 fix exact zero
+$ret6f:	cmpult	r7,	v1,	r23	C U0 did it carry
+	ldq	u1,	32(r17)		C L0 get next ones
+	ldq	v1,	32(r18)		C L1
+
+	lda	r17,	40(r17)		C L0 move pointer
+	lda	r18,	40(r18)		C L1 move pointer
+
+	lda	r16,	-8(r16)		C bias rp, stores below use offsets 8 and up
+	lda	r19,	-13(r19)	C L1 5 limbs fed in plus 8 for a loop pass
+	blt	r19,	$Lend		C U1 loop control
+
+
+C Main loop.  8-way unrolled.
+	ALIGN(16)
+$Loop:	addq	u0,	v0,	r2	C U1 add two data
+	addq	r7,	r22,	r7	C U0 add in carry
+	stq	r5,	8(r16)		C L0 put an answer
+	stq	r8,	16(r16)		C L1 pair
+
+	cmpult	r2,	v0,	cy1	C U1 did it carry
+	beq	r7,	$fix7		C U0 fix exact 0
+$ret7:	ldq	u0,	0(r17)		C L0 get next ones
+	ldq	v0,	0(r18)		C L1
+
+	bis	r31,	r31,	r31	C L  damp out
+	addq	r2,	r23,	r2	C U1 carry from last
+	bis	r31,	r31,	r31	C L  moves in L !
+	addq	u1,	v1,	r5	C U0 add two data
+
+	beq	r2,	$fix0		C U1 fix exact zero
+$ret0:	cmpult	r5,	v1,	cy0	C U0 did it carry
+	ldq	u1,	8(r17)		C L0 get next ones
+	ldq	v1,	8(r18)		C L1
+
+	addq	u0,	v0,	r8	C U1 add two data
+	addq	r5,	cy1,	r5	C U0 carry from last
+	stq	r7,	24(r16)		C L0 store pair
+	stq	r2,	32(r16)		C L1
+
+	cmpult	r8,	v0,	r22	C U1 did it carry
+	beq	r5,	$fix1		C U0 fix exact zero
+$ret1:	ldq	u0,	16(r17)		C L0 get next ones
+	ldq	v0,	16(r18)		C L1
+
+	lda	r16,	64(r16)		C L0 move pointer
+	addq	r8,	cy0,	r8	C U1 carry from last
+	lda	r19,	-8(r19)		C L1 move counter
+	addq	u1,	v1,	r7	C U0 add two data
+
+	beq	r8,	$fix2		C U1 fix exact zero
+$ret2:	cmpult	r7,	v1,	r23	C U0 did it carry
+	ldq	u1,	24(r17)		C L0 get next ones
+	ldq	v1,	24(r18)		C L1
+
+	addq	u0,	v0,	r2	C U1 add two data
+	addq	r7,	r22,	r7	C U0 add in carry
+	stq	r5,	-24(r16)	C L0 put an answer
+	stq	r8,	-16(r16)	C L1 pair
+
+	cmpult	r2,	v0,	cy1	C U1 did it carry
+	beq	r7,	$fix3		C U0 fix exact 0
+$ret3:	ldq	u0,	32(r17)		C L0 get next ones
+	ldq	v0,	32(r18)		C L1
+
+	bis	r31,	r31,	r31	C L  damp out
+	addq	r2,	r23,	r2	C U1 carry from last
+	bis	r31,	r31,	r31	C L  moves in L !
+	addq	u1,	v1,	r5	C U0 add two data
+
+	beq	r2,	$fix4		C U1 fix exact zero
+$ret4:	cmpult	r5,	v1,	cy0	C U0 did it carry
+	ldq	u1,	40(r17)		C L0 get next ones
+	ldq	v1,	40(r18)		C L1
+
+	addq	u0,	v0,	r8	C U1 add two data
+	addq	r5,	cy1,	r5	C U0 carry from last
+	stq	r7,	-8(r16)		C L0 store pair
+	stq	r2,	0(r16)		C L1
+
+	cmpult	r8,	v0,	r22	C U1 did it carry
+	beq	r5,	$fix5		C U0 fix exact zero
+$ret5:	ldq	u0,	48(r17)		C L0 get next ones
+	ldq	v0,	48(r18)		C L1
+
+	ldl	r31, 256(r17)		C L0 prefetch
+	addq	r8,	cy0,	r8	C U1 carry from last
+	ldl	r31, 256(r18)		C L1 prefetch
+	addq	u1,	v1,	r7	C U0 add two data
+
+	beq	r8,	$fix6		C U1 fix exact zero
+$ret6:	cmpult	r7,	v1,	r23	C U0 did it carry
+	ldq	u1,	56(r17)		C L0 get next ones
+	ldq	v1,	56(r18)		C L1
+
+	lda	r17,	64(r17)		C L0 move pointer
+	bis	r31,	r31,	r31	C U
+	lda	r18,	64(r18)		C L1 move pointer
+	bge	r19,	$Loop		C U1 loop control
+C ==== main loop end
+
+$Lend:	addq	u0,	v0,	r2	C U1 add two data
+	addq	r7,	r22,	r7	C U0 add in carry
+	stq	r5,	8(r16)		C L0 put an answer
+	stq	r8,	16(r16)		C L1 pair
+	cmpult	r2,	v0,	cy1	C U1 did it carry
+	beq	r7,	$fix7c		C U0 fix exact 0
+$ret7c:	addq	r2,	r23,	r2	C U1 carry from last
+	addq	u1,	v1,	r5	C U0 add two data
+	beq	r2,	$fix0c		C U1 fix exact zero
+$ret0c:	cmpult	r5,	v1,	cy0	C U0 did it carry
+	addq	r5,	cy1,	r5	C U0 carry from last
+	stq	r7,	24(r16)		C L0 store pair
+	stq	r2,	32(r16)		C L1
+	beq	r5,	$fix1c		C U0 fix exact zero
+$ret1c:	stq	r5,	40(r16)		C L0 put an answer
+	lda	r16,	48(r16)		C L0 move pointer
+
+	lda	r19,	8(r19)		C undo the loop bias, r19 = limbs still to do
+	beq	r19,	$Lret		C nothing left, return the carry in cy0
+
+	ldq	u1,	0(r17)		C prime the one-limb loop
+	ldq	v1,	0(r18)
+$Lsmall:
+	lda	r19,	-1(r19)		C final limb is handled at Lend0
+	beq	r19,	$Lend0		C only one limb to do
+
+	ALIGN(8)
+$Loop0:	addq	u1,	v1,	r2	C main add
+	cmpult	r2,	v1,	r8	C compute cy from last add
+	ldq	u1,	8(r17)
+	ldq	v1,	8(r18)
+	addq	r2,	cy0,	r5	C carry add
+	lda	r17,	8(r17)
+	lda	r18,	8(r18)
+	stq	r5,	0(r16)
+	cmpult	r5,	r2,	cy0	C compute cy from last add
+	lda	r19,	-1(r19)		C decr loop cnt
+	bis	r8,	cy0,	cy0	C combine cy from the two adds
+	lda	r16,	8(r16)
+	bne	r19,	$Loop0
+$Lend0:	addq	u1,	v1,	r2	C main add
+	addq	r2,	cy0,	r5	C carry add
+	cmpult	r2,	v1,	r8	C compute cy from last add
+	cmpult	r5,	r2,	cy0	C compute cy from last add
+	stq	r5,	0(r16)
+	bis	r8,	cy0,	r0	C combine cy from the two adds
+	ret	r31,(r26),1
+
+	ALIGN(8)
+$Lret:	lda	r0,	0(cy0)		C copy carry into return register
+	ret	r31,(r26),1
+
+$fix5f:	bis	r23,	cy0,	r23	C bring forward carry
+	br	r31,	$ret5f
+$fix6f:	bis	r22,	r23,	r22	C bring forward carry
+	br	r31,	$ret6f
+$fix0:	bis	cy1,	r23,	cy1	C bring forward carry
+	br	r31,	$ret0
+$fix1:	bis	cy0,	cy1,	cy0	C bring forward carry
+	br	r31,	$ret1
+$fix2:	bis	r22,	cy0,	r22	C bring forward carry
+	br	r31,	$ret2
+$fix3:	bis	r23,	r22,	r23	C bring forward carry
+	br	r31,	$ret3
+$fix4:	bis	cy1,	r23,	cy1	C bring forward carry
+	br	r31,	$ret4
+$fix5:	bis	cy1,	cy0,	cy0	C bring forward carry
+	br	r31,	$ret5
+$fix6:	bis	r22,	cy0,	r22	C bring forward carry
+	br	r31,	$ret6
+$fix7:	bis	r23,	r22,	r23	C bring forward carry
+	br	r31,	$ret7
+$fix0c:	bis	cy1,	r23,	cy1	C bring forward carry
+	br	r31,	$ret0c
+$fix1c:	bis	cy0,	cy1,	cy0	C bring forward carry
+	br	r31,	$ret1c
+$fix7c:	bis	r23,	r22,	r23	C bring forward carry
+	br	r31,	$ret7c
+
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/aorslsh1_n.asm b/gmp/mpn/alpha/ev6/aorslsh1_n.asm
new file mode 100644
index 0000000000..cb966ce021
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/aorslsh1_n.asm
@@ -0,0 +1,172 @@
+dnl  Alpha mpn_addlsh1_n/mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1).
+
+dnl  Copyright 2003, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:     ?
+C EV5:     7
+C EV6:     4
+
+C TODO
+C  * Tune to reach 3.75 c/l on ev6.
+
+define(`rp',`r16')	C destination limb pointer
+define(`up',`r17')	C source limbs that are added to or subtracted from
+define(`vp',`r18')	C source limbs that are shifted left by one bit
+define(`n', `r19')	C limb count
+
+define(`u0', `r8')	C even-phase limb from up
+define(`u1', `r1')	C odd-phase limb from up
+define(`v0', `r4')	C even-phase limb from vp
+define(`v1', `r5')	C odd-phase limb from vp
+
+define(`cy0', `r0')	C carry after odd-phase limbs, r0 also holds the return value
+define(`cy1', `r20')	C carry after even-phase limbs
+define(`cy', `r22')	C scratch carry from the carry-consuming add or subtract
+define(`rr', `r24')	C finished limb that is stored through rp
+define(`ps', `r25')	C partial result before the incoming carry is applied
+define(`sl', `r28')	C doubled v limb plus the bit shifted out of the previous one
+
+ifdef(`OPERATION_addlsh1_n',`
+  define(ADDSUB,       addq)
+  define(CARRY,       `cmpult $1,$2,$3')
+  define(func, mpn_addlsh1_n)
+')
+ifdef(`OPERATION_sublsh1_n',`
+  define(ADDSUB,       subq)
+  define(CARRY,       `cmpult $2,$1,$3')
+  define(func, mpn_sublsh1_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
+
+ASM_START()
+PROLOGUE(func)
+	and	n, 2, cy0		C bit 1 of n, tested after the first loads
+	blbs	n, L(bx1)		C odd n
+L(bx0):	ldq	v1, 0(vp)
+	ldq	u1, 0(up)
+	lda	r2, 0(r31)		C no bit shifted in for the first limb
+	bne	cy0, L(b10)		C n mod 4 = 2
+
+L(b00):	lda	vp, 48(vp)		C n mod 4 = 0, bias pointers for entry at lo0
+	lda	up, -16(up)
+	lda	rp, -8(rp)
+	lda	cy0, 0(r31)
+	br	r31, L(lo0)
+
+L(b10):	lda	vp, 32(vp)		C bias pointers for entry at lo2
+	lda	rp, 8(rp)
+	lda	cy0, 0(r31)
+	br	r31, L(lo2)
+
+L(bx1):	ldq	v0, 0(vp)
+	ldq	u0, 0(up)
+	lda	r3, 0(r31)		C no bit shifted in for the first limb
+	beq	cy0, L(b01)		C n mod 4 = 1
+
+L(b11):	lda	vp, 40(vp)		C n mod 4 = 3, bias pointers for entry at lo3
+	lda	up, -24(up)
+	lda	rp, 16(rp)
+	lda	cy1, 0(r31)
+	br	r31, L(lo3)
+
+L(b01):	lda	n, -4(n)
+	lda	cy1, 0(r31)
+	ble	n, L(end)		C n was 1, only the wind-down limb remains
+	lda	vp, 24(vp)
+	lda	up, -8(up)
+
+	ALIGN(16)
+L(top):	addq	v0, v0, r6
+	ldq	v1, -16(vp)
+	addq	r6, r3, sl	C combined vlimb
+	ldq	u1, 16(up)
+	ADDSUB	u0, sl, ps	C ulimb + (vlimb << 1)
+	cmplt	v0, r31, r2	C high v bits
+	ADDSUB	ps, cy1, rr	C consume carry from previous operation
+	CARRY(	ps, u0, cy0)	C carry out #2
+	stq	rr, 0(rp)
+	CARRY(	rr, ps, cy)	C carry out #3
+	lda	vp, 32(vp)	C bookkeeping
+	addq	cy, cy0, cy0	C final carry out
+L(lo0):	addq	v1, v1, r7
+	ldq	v0, -40(vp)
+	addq	r7, r2, sl
+	ldq	u0, 24(up)
+	ADDSUB	u1, sl, ps
+	cmplt	v1, r31, r3
+	ADDSUB	ps, cy0, rr
+	CARRY(	ps, u1, cy1)
+	stq	rr, 8(rp)
+	CARRY(	rr, ps, cy)
+	lda	rp, 32(rp)	C bookkeeping
+	addq	cy, cy1, cy1
+L(lo3):	addq	v0, v0, r6
+	ldq	v1, -32(vp)
+	addq	r6, r3, sl
+	ldq	u1, 32(up)
+	ADDSUB	u0, sl, ps
+	cmplt	v0, r31, r2
+	ADDSUB	ps, cy1, rr
+	CARRY(	ps, u0, cy0)
+	stq	rr, -16(rp)
+	CARRY(	rr, ps, cy)
+	lda	up, 32(up)	C bookkeeping
+	addq	cy, cy0, cy0
+L(lo2):	addq	v1, v1, r7
+	ldq	v0, -24(vp)
+	addq	r7, r2, sl
+	ldq	u0, 8(up)
+	ADDSUB	u1, sl, ps
+	cmplt	v1, r31, r3
+	ADDSUB	ps, cy0, rr
+	CARRY(	ps, u1, cy1)
+	stq	rr, -8(rp)
+	CARRY(	rr, ps, cy)
+	lda	n, -4(n)	C bookkeeping
+	addq	cy, cy1, cy1
+	bgt	n, L(top)
+
+L(end):	addq	v0, v0, r6
+	addq	r6, r3, sl
+	ADDSUB	u0, sl, ps
+	cmplt	v0, r31, r2
+	ADDSUB	ps, cy1, rr
+	CARRY(	ps, u0, cy0)
+	stq	rr, 0(rp)
+	CARRY(	rr, ps, cy)
+	addq	cy, cy0, cy0
+	addq	cy0, r2, r0		C return carry plus the high bit of the last v limb
+
+	ret	r31,(r26),1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/aorsmul_1.asm b/gmp/mpn/alpha/ev6/aorsmul_1.asm
new file mode 100644
index 0000000000..0e68e6e7ad
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/aorsmul_1.asm
@@ -0,0 +1,398 @@
+dnl  Alpha ev6 mpn_addmul_1 and mpn_submul_1.
+
+dnl  Copyright 2000, 2003-2005, 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:    42
+C EV5:    18
+C EV6:     3.5
+
+C  INPUT PARAMETERS
+define(`rp',	`r16')	C limb vector that is loaded, updated and stored back
+define(`up',	`r17')	C source limb vector that gets multiplied
+define(`n',	`r18')	C limb count, the code below names r18 directly
+define(`v0',	`r19')	C single limb multiplier
+
+dnl  This code was written in cooperation with ev6 pipeline expert Steve Root.
+
+dnl  The stores can issue a cycle late so we have paired no-op's to 'catch'
+dnl  them, so that further disturbance to the schedule is damped.
+
+dnl  We couldn't pair the loads, because the entangled schedule of the carry's
+dnl  has to happen on one side {0} of the machine.
+
+dnl  This is a great schedule for the d_cache, a poor schedule for the b_cache.
+dnl  The lockup on U0 means that any stall can't be recovered from.  Consider a
+dnl  ldq in L1, say that load gets stalled because it collides with a fill from
+dnl  the b_cache.  On the next cycle, this load gets priority.  It first looks
+dnl  at L0, and goes there.  The instruction we intended for L0 gets to look at
+dnl  L1, which is NOT where we want it.  It either stalls 1, because it can't
+dnl  go in L0, or goes there, and causes a further instruction to stall.
+
+dnl  So for b_cache, we're likely going to want to put one or more cycles back
+dnl  into the code! And, of course, put in lds prefetch for the rp[] operand.
+dnl  At a place where we have an mt followed by a bookkeeping, put the
+dnl  bookkeeping in upper, and the prefetch into lower.
+
+dnl  Note, the ldq's and stq's are at the end of the quadpacks.  Note, we'd
+dnl  like not to have an ldq or an stq to precede a conditional branch in a
+dnl  quadpack.  The conditional branch moves the retire pointer one cycle
+dnl  later.
+
+ifdef(`OPERATION_addmul_1',`
+    define(`ADDSUB',	`addq')
+    define(`CMPCY',	`cmpult	$2,$1')	C carry out when the sum is below the old value
+    define(`func',	`mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+    define(`ADDSUB',	`subq')
+    define(`CMPCY',	`cmpult	$1,$2')	C borrow out when the old value is below the difference
+    define(`func',	`mpn_submul_1')
+')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+PROLOGUE(func)
+	ldq	r3,	0(up)		C first limb of up
+	and	r18,	7,	r20	C r20 = n mod 8, selects the feed-in path
+	lda	r18,	-9(r18)		C bias the count for the loop tests
+	cmpeq	r20,	1,	r21	C
+	beq	r21,	$L1		C not 1 mod 8, dispatch on the other residues
+
+$1mod8:	ldq	r5,	0(rp)		C
+	mulq	v0,	r3,	r7	C
+	umulh	v0,	r3,	r8	C
+	ADDSUB	r5,	r7,	r23	C
+	CMPCY(	r5,	r23),	r20	C
+	addq	r8,	r20,	r0	C high product plus carry, return value when n = 1
+	stq	r23,	0(rp)		C
+	bge	r18,	$ent1		C n >= 9, go on with the remaining limbs
+	ret	r31,	(r26),	1	C n = 1, done
+
+$L1:	lda	r8,	0(r31)		C zero carry reg
+	lda	r24,	0(r31)		C zero carry reg
+	cmpeq	r20,	2,	r21	C
+	bne	r21,	$2mod8		C
+	cmpeq	r20,	3,	r21	C
+	bne	r21,	$3mod8		C
+	cmpeq	r20,	4,	r21	C
+	bne	r21,	$4mod8		C
+	cmpeq	r20,	5,	r21	C
+	bne	r21,	$5mod8		C
+	cmpeq	r20,	6,	r21	C
+	bne	r21,	$6mod8		C
+	cmpeq	r20,	7,	r21	C
+	beq	r21,	$0mod8		C none of 1..7 matched, so n mod 8 = 0
+
+$7mod8:	ldq	r5,	0(rp)		C
+	lda	up,	8(up)		C
+	mulq	v0,	r3,	r7	C
+	umulh	v0,	r3,	r24	C
+	ADDSUB	r5,	r7,	r23	C
+	CMPCY(	r5,	r23),	r20	C
+	addq	r24,	r20,	r24	C carry limb into the 6mod8 feed-in
+	stq	r23,	0(rp)		C
+	lda	rp,	8(rp)		C
+	ldq	r3,	0(up)		C
+$6mod8:	ldq	r1,	8(up)		C
+	mulq	v0,	r3,	r25	C
+	umulh	v0,	r3,	r3	C
+	mulq	v0,	r1,	r28	C
+	ldq	r0,	16(up)		C
+	ldq	r4,	0(rp)		C
+	umulh	v0,	r1,	r8	C
+	ldq	r1,	24(up)		C
+	lda	up,	48(up)		C L1 bookkeeping
+	mulq	v0,	r0,	r2	C
+	ldq	r5,	8(rp)		C
+	lda	rp,	-32(rp)		C L1 bookkeeping
+	umulh	v0,	r0,	r6	C
+	ADDSUB	r4,	r25,	r25	C lo + acc
+	mulq	v0,	r1,	r7	C
+	br	r31,	$ent6		C
+
+$ent1:	lda	up,	8(up)		C
+	lda	rp,	8(rp)		C
+	lda	r8,	0(r0)		C carry limb left in r0 by the 1mod8 step
+	ldq	r3,	0(up)		C
+$0mod8:	ldq	r1,	8(up)		C
+	mulq	v0,	r3,	r2	C
+	umulh	v0,	r3,	r6	C
+	mulq	v0,	r1,	r7	C
+	ldq	r0,	16(up)		C
+	ldq	r4,	0(rp)		C
+	umulh	v0,	r1,	r24	C
+	ldq	r1,	24(up)		C
+	mulq	v0,	r0,	r25	C
+	ldq	r5,	8(rp)		C
+	umulh	v0,	r0,	r3	C
+	ADDSUB	r4,	r2,	r2	C lo + acc
+	mulq	v0,	r1,	r28	C
+	lda	rp,	-16(rp)		C
+	br	r31,	$ent0		C
+
+$3mod8:	ldq	r5,	0(rp)		C
+	lda	up,	8(up)		C
+	mulq	v0,	r3,	r7	C
+	umulh	v0,	r3,	r8	C
+	ADDSUB	r5,	r7,	r23	C
+	CMPCY(	r5,	r23),	r20	C
+	addq	r8,	r20,	r24	C carry limb into the 2mod8 feed-in
+	stq	r23,	0(rp)		C
+	lda	rp,	8(rp)		C
+	ldq	r3,	0(up)		C
+$2mod8:	ldq	r1,	8(up)		C
+	mulq	v0,	r3,	r25	C
+	umulh	v0,	r3,	r3	C
+	mulq	v0,	r1,	r28	C
+	ble	r18,	$n23		C n is 2 or 3, no further up limbs to load
+	ldq	r0,	16(up)		C
+	ldq	r4,	0(rp)		C
+	umulh	v0,	r1,	r8	C
+	ldq	r1,	24(up)		C
+	lda	up,	16(up)		C L1 bookkeeping
+	mulq	v0,	r0,	r2	C
+	ldq	r5,	8(rp)		C
+	lda	rp,	0(rp)		C L1 bookkeeping
+	umulh	v0,	r0,	r6	C
+	ADDSUB	r4,	r25,	r25	C lo + acc
+	mulq	v0,	r1,	r7	C
+	br	r31,	$ent2		C
+
+$5mod8:	ldq	r5,	0(rp)		C
+	lda	up,	8(up)		C
+	mulq	v0,	r3,	r7	C
+	umulh	v0,	r3,	r24	C
+	ADDSUB	r5,	r7,	r23	C
+	CMPCY(	r5,	r23),	r20	C
+	addq	r24,	r20,	r8	C carry limb into the 4mod8 feed-in
+	stq	r23,	0(rp)		C
+	lda	rp,	8(rp)		C
+	ldq	r3,	0(up)		C
+$4mod8:	ldq	r1,	8(up)		C
+	mulq	v0,	r3,	r2	C
+	umulh	v0,	r3,	r6	C
+	mulq	v0,	r1,	r7	C
+	ldq	r0,	16(up)		C
+	ldq	r4,	0(rp)		C
+	umulh	v0,	r1,	r24	C
+	ldq	r1,	24(up)		C
+	lda	up,	32(up)		C L1 bookkeeping
+	mulq	v0,	r0,	r25	C
+	ldq	r5,	8(rp)		C
+	lda	rp,	16(rp)		C L1 bookkeeping
+	umulh	v0,	r0,	r3	C
+	ADDSUB	r4,	r2,	r2	C lo + acc
+	mulq	v0,	r1,	r28	C
+	CMPCY(	r4,	r2),	r20	C L0 lo add => carry
+	ADDSUB	r2,	r8,	r22	C U0 hi add => answer
+	ble	r18,	$Lend		C n is 4 or 5, skip the loop
+	ALIGN(16)
+$Loop:
+	bis	r31,	r31,	r31	C U1 mt
+	CMPCY(	r2,	r22),	r21	C L0 hi add => carry
+	addq	r6,	r20,	r6	C U0 hi mul + carry
+	ldq	r0,	0(up)		C
+
+	bis	r31,	r31,	r31	C U1 mt
+	ADDSUB	r5,	r7,	r7	C L0 lo + acc
+	addq	r6,	r21,	r6	C U0 hi mul + carry
+	ldq	r4,	0(rp)		C L1
+
+	umulh	v0,	r1,	r8	C U1
+	CMPCY(	r5,	r7),	r20	C L0 lo add => carry
+	ADDSUB	r7,	r6,	r23	C U0 hi add => answer
+	ldq	r1,	8(up)		C L1
+
+	mulq	v0,	r0,	r2	C U1
+	CMPCY(	r7,	r23),	r21	C L0 hi add => carry
+	addq	r24,	r20,	r24	C U0 hi mul + carry
+	ldq	r5,	8(rp)		C L1
+
+	umulh	v0,	r0,	r6	C U1
+	ADDSUB	r4,	r25,	r25	C U0 lo + acc
+	stq	r22,	-16(rp)		C L0
+	stq	r23,	-8(rp)		C L1
+
+	bis	r31,	r31,	r31	C L0 st slosh
+	mulq	v0,	r1,	r7	C U1
+	bis	r31,	r31,	r31	C L1 st slosh
+	addq	r24,	r21,	r24	C U0 hi mul + carry
+$ent2:
+	CMPCY(	r4,	r25),	r20	C L0 lo add => carry
+	bis	r31,	r31,	r31	C U1 mt
+	lda	r18,	-8(r18)		C L1 bookkeeping
+	ADDSUB	r25,	r24,	r22	C U0 hi add => answer
+
+	bis	r31,	r31,	r31	C U1 mt
+	CMPCY(	r25,	r22),	r21	C L0 hi add => carry
+	addq	r3,	r20,	r3	C U0 hi mul + carry
+	ldq	r0,	16(up)		C L1
+
+	bis	r31,	r31,	r31	C U1 mt
+	ADDSUB	r5,	r28,	r28	C L0 lo + acc
+	addq	r3,	r21,	r3	C U0 hi mul + carry
+	ldq	r4,	16(rp)		C L1
+
+	umulh	v0,	r1,	r24	C U1
+	CMPCY(	r5,	r28),	r20	C L0 lo add => carry
+	ADDSUB	r28,	r3,	r23	C U0 hi add => answer
+	ldq	r1,	24(up)		C L1
+
+	mulq	v0,	r0,	r25	C U1
+	CMPCY(	r28,	r23),	r21	C L0 hi add => carry
+	addq	r8,	r20,	r8	C U0 hi mul + carry
+	ldq	r5,	24(rp)		C L1
+
+	umulh	v0,	r0,	r3	C U1
+	ADDSUB	r4,	r2,	r2	C U0 lo + acc
+	stq	r22,	0(rp)		C L0
+	stq	r23,	8(rp)		C L1
+
+	bis	r31,	r31,	r31	C L0 st slosh
+	mulq	v0,	r1,	r28	C U1
+	bis	r31,	r31,	r31	C L1 st slosh
+	addq	r8,	r21,	r8	C U0 hi mul + carry
+$ent0:
+	CMPCY(	r4,	r2),	r20	C L0 lo add => carry
+	bis	r31,	r31,	r31	C U1 mt
+	lda	up,	64(up)		C L1 bookkeeping
+	ADDSUB	r2,	r8,	r22	C U0 hi add => answer
+
+	bis	r31,	r31,	r31	C U1 mt
+	CMPCY(	r2,	r22),	r21	C L0 hi add => carry
+	addq	r6,	r20,	r6	C U0 hi mul + carry
+	ldq	r0,	-32(up)		C L1
+
+	bis	r31,	r31,	r31	C U1 mt
+	ADDSUB	r5,	r7,	r7	C L0 lo + acc
+	addq	r6,	r21,	r6	C U0 hi mul + carry
+	ldq	r4,	32(rp)		C L1
+
+	umulh	v0,	r1,	r8	C U1
+	CMPCY(	r5,	r7),	r20	C L0 lo add => carry
+	ADDSUB	r7,	r6,	r23	C U0 hi add => answer
+	ldq	r1,	-24(up)		C L1
+
+	mulq	v0,	r0,	r2	C U1
+	CMPCY(	r7,	r23),	r21	C L0 hi add => carry
+	addq	r24,	r20,	r24	C U0 hi mul + carry
+	ldq	r5,	40(rp)		C L1
+
+	umulh	v0,	r0,	r6	C U1
+	ADDSUB	r4,	r25,	r25	C U0 lo + acc
+	stq	r22,	16(rp)		C L0
+	stq	r23,	24(rp)		C L1
+
+	bis	r31,	r31,	r31	C L0 st slosh
+	mulq	v0,	r1,	r7	C U1
+	bis	r31,	r31,	r31	C L1 st slosh
+	addq	r24,	r21,	r24	C U0 hi mul + carry
+$ent6:
+	CMPCY(	r4,	r25),	r20	C L0 lo add => carry
+	bis	r31,	r31,	r31	C U1 mt
+	lda	rp,	64(rp)		C L1 bookkeeping
+	ADDSUB	r25,	r24,	r22	C U0 hi add => answer
+
+	bis	r31,	r31,	r31	C U1 mt
+	CMPCY(	r25,	r22),	r21	C L0 hi add => carry
+	addq	r3,	r20,	r3	C U0 hi mul + carry
+	ldq	r0,	-16(up)		C L1
+
+	bis	r31,	r31,	r31	C U1 mt
+	ADDSUB	r5,	r28,	r28	C L0 lo + acc
+	addq	r3,	r21,	r3	C U0 hi mul + carry
+	ldq	r4,	-16(rp)		C L1
+
+	umulh	v0,	r1,	r24	C U1
+	CMPCY(	r5,	r28),	r20	C L0 lo add => carry
+	ADDSUB	r28,	r3,	r23	C U0 hi add => answer
+	ldq	r1,	-8(up)		C L1
+
+	mulq	v0,	r0,	r25	C U1
+	CMPCY(	r28,	r23),	r21	C L0 hi add => carry
+	addq	r8,	r20,	r8	C U0 hi mul + carry
+	ldq	r5,	-8(rp)		C L1
+
+	umulh	v0,	r0,	r3	C U1
+	ADDSUB	r4,	r2,	r2	C U0 lo + acc
+	stq	r22,	-32(rp)		C L0
+	stq	r23,	-24(rp)		C L1
+
+	bis	r31,	r31,	r31	C L0 st slosh
+	mulq	v0,	r1,	r28	C U1
+	bis	r31,	r31,	r31	C L1 st slosh
+	addq	r8,	r21,	r8	C U0 hi mul + carry
+
+	CMPCY(	r4,	r2),	r20	C L0 lo add => carry
+	ADDSUB	r2,	r8,	r22	C U0 hi add => answer
+	ldl	r31,	256(up)		C prefetch up[]
+	bgt	r18,	$Loop		C U1 bookkeeping
+
+$Lend:	CMPCY(	r2,	r22),	r21	C
+	addq	r6,	r20,	r6	C
+	ADDSUB	r5,	r7,	r7	C
+	addq	r6,	r21,	r6	C
+	ldq	r4,	0(rp)		C
+	umulh	v0,	r1,	r8	C
+	CMPCY(	r5,	r7),	r20	C
+	ADDSUB	r7,	r6,	r23	C
+	CMPCY(r7,	r23),	r21	C
+	addq	r24,	r20,	r24	C
+	ldq	r5,	8(rp)		C
+	ADDSUB	r4,	r25,	r25	C
+	stq	r22,	-16(rp)		C
+	stq	r23,	-8(rp)		C
+	addq	r24,	r21,	r24	C
+	br	L(x)
+
+	ALIGN(16)
+$n23:	ldq	r4,	0(rp)		C
+	ldq	r5,	8(rp)		C
+	umulh	v0,	r1,	r8	C
+	ADDSUB	r4,	r25,	r25	C
+L(x):	CMPCY(	r4,	r25),	r20	C
+	ADDSUB	r25,	r24,	r22	C
+	CMPCY(	r25,	r22),	r21	C
+	addq	r3,	r20,	r3	C
+	ADDSUB	r5,	r28,	r28	C
+	addq	r3,	r21,	r3	C
+	CMPCY(	r5,	r28),	r20	C
+	ADDSUB	r28,	r3,	r23	C
+	CMPCY(	r28,	r23),	r21	C
+	addq	r8,	r20,	r8	C
+	stq	r22,	0(rp)		C
+	stq	r23,	8(rp)		C
+	addq	r8,	r21,	r0	C return the final carry limb
+	ret	r31,	(r26),	1	C
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/gmp-mparam.h b/gmp/mpn/alpha/ev6/gmp-mparam.h
new file mode 100644
index 0000000000..e51d6b0d15
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/gmp-mparam.h
@@ -0,0 +1,209 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004, 2005, 2008-2010, 2014 Free
+Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64	/* limb width in bits, 8 * GMP_LIMB_BYTES */
+#define GMP_LIMB_BYTES 8	/* limb width in bytes */
+
+#define DIVEXACT_BY3_METHOD 0	/* override ../diveby3.asm */
+
+/* 500 MHz 21164 (agnesi.math.su.se) */
+/* FFT tuning limit = 20000000 */
+/* Generated by tuneup.c, 2014-03-14, gcc 3.3 */
+
+#define DIVREM_1_NORM_THRESHOLD              0  /* preinv always */
+#define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1P_METHOD                      2
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        10
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        21
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      7
+#define USE_PREINV_DIVREM_1                  1  /* preinv always */
+#define DIV_QR_1N_PI1_METHOD                 2
+#define DIV_QR_1_NORM_THRESHOLD              5
+#define DIV_QR_1_UNNORM_THRESHOLD            1
+#define DIV_QR_2_PI2_THRESHOLD               8
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD           20
+
+#define MUL_TOOM22_THRESHOLD                32
+#define MUL_TOOM33_THRESHOLD               117
+#define MUL_TOOM44_THRESHOLD               124
+#define MUL_TOOM6H_THRESHOLD               230
+#define MUL_TOOM8H_THRESHOLD               357
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     107
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      88
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     105
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     136
+
+#define SQR_BASECASE_THRESHOLD               0  /* always */
+#define SQR_TOOM2_THRESHOLD                 59
+#define SQR_TOOM3_THRESHOLD                123
+#define SQR_TOOM4_THRESHOLD                163
+#define SQR_TOOM6_THRESHOLD                333
+#define SQR_TOOM8_THRESHOLD                  0  /* always */
+
+#define MULMID_TOOM42_THRESHOLD             52
+
+#define MULMOD_BNM1_THRESHOLD               19
+#define SQRMOD_BNM1_THRESHOLD                5
+
+#define MUL_FFT_MODF_THRESHOLD             468  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    468, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     11, 5}, {     23, 6}, {     19, 7}, {     10, 6}, \
+    {     24, 7}, {     13, 6}, {     27, 7}, {     14, 6}, \
+    {     29, 7}, {     17, 6}, {     35, 7}, {     29, 8}, \
+    {     15, 7}, {     32, 8}, {     17, 7}, {     35, 8}, \
+    {     19, 7}, {     39, 8}, {     29, 9}, {     15, 8}, \
+    {     35, 9}, {     19, 8}, {     41, 9}, {     23, 8}, \
+    {     51, 9}, {     27, 8}, {     55, 9}, {     35, 8}, \
+    {     71, 9}, {     39,10}, {     23, 9}, {     55,10}, \
+    {     31, 9}, {     67,10}, {     39, 9}, {     79,10}, \
+    {     47, 9}, {     95,10}, {     55,11}, {     31,10}, \
+    {     79,11}, {     47,10}, {    103,12}, {     31,11}, \
+    {     63,10}, {    135,11}, {     79,10}, {    167,11}, \
+    {     95,10}, {    199,11}, {    111,12}, {     63,11}, \
+    {    143,10}, {    287, 9}, {    575,11}, {    159,10}, \
+    {    319,12}, {     95,11}, {    191,10}, {    383,11}, \
+    {    207,13}, {     63,12}, {    127,11}, {    255,10}, \
+    {    511,11}, {    271,10}, {    543,11}, {    287,10}, \
+    {    575,12}, {    159,11}, {    319,10}, {    639,11}, \
+    {    335,10}, {    671,11}, {    351,10}, {    703,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,12}, \
+    {    223,11}, {    447,13}, {    127,12}, {    255,11}, \
+    {    543,12}, {    287,11}, {    575,10}, {   1151,11}, \
+    {    607,12}, {    319,11}, {    671,12}, {    351,11}, \
+    {    703,13}, {    191,12}, {    383,11}, {    767,12}, \
+    {    415,11}, {    831,12}, {    447,14}, {    127,13}, \
+    {    255,12}, {    575,11}, {   1151,12}, {    607,13}, \
+    {    319,12}, {    735,13}, {    383,12}, {    767,11}, \
+    {   1535,12}, {    831,13}, {    447,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1087,13}, {    575,12}, \
+    {   1215,13}, {    639,12}, {   1343,13}, {    703,12}, \
+    {   1407,14}, {    383,13}, {    767,12}, {   1535,13}, \
+    {    831,12}, {   1663,13}, {    959,15}, {    255,14}, \
+    {    511,13}, {   1215,14}, {    639,13}, {   1407,14}, \
+    {    767,13}, {   1663,14}, {    895,13}, {   1855,15}, \
+    {    511,14}, {  16384,15}, {  32768,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 151
+#define MUL_FFT_THRESHOLD                 5760
+
+#define SQR_FFT_MODF_THRESHOLD             412  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    412, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     11, 5}, {     23, 6}, {     12, 5}, {     25, 6}, \
+    {     27, 7}, {     14, 6}, {     29, 7}, {     28, 8}, \
+    {     15, 7}, {     31, 8}, {     17, 7}, {     36, 8}, \
+    {     19, 7}, {     39, 8}, {     29, 9}, {     15, 8}, \
+    {     35, 9}, {     19, 8}, {     41, 9}, {     23, 8}, \
+    {     49, 9}, {     27,10}, {     15, 9}, {     39,10}, \
+    {     23, 9}, {     51,11}, {     15,10}, {     31, 9}, \
+    {     67,10}, {     39, 9}, {     79,10}, {     47, 9}, \
+    {     95,10}, {     55,11}, {     31,10}, {     79,11}, \
+    {     47,10}, {     95,12}, {     31,11}, {     63,10}, \
+    {    127, 9}, {    255,11}, {     79,10}, {    159, 9}, \
+    {    319,10}, {    167,11}, {     95,10}, {    191, 9}, \
+    {    383,11}, {    111,12}, {     63,11}, {    127,10}, \
+    {    271,11}, {    143,10}, {    287, 9}, {    575,10}, \
+    {    303,11}, {    159,10}, {    319,12}, {     95,11}, \
+    {    191,10}, {    383,11}, {    207,13}, {     63,12}, \
+    {    127,11}, {    255,10}, {    511,11}, {    271,10}, \
+    {    543,11}, {    287,10}, {    575,11}, {    303,12}, \
+    {    159,11}, {    319,10}, {    639,11}, {    335,10}, \
+    {    671,11}, {    351,10}, {    703,11}, {    367,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,12}, \
+    {    223,11}, {    447,13}, {    127,12}, {    255,11}, \
+    {    543,12}, {    287,11}, {    575,10}, {   1151,11}, \
+    {    607,12}, {    319,11}, {    639,10}, {   1279,11}, \
+    {    671,12}, {    351,11}, {    703,13}, {    191,12}, \
+    {    383,11}, {    767,12}, {    415,11}, {    831,12}, \
+    {    447,11}, {    895,12}, {    479,14}, {    127,13}, \
+    {    255,12}, {    575,11}, {   1151,12}, {    607,13}, \
+    {    319,12}, {    703,11}, {   1407,12}, {    735,13}, \
+    {    383,12}, {    831,13}, {    447,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1087,13}, {    575,12}, \
+    {   1151,13}, {    639,12}, {   1279,13}, {    703,12}, \
+    {   1407,14}, {    383,13}, {    767,12}, {   1535,13}, \
+    {    831,12}, {   1663,13}, {    959,15}, {    255,14}, \
+    {    511,13}, {   1215,14}, {    639,13}, {   1407,14}, \
+    {    767,13}, {   1663,14}, {    895,13}, {   1791,15}, \
+    {    511,14}, {  16384,15}, {  32768,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 159
+#define SQR_FFT_THRESHOLD                 5056
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                 100
+#define MULLO_MUL_N_THRESHOLD            11355
+
+#define DC_DIV_QR_THRESHOLD                124
+#define DC_DIVAPPR_Q_THRESHOLD             438
+#define DC_BDIV_QR_THRESHOLD               153
+#define DC_BDIV_Q_THRESHOLD                318
+
+#define INV_MULMOD_BNM1_THRESHOLD           62
+#define INV_NEWTON_THRESHOLD               384
+#define INV_APPR_THRESHOLD                 402
+
+#define BINV_NEWTON_THRESHOLD              381
+#define REDC_1_TO_REDC_N_THRESHOLD         110
+
+#define MU_DIV_QR_THRESHOLD               1752
+#define MU_DIVAPPR_Q_THRESHOLD            1895
+#define MUPI_DIV_QR_THRESHOLD              174
+#define MU_BDIV_QR_THRESHOLD              1387
+#define MU_BDIV_Q_THRESHOLD               1787
+
+#define POWM_SEC_TABLE  1,13,66,82,579
+
+#define MATRIX22_STRASSEN_THRESHOLD         15
+#define HGCD_THRESHOLD                     318
+#define HGCD_APPR_THRESHOLD                363
+#define HGCD_REDUCE_THRESHOLD             2384
+#define GCD_DC_THRESHOLD                  2504
+#define GCDEXT_DC_THRESHOLD                671
+#define JACOBI_BASE_METHOD                   3
+
+#define GET_STR_DC_THRESHOLD                14
+#define GET_STR_PRECOMPUTE_THRESHOLD        25
+#define SET_STR_DC_THRESHOLD              3754
+#define SET_STR_PRECOMPUTE_THRESHOLD      8097
+
+#define FAC_DSC_THRESHOLD                  951
+#define FAC_ODD_THRESHOLD                   24
diff --git a/gmp/mpn/alpha/ev6/mod_1_4.asm b/gmp/mpn/alpha/ev6/mod_1_4.asm
new file mode 100644
index 0000000000..836de07c0f
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/mod_1_4.asm
@@ -0,0 +1,337 @@
+dnl Alpha mpn_mod_1s_4p
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C  * Optimise.  2.75 c/l should be possible.
+C  * Write a proper mpn_mod_1s_4p_cps.  The code below was compiler generated.
+C  * Optimise feed-in code, starting the sw pipeline in switch code.
+C  * Shorten software pipeline.  The mul instructions are scheduled too far
+C    from their users.  Fixing this will allow us to use fewer registers.
+C  * If we cannot reduce register usage, perhaps write a small-n basecase.
+C  * Does this work for PIC?
+
+C      cycles/limb
+C EV4:     ?
+C EV5:    23
+C EV6:     3
+
+define(`ap',     `r16')	C 1st argument: limb vector; moved to the vector end, then walked downwards
+define(`n',      `r17')	C 2nd argument: limb count, reused as the loop counter
+define(`pl',     `r24')	C low word of the partial sum being accumulated
+define(`ph',     `r25')	C high word of the partial sum being accumulated
+define(`rl',     `r6')	C low word of the running two-limb residue
+define(`rh',     `r7')	C high word of the running two-limb residue
+define(`B1modb', `r1')	C constant loaded from 16(r19)
+define(`B2modb', `r2')	C constant loaded from 24(r19)
+define(`B3modb', `r3')	C constant loaded from 32(r19)
+define(`B4modb', `r4')	C constant loaded from 40(r19)
+define(`B5modb', `r5')	C constant loaded from 48(r19)
+C r18 = divisor b (used in the final reduction), r19 = table of precomputed values
+ASM_START()
+PROLOGUE(mpn_mod_1s_4p)
+	lda	r30, -64(r30)		C allocate a 64-byte stack frame
+	stq	r9, 8(r30)		C save r9-r13, which are overwritten below ...
+	ldq	B1modb, 16(r19)		C ... interleaved with loading the five table constants
+	stq	r10, 16(r30)
+	ldq	B2modb, 24(r19)
+	stq	r11, 24(r30)
+	ldq	B3modb, 32(r19)
+	stq	r12, 32(r30)
+	ldq	B4modb, 40(r19)
+	stq	r13, 40(r30)
+	ldq	B5modb, 48(r19)
+	s8addq	n, ap, ap		C point ap at vector end
+
+	and	n, 3, r0		C r0 = n mod 4 selects the feed-in case
+	lda	n, -4(n)		C n -= 4
+	beq	r0, L(b0)		C n mod 4 == 0
+	lda	r6, -2(r0)		C r6 is only scratch here: (n mod 4) - 2
+	blt	r6, L(b1)		C n mod 4 == 1
+	beq	r6, L(b2)		C n mod 4 == 2; fall through when it is 3
+
+L(b3):	ldq	r21, -16(ap)		C feed-in for n mod 4 == 3: take the top three limbs
+	ldq	r22, -8(ap)
+	ldq	r20, -24(ap)
+	mulq	r21, B1modb, r8
+	umulh	r21, B1modb, r12
+	mulq	r22, B2modb, r9
+	umulh	r22, B2modb, r13
+	addq	r8, r20, pl
+	cmpult	pl, r8, r0		C r0 = carry out of the low-word add
+	addq	r0, r12, ph
+	addq	r9, pl, rl
+	cmpult	rl, r9, r0
+	addq	r13, ph, ph
+	addq	r0, ph, rh		C rh:rl = r20 + r21*B1modb + r22*B2modb
+	lda	ap, -56(ap)		C so that 0..24(ap) at L(com) address the next four limbs down
+	br	L(com)
+
+L(b0):	ldq	r21, -24(ap)		C feed-in for n mod 4 == 0: take the top four limbs
+	ldq	r22, -16(ap)
+	ldq	r23, -8(ap)
+	ldq	r20, -32(ap)
+	mulq	r21, B1modb, r8
+	umulh	r21, B1modb, r12
+	mulq	r22, B2modb, r9
+	umulh	r22, B2modb, r13
+	mulq	r23, B3modb, r10
+	umulh	r23, B3modb, r27
+	addq	r8, r20, pl
+	cmpult	pl, r8, r0
+	addq	r0, r12, ph
+	addq	r9, pl, pl
+	cmpult	pl, r9, r0
+	addq	r13, ph, ph
+	addq	r0, ph, ph
+	addq	r10, pl, rl
+	cmpult	rl, r10, r0
+	addq	r27, ph, ph
+	addq	r0, ph, rh		C rh:rl = r20 + r21*B1modb + r22*B2modb + r23*B3modb
+	lda	ap, -64(ap)		C so that 0..24(ap) at L(com) address the next four limbs down
+	br	L(com)
+
+L(b1):	bis	r31, r31, rh		C feed-in for n mod 4 == 1: rh = 0
+	ldq	rl, -8(ap)		C rl = top limb
+	lda	ap, -40(ap)
+	br	L(com)
+
+L(b2):	ldq	rh, -8(ap)		C feed-in for n mod 4 == 2: rh = top limb
+	ldq	rl, -16(ap)		C rl = second limb from the top
+	lda	ap, -48(ap)
+
+L(com):	ble	n, L(ed3)		C no limbs beyond the feed-in ones: go to the final fold
+	ldq	r21, 8(ap)
+	ldq	r22, 16(ap)
+	ldq	r23, 24(ap)
+	ldq	r20, 0(ap)
+	lda	n, -4(n)
+	lda	ap, -32(ap)
+	mulq	r21, B1modb, r8		C start the products for this group of four limbs
+	umulh	r21, B1modb, r12
+	mulq	r22, B2modb, r9
+	umulh	r22, B2modb, r13
+	mulq	r23, B3modb, r10
+	umulh	r23, B3modb, r27
+	mulq	rl, B4modb, r11		C the residue is folded in via B4modb (rl) and B5modb (rh)
+	umulh	rl, B4modb, r28
+	ble	n, L(ed2)		C that was the last group: skip the loop
+
+	ALIGN(16)
+L(top):	ldq	r21, 8(ap)		C load the next four limbs while summing the previous products
+	mulq	rh, B5modb, rl		C rl = low(rh*B5modb); the old rl was already consumed with B4modb
+	addq	r8, r20, pl
+	ldq	r22, 16(ap)
+	cmpult	pl, r8, r0
+	umulh	rh, B5modb, rh		C rh = high(rh*B5modb)
+	ldq	r23, 24(ap)
+	addq	r0, r12, ph
+	addq	r9, pl, pl
+	mulq	r21, B1modb, r8
+	cmpult	pl, r9, r0
+	addq	r13, ph, ph
+	umulh	r21, B1modb, r12
+	lda	ap, -32(ap)
+	addq	r0, ph, ph
+	addq	r10, pl, pl
+	mulq	r22, B2modb, r9
+	cmpult	pl, r10, r0
+	addq	r27, ph, ph
+	addq	r11, pl, pl
+	umulh	r22, B2modb, r13
+	addq	r0, ph, ph
+	cmpult	pl, r11, r0
+	addq	r28, ph, ph
+	mulq	r23, B3modb, r10
+	ldq	r20, 32(ap)		C ap was already decremented: this is 0(old ap)
+	addq	pl, rl, rl
+	umulh	r23, B3modb, r27
+	addq	r0, ph, ph
+	cmpult	rl, pl, r0
+	mulq	rl, B4modb, r11		C the new residue low word feeds the next round
+	addq	ph, rh, rh
+	umulh	rl, B4modb, r28
+	addq	r0, rh, rh		C rh:rl = new two-limb residue
+	lda	n, -4(n)
+	bgt	n, L(top)
+
+L(ed2):	mulq	rh, B5modb, rl		C wind-down: same summation as the loop, without further loads
+	addq	r8, r20, pl
+	umulh	rh, B5modb, rh
+	cmpult	pl, r8, r0
+	addq	r0, r12, ph
+	addq	r9, pl, pl
+	cmpult	pl, r9, r0
+	addq	r13, ph, ph
+	addq	r0, ph, ph
+	addq	r10, pl, pl
+	cmpult	pl, r10, r0
+	addq	r27, ph, ph
+	addq	r11, pl, pl
+	addq	r0, ph, ph
+	cmpult	pl, r11, r0
+	addq	r28, ph, ph
+	addq	pl, rl, rl
+	addq	r0, ph, ph
+	cmpult	rl, pl, r0
+	addq	ph, rh, rh
+	addq	r0, rh, rh
+
+L(ed3):	mulq	rh, B1modb, r8		C final fold: rh:rl = rl + rh*B1modb
+	umulh	rh, B1modb, rh
+	addq	r8, rl, rl
+	cmpult	rl, r8, r0
+	addq	r0, rh, rh
+
+	ldq	r24, 8(r19)		C cnt
+	sll	rh, r24, rh
+	subq	r31, r24, r25		C r25 = -cnt; shifts use only the low 6 bits of the count
+	srl	rl, r25, r2		C r2 = bits of rl that move up into rh
+	sll	rl, r24, rl
+	or	r2, rh, rh		C rh:rl is now the residue shifted left by cnt
+
+	ldq	r23, 0(r19)		C bi
+	mulq	rh, r23, r8
+	umulh	rh, r23, r9
+	addq	rh, 1, r7		C r7 is rh itself: rh = rh + 1
+	addq	r8, rl, r8		C ql
+	cmpult	r8, rl, r0		C r0 = carry out of ql
+	addq	r9, r7, r9
+	addq	r0, r9, r9		C qh
+	mulq	r9, r18, r21		C qh * b
+	subq	rl, r21, rl		C candidate remainder, computed mod 2^64
+	cmpult	r8, rl, r0		C rl > ql
+	negq	r0, r0			C turn the 0/1 flag into a 0/all-ones mask
+	and	r0, r18, r0
+	addq	rl, r0, rl		C add b back when rl > ql
+	cmpule	r18, rl, r0		C rl >= b
+	negq	r0, r0
+	and	r0, r18, r0
+	subq	rl, r0, rl		C subtract b when rl >= b
+
+	srl	rl, r24, r0		C return value: remainder shifted right by cnt
+
+	ldq	r9, 8(r30)		C restore the saved registers and release the frame
+	ldq	r10, 16(r30)
+	ldq	r11, 24(r30)
+	ldq	r12, 32(r30)
+	ldq	r13, 40(r30)
+	lda	r30, 64(r30)
+	ret	r31, (r26), 1
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1s_4p_cps,gp)
+	lda	r30, -32(r30)		C 32-byte stack frame
+	stq	r26, 0(r30)		C save the return address, the jsr below overwrites r26
+	stq	r9, 8(r30)		C save r9-r11, used to keep values live across the call
+	stq	r10, 16(r30)
+	stq	r11, 24(r30)
+	mov	r16, r11		C r11 = output table pointer (1st argument)
+	LEA(	r4, __clz_tab)
+	lda	r10, 65(r31)		C from here to the sll: compute the shift count r10 from r17 (2nd argument)
+	cmpbge	r31, r17, r1		C   using a byte compare and two __clz_tab lookups
+	srl	r1, 1, r1		C   NOTE(review): presumably r10 = number of leading zero bits of r17 - confirm
+	xor	r1, 127, r1
+	addq	r1, r4, r1
+	ldq_u	r2, 0(r1)
+	extbl	r2, r1, r2		C r2 = first table byte
+	s8subq	r2, 7, r2		C r2 = 8*r2 - 7
+	srl	r17, r2, r3
+	subq	r10, r2, r10
+	addq	r3, r4, r3
+	ldq_u	r1, 0(r3)
+	extbl	r1, r3, r1		C r1 = second table byte
+	subq	r10, r1, r10		C r10 = cnt
+	sll	r17, r10, r9		C r9 = b << cnt
+	mov	r9, r16
+	jsr	r26, mpn_invert_limb	C r0 = mpn_invert_limb(b << cnt)
+	ldah	r29, 0(r26)		C gp reload after the call, completed by the lda r29 below
+	subq	r31, r10, r2		C r2 = -cnt
+	lda	r1, 1(r31)
+	sll	r1, r10, r1		C r1 = 1 << cnt
+	subq	r31, r9, r3		C r3 = -(b << cnt)
+	srl	r0, r2, r2		C r2 = bi >> ((64 - cnt) mod 64); shift counts use the low 6 bits only
+	ldq	r26, 0(r30)		C restore the return address
+	bis	r2, r1, r2
+	lda	r29, 0(r29)
+	stq	r0, 0(r11)		C table[0] = bi
+	stq	r10, 8(r11)		C table[1] = cnt
+	mulq	r2, r3, r2		C r2 = (r2 | r1) * r3, first value of the chain
+	srl	r2, r10, r3
+	umulh	r2, r0, r1		C each step below: r = (~high(x*bi) - x) * (b << cnt), ...
+	stq	r3, 16(r11)		C table[2] = r2 >> cnt (read as B1modb by mpn_mod_1s_4p)
+	mulq	r2, r0, r3
+	ornot	r31, r1, r1
+	subq	r1, r2, r1
+	mulq	r1, r9, r1
+	addq	r1, r9, r2
+	cmpule	r1, r3, r3		C ... then r += b << cnt when r > low(x*bi)
+	cmoveq	r3, r2, r1
+	srl	r1, r10, r3
+	umulh	r1, r0, r2		C same step again, starting from r1
+	stq	r3, 24(r11)		C table[3] = r1 >> cnt (read as B2modb)
+	mulq	r1, r0, r3
+	ornot	r31, r2, r2
+	subq	r2, r1, r2
+	mulq	r2, r9, r2
+	addq	r2, r9, r1
+	cmpule	r2, r3, r3
+	cmoveq	r3, r1, r2
+	srl	r2, r10, r1
+	umulh	r2, r0, r3		C same step again, starting from r2
+	stq	r1, 32(r11)		C table[4] = r2 >> cnt (read as B3modb)
+	mulq	r2, r0, r1
+	ornot	r31, r3, r3
+	subq	r3, r2, r3
+	mulq	r3, r9, r3
+	addq	r3, r9, r2
+	cmpule	r3, r1, r1
+	cmoveq	r1, r2, r3
+	srl	r3, r10, r2
+	umulh	r3, r0, r1		C same step again, starting from r3
+	stq	r2, 40(r11)		C table[5] = r3 >> cnt (read as B4modb)
+	mulq	r3, r0, r0		C bi in r0 is no longer needed after this
+	ornot	r31, r1, r1
+	subq	r1, r3, r1
+	mulq	r1, r9, r1
+	addq	r1, r9, r9
+	cmpule	r1, r0, r0
+	cmoveq	r0, r9, r1
+	ldq	r9, 8(r30)		C restores are interleaved with the last store
+	srl	r1, r10, r1
+	ldq	r10, 16(r30)
+	stq	r1, 48(r11)		C table[6] = r1 >> cnt (read as B5modb)
+	ldq	r11, 24(r30)
+	lda	r30, 32(r30)
+	ret	r31, (r26), 1
+EPILOGUE()
diff --git a/gmp/mpn/alpha/ev6/mul_1.asm b/gmp/mpn/alpha/ev6/mul_1.asm
new file mode 100644
index 0000000000..8ee19cd429
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/mul_1.asm
@@ -0,0 +1,496 @@
+dnl  Alpha ev6 mpn_mul_1 -- Multiply a limb vector with a limb and store the
+dnl  result in a second limb vector.
+
+dnl  Copyright 2000, 2001, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	r16
+C s1_ptr	r17
+C size		r18
+C s2_limb	r19
+
+C This code runs at 2.25 cycles/limb on EV6.
+
+C This code was written in close cooperation with ev6 pipeline expert
+C Steve Root.  Any errors are tege's fault, though.
+
+C Code structure:
+
+C  code for n < 8
+C  code for n >= 8	code for (n mod 8)
+C			code for (n div 8)	feed-in code
+C						8-way unrolled loop
+C						wind-down code
+
+C Some notes about unrolled loop:
+C
+C   r1-r8     multiplies and workup
+C   r21-r28   multiplies and workup
+C   r9-r12    loads
+C   r0       -1
+C   r20,r29,r13-r15  scramble
+C
+C   We're doing 7 of the 8 carry propagations with a br fixup code and 1 with a
+C   put-the-carry-into-hi.  The idea is that these branches are very rarely
+C   taken, and since a non-taken branch consumes no resources, that is better
+C   than an addq.
+C
+C   Software pipeline: a load in cycle #09, feeds a mul in cycle #16, feeds an
+C   add NEXT cycle #09 which feeds a store in NEXT cycle #02
+
+C The code could use some further work:
+C   1. Speed up really small multiplies.  The default alpha/mul_1.asm code is
+C      faster than this for size < 3.
+C   2. Improve feed-in code, perhaps with the equivalent of switch(n%8) unless
+C      that is too costly.
+C   3. Consider using 4-way unrolling, even if that runs slower.
+C   4. Reduce register usage.  In particular, try to avoid using r29.
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	cmpult	r18,	8,	r1	C r1 = (size < 8)
+	beq	r1,	$Large	C size >= 8: use the unrolled code
+$Lsmall:
+	ldq	r2,0(r17)	C r2 = s1_limb
+	lda	r18,-1(r18)	C size--
+	mulq	r2,r19,r3	C r3 = prod_low
+	bic	r31,r31,r4	C clear cy_limb
+	umulh	r2,r19,r0	C r0 = prod_high
+	beq	r18,$Le1a	C jump if size was == 1
+	ldq	r2,8(r17)	C r2 = s1_limb
+	lda	r18,-1(r18)	C size--
+	stq	r3,0(r16)
+	beq	r18,$Le2a	C jump if size was == 2
+	ALIGN(8)
+$Lopa:	mulq	r2,r19,r3	C r3 = prod_low
+	addq	r4,r0,r0	C cy_limb = cy_limb + 'cy'
+	lda	r18,-1(r18)	C size--
+	umulh	r2,r19,r4	C r4 = cy_limb
+	ldq	r2,16(r17)	C r2 = s1_limb
+	lda	r17,8(r17)	C s1_ptr++
+	addq	r3,r0,r3	C r3 = cy_limb + prod_low
+	stq	r3,8(r16)
+	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
+	lda	r16,8(r16)	C res_ptr++
+	bne	r18,$Lopa
+
+$Le2a:	mulq	r2,r19,r3	C r3 = prod_low
+	addq	r4,r0,r0	C cy_limb = cy_limb + 'cy'
+	umulh	r2,r19,r4	C r4 = cy_limb
+	addq	r3,r0,r3	C r3 = cy_limb + prod_low
+	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
+	stq	r3,8(r16)
+	addq	r4,r0,r0	C cy_limb = prod_high + cy
+	ret	r31,(r26),1	C return cy_limb in r0
+$Le1a:	stq	r3,0(r16)
+	ret	r31,(r26),1	C size == 1: r0 still holds prod_high
+
+$Large:
+	lda	r30,	-224(r30)	C 224-byte frame; only offsets 0..64 are used below
+	stq	r26,	0(r30)	C r26 and r29 are used as scratch below, so they are saved
+	stq	r9,	8(r30)	C together with r9-r15
+	stq	r10,	16(r30)
+	stq	r11,	24(r30)
+	stq	r12,	32(r30)
+	stq	r13,	40(r30)
+	stq	r14,	48(r30)
+	stq	r15,	56(r30)
+	stq	r29,	64(r30)
+
+	and	r18,	7,	r20	C count for the first loop, 0-7
+	srl	r18,	3,	r18	C count for unrolled loop
+	bis	r31,	r31,	r21	C r21 = 0: carry limb entering the unrolled code
+	beq	r20,	$L_8_or_more	C skip first loop
+
+$L_9_or_more:
+	ldq	r2,0(r17)	C r2 = s1_limb
+	lda	r17,8(r17)	C s1_ptr++
+	lda	r20,-1(r20)	C size--
+	mulq	r2,r19,r3	C r3 = prod_low
+	umulh	r2,r19,r21	C r21 = prod_high
+	beq	r20,$Le1b	C jump if size was == 1
+	bis	r31, r31, r0	C FIXME: shouldn't need this
+	ldq	r2,0(r17)	C r2 = s1_limb
+	lda	r17,8(r17)	C s1_ptr++
+	lda	r20,-1(r20)	C size--
+	stq	r3,0(r16)
+	lda	r16,8(r16)	C res_ptr++
+	beq	r20,$Le2b	C jump if size was == 2
+	ALIGN(8)
+$Lopb:	mulq	r2,r19,r3	C r3 = prod_low
+	addq	r21,r0,r0	C cy_limb = cy_limb + 'cy'
+	lda	r20,-1(r20)	C size--
+	umulh	r2,r19,r21	C r21 = prod_high
+	ldq	r2,0(r17)	C r2 = s1_limb
+	lda	r17,8(r17)	C s1_ptr++
+	addq	r3,r0,r3	C r3 = cy_limb + prod_low
+	stq	r3,0(r16)
+	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
+	lda	r16,8(r16)	C res_ptr++
+	bne	r20,$Lopb
+
+$Le2b:	mulq	r2,r19,r3	C r3 = prod_low
+	addq	r21,r0,r0	C cy_limb = cy_limb + 'cy'
+	umulh	r2,r19,r21	C r21 = prod_high
+	addq	r3,r0,r3	C r3 = cy_limb + prod_low
+	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
+	stq	r3,0(r16)
+	lda	r16,8(r16)	C res_ptr++
+	addq	r21,r0,r21	C cy_limb = prod_high + cy
+	br	r31,	$L_8_or_more	C r21 = carry limb out of the first loop
+$Le1b:	stq	r3,0(r16)
+	lda	r16,8(r16)	C res_ptr++
+
+$L_8_or_more:
+	lda	r0,	-1(r31)		C put -1 in r0, for tricky loop control
+	lda	r17,	-32(r17)	C L1 bookkeeping
+	lda	r18,	-1(r18)		C decrement count
+
+	ldq	r9,	32(r17)		C L1
+	ldq	r10,	40(r17)		C L1
+	mulq	r9,	r19,	r22	C U1 #07
+	ldq	r11,	48(r17)		C L1
+	umulh	r9,	r19,	r23	C U1 #08
+	ldq	r12,	56(r17)		C L1
+	mulq	r10,	r19,	r24	C U1 #09
+	ldq	r9,	64(r17)		C L1
+
+	lda	r17,	64(r17)		C L1 bookkeeping
+
+	umulh	r10,	r19,	r25	C U1 #11
+	mulq	r11,	r19,	r26	C U1 #12
+	umulh	r11,	r19,	r27	C U1 #13
+	mulq	r12,	r19,	r28	C U1 #14
+	ldq	r10,	8(r17)		C L1
+	umulh	r12,	r19,	r1	C U1 #15
+	ldq	r11,	16(r17)		C L1
+	mulq	r9,	r19,	r2	C U1 #16
+	ldq	r12,	24(r17)		C L1
+	umulh	r9,	r19,	r3	C U1 #17
+	addq	r21,	r22,	r13	C L1 mov
+	mulq	r10,	r19,	r4	C U1 #18
+	addq	r23,	r24,	r22	C L0 sum 2 mul's
+	cmpult	r13,	r21,	r14	C L1 carry from sum
+	bgt	r18,	$L_16_or_more	C at least two groups of 8 limbs remain
+
+	cmpult	r22,	r24,	r24	C U0 carry from sum
+	umulh	r10,	r19,	r5	C U1 #02
+	addq	r25,	r26,	r23	C U0 sum 2 mul's
+	mulq	r11,	r19,	r6	C U1 #03
+	cmpult	r23,	r26,	r25	C U0 carry from sum
+	umulh	r11,	r19,	r7	C U1 #04
+	addq	r27,	r28,	r28	C U0 sum 2 mul's
+	mulq	r12,	r19,	r8	C U1 #05
+	cmpult	r28,	r27,	r15	C L0 carry from sum
+	lda	r16,	32(r16)		C L1 bookkeeping
+	addq	r13,	r31,	r13	C U0 start carry cascade
+	umulh	r12,	r19,	r21	C U1 #06
+	br	r31,	$ret0c	C exactly one group of 8: finish in the wind-down code
+
+$L_16_or_more:
+C ---------------------------------------------------------------
+	subq	r18,1,r18
+	cmpult	r22,	r24,	r24	C U0 carry from sum
+	ldq	r9,	32(r17)		C L1
+
+	umulh	r10,	r19,	r5	C U1 #02
+	addq	r25,	r26,	r23	C U0 sum 2 mul's
+	mulq	r11,	r19,	r6	C U1 #03
+	cmpult	r23,	r26,	r25	C U0 carry from sum
+	umulh	r11,	r19,	r7	C U1 #04
+	addq	r27,	r28,	r28	C U0 sum 2 mul's
+	mulq	r12,	r19,	r8	C U1 #05
+	cmpult	r28,	r27,	r15	C L0 carry from sum
+	lda	r16,	32(r16)		C L1 bookkeeping
+	addq	r13,	r31,	r13	C U0 start carry cascade
+
+	umulh	r12,	r19,	r21	C U1 #06
+C	beq	r13,	$fix0w		C U0
+$ret0w:	addq	r22,	r14,	r26	C L0
+	ldq	r10,	40(r17)		C L1
+
+	mulq	r9,	r19,	r22	C U1 #07
+	beq	r26,	$fix1w		C U0
+$ret1w:	addq	r23,	r24,	r27	C L0
+	ldq	r11,	48(r17)		C L1
+
+	umulh	r9,	r19,	r23	C U1 #08
+	beq	r27,	$fix2w		C U0
+$ret2w:	addq	r28,	r25,	r28	C L0
+	ldq	r12,	56(r17)		C L1
+
+	mulq	r10,	r19,	r24	C U1 #09
+	beq	r28,	$fix3w		C U0
+$ret3w:	addq	r1,	r2,	r20	C L0 sum 2 mul's
+	ldq	r9,	64(r17)		C L1
+
+	addq	r3,	r4,	r2	C L0 #10 2 mul's
+	lda	r17,	64(r17)		C L1 bookkeeping
+	cmpult	r20,	r1,	r29	C U0 carry from sum
+
+	umulh	r10,	r19,	r25	C U1 #11
+	cmpult	r2,	r4,	r4	C U0 carry from sum
+	stq	r13,	-32(r16)	C L0
+	stq	r26,	-24(r16)	C L1
+
+	mulq	r11,	r19,	r26	C U1 #12
+	addq	r5,	r6,	r14	C U0 sum 2 mul's
+	stq	r27,	-16(r16)	C L0
+	stq	r28,	-8(r16)		C L1
+
+	umulh	r11,	r19,	r27	C U1 #13
+	cmpult	r14,	r6,	r3	C U0 carry from sum
+C could do cross-jumping here:
+C	bra	$L_middle_of_unrolled_loop
+	mulq	r12,	r19,	r28	C U1 #14
+	addq	r7,	r3,	r5	C L0 eat carry
+	addq	r20,	r15,	r20	C U0 carry cascade
+	ldq	r10,	8(r17)		C L1
+
+	umulh	r12,	r19,	r1	C U1 #15
+	beq	r20,	$fix4		C U0
+$ret4w:	addq	r2,	r29,	r6	C L0
+	ldq	r11,	16(r17)		C L1
+
+	mulq	r9,	r19,	r2	C U1 #16
+	beq	r6,	$fix5		C U0
+$ret5w:	addq	r14,	r4,	r7	C L0
+	ldq	r12,	24(r17)		C L1
+
+	umulh	r9,	r19,	r3	C U1 #17
+	beq	r7,	$fix6		C U0
+$ret6w:	addq	r5,	r8,	r8	C L0 sum 2
+	addq	r21,	r22,	r13	C L1 sum 2 mul's
+
+	mulq	r10,	r19,	r4	C U1 #18
+	addq	r23,	r24,	r22	C L0 sum 2 mul's
+	cmpult	r13,	r21,	r14	C L1 carry from sum
+	ble	r18,	$Lend		C U0
+C ---------------------------------------------------------------
+	ALIGN(16)
+$Loop:
+	umulh	r0,	r18,	r18	C U1 #01 decrement r18!
+	cmpult	r8,	r5,	r29	C L0 carry from last bunch
+	cmpult	r22,	r24,	r24	C U0 carry from sum
+	ldq	r9,	32(r17)		C L1
+
+	umulh	r10,	r19,	r5	C U1 #02
+	addq	r25,	r26,	r23	C U0 sum 2 mul's
+	stq	r20,	0(r16)		C L0
+	stq	r6,	8(r16)		C L1
+
+	mulq	r11,	r19,	r6	C U1 #03
+	cmpult	r23,	r26,	r25	C U0 carry from sum
+	stq	r7,	16(r16)		C L0
+	stq	r8,	24(r16)		C L1
+
+	umulh	r11,	r19,	r7	C U1 #04
+	bis	r31,	r31,	r31	C L0 st slosh
+	bis	r31,	r31,	r31	C L1 st slosh
+	addq	r27,	r28,	r28	C U0 sum 2 mul's
+
+	mulq	r12,	r19,	r8	C U1 #05
+	cmpult	r28,	r27,	r15	C L0 carry from sum
+	lda	r16,	64(r16)		C L1 bookkeeping
+	addq	r13,	r29,	r13	C U0 start carry cascade
+
+	umulh	r12,	r19,	r21	C U1 #06
+	beq	r13,	$fix0		C U0
+$ret0:	addq	r22,	r14,	r26	C L0
+	ldq	r10,	40(r17)		C L1
+
+	mulq	r9,	r19,	r22	C U1 #07
+	beq	r26,	$fix1		C U0
+$ret1:	addq	r23,	r24,	r27	C L0
+	ldq	r11,	48(r17)		C L1
+
+	umulh	r9,	r19,	r23	C U1 #08
+	beq	r27,	$fix2		C U0
+$ret2:	addq	r28,	r25,	r28	C L0
+	ldq	r12,	56(r17)		C L1
+
+	mulq	r10,	r19,	r24	C U1 #09
+	beq	r28,	$fix3		C U0
+$ret3:	addq	r1,	r2,	r20	C L0 sum 2 mul's
+	ldq	r9,	64(r17)		C L1
+
+	addq	r3,	r4,	r2	C L0 #10 2 mul's
+	bis	r31,	r31,	r31	C U1 mul hole
+	lda	r17,	64(r17)		C L1 bookkeeping
+	cmpult	r20,	r1,	r29	C U0 carry from sum
+
+	umulh	r10,	r19,	r25	C U1 #11
+	cmpult	r2,	r4,	r4	C U0 carry from sum
+	stq	r13,	-32(r16)	C L0
+	stq	r26,	-24(r16)	C L1
+
+	mulq	r11,	r19,	r26	C U1 #12
+	addq	r5,	r6,	r14	C U0 sum 2 mul's
+	stq	r27,	-16(r16)	C L0
+	stq	r28,	-8(r16)		C L1
+
+	umulh	r11,	r19,	r27	C U1 #13
+	bis	r31,	r31,	r31	C L0 st slosh
+	bis	r31,	r31,	r31	C L1 st slosh
+	cmpult	r14,	r6,	r3	C U0 carry from sum
+$L_middle_of_unrolled_loop:
+	mulq	r12,	r19,	r28	C U1 #14
+	addq	r7,	r3,	r5	C L0 eat carry
+	addq	r20,	r15,	r20	C U0 carry cascade
+	ldq	r10,	8(r17)		C L1
+
+	umulh	r12,	r19,	r1	C U1 #15
+	beq	r20,	$fix4		C U0
+$ret4:	addq	r2,	r29,	r6	C L0
+	ldq	r11,	16(r17)		C L1
+
+	mulq	r9,	r19,	r2	C U1 #16
+	beq	r6,	$fix5		C U0
+$ret5:	addq	r14,	r4,	r7	C L0
+	ldq	r12,	24(r17)		C L1
+
+	umulh	r9,	r19,	r3	C U1 #17
+	beq	r7,	$fix6		C U0
+$ret6:	addq	r5,	r8,	r8	C L0 sum 2
+	addq	r21,	r22,	r13	C L1 sum 2 mul's
+
+	mulq	r10,	r19,	r4	C U1 #18
+	addq	r23,	r24,	r22	C L0 sum 2 mul's
+	cmpult	r13,	r21,	r14	C L1 carry from sum
+	bgt	r18,	$Loop		C U0
+C ---------------------------------------------------------------
+$Lend:
+	cmpult	r8,	r5,	r29	C L0 carry from last bunch
+	cmpult	r22,	r24,	r24	C U0 carry from sum
+
+	umulh	r10,	r19,	r5	C U1 #02
+	addq	r25,	r26,	r23	C U0 sum 2 mul's
+	stq	r20,	0(r16)		C L0
+	stq	r6,	8(r16)		C L1
+
+	mulq	r11,	r19,	r6	C U1 #03
+	cmpult	r23,	r26,	r25	C U0 carry from sum
+	stq	r7,	16(r16)		C L0
+	stq	r8,	24(r16)		C L1
+
+	umulh	r11,	r19,	r7	C U1 #04
+	addq	r27,	r28,	r28	C U0 sum 2 mul's
+
+	mulq	r12,	r19,	r8	C U1 #05
+	cmpult	r28,	r27,	r15	C L0 carry from sum
+	lda	r16,	64(r16)		C L1 bookkeeping
+	addq	r13,	r29,	r13	C U0 start carry cascade
+
+	umulh	r12,	r19,	r21	C U1 #06
+	beq	r13,	$fix0c		C U0
+$ret0c:	addq	r22,	r14,	r26	C L0
+	beq	r26,	$fix1c		C U0
+$ret1c:	addq	r23,	r24,	r27	C L0
+	beq	r27,	$fix2c		C U0
+$ret2c:	addq	r28,	r25,	r28	C L0
+	beq	r28,	$fix3c		C U0
+$ret3c:	addq	r1,	r2,	r20	C L0 sum 2 mul's
+	addq	r3,	r4,	r2	C L0 #10 2 mul's
+	lda	r17,	64(r17)		C L1 bookkeeping
+	cmpult	r20,	r1,	r29	C U0 carry from sum
+	cmpult	r2,	r4,	r4	C U0 carry from sum
+	stq	r13,	-32(r16)	C L0
+	stq	r26,	-24(r16)	C L1
+	addq	r5,	r6,	r14	C U0 sum 2 mul's
+	stq	r27,	-16(r16)	C L0
+	stq	r28,	-8(r16)		C L1
+	cmpult	r14,	r6,	r3	C U0 carry from sum
+	addq	r7,	r3,	r5	C L0 eat carry
+	addq	r20,	r15,	r20	C U0 carry cascade
+	beq	r20,	$fix4c		C U0
+$ret4c:	addq	r2,	r29,	r6	C L0
+	beq	r6,	$fix5c		C U0
+$ret5c:	addq	r14,	r4,	r7	C L0
+	beq	r7,	$fix6c		C U0
+$ret6c:	addq	r5,	r8,	r8	C L0 sum 2
+	cmpult	r8,	r5,	r29	C L0 carry from last bunch
+	stq	r20,	0(r16)		C L0
+	stq	r6,	8(r16)		C L1
+	stq	r7,	16(r16)		C L0
+	stq	r8,	24(r16)		C L1
+	addq	r29,	r21,	r0	C return value: last carry plus last high product
+
+	ldq	r26,	0(r30)	C restore everything saved at $Large and release the frame
+	ldq	r9,	8(r30)
+	ldq	r10,	16(r30)
+	ldq	r11,	24(r30)
+	ldq	r12,	32(r30)
+	ldq	r13,	40(r30)
+	ldq	r14,	48(r30)
+	ldq	r15,	56(r30)
+	ldq	r29,	64(r30)
+	lda	r30,	224(r30)
+	ret	r31,	(r26),	1
+
+C $fix0w:	bis	r14,	r29,	r14	C join carries
+C	br	r31,	$ret0w
+$fix1w:	bis	r24,	r14,	r24	C join carries
+	br	r31,	$ret1w
+$fix2w:	bis	r25,	r24,	r25	C join carries
+	br	r31,	$ret2w
+$fix3w:	bis	r15,	r25,	r15	C join carries
+	br	r31,	$ret3w
+$fix0:	bis	r14,	r29,	r14	C join carries
+	br	r31,	$ret0
+$fix1:	bis	r24,	r14,	r24	C join carries
+	br	r31,	$ret1
+$fix2:	bis	r25,	r24,	r25	C join carries
+	br	r31,	$ret2
+$fix3:	bis	r15,	r25,	r15	C join carries
+	br	r31,	$ret3
+$fix4:	bis	r29,	r15,	r29	C join carries
+	br	r31,	$ret4
+$fix5:	bis	r4,	r29,	r4	C join carries
+	br	r31,	$ret5
+$fix6:	addq	r5,	r4,	r5	C can't carry twice!
+	br	r31,	$ret6
+$fix0c:	bis	r14,	r29,	r14	C join carries
+	br	r31,	$ret0c
+$fix1c:	bis	r24,	r14,	r24	C join carries
+	br	r31,	$ret1c
+$fix2c:	bis	r25,	r24,	r25	C join carries
+	br	r31,	$ret2c
+$fix3c:	bis	r15,	r25,	r15	C join carries
+	br	r31,	$ret3c
+$fix4c:	bis	r29,	r15,	r29	C join carries
+	br	r31,	$ret4c
+$fix5c:	bis	r4,	r29,	r4	C join carries
+	br	r31,	$ret5c
+$fix6c:	addq	r5,	r4,	r5	C can't carry twice!
+	br	r31,	$ret6c
+
+EPILOGUE(mpn_mul_1)
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/nails/README b/gmp/mpn/alpha/ev6/nails/README
new file mode 100644
index 0000000000..b214ac50ad
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/README
@@ -0,0 +1,65 @@
+Copyright 2002, 2005 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+This directory contains assembly code for nails-enabled 21264.  The code is not
+very well optimized.
+
+For addmul_N, as N grows larger, we could issue multiple loads together, then
+sustain about 3.3 instructions/cycle (i/c).  10 cycles after the last load, we
+can increase to 4 i/c.  This would surely allow addmul_4 to run at 2 cycles/limb
+(c/l), but the same should also be possible for addmul_3 and perhaps even addmul_2.
+
+
+		current		fair		best
+Routine		c/l  unroll	c/l  unroll	c/l  i/c
+mul_1		3.25		2.75		2.75 3.273
+addmul_1	4.0	4	3.5	4 14	3.25 3.385
+addmul_2	4.0	1	2.5	2 10	2.25 3.333
+addmul_3	3.0	1	2.33	2 14	2    3.333
+addmul_4	2.5	1	2.125	2 17	2    3.135
+
+addmul_5			2	1 10
+addmul_6			2	1 12
+addmul_7			2	1 14
+
+(The "best" column doesn't account for bookkeeping instructions and
+thereby assumes infinite unrolling.)
+
+Basecase usages:
+
+1	 addmul_1
+2	 addmul_2
+3	 addmul_3
+4	 addmul_4
+5	 addmul_3 + addmul_2	2.3998
+6	 addmul_4 + addmul_2
+7	 addmul_4 + addmul_3
diff --git a/gmp/mpn/alpha/ev6/nails/addmul_1.asm b/gmp/mpn/alpha/ev6/nails/addmul_1.asm
new file mode 100644
index 0000000000..711d4e66e5
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/addmul_1.asm
@@ -0,0 +1,396 @@
+dnl  Alpha ev6 nails mpn_addmul_1.
+
+dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:    42
+C EV5:    18
+C EV6:     4
+
+C TODO
+C  * Reroll loop for 3.75 c/l with current 4-way unrolling.
+C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
+C    umulh.
+C  * Use FP loop count and multiple exit points, that would simplify feed-in lp0
+C    and would work since the loop structure is really regular.
+
+C  INPUT PARAMETERS
+define(`rp',`r16')	C destination vector, read and updated in place
+define(`up',`r17')	C source vector
+define(`n', `r18')	C limb count, reused as the loop counter
+define(`vl0',`r19')	C multiplier limb, pre-shifted left by NAIL_BITS on entry
+
+define(`numb_mask',`r6')	C all ones shifted right by NAIL_BITS
+
+define(`m0a',`r0')	C mNa = mulq result, mNb = umulh result of product slot N
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r22')
+define(`m3b',`r23')
+
+define(`acc0',`r25')	C the two accumulators alternate from limb to limb
+define(`acc1',`r27')
+
+define(`ul0',`r4')	C ul0 and ul2 share r4, ul1 and ul3 share r5
+define(`ul1',`r5')
+define(`ul2',`r4')
+define(`ul3',`r5')
+
+define(`rl0',`r24')	C all four rp limb aliases share r24
+define(`rl1',`r24')
+define(`rl2',`r24')
+define(`rl3',`r24')
+
+define(`t0',`r7')	C low product part shifted right by NAIL_BITS
+define(`t1',`r8')	C carry: accumulator shifted right by NUMB_BITS
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl  This declaration is munged by configure
+NAILS_SUPPORT(2-63)
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	sll	vl0, NAIL_BITS, vl0	C pre-shift the multiplier; low product parts are shifted back by NAIL_BITS below
+	lda	numb_mask, -1(r31)	C numb_mask = all ones ...
+	srl	numb_mask, NAIL_BITS, numb_mask	C ... shifted right by NAIL_BITS
+
+	and	n,	3,	r25	C dispatch on n mod 4
+	cmpeq	r25,	1,	r21
+	bne	r21,	L(1m4)
+	cmpeq	r25,	2,	r21
+	bne	r21,	L(2m4)
+	beq	r25,	L(0m4)
+
+L(3m4):	ldq	ul3,	0(up)	C feed-in for n mod 4 == 3
+	lda	n,	-4(n)
+	ldq	ul0,	8(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	ul1,	16(up)
+	lda	up,	24(up)
+	lda	rp,	-8(rp)	C bias rp so the offsets match the loop entry used
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge3)	C n >= 7: join the loop, else only wind-down is left
+
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	r31,	acc1
+	addq	rl3,	acc1,	acc1
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	m3b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	br	r31,	L(ta3)
+
+L(ge3):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	addq	t0,	r31,	acc1
+	umulh	vl0,	ul2,	m2b
+	addq	rl3,	acc1,	acc1
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	addq	t0,	m3b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	br	r31,	L(el3)	C enter the loop at its last quarter
+
+L(0m4):	lda	n,	-8(n)	C feed-in for n mod 4 == 0
+	ldq	ul2,	0(up)
+	ldq	ul3,	8(up)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge4)	C n >= 8: join the loop, else only wind-down is left
+
+	ldq	rl2,	0(rp)
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul1,	m1b
+	addq	rl2,	acc0,	acc0
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	m2b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	br	r31,	L(ta4)
+
+L(ge4):	ldq	rl2,	0(rp)
+	srl	m2a,NAIL_BITS,	t0
+	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul1,	m1b
+	addq	rl2,	acc0,	acc0
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	addq	t0,	m2b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	br	r31,	L(el0)	C enter the loop at its third quarter
+
+L(2m4):	lda	n,	-4(n)	C feed-in for n mod 4 == 2
+	ldq	ul0,	0(up)
+	ldq	ul1,	8(up)
+	lda	up,	16(up)
+	lda	rp,	-16(rp)	C bias rp so the offsets match the loop entry used
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge2)	C n >= 6: more feed-in, else only wind-down is left
+
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	r31,	acc0
+	addq	rl0,	acc0,	acc0
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	br	r31,	L(ta2)
+
+L(ge2):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul3,	m3b
+	addq	rl0,	acc0,	acc0
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	lda	rp,	32(rp)
+	mulq	vl0,	ul0,	m0a
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	bge	n,	L(el2)	C n >= 10: enter the loop at its first quarter
+
+	br	r31,	L(ta6)	C n == 6: finish in the wind-down code
+
+L(1m4):	lda	n,	-4(n)	C feed-in for n mod 4 == 1
+	ldq	ul1,	0(up)
+	lda	up,	8(up)
+	lda	rp,	-24(rp)	C bias rp so the offsets match the loop entry used
+	bge	n,	L(ge1)	C n >= 5: more feed-in, else handle the single limb here
+
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0
+	addq	rl1,	t0,	acc1
+	and	acc1,numb_mask,	r28
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	24(rp)
+	addq	t1,	m1b,	r0	C return value: carry plus high product part
+	ret	r31,	(r26),	1
+
+L(ge1):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	lda	rp,	32(rp)
+	mulq	vl0,	ul0,	m0a
+	addq	t0,	r31,	acc1
+	umulh	vl0,	ul0,	m0b
+	addq	rl1,	acc1,	acc1
+	ldq	rl2,	0(rp)
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	m1b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	blt	n,	L(ta5)	C n == 5: finish in the wind-down code
+
+L(ge5):	ldq	ul2,	0(up)
+	br	r31,	L(el1)	C n >= 9: enter the loop at its second quarter
+
+	ALIGN(16)
+L(top):	mulq	vl0,	ul0,	m0a		C U1
+	addq	t0,	m0b,	acc1		C L0
+	srl	acc0,NUMB_BITS,	t1		C U0
+	stq	r28,	-24(rp)			C L1
+C
+L(el2):	umulh	vl0,	ul0,	m0b		C U1
+	and	acc0,numb_mask,	r28		C L0
+	addq	rl1,	acc1,	acc1		C U0
+	ldq	rl2,	0(rp)			C L1
+C
+	unop					C U1
+	addq	t1,	acc1,	acc1		C L0
+	srl	m2a,NAIL_BITS,	t0		C U0
+	ldq	ul2,	0(up)			C L1
+C
+	mulq	vl0,	ul1,	m1a		C U1
+	addq	t0,	m1b,	acc0		C L0
+	srl	acc1,NUMB_BITS,	t1		C U0
+	stq	r28,	-16(rp)			C L1
+C
+L(el1):	umulh	vl0,	ul1,	m1b		C U1
+	and	acc1,numb_mask,	r28		C L0
+	addq	rl2,	acc0,	acc0		C U0
+	ldq	rl3,	8(rp)			C L1
+C
+	lda	n,	-4(n)			C L1
+	addq	t1,	acc0,	acc0		C L0
+	srl	m3a,NAIL_BITS,	t0		C U0
+	ldq	ul3,	8(up)			C L1
+C
+	mulq	vl0,	ul2,	m2a		C U1
+	addq	t0,	m2b,	acc1		C L0
+	srl	acc0,NUMB_BITS,	t1		C U0
+	stq	r28,	-8(rp)			C L1
+C
+L(el0):	umulh	vl0,	ul2,	m2b		C U1
+	and	acc0,numb_mask,	r28		C L0
+	addq	rl3,	acc1,	acc1		C U0
+	ldq	rl0,	16(rp)			C L1
+C
+	unop					C U1
+	addq	t1,	acc1,	acc1		C L0
+	srl	m0a,NAIL_BITS,	t0		C U0
+	ldq	ul0,	16(up)			C L1
+C
+	mulq	vl0,	ul3,	m3a		C U1
+	addq	t0,	m3b,	acc0		C L0
+	srl	acc1,NUMB_BITS,	t1		C U0
+	stq	r28,	0(rp)			C L1
+C
+L(el3):	umulh	vl0,	ul3,	m3b		C U1
+	and	acc1,numb_mask,	r28		C L0
+	addq	rl0,	acc0,	acc0		C U0
+	ldq	rl1,	24(rp)			C L1
+C
+	unop					C U1
+	addq	t1,	acc0,	acc0		C L0
+	srl	m1a,NAIL_BITS,	t0		C U0
+	ldq	ul1,	24(up)			C L1
+C
+	lda	up,	32(up)			C L0
+	unop					C U1
+	lda	rp,	32(rp)			C L1
+	bge	n,	L(top)			C U0
+
+L(end):	mulq	vl0,	ul0,	m0a	C wind-down: the loop body again, without loads from up
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	stq	r28,	-24(rp)
+L(ta6):	umulh	vl0,	ul0,	m0b
+	and	acc0,numb_mask,	r28
+	addq	rl1,	acc1,	acc1
+	ldq	rl2,	0(rp)
+	addq	t1,	acc1,	acc1
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	m1b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	-16(rp)
+L(ta5):	umulh	vl0,	ul1,	m1b
+	and	acc1,numb_mask,	r28
+	addq	rl2,	acc0,	acc0
+	ldq	rl3,	8(rp)
+	addq	t1,	acc0,	acc0
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	m2b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	stq	r28,	-8(rp)
+	unop
+	ALIGN(16)
+L(ta4):	and	acc0,numb_mask,	r28
+	addq	rl3,	acc1,	acc1
+	ldq	rl0,	16(rp)
+	addq	t1,	acc1,	acc1
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	m3b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	0(rp)
+	unop
+	ALIGN(16)
+L(ta3):	and	acc1,numb_mask,	r28
+	addq	rl0,	acc0,	acc0
+	ldq	rl1,	24(rp)
+	addq	t1,	acc0,	acc0
+	srl	m1a,NAIL_BITS,	t0
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	stq	r28,	8(rp)
+	unop
+	ALIGN(16)
+L(ta2):	and	acc0,numb_mask,	r28
+	addq	rl1,	acc1,	acc1
+	addq	t1,	acc1,	acc1
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	16(rp)
+	and	acc1,numb_mask,	r28
+	addq	t1,	m1b,	r0	C return value: carry plus high product part
+	stq	r28,	24(rp)
+	ret	r31,	(r26),	1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/nails/addmul_2.asm b/gmp/mpn/alpha/ev6/nails/addmul_2.asm
new file mode 100644
index 0000000000..6ff6b3ad6b
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/addmul_2.asm
@@ -0,0 +1,146 @@
+dnl  Alpha ev6 nails mpn_addmul_2.
+
+dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Runs at 4.0 cycles/limb.
+
+C We could either go for 2-way unrolling over 11 cycles, or 2.75 c/l,
+C or 4-way unrolling over 20 cycles, for 2.5 c/l.
+
+
+C  INPUT PARAMETERS
+define(`rp',`r16')		C destination limbs (loaded, summed into, stored back)
+define(`up',`r17')		C source limbs
+define(`n',`r18')		C limb count, counted down to zero
+define(`vp',`r19')		C address of the multiplier limbs v0, v1
+C r19 is only needed as vp until v0/v1 are loaded; afterwards it is a temp.
+C  Useful register aliases
+define(`numb_mask',`r24')	C all ones shifted right by NAIL_BITS
+define(`ulimb',`r25')		C limb most recently loaded from up
+define(`rlimb',`r27')		C limb most recently loaded from rp
+C mNa / mNb take the mulq / umulh results for multiplier limb vN.
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+C acc0 / acc1 carry partial sums for the next two result positions.
+define(`acc0',`r4')
+define(`acc1',`r5')
+C v0 / v1 hold the multiplier limbs after the left shift by NAIL_BITS.
+define(`v0',`r6')
+define(`v1',`r7')
+
+C Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+C  This declaration is munged by configure
+NAILS_SUPPORT(3-63)
+C mpn_addmul_2: adds {up,n} times the two limbs at vp into rp[0..n-1], stores one more limb at rp[n], returns the top limb in r0.
+ASM_START()
+PROLOGUE(mpn_addmul_2)
+	lda	numb_mask,-1(r31)		C numb_mask = all ones
+	srl	numb_mask,NAIL_BITS,numb_mask	C drop the top NAIL_BITS bits
+C Load both multiplier limbs; vp (r19) is not read again after this.
+	ldq	v0,	0(vp)
+	ldq	v1,	8(vp)
+C v0/v1 are pre-shifted so that mulq>>NAIL_BITS and umulh split a product at bit NUMB_BITS (assumes NAIL_BITS+NUMB_BITS = 64).
+	bis	r31,	r31,	acc0		C	zero acc0
+	sll	v0,NAIL_BITS,	v0
+	bis	r31,	r31,	acc1		C	zero acc1
+	sll	v1,NAIL_BITS,	v1
+	bis	r31,	r31,	r19		C	zero the carried nail part
+C Feed-in: start the four products for up[0]; when n is 1 skip the loop.
+	ldq	ulimb,	0(up)
+	lda	up,	8(up)
+	mulq	v0,	ulimb,	m0a		C U1
+	umulh	v0,	ulimb,	m0b		C U1
+	mulq	v1,	ulimb,	m1a		C U1
+	umulh	v1,	ulimb,	m1b		C U1
+	lda	n,	-1(n)
+	beq	n,	L(end)			C U0
+C Each pass finishes the products started for the previous up limb and starts those for the next one.
+	ALIGN(16)
+L(top):	bis	r31,	r31,	r31		C U1	nop
+	addq	r19,	acc0,	acc0		C U0	propagate nail
+	ldq	rlimb,	0(rp)			C L0
+	ldq	ulimb,	0(up)			C L1
+C r8 = low part of the pending v0 product, taken before m0a is overwritten.
+	lda	rp,	8(rp)			C L1
+	srl	m0a,NAIL_BITS,	r8		C U0
+	lda	up,	8(up)			C L0
+	mulq	v0,	ulimb,	m0a		C U1
+C r19 = sum for the current limb; acc0 restarts for the next limb position.
+	addq	r8,	acc0,	r19		C U0
+	addq	m0b,	acc1,	acc0		C L1
+	umulh	v0,	ulimb,	m0b		C U1
+	bis	r31,	r31,	r31		C L0	nop
+C Add in the rp limb; the low part of the v1 product belongs one position up.
+	addq	rlimb,	r19,	r19		C L1	FINAL PROD-SUM
+	srl	m1a,NAIL_BITS,	r8		C U0
+	lda	n,	-1(n)			C L0
+	mulq	v1,	ulimb,	m1a		C U1
+C acc1 = high part of the v1 product, copied before m1b is overwritten.
+	addq	r8,	acc0,	acc0		C U0
+	bis	r31,	m1b,	acc1		C L1
+	umulh	v1,	ulimb,	m1b		C U1
+	and	r19,numb_mask,	r28		C L0	extract numb part
+C Store the finished limb; the bits above NUMB_BITS stay in r19 as carry.
+	unop
+	srl	r19,NUMB_BITS,	r19		C U1	extract nail part
+	stq	r28,	-8(rp)			C L1
+	bne	n,	L(top)			C U0
+C Wind-down: same steps as the loop body for the last up limb, with no new products started.
+L(end):	ldq	rlimb,	0(rp)
+	addq	r19,	acc0,	acc0		C	propagate nail
+	lda	rp,	8(rp)
+	srl	m0a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc0,	r19
+	addq	m0b,	acc1,	acc0
+	addq	rlimb,	r19,	r19
+	srl	m1a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc0,	acc0
+	bis	r31,	m1b,	acc1
+	and	r19,numb_mask,	r28		C extract limb
+
+	srl	r19,NUMB_BITS,	r19		C extract nail
+	stq	r28,	-8(rp)
+C Flush acc0 to rp[n] (stored without loading the old contents) and return acc1 plus carry.
+	addq	r19,	acc0,	acc0		C propagate nail
+	and	acc0,numb_mask,	r28
+	stq	r28,	0(rp)
+	srl	acc0,NUMB_BITS,	r19
+	addq	r19,	acc1,	r0		C return value (r0 is also m0a, no longer needed)
+
+	ret	r31,	(r26),	1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/nails/addmul_3.asm b/gmp/mpn/alpha/ev6/nails/addmul_3.asm
new file mode 100644
index 0000000000..a1ffb680ec
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/addmul_3.asm
@@ -0,0 +1,169 @@
+dnl  Alpha ev6 nails mpn_addmul_3.
+
+dnl  Copyright 2002, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Runs at 3.0 cycles/limb.
+
+C With 2-way unrolling, we could probably reach 2.25 c/l (3.33 i/c).
+
+
+C  INPUT PARAMETERS
+define(`rp',`r16')		C destination limbs (loaded, summed into, stored back)
+define(`up',`r17')		C source limbs
+define(`n',`r18')		C limb count, counted down to zero
+define(`vp',`r19')		C address of the multiplier limbs v0, v1, v2
+C r19 is only needed as vp until v0..v2 are loaded; afterwards it is a temp.
+C  Useful register aliases
+define(`numb_mask',`r24')	C all ones shifted right by NAIL_BITS
+define(`ulimb',`r25')		C limb most recently loaded from up
+define(`rlimb',`r27')		C limb most recently loaded from rp
+C mNa / mNb take the mulq / umulh results for multiplier limb vN.
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+C acc0..acc2 carry partial sums for the next three result positions.
+define(`acc0',`r4')
+define(`acc1',`r5')
+define(`acc2',`r22')
+C v0..v2 hold the multiplier limbs after the left shift by NAIL_BITS.
+define(`v0',`r6')
+define(`v1',`r7')
+define(`v2',`r23')
+
+C Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+C  This declaration is munged by configure
+NAILS_SUPPORT(3-63)
+C mpn_addmul_3: adds {up,n} times the three limbs at vp into rp[0..n-1], stores two more limbs at rp[n] and rp[n+1], returns the top limb in r0.
+ASM_START()
+PROLOGUE(mpn_addmul_3)
+	lda	numb_mask,-1(r31)		C numb_mask = all ones
+	srl	numb_mask,NAIL_BITS,numb_mask	C drop the top NAIL_BITS bits
+C Load the three multiplier limbs; vp (r19) is not read again after this.
+	ldq	v0,	0(vp)
+	ldq	v1,	8(vp)
+	ldq	v2,	16(vp)
+C v0..v2 are pre-shifted so that mulq>>NAIL_BITS and umulh split a product at bit NUMB_BITS (assumes NAIL_BITS+NUMB_BITS = 64).
+	bis	r31,	r31,	acc0		C	zero acc0
+	sll	v0,NAIL_BITS,	v0
+	bis	r31,	r31,	acc1		C	zero acc1
+	sll	v1,NAIL_BITS,	v1
+	bis	r31,	r31,	acc2		C	zero acc2
+	sll	v2,NAIL_BITS,	v2
+	bis	r31,	r31,	r19		C	zero the carried nail part
+C Feed-in: start the six products for up[0]; when n is 1 skip the loop.
+	ldq	ulimb,	0(up)
+	lda	up,	8(up)
+	mulq	v0,	ulimb,	m0a		C U1
+	umulh	v0,	ulimb,	m0b		C U1
+	mulq	v1,	ulimb,	m1a		C U1
+	umulh	v1,	ulimb,	m1b		C U1
+	lda	n,	-1(n)
+	mulq	v2,	ulimb,	m2a		C U1
+	umulh	v2,	ulimb,	m2b		C U1
+	beq	n,	L(end)			C U0
+C Each pass finishes the products started for the previous up limb and starts those for the next one.
+	ALIGN(16)
+L(top):	ldq	rlimb,	0(rp)			C L1
+	ldq	ulimb,	0(up)			C L0
+	bis	r31,	r31,	r31		C U0	nop
+	addq	r19,	acc0,	acc0		C U1	propagate nail
+C r8 = low part of the pending v0 product, taken before m0a is overwritten.
+	lda	rp,	8(rp)			C L1
+	srl	m0a,NAIL_BITS,	r8		C U0
+	lda	up,	8(up)			C L0
+	mulq	v0,	ulimb,	m0a		C U1
+C r19 = sum for the current limb; acc0 restarts for the next limb position.
+	addq	r8,	acc0,	r19		C U0
+	addq	m0b,	acc1,	acc0		C L1
+	umulh	v0,	ulimb,	m0b		C U1
+	bis	r31,	r31,	r31		C L0	nop
+C Add in the rp limb; the low part of the v1 product belongs one position up.
+	addq	rlimb,	r19,	r19		C L1
+	srl	m1a,NAIL_BITS,	r8		C U0
+	bis	r31,	r31,	r31		C L0	nop
+	mulq	v1,	ulimb,	m1a		C U1
+C acc1 restarts as high v1 part plus the old acc2.
+	addq	r8,	acc0,	acc0		C U0
+	addq	m1b,	acc2,	acc1		C L1
+	umulh	v1,	ulimb,	m1b		C U1
+	and	r19,numb_mask,	r28		C L0	extract numb part
+C The low part of the v2 product joins acc1.
+	bis	r31,	r31,	r31		C L1	nop
+	srl	m2a,NAIL_BITS,	r8		C U0
+	lda	n,	-1(n)			C L0
+	mulq	v2,	ulimb,	m2a		C U1
+C acc2 = high part of the v2 product, copied before m2b is overwritten.
+	addq	r8,	acc1,	acc1		C L0
+	bis	r31,	m2b,	acc2		C L1
+	umulh	v2,	ulimb,	m2b		C U1
+	srl	r19,NUMB_BITS,	r19		C U0	extract nail part
+C Store the finished limb; r19 keeps the bits above NUMB_BITS as carry.
+	stq	r28,	-8(rp)			C L
+	bne	n,	L(top)			C U0
+C Wind-down: same steps as the loop body for the last up limb, with no new products started.
+L(end):	ldq	rlimb,	0(rp)
+	addq	r19,	acc0,	acc0		C	propagate nail
+	lda	rp,	8(rp)
+	srl	m0a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc0,	r19
+	addq	m0b,	acc1,	acc0
+	addq	rlimb,	r19,	r19
+	srl	m1a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc0,	acc0
+	addq	m1b,	acc2,	acc1
+	and	r19,numb_mask,	r28		C extract limb
+	srl	m2a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc1,	acc1
+	bis	r31,	m2b,	acc2
+	srl	r19,NUMB_BITS,	r19		C extract nail
+	stq	r28,	-8(rp)
+C Flush acc0 to rp[n] and acc1 to rp[n+1] (stored without loading the old contents).
+	addq	r19,	acc0,	acc0		C propagate nail
+	and	acc0,numb_mask,	r28
+	stq	r28,	0(rp)
+	srl	acc0,NUMB_BITS,	r19
+	addq	r19,	acc1,	acc1
+
+	and	acc1,numb_mask,	r28
+	stq	r28,	8(rp)
+	srl	acc1,NUMB_BITS,	r19
+	addq	r19,	acc2,	m0a		C return value: m0a is r0
+
+	ret	r31,	(r26),	1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/nails/addmul_4.asm b/gmp/mpn/alpha/ev6/nails/addmul_4.asm
new file mode 100644
index 0000000000..77e02a4316
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/addmul_4.asm
@@ -0,0 +1,210 @@
+dnl  Alpha ev6 nails mpn_addmul_4.
+
+dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Runs at 2.5 cycles/limb.
+
+C We should go for 2-way unrolling over 17 cycles, for 2.125 c/l corresponding
+C to 3.24 insn/cycle.
+
+
+C  INPUT PARAMETERS
+define(`rp',`r16')		C destination limbs (loaded, summed into, stored back)
+define(`up',`r17')		C source limbs
+define(`n',`r18')		C limb count, counted down to zero
+define(`vp',`r19')		C address of the multiplier limbs v0..v3
+C r19 is only needed as vp until v0..v3 are loaded; afterwards it is a temp.
+C  Useful register aliases
+define(`numb_mask',`r24')	C all ones shifted right by NAIL_BITS
+define(`ulimb',`r25')		C limb most recently loaded from up
+define(`rlimb',`r27')		C limb most recently loaded from rp
+C mNa / mNb take the mulq / umulh results for multiplier limb vN.
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r12')		C r12 is saved and restored by the function
+define(`m3b',`r13')		C r13 is saved and restored by the function
+C acc0..acc3 carry partial sums for the next four result positions.
+define(`acc0',`r4')
+define(`acc1',`r5')
+define(`acc2',`r22')
+define(`acc3',`r14')		C r14 is saved and restored by the function
+C v0..v3 hold the multiplier limbs after the left shift by NAIL_BITS.
+define(`v0',`r6')
+define(`v1',`r7')
+define(`v2',`r23')
+define(`v3',`r15')		C r15 is saved and restored by the function
+
+C Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+C  This declaration is munged by configure
+NAILS_SUPPORT(4-63)
+C mpn_addmul_4: adds {up,n} times the four limbs at vp into rp[0..n-1], stores three more limbs at rp[n..n+2], returns the top limb in r0.
+ASM_START()
+PROLOGUE(mpn_addmul_4)
+	lda	r30,	-240(r30)		C open a 240-byte frame; only offsets 32..56 are used below
+	stq	r12,	32(r30)			C r12..r15 back m3a, m3b, acc3, v3
+	stq	r13,	40(r30)
+	stq	r14,	48(r30)
+	stq	r15,	56(r30)
+
+	lda	numb_mask,-1(r31)		C numb_mask = all ones
+	srl	numb_mask,NAIL_BITS,numb_mask	C drop the top NAIL_BITS bits
+C Load the four multiplier limbs; vp (r19) is not read again after this.
+	ldq	v0,	0(vp)
+	ldq	v1,	8(vp)
+	ldq	v2,	16(vp)
+	ldq	v3,	24(vp)
+C v0..v3 are pre-shifted so that mulq>>NAIL_BITS and umulh split a product at bit NUMB_BITS (assumes NAIL_BITS+NUMB_BITS = 64).
+	bis	r31,	r31,	acc0		C	zero acc0
+	sll	v0,NAIL_BITS,	v0
+	bis	r31,	r31,	acc1		C	zero acc1
+	sll	v1,NAIL_BITS,	v1
+	bis	r31,	r31,	acc2		C	zero acc2
+	sll	v2,NAIL_BITS,	v2
+	bis	r31,	r31,	acc3		C	zero acc3
+	sll	v3,NAIL_BITS,	v3
+	bis	r31,	r31,	r19		C	zero the carried nail part
+C Feed-in: start the eight products for up[0]; when n is 1 skip the loop.
+	ldq	ulimb,	0(up)
+	lda	up,	8(up)
+	mulq	v0,	ulimb,	m0a		C U1
+	umulh	v0,	ulimb,	m0b		C U1
+	mulq	v1,	ulimb,	m1a		C U1
+	umulh	v1,	ulimb,	m1b		C U1
+	lda	n,	-1(n)
+	mulq	v2,	ulimb,	m2a		C U1
+	umulh	v2,	ulimb,	m2b		C U1
+	mulq	v3,	ulimb,	m3a		C U1
+	umulh	v3,	ulimb,	m3b		C U1
+	beq	n,	L(end)			C U0
+C Each pass finishes the products started for the previous up limb and starts those for the next one.
+	ALIGN(16)
+L(top):	bis	r31,	r31,	r31		C U1	nop
+	ldq	rlimb,	0(rp)			C L0
+	ldq	ulimb,	0(up)			C L1
+	addq	r19,	acc0,	acc0		C U0	propagate nail
+C A full group of nops follows.
+	bis	r31,	r31,	r31		C L0	nop
+	bis	r31,	r31,	r31		C U1	nop
+	bis	r31,	r31,	r31		C L1	nop
+	bis	r31,	r31,	r31		C U0	nop
+C r8 = low part of the pending v0 product, taken before m0a is overwritten.
+	lda	rp,	8(rp)			C L0
+	srl	m0a,NAIL_BITS,	r8		C U0
+	lda	up,	8(up)			C L1
+	mulq	v0,	ulimb,	m0a		C U1
+C r19 = sum for the current limb; acc0 restarts for the next limb position.
+	addq	r8,	acc0,	r19		C U0
+	addq	m0b,	acc1,	acc0		C L0
+	umulh	v0,	ulimb,	m0b		C U1
+	bis	r31,	r31,	r31		C L1	nop
+C Add in the rp limb; the low part of the v1 product belongs one position up.
+	addq	rlimb,	r19,	r19		C L0
+	srl	m1a,NAIL_BITS,	r8		C U0
+	bis	r31,	r31,	r31		C L1	nop
+	mulq	v1,	ulimb,	m1a		C U1
+C acc1 restarts as high v1 part plus the old acc2.
+	addq	r8,	acc0,	acc0		C U0
+	addq	m1b,	acc2,	acc1		C L0
+	umulh	v1,	ulimb,	m1b		C U1
+	and	r19,numb_mask,	r28		C L1	extract numb part
+C The low part of the v2 product joins acc1.
+	bis	r31,	r31,	r31		C L0	nop
+	srl	m2a,NAIL_BITS,	r8		C U0
+	lda	n,	-1(n)			C L1
+	mulq	v2,	ulimb,	m2a		C U1
+C acc2 restarts as high v2 part plus the old acc3.
+	addq	r8,	acc1,	acc1		C L1
+	addq	m2b,	acc3,	acc2		C L0
+	umulh	v2,	ulimb,	m2b		C U1
+	srl	r19,NUMB_BITS,	r19		C U0	extract nail part
+C Store the finished limb; the low part of the v3 product joins acc2.
+	bis	r31,	r31,	r31		C L0	nop
+	srl	m3a,NAIL_BITS,	r8		C U0
+	stq	r28,	-8(rp)			C L1
+	mulq	v3,	ulimb,	m3a		C U1
+C acc3 = high part of the v3 product, copied before m3b is overwritten.
+	addq	r8,	acc2,	acc2		C L0
+	bis	r31,	m3b,	acc3		C L1
+	umulh	v3,	ulimb,	m3b		C U1
+	bne	n,	L(top)			C U0
+C Wind-down: same steps as the loop body for the last up limb, with no new products started.
+L(end):	ldq	rlimb,	0(rp)
+	addq	r19,	acc0,	acc0		C	propagate nail
+	lda	rp,	8(rp)			C FIXME: DELETE
+	srl	m0a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc0,	r19
+	addq	m0b,	acc1,	acc0
+	addq	rlimb,	r19,	r19
+	srl	m1a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc0,	acc0
+	addq	m1b,	acc2,	acc1
+	and	r19,numb_mask,	r28		C extract limb
+	srl	m2a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc1,	acc1
+	addq	m2b,	acc3,	acc2
+	srl	r19,NUMB_BITS,	r19		C extract nail
+	srl	m3a,NAIL_BITS,	r8		C U0
+	stq	r28,	-8(rp)
+	addq	r8,	acc2,	acc2
+	bis	r31,	m3b,	acc3
+C Flush acc0..acc2 to rp[n..n+2] (stored without loading the old contents), carrying between them.
+	addq	r19,	acc0,	acc0		C propagate nail
+	and	acc0,numb_mask,	r28
+	stq	r28,	0(rp)
+	srl	acc0,NUMB_BITS,	r19
+	addq	r19,	acc1,	acc1
+
+	and	acc1,numb_mask,	r28
+	stq	r28,	8(rp)
+	srl	acc1,NUMB_BITS,	r19
+	addq	r19,	acc2,	acc2
+
+	and	acc2,numb_mask,	r28
+	stq	r28,	16(rp)
+	srl	acc2,NUMB_BITS,	r19
+	addq	r19,	acc3,	r0		C return value
+C Restore r12..r15 and release the frame.
+	ldq	r12,	32(r30)
+	ldq	r13,	40(r30)
+	ldq	r14,	48(r30)
+	ldq	r15,	56(r30)
+	lda	r30,	240(r30)
+	ret	r31,	(r26),	1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/nails/aors_n.asm b/gmp/mpn/alpha/ev6/nails/aors_n.asm
new file mode 100644
index 0000000000..f6586773f5
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/aors_n.asm
@@ -0,0 +1,233 @@
+dnl  Alpha ev6 nails mpn_add_n and mpn_sub_n.
+
+dnl  Copyright 2002, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  Runs at 2.5 cycles/limb.  It would be possible to reach 2.0 cycles/limb
+dnl  with 8-way unrolling.
+
+include(`../config.m4')
+
+dnl  INPUT PARAMETERS
+define(`rp',`r16')		C destination limbs
+define(`up',`r17')		C first source operand
+define(`vp',`r18')		C second source operand
+define(`n',`r19')		C limb count
+dnl  rl0..rl3 receive the four results of one unrolled group.
+define(`rl0',`r0')
+define(`rl1',`r1')
+define(`rl2',`r2')
+define(`rl3',`r3')
+dnl  ul0..ul3 are the limbs loaded from up.
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r6')
+define(`ul3',`r7')
+dnl  vl0..vl3 are the limbs loaded from vp; r25 (vl3) first serves as the n mod 4 counter.
+define(`vl0',`r22')
+define(`vl1',`r23')
+define(`vl2',`r24')
+define(`vl3',`r25')
+
+define(`numb_mask',`r21')	C all ones shifted right by NAIL_BITS
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`CYSH',`GMP_NUMB_BITS')	C default carry shift; redefined by whichever OPERATION_ branch below applies
+
+dnl  This declaration is munged by configure
+NAILS_SUPPORT(1-63)
+dnl  Per operation: OP is the arithmetic insn, CYSH the shift that exposes the carry (bit GMP_NUMB_BITS) or the borrow (bit 63, the sign of the difference).
+ifdef(`OPERATION_add_n', `
+	define(`OP',        addq)
+	define(`CYSH',`GMP_NUMB_BITS')
+	define(`func',  mpn_add_n)')
+ifdef(`OPERATION_sub_n', `
+	define(`OP',        subq)
+	define(`CYSH',63)
+	define(`func',  mpn_sub_n)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
+dnl  func: rp[i] = numb part of (up[i] OP vp[i] OP carry) for i = 0..n-1, carry chained through r20; r0 returns the last carry/borrow bit.
+ASM_START()
+PROLOGUE(func)
+	lda	numb_mask, -1(r31)
+	srl	numb_mask, NAIL_BITS, numb_mask
+	bis	r31,	r31,	r20		C carry-in = 0
+C r25 = n mod 4 limbs to handle one at a time; n is biased by -4 for the unrolled code.
+	and	n,	3,	r25
+	lda	n,	-4(n)
+	beq	r25,	L(ge4)
+C Single-limb loop for the n mod 4 leading limbs.
+L(lp0):	ldq	ul0,	0(up)
+	lda	up,	8(up)
+	ldq	vl0,	0(vp)
+	lda	vp,	8(vp)
+	lda	rp,	8(rp)
+	lda	r25,	-1(r25)
+	OP	ul0,	vl0,	rl0
+	OP	rl0,	r20,	rl0
+	and	rl0, numb_mask,	r28
+	stq	r28,	-8(rp)
+	srl	rl0,	CYSH,	r20
+	bne	r25,	L(lp0)
+C Biased n negative here means no group of four remains.
+	blt	n,	L(ret)
+C Load a whole group of four limb pairs up front.
+L(ge4):	ldq	ul0,	0(up)
+	ldq	vl0,	0(vp)
+	ldq	ul1,	8(up)
+	ldq	vl1,	8(vp)
+	ldq	ul2,	16(up)
+	ldq	vl2,	16(vp)
+	ldq	ul3,	24(up)
+	ldq	vl3,	24(vp)
+	lda	up,	32(up)
+	lda	vp,	32(vp)
+	lda	n,	-4(n)
+	bge	n,	L(ge8)
+C Only one group left: do limbs 0 and 1 here, then join the wind-down at cj0 (rp not advanced).
+	OP	ul0,	vl0,	rl0	C		main-add 0
+	OP	rl0,	r20,	rl0	C		cy-add 0
+	OP	ul1,	vl1,	rl1	C		main-add 1
+	srl	rl0,	CYSH,	r20	C		gen cy 0
+	OP	rl1,	r20,	rl1	C		cy-add 1
+	and	rl0,numb_mask,	r27
+	br	r31,	L(cj0)
+C At least two groups: process the first while loading the second, leaving results 2 and 3 pending for the loop.
+L(ge8):	OP	ul0,	vl0,	rl0	C		main-add 0
+	ldq	ul0,	0(up)
+	ldq	vl0,	0(vp)
+	OP	rl0,	r20,	rl0	C		cy-add 0
+	OP	ul1,	vl1,	rl1	C		main-add 1
+	srl	rl0,	CYSH,	r20	C		gen cy 0
+	ldq	ul1,	8(up)
+	ldq	vl1,	8(vp)
+	OP	rl1,	r20,	rl1	C		cy-add 1
+	and	rl0,numb_mask,	r27
+	OP	ul2,	vl2,	rl2	C		main-add 2
+	srl	rl1,	CYSH,	r20	C		gen cy 1
+	ldq	ul2,	16(up)
+	ldq	vl2,	16(vp)
+	OP	rl2,	r20,	rl2	C		cy-add 2
+	and	rl1,numb_mask,	r28
+	stq	r27,	0(rp)
+	OP	ul3,	vl3,	rl3	C		main-add 3
+	srl	rl2,	CYSH,	r20	C		gen cy 2
+	ldq	ul3,	24(up)
+	ldq	vl3,	24(vp)
+	OP	rl3,	r20,	rl3	C		cy-add 3
+	and	rl2,numb_mask,	r27
+	stq	r28,	8(rp)
+	lda	rp,	32(rp)
+	lda	up,	32(up)
+	lda	vp,	32(vp)
+	lda	n,	-4(n)
+	blt	n,	L(end)
+C Main loop, four limbs per pass: stores at -16/-8(rp) finish the previous group, stores at 0/8(rp) belong to the current one.
+	ALIGN(32)
+L(top):	OP	ul0,	vl0,	rl0	C		main-add 0
+	srl	rl3,	CYSH,	r20	C		gen cy 3
+	ldq	ul0,	0(up)
+	ldq	vl0,	0(vp)
+
+	OP	rl0,	r20,	rl0	C		cy-add 0
+	and	rl3,numb_mask,	r28
+	stq	r27,	-16(rp)
+	bis	r31,	r31,	r31
+
+	OP	ul1,	vl1,	rl1	C		main-add 1
+	srl	rl0,	CYSH,	r20	C		gen cy 0
+	ldq	ul1,	8(up)
+	ldq	vl1,	8(vp)
+
+	OP	rl1,	r20,	rl1	C		cy-add 1
+	and	rl0,numb_mask,	r27
+	stq	r28,	-8(rp)
+	bis	r31,	r31,	r31
+
+	OP	ul2,	vl2,	rl2	C		main-add 2
+	srl	rl1,	CYSH,	r20	C		gen cy 1
+	ldq	ul2,	16(up)
+	ldq	vl2,	16(vp)
+
+	OP	rl2,	r20,	rl2	C		cy-add 2
+	and	rl1,numb_mask,	r28
+	stq	r27,	0(rp)
+	bis	r31,	r31,	r31
+
+	OP	ul3,	vl3,	rl3	C		main-add 3
+	srl	rl2,	CYSH,	r20	C		gen cy 2
+	ldq	ul3,	24(up)
+	ldq	vl3,	24(vp)
+
+	OP	rl3,	r20,	rl3	C		cy-add 3
+	and	rl2,numb_mask,	r27
+	stq	r28,	8(rp)
+	bis	r31,	r31,	r31
+
+	bis	r31,	r31,	r31
+	lda	n,	-4(n)
+	lda	up,	32(up)
+	lda	vp,	32(vp)
+
+	bis	r31,	r31,	r31
+	bis	r31,	r31,	r31
+	lda	rp,	32(rp)
+	bge	n,	L(top)
+C Wind-down: finish the last loaded group without further loads.
+L(end):	OP	ul0,	vl0,	rl0	C		main-add 0
+	srl	rl3,	CYSH,	r20	C		gen cy 3
+	OP	rl0,	r20,	rl0	C		cy-add 0
+	and	rl3,numb_mask,	r28
+	stq	r27,	-16(rp)
+	OP	ul1,	vl1,	rl1	C		main-add 1
+	srl	rl0,	CYSH,	r20	C		gen cy 0
+	OP	rl1,	r20,	rl1	C		cy-add 1
+	and	rl0,numb_mask,	r27
+	stq	r28,	-8(rp)
+L(cj0):	OP	ul2,	vl2,	rl2	C		main-add 2
+	srl	rl1,	CYSH,	r20	C		gen cy 1
+	OP	rl2,	r20,	rl2	C		cy-add 2
+	and	rl1,numb_mask,	r28
+	stq	r27,	0(rp)
+	OP	ul3,	vl3,	rl3	C		main-add 3
+	srl	rl2,	CYSH,	r20	C		gen cy 2
+	OP	rl3,	r20,	rl3	C		cy-add 3
+	and	rl2,numb_mask,	r27
+	stq	r28,	8(rp)
+
+	srl	rl3,	CYSH,	r20	C		gen cy 3
+	and	rl3,numb_mask,	r28
+	stq	r27,	16(rp)
+	stq	r28,	24(rp)
+C Return bit 0 of the last carry/borrow word.
+L(ret):	and	r20,	1,	r0
+	ret	r31,	(r26),	1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/nails/gmp-mparam.h b/gmp/mpn/alpha/ev6/nails/gmp-mparam.h
new file mode 100644
index 0000000000..7949fe8df8
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/gmp-mparam.h
@@ -0,0 +1,72 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Generated by tuneup.c, 2004-02-07, gcc 3.3 */
+
+#define MUL_TOOM22_THRESHOLD             40
+#define MUL_TOOM33_THRESHOLD            236
+/* NOTE(review): SQR_TOOM2_THRESHOLD (0) is below SQR_BASECASE_THRESHOLD (7); presumably intended, as the generated comments suggest -- confirm against tuneup. */
+#define SQR_BASECASE_THRESHOLD            7  /* karatsuba */
+#define SQR_TOOM2_THRESHOLD               0  /* never sqr_basecase */
+#define SQR_TOOM3_THRESHOLD             120
+
+#define DIV_SB_PREINV_THRESHOLD       MP_SIZE_T_MAX  /* no preinv with nails */
+#define DIV_DC_THRESHOLD                 48
+#define POWM_THRESHOLD                  113
+
+#define HGCD_THRESHOLD                   78
+#define GCD_ACCEL_THRESHOLD               3
+#define GCD_DC_THRESHOLD                392
+#define JACOBI_BASE_METHOD                1
+/* Entries tagged "no preinv with nails" carry fixed values (MP_SIZE_T_MAX or 0). */
+#define DIVREM_1_NORM_THRESHOLD       MP_SIZE_T_MAX  /* no preinv with nails */
+#define DIVREM_1_UNNORM_THRESHOLD     MP_SIZE_T_MAX  /* no preinv with nails */
+#define MOD_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* no preinv with nails */
+#define MOD_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* no preinv with nails */
+#define USE_PREINV_DIVREM_1               0  /* no preinv with nails */
+#define USE_PREINV_MOD_1                  0  /* no preinv with nails */
+#define DIVREM_2_THRESHOLD            MP_SIZE_T_MAX  /* no preinv with nails */
+#define DIVEXACT_1_THRESHOLD              0  /* always */
+#define MODEXACT_1_ODD_THRESHOLD          0  /* always */
+
+#define GET_STR_DC_THRESHOLD             15
+#define GET_STR_PRECOMPUTE_THRESHOLD     24
+#define SET_STR_THRESHOLD              6336
+/* FFT tables are brace-initializer lists ending in a 0 entry. */
+#define MUL_FFT_TABLE  { 688, 1440, 3648, 6400, 25600, 0 }
+#define MUL_FFT_MODF_THRESHOLD          488
+#define MUL_FFT_THRESHOLD              3712
+
+#define SQR_FFT_TABLE  { 432, 864, 3136, 6400, 25600, 0 }
+#define SQR_FFT_MODF_THRESHOLD          480
+#define SQR_FFT_THRESHOLD              2976
diff --git a/gmp/mpn/alpha/ev6/nails/mul_1.asm b/gmp/mpn/alpha/ev6/nails/mul_1.asm
new file mode 100644
index 0000000000..da2ee3d099
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/mul_1.asm
@@ -0,0 +1,364 @@
+dnl  Alpha ev6 nails mpn_mul_1.
+
+dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:    42
+C EV5:    18
+C EV6:     3.25
+
+C TODO
+C  * Reroll loop for 3.0 c/l with current 4-way unrolling.
+C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
+C    umulh.
+C  * Use FP loop count and multiple exit points, that would simplify feed-in lp0
+C    and would work since the loop structure is really regular.
+
+C  INPUT PARAMETERS
+define(`rp',`r16')		C destination limbs (store only)
+define(`up',`r17')		C source limbs
+define(`n', `r18')		C limb count
+define(`vl0',`r19')		C multiplier limb, shifted left by NAIL_BITS on entry
+
+define(`numb_mask',`r6')	C all ones shifted right by NAIL_BITS
+C mNa / mNb take the mulq / umulh results of loop position N.
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r22')
+define(`m3b',`r23')
+C acc0 / acc1 alternate as the running sum for successive result limbs.
+define(`acc0',`r25')
+define(`acc1',`r27')
+C ul0/ul2 share r4 and ul1/ul3 share r5: only two source limbs are live at once.
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r4')
+define(`ul3',`r5')
+C rl0..rl3 all name r24 and are not referenced by the code in this file.
+define(`rl0',`r24')
+define(`rl1',`r24')
+define(`rl2',`r24')
+define(`rl3',`r24')
+C t0 = low product part shifted down by NAIL_BITS; t1 = carry out of an accumulator.
+define(`t0',`r7')
+define(`t1',`r8')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl  This declaration is munged by configure
+NAILS_SUPPORT(1-63)
+C mpn_mul_1: rp[i] = numb part of (up[i]*vl0 + carry) for i = 0..n-1; the final carry limb is returned in r0.
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	sll	vl0, NAIL_BITS, vl0		C pre-shift so mulq>>NAIL_BITS and umulh split a product at bit NUMB_BITS (assumes NAIL_BITS+NUMB_BITS = 64)
+	lda	numb_mask, -1(r31)
+	srl	numb_mask, NAIL_BITS, numb_mask
+C Dispatch on n mod 4 to one of four feed-in sequences.
+	and	n,	3,	r25
+	cmpeq	r25,	1,	r21
+	bne	r21,	L(1m4)
+	cmpeq	r25,	2,	r21
+	bne	r21,	L(2m4)
+	beq	r25,	L(0m4)
+C n mod 4 = 3: rp is biased by -8 so the shared store offsets line up.
+L(3m4):	ldq	ul3,	0(up)
+	lda	n,	-4(n)
+	ldq	ul0,	8(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	ul1,	16(up)
+	lda	up,	24(up)
+	lda	rp,	-8(rp)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge3)
+C n-4 negative: only these three limbs, finish in the wind-down at ta3.
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	r31,	acc1
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	m3b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	br	r31,	L(ta3)
+C More limbs follow: preload and enter the loop at el3.
+L(ge3):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	srl	m3a,NAIL_BITS,	t0
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	addq	t0,	r31,	acc1
+	umulh	vl0,	ul2,	m2b
+	srl	m0a,NAIL_BITS,	t0
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	addq	t0,	m3b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	br	r31,	L(el3)
+C n mod 4 = 0: no rp bias; n is pre-decremented by 8 here.
+L(0m4):	lda	n,	-8(n)
+	ldq	ul2,	0(up)
+	ldq	ul3,	8(up)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge4)
+C n-8 negative: only these four limbs, finish in the wind-down at ta4.
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul1,	m1b
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	m2b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	br	r31,	L(ta4)
+C More limbs follow: preload and enter the loop at el0.
+L(ge4):	srl	m2a,NAIL_BITS,	t0
+	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul1,	m1b
+	srl	m3a,NAIL_BITS,	t0
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	addq	t0,	m2b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	br	r31,	L(el0)
+C n mod 4 = 2: rp is biased by -16.
+L(2m4):	lda	n,	-4(n)
+	ldq	ul0,	0(up)
+	ldq	ul1,	8(up)
+	lda	up,	16(up)
+	lda	rp,	-16(rp)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge2)
+C n-4 negative: only these two limbs, finish in the wind-down at ta2.
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	r31,	acc0
+	srl	m1a,NAIL_BITS,	t0
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	br	r31,	L(ta2)
+C More limbs follow: preload four, then enter the loop at el2 or the wind-down at ta6.
+L(ge2):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	srl	m0a,NAIL_BITS,	t0
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul3,	m3b
+	srl	m1a,NAIL_BITS,	t0
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	lda	rp,	32(rp)
+	mulq	vl0,	ul0,	m0a
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	bge	n,	L(el2)
+
+	br	r31,	L(ta6)
+C n mod 4 = 1: rp is biased by -24.
+L(1m4):	lda	n,	-4(n)
+	ldq	ul1,	0(up)
+	lda	up,	8(up)
+	lda	rp,	-24(rp)
+	bge	n,	L(ge1)
+C n-4 negative: a single limb, handled completely here.
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	srl	m1a,NAIL_BITS,	t0
+	addq	t0,	r31,	acc1
+	and	acc1,numb_mask,	r28
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	24(rp)
+	addq	t1,	m1b,	r0
+	ret	r31,	(r26),	1
+C More limbs follow: preload four, then enter the wind-down at ta5 or the loop at el1.
+L(ge1):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	srl	m1a,NAIL_BITS,	t0
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	lda	rp,	32(rp)
+	mulq	vl0,	ul0,	m0a
+	addq	t0,	r31,	acc1
+	umulh	vl0,	ul0,	m0b
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	m1b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	blt	n,	L(ta5)
+
+L(ge5):	ldq	ul2,	0(up)
+	br	r31,	L(el1)
+C Main loop, four limbs per pass; el2, el1, el0, el3 are the entry points used by the feed-in code.
+	ALIGN(16)
+L(top):	mulq	vl0,	ul0,	m0a		C U1
+	addq	t0,	m0b,	acc1		C L0
+	srl	acc0,NUMB_BITS,	t1		C U0
+	stq	r28,	-24(rp)			C L1
+C
+L(el2):	umulh	vl0,	ul0,	m0b		C U1
+	and	acc0,numb_mask,	r28		C L0
+	unop					C U0
+	unop					C L1
+C
+	unop					C U1
+	addq	t1,	acc1,	acc1		C L0
+	srl	m2a,NAIL_BITS,	t0		C U0
+	ldq	ul2,	0(up)			C L1
+C
+	mulq	vl0,	ul1,	m1a		C U1
+	addq	t0,	m1b,	acc0		C L0
+	srl	acc1,NUMB_BITS,	t1		C U0
+	stq	r28,	-16(rp)			C L1
+C The only loop-counter update sits in the el1 group.
+L(el1):	umulh	vl0,	ul1,	m1b		C U1
+	and	acc1,numb_mask,	r28		C L0
+	unop					C U0
+	lda	n,	-4(n)			C L1
+C
+	unop					C U1
+	addq	t1,	acc0,	acc0		C L0
+	srl	m3a,NAIL_BITS,	t0		C U0
+	ldq	ul3,	8(up)			C L1
+C
+	mulq	vl0,	ul2,	m2a		C U1
+	addq	t0,	m2b,	acc1		C L0
+	srl	acc0,NUMB_BITS,	t1		C U0
+	stq	r28,	-8(rp)			C L1
+C
+L(el0):	umulh	vl0,	ul2,	m2b		C U1
+	and	acc0,numb_mask,	r28		C L0
+	unop					C U0
+	unop					C L1
+C
+	unop					C U1
+	addq	t1,	acc1,	acc1		C L0
+	srl	m0a,NAIL_BITS,	t0		C U0
+	ldq	ul0,	16(up)			C L1
+C
+	mulq	vl0,	ul3,	m3a		C U1
+	addq	t0,	m3b,	acc0		C L0
+	srl	acc1,NUMB_BITS,	t1		C U0
+	stq	r28,	0(rp)			C L1
+C
+L(el3):	umulh	vl0,	ul3,	m3b		C U1
+	and	acc1,numb_mask,	r28		C L0
+	unop					C U0
+	unop					C L1
+C
+	unop					C U1
+	addq	t1,	acc0,	acc0		C L0
+	srl	m1a,NAIL_BITS,	t0		C U0
+	ldq	ul1,	24(up)			C L1
+C Advance both pointers by four limbs and loop while the biased count is non-negative.
+	lda	up,	32(up)			C L0
+	unop					C U1
+	lda	rp,	32(rp)			C L1
+	bge	n,	L(top)			C U0
+C Wind-down: finish the limbs already loaded; ta6..ta2 are entry points for the short-n feed-in paths.
+L(end):	mulq	vl0,	ul0,	m0a
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	stq	r28,	-24(rp)
+L(ta6):	umulh	vl0,	ul0,	m0b
+	and	acc0,numb_mask,	r28
+	addq	t1,	acc1,	acc1
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	m1b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	-16(rp)
+L(ta5):	umulh	vl0,	ul1,	m1b		C last multiply; its high part m1b feeds the return value
+	and	acc1,numb_mask,	r28
+	addq	t1,	acc0,	acc0
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	m2b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	stq	r28,	-8(rp)
+	ALIGN(16)
+L(ta4):	and	acc0,numb_mask,	r28
+	addq	t1,	acc1,	acc1
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	m3b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	0(rp)
+	unop
+	ALIGN(16)
+L(ta3):	and	acc1,numb_mask,	r28
+	addq	t1,	acc0,	acc0
+	srl	m1a,NAIL_BITS,	t0
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	stq	r28,	8(rp)
+	unop
+	ALIGN(16)
+L(ta2):	and	acc0,numb_mask,	r28
+	addq	t1,	acc1,	acc1
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	16(rp)
+	and	acc1,numb_mask,	r28
+	addq	t1,	m1b,	r0		C return value = last carry + high part of the last product
+	stq	r28,	24(rp)
+	ret	r31,	(r26),	1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/nails/submul_1.asm b/gmp/mpn/alpha/ev6/nails/submul_1.asm
new file mode 100644
index 0000000000..f473a59ba8
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/submul_1.asm
@@ -0,0 +1,396 @@
+dnl  Alpha ev6 nails mpn_submul_1.
+
+dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:    42
+C EV5:    18
+C EV6:     4
+
+C TODO
+C  * Reroll loop for 3.75 c/l with current 4-way unrolling.
+C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
+C    umulh.
+C  * Use FP loop count and multiple exit points, that would simplify feed-in lp0
+C    and would work since the loop structure is really regular.
+
+C  INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n', `r18')
+define(`vl0',`r19')
+C  Mask with the top NAIL_BITS bits clear; it is built at function entry.
+define(`numb_mask',`r6')
+C  Products: each mNa is written by mulq and each mNb by umulh.
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r22')
+define(`m3b',`r23')
+C  Two accumulators, used for alternate limbs of the running difference.
+define(`acc0',`r25')
+define(`acc1',`r27')
+C  Limbs loaded from up; ul0/ul2 share r4 and ul1/ul3 share r5.
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r4')
+define(`ul3',`r5')
+C  Limbs loaded from rp; all four names share r24.
+define(`rl0',`r24')
+define(`rl1',`r24')
+define(`rl2',`r24')
+define(`rl3',`r24')
+C  Scratch: t0 gets a shifted mNa value, t1 a shifted accumulator.
+define(`t0',`r7')
+define(`t1',`r8')
+C  Short names for GMP_NAIL_BITS and GMP_NUMB_BITS.
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl  This declaration is munged by configure
+NAILS_SUPPORT(2-63)
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	sll	vl0, NAIL_BITS, vl0		C shift the multiplier up by NAIL_BITS, once
+	lda	numb_mask, -1(r31)		C all ones (r31 reads as zero) ...
+	srl	numb_mask, NAIL_BITS, numb_mask	C ... with the top NAIL_BITS bits cleared
+C Subtract up[i]*vl0 from rp[i] for n limbs.  n mod 4 selects the feed-in code.
+	and	n,	3,	r25
+	cmpeq	r25,	1,	r21
+	bne	r21,	L(1m4)
+	cmpeq	r25,	2,	r21
+	bne	r21,	L(2m4)
+	beq	r25,	L(0m4)
+C n mod 4 == 3 falls through to here; rp is biased by -8.
+L(3m4):	ldq	ul3,	0(up)
+	lda	n,	-4(n)
+	ldq	ul0,	8(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	ul1,	16(up)
+	lda	up,	24(up)
+	lda	rp,	-8(rp)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge3)
+C n - 4 < 0, so n == 3: prime the accumulators and finish in the tail.
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	r31,	acc1		C first limb: r31 (zero) is added, nothing carried in
+	subq	rl3,	acc1,	acc1
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	m3b,	acc0
+	sra	acc1,NUMB_BITS,	t1
+	br	r31,	L(ta3)
+C n >= 7: start two more products and join the loop at el3.
+L(ge3):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	addq	t0,	r31,	acc1
+	umulh	vl0,	ul2,	m2b
+	subq	rl3,	acc1,	acc1
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	addq	t0,	m3b,	acc0
+	sra	acc1,NUMB_BITS,	t1
+	br	r31,	L(el3)
+C n mod 4 == 0: n is biased by -8 at once here, rp is not biased.
+L(0m4):	lda	n,	-8(n)
+	ldq	ul2,	0(up)
+	ldq	ul3,	8(up)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge4)
+C n - 8 < 0: fewer than 8 limbs, finish in the tail at ta4.
+	ldq	rl2,	0(rp)
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul1,	m1b
+	subq	rl2,	acc0,	acc0
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	m2b,	acc1
+	sra	acc0,NUMB_BITS,	t1
+	br	r31,	L(ta4)
+C At least 8 limbs: join the loop at el0.
+L(ge4):	ldq	rl2,	0(rp)
+	srl	m2a,NAIL_BITS,	t0
+	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul1,	m1b
+	subq	rl2,	acc0,	acc0
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	addq	t0,	m2b,	acc1
+	sra	acc0,NUMB_BITS,	t1
+	br	r31,	L(el0)
+C n mod 4 == 2: rp is biased by -16.
+L(2m4):	lda	n,	-4(n)
+	ldq	ul0,	0(up)
+	ldq	ul1,	8(up)
+	lda	up,	16(up)
+	lda	rp,	-16(rp)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge2)
+C n - 4 < 0, so n == 2: finish in the tail at ta2.
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	r31,	acc0
+	subq	rl0,	acc0,	acc0
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0
+	addq	t0,	m0b,	acc1
+	sra	acc0,NUMB_BITS,	t1
+	br	r31,	L(ta2)
+C n >= 6: n is reduced by another 4 before the loop-or-tail decision below.
+L(ge2):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul3,	m3b
+	subq	rl0,	acc0,	acc0
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	lda	rp,	32(rp)
+	mulq	vl0,	ul0,	m0a
+	addq	t0,	m0b,	acc1
+	sra	acc0,NUMB_BITS,	t1
+	bge	n,	L(el2)
+C n went negative after the second -4 (n == 6): finish in the tail at ta6.
+	br	r31,	L(ta6)
+C n mod 4 == 1: rp is biased by -24.
+L(1m4):	lda	n,	-4(n)
+	ldq	ul1,	0(up)
+	lda	up,	8(up)
+	lda	rp,	-24(rp)
+	bge	n,	L(ge1)
+C n - 4 < 0, so n == 1: the single limb is handled inline, without the tail.
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0		C t0 = m1a >> NAIL_BITS
+	subq	rl1,	t0,	acc1		C acc1 = rp limb - t0
+	and	acc1,numb_mask,	r28		C result limb: acc1 under numb_mask
+	sra	acc1,NUMB_BITS,	t1		C t1 = acc1 >> NUMB_BITS, sign-propagating
+	stq	r28,	24(rp)
+	subq	m1b,	t1,	r0		C return value: m1b - t1
+	ret	r31,	(r26),	1
+C n >= 5: fill the pipeline, then pick the tail (n == 5) or the loop.
+L(ge1):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	lda	rp,	32(rp)
+	mulq	vl0,	ul0,	m0a
+	addq	t0,	r31,	acc1
+	umulh	vl0,	ul0,	m0b
+	subq	rl1,	acc1,	acc1
+	ldq	rl2,	0(rp)
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	m1b,	acc0
+	sra	acc1,NUMB_BITS,	t1
+	blt	n,	L(ta5)
+C n >= 9: load the next up limb and join the loop at el1.
+L(ge5):	ldq	ul2,	0(up)
+	br	r31,	L(el1)
+C Main loop, 4-way unrolled; the feed-in code above enters at el0..el3.
+	ALIGN(16)
+L(top):	mulq	vl0,	ul0,	m0a		C U1
+	addq	t0,	m0b,	acc1		C L0
+	sra	acc0,NUMB_BITS,	t1		C U0
+	stq	r28,	-24(rp)			C L1
+C
+L(el2):	umulh	vl0,	ul0,	m0b		C U1
+	and	acc0,numb_mask,	r28		C L0
+	subq	rl1,	acc1,	acc1		C U0
+	ldq	rl2,	0(rp)			C L1
+C
+	unop					C U1
+	addq	t1,	acc1,	acc1		C L0
+	srl	m2a,NAIL_BITS,	t0		C U0
+	ldq	ul2,	0(up)			C L1
+C
+	mulq	vl0,	ul1,	m1a		C U1
+	addq	t0,	m1b,	acc0		C L0
+	sra	acc1,NUMB_BITS,	t1		C U0
+	stq	r28,	-16(rp)			C L1
+C
+L(el1):	umulh	vl0,	ul1,	m1b		C U1
+	and	acc1,numb_mask,	r28		C L0
+	subq	rl2,	acc0,	acc0		C U0
+	ldq	rl3,	8(rp)			C L1
+C
+	lda	n,	-4(n)			C L1
+	addq	t1,	acc0,	acc0		C L0
+	srl	m3a,NAIL_BITS,	t0		C U0
+	ldq	ul3,	8(up)			C L1
+C
+	mulq	vl0,	ul2,	m2a		C U1
+	addq	t0,	m2b,	acc1		C L0
+	sra	acc0,NUMB_BITS,	t1		C U0
+	stq	r28,	-8(rp)			C L1
+C
+L(el0):	umulh	vl0,	ul2,	m2b		C U1
+	and	acc0,numb_mask,	r28		C L0
+	subq	rl3,	acc1,	acc1		C U0
+	ldq	rl0,	16(rp)			C L1
+C
+	unop					C U1
+	addq	t1,	acc1,	acc1		C L0
+	srl	m0a,NAIL_BITS,	t0		C U0
+	ldq	ul0,	16(up)			C L1
+C
+	mulq	vl0,	ul3,	m3a		C U1
+	addq	t0,	m3b,	acc0		C L0
+	sra	acc1,NUMB_BITS,	t1		C U0
+	stq	r28,	0(rp)			C L1
+C
+L(el3):	umulh	vl0,	ul3,	m3b		C U1
+	and	acc1,numb_mask,	r28		C L0
+	subq	rl0,	acc0,	acc0		C U0
+	ldq	rl1,	24(rp)			C L1
+C
+	unop					C U1
+	addq	t1,	acc0,	acc0		C L0
+	srl	m1a,NAIL_BITS,	t0		C U0
+	ldq	ul1,	24(up)			C L1
+C
+	lda	up,	32(up)			C L0
+	unop					C U1
+	lda	rp,	32(rp)			C L1
+	bge	n,	L(top)			C U0
+C Wind-down: no further up[] loads; ta2..ta6 are entered from the feed-in code.
+L(end):	mulq	vl0,	ul0,	m0a
+	addq	t0,	m0b,	acc1
+	sra	acc0,NUMB_BITS,	t1
+	stq	r28,	-24(rp)
+L(ta6):	umulh	vl0,	ul0,	m0b
+	and	acc0,numb_mask,	r28
+	subq	rl1,	acc1,	acc1
+	ldq	rl2,	0(rp)
+	addq	t1,	acc1,	acc1
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	m1b,	acc0
+	sra	acc1,NUMB_BITS,	t1
+	stq	r28,	-16(rp)
+L(ta5):	umulh	vl0,	ul1,	m1b
+	and	acc1,numb_mask,	r28
+	subq	rl2,	acc0,	acc0
+	ldq	rl3,	8(rp)
+	addq	t1,	acc0,	acc0
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	m2b,	acc1
+	sra	acc0,NUMB_BITS,	t1
+	stq	r28,	-8(rp)
+	unop
+	ALIGN(16)
+L(ta4):	and	acc0,numb_mask,	r28
+	subq	rl3,	acc1,	acc1
+	ldq	rl0,	16(rp)
+	addq	t1,	acc1,	acc1
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	m3b,	acc0
+	sra	acc1,NUMB_BITS,	t1
+	stq	r28,	0(rp)
+	unop
+	ALIGN(16)
+L(ta3):	and	acc1,numb_mask,	r28
+	subq	rl0,	acc0,	acc0
+	ldq	rl1,	24(rp)
+	addq	t1,	acc0,	acc0
+	srl	m1a,NAIL_BITS,	t0
+	addq	t0,	m0b,	acc1
+	sra	acc0,NUMB_BITS,	t1
+	stq	r28,	8(rp)
+	unop
+	ALIGN(16)
+L(ta2):	and	acc0,numb_mask,	r28
+	subq	rl1,	acc1,	acc1
+	addq	t1,	acc1,	acc1
+	sra	acc1,NUMB_BITS,	t1
+	stq	r28,	16(rp)
+	and	acc1,numb_mask,	r28
+	subq	m1b,	t1,	r0		C return value: m1b - t1
+	stq	r28,	24(rp)
+	ret	r31,	(r26),	1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/slot.pl b/gmp/mpn/alpha/ev6/slot.pl
new file mode 100755
index 0000000000..a4c8a36882
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/slot.pl
@@ -0,0 +1,318 @@
+#!/usr/bin/perl -w
+
+# Copyright 2000, 2001, 2003-2005, 2011 Free Software Foundation, Inc.
+#
+#  This file is part of the GNU MP Library.
+#
+#  The GNU MP Library is free software; you can redistribute it and/or modify
+#  it under the terms of either:
+#
+#    * the GNU Lesser General Public License as published by the Free
+#      Software Foundation; either version 3 of the License, or (at your
+#      option) any later version.
+#
+#  or
+#
+#    * the GNU General Public License as published by the Free Software
+#      Foundation; either version 2 of the License, or (at your option) any
+#      later version.
+#
+#  or both in parallel, as here.
+#
+#  The GNU MP Library is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+#  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+#  for more details.
+#
+#  You should have received copies of the GNU General Public License and the
+#  GNU Lesser General Public License along with the GNU MP Library.  If not,
+#  see https://www.gnu.org/licenses/.
+
+
+# Usage: slot.pl [filename.o]...
+#
+# Run "objdump" to produce a disassembly of the given object file(s) and
+# annotate the output with "U" or "L" slotting which Alpha EV6 will use.
+#
+# When an instruction is E (ie. either U or L), an "eU" or "eL" is shown, as
+# a reminder that it wasn't a fixed requirement that gave the U or L, but
+# the octaword slotting rules.
+#
+# If an instruction is not recognised, that octaword does not get any U/L
+# shown, only lower-case "u", "l" or "e" for the instructions which are
+# known.  Add any unknown instructions to %optable below.
+
+
+use strict;
+
+# The U or L which various instructions demand, or E if either.
+#
+my %optable =
+  (
+   'addq'   => 'E',
+   'and'    => 'E',
+   'andnot' => 'E',
+   'beq'    => 'U',
+   'bge'    => 'U',
+   'bgt'    => 'U',
+   'bic'    => 'E',
+   'bis'    => 'E',
+   'blt'    => 'U',
+   'bne'    => 'U',
+   'br'     => 'L',
+   'clr'    => 'E',
+   'cmoveq' => 'E',
+   'cmovne' => 'E',
+   'cmpeq'  => 'E',
+   'cmpule' => 'E',
+   'cmpult' => 'E',
+   'ctlz'   => 'U',
+   'ctpop'  => 'U',
+   'cttz'   => 'U',
+   'extbl'  => 'U',
+   'extlh'  => 'U',
+   'extll'  => 'U',
+   'extqh'  => 'U',
+   'extql'  => 'U',
+   'extwh'  => 'U',
+   'extwl'  => 'U',
+   'jsr'    => 'L',
+   'lda'    => 'E',
+   'ldah'   => 'E',
+   'ldbu'   => 'L',
+   'ldl'    => 'L',
+   'ldq'    => 'L',
+   'ldt'    => 'L',
+   'mov'    => 'E',
+   'mull'   => 'U',
+   'mulq'   => 'U',
+   'negq'   => 'E',
+   'nop'    => 'E',
+   'not'    => 'E',
+   'ret'    => 'L',
+   's8addq' => 'E',
+   's8subq' => 'E',
+   # 'sextb', 'sextl' => ?
+   'sll'    => 'U',
+   'sra'    => 'U',   # shift, like sll/srl; used by nails/submul_1.asm
+   'srl'    => 'U',
+   'stq'    => 'L',
+   'subq'   => 'E',
+   'umulh'  => 'U',
+   'unop'   => 'E',
+   'xor'    => 'E',
+  );
+
+# Slottings used for a given pattern of U/L/E in an octaword.  This is as
+# per the "Ebox Slotting" section of the EV6 hardware reference manual.
+#
+my %slottable =
+  (
+   # Keys in alphabetical order; a blank line separates each leading class.
+   EEEE => 'ULUL',
+   EEEL => 'ULUL',
+   EEEU => 'ULLU',
+   EELE => 'ULLU',
+   EELL => 'UULL',
+   EELU => 'ULLU',
+   EEUE => 'ULUL',
+   EEUL => 'ULUL',
+   EEUU => 'LLUU',
+   ELEE => 'ULUL',
+   ELEL => 'ULUL',
+   ELEU => 'ULLU',
+   ELLE => 'ULLU',
+   ELLL => 'ULLL',
+   ELLU => 'ULLU',
+   ELUE => 'ULUL',
+   ELUL => 'ULUL',
+   ELUU => 'LLUU',
+   EUEE => 'LULU',
+   EUEL => 'LUUL',
+   EUEU => 'LULU',
+   EULE => 'LULU',
+   EULL => 'UULL',
+   EULU => 'LULU',
+   EUUE => 'LUUL',
+   EUUL => 'LUUL',
+   EUUU => 'LUUU',
+
+   LEEE => 'LULU',
+   LEEL => 'LUUL',
+   LEEU => 'LULU',
+   LELE => 'LULU',
+   LELL => 'LULL',
+   LELU => 'LULU',
+   LEUE => 'LUUL',
+   LEUL => 'LUUL',
+   LEUU => 'LLUU',
+   LLEE => 'LLUU',
+   LLEL => 'LLUL',
+   LLEU => 'LLUU',
+   LLLE => 'LLLU',
+   LLLL => 'LLLL',
+   LLLU => 'LLLU',
+   LLUE => 'LLUU',
+   LLUL => 'LLUL',
+   LLUU => 'LLUU',
+   LUEE => 'LULU',
+   LUEL => 'LUUL',
+   LUEU => 'LULU',
+   LULE => 'LULU',
+   LULL => 'LULL',
+   LULU => 'LULU',
+   LUUE => 'LUUL',
+   LUUL => 'LUUL',
+   LUUU => 'LUUU',
+
+   UEEE => 'ULUL',
+   UEEL => 'ULUL',
+   UEEU => 'ULLU',
+   UELE => 'ULLU',
+   UELL => 'UULL',
+   UELU => 'ULLU',
+   UEUE => 'ULUL',
+   UEUL => 'ULUL',
+   UEUU => 'ULUU',
+   ULEE => 'ULUL',
+   ULEL => 'ULUL',
+   ULEU => 'ULLU',
+   ULLE => 'ULLU',
+   ULLL => 'ULLL',
+   ULLU => 'ULLU',
+   ULUE => 'ULUL',
+   ULUL => 'ULUL',
+   ULUU => 'ULUU',
+   UUEE => 'UULL',
+   UUEL => 'UULL',
+   UUEU => 'UULU',
+   UULE => 'UULL',
+   UULL => 'UULL',
+   UULU => 'UULU',
+   UUUE => 'UUUL',
+   UUUL => 'UUUL',
+   UUUU => 'UUUU',
+  );
+
+# Print a "slottable missing" line for every 4-letter combination of U, L
+# and E which has no entry in %slottable.
+sub coverage {
+  my @classes = ('U', 'L', 'E');
+  my @patterns = ('');
+  # Extend every prefix by each class, four times over.  The first letter
+  # ends up varying slowest, the same order nested loops would produce.
+  for my $pos (1 .. 4) {
+    @patterns = map { my $head = $_; map { $head . $_ } @classes } @patterns;
+  }
+
+  foreach my $x (@patterns) {
+    print "slottable missing: $x\n" unless defined $slottable{$x};
+  }
+}
+
+# Consistency checks for %slottable.  For each entry whose expected slotting
+# follows from a simple rule, print a "want ... got ..." line when the table
+# disagrees.  The rules checked are:
+#
+#   no E     - the slotting is the pattern itself
+#   one E    - the E becomes whichever of U or L the other three
+#              instructions use less
+#   two E's  - with two L's both E's become U, with two U's both become L;
+#              a pattern with one U and one L is not checked
+#
+# Patterns with three or four E's are not checked.
+sub check {
+  foreach my $x (keys %slottable) {
+    # Only the first four letters take part in the counts.
+    my $head = substr($x, 0, 4);
+    my $n_e = ($head =~ tr/E//);
+    my $n_l = ($head =~ tr/L//);
+    my $n_u = ($head =~ tr/U//);
+
+    # Nothing simple can be said about three or more E's.
+    next if $n_e >= 3;
+
+    my $got = $slottable{$x};
+    my $want = $x;
+
+    if ($n_e == 1) {
+      # The three fixed letters split 2:1 or 3:0, so there is never a tie.
+      my $fill = ($n_l > $n_u) ? 'U' : 'L';
+      $want =~ s/E/$fill/;
+
+    } elsif ($n_e == 2) {
+      # Both E's take the class which the two fixed letters do not use.
+      if ($n_l == 2) {
+        $want =~ s/E/U/g;
+      } elsif ($n_u == 2) {
+        $want =~ s/E/L/g;
+      } else {
+        next;
+      }
+    }
+
+    if ($want ne $got) {
+      print "slottable $x want $want got $got\n";
+    }
+  }
+}
+
+sub disassemble {
+  # Run objdump on $file and print every instruction with its EV6 slotting.
+  my ($file) = @_;
+
+  # List-form pipe open: $file reaches objdump as one argument and is never
+  # parsed by a shell, so spaces or metacharacters in the name are harmless.
+  open (my $in, '-|', 'objdump', '-Srfh', $file)
+    || die "Cannot open pipe from objdump\n";
+
+  my (%pre, %post, %type);
+  while (my $line = <$in>) {
+    # $1 = address and raw bytes, $2 = last hex digit of the address,
+    # $3 = mnemonic plus operands, $4 = mnemonic alone.
+    if ($line =~ /(^[ \t]*[0-9a-f]*([0-9a-f]):[ \t]*[0-9a-f][0-9a-f] [0-9a-f][0-9a-f] [0-9a-f][0-9a-f] [0-9a-f][0-9a-f] )\t(([a-z0-9]+).*)/) {
+      my ($this_pre, $addr, $this_post, $opcode) = ($1, $2, $3, $4);
+      my $this_type = $optable{$opcode};
+      if (! defined ($this_type)) { $this_type = ' '; }
+
+      $pre{$addr} = $this_pre;
+      $post{$addr} = $this_post;
+      $type{$addr} = $this_type;
+
+      # An address ending in "c" completes an octaword: slot it and print.
+      if ($addr eq 'c') {
+        my %slot = ('0'=>' ', '4'=>' ', '8'=>' ', 'c'=>' ');
+
+        # Instructions not seen (partial octaword) count as unknown.
+        my $str = join '', map { defined $type{$_} ? $type{$_} : ' ' }
+                               ('c', '8', '4', '0');
+        $str = $slottable{$str};
+        @slot{'c', '8', '4', '0'} = split (//, $str) if defined $str;
+
+        foreach my $i ('0', '4', '8', 'c') {
+          next unless defined $pre{$i};
+          if ($slot{$i} eq $type{$i}) { $type{$i} = ' '; }
+          print $pre{$i}, ' ', lc($type{$i}),$slot{$i}, '  ', $post{$i}, "\n";
+        }
+
+        %pre = %post = %type = ();
+      }
+    }
+  }
+
+  close ($in) || die "Error from objdump (or objdump not available)\n";
+}
+
+# Self-check %slottable first; problems are reported on stdout.
+coverage();
+check();
+
+# Every command-line argument names an object file to annotate.
+my @files = @ARGV;
+if (! @files) {
+  die "Usage: slot.pl [filename.o]...\n";
+}
+
+foreach my $file (@files) {
+  disassemble($file);
+}
diff --git a/gmp/mpn/alpha/ev6/sub_n.asm b/gmp/mpn/alpha/ev6/sub_n.asm
new file mode 100644
index 0000000000..a35ba40d34
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/sub_n.asm
@@ -0,0 +1,283 @@
+dnl  Alpha ev6 mpn_sub_n -- Subtract two limb vectors of the same length > 0
+dnl  and store difference in a third limb vector.
+
+dnl  Copyright 2000, 2003, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:     ?
+C EV5:     5.4
+C EV6:     2.125
+
+C  INPUT PARAMETERS
+C  rp	r16
+C  up	r17
+C  vp	r18
+C  n	r19
+C  cy	r20   (for mpn_sub_nc)
+
+C TODO
+C   Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
+C   Use multi-pronged feed-in.
+C   Perform additional micro-tuning
+
+C  This code was written in cooperation with ev6 pipeline expert Steve Root.
+
+C  Pair loads and stores where possible
+C  Store pairs oct-aligned where possible (didn't need it here)
+C  Stores are delayed every third cycle
+C  Loads and stores are delayed by fills
+C  U stays still, put code there where possible (note alternation of U1 and U0)
+C  L moves because of loads and stores
+C  Note dampers in L to limit damage
+
+C  This odd-looking optimization expects that we have random bits in our
+C  data, so that a pure zero result is unlikely.  So we penalize the unlikely
+C  case to help the common case.
+
+define(`u0', `r0')  define(`u1', `r3')
+define(`v0', `r1')  define(`v1', `r4')
+C  u0/u1 and v0/v1 above receive limbs loaded via r17 and r18; cy0/cy1 below hold borrows.
+define(`cy0', `r20')  define(`cy1', `r21')
+
+MULFUNC_PROLOGUE(mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+PROLOGUE(mpn_sub_nc)
+	br	r31,	$entry		C join mpn_sub_n after its instruction that clears cy0, so cy0 keeps the incoming value
+EPILOGUE()
+PROLOGUE(mpn_sub_n)
+	bis	r31,	r31,	cy0	C clear carry in
+$entry:	cmpult	r19,	5,	r22	C L1 r22 = (n < 5)
+	ldq	u1,	0(r17)		C L0 get next ones
+	ldq	v1,	0(r18)		C L1
+	bne	r22,	$Lsmall		C fewer than 5 limbs: simple loop only
+C Feed-in for the unrolled code: five limb pairs get loaded in all.
+	ldq	u0,	8(r17)		C L0 get next ones
+	ldq	v0,	8(r18)		C L1
+	subq	u1,	v1,	r5	C U0 sub two data
+
+	cmpult	u1,	v1,	r23	C U0 did it borrow
+	ldq	u1,	16(r17)		C L0 get next ones
+	ldq	v1,	16(r18)		C L1
+
+	subq	u0,	v0,	r8	C U1 sub two data
+	subq	r5,	cy0,	r24	C U0 borrow in
+
+	cmpult	u0,	v0,	r22	C U1 did it borrow
+	beq	r5,	$fix5f		C U0 fix exact zero
+$ret5f:	ldq	u0,	24(r17)		C L0 get next ones
+	ldq	v0,	24(r18)		C L1
+
+	subq	r8,	r23,	r25	C U1 borrow from last
+	subq	u1,	v1,	r7	C U0 sub two data
+
+	beq	r8,	$fix6f		C U1 fix exact zero
+$ret6f:	cmpult	u1,	v1,	r23	C U0 did it borrow
+	ldq	u1,	32(r17)		C L0 get next ones
+	ldq	v1,	32(r18)		C L1
+
+	lda	r17,	40(r17)		C L0 move pointer
+	lda	r18,	40(r18)		C L1 move pointer
+
+	lda	r16,	-8(r16)		C bias rp: the first store below uses 8(r16)
+	lda	r19,	-13(r19)	C L1 move counter
+	blt	r19,	$Lend		C U1 loop control
+
+
+C Main loop.  8-way unrolled.
+	ALIGN(16)
+$Loop:	subq	u0,	v0,	r2	C U1 sub two data
+	stq	r24,	8(r16)		C L0 put an answer
+	subq	r7,	r22,	r24	C U0 borrow from last
+	stq	r25,	16(r16)		C L1 pair
+
+	cmpult	u0,	v0,	cy1	C U1 did it borrow
+	beq	r7,	$fix7		C U0 fix exact 0
+$ret7:	ldq	u0,	0(r17)		C L0 get next ones
+	ldq	v0,	0(r18)		C L1
+
+	bis	r31,	r31,	r31	C L  damp out
+	subq	r2,	r23,	r25	C U1 borrow from last
+	bis	r31,	r31,	r31	C L  moves in L !
+	subq	u1,	v1,	r5	C U0 sub two data
+
+	beq	r2,	$fix0		C U1 fix exact zero
+$ret0:	cmpult	u1,	v1,	cy0	C U0 did it borrow
+	ldq	u1,	8(r17)		C L0 get next ones
+	ldq	v1,	8(r18)		C L1
+
+	subq	u0,	v0,	r8	C U1 sub two data
+	stq	r24,	24(r16)		C L0 store pair
+	subq	r5,	cy1,	r24	C U0 borrow from last
+	stq	r25,	32(r16)		C L1
+
+	cmpult	u0,	v0,	r22	C U1 did it borrow
+	beq	r5,	$fix1		C U0 fix exact zero
+$ret1:	ldq	u0,	16(r17)		C L0 get next ones
+	ldq	v0,	16(r18)		C L1
+
+	lda	r16,	64(r16)		C L0 move pointer
+	subq	r8,	cy0,	r25	C U1 borrow from last
+	lda	r19,	-8(r19)		C L1 move counter
+	subq	u1,	v1,	r7	C U0 sub two data
+
+	beq	r8,	$fix2		C U1 fix exact zero
+$ret2:	cmpult	u1,	v1,	r23	C U0 did it borrow
+	ldq	u1,	24(r17)		C L0 get next ones
+	ldq	v1,	24(r18)		C L1
+
+	subq	u0,	v0,	r2	C U1 sub two data
+	stq	r24,	-24(r16)	C L0 put an answer
+	subq	r7,	r22,	r24	C U0 borrow from last
+	stq	r25,	-16(r16)	C L1 pair
+
+	cmpult	u0,	v0,	cy1	C U1 did it borrow
+	beq	r7,	$fix3		C U0 fix exact 0
+$ret3:	ldq	u0,	32(r17)		C L0 get next ones
+	ldq	v0,	32(r18)		C L1
+
+	bis	r31,	r31,	r31	C L  damp out
+	subq	r2,	r23,	r25	C U1 borrow from last
+	bis	r31,	r31,	r31	C L  moves in L !
+	subq	u1,	v1,	r5	C U0 sub two data
+
+	beq	r2,	$fix4		C U1 fix exact zero
+$ret4:	cmpult	u1,	v1,	cy0	C U0 did it borrow
+	ldq	u1,	40(r17)		C L0 get next ones
+	ldq	v1,	40(r18)		C L1
+
+	subq	u0,	v0,	r8	C U1 sub two data
+	stq	r24,	-8(r16)		C L0 store pair
+	subq	r5,	cy1,	r24	C U0 borrow from last
+	stq	r25,	0(r16)		C L1
+
+	cmpult	u0,	v0,	r22	C U1 did it borrow
+	beq	r5,	$fix5		C U0 fix exact zero
+$ret5:	ldq	u0,	48(r17)		C L0 get next ones
+	ldq	v0,	48(r18)		C L1
+
+	ldl	r31, 256(r17)		C L0 prefetch
+	subq	r8,	cy0,	r25	C U1 borrow from last
+	ldl	r31, 256(r18)		C L1 prefetch
+	subq	u1,	v1,	r7	C U0 sub two data
+
+	beq	r8,	$fix6		C U1 fix exact zero
+$ret6:	cmpult	u1,	v1,	r23	C U0 did it borrow
+	ldq	u1,	56(r17)		C L0 get next ones
+	ldq	v1,	56(r18)		C L1
+
+	lda	r17,	64(r17)		C L0 move pointer
+	bis	r31,	r31,	r31	C U
+	lda	r18,	64(r18)		C L1 move pointer
+	bge	r19,	$Loop		C U1 loop control
+C ==== main loop end
+C Wind-down: store the five results still in the pipeline, no more loads.
+$Lend:	subq	u0,	v0,	r2	C U1 sub two data
+	stq	r24,	8(r16)		C L0 put an answer
+	subq	r7,	r22,	r24	C U0 borrow from last
+	stq	r25,	16(r16)		C L1 pair
+	cmpult	u0,	v0,	cy1	C U1 did it borrow
+	beq	r7,	$fix7c		C U0 fix exact 0
+$ret7c:	subq	r2,	r23,	r25	C U1 borrow from last
+	subq	u1,	v1,	r5	C U0 sub two data
+	beq	r2,	$fix0c		C U1 fix exact zero
+$ret0c:	cmpult	u1,	v1,	cy0	C U0 did it borrow
+	stq	r24,	24(r16)		C L0 store pair
+	subq	r5,	cy1,	r24	C U0 borrow from last
+	stq	r25,	32(r16)		C L1
+	beq	r5,	$fix1c		C U0 fix exact zero
+$ret1c:	stq	r24,	40(r16)		C L0 put an answer
+	lda	r16,	48(r16)		C L0 move pointer
+C After the adjustment below, r19 is the number of limbs not yet handled.
+	lda	r19,	8(r19)
+	beq	r19,	$Lret		C none left: return the borrow in cy0
+C Load the next limb pair for the one-limb-at-a-time code.
+	ldq	u1,	0(r17)
+	ldq	v1,	0(r18)
+$Lsmall:
+	lda	r19,	-1(r19)
+	beq	r19,	$Lend0		C last limb: skip the loop
+C Simple loop: one limb per iteration, cy0 is the running borrow.
+	ALIGN(8)
+$Loop0:	subq	u1,	v1,	r2	C main sub
+	cmpult	u1,	v1,	r8	C compute bw from last sub
+	ldq	u1,	8(r17)
+	ldq	v1,	8(r18)
+	subq	r2,	cy0,	r5	C borrow sub
+	lda	r17,	8(r17)
+	lda	r18,	8(r18)
+	stq	r5,	0(r16)
+	cmpult	r2,	cy0,	cy0	C compute bw from last sub
+	lda	r19,	-1(r19)		C decr loop cnt
+	bis	r8,	cy0,	cy0	C combine bw from the two subs
+	lda	r16,	8(r16)
+	bne	r19,	$Loop0
+$Lend0:	subq	u1,	v1,	r2	C main sub
+	subq	r2,	cy0,	r5	C borrow sub
+	cmpult	u1,	v1,	r8	C compute bw from last sub
+	cmpult	r2,	cy0,	cy0	C compute bw from last sub
+	stq	r5,	0(r16)
+	bis	r8,	cy0,	r0	C combine bw from the two subs
+	ret	r31,(r26),1
+C Reached when the unrolled code handled every limb; cy0 holds the last borrow.
+	ALIGN(8)
+$Lret:	lda	r0,	0(cy0)		C copy borrow into return register
+	ret	r31,(r26),1
+C Fixups for an exactly-zero raw difference: OR the incoming borrow forward.
+$fix5f:	bis	r23,	cy0,	r23	C bring forward borrow
+	br	r31,	$ret5f
+$fix6f:	bis	r22,	r23,	r22	C bring forward borrow
+	br	r31,	$ret6f
+$fix0:	bis	cy1,	r23,	cy1	C bring forward borrow
+	br	r31,	$ret0
+$fix1:	bis	cy0,	cy1,	cy0	C bring forward borrow
+	br	r31,	$ret1
+$fix2:	bis	r22,	cy0,	r22	C bring forward borrow
+	br	r31,	$ret2
+$fix3:	bis	r23,	r22,	r23	C bring forward borrow
+	br	r31,	$ret3
+$fix4:	bis	cy1,	r23,	cy1	C bring forward borrow
+	br	r31,	$ret4
+$fix5:	bis	cy1,	cy0,	cy0	C bring forward borrow
+	br	r31,	$ret5
+$fix6:	bis	r22,	cy0,	r22	C bring forward borrow
+	br	r31,	$ret6
+$fix7:	bis	r23,	r22,	r23	C bring forward borrow
+	br	r31,	$ret7
+$fix0c:	bis	cy1,	r23,	cy1	C bring forward borrow
+	br	r31,	$ret0c
+$fix1c:	bis	cy0,	cy1,	cy0	C bring forward borrow
+	br	r31,	$ret1c
+$fix7c:	bis	r23,	r22,	r23	C bring forward borrow
+	br	r31,	$ret7c
+
+EPILOGUE()
+ASM_END()