summaryrefslogtreecommitdiff
path: root/gmp/mpn/alpha/ev6
diff options
context:
space:
mode:
Diffstat (limited to 'gmp/mpn/alpha/ev6')
-rw-r--r--gmp/mpn/alpha/ev6/add_n.asm283
-rw-r--r--gmp/mpn/alpha/ev6/aorslsh1_n.asm172
-rw-r--r--gmp/mpn/alpha/ev6/aorsmul_1.asm398
-rw-r--r--gmp/mpn/alpha/ev6/gmp-mparam.h209
-rw-r--r--gmp/mpn/alpha/ev6/mod_1_4.asm337
-rw-r--r--gmp/mpn/alpha/ev6/mul_1.asm496
-rw-r--r--gmp/mpn/alpha/ev6/nails/README65
-rw-r--r--gmp/mpn/alpha/ev6/nails/addmul_1.asm396
-rw-r--r--gmp/mpn/alpha/ev6/nails/addmul_2.asm146
-rw-r--r--gmp/mpn/alpha/ev6/nails/addmul_3.asm169
-rw-r--r--gmp/mpn/alpha/ev6/nails/addmul_4.asm210
-rw-r--r--gmp/mpn/alpha/ev6/nails/aors_n.asm233
-rw-r--r--gmp/mpn/alpha/ev6/nails/gmp-mparam.h72
-rw-r--r--gmp/mpn/alpha/ev6/nails/mul_1.asm364
-rw-r--r--gmp/mpn/alpha/ev6/nails/submul_1.asm396
-rwxr-xr-xgmp/mpn/alpha/ev6/slot.pl318
-rw-r--r--gmp/mpn/alpha/ev6/sub_n.asm283
17 files changed, 4547 insertions, 0 deletions
diff --git a/gmp/mpn/alpha/ev6/add_n.asm b/gmp/mpn/alpha/ev6/add_n.asm
new file mode 100644
index 0000000000..9261f31b8a
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/add_n.asm
@@ -0,0 +1,283 @@
+dnl Alpha ev6 mpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl store sum in a third limb vector.
+
+dnl Copyright 2000, 2003, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: ?
+C EV5: 5.4
+C EV6: 2.125
+
+C INPUT PARAMETERS
+C rp r16
+C up r17
+C vp r18
+C n r19
+C cy r20 (for mpn_add_nc)
+
+C TODO
+C Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
+C Use multi-pronged feed-in.
+C Perform additional micro-tuning
+
+C This code was written in cooperation with ev6 pipeline expert Steve Root.
+
+C Pair loads and stores where possible
+C Store pairs oct-aligned where possible (didn't need it here)
+C Stores are delayed every third cycle
+C Loads and stores are delayed by fills
+C U stays still, put code there where possible (note alternation of U1 and U0)
+C L moves because of loads and stores
+C Note dampers in L to limit damage
+
+C This odd-looking optimization expects that we're having random bits in our
+C data, so that a pure zero result is unlikely. So we penalize the unlikely
+C case to help the common case.
+
+define(`u0', `r0') define(`u1', `r3') C limbs loaded from up (r17)
+define(`v0', `r1') define(`v1', `r4') C limbs loaded from vp (r18)
+
+define(`cy0', `r20') define(`cy1', `r21') C carry regs; r20 also holds the mpn_add_nc carry-in
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc)
+
+ASM_START()
+PROLOGUE(mpn_add_nc)
+ br r31, $entry C carry-in is already in cy0; join mpn_add_n after its carry clear
+EPILOGUE()
+PROLOGUE(mpn_add_n)
+ bis r31, r31, cy0 C clear carry in
+$entry: cmpult r19, 5, r22 C L1 r22 = (n < 5)
+ ldq u1, 0(r17) C L0 get next ones
+ ldq v1, 0(r18) C L1
+ bne r22, $Lsmall C fewer than 5 limbs: use the one-limb-per-pass loop
+
+ ldq u0, 8(r17) C L0 get next ones
+ ldq v0, 8(r18) C L1
+ addq u1, v1, r5 C U0 add two data
+
+ cmpult r5, v1, r23 C U0 did it carry
+ ldq u1, 16(r17) C L0 get next ones
+ ldq v1, 16(r18) C L1
+
+ addq u0, v0, r8 C U1 add two data
+ addq r5, cy0, r5 C U0 carry in
+
+ cmpult r8, v0, r22 C U1 did it carry
+ beq r5, $fix5f C U0 fix exact zero
+$ret5f: ldq u0, 24(r17) C L0 get next ones
+ ldq v0, 24(r18) C L1
+
+ addq r8, r23, r8 C U1 carry from last
+ addq u1, v1, r7 C U0 add two data
+
+ beq r8, $fix6f C U1 fix exact zero
+$ret6f: cmpult r7, v1, r23 C U0 did it carry
+ ldq u1, 32(r17) C L0 get next ones
+ ldq v1, 32(r18) C L1
+
+ lda r17, 40(r17) C L0 move pointer
+ lda r18, 40(r18) C L1 move pointer
+
+ lda r16, -8(r16) C bias rp by one limb; later stores start at offset 8
+ lda r19, -13(r19) C L1 n -= 13 (5 limbs loaded here + 8 for a loop pass)
+ blt r19, $Lend C U1 loop control
+
+
+C Main loop. 8-way unrolled: each pass loads 8 limb pairs and stores 8 sums.
+ ALIGN(16)
+$Loop: addq u0, v0, r2 C U1 add two data
+ addq r7, r22, r7 C U0 add in carry
+ stq r5, 8(r16) C L0 put an answer
+ stq r8, 16(r16) C L1 pair
+
+ cmpult r2, v0, cy1 C U1 did it carry
+ beq r7, $fix7 C U0 fix exact 0
+$ret7: ldq u0, 0(r17) C L0 get next ones
+ ldq v0, 0(r18) C L1
+
+ bis r31, r31, r31 C L damp out
+ addq r2, r23, r2 C U1 carry from last
+ bis r31, r31, r31 C L moves in L !
+ addq u1, v1, r5 C U0 add two data
+
+ beq r2, $fix0 C U1 fix exact zero
+$ret0: cmpult r5, v1, cy0 C U0 did it carry
+ ldq u1, 8(r17) C L0 get next ones
+ ldq v1, 8(r18) C L1
+
+ addq u0, v0, r8 C U1 add two data
+ addq r5, cy1, r5 C U0 carry from last
+ stq r7, 24(r16) C L0 store pair
+ stq r2, 32(r16) C L1
+
+ cmpult r8, v0, r22 C U1 did it carry
+ beq r5, $fix1 C U0 fix exact zero
+$ret1: ldq u0, 16(r17) C L0 get next ones
+ ldq v0, 16(r18) C L1
+
+ lda r16, 64(r16) C L0 move pointer
+ addq r8, cy0, r8 C U1 carry from last
+ lda r19, -8(r19) C L1 move counter
+ addq u1, v1, r7 C U0 add two data
+
+ beq r8, $fix2 C U1 fix exact zero
+$ret2: cmpult r7, v1, r23 C U0 did it carry
+ ldq u1, 24(r17) C L0 get next ones
+ ldq v1, 24(r18) C L1
+
+ addq u0, v0, r2 C U1 add two data
+ addq r7, r22, r7 C U0 add in carry
+ stq r5, -24(r16) C L0 put an answer
+ stq r8, -16(r16) C L1 pair
+
+ cmpult r2, v0, cy1 C U1 did it carry
+ beq r7, $fix3 C U0 fix exact 0
+$ret3: ldq u0, 32(r17) C L0 get next ones
+ ldq v0, 32(r18) C L1
+
+ bis r31, r31, r31 C L damp out
+ addq r2, r23, r2 C U1 carry from last
+ bis r31, r31, r31 C L moves in L !
+ addq u1, v1, r5 C U0 add two data
+
+ beq r2, $fix4 C U1 fix exact zero
+$ret4: cmpult r5, v1, cy0 C U0 did it carry
+ ldq u1, 40(r17) C L0 get next ones
+ ldq v1, 40(r18) C L1
+
+ addq u0, v0, r8 C U1 add two data
+ addq r5, cy1, r5 C U0 carry from last
+ stq r7, -8(r16) C L0 store pair
+ stq r2, 0(r16) C L1
+
+ cmpult r8, v0, r22 C U1 did it carry
+ beq r5, $fix5 C U0 fix exact zero
+$ret5: ldq u0, 48(r17) C L0 get next ones
+ ldq v0, 48(r18) C L1
+
+ ldl r31, 256(r17) C L0 prefetch up, 256 bytes ahead
+ addq r8, cy0, r8 C U1 carry from last
+ ldl r31, 256(r18) C L1 prefetch vp, 256 bytes ahead
+ addq u1, v1, r7 C U0 add two data
+
+ beq r8, $fix6 C U1 fix exact zero
+$ret6: cmpult r7, v1, r23 C U0 did it carry
+ ldq u1, 56(r17) C L0 get next ones
+ ldq v1, 56(r18) C L1
+
+ lda r17, 64(r17) C L0 move pointer
+ bis r31, r31, r31 C U filler (same bis r31 form as the dampers above)
+ lda r18, 64(r18) C L1 move pointer
+ bge r19, $Loop C U1 loop control: another pass while 8 or more limbs are unloaded
+C ==== main loop end
+
+$Lend: addq u0, v0, r2 C U1 add two data
+ addq r7, r22, r7 C U0 add in carry
+ stq r5, 8(r16) C L0 put an answer
+ stq r8, 16(r16) C L1 pair
+ cmpult r2, v0, cy1 C U1 did it carry
+ beq r7, $fix7c C U0 fix exact 0
+$ret7c: addq r2, r23, r2 C U1 carry from last
+ addq u1, v1, r5 C U0 add two data
+ beq r2, $fix0c C U1 fix exact zero
+$ret0c: cmpult r5, v1, cy0 C U0 did it carry
+ addq r5, cy1, r5 C U0 carry from last
+ stq r7, 24(r16) C L0 store pair
+ stq r2, 32(r16) C L1
+ beq r5, $fix1c C U0 fix exact zero
+$ret1c: stq r5, 40(r16) C L0 put an answer
+ lda r16, 48(r16) C L0 move pointer
+
+ lda r19, 8(r19) C undo the loop bias: r19 = limbs still to do
+ beq r19, $Lret C none left: return the carry in cy0
+
+ ldq u1, 0(r17) C preload the first leftover limb pair, as $Lsmall expects
+ ldq v1, 0(r18)
+$Lsmall:
+ lda r19, -1(r19) C the last limb is handled at $Lend0
+ beq r19, $Lend0 C only one limb: skip the loop
+
+ ALIGN(8)
+$Loop0: addq u1, v1, r2 C main add
+ cmpult r2, v1, r8 C compute cy from last add
+ ldq u1, 8(r17) C fetch the next limb pair
+ ldq v1, 8(r18)
+ addq r2, cy0, r5 C carry add
+ lda r17, 8(r17) C advance up
+ lda r18, 8(r18) C advance vp
+ stq r5, 0(r16) C store sum limb
+ cmpult r5, r2, cy0 C compute cy from last add
+ lda r19, -1(r19) C decr loop cnt
+ bis r8, cy0, cy0 C combine cy from the two adds
+ lda r16, 8(r16) C advance rp
+ bne r19, $Loop0 C loop until one limb pair is left
+$Lend0: addq u1, v1, r2 C main add
+ addq r2, cy0, r5 C carry add
+ cmpult r2, v1, r8 C compute cy from last add
+ cmpult r5, r2, cy0 C compute cy from last add
+ stq r5, 0(r16) C store the final sum limb
+ bis r8, cy0, r0 C combine cy from the two adds
+ ret r31,(r26),1 C carry out is in r0
+
+ ALIGN(8)
+$Lret: lda r0, 0(cy0) C copy carry into return register
+ ret r31,(r26),1
+
+$fix5f: bis r23, cy0, r23 C sum plus carry-in came out 0: OR the carry-in into the carry-out
+ br r31, $ret5f C resume the main path
+$fix6f: bis r22, r23, r22 C bring forward carry
+ br r31, $ret6f
+$fix0: bis cy1, r23, cy1 C bring forward carry
+ br r31, $ret0
+$fix1: bis cy0, cy1, cy0 C bring forward carry
+ br r31, $ret1
+$fix2: bis r22, cy0, r22 C bring forward carry
+ br r31, $ret2
+$fix3: bis r23, r22, r23 C bring forward carry
+ br r31, $ret3
+$fix4: bis cy1, r23, cy1 C bring forward carry
+ br r31, $ret4
+$fix5: bis cy1, cy0, cy0 C bring forward carry
+ br r31, $ret5
+$fix6: bis r22, cy0, r22 C bring forward carry
+ br r31, $ret6
+$fix7: bis r23, r22, r23 C bring forward carry
+ br r31, $ret7
+$fix0c: bis cy1, r23, cy1 C bring forward carry
+ br r31, $ret0c
+$fix1c: bis cy0, cy1, cy0 C bring forward carry
+ br r31, $ret1c
+$fix7c: bis r23, r22, r23 C bring forward carry
+ br r31, $ret7c
+
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/aorslsh1_n.asm b/gmp/mpn/alpha/ev6/aorslsh1_n.asm
new file mode 100644
index 0000000000..cb966ce021
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/aorslsh1_n.asm
@@ -0,0 +1,172 @@
+dnl Alpha mpn_addlsh1_n/mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1).
+
+dnl Copyright 2003, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: ?
+C EV5: 7
+C EV6: 4
+
+C TODO
+C * Tune to reach 3.75 c/l on ev6.
+
+define(`rp',`r16') C result limbs are stored through rp
+define(`up',`r17') C operand that is added to or subtracted from
+define(`vp',`r18') C operand that is shifted left one bit
+define(`n', `r19') C limb count
+
+define(`u0', `r8') C limbs loaded from up
+define(`u1', `r1')
+define(`v0', `r4') C limbs loaded from vp
+define(`v1', `r5')
+
+define(`cy0', `r0') C carry between limbs; r0 also receives the return value
+define(`cy1', `r20') C carry between limbs (alternates with cy0)
+define(`cy', `r22') C scratch: carry out of the carry-in step
+define(`rr', `r24') C result limb
+define(`ps', `r25') C partial result before the carry-in is applied
+define(`sl', `r28') C vp limb doubled plus the top bit of the previous vp limb
+
+ifdef(`OPERATION_addlsh1_n',`
+ define(ADDSUB, addq)
+ define(CARRY, `cmpult $1,$2,$3')
+ define(func, mpn_addlsh1_n)
+')
+ifdef(`OPERATION_sublsh1_n',`
+ define(ADDSUB, subq)
+ define(CARRY, `cmpult $2,$1,$3')
+ define(func, mpn_sublsh1_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
+
+ASM_START()
+PROLOGUE(func)
+ and n, 2, cy0 C cy0 used as scratch here: bit 1 of n
+ blbs n, L(bx1) C low bit of n set: n is odd
+L(bx0): ldq v1, 0(vp)
+ ldq u1, 0(up)
+ lda r2, 0(r31) C no bit shifted in below the first limb
+ bne cy0, L(b10) C n mod 4 == 2
+
+L(b00): lda vp, 48(vp) C n mod 4 == 0: bias pointers for entry at lo0
+ lda up, -16(up)
+ lda rp, -8(rp)
+ lda cy0, 0(r31) C no carry in
+ br r31, L(lo0)
+
+L(b10): lda vp, 32(vp) C bias pointers for entry at lo2
+ lda rp, 8(rp)
+ lda cy0, 0(r31) C no carry in
+ br r31, L(lo2)
+
+L(bx1): ldq v0, 0(vp)
+ ldq u0, 0(up)
+ lda r3, 0(r31) C no bit shifted in below the first limb
+ beq cy0, L(b01) C n mod 4 == 1
+
+L(b11): lda vp, 40(vp) C n mod 4 == 3: bias pointers for entry at lo3
+ lda up, -24(up)
+ lda rp, 16(rp)
+ lda cy1, 0(r31) C no carry in
+ br r31, L(lo3)
+
+L(b01): lda n, -4(n)
+ lda cy1, 0(r31) C no carry in
+ ble n, L(end) C n - 4 <= 0 (n is 1 for n > 0): only the tail limb remains
+ lda vp, 24(vp)
+ lda up, -8(up)
+
+ ALIGN(16)
+L(top): addq v0, v0, r6 C vlimb doubled (top bit drops out)
+ ldq v1, -16(vp)
+ addq r6, r3, sl C combined vlimb
+ ldq u1, 16(up)
+ ADDSUB u0, sl, ps C ulimb + (vlimb << 1)
+ cmplt v0, r31, r2 C high v bits
+ ADDSUB ps, cy1, rr C consume carry from previous operation
+ CARRY( ps, u0, cy0) C carry out #2
+ stq rr, 0(rp)
+ CARRY( rr, ps, cy) C carry out #3
+ lda vp, 32(vp) C bookkeeping
+ addq cy, cy0, cy0 C final carry out
+L(lo0): addq v1, v1, r7 C same step as L(top) with the 0/1 register sets swapped
+ ldq v0, -40(vp)
+ addq r7, r2, sl
+ ldq u0, 24(up)
+ ADDSUB u1, sl, ps
+ cmplt v1, r31, r3
+ ADDSUB ps, cy0, rr
+ CARRY( ps, u1, cy1)
+ stq rr, 8(rp)
+ CARRY( rr, ps, cy)
+ lda rp, 32(rp) C bookkeeping
+ addq cy, cy1, cy1
+L(lo3): addq v0, v0, r6
+ ldq v1, -32(vp)
+ addq r6, r3, sl
+ ldq u1, 32(up)
+ ADDSUB u0, sl, ps
+ cmplt v0, r31, r2
+ ADDSUB ps, cy1, rr
+ CARRY( ps, u0, cy0)
+ stq rr, -16(rp)
+ CARRY( rr, ps, cy)
+ lda up, 32(up) C bookkeeping
+ addq cy, cy0, cy0
+L(lo2): addq v1, v1, r7
+ ldq v0, -24(vp)
+ addq r7, r2, sl
+ ldq u0, 8(up)
+ ADDSUB u1, sl, ps
+ cmplt v1, r31, r3
+ ADDSUB ps, cy0, rr
+ CARRY( ps, u1, cy1)
+ stq rr, -8(rp)
+ CARRY( rr, ps, cy)
+ lda n, -4(n) C bookkeeping
+ addq cy, cy1, cy1
+ bgt n, L(top) C four limbs per pass; the last limb is left for L(end)
+
+L(end): addq v0, v0, r6 C final limb: same step as L(top) without the loads
+ addq r6, r3, sl
+ ADDSUB u0, sl, ps
+ cmplt v0, r31, r2 C top bit of the last vp limb
+ ADDSUB ps, cy1, rr
+ CARRY( ps, u0, cy0)
+ stq rr, 0(rp)
+ CARRY( rr, ps, cy)
+ addq cy, cy0, cy0
+ addq cy0, r2, r0 C return value: carry plus the bit shifted out of the top vp limb
+
+ ret r31,(r26),1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/aorsmul_1.asm b/gmp/mpn/alpha/ev6/aorsmul_1.asm
new file mode 100644
index 0000000000..0e68e6e7ad
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/aorsmul_1.asm
@@ -0,0 +1,398 @@
+dnl Alpha ev6 mpn_addmul_1 and mpn_submul_1.
+
+dnl Copyright 2000, 2003-2005, 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 42
+C EV5: 18
+C EV6: 3.5
+
+C INPUT PARAMETERS
+define(`rp', `r16') C limbs are loaded from and stored back through rp
+define(`up', `r17') C source limbs that get multiplied
+define(`n', `r18') C limb count (the code below refers to r18 directly)
+define(`v0', `r19') C multiplier limb
+
+dnl This code was written in cooperation with ev6 pipeline expert Steve Root.
+
+dnl The stores can issue a cycle late so we have paired no-op's to 'catch'
+dnl them, so that further disturbance to the schedule is damped.
+
+dnl We couldn't pair the loads, because the entangled schedule of the carry's
+dnl has to happen on one side {0} of the machine.
+
+dnl This is a great schedule for the d_cache, a poor schedule for the b_cache.
+dnl The lockup on U0 means that any stall can't be recovered from. Consider a
+dnl ldq in L1, say that load gets stalled because it collides with a fill from
+dnl the b_cache. On the next cycle, this load gets priority. It first looks
+dnl at L0, and goes there. The instruction we intended for L0 gets to look at
+dnl L1, which is NOT where we want it. It either stalls 1, because it can't
+dnl go in L0, or goes there, and causes a further instruction to stall.
+
+dnl So for b_cache, we're likely going to want to put one or more cycles back
+dnl into the code! And, of course, put in lds prefetch for the rp[] operand.
+dnl At a place where we have an mt followed by a bookkeeping, put the
+dnl bookkeeping in upper, and the prefetch into lower.
+
+dnl Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd
+dnl like not to have an ldq or an stq to precede a conditional branch in a
+dnl quadpack. The conditional branch moves the retire pointer one cycle
+dnl later.
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `addq')
+ define(`CMPCY', `cmpult $2,$1')
+ define(`func', `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `subq')
+ define(`CMPCY', `cmpult $1,$2')
+ define(`func', `mpn_submul_1')
+')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+PROLOGUE(func)
+ ldq r3, 0(up) C first source limb
+ and r18, 7, r20 C r20 = n mod 8
+ lda r18, -9(r18) C r18 = n - 9
+ cmpeq r20, 1, r21 C
+ beq r21, $L1 C n mod 8 is not 1
+
+$1mod8: ldq r5, 0(rp) C
+ mulq v0, r3, r7 C low product limb
+ umulh v0, r3, r8 C high product limb
+ ADDSUB r5, r7, r23 C rp limb +- low product
+ CMPCY( r5, r23), r20 C carry or borrow out of that step
+ addq r8, r20, r0 C carry limb so far
+ stq r23, 0(rp) C
+ bge r18, $ent1 C n >= 9: more limbs follow
+ ret r31, (r26), 1 C n is 1 (for n > 0): carry limb is in r0
+
+$L1: lda r8, 0(r31) C zero carry reg
+ lda r24, 0(r31) C zero carry reg
+ cmpeq r20, 2, r21 C dispatch on n mod 8, held in r20
+ bne r21, $2mod8 C
+ cmpeq r20, 3, r21 C
+ bne r21, $3mod8 C
+ cmpeq r20, 4, r21 C
+ bne r21, $4mod8 C
+ cmpeq r20, 5, r21 C
+ bne r21, $5mod8 C
+ cmpeq r20, 6, r21 C
+ bne r21, $6mod8 C
+ cmpeq r20, 7, r21 C
+ beq r21, $0mod8 C none of 1..7 matched, so n mod 8 is 0
+
+$7mod8: ldq r5, 0(rp) C n mod 8 is 7: do one limb first and then share the 6 path
+ lda up, 8(up) C
+ mulq v0, r3, r7 C
+ umulh v0, r3, r24 C
+ ADDSUB r5, r7, r23 C
+ CMPCY( r5, r23), r20 C
+ addq r24, r20, r24 C carry limb from the odd limb, kept in r24
+ stq r23, 0(rp) C
+ lda rp, 8(rp) C
+ ldq r3, 0(up) C
+$6mod8: ldq r1, 8(up) C start products for the limbs consumed from $ent6 on
+ mulq v0, r3, r25 C
+ umulh v0, r3, r3 C
+ mulq v0, r1, r28 C
+ ldq r0, 16(up) C
+ ldq r4, 0(rp) C
+ umulh v0, r1, r8 C
+ ldq r1, 24(up) C
+ lda up, 48(up) C L1 bookkeeping
+ mulq v0, r0, r2 C
+ ldq r5, 8(rp) C
+ lda rp, -32(rp) C L1 bookkeeping
+ umulh v0, r0, r6 C
+ ADDSUB r4, r25, r25 C lo + acc
+ mulq v0, r1, r7 C
+ br r31, $ent6 C join the unrolled loop at $ent6
+$ent1: lda up, 8(up) C step past the limb handled at $1mod8
+ lda rp, 8(rp) C
+ lda r8, 0(r0) C carry limb from $1mod8 becomes the running carry
+ ldq r3, 0(up) C
+$0mod8: ldq r1, 8(up) C start products for the limbs consumed from $ent0 on
+ mulq v0, r3, r2 C
+ umulh v0, r3, r6 C
+ mulq v0, r1, r7 C
+ ldq r0, 16(up) C
+ ldq r4, 0(rp) C
+ umulh v0, r1, r24 C
+ ldq r1, 24(up) C
+ mulq v0, r0, r25 C
+ ldq r5, 8(rp) C
+ umulh v0, r0, r3 C
+ ADDSUB r4, r2, r2 C lo + acc
+ mulq v0, r1, r28 C
+ lda rp, -16(rp) C
+ br r31, $ent0 C join the unrolled loop at $ent0
+
+$3mod8: ldq r5, 0(rp) C n mod 8 is 3: do one limb first and then share the 2 path
+ lda up, 8(up) C
+ mulq v0, r3, r7 C
+ umulh v0, r3, r8 C
+ ADDSUB r5, r7, r23 C
+ CMPCY( r5, r23), r20 C
+ addq r8, r20, r24 C carry limb from the odd limb, kept in r24
+ stq r23, 0(rp) C
+ lda rp, 8(rp) C
+ ldq r3, 0(up) C
+$2mod8: ldq r1, 8(up) C
+ mulq v0, r3, r25 C
+ umulh v0, r3, r3 C
+ mulq v0, r1, r28 C
+ ble r18, $n23 C n - 9 <= 0, so n is 2 or 3: finish without the loop
+ ldq r0, 16(up) C
+ ldq r4, 0(rp) C
+ umulh v0, r1, r8 C
+ ldq r1, 24(up) C
+ lda up, 16(up) C L1 bookkeeping
+ mulq v0, r0, r2 C
+ ldq r5, 8(rp) C
+ lda rp, 0(rp) C L1 bookkeeping
+ umulh v0, r0, r6 C
+ ADDSUB r4, r25, r25 C lo + acc
+ mulq v0, r1, r7 C
+ br r31, $ent2 C join the unrolled loop at $ent2
+
+$5mod8: ldq r5, 0(rp) C n mod 8 is 5: do one limb first and then share the 4 path
+ lda up, 8(up) C
+ mulq v0, r3, r7 C
+ umulh v0, r3, r24 C
+ ADDSUB r5, r7, r23 C
+ CMPCY( r5, r23), r20 C
+ addq r24, r20, r8 C carry limb from the odd limb, kept in r8
+ stq r23, 0(rp) C
+ lda rp, 8(rp) C
+ ldq r3, 0(up) C
+$4mod8: ldq r1, 8(up) C
+ mulq v0, r3, r2 C
+ umulh v0, r3, r6 C
+ mulq v0, r1, r7 C
+ ldq r0, 16(up) C
+ ldq r4, 0(rp) C
+ umulh v0, r1, r24 C
+ ldq r1, 24(up) C
+ lda up, 32(up) C L1 bookkeeping
+ mulq v0, r0, r25 C
+ ldq r5, 8(rp) C
+ lda rp, 16(rp) C L1 bookkeeping
+ umulh v0, r0, r3 C
+ ADDSUB r4, r2, r2 C lo + acc
+ mulq v0, r1, r28 C
+ CMPCY( r4, r2), r20 C L0 lo add => carry
+ ADDSUB r2, r8, r22 C U0 hi add => answer
+ ble r18, $Lend C n - 9 <= 0, so n is 4 or 5: skip the loop
+ ALIGN(16)
+$Loop: C the n mod 8 = 4 and 5 feed-in code falls in here
+ bis r31, r31, r31 C U1 mt
+ CMPCY( r2, r22), r21 C L0 hi add => carry
+ addq r6, r20, r6 C U0 hi mul + carry
+ ldq r0, 0(up) C
+
+ bis r31, r31, r31 C U1 mt
+ ADDSUB r5, r7, r7 C L0 lo + acc
+ addq r6, r21, r6 C U0 hi mul + carry
+ ldq r4, 0(rp) C L1
+
+ umulh v0, r1, r8 C U1
+ CMPCY( r5, r7), r20 C L0 lo add => carry
+ ADDSUB r7, r6, r23 C U0 hi add => answer
+ ldq r1, 8(up) C L1
+
+ mulq v0, r0, r2 C U1
+ CMPCY( r7, r23), r21 C L0 hi add => carry
+ addq r24, r20, r24 C U0 hi mul + carry
+ ldq r5, 8(rp) C L1
+
+ umulh v0, r0, r6 C U1
+ ADDSUB r4, r25, r25 C U0 lo + acc
+ stq r22, -16(rp) C L0
+ stq r23, -8(rp) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq v0, r1, r7 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r24, r21, r24 C U0 hi mul + carry
+$ent2: C entry from the n mod 8 = 2 and 3 feed-in code
+ CMPCY( r4, r25), r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda r18, -8(r18) C L1 bookkeeping
+ ADDSUB r25, r24, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ CMPCY( r25, r22), r21 C L0 hi add => carry
+ addq r3, r20, r3 C U0 hi mul + carry
+ ldq r0, 16(up) C L1
+
+ bis r31, r31, r31 C U1 mt
+ ADDSUB r5, r28, r28 C L0 lo + acc
+ addq r3, r21, r3 C U0 hi mul + carry
+ ldq r4, 16(rp) C L1
+
+ umulh v0, r1, r24 C U1
+ CMPCY( r5, r28), r20 C L0 lo add => carry
+ ADDSUB r28, r3, r23 C U0 hi add => answer
+ ldq r1, 24(up) C L1
+
+ mulq v0, r0, r25 C U1
+ CMPCY( r28, r23), r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ ldq r5, 24(rp) C L1
+
+ umulh v0, r0, r3 C U1
+ ADDSUB r4, r2, r2 C U0 lo + acc
+ stq r22, 0(rp) C L0
+ stq r23, 8(rp) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq v0, r1, r28 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r8, r21, r8 C U0 hi mul + carry
+$ent0: C entry from the n mod 8 = 0 and 1 feed-in code
+ CMPCY( r4, r2), r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda up, 64(up) C L1 bookkeeping
+ ADDSUB r2, r8, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ CMPCY( r2, r22), r21 C L0 hi add => carry
+ addq r6, r20, r6 C U0 hi mul + carry
+ ldq r0, -32(up) C L1
+
+ bis r31, r31, r31 C U1 mt
+ ADDSUB r5, r7, r7 C L0 lo + acc
+ addq r6, r21, r6 C U0 hi mul + carry
+ ldq r4, 32(rp) C L1
+
+ umulh v0, r1, r8 C U1
+ CMPCY( r5, r7), r20 C L0 lo add => carry
+ ADDSUB r7, r6, r23 C U0 hi add => answer
+ ldq r1, -24(up) C L1
+
+ mulq v0, r0, r2 C U1
+ CMPCY( r7, r23), r21 C L0 hi add => carry
+ addq r24, r20, r24 C U0 hi mul + carry
+ ldq r5, 40(rp) C L1
+
+ umulh v0, r0, r6 C U1
+ ADDSUB r4, r25, r25 C U0 lo + acc
+ stq r22, 16(rp) C L0
+ stq r23, 24(rp) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq v0, r1, r7 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r24, r21, r24 C U0 hi mul + carry
+$ent6: C entry from the n mod 8 = 6 and 7 feed-in code
+ CMPCY( r4, r25), r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda rp, 64(rp) C L1 bookkeeping
+ ADDSUB r25, r24, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ CMPCY( r25, r22), r21 C L0 hi add => carry
+ addq r3, r20, r3 C U0 hi mul + carry
+ ldq r0, -16(up) C L1
+
+ bis r31, r31, r31 C U1 mt
+ ADDSUB r5, r28, r28 C L0 lo + acc
+ addq r3, r21, r3 C U0 hi mul + carry
+ ldq r4, -16(rp) C L1
+
+ umulh v0, r1, r24 C U1
+ CMPCY( r5, r28), r20 C L0 lo add => carry
+ ADDSUB r28, r3, r23 C U0 hi add => answer
+ ldq r1, -8(up) C L1
+
+ mulq v0, r0, r25 C U1
+ CMPCY( r28, r23), r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ ldq r5, -8(rp) C L1
+
+ umulh v0, r0, r3 C U1
+ ADDSUB r4, r2, r2 C U0 lo + acc
+ stq r22, -32(rp) C L0
+ stq r23, -24(rp) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq v0, r1, r28 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r8, r21, r8 C U0 hi mul + carry
+
+ CMPCY( r4, r2), r20 C L0 lo add => carry
+ ADDSUB r2, r8, r22 C U0 hi add => answer
+ ldl r31, 256(up) C prefetch up[]
+ bgt r18, $Loop C U1 bookkeeping
+
+$Lend: CMPCY( r2, r22), r21 C wind-down: same steps as the loop top, without the up loads
+ addq r6, r20, r6 C
+ ADDSUB r5, r7, r7 C
+ addq r6, r21, r6 C
+ ldq r4, 0(rp) C
+ umulh v0, r1, r8 C
+ CMPCY( r5, r7), r20 C
+ ADDSUB r7, r6, r23 C
+ CMPCY(r7, r23), r21 C
+ addq r24, r20, r24 C
+ ldq r5, 8(rp) C
+ ADDSUB r4, r25, r25 C
+ stq r22, -16(rp) C
+ stq r23, -8(rp) C
+ addq r24, r21, r24 C
+ br L(x) C the shared tail at L(x) stores the last two limbs
+
+ ALIGN(16)
+$n23: ldq r4, 0(rp) C n is 2 or 3: products were started at $2mod8
+ ldq r5, 8(rp) C
+ umulh v0, r1, r8 C
+ ADDSUB r4, r25, r25 C
+L(x): CMPCY( r4, r25), r20 C shared tail, also reached from $Lend
+ ADDSUB r25, r24, r22 C
+ CMPCY( r25, r22), r21 C
+ addq r3, r20, r3 C
+ ADDSUB r5, r28, r28 C
+ addq r3, r21, r3 C
+ CMPCY( r5, r28), r20 C
+ ADDSUB r28, r3, r23 C
+ CMPCY( r28, r23), r21 C
+ addq r8, r20, r8 C
+ stq r22, 0(rp) C
+ stq r23, 8(rp) C
+ addq r8, r21, r0 C final carry limb goes to r0
+ ret r31, (r26), 1 C
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/gmp-mparam.h b/gmp/mpn/alpha/ev6/gmp-mparam.h
new file mode 100644
index 0000000000..e51d6b0d15
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/gmp-mparam.h
@@ -0,0 +1,209 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004, 2005, 2008-2010, 2014 Free
+Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8 /* GMP_LIMB_BITS / 8 */
+
+#define DIVEXACT_BY3_METHOD 0 /* override ../diveby3.asm */
+
+/* 500 MHz 21164 (agnesi.math.su.se) */
+/* FFT tuning limit = 20000000 */
+/* Generated by tuneup.c, 2014-03-14, gcc 3.3 */
+
+#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */
+#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1P_METHOD 2
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 21
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7
+#define USE_PREINV_DIVREM_1 1 /* preinv always */
+#define DIV_QR_1N_PI1_METHOD 2
+#define DIV_QR_1_NORM_THRESHOLD 5
+#define DIV_QR_1_UNNORM_THRESHOLD 1
+#define DIV_QR_2_PI2_THRESHOLD 8
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD 20
+
+#define MUL_TOOM22_THRESHOLD 32 /* tuneup output; presumably an operand size in limbs -- confirm */
+#define MUL_TOOM33_THRESHOLD 117
+#define MUL_TOOM44_THRESHOLD 124
+#define MUL_TOOM6H_THRESHOLD 230
+#define MUL_TOOM8H_THRESHOLD 357
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 88
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 105
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 136
+
+#define SQR_BASECASE_THRESHOLD 0 /* always */
+#define SQR_TOOM2_THRESHOLD 59
+#define SQR_TOOM3_THRESHOLD 123
+#define SQR_TOOM4_THRESHOLD 163
+#define SQR_TOOM6_THRESHOLD 333
+#define SQR_TOOM8_THRESHOLD 0 /* always */
+
+#define MULMID_TOOM42_THRESHOLD 52
+
+#define MULMOD_BNM1_THRESHOLD 19
+#define SQRMOD_BNM1_THRESHOLD 5
+
+#define MUL_FFT_MODF_THRESHOLD 468 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 468, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 19, 7}, { 10, 6}, \
+ { 24, 7}, { 13, 6}, { 27, 7}, { 14, 6}, \
+ { 29, 7}, { 17, 6}, { 35, 7}, { 29, 8}, \
+ { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \
+ { 19, 7}, { 39, 8}, { 29, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 51, 9}, { 27, 8}, { 55, 9}, { 35, 8}, \
+ { 71, 9}, { 39,10}, { 23, 9}, { 55,10}, \
+ { 31, 9}, { 67,10}, { 39, 9}, { 79,10}, \
+ { 47, 9}, { 95,10}, { 55,11}, { 31,10}, \
+ { 79,11}, { 47,10}, { 103,12}, { 31,11}, \
+ { 63,10}, { 135,11}, { 79,10}, { 167,11}, \
+ { 95,10}, { 199,11}, { 111,12}, { 63,11}, \
+ { 143,10}, { 287, 9}, { 575,11}, { 159,10}, \
+ { 319,12}, { 95,11}, { 191,10}, { 383,11}, \
+ { 207,13}, { 63,12}, { 127,11}, { 255,10}, \
+ { 511,11}, { 271,10}, { 543,11}, { 287,10}, \
+ { 575,12}, { 159,11}, { 319,10}, { 639,11}, \
+ { 335,10}, { 671,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,12}, \
+ { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
+ { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \
+ { 607,12}, { 319,11}, { 671,12}, { 351,11}, \
+ { 703,13}, { 191,12}, { 383,11}, { 767,12}, \
+ { 415,11}, { 831,12}, { 447,14}, { 127,13}, \
+ { 255,12}, { 575,11}, { 1151,12}, { 607,13}, \
+ { 319,12}, { 735,13}, { 383,12}, { 767,11}, \
+ { 1535,12}, { 831,13}, { 447,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \
+ { 1215,13}, { 639,12}, { 1343,13}, { 703,12}, \
+ { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \
+ { 831,12}, { 1663,13}, { 959,15}, { 255,14}, \
+ { 511,13}, { 1215,14}, { 639,13}, { 1407,14}, \
+ { 767,13}, { 1663,14}, { 895,13}, { 1855,15}, \
+ { 511,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 151 /* number of pairs in MUL_FFT_TABLE3 above */
+#define MUL_FFT_THRESHOLD 5760
+
+#define SQR_FFT_MODF_THRESHOLD 412 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 412, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \
+ { 27, 7}, { 14, 6}, { 29, 7}, { 28, 8}, \
+ { 15, 7}, { 31, 8}, { 17, 7}, { 36, 8}, \
+ { 19, 7}, { 39, 8}, { 29, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
+ { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \
+ { 95,10}, { 55,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255,11}, { 79,10}, { 159, 9}, \
+ { 319,10}, { 167,11}, { 95,10}, { 191, 9}, \
+ { 383,11}, { 111,12}, { 63,11}, { 127,10}, \
+ { 271,11}, { 143,10}, { 287, 9}, { 575,10}, \
+ { 303,11}, { 159,10}, { 319,12}, { 95,11}, \
+ { 191,10}, { 383,11}, { 207,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
+ { 543,11}, { 287,10}, { 575,11}, { 303,12}, \
+ { 159,11}, { 319,10}, { 639,11}, { 335,10}, \
+ { 671,11}, { 351,10}, { 703,11}, { 367,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,12}, \
+ { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
+ { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \
+ { 607,12}, { 319,11}, { 639,10}, { 1279,11}, \
+ { 671,12}, { 351,11}, { 703,13}, { 191,12}, \
+ { 383,11}, { 767,12}, { 415,11}, { 831,12}, \
+ { 447,11}, { 895,12}, { 479,14}, { 127,13}, \
+ { 255,12}, { 575,11}, { 1151,12}, { 607,13}, \
+ { 319,12}, { 703,11}, { 1407,12}, { 735,13}, \
+ { 383,12}, { 831,13}, { 447,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \
+ { 1151,13}, { 639,12}, { 1279,13}, { 703,12}, \
+ { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \
+ { 831,12}, { 1663,13}, { 959,15}, { 255,14}, \
+ { 511,13}, { 1215,14}, { 639,13}, { 1407,14}, \
+ { 767,13}, { 1663,14}, { 895,13}, { 1791,15}, \
+ { 511,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 159 /* number of pairs in SQR_FFT_TABLE3 above */
+#define SQR_FFT_THRESHOLD 5056
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 100
+#define MULLO_MUL_N_THRESHOLD 11355
+
+#define DC_DIV_QR_THRESHOLD 124
+#define DC_DIVAPPR_Q_THRESHOLD 438
+#define DC_BDIV_QR_THRESHOLD 153
+#define DC_BDIV_Q_THRESHOLD 318
+
+#define INV_MULMOD_BNM1_THRESHOLD 62
+#define INV_NEWTON_THRESHOLD 384
+#define INV_APPR_THRESHOLD 402
+
+#define BINV_NEWTON_THRESHOLD 381
+#define REDC_1_TO_REDC_N_THRESHOLD 110
+
+#define MU_DIV_QR_THRESHOLD 1752
+#define MU_DIVAPPR_Q_THRESHOLD 1895
+#define MUPI_DIV_QR_THRESHOLD 174
+#define MU_BDIV_QR_THRESHOLD 1387
+#define MU_BDIV_Q_THRESHOLD 1787
+
+#define POWM_SEC_TABLE 1,13,66,82,579 /* expands to a comma-separated list, not a single value */
+
+#define MATRIX22_STRASSEN_THRESHOLD 15
+#define HGCD_THRESHOLD 318
+#define HGCD_APPR_THRESHOLD 363
+#define HGCD_REDUCE_THRESHOLD 2384
+#define GCD_DC_THRESHOLD 2504
+#define GCDEXT_DC_THRESHOLD 671
+#define JACOBI_BASE_METHOD 3
+
+#define GET_STR_DC_THRESHOLD 14
+#define GET_STR_PRECOMPUTE_THRESHOLD 25
+#define SET_STR_DC_THRESHOLD 3754
+#define SET_STR_PRECOMPUTE_THRESHOLD 8097
+
+#define FAC_DSC_THRESHOLD 951
+#define FAC_ODD_THRESHOLD 24
diff --git a/gmp/mpn/alpha/ev6/mod_1_4.asm b/gmp/mpn/alpha/ev6/mod_1_4.asm
new file mode 100644
index 0000000000..836de07c0f
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/mod_1_4.asm
@@ -0,0 +1,337 @@
+dnl Alpha mpn_mod_1s_4p
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2009, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C * Optimise. 2.75 c/l should be possible.
+C * Write a proper mpn_mod_1s_4p_cps. The code below was compiler generated.
+C * Optimise feed-in code, starting the sw pipeline in switch code.
+C * Shorten software pipeline. The mul instructions are scheduled too far
+C from their users. Fixing this will allow us to use fewer registers.
+C * If we cannot reduce register usage, perhaps write a small-n basecase.
+C * Does this work for PIC?
+
+C cycles/limb
+C EV4: ?
+C EV5: 23
+C EV6: 3
+
+define(`ap', `r16')
+define(`n', `r17')
+define(`pl', `r24')
+define(`ph', `r25')
+define(`rl', `r6')
+define(`rh', `r7')
+define(`B1modb', `r1')
+define(`B2modb', `r2')
+define(`B3modb', `r3')
+define(`B4modb', `r4')
+define(`B5modb', `r5')
+
+ASM_START()
+C mpn_mod_1s_4p: fold the n-limb vector at ap into a two-limb value rh:rl,
+C four limbs per main-loop iteration, using the constants B1modb..B5modb,
+C then reduce rh:rl to a single limb with the inverse bi.
+C
+C Register use, as read from the code below:
+C   r16 (ap)  limb vector; moved to the vector end and walked downwards
+C   r17 (n)   limb count
+C   r18       divisor b, used in the final "qh * b" reduction
+C   r19       table of precomputed values: bi at 0, cnt at 8 and
+C             B1modb..B5modb at 16..48
+C   rh:rl     running two-limb partial remainder
+C   r0        result, rl >> cnt
+C NOTE(review): rh:rl is shifted left by cnt before the reduction by r18, so
+C r18 presumably holds the divisor already shifted by cnt -- confirm against
+C the callers.
+PROLOGUE(mpn_mod_1s_4p)
+C Allocate a 64-byte frame and save r9-r13 (all overwritten below),
+C interleaved with loading the five constants from the table at r19.
+ lda r30, -64(r30)
+ stq r9, 8(r30)
+ ldq B1modb, 16(r19)
+ stq r10, 16(r30)
+ ldq B2modb, 24(r19)
+ stq r11, 24(r30)
+ ldq B3modb, 32(r19)
+ stq r12, 32(r30)
+ ldq B4modb, 40(r19)
+ stq r13, 40(r30)
+ ldq B5modb, 48(r19)
+ s8addq n, ap, ap C point ap at vector end
+
+C Dispatch on n mod 4: 0 -> L(b0), 1 -> L(b1), 2 -> L(b2), 3 falls into
+C L(b3).  n is also decremented by 4 here, so that n > 0 at L(com) exactly
+C when limbs remain after the feed-in case has consumed its share.
+ and n, 3, r0
+ lda n, -4(n)
+ beq r0, L(b0)
+ lda r6, -2(r0)
+ blt r6, L(b1)
+ beq r6, L(b2)
+
+C n mod 4 == 3.  With ap[-k] the k-th limb from the vector end:
+C rh:rl = ap[-3] + ap[-2]*B1modb + ap[-1]*B2modb.
+C ap is left pointing at the lowest of the next four limbs.
+L(b3): ldq r21, -16(ap)
+ ldq r22, -8(ap)
+ ldq r20, -24(ap)
+ mulq r21, B1modb, r8
+ umulh r21, B1modb, r12
+ mulq r22, B2modb, r9
+ umulh r22, B2modb, r13
+ addq r8, r20, pl
+ cmpult pl, r8, r0
+ addq r0, r12, ph
+ addq r9, pl, rl
+ cmpult rl, r9, r0
+ addq r13, ph, ph
+ addq r0, ph, rh
+ lda ap, -56(ap)
+ br L(com)
+
+C n mod 4 == 0:
+C rh:rl = ap[-4] + ap[-3]*B1modb + ap[-2]*B2modb + ap[-1]*B3modb.
+L(b0): ldq r21, -24(ap)
+ ldq r22, -16(ap)
+ ldq r23, -8(ap)
+ ldq r20, -32(ap)
+ mulq r21, B1modb, r8
+ umulh r21, B1modb, r12
+ mulq r22, B2modb, r9
+ umulh r22, B2modb, r13
+ mulq r23, B3modb, r10
+ umulh r23, B3modb, r27
+ addq r8, r20, pl
+ cmpult pl, r8, r0
+ addq r0, r12, ph
+ addq r9, pl, pl
+ cmpult pl, r9, r0
+ addq r13, ph, ph
+ addq r0, ph, ph
+ addq r10, pl, rl
+ cmpult rl, r10, r0
+ addq r27, ph, ph
+ addq r0, ph, rh
+ lda ap, -64(ap)
+ br L(com)
+
+C n mod 4 == 1: rh = 0, rl = ap[-1].
+L(b1): bis r31, r31, rh
+ ldq rl, -8(ap)
+ lda ap, -40(ap)
+ br L(com)
+
+C n mod 4 == 2: rh = ap[-1], rl = ap[-2].
+L(b2): ldq rh, -8(ap)
+ ldq rl, -16(ap)
+ lda ap, -48(ap)
+
+C Common feed-in.  If nothing is left, go straight to the final fold.
+C Otherwise load the next four limbs (r20 lowest .. r23 highest) and start
+C the products consumed by the first loop iteration (or by L(ed2)).
+L(com): ble n, L(ed3)
+ ldq r21, 8(ap)
+ ldq r22, 16(ap)
+ ldq r23, 24(ap)
+ ldq r20, 0(ap)
+ lda n, -4(n)
+ lda ap, -32(ap)
+ mulq r21, B1modb, r8
+ umulh r21, B1modb, r12
+ mulq r22, B2modb, r9
+ umulh r22, B2modb, r13
+ mulq r23, B3modb, r10
+ umulh r23, B3modb, r27
+ mulq rl, B4modb, r11
+ umulh rl, B4modb, r28
+ ble n, L(ed2)
+
+C Main loop.  Each iteration forms, for the current group of four limbs,
+C   rh:rl = r20 + r21*B1modb + r22*B2modb + r23*B3modb + rl*B4modb + rh*B5modb
+C in ph:pl plus the rh*B5modb product, while loading the next four limbs and
+C starting their multiplies.
+ ALIGN(16)
+L(top): ldq r21, 8(ap)
+ mulq rh, B5modb, rl
+ addq r8, r20, pl
+ ldq r22, 16(ap)
+ cmpult pl, r8, r0
+ umulh rh, B5modb, rh
+ ldq r23, 24(ap)
+ addq r0, r12, ph
+ addq r9, pl, pl
+ mulq r21, B1modb, r8
+ cmpult pl, r9, r0
+ addq r13, ph, ph
+ umulh r21, B1modb, r12
+ lda ap, -32(ap)
+ addq r0, ph, ph
+ addq r10, pl, pl
+ mulq r22, B2modb, r9
+ cmpult pl, r10, r0
+ addq r27, ph, ph
+ addq r11, pl, pl
+ umulh r22, B2modb, r13
+ addq r0, ph, ph
+ cmpult pl, r11, r0
+ addq r28, ph, ph
+ mulq r23, B3modb, r10
+C Offset 32 because ap was already stepped down by 32 above.
+ ldq r20, 32(ap)
+ addq pl, rl, rl
+ umulh r23, B3modb, r27
+ addq r0, ph, ph
+ cmpult rl, pl, r0
+ mulq rl, B4modb, r11
+ addq ph, rh, rh
+ umulh rl, B4modb, r28
+ addq r0, rh, rh
+ lda n, -4(n)
+ bgt n, L(top)
+
+C Wind-down: the same accumulation as the loop body for the last group,
+C with no further loads or multiplies for a following group.
+L(ed2): mulq rh, B5modb, rl
+ addq r8, r20, pl
+ umulh rh, B5modb, rh
+ cmpult pl, r8, r0
+ addq r0, r12, ph
+ addq r9, pl, pl
+ cmpult pl, r9, r0
+ addq r13, ph, ph
+ addq r0, ph, ph
+ addq r10, pl, pl
+ cmpult pl, r10, r0
+ addq r27, ph, ph
+ addq r11, pl, pl
+ addq r0, ph, ph
+ cmpult pl, r11, r0
+ addq r28, ph, ph
+ addq pl, rl, rl
+ addq r0, ph, ph
+ cmpult rl, pl, r0
+ addq ph, rh, rh
+ addq r0, rh, rh
+
+C Fold the high limb once more: rh:rl = rl + rh*B1modb.
+L(ed3): mulq rh, B1modb, r8
+ umulh rh, B1modb, rh
+ addq r8, rl, rl
+ cmpult rl, r8, r0
+ addq r0, rh, rh
+
+C Shift rh:rl left by cnt.  r25 = -cnt; the shift instructions use only the
+C low 6 bits of the count, so the srl below shifts right by 64-cnt.
+ ldq r24, 8(r19) C cnt
+ sll rh, r24, rh
+ subq r31, r24, r25
+ srl rl, r25, r2
+ sll rl, r24, rl
+ or r2, rh, rh
+
+C Reduce rh:rl modulo b using the inverse bi:
+C   qh:ql = rh*bi + (rh+1):rl,  rl -= qh*b,
+C then add b back if rl > ql and subtract b if rl >= b (both done
+C branch-free with a 0/-1 mask).
+ ldq r23, 0(r19) C bi
+ mulq rh, r23, r8
+ umulh rh, r23, r9
+ addq rh, 1, r7
+ addq r8, rl, r8 C ql
+ cmpult r8, rl, r0
+ addq r9, r7, r9
+ addq r0, r9, r9 C qh
+ mulq r9, r18, r21 C qh * b
+ subq rl, r21, rl
+ cmpult r8, rl, r0 C rl > ql
+ negq r0, r0
+ and r0, r18, r0
+ addq rl, r0, rl
+ cmpule r18, rl, r0 C rl >= b
+ negq r0, r0
+ and r0, r18, r0
+ subq rl, r0, rl
+
+C Shift the remainder back right by cnt; this is the value left in r0.
+ srl rl, r24, r0
+
+C Restore the saved registers, release the frame and return.
+ ldq r9, 8(r30)
+ ldq r10, 16(r30)
+ ldq r11, 24(r30)
+ ldq r12, 32(r30)
+ ldq r13, 40(r30)
+ lda r30, 64(r30)
+ ret r31, (r26), 1
+EPILOGUE()
+
+C mpn_mod_1s_4p_cps: fill the table at r16 with the values mpn_mod_1s_4p
+C reads from its r19 argument: the mpn_invert_limb result at 0, the shift
+C count cnt at 8, and five reduction constants, each shifted right by cnt,
+C at 16, 24, 32, 40 and 48.  r17 is the value the constants are derived
+C from (presumably the unshifted divisor b -- TODO confirm).
+C Compiler-generated code (see the TODO list at the top of the file), so the
+C instruction order below carries no particular meaning.
+PROLOGUE(mpn_mod_1s_4p_cps,gp)
+C Frame: save the return address and r9-r11.  The table pointer is moved
+C to r11 because r16 is needed for the call argument below.
+ lda r30, -32(r30)
+ stq r26, 0(r30)
+ stq r9, 8(r30)
+ stq r10, 16(r30)
+ stq r11, 24(r30)
+ mov r16, r11
+C Compute the shift count r10 from r17 with two byte lookups in __clz_tab
+C (cmpbge finds the zero bytes of r17 to select the first index).
+C Presumably r10 ends up as the number of leading zero bits of r17 --
+C TODO confirm against the definition of __clz_tab.
+ LEA( r4, __clz_tab)
+ lda r10, 65(r31)
+ cmpbge r31, r17, r1
+ srl r1, 1, r1
+ xor r1, 127, r1
+ addq r1, r4, r1
+ ldq_u r2, 0(r1)
+ extbl r2, r1, r2
+ s8subq r2, 7, r2
+ srl r17, r2, r3
+ subq r10, r2, r10
+ addq r3, r4, r3
+ ldq_u r1, 0(r3)
+ extbl r1, r3, r1
+ subq r10, r1, r10
+C r9 = r17 << cnt; r0 = mpn_invert_limb(r9).
+ sll r17, r10, r9
+ mov r9, r16
+ jsr r26, mpn_invert_limb
+C NOTE(review): the ldah here and the "lda r29, 0(r29)" a few lines below
+C look like the usual gp reload after a call, written with literal zero
+C displacements; see the PIC question in the file's TODO list.
+ ldah r29, 0(r26)
+C First constant: r2 = -r9 * ((1 << cnt) | (r0 >> (64-cnt))); the shift by
+C -cnt uses only the low 6 bits of the count.  r0 and cnt are stored at
+C offsets 0 and 8, the constant >> cnt at offset 16.
+ subq r31, r10, r2
+ lda r1, 1(r31)
+ sll r1, r10, r1
+ subq r31, r9, r3
+ srl r0, r2, r2
+ ldq r26, 0(r30)
+ bis r2, r1, r2
+ lda r29, 0(r29)
+ stq r0, 0(r11)
+ stq r10, 8(r11)
+ mulq r2, r3, r2
+ srl r2, r10, r3
+C Four identical steps follow.  From the previous constant x each computes
+C   q = umulh(x, r0),  r = (~q - x) * r9,
+C adds r9 back when r > mulq(x, r0) (the cmpule/cmoveq pair), and stores
+C r >> cnt at the next table slot (24, 32, 40, 48).
+ umulh r2, r0, r1
+ stq r3, 16(r11)
+ mulq r2, r0, r3
+ ornot r31, r1, r1
+ subq r1, r2, r1
+ mulq r1, r9, r1
+ addq r1, r9, r2
+ cmpule r1, r3, r3
+ cmoveq r3, r2, r1
+ srl r1, r10, r3
+ umulh r1, r0, r2
+ stq r3, 24(r11)
+ mulq r1, r0, r3
+ ornot r31, r2, r2
+ subq r2, r1, r2
+ mulq r2, r9, r2
+ addq r2, r9, r1
+ cmpule r2, r3, r3
+ cmoveq r3, r1, r2
+ srl r2, r10, r1
+ umulh r2, r0, r3
+ stq r1, 32(r11)
+ mulq r2, r0, r1
+ ornot r31, r3, r3
+ subq r3, r2, r3
+ mulq r3, r9, r3
+ addq r3, r9, r2
+ cmpule r3, r1, r1
+ cmoveq r1, r2, r3
+ srl r3, r10, r2
+ umulh r3, r0, r1
+ stq r2, 40(r11)
+C Last step: r0 and r9 are overwritten here since neither is needed again.
+ mulq r3, r0, r0
+ ornot r31, r1, r1
+ subq r1, r3, r1
+ mulq r1, r9, r1
+ addq r1, r9, r9
+ cmpule r1, r0, r0
+ cmoveq r0, r9, r1
+C Restore r9-r11 (interleaved with the final store), release the frame
+C and return.
+ ldq r9, 8(r30)
+ srl r1, r10, r1
+ ldq r10, 16(r30)
+ stq r1, 48(r11)
+ ldq r11, 24(r30)
+ lda r30, 32(r30)
+ ret r31, (r26), 1
+EPILOGUE()
diff --git a/gmp/mpn/alpha/ev6/mul_1.asm b/gmp/mpn/alpha/ev6/mul_1.asm
new file mode 100644
index 0000000000..8ee19cd429
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/mul_1.asm
@@ -0,0 +1,496 @@
+dnl Alpha ev6 mpn_mul_1 -- Multiply a limb vector with a limb and store the
+dnl result in a second limb vector.
+
+dnl Copyright 2000, 2001, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr r16
+C s1_ptr r17
+C size r18
+C s2_limb r19
+
+C This code runs at 2.25 cycles/limb on EV6.
+
+C This code was written in close cooperation with ev6 pipeline expert
+C Steve Root. Any errors are tege's fault, though.
+
+C Code structure:
+
+C code for n < 8
+C code for n >= 8 code for (n mod 8)
+C code for (n div 8) feed-in code
+C 8-way unrolled loop
+C wind-down code
+
+C Some notes about unrolled loop:
+C
+C r1-r8 multiplies and workup
+C r21-r28 multiplies and workup
+C r9-r12 loads
+C r0 -1
+C r20,r29,r13-r15 scramble
+C
+C We're doing 7 of the 8 carry propagations with a br fixup code and 1 with a
+C put-the-carry-into-hi. The idea is that these branches are very rarely
+C taken, and since a non-taken branch consumes no resources, that is better
+C than an addq.
+C
+C Software pipeline: a load in cycle #09, feeds a mul in cycle #16, feeds an
+C add NEXT cycle #09 which feeds a store in NEXT cycle #02
+
+C The code could use some further work:
+C 1. Speed up really small multiplies. The default alpha/mul_1.asm code is
+C faster than this for size < 3.
+C 2. Improve feed-in code, perhaps with the equivalent of switch(n%8) unless
+C that is too costly.
+C 3. Consider using 4-way unrolling, even if that runs slower.
+C 4. Reduce register usage. In particular, try to avoid using r29.
+
+ASM_START()
+C mpn_mul_1: store {s1_ptr,size} * s2_limb at res_ptr and leave the
+C carry-out limb in r0.  Sizes below 8 use the simple loop at $Lsmall.
+C Sizes of 8 or more go to $Large, which handles the (size mod 8) leading
+C limbs with a second simple loop and the rest in groups of 8 with the
+C software-pipelined code.
+PROLOGUE(mpn_mul_1)
+ cmpult r18, 8, r1
+ beq r1, $Large
+C Simple loop, one limb per iteration.  No stack frame is used here.
+$Lsmall:
+ ldq r2,0(r17) C r2 = s1_limb
+ lda r18,-1(r18) C size--
+ mulq r2,r19,r3 C r3 = prod_low
+ bic r31,r31,r4 C clear cy_limb
+ umulh r2,r19,r0 C r0 = prod_high
+ beq r18,$Le1a C jump if size was == 1
+ ldq r2,8(r17) C r2 = s1_limb
+ lda r18,-1(r18) C size--
+ stq r3,0(r16)
+ beq r18,$Le2a C jump if size was == 2
+ ALIGN(8)
+$Lopa: mulq r2,r19,r3 C r3 = prod_low
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ lda r18,-1(r18) C size--
+ umulh r2,r19,r4 C r4 = cy_limb
+ ldq r2,16(r17) C r2 = s1_limb
+ lda r17,8(r17) C s1_ptr++
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ stq r3,8(r16)
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ lda r16,8(r16) C res_ptr++
+ bne r18,$Lopa
+
+$Le2a: mulq r2,r19,r3 C r3 = prod_low
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ umulh r2,r19,r4 C r4 = cy_limb
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ stq r3,8(r16)
+ addq r4,r0,r0 C cy_limb = prod_high + cy
+ ret r31,(r26),1
+$Le1a: stq r3,0(r16)
+ ret r31,(r26),1
+
+C size >= 8.  Allocate a frame and save r26, r9-r15 and r29, all of which
+C are overwritten by the code below.
+$Large:
+ lda r30, -224(r30)
+ stq r26, 0(r30)
+ stq r9, 8(r30)
+ stq r10, 16(r30)
+ stq r11, 24(r30)
+ stq r12, 32(r30)
+ stq r13, 40(r30)
+ stq r14, 48(r30)
+ stq r15, 56(r30)
+ stq r29, 64(r30)
+
+ and r18, 7, r20 C count for the first loop, 0-7
+ srl r18, 3, r18 C count for unrolled loop
+ bis r31, r31, r21
+ beq r20, $L_8_or_more C skip first loop
+
+C Simple loop over the (size mod 8) leading limbs, counted in r20.  Its
+C carry-out limb is left in r21, where the unrolled code picks it up.
+$L_9_or_more:
+ ldq r2,0(r17) C r2 = s1_limb
+ lda r17,8(r17) C s1_ptr++
+ lda r20,-1(r20) C size--
+ mulq r2,r19,r3 C r3 = prod_low
+ umulh r2,r19,r21 C r21 = prod_high
+ beq r20,$Le1b C jump if size was == 1
+ bis r31, r31, r0 C FIXME: shouldn't need this
+ ldq r2,0(r17) C r2 = s1_limb
+ lda r17,8(r17) C s1_ptr++
+ lda r20,-1(r20) C size--
+ stq r3,0(r16)
+ lda r16,8(r16) C res_ptr++
+ beq r20,$Le2b C jump if size was == 2
+ ALIGN(8)
+$Lopb: mulq r2,r19,r3 C r3 = prod_low
+ addq r21,r0,r0 C cy_limb = cy_limb + 'cy'
+ lda r20,-1(r20) C size--
+ umulh r2,r19,r21 C r21 = prod_high
+ ldq r2,0(r17) C r2 = s1_limb
+ lda r17,8(r17) C s1_ptr++
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ stq r3,0(r16)
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ lda r16,8(r16) C res_ptr++
+ bne r20,$Lopb
+
+$Le2b: mulq r2,r19,r3 C r3 = prod_low
+ addq r21,r0,r0 C cy_limb = cy_limb + 'cy'
+ umulh r2,r19,r21 C r21 = prod_high
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ stq r3,0(r16)
+ lda r16,8(r16) C res_ptr++
+ addq r21,r0,r21 C cy_limb = prod_high + cy
+ br r31, $L_8_or_more
+$Le1b: stq r3,0(r16)
+ lda r16,8(r16) C res_ptr++
+
+C Feed-in for the unrolled code: r0 = -1 (used by the loop counter trick in
+C $Loop), r18 = remaining groups of 8 minus one, first loads and multiplies
+C started.  r21 holds the incoming carry limb.
+$L_8_or_more:
+ lda r0, -1(r31) C put -1 in r0, for tricky loop control
+ lda r17, -32(r17) C L1 bookkeeping
+ lda r18, -1(r18) C decrement count
+
+ ldq r9, 32(r17) C L1
+ ldq r10, 40(r17) C L1
+ mulq r9, r19, r22 C U1 #07
+ ldq r11, 48(r17) C L1
+ umulh r9, r19, r23 C U1 #08
+ ldq r12, 56(r17) C L1
+ mulq r10, r19, r24 C U1 #09
+ ldq r9, 64(r17) C L1
+
+ lda r17, 64(r17) C L1 bookkeeping
+
+ umulh r10, r19, r25 C U1 #11
+ mulq r11, r19, r26 C U1 #12
+ umulh r11, r19, r27 C U1 #13
+ mulq r12, r19, r28 C U1 #14
+ ldq r10, 8(r17) C L1
+ umulh r12, r19, r1 C U1 #15
+ ldq r11, 16(r17) C L1
+ mulq r9, r19, r2 C U1 #16
+ ldq r12, 24(r17) C L1
+ umulh r9, r19, r3 C U1 #17
+ addq r21, r22, r13 C L1 mov
+ mulq r10, r19, r4 C U1 #18
+ addq r23, r24, r22 C L0 sum 2 mul's
+ cmpult r13, r21, r14 C L1 carry from sum
+ bgt r18, $L_16_or_more
+
+C Exactly one group of 8 left: finish its multiplies here and join the
+C wind-down code at $ret0c.
+ cmpult r22, r24, r24 C U0 carry from sum
+ umulh r10, r19, r5 C U1 #02
+ addq r25, r26, r23 C U0 sum 2 mul's
+ mulq r11, r19, r6 C U1 #03
+ cmpult r23, r26, r25 C U0 carry from sum
+ umulh r11, r19, r7 C U1 #04
+ addq r27, r28, r28 C U0 sum 2 mul's
+ mulq r12, r19, r8 C U1 #05
+ cmpult r28, r27, r15 C L0 carry from sum
+ lda r16, 32(r16) C L1 bookkeeping
+ addq r13, r31, r13 C U0 start carry cascade
+ umulh r12, r19, r21 C U1 #06
+ br r31, $ret0c
+
+C Two or more groups of 8 left.  This is a first pass through the pipeline
+C (labels with a 'w' suffix): unlike $Loop it has no previous group, so it
+C stores nothing at 0..24(r16) and adds no incoming carry to r13.
+$L_16_or_more:
+C ---------------------------------------------------------------
+ subq r18,1,r18
+ cmpult r22, r24, r24 C U0 carry from sum
+ ldq r9, 32(r17) C L1
+
+ umulh r10, r19, r5 C U1 #02
+ addq r25, r26, r23 C U0 sum 2 mul's
+ mulq r11, r19, r6 C U1 #03
+ cmpult r23, r26, r25 C U0 carry from sum
+ umulh r11, r19, r7 C U1 #04
+ addq r27, r28, r28 C U0 sum 2 mul's
+ mulq r12, r19, r8 C U1 #05
+ cmpult r28, r27, r15 C L0 carry from sum
+ lda r16, 32(r16) C L1 bookkeeping
+ addq r13, r31, r13 C U0 start carry cascade
+
+ umulh r12, r19, r21 C U1 #06
+C beq r13, $fix0w C U0
+$ret0w: addq r22, r14, r26 C L0
+ ldq r10, 40(r17) C L1
+
+ mulq r9, r19, r22 C U1 #07
+ beq r26, $fix1w C U0
+$ret1w: addq r23, r24, r27 C L0
+ ldq r11, 48(r17) C L1
+
+ umulh r9, r19, r23 C U1 #08
+ beq r27, $fix2w C U0
+$ret2w: addq r28, r25, r28 C L0
+ ldq r12, 56(r17) C L1
+
+ mulq r10, r19, r24 C U1 #09
+ beq r28, $fix3w C U0
+$ret3w: addq r1, r2, r20 C L0 sum 2 mul's
+ ldq r9, 64(r17) C L1
+
+ addq r3, r4, r2 C L0 #10 2 mul's
+ lda r17, 64(r17) C L1 bookkeeping
+ cmpult r20, r1, r29 C U0 carry from sum
+
+ umulh r10, r19, r25 C U1 #11
+ cmpult r2, r4, r4 C U0 carry from sum
+ stq r13, -32(r16) C L0
+ stq r26, -24(r16) C L1
+
+ mulq r11, r19, r26 C U1 #12
+ addq r5, r6, r14 C U0 sum 2 mul's
+ stq r27, -16(r16) C L0
+ stq r28, -8(r16) C L1
+
+ umulh r11, r19, r27 C U1 #13
+ cmpult r14, r6, r3 C U0 carry from sum
+C could do cross-jumping here:
+C bra $L_middle_of_unrolled_loop
+ mulq r12, r19, r28 C U1 #14
+ addq r7, r3, r5 C L0 eat carry
+ addq r20, r15, r20 C U0 carry cascade
+ ldq r10, 8(r17) C L1
+
+C Note that the $fix4..$fix6 stubs used below return to $ret4..$ret6 inside
+C $Loop, not to $ret4w..$ret6w.  The instructions from those labels up to
+C the closing branch are the same in both places, and the closing branches
+C (ble to $Lend here, bgt to $Loop there) send control to the same targets.
+ umulh r12, r19, r1 C U1 #15
+ beq r20, $fix4 C U0
+$ret4w: addq r2, r29, r6 C L0
+ ldq r11, 16(r17) C L1
+
+ mulq r9, r19, r2 C U1 #16
+ beq r6, $fix5 C U0
+$ret5w: addq r14, r4, r7 C L0
+ ldq r12, 24(r17) C L1
+
+ umulh r9, r19, r3 C U1 #17
+ beq r7, $fix6 C U0
+$ret6w: addq r5, r8, r8 C L0 sum 2
+ addq r21, r22, r13 C L1 sum 2 mul's
+
+ mulq r10, r19, r4 C U1 #18
+ addq r23, r24, r22 C L0 sum 2 mul's
+ cmpult r13, r21, r14 C L1 carry from sum
+ ble r18, $Lend C U0
+C ---------------------------------------------------------------
+C Main loop, 8 limbs per iteration.  r18 > 0 on every entry, and r0 = -1,
+C so the high half of r0 * r18 computed by the first umulh is r18 - 1.
+ ALIGN(16)
+$Loop:
+ umulh r0, r18, r18 C U1 #01 decrement r18!
+ cmpult r8, r5, r29 C L0 carry from last bunch
+ cmpult r22, r24, r24 C U0 carry from sum
+ ldq r9, 32(r17) C L1
+
+ umulh r10, r19, r5 C U1 #02
+ addq r25, r26, r23 C U0 sum 2 mul's
+ stq r20, 0(r16) C L0
+ stq r6, 8(r16) C L1
+
+ mulq r11, r19, r6 C U1 #03
+ cmpult r23, r26, r25 C U0 carry from sum
+ stq r7, 16(r16) C L0
+ stq r8, 24(r16) C L1
+
+ umulh r11, r19, r7 C U1 #04
+ bis r31, r31, r31 C L0 st slosh
+ bis r31, r31, r31 C L1 st slosh
+ addq r27, r28, r28 C U0 sum 2 mul's
+
+ mulq r12, r19, r8 C U1 #05
+ cmpult r28, r27, r15 C L0 carry from sum
+ lda r16, 64(r16) C L1 bookkeeping
+ addq r13, r29, r13 C U0 start carry cascade
+
+ umulh r12, r19, r21 C U1 #06
+ beq r13, $fix0 C U0
+$ret0: addq r22, r14, r26 C L0
+ ldq r10, 40(r17) C L1
+
+ mulq r9, r19, r22 C U1 #07
+ beq r26, $fix1 C U0
+$ret1: addq r23, r24, r27 C L0
+ ldq r11, 48(r17) C L1
+
+ umulh r9, r19, r23 C U1 #08
+ beq r27, $fix2 C U0
+$ret2: addq r28, r25, r28 C L0
+ ldq r12, 56(r17) C L1
+
+ mulq r10, r19, r24 C U1 #09
+ beq r28, $fix3 C U0
+$ret3: addq r1, r2, r20 C L0 sum 2 mul's
+ ldq r9, 64(r17) C L1
+
+ addq r3, r4, r2 C L0 #10 2 mul's
+ bis r31, r31, r31 C U1 mul hole
+ lda r17, 64(r17) C L1 bookkeeping
+ cmpult r20, r1, r29 C U0 carry from sum
+
+ umulh r10, r19, r25 C U1 #11
+ cmpult r2, r4, r4 C U0 carry from sum
+ stq r13, -32(r16) C L0
+ stq r26, -24(r16) C L1
+
+ mulq r11, r19, r26 C U1 #12
+ addq r5, r6, r14 C U0 sum 2 mul's
+ stq r27, -16(r16) C L0
+ stq r28, -8(r16) C L1
+
+ umulh r11, r19, r27 C U1 #13
+ bis r31, r31, r31 C L0 st slosh
+ bis r31, r31, r31 C L1 st slosh
+ cmpult r14, r6, r3 C U0 carry from sum
+$L_middle_of_unrolled_loop:
+ mulq r12, r19, r28 C U1 #14
+ addq r7, r3, r5 C L0 eat carry
+ addq r20, r15, r20 C U0 carry cascade
+ ldq r10, 8(r17) C L1
+
+ umulh r12, r19, r1 C U1 #15
+ beq r20, $fix4 C U0
+$ret4: addq r2, r29, r6 C L0
+ ldq r11, 16(r17) C L1
+
+ mulq r9, r19, r2 C U1 #16
+ beq r6, $fix5 C U0
+$ret5: addq r14, r4, r7 C L0
+ ldq r12, 24(r17) C L1
+
+ umulh r9, r19, r3 C U1 #17
+ beq r7, $fix6 C U0
+$ret6: addq r5, r8, r8 C L0 sum 2
+ addq r21, r22, r13 C L1 sum 2 mul's
+
+ mulq r10, r19, r4 C U1 #18
+ addq r23, r24, r22 C L0 sum 2 mul's
+ cmpult r13, r21, r14 C L1 carry from sum
+ bgt r18, $Loop C U0
+C ---------------------------------------------------------------
+C Wind-down: the same carry cascade and stores as the loop for the last
+C group, with no further loads.  The stubs used here carry a 'c' suffix.
+$Lend:
+ cmpult r8, r5, r29 C L0 carry from last bunch
+ cmpult r22, r24, r24 C U0 carry from sum
+
+ umulh r10, r19, r5 C U1 #02
+ addq r25, r26, r23 C U0 sum 2 mul's
+ stq r20, 0(r16) C L0
+ stq r6, 8(r16) C L1
+
+ mulq r11, r19, r6 C U1 #03
+ cmpult r23, r26, r25 C U0 carry from sum
+ stq r7, 16(r16) C L0
+ stq r8, 24(r16) C L1
+
+ umulh r11, r19, r7 C U1 #04
+ addq r27, r28, r28 C U0 sum 2 mul's
+
+ mulq r12, r19, r8 C U1 #05
+ cmpult r28, r27, r15 C L0 carry from sum
+ lda r16, 64(r16) C L1 bookkeeping
+ addq r13, r29, r13 C U0 start carry cascade
+
+ umulh r12, r19, r21 C U1 #06
+ beq r13, $fix0c C U0
+$ret0c: addq r22, r14, r26 C L0
+ beq r26, $fix1c C U0
+$ret1c: addq r23, r24, r27 C L0
+ beq r27, $fix2c C U0
+$ret2c: addq r28, r25, r28 C L0
+ beq r28, $fix3c C U0
+$ret3c: addq r1, r2, r20 C L0 sum 2 mul's
+ addq r3, r4, r2 C L0 #10 2 mul's
+ lda r17, 64(r17) C L1 bookkeeping
+ cmpult r20, r1, r29 C U0 carry from sum
+ cmpult r2, r4, r4 C U0 carry from sum
+ stq r13, -32(r16) C L0
+ stq r26, -24(r16) C L1
+ addq r5, r6, r14 C U0 sum 2 mul's
+ stq r27, -16(r16) C L0
+ stq r28, -8(r16) C L1
+ cmpult r14, r6, r3 C U0 carry from sum
+ addq r7, r3, r5 C L0 eat carry
+ addq r20, r15, r20 C U0 carry cascade
+ beq r20, $fix4c C U0
+$ret4c: addq r2, r29, r6 C L0
+ beq r6, $fix5c C U0
+$ret5c: addq r14, r4, r7 C L0
+ beq r7, $fix6c C U0
+$ret6c: addq r5, r8, r8 C L0 sum 2
+ cmpult r8, r5, r29 C L0 carry from last bunch
+ stq r20, 0(r16) C L0
+ stq r6, 8(r16) C L1
+ stq r7, 16(r16) C L0
+ stq r8, 24(r16) C L1
+C Return value: carry out of the last sum plus the last high product.
+ addq r29, r21, r0
+
+C Restore the saved registers, release the frame and return.
+ ldq r26, 0(r30)
+ ldq r9, 8(r30)
+ ldq r10, 16(r30)
+ ldq r11, 24(r30)
+ ldq r12, 32(r30)
+ ldq r13, 40(r30)
+ ldq r14, 48(r30)
+ ldq r15, 56(r30)
+ ldq r29, 64(r30)
+ lda r30, 224(r30)
+ ret r31, (r26), 1
+
+C Fixup stubs.  Each is reached only when a sum limb came out zero, which
+C is the one case where adding the incoming 0/1 carry may itself have
+C carried out.  The stub ORs that incoming carry into the next carry of
+C the chain ($fix6/$fix6c add it to the high limb in r5 instead) and
+C returns to the main path.
+C $fix0w: bis r14, r29, r14 C join carries
+C br r31, $ret0w
+$fix1w: bis r24, r14, r24 C join carries
+ br r31, $ret1w
+$fix2w: bis r25, r24, r25 C join carries
+ br r31, $ret2w
+$fix3w: bis r15, r25, r15 C join carries
+ br r31, $ret3w
+$fix0: bis r14, r29, r14 C join carries
+ br r31, $ret0
+$fix1: bis r24, r14, r24 C join carries
+ br r31, $ret1
+$fix2: bis r25, r24, r25 C join carries
+ br r31, $ret2
+$fix3: bis r15, r25, r15 C join carries
+ br r31, $ret3
+$fix4: bis r29, r15, r29 C join carries
+ br r31, $ret4
+$fix5: bis r4, r29, r4 C join carries
+ br r31, $ret5
+$fix6: addq r5, r4, r5 C can't carry twice!
+ br r31, $ret6
+$fix0c: bis r14, r29, r14 C join carries
+ br r31, $ret0c
+$fix1c: bis r24, r14, r24 C join carries
+ br r31, $ret1c
+$fix2c: bis r25, r24, r25 C join carries
+ br r31, $ret2c
+$fix3c: bis r15, r25, r15 C join carries
+ br r31, $ret3c
+$fix4c: bis r29, r15, r29 C join carries
+ br r31, $ret4c
+$fix5c: bis r4, r29, r4 C join carries
+ br r31, $ret5c
+$fix6c: addq r5, r4, r5 C can't carry twice!
+ br r31, $ret6c
+
+EPILOGUE(mpn_mul_1)
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/nails/README b/gmp/mpn/alpha/ev6/nails/README
new file mode 100644
index 0000000000..b214ac50ad
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/README
@@ -0,0 +1,65 @@
+Copyright 2002, 2005 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+This directory contains assembly code for nails-enabled 21264. The code is not
+very well optimized.
+
+For addmul_N, as N grows larger, we could make multiple loads together, then do
+about 3.3 i/c. 10 cycles after the last load, we can increase to 4 i/c. This
+would surely allow addmul_4 to run at 2 c/l, but the same should be possible
+also for addmul_3 and perhaps even addmul_2.
+
+
+ current fair best
+Routine c/l unroll c/l unroll c/l i/c
+mul_1 3.25 2.75 2.75 3.273
+addmul_1 4.0 4 3.5 4 14 3.25 3.385
+addmul_2 4.0 1 2.5 2 10 2.25 3.333
+addmul_3 3.0 1 2.33 2 14 2 3.333
+addmul_4 2.5 1 2.125 2 17 2 3.135
+
+addmul_5 2 1 10
+addmul_6 2 1 12
+addmul_7 2 1 14
+
+(The "best" column doesn't account for bookkeeping instructions and
+thereby assumes infinite unrolling.)
+
+Basecase usages:
+
+1 addmul_1
+2 addmul_2
+3 addmul_3
+4 addmul_4
+5 addmul_3 + addmul_2 2.3998
+6 addmul_4 + addmul_2
+7 addmul_4 + addmul_3
diff --git a/gmp/mpn/alpha/ev6/nails/addmul_1.asm b/gmp/mpn/alpha/ev6/nails/addmul_1.asm
new file mode 100644
index 0000000000..711d4e66e5
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/addmul_1.asm
@@ -0,0 +1,396 @@
+dnl Alpha ev6 nails mpn_addmul_1.
+
+dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 42
+C EV5: 18
+C EV6: 4
+
+C TODO
+C * Reroll loop for 3.75 c/l with current 4-way unrolling.
+C * The loop is overscheduled wrt loads and wrt multiplies, in particular
+C umulh.
+C * Use FP loop count and multiple exit points, that would simplify feed-in lp0
+C and would work since the loop structure is really regular.
+
+C INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n', `r18')
+define(`vl0',`r19')
+
+define(`numb_mask',`r6')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r22')
+define(`m3b',`r23')
+
+define(`acc0',`r25')
+define(`acc1',`r27')
+
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r4')
+define(`ul3',`r5')
+
+define(`rl0',`r24')
+define(`rl1',`r24')
+define(`rl2',`r24')
+define(`rl3',`r24')
+
+define(`t0',`r7')
+define(`t1',`r8')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl This declaration is munged by configure
+NAILS_SUPPORT(2-63)
+
+ASM_START()
+C mpn_addmul_1 (nails): add up[0..n-1] * vl0 to rp[0..n-1], storing each
+C result limb masked with numb_mask, and leave the carry-out limb in r0.
+C
+C For each limb the code forms
+C   acc = rl + (mulq result >> NAIL_BITS) + previous umulh result + t1
+C where t1 = previous acc >> NUMB_BITS, and stores acc & numb_mask.
+C vl0 is pre-shifted left by NAIL_BITS so that, assuming NAIL_BITS +
+C NUMB_BITS is the limb width, umulh yields the product bits from NUMB_BITS
+C upwards and (mulq >> NAIL_BITS) the low NUMB_BITS bits.
+PROLOGUE(mpn_addmul_1)
+ sll vl0, NAIL_BITS, vl0
+C numb_mask = all ones >> NAIL_BITS.
+ lda numb_mask, -1(r31)
+ srl numb_mask, NAIL_BITS, numb_mask
+
+C Dispatch on n mod 4: 1 -> L(1m4), 2 -> L(2m4), 0 -> L(0m4), 3 falls into
+C L(3m4).  Each feed-in case biases rp/up, starts the pipeline and then
+C either enters the 4-way unrolled loop at one of L(el0)..L(el3) or, when
+C too few limbs remain, jumps into the wind-down at L(ta2)..L(ta6).
+ and n, 3, r25
+ cmpeq r25, 1, r21
+ bne r21, L(1m4)
+ cmpeq r25, 2, r21
+ bne r21, L(2m4)
+ beq r25, L(0m4)
+
+C n mod 4 == 3.
+L(3m4): ldq ul3, 0(up)
+ lda n, -4(n)
+ ldq ul0, 8(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq ul1, 16(up)
+ lda up, 24(up)
+ lda rp, -8(rp)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge3)
+
+C n == 3: no loop iterations, finish in the wind-down code.
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ addq t0, r31, acc1
+ addq rl3, acc1, acc1
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ addq t0, m3b, acc0
+ srl acc1,NUMB_BITS, t1
+ br r31, L(ta3)
+
+L(ge3): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ addq t0, r31, acc1
+ umulh vl0, ul2, m2b
+ addq rl3, acc1, acc1
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ addq t0, m3b, acc0
+ srl acc1,NUMB_BITS, t1
+ br r31, L(el3)
+
+C n mod 4 == 0.  n is reduced by 8 here (the other cases start with 4);
+C four limbs are loaded before the n >= 0 test.
+L(0m4): lda n, -8(n)
+ ldq ul2, 0(up)
+ ldq ul3, 8(up)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge4)
+
+C n == 4: no loop iterations, finish in the wind-down code.
+ ldq rl2, 0(rp)
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, r31, acc0
+ umulh vl0, ul1, m1b
+ addq rl2, acc0, acc0
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ addq t0, m2b, acc1
+ srl acc0,NUMB_BITS, t1
+ br r31, L(ta4)
+
+L(ge4): ldq rl2, 0(rp)
+ srl m2a,NAIL_BITS, t0
+ ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ addq t0, r31, acc0
+ umulh vl0, ul1, m1b
+ addq rl2, acc0, acc0
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ addq t0, m2b, acc1
+ srl acc0,NUMB_BITS, t1
+ br r31, L(el0)
+
+C n mod 4 == 2.
+L(2m4): lda n, -4(n)
+ ldq ul0, 0(up)
+ ldq ul1, 8(up)
+ lda up, 16(up)
+ lda rp, -16(rp)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge2)
+
+C n == 2: no loop iterations, finish in the wind-down code.
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ addq t0, r31, acc0
+ addq rl0, acc0, acc0
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ br r31, L(ta2)
+
+L(ge2): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ addq t0, r31, acc0
+ umulh vl0, ul3, m3b
+ addq rl0, acc0, acc0
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ lda rp, 32(rp)
+ mulq vl0, ul0, m0a
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ bge n, L(el2)
+
+ br r31, L(ta6)
+
+C n mod 4 == 1.
+L(1m4): lda n, -4(n)
+ ldq ul1, 0(up)
+ lda up, 8(up)
+ lda rp, -24(rp)
+ bge n, L(ge1)
+
+C n == 1: do the single limb here and return directly.
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ addq rl1, t0, acc1
+ and acc1,numb_mask, r28
+ srl acc1,NUMB_BITS, t1
+ stq r28, 24(rp)
+ addq t1, m1b, r0
+ ret r31, (r26), 1
+
+L(ge1): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ lda rp, 32(rp)
+ mulq vl0, ul0, m0a
+ addq t0, r31, acc1
+ umulh vl0, ul0, m0b
+ addq rl1, acc1, acc1
+ ldq rl2, 0(rp)
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, m1b, acc0
+ srl acc1,NUMB_BITS, t1
+ blt n, L(ta5)
+
+L(ge5): ldq ul2, 0(up)
+ br r31, L(el1)
+
+C Main loop, four limbs per iteration.  L(el2), L(el1), L(el0) and L(el3)
+C are the entry points used by the feed-in code above.  acc0 and acc1
+C alternate as the accumulator, and r28 holds the masked limb waiting to
+C be stored.
+ ALIGN(16)
+L(top): mulq vl0, ul0, m0a C U1
+ addq t0, m0b, acc1 C L0
+ srl acc0,NUMB_BITS, t1 C U0
+ stq r28, -24(rp) C L1
+C
+L(el2): umulh vl0, ul0, m0b C U1
+ and acc0,numb_mask, r28 C L0
+ addq rl1, acc1, acc1 C U0
+ ldq rl2, 0(rp) C L1
+C
+ unop C U1
+ addq t1, acc1, acc1 C L0
+ srl m2a,NAIL_BITS, t0 C U0
+ ldq ul2, 0(up) C L1
+C
+ mulq vl0, ul1, m1a C U1
+ addq t0, m1b, acc0 C L0
+ srl acc1,NUMB_BITS, t1 C U0
+ stq r28, -16(rp) C L1
+C
+L(el1): umulh vl0, ul1, m1b C U1
+ and acc1,numb_mask, r28 C L0
+ addq rl2, acc0, acc0 C U0
+ ldq rl3, 8(rp) C L1
+C
+ lda n, -4(n) C L1
+ addq t1, acc0, acc0 C L0
+ srl m3a,NAIL_BITS, t0 C U0
+ ldq ul3, 8(up) C L1
+C
+ mulq vl0, ul2, m2a C U1
+ addq t0, m2b, acc1 C L0
+ srl acc0,NUMB_BITS, t1 C U0
+ stq r28, -8(rp) C L1
+C
+L(el0): umulh vl0, ul2, m2b C U1
+ and acc0,numb_mask, r28 C L0
+ addq rl3, acc1, acc1 C U0
+ ldq rl0, 16(rp) C L1
+C
+ unop C U1
+ addq t1, acc1, acc1 C L0
+ srl m0a,NAIL_BITS, t0 C U0
+ ldq ul0, 16(up) C L1
+C
+ mulq vl0, ul3, m3a C U1
+ addq t0, m3b, acc0 C L0
+ srl acc1,NUMB_BITS, t1 C U0
+ stq r28, 0(rp) C L1
+C
+L(el3): umulh vl0, ul3, m3b C U1
+ and acc1,numb_mask, r28 C L0
+ addq rl0, acc0, acc0 C U0
+ ldq rl1, 24(rp) C L1
+C
+ unop C U1
+ addq t1, acc0, acc0 C L0
+ srl m1a,NAIL_BITS, t0 C U0
+ ldq ul1, 24(up) C L1
+C
+ lda up, 32(up) C L0
+ unop C U1
+ lda rp, 32(rp) C L1
+ bge n, L(top) C U0
+
+C Wind-down: drain the pipeline with no further loads from up.  L(ta6),
+C L(ta5), L(ta4), L(ta3) and L(ta2) are entry points for the feed-in
+C cases that never reach the loop.
+L(end): mulq vl0, ul0, m0a
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ stq r28, -24(rp)
+L(ta6): umulh vl0, ul0, m0b
+ and acc0,numb_mask, r28
+ addq rl1, acc1, acc1
+ ldq rl2, 0(rp)
+ addq t1, acc1, acc1
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, m1b, acc0
+ srl acc1,NUMB_BITS, t1
+ stq r28, -16(rp)
+L(ta5): umulh vl0, ul1, m1b
+ and acc1,numb_mask, r28
+ addq rl2, acc0, acc0
+ ldq rl3, 8(rp)
+ addq t1, acc0, acc0
+ srl m3a,NAIL_BITS, t0
+ addq t0, m2b, acc1
+ srl acc0,NUMB_BITS, t1
+ stq r28, -8(rp)
+ unop
+ ALIGN(16)
+L(ta4): and acc0,numb_mask, r28
+ addq rl3, acc1, acc1
+ ldq rl0, 16(rp)
+ addq t1, acc1, acc1
+ srl m0a,NAIL_BITS, t0
+ addq t0, m3b, acc0
+ srl acc1,NUMB_BITS, t1
+ stq r28, 0(rp)
+ unop
+ ALIGN(16)
+L(ta3): and acc1,numb_mask, r28
+ addq rl0, acc0, acc0
+ ldq rl1, 24(rp)
+ addq t1, acc0, acc0
+ srl m1a,NAIL_BITS, t0
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ stq r28, 8(rp)
+ unop
+ ALIGN(16)
+L(ta2): and acc0,numb_mask, r28
+ addq rl1, acc1, acc1
+ addq t1, acc1, acc1
+ srl acc1,NUMB_BITS, t1
+ stq r28, 16(rp)
+ and acc1,numb_mask, r28
+C Return value: bits of the last accumulator above NUMB_BITS plus the last
+C umulh result.
+ addq t1, m1b, r0
+ stq r28, 24(rp)
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/nails/addmul_2.asm b/gmp/mpn/alpha/ev6/nails/addmul_2.asm
new file mode 100644
index 0000000000..6ff6b3ad6b
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/addmul_2.asm
@@ -0,0 +1,146 @@
+dnl Alpha ev6 nails mpn_addmul_2.
+
+dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Runs at 4.0 cycles/limb.
+
+C We could either go for 2-way unrolling over 11 cycles, or 2.75 c/l,
+C or 4-way unrolling over 20 cycles, for 2.5 c/l.
+
+
+C INPUT PARAMETERS
+define(`rp',`r16') C limbs that are loaded, added to and stored back
+define(`up',`r17') C source limbs, one loaded per loop pass
+define(`n',`r18') C limb count, decremented down to zero
+define(`vp',`r19') C the two multiplier limbs are loaded from here
+
+C Useful register aliases
+define(`numb_mask',`r24') C all ones shifted right by NAIL_BITS
+define(`ulimb',`r25') C limb most recently loaded from up
+define(`rlimb',`r27') C limb most recently loaded from rp
+
+define(`m0a',`r0') C mulq result for v0
+define(`m0b',`r1') C umulh result for v0
+define(`m1a',`r2') C mulq result for v1
+define(`m1b',`r3') C umulh result for v1
+
+define(`acc0',`r4') C partial sum for the next limb position
+define(`acc1',`r5') C partial sum for the position after that
+
+define(`v0',`r6') C limb from 0(vp), kept shifted left by NAIL_BITS
+define(`v1',`r7') C limb from 8(vp), kept shifted left by NAIL_BITS
+
+C Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+C This declaration is munged by configure
+NAILS_SUPPORT(3-63)
+
+ASM_START()
+PROLOGUE(mpn_addmul_2) C add {up,n} times the 2 limbs at vp into {rp,n}; n+1 limbs are stored, the top limb is left in r0
+ lda numb_mask,-1(r31) C numb_mask = all ones
+ srl numb_mask,NAIL_BITS,numb_mask C clear the top NAIL_BITS bits
+
+ ldq v0, 0(vp)
+ ldq v1, 8(vp)
+
+ bis r31, r31, acc0 C zero acc0
+ sll v0,NAIL_BITS, v0 C pre-shift; low products are shifted back right below
+ bis r31, r31, acc1 C zero acc1
+ sll v1,NAIL_BITS, v1
+ bis r31, r31, r19 C vp no longer needed; r19 now holds the nail carry
+
+ ldq ulimb, 0(up)
+ lda up, 8(up)
+ mulq v0, ulimb, m0a C U1
+ umulh v0, ulimb, m0b C U1
+ mulq v1, ulimb, m1a C U1
+ umulh v1, ulimb, m1b C U1
+ lda n, -1(n)
+ beq n, L(end) C U0
+
+ ALIGN(16)
+L(top): bis r31, r31, r31 C U1 nop
+ addq r19, acc0, acc0 C U0 propagate nail
+ ldq rlimb, 0(rp) C L0
+ ldq ulimb, 0(up) C L1
+
+ lda rp, 8(rp) C L1
+ srl m0a,NAIL_BITS, r8 C U0
+ lda up, 8(up) C L0
+ mulq v0, ulimb, m0a C U1
+
+ addq r8, acc0, r19 C U0
+ addq m0b, acc1, acc0 C L1
+ umulh v0, ulimb, m0b C U1
+ bis r31, r31, r31 C L0 nop
+
+ addq rlimb, r19, r19 C L1 FINAL PROD-SUM
+ srl m1a,NAIL_BITS, r8 C U0
+ lda n, -1(n) C L0
+ mulq v1, ulimb, m1a C U1
+
+ addq r8, acc0, acc0 C U0
+ bis r31, m1b, acc1 C L1
+ umulh v1, ulimb, m1b C U1
+ and r19,numb_mask, r28 C L0 extract numb part
+
+ unop
+ srl r19,NUMB_BITS, r19 C U1 extract nail part
+ stq r28, -8(rp) C L1
+ bne n, L(top) C U0
+
+L(end): ldq rlimb, 0(rp) C wind-down: same sums as the loop, no new multiplies
+ addq r19, acc0, acc0 C propagate nail
+ lda rp, 8(rp)
+ srl m0a,NAIL_BITS, r8 C U0
+ addq r8, acc0, r19
+ addq m0b, acc1, acc0
+ addq rlimb, r19, r19
+ srl m1a,NAIL_BITS, r8 C U0
+ addq r8, acc0, acc0
+ bis r31, m1b, acc1
+ and r19,numb_mask, r28 C extract limb
+
+ srl r19,NUMB_BITS, r19 C extract nail
+ stq r28, -8(rp)
+
+ addq r19, acc0, acc0 C propagate nail
+ and acc0,numb_mask, r28
+ stq r28, 0(rp) C extra limb stored one position past the n limbs read from rp
+ srl acc0,NUMB_BITS, r19
+ addq r19, acc1, r0 C r0 = last nail carry plus remaining high product
+
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/nails/addmul_3.asm b/gmp/mpn/alpha/ev6/nails/addmul_3.asm
new file mode 100644
index 0000000000..a1ffb680ec
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/addmul_3.asm
@@ -0,0 +1,169 @@
+dnl Alpha ev6 nails mpn_addmul_3.
+
+dnl Copyright 2002, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Runs at 3.0 cycles/limb.
+
+C With 2-way unrolling, we could probably reach 2.25 c/l (3.33 i/c).
+
+
+C INPUT PARAMETERS
+define(`rp',`r16') C limbs that are loaded, added to and stored back
+define(`up',`r17') C source limbs, one loaded per loop pass
+define(`n',`r18') C limb count, decremented down to zero
+define(`vp',`r19') C the three multiplier limbs are loaded from here
+
+C Useful register aliases
+define(`numb_mask',`r24') C all ones shifted right by NAIL_BITS
+define(`ulimb',`r25') C limb most recently loaded from up
+define(`rlimb',`r27') C limb most recently loaded from rp
+
+define(`m0a',`r0') C mulq result for v0; also receives the final result limb
+define(`m0b',`r1') C umulh result for v0
+define(`m1a',`r2') C mulq result for v1
+define(`m1b',`r3') C umulh result for v1
+define(`m2a',`r20') C mulq result for v2
+define(`m2b',`r21') C umulh result for v2
+
+define(`acc0',`r4') C partial sums for the next three limb positions
+define(`acc1',`r5')
+define(`acc2',`r22')
+
+define(`v0',`r6') C multiplier limbs, kept shifted left by NAIL_BITS
+define(`v1',`r7')
+define(`v2',`r23')
+
+C Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+C This declaration is munged by configure
+NAILS_SUPPORT(3-63)
+
+ASM_START()
+PROLOGUE(mpn_addmul_3) C add {up,n} times the 3 limbs at vp into {rp,n}; n+2 limbs are stored, the top limb is left in r0
+ lda numb_mask,-1(r31) C numb_mask = all ones
+ srl numb_mask,NAIL_BITS,numb_mask C clear the top NAIL_BITS bits
+
+ ldq v0, 0(vp)
+ ldq v1, 8(vp)
+ ldq v2, 16(vp)
+
+ bis r31, r31, acc0 C zero acc0
+ sll v0,NAIL_BITS, v0 C pre-shift; low products are shifted back right below
+ bis r31, r31, acc1 C zero acc1
+ sll v1,NAIL_BITS, v1
+ bis r31, r31, acc2 C zero acc2
+ sll v2,NAIL_BITS, v2
+ bis r31, r31, r19 C vp no longer needed; r19 now holds the nail carry
+
+ ldq ulimb, 0(up)
+ lda up, 8(up)
+ mulq v0, ulimb, m0a C U1
+ umulh v0, ulimb, m0b C U1
+ mulq v1, ulimb, m1a C U1
+ umulh v1, ulimb, m1b C U1
+ lda n, -1(n)
+ mulq v2, ulimb, m2a C U1
+ umulh v2, ulimb, m2b C U1
+ beq n, L(end) C U0
+
+ ALIGN(16)
+L(top): ldq rlimb, 0(rp) C L1
+ ldq ulimb, 0(up) C L0
+ bis r31, r31, r31 C U0 nop
+ addq r19, acc0, acc0 C U1 propagate nail
+
+ lda rp, 8(rp) C L1
+ srl m0a,NAIL_BITS, r8 C U0
+ lda up, 8(up) C L0
+ mulq v0, ulimb, m0a C U1
+
+ addq r8, acc0, r19 C U0
+ addq m0b, acc1, acc0 C L1
+ umulh v0, ulimb, m0b C U1
+ bis r31, r31, r31 C L0 nop
+
+ addq rlimb, r19, r19 C L1
+ srl m1a,NAIL_BITS, r8 C U0
+ bis r31, r31, r31 C L0 nop
+ mulq v1, ulimb, m1a C U1
+
+ addq r8, acc0, acc0 C U0
+ addq m1b, acc2, acc1 C L1
+ umulh v1, ulimb, m1b C U1
+ and r19,numb_mask, r28 C L0 extract numb part
+
+ bis r31, r31, r31 C L1 nop
+ srl m2a,NAIL_BITS, r8 C U0
+ lda n, -1(n) C L0
+ mulq v2, ulimb, m2a C U1
+
+ addq r8, acc1, acc1 C L0
+ bis r31, m2b, acc2 C L1
+ umulh v2, ulimb, m2b C U1
+ srl r19,NUMB_BITS, r19 C U0 extract nail part
+
+ stq r28, -8(rp) C L
+ bne n, L(top) C U0
+
+L(end): ldq rlimb, 0(rp) C wind-down: same sums as the loop, no new multiplies
+ addq r19, acc0, acc0 C propagate nail
+ lda rp, 8(rp)
+ srl m0a,NAIL_BITS, r8 C U0
+ addq r8, acc0, r19
+ addq m0b, acc1, acc0
+ addq rlimb, r19, r19
+ srl m1a,NAIL_BITS, r8 C U0
+ addq r8, acc0, acc0
+ addq m1b, acc2, acc1
+ and r19,numb_mask, r28 C extract limb
+ srl m2a,NAIL_BITS, r8 C U0
+ addq r8, acc1, acc1
+ bis r31, m2b, acc2
+ srl r19,NUMB_BITS, r19 C extract nail
+ stq r28, -8(rp)
+
+ addq r19, acc0, acc0 C propagate nail
+ and acc0,numb_mask, r28
+ stq r28, 0(rp) C first extra limb past the n limbs read from rp
+ srl acc0,NUMB_BITS, r19
+ addq r19, acc1, acc1
+
+ and acc1,numb_mask, r28
+ stq r28, 8(rp) C second extra limb
+ srl acc1,NUMB_BITS, r19
+ addq r19, acc2, m0a C m0a is r0: final limb is left there at return
+
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/nails/addmul_4.asm b/gmp/mpn/alpha/ev6/nails/addmul_4.asm
new file mode 100644
index 0000000000..77e02a4316
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/addmul_4.asm
@@ -0,0 +1,210 @@
+dnl Alpha ev6 nails mpn_addmul_4.
+
+dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Runs at 2.5 cycles/limb.
+
+C We should go for 2-way unrolling over 17 cycles, for 2.125 c/l corresponding
+C to 3.24 insn/cycle.
+
+
+C INPUT PARAMETERS
+define(`rp',`r16') C limbs that are loaded, added to and stored back
+define(`up',`r17') C source limbs, one loaded per loop pass
+define(`n',`r18') C limb count, decremented down to zero
+define(`vp',`r19') C the four multiplier limbs are loaded from here
+
+C Useful register aliases
+define(`numb_mask',`r24') C all ones shifted right by NAIL_BITS
+define(`ulimb',`r25') C limb most recently loaded from up
+define(`rlimb',`r27') C limb most recently loaded from rp
+
+define(`m0a',`r0') C mNa = mulq result, mNb = umulh result for vN
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r12') C r12 and r13 are saved and restored by the routine
+define(`m3b',`r13')
+
+define(`acc0',`r4') C partial sums for the next four limb positions
+define(`acc1',`r5')
+define(`acc2',`r22')
+define(`acc3',`r14') C r14 is saved and restored by the routine
+
+define(`v0',`r6') C multiplier limbs, kept shifted left by NAIL_BITS
+define(`v1',`r7')
+define(`v2',`r23')
+define(`v3',`r15') C r15 is saved and restored by the routine
+
+C Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+C This declaration is munged by configure
+NAILS_SUPPORT(4-63)
+
+ASM_START()
+PROLOGUE(mpn_addmul_4) C add {up,n} times the 4 limbs at vp into {rp,n}; n+3 limbs are stored, the top limb is left in r0
+ lda r30, -240(r30) C reserve 240 bytes of stack
+ stq r12, 32(r30) C save r12-r15, used below as m3a, m3b, acc3 and v3
+ stq r13, 40(r30)
+ stq r14, 48(r30)
+ stq r15, 56(r30)
+
+ lda numb_mask,-1(r31) C numb_mask = all ones
+ srl numb_mask,NAIL_BITS,numb_mask C clear the top NAIL_BITS bits
+
+ ldq v0, 0(vp)
+ ldq v1, 8(vp)
+ ldq v2, 16(vp)
+ ldq v3, 24(vp)
+
+ bis r31, r31, acc0 C zero acc0
+ sll v0,NAIL_BITS, v0 C pre-shift; low products are shifted back right below
+ bis r31, r31, acc1 C zero acc1
+ sll v1,NAIL_BITS, v1
+ bis r31, r31, acc2 C zero acc2
+ sll v2,NAIL_BITS, v2
+ bis r31, r31, acc3 C zero acc3
+ sll v3,NAIL_BITS, v3
+ bis r31, r31, r19 C vp no longer needed; r19 now holds the nail carry
+
+ ldq ulimb, 0(up)
+ lda up, 8(up)
+ mulq v0, ulimb, m0a C U1
+ umulh v0, ulimb, m0b C U1
+ mulq v1, ulimb, m1a C U1
+ umulh v1, ulimb, m1b C U1
+ lda n, -1(n)
+ mulq v2, ulimb, m2a C U1
+ umulh v2, ulimb, m2b C U1
+ mulq v3, ulimb, m3a C U1
+ umulh v3, ulimb, m3b C U1
+ beq n, L(end) C U0
+
+ ALIGN(16)
+L(top): bis r31, r31, r31 C U1 nop
+ ldq rlimb, 0(rp) C L0
+ ldq ulimb, 0(up) C L1
+ addq r19, acc0, acc0 C U0 propagate nail
+
+ bis r31, r31, r31 C L0 nop
+ bis r31, r31, r31 C U1 nop
+ bis r31, r31, r31 C L1 nop
+ bis r31, r31, r31 C U0 nop
+
+ lda rp, 8(rp) C L0
+ srl m0a,NAIL_BITS, r8 C U0
+ lda up, 8(up) C L1
+ mulq v0, ulimb, m0a C U1
+
+ addq r8, acc0, r19 C U0
+ addq m0b, acc1, acc0 C L0
+ umulh v0, ulimb, m0b C U1
+ bis r31, r31, r31 C L1 nop
+
+ addq rlimb, r19, r19 C L0
+ srl m1a,NAIL_BITS, r8 C U0
+ bis r31, r31, r31 C L1 nop
+ mulq v1, ulimb, m1a C U1
+
+ addq r8, acc0, acc0 C U0
+ addq m1b, acc2, acc1 C L0
+ umulh v1, ulimb, m1b C U1
+ and r19,numb_mask, r28 C L1 extract numb part
+
+ bis r31, r31, r31 C L0 nop
+ srl m2a,NAIL_BITS, r8 C U0
+ lda n, -1(n) C L1
+ mulq v2, ulimb, m2a C U1
+
+ addq r8, acc1, acc1 C L1
+ addq m2b, acc3, acc2 C L0
+ umulh v2, ulimb, m2b C U1
+ srl r19,NUMB_BITS, r19 C U0 extract nail part
+
+ bis r31, r31, r31 C L0 nop
+ srl m3a,NAIL_BITS, r8 C U0
+ stq r28, -8(rp) C L1
+ mulq v3, ulimb, m3a C U1
+
+ addq r8, acc2, acc2 C L0
+ bis r31, m3b, acc3 C L1
+ umulh v3, ulimb, m3b C U1
+ bne n, L(top) C U0
+
+L(end): ldq rlimb, 0(rp) C wind-down: same sums as the loop, no new multiplies
+ addq r19, acc0, acc0 C propagate nail
+ lda rp, 8(rp) C FIXME: DELETE
+ srl m0a,NAIL_BITS, r8 C U0
+ addq r8, acc0, r19
+ addq m0b, acc1, acc0
+ addq rlimb, r19, r19
+ srl m1a,NAIL_BITS, r8 C U0
+ addq r8, acc0, acc0
+ addq m1b, acc2, acc1
+ and r19,numb_mask, r28 C extract limb
+ srl m2a,NAIL_BITS, r8 C U0
+ addq r8, acc1, acc1
+ addq m2b, acc3, acc2
+ srl r19,NUMB_BITS, r19 C extract nail
+ srl m3a,NAIL_BITS, r8 C U0
+ stq r28, -8(rp)
+ addq r8, acc2, acc2
+ bis r31, m3b, acc3
+
+ addq r19, acc0, acc0 C propagate nail
+ and acc0,numb_mask, r28
+ stq r28, 0(rp) C first extra limb past the n limbs read from rp
+ srl acc0,NUMB_BITS, r19
+ addq r19, acc1, acc1
+
+ and acc1,numb_mask, r28
+ stq r28, 8(rp) C second extra limb
+ srl acc1,NUMB_BITS, r19
+ addq r19, acc2, acc2
+
+ and acc2,numb_mask, r28
+ stq r28, 16(rp) C third extra limb
+ srl acc2,NUMB_BITS, r19
+ addq r19, acc3, r0 C r0 = last nail carry plus remaining high product
+
+ ldq r12, 32(r30) C restore r12-r15 from the slots written on entry
+ ldq r13, 40(r30)
+ ldq r14, 48(r30)
+ ldq r15, 56(r30)
+ lda r30, 240(r30) C release the 240 bytes reserved on entry
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/nails/aors_n.asm b/gmp/mpn/alpha/ev6/nails/aors_n.asm
new file mode 100644
index 0000000000..f6586773f5
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/aors_n.asm
@@ -0,0 +1,233 @@
+dnl Alpha ev6 nails mpn_add_n and mpn_sub_n.
+
+dnl Copyright 2002, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+dnl Runs at 2.5 cycles/limb. It would be possible to reach 2.0 cycles/limb
+dnl with 8-way unrolling.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+define(`rp',`r16') C destination limbs
+define(`up',`r17') C first source operand
+define(`vp',`r18') C second source operand
+define(`n',`r19') C limb count
+
+define(`rl0',`r0') C rl0-rl3: OP results for the four limbs of an unrolled pass
+define(`rl1',`r1')
+define(`rl2',`r2')
+define(`rl3',`r3')
+
+define(`ul0',`r4') C ul0-ul3: limbs loaded from up
+define(`ul1',`r5')
+define(`ul2',`r6')
+define(`ul3',`r7')
+
+define(`vl0',`r22') C vl0-vl3: limbs loaded from vp
+define(`vl1',`r23')
+define(`vl2',`r24')
+define(`vl3',`r25') C r25 also serves as the single-limb loop counter on entry
+
+define(`numb_mask',`r21') C all ones shifted right by NAIL_BITS
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`CYSH',`GMP_NUMB_BITS') C default carry shift; defined again per operation below
+
+dnl This declaration is munged by configure
+NAILS_SUPPORT(1-63)
+
+ifdef(`OPERATION_add_n', `
+ define(`OP', addq)
+ define(`CYSH',`GMP_NUMB_BITS')
+ define(`func', mpn_add_n)')
+ifdef(`OPERATION_sub_n', `
+ define(`OP', subq)
+ define(`CYSH',63)
+ define(`func', mpn_sub_n)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
+
+ASM_START()
+PROLOGUE(func) C rp[i] = up[i] OP vp[i] OP carry for n limbs; the final carry/borrow bit is left in r0
+ lda numb_mask, -1(r31) C numb_mask = all ones
+ srl numb_mask, NAIL_BITS, numb_mask C clear the top NAIL_BITS bits
+ bis r31, r31, r20 C r20 = running carry/borrow, initially zero
+
+ and n, 3, r25 C r25 = n mod 4, limbs to handle one at a time
+ lda n, -4(n)
+ beq r25, L(ge4) C n is a multiple of 4: skip the single-limb loop
+
+L(lp0): ldq ul0, 0(up) C single-limb loop, runs n mod 4 times
+ lda up, 8(up)
+ ldq vl0, 0(vp)
+ lda vp, 8(vp)
+ lda rp, 8(rp)
+ lda r25, -1(r25)
+ OP ul0, vl0, rl0
+ OP rl0, r20, rl0 C apply incoming carry/borrow
+ and rl0, numb_mask, r28
+ stq r28, -8(rp)
+ srl rl0, CYSH, r20 C carry/borrow for the next limb
+ bne r25, L(lp0)
+
+ blt n, L(ret) C original n was below 4: nothing left
+
+L(ge4): ldq ul0, 0(up) C load a full group of four limb pairs
+ ldq vl0, 0(vp)
+ ldq ul1, 8(up)
+ ldq vl1, 8(vp)
+ ldq ul2, 16(up)
+ ldq vl2, 16(vp)
+ ldq ul3, 24(up)
+ ldq vl3, 24(vp)
+ lda up, 32(up)
+ lda vp, 32(vp)
+ lda n, -4(n)
+ bge n, L(ge8) C another group of four follows
+
+ OP ul0, vl0, rl0 C main-add 0
+ OP rl0, r20, rl0 C cy-add 0
+ OP ul1, vl1, rl1 C main-add 1
+ srl rl0, CYSH, r20 C gen cy 0
+ OP rl1, r20, rl1 C cy-add 1
+ and rl0,numb_mask, r27
+ br r31, L(cj0) C last group: join the wind-down code midway
+
+L(ge8): OP ul0, vl0, rl0 C main-add 0
+ ldq ul0, 0(up)
+ ldq vl0, 0(vp)
+ OP rl0, r20, rl0 C cy-add 0
+ OP ul1, vl1, rl1 C main-add 1
+ srl rl0, CYSH, r20 C gen cy 0
+ ldq ul1, 8(up)
+ ldq vl1, 8(vp)
+ OP rl1, r20, rl1 C cy-add 1
+ and rl0,numb_mask, r27
+ OP ul2, vl2, rl2 C main-add 2
+ srl rl1, CYSH, r20 C gen cy 1
+ ldq ul2, 16(up)
+ ldq vl2, 16(vp)
+ OP rl2, r20, rl2 C cy-add 2
+ and rl1,numb_mask, r28
+ stq r27, 0(rp)
+ OP ul3, vl3, rl3 C main-add 3
+ srl rl2, CYSH, r20 C gen cy 2
+ ldq ul3, 24(up)
+ ldq vl3, 24(vp)
+ OP rl3, r20, rl3 C cy-add 3
+ and rl2,numb_mask, r27
+ stq r28, 8(rp)
+ lda rp, 32(rp)
+ lda up, 32(up)
+ lda vp, 32(vp)
+ lda n, -4(n)
+ blt n, L(end) C no further group to load: wind down
+
+ ALIGN(32)
+L(top): OP ul0, vl0, rl0 C main-add 0
+ srl rl3, CYSH, r20 C gen cy 3
+ ldq ul0, 0(up)
+ ldq vl0, 0(vp)
+
+ OP rl0, r20, rl0 C cy-add 0
+ and rl3,numb_mask, r28
+ stq r27, -16(rp)
+ bis r31, r31, r31
+
+ OP ul1, vl1, rl1 C main-add 1
+ srl rl0, CYSH, r20 C gen cy 0
+ ldq ul1, 8(up)
+ ldq vl1, 8(vp)
+
+ OP rl1, r20, rl1 C cy-add 1
+ and rl0,numb_mask, r27
+ stq r28, -8(rp)
+ bis r31, r31, r31
+
+ OP ul2, vl2, rl2 C main-add 2
+ srl rl1, CYSH, r20 C gen cy 1
+ ldq ul2, 16(up)
+ ldq vl2, 16(vp)
+
+ OP rl2, r20, rl2 C cy-add 2
+ and rl1,numb_mask, r28
+ stq r27, 0(rp)
+ bis r31, r31, r31
+
+ OP ul3, vl3, rl3 C main-add 3
+ srl rl2, CYSH, r20 C gen cy 2
+ ldq ul3, 24(up)
+ ldq vl3, 24(vp)
+
+ OP rl3, r20, rl3 C cy-add 3
+ and rl2,numb_mask, r27
+ stq r28, 8(rp)
+ bis r31, r31, r31
+
+ bis r31, r31, r31
+ lda n, -4(n)
+ lda up, 32(up)
+ lda vp, 32(vp)
+
+ bis r31, r31, r31
+ bis r31, r31, r31
+ lda rp, 32(rp)
+ bge n, L(top)
+
+L(end): OP ul0, vl0, rl0 C main-add 0
+ srl rl3, CYSH, r20 C gen cy 3
+ OP rl0, r20, rl0 C cy-add 0
+ and rl3,numb_mask, r28
+ stq r27, -16(rp)
+ OP ul1, vl1, rl1 C main-add 1
+ srl rl0, CYSH, r20 C gen cy 0
+ OP rl1, r20, rl1 C cy-add 1
+ and rl0,numb_mask, r27
+ stq r28, -8(rp)
+L(cj0): OP ul2, vl2, rl2 C main-add 2
+ srl rl1, CYSH, r20 C gen cy 1
+ OP rl2, r20, rl2 C cy-add 2
+ and rl1,numb_mask, r28
+ stq r27, 0(rp)
+ OP ul3, vl3, rl3 C main-add 3
+ srl rl2, CYSH, r20 C gen cy 2
+ OP rl3, r20, rl3 C cy-add 3
+ and rl2,numb_mask, r27
+ stq r28, 8(rp)
+
+ srl rl3, CYSH, r20 C gen cy 3
+ and rl3,numb_mask, r28
+ stq r27, 16(rp)
+ stq r28, 24(rp)
+
+L(ret): and r20, 1, r0 C r0 = low bit of the last carry/borrow
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/nails/gmp-mparam.h b/gmp/mpn/alpha/ev6/nails/gmp-mparam.h
new file mode 100644
index 0000000000..7949fe8df8
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/gmp-mparam.h
@@ -0,0 +1,72 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Generated by tuneup.c, 2004-02-07, gcc 3.3 */
+
+#define MUL_TOOM22_THRESHOLD 40
+#define MUL_TOOM33_THRESHOLD 236
+
+#define SQR_BASECASE_THRESHOLD 7 /* karatsuba */
+#define SQR_TOOM2_THRESHOLD 0 /* never sqr_basecase */
+#define SQR_TOOM3_THRESHOLD 120
+
+#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define DIV_DC_THRESHOLD 48
+#define POWM_THRESHOLD 113
+
+#define HGCD_THRESHOLD 78
+#define GCD_ACCEL_THRESHOLD 3
+#define GCD_DC_THRESHOLD 392
+#define JACOBI_BASE_METHOD 1
+
+#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define DIVREM_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define USE_PREINV_DIVREM_1 0 /* no preinv with nails */
+#define USE_PREINV_MOD_1 0 /* no preinv with nails */
+#define DIVREM_2_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always */
+
+#define GET_STR_DC_THRESHOLD 15
+#define GET_STR_PRECOMPUTE_THRESHOLD 24
+#define SET_STR_THRESHOLD 6336
+
+#define MUL_FFT_TABLE { 688, 1440, 3648, 6400, 25600, 0 }
+#define MUL_FFT_MODF_THRESHOLD 488
+#define MUL_FFT_THRESHOLD 3712
+
+#define SQR_FFT_TABLE { 432, 864, 3136, 6400, 25600, 0 }
+#define SQR_FFT_MODF_THRESHOLD 480
+#define SQR_FFT_THRESHOLD 2976
diff --git a/gmp/mpn/alpha/ev6/nails/mul_1.asm b/gmp/mpn/alpha/ev6/nails/mul_1.asm
new file mode 100644
index 0000000000..da2ee3d099
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/mul_1.asm
@@ -0,0 +1,364 @@
+dnl Alpha ev6 nails mpn_mul_1.
+
+dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 42
+C EV5: 18
+C EV6: 3.25
+
+C TODO
+C * Reroll loop for 3.0 c/l with current 4-way unrolling.
+C * The loop is overscheduled wrt loads and wrt multiplies, in particular
+C umulh.
+C * Use FP loop count and multiple exit points, that would simplify feed-in lp0
+C and would work since the loop structure is really regular.
+
+C INPUT PARAMETERS
+define(`rp',`r16') C destination limbs (only stored to by this routine)
+define(`up',`r17') C source limbs
+define(`n', `r18') C limb count
+define(`vl0',`r19') C multiplier limb, shifted left by NAIL_BITS on entry
+
+define(`numb_mask',`r6') C all ones shifted right by NAIL_BITS
+
+define(`m0a',`r0') C mNa = mulq result, mNb = umulh result for source limb ulN
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r22')
+define(`m3b',`r23')
+
+define(`acc0',`r25') C acc0 and acc1 alternate as the sum for successive limbs
+define(`acc1',`r27')
+
+define(`ul0',`r4') C ul0 and ul2 share r4
+define(`ul1',`r5') C ul1 and ul3 share r5
+define(`ul2',`r4')
+define(`ul3',`r5')
+
+define(`rl0',`r24') C rl0-rl3 are not referenced by the mpn_mul_1 code below
+define(`rl1',`r24')
+define(`rl2',`r24')
+define(`rl3',`r24')
+
+define(`t0',`r7') C t0: low product shifted right by NAIL_BITS
+define(`t1',`r8') C t1: nail part of the previous sum
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl This declaration is munged by configure
+NAILS_SUPPORT(1-63)
+
+ASM_START()
+PROLOGUE(mpn_mul_1) C store the n low limbs of {up,n} times vl0 at rp; the top limb is left in r0
+ sll vl0, NAIL_BITS, vl0 C pre-shift; low products are shifted back right below
+ lda numb_mask, -1(r31) C numb_mask = all ones
+ srl numb_mask, NAIL_BITS, numb_mask C clear the top NAIL_BITS bits
+
+ and n, 3, r25 C r25 = n mod 4 picks the feed-in path
+ cmpeq r25, 1, r21
+ bne r21, L(1m4)
+ cmpeq r25, 2, r21
+ bne r21, L(2m4)
+ beq r25, L(0m4)
+
+L(3m4): ldq ul3, 0(up) C feed-in for n mod 4 = 3
+ lda n, -4(n)
+ ldq ul0, 8(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq ul1, 16(up)
+ lda up, 24(up)
+ lda rp, -8(rp) C bias rp so the shared store offsets land on the right limbs
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge3) C n >= 7: enter the main loop
+
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ srl m3a,NAIL_BITS, t0
+ addq t0, r31, acc1
+ srl m0a,NAIL_BITS, t0
+ addq t0, m3b, acc0
+ srl acc1,NUMB_BITS, t1
+ br r31, L(ta3) C n = 3: finish in the tail code
+
+L(ge3): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ srl m3a,NAIL_BITS, t0
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ addq t0, r31, acc1
+ umulh vl0, ul2, m2b
+ srl m0a,NAIL_BITS, t0
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ addq t0, m3b, acc0
+ srl acc1,NUMB_BITS, t1
+ br r31, L(el3)
+
+L(0m4): lda n, -8(n) C feed-in for n mod 4 = 0
+ ldq ul2, 0(up)
+ ldq ul3, 8(up)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge4) C n >= 8: enter the main loop
+
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, r31, acc0
+ umulh vl0, ul1, m1b
+ srl m3a,NAIL_BITS, t0
+ addq t0, m2b, acc1
+ srl acc0,NUMB_BITS, t1
+ br r31, L(ta4) C n = 4: finish in the tail code
+
+L(ge4): srl m2a,NAIL_BITS, t0
+ ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ addq t0, r31, acc0
+ umulh vl0, ul1, m1b
+ srl m3a,NAIL_BITS, t0
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ addq t0, m2b, acc1
+ srl acc0,NUMB_BITS, t1
+ br r31, L(el0)
+
+L(2m4): lda n, -4(n) C feed-in for n mod 4 = 2
+ ldq ul0, 0(up)
+ ldq ul1, 8(up)
+ lda up, 16(up)
+ lda rp, -16(rp) C bias rp so the shared store offsets land on the right limbs
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge2) C n >= 6
+
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ srl m0a,NAIL_BITS, t0
+ addq t0, r31, acc0
+ srl m1a,NAIL_BITS, t0
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ br r31, L(ta2) C n = 2: finish in the tail code
+
+L(ge2): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ srl m0a,NAIL_BITS, t0
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ addq t0, r31, acc0
+ umulh vl0, ul3, m3b
+ srl m1a,NAIL_BITS, t0
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ lda rp, 32(rp)
+ mulq vl0, ul0, m0a
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ bge n, L(el2) C n >= 10: enter the main loop
+
+ br r31, L(ta6) C n = 6: finish in the tail code
+
+L(1m4): lda n, -4(n) C feed-in for n mod 4 = 1
+ ldq ul1, 0(up)
+ lda up, 8(up)
+ lda rp, -24(rp) C bias rp so the shared store offsets land on the right limbs
+ bge n, L(ge1) C n >= 5
+
+ mulq vl0, ul1, m1a C n = 1: whole job done here
+ umulh vl0, ul1, m1b
+ srl m1a,NAIL_BITS, t0
+ addq t0, r31, acc1
+ and acc1,numb_mask, r28
+ srl acc1,NUMB_BITS, t1
+ stq r28, 24(rp) C with the -24 bias this is the first limb of the destination
+ addq t1, m1b, r0 C r0 = nail carry plus high product
+ ret r31, (r26), 1
+
+L(ge1): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ srl m1a,NAIL_BITS, t0
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ lda rp, 32(rp)
+ mulq vl0, ul0, m0a
+ addq t0, r31, acc1
+ umulh vl0, ul0, m0b
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, m1b, acc0
+ srl acc1,NUMB_BITS, t1
+ blt n, L(ta5) C n = 5: finish in the tail code
+
+L(ge5): ldq ul2, 0(up)
+ br r31, L(el1)
+
+ ALIGN(16)
+L(top): mulq vl0, ul0, m0a C U1
+ addq t0, m0b, acc1 C L0
+ srl acc0,NUMB_BITS, t1 C U0
+ stq r28, -24(rp) C L1
+C
+L(el2): umulh vl0, ul0, m0b C U1
+ and acc0,numb_mask, r28 C L0
+ unop C U0
+ unop C L1
+C
+ unop C U1
+ addq t1, acc1, acc1 C L0
+ srl m2a,NAIL_BITS, t0 C U0
+ ldq ul2, 0(up) C L1
+C
+ mulq vl0, ul1, m1a C U1
+ addq t0, m1b, acc0 C L0
+ srl acc1,NUMB_BITS, t1 C U0
+ stq r28, -16(rp) C L1
+C
+L(el1): umulh vl0, ul1, m1b C U1
+ and acc1,numb_mask, r28 C L0
+ unop C U0
+ lda n, -4(n) C L1
+C
+ unop C U1
+ addq t1, acc0, acc0 C L0
+ srl m3a,NAIL_BITS, t0 C U0
+ ldq ul3, 8(up) C L1
+C
+ mulq vl0, ul2, m2a C U1
+ addq t0, m2b, acc1 C L0
+ srl acc0,NUMB_BITS, t1 C U0
+ stq r28, -8(rp) C L1
+C
+L(el0): umulh vl0, ul2, m2b C U1
+ and acc0,numb_mask, r28 C L0
+ unop C U0
+ unop C L1
+C
+ unop C U1
+ addq t1, acc1, acc1 C L0
+ srl m0a,NAIL_BITS, t0 C U0
+ ldq ul0, 16(up) C L1
+C
+ mulq vl0, ul3, m3a C U1
+ addq t0, m3b, acc0 C L0
+ srl acc1,NUMB_BITS, t1 C U0
+ stq r28, 0(rp) C L1
+C
+L(el3): umulh vl0, ul3, m3b C U1
+ and acc1,numb_mask, r28 C L0
+ unop C U0
+ unop C L1
+C
+ unop C U1
+ addq t1, acc0, acc0 C L0
+ srl m1a,NAIL_BITS, t0 C U0
+ ldq ul1, 24(up) C L1
+C
+ lda up, 32(up) C L0
+ unop C U1
+ lda rp, 32(rp) C L1
+ bge n, L(top) C U0
+
+L(end): mulq vl0, ul0, m0a C wind-down: the last two limbs are multiplied, nothing more is loaded
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ stq r28, -24(rp)
+L(ta6): umulh vl0, ul0, m0b C also entered directly when n = 6
+ and acc0,numb_mask, r28
+ addq t1, acc1, acc1
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, m1b, acc0
+ srl acc1,NUMB_BITS, t1
+ stq r28, -16(rp)
+L(ta5): umulh vl0, ul1, m1b C also entered directly when n = 5
+ and acc1,numb_mask, r28
+ addq t1, acc0, acc0
+ srl m3a,NAIL_BITS, t0
+ addq t0, m2b, acc1
+ srl acc0,NUMB_BITS, t1
+ stq r28, -8(rp)
+ ALIGN(16)
+L(ta4): and acc0,numb_mask, r28 C also entered directly when n = 4
+ addq t1, acc1, acc1
+ srl m0a,NAIL_BITS, t0
+ addq t0, m3b, acc0
+ srl acc1,NUMB_BITS, t1
+ stq r28, 0(rp)
+ unop
+ ALIGN(16)
+L(ta3): and acc1,numb_mask, r28 C also entered directly when n = 3
+ addq t1, acc0, acc0
+ srl m1a,NAIL_BITS, t0
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ stq r28, 8(rp)
+ unop
+ ALIGN(16)
+L(ta2): and acc0,numb_mask, r28 C also entered directly when n = 2
+ addq t1, acc1, acc1
+ srl acc1,NUMB_BITS, t1
+ stq r28, 16(rp)
+ and acc1,numb_mask, r28
+ addq t1, m1b, r0 C r0 = last nail carry plus last high product
+ stq r28, 24(rp)
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/nails/submul_1.asm b/gmp/mpn/alpha/ev6/nails/submul_1.asm
new file mode 100644
index 0000000000..f473a59ba8
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/submul_1.asm
@@ -0,0 +1,396 @@
+dnl Alpha ev6 nails mpn_submul_1.
+
+dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 42
+C EV5: 18
+C EV6: 4
+
+C TODO
+C * Reroll loop for 3.75 c/l with current 4-way unrolling.
+C * The loop is overscheduled wrt loads and wrt multiplies, in particular
+C umulh.
+C * Use FP loop count and multiple exit points, that would simplify feed-in lp0
+C and would work since the loop structure is really regular.
+
+C INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n', `r18')
+define(`vl0',`r19')
+
+C Mask built in the prologue: all ones shifted right by NAIL_BITS.
+define(`numb_mask',`r6')
+
+C Product registers: mXa receives the mulq result and mXb the umulh result
+C for limb X of the 4-way unrolled loop.  m0a is r0, which is also the
+C register the code writes its result to just before ret.
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r22')
+define(`m3b',`r23')
+
+C Two accumulators used alternately from one limb to the next.
+define(`acc0',`r25')
+define(`acc1',`r27')
+
+C Limbs loaded from up.  Only two physical registers are used: ul0/ul2 share
+C r4 and ul1/ul3 share r5.
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r4')
+define(`ul3',`r5')
+
+C Limbs loaded from rp.  All four names are the same register, r24.
+define(`rl0',`r24')
+define(`rl1',`r24')
+define(`rl2',`r24')
+define(`rl3',`r24')
+
+C Scratch: t0 holds a shifted low product, t1 the shifted previous accumulator.
+define(`t0',`r7')
+define(`t1',`r8')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl This declaration is munged by configure
+NAILS_SUPPORT(2-63)
+
+ASM_START()
+C mpn_submul_1, nails build.  For each of the n limbs: load a limb from up,
+C multiply it by vl0, subtract the product from the limb loaded from rp and
+C store the masked difference back through rp.  The result is left in r0.
+C
+C Per limb the code forms
+C   acc = rl - ((lo >> NAIL_BITS) + hi_prev) + (acc_prev >> NUMB_BITS)
+C with lo/hi from mulq/umulh by the pre-shifted vl0, hi_prev the umulh result
+C of the preceding limb (r31, which reads as zero, for the first limb), and
+C the last shift arithmetic (sra) so that it carries the signed borrow.
+C acc & numb_mask is what gets stored; acc0 and acc1 take turns as acc.
+C
+C The main loop handles 4 limbs per iteration.  Four feed-in blocks, selected
+C by n mod 4, start the pipeline and then enter the loop at an L(elX) label
+C or, when too few limbs remain, the wind-down code at an L(taX) label.
+PROLOGUE(mpn_submul_1)
+ sll vl0, NAIL_BITS, vl0 C pre-shift the multiplier by NAIL_BITS
+ lda numb_mask, -1(r31) C numb_mask = all ones
+ srl numb_mask, NAIL_BITS, numb_mask C clear the top NAIL_BITS bits
+
+C Dispatch on n mod 4.
+ and n, 3, r25
+ cmpeq r25, 1, r21
+ bne r21, L(1m4)
+ cmpeq r25, 2, r21
+ bne r21, L(2m4)
+ beq r25, L(0m4)
+
+C n mod 4 == 3 (none of the tests above matched).
+L(3m4): ldq ul3, 0(up)
+ lda n, -4(n)
+ ldq ul0, 8(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq ul1, 16(up)
+ lda up, 24(up)
+ lda rp, -8(rp) C bias rp: the first limb is addressed as 8(rp)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge3) C n - 4 >= 0: more limbs follow, use the loop
+
+C n - 4 < 0: only the three limbs already loaded; finish in the wind-down.
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ addq t0, r31, acc1 C first limb: no high part from a previous limb
+ subq rl3, acc1, acc1
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ addq t0, m3b, acc0
+ sra acc1,NUMB_BITS, t1
+ br r31, L(ta3)
+
+L(ge3): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ addq t0, r31, acc1 C first limb: no high part from a previous limb
+ umulh vl0, ul2, m2b
+ subq rl3, acc1, acc1
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ addq t0, m3b, acc0
+ sra acc1,NUMB_BITS, t1
+ br r31, L(el3)
+
+C n mod 4 == 0.  rp needs no bias on this path.
+L(0m4): lda n, -8(n)
+ ldq ul2, 0(up)
+ ldq ul3, 8(up)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge4) C n - 8 >= 0: more limbs follow, use the loop
+
+C n - 8 < 0: only the four limbs already loaded; finish in the wind-down.
+ ldq rl2, 0(rp)
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, r31, acc0 C first limb: no high part from a previous limb
+ umulh vl0, ul1, m1b
+ subq rl2, acc0, acc0
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ addq t0, m2b, acc1
+ sra acc0,NUMB_BITS, t1
+ br r31, L(ta4)
+
+L(ge4): ldq rl2, 0(rp)
+ srl m2a,NAIL_BITS, t0
+ ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ addq t0, r31, acc0 C first limb: no high part from a previous limb
+ umulh vl0, ul1, m1b
+ subq rl2, acc0, acc0
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ addq t0, m2b, acc1
+ sra acc0,NUMB_BITS, t1
+ br r31, L(el0)
+
+C n mod 4 == 2.
+L(2m4): lda n, -4(n)
+ ldq ul0, 0(up)
+ ldq ul1, 8(up)
+ lda up, 16(up)
+ lda rp, -16(rp) C bias rp: the first limb is addressed as 16(rp)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge2) C n - 4 >= 0: more limbs follow
+
+C n - 4 < 0: only the two limbs already loaded; finish in the wind-down.
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ addq t0, r31, acc0 C first limb: no high part from a previous limb
+ subq rl0, acc0, acc0
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ addq t0, m0b, acc1
+ sra acc0,NUMB_BITS, t1
+ br r31, L(ta2)
+
+L(ge2): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ addq t0, r31, acc0 C first limb: no high part from a previous limb
+ umulh vl0, ul3, m3b
+ subq rl0, acc0, acc0
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ lda rp, 32(rp)
+ mulq vl0, ul0, m0a
+ addq t0, m0b, acc1
+ sra acc0,NUMB_BITS, t1
+ bge n, L(el2) C still >= 0 after the second -4: enter the loop
+
+ br r31, L(ta6) C otherwise the six loaded limbs are all there is
+
+C n mod 4 == 1.
+L(1m4): lda n, -4(n)
+ ldq ul1, 0(up)
+ lda up, 8(up)
+ lda rp, -24(rp) C bias rp: the first limb is addressed as 24(rp)
+ bge n, L(ge1) C n - 4 >= 0: more limbs follow
+
+C n - 4 < 0: a single limb; handled completely here, without the shared code.
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ subq rl1, t0, acc1
+ and acc1,numb_mask, r28
+ sra acc1,NUMB_BITS, t1
+ stq r28, 24(rp)
+ subq m1b, t1, r0 C result: high product minus the signed borrow
+ ret r31, (r26), 1
+
+L(ge1): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ lda rp, 32(rp)
+ mulq vl0, ul0, m0a
+ addq t0, r31, acc1 C first limb: no high part from a previous limb
+ umulh vl0, ul0, m0b
+ subq rl1, acc1, acc1
+ ldq rl2, 0(rp)
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, m1b, acc0
+ sra acc1,NUMB_BITS, t1
+ blt n, L(ta5) C negative after the second -4: five limbs in total
+
+L(ge5): ldq ul2, 0(up)
+ br r31, L(el1)
+
+C Main loop, 4 limbs per iteration.  Each limb takes two groups of four
+C instructions; the trailing U0/U1/L0/L1 tags are the author's slot notes.
+C r28 carries the masked limb from the "and" to the "stq" one group later.
+ ALIGN(16)
+L(top): mulq vl0, ul0, m0a C U1
+ addq t0, m0b, acc1 C L0
+ sra acc0,NUMB_BITS, t1 C U0
+ stq r28, -24(rp) C L1
+C
+L(el2): umulh vl0, ul0, m0b C U1
+ and acc0,numb_mask, r28 C L0
+ subq rl1, acc1, acc1 C U0
+ ldq rl2, 0(rp) C L1
+C
+ unop C U1
+ addq t1, acc1, acc1 C L0
+ srl m2a,NAIL_BITS, t0 C U0
+ ldq ul2, 0(up) C L1
+C
+ mulq vl0, ul1, m1a C U1
+ addq t0, m1b, acc0 C L0
+ sra acc1,NUMB_BITS, t1 C U0
+ stq r28, -16(rp) C L1
+C
+L(el1): umulh vl0, ul1, m1b C U1
+ and acc1,numb_mask, r28 C L0
+ subq rl2, acc0, acc0 C U0
+ ldq rl3, 8(rp) C L1
+C
+C NOTE(review): the lda below is tagged L1, like the ldq two lines after it;
+C the other groups tag this position U1 -- confirm the intended slot.
+ lda n, -4(n) C L1
+ addq t1, acc0, acc0 C L0
+ srl m3a,NAIL_BITS, t0 C U0
+ ldq ul3, 8(up) C L1
+C
+ mulq vl0, ul2, m2a C U1
+ addq t0, m2b, acc1 C L0
+ sra acc0,NUMB_BITS, t1 C U0
+ stq r28, -8(rp) C L1
+C
+L(el0): umulh vl0, ul2, m2b C U1
+ and acc0,numb_mask, r28 C L0
+ subq rl3, acc1, acc1 C U0
+ ldq rl0, 16(rp) C L1
+C
+ unop C U1
+ addq t1, acc1, acc1 C L0
+ srl m0a,NAIL_BITS, t0 C U0
+ ldq ul0, 16(up) C L1
+C
+ mulq vl0, ul3, m3a C U1
+ addq t0, m3b, acc0 C L0
+ sra acc1,NUMB_BITS, t1 C U0
+ stq r28, 0(rp) C L1
+C
+L(el3): umulh vl0, ul3, m3b C U1
+ and acc1,numb_mask, r28 C L0
+ subq rl0, acc0, acc0 C U0
+ ldq rl1, 24(rp) C L1
+C
+ unop C U1
+ addq t1, acc0, acc0 C L0
+ srl m1a,NAIL_BITS, t0 C U0
+ ldq ul1, 24(up) C L1
+C
+ lda up, 32(up) C L0
+ unop C U1
+ lda rp, 32(rp) C L1
+ bge n, L(top) C U0
+
+C Wind-down: the same per-limb sequence as the loop but with no further loads
+C from up.  L(ta6) .. L(ta2) are also entered directly from the feed-in code
+C when the operand is too short for the loop.
+L(end): mulq vl0, ul0, m0a
+ addq t0, m0b, acc1
+ sra acc0,NUMB_BITS, t1
+ stq r28, -24(rp)
+L(ta6): umulh vl0, ul0, m0b
+ and acc0,numb_mask, r28
+ subq rl1, acc1, acc1
+ ldq rl2, 0(rp)
+ addq t1, acc1, acc1
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, m1b, acc0
+ sra acc1,NUMB_BITS, t1
+ stq r28, -16(rp)
+L(ta5): umulh vl0, ul1, m1b
+ and acc1,numb_mask, r28
+ subq rl2, acc0, acc0
+ ldq rl3, 8(rp)
+ addq t1, acc0, acc0
+ srl m3a,NAIL_BITS, t0
+ addq t0, m2b, acc1
+ sra acc0,NUMB_BITS, t1
+ stq r28, -8(rp)
+ unop
+ ALIGN(16)
+L(ta4): and acc0,numb_mask, r28
+ subq rl3, acc1, acc1
+ ldq rl0, 16(rp)
+ addq t1, acc1, acc1
+ srl m0a,NAIL_BITS, t0
+ addq t0, m3b, acc0
+ sra acc1,NUMB_BITS, t1
+ stq r28, 0(rp)
+ unop
+ ALIGN(16)
+L(ta3): and acc1,numb_mask, r28
+ subq rl0, acc0, acc0
+ ldq rl1, 24(rp)
+ addq t1, acc0, acc0
+ srl m1a,NAIL_BITS, t0
+ addq t0, m0b, acc1
+ sra acc0,NUMB_BITS, t1
+ stq r28, 8(rp)
+ unop
+ ALIGN(16)
+L(ta2): and acc0,numb_mask, r28
+ subq rl1, acc1, acc1
+ addq t1, acc1, acc1
+ sra acc1,NUMB_BITS, t1
+ stq r28, 16(rp)
+ and acc1,numb_mask, r28
+ subq m1b, t1, r0 C result: last high product minus the signed borrow
+ stq r28, 24(rp)
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/slot.pl b/gmp/mpn/alpha/ev6/slot.pl
new file mode 100755
index 0000000000..a4c8a36882
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/slot.pl
@@ -0,0 +1,318 @@
+#!/usr/bin/perl -w
+
+# Copyright 2000, 2001, 2003-2005, 2011 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of either:
+#
+# * the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+#
+# or
+#
+# * the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any
+# later version.
+#
+# or both in parallel, as here.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+#
+# You should have received copies of the GNU General Public License and the
+# GNU Lesser General Public License along with the GNU MP Library. If not,
+# see https://www.gnu.org/licenses/.
+
+
+# Usage: slot.pl [filename.o]...
+#
+# Run "objdump" to produce a disassembly of the given object file(s) and
+# annotate the output with "U" or "L" slotting which Alpha EV6 will use.
+#
+# When an instruction is E (ie. either U or L), an "eU" or "eL" is shown, as
+# a reminder that it wasn't a fixed requirement that gave the U or L, but
+# the octaword slotting rules.
+#
+# If an instruction is not recognised, that octaword does not get any U/L
+# shown, only lower-case "u", "l" or "e" for the instructions which are
+# known. Add any unknown instructions to %optable below.
+
+
+use strict;
+
+# The U or L which various instructions demand, or E if either.
+#
+# Keys are the mnemonics as printed by objdump.  An opcode missing from this
+# table makes its whole octaword go unslotted in the output, so every
+# instruction used by the sources being examined should be listed.  'sra' is
+# listed next to the other shifts because the nails submul_1 code uses it in
+# every loop group.
+#
+my %optable =
+  (
+   'addq'   => 'E',
+   'and'    => 'E',
+   'andnot' => 'E',
+   'beq'    => 'U',
+   'bge'    => 'U',
+   'bgt'    => 'U',
+   'bic'    => 'E',
+   'bis'    => 'E',
+   'blt'    => 'U',
+   'bne'    => 'U',
+   'br'     => 'L',
+   'clr'    => 'E',
+   'cmpule' => 'E',
+   'cmpult' => 'E',
+   'cmpeq'  => 'E',
+   'cmoveq' => 'E',
+   'cmovne' => 'E',
+   'ctpop'  => 'U',
+   'ctlz'   => 'U',
+   'cttz'   => 'U',
+   'extbl'  => 'U',
+   'extlh'  => 'U',
+   'extll'  => 'U',
+   'extqh'  => 'U',
+   'extql'  => 'U',
+   'extwh'  => 'U',
+   'extwl'  => 'U',
+   'jsr'    => 'L',
+   'lda'    => 'E',
+   'ldah'   => 'E',
+   'ldbu'   => 'L',
+   'ldl'    => 'L',
+   'ldq'    => 'L',
+   'ldt'    => 'L',
+   'ret'    => 'L',
+   'mov'    => 'E',
+   'mull'   => 'U',
+   'mulq'   => 'U',
+   'negq'   => 'E',
+   'nop'    => 'E',
+   'not'    => 'E',
+   's8addq' => 'E',
+   's8subq' => 'E',
+   # 'sextb' => ?
+   # 'sextl' => ?
+   'sll'    => 'U',
+   'sra'    => 'U',
+   'srl'    => 'U',
+   'stq'    => 'L',
+   'subq'   => 'E',
+   'umulh'  => 'U',
+   'unop'   => 'E',
+   'xor'    => 'E',
+  );
+
+# Slottings used for a given pattern of U/L/E in an octaword.  This is as
+# per the "Ebox Slotting" section of the EV6 hardware reference manual.
+#
+# The list below is a flat sequence of "pattern slotting" pairs, sorted by
+# pattern, covering all 81 combinations of U, L and E.
+#
+my %slottable = qw(
+    EEEE ULUL   EEEL ULUL   EEEU ULLU
+    EELE ULLU   EELL UULL   EELU ULLU
+    EEUE ULUL   EEUL ULUL   EEUU LLUU
+    ELEE ULUL   ELEL ULUL   ELEU ULLU
+    ELLE ULLU   ELLL ULLL   ELLU ULLU
+    ELUE ULUL   ELUL ULUL   ELUU LLUU
+    EUEE LULU   EUEL LUUL   EUEU LULU
+    EULE LULU   EULL UULL   EULU LULU
+    EUUE LUUL   EUUL LUUL   EUUU LUUU
+
+    LEEE LULU   LEEL LUUL   LEEU LULU
+    LELE LULU   LELL LULL   LELU LULU
+    LEUE LUUL   LEUL LUUL   LEUU LLUU
+    LLEE LLUU   LLEL LLUL   LLEU LLUU
+    LLLE LLLU   LLLL LLLL   LLLU LLLU
+    LLUE LLUU   LLUL LLUL   LLUU LLUU
+    LUEE LULU   LUEL LUUL   LUEU LULU
+    LULE LULU   LULL LULL   LULU LULU
+    LUUE LUUL   LUUL LUUL   LUUU LUUU
+
+    UEEE ULUL   UEEL ULUL   UEEU ULLU
+    UELE ULLU   UELL UULL   UELU ULLU
+    UEUE ULUL   UEUL ULUL   UEUU ULUU
+    ULEE ULUL   ULEL ULUL   ULEU ULLU
+    ULLE ULLU   ULLL ULLL   ULLU ULLU
+    ULUE ULUL   ULUL ULUL   ULUU ULUU
+    UUEE UULL   UUEL UULL   UUEU UULU
+    UULE UULL   UULL UULL   UULU UULU
+    UUUE UUUL   UUUL UUUL   UUUU UUUU
+);
+
+# Report every four-letter combination of U, L and E that has no entry in
+# %slottable.  Combinations are visited with the first letter varying slowest,
+# each position running through U, L, E in that order.
+sub coverage {
+  my @letters = ('U', 'L', 'E');
+
+  # grow the patterns one position at a time: '' -> 1 letter -> ... -> 4
+  my @patterns = ('');
+  foreach my $position (1 .. 4) {
+    my @longer;
+    foreach my $prefix (@patterns) {
+      push @longer, map { $prefix . $_ } @letters;
+    }
+    @patterns = @longer;
+  }
+
+  foreach my $pattern (@patterns) {
+    next if defined $slottable{$pattern};
+    print "slottable missing: $pattern\n";
+  }
+}
+
+# Consistency checks for %slottable.  For the patterns where the slotting is
+# determined by a simple rule, compute the slotting the rule gives and print a
+# message if the table disagrees.
+#
+#   no E              the pattern must map to itself
+#   one E             the E becomes whichever of U or L the other three
+#                     positions use the least
+#   two E and two U   both E's become L
+#   two E and two L   both E's become U
+#
+# Every other pattern (two E's with one U and one L, or three or four E's) is
+# not checked.
+sub check {
+  foreach my $pattern (keys %slottable) {
+    my $count_e = ($pattern =~ tr/E//);
+    my $count_l = ($pattern =~ tr/L//);
+    my $count_u = ($pattern =~ tr/U//);
+
+    my $actual = $slottable{$pattern};
+    my $expected = $pattern;
+
+    if ($count_e == 1) {
+      my $fill = ($count_l > $count_u) ? 'U' : 'L';
+      $expected =~ s/E/$fill/;
+    } elsif ($count_e == 2 && $count_l == 2) {
+      $expected =~ s/E/U/g;
+    } elsif ($count_e == 2 && $count_u == 2) {
+      $expected =~ s/E/L/g;
+    } elsif ($count_e != 0) {
+      next;
+    }
+
+    print "slottable $pattern want $expected got $actual\n"
+      if $expected ne $actual;
+  }
+}
+
+# Run "objdump -Srfh" on $file and print each recognised instruction line
+# with a slotting annotation inserted between the address/bytes part and the
+# instruction text.
+#
+# Instructions are collected per octaword, keyed by the last hex digit of
+# their address (0, 4, 8, c).  When the instruction at ...c arrives, the four
+# types are looked up in %slottable (highest address first) and the four
+# lines are printed.  For each line the annotation is the instruction's own
+# demand in lower case when the slot was chosen by the octaword rules (an E
+# instruction), followed by the slot; a blank slot means the octaword held an
+# opcode not in %optable.
+#
+# Lines of objdump output that are not instructions are not echoed, and
+# instructions after the last complete octaword are not printed.
+#
+# Dies if the pipe cannot be opened or if closing it reports failure.
+sub disassemble {
+  my ($file) = @_;
+
+  open (IN, "objdump -Srfh $file |") || die "Cannot open pipe from objdump\n";
+
+  my (%pre, %post, %type);
+  while (<IN>) {
+    my $line = $_ . "";
+
+    if ($line =~ /(^[ \t]*[0-9a-f]*([0-9a-f]):[ \t]*[0-9a-f][0-9a-f] [0-9a-f][0-9a-f] [0-9a-f][0-9a-f] [0-9a-f][0-9a-f] )\t(([a-z0-9]+).*)/) {
+      my ($this_pre, $addr, $this_post, $opcode) = ($1, $2, $3, $4);
+
+      my $this_type = $optable{$opcode};
+      if (! defined ($this_type)) { $this_type = ' '; }
+
+      $pre{$addr} = $this_pre;
+      $post{$addr} = $this_post;
+      $type{$addr} = $this_type;
+
+      if ($addr eq 'c') {
+        my %slot = ('0'=>' ', '4'=>' ', '8'=>' ', 'c'=>' ');
+
+        # An instruction of this octaword that was never seen counts as
+        # unknown, so the octaword simply stays unslotted instead of reading
+        # undefined values.
+        my $str = join ('', map { defined $type{$_} ? $type{$_} : ' ' }
+                              ('c', '8', '4', '0'));
+        $str = $slottable{$str};
+        if (defined $str) {
+          $slot{'c'} = substr($str,0,1);
+          $slot{'8'} = substr($str,1,1);
+          $slot{'4'} = substr($str,2,1);
+          $slot{'0'} = substr($str,3,1);
+        }
+
+        foreach my $i ('0', '4', '8', 'c') {
+          next unless defined $pre{$i};
+          if ($slot{$i} eq $type{$i}) { $type{$i} = ' '; }
+          print $pre{$i}, ' ', lc($type{$i}),$slot{$i}, '  ', $post{$i}, "\n";
+        }
+
+        %pre = ();
+        %type = ();
+        %post = ();
+      }
+    }
+  }
+
+  # Parenthesised on purpose: "close IN || die ..." would parse as
+  # close(IN || die ...), and the die could then never run.
+  close (IN) || die "Error from objdump (or objdump not available)\n";
+}
+
+# Self-check %slottable on every run; problems are printed, not fatal.
+coverage();
+check();
+
+# At least one object file is required.
+if (! @ARGV) {
+  die "Usage: slot.pl [filename.o]...\n";
+}
+
+foreach my $file (@ARGV) {
+  disassemble($file);
+}
diff --git a/gmp/mpn/alpha/ev6/sub_n.asm b/gmp/mpn/alpha/ev6/sub_n.asm
new file mode 100644
index 0000000000..a35ba40d34
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/sub_n.asm
@@ -0,0 +1,283 @@
+dnl Alpha ev6 mpn_sub_n -- Subtract two limb vectors of the same length > 0
+dnl and store difference in a third limb vector.
+
+dnl Copyright 2000, 2003, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: ?
+C EV5: 5.4
+C EV6: 2.125
+
+C INPUT PARAMETERS
+C rp r16
+C up r17
+C vp r18
+C n r19
+C cy r20 (for mpn_sub_nc)
+
+C TODO
+C Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
+C Use multi-pronged feed-in.
+C Perform additional micro-tuning
+
+C This code was written in cooperation with ev6 pipeline expert Steve Root.
+
+C Pair loads and stores where possible
+C Store pairs oct-aligned where possible (didn't need it here)
+C Stores are delayed every third cycle
+C Loads and stores are delayed by fills
+C U stays still, put code there where possible (note alternation of U1 and U0)
+C L moves because of loads and stores
+C Note dampers in L to limit damage
+
+C This odd-looking optimization expects that we're having random bits in our
+C data, so that a pure zero result is unlikely.  So we penalize the unlikely
+C case to help the common case.
+
+C u0/u1 hold limbs loaded through r17 and v0/v1 limbs loaded through r18; the
+C code alternates between the 0 and 1 pair.  u0 is r0, which is also the
+C register the borrow is finally written to before ret.
+define(`u0', `r0') define(`u1', `r3')
+define(`v0', `r1') define(`v1', `r4')
+
+C Borrow registers.  cy0 is r20, the register listed above as the borrow-in
+C parameter of the _nc entry point.
+define(`cy0', `r20') define(`cy1', `r21')
+
+C Names of the entry points defined in this file (presumably read by the
+C build machinery -- confirm against configure).
+MULFUNC_PROLOGUE(mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+C mpn_sub_nc and mpn_sub_n share one body.  mpn_sub_nc branches straight to
+C $entry, leaving cy0 (r20) as passed in; mpn_sub_n clears cy0 first.
+C Registers as used below: r16 = destination pointer, r17 and r18 = the two
+C source pointers (limbs read through r18 are subtracted from limbs read
+C through r17), r19 = limb count.  The final borrow is left in r0.
+C
+C Borrow handling in the unrolled code: each limb computes d = u - v and the
+C borrow cmpult(u, v), then subtracts the incoming borrow from d.  With a
+C borrow of 0 or 1 that second subtraction can wrap only when d == 0, so each
+C step tests d with beq and, in that case only, a $fixN stub ORs the incoming
+C borrow into the limb's borrow-out and branches back to the matching $retN.
+PROLOGUE(mpn_sub_nc)
+ br r31, $entry
+EPILOGUE()
+PROLOGUE(mpn_sub_n)
+ bis r31, r31, cy0 C clear carry in
+$entry: cmpult r19, 5, r22 C L1 r22 = (count < 5)
+ ldq u1, 0(r17) C L0 get next ones
+ ldq v1, 0(r18) C L1
+ bne r22, $Lsmall
+
+C Feed-in: start five limbs through the pipeline before entering the loop.
+ ldq u0, 8(r17) C L0 get next ones
+ ldq v0, 8(r18) C L1
+ subq u1, v1, r5 C U0 sub two data
+
+ cmpult u1, v1, r23 C U0 did it borrow
+ ldq u1, 16(r17) C L0 get next ones
+ ldq v1, 16(r18) C L1
+
+ subq u0, v0, r8 C U1 sub two data
+ subq r5, cy0, r24 C U0 borrow in
+
+ cmpult u0, v0, r22 C U1 did it borrow
+ beq r5, $fix5f C U0 fix exact zero
+$ret5f: ldq u0, 24(r17) C L0 get next ones
+ ldq v0, 24(r18) C L1
+
+ subq r8, r23, r25 C U1 borrow from last
+ subq u1, v1, r7 C U0 sub two data
+
+ beq r8, $fix6f C U1 fix exact zero
+$ret6f: cmpult u1, v1, r23 C U0 did it borrow
+ ldq u1, 32(r17) C L0 get next ones
+ ldq v1, 32(r18) C L1
+
+ lda r17, 40(r17) C L0 move pointer
+ lda r18, 40(r18) C L1 move pointer
+
+ lda r16, -8(r16) C bias: the stores below start at offset 8
+ lda r19, -13(r19) C L1 move counter
+ blt r19, $Lend C U1 loop control
+
+
+C Main loop. 8-way unrolled.
+ ALIGN(16)
+$Loop: subq u0, v0, r2 C U1 sub two data
+ stq r24, 8(r16) C L0 put an answer
+ subq r7, r22, r24 C U0 borrow from last
+ stq r25, 16(r16) C L1 pair
+
+ cmpult u0, v0, cy1 C U1 did it borrow
+ beq r7, $fix7 C U0 fix exact 0
+$ret7: ldq u0, 0(r17) C L0 get next ones
+ ldq v0, 0(r18) C L1
+
+ bis r31, r31, r31 C L damp out
+ subq r2, r23, r25 C U1 borrow from last
+ bis r31, r31, r31 C L moves in L !
+ subq u1, v1, r5 C U0 sub two data
+
+ beq r2, $fix0 C U1 fix exact zero
+$ret0: cmpult u1, v1, cy0 C U0 did it borrow
+ ldq u1, 8(r17) C L0 get next ones
+ ldq v1, 8(r18) C L1
+
+ subq u0, v0, r8 C U1 sub two data
+ stq r24, 24(r16) C L0 store pair
+ subq r5, cy1, r24 C U0 borrow from last
+ stq r25, 32(r16) C L1
+
+ cmpult u0, v0, r22 C U1 did it borrow
+ beq r5, $fix1 C U0 fix exact zero
+$ret1: ldq u0, 16(r17) C L0 get next ones
+ ldq v0, 16(r18) C L1
+
+ lda r16, 64(r16) C L0 move pointer
+ subq r8, cy0, r25 C U1 borrow from last
+ lda r19, -8(r19) C L1 move counter
+ subq u1, v1, r7 C U0 sub two data
+
+ beq r8, $fix2 C U1 fix exact zero
+$ret2: cmpult u1, v1, r23 C U0 did it borrow
+ ldq u1, 24(r17) C L0 get next ones
+ ldq v1, 24(r18) C L1
+
+ subq u0, v0, r2 C U1 sub two data
+ stq r24, -24(r16) C L0 put an answer
+ subq r7, r22, r24 C U0 borrow from last
+ stq r25, -16(r16) C L1 pair
+
+ cmpult u0, v0, cy1 C U1 did it borrow
+ beq r7, $fix3 C U0 fix exact 0
+$ret3: ldq u0, 32(r17) C L0 get next ones
+ ldq v0, 32(r18) C L1
+
+ bis r31, r31, r31 C L damp out
+ subq r2, r23, r25 C U1 borrow from last
+ bis r31, r31, r31 C L moves in L !
+ subq u1, v1, r5 C U0 sub two data
+
+ beq r2, $fix4 C U1 fix exact zero
+$ret4: cmpult u1, v1, cy0 C U0 did it borrow
+ ldq u1, 40(r17) C L0 get next ones
+ ldq v1, 40(r18) C L1
+
+ subq u0, v0, r8 C U1 sub two data
+ stq r24, -8(r16) C L0 store pair
+ subq r5, cy1, r24 C U0 borrow from last
+ stq r25, 0(r16) C L1
+
+ cmpult u0, v0, r22 C U1 did it borrow
+ beq r5, $fix5 C U0 fix exact zero
+$ret5: ldq u0, 48(r17) C L0 get next ones
+ ldq v0, 48(r18) C L1
+
+ ldl r31, 256(r17) C L0 prefetch
+ subq r8, cy0, r25 C U1 borrow from last
+ ldl r31, 256(r18) C L1 prefetch
+ subq u1, v1, r7 C U0 sub two data
+
+ beq r8, $fix6 C U1 fix exact zero
+$ret6: cmpult u1, v1, r23 C U0 did it borrow
+ ldq u1, 56(r17) C L0 get next ones
+ ldq v1, 56(r18) C L1
+
+ lda r17, 64(r17) C L0 move pointer
+ bis r31, r31, r31 C U
+ lda r18, 64(r18) C L1 move pointer
+ bge r19, $Loop C U1 loop control
+C ==== main loop end
+
+C Wind-down: finish and store the five limbs still in flight, with no further
+C loads.  The borrow out of the fifth ends up in cy0.
+$Lend: subq u0, v0, r2 C U1 sub two data
+ stq r24, 8(r16) C L0 put an answer
+ subq r7, r22, r24 C U0 borrow from last
+ stq r25, 16(r16) C L1 pair
+ cmpult u0, v0, cy1 C U1 did it borrow
+ beq r7, $fix7c C U0 fix exact 0
+$ret7c: subq r2, r23, r25 C U1 borrow from last
+ subq u1, v1, r5 C U0 sub two data
+ beq r2, $fix0c C U1 fix exact zero
+$ret0c: cmpult u1, v1, cy0 C U0 did it borrow
+ stq r24, 24(r16) C L0 store pair
+ subq r5, cy1, r24 C U0 borrow from last
+ stq r25, 32(r16) C L1
+ beq r5, $fix1c C U0 fix exact zero
+$ret1c: stq r24, 40(r16) C L0 put an answer
+ lda r16, 48(r16) C L0 move pointer
+
+ lda r19, 8(r19) C undo the loop bias: r19 = limbs still to do
+ beq r19, $Lret
+
+ ldq u1, 0(r17)
+ ldq v1, 0(r18)
+C Simple loop, one limb per iteration.  Used for counts below 5 and for the
+C limbs left over after the unrolled code.  Entered with u1/v1 already loaded
+C and the borrow in cy0.
+$Lsmall:
+ lda r19, -1(r19)
+ beq r19, $Lend0
+
+ ALIGN(8)
+$Loop0: subq u1, v1, r2 C main sub
+ cmpult u1, v1, r8 C compute bw from last sub
+ ldq u1, 8(r17)
+ ldq v1, 8(r18)
+ subq r2, cy0, r5 C borrow sub
+ lda r17, 8(r17)
+ lda r18, 8(r18)
+ stq r5, 0(r16)
+ cmpult r2, cy0, cy0 C compute bw from last sub
+ lda r19, -1(r19) C decr loop cnt
+ bis r8, cy0, cy0 C combine bw from the two subs
+ lda r16, 8(r16)
+ bne r19, $Loop0
+C Last limb: same computation without loads, borrow delivered in r0.
+$Lend0: subq u1, v1, r2 C main sub
+ subq r2, cy0, r5 C borrow sub
+ cmpult u1, v1, r8 C compute bw from last sub
+ cmpult r2, cy0, cy0 C compute bw from last sub
+ stq r5, 0(r16)
+ bis r8, cy0, r0 C combine bw from the two subs
+ ret r31,(r26),1
+
+ ALIGN(8)
+$Lret: lda r0, 0(cy0) C copy borrow into return register
+ ret r31,(r26),1
+
+C Out-of-line fix-ups for a limb whose raw difference was exactly zero: OR
+C the incoming borrow into that limb's borrow-out, then resume.  The "f"
+C stubs serve the feed-in code and the "c" stubs the wind-down code.
+$fix5f: bis r23, cy0, r23 C bring forward borrow
+ br r31, $ret5f
+$fix6f: bis r22, r23, r22 C bring forward borrow
+ br r31, $ret6f
+$fix0: bis cy1, r23, cy1 C bring forward borrow
+ br r31, $ret0
+$fix1: bis cy0, cy1, cy0 C bring forward borrow
+ br r31, $ret1
+$fix2: bis r22, cy0, r22 C bring forward borrow
+ br r31, $ret2
+$fix3: bis r23, r22, r23 C bring forward borrow
+ br r31, $ret3
+$fix4: bis cy1, r23, cy1 C bring forward borrow
+ br r31, $ret4
+$fix5: bis cy1, cy0, cy0 C bring forward borrow
+ br r31, $ret5
+$fix6: bis r22, cy0, r22 C bring forward borrow
+ br r31, $ret6
+$fix7: bis r23, r22, r23 C bring forward borrow
+ br r31, $ret7
+$fix0c: bis cy1, r23, cy1 C bring forward borrow
+ br r31, $ret0c
+$fix1c: bis cy0, cy1, cy0 C bring forward borrow
+ br r31, $ret1c
+$fix7c: bis r23, r22, r23 C bring forward borrow
+ br r31, $ret7c
+
+EPILOGUE()
+ASM_END()