summaryrefslogtreecommitdiff
path: root/gmp/mpn/alpha
diff options
context:
space:
mode:
Diffstat (limited to 'gmp/mpn/alpha')
-rw-r--r--gmp/mpn/alpha/README208
-rw-r--r--gmp/mpn/alpha/add_n.asm164
-rw-r--r--gmp/mpn/alpha/addmul_1.asm99
-rw-r--r--gmp/mpn/alpha/alpha-defs.m4107
-rw-r--r--gmp/mpn/alpha/aorslsh1_n.asm164
-rw-r--r--gmp/mpn/alpha/aorslsh2_n.asm167
-rw-r--r--gmp/mpn/alpha/bdiv_dbm1c.asm282
-rw-r--r--gmp/mpn/alpha/cntlz.asm55
-rw-r--r--gmp/mpn/alpha/com.asm176
-rw-r--r--gmp/mpn/alpha/copyd.asm88
-rw-r--r--gmp/mpn/alpha/copyi.asm86
-rw-r--r--gmp/mpn/alpha/default.m4127
-rw-r--r--gmp/mpn/alpha/dive_1.c115
-rw-r--r--gmp/mpn/alpha/divrem_2.asm177
-rw-r--r--gmp/mpn/alpha/ev5/diveby3.asm332
-rw-r--r--gmp/mpn/alpha/ev5/gmp-mparam.h187
-rw-r--r--gmp/mpn/alpha/ev6/add_n.asm283
-rw-r--r--gmp/mpn/alpha/ev6/aorslsh1_n.asm172
-rw-r--r--gmp/mpn/alpha/ev6/aorsmul_1.asm398
-rw-r--r--gmp/mpn/alpha/ev6/gmp-mparam.h209
-rw-r--r--gmp/mpn/alpha/ev6/mod_1_4.asm337
-rw-r--r--gmp/mpn/alpha/ev6/mul_1.asm496
-rw-r--r--gmp/mpn/alpha/ev6/nails/README65
-rw-r--r--gmp/mpn/alpha/ev6/nails/addmul_1.asm396
-rw-r--r--gmp/mpn/alpha/ev6/nails/addmul_2.asm146
-rw-r--r--gmp/mpn/alpha/ev6/nails/addmul_3.asm169
-rw-r--r--gmp/mpn/alpha/ev6/nails/addmul_4.asm210
-rw-r--r--gmp/mpn/alpha/ev6/nails/aors_n.asm233
-rw-r--r--gmp/mpn/alpha/ev6/nails/gmp-mparam.h72
-rw-r--r--gmp/mpn/alpha/ev6/nails/mul_1.asm364
-rw-r--r--gmp/mpn/alpha/ev6/nails/submul_1.asm396
-rwxr-xr-xgmp/mpn/alpha/ev6/slot.pl318
-rw-r--r--gmp/mpn/alpha/ev6/sub_n.asm283
-rw-r--r--gmp/mpn/alpha/ev67/gcd_1.asm145
-rw-r--r--gmp/mpn/alpha/ev67/hamdist.asm111
-rw-r--r--gmp/mpn/alpha/ev67/popcount.asm101
-rw-r--r--gmp/mpn/alpha/gmp-mparam.h86
-rw-r--r--gmp/mpn/alpha/invert_limb.asm95
-rw-r--r--gmp/mpn/alpha/lshift.asm182
-rw-r--r--gmp/mpn/alpha/mod_34lsub1.asm164
-rw-r--r--gmp/mpn/alpha/mode1o.asm209
-rw-r--r--gmp/mpn/alpha/mul_1.asm102
-rw-r--r--gmp/mpn/alpha/rshift.asm180
-rw-r--r--gmp/mpn/alpha/sec_tabselect.asm137
-rw-r--r--gmp/mpn/alpha/sqr_diag_addlsh1.asm93
-rw-r--r--gmp/mpn/alpha/sub_n.asm164
-rw-r--r--gmp/mpn/alpha/submul_1.asm99
-rw-r--r--gmp/mpn/alpha/umul.asm44
-rw-r--r--gmp/mpn/alpha/unicos.m4131
49 files changed, 9124 insertions, 0 deletions
diff --git a/gmp/mpn/alpha/README b/gmp/mpn/alpha/README
new file mode 100644
index 0000000000..09c2f04047
--- /dev/null
+++ b/gmp/mpn/alpha/README
@@ -0,0 +1,208 @@
+Copyright 1996, 1997, 1999-2005 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+This directory contains mpn functions optimized for DEC Alpha processors.
+
+ALPHA ASSEMBLY RULES AND REGULATIONS
+
+The `.prologue N' pseudo op marks the end of instruction that needs special
+handling by unwinding. It also says whether $27 is really needed for computing
+the gp. The `.mask M' pseudo op says which registers are saved on the stack,
+and at what offset in the frame.
+
+Cray T3 code is very very different...
+
+"$6" / "$f6" etc is the usual syntax for registers, but on Unicos instead "r6"
+/ "f6" is required. We use the "r6" / "f6" forms, and have m4 defines expand
+them to "$6" or "$f6" where necessary.
+
+"0x" introduces a hex constant in gas and DEC as, but on Unicos "^X" is
+required. The X() macro accommodates this difference.
+
+"cvttqc" is required by DEC as, "cvttq/c" is required by Unicos, and gas will
+accept either. We use cvttqc and have an m4 define expand to cvttq/c where
+necessary.
+
+"not" as an alias for "ornot r31, ..." is available in gas and DEC as, but not
+the Unicos assembler. The full "ornot" must be used.
+
+"unop" is not available in Unicos. We make an m4 define to the usual "ldq_u
+r31,0(r30)", and in fact use that define on all systems since it comes out the
+same.
+
+"!literal!123" etc explicit relocations as per Tru64 4.0 are apparently not
+available in older alpha assemblers (including gas prior to 2.12), according to
+the GCC manual, so the assembler macro forms must be used (eg. ldgp).
+
+
+
+RELEVANT OPTIMIZATION ISSUES
+
+EV4
+
+1. This chip has very limited store bandwidth. The on-chip L1 cache is write-
+ through, and a cache line is transferred from the store buffer to the off-
+ chip L2 in as much 15 cycles on most systems. This delay hurts mpn_add_n,
+ mpn_sub_n, mpn_lshift, and mpn_rshift.
+
+2. Pairing is possible between memory instructions and integer arithmetic
+ instructions.
+
+3. mulq and umulh are documented to have a latency of 23 cycles, but 2 of these
+ cycles are pipelined. Thus, multiply instructions can be issued at a rate
+ of one each 21st cycle.
+
+EV5
+
+1. The memory bandwidth of this chip is good, both for loads and stores. The
+ L1 cache can handle two loads or one store per cycle, but two cycles after a
+ store, no ld can issue.
+
+2. mulq has a latency of 12 cycles and an issue rate of 1 each 8th cycle.
+ umulh has a latency of 14 cycles and an issue rate of 1 each 10th cycle.
+ (Note that published documentation gets these numbers slightly wrong.)
+
+3. mpn_add_n. With 4-fold unrolling, we need 37 instructions, whereof 12
+ are memory operations. This will take at least
+ ceil(37/2) [dual issue] + 1 [taken branch] = 19 cycles
+ We have 12 memory cycles, plus 4 after-store conflict cycles, or 16 data
+ cache cycles, which should be completely hidden in the 19 issue cycles.
+ The computation is inherently serial, with these dependencies:
+
+ ldq ldq
+ \ /\
+ (or) addq |
+ |\ / \ |
+ | addq cmpult
+ \ | |
+ cmpult |
+ \ /
+ or
+
+ I.e., 3 operations are needed between carry-in and carry-out, making 12
+ cycles the absolute minimum for the 4 limbs. We could replace the `or' with
+ a cmoveq/cmovne, which could issue one cycle earlier that the `or', but that
+ might waste a cycle on EV4. The total depth remain unaffected, since cmov
+ has a latency of 2 cycles.
+
+ addq
+ / \
+ addq cmpult
+ | \
+ cmpult -> cmovne
+
+ Montgomery has a slightly different way of computing carry that requires one
+ less instruction, but has depth 4 (instead of the current 3). Since the code
+ is currently instruction issue bound, Montgomery's idea should save us 1/2
+ cycle per limb, or bring us down to a total of 17 cycles or 4.25 cycles/limb.
+ Unfortunately, this method will not be good for the EV6.
+
+4. addmul_1 and friends: We previously had a scheme for splitting the single-
+ limb operand in 21-bits chunks and the multi-limb operand in 32-bit chunks,
+ and then use FP operations for every 2nd multiply, and integer operations
+ for every 2nd multiply.
+
+ But it seems much better to split the single-limb operand in 16-bit chunks,
+ since we save many integer shifts and adds that way. See powerpc64/README
+ for some more details.
+
+EV6
+
+Here we have a really parallel pipeline, capable of issuing up to 4 integer
+instructions per cycle. In actual practice, it is never possible to sustain
+more than 3.5 integer insns/cycle due to rename register shortage. One integer
+multiply instruction can issue each cycle. To get optimal speed, we need to
+pretend we are vectorizing the code, i.e., minimize the depth of recurrences.
+
+There are two dependencies to watch out for. 1) Address arithmetic
+dependencies, and 2) carry propagation dependencies.
+
+We can avoid serializing due to address arithmetic by unrolling loops, so that
+addresses don't depend heavily on an index variable. Avoiding serializing
+because of carry propagation is trickier; the ultimate performance of the code
+will be determined of the number of latency cycles it takes from accepting
+carry-in to a vector point until we can generate carry-out.
+
+Most integer instructions can execute in either the L0, U0, L1, or U1
+pipelines. Shifts only execute in U0 and U1, and multiply only in U1.
+
+CMOV instructions split into two internal instructions, CMOV1 and CMOV2. CMOV
+split the mapping process (see pg 2-26 in cmpwrgd.pdf), suggesting the CMOV
+should always be placed as the last instruction of an aligned 4 instruction
+block, or perhaps simply avoided.
+
+Perhaps the most important issue is the latency between the L0/U0 and L1/U1
+clusters; a result obtained on either cluster has an extra cycle of latency for
+consumers in the opposite cluster. Because of the dynamic nature of the
+implementation, it is hard to predict where an instruction will execute.
+
+
+
+REFERENCES
+
+"Alpha Architecture Handbook", version 4, Compaq, October 1998, order number
+EC-QD2KC-TE.
+
+"Alpha 21164 Microprocessor Hardware Reference Manual", Compaq, December 1998,
+order number EC-QP99C-TE.
+
+"Alpha 21264/EV67 Microprocessor Hardware Reference Manual", revision 1.4,
+Compaq, September 2000, order number DS-0028B-TE.
+
+"Compiler Writer's Guide for the Alpha 21264", Compaq, June 1999, order number
+EC-RJ66A-TE.
+
+All of the above are available online from
+
+ http://ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
+ ftp://ftp.compaq.com/pub/products/alphaCPUdocs
+
+"Tru64 Unix Assembly Language Programmer's Guide", Compaq, March 1996, part
+number AA-PS31D-TE.
+
+"Digital UNIX Calling Standard for Alpha Systems", Digital Equipment Corp,
+March 1996, part number AA-PY8AC-TE.
+
+The above are available online,
+
+ http://h30097.www3.hp.com/docs/pub_page/V40F_DOCS.HTM
+
+(Dunno what h30097 means in this URL, but if it moves try searching for "tru64
+online documentation" from the main www.hp.com page.)
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 79
+End:
diff --git a/gmp/mpn/alpha/add_n.asm b/gmp/mpn/alpha/add_n.asm
new file mode 100644
index 0000000000..bc572a57a9
--- /dev/null
+++ b/gmp/mpn/alpha/add_n.asm
@@ -0,0 +1,164 @@
+dnl Alpha mpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl store sum in a third limb vector.
+
+dnl Copyright 1995, 1999, 2000, 2005, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: ?
+C EV5: 4.75
+C EV6: 3
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_add_nc)
+ bis r20,r31,r25
+ br L(com)
+EPILOGUE()
+PROLOGUE(mpn_add_n)
+ bis r31,r31,r25 C clear cy
+L(com): subq r19,4,r19 C decr loop cnt
+ blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop
+C Start software pipeline for 1st loop
+ ldq r0,0(r18)
+ ldq r4,0(r17)
+ ldq r1,8(r18)
+ ldq r5,8(r17)
+ addq r17,32,r17 C update s1_ptr
+ addq r0,r4,r28 C 1st main add
+ ldq r2,16(r18)
+ addq r25,r28,r20 C 1st carry add
+ ldq r3,24(r18)
+ cmpult r28,r4,r8 C compute cy from last add
+ ldq r6,-16(r17)
+ cmpult r20,r28,r25 C compute cy from last add
+ ldq r7,-8(r17)
+ bis r8,r25,r25 C combine cy from the two adds
+ subq r19,4,r19 C decr loop cnt
+ addq r1,r5,r28 C 2nd main add
+ addq r18,32,r18 C update s2_ptr
+ addq r28,r25,r21 C 2nd carry add
+ cmpult r28,r5,r8 C compute cy from last add
+ blt r19,$Lend1 C if less than 4 limbs remain, jump
+C 1st loop handles groups of 4 limbs in a software pipeline
+ ALIGN(16)
+$Loop: cmpult r21,r28,r25 C compute cy from last add
+ ldq r0,0(r18)
+ bis r8,r25,r25 C combine cy from the two adds
+ ldq r1,8(r18)
+ addq r2,r6,r28 C 3rd main add
+ ldq r4,0(r17)
+ addq r28,r25,r22 C 3rd carry add
+ ldq r5,8(r17)
+ cmpult r28,r6,r8 C compute cy from last add
+ cmpult r22,r28,r25 C compute cy from last add
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two adds
+ stq r21,8(r16)
+ addq r3,r7,r28 C 4th main add
+ addq r28,r25,r23 C 4th carry add
+ cmpult r28,r7,r8 C compute cy from last add
+ cmpult r23,r28,r25 C compute cy from last add
+ addq r17,32,r17 C update s1_ptr
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r16,32,r16 C update res_ptr
+ addq r0,r4,r28 C 1st main add
+ ldq r2,16(r18)
+ addq r25,r28,r20 C 1st carry add
+ ldq r3,24(r18)
+ cmpult r28,r4,r8 C compute cy from last add
+ ldq r6,-16(r17)
+ cmpult r20,r28,r25 C compute cy from last add
+ ldq r7,-8(r17)
+ bis r8,r25,r25 C combine cy from the two adds
+ subq r19,4,r19 C decr loop cnt
+ stq r22,-16(r16)
+ addq r1,r5,r28 C 2nd main add
+ stq r23,-8(r16)
+ addq r25,r28,r21 C 2nd carry add
+ addq r18,32,r18 C update s2_ptr
+ cmpult r28,r5,r8 C compute cy from last add
+ bge r19,$Loop
+C Finish software pipeline for 1st loop
+$Lend1: cmpult r21,r28,r25 C compute cy from last add
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r2,r6,r28 C 3rd main add
+ addq r28,r25,r22 C 3rd carry add
+ cmpult r28,r6,r8 C compute cy from last add
+ cmpult r22,r28,r25 C compute cy from last add
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two adds
+ stq r21,8(r16)
+ addq r3,r7,r28 C 4th main add
+ addq r28,r25,r23 C 4th carry add
+ cmpult r28,r7,r8 C compute cy from last add
+ cmpult r23,r28,r25 C compute cy from last add
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r16,32,r16 C update res_ptr
+ stq r22,-16(r16)
+ stq r23,-8(r16)
+$Lend2: addq r19,4,r19 C restore loop cnt
+ beq r19,$Lret
+C Start software pipeline for 2nd loop
+ ldq r0,0(r18)
+ ldq r4,0(r17)
+ subq r19,1,r19
+ beq r19,$Lend0
+C 2nd loop handles remaining 1-3 limbs
+ ALIGN(16)
+$Loop0: addq r0,r4,r28 C main add
+ ldq r0,8(r18)
+ cmpult r28,r4,r8 C compute cy from last add
+ ldq r4,8(r17)
+ addq r28,r25,r20 C carry add
+ addq r18,8,r18
+ addq r17,8,r17
+ stq r20,0(r16)
+ cmpult r20,r28,r25 C compute cy from last add
+ subq r19,1,r19 C decr loop cnt
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r16,8,r16
+ bne r19,$Loop0
+$Lend0: addq r0,r4,r28 C main add
+ addq r28,r25,r20 C carry add
+ cmpult r28,r4,r8 C compute cy from last add
+ cmpult r20,r28,r25 C compute cy from last add
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two adds
+
+$Lret: bis r25,r31,r0 C return cy
+ ret r31,(r26),1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/addmul_1.asm b/gmp/mpn/alpha/addmul_1.asm
new file mode 100644
index 0000000000..c4e6834b61
--- /dev/null
+++ b/gmp/mpn/alpha/addmul_1.asm
@@ -0,0 +1,99 @@
+dnl Alpha mpn_addmul_1 -- Multiply a limb vector with a limb and add the
+dnl result to a second limb vector.
+
+dnl Copyright 1992, 1994, 1995, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 42
+C EV5: 18
+C EV6: 7
+
+C INPUT PARAMETERS
+C rp r16
+C up r17
+C n r18
+C vl r19
+
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ umulh r2,r19,r0 C r0 = prod_high
+ beq r18,$Lend1 C jump if size was == 1
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ addq r5,r3,r3
+ cmpult r3,r5,r4
+ stq r3,0(r16)
+ addq r16,8,r16 C res_ptr++
+ beq r18,$Lend2 C jump if size was == 2
+
+ ALIGN(8)
+$Loop: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ subq r18,1,r18 C size--
+ umulh r2,r19,r4 C r4 = cy_limb
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5,r3,r3
+ cmpult r3,r5,r5
+ stq r3,0(r16)
+ addq r16,8,r16 C res_ptr++
+ addq r5,r0,r0 C combine carries
+ bne r18,$Loop
+
+$Lend2: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ umulh r2,r19,r4 C r4 = cy_limb
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5,r3,r3
+ cmpult r3,r5,r5
+ stq r3,0(r16)
+ addq r5,r0,r0 C combine carries
+ addq r4,r0,r0 C cy_limb = prod_high + cy
+ ret r31,(r26),1
+$Lend1: addq r5,r3,r3
+ cmpult r3,r5,r5
+ stq r3,0(r16)
+ addq r0,r5,r0
+ ret r31,(r26),1
+EPILOGUE(mpn_addmul_1)
+ASM_END()
diff --git a/gmp/mpn/alpha/alpha-defs.m4 b/gmp/mpn/alpha/alpha-defs.m4
new file mode 100644
index 0000000000..af34c9294c
--- /dev/null
+++ b/gmp/mpn/alpha/alpha-defs.m4
@@ -0,0 +1,107 @@
+divert(-1)
+
+dnl m4 macros for Alpha assembler.
+
+dnl Copyright 2003, 2004 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+dnl Usage: ASSERT([reg] [,code])
+dnl
+dnl Require that the given reg is non-zero after executing the test code.
+dnl For example,
+dnl
+dnl ASSERT(r8,
+dnl ` cmpult r16, r17, r8')
+dnl
+dnl If the register argument is empty then nothing is tested, the code is
+dnl just executed. This can be used for setups required by later ASSERTs.
+dnl If the code argument is omitted then the register is just tested, with
+dnl no special setup code.
+
+define(ASSERT,
+m4_assert_numargs_range(1,2)
+m4_assert_defined(`WANT_ASSERT')
+`ifelse(WANT_ASSERT,1,
+`ifelse(`$2',,,`$2')
+ifelse(`$1',,,
+` bne $1, L(ASSERTok`'ASSERT_label_counter)
+ .long 0 C halt
+L(ASSERTok`'ASSERT_label_counter):
+define(`ASSERT_label_counter',eval(ASSERT_label_counter+1))
+')
+')')
+define(`ASSERT_label_counter',1)
+
+
+dnl Usage: bigend(`code')
+dnl
+dnl Emit the given code only for a big-endian system, like Unicos. This
+dnl can be used for instance for extra stuff needed by extwl.
+
+define(bigend,
+m4_assert_numargs(1)
+`ifdef(`HAVE_LIMB_BIG_ENDIAN',`$1',
+`ifdef(`HAVE_LIMB_LITTLE_ENDIAN',`',
+`m4_error(`Cannot assemble, unknown limb endianness')')')')
+
+
+dnl Usage: bwx_available_p
+dnl
+dnl Evaluate to 1 if the BWX byte memory instructions are available, or to
+dnl 0 if not.
+dnl
+dnl Listing the chips which do have BWX means anything we haven't looked at
+dnl will use safe non-BWX code. The only targets without BWX currently are
+dnl plain alpha (ie. ev4) and alphaev5.
+
+define(bwx_available_p,
+m4_assert_numargs(-1)
+`m4_ifdef_anyof_p(
+ `HAVE_HOST_CPU_alphaev56',
+ `HAVE_HOST_CPU_alphapca56',
+ `HAVE_HOST_CPU_alphapca57',
+ `HAVE_HOST_CPU_alphaev6',
+ `HAVE_HOST_CPU_alphaev67',
+ `HAVE_HOST_CPU_alphaev68',
+ `HAVE_HOST_CPU_alphaev69',
+ `HAVE_HOST_CPU_alphaev7',
+ `HAVE_HOST_CPU_alphaev79')')
+
+
+dnl Usage: unop
+dnl
+dnl The Cray Unicos assembler lacks unop, so give the equivalent ldq_u
+dnl explicitly.
+
+define(unop,
+m4_assert_numargs(-1)
+`ldq_u r31, 0(r30)')
+
+
+divert
diff --git a/gmp/mpn/alpha/aorslsh1_n.asm b/gmp/mpn/alpha/aorslsh1_n.asm
new file mode 100644
index 0000000000..9525e669db
--- /dev/null
+++ b/gmp/mpn/alpha/aorslsh1_n.asm
@@ -0,0 +1,164 @@
+dnl Alpha mpn_addlsh1_n/mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1).
+
+dnl Copyright 2003, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: ?
+C EV5: 6.25
+C EV6: 4.5
+
+define(`rp',`r16')
+define(`up',`r17')
+define(`vp',`r18')
+define(`n', `r19')
+
+define(`u0', `r8')
+define(`u1', `r1')
+define(`v0', `r4')
+define(`v1', `r5')
+
+define(`cy0', `r0')
+define(`cy1', `r20')
+define(`cy', `r22')
+define(`rr', `r24')
+define(`ps', `r25')
+define(`sl', `r28')
+
+ifdef(`OPERATION_addlsh1_n',`
+ define(ADDSUB, addq)
+ define(CARRY, `cmpult $1,$2,$3')
+ define(func, mpn_addlsh1_n)
+')
+ifdef(`OPERATION_sublsh1_n',`
+ define(ADDSUB, subq)
+ define(CARRY, `cmpult $2,$1,$3')
+ define(func, mpn_sublsh1_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
+
+ASM_START()
+PROLOGUE(func)
+ and n, 2, cy0
+ blbs n, L(bx1)
+L(bx0): ldq v1, 0(vp)
+ ldq u1, 0(up)
+ nop
+ bne cy0, L(b10)
+
+L(b00): lda vp, 48(vp)
+ lda up, -16(up)
+ lda rp, -8(rp)
+ br r31, L(lo0)
+
+L(b10): lda vp, 32(vp)
+ lda rp, 8(rp)
+ lda cy0, 0(r31)
+ br r31, L(lo2)
+
+L(bx1): ldq v0, 0(vp)
+ ldq u0, 0(up)
+ lda cy1, 0(r31)
+ beq cy0, L(b01)
+
+L(b11): lda vp, 40(vp)
+ lda up, -24(up)
+ lda rp, 16(rp)
+ br r31, L(lo3)
+
+L(b01): lda n, -4(n)
+ ble n, L(end)
+ lda vp, 24(vp)
+ lda up, -8(up)
+
+ ALIGN(16)
+L(top): addq v0, v0, sl C left shift vlimb
+ ldq v1, -16(vp)
+ ADDSUB u0, sl, ps C ulimb + (vlimb << 1)
+ cmplt v0, r31, cy0 C carry out #1
+ ldq u1, 16(up)
+ ADDSUB ps, cy1, rr C consume carry from previous operation
+ CARRY( ps, u0, cy) C carry out #2
+ stq rr, 0(rp)
+ addq cy, cy0, cy0 C combine carry out #1 and #2
+ CARRY( rr, ps, cy) C carry out #3
+ addq cy, cy0, cy0 C final carry out
+ lda vp, 32(vp) C bookkeeping
+L(lo0): addq v1, v1, sl
+ ldq v0, -40(vp)
+ ADDSUB u1, sl, ps
+ cmplt v1, r31, cy1
+ ldq u0, 24(up)
+ ADDSUB ps, cy0, rr
+ CARRY( ps, u1, cy)
+ stq rr, 8(rp)
+ addq cy, cy1, cy1
+ CARRY( rr, ps, cy)
+ addq cy, cy1, cy1
+ lda rp, 32(rp) C bookkeeping
+L(lo3): addq v0, v0, sl
+ ldq v1, -32(vp)
+ ADDSUB u0, sl, ps
+ cmplt v0, r31, cy0
+ ldq u1, 32(up)
+ ADDSUB ps, cy1, rr
+ CARRY( ps, u0, cy)
+ stq rr, -16(rp)
+ addq cy, cy0, cy0
+ CARRY( rr, ps, cy)
+ addq cy, cy0, cy0
+ lda up, 32(up) C bookkeeping
+L(lo2): addq v1, v1, sl
+ ldq v0, -24(vp)
+ ADDSUB u1, sl, ps
+ cmplt v1, r31, cy1
+ ldq u0, 8(up)
+ ADDSUB ps, cy0, rr
+ CARRY( ps, u1, cy)
+ stq rr, -8(rp)
+ addq cy, cy1, cy1
+ CARRY( rr, ps, cy)
+ addq cy, cy1, cy1
+ lda n, -4(n) C bookkeeping
+ bgt n, L(top)
+
+L(end): addq v0, v0, sl
+ ADDSUB u0, sl, ps
+ ADDSUB ps, cy1, rr
+ cmplt v0, r31, cy0
+ CARRY( ps, u0, cy)
+ stq rr, 0(rp)
+ addq cy, cy0, cy0
+ CARRY( rr, ps, cy)
+ addq cy, cy0, r0
+ ret r31,(r26),1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/aorslsh2_n.asm b/gmp/mpn/alpha/aorslsh2_n.asm
new file mode 100644
index 0000000000..bdee1d6d02
--- /dev/null
+++ b/gmp/mpn/alpha/aorslsh2_n.asm
@@ -0,0 +1,167 @@
+dnl Alpha mpn_addlsh2_n/mpn_sublsh2_n -- rp[] = up[] +- (vp[] << 2).
+
+dnl Copyright 2003, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: ?
+C EV5: 6
+C EV6: 3.75
+
+C TODO
+C * Tune to reach 3.5 c/l on ev6 and 5.75 c/l on ev5.
+
+define(`rp',`r16')
+define(`up',`r17')
+define(`vp',`r18')
+define(`n', `r19')
+
+define(`u0', `r8')
+define(`u1', `r1')
+define(`v0', `r4')
+define(`v1', `r5')
+
+define(`cy0', `r0')
+define(`cy1', `r20')
+define(`cy', `r22')
+define(`rr', `r24')
+define(`ps', `r25')
+define(`sl', `r28')
+
+ifdef(`OPERATION_addlsh2_n',`
+ define(ADDSUB, addq)
+ define(CARRY, `cmpult $1,$2,$3')
+ define(func, mpn_addlsh2_n)
+')
+ifdef(`OPERATION_sublsh2_n',`
+ define(ADDSUB, subq)
+ define(CARRY, `cmpult $2,$1,$3')
+ define(func, mpn_sublsh2_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n)
+
+ASM_START()
+PROLOGUE(func)
+ and n, 2, cy0
+ blbs n, L(bx1)
+L(bx0): ldq v1, 0(vp)
+ ldq u1, 0(up)
+ bis r31, r31, r2
+ bne cy0, L(b10)
+
+L(b00): lda vp, 48(vp)
+ lda up, -16(up)
+ lda rp, -8(rp)
+ s4addq v1, r31, sl
+ br r31, L(lo0)
+
+L(b10): lda vp, 32(vp)
+ lda rp, 8(rp)
+ lda cy0, 0(r31)
+ br r31, L(lo2)
+
+L(bx1): ldq v0, 0(vp)
+ ldq u0, 0(up)
+ lda cy1, 0(r31)
+ bis r31, r31, r3
+ nop
+ beq cy0, L(b01)
+
+L(b11): lda vp, 40(vp)
+ lda up, -24(up)
+ lda rp, 16(rp)
+ br r31, L(lo3)
+
+L(b01): lda n, -4(n)
+ ble n, L(end)
+ lda vp, 24(vp)
+ lda up, -8(up)
+
+ ALIGN(16)
+L(top): s4addq v0, r3, sl C combined vlimb
+ ldq v1, -16(vp)
+ ADDSUB u0, sl, ps C ulimb + (vlimb << 1)
+ ldq u1, 16(up)
+ srl v0, 62, r2 C high v bits
+ ADDSUB ps, cy1, rr C consume carry from previous operation
+ CARRY( ps, u0, cy0) C carry out #2
+ stq rr, 0(rp)
+ CARRY( rr, ps, cy) C carry out #3
+ lda vp, 32(vp) C bookkeeping
+ addq cy, cy0, cy0 C final carry out
+ s4addq v1, r2, sl
+L(lo0): ldq v0, -40(vp)
+ ADDSUB u1, sl, ps
+ ldq u0, 24(up)
+ srl v1, 62, r3
+ ADDSUB ps, cy0, rr
+ CARRY( ps, u1, cy1)
+ stq rr, 8(rp)
+ CARRY( rr, ps, cy)
+ lda rp, 32(rp) C bookkeeping
+ addq cy, cy1, cy1
+L(lo3): s4addq v0, r3, sl
+ ldq v1, -32(vp)
+ ADDSUB u0, sl, ps
+ ldq u1, 32(up)
+ srl v0, 62, r2
+ ADDSUB ps, cy1, rr
+ CARRY( ps, u0, cy0)
+ stq rr, -16(rp)
+ CARRY( rr, ps, cy)
+ lda up, 32(up) C bookkeeping
+ addq cy, cy0, cy0
+L(lo2): s4addq v1, r2, sl
+ ldq v0, -24(vp)
+ ADDSUB u1, sl, ps
+ ldq u0, 8(up)
+ srl v1, 62, r3
+ ADDSUB ps, cy0, rr
+ CARRY( ps, u1, cy1)
+ stq rr, -8(rp)
+ CARRY( rr, ps, cy)
+ lda n, -4(n) C bookkeeping
+ addq cy, cy1, cy1
+ bgt n, L(top)
+
+L(end): s4addq v0, r3, sl
+ ADDSUB u0, sl, ps
+ srl v0, 62, r2
+ ADDSUB ps, cy1, rr
+ CARRY( ps, u0, cy0)
+ stq rr, 0(rp)
+ CARRY( rr, ps, cy)
+ addq cy, cy0, cy0
+ addq cy0, r2, r0
+
+ ret r31,(r26),1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/bdiv_dbm1c.asm b/gmp/mpn/alpha/bdiv_dbm1c.asm
new file mode 100644
index 0000000000..472966ca98
--- /dev/null
+++ b/gmp/mpn/alpha/bdiv_dbm1c.asm
@@ -0,0 +1,282 @@
+dnl Alpha mpn_bdiv_dbm1c.
+
+dnl Copyright 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 42
+C EV5: 18
+C EV6: 3
+
+C TODO
+C * Try less unrolling, 2-way should give the same performance.
+C * Optimize feed-in and wind-down code, for speed, and perhaps further for
+C code size.
+C * This runs optimally given the algorithm, r8 is on a 3 operation recurrency
+C path. We have not tried very hard to find a better algorithm. Perhaps
+C it would be a good task for the GNU superoptimizer.
+
+C INPUT PARAMETERS
+define(`rp', `r16')
+define(`up', `r17')
+define(`n', `r18')
+define(`bd', `r19')
+define(`cy', `r19')
+
+
+ASM_START()
+PROLOGUE(mpn_bdiv_dbm1c)
+ mov r20, r8
+
+ ldq r24, 0(r17)
+ and r18, 3, r28
+ lda r18, -4(r18)
+ beq r28, L(b0)
+ cmpeq r28, 1, r21
+ bne r21, L(b1)
+ cmpeq r28, 2, r21
+ bne r21, L(b2)
+
+
+L(b3): ldq r2, 8(r17)
+ ldq r3, 16(r17)
+ bgt r18, L(gt3)
+
+ mulq r24, r19, r5 C U1
+ umulh r24, r19, r21 C U1
+ mulq r2, r19, r6 C U1
+ umulh r2, r19, r22 C U1
+ mulq r3, r19, r7 C U1
+ umulh r3, r19, r23 C U1
+ lda r16, -32(r16)
+ br L(cj3)
+
+L(gt3): ldq r0, 24(r17)
+ mulq r24, r19, r5 C U1
+ umulh r24, r19, r21 C U1
+ ldq r1, 32(r17)
+ mulq r2, r19, r6 C U1
+ umulh r2, r19, r22 C U1
+ ldq r2, 40(r17)
+ mulq r3, r19, r7 C U1
+ umulh r3, r19, r23 C U1
+ ldq r3, 48(r17)
+ lda r18, -4(r18)
+ lda r17, 56(r17)
+ mulq r0, r19, r4 C U1
+ bgt r18, L(L3)
+
+ br L(cj7)
+
+
+L(b2): ldq r3, 8(r17)
+ bgt r18, L(gt2)
+
+ mulq r24, r19, r6 C U1
+ umulh r24, r19, r22 C U1
+ mulq r3, r19, r7 C U1
+ umulh r3, r19, r23 C U1
+ lda r16, -40(r16)
+ br L(cj2)
+
+L(gt2): ldq r0, 16(r17)
+ ldq r1, 24(r17)
+ mulq r24, r19, r6 C U1
+ umulh r24, r19, r22 C U1
+ ldq r2, 32(r17)
+ mulq r3, r19, r7 C U1
+ umulh r3, r19, r23 C U1
+ ldq r3, 40(r17)
+ lda r18, -4(r18)
+ lda r17, 48(r17)
+ mulq r0, r19, r4 C U1
+ umulh r0, r19, r20 C U1
+ lda r16, -8(r16)
+ bgt r18, L(gt6)
+
+ mulq r1, r19, r5 C U1
+ br L(cj6)
+
+L(gt6): ldq r0, 0(r17)
+ mulq r1, r19, r5 C U1
+ br L(L2)
+
+
+L(b1): bgt r18, L(gt1)
+
+ mulq r24, r19, r7 C U1
+ umulh r24, r19, r23 C U1
+ lda r16, -48(r16)
+ br L(cj1)
+
+L(gt1): ldq r0, 8(r17)
+ ldq r1, 16(r17)
+ ldq r2, 24(r17)
+ mulq r24, r19, r7 C U1
+ umulh r24, r19, r23 C U1
+ ldq r3, 32(r17)
+ lda r18, -4(r18)
+ lda r17, 40(r17)
+ mulq r0, r19, r4 C U1
+ umulh r0, r19, r20 C U1
+ lda r16, -16(r16)
+ bgt r18, L(gt5)
+
+ mulq r1, r19, r5 C U1
+ umulh r1, r19, r21 C U1
+ mulq r2, r19, r6 C U1
+ br L(cj5)
+
+L(gt5): ldq r0, 0(r17)
+ mulq r1, r19, r5 C U1
+ umulh r1, r19, r21 C U1
+ ldq r1, 8(r17)
+ mulq r2, r19, r6 C U1
+ br L(L1)
+
+
+L(b0): ldq r1, 8(r17)
+ ldq r2, 16(r17)
+ ldq r3, 24(r17)
+ lda r17, 32(r17)
+ lda r16, -24(r16)
+ mulq r24, r19, r4 C U1
+ umulh r24, r19, r20 C U1
+ bgt r18, L(gt4)
+
+ mulq r1, r19, r5 C U1
+ umulh r1, r19, r21 C U1
+ mulq r2, r19, r6 C U1
+ umulh r2, r19, r22 C U1
+ mulq r3, r19, r7 C U1
+ br L(cj4)
+
+L(gt4): ldq r0, 0(r17)
+ mulq r1, r19, r5 C U1
+ umulh r1, r19, r21 C U1
+ ldq r1, 8(r17)
+ mulq r2, r19, r6 C U1
+ umulh r2, r19, r22 C U1
+ ldq r2, 16(r17)
+ mulq r3, r19, r7 C U1
+ br L(L0)
+
+C *** MAIN LOOP START ***
+ ALIGN(16)
+L(top): mulq r0, r19, r4 C U1
+ subq r8, r28, r8
+L(L3): umulh r0, r19, r20 C U1
+ cmpult r8, r5, r28
+ ldq r0, 0(r17)
+ subq r8, r5, r8
+ addq r21, r28, r28
+ stq r8, 0(r16)
+
+ mulq r1, r19, r5 C U1
+ subq r8, r28, r8
+L(L2): umulh r1, r19, r21 C U1
+ cmpult r8, r6, r28
+ ldq r1, 8(r17)
+ subq r8, r6, r8
+ addq r22, r28, r28
+ stq r8, 8(r16)
+
+ mulq r2, r19, r6 C U1
+ subq r8, r28, r8
+L(L1): umulh r2, r19, r22 C U1
+ cmpult r8, r7, r28
+ ldq r2, 16(r17)
+ subq r8, r7, r8
+ addq r23, r28, r28
+ stq r8, 16(r16)
+
+ mulq r3, r19, r7 C U1
+ subq r8, r28, r8
+L(L0): umulh r3, r19, r23 C U1
+ cmpult r8, r4, r28
+ ldq r3, 24(r17)
+ subq r8, r4, r8
+ addq r20, r28, r28
+ stq r8, 24(r16)
+
+ lda r18, -4(r18)
+ lda r17, 32(r17)
+ lda r16, 32(r16)
+ bgt r18, L(top)
+C *** MAIN LOOP END ***
+
+ mulq r0, r19, r4 C U1
+ subq r8, r28, r8
+L(cj7): umulh r0, r19, r20 C U1
+ cmpult r8, r5, r28
+ subq r8, r5, r8
+ addq r21, r28, r28
+ stq r8, 0(r16)
+ mulq r1, r19, r5 C U1
+ subq r8, r28, r8
+L(cj6): umulh r1, r19, r21 C U1
+ cmpult r8, r6, r28
+ subq r8, r6, r8
+ addq r22, r28, r28
+ stq r8, 8(r16)
+ mulq r2, r19, r6 C U1
+ subq r8, r28, r8
+L(cj5): umulh r2, r19, r22 C U1
+ cmpult r8, r7, r28
+ subq r8, r7, r8
+ addq r23, r28, r28
+ stq r8, 16(r16)
+ mulq r3, r19, r7 C U1
+ subq r8, r28, r8
+L(cj4): umulh r3, r19, r23 C U1
+ cmpult r8, r4, r28
+ subq r8, r4, r8
+ addq r20, r28, r28
+ stq r8, 24(r16)
+ subq r8, r28, r8
+L(cj3): cmpult r8, r5, r28
+ subq r8, r5, r8
+ addq r21, r28, r28
+ stq r8, 32(r16)
+ subq r8, r28, r8
+L(cj2): cmpult r8, r6, r28
+ subq r8, r6, r8
+ addq r22, r28, r28
+ stq r8, 40(r16)
+ subq r8, r28, r8
+L(cj1): cmpult r8, r7, r28
+ subq r8, r7, r8
+ addq r23, r28, r28
+ stq r8, 48(r16)
+ subq r8, r28, r0
+ ret r31, (r26), 1
+
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/cntlz.asm b/gmp/mpn/alpha/cntlz.asm
new file mode 100644
index 0000000000..25af19b131
--- /dev/null
+++ b/gmp/mpn/alpha/cntlz.asm
@@ -0,0 +1,55 @@
+dnl Alpha auxiliary for longlong.h's count_leading_zeros
+
+dnl Copyright 1997, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+ASM_START()
+EXTERN(__clz_tab)
+PROLOGUE(mpn_count_leading_zeros,gp)
+ cmpbge r31, r16, r1
+ LEA(r3,__clz_tab)
+ sra r1, 1, r1
+ xor r1, 127, r1
+ srl r16, 1, r16
+ addq r1, r3, r1
+ ldq_u r0, 0(r1)
+ lda r2, 64
+ extbl r0, r1, r0
+ s8subl r0, 8, r0
+ srl r16, r0, r16
+ addq r16, r3, r16
+ ldq_u r1, 0(r16)
+ extbl r1, r16, r1
+ subq r2, r1, r2
+ subq r2, r0, r0
+ ret r31, (r26),1
+EPILOGUE(mpn_count_leading_zeros)
+ASM_END()
diff --git a/gmp/mpn/alpha/com.asm b/gmp/mpn/alpha/com.asm
new file mode 100644
index 0000000000..f084ab5e96
--- /dev/null
+++ b/gmp/mpn/alpha/com.asm
@@ -0,0 +1,176 @@
+dnl Alpha mpn_com -- mpn one's complement.
+
+dnl Copyright 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C EV4: 4.75
+C EV5: 2.0
+C EV6: 1.5
+
+
+C mp_limb_t mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C For ev5 the main loop is 7 cycles plus 1 taken branch bubble, for a total
+C 2.0 c/l. In general, a pattern like this unrolled to N limbs per loop
+C will be 1.5+2/N c/l.
+C
+C 2 cycles of loop control are unavoidable, for pointer updates and the
+C taken branch bubble, but also since ldq cannot issue two cycles after stq
+C (and with a run of stqs that means neither of two cycles at the end of the
+C loop.
+C
+C The fbeq is forced into the second cycle of the loop using unops, since
+C the first time through it must wait for the cvtqt result. Once that
+C result is ready (a 1 cycle stall) then both the branch and following loads
+C can issue together.
+C
+C The main loop handles an odd count of limbs, being two limbs loaded before
+C each size test, plus one pipelined around from the previous iteration (or
+C setup in the entry sequence).
+C
+C An even number of limbs is handled by an explicit dst[0]=~src[0] in the
+C entry sequence, and an increment of the pointers. For an odd size there's
+C no increment and the first store in the loop (r24) is a repeat of dst[0].
+C
+C Note that the load for r24 after the possible pointer increment is done
+C before the explicit store to dst[0], in case src==dst.
+
+
+ASM_START()
+
+FLOAT64(L(dat), 2.0)
+
+ ALIGN(16)
+
+PROLOGUE(mpn_com,gp)
+
+ C r16 dst
+ C r17 src
+ C r18 size
+
+ lda r30, -16(r30) C temporary stack space
+ lda r7, -3(r18) C size - 3
+
+ ldq r20, 0(r17) C src[0]
+ srl r7, 1, r6 C (size-3)/2
+
+ stq r6, 8(r30) C (size-3)/2
+ and r7, 1, r5 C 1 if size even
+
+ LEA( r8, L(dat))
+ s8addq r5, r17, r17 C skip src[0] if even
+
+ ornot r31, r20, r20 C ~src[0]
+ unop
+
+ ldt f0, 8(r30) C (size-3)/2
+ ldq r24, 0(r17) C src[0 or 1]
+
+ stq r20, 0(r16) C dst[0]
+ s8addq r5, r16, r19 C skip dst[0] if even
+
+ ldt f1, 0(r8) C data 2.0
+ lda r30, 16(r30) C restore stack
+ unop
+ cvtqt f0, f0 C (size-3)/2 as float
+
+ ornot r31, r24, r24
+ blt r7, L(done_1) C if size<=2
+ unop
+ unop
+
+
+ C 16-byte alignment here
+L(top):
+ C r17 src, incrementing
+ C r19 dst, incrementing
+ C r24 dst[i] result, ready to store
+ C f0 (size-3)/2, decrementing
+ C f1 2.0
+
+ ldq r20, 8(r17) C src[i+1]
+ ldq r21, 16(r17) C src[i+2]
+ unop
+ unop
+
+ fbeq f0, L(done_2)
+ unop
+ ldq r22, 24(r17) C src[i+3]
+ ldq r23, 32(r17) C src[i+4]
+
+ stq r24, 0(r19) C dst[i]
+ ornot r31, r20, r20
+ subt f0, f1, f0 C count -= 2
+ unop
+
+ stq r20, 8(r19) C dst[i+1]
+ ornot r31, r21, r21
+ unop
+ unop
+
+ stq r21, 16(r19) C dst[i+2]
+ ornot r31, r22, r22
+
+ stq r22, 24(r19) C dst[i+3]
+ ornot r31, r23, r24
+
+ lda r17, 32(r17) C src += 4
+ lda r19, 32(r19) C dst += 4
+ unop
+ fbge f0, L(top)
+
+
+L(done_1):
+ C r19 &dst[size-1]
+ C r24 result for dst[size-1]
+
+ stq r24, 0(r19) C dst[size-1]
+ ret r31, (r26), 1
+
+
+L(done_2):
+ C r19 &dst[size-3]
+ C r20 src[size-2]
+ C r21 src[size-1]
+ C r24 result for dst[size-3]
+
+ stq r24, 0(r19) C dst[size-3]
+ ornot r31, r20, r20
+
+ stq r20, 8(r19) C dst[size-2]
+ ornot r31, r21, r21
+
+ stq r21, 16(r19) C dst[size-1]
+ ret r31, (r26), 1
+
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/copyd.asm b/gmp/mpn/alpha/copyd.asm
new file mode 100644
index 0000000000..b41b5366cc
--- /dev/null
+++ b/gmp/mpn/alpha/copyd.asm
@@ -0,0 +1,88 @@
+dnl Alpha mpn_copyd -- copy, decrementing.
+
+dnl Copyright 2002, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 4
+C EV5: 1.75
+C EV6: 1
+
+C INPUT PARAMETERS
+C rp r16
+C up r17
+C n r18
+
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+ s8addq r18,r16,r16 C E0
+ s8addq r18,r17,r17 C E1
+ lda r18,-8(r18) C E0
+ blt r18,$Lend C E1
+$Loop: ldq r0,-8(r17) C E0
+ ldq r1,-16(r17) C E1
+ ldq r2,-24(r17) C E0
+ ldq r3,-32(r17) C E1
+ ldq r4,-40(r17) C E0
+ ldq r5,-48(r17) C E1
+ ldq r6,-56(r17) C E0
+ ldq r7,-64(r17) C E1
+ stq r0,-8(r16) C E0
+ lda r17,-64(r17) C E1
+ stq r1,-16(r16) C E0
+ bis r31, r31, r31 C E1
+ stq r2,-24(r16) C E0
+ lda r18,-8(r18) C E1
+ stq r3,-32(r16) C E0
+ bis r31, r31, r31 C E1
+ stq r4,-40(r16) C E0
+ bis r31, r31, r31 C E1
+ stq r5,-48(r16) C E0
+ bis r31, r31, r31 C E1
+ stq r6,-56(r16) C E0
+ bis r31, r31, r31 C E1
+ stq r7,-64(r16) C E0
+ lda r16,-64(r16) C E1
+ bge r18,$Loop C E1
+$Lend: lda r18,7(r18) C E0
+ blt r18,$Lret C E1
+ ldq r0,-8(r17) C E0
+ beq r18,$Lend0 C E1
+$Loop0: stq r0,-8(r16) C E0
+ lda r16,-8(r16) C E1
+ ldq r0,-16(r17) C E0
+ lda r18,-1(r18) C E1
+ lda r17,-8(r17) C E0
+ bgt r18,$Loop0 C E1
+$Lend0: stq r0,-8(r16) C E0
+$Lret: ret r31,(r26),1 C E1
+EPILOGUE(mpn_copyd)
+ASM_END()
diff --git a/gmp/mpn/alpha/copyi.asm b/gmp/mpn/alpha/copyi.asm
new file mode 100644
index 0000000000..f7e2ad6f6a
--- /dev/null
+++ b/gmp/mpn/alpha/copyi.asm
@@ -0,0 +1,86 @@
+dnl Alpha mpn_copyi -- copy, incrementing.
+
+dnl Copyright 2002, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 4
+C EV5: 1.75
+C EV6: 1
+
+C INPUT PARAMETERS
+C rp r16
+C up r17
+C n r18
+
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+ lda r18,-8(r18) C E0
+ blt r18,$Lend C E1
+$Loop: ldq r0,0(r17) C E0
+ ldq r1,8(r17) C E1
+ ldq r2,16(r17) C E0
+ ldq r3,24(r17) C E1
+ ldq r4,32(r17) C E0
+ ldq r5,40(r17) C E1
+ ldq r6,48(r17) C E0
+ ldq r7,56(r17) C E1
+ stq r0,0(r16) C E0
+ lda r17,64(r17) C E1
+ stq r1,8(r16) C E0
+ bis r31, r31, r31 C E1
+ stq r2,16(r16) C E0
+ lda r18,-8(r18) C E1
+ stq r3,24(r16) C E0
+ bis r31, r31, r31 C E1
+ stq r4,32(r16) C E0
+ bis r31, r31, r31 C E1
+ stq r5,40(r16) C E0
+ bis r31, r31, r31 C E1
+ stq r6,48(r16) C E0
+ bis r31, r31, r31 C E1
+ stq r7,56(r16) C E0
+ lda r16,64(r16) C E1
+ bge r18,$Loop C E1
+$Lend: lda r18,7(r18) C E0
+ blt r18,$Lret C E1
+ ldq r0,0(r17) C E0
+ beq r18,$Lend0 C E1
+$Loop0: stq r0,0(r16) C E0
+ lda r16,8(r16) C E1
+ ldq r0,8(r17) C E0
+ lda r18,-1(r18) C E1
+ lda r17,8(r17) C E0
+ bgt r18,$Loop0 C E1
+$Lend0: stq r0,0(r16) C E0
+$Lret: ret r31,(r26),1 C E1
+EPILOGUE(mpn_copyi)
+ASM_END()
diff --git a/gmp/mpn/alpha/default.m4 b/gmp/mpn/alpha/default.m4
new file mode 100644
index 0000000000..8fe7c4e122
--- /dev/null
+++ b/gmp/mpn/alpha/default.m4
@@ -0,0 +1,127 @@
+divert(-1)
+
+dnl m4 macros for alpha assembler (everywhere except unicos).
+
+
+dnl Copyright 2000, 2002-2004, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+dnl Usage: ASM_START()
+define(`ASM_START',
+m4_assert_numargs(0)
+` .set noreorder
+ .set noat')
+
+dnl Usage: X(value)
+define(`X',
+m4_assert_numargs(1)
+`0x$1')
+
+dnl Usage: FLOAT64(label,value)
+define(`FLOAT64',
+m4_assert_numargs(2)
+` .align 3
+$1: .t_floating $2')
+
+
+dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,gp|noalign])
+dnl EPILOGUE_cpu(GSYM_PREFIX`'foo)
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs_range(1,2)
+`ifelse(`$2',gp,,
+`ifelse(`$2',noalign,,
+`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter
+')')')')dnl
+ .text
+ifelse(`$2',noalign,,` ALIGN(16)')
+ .globl $1
+ .ent $1
+$1:
+ .frame r30,0,r26,0
+ifelse(`$2',gp,` ldgp r29, 0(r27)
+`$'$1..ng:')
+ .prologue ifelse(`$2',gp,1,0)')
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1)
+` .end $1')
+
+
+dnl Usage: LDGP(dst,src)
+dnl
+dnl Emit an "ldgp dst,src", but only if the system uses a GOT.
+
+define(LDGP,
+m4_assert_numargs(2)
+`ldgp `$1', `$2'')
+
+
+dnl Usage: EXTERN(variable_name)
+define(`EXTERN',
+m4_assert_numargs(1)
+)
+
+dnl Usage: r0 ... r31
+dnl f0 ... f31
+dnl
+dnl Map register names r0 to $0, and f0 to $f0, etc.
+dnl This is needed on all systems but Unicos
+dnl
+dnl defreg() is used to protect the $ in $0 (otherwise it would represent a
+dnl macro argument). Double quoting is used to protect the f0 in $f0
+dnl (otherwise it would be an infinite recursion).
+
+forloop(i,0,31,`defreg(`r'i,$i)')
+forloop(i,0,31,`deflit(`f'i,``$f''i)')
+
+
+dnl Usage: DATASTART(name,align) or DATASTART(name)
+dnl DATAEND()
+
+define(`DATASTART',
+m4_assert_numargs_range(1,2)
+` RODATA
+ ALIGN(ifelse($#,1,2,$2))
+$1:')
+define(`DATAEND',
+m4_assert_numargs(0)
+)
+
+dnl Load a symbolic address into a register
+define(`LEA',
+m4_assert_numargs(2)
+`lda $1, $2')
+
+dnl Usage: ASM_END()
+define(`ASM_END',
+m4_assert_numargs(0)
+)
+
+divert
diff --git a/gmp/mpn/alpha/dive_1.c b/gmp/mpn/alpha/dive_1.c
new file mode 100644
index 0000000000..88b82db2f7
--- /dev/null
+++ b/gmp/mpn/alpha/dive_1.c
@@ -0,0 +1,115 @@
+/* Alpha mpn_divexact_1 -- mpn by limb exact division.
+
+ THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST
+ CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
+ FUTURE GNU MP RELEASES.
+
+Copyright 2000-2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* cycles/limb
+ EV4: 47.0
+ EV5: 30.0
+ EV6: 15.0
+*/
+
+
+/* The dependent chain is as follows (the same as modexact), and this is
+ what the code runs as.
+
+ ev4 ev5 ev6
+ 1 1 1 sub y = x - h
+ 23 13 7 mulq q = y * inverse
+ 23 15 7 umulh h = high (q * d)
+ -- -- --
+ 47 30 15
+
+ The time to load src[i+1] and establish x hides under the umulh latency. */
+
+void
+mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor)
+{
+ mp_limb_t inverse, lshift_mask, s, sr, s_next, c, h, x, y, q, dummy;
+ unsigned rshift, lshift;
+
+ ASSERT (size >= 1);
+ ASSERT (divisor != 0);
+ ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size));
+ ASSERT_MPN (src, size);
+ ASSERT_LIMB (divisor);
+
+ s_next = *src++; /* src[0] */
+
+ rshift = 0;
+ lshift_mask = 0;
+ if ((divisor & 1) == 0)
+ {
+ count_trailing_zeros (rshift, divisor);
+ lshift_mask = MP_LIMB_T_MAX;
+ divisor >>= rshift;
+ }
+
+ binvert_limb (inverse, divisor);
+ lshift = 64 - rshift;
+
+ c = 0;
+ h = 0;
+ sr = s_next >> rshift;
+
+ size--;
+ if (LIKELY (size != 0))
+ {
+ do
+ {
+ s_next = *src++; /* src[i+1] */
+ s = sr | ((s_next << lshift) & lshift_mask);
+ x = s - c;
+ c = s < c;
+ sr = s_next >> rshift;
+
+ y = x - h;
+ c += (x < h);
+ q = y * inverse;
+ *dst++ = q;
+ umul_ppmm (h, dummy, q, divisor);
+
+ size--;
+ }
+ while (size != 0);
+ }
+
+ x = sr - c;
+ y = x - h;
+ q = y * inverse;
+ *dst = q; /* dst[size-1] */
+}
diff --git a/gmp/mpn/alpha/divrem_2.asm b/gmp/mpn/alpha/divrem_2.asm
new file mode 100644
index 0000000000..046b246a95
--- /dev/null
+++ b/gmp/mpn/alpha/divrem_2.asm
@@ -0,0 +1,177 @@
+dnl Alpha mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl Copyright 2007, 2008, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C norm frac
+C ev4
+C ev5 70 70
+C ev6 29 29
+
+C TODO
+C * Perhaps inline mpn_invert_limb, that would allow us to not save/restore
+C any registers (thus save ~10 cycles per call).
+C * Use negated d1 and/or d0 to speed carry propagation. Might save a cycle
+C or two.
+C * Check cluster delays (for ev6). We very likely could save some cycles.
+C * Use branch-free code for computing di.
+C * CAVEAT: We rely on r19 not being clobbered by mpn_invert_limb call.
+
+C INPUT PARAMETERS
+define(`qp', `r16')
+define(`fn', `r17')
+define(`up_param', `r18')
+define(`un_param', `r19')
+define(`dp', `r20')
+
+ASM_START()
+PROLOGUE(mpn_divrem_2,gp)
+ lda r30, -80(r30)
+ stq r26, 0(r30)
+ stq r9, 8(r30)
+ stq r10, 16(r30)
+ stq r11, 24(r30)
+ stq r12, 32(r30)
+ stq r13, 40(r30)
+C stq r14, 48(r30)
+ stq r15, 56(r30)
+ .prologue 1
+ stq r16, 64(r30)
+ bis r31, r17, r15
+ s8addq r19, r18, r13
+ lda r13, -24(r13)
+ ldq r12, 8(r20)
+ ldq r10, 0(r20)
+ ldq r11, 16(r13)
+ ldq r9, 8(r13)
+
+ bis r31, r31, r3 C most_significant_q_limb = 0
+ cmpult r11, r12, r1
+ bne r1, L(L8)
+ cmpule r11, r12, r1
+ cmpult r9, r10, r2
+ and r1, r2, r1
+ bne r1, L(L8)
+ subq r11, r12, r11
+ subq r11, r2, r11
+ subq r9, r10, r9
+ lda r3, 1(r31) C most_significant_q_limb = 1
+L(L8): stq r3, 72(r30)
+
+ addq r15, r19, r19
+ lda r19, -3(r19)
+ blt r19, L(L10)
+ bis r31, r12, r16
+ jsr r26, mpn_invert_limb
+ LDGP( r29, 0(r26))
+ mulq r0, r12, r4 C t0 = LO(di * d1)
+ umulh r0, r10, r2 C s1 = HI(di * d0)
+ addq r4, r10, r4 C t0 += d0
+ cmpule r10, r4, r7 C (t0 < d0)
+ addq r4, r2, r4 C t0 += s1
+ cmpult r4, r2, r1
+ subq r1, r7, r7 C t1 (-1, 0, or 1)
+ blt r7, L(L42)
+L(L22):
+ lda r0, -1(r0) C di--
+ cmpult r4, r12, r1 C cy for: t0 -= d1 (below)
+ subq r7, r1, r7 C t1 -= cy
+ subq r4, r12, r4 C t0 -= d1
+ bge r7, L(L22)
+L(L42):
+ ldq r16, 64(r30)
+ s8addq r19, r16, r16
+ ALIGN(16)
+L(loop):
+ mulq r11, r0, r5 C q0 (early)
+ umulh r11, r0, r6 C q (early)
+ addq r5, r9, r8 C q0 += n1
+ addq r6, r11, r6 C q += n2
+ cmpult r8, r5, r1 C cy for: q0 += n1
+ addq r6, r1, r6 C q += cy
+ unop
+ mulq r12, r6, r1 C LO(d1 * q)
+ umulh r10, r6, r7 C t1 = HI(d0 * q)
+ subq r9, r1, r9 C n1 -= LO(d1 * q)
+ mulq r10, r6, r4 C t0 = LO(d0 * q)
+ unop
+ cmple r15, r19, r5 C condition and n0...
+ beq r5, L(L31)
+ ldq r5, 0(r13)
+ lda r13, -8(r13)
+L(L31): subq r9, r12, r9 C n1 -= d1
+ cmpult r5, r10, r1 C
+ subq r9, r1, r9 C
+ subq r5, r10, r5 C n0 -= d0
+ subq r9, r7, r9 C n1 -= t0
+ cmpult r5, r4, r1 C
+ subq r9, r1, r2 C
+ subq r5, r4, r5 C n0 -= t1
+ cmpult r2, r8, r1 C (n1 < q0)
+ addq r6, r1, r6 C q += cond
+ lda r1, -1(r1) C -(n1 >= q0)
+ and r1, r10, r4 C
+ addq r5, r4, r9 C n0 += mask & d0
+ and r1, r12, r1 C
+ cmpult r9, r5, r11 C cy for: n0 += mask & d0
+ addq r2, r1, r1 C n1 += mask & d1
+ addq r1, r11, r11 C n1 += cy
+ cmpult r11, r12, r1 C
+ beq r1, L(fix) C
+L(bck): stq r6, 0(r16)
+ lda r16, -8(r16)
+ lda r19, -1(r19)
+ bge r19, L(loop)
+
+L(L10): stq r9, 8(r13)
+ stq r11, 16(r13)
+ ldq r0, 72(r30)
+ ldq r26, 0(r30)
+ ldq r9, 8(r30)
+ ldq r10, 16(r30)
+ ldq r11, 24(r30)
+ ldq r12, 32(r30)
+ ldq r13, 40(r30)
+C ldq r14, 48(r30)
+ ldq r15, 56(r30)
+ lda r30, 80(r30)
+ ret r31, (r26), 1
+
+L(fix): cmpule r11, r12, r1
+ cmpult r9, r10, r2
+ and r1, r2, r1
+ bne r1, L(bck)
+ subq r11, r12, r11
+ subq r11, r2, r11
+ subq r9, r10, r9
+ lda r6, 1(r6)
+ br L(bck)
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev5/diveby3.asm b/gmp/mpn/alpha/ev5/diveby3.asm
new file mode 100644
index 0000000000..3758188e02
--- /dev/null
+++ b/gmp/mpn/alpha/ev5/diveby3.asm
@@ -0,0 +1,332 @@
+dnl Alpha mpn_divexact_by3c -- mpn division by 3, expecting no remainder.
+
+dnl Copyright 2004, 2005, 2009 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 22
+C EV5: 11.5
+C EV6: 6.3 Note that mpn_bdiv_dbm1c is faster
+
+C TODO
+C * Remove the unops, they benefit just ev6, which no longer uses this file.
+C * Try prefetch for destination, using lds.
+C * Improve feed-in code, by moving initial mulq earlier; make initial load
+C to u0/u0 to save some copying.
+C * Combine u0 and u2, u1 and u3.
+
+C INPUT PARAMETERS
+define(`rp', `r16')
+define(`up', `r17')
+define(`n', `r18')
+define(`cy', `r19')
+
+ASM_START()
+
+DATASTART(L(LC),8)
+ .quad 0xAAAAAAAAAAAAAAAB
+ .quad 0x5555555555555555
+ .quad 0xAAAAAAAAAAAAAAAA
+DATAEND()
+
+define(`xAAAAAAAAAAAAAAAB', `r20')
+define(`x5555555555555555', `r21')
+define(`xAAAAAAAAAAAAAAAA', `r22')
+define(`u0', `r0') define(`u1', `r1')
+define(`u2', `r2') define(`u3', `r3')
+define(`l0', `r25') define(`x', `r8')
+define(`q0', `r4') define(`q1', `r5')
+define(`p6', `r6') define(`p7', `r7')
+define(`t0', `r23') define(`t1', `r24')
+define(`cymask',`r28')
+
+
+PROLOGUE(mpn_divexact_by3c,gp)
+
+ ldq r28, 0(up) C load first limb early
+
+C Put magic constants in registers
+ lda r0, L(LC)
+ ldq xAAAAAAAAAAAAAAAB, 0(r0)
+ ldq x5555555555555555, 8(r0)
+ ldq xAAAAAAAAAAAAAAAA, 16(r0)
+
+C Compute initial l0 value
+ cmpeq cy, 1, p6
+ cmpeq cy, 2, p7
+ negq p6, p6
+ and p6, x5555555555555555, l0
+ cmovne p7, xAAAAAAAAAAAAAAAA, l0
+
+C Feed-in depending on (n mod 4)
+ and n, 3, r8
+ lda n, -3(n)
+ cmpeq r8, 1, r4
+ cmpeq r8, 2, r5
+ bne r4, $Lb01
+ bne r5, $Lb10
+ beq r8, $Lb00
+
+$Lb11: ldq u3, 8(up)
+ lda up, -24(up)
+ lda rp, -24(rp)
+ mulq r28, xAAAAAAAAAAAAAAAB, q0
+ mov r28, u2
+ br r31, $L11
+
+$Lb00: ldq u2, 8(up)
+ lda up, -16(up)
+ lda rp, -16(rp)
+ mulq r28, xAAAAAAAAAAAAAAAB, q1
+ mov r28, u1
+ br r31, $L00
+
+$Lb01: lda rp, -8(rp)
+ mulq r28, xAAAAAAAAAAAAAAAB, q0
+ mov r28, u0
+ blt n, $Lcj1
+ ldq u1, 8(up)
+ lda up, -8(up)
+ br r31, $L01
+
+$Lb10: ldq u0, 8(up)
+ mulq r28, xAAAAAAAAAAAAAAAB, q1
+ mov r28, u3
+ blt n, $Lend
+
+ ALIGN(16)
+$Ltop:
+C 0
+ cmpult u3, cy, cy C L0
+ mulq u0, xAAAAAAAAAAAAAAAB, q0 C U1
+ ldq u1, 16(up) C L1
+ addq q1, l0, x C U0
+C 1
+ negq cy, cymask C L0
+ unop C U1
+ unop C L1
+ cmpult x5555555555555555, x, p6 C U0
+C 2
+ cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1
+ unop
+ unop
+ negq p6, t0 C L0
+C 3
+ negq p7, t1 C L0
+ and cymask, x5555555555555555, l0 C U1
+ addq p6, cy, cy
+ and t0, x5555555555555555, t0
+C 4
+ and t1, x5555555555555555, t1
+ addq p7, cy, cy
+ unop
+ addq t0, l0, l0
+C 5
+ addq t1, l0, l0
+ unop
+ stq x, 0(rp) C L1
+ unop
+$L01:
+C 0
+ cmpult u0, cy, cy C L0
+ mulq u1, xAAAAAAAAAAAAAAAB, q1 C U1
+ ldq u2, 24(up) C L1
+ addq q0, l0, x C U0
+C 1
+ negq cy, cymask C L0
+ unop C U1
+ unop C L1
+ cmpult x5555555555555555, x, p6 C U0
+C 2
+ cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1
+ unop
+ unop
+ negq p6, t0 C L0
+C 3
+ negq p7, t1 C L0
+ and cymask, x5555555555555555, l0 C U1
+ addq p6, cy, cy
+ and t0, x5555555555555555, t0
+C 4
+ and t1, x5555555555555555, t1
+ addq p7, cy, cy
+ unop
+ addq t0, l0, l0
+C 5
+ addq t1, l0, l0
+ unop
+ stq x, 8(rp) C L1
+ unop
+$L00:
+C 0
+ cmpult u1, cy, cy C L0
+ mulq u2, xAAAAAAAAAAAAAAAB, q0 C U1
+ ldq u3, 32(up) C L1
+ addq q1, l0, x C U0
+C 1
+ negq cy, cymask C L0
+ unop C U1
+ unop C L1
+ cmpult x5555555555555555, x, p6 C U0
+C 2
+ cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1
+ unop
+ unop
+ negq p6, t0 C L0
+C 3
+ negq p7, t1 C L0
+ and cymask, x5555555555555555, l0 C U1
+ addq p6, cy, cy
+ and t0, x5555555555555555, t0
+C 4
+ and t1, x5555555555555555, t1
+ addq p7, cy, cy
+ unop
+ addq t0, l0, l0
+C 5
+ addq t1, l0, l0
+ unop
+ stq x, 16(rp) C L1
+ unop
+$L11:
+C 0
+ cmpult u2, cy, cy C L0
+ mulq u3, xAAAAAAAAAAAAAAAB, q1 C U1
+ ldq u0, 40(up) C L1
+ addq q0, l0, x C U0
+C 1
+ negq cy, cymask C L0
+ unop C U1
+ unop C L1
+ cmpult x5555555555555555, x, p6 C U0
+C 2
+ cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1
+ lda n, -4(n) C L1 bookkeeping
+ unop
+ negq p6, t0 C L0
+C 3
+ negq p7, t1 C L0
+ and cymask, x5555555555555555, l0 C U1
+ addq p6, cy, cy
+ and t0, x5555555555555555, t0
+C 4
+ and t1, x5555555555555555, t1
+ addq p7, cy, cy
+ unop
+ addq t0, l0, l0
+C 5
+ addq t1, l0, l0
+ unop
+ stq x, 24(rp) C L1
+ lda up, 32(up)
+C
+ ldl r31, 256(up) C prefetch
+ unop
+ lda rp, 32(rp)
+ bge n, $Ltop C U1
+C *** MAIN LOOP END ***
+$Lend:
+
+ cmpult u3, cy, cy C L0
+ mulq u0, xAAAAAAAAAAAAAAAB, q0 C U1
+ unop
+ addq q1, l0, x C U0
+C 1
+ negq cy, cymask C L0
+ unop C U1
+ unop C L1
+ cmpult x5555555555555555, x, p6 C U0
+C 2
+ cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1
+ unop
+ unop
+ negq p6, t0 C L0
+C 3
+ negq p7, t1 C L0
+ and cymask, x5555555555555555, l0 C U1
+ addq p6, cy, cy
+ and t0, x5555555555555555, t0
+C 4
+ and t1, x5555555555555555, t1
+ addq p7, cy, cy
+ unop
+ addq t0, l0, l0
+C 5
+ addq t1, l0, l0
+ unop
+ stq x, 0(rp) C L1
+ unop
+$Lcj1:
+ cmpult u0, cy, cy C L0
+ addq q0, l0, x C U0
+ cmpult x5555555555555555, x, p6 C U0
+ cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1
+ addq p6, cy, cy
+ addq p7, cy, r0
+ stq x, 8(rp) C L1
+
+ ret r31,(r26),1
+EPILOGUE()
+ASM_END()
+
+C This is useful for playing with various schedules.
+C Expand as: one(0)one(1)one(2)one(3)
+define(`one',`
+C 0
+ cmpult `$'eval(($1+3)%4), cy, cy C L0
+ mulq `$'$1, xAAAAAAAAAAAAAAAB, `$'eval(4+$1%2) C U1
+ ldq `$'eval(($1+1)%4), eval($1*8+16)(up) C L1
+ addq `$'eval(4+($1+1)%2), l0, x C U0
+C 1
+ negq cy, cymask C L0
+ unop C U1
+ unop C L1
+ cmpult x5555555555555555, x, p6 C U0
+C 2
+ cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1
+ unop
+ unop
+ negq p6, t0 C L0
+C 3
+ negq p7, t1 C L0
+ and cymask, x5555555555555555, l0 C U1
+ addq p6, cy, cy
+ and t0, x5555555555555555, t0
+C 4
+ and t1, x5555555555555555, t1
+ addq p7, cy, cy
+ unop
+ addq t0, l0, l0
+C 5
+ addq t1, l0, l0
+ unop
+ stq x, eval($1*8)(rp) C L1
+ unop
+')
diff --git a/gmp/mpn/alpha/ev5/gmp-mparam.h b/gmp/mpn/alpha/ev5/gmp-mparam.h
new file mode 100644
index 0000000000..b560c20afe
--- /dev/null
+++ b/gmp/mpn/alpha/ev5/gmp-mparam.h
@@ -0,0 +1,187 @@
+/* Alpha EV5 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004, 2005, 2008-2010, 2014 Free
+Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 600 MHz 21164A */
+/* FFT tuning limit = 5000000 */
+/* Generated by tuneup.c, 2014-03-12, gcc 4.5 */
+
+#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */
+#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1P_METHOD 2
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 6
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 22
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 15
+#define USE_PREINV_DIVREM_1 1 /* preinv always */
+#define DIV_QR_1N_PI1_METHOD 1
+#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD 76
+
+#define MUL_TOOM22_THRESHOLD 14
+#define MUL_TOOM33_THRESHOLD 50
+#define MUL_TOOM44_THRESHOLD 118
+#define MUL_TOOM6H_THRESHOLD 157
+#define MUL_TOOM8H_THRESHOLD 236
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 77
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 81
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 56
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 70
+
+#define SQR_BASECASE_THRESHOLD 0 /* always */
+#define SQR_TOOM2_THRESHOLD 22
+#define SQR_TOOM3_THRESHOLD 73
+#define SQR_TOOM4_THRESHOLD 178
+#define SQR_TOOM6_THRESHOLD 0 /* always */
+#define SQR_TOOM8_THRESHOLD 260
+
+#define MULMID_TOOM42_THRESHOLD 18
+
+#define MULMOD_BNM1_THRESHOLD 9
+#define SQRMOD_BNM1_THRESHOLD 12
+
+#define MUL_FFT_MODF_THRESHOLD 284 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 284, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \
+ { 7, 5}, { 15, 6}, { 13, 7}, { 7, 6}, \
+ { 15, 7}, { 8, 6}, { 17, 7}, { 13, 8}, \
+ { 7, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \
+ { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \
+ { 19, 9}, { 11, 8}, { 25,10}, { 7, 9}, \
+ { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \
+ { 23, 8}, { 47,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \
+ { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \
+ { 95,10}, { 55,11}, { 31,10}, { 63, 8}, \
+ { 255, 7}, { 511,10}, { 71, 9}, { 143, 8}, \
+ { 287, 7}, { 575, 9}, { 159, 8}, { 319,11}, \
+ { 47,12}, { 31,11}, { 63, 9}, { 255, 8}, \
+ { 511,10}, { 143, 9}, { 287,11}, { 79,10}, \
+ { 159, 9}, { 319,10}, { 175, 9}, { 351, 8}, \
+ { 703,10}, { 191, 9}, { 383,10}, { 207, 9}, \
+ { 415,12}, { 63,10}, { 255,11}, { 143,10}, \
+ { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \
+ { 639,11}, { 175,12}, { 95,11}, { 191,10}, \
+ { 383,11}, { 207,10}, { 415,11}, { 223,13}, \
+ { 63,11}, { 287,10}, { 575,12}, { 159,11}, \
+ { 319,10}, { 639,11}, { 351,12}, { 191,11}, \
+ { 415,12}, { 223,11}, { 447,10}, { 895,11}, \
+ { 479,12}, { 287,11}, { 575,12}, { 351,13}, \
+ { 191,12}, { 479,13}, { 255,12}, { 575,13}, \
+ { 319,12}, { 703,13}, { 383,12}, { 831,13}, \
+ { 447,14}, { 255,13}, { 8192,14}, { 16384,15}, \
+ { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
+ { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+ {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 121
+#define MUL_FFT_THRESHOLD 4224
+
+#define SQR_FFT_MODF_THRESHOLD 240 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 240, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \
+ { 14, 5}, { 29, 7}, { 9, 6}, { 19, 7}, \
+ { 13, 6}, { 27, 8}, { 7, 7}, { 21, 8}, \
+ { 11, 7}, { 29, 8}, { 19, 9}, { 11, 8}, \
+ { 27,10}, { 7, 9}, { 15, 8}, { 33, 9}, \
+ { 19, 8}, { 39, 9}, { 23, 8}, { 47,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47,11}, { 31,10}, { 63, 9}, \
+ { 127, 8}, { 255,10}, { 71, 9}, { 143, 8}, \
+ { 287,10}, { 79,11}, { 47,12}, { 31,11}, \
+ { 63,10}, { 127, 9}, { 255,10}, { 143, 9}, \
+ { 287,11}, { 79,10}, { 159, 9}, { 319,10}, \
+ { 175,11}, { 95,10}, { 191, 9}, { 383,10}, \
+ { 207, 9}, { 415,11}, { 111,10}, { 223,12}, \
+ { 63,11}, { 175,12}, { 95,11}, { 207,13}, \
+ { 63,12}, { 127,11}, { 287,12}, { 159,11}, \
+ { 351,12}, { 191,11}, { 415,12}, { 223,11}, \
+ { 447,13}, { 127,12}, { 351,13}, { 191,12}, \
+ { 383,11}, { 767,12}, { 415,11}, { 831,12}, \
+ { 447,14}, { 127,13}, { 255,12}, { 511,11}, \
+ { 1087,12}, { 575,13}, { 319,12}, { 703,13}, \
+ { 383,12}, { 831,13}, { 447,14}, { 255,13}, \
+ { 511,12}, { 1023,13}, { 8192,14}, { 16384,15}, \
+ { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
+ { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+ {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 105
+#define SQR_FFT_THRESHOLD 3968
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 45
+#define MULLO_MUL_N_THRESHOLD 8397
+
+#define DC_DIV_QR_THRESHOLD 47
+#define DC_DIVAPPR_Q_THRESHOLD 168
+#define DC_BDIV_QR_THRESHOLD 47
+#define DC_BDIV_Q_THRESHOLD 110
+
+#define INV_MULMOD_BNM1_THRESHOLD 26
+#define INV_NEWTON_THRESHOLD 189
+#define INV_APPR_THRESHOLD 181
+
+#define BINV_NEWTON_THRESHOLD 196
+#define REDC_1_TO_REDC_N_THRESHOLD 51
+
+#define MU_DIV_QR_THRESHOLD 1558
+#define MU_DIVAPPR_Q_THRESHOLD 1558
+#define MUPI_DIV_QR_THRESHOLD 90
+#define MU_BDIV_QR_THRESHOLD 855
+#define MU_BDIV_Q_THRESHOLD 1078
+
+#define POWM_SEC_TABLE 1,16,90,452,1221
+
+#define MATRIX22_STRASSEN_THRESHOLD 11
+#define HGCD_THRESHOLD 99
+#define HGCD_APPR_THRESHOLD 103
+#define HGCD_REDUCE_THRESHOLD 2899
+#define GCD_DC_THRESHOLD 283
+#define GCDEXT_DC_THRESHOLD 201
+#define JACOBI_BASE_METHOD 3
+
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 28
+#define SET_STR_DC_THRESHOLD 426
+#define SET_STR_PRECOMPUTE_THRESHOLD 1505
+
+#define FAC_DSC_THRESHOLD 1404
+#define FAC_ODD_THRESHOLD 0 /* always */
diff --git a/gmp/mpn/alpha/ev6/add_n.asm b/gmp/mpn/alpha/ev6/add_n.asm
new file mode 100644
index 0000000000..9261f31b8a
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/add_n.asm
@@ -0,0 +1,283 @@
+dnl Alpha ev6 mpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl store sum in a third limb vector.
+
+dnl Copyright 2000, 2003, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: ?
+C EV5: 5.4
+C EV6: 2.125
+
+C INPUT PARAMETERS
+C rp r16
+C up r17
+C vp r18
+C n r19
+C cy r20 (for mpn_add_nc)
+
+C TODO
+C Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
+C Use multi-pronged feed-in.
+C Perform additional micro-tuning
+
+C This code was written in cooperation with ev6 pipeline expert Steve Root.
+
+C Pair loads and stores where possible
+C Store pairs oct-aligned where possible (didn't need it here)
+C Stores are delayed every third cycle
+C Loads and stores are delayed by fills
+C U stays still, put code there where possible (note alternation of U1 and U0)
+C L moves because of loads and stores
+C Note dampers in L to limit damage
+
+C This odd-looking optimization expects that were having random bits in our
+C data, so that a pure zero result is unlikely. so we penalize the unlikely
+C case to help the common case.
+
+define(`u0', `r0') define(`u1', `r3')
+define(`v0', `r1') define(`v1', `r4')
+
+define(`cy0', `r20') define(`cy1', `r21')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc)
+
+ASM_START()
+PROLOGUE(mpn_add_nc)
+ br r31, $entry
+EPILOGUE()
+PROLOGUE(mpn_add_n)
+ bis r31, r31, cy0 C clear carry in
+$entry: cmpult r19, 5, r22 C L1 move counter
+ ldq u1, 0(r17) C L0 get next ones
+ ldq v1, 0(r18) C L1
+ bne r22, $Lsmall
+
+ ldq u0, 8(r17) C L0 get next ones
+ ldq v0, 8(r18) C L1
+ addq u1, v1, r5 C U0 add two data
+
+ cmpult r5, v1, r23 C U0 did it carry
+ ldq u1, 16(r17) C L0 get next ones
+ ldq v1, 16(r18) C L1
+
+ addq u0, v0, r8 C U1 add two data
+ addq r5, cy0, r5 C U0 carry in
+
+ cmpult r8, v0, r22 C U1 did it carry
+ beq r5, $fix5f C U0 fix exact zero
+$ret5f: ldq u0, 24(r17) C L0 get next ones
+ ldq v0, 24(r18) C L1
+
+ addq r8, r23, r8 C U1 carry from last
+ addq u1, v1, r7 C U0 add two data
+
+ beq r8, $fix6f C U1 fix exact zero
+$ret6f: cmpult r7, v1, r23 C U0 did it carry
+ ldq u1, 32(r17) C L0 get next ones
+ ldq v1, 32(r18) C L1
+
+ lda r17, 40(r17) C L0 move pointer
+ lda r18, 40(r18) C L1 move pointer
+
+ lda r16, -8(r16)
+ lda r19, -13(r19) C L1 move counter
+ blt r19, $Lend C U1 loop control
+
+
+C Main loop. 8-way unrolled.
+ ALIGN(16)
+$Loop: addq u0, v0, r2 C U1 add two data
+ addq r7, r22, r7 C U0 add in carry
+ stq r5, 8(r16) C L0 put an answer
+ stq r8, 16(r16) C L1 pair
+
+ cmpult r2, v0, cy1 C U1 did it carry
+ beq r7, $fix7 C U0 fix exact 0
+$ret7: ldq u0, 0(r17) C L0 get next ones
+ ldq v0, 0(r18) C L1
+
+ bis r31, r31, r31 C L damp out
+ addq r2, r23, r2 C U1 carry from last
+ bis r31, r31, r31 C L moves in L !
+ addq u1, v1, r5 C U0 add two data
+
+ beq r2, $fix0 C U1 fix exact zero
+$ret0: cmpult r5, v1, cy0 C U0 did it carry
+ ldq u1, 8(r17) C L0 get next ones
+ ldq v1, 8(r18) C L1
+
+ addq u0, v0, r8 C U1 add two data
+ addq r5, cy1, r5 C U0 carry from last
+ stq r7, 24(r16) C L0 store pair
+ stq r2, 32(r16) C L1
+
+ cmpult r8, v0, r22 C U1 did it carry
+ beq r5, $fix1 C U0 fix exact zero
+$ret1: ldq u0, 16(r17) C L0 get next ones
+ ldq v0, 16(r18) C L1
+
+ lda r16, 64(r16) C L0 move pointer
+ addq r8, cy0, r8 C U1 carry from last
+ lda r19, -8(r19) C L1 move counter
+ addq u1, v1, r7 C U0 add two data
+
+ beq r8, $fix2 C U1 fix exact zero
+$ret2: cmpult r7, v1, r23 C U0 did it carry
+ ldq u1, 24(r17) C L0 get next ones
+ ldq v1, 24(r18) C L1
+
+ addq u0, v0, r2 C U1 add two data
+ addq r7, r22, r7 C U0 add in carry
+ stq r5, -24(r16) C L0 put an answer
+ stq r8, -16(r16) C L1 pair
+
+ cmpult r2, v0, cy1 C U1 did it carry
+ beq r7, $fix3 C U0 fix exact 0
+$ret3: ldq u0, 32(r17) C L0 get next ones
+ ldq v0, 32(r18) C L1
+
+ bis r31, r31, r31 C L damp out
+ addq r2, r23, r2 C U1 carry from last
+ bis r31, r31, r31 C L moves in L !
+ addq u1, v1, r5 C U0 add two data
+
+ beq r2, $fix4 C U1 fix exact zero
+$ret4: cmpult r5, v1, cy0 C U0 did it carry
+ ldq u1, 40(r17) C L0 get next ones
+ ldq v1, 40(r18) C L1
+
+ addq u0, v0, r8 C U1 add two data
+ addq r5, cy1, r5 C U0 carry from last
+ stq r7, -8(r16) C L0 store pair
+ stq r2, 0(r16) C L1
+
+ cmpult r8, v0, r22 C U1 did it carry
+ beq r5, $fix5 C U0 fix exact zero
+$ret5: ldq u0, 48(r17) C L0 get next ones
+ ldq v0, 48(r18) C L1
+
+ ldl r31, 256(r17) C L0 prefetch
+ addq r8, cy0, r8 C U1 carry from last
+ ldl r31, 256(r18) C L1 prefetch
+ addq u1, v1, r7 C U0 add two data
+
+ beq r8, $fix6 C U1 fix exact zero
+$ret6: cmpult r7, v1, r23 C U0 did it carry
+ ldq u1, 56(r17) C L0 get next ones
+ ldq v1, 56(r18) C L1
+
+ lda r17, 64(r17) C L0 move pointer
+ bis r31, r31, r31 C U
+ lda r18, 64(r18) C L1 move pointer
+ bge r19, $Loop C U1 loop control
+C ==== main loop end
+
+$Lend: addq u0, v0, r2 C U1 add two data
+ addq r7, r22, r7 C U0 add in carry
+ stq r5, 8(r16) C L0 put an answer
+ stq r8, 16(r16) C L1 pair
+ cmpult r2, v0, cy1 C U1 did it carry
+ beq r7, $fix7c C U0 fix exact 0
+$ret7c: addq r2, r23, r2 C U1 carry from last
+ addq u1, v1, r5 C U0 add two data
+ beq r2, $fix0c C U1 fix exact zero
+$ret0c: cmpult r5, v1, cy0 C U0 did it carry
+ addq r5, cy1, r5 C U0 carry from last
+ stq r7, 24(r16) C L0 store pair
+ stq r2, 32(r16) C L1
+ beq r5, $fix1c C U0 fix exact zero
+$ret1c: stq r5, 40(r16) C L0 put an answer
+ lda r16, 48(r16) C L0 move pointer
+
+ lda r19, 8(r19)
+ beq r19, $Lret
+
+ ldq u1, 0(r17)
+ ldq v1, 0(r18)
+$Lsmall:
+ lda r19, -1(r19)
+ beq r19, $Lend0
+
+ ALIGN(8)
+$Loop0: addq u1, v1, r2 C main add
+ cmpult r2, v1, r8 C compute cy from last add
+ ldq u1, 8(r17)
+ ldq v1, 8(r18)
+ addq r2, cy0, r5 C carry add
+ lda r17, 8(r17)
+ lda r18, 8(r18)
+ stq r5, 0(r16)
+ cmpult r5, r2, cy0 C compute cy from last add
+ lda r19, -1(r19) C decr loop cnt
+ bis r8, cy0, cy0 C combine cy from the two adds
+ lda r16, 8(r16)
+ bne r19, $Loop0
+$Lend0: addq u1, v1, r2 C main add
+ addq r2, cy0, r5 C carry add
+ cmpult r2, v1, r8 C compute cy from last add
+ cmpult r5, r2, cy0 C compute cy from last add
+ stq r5, 0(r16)
+ bis r8, cy0, r0 C combine cy from the two adds
+ ret r31,(r26),1
+
+ ALIGN(8)
+$Lret: lda r0, 0(cy0) C copy carry into return register
+ ret r31,(r26),1
+
+$fix5f: bis r23, cy0, r23 C bring forward carry
+ br r31, $ret5f
+$fix6f: bis r22, r23, r22 C bring forward carry
+ br r31, $ret6f
+$fix0: bis cy1, r23, cy1 C bring forward carry
+ br r31, $ret0
+$fix1: bis cy0, cy1, cy0 C bring forward carry
+ br r31, $ret1
+$fix2: bis r22, cy0, r22 C bring forward carry
+ br r31, $ret2
+$fix3: bis r23, r22, r23 C bring forward carry
+ br r31, $ret3
+$fix4: bis cy1, r23, cy1 C bring forward carry
+ br r31, $ret4
+$fix5: bis cy1, cy0, cy0 C bring forward carry
+ br r31, $ret5
+$fix6: bis r22, cy0, r22 C bring forward carry
+ br r31, $ret6
+$fix7: bis r23, r22, r23 C bring forward carry
+ br r31, $ret7
+$fix0c: bis cy1, r23, cy1 C bring forward carry
+ br r31, $ret0c
+$fix1c: bis cy0, cy1, cy0 C bring forward carry
+ br r31, $ret1c
+$fix7c: bis r23, r22, r23 C bring forward carry
+ br r31, $ret7c
+
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/aorslsh1_n.asm b/gmp/mpn/alpha/ev6/aorslsh1_n.asm
new file mode 100644
index 0000000000..cb966ce021
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/aorslsh1_n.asm
@@ -0,0 +1,172 @@
+dnl Alpha mpn_addlsh1_n/mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1).
+
+dnl Copyright 2003, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: ?
+C EV5: 7
+C EV6: 4
+
+C TODO
+C * Tune to reach 3.75 c/l on ev6.
+
+define(`rp',`r16')
+define(`up',`r17')
+define(`vp',`r18')
+define(`n', `r19')
+
+define(`u0', `r8')
+define(`u1', `r1')
+define(`v0', `r4')
+define(`v1', `r5')
+
+define(`cy0', `r0')
+define(`cy1', `r20')
+define(`cy', `r22')
+define(`rr', `r24')
+define(`ps', `r25')
+define(`sl', `r28')
+
+ifdef(`OPERATION_addlsh1_n',`
+ define(ADDSUB, addq)
+ define(CARRY, `cmpult $1,$2,$3')
+ define(func, mpn_addlsh1_n)
+')
+ifdef(`OPERATION_sublsh1_n',`
+ define(ADDSUB, subq)
+ define(CARRY, `cmpult $2,$1,$3')
+ define(func, mpn_sublsh1_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
+
+ASM_START()
+PROLOGUE(func)
+ and n, 2, cy0
+ blbs n, L(bx1)
+L(bx0): ldq v1, 0(vp)
+ ldq u1, 0(up)
+ lda r2, 0(r31)
+ bne cy0, L(b10)
+
+L(b00): lda vp, 48(vp)
+ lda up, -16(up)
+ lda rp, -8(rp)
+ lda cy0, 0(r31)
+ br r31, L(lo0)
+
+L(b10): lda vp, 32(vp)
+ lda rp, 8(rp)
+ lda cy0, 0(r31)
+ br r31, L(lo2)
+
+L(bx1): ldq v0, 0(vp)
+ ldq u0, 0(up)
+ lda r3, 0(r31)
+ beq cy0, L(b01)
+
+L(b11): lda vp, 40(vp)
+ lda up, -24(up)
+ lda rp, 16(rp)
+ lda cy1, 0(r31)
+ br r31, L(lo3)
+
+L(b01): lda n, -4(n)
+ lda cy1, 0(r31)
+ ble n, L(end)
+ lda vp, 24(vp)
+ lda up, -8(up)
+
+ ALIGN(16)
+L(top): addq v0, v0, r6
+ ldq v1, -16(vp)
+ addq r6, r3, sl C combined vlimb
+ ldq u1, 16(up)
+ ADDSUB u0, sl, ps C ulimb + (vlimb << 1)
+ cmplt v0, r31, r2 C high v bits
+ ADDSUB ps, cy1, rr C consume carry from previous operation
+ CARRY( ps, u0, cy0) C carry out #2
+ stq rr, 0(rp)
+ CARRY( rr, ps, cy) C carry out #3
+ lda vp, 32(vp) C bookkeeping
+ addq cy, cy0, cy0 C final carry out
+L(lo0): addq v1, v1, r7
+ ldq v0, -40(vp)
+ addq r7, r2, sl
+ ldq u0, 24(up)
+ ADDSUB u1, sl, ps
+ cmplt v1, r31, r3
+ ADDSUB ps, cy0, rr
+ CARRY( ps, u1, cy1)
+ stq rr, 8(rp)
+ CARRY( rr, ps, cy)
+ lda rp, 32(rp) C bookkeeping
+ addq cy, cy1, cy1
+L(lo3): addq v0, v0, r6
+ ldq v1, -32(vp)
+ addq r6, r3, sl
+ ldq u1, 32(up)
+ ADDSUB u0, sl, ps
+ cmplt v0, r31, r2
+ ADDSUB ps, cy1, rr
+ CARRY( ps, u0, cy0)
+ stq rr, -16(rp)
+ CARRY( rr, ps, cy)
+ lda up, 32(up) C bookkeeping
+ addq cy, cy0, cy0
+L(lo2): addq v1, v1, r7
+ ldq v0, -24(vp)
+ addq r7, r2, sl
+ ldq u0, 8(up)
+ ADDSUB u1, sl, ps
+ cmplt v1, r31, r3
+ ADDSUB ps, cy0, rr
+ CARRY( ps, u1, cy1)
+ stq rr, -8(rp)
+ CARRY( rr, ps, cy)
+ lda n, -4(n) C bookkeeping
+ addq cy, cy1, cy1
+ bgt n, L(top)
+
+L(end): addq v0, v0, r6
+ addq r6, r3, sl
+ ADDSUB u0, sl, ps
+ cmplt v0, r31, r2
+ ADDSUB ps, cy1, rr
+ CARRY( ps, u0, cy0)
+ stq rr, 0(rp)
+ CARRY( rr, ps, cy)
+ addq cy, cy0, cy0
+ addq cy0, r2, r0
+
+ ret r31,(r26),1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/aorsmul_1.asm b/gmp/mpn/alpha/ev6/aorsmul_1.asm
new file mode 100644
index 0000000000..0e68e6e7ad
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/aorsmul_1.asm
@@ -0,0 +1,398 @@
+dnl Alpha ev6 mpn_addmul_1 and mpn_submul_1.
+
+dnl Copyright 2000, 2003-2005, 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 42
+C EV5: 18
+C EV6: 3.5
+
+C INPUT PARAMETERS
+define(`rp', `r16')
+define(`up', `r17')
+define(`n', `r18')
+define(`v0', `r19')
+
+dnl This code was written in cooperation with ev6 pipeline expert Steve Root.
+
+dnl The stores can issue a cycle late so we have paired no-op's to 'catch'
+dnl them, so that further disturbance to the schedule is damped.
+
+dnl We couldn't pair the loads, because the entangled schedule of the carry's
+dnl has to happen on one side {0} of the machine.
+
+dnl This is a great schedule for the d_cache, a poor schedule for the b_cache.
+dnl The lockup on U0 means that any stall can't be recovered from. Consider a
+dnl ldq in L1, say that load gets stalled because it collides with a fill from
+dnl the b_cache. On the next cycle, this load gets priority. If first looks
+dnl at L0, and goes there. The instruction we intended for L0 gets to look at
+dnl L1, which is NOT where we want it. It either stalls 1, because it can't
+dnl go in L0, or goes there, and causes a further instruction to stall.
+
+dnl So for b_cache, we're likely going to want to put one or more cycles back
+dnl into the code! And, of course, put in lds prefetch for the rp[] operand.
+dnl At a place where we have an mt followed by a bookkeeping, put the
+dnl bookkeeping in upper, and the prefetch into lower.
+
+dnl Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd
+dnl like not to have an ldq or an stq to preceded a conditional branch in a
+dnl quadpack. The conditional branch moves the retire pointer one cycle
+dnl later.
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `addq')
+ define(`CMPCY', `cmpult $2,$1')
+ define(`func', `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `subq')
+ define(`CMPCY', `cmpult $1,$2')
+ define(`func', `mpn_submul_1')
+')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+PROLOGUE(func)
+ ldq r3, 0(up) C
+ and r18, 7, r20 C
+ lda r18, -9(r18) C
+ cmpeq r20, 1, r21 C
+ beq r21, $L1 C
+
+$1mod8: ldq r5, 0(rp) C
+ mulq v0, r3, r7 C
+ umulh v0, r3, r8 C
+ ADDSUB r5, r7, r23 C
+ CMPCY( r5, r23), r20 C
+ addq r8, r20, r0 C
+ stq r23, 0(rp) C
+ bge r18, $ent1 C
+ ret r31, (r26), 1 C
+
+$L1: lda r8, 0(r31) C zero carry reg
+ lda r24, 0(r31) C zero carry reg
+ cmpeq r20, 2, r21 C
+ bne r21, $2mod8 C
+ cmpeq r20, 3, r21 C
+ bne r21, $3mod8 C
+ cmpeq r20, 4, r21 C
+ bne r21, $4mod8 C
+ cmpeq r20, 5, r21 C
+ bne r21, $5mod8 C
+ cmpeq r20, 6, r21 C
+ bne r21, $6mod8 C
+ cmpeq r20, 7, r21 C
+ beq r21, $0mod8 C
+
+$7mod8: ldq r5, 0(rp) C
+ lda up, 8(up) C
+ mulq v0, r3, r7 C
+ umulh v0, r3, r24 C
+ ADDSUB r5, r7, r23 C
+ CMPCY( r5, r23), r20 C
+ addq r24, r20, r24 C
+ stq r23, 0(rp) C
+ lda rp, 8(rp) C
+ ldq r3, 0(up) C
+$6mod8: ldq r1, 8(up) C
+ mulq v0, r3, r25 C
+ umulh v0, r3, r3 C
+ mulq v0, r1, r28 C
+ ldq r0, 16(up) C
+ ldq r4, 0(rp) C
+ umulh v0, r1, r8 C
+ ldq r1, 24(up) C
+ lda up, 48(up) C L1 bookkeeping
+ mulq v0, r0, r2 C
+ ldq r5, 8(rp) C
+ lda rp, -32(rp) C L1 bookkeeping
+ umulh v0, r0, r6 C
+ ADDSUB r4, r25, r25 C lo + acc
+ mulq v0, r1, r7 C
+ br r31, $ent6 C
+
+$ent1: lda up, 8(up) C
+ lda rp, 8(rp) C
+ lda r8, 0(r0) C
+ ldq r3, 0(up) C
+$0mod8: ldq r1, 8(up) C
+ mulq v0, r3, r2 C
+ umulh v0, r3, r6 C
+ mulq v0, r1, r7 C
+ ldq r0, 16(up) C
+ ldq r4, 0(rp) C
+ umulh v0, r1, r24 C
+ ldq r1, 24(up) C
+ mulq v0, r0, r25 C
+ ldq r5, 8(rp) C
+ umulh v0, r0, r3 C
+ ADDSUB r4, r2, r2 C lo + acc
+ mulq v0, r1, r28 C
+ lda rp, -16(rp) C
+ br r31, $ent0 C
+
+$3mod8: ldq r5, 0(rp) C
+ lda up, 8(up) C
+ mulq v0, r3, r7 C
+ umulh v0, r3, r8 C
+ ADDSUB r5, r7, r23 C
+ CMPCY( r5, r23), r20 C
+ addq r8, r20, r24 C
+ stq r23, 0(rp) C
+ lda rp, 8(rp) C
+ ldq r3, 0(up) C
+$2mod8: ldq r1, 8(up) C
+ mulq v0, r3, r25 C
+ umulh v0, r3, r3 C
+ mulq v0, r1, r28 C
+ ble r18, $n23 C
+ ldq r0, 16(up) C
+ ldq r4, 0(rp) C
+ umulh v0, r1, r8 C
+ ldq r1, 24(up) C
+ lda up, 16(up) C L1 bookkeeping
+ mulq v0, r0, r2 C
+ ldq r5, 8(rp) C
+ lda rp, 0(rp) C L1 bookkeeping
+ umulh v0, r0, r6 C
+ ADDSUB r4, r25, r25 C lo + acc
+ mulq v0, r1, r7 C
+ br r31, $ent2 C
+
+$5mod8: ldq r5, 0(rp) C
+ lda up, 8(up) C
+ mulq v0, r3, r7 C
+ umulh v0, r3, r24 C
+ ADDSUB r5, r7, r23 C
+ CMPCY( r5, r23), r20 C
+ addq r24, r20, r8 C
+ stq r23, 0(rp) C
+ lda rp, 8(rp) C
+ ldq r3, 0(up) C
+$4mod8: ldq r1, 8(up) C
+ mulq v0, r3, r2 C
+ umulh v0, r3, r6 C
+ mulq v0, r1, r7 C
+ ldq r0, 16(up) C
+ ldq r4, 0(rp) C
+ umulh v0, r1, r24 C
+ ldq r1, 24(up) C
+ lda up, 32(up) C L1 bookkeeping
+ mulq v0, r0, r25 C
+ ldq r5, 8(rp) C
+ lda rp, 16(rp) C L1 bookkeeping
+ umulh v0, r0, r3 C
+ ADDSUB r4, r2, r2 C lo + acc
+ mulq v0, r1, r28 C
+ CMPCY( r4, r2), r20 C L0 lo add => carry
+ ADDSUB r2, r8, r22 C U0 hi add => answer
+ ble r18, $Lend C
+ ALIGN(16)
+$Loop:
+ bis r31, r31, r31 C U1 mt
+ CMPCY( r2, r22), r21 C L0 hi add => carry
+ addq r6, r20, r6 C U0 hi mul + carry
+ ldq r0, 0(up) C
+
+ bis r31, r31, r31 C U1 mt
+ ADDSUB r5, r7, r7 C L0 lo + acc
+ addq r6, r21, r6 C U0 hi mul + carry
+ ldq r4, 0(rp) C L1
+
+ umulh v0, r1, r8 C U1
+ CMPCY( r5, r7), r20 C L0 lo add => carry
+ ADDSUB r7, r6, r23 C U0 hi add => answer
+ ldq r1, 8(up) C L1
+
+ mulq v0, r0, r2 C U1
+ CMPCY( r7, r23), r21 C L0 hi add => carry
+ addq r24, r20, r24 C U0 hi mul + carry
+ ldq r5, 8(rp) C L1
+
+ umulh v0, r0, r6 C U1
+ ADDSUB r4, r25, r25 C U0 lo + acc
+ stq r22, -16(rp) C L0
+ stq r23, -8(rp) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq v0, r1, r7 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r24, r21, r24 C U0 hi mul + carry
+$ent2:
+ CMPCY( r4, r25), r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda r18, -8(r18) C L1 bookkeeping
+ ADDSUB r25, r24, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ CMPCY( r25, r22), r21 C L0 hi add => carry
+ addq r3, r20, r3 C U0 hi mul + carry
+ ldq r0, 16(up) C L1
+
+ bis r31, r31, r31 C U1 mt
+ ADDSUB r5, r28, r28 C L0 lo + acc
+ addq r3, r21, r3 C U0 hi mul + carry
+ ldq r4, 16(rp) C L1
+
+ umulh v0, r1, r24 C U1
+ CMPCY( r5, r28), r20 C L0 lo add => carry
+ ADDSUB r28, r3, r23 C U0 hi add => answer
+ ldq r1, 24(up) C L1
+
+ mulq v0, r0, r25 C U1
+ CMPCY( r28, r23), r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ ldq r5, 24(rp) C L1
+
+ umulh v0, r0, r3 C U1
+ ADDSUB r4, r2, r2 C U0 lo + acc
+ stq r22, 0(rp) C L0
+ stq r23, 8(rp) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq v0, r1, r28 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r8, r21, r8 C U0 hi mul + carry
+$ent0:
+ CMPCY( r4, r2), r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda up, 64(up) C L1 bookkeeping
+ ADDSUB r2, r8, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ CMPCY( r2, r22), r21 C L0 hi add => carry
+ addq r6, r20, r6 C U0 hi mul + carry
+ ldq r0, -32(up) C L1
+
+ bis r31, r31, r31 C U1 mt
+ ADDSUB r5, r7, r7 C L0 lo + acc
+ addq r6, r21, r6 C U0 hi mul + carry
+ ldq r4, 32(rp) C L1
+
+ umulh v0, r1, r8 C U1
+ CMPCY( r5, r7), r20 C L0 lo add => carry
+ ADDSUB r7, r6, r23 C U0 hi add => answer
+ ldq r1, -24(up) C L1
+
+ mulq v0, r0, r2 C U1
+ CMPCY( r7, r23), r21 C L0 hi add => carry
+ addq r24, r20, r24 C U0 hi mul + carry
+ ldq r5, 40(rp) C L1
+
+ umulh v0, r0, r6 C U1
+ ADDSUB r4, r25, r25 C U0 lo + acc
+ stq r22, 16(rp) C L0
+ stq r23, 24(rp) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq v0, r1, r7 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r24, r21, r24 C U0 hi mul + carry
+$ent6:
+ CMPCY( r4, r25), r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda rp, 64(rp) C L1 bookkeeping
+ ADDSUB r25, r24, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ CMPCY( r25, r22), r21 C L0 hi add => carry
+ addq r3, r20, r3 C U0 hi mul + carry
+ ldq r0, -16(up) C L1
+
+ bis r31, r31, r31 C U1 mt
+ ADDSUB r5, r28, r28 C L0 lo + acc
+ addq r3, r21, r3 C U0 hi mul + carry
+ ldq r4, -16(rp) C L1
+
+ umulh v0, r1, r24 C U1
+ CMPCY( r5, r28), r20 C L0 lo add => carry
+ ADDSUB r28, r3, r23 C U0 hi add => answer
+ ldq r1, -8(up) C L1
+
+ mulq v0, r0, r25 C U1
+ CMPCY( r28, r23), r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ ldq r5, -8(rp) C L1
+
+ umulh v0, r0, r3 C U1
+ ADDSUB r4, r2, r2 C U0 lo + acc
+ stq r22, -32(rp) C L0
+ stq r23, -24(rp) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq v0, r1, r28 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r8, r21, r8 C U0 hi mul + carry
+
+ CMPCY( r4, r2), r20 C L0 lo add => carry
+ ADDSUB r2, r8, r22 C U0 hi add => answer
+ ldl r31, 256(up) C prefetch up[]
+ bgt r18, $Loop C U1 bookkeeping
+
+$Lend: CMPCY( r2, r22), r21 C
+ addq r6, r20, r6 C
+ ADDSUB r5, r7, r7 C
+ addq r6, r21, r6 C
+ ldq r4, 0(rp) C
+ umulh v0, r1, r8 C
+ CMPCY( r5, r7), r20 C
+ ADDSUB r7, r6, r23 C
+ CMPCY(r7, r23), r21 C
+ addq r24, r20, r24 C
+ ldq r5, 8(rp) C
+ ADDSUB r4, r25, r25 C
+ stq r22, -16(rp) C
+ stq r23, -8(rp) C
+ addq r24, r21, r24 C
+ br L(x)
+
+ ALIGN(16)
+$n23: ldq r4, 0(rp) C
+ ldq r5, 8(rp) C
+ umulh v0, r1, r8 C
+ ADDSUB r4, r25, r25 C
+L(x): CMPCY( r4, r25), r20 C
+ ADDSUB r25, r24, r22 C
+ CMPCY( r25, r22), r21 C
+ addq r3, r20, r3 C
+ ADDSUB r5, r28, r28 C
+ addq r3, r21, r3 C
+ CMPCY( r5, r28), r20 C
+ ADDSUB r28, r3, r23 C
+ CMPCY( r28, r23), r21 C
+ addq r8, r20, r8 C
+ stq r22, 0(rp) C
+ stq r23, 8(rp) C
+ addq r8, r21, r0 C
+ ret r31, (r26), 1 C
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/gmp-mparam.h b/gmp/mpn/alpha/ev6/gmp-mparam.h
new file mode 100644
index 0000000000..e51d6b0d15
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/gmp-mparam.h
@@ -0,0 +1,209 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004, 2005, 2008-2010, 2014 Free
+Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+#define DIVEXACT_BY3_METHOD 0 /* override ../diveby3.asm */
+
+/* 500 MHz 21164 (agnesi.math.su.se) */
+/* FFT tuning limit = 20000000 */
+/* Generated by tuneup.c, 2014-03-14, gcc 3.3 */
+
+#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */
+#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1P_METHOD 2
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 21
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7
+#define USE_PREINV_DIVREM_1 1 /* preinv always */
+#define DIV_QR_1N_PI1_METHOD 2
+#define DIV_QR_1_NORM_THRESHOLD 5
+#define DIV_QR_1_UNNORM_THRESHOLD 1
+#define DIV_QR_2_PI2_THRESHOLD 8
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD 20
+
+#define MUL_TOOM22_THRESHOLD 32
+#define MUL_TOOM33_THRESHOLD 117
+#define MUL_TOOM44_THRESHOLD 124
+#define MUL_TOOM6H_THRESHOLD 230
+#define MUL_TOOM8H_THRESHOLD 357
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 88
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 105
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 136
+
+#define SQR_BASECASE_THRESHOLD 0 /* always */
+#define SQR_TOOM2_THRESHOLD 59
+#define SQR_TOOM3_THRESHOLD 123
+#define SQR_TOOM4_THRESHOLD 163
+#define SQR_TOOM6_THRESHOLD 333
+#define SQR_TOOM8_THRESHOLD 0 /* always */
+
+#define MULMID_TOOM42_THRESHOLD 52
+
+#define MULMOD_BNM1_THRESHOLD 19
+#define SQRMOD_BNM1_THRESHOLD 5
+
+#define MUL_FFT_MODF_THRESHOLD 468 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 468, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 19, 7}, { 10, 6}, \
+ { 24, 7}, { 13, 6}, { 27, 7}, { 14, 6}, \
+ { 29, 7}, { 17, 6}, { 35, 7}, { 29, 8}, \
+ { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \
+ { 19, 7}, { 39, 8}, { 29, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 51, 9}, { 27, 8}, { 55, 9}, { 35, 8}, \
+ { 71, 9}, { 39,10}, { 23, 9}, { 55,10}, \
+ { 31, 9}, { 67,10}, { 39, 9}, { 79,10}, \
+ { 47, 9}, { 95,10}, { 55,11}, { 31,10}, \
+ { 79,11}, { 47,10}, { 103,12}, { 31,11}, \
+ { 63,10}, { 135,11}, { 79,10}, { 167,11}, \
+ { 95,10}, { 199,11}, { 111,12}, { 63,11}, \
+ { 143,10}, { 287, 9}, { 575,11}, { 159,10}, \
+ { 319,12}, { 95,11}, { 191,10}, { 383,11}, \
+ { 207,13}, { 63,12}, { 127,11}, { 255,10}, \
+ { 511,11}, { 271,10}, { 543,11}, { 287,10}, \
+ { 575,12}, { 159,11}, { 319,10}, { 639,11}, \
+ { 335,10}, { 671,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,12}, \
+ { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
+ { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \
+ { 607,12}, { 319,11}, { 671,12}, { 351,11}, \
+ { 703,13}, { 191,12}, { 383,11}, { 767,12}, \
+ { 415,11}, { 831,12}, { 447,14}, { 127,13}, \
+ { 255,12}, { 575,11}, { 1151,12}, { 607,13}, \
+ { 319,12}, { 735,13}, { 383,12}, { 767,11}, \
+ { 1535,12}, { 831,13}, { 447,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \
+ { 1215,13}, { 639,12}, { 1343,13}, { 703,12}, \
+ { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \
+ { 831,12}, { 1663,13}, { 959,15}, { 255,14}, \
+ { 511,13}, { 1215,14}, { 639,13}, { 1407,14}, \
+ { 767,13}, { 1663,14}, { 895,13}, { 1855,15}, \
+ { 511,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 151
+#define MUL_FFT_THRESHOLD 5760
+
+#define SQR_FFT_MODF_THRESHOLD 412 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 412, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \
+ { 27, 7}, { 14, 6}, { 29, 7}, { 28, 8}, \
+ { 15, 7}, { 31, 8}, { 17, 7}, { 36, 8}, \
+ { 19, 7}, { 39, 8}, { 29, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
+ { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \
+ { 95,10}, { 55,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255,11}, { 79,10}, { 159, 9}, \
+ { 319,10}, { 167,11}, { 95,10}, { 191, 9}, \
+ { 383,11}, { 111,12}, { 63,11}, { 127,10}, \
+ { 271,11}, { 143,10}, { 287, 9}, { 575,10}, \
+ { 303,11}, { 159,10}, { 319,12}, { 95,11}, \
+ { 191,10}, { 383,11}, { 207,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
+ { 543,11}, { 287,10}, { 575,11}, { 303,12}, \
+ { 159,11}, { 319,10}, { 639,11}, { 335,10}, \
+ { 671,11}, { 351,10}, { 703,11}, { 367,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,12}, \
+ { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
+ { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \
+ { 607,12}, { 319,11}, { 639,10}, { 1279,11}, \
+ { 671,12}, { 351,11}, { 703,13}, { 191,12}, \
+ { 383,11}, { 767,12}, { 415,11}, { 831,12}, \
+ { 447,11}, { 895,12}, { 479,14}, { 127,13}, \
+ { 255,12}, { 575,11}, { 1151,12}, { 607,13}, \
+ { 319,12}, { 703,11}, { 1407,12}, { 735,13}, \
+ { 383,12}, { 831,13}, { 447,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \
+ { 1151,13}, { 639,12}, { 1279,13}, { 703,12}, \
+ { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \
+ { 831,12}, { 1663,13}, { 959,15}, { 255,14}, \
+ { 511,13}, { 1215,14}, { 639,13}, { 1407,14}, \
+ { 767,13}, { 1663,14}, { 895,13}, { 1791,15}, \
+ { 511,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 159
+#define SQR_FFT_THRESHOLD 5056
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 100
+#define MULLO_MUL_N_THRESHOLD 11355
+
+#define DC_DIV_QR_THRESHOLD 124
+#define DC_DIVAPPR_Q_THRESHOLD 438
+#define DC_BDIV_QR_THRESHOLD 153
+#define DC_BDIV_Q_THRESHOLD 318
+
+#define INV_MULMOD_BNM1_THRESHOLD 62
+#define INV_NEWTON_THRESHOLD 384
+#define INV_APPR_THRESHOLD 402
+
+#define BINV_NEWTON_THRESHOLD 381
+#define REDC_1_TO_REDC_N_THRESHOLD 110
+
+#define MU_DIV_QR_THRESHOLD 1752
+#define MU_DIVAPPR_Q_THRESHOLD 1895
+#define MUPI_DIV_QR_THRESHOLD 174
+#define MU_BDIV_QR_THRESHOLD 1387
+#define MU_BDIV_Q_THRESHOLD 1787
+
+#define POWM_SEC_TABLE 1,13,66,82,579
+
+#define MATRIX22_STRASSEN_THRESHOLD 15
+#define HGCD_THRESHOLD 318
+#define HGCD_APPR_THRESHOLD 363
+#define HGCD_REDUCE_THRESHOLD 2384
+#define GCD_DC_THRESHOLD 2504
+#define GCDEXT_DC_THRESHOLD 671
+#define JACOBI_BASE_METHOD 3
+
+#define GET_STR_DC_THRESHOLD 14
+#define GET_STR_PRECOMPUTE_THRESHOLD 25
+#define SET_STR_DC_THRESHOLD 3754
+#define SET_STR_PRECOMPUTE_THRESHOLD 8097
+
+#define FAC_DSC_THRESHOLD 951
+#define FAC_ODD_THRESHOLD 24
diff --git a/gmp/mpn/alpha/ev6/mod_1_4.asm b/gmp/mpn/alpha/ev6/mod_1_4.asm
new file mode 100644
index 0000000000..836de07c0f
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/mod_1_4.asm
@@ -0,0 +1,337 @@
+dnl Alpha mpn_mod_1s_4p
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2009, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C * Optimise. 2.75 c/l should be possible.
+C * Write a proper mpn_mod_1s_4p_cps. The code below was compiler generated.
+C * Optimise feed-in code, starting the sw pipeline in switch code.
+C * Shorten software pipeline. The mul instructions are scheduled too far
+C from their users. Fixing this will allow us to use fewer registers.
+C * If we cannot reduce register usage, write perhaps small-n basecase.
+C * Does this work for PIC?
+
+C cycles/limb
+C EV4: ?
+C EV5: 23
+C EV6: 3
+
+define(`ap', `r16')
+define(`n', `r17')
+define(`pl', `r24')
+define(`ph', `r25')
+define(`rl', `r6')
+define(`rh', `r7')
+define(`B1modb', `r1')
+define(`B2modb', `r2')
+define(`B3modb', `r3')
+define(`B4modb', `r4')
+define(`B5modb', `r5')
+
+ASM_START()
+PROLOGUE(mpn_mod_1s_4p)
+ lda r30, -64(r30)
+ stq r9, 8(r30)
+ ldq B1modb, 16(r19)
+ stq r10, 16(r30)
+ ldq B2modb, 24(r19)
+ stq r11, 24(r30)
+ ldq B3modb, 32(r19)
+ stq r12, 32(r30)
+ ldq B4modb, 40(r19)
+ stq r13, 40(r30)
+ ldq B5modb, 48(r19)
+ s8addq n, ap, ap C point ap at vector end
+
+ and n, 3, r0
+ lda n, -4(n)
+ beq r0, L(b0)
+ lda r6, -2(r0)
+ blt r6, L(b1)
+ beq r6, L(b2)
+
+L(b3): ldq r21, -16(ap)
+ ldq r22, -8(ap)
+ ldq r20, -24(ap)
+ mulq r21, B1modb, r8
+ umulh r21, B1modb, r12
+ mulq r22, B2modb, r9
+ umulh r22, B2modb, r13
+ addq r8, r20, pl
+ cmpult pl, r8, r0
+ addq r0, r12, ph
+ addq r9, pl, rl
+ cmpult rl, r9, r0
+ addq r13, ph, ph
+ addq r0, ph, rh
+ lda ap, -56(ap)
+ br L(com)
+
+L(b0): ldq r21, -24(ap)
+ ldq r22, -16(ap)
+ ldq r23, -8(ap)
+ ldq r20, -32(ap)
+ mulq r21, B1modb, r8
+ umulh r21, B1modb, r12
+ mulq r22, B2modb, r9
+ umulh r22, B2modb, r13
+ mulq r23, B3modb, r10
+ umulh r23, B3modb, r27
+ addq r8, r20, pl
+ cmpult pl, r8, r0
+ addq r0, r12, ph
+ addq r9, pl, pl
+ cmpult pl, r9, r0
+ addq r13, ph, ph
+ addq r0, ph, ph
+ addq r10, pl, rl
+ cmpult rl, r10, r0
+ addq r27, ph, ph
+ addq r0, ph, rh
+ lda ap, -64(ap)
+ br L(com)
+
+L(b1): bis r31, r31, rh
+ ldq rl, -8(ap)
+ lda ap, -40(ap)
+ br L(com)
+
+L(b2): ldq rh, -8(ap)
+ ldq rl, -16(ap)
+ lda ap, -48(ap)
+
+L(com): ble n, L(ed3)
+ ldq r21, 8(ap)
+ ldq r22, 16(ap)
+ ldq r23, 24(ap)
+ ldq r20, 0(ap)
+ lda n, -4(n)
+ lda ap, -32(ap)
+ mulq r21, B1modb, r8
+ umulh r21, B1modb, r12
+ mulq r22, B2modb, r9
+ umulh r22, B2modb, r13
+ mulq r23, B3modb, r10
+ umulh r23, B3modb, r27
+ mulq rl, B4modb, r11
+ umulh rl, B4modb, r28
+ ble n, L(ed2)
+
+ ALIGN(16)
+L(top): ldq r21, 8(ap)
+ mulq rh, B5modb, rl
+ addq r8, r20, pl
+ ldq r22, 16(ap)
+ cmpult pl, r8, r0
+ umulh rh, B5modb, rh
+ ldq r23, 24(ap)
+ addq r0, r12, ph
+ addq r9, pl, pl
+ mulq r21, B1modb, r8
+ cmpult pl, r9, r0
+ addq r13, ph, ph
+ umulh r21, B1modb, r12
+ lda ap, -32(ap)
+ addq r0, ph, ph
+ addq r10, pl, pl
+ mulq r22, B2modb, r9
+ cmpult pl, r10, r0
+ addq r27, ph, ph
+ addq r11, pl, pl
+ umulh r22, B2modb, r13
+ addq r0, ph, ph
+ cmpult pl, r11, r0
+ addq r28, ph, ph
+ mulq r23, B3modb, r10
+ ldq r20, 32(ap)
+ addq pl, rl, rl
+ umulh r23, B3modb, r27
+ addq r0, ph, ph
+ cmpult rl, pl, r0
+ mulq rl, B4modb, r11
+ addq ph, rh, rh
+ umulh rl, B4modb, r28
+ addq r0, rh, rh
+ lda n, -4(n)
+ bgt n, L(top)
+
+L(ed2): mulq rh, B5modb, rl
+ addq r8, r20, pl
+ umulh rh, B5modb, rh
+ cmpult pl, r8, r0
+ addq r0, r12, ph
+ addq r9, pl, pl
+ cmpult pl, r9, r0
+ addq r13, ph, ph
+ addq r0, ph, ph
+ addq r10, pl, pl
+ cmpult pl, r10, r0
+ addq r27, ph, ph
+ addq r11, pl, pl
+ addq r0, ph, ph
+ cmpult pl, r11, r0
+ addq r28, ph, ph
+ addq pl, rl, rl
+ addq r0, ph, ph
+ cmpult rl, pl, r0
+ addq ph, rh, rh
+ addq r0, rh, rh
+
+L(ed3): mulq rh, B1modb, r8
+ umulh rh, B1modb, rh
+ addq r8, rl, rl
+ cmpult rl, r8, r0
+ addq r0, rh, rh
+
+ ldq r24, 8(r19) C cnt
+ sll rh, r24, rh
+ subq r31, r24, r25
+ srl rl, r25, r2
+ sll rl, r24, rl
+ or r2, rh, rh
+
+ ldq r23, 0(r19) C bi
+ mulq rh, r23, r8
+ umulh rh, r23, r9
+ addq rh, 1, r7
+ addq r8, rl, r8 C ql
+ cmpult r8, rl, r0
+ addq r9, r7, r9
+ addq r0, r9, r9 C qh
+ mulq r9, r18, r21 C qh * b
+ subq rl, r21, rl
+ cmpult r8, rl, r0 C rl > ql
+ negq r0, r0
+ and r0, r18, r0
+ addq rl, r0, rl
+ cmpule r18, rl, r0 C rl >= b
+ negq r0, r0
+ and r0, r18, r0
+ subq rl, r0, rl
+
+ srl rl, r24, r0
+
+ ldq r9, 8(r30)
+ ldq r10, 16(r30)
+ ldq r11, 24(r30)
+ ldq r12, 32(r30)
+ ldq r13, 40(r30)
+ lda r30, 64(r30)
+ ret r31, (r26), 1
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1s_4p_cps,gp)
+ lda r30, -32(r30)
+ stq r26, 0(r30)
+ stq r9, 8(r30)
+ stq r10, 16(r30)
+ stq r11, 24(r30)
+ mov r16, r11
+ LEA( r4, __clz_tab)
+ lda r10, 65(r31)
+ cmpbge r31, r17, r1
+ srl r1, 1, r1
+ xor r1, 127, r1
+ addq r1, r4, r1
+ ldq_u r2, 0(r1)
+ extbl r2, r1, r2
+ s8subq r2, 7, r2
+ srl r17, r2, r3
+ subq r10, r2, r10
+ addq r3, r4, r3
+ ldq_u r1, 0(r3)
+ extbl r1, r3, r1
+ subq r10, r1, r10
+ sll r17, r10, r9
+ mov r9, r16
+ jsr r26, mpn_invert_limb
+ ldah r29, 0(r26)
+ subq r31, r10, r2
+ lda r1, 1(r31)
+ sll r1, r10, r1
+ subq r31, r9, r3
+ srl r0, r2, r2
+ ldq r26, 0(r30)
+ bis r2, r1, r2
+ lda r29, 0(r29)
+ stq r0, 0(r11)
+ stq r10, 8(r11)
+ mulq r2, r3, r2
+ srl r2, r10, r3
+ umulh r2, r0, r1
+ stq r3, 16(r11)
+ mulq r2, r0, r3
+ ornot r31, r1, r1
+ subq r1, r2, r1
+ mulq r1, r9, r1
+ addq r1, r9, r2
+ cmpule r1, r3, r3
+ cmoveq r3, r2, r1
+ srl r1, r10, r3
+ umulh r1, r0, r2
+ stq r3, 24(r11)
+ mulq r1, r0, r3
+ ornot r31, r2, r2
+ subq r2, r1, r2
+ mulq r2, r9, r2
+ addq r2, r9, r1
+ cmpule r2, r3, r3
+ cmoveq r3, r1, r2
+ srl r2, r10, r1
+ umulh r2, r0, r3
+ stq r1, 32(r11)
+ mulq r2, r0, r1
+ ornot r31, r3, r3
+ subq r3, r2, r3
+ mulq r3, r9, r3
+ addq r3, r9, r2
+ cmpule r3, r1, r1
+ cmoveq r1, r2, r3
+ srl r3, r10, r2
+ umulh r3, r0, r1
+ stq r2, 40(r11)
+ mulq r3, r0, r0
+ ornot r31, r1, r1
+ subq r1, r3, r1
+ mulq r1, r9, r1
+ addq r1, r9, r9
+ cmpule r1, r0, r0
+ cmoveq r0, r9, r1
+ ldq r9, 8(r30)
+ srl r1, r10, r1
+ ldq r10, 16(r30)
+ stq r1, 48(r11)
+ ldq r11, 24(r30)
+ lda r30, 32(r30)
+ ret r31, (r26), 1
+EPILOGUE()
diff --git a/gmp/mpn/alpha/ev6/mul_1.asm b/gmp/mpn/alpha/ev6/mul_1.asm
new file mode 100644
index 0000000000..8ee19cd429
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/mul_1.asm
@@ -0,0 +1,496 @@
+dnl Alpha ev6 mpn_mul_1 -- Multiply a limb vector with a limb and store the
+dnl result in a second limb vector.
+
+dnl Copyright 2000, 2001, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr r16
+C s1_ptr r17
+C size r18
+C s2_limb r19
+
+C This code runs at 2.25 cycles/limb on EV6.
+
+C This code was written in close cooperation with ev6 pipeline expert
+C Steve Root. Any errors are tege's fault, though.
+
+C Code structure:
+
+C code for n < 8
+C code for n > 8 code for (n mod 8)
+C code for (n div 8) feed-in code
+C 8-way unrolled loop
+C wind-down code
+
+C Some notes about unrolled loop:
+C
+C r1-r8 multiplies and workup
+C r21-r28 multiplies and workup
+C r9-r12 loads
+C r0 -1
+C r20,r29,r13-r15 scramble
+C
+C We're doing 7 of the 8 carry propagations with a br fixup code and 1 with a
+C put-the-carry-into-hi. The idea is that these branches are very rarely
+C taken, and since a non-taken branch consumes no resources, that is better
+C than an addq.
+C
+C Software pipeline: a load in cycle #09, feeds a mul in cycle #16, feeds an
+C add NEXT cycle #09 which feeds a store in NEXT cycle #02
+
+C The code could use some further work:
+C 1. Speed up really small multiplies. The default alpha/mul_1.asm code is
+C faster than this for size < 3.
+C 2. Improve feed-in code, perhaps with the equivalent of switch(n%8) unless
+C that is too costly.
+C 3. Consider using 4-way unrolling, even if that runs slower.
+C 4. Reduce register usage. In particular, try to avoid using r29.
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ cmpult r18, 8, r1
+ beq r1, $Large
+$Lsmall:
+ ldq r2,0(r17) C r2 = s1_limb
+ lda r18,-1(r18) C size--
+ mulq r2,r19,r3 C r3 = prod_low
+ bic r31,r31,r4 C clear cy_limb
+ umulh r2,r19,r0 C r0 = prod_high
+ beq r18,$Le1a C jump if size was == 1
+ ldq r2,8(r17) C r2 = s1_limb
+ lda r18,-1(r18) C size--
+ stq r3,0(r16)
+ beq r18,$Le2a C jump if size was == 2
+ ALIGN(8)
+$Lopa: mulq r2,r19,r3 C r3 = prod_low
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ lda r18,-1(r18) C size--
+ umulh r2,r19,r4 C r4 = cy_limb
+ ldq r2,16(r17) C r2 = s1_limb
+ lda r17,8(r17) C s1_ptr++
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ stq r3,8(r16)
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ lda r16,8(r16) C res_ptr++
+ bne r18,$Lopa
+
+$Le2a: mulq r2,r19,r3 C r3 = prod_low
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ umulh r2,r19,r4 C r4 = cy_limb
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ stq r3,8(r16)
+ addq r4,r0,r0 C cy_limb = prod_high + cy
+ ret r31,(r26),1
+$Le1a: stq r3,0(r16)
+ ret r31,(r26),1
+
+$Large:
+ lda r30, -224(r30)
+ stq r26, 0(r30)
+ stq r9, 8(r30)
+ stq r10, 16(r30)
+ stq r11, 24(r30)
+ stq r12, 32(r30)
+ stq r13, 40(r30)
+ stq r14, 48(r30)
+ stq r15, 56(r30)
+ stq r29, 64(r30)
+
+ and r18, 7, r20 C count for the first loop, 0-7
+ srl r18, 3, r18 C count for unrolled loop
+ bis r31, r31, r21
+ beq r20, $L_8_or_more C skip first loop
+
+$L_9_or_more:
+ ldq r2,0(r17) C r2 = s1_limb
+ lda r17,8(r17) C s1_ptr++
+ lda r20,-1(r20) C size--
+ mulq r2,r19,r3 C r3 = prod_low
+ umulh r2,r19,r21 C r21 = prod_high
+ beq r20,$Le1b C jump if size was == 1
+ bis r31, r31, r0 C FIXME: shouldn't need this
+ ldq r2,0(r17) C r2 = s1_limb
+ lda r17,8(r17) C s1_ptr++
+ lda r20,-1(r20) C size--
+ stq r3,0(r16)
+ lda r16,8(r16) C res_ptr++
+ beq r20,$Le2b C jump if size was == 2
+ ALIGN(8)
+$Lopb: mulq r2,r19,r3 C r3 = prod_low
+ addq r21,r0,r0 C cy_limb = cy_limb + 'cy'
+ lda r20,-1(r20) C size--
+ umulh r2,r19,r21 C r21 = prod_high
+ ldq r2,0(r17) C r2 = s1_limb
+ lda r17,8(r17) C s1_ptr++
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ stq r3,0(r16)
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ lda r16,8(r16) C res_ptr++
+ bne r20,$Lopb
+
+$Le2b: mulq r2,r19,r3 C r3 = prod_low
+ addq r21,r0,r0 C cy_limb = cy_limb + 'cy'
+ umulh r2,r19,r21 C r21 = prod_high
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ stq r3,0(r16)
+ lda r16,8(r16) C res_ptr++
+ addq r21,r0,r21 C cy_limb = prod_high + cy
+ br r31, $L_8_or_more
+$Le1b: stq r3,0(r16)
+ lda r16,8(r16) C res_ptr++
+
+$L_8_or_more:
+ lda r0, -1(r31) C put -1 in r0, for tricky loop control
+ lda r17, -32(r17) C L1 bookkeeping
+ lda r18, -1(r18) C decrement count
+
+ ldq r9, 32(r17) C L1
+ ldq r10, 40(r17) C L1
+ mulq r9, r19, r22 C U1 #07
+ ldq r11, 48(r17) C L1
+ umulh r9, r19, r23 C U1 #08
+ ldq r12, 56(r17) C L1
+ mulq r10, r19, r24 C U1 #09
+ ldq r9, 64(r17) C L1
+
+ lda r17, 64(r17) C L1 bookkeeping
+
+ umulh r10, r19, r25 C U1 #11
+ mulq r11, r19, r26 C U1 #12
+ umulh r11, r19, r27 C U1 #13
+ mulq r12, r19, r28 C U1 #14
+ ldq r10, 8(r17) C L1
+ umulh r12, r19, r1 C U1 #15
+ ldq r11, 16(r17) C L1
+ mulq r9, r19, r2 C U1 #16
+ ldq r12, 24(r17) C L1
+ umulh r9, r19, r3 C U1 #17
+ addq r21, r22, r13 C L1 mov
+ mulq r10, r19, r4 C U1 #18
+ addq r23, r24, r22 C L0 sum 2 mul's
+ cmpult r13, r21, r14 C L1 carry from sum
+ bgt r18, $L_16_or_more
+
+ cmpult r22, r24, r24 C U0 carry from sum
+ umulh r10, r19, r5 C U1 #02
+ addq r25, r26, r23 C U0 sum 2 mul's
+ mulq r11, r19, r6 C U1 #03
+ cmpult r23, r26, r25 C U0 carry from sum
+ umulh r11, r19, r7 C U1 #04
+ addq r27, r28, r28 C U0 sum 2 mul's
+ mulq r12, r19, r8 C U1 #05
+ cmpult r28, r27, r15 C L0 carry from sum
+ lda r16, 32(r16) C L1 bookkeeping
+ addq r13, r31, r13 C U0 start carry cascade
+ umulh r12, r19, r21 C U1 #06
+ br r31, $ret0c
+
+$L_16_or_more:
+C ---------------------------------------------------------------
+ subq r18,1,r18
+ cmpult r22, r24, r24 C U0 carry from sum
+ ldq r9, 32(r17) C L1
+
+ umulh r10, r19, r5 C U1 #02
+ addq r25, r26, r23 C U0 sum 2 mul's
+ mulq r11, r19, r6 C U1 #03
+ cmpult r23, r26, r25 C U0 carry from sum
+ umulh r11, r19, r7 C U1 #04
+ addq r27, r28, r28 C U0 sum 2 mul's
+ mulq r12, r19, r8 C U1 #05
+ cmpult r28, r27, r15 C L0 carry from sum
+ lda r16, 32(r16) C L1 bookkeeping
+ addq r13, r31, r13 C U0 start carry cascade
+
+ umulh r12, r19, r21 C U1 #06
+C beq r13, $fix0w C U0
+$ret0w: addq r22, r14, r26 C L0
+ ldq r10, 40(r17) C L1
+
+ mulq r9, r19, r22 C U1 #07
+ beq r26, $fix1w C U0
+$ret1w: addq r23, r24, r27 C L0
+ ldq r11, 48(r17) C L1
+
+ umulh r9, r19, r23 C U1 #08
+ beq r27, $fix2w C U0
+$ret2w: addq r28, r25, r28 C L0
+ ldq r12, 56(r17) C L1
+
+ mulq r10, r19, r24 C U1 #09
+ beq r28, $fix3w C U0
+$ret3w: addq r1, r2, r20 C L0 sum 2 mul's
+ ldq r9, 64(r17) C L1
+
+ addq r3, r4, r2 C L0 #10 2 mul's
+ lda r17, 64(r17) C L1 bookkeeping
+ cmpult r20, r1, r29 C U0 carry from sum
+
+ umulh r10, r19, r25 C U1 #11
+ cmpult r2, r4, r4 C U0 carry from sum
+ stq r13, -32(r16) C L0
+ stq r26, -24(r16) C L1
+
+ mulq r11, r19, r26 C U1 #12
+ addq r5, r6, r14 C U0 sum 2 mul's
+ stq r27, -16(r16) C L0
+ stq r28, -8(r16) C L1
+
+ umulh r11, r19, r27 C U1 #13
+ cmpult r14, r6, r3 C U0 carry from sum
+C could do cross-jumping here:
+C bra $L_middle_of_unrolled_loop
+ mulq r12, r19, r28 C U1 #14
+ addq r7, r3, r5 C L0 eat carry
+ addq r20, r15, r20 C U0 carry cascade
+ ldq r10, 8(r17) C L1
+
+ umulh r12, r19, r1 C U1 #15
+ beq r20, $fix4 C U0
+$ret4w: addq r2, r29, r6 C L0
+ ldq r11, 16(r17) C L1
+
+ mulq r9, r19, r2 C U1 #16
+ beq r6, $fix5 C U0
+$ret5w: addq r14, r4, r7 C L0
+ ldq r12, 24(r17) C L1
+
+ umulh r9, r19, r3 C U1 #17
+ beq r7, $fix6 C U0
+$ret6w: addq r5, r8, r8 C L0 sum 2
+ addq r21, r22, r13 C L1 sum 2 mul's
+
+ mulq r10, r19, r4 C U1 #18
+ addq r23, r24, r22 C L0 sum 2 mul's
+ cmpult r13, r21, r14 C L1 carry from sum
+ ble r18, $Lend C U0
+C ---------------------------------------------------------------
+ ALIGN(16)
+$Loop:
+ umulh r0, r18, r18 C U1 #01 decrement r18!
+ cmpult r8, r5, r29 C L0 carry from last bunch
+ cmpult r22, r24, r24 C U0 carry from sum
+ ldq r9, 32(r17) C L1
+
+ umulh r10, r19, r5 C U1 #02
+ addq r25, r26, r23 C U0 sum 2 mul's
+ stq r20, 0(r16) C L0
+ stq r6, 8(r16) C L1
+
+ mulq r11, r19, r6 C U1 #03
+ cmpult r23, r26, r25 C U0 carry from sum
+ stq r7, 16(r16) C L0
+ stq r8, 24(r16) C L1
+
+ umulh r11, r19, r7 C U1 #04
+ bis r31, r31, r31 C L0 st slosh
+ bis r31, r31, r31 C L1 st slosh
+ addq r27, r28, r28 C U0 sum 2 mul's
+
+ mulq r12, r19, r8 C U1 #05
+ cmpult r28, r27, r15 C L0 carry from sum
+ lda r16, 64(r16) C L1 bookkeeping
+ addq r13, r29, r13 C U0 start carry cascade
+
+ umulh r12, r19, r21 C U1 #06
+ beq r13, $fix0 C U0
+$ret0: addq r22, r14, r26 C L0
+ ldq r10, 40(r17) C L1
+
+ mulq r9, r19, r22 C U1 #07
+ beq r26, $fix1 C U0
+$ret1: addq r23, r24, r27 C L0
+ ldq r11, 48(r17) C L1
+
+ umulh r9, r19, r23 C U1 #08
+ beq r27, $fix2 C U0
+$ret2: addq r28, r25, r28 C L0
+ ldq r12, 56(r17) C L1
+
+ mulq r10, r19, r24 C U1 #09
+ beq r28, $fix3 C U0
+$ret3: addq r1, r2, r20 C L0 sum 2 mul's
+ ldq r9, 64(r17) C L1
+
+ addq r3, r4, r2 C L0 #10 2 mul's
+ bis r31, r31, r31 C U1 mul hole
+ lda r17, 64(r17) C L1 bookkeeping
+ cmpult r20, r1, r29 C U0 carry from sum
+
+ umulh r10, r19, r25 C U1 #11
+ cmpult r2, r4, r4 C U0 carry from sum
+ stq r13, -32(r16) C L0
+ stq r26, -24(r16) C L1
+
+ mulq r11, r19, r26 C U1 #12
+ addq r5, r6, r14 C U0 sum 2 mul's
+ stq r27, -16(r16) C L0
+ stq r28, -8(r16) C L1
+
+ umulh r11, r19, r27 C U1 #13
+ bis r31, r31, r31 C L0 st slosh
+ bis r31, r31, r31 C L1 st slosh
+ cmpult r14, r6, r3 C U0 carry from sum
+$L_middle_of_unrolled_loop:
+ mulq r12, r19, r28 C U1 #14
+ addq r7, r3, r5 C L0 eat carry
+ addq r20, r15, r20 C U0 carry cascade
+ ldq r10, 8(r17) C L1
+
+ umulh r12, r19, r1 C U1 #15
+ beq r20, $fix4 C U0
+$ret4: addq r2, r29, r6 C L0
+ ldq r11, 16(r17) C L1
+
+ mulq r9, r19, r2 C U1 #16
+ beq r6, $fix5 C U0
+$ret5: addq r14, r4, r7 C L0
+ ldq r12, 24(r17) C L1
+
+ umulh r9, r19, r3 C U1 #17
+ beq r7, $fix6 C U0
+$ret6: addq r5, r8, r8 C L0 sum 2
+ addq r21, r22, r13 C L1 sum 2 mul's
+
+ mulq r10, r19, r4 C U1 #18
+ addq r23, r24, r22 C L0 sum 2 mul's
+ cmpult r13, r21, r14 C L1 carry from sum
+ bgt r18, $Loop C U0
+C ---------------------------------------------------------------
+$Lend:
+ cmpult r8, r5, r29 C L0 carry from last bunch
+ cmpult r22, r24, r24 C U0 carry from sum
+
+ umulh r10, r19, r5 C U1 #02
+ addq r25, r26, r23 C U0 sum 2 mul's
+ stq r20, 0(r16) C L0
+ stq r6, 8(r16) C L1
+
+ mulq r11, r19, r6 C U1 #03
+ cmpult r23, r26, r25 C U0 carry from sum
+ stq r7, 16(r16) C L0
+ stq r8, 24(r16) C L1
+
+ umulh r11, r19, r7 C U1 #04
+ addq r27, r28, r28 C U0 sum 2 mul's
+
+ mulq r12, r19, r8 C U1 #05
+ cmpult r28, r27, r15 C L0 carry from sum
+ lda r16, 64(r16) C L1 bookkeeping
+ addq r13, r29, r13 C U0 start carry cascade
+
+ umulh r12, r19, r21 C U1 #06
+ beq r13, $fix0c C U0
+$ret0c: addq r22, r14, r26 C L0
+ beq r26, $fix1c C U0
+$ret1c: addq r23, r24, r27 C L0
+ beq r27, $fix2c C U0
+$ret2c: addq r28, r25, r28 C L0
+ beq r28, $fix3c C U0
+$ret3c: addq r1, r2, r20 C L0 sum 2 mul's
+ addq r3, r4, r2 C L0 #10 2 mul's
+ lda r17, 64(r17) C L1 bookkeeping
+ cmpult r20, r1, r29 C U0 carry from sum
+ cmpult r2, r4, r4 C U0 carry from sum
+ stq r13, -32(r16) C L0
+ stq r26, -24(r16) C L1
+ addq r5, r6, r14 C U0 sum 2 mul's
+ stq r27, -16(r16) C L0
+ stq r28, -8(r16) C L1
+ cmpult r14, r6, r3 C U0 carry from sum
+ addq r7, r3, r5 C L0 eat carry
+ addq r20, r15, r20 C U0 carry cascade
+ beq r20, $fix4c C U0
+$ret4c: addq r2, r29, r6 C L0
+ beq r6, $fix5c C U0
+$ret5c: addq r14, r4, r7 C L0
+ beq r7, $fix6c C U0
+$ret6c: addq r5, r8, r8 C L0 sum 2
+ cmpult r8, r5, r29 C L0 carry from last bunch
+ stq r20, 0(r16) C L0
+ stq r6, 8(r16) C L1
+ stq r7, 16(r16) C L0
+ stq r8, 24(r16) C L1
+ addq r29, r21, r0
+
+ ldq r26, 0(r30)
+ ldq r9, 8(r30)
+ ldq r10, 16(r30)
+ ldq r11, 24(r30)
+ ldq r12, 32(r30)
+ ldq r13, 40(r30)
+ ldq r14, 48(r30)
+ ldq r15, 56(r30)
+ ldq r29, 64(r30)
+ lda r30, 224(r30)
+ ret r31, (r26), 1
+
+C $fix0w: bis r14, r29, r14 C join carries
+C br r31, $ret0w
+$fix1w: bis r24, r14, r24 C join carries
+ br r31, $ret1w
+$fix2w: bis r25, r24, r25 C join carries
+ br r31, $ret2w
+$fix3w: bis r15, r25, r15 C join carries
+ br r31, $ret3w
+$fix0: bis r14, r29, r14 C join carries
+ br r31, $ret0
+$fix1: bis r24, r14, r24 C join carries
+ br r31, $ret1
+$fix2: bis r25, r24, r25 C join carries
+ br r31, $ret2
+$fix3: bis r15, r25, r15 C join carries
+ br r31, $ret3
+$fix4: bis r29, r15, r29 C join carries
+ br r31, $ret4
+$fix5: bis r4, r29, r4 C join carries
+ br r31, $ret5
+$fix6: addq r5, r4, r5 C can't carry twice!
+ br r31, $ret6
+$fix0c: bis r14, r29, r14 C join carries
+ br r31, $ret0c
+$fix1c: bis r24, r14, r24 C join carries
+ br r31, $ret1c
+$fix2c: bis r25, r24, r25 C join carries
+ br r31, $ret2c
+$fix3c: bis r15, r25, r15 C join carries
+ br r31, $ret3c
+$fix4c: bis r29, r15, r29 C join carries
+ br r31, $ret4c
+$fix5c: bis r4, r29, r4 C join carries
+ br r31, $ret5c
+$fix6c: addq r5, r4, r5 C can't carry twice!
+ br r31, $ret6c
+
+EPILOGUE(mpn_mul_1)
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/nails/README b/gmp/mpn/alpha/ev6/nails/README
new file mode 100644
index 0000000000..b214ac50ad
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/README
@@ -0,0 +1,65 @@
+Copyright 2002, 2005 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+This directory contains assembly code for nails-enabled 21264. The code is not
+very well optimized.
+
+For addmul_N, as N grows larger, we could make multiple loads together, then do
+about 3.3 i/c. 10 cycles after the last load, we can increase to 4 i/c. This
+would surely allow addmul_4 to run at 2 c/l, but the same should be possible
+also for addmul_3 and perhaps even addmul_2.
+
+
+ current fair best
+Routine c/l unroll c/l unroll c/l i/c
+mul_1 3.25 2.75 2.75 3.273
+addmul_1 4.0 4 3.5 4 14 3.25 3.385
+addmul_2 4.0 1 2.5 2 10 2.25 3.333
+addmul_3 3.0 1 2.33 2 14 2 3.333
+addmul_4 2.5 1 2.125 2 17 2 3.135
+
+addmul_5 2 1 10
+addmul_6 2 1 12
+addmul_7 2 1 14
+
+(The "best" column doesn't account for bookkeeping instructions and
+thereby assumes infinite unrolling.)
+
+Basecase usages:
+
+1 addmul_1
+2 addmul_2
+3 addmul_3
+4 addmul_4
+5 addmul_3 + addmul_2 2.3998
+6 addmul_4 + addmul_2
+7 addmul_4 + addmul_3
diff --git a/gmp/mpn/alpha/ev6/nails/addmul_1.asm b/gmp/mpn/alpha/ev6/nails/addmul_1.asm
new file mode 100644
index 0000000000..711d4e66e5
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/addmul_1.asm
@@ -0,0 +1,396 @@
+dnl Alpha ev6 nails mpn_addmul_1.
+
+dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 42
+C EV5: 18
+C EV6: 4
+
+C TODO
+C * Reroll loop for 3.75 c/l with current 4-way unrolling.
+C * The loop is overscheduled wrt loads and wrt multiplies, in particular
+C umulh.
+C * Use FP loop count and multiple exit points, that would simplify feed-in lp0
+C and would work since the loop structure is really regular.
+
+C INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n', `r18')
+define(`vl0',`r19')
+
+define(`numb_mask',`r6')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r22')
+define(`m3b',`r23')
+
+define(`acc0',`r25')
+define(`acc1',`r27')
+
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r4')
+define(`ul3',`r5')
+
+define(`rl0',`r24')
+define(`rl1',`r24')
+define(`rl2',`r24')
+define(`rl3',`r24')
+
+define(`t0',`r7')
+define(`t1',`r8')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl This declaration is munged by configure
+NAILS_SUPPORT(2-63)
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ sll vl0, NAIL_BITS, vl0
+ lda numb_mask, -1(r31)
+ srl numb_mask, NAIL_BITS, numb_mask
+
+ and n, 3, r25
+ cmpeq r25, 1, r21
+ bne r21, L(1m4)
+ cmpeq r25, 2, r21
+ bne r21, L(2m4)
+ beq r25, L(0m4)
+
+L(3m4): ldq ul3, 0(up)
+ lda n, -4(n)
+ ldq ul0, 8(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq ul1, 16(up)
+ lda up, 24(up)
+ lda rp, -8(rp)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge3)
+
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ addq t0, r31, acc1
+ addq rl3, acc1, acc1
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ addq t0, m3b, acc0
+ srl acc1,NUMB_BITS, t1
+ br r31, L(ta3)
+
+L(ge3): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ addq t0, r31, acc1
+ umulh vl0, ul2, m2b
+ addq rl3, acc1, acc1
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ addq t0, m3b, acc0
+ srl acc1,NUMB_BITS, t1
+ br r31, L(el3)
+
+L(0m4): lda n, -8(n)
+ ldq ul2, 0(up)
+ ldq ul3, 8(up)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge4)
+
+ ldq rl2, 0(rp)
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, r31, acc0
+ umulh vl0, ul1, m1b
+ addq rl2, acc0, acc0
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ addq t0, m2b, acc1
+ srl acc0,NUMB_BITS, t1
+ br r31, L(ta4)
+
+L(ge4): ldq rl2, 0(rp)
+ srl m2a,NAIL_BITS, t0
+ ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ addq t0, r31, acc0
+ umulh vl0, ul1, m1b
+ addq rl2, acc0, acc0
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ addq t0, m2b, acc1
+ srl acc0,NUMB_BITS, t1
+ br r31, L(el0)
+
+L(2m4): lda n, -4(n)
+ ldq ul0, 0(up)
+ ldq ul1, 8(up)
+ lda up, 16(up)
+ lda rp, -16(rp)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge2)
+
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ addq t0, r31, acc0
+ addq rl0, acc0, acc0
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ br r31, L(ta2)
+
+L(ge2): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ addq t0, r31, acc0
+ umulh vl0, ul3, m3b
+ addq rl0, acc0, acc0
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ lda rp, 32(rp)
+ mulq vl0, ul0, m0a
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ bge n, L(el2)
+
+ br r31, L(ta6)
+
+L(1m4): lda n, -4(n)
+ ldq ul1, 0(up)
+ lda up, 8(up)
+ lda rp, -24(rp)
+ bge n, L(ge1)
+
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ addq rl1, t0, acc1
+ and acc1,numb_mask, r28
+ srl acc1,NUMB_BITS, t1
+ stq r28, 24(rp)
+ addq t1, m1b, r0
+ ret r31, (r26), 1
+
+L(ge1): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ lda rp, 32(rp)
+ mulq vl0, ul0, m0a
+ addq t0, r31, acc1
+ umulh vl0, ul0, m0b
+ addq rl1, acc1, acc1
+ ldq rl2, 0(rp)
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, m1b, acc0
+ srl acc1,NUMB_BITS, t1
+ blt n, L(ta5)
+
+L(ge5): ldq ul2, 0(up)
+ br r31, L(el1)
+
+ ALIGN(16)
+L(top): mulq vl0, ul0, m0a C U1
+ addq t0, m0b, acc1 C L0
+ srl acc0,NUMB_BITS, t1 C U0
+ stq r28, -24(rp) C L1
+C
+L(el2): umulh vl0, ul0, m0b C U1
+ and acc0,numb_mask, r28 C L0
+ addq rl1, acc1, acc1 C U0
+ ldq rl2, 0(rp) C L1
+C
+ unop C U1
+ addq t1, acc1, acc1 C L0
+ srl m2a,NAIL_BITS, t0 C U0
+ ldq ul2, 0(up) C L1
+C
+ mulq vl0, ul1, m1a C U1
+ addq t0, m1b, acc0 C L0
+ srl acc1,NUMB_BITS, t1 C U0
+ stq r28, -16(rp) C L1
+C
+L(el1): umulh vl0, ul1, m1b C U1
+ and acc1,numb_mask, r28 C L0
+ addq rl2, acc0, acc0 C U0
+ ldq rl3, 8(rp) C L1
+C
+ lda n, -4(n) C L1
+ addq t1, acc0, acc0 C L0
+ srl m3a,NAIL_BITS, t0 C U0
+ ldq ul3, 8(up) C L1
+C
+ mulq vl0, ul2, m2a C U1
+ addq t0, m2b, acc1 C L0
+ srl acc0,NUMB_BITS, t1 C U0
+ stq r28, -8(rp) C L1
+C
+L(el0): umulh vl0, ul2, m2b C U1
+ and acc0,numb_mask, r28 C L0
+ addq rl3, acc1, acc1 C U0
+ ldq rl0, 16(rp) C L1
+C
+ unop C U1
+ addq t1, acc1, acc1 C L0
+ srl m0a,NAIL_BITS, t0 C U0
+ ldq ul0, 16(up) C L1
+C
+ mulq vl0, ul3, m3a C U1
+ addq t0, m3b, acc0 C L0
+ srl acc1,NUMB_BITS, t1 C U0
+ stq r28, 0(rp) C L1
+C
+L(el3): umulh vl0, ul3, m3b C U1
+ and acc1,numb_mask, r28 C L0
+ addq rl0, acc0, acc0 C U0
+ ldq rl1, 24(rp) C L1
+C
+ unop C U1
+ addq t1, acc0, acc0 C L0
+ srl m1a,NAIL_BITS, t0 C U0
+ ldq ul1, 24(up) C L1
+C
+ lda up, 32(up) C L0
+ unop C U1
+ lda rp, 32(rp) C L1
+ bge n, L(top) C U0
+
+L(end): mulq vl0, ul0, m0a
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ stq r28, -24(rp)
+L(ta6): umulh vl0, ul0, m0b
+ and acc0,numb_mask, r28
+ addq rl1, acc1, acc1
+ ldq rl2, 0(rp)
+ addq t1, acc1, acc1
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, m1b, acc0
+ srl acc1,NUMB_BITS, t1
+ stq r28, -16(rp)
+L(ta5): umulh vl0, ul1, m1b
+ and acc1,numb_mask, r28
+ addq rl2, acc0, acc0
+ ldq rl3, 8(rp)
+ addq t1, acc0, acc0
+ srl m3a,NAIL_BITS, t0
+ addq t0, m2b, acc1
+ srl acc0,NUMB_BITS, t1
+ stq r28, -8(rp)
+ unop
+ ALIGN(16)
+L(ta4): and acc0,numb_mask, r28
+ addq rl3, acc1, acc1
+ ldq rl0, 16(rp)
+ addq t1, acc1, acc1
+ srl m0a,NAIL_BITS, t0
+ addq t0, m3b, acc0
+ srl acc1,NUMB_BITS, t1
+ stq r28, 0(rp)
+ unop
+ ALIGN(16)
+L(ta3): and acc1,numb_mask, r28
+ addq rl0, acc0, acc0
+ ldq rl1, 24(rp)
+ addq t1, acc0, acc0
+ srl m1a,NAIL_BITS, t0
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ stq r28, 8(rp)
+ unop
+ ALIGN(16)
+L(ta2): and acc0,numb_mask, r28
+ addq rl1, acc1, acc1
+ addq t1, acc1, acc1
+ srl acc1,NUMB_BITS, t1
+ stq r28, 16(rp)
+ and acc1,numb_mask, r28
+ addq t1, m1b, r0
+ stq r28, 24(rp)
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/nails/addmul_2.asm b/gmp/mpn/alpha/ev6/nails/addmul_2.asm
new file mode 100644
index 0000000000..6ff6b3ad6b
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/addmul_2.asm
@@ -0,0 +1,146 @@
+dnl Alpha ev6 nails mpn_addmul_2.
+
+dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Runs at 4.0 cycles/limb.
+
+C We could either go for 2-way unrolling over 11 cycles, or 2.75 c/l,
+C or 4-way unrolling over 20 cycles, for 2.5 c/l.
+
+
+C INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n',`r18')
+define(`vp',`r19')
+
+C Useful register aliases
+define(`numb_mask',`r24')
+define(`ulimb',`r25')
+define(`rlimb',`r27')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+
+define(`acc0',`r4')
+define(`acc1',`r5')
+
+define(`v0',`r6')
+define(`v1',`r7')
+
+C Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+C This declaration is munged by configure
+NAILS_SUPPORT(3-63)
+
+ASM_START()
+PROLOGUE(mpn_addmul_2)
+ lda numb_mask,-1(r31)
+ srl numb_mask,NAIL_BITS,numb_mask
+
+ ldq v0, 0(vp)
+ ldq v1, 8(vp)
+
+ bis r31, r31, acc0 C zero acc0
+ sll v0,NAIL_BITS, v0
+ bis r31, r31, acc1 C zero acc1
+ sll v1,NAIL_BITS, v1
+ bis r31, r31, r19
+
+ ldq ulimb, 0(up)
+ lda up, 8(up)
+ mulq v0, ulimb, m0a C U1
+ umulh v0, ulimb, m0b C U1
+ mulq v1, ulimb, m1a C U1
+ umulh v1, ulimb, m1b C U1
+ lda n, -1(n)
+ beq n, L(end) C U0
+
+ ALIGN(16)
+L(top): bis r31, r31, r31 C U1 nop
+ addq r19, acc0, acc0 C U0 propagate nail
+ ldq rlimb, 0(rp) C L0
+ ldq ulimb, 0(up) C L1
+
+ lda rp, 8(rp) C L1
+ srl m0a,NAIL_BITS, r8 C U0
+ lda up, 8(up) C L0
+ mulq v0, ulimb, m0a C U1
+
+ addq r8, acc0, r19 C U0
+ addq m0b, acc1, acc0 C L1
+ umulh v0, ulimb, m0b C U1
+ bis r31, r31, r31 C L0 nop
+
+ addq rlimb, r19, r19 C L1 FINAL PROD-SUM
+ srl m1a,NAIL_BITS, r8 C U0
+ lda n, -1(n) C L0
+ mulq v1, ulimb, m1a C U1
+
+ addq r8, acc0, acc0 C U0
+ bis r31, m1b, acc1 C L1
+ umulh v1, ulimb, m1b C U1
+ and r19,numb_mask, r28 C L0 extract numb part
+
+ unop
+ srl r19,NUMB_BITS, r19 C U1 extract nail part
+ stq r28, -8(rp) C L1
+ bne n, L(top) C U0
+
+L(end): ldq rlimb, 0(rp)
+ addq r19, acc0, acc0 C propagate nail
+ lda rp, 8(rp)
+ srl m0a,NAIL_BITS, r8 C U0
+ addq r8, acc0, r19
+ addq m0b, acc1, acc0
+ addq rlimb, r19, r19
+ srl m1a,NAIL_BITS, r8 C U0
+ addq r8, acc0, acc0
+ bis r31, m1b, acc1
+ and r19,numb_mask, r28 C extract limb
+
+ srl r19,NUMB_BITS, r19 C extract nail
+ stq r28, -8(rp)
+
+ addq r19, acc0, acc0 C propagate nail
+ and acc0,numb_mask, r28
+ stq r28, 0(rp)
+ srl acc0,NUMB_BITS, r19
+ addq r19, acc1, r0
+
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/nails/addmul_3.asm b/gmp/mpn/alpha/ev6/nails/addmul_3.asm
new file mode 100644
index 0000000000..a1ffb680ec
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/addmul_3.asm
@@ -0,0 +1,169 @@
+dnl Alpha ev6 nails mpn_addmul_3.
+
+dnl Copyright 2002, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Runs at 3.0 cycles/limb.
+
+C With 2-way unrolling, we could probably reach 2.25 c/l (3.33 i/c).
+
+
+C INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n',`r18')
+define(`vp',`r19')
+
+C Useful register aliases
+define(`numb_mask',`r24')
+define(`ulimb',`r25')
+define(`rlimb',`r27')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+
+define(`acc0',`r4')
+define(`acc1',`r5')
+define(`acc2',`r22')
+
+define(`v0',`r6')
+define(`v1',`r7')
+define(`v2',`r23')
+
+C Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+C This declaration is munged by configure
+NAILS_SUPPORT(3-63)
+
+ASM_START()
+PROLOGUE(mpn_addmul_3)
+ lda numb_mask,-1(r31)
+ srl numb_mask,NAIL_BITS,numb_mask
+
+ ldq v0, 0(vp)
+ ldq v1, 8(vp)
+ ldq v2, 16(vp)
+
+ bis r31, r31, acc0 C zero acc0
+ sll v0,NAIL_BITS, v0
+ bis r31, r31, acc1 C zero acc1
+ sll v1,NAIL_BITS, v1
+ bis r31, r31, acc2 C zero acc2
+ sll v2,NAIL_BITS, v2
+ bis r31, r31, r19
+
+ ldq ulimb, 0(up)
+ lda up, 8(up)
+ mulq v0, ulimb, m0a C U1
+ umulh v0, ulimb, m0b C U1
+ mulq v1, ulimb, m1a C U1
+ umulh v1, ulimb, m1b C U1
+ lda n, -1(n)
+ mulq v2, ulimb, m2a C U1
+ umulh v2, ulimb, m2b C U1
+ beq n, L(end) C U0
+
+ ALIGN(16)
+L(top): ldq rlimb, 0(rp) C L1
+ ldq ulimb, 0(up) C L0
+ bis r31, r31, r31 C U0 nop
+ addq r19, acc0, acc0 C U1 propagate nail
+
+ lda rp, 8(rp) C L1
+ srl m0a,NAIL_BITS, r8 C U0
+ lda up, 8(up) C L0
+ mulq v0, ulimb, m0a C U1
+
+ addq r8, acc0, r19 C U0
+ addq m0b, acc1, acc0 C L1
+ umulh v0, ulimb, m0b C U1
+ bis r31, r31, r31 C L0 nop
+
+ addq rlimb, r19, r19 C L1
+ srl m1a,NAIL_BITS, r8 C U0
+ bis r31, r31, r31 C L0 nop
+ mulq v1, ulimb, m1a C U1
+
+ addq r8, acc0, acc0 C U0
+ addq m1b, acc2, acc1 C L1
+ umulh v1, ulimb, m1b C U1
+ and r19,numb_mask, r28 C L0 extract numb part
+
+ bis r31, r31, r31 C L1 nop
+ srl m2a,NAIL_BITS, r8 C U0
+ lda n, -1(n) C L0
+ mulq v2, ulimb, m2a C U1
+
+ addq r8, acc1, acc1 C L0
+ bis r31, m2b, acc2 C L1
+ umulh v2, ulimb, m2b C U1
+ srl r19,NUMB_BITS, r19 C U0 extract nail part
+
+ stq r28, -8(rp) C L
+ bne n, L(top) C U0
+
+L(end): ldq rlimb, 0(rp)
+ addq r19, acc0, acc0 C propagate nail
+ lda rp, 8(rp)
+ srl m0a,NAIL_BITS, r8 C U0
+ addq r8, acc0, r19
+ addq m0b, acc1, acc0
+ addq rlimb, r19, r19
+ srl m1a,NAIL_BITS, r8 C U0
+ addq r8, acc0, acc0
+ addq m1b, acc2, acc1
+ and r19,numb_mask, r28 C extract limb
+ srl m2a,NAIL_BITS, r8 C U0
+ addq r8, acc1, acc1
+ bis r31, m2b, acc2
+ srl r19,NUMB_BITS, r19 C extract nail
+ stq r28, -8(rp)
+
+ addq r19, acc0, acc0 C propagate nail
+ and acc0,numb_mask, r28
+ stq r28, 0(rp)
+ srl acc0,NUMB_BITS, r19
+ addq r19, acc1, acc1
+
+ and acc1,numb_mask, r28
+ stq r28, 8(rp)
+ srl acc1,NUMB_BITS, r19
+ addq r19, acc2, m0a
+
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/nails/addmul_4.asm b/gmp/mpn/alpha/ev6/nails/addmul_4.asm
new file mode 100644
index 0000000000..77e02a4316
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/addmul_4.asm
@@ -0,0 +1,210 @@
+dnl Alpha ev6 nails mpn_addmul_4.
+
+dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Runs at 2.5 cycles/limb.
+
+C We should go for 2-way unrolling over 17 cycles, for 2.125 c/l corresponding
+C to 3.24 insn/cycle.
+
+
+C INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n',`r18')
+define(`vp',`r19')
+
+C Useful register aliases
+define(`numb_mask',`r24')
+define(`ulimb',`r25')
+define(`rlimb',`r27')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r12')
+define(`m3b',`r13')
+
+define(`acc0',`r4')
+define(`acc1',`r5')
+define(`acc2',`r22')
+define(`acc3',`r14')
+
+define(`v0',`r6')
+define(`v1',`r7')
+define(`v2',`r23')
+define(`v3',`r15')
+
+C Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+C This declaration is munged by configure
+NAILS_SUPPORT(4-63)
+
+ASM_START()
+PROLOGUE(mpn_addmul_4)
+ lda r30, -240(r30)
+ stq r12, 32(r30)
+ stq r13, 40(r30)
+ stq r14, 48(r30)
+ stq r15, 56(r30)
+
+ lda numb_mask,-1(r31)
+ srl numb_mask,NAIL_BITS,numb_mask
+
+ ldq v0, 0(vp)
+ ldq v1, 8(vp)
+ ldq v2, 16(vp)
+ ldq v3, 24(vp)
+
+ bis r31, r31, acc0 C zero acc0
+ sll v0,NAIL_BITS, v0
+ bis r31, r31, acc1 C zero acc1
+ sll v1,NAIL_BITS, v1
+ bis r31, r31, acc2 C zero acc2
+ sll v2,NAIL_BITS, v2
+ bis r31, r31, acc3 C zero acc3
+ sll v3,NAIL_BITS, v3
+ bis r31, r31, r19
+
+ ldq ulimb, 0(up)
+ lda up, 8(up)
+ mulq v0, ulimb, m0a C U1
+ umulh v0, ulimb, m0b C U1
+ mulq v1, ulimb, m1a C U1
+ umulh v1, ulimb, m1b C U1
+ lda n, -1(n)
+ mulq v2, ulimb, m2a C U1
+ umulh v2, ulimb, m2b C U1
+ mulq v3, ulimb, m3a C U1
+ umulh v3, ulimb, m3b C U1
+ beq n, L(end) C U0
+
+ ALIGN(16)
+L(top): bis r31, r31, r31 C U1 nop
+ ldq rlimb, 0(rp) C L0
+ ldq ulimb, 0(up) C L1
+ addq r19, acc0, acc0 C U0 propagate nail
+
+ bis r31, r31, r31 C L0 nop
+ bis r31, r31, r31 C U1 nop
+ bis r31, r31, r31 C L1 nop
+ bis r31, r31, r31 C U0 nop
+
+ lda rp, 8(rp) C L0
+ srl m0a,NAIL_BITS, r8 C U0
+ lda up, 8(up) C L1
+ mulq v0, ulimb, m0a C U1
+
+ addq r8, acc0, r19 C U0
+ addq m0b, acc1, acc0 C L0
+ umulh v0, ulimb, m0b C U1
+ bis r31, r31, r31 C L1 nop
+
+ addq rlimb, r19, r19 C L0
+ srl m1a,NAIL_BITS, r8 C U0
+ bis r31, r31, r31 C L1 nop
+ mulq v1, ulimb, m1a C U1
+
+ addq r8, acc0, acc0 C U0
+ addq m1b, acc2, acc1 C L0
+ umulh v1, ulimb, m1b C U1
+ and r19,numb_mask, r28 C L1 extract numb part
+
+ bis r31, r31, r31 C L0 nop
+ srl m2a,NAIL_BITS, r8 C U0
+ lda n, -1(n) C L1
+ mulq v2, ulimb, m2a C U1
+
+ addq r8, acc1, acc1 C L1
+ addq m2b, acc3, acc2 C L0
+ umulh v2, ulimb, m2b C U1
+ srl r19,NUMB_BITS, r19 C U0 extract nail part
+
+ bis r31, r31, r31 C L0 nop
+ srl m3a,NAIL_BITS, r8 C U0
+ stq r28, -8(rp) C L1
+ mulq v3, ulimb, m3a C U1
+
+ addq r8, acc2, acc2 C L0
+ bis r31, m3b, acc3 C L1
+ umulh v3, ulimb, m3b C U1
+ bne n, L(top) C U0
+
+L(end): ldq rlimb, 0(rp)
+ addq r19, acc0, acc0 C propagate nail
+ lda rp, 8(rp) C FIXME: DELETE
+ srl m0a,NAIL_BITS, r8 C U0
+ addq r8, acc0, r19
+ addq m0b, acc1, acc0
+ addq rlimb, r19, r19
+ srl m1a,NAIL_BITS, r8 C U0
+ addq r8, acc0, acc0
+ addq m1b, acc2, acc1
+ and r19,numb_mask, r28 C extract limb
+ srl m2a,NAIL_BITS, r8 C U0
+ addq r8, acc1, acc1
+ addq m2b, acc3, acc2
+ srl r19,NUMB_BITS, r19 C extract nail
+ srl m3a,NAIL_BITS, r8 C U0
+ stq r28, -8(rp)
+ addq r8, acc2, acc2
+ bis r31, m3b, acc3
+
+ addq r19, acc0, acc0 C propagate nail
+ and acc0,numb_mask, r28
+ stq r28, 0(rp)
+ srl acc0,NUMB_BITS, r19
+ addq r19, acc1, acc1
+
+ and acc1,numb_mask, r28
+ stq r28, 8(rp)
+ srl acc1,NUMB_BITS, r19
+ addq r19, acc2, acc2
+
+ and acc2,numb_mask, r28
+ stq r28, 16(rp)
+ srl acc2,NUMB_BITS, r19
+ addq r19, acc3, r0
+
+ ldq r12, 32(r30)
+ ldq r13, 40(r30)
+ ldq r14, 48(r30)
+ ldq r15, 56(r30)
+ lda r30, 240(r30)
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/nails/aors_n.asm b/gmp/mpn/alpha/ev6/nails/aors_n.asm
new file mode 100644
index 0000000000..f6586773f5
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/aors_n.asm
@@ -0,0 +1,233 @@
+dnl Alpha ev6 nails mpn_add_n and mpn_sub_n.
+
+dnl Copyright 2002, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+dnl Runs at 2.5 cycles/limb. It would be possible to reach 2.0 cycles/limb
+dnl with 8-way unrolling.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`vp',`r18')
+define(`n',`r19')
+
+define(`rl0',`r0')
+define(`rl1',`r1')
+define(`rl2',`r2')
+define(`rl3',`r3')
+
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r6')
+define(`ul3',`r7')
+
+define(`vl0',`r22')
+define(`vl1',`r23')
+define(`vl2',`r24')
+define(`vl3',`r25')
+
+define(`numb_mask',`r21')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`CYSH',`GMP_NUMB_BITS')
+
+dnl This declaration is munged by configure
+NAILS_SUPPORT(1-63)
+
+ifdef(`OPERATION_add_n', `
+ define(`OP', addq)
+ define(`CYSH',`GMP_NUMB_BITS')
+ define(`func', mpn_add_n)')
+ifdef(`OPERATION_sub_n', `
+ define(`OP', subq)
+ define(`CYSH',63)
+ define(`func', mpn_sub_n)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+ lda numb_mask, -1(r31)
+ srl numb_mask, NAIL_BITS, numb_mask
+ bis r31, r31, r20
+
+ and n, 3, r25
+ lda n, -4(n)
+ beq r25, L(ge4)
+
+L(lp0): ldq ul0, 0(up)
+ lda up, 8(up)
+ ldq vl0, 0(vp)
+ lda vp, 8(vp)
+ lda rp, 8(rp)
+ lda r25, -1(r25)
+ OP ul0, vl0, rl0
+ OP rl0, r20, rl0
+ and rl0, numb_mask, r28
+ stq r28, -8(rp)
+ srl rl0, CYSH, r20
+ bne r25, L(lp0)
+
+ blt n, L(ret)
+
+L(ge4): ldq ul0, 0(up)
+ ldq vl0, 0(vp)
+ ldq ul1, 8(up)
+ ldq vl1, 8(vp)
+ ldq ul2, 16(up)
+ ldq vl2, 16(vp)
+ ldq ul3, 24(up)
+ ldq vl3, 24(vp)
+ lda up, 32(up)
+ lda vp, 32(vp)
+ lda n, -4(n)
+ bge n, L(ge8)
+
+ OP ul0, vl0, rl0 C main-add 0
+ OP rl0, r20, rl0 C cy-add 0
+ OP ul1, vl1, rl1 C main-add 1
+ srl rl0, CYSH, r20 C gen cy 0
+ OP rl1, r20, rl1 C cy-add 1
+ and rl0,numb_mask, r27
+ br r31, L(cj0)
+
+L(ge8): OP ul0, vl0, rl0 C main-add 0
+ ldq ul0, 0(up)
+ ldq vl0, 0(vp)
+ OP rl0, r20, rl0 C cy-add 0
+ OP ul1, vl1, rl1 C main-add 1
+ srl rl0, CYSH, r20 C gen cy 0
+ ldq ul1, 8(up)
+ ldq vl1, 8(vp)
+ OP rl1, r20, rl1 C cy-add 1
+ and rl0,numb_mask, r27
+ OP ul2, vl2, rl2 C main-add 2
+ srl rl1, CYSH, r20 C gen cy 1
+ ldq ul2, 16(up)
+ ldq vl2, 16(vp)
+ OP rl2, r20, rl2 C cy-add 2
+ and rl1,numb_mask, r28
+ stq r27, 0(rp)
+ OP ul3, vl3, rl3 C main-add 3
+ srl rl2, CYSH, r20 C gen cy 2
+ ldq ul3, 24(up)
+ ldq vl3, 24(vp)
+ OP rl3, r20, rl3 C cy-add 3
+ and rl2,numb_mask, r27
+ stq r28, 8(rp)
+ lda rp, 32(rp)
+ lda up, 32(up)
+ lda vp, 32(vp)
+ lda n, -4(n)
+ blt n, L(end)
+
+ ALIGN(32)
+L(top): OP ul0, vl0, rl0 C main-add 0
+ srl rl3, CYSH, r20 C gen cy 3
+ ldq ul0, 0(up)
+ ldq vl0, 0(vp)
+
+ OP rl0, r20, rl0 C cy-add 0
+ and rl3,numb_mask, r28
+ stq r27, -16(rp)
+ bis r31, r31, r31
+
+ OP ul1, vl1, rl1 C main-add 1
+ srl rl0, CYSH, r20 C gen cy 0
+ ldq ul1, 8(up)
+ ldq vl1, 8(vp)
+
+ OP rl1, r20, rl1 C cy-add 1
+ and rl0,numb_mask, r27
+ stq r28, -8(rp)
+ bis r31, r31, r31
+
+ OP ul2, vl2, rl2 C main-add 2
+ srl rl1, CYSH, r20 C gen cy 1
+ ldq ul2, 16(up)
+ ldq vl2, 16(vp)
+
+ OP rl2, r20, rl2 C cy-add 2
+ and rl1,numb_mask, r28
+ stq r27, 0(rp)
+ bis r31, r31, r31
+
+ OP ul3, vl3, rl3 C main-add 3
+ srl rl2, CYSH, r20 C gen cy 2
+ ldq ul3, 24(up)
+ ldq vl3, 24(vp)
+
+ OP rl3, r20, rl3 C cy-add 3
+ and rl2,numb_mask, r27
+ stq r28, 8(rp)
+ bis r31, r31, r31
+
+ bis r31, r31, r31
+ lda n, -4(n)
+ lda up, 32(up)
+ lda vp, 32(vp)
+
+ bis r31, r31, r31
+ bis r31, r31, r31
+ lda rp, 32(rp)
+ bge n, L(top)
+
+L(end): OP ul0, vl0, rl0 C main-add 0
+ srl rl3, CYSH, r20 C gen cy 3
+ OP rl0, r20, rl0 C cy-add 0
+ and rl3,numb_mask, r28
+ stq r27, -16(rp)
+ OP ul1, vl1, rl1 C main-add 1
+ srl rl0, CYSH, r20 C gen cy 0
+ OP rl1, r20, rl1 C cy-add 1
+ and rl0,numb_mask, r27
+ stq r28, -8(rp)
+L(cj0): OP ul2, vl2, rl2 C main-add 2
+ srl rl1, CYSH, r20 C gen cy 1
+ OP rl2, r20, rl2 C cy-add 2
+ and rl1,numb_mask, r28
+ stq r27, 0(rp)
+ OP ul3, vl3, rl3 C main-add 3
+ srl rl2, CYSH, r20 C gen cy 2
+ OP rl3, r20, rl3 C cy-add 3
+ and rl2,numb_mask, r27
+ stq r28, 8(rp)
+
+ srl rl3, CYSH, r20 C gen cy 3
+ and rl3,numb_mask, r28
+ stq r27, 16(rp)
+ stq r28, 24(rp)
+
+L(ret): and r20, 1, r0
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/nails/gmp-mparam.h b/gmp/mpn/alpha/ev6/nails/gmp-mparam.h
new file mode 100644
index 0000000000..7949fe8df8
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/gmp-mparam.h
@@ -0,0 +1,72 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Generated by tuneup.c, 2004-02-07, gcc 3.3 */
+
+#define MUL_TOOM22_THRESHOLD 40
+#define MUL_TOOM33_THRESHOLD 236
+
+#define SQR_BASECASE_THRESHOLD 7 /* karatsuba */
+#define SQR_TOOM2_THRESHOLD 0 /* never sqr_basecase */
+#define SQR_TOOM3_THRESHOLD 120
+
+#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define DIV_DC_THRESHOLD 48
+#define POWM_THRESHOLD 113
+
+#define HGCD_THRESHOLD 78
+#define GCD_ACCEL_THRESHOLD 3
+#define GCD_DC_THRESHOLD 392
+#define JACOBI_BASE_METHOD 1
+
+#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define DIVREM_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define USE_PREINV_DIVREM_1 0 /* no preinv with nails */
+#define USE_PREINV_MOD_1 0 /* no preinv with nails */
+#define DIVREM_2_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always */
+
+#define GET_STR_DC_THRESHOLD 15
+#define GET_STR_PRECOMPUTE_THRESHOLD 24
+#define SET_STR_THRESHOLD 6336
+
+#define MUL_FFT_TABLE { 688, 1440, 3648, 6400, 25600, 0 }
+#define MUL_FFT_MODF_THRESHOLD 488
+#define MUL_FFT_THRESHOLD 3712
+
+#define SQR_FFT_TABLE { 432, 864, 3136, 6400, 25600, 0 }
+#define SQR_FFT_MODF_THRESHOLD 480
+#define SQR_FFT_THRESHOLD 2976
diff --git a/gmp/mpn/alpha/ev6/nails/mul_1.asm b/gmp/mpn/alpha/ev6/nails/mul_1.asm
new file mode 100644
index 0000000000..da2ee3d099
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/mul_1.asm
@@ -0,0 +1,364 @@
+dnl Alpha ev6 nails mpn_mul_1.
+
+dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 42
+C EV5: 18
+C EV6: 3.25
+
+C TODO
+C * Reroll loop for 3.0 c/l with current 4-way unrolling.
+C * The loop is overscheduled wrt loads and wrt multiplies, in particular
+C umulh.
+C * Use FP loop count and multiple exit points, that would simplify feed-in lp0
+C and would work since the loop structure is really regular.
+
+C INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n', `r18')
+define(`vl0',`r19')
+
+define(`numb_mask',`r6')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r22')
+define(`m3b',`r23')
+
+define(`acc0',`r25')
+define(`acc1',`r27')
+
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r4')
+define(`ul3',`r5')
+
+define(`rl0',`r24')
+define(`rl1',`r24')
+define(`rl2',`r24')
+define(`rl3',`r24')
+
+define(`t0',`r7')
+define(`t1',`r8')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl This declaration is munged by configure
+NAILS_SUPPORT(1-63)
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ sll vl0, NAIL_BITS, vl0
+ lda numb_mask, -1(r31)
+ srl numb_mask, NAIL_BITS, numb_mask
+
+ and n, 3, r25
+ cmpeq r25, 1, r21
+ bne r21, L(1m4)
+ cmpeq r25, 2, r21
+ bne r21, L(2m4)
+ beq r25, L(0m4)
+
+L(3m4): ldq ul3, 0(up)
+ lda n, -4(n)
+ ldq ul0, 8(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq ul1, 16(up)
+ lda up, 24(up)
+ lda rp, -8(rp)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge3)
+
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ srl m3a,NAIL_BITS, t0
+ addq t0, r31, acc1
+ srl m0a,NAIL_BITS, t0
+ addq t0, m3b, acc0
+ srl acc1,NUMB_BITS, t1
+ br r31, L(ta3)
+
+L(ge3): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ srl m3a,NAIL_BITS, t0
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ addq t0, r31, acc1
+ umulh vl0, ul2, m2b
+ srl m0a,NAIL_BITS, t0
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ addq t0, m3b, acc0
+ srl acc1,NUMB_BITS, t1
+ br r31, L(el3)
+
+L(0m4): lda n, -8(n)
+ ldq ul2, 0(up)
+ ldq ul3, 8(up)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge4)
+
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, r31, acc0
+ umulh vl0, ul1, m1b
+ srl m3a,NAIL_BITS, t0
+ addq t0, m2b, acc1
+ srl acc0,NUMB_BITS, t1
+ br r31, L(ta4)
+
+L(ge4): srl m2a,NAIL_BITS, t0
+ ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ addq t0, r31, acc0
+ umulh vl0, ul1, m1b
+ srl m3a,NAIL_BITS, t0
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ addq t0, m2b, acc1
+ srl acc0,NUMB_BITS, t1
+ br r31, L(el0)
+
+L(2m4): lda n, -4(n)
+ ldq ul0, 0(up)
+ ldq ul1, 8(up)
+ lda up, 16(up)
+ lda rp, -16(rp)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge2)
+
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ srl m0a,NAIL_BITS, t0
+ addq t0, r31, acc0
+ srl m1a,NAIL_BITS, t0
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ br r31, L(ta2)
+
+L(ge2): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ srl m0a,NAIL_BITS, t0
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ addq t0, r31, acc0
+ umulh vl0, ul3, m3b
+ srl m1a,NAIL_BITS, t0
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ lda rp, 32(rp)
+ mulq vl0, ul0, m0a
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ bge n, L(el2)
+
+ br r31, L(ta6)
+
+L(1m4): lda n, -4(n)
+ ldq ul1, 0(up)
+ lda up, 8(up)
+ lda rp, -24(rp)
+ bge n, L(ge1)
+
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ srl m1a,NAIL_BITS, t0
+ addq t0, r31, acc1
+ and acc1,numb_mask, r28
+ srl acc1,NUMB_BITS, t1
+ stq r28, 24(rp)
+ addq t1, m1b, r0
+ ret r31, (r26), 1
+
+L(ge1): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ srl m1a,NAIL_BITS, t0
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ lda rp, 32(rp)
+ mulq vl0, ul0, m0a
+ addq t0, r31, acc1
+ umulh vl0, ul0, m0b
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, m1b, acc0
+ srl acc1,NUMB_BITS, t1
+ blt n, L(ta5)
+
+L(ge5): ldq ul2, 0(up)
+ br r31, L(el1)
+
+ ALIGN(16)
+L(top): mulq vl0, ul0, m0a C U1
+ addq t0, m0b, acc1 C L0
+ srl acc0,NUMB_BITS, t1 C U0
+ stq r28, -24(rp) C L1
+C
+L(el2): umulh vl0, ul0, m0b C U1
+ and acc0,numb_mask, r28 C L0
+ unop C U0
+ unop C L1
+C
+ unop C U1
+ addq t1, acc1, acc1 C L0
+ srl m2a,NAIL_BITS, t0 C U0
+ ldq ul2, 0(up) C L1
+C
+ mulq vl0, ul1, m1a C U1
+ addq t0, m1b, acc0 C L0
+ srl acc1,NUMB_BITS, t1 C U0
+ stq r28, -16(rp) C L1
+C
+L(el1): umulh vl0, ul1, m1b C U1
+ and acc1,numb_mask, r28 C L0
+ unop C U0
+ lda n, -4(n) C L1
+C
+ unop C U1
+ addq t1, acc0, acc0 C L0
+ srl m3a,NAIL_BITS, t0 C U0
+ ldq ul3, 8(up) C L1
+C
+ mulq vl0, ul2, m2a C U1
+ addq t0, m2b, acc1 C L0
+ srl acc0,NUMB_BITS, t1 C U0
+ stq r28, -8(rp) C L1
+C
+L(el0): umulh vl0, ul2, m2b C U1
+ and acc0,numb_mask, r28 C L0
+ unop C U0
+ unop C L1
+C
+ unop C U1
+ addq t1, acc1, acc1 C L0
+ srl m0a,NAIL_BITS, t0 C U0
+ ldq ul0, 16(up) C L1
+C
+ mulq vl0, ul3, m3a C U1
+ addq t0, m3b, acc0 C L0
+ srl acc1,NUMB_BITS, t1 C U0
+ stq r28, 0(rp) C L1
+C
+L(el3): umulh vl0, ul3, m3b C U1
+ and acc1,numb_mask, r28 C L0
+ unop C U0
+ unop C L1
+C
+ unop C U1
+ addq t1, acc0, acc0 C L0
+ srl m1a,NAIL_BITS, t0 C U0
+ ldq ul1, 24(up) C L1
+C
+ lda up, 32(up) C L0
+ unop C U1
+ lda rp, 32(rp) C L1
+ bge n, L(top) C U0
+
+L(end): mulq vl0, ul0, m0a
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ stq r28, -24(rp)
+L(ta6): umulh vl0, ul0, m0b
+ and acc0,numb_mask, r28
+ addq t1, acc1, acc1
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, m1b, acc0
+ srl acc1,NUMB_BITS, t1
+ stq r28, -16(rp)
+L(ta5): umulh vl0, ul1, m1b
+ and acc1,numb_mask, r28
+ addq t1, acc0, acc0
+ srl m3a,NAIL_BITS, t0
+ addq t0, m2b, acc1
+ srl acc0,NUMB_BITS, t1
+ stq r28, -8(rp)
+ ALIGN(16)
+L(ta4): and acc0,numb_mask, r28
+ addq t1, acc1, acc1
+ srl m0a,NAIL_BITS, t0
+ addq t0, m3b, acc0
+ srl acc1,NUMB_BITS, t1
+ stq r28, 0(rp)
+ unop
+ ALIGN(16)
+L(ta3): and acc1,numb_mask, r28
+ addq t1, acc0, acc0
+ srl m1a,NAIL_BITS, t0
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ stq r28, 8(rp)
+ unop
+ ALIGN(16)
+L(ta2): and acc0,numb_mask, r28
+ addq t1, acc1, acc1
+ srl acc1,NUMB_BITS, t1
+ stq r28, 16(rp)
+ and acc1,numb_mask, r28
+ addq t1, m1b, r0
+ stq r28, 24(rp)
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/nails/submul_1.asm b/gmp/mpn/alpha/ev6/nails/submul_1.asm
new file mode 100644
index 0000000000..f473a59ba8
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/submul_1.asm
@@ -0,0 +1,396 @@
+dnl Alpha ev6 nails mpn_submul_1.
+
+dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 42
+C EV5: 18
+C EV6: 4
+
+C TODO
+C * Reroll loop for 3.75 c/l with current 4-way unrolling.
+C * The loop is overscheduled wrt loads and wrt multiplies, in particular
+C umulh.
+C * Use FP loop count and multiple exit points, that would simplify feed-in lp0
+C and would work since the loop structure is really regular.
+
+C INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n', `r18')
+define(`vl0',`r19')
+
+define(`numb_mask',`r6')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r22')
+define(`m3b',`r23')
+
+define(`acc0',`r25')
+define(`acc1',`r27')
+
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r4')
+define(`ul3',`r5')
+
+define(`rl0',`r24')
+define(`rl1',`r24')
+define(`rl2',`r24')
+define(`rl3',`r24')
+
+define(`t0',`r7')
+define(`t1',`r8')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl This declaration is munged by configure
+NAILS_SUPPORT(2-63)
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ sll vl0, NAIL_BITS, vl0
+ lda numb_mask, -1(r31)
+ srl numb_mask, NAIL_BITS, numb_mask
+
+ and n, 3, r25
+ cmpeq r25, 1, r21
+ bne r21, L(1m4)
+ cmpeq r25, 2, r21
+ bne r21, L(2m4)
+ beq r25, L(0m4)
+
+L(3m4): ldq ul3, 0(up)
+ lda n, -4(n)
+ ldq ul0, 8(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq ul1, 16(up)
+ lda up, 24(up)
+ lda rp, -8(rp)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge3)
+
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ addq t0, r31, acc1
+ subq rl3, acc1, acc1
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ addq t0, m3b, acc0
+ sra acc1,NUMB_BITS, t1
+ br r31, L(ta3)
+
+L(ge3): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ addq t0, r31, acc1
+ umulh vl0, ul2, m2b
+ subq rl3, acc1, acc1
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ addq t0, m3b, acc0
+ sra acc1,NUMB_BITS, t1
+ br r31, L(el3)
+
+L(0m4): lda n, -8(n)
+ ldq ul2, 0(up)
+ ldq ul3, 8(up)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge4)
+
+ ldq rl2, 0(rp)
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, r31, acc0
+ umulh vl0, ul1, m1b
+ subq rl2, acc0, acc0
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ addq t0, m2b, acc1
+ sra acc0,NUMB_BITS, t1
+ br r31, L(ta4)
+
+L(ge4): ldq rl2, 0(rp)
+ srl m2a,NAIL_BITS, t0
+ ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ addq t0, r31, acc0
+ umulh vl0, ul1, m1b
+ subq rl2, acc0, acc0
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ addq t0, m2b, acc1
+ sra acc0,NUMB_BITS, t1
+ br r31, L(el0)
+
+L(2m4): lda n, -4(n)
+ ldq ul0, 0(up)
+ ldq ul1, 8(up)
+ lda up, 16(up)
+ lda rp, -16(rp)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge2)
+
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ addq t0, r31, acc0
+ subq rl0, acc0, acc0
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ addq t0, m0b, acc1
+ sra acc0,NUMB_BITS, t1
+ br r31, L(ta2)
+
+L(ge2): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ addq t0, r31, acc0
+ umulh vl0, ul3, m3b
+ subq rl0, acc0, acc0
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ lda rp, 32(rp)
+ mulq vl0, ul0, m0a
+ addq t0, m0b, acc1
+ sra acc0,NUMB_BITS, t1
+ bge n, L(el2)
+
+ br r31, L(ta6)
+
+L(1m4): lda n, -4(n)
+ ldq ul1, 0(up)
+ lda up, 8(up)
+ lda rp, -24(rp)
+ bge n, L(ge1)
+
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ subq rl1, t0, acc1
+ and acc1,numb_mask, r28
+ sra acc1,NUMB_BITS, t1
+ stq r28, 24(rp)
+ subq m1b, t1, r0
+ ret r31, (r26), 1
+
+L(ge1): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ lda rp, 32(rp)
+ mulq vl0, ul0, m0a
+ addq t0, r31, acc1
+ umulh vl0, ul0, m0b
+ subq rl1, acc1, acc1
+ ldq rl2, 0(rp)
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, m1b, acc0
+ sra acc1,NUMB_BITS, t1
+ blt n, L(ta5)
+
+L(ge5): ldq ul2, 0(up)
+ br r31, L(el1)
+
+ ALIGN(16)
+L(top): mulq vl0, ul0, m0a C U1
+ addq t0, m0b, acc1 C L0
+ sra acc0,NUMB_BITS, t1 C U0
+ stq r28, -24(rp) C L1
+C
+L(el2): umulh vl0, ul0, m0b C U1
+ and acc0,numb_mask, r28 C L0
+ subq rl1, acc1, acc1 C U0
+ ldq rl2, 0(rp) C L1
+C
+ unop C U1
+ addq t1, acc1, acc1 C L0
+ srl m2a,NAIL_BITS, t0 C U0
+ ldq ul2, 0(up) C L1
+C
+ mulq vl0, ul1, m1a C U1
+ addq t0, m1b, acc0 C L0
+ sra acc1,NUMB_BITS, t1 C U0
+ stq r28, -16(rp) C L1
+C
+L(el1): umulh vl0, ul1, m1b C U1
+ and acc1,numb_mask, r28 C L0
+ subq rl2, acc0, acc0 C U0
+ ldq rl3, 8(rp) C L1
+C
+ lda n, -4(n) C L1
+ addq t1, acc0, acc0 C L0
+ srl m3a,NAIL_BITS, t0 C U0
+ ldq ul3, 8(up) C L1
+C
+ mulq vl0, ul2, m2a C U1
+ addq t0, m2b, acc1 C L0
+ sra acc0,NUMB_BITS, t1 C U0
+ stq r28, -8(rp) C L1
+C
+L(el0): umulh vl0, ul2, m2b C U1
+ and acc0,numb_mask, r28 C L0
+ subq rl3, acc1, acc1 C U0
+ ldq rl0, 16(rp) C L1
+C
+ unop C U1
+ addq t1, acc1, acc1 C L0
+ srl m0a,NAIL_BITS, t0 C U0
+ ldq ul0, 16(up) C L1
+C
+ mulq vl0, ul3, m3a C U1
+ addq t0, m3b, acc0 C L0
+ sra acc1,NUMB_BITS, t1 C U0
+ stq r28, 0(rp) C L1
+C
+L(el3): umulh vl0, ul3, m3b C U1
+ and acc1,numb_mask, r28 C L0
+ subq rl0, acc0, acc0 C U0
+ ldq rl1, 24(rp) C L1
+C
+ unop C U1
+ addq t1, acc0, acc0 C L0
+ srl m1a,NAIL_BITS, t0 C U0
+ ldq ul1, 24(up) C L1
+C
+ lda up, 32(up) C L0
+ unop C U1
+ lda rp, 32(rp) C L1
+ bge n, L(top) C U0
+
+L(end): mulq vl0, ul0, m0a
+ addq t0, m0b, acc1
+ sra acc0,NUMB_BITS, t1
+ stq r28, -24(rp)
+L(ta6): umulh vl0, ul0, m0b
+ and acc0,numb_mask, r28
+ subq rl1, acc1, acc1
+ ldq rl2, 0(rp)
+ addq t1, acc1, acc1
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, m1b, acc0
+ sra acc1,NUMB_BITS, t1
+ stq r28, -16(rp)
+L(ta5): umulh vl0, ul1, m1b
+ and acc1,numb_mask, r28
+ subq rl2, acc0, acc0
+ ldq rl3, 8(rp)
+ addq t1, acc0, acc0
+ srl m3a,NAIL_BITS, t0
+ addq t0, m2b, acc1
+ sra acc0,NUMB_BITS, t1
+ stq r28, -8(rp)
+ unop
+ ALIGN(16)
+L(ta4): and acc0,numb_mask, r28
+ subq rl3, acc1, acc1
+ ldq rl0, 16(rp)
+ addq t1, acc1, acc1
+ srl m0a,NAIL_BITS, t0
+ addq t0, m3b, acc0
+ sra acc1,NUMB_BITS, t1
+ stq r28, 0(rp)
+ unop
+ ALIGN(16)
+L(ta3): and acc1,numb_mask, r28
+ subq rl0, acc0, acc0
+ ldq rl1, 24(rp)
+ addq t1, acc0, acc0
+ srl m1a,NAIL_BITS, t0
+ addq t0, m0b, acc1
+ sra acc0,NUMB_BITS, t1
+ stq r28, 8(rp)
+ unop
+ ALIGN(16)
+L(ta2): and acc0,numb_mask, r28
+ subq rl1, acc1, acc1
+ addq t1, acc1, acc1
+ sra acc1,NUMB_BITS, t1
+ stq r28, 16(rp)
+ and acc1,numb_mask, r28
+ subq m1b, t1, r0
+ stq r28, 24(rp)
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/slot.pl b/gmp/mpn/alpha/ev6/slot.pl
new file mode 100755
index 0000000000..a4c8a36882
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/slot.pl
@@ -0,0 +1,318 @@
+#!/usr/bin/perl -w
+
+# Copyright 2000, 2001, 2003-2005, 2011 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of either:
+#
+# * the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+#
+# or
+#
+# * the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any
+# later version.
+#
+# or both in parallel, as here.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+#
+# You should have received copies of the GNU General Public License and the
+# GNU Lesser General Public License along with the GNU MP Library. If not,
+# see https://www.gnu.org/licenses/.
+
+
+# Usage: slot.pl [filename.o]...
+#
+# Run "objdump" to produce a disassembly of the given object file(s) and
+# annotate the output with "U" or "L" slotting which Alpha EV6 will use.
+#
+# When an instruction is E (ie. either U or L), an "eU" or "eL" is shown, as
+# a reminder that it wasn't a fixed requirement that gave the U or L, but
+# the octaword slotting rules.
+#
+# If an instruction is not recognised, that octaword does not get any U/L
+# shown, only lower-case "u", "l" or "e" for the instructions which are
+# known. Add any unknown instructions to %optable below.
+
+
+use strict;
+
+# The U or L which various instructions demand, or E if either.
+#
+my %optable =
+ (
+ 'addq' => 'E',
+ 'and' => 'E',
+ 'andnot' => 'E',
+ 'beq' => 'U',
+ 'bge' => 'U',
+ 'bgt' => 'U',
+ 'bic' => 'E',
+ 'bis' => 'E',
+ 'blt' => 'U',
+ 'bne' => 'U',
+ 'br' => 'L',
+ 'clr' => 'E',
+ 'cmpule' => 'E',
+ 'cmpult' => 'E',
+ 'cmpeq' => 'E',
+ 'cmoveq' => 'E',
+ 'cmovne' => 'E',
+ 'ctpop' => 'U',
+ 'ctlz' => 'U',
+ 'cttz' => 'U',
+ 'extbl' => 'U',
+ 'extlh' => 'U',
+ 'extll' => 'U',
+ 'extqh' => 'U',
+ 'extql' => 'U',
+ 'extwh' => 'U',
+ 'extwl' => 'U',
+ 'jsr' => 'L',
+ 'lda' => 'E',
+ 'ldah' => 'E',
+ 'ldbu' => 'L',
+ 'ldl' => 'L',
+ 'ldq' => 'L',
+ 'ldt' => 'L',
+ 'ret' => 'L',
+ 'mov' => 'E',
+ 'mull' => 'U',
+ 'mulq' => 'U',
+ 'negq' => 'E',
+ 'nop' => 'E',
+ 'not' => 'E',
+ 's8addq' => 'E',
+ 's8subq' => 'E',
+ # 'sextb' => ?
+ # 'sextl' => ?
+ 'sll' => 'U',
+ 'srl' => 'U',
+ 'stq' => 'L',
+ 'subq' => 'E',
+ 'umulh' => 'U',
+ 'unop' => 'E',
+ 'xor' => 'E',
+ );
+
+# Slottings used for a given pattern of U/L/E in an octaword. This is as
+# per the "Ebox Slotting" section of the EV6 hardware reference manual.
+#
+my %slottable =
+ (
+ 'EEEE' => 'ULUL',
+ 'EEEL' => 'ULUL',
+ 'EEEU' => 'ULLU',
+ 'EELE' => 'ULLU',
+ 'EELL' => 'UULL',
+ 'EELU' => 'ULLU',
+ 'EEUE' => 'ULUL',
+ 'EEUL' => 'ULUL',
+ 'EEUU' => 'LLUU',
+ 'ELEE' => 'ULUL',
+ 'ELEL' => 'ULUL',
+ 'ELEU' => 'ULLU',
+ 'ELLE' => 'ULLU',
+ 'ELLL' => 'ULLL',
+ 'ELLU' => 'ULLU',
+ 'ELUE' => 'ULUL',
+ 'ELUL' => 'ULUL',
+
+ 'LLLL' => 'LLLL',
+ 'LLLU' => 'LLLU',
+ 'LLUE' => 'LLUU',
+ 'LLUL' => 'LLUL',
+ 'LLUU' => 'LLUU',
+ 'LUEE' => 'LULU',
+ 'LUEL' => 'LUUL',
+ 'LUEU' => 'LULU',
+ 'LULE' => 'LULU',
+ 'LULL' => 'LULL',
+ 'LULU' => 'LULU',
+ 'LUUE' => 'LUUL',
+ 'LUUL' => 'LUUL',
+ 'LUUU' => 'LUUU',
+ 'UEEE' => 'ULUL',
+ 'UEEL' => 'ULUL',
+ 'UEEU' => 'ULLU',
+
+ 'ELUU' => 'LLUU',
+ 'EUEE' => 'LULU',
+ 'EUEL' => 'LUUL',
+ 'EUEU' => 'LULU',
+ 'EULE' => 'LULU',
+ 'EULL' => 'UULL',
+ 'EULU' => 'LULU',
+ 'EUUE' => 'LUUL',
+ 'EUUL' => 'LUUL',
+ 'EUUU' => 'LUUU',
+ 'LEEE' => 'LULU',
+ 'LEEL' => 'LUUL',
+ 'LEEU' => 'LULU',
+ 'LELE' => 'LULU',
+ 'LELL' => 'LULL',
+ 'LELU' => 'LULU',
+ 'LEUE' => 'LUUL',
+ 'LEUL' => 'LUUL',
+ 'LEUU' => 'LLUU',
+ 'LLEE' => 'LLUU',
+ 'LLEL' => 'LLUL',
+ 'LLEU' => 'LLUU',
+ 'LLLE' => 'LLLU',
+
+ 'UELE' => 'ULLU',
+ 'UELL' => 'UULL',
+ 'UELU' => 'ULLU',
+ 'UEUE' => 'ULUL',
+ 'UEUL' => 'ULUL',
+ 'UEUU' => 'ULUU',
+ 'ULEE' => 'ULUL',
+ 'ULEL' => 'ULUL',
+ 'ULEU' => 'ULLU',
+ 'ULLE' => 'ULLU',
+ 'ULLL' => 'ULLL',
+ 'ULLU' => 'ULLU',
+ 'ULUE' => 'ULUL',
+ 'ULUL' => 'ULUL',
+ 'ULUU' => 'ULUU',
+ 'UUEE' => 'UULL',
+ 'UUEL' => 'UULL',
+ 'UUEU' => 'UULU',
+ 'UULE' => 'UULL',
+ 'UULL' => 'UULL',
+ 'UULU' => 'UULU',
+ 'UUUE' => 'UUUL',
+ 'UUUL' => 'UUUL',
+ 'UUUU' => 'UUUU',
+ );
+
+# Check all combinations of U/L/E are present in %slottable.
+sub coverage {
+ foreach my $a ('U', 'L', 'E') {
+ foreach my $b ('U', 'L', 'E') {
+ foreach my $c ('U', 'L', 'E') {
+ foreach my $d ('U', 'L', 'E') {
+ my $x = $a . $b . $c . $d;
+ if (! defined $slottable{$x}) {
+ print "slottable missing: $x\n"
+ }
+ }
+ }
+ }
+ }
+}
+
+# Certain consistency checks for %slottable.
+sub check {
+ foreach my $x (keys %slottable) {
+ my $a = substr($x,0,1);
+ my $b = substr($x,1,1);
+ my $c = substr($x,2,1);
+ my $d = substr($x,3,1);
+ my $es = ($a eq 'E') + ($b eq 'E') + ($c eq 'E') + ($d eq 'E');
+ my $ls = ($a eq 'L') + ($b eq 'L') + ($c eq 'L') + ($d eq 'L');
+ my $us = ($a eq 'U') + ($b eq 'U') + ($c eq 'U') + ($d eq 'U');
+
+ my $got = $slottable{$x};
+ my $want = $x;
+
+ if ($es == 0) {
+
+ } elsif ($es == 1) {
+ # when only one E, it's mapped to whichever of U or L is otherwise
+ # used the least
+ if ($ls > $us) {
+ $want =~ s/E/U/;
+ } else {
+ $want =~ s/E/L/;
+ }
+ } elsif ($es == 2) {
+ # when two E's and two U, then the E's map to L; vice versa for two E
+ # and two L
+ if ($ls == 2) {
+ $want =~ s/E/U/g;
+ } elsif ($us == 2) {
+ $want =~ s/E/L/g;
+ } else {
+ next;
+ }
+ } elsif ($es == 3) {
+ next;
+
+ } else { # $es == 4
+ next;
+ }
+
+ if ($want ne $got) {
+ print "slottable $x want $want got $got\n";
+ }
+ }
+}
+
+sub disassemble {
+ my ($file) = @_;
+
+ open (IN, "objdump -Srfh $file |") || die "Cannot open pipe from objdump\n";
+
+ my (%pre, %post, %type);
+ while (<IN>) {
+ my $line = $_ . "";
+
+ if ($line =~ /(^[ \t]*[0-9a-f]*([0-9a-f]):[ \t]*[0-9a-f][0-9a-f] [0-9a-f][0-9a-f] [0-9a-f][0-9a-f] [0-9a-f][0-9a-f] )\t(([a-z0-9]+).*)/) {
+ my ($this_pre, $addr, $this_post, $opcode) = ($1, $2, $3, $4);
+
+ my $this_type = $optable{$opcode};
+ if (! defined ($this_type)) { $this_type = ' '; }
+
+ $pre{$addr} = $this_pre;
+ $post{$addr} = $this_post;
+ $type{$addr} = $this_type;
+
+ if ($addr eq 'c') {
+ my %slot = ('0'=>' ', '4'=>' ', '8'=>' ', 'c'=>' ');
+
+ my $str = $type{'c'} . $type{'8'} . $type{'4'} . $type{'0'};
+ $str = $slottable{$str};
+ if (defined $str) {
+ $slot{'c'} = substr($str,0,1);
+ $slot{'8'} = substr($str,1,1);
+ $slot{'4'} = substr($str,2,1);
+ $slot{'0'} = substr($str,3,1);
+ }
+
+ foreach my $i ('0', '4', '8', 'c') {
+ if ($slot{$i} eq $type{$i}) { $type{$i} = ' '; }
+ print $pre{$i}, ' ', lc($type{$i}),$slot{$i}, ' ', $post{$i}, "\n";
+ }
+
+ %pre = ();
+ %type = ();
+ %post = ();
+ }
+ }
+ }
+
+ close IN || die "Error from objdump (or objdump not available)\n";
+}
+
+coverage();
+check();
+
+my @files;
+if ($#ARGV >= 0) {
+ @files = @ARGV;
+} else {
+ die
+}
+
+foreach (@files) {
+ disassemble($_);
+}
diff --git a/gmp/mpn/alpha/ev6/sub_n.asm b/gmp/mpn/alpha/ev6/sub_n.asm
new file mode 100644
index 0000000000..a35ba40d34
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/sub_n.asm
@@ -0,0 +1,283 @@
+dnl Alpha ev6 mpn_sub_n -- Subtract two limb vectors of the same length > 0
+dnl and store difference in a third limb vector.
+
+dnl Copyright 2000, 2003, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: ?
+C EV5: 5.4
+C EV6: 2.125
+
+C INPUT PARAMETERS
+C rp r16
+C up r17
+C vp r18
+C n r19
+C cy r20 (for mpn_add_nc)
+
+C TODO
+C Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
+C Use multi-pronged feed-in.
+C Perform additional micro-tuning
+
+C This code was written in cooperation with ev6 pipeline expert Steve Root.
+
+C Pair loads and stores where possible
+C Store pairs oct-aligned where possible (didn't need it here)
+C Stores are delayed every third cycle
+C Loads and stores are delayed by fills
+C U stays still, put code there where possible (note alternation of U1 and U0)
+C L moves because of loads and stores
+C Note dampers in L to limit damage
+
+C This odd-looking optimization expects that were having random bits in our
+C data, so that a pure zero result is unlikely. so we penalize the unlikely
+C case to help the common case.
+
+define(`u0', `r0') define(`u1', `r3')
+define(`v0', `r1') define(`v1', `r4')
+
+define(`cy0', `r20') define(`cy1', `r21')
+
+MULFUNC_PROLOGUE(mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+PROLOGUE(mpn_sub_nc)
+ br r31, $entry
+EPILOGUE()
+PROLOGUE(mpn_sub_n)
+ bis r31, r31, cy0 C clear carry in
+$entry: cmpult r19, 5, r22 C L1 move counter
+ ldq u1, 0(r17) C L0 get next ones
+ ldq v1, 0(r18) C L1
+ bne r22, $Lsmall
+
+ ldq u0, 8(r17) C L0 get next ones
+ ldq v0, 8(r18) C L1
+ subq u1, v1, r5 C U0 sub two data
+
+ cmpult u1, v1, r23 C U0 did it borrow
+ ldq u1, 16(r17) C L0 get next ones
+ ldq v1, 16(r18) C L1
+
+ subq u0, v0, r8 C U1 sub two data
+ subq r5, cy0, r24 C U0 borrow in
+
+ cmpult u0, v0, r22 C U1 did it borrow
+ beq r5, $fix5f C U0 fix exact zero
+$ret5f: ldq u0, 24(r17) C L0 get next ones
+ ldq v0, 24(r18) C L1
+
+ subq r8, r23, r25 C U1 borrow from last
+ subq u1, v1, r7 C U0 sub two data
+
+ beq r8, $fix6f C U1 fix exact zero
+$ret6f: cmpult u1, v1, r23 C U0 did it borrow
+ ldq u1, 32(r17) C L0 get next ones
+ ldq v1, 32(r18) C L1
+
+ lda r17, 40(r17) C L0 move pointer
+ lda r18, 40(r18) C L1 move pointer
+
+ lda r16, -8(r16)
+ lda r19, -13(r19) C L1 move counter
+ blt r19, $Lend C U1 loop control
+
+
+C Main loop. 8-way unrolled.
+ ALIGN(16)
+$Loop: subq u0, v0, r2 C U1 sub two data
+ stq r24, 8(r16) C L0 put an answer
+ subq r7, r22, r24 C U0 borrow from last
+ stq r25, 16(r16) C L1 pair
+
+ cmpult u0, v0, cy1 C U1 did it borrow
+ beq r7, $fix7 C U0 fix exact 0
+$ret7: ldq u0, 0(r17) C L0 get next ones
+ ldq v0, 0(r18) C L1
+
+ bis r31, r31, r31 C L damp out
+ subq r2, r23, r25 C U1 borrow from last
+ bis r31, r31, r31 C L moves in L !
+ subq u1, v1, r5 C U0 sub two data
+
+ beq r2, $fix0 C U1 fix exact zero
+$ret0: cmpult u1, v1, cy0 C U0 did it borrow
+ ldq u1, 8(r17) C L0 get next ones
+ ldq v1, 8(r18) C L1
+
+ subq u0, v0, r8 C U1 sub two data
+ stq r24, 24(r16) C L0 store pair
+ subq r5, cy1, r24 C U0 borrow from last
+ stq r25, 32(r16) C L1
+
+ cmpult u0, v0, r22 C U1 did it borrow
+ beq r5, $fix1 C U0 fix exact zero
+$ret1: ldq u0, 16(r17) C L0 get next ones
+ ldq v0, 16(r18) C L1
+
+ lda r16, 64(r16) C L0 move pointer
+ subq r8, cy0, r25 C U1 borrow from last
+ lda r19, -8(r19) C L1 move counter
+ subq u1, v1, r7 C U0 sub two data
+
+ beq r8, $fix2 C U1 fix exact zero
+$ret2: cmpult u1, v1, r23 C U0 did it borrow
+ ldq u1, 24(r17) C L0 get next ones
+ ldq v1, 24(r18) C L1
+
+ subq u0, v0, r2 C U1 sub two data
+ stq r24, -24(r16) C L0 put an answer
+ subq r7, r22, r24 C U0 borrow from last
+ stq r25, -16(r16) C L1 pair
+
+ cmpult u0, v0, cy1 C U1 did it borrow
+ beq r7, $fix3 C U0 fix exact 0
+$ret3: ldq u0, 32(r17) C L0 get next ones
+ ldq v0, 32(r18) C L1
+
+ bis r31, r31, r31 C L damp out
+ subq r2, r23, r25 C U1 borrow from last
+ bis r31, r31, r31 C L moves in L !
+ subq u1, v1, r5 C U0 sub two data
+
+ beq r2, $fix4 C U1 fix exact zero
+$ret4: cmpult u1, v1, cy0 C U0 did it borrow
+ ldq u1, 40(r17) C L0 get next ones
+ ldq v1, 40(r18) C L1
+
+ subq u0, v0, r8 C U1 sub two data
+ stq r24, -8(r16) C L0 store pair
+ subq r5, cy1, r24 C U0 borrow from last
+ stq r25, 0(r16) C L1
+
+ cmpult u0, v0, r22 C U1 did it borrow
+ beq r5, $fix5 C U0 fix exact zero
+$ret5: ldq u0, 48(r17) C L0 get next ones
+ ldq v0, 48(r18) C L1
+
+ ldl r31, 256(r17) C L0 prefetch
+ subq r8, cy0, r25 C U1 borrow from last
+ ldl r31, 256(r18) C L1 prefetch
+ subq u1, v1, r7 C U0 sub two data
+
+ beq r8, $fix6 C U1 fix exact zero
+$ret6: cmpult u1, v1, r23 C U0 did it borrow
+ ldq u1, 56(r17) C L0 get next ones
+ ldq v1, 56(r18) C L1
+
+ lda r17, 64(r17) C L0 move pointer
+ bis r31, r31, r31 C U
+ lda r18, 64(r18) C L1 move pointer
+ bge r19, $Loop C U1 loop control
+C ==== main loop end
+
+$Lend: subq u0, v0, r2 C U1 sub two data
+ stq r24, 8(r16) C L0 put an answer
+ subq r7, r22, r24 C U0 borrow from last
+ stq r25, 16(r16) C L1 pair
+ cmpult u0, v0, cy1 C U1 did it borrow
+ beq r7, $fix7c C U0 fix exact 0
+$ret7c: subq r2, r23, r25 C U1 borrow from last
+ subq u1, v1, r5 C U0 sub two data
+ beq r2, $fix0c C U1 fix exact zero
+$ret0c: cmpult u1, v1, cy0 C U0 did it borrow
+ stq r24, 24(r16) C L0 store pair
+ subq r5, cy1, r24 C U0 borrow from last
+ stq r25, 32(r16) C L1
+ beq r5, $fix1c C U0 fix exact zero
+$ret1c: stq r24, 40(r16) C L0 put an answer
+ lda r16, 48(r16) C L0 move pointer
+
+ lda r19, 8(r19)
+ beq r19, $Lret
+
+ ldq u1, 0(r17)
+ ldq v1, 0(r18)
+$Lsmall:
+ lda r19, -1(r19)
+ beq r19, $Lend0
+
+ ALIGN(8)
+$Loop0: subq u1, v1, r2 C main sub
+ cmpult u1, v1, r8 C compute bw from last sub
+ ldq u1, 8(r17)
+ ldq v1, 8(r18)
+ subq r2, cy0, r5 C borrow sub
+ lda r17, 8(r17)
+ lda r18, 8(r18)
+ stq r5, 0(r16)
+ cmpult r2, cy0, cy0 C compute bw from last sub
+ lda r19, -1(r19) C decr loop cnt
+ bis r8, cy0, cy0 C combine bw from the two subs
+ lda r16, 8(r16)
+ bne r19, $Loop0
+$Lend0: subq u1, v1, r2 C main sub
+ subq r2, cy0, r5 C borrow sub
+ cmpult u1, v1, r8 C compute bw from last sub
+ cmpult r2, cy0, cy0 C compute bw from last sub
+ stq r5, 0(r16)
+ bis r8, cy0, r0 C combine bw from the two subs
+ ret r31,(r26),1
+
+ ALIGN(8)
+$Lret: lda r0, 0(cy0) C copy borrow into return register
+ ret r31,(r26),1
+
+$fix5f: bis r23, cy0, r23 C bring forward borrow
+ br r31, $ret5f
+$fix6f: bis r22, r23, r22 C bring forward borrow
+ br r31, $ret6f
+$fix0: bis cy1, r23, cy1 C bring forward borrow
+ br r31, $ret0
+$fix1: bis cy0, cy1, cy0 C bring forward borrow
+ br r31, $ret1
+$fix2: bis r22, cy0, r22 C bring forward borrow
+ br r31, $ret2
+$fix3: bis r23, r22, r23 C bring forward borrow
+ br r31, $ret3
+$fix4: bis cy1, r23, cy1 C bring forward borrow
+ br r31, $ret4
+$fix5: bis cy1, cy0, cy0 C bring forward borrow
+ br r31, $ret5
+$fix6: bis r22, cy0, r22 C bring forward borrow
+ br r31, $ret6
+$fix7: bis r23, r22, r23 C bring forward borrow
+ br r31, $ret7
+$fix0c: bis cy1, r23, cy1 C bring forward borrow
+ br r31, $ret0c
+$fix1c: bis cy0, cy1, cy0 C bring forward borrow
+ br r31, $ret1c
+$fix7c: bis r23, r22, r23 C bring forward borrow
+ br r31, $ret7c
+
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev67/gcd_1.asm b/gmp/mpn/alpha/ev67/gcd_1.asm
new file mode 100644
index 0000000000..55fa7d3673
--- /dev/null
+++ b/gmp/mpn/alpha/ev67/gcd_1.asm
@@ -0,0 +1,145 @@
+dnl Alpha ev67 mpn_gcd_1 -- Nx1 greatest common divisor.
+
+dnl Copyright 2003, 2004 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C ev67: 3.4 cycles/bitpair for 1x1 part
+
+
+C mp_limb_t mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y);
+C
+C In the 1x1 part, the algorithm is to change x,y to abs(x-y),min(x,y) and
+C strip trailing zeros from abs(x-y) to maintain x and y both odd.
+C
+C The trailing zeros are calculated from just x-y, since in twos-complement
+C there's the same number of trailing zeros on d or -d. This means the cttz
+C runs in parallel with abs(x-y).
+C
+C The loop takes 5 cycles, and at 0.68 iterations per bit for two N-bit
+C operands with this algorithm gives the measured 3.4 c/l.
+C
+C The slottings shown are for SVR4 style systems, Unicos differs in the
+C initial gp setup and the LEA.
+C
+C Enhancement:
+C
+C On the jsr, !lituse_jsr! (when available) would allow the linker to relax
+C it to a bsr, but probably only in a static binary. Plain "jsr foo" gives
+C the right object code for relaxation, and ought to be available
+C everywhere, but we prefer to schedule the GOT ldq (LEA) back earlier, for
+C the usual case of running in a shared library.
+C
+C bsr could perhaps be used explicitly anyway. We should be able to assume
+C modexact is in the same module as us (ie. shared library or mainline).
+C Would there be any worries about the size of the displacement? Could
+C always put modexact and gcd_1 in the same .o to be certain.
+
+ASM_START()
+PROLOGUE(mpn_gcd_1, gp)
+
+ C r16 xp
+ C r17 size
+ C r18 y
+
+ C ldah C l
+ C lda C u
+
+ ldq r0, 0(r16) C L x = xp[0]
+ lda r30, -32(r30) C u alloc stack
+
+ LEA( r27, mpn_modexact_1c_odd) C L modexact addr, ldq (gp)
+ stq r10, 16(r30) C L save r10
+ cttz r18, r10 C U0 y twos
+ cmpeq r17, 1, r5 C u test size==1
+
+ stq r9, 8(r30) C L save r9
+ clr r19 C u zero c for modexact
+ unop
+ unop
+
+ cttz r0, r6 C U0 x twos
+ stq r26, 0(r30) C L save ra
+
+ srl r18, r10, r18 C U y odd
+
+ mov r18, r9 C l hold y across call
+
+ cmpult r6, r10, r2 C u test x_twos < y_twos
+
+ cmovne r2, r6, r10 C l common_twos = min(x_twos,y_twos)
+ bne r5, L(one) C U no modexact if size==1
+ jsr r26, (r27), mpn_modexact_1c_odd C L0
+
+ LDGP( r29, 0(r26)) C u,l ldah,lda
+ cttz r0, r6 C U0 new x twos
+ ldq r26, 0(r30) C L restore ra
+
+L(one):
+ mov r9, r1 C u y
+ ldq r9, 8(r30) C L restore r9
+ mov r10, r2 C u common twos
+ ldq r10, 16(r30) C L restore r10
+
+ lda r30, 32(r30) C l free stack
+ beq r0, L(done) C U return y if x%y==0
+
+ srl r0, r6, r0 C U x odd
+ unop
+
+ ALIGN(16)
+L(top):
+ C r0 x
+ C r1 y
+ C r2 common twos, for use at end
+
+ subq r0, r1, r7 C l0 d = x - y
+ cmpult r0, r1, r16 C u0 test x >= y
+
+ subq r1, r0, r4 C l0 new_x = y - x
+ cttz r7, r8 C U0 d twos
+
+ cmoveq r16, r7, r4 C l0 new_x = d if x>=y
+ cmovne r16, r0, r1 C u0 y = x if x<y
+ unop C l \ force cmoveq into l0
+ unop C u /
+
+ C C cmoveq2 L0, cmovne2 U0
+
+ srl r4, r8, r0 C U0 x = new_x >> twos
+ bne r7, L(top) C U1 stop when d==0
+
+
+L(done):
+ sll r1, r2, r0 C U0 return y << common_twos
+ ret r31, (r26), 1 C L0
+
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev67/hamdist.asm b/gmp/mpn/alpha/ev67/hamdist.asm
new file mode 100644
index 0000000000..4b13e9f14f
--- /dev/null
+++ b/gmp/mpn/alpha/ev67/hamdist.asm
@@ -0,0 +1,111 @@
+dnl Alpha ev67 mpn_hamdist -- mpn hamming distance.
+
+dnl Copyright 2003, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C ev67: 2.5 cycles/limb
+
+
+C unsigned long mpn_hamdist (mp_srcptr xp, mp_srcptr yp, mp_size_t size);
+C
+C The hope was for 2.0 c/l here, but that isn't achieved. We're limited by
+C renaming register shortage. Since we need 5 instructions per limb, further
+C unrolling could approach 1.5 c/l.
+C
+C The main loop processes two limbs from each operand on each iteration. An
+C odd size is handled by processing xp[0]^yp[0] at the start. If the size
+C is even that result is discarded, and is repeated by the main loop.
+C
+
+ASM_START()
+PROLOGUE(mpn_hamdist)
+
+ C r16 xp
+ C r17 yp
+ C r18 size
+
+ ldq r1, 0(r16) C L0 xp[0]
+ ldq r2, 0(r17) C L1 yp[0]
+ and r18, 1, r8 C U1 1 if size odd
+ srl r18, 1, r18 C U0 size, limb pairs
+
+ clr r0 C L0 initial total
+ s8addq r8, r17, r17 C U1 yp++ if size odd
+ s8addq r8, r16, r16 C L1 xp++ if size odd
+ clr r6 C U0 dummy initial xor 1
+
+ xor r1, r2, r5 C L initial xor 0
+ beq r18, L(one) C U if size==1
+
+ cmoveq r8, r31, r5 C L discard first limb if size even
+ unop C U
+
+
+ ALIGN(16)
+L(top):
+ C r0 total accumulating
+ C r7 xor 0
+ C r8 xor 1
+ C r16 xp, incrementing
+ C r17 yp, incrementing
+ C r18 size, limb pairs, decrementing
+
+ ldq r1, 0(r16) C L
+ ldq r2, 0(r17) C L
+ ctpop r5, r7 C U0
+ lda r16, 16(r16) C U
+
+ ldq r3, -8(r16) C L
+ ldq r4, 8(r17) C L
+ ctpop r6, r8 C U0
+ lda r17, 16(r17) C U
+
+ ldl r31, 256(r16) C L prefetch
+ ldl r31, 256(r17) C L prefetch
+ xor r1, r2, r5 C U
+ lda r18, -1(r18) C U
+
+ xor r3, r4, r6 C U
+ addq r0, r7, r0 C L
+ addq r0, r8, r0 C L
+ bne r18, L(top) C U
+
+
+ ctpop r6, r8 C U0
+ addq r0, r8, r0 C L
+L(one):
+ ctpop r5, r7 C U0
+ addq r0, r7, r0 C L
+
+ ret r31, (r26), 1 C L0
+
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev67/popcount.asm b/gmp/mpn/alpha/ev67/popcount.asm
new file mode 100644
index 0000000000..049c1cd239
--- /dev/null
+++ b/gmp/mpn/alpha/ev67/popcount.asm
@@ -0,0 +1,101 @@
+dnl Alpha ev67 mpn_popcount -- mpn bit population count.
+
+dnl Copyright 2003, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C ev67: 1.5 cycles/limb
+
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C
+C This schedule seems necessary for the full 1.5 c/l, the IQ can't quite hide
+C all latencies, the addq's must be deferred to the next iteration.
+C
+C Since we need just 3 instructions per limb, further unrolling could approach
+C 1.0 c/l.
+C
+C The main loop processes two limbs at a time. An odd size is handled by
+C processing src[0] at the start. If the size is even that result is
+C discarded, and src[0] is repeated by the main loop.
+C
+
+ASM_START()
+PROLOGUE(mpn_popcount)
+
+ C r16 src
+ C r17 size
+
+ ldq r0, 0(r16) C L0 src[0]
+ and r17, 1, r8 C U1 1 if size odd
+ srl r17, 1, r17 C U0 size, limb pairs
+
+ s8addq r8, r16, r16 C L1 src++ if size odd
+ ctpop r0, r0 C U0
+ beq r17, L(one) C U1 if size==1
+
+ cmoveq r8, r31, r0 C L discard first limb if size even
+ clr r3 C L
+
+ clr r4 C L
+ unop C U
+ unop C L
+ unop C U
+
+
+ ALIGN(16)
+L(top):
+ C r0 total accumulating
+ C r3 pop 0
+ C r4 pop 1
+ C r16 src, incrementing
+ C r17 size, decrementing
+
+ ldq r1, 0(r16) C L
+ ldq r2, 8(r16) C L
+ lda r16, 16(r16) C U
+ lda r17, -1(r17) C U
+
+ addq r0, r3, r0 C L
+ addq r0, r4, r0 C L
+ ctpop r1, r3 C U0
+ ctpop r2, r4 C U0
+
+ ldl r31, 512(r16) C L prefetch
+ bne r17, L(top) C U
+
+
+ addq r0, r3, r0 C L
+ addq r0, r4, r0 C U
+L(one):
+ ret r31, (r26), 1 C L0
+
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/gmp-mparam.h b/gmp/mpn/alpha/gmp-mparam.h
new file mode 100644
index 0000000000..b850bd24b5
--- /dev/null
+++ b/gmp/mpn/alpha/gmp-mparam.h
@@ -0,0 +1,86 @@
+/* Alpha EV4 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004, 2005, 2009 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+
+/* 175MHz 21064 */
+
+/* Generated by tuneup.c, 2009-01-15, gcc 3.2 */
+
+#define MUL_TOOM22_THRESHOLD 12
+#define MUL_TOOM33_THRESHOLD 69
+#define MUL_TOOM44_THRESHOLD 88
+
+#define SQR_BASECASE_THRESHOLD 4
+#define SQR_TOOM2_THRESHOLD 20
+#define SQR_TOOM3_THRESHOLD 62
+#define SQR_TOOM4_THRESHOLD 155
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 40
+#define MULLO_MUL_N_THRESHOLD 202
+
+#define DIV_SB_PREINV_THRESHOLD 0 /* preinv always */
+#define DIV_DC_THRESHOLD 38
+#define POWM_THRESHOLD 60
+
+#define MATRIX22_STRASSEN_THRESHOLD 17
+#define HGCD_THRESHOLD 80
+#define GCD_DC_THRESHOLD 237
+#define GCDEXT_DC_THRESHOLD 198
+#define JACOBI_BASE_METHOD 2
+
+#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */
+#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1_THRESHOLD 2
+#define MOD_1_2_THRESHOLD 9
+#define MOD_1_4_THRESHOLD 20
+#define USE_PREINV_DIVREM_1 1 /* preinv always */
+#define USE_PREINV_MOD_1 1 /* preinv always */
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always */
+
+#define GET_STR_DC_THRESHOLD 20
+#define GET_STR_PRECOMPUTE_THRESHOLD 37
+#define SET_STR_DC_THRESHOLD 746
+#define SET_STR_PRECOMPUTE_THRESHOLD 1332
+
+#define MUL_FFT_TABLE { 240, 480, 1344, 2304, 5120, 20480, 49152, 0 }
+#define MUL_FFT_MODF_THRESHOLD 232
+#define MUL_FFT_THRESHOLD 1664
+
+#define SQR_FFT_TABLE { 240, 480, 1216, 2304, 5120, 12288, 49152, 0 }
+#define SQR_FFT_MODF_THRESHOLD 232
+#define SQR_FFT_THRESHOLD 1408
diff --git a/gmp/mpn/alpha/invert_limb.asm b/gmp/mpn/alpha/invert_limb.asm
new file mode 100644
index 0000000000..afc010f58c
--- /dev/null
+++ b/gmp/mpn/alpha/invert_limb.asm
@@ -0,0 +1,95 @@
+dnl Alpha mpn_invert_limb -- Invert a normalized limb.
+
+dnl Copyright 1996, 2000-2003, 2007, 2011, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: ?
+C EV5: 137/140 (with BWX/without BWX)
+C EV6: 71/72 (with BWX/without BWX)
+
+C This was compiler generated, with minimal manual edits. Surely several
+C cycles could be cut with some thought.
+
+ASM_START()
+PROLOGUE(mpn_invert_limb,gp)
+ LEA( r2, approx_tab)
+ srl r16, 54, r1
+ srl r16, 24, r4
+ and r16, 1, r5
+ bic r1, 1, r7
+ lda r4, 1(r4)
+ srl r16, 1, r3
+ addq r7, r2, r1
+ifelse(bwx_available_p,1,`
+ ldwu r0, -512(r1)
+',`
+ ldq_u r0, -512(r1)
+ extwl r0, r7, r0
+')
+ addq r3, r5, r3
+ mull r0, r0, r1
+ sll r0, 11, r0
+ mulq r1, r4, r1
+ srl r1, 40, r1
+ subq r0, r1, r0
+ lda r0, -1(r0)
+ mulq r0, r0, r2
+ sll r0, 60, r1
+ sll r0, 13, r0
+ mulq r2, r4, r2
+ subq r1, r2, r1
+ srl r1, 47, r1
+ addq r0, r1, r0
+ mulq r0, r3, r3
+ srl r0, 1, r1
+ cmoveq r5, 0, r1
+ subq r1, r3, r1
+ umulh r1, r0, r3
+ sll r0, 31, r0
+ srl r3, 1, r1
+ addq r0, r1, r0
+ mulq r0, r16, r2
+ umulh r0, r16, r3
+ addq r2, r16, r1
+ addq r3, r16, r16
+ cmpult r1, r2, r1
+ addq r16, r1, r3
+ subq r0, r3, r0
+ ret r31, (r26), 1
+EPILOGUE()
+DATASTART(approx_tab,8)
+forloop(i,256,512-1,dnl
+` .word eval(0x7fd00/i)
+')dnl
+ SIZE(approx_tab, 512)
+ TYPE(approx_tab, object)
+DATAEND()
+ASM_END()
diff --git a/gmp/mpn/alpha/lshift.asm b/gmp/mpn/alpha/lshift.asm
new file mode 100644
index 0000000000..c62a856aea
--- /dev/null
+++ b/gmp/mpn/alpha/lshift.asm
@@ -0,0 +1,182 @@
+dnl Alpha mpn_lshift -- Shift a number left.
+
+dnl Copyright 1994, 1995, 2000, 2003, 2009 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: ?
+C EV5: 3.25
+C EV6: 1.75
+
+C INPUT PARAMETERS
+C rp r16
+C up r17
+C n r18
+C cnt r19
+
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ s8addq r18,r17,r17 C make r17 point at end of s1
+ ldq r4,-8(r17) C load first limb
+ subq r31,r19,r20
+ s8addq r18,r16,r16 C make r16 point at end of RES
+ subq r18,1,r18
+ and r18,4-1,r28 C number of limbs in first loop
+ srl r4,r20,r0 C compute function result
+
+ beq r28,L(L0)
+ subq r18,r28,r18
+
+ ALIGN(8)
+L(top0):
+ ldq r3,-16(r17)
+ subq r16,8,r16
+ sll r4,r19,r5
+ subq r17,8,r17
+ subq r28,1,r28
+ srl r3,r20,r6
+ bis r3,r3,r4
+ bis r5,r6,r8
+ stq r8,0(r16)
+ bne r28,L(top0)
+
+L(L0): sll r4,r19,r24
+ beq r18,L(end)
+C warm up phase 1
+ ldq r1,-16(r17)
+ subq r18,4,r18
+ ldq r2,-24(r17)
+ ldq r3,-32(r17)
+ ldq r4,-40(r17)
+C warm up phase 2
+ srl r1,r20,r7
+ sll r1,r19,r21
+ srl r2,r20,r8
+ beq r18,L(end1)
+ ldq r1,-48(r17)
+ sll r2,r19,r22
+ ldq r2,-56(r17)
+ srl r3,r20,r5
+ bis r7,r24,r7
+ sll r3,r19,r23
+ bis r8,r21,r8
+ srl r4,r20,r6
+ ldq r3,-64(r17)
+ sll r4,r19,r24
+ ldq r4,-72(r17)
+ subq r18,4,r18
+ beq r18,L(end2)
+ ALIGN(16)
+C main loop
+L(top): stq r7,-8(r16)
+ bis r5,r22,r5
+ stq r8,-16(r16)
+ bis r6,r23,r6
+
+ srl r1,r20,r7
+ subq r18,4,r18
+ sll r1,r19,r21
+ unop C ldq r31,-96(r17)
+
+ srl r2,r20,r8
+ ldq r1,-80(r17)
+ sll r2,r19,r22
+ ldq r2,-88(r17)
+
+ stq r5,-24(r16)
+ bis r7,r24,r7
+ stq r6,-32(r16)
+ bis r8,r21,r8
+
+ srl r3,r20,r5
+ unop C ldq r31,-96(r17)
+ sll r3,r19,r23
+ subq r16,32,r16
+
+ srl r4,r20,r6
+ ldq r3,-96(r17)
+ sll r4,r19,r24
+ ldq r4,-104(r17)
+
+ subq r17,32,r17
+ bne r18,L(top)
+C cool down phase 2/1
+L(end2):
+ stq r7,-8(r16)
+ bis r5,r22,r5
+ stq r8,-16(r16)
+ bis r6,r23,r6
+ srl r1,r20,r7
+ sll r1,r19,r21
+ srl r2,r20,r8
+ sll r2,r19,r22
+ stq r5,-24(r16)
+ bis r7,r24,r7
+ stq r6,-32(r16)
+ bis r8,r21,r8
+ srl r3,r20,r5
+ sll r3,r19,r23
+ srl r4,r20,r6
+ sll r4,r19,r24
+C cool down phase 2/2
+ stq r7,-40(r16)
+ bis r5,r22,r5
+ stq r8,-48(r16)
+ bis r6,r23,r6
+ stq r5,-56(r16)
+ stq r6,-64(r16)
+C cool down phase 2/3
+ stq r24,-72(r16)
+ ret r31,(r26),1
+
+C cool down phase 1/1
+L(end1):
+ sll r2,r19,r22
+ srl r3,r20,r5
+ bis r7,r24,r7
+ sll r3,r19,r23
+ bis r8,r21,r8
+ srl r4,r20,r6
+ sll r4,r19,r24
+C cool down phase 1/2
+ stq r7,-8(r16)
+ bis r5,r22,r5
+ stq r8,-16(r16)
+ bis r6,r23,r6
+ stq r5,-24(r16)
+ stq r6,-32(r16)
+ stq r24,-40(r16)
+ ret r31,(r26),1
+
+L(end): stq r24,-8(r16)
+ ret r31,(r26),1
+EPILOGUE(mpn_lshift)
+ASM_END()
diff --git a/gmp/mpn/alpha/mod_34lsub1.asm b/gmp/mpn/alpha/mod_34lsub1.asm
new file mode 100644
index 0000000000..1b03b637d8
--- /dev/null
+++ b/gmp/mpn/alpha/mod_34lsub1.asm
@@ -0,0 +1,164 @@
+dnl Alpha mpn_mod_34lsub1.
+
+dnl Copyright 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 4 (?)
+C EV5: 2.67
+C EV6: 1.67
+
+
+dnl INPUT PARAMETERS
+dnl up r16
+dnl n r17
+
+define(`l0',`r18')
+define(`l1',`r19')
+define(`l2',`r20')
+define(`a0',`r21')
+define(`a1',`r22')
+define(`a2',`r23')
+define(`c0',`r24')
+define(`c1',`r5')
+define(`c2',`r6')
+
+ASM_START()
+PROLOGUE(mpn_mod_34lsub1)
+ bis r31, r31, c0
+ bis r31, r31, c1
+ bis r31, r31, c2
+
+ lda r17, -3(r17)
+ bge r17, $L_3_or_more
+ bis r31, r31, a0
+ bis r31, r31, a1
+ bis r31, r31, a2
+ br r31, $L_012
+
+$L_3_or_more:
+ ldq a0, 0(r16)
+ ldq a1, 8(r16)
+ ldq a2, 16(r16)
+ lda r16, 24(r16)
+ lda r17, -3(r17)
+ blt r17, $L_012
+
+$L_6_or_more:
+ ldq l0, 0(r16)
+ ldq l1, 8(r16)
+ ldq l2, 16(r16)
+ addq l0, a0, a0
+
+ lda r16, 24(r16)
+ lda r17, -3(r17)
+ blt r17, $L_end
+
+ ALIGN(16)
+C Main loop
+$L_9_or_more:
+$Loop: cmpult a0, l0, r0
+ ldq l0, 0(r16)
+ addq r0, c0, c0
+ addq l1, a1, a1
+ cmpult a1, l1, r0
+ ldq l1, 8(r16)
+ addq r0, c1, c1
+ addq l2, a2, a2
+ cmpult a2, l2, r0
+ ldq l2, 16(r16)
+ addq r0, c2, c2
+ addq l0, a0, a0
+ lda r16, 24(r16)
+ lda r17, -3(r17)
+ bge r17, $Loop
+
+$L_end: cmpult a0, l0, r0
+ addq r0, c0, c0
+ addq l1, a1, a1
+ cmpult a1, l1, r0
+ addq r0, c1, c1
+ addq l2, a2, a2
+ cmpult a2, l2, r0
+ addq r0, c2, c2
+
+C Handle the last (n mod 3) limbs
+$L_012: lda r17, 2(r17)
+ blt r17, $L_0
+ ldq l0, 0(r16)
+ addq l0, a0, a0
+ cmpult a0, l0, r0
+ addq r0, c0, c0
+ beq r17, $L_0
+ ldq l1, 8(r16)
+ addq l1, a1, a1
+ cmpult a1, l1, r0
+ addq r0, c1, c1
+
+C Align and sum our 3 main accumulators and 3 carry accumulators
+$L_0: srl a0, 48, r2
+ srl a1, 32, r4
+ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
+` insll a1, 2, r1', C (a1 & 0xffffffff) << 16
+` zapnot a1, 15, r25
+ sll r25, 16, r1')
+ zapnot a0, 63, r0 C a0 & 0xffffffffffff
+ srl a2, 16, a1
+ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
+` inswl a2, 4, r3', C (a2 & 0xffff) << 32
+` zapnot a2, 3, r25
+ sll r25, 32, r3')
+ addq r1, r4, r1
+ addq r0, r2, r0
+ srl c0, 32, a2
+ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
+` insll c0, 2, r4', C (c0 & 0xffffffff) << 16
+` zapnot c0, 15, r25
+ sll r25, 16, r4')
+ addq r0, r1, r0
+ addq r3, a1, r3
+ addq r0, r3, r0
+ srl c1, 16, c0
+ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
+` inswl c1, 4, r2', C (c1 & 0xffff) << 32
+` zapnot c1, 3, r25
+ sll r25, 32, r2')
+ addq r4, a2, r4
+C srl c2, 48, r3 C This will be 0 in practise
+ zapnot c2, 63, r1 C r1 = c2 & 0xffffffffffff
+ addq r0, r4, r0
+ addq r2, c0, r2
+ addq r0, r2, r0
+C addq r1, r3, r1
+ addq r0, r1, r0
+
+ ret r31, (r26), 1
+EPILOGUE(mpn_mod_34lsub1)
+ASM_END()
diff --git a/gmp/mpn/alpha/mode1o.asm b/gmp/mpn/alpha/mode1o.asm
new file mode 100644
index 0000000000..96dccc73ee
--- /dev/null
+++ b/gmp/mpn/alpha/mode1o.asm
@@ -0,0 +1,209 @@
+dnl Alpha mpn_modexact_1c_odd -- mpn exact remainder
+
+dnl Copyright 2003, 2004 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C EV4: 47
+C EV5: 30
+C EV6: 15
+
+
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
+C mp_limb_t c)
+C
+C This code follows the "alternate" code in mpn/generic/mode1o.c,
+C eliminating cbit+climb from the dependent chain. This leaves,
+C
+C ev4 ev5 ev6
+C 1 3 1 subq y = x - h
+C 23 13 7 mulq q = y * inverse
+C 23 14 7 umulh h = high (q * d)
+C -- -- --
+C 47 30 15
+C
+C In each case, the load latency, loop control, and extra carry bit handling
+C hide under the multiply latencies. Those latencies are long enough that
+C we don't need to worry about alignment or pairing to squeeze out
+C performance.
+C
+C For the first limb, some of the loop code is broken out and scheduled back
+C since it can be done earlier.
+C
+C - The first ldq src[0] is near the start of the routine, for maximum
+C time from memory.
+C
+C - The subq y=x-climb can be done without waiting for the inverse.
+C
+C - The mulq y*inverse is replicated after the final subq for the inverse,
+C instead of branching to the mulq in the main loop. On ev4 a branch
+C there would cost cycles, but we can hide them under the mulq latency.
+C
+C For the last limb, high<divisor is tested and if that's true a subtract
+C and addback is done, as per the main mpn/generic/mode1o.c code. This is a
+C data-dependent branch, but we're waiting for umulh so any penalty should
+C hide there. The multiplies saved would be worth the cost anyway.
+C
+C Enhancements:
+C
+C For size==1, a plain division (done bitwise say) might be faster than
+C calculating an inverse, the latter taking about 130 cycles on ev4 or 70 on
+C ev5. A call to gcc __remqu might be a possibility.
+
+ASM_START()
+PROLOGUE(mpn_modexact_1c_odd,gp)
+
+ C r16 src
+ C r17 size
+ C r18 d
+ C r19 c
+
+ LEA(r0, binvert_limb_table)
+ srl r18, 1, r20 C d >> 1
+
+ and r20, 127, r20 C idx = d>>1 & 0x7F
+
+ addq r0, r20, r21 C table + idx
+
+ifelse(bwx_available_p,1,
+` ldbu r20, 0(r21) C table[idx], inverse 8 bits
+',`
+ ldq_u r20, 0(r21) C table[idx] qword
+ extbl r20, r21, r20 C table[idx], inverse 8 bits
+')
+
+ mull r20, r20, r7 C i*i
+ addq r20, r20, r20 C 2*i
+
+ ldq r2, 0(r16) C x = s = src[0]
+ lda r17, -1(r17) C size--
+ clr r0 C initial cbit=0
+
+ mull r7, r18, r7 C i*i*d
+
+ subq r20, r7, r20 C 2*i-i*i*d, inverse 16 bits
+
+ mull r20, r20, r7 C i*i
+ addq r20, r20, r20 C 2*i
+
+ mull r7, r18, r7 C i*i*d
+
+ subq r20, r7, r20 C 2*i-i*i*d, inverse 32 bits
+
+ mulq r20, r20, r7 C i*i
+ addq r20, r20, r20 C 2*i
+
+ mulq r7, r18, r7 C i*i*d
+ subq r2, r19, r3 C y = x - climb
+
+ subq r20, r7, r20 C inv = 2*i-i*i*d, inverse 64 bits
+
+ASSERT(r7, C should have d*inv==1 mod 2^64
+` mulq r18, r20, r7
+ cmpeq r7, 1, r7')
+
+ mulq r3, r20, r4 C first q = y * inv
+
+ beq r17, L(one) C if size==1
+ br L(entry)
+
+
+L(top):
+ C r0 cbit
+ C r16 src, incrementing
+ C r17 size, decrementing
+ C r18 d
+ C r19 climb
+ C r20 inv
+
+ ldq r1, 0(r16) C s = src[i]
+ subq r1, r0, r2 C x = s - cbit
+ cmpult r1, r0, r0 C new cbit = s < cbit
+
+ subq r2, r19, r3 C y = x - climb
+
+ mulq r3, r20, r4 C q = y * inv
+L(entry):
+ cmpult r2, r19, r5 C cbit2 = x < climb
+ addq r5, r0, r0 C cbit += cbit2
+ lda r16, 8(r16) C src++
+ lda r17, -1(r17) C size--
+
+ umulh r4, r18, r19 C climb = q * d
+ bne r17, L(top) C while 2 or more limbs left
+
+
+
+ C r0 cbit
+ C r18 d
+ C r19 climb
+ C r20 inv
+
+ ldq r1, 0(r16) C s = src[size-1] high limb
+
+ cmpult r1, r18, r2 C test high<divisor
+ bne r2, L(skip) C skip if so
+
+ C can't skip a division, repeat loop code
+
+ subq r1, r0, r2 C x = s - cbit
+ cmpult r1, r0, r0 C new cbit = s < cbit
+
+ subq r2, r19, r3 C y = x - climb
+
+ mulq r3, r20, r4 C q = y * inv
+L(one):
+ cmpult r2, r19, r5 C cbit2 = x < climb
+ addq r5, r0, r0 C cbit += cbit2
+
+ umulh r4, r18, r19 C climb = q * d
+
+ addq r19, r0, r0 C return climb + cbit
+ ret r31, (r26), 1
+
+
+ ALIGN(8)
+L(skip):
+ C with high<divisor, the final step can be just (cbit+climb)-s and
+ C an addback of d if that underflows
+
+ addq r19, r0, r19 C c = climb + cbit
+
+ subq r19, r1, r2 C c - s
+ cmpult r19, r1, r3 C c < s
+
+ addq r2, r18, r0 C return c-s + divisor
+
+ cmoveq r3, r2, r0 C return c-s if no underflow
+ ret r31, (r26), 1
+
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/mul_1.asm b/gmp/mpn/alpha/mul_1.asm
new file mode 100644
index 0000000000..a7cdbcf8eb
--- /dev/null
+++ b/gmp/mpn/alpha/mul_1.asm
@@ -0,0 +1,102 @@
+dnl Alpha mpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl the result in a second limb vector.
+
+dnl Copyright 1992, 1994, 1995, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 42
+C EV5: 18
+C EV6: 7
+
+C INPUT PARAMETERS
+C rp r16
+C up r17
+C n r18
+C vl r19
+C cl r20
+
+
+ASM_START()
+PROLOGUE(mpn_mul_1c)
+ ldq r2,0(r17) C r2 = s1_limb
+ lda r18,-1(r18) C size--
+ mulq r2,r19,r3 C r3 = prod_low
+ umulh r2,r19,r4 C r4 = prod_high
+ beq r18,$Le1c C jump if size was == 1
+ ldq r2,8(r17) C r2 = s1_limb
+ lda r18,-1(r18) C size--
+ addq r3,r20,r3 C r3 = cy_limb + cl
+ stq r3,0(r16)
+ cmpult r3,r20,r0 C r0 = carry from (cy_limb + cl)
+ bne r18,$Loop C jump if size was == 2
+ br r31,$Le2
+$Le1c: addq r3,r20,r3 C r3 = cy_limb + cl
+ cmpult r3,r20,r0 C r0 = carry from (cy_limb + cl)
+$Le1: stq r3,0(r16)
+ addq r4,r0,r0
+ ret r31,(r26),1
+EPILOGUE(mpn_mul_1c)
+
+PROLOGUE(mpn_mul_1)
+ ldq r2,0(r17) C r2 = s1_limb
+ lda r18,-1(r18) C size--
+ mulq r2,r19,r3 C r3 = prod_low
+ bic r31,r31,r0 C clear cy_limb
+ umulh r2,r19,r4 C r4 = prod_high
+ beq r18,$Le1 C jump if size was == 1
+ ldq r2,8(r17) C r2 = s1_limb
+ lda r18,-1(r18) C size--
+ stq r3,0(r16)
+ beq r18,$Le2 C jump if size was == 2
+
+ ALIGN(8)
+$Loop: mulq r2,r19,r3 C r3 = prod_low
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ lda r18,-1(r18) C size--
+ umulh r2,r19,r4 C r4 = prod_high
+ ldq r2,16(r17) C r2 = s1_limb
+ lda r17,8(r17) C s1_ptr++
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ stq r3,8(r16)
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ lda r16,8(r16) C res_ptr++
+ bne r18,$Loop
+
+$Le2: mulq r2,r19,r3 C r3 = prod_low
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ umulh r2,r19,r4 C r4 = prod_high
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ stq r3,8(r16)
+ addq r4,r0,r0 C cy_limb = prod_high + cy
+ ret r31,(r26),1
+EPILOGUE(mpn_mul_1)
+ASM_END()
diff --git a/gmp/mpn/alpha/rshift.asm b/gmp/mpn/alpha/rshift.asm
new file mode 100644
index 0000000000..6e1e214558
--- /dev/null
+++ b/gmp/mpn/alpha/rshift.asm
@@ -0,0 +1,180 @@
+dnl Alpha mpn_rshift -- Shift a number right.
+
+dnl Copyright 1994, 1995, 2000, 2009 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: ?
+C EV5: 3.25
+C EV6: 1.75
+
+C INPUT PARAMETERS
+C rp r16
+C up r17
+C n r18
+C cnt r19
+
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ ldq r4,0(r17) C load first limb
+ subq r31,r19,r20
+ subq r18,1,r18
+ and r18,4-1,r28 C number of limbs in first loop
+ sll r4,r20,r0 C compute function result
+
+ beq r28,L(L0)
+ subq r18,r28,r18
+
+ ALIGN(8)
+L(top0):
+ ldq r3,8(r17)
+ addq r16,8,r16
+ srl r4,r19,r5
+ addq r17,8,r17
+ subq r28,1,r28
+ sll r3,r20,r6
+ bis r3,r3,r4
+ bis r5,r6,r8
+ stq r8,-8(r16)
+ bne r28,L(top0)
+
+L(L0): srl r4,r19,r24
+ beq r18,L(end)
+C warm up phase 1
+ ldq r1,8(r17)
+ subq r18,4,r18
+ ldq r2,16(r17)
+ ldq r3,24(r17)
+ ldq r4,32(r17)
+C warm up phase 2
+ sll r1,r20,r7
+ srl r1,r19,r21
+ sll r2,r20,r8
+ beq r18,L(end1)
+ ldq r1,40(r17)
+ srl r2,r19,r22
+ ldq r2,48(r17)
+ sll r3,r20,r5
+ bis r7,r24,r7
+ srl r3,r19,r23
+ bis r8,r21,r8
+ sll r4,r20,r6
+ ldq r3,56(r17)
+ srl r4,r19,r24
+ ldq r4,64(r17)
+ subq r18,4,r18
+ beq r18,L(end2)
+ ALIGN(16)
+C main loop
+L(top): stq r7,0(r16)
+ bis r5,r22,r5
+ stq r8,8(r16)
+ bis r6,r23,r6
+
+ sll r1,r20,r7
+ subq r18,4,r18
+ srl r1,r19,r21
+ unop C ldq r31,-96(r17)
+
+ sll r2,r20,r8
+ ldq r1,72(r17)
+ srl r2,r19,r22
+ ldq r2,80(r17)
+
+ stq r5,16(r16)
+ bis r7,r24,r7
+ stq r6,24(r16)
+ bis r8,r21,r8
+
+ sll r3,r20,r5
+ unop C ldq r31,-96(r17)
+ srl r3,r19,r23
+ addq r16,32,r16
+
+ sll r4,r20,r6
+ ldq r3,88(r17)
+ srl r4,r19,r24
+ ldq r4,96(r17)
+
+ addq r17,32,r17
+ bne r18,L(top)
+C cool down phase 2/1
+L(end2):
+ stq r7,0(r16)
+ bis r5,r22,r5
+ stq r8,8(r16)
+ bis r6,r23,r6
+ sll r1,r20,r7
+ srl r1,r19,r21
+ sll r2,r20,r8
+ srl r2,r19,r22
+ stq r5,16(r16)
+ bis r7,r24,r7
+ stq r6,24(r16)
+ bis r8,r21,r8
+ sll r3,r20,r5
+ srl r3,r19,r23
+ sll r4,r20,r6
+ srl r4,r19,r24
+C cool down phase 2/2
+ stq r7,32(r16)
+ bis r5,r22,r5
+ stq r8,40(r16)
+ bis r6,r23,r6
+ stq r5,48(r16)
+ stq r6,56(r16)
+C cool down phase 2/3
+ stq r24,64(r16)
+ ret r31,(r26),1
+
+C cool down phase 1/1
+L(end1):
+ srl r2,r19,r22
+ sll r3,r20,r5
+ bis r7,r24,r7
+ srl r3,r19,r23
+ bis r8,r21,r8
+ sll r4,r20,r6
+ srl r4,r19,r24
+C cool down phase 1/2
+ stq r7,0(r16)
+ bis r5,r22,r5
+ stq r8,8(r16)
+ bis r6,r23,r6
+ stq r5,16(r16)
+ stq r6,24(r16)
+ stq r24,32(r16)
+ ret r31,(r26),1
+
+L(end): stq r24,0(r16)
+ ret r31,(r26),1
+EPILOGUE(mpn_rshift)
+ASM_END()
diff --git a/gmp/mpn/alpha/sec_tabselect.asm b/gmp/mpn/alpha/sec_tabselect.asm
new file mode 100644
index 0000000000..679b16926e
--- /dev/null
+++ b/gmp/mpn/alpha/sec_tabselect.asm
@@ -0,0 +1,137 @@
+dnl Alpha mpn_sec_tabselect.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: ?
+C EV5: 2.25
+C EV6: 1.64
+
+define(`rp', `r16')
+define(`tp', `r17')
+define(`n', `r18')
+define(`nents', `r19')
+define(`which', `r20')
+
+define(`i', `r21')
+define(`j', `r22')
+define(`stride', `r23')
+define(`mask', `r24')
+define(`k', `r25')
+
+
+ASM_START()
+PROLOGUE(mpn_sec_tabselect)
+ subq n, 4, j C outer loop induction variable
+
+ blt j, L(outer_end)
+L(outer_top):
+ mov tp, r8
+ lda r0, 0(r31)
+ lda r1, 0(r31)
+ lda r2, 0(r31)
+ lda r3, 0(r31)
+ subq j, 4, j C outer loop induction variable
+ subq nents, which, k
+ mov nents, i
+
+ ALIGN(16)
+L(top): ldq r4, 0(tp)
+ ldq r5, 8(tp)
+ cmpeq k, i, mask
+ subq i, 1, i
+ subq r31, mask, mask
+ ldq r6, 16(tp)
+ ldq r7, 24(tp)
+ and r4, mask, r4
+ and r5, mask, r5
+ or r0, r4, r0
+ or r1, r5, r1
+ and r6, mask, r6
+ and r7, mask, r7
+ or r2, r6, r2
+ or r3, r7, r3
+ s8addq n, tp, tp
+ bne i, L(top)
+
+ stq r0, 0(rp)
+ stq r1, 8(rp)
+ stq r2, 16(rp)
+ stq r3, 24(rp)
+ addq r8, 32, tp
+ addq rp, 32, rp
+ bge j, L(outer_top)
+L(outer_end):
+
+ and n, 2, r0
+ beq r0, L(b0x)
+L(b1x): mov tp, r8
+ lda r0, 0(r31)
+ lda r1, 0(r31)
+ subq nents, which, k
+ mov nents, i
+ ALIGN(16)
+L(tp2): ldq r4, 0(tp)
+ ldq r5, 8(tp)
+ cmpeq k, i, mask
+ subq i, 1, i
+ subq r31, mask, mask
+ and r4, mask, r4
+ and r5, mask, r5
+ or r0, r4, r0
+ or r1, r5, r1
+ s8addq n, tp, tp
+ bne i, L(tp2)
+ stq r0, 0(rp)
+ stq r1, 8(rp)
+ addq r8, 16, tp
+ addq rp, 16, rp
+
+L(b0x): and n, 1, r0
+ beq r0, L(b00)
+L(b01): lda r0, 0(r31)
+ subq nents, which, k
+ mov nents, i
+ ALIGN(16)
+L(tp1): ldq r4, 0(tp)
+ cmpeq k, i, mask
+ subq i, 1, i
+ subq r31, mask, mask
+ and r4, mask, r4
+ or r0, r4, r0
+ s8addq n, tp, tp
+ bne i, L(tp1)
+ stq r0, 0(rp)
+
+L(b00): ret r31, (r26), 1
+EPILOGUE()
diff --git a/gmp/mpn/alpha/sqr_diag_addlsh1.asm b/gmp/mpn/alpha/sqr_diag_addlsh1.asm
new file mode 100644
index 0000000000..ee219ef7e8
--- /dev/null
+++ b/gmp/mpn/alpha/sqr_diag_addlsh1.asm
@@ -0,0 +1,93 @@
+dnl Alpha mpn_sqr_diag_addlsh1.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: ?
+C EV5: 10.2
+C EV6: 4.5
+
+C Ideally, one-way code could run at 9 c/l (limited by mulq+umulh) on ev5 and
+C about 3.75 c/l on ev6. Two-way code could run at about 3.25 c/l on ev6.
+
+C Algorithm: We allow ourselves to propagate carry to a product high word
+C without worrying for carry out, since (B-1)^2 = B^2-2B+1 has a high word of
+C B-2, i.e, will not spill. We propagate carry similarly to a product low word
+C since the problem value B-1 is a quadratic non-residue mod B, but our
+C products are squares.
+
+define(`rp', `r16')
+define(`tp', `r17')
+define(`up', `r18')
+define(`n', `r19')
+
+ASM_START()
+PROLOGUE(mpn_sqr_diag_addlsh1)
+ ldq r0, 0(up)
+ bis r31, r31, r21
+ bis r31, r31, r3
+ mulq r0, r0, r7
+ stq r7, 0(rp)
+ umulh r0, r0, r6
+ lda n, -1(n)
+
+ ALIGN(16)
+L(top): ldq r0, 8(up)
+ lda up, 8(up)
+ ldq r8, 0(tp)
+ ldq r20, 8(tp)
+ mulq r0, r0, r7
+ lda tp, 16(tp)
+ sll r8, 1, r23
+ srl r8, 63, r22
+ or r21, r23, r23
+ sll r20, 1, r24
+ addq r3, r6, r6 C cannot carry per comment above
+ or r22, r24, r24
+ addq r23, r6, r21
+ umulh r0, r0, r6
+ cmpult r21, r23, r1
+ addq r1, r7, r7 C cannot carry per comment above
+ stq r21, 8(rp)
+ addq r24, r7, r22
+ stq r22, 16(rp)
+ lda n, -1(n)
+ cmpult r22, r7, r3
+ srl r20, 63, r21
+ lda rp, 16(rp)
+ bne n, L(top)
+
+ addq r3, r6, r6 C cannot carry per comment above
+ addq r21, r6, r21
+ stq r21, 8(rp)
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/sub_n.asm b/gmp/mpn/alpha/sub_n.asm
new file mode 100644
index 0000000000..1bb72263f8
--- /dev/null
+++ b/gmp/mpn/alpha/sub_n.asm
@@ -0,0 +1,164 @@
+dnl Alpha mpn_sub_n -- Subtract two limb vectors of the same length > 0
+dnl and store difference in a third limb vector.
+
+dnl Copyright 1995, 1999, 2000, 2005, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: ?
+C EV5: 4.75
+C EV6: 3
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_sub_nc)
+ bis r31,r20,r25
+ br L(com)
+EPILOGUE()
+PROLOGUE(mpn_sub_n)
+ bis r31,r31,r25 C clear cy
+L(com): subq r19,4,r19 C decr loop cnt
+ blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop
+C Start software pipeline for 1st loop
+ ldq r0,0(r18)
+ ldq r4,0(r17)
+ ldq r1,8(r18)
+ ldq r5,8(r17)
+ addq r17,32,r17 C update s1_ptr
+ subq r4,r0,r28 C 1st main subtract
+ ldq r2,16(r18)
+ subq r28,r25,r20 C 1st carry subtract
+ ldq r3,24(r18)
+ cmpult r4,r0,r8 C compute cy from last subtract
+ ldq r6,-16(r17)
+ cmpult r28,r25,r25 C compute cy from last subtract
+ ldq r7,-8(r17)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ subq r19,4,r19 C decr loop cnt
+ subq r5,r1,r28 C 2nd main subtract
+ addq r18,32,r18 C update s2_ptr
+ subq r28,r25,r21 C 2nd carry subtract
+ cmpult r5,r1,r8 C compute cy from last subtract
+ blt r19,$Lend1 C if less than 4 limbs remain, jump
+C 1st loop handles groups of 4 limbs in a software pipeline
+ ALIGN(16)
+$Loop: cmpult r28,r25,r25 C compute cy from last subtract
+ ldq r0,0(r18)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ ldq r1,8(r18)
+ subq r6,r2,r28 C 3rd main subtract
+ ldq r4,0(r17)
+ subq r28,r25,r22 C 3rd carry subtract
+ ldq r5,8(r17)
+ cmpult r6,r2,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ stq r21,8(r16)
+ subq r7,r3,r28 C 4th main subtract
+ subq r28,r25,r23 C 4th carry subtract
+ cmpult r7,r3,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ addq r17,32,r17 C update s1_ptr
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,32,r16 C update res_ptr
+ subq r4,r0,r28 C 1st main subtract
+ ldq r2,16(r18)
+ subq r28,r25,r20 C 1st carry subtract
+ ldq r3,24(r18)
+ cmpult r4,r0,r8 C compute cy from last subtract
+ ldq r6,-16(r17)
+ cmpult r28,r25,r25 C compute cy from last subtract
+ ldq r7,-8(r17)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ subq r19,4,r19 C decr loop cnt
+ stq r22,-16(r16)
+ subq r5,r1,r28 C 2nd main subtract
+ stq r23,-8(r16)
+ subq r28,r25,r21 C 2nd carry subtract
+ addq r18,32,r18 C update s2_ptr
+ cmpult r5,r1,r8 C compute cy from last subtract
+ bge r19,$Loop
+C Finish software pipeline for 1st loop
+$Lend1: cmpult r28,r25,r25 C compute cy from last subtract
+ bis r8,r25,r25 C combine cy from the two subtracts
+ subq r6,r2,r28 C cy add
+ subq r28,r25,r22 C 3rd main subtract
+ cmpult r6,r2,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ stq r21,8(r16)
+ subq r7,r3,r28 C cy add
+ subq r28,r25,r23 C 4th main subtract
+ cmpult r7,r3,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,32,r16 C update res_ptr
+ stq r22,-16(r16)
+ stq r23,-8(r16)
+$Lend2: addq r19,4,r19 C restore loop cnt
+ beq r19,$Lret
+C Start software pipeline for 2nd loop
+ ldq r0,0(r18)
+ ldq r4,0(r17)
+ subq r19,1,r19
+ beq r19,$Lend0
+C 2nd loop handles remaining 1-3 limbs
+ ALIGN(16)
+$Loop0: subq r4,r0,r28 C main subtract
+ cmpult r4,r0,r8 C compute cy from last subtract
+ ldq r0,8(r18)
+ ldq r4,8(r17)
+ subq r28,r25,r20 C carry subtract
+ addq r18,8,r18
+ addq r17,8,r17
+ stq r20,0(r16)
+ cmpult r28,r25,r25 C compute cy from last subtract
+ subq r19,1,r19 C decr loop cnt
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,8,r16
+ bne r19,$Loop0
+$Lend0: subq r4,r0,r28 C main subtract
+ subq r28,r25,r20 C carry subtract
+ cmpult r4,r0,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two subtracts
+
+$Lret: bis r25,r31,r0 C return cy
+ ret r31,(r26),1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/submul_1.asm b/gmp/mpn/alpha/submul_1.asm
new file mode 100644
index 0000000000..2b63b52fa4
--- /dev/null
+++ b/gmp/mpn/alpha/submul_1.asm
@@ -0,0 +1,99 @@
+dnl Alpha mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+dnl the result from a second limb vector.
+
+dnl Copyright 1992, 1994, 1995, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 42
+C EV5: 18
+C EV6: 7
+
+C INPUT PARAMETERS
+C rp r16
+C up r17
+C n r18
+C limb r19
+
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ umulh r2,r19,r0 C r0 = prod_high
+ beq r18,$Lend1 C jump if size was == 1
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ subq r5,r3,r3
+ cmpult r5,r3,r4
+ stq r3,0(r16)
+ addq r16,8,r16 C res_ptr++
+ beq r18,$Lend2 C jump if size was == 2
+
+ ALIGN(8)
+$Loop: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ subq r18,1,r18 C size--
+ umulh r2,r19,r4 C r4 = cy_limb
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ subq r5,r3,r3
+ cmpult r5,r3,r5
+ stq r3,0(r16)
+ addq r16,8,r16 C res_ptr++
+ addq r5,r0,r0 C combine carries
+ bne r18,$Loop
+
+$Lend2: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ umulh r2,r19,r4 C r4 = cy_limb
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ subq r5,r3,r3
+ cmpult r5,r3,r5
+ stq r3,0(r16)
+ addq r5,r0,r0 C combine carries
+ addq r4,r0,r0 C cy_limb = prod_high + cy
+ ret r31,(r26),1
+$Lend1: subq r5,r3,r3
+ cmpult r5,r3,r5
+ stq r3,0(r16)
+ addq r0,r5,r0
+ ret r31,(r26),1
+EPILOGUE(mpn_submul_1)
+ASM_END()
diff --git a/gmp/mpn/alpha/umul.asm b/gmp/mpn/alpha/umul.asm
new file mode 100644
index 0000000000..039081ed48
--- /dev/null
+++ b/gmp/mpn/alpha/umul.asm
@@ -0,0 +1,44 @@
+dnl mpn_umul_ppmm -- 1x1->2 limb multiplication
+
+dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_umul_ppmm (mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2);
+C
+
+ASM_START()
+PROLOGUE(mpn_umul_ppmm)
+ mulq r17, r18, r1
+ umulh r17, r18, r0
+ stq r1, 0(r16)
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/unicos.m4 b/gmp/mpn/alpha/unicos.m4
new file mode 100644
index 0000000000..e05cf5cca6
--- /dev/null
+++ b/gmp/mpn/alpha/unicos.m4
@@ -0,0 +1,131 @@
+divert(-1)
+
+dnl m4 macros for alpha assembler on unicos.
+
+
+dnl Copyright 2000, 2002-2004, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+dnl Note that none of the standard GMP_ASM_ autoconf tests are done for
+dnl unicos, so none of the config.m4 results can be used here.
+
+dnl No underscores on unicos
+define(`GSYM_PREFIX')
+
+define(`ASM_START',
+m4_assert_numargs(0)
+` .ident dummy')
+
+define(`X',
+m4_assert_numargs(1)
+`^X$1')
+
+define(`FLOAT64',
+m4_assert_numargs(2)
+` .psect $1@crud,data
+$1: .t_floating $2
+ .endp')
+
+dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,gp|noalign])
+dnl EPILOGUE_cpu(GSYM_PREFIX`'foo)
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs_range(1,2)
+`ifelse(`$2',gp,,
+`ifelse(`$2',noalign,,
+`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter
+')')')')dnl
+ .stack 192 ; What does this mean? Only Cray knows.
+ .psect $1@code,code,cache
+$1::')
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1)
+` .endp')
+
+
+dnl Usage: LDGP(dst,src)
+dnl
+dnl Emit an "ldgp dst,src", but only on systems using a GOT (which unicos
+dnl doesn't).
+
+define(LDGP,
+m4_assert_numargs(2)
+)
+
+
+dnl Usage: EXTERN(variable_name)
+define(`EXTERN',
+m4_assert_numargs(1)
+` .extern $1')
+
+define(`DATASTART',
+m4_assert_numargs_range(1,2)
+` .psect $1@crud,data
+ ALIGN(ifelse($#,1,2,$2))
+$1:')
+
+define(`DATAEND',
+m4_assert_numargs(0)
+` .endp')
+
+define(`ASM_END',
+m4_assert_numargs(0)
+` .end')
+
+define(`cvttqc',
+m4_assert_numargs(-1)
+`cvttq/c')
+
+dnl Load a symbolic address into a register
+define(`LEA',
+m4_assert_numargs(2)
+ `laum $1, $2(r31)
+ sll $1, 32, $1
+ lalm $1, $2($1)
+ lal $1, $2($1)')
+
+
+dnl Usage: ALIGN(bytes)
+dnl
+dnl Unicos assembler .align emits zeros, even in code segments, so disable
+dnl aligning.
+dnl
+dnl GCC uses a macro emiting nops until the desired alignment is reached
+dnl (see unicosmk_file_start in alpha.c). Could do something like that if
+dnl we cared. The maximum desired alignment must be established at the
+dnl start of the section though, since of course emitting nops only
+dnl advances relative to the section beginning.
+
+define(`ALIGN',
+m4_assert_numargs(1)
+)
+
+
+divert