summaryrefslogtreecommitdiff
path: root/gmp/mpn/arm
diff options
context:
space:
mode:
authorPedro Alvarez <pedro.alvarez@codethink.co.uk>2014-12-22 00:55:04 +0000
committerPedro Alvarez <pedro.alvarez@codethink.co.uk>2014-12-22 00:56:42 +0000
commit54eea31d0053620bab65153ab39d61e5575aaf1b (patch)
tree5f97c96dffdb6b27df36795689abfb9086011585 /gmp/mpn/arm
parentc16297b7cfb0c1708f1d84b5d0f90be0844d07ce (diff)
downloadgcc-tarball-54eea31d0053620bab65153ab39d61e5575aaf1b.tar.gz
Add gmp, mpc and mpfr sourcesbaserock/pedroalvarez/4.9.1
Diffstat (limited to 'gmp/mpn/arm')
-rw-r--r--gmp/mpn/arm/README35
-rw-r--r--gmp/mpn/arm/aors_n.asm112
-rw-r--r--gmp/mpn/arm/aorslsh1_n.asm167
-rw-r--r--gmp/mpn/arm/aorsmul_1.asm135
-rw-r--r--gmp/mpn/arm/arm-defs.m491
-rw-r--r--gmp/mpn/arm/bdiv_dbm1c.asm113
-rw-r--r--gmp/mpn/arm/cnd_aors_n.asm134
-rw-r--r--gmp/mpn/arm/com.asm75
-rw-r--r--gmp/mpn/arm/copyd.asm84
-rw-r--r--gmp/mpn/arm/copyi.asm79
-rw-r--r--gmp/mpn/arm/dive_1.asm151
-rw-r--r--gmp/mpn/arm/gmp-mparam.h127
-rw-r--r--gmp/mpn/arm/invert_limb.asm93
-rw-r--r--gmp/mpn/arm/logops_n.asm139
-rw-r--r--gmp/mpn/arm/lshift.asm88
-rw-r--r--gmp/mpn/arm/lshiftc.asm95
-rw-r--r--gmp/mpn/arm/mod_34lsub1.asm121
-rw-r--r--gmp/mpn/arm/mode1o.asm92
-rw-r--r--gmp/mpn/arm/mul_1.asm94
-rw-r--r--gmp/mpn/arm/neon/README2
-rw-r--r--gmp/mpn/arm/neon/hamdist.asm194
-rw-r--r--gmp/mpn/arm/neon/lorrshift.asm279
-rw-r--r--gmp/mpn/arm/neon/lshiftc.asm257
-rw-r--r--gmp/mpn/arm/neon/popcount.asm166
-rw-r--r--gmp/mpn/arm/neon/sec_tabselect.asm140
-rw-r--r--gmp/mpn/arm/rsh1aors_n.asm124
-rw-r--r--gmp/mpn/arm/rshift.asm86
-rw-r--r--gmp/mpn/arm/sec_tabselect.asm131
-rw-r--r--gmp/mpn/arm/udiv.asm104
-rw-r--r--gmp/mpn/arm/v5/gcd_1.asm120
-rw-r--r--gmp/mpn/arm/v5/mod_1_1.asm129
-rw-r--r--gmp/mpn/arm/v5/mod_1_2.asm156
-rw-r--r--gmp/mpn/arm/v6/addmul_1.asm111
-rw-r--r--gmp/mpn/arm/v6/addmul_2.asm138
-rw-r--r--gmp/mpn/arm/v6/addmul_3.asm187
-rw-r--r--gmp/mpn/arm/v6/dive_1.asm149
-rw-r--r--gmp/mpn/arm/v6/gmp-mparam.h157
-rw-r--r--gmp/mpn/arm/v6/mode1o.asm95
-rw-r--r--gmp/mpn/arm/v6/mul_1.asm114
-rw-r--r--gmp/mpn/arm/v6/mul_2.asm131
-rw-r--r--gmp/mpn/arm/v6/popham.asm138
-rw-r--r--gmp/mpn/arm/v6/sqr_basecase.asm518
-rw-r--r--gmp/mpn/arm/v6/submul_1.asm125
-rw-r--r--gmp/mpn/arm/v6t2/divrem_1.asm212
-rw-r--r--gmp/mpn/arm/v6t2/gcd_1.asm115
-rw-r--r--gmp/mpn/arm/v7a/cora15/addmul_1.asm145
-rw-r--r--gmp/mpn/arm/v7a/cora15/aors_n.asm162
-rw-r--r--gmp/mpn/arm/v7a/cora15/cnd_aors_n.asm158
-rw-r--r--gmp/mpn/arm/v7a/cora15/com.asm180
-rw-r--r--gmp/mpn/arm/v7a/cora15/gmp-mparam.h197
-rw-r--r--gmp/mpn/arm/v7a/cora15/logops_n.asm253
-rw-r--r--gmp/mpn/arm/v7a/cora15/mul_1.asm104
-rw-r--r--gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm43
-rw-r--r--gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm43
-rw-r--r--gmp/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm144
-rw-r--r--gmp/mpn/arm/v7a/cora15/neon/com.asm97
-rw-r--r--gmp/mpn/arm/v7a/cora15/neon/copyd.asm110
-rw-r--r--gmp/mpn/arm/v7a/cora15/neon/copyi.asm90
-rw-r--r--gmp/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm177
-rw-r--r--gmp/mpn/arm/v7a/cora15/submul_1.asm159
-rw-r--r--gmp/mpn/arm/v7a/cora9/gmp-mparam.h209
61 files changed, 8374 insertions, 0 deletions
diff --git a/gmp/mpn/arm/README b/gmp/mpn/arm/README
new file mode 100644
index 0000000000..598baa3f2e
--- /dev/null
+++ b/gmp/mpn/arm/README
@@ -0,0 +1,35 @@
+Copyright 2002, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+This directory contains mpn functions for ARM processors. It has been
+optimised for Cortex-A9, but the code in the top-level directory should run
+on all ARM processors at architecture level v4 or later.
diff --git a/gmp/mpn/arm/aors_n.asm b/gmp/mpn/arm/aors_n.asm
new file mode 100644
index 0000000000..fdad9f7ba6
--- /dev/null
+++ b/gmp/mpn/arm/aors_n.asm
@@ -0,0 +1,112 @@
+dnl ARM mpn_add_n and mpn_sub_n
+
+dnl Contributed to the GNU project by Robert Harley.
+
+dnl Copyright 1997, 2000, 2001, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM ?
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 2.5 slightly fluctuating
+C Cortex-A15 2.25
+
+define(`rp', `r0') C r0: destination pointer, post-incremented by every store
+define(`up', `r1') C r1: first source pointer, post-incremented by every load
+define(`vp', `r2') C r2: second source pointer, post-incremented by every load
+define(`n', `r3') C r3: limb count; it has been reduced to 0 whenever L(rtn) is reached
+C One body is assembled twice: OPERATION_add_n selects add-with-carry, OPERATION_sub_n selects subtract-with-borrow.
+ifdef(`OPERATION_add_n', `
+ define(`ADDSUB', adds) C flag-setting add; not referenced by the body below
+ define(`ADDSUBC', adcs) C per-limb operation, consumes and produces the C flag
+ define(`CLRCY', `cmn r0, #0') C r0 + 0 cannot carry, so C becomes 0
+ define(`SETCY', `cmp $1, #1') C C becomes 1 exactly when the argument register is nonzero
+ define(`RETVAL', `adc r0, n, #0') C r0 = n + C, which is the final carry because n is 0 by then
+ define(`func', mpn_add_n)
+ define(`func_nc', mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+ define(`ADDSUB', subs) C flag-setting subtract; not referenced by the body below
+ define(`ADDSUBC', sbcs) C per-limb operation; C = 1 means no borrow
+ define(`CLRCY', `cmp r0, r0') C r0 - r0 never borrows, so C becomes 1
+ define(`SETCY', `rsbs $1, $1, #0') C 0 - argument: C stays 1 only for a zero borrow-in; the argument register is overwritten
+ define(`RETVAL', `sbc r0, r0, r0
+ and r0, r0, #1')
+ define(`func', mpn_sub_n) C RETVAL above: sbc yields 0 or -1 depending on C, the and turns that into borrow 0 or 1
+ define(`func_nc', mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+PROLOGUE(func_nc)
+ ldr r12, [sp, #0] C word at the top of the stack on entry; presumably the carry-in argument (confirm against the C prototype)
+ stmfd sp!, { r8, r9, lr } C same three registers as func pushes, so both entries can share the exit at L(rtn)
+ SETCY( r12) C seed the C flag from that word
+ b L(ent) C continue in the common body inside func
+EPILOGUE()
+PROLOGUE(func)
+ stmfd sp!, { r8, r9, lr } C lr is used as a data register below and is reloaded into pc on exit
+ CLRCY( r12) C start with no carry or borrow; CLRCY has no parameter, so r12 is ignored
+L(ent): tst n, #1 C odd count: handle one limb first
+ beq L(skip1)
+ ldr r12, [up], #4
+ ldr lr, [vp], #4
+ ADDSUBC r12, r12, lr
+ str r12, [rp], #4
+L(skip1):
+ tst n, #2 C bit 1 of the count: handle two limbs
+ beq L(skip2)
+ ldmia up!, { r8, r9 }
+ ldmia vp!, { r12, lr }
+ ADDSUBC r8, r8, r12
+ ADDSUBC r9, r9, lr
+ stmia rp!, { r8, r9 }
+L(skip2):
+ bics n, n, #3 C keep only the multiple-of-4 part of the count; Z is set when nothing remains
+ beq L(rtn)
+ stmfd sp!, { r4, r5, r6, r7 } C r4-r7 are needed only by the 4-limb loop
+
+L(top): ldmia up!, { r4, r5, r6, r7 }
+ ldmia vp!, { r8, r9, r12, lr }
+ ADDSUBC r4, r4, r8
+ sub n, n, #4 C not flag-setting, so the carry chain is not disturbed
+ ADDSUBC r5, r5, r9
+ ADDSUBC r6, r6, r12
+ ADDSUBC r7, r7, lr
+ stmia rp!, { r4, r5, r6, r7 }
+ teq n, #0 C loop test; NOTE(review): the chain relies on tst/bics/teq with these immediates leaving C unchanged
+ bne L(top)
+
+ ldmfd sp!, { r4, r5, r6, r7 }
+
+L(rtn): RETVAL C r0 = carry (add) or borrow (sub) out of the last limb
+ ldmfd sp!, { r8, r9, pc } C restore r8 and r9, and return by loading the saved lr into pc
+EPILOGUE()
diff --git a/gmp/mpn/arm/aorslsh1_n.asm b/gmp/mpn/arm/aorslsh1_n.asm
new file mode 100644
index 0000000000..1cbd4ba1af
--- /dev/null
+++ b/gmp/mpn/arm/aorslsh1_n.asm
@@ -0,0 +1,167 @@
+dnl ARM mpn_addlsh1_n and mpn_sublsh1_n
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C addlsh1_n sublsh1_n
+C cycles/limb cycles/limb
+C StrongARM ? ?
+C XScale ? ?
+C Cortex-A7 ? ?
+C Cortex-A8 ? ?
+C Cortex-A9 3.12 3.7
+C Cortex-A15 ? ?
+
+C TODO
+C * The addlsh1_n code runs well, but is only barely faster than mpn_addmul_1.
+C The sublsh1_n code could surely be tweaked, its REVCY slows down things
+C very much. If two insns are really needed, it might help to separate them
+C for better micro-parallelism.
+
+define(`rp', `r0') C r0: destination pointer
+define(`up', `r1') C r1: source that is added to or subtracted from
+define(`vp', `r2') C r2: source whose limbs are doubled (adcs x, x, x) before use
+define(`n', `r3') C r3: limb count, consumed three limbs at a time
+C Computes up[] + 2*vp[] (addlsh1) or up[] - 2*vp[] (sublsh1); the shift carry and the add/sub carry take turns in the C flag.
+ifdef(`OPERATION_addlsh1_n', `
+ define(`ADDSUB', adds)
+ define(`ADDSUBC', adcs)
+ define(`SETCY', `cmp $1, #1')
+ define(`RETVAL', `adc r0, $1, #2') C the saved shift carry is -1 or -2, so adding 2 and C gives shift carry + add carry
+ define(`SAVECY', `sbc $1, $2, #0') C with the second argument holding -1: stores -1 when C = 1, -2 when C = 0
+ define(`RESTCY', `cmn $1, #1') C C = 1 exactly when the register holds -1
+ define(`REVCY', `') C nothing to do for addition
+ define(`INICYR', `mov $1, #0') C 0 makes the first RESTCY produce C = 0
+ define(`r10r11', `r11') C the save range extends to r11, which holds the constant -1 for SAVECY
+ define(`func', mpn_addlsh1_n)
+ define(`func_nc', mpn_addlsh1_nc)')
+ifdef(`OPERATION_sublsh1_n', `
+ define(`ADDSUB', subs)
+ define(`ADDSUBC', sbcs)
+ define(`SETCY', `rsbs $1, $1, #0')
+ define(`RETVAL', `adc r0, $1, #1') C the saved shift carry is 0 (carry) or -1 (none); C holds the borrow after REVCY
+ define(`SAVECY', `sbc $1, $1, $1') C stores 0 when C = 1, -1 when C = 0; the second argument is unused
+ define(`RESTCY', `cmn $1, #1') C C = 1 exactly when the register holds -1, i.e. the inverse of what SAVECY saw
+ define(`REVCY', `sbc $1, $1, $1
+ cmn $1, #1')
+ define(`INICYR', `mvn $1, #0') C -1 makes the first RESTCY produce C = 1, meaning no borrow
+ define(`r10r11', `r10') C REVCY above inverts C and overwrites its register argument; r11 is not needed in this variant
+ define(`func', mpn_sublsh1_n)
+ define(`func_nc', mpn_sublsh1_nc)')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
+
+ASM_START()
+PROLOGUE(func)
+ push {r4-r10r11, r14} C r4-r11 for addlsh1, r4-r10 for sublsh1, plus r14 which is the return address
+
+ifdef(`OPERATION_addlsh1_n', `
+ mvn r11, #0 C constant -1 consumed by SAVECY
+')
+ INICYR( r14) C r14 starts as the saved neutral carry for the first ADDSUBC group
+ subs n, n, #3
+ blt L(le2) C carry clear on branch path
+
+ cmn r0, #0 C clear carry
+ ldmia vp!, {r8, r9, r10}
+ b L(mid)
+
+L(top): RESTCY( r14) C C = shift carry left by doubling the previous group (the INICYR value the first time)
+ ADDSUBC r4, r4, r8
+ ADDSUBC r5, r5, r9
+ ADDSUBC r6, r6, r10
+ ldmia vp!, {r8, r9, r10}
+ stmia rp!, {r4, r5, r6}
+ REVCY(r14) C sublsh1 only: turn the no-borrow flag into a borrow bit for the adcs below
+ adcs r8, r8, r8 C double the next vp limbs; the add/sub carry enters as bit 0
+ adcs r9, r9, r9
+ adcs r10, r10, r10
+ ldmia up!, {r4, r5, r6}
+ SAVECY( r14, r11) C park the shift carry of this group in r14
+ subs n, n, #3
+ blt L(exi)
+ RESTCY( r12) C second half: the same steps with r12 and r14 exchanging roles
+ ADDSUBC r4, r4, r8
+ ADDSUBC r5, r5, r9
+ ADDSUBC r6, r6, r10
+ ldmia vp!, {r8, r9, r10}
+ stmia rp!, {r4, r5, r6}
+ REVCY(r12)
+L(mid): adcs r8, r8, r8 C loop entry point: the first group is doubled with C cleared above
+ adcs r9, r9, r9
+ adcs r10, r10, r10
+ ldmia up!, {r4, r5, r6}
+ SAVECY( r12, r11) C park the shift carry of this group in r12
+ subs n, n, #3
+ bge L(top)
+
+ mov r7, r12 C swap alternating...
+ mov r12, r14 C ...carry-save...
+ mov r14, r7 C ...registers
+
+L(exi): RESTCY( r12) C here r12 holds the older saved carry and r14 the most recent one
+ ADDSUBC r4, r4, r8
+ ADDSUBC r5, r5, r9
+ ADDSUBC r6, r6, r10
+ stmia rp!, {r4, r5, r6}
+
+ REVCY(r12)
+L(le2): tst n, #1 C n = {-1,-2,-3} map to [2], [1], [0]
+ beq L(e1)
+
+L(e02): tst n, #2
+ beq L(rt0) C no limbs left
+ ldm vp, {r8, r9} C two limbs left
+ adcs r8, r8, r8
+ adcs r9, r9, r9
+ ldm up, {r4, r5}
+ SAVECY( r12, r11)
+ RESTCY( r14)
+ ADDSUBC r4, r4, r8
+ ADDSUBC r5, r5, r9
+ stm rp, {r4, r5}
+ b L(rt1)
+
+L(e1): ldr r8, [vp] C one limb left
+ adcs r8, r8, r8
+ ldr r4, [up]
+ SAVECY( r12, r11)
+ RESTCY( r14)
+ ADDSUBC r4, r4, r8
+ str r4, [rp]
+
+L(rt1): mov r14, r12 C RETVAL reads the final shift carry from r14
+ REVCY(r12)
+L(rt0): RETVAL( r14) C r0 = shift carry + add carry, or shift carry + borrow
+ pop {r4-r10r11, r14}
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/aorsmul_1.asm b/gmp/mpn/arm/aorsmul_1.asm
new file mode 100644
index 0000000000..b02fbb3b2a
--- /dev/null
+++ b/gmp/mpn/arm/aorsmul_1.asm
@@ -0,0 +1,135 @@
+dnl ARM mpn_addmul_1 and mpn_submul_1.
+
+dnl Copyright 1998, 2000, 2001, 2003, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM ?
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 5.25
+C Cortex-A15 4
+
+define(`rp', `r0') C r0: pointer to the limbs that are updated in place
+define(`up', `r1') C r1: pointer to the multiplicand limbs
+define(`n', `r2') C r2: limb count
+define(`vl', `r3') C r3: the single multiplier limb
+define(`rl', `r12') C r12: limb loaded from rp
+define(`ul', `r6') C r6: limb loaded from up
+define(`r', `lr') C lr: result limb about to be stored
+C rp[] is increased (addmul_1) or decreased (submul_1) by up[] * vl; r4 and r5 alternate as low and high product word.
+ifdef(`OPERATION_addmul_1', `
+ define(`ADDSUB', adds)
+ define(`ADDSUBC', adcs)
+ define(`CLRRCY', `mov $1, #0
+ adds r0, r0, #0')
+ define(`RETVAL', `adc r0, r4, #0') C CLRRCY above zeroes its argument and clears C; RETVAL is the pending high word in r4 plus C
+ define(`func', mpn_addmul_1)')
+ifdef(`OPERATION_submul_1', `
+ define(`ADDSUB', subs)
+ define(`ADDSUBC', sbcs)
+ define(`CLRRCY', `subs $1, r0, r0') C zeroes its argument and sets C, which means no borrow
+ define(`RETVAL', `sbc r0, r0, r0
+ sub r0, $1, r0')
+ define(`func', mpn_submul_1)') C RETVAL above: its argument (the pending high word) plus 1 when a borrow is outstanding
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+PROLOGUE(func)
+ stmfd sp!, { r4-r6, lr }
+ CLRRCY( r4) C r4 = 0 (no pending high word) and C in its neutral state
+ tst n, #1 C odd count: one limb first
+ beq L(skip1)
+ ldr ul, [up], #4
+ ldr rl, [rp, #0]
+ umull r5, r4, ul, vl C r4:r5 = ul * vl, low word in r5
+ ADDSUB r, rl, r5 C first operation of the chain, so it takes no incoming carry
+ str r, [rp], #4
+L(skip1):
+ tst n, #2 C bit 1 of the count: two limbs
+ beq L(skip2)
+ ldr ul, [up], #4
+ ldr rl, [rp, #0]
+ mov r5, #0
+ umlal r4, r5, ul, vl C r5:r4 = ul * vl + r4, folding in the previous high word
+ ldr ul, [up], #4
+ ADDSUBC r, rl, r4
+ ldr rl, [rp, #4]
+ mov r4, #0
+ umlal r5, r4, ul, vl C roles swapped: r4:r5 = ul * vl + r5
+ str r, [rp], #4
+ ADDSUBC r, rl, r5
+ str r, [rp], #4
+L(skip2):
+ bics n, n, #3 C remaining count is a multiple of 4; Z is set when nothing is left
+ beq L(rtn)
+
+ ldr ul, [up], #4 C prime the pipeline: the first product is formed before entering the loop
+ ldr rl, [rp, #0]
+ mov r5, #0
+ umlal r4, r5, ul, vl
+ b L(in)
+
+L(top): ldr ul, [up], #4
+ ADDSUBC r, rl, r5
+ ldr rl, [rp, #4] C the rp limb for the next step is read before the store below advances rp
+ mov r5, #0
+ umlal r4, r5, ul, vl
+ str r, [rp], #4
+L(in): ldr ul, [up], #4
+ ADDSUBC r, rl, r4
+ ldr rl, [rp, #4]
+ mov r4, #0
+ umlal r5, r4, ul, vl
+ str r, [rp], #4
+ ldr ul, [up], #4
+ ADDSUBC r, rl, r5
+ ldr rl, [rp, #4]
+ mov r5, #0
+ umlal r4, r5, ul, vl
+ str r, [rp], #4
+ ldr ul, [up], #4
+ ADDSUBC r, rl, r4
+ ldr rl, [rp, #4]
+ mov r4, #0
+ umlal r5, r4, ul, vl
+ sub n, n, #4
+ tst n, n C sets Z for the loop test; the carry chain continues across it
+ str r, [rp], #4
+ bne L(top)
+
+ ADDSUBC r, rl, r5 C drain the pipeline: the last low word is still pending
+ str r, [rp]
+
+L(rtn): RETVAL( r4) C r0 = final high word adjusted by the outstanding carry or borrow
+ ldmfd sp!, { r4-r6, pc }
+EPILOGUE()
diff --git a/gmp/mpn/arm/arm-defs.m4 b/gmp/mpn/arm/arm-defs.m4
new file mode 100644
index 0000000000..6ca964a245
--- /dev/null
+++ b/gmp/mpn/arm/arm-defs.m4
@@ -0,0 +1,91 @@
+divert(-1)
+
+dnl m4 macros for ARM assembler.
+
+dnl Copyright 2001, 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+dnl Standard assembler commenting is with @; the default m4 comment character # introduces constants here, and we
+dnl don't want it to disable macro expansions in or after them, so the m4 comment start is moved to the string given below.
+
+changecom(@&*$)
+
+
+dnl APCS register names: a1-a4 map to r0-r3 and v1-v7 to r4-r10; sb and sl are second names for r9 and r10; fp, ip, sp, lr, pc map to r11-r15.
+
+deflit(a1,r0)
+deflit(a2,r1)
+deflit(a3,r2)
+deflit(a4,r3)
+deflit(v1,r4)
+deflit(v2,r5)
+deflit(v3,r6)
+deflit(v4,r7)
+deflit(v5,r8)
+deflit(v6,r9)
+deflit(sb,r9)
+deflit(v7,r10)
+deflit(sl,r10)
+deflit(fp,r11)
+deflit(ip,r12)
+deflit(sp,r13)
+deflit(lr,r14)
+deflit(pc,r15)
+
+
+define(`lea_list', `')dnl literal words queued by LEA and emitted by EPILOGUE_cpu
+define(`lea_num',0)dnl running index that keeps the L(ptrN) and L(basN) labels distinct
+
+dnl LEA(reg,gmp_symbol)
+dnl
+dnl Load the address of gmp_symbol into a register. The gmp_symbol must be
+dnl either local or protected/hidden, since we assume it has a fixed distance
+dnl from the point of use.
+dnl Emits "ldr reg, L(ptrN)" and queues the matching .word in lea_list; under PIC the word is symbol-L(basN)-8 and "add reg, reg, pc" at L(basN) completes the address (the -8 presumably offsets how far ahead pc reads -- confirm).
+define(`LEA',`dnl
+ldr $1, L(ptr`'lea_num)
+ifdef(`PIC',dnl
+`dnl
+L(bas`'lea_num):dnl
+ add $1, $1, pc`'dnl
+ m4append(`lea_list',`
+L(ptr'lea_num`): .word GSYM_PREFIX`'$2-L(bas'lea_num`)-8')
+ define(`lea_num', eval(lea_num+1))dnl
+',`dnl
+ m4append(`lea_list',`
+L(ptr'lea_num`): .word GSYM_PREFIX`'$2')
+ define(`lea_num', eval(lea_num+1))dnl
+')dnl
+')
+dnl EPILOGUE_cpu(name): emit the literal words queued by LEA, empty the queue so a later function in the same file does not emit them again under the same labels, then emit the size directive.
+define(`EPILOGUE_cpu',
+`lea_list`'define(`lea_list', `')
+ SIZE(`$1',.-`$1')')
+
+divert
diff --git a/gmp/mpn/arm/bdiv_dbm1c.asm b/gmp/mpn/arm/bdiv_dbm1c.asm
new file mode 100644
index 0000000000..ec3de50e8e
--- /dev/null
+++ b/gmp/mpn/arm/bdiv_dbm1c.asm
@@ -0,0 +1,113 @@
+dnl ARM mpn_bdiv_dbm1c.
+
+dnl Copyright 2008, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM ?
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 4.25
+C Cortex-A15 2.5
+
+C TODO
+C * Try using umlal or umaal.
+C * Try using ldm/stm.
+
+define(`qp', `r0') C r0: output pointer; r0 also carries the return value
+define(`up', `r1') C r1: input pointer
+define(`n', `r2') C r2: limb count
+define(`bd', `r3') C r3: multiplier applied to every input limb
+define(`cy', `sp,#0') C stack slot of the fifth argument at entry; the body reads it as [sp, #20] after its push
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_bdiv_dbm1c)
+ push {r4, r5, r6, r7, r8} C five words, which moves the stack argument to sp + 20
+ ldr r4, [up], #4 C first input limb
+ ldr r5, [sp, #20] C r5 = running value, seeded from the stack argument
+ ands r12, n, #3 C n mod 4 selects the entry point into the 4-way loop
+ beq L(fi0)
+ cmp r12, #2
+ bcc L(fi1) C remainder 1
+ beq L(fi2) C remainder 2; otherwise fall through for remainder 3
+
+L(fi3): umull r8, r12, r4, bd C r12:r8 = limb * bd
+ ldr r4, [up], #4
+ b L(lo3)
+
+L(fi0): umull r6, r7, r4, bd C r7:r6 = limb * bd
+ ldr r4, [up], #4
+ b L(lo0)
+
+L(fi1): subs n, n, #1
+ umull r8, r12, r4, bd C umull does not set flags, so bls still tests the subs
+ bls L(wd1) C n was 1: only the final step remains
+ ldr r4, [up], #4
+ b L(lo1)
+
+L(fi2): umull r6, r7, r4, bd
+ ldr r4, [up], #4
+ b L(lo2)
+
+L(top): ldr r4, [up], #4
+ subs r5, r5, r6 C running value -= low word of the pending product
+ str r5, [qp], #4 C that difference is the output limb
+ sbc r5, r5, r7 C running value -= high word, and the borrow just produced
+L(lo1): umull r6, r7, r4, bd C products alternate between r7:r6 and r12:r8
+ ldr r4, [up], #4
+ subs r5, r5, r8
+ str r5, [qp], #4
+ sbc r5, r5, r12
+L(lo0): umull r8, r12, r4, bd
+ ldr r4, [up], #4
+ subs r5, r5, r6
+ str r5, [qp], #4
+ sbc r5, r5, r7
+L(lo3): umull r6, r7, r4, bd
+ ldr r4, [up], #4
+ subs r5, r5, r8
+ str r5, [qp], #4
+ sbc r5, r5, r12
+L(lo2): subs n, n, #4
+ umull r8, r12, r4, bd
+ bhi L(top) C loop while the count was above 4 before the subs
+
+L(wd2): subs r5, r5, r6 C wind down: two products are still pending
+ str r5, [qp], #4
+ sbc r5, r5, r7
+L(wd1): subs r5, r5, r8
+ str r5, [qp]
+ sbc r0, r5, r12 C return the final running value
+ pop {r4, r5, r6, r7, r8}
+ bx lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/cnd_aors_n.asm b/gmp/mpn/arm/cnd_aors_n.asm
new file mode 100644
index 0000000000..e8eb60983a
--- /dev/null
+++ b/gmp/mpn/arm/cnd_aors_n.asm
@@ -0,0 +1,134 @@
+dnl ARM mpn_cnd_add_n, mpn_cnd_sub_n
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM ?
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 3
+C Cortex-A15 2.5
+
+define(`cnd', `r0') C r0: condition on entry, turned into a mask below; r0 also returns the result
+define(`rp', `r1') C r1: destination pointer
+define(`up', `r2') C r2: first source pointer
+define(`vp', `r3') C r3: second source pointer, whose limbs are masked before use
+C The limb count does not arrive in r0-r3; it is loaded from the stack into r12 below.
+define(`n', `r12')
+
+
+ifdef(`OPERATION_cnd_add_n', `
+ define(`ADDSUB', adds)
+ define(`ADDSUBC', adcs)
+ define(`INITCY', `cmn r0, #0') C r0 + 0 cannot carry, so C becomes 0
+ define(`RETVAL', `adc r0, n, #0') C r0 = n + C; n has reached 0 at L(end), leaving the carry
+ define(func, mpn_cnd_add_n)')
+ifdef(`OPERATION_cnd_sub_n', `
+ define(`ADDSUB', subs)
+ define(`ADDSUBC', sbcs)
+ define(`INITCY', `cmp r0, #0') C r0 - 0 never borrows, so C becomes 1
+ define(`RETVAL', `adc r0, n, #0
+ rsb r0, r0, #1')
+ define(func, mpn_cnd_sub_n)') C RETVAL above: 1 - C, i.e. 1 when a borrow is outstanding
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+ push {r4-r11} C eight words, so the first stack argument is now at sp + 32
+ ldr n, [sp, #32] C limb count
+
+ cmp cnd, #1 C C = 0 only when cnd is 0, so the sbc below yields all ones for cnd == 0 and zero otherwise
+ sbc cnd, cnd, cnd C conditionally set to 0xffffffff
+
+ INITCY C really only needed for n = 0 (mod 4)
+
+ ands r4, n, #3 C n mod 4 selects the lead-in block
+ beq L(top)
+ cmp r4, #2 C this overwrites C, which is fine because every lead-in starts with ADDSUB
+ bcc L(b1)
+ beq L(b2)
+
+L(b3): ldm vp!, {r4,r5,r6}
+ ldm up!, {r8,r9,r10}
+ bic r4, r4, cnd C clear the vp limb when the mask is all ones
+ bic r5, r5, cnd
+ bic r6, r6, cnd
+ ADDSUB r8, r8, r4 C starts the carry chain
+ ADDSUBC r9, r9, r5
+ ADDSUBC r10, r10, r6
+ stm rp!, {r8,r9,r10}
+ sub n, n, #3
+ teq n, #0
+ bne L(top)
+ b L(end)
+
+L(b2): ldm vp!, {r4,r5}
+ ldm up!, {r8,r9}
+ bic r4, r4, cnd
+ bic r5, r5, cnd
+ ADDSUB r8, r8, r4
+ ADDSUBC r9, r9, r5
+ stm rp!, {r8,r9}
+ sub n, n, #2
+ teq n, #0
+ bne L(top)
+ b L(end)
+
+L(b1): ldr r4, [vp], #4
+ ldr r8, [up], #4
+ bic r4, r4, cnd
+ ADDSUB r8, r8, r4
+ str r8, [rp], #4
+ sub n, n, #1
+ teq n, #0
+ beq L(end)
+
+L(top): ldm vp!, {r4,r5,r6,r7}
+ ldm up!, {r8,r9,r10,r11}
+ bic r4, r4, cnd
+ bic r5, r5, cnd
+ bic r6, r6, cnd
+ bic r7, r7, cnd
+ ADDSUBC r8, r8, r4 C continues the chain from the lead-in block or from INITCY
+ ADDSUBC r9, r9, r5
+ ADDSUBC r10, r10, r6
+ ADDSUBC r11, r11, r7
+ sub n, n, #4 C not flag-setting, so C survives
+ stm rp!, {r8,r9,r10,r11}
+ teq n, #0 C loop test; NOTE(review): relies on teq with this immediate leaving C unchanged
+ bne L(top)
+
+L(end): RETVAL C r0 = carry (add) or borrow (sub) out of the last limb
+ pop {r4-r11}
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/com.asm b/gmp/mpn/arm/com.asm
new file mode 100644
index 0000000000..42f8e3cbbe
--- /dev/null
+++ b/gmp/mpn/arm/com.asm
@@ -0,0 +1,75 @@
+dnl ARM mpn_com.
+
+dnl Copyright 2003, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM ?
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 2.0
+C Cortex-A15 1.75
+
+define(`rp', `r0') C r0: destination pointer
+define(`up', `r1') C r1: source pointer
+define(`n', `r2') C r2: limb count
+
+ASM_START()
+PROLOGUE(mpn_com)
+ tst n, #1 C odd count: complement one limb first
+ beq L(skip1)
+ ldr r3, [up], #4
+ mvn r3, r3
+ str r3, [rp], #4
+L(skip1):
+ tst n, #2 C bit 1 of the count: complement two limbs
+ beq L(skip2)
+ ldmia up!, { r3, r12 } C load 2 limbs
+ mvn r3, r3
+ mvn r12, r12
+ stmia rp!, { r3, r12 } C store 2 limbs
+L(skip2):
+ bics n, n, #3 C remaining count is a multiple of 4; done when it is 0
+ beq L(rtn)
+ stmfd sp!, { r8, r9 } C save only the registers the loop overwrites
+
+L(top): ldmia up!, { r3, r8, r9, r12 } C load 4 limbs
+ subs n, n, #4
+ mvn r3, r3
+ mvn r8, r8
+ mvn r9, r9
+ mvn r12, r12
+ stmia rp!, { r3, r8, r9, r12 } C store 4 limbs
+ bne L(top) C the flags still come from the subs above
+
+ ldmfd sp!, { r8, r9 } C restore regs from stack
+L(rtn): bx lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/copyd.asm b/gmp/mpn/arm/copyd.asm
new file mode 100644
index 0000000000..3ea2035099
--- /dev/null
+++ b/gmp/mpn/arm/copyd.asm
@@ -0,0 +1,84 @@
+dnl ARM mpn_copyd.
+
+dnl Contributed to the GNU project by Robert Harley and Torbjörn Granlund.
+
+dnl Copyright 2003, 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM ?
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 1.25-1.5
+C Cortex-A15 1.25
+
+C TODO
+C * Consider wider unrolling. Analogous 8-way code runs 10% faster on both A9
+C and A15. But it probably slows things down for 8 <= n < a few dozen.
+
+define(`rp', `r0') C r0: destination pointer
+define(`up', `r1') C r1: source pointer
+define(`n', `r2') C r2: limb count
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+ mov r12, n, lsl #2 C byte size of the operand
+ sub r12, r12, #4 C byte offset of the last limb
+ add rp, rp, r12 C both pointers now address the highest limb;
+ add up, up, r12 C the copy then runs from high addresses to low
+
+ tst n, #1 C odd count: copy one limb
+ beq L(skip1)
+ ldr r3, [up], #-4
+ str r3, [rp], #-4
+L(skip1):
+ tst n, #2 C bit 1 of the count: copy two limbs
+ beq L(skip2)
+ ldmda up!, { r3,r12 } C decrement-after: the two limbs ending at the current address
+ stmda rp!, { r3,r12 }
+L(skip2):
+ bics n, n, #3 C remaining count is a multiple of 4; done when it is 0
+ beq L(rtn)
+
+ push { r4-r5 } C r4 and r5 are needed only by the 4-limb loop
+ subs n, n, #4
+ ldmda up!, { r3,r4,r5,r12 } C load the first group ahead of the loop
+ beq L(end) C exactly four limbs were left
+
+L(top): subs n, n, #4
+ stmda rp!, { r3,r4,r5,r12 } C store the group loaded previously
+ ldmda up!, { r3,r4,r5,r12 } C and fetch the next one
+ bne L(top) C the flags still come from the subs above
+
+L(end): stmda rp, { r3,r4,r5,r12 } C final group; no pointer writeback
+ pop { r4-r5 }
+L(rtn): bx lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/copyi.asm b/gmp/mpn/arm/copyi.asm
new file mode 100644
index 0000000000..fa454702c1
--- /dev/null
+++ b/gmp/mpn/arm/copyi.asm
@@ -0,0 +1,79 @@
+dnl ARM mpn_copyi.
+
+dnl Contributed to the GNU project by Robert Harley and Torbjörn Granlund.
+
+dnl Copyright 2003, 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM ?
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 1.25-1.5
+C Cortex-A15 1.25
+
+C TODO
+C * Consider wider unrolling. Analogous 8-way code runs 10% faster on both A9
+C and A15. But it probably slows things down for 8 <= n < a few dozen.
+
+define(`rp', `r0') C r0: destination pointer
+define(`up', `r1') C r1: source pointer
+define(`n', `r2') C r2: limb count
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+ tst n, #1 C odd count: copy one limb
+ beq L(skip1)
+ ldr r3, [up], #4
+ str r3, [rp], #4
+L(skip1):
+ tst n, #2 C bit 1 of the count: copy two limbs
+ beq L(skip2)
+ ldmia up!, { r3,r12 }
+ stmia rp!, { r3,r12 }
+L(skip2):
+ bics n, n, #3 C remaining count is a multiple of 4; done when it is 0
+ beq L(rtn)
+
+ push { r4-r5 } C r4 and r5 are needed only by the 4-limb loop
+ subs n, n, #4
+ ldmia up!, { r3,r4,r5,r12 } C load the first group ahead of the loop
+ beq L(end) C exactly four limbs were left
+
+L(top): subs n, n, #4
+ stmia rp!, { r3,r4,r5,r12 } C store the group loaded previously
+ ldmia up!, { r3,r4,r5,r12 } C and fetch the next one
+ bne L(top) C the flags still come from the subs above
+
+L(end): stm rp, { r3,r4,r5,r12 } C final group; no pointer writeback
+ pop { r4-r5 }
+L(rtn): bx lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/dive_1.asm b/gmp/mpn/arm/dive_1.asm
new file mode 100644
index 0000000000..a695e47c77
--- /dev/null
+++ b/gmp/mpn/arm/dive_1.asm
@@ -0,0 +1,151 @@
+dnl ARM v4 mpn_divexact_1
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb
+C norm unorm modexact_1c_odd
+C StrongARM ?
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 10 12
+C Cortex-A15 9 9
+
+C Architecture requirements:
+C v5 -
+C v5t -
+C v5te -
+C v6 -
+C v6t2 -
+C v7a -
+
+C Register roles:
+C   rp  = r0  pointer the result limbs are stored through
+C   up  = r1  pointer the source limbs are loaded through
+C   n   = r2  limb count, counted down to zero
+C   d   = r3  divisor limb; shifted right to its odd part when it is even
+C   cy  = r7  limb carried from one iteration into the next subtraction
+C   cnt = r6  number of low zero bits shifted out of d
+C   tnc = r8  32 - cnt
+C   r4        inverse of the (odd) d modulo 2^32 once L(inv) has run
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+define(`d', `r3')
+
+define(`cy', `r7')
+define(`cnt', `r6')
+define(`tnc', `r8')
+
+C mpn_divexact_1: for each source limb, subtract the running carry limb and
+C multiply by the inverse of d modulo 2^32 to form the stored result limb;
+C the high half of d times that result becomes the next carry limb.
+C When d is even its trailing zero bits are removed first and the source is
+C right-shifted by the same amount on the fly (the L(unnorm) path).
+C up[0] is loaded and one limb is stored unconditionally, so the code assumes
+C n >= 1.  r4-r9 are saved on entry and restored on both exits.
+ASM_START()
+PROLOGUE(mpn_divexact_1)
+ tst d, #1 C flags survive the push and mov below
+ push {r4-r9}
+ mov cnt, #0
+ bne L(inv) C d already odd: no shifting needed
+
+C count trailing zeros
+ movs r4, d, lsl #16 C zero iff the low 16 bits of d are zero
+ moveq d, d, lsr #16
+ moveq cnt, #16
+ tst d, #0xff C low byte zero: skip 8 more bits
+ moveq d, d, lsr #8
+ addeq cnt, cnt, #8
+ LEA( r4, ctz_tab)
+ and r5, d, #0xff
+ ldrb r4, [r4, r5] C trailing zero count of the low byte
+ mov d, d, lsr r4 C d is odd from here on
+ add cnt, cnt, r4
+
+C binvert limb
+C The table byte is refined twice with steps of the form
+C   inv = 2*inv - d*inv*inv
+C NOTE(review): binvert_limb_table is defined elsewhere; presumably it holds
+C an 8-bit inverse indexed by bits 1-7 of d -- confirm.
+L(inv): LEA( r4, binvert_limb_table)
+ and r12, d, #254
+ ldrb r4, [r4, r12, lsr #1]
+ mul r12, r4, r4
+ mul r12, d, r12
+ rsb r12, r12, r4, lsl #1
+ mul r4, r12, r12
+ mul r4, d, r4
+ rsb r4, r4, r12, lsl #1 C r4 = inverse
+
+ tst cnt, cnt
+ ldr r5, [up], #4 C up[0]
+ mov cy, #0
+ bne L(unnorm) C d was even: use the shifting loop
+
+L(norm):
+ subs n, n, #1 C set carry as side-effect
+ beq L(end)
+
+C Only sbcs writes the carry flag inside the loop; the tst at the bottom sets
+C Z for the bne and has no shifted operand.
+ ALIGN(16)
+L(top): sbcs cy, r5, cy C cy = limb - cy - borrow
+ ldr r5, [up], #4
+ sub n, n, #1
+ mul r9, r4, cy C r9 = result limb
+ tst n, n
+ umull r12, cy, d, r9 C cy = high half of d * r9
+ str r9, [rp], #4
+ bne L(top)
+
+L(end): sbc cy, r5, cy C last limb: no further carry needed
+ mul r9, r4, cy
+ str r9, [rp]
+ pop {r4-r9}
+ bx r14
+
+L(unnorm):
+ rsb tnc, cnt, #32
+ mov r5, r5, lsr cnt C low part of the first shifted limb
+ subs n, n, #1 C set carry as side-effect
+ beq L(edu)
+
+C Same recurrence as L(top), applied to the source shifted right by cnt bits:
+C r9 merges the pending low part in r5 with the low bits of the next limb.
+ ALIGN(16)
+L(tpu): ldr r12, [up], #4
+ orr r9, r5, r12, lsl tnc
+ mov r5, r12, lsr cnt
+ sbcs cy, r9, cy C critical path ->cy->cy->
+ sub n, n, #1
+ mul r9, r4, cy C critical path ->cy->r9->
+ tst n, n
+ umull r12, cy, d, r9 C critical path ->r9->cy->
+ str r9, [rp], #4
+ bne L(tpu)
+
+L(edu): sbc cy, r5, cy C top limb has no higher bits to merge in
+ mul r9, r4, cy
+ str r9, [rp]
+ pop {r4-r9}
+ bx r14
+EPILOGUE()
+
+C ctz_tab: 256 bytes.  For i = 1..255 entry i is the number of trailing zero
+C bits of i; entry 0 holds 8.  Indexed with the low byte of the divisor by
+C the count-trailing-zeros code in mpn_divexact_1.
+ .section .rodata
+ctz_tab:
+ .byte 8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+ .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+ .byte 6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+ .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+ .byte 7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+ .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+ .byte 6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+ .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
diff --git a/gmp/mpn/arm/gmp-mparam.h b/gmp/mpn/arm/gmp-mparam.h
new file mode 100644
index 0000000000..87eec3a149
--- /dev/null
+++ b/gmp/mpn/arm/gmp-mparam.h
@@ -0,0 +1,127 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+/* Limb geometry for this target: 32 bits per limb, i.e. 4 bytes. */
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 1193MHz ARM (gcc55.fsffrance.org) */
+/* NOTE(review): the thresholds below are presumably measured tuning output
+   for the machine named above -- confirm before editing any of them by
+   hand. */
+
+#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */
+#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 56
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 11
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 71
+#define USE_PREINV_DIVREM_1 1 /* preinv always */
+#define DIVREM_2_THRESHOLD 0 /* preinv always */
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD 41
+
+#define MUL_TOOM22_THRESHOLD 36
+#define MUL_TOOM33_THRESHOLD 125
+#define MUL_TOOM44_THRESHOLD 193
+#define MUL_TOOM6H_THRESHOLD 303
+#define MUL_TOOM8H_THRESHOLD 418
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 125
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 176
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 114
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 129
+
+#define SQR_BASECASE_THRESHOLD 12
+#define SQR_TOOM2_THRESHOLD 78
+#define SQR_TOOM3_THRESHOLD 137
+#define SQR_TOOM4_THRESHOLD 212
+#define SQR_TOOM6_THRESHOLD 306
+#define SQR_TOOM8_THRESHOLD 422
+
+#define MULMOD_BNM1_THRESHOLD 20
+#define SQRMOD_BNM1_THRESHOLD 26
+
+/* MUL_FFT_TABLE3 holds 28 two-element entries, matching MUL_FFT_TABLE3_SIZE;
+   its first entry repeats MUL_FFT_MODF_THRESHOLD and its k value. */
+#define MUL_FFT_MODF_THRESHOLD 436 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 436, 5}, { 27, 6}, { 28, 7}, { 15, 6}, \
+ { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \
+ { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
+ { 256, 9}, { 512,10}, { 1024,11}, { 2048,12}, \
+ { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 28
+#define MUL_FFT_THRESHOLD 5760
+
+/* SQR_FFT_TABLE3 holds 26 two-element entries, matching SQR_FFT_TABLE3_SIZE;
+   its first entry repeats SQR_FFT_MODF_THRESHOLD and its k value. */
+#define SQR_FFT_MODF_THRESHOLD 404 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 404, 5}, { 13, 4}, { 27, 5}, { 27, 6}, \
+ { 28, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \
+ { 35, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 39, 9}, { 512,10}, \
+ { 1024,11}, { 2048,12}, { 4096,13}, { 8192,14}, \
+ { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 26
+#define SQR_FFT_THRESHOLD 3776
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 137
+#define MULLO_MUL_N_THRESHOLD 11479
+
+#define DC_DIV_QR_THRESHOLD 150
+#define DC_DIVAPPR_Q_THRESHOLD 494
+#define DC_BDIV_QR_THRESHOLD 148
+#define DC_BDIV_Q_THRESHOLD 345
+
+#define INV_MULMOD_BNM1_THRESHOLD 70
+#define INV_NEWTON_THRESHOLD 474
+#define INV_APPR_THRESHOLD 478
+
+#define BINV_NEWTON_THRESHOLD 542
+#define REDC_1_TO_REDC_N_THRESHOLD 117
+
+#define MU_DIV_QR_THRESHOLD 2089
+#define MU_DIVAPPR_Q_THRESHOLD 2172
+#define MUPI_DIV_QR_THRESHOLD 225
+#define MU_BDIV_QR_THRESHOLD 1528
+#define MU_BDIV_Q_THRESHOLD 2089
+
+#define MATRIX22_STRASSEN_THRESHOLD 16
+#define HGCD_THRESHOLD 197
+#define GCD_DC_THRESHOLD 902
+#define GCDEXT_DC_THRESHOLD 650
+#define JACOBI_BASE_METHOD 2
+
+#define GET_STR_DC_THRESHOLD 20
+#define GET_STR_PRECOMPUTE_THRESHOLD 39
+#define SET_STR_DC_THRESHOLD 1045
+#define SET_STR_PRECOMPUTE_THRESHOLD 2147
diff --git a/gmp/mpn/arm/invert_limb.asm b/gmp/mpn/arm/invert_limb.asm
new file mode 100644
index 0000000000..d4c3afe2da
--- /dev/null
+++ b/gmp/mpn/arm/invert_limb.asm
@@ -0,0 +1,93 @@
+dnl ARM mpn_invert_limb -- Invert a normalized limb.
+
+dnl Copyright 2001, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mpn_invert_limb: takes one limb d in r0 and returns one limb in r0.
+C The file title states that d is a normalized limb; with the top bit set,
+C d >> 23 lies in 256..511, which is why the table base is biased by -512
+C bytes (256 halfword entries).  A 16-bit table estimate is refined by the
+C multiply/subtract steps below.  Uses r1, r2, r3 and r12 as scratch; no
+C stack is used.
+ASM_START()
+PROLOGUE(mpn_invert_limb)
+ LEA( r2, approx_tab-512) C biased table base
+ mov r3, r0, lsr #23 C top 9 bits of d
+ mov r3, r3, asl #1 C byte offset of a halfword entry
+ ldrh r3, [r3, r2] C r3 = 16-bit initial estimate
+ mov r1, r3, asl #17
+ mul r12, r3, r3 C estimate squared
+ umull r3, r2, r12, r0 C r2 = high half of estimate^2 * d
+ sub r1, r1, r2, asl #1 C r1 = refined estimate
+ umull r3, r2, r1, r1 C r2:r3 = r1 squared (high:low)
+ umull r12, r3, r0, r3 C r3 = high half of d * low(r1^2)
+ umull r2, r12, r0, r2 C r12:r2 = d * high(r1^2)
+ adds r2, r2, r3
+ adc r12, r12, #0 C propagate carry into the high word
+ rsb r1, r12, r1 C r1 = r1 - r12
+ mvn r2, r2, lsr #30
+ add r2, r2, r1, asl #2 C r2 = candidate result
+ umull r12, r3, r0, r2 C r3:r12 = d * candidate
+ adds r1, r12, r0 C add d to the low word, for the carry only
+ adc r3, r3, r0 C high word + d + carry
+ rsb r0, r3, r2 C result = candidate - r3
+ bx lr
+EPILOGUE()
+
+C approx_tab: 256 halfword entries, decreasing from 0xffc0 to 0x8000.
+C mpn_invert_limb reads it through the base address approx_tab-512 with a
+C byte offset of twice the top 9 bits of its argument.
+ .section .rodata
+ ALIGN(2)
+approx_tab:
+ .short 0xffc0,0xfec0,0xfdc0,0xfcc0,0xfbc0,0xfac0,0xfa00,0xf900
+ .short 0xf800,0xf700,0xf640,0xf540,0xf440,0xf380,0xf280,0xf180
+ .short 0xf0c0,0xefc0,0xef00,0xee00,0xed40,0xec40,0xeb80,0xeac0
+ .short 0xe9c0,0xe900,0xe840,0xe740,0xe680,0xe5c0,0xe500,0xe400
+ .short 0xe340,0xe280,0xe1c0,0xe100,0xe040,0xdf80,0xdec0,0xde00
+ .short 0xdd40,0xdc80,0xdbc0,0xdb00,0xda40,0xd980,0xd8c0,0xd800
+ .short 0xd740,0xd680,0xd600,0xd540,0xd480,0xd3c0,0xd340,0xd280
+ .short 0xd1c0,0xd140,0xd080,0xcfc0,0xcf40,0xce80,0xcdc0,0xcd40
+ .short 0xcc80,0xcc00,0xcb40,0xcac0,0xca00,0xc980,0xc8c0,0xc840
+ .short 0xc780,0xc700,0xc640,0xc5c0,0xc540,0xc480,0xc400,0xc380
+ .short 0xc2c0,0xc240,0xc1c0,0xc100,0xc080,0xc000,0xbf80,0xbec0
+ .short 0xbe40,0xbdc0,0xbd40,0xbc80,0xbc00,0xbb80,0xbb00,0xba80
+ .short 0xba00,0xb980,0xb900,0xb840,0xb7c0,0xb740,0xb6c0,0xb640
+ .short 0xb5c0,0xb540,0xb4c0,0xb440,0xb3c0,0xb340,0xb2c0,0xb240
+ .short 0xb1c0,0xb140,0xb0c0,0xb080,0xb000,0xaf80,0xaf00,0xae80
+ .short 0xae00,0xad80,0xad40,0xacc0,0xac40,0xabc0,0xab40,0xaac0
+ .short 0xaa80,0xaa00,0xa980,0xa900,0xa8c0,0xa840,0xa7c0,0xa740
+ .short 0xa700,0xa680,0xa600,0xa5c0,0xa540,0xa4c0,0xa480,0xa400
+ .short 0xa380,0xa340,0xa2c0,0xa240,0xa200,0xa180,0xa140,0xa0c0
+ .short 0xa080,0xa000,0x9f80,0x9f40,0x9ec0,0x9e80,0x9e00,0x9dc0
+ .short 0x9d40,0x9d00,0x9c80,0x9c40,0x9bc0,0x9b80,0x9b00,0x9ac0
+ .short 0x9a40,0x9a00,0x9980,0x9940,0x98c0,0x9880,0x9840,0x97c0
+ .short 0x9780,0x9700,0x96c0,0x9680,0x9600,0x95c0,0x9580,0x9500
+ .short 0x94c0,0x9440,0x9400,0x93c0,0x9340,0x9300,0x92c0,0x9240
+ .short 0x9200,0x91c0,0x9180,0x9100,0x90c0,0x9080,0x9000,0x8fc0
+ .short 0x8f80,0x8f40,0x8ec0,0x8e80,0x8e40,0x8e00,0x8d80,0x8d40
+ .short 0x8d00,0x8cc0,0x8c80,0x8c00,0x8bc0,0x8b80,0x8b40,0x8b00
+ .short 0x8a80,0x8a40,0x8a00,0x89c0,0x8980,0x8940,0x88c0,0x8880
+ .short 0x8840,0x8800,0x87c0,0x8780,0x8740,0x8700,0x8680,0x8640
+ .short 0x8600,0x85c0,0x8580,0x8540,0x8500,0x84c0,0x8480,0x8440
+ .short 0x8400,0x8380,0x8340,0x8300,0x82c0,0x8280,0x8240,0x8200
+ .short 0x81c0,0x8180,0x8140,0x8100,0x80c0,0x8080,0x8040,0x8000
+ASM_END()
diff --git a/gmp/mpn/arm/logops_n.asm b/gmp/mpn/arm/logops_n.asm
new file mode 100644
index 0000000000..5a61683fc2
--- /dev/null
+++ b/gmp/mpn/arm/logops_n.asm
@@ -0,0 +1,139 @@
+dnl ARM mpn_and_n, mpn_andn_n, mpn_nand_n, etc.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 1997, 2000, 2001, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb
+C and andn ior xor nand iorn nior xnor
+C StrongARM ? ?
+C XScale ? ?
+C Cortex-A7 ? ?
+C Cortex-A8 ? ?
+C Cortex-A9 2.5-2.72 2.75-3
+C Cortex-A15 2.25 2.75
+
+C TODO
+C * It seems that 2.25 c/l and 2.75 c/l is possible for A9.
+C * Debug popping issue, see comment below.
+
+C Register roles:
+C   rp = r0  pointer the result limbs are stored through
+C   up = r1  pointer to the first operand
+C   vp = r2  pointer to the second operand
+C   n  = r3  limb count
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n', `r3')
+
+C POSTOP defaults to an empty expansion.  The complemented variants below
+C redefine it as an mvn of its argument, applied after LOGOP.
+define(`POSTOP')
+
+C Exactly one OPERATION_xxx symbol is expected to be defined by the build;
+C it selects the entry point name (func) and the instruction behind LOGOP.
+C LOGOP takes (destination, up limb, vp limb).
+ifdef(`OPERATION_and_n',`
+ define(`func', `mpn_and_n')
+ define(`LOGOP', `and $1, $2, $3')')
+ifdef(`OPERATION_andn_n',`
+ define(`func', `mpn_andn_n')
+ define(`LOGOP', `bic $1, $2, $3')')
+ifdef(`OPERATION_nand_n',`
+ define(`func', `mpn_nand_n')
+ define(`POSTOP', `mvn $1, $1')
+ define(`LOGOP', `and $1, $2, $3')')
+ifdef(`OPERATION_ior_n',`
+ define(`func', `mpn_ior_n')
+ define(`LOGOP', `orr $1, $2, $3')')
+C iorn swaps the bic operands: LOGOP gives v AND NOT u, and the mvn in
+C POSTOP turns that into u OR NOT v.
+ifdef(`OPERATION_iorn_n',`
+ define(`func', `mpn_iorn_n')
+ define(`POSTOP', `mvn $1, $1')
+ define(`LOGOP', `bic $1, $3, $2')')
+ifdef(`OPERATION_nior_n',`
+ define(`func', `mpn_nior_n')
+ define(`POSTOP', `mvn $1, $1')
+ define(`LOGOP', `orr $1, $2, $3')')
+ifdef(`OPERATION_xor_n',`
+ define(`func', `mpn_xor_n')
+ define(`LOGOP', `eor $1, $2, $3')')
+ifdef(`OPERATION_xnor_n',`
+ define(`func', `mpn_xnor_n')
+ define(`POSTOP', `mvn $1, $1')
+ define(`LOGOP', `eor $1, $2, $3')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+C Body shared by all eight variants.  Bits 0 and 1 of n are peeled off first
+C (1 limb, then 2 limbs); the rest is processed 4 limbs at a time.
+C r8-r10 are saved for the whole function, r4-r7 only around the 4-limb loop.
+C r12 is used as scratch.
+ASM_START()
+PROLOGUE(func)
+ push { r8, r9, r10 }
+ tst n, #1 C odd count: one limb
+ beq L(skip1)
+ ldr r10, [vp], #4
+ ldr r12, [up], #4
+ LOGOP( r12, r12, r10)
+ POSTOP( r12)
+ str r12, [rp], #4
+L(skip1):
+ tst n, #2 C bit 1 set: two limbs
+ beq L(skip2)
+ ldmia vp!, { r10, r12 }
+ ldmia up!, { r8, r9 }
+ LOGOP( r8, r8, r10)
+ LOGOP( r9, r9, r12)
+ POSTOP( r8)
+ POSTOP( r9)
+ stmia rp!, { r8, r9 }
+L(skip2):
+ bics n, n, #3 C n = remaining multiple of 4
+ beq L(rtn)
+ push { r4, r5, r6, r7 }
+
+ ldmia vp!, { r8, r9, r10, r12 }
+ b L(mid) C enter the pipelined loop in the middle
+
+C L(top) loads the next vp group and finishes and stores the group computed
+C at L(mid).  The flags tested by bne come from teq; the LOGOP instructions
+C do not set flags.
+L(top): ldmia vp!, { r8, r9, r10, r12 }
+ POSTOP( r4)
+ POSTOP( r5)
+ POSTOP( r6)
+ POSTOP( r7)
+ stmia rp!, { r4, r5, r6, r7 }
+L(mid): sub n, n, #4
+ ldmia up!, { r4, r5, r6, r7 }
+ teq n, #0
+ LOGOP( r4, r4, r8)
+ LOGOP( r5, r5, r9)
+ LOGOP( r6, r6, r10)
+ LOGOP( r7, r7, r12)
+ bne L(top)
+
+C Finish and store the last group.
+ POSTOP( r4)
+ POSTOP( r5)
+ POSTOP( r6)
+ POSTOP( r7)
+ stmia rp!, { r4, r5, r6, r7 }
+
+C r8-r10 are restored at L(rtn) because the short path from L(skip2) also
+C exits through that label with only r8-r10 on the stack.
+ pop { r4, r5, r6, r7 } C popping r8-r10 here strangely fails
+
+L(rtn): pop { r8, r9, r10 }
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/lshift.asm b/gmp/mpn/arm/lshift.asm
new file mode 100644
index 0000000000..9f777eb4dd
--- /dev/null
+++ b/gmp/mpn/arm/lshift.asm
@@ -0,0 +1,88 @@
+dnl ARM mpn_lshift.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM ?
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 3.5
+C Cortex-A15 ?
+
+C Register roles:
+C   rp  = r0   result pointer; moved to rp + 4*n and then pre-decremented
+C   up  = r1   source pointer; moved to up + 4*n and then pre-decremented
+C   n   = r2   limb count
+C   cnt = r3   left shift amount in bits
+C   tnc = r12  32 - cnt, the complementary right shift amount
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+define(`cnt', `r3')
+define(`tnc', `r12')
+
+C mpn_lshift: shift the n-limb operand at up left by cnt bits, storing n
+C limbs at rp and returning in r0 the bits shifted out of the top limb.
+C Limbs are processed from the highest address downwards.  The loop is
+C unrolled twice, with r6 and r8 alternating as the current source limb and
+C r7 holding the partially built result limb.  The top source limb is always
+C loaded, so the code assumes n >= 1.
+C NOTE(review): presumably 1 <= cnt <= 31 is required by the caller contract
+C -- confirm.
+ASM_START()
+PROLOGUE(mpn_lshift)
+ add up, up, n, lsl #2
+ push {r4, r6, r7, r8}
+ ldr r4, [up, #-4]! C r4 = most significant source limb
+ add rp, rp, n, lsl #2
+ rsb tnc, cnt, #32
+
+ mov r7, r4, lsl cnt C high part of the top result limb
+ tst n, #1
+ beq L(evn) C n even
+
+L(odd): subs n, n, #2
+ bcc L(1) C n = 1
+ ldr r8, [up, #-4]!
+ b L(mid)
+
+L(evn): ldr r6, [up, #-4]!
+ subs n, n, #2
+ beq L(end)
+
+C Each half of the loop completes the pending limb with the high bits of the
+C next lower limb, stores it, and starts the following result limb.
+L(top): ldr r8, [up, #-4]!
+ orr r7, r7, r6, lsr tnc
+ str r7, [rp, #-4]!
+ mov r7, r6, lsl cnt
+L(mid): ldr r6, [up, #-4]!
+ orr r7, r7, r8, lsr tnc
+ str r7, [rp, #-4]!
+ mov r7, r8, lsl cnt
+ subs n, n, #2
+ bgt L(top)
+
+L(end): orr r7, r7, r6, lsr tnc
+ str r7, [rp, #-4]!
+ mov r7, r6, lsl cnt
+L(1): str r7, [rp, #-4] C lowest limb: zero bits shifted in
+ mov r0, r4, lsr tnc C return bits shifted out of the top limb
+ pop {r4, r6, r7, r8}
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/lshiftc.asm b/gmp/mpn/arm/lshiftc.asm
new file mode 100644
index 0000000000..5f3d6e3f5b
--- /dev/null
+++ b/gmp/mpn/arm/lshiftc.asm
@@ -0,0 +1,95 @@
+dnl ARM mpn_lshiftc.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM ?
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 4.0
+C Cortex-A15 ?
+
+C Register roles:
+C   rp  = r0   result pointer; moved to rp + 4*n and then pre-decremented
+C   up  = r1   source pointer; moved to up + 4*n and then pre-decremented
+C   n   = r2   limb count
+C   cnt = r3   left shift amount in bits
+C   tnc = r12  32 - cnt, the complementary right shift amount
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+define(`cnt', `r3')
+define(`tnc', `r12')
+
+C mpn_lshiftc: like a left shift by cnt bits, except that every source limb
+C is complemented (mvn) before being shifted, so the stored result is the
+C complement of the shifted operand, including the cnt low bits of the
+C lowest limb.  The value returned in r0 is taken from the uncomplemented top
+C source limb kept in r4.  The top source limb is always loaded, so the code
+C assumes n >= 1.
+C NOTE(review): presumably 1 <= cnt <= 31 is required by the caller contract
+C -- confirm.
+ASM_START()
+PROLOGUE(mpn_lshiftc)
+ add up, up, n, lsl #2
+ push {r4, r6, r7, r8}
+ ldr r4, [up, #-4]! C r4 = most significant source limb
+ add rp, rp, n, lsl #2
+ rsb tnc, cnt, #32
+ mvn r6, r4 C complemented copy used for the result
+
+ mov r7, r6, lsl cnt
+ tst n, #1
+ beq L(evn) C n even
+
+L(odd): subs n, n, #2
+ bcc L(1) C n = 1
+ ldr r8, [up, #-4]!
+ mvn r8, r8
+ b L(mid)
+
+L(evn): ldr r6, [up, #-4]!
+ mvn r6, r6
+ subs n, n, #2
+ beq L(end)
+
+C Two-way unrolled loop; r6 and r8 alternate as the current complemented
+C source limb and r7 holds the partially built result limb.
+L(top): ldr r8, [up, #-4]!
+ orr r7, r7, r6, lsr tnc
+ str r7, [rp, #-4]!
+ mvn r8, r8
+ mov r7, r6, lsl cnt
+L(mid): ldr r6, [up, #-4]!
+ orr r7, r7, r8, lsr tnc
+ str r7, [rp, #-4]!
+ mvn r6, r6
+ mov r7, r8, lsl cnt
+ subs n, n, #2
+ bgt L(top)
+
+L(end): orr r7, r7, r6, lsr tnc
+ str r7, [rp, #-4]!
+ mov r7, r6, lsl cnt
+L(1): mvn r6, #0 C all ones
+ orr r7, r7, r6, lsr tnc C set the cnt low bits of the lowest limb
+ str r7, [rp, #-4]
+ mov r0, r4, lsr tnc C return bits from the uncomplemented top limb
+ pop {r4, r6, r7, r8}
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/mod_34lsub1.asm b/gmp/mpn/arm/mod_34lsub1.asm
new file mode 100644
index 0000000000..ba3c06d8db
--- /dev/null
+++ b/gmp/mpn/arm/mod_34lsub1.asm
@@ -0,0 +1,121 @@
+dnl ARM mpn_mod_34lsub1 -- remainder modulo 2^24-1.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM ?
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 1.33
+C Cortex-A15 1.33
+
+C Register roles:
+C   ap = r0  pointer the source limbs are loaded through (post-incremented)
+C   n  = r1  limb count, kept biased by the subtractions below
+define(`ap', r0)
+define(`n', r1)
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
+
+C TODO
+C * Write cleverer summation code.
+C * Consider loading 6 64-bit aligned registers at a time, to approach 1 c/l.
+
+C Method: the limbs are summed into three 32-bit accumulators r2, r3, r12
+C according to their index modulo 3, with the carry out of each add chained
+C into the next accumulator and the overflow out of the third one collected
+C in r7.  Since 2^32, 2^64 and 2^96 are congruent to 2^8, 2^16 and 1 modulo
+C 2^24-1, L(sum2) then adds the accumulators together as 24-bit pieces
+C rotated by 0, 8 and 16 bits.  The returned value is such a sum; no final
+C reduction below 2^24-1 is performed here.
+C r4-r7 are saved on entry and restored on both exits.  The code assumes
+C n >= 1.
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mod_34lsub1)
+ push { r4, r5, r6, r7 }
+
+ subs n, n, #3
+ mov r7, #0
+ blt L(le2) C n <= 2
+
+ ldmia ap!, { r2, r3, r12 } C first three limbs seed the accumulators
+ subs n, n, #3
+ blt L(sum) C n <= 5
+ cmn r0, #0 C clear carry
+ sub n, n, #3
+ b L(mid)
+
+C The carry flag travels around the loop through the adcs chain; ldmia and
+C sub do not set flags and the tst has no shifted operand.
+L(top): adcs r2, r2, r4
+ adcs r3, r3, r5
+ adcs r12, r12, r6
+L(mid): ldmia ap!, { r4, r5, r6 }
+ tst n, n C sign of n decides whether to loop again
+ sub n, n, #3
+ bpl L(top)
+
+ add n, n, #3 C undo the last sub: n is -3, -2 or -1
+
+ adcs r2, r2, r4
+ adcs r3, r3, r5
+ adcs r12, r12, r6
+ movcs r7, #1 C r7 <= 1
+
+C Here n + 3 limbs (0, 1 or 2) remain; absent limbs are replaced by zero.
+L(sum): cmn n, #2
+ movlo r4, #0 C no limbs left
+ ldrhs r4, [ap], #4 C at least one limb left
+ movls r5, #0 C fewer than two limbs left
+ ldrhi r5, [ap], #4 C two limbs left
+
+ adds r2, r2, r4
+ adcs r3, r3, r5
+ adcs r12, r12, #0
+ adc r7, r7, #0 C r7 <= 2
+
+L(sum2):
+ bic r0, r2, #0xff000000 C low 24 bits of r2
+ add r0, r0, r2, lsr #24 C plus its top 8 bits
+ add r0, r0, r7 C plus the collected overflow
+
+ mov r7, r3, lsl #8 C r3 carries weight 2^8
+ bic r1, r7, #0xff000000
+ add r0, r0, r1
+ add r0, r0, r3, lsr #16
+
+ mov r7, r12, lsl #16 C r12 carries weight 2^16
+ bic r1, r7, #0xff000000
+ add r0, r0, r1
+ add r0, r0, r12, lsr #8
+
+ pop { r4, r5, r6, r7 }
+ bx lr
+
+C n is 1 or 2 here, held as n - 3.
+L(le2): cmn n, #1
+ bne L(1)
+ ldmia ap!, { r2, r3 } C n = 2
+ mov r12, #0
+ b L(sum2)
+L(1): ldr r2, [ap] C n = 1: fold the single limb
+ bic r0, r2, #0xff000000
+ add r0, r0, r2, lsr #24
+ pop { r4, r5, r6, r7 }
+ bx lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/mode1o.asm b/gmp/mpn/arm/mode1o.asm
new file mode 100644
index 0000000000..5e0f78fc8f
--- /dev/null
+++ b/gmp/mpn/arm/mode1o.asm
@@ -0,0 +1,92 @@
+dnl ARM mpn_modexact_1c_odd
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM ?
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 10
+C Cortex-A15 9
+
+C Architecture requirements:
+C v5 -
+C v5t -
+C v5te -
+C v6 -
+C v6t2 -
+C v7a -
+
+C Register roles:
+C   up = r0  pointer the source limbs are loaded through (post-incremented)
+C   n  = r1  limb count, counted down to zero
+C   d  = r2  divisor limb
+C   cy = r3  carry limb: fourth argument on entry, updated every iteration
+C   r4       inverse of d modulo 2^32 once the setup code has run
+define(`up', `r0')
+define(`n', `r1')
+define(`d', `r2')
+define(`cy', `r3')
+
+C mpn_modexact_1c_odd: runs the recurrence
+C   t = limb - cy - borrow;  q = t * inverse;  cy = high half of d * q
+C over the n source limbs and returns the final cy, plus 1 when the last
+C subtraction borrowed, in r0.  No limbs are stored.  up[0] is always
+C loaded, so the code assumes n >= 1.
+C NOTE(review): binvert_limb_table is defined elsewhere; presumably it holds
+C an 8-bit inverse indexed by bits 1-7 of an odd d -- confirm.
+ .protected binvert_limb_table
+ASM_START()
+PROLOGUE(mpn_modexact_1c_odd)
+ stmfd sp!, {r4, r5}
+
+ LEA( r4, binvert_limb_table)
+
+ ldr r5, [up], #4 C up[0]
+
+C The table byte is refined twice with steps of the form
+C   inv = 2*inv - d*inv*inv
+ and r12, d, #254
+ ldrb r4, [r4, r12, lsr #1]
+ mul r12, r4, r4
+ mul r12, d, r12
+ rsb r12, r12, r4, asl #1
+ mul r4, r12, r12
+ mul r4, d, r4
+ rsb r4, r4, r12, asl #1 C r4 = inverse
+
+ subs n, n, #1 C set carry as side-effect
+ beq L(end)
+
+C Only sbcs writes the carry flag inside the loop; the tst sets Z for the
+C bne and has no shifted operand.
+L(top): sbcs cy, r5, cy
+ ldr r5, [up], #4
+ sub n, n, #1
+ mul r12, r4, cy
+ tst n, n
+ umull r12, cy, d, r12
+ bne L(top)
+
+L(end): sbcs cy, r5, cy
+ mul r12, r4, cy
+ umull r12, r0, d, r12 C r0 = high half of d * q
+ addcc r0, r0, #1 C carry clear = the last sbcs borrowed
+
+ ldmfd sp!, {r4, r5}
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/mul_1.asm b/gmp/mpn/arm/mul_1.asm
new file mode 100644
index 0000000000..f7bc1bc386
--- /dev/null
+++ b/gmp/mpn/arm/mul_1.asm
@@ -0,0 +1,94 @@
+dnl ARM mpn_mul_1 -- Multiply a limb vector with a limb and store the result
+dnl in a second limb vector.
+dnl Contributed by Robert Harley.
+
+dnl Copyright 1998, 2000, 2001, 2003, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM 6-8
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 4.75
+C Cortex-A15 ?
+
+C We should rewrite this along the lines of addmul_1.asm. That should save a
+C cycle on StrongARM, and several cycles on XScale.
+
+C Register roles:
+C   rp = r0  pointer the product limbs are stored through
+C   up = r1  pointer the source limbs are loaded through
+C   n  = r2  limb count
+C   vl = r3  the single multiplier limb
+C   r12      carry limb handed from one product to the next; returned in r0
+define(`rp',`r0')
+define(`up',`r1')
+define(`n',`r2')
+define(`vl',`r3')
+
+
+C mpn_mul_1: store the n low product limbs of the operand at up times vl at
+C rp and return the final carry limb.  Bits 0 and 1 of n are peeled off
+C first (1 limb, then 2 limbs); the rest is processed 4 limbs at a time.
+C Each umlal adds a 64-bit product onto a low word preloaded with the
+C incoming carry limb and a high word preloaded with zero, so its high word
+C becomes the next carry limb.  lr is used as a data register, which is why
+C it is saved and the function returns by loading pc from the stack.
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ stmfd sp!, { r8, r9, lr }
+ ands r12, n, #1 C r12 = 0 is the initial carry when n is even
+ beq L(skip1)
+ ldr lr, [up], #4
+ umull r9, r12, lr, vl C first product; r12 = carry limb
+ str r9, [rp], #4
+L(skip1):
+ tst n, #2
+ beq L(skip2)
+ mov r8, r12 C carry in becomes the low accumulator word
+ ldmia up!, { r12, lr }
+ mov r9, #0
+ umlal r8, r9, r12, vl
+ mov r12, #0
+ umlal r9, r12, lr, vl
+ stmia rp!, { r8, r9 }
+L(skip2):
+ bics n, n, #3 C n = remaining multiple of 4
+ beq L(rtn)
+ stmfd sp!, { r6, r7 }
+
+C The value loaded by the ldr from rp is discarded: r7 is overwritten by the
+C following mov.  The original comment gives cache allocation as the purpose.
+L(top): mov r6, r12
+ ldmia up!, { r8, r9, r12, lr }
+ ldr r7, [rp, #12] C cache allocate
+ mov r7, #0
+ umlal r6, r7, r8, vl
+ mov r8, #0
+ umlal r7, r8, r9, vl
+ mov r9, #0
+ umlal r8, r9, r12, vl
+ mov r12, #0
+ umlal r9, r12, lr, vl
+ subs n, n, #4
+ stmia rp!, { r6, r7, r8, r9 }
+ bne L(top)
+
+ ldmfd sp!, { r6, r7 }
+
+L(rtn): mov r0, r12 C return the carry limb
+ ldmfd sp!, { r8, r9, pc }
+EPILOGUE()
diff --git a/gmp/mpn/arm/neon/README b/gmp/mpn/arm/neon/README
new file mode 100644
index 0000000000..79e3b48ee6
--- /dev/null
+++ b/gmp/mpn/arm/neon/README
@@ -0,0 +1,2 @@
+This directory contains Neon code which runs and is efficient on all
+ARM CPUs which support Neon.
diff --git a/gmp/mpn/arm/neon/hamdist.asm b/gmp/mpn/arm/neon/hamdist.asm
new file mode 100644
index 0000000000..232089647d
--- /dev/null
+++ b/gmp/mpn/arm/neon/hamdist.asm
@@ -0,0 +1,194 @@
+dnl ARM Neon mpn_hamdist -- mpn bit hamming distance.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM: -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 1.89
+C Cortex-A15 0.95
+
+C TODO
+C * Explore using vldr and vldm. Does it help on A9? (These loads do
+C 64-bits-at-a-time, which will mess up in big-endian mode. Except not for
+C popcount. Except perhaps also for popcount for the edge loads.)
+C * Arrange to align the pointer, if that helps performance. Use the same
+C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry
+C valgrind!)
+C * Explore if explicit align directives, e.g., "[ptr:128]" help.
+C * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
+
+C INPUT PARAMETERS
+C   ap = r0  pointer to the first operand
+C   bp = r1  pointer to the second operand
+C   n  = r2  limb count
+define(`ap', r0)
+define(`bp', r1)
+define(`n', r2)
+
+C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end
+C up with 8 16-bit counters. Therefore, we can sum to 8(2^16-1) bits, or
+C (8*2^16-1)/32 = 0x3fff limbs. We use a chunksize close to that, but which
+C can be represented as an 8-bit ARM constant.
+C
+define(`chunksize',0x3f80)
+
+C mpn_hamdist: return in r0 the number of set bits in the exclusive-or of
+C the two n-limb operands.  Counts up to chunksize limbs are handled directly
+C by the code at L(lt16k); larger counts go through L(gt16k), which calls
+C that code once per chunk and adds up the partial results.
+C Within L(lt16k), bits 0, 1 and 2 of n are peeled off first (1, 2 and 4
+C limbs); bit 3 selects the entry into the main loop, which consumes 16
+C limbs of each operand per iteration.
+ASM_START()
+PROLOGUE(mpn_hamdist)
+
+ cmp n, #chunksize
+ bhi L(gt16k)
+
+L(lt16k):
+ vmov.i64 q8, #0 C clear summation register
+ vmov.i64 q9, #0 C clear summation register
+
+ tst n, #1
+ beq L(xxx0)
+ vmov.i64 d0, #0
+ vmov.i64 d20, #0
+ sub n, n, #1
+ vld1.32 {d0[0]}, [ap]! C load 1 limb
+ vld1.32 {d20[0]}, [bp]! C load 1 limb
+ veor d0, d0, d20
+ vcnt.8 d24, d0
+ vpadal.u8 d16, d24 C d16/q8 = 0; could just splat
+
+L(xxx0):tst n, #2
+ beq L(xx00)
+ sub n, n, #2
+ vld1.32 {d0}, [ap]! C load 2 limbs
+ vld1.32 {d20}, [bp]! C load 2 limbs
+ veor d0, d0, d20
+ vcnt.8 d24, d0
+ vpadal.u8 d16, d24
+
+L(xx00):tst n, #4
+ beq L(x000)
+ sub n, n, #4
+ vld1.32 {q0}, [ap]! C load 4 limbs
+ vld1.32 {q10}, [bp]! C load 4 limbs
+ veor q0, q0, q10
+ vcnt.8 q12, q0
+ vpadal.u8 q8, q12
+
+L(x000):tst n, #8
+ beq L(0000)
+
+ subs n, n, #8
+ vld1.32 {q0,q1}, [ap]! C load 8 limbs
+ vld1.32 {q10,q11}, [bp]! C load 8 limbs
+ bls L(sum) C these were the last 8 limbs
+
+C An odd number of 8-limb groups remains: start the first group and enter
+C the loop at L(mid).
+L(gt8): vld1.32 {q2,q3}, [ap]! C load 8 limbs
+ vld1.32 {q14,q15}, [bp]! C load 8 limbs
+ veor q0, q0, q10
+ veor q1, q1, q11
+ sub n, n, #8
+ vcnt.8 q12, q0
+ vcnt.8 q13, q1
+ b L(mid)
+
+C n is a multiple of 16 here.
+L(0000):subs n, n, #16
+ blo L(e0) C nothing left: q9 is still zero here
+
+ vld1.32 {q2,q3}, [ap]! C load 8 limbs
+ vld1.32 {q0,q1}, [ap]! C load 8 limbs
+ vld1.32 {q14,q15}, [bp]! C load 8 limbs
+ vld1.32 {q10,q11}, [bp]! C load 8 limbs
+ veor q2, q2, q14
+ veor q3, q3, q15
+ vcnt.8 q12, q2
+ vcnt.8 q13, q3
+ subs n, n, #16
+ blo L(end)
+
+C Pipelined loop: each half loads 8 limbs of each operand, xors the group
+C loaded by the other half, and accumulates the byte counts computed one
+C step earlier.
+L(top): vld1.32 {q2,q3}, [ap]! C load 8 limbs
+ vld1.32 {q14,q15}, [bp]! C load 8 limbs
+ veor q0, q0, q10
+ veor q1, q1, q11
+ vpadal.u8 q8, q12
+ vcnt.8 q12, q0
+ vpadal.u8 q9, q13
+ vcnt.8 q13, q1
+L(mid): vld1.32 {q0,q1}, [ap]! C load 8 limbs
+ vld1.32 {q10,q11}, [bp]! C load 8 limbs
+ veor q2, q2, q14
+ veor q3, q3, q15
+ subs n, n, #16
+ vpadal.u8 q8, q12
+ vcnt.8 q12, q2
+ vpadal.u8 q9, q13
+ vcnt.8 q13, q3
+ bhs L(top)
+
+C Drain the pipeline: accumulate the pending counts, then process the last
+C loaded group.
+L(end): vpadal.u8 q8, q12
+ vpadal.u8 q9, q13
+L(sum): veor q0, q0, q10
+ veor q1, q1, q11
+ vcnt.8 q12, q0
+ vcnt.8 q13, q1
+ vpadal.u8 q8, q12
+ vpadal.u8 q9, q13
+ vadd.i16 q8, q8, q9
+ C we have 8 16-bit counts
+L(e0): vpaddl.u16 q8, q8 C we have 4 32-bit counts
+ vpaddl.u32 q8, q8 C we have 2 64-bit counts
+ vmov.32 r0, d16[0]
+ vmov.32 r1, d17[0]
+ add r0, r0, r1 C result = sum of the two halves
+ bx lr
+
+C Code for large count. Splits operand and calls above code.
+C r3 holds the limbs still to be processed and r4 the running total; the
+C code at L(lt16k) does not touch r3-r6.  r14 is saved because bl overwrites
+C it, and the final pop returns by loading it into pc.
+define(`ap2', r5)
+define(`bp2', r6)
+L(gt16k):
+ push {r4,r5,r6,r14}
+ mov ap2, ap
+ mov bp2, bp
+ mov r3, n C full count
+ mov r4, #0 C total sum
+
+1: mov n, #chunksize C count for this invocation
+ bl L(lt16k) C could jump deep inside code
+ add ap2, ap2, #chunksize*4 C point at next chunk
+ add bp2, bp2, #chunksize*4 C point at next chunk
+ add r4, r4, r0
+ mov ap, ap2 C put chunk pointer in place for call
+ mov bp, bp2 C put chunk pointer in place for call
+ sub r3, r3, #chunksize
+ cmp r3, #chunksize
+ bhi 1b
+
+ mov n, r3 C count for final invocation
+ bl L(lt16k)
+ add r0, r4, r0
+ pop {r4,r5,r6,pc}
+EPILOGUE()
diff --git a/gmp/mpn/arm/neon/lorrshift.asm b/gmp/mpn/arm/neon/lorrshift.asm
new file mode 100644
index 0000000000..3d6253fd49
--- /dev/null
+++ b/gmp/mpn/arm/neon/lorrshift.asm
@@ -0,0 +1,279 @@
+dnl ARM Neon mpn_lshift and mpn_rshift.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C StrongARM - -
+C XScale - -
+C Cortex-A7 ? ?
+C Cortex-A8 ? ?
+C Cortex-A9 3 3 Y
+C Cortex-A15 1.5 1.5 Y
+
+
+C We read 64 bits at a time at 32-bit aligned addresses, and except for the
+C first and last store, we write using 64-bit aligned addresses. All shifting
+C is done on 64-bit words in 'extension' registers.
+C
+C It should be possible to read also using 64-bit alignment, by manipulating
+C the shift count for unaligned operands. Not done, since it does not seem to
+C matter for A9 or A15.
+C
+C This will not work in big-endian mode.
+
+C TODO
+C * Try using 128-bit operations. Note that Neon lacks pure 128-bit shifts,
+C which might make it tricky.
+C * Clean up and simplify.
+C * Consider sharing most of the code for lshift and rshift, since the feed-in code,
+C the loop, and most of the wind-down code are identical.
+C * Replace the basecase code with code using 'extension' registers.
+C * Optimise. It is not clear that this loop insn permutation is optimal for
+C either A9 or A15.
+
+C INPUT PARAMETERS
+C rp: destination limb pointer, ap: source limb pointer, n: limb count,
+C cnt: shift count in bits.
+define(`rp', `r0')
+define(`ap', `r1')
+define(`n', `r2')
+define(`cnt', `r3')
+
+C Per-operation configuration, selected by the OPERATION_* symbol:
+C IFLSH(x) / IFRSH(x) emit x only in the lshift / rshift build.
+C X is the lane of d18 that is moved to r0 as the return value.
+C Y is the lane used for the single-limb vector loads and stores.
+C func is the name of the function being defined.
+ifdef(`OPERATION_lshift',`
+ define(`IFLSH', `$1')
+ define(`IFRSH', `')
+ define(`X',`0')
+ define(`Y',`1')
+ define(`func',`mpn_lshift')
+')
+ifdef(`OPERATION_rshift',`
+ define(`IFLSH', `')
+ define(`IFRSH', `$1')
+ define(`X',`1')
+ define(`Y',`0')
+ define(`func',`mpn_rshift')
+')
+
+MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+C func is mpn_lshift or mpn_rshift, depending on the OPERATION_* symbol.
+C In: rp (r0) destination, ap (r1) source, n (r2) limb count, cnt (r3) shift
+C count. Out: r0 = the bits shifted out of the operand.
+C n <= 4 is handled by the scalar code at L(base); larger n by the Neon code.
+PROLOGUE(func)
+IFLSH(` mov r12, n, lsl #2 ') C lshift: r12 = operand size in bytes
+IFLSH(` add rp, rp, r12 ') C lshift: point rp just past the result area
+IFLSH(` add ap, ap, r12 ') C lshift: point ap just past the source
+
+ cmp n, #4 C SIMD code n limit
+ ble L(base)
+
+ifdef(`OPERATION_lshift',`
+ vdup.32 d6, r3 C left shift count is positive
+ sub r3, r3, #64 C right shift count is negative
+ vdup.32 d7, r3
+ mov r12, #-8') C lshift pointer update offset
+ifdef(`OPERATION_rshift',`
+ rsb r3, r3, #0 C right shift count is negative
+ vdup.32 d6, r3
+ add r3, r3, #64 C left shift count is positive
+ vdup.32 d7, r3
+ mov r12, #8') C rshift pointer update offset
+
+C From here on d6 holds the main shift count, d7 the count for the opposite
+C direction, and r12 the post-increment applied to ap and rp (-8 or +8).
+IFLSH(` sub ap, ap, #8 ')
+ vld1.32 {d19}, [ap], r12 C load initial 2 limbs
+ vshl.u64 d18, d19, d7 C retval
+
+ tst rp, #4 C is rp 64-bit aligned already?
+ beq L(rp_aligned) C yes, skip
+IFLSH(` add ap, ap, #4 ') C move back ap pointer
+IFRSH(` sub ap, ap, #4 ') C move back ap pointer
+ vshl.u64 d4, d19, d6
+ sub n, n, #1 C first limb handled
+IFLSH(` sub rp, rp, #4 ')
+ vst1.32 {d4[Y]}, [rp]IFRSH(!) C store first limb, rp gets aligned
+ vld1.32 {d19}, [ap], r12 C load ap[1] and ap[2]
+
+L(rp_aligned):
+IFLSH(` sub rp, rp, #8 ')
+ subs n, n, #6
+ blt L(two_or_three_more) C too few limbs left for the loop
+ tst n, #2 C pick the loop entry point
+ beq L(2)
+
+C In the code below d4/d5 receive a 64-bit word shifted by d6 and d0/d1 the
+C neighbouring word shifted by d7; each stored word is the OR of one of each.
+L(1): vld1.32 {d17}, [ap], r12
+ vshl.u64 d5, d19, d6
+ vld1.32 {d16}, [ap], r12
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ sub n, n, #2
+ b L(mid)
+
+L(2): vld1.32 {d16}, [ap], r12
+ vshl.u64 d4, d19, d6
+ vld1.32 {d17}, [ap], r12
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ subs n, n, #4
+ blt L(end)
+
+C Main loop: two 64-bit words (4 limbs) are loaded and stored per iteration.
+L(top): vld1.32 {d16}, [ap], r12
+ vorr d2, d4, d1
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ vst1.32 {d2}, [rp:64], r12
+L(mid): vld1.32 {d17}, [ap], r12
+ vorr d3, d5, d0
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ vst1.32 {d3}, [rp:64], r12
+ subs n, n, #4
+ bge L(top)
+
+L(end): tst n, #1 C odd or even number of limbs left?
+ beq L(evn)
+
+ vorr d2, d4, d1
+ vst1.32 {d2}, [rp:64], r12
+ b L(cj1)
+
+L(evn): vorr d2, d4, d1
+ vshl.u64 d0, d17, d7
+ vshl.u64 d16, d17, d6
+ vst1.32 {d2}, [rp:64], r12
+ vorr d2, d5, d0
+ b L(cj2)
+
+C Load last 2 - 3 limbs, store last 4 - 5 limbs
+L(two_or_three_more):
+ tst n, #1
+ beq L(l2)
+
+L(l3): vshl.u64 d5, d19, d6
+ vld1.32 {d17}, [ap], r12
+L(cj1): veor d16, d16, d16 C d16 = 0, so the unloaded lane stays zero
+IFLSH(` add ap, ap, #4 ')
+ vld1.32 {d16[Y]}, [ap], r12 C load the single remaining limb
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ vorr d3, d5, d0
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ vst1.32 {d3}, [rp:64], r12
+ vorr d2, d4, d1
+ vst1.32 {d2}, [rp:64], r12
+IFLSH(` add rp, rp, #4 ')
+ vst1.32 {d5[Y]}, [rp] C store the final single limb
+ vmov.32 r0, d18[X] C return value computed at function entry
+ bx lr
+
+L(l2): vld1.32 {d16}, [ap], r12
+ vshl.u64 d4, d19, d6
+ vshl.u64 d1, d16, d7
+ vshl.u64 d16, d16, d6
+ vorr d2, d4, d1
+L(cj2): vst1.32 {d2}, [rp:64], r12
+ vst1.32 {d16}, [rp]
+ vmov.32 r0, d18[X] C return value computed at function entry
+ bx lr
+
+
+C Scalar code for n <= 4. tnc = 32 - cnt is the complementary shift count.
+C r4 keeps the first limb loaded for the return value; r7 accumulates the
+C limb being built; r6 and r8 alternate as the most recently loaded limb.
+define(`tnc', `r12')
+L(base):
+ push {r4, r6, r7, r8}
+ifdef(`OPERATION_lshift',`
+ ldr r4, [ap, #-4]!
+ rsb tnc, cnt, #32
+
+ mov r7, r4, lsl cnt
+ tst n, #1
+ beq L(ev) C n even
+
+L(od): subs n, n, #2
+ bcc L(ed1) C n = 1
+ ldr r8, [ap, #-4]!
+ b L(md) C n = 3
+
+L(ev): ldr r6, [ap, #-4]!
+ subs n, n, #2
+ beq L(ed) C n = 2
+ C n = 4
+C L(tp) and L(md) are straight-line code, not a loop: n is at most 4 here.
+L(tp): ldr r8, [ap, #-4]!
+ orr r7, r7, r6, lsr tnc
+ str r7, [rp, #-4]!
+ mov r7, r6, lsl cnt
+L(md): ldr r6, [ap, #-4]!
+ orr r7, r7, r8, lsr tnc
+ str r7, [rp, #-4]!
+ mov r7, r8, lsl cnt
+
+L(ed): orr r7, r7, r6, lsr tnc
+ str r7, [rp, #-4]!
+ mov r7, r6, lsl cnt
+L(ed1): str r7, [rp, #-4]
+ mov r0, r4, lsr tnc C return the bits shifted out of the first limb loaded
+')
+ifdef(`OPERATION_rshift',`
+ ldr r4, [ap]
+ rsb tnc, cnt, #32
+
+ mov r7, r4, lsr cnt
+ tst n, #1
+ beq L(ev) C n even
+
+L(od): subs n, n, #2
+ bcc L(ed1) C n = 1
+ ldr r8, [ap, #4]!
+ b L(md) C n = 3
+
+L(ev): ldr r6, [ap, #4]!
+ subs n, n, #2
+ beq L(ed) C n = 2
+ C n = 4
+
+C L(tp) and L(md) are straight-line code, not a loop: n is at most 4 here.
+L(tp): ldr r8, [ap, #4]!
+ orr r7, r7, r6, lsl tnc
+ str r7, [rp], #4
+ mov r7, r6, lsr cnt
+L(md): ldr r6, [ap, #4]!
+ orr r7, r7, r8, lsl tnc
+ str r7, [rp], #4
+ mov r7, r8, lsr cnt
+
+L(ed): orr r7, r7, r6, lsl tnc
+ str r7, [rp], #4
+ mov r7, r6, lsr cnt
+L(ed1): str r7, [rp], #4
+ mov r0, r4, lsl tnc C return the bits shifted out of the first limb loaded
+')
+ pop {r4, r6, r7, r8}
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/neon/lshiftc.asm b/gmp/mpn/arm/neon/lshiftc.asm
new file mode 100644
index 0000000000..9e4096256d
--- /dev/null
+++ b/gmp/mpn/arm/neon/lshiftc.asm
@@ -0,0 +1,257 @@
+dnl ARM Neon mpn_lshiftc.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C StrongARM - -
+C XScale - -
+C Cortex-A7 ? ?
+C Cortex-A8 ? ?
+C Cortex-A9 3.5 3.5 Y
+C Cortex-A15 1.75 1.75 Y
+
+
+C We read 64 bits at a time at 32-bit aligned addresses, and except for the
+C first and last store, we write using 64-bit aligned addresses. All shifting
+C is done on 64-bit words in 'extension' registers.
+C
+C It should be possible to read also using 64-bit alignment, by manipulating
+C the shift count for unaligned operands. Not done, since it does not seem to
+C matter for A9 or A15.
+C
+C This will not work in big-endian mode.
+
+C TODO
+C * Try using 128-bit operations. Note that Neon lacks pure 128-bit shifts,
+C which might make it tricky.
+C * Clean up and simplify.
+C * Consider sharing most of the code for lshift and rshift, since the feed-in
+C code, the loop, and most of the wind-down code are identical.
+C * Replace the basecase code with code using 'extension' registers.
+C * Optimise. It is not clear that this loop insn permutation is optimal for
+C either A9 or A15.
+
+C INPUT PARAMETERS
+C rp: destination limb pointer, ap: source limb pointer, n: limb count,
+C cnt: shift count in bits.
+define(`rp', `r0')
+define(`ap', `r1')
+define(`n', `r2')
+define(`cnt', `r3')
+
+C Fixed left-shift configuration: IFLSH(x) emits x, IFRSH(x) emits nothing.
+C X is the lane of d18 moved to r0 as the return value; Y is the lane used
+C for single-limb vector loads and stores. OPERATION_lshiftc is defined here
+C so that the ifdef-guarded lshift setup code below is selected.
+ define(`IFLSH', `$1')
+ define(`IFRSH', `')
+ define(`X',`0')
+ define(`Y',`1')
+ define(`func',`mpn_lshiftc')
+define(`OPERATION_lshiftc',1)
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+C mpn_lshiftc: like a left shift, but every source limb is complemented
+C (vmvn / mvn) before it is shifted, and the vacated low bits are filled with
+C ones. In: rp (r0) destination, ap (r1) source, n (r2) limb count, cnt (r3)
+C shift count. Out: r0 = bits shifted out, taken from the uncomplemented
+C source. n <= 4 is handled by the scalar code at L(base).
+PROLOGUE(mpn_lshiftc)
+IFLSH(` mov r12, n, lsl #2 ') C r12 = operand size in bytes
+IFLSH(` add rp, rp, r12 ') C point rp just past the result area
+IFLSH(` add ap, ap, r12 ') C point ap just past the source
+
+ cmp n, #4 C SIMD code n limit
+ ble L(base)
+
+ifdef(`OPERATION_lshiftc',`
+ vdup.32 d6, r3 C left shift count is positive
+ sub r3, r3, #64 C right shift count is negative
+ vdup.32 d7, r3
+ mov r12, #-8') C lshift pointer update offset
+ifdef(`OPERATION_rshift',`
+ rsb r3, r3, #0 C right shift count is negative
+ vdup.32 d6, r3
+ add r3, r3, #64 C left shift count is positive
+ vdup.32 d7, r3
+ mov r12, #8') C rshift pointer update offset
+
+C NOTE(review): OPERATION_rshift is not defined anywhere in this file, so the
+C second ifdef block above presumably never expands here -- confirm.
+IFLSH(` sub ap, ap, #8 ')
+ vld1.32 {d19}, [ap], r12 C load initial 2 limbs
+ vshl.u64 d18, d19, d7 C retval
+
+ tst rp, #4 C is rp 64-bit aligned already?
+ beq L(rp_aligned) C yes, skip
+ vmvn d19, d19 C complement only after retval has been taken
+IFLSH(` add ap, ap, #4 ') C move back ap pointer
+IFRSH(` sub ap, ap, #4 ') C move back ap pointer
+ vshl.u64 d4, d19, d6
+ sub n, n, #1 C first limb handled
+IFLSH(` sub rp, rp, #4 ')
+ vst1.32 {d4[Y]}, [rp]IFRSH(!) C store first limb, rp gets aligned
+ vld1.32 {d19}, [ap], r12 C load ap[1] and ap[2]
+
+L(rp_aligned):
+IFLSH(` sub rp, rp, #8 ')
+ subs n, n, #6
+ vmvn d19, d19 C complement the pending source word
+ blt L(two_or_three_more) C too few limbs left for the loop
+ tst n, #2 C pick the loop entry point
+ beq L(2)
+
+L(1): vld1.32 {d17}, [ap], r12
+ vshl.u64 d5, d19, d6
+ vmvn d17, d17
+ vld1.32 {d16}, [ap], r12
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ sub n, n, #2
+ b L(mid)
+
+L(2): vld1.32 {d16}, [ap], r12
+ vshl.u64 d4, d19, d6
+ vmvn d16, d16
+ vld1.32 {d17}, [ap], r12
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ subs n, n, #4
+ blt L(end)
+
+C Main loop: two 64-bit words (4 limbs) are loaded, complemented and stored
+C per iteration.
+L(top): vmvn d17, d17
+ vld1.32 {d16}, [ap], r12
+ vorr d2, d4, d1
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ vst1.32 {d2}, [rp:64], r12
+L(mid): vmvn d16, d16
+ vld1.32 {d17}, [ap], r12
+ vorr d3, d5, d0
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ vst1.32 {d3}, [rp:64], r12
+ subs n, n, #4
+ bge L(top)
+
+L(end): tst n, #1 C odd or even number of limbs left?
+ beq L(evn)
+
+ vorr d2, d4, d1
+ vst1.32 {d2}, [rp:64], r12
+ b L(cj1)
+
+L(evn): vmvn d17, d17
+ vorr d2, d4, d1
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ vst1.32 {d2}, [rp:64], r12
+ vmvn.u8 d17, #0 C d17 = all ones, stands in for the word below the operand
+ vorr d2, d5, d0
+ vshl.u64 d0, d17, d7 C ones shifted into the vacated low bits
+ vorr d3, d4, d0
+ b L(cj2)
+
+C Load last 2 - 3 limbs, store last 4 - 5 limbs
+L(two_or_three_more):
+ tst n, #1
+ beq L(l2)
+
+L(l3): vshl.u64 d5, d19, d6
+ vld1.32 {d17}, [ap], r12
+L(cj1): vmov.u8 d16, #0 C d16 = 0; the unloaded lane becomes all ones after vmvn
+IFLSH(` add ap, ap, #4 ')
+ vmvn d17, d17
+ vld1.32 {d16[Y]}, [ap], r12 C load the single remaining limb
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ vmvn d16, d16
+ vorr d3, d5, d0
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ vst1.32 {d3}, [rp:64], r12
+ vorr d2, d4, d1
+ vst1.32 {d2}, [rp:64], r12
+IFLSH(` add rp, rp, #4 ')
+ vst1.32 {d5[Y]}, [rp] C store the final single limb
+ vmov.32 r0, d18[X] C return value computed at function entry
+ bx lr
+
+L(l2): vld1.32 {d16}, [ap], r12
+ vshl.u64 d4, d19, d6
+ vmvn d16, d16
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ vmvn.u8 d17, #0 C d17 = all ones, stands in for the word below the operand
+ vorr d2, d4, d1
+ vshl.u64 d0, d17, d7 C ones shifted into the vacated low bits
+ vorr d3, d5, d0
+L(cj2): vst1.32 {d2}, [rp:64], r12
+ vst1.32 {d3}, [rp]
+ vmov.32 r0, d18[X] C return value computed at function entry
+ bx lr
+
+
+C Scalar code for n <= 4. tnc = 32 - cnt is the complementary shift count.
+C r4 keeps the uncomplemented first limb for the return value; r7 accumulates
+C the limb being built; r6 and r8 hold complemented source limbs.
+define(`tnc', `r12')
+L(base):
+ push {r4, r6, r7, r8}
+ ldr r4, [ap, #-4]!
+ rsb tnc, cnt, #32
+ mvn r6, r4
+
+ mov r7, r6, lsl cnt
+ tst n, #1
+ beq L(ev) C n even
+
+L(od): subs n, n, #2
+ bcc L(ed1) C n = 1
+ ldr r8, [ap, #-4]!
+ mvn r8, r8
+ b L(md) C n = 3
+
+L(ev): ldr r6, [ap, #-4]!
+ mvn r6, r6
+ subs n, n, #2
+ beq L(ed) C n = 2
+ C n = 4
+C L(tp) and L(md) are straight-line code, not a loop: n is at most 4 here.
+L(tp): ldr r8, [ap, #-4]!
+ orr r7, r7, r6, lsr tnc
+ str r7, [rp, #-4]!
+ mvn r8, r8
+ mov r7, r6, lsl cnt
+L(md): ldr r6, [ap, #-4]!
+ orr r7, r7, r8, lsr tnc
+ str r7, [rp, #-4]!
+ mvn r6, r6
+ mov r7, r8, lsl cnt
+
+L(ed): orr r7, r7, r6, lsr tnc
+ str r7, [rp, #-4]!
+ mov r7, r6, lsl cnt
+L(ed1): mvn r6, #0 C r6 = all ones
+ orr r7, r7, r6, lsr tnc C set the vacated low bits of the last limb
+ str r7, [rp, #-4]
+ mov r0, r4, lsr tnc C return bits shifted out of the uncomplemented limb
+ pop {r4, r6, r7, r8}
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/neon/popcount.asm b/gmp/mpn/arm/neon/popcount.asm
new file mode 100644
index 0000000000..2f8f9afc8d
--- /dev/null
+++ b/gmp/mpn/arm/neon/popcount.asm
@@ -0,0 +1,166 @@
+dnl ARM Neon mpn_popcount -- mpn bit population count.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM: -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 1.125
+C Cortex-A15 0.56
+
+C TODO
+C * Explore using vldr and vldm. Does it help on A9? (These loads do
+C 64-bits-at-a-time, which will mess up in big-endian mode. Except not for
+C popcount. Except perhaps also for popcount for the edge loads.)
+C * Arrange to align the pointer, if that helps performance. Use the same
+C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry
+C valgrind!)
+C * Explore if explicit align directives, e.g., "[ptr:128]" help.
+C * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
+
+C INPUT PARAMETERS
+C ap: pointer to the limbs to count, n: number of limbs.
+define(`ap', r0)
+define(`n', r1)
+
+C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end
+C up with 8 16-bit counters. Therefore, we can sum to 8(2^16-1) bits, or
+C (8*2^16-1)/32 = 0x3fff limbs. We use a chunksize close to that, but which
+C can be represented as an 8-bit ARM constant.
+C
+define(`chunksize',0x3f80)
+
+ASM_START()
+C mpn_popcount(ap, n): return in r0 the number of set bits in the n limbs at
+C ap. Operands of more than chunksize limbs are split at L(gt16k), which calls
+C the code at L(lt16k) once per chunk and adds up the partial results.
+PROLOGUE(mpn_popcount)
+
+ cmp n, #chunksize
+ bhi L(gt16k)
+
+C Count at most chunksize limbs. Also entered with bl from L(gt16k); returns
+C with bx lr, result in r0. Uses r0, r1 and Neon registers only.
+L(lt16k):
+ vmov.i64 q8, #0 C clear summation register
+ vmov.i64 q9, #0 C clear summation register
+
+C Peel off 1, 2, 4 and 8 limbs according to the low bits of n, so that the
+C main loop only sees multiples of 16 limbs.
+ tst n, #1
+ beq L(xxx0)
+ vmov.i64 d0, #0
+ sub n, n, #1
+ vld1.32 {d0[0]}, [ap]! C load 1 limb
+ vcnt.8 d24, d0
+ vpadal.u8 d16, d24 C d16/q8 = 0; could just splat
+
+L(xxx0):tst n, #2
+ beq L(xx00)
+ sub n, n, #2
+ vld1.32 {d0}, [ap]! C load 2 limbs
+ vcnt.8 d24, d0
+ vpadal.u8 d16, d24
+
+L(xx00):tst n, #4
+ beq L(x000)
+ sub n, n, #4
+ vld1.32 {q0}, [ap]! C load 4 limbs
+ vcnt.8 q12, q0
+ vpadal.u8 q8, q12
+
+L(x000):tst n, #8
+ beq L(0000)
+
+ subs n, n, #8
+ vld1.32 {q0,q1}, [ap]! C load 8 limbs
+ bls L(sum) C those were the last 8 limbs
+
+L(gt8): vld1.32 {q2,q3}, [ap]! C load 8 limbs
+ sub n, n, #8
+ vcnt.8 q12, q0
+ vcnt.8 q13, q1
+ b L(mid)
+
+L(0000):subs n, n, #16
+ blo L(e0) C nothing left; q9 is still zero, so only q8 is summed
+
+ vld1.32 {q2,q3}, [ap]! C load 8 limbs
+ vld1.32 {q0,q1}, [ap]! C load 8 limbs
+ vcnt.8 q12, q2
+ vcnt.8 q13, q3
+ subs n, n, #16
+ blo L(end)
+
+C Main loop, 16 limbs per iteration. Byte counts from vcnt are accumulated
+C pairwise into the 16-bit counters in q8 and q9.
+L(top): vld1.32 {q2,q3}, [ap]! C load 8 limbs
+ vpadal.u8 q8, q12
+ vcnt.8 q12, q0
+ vpadal.u8 q9, q13
+ vcnt.8 q13, q1
+L(mid): vld1.32 {q0,q1}, [ap]! C load 8 limbs
+ subs n, n, #16
+ vpadal.u8 q8, q12
+ vcnt.8 q12, q2
+ vpadal.u8 q9, q13
+ vcnt.8 q13, q3
+ bhs L(top)
+
+L(end): vpadal.u8 q8, q12
+ vpadal.u8 q9, q13
+L(sum): vcnt.8 q12, q0
+ vcnt.8 q13, q1
+ vpadal.u8 q8, q12
+ vpadal.u8 q9, q13
+ vadd.i16 q8, q8, q9
+ C we have 8 16-bit counts
+L(e0): vpaddl.u16 q8, q8 C we have 4 32-bit counts
+ vpaddl.u32 q8, q8 C we have 2 64-bit counts
+ vmov.32 r0, d16[0]
+ vmov.32 r1, d17[0]
+ add r0, r0, r1 C r0 = sum of the two remaining counts
+ bx lr
+
+C Code for large count. Splits operand and calls above code.
+C r3 holds the limbs still to be counted, r4 the running total, and ap2 the
+C address of the current chunk.
+define(`ap2', r2) C caller-saves reg not used above
+L(gt16k):
+ push {r4,r14}
+ mov ap2, ap
+ mov r3, n C full count
+ mov r4, #0 C total sum
+
+1: mov n, #chunksize C count for this invocation
+ bl L(lt16k) C could jump deep inside code
+ add ap2, ap2, #chunksize*4 C point at next chunk
+ add r4, r4, r0
+ mov ap, ap2 C put chunk pointer in place for call
+ sub r3, r3, #chunksize
+ cmp r3, #chunksize
+ bhi 1b
+
+ mov n, r3 C count for final invocation
+ bl L(lt16k)
+ add r0, r4, r0
+ pop {r4,pc}
+EPILOGUE()
diff --git a/gmp/mpn/arm/neon/sec_tabselect.asm b/gmp/mpn/arm/neon/sec_tabselect.asm
new file mode 100644
index 0000000000..69fceb0063
--- /dev/null
+++ b/gmp/mpn/arm/neon/sec_tabselect.asm
@@ -0,0 +1,140 @@
+dnl ARM Neon mpn_sec_tabselect.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C StrongARM -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 1.15
+C Cortex-A15 0.65
+
+C rp: destination, tp: table base, n: limbs per table entry, nents: number of
+C table entries.
+define(`rp', `r0')
+define(`tp', `r1')
+define(`n', `r2')
+define(`nents', `r3')
+C define(`which', on stack)
+
+C i counts table entries in the inner loops; j counts limbs in the outer loop.
+define(`i', `r4')
+define(`j', `r5')
+
+C Selection mask, as a 128-bit register and as its low 64-bit half.
+define(`maskq', `q10')
+define(`maskd', `d20')
+
+ASM_START()
+C mpn_sec_tabselect(rp, tp, n, nents, which): copy the n-limb table entry with
+C index which to rp. The table at tp holds nents entries, n limbs apart.
+C Every entry is loaded; an entry is merged into the result only under a mask
+C that is all ones when the running index (q13/d26) equals which (q15/d30).
+C The result is produced in column slices of 8, 4, 2 and 1 limbs.
+PROLOGUE(mpn_sec_tabselect)
+ push {r4-r5}
+
+ add r4, sp, #8 C address of the stack argument, above the 2 pushed regs
+ vld1.32 {d30[], d31[]}, [r4] C 4 `which' copies
+ vmov.i32 q14, #1 C 4 copies of 1
+
+ subs j, n, #8
+ bmi L(outer_end) C fewer than 8 limbs per entry
+
+C One pass per slice of 8 limbs.
+L(outer_top):
+ mov i, nents
+ mov r12, tp C preserve tp
+ veor q13, q13, q13 C 4 counter copies
+ veor q2, q2, q2 C clear result accumulators
+ veor q3, q3, q3
+ ALIGN(16)
+L(top): vceq.i32 maskq, q13, q15 C compare idx copies to `which' copies
+ vld1.32 {q0,q1}, [tp]
+ vadd.i32 q13, q13, q14
+ vbit q2, q0, maskq
+ vbit q3, q1, maskq
+ add tp, tp, n, lsl #2 C step to the same slice of the next entry
+ subs i, i, #1
+ bne L(top)
+ vst1.32 {q2,q3}, [rp]!
+ add tp, r12, #32 C restore tp, point to next slice
+ subs j, j, #8
+ bpl L(outer_top)
+L(outer_end):
+
+C Remaining n mod 8 limbs: a 4-limb, a 2-limb and a 1-limb slice as needed.
+ tst n, #4
+ beq L(b0xx)
+L(b1xx):mov i, nents
+ mov r12, tp
+ veor q13, q13, q13
+ veor q2, q2, q2
+ ALIGN(16)
+L(tp4): vceq.i32 maskq, q13, q15
+ vld1.32 {q0}, [tp]
+ vadd.i32 q13, q13, q14
+ vbit q2, q0, maskq
+ add tp, tp, n, lsl #2
+ subs i, i, #1
+ bne L(tp4)
+ vst1.32 {q2}, [rp]!
+ add tp, r12, #16
+
+L(b0xx):tst n, #2
+ beq L(b00x)
+L(b01x):mov i, nents
+ mov r12, tp
+ veor d26, d26, d26
+ veor d4, d4, d4
+ ALIGN(16)
+L(tp2): vceq.i32 maskd, d26, d30
+ vld1.32 {d0}, [tp]
+ vadd.i32 d26, d26, d28
+ vbit d4, d0, maskd
+ add tp, tp, n, lsl #2
+ subs i, i, #1
+ bne L(tp2)
+ vst1.32 {d4}, [rp]!
+ add tp, r12, #8
+
+L(b00x):tst n, #1
+ beq L(b000)
+L(b001):mov i, nents
+ mov r12, tp
+ veor d26, d26, d26
+ veor d4, d4, d4
+ ALIGN(16)
+L(tp1): vceq.i32 maskd, d26, d30
+ vld1.32 {d0[0]}, [tp]
+ vadd.i32 d26, d26, d28
+ vbit d4, d0, maskd
+ add tp, tp, n, lsl #2
+ subs i, i, #1
+ bne L(tp1)
+ vst1.32 {d4[0]}, [rp]
+
+L(b000):pop {r4-r5}
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/rsh1aors_n.asm b/gmp/mpn/arm/rsh1aors_n.asm
new file mode 100644
index 0000000000..95c1f79ad9
--- /dev/null
+++ b/gmp/mpn/arm/rsh1aors_n.asm
@@ -0,0 +1,124 @@
+dnl ARM mpn_rsh1add_n and mpn_rsh1sub_n.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM ?
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 3.64-3.7
+C Cortex-A15 2.5
+
+C TODO
+C * Not optimised.
+
+C rp: destination, up and vp: the two source operands, n: limb count.
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n', `r3')
+
+C Per-operation configuration, selected by the OPERATION_* symbol:
+C ADDSUB is the flag-setting add or subtract for the first limb.
+C ADDSUBC is the same operation with carry, for the following limbs.
+C RSTCY(reg, tmp) sets the carry flag from bit 31 of reg, where the code
+C below keeps the carry of the last ADDSUB/ADDSUBC. The add form uses
+C that bit directly; the sub form sets the carry to its inverse and
+C overwrites tmp.
+C func is the name of the function being defined. func_nc is defined as
+C well but is not referenced by the code in this file.
+ifdef(`OPERATION_rsh1add_n', `
+ define(`ADDSUB', adds)
+ define(`ADDSUBC', adcs)
+ define(`RSTCY', `cmn $1, $1')
+ define(`func', mpn_rsh1add_n)
+ define(`func_nc', mpn_rsh1add_nc)')
+ifdef(`OPERATION_rsh1sub_n', `
+ define(`ADDSUB', subs)
+ define(`ADDSUBC', sbcs)
+ define(`RSTCY',
+ `mvn $2, #0x80000000
+ cmp $2, $1')
+ define(`func', mpn_rsh1sub_n)
+ define(`func_nc', mpn_rsh1sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
+
+ASM_START()
+C func is mpn_rsh1add_n or mpn_rsh1sub_n. It adds (or subtracts) the n-limb
+C operands at up and vp, shifts the result right by one bit and stores n
+C limbs at rp. The bit shifted out (the low bit of the first result limb) is
+C returned in r0.
+C r4 always holds the newest unshifted result limb, which is stored only once
+C the low bit of the next limb is available. Bit 31 of r12 holds the carry
+C of the add/subtract chain between groups.
+PROLOGUE(func)
+ push {r4-r11}
+ ldr r4, [up], #4
+ ldr r8, [vp], #4
+ ADDSUB r4, r4, r8
+ movs r12, r7, rrx C save carry in bit 31 of r12; r7 is not loaded yet, but only bit 31 of r12 is used later
+ and r11, r4, #1 C return value
+ subs n, n, #4
+ blo L(end)
+
+C Main loop, 3 limbs per iteration.
+L(top): ldmia up!, {r5,r6,r7}
+ ldmia vp!, {r8,r9,r10}
+ cmn r12, r12 C restore carry from bit 31 of r12
+ ADDSUBC r5, r5, r8
+ ADDSUBC r6, r6, r9
+ ADDSUBC r7, r7, r10
+ movs r12, r7, rrx C save carry; carry flag becomes low bit of r7
+ movs r6, r6, rrx C shift right, taking the low bit of the next limb
+ movs r5, r5, rrx
+ movs r4, r4, rrx
+ subs n, n, #3
+ stmia rp!, {r4,r5,r6}
+ mov r4, r7 C unshifted top limb is carried to the next round
+ bhs L(top)
+
+C Here n is -3, -2 or -1, meaning 0, 1 or 2 limbs remain to be processed.
+L(end): cmn n, #2
+ bls L(e2) C 0 or 1 limbs remain
+ ldm up, {r5,r6}
+ ldm vp, {r8,r9}
+ cmn r12, r12 C restore carry from bit 31 of r12
+ ADDSUBC r5, r5, r8
+ ADDSUBC r6, r6, r9
+ movs r12, r6, rrx
+ movs r5, r5, rrx
+ movs r4, r4, rrx
+ stmia rp!, {r4,r5}
+ mov r4, r6
+ b L(e1)
+
+L(e2): bne L(e1) C flags still from the cmn at L(end): no limbs remain
+ ldr r5, [up, #0]
+ ldr r8, [vp, #0]
+ cmn r12, r12 C restore carry from bit 31 of r12
+ ADDSUBC r5, r5, r8
+ movs r12, r5, rrx
+ movs r4, r4, rrx
+ str r4, [rp], #4
+ mov r4, r5
+
+C Store the last limb; its top bit comes from the final carry via RSTCY.
+L(e1): RSTCY( r12, r1)
+ mov r4, r4, rrx
+ str r4, [rp, #0]
+ mov r0, r11
+ pop {r4-r11}
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/rshift.asm b/gmp/mpn/arm/rshift.asm
new file mode 100644
index 0000000000..84728d038a
--- /dev/null
+++ b/gmp/mpn/arm/rshift.asm
@@ -0,0 +1,86 @@
+dnl ARM mpn_rshift.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 1997, 2000, 2001, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM ?
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 3.5
+C Cortex-A15 ?
+
+C rp: destination, up: source, n: limb count, cnt: shift count in bits.
+C tnc is set to 32 - cnt by the code below.
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+define(`cnt', `r3')
+define(`tnc', `r12')
+
+ASM_START()
+C mpn_rshift(rp, up, n, cnt): shift the n limbs at up right by cnt bits and
+C store the n result limbs at rp. Returns in r0 the bits shifted out of the
+C lowest limb, positioned at the top of the word (up[0] << (32 - cnt)).
+C r7 accumulates the limb being built; r6 and r8 alternate as the most
+C recently loaded source limb; r4 keeps up[0] for the return value.
+PROLOGUE(mpn_rshift)
+ push {r4, r6, r7, r8}
+ ldr r4, [up]
+ rsb tnc, cnt, #32 C tnc = 32 - cnt
+
+ mov r7, r4, lsr cnt
+ tst n, #1
+ beq L(evn) C n even
+
+L(odd): subs n, n, #2
+ bcc L(1) C n = 1
+ ldr r8, [up, #4]!
+ b L(mid)
+
+L(evn): ldr r6, [up, #4]!
+ subs n, n, #2
+ beq L(end) C n = 2
+
+C Main loop, 2 limbs per iteration.
+L(top): ldr r8, [up, #4]!
+ orr r7, r7, r6, lsl tnc
+ str r7, [rp], #4
+ mov r7, r6, lsr cnt
+L(mid): ldr r6, [up, #4]!
+ orr r7, r7, r8, lsl tnc
+ str r7, [rp], #4
+ mov r7, r8, lsr cnt
+ subs n, n, #2
+ bgt L(top)
+
+L(end): orr r7, r7, r6, lsl tnc
+ str r7, [rp], #4
+ mov r7, r6, lsr cnt
+L(1): str r7, [rp] C top limb: vacated high bits are zero
+ mov r0, r4, lsl tnc C return the bits shifted out of up[0]
+ pop {r4, r6, r7, r8}
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/sec_tabselect.asm b/gmp/mpn/arm/sec_tabselect.asm
new file mode 100644
index 0000000000..8cf937a091
--- /dev/null
+++ b/gmp/mpn/arm/sec_tabselect.asm
@@ -0,0 +1,131 @@
+dnl ARM mpn_sec_tabselect
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM ?
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 2.33
+C Cortex-A15 2.2
+
+C TODO
+C * Consider using special code for small nents, either swapping the inner and
+C outer loops, or providing a few variants that completely unroll the inner
+C loops.
+
+C rp: destination, tp: table base, n: limbs per table entry, nents: number of
+C table entries.
+define(`rp', `r0')
+define(`tp', `r1')
+define(`n', `r2')
+define(`nents', `r3')
+C which on stack
+
+C i counts table entries in the inner loops; j counts limbs in the outer loop.
+C c is a countdown initialised from which; mask is the selection mask derived
+C from the borrow of that countdown.
+define(`i', `r11')
+define(`j', `r12')
+define(`c', `r14')
+define(`mask', `r7')
+
+ASM_START()
+C mpn_sec_tabselect(rp, tp, n, nents, which): copy the n-limb table entry with
+C index which to rp. The table at tp holds nents entries, n limbs apart.
+C Every entry is loaded; each one is ANDed with a mask and ORed into the
+C result. The mask is all ones only for the entry whose index equals which:
+C c starts at which and is decremented once per entry, and the subtraction
+C borrows exactly when c was 0, which sbc turns into mask = -1.
+C The result is produced in column slices of 3 limbs, followed by a final
+C slice of 2 or 1 limbs when n is not a multiple of 3.
+C The inner loops run exactly nents times, so no memory beyond the last table
+C entry is read.
+PROLOGUE(mpn_sec_tabselect)
+ push {r4-r11, r14}
+
+ subs j, n, #3
+ bmi L(outer_end) C fewer than 3 limbs per entry
+L(outer_top):
+ ldr c, [sp, #36] C which: stack argument above the 9 pushed registers
+ mov i, nents
+ push {tp} C preserve tp across the inner loop
+
+ mov r8, #0 C clear result accumulators
+ mov r9, #0
+ mov r10, #0
+
+L(top): subs c, c, #1 C borrow (carry clear) only when c was 0
+ ldm tp, {r4,r5,r6}
+ sbc mask, mask, mask C mask = -1 on borrow, else 0
+ subs i, i, #1 C sets the flags tested by bgt below
+ add tp, tp, n, lsl #2 C step to the same slice of the next entry
+ and r4, r4, mask
+ and r5, r5, mask
+ and r6, r6, mask
+ orr r8, r8, r4
+ orr r9, r9, r5
+ orr r10, r10, r6
+ bgt L(top) C loop while entries remain (bge would read one entry too many)
+
+ stmia rp!, {r8,r9,r10}
+ pop {tp}
+ add tp, tp, #12 C point to next slice
+ subs j, j, #3
+ bpl L(outer_top)
+L(outer_end):
+
+C Here j is -1, -2 or -3, meaning 2, 1 or 0 limbs per entry remain.
+ cmp j, #-1
+ bne L(n2)
+
+ ldr c, [sp, #36]
+ mov i, nents
+ mov r8, #0
+ mov r9, #0
+L(tp2): subs c, c, #1
+ sbc mask, mask, mask
+ ldm tp, {r4,r5}
+ subs i, i, #1
+ add tp, tp, n, lsl #2
+ and r4, r4, mask
+ and r5, r5, mask
+ orr r8, r8, r4
+ orr r9, r9, r5
+ bgt L(tp2) C loop while entries remain
+ stmia rp, {r8,r9}
+ pop {r4-r11, r14}
+ bx lr
+
+L(n2): cmp j, #-2
+ bne L(n1)
+
+ ldr c, [sp, #36]
+ mov i, nents
+ mov r8, #0
+L(tp1): subs c, c, #1
+ sbc mask, mask, mask
+ ldr r4, [tp]
+ subs i, i, #1
+ add tp, tp, n, lsl #2
+ and r4, r4, mask
+ orr r8, r8, r4
+ bgt L(tp1) C loop while entries remain
+ str r8, [rp]
+L(n1): pop {r4-r11, r14}
+ bx lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/udiv.asm b/gmp/mpn/arm/udiv.asm
new file mode 100644
index 0000000000..8d441c74ed
--- /dev/null
+++ b/gmp/mpn/arm/udiv.asm
@@ -0,0 +1,104 @@
+dnl ARM mpn_udiv_qrnnd -- divide a two limb dividend and a one limb divisor.
+dnl Return quotient and store remainder through a supplied pointer.
+
+dnl Copyright 2001, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr: where the remainder is stored, n1:n0: the two-limb dividend (high
+C and low limb), d: the divisor.
+define(`rem_ptr',`r0')
+define(`n1',`r1')
+define(`n0',`r2')
+define(`d',`r3')
+
+C divstep -- develop one quotient bit. Dividend in $1$2, divisor in $3.
+C Quotient bit is shifted into $2.
+C The carry flag links consecutive steps: on entry it holds the quotient bit
+C of the previous step, which adcs shifts into the bottom of $2, and on exit
+C it holds the bit developed by this step (the result of the cmp).
+define(`divstep',
+ `adcs $2, $2, $2
+ adc $1, $1, $1
+ cmp $1, $3
+ subcs $1, $1, $3')
+
+ASM_START()
+C mpn_udiv_qrnnd(rem_ptr, n1, n0, d): divide the two-limb value n1:n0 by d.
+C The quotient is returned in r0 and the remainder is stored at rem_ptr.
+C The quotient is developed one bit per divstep, 32 steps in total (8 loop
+C iterations of 4 steps). Divisors with the top bit set take the path at
+C L(_large_divisor), which divides the halved dividend by the halved divisor
+C and then corrects the result.
+PROLOGUE(mpn_udiv_qrnnd)
+ mov r12, #8 C loop counter for both loops below
+ cmp d, #0x80000000 C check divisor msb and clear carry
+ bcs L(_large_divisor)
+
+L(oop): divstep(n1,n0,d)
+ divstep(n1,n0,d)
+ divstep(n1,n0,d)
+ divstep(n1,n0,d)
+ sub r12, r12, #1 C counter update must leave the divstep carry alone
+ teq r12, #0
+ bne L(oop)
+
+ str n1, [rem_ptr] C store remainder
+ adc r0, n0, n0 C quotient: add last carry from divstep
+ bx lr
+
+L(_large_divisor):
+ stmfd sp!, { r8, lr }
+
+ and r8, n0, #1 C save lsb of dividend
+ mov lr, n1, lsl #31
+ orrs n0, lr, n0, lsr #1 C n0 = lo(n1n0 >> 1)
+ mov n1, n1, lsr #1 C n1 = hi(n1n0 >> 1)
+
+ and lr, d, #1 C save lsb of divisor
+ movs d, d, lsr #1 C d = floor(orig_d / 2)
+ adc d, d, #0 C d = ceil(orig_d / 2)
+
+L(oop2):
+ divstep(n1,n0,d)
+ divstep(n1,n0,d)
+ divstep(n1,n0,d)
+ divstep(n1,n0,d)
+ sub r12, r12, #1 C counter update must leave the divstep carry alone
+ teq r12, #0
+ bne L(oop2)
+
+ adc n0, n0, n0 C shift and add last carry from divstep
+ add n1, r8, n1, lsl #1 C shift in omitted dividend lsb
+ tst lr, lr C test saved divisor lsb
+ beq L(_even_divisor)
+
+ rsb d, lr, d, lsl #1 C restore orig d value
+ adds n1, n1, n0 C fix remainder for omitted divisor lsb
+ addcs n0, n0, #1 C adjust quotient if rem. fix carried
+ subcs n1, n1, d C adjust remainder accordingly
+ cmp n1, d C remainder >= divisor?
+ subcs n1, n1, d C adjust remainder
+ addcs n0, n0, #1 C adjust quotient
+
+L(_even_divisor):
+ str n1, [rem_ptr] C store remainder
+ mov r0, n0 C quotient
+ ldmfd sp!, { r8, pc }
+EPILOGUE(mpn_udiv_qrnnd)
diff --git a/gmp/mpn/arm/v5/gcd_1.asm b/gmp/mpn/arm/v5/gcd_1.asm
new file mode 100644
index 0000000000..169d154bf0
--- /dev/null
+++ b/gmp/mpn/arm/v5/gcd_1.asm
@@ -0,0 +1,120 @@
+dnl ARM v5 mpn_gcd_1.
+
+dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for ARM by Torbjörn
+dnl Granlund.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/bit (approx)
+C StrongARM -
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 5.9
+C Cortex-A15 ?
+C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1
+
+C TODO
+C * Optimise inner-loop better.
+
+C Threshold of when to call bmod when U is one limb. Should be about
+C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
+define(`BMOD_THRES_LOG2', 6)
+
+C INPUT PARAMETERS
+define(`up', `r0')
+define(`n', `r1')
+define(`v0', `r2')
+
+ifdef(`BMOD_1_TO_MOD_1_THRESHOLD',,
+ `define(`BMOD_1_TO_MOD_1_THRESHOLD',0xffffffff)')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_gcd_1)
+ push {r4, r5, r7, lr} C r5 is padding so sp stays 8-byte aligned at the calls
+ ldr r3, [up] C U low limb
+
+ orr r3, r3, v0
+ rsb r4, r3, #0
+ and r4, r4, r3 C isolate lowest set bit of u0|v0
+ clz r4, r4 C min(ctz(u0),ctz(v0))
+ rsb r4, r4, #31 C r4 = 31 - clz, the shared trailing zero count
+
+ rsb r12, v0, #0
+ and r12, r12, v0 C isolate lowest set bit of v0
+ clz r12, r12
+ rsb r12, r12, #31 C r12 = ctz(v0)
+ mov v0, v0, lsr r12 C strip trailing zeros from v0
+
+ mov r7, v0 C keep v0 in r7 across the calls below
+
+ cmp n, #1
+ bne L(nby1)
+
+C Both U and V are single limbs, reduce with bmod if u0 >> v0.
+ ldr r3, [up]
+ cmp v0, r3, lsr #BMOD_THRES_LOG2
+ bhi L(red1) C v0 above the shifted u0: skip the call, r3 = u0
+
+L(bmod):mov r3, #0 C carry argument
+ bl mpn_modexact_1c_odd
+ b L(red0)
+
+L(nby1):cmp n, #BMOD_1_TO_MOD_1_THRESHOLD
+ blo L(bmod)
+
+ bl mpn_mod_1
+
+L(red0):mov r3, r0 C r3 = value returned by the reduction call
+L(red1):rsbs r12, r3, #0 C sets Z when r3 is zero
+ and r12, r12, r3 C isolate lowest set bit of r3
+ clz r12, r12
+ rsb r12, r12, #31 C r12 = ctz(r3)
+ bne L(mid) C flags still from rsbs: r3 is nonzero
+ b L(end) C r3 is zero: the answer is v0
+
+ ALIGN(8)
+L(top): rsb r12, r12, #31
+ movcc r3, r1 C if x-y < 0
+ movcc r7, r0 C use x,y-x
+L(mid): mov r3, r3, lsr r12 C x = r3 with trailing zeros removed
+ mov r0, r3 C r0 = x
+ sub r1, r7, r3 C r1 = y - x
+ rsbs r3, r7, r3 C r3 = x - y, carry clear when x < y
+ and r12, r1, r3 C lowest set bit of the difference
+ clz r12, r12 C
+ bne L(top) C loop until x == y
+
+L(end): mov r0, r7, lsl r4 C shift the shared power of two back in
+ pop {r4, r5, r7, pc}
+EPILOGUE()
diff --git a/gmp/mpn/arm/v5/mod_1_1.asm b/gmp/mpn/arm/v5/mod_1_1.asm
new file mode 100644
index 0000000000..3cf0cd7763
--- /dev/null
+++ b/gmp/mpn/arm/v5/mod_1_1.asm
@@ -0,0 +1,129 @@
+dnl ARM mpn_mod_1_1p
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM -
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 7
+C Cortex-A15 6
+
+define(`ap', `r0')
+define(`n', `r1')
+define(`d', `r2')
+define(`cps',`r3')
+
+ASM_START()
+PROLOGUE(mpn_mod_1_1p)
+ push {r4-r10} C r4-r10 are used as scratch
+ add r0, r0, r1, asl #2 C point r0 just past the top limb
+ ldr r5, [r0, #-4]! C r5 = top limb
+ ldr r12, [r0, #-4]! C r12 = next lower limb
+ subs r1, r1, #2
+ ble L(4) C no limbs beyond these two
+ ldr r8, [r3, #12] C r8 = cps[3]
+ mov r4, r12
+ mov r10, r5
+ umull r7, r5, r10, r8 C r5:r7 = r10 * cps[3]
+ sub r1, r1, #1
+ b L(mid)
+
+L(top): adds r12, r6, r7
+ adcs r10, r4, r5
+ sub r1, r1, #1 C plain sub: carry from adcs survives
+ mov r6, #0
+ movcs r6, r8 C r6 = cps[3] if the addition carried, else 0
+ umull r7, r5, r10, r8 C r5:r7 = r10 * cps[3]
+ adds r4, r12, r6
+ subcs r4, r4, r2 C subtract d when that addition carried
+L(mid): ldr r6, [r0, #-4]! C fetch next lower limb
+ teq r1, #0
+ bne L(top)
+
+ adds r12, r6, r7
+ adcs r5, r4, r5
+ subcs r5, r5, r2 C subtract d on carry out
+L(4): ldr r1, [r3, #4] C r1 = cps[1], used as shift count below
+ cmp r1, #0
+ beq L(7) C zero shift count: skip the cps[2] step
+ ldr r4, [r3, #8] C r4 = cps[2]
+ umull r0, r6, r5, r4 C r6:r0 = r5 * cps[2]
+ adds r12, r0, r12
+ addcs r6, r6, #1 C propagate carry into the high limb
+ rsb r0, r1, #32
+ mov r0, r12, lsr r0
+ orr r5, r0, r6, asl r1 C r5 = high limb of (r6:r12) << cnt
+ mov r12, r12, asl r1 C r12 = low limb of (r6:r12) << cnt
+ b L(8)
+L(7): cmp r5, r2
+ subcs r5, r5, r2 C reduce r5 below d
+L(8): ldr r0, [r3, #0] C r0 = cps[0]
+ umull r4, r3, r5, r0 C r3:r4 = r5 * cps[0]
+ add r5, r5, #1
+ adds r0, r4, r12
+ adc r5, r3, r5 C r5 = quotient estimate
+ mul r5, r2, r5
+ sub r12, r12, r5 C r12 = candidate remainder
+ cmp r12, r0
+ addhi r12, r12, r2 C first conditional correction
+ cmp r2, r12
+ subls r12, r12, r2 C second correction when r12 >= d
+ mov r0, r12, lsr r1 C return the remainder shifted right by cnt
+ pop {r4-r10}
+ bx r14
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1_1p_cps)
+ stmfd sp!, {r4, r5, r6, r14}
+ mov r5, r0 C r5 = output table pointer, kept across the call
+ clz r4, r1 C r4 = cnt, leading zero count of the limb in r1
+ mov r0, r1, asl r4 C r0 = b, the limb shifted left by cnt
+ rsb r6, r0, #0 C r6 = -b
+ bl mpn_invert_limb
+ str r0, [r5, #0] C cps[0] = bi, the value returned by mpn_invert_limb
+ str r4, [r5, #4] C cps[1] = cnt
+ cmp r4, #0
+ beq L(2) C cnt == 0: cps[2] is left unwritten
+ rsb r1, r4, #32
+ mov r3, #1
+ mov r3, r3, asl r4
+ orr r3, r3, r0, lsr r1 C r3 = (1 << cnt) | (bi >> (32 - cnt))
+ mul r3, r6, r3 C times -b
+ mov r4, r3, lsr r4
+ str r4, [r5, #8] C cps[2] = that product >> cnt
+L(2): mul r0, r6, r0
+ str r0, [r5, #12] C cps[3] = low limb of -b * bi
+ ldmfd sp!, {r4, r5, r6, pc}
+EPILOGUE()
diff --git a/gmp/mpn/arm/v5/mod_1_2.asm b/gmp/mpn/arm/v5/mod_1_2.asm
new file mode 100644
index 0000000000..aa26ecb21c
--- /dev/null
+++ b/gmp/mpn/arm/v5/mod_1_2.asm
@@ -0,0 +1,156 @@
+dnl ARM mpn_mod_1s_2p
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM -
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 4.25
+C Cortex-A15 3
+
+define(`ap', `r0')
+define(`n', `r1')
+define(`d', `r2')
+define(`cps',`r3')
+
+ASM_START()
+PROLOGUE(mpn_mod_1s_2p)
+ push {r4-r10} C r4-r10 are used as scratch
+ tst n, #1 C Z set when n is even, consumed by beq L(evn)
+ add r7, r3, #8 C r7 = address of cps[2]
+ ldmia r7, {r7, r8, r12} C load B1, B2, B3
+ add ap, ap, n, lsl #2 C put ap at operand end
+ beq L(evn)
+
+L(odd): subs n, n, #1
+ beq L(1) C single limb operand
+ ldmdb ap!, {r4,r6,r9} C top three limbs, r4 lowest and r9 highest
+ mov r10, #0
+ umlal r4, r10, r6, r7 C r10:r4 += r6 * B1
+ umlal r4, r10, r9, r8 C r10:r4 += r9 * B2
+ b L(com)
+
+L(evn): ldmdb ap!, {r4,r10} C top two limbs, r4 low and r10 high
+L(com): subs n, n, #2
+ ble L(end) C nothing left: r10:r4 is the running value
+ ldmdb ap!, {r5,r6}
+ b L(mid)
+
+L(top): mov r9, #0
+ umlal r5, r9, r6, r7 C B1
+ umlal r5, r9, r4, r8 C B2
+ ldmdb ap!, {r4,r6}
+ umlal r5, r9, r10, r12 C B3
+ ble L(xit) C flags from subs at L(mid): n reached exactly 0
+ mov r10, #0
+ umlal r4, r10, r6, r7 C B1
+ umlal r4, r10, r5, r8 C B2
+ ldmdb ap!, {r5,r6}
+ umlal r4, r10, r9, r12 C B3
+L(mid): subs n, n, #4
+ bge L(top)
+
+ mov r9, #0
+ umlal r5, r9, r6, r7 C B1
+ umlal r5, r9, r4, r8 C B2
+ umlal r5, r9, r10, r12 C B3
+ mov r4, r5 C running value now in r9:r4
+
+L(end): movge r9, r10 C GE on entry from com or xit: high limb is in r10
+ ldr r6, [r3, #4] C cps[1] = cnt
+ mov r5, #0
+ umlal r4, r5, r9, r7 C r5:r4 = r4 + r9 * B1
+ mov r7, r5, lsl r6
+L(x): rsb r1, r6, #32
+ orr r8, r7, r4, lsr r1 C r8 = high limb of the value shifted left by cnt
+ mov r9, r4, lsl r6 C r9 = low limb of the value shifted left by cnt
+ ldr r5, [r3, #0] C r5 = cps[0]
+ add r0, r8, #1
+ umull r12, r1, r8, r5 C r1:r12 = r8 * cps[0]
+ adds r4, r12, r9
+ adc r1, r1, r0 C r1 = quotient estimate
+ mul r5, r2, r1
+ sub r9, r9, r5 C r9 = candidate remainder
+ cmp r9, r4
+ addhi r9, r9, r2 C first conditional correction
+ cmp r2, r9
+ subls r9, r9, r2 C second correction when r9 >= d
+ mov r0, r9, lsr r6 C return the remainder shifted right by cnt
+ pop {r4-r10}
+ bx r14
+
+L(xit): mov r10, #0
+ umlal r4, r10, r6, r7 C B1
+ umlal r4, r10, r5, r8 C B2
+ umlal r4, r10, r9, r12 C B3
+ b L(end) C flags unchanged since ble L(xit), so movge runs
+
+L(1): ldr r6, [r3, #4] C cps[1] = cnt
+ ldr r4, [ap, #-4] C ap[0]
+ mov r7, #0 C no high limb
+ b L(x)
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1s_2p_cps)
+ push {r4-r8, r14}
+ clz r4, r1 C r4 = cnt, leading zero count of the limb in r1
+ mov r5, r1, lsl r4 C b <<= cnt
+ mov r6, r0 C r6 = cps
+ mov r0, r5
+ bl mpn_invert_limb
+ rsb r3, r4, #32
+ mov r3, r0, lsr r3 C r3 = bi >> (32 - cnt), bi being the call result
+ mov r2, #1
+ orr r3, r3, r2, lsl r4 C r3 |= 1 << cnt
+ rsb r1, r5, #0 C r1 = -b
+ mul r2, r1, r3 C r2 = first table value before the final shift
+ umull r3, r12, r2, r0 C r12:r3 = r2 * bi
+ add r12, r2, r12
+ mvn r12, r12
+ mul r1, r5, r12 C r1 = b * ~(r2 + high limb)
+ cmp r1, r3
+ addhi r1, r1, r5 C conditional correction by b
+ umull r12, r7, r1, r0 C same step again, starting from r1
+ add r7, r1, r7
+ mvn r7, r7
+ mul r3, r5, r7
+ cmp r3, r12
+ addhi r3, r3, r5 C conditional correction by b
+ mov r5, r2, lsr r4 C value for cps[2]
+ mov r7, r1, lsr r4 C value for cps[3]
+ mov r8, r3, lsr r4 C value for cps[4]
+ stmia r6, {r0,r4,r5,r7,r8} C fill cps
+ pop {r4-r8, pc}
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6/addmul_1.asm b/gmp/mpn/arm/v6/addmul_1.asm
new file mode 100644
index 0000000000..57019e4b2b
--- /dev/null
+++ b/gmp/mpn/arm/v6/addmul_1.asm
@@ -0,0 +1,111 @@
+dnl ARM mpn_addmul_1.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM: -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 3.25
+C Cortex-A15 4
+
+C TODO
+C * Micro-optimise feed-in code.
+C * Optimise for n=1,2 by delaying register saving.
+C * Try using ldm/stm.
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`v0',`r3')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ stmfd sp!, { r4, r5, r6, r7 }
+
+ ands r6, n, #3 C n mod 4 selects the feed-in point
+ mov r12, #0 C r12 = carry limb, starts at zero
+ beq L(fi0) C n mod 4 == 0
+ cmp r6, #2
+ bcc L(fi1) C n mod 4 == 1
+ beq L(fi2) C n mod 4 == 2
+
+L(fi3): ldr r4, [up], #4
+ ldr r6, [rp, #0]
+ ldr r5, [up], #4
+ b L(lo3)
+
+L(fi0): ldr r5, [up], #4
+ ldr r7, [rp], #4
+ ldr r4, [up], #4
+ b L(lo0)
+
+L(fi1): ldr r4, [up], #4
+ ldr r6, [rp], #8
+ subs n, n, #1
+ beq L(1) C n == 1: only the wind-down multiply is needed
+ ldr r5, [up], #4
+ b L(lo1)
+
+L(fi2): ldr r5, [up], #4
+ ldr r7, [rp], #12
+ ldr r4, [up], #4
+ b L(lo2)
+
+ ALIGN(16)
+L(top): ldr r6, [rp, #-8]
+ ldr r5, [up], #4
+ str r7, [rp, #-12]
+L(lo1): umaal r6, r12, r4, v0 C r12:r6 = r4 * v0 + r6 + r12
+ ldr r7, [rp, #-4]
+ ldr r4, [up], #4
+ str r6, [rp, #-8]
+L(lo0): umaal r7, r12, r5, v0 C r12:r7 = r5 * v0 + r7 + r12
+ ldr r6, [rp, #0]
+ ldr r5, [up], #4
+ str r7, [rp, #-4]
+L(lo3): umaal r6, r12, r4, v0 C r12:r6 = r4 * v0 + r6 + r12
+ ldr r7, [rp, #4]
+ ldr r4, [up], #4
+ str r6, [rp], #16
+L(lo2): umaal r7, r12, r5, v0 C r12:r7 = r5 * v0 + r7 + r12
+ subs n, n, #4
+ bhi L(top) C loop while limbs remain after this group of four
+
+ ldr r6, [rp, #-8]
+ str r7, [rp, #-12]
+L(1): umaal r6, r12, r4, v0 C last limb product
+ str r6, [rp, #-8]
+ mov r0, r12 C return the carry limb
+ ldmfd sp!, { r4, r5, r6, r7 }
+ bx lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6/addmul_2.asm b/gmp/mpn/arm/v6/addmul_2.asm
new file mode 100644
index 0000000000..69817ce340
--- /dev/null
+++ b/gmp/mpn/arm/v6/addmul_2.asm
@@ -0,0 +1,138 @@
+dnl ARM mpn_addmul_2.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM: -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 2.25
+C Cortex-A15 2.5
+
+C This is believed to be optimal for A15 for any unrolling, and optimal for A9
+C for 4-way unrolling. Using separate pointer update instructions is necessary
+C for optimal A9 speed.
+
+C TODO:
+C * Start the first multiply or multiplies directly at function entry.
+
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`vp',`r3')
+
+define(`v0',`r6')
+define(`v1',`r7')
+define(`u0',`r3')
+define(`u1',`r9')
+
+define(`cya',`r8')
+define(`cyb',`r12')
+
+
+ASM_START()
+PROLOGUE(mpn_addmul_2)
+ push { r4, r5, r6, r7, r8, r9 }
+
+ ldm vp, { v0, v1 } C fetch both multiplier limbs; r3 is reused as u0 below
+ mov cya, #0 C carry limb of the v0 chain
+ mov cyb, #0 C carry limb of the v1 chain
+
+ tst n, #1
+ beq L(evn) C even n
+
+L(odd): ldr r5, [rp, #0]
+ ldr u0, [up, #0]
+ ldr r4, [rp, #4]
+ tst n, #2
+ beq L(fi1) C n mod 4 == 1
+L(fi3): sub up, up, #12 C bias pointers to match the offsets used at lo3
+ sub rp, rp, #12
+ b L(lo3)
+L(fi1): sub n, n, #1
+ sub up, up, #4 C bias pointers to match the offsets used at lo1
+ sub rp, rp, #4
+ b L(lo1)
+
+L(evn): ldr r4, [rp, #0]
+ ldr u1, [up, #0]
+ ldr r5, [rp, #4]
+ tst n, #2
+ bne L(fi2) C n mod 4 == 2
+L(fi0): sub up, up, #8 C bias pointers to match the offsets used at lo0
+ sub rp, rp, #8
+ b L(lo0)
+L(fi2): subs n, n, #2
+ bls L(end) C n == 2: go straight to the wind-down
+
+ ALIGN(16)
+L(top): ldr u0, [up, #4]
+ umaal r4, cya, u1, v0 C cya:r4 = u1 * v0 + r4 + cya
+ str r4, [rp, #0]
+ ldr r4, [rp, #8]
+ umaal r5, cyb, u1, v1 C cyb:r5 = u1 * v1 + r5 + cyb
+L(lo1): ldr u1, [up, #8]
+ umaal r5, cya, u0, v0
+ str r5, [rp, #4]
+ ldr r5, [rp, #12]
+ umaal r4, cyb, u0, v1
+L(lo0): ldr u0, [up, #12]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #8]
+ ldr r4, [rp, #16]
+ umaal r5, cyb, u1, v1
+L(lo3): ldr u1, [up, #16]
+ umaal r5, cya, u0, v0
+ str r5, [rp, #12]
+ ldr r5, [rp, #20]
+ add rp, rp, #16 C separate pointer updates, four limbs per pass
+ umaal r4, cyb, u0, v1
+ add up, up, #16
+ subs n, n, #4
+ bhi L(top)
+
+L(end): umaal r4, cya, u1, v0
+ ldr u0, [up, #4] C last source limb
+ umaal r5, cyb, u1, v1
+ str r4, [rp, #0]
+ umaal r5, cya, u0, v0
+ umaal cya, cyb, u0, v1 C cyb:cya = u0 * v1 + cya + cyb
+ str r5, [rp, #4]
+ str cya, [rp, #8] C store the low carry limb
+ mov r0, cyb C return the high carry limb
+
+ pop { r4, r5, r6, r7, r8, r9 }
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6/addmul_3.asm b/gmp/mpn/arm/v6/addmul_3.asm
new file mode 100644
index 0000000000..046543020f
--- /dev/null
+++ b/gmp/mpn/arm/v6/addmul_3.asm
@@ -0,0 +1,187 @@
+dnl ARM mpn_addmul_3.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM: -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 2.125
+C Cortex-A15 2
+
+C TODO
+C * Use a fast path for n <= KARATSUBA_MUL_THRESHOLD using a jump table,
+C avoiding the current multiply.
+C * Start the first multiply or multiplies early.
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`vp',`r3')
+
+define(`v0',`r4') define(`v1',`r5') define(`v2',`r6')
+define(`u0',`r3') define(`u1',`r14')
+define(`w0',`r7') define(`w1',`r8') define(`w2',`r9')
+define(`cy0',`r10') define(`cy1',`r11') define(`cy2',`r12')
+
+
+ASM_START()
+PROLOGUE(mpn_addmul_3)
+ push { r4-r11, r14 } C r14 is used as u1, so it is saved too
+
+ ldr w0, =0xaaaaaaab C 3^{-1} mod 2^32
+ ldm vp, { v0,v1,v2 } C fetch the three multiplier limbs; r3 is reused as u0
+ mov cy0, #0
+ mov cy1, #0
+ mov cy2, #0
+
+C Tricky n mod 6
+ mul w0, w0, n C n * 3^{-1} mod 2^32
+ and w0, w0, #0xc0000001 C pseudo-CRT mod 3,2
+ sub n, n, #3 C bias n for the loop test at lo2
+ifdef(`PIC',`
+ add pc, pc, w0, ror $28
+ nop
+ b L(b0)
+ b L(b2)
+ b L(b4)
+ .word 0xe7f000f0 C udf
+ b L(b3)
+ b L(b5)
+ b L(b1)
+',`
+ ldr pc, [pc, w0, ror $28]
+ nop
+ .word L(b0), L(b2), L(b4), 0, L(b3), L(b5), L(b1)
+')
+
+L(b5): add up, up, #-8 C bias up to match the offsets used at lo5
+ ldr w1, [rp, #0]
+ ldr w2, [rp, #4]
+ ldr u1, [up, #8]
+ b L(lo5)
+
+L(b4): add rp, rp, #-4 C bias rp and up to match the offsets used at lo4
+ add up, up, #-12
+ ldr w2, [rp, #4]
+ ldr w0, [rp, #8]
+ ldr u0, [up, #12]
+ b L(lo4)
+
+L(b3): add rp, rp, #-8 C bias rp and up to match the offsets used at lo3
+ add up, up, #-16
+ ldr w0, [rp, #8]
+ ldr w1, [rp, #12]
+ ldr u1, [up, #16]
+ b L(lo3)
+
+L(b1): add rp, rp, #8 C bias rp to match the offsets used at lo1
+ ldr w2, [rp, #-8]
+ ldr w0, [rp, #-4]
+ ldr u1, [up, #0]
+ b L(lo1)
+
+L(b0): add rp, rp, #4 C bias rp and up to match the offsets used at lo0
+ add up, up, #-4
+ ldr w0, [rp, #-4]
+ ldr w1, [rp, #0]
+ ldr u0, [up, #4]
+ b L(lo0)
+
+L(b2): add rp, rp, #12 C bias rp and up, then fall through into the loop
+ add up, up, #4
+ ldr w1, [rp, #-12]
+ ldr w2, [rp, #-8]
+ ldr u0, [up, #-4]
+
+ ALIGN(16)
+L(top): ldr w0, [rp, #-4]
+ umaal w1, cy0, u0, v0 C cy0:w1 = u0 * v0 + w1 + cy0
+ ldr u1, [up, #0]
+ umaal w2, cy1, u0, v1 C cy1:w2 = u0 * v1 + w2 + cy1
+ str w1, [rp, #-12]
+ umaal w0, cy2, u0, v2 C cy2:w0 = u0 * v2 + w0 + cy2
+L(lo1): ldr w1, [rp, #0]
+ umaal w2, cy0, u1, v0
+ ldr u0, [up, #4]
+ umaal w0, cy1, u1, v1
+ str w2, [rp, #-8]
+ umaal w1, cy2, u1, v2
+L(lo0): ldr w2, [rp, #4]
+ umaal w0, cy0, u0, v0
+ ldr u1, [up, #8]
+ umaal w1, cy1, u0, v1
+ str w0, [rp, #-4]
+ umaal w2, cy2, u0, v2
+L(lo5): ldr w0, [rp, #8]
+ umaal w1, cy0, u1, v0
+ ldr u0, [up, #12]
+ umaal w2, cy1, u1, v1
+ str w1, [rp, #0]
+ umaal w0, cy2, u1, v2
+L(lo4): ldr w1, [rp, #12]
+ umaal w2, cy0, u0, v0
+ ldr u1, [up, #16]
+ umaal w0, cy1, u0, v1
+ str w2, [rp, #4]
+ umaal w1, cy2, u0, v2
+L(lo3): ldr w2, [rp, #16]
+ umaal w0, cy0, u1, v0
+ ldr u0, [up, #20]
+ umaal w1, cy1, u1, v1
+ str w0, [rp, #8]
+ umaal w2, cy2, u1, v2
+L(lo2): subs n, n, #6 C six source limbs are consumed per pass
+ add up, up, #24
+ add rp, rp, #24
+ bge L(top)
+
+L(end): umaal w1, cy0, u0, v0
+ ldr u1, [up, #0] C last source limb
+ umaal w2, cy1, u0, v1
+ str w1, [rp, #-12]
+ mov w0, #0 C no further destination limb to add in
+ umaal w0, cy2, u0, v2
+ umaal w2, cy0, u1, v0
+ umaal w0, cy1, u1, v1
+ str w2, [rp, #-8]
+ umaal cy1, cy2, u1, v2 C cy2:cy1 = u1 * v2 + cy1 + cy2
+ adds w0, w0, cy0 C fold the three carry limbs together
+ str w0, [rp, #-4]
+ adcs w1, cy1, #0
+ str w1, [rp, #0]
+ adc r0, cy2, #0 C return the topmost limb
+
+ pop { r4-r11, pc } C the saved r14 is popped straight into pc
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6/dive_1.asm b/gmp/mpn/arm/v6/dive_1.asm
new file mode 100644
index 0000000000..92de81473f
--- /dev/null
+++ b/gmp/mpn/arm/v6/dive_1.asm
@@ -0,0 +1,149 @@
+dnl ARM v6 mpn_divexact_1
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb
+C norm unorm modexact_1c_odd
+C StrongARM - -
+C XScale - -
+C Cortex-A7 ? ?
+C Cortex-A8 ? ?
+C Cortex-A9 9 10 9
+C Cortex-A15 7 7 7
+
+C Architecture requirements:
+C v5 -
+C v5t clz
+C v5te -
+C v6 umaal
+C v6t2 -
+C v7a -
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+define(`d', `r3')
+
+define(`cy', `r7')
+define(`cnt', `r6')
+define(`tnc', `r10')
+
+ASM_START()
+PROLOGUE(mpn_divexact_1)
+ push {r4,r5,r6,r7,r8,r9}
+
+ tst d, #1 C Z set when d is even; consumed by beq L(unnorm) below
+
+ rsb r4, d, #0
+ and r4, r4, d C isolate lowest set bit of d
+ clz r4, r4
+ rsb cnt, r4, #31 C count_trailing_zeros
+ mov d, d, lsr cnt C shift the trailing zeros out of d
+
+C binvert limb
+ LEA( r4, binvert_limb_table)
+ and r12, d, #254
+ ldrb r4, [r4, r12, lsr #1] C table entry indexed by bits 1..7 of d
+ mul r12, r4, r4
+ mul r12, d, r12
+ rsb r12, r12, r4, lsl #1 C r12 = 2*r4 - d*r4*r4
+ mul r4, r12, r12
+ mul r4, d, r4
+ rsb r4, r4, r12, lsl #1 C r4 = inverse
+
+ ldr r5, [up], #4 C up[0]
+ mov cy, #0
+ rsb r8, r4, #0 C r8 = -inverse
+ beq L(unnorm) C flags still from tst above: d was even
+
+L(norm):
+ subs n, n, #1
+ mul r5, r5, r4 C first quotient limb = up[0] * inverse
+ beq L(end) C single limb operand
+
+ ALIGN(16)
+L(top): ldr r9, [up], #4
+ mov r12, #0
+ str r5, [rp], #4 C store quotient limb
+ umaal r12, cy, r5, d C cy = high limb of r5 * d + cy
+ mul r5, r9, r4
+ mla r5, cy, r8, r5 C r5 = (r9 - cy) * inverse
+ subs n, n, #1
+ bne L(top)
+
+L(end): str r5, [rp] C store last quotient limb
+ pop {r4,r5,r6,r7,r8,r9}
+ bx r14
+
+L(unnorm):
+ push {r10,r11} C two more scratch registers for this path
+ rsb tnc, cnt, #32 C complementary shift count
+ mov r11, r5, lsr cnt C r11 = low part of the shifted source limb
+ subs n, n, #1
+ beq L(edx) C single limb operand
+
+ ldr r12, [up], #4
+ orr r9, r11, r12, lsl tnc C r9 = source limb shifted right by cnt
+ mov r11, r12, lsr cnt
+ mul r5, r9, r4 C first quotient limb
+ subs n, n, #1
+ beq L(edu)
+
+ ALIGN(16)
+L(tpu): ldr r12, [up], #4
+ orr r9, r11, r12, lsl tnc C r9 = next shifted source limb
+ mov r11, r12, lsr cnt
+ mov r12, #0
+ str r5, [rp], #4 C store quotient limb
+ umaal r12, cy, r5, d C cy = high limb of r5 * d + cy
+ mul r5, r9, r4
+ mla r5, cy, r8, r5 C r5 = (r9 - cy) * inverse
+ subs n, n, #1
+ bne L(tpu)
+
+L(edu): str r5, [rp], #4
+ mov r12, #0
+ umaal r12, cy, r5, d
+ mul r5, r11, r4 C top limb has only the bits left in r11
+ mla r5, cy, r8, r5
+ str r5, [rp]
+ pop {r10,r11}
+ pop {r4,r5,r6,r7,r8,r9}
+ bx r14
+
+L(edx): mul r5, r11, r4 C single limb: quotient = shifted limb * inverse
+ str r5, [rp]
+ pop {r10,r11}
+ pop {r4,r5,r6,r7,r8,r9}
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6/gmp-mparam.h b/gmp/mpn/arm/v6/gmp-mparam.h
new file mode 100644
index 0000000000..c9c6851769
--- /dev/null
+++ b/gmp/mpn/arm/v6/gmp-mparam.h
@@ -0,0 +1,157 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010, 2012 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 700MHz ARM11 (raspberry pi) */
+
+#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */
+#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1P_METHOD 2
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 29
+#define USE_PREINV_DIVREM_1 1 /* preinv always */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD 33
+
+#define MUL_TOOM22_THRESHOLD 36
+#define MUL_TOOM33_THRESHOLD 117
+#define MUL_TOOM44_THRESHOLD 462
+#define MUL_TOOM6H_THRESHOLD 0 /* always */
+#define MUL_TOOM8H_THRESHOLD 620
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 130
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 573
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 209
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 209
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 305
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 50
+#define SQR_TOOM3_THRESHOLD 181
+#define SQR_TOOM4_THRESHOLD 686
+#define SQR_TOOM6_THRESHOLD 0 /* always */
+#define SQR_TOOM8_THRESHOLD 915
+
+#define MULMID_TOOM42_THRESHOLD 72
+
+#define MULMOD_BNM1_THRESHOLD 25
+#define SQRMOD_BNM1_THRESHOLD 30
+
+#define MUL_FFT_MODF_THRESHOLD 476 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 476, 5}, { 21, 6}, { 11, 5}, { 25, 6}, \
+ { 13, 5}, { 27, 6}, { 25, 7}, { 13, 6}, \
+ { 28, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \
+ { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
+ { 47, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 43, 8}, { 23, 7}, { 51, 8}, \
+ { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \
+ { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \
+ { 71, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \
+ { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \
+ { 47, 9}, { 103,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \
+ { 191,10}, { 111,11}, { 63,10}, { 127, 9}, \
+ { 255,10}, { 143, 9}, { 287,10}, { 159,11}, \
+ { 95,10}, { 191, 9}, { 383,12}, { 4096,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 63
+#define MUL_FFT_THRESHOLD 4736
+
+#define SQR_FFT_MODF_THRESHOLD 464 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 464, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 29, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
+ { 36, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
+ { 47, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 43, 8}, { 23, 7}, { 49, 8}, \
+ { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \
+ { 35, 7}, { 71, 8}, { 43, 9}, { 23, 8}, \
+ { 55, 9}, { 31, 8}, { 71, 9}, { 39, 8}, \
+ { 83, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
+ { 31, 9}, { 79,10}, { 47, 9}, { 103,11}, \
+ { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
+ { 159,10}, { 95, 9}, { 191,10}, { 111,11}, \
+ { 63,10}, { 127, 9}, { 255,10}, { 143, 9}, \
+ { 287,10}, { 159,11}, { 95,10}, { 191, 9}, \
+ { 383,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
+ { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 61
+#define SQR_FFT_THRESHOLD 3776
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 67
+#define MULLO_MUL_N_THRESHOLD 8907
+
+#define DC_DIV_QR_THRESHOLD 40
+#define DC_DIVAPPR_Q_THRESHOLD 156
+#define DC_BDIV_QR_THRESHOLD 71
+#define DC_BDIV_Q_THRESHOLD 208
+
+#define INV_MULMOD_BNM1_THRESHOLD 70
+#define INV_NEWTON_THRESHOLD 151
+#define INV_APPR_THRESHOLD 150
+
+#define BINV_NEWTON_THRESHOLD 375
+#define REDC_1_TO_REDC_2_THRESHOLD 5
+#define REDC_2_TO_REDC_N_THRESHOLD 134
+
+#define MU_DIV_QR_THRESHOLD 2130
+#define MU_DIVAPPR_Q_THRESHOLD 2130
+#define MUPI_DIV_QR_THRESHOLD 80
+#define MU_BDIV_QR_THRESHOLD 1787
+#define MU_BDIV_Q_THRESHOLD 2130
+
+#define POWM_SEC_TABLE 7,32,460,1705
+
+#define MATRIX22_STRASSEN_THRESHOLD 19
+#define HGCD_THRESHOLD 85
+#define HGCD_APPR_THRESHOLD 119
+#define HGCD_REDUCE_THRESHOLD 3389
+#define GCD_DC_THRESHOLD 333
+#define GCDEXT_DC_THRESHOLD 309
+#define JACOBI_BASE_METHOD 1
+
+#define GET_STR_DC_THRESHOLD 21
+#define GET_STR_PRECOMPUTE_THRESHOLD 41
+#define SET_STR_DC_THRESHOLD 527
+#define SET_STR_PRECOMPUTE_THRESHOLD 1323
+
+#define FAC_DSC_THRESHOLD 414
+#define FAC_ODD_THRESHOLD 154
diff --git a/gmp/mpn/arm/v6/mode1o.asm b/gmp/mpn/arm/v6/mode1o.asm
new file mode 100644
index 0000000000..a2f77a6bf5
--- /dev/null
+++ b/gmp/mpn/arm/v6/mode1o.asm
@@ -0,0 +1,95 @@
+dnl ARM v6 mpn_modexact_1c_odd
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 9
+C Cortex-A15 7
+
+C Architecture requirements:
+C v5 -
+C v5t -
+C v5te smulbb
+C v6 umaal
+C v6t2 -
+C v7a -
+
+define(`up', `r0')
+define(`n', `r1')
+define(`d', `r2')
+define(`cy', `r3')
+
+ .protected binvert_limb_table
+ASM_START()
+PROLOGUE(mpn_modexact_1c_odd)
+ stmfd sp!, {r4, r5, r6, r7}
+
+ LEA( r4, binvert_limb_table)
+
+ ldr r6, [up], #4 C up[0]
+
+ and r12, d, #254
+ ldrb r4, [r4, r12, lsr #1] C table entry indexed by bits 1..7 of d
+ smulbb r12, r4, r4 C 16x16 multiply suffices, r4 is a byte value
+ mul r12, d, r12
+ rsb r12, r12, r4, asl #1 C r12 = 2*r4 - d*r4*r4
+ mul r4, r12, r12
+ mul r4, d, r4
+ rsb r4, r4, r12, asl #1 C r4 = inverse
+
+ subs n, n, #1
+ sub r6, r6, cy C apply the incoming carry; plain sub keeps the flags
+ mul r6, r6, r4 C r6 = (up[0] - cy) * inverse
+ beq L(end) C flags from subs above: single limb operand
+
+ rsb r5, r4, #0 C r5 = -inverse
+
+L(top): ldr r7, [up], #4
+ mov r12, #0
+ umaal r12, cy, r6, d C cy = high limb of r6 * d + cy
+ mul r6, r7, r4
+ mla r6, cy, r5, r6 C r6 = (r7 - cy) * inverse
+ subs n, n, #1
+ bne L(top)
+
+L(end): mov r12, #0
+ umaal r12, cy, r6, d C final carry limb
+ mov r0, cy C return the carry limb
+
+ ldmfd sp!, {r4, r5, r6, r7}
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6/mul_1.asm b/gmp/mpn/arm/v6/mul_1.asm
new file mode 100644
index 0000000000..0fcc0e46d9
--- /dev/null
+++ b/gmp/mpn/arm/v6/mul_1.asm
@@ -0,0 +1,114 @@
+dnl ARM mpn_mul_1.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM: -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 3.25
+C Cortex-A15 4
+
+C TODO
+C * Micro-optimise feed-in code.
+C * Optimise for n=1,2 by delaying register saving.
+C * Try using ldm/stm.
+
+C Register roles on entry: rp = r0 (destination), up = r1 (source),
+C n = r2 (limb count), v0 = r3 (multiplier limb). Result in r0.
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`v0',`r3')
+
+ASM_START()
+C mpn_mul_1: stores the low limbs of up[] * v0 to rp[] and returns the
+C final carry limb. r12 holds the running carry limb; r4/r5 hold source
+C limbs and r6/r7 the product limbs, alternating through a 4-way unrolled
+C loop. r4-r7 are saved and restored here.
+PROLOGUE(mpn_mul_1)
+ stmfd sp!, { r4, r5, r6, r7 }
+
+C Dispatch on n mod 4 to the matching feed-in block: 0 -> fi0, 1 -> fi1,
+C 2 -> fi2, 3 -> fi3 (fall through). r12 = 0 is the initial carry.
+ ands r6, n, #3
+ mov r12, #0
+ beq L(fi0)
+ cmp r6, #2
+ bcc L(fi1)
+ beq L(fi2)
+
+C Each feed-in preloads the source limbs its loop entry expects, zeroes
+C the accumulator that entry uses, and biases rp so the fixed negative
+C store offsets in the loop hit the right limbs.
+L(fi3): ldr r4, [up], #4
+ mov r6, #0
+ ldr r5, [up], #4
+ b L(lo3)
+
+L(fi0): ldr r5, [up], #4
+ add rp, rp, #4
+ mov r7, #0
+ ldr r4, [up], #4
+ b L(lo0)
+
+C n mod 4 == 1: when n is exactly 1 the loop is skipped entirely and the
+C single product is formed at L(1).
+L(fi1): ldr r4, [up], #4
+ mov r6, #0
+ add rp, rp, #8
+ subs n, n, #1
+ beq L(1)
+ ldr r5, [up], #4
+ b L(lo1)
+
+L(fi2): ldr r5, [up], #4
+ add rp, rp, #12
+ mov r7, #0
+ ldr r4, [up], #4
+ b L(lo2)
+
+C Main loop, four limbs per iteration. Each umaal computes
+C acc + r12 + limb*v0 with acc preset to 0: the low word lands in acc
+C (stored one step later) and the high word becomes the new carry in r12.
+ ALIGN(16)
+L(top): mov r6, #0
+ ldr r5, [up], #4
+ str r7, [rp, #-12]
+L(lo1): umaal r6, r12, r4, v0
+ mov r7, #0
+ ldr r4, [up], #4
+ str r6, [rp, #-8]
+L(lo0): umaal r7, r12, r5, v0
+ mov r6, #0
+ ldr r5, [up], #4
+ str r7, [rp, #-4]
+L(lo3): umaal r6, r12, r4, v0
+ mov r7, #0
+ ldr r4, [up], #4
+ str r6, [rp], #16
+L(lo2): umaal r7, r12, r5, v0
+ subs n, n, #4
+ bhi L(top)
+
+C Wind-down: store the pending limb, multiply the last source limb that
+C is already in r4, store it, and return the carry limb.
+ mov r6, #0
+ str r7, [rp, #-12]
+L(1): umaal r6, r12, r4, v0
+ str r6, [rp, #-8]
+ mov r0, r12
+ ldmfd sp!, { r4, r5, r6, r7 }
+ bx lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6/mul_2.asm b/gmp/mpn/arm/v6/mul_2.asm
new file mode 100644
index 0000000000..1679542a3c
--- /dev/null
+++ b/gmp/mpn/arm/v6/mul_2.asm
@@ -0,0 +1,131 @@
+dnl ARM mpn_mul_2.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM: -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 2.25
+C Cortex-A15 2.5
+
+C TODO
+C * This is a trivial edit of the addmul_2 code. Check for simplifications,
+C and possible speedups to 2.0 c/l.
+
+C Register roles on entry: rp = r0, up = r1, n = r2, vp = r3.
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`vp',`r3')
+
+C Working registers. u0 shares r3 with vp: vp is only read by the initial
+C ldm, after which r3 is reused for source limbs.
+define(`v0',`r6')
+define(`v1',`r7')
+define(`u0',`r3')
+define(`u1',`r9')
+
+C Two independent carry limbs, one per multiplier limb chain.
+define(`cya',`r8')
+define(`cyb',`r12')
+
+
+ASM_START()
+C mpn_mul_2: multiplies up[] by the two limbs loaded from vp (v0, v1),
+C storing product limbs to rp[]; the last limb is returned in r0 rather
+C than stored. r4/r5 are alternating accumulators, reset to 0 after each
+C store. r4-r9 are saved and restored here.
+PROLOGUE(mpn_mul_2)
+ push { r4, r5, r6, r7, r8, r9 }
+
+ ldm vp, { v0, v1 }
+ mov cya, #0
+ mov cyb, #0
+
+C Dispatch on the low two bits of n. Each feed-in loads the first source
+C limb and biases up/rp so the fixed offsets in the unrolled loop line up
+C with the chosen entry point.
+ tst n, #1
+ beq L(evn)
+L(odd): mov r5, #0
+ ldr u0, [up, #0]
+ mov r4, #0
+ tst n, #2
+ beq L(fi1)
+L(fi3): sub up, up, #12
+ sub rp, rp, #16
+ b L(lo3)
+L(fi1): sub n, n, #1
+ sub up, up, #4
+ sub rp, rp, #8
+ b L(lo1)
+L(evn): mov r4, #0
+ ldr u1, [up, #0]
+ mov r5, #0
+ tst n, #2
+ bne L(fi2)
+L(fi0): sub up, up, #8
+ sub rp, rp, #12
+ b L(lo0)
+C n mod 4 == 2: if n - 2 is zero the loop is skipped and only the
+C wind-down at L(end) runs.
+L(fi2): subs n, n, #2
+ sub rp, rp, #4
+ bls L(end)
+
+C Main loop, four source limbs per iteration. For each source limb u:
+C one umaal adds u*v0 into the accumulator being completed (chain cya),
+C that accumulator is stored and cleared, and a second umaal adds u*v1
+C into the other accumulator (chain cyb).
+ ALIGN(16)
+L(top): ldr u0, [up, #4]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #4]
+ mov r4, #0
+ umaal r5, cyb, u1, v1
+L(lo1): ldr u1, [up, #8]
+ umaal r5, cya, u0, v0
+ str r5, [rp, #8]
+ mov r5, #0
+ umaal r4, cyb, u0, v1
+L(lo0): ldr u0, [up, #12]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #12]
+ mov r4, #0
+ umaal r5, cyb, u1, v1
+L(lo3): ldr u1, [up, #16]!
+ umaal r5, cya, u0, v0
+ str r5, [rp, #16]!
+ mov r5, #0
+ umaal r4, cyb, u0, v1
+ subs n, n, #4
+ bhi L(top)
+
+C Wind-down for the last two source limbs. The two carry chains are merged
+C by the umaal on cya/cyb; cya is stored and cyb is the return value.
+L(end): umaal r4, cya, u1, v0
+ ldr u0, [up, #4]
+ umaal r5, cyb, u1, v1
+ str r4, [rp, #4]
+ umaal r5, cya, u0, v0
+ umaal cya, cyb, u0, v1
+ str r5, [rp, #8]
+ str cya, [rp, #12]
+ mov r0, cyb
+
+ pop { r4, r5, r6, r7, r8, r9 }
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6/popham.asm b/gmp/mpn/arm/v6/popham.asm
new file mode 100644
index 0000000000..44c8f2361c
--- /dev/null
+++ b/gmp/mpn/arm/v6/popham.asm
@@ -0,0 +1,138 @@
+dnl ARM mpn_popcount and mpn_hamdist.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C popcount hamdist
+C cycles/limb cycles/limb
+C StrongARM -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 8.94 9.47
+C Cortex-A15 5.67 6.44
+
+C Architecture requirements:
+C v5 -
+C v5t -
+C v5te ldrd strd
+C v6 usada8
+C v6t2 -
+C v7a -
+
+C Register map and line selectors for the popcount build. POPC(x) expands
+C to x and HAMD(x) expands to dnl, so HAMD-tagged lines drop out.
+ifdef(`OPERATION_popcount',`
+ define(`func',`mpn_popcount')
+ define(`ap', `r0')
+ define(`n', `r1')
+ define(`a0', `r2')
+ define(`a1', `r3')
+ define(`s', `r5')
+ define(`b_01010101', `r6')
+ define(`b_00110011', `r7')
+ define(`b_00001111', `r8')
+ define(`zero', `r9')
+ define(`POPC', `$1')
+ define(`HAMD', `dnl')
+')
+C Register map and line selectors for the hamdist build. The roles are
+C swapped: HAMD(x) expands to x and POPC(x) expands to dnl. A second
+C operand pointer bp and limb registers b0/b1 are added.
+ifdef(`OPERATION_hamdist',`
+ define(`func',`mpn_hamdist')
+ define(`ap', `r0')
+ define(`bp', `r1')
+ define(`n', `r2')
+ define(`a0', `r6')
+ define(`a1', `r7')
+ define(`b0', `r4')
+ define(`b1', `r5')
+ define(`s', `r11')
+ define(`b_01010101', `r8')
+ define(`b_00110011', `r9')
+ define(`b_00001111', `r10')
+ define(`zero', `r3')
+ define(`POPC', `dnl')
+ define(`HAMD', `$1')
+')
+
+
+ASM_START()
+C Shared body. Counts set bits in the limbs at ap (popcount) or in the
+C xor of the limbs at ap and bp (hamdist), returning the total in r0.
+C r12 is the running total; s holds four 8-bit partial counts that are
+C folded into r12 by usada8 one iteration later. r4 and r5 are temporaries.
+PROLOGUE(func)
+POPC(` push { r4-r9 } ')
+HAMD(` push { r4-r11 } ')
+
+C Load the three bit masks and clear the total and the zero register.
+ ldr b_01010101, =0x55555555
+ mov r12, #0
+ ldr b_00110011, =0x33333333
+ mov zero, #0
+ ldr b_00001111, =0x0f0f0f0f
+
+ tst n, #1
+ beq L(evn)
+
+C Odd n: reduce a single limb to 4-bit counts, then join the loop at
+C L(mid). The flags from the subs below are still live at the bne after
+C L(mid), so a one-limb operand falls straight through to the final sum.
+L(odd): ldr a1, [ap], #4 C 1 x 32 1-bit accumulators, 0-1
+HAMD(` ldr b1, [bp], #4 ') C 1 x 32 1-bit accumulators, 0-1
+HAMD(` eor a1, a1, b1 ')
+ and r4, b_01010101, a1, lsr #1
+ sub a1, a1, r4
+ and r4, a1, b_00110011
+ bic r5, a1, b_00110011
+ add r5, r4, r5, lsr #2 C 8 4-bit accumulators, 0-4
+ subs n, n, #1
+ b L(mid)
+
+C Even n: start with empty byte counts so the first usada8 adds nothing.
+L(evn): mov s, #0
+
+C Main loop, two limbs per iteration. usada8 against the zero register
+C adds the four bytes of s (counts from the previous iteration) into r12.
+L(top): ldrd a0, a1, [ap], #8 C 2 x 32 1-bit accumulators, 0-1
+HAMD(` ldrd b0, b1, [bp], #8')
+HAMD(` eor a0, a0, b0 ')
+HAMD(` eor a1, a1, b1 ')
+ subs n, n, #2
+ usada8 r12, s, zero, r12
+ and r4, b_01010101, a0, lsr #1
+ sub a0, a0, r4
+ and r4, b_01010101, a1, lsr #1
+ sub a1, a1, r4
+ and r4, a0, b_00110011
+ bic r5, a0, b_00110011
+ add a0, r4, r5, lsr #2 C 8 4-bit accumulators, 0-4
+ and r4, a1, b_00110011
+ bic r5, a1, b_00110011
+ add a1, r4, r5, lsr #2 C 8 4-bit accumulators, 0-4
+ add r5, a0, a1 C 8 4-bit accumulators, 0-8
+L(mid): and r4, r5, b_00001111
+ bic r5, r5, b_00001111
+ add s, r4, r5, lsr #4 C 4 8-bit accumulators
+ bne L(top)
+
+C Fold the last set of byte counts into the total and return it in r0.
+ usada8 r0, s, zero, r12
+POPC(` pop { r4-r9 } ')
+HAMD(` pop { r4-r11 } ')
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6/sqr_basecase.asm b/gmp/mpn/arm/v6/sqr_basecase.asm
new file mode 100644
index 0000000000..d52970aaa7
--- /dev/null
+++ b/gmp/mpn/arm/v6/sqr_basecase.asm
@@ -0,0 +1,518 @@
+dnl ARM v6 mpn_sqr_basecase.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Code structure:
+C
+C
+C m_2(0m4) m_2(2m4) m_2(1m4) m_2(3m4)
+C | | | |
+C | | | |
+C | | | |
+C \|/ \|/ \|/ \|/
+C ____________ ____________
+C / \ / \
+C \|/ \ \|/ \
+C am_2(3m4) am_2(1m4) am_2(0m4) am_2(2m4)
+C \ /|\ \ /|\
+C \____________/ \____________/
+C \ /
+C \ /
+C \ /
+C tail(0m2) tail(1m2)
+C \ /
+C \ /
+C sqr_diag_addlsh1
+
+C TODO
+C * Further tweak counter and updates in outer loops. (This could save
+C perhaps 5n cycles).
+C * Try to use fewer registers. Perhaps coalesce r9 branch target and n_saved.
+C (This could save 2-3 cycles for n > 4.)
+C * Optimise sqr_diag_addlsh1 loop. (This could save O(n) cycles.)
+C * Implement larger final corners (xit/tix). Also stop loops earlier
+C suppressing writes of upper-most rp[] values. (This could save 10-20
+C cycles for n > 4.)
+C * Is the branch table really faster than discrete branches?
+
+C Register roles on entry: rp = r0 (result), up = r1 (operand), n = r2.
+define(`rp', r0)
+define(`up', r1)
+define(`n', r2)
+
+C Working registers for the mul_2/addmul_2 style passes: v0/v1 are the
+C two multiplier limbs of the current pass, u0/u1 the streamed operand
+C limbs, cya/cyb the two carry limbs, i the inner loop counter, and
+C n_saved keeps the original n for the final diagonal pass.
+define(`v0', r3)
+define(`v1', r6)
+define(`i', r8)
+define(`n_saved', r14)
+define(`cya', r11)
+define(`cyb', r12)
+define(`u0', r7)
+define(`u1', r9)
+
+ASM_START()
+PROLOGUE(mpn_sqr_basecase)
+C Jump-table dispatch. Index = n mod 4, plus 4 when n > 4, so sizes up to
+C 4 go to straight-line code and larger sizes to the feed-in for their
+C residue class. In ARM state pc reads as the current instruction + 8,
+C which is why a nop sits between the add and table entry 0.
+ and r12, n, #3
+ cmp n, #4
+ addgt r12, r12, #4
+ add pc, pc, r12, lsl #2
+ nop
+ b L(4)
+ b L(1)
+ b L(2)
+ b L(3)
+ b L(0m4)
+ b L(1m4)
+ b L(2m4)
+ b L(3m4)
+
+
+C Feed-in blocks for n > 4. Each one saves r4-r11 and r14, records n in
+C n_saved, sets i = n - 4 and n = n - 2, puts in r10 the address of the
+C wind-down block to run after the first pass, loads v0, v1 and the first
+C streamed limb, forms the first product v1*v0 with umull, stores its low
+C limb, and enters the shared first-pass loop at the matching label.
+L(1m4): push {r4-r10,r11,r14}
+ mov n_saved, n
+ sub i, n, #4
+ sub n, n, #2
+ add r10, pc, #L(am2_2m4)-.-8
+ ldm up, {v0,v1,u0}
+ sub up, up, #4
+ mov cyb, #0
+ mov r5, #0
+ umull r4, cya, v1, v0
+ str r4, [rp], #-12
+ mov r4, #0
+ b L(ko0)
+
+L(3m4): push {r4-r10,r11,r14}
+ mov n_saved, n
+ sub i, n, #4
+ sub n, n, #2
+ add r10, pc, #L(am2_0m4)-.-8
+ ldm up, {v0,v1,u0}
+ add up, up, #4
+ mov cyb, #0
+ mov r5, #0
+ umull r4, cya, v1, v0
+ str r4, [rp], #-4
+ mov r4, #0
+ b L(ko2)
+
+L(2m4): push {r4-r10,r11,r14}
+ mov n_saved, n
+ sub i, n, #4
+ sub n, n, #2
+ add r10, pc, #L(am2_3m4)-.-8
+ ldm up, {v0,v1,u1}
+ mov cyb, #0
+ mov r4, #0
+ umull r5, cya, v1, v0
+ str r5, [rp], #-8
+ mov r5, #0
+ b L(ko1)
+
+L(0m4): push {r4-r10,r11,r14}
+ mov n_saved, n
+ sub i, n, #4
+ sub n, n, #2
+ add r10, pc, #L(am2_1m4)-.-8
+ ldm up, {v0,v1,u1}
+ mov cyb, #0
+ mov r4, #0
+ add up, up, #8
+ umull r5, cya, v1, v0
+ str r5, [rp, #0]
+ mov r5, #0
+
+C First pass (mul_2 style): rp is written without being read, and the
+C accumulators r4/r5 are reset to 0 after each store. Four streamed limbs
+C per iteration. On exit control goes to the wind-down whose address was
+C placed in r10 by the feed-in.
+L(top): ldr u0, [up, #4]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #4]
+ mov r4, #0
+ umaal r5, cyb, u1, v1
+L(ko2): ldr u1, [up, #8]
+ umaal r5, cya, u0, v0
+ str r5, [rp, #8]
+ mov r5, #0
+ umaal r4, cyb, u0, v1
+L(ko1): ldr u0, [up, #12]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #12]
+ mov r4, #0
+ umaal r5, cyb, u1, v1
+L(ko0): ldr u1, [up, #16]!
+ umaal r5, cya, u0, v0
+ str r5, [rp, #16]!
+ mov r5, #0
+ umaal r4, cyb, u0, v1
+ subs i, i, #4
+ bhi L(top)
+ bx r10
+
+C Outer loop A (addmul_2 style passes: accumulators are loaded from rp
+C rather than zeroed). Each trip runs two passes with different loop entry
+C points, each finished by a wind-down block. n drops by 2 per pass; when
+C n - 4 goes negative at the top, the corner code at L(tix) takes over.
+L(evnloop):
+ subs i, n, #4
+ sub n, n, #2
+ blt L(tix)
+ ldm up, {v0,v1,u0}
+ add up, up, #4
+ mov cya, #0
+ mov cyb, #0
+ ldm rp, {r4,r5}
+ sub rp, rp, #4
+ umaal r4, cya, v1, v0
+ str r4, [rp, #4]
+ ldr r4, [rp, #12]
+ b L(lo2)
+L(ua2): ldr u0, [up, #4]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #4]
+ ldr r4, [rp, #12]
+ umaal r5, cyb, u1, v1
+L(lo2): ldr u1, [up, #8]
+ umaal r5, cya, u0, v0
+ str r5, [rp, #8]
+ ldr r5, [rp, #16]
+ umaal r4, cyb, u0, v1
+ ldr u0, [up, #12]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #12]
+ ldr r4, [rp, #20]
+ umaal r5, cyb, u1, v1
+ ldr u1, [up, #16]!
+ umaal r5, cya, u0, v0
+ str r5, [rp, #16]!
+ ldr r5, [rp, #8]
+ umaal r4, cyb, u0, v1
+ subs i, i, #4
+ bhi L(ua2)
+C Wind-down of a pass: finish the last two streamed limbs, store the two
+C carry limbs, then rewind up and rp by n limbs and step to the next pair
+C of multiplier limbs. This block also sets up and starts the second pass
+C of outer loop A.
+L(am2_0m4):
+ umaal r4, cya, u1, v0
+ ldr u0, [up, #4]
+ umaal r5, cyb, u1, v1
+ str r4, [rp, #4]
+ umaal r5, cya, u0, v0
+ umaal cya, cyb, u0, v1
+ str r5, [rp, #8]
+ str cya, [rp, #12]
+ str cyb, [rp, #16]
+ sub up, up, n, lsl #2
+ sub rp, rp, n, lsl #2
+ add up, up, #8
+ sub i, n, #4
+ sub n, n, #2
+ ldm up, {v0,v1,u0}
+ sub up, up, #4
+ mov cya, #0
+ mov cyb, #0
+ ldr r4, [rp, #24]
+ ldr r5, [rp, #28]
+ add rp, rp, #12
+ umaal r4, cya, v1, v0
+ str r4, [rp, #12]
+ ldr r4, [rp, #20]
+ b L(lo0)
+L(ua0): ldr u0, [up, #4]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #4]
+ ldr r4, [rp, #12]
+ umaal r5, cyb, u1, v1
+ ldr u1, [up, #8]
+ umaal r5, cya, u0, v0
+ str r5, [rp, #8]
+ ldr r5, [rp, #16]
+ umaal r4, cyb, u0, v1
+ ldr u0, [up, #12]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #12]
+ ldr r4, [rp, #20]
+ umaal r5, cyb, u1, v1
+L(lo0): ldr u1, [up, #16]!
+ umaal r5, cya, u0, v0
+ str r5, [rp, #16]!
+ ldr r5, [rp, #8]
+ umaal r4, cyb, u0, v1
+ subs i, i, #4
+ bhi L(ua0)
+C Wind-down of the second pass, then back to the top of outer loop A.
+L(am2_2m4):
+ umaal r4, cya, u1, v0
+ ldr u0, [up, #4]
+ umaal r5, cyb, u1, v1
+ str r4, [rp, #4]
+ umaal r5, cya, u0, v0
+ umaal cya, cyb, u0, v1
+ str r5, [rp, #8]
+ str cya, [rp, #12]
+ str cyb, [rp, #16]
+ sub up, up, n, lsl #2
+ sub rp, rp, n, lsl #2
+ add up, up, #8
+ add rp, rp, #24
+ b L(evnloop)
+
+
+C Outer loop B: same structure as outer loop A with the other two loop
+C entry points. When n - 4 goes negative at the top, the corner code at
+C L(xit) takes over.
+L(oddloop):
+ subs i, n, #4
+ sub n, n, #2
+ blt L(xit)
+ ldm up, {v0,v1,u1}
+ mov cya, #0
+ mov cyb, #0
+ sub rp, rp, #8
+ ldr r5, [rp, #8]
+ ldr r4, [rp, #12]
+ umaal r5, cya, v1, v0
+ str r5, [rp, #8]
+ ldr r5, [rp, #16]
+ b L(lo1)
+L(ua1): ldr u0, [up, #4]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #4]
+ ldr r4, [rp, #12]
+ umaal r5, cyb, u1, v1
+ ldr u1, [up, #8]
+ umaal r5, cya, u0, v0
+ str r5, [rp, #8]
+ ldr r5, [rp, #16]
+ umaal r4, cyb, u0, v1
+L(lo1): ldr u0, [up, #12]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #12]
+ ldr r4, [rp, #20]
+ umaal r5, cyb, u1, v1
+ ldr u1, [up, #16]!
+ umaal r5, cya, u0, v0
+ str r5, [rp, #16]!
+ ldr r5, [rp, #8]
+ umaal r4, cyb, u0, v1
+ subs i, i, #4
+ bhi L(ua1)
+C Wind-down, then set-up of the second pass of outer loop B. The flags
+C from the subs on i below stay live up to the bls, which skips the inner
+C loop when i is not positive.
+L(am2_3m4):
+ umaal r4, cya, u1, v0
+ ldr u0, [up, #4]
+ umaal r5, cyb, u1, v1
+ str r4, [rp, #4]
+ umaal r5, cya, u0, v0
+ umaal cya, cyb, u0, v1
+ str r5, [rp, #8]
+ str cya, [rp, #12]
+ str cyb, [rp, #16]
+ sub up, up, n, lsl #2
+ sub rp, rp, n, lsl #2
+ add up, up, #8
+ add rp, rp, #24
+ subs i, n, #4
+ sub n, n, #2
+ ldm up, {v0,v1,u1}
+ mov cya, #0
+ mov cyb, #0
+ ldr r5, [rp, #0]
+ ldr r4, [rp, #4]
+ add up, up, #8
+ umaal r5, cya, v1, v0
+ str r5, [rp, #0]
+ ldr r5, [rp, #8]
+ bls L(e3)
+L(ua3): ldr u0, [up, #4]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #4]
+ ldr r4, [rp, #12]
+ umaal r5, cyb, u1, v1
+ ldr u1, [up, #8]
+ umaal r5, cya, u0, v0
+ str r5, [rp, #8]
+ ldr r5, [rp, #16]
+ umaal r4, cyb, u0, v1
+ ldr u0, [up, #12]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #12]
+ ldr r4, [rp, #20]
+ umaal r5, cyb, u1, v1
+ ldr u1, [up, #16]!
+ umaal r5, cya, u0, v0
+ str r5, [rp, #16]!
+ ldr r5, [rp, #8]
+ umaal r4, cyb, u0, v1
+ subs i, i, #4
+ bhi L(ua3)
+C Wind-down of the second pass, then back to the top of outer loop B.
+L(e3):
+L(am2_1m4):
+ umaal r4, cya, u1, v0
+ ldr u0, [up, #4]
+ umaal r5, cyb, u1, v1
+ str r4, [rp, #4]
+ umaal r5, cya, u0, v0
+ umaal cya, cyb, u0, v1
+ str r5, [rp, #8]
+ str cya, [rp, #12]
+ str cyb, [rp, #16]
+ sub up, up, n, lsl #2
+ sub rp, rp, n, lsl #2
+ add up, up, #8
+ add rp, rp, #24
+ b L(oddloop)
+
+C Final corner reached from outer loop B: one remaining cross product
+C u0*v0 is added to the limb loaded from rp. The resulting cya/cyb pair is
+C stored by the first two instructions of the diagonal pass.
+L(xit): ldm up!, {v0,u0}
+ ldr cya, [rp], #12
+ mov cyb, #0
+ umaal cya, cyb, u0, v0
+ b L(sqr_diag_addlsh1)
+
+C Final corner reached from outer loop A: three remaining cross products
+C (v1*v0, u0*v0, u0*v1) are accumulated; control then falls through into
+C the diagonal pass.
+L(tix): ldm up!, {v0,v1,u0}
+ ldm rp, {r4,r5}
+ mov cya, #0
+ mov cyb, #0
+ umaal r4, cya, v1, v0
+ umaal r5, cya, u0, v0
+ stm rp, {r4,r5}
+ umaal cya, cyb, u0, v1
+ add rp, rp, #20
+C b L(sqr_diag_addlsh1)
+
+
+C Register names for the diagonal pass; these reuse registers that the
+C multiply passes no longer need.
+define(`w0', r6)
+define(`w1', r7)
+define(`w2', r8)
+define(`rbx', r9)
+
+C Diagonal pass. Stores the two pending carry limbs, rewinds up by n_saved
+C limbs and rp by 2*n_saved limbs, then walks the operand once: each trip
+C doubles two limbs read back from rp (adcs x, x, x) and adds in the
+C square of an operand limb formed by umull, carrying between trips
+C through the flags and w2. r10 is kept at 0 to capture the carry via adc.
+L(sqr_diag_addlsh1):
+ str cya, [rp, #-12]
+ str cyb, [rp, #-8]
+ sub n, n_saved, #1
+ sub up, up, n_saved, lsl #2
+ sub rp, rp, n_saved, lsl #3
+ ldr r3, [up], #4
+ umull w1, r5, r3, r3
+ mov w2, #0
+ mov r10, #0
+C cmn r0, #0 C clear cy (already clear by luck)
+ b L(lm)
+
+L(tsd): adds w0, w0, rbx
+ adcs w1, w1, r4
+ str w0, [rp, #0]
+L(lm): ldr w0, [rp, #4]
+ str w1, [rp, #4]
+ ldr w1, [rp, #8]!
+ add rbx, r5, w2
+ adcs w0, w0, w0
+ ldr r3, [up], #4
+ adcs w1, w1, w1
+ adc w2, r10, r10
+ umull r4, r5, r3, r3
+ subs n, n, #1
+ bne L(tsd)
+
+C Last trip: add the final square, store the top three limbs, and return
+C by popping the saved r14 straight into pc.
+ adds w0, w0, rbx
+ adcs w1, w1, r4
+ adc w2, r5, w2
+ stm rp, {w0,w1,w2}
+
+ pop {r4-r10,r11,pc}
+
+
+C Straight line code for n <= 4
+
+C n = 1: a single umull, two result limbs.
+L(1): ldr r3, [up, #0]
+ umull r1, r2, r3, r3
+ stm rp, {r1,r2}
+ bx r14
+
+C n = 2: two squares plus the doubled cross product, four result limbs.
+L(2): push {r4-r5}
+ ldm up, {r5,r12}
+ umull r1, r2, r5, r5
+ umull r3, r4, r12, r12
+ umull r5, r12, r5, r12
+ adds r5, r5, r5
+ adcs r12, r12, r12
+ adc r4, r4, #0
+ adds r2, r2, r5
+ adcs r3, r3, r12
+ adc r4, r4, #0
+ stm rp, {r1,r2,r3,r4}
+ pop {r4-r5}
+ bx r14
+
+C n = 3: three squares in r1-r6, three cross products gathered in
+C r10, r11, r12, r7, doubled and added in, six result limbs.
+L(3): push {r4-r11}
+ ldm up, {r7,r8,r9}
+ umull r1, r2, r7, r7
+ umull r3, r4, r8, r8
+ umull r5, r6, r9, r9
+ umull r10, r11, r7, r8
+ mov r12, #0
+ umlal r11, r12, r7, r9
+ mov r7, #0
+ umlal r12, r7, r8, r9
+ adds r10, r10, r10
+ adcs r11, r11, r11
+ adcs r12, r12, r12
+ adcs r7, r7, r7
+ adc r6, r6, #0
+ adds r2, r2, r10
+ adcs r3, r3, r11
+ adcs r4, r4, r12
+ adcs r5, r5, r7
+ adc r6, r6, #0
+ stm rp, {r1,r2,r3,r4,r5,r6}
+ pop {r4-r11}
+ bx r14
+
+C n = 4: the four squares are written to rp first (seven limbs stored,
+C the eighth kept in r8), the six cross products are gathered in r1-r6,
+C doubled, added to limbs 1..6 read back from rp, and limbs 1..7 are
+C stored. r14 is used as a scratch register and restored via pop into pc.
+L(4): push {r4-r11, r14}
+ ldm up, {r9,r10,r11,r12}
+ umull r1, r2, r9, r9
+ umull r3, r4, r10, r10
+ umull r5, r6, r11, r11
+ umull r7, r8, r12, r12
+ stm rp, {r1,r2,r3,r4,r5,r6,r7}
+ umull r1, r2, r9, r10
+ mov r3, #0
+ umlal r2, r3, r9, r11
+ mov r4, #0
+ umlal r3, r4, r9, r12
+ mov r5, #0
+ umlal r3, r5, r10, r11
+ umaal r4, r5, r10, r12
+ mov r6, #0
+ umlal r5, r6, r11, r12
+ adds r1, r1, r1
+ adcs r2, r2, r2
+ adcs r3, r3, r3
+ adcs r4, r4, r4
+ adcs r5, r5, r5
+ adcs r6, r6, r6
+ adc r7, r8, #0
+ add rp, rp, #4
+ ldm rp, {r8,r9,r10,r11,r12,r14}
+ adds r1, r1, r8
+ adcs r2, r2, r9
+ adcs r3, r3, r10
+ adcs r4, r4, r11
+ adcs r5, r5, r12
+ adcs r6, r6, r14
+ adc r7, r7, #0
+ stm rp, {r1,r2,r3,r4,r5,r6,r7}
+ pop {r4-r11, pc}
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6/submul_1.asm b/gmp/mpn/arm/v6/submul_1.asm
new file mode 100644
index 0000000000..8a21733a0a
--- /dev/null
+++ b/gmp/mpn/arm/v6/submul_1.asm
@@ -0,0 +1,125 @@
+dnl ARM mpn_submul_1.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM: -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 3.75
+C Cortex-A15 4.0
+
+C This loop complements U on the fly,
+C U' = B^n - 1 - U
+C and then uses that
+C R - U*v = R + U'*v + v - B^n v
+
+C TODO
+C * Micro-optimise feed-in code.
+C * Optimise for n=1,2 by delaying register saving.
+C * Try using ldm/stm.
+
+C Register roles on entry: rp = r0 (updated in place), up = r1 (source),
+C n = r2 (limb count), v0 = r3 (multiplier limb). Result in r0.
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`v0',`r3')
+
+ASM_START()
+C mpn_submul_1: every source limb is complemented with mvn and then
+C multiplied and accumulated into rp[] with umaal, so the loop only ever
+C adds. The carry limb r12 starts at v0 and the return value is v0 minus
+C the final carry limb. r4/r5 hold (complemented) source limbs, r6/r7 the
+C rp limbs being updated. r4-r7 are saved and restored here.
+PROLOGUE(mpn_submul_1)
+ stmfd sp!, { r4, r5, r6, r7 }
+
+C Dispatch on n mod 4: 0 -> fi0, 1 -> fi1, 2 -> fi2, 3 -> fi3 (fall
+C through).
+ ands r6, n, #3
+ mov r12, v0
+ beq L(fi0)
+ cmp r6, #2
+ bcc L(fi1)
+ beq L(fi2)
+
+C Each feed-in loads and complements the first source limb, loads the
+C first rp limb, preloads the next source limb (complemented later inside
+C the loop), and biases up/rp for the fixed offsets used by the loop.
+L(fi3): ldr r4, [up], #12
+ mvn r4, r4
+ ldr r6, [rp, #0]
+ ldr r5, [up, #-8]
+ b L(lo3)
+
+L(fi0): ldr r5, [up], #16
+ mvn r5, r5
+ ldr r7, [rp], #4
+ ldr r4, [up, #-12]
+ b L(lo0)
+
+C n mod 4 == 1: when n is exactly 1 the loop is skipped and the single
+C limb is handled at L(1).
+L(fi1): ldr r4, [up], #4
+ mvn r4, r4
+ ldr r6, [rp], #8
+ subs n, n, #1
+ beq L(1)
+ ldr r5, [up]
+ b L(lo1)
+
+L(fi2): ldr r5, [up], #8
+ mvn r5, r5
+ ldr r7, [rp], #12
+ ldr r4, [up, #-4]
+ b L(lo2)
+
+C Main loop, four limbs per iteration. Each umaal computes
+C rp_limb + r12 + complemented_limb*v0: the low word replaces the rp limb
+C (stored one step later) and the high word becomes the new carry in r12.
+ ALIGN(16)
+L(top): ldr r6, [rp, #-8]
+ ldr r5, [up]
+ str r7, [rp, #-12]
+L(lo1): umaal r6, r12, r4, v0
+ add up, up, #16
+ mvn r5, r5
+ ldr r7, [rp, #-4]
+ ldr r4, [up, #-12]
+ str r6, [rp, #-8]
+L(lo0): umaal r7, r12, r5, v0
+ mvn r4, r4
+ ldr r6, [rp, #0]
+ ldr r5, [up, #-8]
+ str r7, [rp, #-4]
+L(lo3): umaal r6, r12, r4, v0
+ mvn r5, r5
+ ldr r7, [rp, #4]
+ ldr r4, [up, #-4]
+ str r6, [rp], #16
+L(lo2): umaal r7, r12, r5, v0
+ mvn r4, r4
+ subs n, n, #4
+ bhi L(top)
+
+C Wind-down: store the pending limb, process the last limb (already
+C complemented in r4), and return v0 - carry.
+ ldr r6, [rp, #-8]
+ str r7, [rp, #-12]
+L(1): umaal r6, r12, r4, v0
+ str r6, [rp, #-8]
+ sub r0, v0, r12
+ ldmfd sp!, { r4, r5, r6, r7 }
+ bx lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6t2/divrem_1.asm b/gmp/mpn/arm/v6t2/divrem_1.asm
new file mode 100644
index 0000000000..be24615acb
--- /dev/null
+++ b/gmp/mpn/arm/v6t2/divrem_1.asm
@@ -0,0 +1,212 @@
+dnl ARM v6t2 mpn_divrem_1 and mpn_preinv_divrem_1.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C norm unorm frac
+C StrongARM - - -
+C XScale - - -
+C Cortex-A7 ? ? ?
+C Cortex-A8 ? ? ?
+C Cortex-A9 13 14 13
+C Cortex-A15 11.4 11.8 11.1
+
+C TODO
+C * Optimise inner-loops better, they could likely run a cycle or two faster.
+C * Decrease register usage, streamline non-loop code.
+
+C Incoming arguments. The first four arrive in r0-r3; d_arg, dinv_arg and
+C cnt_arg are byte offsets of the stack-passed arguments relative to sp at
+C entry (the code adds 9*4 to step over the nine registers it pushes).
+define(`qp_arg', `r0')
+define(`fn', `r1')
+define(`up_arg', `r2')
+define(`n_arg', `r3')
+define(`d_arg', `0')
+define(`dinv_arg',`4')
+define(`cnt_arg', `8')
+
+C Working registers shared by both entry points. dinv shares r0 with
+C qp_arg, so qp must be derived from qp_arg before dinv is loaded.
+define(`n', `r9')
+define(`qp', `r5')
+define(`up', `r6')
+define(`cnt', `r7')
+define(`tnc', `r10')
+define(`dinv', `r0')
+define(`d', `r4')
+
+ASM_START()
+C mpn_preinv_divrem_1: entry stub for callers that supply dinv and cnt on
+C the stack. It sets up the same register state as mpn_divrem_1 and then
+C branches into that function body, skipping its mpn_invert_limb call:
+C to L(nent) when d is negative as a signed value (top bit set), otherwise
+C to L(uent).
+PROLOGUE(mpn_preinv_divrem_1)
+ stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ ldr d, [sp, #9*4+d_arg]
+ ldr cnt, [sp, #9*4+cnt_arg]
+ str r1, [sp, #9*4+d_arg] C reuse d stack slot for fn
+C n = (limb count in r3) - 1; r3 then becomes fn + n, the index of the
+C top quotient limb.
+ sub n, r3, #1
+ add r3, r1, n
+C The flags from this cmp are consumed by the blt below; the add and ldr
+C instructions in between do not alter them.
+ cmp d, #0
+ add qp, qp_arg, r3, lsl #2 C put qp at Q[] end
+ add up, up_arg, n, lsl #2 C put up at U[] end
+ ldr dinv, [sp, #9*4+dinv_arg]
+ blt L(nent)
+ b L(uent)
+EPILOGUE()
+
+C mpn_divrem_1: divides the limbs at up by the single limb d, writing
+C quotient limbs downwards from the top of the quotient area, followed by
+C fn further quotient limbs produced from the remainder alone. The
+C remainder is returned in r0. d is read from the stack and its slot is
+C then reused to hold fn. The inverse dinv is obtained from
+C mpn_invert_limb. L(uent) and L(nent) are also entered from
+C mpn_preinv_divrem_1.
+PROLOGUE(mpn_divrem_1)
+ stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ sub n, r3, #1
+ ldr d, [sp, #9*4+d_arg] C d
+ str r1, [sp, #9*4+d_arg] C reuse d stack slot for fn
+ add r3, r1, n
+ cmp d, #0
+ add qp, qp_arg, r3, lsl #2 C put qp at Q[] end
+ add up, up_arg, n, lsl #2 C put up at U[] end
+C d negative as a signed value means its top bit is set: take the
+C normalised path, which needs no shifting.
+ blt L(normalised)
+
+C Unnormalised divisor: cnt = number of leading zeros of d. The inverse is
+C computed for d << cnt and arrives in r0, which is dinv.
+L(unnorm):
+ clz cnt, d
+ mov r0, d, lsl cnt C pass d << cnt
+ bl mpn_invert_limb
+L(uent):
+ mov d, d, lsl cnt C d <<= cnt
+ cmp n, #0
+ mov r1, #0 C r
+C n < 0 means there are no integer limbs: only fraction limbs remain.
+ blt L(frac)
+
+ ldr r11, [up, #0]
+
+C tnc = 32 - cnt. The bits shifted out of the top limb form the initial
+C remainder r1. The flags from the cmp on n above are still live at the
+C beq, which handles the single-limb case.
+ rsb tnc, cnt, #32
+ mov r1, r11, lsr tnc
+ mov r11, r11, lsl cnt
+ beq L(uend)
+
+ ldr r3, [up, #-4]!
+ orr r2, r11, r3, lsr tnc
+ b L(mid)
+
+C Unnormalised main loop. Per trip: r8 = quotient candidate computed at
+C L(mid) from remainder r1 and dinv; mls forms the new remainder from the
+C shifted dividend limb saved in r11; the two conditional adjustments
+C correct it. The next shifted dividend limb is assembled in r2 in between.
+C The rare second adjustment is done out of line at L(ufx).
+L(utop):
+ mls r1, d, r8, r11
+ mov r11, r3, lsl cnt
+ ldr r3, [up, #-4]!
+ cmp r1, r2
+ addhi r1, r1, d
+ subhi r8, r8, #1
+ orr r2, r11, r3, lsr tnc
+ cmp r1, d
+ bcs L(ufx)
+L(uok): str r8, [qp], #-4
+L(mid): add r8, r1, #1
+ mov r11, r2
+ umlal r2, r8, r1, dinv
+ subs n, n, #1
+ bne L(utop)
+
+C Wind-down: finish the limb in flight with both adjustments inline.
+ mls r1, d, r8, r11
+ mov r11, r3, lsl cnt
+ cmp r1, r2
+ addhi r1, r1, d
+ subhi r8, r8, #1
+ cmp r1, d
+ rsbcs r1, d, r1
+ addcs r8, r8, #1
+ str r8, [qp], #-4
+
+C Last integer quotient limb, from the low bits left in r11.
+L(uend):add r8, r1, #1
+ mov r2, r11
+ umlal r2, r8, r1, dinv
+ mls r1, d, r8, r11
+ cmp r1, r2
+ addhi r1, r1, d
+ subhi r8, r8, #1
+ cmp r1, d
+ rsbcs r1, d, r1
+ addcs r8, r8, #1
+ str r8, [qp], #-4
+C Fraction part, shared by both paths: fn quotient limbs are produced with
+C a zero dividend limb below the running remainder r1.
+L(frac):
+ ldr r2, [sp, #9*4+d_arg] C fn
+ cmp r2, #0
+ beq L(fend)
+
+L(ftop):mov r6, #0
+ add r3, r1, #1
+ umlal r6, r3, r1, dinv
+ mov r8, #0
+ mls r1, d, r3, r8
+ cmp r1, r6
+ addhi r1, r1, d
+ subhi r3, r3, #1
+ subs r2, r2, #1
+ str r3, [qp], #-4
+ bne L(ftop)
+
+C Shift the remainder back down by cnt (0 on the normalised path) and
+C return it.
+L(fend):mov r11, r1, lsr cnt
+L(rtn): mov r0, r11
+ ldmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+
+C Normalised divisor: the inverse is computed for d itself.
+L(normalised):
+ mov r0, d
+ bl mpn_invert_limb
+L(nent):
+ cmp n, #0
+ mov r11, #0 C r
+ blt L(nend)
+
+C Top limb: the high quotient limb is 1 when the limb is >= d (and d is
+C subtracted), otherwise 0.
+ ldr r11, [up, #0]
+ cmp r11, d
+ movlo r2, #0 C hi q limb
+ movhs r2, #1 C hi q limb
+ subhs r11, r11, d
+
+ str r2, [qp], #-4
+ cmp n, #0
+ beq L(nend)
+
+C Normalised main loop: r11 = remainder, r12 = quotient candidate. The
+C dividend limb is loaded twice, once into r1 as the umlal accumulator and
+C once into r3 for the mls. The rare second adjustment is at L(nfx).
+L(ntop):ldr r1, [up, #-4]!
+ add r12, r11, #1
+ umlal r1, r12, r11, dinv
+ ldr r3, [up, #0]
+ mls r11, d, r12, r3
+ cmp r11, r1
+ addhi r11, r11, d
+ subhi r12, r12, #1
+ cmp d, r11
+ bls L(nfx)
+L(nok): str r12, [qp], #-4
+ subs n, n, #1
+ bne L(ntop)
+
+C Hand over to the shared fraction code with a zero shift count.
+L(nend):mov r1, r11 C r
+ mov cnt, #0 C shift cnt
+ b L(frac)
+
+C Out-of-line fix-ups: remainder >= d, so bump the quotient limb and
+C subtract d once more.
+L(nfx): add r12, r12, #1
+ rsb r11, d, r11
+ b L(nok)
+L(ufx): rsb r1, d, r1
+ add r8, r8, #1
+ b L(uok)
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6t2/gcd_1.asm b/gmp/mpn/arm/v6t2/gcd_1.asm
new file mode 100644
index 0000000000..2063647963
--- /dev/null
+++ b/gmp/mpn/arm/v6t2/gcd_1.asm
@@ -0,0 +1,115 @@
+dnl ARM v6t2 mpn_gcd_1.
+
+dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for ARM by Torbjörn
+dnl Granlund.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/bit (approx)
+C StrongARM -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 5.3
+C Cortex-A15 3.5
+C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1
+
+C TODO
+C * Optimise inner-loop better.
+C * Push saving/restoring of callee-saved regs into call code
+
+C Threshold of when to call bmod when U is one limb. Should be about
+C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
+C Shift applied to u0 before it is compared with v0 in the single-limb
+C case below.
+define(`BMOD_THRES_LOG2', 7)
+
+C INPUT PARAMETERS
+define(`up', `r0')
+define(`n', `r1')
+define(`v0', `r2')
+
+C Default used when the threshold is not already defined. With this value
+C the mpn_modexact_1c_odd path is chosen for every n below 0xffffffff.
+ifdef(`BMOD_1_TO_MOD_1_THRESHOLD',,
+ `define(`BMOD_1_TO_MOD_1_THRESHOLD',0xffffffff)')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+C mpn_gcd_1: gcd of the n-limb number at up and the single limb v0,
+C returned in r0. r4 keeps the count of common trailing zero bits, r7 the
+C odd part of v0; both survive the helper calls. The multi-limb operand is
+C first reduced to one limb by mpn_modexact_1c_odd or mpn_mod_1, and a
+C subtract-and-shift loop then finishes the job.
+PROLOGUE(mpn_gcd_1)
+ push {r4, r7, lr}
+ ldr r3, [up] C U low limb
+
+C rbit followed by clz counts trailing zeros.
+ orr r3, r3, v0
+ rbit r4, r3
+ clz r4, r4 C min(ctz(u0),ctz(v0))
+
+C Strip the trailing zeros from v0 so it is odd from here on.
+ rbit r12, v0
+ clz r12, r12
+ mov v0, v0, lsr r12
+
+ mov r7, v0
+
+ cmp n, #1
+ bne L(nby1)
+
+C Both U and V are single limbs, reduce with bmod if u0 >> v0.
+ ldr r3, [up]
+ cmp v0, r3, lsr #BMOD_THRES_LOG2
+ bhi L(red1)
+
+C up, n and v0 are still in r0-r2 for the call; r3 is the carry-in.
+L(bmod):mov r3, #0 C carry argument
+ bl mpn_modexact_1c_odd
+ b L(red0)
+
+L(nby1):cmp n, #BMOD_1_TO_MOD_1_THRESHOLD
+ blo L(bmod)
+
+ bl mpn_mod_1
+
+C r3 = reduced U. If it is zero the answer is the odd part of v0 shifted
+C back up; the flags from the cmp are still live at the bne because rbit
+C and clz do not change them.
+L(red0):mov r3, r0
+L(red1):cmp r3, #0
+ rbit r12, r3
+ clz r12, r12
+ bne L(mid)
+ b L(end)
+
+C Main loop on the pair (r3, r7). At L(mid) r3 has its trailing zeros
+C shifted out (count in r12) and is copied to r0; then r1 = r7 - r3 with
+C flags, r3 = r3 - r7, and r12 = trailing zero count of r1. The loop ends
+C when the difference is zero. At L(top) the movcs pair executes when the
+C subs left carry set (no borrow), replacing r3 with r7 - r3 and r7 with
+C the previous r3.
+ ALIGN(8)
+L(top): movcs r3, r1 C if x-y < 0
+ movcs r7, r0 C use x,y-x
+L(mid): mov r3, r3, lsr r12 C
+ mov r0, r3 C
+ subs r1, r7, r3 C
+ rsb r3, r7, r3 C
+ rbit r12, r1
+ clz r12, r12 C
+ bne L(top) C
+
+C Restore the common power of two and return.
+L(end): mov r0, r7, lsl r4
+ pop {r4, r7, pc}
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/addmul_1.asm b/gmp/mpn/arm/v7a/cora15/addmul_1.asm
new file mode 100644
index 0000000000..c2277b32b2
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/addmul_1.asm
@@ -0,0 +1,145 @@
+dnl ARM mpn_addmul_1 optimised for A15.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb best
+C StrongARM: -
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 6 3.25
+C Cortex-A15 2 this
+
+C This code uses umlal for adding in the rp[] data, keeping the recurrency path
+C separate from any multiply instructions. It performs well on A15, at umlal's
+C bandwidth.
+C
+C An A9 variant should perhaps stick to 3-way unrolling, and use ldm and stm
+C for all loads and stores. Alternatively, it could do 2-way or 4-way, but
+C then alignment aware code will be necessary (adding O(1) bookkeeping
+C overhead).
+C
+C We don't use r12 due to ldrd and strd limitations.
+
+C Architecture requirements:
+C v5 -
+C v5t -
+C v5te ldrd strd
+C v6 -
+C v6t2 -
+C v7a -
+
+C Argument registers: rp = limbs that are read, added to and written back,
+C up = source limbs, n = limb count, v0 = the single-limb multiplier.
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+define(`v0', `r3')
+
+C Register pairs used as ldrd destinations: w0/w1 receive limbs loaded from
+C rp, u0/u1 receive limbs loaded from up.
+define(`w0', `r10') define(`w1', `r11')
+define(`u0', `r8') define(`u1', `r9')
+
+ASM_START()
+C mpn_addmul_1 (rp, up, n, v0): multiply the limbs at up by v0, add the
+C products to the limbs at rp in place, and return the final carry limb in r0.
+C The main loop handles 4 limbs per pass; the four feed-in blocks below are
+C selected by n mod 4 and enter the loop at L(top), L(mid) or go to L(end).
+C Each umlal adds a product to a limb loaded from rp; the high halves are
+C propagated through r4-r7 with the adcs chain.
+PROLOGUE(mpn_addmul_1)
+ push { r4-r11 }
+
+ ands r6, n, #3 C r6 = n mod 4
+ sub n, n, #3 C bias the count; the bmi/bpl tests stop once it is negative
+ beq L(b00)
+ cmp r6, #2
+ bcc L(b01)
+ beq L(b10)
+
+C n mod 4 == 3: one limb is loaded here, the pointers are stepped back by 4 so
+C that the offsets used from L(mid) onwards line up.
+L(b11): mov r6, #0
+ cmn r13, #0 C carry clear
+ ldr u1, [up], #-4
+ ldr w1, [rp], #-4
+ mov r7, #0
+ b L(mid)
+
+C n mod 4 == 0: the first limb is completed here, the second at L(mid).
+L(b00): ldrd u0, u1, [up]
+ ldrd w0, w1, [rp]
+ mov r6, #0
+ umlal w0, r6, u0, v0
+ cmn r13, #0 C carry clear
+ mov r7, #0
+ str w0, [rp]
+ b L(mid)
+
+C n mod 4 == 2: the first limb is stored here, the second is left pending in
+C w1/r5 for L(top) or L(end).
+L(b10): ldrd u0, u1, [up], #8
+ ldrd w0, w1, [rp]
+ mov r4, #0
+ umlal w0, r4, u0, v0
+ cmn r13, #0 C carry clear
+ mov r5, #0
+ str w0, [rp], #8
+ umlal w1, r5, u1, v0
+ tst n, n
+ bmi L(end)
+ b L(top)
+
+C n mod 4 == 1: reached through bcc, so the carry flag is already clear.
+L(b01): mov r4, #0
+ ldr u1, [up], #4
+ ldr w1, [rp], #4
+ mov r5, #0
+ umlal w1, r5, u1, v0
+ tst n, n
+ bmi L(end)
+
+ ALIGN(16)
+L(top): ldrd u0, u1, [up, #0]
+ adcs r4, r4, w1
+ ldrd w0, w1, [rp, #0]
+ mov r6, #0
+ umlal w0, r6, u0, v0 C 1 2
+ adcs r5, r5, w0
+ mov r7, #0
+ strd r4, r5, [rp, #-4]
+L(mid): umlal w1, r7, u1, v0 C 2 3
+ ldrd u0, u1, [up, #8]
+ adcs r6, r6, w1
+ ldrd w0, w1, [rp, #8]
+ mov r4, #0
+ umlal w0, r4, u0, v0 C 3 4
+ adcs r7, r7, w0
+ mov r5, #0
+ strd r6, r7, [rp, #4]
+ umlal w1, r5, u1, v0 C 0 1
+ sub n, n, #4
+ add up, up, #16
+ add rp, rp, #16
+ tst n, n C tst leaves the carry flag for the next adcs
+ bpl L(top)
+
+L(end): adcs r4, r4, w1 C finish the pending limb
+ str r4, [rp, #-4]
+ adc r0, r5, #0 C return value = last high half plus carry
+ pop { r4-r11 }
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/aors_n.asm b/gmp/mpn/arm/v7a/cora15/aors_n.asm
new file mode 100644
index 0000000000..dc3f83992e
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/aors_n.asm
@@ -0,0 +1,162 @@
+dnl ARM mpn_add_n/mpn_sub_n optimised for A15.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb best
+C StrongARM: -
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 3.55 2.5
+C Cortex-A15 1.27 this
+
+C This was a major improvement compared to the code we had before, but it might
+C not be the best 8-way code possible. We've tried some permutations of auto-
+C increments and separate pointer updates, but they all ran at the same speed
+C on A15.
+
+C Architecture requirements:
+C v5 -
+C v5t -
+C v5te ldrd strd
+C v6 -
+C v6t2 -
+C v7a -
+
+C Argument registers: rp = destination, up and vp = the two sources,
+C n = limb count.
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n', `r3')
+
+C Per-operation macros, selected by the OPERATION_* symbol:
+C ADDSUBC carry-propagating add or subtract used for every limb
+C SETCY(r) load the carry flag from register r before the first limb
+C RETVAL code producing the return value in r0 at L(wd1)
+C RETVAL2 same for the L(dne) exit, where n is -1 instead of 0
+C func, func_nc names of the two entry points
+C IFADD is defined but not referenced in this file.
+C
+C Addition: "cmp r, #1" sets the carry flag exactly when r is nonzero, and
+C the return value is n plus the carry flag (plus 1 in RETVAL2).
+ifdef(`OPERATION_add_n', `
+ define(`ADDSUBC', adcs)
+ define(`IFADD', `$1')
+ define(`SETCY', `cmp $1, #1')
+ define(`RETVAL', `adc r0, n, #0')
+ define(`RETVAL2', `adc r0, n, #1')
+ define(`func', mpn_add_n)
+ define(`func_nc', mpn_add_nc)')
+C Subtraction: "rsbs r, r, #0" computes 0 - r, so the carry flag ends up set
+C (no borrow) exactly when r is zero; r itself is overwritten. The return
+C value is 1 when the carry flag is clear after the last sbcs, else 0.
+ifdef(`OPERATION_sub_n', `
+ define(`ADDSUBC', sbcs)
+ define(`IFADD', `')
+ define(`SETCY', `rsbs $1, $1, #0')
+ define(`RETVAL', `sbc r0, r0, r0
+ and r0, r0, #1')
+ define(`RETVAL2', `RETVAL')
+ define(`func', mpn_sub_n)
+ define(`func_nc', mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+C func_nc (rp, up, vp, n, cy): entry point with an incoming carry/borrow,
+C loaded from the word at [sp]; it then joins func at L(ent).
+PROLOGUE(func_nc)
+ ldr r12, [sp]
+ b L(ent)
+EPILOGUE()
+C func (rp, up, vp, n): limb-wise add or subtract (ADDSUBC) of the operands
+C at up and vp, result stored at rp, carry or borrow returned in r0.
+C r12 holds the incoming carry until SETCY moves it into the carry flag.
+C The loop handles 8 limbs per pass; the feed-in blocks are selected by
+C n mod 4 and bias the pointers so the fixed offsets in the loop line up.
+PROLOGUE(func)
+ mov r12, #0 C no incoming carry
+L(ent): push { r4-r9 }
+
+ ands r6, n, #3 C r6 = n mod 4
+ mov n, n, lsr #2 C n = number of 4-limb groups
+ beq L(b00)
+ cmp r6, #2
+ bcc L(b01)
+ beq L(b10)
+
+C n mod 4 == 3: one limb here, the next two are finished at L(lo).
+L(b11): ldr r5, [up], #4
+ ldr r7, [vp], #4
+ SETCY( r12)
+ ADDSUBC r9, r5, r7
+ ldrd r4, r5, [up, #0]
+ ldrd r6, r7, [vp, #0]
+ str r9, [rp], #-4
+ b L(lo)
+
+C n mod 4 == 0: enter the loop at L(mid) with the first limb pair loaded.
+L(b00): ldrd r4, r5, [up], #-8
+ ldrd r6, r7, [vp], #-8
+ SETCY( r12)
+ sub rp, rp, #16
+ b L(mid)
+
+C n mod 4 == 1: one limb here; done if no 4-limb groups remain.
+L(b01): ldr r5, [up], #-4
+ ldr r7, [vp], #-4
+ SETCY( r12)
+ ADDSUBC r9, r5, r7
+ str r9, [rp], #-12
+ tst n, n
+ beq L(wd1)
+L(gt1): ldrd r4, r5, [up, #8]
+ ldrd r6, r7, [vp, #8]
+ b L(mid)
+
+C n mod 4 == 2: the two limbs are finished at L(lo).
+L(b10): ldrd r4, r5, [up]
+ ldrd r6, r7, [vp]
+ SETCY( r12)
+ sub rp, rp, #8
+ b L(lo)
+
+C Main loop. Only ADDSUBC writes the flags between iterations: tst with a
+C plain register operand and sub without the s suffix leave the carry alone.
+ ALIGN(16)
+L(top): ldrd r4, r5, [up, #8]
+ ldrd r6, r7, [vp, #8]
+ strd r8, r9, [rp, #8]
+L(mid): ADDSUBC r8, r4, r6
+ ADDSUBC r9, r5, r7
+ ldrd r4, r5, [up, #16]
+ ldrd r6, r7, [vp, #16]
+ strd r8, r9, [rp, #16]
+ ADDSUBC r8, r4, r6
+ ADDSUBC r9, r5, r7
+ sub n, n, #2 C two 4-limb groups per pass
+ tst n, n
+ bmi L(dne) C only one group was left: finish with n == -1
+ ldrd r4, r5, [up, #24]
+ ldrd r6, r7, [vp, #24]
+ strd r8, r9, [rp, #24]
+ ADDSUBC r8, r4, r6
+ ADDSUBC r9, r5, r7
+ ldrd r4, r5, [up, #32]!
+ ldrd r6, r7, [vp, #32]!
+ strd r8, r9, [rp, #32]!
+L(lo): ADDSUBC r8, r4, r6
+ ADDSUBC r9, r5, r7
+ tst n, n
+ bne L(top)
+
+C Normal exit: n is 0 here, which the add flavour of RETVAL relies on.
+L(end): strd r8, r9, [rp, #8]
+L(wd1): RETVAL
+ pop { r4-r9 }
+ bx r14
+C Mid-loop exit: n is -1 here, which RETVAL2 compensates for.
+L(dne): strd r8, r9, [rp, #24]
+ RETVAL2
+ pop { r4-r9 }
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/cnd_aors_n.asm b/gmp/mpn/arm/v7a/cora15/cnd_aors_n.asm
new file mode 100644
index 0000000000..b9e5cd3f79
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/cnd_aors_n.asm
@@ -0,0 +1,158 @@
+dnl ARM mpn_cnd_add_n/mpn_cnd_sub_n optimised for A15.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb best
+C StrongARM: -
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 3.75 3
+C Cortex-A15 1.78 this
+
+C This code does not run as well as one could have hoped, since 1.5 c/l seems
+C realistic for this insn mix.
+
+C Architecture requirements:
+C v5 -
+C v5t -
+C v5te ldrd strd
+C v6 -
+C v6t2 -
+C v7a -
+
+C Argument registers: cnd = condition, rp = destination, up and vp = sources.
+C n is not an argument register; the function loads it from [sp] into r12.
+define(`cnd',`r0')
+define(`rp', `r1')
+define(`up', `r2')
+define(`vp', `r3')
+define(`n', `r12')
+
+C Per-operation macros, selected by the OPERATION_* symbol:
+C ADDSUB flag-setting add/subtract without carry-in, used for a leading limb
+C ADDSUBC carry-propagating add/subtract used for all other limbs
+C INITCY puts the carry flag in its "no carry / no borrow" state
+C RETVAL code producing the return value in r0
+C func name of the entry point
+C NOTE(review): IFADD, RETVAL2 and func_nc are defined here but not
+C referenced anywhere in this file; they look like leftovers from aors_n.asm.
+C
+C Addition: "cmn r0, #0" adds zero to r0, which leaves the carry flag clear.
+ifdef(`OPERATION_cnd_add_n', `
+ define(`ADDSUB', adds)
+ define(`ADDSUBC', adcs)
+ define(`IFADD', `$1')
+ define(`INITCY', `cmn r0, #0')
+ define(`RETVAL', `adc r0, n, #0')
+ define(`RETVAL2', `adc r0, n, #1')
+ define(`func', mpn_cnd_add_n)
+ define(`func_nc', mpn_add_nc)')
+C Subtraction: "cmp r0, #0" subtracts zero from r0, which sets the carry
+C flag, i.e. no borrow.
+ifdef(`OPERATION_cnd_sub_n', `
+ define(`ADDSUB', subs)
+ define(`ADDSUBC', sbcs)
+ define(`IFADD', `')
+ define(`INITCY', `cmp r0, #0')
+ define(`RETVAL', `sbc r0, r0, r0
+ and r0, r0, #1')
+ define(`RETVAL2', `RETVAL')
+ define(`func', mpn_cnd_sub_n)
+ define(`func_nc', mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ASM_START()
+C func (cnd, rp, up, vp, n): add or subtract the limbs at vp to/from the limbs
+C at up when cnd is nonzero, storing the result at rp and returning the carry
+C or borrow in r0. When cnd is zero every vp limb is masked to zero with bic
+C before the arithmetic, so the same instruction sequence runs either way.
+C The loop handles 4 limbs per pass; feed-in blocks are selected by n mod 4.
+PROLOGUE(func)
+ ldr n, [sp] C n comes from the stack; read before push moves sp
+ push { r4-r9 }
+
+ cmp cnd, #1 C carry set iff cnd != 0
+ sbc cnd, cnd, cnd C conditionally set to 0xffffffff
+C From here on cnd is a mask: 0 if the argument was nonzero, else 0xffffffff.
+
+ ands r6, n, #3 C r6 = n mod 4
+ mov n, n, lsr #2 C n = number of 4-limb groups
+ beq L(b00)
+ cmp r6, #2
+ bcc L(b01)
+ beq L(b10)
+
+C n mod 4 == 3: the leading limb uses ADDSUB, which also initialises the carry.
+L(b11): ldr r5, [up], #4
+ ldr r7, [vp], #4
+ bic r7, r7, cnd
+ ADDSUB r9, r5, r7
+ ldrd r4, r5, [up, #0]
+ ldrd r6, r7, [vp, #0]
+ bic r6, r6, cnd
+ bic r7, r7, cnd
+ str r9, [rp], #-4
+ b L(lo)
+
+C n mod 4 == 0: no leading limb, so the carry is initialised with INITCY.
+L(b00): ldrd r4, r5, [up], #-8
+ ldrd r6, r7, [vp], #-8
+ bic r6, r6, cnd
+ bic r7, r7, cnd
+ INITCY
+ sub rp, rp, #16
+ b L(mid)
+
+C n mod 4 == 1: one limb here; done if no 4-limb groups remain.
+L(b01): ldr r5, [up], #-4
+ ldr r7, [vp], #-4
+ bic r7, r7, cnd
+ ADDSUB r9, r5, r7
+ str r9, [rp], #-12
+ tst n, n
+ beq L(wd1)
+L(gt1): ldrd r4, r5, [up, #8]
+ ldrd r6, r7, [vp, #8]
+ bic r6, r6, cnd
+ bic r7, r7, cnd
+ b L(mid)
+
+C n mod 4 == 2: the two limbs are finished at L(lo).
+L(b10): ldrd r4, r5, [up]
+ ldrd r6, r7, [vp]
+ bic r6, r6, cnd
+ bic r7, r7, cnd
+ INITCY
+ sub rp, rp, #8
+ b L(lo)
+
+C Main loop. bic, tst with a plain register operand and sub without the s
+C suffix do not change the carry flag, so the ADDSUBC chain stays intact.
+ ALIGN(16)
+L(top): ldrd r6, r7, [vp, #8]
+ ldrd r4, r5, [up, #8]
+ bic r6, r6, cnd
+ bic r7, r7, cnd
+ strd r8, r9, [rp, #8]
+L(mid): ADDSUBC r8, r4, r6
+ ADDSUBC r9, r5, r7
+ ldrd r6, r7, [vp, #16]!
+ ldrd r4, r5, [up, #16]!
+ bic r6, r6, cnd
+ bic r7, r7, cnd
+ sub n, n, #1
+ strd r8, r9, [rp, #16]!
+L(lo): ADDSUBC r8, r4, r6
+ ADDSUBC r9, r5, r7
+ tst n, n
+ bne L(top)
+
+C n is 0 here, which the add flavour of RETVAL relies on.
+L(end): strd r8, r9, [rp, #8]
+L(wd1): RETVAL
+ pop { r4-r9 }
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/com.asm b/gmp/mpn/arm/v7a/cora15/com.asm
new file mode 100644
index 0000000000..a258afe934
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/com.asm
@@ -0,0 +1,180 @@
+dnl ARM mpn_com optimised for A15.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM ?
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 2.5
+C Cortex-A15 1.0
+
+C This is great A15 core register code, but it is a bit large.
+C We use FEEDIN_VARIANT 1 to save some space, but use 8-way unrolling.
+
+C Architecture requirements:
+C v5 -
+C v5t -
+C v5te ldrd strd
+C v6 -
+C v6t2 -
+C v7a -
+
+C Build-time selectors for the ifelse blocks in the function body:
+C FEEDIN_VARIANT picks one of three feed-in sequences, UNROLL picks the loop.
+define(`FEEDIN_VARIANT', 1) C alternatives: 0 1 2
+define(`UNROLL', 4x2) C alternatives: 4 4x2
+
+C Argument registers: rp = destination, up = source, n = limb count.
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+
+ASM_START()
+C mpn_com (rp, up, n): store the bitwise complement (mvn) of each limb read
+C from up at the corresponding position of rp.
+C Exactly one feed-in block and one loop block below are assembled, chosen by
+C FEEDIN_VARIANT and UNROLL. The feed-in handles n mod 4 limbs and biases the
+C pointers so that the fixed offsets used in the loop line up.
+PROLOGUE(mpn_com)
+ push { r4-r5,r8-r9 }
+
+C Feed-in variant 0.
+ifelse(FEEDIN_VARIANT,0,`
+ ands r12, n, #3
+ mov n, n, lsr #2
+ beq L(b00a)
+ tst r12, #1
+ beq L(bx0)
+ ldr r5, [up], #4
+ mvn r9, r5
+ str r9, [rp], #4
+ tst r12, #2
+ beq L(b00)
+L(bx0): ldrd r4, r5, [up, #0]
+ sub rp, rp, #8
+ b L(lo)
+L(b00): tst n, n
+ beq L(wd1)
+L(b00a):ldrd r4, r5, [up], #-8
+ sub rp, rp, #16
+ b L(mid)
+')
+C Feed-in variant 1: test bit 0 and bit 1 of n one after the other.
+ifelse(FEEDIN_VARIANT,1,`
+ and r12, n, #3 C r12 = n mod 4
+ mov n, n, lsr #2 C n = number of 4-limb groups
+ tst r12, #1
+ beq L(bx0)
+ ldr r5, [up], #4 C odd n: complement one limb up front
+ mvn r9, r5
+ str r9, [rp], #4
+L(bx0): tst r12, #2
+ beq L(b00)
+ ldrd r4, r5, [up, #0] C two leftover limbs: finish them at L(lo)
+ sub rp, rp, #8
+ b L(lo)
+L(b00): tst n, n
+ beq L(wd1) C no 4-limb groups: nothing more to do
+ ldrd r4, r5, [up], #-8
+ sub rp, rp, #16
+ b L(mid)
+')
+C Feed-in variant 2: four-way dispatch on n mod 4.
+ifelse(FEEDIN_VARIANT,2,`
+ ands r12, n, #3
+ mov n, n, lsr #2
+ beq L(b00)
+ cmp r12, #2
+ bcc L(b01)
+ beq L(b10)
+
+L(b11): ldr r5, [up], #4
+ mvn r9, r5
+ ldrd r4, r5, [up, #0]
+ str r9, [rp], #-4
+ b L(lo)
+
+L(b00): ldrd r4, r5, [up], #-8
+ sub rp, rp, #16
+ b L(mid)
+
+L(b01): ldr r5, [up], #-4
+ mvn r9, r5
+ str r9, [rp], #-12
+ tst n, n
+ beq L(wd1)
+L(gt1): ldrd r4, r5, [up, #8]
+ b L(mid)
+
+L(b10): ldrd r4, r5, [up]
+ sub rp, rp, #8
+ b L(lo)
+')
+ ALIGN(16)
+C Loop, 4 limbs per pass: n is decremented by 1 each time round.
+ifelse(UNROLL,4,`
+L(top): ldrd r4, r5, [up, #8]
+ strd r8, r9, [rp, #8]
+L(mid): mvn r8, r4
+ mvn r9, r5
+ ldrd r4, r5, [up, #16]!
+ strd r8, r9, [rp, #16]!
+ sub n, n, #1
+L(lo): mvn r8, r4
+ mvn r9, r5
+ tst n, n
+ bne L(top)
+')
+C Loop, 8 limbs per pass: n is decremented by 2, with an early exit to L(dne)
+C when only one 4-limb group was left.
+ifelse(UNROLL,4x2,`
+L(top): ldrd r4, r5, [up, #8]
+ strd r8, r9, [rp, #8]
+L(mid): mvn r8, r4
+ mvn r9, r5
+ ldrd r4, r5, [up, #16]
+ strd r8, r9, [rp, #16]
+ mvn r8, r4
+ mvn r9, r5
+ sub n, n, #2
+ tst n, n
+ bmi L(dne)
+ ldrd r4, r5, [up, #24]
+ strd r8, r9, [rp, #24]
+ mvn r8, r4
+ mvn r9, r5
+ ldrd r4, r5, [up, #32]!
+ strd r8, r9, [rp, #32]!
+L(lo): mvn r8, r4
+ mvn r9, r5
+ tst n, n
+ bne L(top)
+')
+
+C Store the last pending limb pair and return.
+L(end): strd r8, r9, [rp, #8]
+L(wd1): pop { r4-r5,r8-r9 }
+ bx r14
+C Exit used by the 8-way loop; rp has not been advanced past the first half
+C of the pass, hence the larger store offset.
+ifelse(UNROLL,4x2,`
+L(dne): strd r8, r9, [rp, #24]
+ pop { r4-r5,r8-r9 }
+ bx r14
+')
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/gmp-mparam.h b/gmp/mpn/arm/v7a/cora15/gmp-mparam.h
new file mode 100644
index 0000000000..2a06532b3e
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/gmp-mparam.h
@@ -0,0 +1,197 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010, 2012-2014 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 1700MHz Cortex-A15 with Neon (in spite of file position) */
+/* FFT tuning limit = 25000000 */
+/* Generated by tuneup.c, 2014-03-12, gcc 4.6 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1
+#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 15
+
+#define MUL_TOOM22_THRESHOLD 23
+#define MUL_TOOM33_THRESHOLD 90
+#define MUL_TOOM44_THRESHOLD 262
+#define MUL_TOOM6H_THRESHOLD 351
+#define MUL_TOOM8H_THRESHOLD 557
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 90
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 160
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 169
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 43
+#define SQR_TOOM3_THRESHOLD 138
+#define SQR_TOOM4_THRESHOLD 363
+#define SQR_TOOM6_THRESHOLD 517
+#define SQR_TOOM8_THRESHOLD 725
+
+#define MULMID_TOOM42_THRESHOLD 52
+
+#define MULMOD_BNM1_THRESHOLD 17
+#define SQRMOD_BNM1_THRESHOLD 23
+
+#define MUL_FFT_MODF_THRESHOLD 550 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 550, 5}, { 25, 6}, { 27, 7}, { 15, 6}, \
+ { 31, 7}, { 19, 6}, { 39, 7}, { 25, 6}, \
+ { 51, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 51, 8}, \
+ { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
+ { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
+ { 47, 8}, { 99, 9}, { 55,10}, { 31, 9}, \
+ { 79,10}, { 47, 9}, { 103,11}, { 31,10}, \
+ { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \
+ { 95, 9}, { 191,10}, { 111,11}, { 63,10}, \
+ { 159,11}, { 95,10}, { 191, 9}, { 383,10}, \
+ { 207,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,11}, { 159,10}, \
+ { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \
+ { 351,11}, { 191,10}, { 383, 9}, { 767,10}, \
+ { 399, 9}, { 799,10}, { 415,11}, { 223,12}, \
+ { 127,11}, { 255,10}, { 543,11}, { 287,10}, \
+ { 607,11}, { 319,10}, { 671,11}, { 351,12}, \
+ { 191,11}, { 383,10}, { 799,11}, { 415,10}, \
+ { 831,13}, { 127,12}, { 255,11}, { 543,10}, \
+ { 1087,11}, { 607,12}, { 319,11}, { 671,10}, \
+ { 1343,11}, { 735,12}, { 383,11}, { 799,10}, \
+ { 1599,11}, { 831,12}, { 447,11}, { 895,13}, \
+ { 255,12}, { 511,11}, { 1023,12}, { 575,11}, \
+ { 1151,12}, { 639,11}, { 1279,12}, { 703,13}, \
+ { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \
+ { 1663,12}, { 895,13}, { 511,12}, { 1087,13}, \
+ { 639,12}, { 1407,13}, { 767,12}, { 1599,13}, \
+ { 895,14}, { 511,13}, { 1023,12}, { 2111,13}, \
+ { 1151,12}, { 2431,13}, { 1279,14}, { 767,13}, \
+ { 1535,12}, { 3071,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1279,13}, { 2559,12}, { 5119,13}, \
+ { 2815,12}, { 5631,13}, { 2943,14}, { 16384,15}, \
+ { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 137
+#define MUL_FFT_THRESHOLD 5760
+
+#define SQR_FFT_MODF_THRESHOLD 525 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 525, 5}, { 25, 6}, { 27, 7}, { 15, 6}, \
+ { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 25, 6}, { 51, 7}, { 27, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 51, 8}, { 27, 7}, { 55, 9}, \
+ { 15, 8}, { 31, 7}, { 63, 8}, { 39, 9}, \
+ { 23, 8}, { 55,10}, { 15, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \
+ { 191,10}, { 111,11}, { 63,10}, { 143, 9}, \
+ { 287,10}, { 159,11}, { 95,10}, { 191, 9}, \
+ { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \
+ { 159,10}, { 335, 9}, { 671,10}, { 351,11}, \
+ { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \
+ { 799,10}, { 415,11}, { 223,12}, { 127,11}, \
+ { 255,10}, { 543,11}, { 287,10}, { 607,11}, \
+ { 319,10}, { 671,11}, { 351,12}, { 191,11}, \
+ { 383,10}, { 799,11}, { 415,10}, { 831,13}, \
+ { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \
+ { 607,12}, { 319,11}, { 671,10}, { 1343,11}, \
+ { 735,12}, { 383,11}, { 799,10}, { 1599,11}, \
+ { 831,12}, { 447,11}, { 895,12}, { 511,11}, \
+ { 1023,12}, { 575,11}, { 1151,12}, { 639,11}, \
+ { 1343,12}, { 703,13}, { 383,12}, { 767,11}, \
+ { 1599,12}, { 831,11}, { 1663,12}, { 895,13}, \
+ { 511,12}, { 1087,13}, { 639,12}, { 1407,13}, \
+ { 767,12}, { 1727,13}, { 895,14}, { 511,13}, \
+ { 1023,12}, { 2047,13}, { 1151,12}, { 2431,13}, \
+ { 1279,14}, { 767,13}, { 1535,12}, { 3071,15}, \
+ { 511,14}, { 1023,13}, { 2047,12}, { 4095,13}, \
+ { 2175,14}, { 1279,13}, { 2559,12}, { 5119,13}, \
+ { 2687,14}, { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 139
+#define SQR_FFT_THRESHOLD 4736
+
+#define MULLO_BASECASE_THRESHOLD 9
+#define MULLO_DC_THRESHOLD 39
+#define MULLO_MUL_N_THRESHOLD 11278
+
+#define DC_DIV_QR_THRESHOLD 54
+#define DC_DIVAPPR_Q_THRESHOLD 296
+#define DC_BDIV_QR_THRESHOLD 52
+#define DC_BDIV_Q_THRESHOLD 300
+
+#define INV_MULMOD_BNM1_THRESHOLD 44
+#define INV_NEWTON_THRESHOLD 294
+#define INV_APPR_THRESHOLD 294
+
+#define BINV_NEWTON_THRESHOLD 375
+#define REDC_1_TO_REDC_2_THRESHOLD 102
+#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */
+
+#define MU_DIV_QR_THRESHOLD 1718
+#define MU_DIVAPPR_Q_THRESHOLD 1718
+#define MUPI_DIV_QR_THRESHOLD 108
+#define MU_BDIV_QR_THRESHOLD 1528
+#define MU_BDIV_Q_THRESHOLD 1718
+
+#define POWM_SEC_TABLE 3,32,70,416,1464
+
+#define MATRIX22_STRASSEN_THRESHOLD 22
+#define HGCD_THRESHOLD 152
+#define HGCD_APPR_THRESHOLD 230
+#define HGCD_REDUCE_THRESHOLD 3259
+#define GCD_DC_THRESHOLD 702
+#define GCDEXT_DC_THRESHOLD 538
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 18
+#define GET_STR_PRECOMPUTE_THRESHOLD 32
+#define SET_STR_DC_THRESHOLD 119
+#define SET_STR_PRECOMPUTE_THRESHOLD 1063
+
+#define FAC_DSC_THRESHOLD 262
+#define FAC_ODD_THRESHOLD 26
diff --git a/gmp/mpn/arm/v7a/cora15/logops_n.asm b/gmp/mpn/arm/v7a/cora15/logops_n.asm
new file mode 100644
index 0000000000..06026143e1
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/logops_n.asm
@@ -0,0 +1,253 @@
+dnl ARM mpn_and_n, mpn_andn_n, mpn_nand_n, etc, optimised for A15.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb
+C and andn ior xor nand iorn nior xnor
+C StrongARM ? ?
+C XScale ? ?
+C Cortex-A7 ? ?
+C Cortex-A8 ? ?
+C Cortex-A9 3.5 3.56
+C Cortex-A15 1.27 1.64
+
+C This is great A15 core register code, but it is a bit large.
+C We use FEEDIN_VARIANT 1 to save some space, but use 8-way unrolling.
+
+C Architecture requirements:
+C v5 -
+C v5t -
+C v5te ldrd strd
+C v6 -
+C v6t2 -
+C v7a -
+
+C Build-time selectors for the ifelse blocks in the function body:
+C FEEDIN_VARIANT picks one of three feed-in sequences, UNROLL picks the loop.
+define(`FEEDIN_VARIANT', 1) C alternatives: 0 1 2
+define(`UNROLL', 4x2) C alternatives: 4 4x2
+
+C Argument registers: rp = destination, up and vp = sources, n = limb count.
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n', `r3')
+
+C POSTOP(r) is applied to each result register before it is stored. It is
+C empty by default and redefined to "mvn r, r" by the negated operations.
+define(`POSTOP')
+
+C LOGOP(d, a, b) is the core instruction; the function body always calls it
+C with a = limb loaded from up and b = limb loaded from vp.
+ifdef(`OPERATION_and_n',`
+ define(`func', `mpn_and_n')
+ define(`LOGOP', `and $1, $2, $3')')
+ifdef(`OPERATION_andn_n',`
+ define(`func', `mpn_andn_n')
+ define(`LOGOP', `bic $1, $2, $3')')
+ifdef(`OPERATION_nand_n',`
+ define(`func', `mpn_nand_n')
+ define(`POSTOP', `mvn $1, $1')
+ define(`LOGOP', `and $1, $2, $3')')
+ifdef(`OPERATION_ior_n',`
+ define(`func', `mpn_ior_n')
+ define(`LOGOP', `orr $1, $2, $3')')
+C iorn swaps the bic operands: bic gives b AND NOT a, and the mvn in POSTOP
+C turns that into a OR NOT b.
+ifdef(`OPERATION_iorn_n',`
+ define(`func', `mpn_iorn_n')
+ define(`POSTOP', `mvn $1, $1')
+ define(`LOGOP', `bic $1, $3, $2')')
+ifdef(`OPERATION_nior_n',`
+ define(`func', `mpn_nior_n')
+ define(`POSTOP', `mvn $1, $1')
+ define(`LOGOP', `orr $1, $2, $3')')
+ifdef(`OPERATION_xor_n',`
+ define(`func', `mpn_xor_n')
+ define(`LOGOP', `eor $1, $2, $3')')
+ifdef(`OPERATION_xnor_n',`
+ define(`func', `mpn_xnor_n')
+ define(`POSTOP', `mvn $1, $1')
+ define(`LOGOP', `eor $1, $2, $3')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+ASM_START()
+C func (rp, up, vp, n): apply LOGOP, followed by POSTOP, to each pair of limbs
+C read from up and vp, and store the result at rp.
+C Exactly one feed-in block and one loop block below are assembled, chosen by
+C FEEDIN_VARIANT and UNROLL. The feed-in handles n mod 4 limbs and biases the
+C pointers so that the fixed offsets used in the loop line up.
+PROLOGUE(func)
+ push { r4-r9 }
+
+C Feed-in variant 0.
+ifelse(FEEDIN_VARIANT,0,`
+ ands r6, n, #3
+ mov n, n, lsr #2
+ beq L(b00a)
+ tst r6, #1
+ beq L(bx0)
+ ldr r5, [up], #4
+ ldr r7, [vp], #4
+ LOGOP( r9, r5, r7)
+ POSTOP( r9)
+ str r9, [rp], #4
+ tst r6, #2
+ beq L(b00)
+L(bx0): ldrd r4, r5, [up, #0]
+ ldrd r6, r7, [vp, #0]
+ sub rp, rp, #8
+ b L(lo)
+L(b00): tst n, n
+ beq L(wd1)
+L(b00a):ldrd r4, r5, [up], #-8
+ ldrd r6, r7, [vp], #-8
+ sub rp, rp, #16
+ b L(mid)
+')
+C Feed-in variant 1: test bit 0 and bit 1 of n one after the other. r6 keeps
+C n mod 4 until the second test; the single-limb step only uses r5, r7, r9.
+ifelse(FEEDIN_VARIANT,1,`
+ and r6, n, #3 C r6 = n mod 4
+ mov n, n, lsr #2 C n = number of 4-limb groups
+ tst r6, #1
+ beq L(bx0)
+ ldr r5, [up], #4 C odd n: process one limb up front
+ ldr r7, [vp], #4
+ LOGOP( r9, r5, r7)
+ POSTOP( r9)
+ str r9, [rp], #4
+L(bx0): tst r6, #2
+ beq L(b00)
+ ldrd r4, r5, [up, #0] C two leftover limbs: finish them at L(lo)
+ ldrd r6, r7, [vp, #0]
+ sub rp, rp, #8
+ b L(lo)
+L(b00): tst n, n
+ beq L(wd1) C no 4-limb groups: nothing more to do
+ ldrd r4, r5, [up], #-8
+ ldrd r6, r7, [vp], #-8
+ sub rp, rp, #16
+ b L(mid)
+')
+C Feed-in variant 2: four-way dispatch on n mod 4.
+ifelse(FEEDIN_VARIANT,2,`
+ ands r6, n, #3
+ mov n, n, lsr #2
+ beq L(b00)
+ cmp r6, #2
+ bcc L(b01)
+ beq L(b10)
+
+L(b11): ldr r5, [up], #4
+ ldr r7, [vp], #4
+ LOGOP( r9, r5, r7)
+ ldrd r4, r5, [up, #0]
+ ldrd r6, r7, [vp, #0]
+ POSTOP( r9)
+ str r9, [rp], #-4
+ b L(lo)
+
+L(b00): ldrd r4, r5, [up], #-8
+ ldrd r6, r7, [vp], #-8
+ sub rp, rp, #16
+ b L(mid)
+
+L(b01): ldr r5, [up], #-4
+ ldr r7, [vp], #-4
+ LOGOP( r9, r5, r7)
+ POSTOP( r9)
+ str r9, [rp], #-12
+ tst n, n
+ beq L(wd1)
+L(gt1): ldrd r4, r5, [up, #8]
+ ldrd r6, r7, [vp, #8]
+ b L(mid)
+
+L(b10): ldrd r4, r5, [up]
+ ldrd r6, r7, [vp]
+ sub rp, rp, #8
+ b L(lo)
+')
+ ALIGN(16)
+C Loop, 4 limbs per pass: n is decremented by 1 each time round.
+ifelse(UNROLL,4,`
+L(top): ldrd r4, r5, [up, #8]
+ ldrd r6, r7, [vp, #8]
+ POSTOP( r8)
+ POSTOP( r9)
+ strd r8, r9, [rp, #8]
+L(mid): LOGOP( r8, r4, r6)
+ LOGOP( r9, r5, r7)
+ ldrd r4, r5, [up, #16]!
+ ldrd r6, r7, [vp, #16]!
+ POSTOP( r8)
+ POSTOP( r9)
+ strd r8, r9, [rp, #16]!
+ sub n, n, #1
+L(lo): LOGOP( r8, r4, r6)
+ LOGOP( r9, r5, r7)
+ tst n, n
+ bne L(top)
+')
+C Loop, 8 limbs per pass: n is decremented by 2, with an early exit to L(dne)
+C when only one 4-limb group was left. POSTOP is applied just before each
+C store, after the loads for the next limb pair have been issued.
+ifelse(UNROLL,4x2,`
+L(top): ldrd r4, r5, [up, #8]
+ ldrd r6, r7, [vp, #8]
+ POSTOP( r8)
+ POSTOP( r9)
+ strd r8, r9, [rp, #8]
+L(mid): LOGOP( r8, r4, r6)
+ LOGOP( r9, r5, r7)
+ ldrd r4, r5, [up, #16]
+ ldrd r6, r7, [vp, #16]
+ POSTOP( r8)
+ POSTOP( r9)
+ strd r8, r9, [rp, #16]
+ LOGOP( r8, r4, r6)
+ LOGOP( r9, r5, r7)
+ sub n, n, #2
+ tst n, n
+ bmi L(dne)
+ ldrd r4, r5, [up, #24]
+ ldrd r6, r7, [vp, #24]
+ POSTOP( r8)
+ POSTOP( r9)
+ strd r8, r9, [rp, #24]
+ LOGOP( r8, r4, r6)
+ LOGOP( r9, r5, r7)
+ ldrd r4, r5, [up, #32]!
+ ldrd r6, r7, [vp, #32]!
+ POSTOP( r8)
+ POSTOP( r9)
+ strd r8, r9, [rp, #32]!
+L(lo): LOGOP( r8, r4, r6)
+ LOGOP( r9, r5, r7)
+ tst n, n
+ bne L(top)
+')
+
+C Finish and store the last pending limb pair, then return.
+L(end): POSTOP( r8)
+ POSTOP( r9)
+ strd r8, r9, [rp, #8]
+L(wd1): pop { r4-r9 }
+ bx r14
+C Exit used by the 8-way loop; rp has not been advanced past the first half
+C of the pass, hence the larger store offset.
+ifelse(UNROLL,4x2,`
+L(dne): POSTOP( r8)
+ POSTOP( r9)
+ strd r8, r9, [rp, #24]
+ pop { r4-r9 }
+ bx r14
+')
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/mul_1.asm b/gmp/mpn/arm/v7a/cora15/mul_1.asm
new file mode 100644
index 0000000000..766ba5c57f
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/mul_1.asm
@@ -0,0 +1,104 @@
+dnl ARM mpn_mul_1 optimised for A15.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb best
+C StrongARM: -
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 5.25 3.25
+C Cortex-A15 2.25 this
+
+
+C This runs well on A15 but very poorly on A9. By scheduling loads and adds
+C it is possible to get good A9 performance as well, but at the cost of using
+C many more (callee-saves) registers.
+
+C This is armv5 code, optimized for the armv7a cpu A15. Its location in the
+C GMP file structure might be misleading.
+
+
+C Argument registers: rp = destination limbs, up = source limbs,
+C n = limb count, v0 = the single-limb multiplier.
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+define(`v0', `r3')
+
+ASM_START()
+C mpn_mul_1c (rp, up, n, v0, cy): same as mpn_mul_1, but the limb added to the
+C lowest product is loaded from the word at [sp] instead of being zero.
+PROLOGUE(mpn_mul_1c)
+ ldr r12, [sp]
+ b L(ent)
+EPILOGUE()
+C mpn_mul_1 (rp, up, n, v0): multiply the limbs at up by v0, store the low
+C limbs of the result at rp and return the most significant limb in r0.
+C The loop is unrolled four ways; the low two bits of n select the entry
+C point L(lo0)..L(lo3). r4 is the limb about to be stored, and r5 and r7
+C alternate as the high half carried into the next limb.
+PROLOGUE(mpn_mul_1)
+ mov r12, #0 C no carry-in limb
+L(ent): push {r4-r7}
+
+ ldr r6, [up], #4
+ tst n, #1
+ beq L(bx0)
+
+C Odd n. tst with these immediates leaves the carry from adds untouched.
+L(bx1): umull r4, r7, r6, v0
+ adds r4, r4, r12
+ tst n, #2
+ beq L(lo1)
+ b L(lo3)
+
+C Even n.
+L(bx0): umull r4, r5, r6, v0
+ adds r4, r4, r12
+ tst n, #2
+ beq L(lo0)
+ b L(lo2)
+
+C adds, not adcs: the carry was folded into r7 at L(lo1) before subs
+C overwrote the flags.
+L(top): ldr r6, [up], #4
+ str r4, [rp], #4
+ umull r4, r5, r6, v0
+ adds r4, r4, r7
+L(lo0): ldr r6, [up], #4
+ str r4, [rp], #4
+ umull r4, r7, r6, v0
+ adcs r4, r4, r5
+L(lo3): ldr r6, [up], #4
+ str r4, [rp], #4
+ umull r4, r5, r6, v0
+ adcs r4, r4, r7
+L(lo2): ldr r6, [up], #4
+ str r4, [rp], #4
+ umull r4, r7, r6, v0
+ adcs r4, r4, r5
+L(lo1): adc r7, r7, #0 C fold the carry flag into the high limb
+ subs n, n, #4
+ bgt L(top)
+
+ str r4, [rp] C last low limb
+ mov r0, r7 C return the final high limb
+ pop {r4-r7}
+ bx lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm
new file mode 100644
index 0000000000..d8cfe3f78f
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm
@@ -0,0 +1,43 @@
+dnl ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Wrapper that builds the shift-by-1 variants.  It fixes the shift count LSH
+C at 1, maps whichever OPERATION_ symbol is defined (presumably supplied by
+C the build system) onto DO_add, DO_sub or DO_rsb, and then pulls in the
+C shared implementation from aorsorrlshC_n.asm.
+define(LSH, 1)
+
+ifdef(`OPERATION_addlsh1_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')')
+
+C Names of the functions this file can produce.
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n)
+
+include_mpn(`arm/v7a/cora15/neon/aorsorrlshC_n.asm')
diff --git a/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm
new file mode 100644
index 0000000000..b48204d926
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm
@@ -0,0 +1,43 @@
+dnl ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Wrapper that builds the shift-by-2 variants.  It fixes the shift count LSH
+C at 2, maps whichever OPERATION_ symbol is defined (presumably supplied by
+C the build system) onto DO_add, DO_sub or DO_rsb, and then pulls in the
+C shared implementation from aorsorrlshC_n.asm.
+define(LSH, 2)
+
+ifdef(`OPERATION_addlsh2_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')')
+
+C Names of the functions this file can produce.
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n)
+
+include_mpn(`arm/v7a/cora15/neon/aorsorrlshC_n.asm')
diff --git a/gmp/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm
new file mode 100644
index 0000000000..16c34a2699
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm
@@ -0,0 +1,144 @@
+dnl ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+C cycles/limb
+C StrongARM -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 5.25
+C Cortex-A15 2.25
+
+C TODO
+C * Consider using 4-way feed-in code.
+C * This is ad-hoc scheduled, perhaps unnecessarily so for A15, and perhaps
+C insufficiently for A7 and A8.
+
+C Shared body for the addlsh, sublsh and rsblsh functions.  The including
+C file must define LSH (the shift count) and one of DO_add, DO_sub, DO_rsb.
+C
+C The vp operand is shifted left by LSH bits in Neon registers and combined
+C limb by limb with the up operand in core registers; results go to rp.
+C
+C Register roles:
+C   rp (r0)  destination pointer
+C   up (r1)  pointer to the operand that is used unshifted
+C   vp (r2)  pointer to the operand that is shifted left by LSH
+C   n  (r3)  limb count
+C   d0       bits shifted out of the vp limbs handled so far (carry-in for
+C            the next shift); its low word feeds the return value
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n', `r3')
+
+C Per-operation macros:
+C   ADCSBCS  one limb step that consumes and produces the carry flag
+C   CLRCY    puts the carry flag in the neutral state for the operation
+C   RETVAL   forms the return value in r0 from the shifted-out bits (first
+C            argument) and the final carry flag
+C   func     name of the generated function
+C
+C Add: result limb is up limb plus shifted vp limb.  CLRCY adds 1 to the stack
+C pointer value, which is expected to leave carry clear.
+ifdef(`DO_add', `
+ define(`ADCSBCS', `adcs $1, $2, $3')
+ define(`CLRCY', `cmn r13, #1')
+ define(`RETVAL', `adc r0, $1, #0')
+ define(`func', mpn_addlsh`'LSH`'_n)')
+C Sub: result limb is up limb minus shifted vp limb.  CLRCY compares the stack
+C pointer with 0, which leaves carry set, meaning no borrow.  RETVAL turns the
+C final borrow into a carry and adds it to the shifted-out bits; the second
+C argument is used as a scratch register.
+ifdef(`DO_sub', `
+ define(`ADCSBCS', `sbcs $1, $2, $3')
+ define(`CLRCY', `cmp r13, #0')
+ define(`RETVAL', `sbc $2, $2, $2
+ cmn $2, #1
+ adc r0, $1, #0')
+ define(`func', mpn_sublsh`'LSH`'_n)')
+C Rsb: result limb is shifted vp limb minus up limb (operands swapped).
+C RETVAL subtracts the final borrow from the shifted-out bits.
+ifdef(`DO_rsb', `
+ define(`ADCSBCS', `sbcs $1, $3, $2')
+ define(`CLRCY', `cmp r13, #0')
+ define(`RETVAL', `sbc r0, $1, #0')
+ define(`func', mpn_rsblsh`'LSH`'_n)')
+
+
+ASM_START()
+PROLOGUE(func)
+C Save r4-r10, clear the shift carry-in register d0, and put the carry flag in
+C its neutral state.  The tst and bics instructions used for dispatch below
+C sit between carry-dependent steps, so the code relies on them leaving the
+C carry flag unchanged.
+ push {r4-r10}
+ vmov.i8 d0, #0 C could feed carry through here
+ CLRCY
+ tst n, #1
+ beq L(bb0)
+
+C n is odd: handle one limb.  Only lane 0 of d3 is loaded, and only lane 0 of
+C d0 is read back.
+L(bb1): vld1.32 {d3[0]}, [vp]!
+ vsli.u32 d0, d3, #LSH
+ ldr r12, [up], #4
+ vmov.32 r5, d0[0]
+ vshr.u32 d0, d3, #32-LSH
+ ADCSBCS( r12, r12, r5)
+ str r12, [rp], #4
+ bics n, n, #1
+ beq L(rtn)
+
+L(bb0): tst n, #2
+ beq L(b00)
+
+C Bit 1 of n is set: handle two limbs as one 64-bit shift.
+L(b10): vld1.32 {d3}, [vp]!
+ vsli.u64 d0, d3, #LSH
+ ldmia up!, {r10,r12}
+ vmov r4, r5, d0
+ vshr.u64 d0, d3, #64-LSH
+ ADCSBCS( r10, r10, r4)
+ ADCSBCS( r12, r12, r5)
+ stmia rp!, {r10,r12}
+ bics n, n, #2
+ beq L(rtn)
+
+C The remaining count is a multiple of 4.  Pre-shift the first four vp limbs:
+C the low pair goes to r6,r7, the high pair stays in d1, and d0 receives the
+C bits shifted out of the high pair.
+L(b00): vld1.32 {d2}, [vp]!
+ vsli.u64 d0, d2, #LSH
+ vshr.u64 d1, d2, #64-LSH
+ vld1.32 {d3}, [vp]!
+ vsli.u64 d1, d3, #LSH
+ vmov r6, r7, d0
+ vshr.u64 d0, d3, #64-LSH
+ sub n, n, #4
+ tst n, n
+ beq L(end)
+
+C Main loop, four limbs per round.  The shifted vp limbs for this round are
+C already in r6,r7 and d1; while they are combined with four up limbs, the
+C next four vp limbs are loaded and shifted for the following round.
+ ALIGN(16)
+L(top): ldmia up!, {r8,r9,r10,r12}
+ vld1.32 {d2}, [vp]!
+ vsli.u64 d0, d2, #LSH
+ vmov r4, r5, d1
+ vshr.u64 d1, d2, #64-LSH
+ ADCSBCS( r8, r8, r6)
+ ADCSBCS( r9, r9, r7)
+ vld1.32 {d3}, [vp]!
+ vsli.u64 d1, d3, #LSH
+ vmov r6, r7, d0
+ vshr.u64 d0, d3, #64-LSH
+ ADCSBCS( r10, r10, r4)
+ ADCSBCS( r12, r12, r5)
+ stmia rp!, {r8,r9,r10,r12}
+ sub n, n, #4
+ tst n, n
+ bne L(top)
+
+C Last four limbs: the shifted vp data is already available, so no more vp
+C loads are made.
+L(end): ldmia up!, {r8,r9,r10,r12}
+ vmov r4, r5, d1
+ ADCSBCS( r8, r8, r6)
+ ADCSBCS( r9, r9, r7)
+ ADCSBCS( r10, r10, r4)
+ ADCSBCS( r12, r12, r5)
+ stmia rp!, {r8,r9,r10,r12}
+C Return value: the bits shifted out of the most significant vp limb, adjusted
+C by the final carry or borrow.  r1 (up) is no longer needed and serves as the
+C scratch register for RETVAL.
+L(rtn): vmov.32 r0, d0[0]
+ RETVAL( r0, r1)
+ pop {r4-r10}
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/neon/com.asm b/gmp/mpn/arm/v7a/cora15/neon/com.asm
new file mode 100644
index 0000000000..9e7a629287
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/neon/com.asm
@@ -0,0 +1,97 @@
+dnl ARM Neon mpn_com optimised for A15.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM ?
+C XScale ?
+C Cortex-A8 ?
+C Cortex-A9 2.1
+C Cortex-A15 0.65
+
+C mpn_com: store the bitwise complement of the n limbs read through up to the
+C area addressed by rp.  Nothing is returned in a register by this code.
+C
+C Register roles:
+C   rp (r0)  destination pointer
+C   up (r1)  source pointer
+C   n  (r2)  limb count
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+
+ASM_START()
+PROLOGUE(mpn_com)
+C Counts of 7 or less go straight to the tail code.
+ cmp n, #7
+ ble L(bc)
+
+C Perform a few initial operations until rp is 128-bit aligned
+C First one limb if bit 2 of rp is set, then two limbs if bit 3 is set.
+ tst rp, #4
+ beq L(al1)
+ vld1.32 {d0[0]}, [up]!
+ sub n, n, #1
+ vmvn d0, d0
+ vst1.32 {d0[0]}, [rp]!
+L(al1): tst rp, #8
+ beq L(al2)
+ vld1.32 {d0}, [up]!
+ sub n, n, #2
+ vmvn d0, d0
+ vst1.32 {d0}, [rp:64]!
+C Preload four limbs into q2.  n is then biased by 12: 4 for the limbs just
+C loaded and 8 for one loop round.  Only the low three bits of n are examined
+C after the loop, and those are not disturbed by the bias beyond the 4 limbs
+C that L(end) stores.
+L(al2): vld1.32 {q2}, [up]!
+ subs n, n, #12
+ blt L(end)
+
+C Main loop, eight limbs per round, software pipelined: q2 always holds four
+C loaded limbs that have not been complemented and stored yet.  The flags set
+C by subs are still intact at the bge, since the Neon instructions in between
+C do not write them.
+ ALIGN(16)
+L(top): vld1.32 {q0}, [up]!
+ vmvn q2, q2
+ subs n, n, #8
+ vst1.32 {q2}, [rp:128]!
+ vld1.32 {q2}, [up]!
+ vmvn q0, q0
+ vst1.32 {q0}, [rp:128]!
+ bge L(top)
+
+C Flush the four limbs still pending in q2.
+L(end): vmvn q2, q2
+ vst1.32 {q2}, [rp:128]!
+
+C Handle last 0-7 limbs. Note that rp is aligned after loop, but not when we
+C arrive here via L(bc)
+C Bits 2, 1 and 0 of n select a 4-limb, a 2-limb and a 1-limb step.
+L(bc): tst n, #4
+ beq L(tl1)
+ vld1.32 {q0}, [up]!
+ vmvn q0, q0
+ vst1.32 {q0}, [rp]!
+L(tl1): tst n, #2
+ beq L(tl2)
+ vld1.32 {d0}, [up]!
+ vmvn d0, d0
+ vst1.32 {d0}, [rp]!
+L(tl2): tst n, #1
+ beq L(tl3)
+ vld1.32 {d0[0]}, [up]
+ vmvn d0, d0
+ vst1.32 {d0[0]}, [rp]
+L(tl3): bx lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/neon/copyd.asm b/gmp/mpn/arm/v7a/cora15/neon/copyd.asm
new file mode 100644
index 0000000000..98fe535def
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/neon/copyd.asm
@@ -0,0 +1,110 @@
+dnl ARM Neon mpn_copyd optimised for A15.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 1.75 slower than core register code
+C Cortex-A15 0.52
+
+C mpn_copyd: copy n limbs from up to rp, working from the highest address
+C down to the lowest.
+C
+C Register roles:
+C   rp (r0)  destination pointer, moved to one past the last limb on entry
+C   up (r1)  source pointer, moved to one past the last limb on entry
+C   n  (r2)  limb count
+C   r12      constant -16, the post-index step used inside the main loop
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+C Point both pointers just past the end of their operands (4 bytes per limb).
+ add rp, rp, n, lsl #2
+ add up, up, n, lsl #2
+
+C Counts of 7 or less go straight to the tail code.
+ cmp n, #7
+ ble L(bc)
+
+C Copy until rp is 128-bit aligned
+C First one limb if bit 2 of rp is set, then two limbs if bit 3 is set.  Each
+C pointer is decremented before the access.
+ tst rp, #4
+ beq L(al1)
+ sub up, up, #4
+ vld1.32 {d22[0]}, [up]
+ sub n, n, #1
+ sub rp, rp, #4
+ vst1.32 {d22[0]}, [rp]
+L(al1): tst rp, #8
+ beq L(al2)
+ sub up, up, #8
+ vld1.32 {d22}, [up]
+ sub n, n, #2
+ sub rp, rp, #8
+ vst1.32 {d22}, [rp:64]
+C Preload four limbs into d26-d27.  n is then biased by 12: 4 for the limbs
+C just loaded and 8 for one loop round.
+L(al2): sub up, up, #16
+ vld1.32 {d26-d27}, [up]
+ subs n, n, #12
+ sub rp, rp, #16 C offset rp for loop
+ blt L(end)
+
+ sub up, up, #16 C offset up for loop
+ mov r12, #-16
+
+C Main loop, eight limbs per round, software pipelined: d26-d27 always hold
+C four loaded limbs that have not been stored yet.  Both pointers step down by
+C 16 bytes after each access.
+ ALIGN(16)
+L(top): vld1.32 {d22-d23}, [up], r12
+ vst1.32 {d26-d27}, [rp:128], r12
+ vld1.32 {d26-d27}, [up], r12
+ vst1.32 {d22-d23}, [rp:128], r12
+ subs n, n, #8
+ bge L(top)
+
+ add up, up, #16 C undo up offset
+ C rp offset undoing folded
+C Store the four limbs still pending in d26-d27.
+L(end): vst1.32 {d26-d27}, [rp:128]
+
+C Copy last 0-7 limbs. Note that rp is aligned after loop, but not when we
+C arrive here via L(bc)
+C Bits 2, 1 and 0 of n select a 4-limb, a 2-limb and a 1-limb step.
+L(bc): tst n, #4
+ beq L(tl1)
+ sub up, up, #16
+ vld1.32 {d22-d23}, [up]
+ sub rp, rp, #16
+ vst1.32 {d22-d23}, [rp]
+L(tl1): tst n, #2
+ beq L(tl2)
+ sub up, up, #8
+ vld1.32 {d22}, [up]
+ sub rp, rp, #8
+ vst1.32 {d22}, [rp]
+L(tl2): tst n, #1
+ beq L(tl3)
+ sub up, up, #4
+ vld1.32 {d22[0]}, [up]
+ sub rp, rp, #4
+ vst1.32 {d22[0]}, [rp]
+L(tl3): bx lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/neon/copyi.asm b/gmp/mpn/arm/v7a/cora15/neon/copyi.asm
new file mode 100644
index 0000000000..2e05afe5e8
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/neon/copyi.asm
@@ -0,0 +1,90 @@
+dnl ARM Neon mpn_copyi optimised for A15.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 1.75 slower than core register code
+C Cortex-A15 0.52
+
+C mpn_copyi: copy n limbs from up to rp, working from the lowest address up
+C to the highest.
+C
+C Register roles:
+C   rp (r0)  destination pointer, advanced by the post-indexed stores
+C   up (r1)  source pointer, advanced by the post-indexed loads
+C   n  (r2)  limb count
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+C Counts of 7 or less go straight to the tail code.
+ cmp n, #7
+ ble L(bc)
+
+C Copy until rp is 128-bit aligned
+C First one limb if bit 2 of rp is set, then two limbs if bit 3 is set.
+ tst rp, #4
+ beq L(al1)
+ vld1.32 {d22[0]}, [up]!
+ sub n, n, #1
+ vst1.32 {d22[0]}, [rp]!
+L(al1): tst rp, #8
+ beq L(al2)
+ vld1.32 {d22}, [up]!
+ sub n, n, #2
+ vst1.32 {d22}, [rp:64]!
+C Preload four limbs into d26-d27.  n is then biased by 12: 4 for the limbs
+C just loaded and 8 for one loop round.
+L(al2): vld1.32 {d26-d27}, [up]!
+ subs n, n, #12
+ blt L(end)
+
+C Main loop, eight limbs per round, software pipelined: d26-d27 always hold
+C four loaded limbs that have not been stored yet.
+ ALIGN(16)
+L(top): vld1.32 {d22-d23}, [up]!
+ vst1.32 {d26-d27}, [rp:128]!
+ vld1.32 {d26-d27}, [up]!
+ vst1.32 {d22-d23}, [rp:128]!
+ subs n, n, #8
+ bge L(top)
+
+C Store the four limbs still pending in d26-d27.
+L(end): vst1.32 {d26-d27}, [rp:128]!
+
+C Copy last 0-7 limbs. Note that rp is aligned after loop, but not when we
+C arrive here via L(bc)
+C Bits 2, 1 and 0 of n select a 4-limb, a 2-limb and a 1-limb step.
+L(bc): tst n, #4
+ beq L(tl1)
+ vld1.32 {d22-d23}, [up]!
+ vst1.32 {d22-d23}, [rp]!
+L(tl1): tst n, #2
+ beq L(tl2)
+ vld1.32 {d22}, [up]!
+ vst1.32 {d22}, [rp]!
+L(tl2): tst n, #1
+ beq L(tl3)
+ vld1.32 {d22[0]}, [up]
+ vst1.32 {d22[0]}, [rp]
+L(tl3): bx lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm b/gmp/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm
new file mode 100644
index 0000000000..2c11d6debd
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm
@@ -0,0 +1,177 @@
+dnl ARM Neon mpn_rsh1add_n, mpn_rsh1sub_n.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 4-5
+C Cortex-A15 2.5
+
+C TODO
+C * Try to make this smaller, its size (384 bytes) is excessive.
+C * Try to reach 2.25 c/l on A15, to match the addlsh_1 family.
+C * This is ad-hoc scheduled, perhaps unnecessarily so for A15, and perhaps
+C insufficiently for A7 and A8.
+
+C mpn_rsh1add_n and mpn_rsh1sub_n: form the n-limb sum (or difference) of the
+C operands at up and vp in core registers, shift it right by one bit in Neon
+C registers, and store the result through rp.  The carry (or borrow) of the
+C sum becomes the most significant bit of the result, and the bit shifted out
+C at the bottom is returned in r0.
+C
+C Register roles:
+C   rp (r0)  destination pointer
+C   up (r1)  first source pointer
+C   vp (r2)  second source pointer
+C   n  (r3)  limb count
+C   d4       holds the least significant sum limb; its bit 0 is the return
+C            value
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n', `r3')
+
+C Per-operation macros:
+C   ADDSUBS  first limb step, which starts a fresh carry chain
+C   ADCSBCS  later limb steps, which continue the carry chain
+C   IFADD    expands its argument only in the add variant
+C   IFSUB    expands its argument only in the sub variant
+C   func     name of the generated function
+ifdef(`OPERATION_rsh1add_n', `
+ define(`ADDSUBS', `adds $1, $2, $3')
+ define(`ADCSBCS', `adcs $1, $2, $3')
+ define(`IFADD', `$1')
+ define(`IFSUB', `')
+ define(`func', mpn_rsh1add_n)')
+ifdef(`OPERATION_rsh1sub_n', `
+ define(`ADDSUBS', `subs $1, $2, $3')
+ define(`ADCSBCS', `sbcs $1, $2, $3')
+ define(`IFADD', `')
+ define(`IFSUB', `$1')
+ define(`func', mpn_rsh1sub_n)')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
+
+ASM_START()
+PROLOGUE(func)
+ push {r4-r10}
+
+C Dispatch on n mod 4.  The carry chain is started in the selected feed-in
+C block and stays live until the top bit is formed near the end, so the bics,
+C sub and tst instructions in between are relied on to leave carry unchanged.
+ ands r4, n, #3
+ beq L(b00)
+ cmp r4, #2
+ blo L(b01)
+ beq L(b10)
+
+C n mod 4 = 3: process three limbs.  The lowest result limb is complete once
+C bit 0 of the second sum limb has been inserted, so it is stored right away;
+C the upper two limbs wait in d2 for their top bit.
+L(b11): ldmia up!, {r9,r10,r12}
+ ldmia vp!, {r5,r6,r7}
+ ADDSUBS( r9, r9, r5)
+ vmov d4, r9, r9
+ ADCSBCS( r10, r10, r6)
+ ADCSBCS( r12, r12, r7)
+ vshr.u64 d3, d4, #1
+ vmov d1, r10, r12
+ vsli.u64 d3, d1, #31
+ vshr.u64 d2, d1, #1
+ vst1.32 d3[0], [rp]!
+ bics n, n, #3
+ beq L(wd2)
+L(gt3): ldmia up!, {r8,r9,r10,r12}
+ ldmia vp!, {r4,r5,r6,r7}
+ b L(mi0)
+
+C n mod 4 = 2: process two limbs; the shifted pair waits in d2.
+L(b10): ldmia up!, {r10,r12}
+ ldmia vp!, {r6,r7}
+ ADDSUBS( r10, r10, r6)
+ ADCSBCS( r12, r12, r7)
+ vmov d4, r10, r12
+ bics n, n, #2
+ vshr.u64 d2, d4, #1
+ beq L(wd2)
+L(gt2): ldmia up!, {r8,r9,r10,r12}
+ ldmia vp!, {r4,r5,r6,r7}
+ b L(mi0)
+
+C n mod 4 = 1: process one limb.  For n = 1 the whole job is finished here in
+C core registers.
+L(b01): ldr r12, [up], #4
+ ldr r7, [vp], #4
+ ADDSUBS( r12, r12, r7)
+ vmov d4, r12, r12
+ bics n, n, #1
+ bne L(gt1)
+ mov r5, r12, lsr #1
+C n is zero here.  Bit 0 of r1 becomes the carry (add) or the inverted carry,
+C that is the borrow (sub); bfi then places it in bit 31 of the result limb.
+IFADD(` adc r1, n, #0')
+IFSUB(` adc r1, n, #1')
+ bfi r5, r1, #31, #1
+ str r5, [rp]
+ and r0, r12, #1
+ pop {r4-r10}
+ bx r14
+C n mod 4 = 1 with more limbs to come: add the next four limbs, complete and
+C store the lowest result limb, then join the loop at L(mi1).
+L(gt1): ldmia up!, {r8,r9,r10,r12}
+ ldmia vp!, {r4,r5,r6,r7}
+ vshr.u64 d2, d4, #1
+ ADCSBCS( r8, r8, r4)
+ ADCSBCS( r9, r9, r5)
+ vmov d0, r8, r9
+ ADCSBCS( r10, r10, r6)
+ ADCSBCS( r12, r12, r7)
+ vsli.u64 d2, d0, #31
+ vshr.u64 d3, d0, #1
+ vst1.32 d2[0], [rp]!
+ b L(mi1)
+
+C n mod 4 = 0: add the first four limbs; the shifted low pair waits in d3.
+L(b00): ldmia up!, {r8,r9,r10,r12}
+ ldmia vp!, {r4,r5,r6,r7}
+ ADDSUBS( r8, r8, r4)
+ ADCSBCS( r9, r9, r5)
+ vmov d4, r8, r9
+ ADCSBCS( r10, r10, r6)
+ ADCSBCS( r12, r12, r7)
+ vshr.u64 d3, d4, #1
+ b L(mi1)
+
+C Main loop, four limbs per round.  A pending shifted pair (d3 or d2) is
+C completed by inserting bit 0 of the next sum pair into its bit 63 and is
+C then stored; the next pair is shifted and becomes the new pending pair.
+ ALIGN(16)
+L(top): ldmia up!, {r8,r9,r10,r12}
+ ldmia vp!, {r4,r5,r6,r7}
+ vsli.u64 d3, d1, #63
+ vshr.u64 d2, d1, #1
+ vst1.32 d3, [rp]!
+L(mi0): ADCSBCS( r8, r8, r4)
+ ADCSBCS( r9, r9, r5)
+ vmov d0, r8, r9
+ ADCSBCS( r10, r10, r6)
+ ADCSBCS( r12, r12, r7)
+ vsli.u64 d2, d0, #63
+ vshr.u64 d3, d0, #1
+ vst1.32 d2, [rp]!
+L(mi1): vmov d1, r10, r12
+ sub n, n, #4
+ tst n, n
+ bne L(top)
+
+C Wind down: complete and store the pair pending in d3, then finish the top
+C pair in core registers so the final carry can be inserted as bit 31 of the
+C most significant limb.
+L(end): vsli.u64 d3, d1, #63
+ vshr.u64 d2, d1, #1
+ vst1.32 d3, [rp]!
+L(wd2): vmov r4, r5, d2
+C n is zero here.  Bit 0 of r1 becomes the carry (add) or the borrow (sub).
+IFADD(` adc r1, n, #0')
+IFSUB(` adc r1, n, #1')
+ bfi r5, r1, #31, #1
+ stm rp, {r4,r5}
+
+C Return bit 0 of the least significant sum limb, the bit that was shifted out.
+L(rtn): vmov.32 r0, d4[0]
+ and r0, r0, #1
+ pop {r4-r10}
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/submul_1.asm b/gmp/mpn/arm/v7a/cora15/submul_1.asm
new file mode 100644
index 0000000000..ed7bfe820b
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/submul_1.asm
@@ -0,0 +1,159 @@
+dnl ARM mpn_submul_1 optimised for A15.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb best
+C StrongARM: -
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 5.75 3.75
+C Cortex-A15 2.32 this
+
+C This code uses umlal and umaal for adding in the rp[] data, keeping the
+C recurrency path separate from any multiply instructions. It performs well on
+C A15, but not quite at the multiply bandwidth like the corresponding addmul_1
+C code.
+C
+C We don't use r12 due to ldrd and strd limitations.
+C
+C This loop complements U on the fly,
+C U' = B^n - 1 - U
+C and then uses that
+C R - U*v = R + U'*v + v - B^n v
+
+C Architecture requirements:
+C v5 -
+C v5t -
+C v5te ldrd strd
+C v6 umaal
+C v6t2 -
+C v7a -
+
+C mpn_submul_1: subtract the product of the n limbs at up and the limb v0
+C from the n limbs at rp, in place.  The source limbs are complemented as they
+C are loaded and v0 is added once at the bottom, following the identity given
+C in the file header, so the loop itself only multiplies and adds.  The value
+C returned in r0 is v0 minus the final high limb of that sum.
+C
+C Register roles:
+C   rp (r0)     pointer into the limbs being updated; biased by the feed-in
+C               code so that the loop offsets line up
+C   up (r1)     source pointer
+C   n  (r2)     limb count, biased by -3 on entry
+C   v0 (r3)     multiplier limb
+C   w0, w1      limbs loaded from rp, used as low halves of the accumulators
+C   u0, u1      complemented limbs loaded from up
+C   r4-r7       high halves of the accumulators and the values to be stored
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+define(`v0', `r3')
+
+define(`w0', `r10') define(`w1', `r11')
+define(`u0', `r8') define(`u1', `r9')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+C Save r4-r11 in a 32-byte frame with strd, laid out in the order that the
+C pop at the end of the function restores.
+ sub sp, sp, #32
+ strd r10, r11, [sp, #24]
+ strd r8, r9, [sp, #16]
+ strd r6, r7, [sp, #8]
+ strd r4, r5, [sp, #0]
+C push { r4-r11 }
+
+C Dispatch on n mod 4.  sub does not write the flags, so the beq still tests
+C the result of ands.  Each feed-in block leaves the carry flag clear before
+C joining the adcs chain of the loop.
+ ands r6, n, #3
+ sub n, n, #3
+ beq L(b00)
+ cmp r6, #2
+ bcc L(b01)
+ beq L(b10)
+
+C n mod 4 = 3: one limb is loaded here and multiplied at L(mid).  r7 starts
+C as v0, which supplies the extra v0 term; the adds also clears carry.
+L(b11): mov r6, #0
+ ldr u1, [up], #-4
+ ldr w1, [rp], #-16
+ mvn u1, u1
+ adds r7, v0, #0
+ b L(mid)
+
+C n mod 4 = 0: the first limb is finished and stored here, with r6 starting
+C as v0 to supply the extra v0 term; the second limb is handled at L(mid).
+L(b00): ldrd u0, u1, [up]
+ ldrd w0, w1, [rp], #-12
+ mvn u0, u0
+ mvn u1, u1
+ mov r6, v0
+ umaal w0, r6, u0, v0
+ cmn r13, #0 C carry clear
+ mov r7, #0
+ str w0, [rp, #12]
+ b L(mid)
+
+C n mod 4 = 2: the first limb is finished and stored here, the second product
+C is started, and the loop is entered at L(top) unless no limbs remain.  The
+C adds clears carry and sets the sign flag from the biased n.
+L(b10): ldrd u0, u1, [up], #8
+ ldrd w0, w1, [rp]
+ mvn u0, u0
+ mvn u1, u1
+ mov r4, v0
+ umaal w0, r4, u0, v0
+ mov r5, #0
+ str w0, [rp], #-4
+ umlal w1, r5, u1, v0
+ adds n, n, #0
+ bmi L(end)
+ b L(top)
+
+C n mod 4 = 1: the single feed-in product is formed here with r5 starting as
+C v0.  This block is reached through bcc, so carry is presumably already
+C clear, and tst n,n is relied on not to change it.
+L(b01): ldr u1, [up], #4
+ ldr w1, [rp], #-8
+ mvn u1, u1
+ mov r5, v0
+ mov r4, #0
+ umaal w1, r5, u1, v0
+ tst n, n
+ bmi L(end)
+
+C Main loop, four limbs per round.  Each product is accumulated onto a limb
+C loaded from rp; the high half of one product is added to the low half of
+C the next through the adcs chain before being stored.
+C ALIGN(16)
+L(top): ldrd u0, u1, [up, #0]
+ adcs r4, r4, w1
+ mvn u0, u0
+ ldrd w0, w1, [rp, #12]
+ mvn u1, u1
+ mov r6, #0
+ umlal w0, r6, u0, v0 C 1 2
+ adcs r5, r5, w0
+ mov r7, #0
+ strd r4, r5, [rp, #8]
+L(mid): umaal w1, r7, u1, v0 C 2 3
+ ldrd u0, u1, [up, #8]
+ add up, up, #16
+ adcs r6, r6, w1
+ mvn u0, u0
+ ldrd w0, w1, [rp, #20]
+ mvn u1, u1
+ mov r4, #0
+ umlal w0, r4, u0, v0 C 3 4
+ adcs r7, r7, w0
+ mov r5, #0
+ strd r6, r7, [rp, #16]!
+ sub n, n, #4
+ umlal w1, r5, u1, v0 C 0 1
+ tst n, n
+ bpl L(top)
+
+C Store the last limb, fold the final carry into the high limb, and return
+C v0 minus that high limb.
+L(end): adcs r4, r4, w1
+ str r4, [rp, #8]
+ adc r0, r5, #0
+ sub r0, v0, r0
+ pop { r4-r11 }
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora9/gmp-mparam.h b/gmp/mpn/arm/v7a/cora9/gmp-mparam.h
new file mode 100644
index 0000000000..9660257820
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora9/gmp-mparam.h
@@ -0,0 +1,209 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010, 2012-2014 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+/* Limb geometry: 32-bit limbs occupying 4 bytes each. */
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 1000MHz Cortex-A9 */
+/* FFT tuning limit = 25000000 */
+/* Generated by tuneup.c, 2014-03-12, gcc 4.6 */
+
+/* Single-limb division and remainder selection.  Going by the annotations
+   below, 0 means always use the method and MP_SIZE_T_MAX means never use it;
+   other values are presumably operand sizes in limbs at which the named
+   method takes over (confirm against the code that reads them). */
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1
+#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 20
+
+/* Crossover points between the Toom multiplication variants. */
+#define MUL_TOOM22_THRESHOLD 45
+#define MUL_TOOM33_THRESHOLD 129
+#define MUL_TOOM44_THRESHOLD 387
+#define MUL_TOOM6H_THRESHOLD 517
+#define MUL_TOOM8H_THRESHOLD 774
+
+/* Crossover points between the unbalanced Toom multiplication variants. */
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 137
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 222
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 137
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 235
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 208
+
+/* Crossover points between the squaring variants. */
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 54
+#define SQR_TOOM3_THRESHOLD 181
+#define SQR_TOOM4_THRESHOLD 490
+#define SQR_TOOM6_THRESHOLD 656
+#define SQR_TOOM8_THRESHOLD 0 /* always */
+
+#define MULMID_TOOM42_THRESHOLD 64
+
+#define MULMOD_BNM1_THRESHOLD 26
+#define SQRMOD_BNM1_THRESHOLD 28
+
+/* FFT multiplication tuning.  MUL_FFT_TABLE3 is a list of {size, k} pairs;
+   its first pair repeats MUL_FFT_MODF_THRESHOLD together with the k noted
+   beside it.  The table has 40 rows of 4 pairs, matching
+   MUL_FFT_TABLE3_SIZE.  How the pairs are interpreted is decided by the FFT
+   code that reads them, which is not part of this file. */
+#define MUL_FFT_MODF_THRESHOLD 624 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 624, 5}, { 28, 6}, { 15, 5}, { 34, 6}, \
+ { 18, 5}, { 37, 6}, { 28, 7}, { 15, 6}, \
+ { 36, 7}, { 19, 6}, { 40, 7}, { 21, 6}, \
+ { 43, 7}, { 23, 6}, { 47, 7}, { 25, 6}, \
+ { 51, 7}, { 27, 6}, { 55, 7}, { 29, 8}, \
+ { 15, 7}, { 31, 6}, { 63, 7}, { 37, 8}, \
+ { 19, 7}, { 43, 8}, { 23, 7}, { 51, 8}, \
+ { 27, 7}, { 57, 9}, { 15, 8}, { 31, 7}, \
+ { 65, 8}, { 35, 7}, { 71, 8}, { 43, 9}, \
+ { 23, 8}, { 55,10}, { 15, 9}, { 31, 8}, \
+ { 71, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \
+ { 99, 9}, { 55,10}, { 31, 9}, { 79,10}, \
+ { 47, 9}, { 103,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 167,10}, { 95, 9}, \
+ { 191,10}, { 111,11}, { 63,10}, { 159,11}, \
+ { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \
+ { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \
+ { 671,10}, { 351,11}, { 191,10}, { 383, 9}, \
+ { 767,10}, { 399, 9}, { 799,10}, { 415,11}, \
+ { 223,12}, { 127,11}, { 255,10}, { 511, 9}, \
+ { 1023,10}, { 543,11}, { 287,10}, { 575, 9}, \
+ { 1151,11}, { 319,10}, { 671,11}, { 351,12}, \
+ { 191,11}, { 383,10}, { 799,11}, { 415,10}, \
+ { 831,13}, { 127,12}, { 255,11}, { 511,10}, \
+ { 1023,11}, { 607,12}, { 319,11}, { 735,12}, \
+ { 383,11}, { 863,12}, { 447,11}, { 959,13}, \
+ { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \
+ { 1215,12}, { 639,11}, { 1279,12}, { 703,13}, \
+ { 383,12}, { 767,11}, { 1535,12}, { 831,11}, \
+ { 1663,12}, { 959,14}, { 255,13}, { 511,12}, \
+ { 1023,11}, { 2047,12}, { 1215,13}, { 639,12}, \
+ { 1407,13}, { 767,12}, { 1663,13}, { 895,12}, \
+ { 1791,14}, { 511,13}, { 1023,12}, { 2111,13}, \
+ { 1151,12}, { 2431,13}, { 1279,12}, { 2559,13}, \
+ { 1407,14}, { 767,13}, { 1535,12}, { 3071,13}, \
+ { 1663,12}, { 3455,13}, { 1791,15}, { 511,14}, \
+ { 1023,13}, { 2047,12}, { 4095,13}, { 2175,12}, \
+ { 4351,13}, { 2431,14}, { 1279,13}, { 2559,12}, \
+ { 5119,13}, { 2815,14}, { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 160
+#define MUL_FFT_THRESHOLD 6784
+
+/* FFT squaring tuning.  SQR_FFT_TABLE3 is a list of {size, k} pairs; its
+   first pair repeats SQR_FFT_MODF_THRESHOLD together with the k noted beside
+   it.  The table has 41 rows of 4 pairs plus a final row of 3, matching
+   SQR_FFT_TABLE3_SIZE.  How the pairs are interpreted is decided by the FFT
+   code that reads them, which is not part of this file. */
+#define SQR_FFT_MODF_THRESHOLD 560 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 560, 5}, { 19, 4}, { 39, 5}, { 21, 4}, \
+ { 43, 5}, { 29, 6}, { 15, 5}, { 33, 6}, \
+ { 17, 5}, { 35, 6}, { 36, 7}, { 19, 6}, \
+ { 40, 7}, { 21, 6}, { 43, 7}, { 23, 6}, \
+ { 47, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 43, 8}, { 23, 7}, { 49, 8}, \
+ { 27, 7}, { 55, 9}, { 15, 8}, { 31, 7}, \
+ { 65, 8}, { 35, 7}, { 71, 8}, { 43, 9}, \
+ { 23, 8}, { 55, 9}, { 31, 8}, { 71, 9}, \
+ { 39, 8}, { 83, 9}, { 47, 8}, { 95, 9}, \
+ { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 103,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 159,10}, { 95, 9}, { 191,10}, \
+ { 111,11}, { 63,10}, { 159,11}, { 95,10}, \
+ { 191, 9}, { 383,10}, { 207,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511, 8}, { 1023, 9}, \
+ { 543,10}, { 287,11}, { 159,10}, { 319, 9}, \
+ { 639,10}, { 335, 9}, { 671,10}, { 351,11}, \
+ { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \
+ { 799,10}, { 415, 9}, { 831,11}, { 223,12}, \
+ { 127,11}, { 255,10}, { 511, 9}, { 1023,10}, \
+ { 543,11}, { 287,10}, { 575, 9}, { 1151,10}, \
+ { 607,11}, { 319,10}, { 671,11}, { 351,10}, \
+ { 703,12}, { 191,11}, { 383,10}, { 799,11}, \
+ { 415,10}, { 831,13}, { 127,11}, { 511,10}, \
+ { 1023,11}, { 543,10}, { 1087,11}, { 575,10}, \
+ { 1151,11}, { 607,12}, { 319,11}, { 671,10}, \
+ { 1343,11}, { 735,12}, { 383,11}, { 863,12}, \
+ { 447,11}, { 959,12}, { 511,11}, { 1087,12}, \
+ { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \
+ { 703,11}, { 1407,13}, { 383,12}, { 767,11}, \
+ { 1599,12}, { 831,11}, { 1663,12}, { 895,11}, \
+ { 1791,12}, { 959,13}, { 511,12}, { 1023,11}, \
+ { 2047,12}, { 1215,13}, { 639,12}, { 1407,13}, \
+ { 767,12}, { 1663,13}, { 895,12}, { 1791,14}, \
+ { 511,13}, { 1023,12}, { 2111,13}, { 1151,12}, \
+ { 2431,13}, { 1279,12}, { 2559,13}, { 1407,14}, \
+ { 767,13}, { 1535,12}, { 3071,13}, { 1663,12}, \
+ { 3455,13}, { 1791,15}, { 511,14}, { 1023,13}, \
+ { 2047,12}, { 4095,13}, { 2175,12}, { 4351,13}, \
+ { 2431,14}, { 1279,13}, { 2559,12}, { 5119,13}, \
+ { 2815,14}, { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 167
+#define SQR_FFT_THRESHOLD 5312
+
+/* Remaining tuned parameters, grouped as tuneup.c emitted them.  Values are
+   presumably operand sizes in limbs at which the named algorithm takes over,
+   except for the _METHOD selector and the POWM_SEC_TABLE list (confirm
+   against the code that reads them). */
+
+/* Low-half multiplication. */
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 38
+#define MULLO_MUL_N_THRESHOLD 13463
+
+/* Divide-and-conquer division. */
+#define DC_DIV_QR_THRESHOLD 42
+#define DC_DIVAPPR_Q_THRESHOLD 100
+#define DC_BDIV_QR_THRESHOLD 43
+#define DC_BDIV_Q_THRESHOLD 104
+
+/* Inversion. */
+#define INV_MULMOD_BNM1_THRESHOLD 98
+#define INV_NEWTON_THRESHOLD 138
+#define INV_APPR_THRESHOLD 133
+
+/* Binary inversion and REDC selection. */
+#define BINV_NEWTON_THRESHOLD 333
+#define REDC_1_TO_REDC_2_THRESHOLD 2
+#define REDC_2_TO_REDC_N_THRESHOLD 142
+
+/* Division using the MU and MUPI methods. */
+#define MU_DIV_QR_THRESHOLD 2350
+#define MU_DIVAPPR_Q_THRESHOLD 2259
+#define MUPI_DIV_QR_THRESHOLD 70
+#define MU_BDIV_QR_THRESHOLD 2089
+#define MU_BDIV_Q_THRESHOLD 2172
+
+#define POWM_SEC_TABLE 37,48,81,615,1925
+
+/* GCD, extended GCD and Jacobi symbol. */
+#define MATRIX22_STRASSEN_THRESHOLD 22
+#define HGCD_THRESHOLD 64
+#define HGCD_APPR_THRESHOLD 50
+#define HGCD_REDUCE_THRESHOLD 4284
+#define GCD_DC_THRESHOLD 416
+#define GCDEXT_DC_THRESHOLD 298
+#define JACOBI_BASE_METHOD 4
+
+/* Radix conversion. */
+#define GET_STR_DC_THRESHOLD 18
+#define GET_STR_PRECOMPUTE_THRESHOLD 33
+#define SET_STR_DC_THRESHOLD 140
+#define SET_STR_PRECOMPUTE_THRESHOLD 748
+
+/* Factorial. */
+#define FAC_DSC_THRESHOLD 309
+#define FAC_ODD_THRESHOLD 29