summaryrefslogtreecommitdiff
path: root/gmp/mpn/arm/v7a
diff options
context:
space:
mode:
Diffstat (limited to 'gmp/mpn/arm/v7a')
-rw-r--r--gmp/mpn/arm/v7a/cora15/addmul_1.asm145
-rw-r--r--gmp/mpn/arm/v7a/cora15/aors_n.asm162
-rw-r--r--gmp/mpn/arm/v7a/cora15/cnd_aors_n.asm158
-rw-r--r--gmp/mpn/arm/v7a/cora15/com.asm180
-rw-r--r--gmp/mpn/arm/v7a/cora15/gmp-mparam.h197
-rw-r--r--gmp/mpn/arm/v7a/cora15/logops_n.asm253
-rw-r--r--gmp/mpn/arm/v7a/cora15/mul_1.asm104
-rw-r--r--gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm43
-rw-r--r--gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm43
-rw-r--r--gmp/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm144
-rw-r--r--gmp/mpn/arm/v7a/cora15/neon/com.asm97
-rw-r--r--gmp/mpn/arm/v7a/cora15/neon/copyd.asm110
-rw-r--r--gmp/mpn/arm/v7a/cora15/neon/copyi.asm90
-rw-r--r--gmp/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm177
-rw-r--r--gmp/mpn/arm/v7a/cora15/submul_1.asm159
-rw-r--r--gmp/mpn/arm/v7a/cora9/gmp-mparam.h209
16 files changed, 2271 insertions, 0 deletions
diff --git a/gmp/mpn/arm/v7a/cora15/addmul_1.asm b/gmp/mpn/arm/v7a/cora15/addmul_1.asm
new file mode 100644
index 0000000000..c2277b32b2
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/addmul_1.asm
@@ -0,0 +1,145 @@
+dnl ARM mpn_addmul_1 optimised for A15.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb best
+C StrongARM: -
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 6 3.25
+C Cortex-A15 2 this
+
+C This code uses umlal for adding in the rp[] data, keeping the recurrency path
+C separate from any multiply instructions. It performs well on A15, at umlal's
+C bandwidth.
+C
+C An A9 variant should perhaps stick to 3-way unrolling, and use ldm and stm
+C for all loads and stores. Alternatively, it could do 2-way or 4-way, but
+C then alignment aware code will be necessary (adding O(1) bookkeeping
+C overhead).
+C
+C We don't use r12 due to ldrd and strd limitations.
+
+C Architecture requirements:
+C v5 -
+C v5t -
+C v5te ldrd strd
+C v6 -
+C v6t2 -
+C v7a -
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+define(`v0', `r3')
+
+define(`w0', `r10') define(`w1', `r11')
+define(`u0', `r8') define(`u1', `r9')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ push { r4-r11 }
+
+ ands r6, n, #3
+ sub n, n, #3
+ beq L(b00)
+ cmp r6, #2
+ bcc L(b01)
+ beq L(b10)
+
+L(b11): mov r6, #0
+ cmn r13, #0 C carry clear
+ ldr u1, [up], #-4
+ ldr w1, [rp], #-4
+ mov r7, #0
+ b L(mid)
+
+L(b00): ldrd u0, u1, [up]
+ ldrd w0, w1, [rp]
+ mov r6, #0
+ umlal w0, r6, u0, v0
+ cmn r13, #0 C carry clear
+ mov r7, #0
+ str w0, [rp]
+ b L(mid)
+
+L(b10): ldrd u0, u1, [up], #8
+ ldrd w0, w1, [rp]
+ mov r4, #0
+ umlal w0, r4, u0, v0
+ cmn r13, #0 C carry clear
+ mov r5, #0
+ str w0, [rp], #8
+ umlal w1, r5, u1, v0
+ tst n, n
+ bmi L(end)
+ b L(top)
+
+L(b01): mov r4, #0
+ ldr u1, [up], #4
+ ldr w1, [rp], #4
+ mov r5, #0
+ umlal w1, r5, u1, v0
+ tst n, n
+ bmi L(end)
+
+ ALIGN(16)
+L(top): ldrd u0, u1, [up, #0]
+ adcs r4, r4, w1
+ ldrd w0, w1, [rp, #0]
+ mov r6, #0
+ umlal w0, r6, u0, v0 C 1 2
+ adcs r5, r5, w0
+ mov r7, #0
+ strd r4, r5, [rp, #-4]
+L(mid): umlal w1, r7, u1, v0 C 2 3
+ ldrd u0, u1, [up, #8]
+ adcs r6, r6, w1
+ ldrd w0, w1, [rp, #8]
+ mov r4, #0
+ umlal w0, r4, u0, v0 C 3 4
+ adcs r7, r7, w0
+ mov r5, #0
+ strd r6, r7, [rp, #4]
+ umlal w1, r5, u1, v0 C 0 1
+ sub n, n, #4
+ add up, up, #16
+ add rp, rp, #16
+ tst n, n
+ bpl L(top)
+
+L(end): adcs r4, r4, w1
+ str r4, [rp, #-4]
+ adc r0, r5, #0
+ pop { r4-r11 }
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/aors_n.asm b/gmp/mpn/arm/v7a/cora15/aors_n.asm
new file mode 100644
index 0000000000..dc3f83992e
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/aors_n.asm
@@ -0,0 +1,162 @@
+dnl ARM mpn_add_n/mpn_sub_n optimised for A15.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb best
+C StrongARM: -
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 3.55 2.5
+C Cortex-A15 1.27 this
+
+C This was a major improvement compared to the code we had before, but it might
+C not be the best 8-way code possible. We've tried some permutations of auto-
+C increments and separate pointer updates, but they all ran at the same speed
+C on A15.
+
+C Architecture requirements:
+C v5 -
+C v5t -
+C v5te ldrd strd
+C v6 -
+C v6t2 -
+C v7a -
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n', `r3')
+
+ifdef(`OPERATION_add_n', `
+ define(`ADDSUBC', adcs)
+ define(`IFADD', `$1')
+ define(`SETCY', `cmp $1, #1')
+ define(`RETVAL', `adc r0, n, #0')
+ define(`RETVAL2', `adc r0, n, #1')
+ define(`func', mpn_add_n)
+ define(`func_nc', mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+ define(`ADDSUBC', sbcs)
+ define(`IFADD', `')
+ define(`SETCY', `rsbs $1, $1, #0')
+ define(`RETVAL', `sbc r0, r0, r0
+ and r0, r0, #1')
+ define(`RETVAL2', `RETVAL')
+ define(`func', mpn_sub_n)
+ define(`func_nc', mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+PROLOGUE(func_nc)
+ ldr r12, [sp]
+ b L(ent)
+EPILOGUE()
+PROLOGUE(func)
+ mov r12, #0
+L(ent): push { r4-r9 }
+
+ ands r6, n, #3
+ mov n, n, lsr #2
+ beq L(b00)
+ cmp r6, #2
+ bcc L(b01)
+ beq L(b10)
+
+L(b11): ldr r5, [up], #4
+ ldr r7, [vp], #4
+ SETCY( r12)
+ ADDSUBC r9, r5, r7
+ ldrd r4, r5, [up, #0]
+ ldrd r6, r7, [vp, #0]
+ str r9, [rp], #-4
+ b L(lo)
+
+L(b00): ldrd r4, r5, [up], #-8
+ ldrd r6, r7, [vp], #-8
+ SETCY( r12)
+ sub rp, rp, #16
+ b L(mid)
+
+L(b01): ldr r5, [up], #-4
+ ldr r7, [vp], #-4
+ SETCY( r12)
+ ADDSUBC r9, r5, r7
+ str r9, [rp], #-12
+ tst n, n
+ beq L(wd1)
+L(gt1): ldrd r4, r5, [up, #8]
+ ldrd r6, r7, [vp, #8]
+ b L(mid)
+
+L(b10): ldrd r4, r5, [up]
+ ldrd r6, r7, [vp]
+ SETCY( r12)
+ sub rp, rp, #8
+ b L(lo)
+
+ ALIGN(16)
+L(top): ldrd r4, r5, [up, #8]
+ ldrd r6, r7, [vp, #8]
+ strd r8, r9, [rp, #8]
+L(mid): ADDSUBC r8, r4, r6
+ ADDSUBC r9, r5, r7
+ ldrd r4, r5, [up, #16]
+ ldrd r6, r7, [vp, #16]
+ strd r8, r9, [rp, #16]
+ ADDSUBC r8, r4, r6
+ ADDSUBC r9, r5, r7
+ sub n, n, #2
+ tst n, n
+ bmi L(dne)
+ ldrd r4, r5, [up, #24]
+ ldrd r6, r7, [vp, #24]
+ strd r8, r9, [rp, #24]
+ ADDSUBC r8, r4, r6
+ ADDSUBC r9, r5, r7
+ ldrd r4, r5, [up, #32]!
+ ldrd r6, r7, [vp, #32]!
+ strd r8, r9, [rp, #32]!
+L(lo): ADDSUBC r8, r4, r6
+ ADDSUBC r9, r5, r7
+ tst n, n
+ bne L(top)
+
+L(end): strd r8, r9, [rp, #8]
+L(wd1): RETVAL
+ pop { r4-r9 }
+ bx r14
+L(dne): strd r8, r9, [rp, #24]
+ RETVAL2
+ pop { r4-r9 }
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/cnd_aors_n.asm b/gmp/mpn/arm/v7a/cora15/cnd_aors_n.asm
new file mode 100644
index 0000000000..b9e5cd3f79
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/cnd_aors_n.asm
@@ -0,0 +1,158 @@
+dnl ARM mpn_cnd_add_n/mpn_cnd_sub_n optimised for A15.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb best
+C StrongARM: -
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 3.75 3
+C Cortex-A15 1.78 this
+
+C This code does not run as well as one could have hoped, since 1.5 c/l seems
+C realistic for this insn mix.
+
+C Architecture requirements:
+C v5 -
+C v5t -
+C v5te ldrd strd
+C v6 -
+C v6t2 -
+C v7a -
+
+define(`cnd',`r0')
+define(`rp', `r1')
+define(`up', `r2')
+define(`vp', `r3')
+define(`n', `r12')
+
+ifdef(`OPERATION_cnd_add_n', `
+ define(`ADDSUB', adds)
+ define(`ADDSUBC', adcs)
+ define(`IFADD', `$1')
+ define(`INITCY', `cmn r0, #0')
+ define(`RETVAL', `adc r0, n, #0')
+ define(`RETVAL2', `adc r0, n, #1')
+ define(`func', mpn_cnd_add_n)
+ define(`func_nc', mpn_add_nc)')
+ifdef(`OPERATION_cnd_sub_n', `
+ define(`ADDSUB', subs)
+ define(`ADDSUBC', sbcs)
+ define(`IFADD', `')
+ define(`INITCY', `cmp r0, #0')
+ define(`RETVAL', `sbc r0, r0, r0
+ and r0, r0, #1')
+ define(`RETVAL2', `RETVAL')
+ define(`func', mpn_cnd_sub_n)
+ define(`func_nc', mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+ ldr n, [sp]
+ push { r4-r9 }
+
+ cmp cnd, #1
+ sbc cnd, cnd, cnd C conditionally set to 0xffffffff
+
+ ands r6, n, #3
+ mov n, n, lsr #2
+ beq L(b00)
+ cmp r6, #2
+ bcc L(b01)
+ beq L(b10)
+
+L(b11): ldr r5, [up], #4
+ ldr r7, [vp], #4
+ bic r7, r7, cnd
+ ADDSUB r9, r5, r7
+ ldrd r4, r5, [up, #0]
+ ldrd r6, r7, [vp, #0]
+ bic r6, r6, cnd
+ bic r7, r7, cnd
+ str r9, [rp], #-4
+ b L(lo)
+
+L(b00): ldrd r4, r5, [up], #-8
+ ldrd r6, r7, [vp], #-8
+ bic r6, r6, cnd
+ bic r7, r7, cnd
+ INITCY
+ sub rp, rp, #16
+ b L(mid)
+
+L(b01): ldr r5, [up], #-4
+ ldr r7, [vp], #-4
+ bic r7, r7, cnd
+ ADDSUB r9, r5, r7
+ str r9, [rp], #-12
+ tst n, n
+ beq L(wd1)
+L(gt1): ldrd r4, r5, [up, #8]
+ ldrd r6, r7, [vp, #8]
+ bic r6, r6, cnd
+ bic r7, r7, cnd
+ b L(mid)
+
+L(b10): ldrd r4, r5, [up]
+ ldrd r6, r7, [vp]
+ bic r6, r6, cnd
+ bic r7, r7, cnd
+ INITCY
+ sub rp, rp, #8
+ b L(lo)
+
+ ALIGN(16)
+L(top): ldrd r6, r7, [vp, #8]
+ ldrd r4, r5, [up, #8]
+ bic r6, r6, cnd
+ bic r7, r7, cnd
+ strd r8, r9, [rp, #8]
+L(mid): ADDSUBC r8, r4, r6
+ ADDSUBC r9, r5, r7
+ ldrd r6, r7, [vp, #16]!
+ ldrd r4, r5, [up, #16]!
+ bic r6, r6, cnd
+ bic r7, r7, cnd
+ sub n, n, #1
+ strd r8, r9, [rp, #16]!
+L(lo): ADDSUBC r8, r4, r6
+ ADDSUBC r9, r5, r7
+ tst n, n
+ bne L(top)
+
+L(end): strd r8, r9, [rp, #8]
+L(wd1): RETVAL
+ pop { r4-r9 }
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/com.asm b/gmp/mpn/arm/v7a/cora15/com.asm
new file mode 100644
index 0000000000..a258afe934
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/com.asm
@@ -0,0 +1,180 @@
+dnl ARM mpn_com optimised for A15.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM ?
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 2.5
+C Cortex-A15 1.0
+
+C This is great A15 core register code, but it is a bit large.
+C We use FEEDIN_VARIANT 1 to save some space, but use 8-way unrolling.
+
+C Architecture requirements:
+C v5 -
+C v5t -
+C v5te ldrd strd
+C v6 -
+C v6t2 -
+C v7a -
+
+define(`FEEDIN_VARIANT', 1) C alternatives: 0 1 2
+define(`UNROLL', 4x2) C alternatives: 4 4x2
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+
+ASM_START()
+PROLOGUE(mpn_com)
+ push { r4-r5,r8-r9 }
+
+ifelse(FEEDIN_VARIANT,0,`
+ ands r12, n, #3
+ mov n, n, lsr #2
+ beq L(b00a)
+ tst r12, #1
+ beq L(bx0)
+ ldr r5, [up], #4
+ mvn r9, r5
+ str r9, [rp], #4
+ tst r12, #2
+ beq L(b00)
+L(bx0): ldrd r4, r5, [up, #0]
+ sub rp, rp, #8
+ b L(lo)
+L(b00): tst n, n
+ beq L(wd1)
+L(b00a):ldrd r4, r5, [up], #-8
+ sub rp, rp, #16
+ b L(mid)
+')
+ifelse(FEEDIN_VARIANT,1,`
+ and r12, n, #3
+ mov n, n, lsr #2
+ tst r12, #1
+ beq L(bx0)
+ ldr r5, [up], #4
+ mvn r9, r5
+ str r9, [rp], #4
+L(bx0): tst r12, #2
+ beq L(b00)
+ ldrd r4, r5, [up, #0]
+ sub rp, rp, #8
+ b L(lo)
+L(b00): tst n, n
+ beq L(wd1)
+ ldrd r4, r5, [up], #-8
+ sub rp, rp, #16
+ b L(mid)
+')
+ifelse(FEEDIN_VARIANT,2,`
+ ands r12, n, #3
+ mov n, n, lsr #2
+ beq L(b00)
+ cmp r12, #2
+ bcc L(b01)
+ beq L(b10)
+
+L(b11): ldr r5, [up], #4
+ mvn r9, r5
+ ldrd r4, r5, [up, #0]
+ str r9, [rp], #-4
+ b L(lo)
+
+L(b00): ldrd r4, r5, [up], #-8
+ sub rp, rp, #16
+ b L(mid)
+
+L(b01): ldr r5, [up], #-4
+ mvn r9, r5
+ str r9, [rp], #-12
+ tst n, n
+ beq L(wd1)
+L(gt1): ldrd r4, r5, [up, #8]
+ b L(mid)
+
+L(b10): ldrd r4, r5, [up]
+ sub rp, rp, #8
+ b L(lo)
+')
+ ALIGN(16)
+ifelse(UNROLL,4,`
+L(top): ldrd r4, r5, [up, #8]
+ strd r8, r9, [rp, #8]
+L(mid): mvn r8, r4
+ mvn r9, r5
+ ldrd r4, r5, [up, #16]!
+ strd r8, r9, [rp, #16]!
+ sub n, n, #1
+L(lo): mvn r8, r4
+ mvn r9, r5
+ tst n, n
+ bne L(top)
+')
+ifelse(UNROLL,4x2,`
+L(top): ldrd r4, r5, [up, #8]
+ strd r8, r9, [rp, #8]
+L(mid): mvn r8, r4
+ mvn r9, r5
+ ldrd r4, r5, [up, #16]
+ strd r8, r9, [rp, #16]
+ mvn r8, r4
+ mvn r9, r5
+ sub n, n, #2
+ tst n, n
+ bmi L(dne)
+ ldrd r4, r5, [up, #24]
+ strd r8, r9, [rp, #24]
+ mvn r8, r4
+ mvn r9, r5
+ ldrd r4, r5, [up, #32]!
+ strd r8, r9, [rp, #32]!
+L(lo): mvn r8, r4
+ mvn r9, r5
+ tst n, n
+ bne L(top)
+')
+
+L(end): strd r8, r9, [rp, #8]
+L(wd1): pop { r4-r5,r8-r9 }
+ bx r14
+ifelse(UNROLL,4x2,`
+L(dne): strd r8, r9, [rp, #24]
+ pop { r4-r5,r8-r9 }
+ bx r14
+')
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/gmp-mparam.h b/gmp/mpn/arm/v7a/cora15/gmp-mparam.h
new file mode 100644
index 0000000000..2a06532b3e
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/gmp-mparam.h
@@ -0,0 +1,197 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010, 2012-2014 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 1700MHz Cortex-A15 with Neon (in spite of file position) */
+/* FFT tuning limit = 25000000 */
+/* Generated by tuneup.c, 2014-03-12, gcc 4.6 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1
+#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 15
+
+#define MUL_TOOM22_THRESHOLD 23
+#define MUL_TOOM33_THRESHOLD 90
+#define MUL_TOOM44_THRESHOLD 262
+#define MUL_TOOM6H_THRESHOLD 351
+#define MUL_TOOM8H_THRESHOLD 557
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 90
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 160
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 169
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 43
+#define SQR_TOOM3_THRESHOLD 138
+#define SQR_TOOM4_THRESHOLD 363
+#define SQR_TOOM6_THRESHOLD 517
+#define SQR_TOOM8_THRESHOLD 725
+
+#define MULMID_TOOM42_THRESHOLD 52
+
+#define MULMOD_BNM1_THRESHOLD 17
+#define SQRMOD_BNM1_THRESHOLD 23
+
+#define MUL_FFT_MODF_THRESHOLD 550 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 550, 5}, { 25, 6}, { 27, 7}, { 15, 6}, \
+ { 31, 7}, { 19, 6}, { 39, 7}, { 25, 6}, \
+ { 51, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 51, 8}, \
+ { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
+ { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
+ { 47, 8}, { 99, 9}, { 55,10}, { 31, 9}, \
+ { 79,10}, { 47, 9}, { 103,11}, { 31,10}, \
+ { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \
+ { 95, 9}, { 191,10}, { 111,11}, { 63,10}, \
+ { 159,11}, { 95,10}, { 191, 9}, { 383,10}, \
+ { 207,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,11}, { 159,10}, \
+ { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \
+ { 351,11}, { 191,10}, { 383, 9}, { 767,10}, \
+ { 399, 9}, { 799,10}, { 415,11}, { 223,12}, \
+ { 127,11}, { 255,10}, { 543,11}, { 287,10}, \
+ { 607,11}, { 319,10}, { 671,11}, { 351,12}, \
+ { 191,11}, { 383,10}, { 799,11}, { 415,10}, \
+ { 831,13}, { 127,12}, { 255,11}, { 543,10}, \
+ { 1087,11}, { 607,12}, { 319,11}, { 671,10}, \
+ { 1343,11}, { 735,12}, { 383,11}, { 799,10}, \
+ { 1599,11}, { 831,12}, { 447,11}, { 895,13}, \
+ { 255,12}, { 511,11}, { 1023,12}, { 575,11}, \
+ { 1151,12}, { 639,11}, { 1279,12}, { 703,13}, \
+ { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \
+ { 1663,12}, { 895,13}, { 511,12}, { 1087,13}, \
+ { 639,12}, { 1407,13}, { 767,12}, { 1599,13}, \
+ { 895,14}, { 511,13}, { 1023,12}, { 2111,13}, \
+ { 1151,12}, { 2431,13}, { 1279,14}, { 767,13}, \
+ { 1535,12}, { 3071,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1279,13}, { 2559,12}, { 5119,13}, \
+ { 2815,12}, { 5631,13}, { 2943,14}, { 16384,15}, \
+ { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 137
+#define MUL_FFT_THRESHOLD 5760
+
+#define SQR_FFT_MODF_THRESHOLD 525 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 525, 5}, { 25, 6}, { 27, 7}, { 15, 6}, \
+ { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 25, 6}, { 51, 7}, { 27, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 51, 8}, { 27, 7}, { 55, 9}, \
+ { 15, 8}, { 31, 7}, { 63, 8}, { 39, 9}, \
+ { 23, 8}, { 55,10}, { 15, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \
+ { 191,10}, { 111,11}, { 63,10}, { 143, 9}, \
+ { 287,10}, { 159,11}, { 95,10}, { 191, 9}, \
+ { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \
+ { 159,10}, { 335, 9}, { 671,10}, { 351,11}, \
+ { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \
+ { 799,10}, { 415,11}, { 223,12}, { 127,11}, \
+ { 255,10}, { 543,11}, { 287,10}, { 607,11}, \
+ { 319,10}, { 671,11}, { 351,12}, { 191,11}, \
+ { 383,10}, { 799,11}, { 415,10}, { 831,13}, \
+ { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \
+ { 607,12}, { 319,11}, { 671,10}, { 1343,11}, \
+ { 735,12}, { 383,11}, { 799,10}, { 1599,11}, \
+ { 831,12}, { 447,11}, { 895,12}, { 511,11}, \
+ { 1023,12}, { 575,11}, { 1151,12}, { 639,11}, \
+ { 1343,12}, { 703,13}, { 383,12}, { 767,11}, \
+ { 1599,12}, { 831,11}, { 1663,12}, { 895,13}, \
+ { 511,12}, { 1087,13}, { 639,12}, { 1407,13}, \
+ { 767,12}, { 1727,13}, { 895,14}, { 511,13}, \
+ { 1023,12}, { 2047,13}, { 1151,12}, { 2431,13}, \
+ { 1279,14}, { 767,13}, { 1535,12}, { 3071,15}, \
+ { 511,14}, { 1023,13}, { 2047,12}, { 4095,13}, \
+ { 2175,14}, { 1279,13}, { 2559,12}, { 5119,13}, \
+ { 2687,14}, { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 139
+#define SQR_FFT_THRESHOLD 4736
+
+#define MULLO_BASECASE_THRESHOLD 9
+#define MULLO_DC_THRESHOLD 39
+#define MULLO_MUL_N_THRESHOLD 11278
+
+#define DC_DIV_QR_THRESHOLD 54
+#define DC_DIVAPPR_Q_THRESHOLD 296
+#define DC_BDIV_QR_THRESHOLD 52
+#define DC_BDIV_Q_THRESHOLD 300
+
+#define INV_MULMOD_BNM1_THRESHOLD 44
+#define INV_NEWTON_THRESHOLD 294
+#define INV_APPR_THRESHOLD 294
+
+#define BINV_NEWTON_THRESHOLD 375
+#define REDC_1_TO_REDC_2_THRESHOLD 102
+#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */
+
+#define MU_DIV_QR_THRESHOLD 1718
+#define MU_DIVAPPR_Q_THRESHOLD 1718
+#define MUPI_DIV_QR_THRESHOLD 108
+#define MU_BDIV_QR_THRESHOLD 1528
+#define MU_BDIV_Q_THRESHOLD 1718
+
+#define POWM_SEC_TABLE 3,32,70,416,1464
+
+#define MATRIX22_STRASSEN_THRESHOLD 22
+#define HGCD_THRESHOLD 152
+#define HGCD_APPR_THRESHOLD 230
+#define HGCD_REDUCE_THRESHOLD 3259
+#define GCD_DC_THRESHOLD 702
+#define GCDEXT_DC_THRESHOLD 538
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 18
+#define GET_STR_PRECOMPUTE_THRESHOLD 32
+#define SET_STR_DC_THRESHOLD 119
+#define SET_STR_PRECOMPUTE_THRESHOLD 1063
+
+#define FAC_DSC_THRESHOLD 262
+#define FAC_ODD_THRESHOLD 26
diff --git a/gmp/mpn/arm/v7a/cora15/logops_n.asm b/gmp/mpn/arm/v7a/cora15/logops_n.asm
new file mode 100644
index 0000000000..06026143e1
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/logops_n.asm
@@ -0,0 +1,253 @@
+dnl ARM mpn_and_n, mpn_andn_n. mpn_nand_n, etc, optimised for A15.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb
+C and andn ior xor nand iorn nior xnor
+C StrongARM ? ?
+C XScale ? ?
+C Cortex-A7 ? ?
+C Cortex-A8 ? ?
+C Cortex-A9 3.5 3.56
+C Cortex-A15 1.27 1.64
+
+C This is great A15 core register code, but it is a bit large.
+C We use FEEDIN_VARIANT 1 to save some space, but use 8-way unrolling.
+
+C Architecture requirements:
+C v5 -
+C v5t -
+C v5te ldrd strd
+C v6 -
+C v6t2 -
+C v7a -
+
+define(`FEEDIN_VARIANT', 1) C alternatives: 0 1 2
+define(`UNROLL', 4x2) C alternatives: 4 4x2
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n', `r3')
+
+define(`POSTOP')
+
+ifdef(`OPERATION_and_n',`
+ define(`func', `mpn_and_n')
+ define(`LOGOP', `and $1, $2, $3')')
+ifdef(`OPERATION_andn_n',`
+ define(`func', `mpn_andn_n')
+ define(`LOGOP', `bic $1, $2, $3')')
+ifdef(`OPERATION_nand_n',`
+ define(`func', `mpn_nand_n')
+ define(`POSTOP', `mvn $1, $1')
+ define(`LOGOP', `and $1, $2, $3')')
+ifdef(`OPERATION_ior_n',`
+ define(`func', `mpn_ior_n')
+ define(`LOGOP', `orr $1, $2, $3')')
+ifdef(`OPERATION_iorn_n',`
+ define(`func', `mpn_iorn_n')
+ define(`POSTOP', `mvn $1, $1')
+ define(`LOGOP', `bic $1, $3, $2')')
+ifdef(`OPERATION_nior_n',`
+ define(`func', `mpn_nior_n')
+ define(`POSTOP', `mvn $1, $1')
+ define(`LOGOP', `orr $1, $2, $3')')
+ifdef(`OPERATION_xor_n',`
+ define(`func', `mpn_xor_n')
+ define(`LOGOP', `eor $1, $2, $3')')
+ifdef(`OPERATION_xnor_n',`
+ define(`func', `mpn_xnor_n')
+ define(`POSTOP', `mvn $1, $1')
+ define(`LOGOP', `eor $1, $2, $3')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+ASM_START()
+PROLOGUE(func)
+ push { r4-r9 }
+
+ifelse(FEEDIN_VARIANT,0,`
+ ands r6, n, #3
+ mov n, n, lsr #2
+ beq L(b00a)
+ tst r6, #1
+ beq L(bx0)
+ ldr r5, [up], #4
+ ldr r7, [vp], #4
+ LOGOP( r9, r5, r7)
+ POSTOP( r9)
+ str r9, [rp], #4
+ tst r6, #2
+ beq L(b00)
+L(bx0): ldrd r4, r5, [up, #0]
+ ldrd r6, r7, [vp, #0]
+ sub rp, rp, #8
+ b L(lo)
+L(b00): tst n, n
+ beq L(wd1)
+L(b00a):ldrd r4, r5, [up], #-8
+ ldrd r6, r7, [vp], #-8
+ sub rp, rp, #16
+ b L(mid)
+')
+ifelse(FEEDIN_VARIANT,1,`
+ and r6, n, #3
+ mov n, n, lsr #2
+ tst r6, #1
+ beq L(bx0)
+ ldr r5, [up], #4
+ ldr r7, [vp], #4
+ LOGOP( r9, r5, r7)
+ POSTOP( r9)
+ str r9, [rp], #4
+L(bx0): tst r6, #2
+ beq L(b00)
+ ldrd r4, r5, [up, #0]
+ ldrd r6, r7, [vp, #0]
+ sub rp, rp, #8
+ b L(lo)
+L(b00): tst n, n
+ beq L(wd1)
+ ldrd r4, r5, [up], #-8
+ ldrd r6, r7, [vp], #-8
+ sub rp, rp, #16
+ b L(mid)
+')
+ifelse(FEEDIN_VARIANT,2,`
+ ands r6, n, #3
+ mov n, n, lsr #2
+ beq L(b00)
+ cmp r6, #2
+ bcc L(b01)
+ beq L(b10)
+
+L(b11): ldr r5, [up], #4
+ ldr r7, [vp], #4
+ LOGOP( r9, r5, r7)
+ ldrd r4, r5, [up, #0]
+ ldrd r6, r7, [vp, #0]
+ POSTOP( r9)
+ str r9, [rp], #-4
+ b L(lo)
+
+L(b00): ldrd r4, r5, [up], #-8
+ ldrd r6, r7, [vp], #-8
+ sub rp, rp, #16
+ b L(mid)
+
+L(b01): ldr r5, [up], #-4
+ ldr r7, [vp], #-4
+ LOGOP( r9, r5, r7)
+ POSTOP( r9)
+ str r9, [rp], #-12
+ tst n, n
+ beq L(wd1)
+L(gt1): ldrd r4, r5, [up, #8]
+ ldrd r6, r7, [vp, #8]
+ b L(mid)
+
+L(b10): ldrd r4, r5, [up]
+ ldrd r6, r7, [vp]
+ sub rp, rp, #8
+ b L(lo)
+')
+ ALIGN(16)
+ifelse(UNROLL,4,`
+L(top): ldrd r4, r5, [up, #8]
+ ldrd r6, r7, [vp, #8]
+ POSTOP( r8)
+ POSTOP( r9)
+ strd r8, r9, [rp, #8]
+L(mid): LOGOP( r8, r4, r6)
+ LOGOP( r9, r5, r7)
+ ldrd r4, r5, [up, #16]!
+ ldrd r6, r7, [vp, #16]!
+ POSTOP( r8)
+ POSTOP( r9)
+ strd r8, r9, [rp, #16]!
+ sub n, n, #1
+L(lo): LOGOP( r8, r4, r6)
+ LOGOP( r9, r5, r7)
+ tst n, n
+ bne L(top)
+')
+ifelse(UNROLL,4x2,`
+L(top): ldrd r4, r5, [up, #8]
+ ldrd r6, r7, [vp, #8]
+ POSTOP( r8)
+ POSTOP( r9)
+ strd r8, r9, [rp, #8]
+L(mid): LOGOP( r8, r4, r6)
+ LOGOP( r9, r5, r7)
+ ldrd r4, r5, [up, #16]
+ ldrd r6, r7, [vp, #16]
+ POSTOP( r8)
+ POSTOP( r9)
+ strd r8, r9, [rp, #16]
+ LOGOP( r8, r4, r6)
+ LOGOP( r9, r5, r7)
+ sub n, n, #2
+ tst n, n
+ bmi L(dne)
+ ldrd r4, r5, [up, #24]
+ ldrd r6, r7, [vp, #24]
+ POSTOP( r8)
+ POSTOP( r9)
+ strd r8, r9, [rp, #24]
+ LOGOP( r8, r4, r6)
+ LOGOP( r9, r5, r7)
+ ldrd r4, r5, [up, #32]!
+ ldrd r6, r7, [vp, #32]!
+ POSTOP( r8)
+ POSTOP( r9)
+ strd r8, r9, [rp, #32]!
+L(lo): LOGOP( r8, r4, r6)
+ LOGOP( r9, r5, r7)
+ tst n, n
+ bne L(top)
+')
+
+L(end): POSTOP( r8)
+ POSTOP( r9)
+ strd r8, r9, [rp, #8]
+L(wd1): pop { r4-r9 }
+ bx r14
+ifelse(UNROLL,4x2,`
+L(dne): POSTOP( r8)
+ POSTOP( r9)
+ strd r8, r9, [rp, #24]
+ pop { r4-r9 }
+ bx r14
+')
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/mul_1.asm b/gmp/mpn/arm/v7a/cora15/mul_1.asm
new file mode 100644
index 0000000000..766ba5c57f
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/mul_1.asm
@@ -0,0 +1,104 @@
+dnl ARM mpn_mul_1 optimised for A15.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb best
+C StrongARM: -
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 5.25 3.25
+C Cortex-A15 2.25 this
+
+
+C This runs well on A15 but very poorly on A9. By scheduling loads and adds
+C it is possible to get good A9 performance as well, but at the cost of using
+C many more (callee-saves) registers.
+
+C This is armv5 code, optimized for the armv7a cpu A15. Its location in the
+C GMP file structure might be misleading.
+
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+define(`v0', `r3')
+
+ASM_START()
+PROLOGUE(mpn_mul_1c)
+ ldr r12, [sp]
+ b L(ent)
+EPILOGUE()
+PROLOGUE(mpn_mul_1)
+ mov r12, #0
+L(ent): push {r4-r7}
+
+ ldr r6, [up], #4
+ tst n, #1
+ beq L(bx0)
+
+L(bx1): umull r4, r7, r6, v0
+ adds r4, r4, r12
+ tst n, #2
+ beq L(lo1)
+ b L(lo3)
+
+L(bx0): umull r4, r5, r6, v0
+ adds r4, r4, r12
+ tst n, #2
+ beq L(lo0)
+ b L(lo2)
+
+L(top): ldr r6, [up], #4
+ str r4, [rp], #4
+ umull r4, r5, r6, v0
+ adds r4, r4, r7
+L(lo0): ldr r6, [up], #4
+ str r4, [rp], #4
+ umull r4, r7, r6, v0
+ adcs r4, r4, r5
+L(lo3): ldr r6, [up], #4
+ str r4, [rp], #4
+ umull r4, r5, r6, v0
+ adcs r4, r4, r7
+L(lo2): ldr r6, [up], #4
+ str r4, [rp], #4
+ umull r4, r7, r6, v0
+ adcs r4, r4, r5
+L(lo1): adc r7, r7, #0
+ subs n, n, #4
+ bgt L(top)
+
+ str r4, [rp]
+ mov r0, r7
+ pop {r4-r7}
+ bx lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm
new file mode 100644
index 0000000000..d8cfe3f78f
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm
@@ -0,0 +1,43 @@
+dnl ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 1)
+
+ifdef(`OPERATION_addlsh1_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n)
+
+include_mpn(`arm/v7a/cora15/neon/aorsorrlshC_n.asm')
diff --git a/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm
new file mode 100644
index 0000000000..b48204d926
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm
@@ -0,0 +1,43 @@
+dnl ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+
+ifdef(`OPERATION_addlsh2_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n)
+
+include_mpn(`arm/v7a/cora15/neon/aorsorrlshC_n.asm')
diff --git a/gmp/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm
new file mode 100644
index 0000000000..16c34a2699
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm
@@ -0,0 +1,144 @@
+dnl ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+C cycles/limb
+C StrongARM -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 5.25
+C Cortex-A15 2.25
+
+C TODO
+C * Consider using 4-way feed-in code.
+C * This is ad-hoc scheduled, perhaps unnecessarily so for A15, and perhaps
+C insufficiently for A7 and A8.
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n', `r3')
+
+ifdef(`DO_add', `
+ define(`ADCSBCS', `adcs $1, $2, $3')
+ define(`CLRCY', `cmn r13, #1')
+ define(`RETVAL', `adc r0, $1, #0')
+ define(`func', mpn_addlsh`'LSH`'_n)')
+ifdef(`DO_sub', `
+ define(`ADCSBCS', `sbcs $1, $2, $3')
+ define(`CLRCY', `cmp r13, #0')
+ define(`RETVAL', `sbc $2, $2, $2
+ cmn $2, #1
+ adc r0, $1, #0')
+ define(`func', mpn_sublsh`'LSH`'_n)')
+ifdef(`DO_rsb', `
+ define(`ADCSBCS', `sbcs $1, $3, $2')
+ define(`CLRCY', `cmp r13, #0')
+ define(`RETVAL', `sbc r0, $1, #0')
+ define(`func', mpn_rsblsh`'LSH`'_n)')
+
+
+ASM_START()
+PROLOGUE(func)
+ push {r4-r10}
+ vmov.i8 d0, #0 C could feed carry through here
+ CLRCY
+ tst n, #1
+ beq L(bb0)
+
+L(bb1): vld1.32 {d3[0]}, [vp]!
+ vsli.u32 d0, d3, #LSH
+ ldr r12, [up], #4
+ vmov.32 r5, d0[0]
+ vshr.u32 d0, d3, #32-LSH
+ ADCSBCS( r12, r12, r5)
+ str r12, [rp], #4
+ bics n, n, #1
+ beq L(rtn)
+
+L(bb0): tst n, #2
+ beq L(b00)
+
+L(b10): vld1.32 {d3}, [vp]!
+ vsli.u64 d0, d3, #LSH
+ ldmia up!, {r10,r12}
+ vmov r4, r5, d0
+ vshr.u64 d0, d3, #64-LSH
+ ADCSBCS( r10, r10, r4)
+ ADCSBCS( r12, r12, r5)
+ stmia rp!, {r10,r12}
+ bics n, n, #2
+ beq L(rtn)
+
+L(b00): vld1.32 {d2}, [vp]!
+ vsli.u64 d0, d2, #LSH
+ vshr.u64 d1, d2, #64-LSH
+ vld1.32 {d3}, [vp]!
+ vsli.u64 d1, d3, #LSH
+ vmov r6, r7, d0
+ vshr.u64 d0, d3, #64-LSH
+ sub n, n, #4
+ tst n, n
+ beq L(end)
+
+ ALIGN(16)
+L(top): ldmia up!, {r8,r9,r10,r12}
+ vld1.32 {d2}, [vp]!
+ vsli.u64 d0, d2, #LSH
+ vmov r4, r5, d1
+ vshr.u64 d1, d2, #64-LSH
+ ADCSBCS( r8, r8, r6)
+ ADCSBCS( r9, r9, r7)
+ vld1.32 {d3}, [vp]!
+ vsli.u64 d1, d3, #LSH
+ vmov r6, r7, d0
+ vshr.u64 d0, d3, #64-LSH
+ ADCSBCS( r10, r10, r4)
+ ADCSBCS( r12, r12, r5)
+ stmia rp!, {r8,r9,r10,r12}
+ sub n, n, #4
+ tst n, n
+ bne L(top)
+
+L(end): ldmia up!, {r8,r9,r10,r12}
+ vmov r4, r5, d1
+ ADCSBCS( r8, r8, r6)
+ ADCSBCS( r9, r9, r7)
+ ADCSBCS( r10, r10, r4)
+ ADCSBCS( r12, r12, r5)
+ stmia rp!, {r8,r9,r10,r12}
+L(rtn): vmov.32 r0, d0[0]
+ RETVAL( r0, r1)
+ pop {r4-r10}
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/neon/com.asm b/gmp/mpn/arm/v7a/cora15/neon/com.asm
new file mode 100644
index 0000000000..9e7a629287
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/neon/com.asm
@@ -0,0 +1,97 @@
+dnl ARM Neon mpn_com optimised for A15.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM ?
+C XScale ?
+C Cortex-A8 ?
+C Cortex-A9 2.1
+C Cortex-A15 0.65
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+
+ASM_START()
+PROLOGUE(mpn_com)
+ cmp n, #7
+ ble L(bc)
+
+C Perform a few initial operation until rp is 128-bit aligned
+ tst rp, #4
+ beq L(al1)
+ vld1.32 {d0[0]}, [up]!
+ sub n, n, #1
+ vmvn d0, d0
+ vst1.32 {d0[0]}, [rp]!
+L(al1): tst rp, #8
+ beq L(al2)
+ vld1.32 {d0}, [up]!
+ sub n, n, #2
+ vmvn d0, d0
+ vst1.32 {d0}, [rp:64]!
+L(al2): vld1.32 {q2}, [up]!
+ subs n, n, #12
+ blt L(end)
+
+ ALIGN(16)
+L(top): vld1.32 {q0}, [up]!
+ vmvn q2, q2
+ subs n, n, #8
+ vst1.32 {q2}, [rp:128]!
+ vld1.32 {q2}, [up]!
+ vmvn q0, q0
+ vst1.32 {q0}, [rp:128]!
+ bge L(top)
+
+L(end): vmvn q2, q2
+ vst1.32 {q2}, [rp:128]!
+
+C Handle last 0-7 limbs. Note that rp is aligned after loop, but not when we
+C arrive here via L(bc)
+L(bc): tst n, #4
+ beq L(tl1)
+ vld1.32 {q0}, [up]!
+ vmvn q0, q0
+ vst1.32 {q0}, [rp]!
+L(tl1): tst n, #2
+ beq L(tl2)
+ vld1.32 {d0}, [up]!
+ vmvn d0, d0
+ vst1.32 {d0}, [rp]!
+L(tl2): tst n, #1
+ beq L(tl3)
+ vld1.32 {d0[0]}, [up]
+ vmvn d0, d0
+ vst1.32 {d0[0]}, [rp]
+L(tl3): bx lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/neon/copyd.asm b/gmp/mpn/arm/v7a/cora15/neon/copyd.asm
new file mode 100644
index 0000000000..98fe535def
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/neon/copyd.asm
@@ -0,0 +1,110 @@
+dnl ARM Neon mpn_copyd optimised for A15.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 1.75 slower than core register code
+C Cortex-A15 0.52
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+ add rp, rp, n, lsl #2
+ add up, up, n, lsl #2
+
+ cmp n, #7
+ ble L(bc)
+
+C Copy until rp is 128-bit aligned
+ tst rp, #4
+ beq L(al1)
+ sub up, up, #4
+ vld1.32 {d22[0]}, [up]
+ sub n, n, #1
+ sub rp, rp, #4
+ vst1.32 {d22[0]}, [rp]
+L(al1): tst rp, #8
+ beq L(al2)
+ sub up, up, #8
+ vld1.32 {d22}, [up]
+ sub n, n, #2
+ sub rp, rp, #8
+ vst1.32 {d22}, [rp:64]
+L(al2): sub up, up, #16
+ vld1.32 {d26-d27}, [up]
+ subs n, n, #12
+ sub rp, rp, #16 C offset rp for loop
+ blt L(end)
+
+ sub up, up, #16 C offset up for loop
+ mov r12, #-16
+
+ ALIGN(16)
+L(top): vld1.32 {d22-d23}, [up], r12
+ vst1.32 {d26-d27}, [rp:128], r12
+ vld1.32 {d26-d27}, [up], r12
+ vst1.32 {d22-d23}, [rp:128], r12
+ subs n, n, #8
+ bge L(top)
+
+ add up, up, #16 C undo up offset
+ C rp offset undoing folded
+L(end): vst1.32 {d26-d27}, [rp:128]
+
+C Copy last 0-7 limbs. Note that rp is aligned after loop, but not when we
+C arrive here via L(bc)
+L(bc): tst n, #4
+ beq L(tl1)
+ sub up, up, #16
+ vld1.32 {d22-d23}, [up]
+ sub rp, rp, #16
+ vst1.32 {d22-d23}, [rp]
+L(tl1): tst n, #2
+ beq L(tl2)
+ sub up, up, #8
+ vld1.32 {d22}, [up]
+ sub rp, rp, #8
+ vst1.32 {d22}, [rp]
+L(tl2): tst n, #1
+ beq L(tl3)
+ sub up, up, #4
+ vld1.32 {d22[0]}, [up]
+ sub rp, rp, #4
+ vst1.32 {d22[0]}, [rp]
+L(tl3): bx lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/neon/copyi.asm b/gmp/mpn/arm/v7a/cora15/neon/copyi.asm
new file mode 100644
index 0000000000..2e05afe5e8
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/neon/copyi.asm
@@ -0,0 +1,90 @@
+dnl ARM Neon mpn_copyi optimised for A15.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 1.75 slower than core register code
+C Cortex-A15 0.52
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+ cmp n, #7
+ ble L(bc)
+
+C Copy until rp is 128-bit aligned
+ tst rp, #4
+ beq L(al1)
+ vld1.32 {d22[0]}, [up]!
+ sub n, n, #1
+ vst1.32 {d22[0]}, [rp]!
+L(al1): tst rp, #8
+ beq L(al2)
+ vld1.32 {d22}, [up]!
+ sub n, n, #2
+ vst1.32 {d22}, [rp:64]!
+L(al2): vld1.32 {d26-d27}, [up]!
+ subs n, n, #12
+ blt L(end)
+
+ ALIGN(16)
+L(top): vld1.32 {d22-d23}, [up]!
+ vst1.32 {d26-d27}, [rp:128]!
+ vld1.32 {d26-d27}, [up]!
+ vst1.32 {d22-d23}, [rp:128]!
+ subs n, n, #8
+ bge L(top)
+
+L(end): vst1.32 {d26-d27}, [rp:128]!
+
+C Copy last 0-7 limbs. Note that rp is aligned after loop, but not when we
+C arrive here via L(bc)
+L(bc): tst n, #4
+ beq L(tl1)
+ vld1.32 {d22-d23}, [up]!
+ vst1.32 {d22-d23}, [rp]!
+L(tl1): tst n, #2
+ beq L(tl2)
+ vld1.32 {d22}, [up]!
+ vst1.32 {d22}, [rp]!
+L(tl2): tst n, #1
+ beq L(tl3)
+ vld1.32 {d22[0]}, [up]
+ vst1.32 {d22[0]}, [rp]
+L(tl3): bx lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm b/gmp/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm
new file mode 100644
index 0000000000..2c11d6debd
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm
@@ -0,0 +1,177 @@
+dnl ARM Neon mpn_rsh1add_n, mpn_rsh1sub_n.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 4-5
+C Cortex-A15 2.5
+
+C TODO
+C * Try to make this smaller, its size (384 bytes) is excessive.
+C * Try to reach 2.25 c/l on A15, to match the addlsh_1 family.
+C * This is ad-hoc scheduled, perhaps unnecessarily so for A15, and perhaps
+C insufficiently for A7 and A8.
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n', `r3')
+
+ifdef(`OPERATION_rsh1add_n', `
+ define(`ADDSUBS', `adds $1, $2, $3')
+ define(`ADCSBCS', `adcs $1, $2, $3')
+ define(`IFADD', `$1')
+ define(`IFSUB', `')
+ define(`func', mpn_rsh1add_n)')
+ifdef(`OPERATION_rsh1sub_n', `
+ define(`ADDSUBS', `subs $1, $2, $3')
+ define(`ADCSBCS', `sbcs $1, $2, $3')
+ define(`IFADD', `')
+ define(`IFSUB', `$1')
+ define(`func', mpn_rsh1sub_n)')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
+
+ASM_START()
+PROLOGUE(func)
+ push {r4-r10}
+
+ ands r4, n, #3
+ beq L(b00)
+ cmp r4, #2
+ blo L(b01)
+ beq L(b10)
+
+L(b11): ldmia up!, {r9,r10,r12}
+ ldmia vp!, {r5,r6,r7}
+ ADDSUBS( r9, r9, r5)
+ vmov d4, r9, r9
+ ADCSBCS( r10, r10, r6)
+ ADCSBCS( r12, r12, r7)
+ vshr.u64 d3, d4, #1
+ vmov d1, r10, r12
+ vsli.u64 d3, d1, #31
+ vshr.u64 d2, d1, #1
+ vst1.32 d3[0], [rp]!
+ bics n, n, #3
+ beq L(wd2)
+L(gt3): ldmia up!, {r8,r9,r10,r12}
+ ldmia vp!, {r4,r5,r6,r7}
+ b L(mi0)
+
+L(b10): ldmia up!, {r10,r12}
+ ldmia vp!, {r6,r7}
+ ADDSUBS( r10, r10, r6)
+ ADCSBCS( r12, r12, r7)
+ vmov d4, r10, r12
+ bics n, n, #2
+ vshr.u64 d2, d4, #1
+ beq L(wd2)
+L(gt2): ldmia up!, {r8,r9,r10,r12}
+ ldmia vp!, {r4,r5,r6,r7}
+ b L(mi0)
+
+L(b01): ldr r12, [up], #4
+ ldr r7, [vp], #4
+ ADDSUBS( r12, r12, r7)
+ vmov d4, r12, r12
+ bics n, n, #1
+ bne L(gt1)
+ mov r5, r12, lsr #1
+IFADD(` adc r1, n, #0')
+IFSUB(` adc r1, n, #1')
+ bfi r5, r1, #31, #1
+ str r5, [rp]
+ and r0, r12, #1
+ pop {r4-r10}
+ bx r14
+L(gt1): ldmia up!, {r8,r9,r10,r12}
+ ldmia vp!, {r4,r5,r6,r7}
+ vshr.u64 d2, d4, #1
+ ADCSBCS( r8, r8, r4)
+ ADCSBCS( r9, r9, r5)
+ vmov d0, r8, r9
+ ADCSBCS( r10, r10, r6)
+ ADCSBCS( r12, r12, r7)
+ vsli.u64 d2, d0, #31
+ vshr.u64 d3, d0, #1
+ vst1.32 d2[0], [rp]!
+ b L(mi1)
+
+L(b00): ldmia up!, {r8,r9,r10,r12}
+ ldmia vp!, {r4,r5,r6,r7}
+ ADDSUBS( r8, r8, r4)
+ ADCSBCS( r9, r9, r5)
+ vmov d4, r8, r9
+ ADCSBCS( r10, r10, r6)
+ ADCSBCS( r12, r12, r7)
+ vshr.u64 d3, d4, #1
+ b L(mi1)
+
+ ALIGN(16)
+L(top): ldmia up!, {r8,r9,r10,r12}
+ ldmia vp!, {r4,r5,r6,r7}
+ vsli.u64 d3, d1, #63
+ vshr.u64 d2, d1, #1
+ vst1.32 d3, [rp]!
+L(mi0): ADCSBCS( r8, r8, r4)
+ ADCSBCS( r9, r9, r5)
+ vmov d0, r8, r9
+ ADCSBCS( r10, r10, r6)
+ ADCSBCS( r12, r12, r7)
+ vsli.u64 d2, d0, #63
+ vshr.u64 d3, d0, #1
+ vst1.32 d2, [rp]!
+L(mi1): vmov d1, r10, r12
+ sub n, n, #4
+ tst n, n
+ bne L(top)
+
+L(end): vsli.u64 d3, d1, #63
+ vshr.u64 d2, d1, #1
+ vst1.32 d3, [rp]!
+L(wd2): vmov r4, r5, d2
+IFADD(` adc r1, n, #0')
+IFSUB(` adc r1, n, #1')
+ bfi r5, r1, #31, #1
+ stm rp, {r4,r5}
+
+L(rtn): vmov.32 r0, d4[0]
+ and r0, r0, #1
+ pop {r4-r10}
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/submul_1.asm b/gmp/mpn/arm/v7a/cora15/submul_1.asm
new file mode 100644
index 0000000000..ed7bfe820b
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/submul_1.asm
@@ -0,0 +1,159 @@
+dnl ARM mpn_submul_1 optimised for A15.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb best
+C StrongARM: -
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 5.75 3.75
+C Cortex-A15 2.32 this
+
+C This code uses umlal and umaal for adding in the rp[] data, keeping the
+C recurrency path separate from any multiply instructions. It performs well on
+C A15, but not quite at the multiply bandwidth like the corresponding addmul_1
+C code.
+C
+C We don't use r12 due to ldrd and strd limitations.
+C
+C This loop complements U on the fly,
+C U' = B^n - 1 - U
+C and then uses that
+C R - U*v = R + U'*v + v - B^n v
+
+C Architecture requirements:
+C v5 -
+C v5t -
+C v5te ldrd strd
+C v6 umaal
+C v6t2 -
+C v7a -
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+define(`v0', `r3')
+
+define(`w0', `r10') define(`w1', `r11')
+define(`u0', `r8') define(`u1', `r9')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ sub sp, sp, #32
+ strd r10, r11, [sp, #24]
+ strd r8, r9, [sp, #16]
+ strd r6, r7, [sp, #8]
+ strd r4, r5, [sp, #0]
+C push { r4-r11 }
+
+ ands r6, n, #3
+ sub n, n, #3
+ beq L(b00)
+ cmp r6, #2
+ bcc L(b01)
+ beq L(b10)
+
+L(b11): mov r6, #0
+ ldr u1, [up], #-4
+ ldr w1, [rp], #-16
+ mvn u1, u1
+ adds r7, v0, #0
+ b L(mid)
+
+L(b00): ldrd u0, u1, [up]
+ ldrd w0, w1, [rp], #-12
+ mvn u0, u0
+ mvn u1, u1
+ mov r6, v0
+ umaal w0, r6, u0, v0
+ cmn r13, #0 C carry clear
+ mov r7, #0
+ str w0, [rp, #12]
+ b L(mid)
+
+L(b10): ldrd u0, u1, [up], #8
+ ldrd w0, w1, [rp]
+ mvn u0, u0
+ mvn u1, u1
+ mov r4, v0
+ umaal w0, r4, u0, v0
+ mov r5, #0
+ str w0, [rp], #-4
+ umlal w1, r5, u1, v0
+ adds n, n, #0
+ bmi L(end)
+ b L(top)
+
+L(b01): ldr u1, [up], #4
+ ldr w1, [rp], #-8
+ mvn u1, u1
+ mov r5, v0
+ mov r4, #0
+ umaal w1, r5, u1, v0
+ tst n, n
+ bmi L(end)
+
+C ALIGN(16)
+L(top): ldrd u0, u1, [up, #0]
+ adcs r4, r4, w1
+ mvn u0, u0
+ ldrd w0, w1, [rp, #12]
+ mvn u1, u1
+ mov r6, #0
+ umlal w0, r6, u0, v0 C 1 2
+ adcs r5, r5, w0
+ mov r7, #0
+ strd r4, r5, [rp, #8]
+L(mid): umaal w1, r7, u1, v0 C 2 3
+ ldrd u0, u1, [up, #8]
+ add up, up, #16
+ adcs r6, r6, w1
+ mvn u0, u0
+ ldrd w0, w1, [rp, #20]
+ mvn u1, u1
+ mov r4, #0
+ umlal w0, r4, u0, v0 C 3 4
+ adcs r7, r7, w0
+ mov r5, #0
+ strd r6, r7, [rp, #16]!
+ sub n, n, #4
+ umlal w1, r5, u1, v0 C 0 1
+ tst n, n
+ bpl L(top)
+
+L(end): adcs r4, r4, w1
+ str r4, [rp, #8]
+ adc r0, r5, #0
+ sub r0, v0, r0
+ pop { r4-r11 }
+ bx r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora9/gmp-mparam.h b/gmp/mpn/arm/v7a/cora9/gmp-mparam.h
new file mode 100644
index 0000000000..9660257820
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora9/gmp-mparam.h
@@ -0,0 +1,209 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010, 2012-2014 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 1000MHz Cortex-A9 */
+/* FFT tuning limit = 25000000 */
+/* Generated by tuneup.c, 2014-03-12, gcc 4.6 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1
+#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 20
+
+#define MUL_TOOM22_THRESHOLD 45
+#define MUL_TOOM33_THRESHOLD 129
+#define MUL_TOOM44_THRESHOLD 387
+#define MUL_TOOM6H_THRESHOLD 517
+#define MUL_TOOM8H_THRESHOLD 774
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 137
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 222
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 137
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 235
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 208
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 54
+#define SQR_TOOM3_THRESHOLD 181
+#define SQR_TOOM4_THRESHOLD 490
+#define SQR_TOOM6_THRESHOLD 656
+#define SQR_TOOM8_THRESHOLD 0 /* always */
+
+#define MULMID_TOOM42_THRESHOLD 64
+
+#define MULMOD_BNM1_THRESHOLD 26
+#define SQRMOD_BNM1_THRESHOLD 28
+
+#define MUL_FFT_MODF_THRESHOLD 624 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 624, 5}, { 28, 6}, { 15, 5}, { 34, 6}, \
+ { 18, 5}, { 37, 6}, { 28, 7}, { 15, 6}, \
+ { 36, 7}, { 19, 6}, { 40, 7}, { 21, 6}, \
+ { 43, 7}, { 23, 6}, { 47, 7}, { 25, 6}, \
+ { 51, 7}, { 27, 6}, { 55, 7}, { 29, 8}, \
+ { 15, 7}, { 31, 6}, { 63, 7}, { 37, 8}, \
+ { 19, 7}, { 43, 8}, { 23, 7}, { 51, 8}, \
+ { 27, 7}, { 57, 9}, { 15, 8}, { 31, 7}, \
+ { 65, 8}, { 35, 7}, { 71, 8}, { 43, 9}, \
+ { 23, 8}, { 55,10}, { 15, 9}, { 31, 8}, \
+ { 71, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \
+ { 99, 9}, { 55,10}, { 31, 9}, { 79,10}, \
+ { 47, 9}, { 103,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 167,10}, { 95, 9}, \
+ { 191,10}, { 111,11}, { 63,10}, { 159,11}, \
+ { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \
+ { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \
+ { 671,10}, { 351,11}, { 191,10}, { 383, 9}, \
+ { 767,10}, { 399, 9}, { 799,10}, { 415,11}, \
+ { 223,12}, { 127,11}, { 255,10}, { 511, 9}, \
+ { 1023,10}, { 543,11}, { 287,10}, { 575, 9}, \
+ { 1151,11}, { 319,10}, { 671,11}, { 351,12}, \
+ { 191,11}, { 383,10}, { 799,11}, { 415,10}, \
+ { 831,13}, { 127,12}, { 255,11}, { 511,10}, \
+ { 1023,11}, { 607,12}, { 319,11}, { 735,12}, \
+ { 383,11}, { 863,12}, { 447,11}, { 959,13}, \
+ { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \
+ { 1215,12}, { 639,11}, { 1279,12}, { 703,13}, \
+ { 383,12}, { 767,11}, { 1535,12}, { 831,11}, \
+ { 1663,12}, { 959,14}, { 255,13}, { 511,12}, \
+ { 1023,11}, { 2047,12}, { 1215,13}, { 639,12}, \
+ { 1407,13}, { 767,12}, { 1663,13}, { 895,12}, \
+ { 1791,14}, { 511,13}, { 1023,12}, { 2111,13}, \
+ { 1151,12}, { 2431,13}, { 1279,12}, { 2559,13}, \
+ { 1407,14}, { 767,13}, { 1535,12}, { 3071,13}, \
+ { 1663,12}, { 3455,13}, { 1791,15}, { 511,14}, \
+ { 1023,13}, { 2047,12}, { 4095,13}, { 2175,12}, \
+ { 4351,13}, { 2431,14}, { 1279,13}, { 2559,12}, \
+ { 5119,13}, { 2815,14}, { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 160
+#define MUL_FFT_THRESHOLD 6784
+
+#define SQR_FFT_MODF_THRESHOLD 560 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 560, 5}, { 19, 4}, { 39, 5}, { 21, 4}, \
+ { 43, 5}, { 29, 6}, { 15, 5}, { 33, 6}, \
+ { 17, 5}, { 35, 6}, { 36, 7}, { 19, 6}, \
+ { 40, 7}, { 21, 6}, { 43, 7}, { 23, 6}, \
+ { 47, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 43, 8}, { 23, 7}, { 49, 8}, \
+ { 27, 7}, { 55, 9}, { 15, 8}, { 31, 7}, \
+ { 65, 8}, { 35, 7}, { 71, 8}, { 43, 9}, \
+ { 23, 8}, { 55, 9}, { 31, 8}, { 71, 9}, \
+ { 39, 8}, { 83, 9}, { 47, 8}, { 95, 9}, \
+ { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 103,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 159,10}, { 95, 9}, { 191,10}, \
+ { 111,11}, { 63,10}, { 159,11}, { 95,10}, \
+ { 191, 9}, { 383,10}, { 207,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511, 8}, { 1023, 9}, \
+ { 543,10}, { 287,11}, { 159,10}, { 319, 9}, \
+ { 639,10}, { 335, 9}, { 671,10}, { 351,11}, \
+ { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \
+ { 799,10}, { 415, 9}, { 831,11}, { 223,12}, \
+ { 127,11}, { 255,10}, { 511, 9}, { 1023,10}, \
+ { 543,11}, { 287,10}, { 575, 9}, { 1151,10}, \
+ { 607,11}, { 319,10}, { 671,11}, { 351,10}, \
+ { 703,12}, { 191,11}, { 383,10}, { 799,11}, \
+ { 415,10}, { 831,13}, { 127,11}, { 511,10}, \
+ { 1023,11}, { 543,10}, { 1087,11}, { 575,10}, \
+ { 1151,11}, { 607,12}, { 319,11}, { 671,10}, \
+ { 1343,11}, { 735,12}, { 383,11}, { 863,12}, \
+ { 447,11}, { 959,12}, { 511,11}, { 1087,12}, \
+ { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \
+ { 703,11}, { 1407,13}, { 383,12}, { 767,11}, \
+ { 1599,12}, { 831,11}, { 1663,12}, { 895,11}, \
+ { 1791,12}, { 959,13}, { 511,12}, { 1023,11}, \
+ { 2047,12}, { 1215,13}, { 639,12}, { 1407,13}, \
+ { 767,12}, { 1663,13}, { 895,12}, { 1791,14}, \
+ { 511,13}, { 1023,12}, { 2111,13}, { 1151,12}, \
+ { 2431,13}, { 1279,12}, { 2559,13}, { 1407,14}, \
+ { 767,13}, { 1535,12}, { 3071,13}, { 1663,12}, \
+ { 3455,13}, { 1791,15}, { 511,14}, { 1023,13}, \
+ { 2047,12}, { 4095,13}, { 2175,12}, { 4351,13}, \
+ { 2431,14}, { 1279,13}, { 2559,12}, { 5119,13}, \
+ { 2815,14}, { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 167
+#define SQR_FFT_THRESHOLD 5312
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 38
+#define MULLO_MUL_N_THRESHOLD 13463
+
+#define DC_DIV_QR_THRESHOLD 42
+#define DC_DIVAPPR_Q_THRESHOLD 100
+#define DC_BDIV_QR_THRESHOLD 43
+#define DC_BDIV_Q_THRESHOLD 104
+
+#define INV_MULMOD_BNM1_THRESHOLD 98
+#define INV_NEWTON_THRESHOLD 138
+#define INV_APPR_THRESHOLD 133
+
+#define BINV_NEWTON_THRESHOLD 333
+#define REDC_1_TO_REDC_2_THRESHOLD 2
+#define REDC_2_TO_REDC_N_THRESHOLD 142
+
+#define MU_DIV_QR_THRESHOLD 2350
+#define MU_DIVAPPR_Q_THRESHOLD 2259
+#define MUPI_DIV_QR_THRESHOLD 70
+#define MU_BDIV_QR_THRESHOLD 2089
+#define MU_BDIV_Q_THRESHOLD 2172
+
+#define POWM_SEC_TABLE 37,48,81,615,1925
+
+#define MATRIX22_STRASSEN_THRESHOLD 22
+#define HGCD_THRESHOLD 64
+#define HGCD_APPR_THRESHOLD 50
+#define HGCD_REDUCE_THRESHOLD 4284
+#define GCD_DC_THRESHOLD 416
+#define GCDEXT_DC_THRESHOLD 298
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 18
+#define GET_STR_PRECOMPUTE_THRESHOLD 33
+#define SET_STR_DC_THRESHOLD 140
+#define SET_STR_PRECOMPUTE_THRESHOLD 748
+
+#define FAC_DSC_THRESHOLD 309
+#define FAC_ODD_THRESHOLD 29