diff options
Diffstat (limited to 'gmp/mpn/arm/v7a/cora15')
-rw-r--r-- | gmp/mpn/arm/v7a/cora15/addmul_1.asm | 145 | ||||
-rw-r--r-- | gmp/mpn/arm/v7a/cora15/aors_n.asm | 162 | ||||
-rw-r--r-- | gmp/mpn/arm/v7a/cora15/cnd_aors_n.asm | 158 | ||||
-rw-r--r-- | gmp/mpn/arm/v7a/cora15/com.asm | 180 | ||||
-rw-r--r-- | gmp/mpn/arm/v7a/cora15/gmp-mparam.h | 197 | ||||
-rw-r--r-- | gmp/mpn/arm/v7a/cora15/logops_n.asm | 253 | ||||
-rw-r--r-- | gmp/mpn/arm/v7a/cora15/mul_1.asm | 104 | ||||
-rw-r--r-- | gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm | 43 | ||||
-rw-r--r-- | gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm | 43 | ||||
-rw-r--r-- | gmp/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm | 144 | ||||
-rw-r--r-- | gmp/mpn/arm/v7a/cora15/neon/com.asm | 97 | ||||
-rw-r--r-- | gmp/mpn/arm/v7a/cora15/neon/copyd.asm | 110 | ||||
-rw-r--r-- | gmp/mpn/arm/v7a/cora15/neon/copyi.asm | 90 | ||||
-rw-r--r-- | gmp/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm | 177 | ||||
-rw-r--r-- | gmp/mpn/arm/v7a/cora15/submul_1.asm | 159 |
15 files changed, 0 insertions, 2062 deletions
diff --git a/gmp/mpn/arm/v7a/cora15/addmul_1.asm b/gmp/mpn/arm/v7a/cora15/addmul_1.asm deleted file mode 100644 index c2277b32b2..0000000000 --- a/gmp/mpn/arm/v7a/cora15/addmul_1.asm +++ /dev/null @@ -1,145 +0,0 @@ -dnl ARM mpn_addmul_1 optimised for A15. - -dnl Copyright 2012, 2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb best -C StrongARM: - -C XScale ? -C Cortex-A7 ? -C Cortex-A8 ? -C Cortex-A9 6 3.25 -C Cortex-A15 2 this - -C This code uses umlal for adding in the rp[] data, keeping the recurrency path -C separate from any multiply instructions. It performs well on A15, at umlal's -C bandwidth. -C -C An A9 variant should perhaps stick to 3-way unrolling, and use ldm and stm -C for all loads and stores. Alternatively, it could do 2-way or 4-way, but -C then alignment aware code will be necessary (adding O(1) bookkeeping -C overhead). -C -C We don't use r12 due to ldrd and strd limitations. - -C Architecture requirements: -C v5 - -C v5t - -C v5te ldrd strd -C v6 - -C v6t2 - -C v7a - - -define(`rp', `r0') -define(`up', `r1') -define(`n', `r2') -define(`v0', `r3') - -define(`w0', `r10') define(`w1', `r11') -define(`u0', `r8') define(`u1', `r9') - -ASM_START() -PROLOGUE(mpn_addmul_1) - push { r4-r11 } - - ands r6, n, #3 - sub n, n, #3 - beq L(b00) - cmp r6, #2 - bcc L(b01) - beq L(b10) - -L(b11): mov r6, #0 - cmn r13, #0 C carry clear - ldr u1, [up], #-4 - ldr w1, [rp], #-4 - mov r7, #0 - b L(mid) - -L(b00): ldrd u0, u1, [up] - ldrd w0, w1, [rp] - mov r6, #0 - umlal w0, r6, u0, v0 - cmn r13, #0 C carry clear - mov r7, #0 - str w0, [rp] - b L(mid) - -L(b10): ldrd u0, u1, [up], #8 - ldrd w0, w1, [rp] - mov r4, #0 - umlal w0, r4, u0, v0 - cmn r13, #0 C carry clear - mov r5, #0 - str w0, [rp], #8 - umlal w1, r5, u1, v0 - tst n, n - bmi L(end) - b L(top) - -L(b01): mov r4, #0 - ldr u1, [up], #4 - ldr w1, [rp], #4 - mov r5, #0 - umlal w1, r5, u1, v0 - tst n, n - bmi L(end) - - ALIGN(16) -L(top): ldrd u0, u1, [up, #0] - adcs r4, r4, w1 - ldrd w0, w1, [rp, #0] - mov r6, #0 - umlal w0, r6, u0, v0 C 1 2 - adcs r5, r5, w0 - mov r7, #0 - strd r4, r5, [rp, #-4] -L(mid): umlal w1, r7, u1, v0 C 2 3 - ldrd u0, u1, [up, #8] - adcs r6, r6, w1 - ldrd w0, w1, [rp, #8] - mov r4, #0 - umlal w0, r4, u0, v0 C 3 4 - adcs r7, r7, w0 - mov r5, #0 - strd r6, r7, [rp, #4] - umlal w1, r5, u1, v0 C 0 1 - sub n, n, #4 - add up, up, #16 - add rp, rp, #16 - tst n, n - bpl L(top) - -L(end): adcs r4, r4, w1 - str r4, [rp, #-4] - adc r0, r5, #0 - pop { r4-r11 } - bx r14 -EPILOGUE() diff --git a/gmp/mpn/arm/v7a/cora15/aors_n.asm b/gmp/mpn/arm/v7a/cora15/aors_n.asm deleted file mode 100644 index dc3f83992e..0000000000 --- a/gmp/mpn/arm/v7a/cora15/aors_n.asm +++ /dev/null @@ -1,162 +0,0 @@ -dnl ARM mpn_add_n/mpn_sub_n optimised for A15. - -dnl Copyright 2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb best -C StrongARM: - -C XScale ? -C Cortex-A7 ? -C Cortex-A8 ? -C Cortex-A9 3.55 2.5 -C Cortex-A15 1.27 this - -C This was a major improvement compared to the code we had before, but it might -C not be the best 8-way code possible. We've tried some permutations of auto- -C increments and separate pointer updates, but they all ran at the same speed -C on A15. - -C Architecture requirements: -C v5 - -C v5t - -C v5te ldrd strd -C v6 - -C v6t2 - -C v7a - - -define(`rp', `r0') -define(`up', `r1') -define(`vp', `r2') -define(`n', `r3') - -ifdef(`OPERATION_add_n', ` - define(`ADDSUBC', adcs) - define(`IFADD', `$1') - define(`SETCY', `cmp $1, #1') - define(`RETVAL', `adc r0, n, #0') - define(`RETVAL2', `adc r0, n, #1') - define(`func', mpn_add_n) - define(`func_nc', mpn_add_nc)') -ifdef(`OPERATION_sub_n', ` - define(`ADDSUBC', sbcs) - define(`IFADD', `') - define(`SETCY', `rsbs $1, $1, #0') - define(`RETVAL', `sbc r0, r0, r0 - and r0, r0, #1') - define(`RETVAL2', `RETVAL') - define(`func', mpn_sub_n) - define(`func_nc', mpn_sub_nc)') - -MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) - -ASM_START() -PROLOGUE(func_nc) - ldr r12, [sp] - b L(ent) -EPILOGUE() -PROLOGUE(func) - mov r12, #0 -L(ent): push { r4-r9 } - - ands r6, n, #3 - mov n, n, lsr #2 - beq L(b00) - cmp r6, #2 - bcc L(b01) - beq L(b10) - -L(b11): ldr r5, [up], #4 - ldr r7, [vp], #4 - SETCY( r12) - ADDSUBC r9, r5, r7 - ldrd r4, r5, [up, #0] - ldrd r6, r7, [vp, #0] - str r9, [rp], #-4 - b L(lo) - -L(b00): ldrd r4, r5, [up], #-8 - ldrd r6, r7, [vp], #-8 - SETCY( r12) - sub rp, rp, #16 - b L(mid) - -L(b01): ldr r5, [up], #-4 - ldr r7, [vp], #-4 - SETCY( r12) - ADDSUBC r9, r5, r7 - str r9, [rp], #-12 - tst n, n - beq L(wd1) -L(gt1): ldrd r4, r5, [up, #8] - ldrd r6, r7, [vp, #8] - b L(mid) - -L(b10): ldrd r4, r5, [up] - ldrd r6, r7, [vp] - SETCY( r12) - sub rp, rp, #8 - b L(lo) - - ALIGN(16) -L(top): ldrd r4, r5, [up, #8] - ldrd r6, r7, [vp, #8] - strd r8, r9, [rp, #8] -L(mid): ADDSUBC r8, r4, r6 - ADDSUBC r9, r5, r7 - ldrd r4, r5, [up, #16] - ldrd r6, r7, [vp, #16] - strd r8, r9, [rp, #16] - ADDSUBC r8, r4, r6 - ADDSUBC r9, r5, r7 - sub n, n, #2 - tst n, n - bmi L(dne) - ldrd r4, r5, [up, #24] - ldrd r6, r7, [vp, #24] - strd r8, r9, [rp, #24] - ADDSUBC r8, r4, r6 - ADDSUBC r9, r5, r7 - ldrd r4, r5, [up, #32]! - ldrd r6, r7, [vp, #32]! - strd r8, r9, [rp, #32]! -L(lo): ADDSUBC r8, r4, r6 - ADDSUBC r9, r5, r7 - tst n, n - bne L(top) - -L(end): strd r8, r9, [rp, #8] -L(wd1): RETVAL - pop { r4-r9 } - bx r14 -L(dne): strd r8, r9, [rp, #24] - RETVAL2 - pop { r4-r9 } - bx r14 -EPILOGUE() diff --git a/gmp/mpn/arm/v7a/cora15/cnd_aors_n.asm b/gmp/mpn/arm/v7a/cora15/cnd_aors_n.asm deleted file mode 100644 index b9e5cd3f79..0000000000 --- a/gmp/mpn/arm/v7a/cora15/cnd_aors_n.asm +++ /dev/null @@ -1,158 +0,0 @@ -dnl ARM mpn_cnd_add_n/mpn_cnd_sub_n optimised for A15. - -dnl Copyright 2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb best -C StrongARM: - -C XScale ? -C Cortex-A7 ? -C Cortex-A8 ? -C Cortex-A9 3.75 3 -C Cortex-A15 1.78 this - -C This code does not run as well as one could have hoped, since 1.5 c/l seems -C realistic for this insn mix. - -C Architecture requirements: -C v5 - -C v5t - -C v5te ldrd strd -C v6 - -C v6t2 - -C v7a - - -define(`cnd',`r0') -define(`rp', `r1') -define(`up', `r2') -define(`vp', `r3') -define(`n', `r12') - -ifdef(`OPERATION_cnd_add_n', ` - define(`ADDSUB', adds) - define(`ADDSUBC', adcs) - define(`IFADD', `$1') - define(`INITCY', `cmn r0, #0') - define(`RETVAL', `adc r0, n, #0') - define(`RETVAL2', `adc r0, n, #1') - define(`func', mpn_cnd_add_n) - define(`func_nc', mpn_add_nc)') -ifdef(`OPERATION_cnd_sub_n', ` - define(`ADDSUB', subs) - define(`ADDSUBC', sbcs) - define(`IFADD', `') - define(`INITCY', `cmp r0, #0') - define(`RETVAL', `sbc r0, r0, r0 - and r0, r0, #1') - define(`RETVAL2', `RETVAL') - define(`func', mpn_cnd_sub_n) - define(`func_nc', mpn_sub_nc)') - -MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n) - -ASM_START() -PROLOGUE(func) - ldr n, [sp] - push { r4-r9 } - - cmp cnd, #1 - sbc cnd, cnd, cnd C conditionally set to 0xffffffff - - ands r6, n, #3 - mov n, n, lsr #2 - beq L(b00) - cmp r6, #2 - bcc L(b01) - beq L(b10) - -L(b11): ldr r5, [up], #4 - ldr r7, [vp], #4 - bic r7, r7, cnd - ADDSUB r9, r5, r7 - ldrd r4, r5, [up, #0] - ldrd r6, r7, [vp, #0] - bic r6, r6, cnd - bic r7, r7, cnd - str r9, [rp], #-4 - b L(lo) - -L(b00): ldrd r4, r5, [up], #-8 - ldrd r6, r7, [vp], #-8 - bic r6, r6, cnd - bic r7, r7, cnd - INITCY - sub rp, rp, #16 - b L(mid) - -L(b01): ldr r5, [up], #-4 - ldr r7, [vp], #-4 - bic r7, r7, cnd - ADDSUB r9, r5, r7 - str r9, [rp], #-12 - tst n, n - beq L(wd1) -L(gt1): ldrd r4, r5, [up, #8] - ldrd r6, r7, [vp, #8] - bic r6, r6, cnd - bic r7, r7, cnd - b L(mid) - -L(b10): ldrd r4, r5, [up] - ldrd r6, r7, [vp] - bic r6, r6, cnd - bic r7, r7, cnd - INITCY - sub rp, rp, #8 - b L(lo) - - ALIGN(16) -L(top): ldrd r6, r7, [vp, #8] - ldrd r4, r5, [up, #8] - bic r6, r6, cnd - bic r7, r7, cnd - strd r8, r9, [rp, #8] -L(mid): ADDSUBC r8, r4, r6 - ADDSUBC r9, r5, r7 - ldrd r6, r7, [vp, #16]! - ldrd r4, r5, [up, #16]! - bic r6, r6, cnd - bic r7, r7, cnd - sub n, n, #1 - strd r8, r9, [rp, #16]! -L(lo): ADDSUBC r8, r4, r6 - ADDSUBC r9, r5, r7 - tst n, n - bne L(top) - -L(end): strd r8, r9, [rp, #8] -L(wd1): RETVAL - pop { r4-r9 } - bx r14 -EPILOGUE() diff --git a/gmp/mpn/arm/v7a/cora15/com.asm b/gmp/mpn/arm/v7a/cora15/com.asm deleted file mode 100644 index a258afe934..0000000000 --- a/gmp/mpn/arm/v7a/cora15/com.asm +++ /dev/null @@ -1,180 +0,0 @@ -dnl ARM mpn_com optimised for A15. - -dnl Contributed to the GNU project by Torbjörn Granlund. - -dnl Copyright 2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb -C StrongARM ? -C XScale ? -C Cortex-A7 ? -C Cortex-A8 ? -C Cortex-A9 2.5 -C Cortex-A15 1.0 - -C This is great A15 core register code, but it is a bit large. -C We use FEEDIN_VARIANT 1 to save some space, but use 8-way unrolling. - -C Architecture requirements: -C v5 - -C v5t - -C v5te ldrd strd -C v6 - -C v6t2 - -C v7a - - -define(`FEEDIN_VARIANT', 1) C alternatives: 0 1 2 -define(`UNROLL', 4x2) C alternatives: 4 4x2 - -define(`rp', `r0') -define(`up', `r1') -define(`n', `r2') - -ASM_START() -PROLOGUE(mpn_com) - push { r4-r5,r8-r9 } - -ifelse(FEEDIN_VARIANT,0,` - ands r12, n, #3 - mov n, n, lsr #2 - beq L(b00a) - tst r12, #1 - beq L(bx0) - ldr r5, [up], #4 - mvn r9, r5 - str r9, [rp], #4 - tst r12, #2 - beq L(b00) -L(bx0): ldrd r4, r5, [up, #0] - sub rp, rp, #8 - b L(lo) -L(b00): tst n, n - beq L(wd1) -L(b00a):ldrd r4, r5, [up], #-8 - sub rp, rp, #16 - b L(mid) -') -ifelse(FEEDIN_VARIANT,1,` - and r12, n, #3 - mov n, n, lsr #2 - tst r12, #1 - beq L(bx0) - ldr r5, [up], #4 - mvn r9, r5 - str r9, [rp], #4 -L(bx0): tst r12, #2 - beq L(b00) - ldrd r4, r5, [up, #0] - sub rp, rp, #8 - b L(lo) -L(b00): tst n, n - beq L(wd1) - ldrd r4, r5, [up], #-8 - sub rp, rp, #16 - b L(mid) -') -ifelse(FEEDIN_VARIANT,2,` - ands r12, n, #3 - mov n, n, lsr #2 - beq L(b00) - cmp r12, #2 - bcc L(b01) - beq L(b10) - -L(b11): ldr r5, [up], #4 - mvn r9, r5 - ldrd r4, r5, [up, #0] - str r9, [rp], #-4 - b L(lo) - -L(b00): ldrd r4, r5, [up], #-8 - sub rp, rp, #16 - b L(mid) - -L(b01): ldr r5, [up], #-4 - mvn r9, r5 - str r9, [rp], #-12 - tst n, n - beq L(wd1) -L(gt1): ldrd r4, r5, [up, #8] - b L(mid) - -L(b10): ldrd r4, r5, [up] - sub rp, rp, #8 - b L(lo) -') - ALIGN(16) -ifelse(UNROLL,4,` -L(top): ldrd r4, r5, [up, #8] - strd r8, r9, [rp, #8] -L(mid): mvn r8, r4 - mvn r9, r5 - ldrd r4, r5, [up, #16]! - strd r8, r9, [rp, #16]! - sub n, n, #1 -L(lo): mvn r8, r4 - mvn r9, r5 - tst n, n - bne L(top) -') -ifelse(UNROLL,4x2,` -L(top): ldrd r4, r5, [up, #8] - strd r8, r9, [rp, #8] -L(mid): mvn r8, r4 - mvn r9, r5 - ldrd r4, r5, [up, #16] - strd r8, r9, [rp, #16] - mvn r8, r4 - mvn r9, r5 - sub n, n, #2 - tst n, n - bmi L(dne) - ldrd r4, r5, [up, #24] - strd r8, r9, [rp, #24] - mvn r8, r4 - mvn r9, r5 - ldrd r4, r5, [up, #32]! - strd r8, r9, [rp, #32]! -L(lo): mvn r8, r4 - mvn r9, r5 - tst n, n - bne L(top) -') - -L(end): strd r8, r9, [rp, #8] -L(wd1): pop { r4-r5,r8-r9 } - bx r14 -ifelse(UNROLL,4x2,` -L(dne): strd r8, r9, [rp, #24] - pop { r4-r5,r8-r9 } - bx r14 -') -EPILOGUE() diff --git a/gmp/mpn/arm/v7a/cora15/gmp-mparam.h b/gmp/mpn/arm/v7a/cora15/gmp-mparam.h deleted file mode 100644 index 2a06532b3e..0000000000 --- a/gmp/mpn/arm/v7a/cora15/gmp-mparam.h +++ /dev/null @@ -1,197 +0,0 @@ -/* gmp-mparam.h -- Compiler/machine parameter header file. - -Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010, 2012-2014 Free Software -Foundation, Inc. - -This file is part of the GNU MP Library. - -The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of either: - - * the GNU Lesser General Public License as published by the Free - Software Foundation; either version 3 of the License, or (at your - option) any later version. - -or - - * the GNU General Public License as published by the Free Software - Foundation; either version 2 of the License, or (at your option) any - later version. - -or both in parallel, as here. - -The GNU MP Library is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. - -You should have received copies of the GNU General Public License and the -GNU Lesser General Public License along with the GNU MP Library. If not, -see https://www.gnu.org/licenses/. */ - -#define GMP_LIMB_BITS 32 -#define GMP_LIMB_BYTES 4 - -/* 1700MHz Cortex-A15 with Neon (in spite of file position) */ -/* FFT tuning limit = 25000000 */ -/* Generated by tuneup.c, 2014-03-12, gcc 4.6 */ - -#define MOD_1_NORM_THRESHOLD 0 /* always */ -#define MOD_1_UNNORM_THRESHOLD 0 /* always */ -#define MOD_1N_TO_MOD_1_1_THRESHOLD 3 -#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 -#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9 -#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 -#define USE_PREINV_DIVREM_1 1 /* native */ -#define DIV_QR_1N_PI1_METHOD 1 -#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ -#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ -#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ -#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ -#define BMOD_1_TO_MOD_1_THRESHOLD 15 - -#define MUL_TOOM22_THRESHOLD 23 -#define MUL_TOOM33_THRESHOLD 90 -#define MUL_TOOM44_THRESHOLD 262 -#define MUL_TOOM6H_THRESHOLD 351 -#define MUL_TOOM8H_THRESHOLD 557 - -#define MUL_TOOM32_TO_TOOM43_THRESHOLD 90 -#define MUL_TOOM32_TO_TOOM53_THRESHOLD 160 -#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89 -#define MUL_TOOM42_TO_TOOM63_THRESHOLD 169 -#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130 - -#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ -#define SQR_TOOM2_THRESHOLD 43 -#define SQR_TOOM3_THRESHOLD 138 -#define SQR_TOOM4_THRESHOLD 363 -#define SQR_TOOM6_THRESHOLD 517 -#define SQR_TOOM8_THRESHOLD 725 - -#define MULMID_TOOM42_THRESHOLD 52 - -#define MULMOD_BNM1_THRESHOLD 17 -#define SQRMOD_BNM1_THRESHOLD 23 - -#define MUL_FFT_MODF_THRESHOLD 550 /* k = 5 */ -#define MUL_FFT_TABLE3 \ - { { 550, 5}, { 25, 6}, { 27, 7}, { 15, 6}, \ - { 31, 7}, { 19, 6}, { 39, 7}, { 25, 6}, \ - { 51, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \ - { 19, 7}, { 41, 8}, { 23, 7}, { 51, 8}, \ - { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ - { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \ - { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ - { 47, 8}, { 99, 9}, { 55,10}, { 31, 9}, \ - { 79,10}, { 47, 9}, { 103,11}, { 31,10}, \ - { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \ - { 95, 9}, { 191,10}, { 111,11}, { 63,10}, \ - { 159,11}, { 95,10}, { 191, 9}, { 383,10}, \ - { 207,12}, { 63,11}, { 127,10}, { 255, 9}, \ - { 511,10}, { 271, 9}, { 543,11}, { 159,10}, \ - { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \ - { 351,11}, { 191,10}, { 383, 9}, { 767,10}, \ - { 399, 9}, { 799,10}, { 415,11}, { 223,12}, \ - { 127,11}, { 255,10}, { 543,11}, { 287,10}, \ - { 607,11}, { 319,10}, { 671,11}, { 351,12}, \ - { 191,11}, { 383,10}, { 799,11}, { 415,10}, \ - { 831,13}, { 127,12}, { 255,11}, { 543,10}, \ - { 1087,11}, { 607,12}, { 319,11}, { 671,10}, \ - { 1343,11}, { 735,12}, { 383,11}, { 799,10}, \ - { 1599,11}, { 831,12}, { 447,11}, { 895,13}, \ - { 255,12}, { 511,11}, { 1023,12}, { 575,11}, \ - { 1151,12}, { 639,11}, { 1279,12}, { 703,13}, \ - { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \ - { 1663,12}, { 895,13}, { 511,12}, { 1087,13}, \ - { 639,12}, { 1407,13}, { 767,12}, { 1599,13}, \ - { 895,14}, { 511,13}, { 1023,12}, { 2111,13}, \ - { 1151,12}, { 2431,13}, { 1279,14}, { 767,13}, \ - { 1535,12}, { 3071,15}, { 511,14}, { 1023,13}, \ - { 2175,14}, { 1279,13}, { 2559,12}, { 5119,13}, \ - { 2815,12}, { 5631,13}, { 2943,14}, { 16384,15}, \ - { 32768,16} } -#define MUL_FFT_TABLE3_SIZE 137 -#define MUL_FFT_THRESHOLD 5760 - -#define SQR_FFT_MODF_THRESHOLD 525 /* k = 5 */ -#define SQR_FFT_TABLE3 \ - { { 525, 5}, { 25, 6}, { 27, 7}, { 15, 6}, \ - { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ - { 39, 7}, { 25, 6}, { 51, 7}, { 27, 8}, \ - { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ - { 23, 7}, { 51, 8}, { 27, 7}, { 55, 9}, \ - { 15, 8}, { 31, 7}, { 63, 8}, { 39, 9}, \ - { 23, 8}, { 55,10}, { 15, 9}, { 31, 8}, \ - { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ - { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \ - { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ - { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \ - { 191,10}, { 111,11}, { 63,10}, { 143, 9}, \ - { 287,10}, { 159,11}, { 95,10}, { 191, 9}, \ - { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \ - { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \ - { 159,10}, { 335, 9}, { 671,10}, { 351,11}, \ - { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \ - { 799,10}, { 415,11}, { 223,12}, { 127,11}, \ - { 255,10}, { 543,11}, { 287,10}, { 607,11}, \ - { 319,10}, { 671,11}, { 351,12}, { 191,11}, \ - { 383,10}, { 799,11}, { 415,10}, { 831,13}, \ - { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \ - { 607,12}, { 319,11}, { 671,10}, { 1343,11}, \ - { 735,12}, { 383,11}, { 799,10}, { 1599,11}, \ - { 831,12}, { 447,11}, { 895,12}, { 511,11}, \ - { 1023,12}, { 575,11}, { 1151,12}, { 639,11}, \ - { 1343,12}, { 703,13}, { 383,12}, { 767,11}, \ - { 1599,12}, { 831,11}, { 1663,12}, { 895,13}, \ - { 511,12}, { 1087,13}, { 639,12}, { 1407,13}, \ - { 767,12}, { 1727,13}, { 895,14}, { 511,13}, \ - { 1023,12}, { 2047,13}, { 1151,12}, { 2431,13}, \ - { 1279,14}, { 767,13}, { 1535,12}, { 3071,15}, \ - { 511,14}, { 1023,13}, { 2047,12}, { 4095,13}, \ - { 2175,14}, { 1279,13}, { 2559,12}, { 5119,13}, \ - { 2687,14}, { 16384,15}, { 32768,16} } -#define SQR_FFT_TABLE3_SIZE 139 -#define SQR_FFT_THRESHOLD 4736 - -#define MULLO_BASECASE_THRESHOLD 9 -#define MULLO_DC_THRESHOLD 39 -#define MULLO_MUL_N_THRESHOLD 11278 - -#define DC_DIV_QR_THRESHOLD 54 -#define DC_DIVAPPR_Q_THRESHOLD 296 -#define DC_BDIV_QR_THRESHOLD 52 -#define DC_BDIV_Q_THRESHOLD 300 - -#define INV_MULMOD_BNM1_THRESHOLD 44 -#define INV_NEWTON_THRESHOLD 294 -#define INV_APPR_THRESHOLD 294 - -#define BINV_NEWTON_THRESHOLD 375 -#define REDC_1_TO_REDC_2_THRESHOLD 102 -#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */ - -#define MU_DIV_QR_THRESHOLD 1718 -#define MU_DIVAPPR_Q_THRESHOLD 1718 -#define MUPI_DIV_QR_THRESHOLD 108 -#define MU_BDIV_QR_THRESHOLD 1528 -#define MU_BDIV_Q_THRESHOLD 1718 - -#define POWM_SEC_TABLE 3,32,70,416,1464 - -#define MATRIX22_STRASSEN_THRESHOLD 22 -#define HGCD_THRESHOLD 152 -#define HGCD_APPR_THRESHOLD 230 -#define HGCD_REDUCE_THRESHOLD 3259 -#define GCD_DC_THRESHOLD 702 -#define GCDEXT_DC_THRESHOLD 538 -#define JACOBI_BASE_METHOD 4 - -#define GET_STR_DC_THRESHOLD 18 -#define GET_STR_PRECOMPUTE_THRESHOLD 32 -#define SET_STR_DC_THRESHOLD 119 -#define SET_STR_PRECOMPUTE_THRESHOLD 1063 - -#define FAC_DSC_THRESHOLD 262 -#define FAC_ODD_THRESHOLD 26 diff --git a/gmp/mpn/arm/v7a/cora15/logops_n.asm b/gmp/mpn/arm/v7a/cora15/logops_n.asm deleted file mode 100644 index 06026143e1..0000000000 --- a/gmp/mpn/arm/v7a/cora15/logops_n.asm +++ /dev/null @@ -1,253 +0,0 @@ -dnl ARM mpn_and_n, mpn_andn_n. mpn_nand_n, etc, optimised for A15. - -dnl Contributed to the GNU project by Torbjörn Granlund. - -dnl Copyright 2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb cycles/limb -C and andn ior xor nand iorn nior xnor -C StrongARM ? ? -C XScale ? ? -C Cortex-A7 ? ? -C Cortex-A8 ? ? -C Cortex-A9 3.5 3.56 -C Cortex-A15 1.27 1.64 - -C This is great A15 core register code, but it is a bit large. -C We use FEEDIN_VARIANT 1 to save some space, but use 8-way unrolling. - -C Architecture requirements: -C v5 - -C v5t - -C v5te ldrd strd -C v6 - -C v6t2 - -C v7a - - -define(`FEEDIN_VARIANT', 1) C alternatives: 0 1 2 -define(`UNROLL', 4x2) C alternatives: 4 4x2 - -define(`rp', `r0') -define(`up', `r1') -define(`vp', `r2') -define(`n', `r3') - -define(`POSTOP') - -ifdef(`OPERATION_and_n',` - define(`func', `mpn_and_n') - define(`LOGOP', `and $1, $2, $3')') -ifdef(`OPERATION_andn_n',` - define(`func', `mpn_andn_n') - define(`LOGOP', `bic $1, $2, $3')') -ifdef(`OPERATION_nand_n',` - define(`func', `mpn_nand_n') - define(`POSTOP', `mvn $1, $1') - define(`LOGOP', `and $1, $2, $3')') -ifdef(`OPERATION_ior_n',` - define(`func', `mpn_ior_n') - define(`LOGOP', `orr $1, $2, $3')') -ifdef(`OPERATION_iorn_n',` - define(`func', `mpn_iorn_n') - define(`POSTOP', `mvn $1, $1') - define(`LOGOP', `bic $1, $3, $2')') -ifdef(`OPERATION_nior_n',` - define(`func', `mpn_nior_n') - define(`POSTOP', `mvn $1, $1') - define(`LOGOP', `orr $1, $2, $3')') -ifdef(`OPERATION_xor_n',` - define(`func', `mpn_xor_n') - define(`LOGOP', `eor $1, $2, $3')') -ifdef(`OPERATION_xnor_n',` - define(`func', `mpn_xnor_n') - define(`POSTOP', `mvn $1, $1') - define(`LOGOP', `eor $1, $2, $3')') - -MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) - -ASM_START() -PROLOGUE(func) - push { r4-r9 } - -ifelse(FEEDIN_VARIANT,0,` - ands r6, n, #3 - mov n, n, lsr #2 - beq L(b00a) - tst r6, #1 - beq L(bx0) - ldr r5, [up], #4 - ldr r7, [vp], #4 - LOGOP( r9, r5, r7) - POSTOP( r9) - str r9, [rp], #4 - tst r6, #2 - beq L(b00) -L(bx0): ldrd r4, r5, [up, #0] - ldrd r6, r7, [vp, #0] - sub rp, rp, #8 - b L(lo) -L(b00): tst n, n - beq L(wd1) -L(b00a):ldrd r4, r5, [up], #-8 - ldrd r6, r7, [vp], #-8 - sub rp, rp, #16 - b L(mid) -') -ifelse(FEEDIN_VARIANT,1,` - and r6, n, #3 - mov n, n, lsr #2 - tst r6, #1 - beq L(bx0) - ldr r5, [up], #4 - ldr r7, [vp], #4 - LOGOP( r9, r5, r7) - POSTOP( r9) - str r9, [rp], #4 -L(bx0): tst r6, #2 - beq L(b00) - ldrd r4, r5, [up, #0] - ldrd r6, r7, [vp, #0] - sub rp, rp, #8 - b L(lo) -L(b00): tst n, n - beq L(wd1) - ldrd r4, r5, [up], #-8 - ldrd r6, r7, [vp], #-8 - sub rp, rp, #16 - b L(mid) -') -ifelse(FEEDIN_VARIANT,2,` - ands r6, n, #3 - mov n, n, lsr #2 - beq L(b00) - cmp r6, #2 - bcc L(b01) - beq L(b10) - -L(b11): ldr r5, [up], #4 - ldr r7, [vp], #4 - LOGOP( r9, r5, r7) - ldrd r4, r5, [up, #0] - ldrd r6, r7, [vp, #0] - POSTOP( r9) - str r9, [rp], #-4 - b L(lo) - -L(b00): ldrd r4, r5, [up], #-8 - ldrd r6, r7, [vp], #-8 - sub rp, rp, #16 - b L(mid) - -L(b01): ldr r5, [up], #-4 - ldr r7, [vp], #-4 - LOGOP( r9, r5, r7) - POSTOP( r9) - str r9, [rp], #-12 - tst n, n - beq L(wd1) -L(gt1): ldrd r4, r5, [up, #8] - ldrd r6, r7, [vp, #8] - b L(mid) - -L(b10): ldrd r4, r5, [up] - ldrd r6, r7, [vp] - sub rp, rp, #8 - b L(lo) -') - ALIGN(16) -ifelse(UNROLL,4,` -L(top): ldrd r4, r5, [up, #8] - ldrd r6, r7, [vp, #8] - POSTOP( r8) - POSTOP( r9) - strd r8, r9, [rp, #8] -L(mid): LOGOP( r8, r4, r6) - LOGOP( r9, r5, r7) - ldrd r4, r5, [up, #16]! - ldrd r6, r7, [vp, #16]! - POSTOP( r8) - POSTOP( r9) - strd r8, r9, [rp, #16]! - sub n, n, #1 -L(lo): LOGOP( r8, r4, r6) - LOGOP( r9, r5, r7) - tst n, n - bne L(top) -') -ifelse(UNROLL,4x2,` -L(top): ldrd r4, r5, [up, #8] - ldrd r6, r7, [vp, #8] - POSTOP( r8) - POSTOP( r9) - strd r8, r9, [rp, #8] -L(mid): LOGOP( r8, r4, r6) - LOGOP( r9, r5, r7) - ldrd r4, r5, [up, #16] - ldrd r6, r7, [vp, #16] - POSTOP( r8) - POSTOP( r9) - strd r8, r9, [rp, #16] - LOGOP( r8, r4, r6) - LOGOP( r9, r5, r7) - sub n, n, #2 - tst n, n - bmi L(dne) - ldrd r4, r5, [up, #24] - ldrd r6, r7, [vp, #24] - POSTOP( r8) - POSTOP( r9) - strd r8, r9, [rp, #24] - LOGOP( r8, r4, r6) - LOGOP( r9, r5, r7) - ldrd r4, r5, [up, #32]! - ldrd r6, r7, [vp, #32]! - POSTOP( r8) - POSTOP( r9) - strd r8, r9, [rp, #32]! -L(lo): LOGOP( r8, r4, r6) - LOGOP( r9, r5, r7) - tst n, n - bne L(top) -') - -L(end): POSTOP( r8) - POSTOP( r9) - strd r8, r9, [rp, #8] -L(wd1): pop { r4-r9 } - bx r14 -ifelse(UNROLL,4x2,` -L(dne): POSTOP( r8) - POSTOP( r9) - strd r8, r9, [rp, #24] - pop { r4-r9 } - bx r14 -') -EPILOGUE() diff --git a/gmp/mpn/arm/v7a/cora15/mul_1.asm b/gmp/mpn/arm/v7a/cora15/mul_1.asm deleted file mode 100644 index 766ba5c57f..0000000000 --- a/gmp/mpn/arm/v7a/cora15/mul_1.asm +++ /dev/null @@ -1,104 +0,0 @@ -dnl ARM mpn_mul_1 optimised for A15. - -dnl Copyright 2012, 2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb best -C StrongARM: - -C XScale ? -C Cortex-A7 ? -C Cortex-A8 ? -C Cortex-A9 5.25 3.25 -C Cortex-A15 2.25 this - - -C This runs well on A15 but very poorly on A9. By scheduling loads and adds -C it is possible to get good A9 performance as well, but at the cost of using -C many more (callee-saves) registers. - -C This is armv5 code, optimized for the armv7a cpu A15. Its location in the -C GMP file structure might be misleading. - - -define(`rp', `r0') -define(`up', `r1') -define(`n', `r2') -define(`v0', `r3') - -ASM_START() -PROLOGUE(mpn_mul_1c) - ldr r12, [sp] - b L(ent) -EPILOGUE() -PROLOGUE(mpn_mul_1) - mov r12, #0 -L(ent): push {r4-r7} - - ldr r6, [up], #4 - tst n, #1 - beq L(bx0) - -L(bx1): umull r4, r7, r6, v0 - adds r4, r4, r12 - tst n, #2 - beq L(lo1) - b L(lo3) - -L(bx0): umull r4, r5, r6, v0 - adds r4, r4, r12 - tst n, #2 - beq L(lo0) - b L(lo2) - -L(top): ldr r6, [up], #4 - str r4, [rp], #4 - umull r4, r5, r6, v0 - adds r4, r4, r7 -L(lo0): ldr r6, [up], #4 - str r4, [rp], #4 - umull r4, r7, r6, v0 - adcs r4, r4, r5 -L(lo3): ldr r6, [up], #4 - str r4, [rp], #4 - umull r4, r5, r6, v0 - adcs r4, r4, r7 -L(lo2): ldr r6, [up], #4 - str r4, [rp], #4 - umull r4, r7, r6, v0 - adcs r4, r4, r5 -L(lo1): adc r7, r7, #0 - subs n, n, #4 - bgt L(top) - - str r4, [rp] - mov r0, r7 - pop {r4-r7} - bx lr -EPILOGUE() diff --git a/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm deleted file mode 100644 index d8cfe3f78f..0000000000 --- a/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm +++ /dev/null @@ -1,43 +0,0 @@ -dnl ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n - -dnl Contributed to the GNU project by Torbjörn Granlund. - -dnl Copyright 2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -define(LSH, 1) - -ifdef(`OPERATION_addlsh1_n',`define(`DO_add')') -ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')') -ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')') - -MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n) - -include_mpn(`arm/v7a/cora15/neon/aorsorrlshC_n.asm') diff --git a/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm deleted file mode 100644 index b48204d926..0000000000 --- a/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm +++ /dev/null @@ -1,43 +0,0 @@ -dnl ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n - -dnl Contributed to the GNU project by Torbjörn Granlund. - -dnl Copyright 2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -define(LSH, 2) - -ifdef(`OPERATION_addlsh2_n',`define(`DO_add')') -ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')') -ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')') - -MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n) - -include_mpn(`arm/v7a/cora15/neon/aorsorrlshC_n.asm') diff --git a/gmp/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm deleted file mode 100644 index 16c34a2699..0000000000 --- a/gmp/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm +++ /dev/null @@ -1,144 +0,0 @@ -dnl ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n - -dnl Contributed to the GNU project by Torbjörn Granlund. - -dnl Copyright 2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - - -C cycles/limb -C StrongARM - -C XScale - -C Cortex-A7 ? -C Cortex-A8 ? -C Cortex-A9 5.25 -C Cortex-A15 2.25 - -C TODO -C * Consider using 4-way feed-in code. -C * This is ad-hoc scheduled, perhaps unnecessarily so for A15, and perhaps -C insufficiently for A7 and A8. - -define(`rp', `r0') -define(`up', `r1') -define(`vp', `r2') -define(`n', `r3') - -ifdef(`DO_add', ` - define(`ADCSBCS', `adcs $1, $2, $3') - define(`CLRCY', `cmn r13, #1') - define(`RETVAL', `adc r0, $1, #0') - define(`func', mpn_addlsh`'LSH`'_n)') -ifdef(`DO_sub', ` - define(`ADCSBCS', `sbcs $1, $2, $3') - define(`CLRCY', `cmp r13, #0') - define(`RETVAL', `sbc $2, $2, $2 - cmn $2, #1 - adc r0, $1, #0') - define(`func', mpn_sublsh`'LSH`'_n)') -ifdef(`DO_rsb', ` - define(`ADCSBCS', `sbcs $1, $3, $2') - define(`CLRCY', `cmp r13, #0') - define(`RETVAL', `sbc r0, $1, #0') - define(`func', mpn_rsblsh`'LSH`'_n)') - - -ASM_START() -PROLOGUE(func) - push {r4-r10} - vmov.i8 d0, #0 C could feed carry through here - CLRCY - tst n, #1 - beq L(bb0) - -L(bb1): vld1.32 {d3[0]}, [vp]! - vsli.u32 d0, d3, #LSH - ldr r12, [up], #4 - vmov.32 r5, d0[0] - vshr.u32 d0, d3, #32-LSH - ADCSBCS( r12, r12, r5) - str r12, [rp], #4 - bics n, n, #1 - beq L(rtn) - -L(bb0): tst n, #2 - beq L(b00) - -L(b10): vld1.32 {d3}, [vp]! - vsli.u64 d0, d3, #LSH - ldmia up!, {r10,r12} - vmov r4, r5, d0 - vshr.u64 d0, d3, #64-LSH - ADCSBCS( r10, r10, r4) - ADCSBCS( r12, r12, r5) - stmia rp!, {r10,r12} - bics n, n, #2 - beq L(rtn) - -L(b00): vld1.32 {d2}, [vp]! - vsli.u64 d0, d2, #LSH - vshr.u64 d1, d2, #64-LSH - vld1.32 {d3}, [vp]! - vsli.u64 d1, d3, #LSH - vmov r6, r7, d0 - vshr.u64 d0, d3, #64-LSH - sub n, n, #4 - tst n, n - beq L(end) - - ALIGN(16) -L(top): ldmia up!, {r8,r9,r10,r12} - vld1.32 {d2}, [vp]! - vsli.u64 d0, d2, #LSH - vmov r4, r5, d1 - vshr.u64 d1, d2, #64-LSH - ADCSBCS( r8, r8, r6) - ADCSBCS( r9, r9, r7) - vld1.32 {d3}, [vp]! - vsli.u64 d1, d3, #LSH - vmov r6, r7, d0 - vshr.u64 d0, d3, #64-LSH - ADCSBCS( r10, r10, r4) - ADCSBCS( r12, r12, r5) - stmia rp!, {r8,r9,r10,r12} - sub n, n, #4 - tst n, n - bne L(top) - -L(end): ldmia up!, {r8,r9,r10,r12} - vmov r4, r5, d1 - ADCSBCS( r8, r8, r6) - ADCSBCS( r9, r9, r7) - ADCSBCS( r10, r10, r4) - ADCSBCS( r12, r12, r5) - stmia rp!, {r8,r9,r10,r12} -L(rtn): vmov.32 r0, d0[0] - RETVAL( r0, r1) - pop {r4-r10} - bx r14 -EPILOGUE() diff --git a/gmp/mpn/arm/v7a/cora15/neon/com.asm b/gmp/mpn/arm/v7a/cora15/neon/com.asm deleted file mode 100644 index 9e7a629287..0000000000 --- a/gmp/mpn/arm/v7a/cora15/neon/com.asm +++ /dev/null @@ -1,97 +0,0 @@ -dnl ARM Neon mpn_com optimised for A15. - -dnl Copyright 2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb -C StrongARM ? -C XScale ? -C Cortex-A8 ? -C Cortex-A9 2.1 -C Cortex-A15 0.65 - -define(`rp', `r0') -define(`up', `r1') -define(`n', `r2') - -ASM_START() -PROLOGUE(mpn_com) - cmp n, #7 - ble L(bc) - -C Perform a few initial operation until rp is 128-bit aligned - tst rp, #4 - beq L(al1) - vld1.32 {d0[0]}, [up]! - sub n, n, #1 - vmvn d0, d0 - vst1.32 {d0[0]}, [rp]! -L(al1): tst rp, #8 - beq L(al2) - vld1.32 {d0}, [up]! - sub n, n, #2 - vmvn d0, d0 - vst1.32 {d0}, [rp:64]! -L(al2): vld1.32 {q2}, [up]! - subs n, n, #12 - blt L(end) - - ALIGN(16) -L(top): vld1.32 {q0}, [up]! - vmvn q2, q2 - subs n, n, #8 - vst1.32 {q2}, [rp:128]! - vld1.32 {q2}, [up]! - vmvn q0, q0 - vst1.32 {q0}, [rp:128]! - bge L(top) - -L(end): vmvn q2, q2 - vst1.32 {q2}, [rp:128]! - -C Handle last 0-7 limbs. Note that rp is aligned after loop, but not when we -C arrive here via L(bc) -L(bc): tst n, #4 - beq L(tl1) - vld1.32 {q0}, [up]! - vmvn q0, q0 - vst1.32 {q0}, [rp]! -L(tl1): tst n, #2 - beq L(tl2) - vld1.32 {d0}, [up]! - vmvn d0, d0 - vst1.32 {d0}, [rp]! -L(tl2): tst n, #1 - beq L(tl3) - vld1.32 {d0[0]}, [up] - vmvn d0, d0 - vst1.32 {d0[0]}, [rp] -L(tl3): bx lr -EPILOGUE() diff --git a/gmp/mpn/arm/v7a/cora15/neon/copyd.asm b/gmp/mpn/arm/v7a/cora15/neon/copyd.asm deleted file mode 100644 index 98fe535def..0000000000 --- a/gmp/mpn/arm/v7a/cora15/neon/copyd.asm +++ /dev/null @@ -1,110 +0,0 @@ -dnl ARM Neon mpn_copyd optimised for A15. - -dnl Copyright 2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb -C StrongARM - -C XScale - -C Cortex-A7 ? -C Cortex-A8 ? -C Cortex-A9 1.75 slower than core register code -C Cortex-A15 0.52 - -define(`rp', `r0') -define(`up', `r1') -define(`n', `r2') - -ASM_START() -PROLOGUE(mpn_copyd) - add rp, rp, n, lsl #2 - add up, up, n, lsl #2 - - cmp n, #7 - ble L(bc) - -C Copy until rp is 128-bit aligned - tst rp, #4 - beq L(al1) - sub up, up, #4 - vld1.32 {d22[0]}, [up] - sub n, n, #1 - sub rp, rp, #4 - vst1.32 {d22[0]}, [rp] -L(al1): tst rp, #8 - beq L(al2) - sub up, up, #8 - vld1.32 {d22}, [up] - sub n, n, #2 - sub rp, rp, #8 - vst1.32 {d22}, [rp:64] -L(al2): sub up, up, #16 - vld1.32 {d26-d27}, [up] - subs n, n, #12 - sub rp, rp, #16 C offset rp for loop - blt L(end) - - sub up, up, #16 C offset up for loop - mov r12, #-16 - - ALIGN(16) -L(top): vld1.32 {d22-d23}, [up], r12 - vst1.32 {d26-d27}, [rp:128], r12 - vld1.32 {d26-d27}, [up], r12 - vst1.32 {d22-d23}, [rp:128], r12 - subs n, n, #8 - bge L(top) - - add up, up, #16 C undo up offset - C rp offset undoing folded -L(end): vst1.32 {d26-d27}, [rp:128] - -C Copy last 0-7 limbs. Note that rp is aligned after loop, but not when we -C arrive here via L(bc) -L(bc): tst n, #4 - beq L(tl1) - sub up, up, #16 - vld1.32 {d22-d23}, [up] - sub rp, rp, #16 - vst1.32 {d22-d23}, [rp] -L(tl1): tst n, #2 - beq L(tl2) - sub up, up, #8 - vld1.32 {d22}, [up] - sub rp, rp, #8 - vst1.32 {d22}, [rp] -L(tl2): tst n, #1 - beq L(tl3) - sub up, up, #4 - vld1.32 {d22[0]}, [up] - sub rp, rp, #4 - vst1.32 {d22[0]}, [rp] -L(tl3): bx lr -EPILOGUE() diff --git a/gmp/mpn/arm/v7a/cora15/neon/copyi.asm b/gmp/mpn/arm/v7a/cora15/neon/copyi.asm deleted file mode 100644 index 2e05afe5e8..0000000000 --- a/gmp/mpn/arm/v7a/cora15/neon/copyi.asm +++ /dev/null @@ -1,90 +0,0 @@ -dnl ARM Neon mpn_copyi optimised for A15. - -dnl Copyright 2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb -C StrongARM - -C XScale - -C Cortex-A7 ? -C Cortex-A8 ? -C Cortex-A9 1.75 slower than core register code -C Cortex-A15 0.52 - -define(`rp', `r0') -define(`up', `r1') -define(`n', `r2') - -ASM_START() -PROLOGUE(mpn_copyi) - cmp n, #7 - ble L(bc) - -C Copy until rp is 128-bit aligned - tst rp, #4 - beq L(al1) - vld1.32 {d22[0]}, [up]! - sub n, n, #1 - vst1.32 {d22[0]}, [rp]! -L(al1): tst rp, #8 - beq L(al2) - vld1.32 {d22}, [up]! - sub n, n, #2 - vst1.32 {d22}, [rp:64]! -L(al2): vld1.32 {d26-d27}, [up]! - subs n, n, #12 - blt L(end) - - ALIGN(16) -L(top): vld1.32 {d22-d23}, [up]! - vst1.32 {d26-d27}, [rp:128]! - vld1.32 {d26-d27}, [up]! - vst1.32 {d22-d23}, [rp:128]! - subs n, n, #8 - bge L(top) - -L(end): vst1.32 {d26-d27}, [rp:128]! - -C Copy last 0-7 limbs. Note that rp is aligned after loop, but not when we -C arrive here via L(bc) -L(bc): tst n, #4 - beq L(tl1) - vld1.32 {d22-d23}, [up]! - vst1.32 {d22-d23}, [rp]! -L(tl1): tst n, #2 - beq L(tl2) - vld1.32 {d22}, [up]! - vst1.32 {d22}, [rp]! -L(tl2): tst n, #1 - beq L(tl3) - vld1.32 {d22[0]}, [up] - vst1.32 {d22[0]}, [rp] -L(tl3): bx lr -EPILOGUE() diff --git a/gmp/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm b/gmp/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm deleted file mode 100644 index 2c11d6debd..0000000000 --- a/gmp/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm +++ /dev/null @@ -1,177 +0,0 @@ -dnl ARM Neon mpn_rsh1add_n, mpn_rsh1sub_n. - -dnl Contributed to the GNU project by Torbjörn Granlund. - -dnl Copyright 2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb -C StrongARM - -C XScale - -C Cortex-A7 ? -C Cortex-A8 ? -C Cortex-A9 4-5 -C Cortex-A15 2.5 - -C TODO -C * Try to make this smaller, its size (384 bytes) is excessive. -C * Try to reach 2.25 c/l on A15, to match the addlsh_1 family. -C * This is ad-hoc scheduled, perhaps unnecessarily so for A15, and perhaps -C insufficiently for A7 and A8. - -define(`rp', `r0') -define(`up', `r1') -define(`vp', `r2') -define(`n', `r3') - -ifdef(`OPERATION_rsh1add_n', ` - define(`ADDSUBS', `adds $1, $2, $3') - define(`ADCSBCS', `adcs $1, $2, $3') - define(`IFADD', `$1') - define(`IFSUB', `') - define(`func', mpn_rsh1add_n)') -ifdef(`OPERATION_rsh1sub_n', ` - define(`ADDSUBS', `subs $1, $2, $3') - define(`ADCSBCS', `sbcs $1, $2, $3') - define(`IFADD', `') - define(`IFSUB', `$1') - define(`func', mpn_rsh1sub_n)') - -MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n) - -ASM_START() -PROLOGUE(func) - push {r4-r10} - - ands r4, n, #3 - beq L(b00) - cmp r4, #2 - blo L(b01) - beq L(b10) - -L(b11): ldmia up!, {r9,r10,r12} - ldmia vp!, {r5,r6,r7} - ADDSUBS( r9, r9, r5) - vmov d4, r9, r9 - ADCSBCS( r10, r10, r6) - ADCSBCS( r12, r12, r7) - vshr.u64 d3, d4, #1 - vmov d1, r10, r12 - vsli.u64 d3, d1, #31 - vshr.u64 d2, d1, #1 - vst1.32 d3[0], [rp]! - bics n, n, #3 - beq L(wd2) -L(gt3): ldmia up!, {r8,r9,r10,r12} - ldmia vp!, {r4,r5,r6,r7} - b L(mi0) - -L(b10): ldmia up!, {r10,r12} - ldmia vp!, {r6,r7} - ADDSUBS( r10, r10, r6) - ADCSBCS( r12, r12, r7) - vmov d4, r10, r12 - bics n, n, #2 - vshr.u64 d2, d4, #1 - beq L(wd2) -L(gt2): ldmia up!, {r8,r9,r10,r12} - ldmia vp!, {r4,r5,r6,r7} - b L(mi0) - -L(b01): ldr r12, [up], #4 - ldr r7, [vp], #4 - ADDSUBS( r12, r12, r7) - vmov d4, r12, r12 - bics n, n, #1 - bne L(gt1) - mov r5, r12, lsr #1 -IFADD(` adc r1, n, #0') -IFSUB(` adc r1, n, #1') - bfi r5, r1, #31, #1 - str r5, [rp] - and r0, r12, #1 - pop {r4-r10} - bx r14 -L(gt1): ldmia up!, {r8,r9,r10,r12} - ldmia vp!, {r4,r5,r6,r7} - vshr.u64 d2, d4, #1 - ADCSBCS( r8, r8, r4) - ADCSBCS( r9, r9, r5) - vmov d0, r8, r9 - ADCSBCS( r10, r10, r6) - ADCSBCS( r12, r12, r7) - vsli.u64 d2, d0, #31 - vshr.u64 d3, d0, #1 - vst1.32 d2[0], [rp]! - b L(mi1) - -L(b00): ldmia up!, {r8,r9,r10,r12} - ldmia vp!, {r4,r5,r6,r7} - ADDSUBS( r8, r8, r4) - ADCSBCS( r9, r9, r5) - vmov d4, r8, r9 - ADCSBCS( r10, r10, r6) - ADCSBCS( r12, r12, r7) - vshr.u64 d3, d4, #1 - b L(mi1) - - ALIGN(16) -L(top): ldmia up!, {r8,r9,r10,r12} - ldmia vp!, {r4,r5,r6,r7} - vsli.u64 d3, d1, #63 - vshr.u64 d2, d1, #1 - vst1.32 d3, [rp]! -L(mi0): ADCSBCS( r8, r8, r4) - ADCSBCS( r9, r9, r5) - vmov d0, r8, r9 - ADCSBCS( r10, r10, r6) - ADCSBCS( r12, r12, r7) - vsli.u64 d2, d0, #63 - vshr.u64 d3, d0, #1 - vst1.32 d2, [rp]! -L(mi1): vmov d1, r10, r12 - sub n, n, #4 - tst n, n - bne L(top) - -L(end): vsli.u64 d3, d1, #63 - vshr.u64 d2, d1, #1 - vst1.32 d3, [rp]! -L(wd2): vmov r4, r5, d2 -IFADD(` adc r1, n, #0') -IFSUB(` adc r1, n, #1') - bfi r5, r1, #31, #1 - stm rp, {r4,r5} - -L(rtn): vmov.32 r0, d4[0] - and r0, r0, #1 - pop {r4-r10} - bx r14 -EPILOGUE() diff --git a/gmp/mpn/arm/v7a/cora15/submul_1.asm b/gmp/mpn/arm/v7a/cora15/submul_1.asm deleted file mode 100644 index ed7bfe820b..0000000000 --- a/gmp/mpn/arm/v7a/cora15/submul_1.asm +++ /dev/null @@ -1,159 +0,0 @@ -dnl ARM mpn_submul_1 optimised for A15. - -dnl Copyright 2012, 2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb best -C StrongARM: - -C XScale ? -C Cortex-A7 ? -C Cortex-A8 ? -C Cortex-A9 5.75 3.75 -C Cortex-A15 2.32 this - -C This code uses umlal and umaal for adding in the rp[] data, keeping the -C recurrency path separate from any multiply instructions. It performs well on -C A15, but not quite at the multiply bandwidth like the corresponding addmul_1 -C code. -C -C We don't use r12 due to ldrd and strd limitations. -C -C This loop complements U on the fly, -C U' = B^n - 1 - U -C and then uses that -C R - U*v = R + U'*v + v - B^n v - -C Architecture requirements: -C v5 - -C v5t - -C v5te ldrd strd -C v6 umaal -C v6t2 - -C v7a - - -define(`rp', `r0') -define(`up', `r1') -define(`n', `r2') -define(`v0', `r3') - -define(`w0', `r10') define(`w1', `r11') -define(`u0', `r8') define(`u1', `r9') - -ASM_START() -PROLOGUE(mpn_submul_1) - sub sp, sp, #32 - strd r10, r11, [sp, #24] - strd r8, r9, [sp, #16] - strd r6, r7, [sp, #8] - strd r4, r5, [sp, #0] -C push { r4-r11 } - - ands r6, n, #3 - sub n, n, #3 - beq L(b00) - cmp r6, #2 - bcc L(b01) - beq L(b10) - -L(b11): mov r6, #0 - ldr u1, [up], #-4 - ldr w1, [rp], #-16 - mvn u1, u1 - adds r7, v0, #0 - b L(mid) - -L(b00): ldrd u0, u1, [up] - ldrd w0, w1, [rp], #-12 - mvn u0, u0 - mvn u1, u1 - mov r6, v0 - umaal w0, r6, u0, v0 - cmn r13, #0 C carry clear - mov r7, #0 - str w0, [rp, #12] - b L(mid) - -L(b10): ldrd u0, u1, [up], #8 - ldrd w0, w1, [rp] - mvn u0, u0 - mvn u1, u1 - mov r4, v0 - umaal w0, r4, u0, v0 - mov r5, #0 - str w0, [rp], #-4 - umlal w1, r5, u1, v0 - adds n, n, #0 - bmi L(end) - b L(top) - -L(b01): ldr u1, [up], #4 - ldr w1, [rp], #-8 - mvn u1, u1 - mov r5, v0 - mov r4, #0 - umaal w1, r5, u1, v0 - tst n, n - bmi L(end) - -C ALIGN(16) -L(top): ldrd u0, u1, [up, #0] - adcs r4, r4, w1 - mvn u0, u0 - ldrd w0, w1, [rp, #12] - mvn u1, u1 - mov r6, #0 - umlal w0, r6, u0, v0 C 1 2 - adcs r5, r5, w0 - mov r7, #0 - strd r4, r5, [rp, #8] -L(mid): umaal w1, r7, u1, v0 C 2 3 - ldrd u0, u1, [up, #8] - add up, up, #16 - adcs r6, r6, w1 - mvn u0, u0 - ldrd w0, w1, [rp, #20] - mvn u1, u1 - mov r4, #0 - umlal w0, r4, u0, v0 C 3 4 - adcs r7, r7, w0 - mov r5, #0 - strd r6, r7, [rp, #16]! - sub n, n, #4 - umlal w1, r5, u1, v0 C 0 1 - tst n, n - bpl L(top) - -L(end): adcs r4, r4, w1 - str r4, [rp, #8] - adc r0, r5, #0 - sub r0, v0, r0 - pop { r4-r11 } - bx r14 -EPILOGUE() |