Diffstat (limited to 'gmp/mpn/arm/neon/lshiftc.asm')
-rw-r--r-- | gmp/mpn/arm/neon/lshiftc.asm | 257 |
1 files changed, 0 insertions, 257 deletions
diff --git a/gmp/mpn/arm/neon/lshiftc.asm b/gmp/mpn/arm/neon/lshiftc.asm
deleted file mode 100644
index 9e4096256d..0000000000
--- a/gmp/mpn/arm/neon/lshiftc.asm
+++ /dev/null
@@ -1,257 +0,0 @@
-dnl ARM Neon mpn_lshiftc.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl   * the GNU Lesser General Public License as published by the Free
-dnl     Software Foundation; either version 3 of the License, or (at your
-dnl     option) any later version.
-dnl
-dnl or
-dnl
-dnl   * the GNU General Public License as published by the Free Software
-dnl     Foundation; either version 2 of the License, or (at your option) any
-dnl     later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C            cycles/limb   cycles/limb   cycles/limb    good
-C             aligned       unaligned     best seen    for cpu?
-C StrongARM      -             -
-C XScale         -             -
-C Cortex-A7      ?             ?
-C Cortex-A8      ?             ?
-C Cortex-A9     3.5           3.5                         Y
-C Cortex-A15    1.75          1.75                        Y
-
-C We read 64 bits at a time at 32-bit aligned addresses, and except for the
-C first and last store, we write using 64-bit aligned addresses.  All shifting
-C is done on 64-bit words in 'extension' registers.
-C
-C It should be possible to read also using 64-bit alignment, by manipulating
-C the shift count for unaligned operands.  Not done, since it does not seem to
-C matter for A9 or A15.
-C
-C This will not work in big-endian mode.
-
-C TODO
-C  * Try using 128-bit operations.  Note that Neon lacks pure 128-bit shifts,
-C    which might make it tricky.
-C  * Clean up and simplify.
-C  * Consider sharing most of the code for lshift and rshift, since the feed-in
-C    code, the loop, and most of the wind-down code are identical.
-C  * Replace the basecase code with code using 'extension' registers.
-C  * Optimise.  It is not clear that this loop insn permutation is optimal for
-C    either A9 or A15.
-
-C INPUT PARAMETERS
-define(`rp',  `r0')
-define(`ap',  `r1')
-define(`n',   `r2')
-define(`cnt', `r3')
-
-       define(`IFLSH', `$1')
-       define(`IFRSH', `')
-       define(`X',`0')
-       define(`Y',`1')
-       define(`func',`mpn_lshiftc')
-define(`OPERATION_lshiftc',1)
-
-ASM_START()
-       TEXT
-       ALIGN(64)
-PROLOGUE(mpn_lshiftc)
-IFLSH(`        mov     r12, n, lsl #2  ')
-IFLSH(`        add     rp, rp, r12     ')
-IFLSH(`        add     ap, ap, r12     ')
-
-       cmp     n, #4                   C SIMD code n limit
-       ble     L(base)
-
-ifdef(`OPERATION_lshiftc',`
-       vdup.32 d6, r3                  C left shift count is positive
-       sub     r3, r3, #64             C right shift count is negative
-       vdup.32 d7, r3
-       mov     r12, #-8')              C lshift pointer update offset
-ifdef(`OPERATION_rshift',`
-       rsb     r3, r3, #0              C right shift count is negative
-       vdup.32 d6, r3
-       add     r3, r3, #64             C left shift count is positive
-       vdup.32 d7, r3
-       mov     r12, #8')               C rshift pointer update offset
-
-IFLSH(`        sub     ap, ap, #8      ')
-       vld1.32 {d19}, [ap], r12        C load initial 2 limbs
-       vshl.u64 d18, d19, d7           C retval
-
-       tst     rp, #4                  C is rp 64-bit aligned already?
-       beq     L(rp_aligned)           C yes, skip
-       vmvn    d19, d19
-IFLSH(`        add     ap, ap, #4      ')      C move back ap pointer
-IFRSH(`        sub     ap, ap, #4      ')      C move back ap pointer
-       vshl.u64 d4, d19, d6
-       sub     n, n, #1                C first limb handled
-IFLSH(`        sub     rp, rp, #4      ')
-       vst1.32 {d4[Y]}, [rp]IFRSH(!)   C store first limb, rp gets aligned
-       vld1.32 {d19}, [ap], r12        C load ap[1] and ap[2]
-
-L(rp_aligned):
-IFLSH(`        sub     rp, rp, #8      ')
-       subs    n, n, #6
-       vmvn    d19, d19
-       blt     L(two_or_three_more)
-       tst     n, #2
-       beq     L(2)
-
-L(1):  vld1.32 {d17}, [ap], r12
-       vshl.u64 d5, d19, d6
-       vmvn    d17, d17
-       vld1.32 {d16}, [ap], r12
-       vshl.u64 d0, d17, d7
-       vshl.u64 d4, d17, d6
-       sub     n, n, #2
-       b       L(mid)
-
-L(2):  vld1.32 {d16}, [ap], r12
-       vshl.u64 d4, d19, d6
-       vmvn    d16, d16
-       vld1.32 {d17}, [ap], r12
-       vshl.u64 d1, d16, d7
-       vshl.u64 d5, d16, d6
-       subs    n, n, #4
-       blt     L(end)
-
-L(top): vmvn   d17, d17
-       vld1.32 {d16}, [ap], r12
-       vorr    d2, d4, d1
-       vshl.u64 d0, d17, d7
-       vshl.u64 d4, d17, d6
-       vst1.32 {d2}, [rp:64], r12
-L(mid): vmvn   d16, d16
-       vld1.32 {d17}, [ap], r12
-       vorr    d3, d5, d0
-       vshl.u64 d1, d16, d7
-       vshl.u64 d5, d16, d6
-       vst1.32 {d3}, [rp:64], r12
-       subs    n, n, #4
-       bge     L(top)
-
-L(end): tst    n, #1
-       beq     L(evn)
-
-       vorr    d2, d4, d1
-       vst1.32 {d2}, [rp:64], r12
-       b       L(cj1)
-
-L(evn): vmvn   d17, d17
-       vorr    d2, d4, d1
-       vshl.u64 d0, d17, d7
-       vshl.u64 d4, d17, d6
-       vst1.32 {d2}, [rp:64], r12
-       vmvn.u8 d17, #0
-       vorr    d2, d5, d0
-       vshl.u64 d0, d17, d7
-       vorr    d3, d4, d0
-       b       L(cj2)
-
-C Load last 2 - 3 limbs, store last 4 - 5 limbs
-L(two_or_three_more):
-       tst     n, #1
-       beq     L(l2)
-
-L(l3): vshl.u64 d5, d19, d6
-       vld1.32 {d17}, [ap], r12
-L(cj1): vmov.u8 d16, #0
-IFLSH(`        add     ap, ap, #4      ')
-       vmvn    d17, d17
-       vld1.32 {d16[Y]}, [ap], r12
-       vshl.u64 d0, d17, d7
-       vshl.u64 d4, d17, d6
-       vmvn    d16, d16
-       vorr    d3, d5, d0
-       vshl.u64 d1, d16, d7
-       vshl.u64 d5, d16, d6
-       vst1.32 {d3}, [rp:64], r12
-       vorr    d2, d4, d1
-       vst1.32 {d2}, [rp:64], r12
-IFLSH(`        add     rp, rp, #4      ')
-       vst1.32 {d5[Y]}, [rp]
-       vmov.32 r0, d18[X]
-       bx      lr
-
-L(l2): vld1.32 {d16}, [ap], r12
-       vshl.u64 d4, d19, d6
-       vmvn    d16, d16
-       vshl.u64 d1, d16, d7
-       vshl.u64 d5, d16, d6
-       vmvn.u8 d17, #0
-       vorr    d2, d4, d1
-       vshl.u64 d0, d17, d7
-       vorr    d3, d5, d0
-L(cj2): vst1.32 {d2}, [rp:64], r12
-       vst1.32 {d3}, [rp]
-       vmov.32 r0, d18[X]
-       bx      lr
-
-
-define(`tnc', `r12')
-L(base):
-       push    {r4, r6, r7, r8}
-       ldr     r4, [ap, #-4]!
-       rsb     tnc, cnt, #32
-       mvn     r6, r4
-
-       mov     r7, r6, lsl cnt
-       tst     n, #1
-       beq     L(ev)                   C n even
-
-L(od): subs    n, n, #2
-       bcc     L(ed1)                  C n = 1
-       ldr     r8, [ap, #-4]!
-       mvn     r8, r8
-       b       L(md)                   C n = 3
-
-L(ev): ldr     r6, [ap, #-4]!
-       mvn     r6, r6
-       subs    n, n, #2
-       beq     L(ed)                   C n = 2
-                                       C n = 4
-L(tp): ldr     r8, [ap, #-4]!
-       orr     r7, r7, r6, lsr tnc
-       str     r7, [rp, #-4]!
-       mvn     r8, r8
-       mov     r7, r6, lsl cnt
-L(md): ldr     r6, [ap, #-4]!
-       orr     r7, r7, r8, lsr tnc
-       str     r7, [rp, #-4]!
-       mvn     r6, r6
-       mov     r7, r8, lsl cnt
-
-L(ed): orr     r7, r7, r6, lsr tnc
-       str     r7, [rp, #-4]!
-       mov     r7, r6, lsl cnt
-L(ed1): mvn    r6, #0
-       orr     r7, r7, r6, lsr tnc
-       str     r7, [rp, #-4]
-       mov     r0, r4, lsr tnc
-       pop     {r4, r6, r7, r8}
-       bx      r14
-EPILOGUE()
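
For reference, the operation this file implemented: mpn_lshiftc shifts {ap, n} left by cnt bits, stores the one's complement of the shifted value at {rp, n}, and returns the (uncomplemented) bits shifted out of the top limb. Below is a minimal C sketch of those semantics, modeled on GMP's generic mpn/generic/lshiftc.c but assuming plain 32-bit limbs with no nails; ref_lshiftc is a hypothetical name, not a GMP entry point.

    #include <stdint.h>
    #include <stddef.h>

    /* Sketch of mpn_lshiftc semantics: complemented left shift.
       Assumes 32-bit limbs, no nails, n >= 1, and 0 < cnt < 32. */
    uint32_t
    ref_lshiftc (uint32_t *rp, const uint32_t *up, size_t n, unsigned cnt)
    {
      unsigned tnc = 32 - cnt;          /* complementary shift count */
      uint32_t low, high, retval;

      up += n;
      rp += n;

      low = *--up;
      retval = low >> tnc;              /* bits shifted out: the return value */
      high = low << cnt;

      while (--n != 0)
        {
          low = *--up;
          *--rp = ~(high | (low >> tnc));   /* complement as we store */
          high = low << cnt;
        }
      *--rp = ~high;                    /* vacated low bits come out as ones */

      return retval;
    }

The final store also explains the basecase's closing mvn r6, #0 / orr sequence: the cnt low bits vacated by the shift complement to ones, so the assembly ORs in an all-ones word shifted right by tnc.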
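The SIMD path's central trick, which the comments only hint at: NEON's VSHL with a register-held count shifts left for positive counts and right for negative ones, which is why the code loads cnt into d6 and cnt - 64 into d7 and then uses vshl.u64 for both directions. A small intrinsics sketch of that idiom (shift_pair is a hypothetical helper, assuming 0 < cnt < 64):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Combine two adjacent 64-bit words of a left shift:
       (w << cnt) | (wnext >> (64 - cnt)), using only VSHL. */
    uint64_t
    shift_pair (uint64_t w, uint64_t wnext, int cnt)
    {
      int64x1_t lcnt = vdup_n_s64 (cnt);        /* positive: shifts left  */
      int64x1_t rcnt = vdup_n_s64 (cnt - 64);   /* negative: shifts right */
      uint64x1_t hi = vshl_u64 (vdup_n_u64 (w), lcnt);
      uint64x1_t lo = vshl_u64 (vdup_n_u64 (wnext), rcnt);
      return vget_lane_u64 (vorr_u64 (hi, lo), 0);
    }

Each iteration of the deleted L(top)/L(mid) loop is essentially this combine plus a vmvn to complement, software-pipelined across d0-d5 so that loads, shifts, and stores overlap.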