dnl  ARM64 mpn_rshift.

dnl  Copyright 2013, 2014, 2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb   assumed optimal c/l
C Cortex-A53	3.5-4.0		 3.25
C Cortex-A57	 2.0		 2.0
C X-Gene	 2.67		 2.5

C TODO
C  * The feed-in code used 1 ldr for odd sized and 2 ldr for even sizes.  These
C    numbers should be 1 and 0, respectively.  The str in wind-down should also
C    go.
C  * Using extr and with 63 separate loops we might reach 1.25 c/l on A57.
C  * A53's speed depends on alignment, but not as simply as for lshift/lshiftc.

changecom(blah)

C mpn_rshift -- shift the n-limb operand at up right by cnt bits and store the
C n-limb result at rp.
C
C INPUT PARAMETERS
C   x0  rp_arg  destination pointer
C   x1  up      source pointer
C   x2  n       limb count
C   x3  cnt     shift count in bits
C
C RETURN VALUE
C   x0 = up[0] << (64-cnt), i.e. the bits shifted out at the low end, placed
C   in the most significant bits of the returned limb.
C
C REGISTER USE
C   x16       working destination pointer (x0 is reused for the return value)
C   x8        tnc = -cnt; register-specified shifts only use the count modulo
C             64, so this acts as 64-cnt
C   x17       n/4, the number of 4-limb loop iterations still to run
C   x2        reused once the low bits of n have been tested; carries
C             (previous limb >> cnt) from one step to the next
C   x4-x7     source limbs
C   x10-x13   partial shift results
C
C Only caller-saved registers are written.  The platform register x18 is
C deliberately left alone since several ABIs (Apple, Windows, Android) reserve
C it.
C
C NOTE(review): nothing here checks the arguments.  The code assumes n >= 1
C and 1 <= cnt <= 63, matching GMP's documented mpn_rshift contract -- confirm
C against callers.

define(`rp_arg', `x0')
define(`up',     `x1')
define(`n',      `x2')
define(`cnt',    `x3')

define(`rp',     `x16')

define(`tnc',`x8')

define(`PSHIFT', lsr)
define(`NSHIFT', lsl)

ASM_START()
PROLOGUE(mpn_rshift)
	mov	rp, rp_arg
	sub	tnc, xzr, cnt
	lsr	x17, n, #2
C Dispatch on n mod 4 to one of four feed-in sequences.
	tbz	n, #0, L(bx0)

L(bx1):	ldr	x5, [up]
	tbnz	n, #1, L(b11)

C n mod 4 = 1
L(b01):	NSHIFT	x0, x5, tnc
	PSHIFT	x2, x5, cnt
	cbnz	x17, L(gt1)
C n = 1: the single result limb is already in x2.
	str	x2, [rp]
	ret
C Bias up and rp so that the pre-indexed accesses at lo2 hit the right limbs.
L(gt1):	ldp	x4, x5, [up,#8]
	sub	up, up, #8
	sub	rp, rp, #32
	b	L(lo2)

C n mod 4 = 3
L(b11):	NSHIFT	x0, x5, tnc
	PSHIFT	x2, x5, cnt
	ldp	x6, x7, [up,#8]!
	sub	rp, rp, #16
	b	L(lo3)

L(bx0):	ldp	x4, x5, [up]
	tbz	n, #1, L(b00)

C n mod 4 = 2
L(b10):	NSHIFT	x0, x4, tnc
	PSHIFT	x13, x4, cnt
	NSHIFT	x10, x5, tnc
	PSHIFT	x2, x5, cnt
	cbnz	x17, L(gt2)
C n = 2: store both result limbs and return.
	orr	x10, x10, x13
	stp	x10, x2, [rp]
	ret
C Store result limb 0 now; the post-index bias sets rp up for entry at lo2.
L(gt2):	ldp	x4, x5, [up,#16]
	orr	x10, x10, x13
	str	x10, [rp],#-24
	b	L(lo2)

C n mod 4 = 0
L(b00):	NSHIFT	x0, x4, tnc
	PSHIFT	x13, x4, cnt
	NSHIFT	x10, x5, tnc
	PSHIFT	x2, x5, cnt
	ldp	x6, x7, [up,#16]!
	orr	x10, x10, x13
	str	x10, [rp],#-8
	b	L(lo0)

C Main loop: four limbs per iteration, handled as two pairs.  Each result limb
C is (limb >> cnt) | (next limb << tnc).
	ALIGN(16)
L(top):	ldp	x4, x5, [up,#16]
	orr	x10, x10, x13
	orr	x11, x12, x2
	stp	x11, x10, [rp,#16]
	PSHIFT	x2, x7, cnt
L(lo2):	NSHIFT	x10, x5, tnc
	NSHIFT	x12, x4, tnc
	PSHIFT	x13, x4, cnt
	ldp	x6, x7, [up,#32]!
	orr	x10, x10, x13
	orr	x11, x12, x2
	stp	x11, x10, [rp,#32]!
	PSHIFT	x2, x5, cnt
L(lo0):	sub	x17, x17, #1
L(lo3):	NSHIFT	x10, x7, tnc
	NSHIFT	x12, x6, tnc
	PSHIFT	x13, x6, cnt
	cbnz	x17, L(top)

C Wind-down: store the last combined pair, then the top limb, which has no
C higher neighbour and so is just x7 >> cnt.
L(end):	orr	x10, x10, x13
	orr	x11, x12, x2
	PSHIFT	x2, x7, cnt
	stp	x11, x10, [rp,#16]
	str	x2, [rp,#32]
	ret
EPILOGUE()