diff options
Diffstat (limited to 'mpn/powerpc64')
34 files changed, 1738 insertions, 448 deletions
diff --git a/mpn/powerpc64/com.asm b/mpn/powerpc64/com.asm index 4fb2e65d7..cb89bade2 100644 --- a/mpn/powerpc64/com.asm +++ b/mpn/powerpc64/com.asm @@ -19,9 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630: 1? -C POWER4/PPC970: 1.6 +C cycles/limb +C POWER3/PPC630 1? +C POWER4/PPC970 1.6 +C POWER5 ? +C POWER6 ? +C POWER7 1.45 C TODO C * 8-way unrolling brings timing down to about 1.3 cycles/limb. diff --git a/mpn/powerpc64/copyd.asm b/mpn/powerpc64/copyd.asm index 6a46a433c..256e7dc12 100644 --- a/mpn/powerpc64/copyd.asm +++ b/mpn/powerpc64/copyd.asm @@ -19,9 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630: 1 -C POWER4/PPC970: 1 +C cycles/limb +C POWER3/PPC630 1 +C POWER4/PPC970 1 +C POWER5 ? +C POWER6 ? +C POWER7 1.4 C INPUT PARAMETERS C rp r3 diff --git a/mpn/powerpc64/copyi.asm b/mpn/powerpc64/copyi.asm index 5cb7e4856..31d1fc2e7 100644 --- a/mpn/powerpc64/copyi.asm +++ b/mpn/powerpc64/copyi.asm @@ -19,9 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630: 1 -C POWER4/PPC970: 1 +C cycles/limb +C POWER3/PPC630 1 +C POWER4/PPC970 1 +C POWER5 ? +C POWER6 ? +C POWER7 1.4 C INPUT PARAMETERS C rp r3 diff --git a/mpn/powerpc64/logops_n.asm b/mpn/powerpc64/logops_n.asm index 917b59f45..2caa2c7c6 100644 --- a/mpn/powerpc64/logops_n.asm +++ b/mpn/powerpc64/logops_n.asm @@ -20,9 +20,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630: 1.75 -C POWER4/PPC970: 2.10 +C cycles/limb +C POWER3/PPC630 1.75 +C POWER4/PPC970 2.10 +C POWER5 ? +C POWER6 ? 
+C POWER7 1.75 C n POWER3/PPC630 POWER4/PPC970 C 1 15.00 15.33 diff --git a/mpn/powerpc64/lshift.asm b/mpn/powerpc64/lshift.asm index f97661ae7..eb70c4983 100644 --- a/mpn/powerpc64/lshift.asm +++ b/mpn/powerpc64/lshift.asm @@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630 ? -C POWER4/PPC970 ? -C POWER5 2.25 -C POWER6 9.75 +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? +C POWER5 2.25 +C POWER6 9.75 +C POWER7 2.15 C TODO C * Try to reduce the number of needed live registers diff --git a/mpn/powerpc64/mode64/lshiftc.asm b/mpn/powerpc64/lshiftc.asm index 647244d1f..8f470a5f4 100644 --- a/mpn/powerpc64/mode64/lshiftc.asm +++ b/mpn/powerpc64/lshiftc.asm @@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630 ? -C POWER4/PPC970 ? -C POWER5 2.25 -C POWER6 9.5 +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? +C POWER5 2.25 +C POWER6 9.5 +C POWER7 2.15 C TODO C * Try to reduce the number of needed live registers @@ -189,6 +190,9 @@ L(cj2): std r10, -32(rp) L(ret): ld r31, -8(r1) ld r30, -16(r1) - mr r3, retval +ifdef(`HAVE_ABI_mode32', +` srdi r3, retval, 32 + mr r4, retval +',` mr r3, retval') blr EPILOGUE() diff --git a/mpn/powerpc64/mode64/aors_n.asm b/mpn/powerpc64/mode64/aors_n.asm index 980525f67..8c30871c2 100644 --- a/mpn/powerpc64/mode64/aors_n.asm +++ b/mpn/powerpc64/mode64/aors_n.asm @@ -1,6 +1,6 @@ dnl PowerPC-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction. -dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007 Free Software +dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007, 2011 Free Software dnl Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -20,11 +20,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
include(`../config.m4') -C cycles/limb -C POWER3/PPC630 1.5 -C POWER4/PPC970 2 -C POWER5 2.25 -C POWER6 2.63 +C cycles/limb +C POWER3/PPC630 1.5 +C POWER4/PPC970 2 +C POWER5 2 +C POWER6 2.63 +C POWER7 2.25-2.87 C This code is a little bit slower for POWER3/PPC630 than the simple code used C previously, but it is much faster for POWER4/PPC970. The reason for the @@ -136,6 +137,7 @@ L(go): ld r6, 0(r4) C load s1 limb addi r4, r4, 32 addi r5, r5, 32 + ALIGN(16) L(top): ADDSUBC r28, r7, r6 ld r6, 0(r4) C load s1 limb ld r7, 0(r5) C load s2 limb diff --git a/mpn/powerpc64/mode64/aorscnd_n.asm b/mpn/powerpc64/mode64/aorscnd_n.asm new file mode 100644 index 000000000..47aa6fb39 --- /dev/null +++ b/mpn/powerpc64/mode64/aorscnd_n.asm @@ -0,0 +1,185 @@ +dnl PowerPC-64 mpn_addcnd_n/mpn_subcnd_n. + +dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007, 2011 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 2.25 +C POWER5 ? +C POWER6 ? +C POWER7 ? 
+ +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`vp', `r5') +define(`n', `r6') +define(`cnd', `r7') + +ifdef(`OPERATION_addcnd_n',` + define(ADDSUBC, adde) + define(ADDSUB, addc) + define(func, mpn_addcnd_n) + define(GENRVAL, `addi r3, r3, 1') + define(SETCBR, `addic r0, $1, -1') + define(CLRCB, `addic r0, r0, 0') +') +ifdef(`OPERATION_subcnd_n',` + define(ADDSUBC, subfe) + define(ADDSUB, subfc) + define(func, mpn_subcnd_n) + define(GENRVAL, `neg r3, r3') + define(SETCBR, `subfic r0, $1, 0') + define(CLRCB, `addic r0, r1, -1') +') + +MULFUNC_PROLOGUE(mpn_addcnd_n mpn_subcnd_n) + +ASM_START() +PROLOGUE(func) + std r31, -8(r1) + std r30, -16(r1) + std r29, -24(r1) + std r28, -32(r1) + std r27, -40(r1) + + subfic cnd, cnd, 0 + subfe cnd, cnd, cnd + + rldicl. r0, r6, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addi r6, r6, 3 C compute count... + srdi r6, r6, 2 C ...for ctr + mtctr r6 C copy count into ctr + beq cr0, L(b00) + blt cr6, L(b01) + beq cr6, L(b10) + +L(b11): ld r8, 0(up) C load s1 limb + ld r9, 0(vp) C load s2 limb + ld r10, 8(up) C load s1 limb + ld r11, 8(vp) C load s2 limb + ld r12, 16(up) C load s1 limb + addi up, up, 24 + ld r0, 16(vp) C load s2 limb + addi vp, vp, 24 + and r9, r9, cnd + and r11, r11, cnd + and r0, r0, cnd + ADDSUB r29, r9, r8 + ADDSUBC r30, r11, r10 + ADDSUBC r31, r0, r12 + std r29, 0(rp) + std r30, 8(rp) + std r31, 16(rp) + addi rp, rp, 24 + bdnz L(go) + b L(ret) + +L(b01): ld r12, 0(up) C load s1 limb + addi up, up, 8 + ld r0, 0(vp) C load s2 limb + addi vp, vp, 8 + and r0, r0, cnd + ADDSUB r31, r0, r12 C add + std r31, 0(rp) + addi rp, rp, 8 + bdnz L(go) + b L(ret) + +L(b10): ld r10, 0(up) C load s1 limb + ld r11, 0(vp) C load s2 limb + ld r12, 8(up) C load s1 limb + addi up, up, 16 + ld r0, 8(vp) C load s2 limb + addi vp, vp, 16 + and r11, r11, cnd + and r0, r0, cnd + ADDSUB r30, r11, r10 C add + ADDSUBC r31, r0, r12 C add + std r30, 0(rp) + std r31, 8(rp) + addi rp, rp, 16 + bdnz L(go) + b L(ret) + 
+L(b00): CLRCB C clear/set cy +L(go): ld r6, 0(up) C load s1 limb + ld r27, 0(vp) C load s2 limb + ld r8, 8(up) C load s1 limb + ld r9, 8(vp) C load s2 limb + ld r10, 16(up) C load s1 limb + ld r11, 16(vp) C load s2 limb + ld r12, 24(up) C load s1 limb + ld r0, 24(vp) C load s2 limb + and r27, r27, cnd + and r9, r9, cnd + and r11, r11, cnd + and r0, r0, cnd + bdz L(end) + + addi up, up, 32 + addi vp, vp, 32 + +L(top): ADDSUBC r28, r27, r6 + ld r6, 0(up) C load s1 limb + ld r27, 0(vp) C load s2 limb + ADDSUBC r29, r9, r8 + ld r8, 8(up) C load s1 limb + ld r9, 8(vp) C load s2 limb + ADDSUBC r30, r11, r10 + ld r10, 16(up) C load s1 limb + ld r11, 16(vp) C load s2 limb + ADDSUBC r31, r0, r12 + ld r12, 24(up) C load s1 limb + ld r0, 24(vp) C load s2 limb + std r28, 0(rp) + addi up, up, 32 + std r29, 8(rp) + addi vp, vp, 32 + std r30, 16(rp) + std r31, 24(rp) + addi rp, rp, 32 + and r27, r27, cnd + and r9, r9, cnd + and r11, r11, cnd + and r0, r0, cnd + bdnz L(top) C decrement ctr and loop back + +L(end): ADDSUBC r28, r27, r6 + ADDSUBC r29, r9, r8 + ADDSUBC r30, r11, r10 + ADDSUBC r31, r0, r12 + std r28, 0(rp) + std r29, 8(rp) + std r30, 16(rp) + std r31, 24(rp) + +L(ret): ld r31, -8(r1) + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + ld r27, -40(r1) + + subfe r3, r0, r0 C -cy + GENRVAL + blr +EPILOGUE() diff --git a/mpn/powerpc64/mode64/aorslshC_n.asm b/mpn/powerpc64/mode64/aorslshC_n.asm index 4622cd946..3776d3e59 100644 --- a/mpn/powerpc64/mode64/aorslshC_n.asm +++ b/mpn/powerpc64/mode64/aorslshC_n.asm @@ -17,11 +17,12 @@ dnl License for more details. dnl You should have received a copy of the GNU Lesser General Public License dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
-C cycles/limb -C POWER3/PPC630 1.83 (1.5 c/l should be possible) -C POWER4/PPC970 3 (2.0 c/l should be possible) -C POWER5 3 -C POWER6 3.5-47 +C cycles/limb +C POWER3/PPC630 1.83 (1.5 c/l should be possible) +C POWER4/PPC970 3 (2.0 c/l should be possible) +C POWER5 3 +C POWER6 3.5-47 +C POWER7 3 C STATUS C * Try combining upx+up, and vpx+vp. diff --git a/mpn/powerpc64/mode64/aorsmul_1.asm b/mpn/powerpc64/mode64/aorsmul_1.asm index b1a3315b6..4b843a044 100644 --- a/mpn/powerpc64/mode64/aorsmul_1.asm +++ b/mpn/powerpc64/mode64/aorsmul_1.asm @@ -20,12 +20,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C mpn_addmul_1 mpn_submul_1 -C cycles/limb cycles/limb -C POWER3/PPC630 6-18 6-18 -C POWER4/PPC970 8 8.3 -C POWER5 8 8.25 -C POWER6 16.25 16.75 +C mpn_addmul_1 mpn_submul_1 +C cycles/limb cycles/limb +C POWER3/PPC630 6-18 6-18 +C POWER4/PPC970 8 8.3 +C POWER5 8 8.25 +C POWER6 16.25 16.75 +C POWER7 3.77 4.9 C TODO C * Try to reduce the number of needed live registers @@ -53,7 +54,7 @@ ifdef(`OPERATION_submul_1',` ') MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) - + ASM_START() PROLOGUE(func_nc) EPILOGUE() diff --git a/mpn/powerpc64/mode64/bdiv_dbm1c.asm b/mpn/powerpc64/mode64/bdiv_dbm1c.asm index 40f3d4ec7..e88fc4440 100644 --- a/mpn/powerpc64/mode64/bdiv_dbm1c.asm +++ b/mpn/powerpc64/mode64/bdiv_dbm1c.asm @@ -19,11 +19,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb +C cycles/limb C POWER3/PPC630 6-18 C POWER4/PPC970 8.5? C POWER5 8.5 fluctuating as function of n % 3 C POWER6 15 +C POWER6 15 +C POWER7 4.75 C TODO C * Nothing to do... diff --git a/mpn/powerpc64/mode64/dive_1.asm b/mpn/powerpc64/mode64/dive_1.asm index d457d65e9..0f94154bf 100644 --- a/mpn/powerpc64/mode64/dive_1.asm +++ b/mpn/powerpc64/mode64/dive_1.asm @@ -19,12 +19,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
include(`../config.m4') -C cycles/limb -C norm unorm +C cycles/limb +C norm unorm C POWER3/PPC630 13-19 -C POWER4/PPC970 16 -C POWER5 16 16 -C POWER6 37 46 +C POWER4/PPC970 16 +C POWER5 16 16 +C POWER6 37 46 +C POWER7 12 12 C TODO C * Check if n=1 code is really an improvement. It probably isn't. diff --git a/mpn/powerpc64/mode64/divrem_1.asm b/mpn/powerpc64/mode64/divrem_1.asm index 9d065b728..c0e7b2a9f 100644 --- a/mpn/powerpc64/mode64/divrem_1.asm +++ b/mpn/powerpc64/mode64/divrem_1.asm @@ -20,12 +20,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C norm unorm frac -C POWER3/PPC630 16-34 16-34 ~11 -C POWER4/PPC970 29 19 -C POWER5 29 29 ~20 -C POWER6 50 59 ~42 +C cycles/limb +C norm unorm frac +C POWER3/PPC630 16-34 16-34 ~11 +C POWER4/PPC970 29 19 +C POWER5 29 29 ~20 +C POWER6 50 59 ~42 +C POWER7 25 25 ~14 C INPUT PARAMETERS C qp = r3 diff --git a/mpn/powerpc64/mode64/divrem_2.asm b/mpn/powerpc64/mode64/divrem_2.asm index 53ef1c708..18f549357 100644 --- a/mpn/powerpc64/mode64/divrem_2.asm +++ b/mpn/powerpc64/mode64/divrem_2.asm @@ -19,12 +19,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C norm frac +C cycles/limb +C norm frac C POWER3/PPC630 -C POWER4/PPC970 ? ? -C POWER5 37 ? -C POWER6 62 ? +C POWER4/PPC970 ? ? +C POWER5 37 ? +C POWER6 62 ? +C POWER7 30.5 ? C INPUT PARAMETERS C qp = r3 diff --git a/mpn/powerpc64/mode64/invert_limb.asm b/mpn/powerpc64/mode64/invert_limb.asm index aed0a32ab..31b243001 100644 --- a/mpn/powerpc64/mode64/invert_limb.asm +++ b/mpn/powerpc64/mode64/invert_limb.asm @@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
include(`../config.m4') -C cycles/limb (approximate) -C POWER3/PPC630 80 -C POWER4/PPC970 86 -C POWER5 86 -C POWER6 170 +C cycles/limb (approximate) +C POWER3/PPC630 80 +C POWER4/PPC970 86 +C POWER5 86 +C POWER6 170 +C POWER7 66 ASM_START() PROLOGUE(mpn_invert_limb) diff --git a/mpn/powerpc64/mode64/mod_1_1.asm b/mpn/powerpc64/mode64/mod_1_1.asm index 61e39310a..f24ceb2c8 100644 --- a/mpn/powerpc64/mode64/mod_1_1.asm +++ b/mpn/powerpc64/mode64/mod_1_1.asm @@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630 ? -C POWER4/PPC970 17 -C POWER5 16 -C POWER6 30 +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 17 +C POWER5 16 +C POWER6 30 +C POWER7 10.2 C TODO C * Optimise, in particular the cps function. This was compiler-generated and diff --git a/mpn/powerpc64/mode64/mod_1_4.asm b/mpn/powerpc64/mode64/mod_1_4.asm index e0f26da96..b6163c5e7 100644 --- a/mpn/powerpc64/mode64/mod_1_4.asm +++ b/mpn/powerpc64/mode64/mod_1_4.asm @@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630 ? -C POWER4/PPC970 9 -C POWER5 9 -C POWER6 13 +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 9 +C POWER5 9 +C POWER6 13 +C POWER7 3.5 C TODO C * Optimise, in particular the cps function. This was compiler-generated and diff --git a/mpn/powerpc64/mode64/mod_34lsub1.asm b/mpn/powerpc64/mode64/mod_34lsub1.asm index 62ba17a3c..30b9f98be 100644 --- a/mpn/powerpc64/mode64/mod_34lsub1.asm +++ b/mpn/powerpc64/mode64/mod_34lsub1.asm @@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
include(`../config.m4') -C cycles/limb -C POWER3/PPC630 1.33 -C POWER4/PPC970 1.5 -C POWER5 1.32 -C POWER6 2.35 +C cycles/limb +C POWER3/PPC630 1.33 +C POWER4/PPC970 1.5 +C POWER5 1.32 +C POWER6 2.35 +C POWER7 1 C INPUT PARAMETERS define(`up',`r3') diff --git a/mpn/powerpc64/mode64/mode1o.asm b/mpn/powerpc64/mode64/mode1o.asm index 489ca8551..37e4028d8 100644 --- a/mpn/powerpc64/mode64/mode1o.asm +++ b/mpn/powerpc64/mode64/mode1o.asm @@ -19,10 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630: 13-19 -C POWER4/PPC970: 16 -C POWER5: 16 +C cycles/limb +C POWER3/PPC630 13-19 +C POWER4/PPC970 16 +C POWER5 16 +C POWER6 ? +C POWER7 12 C TODO C * Check if n=1 code is really an improvement. It probably isn't. diff --git a/mpn/powerpc64/mode64/mul_1.asm b/mpn/powerpc64/mode64/mul_1.asm index 12bff2fb6..e911cf551 100644 --- a/mpn/powerpc64/mode64/mul_1.asm +++ b/mpn/powerpc64/mode64/mul_1.asm @@ -21,11 +21,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630 6-18 -C POWER4/PPC970 7.25? not updated for last file revision -C POWER5 7.25 -C POWER6 14 +C cycles/limb +C POWER3/PPC630 6-18 +C POWER4/PPC970 7.25? not updated for last file revision +C POWER5 7.25 +C POWER6 14 +C POWER7 2.9 C TODO C * Try to reduce the number of needed live registers (at least r5 and r10 diff --git a/mpn/powerpc64/mode64/mul_basecase.asm b/mpn/powerpc64/mode64/mul_basecase.asm index fd7ff9aa1..9a3957f94 100644 --- a/mpn/powerpc64/mode64/mul_basecase.asm +++ b/mpn/powerpc64/mode64/mul_basecase.asm @@ -1,4 +1,4 @@ -dnl PowerPC-64 mpn_basecase. +dnl PowerPC-64 mpn_mul_basecase. dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008 Free Software dnl Foundation, Inc. @@ -20,11 +20,11 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
include(`../config.m4') -C cycles/limb -C POWER3/PPC630 6-18 -C POWER4/PPC970 8 -C POWER5 8 -C POWER6 24 +C cycles/limb +C POWER3/PPC630 6-18 +C POWER4/PPC970 8 +C POWER5 8 +C POWER6 24 C INPUT PARAMETERS define(`rp', `r3') diff --git a/mpn/powerpc64/mode64/p3/gmp-mparam.h b/mpn/powerpc64/mode64/p3/gmp-mparam.h index 221b0e1d8..cf1d8ca47 100644 --- a/mpn/powerpc64/mode64/p3/gmp-mparam.h +++ b/mpn/powerpc64/mode64/p3/gmp-mparam.h @@ -23,12 +23,13 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MOD_1_NORM_THRESHOLD 0 /* always */ #define MOD_1_UNNORM_THRESHOLD 0 /* always */ -#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* never */ -#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 -#define MOD_1_1_TO_MOD_1_2_THRESHOLD 16 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14 #define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 17 #define USE_PREINV_DIVREM_1 0 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always (native) */ #define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ @@ -36,22 +37,26 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MUL_TOOM33_THRESHOLD 33 #define MUL_TOOM44_THRESHOLD 46 #define MUL_TOOM6H_THRESHOLD 77 -#define MUL_TOOM8H_THRESHOLD 115 +#define MUL_TOOM8H_THRESHOLD 139 #define MUL_TOOM32_TO_TOOM43_THRESHOLD 49 -#define MUL_TOOM32_TO_TOOM53_THRESHOLD 38 -#define MUL_TOOM42_TO_TOOM53_THRESHOLD 33 -#define MUL_TOOM42_TO_TOOM63_THRESHOLD 32 - -#define SQR_BASECASE_THRESHOLD 0 /* always */ -#define SQR_TOOM2_THRESHOLD 16 -#define SQR_TOOM3_THRESHOLD 49 -#define SQR_TOOM4_THRESHOLD 70 -#define SQR_TOOM6_THRESHOLD 93 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 48 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 49 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 49 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 14 +#define SQR_TOOM3_THRESHOLD 45 +#define SQR_TOOM4_THRESHOLD 64 +#define SQR_TOOM6_THRESHOLD 85 #define SQR_TOOM8_THRESHOLD 139 +#define MULMID_TOOM42_THRESHOLD 22 + #define MULMOD_BNM1_THRESHOLD 8 -#define SQRMOD_BNM1_THRESHOLD 9 +#define SQRMOD_BNM1_THRESHOLD 10 + +#define POWM_SEC_TABLE 2,23,127,502,1421 #define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */ #define MUL_FFT_TABLE3 \ @@ -123,35 +128,37 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define SQR_FFT_TABLE3_SIZE 118 #define SQR_FFT_THRESHOLD 1728 -#define MULLO_BASECASE_THRESHOLD 3 -#define MULLO_DC_THRESHOLD 28 -#define MULLO_MUL_N_THRESHOLD 4940 +#define MULLO_BASECASE_THRESHOLD 2 +#define MULLO_DC_THRESHOLD 27 +#define MULLO_MUL_N_THRESHOLD 2367 -#define DC_DIV_QR_THRESHOLD 27 -#define DC_DIVAPPR_Q_THRESHOLD 95 -#define DC_BDIV_QR_THRESHOLD 28 +#define DC_DIV_QR_THRESHOLD 26 +#define DC_DIVAPPR_Q_THRESHOLD 87 +#define DC_BDIV_QR_THRESHOLD 27 #define DC_BDIV_Q_THRESHOLD 62 -#define INV_MULMOD_BNM1_THRESHOLD 29 -#define INV_NEWTON_THRESHOLD 92 -#define INV_APPR_THRESHOLD 94 +#define INV_MULMOD_BNM1_THRESHOLD 34 +#define INV_NEWTON_THRESHOLD 91 +#define INV_APPR_THRESHOLD 91 #define BINV_NEWTON_THRESHOLD 115 -#define REDC_1_TO_REDC_N_THRESHOLD 30 +#define REDC_1_TO_REDC_N_THRESHOLD 31 #define MU_DIV_QR_THRESHOLD 551 #define MU_DIVAPPR_Q_THRESHOLD 551 -#define MUPI_DIV_QR_THRESHOLD 49 -#define MU_BDIV_QR_THRESHOLD 492 +#define MUPI_DIV_QR_THRESHOLD 50 +#define MU_BDIV_QR_THRESHOLD 474 #define MU_BDIV_Q_THRESHOLD 492 -#define MATRIX22_STRASSEN_THRESHOLD 9 -#define HGCD_THRESHOLD 55 -#define GCD_DC_THRESHOLD 150 -#define GCDEXT_DC_THRESHOLD 124 +#define MATRIX22_STRASSEN_THRESHOLD 8 +#define HGCD_THRESHOLD 53 +#define HGCD_APPR_THRESHOLD 55 +#define HGCD_REDUCE_THRESHOLD 688 +#define GCD_DC_THRESHOLD 148 +#define GCDEXT_DC_THRESHOLD 118 #define JACOBI_BASE_METHOD 1 -#define GET_STR_DC_THRESHOLD 17 +#define GET_STR_DC_THRESHOLD 16 #define GET_STR_PRECOMPUTE_THRESHOLD 27 -#define SET_STR_DC_THRESHOLD 354 +#define SET_STR_DC_THRESHOLD 375 #define SET_STR_PRECOMPUTE_THRESHOLD 812 diff --git a/mpn/powerpc64/mode64/p4/gmp-mparam.h b/mpn/powerpc64/mode64/p4/gmp-mparam.h index 9a0932654..317bc94d6 100644 --- a/mpn/powerpc64/mode64/p4/gmp-mparam.h +++ b/mpn/powerpc64/mode64/p4/gmp-mparam.h @@ -29,6 +29,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MOD_1_2_TO_MOD_1_4_THRESHOLD 20 #define PREINV_MOD_1_TO_MOD_1_THRESHOLD 16 #define USE_PREINV_DIVREM_1 0 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always (native) */ #define BMOD_1_TO_MOD_1_THRESHOLD 37 @@ -43,16 +44,20 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MUL_TOOM42_TO_TOOM53_THRESHOLD 73 #define MUL_TOOM42_TO_TOOM63_THRESHOLD 62 -#define SQR_BASECASE_THRESHOLD 5 -#define SQR_TOOM2_THRESHOLD 28 -#define SQR_TOOM3_THRESHOLD 57 -#define SQR_TOOM4_THRESHOLD 136 -#define SQR_TOOM6_THRESHOLD 181 -#define SQR_TOOM8_THRESHOLD 272 +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 24 +#define SQR_TOOM3_THRESHOLD 73 +#define SQR_TOOM4_THRESHOLD 214 +#define SQR_TOOM6_THRESHOLD 254 +#define SQR_TOOM8_THRESHOLD 430 -#define MULMOD_BNM1_THRESHOLD 13 +#define MULMID_TOOM42_THRESHOLD 32 + +#define MULMOD_BNM1_THRESHOLD 12 #define SQRMOD_BNM1_THRESHOLD 16 +#define POWM_SEC_TABLE 6,47,347,1036,2826 + #define MUL_FFT_MODF_THRESHOLD 372 /* k = 5 */ #define MUL_FFT_TABLE3 \ { { 372, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ @@ -116,9 +121,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define SQR_FFT_TABLE3_SIZE 103 #define SQR_FFT_THRESHOLD 2752 -#define MULLO_BASECASE_THRESHOLD 5 +#define MULLO_BASECASE_THRESHOLD 3 #define MULLO_DC_THRESHOLD 36 -#define MULLO_MUL_N_THRESHOLD 12691 +#define MULLO_MUL_N_THRESHOLD 13463 #define DC_DIV_QR_THRESHOLD 43 #define DC_DIVAPPR_Q_THRESHOLD 158 @@ -139,12 +144,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MU_BDIV_Q_THRESHOLD 998 #define MATRIX22_STRASSEN_THRESHOLD 11 -#define HGCD_THRESHOLD 105 +#define HGCD_THRESHOLD 103 +#define HGCD_APPR_THRESHOLD 110 +#define HGCD_REDUCE_THRESHOLD 1962 #define GCD_DC_THRESHOLD 318 #define GCDEXT_DC_THRESHOLD 242 #define JACOBI_BASE_METHOD 4 #define GET_STR_DC_THRESHOLD 12 #define GET_STR_PRECOMPUTE_THRESHOLD 23 -#define SET_STR_DC_THRESHOLD 858 -#define SET_STR_PRECOMPUTE_THRESHOLD 1864 +#define SET_STR_DC_THRESHOLD 650 +#define SET_STR_PRECOMPUTE_THRESHOLD 1781 diff --git a/mpn/powerpc64/mode64/p5/gmp-mparam.h b/mpn/powerpc64/mode64/p5/gmp-mparam.h index 827b555c8..9220f99d5 100644 --- a/mpn/powerpc64/mode64/p5/gmp-mparam.h +++ b/mpn/powerpc64/mode64/p5/gmp-mparam.h @@ -1,4 +1,4 @@ -/* gmp-mparam.h -- Compiler/machine parameter header file. +/* POWER5 gmp-mparam.h -- Compiler/machine parameter header file. Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010 Free Software Foundation, Inc. @@ -31,6 +31,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ #define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11 #define USE_PREINV_DIVREM_1 0 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always (native) */ #define BMOD_1_TO_MOD_1_THRESHOLD 40 @@ -38,22 +39,26 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MUL_TOOM33_THRESHOLD 50 #define MUL_TOOM44_THRESHOLD 121 #define MUL_TOOM6H_THRESHOLD 202 -#define MUL_TOOM8H_THRESHOLD 303 +#define MUL_TOOM8H_THRESHOLD 260 #define MUL_TOOM32_TO_TOOM43_THRESHOLD 82 #define MUL_TOOM32_TO_TOOM53_THRESHOLD 91 #define MUL_TOOM42_TO_TOOM53_THRESHOLD 81 #define MUL_TOOM42_TO_TOOM63_THRESHOLD 88 -#define SQR_BASECASE_THRESHOLD 9 -#define SQR_TOOM2_THRESHOLD 36 -#define SQR_TOOM3_THRESHOLD 59 -#define SQR_TOOM4_THRESHOLD 147 -#define SQR_TOOM6_THRESHOLD 204 -#define SQR_TOOM8_THRESHOLD 288 +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 24 +#define SQR_TOOM3_THRESHOLD 73 +#define SQR_TOOM4_THRESHOLD 142 +#define SQR_TOOM6_THRESHOLD 191 +#define SQR_TOOM8_THRESHOLD 284 -#define MULMOD_BNM1_THRESHOLD 14 -#define SQRMOD_BNM1_THRESHOLD 16 +#define MULMID_TOOM42_THRESHOLD 32 + +#define MULMOD_BNM1_THRESHOLD 12 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define POWM_SEC_TABLE 4,35,387,1068,2699 #define MUL_FFT_MODF_THRESHOLD 348 /* k = 5 */ #define MUL_FFT_TABLE3 \ @@ -166,15 +171,15 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define SQR_FFT_THRESHOLD 2752 #define MULLO_BASECASE_THRESHOLD 0 -#define MULLO_DC_THRESHOLD 31 +#define MULLO_DC_THRESHOLD 42 #define MULLO_MUL_N_THRESHOLD 6633 -#define DC_DIV_QR_THRESHOLD 37 +#define DC_DIV_QR_THRESHOLD 43 #define DC_DIVAPPR_Q_THRESHOLD 155 #define DC_BDIV_QR_THRESHOLD 46 -#define DC_BDIV_Q_THRESHOLD 112 +#define DC_BDIV_Q_THRESHOLD 120 -#define INV_MULMOD_BNM1_THRESHOLD 26 +#define INV_MULMOD_BNM1_THRESHOLD 52 #define INV_NEWTON_THRESHOLD 177 #define INV_APPR_THRESHOLD 165 @@ -189,11 +194,13 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MATRIX22_STRASSEN_THRESHOLD 15 #define HGCD_THRESHOLD 108 -#define GCD_DC_THRESHOLD 303 +#define HGCD_APPR_THRESHOLD 113 +#define HGCD_REDUCE_THRESHOLD 2121 +#define GCD_DC_THRESHOLD 315 #define GCDEXT_DC_THRESHOLD 237 #define JACOBI_BASE_METHOD 4 #define GET_STR_DC_THRESHOLD 13 #define GET_STR_PRECOMPUTE_THRESHOLD 23 -#define SET_STR_DC_THRESHOLD 532 -#define SET_STR_PRECOMPUTE_THRESHOLD 1639 +#define SET_STR_DC_THRESHOLD 650 +#define SET_STR_PRECOMPUTE_THRESHOLD 1585 diff --git a/mpn/powerpc64/mode64/p6/aorsmul_1.asm b/mpn/powerpc64/mode64/p6/aorsmul_1.asm new file mode 100644 index 000000000..4bd508488 --- /dev/null +++ b/mpn/powerpc64/mode64/p6/aorsmul_1.asm @@ -0,0 +1,172 @@ +dnl PowerPC-64 mpn_addmul_1 and mpn_submul_1 optimised for power6. + +dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010, 2011 +dnl Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C mpn_addmul_1 mpn_submul_1 +C cycles/limb cycles/limb +C POWER3/PPC630 ? ? +C POWER4/PPC970 ? ? +C POWER5 ? ? +C POWER6 12.25 12.8 +C POWER7 ? ? + +C TODO +C * Reduce register usage. +C * Schedule function entry code. +C * Unroll more. 8-way unrolling would bring us to 10 c/l, 16-way unrolling +C would bring us to 9 c/l. 
+C * Handle n = 1 and perhaps n = 2 separately, without saving any registers. + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`v0', `r6') + +ifdef(`OPERATION_addmul_1',` + define(ADDSUBC, adde) + define(ADDSUB, addc) + define(func, mpn_addmul_1) + define(func_nc, mpn_addmul_1c) C FIXME: not really supported + define(AM, `$1') + define(SM, `') + define(CLRRSC, `addic $1, r0, 0') +') +ifdef(`OPERATION_submul_1',` + define(ADDSUBC, subfe) + define(ADDSUB, subfc) + define(func, mpn_submul_1) + define(func_nc, mpn_submul_1c) C FIXME: not really supported + define(AM, `') + define(SM, `$1') + define(CLRRSC, `subfc $1, r0, r0') +') + +ASM_START() +PROLOGUE(func) + std r31, -8(r1) + std r30, -16(r1) + std r29, -24(r1) + std r28, -32(r1) + std r27, -40(r1) + + rldicl. r0, n, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addi n, n, 3 C compute count... + srdi n, n, 2 C ...for ctr + mtctr n C copy loop count into ctr + beq cr0, L(b0) + blt cr6, L(b1) + beq cr6, L(b2) + +L(b3): ld r8, 0(up) + ld r7, 8(up) + ld r27, 16(up) + addi up, up, 16 + addi rp, rp, 16 + mulld r5, r8, v0 + mulhdu r8, r8, v0 + mulld r9, r7, v0 + mulhdu r7, r7, v0 + mulld r11, r27, v0 + mulhdu r27, r27, v0 + ld r29, -16(rp) + ld r30, -8(rp) + ld r31, 0(rp) + addc r9, r9, r8 + adde r11, r11, r7 + addze r12, r27 + ADDSUB r5, r5, r29 + b L(l3) + +L(b2): ld r7, 0(up) + ld r27, 8(up) + addi up, up, 8 + addi rp, rp, 8 + mulld r9, r7, v0 + mulhdu r7, r7, v0 + mulld r11, r27, v0 + mulhdu r27, r27, v0 + ld r30, -8(rp) + ld r31, 0(rp) + addc r11, r11, r7 + addze r12, r27 + ADDSUB r9, r9, r30 + b L(l2) + +L(b1): ld r27, 0(up) + ld r31, 0(rp) + mulld r11, r27, v0 + mulhdu r12, r27, v0 + ADDSUB r11, r11, r31 + b L(l1) + +L(b0): addi up, up, -8 + addi rp, rp, -8 + CLRRSC( r12) C clear r12 and clr/set cy + + ALIGN(32) +L(top): +SM(` subfe r11, r0, r0') C complement... 
+SM(` addic r11, r11, 1') C ...carry flag + ld r10, 8(up) + ld r8, 16(up) + ld r7, 24(up) + ld r27, 32(up) + addi up, up, 32 + addi rp, rp, 32 + mulld r0, r10, v0 + mulhdu r10, r10, v0 + mulld r5, r8, v0 + mulhdu r8, r8, v0 + mulld r9, r7, v0 + mulhdu r7, r7, v0 + mulld r11, r27, v0 + mulhdu r27, r27, v0 + ld r28, -24(rp) + adde r0, r0, r12 + ld r29, -16(rp) + adde r5, r5, r10 + ld r30, -8(rp) + ld r31, 0(rp) + adde r9, r9, r8 + adde r11, r11, r7 + addze r12, r27 + ADDSUB r0, r0, r28 + std r0, -24(rp) + ADDSUBC r5, r5, r29 +L(l3): std r5, -16(rp) + ADDSUBC r9, r9, r30 +L(l2): std r9, -8(rp) + ADDSUBC r11, r11, r31 +L(l1): std r11, 0(rp) + bdnz L(top) + +AM(` addze r3, r12') +SM(` subfe r11, r0, r0') C complement... + ld r31, -8(r1) +SM(` subf r3, r11, r12') + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + ld r27, -40(r1) + blr +EPILOGUE() diff --git a/mpn/powerpc64/mode64/p6/gmp-mparam.h b/mpn/powerpc64/mode64/p6/gmp-mparam.h index d447b56d9..5392138f1 100644 --- a/mpn/powerpc64/mode64/p6/gmp-mparam.h +++ b/mpn/powerpc64/mode64/p6/gmp-mparam.h @@ -1,7 +1,7 @@ -/* gmp-mparam.h -- Compiler/machine parameter header file. +/* POWER6 gmp-mparam.h -- Compiler/machine parameter header file. -Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010 Free -Software Foundation, Inc. +Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010, 2011 +Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -31,6 +31,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ #define PREINV_MOD_1_TO_MOD_1_THRESHOLD 5 #define USE_PREINV_DIVREM_1 0 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always (native) */ #define BMOD_1_TO_MOD_1_THRESHOLD 21 @@ -38,23 +39,27 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MUL_TOOM33_THRESHOLD 50 #define MUL_TOOM44_THRESHOLD 112 #define MUL_TOOM6H_THRESHOLD 274 -#define MUL_TOOM8H_THRESHOLD 430 +#define MUL_TOOM8H_THRESHOLD 339 #define MUL_TOOM32_TO_TOOM43_THRESHOLD 62 -#define MUL_TOOM32_TO_TOOM53_THRESHOLD 84 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 76 #define MUL_TOOM42_TO_TOOM53_THRESHOLD 73 -#define MUL_TOOM42_TO_TOOM63_THRESHOLD 66 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 78 -#define SQR_BASECASE_THRESHOLD 9 -#define SQR_TOOM2_THRESHOLD 30 -#define SQR_TOOM3_THRESHOLD 53 -#define SQR_TOOM4_THRESHOLD 148 +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 24 +#define SQR_TOOM3_THRESHOLD 49 +#define SQR_TOOM4_THRESHOLD 136 #define SQR_TOOM6_THRESHOLD 226 -#define SQR_TOOM8_THRESHOLD 430 +#define SQR_TOOM8_THRESHOLD 393 + +#define MULMID_TOOM42_THRESHOLD 36 #define MULMOD_BNM1_THRESHOLD 14 #define SQRMOD_BNM1_THRESHOLD 14 +#define POWM_SEC_TABLE 4,23,213,840,2618 + #define MUL_FFT_MODF_THRESHOLD 340 /* k = 5 */ #define MUL_FFT_TABLE3 \ { { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ @@ -106,34 +111,36 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define SQR_FFT_THRESHOLD 2368 #define MULLO_BASECASE_THRESHOLD 5 -#define MULLO_DC_THRESHOLD 28 -#define MULLO_MUL_N_THRESHOLD 6633 - -#define DC_DIV_QR_THRESHOLD 27 -#define DC_DIVAPPR_Q_THRESHOLD 112 -#define DC_BDIV_QR_THRESHOLD 29 -#define DC_BDIV_Q_THRESHOLD 86 - -#define INV_MULMOD_BNM1_THRESHOLD 47 -#define INV_NEWTON_THRESHOLD 93 -#define INV_APPR_THRESHOLD 91 - -#define BINV_NEWTON_THRESHOLD 132 -#define REDC_1_TO_REDC_N_THRESHOLD 39 - -#define MU_DIV_QR_THRESHOLD 855 -#define MU_DIVAPPR_Q_THRESHOLD 807 -#define MUPI_DIV_QR_THRESHOLD 33 -#define MU_BDIV_QR_THRESHOLD 807 -#define MU_BDIV_Q_THRESHOLD 872 - -#define MATRIX22_STRASSEN_THRESHOLD 11 -#define HGCD_THRESHOLD 64 -#define GCD_DC_THRESHOLD 237 -#define GCDEXT_DC_THRESHOLD 183 +#define MULLO_DC_THRESHOLD 61 +#define MULLO_MUL_N_THRESHOLD 3271 + +#define DC_DIV_QR_THRESHOLD 59 +#define DC_DIVAPPR_Q_THRESHOLD 200 +#define DC_BDIV_QR_THRESHOLD 70 +#define DC_BDIV_Q_THRESHOLD 168 + +#define INV_MULMOD_BNM1_THRESHOLD 61 +#define INV_NEWTON_THRESHOLD 166 +#define INV_APPR_THRESHOLD 166 + +#define BINV_NEWTON_THRESHOLD 222 +#define REDC_1_TO_REDC_N_THRESHOLD 63 + +#define MU_DIV_QR_THRESHOLD 998 +#define MU_DIVAPPR_Q_THRESHOLD 979 +#define MUPI_DIV_QR_THRESHOLD 59 +#define MU_BDIV_QR_THRESHOLD 889 +#define MU_BDIV_Q_THRESHOLD 1078 + +#define MATRIX22_STRASSEN_THRESHOLD 13 +#define HGCD_THRESHOLD 109 +#define HGCD_APPR_THRESHOLD 108 +#define HGCD_REDUCE_THRESHOLD 1052 +#define GCD_DC_THRESHOLD 501 +#define GCDEXT_DC_THRESHOLD 249 #define JACOBI_BASE_METHOD 4 -#define GET_STR_DC_THRESHOLD 17 -#define GET_STR_PRECOMPUTE_THRESHOLD 27 +#define GET_STR_DC_THRESHOLD 16 +#define GET_STR_PRECOMPUTE_THRESHOLD 29 #define SET_STR_DC_THRESHOLD 532 -#define SET_STR_PRECOMPUTE_THRESHOLD 1648 +#define SET_STR_PRECOMPUTE_THRESHOLD 1639 diff --git a/mpn/powerpc64/mode64/p6/mul_basecase.asm b/mpn/powerpc64/mode64/p6/mul_basecase.asm index 427d6081a..52c5af8ff 100644 --- a/mpn/powerpc64/mode64/p6/mul_basecase.asm +++ 
b/mpn/powerpc64/mode64/p6/mul_basecase.asm @@ -1,4 +1,4 @@ -dnl PowerPC-64 mpn_basecase. +dnl PowerPC-64 mpn_mul_basecase. dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010 Free dnl Software Foundation, Inc. diff --git a/mpn/powerpc64/mode64/p7/gmp-mparam.h b/mpn/powerpc64/mode64/p7/gmp-mparam.h new file mode 100644 index 000000000..02603c525 --- /dev/null +++ b/mpn/powerpc64/mode64/p7/gmp-mparam.h @@ -0,0 +1,159 @@ +/* POWER7 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010, 2011 +Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define BYTES_PER_MP_LIMB 8 + +/* 3550 MHz POWER7 (gcc110.fsffrance.org) */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 7 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13 +#define USE_PREINV_DIVREM_1 0 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 28 + +#define MUL_TOOM22_THRESHOLD 22 +#define MUL_TOOM33_THRESHOLD 73 +#define MUL_TOOM44_THRESHOLD 202 +#define MUL_TOOM6H_THRESHOLD 298 +#define MUL_TOOM8H_THRESHOLD 406 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 143 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 135 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 141 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 36 +#define SQR_TOOM3_THRESHOLD 109 +#define SQR_TOOM4_THRESHOLD 202 +#define SQR_TOOM6_THRESHOLD 303 +#define SQR_TOOM8_THRESHOLD 399 + +#define MULMID_TOOM42_THRESHOLD 62 + +#define MULMOD_BNM1_THRESHOLD 15 +#define SQRMOD_BNM1_THRESHOLD 16 + +#define POWM_SEC_TABLE 6,65,342,1465 + +#define MUL_FFT_MODF_THRESHOLD 436 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 436, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 21, 9}, { 11, 8}, { 29, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \ + { 63, 9}, { 43,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 135,11}, { 79,10}, \ 
+ { 159,11}, { 95,10}, { 191,11}, { 111,12}, \ + { 63,11}, { 127,10}, { 255,11}, { 143,10}, \ + { 287, 9}, { 575,10}, { 303,11}, { 159,12}, \ + { 95,11}, { 191,10}, { 383,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543, 9}, { 1087,11}, { 287,10}, { 575,11}, \ + { 303,12}, { 159,11}, { 319,10}, { 639,11}, \ + { 335,10}, { 671,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 447,13}, { 8192,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 106 +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 308, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \ + { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \ + { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \ + { 639,11}, { 175,12}, { 95,11}, { 191,10}, \ + { 383, 9}, { 767,11}, { 207,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543,11}, { 287,10}, { 575,11}, { 303,12}, \ + { 159,11}, { 319,10}, { 639, 9}, { 1279,10}, \ + { 671,11}, { 351,10}, { 703,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,10}, { 831,12}, \ + { 223,11}, { 447,10}, { 895,11}, { 479,13}, \ + { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define 
SQR_FFT_TABLE3_SIZE 103 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 3 +#define MULLO_DC_THRESHOLD 23 +#define MULLO_MUL_N_THRESHOLD 9174 + +#define DC_DIV_QR_THRESHOLD 30 +#define DC_DIVAPPR_Q_THRESHOLD 124 +#define DC_BDIV_QR_THRESHOLD 66 +#define DC_BDIV_Q_THRESHOLD 160 + +#define INV_MULMOD_BNM1_THRESHOLD 81 +#define INV_NEWTON_THRESHOLD 165 +#define INV_APPR_THRESHOLD 133 + +#define BINV_NEWTON_THRESHOLD 300 +#define REDC_1_TO_REDC_N_THRESHOLD 76 + +#define MU_DIV_QR_THRESHOLD 1470 +#define MU_DIVAPPR_Q_THRESHOLD 1442 +#define MUPI_DIV_QR_THRESHOLD 58 +#define MU_BDIV_QR_THRESHOLD 1470 +#define MU_BDIV_Q_THRESHOLD 1499 + +#define MATRIX22_STRASSEN_THRESHOLD 15 +#define HGCD_THRESHOLD 124 +#define HGCD_APPR_THRESHOLD 155 +#define HGCD_REDUCE_THRESHOLD 3134 +#define GCD_DC_THRESHOLD 492 +#define GCDEXT_DC_THRESHOLD 333 +#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 17 +#define SET_STR_DC_THRESHOLD 1517 +#define SET_STR_PRECOMPUTE_THRESHOLD 3421 diff --git a/mpn/powerpc64/mode64/rsh1add_n.asm b/mpn/powerpc64/mode64/rsh1add_n.asm index 8af3ca774..2a5ef3060 100644 --- a/mpn/powerpc64/mode64/rsh1add_n.asm +++ b/mpn/powerpc64/mode64/rsh1add_n.asm @@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
include(`../config.m4') -C cycles/limb -C POWER3/PPC630 2 (1.5 c/l should be possible) -C POWER4/PPC970 4 (2.0 c/l should be possible) -C POWER5 3.5 (2.0 c/l should be possible) -C POWER6 4.5 +C cycles/limb +C POWER3/PPC630 2 (1.5 c/l should be possible) +C POWER4/PPC970 4 (2.0 c/l should be possible) +C POWER5 3.5 (2.0 c/l should be possible) +C POWER6 4.5 +C POWER7 3.5 define(`rp',`r3') define(`up',`r4') diff --git a/mpn/powerpc64/mode64/rsh1sub_n.asm b/mpn/powerpc64/mode64/rsh1sub_n.asm index 1faa03379..b10eb8ab7 100644 --- a/mpn/powerpc64/mode64/rsh1sub_n.asm +++ b/mpn/powerpc64/mode64/rsh1sub_n.asm @@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630 2 (1.5 c/l should be possible) -C POWER4/PPC970 4 (2.0 c/l should be possible) -C POWER5 3.5 (2.0 c/l should be possible) -C POWER6 4.5 +C cycles/limb +C POWER3/PPC630 2 (1.5 c/l should be possible) +C POWER4/PPC970 4 (2.0 c/l should be possible) +C POWER5 3.5 (2.0 c/l should be possible) +C POWER6 4.5 +C POWER7 3.5 define(`rp',`r3') define(`up',`r4') diff --git a/mpn/powerpc64/mode64/sqr_basecase.asm b/mpn/powerpc64/mode64/sqr_basecase.asm new file mode 100644 index 000000000..72ac2d318 --- /dev/null +++ b/mpn/powerpc64/mode64/sqr_basecase.asm @@ -0,0 +1,852 @@ +dnl PowerPC-64 mpn_sqr_basecase. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010, 2011 Free +dnl Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. 
+ +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 6-18 +C POWER4/PPC970 8 +C POWER5 8 +C POWER6 16.25 +C POWER7 3.77 + +C NOTES +C * This is very crude, cleanup! +C * Try to reduce the number of needed live registers. +C * Rewrite for POWER6 to use 8 consecutive muls, not 2 groups of 4. The +C cost will be more live registers. +C * Rewrite for POWER7 to use addmul_2 building blocks; this will reduce code +C size a lot and speed things up perhaps 25%. +C * Use computed goto in order to compress the code. +C * Implement a larger final corner. +C * Schedule callee-saves register saves into other insns. This could save +C about 5 cycles/call. (We cannot analogously optimise the restores, since +C the sqr_diag_addlsh1 loop has no wind-down code as currently written.) +C * Should the alternating std/adde sequences be split? Some pipelines handle +C adde poorly, and might sequentialise all these instructions. +C * The sqr_diag_addlsh1 loop was written for POWER6 and its preferences for +C adjacent integer multiply insns. Except for the multiply insns, the code +C was not carefully optimised for POWER6 or any other CPU. +C * Perform cross-jumping in sqr_diag_addlsh1's feed-in code, into the loop. 
+ +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') + +define(`rp_outer', `r25') +define(`up_outer', `r21') +define(`rp_saved', `r22') +define(`up_saved', `r23') +define(`n_saved', `r24') + +ASM_START() +PROLOGUE(mpn_sqr_basecase) + cmpdi cr0, n, 2 + bge cr0, L(ge2) + ld r5, 0(up) C n = 1 + nop + mulld r8, r5, r5 C weight 0 + mulhdu r9, r5, r5 C weight 1 + std r8, 0(rp) + std r9, 8(rp) + blr + ALIGN(16) +L(ge2): bgt cr0, L(gt2) + ld r0, 0(up) C n = 2 + nop + mulld r8, r0, r0 C u0 * u0 + mulhdu r9, r0, r0 C u0 * u0 + ld r6, 8(up) + mulld r10, r6, r6 C u1 * u1 + mulhdu r11, r6, r6 C u1 * u1 + mulld r4, r6, r0 C u1 * u0 + mulhdu r5, r6, r0 C u1 * u0 + addc r4, r4, r4 + adde r5, r5, r5 + addze r11, r11 + addc r9, r9, r4 + adde r10, r10, r5 + addze r11, r11 + std r8, 0(rp) + std r9, 8(rp) + std r10, 16(rp) + std r11, 24(rp) + blr + + ALIGN(16) +L(gt2): std r31, -8(r1) + std r30, -16(r1) + std r29, -24(r1) + std r28, -32(r1) + std r27, -40(r1) + std r26, -48(r1) + std r25, -56(r1) + std r24, -64(r1) + std r23, -72(r1) + std r22, -80(r1) + std r21, -88(r1) + + mr rp_saved, rp + mr up_saved, up + mr n_saved, n + mr rp_outer, rp + mr up_outer, up + + rldicl. r0, n, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addic r7, n, 2 C compute count... 
+ srdi r7, r7, 2 C ...for ctr + mtctr r7 C copy count into ctr + beq- cr0, L(b0) + blt- cr6, L(b1) + beq- cr6, L(b2) + +L(b3): ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + addi up, up, 24 + li r12, 0 C carry limb + bdz L(em3) + + ALIGN(16) +L(tm3): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 0(up) + ld r27, 8(up) + adde r0, r0, r12 + adde r7, r7, r26 + mulld r26, r9, r6 + mulhdu r10, r9, r6 + mulld r11, r27, r6 + mulhdu r12, r27, r6 + ld r9, 16(up) + ld r27, 24(up) + std r0, 8(rp) + adde r26, r26, r8 + std r7, 16(rp) + adde r11, r11, r10 + std r26, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + bdnz L(tm3) + +L(em3): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + adde r0, r0, r12 + adde r7, r7, r26 + std r0, 8(rp) + std r7, 16(rp) + addze r8, r8 + std r8, 24(rp) + addi n, n, 2 + b L(outer_loop) + +L(b0): ld r6, 0(up) + ld r27, 8(up) + mulld r7, r27, r6 + mulhdu r12, r27, r6 + std r7, 8(rp) + addi rp, rp, 8 + ld r9, 16(up) + ld r27, 24(up) + addi up, up, 32 + bdz L(em0) + + ALIGN(16) +L(tm0): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 0(up) + ld r27, 8(up) + adde r0, r0, r12 + adde r7, r7, r26 + mulld r26, r9, r6 + mulhdu r10, r9, r6 + mulld r11, r27, r6 + mulhdu r12, r27, r6 + ld r9, 16(up) + ld r27, 24(up) + std r0, 8(rp) + adde r26, r26, r8 + std r7, 16(rp) + adde r11, r11, r10 + std r26, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + bdnz L(tm0) + +L(em0): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + adde r0, r0, r12 + adde r7, r7, r26 + std r0, 8(rp) + std r7, 16(rp) + addze r8, r8 + std r8, 24(rp) + addi n, n, 2 + b L(outer_loop_ent_2) + +L(b1): ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r12, r27, r6 + addc r7, r7, r26 + std r0, 8(rp) + std r7, 16(rp) + addi rp, rp, 16 + ld r9, 24(up) + ld r27, 32(up) 
+ addi up, up, 40 + bdz L(em1) + + ALIGN(16) +L(tm1): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 0(up) + ld r27, 8(up) + adde r0, r0, r12 + adde r7, r7, r26 + mulld r26, r9, r6 + mulhdu r10, r9, r6 + mulld r11, r27, r6 + mulhdu r12, r27, r6 + ld r9, 16(up) + ld r27, 24(up) + std r0, 8(rp) + adde r26, r26, r8 + std r7, 16(rp) + adde r11, r11, r10 + std r26, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + bdnz L(tm1) + +L(em1): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + adde r0, r0, r12 + adde r7, r7, r26 + std r0, 8(rp) + std r7, 16(rp) + addze r8, r8 + std r8, 24(rp) + addi n, n, 2 + b L(outer_loop_ent_3) + +L(b2): addi r7, r7, -1 C FIXME + mtctr r7 C FIXME + ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 24(up) + mulld r11, r9, r6 + mulhdu r10, r9, r6 + addc r7, r7, r26 + adde r11, r11, r8 + addze r12, r10 + std r0, 8(rp) + std r7, 16(rp) + std r11, 24(rp) + addi rp, rp, 24 + ld r9, 32(up) + ld r27, 40(up) + addi up, up, 48 + bdz L(em2) + + ALIGN(16) +L(tm2): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 0(up) + ld r27, 8(up) + adde r0, r0, r12 + adde r7, r7, r26 + mulld r26, r9, r6 + mulhdu r10, r9, r6 + mulld r11, r27, r6 + mulhdu r12, r27, r6 + ld r9, 16(up) + ld r27, 24(up) + std r0, 8(rp) + adde r26, r26, r8 + std r7, 16(rp) + adde r11, r11, r10 + std r26, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + bdnz L(tm2) + +L(em2): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + adde r0, r0, r12 + adde r7, r7, r26 + std r0, 8(rp) + std r7, 16(rp) + addze r8, r8 + std r8, 24(rp) + addi n, n, 2 + b L(outer_loop_ent_0) + + +L(outer_loop): + addi n, n, -1 + addi up_outer, up_outer, 8 + addi rp_outer, rp_outer, 16 + + mr up, up_outer + addi rp, rp_outer, 8 + + srdi r0, n, 2 + mtctr r0 + + bdz 
L(outer_end) + + ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 24(up) + ld r28, 0(rp) + ld r29, 8(rp) + ld r30, 16(rp) + mulld r11, r9, r6 + mulhdu r10, r9, r6 + addc r7, r7, r26 + adde r11, r11, r8 + addze r12, r10 + addc r0, r0, r28 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + adde r11, r11, r30 + std r11, 16(rp) + addi rp, rp, 24 + ld r9, 32(up) + ld r27, 40(up) + addi up, up, 48 + bdz L(ea1) + + ALIGN(16) +L(ta1): mulld r0, r9, r6 + mulhdu r26, r9, r6 C 9 + mulld r7, r27, r6 + mulhdu r8, r27, r6 C 27 + ld r9, 0(up) + ld r28, 0(rp) + ld r27, 8(up) + ld r29, 8(rp) + adde r0, r0, r12 C 0 12 + adde r7, r7, r26 C 5 7 + mulld r26, r9, r6 + mulhdu r10, r9, r6 C 9 + mulld r11, r27, r6 + mulhdu r12, r27, r6 C 27 + ld r9, 16(up) + ld r30, 16(rp) + ld r27, 24(up) + ld r31, 24(rp) + adde r26, r26, r8 C 8 5 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 0(rp) C 0 + adde r7, r7, r29 C 7 29 + std r7, 8(rp) C 7 + adde r26, r26, r30 C 5 30 + std r26, 16(rp) C 5 + adde r11, r11, r31 C 11 31 + std r11, 24(rp) C 11 + addi up, up, 32 + addi rp, rp, 32 + bdnz L(ta1) + +L(ea1): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r28, 0(rp) + ld r29, 8(rp) + adde r0, r0, r12 + adde r7, r7, r26 + addze r8, r8 + addc r0, r0, r28 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + addze r8, r8 + std r8, 16(rp) + +L(outer_loop_ent_0): + addi n, n, -1 + addi up_outer, up_outer, 8 + addi rp_outer, rp_outer, 16 + + mr up, up_outer + addi rp, rp_outer, 8 + + srdi r0, n, 2 + mtctr r0 + + ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + ld r28, 0(rp) + ld r29, 8(rp) + mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + addc r0, r0, r28 + adde r7, r7, r26 + addze r12, r8 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + addi rp, rp, 16 + ld r9, 24(up) + ld r27, 32(up) + addi up, up, 40 + bdz L(ea0) 
+ + ALIGN(16) +L(ta0): mulld r0, r9, r6 + mulhdu r26, r9, r6 C 9 + mulld r7, r27, r6 + mulhdu r8, r27, r6 C 27 + ld r9, 0(up) + ld r28, 0(rp) + ld r27, 8(up) + ld r29, 8(rp) + adde r0, r0, r12 C 0 12 + adde r7, r7, r26 C 5 7 + mulld r26, r9, r6 + mulhdu r10, r9, r6 C 9 + mulld r11, r27, r6 + mulhdu r12, r27, r6 C 27 + ld r9, 16(up) + ld r30, 16(rp) + ld r27, 24(up) + ld r31, 24(rp) + adde r26, r26, r8 C 8 5 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 0(rp) C 0 + adde r7, r7, r29 C 7 29 + std r7, 8(rp) C 7 + adde r26, r26, r30 C 5 30 + std r26, 16(rp) C 5 + adde r11, r11, r31 C 11 31 + std r11, 24(rp) C 11 + addi up, up, 32 + addi rp, rp, 32 + bdnz L(ta0) + +L(ea0): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r28, 0(rp) + ld r29, 8(rp) + adde r0, r0, r12 + adde r7, r7, r26 + addze r8, r8 + addc r0, r0, r28 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + addze r8, r8 + std r8, 16(rp) + +L(outer_loop_ent_3): + addi n, n, -1 + addi up_outer, up_outer, 8 + addi rp_outer, rp_outer, 16 + + mr up, up_outer + addi rp, rp_outer, 8 + + srdi r0, n, 2 + mtctr r0 + + ld r6, 0(up) + ld r9, 8(up) + ld r28, 0(rp) + mulld r0, r9, r6 + mulhdu r12, r9, r6 + addc r0, r0, r28 + std r0, 0(rp) + addi rp, rp, 8 + ld r9, 16(up) + ld r27, 24(up) + addi up, up, 32 + bdz L(ea3) + + ALIGN(16) +L(ta3): mulld r0, r9, r6 + mulhdu r26, r9, r6 C 9 + mulld r7, r27, r6 + mulhdu r8, r27, r6 C 27 + ld r9, 0(up) + ld r28, 0(rp) + ld r27, 8(up) + ld r29, 8(rp) + adde r0, r0, r12 C 0 12 + adde r7, r7, r26 C 5 7 + mulld r26, r9, r6 + mulhdu r10, r9, r6 C 9 + mulld r11, r27, r6 + mulhdu r12, r27, r6 C 27 + ld r9, 16(up) + ld r30, 16(rp) + ld r27, 24(up) + ld r31, 24(rp) + adde r26, r26, r8 C 8 5 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 0(rp) C 0 + adde r7, r7, r29 C 7 29 + std r7, 8(rp) C 7 + adde r26, r26, r30 C 5 30 + std r26, 16(rp) C 5 + adde r11, r11, r31 C 11 31 + std 
r11, 24(rp) C 11 + addi up, up, 32 + addi rp, rp, 32 + bdnz L(ta3) + +L(ea3): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r28, 0(rp) + ld r29, 8(rp) + adde r0, r0, r12 + adde r7, r7, r26 + addze r8, r8 + addc r0, r0, r28 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + addze r8, r8 + std r8, 16(rp) + + +L(outer_loop_ent_2): + addi n, n, -1 + addi up_outer, up_outer, 8 + addi rp_outer, rp_outer, 16 + + mr up, up_outer + addi rp, rp_outer, 8 + + srdi r0, n, 2 + mtctr r0 + + addic r0, r0, 0 + li r12, 0 C cy_limb = 0 + ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + bdz L(ea2) + addi up, up, 24 + + ALIGN(16) +L(ta2): mulld r0, r9, r6 + mulhdu r26, r9, r6 C 9 + mulld r7, r27, r6 + mulhdu r8, r27, r6 C 27 + ld r9, 0(up) + ld r28, 0(rp) + ld r27, 8(up) + ld r29, 8(rp) + adde r0, r0, r12 C 0 12 + adde r7, r7, r26 C 5 7 + mulld r26, r9, r6 + mulhdu r10, r9, r6 C 9 + mulld r11, r27, r6 + mulhdu r12, r27, r6 C 27 + ld r9, 16(up) + ld r30, 16(rp) + ld r27, 24(up) + ld r31, 24(rp) + adde r26, r26, r8 C 8 5 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 0(rp) C 0 + adde r7, r7, r29 C 7 29 + std r7, 8(rp) C 7 + adde r26, r26, r30 C 5 30 + std r26, 16(rp) C 5 + adde r11, r11, r31 C 11 31 + std r11, 24(rp) C 11 + addi up, up, 32 + addi rp, rp, 32 + bdnz L(ta2) + +L(ea2): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r28, 0(rp) + ld r29, 8(rp) + adde r0, r0, r12 + adde r7, r7, r26 + addze r8, r8 + addc r0, r0, r28 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + addze r8, r8 + std r8, 16(rp) + + b L(outer_loop) + +L(outer_end): + ld r6, 0(up) + ld r9, 8(up) + ld r11, 0(rp) + mulld r0, r9, r6 + mulhdu r8, r9, r6 + addc r0, r0, r11 + std r0, 0(rp) + addze r8, r8 + std r8, 8(rp) + +define(`rp', `rp_saved') +define(`up', `r5') +define(`n', `r6') +define(`climb', `r0') + + addi r4, rp_saved, 8 + mr r5, up_saved + mr r6, n_saved + + rldicl. 
r0, n, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addi n, n, 2 C compute count... + srdi n, n, 2 C ...for ctr + mtctr n C put loop count into ctr + beq cr0, L(xb0) + blt cr6, L(xb1) + beq cr6, L(xb2) + +L(xb3): ld r6, 0(up) + ld r7, 8(up) + ld r12, 16(up) + addi up, up, 24 + mulld r24, r6, r6 + mulhdu r25, r6, r6 + mulld r26, r7, r7 + mulhdu r27, r7, r7 + mulld r28, r12, r12 + mulhdu r29, r12, r12 + ld r10, 8(rp) + ld r11, 16(rp) + ld r6, 24(rp) + ld r7, 32(rp) + addc r10, r10, r10 + adde r11, r11, r11 + adde r6, r6, r6 + adde r7, r7, r7 + addze climb, r29 + addc r10, r10, r25 + adde r11, r11, r26 + adde r6, r6, r27 + adde r7, r7, r28 + std r24, 0(rp) + std r10, 8(rp) + std r11, 16(rp) + std r6, 24(rp) + std r7, 32(rp) + addi rp, rp, 40 + bdnz L(top) + b L(end) + +L(xb2): ld r6, 0(up) + ld r7, 8(up) + addi up, up, 16 + mulld r24, r6, r6 + mulhdu r25, r6, r6 + mulld r26, r7, r7 + mulhdu r27, r7, r7 + ld r10, 8(rp) + ld r11, 16(rp) + addc r10, r10, r10 + adde r11, r11, r11 + addze climb, r27 + addc r10, r10, r25 + adde r11, r11, r26 + std r24, 0(rp) + std r10, 8(rp) + std r11, 16(rp) + addi rp, rp, 24 + bdnz L(top) + b L(end) + +L(xb0): ld r6, 0(up) + ld r7, 8(up) + ld r12, 16(up) + ld r23, 24(up) + addi up, up, 32 + mulld r24, r6, r6 + mulhdu r25, r6, r6 + mulld r26, r7, r7 + mulhdu r27, r7, r7 + mulld r28, r12, r12 + mulhdu r29, r12, r12 + mulld r30, r23, r23 + mulhdu r31, r23, r23 + ld r10, 8(rp) + ld r11, 16(rp) + ld r6, 24(rp) + ld r7, 32(rp) + ld r12, 40(rp) + ld r23, 48(rp) + addc r10, r10, r10 + adde r11, r11, r11 + adde r6, r6, r6 + adde r7, r7, r7 + adde r12, r12, r12 + adde r23, r23, r23 + addze climb, r31 + std r24, 0(rp) + addc r10, r10, r25 + std r10, 8(rp) + adde r11, r11, r26 + std r11, 16(rp) + adde r6, r6, r27 + std r6, 24(rp) + adde r7, r7, r28 + std r7, 32(rp) + adde r12, r12, r29 + std r12, 40(rp) + adde r23, r23, r30 + std r23, 48(rp) + addi rp, rp, 56 + bdnz L(top) + b L(end) + +L(xb1): ld r6, 0(up) + addi up, up, 8 + mulld r24, r6, r6 + 
mulhdu climb, r6, r6 + std r24, 0(rp) + addic rp, rp, 8 C clear carry as side-effect + + ALIGN(32) +L(top): ld r6, 0(up) + ld r7, 8(up) + ld r12, 16(up) + ld r23, 24(up) + addi up, up, 32 + mulld r24, r6, r6 + mulhdu r25, r6, r6 + mulld r26, r7, r7 + mulhdu r27, r7, r7 + mulld r28, r12, r12 + mulhdu r29, r12, r12 + mulld r30, r23, r23 + mulhdu r31, r23, r23 + ld r8, 0(rp) + ld r9, 8(rp) + adde r8, r8, r8 + adde r9, r9, r9 + ld r10, 16(rp) + ld r11, 24(rp) + adde r10, r10, r10 + adde r11, r11, r11 + ld r6, 32(rp) + ld r7, 40(rp) + adde r6, r6, r6 + adde r7, r7, r7 + ld r12, 48(rp) + ld r23, 56(rp) + adde r12, r12, r12 + adde r23, r23, r23 + addze r31, r31 + addc r8, r8, climb + std r8, 0(rp) + adde r9, r9, r24 + std r9, 8(rp) + adde r10, r10, r25 + std r10, 16(rp) + adde r11, r11, r26 + std r11, 24(rp) + adde r6, r6, r27 + std r6, 32(rp) + adde r7, r7, r28 + std r7, 40(rp) + adde r12, r12, r29 + std r12, 48(rp) + adde r23, r23, r30 + std r23, 56(rp) + mr climb, r31 + addi rp, rp, 64 + bdnz L(top) + +L(end): addze climb, climb + std climb, 0(rp) + + ld r31, -8(r1) + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + ld r27, -40(r1) + ld r26, -48(r1) + ld r25, -56(r1) + ld r24, -64(r1) + ld r23, -72(r1) + ld r22, -80(r1) + ld r21, -88(r1) + blr +EPILOGUE() diff --git a/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm b/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm deleted file mode 100644 index 663f04c14..000000000 --- a/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm +++ /dev/null @@ -1,238 +0,0 @@ -dnl PowerPC-64 mpn_sqr_diag_addlsh1 - -dnl Copyright 2011 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. - -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published -dnl by the Free Software Foundation; either version 3 of the License, or (at -dnl your option) any later version. 
- -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. - -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb -C POWER3/PPC630 10 -C POWER4/PPC970 6 -C POWER5 5.375 -C POWER6 8.5 - -C NOTES -C * This was written for POWER6 and its preferences for adjacent integer -C multiply insns. The cost is that we get a large set of live registers, -C and therefore need to save 9 callee-saves registers. Except for the -C multiply insns, the code was not carefully optimised for POWER6 or any -C other CPU. -C * Perform some cross-jumping in the feed-in code, into the loop's tail. - -C refmpn_sqr_diag_addlsh1 (mp_ptr rp, mp_srcptr tp, mp_srcptr up, mp_size_t n) - -C INPUT PARAMETERS -define(`rp', `r3') -define(`tp', `r4') -define(`up', `r5') -define(`n', `r6') - -define(`climb', `r0') - -ASM_START() -PROLOGUE(mpn_sqr_diag_addlsh1) - std r31, -8(r1) - std r30, -16(r1) - std r29, -24(r1) - std r28, -32(r1) - std r27, -40(r1) - std r26, -48(r1) - std r25, -56(r1) - std r24, -64(r1) - std r23, -72(r1) - - rldicl. r0, n, 0,62 C r0 = n & 3, set cr0 - cmpdi cr6, r0, 2 - addi n, n, 2 C compute count... 
- srdi n, n, 2 C ...for ctr - mtctr n C put loop count into ctr - beq cr0, L(b0) - blt cr6, L(b1) - beq cr6, L(b2) - -L(b3): ld r6, 0(up) - ld r7, 8(up) - ld r12, 16(up) - addi up, up, 24 - mulld r24, r6, r6 - mulhdu r25, r6, r6 - mulld r26, r7, r7 - mulhdu r27, r7, r7 - mulld r28, r12, r12 - mulhdu r29, r12, r12 - ld r10, 0(tp) - ld r11, 8(tp) - ld r6, 16(tp) - ld r7, 24(tp) - addi tp, tp, 32 - addc r10, r10, r10 - adde r11, r11, r11 - adde r6, r6, r6 - adde r7, r7, r7 - addze climb, r29 - addc r10, r10, r25 - adde r11, r11, r26 - adde r6, r6, r27 - adde r7, r7, r28 - std r24, 0(rp) - std r10, 8(rp) - std r11, 16(rp) - std r6, 24(rp) - std r7, 32(rp) - addi rp, rp, 40 - bdnz L(top) - b L(end) - -L(b2): ld r6, 0(up) - ld r7, 8(up) - addi up, up, 16 - mulld r24, r6, r6 - mulhdu r25, r6, r6 - mulld r26, r7, r7 - mulhdu r27, r7, r7 - ld r10, 0(tp) - ld r11, 8(tp) - addi tp, tp, 16 - addc r10, r10, r10 - adde r11, r11, r11 - addze climb, r27 - addc r10, r10, r25 - adde r11, r11, r26 - std r24, 0(rp) - std r10, 8(rp) - std r11, 16(rp) - addi rp, rp, 24 - bdnz L(top) - b L(end) - -L(b0): ld r6, 0(up) - ld r7, 8(up) - ld r12, 16(up) - ld r23, 24(up) - addi up, up, 32 - mulld r24, r6, r6 - mulhdu r25, r6, r6 - mulld r26, r7, r7 - mulhdu r27, r7, r7 - mulld r28, r12, r12 - mulhdu r29, r12, r12 - mulld r30, r23, r23 - mulhdu r31, r23, r23 - ld r10, 0(tp) - ld r11, 8(tp) - ld r6, 16(tp) - ld r7, 24(tp) - ld r12, 32(tp) - ld r23, 40(tp) - addi tp, tp, 48 - addc r10, r10, r10 - adde r11, r11, r11 - adde r6, r6, r6 - adde r7, r7, r7 - adde r12, r12, r12 - adde r23, r23, r23 - addze climb, r31 - std r24, 0(rp) - addc r10, r10, r25 - std r10, 8(rp) - adde r11, r11, r26 - std r11, 16(rp) - adde r6, r6, r27 - std r6, 24(rp) - adde r7, r7, r28 - std r7, 32(rp) - adde r12, r12, r29 - std r12, 40(rp) - adde r23, r23, r30 - std r23, 48(rp) - addi rp, rp, 56 - bdnz L(top) - b L(end) - -L(b1): ld r6, 0(up) - addi up, up, 8 - mulld r24, r6, r6 - mulhdu climb, r6, r6 - std r24, 0(rp) - 
addic rp, rp, 8 C clear carry as side-effect - - ALIGN(32) -L(top): ld r6, 0(up) - ld r7, 8(up) - ld r12, 16(up) - ld r23, 24(up) - addi up, up, 32 - mulld r24, r6, r6 - mulhdu r25, r6, r6 - mulld r26, r7, r7 - mulhdu r27, r7, r7 - mulld r28, r12, r12 - mulhdu r29, r12, r12 - mulld r30, r23, r23 - mulhdu r31, r23, r23 - ld r8, 0(tp) - ld r9, 8(tp) - adde r8, r8, r8 - adde r9, r9, r9 - ld r10, 16(tp) - ld r11, 24(tp) - adde r10, r10, r10 - adde r11, r11, r11 - ld r6, 32(tp) - ld r7, 40(tp) - adde r6, r6, r6 - adde r7, r7, r7 - ld r12, 48(tp) - ld r23, 56(tp) - adde r12, r12, r12 - adde r23, r23, r23 - addi tp, tp, 64 - addze r31, r31 - addc r8, r8, climb - std r8, 0(rp) - adde r9, r9, r24 - std r9, 8(rp) - adde r10, r10, r25 - std r10, 16(rp) - adde r11, r11, r26 - std r11, 24(rp) - adde r6, r6, r27 - std r6, 32(rp) - adde r7, r7, r28 - std r7, 40(rp) - adde r12, r12, r29 - std r12, 48(rp) - adde r23, r23, r30 - std r23, 56(rp) - mr climb, r31 - addi rp, rp, 64 - bdnz L(top) - -L(end): addze climb, climb - std climb, 0(rp) - -L(ret): ld r31, -8(r1) - ld r30, -16(r1) - ld r29, -24(r1) - ld r28, -32(r1) - ld r27, -40(r1) - ld r26, -48(r1) - ld r25, -56(r1) - ld r24, -64(r1) - ld r23, -72(r1) - blr -EPILOGUE() diff --git a/mpn/powerpc64/rshift.asm b/mpn/powerpc64/rshift.asm index 6545af769..18406c57e 100644 --- a/mpn/powerpc64/rshift.asm +++ b/mpn/powerpc64/rshift.asm @@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C POWER3/PPC630 ? -C POWER4/PPC970 ? -C POWER5 2.25 -C POWER6 9.75 +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? +C POWER5 2.25 +C POWER6 9.75 +C POWER7 2.15 C TODO C * Try to reduce the number of needed live registers diff --git a/mpn/powerpc64/tabselect.asm b/mpn/powerpc64/tabselect.asm new file mode 100644 index 000000000..7d189388b --- /dev/null +++ b/mpn/powerpc64/tabselect.asm @@ -0,0 +1,96 @@ +dnl PowerPC-64 mpn_tabselect. 
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C POWER3/PPC630          ?
+C POWER4/PPC970          3.3
+C POWER5                 ?
+C POWER6                 ?
+C POWER7                 2.5
+
+C NOTES
+C  * This has not been tuned for any specific processor.  Its speed should not
+C    be too bad, though.
+C  * Using VMX could result in significant speedup for certain CPUs.
+
+C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+C
+C Copy one n-limb entry of the table at tp into rp.  The table is walked as
+C nents consecutive entries of n limbs each (tp is only ever advanced, one
+C limb at a time), and the entry with zero-based index which is the one that
+C ends up in rp.
+C
+C Method: for every entry, each limb of rp is rewritten as
+C     (tp limb AND mask) OR (rp limb AND NOT mask)
+C where mask is all ones on the pass over the selected entry and zero on all
+C other passes.  Every entry is loaded and rp is stored on every pass; no
+C branch and no address depends on which.
+C NOTE(review): presumably this is meant to keep timing and memory access
+C pattern independent of which; the code does not say so -- confirm.
+C NOTE(review): n = 0 would put 0 in ctr and still reach bdnz, and nents = 0
+C would not compare equal to 1 until it wraps; callers are assumed to pass
+C n >= 1 and nents >= 1 -- confirm against the C-level contract.
+C
+C Register use in this routine:
+C   r0       ctr reload value, (n+1)/2
+C   r9-r12   limb temporaries (r9 also scratch for the selection test)
+C   cr0      parity of n, set once at entry and not written again
+C   cr6      outer loop termination test
+C No stack frame is created and no register is saved or restored.
+define(`rp',	`r3')
+define(`tp',	`r4')
+define(`n',	`r5')
+define(`nents',	`r6')
+define(`which',	`r7')
+
+define(`mask',	`r8')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_tabselect)
+	addi	r0, n, 1
+	srdi	r0, r0, 1		C inner loop count: ctr ticks per entry, (n+1)/2
+	andi.	r9, n, 1		C cr0 EQ iff n even; r9 is a dummy target.  cr0 is tested on every pass
+	subf	which, nents, which	C which = which - nents, so that which + nents counts down to 0
+	sldi	n, n, 3			C n = byte size of one entry, used to rewind rp
+
+L(outer):
+	mtctr	r0			C put inner loop count in ctr
+
+	add	r9, which, nents	C are we at the selected table entry?  r9 = 0 iff yes
+	addic	r9, r9, -1		C set CF iff not selected entry (adding -1 carries unless r9 = 0)
+	subfe	mask, r0, r0		C mask = CF - 1: all ones if selected, else 0.  Value of r0 cancels out
+
+	beq	cr0, L(top)		C branch to loop entry if n even
+
+C n is odd: handle one limb here so that the rest is a whole number of pairs.
+	ld	r9, 0(tp)
+	addi	tp, tp, 8
+	and	r9, r9, mask		C keep table limb only on the selected pass
+	ld	r11, 0(rp)
+	andc	r11, r11, mask		C keep old rp limb on all other passes
+	or	r9, r9, r11
+	std	r9, 0(rp)
+	addi	rp, rp, 8
+	bdz	L(end)			C the single limb used one ctr tick; done if n was 1
+
+C Main loop: two limbs per iteration, same blend as above.
+	ALIGN(16)
+L(top):	ld	r9, 0(tp)
+	ld	r10, 8(tp)
+	addi	tp, tp, 16
+	nop				C NOTE(review): no architectural effect; presumably scheduling padding -- confirm
+	and	r9, r9, mask
+	and	r10, r10, mask
+	ld	r11, 0(rp)
+	ld	r12, 8(rp)
+	andc	r11, r11, mask
+	andc	r12, r12, mask
+	or	r9, r9, r11
+	or	r10, r10, r12
+	std	r9, 0(rp)
+	std	r10, 8(rp)
+	addi	rp, rp, 16
+	bdnz	L(top)
+
+C End of one table entry.  tp is left pointing at the next entry.
+L(end):	subf	rp, n, rp		C move rp back to beginning
+	cmpdi	cr6, nents, 1		C was this the last entry?  Compare before the decrement
+	addi	nents, nents, -1	C addi does not touch cr6
+	bne	cr6, L(outer)
+
+	blr
+EPILOGUE() |