Diffstat (limited to 'rts/gmp/mpn/sparc64')
-rw-r--r--  rts/gmp/mpn/sparc64/README        |  48
-rw-r--r--  rts/gmp/mpn/sparc64/add_n.asm     | 172
-rw-r--r--  rts/gmp/mpn/sparc64/addmul1h.asm  | 203
-rw-r--r--  rts/gmp/mpn/sparc64/addmul_1.asm  | 114
-rw-r--r--  rts/gmp/mpn/sparc64/copyi.asm     |  79
-rw-r--r--  rts/gmp/mpn/sparc64/gmp-mparam.h  |  88
-rw-r--r--  rts/gmp/mpn/sparc64/lshift.asm    |  97
-rw-r--r--  rts/gmp/mpn/sparc64/mul_1.asm     | 113
-rw-r--r--  rts/gmp/mpn/sparc64/mul_1h.asm    | 183
-rw-r--r--  rts/gmp/mpn/sparc64/rshift.asm    |  94
-rw-r--r--  rts/gmp/mpn/sparc64/sub_n.asm     | 172
-rw-r--r--  rts/gmp/mpn/sparc64/submul1h.asm  | 204
-rw-r--r--  rts/gmp/mpn/sparc64/submul_1.asm  | 114
13 files changed, 1681 insertions, 0 deletions
diff --git a/rts/gmp/mpn/sparc64/README b/rts/gmp/mpn/sparc64/README
new file mode 100644
index 0000000000..6923a133f3
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/README
@@ -0,0 +1,48 @@
+This directory contains mpn functions for 64-bit V9 SPARC
+
+RELEVANT OPTIMIZATION ISSUES
+
+The Ultra I/II pipeline executes up to two simple integer arithmetic operations
+per cycle. The 64-bit integer multiply instruction mulx takes from 5 cycles to
+35 cycles, depending on the position of the most significant bit of the 1st
+source operand. It cannot overlap with other instructions. For our use of
+mulx, it will take from 5 to 20 cycles.
+
+Integer conditional move instructions cannot dual-issue with other integer
+instructions. No conditional move can issue 1-5 cycles after a load. (Or
+something similarly bizarre.)
+
+Integer branches can issue with two integer arithmetic instructions. Likewise
+for integer loads. Four instructions may issue (arith, arith, ld/st, branch)
+but only if the branch is last.
+
+(The V9 architecture manual recommends that the 2nd operand of a multiply
+instruction be the smaller one. For UltraSPARC, they got things backwards and
+optimize for the wrong operand! Really helpful, given that multiply
+is incredibly slow on these CPUs!)
+
+STATUS
+
+There is new code in ~/prec/gmp-remote/sparc64. Not tested or completed, but
+the pipelines are worked out. Here are the timings:
+
+* lshift, rshift: The code is well-optimized and runs at 2.0 cycles/limb.
+
+* add_n, sub_n: add3.s currently runs at 6 cycles/limb. We use a bizarre
+  scheme of compares and branches (with some nops and fnops to align things)
+  and carefully stay away from the instructions intended for this application
+  (i.e., movcs and movcc).
+
+  Using movcc/movcs, even with deep unrolling, seems to get down to 7
+  cycles/limb.
+
+  The most promising approach is to split operands in 32-bit pieces using
+  srlx, then use two addccc, and finally combine the results with sllx+or.
+  The result could run at 5 cycles/limb, I think. It might be possible to
+  do without unrolling, or with minimal unrolling.
+
+* addmul_1/submul_1: Should optimize for when the scalar operand is < 2^32.
+* addmul_1/submul_1: Since mulx is horrendously slow on UltraSPARC I/II,
+  Karatsuba's method should save up to 16 cycles (i.e. > 20%).
+* mul_1 (and possibly the other multiply functions): Handle carry in the
+  same tricky way as add_n, sub_n.
diff --git a/rts/gmp/mpn/sparc64/add_n.asm b/rts/gmp/mpn/sparc64/add_n.asm
new file mode 100644
index 0000000000..72b3895a5b
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/add_n.asm
@@ -0,0 +1,172 @@
+! SPARC v9 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+! sum in a third limb vector.
+
+! Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+!
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr %o0 +! s1_ptr %o1 +! s2_ptr %o2 +! size %o3 + +include(`../config.m4') + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch +PROLOGUE(mpn_add_n) + +! 12 mem ops >= 12 cycles +! 8 shift insn >= 8 cycles +! 8 addccc, executing alone, +8 cycles +! Unrolling not mandatory...perhaps 2-way is best? +! Put one ldx/stx and one s?lx per issue tuple, fill with pointer arith and loop ctl +! All in all, it runs at 5 cycles/limb + + save %sp,-160,%sp + + addcc %g0,%g0,%g0 + + add %i3,-4,%i3 + brlz,pn %i3,L(there) + nop + + ldx [%i1+0],%l0 + ldx [%i2+0],%l4 + ldx [%i1+8],%l1 + ldx [%i2+8],%l5 + ldx [%i1+16],%l2 + ldx [%i2+16],%l6 + ldx [%i1+24],%l3 + ldx [%i2+24],%l7 + add %i1,32,%i1 + add %i2,32,%i2 + + add %i3,-4,%i3 + brlz,pn %i3,L(skip) + nop + b L(loop1) ! jump instead of executing many NOPs + nop + ALIGN(32) +!--------- Start main loop --------- +L(loop1): + addccc %l0,%l4,%g1 +!- + srlx %l0,32,%o0 + ldx [%i1+0],%l0 +!- + srlx %l4,32,%o4 + ldx [%i2+0],%l4 +!- + addccc %o0,%o4,%g0 +!- + addccc %l1,%l5,%g2 +!- + srlx %l1,32,%o1 + ldx [%i1+8],%l1 +!- + srlx %l5,32,%o5 + ldx [%i2+8],%l5 +!- + addccc %o1,%o5,%g0 +!- + addccc %l2,%l6,%g3 +!- + srlx %l2,32,%o2 + ldx [%i1+16],%l2 +!- + srlx %l6,32,%g5 ! asymmetry + ldx [%i2+16],%l6 +!- + addccc %o2,%g5,%g0 +!- + addccc %l3,%l7,%g4 +!- + srlx %l3,32,%o3 + ldx [%i1+24],%l3 + add %i1,32,%i1 +!- + srlx %l7,32,%o7 + ldx [%i2+24],%l7 + add %i2,32,%i2 +!- + addccc %o3,%o7,%g0 +!- + stx %g1,[%i0+0] +!- + stx %g2,[%i0+8] +!- + stx %g3,[%i0+16] + add %i3,-4,%i3 +!- + stx %g4,[%i0+24] + add %i0,32,%i0 + + brgez,pt %i3,L(loop1) + nop +!--------- End main loop --------- +L(skip): + addccc %l0,%l4,%g1 + srlx %l0,32,%o0 + srlx %l4,32,%o4 + addccc %o0,%o4,%g0 + addccc %l1,%l5,%g2 + srlx %l1,32,%o1 + srlx %l5,32,%o5 + addccc %o1,%o5,%g0 + addccc %l2,%l6,%g3 + srlx %l2,32,%o2 + srlx %l6,32,%g5 ! asymmetry + addccc %o2,%g5,%g0 + addccc %l3,%l7,%g4 + srlx %l3,32,%o3 + srlx %l7,32,%o7 + addccc %o3,%o7,%g0 + stx %g1,[%i0+0] + stx %g2,[%i0+8] + stx %g3,[%i0+16] + stx %g4,[%i0+24] + add %i0,32,%i0 + +L(there): + add %i3,4,%i3 + brz,pt %i3,L(end) + nop + +L(loop2): + ldx [%i1+0],%l0 + add %i1,8,%i1 + ldx [%i2+0],%l4 + add %i2,8,%i2 + srlx %l0,32,%g2 + srlx %l4,32,%g3 + addccc %l0,%l4,%g1 + addccc %g2,%g3,%g0 + stx %g1,[%i0+0] + add %i0,8,%i0 + add %i3,-1,%i3 + brgz,pt %i3,L(loop2) + nop + +L(end): addc %g0,%g0,%i0 + ret + restore +EPILOGUE(mpn_add_n) diff --git a/rts/gmp/mpn/sparc64/addmul1h.asm b/rts/gmp/mpn/sparc64/addmul1h.asm new file mode 100644 index 0000000000..96cb5f7369 --- /dev/null +++ b/rts/gmp/mpn/sparc64/addmul1h.asm @@ -0,0 +1,203 @@ +dnl SPARC 64-bit addmull/addmulu -- Helper for mpn_addmul_1 and mpn_mul_1. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. 
+ +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +ifdef(`LOWPART', +`addmull:', +`addmulu:') + save %sp,-256,%sp + + sethi %hi(0xffff0000),%o0 + andn %i3,%o0,%o0 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f6 + + srl %i3,16,%o0 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f8 + + mov 0,%g3 C cy = 0 + + ld [%i1+4],%f11 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end1) + add %i1,4,%i1 C s1_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-25] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,E(end2) + std %f12,[%fp-17] + + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end3) + std %f12,[%fp-33] + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + ld [%i0+DLO],%g5 + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-17],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + add %i0,4,%i0 C res_ptr++ + subcc %i2,1,%i2 + be,pn %icc,E(end4) + std %f12,[%fp-17] + + b,a E(loop) + nop C nop is cheap to nullify + + ALIGN(16) +C BEGIN LOOP +E(loop): + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0+DHI],%g5 + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + sub %i2,2,%i2 + add %i0,4,%i0 C res_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0+DLO],%g5 + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-17],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DHI] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + std %f12,[%fp-17] + brnz,pt %i2,E(loop) + add %i0,4,%i0 C res_ptr++ +C END LOOP +E(loope): +E(end4): + fxtod %f10,%f2 + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0+DHI],%g5 + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + add %i0,4,%i0 C res_ptr++ + + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0+DLO],%g5 + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DHI] + b,a E(yyy) + +E(end2): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + ld [%i0+DLO],%g5 + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 +E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy 
+ifdef(`LOWPART', +` ld [%i0+DHI],%g5') + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + ldx [%fp-33],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + +ifdef(`LOWPART', +` add %g5,%g1,%g1') C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy +ifdef(`LOWPART', +` st %g4,[%i0-4+DHI] + srlx %g4,32,%g4') + + ret + restore %g0,%g4,%o0 C sideeffect: put cy in retreg +ifdef(`LOWPART', +`EPILOGUE(addmull)', +`EPILOGUE(addmulu)') diff --git a/rts/gmp/mpn/sparc64/addmul_1.asm b/rts/gmp/mpn/sparc64/addmul_1.asm new file mode 100644 index 0000000000..c3f04cea6a --- /dev/null +++ b/rts/gmp/mpn/sparc64/addmul_1.asm @@ -0,0 +1,114 @@ +dnl SPARC 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and +dnl add the result to a second limb vector. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr i0 +C s1_ptr i1 +C size i2 +C s2_limb i3 + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch + +PROLOGUE(mpn_addmul_1) + save %sp,-256,%sp + +C We store 0.0 in f10 and keep it invariant accross thw two +C function calls below. Note that this is not ABI conformant, +C but since the functions are local, that's acceptable. +ifdef(`PIC', +`L(pc): rd %pc,%o7 + ld [%o7+L(noll)-L(pc)],%f10', +` sethi %hh(L(noll)),%g2 + sethi %lm(L(noll)),%g1 + or %g2,%hm(L(noll)),%g2 + or %g1,%lo(L(noll)),%g1 + sllx %g2,32,%g2 + ld [%g1+%g2],%f10') + + sub %i1,%i0,%g1 + srlx %g1,3,%g1 + cmp %g1,%i2 + bcc,pt %xcc,L(nooverlap) + nop + + sllx %i2,3,%g2 C compute stack allocation byte count + add %g2,15,%o0 + and %o0,-16,%o0 + sub %sp,%o0,%sp + add %sp,2223,%o0 + + mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp + call mpn_copyi + mov %i2,%o2 C copy n to mpn_copyi's count parameter + + add %sp,2223,%i1 + +L(nooverlap): +C First multiply-add with low 32 bits of s2_limb + mov %i0,%o0 + mov %i1,%o1 + add %i2,%i2,%o2 + call addmull + srl %i3,0,%o3 + + mov %o0,%l0 C keep carry-out from accmull + +C Now multiply-add with high 32 bits of s2_limb, unless it is zero. 
+ srlx %i3,32,%o3 + brz,a,pn %o3,L(small) + mov %o0,%i0 + mov %i1,%o1 + add %i2,%i2,%o2 + call addmulu + add %i0,4,%o0 + + add %l0,%o0,%i0 +L(small): + ret + restore %g0,%g0,%g0 +EPILOGUE(mpn_addmul_1) + +C Put a zero in the text segment to allow us to t the address +C quickly when compiling for PIC + TEXT + ALIGN(4) +L(noll): + .word 0 + +define(`LO',`(+4)') +define(`HI',`(-4)') + +define(`DLO',`(+4)') +define(`DHI',`(-4)') +define(`LOWPART') +define(`E',`L(l.$1)') +include_mpn(`sparc64/addmul1h.asm') + +define(`DLO',`(-4)') +define(`DHI',`(+4)') +undefine(`LOWPART') +define(`E',`L(u.$1)') +include_mpn(`sparc64/addmul1h.asm') diff --git a/rts/gmp/mpn/sparc64/copyi.asm b/rts/gmp/mpn/sparc64/copyi.asm new file mode 100644 index 0000000000..d9957e3c90 --- /dev/null +++ b/rts/gmp/mpn/sparc64/copyi.asm @@ -0,0 +1,79 @@ +! SPARC v9 __gmpn_copy -- Copy a limb vector. + +! Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! rptr %o0 +! sptr %o1 +! n %o2 + +include(`../config.m4') + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch +PROLOGUE(mpn_copyi) + add %o2,-8,%o2 + brlz,pn %o2,L(skip) + nop + b,a L(loop1) + nop + + ALIGN(16) +L(loop1): + ldx [%o1+0],%g1 + ldx [%o1+8],%g2 + ldx [%o1+16],%g3 + ldx [%o1+24],%g4 + ldx [%o1+32],%g5 + ldx [%o1+40],%o3 + ldx [%o1+48],%o4 + ldx [%o1+56],%o5 + add %o1,64,%o1 + stx %g1,[%o0+0] + stx %g2,[%o0+8] + stx %g3,[%o0+16] + stx %g4,[%o0+24] + stx %g5,[%o0+32] + stx %o3,[%o0+40] + stx %o4,[%o0+48] + stx %o5,[%o0+56] + add %o2,-8,%o2 + brgez,pt %o2,L(loop1) + add %o0,64,%o0 + +L(skip): + add %o2,8,%o2 + brz,pt %o2,L(end) + nop + +L(loop2): + ldx [%o1],%g1 + add %o1,8,%o1 + add %o2,-1,%o2 + stx %g1,[%o0] + add %o0,8,%o0 + brgz,pt %o2,L(loop2) + nop + +L(end): retl + nop +EPILOGUE(mpn_copyi) diff --git a/rts/gmp/mpn/sparc64/gmp-mparam.h b/rts/gmp/mpn/sparc64/gmp-mparam.h new file mode 100644 index 0000000000..74f61661c1 --- /dev/null +++ b/rts/gmp/mpn/sparc64/gmp-mparam.h @@ -0,0 +1,88 @@ +/* Sparc64 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. 
+ +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 64 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* Tell the toom3 multiply implementation to call low-level mpn + functions instead of open-coding operations in C. */ +#define USE_MORE_MPN 1 + + +/* Run on sun workshop cc. */ +/* Generated by tuneup.c, 2000-07-30. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 12 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 95 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 33 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 125 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 27 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 107 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 12 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 199 +#endif + +#ifndef FFT_MUL_TABLE +#define FFT_MUL_TABLE { 304, 608, 1344, 2304, 7168, 20480, 49152, 0 } +#endif +#ifndef FFT_MODF_MUL_THRESHOLD +#define FFT_MODF_MUL_THRESHOLD 320 +#endif +#ifndef FFT_MUL_THRESHOLD +#define FFT_MUL_THRESHOLD 1664 +#endif + +#ifndef FFT_SQR_TABLE +#define FFT_SQR_TABLE { 304, 608, 1344, 2816, 7168, 20480, 49152, 0 } +#endif +#ifndef FFT_MODF_SQR_THRESHOLD +#define FFT_MODF_SQR_THRESHOLD 320 +#endif +#ifndef FFT_SQR_THRESHOLD +#define FFT_SQR_THRESHOLD 1664 +#endif diff --git a/rts/gmp/mpn/sparc64/lshift.asm b/rts/gmp/mpn/sparc64/lshift.asm new file mode 100644 index 0000000000..2d2edc50a7 --- /dev/null +++ b/rts/gmp/mpn/sparc64/lshift.asm @@ -0,0 +1,97 @@ +! SPARC v9 __gmpn_lshift -- + +! Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr %o0 +! src_ptr %o1 +! size %o2 +! cnt %o3 + +include(`../config.m4') + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch +PROLOGUE(mpn_lshift) + sllx %o2,3,%g1 + add %o1,%g1,%o1 ! make %o1 point at end of src + ldx [%o1-8],%g2 ! load first limb + sub %g0,%o3,%o5 ! negate shift count + add %o0,%g1,%o0 ! make %o0 point at end of res + add %o2,-1,%o2 + and %o2,4-1,%g4 ! number of limbs in first loop + srlx %g2,%o5,%g1 ! compute function result + brz,pn %g4,L(0) ! if multiple of 4 limbs, skip first loop + mov %g1,%g5 + + sub %o2,%g4,%o2 ! 
adjust count for main loop + +L(loop0): + ldx [%o1-16],%g3 + add %o0,-8,%o0 + add %o1,-8,%o1 + add %g4,-1,%g4 + sllx %g2,%o3,%o4 + srlx %g3,%o5,%g1 + mov %g3,%g2 + or %o4,%g1,%o4 + brnz,pt %g4,L(loop0) + stx %o4,[%o0+0] + +L(0): brz,pn %o2,L(end) + nop + +L(loop1): + ldx [%o1-16],%g3 + add %o0,-32,%o0 + add %o2,-4,%o2 + sllx %g2,%o3,%o4 + srlx %g3,%o5,%g1 + + ldx [%o1-24],%g2 + sllx %g3,%o3,%g4 + or %o4,%g1,%o4 + stx %o4,[%o0+24] + srlx %g2,%o5,%g1 + + ldx [%o1-32],%g3 + sllx %g2,%o3,%o4 + or %g4,%g1,%g4 + stx %g4,[%o0+16] + srlx %g3,%o5,%g1 + + ldx [%o1-40],%g2 + sllx %g3,%o3,%g4 + or %o4,%g1,%o4 + stx %o4,[%o0+8] + srlx %g2,%o5,%g1 + + add %o1,-32,%o1 + or %g4,%g1,%g4 + brnz,pt %o2,L(loop1) + stx %g4,[%o0+0] + +L(end): sllx %g2,%o3,%g2 + stx %g2,[%o0-8] + retl + mov %g5,%o0 +EPILOGUE(mpn_lshift) diff --git a/rts/gmp/mpn/sparc64/mul_1.asm b/rts/gmp/mpn/sparc64/mul_1.asm new file mode 100644 index 0000000000..f2f2821d51 --- /dev/null +++ b/rts/gmp/mpn/sparc64/mul_1.asm @@ -0,0 +1,113 @@ +dnl SPARC 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and +dnl store the result to a second limb vector. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr i0 +C s1_ptr i1 +C size i2 +C s2_limb i3 + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch + +PROLOGUE(mpn_mul_1) + save %sp,-256,%sp + +C We store 0.0 in f10 and keep it invariant accross thw two +C function calls below. Note that this is not ABI conformant, +C but since the functions are local, that's acceptable. +ifdef(`PIC', +`L(pc): rd %pc,%o7 + ld [%o7+L(noll)-L(pc)],%f10', +` sethi %hh(L(noll)),%g2 + sethi %lm(L(noll)),%g1 + or %g2,%hm(L(noll)),%g2 + or %g1,%lo(L(noll)),%g1 + sllx %g2,32,%g2 + ld [%g1+%g2],%f10') + + sub %i1,%i0,%g1 + srlx %g1,3,%g1 + cmp %g1,%i2 + bcc,pt %xcc,L(nooverlap) + nop + + sllx %i2,3,%g2 C compute stack allocation byte count + add %g2,15,%o0 + and %o0,-16,%o0 + sub %sp,%o0,%sp + add %sp,2223,%o0 + + mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp + call mpn_copyi + mov %i2,%o2 C copy n to mpn_copyi's count parameter + + add %sp,2223,%i1 + +L(nooverlap): +C First multiply-add with low 32 bits of s2_limb + mov %i0,%o0 + mov %i1,%o1 + add %i2,%i2,%o2 + call mull + srl %i3,0,%o3 + + mov %o0,%l0 C keep carry-out from accmull + +C Now multiply-add with high 32 bits of s2_limb, unless it is zero. 
+ srlx %i3,32,%o3 + brz,a,pn %o3,L(small) + mov %o0,%i0 + mov %i1,%o1 + add %i2,%i2,%o2 + call addmulu + add %i0,4,%o0 + + add %l0,%o0,%i0 +L(small): + ret + restore %g0,%g0,%g0 +EPILOGUE(mpn_mul_1) + +C Put a zero in the text segment to allow us to t the address +C quickly when compiling for PIC + TEXT + ALIGN(4) +L(noll): + .word 0 + +define(`LO',`(+4)') +define(`HI',`(-4)') + +define(`DLO',`(+4)') +define(`DHI',`(-4)') +define(`E',`L($1)') +include_mpn(`sparc64/mul_1h.asm') + +define(`DLO',`(-4)') +define(`DHI',`(+4)') +undefine(`LOWPART') +define(`E',`L(u.$1)') +include_mpn(`sparc64/addmul1h.asm') diff --git a/rts/gmp/mpn/sparc64/mul_1h.asm b/rts/gmp/mpn/sparc64/mul_1h.asm new file mode 100644 index 0000000000..5078c01c3f --- /dev/null +++ b/rts/gmp/mpn/sparc64/mul_1h.asm @@ -0,0 +1,183 @@ +dnl SPARC 64-bit mull -- Helper for mpn_mul_1. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +mull: + save %sp,-256,%sp + + sethi %hi(0xffff0000),%o0 + andn %i3,%o0,%o0 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f6 + + srl %i3,16,%o0 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f8 + + mov 0,%g3 C cy = 0 + + ld [%i1+4],%f11 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end1) + add %i1,4,%i1 C s1_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-25] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,E(end2) + std %f12,[%fp-17] + + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end3) + std %f12,[%fp-33] + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-17],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + add %i0,4,%i0 C res_ptr++ + subcc %i2,1,%i2 + be,pn %icc,E(end4) + std %f12,[%fp-17] + + b,a E(loop) + nop C nop is cheap to nullify + + ALIGN(16) +C BEGIN LOOP +E(loop): + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + sub %i2,2,%i2 + add %i0,4,%i0 C res_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx 
[%fp-17],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DHI] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + std %f12,[%fp-17] + brnz,pt %i2,E(loop) + add %i0,4,%i0 C res_ptr++ +C END LOOP +E(loope): +E(end4): + fxtod %f10,%f2 + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DHI] + b,a E(yyy) + +E(end2): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 +E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + ldx [%fp-33],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + st %g4,[%i0-4+DHI] + srlx %g4,32,%g4 + + ret + restore %g0,%g4,%o0 C sideeffect: put cy in retreg +EPILOGUE(mull) diff --git a/rts/gmp/mpn/sparc64/rshift.asm b/rts/gmp/mpn/sparc64/rshift.asm new file mode 100644 index 0000000000..baf7920efb --- /dev/null +++ b/rts/gmp/mpn/sparc64/rshift.asm @@ -0,0 +1,94 @@ +! SPARC v9 __gmpn_rshift -- + +! Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr %o0 +! src_ptr %o1 +! size %o2 +! cnt %o3 + +include(`../config.m4') + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch +PROLOGUE(mpn_rshift) + ldx [%o1],%g2 ! load first limb + sub %g0,%o3,%o5 ! negate shift count + add %o2,-1,%o2 + and %o2,4-1,%g4 ! number of limbs in first loop + sllx %g2,%o5,%g1 ! compute function result + brz,pn %g4,L(0) ! if multiple of 4 limbs, skip first loop + mov %g1,%g5 + + sub %o2,%g4,%o2 ! 
adjust count for main loop + +L(loop0): + ldx [%o1+8],%g3 + add %o0,8,%o0 + add %o1,8,%o1 + add %g4,-1,%g4 + srlx %g2,%o3,%o4 + sllx %g3,%o5,%g1 + mov %g3,%g2 + or %o4,%g1,%o4 + brnz,pt %g4,L(loop0) + stx %o4,[%o0-8] + +L(0): brz,pn %o2,L(end) + nop + +L(loop1): + ldx [%o1+8],%g3 + add %o0,32,%o0 + add %o2,-4,%o2 + srlx %g2,%o3,%o4 + sllx %g3,%o5,%g1 + + ldx [%o1+16],%g2 + srlx %g3,%o3,%g4 + or %o4,%g1,%o4 + stx %o4,[%o0-32] + sllx %g2,%o5,%g1 + + ldx [%o1+24],%g3 + srlx %g2,%o3,%o4 + or %g4,%g1,%g4 + stx %g4,[%o0-24] + sllx %g3,%o5,%g1 + + ldx [%o1+32],%g2 + srlx %g3,%o3,%g4 + or %o4,%g1,%o4 + stx %o4,[%o0-16] + sllx %g2,%o5,%g1 + + add %o1,32,%o1 + or %g4,%g1,%g4 + brnz %o2,L(loop1) + stx %g4,[%o0-8] + +L(end): srlx %g2,%o3,%g2 + stx %g2,[%o0-0] + retl + mov %g5,%o0 +EPILOGUE(mpn_rshift) diff --git a/rts/gmp/mpn/sparc64/sub_n.asm b/rts/gmp/mpn/sparc64/sub_n.asm new file mode 100644 index 0000000000..61547138e0 --- /dev/null +++ b/rts/gmp/mpn/sparc64/sub_n.asm @@ -0,0 +1,172 @@ +! SPARC v9 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and +! store difference in a third limb vector. + +! Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr %o0 +! s1_ptr %o1 +! s2_ptr %o2 +! size %o3 + +include(`../config.m4') + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch +PROLOGUE(mpn_sub_n) + +! 12 mem ops >= 12 cycles +! 8 shift insn >= 8 cycles +! 8 addccc, executing alone, +8 cycles +! Unrolling not mandatory...perhaps 2-way is best? +! Put one ldx/stx and one s?lx per issue tuple, fill with pointer arith and loop ctl +! All in all, it runs at 5 cycles/limb + + save %sp,-160,%sp + + addcc %g0,%g0,%g0 + + add %i3,-4,%i3 + brlz,pn %i3,L(there) + nop + + ldx [%i1+0],%l0 + ldx [%i2+0],%l4 + ldx [%i1+8],%l1 + ldx [%i2+8],%l5 + ldx [%i1+16],%l2 + ldx [%i2+16],%l6 + ldx [%i1+24],%l3 + ldx [%i2+24],%l7 + add %i1,32,%i1 + add %i2,32,%i2 + + add %i3,-4,%i3 + brlz,pn %i3,L(skip) + nop + b L(loop1) ! jump instead of executing many NOPs + nop + ALIGN(32) +!--------- Start main loop --------- +L(loop1): + subccc %l0,%l4,%g1 +!- + srlx %l0,32,%o0 + ldx [%i1+0],%l0 +!- + srlx %l4,32,%o4 + ldx [%i2+0],%l4 +!- + subccc %o0,%o4,%g0 +!- + subccc %l1,%l5,%g2 +!- + srlx %l1,32,%o1 + ldx [%i1+8],%l1 +!- + srlx %l5,32,%o5 + ldx [%i2+8],%l5 +!- + subccc %o1,%o5,%g0 +!- + subccc %l2,%l6,%g3 +!- + srlx %l2,32,%o2 + ldx [%i1+16],%l2 +!- + srlx %l6,32,%g5 ! 
asymmetry + ldx [%i2+16],%l6 +!- + subccc %o2,%g5,%g0 +!- + subccc %l3,%l7,%g4 +!- + srlx %l3,32,%o3 + ldx [%i1+24],%l3 + add %i1,32,%i1 +!- + srlx %l7,32,%o7 + ldx [%i2+24],%l7 + add %i2,32,%i2 +!- + subccc %o3,%o7,%g0 +!- + stx %g1,[%i0+0] +!- + stx %g2,[%i0+8] +!- + stx %g3,[%i0+16] + add %i3,-4,%i3 +!- + stx %g4,[%i0+24] + add %i0,32,%i0 + + brgez,pt %i3,L(loop1) + nop +!--------- End main loop --------- +L(skip): + subccc %l0,%l4,%g1 + srlx %l0,32,%o0 + srlx %l4,32,%o4 + subccc %o0,%o4,%g0 + subccc %l1,%l5,%g2 + srlx %l1,32,%o1 + srlx %l5,32,%o5 + subccc %o1,%o5,%g0 + subccc %l2,%l6,%g3 + srlx %l2,32,%o2 + srlx %l6,32,%g5 ! asymmetry + subccc %o2,%g5,%g0 + subccc %l3,%l7,%g4 + srlx %l3,32,%o3 + srlx %l7,32,%o7 + subccc %o3,%o7,%g0 + stx %g1,[%i0+0] + stx %g2,[%i0+8] + stx %g3,[%i0+16] + stx %g4,[%i0+24] + add %i0,32,%i0 + +L(there): + add %i3,4,%i3 + brz,pt %i3,L(end) + nop + +L(loop2): + ldx [%i1+0],%l0 + add %i1,8,%i1 + ldx [%i2+0],%l4 + add %i2,8,%i2 + srlx %l0,32,%g2 + srlx %l4,32,%g3 + subccc %l0,%l4,%g1 + subccc %g2,%g3,%g0 + stx %g1,[%i0+0] + add %i0,8,%i0 + add %i3,-1,%i3 + brgz,pt %i3,L(loop2) + nop + +L(end): addc %g0,%g0,%i0 + ret + restore +EPILOGUE(mpn_sub_n) diff --git a/rts/gmp/mpn/sparc64/submul1h.asm b/rts/gmp/mpn/sparc64/submul1h.asm new file mode 100644 index 0000000000..7f51ba59c6 --- /dev/null +++ b/rts/gmp/mpn/sparc64/submul1h.asm @@ -0,0 +1,204 @@ +dnl SPARC 64-bit submull/submulu -- Helper for mpn_submul_1 and mpn_mul_1. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
+ +ifdef(`LOWPART', +`submull:', +`submulu:') + save %sp,-256,%sp + + sethi %hi(0xffff0000),%o0 + andn %i3,%o0,%o0 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f6 + + srl %i3,16,%o0 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f8 + + mov 0,%g3 C cy = 0 + + ld [%i1+4],%f11 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end1) + add %i1,4,%i1 C s1_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-25] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,E(end2) + std %f12,[%fp-17] + + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end3) + std %f12,[%fp-33] + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + ld [%i0+DLO],%g5 + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-17],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + add %i0,4,%i0 C res_ptr++ + subcc %i2,1,%i2 + be,pn %icc,E(end4) + std %f12,[%fp-17] + + b,a E(loop) + nop C nop is cheap to nullify + + ALIGN(16) +C BEGIN LOOP +E(loop): + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0+DHI],%g5 + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + sub %i2,2,%i2 + add %i0,4,%i0 C res_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0+DLO],%g5 + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-17],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4+DHI] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + std %f12,[%fp-17] + brnz,pt %i2,E(loop) + add %i0,4,%i0 C res_ptr++ +C END LOOP +E(loope): +E(end4): + fxtod %f10,%f2 + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0+DHI],%g5 + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0+DLO],%g5 + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4+DHI] + b,a E(yyy) + +E(end2): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + ld [%i0+DLO],%g5 + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 +E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) +ifdef(`LOWPART', +` ld [%i0+DHI],%g5') + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + ldx [%fp-33],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4+DLO] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C 
p += cy +ifdef(`LOWPART', +` subxcc %g5,%g4,%l2') C add *res_ptr to p0 (ADD2) +ifdef(`LOWPART', +` st %l2,[%i0-4+DHI] + srlx %g4,32,%g4') + + addx %g4,0,%g4 + ret + restore %g0,%g4,%o0 C sideeffect: put cy in retreg +ifdef(`LOWPART', +`EPILOGUE(submull)', +`EPILOGUE(submulu)') diff --git a/rts/gmp/mpn/sparc64/submul_1.asm b/rts/gmp/mpn/sparc64/submul_1.asm new file mode 100644 index 0000000000..7c6af0a98b --- /dev/null +++ b/rts/gmp/mpn/sparc64/submul_1.asm @@ -0,0 +1,114 @@ +dnl SPARC 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr i0 +C s1_ptr i1 +C size i2 +C s2_limb i3 + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch + +PROLOGUE(mpn_submul_1) + save %sp,-256,%sp + +C We store 0.0 in f10 and keep it invariant accross thw two +C function calls below. Note that this is not ABI conformant, +C but since the functions are local, that's acceptable. +ifdef(`PIC', +`L(pc): rd %pc,%o7 + ld [%o7+L(noll)-L(pc)],%f10', +` sethi %hh(L(noll)),%g2 + sethi %lm(L(noll)),%g1 + or %g2,%hm(L(noll)),%g2 + or %g1,%lo(L(noll)),%g1 + sllx %g2,32,%g2 + ld [%g1+%g2],%f10') + + sub %i1,%i0,%g1 + srlx %g1,3,%g1 + cmp %g1,%i2 + bcc,pt %xcc,L(nooverlap) + nop + + sllx %i2,3,%g2 C compute stack allocation byte count + add %g2,15,%o0 + and %o0,-16,%o0 + sub %sp,%o0,%sp + add %sp,2223,%o0 + + mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp + call mpn_copyi + mov %i2,%o2 C copy n to mpn_copyi's count parameter + + add %sp,2223,%i1 + +L(nooverlap): +C First multiply-add with low 32 bits of s2_limb + mov %i0,%o0 + mov %i1,%o1 + add %i2,%i2,%o2 + call submull + srl %i3,0,%o3 + + mov %o0,%l0 C keep carry-out from accmull + +C Now multiply-add with high 32 bits of s2_limb, unless it is zero. + srlx %i3,32,%o3 + brz,a,pn %o3,L(small) + mov %o0,%i0 + mov %i1,%o1 + add %i2,%i2,%o2 + call submulu + add %i0,4,%o0 + + add %l0,%o0,%i0 +L(small): + ret + restore %g0,%g0,%g0 +EPILOGUE(mpn_submul_1) + +C Put a zero in the text segment to allow us to t the address +C quickly when compiling for PIC + TEXT + ALIGN(4) +L(noll): + .word 0 + +define(`LO',`(+4)') +define(`HI',`(-4)') + +define(`DLO',`(+4)') +define(`DHI',`(-4)') +define(`LOWPART') +define(`E',`L(l.$1)') +include_mpn(`sparc64/submul1h.asm') + +define(`DLO',`(-4)') +define(`DHI',`(+4)') +undefine(`LOWPART') +define(`E',`L(u.$1)') +include_mpn(`sparc64/submul1h.asm') |