path: root/rts/gmp/mpn/sparc64
Diffstat (limited to 'rts/gmp/mpn/sparc64')

-rw-r--r--  rts/gmp/mpn/sparc64/README         48
-rw-r--r--  rts/gmp/mpn/sparc64/add_n.asm     172
-rw-r--r--  rts/gmp/mpn/sparc64/addmul1h.asm  203
-rw-r--r--  rts/gmp/mpn/sparc64/addmul_1.asm  114
-rw-r--r--  rts/gmp/mpn/sparc64/copyi.asm      79
-rw-r--r--  rts/gmp/mpn/sparc64/gmp-mparam.h   88
-rw-r--r--  rts/gmp/mpn/sparc64/lshift.asm     97
-rw-r--r--  rts/gmp/mpn/sparc64/mul_1.asm     113
-rw-r--r--  rts/gmp/mpn/sparc64/mul_1h.asm    183
-rw-r--r--  rts/gmp/mpn/sparc64/rshift.asm     94
-rw-r--r--  rts/gmp/mpn/sparc64/sub_n.asm     172
-rw-r--r--  rts/gmp/mpn/sparc64/submul1h.asm  204
-rw-r--r--  rts/gmp/mpn/sparc64/submul_1.asm  114

13 files changed, 1681 insertions(+), 0 deletions(-)
diff --git a/rts/gmp/mpn/sparc64/README b/rts/gmp/mpn/sparc64/README
new file mode 100644
index 0000000000..6923a133f3
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/README
@@ -0,0 +1,48 @@
+This directory contains mpn functions for 64-bit V9 SPARC
+
+RELEVANT OPTIMIZATION ISSUES
+
+The Ultra I/II pipeline executes up to two simple integer arithmetic operations
+per cycle. The 64-bit integer multiply instruction mulx takes from 5 cycles to
+35 cycles, depending on the position of the most significant bit of the 1st
+source operand. It cannot overlap with other instructions. For our use of
+mulx, it will take from 5 to 20 cycles.
+
+Integer conditional move instructions cannot dual-issue with other integer
+instructions. No conditional move can issue 1-5 cycles after a load. (Or
+something similarly bizarre.)
+
+Integer branches can issue with two integer arithmetic instructions. Likewise
+for integer loads. Four instructions may issue (arith, arith, ld/st, branch)
+but only if the branch is last.
+
+(The V9 architecture manual recommends that the 2nd operand of a multiply
+instruction be the smaller one. For UltraSPARC, they got things backwards and
+optimize for the wrong operand! Really helpful, given that multiply is
+incredibly slow on these CPUs!)
+
+STATUS
+
+There is new code in ~/prec/gmp-remote/sparc64. Not tested or completed, but
+the pipelines are worked out. Here are the timings:
+
+* lshift, rshift: The code is well-optimized and runs at 2.0 cycles/limb.
+
+* add_n, sub_n: add3.s currently runs at 6 cycles/limb. We use a bizarre
+ scheme of compares and branches (with some nops and fnops to align things)
+ and carefully stay away from the instructions intended for this application
+ (i.e., movcs and movcc).
+
+ Using movcc/movcs, even with deep unrolling, seems to get down to 7
+ cycles/limb.
+
+  The most promising approach is to split the operands into 32-bit pieces
+  using srlx, then use two addccc, and finally combine the results with
+  sllx+or (a rough C sketch of that per-limb step follows this file).  The
+  result could run at 5 cycles/limb, I think.  It might be possible to do
+  without unrolling, or with minimal unrolling.
+
+* addmul_1/submul_1: Should optimize for when scalar operand < 2^32.
+* addmul_1/submul_1: Since mulx is horrendously slow on UltraSPARC I/II,
+ Karatsuba's method should save up to 16 cycles (i.e. > 20%).
+* mul_1 (and possibly the other multiply functions): Handle carry in the
+ same tricky way as add_n,sub_n.
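
A rough C sketch of the 32-bit-split carry scheme proposed above (illustrative
only: the helper name and types are made up, and the assembly keeps the carry
in the condition codes rather than in a register):

    #include <stdint.h>

    /* One limb of the proposed add_n scheme: add two 64-bit limbs plus an
       incoming carry using only 32-bit pieces -- srlx to split, two addccc
       to add and propagate the carry, sllx+or to recombine.  */
    static uint64_t add_limb_split32(uint64_t a, uint64_t b, unsigned cy_in,
                                     unsigned *cy_out)
    {
        uint64_t lo = (a & 0xffffffff) + (b & 0xffffffff) + cy_in;  /* low addccc  */
        uint64_t hi = (a >> 32) + (b >> 32) + (lo >> 32);           /* high addccc */
        *cy_out = (unsigned) (hi >> 32);                            /* carry out   */
        return (hi << 32) | (lo & 0xffffffff);                      /* sllx + or   */
    }
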
diff --git a/rts/gmp/mpn/sparc64/add_n.asm b/rts/gmp/mpn/sparc64/add_n.asm
new file mode 100644
index 0000000000..72b3895a5b
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/add_n.asm
@@ -0,0 +1,172 @@
+! SPARC v9 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+! sum in a third limb vector.
+
+! Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr %o0
+! s1_ptr %o1
+! s2_ptr %o2
+! size %o3
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_add_n)
+
+! 12 mem ops >= 12 cycles
+! 8 shift insn >= 8 cycles
+! 8 addccc, executing alone, +8 cycles
+! Unrolling not mandatory...perhaps 2-way is best?
+! Put one ldx/stx and one s?lx per issue tuple, fill with pointer arith and loop ctl
+! All in all, it runs at 5 cycles/limb
+
+ save %sp,-160,%sp
+
+ addcc %g0,%g0,%g0
+
+ add %i3,-4,%i3
+ brlz,pn %i3,L(there)
+ nop
+
+ ldx [%i1+0],%l0
+ ldx [%i2+0],%l4
+ ldx [%i1+8],%l1
+ ldx [%i2+8],%l5
+ ldx [%i1+16],%l2
+ ldx [%i2+16],%l6
+ ldx [%i1+24],%l3
+ ldx [%i2+24],%l7
+ add %i1,32,%i1
+ add %i2,32,%i2
+
+ add %i3,-4,%i3
+ brlz,pn %i3,L(skip)
+ nop
+ b L(loop1) ! jump instead of executing many NOPs
+ nop
+ ALIGN(32)
+!--------- Start main loop ---------
+L(loop1):
+ addccc %l0,%l4,%g1
+!-
+ srlx %l0,32,%o0
+ ldx [%i1+0],%l0
+!-
+ srlx %l4,32,%o4
+ ldx [%i2+0],%l4
+!-
+ addccc %o0,%o4,%g0
+!-
+ addccc %l1,%l5,%g2
+!-
+ srlx %l1,32,%o1
+ ldx [%i1+8],%l1
+!-
+ srlx %l5,32,%o5
+ ldx [%i2+8],%l5
+!-
+ addccc %o1,%o5,%g0
+!-
+ addccc %l2,%l6,%g3
+!-
+ srlx %l2,32,%o2
+ ldx [%i1+16],%l2
+!-
+ srlx %l6,32,%g5 ! asymmetry
+ ldx [%i2+16],%l6
+!-
+ addccc %o2,%g5,%g0
+!-
+ addccc %l3,%l7,%g4
+!-
+ srlx %l3,32,%o3
+ ldx [%i1+24],%l3
+ add %i1,32,%i1
+!-
+ srlx %l7,32,%o7
+ ldx [%i2+24],%l7
+ add %i2,32,%i2
+!-
+ addccc %o3,%o7,%g0
+!-
+ stx %g1,[%i0+0]
+!-
+ stx %g2,[%i0+8]
+!-
+ stx %g3,[%i0+16]
+ add %i3,-4,%i3
+!-
+ stx %g4,[%i0+24]
+ add %i0,32,%i0
+
+ brgez,pt %i3,L(loop1)
+ nop
+!--------- End main loop ---------
+L(skip):
+ addccc %l0,%l4,%g1
+ srlx %l0,32,%o0
+ srlx %l4,32,%o4
+ addccc %o0,%o4,%g0
+ addccc %l1,%l5,%g2
+ srlx %l1,32,%o1
+ srlx %l5,32,%o5
+ addccc %o1,%o5,%g0
+ addccc %l2,%l6,%g3
+ srlx %l2,32,%o2
+ srlx %l6,32,%g5 ! asymmetry
+ addccc %o2,%g5,%g0
+ addccc %l3,%l7,%g4
+ srlx %l3,32,%o3
+ srlx %l7,32,%o7
+ addccc %o3,%o7,%g0
+ stx %g1,[%i0+0]
+ stx %g2,[%i0+8]
+ stx %g3,[%i0+16]
+ stx %g4,[%i0+24]
+ add %i0,32,%i0
+
+L(there):
+ add %i3,4,%i3
+ brz,pt %i3,L(end)
+ nop
+
+L(loop2):
+ ldx [%i1+0],%l0
+ add %i1,8,%i1
+ ldx [%i2+0],%l4
+ add %i2,8,%i2
+ srlx %l0,32,%g2
+ srlx %l4,32,%g3
+ addccc %l0,%l4,%g1
+ addccc %g2,%g3,%g0
+ stx %g1,[%i0+0]
+ add %i0,8,%i0
+ add %i3,-1,%i3
+ brgz,pt %i3,L(loop2)
+ nop
+
+L(end): addc %g0,%g0,%i0
+ ret
+ restore
+EPILOGUE(mpn_add_n)
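
For reference, the contract implemented by mpn_add_n above, written as plain C
(a sketch of the semantics only, not GMP's generic code):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    /* res[i] = s1[i] + s2[i] with carry propagation; returns the final
       carry (0 or 1), which the assembly materializes with addc at L(end). */
    static mp_limb_t ref_add_n(mp_limb_t *res, const mp_limb_t *s1,
                               const mp_limb_t *s2, size_t n)
    {
        mp_limb_t cy = 0;
        for (size_t i = 0; i < n; i++) {
            mp_limb_t a = s1[i], b = s2[i];
            mp_limb_t s = a + b + cy;
            cy = (s < a) || (cy && s == a);    /* did the 64-bit add wrap? */
            res[i] = s;
        }
        return cy;
    }
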
diff --git a/rts/gmp/mpn/sparc64/addmul1h.asm b/rts/gmp/mpn/sparc64/addmul1h.asm
new file mode 100644
index 0000000000..96cb5f7369
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/addmul1h.asm
@@ -0,0 +1,203 @@
+dnl SPARC 64-bit addmull/addmulu -- Helper for mpn_addmul_1 and mpn_mul_1.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+ifdef(`LOWPART',
+`addmull:',
+`addmulu:')
+ save %sp,-256,%sp
+
+ sethi %hi(0xffff0000),%o0
+ andn %i3,%o0,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f6
+
+ srl %i3,16,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f8
+
+ mov 0,%g3 C cy = 0
+
+ ld [%i1+4],%f11
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end1)
+ add %i1,4,%i1 C s1_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,E(end2)
+ std %f12,[%fp-17]
+
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end3)
+ std %f12,[%fp-33]
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ ld [%i0+DLO],%g5
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ add %i0,4,%i0 C res_ptr++
+ subcc %i2,1,%i2
+ be,pn %icc,E(end4)
+ std %f12,[%fp-17]
+
+ b,a E(loop)
+ nop C nop is cheap to nullify
+
+ ALIGN(16)
+C BEGIN LOOP
+E(loop):
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0+DHI],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ sub %i2,2,%i2
+ add %i0,4,%i0 C res_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0+DLO],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DHI]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ std %f12,[%fp-17]
+ brnz,pt %i2,E(loop)
+ add %i0,4,%i0 C res_ptr++
+C END LOOP
+E(loope):
+E(end4):
+ fxtod %f10,%f2
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0+DHI],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0+DLO],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DHI]
+ b,a E(yyy)
+
+E(end2):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ ld [%i0+DLO],%g5
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ifdef(`LOWPART',
+` ld [%i0+DHI],%g5')
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ ldx [%fp-33],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ifdef(`LOWPART',
+` add %g5,%g1,%g1') C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ifdef(`LOWPART',
+` st %g4,[%i0-4+DHI]
+ srlx %g4,32,%g4')
+
+ ret
+ restore %g0,%g4,%o0 C sideeffect: put cy in retreg
+ifdef(`LOWPART',
+`EPILOGUE(addmull)',
+`EPILOGUE(addmulu)')
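
The helper above never uses mulx: each 32-bit piece of s1 is converted to
double with fxtod, multiplied by the low and high 16-bit halves of the 32-bit
multiplier with fmuld, converted back with fdtox, and the two pieces are
recombined as p0 + (p16 << 16).  Every such product fits in 48 bits, so it is
exact in a 53-bit double mantissa.  A C model of one such partial product
(illustrative; the real code pipelines the conversions through the stack
slots at %fp-17 .. %fp-41):

    #include <stdint.h>

    /* 32x32 -> 64 bit multiply done with two exact double-precision
       multiplies of 16-bit halves, the way addmull/addmulu build each
       partial product.  */
    static uint64_t mul32_via_double(uint32_t s1_piece, uint32_t v)
    {
        double d   = (double) s1_piece;        /* fxtod                */
        double vlo = (double) (v & 0xffff);    /* low half of v  (f6)  */
        double vhi = (double) (v >> 16);       /* high half of v (f8)  */
        uint64_t p0  = (uint64_t) (d * vlo);   /* fmuld + fdtox        */
        uint64_t p16 = (uint64_t) (d * vhi);   /* fmuld + fdtox        */
        return p0 + (p16 << 16);               /* sllx 16, add (ADD1)  */
    }
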
diff --git a/rts/gmp/mpn/sparc64/addmul_1.asm b/rts/gmp/mpn/sparc64/addmul_1.asm
new file mode 100644
index 0000000000..c3f04cea6a
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/addmul_1.asm
@@ -0,0 +1,114 @@
+dnl SPARC 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and
+dnl add the result to a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+
+PROLOGUE(mpn_addmul_1)
+ save %sp,-256,%sp
+
+C We store 0.0 in f10 and keep it invariant across the two
+C function calls below. Note that this is not ABI conformant,
+C but since the functions are local, that's acceptable.
+ifdef(`PIC',
+`L(pc): rd %pc,%o7
+ ld [%o7+L(noll)-L(pc)],%f10',
+` sethi %hh(L(noll)),%g2
+ sethi %lm(L(noll)),%g1
+ or %g2,%hm(L(noll)),%g2
+ or %g1,%lo(L(noll)),%g1
+ sllx %g2,32,%g2
+ ld [%g1+%g2],%f10')
+
+ sub %i1,%i0,%g1
+ srlx %g1,3,%g1
+ cmp %g1,%i2
+ bcc,pt %xcc,L(nooverlap)
+ nop
+
+ sllx %i2,3,%g2 C compute stack allocation byte count
+ add %g2,15,%o0
+ and %o0,-16,%o0
+ sub %sp,%o0,%sp
+ add %sp,2223,%o0
+
+ mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp
+ call mpn_copyi
+ mov %i2,%o2 C copy n to mpn_copyi's count parameter
+
+ add %sp,2223,%i1
+
+L(nooverlap):
+C First multiply-add with low 32 bits of s2_limb
+ mov %i0,%o0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call addmull
+ srl %i3,0,%o3
+
+ mov %o0,%l0 C keep carry-out from accmull
+
+C Now multiply-add with high 32 bits of s2_limb, unless it is zero.
+ srlx %i3,32,%o3
+ brz,a,pn %o3,L(small)
+ mov %o0,%i0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call addmulu
+ add %i0,4,%o0
+
+ add %l0,%o0,%i0
+L(small):
+ ret
+ restore %g0,%g0,%g0
+EPILOGUE(mpn_addmul_1)
+
+C Put a zero in the text segment to allow us to get the address
+C quickly when compiling for PIC
+ TEXT
+ ALIGN(4)
+L(noll):
+ .word 0
+
+define(`LO',`(+4)')
+define(`HI',`(-4)')
+
+define(`DLO',`(+4)')
+define(`DHI',`(-4)')
+define(`LOWPART')
+define(`E',`L(l.$1)')
+include_mpn(`sparc64/addmul1h.asm')
+
+define(`DLO',`(-4)')
+define(`DHI',`(+4)')
+undefine(`LOWPART')
+define(`E',`L(u.$1)')
+include_mpn(`sparc64/addmul1h.asm')
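
The wrapper above runs the helper twice: once with the low 32 bits of s2_limb
over the whole vector (addmull), and once with the high 32 bits with the
destination advanced by one 32-bit word (addmulu), so the second pass lands
shifted left by 32 bits; the two carry-outs are then combined.  The overall
contract, as a plain C sketch (using the compiler's unsigned __int128
extension, which the assembly of course avoids):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    /* res[0..n-1] += s1[0..n-1] * v; returns the carry limb that would be
       added into res[n].  */
    static mp_limb_t ref_addmul_1(mp_limb_t *res, const mp_limb_t *s1,
                                  size_t n, mp_limb_t v)
    {
        mp_limb_t cy = 0;
        for (size_t i = 0; i < n; i++) {
            unsigned __int128 t = (unsigned __int128) s1[i] * v + res[i] + cy;
            res[i] = (mp_limb_t) t;
            cy = (mp_limb_t) (t >> 64);
        }
        return cy;
    }
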
diff --git a/rts/gmp/mpn/sparc64/copyi.asm b/rts/gmp/mpn/sparc64/copyi.asm
new file mode 100644
index 0000000000..d9957e3c90
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/copyi.asm
@@ -0,0 +1,79 @@
+! SPARC v9 __gmpn_copy -- Copy a limb vector.
+
+! Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! rptr %o0
+! sptr %o1
+! n %o2
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_copyi)
+ add %o2,-8,%o2
+ brlz,pn %o2,L(skip)
+ nop
+ b,a L(loop1)
+ nop
+
+ ALIGN(16)
+L(loop1):
+ ldx [%o1+0],%g1
+ ldx [%o1+8],%g2
+ ldx [%o1+16],%g3
+ ldx [%o1+24],%g4
+ ldx [%o1+32],%g5
+ ldx [%o1+40],%o3
+ ldx [%o1+48],%o4
+ ldx [%o1+56],%o5
+ add %o1,64,%o1
+ stx %g1,[%o0+0]
+ stx %g2,[%o0+8]
+ stx %g3,[%o0+16]
+ stx %g4,[%o0+24]
+ stx %g5,[%o0+32]
+ stx %o3,[%o0+40]
+ stx %o4,[%o0+48]
+ stx %o5,[%o0+56]
+ add %o2,-8,%o2
+ brgez,pt %o2,L(loop1)
+ add %o0,64,%o0
+
+L(skip):
+ add %o2,8,%o2
+ brz,pt %o2,L(end)
+ nop
+
+L(loop2):
+ ldx [%o1],%g1
+ add %o1,8,%o1
+ add %o2,-1,%o2
+ stx %g1,[%o0]
+ add %o0,8,%o0
+ brgz,pt %o2,L(loop2)
+ nop
+
+L(end): retl
+ nop
+EPILOGUE(mpn_copyi)
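
mpn_copyi copies n limbs in increasing address order; the assembly moves
eight limbs per iteration and finishes with a one-limb cleanup loop.  The
same structure as a C sketch:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    static void ref_copyi(mp_limb_t *rp, const mp_limb_t *sp, size_t n)
    {
        size_t i = 0;
        for (; i + 8 <= n; i += 8)            /* L(loop1): 8 limbs per pass    */
            for (size_t j = 0; j < 8; j++)
                rp[i + j] = sp[i + j];
        for (; i < n; i++)                    /* L(loop2): 0..7 leftover limbs */
            rp[i] = sp[i];
    }
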
diff --git a/rts/gmp/mpn/sparc64/gmp-mparam.h b/rts/gmp/mpn/sparc64/gmp-mparam.h
new file mode 100644
index 0000000000..74f61661c1
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/gmp-mparam.h
@@ -0,0 +1,88 @@
+/* Sparc64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* Tell the toom3 multiply implementation to call low-level mpn
+ functions instead of open-coding operations in C. */
+#define USE_MORE_MPN 1
+
+
+/* Run on Sun Workshop cc. */
+/* Generated by tuneup.c, 2000-07-30. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 12
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 95
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 33
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 125
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 27
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 107
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 12
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 199
+#endif
+
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE { 304, 608, 1344, 2304, 7168, 20480, 49152, 0 }
+#endif
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD 320
+#endif
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD 1664
+#endif
+
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE { 304, 608, 1344, 2816, 7168, 20480, 49152, 0 }
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD 320
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD 1664
+#endif
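
The *_THRESHOLD values above are operand sizes, in limbs, at which the tuned
code switches algorithms.  A schematic of how such a threshold is consumed
(the enum and function are made up for illustration; this is not GMP's
internal dispatch code):

    #define KARATSUBA_MUL_THRESHOLD 12     /* values from this header */
    #define TOOM3_MUL_THRESHOLD     95

    typedef enum { MUL_BASECASE, MUL_KARATSUBA, MUL_TOOM3 } mul_algo_t;

    /* Pick a multiplication algorithm from the operand size n (in limbs):
       schoolbook below the Karatsuba threshold, Karatsuba below the toom3
       threshold, toom3 beyond that (the FFT thresholds are left out).  */
    static mul_algo_t choose_mul_algo(unsigned long n)
    {
        if (n < KARATSUBA_MUL_THRESHOLD)
            return MUL_BASECASE;
        if (n < TOOM3_MUL_THRESHOLD)
            return MUL_KARATSUBA;
        return MUL_TOOM3;
    }
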
diff --git a/rts/gmp/mpn/sparc64/lshift.asm b/rts/gmp/mpn/sparc64/lshift.asm
new file mode 100644
index 0000000000..2d2edc50a7
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/lshift.asm
@@ -0,0 +1,97 @@
+! SPARC v9 __gmpn_lshift -- Shift a limb vector left by a given bit count.
+
+! Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr %o0
+! src_ptr %o1
+! size %o2
+! cnt %o3
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_lshift)
+ sllx %o2,3,%g1
+ add %o1,%g1,%o1 ! make %o1 point at end of src
+ ldx [%o1-8],%g2 ! load first limb
+ sub %g0,%o3,%o5 ! negate shift count
+ add %o0,%g1,%o0 ! make %o0 point at end of res
+ add %o2,-1,%o2
+ and %o2,4-1,%g4 ! number of limbs in first loop
+ srlx %g2,%o5,%g1 ! compute function result
+ brz,pn %g4,L(0) ! if multiple of 4 limbs, skip first loop
+ mov %g1,%g5
+
+ sub %o2,%g4,%o2 ! adjust count for main loop
+
+L(loop0):
+ ldx [%o1-16],%g3
+ add %o0,-8,%o0
+ add %o1,-8,%o1
+ add %g4,-1,%g4
+ sllx %g2,%o3,%o4
+ srlx %g3,%o5,%g1
+ mov %g3,%g2
+ or %o4,%g1,%o4
+ brnz,pt %g4,L(loop0)
+ stx %o4,[%o0+0]
+
+L(0): brz,pn %o2,L(end)
+ nop
+
+L(loop1):
+ ldx [%o1-16],%g3
+ add %o0,-32,%o0
+ add %o2,-4,%o2
+ sllx %g2,%o3,%o4
+ srlx %g3,%o5,%g1
+
+ ldx [%o1-24],%g2
+ sllx %g3,%o3,%g4
+ or %o4,%g1,%o4
+ stx %o4,[%o0+24]
+ srlx %g2,%o5,%g1
+
+ ldx [%o1-32],%g3
+ sllx %g2,%o3,%o4
+ or %g4,%g1,%g4
+ stx %g4,[%o0+16]
+ srlx %g3,%o5,%g1
+
+ ldx [%o1-40],%g2
+ sllx %g3,%o3,%g4
+ or %o4,%g1,%o4
+ stx %o4,[%o0+8]
+ srlx %g2,%o5,%g1
+
+ add %o1,-32,%o1
+ or %g4,%g1,%g4
+ brnz,pt %o2,L(loop1)
+ stx %g4,[%o0+0]
+
+L(end): sllx %g2,%o3,%g2
+ stx %g2,[%o0-8]
+ retl
+ mov %g5,%o0
+EPILOGUE(mpn_lshift)
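
What the routine above computes, as a C sketch of the contract: shift an
n-limb operand left by cnt bits (1 <= cnt <= 63) and return the bits shifted
out of the top.  Scanning from the most significant limb downward matches the
assembly and keeps an in-place shift (rp == sp) valid.

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    static mp_limb_t ref_lshift(mp_limb_t *rp, const mp_limb_t *sp,
                                size_t n, unsigned cnt)
    {
        unsigned tnc = 64 - cnt;
        mp_limb_t high = sp[n - 1];
        mp_limb_t retval = high >> tnc;            /* function result (%g5) */
        for (size_t i = n - 1; i > 0; i--) {
            mp_limb_t low = sp[i - 1];
            rp[i] = (high << cnt) | (low >> tnc);
            high = low;
        }
        rp[0] = high << cnt;
        return retval;
    }
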
diff --git a/rts/gmp/mpn/sparc64/mul_1.asm b/rts/gmp/mpn/sparc64/mul_1.asm
new file mode 100644
index 0000000000..f2f2821d51
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/mul_1.asm
@@ -0,0 +1,113 @@
+dnl SPARC 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and
+dnl store the result to a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+
+PROLOGUE(mpn_mul_1)
+ save %sp,-256,%sp
+
+C We store 0.0 in f10 and keep it invariant across the two
+C function calls below. Note that this is not ABI conformant,
+C but since the functions are local, that's acceptable.
+ifdef(`PIC',
+`L(pc): rd %pc,%o7
+ ld [%o7+L(noll)-L(pc)],%f10',
+` sethi %hh(L(noll)),%g2
+ sethi %lm(L(noll)),%g1
+ or %g2,%hm(L(noll)),%g2
+ or %g1,%lo(L(noll)),%g1
+ sllx %g2,32,%g2
+ ld [%g1+%g2],%f10')
+
+ sub %i1,%i0,%g1
+ srlx %g1,3,%g1
+ cmp %g1,%i2
+ bcc,pt %xcc,L(nooverlap)
+ nop
+
+ sllx %i2,3,%g2 C compute stack allocation byte count
+ add %g2,15,%o0
+ and %o0,-16,%o0
+ sub %sp,%o0,%sp
+ add %sp,2223,%o0
+
+ mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp
+ call mpn_copyi
+ mov %i2,%o2 C copy n to mpn_copyi's count parameter
+
+ add %sp,2223,%i1
+
+L(nooverlap):
+C First multiply-add with low 32 bits of s2_limb
+ mov %i0,%o0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call mull
+ srl %i3,0,%o3
+
+ mov %o0,%l0 C keep carry-out from accmull
+
+C Now multiply-add with high 32 bits of s2_limb, unless it is zero.
+ srlx %i3,32,%o3
+ brz,a,pn %o3,L(small)
+ mov %o0,%i0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call addmulu
+ add %i0,4,%o0
+
+ add %l0,%o0,%i0
+L(small):
+ ret
+ restore %g0,%g0,%g0
+EPILOGUE(mpn_mul_1)
+
+C Put a zero in the text segment to allow us to get the address
+C quickly when compiling for PIC
+ TEXT
+ ALIGN(4)
+L(noll):
+ .word 0
+
+define(`LO',`(+4)')
+define(`HI',`(-4)')
+
+define(`DLO',`(+4)')
+define(`DHI',`(-4)')
+define(`E',`L($1)')
+include_mpn(`sparc64/mul_1h.asm')
+
+define(`DLO',`(-4)')
+define(`DHI',`(+4)')
+undefine(`LOWPART')
+define(`E',`L(u.$1)')
+include_mpn(`sparc64/addmul1h.asm')
diff --git a/rts/gmp/mpn/sparc64/mul_1h.asm b/rts/gmp/mpn/sparc64/mul_1h.asm
new file mode 100644
index 0000000000..5078c01c3f
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/mul_1h.asm
@@ -0,0 +1,183 @@
+dnl SPARC 64-bit mull -- Helper for mpn_mul_1.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+mull:
+ save %sp,-256,%sp
+
+ sethi %hi(0xffff0000),%o0
+ andn %i3,%o0,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f6
+
+ srl %i3,16,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f8
+
+ mov 0,%g3 C cy = 0
+
+ ld [%i1+4],%f11
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end1)
+ add %i1,4,%i1 C s1_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,E(end2)
+ std %f12,[%fp-17]
+
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end3)
+ std %f12,[%fp-33]
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ add %i0,4,%i0 C res_ptr++
+ subcc %i2,1,%i2
+ be,pn %icc,E(end4)
+ std %f12,[%fp-17]
+
+ b,a E(loop)
+ nop C nop is cheap to nullify
+
+ ALIGN(16)
+C BEGIN LOOP
+E(loop):
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ sub %i2,2,%i2
+ add %i0,4,%i0 C res_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DHI]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ std %f12,[%fp-17]
+ brnz,pt %i2,E(loop)
+ add %i0,4,%i0 C res_ptr++
+C END LOOP
+E(loope):
+E(end4):
+ fxtod %f10,%f2
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DHI]
+ b,a E(yyy)
+
+E(end2):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ ldx [%fp-33],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ st %g4,[%i0-4+DHI]
+ srlx %g4,32,%g4
+
+ ret
+ restore %g0,%g4,%o0 C sideeffect: put cy in retreg
+EPILOGUE(mull)
diff --git a/rts/gmp/mpn/sparc64/rshift.asm b/rts/gmp/mpn/sparc64/rshift.asm
new file mode 100644
index 0000000000..baf7920efb
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/rshift.asm
@@ -0,0 +1,94 @@
+! SPARC v9 __gmpn_rshift -- Shift a limb vector right by a given bit count.
+
+! Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr %o0
+! src_ptr %o1
+! size %o2
+! cnt %o3
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_rshift)
+ ldx [%o1],%g2 ! load first limb
+ sub %g0,%o3,%o5 ! negate shift count
+ add %o2,-1,%o2
+ and %o2,4-1,%g4 ! number of limbs in first loop
+ sllx %g2,%o5,%g1 ! compute function result
+ brz,pn %g4,L(0) ! if multiple of 4 limbs, skip first loop
+ mov %g1,%g5
+
+ sub %o2,%g4,%o2 ! adjust count for main loop
+
+L(loop0):
+ ldx [%o1+8],%g3
+ add %o0,8,%o0
+ add %o1,8,%o1
+ add %g4,-1,%g4
+ srlx %g2,%o3,%o4
+ sllx %g3,%o5,%g1
+ mov %g3,%g2
+ or %o4,%g1,%o4
+ brnz,pt %g4,L(loop0)
+ stx %o4,[%o0-8]
+
+L(0): brz,pn %o2,L(end)
+ nop
+
+L(loop1):
+ ldx [%o1+8],%g3
+ add %o0,32,%o0
+ add %o2,-4,%o2
+ srlx %g2,%o3,%o4
+ sllx %g3,%o5,%g1
+
+ ldx [%o1+16],%g2
+ srlx %g3,%o3,%g4
+ or %o4,%g1,%o4
+ stx %o4,[%o0-32]
+ sllx %g2,%o5,%g1
+
+ ldx [%o1+24],%g3
+ srlx %g2,%o3,%o4
+ or %g4,%g1,%g4
+ stx %g4,[%o0-24]
+ sllx %g3,%o5,%g1
+
+ ldx [%o1+32],%g2
+ srlx %g3,%o3,%g4
+ or %o4,%g1,%o4
+ stx %o4,[%o0-16]
+ sllx %g2,%o5,%g1
+
+ add %o1,32,%o1
+ or %g4,%g1,%g4
+ brnz %o2,L(loop1)
+ stx %g4,[%o0-8]
+
+L(end): srlx %g2,%o3,%g2
+ stx %g2,[%o0-0]
+ retl
+ mov %g5,%o0
+EPILOGUE(mpn_rshift)
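
mpn_rshift is the mirror image: it scans from the least significant limb
upward (so an in-place rp == sp shift stays valid) and returns the cnt bits
shifted out of the bottom limb, left-justified in a limb.  As a C sketch:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    static mp_limb_t ref_rshift(mp_limb_t *rp, const mp_limb_t *sp,
                                size_t n, unsigned cnt)
    {
        unsigned tnc = 64 - cnt;
        mp_limb_t retval = sp[0] << tnc;           /* bits shifted out (%g5) */
        for (size_t i = 0; i + 1 < n; i++)
            rp[i] = (sp[i] >> cnt) | (sp[i + 1] << tnc);
        rp[n - 1] = sp[n - 1] >> cnt;
        return retval;
    }
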
diff --git a/rts/gmp/mpn/sparc64/sub_n.asm b/rts/gmp/mpn/sparc64/sub_n.asm
new file mode 100644
index 0000000000..61547138e0
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/sub_n.asm
@@ -0,0 +1,172 @@
+! SPARC v9 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+! store difference in a third limb vector.
+
+! Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr %o0
+! s1_ptr %o1
+! s2_ptr %o2
+! size %o3
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_sub_n)
+
+! 12 mem ops >= 12 cycles
+! 8 shift insn >= 8 cycles
+! 8 addccc, executing alone, +8 cycles
+! Unrolling not mandatory...perhaps 2-way is best?
+! Put one ldx/stx and one s?lx per issue tuple, fill with pointer arith and loop ctl
+! All in all, it runs at 5 cycles/limb
+
+ save %sp,-160,%sp
+
+ addcc %g0,%g0,%g0
+
+ add %i3,-4,%i3
+ brlz,pn %i3,L(there)
+ nop
+
+ ldx [%i1+0],%l0
+ ldx [%i2+0],%l4
+ ldx [%i1+8],%l1
+ ldx [%i2+8],%l5
+ ldx [%i1+16],%l2
+ ldx [%i2+16],%l6
+ ldx [%i1+24],%l3
+ ldx [%i2+24],%l7
+ add %i1,32,%i1
+ add %i2,32,%i2
+
+ add %i3,-4,%i3
+ brlz,pn %i3,L(skip)
+ nop
+ b L(loop1) ! jump instead of executing many NOPs
+ nop
+ ALIGN(32)
+!--------- Start main loop ---------
+L(loop1):
+ subccc %l0,%l4,%g1
+!-
+ srlx %l0,32,%o0
+ ldx [%i1+0],%l0
+!-
+ srlx %l4,32,%o4
+ ldx [%i2+0],%l4
+!-
+ subccc %o0,%o4,%g0
+!-
+ subccc %l1,%l5,%g2
+!-
+ srlx %l1,32,%o1
+ ldx [%i1+8],%l1
+!-
+ srlx %l5,32,%o5
+ ldx [%i2+8],%l5
+!-
+ subccc %o1,%o5,%g0
+!-
+ subccc %l2,%l6,%g3
+!-
+ srlx %l2,32,%o2
+ ldx [%i1+16],%l2
+!-
+ srlx %l6,32,%g5 ! asymmetry
+ ldx [%i2+16],%l6
+!-
+ subccc %o2,%g5,%g0
+!-
+ subccc %l3,%l7,%g4
+!-
+ srlx %l3,32,%o3
+ ldx [%i1+24],%l3
+ add %i1,32,%i1
+!-
+ srlx %l7,32,%o7
+ ldx [%i2+24],%l7
+ add %i2,32,%i2
+!-
+ subccc %o3,%o7,%g0
+!-
+ stx %g1,[%i0+0]
+!-
+ stx %g2,[%i0+8]
+!-
+ stx %g3,[%i0+16]
+ add %i3,-4,%i3
+!-
+ stx %g4,[%i0+24]
+ add %i0,32,%i0
+
+ brgez,pt %i3,L(loop1)
+ nop
+!--------- End main loop ---------
+L(skip):
+ subccc %l0,%l4,%g1
+ srlx %l0,32,%o0
+ srlx %l4,32,%o4
+ subccc %o0,%o4,%g0
+ subccc %l1,%l5,%g2
+ srlx %l1,32,%o1
+ srlx %l5,32,%o5
+ subccc %o1,%o5,%g0
+ subccc %l2,%l6,%g3
+ srlx %l2,32,%o2
+ srlx %l6,32,%g5 ! asymmetry
+ subccc %o2,%g5,%g0
+ subccc %l3,%l7,%g4
+ srlx %l3,32,%o3
+ srlx %l7,32,%o7
+ subccc %o3,%o7,%g0
+ stx %g1,[%i0+0]
+ stx %g2,[%i0+8]
+ stx %g3,[%i0+16]
+ stx %g4,[%i0+24]
+ add %i0,32,%i0
+
+L(there):
+ add %i3,4,%i3
+ brz,pt %i3,L(end)
+ nop
+
+L(loop2):
+ ldx [%i1+0],%l0
+ add %i1,8,%i1
+ ldx [%i2+0],%l4
+ add %i2,8,%i2
+ srlx %l0,32,%g2
+ srlx %l4,32,%g3
+ subccc %l0,%l4,%g1
+ subccc %g2,%g3,%g0
+ stx %g1,[%i0+0]
+ add %i0,8,%i0
+ add %i3,-1,%i3
+ brgz,pt %i3,L(loop2)
+ nop
+
+L(end): addc %g0,%g0,%i0
+ ret
+ restore
+EPILOGUE(mpn_sub_n)
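
This is the add_n skeleton with subccc in place of addccc; the 32-bit-split
trick propagates the borrow the same way, and the final addc %g0,%g0,%i0
returns it as 0 or 1.  The contract as a C sketch:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    /* res[i] = s1[i] - s2[i] - borrow; returns the final borrow (0 or 1). */
    static mp_limb_t ref_sub_n(mp_limb_t *res, const mp_limb_t *s1,
                               const mp_limb_t *s2, size_t n)
    {
        mp_limb_t bw = 0;
        for (size_t i = 0; i < n; i++) {
            mp_limb_t a = s1[i], b = s2[i];
            res[i] = a - b - bw;
            bw = (a < b) || (bw && a == b);    /* borrow out of this limb */
        }
        return bw;
    }
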
diff --git a/rts/gmp/mpn/sparc64/submul1h.asm b/rts/gmp/mpn/sparc64/submul1h.asm
new file mode 100644
index 0000000000..7f51ba59c6
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/submul1h.asm
@@ -0,0 +1,204 @@
+dnl SPARC 64-bit submull/submulu -- Helper for mpn_submul_1 and mpn_mul_1.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+ifdef(`LOWPART',
+`submull:',
+`submulu:')
+ save %sp,-256,%sp
+
+ sethi %hi(0xffff0000),%o0
+ andn %i3,%o0,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f6
+
+ srl %i3,16,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f8
+
+ mov 0,%g3 C cy = 0
+
+ ld [%i1+4],%f11
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end1)
+ add %i1,4,%i1 C s1_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,E(end2)
+ std %f12,[%fp-17]
+
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end3)
+ std %f12,[%fp-33]
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ ld [%i0+DLO],%g5
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ add %i0,4,%i0 C res_ptr++
+ subcc %i2,1,%i2
+ be,pn %icc,E(end4)
+ std %f12,[%fp-17]
+
+ b,a E(loop)
+ nop C nop is cheap to nullify
+
+ ALIGN(16)
+C BEGIN LOOP
+E(loop):
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0+DHI],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ sub %i2,2,%i2
+ add %i0,4,%i0 C res_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0+DLO],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4+DHI]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ std %f12,[%fp-17]
+ brnz,pt %i2,E(loop)
+ add %i0,4,%i0 C res_ptr++
+C END LOOP
+E(loope):
+E(end4):
+ fxtod %f10,%f2
+ add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0+DHI],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0+DLO],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4+DHI]
+ b,a E(yyy)
+
+E(end2):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ ld [%i0+DLO],%g5
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ifdef(`LOWPART',
+` ld [%i0+DHI],%g5')
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ ldx [%fp-33],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4+DLO]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ifdef(`LOWPART',
+` subxcc %g5,%g4,%l2') C add *res_ptr to p0 (ADD2)
+ifdef(`LOWPART',
+` st %l2,[%i0-4+DHI]
+ srlx %g4,32,%g4')
+
+ addx %g4,0,%g4
+ ret
+ restore %g0,%g4,%o0 C sideeffect: put cy in retreg
+ifdef(`LOWPART',
+`EPILOGUE(submull)',
+`EPILOGUE(submulu)')
diff --git a/rts/gmp/mpn/sparc64/submul_1.asm b/rts/gmp/mpn/sparc64/submul_1.asm
new file mode 100644
index 0000000000..7c6af0a98b
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/submul_1.asm
@@ -0,0 +1,114 @@
+dnl SPARC 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+
+PROLOGUE(mpn_submul_1)
+ save %sp,-256,%sp
+
+C We store 0.0 in f10 and keep it invariant across the two
+C function calls below. Note that this is not ABI conformant,
+C but since the functions are local, that's acceptable.
+ifdef(`PIC',
+`L(pc): rd %pc,%o7
+ ld [%o7+L(noll)-L(pc)],%f10',
+` sethi %hh(L(noll)),%g2
+ sethi %lm(L(noll)),%g1
+ or %g2,%hm(L(noll)),%g2
+ or %g1,%lo(L(noll)),%g1
+ sllx %g2,32,%g2
+ ld [%g1+%g2],%f10')
+
+ sub %i1,%i0,%g1
+ srlx %g1,3,%g1
+ cmp %g1,%i2
+ bcc,pt %xcc,L(nooverlap)
+ nop
+
+ sllx %i2,3,%g2 C compute stack allocation byte count
+ add %g2,15,%o0
+ and %o0,-16,%o0
+ sub %sp,%o0,%sp
+ add %sp,2223,%o0
+
+ mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp
+ call mpn_copyi
+ mov %i2,%o2 C copy n to mpn_copyi's count parameter
+
+ add %sp,2223,%i1
+
+L(nooverlap):
+C First multiply-add with low 32 bits of s2_limb
+ mov %i0,%o0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call submull
+ srl %i3,0,%o3
+
+ mov %o0,%l0 C keep carry-out from accmull
+
+C Now multiply-add with high 32 bits of s2_limb, unless it is zero.
+ srlx %i3,32,%o3
+ brz,a,pn %o3,L(small)
+ mov %o0,%i0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call submulu
+ add %i0,4,%o0
+
+ add %l0,%o0,%i0
+L(small):
+ ret
+ restore %g0,%g0,%g0
+EPILOGUE(mpn_submul_1)
+
+C Put a zero in the text segment to allow us to get the address
+C quickly when compiling for PIC
+ TEXT
+ ALIGN(4)
+L(noll):
+ .word 0
+
+define(`LO',`(+4)')
+define(`HI',`(-4)')
+
+define(`DLO',`(+4)')
+define(`DHI',`(-4)')
+define(`LOWPART')
+define(`E',`L(l.$1)')
+include_mpn(`sparc64/submul1h.asm')
+
+define(`DLO',`(-4)')
+define(`DHI',`(+4)')
+undefine(`LOWPART')
+define(`E',`L(u.$1)')
+include_mpn(`sparc64/submul1h.asm')
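
Like addmul_1, the wrapper above makes two passes over the vector with the
32-bit halves of s2_limb, but the helpers subtract the product from the
destination.  The overall contract, as a plain C sketch (again using the
compiler's unsigned __int128 extension):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    /* res[0..n-1] -= s1[0..n-1] * v; returns the borrow limb that would
       have to be subtracted from res[n].  */
    static mp_limb_t ref_submul_1(mp_limb_t *res, const mp_limb_t *s1,
                                  size_t n, mp_limb_t v)
    {
        mp_limb_t cy = 0;
        for (size_t i = 0; i < n; i++) {
            unsigned __int128 t = (unsigned __int128) s1[i] * v + cy;
            mp_limb_t lo = (mp_limb_t) t;
            cy = (mp_limb_t) (t >> 64);
            mp_limb_t r = res[i];
            res[i] = r - lo;
            cy += (r < lo);                    /* borrow from this limb */
        }
        return cy;
    }
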