path: root/rts/gmp/mpn/sparc64
Diffstat (limited to 'rts/gmp/mpn/sparc64')

-rw-r--r--  rts/gmp/mpn/sparc64/README         48
-rw-r--r--  rts/gmp/mpn/sparc64/add_n.asm     172
-rw-r--r--  rts/gmp/mpn/sparc64/addmul1h.asm  203
-rw-r--r--  rts/gmp/mpn/sparc64/addmul_1.asm  114
-rw-r--r--  rts/gmp/mpn/sparc64/copyi.asm      79
-rw-r--r--  rts/gmp/mpn/sparc64/gmp-mparam.h   88
-rw-r--r--  rts/gmp/mpn/sparc64/lshift.asm     97
-rw-r--r--  rts/gmp/mpn/sparc64/mul_1.asm     113
-rw-r--r--  rts/gmp/mpn/sparc64/mul_1h.asm    183
-rw-r--r--  rts/gmp/mpn/sparc64/rshift.asm     94
-rw-r--r--  rts/gmp/mpn/sparc64/sub_n.asm     172
-rw-r--r--  rts/gmp/mpn/sparc64/submul1h.asm  204
-rw-r--r--  rts/gmp/mpn/sparc64/submul_1.asm  114

13 files changed, 1681 insertions(+), 0 deletions(-)
diff --git a/rts/gmp/mpn/sparc64/README b/rts/gmp/mpn/sparc64/README
new file mode 100644
index 0000000000..6923a133f3
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/README
@@ -0,0 +1,48 @@
+This directory contains mpn functions for 64-bit V9 SPARC
+
+RELEVANT OPTIMIZATION ISSUES
+
+The Ultra I/II pipeline executes up to two simple integer arithmetic operations
+per cycle. The 64-bit integer multiply instruction mulx takes from 5 cycles to
+35 cycles, depending on the position of the most significant bit of the 1st
+source operand. It cannot overlap with other instructions. For our use of
+mulx, it will take from 5 to 20 cycles.
+
+Integer conditional move instructions cannot dual-issue with other integer
+instructions. No conditional move can issue 1-5 cycles after a load. (Or
+something similarly bizarre.)
+
+Integer branches can issue with two integer arithmetic instructions. Likewise
+for integer loads. Four instructions may issue (arith, arith, ld/st, branch)
+but only if the branch is last.
+
+(The V9 architecture manual recommends that the 2nd operand of a multiply
+instruction be the smaller one. For UltraSPARC, they got things backwards and
+optimize for the wrong operand! Really helpful, given that multiply is
+incredibly slow on these CPUs!)
+
+STATUS
+
+There is new code in ~/prec/gmp-remote/sparc64. Not tested or completed, but
+the pipelines are worked out. Here are the timings:
+
+* lshift, rshift: The code is well-optimized and runs at 2.0 cycles/limb.
+
+* add_n, sub_n: add3.s currently runs at 6 cycles/limb. We use a bizarre
+ scheme of compares and branches (with some nops and fnops to align things)
+ and carefully stay away from the instructions intended for this application
+ (i.e., movcs and movcc).
+
+ Using movcc/movcs, even with deep unrolling, seems to get down to 7
+ cycles/limb.
+
+  The most promising approach is to split the operands into 32-bit pieces
+  using srlx, then use two addccc, and finally combine the results with
+  sllx+or (a rough C sketch of that per-limb step follows this file).  The
+  result could run at 5 cycles/limb, I think.  It might be possible to do
+  without unrolling, or with minimal unrolling.
+
+* addmul_1/submul_1: Should optimize for when scalar operand < 2^32.
+* addmul_1/submul_1: Since mulx is horrendously slow on UltraSPARC I/II,
+ Karatsuba's method should save up to 16 cycles (i.e. > 20%).
+* mul_1 (and possibly the other multiply functions): Handle carry in the
+ same tricky way as add_n,sub_n.
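
A rough C sketch of the 32-bit-split carry scheme proposed above (illustrative
only: the helper name and types are made up, and the assembly keeps the carry
in the condition codes rather than in a register):

    #include <stdint.h>

    /* One limb of the proposed add_n scheme: add two 64-bit limbs plus an
       incoming carry using only 32-bit pieces -- srlx to split, two addccc
       to add and propagate the carry, sllx+or to recombine.  */
    static uint64_t add_limb_split32(uint64_t a, uint64_t b, unsigned cy_in,
                                     unsigned *cy_out)
    {
        uint64_t lo = (a & 0xffffffff) + (b & 0xffffffff) + cy_in;  /* low addccc  */
        uint64_t hi = (a >> 32) + (b >> 32) + (lo >> 32);           /* high addccc */
        *cy_out = (unsigned) (hi >> 32);                            /* carry out   */
        return (hi << 32) | (lo & 0xffffffff);                      /* sllx + or   */
    }
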
diff --git a/rts/gmp/mpn/sparc64/add_n.asm b/rts/gmp/mpn/sparc64/add_n.asm
new file mode 100644
index 0000000000..72b3895a5b
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/add_n.asm
@@ -0,0 +1,172 @@
+! SPARC v9 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+! sum in a third limb vector.
+
+! Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr %o0
+! s1_ptr %o1
+! s2_ptr %o2
+! size %o3
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_add_n)
+
+! 12 mem ops >= 12 cycles
+! 8 shift insn >= 8 cycles
+! 8 addccc, executing alone, +8 cycles
+! Unrolling not mandatory...perhaps 2-way is best?
+! Put one ldx/stx and one s?lx per issue tuple, fill with pointer arith and loop ctl
+! All in all, it runs at 5 cycles/limb
+
+ save %sp,-160,%sp
+
+ addcc %g0,%g0,%g0
+
+ add %i3,-4,%i3
+ brlz,pn %i3,L(there)
+ nop
+
+ ldx [%i1+0],%l0
+ ldx [%i2+0],%l4
+ ldx [%i1+8],%l1
+ ldx [%i2+8],%l5
+ ldx [%i1+16],%l2
+ ldx [%i2+16],%l6
+ ldx [%i1+24],%l3
+ ldx [%i2+24],%l7
+ add %i1,32,%i1
+ add %i2,32,%i2
+
+ add %i3,-4,%i3
+ brlz,pn %i3,L(skip)
+ nop
+ b L(loop1) ! jump instead of executing many NOPs
+ nop
+ ALIGN(32)
+!--------- Start main loop ---------
+L(loop1):
+ addccc %l0,%l4,%g1
+!-
+ srlx %l0,32,%o0
+ ldx [%i1+0],%l0
+!-
+ srlx %l4,32,%o4
+ ldx [%i2+0],%l4
+!-
+ addccc %o0,%o4,%g0
+!-
+ addccc %l1,%l5,%g2
+!-
+ srlx %l1,32,%o1
+ ldx [%i1+8],%l1
+!-
+ srlx %l5,32,%o5
+ ldx [%i2+8],%l5
+!-
+ addccc %o1,%o5,%g0
+!-
+ addccc %l2,%l6,%g3
+!-
+ srlx %l2,32,%o2
+ ldx [%i1+16],%l2
+!-
+ srlx %l6,32,%g5 ! asymmetry
+ ldx [%i2+16],%l6
+!-
+ addccc %o2,%g5,%g0
+!-
+ addccc %l3,%l7,%g4
+!-
+ srlx %l3,32,%o3
+ ldx [%i1+24],%l3
+ add %i1,32,%i1
+!-
+ srlx %l7,32,%o7
+ ldx [%i2+24],%l7
+ add %i2,32,%i2
+!-
+ addccc %o3,%o7,%g0
+!-
+ stx %g1,[%i0+0]
+!-
+ stx %g2,[%i0+8]
+!-
+ stx %g3,[%i0+16]
+ add %i3,-4,%i3
+!-
+ stx %g4,[%i0+24]
+ add %i0,32,%i0
+
+ brgez,pt %i3,L(loop1)
+ nop
+!--------- End main loop ---------
+L(skip):
+ addccc %l0,%l4,%g1
+ srlx %l0,32,%o0
+ srlx %l4,32,%o4
+ addccc %o0,%o4,%g0
+ addccc %l1,%l5,%g2
+ srlx %l1,32,%o1
+ srlx %l5,32,%o5
+ addccc %o1,%o5,%g0
+ addccc %l2,%l6,%g3
+ srlx %l2,32,%o2
+ srlx %l6,32,%g5 ! asymmetry
+ addccc %o2,%g5,%g0
+ addccc %l3,%l7,%g4
+ srlx %l3,32,%o3
+ srlx %l7,32,%o7
+ addccc %o3,%o7,%g0
+ stx %g1,[%i0+0]
+ stx %g2,[%i0+8]
+ stx %g3,[%i0+16]
+ stx %g4,[%i0+24]
+ add %i0,32,%i0
+
+L(there):
+ add %i3,4,%i3
+ brz,pt %i3,L(end)
+ nop
+
+L(loop2):
+ ldx [%i1+0],%l0
+ add %i1,8,%i1
+ ldx [%i2+0],%l4
+ add %i2,8,%i2
+ srlx %l0,32,%g2
+ srlx %l4,32,%g3
+ addccc %l0,%l4,%g1
+ addccc %g2,%g3,%g0
+ stx %g1,[%i0+0]
+ add %i0,8,%i0
+ add %i3,-1,%i3
+ brgz,pt %i3,L(loop2)
+ nop
+
+L(end): addc %g0,%g0,%i0
+ ret
+ restore
+EPILOGUE(mpn_add_n)
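
For reference, the contract implemented by mpn_add_n above, written as plain C
(a sketch of the semantics only, not GMP's generic code):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    /* res[i] = s1[i] + s2[i] with carry propagation; returns the final
       carry (0 or 1), which the assembly materializes with addc at L(end). */
    static mp_limb_t ref_add_n(mp_limb_t *res, const mp_limb_t *s1,
                               const mp_limb_t *s2, size_t n)
    {
        mp_limb_t cy = 0;
        for (size_t i = 0; i < n; i++) {
            mp_limb_t a = s1[i], b = s2[i];
            mp_limb_t s = a + b + cy;
            cy = (s < a) || (cy && s == a);    /* did the 64-bit add wrap? */
            res[i] = s;
        }
        return cy;
    }
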
diff --git a/rts/gmp/mpn/sparc64/addmul1h.asm b/rts/gmp/mpn/sparc64/addmul1h.asm
new file mode 100644
index 0000000000..96cb5f7369
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/addmul1h.asm
@@ -0,0 +1,203 @@
+dnl SPARC 64-bit addmull/addmulu -- Helper for mpn_addmul_1 and mpn_mul_1.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+ifdef(`LOWPART',
+`addmull:',
+`addmulu:')
+ save %sp,-256,%sp
+
+ sethi %hi(0xffff0000),%o0
+ andn %i3,%o0,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f6
+
+ srl %i3,16,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f8
+
+ mov 0,%g3 C cy = 0
+
+ ld [%i1+4],%f11
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end1)
+ add %i1,4,%i1 C s1_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,E(end2)
+ std %f12,[%fp-17]
+
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end3)
+ std %f12,[%fp-33]
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ ld [%i0+DLO],%g5
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ add %i0,4,%i0 C res_ptr++
+ subcc %i2,1,%i2
+ be,pn %icc,E(end4)
+ std %f12,[%fp-17]
+
+ b,a E(loop)
+ nop C nop is cheap to nullify
+
+ ALIGN(16)
+C BEGIN LOOP
+E(loop):
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0+DHI],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ sub %i2,2,%i2
+ add %i0,4,%i0 C res_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0+DLO],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DHI]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ std %f12,[%fp-17]
+ brnz,pt %i2,E(loop)
+ add %i0,4,%i0 C res_ptr++
+C END LOOP
+E(loope):
+E(end4):
+ fxtod %f10,%f2
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0+DHI],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0+DLO],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DHI]
+ b,a E(yyy)
+
+E(end2):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ ld [%i0+DLO],%g5
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ifdef(`LOWPART',
+` ld [%i0+DHI],%g5')
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ ldx [%fp-33],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ifdef(`LOWPART',
+` add %g5,%g1,%g1') C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ifdef(`LOWPART',
+` st %g4,[%i0-4+DHI]
+ srlx %g4,32,%g4')
+
+ ret
+ restore %g0,%g4,%o0 C sideeffect: put cy in retreg
+ifdef(`LOWPART',
+`EPILOGUE(addmull)',
+`EPILOGUE(addmulu)')
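
The helper above never uses mulx: each 32-bit piece of s1 is converted to
double with fxtod, multiplied by the low and high 16-bit halves of the 32-bit
multiplier with fmuld, converted back with fdtox, and the two pieces are
recombined as p0 + (p16 << 16).  Every such product fits in 48 bits, so it is
exact in a 53-bit double mantissa.  A C model of one such partial product
(illustrative; the real code pipelines the conversions through the stack
slots at %fp-17 .. %fp-41):

    #include <stdint.h>

    /* 32x32 -> 64 bit multiply done with two exact double-precision
       multiplies of 16-bit halves, the way addmull/addmulu build each
       partial product.  */
    static uint64_t mul32_via_double(uint32_t s1_piece, uint32_t v)
    {
        double d   = (double) s1_piece;        /* fxtod                */
        double vlo = (double) (v & 0xffff);    /* low half of v  (f6)  */
        double vhi = (double) (v >> 16);       /* high half of v (f8)  */
        uint64_t p0  = (uint64_t) (d * vlo);   /* fmuld + fdtox        */
        uint64_t p16 = (uint64_t) (d * vhi);   /* fmuld + fdtox        */
        return p0 + (p16 << 16);               /* sllx 16, add (ADD1)  */
    }
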
diff --git a/rts/gmp/mpn/sparc64/addmul_1.asm b/rts/gmp/mpn/sparc64/addmul_1.asm
new file mode 100644
index 0000000000..c3f04cea6a
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/addmul_1.asm
@@ -0,0 +1,114 @@
+dnl SPARC 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and
+dnl add the result to a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+
+PROLOGUE(mpn_addmul_1)
+ save %sp,-256,%sp
+
+C We store 0.0 in f10 and keep it invariant across the two
+C function calls below. Note that this is not ABI conformant,
+C but since the functions are local, that's acceptable.
+ifdef(`PIC',
+`L(pc): rd %pc,%o7
+ ld [%o7+L(noll)-L(pc)],%f10',
+` sethi %hh(L(noll)),%g2
+ sethi %lm(L(noll)),%g1
+ or %g2,%hm(L(noll)),%g2
+ or %g1,%lo(L(noll)),%g1
+ sllx %g2,32,%g2
+ ld [%g1+%g2],%f10')
+
+ sub %i1,%i0,%g1
+ srlx %g1,3,%g1
+ cmp %g1,%i2
+ bcc,pt %xcc,L(nooverlap)
+ nop
+
+ sllx %i2,3,%g2 C compute stack allocation byte count
+ add %g2,15,%o0
+ and %o0,-16,%o0
+ sub %sp,%o0,%sp
+ add %sp,2223,%o0
+
+ mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp
+ call mpn_copyi
+ mov %i2,%o2 C copy n to mpn_copyi's count parameter
+
+ add %sp,2223,%i1
+
+L(nooverlap):
+C First multiply-add with low 32 bits of s2_limb
+ mov %i0,%o0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call addmull
+ srl %i3,0,%o3
+
+ mov %o0,%l0 C keep carry-out from accmull
+
+C Now multiply-add with high 32 bits of s2_limb, unless it is zero.
+ srlx %i3,32,%o3
+ brz,a,pn %o3,L(small)
+ mov %o0,%i0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call addmulu
+ add %i0,4,%o0
+
+ add %l0,%o0,%i0
+L(small):
+ ret
+ restore %g0,%g0,%g0
+EPILOGUE(mpn_addmul_1)
+
+C Put a zero in the text segment to allow us to get the address
+C quickly when compiling for PIC
+ TEXT
+ ALIGN(4)
+L(noll):
+ .word 0
+
+define(`LO',`(+4)')
+define(`HI',`(-4)')
+
+define(`DLO',`(+4)')
+define(`DHI',`(-4)')
+define(`LOWPART')
+define(`E',`L(l.$1)')
+include_mpn(`sparc64/addmul1h.asm')
+
+define(`DLO',`(-4)')
+define(`DHI',`(+4)')
+undefine(`LOWPART')
+define(`E',`L(u.$1)')
+include_mpn(`sparc64/addmul1h.asm')
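
The wrapper above runs the helper twice: once with the low 32 bits of s2_limb
over the whole vector (addmull), and once with the high 32 bits with the
destination advanced by one 32-bit word (addmulu), so the second pass lands
shifted left by 32 bits; the two carry-outs are then combined.  The overall
contract, as a plain C sketch (using the compiler's unsigned __int128
extension, which the assembly of course avoids):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    /* res[0..n-1] += s1[0..n-1] * v; returns the carry limb that would be
       added into res[n].  */
    static mp_limb_t ref_addmul_1(mp_limb_t *res, const mp_limb_t *s1,
                                  size_t n, mp_limb_t v)
    {
        mp_limb_t cy = 0;
        for (size_t i = 0; i < n; i++) {
            unsigned __int128 t = (unsigned __int128) s1[i] * v + res[i] + cy;
            res[i] = (mp_limb_t) t;
            cy = (mp_limb_t) (t >> 64);
        }
        return cy;
    }
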
diff --git a/rts/gmp/mpn/sparc64/copyi.asm b/rts/gmp/mpn/sparc64/copyi.asm
new file mode 100644
index 0000000000..d9957e3c90
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/copyi.asm
@@ -0,0 +1,79 @@
+! SPARC v9 __gmpn_copy -- Copy a limb vector.
+
+! Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! rptr %o0
+! sptr %o1
+! n %o2
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_copyi)
+ add %o2,-8,%o2
+ brlz,pn %o2,L(skip)
+ nop
+ b,a L(loop1)
+ nop
+
+ ALIGN(16)
+L(loop1):
+ ldx [%o1+0],%g1
+ ldx [%o1+8],%g2
+ ldx [%o1+16],%g3
+ ldx [%o1+24],%g4
+ ldx [%o1+32],%g5
+ ldx [%o1+40],%o3
+ ldx [%o1+48],%o4
+ ldx [%o1+56],%o5
+ add %o1,64,%o1
+ stx %g1,[%o0+0]
+ stx %g2,[%o0+8]
+ stx %g3,[%o0+16]
+ stx %g4,[%o0+24]
+ stx %g5,[%o0+32]
+ stx %o3,[%o0+40]
+ stx %o4,[%o0+48]
+ stx %o5,[%o0+56]
+ add %o2,-8,%o2
+ brgez,pt %o2,L(loop1)
+ add %o0,64,%o0
+
+L(skip):
+ add %o2,8,%o2
+ brz,pt %o2,L(end)
+ nop
+
+L(loop2):
+ ldx [%o1],%g1
+ add %o1,8,%o1
+ add %o2,-1,%o2
+ stx %g1,[%o0]
+ add %o0,8,%o0
+ brgz,pt %o2,L(loop2)
+ nop
+
+L(end): retl
+ nop
+EPILOGUE(mpn_copyi)
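
mpn_copyi copies n limbs in increasing address order; the assembly moves
eight limbs per iteration and finishes with a one-limb cleanup loop.  The
same structure as a C sketch:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    static void ref_copyi(mp_limb_t *rp, const mp_limb_t *sp, size_t n)
    {
        size_t i = 0;
        for (; i + 8 <= n; i += 8)            /* L(loop1): 8 limbs per pass    */
            for (size_t j = 0; j < 8; j++)
                rp[i + j] = sp[i + j];
        for (; i < n; i++)                    /* L(loop2): 0..7 leftover limbs */
            rp[i] = sp[i];
    }
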
diff --git a/rts/gmp/mpn/sparc64/gmp-mparam.h b/rts/gmp/mpn/sparc64/gmp-mparam.h
new file mode 100644
index 0000000000..74f61661c1
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/gmp-mparam.h
@@ -0,0 +1,88 @@
+/* Sparc64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* Tell the toom3 multiply implementation to call low-level mpn
+ functions instead of open-coding operations in C. */
+#define USE_MORE_MPN 1
+
+
+/* Run on Sun Workshop cc. */
+/* Generated by tuneup.c, 2000-07-30. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 12
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 95
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 33
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 125
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 27
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 107
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 12
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 199
+#endif
+
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE { 304, 608, 1344, 2304, 7168, 20480, 49152, 0 }
+#endif
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD 320
+#endif
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD 1664
+#endif
+
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE { 304, 608, 1344, 2816, 7168, 20480, 49152, 0 }
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD 320
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD 1664
+#endif
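
The *_THRESHOLD values above are operand sizes, in limbs, at which the tuned
code switches algorithms.  A schematic of how such a threshold is consumed
(the enum and function are made up for illustration; this is not GMP's
internal dispatch code):

    #define KARATSUBA_MUL_THRESHOLD 12     /* values from this header */
    #define TOOM3_MUL_THRESHOLD     95

    typedef enum { MUL_BASECASE, MUL_KARATSUBA, MUL_TOOM3 } mul_algo_t;

    /* Pick a multiplication algorithm from the operand size n (in limbs):
       schoolbook below the Karatsuba threshold, Karatsuba below the toom3
       threshold, toom3 beyond that (the FFT thresholds are left out).  */
    static mul_algo_t choose_mul_algo(unsigned long n)
    {
        if (n < KARATSUBA_MUL_THRESHOLD)
            return MUL_BASECASE;
        if (n < TOOM3_MUL_THRESHOLD)
            return MUL_KARATSUBA;
        return MUL_TOOM3;
    }
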
diff --git a/rts/gmp/mpn/sparc64/lshift.asm b/rts/gmp/mpn/sparc64/lshift.asm
new file mode 100644
index 0000000000..2d2edc50a7
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/lshift.asm
@@ -0,0 +1,97 @@
+! SPARC v9 __gmpn_lshift -- Shift a limb vector left by a given bit count.
+
+! Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr %o0
+! src_ptr %o1
+! size %o2
+! cnt %o3
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_lshift)
+ sllx %o2,3,%g1
+ add %o1,%g1,%o1 ! make %o1 point at end of src
+ ldx [%o1-8],%g2 ! load first limb
+ sub %g0,%o3,%o5 ! negate shift count
+ add %o0,%g1,%o0 ! make %o0 point at end of res
+ add %o2,-1,%o2
+ and %o2,4-1,%g4 ! number of limbs in first loop
+ srlx %g2,%o5,%g1 ! compute function result
+ brz,pn %g4,L(0) ! if multiple of 4 limbs, skip first loop
+ mov %g1,%g5
+
+ sub %o2,%g4,%o2 ! adjust count for main loop
+
+L(loop0):
+ ldx [%o1-16],%g3
+ add %o0,-8,%o0
+ add %o1,-8,%o1
+ add %g4,-1,%g4
+ sllx %g2,%o3,%o4
+ srlx %g3,%o5,%g1
+ mov %g3,%g2
+ or %o4,%g1,%o4
+ brnz,pt %g4,L(loop0)
+ stx %o4,[%o0+0]
+
+L(0): brz,pn %o2,L(end)
+ nop
+
+L(loop1):
+ ldx [%o1-16],%g3
+ add %o0,-32,%o0
+ add %o2,-4,%o2
+ sllx %g2,%o3,%o4
+ srlx %g3,%o5,%g1
+
+ ldx [%o1-24],%g2
+ sllx %g3,%o3,%g4
+ or %o4,%g1,%o4
+ stx %o4,[%o0+24]
+ srlx %g2,%o5,%g1
+
+ ldx [%o1-32],%g3
+ sllx %g2,%o3,%o4
+ or %g4,%g1,%g4
+ stx %g4,[%o0+16]
+ srlx %g3,%o5,%g1
+
+ ldx [%o1-40],%g2
+ sllx %g3,%o3,%g4
+ or %o4,%g1,%o4
+ stx %o4,[%o0+8]
+ srlx %g2,%o5,%g1
+
+ add %o1,-32,%o1
+ or %g4,%g1,%g4
+ brnz,pt %o2,L(loop1)
+ stx %g4,[%o0+0]
+
+L(end): sllx %g2,%o3,%g2
+ stx %g2,[%o0-8]
+ retl
+ mov %g5,%o0
+EPILOGUE(mpn_lshift)
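
What the routine above computes, as a C sketch of the contract: shift an
n-limb operand left by cnt bits (1 <= cnt <= 63) and return the bits shifted
out of the top.  Scanning from the most significant limb downward matches the
assembly and keeps an in-place shift (rp == sp) valid.

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    static mp_limb_t ref_lshift(mp_limb_t *rp, const mp_limb_t *sp,
                                size_t n, unsigned cnt)
    {
        unsigned tnc = 64 - cnt;
        mp_limb_t high = sp[n - 1];
        mp_limb_t retval = high >> tnc;            /* function result (%g5) */
        for (size_t i = n - 1; i > 0; i--) {
            mp_limb_t low = sp[i - 1];
            rp[i] = (high << cnt) | (low >> tnc);
            high = low;
        }
        rp[0] = high << cnt;
        return retval;
    }
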
diff --git a/rts/gmp/mpn/sparc64/mul_1.asm b/rts/gmp/mpn/sparc64/mul_1.asm
new file mode 100644
index 0000000000..f2f2821d51
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/mul_1.asm
@@ -0,0 +1,113 @@
+dnl SPARC 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and
+dnl store the result to a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+
+PROLOGUE(mpn_mul_1)
+ save %sp,-256,%sp
+
+C We store 0.0 in f10 and keep it invariant across the two
+C function calls below. Note that this is not ABI conformant,
+C but since the functions are local, that's acceptable.
+ifdef(`PIC',
+`L(pc): rd %pc,%o7
+ ld [%o7+L(noll)-L(pc)],%f10',
+` sethi %hh(L(noll)),%g2
+ sethi %lm(L(noll)),%g1
+ or %g2,%hm(L(noll)),%g2
+ or %g1,%lo(L(noll)),%g1
+ sllx %g2,32,%g2
+ ld [%g1+%g2],%f10')
+
+ sub %i1,%i0,%g1
+ srlx %g1,3,%g1
+ cmp %g1,%i2
+ bcc,pt %xcc,L(nooverlap)
+ nop
+
+ sllx %i2,3,%g2 C compute stack allocation byte count
+ add %g2,15,%o0
+ and %o0,-16,%o0
+ sub %sp,%o0,%sp
+ add %sp,2223,%o0
+
+ mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp
+ call mpn_copyi
+ mov %i2,%o2 C copy n to mpn_copyi's count parameter
+
+ add %sp,2223,%i1
+
+L(nooverlap):
+C First multiply-add with low 32 bits of s2_limb
+ mov %i0,%o0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call mull
+ srl %i3,0,%o3
+
+ mov %o0,%l0 C keep carry-out from accmull
+
+C Now multiply-add with high 32 bits of s2_limb, unless it is zero.
+ srlx %i3,32,%o3
+ brz,a,pn %o3,L(small)
+ mov %o0,%i0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call addmulu
+ add %i0,4,%o0
+
+ add %l0,%o0,%i0
+L(small):
+ ret
+ restore %g0,%g0,%g0
+EPILOGUE(mpn_mul_1)
+
+C Put a zero in the text segment to allow us to get the address
+C quickly when compiling for PIC
+ TEXT
+ ALIGN(4)
+L(noll):
+ .word 0
+
+define(`LO',`(+4)')
+define(`HI',`(-4)')
+
+define(`DLO',`(+4)')
+define(`DHI',`(-4)')
+define(`E',`L($1)')
+include_mpn(`sparc64/mul_1h.asm')
+
+define(`DLO',`(-4)')
+define(`DHI',`(+4)')
+undefine(`LOWPART')
+define(`E',`L(u.$1)')
+include_mpn(`sparc64/addmul1h.asm')
diff --git a/rts/gmp/mpn/sparc64/mul_1h.asm b/rts/gmp/mpn/sparc64/mul_1h.asm
new file mode 100644
index 0000000000..5078c01c3f
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/mul_1h.asm
@@ -0,0 +1,183 @@
+dnl SPARC 64-bit mull -- Helper for mpn_mul_1.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+mull:
+ save %sp,-256,%sp
+
+ sethi %hi(0xffff0000),%o0
+ andn %i3,%o0,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f6
+
+ srl %i3,16,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f8
+
+ mov 0,%g3 C cy = 0
+
+ ld [%i1+4],%f11
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end1)
+ add %i1,4,%i1 C s1_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,E(end2)
+ std %f12,[%fp-17]
+
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end3)
+ std %f12,[%fp-33]
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ add %i0,4,%i0 C res_ptr++
+ subcc %i2,1,%i2
+ be,pn %icc,E(end4)
+ std %f12,[%fp-17]
+
+ b,a E(loop)
+ nop C nop is cheap to nullify
+
+ ALIGN(16)
+C BEGIN LOOP
+E(loop):
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ sub %i2,2,%i2
+ add %i0,4,%i0 C res_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DHI]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ std %f12,[%fp-17]
+ brnz,pt %i2,E(loop)
+ add %i0,4,%i0 C res_ptr++
+C END LOOP
+E(loope):
+E(end4):
+ fxtod %f10,%f2
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DHI]
+ b,a E(yyy)
+
+E(end2):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ ldx [%fp-33],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ st %g4,[%i0-4+DHI]
+ srlx %g4,32,%g4
+
+ ret
+ restore %g0,%g4,%o0 C sideeffect: put cy in retreg
+EPILOGUE(mull)
diff --git a/rts/gmp/mpn/sparc64/rshift.asm b/rts/gmp/mpn/sparc64/rshift.asm
new file mode 100644
index 0000000000..baf7920efb
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/rshift.asm
@@ -0,0 +1,94 @@
+! SPARC v9 __gmpn_rshift -- Shift a limb vector right by a given bit count.
+
+! Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr %o0
+! src_ptr %o1
+! size %o2
+! cnt %o3
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_rshift)
+ ldx [%o1],%g2 ! load first limb
+ sub %g0,%o3,%o5 ! negate shift count
+ add %o2,-1,%o2
+ and %o2,4-1,%g4 ! number of limbs in first loop
+ sllx %g2,%o5,%g1 ! compute function result
+ brz,pn %g4,L(0) ! if multiple of 4 limbs, skip first loop
+ mov %g1,%g5
+
+ sub %o2,%g4,%o2 ! adjust count for main loop
+
+L(loop0):
+ ldx [%o1+8],%g3
+ add %o0,8,%o0
+ add %o1,8,%o1
+ add %g4,-1,%g4
+ srlx %g2,%o3,%o4
+ sllx %g3,%o5,%g1
+ mov %g3,%g2
+ or %o4,%g1,%o4
+ brnz,pt %g4,L(loop0)
+ stx %o4,[%o0-8]
+
+L(0): brz,pn %o2,L(end)
+ nop
+
+L(loop1):
+ ldx [%o1+8],%g3
+ add %o0,32,%o0
+ add %o2,-4,%o2
+ srlx %g2,%o3,%o4
+ sllx %g3,%o5,%g1
+
+ ldx [%o1+16],%g2
+ srlx %g3,%o3,%g4
+ or %o4,%g1,%o4
+ stx %o4,[%o0-32]
+ sllx %g2,%o5,%g1
+
+ ldx [%o1+24],%g3
+ srlx %g2,%o3,%o4
+ or %g4,%g1,%g4
+ stx %g4,[%o0-24]
+ sllx %g3,%o5,%g1
+
+ ldx [%o1+32],%g2
+ srlx %g3,%o3,%g4
+ or %o4,%g1,%o4
+ stx %o4,[%o0-16]
+ sllx %g2,%o5,%g1
+
+ add %o1,32,%o1
+ or %g4,%g1,%g4
+ brnz %o2,L(loop1)
+ stx %g4,[%o0-8]
+
+L(end): srlx %g2,%o3,%g2
+ stx %g2,[%o0-0]
+ retl
+ mov %g5,%o0
+EPILOGUE(mpn_rshift)
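
mpn_rshift is the mirror image: it scans from the least significant limb
upward (so an in-place rp == sp shift stays valid) and returns the cnt bits
shifted out of the bottom limb, left-justified in a limb.  As a C sketch:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    static mp_limb_t ref_rshift(mp_limb_t *rp, const mp_limb_t *sp,
                                size_t n, unsigned cnt)
    {
        unsigned tnc = 64 - cnt;
        mp_limb_t retval = sp[0] << tnc;           /* bits shifted out (%g5) */
        for (size_t i = 0; i + 1 < n; i++)
            rp[i] = (sp[i] >> cnt) | (sp[i + 1] << tnc);
        rp[n - 1] = sp[n - 1] >> cnt;
        return retval;
    }
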
diff --git a/rts/gmp/mpn/sparc64/sub_n.asm b/rts/gmp/mpn/sparc64/sub_n.asm
new file mode 100644
index 0000000000..61547138e0
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/sub_n.asm
@@ -0,0 +1,172 @@
+! SPARC v9 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+! store difference in a third limb vector.
+
+! Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr %o0
+! s1_ptr %o1
+! s2_ptr %o2
+! size %o3
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_sub_n)
+
+! 12 mem ops >= 12 cycles
+! 8 shift insn >= 8 cycles
+! 8 addccc, executing alone, +8 cycles
+! Unrolling not mandatory...perhaps 2-way is best?
+! Put one ldx/stx and one s?lx per issue tuple, fill with pointer arith and loop ctl
+! All in all, it runs at 5 cycles/limb
+
+ save %sp,-160,%sp
+
+ addcc %g0,%g0,%g0
+
+ add %i3,-4,%i3
+ brlz,pn %i3,L(there)
+ nop
+
+ ldx [%i1+0],%l0
+ ldx [%i2+0],%l4
+ ldx [%i1+8],%l1
+ ldx [%i2+8],%l5
+ ldx [%i1+16],%l2
+ ldx [%i2+16],%l6
+ ldx [%i1+24],%l3
+ ldx [%i2+24],%l7
+ add %i1,32,%i1
+ add %i2,32,%i2
+
+ add %i3,-4,%i3
+ brlz,pn %i3,L(skip)
+ nop
+ b L(loop1) ! jump instead of executing many NOPs
+ nop
+ ALIGN(32)
+!--------- Start main loop ---------
+L(loop1):
+ subccc %l0,%l4,%g1
+!-
+ srlx %l0,32,%o0
+ ldx [%i1+0],%l0
+!-
+ srlx %l4,32,%o4
+ ldx [%i2+0],%l4
+!-
+ subccc %o0,%o4,%g0
+!-
+ subccc %l1,%l5,%g2
+!-
+ srlx %l1,32,%o1
+ ldx [%i1+8],%l1
+!-
+ srlx %l5,32,%o5
+ ldx [%i2+8],%l5
+!-
+ subccc %o1,%o5,%g0
+!-
+ subccc %l2,%l6,%g3
+!-
+ srlx %l2,32,%o2
+ ldx [%i1+16],%l2
+!-
+ srlx %l6,32,%g5 ! asymmetry
+ ldx [%i2+16],%l6
+!-
+ subccc %o2,%g5,%g0
+!-
+ subccc %l3,%l7,%g4
+!-
+ srlx %l3,32,%o3
+ ldx [%i1+24],%l3
+ add %i1,32,%i1
+!-
+ srlx %l7,32,%o7
+ ldx [%i2+24],%l7
+ add %i2,32,%i2
+!-
+ subccc %o3,%o7,%g0
+!-
+ stx %g1,[%i0+0]
+!-
+ stx %g2,[%i0+8]
+!-
+ stx %g3,[%i0+16]
+ add %i3,-4,%i3
+!-
+ stx %g4,[%i0+24]
+ add %i0,32,%i0
+
+ brgez,pt %i3,L(loop1)
+ nop
+!--------- End main loop ---------
+L(skip):
+ subccc %l0,%l4,%g1
+ srlx %l0,32,%o0
+ srlx %l4,32,%o4
+ subccc %o0,%o4,%g0
+ subccc %l1,%l5,%g2
+ srlx %l1,32,%o1
+ srlx %l5,32,%o5
+ subccc %o1,%o5,%g0
+ subccc %l2,%l6,%g3
+ srlx %l2,32,%o2
+ srlx %l6,32,%g5 ! asymmetry
+ subccc %o2,%g5,%g0
+ subccc %l3,%l7,%g4
+ srlx %l3,32,%o3
+ srlx %l7,32,%o7
+ subccc %o3,%o7,%g0
+ stx %g1,[%i0+0]
+ stx %g2,[%i0+8]
+ stx %g3,[%i0+16]
+ stx %g4,[%i0+24]
+ add %i0,32,%i0
+
+L(there):
+ add %i3,4,%i3
+ brz,pt %i3,L(end)
+ nop
+
+L(loop2):
+ ldx [%i1+0],%l0
+ add %i1,8,%i1
+ ldx [%i2+0],%l4
+ add %i2,8,%i2
+ srlx %l0,32,%g2
+ srlx %l4,32,%g3
+ subccc %l0,%l4,%g1
+ subccc %g2,%g3,%g0
+ stx %g1,[%i0+0]
+ add %i0,8,%i0
+ add %i3,-1,%i3
+ brgz,pt %i3,L(loop2)
+ nop
+
+L(end): addc %g0,%g0,%i0
+ ret
+ restore
+EPILOGUE(mpn_sub_n)
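
This is the add_n skeleton with subccc in place of addccc; the 32-bit-split
trick propagates the borrow the same way, and the final addc %g0,%g0,%i0
returns it as 0 or 1.  The contract as a C sketch:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    /* res[i] = s1[i] - s2[i] - borrow; returns the final borrow (0 or 1). */
    static mp_limb_t ref_sub_n(mp_limb_t *res, const mp_limb_t *s1,
                               const mp_limb_t *s2, size_t n)
    {
        mp_limb_t bw = 0;
        for (size_t i = 0; i < n; i++) {
            mp_limb_t a = s1[i], b = s2[i];
            res[i] = a - b - bw;
            bw = (a < b) || (bw && a == b);    /* borrow out of this limb */
        }
        return bw;
    }
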
diff --git a/rts/gmp/mpn/sparc64/submul1h.asm b/rts/gmp/mpn/sparc64/submul1h.asm
new file mode 100644
index 0000000000..7f51ba59c6
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/submul1h.asm
@@ -0,0 +1,204 @@
+dnl SPARC 64-bit submull/submulu -- Helper for mpn_submul_1 and mpn_mul_1.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+ifdef(`LOWPART',
+`submull:',
+`submulu:')
+ save %sp,-256,%sp
+
+ sethi %hi(0xffff0000),%o0
+ andn %i3,%o0,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f6
+
+ srl %i3,16,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f8
+
+ mov 0,%g3 C cy = 0
+
+ ld [%i1+4],%f11
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end1)
+ add %i1,4,%i1 C s1_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,E(end2)
+ std %f12,[%fp-17]
+
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end3)
+ std %f12,[%fp-33]
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ ld [%i0+DLO],%g5
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ add %i0,4,%i0 C res_ptr++
+ subcc %i2,1,%i2
+ be,pn %icc,E(end4)
+ std %f12,[%fp-17]
+
+ b,a E(loop)
+ nop C nop is cheap to nullify
+
+ ALIGN(16)
+C BEGIN LOOP
+E(loop):
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0+DHI],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ sub %i2,2,%i2
+ add %i0,4,%i0 C res_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0+DLO],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4+DHI]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ std %f12,[%fp-17]
+ brnz,pt %i2,E(loop)
+ add %i0,4,%i0 C res_ptr++
+C END LOOP
+E(loope):
+E(end4):
+ fxtod %f10,%f2
+ add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0+DHI],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0+DLO],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4+DHI]
+ b,a E(yyy)
+
+E(end2):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ ld [%i0+DLO],%g5
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ifdef(`LOWPART',
+` ld [%i0+DHI],%g5')
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ ldx [%fp-33],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4+DLO]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ifdef(`LOWPART',
+` subxcc %g5,%g4,%l2') C add *res_ptr to p0 (ADD2)
+ifdef(`LOWPART',
+` st %l2,[%i0-4+DHI]
+ srlx %g4,32,%g4')
+
+ addx %g4,0,%g4
+ ret
+ restore %g0,%g4,%o0 C sideeffect: put cy in retreg
+ifdef(`LOWPART',
+`EPILOGUE(submull)',
+`EPILOGUE(submulu)')
diff --git a/rts/gmp/mpn/sparc64/submul_1.asm b/rts/gmp/mpn/sparc64/submul_1.asm
new file mode 100644
index 0000000000..7c6af0a98b
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/submul_1.asm
@@ -0,0 +1,114 @@
+dnl SPARC 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+
+PROLOGUE(mpn_submul_1)
+ save %sp,-256,%sp
+
+C We store 0.0 in f10 and keep it invariant across the two
+C function calls below. Note that this is not ABI conformant,
+C but since the functions are local, that's acceptable.
+ifdef(`PIC',
+`L(pc): rd %pc,%o7
+ ld [%o7+L(noll)-L(pc)],%f10',
+` sethi %hh(L(noll)),%g2
+ sethi %lm(L(noll)),%g1
+ or %g2,%hm(L(noll)),%g2
+ or %g1,%lo(L(noll)),%g1
+ sllx %g2,32,%g2
+ ld [%g1+%g2],%f10')
+
+ sub %i1,%i0,%g1
+ srlx %g1,3,%g1
+ cmp %g1,%i2
+ bcc,pt %xcc,L(nooverlap)
+ nop
+
+ sllx %i2,3,%g2 C compute stack allocation byte count
+ add %g2,15,%o0
+ and %o0,-16,%o0
+ sub %sp,%o0,%sp
+ add %sp,2223,%o0
+
+ mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp
+ call mpn_copyi
+ mov %i2,%o2 C copy n to mpn_copyi's count parameter
+
+ add %sp,2223,%i1
+
+L(nooverlap):
+C First multiply-add with low 32 bits of s2_limb
+ mov %i0,%o0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call submull
+ srl %i3,0,%o3
+
+ mov %o0,%l0 C keep carry-out from accmull
+
+C Now multiply-add with high 32 bits of s2_limb, unless it is zero.
+ srlx %i3,32,%o3
+ brz,a,pn %o3,L(small)
+ mov %o0,%i0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call submulu
+ add %i0,4,%o0
+
+ add %l0,%o0,%i0
+L(small):
+ ret
+ restore %g0,%g0,%g0
+EPILOGUE(mpn_submul_1)
+
+C Put a zero in the text segment to allow us to get the address
+C quickly when compiling for PIC
+ TEXT
+ ALIGN(4)
+L(noll):
+ .word 0
+
+define(`LO',`(+4)')
+define(`HI',`(-4)')
+
+define(`DLO',`(+4)')
+define(`DHI',`(-4)')
+define(`LOWPART')
+define(`E',`L(l.$1)')
+include_mpn(`sparc64/submul1h.asm')
+
+define(`DLO',`(-4)')
+define(`DHI',`(+4)')
+undefine(`LOWPART')
+define(`E',`L(u.$1)')
+include_mpn(`sparc64/submul1h.asm')
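
Like addmul_1, the wrapper above makes two passes over the vector with the
32-bit halves of s2_limb, but the helpers subtract the product from the
destination.  The overall contract, as a plain C sketch (again using the
compiler's unsigned __int128 extension):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    /* res[0..n-1] -= s1[0..n-1] * v; returns the borrow limb that would
       have to be subtracted from res[n].  */
    static mp_limb_t ref_submul_1(mp_limb_t *res, const mp_limb_t *s1,
                                  size_t n, mp_limb_t v)
    {
        mp_limb_t cy = 0;
        for (size_t i = 0; i < n; i++) {
            unsigned __int128 t = (unsigned __int128) s1[i] * v + cy;
            mp_limb_t lo = (mp_limb_t) t;
            cy = (mp_limb_t) (t >> 64);
            mp_limb_t r = res[i];
            res[i] = r - lo;
            cy += (r < lo);                    /* borrow from this limb */
        }
        return cy;
    }
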