20 files changed, 1993 insertions, 0 deletions
diff --git a/rts/gmp/mpn/hppa/README b/rts/gmp/mpn/hppa/README
new file mode 100644
index 0000000000..97e7abe011
--- /dev/null
+++ b/rts/gmp/mpn/hppa/README
@@ -0,0 +1,91 @@
+This directory contains mpn functions for various HP PA-RISC chips.  Code
+that runs faster on the PA7100 and later implementations, is in the pa7100
+directory.
+
+RELEVANT OPTIMIZATION ISSUES
+
+  Load and Store timing
+
+On the PA7000 no memory instructions can issue the two cycles after a store.
+For the PA7100, this is reduced to one cycle.
+
+The PA7100 has a lookup-free cache, so it helps to schedule loads and the
+dependent instruction really far from each other.
+
+STATUS
+
+1. mpn_mul_1 could be improved to 6.5 cycles/limb on the PA7100, using the
+   instructions below (but some sw pipelining is needed to avoid the
+   xmpyu-fstds delay):
+
+	fldds	s1_ptr
+
+	xmpyu
+	fstds	N(%r30)
+	xmpyu
+	fstds	N(%r30)
+
+	ldws	N(%r30)
+	ldws	N(%r30)
+	ldws	N(%r30)
+	ldws	N(%r30)
+
+	addc
+	stws	res_ptr
+	addc
+	stws	res_ptr
+
+	addib	Loop
+
+2. mpn_addmul_1 could be improved from the current 10 to 7.5 cycles/limb
+   (asymptotically) on the PA7100, using the instructions below.  With proper
+   sw pipelining and the unrolling level below, the speed becomes 8
+   cycles/limb.
+
+	fldds	s1_ptr
+	fldds	s1_ptr
+
+	xmpyu
+	fstds	N(%r30)
+	xmpyu
+	fstds	N(%r30)
+	xmpyu
+	fstds	N(%r30)
+	xmpyu
+	fstds	N(%r30)
+
+	ldws	N(%r30)
+	ldws	N(%r30)
+	ldws	N(%r30)
+	ldws	N(%r30)
+	ldws	N(%r30)
+	ldws	N(%r30)
+	ldws	N(%r30)
+	ldws	N(%r30)
+	addc
+	addc
+	addc
+	addc
+	addc	%r0,%r0,cy-limb
+
+	ldws	res_ptr
+	ldws	res_ptr
+	ldws	res_ptr
+	ldws	res_ptr
+	add
+	stws	res_ptr
+	addc
+	stws	res_ptr
+	addc
+	stws	res_ptr
+	addc
+	stws	res_ptr
+
+	addib
+
+3. For the PA8000 we have to stick to using 32-bit limbs before compiler
+   support emerges.  But we want to use 64-bit operations whenever possible,
+   in particular for loads and stores.  It is possible to handle mpn_add_n
+   efficiently by rotating (when s1/s2 are aligned), masking+bit field
+   inserting when (they are not).  The speed should double compared to the
+   code used today.
diff --git a/rts/gmp/mpn/hppa/add_n.s b/rts/gmp/mpn/hppa/add_n.s
new file mode 100644
index 0000000000..c53b2f71b3
--- /dev/null
+++ b/rts/gmp/mpn/hppa/add_n.s
@@ -0,0 +1,58 @@
+; HP-PA  __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	gr26
+; s1_ptr	gr25
+; s2_ptr	gr24
+; size		gr23
+
+; One might want to unroll this as for other processors, but it turns
+; out that the data cache contention after a store makes such
+; unrolling useless.  We can't come under 5 cycles/limb anyway.
+
+	.code
+	.export		__gmpn_add_n
+__gmpn_add_n
+	.proc
+	.callinfo	frame=0,no_calls
+	.entry
+
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+
+	addib,=		-1,%r23,L$end	; check for (SIZE == 1)
+	 add		%r20,%r19,%r28	; add first limbs ignoring cy
+
+L$loop	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	addib,<>	-1,%r23,L$loop
+	 addc		%r20,%r19,%r28
+
+L$end	stws		%r28,0(0,%r26)
+	bv		0(%r2)
+	 addc		%r0,%r0,%r28
+
+	.exit
+	.procend
diff --git a/rts/gmp/mpn/hppa/gmp-mparam.h b/rts/gmp/mpn/hppa/gmp-mparam.h
new file mode 100644
index 0000000000..98b6d9ce3c
--- /dev/null
+++ b/rts/gmp/mpn/hppa/gmp-mparam.h
@@ -0,0 +1,63 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* These values are for the PA7100 using GCC.  */
+/* Generated by tuneup.c, 2000-07-25. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD   30
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD      172
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD   59
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD      185
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD              96
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD            122
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD            18
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD       46
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD          33
+#endif
diff --git a/rts/gmp/mpn/hppa/hppa1_1/addmul_1.s b/rts/gmp/mpn/hppa/hppa1_1/addmul_1.s
new file mode 100644
index 0000000000..c7d218f922
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/addmul_1.s
@@ -0,0 +1,102 @@
+; HP-PA-1.1 __gmpn_addmul_1 -- Multiply a limb vector with a limb and
+; add the result to a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	r26
+; s1_ptr	r25
+; size		r24
+; s2_limb	r23
+
+; This runs at 11 cycles/limb on a PA7000.  With the used instructions, it
+; can not become faster due to data cache contention after a store.  On the
+; PA7100 it runs at 10 cycles/limb, and that can not be improved either,
+; since only the xmpyu does not need the integer pipeline, so the only
+; dual-issue we will get are addc+xmpyu.  Unrolling could gain a cycle/limb
+; on the PA7100.
+
+; There are some ideas described in mul_1.s that applies to this code too.
+
+	.code
+	.export		__gmpn_addmul_1
+__gmpn_addmul_1
+	.proc
+	.callinfo	frame=64,no_calls
+	.entry
+
+	ldo		64(%r30),%r30
+	fldws,ma	4(%r25),%fr5
+	stw		%r23,-16(%r30)		; move s2_limb ...
+	addib,=		-1,%r24,L$just_one_limb
+	 fldws		-16(%r30),%fr4		; ... into fr4
+	add		%r0,%r0,%r0		; clear carry
+	xmpyu		%fr4,%fr5,%fr6
+	fldws,ma	4(%r25),%fr7
+	fstds		%fr6,-16(%r30)
+	xmpyu		%fr4,%fr7,%fr8
+	ldw		-12(%r30),%r19		; least significant limb in product
+	ldw		-16(%r30),%r28
+
+	fstds		%fr8,-16(%r30)
+	addib,=		-1,%r24,L$end
+	 ldw		-12(%r30),%r1
+
+; Main loop
+L$loop	ldws		0(%r26),%r29
+	fldws,ma	4(%r25),%fr5
+	add		%r29,%r19,%r19
+	stws,ma		%r19,4(%r26)
+	addc		%r28,%r1,%r19
+	xmpyu		%fr4,%fr5,%fr6
+	ldw		-16(%r30),%r28
+	fstds		%fr6,-16(%r30)
+	addc		%r0,%r28,%r28
+	addib,<>	-1,%r24,L$loop
+	 ldw		-12(%r30),%r1
+
+L$end	ldw		0(%r26),%r29
+	add		%r29,%r19,%r19
+	stws,ma		%r19,4(%r26)
+	addc		%r28,%r1,%r19
+	ldw		-16(%r30),%r28
+	ldws		0(%r26),%r29
+	addc		%r0,%r28,%r28
+	add		%r29,%r19,%r19
+	stws,ma		%r19,4(%r26)
+	addc		%r0,%r28,%r28
+	bv		0(%r2)
+	 ldo		-64(%r30),%r30
+
+L$just_one_limb
+	xmpyu		%fr4,%fr5,%fr6
+	ldw		0(%r26),%r29
+	fstds		%fr6,-16(%r30)
+	ldw		-12(%r30),%r1
+	ldw		-16(%r30),%r28
+	add		%r29,%r1,%r19
+	stw		%r19,0(%r26)
+	addc		%r0,%r28,%r28
+	bv		0(%r2)
+	 ldo		-64(%r30),%r30
+
+	.exit
+	.procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/mul_1.s b/rts/gmp/mpn/hppa/hppa1_1/mul_1.s
new file mode 100644
index 0000000000..4512fddec9
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/mul_1.s
@@ -0,0 +1,98 @@
+; HP-PA-1.1 __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+; the result in a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	r26
+; s1_ptr	r25
+; size		r24
+; s2_limb	r23
+
+; This runs at 9 cycles/limb on a PA7000.  With the used instructions, it can
+; not become faster due to data cache contention after a store.  On the
+; PA7100 it runs at 7 cycles/limb, and that can not be improved either, since
+; only the xmpyu does not need the integer pipeline, so the only dual-issue
+; we will get are addc+xmpyu.  Unrolling would not help either CPU.
+
+; We could use fldds to read two limbs at a time from the S1 array, and that
+; could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and
+; PA7100, respectively.  We don't do that since it does not seem worth the
+; (alignment) troubles...
+
+; At least the PA7100 is rumored to be able to deal with cache-misses
+; without stalling instruction issue.  If this is true, and the cache is
+; actually also lockup-free, we should use a deeper software pipeline, and
+; load from S1 very early!  (The loads and stores to -12(sp) will surely be
+; in the cache.)
+
+	.code
+	.export		__gmpn_mul_1
+__gmpn_mul_1
+	.proc
+	.callinfo	frame=64,no_calls
+	.entry
+
+	ldo		64(%r30),%r30
+	fldws,ma	4(%r25),%fr5
+	stw		%r23,-16(%r30)		; move s2_limb ...
+	addib,=		-1,%r24,L$just_one_limb
+	 fldws		-16(%r30),%fr4		; ... into fr4
+	add		%r0,%r0,%r0		; clear carry
+	xmpyu		%fr4,%fr5,%fr6
+	fldws,ma	4(%r25),%fr7
+	fstds	 	%fr6,-16(%r30)
+	xmpyu		%fr4,%fr7,%fr8
+	ldw		-12(%r30),%r19		; least significant limb in product
+	ldw		-16(%r30),%r28
+
+	fstds		%fr8,-16(%r30)
+	addib,=		-1,%r24,L$end
+	 ldw		-12(%r30),%r1
+
+; Main loop
+L$loop	fldws,ma	4(%r25),%fr5
+	stws,ma		%r19,4(%r26)
+	addc		%r28,%r1,%r19
+	xmpyu		%fr4,%fr5,%fr6
+	ldw		-16(%r30),%r28
+	fstds		%fr6,-16(%r30)
+	addib,<>	-1,%r24,L$loop
+	 ldw		-12(%r30),%r1
+
+L$end	stws,ma		%r19,4(%r26)
+	addc		%r28,%r1,%r19
+	ldw		-16(%r30),%r28
+	stws,ma		%r19,4(%r26)
+	addc		%r0,%r28,%r28
+	bv		0(%r2)
+	 ldo		-64(%r30),%r30
+
+L$just_one_limb
+	xmpyu		%fr4,%fr5,%fr6
+	fstds		%fr6,-16(%r30)
+	ldw		-16(%r30),%r28
+	ldo		-64(%r30),%r30
+	bv		0(%r2)
+	 fstws		%fr6R,0(%r26)
+
+	.exit
+	.procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s
new file mode 100644
index 0000000000..4f4be08b37
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s
@@ -0,0 +1,75 @@
+; HP-PA  __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+; This is optimized for the PA7100, where is runs at 4.25 cycles/limb
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	gr26
+; s1_ptr	gr25
+; s2_ptr	gr24
+; size		gr23
+
+	.code
+	.export		__gmpn_add_n
+__gmpn_add_n
+	.proc
+	.callinfo	frame=0,no_calls
+	.entry
+
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+
+	addib,<=	-5,%r23,L$rest
+	 add		%r20,%r19,%r28	; add first limbs ignoring cy
+
+L$loop	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	addc		%r20,%r19,%r28
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	addc		%r20,%r19,%r28
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	addc		%r20,%r19,%r28
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	addib,>		-4,%r23,L$loop
+	addc		%r20,%r19,%r28
+
+L$rest	addib,=		4,%r23,L$end
+	nop
+L$eloop	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	addib,>		-1,%r23,L$eloop
+	addc		%r20,%r19,%r28
+
+L$end	stws		%r28,0(0,%r26)
+	bv		0(%r2)
+	 addc		%r0,%r0,%r28
+
+	.exit
+	.procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S b/rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S
new file mode 100644
index 0000000000..04db06822e
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S
@@ -0,0 +1,189 @@
+; HP-PA 7100/7200 __gmpn_addmul_1 -- Multiply a limb vector with a limb and
+; add the result to a second limb vector.
+
+; Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define res_ptr	%r26
+#define s1_ptr	%r25
+#define size	%r24
+#define s2_limb	%r23
+
+#define cylimb	%r28
+#define s0	%r19
+#define s1	%r20
+#define s2	%r3
+#define s3	%r4
+#define lo0	%r21
+#define lo1	%r5
+#define lo2	%r6
+#define lo3	%r7
+#define hi0	%r22
+#define hi1	%r23				/* safe to reuse */
+#define hi2	%r29
+#define hi3	%r1
+
+	.code
+	.export		__gmpn_addmul_1
+__gmpn_addmul_1
+	.proc
+	.callinfo	frame=128,no_calls
+	.entry
+
+	ldo	128(%r30),%r30
+	stws	s2_limb,-16(%r30)
+	add	 %r0,%r0,cylimb			; clear cy and cylimb
+	addib,<	-4,size,L$few_limbs
+	fldws	-16(%r30),%fr31R
+
+	ldo	-112(%r30),%r31
+	stw	%r3,-96(%r30)
+	stw	%r4,-92(%r30)
+	stw	%r5,-88(%r30)
+	stw	%r6,-84(%r30)
+	stw	%r7,-80(%r30)
+
+	bb,>=,n	 s1_ptr,29,L$0
+
+	fldws,ma 4(s1_ptr),%fr4
+	ldws	 0(res_ptr),s0
+	xmpyu	 %fr4,%fr31R,%fr5
+	fstds	 %fr5,-16(%r31)
+	ldws	-16(%r31),cylimb
+	ldws	-12(%r31),lo0
+	add	 s0,lo0,s0
+	addib,< -1,size,L$few_limbs
+	stws,ma	 s0,4(res_ptr)
+
+; start software pipeline ----------------------------------------------------
+L$0	fldds,ma 8(s1_ptr),%fr4
+	fldds,ma 8(s1_ptr),%fr8
+
+	xmpyu	 %fr4L,%fr31R,%fr5
+	xmpyu	 %fr4R,%fr31R,%fr6
+	xmpyu	 %fr8L,%fr31R,%fr9
+	xmpyu	 %fr8R,%fr31R,%fr10
+
+	fstds	 %fr5,-16(%r31)
+	fstds	 %fr6,-8(%r31)
+	fstds	 %fr9,0(%r31)
+	fstds	 %fr10,8(%r31)
+
+	ldws   -16(%r31),hi0
+	ldws   -12(%r31),lo0
+	ldws	-8(%r31),hi1
+	ldws	-4(%r31),lo1
+	ldws	 0(%r31),hi2
+	ldws	 4(%r31),lo2
+	ldws	 8(%r31),hi3
+	ldws	12(%r31),lo3
+
+	addc	 lo0,cylimb,lo0
+	addc	 lo1,hi0,lo1
+	addc	 lo2,hi1,lo2
+	addc	 lo3,hi2,lo3
+
+	addib,<	 -4,size,L$end
+	addc	 %r0,hi3,cylimb			; propagate carry into cylimb
+; main loop ------------------------------------------------------------------
+L$loop	fldds,ma 8(s1_ptr),%fr4
+	fldds,ma 8(s1_ptr),%fr8
+
+	ldws	 0(res_ptr),s0
+	xmpyu	 %fr4L,%fr31R,%fr5
+	ldws	 4(res_ptr),s1
+	xmpyu	 %fr4R,%fr31R,%fr6
+	ldws	 8(res_ptr),s2
+	xmpyu	 %fr8L,%fr31R,%fr9
+	ldws	12(res_ptr),s3
+	xmpyu	 %fr8R,%fr31R,%fr10
+
+	fstds	 %fr5,-16(%r31)
+	add	 s0,lo0,s0
+	fstds	 %fr6,-8(%r31)
+	addc	 s1,lo1,s1
+	fstds	 %fr9,0(%r31)
+	addc	 s2,lo2,s2
+	fstds	 %fr10,8(%r31)
+	addc	 s3,lo3,s3
+
+	ldws   -16(%r31),hi0
+	ldws   -12(%r31),lo0
+	ldws	-8(%r31),hi1
+	ldws	-4(%r31),lo1
+	ldws	 0(%r31),hi2
+	ldws	 4(%r31),lo2
+	ldws	 8(%r31),hi3
+	ldws	12(%r31),lo3
+
+	addc	 lo0,cylimb,lo0
+	stws,ma	 s0,4(res_ptr)
+	addc	 lo1,hi0,lo1
+	stws,ma	 s1,4(res_ptr)
+	addc	 lo2,hi1,lo2
+	stws,ma	 s2,4(res_ptr)
+	addc	 lo3,hi2,lo3
+	stws,ma	 s3,4(res_ptr)
+
+	addib,>= -4,size,L$loop
+	addc	 %r0,hi3,cylimb			; propagate carry into cylimb
+; finish software pipeline ---------------------------------------------------
+L$end	ldws	 0(res_ptr),s0
+	ldws	 4(res_ptr),s1
+	ldws	 8(res_ptr),s2
+	ldws	12(res_ptr),s3
+
+	add	 s0,lo0,s0
+	stws,ma	 s0,4(res_ptr)
+	addc	 s1,lo1,s1
+	stws,ma	 s1,4(res_ptr)
+	addc	 s2,lo2,s2
+	stws,ma	 s2,4(res_ptr)
+	addc	 s3,lo3,s3
+	stws,ma	 s3,4(res_ptr)
+
+; restore callee-saves registers ---------------------------------------------
+	ldw	-96(%r30),%r3
+	ldw	-92(%r30),%r4
+	ldw	-88(%r30),%r5
+	ldw	-84(%r30),%r6
+	ldw	-80(%r30),%r7
+
+L$few_limbs
+	addib,=,n 4,size,L$ret
+L$loop2	fldws,ma 4(s1_ptr),%fr4
+	ldws	 0(res_ptr),s0
+	xmpyu	 %fr4,%fr31R,%fr5
+	fstds	 %fr5,-16(%r30)
+	ldws	-16(%r30),hi0
+	ldws	-12(%r30),lo0
+	addc	 lo0,cylimb,lo0
+	addc	 %r0,hi0,cylimb
+	add	 s0,lo0,s0
+	stws,ma	 s0,4(res_ptr)
+	addib,<> -1,size,L$loop2
+	nop
+
+L$ret	addc	 %r0,cylimb,cylimb
+	bv	 0(%r2)
+	ldo	 -128(%r30),%r30
+
+	.exit
+	.procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s
new file mode 100644
index 0000000000..31669b1a55
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s
@@ -0,0 +1,83 @@
+; HP-PA  __gmpn_lshift --
+; This is optimized for the PA7100, where is runs at 3.25 cycles/limb
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	gr26
+; s_ptr		gr25
+; size		gr24
+; cnt		gr23
+
+	.code
+	.export		__gmpn_lshift
+__gmpn_lshift
+	.proc
+	.callinfo	frame=64,no_calls
+	.entry
+
+	sh2add		%r24,%r25,%r25
+	sh2add		%r24,%r26,%r26
+	ldws,mb		-4(0,%r25),%r22
+	subi		32,%r23,%r1
+	mtsar		%r1
+	addib,=		-1,%r24,L$0004
+	vshd		%r0,%r22,%r28		; compute carry out limb
+	ldws,mb		-4(0,%r25),%r29
+	addib,<=	-5,%r24,L$rest
+	vshd		%r22,%r29,%r20
+
+L$loop	ldws,mb		-4(0,%r25),%r22
+	stws,mb		%r20,-4(0,%r26)
+	vshd		%r29,%r22,%r20
+	ldws,mb		-4(0,%r25),%r29
+	stws,mb		%r20,-4(0,%r26)
+	vshd		%r22,%r29,%r20
+	ldws,mb		-4(0,%r25),%r22
+	stws,mb		%r20,-4(0,%r26)
+	vshd		%r29,%r22,%r20
+	ldws,mb		-4(0,%r25),%r29
+	stws,mb		%r20,-4(0,%r26)
+	addib,>		-4,%r24,L$loop
+	vshd		%r22,%r29,%r20
+
+L$rest	addib,=		4,%r24,L$end1
+	nop
+L$eloop	ldws,mb		-4(0,%r25),%r22
+	stws,mb		%r20,-4(0,%r26)
+	addib,<=	-1,%r24,L$end2
+	vshd		%r29,%r22,%r20
+	ldws,mb		-4(0,%r25),%r29
+	stws,mb		%r20,-4(0,%r26)
+	addib,>		-1,%r24,L$eloop
+	vshd		%r22,%r29,%r20
+
+L$end1	stws,mb		%r20,-4(0,%r26)
+	vshd		%r29,%r0,%r20
+	bv		0(%r2)
+	stw		%r20,-4(0,%r26)
+L$end2	stws,mb		%r20,-4(0,%r26)
+L$0004	vshd		%r22,%r0,%r20
+	bv		0(%r2)
+	stw		%r20,-4(0,%r26)
+
+	.exit
+	.procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s
new file mode 100644
index 0000000000..d32b10b4b1
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s
@@ -0,0 +1,80 @@
+; HP-PA  __gmpn_rshift --
+; This is optimized for the PA7100, where is runs at 3.25 cycles/limb
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	gr26
+; s_ptr		gr25
+; size		gr24
+; cnt		gr23
+
+	.code
+	.export		__gmpn_rshift
+__gmpn_rshift
+	.proc
+	.callinfo	frame=64,no_calls
+	.entry
+
+	ldws,ma		4(0,%r25),%r22
+	mtsar		%r23
+	addib,=		-1,%r24,L$0004
+	vshd		%r22,%r0,%r28		; compute carry out limb
+	ldws,ma		4(0,%r25),%r29
+	addib,<=	-5,%r24,L$rest
+	vshd		%r29,%r22,%r20
+
+L$loop	ldws,ma		4(0,%r25),%r22
+	stws,ma		%r20,4(0,%r26)
+	vshd		%r22,%r29,%r20
+	ldws,ma		4(0,%r25),%r29
+	stws,ma		%r20,4(0,%r26)
+	vshd		%r29,%r22,%r20
+	ldws,ma		4(0,%r25),%r22
+	stws,ma		%r20,4(0,%r26)
+	vshd		%r22,%r29,%r20
+	ldws,ma		4(0,%r25),%r29
+	stws,ma		%r20,4(0,%r26)
+	addib,>		-4,%r24,L$loop
+	vshd		%r29,%r22,%r20
+
+L$rest	addib,=		4,%r24,L$end1
+	nop
+L$eloop	ldws,ma		4(0,%r25),%r22
+	stws,ma		%r20,4(0,%r26)
+	addib,<=	-1,%r24,L$end2
+	vshd		%r22,%r29,%r20
+	ldws,ma		4(0,%r25),%r29
+	stws,ma		%r20,4(0,%r26)
+	addib,>		-1,%r24,L$eloop
+	vshd		%r29,%r22,%r20
+
+L$end1	stws,ma		%r20,4(0,%r26)
+	vshd		%r0,%r29,%r20
+	bv		0(%r2)
+	stw		%r20,0(0,%r26)
+L$end2	stws,ma		%r20,4(0,%r26)
+L$0004	vshd		%r0,%r22,%r20
+	bv		0(%r2)
+	stw		%r20,0(0,%r26)
+
+	.exit
+	.procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s
new file mode 100644
index 0000000000..0eec41c4b3
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s
@@ -0,0 +1,76 @@
+; HP-PA  __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+; This is optimized for the PA7100, where is runs at 4.25 cycles/limb
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	gr26
+; s1_ptr	gr25
+; s2_ptr	gr24
+; size		gr23
+
+	.code
+	.export		__gmpn_sub_n
+__gmpn_sub_n
+	.proc
+	.callinfo	frame=0,no_calls
+	.entry
+
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+
+	addib,<=	-5,%r23,L$rest
+	 sub		%r20,%r19,%r28	; subtract first limbs ignoring cy
+
+L$loop	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	subb		%r20,%r19,%r28
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	subb		%r20,%r19,%r28
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	subb		%r20,%r19,%r28
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	addib,>		-4,%r23,L$loop
+	subb		%r20,%r19,%r28
+
+L$rest	addib,=		4,%r23,L$end
+	nop
+L$eloop	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	addib,>		-1,%r23,L$eloop
+	subb		%r20,%r19,%r28
+
+L$end	stws		%r28,0(0,%r26)
+	addc		%r0,%r0,%r28
+	bv		0(%r2)
+	 subi		1,%r28,%r28
+
+	.exit
+	.procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S b/rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S
new file mode 100644
index 0000000000..0fba21dcef
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S
@@ -0,0 +1,195 @@
+; HP-PA 7100/7200 __gmpn_submul_1 -- Multiply a limb vector with a limb and
+; subtract the result from a second limb vector.
+
+; Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define res_ptr	%r26
+#define s1_ptr	%r25
+#define size	%r24
+#define s2_limb	%r23
+
+#define cylimb	%r28
+#define s0	%r19
+#define s1	%r20
+#define s2	%r3
+#define s3	%r4
+#define lo0	%r21
+#define lo1	%r5
+#define lo2	%r6
+#define lo3	%r7
+#define hi0	%r22
+#define hi1	%r23				/* safe to reuse */
+#define hi2	%r29
+#define hi3	%r1
+
+	.code
+	.export		__gmpn_submul_1
+__gmpn_submul_1
+	.proc
+	.callinfo	frame=128,no_calls
+	.entry
+
+	ldo	128(%r30),%r30
+	stws	s2_limb,-16(%r30)
+	add	 %r0,%r0,cylimb			; clear cy and cylimb
+	addib,<	-4,size,L$few_limbs
+	fldws	-16(%r30),%fr31R
+
+	ldo	-112(%r30),%r31
+	stw	%r3,-96(%r30)
+	stw	%r4,-92(%r30)
+	stw	%r5,-88(%r30)
+	stw	%r6,-84(%r30)
+	stw	%r7,-80(%r30)
+
+	bb,>=,n	 s1_ptr,29,L$0
+
+	fldws,ma 4(s1_ptr),%fr4
+	ldws	 0(res_ptr),s0
+	xmpyu	 %fr4,%fr31R,%fr5
+	fstds	 %fr5,-16(%r31)
+	ldws	-16(%r31),cylimb
+	ldws	-12(%r31),lo0
+	sub	 s0,lo0,s0
+	add	 s0,lo0,%r0			; invert cy
+	addib,< -1,size,L$few_limbs
+	stws,ma	 s0,4(res_ptr)
+
+; start software pipeline ----------------------------------------------------
+L$0	fldds,ma 8(s1_ptr),%fr4
+	fldds,ma 8(s1_ptr),%fr8
+
+	xmpyu	 %fr4L,%fr31R,%fr5
+	xmpyu	 %fr4R,%fr31R,%fr6
+	xmpyu	 %fr8L,%fr31R,%fr9
+	xmpyu	 %fr8R,%fr31R,%fr10
+
+	fstds	 %fr5,-16(%r31)
+	fstds	 %fr6,-8(%r31)
+	fstds	 %fr9,0(%r31)
+	fstds	 %fr10,8(%r31)
+
+	ldws   -16(%r31),hi0
+	ldws   -12(%r31),lo0
+	ldws	-8(%r31),hi1
+	ldws	-4(%r31),lo1
+	ldws	 0(%r31),hi2
+	ldws	 4(%r31),lo2
+	ldws	 8(%r31),hi3
+	ldws	12(%r31),lo3
+
+	addc	 lo0,cylimb,lo0
+	addc	 lo1,hi0,lo1
+	addc	 lo2,hi1,lo2
+	addc	 lo3,hi2,lo3
+
+	addib,<	 -4,size,L$end
+	addc	 %r0,hi3,cylimb			; propagate carry into cylimb
+; main loop ------------------------------------------------------------------
+L$loop	fldds,ma 8(s1_ptr),%fr4
+	fldds,ma 8(s1_ptr),%fr8
+
+	ldws	 0(res_ptr),s0
+	xmpyu	 %fr4L,%fr31R,%fr5
+	ldws	 4(res_ptr),s1
+	xmpyu	 %fr4R,%fr31R,%fr6
+	ldws	 8(res_ptr),s2
+	xmpyu	 %fr8L,%fr31R,%fr9
+	ldws	12(res_ptr),s3
+	xmpyu	 %fr8R,%fr31R,%fr10
+
+	fstds	 %fr5,-16(%r31)
+	sub	 s0,lo0,s0
+	fstds	 %fr6,-8(%r31)
+	subb	 s1,lo1,s1
+	fstds	 %fr9,0(%r31)
+	subb	 s2,lo2,s2
+	fstds	 %fr10,8(%r31)
+	subb	 s3,lo3,s3
+	subb	 %r0,%r0,lo0			; these two insns ...
+	add	 lo0,lo0,%r0			; ... just invert cy
+
+	ldws   -16(%r31),hi0
+	ldws   -12(%r31),lo0
+	ldws	-8(%r31),hi1
+	ldws	-4(%r31),lo1
+	ldws	 0(%r31),hi2
+	ldws	 4(%r31),lo2
+	ldws	 8(%r31),hi3
+	ldws	12(%r31),lo3
+
+	addc	 lo0,cylimb,lo0
+	stws,ma	 s0,4(res_ptr)
+	addc	 lo1,hi0,lo1
+	stws,ma	 s1,4(res_ptr)
+	addc	 lo2,hi1,lo2
+	stws,ma	 s2,4(res_ptr)
+	addc	 lo3,hi2,lo3
+	stws,ma	 s3,4(res_ptr)
+
+	addib,>= -4,size,L$loop
+	addc	 %r0,hi3,cylimb			; propagate carry into cylimb
+; finish software pipeline ---------------------------------------------------
+L$end	ldws	 0(res_ptr),s0
+	ldws	 4(res_ptr),s1
+	ldws	 8(res_ptr),s2
+	ldws	12(res_ptr),s3
+
+	sub	 s0,lo0,s0
+	stws,ma	 s0,4(res_ptr)
+	subb	 s1,lo1,s1
+	stws,ma	 s1,4(res_ptr)
+	subb	 s2,lo2,s2
+	stws,ma	 s2,4(res_ptr)
+	subb	 s3,lo3,s3
+	stws,ma	 s3,4(res_ptr)
+	subb	 %r0,%r0,lo0			; these two insns ...
+	add	 lo0,lo0,%r0			; ... invert cy
+
+; restore callee-saves registers ---------------------------------------------
+	ldw	-96(%r30),%r3
+	ldw	-92(%r30),%r4
+	ldw	-88(%r30),%r5
+	ldw	-84(%r30),%r6
+	ldw	-80(%r30),%r7
+
+L$few_limbs
+	addib,=,n 4,size,L$ret
+L$loop2	fldws,ma 4(s1_ptr),%fr4
+	ldws	 0(res_ptr),s0
+	xmpyu	 %fr4,%fr31R,%fr5
+	fstds	 %fr5,-16(%r30)
+	ldws	-16(%r30),hi0
+	ldws	-12(%r30),lo0
+	addc	 lo0,cylimb,lo0
+	addc	 %r0,hi0,cylimb
+	sub	 s0,lo0,s0
+	add	 s0,lo0,%r0			; invert cy
+	stws,ma	 s0,4(res_ptr)
+	addib,<> -1,size,L$loop2
+	nop
+
+L$ret	addc	 %r0,cylimb,cylimb
+	bv	 0(%r2)
+	ldo	 -128(%r30),%r30
+
+	.exit
+	.procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/submul_1.s b/rts/gmp/mpn/hppa/hppa1_1/submul_1.s
new file mode 100644
index 0000000000..20a5b5ce0a
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/submul_1.s
@@ -0,0 +1,111 @@
+; HP-PA-1.1 __gmpn_submul_1 -- Multiply a limb vector with a limb and
+; subtract the result from a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	r26
+; s1_ptr	r25
+; size		r24
+; s2_limb	r23
+
+; This runs at 12 cycles/limb on a PA7000.  With the used instructions, it
+; can not become faster due to data cache contention after a store.  On the
+; PA7100 it runs at 11 cycles/limb, and that can not be improved either,
+; since only the xmpyu does not need the integer pipeline, so the only
+; dual-issue we will get are addc+xmpyu.  Unrolling could gain a cycle/limb
+; on the PA7100.
+
+; There are some ideas described in mul_1.s that applies to this code too.
+
+; It seems possible to make this run as fast as __gmpn_addmul_1, if we use
+; 	sub,>>=	%r29,%r19,%r22
+;	addi	1,%r28,%r28
+; but that requires reworking the hairy software pipeline...
+
+	.code
+	.export		__gmpn_submul_1
+__gmpn_submul_1
+	.proc
+	.callinfo	frame=64,no_calls
+	.entry
+
+	ldo		64(%r30),%r30
+	fldws,ma	4(%r25),%fr5
+	stw		%r23,-16(%r30)		; move s2_limb ...
+	addib,=		-1,%r24,L$just_one_limb
+	 fldws		-16(%r30),%fr4		; ... into fr4
+	add		%r0,%r0,%r0		; clear carry
+	xmpyu		%fr4,%fr5,%fr6
+	fldws,ma	4(%r25),%fr7
+	fstds		%fr6,-16(%r30)
+	xmpyu		%fr4,%fr7,%fr8
+	ldw		-12(%r30),%r19		; least significant limb in product
+	ldw		-16(%r30),%r28
+
+	fstds		%fr8,-16(%r30)
+	addib,=		-1,%r24,L$end
+	 ldw		-12(%r30),%r1
+
+; Main loop
+L$loop	ldws		0(%r26),%r29
+	fldws,ma	4(%r25),%fr5
+	sub		%r29,%r19,%r22
+	add		%r22,%r19,%r0
+	stws,ma		%r22,4(%r26)
+	addc		%r28,%r1,%r19
+	xmpyu		%fr4,%fr5,%fr6
+	ldw		-16(%r30),%r28
+	fstds		%fr6,-16(%r30)
+	addc		%r0,%r28,%r28
+	addib,<>	-1,%r24,L$loop
+	 ldw		-12(%r30),%r1
+
+L$end	ldw		0(%r26),%r29
+	sub		%r29,%r19,%r22
+	add		%r22,%r19,%r0
+	stws,ma		%r22,4(%r26)
+	addc		%r28,%r1,%r19
+	ldw		-16(%r30),%r28
+	ldws		0(%r26),%r29
+	addc		%r0,%r28,%r28
+	sub		%r29,%r19,%r22
+	add		%r22,%r19,%r0
+	stws,ma		%r22,4(%r26)
+	addc		%r0,%r28,%r28
+	bv		0(%r2)
+	 ldo		-64(%r30),%r30
+
+L$just_one_limb
+	xmpyu		%fr4,%fr5,%fr6
+	ldw		0(%r26),%r29
+	fstds		%fr6,-16(%r30)
+	ldw		-12(%r30),%r1
+	ldw		-16(%r30),%r28
+	sub		%r29,%r1,%r22
+	add		%r22,%r1,%r0
+	stw		%r22,0(%r26)
+	addc		%r0,%r28,%r28
+	bv		0(%r2)
+	 ldo		-64(%r30),%r30
+
+	.exit
+	.procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S b/rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S
new file mode 100644
index 0000000000..b83d6f4dd2
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S
@@ -0,0 +1,80 @@
+; HP-PA  __udiv_qrnnd division support, used from longlong.h.
+; This version runs fast on PA 7000 and later.
+
+; Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; rem_ptr	gr26
+; n1		gr25
+; n0		gr24
+; d		gr23
+
+	.code
+L$0000	.word		0x43f00000		; 2^64
+	.word		0x0
+	.export		__gmpn_udiv_qrnnd
+__gmpn_udiv_qrnnd
+	.proc
+	.callinfo	frame=64,no_calls
+	.entry
+	ldo		64(%r30),%r30
+
+	stws		%r25,-16(0,%r30)	; n_hi
+	stws		%r24,-12(0,%r30)	; n_lo
+#ifdef PIC
+	addil		LT%L$0000,%r19
+	ldo		RT%L$0000(%r1),%r19
+#else
+	ldil		L%L$0000,%r19
+	ldo		R%L$0000(%r19),%r19
+#endif	
+	fldds		-16(0,%r30),%fr5
+	stws		%r23,-12(0,%r30)
+	comib,<=	0,%r25,L$1
+	fcnvxf,dbl,dbl	%fr5,%fr5
+	fldds		0(0,%r19),%fr4
+	fadd,dbl	%fr4,%fr5,%fr5
+L$1
+	fcpy,sgl	%fr0,%fr6L
+	fldws		-12(0,%r30),%fr6R
+	fcnvxf,dbl,dbl	%fr6,%fr4
+
+	fdiv,dbl	%fr5,%fr4,%fr5
+
+	fcnvfx,dbl,dbl	%fr5,%fr4
+	fstws		%fr4R,-16(%r30)
+	xmpyu		%fr4R,%fr6R,%fr6
+	ldws		-16(%r30),%r28
+	fstds		%fr6,-16(0,%r30)
+	ldws		-12(0,%r30),%r21
+	ldws		-16(0,%r30),%r20
+	sub		%r24,%r21,%r22
+	subb		%r25,%r20,%r19
+	comib,=		0,%r19,L$2
+	ldo		-64(%r30),%r30
+
+	add		%r22,%r23,%r22
+	ldo		-1(%r28),%r28
+L$2	bv		0(%r2)
+	stws		%r22,0(0,%r26)
+
+	.exit
+	.procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/umul.s b/rts/gmp/mpn/hppa/hppa1_1/umul.s
new file mode 100644
index 0000000000..1f1300ac9b
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/umul.s
@@ -0,0 +1,42 @@
+; Copyright (C) 1999 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+	.code
+	.export		__umul_ppmm
+	.align 4
+__umul_ppmm
+	.proc
+	.callinfo frame=64,no_calls
+	.entry
+
+	ldo 64(%r30),%r30
+	stw %r25,-16(0,%r30)
+	fldws -16(0,%r30),%fr22R
+	stw %r24,-16(0,%r30)
+	fldws -16(0,%r30),%fr22L
+	xmpyu %fr22R,%fr22L,%fr22
+	fstds %fr22,-16(0,%r30)
+	ldw -16(0,%r30),%r28
+	ldw -12(0,%r30),%r29
+	stw %r29,0(0,%r26)
+	bv 0(%r2)
+	ldo -64(%r30),%r30
+
+	.exit
+	.procend
diff --git a/rts/gmp/mpn/hppa/hppa2_0/add_n.s b/rts/gmp/mpn/hppa/hppa2_0/add_n.s
new file mode 100644
index 0000000000..6e97278a39
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa2_0/add_n.s
@@ -0,0 +1,88 @@
+; HP-PA 2.0 32-bit __gmpn_add_n -- Add two limb vectors of the same length > 0
+; and store sum in a third limb vector.
+
+; Copyright (C) 1997, 1998, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	gr26
+; s1_ptr	gr25
+; s2_ptr	gr24
+; size		gr23
+
+; This runs at 2 cycles/limb on PA8000.
+
+	.code
+	.export	__gmpn_add_n
+__gmpn_add_n
+	.proc
+	.callinfo frame=0,no_calls
+	.entry
+
+	sub		%r0,%r23,%r22
+	zdep		%r22,30,3,%r28		; r28 = 2 * (-n & 7)
+	zdep		%r22,29,3,%r22		; r22 = 4 * (-n & 7)
+	sub		%r25,%r22,%r25		; offset s1_ptr
+	sub		%r24,%r22,%r24		; offset s2_ptr
+	sub		%r26,%r22,%r26		; offset res_ptr
+	blr		%r28,%r0		; branch into loop
+	add		%r0,%r0,%r0		; reset carry
+
+L$loop	ldw		0(%r25),%r20
+	ldw		0(%r24),%r31
+	addc		%r20,%r31,%r20
+	stw		%r20,0(%r26)
+L$7	ldw		4(%r25),%r21
+	ldw		4(%r24),%r19
+	addc		%r21,%r19,%r21
+	stw		%r21,4(%r26)
+L$6	ldw		8(%r25),%r20
+	ldw		8(%r24),%r31
+	addc		%r20,%r31,%r20
+	stw		%r20,8(%r26)
+L$5	ldw		12(%r25),%r21
+	ldw		12(%r24),%r19
+	addc		%r21,%r19,%r21
+	stw		%r21,12(%r26)
+L$4	ldw		16(%r25),%r20
+	ldw		16(%r24),%r31
+	addc		%r20,%r31,%r20
+	stw		%r20,16(%r26)
+L$3	ldw		20(%r25),%r21
+	ldw		20(%r24),%r19
+	addc		%r21,%r19,%r21
+	stw		%r21,20(%r26)
+L$2	ldw		24(%r25),%r20
+	ldw		24(%r24),%r31
+	addc		%r20,%r31,%r20
+	stw		%r20,24(%r26)
+L$1	ldw		28(%r25),%r21
+	ldo		32(%r25),%r25
+	ldw		28(%r24),%r19
+	addc		%r21,%r19,%r21
+	stw		%r21,28(%r26)
+	ldo		32(%r24),%r24
+	addib,>		-8,%r23,L$loop
+	ldo		32(%r26),%r26
+
+	bv		(%r2)
+	.exit
+	addc		%r0,%r0,%r28
+	.procend
diff --git a/rts/gmp/mpn/hppa/hppa2_0/sub_n.s b/rts/gmp/mpn/hppa/hppa2_0/sub_n.s
new file mode 100644
index 0000000000..7d9b50fc27
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa2_0/sub_n.s
@@ -0,0 +1,88 @@
+; HP-PA 2.0 32-bit __gmpn_sub_n -- Subtract two limb vectors of the same
+; length > 0 and store difference in a third limb vector.
+
+; Copyright (C) 1997, 1998, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	gr26
+; s1_ptr	gr25
+; s2_ptr	gr24
+; size		gr23
+
+; This runs at 2 cycles/limb on PA8000.
+
+	.code
+	.export	__gmpn_sub_n
+__gmpn_sub_n
+	.proc
+	.callinfo frame=0,no_calls
+	.entry
+
+	sub		%r0,%r23,%r22
+	zdep		%r22,30,3,%r28		; r28 = 2 * (-n & 7)
+	zdep		%r22,29,3,%r22		; r22 = 4 * (-n & 7)
+	sub		%r25,%r22,%r25		; offset s1_ptr
+	sub		%r24,%r22,%r24		; offset s2_ptr
+	blr		%r28,%r0		; branch into loop
+	sub		%r26,%r22,%r26		; offset res_ptr and set carry
+
+L$loop	ldw		0(%r25),%r20
+	ldw		0(%r24),%r31
+	subb		%r20,%r31,%r20
+	stw		%r20,0(%r26)
+L$7	ldw		4(%r25),%r21
+	ldw		4(%r24),%r19
+	subb		%r21,%r19,%r21
+	stw		%r21,4(%r26)
+L$6	ldw		8(%r25),%r20
+	ldw		8(%r24),%r31
+	subb		%r20,%r31,%r20
+	stw		%r20,8(%r26)
+L$5	ldw		12(%r25),%r21
+	ldw		12(%r24),%r19
+	subb		%r21,%r19,%r21
+	stw		%r21,12(%r26)
+L$4	ldw		16(%r25),%r20
+	ldw		16(%r24),%r31
+	subb		%r20,%r31,%r20
+	stw		%r20,16(%r26)
+L$3	ldw		20(%r25),%r21
+	ldw		20(%r24),%r19
+	subb		%r21,%r19,%r21
+	stw		%r21,20(%r26)
+L$2	ldw		24(%r25),%r20
+	ldw		24(%r24),%r31
+	subb		%r20,%r31,%r20
+	stw		%r20,24(%r26)
+L$1	ldw		28(%r25),%r21
+	ldo		32(%r25),%r25
+	ldw		28(%r24),%r19
+	subb		%r21,%r19,%r21
+	stw		%r21,28(%r26)
+	ldo		32(%r24),%r24
+	addib,>		-8,%r23,L$loop
+	ldo		32(%r26),%r26
+
+	addc		%r0,%r0,%r28
+	bv		(%r2)
+	.exit
+	subi		1,%r28,%r28
+	.procend
diff --git a/rts/gmp/mpn/hppa/lshift.s b/rts/gmp/mpn/hppa/lshift.s
new file mode 100644
index 0000000000..f5a2daad60
--- /dev/null
+++ b/rts/gmp/mpn/hppa/lshift.s
@@ -0,0 +1,66 @@
+; HP-PA  __gmpn_lshift --
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	gr26
+; s_ptr		gr25
+; size		gr24
+; cnt		gr23
+
+	.code
+	.export		__gmpn_lshift
+__gmpn_lshift
+	.proc
+	.callinfo	frame=64,no_calls
+	.entry
+
+	sh2add		%r24,%r25,%r25
+	sh2add		%r24,%r26,%r26
+	ldws,mb		-4(0,%r25),%r22
+	subi		32,%r23,%r1
+	mtsar		%r1
+	addib,=		-1,%r24,L$0004
+	vshd		%r0,%r22,%r28		; compute carry out limb
+	ldws,mb		-4(0,%r25),%r29
+	addib,=		-1,%r24,L$0002
+	vshd		%r22,%r29,%r20
+
+L$loop	ldws,mb		-4(0,%r25),%r22
+	stws,mb		%r20,-4(0,%r26)
+	addib,=		-1,%r24,L$0003
+	vshd		%r29,%r22,%r20
+	ldws,mb		-4(0,%r25),%r29
+	stws,mb		%r20,-4(0,%r26)
+	addib,<>	-1,%r24,L$loop
+	vshd		%r22,%r29,%r20
+
+L$0002	stws,mb		%r20,-4(0,%r26)
+	vshd		%r29,%r0,%r20
+	bv		0(%r2)
+	stw		%r20,-4(0,%r26)
+L$0003	stws,mb		%r20,-4(0,%r26)
+L$0004	vshd		%r22,%r0,%r20
+	bv		0(%r2)
+	stw		%r20,-4(0,%r26)
+
+	.exit
+	.procend
diff --git a/rts/gmp/mpn/hppa/rshift.s b/rts/gmp/mpn/hppa/rshift.s
new file mode 100644
index 0000000000..e05e2f10b5
--- /dev/null
+++ b/rts/gmp/mpn/hppa/rshift.s
@@ -0,0 +1,63 @@
+; HP-PA  __gmpn_rshift -- 
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	gr26
+; s_ptr		gr25
+; size		gr24
+; cnt		gr23
+
+	.code
+	.export		__gmpn_rshift
+__gmpn_rshift
+	.proc
+	.callinfo	frame=64,no_calls
+	.entry
+
+	ldws,ma		4(0,%r25),%r22
+	mtsar		%r23
+	addib,=		-1,%r24,L$0004
+	vshd		%r22,%r0,%r28		; compute carry out limb
+	ldws,ma		4(0,%r25),%r29
+	addib,=		-1,%r24,L$0002
+	vshd		%r29,%r22,%r20
+
+L$loop	ldws,ma		4(0,%r25),%r22
+	stws,ma		%r20,4(0,%r26)
+	addib,=		-1,%r24,L$0003
+	vshd		%r22,%r29,%r20
+	ldws,ma		4(0,%r25),%r29
+	stws,ma		%r20,4(0,%r26)
+	addib,<>	-1,%r24,L$loop
+	vshd		%r29,%r22,%r20
+
+L$0002	stws,ma		%r20,4(0,%r26)
+	vshd		%r0,%r29,%r20
+	bv		0(%r2)
+	stw		%r20,0(0,%r26)
+L$0003	stws,ma		%r20,4(0,%r26)
+L$0004	vshd		%r0,%r22,%r20
+	bv		0(%r2)
+	stw		%r20,0(0,%r26)
+
+	.exit
+	.procend
diff --git a/rts/gmp/mpn/hppa/sub_n.s b/rts/gmp/mpn/hppa/sub_n.s
new file mode 100644
index 0000000000..8f770ad1ad
--- /dev/null
+++ b/rts/gmp/mpn/hppa/sub_n.s
@@ -0,0 +1,59 @@
+; HP-PA  __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	gr26
+; s1_ptr	gr25
+; s2_ptr	gr24
+; size		gr23
+
+; One might want to unroll this as for other processors, but it turns
+; out that the data cache contention after a store makes such
+; unrolling useless.  We can't come under 5 cycles/limb anyway.
+
+	.code
+	.export		__gmpn_sub_n
+__gmpn_sub_n
+	.proc
+	.callinfo	frame=0,no_calls
+	.entry
+
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+
+	addib,=		-1,%r23,L$end	; check for (SIZE == 1)
+	 sub		%r20,%r19,%r28	; subtract first limbs ignoring cy
+
+L$loop	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	addib,<>	-1,%r23,L$loop
+	 subb		%r20,%r19,%r28
+
+L$end	stws		%r28,0(0,%r26)
+	addc		%r0,%r0,%r28
+	bv		0(%r2)
+	 subi		1,%r28,%r28
+
+	.exit
+	.procend
diff --git a/rts/gmp/mpn/hppa/udiv_qrnnd.s b/rts/gmp/mpn/hppa/udiv_qrnnd.s
new file mode 100644
index 0000000000..9aa3b8a830
--- /dev/null
+++ b/rts/gmp/mpn/hppa/udiv_qrnnd.s
@@ -0,0 +1,286 @@
+; HP-PA  __udiv_qrnnd division support, used from longlong.h.
+; This version runs fast on pre-PA7000 CPUs.
+
+; Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; rem_ptr	gr26
+; n1		gr25
+; n0		gr24
+; d		gr23
+
+; The code size is a bit excessive.  We could merge the last two ds;addc
+; sequences by simply moving the "bb,< Odd" instruction down.  The only
+; trouble is the FFFFFFFF code that would need some hacking.
+
+	.code
+	.export		__gmpn_udiv_qrnnd
+__gmpn_udiv_qrnnd
+	.proc
+	.callinfo	frame=0,no_calls
+	.entry
+
+	comb,<		%r23,0,L$largedivisor
+	 sub		%r0,%r23,%r1		; clear cy as side-effect
+	ds		%r0,%r1,%r0
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r28
+	ds		%r25,%r23,%r25
+	comclr,>=	%r25,%r0,%r0
+	addl		%r25,%r23,%r25
+	stws		%r25,0(0,%r26)
+	bv		0(%r2)
+	 addc		%r28,%r28,%r28
+
+L$largedivisor
+	extru		%r24,31,1,%r19		; r19 = n0 & 1
+	bb,<		%r23,31,L$odd
+	 extru		%r23,30,31,%r22		; r22 = d >> 1
+	shd		%r25,%r24,1,%r24	; r24 = new n0
+	extru		%r25,30,31,%r25		; r25 = new n1
+	sub		%r0,%r22,%r21
+	ds		%r0,%r21,%r0
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	comclr,>=	%r25,%r0,%r0
+	addl		%r25,%r22,%r25
+	sh1addl		%r25,%r19,%r25
+	stws		%r25,0(0,%r26)
+	bv		0(%r2)
+	 addc		%r24,%r24,%r28
+
+L$odd	addib,sv,n	1,%r22,L$FF..		; r22 = (d / 2 + 1)
+	shd		%r25,%r24,1,%r24	; r24 = new n0
+	extru		%r25,30,31,%r25		; r25 = new n1
+	sub		%r0,%r22,%r21
+	ds		%r0,%r21,%r0
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r28
+	comclr,>=	%r25,%r0,%r0
+	addl		%r25,%r22,%r25
+	sh1addl		%r25,%r19,%r25
+; We have computed (n1,,n0) / (d + 1), q' = r28, r' = r25
+	add,nuv		%r28,%r25,%r25
+	addl		%r25,%r1,%r25
+	addc		%r0,%r28,%r28
+	sub,<<		%r25,%r23,%r0
+	addl		%r25,%r1,%r25
+	stws		%r25,0(0,%r26)
+	bv		0(%r2)
+	 addc		%r0,%r28,%r28
+
+; This is just a special case of the code above.
+; We come here when d == 0xFFFFFFFF
+L$FF..	add,uv		%r25,%r24,%r24
+	sub,<<		%r24,%r23,%r0
+	ldo		1(%r24),%r24
+	stws		%r24,0(0,%r26)
+	bv		0(%r2)
+	 addc		%r0,%r25,%r28
+
+	.exit
+	.procend