summaryrefslogtreecommitdiff
path: root/mpn/hppa/hppa1_1
diff options
context:
space:
mode:
Diffstat (limited to 'mpn/hppa/hppa1_1')
-rw-r--r--mpn/hppa/hppa1_1/addmul_1.s102
-rw-r--r--mpn/hppa/hppa1_1/mul_1.s98
-rw-r--r--mpn/hppa/hppa1_1/pa7100/add_n.s75
-rw-r--r--mpn/hppa/hppa1_1/pa7100/addmul_1.S189
-rw-r--r--mpn/hppa/hppa1_1/pa7100/lshift.s83
-rw-r--r--mpn/hppa/hppa1_1/pa7100/rshift.s80
-rw-r--r--mpn/hppa/hppa1_1/pa7100/sub_n.s76
-rw-r--r--mpn/hppa/hppa1_1/pa7100/submul_1.S195
-rw-r--r--mpn/hppa/hppa1_1/submul_1.s111
-rw-r--r--mpn/hppa/hppa1_1/udiv_qrnnd.s75
10 files changed, 1084 insertions, 0 deletions
diff --git a/mpn/hppa/hppa1_1/addmul_1.s b/mpn/hppa/hppa1_1/addmul_1.s
new file mode 100644
index 000000000..0fdcb3cb2
--- /dev/null
+++ b/mpn/hppa/hppa1_1/addmul_1.s
@@ -0,0 +1,102 @@
+; HP-PA-1.1 __mpn_addmul_1 -- Multiply a limb vector with a limb and
+; add the result to a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r26
+; s1_ptr r25
+; size r24
+; s2_limb r23
+
+; This runs at 11 cycles/limb on a PA7000. With the used instructions, it
+; can not become faster due to data cache contention after a store. On the
+; PA7100 it runs at 10 cycles/limb, and that can not be improved either,
+; since only the xmpyu does not need the integer pipeline, so the only
+; dual-issue we will get are addc+xmpyu. Unrolling could gain a cycle/limb
+; on the PA7100.
+
+; There are some ideas described in mul_1.s that applies to this code too.
+
+ .code
+ .export __mpn_addmul_1
+__mpn_addmul_1
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ ldo 64(%r30),%r30
+ fldws,ma 4(%r25),%fr5
+ stw %r23,-16(%r30) ; move s2_limb ...
+ addib,= -1,%r24,L$just_one_limb
+ fldws -16(%r30),%fr4 ; ... into fr4
+ add %r0,%r0,%r0 ; clear carry
+ xmpyu %fr4,%fr5,%fr6
+ fldws,ma 4(%r25),%fr7
+ fstds %fr6,-16(%r30)
+ xmpyu %fr4,%fr7,%fr8
+ ldw -12(%r30),%r19 ; least significant limb in product
+ ldw -16(%r30),%r28
+
+ fstds %fr8,-16(%r30)
+ addib,= -1,%r24,L$end
+ ldw -12(%r30),%r1
+
+; Main loop
+L$loop ldws 0(%r26),%r29
+ fldws,ma 4(%r25),%fr5
+ add %r29,%r19,%r19
+ stws,ma %r19,4(%r26)
+ addc %r28,%r1,%r19
+ xmpyu %fr4,%fr5,%fr6
+ ldw -16(%r30),%r28
+ fstds %fr6,-16(%r30)
+ addc %r0,%r28,%r28
+ addib,<> -1,%r24,L$loop
+ ldw -12(%r30),%r1
+
+L$end ldw 0(%r26),%r29
+ add %r29,%r19,%r19
+ stws,ma %r19,4(%r26)
+ addc %r28,%r1,%r19
+ ldw -16(%r30),%r28
+ ldws 0(%r26),%r29
+ addc %r0,%r28,%r28
+ add %r29,%r19,%r19
+ stws,ma %r19,4(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+L$just_one_limb
+ xmpyu %fr4,%fr5,%fr6
+ ldw 0(%r26),%r29
+ fstds %fr6,-16(%r30)
+ ldw -12(%r30),%r1
+ ldw -16(%r30),%r28
+ add %r29,%r1,%r19
+ stw %r19,0(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+ .exit
+ .procend
diff --git a/mpn/hppa/hppa1_1/mul_1.s b/mpn/hppa/hppa1_1/mul_1.s
new file mode 100644
index 000000000..cdd0c1d7f
--- /dev/null
+++ b/mpn/hppa/hppa1_1/mul_1.s
@@ -0,0 +1,98 @@
+; HP-PA-1.1 __mpn_mul_1 -- Multiply a limb vector with a limb and store
+; the result in a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r26
+; s1_ptr r25
+; size r24
+; s2_limb r23
+
+; This runs at 9 cycles/limb on a PA7000. With the used instructions, it can
+; not become faster due to data cache contention after a store. On the
+; PA7100 it runs at 7 cycles/limb, and that can not be improved either, since
+; only the xmpyu does not need the integer pipeline, so the only dual-issue
+; we will get are addc+xmpyu. Unrolling would not help either CPU.
+
+; We could use fldds to read two limbs at a time from the S1 array, and that
+; could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and
+; PA7100, respectively. We don't do that since it does not seem worth the
+; (alignment) troubles...
+
+; At least the PA7100 is rumored to be able to deal with cache-misses
+; without stalling instruction issue. If this is true, and the cache is
+; actually also lockup-free, we should use a deeper software pipeline, and
+; load from S1 very early! (The loads and stores to -12(sp) will surely be
+; in the cache.)
+
+ .code
+ .export __mpn_mul_1
+__mpn_mul_1
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ ldo 64(%r30),%r30
+ fldws,ma 4(%r25),%fr5
+ stw %r23,-16(%r30) ; move s2_limb ...
+ addib,= -1,%r24,L$just_one_limb
+ fldws -16(%r30),%fr4 ; ... into fr4
+ add %r0,%r0,%r0 ; clear carry
+ xmpyu %fr4,%fr5,%fr6
+ fldws,ma 4(%r25),%fr7
+ fstds %fr6,-16(%r30)
+ xmpyu %fr4,%fr7,%fr8
+ ldw -12(%r30),%r19 ; least significant limb in product
+ ldw -16(%r30),%r28
+
+ fstds %fr8,-16(%r30)
+ addib,= -1,%r24,L$end
+ ldw -12(%r30),%r1
+
+; Main loop
+L$loop fldws,ma 4(%r25),%fr5
+ stws,ma %r19,4(%r26)
+ addc %r28,%r1,%r19
+ xmpyu %fr4,%fr5,%fr6
+ ldw -16(%r30),%r28
+ fstds %fr6,-16(%r30)
+ addib,<> -1,%r24,L$loop
+ ldw -12(%r30),%r1
+
+L$end stws,ma %r19,4(%r26)
+ addc %r28,%r1,%r19
+ ldw -16(%r30),%r28
+ stws,ma %r19,4(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+L$just_one_limb
+ xmpyu %fr4,%fr5,%fr6
+ fstds %fr6,-16(%r30)
+ ldw -16(%r30),%r28
+ ldo -64(%r30),%r30
+ bv 0(%r2)
+ fstws %fr6R,0(%r26)
+
+ .exit
+ .procend
diff --git a/mpn/hppa/hppa1_1/pa7100/add_n.s b/mpn/hppa/hppa1_1/pa7100/add_n.s
new file mode 100644
index 000000000..21fe16154
--- /dev/null
+++ b/mpn/hppa/hppa1_1/pa7100/add_n.s
@@ -0,0 +1,75 @@
+; HP-PA __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+; This is optimized for the PA7100, where is runs at 4.25 cycles/limb
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+ .code
+ .export __mpn_add_n
+__mpn_add_n
+ .proc
+ .callinfo frame=0,no_calls
+ .entry
+
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+
+ addib,<= -5,%r23,L$rest
+ add %r20,%r19,%r28 ; add first limbs ignoring cy
+
+L$loop ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addc %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addc %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addc %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -4,%r23,L$loop
+ addc %r20,%r19,%r28
+
+L$rest addib,= 4,%r23,L$end
+ nop
+L$eloop ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -1,%r23,L$eloop
+ addc %r20,%r19,%r28
+
+L$end stws %r28,0(0,%r26)
+ bv 0(%r2)
+ addc %r0,%r0,%r28
+
+ .exit
+ .procend
diff --git a/mpn/hppa/hppa1_1/pa7100/addmul_1.S b/mpn/hppa/hppa1_1/pa7100/addmul_1.S
new file mode 100644
index 000000000..eb1d12bf6
--- /dev/null
+++ b/mpn/hppa/hppa1_1/pa7100/addmul_1.S
@@ -0,0 +1,189 @@
+; HP-PA 7100/7200 __mpn_addmul_1 -- Multiply a limb vector with a limb and
+; add the result to a second limb vector.
+
+; Copyright (C) 1995 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define res_ptr %r26
+#define s1_ptr %r25
+#define size %r24
+#define s2_limb %r23
+
+#define cylimb %r28
+#define s0 %r19
+#define s1 %r20
+#define s2 %r3
+#define s3 %r4
+#define lo0 %r21
+#define lo1 %r5
+#define lo2 %r6
+#define lo3 %r7
+#define hi0 %r22
+#define hi1 %r23 /* safe to reuse */
+#define hi2 %r29
+#define hi3 %r1
+
+ .code
+ .export __mpn_addmul_1
+__mpn_addmul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+
+ ldo 128(%r30),%r30
+ stws s2_limb,-16(%r30)
+ add %r0,%r0,cylimb ; clear cy and cylimb
+ addib,< -4,size,L$few_limbs
+ fldws -16(%r30),%fr31R
+
+ ldo -112(%r30),%r31
+ stw %r3,-96(%r30)
+ stw %r4,-92(%r30)
+ stw %r5,-88(%r30)
+ stw %r6,-84(%r30)
+ stw %r7,-80(%r30)
+
+ bb,>=,n s1_ptr,29,L$0
+
+ fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r31)
+ ldws -16(%r31),cylimb
+ ldws -12(%r31),lo0
+ add s0,lo0,s0
+ addib,< -1,size,L$few_limbs
+ stws,ma s0,4(res_ptr)
+
+; start software pipeline ----------------------------------------------------
+L$0 fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ xmpyu %fr4L,%fr31R,%fr5
+ xmpyu %fr4R,%fr31R,%fr6
+ xmpyu %fr8L,%fr31R,%fr9
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ fstds %fr6,-8(%r31)
+ fstds %fr9,0(%r31)
+ fstds %fr10,8(%r31)
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ addc lo1,hi0,lo1
+ addc lo2,hi1,lo2
+ addc lo3,hi2,lo3
+
+ addib,< -4,size,L$end
+ addc %r0,hi3,cylimb ; propagate carry into cylimb
+; main loop ------------------------------------------------------------------
+L$loop fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ ldws 0(res_ptr),s0
+ xmpyu %fr4L,%fr31R,%fr5
+ ldws 4(res_ptr),s1
+ xmpyu %fr4R,%fr31R,%fr6
+ ldws 8(res_ptr),s2
+ xmpyu %fr8L,%fr31R,%fr9
+ ldws 12(res_ptr),s3
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ add s0,lo0,s0
+ fstds %fr6,-8(%r31)
+ addc s1,lo1,s1
+ fstds %fr9,0(%r31)
+ addc s2,lo2,s2
+ fstds %fr10,8(%r31)
+ addc s3,lo3,s3
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ stws,ma s0,4(res_ptr)
+ addc lo1,hi0,lo1
+ stws,ma s1,4(res_ptr)
+ addc lo2,hi1,lo2
+ stws,ma s2,4(res_ptr)
+ addc lo3,hi2,lo3
+ stws,ma s3,4(res_ptr)
+
+ addib,>= -4,size,L$loop
+ addc %r0,hi3,cylimb ; propagate carry into cylimb
+; finish software pipeline ---------------------------------------------------
+L$end ldws 0(res_ptr),s0
+ ldws 4(res_ptr),s1
+ ldws 8(res_ptr),s2
+ ldws 12(res_ptr),s3
+
+ add s0,lo0,s0
+ stws,ma s0,4(res_ptr)
+ addc s1,lo1,s1
+ stws,ma s1,4(res_ptr)
+ addc s2,lo2,s2
+ stws,ma s2,4(res_ptr)
+ addc s3,lo3,s3
+ stws,ma s3,4(res_ptr)
+
+; restore callee-saves registers ---------------------------------------------
+ ldw -96(%r30),%r3
+ ldw -92(%r30),%r4
+ ldw -88(%r30),%r5
+ ldw -84(%r30),%r6
+ ldw -80(%r30),%r7
+
+L$few_limbs
+ addib,=,n 4,size,L$ret
+L$loop2 fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r30)
+ ldws -16(%r30),hi0
+ ldws -12(%r30),lo0
+ addc lo0,cylimb,lo0
+ addc %r0,hi0,cylimb
+ add s0,lo0,s0
+ stws,ma s0,4(res_ptr)
+ addib,<> -1,size,L$loop2
+ nop
+
+L$ret addc %r0,cylimb,cylimb
+ bv 0(%r2)
+ ldo -128(%r30),%r30
+
+ .exit
+ .procend
diff --git a/mpn/hppa/hppa1_1/pa7100/lshift.s b/mpn/hppa/hppa1_1/pa7100/lshift.s
new file mode 100644
index 000000000..4c74a505a
--- /dev/null
+++ b/mpn/hppa/hppa1_1/pa7100/lshift.s
@@ -0,0 +1,83 @@
+; HP-PA __mpn_lshift --
+; This is optimized for the PA7100, where is runs at 3.25 cycles/limb
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s_ptr gr25
+; size gr24
+; cnt gr23
+
+ .code
+ .export __mpn_lshift
+__mpn_lshift
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ sh2add %r24,%r25,%r25
+ sh2add %r24,%r26,%r26
+ ldws,mb -4(0,%r25),%r22
+ subi 32,%r23,%r1
+ mtsar %r1
+ addib,= -1,%r24,L$0004
+ vshd %r0,%r22,%r28 ; compute carry out limb
+ ldws,mb -4(0,%r25),%r29
+ addib,<= -5,%r24,L$rest
+ vshd %r22,%r29,%r20
+
+L$loop ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ vshd %r22,%r29,%r20
+ ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ addib,> -4,%r24,L$loop
+ vshd %r22,%r29,%r20
+
+L$rest addib,= 4,%r24,L$end1
+ nop
+L$eloop ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ addib,<= -1,%r24,L$end2
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ addib,> -1,%r24,L$eloop
+ vshd %r22,%r29,%r20
+
+L$end1 stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r0,%r20
+ bv 0(%r2)
+ stw %r20,-4(0,%r26)
+L$end2 stws,mb %r20,-4(0,%r26)
+L$0004 vshd %r22,%r0,%r20
+ bv 0(%r2)
+ stw %r20,-4(0,%r26)
+
+ .exit
+ .procend
diff --git a/mpn/hppa/hppa1_1/pa7100/rshift.s b/mpn/hppa/hppa1_1/pa7100/rshift.s
new file mode 100644
index 000000000..845418c53
--- /dev/null
+++ b/mpn/hppa/hppa1_1/pa7100/rshift.s
@@ -0,0 +1,80 @@
+; HP-PA __mpn_rshift --
+; This is optimized for the PA7100, where is runs at 3.25 cycles/limb
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s_ptr gr25
+; size gr24
+; cnt gr23
+
+ .code
+ .export __mpn_rshift
+__mpn_rshift
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ ldws,ma 4(0,%r25),%r22
+ mtsar %r23
+ addib,= -1,%r24,L$0004
+ vshd %r22,%r0,%r28 ; compute carry out limb
+ ldws,ma 4(0,%r25),%r29
+ addib,<= -5,%r24,L$rest
+ vshd %r29,%r22,%r20
+
+L$loop ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ vshd %r29,%r22,%r20
+ ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ addib,> -4,%r24,L$loop
+ vshd %r29,%r22,%r20
+
+L$rest addib,= 4,%r24,L$end1
+ nop
+L$eloop ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ addib,<= -1,%r24,L$end2
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ addib,> -1,%r24,L$eloop
+ vshd %r29,%r22,%r20
+
+L$end1 stws,ma %r20,4(0,%r26)
+ vshd %r0,%r29,%r20
+ bv 0(%r2)
+ stw %r20,0(0,%r26)
+L$end2 stws,ma %r20,4(0,%r26)
+L$0004 vshd %r0,%r22,%r20
+ bv 0(%r2)
+ stw %r20,0(0,%r26)
+
+ .exit
+ .procend
diff --git a/mpn/hppa/hppa1_1/pa7100/sub_n.s b/mpn/hppa/hppa1_1/pa7100/sub_n.s
new file mode 100644
index 000000000..1e1ebcf91
--- /dev/null
+++ b/mpn/hppa/hppa1_1/pa7100/sub_n.s
@@ -0,0 +1,76 @@
+; HP-PA __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+; This is optimized for the PA7100, where is runs at 4.25 cycles/limb
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+ .code
+ .export __mpn_sub_n
+__mpn_sub_n
+ .proc
+ .callinfo frame=0,no_calls
+ .entry
+
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+
+ addib,<= -5,%r23,L$rest
+ sub %r20,%r19,%r28 ; subtract first limbs ignoring cy
+
+L$loop ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ subb %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ subb %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ subb %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -4,%r23,L$loop
+ subb %r20,%r19,%r28
+
+L$rest addib,= 4,%r23,L$end
+ nop
+L$eloop ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -1,%r23,L$eloop
+ subb %r20,%r19,%r28
+
+L$end stws %r28,0(0,%r26)
+ addc %r0,%r0,%r28
+ bv 0(%r2)
+ subi 1,%r28,%r28
+
+ .exit
+ .procend
diff --git a/mpn/hppa/hppa1_1/pa7100/submul_1.S b/mpn/hppa/hppa1_1/pa7100/submul_1.S
new file mode 100644
index 000000000..a71176e68
--- /dev/null
+++ b/mpn/hppa/hppa1_1/pa7100/submul_1.S
@@ -0,0 +1,195 @@
+; HP-PA 7100/7200 __mpn_submul_1 -- Multiply a limb vector with a limb and
+; subtract the result from a second limb vector.
+
+; Copyright (C) 1995 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define res_ptr %r26
+#define s1_ptr %r25
+#define size %r24
+#define s2_limb %r23
+
+#define cylimb %r28
+#define s0 %r19
+#define s1 %r20
+#define s2 %r3
+#define s3 %r4
+#define lo0 %r21
+#define lo1 %r5
+#define lo2 %r6
+#define lo3 %r7
+#define hi0 %r22
+#define hi1 %r23 /* safe to reuse */
+#define hi2 %r29
+#define hi3 %r1
+
+ .code
+ .export __mpn_submul_1
+__mpn_submul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+
+ ldo 128(%r30),%r30
+ stws s2_limb,-16(%r30)
+ add %r0,%r0,cylimb ; clear cy and cylimb
+ addib,< -4,size,L$few_limbs
+ fldws -16(%r30),%fr31R
+
+ ldo -112(%r30),%r31
+ stw %r3,-96(%r30)
+ stw %r4,-92(%r30)
+ stw %r5,-88(%r30)
+ stw %r6,-84(%r30)
+ stw %r7,-80(%r30)
+
+ bb,>=,n s1_ptr,29,L$0
+
+ fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r31)
+ ldws -16(%r31),cylimb
+ ldws -12(%r31),lo0
+ sub s0,lo0,s0
+ add s0,lo0,%r0 ; invert cy
+ addib,< -1,size,L$few_limbs
+ stws,ma s0,4(res_ptr)
+
+; start software pipeline ----------------------------------------------------
+L$0 fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ xmpyu %fr4L,%fr31R,%fr5
+ xmpyu %fr4R,%fr31R,%fr6
+ xmpyu %fr8L,%fr31R,%fr9
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ fstds %fr6,-8(%r31)
+ fstds %fr9,0(%r31)
+ fstds %fr10,8(%r31)
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ addc lo1,hi0,lo1
+ addc lo2,hi1,lo2
+ addc lo3,hi2,lo3
+
+ addib,< -4,size,L$end
+ addc %r0,hi3,cylimb ; propagate carry into cylimb
+; main loop ------------------------------------------------------------------
+L$loop fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ ldws 0(res_ptr),s0
+ xmpyu %fr4L,%fr31R,%fr5
+ ldws 4(res_ptr),s1
+ xmpyu %fr4R,%fr31R,%fr6
+ ldws 8(res_ptr),s2
+ xmpyu %fr8L,%fr31R,%fr9
+ ldws 12(res_ptr),s3
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ sub s0,lo0,s0
+ fstds %fr6,-8(%r31)
+ subb s1,lo1,s1
+ fstds %fr9,0(%r31)
+ subb s2,lo2,s2
+ fstds %fr10,8(%r31)
+ subb s3,lo3,s3
+ subb %r0,%r0,lo0 ; these two insns ...
+ add lo0,lo0,%r0 ; ... just invert cy
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ stws,ma s0,4(res_ptr)
+ addc lo1,hi0,lo1
+ stws,ma s1,4(res_ptr)
+ addc lo2,hi1,lo2
+ stws,ma s2,4(res_ptr)
+ addc lo3,hi2,lo3
+ stws,ma s3,4(res_ptr)
+
+ addib,>= -4,size,L$loop
+ addc %r0,hi3,cylimb ; propagate carry into cylimb
+; finish software pipeline ---------------------------------------------------
+L$end ldws 0(res_ptr),s0
+ ldws 4(res_ptr),s1
+ ldws 8(res_ptr),s2
+ ldws 12(res_ptr),s3
+
+ sub s0,lo0,s0
+ stws,ma s0,4(res_ptr)
+ subb s1,lo1,s1
+ stws,ma s1,4(res_ptr)
+ subb s2,lo2,s2
+ stws,ma s2,4(res_ptr)
+ subb s3,lo3,s3
+ stws,ma s3,4(res_ptr)
+ subb %r0,%r0,lo0 ; these two insns ...
+ add lo0,lo0,%r0 ; ... invert cy
+
+; restore callee-saves registers ---------------------------------------------
+ ldw -96(%r30),%r3
+ ldw -92(%r30),%r4
+ ldw -88(%r30),%r5
+ ldw -84(%r30),%r6
+ ldw -80(%r30),%r7
+
+L$few_limbs
+ addib,=,n 4,size,L$ret
+L$loop2 fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r30)
+ ldws -16(%r30),hi0
+ ldws -12(%r30),lo0
+ addc lo0,cylimb,lo0
+ addc %r0,hi0,cylimb
+ sub s0,lo0,s0
+ add s0,lo0,%r0 ; invert cy
+ stws,ma s0,4(res_ptr)
+ addib,<> -1,size,L$loop2
+ nop
+
+L$ret addc %r0,cylimb,cylimb
+ bv 0(%r2)
+ ldo -128(%r30),%r30
+
+ .exit
+ .procend
diff --git a/mpn/hppa/hppa1_1/submul_1.s b/mpn/hppa/hppa1_1/submul_1.s
new file mode 100644
index 000000000..a4a385467
--- /dev/null
+++ b/mpn/hppa/hppa1_1/submul_1.s
@@ -0,0 +1,111 @@
+; HP-PA-1.1 __mpn_submul_1 -- Multiply a limb vector with a limb and
+; subtract the result from a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r26
+; s1_ptr r25
+; size r24
+; s2_limb r23
+
+; This runs at 12 cycles/limb on a PA7000. With the used instructions, it
+; can not become faster due to data cache contention after a store. On the
+; PA7100 it runs at 11 cycles/limb, and that can not be improved either,
+; since only the xmpyu does not need the integer pipeline, so the only
+; dual-issue we will get are addc+xmpyu. Unrolling could gain a cycle/limb
+; on the PA7100.
+
+; There are some ideas described in mul_1.s that applies to this code too.
+
+; It seems possible to make this run as fast as __mpn_addmul_1, if we use
+; sub,>>= %r29,%r19,%r22
+; addi 1,%r28,%r28
+; but that requires reworking the hairy software pipeline...
+
+ .code
+ .export __mpn_submul_1
+__mpn_submul_1
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ ldo 64(%r30),%r30
+ fldws,ma 4(%r25),%fr5
+ stw %r23,-16(%r30) ; move s2_limb ...
+ addib,= -1,%r24,L$just_one_limb
+ fldws -16(%r30),%fr4 ; ... into fr4
+ add %r0,%r0,%r0 ; clear carry
+ xmpyu %fr4,%fr5,%fr6
+ fldws,ma 4(%r25),%fr7
+ fstds %fr6,-16(%r30)
+ xmpyu %fr4,%fr7,%fr8
+ ldw -12(%r30),%r19 ; least significant limb in product
+ ldw -16(%r30),%r28
+
+ fstds %fr8,-16(%r30)
+ addib,= -1,%r24,L$end
+ ldw -12(%r30),%r1
+
+; Main loop
+L$loop ldws 0(%r26),%r29
+ fldws,ma 4(%r25),%fr5
+ sub %r29,%r19,%r22
+ add %r22,%r19,%r0
+ stws,ma %r22,4(%r26)
+ addc %r28,%r1,%r19
+ xmpyu %fr4,%fr5,%fr6
+ ldw -16(%r30),%r28
+ fstds %fr6,-16(%r30)
+ addc %r0,%r28,%r28
+ addib,<> -1,%r24,L$loop
+ ldw -12(%r30),%r1
+
+L$end ldw 0(%r26),%r29
+ sub %r29,%r19,%r22
+ add %r22,%r19,%r0
+ stws,ma %r22,4(%r26)
+ addc %r28,%r1,%r19
+ ldw -16(%r30),%r28
+ ldws 0(%r26),%r29
+ addc %r0,%r28,%r28
+ sub %r29,%r19,%r22
+ add %r22,%r19,%r0
+ stws,ma %r22,4(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+L$just_one_limb
+ xmpyu %fr4,%fr5,%fr6
+ ldw 0(%r26),%r29
+ fstds %fr6,-16(%r30)
+ ldw -12(%r30),%r1
+ ldw -16(%r30),%r28
+ sub %r29,%r1,%r22
+ add %r22,%r1,%r0
+ stw %r22,0(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+ .exit
+ .procend
diff --git a/mpn/hppa/hppa1_1/udiv_qrnnd.s b/mpn/hppa/hppa1_1/udiv_qrnnd.s
new file mode 100644
index 000000000..bf7dc70cd
--- /dev/null
+++ b/mpn/hppa/hppa1_1/udiv_qrnnd.s
@@ -0,0 +1,75 @@
+; HP-PA __udiv_qrnnd division support, used from longlong.h.
+; This version runs fast on PA 7000 and later.
+
+; Copyright (C) 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; rem_ptr gr26
+; n1 gr25
+; n0 gr24
+; d gr23
+
+ .code
+L$0000 .word 0x43f00000
+ .word 0x0
+ .export __udiv_qrnnd
+__udiv_qrnnd
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+ ldo 64(%r30),%r30
+
+ stws %r25,-16(0,%r30) ; n_hi
+ stws %r24,-12(0,%r30) ; n_lo
+ ldil L'L$0000,%r19
+ ldo R'L$0000(%r19),%r19
+ fldds -16(0,%r30),%fr5
+ stws %r23,-12(0,%r30)
+ comib,<= 0,%r25,L$1
+ fcnvxf,dbl,dbl %fr5,%fr5
+ fldds 0(0,%r19),%fr4
+ fadd,dbl %fr4,%fr5,%fr5
+L$1
+ fcpy,sgl %fr0,%fr6L
+ fldws -12(0,%r30),%fr6R
+ fcnvxf,dbl,dbl %fr6,%fr4
+
+ fdiv,dbl %fr5,%fr4,%fr5
+
+ fcnvfx,dbl,dbl %fr5,%fr4
+ fstws %fr4R,-16(%r30)
+ xmpyu %fr4R,%fr6R,%fr6
+ ldws -16(%r30),%r28
+ fstds %fr6,-16(0,%r30)
+ ldws -12(0,%r30),%r21
+ ldws -16(0,%r30),%r20
+ sub %r24,%r21,%r22
+ subb %r25,%r20,%r19
+ comib,= 0,%r19,L$2
+ ldo -64(%r30),%r30
+
+ add %r22,%r23,%r22
+ ldo -1(%r28),%r28
+L$2 bv 0(%r2)
+ stws %r22,0(0,%r26)
+
+ .exit
+ .procend