diff options
Diffstat (limited to 'mpn/hppa/hppa1_1')
-rw-r--r-- | mpn/hppa/hppa1_1/addmul_1.s | 102 | ||||
-rw-r--r-- | mpn/hppa/hppa1_1/mul_1.s | 98 | ||||
-rw-r--r-- | mpn/hppa/hppa1_1/pa7100/add_n.s | 75 | ||||
-rw-r--r-- | mpn/hppa/hppa1_1/pa7100/addmul_1.S | 189 | ||||
-rw-r--r-- | mpn/hppa/hppa1_1/pa7100/lshift.s | 83 | ||||
-rw-r--r-- | mpn/hppa/hppa1_1/pa7100/rshift.s | 80 | ||||
-rw-r--r-- | mpn/hppa/hppa1_1/pa7100/sub_n.s | 76 | ||||
-rw-r--r-- | mpn/hppa/hppa1_1/pa7100/submul_1.S | 195 | ||||
-rw-r--r-- | mpn/hppa/hppa1_1/submul_1.s | 111 | ||||
-rw-r--r-- | mpn/hppa/hppa1_1/udiv_qrnnd.s | 75 |
10 files changed, 1084 insertions, 0 deletions
diff --git a/mpn/hppa/hppa1_1/addmul_1.s b/mpn/hppa/hppa1_1/addmul_1.s new file mode 100644 index 000000000..0fdcb3cb2 --- /dev/null +++ b/mpn/hppa/hppa1_1/addmul_1.s @@ -0,0 +1,102 @@ +; HP-PA-1.1 __mpn_addmul_1 -- Multiply a limb vector with a limb and +; add the result to a second limb vector. + +; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr r26 +; s1_ptr r25 +; size r24 +; s2_limb r23 + +; This runs at 11 cycles/limb on a PA7000. With the used instructions, it +; can not become faster due to data cache contention after a store. On the +; PA7100 it runs at 10 cycles/limb, and that can not be improved either, +; since only the xmpyu does not need the integer pipeline, so the only +; dual-issue we will get are addc+xmpyu. Unrolling could gain a cycle/limb +; on the PA7100. + +; There are some ideas described in mul_1.s that applies to this code too. + + .code + .export __mpn_addmul_1 +__mpn_addmul_1 + .proc + .callinfo frame=64,no_calls + .entry + + ldo 64(%r30),%r30 + fldws,ma 4(%r25),%fr5 + stw %r23,-16(%r30) ; move s2_limb ... + addib,= -1,%r24,L$just_one_limb + fldws -16(%r30),%fr4 ; ... into fr4 + add %r0,%r0,%r0 ; clear carry + xmpyu %fr4,%fr5,%fr6 + fldws,ma 4(%r25),%fr7 + fstds %fr6,-16(%r30) + xmpyu %fr4,%fr7,%fr8 + ldw -12(%r30),%r19 ; least significant limb in product + ldw -16(%r30),%r28 + + fstds %fr8,-16(%r30) + addib,= -1,%r24,L$end + ldw -12(%r30),%r1 + +; Main loop +L$loop ldws 0(%r26),%r29 + fldws,ma 4(%r25),%fr5 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + xmpyu %fr4,%fr5,%fr6 + ldw -16(%r30),%r28 + fstds %fr6,-16(%r30) + addc %r0,%r28,%r28 + addib,<> -1,%r24,L$loop + ldw -12(%r30),%r1 + +L$end ldw 0(%r26),%r29 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + ldw -16(%r30),%r28 + ldws 0(%r26),%r29 + addc %r0,%r28,%r28 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + +L$just_one_limb + xmpyu %fr4,%fr5,%fr6 + ldw 0(%r26),%r29 + fstds %fr6,-16(%r30) + ldw -12(%r30),%r1 + ldw -16(%r30),%r28 + add %r29,%r1,%r19 + stw %r19,0(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + + .exit + .procend diff --git a/mpn/hppa/hppa1_1/mul_1.s b/mpn/hppa/hppa1_1/mul_1.s new file mode 100644 index 000000000..cdd0c1d7f --- /dev/null +++ b/mpn/hppa/hppa1_1/mul_1.s @@ -0,0 +1,98 @@ +; HP-PA-1.1 __mpn_mul_1 -- Multiply a limb vector with a limb and store +; the result in a second limb vector. + +; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr r26 +; s1_ptr r25 +; size r24 +; s2_limb r23 + +; This runs at 9 cycles/limb on a PA7000. With the used instructions, it can +; not become faster due to data cache contention after a store. On the +; PA7100 it runs at 7 cycles/limb, and that can not be improved either, since +; only the xmpyu does not need the integer pipeline, so the only dual-issue +; we will get are addc+xmpyu. Unrolling would not help either CPU. + +; We could use fldds to read two limbs at a time from the S1 array, and that +; could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and +; PA7100, respectively. We don't do that since it does not seem worth the +; (alignment) troubles... + +; At least the PA7100 is rumored to be able to deal with cache-misses +; without stalling instruction issue. If this is true, and the cache is +; actually also lockup-free, we should use a deeper software pipeline, and +; load from S1 very early! (The loads and stores to -12(sp) will surely be +; in the cache.) + + .code + .export __mpn_mul_1 +__mpn_mul_1 + .proc + .callinfo frame=64,no_calls + .entry + + ldo 64(%r30),%r30 + fldws,ma 4(%r25),%fr5 + stw %r23,-16(%r30) ; move s2_limb ... + addib,= -1,%r24,L$just_one_limb + fldws -16(%r30),%fr4 ; ... into fr4 + add %r0,%r0,%r0 ; clear carry + xmpyu %fr4,%fr5,%fr6 + fldws,ma 4(%r25),%fr7 + fstds %fr6,-16(%r30) + xmpyu %fr4,%fr7,%fr8 + ldw -12(%r30),%r19 ; least significant limb in product + ldw -16(%r30),%r28 + + fstds %fr8,-16(%r30) + addib,= -1,%r24,L$end + ldw -12(%r30),%r1 + +; Main loop +L$loop fldws,ma 4(%r25),%fr5 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + xmpyu %fr4,%fr5,%fr6 + ldw -16(%r30),%r28 + fstds %fr6,-16(%r30) + addib,<> -1,%r24,L$loop + ldw -12(%r30),%r1 + +L$end stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + ldw -16(%r30),%r28 + stws,ma %r19,4(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + +L$just_one_limb + xmpyu %fr4,%fr5,%fr6 + fstds %fr6,-16(%r30) + ldw -16(%r30),%r28 + ldo -64(%r30),%r30 + bv 0(%r2) + fstws %fr6R,0(%r26) + + .exit + .procend diff --git a/mpn/hppa/hppa1_1/pa7100/add_n.s b/mpn/hppa/hppa1_1/pa7100/add_n.s new file mode 100644 index 000000000..21fe16154 --- /dev/null +++ b/mpn/hppa/hppa1_1/pa7100/add_n.s @@ -0,0 +1,75 @@ +; HP-PA __mpn_add_n -- Add two limb vectors of the same length > 0 and store +; sum in a third limb vector. +; This is optimized for the PA7100, where is runs at 4.25 cycles/limb + +; Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + + .code + .export __mpn_add_n +__mpn_add_n + .proc + .callinfo frame=0,no_calls + .entry + + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,<= -5,%r23,L$rest + add %r20,%r19,%r28 ; add first limbs ignoring cy + +L$loop ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addc %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addc %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addc %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -4,%r23,L$loop + addc %r20,%r19,%r28 + +L$rest addib,= 4,%r23,L$end + nop +L$eloop ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -1,%r23,L$eloop + addc %r20,%r19,%r28 + +L$end stws %r28,0(0,%r26) + bv 0(%r2) + addc %r0,%r0,%r28 + + .exit + .procend diff --git a/mpn/hppa/hppa1_1/pa7100/addmul_1.S b/mpn/hppa/hppa1_1/pa7100/addmul_1.S new file mode 100644 index 000000000..eb1d12bf6 --- /dev/null +++ b/mpn/hppa/hppa1_1/pa7100/addmul_1.S @@ -0,0 +1,189 @@ +; HP-PA 7100/7200 __mpn_addmul_1 -- Multiply a limb vector with a limb and +; add the result to a second limb vector. + +; Copyright (C) 1995 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define res_ptr %r26 +#define s1_ptr %r25 +#define size %r24 +#define s2_limb %r23 + +#define cylimb %r28 +#define s0 %r19 +#define s1 %r20 +#define s2 %r3 +#define s3 %r4 +#define lo0 %r21 +#define lo1 %r5 +#define lo2 %r6 +#define lo3 %r7 +#define hi0 %r22 +#define hi1 %r23 /* safe to reuse */ +#define hi2 %r29 +#define hi3 %r1 + + .code + .export __mpn_addmul_1 +__mpn_addmul_1 + .proc + .callinfo frame=128,no_calls + .entry + + ldo 128(%r30),%r30 + stws s2_limb,-16(%r30) + add %r0,%r0,cylimb ; clear cy and cylimb + addib,< -4,size,L$few_limbs + fldws -16(%r30),%fr31R + + ldo -112(%r30),%r31 + stw %r3,-96(%r30) + stw %r4,-92(%r30) + stw %r5,-88(%r30) + stw %r6,-84(%r30) + stw %r7,-80(%r30) + + bb,>=,n s1_ptr,29,L$0 + + fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r31) + ldws -16(%r31),cylimb + ldws -12(%r31),lo0 + add s0,lo0,s0 + addib,< -1,size,L$few_limbs + stws,ma s0,4(res_ptr) + +; start software pipeline ---------------------------------------------------- +L$0 fldds,ma 8(s1_ptr),%fr4 + fldds,ma 8(s1_ptr),%fr8 + + xmpyu %fr4L,%fr31R,%fr5 + xmpyu %fr4R,%fr31R,%fr6 + xmpyu %fr8L,%fr31R,%fr9 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) + fstds %fr6,-8(%r31) + fstds %fr9,0(%r31) + fstds %fr10,8(%r31) + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws 4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 + addc lo1,hi0,lo1 + addc lo2,hi1,lo2 + addc lo3,hi2,lo3 + + addib,< -4,size,L$end + addc %r0,hi3,cylimb ; propagate carry into cylimb +; main loop ------------------------------------------------------------------ +L$loop fldds,ma 8(s1_ptr),%fr4 + fldds,ma 8(s1_ptr),%fr8 + + ldws 0(res_ptr),s0 + xmpyu %fr4L,%fr31R,%fr5 + ldws 4(res_ptr),s1 + xmpyu %fr4R,%fr31R,%fr6 + ldws 8(res_ptr),s2 + xmpyu %fr8L,%fr31R,%fr9 + ldws 12(res_ptr),s3 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) + add s0,lo0,s0 + fstds %fr6,-8(%r31) + addc s1,lo1,s1 + fstds %fr9,0(%r31) + addc s2,lo2,s2 + fstds %fr10,8(%r31) + addc s3,lo3,s3 + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws 4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 + stws,ma s0,4(res_ptr) + addc lo1,hi0,lo1 + stws,ma s1,4(res_ptr) + addc lo2,hi1,lo2 + stws,ma s2,4(res_ptr) + addc lo3,hi2,lo3 + stws,ma s3,4(res_ptr) + + addib,>= -4,size,L$loop + addc %r0,hi3,cylimb ; propagate carry into cylimb +; finish software pipeline --------------------------------------------------- +L$end ldws 0(res_ptr),s0 + ldws 4(res_ptr),s1 + ldws 8(res_ptr),s2 + ldws 12(res_ptr),s3 + + add s0,lo0,s0 + stws,ma s0,4(res_ptr) + addc s1,lo1,s1 + stws,ma s1,4(res_ptr) + addc s2,lo2,s2 + stws,ma s2,4(res_ptr) + addc s3,lo3,s3 + stws,ma s3,4(res_ptr) + +; restore callee-saves registers --------------------------------------------- + ldw -96(%r30),%r3 + ldw -92(%r30),%r4 + ldw -88(%r30),%r5 + ldw -84(%r30),%r6 + ldw -80(%r30),%r7 + +L$few_limbs + addib,=,n 4,size,L$ret +L$loop2 fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r30) + ldws -16(%r30),hi0 + ldws -12(%r30),lo0 + addc lo0,cylimb,lo0 + addc %r0,hi0,cylimb + add s0,lo0,s0 + stws,ma s0,4(res_ptr) + addib,<> -1,size,L$loop2 + nop + +L$ret addc %r0,cylimb,cylimb + bv 0(%r2) + ldo -128(%r30),%r30 + + .exit + .procend diff --git a/mpn/hppa/hppa1_1/pa7100/lshift.s b/mpn/hppa/hppa1_1/pa7100/lshift.s new file mode 100644 index 000000000..4c74a505a --- /dev/null +++ b/mpn/hppa/hppa1_1/pa7100/lshift.s @@ -0,0 +1,83 @@ +; HP-PA __mpn_lshift -- +; This is optimized for the PA7100, where is runs at 3.25 cycles/limb + +; Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s_ptr gr25 +; size gr24 +; cnt gr23 + + .code + .export __mpn_lshift +__mpn_lshift + .proc + .callinfo frame=64,no_calls + .entry + + sh2add %r24,%r25,%r25 + sh2add %r24,%r26,%r26 + ldws,mb -4(0,%r25),%r22 + subi 32,%r23,%r1 + mtsar %r1 + addib,= -1,%r24,L$0004 + vshd %r0,%r22,%r28 ; compute carry out limb + ldws,mb -4(0,%r25),%r29 + addib,<= -5,%r24,L$rest + vshd %r22,%r29,%r20 + +L$loop ldws,mb -4(0,%r25),%r22 + stws,mb %r20,-4(0,%r26) + vshd %r29,%r22,%r20 + ldws,mb -4(0,%r25),%r29 + stws,mb %r20,-4(0,%r26) + vshd %r22,%r29,%r20 + ldws,mb -4(0,%r25),%r22 + stws,mb %r20,-4(0,%r26) + vshd %r29,%r22,%r20 + ldws,mb -4(0,%r25),%r29 + stws,mb %r20,-4(0,%r26) + addib,> -4,%r24,L$loop + vshd %r22,%r29,%r20 + +L$rest addib,= 4,%r24,L$end1 + nop +L$eloop ldws,mb -4(0,%r25),%r22 + stws,mb %r20,-4(0,%r26) + addib,<= -1,%r24,L$end2 + vshd %r29,%r22,%r20 + ldws,mb -4(0,%r25),%r29 + stws,mb %r20,-4(0,%r26) + addib,> -1,%r24,L$eloop + vshd %r22,%r29,%r20 + +L$end1 stws,mb %r20,-4(0,%r26) + vshd %r29,%r0,%r20 + bv 0(%r2) + stw %r20,-4(0,%r26) +L$end2 stws,mb %r20,-4(0,%r26) +L$0004 vshd %r22,%r0,%r20 + bv 0(%r2) + stw %r20,-4(0,%r26) + + .exit + .procend diff --git a/mpn/hppa/hppa1_1/pa7100/rshift.s b/mpn/hppa/hppa1_1/pa7100/rshift.s new file mode 100644 index 000000000..845418c53 --- /dev/null +++ b/mpn/hppa/hppa1_1/pa7100/rshift.s @@ -0,0 +1,80 @@ +; HP-PA __mpn_rshift -- +; This is optimized for the PA7100, where is runs at 3.25 cycles/limb + +; Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s_ptr gr25 +; size gr24 +; cnt gr23 + + .code + .export __mpn_rshift +__mpn_rshift + .proc + .callinfo frame=64,no_calls + .entry + + ldws,ma 4(0,%r25),%r22 + mtsar %r23 + addib,= -1,%r24,L$0004 + vshd %r22,%r0,%r28 ; compute carry out limb + ldws,ma 4(0,%r25),%r29 + addib,<= -5,%r24,L$rest + vshd %r29,%r22,%r20 + +L$loop ldws,ma 4(0,%r25),%r22 + stws,ma %r20,4(0,%r26) + vshd %r22,%r29,%r20 + ldws,ma 4(0,%r25),%r29 + stws,ma %r20,4(0,%r26) + vshd %r29,%r22,%r20 + ldws,ma 4(0,%r25),%r22 + stws,ma %r20,4(0,%r26) + vshd %r22,%r29,%r20 + ldws,ma 4(0,%r25),%r29 + stws,ma %r20,4(0,%r26) + addib,> -4,%r24,L$loop + vshd %r29,%r22,%r20 + +L$rest addib,= 4,%r24,L$end1 + nop +L$eloop ldws,ma 4(0,%r25),%r22 + stws,ma %r20,4(0,%r26) + addib,<= -1,%r24,L$end2 + vshd %r22,%r29,%r20 + ldws,ma 4(0,%r25),%r29 + stws,ma %r20,4(0,%r26) + addib,> -1,%r24,L$eloop + vshd %r29,%r22,%r20 + +L$end1 stws,ma %r20,4(0,%r26) + vshd %r0,%r29,%r20 + bv 0(%r2) + stw %r20,0(0,%r26) +L$end2 stws,ma %r20,4(0,%r26) +L$0004 vshd %r0,%r22,%r20 + bv 0(%r2) + stw %r20,0(0,%r26) + + .exit + .procend diff --git a/mpn/hppa/hppa1_1/pa7100/sub_n.s b/mpn/hppa/hppa1_1/pa7100/sub_n.s new file mode 100644 index 000000000..1e1ebcf91 --- /dev/null +++ b/mpn/hppa/hppa1_1/pa7100/sub_n.s @@ -0,0 +1,76 @@ +; HP-PA __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +; store difference in a third limb vector. +; This is optimized for the PA7100, where is runs at 4.25 cycles/limb + +; Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + + .code + .export __mpn_sub_n +__mpn_sub_n + .proc + .callinfo frame=0,no_calls + .entry + + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,<= -5,%r23,L$rest + sub %r20,%r19,%r28 ; subtract first limbs ignoring cy + +L$loop ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + subb %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + subb %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + subb %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -4,%r23,L$loop + subb %r20,%r19,%r28 + +L$rest addib,= 4,%r23,L$end + nop +L$eloop ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -1,%r23,L$eloop + subb %r20,%r19,%r28 + +L$end stws %r28,0(0,%r26) + addc %r0,%r0,%r28 + bv 0(%r2) + subi 1,%r28,%r28 + + .exit + .procend diff --git a/mpn/hppa/hppa1_1/pa7100/submul_1.S b/mpn/hppa/hppa1_1/pa7100/submul_1.S new file mode 100644 index 000000000..a71176e68 --- /dev/null +++ b/mpn/hppa/hppa1_1/pa7100/submul_1.S @@ -0,0 +1,195 @@ +; HP-PA 7100/7200 __mpn_submul_1 -- Multiply a limb vector with a limb and +; subtract the result from a second limb vector. + +; Copyright (C) 1995 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define res_ptr %r26 +#define s1_ptr %r25 +#define size %r24 +#define s2_limb %r23 + +#define cylimb %r28 +#define s0 %r19 +#define s1 %r20 +#define s2 %r3 +#define s3 %r4 +#define lo0 %r21 +#define lo1 %r5 +#define lo2 %r6 +#define lo3 %r7 +#define hi0 %r22 +#define hi1 %r23 /* safe to reuse */ +#define hi2 %r29 +#define hi3 %r1 + + .code + .export __mpn_submul_1 +__mpn_submul_1 + .proc + .callinfo frame=128,no_calls + .entry + + ldo 128(%r30),%r30 + stws s2_limb,-16(%r30) + add %r0,%r0,cylimb ; clear cy and cylimb + addib,< -4,size,L$few_limbs + fldws -16(%r30),%fr31R + + ldo -112(%r30),%r31 + stw %r3,-96(%r30) + stw %r4,-92(%r30) + stw %r5,-88(%r30) + stw %r6,-84(%r30) + stw %r7,-80(%r30) + + bb,>=,n s1_ptr,29,L$0 + + fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r31) + ldws -16(%r31),cylimb + ldws -12(%r31),lo0 + sub s0,lo0,s0 + add s0,lo0,%r0 ; invert cy + addib,< -1,size,L$few_limbs + stws,ma s0,4(res_ptr) + +; start software pipeline ---------------------------------------------------- +L$0 fldds,ma 8(s1_ptr),%fr4 + fldds,ma 8(s1_ptr),%fr8 + + xmpyu %fr4L,%fr31R,%fr5 + xmpyu %fr4R,%fr31R,%fr6 + xmpyu %fr8L,%fr31R,%fr9 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) + fstds %fr6,-8(%r31) + fstds %fr9,0(%r31) + fstds %fr10,8(%r31) + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws 4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 + addc lo1,hi0,lo1 + addc lo2,hi1,lo2 + addc lo3,hi2,lo3 + + addib,< -4,size,L$end + addc %r0,hi3,cylimb ; propagate carry into cylimb +; main loop ------------------------------------------------------------------ +L$loop fldds,ma 8(s1_ptr),%fr4 + fldds,ma 8(s1_ptr),%fr8 + + ldws 0(res_ptr),s0 + xmpyu %fr4L,%fr31R,%fr5 + ldws 4(res_ptr),s1 + xmpyu %fr4R,%fr31R,%fr6 + ldws 8(res_ptr),s2 + xmpyu %fr8L,%fr31R,%fr9 + ldws 12(res_ptr),s3 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) + sub s0,lo0,s0 + fstds %fr6,-8(%r31) + subb s1,lo1,s1 + fstds %fr9,0(%r31) + subb s2,lo2,s2 + fstds %fr10,8(%r31) + subb s3,lo3,s3 + subb %r0,%r0,lo0 ; these two insns ... + add lo0,lo0,%r0 ; ... just invert cy + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws 4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 + stws,ma s0,4(res_ptr) + addc lo1,hi0,lo1 + stws,ma s1,4(res_ptr) + addc lo2,hi1,lo2 + stws,ma s2,4(res_ptr) + addc lo3,hi2,lo3 + stws,ma s3,4(res_ptr) + + addib,>= -4,size,L$loop + addc %r0,hi3,cylimb ; propagate carry into cylimb +; finish software pipeline --------------------------------------------------- +L$end ldws 0(res_ptr),s0 + ldws 4(res_ptr),s1 + ldws 8(res_ptr),s2 + ldws 12(res_ptr),s3 + + sub s0,lo0,s0 + stws,ma s0,4(res_ptr) + subb s1,lo1,s1 + stws,ma s1,4(res_ptr) + subb s2,lo2,s2 + stws,ma s2,4(res_ptr) + subb s3,lo3,s3 + stws,ma s3,4(res_ptr) + subb %r0,%r0,lo0 ; these two insns ... + add lo0,lo0,%r0 ; ... invert cy + +; restore callee-saves registers --------------------------------------------- + ldw -96(%r30),%r3 + ldw -92(%r30),%r4 + ldw -88(%r30),%r5 + ldw -84(%r30),%r6 + ldw -80(%r30),%r7 + +L$few_limbs + addib,=,n 4,size,L$ret +L$loop2 fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r30) + ldws -16(%r30),hi0 + ldws -12(%r30),lo0 + addc lo0,cylimb,lo0 + addc %r0,hi0,cylimb + sub s0,lo0,s0 + add s0,lo0,%r0 ; invert cy + stws,ma s0,4(res_ptr) + addib,<> -1,size,L$loop2 + nop + +L$ret addc %r0,cylimb,cylimb + bv 0(%r2) + ldo -128(%r30),%r30 + + .exit + .procend diff --git a/mpn/hppa/hppa1_1/submul_1.s b/mpn/hppa/hppa1_1/submul_1.s new file mode 100644 index 000000000..a4a385467 --- /dev/null +++ b/mpn/hppa/hppa1_1/submul_1.s @@ -0,0 +1,111 @@ +; HP-PA-1.1 __mpn_submul_1 -- Multiply a limb vector with a limb and +; subtract the result from a second limb vector. + +; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr r26 +; s1_ptr r25 +; size r24 +; s2_limb r23 + +; This runs at 12 cycles/limb on a PA7000. With the used instructions, it +; can not become faster due to data cache contention after a store. On the +; PA7100 it runs at 11 cycles/limb, and that can not be improved either, +; since only the xmpyu does not need the integer pipeline, so the only +; dual-issue we will get are addc+xmpyu. Unrolling could gain a cycle/limb +; on the PA7100. + +; There are some ideas described in mul_1.s that applies to this code too. + +; It seems possible to make this run as fast as __mpn_addmul_1, if we use +; sub,>>= %r29,%r19,%r22 +; addi 1,%r28,%r28 +; but that requires reworking the hairy software pipeline... + + .code + .export __mpn_submul_1 +__mpn_submul_1 + .proc + .callinfo frame=64,no_calls + .entry + + ldo 64(%r30),%r30 + fldws,ma 4(%r25),%fr5 + stw %r23,-16(%r30) ; move s2_limb ... + addib,= -1,%r24,L$just_one_limb + fldws -16(%r30),%fr4 ; ... into fr4 + add %r0,%r0,%r0 ; clear carry + xmpyu %fr4,%fr5,%fr6 + fldws,ma 4(%r25),%fr7 + fstds %fr6,-16(%r30) + xmpyu %fr4,%fr7,%fr8 + ldw -12(%r30),%r19 ; least significant limb in product + ldw -16(%r30),%r28 + + fstds %fr8,-16(%r30) + addib,= -1,%r24,L$end + ldw -12(%r30),%r1 + +; Main loop +L$loop ldws 0(%r26),%r29 + fldws,ma 4(%r25),%fr5 + sub %r29,%r19,%r22 + add %r22,%r19,%r0 + stws,ma %r22,4(%r26) + addc %r28,%r1,%r19 + xmpyu %fr4,%fr5,%fr6 + ldw -16(%r30),%r28 + fstds %fr6,-16(%r30) + addc %r0,%r28,%r28 + addib,<> -1,%r24,L$loop + ldw -12(%r30),%r1 + +L$end ldw 0(%r26),%r29 + sub %r29,%r19,%r22 + add %r22,%r19,%r0 + stws,ma %r22,4(%r26) + addc %r28,%r1,%r19 + ldw -16(%r30),%r28 + ldws 0(%r26),%r29 + addc %r0,%r28,%r28 + sub %r29,%r19,%r22 + add %r22,%r19,%r0 + stws,ma %r22,4(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + +L$just_one_limb + xmpyu %fr4,%fr5,%fr6 + ldw 0(%r26),%r29 + fstds %fr6,-16(%r30) + ldw -12(%r30),%r1 + ldw -16(%r30),%r28 + sub %r29,%r1,%r22 + add %r22,%r1,%r0 + stw %r22,0(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + + .exit + .procend diff --git a/mpn/hppa/hppa1_1/udiv_qrnnd.s b/mpn/hppa/hppa1_1/udiv_qrnnd.s new file mode 100644 index 000000000..bf7dc70cd --- /dev/null +++ b/mpn/hppa/hppa1_1/udiv_qrnnd.s @@ -0,0 +1,75 @@ +; HP-PA __udiv_qrnnd division support, used from longlong.h. +; This version runs fast on PA 7000 and later. + +; Copyright (C) 1993, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; rem_ptr gr26 +; n1 gr25 +; n0 gr24 +; d gr23 + + .code +L$0000 .word 0x43f00000 + .word 0x0 + .export __udiv_qrnnd +__udiv_qrnnd + .proc + .callinfo frame=64,no_calls + .entry + ldo 64(%r30),%r30 + + stws %r25,-16(0,%r30) ; n_hi + stws %r24,-12(0,%r30) ; n_lo + ldil L'L$0000,%r19 + ldo R'L$0000(%r19),%r19 + fldds -16(0,%r30),%fr5 + stws %r23,-12(0,%r30) + comib,<= 0,%r25,L$1 + fcnvxf,dbl,dbl %fr5,%fr5 + fldds 0(0,%r19),%fr4 + fadd,dbl %fr4,%fr5,%fr5 +L$1 + fcpy,sgl %fr0,%fr6L + fldws -12(0,%r30),%fr6R + fcnvxf,dbl,dbl %fr6,%fr4 + + fdiv,dbl %fr5,%fr4,%fr5 + + fcnvfx,dbl,dbl %fr5,%fr4 + fstws %fr4R,-16(%r30) + xmpyu %fr4R,%fr6R,%fr6 + ldws -16(%r30),%r28 + fstds %fr6,-16(0,%r30) + ldws -12(0,%r30),%r21 + ldws -16(0,%r30),%r20 + sub %r24,%r21,%r22 + subb %r25,%r20,%r19 + comib,= 0,%r19,L$2 + ldo -64(%r30),%r30 + + add %r22,%r23,%r22 + ldo -1(%r28),%r28 +L$2 bv 0(%r2) + stws %r22,0(0,%r26) + + .exit + .procend |