author     Torbjorn Granlund <tege@gmplib.org>   2008-12-14 02:50:47 +0100
committer  Torbjorn Granlund <tege@gmplib.org>   2008-12-14 02:50:47 +0100
commit     32a23c292182665b54c2d85054ba2491ffc2d8a9 (patch)
tree       c660b5c592885a79e1ebfafa3a68e5d99b103cd1 /mpn/alpha
parent     eb400c26314d57c9f6192369856796bd7481f00b (diff)
download   gmp-32a23c292182665b54c2d85054ba2491ffc2d8a9.tar.gz
Replace mpn/alpha/ev6/{addmul_1.asm,submul_1.asm} with combined file.
Diffstat (limited to 'mpn/alpha')
-rw-r--r--   mpn/alpha/ev6/addmul_1.asm    388
-rw-r--r--   mpn/alpha/ev6/aorsmul_1.asm   387
-rw-r--r--   mpn/alpha/ev6/submul_1.asm    471
3 files changed, 387 insertions, 859 deletions
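
The new aorsmul_1.asm provides both mpn_addmul_1 and mpn_submul_1 from a single source, selected at build time through OPERATION_addmul_1/OPERATION_submul_1. As a reference for what the two entry points compute (their contract only, not the ev6 code), here is a minimal C sketch; it assumes 64-bit limbs and GCC's unsigned __int128, and the names ref_addmul_1/ref_submul_1 are invented for the example:

#include <stdint.h>

typedef uint64_t limb;   /* one 64-bit alpha limb (mp_limb_t) */

/* rp[0..n-1] += up[0..n-1] * v0; return the carry-out limb.  */
limb ref_addmul_1 (limb *rp, const limb *up, long n, limb v0)
{
  limb cy = 0;
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
      rp[i] = (limb) t;
      cy = (limb) (t >> 64);
    }
  return cy;
}

/* rp[0..n-1] -= up[0..n-1] * v0; return the borrow-out limb.  */
limb ref_submul_1 (limb *rp, const limb *up, long n, limb v0)
{
  limb cy = 0;                                 /* borrow accumulated so far */
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * v0;
      limb lo = (limb) p + cy;
      limb hi = (limb) (p >> 64) + (lo < cy);  /* carry from adding cy in */
      limb r  = rp[i] - lo;
      cy = hi + (rp[i] < lo);                  /* borrow out of the subtract */
      rp[i] = r;
    }
  return cy;
}

The assembly in the diff below implements the same contract with an 8-way unrolled loop scheduled for the ev6 pipeline.
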
diff --git a/mpn/alpha/ev6/addmul_1.asm b/mpn/alpha/ev6/addmul_1.asm
deleted file mode 100644
index 8a30d7a09..000000000
--- a/mpn/alpha/ev6/addmul_1.asm
+++ /dev/null
@@ -1,388 +0,0 @@
-dnl Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
-dnl result to a second limb vector.
-
-dnl Copyright 2000, 2003, 2004, 2005 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of the GNU Lesser General Public License as published
-dnl by the Free Software Foundation; either version 3 of the License, or (at
-dnl your option) any later version.
-
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-dnl License for more details.
-
-dnl You should have received a copy of the GNU Lesser General Public License
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C EV4: 42
-C EV5: 18
-C EV6: 3.5
-
-C INPUT PARAMETERS
-C rp r16
-C up r17
-C n r18
-C vlimb r19
-
-dnl This code was written in cooperation with ev6 pipeline expert Steve Root.
-
-dnl The stores can issue a cycle late so we have paired no-op's to 'catch'
-dnl them, so that further disturbance to the schedule is damped.
-
-dnl We couldn't pair the loads, because the entangled schedule of the carry's
-dnl has to happen on one side {0} of the machine.
-
-dnl This is a great schedule for the d_cache, a poor schedule for the b_cache.
-dnl The lockup on U0 means that any stall can't be recovered from. Consider a
-dnl ldq in L1, say that load gets stalled because it collides with a fill from
-dnl the b_cache. On the next cycle, this load gets priority. If first looks
-dnl at L0, and goes there. The instruction we intended for L0 gets to look at
-dnl L1, which is NOT where we want it. It either stalls 1, because it can't
-dnl go in L0, or goes there, and causes a further instruction to stall.
-
-dnl So for b_cache, we're likely going to want to put one or more cycles back
-dnl into the code! And, of course, put in lds prefetch for the rp[] operand.
-dnl At a place where we have an mt followed by a bookkeeping, put the
-dnl bookkeeping in upper, and the prefetch into lower.
-
-dnl Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd
-dnl like not to have an ldq or an stq to preceded a conditional branch in a
-dnl quadpack. The conditional branch moves the retire pointer one cycle
-dnl later.
-
-
-ASM_START()
-PROLOGUE(mpn_addmul_1)
- ldq r3, 0(r17) C
- and r18, 7, r20 C
- lda r18, -9(r18) C
- cmpeq r20, 1, r21 C
- beq r21, $L1 C
-
-$1mod8: ldq r5, 0(r16) C
- mulq r19, r3, r7 C
- umulh r19, r3, r8 C
- addq r5, r7, r23 C
- cmpult r23, r7, r20 C
- addq r8, r20, r0 C
- stq r23, 0(r16) C
- bge r18, $ent1 C
- ret r31, (r26), 1 C
-
-$L1: lda r8, 0(r31) C zero carry reg
- lda r24, 0(r31) C zero carry reg
- cmpeq r20, 2, r21 C
- bne r21, $2mod8 C
- cmpeq r20, 3, r21 C
- bne r21, $3mod8 C
- cmpeq r20, 4, r21 C
- bne r21, $4mod8 C
- cmpeq r20, 5, r21 C
- bne r21, $5mod8 C
- cmpeq r20, 6, r21 C
- bne r21, $6mod8 C
- cmpeq r20, 7, r21 C
- beq r21, $0mod8 C
-
-$7mod8: ldq r5, 0(r16) C
- lda r17, 8(r17) C
- mulq r19, r3, r7 C
- umulh r19, r3, r24 C
- addq r5, r7, r23 C
- cmpult r23, r7, r20 C
- addq r24, r20, r24 C
- stq r23, 0(r16) C
- lda r16, 8(r16) C
- ldq r3, 0(r17) C
-$6mod8: ldq r1, 8(r17) C
- mulq r19, r3, r25 C
- umulh r19, r3, r3 C
- mulq r19, r1, r28 C
- ldq r0, 16(r17) C
- ldq r4, 0(r16) C
- umulh r19, r1, r8 C
- ldq r1, 24(r17) C
- lda r17, 48(r17) C L1 bookkeeping
- mulq r19, r0, r2 C
- ldq r5, 8(r16) C
- lda r16, -32(r16) C L1 bookkeeping
- umulh r19, r0, r6 C
- addq r4, r25, r4 C lo + acc
- mulq r19, r1, r7 C
- br r31, $ent6 C
-
-$ent1: lda r17, 8(r17) C
- lda r16, 8(r16) C
- lda r8, 0(r0) C
- ldq r3, 0(r17) C
-$0mod8: ldq r1, 8(r17) C
- mulq r19, r3, r2 C
- umulh r19, r3, r6 C
- mulq r19, r1, r7 C
- ldq r0, 16(r17) C
- ldq r4, 0(r16) C
- umulh r19, r1, r24 C
- ldq r1, 24(r17) C
- mulq r19, r0, r25 C
- ldq r5, 8(r16) C
- umulh r19, r0, r3 C
- addq r4, r2, r4 C lo + acc
- mulq r19, r1, r28 C
- lda r16, -16(r16) C
- br r31, $ent0 C
-
-$3mod8: ldq r5, 0(r16) C
- lda r17, 8(r17) C
- mulq r19, r3, r7 C
- umulh r19, r3, r8 C
- addq r5, r7, r23 C
- cmpult r23, r7, r20 C
- addq r8, r20, r24 C
- stq r23, 0(r16) C
- lda r16, 8(r16) C
- ldq r3, 0(r17) C
-$2mod8: ldq r1, 8(r17) C
- mulq r19, r3, r25 C
- umulh r19, r3, r3 C
- mulq r19, r1, r28 C
- ble r18, $n23 C
- ldq r0, 16(r17) C
- ldq r4, 0(r16) C
- umulh r19, r1, r8 C
- ldq r1, 24(r17) C
- lda r17, 16(r17) C L1 bookkeeping
- mulq r19, r0, r2 C
- ldq r5, 8(r16) C
- lda r16, 0(r16) C L1 bookkeeping
- umulh r19, r0, r6 C
- addq r4, r25, r4 C lo + acc
- mulq r19, r1, r7 C
- br r31, $ent2 C
-
-$5mod8: ldq r5, 0(r16) C
- lda r17, 8(r17) C
- mulq r19, r3, r7 C
- umulh r19, r3, r24 C
- addq r5, r7, r23 C
- cmpult r23, r7, r20 C
- addq r24, r20, r8 C
- stq r23, 0(r16) C
- lda r16, 8(r16) C
- ldq r3, 0(r17) C
-$4mod8: ldq r1, 8(r17) C
- mulq r19, r3, r2 C
- umulh r19, r3, r6 C
- mulq r19, r1, r7 C
- ldq r0, 16(r17) C
- ldq r4, 0(r16) C
- umulh r19, r1, r24 C
- ldq r1, 24(r17) C
- lda r17, 32(r17) C L1 bookkeeping
- mulq r19, r0, r25 C
- ldq r5, 8(r16) C
- lda r16, 16(r16) C L1 bookkeeping
- umulh r19, r0, r3 C
- addq r4, r2, r4 C lo + acc
- mulq r19, r1, r28 C
- cmpult r4, r2, r20 C L0 lo add => carry
- addq r4, r8, r22 C U0 hi add => answer
- ble r18, $Lend C
-ALIGN(16)
-$Loop:
- bis r31, r31, r31 C U1 mt
- cmpult r22, r8, r21 C L0 hi add => carry
- addq r6, r20, r6 C U0 hi mul + carry
- ldq r0, 0(r17) C
-
- bis r31, r31, r31 C U1 mt
- addq r5, r7, r23 C L0 lo + acc
- addq r6, r21, r6 C U0 hi mul + carry
- ldq r4, 0(r16) C L1
-
- umulh r19, r1, r8 C U1
- cmpult r23, r7, r20 C L0 lo add => carry
- addq r23, r6, r23 C U0 hi add => answer
- ldq r1, 8(r17) C L1
-
- mulq r19, r0, r2 C U1
- cmpult r23, r6, r21 C L0 hi add => carry
- addq r24, r20, r24 C U0 hi mul + carry
- ldq r5, 8(r16) C L1
-
- umulh r19, r0, r6 C U1
- addq r4, r25, r4 C U0 lo + acc
- stq r22, -16(r16) C L0
- stq r23, -8(r16) C L1
-
- bis r31, r31, r31 C L0 st slosh
- mulq r19, r1, r7 C U1
- bis r31, r31, r31 C L1 st slosh
- addq r24, r21, r24 C U0 hi mul + carry
-$ent2:
- cmpult r4, r25, r20 C L0 lo add => carry
- bis r31, r31, r31 C U1 mt
- lda r18, -8(r18) C L1 bookkeeping
- addq r4, r24, r22 C U0 hi add => answer
-
- bis r31, r31, r31 C U1 mt
- cmpult r22, r24, r21 C L0 hi add => carry
- addq r3, r20, r3 C U0 hi mul + carry
- ldq r0, 16(r17) C L1
-
- bis r31, r31, r31 C U1 mt
- addq r5, r28, r23 C L0 lo + acc
- addq r3, r21, r3 C U0 hi mul + carry
- ldq r4, 16(r16) C L1
-
- umulh r19, r1, r24 C U1
- cmpult r23, r28, r20 C L0 lo add => carry
- addq r23, r3, r23 C U0 hi add => answer
- ldq r1, 24(r17) C L1
-
- mulq r19, r0, r25 C U1
- cmpult r23, r3, r21 C L0 hi add => carry
- addq r8, r20, r8 C U0 hi mul + carry
- ldq r5, 24(r16) C L1
-
- umulh r19, r0, r3 C U1
- addq r4, r2, r4 C U0 lo + acc
- stq r22, 0(r16) C L0
- stq r23, 8(r16) C L1
-
- bis r31, r31, r31 C L0 st slosh
- mulq r19, r1, r28 C U1
- bis r31, r31, r31 C L1 st slosh
- addq r8, r21, r8 C L0 hi mul + carry
-$ent0:
- cmpult r4, r2, r20 C L0 lo add => carry
- bis r31, r31, r31 C U1 mt
- lda r17, 64(r17) C L1 bookkeeping
- addq r4, r8, r22 C U0 hi add => answer
-
- bis r31, r31, r31 C U1 mt
- cmpult r22, r8, r21 C L0 hi add => carry
- addq r6, r20, r6 C U0 hi mul + carry
- ldq r0, -32(r17) C L1
-
- bis r31, r31, r31 C U1 mt
- addq r5, r7, r23 C L0 lo + acc
- addq r6, r21, r6 C U0 hi mul + carry
- ldq r4, 32(r16) C L1
-
- umulh r19, r1, r8 C U1
- cmpult r23, r7, r20 C L0 lo add => carry
- addq r23, r6, r23 C U0 hi add => answer
- ldq r1, -24(r17) C L1
-
- mulq r19, r0, r2 C U1
- cmpult r23, r6, r21 C L0 hi add => carry
- addq r24, r20, r24 C U0 hi mul + carry
- ldq r5, 40(r16) C L1
-
- umulh r19, r0, r6 C U1
- addq r4, r25, r4 C U0 lo + acc
- stq r22, 16(r16) C L0
- stq r23, 24(r16) C L1
-
- bis r31, r31, r31 C L0 st slosh
- mulq r19, r1, r7 C U1
- bis r31, r31, r31 C L1 st slosh
- addq r24, r21, r24 C U0 hi mul + carry
-$ent6:
- cmpult r4, r25, r20 C L0 lo add => carry
- bis r31, r31, r31 C U1 mt
- lda r16, 64(r16) C L1 bookkeeping
- addq r4, r24, r22 C U0 hi add => answer
-
- bis r31, r31, r31 C U1 mt
- cmpult r22, r24, r21 C L0 hi add => carry
- addq r3, r20, r3 C U0 hi mul + carry
- ldq r0, -16(r17) C L1
-
- bis r31, r31, r31 C U1 mt
- addq r5, r28, r23 C L0 lo + acc
- addq r3, r21, r3 C U0 hi mul + carry
- ldq r4, -16(r16) C L1
-
- umulh r19, r1, r24 C U1
- cmpult r23, r28, r20 C L0 lo add => carry
- addq r23, r3, r23 C U0 hi add => answer
- ldq r1, -8(r17) C L1
-
- mulq r19, r0, r25 C U1
- cmpult r23, r3, r21 C L0 hi add => carry
- addq r8, r20, r8 C U0 hi mul + carry
- ldq r5, -8(r16) C L1
-
- umulh r19, r0, r3 C U1
- addq r4, r2, r4 C L0 lo + acc
- stq r22, -32(r16) C L0
- stq r23, -24(r16) C L1
-
- bis r31, r31, r31 C L0 st slosh
- mulq r19, r1, r28 C U1
- bis r31, r31, r31 C L1 st slosh
- addq r8, r21, r8 C U0 hi mul + carry
-
- cmpult r4, r2, r20 C L0 lo add => carry
- addq r4, r8, r22 C U0 hi add => answer
- ldl r31, 256(r17) C prefetch up[]
- bgt r18, $Loop C U1 bookkeeping
-
-$Lend: cmpult r22, r8, r21 C
- addq r6, r20, r6 C
- addq r5, r7, r23 C
- addq r6, r21, r6 C
- ldq r4, 0(r16) C
- umulh r19, r1, r8 C
- cmpult r23, r7, r20 C
- addq r23, r6, r23 C
- cmpult r23, r6, r21 C
- addq r24, r20, r24 C
- ldq r5, 8(r16) C
- addq r4, r25, r4 C
- stq r22, -16(r16) C
- stq r23, -8(r16) C
- addq r24, r21, r24 C
- cmpult r4, r25, r20 C
- addq r4, r24, r22 C
- cmpult r22, r24, r21 C
- addq r3, r20, r3 C
- addq r5, r28, r23 C
- addq r3, r21, r3 C
- cmpult r23, r28, r20 C
- addq r23, r3, r23 C
- cmpult r23, r3, r21 C
- addq r8, r20, r8 C
- stq r22, 0(r16) C
- stq r23, 8(r16) C
- addq r8, r21, r0 C
- ret r31, (r26), 1 C
-
-$n23: ldq r4, 0(r16) C
- ldq r5, 8(r16) C
- umulh r19, r1, r8 C
- addq r4, r25, r4 C
- cmpult r4, r25, r20 C
- addq r4, r24, r22 C
- cmpult r22, r24, r21 C
- addq r3, r20, r3 C
- addq r5, r28, r23 C
- addq r3, r21, r3 C
- cmpult r23, r28, r20 C
- addq r23, r3, r23 C
- cmpult r23, r3, r21 C
- addq r8, r20, r8 C
- stq r22, 0(r16) C
- stq r23, 8(r16) C
- addq r8, r21, r0 C
- ret r31, (r26), 1 C
-EPILOGUE()
-ASM_END()
diff --git a/mpn/alpha/ev6/aorsmul_1.asm b/mpn/alpha/ev6/aorsmul_1.asm
new file mode 100644
index 000000000..eda092b2d
--- /dev/null
+++ b/mpn/alpha/ev6/aorsmul_1.asm
@@ -0,0 +1,387 @@
+dnl Alpha ev6 mpn_addmul_1 and mpn_submul_1.
+
+dnl Copyright 2000, 2003, 2004, 2005, 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 42
+C EV5: 18
+C EV6: 3.5
+
+C INPUT PARAMETERS
+define(`rp', `r16')
+define(`up', `r17')
+define(`n', `r18')
+define(`v0', `r19')
+
+dnl This code was written in cooperation with ev6 pipeline expert Steve Root.
+
+dnl The stores can issue a cycle late so we have paired no-op's to 'catch'
+dnl them, so that further disturbance to the schedule is damped.
+
+dnl We couldn't pair the loads, because the entangled schedule of the carries
+dnl has to happen on one side {0} of the machine.
+
+dnl This is a great schedule for the d_cache, a poor schedule for the b_cache.
+dnl The lockup on U0 means that any stall can't be recovered from. Consider
+dnl an ldq in L1; say that load gets stalled because it collides with a fill
+dnl from the b_cache. On the next cycle, this load gets priority. It first
+dnl looks at L0, and goes there. The instruction we intended for L0 gets to
+dnl look at L1, which is NOT where we want it. It either stalls a cycle,
+dnl because it can't go in L0, or goes there and causes a further instruction
+dnl to stall.
+
+dnl So for b_cache, we're likely going to want to put one or more cycles back
+dnl into the code! And, of course, put in lds prefetch for the rp[] operand.
+dnl At a place where we have an mt followed by a bookkeeping, put the
+dnl bookkeeping in upper, and the prefetch into lower.
+
+dnl Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd
+dnl like not to have an ldq or an stq precede a conditional branch in a
+dnl quadpack. The conditional branch moves the retire pointer one cycle
+dnl later.
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `addq')
+ define(`CMPCY', `cmpult $2,$1')
+ define(`func', `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `subq')
+ define(`CMPCY', `cmpult $1,$2')
+ define(`func', `mpn_submul_1')
+')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+PROLOGUE(func)
+ ldq r3, 0(up) C
+ and r18, 7, r20 C
+ lda r18, -9(r18) C
+ cmpeq r20, 1, r21 C
+ beq r21, $L1 C
+
+$1mod8: ldq r5, 0(rp) C
+ mulq v0, r3, r7 C
+ umulh v0, r3, r8 C
+ ADDSUB r5, r7, r23 C
+ CMPCY( r5, r23), r20 C
+ addq r8, r20, r0 C
+ stq r23, 0(rp) C
+ bge r18, $ent1 C
+ ret r31, (r26), 1 C
+
+$L1: lda r8, 0(r31) C zero carry reg
+ lda r24, 0(r31) C zero carry reg
+ cmpeq r20, 2, r21 C
+ bne r21, $2mod8 C
+ cmpeq r20, 3, r21 C
+ bne r21, $3mod8 C
+ cmpeq r20, 4, r21 C
+ bne r21, $4mod8 C
+ cmpeq r20, 5, r21 C
+ bne r21, $5mod8 C
+ cmpeq r20, 6, r21 C
+ bne r21, $6mod8 C
+ cmpeq r20, 7, r21 C
+ beq r21, $0mod8 C
+
+$7mod8: ldq r5, 0(rp) C
+ lda up, 8(up) C
+ mulq v0, r3, r7 C
+ umulh v0, r3, r24 C
+ ADDSUB r5, r7, r23 C
+ CMPCY( r5, r23), r20 C
+ addq r24, r20, r24 C
+ stq r23, 0(rp) C
+ lda rp, 8(rp) C
+ ldq r3, 0(up) C
+$6mod8: ldq r1, 8(up) C
+ mulq v0, r3, r25 C
+ umulh v0, r3, r3 C
+ mulq v0, r1, r28 C
+ ldq r0, 16(up) C
+ ldq r4, 0(rp) C
+ umulh v0, r1, r8 C
+ ldq r1, 24(up) C
+ lda up, 48(up) C L1 bookkeeping
+ mulq v0, r0, r2 C
+ ldq r5, 8(rp) C
+ lda rp, -32(rp) C L1 bookkeeping
+ umulh v0, r0, r6 C
+ ADDSUB r4, r25, r25 C lo + acc
+ mulq v0, r1, r7 C
+ br r31, $ent6 C
+
+$ent1: lda up, 8(up) C
+ lda rp, 8(rp) C
+ lda r8, 0(r0) C
+ ldq r3, 0(up) C
+$0mod8: ldq r1, 8(up) C
+ mulq v0, r3, r2 C
+ umulh v0, r3, r6 C
+ mulq v0, r1, r7 C
+ ldq r0, 16(up) C
+ ldq r4, 0(rp) C
+ umulh v0, r1, r24 C
+ ldq r1, 24(up) C
+ mulq v0, r0, r25 C
+ ldq r5, 8(rp) C
+ umulh v0, r0, r3 C
+ ADDSUB r4, r2, r2 C lo + acc
+ mulq v0, r1, r28 C
+ lda rp, -16(rp) C
+ br r31, $ent0 C
+
+$3mod8: ldq r5, 0(rp) C
+ lda up, 8(up) C
+ mulq v0, r3, r7 C
+ umulh v0, r3, r8 C
+ ADDSUB r5, r7, r23 C
+ CMPCY( r5, r23), r20 C
+ addq r8, r20, r24 C
+ stq r23, 0(rp) C
+ lda rp, 8(rp) C
+ ldq r3, 0(up) C
+$2mod8: ldq r1, 8(up) C
+ mulq v0, r3, r25 C
+ umulh v0, r3, r3 C
+ mulq v0, r1, r28 C
+ ble r18, $n23 C
+ ldq r0, 16(up) C
+ ldq r4, 0(rp) C
+ umulh v0, r1, r8 C
+ ldq r1, 24(up) C
+ lda up, 16(up) C L1 bookkeeping
+ mulq v0, r0, r2 C
+ ldq r5, 8(rp) C
+ lda rp, 0(rp) C L1 bookkeeping
+ umulh v0, r0, r6 C
+ ADDSUB r4, r25, r25 C lo + acc
+ mulq v0, r1, r7 C
+ br r31, $ent2 C
+
+$5mod8: ldq r5, 0(rp) C
+ lda up, 8(up) C
+ mulq v0, r3, r7 C
+ umulh v0, r3, r24 C
+ ADDSUB r5, r7, r23 C
+ CMPCY( r5, r23), r20 C
+ addq r24, r20, r8 C
+ stq r23, 0(rp) C
+ lda rp, 8(rp) C
+ ldq r3, 0(up) C
+$4mod8: ldq r1, 8(up) C
+ mulq v0, r3, r2 C
+ umulh v0, r3, r6 C
+ mulq v0, r1, r7 C
+ ldq r0, 16(up) C
+ ldq r4, 0(rp) C
+ umulh v0, r1, r24 C
+ ldq r1, 24(up) C
+ lda up, 32(up) C L1 bookkeeping
+ mulq v0, r0, r25 C
+ ldq r5, 8(rp) C
+ lda rp, 16(rp) C L1 bookkeeping
+ umulh v0, r0, r3 C
+ ADDSUB r4, r2, r2 C lo + acc
+ mulq v0, r1, r28 C
+ CMPCY( r4, r2), r20 C L0 lo add => carry
+ ADDSUB r2, r8, r22 C U0 hi add => answer
+ ble r18, $Lend C
+ ALIGN(16)
+$Loop:
+ bis r31, r31, r31 C U1 mt
+ CMPCY( r2, r22), r21 C L0 hi add => carry
+ addq r6, r20, r6 C U0 hi mul + carry
+ ldq r0, 0(up) C
+
+ bis r31, r31, r31 C U1 mt
+ ADDSUB r5, r7, r7 C L0 lo + acc
+ addq r6, r21, r6 C U0 hi mul + carry
+ ldq r4, 0(rp) C L1
+
+ umulh v0, r1, r8 C U1
+ CMPCY( r5, r7), r20 C L0 lo add => carry
+ ADDSUB r7, r6, r23 C U0 hi add => answer
+ ldq r1, 8(up) C L1
+
+ mulq v0, r0, r2 C U1
+ CMPCY( r7, r23), r21 C L0 hi add => carry
+ addq r24, r20, r24 C U0 hi mul + carry
+ ldq r5, 8(rp) C L1
+
+ umulh v0, r0, r6 C U1
+ ADDSUB r4, r25, r25 C U0 lo + acc
+ stq r22, -16(rp) C L0
+ stq r23, -8(rp) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq v0, r1, r7 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r24, r21, r24 C U0 hi mul + carry
+$ent2:
+ CMPCY( r4, r25), r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda r18, -8(r18) C L1 bookkeeping
+ ADDSUB r25, r24, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ CMPCY( r25, r22), r21 C L0 hi add => carry
+ addq r3, r20, r3 C U0 hi mul + carry
+ ldq r0, 16(up) C L1
+
+ bis r31, r31, r31 C U1 mt
+ ADDSUB r5, r28, r28 C L0 lo + acc
+ addq r3, r21, r3 C U0 hi mul + carry
+ ldq r4, 16(rp) C L1
+
+ umulh v0, r1, r24 C U1
+ CMPCY( r5, r28), r20 C L0 lo add => carry
+ ADDSUB r28, r3, r23 C U0 hi add => answer
+ ldq r1, 24(up) C L1
+
+ mulq v0, r0, r25 C U1
+ CMPCY( r28, r23), r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ ldq r5, 24(rp) C L1
+
+ umulh v0, r0, r3 C U1
+ ADDSUB r4, r2, r2 C U0 lo + acc
+ stq r22, 0(rp) C L0
+ stq r23, 8(rp) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq v0, r1, r28 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r8, r21, r8 C U0 hi mul + carry
+$ent0:
+ CMPCY( r4, r2), r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda up, 64(up) C L1 bookkeeping
+ ADDSUB r2, r8, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ CMPCY( r2, r22), r21 C L0 hi add => carry
+ addq r6, r20, r6 C U0 hi mul + carry
+ ldq r0, -32(up) C L1
+
+ bis r31, r31, r31 C U1 mt
+ ADDSUB r5, r7, r7 C L0 lo + acc
+ addq r6, r21, r6 C U0 hi mul + carry
+ ldq r4, 32(rp) C L1
+
+ umulh v0, r1, r8 C U1
+ CMPCY( r5, r7), r20 C L0 lo add => carry
+ ADDSUB r7, r6, r23 C U0 hi add => answer
+ ldq r1, -24(up) C L1
+
+ mulq v0, r0, r2 C U1
+ CMPCY( r7, r23), r21 C L0 hi add => carry
+ addq r24, r20, r24 C U0 hi mul + carry
+ ldq r5, 40(rp) C L1
+
+ umulh v0, r0, r6 C U1
+ ADDSUB r4, r25, r25 C U0 lo + acc
+ stq r22, 16(rp) C L0
+ stq r23, 24(rp) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq v0, r1, r7 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r24, r21, r24 C U0 hi mul + carry
+$ent6:
+ CMPCY( r4, r25), r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda rp, 64(rp) C L1 bookkeeping
+ ADDSUB r25, r24, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ CMPCY( r25, r22), r21 C L0 hi add => carry
+ addq r3, r20, r3 C U0 hi mul + carry
+ ldq r0, -16(up) C L1
+
+ bis r31, r31, r31 C U1 mt
+ ADDSUB r5, r28, r28 C L0 lo + acc
+ addq r3, r21, r3 C U0 hi mul + carry
+ ldq r4, -16(rp) C L1
+
+ umulh v0, r1, r24 C U1
+ CMPCY( r5, r28), r20 C L0 lo add => carry
+ ADDSUB r28, r3, r23 C U0 hi add => answer
+ ldq r1, -8(up) C L1
+
+ mulq v0, r0, r25 C U1
+ CMPCY( r28, r23), r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ ldq r5, -8(rp) C L1
+
+ umulh v0, r0, r3 C U1
+ ADDSUB r4, r2, r2 C U0 lo + acc
+ stq r22, -32(rp) C L0
+ stq r23, -24(rp) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq v0, r1, r28 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r8, r21, r8 C U0 hi mul + carry
+
+ CMPCY( r4, r2), r20 C L0 lo add => carry
+ ADDSUB r2, r8, r22 C U0 hi add => answer
+ ldl r31, 256(up) C prefetch up[]
+ bgt r18, $Loop C U1 bookkeeping
+
+$Lend: CMPCY( r2, r22), r21 C
+ addq r6, r20, r6 C
+ ADDSUB r5, r7, r7 C
+ addq r6, r21, r6 C
+ ldq r4, 0(rp) C
+ umulh v0, r1, r8 C
+ CMPCY( r5, r7), r20 C
+ ADDSUB r7, r6, r23 C
+ CMPCY(r7, r23), r21 C
+ addq r24, r20, r24 C
+ ldq r5, 8(rp) C
+ ADDSUB r4, r25, r25 C
+ stq r22, -16(rp) C
+ stq r23, -8(rp) C
+ addq r24, r21, r24 C
+ br L(x)
+
+ ALIGN(16)
+$n23: ldq r4, 0(rp) C
+ ldq r5, 8(rp) C
+ umulh v0, r1, r8 C
+ ADDSUB r4, r25, r25 C
+L(x): CMPCY( r4, r25), r20 C
+ ADDSUB r25, r24, r22 C
+ CMPCY( r25, r22), r21 C
+ addq r3, r20, r3 C
+ ADDSUB r5, r28, r28 C
+ addq r3, r21, r3 C
+ CMPCY( r5, r28), r20 C
+ ADDSUB r28, r3, r23 C
+ CMPCY( r28, r23), r21 C
+ addq r8, r20, r8 C
+ stq r22, 0(rp) C
+ stq r23, 8(rp) C
+ addq r8, r21, r0 C
+ ret r31, (r26), 1 C
+EPILOGUE()
+ASM_END()
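
The only functional difference between the two operations in the file above is the ADDSUB/CMPCY macro pair: addmul uses addq and detects the carry with `cmpult $2,$1' (did the sum wrap below its addend?), while submul uses subq and detects the borrow with `cmpult $1,$2' (did the difference wrap above its minuend?). A tiny C illustration of those two unsigned-compare conventions, assuming 64-bit limbs (the variable names are invented for the example):

#include <assert.h>
#include <stdint.h>

int main (void)
{
  /* addmul_1 convention: after s = a + b, a carry occurred iff s < a
     (this is what CMPCY(a,s), i.e. cmpult s,a, computes).  */
  uint64_t a = UINT64_MAX - 1, b = 5;
  uint64_t s = a + b;                 /* wraps to 3 */
  assert ((s < a) == 1);

  /* submul_1 convention: after d = a - b, a borrow occurred iff a < d
     (this is what CMPCY(a,d), i.e. cmpult a,d, computes).  */
  a = 3;
  uint64_t d = a - b;                 /* wraps to 2^64 - 2 */
  assert ((a < d) == 1);

  return 0;
}
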
diff --git a/mpn/alpha/ev6/submul_1.asm b/mpn/alpha/ev6/submul_1.asm
deleted file mode 100644
index 01abad6d7..000000000
--- a/mpn/alpha/ev6/submul_1.asm
+++ /dev/null
@@ -1,471 +0,0 @@
-dnl Alpha ev6 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
-dnl the result from a second limb vector.
-
-dnl Copyright 2000, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of the GNU Lesser General Public License as published
-dnl by the Free Software Foundation; either version 3 of the License, or (at
-dnl your option) any later version.
-
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-dnl License for more details.
-
-dnl You should have received a copy of the GNU Lesser General Public License
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-dnl INPUT PARAMETERS
-dnl res_ptr r16
-dnl s1_ptr r17
-dnl size r18
-dnl s2_limb r19
-
-dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
-dnl exactly 3.5 cycles/limb on EV6...
-
-dnl This code was written in close cooperation with ev6 pipeline expert
-dnl Steve Root. Any errors are tege's fault, though.
-dnl
-dnl Register usages for unrolled loop:
-dnl 0-3 mul's
-dnl 4-7 acc's
-dnl 8-15 mul results
-dnl 20,21 carry's
-dnl 22,23 save for stores
-
-dnl Sustains 8 mul-adds in 28 cycles in the unrolled inner loop.
-
-dnl The stores can issue a cycle late so we have paired no-op's to 'catch'
-dnl them, so that further disturbance to the schedule is damped.
-
-dnl We couldn't pair the loads, because the entangled schedule of the
-dnl carry's has to happen on one side {0} of the machine. Note, the total
-dnl use of U0, and the total use of L0 (after attending to the stores).
-dnl which is part of the reason why....
-
-dnl This is a great schedule for the d_cache, a poor schedule for the
-dnl b_cache. The lockup on U0 means that any stall can't be recovered
-dnl from. Consider a ldq in L1. say that load gets stalled because it
-dnl collides with a fill from the b_Cache. On the next cycle, this load
-dnl gets priority. If first looks at L0, and goes there. The instruction
-dnl we intended for L0 gets to look at L1, which is NOT where we want
-dnl it. It either stalls 1, because it can't go in L0, or goes there, and
-dnl causes a further instruction to stall.
-
-dnl So for b_cache, we're likely going to want to put one or more cycles
-dnl back into the code! And, of course, put in prefetches. For the
-dnl accumulator, lds, intent to modify. For the multiplier, you might
-dnl want ldq, evict next, if you're not wanting to use it again soon. Use
-dnl 256 ahead of present pointer value. At a place where we have an mt
-dnl followed by a bookkeeping, put the bookkeeping in upper, and the
-dnl prefetch into lower.
-
-dnl Note, the usage of physical registers per cycle is smoothed off, as
-dnl much as possible.
-
-dnl Note, the ldq's and stq's are at the end of the quadpacks. note, we'd
-dnl like not to have a ldq or stq to preceded a conditional branch in a
-dnl quadpack. The conditional branch moves the retire pointer one cycle
-dnl later.
-
-dnl Optimization notes:
-dnl Callee-saves regs: r9 r10 r11 r12 r13 r14 r15 r26 ?r27?
-dnl Reserved regs: r29 r30 r31
-dnl Free caller-saves regs in unrolled code: r24 r25 r28
-dnl We should swap some of the callee-saves regs for some of the free
-dnl caller-saves regs, saving some overhead cycles.
-dnl Most importantly, we should write fast code for the 0-7 case.
-dnl The code we use there are for the 21164, and runs at 7 cycles/limb
-dnl on the 21264. Should not be hard, if we write specialized code for
-dnl 1-7 limbs (the one for 0 limbs should be straightforward). We then just
-dnl need a jump table indexed by the low 3 bits of the count argument.
-
-
-ASM_START()
-PROLOGUE(mpn_submul_1)
- cmpult r18, 8, r1
- beq r1, $Large
-
- ldq r2, 0(r17) C r2 = s1_limb
- addq r17, 8, r17 C s1_ptr++
- subq r18, 1, r18 C size--
- mulq r2, r19, r3 C r3 = prod_low
- ldq r5, 0(r16) C r5 = *res_ptr
- umulh r2, r19, r0 C r0 = prod_high
- beq r18, $Lend0b C jump if size was == 1
- ldq r2, 0(r17) C r2 = s1_limb
- addq r17, 8, r17 C s1_ptr++
- subq r18, 1, r18 C size--
- subq r5, r3, r3
- cmpult r5, r3, r4
- stq r3, 0(r16)
- addq r16, 8, r16 C res_ptr++
- beq r18, $Lend0a C jump if size was == 2
-
- ALIGN(8)
-$Loop0: mulq r2, r19, r3 C r3 = prod_low
- ldq r5, 0(r16) C r5 = *res_ptr
- addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
- subq r18, 1, r18 C size--
- umulh r2, r19, r4 C r4 = cy_limb
- ldq r2, 0(r17) C r2 = s1_limb
- addq r17, 8, r17 C s1_ptr++
- addq r3, r0, r3 C r3 = cy_limb + prod_low
- cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
- subq r5, r3, r3
- cmpult r5, r3, r5
- stq r3, 0(r16)
- addq r16, 8, r16 C res_ptr++
- addq r5, r0, r0 C combine carries
- bne r18, $Loop0
-$Lend0a:
- mulq r2, r19, r3 C r3 = prod_low
- ldq r5, 0(r16) C r5 = *res_ptr
- addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
- umulh r2, r19, r4 C r4 = cy_limb
- addq r3, r0, r3 C r3 = cy_limb + prod_low
- cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
- subq r5, r3, r3
- cmpult r5, r3, r5
- stq r3, 0(r16)
- addq r5, r0, r0 C combine carries
- addq r4, r0, r0 C cy_limb = prod_high + cy
- ret r31, (r26), 1
-$Lend0b:
- subq r5, r3, r3
- cmpult r5, r3, r5
- stq r3, 0(r16)
- addq r0, r5, r0
- ret r31, (r26), 1
-
-$Large:
- lda $30, -240($30)
- stq $9, 8($30)
- stq $10, 16($30)
- stq $11, 24($30)
- stq $12, 32($30)
- stq $13, 40($30)
- stq $14, 48($30)
- stq $15, 56($30)
-
- and r18, 7, r20 C count for the first loop, 0-7
- srl r18, 3, r18 C count for unrolled loop
- bis r31, r31, r0
- beq r20, $Lunroll
- ldq r2, 0(r17) C r2 = s1_limb
- addq r17, 8, r17 C s1_ptr++
- subq r20, 1, r20 C size--
- mulq r2, r19, r3 C r3 = prod_low
- ldq r5, 0(r16) C r5 = *res_ptr
- umulh r2, r19, r0 C r0 = prod_high
- beq r20, $Lend1b C jump if size was == 1
- ldq r2, 0(r17) C r2 = s1_limb
- addq r17, 8, r17 C s1_ptr++
- subq r20, 1, r20 C size--
- subq r5, r3, r3
- cmpult r5, r3, r4
- stq r3, 0(r16)
- addq r16, 8, r16 C res_ptr++
- beq r20, $Lend1a C jump if size was == 2
-
- ALIGN(8)
-$Loop1: mulq r2, r19, r3 C r3 = prod_low
- ldq r5, 0(r16) C r5 = *res_ptr
- addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
- subq r20, 1, r20 C size--
- umulh r2, r19, r4 C r4 = cy_limb
- ldq r2, 0(r17) C r2 = s1_limb
- addq r17, 8, r17 C s1_ptr++
- addq r3, r0, r3 C r3 = cy_limb + prod_low
- cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
- subq r5, r3, r3
- cmpult r5, r3, r5
- stq r3, 0(r16)
- addq r16, 8, r16 C res_ptr++
- addq r5, r0, r0 C combine carries
- bne r20, $Loop1
-
-$Lend1a:
- mulq r2, r19, r3 C r3 = prod_low
- ldq r5, 0(r16) C r5 = *res_ptr
- addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
- umulh r2, r19, r4 C r4 = cy_limb
- addq r3, r0, r3 C r3 = cy_limb + prod_low
- cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
- subq r5, r3, r3
- cmpult r5, r3, r5
- stq r3, 0(r16)
- addq r16, 8, r16 C res_ptr++
- addq r5, r0, r0 C combine carries
- addq r4, r0, r0 C cy_limb = prod_high + cy
- br r31, $Lunroll
-$Lend1b:
- subq r5, r3, r3
- cmpult r5, r3, r5
- stq r3, 0(r16)
- addq r16, 8, r16 C res_ptr++
- addq r0, r5, r0
-
-$Lunroll:
- lda r17, -16(r17) C L1 bookkeeping
- lda r16, -16(r16) C L1 bookkeeping
- bis r0, r31, r12
-
-C ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
-
- ldq r2, 16(r17) C L1
- ldq r3, 24(r17) C L1
- lda r18, -1(r18) C L1 bookkeeping
- ldq r6, 16(r16) C L1
- ldq r7, 24(r16) C L1
- ldq r0, 32(r17) C L1
- mulq r19, r2, r13 C U1
- ldq r1, 40(r17) C L1
- umulh r19, r2, r14 C U1
- mulq r19, r3, r15 C U1
- lda r17, 64(r17) C L1 bookkeeping
- ldq r4, 32(r16) C L1
- ldq r5, 40(r16) C L1
- umulh r19, r3, r8 C U1
- ldq r2, -16(r17) C L1
- mulq r19, r0, r9 C U1
- ldq r3, -8(r17) C L1
- umulh r19, r0, r10 C U1
- subq r6, r13, r13 C L0 lo + acc
- mulq r19, r1, r11 C U1
- cmpult r6, r13, r20 C L0 lo add => carry
- lda r16, 64(r16) C L1 bookkeeping
- subq r13, r12, r22 C U0 hi add => answer
- cmpult r13, r12, r21 C L0 hi add => carry
- addq r14, r20, r14 C U0 hi mul + carry
- ldq r6, -16(r16) C L1
- subq r7, r15, r28 C L0 lo + acc
- addq r14, r21, r14 C U0 hi mul + carry
- cmpult r7, r15, r20 C L0 lo add => carry
- ldq r7, -8(r16) C L1
- umulh r19, r1, r12 C U1
- subq r28, r14, r23 C U0 hi add => answer
- ldq r0, 0(r17) C L1
- mulq r19, r2, r13 C U1
- cmpult r28, r14, r21 C L0 hi add => carry
- addq r8, r20, r8 C U0 hi mul + carry
- ldq r1, 8(r17) C L1
- umulh r19, r2, r14 C U1
- subq r4, r9, r9 C L0 lo + acc
- stq r22, -48(r16) C L0
- stq r23, -40(r16) C L1
- mulq r19, r3, r15 C U1
- addq r8, r21, r8 C U0 hi mul + carry
- cmpult r4, r9, r20 C L0 lo add => carry
- subq r9, r8, r22 C U0 hi add => answer
- ble r18, $Lend C U1 bookkeeping
-
-C ____ MAIN UNROLLED LOOP ____
- ALIGN(16)
-$Loop:
- bis r31, r31, r31 C U1 mt
- cmpult r9, r8, r21 C L0 hi add => carry
- addq r10, r20, r10 C U0 hi mul + carry
- ldq r4, 0(r16) C L1
-
- bis r31, r31, r31 C U1 mt
- subq r5, r11, r23 C L0 lo + acc
- addq r10, r21, r10 C L0 hi mul + carry
- ldq r2, 16(r17) C L1
-
- umulh r19, r3, r8 C U1
- cmpult r5, r11, r20 C L0 lo add => carry
- subq r23, r10, r28 C U0 hi add => answer
- ldq r5, 8(r16) C L1
-
- mulq r19, r0, r9 C U1
- cmpult r23, r10, r21 C L0 hi add => carry
- addq r12, r20, r12 C U0 hi mul + carry
- ldq r3, 24(r17) C L1
-
- umulh r19, r0, r10 C U1
- subq r6, r13, r13 C U0 lo + acc
- stq r22, -32(r16) C L0
- stq r28, -24(r16) C L1
-
- bis r31, r31, r31 C L0 st slosh
- mulq r19, r1, r11 C U1
- bis r31, r31, r31 C L1 st slosh
- addq r12, r21, r12 C U0 hi mul + carry
-
- cmpult r6, r13, r20 C L0 lo add => carry
- bis r31, r31, r31 C U1 mt
- lda r18, -1(r18) C L1 bookkeeping
- subq r13, r12, r22 C U0 hi add => answer
-
- bis r31, r31, r31 C U1 mt
- cmpult r13, r12, r21 C L0 hi add => carry
- addq r14, r20, r14 C U0 hi mul + carry
- ldq r6, 16(r16) C L1
-
- bis r31, r31, r31 C U1 mt
- subq r7, r15, r23 C L0 lo + acc
- addq r14, r21, r14 C U0 hi mul + carry
- ldq r0, 32(r17) C L1
-
- umulh r19, r1, r12 C U1
- cmpult r7, r15, r20 C L0 lo add => carry
- subq r23, r14, r28 C U0 hi add => answer
- ldq r7, 24(r16) C L1
-
- mulq r19, r2, r13 C U1
- cmpult r23, r14, r21 C L0 hi add => carry
- addq r8, r20, r8 C U0 hi mul + carry
- ldq r1, 40(r17) C L1
-
- umulh r19, r2, r14 C U1
- subq r4, r9, r9 C U0 lo + acc
- stq r22, -16(r16) C L0
- stq r28, -8(r16) C L1
-
- bis r31, r31, r31 C L0 st slosh
- mulq r19, r3, r15 C U1
- bis r31, r31, r31 C L1 st slosh
- addq r8, r21, r8 C L0 hi mul + carry
-
- cmpult r4, r9, r20 C L0 lo add => carry
- bis r31, r31, r31 C U1 mt
- lda r17, 64(r17) C L1 bookkeeping
- subq r9, r8, r22 C U0 hi add => answer
-
- bis r31, r31, r31 C U1 mt
- cmpult r9, r8, r21 C L0 hi add => carry
- addq r10, r20, r10 C U0 hi mul + carry
- ldq r4, 32(r16) C L1
-
- bis r31, r31, r31 C U1 mt
- subq r5, r11, r23 C L0 lo + acc
- addq r10, r21, r10 C L0 hi mul + carry
- ldq r2, -16(r17) C L1
-
- umulh r19, r3, r8 C U1
- cmpult r5, r11, r20 C L0 lo add => carry
- subq r23, r10, r28 C U0 hi add => answer
- ldq r5, 40(r16) C L1
-
- mulq r19, r0, r9 C U1
- cmpult r23, r10, r21 C L0 hi add => carry
- addq r12, r20, r12 C U0 hi mul + carry
- ldq r3, -8(r17) C L1
-
- umulh r19, r0, r10 C U1
- subq r6, r13, r13 C U0 lo + acc
- stq r22, 0(r16) C L0
- stq r28, 8(r16) C L1
-
- bis r31, r31, r31 C L0 st slosh
- mulq r19, r1, r11 C U1
- bis r31, r31, r31 C L1 st slosh
- addq r12, r21, r12 C U0 hi mul + carry
-
- cmpult r6, r13, r20 C L0 lo add => carry
- bis r31, r31, r31 C U1 mt
- lda r16, 64(r16) C L1 bookkeeping
- subq r13, r12, r22 C U0 hi add => answer
-
- bis r31, r31, r31 C U1 mt
- cmpult r13, r12, r21 C L0 hi add => carry
- addq r14, r20, r14 C U0 hi mul + carry
- ldq r6, -16(r16) C L1
-
- bis r31, r31, r31 C U1 mt
- subq r7, r15, r23 C L0 lo + acc
- addq r14, r21, r14 C U0 hi mul + carry
- ldq r0, 0(r17) C L1
-
- umulh r19, r1, r12 C U1
- cmpult r7, r15, r20 C L0 lo add => carry
- subq r23, r14, r28 C U0 hi add => answer
- ldq r7, -8(r16) C L1
-
- mulq r19, r2, r13 C U1
- cmpult r23, r14, r21 C L0 hi add => carry
- addq r8, r20, r8 C U0 hi mul + carry
- ldq r1, 8(r17) C L1
-
- umulh r19, r2, r14 C U1
- subq r4, r9, r9 C U0 lo + acc
- stq r22, -48(r16) C L0
- stq r28, -40(r16) C L1
-
- bis r31, r31, r31 C L0 st slosh
- mulq r19, r3, r15 C U1
- bis r31, r31, r31 C L1 st slosh
- addq r8, r21, r8 C U0 hi mul + carry
-
- cmpult r4, r9, r20 C L0 lo add => carry
- subq r9, r8, r22 C U0 hi add => answer
- bis r31, r31, r31 C L1 mt
- bgt r18, $Loop C U1 bookkeeping
-
-C ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
-$Lend:
- cmpult r9, r8, r21 C L0 hi add => carry
- addq r10, r20, r10 C U0 hi mul + carry
- ldq r4, 0(r16) C L1
- subq r5, r11, r23 C L0 lo + acc
- addq r10, r21, r10 C L0 hi mul + carry
- umulh r19, r3, r8 C U1
- cmpult r5, r11, r20 C L0 lo add => carry
- subq r23, r10, r28 C U0 hi add => answer
- ldq r5, 8(r16) C L1
- mulq r19, r0, r9 C U1
- cmpult r23, r10, r21 C L0 hi add => carry
- addq r12, r20, r12 C U0 hi mul + carry
- umulh r19, r0, r10 C U1
- subq r6, r13, r13 C L0 lo + acc
- stq r22, -32(r16) C L0
- stq r28, -24(r16) C L1
- mulq r19, r1, r11 C U1
- addq r12, r21, r12 C U0 hi mul + carry
- cmpult r6, r13, r20 C L0 lo add => carry
- subq r13, r12, r22 C U0 hi add => answer
- cmpult r13, r12, r21 C L0 hi add => carry
- addq r14, r20, r14 C U0 hi mul + carry
- subq r7, r15, r23 C L0 lo + acc
- addq r14, r21, r14 C U0 hi mul + carry
- umulh r19, r1, r12 C U1
- cmpult r7, r15, r20 C L0 lo add => carry
- subq r23, r14, r28 C U0 hi add => answer
- cmpult r23, r14, r21 C L0 hi add => carry
- addq r8, r20, r8 C U0 hi mul + carry
- subq r4, r9, r9 C U0 lo + acc
- stq r22, -16(r16) C L0
- stq r28, -8(r16) C L1
- addq r8, r21, r8 C L0 hi mul + carry
- cmpult r4, r9, r20 C L0 lo add => carry
- subq r9, r8, r22 C U0 hi add => answer
- cmpult r9, r8, r21 C L0 hi add => carry
- addq r10, r20, r10 C U0 hi mul + carry
- subq r5, r11, r23 C L0 lo + acc
- addq r10, r21, r10 C L0 hi mul + carry
- cmpult r5, r11, r20 C L0 lo add => carry
- subq r23, r10, r28 C U0 hi add => answer
- cmpult r23, r10, r21 C L0 hi add => carry
- addq r12, r20, r12 C U0 hi mul + carry
- stq r22, 0(r16) C L0
- stq r28, 8(r16) C L1
- addq r12, r21, r0 C U0 hi mul + carry
-
- ldq $9, 8($30)
- ldq $10, 16($30)
- ldq $11, 24($30)
- ldq $12, 32($30)
- ldq $13, 40($30)
- ldq $14, 48($30)
- ldq $15, 56($30)
- lda $30, 240($30)
- ret r31, (r26), 1
-EPILOGUE(mpn_submul_1)
-ASM_END()
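
For readers mapping the deleted scalar code back to C, one $Loop0/$Loop1 iteration of the old submul_1.asm corresponds to the step below, with the r-registers renamed after the roles its comments assign them. This is a sketch for orientation only; the helper and variable names are invented here, and mulhi stands in for umulh using GCC's unsigned __int128:

#include <stdint.h>

/* umulh stand-in; assumes GCC's unsigned __int128.  */
static inline uint64_t mulhi (uint64_t a, uint64_t b)
{
  return (uint64_t) (((unsigned __int128) a * b) >> 64);
}

/* One $Loop0/$Loop1 iteration: subtract s1_limb * s2_limb from *res_ptr,
   threading cy_limb (high product of the previous step) and cy (the two
   combined carry/borrow bits) through to the next step.  */
static void submul_step (uint64_t *res_ptr, uint64_t s1_limb, uint64_t s2_limb,
                         uint64_t *cy_limb, uint64_t *cy)
{
  uint64_t prod_low = s1_limb * s2_limb;     /* mulq   r2, r19, r3 */
  uint64_t acc      = *cy_limb + *cy;        /* addq   r4, r0, r0  */
  *cy_limb = mulhi (s1_limb, s2_limb);       /* umulh  r2, r19, r4 */
  prod_low += acc;                           /* addq   r3, r0, r3  */
  uint64_t c1  = prod_low < acc;             /* cmpult r3, r0, r0  */
  uint64_t res = *res_ptr - prod_low;        /* subq   r5, r3, r3  */
  uint64_t bw  = *res_ptr < res;             /* cmpult r5, r3, r5  */
  *res_ptr = res;                            /* stq    r3, 0(r16)  */
  *cy      = c1 + bw;                        /* addq   r5, r0, r0  combine carries */
}
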