diff options
author | Torbjorn Granlund <tege@gmplib.org> | 2008-12-14 02:50:47 +0100 |
---|---|---|
committer | Torbjorn Granlund <tege@gmplib.org> | 2008-12-14 02:50:47 +0100 |
commit | 32a23c292182665b54c2d85054ba2491ffc2d8a9 (patch) | |
tree | c660b5c592885a79e1ebfafa3a68e5d99b103cd1 /mpn/alpha | |
parent | eb400c26314d57c9f6192369856796bd7481f00b (diff) | |
download | gmp-32a23c292182665b54c2d85054ba2491ffc2d8a9.tar.gz |
Replace mpn/alpha/ev6/{addmul_1.asm,submul_1.asm} with combined file.
Diffstat (limited to 'mpn/alpha')
-rw-r--r-- | mpn/alpha/ev6/addmul_1.asm | 388 | ||||
-rw-r--r-- | mpn/alpha/ev6/aorsmul_1.asm | 387 | ||||
-rw-r--r-- | mpn/alpha/ev6/submul_1.asm | 471 |
3 files changed, 387 insertions, 859 deletions
diff --git a/mpn/alpha/ev6/addmul_1.asm b/mpn/alpha/ev6/addmul_1.asm deleted file mode 100644 index 8a30d7a09..000000000 --- a/mpn/alpha/ev6/addmul_1.asm +++ /dev/null @@ -1,388 +0,0 @@ -dnl Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add the -dnl result to a second limb vector. - -dnl Copyright 2000, 2003, 2004, 2005 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. - -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published -dnl by the Free Software Foundation; either version 3 of the License, or (at -dnl your option) any later version. - -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. - -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb -C EV4: 42 -C EV5: 18 -C EV6: 3.5 - -C INPUT PARAMETERS -C rp r16 -C up r17 -C n r18 -C vlimb r19 - -dnl This code was written in cooperation with ev6 pipeline expert Steve Root. - -dnl The stores can issue a cycle late so we have paired no-op's to 'catch' -dnl them, so that further disturbance to the schedule is damped. - -dnl We couldn't pair the loads, because the entangled schedule of the carry's -dnl has to happen on one side {0} of the machine. - -dnl This is a great schedule for the d_cache, a poor schedule for the b_cache. -dnl The lockup on U0 means that any stall can't be recovered from. Consider a -dnl ldq in L1, say that load gets stalled because it collides with a fill from -dnl the b_cache. On the next cycle, this load gets priority. If first looks -dnl at L0, and goes there. 
The instruction we intended for L0 gets to look at -dnl L1, which is NOT where we want it. It either stalls 1, because it can't -dnl go in L0, or goes there, and causes a further instruction to stall. - -dnl So for b_cache, we're likely going to want to put one or more cycles back -dnl into the code! And, of course, put in lds prefetch for the rp[] operand. -dnl At a place where we have an mt followed by a bookkeeping, put the -dnl bookkeeping in upper, and the prefetch into lower. - -dnl Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd -dnl like not to have an ldq or an stq to preceded a conditional branch in a -dnl quadpack. The conditional branch moves the retire pointer one cycle -dnl later. - - -ASM_START() -PROLOGUE(mpn_addmul_1) - ldq r3, 0(r17) C - and r18, 7, r20 C - lda r18, -9(r18) C - cmpeq r20, 1, r21 C - beq r21, $L1 C - -$1mod8: ldq r5, 0(r16) C - mulq r19, r3, r7 C - umulh r19, r3, r8 C - addq r5, r7, r23 C - cmpult r23, r7, r20 C - addq r8, r20, r0 C - stq r23, 0(r16) C - bge r18, $ent1 C - ret r31, (r26), 1 C - -$L1: lda r8, 0(r31) C zero carry reg - lda r24, 0(r31) C zero carry reg - cmpeq r20, 2, r21 C - bne r21, $2mod8 C - cmpeq r20, 3, r21 C - bne r21, $3mod8 C - cmpeq r20, 4, r21 C - bne r21, $4mod8 C - cmpeq r20, 5, r21 C - bne r21, $5mod8 C - cmpeq r20, 6, r21 C - bne r21, $6mod8 C - cmpeq r20, 7, r21 C - beq r21, $0mod8 C - -$7mod8: ldq r5, 0(r16) C - lda r17, 8(r17) C - mulq r19, r3, r7 C - umulh r19, r3, r24 C - addq r5, r7, r23 C - cmpult r23, r7, r20 C - addq r24, r20, r24 C - stq r23, 0(r16) C - lda r16, 8(r16) C - ldq r3, 0(r17) C -$6mod8: ldq r1, 8(r17) C - mulq r19, r3, r25 C - umulh r19, r3, r3 C - mulq r19, r1, r28 C - ldq r0, 16(r17) C - ldq r4, 0(r16) C - umulh r19, r1, r8 C - ldq r1, 24(r17) C - lda r17, 48(r17) C L1 bookkeeping - mulq r19, r0, r2 C - ldq r5, 8(r16) C - lda r16, -32(r16) C L1 bookkeeping - umulh r19, r0, r6 C - addq r4, r25, r4 C lo + acc - mulq r19, r1, r7 C - br r31, $ent6 C - -$ent1: 
lda r17, 8(r17) C - lda r16, 8(r16) C - lda r8, 0(r0) C - ldq r3, 0(r17) C -$0mod8: ldq r1, 8(r17) C - mulq r19, r3, r2 C - umulh r19, r3, r6 C - mulq r19, r1, r7 C - ldq r0, 16(r17) C - ldq r4, 0(r16) C - umulh r19, r1, r24 C - ldq r1, 24(r17) C - mulq r19, r0, r25 C - ldq r5, 8(r16) C - umulh r19, r0, r3 C - addq r4, r2, r4 C lo + acc - mulq r19, r1, r28 C - lda r16, -16(r16) C - br r31, $ent0 C - -$3mod8: ldq r5, 0(r16) C - lda r17, 8(r17) C - mulq r19, r3, r7 C - umulh r19, r3, r8 C - addq r5, r7, r23 C - cmpult r23, r7, r20 C - addq r8, r20, r24 C - stq r23, 0(r16) C - lda r16, 8(r16) C - ldq r3, 0(r17) C -$2mod8: ldq r1, 8(r17) C - mulq r19, r3, r25 C - umulh r19, r3, r3 C - mulq r19, r1, r28 C - ble r18, $n23 C - ldq r0, 16(r17) C - ldq r4, 0(r16) C - umulh r19, r1, r8 C - ldq r1, 24(r17) C - lda r17, 16(r17) C L1 bookkeeping - mulq r19, r0, r2 C - ldq r5, 8(r16) C - lda r16, 0(r16) C L1 bookkeeping - umulh r19, r0, r6 C - addq r4, r25, r4 C lo + acc - mulq r19, r1, r7 C - br r31, $ent2 C - -$5mod8: ldq r5, 0(r16) C - lda r17, 8(r17) C - mulq r19, r3, r7 C - umulh r19, r3, r24 C - addq r5, r7, r23 C - cmpult r23, r7, r20 C - addq r24, r20, r8 C - stq r23, 0(r16) C - lda r16, 8(r16) C - ldq r3, 0(r17) C -$4mod8: ldq r1, 8(r17) C - mulq r19, r3, r2 C - umulh r19, r3, r6 C - mulq r19, r1, r7 C - ldq r0, 16(r17) C - ldq r4, 0(r16) C - umulh r19, r1, r24 C - ldq r1, 24(r17) C - lda r17, 32(r17) C L1 bookkeeping - mulq r19, r0, r25 C - ldq r5, 8(r16) C - lda r16, 16(r16) C L1 bookkeeping - umulh r19, r0, r3 C - addq r4, r2, r4 C lo + acc - mulq r19, r1, r28 C - cmpult r4, r2, r20 C L0 lo add => carry - addq r4, r8, r22 C U0 hi add => answer - ble r18, $Lend C -ALIGN(16) -$Loop: - bis r31, r31, r31 C U1 mt - cmpult r22, r8, r21 C L0 hi add => carry - addq r6, r20, r6 C U0 hi mul + carry - ldq r0, 0(r17) C - - bis r31, r31, r31 C U1 mt - addq r5, r7, r23 C L0 lo + acc - addq r6, r21, r6 C U0 hi mul + carry - ldq r4, 0(r16) C L1 - - umulh r19, r1, r8 C U1 - cmpult 
r23, r7, r20 C L0 lo add => carry - addq r23, r6, r23 C U0 hi add => answer - ldq r1, 8(r17) C L1 - - mulq r19, r0, r2 C U1 - cmpult r23, r6, r21 C L0 hi add => carry - addq r24, r20, r24 C U0 hi mul + carry - ldq r5, 8(r16) C L1 - - umulh r19, r0, r6 C U1 - addq r4, r25, r4 C U0 lo + acc - stq r22, -16(r16) C L0 - stq r23, -8(r16) C L1 - - bis r31, r31, r31 C L0 st slosh - mulq r19, r1, r7 C U1 - bis r31, r31, r31 C L1 st slosh - addq r24, r21, r24 C U0 hi mul + carry -$ent2: - cmpult r4, r25, r20 C L0 lo add => carry - bis r31, r31, r31 C U1 mt - lda r18, -8(r18) C L1 bookkeeping - addq r4, r24, r22 C U0 hi add => answer - - bis r31, r31, r31 C U1 mt - cmpult r22, r24, r21 C L0 hi add => carry - addq r3, r20, r3 C U0 hi mul + carry - ldq r0, 16(r17) C L1 - - bis r31, r31, r31 C U1 mt - addq r5, r28, r23 C L0 lo + acc - addq r3, r21, r3 C U0 hi mul + carry - ldq r4, 16(r16) C L1 - - umulh r19, r1, r24 C U1 - cmpult r23, r28, r20 C L0 lo add => carry - addq r23, r3, r23 C U0 hi add => answer - ldq r1, 24(r17) C L1 - - mulq r19, r0, r25 C U1 - cmpult r23, r3, r21 C L0 hi add => carry - addq r8, r20, r8 C U0 hi mul + carry - ldq r5, 24(r16) C L1 - - umulh r19, r0, r3 C U1 - addq r4, r2, r4 C U0 lo + acc - stq r22, 0(r16) C L0 - stq r23, 8(r16) C L1 - - bis r31, r31, r31 C L0 st slosh - mulq r19, r1, r28 C U1 - bis r31, r31, r31 C L1 st slosh - addq r8, r21, r8 C L0 hi mul + carry -$ent0: - cmpult r4, r2, r20 C L0 lo add => carry - bis r31, r31, r31 C U1 mt - lda r17, 64(r17) C L1 bookkeeping - addq r4, r8, r22 C U0 hi add => answer - - bis r31, r31, r31 C U1 mt - cmpult r22, r8, r21 C L0 hi add => carry - addq r6, r20, r6 C U0 hi mul + carry - ldq r0, -32(r17) C L1 - - bis r31, r31, r31 C U1 mt - addq r5, r7, r23 C L0 lo + acc - addq r6, r21, r6 C U0 hi mul + carry - ldq r4, 32(r16) C L1 - - umulh r19, r1, r8 C U1 - cmpult r23, r7, r20 C L0 lo add => carry - addq r23, r6, r23 C U0 hi add => answer - ldq r1, -24(r17) C L1 - - mulq r19, r0, r2 C U1 - cmpult r23, r6, 
r21 C L0 hi add => carry - addq r24, r20, r24 C U0 hi mul + carry - ldq r5, 40(r16) C L1 - - umulh r19, r0, r6 C U1 - addq r4, r25, r4 C U0 lo + acc - stq r22, 16(r16) C L0 - stq r23, 24(r16) C L1 - - bis r31, r31, r31 C L0 st slosh - mulq r19, r1, r7 C U1 - bis r31, r31, r31 C L1 st slosh - addq r24, r21, r24 C U0 hi mul + carry -$ent6: - cmpult r4, r25, r20 C L0 lo add => carry - bis r31, r31, r31 C U1 mt - lda r16, 64(r16) C L1 bookkeeping - addq r4, r24, r22 C U0 hi add => answer - - bis r31, r31, r31 C U1 mt - cmpult r22, r24, r21 C L0 hi add => carry - addq r3, r20, r3 C U0 hi mul + carry - ldq r0, -16(r17) C L1 - - bis r31, r31, r31 C U1 mt - addq r5, r28, r23 C L0 lo + acc - addq r3, r21, r3 C U0 hi mul + carry - ldq r4, -16(r16) C L1 - - umulh r19, r1, r24 C U1 - cmpult r23, r28, r20 C L0 lo add => carry - addq r23, r3, r23 C U0 hi add => answer - ldq r1, -8(r17) C L1 - - mulq r19, r0, r25 C U1 - cmpult r23, r3, r21 C L0 hi add => carry - addq r8, r20, r8 C U0 hi mul + carry - ldq r5, -8(r16) C L1 - - umulh r19, r0, r3 C U1 - addq r4, r2, r4 C L0 lo + acc - stq r22, -32(r16) C L0 - stq r23, -24(r16) C L1 - - bis r31, r31, r31 C L0 st slosh - mulq r19, r1, r28 C U1 - bis r31, r31, r31 C L1 st slosh - addq r8, r21, r8 C U0 hi mul + carry - - cmpult r4, r2, r20 C L0 lo add => carry - addq r4, r8, r22 C U0 hi add => answer - ldl r31, 256(r17) C prefetch up[] - bgt r18, $Loop C U1 bookkeeping - -$Lend: cmpult r22, r8, r21 C - addq r6, r20, r6 C - addq r5, r7, r23 C - addq r6, r21, r6 C - ldq r4, 0(r16) C - umulh r19, r1, r8 C - cmpult r23, r7, r20 C - addq r23, r6, r23 C - cmpult r23, r6, r21 C - addq r24, r20, r24 C - ldq r5, 8(r16) C - addq r4, r25, r4 C - stq r22, -16(r16) C - stq r23, -8(r16) C - addq r24, r21, r24 C - cmpult r4, r25, r20 C - addq r4, r24, r22 C - cmpult r22, r24, r21 C - addq r3, r20, r3 C - addq r5, r28, r23 C - addq r3, r21, r3 C - cmpult r23, r28, r20 C - addq r23, r3, r23 C - cmpult r23, r3, r21 C - addq r8, r20, r8 C - stq r22, 0(r16) 
C - stq r23, 8(r16) C - addq r8, r21, r0 C - ret r31, (r26), 1 C - -$n23: ldq r4, 0(r16) C - ldq r5, 8(r16) C - umulh r19, r1, r8 C - addq r4, r25, r4 C - cmpult r4, r25, r20 C - addq r4, r24, r22 C - cmpult r22, r24, r21 C - addq r3, r20, r3 C - addq r5, r28, r23 C - addq r3, r21, r3 C - cmpult r23, r28, r20 C - addq r23, r3, r23 C - cmpult r23, r3, r21 C - addq r8, r20, r8 C - stq r22, 0(r16) C - stq r23, 8(r16) C - addq r8, r21, r0 C - ret r31, (r26), 1 C -EPILOGUE() -ASM_END() diff --git a/mpn/alpha/ev6/aorsmul_1.asm b/mpn/alpha/ev6/aorsmul_1.asm new file mode 100644 index 000000000..eda092b2d --- /dev/null +++ b/mpn/alpha/ev6/aorsmul_1.asm @@ -0,0 +1,387 @@ +dnl Alpha ev6 mpn_addmul_1 and mpn_submul_1. + +dnl Copyright 2000, 2003, 2004, 2005, 2008 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C EV4: 42 +C EV5: 18 +C EV6: 3.5 + +C INPUT PARAMETERS +define(`rp', `r16') +define(`up', `r17') +define(`n', `r18') +define(`v0', `r19') + +dnl This code was written in cooperation with ev6 pipeline expert Steve Root. + +dnl The stores can issue a cycle late so we have paired no-op's to 'catch' +dnl them, so that further disturbance to the schedule is damped. 
+ +dnl We couldn't pair the loads, because the entangled schedule of the carries +dnl has to happen on one side {0} of the machine. + +dnl This is a great schedule for the d_cache, a poor schedule for the b_cache. +dnl The lockup on U0 means that any stall can't be recovered from. Consider an +dnl ldq in L1; say that load gets stalled because it collides with a fill from +dnl the b_cache. On the next cycle, this load gets priority. It first looks +dnl at L0, and goes there. The instruction we intended for L0 gets to look at +dnl L1, which is NOT where we want it. It either stalls 1, because it can't +dnl go in L0, or goes there, and causes a further instruction to stall. + +dnl So for b_cache, we're likely going to want to put one or more cycles back +dnl into the code! And, of course, put in lds prefetch for the rp[] operand. +dnl At a place where we have an mt followed by a bookkeeping, put the +dnl bookkeeping in upper, and the prefetch into lower. + +dnl Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd +dnl like not to have an ldq or an stq to precede a conditional branch in a +dnl quadpack. The conditional branch moves the retire pointer one cycle +dnl later. 
+ +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `addq') + define(`CMPCY', `cmpult $2,$1') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `subq') + define(`CMPCY', `cmpult $1,$2') + define(`func', `mpn_submul_1') +') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() +PROLOGUE(func) + ldq r3, 0(up) C + and r18, 7, r20 C + lda r18, -9(r18) C + cmpeq r20, 1, r21 C + beq r21, $L1 C + +$1mod8: ldq r5, 0(rp) C + mulq v0, r3, r7 C + umulh v0, r3, r8 C + ADDSUB r5, r7, r23 C + CMPCY( r5, r23), r20 C + addq r8, r20, r0 C + stq r23, 0(rp) C + bge r18, $ent1 C + ret r31, (r26), 1 C + +$L1: lda r8, 0(r31) C zero carry reg + lda r24, 0(r31) C zero carry reg + cmpeq r20, 2, r21 C + bne r21, $2mod8 C + cmpeq r20, 3, r21 C + bne r21, $3mod8 C + cmpeq r20, 4, r21 C + bne r21, $4mod8 C + cmpeq r20, 5, r21 C + bne r21, $5mod8 C + cmpeq r20, 6, r21 C + bne r21, $6mod8 C + cmpeq r20, 7, r21 C + beq r21, $0mod8 C + +$7mod8: ldq r5, 0(rp) C + lda up, 8(up) C + mulq v0, r3, r7 C + umulh v0, r3, r24 C + ADDSUB r5, r7, r23 C + CMPCY( r5, r23), r20 C + addq r24, r20, r24 C + stq r23, 0(rp) C + lda rp, 8(rp) C + ldq r3, 0(up) C +$6mod8: ldq r1, 8(up) C + mulq v0, r3, r25 C + umulh v0, r3, r3 C + mulq v0, r1, r28 C + ldq r0, 16(up) C + ldq r4, 0(rp) C + umulh v0, r1, r8 C + ldq r1, 24(up) C + lda up, 48(up) C L1 bookkeeping + mulq v0, r0, r2 C + ldq r5, 8(rp) C + lda rp, -32(rp) C L1 bookkeeping + umulh v0, r0, r6 C + ADDSUB r4, r25, r25 C lo + acc + mulq v0, r1, r7 C + br r31, $ent6 C + +$ent1: lda up, 8(up) C + lda rp, 8(rp) C + lda r8, 0(r0) C + ldq r3, 0(up) C +$0mod8: ldq r1, 8(up) C + mulq v0, r3, r2 C + umulh v0, r3, r6 C + mulq v0, r1, r7 C + ldq r0, 16(up) C + ldq r4, 0(rp) C + umulh v0, r1, r24 C + ldq r1, 24(up) C + mulq v0, r0, r25 C + ldq r5, 8(rp) C + umulh v0, r0, r3 C + ADDSUB r4, r2, r2 C lo + acc + mulq v0, r1, r28 C + lda rp, -16(rp) C + br r31, $ent0 C + +$3mod8: ldq r5, 0(rp) C + lda up, 8(up) C + mulq v0, r3, r7 C + 
umulh v0, r3, r8 C + ADDSUB r5, r7, r23 C + CMPCY( r5, r23), r20 C + addq r8, r20, r24 C + stq r23, 0(rp) C + lda rp, 8(rp) C + ldq r3, 0(up) C +$2mod8: ldq r1, 8(up) C + mulq v0, r3, r25 C + umulh v0, r3, r3 C + mulq v0, r1, r28 C + ble r18, $n23 C + ldq r0, 16(up) C + ldq r4, 0(rp) C + umulh v0, r1, r8 C + ldq r1, 24(up) C + lda up, 16(up) C L1 bookkeeping + mulq v0, r0, r2 C + ldq r5, 8(rp) C + lda rp, 0(rp) C L1 bookkeeping + umulh v0, r0, r6 C + ADDSUB r4, r25, r25 C lo + acc + mulq v0, r1, r7 C + br r31, $ent2 C + +$5mod8: ldq r5, 0(rp) C + lda up, 8(up) C + mulq v0, r3, r7 C + umulh v0, r3, r24 C + ADDSUB r5, r7, r23 C + CMPCY( r5, r23), r20 C + addq r24, r20, r8 C + stq r23, 0(rp) C + lda rp, 8(rp) C + ldq r3, 0(up) C +$4mod8: ldq r1, 8(up) C + mulq v0, r3, r2 C + umulh v0, r3, r6 C + mulq v0, r1, r7 C + ldq r0, 16(up) C + ldq r4, 0(rp) C + umulh v0, r1, r24 C + ldq r1, 24(up) C + lda up, 32(up) C L1 bookkeeping + mulq v0, r0, r25 C + ldq r5, 8(rp) C + lda rp, 16(rp) C L1 bookkeeping + umulh v0, r0, r3 C + ADDSUB r4, r2, r2 C lo + acc + mulq v0, r1, r28 C + CMPCY( r4, r2), r20 C L0 lo add => carry + ADDSUB r2, r8, r22 C U0 hi add => answer + ble r18, $Lend C + ALIGN(16) +$Loop: + bis r31, r31, r31 C U1 mt + CMPCY( r2, r22), r21 C L0 hi add => carry + addq r6, r20, r6 C U0 hi mul + carry + ldq r0, 0(up) C + + bis r31, r31, r31 C U1 mt + ADDSUB r5, r7, r7 C L0 lo + acc + addq r6, r21, r6 C U0 hi mul + carry + ldq r4, 0(rp) C L1 + + umulh v0, r1, r8 C U1 + CMPCY( r5, r7), r20 C L0 lo add => carry + ADDSUB r7, r6, r23 C U0 hi add => answer + ldq r1, 8(up) C L1 + + mulq v0, r0, r2 C U1 + CMPCY( r7, r23), r21 C L0 hi add => carry + addq r24, r20, r24 C U0 hi mul + carry + ldq r5, 8(rp) C L1 + + umulh v0, r0, r6 C U1 + ADDSUB r4, r25, r25 C U0 lo + acc + stq r22, -16(rp) C L0 + stq r23, -8(rp) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq v0, r1, r7 C U1 + bis r31, r31, r31 C L1 st slosh + addq r24, r21, r24 C U0 hi mul + carry +$ent2: + CMPCY( r4, r25), r20 C 
L0 lo add => carry + bis r31, r31, r31 C U1 mt + lda r18, -8(r18) C L1 bookkeeping + ADDSUB r25, r24, r22 C U0 hi add => answer + + bis r31, r31, r31 C U1 mt + CMPCY( r25, r22), r21 C L0 hi add => carry + addq r3, r20, r3 C U0 hi mul + carry + ldq r0, 16(up) C L1 + + bis r31, r31, r31 C U1 mt + ADDSUB r5, r28, r28 C L0 lo + acc + addq r3, r21, r3 C U0 hi mul + carry + ldq r4, 16(rp) C L1 + + umulh v0, r1, r24 C U1 + CMPCY( r5, r28), r20 C L0 lo add => carry + ADDSUB r28, r3, r23 C U0 hi add => answer + ldq r1, 24(up) C L1 + + mulq v0, r0, r25 C U1 + CMPCY( r28, r23), r21 C L0 hi add => carry + addq r8, r20, r8 C U0 hi mul + carry + ldq r5, 24(rp) C L1 + + umulh v0, r0, r3 C U1 + ADDSUB r4, r2, r2 C U0 lo + acc + stq r22, 0(rp) C L0 + stq r23, 8(rp) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq v0, r1, r28 C U1 + bis r31, r31, r31 C L1 st slosh + addq r8, r21, r8 C U0 hi mul + carry +$ent0: + CMPCY( r4, r2), r20 C L0 lo add => carry + bis r31, r31, r31 C U1 mt + lda up, 64(up) C L1 bookkeeping + ADDSUB r2, r8, r22 C U0 hi add => answer + + bis r31, r31, r31 C U1 mt + CMPCY( r2, r22), r21 C L0 hi add => carry + addq r6, r20, r6 C U0 hi mul + carry + ldq r0, -32(up) C L1 + + bis r31, r31, r31 C U1 mt + ADDSUB r5, r7, r7 C L0 lo + acc + addq r6, r21, r6 C U0 hi mul + carry + ldq r4, 32(rp) C L1 + + umulh v0, r1, r8 C U1 + CMPCY( r5, r7), r20 C L0 lo add => carry + ADDSUB r7, r6, r23 C U0 hi add => answer + ldq r1, -24(up) C L1 + + mulq v0, r0, r2 C U1 + CMPCY( r7, r23), r21 C L0 hi add => carry + addq r24, r20, r24 C U0 hi mul + carry + ldq r5, 40(rp) C L1 + + umulh v0, r0, r6 C U1 + ADDSUB r4, r25, r25 C U0 lo + acc + stq r22, 16(rp) C L0 + stq r23, 24(rp) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq v0, r1, r7 C U1 + bis r31, r31, r31 C L1 st slosh + addq r24, r21, r24 C U0 hi mul + carry +$ent6: + CMPCY( r4, r25), r20 C L0 lo add => carry + bis r31, r31, r31 C U1 mt + lda rp, 64(rp) C L1 bookkeeping + ADDSUB r25, r24, r22 C U0 hi add => answer + + bis r31, r31, 
r31 C U1 mt + CMPCY( r25, r22), r21 C L0 hi add => carry + addq r3, r20, r3 C U0 hi mul + carry + ldq r0, -16(up) C L1 + + bis r31, r31, r31 C U1 mt + ADDSUB r5, r28, r28 C L0 lo + acc + addq r3, r21, r3 C U0 hi mul + carry + ldq r4, -16(rp) C L1 + + umulh v0, r1, r24 C U1 + CMPCY( r5, r28), r20 C L0 lo add => carry + ADDSUB r28, r3, r23 C U0 hi add => answer + ldq r1, -8(up) C L1 + + mulq v0, r0, r25 C U1 + CMPCY( r28, r23), r21 C L0 hi add => carry + addq r8, r20, r8 C U0 hi mul + carry + ldq r5, -8(rp) C L1 + + umulh v0, r0, r3 C U1 + ADDSUB r4, r2, r2 C U0 lo + acc + stq r22, -32(rp) C L0 + stq r23, -24(rp) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq v0, r1, r28 C U1 + bis r31, r31, r31 C L1 st slosh + addq r8, r21, r8 C U0 hi mul + carry + + CMPCY( r4, r2), r20 C L0 lo add => carry + ADDSUB r2, r8, r22 C U0 hi add => answer + ldl r31, 256(up) C prefetch up[] + bgt r18, $Loop C U1 bookkeeping + +$Lend: CMPCY( r2, r22), r21 C + addq r6, r20, r6 C + ADDSUB r5, r7, r7 C + addq r6, r21, r6 C + ldq r4, 0(rp) C + umulh v0, r1, r8 C + CMPCY( r5, r7), r20 C + ADDSUB r7, r6, r23 C + CMPCY(r7, r23), r21 C + addq r24, r20, r24 C + ldq r5, 8(rp) C + ADDSUB r4, r25, r25 C + stq r22, -16(rp) C + stq r23, -8(rp) C + addq r24, r21, r24 C + br L(x) + + ALIGN(16) +$n23: ldq r4, 0(rp) C + ldq r5, 8(rp) C + umulh v0, r1, r8 C + ADDSUB r4, r25, r25 C +L(x): CMPCY( r4, r25), r20 C + ADDSUB r25, r24, r22 C + CMPCY( r25, r22), r21 C + addq r3, r20, r3 C + ADDSUB r5, r28, r28 C + addq r3, r21, r3 C + CMPCY( r5, r28), r20 C + ADDSUB r28, r3, r23 C + CMPCY( r28, r23), r21 C + addq r8, r20, r8 C + stq r22, 0(rp) C + stq r23, 8(rp) C + addq r8, r21, r0 C + ret r31, (r26), 1 C +EPILOGUE() +ASM_END() diff --git a/mpn/alpha/ev6/submul_1.asm b/mpn/alpha/ev6/submul_1.asm deleted file mode 100644 index 01abad6d7..000000000 --- a/mpn/alpha/ev6/submul_1.asm +++ /dev/null @@ -1,471 +0,0 @@ -dnl Alpha ev6 mpn_submul_1 -- Multiply a limb vector with a limb and subtract -dnl the result from a 
second limb vector. - -dnl Copyright 2000, 2002 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. - -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published -dnl by the Free Software Foundation; either version 3 of the License, or (at -dnl your option) any later version. - -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. - -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. - -include(`../config.m4') - -dnl INPUT PARAMETERS -dnl res_ptr r16 -dnl s1_ptr r17 -dnl size r18 -dnl s2_limb r19 - -dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and -dnl exactly 3.5 cycles/limb on EV6... - -dnl This code was written in close cooperation with ev6 pipeline expert -dnl Steve Root. Any errors are tege's fault, though. -dnl -dnl Register usages for unrolled loop: -dnl 0-3 mul's -dnl 4-7 acc's -dnl 8-15 mul results -dnl 20,21 carry's -dnl 22,23 save for stores - -dnl Sustains 8 mul-adds in 28 cycles in the unrolled inner loop. - -dnl The stores can issue a cycle late so we have paired no-op's to 'catch' -dnl them, so that further disturbance to the schedule is damped. - -dnl We couldn't pair the loads, because the entangled schedule of the -dnl carry's has to happen on one side {0} of the machine. Note, the total -dnl use of U0, and the total use of L0 (after attending to the stores). -dnl which is part of the reason why.... - -dnl This is a great schedule for the d_cache, a poor schedule for the -dnl b_cache. The lockup on U0 means that any stall can't be recovered -dnl from. Consider a ldq in L1. 
Say that load gets stalled because it -dnl collides with a fill from the b_cache. On the next cycle, this load -dnl gets priority. It first looks at L0, and goes there. The instruction -dnl we intended for L0 gets to look at L1, which is NOT where we want -dnl it. It either stalls 1, because it can't go in L0, or goes there, and -dnl causes a further instruction to stall. - -dnl So for b_cache, we're likely going to want to put one or more cycles -dnl back into the code! And, of course, put in prefetches. For the -dnl accumulator, lds, intent to modify. For the multiplier, you might -dnl want ldq, evict next, if you're not wanting to use it again soon. Use -dnl 256 ahead of present pointer value. At a place where we have an mt -dnl followed by a bookkeeping, put the bookkeeping in upper, and the -dnl prefetch into lower. - -dnl Note, the usage of physical registers per cycle is smoothed off, as -dnl much as possible. - -dnl Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd -dnl like not to have an ldq or stq to precede a conditional branch in a -dnl quadpack. The conditional branch moves the retire pointer one cycle -dnl later. - -dnl Optimization notes: -dnl Callee-saves regs: r9 r10 r11 r12 r13 r14 r15 r26 ?r27? -dnl Reserved regs: r29 r30 r31 -dnl Free caller-saves regs in unrolled code: r24 r25 r28 -dnl We should swap some of the callee-saves regs for some of the free -dnl caller-saves regs, saving some overhead cycles. -dnl Most importantly, we should write fast code for the 0-7 case. -dnl The code we use there is for the 21164, and runs at 7 cycles/limb -dnl on the 21264. Should not be hard, if we write specialized code for -dnl 1-7 limbs (the one for 0 limbs should be straightforward). We then just -dnl need a jump table indexed by the low 3 bits of the count argument. 
- - -ASM_START() -PROLOGUE(mpn_submul_1) - cmpult r18, 8, r1 - beq r1, $Large - - ldq r2, 0(r17) C r2 = s1_limb - addq r17, 8, r17 C s1_ptr++ - subq r18, 1, r18 C size-- - mulq r2, r19, r3 C r3 = prod_low - ldq r5, 0(r16) C r5 = *res_ptr - umulh r2, r19, r0 C r0 = prod_high - beq r18, $Lend0b C jump if size was == 1 - ldq r2, 0(r17) C r2 = s1_limb - addq r17, 8, r17 C s1_ptr++ - subq r18, 1, r18 C size-- - subq r5, r3, r3 - cmpult r5, r3, r4 - stq r3, 0(r16) - addq r16, 8, r16 C res_ptr++ - beq r18, $Lend0a C jump if size was == 2 - - ALIGN(8) -$Loop0: mulq r2, r19, r3 C r3 = prod_low - ldq r5, 0(r16) C r5 = *res_ptr - addq r4, r0, r0 C cy_limb = cy_limb + 'cy' - subq r18, 1, r18 C size-- - umulh r2, r19, r4 C r4 = cy_limb - ldq r2, 0(r17) C r2 = s1_limb - addq r17, 8, r17 C s1_ptr++ - addq r3, r0, r3 C r3 = cy_limb + prod_low - cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low) - subq r5, r3, r3 - cmpult r5, r3, r5 - stq r3, 0(r16) - addq r16, 8, r16 C res_ptr++ - addq r5, r0, r0 C combine carries - bne r18, $Loop0 -$Lend0a: - mulq r2, r19, r3 C r3 = prod_low - ldq r5, 0(r16) C r5 = *res_ptr - addq r4, r0, r0 C cy_limb = cy_limb + 'cy' - umulh r2, r19, r4 C r4 = cy_limb - addq r3, r0, r3 C r3 = cy_limb + prod_low - cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low) - subq r5, r3, r3 - cmpult r5, r3, r5 - stq r3, 0(r16) - addq r5, r0, r0 C combine carries - addq r4, r0, r0 C cy_limb = prod_high + cy - ret r31, (r26), 1 -$Lend0b: - subq r5, r3, r3 - cmpult r5, r3, r5 - stq r3, 0(r16) - addq r0, r5, r0 - ret r31, (r26), 1 - -$Large: - lda $30, -240($30) - stq $9, 8($30) - stq $10, 16($30) - stq $11, 24($30) - stq $12, 32($30) - stq $13, 40($30) - stq $14, 48($30) - stq $15, 56($30) - - and r18, 7, r20 C count for the first loop, 0-7 - srl r18, 3, r18 C count for unrolled loop - bis r31, r31, r0 - beq r20, $Lunroll - ldq r2, 0(r17) C r2 = s1_limb - addq r17, 8, r17 C s1_ptr++ - subq r20, 1, r20 C size-- - mulq r2, r19, r3 C r3 = prod_low - ldq r5, 0(r16) C 
r5 = *res_ptr - umulh r2, r19, r0 C r0 = prod_high - beq r20, $Lend1b C jump if size was == 1 - ldq r2, 0(r17) C r2 = s1_limb - addq r17, 8, r17 C s1_ptr++ - subq r20, 1, r20 C size-- - subq r5, r3, r3 - cmpult r5, r3, r4 - stq r3, 0(r16) - addq r16, 8, r16 C res_ptr++ - beq r20, $Lend1a C jump if size was == 2 - - ALIGN(8) -$Loop1: mulq r2, r19, r3 C r3 = prod_low - ldq r5, 0(r16) C r5 = *res_ptr - addq r4, r0, r0 C cy_limb = cy_limb + 'cy' - subq r20, 1, r20 C size-- - umulh r2, r19, r4 C r4 = cy_limb - ldq r2, 0(r17) C r2 = s1_limb - addq r17, 8, r17 C s1_ptr++ - addq r3, r0, r3 C r3 = cy_limb + prod_low - cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low) - subq r5, r3, r3 - cmpult r5, r3, r5 - stq r3, 0(r16) - addq r16, 8, r16 C res_ptr++ - addq r5, r0, r0 C combine carries - bne r20, $Loop1 - -$Lend1a: - mulq r2, r19, r3 C r3 = prod_low - ldq r5, 0(r16) C r5 = *res_ptr - addq r4, r0, r0 C cy_limb = cy_limb + 'cy' - umulh r2, r19, r4 C r4 = cy_limb - addq r3, r0, r3 C r3 = cy_limb + prod_low - cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low) - subq r5, r3, r3 - cmpult r5, r3, r5 - stq r3, 0(r16) - addq r16, 8, r16 C res_ptr++ - addq r5, r0, r0 C combine carries - addq r4, r0, r0 C cy_limb = prod_high + cy - br r31, $Lunroll -$Lend1b: - subq r5, r3, r3 - cmpult r5, r3, r5 - stq r3, 0(r16) - addq r16, 8, r16 C res_ptr++ - addq r0, r5, r0 - -$Lunroll: - lda r17, -16(r17) C L1 bookkeeping - lda r16, -16(r16) C L1 bookkeeping - bis r0, r31, r12 - -C ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____ - - ldq r2, 16(r17) C L1 - ldq r3, 24(r17) C L1 - lda r18, -1(r18) C L1 bookkeeping - ldq r6, 16(r16) C L1 - ldq r7, 24(r16) C L1 - ldq r0, 32(r17) C L1 - mulq r19, r2, r13 C U1 - ldq r1, 40(r17) C L1 - umulh r19, r2, r14 C U1 - mulq r19, r3, r15 C U1 - lda r17, 64(r17) C L1 bookkeeping - ldq r4, 32(r16) C L1 - ldq r5, 40(r16) C L1 - umulh r19, r3, r8 C U1 - ldq r2, -16(r17) C L1 - mulq r19, r0, r9 C U1 - ldq r3, -8(r17) C L1 - umulh r19, r0, r10 C U1 - 
subq r6, r13, r13 C L0 lo + acc - mulq r19, r1, r11 C U1 - cmpult r6, r13, r20 C L0 lo add => carry - lda r16, 64(r16) C L1 bookkeeping - subq r13, r12, r22 C U0 hi add => answer - cmpult r13, r12, r21 C L0 hi add => carry - addq r14, r20, r14 C U0 hi mul + carry - ldq r6, -16(r16) C L1 - subq r7, r15, r28 C L0 lo + acc - addq r14, r21, r14 C U0 hi mul + carry - cmpult r7, r15, r20 C L0 lo add => carry - ldq r7, -8(r16) C L1 - umulh r19, r1, r12 C U1 - subq r28, r14, r23 C U0 hi add => answer - ldq r0, 0(r17) C L1 - mulq r19, r2, r13 C U1 - cmpult r28, r14, r21 C L0 hi add => carry - addq r8, r20, r8 C U0 hi mul + carry - ldq r1, 8(r17) C L1 - umulh r19, r2, r14 C U1 - subq r4, r9, r9 C L0 lo + acc - stq r22, -48(r16) C L0 - stq r23, -40(r16) C L1 - mulq r19, r3, r15 C U1 - addq r8, r21, r8 C U0 hi mul + carry - cmpult r4, r9, r20 C L0 lo add => carry - subq r9, r8, r22 C U0 hi add => answer - ble r18, $Lend C U1 bookkeeping - -C ____ MAIN UNROLLED LOOP ____ - ALIGN(16) -$Loop: - bis r31, r31, r31 C U1 mt - cmpult r9, r8, r21 C L0 hi add => carry - addq r10, r20, r10 C U0 hi mul + carry - ldq r4, 0(r16) C L1 - - bis r31, r31, r31 C U1 mt - subq r5, r11, r23 C L0 lo + acc - addq r10, r21, r10 C L0 hi mul + carry - ldq r2, 16(r17) C L1 - - umulh r19, r3, r8 C U1 - cmpult r5, r11, r20 C L0 lo add => carry - subq r23, r10, r28 C U0 hi add => answer - ldq r5, 8(r16) C L1 - - mulq r19, r0, r9 C U1 - cmpult r23, r10, r21 C L0 hi add => carry - addq r12, r20, r12 C U0 hi mul + carry - ldq r3, 24(r17) C L1 - - umulh r19, r0, r10 C U1 - subq r6, r13, r13 C U0 lo + acc - stq r22, -32(r16) C L0 - stq r28, -24(r16) C L1 - - bis r31, r31, r31 C L0 st slosh - mulq r19, r1, r11 C U1 - bis r31, r31, r31 C L1 st slosh - addq r12, r21, r12 C U0 hi mul + carry - - cmpult r6, r13, r20 C L0 lo add => carry - bis r31, r31, r31 C U1 mt - lda r18, -1(r18) C L1 bookkeeping - subq r13, r12, r22 C U0 hi add => answer - - bis r31, r31, r31 C U1 mt - cmpult r13, r12, r21 C L0 hi add => carry - 
addq r14, r20, r14 C U0 hi mul + carry - ldq r6, 16(r16) C L1 - - bis r31, r31, r31 C U1 mt - subq r7, r15, r23 C L0 lo + acc - addq r14, r21, r14 C U0 hi mul + carry - ldq r0, 32(r17) C L1 - - umulh r19, r1, r12 C U1 - cmpult r7, r15, r20 C L0 lo add => carry - subq r23, r14, r28 C U0 hi add => answer - ldq r7, 24(r16) C L1 - - mulq r19, r2, r13 C U1 - cmpult r23, r14, r21 C L0 hi add => carry - addq r8, r20, r8 C U0 hi mul + carry - ldq r1, 40(r17) C L1 - - umulh r19, r2, r14 C U1 - subq r4, r9, r9 C U0 lo + acc - stq r22, -16(r16) C L0 - stq r28, -8(r16) C L1 - - bis r31, r31, r31 C L0 st slosh - mulq r19, r3, r15 C U1 - bis r31, r31, r31 C L1 st slosh - addq r8, r21, r8 C L0 hi mul + carry - - cmpult r4, r9, r20 C L0 lo add => carry - bis r31, r31, r31 C U1 mt - lda r17, 64(r17) C L1 bookkeeping - subq r9, r8, r22 C U0 hi add => answer - - bis r31, r31, r31 C U1 mt - cmpult r9, r8, r21 C L0 hi add => carry - addq r10, r20, r10 C U0 hi mul + carry - ldq r4, 32(r16) C L1 - - bis r31, r31, r31 C U1 mt - subq r5, r11, r23 C L0 lo + acc - addq r10, r21, r10 C L0 hi mul + carry - ldq r2, -16(r17) C L1 - - umulh r19, r3, r8 C U1 - cmpult r5, r11, r20 C L0 lo add => carry - subq r23, r10, r28 C U0 hi add => answer - ldq r5, 40(r16) C L1 - - mulq r19, r0, r9 C U1 - cmpult r23, r10, r21 C L0 hi add => carry - addq r12, r20, r12 C U0 hi mul + carry - ldq r3, -8(r17) C L1 - - umulh r19, r0, r10 C U1 - subq r6, r13, r13 C U0 lo + acc - stq r22, 0(r16) C L0 - stq r28, 8(r16) C L1 - - bis r31, r31, r31 C L0 st slosh - mulq r19, r1, r11 C U1 - bis r31, r31, r31 C L1 st slosh - addq r12, r21, r12 C U0 hi mul + carry - - cmpult r6, r13, r20 C L0 lo add => carry - bis r31, r31, r31 C U1 mt - lda r16, 64(r16) C L1 bookkeeping - subq r13, r12, r22 C U0 hi add => answer - - bis r31, r31, r31 C U1 mt - cmpult r13, r12, r21 C L0 hi add => carry - addq r14, r20, r14 C U0 hi mul + carry - ldq r6, -16(r16) C L1 - - bis r31, r31, r31 C U1 mt - subq r7, r15, r23 C L0 lo + acc - addq r14, 
r21, r14 C U0 hi mul + carry - ldq r0, 0(r17) C L1 - - umulh r19, r1, r12 C U1 - cmpult r7, r15, r20 C L0 lo add => carry - subq r23, r14, r28 C U0 hi add => answer - ldq r7, -8(r16) C L1 - - mulq r19, r2, r13 C U1 - cmpult r23, r14, r21 C L0 hi add => carry - addq r8, r20, r8 C U0 hi mul + carry - ldq r1, 8(r17) C L1 - - umulh r19, r2, r14 C U1 - subq r4, r9, r9 C U0 lo + acc - stq r22, -48(r16) C L0 - stq r28, -40(r16) C L1 - - bis r31, r31, r31 C L0 st slosh - mulq r19, r3, r15 C U1 - bis r31, r31, r31 C L1 st slosh - addq r8, r21, r8 C U0 hi mul + carry - - cmpult r4, r9, r20 C L0 lo add => carry - subq r9, r8, r22 C U0 hi add => answer - bis r31, r31, r31 C L1 mt - bgt r18, $Loop C U1 bookkeeping - -C ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____ -$Lend: - cmpult r9, r8, r21 C L0 hi add => carry - addq r10, r20, r10 C U0 hi mul + carry - ldq r4, 0(r16) C L1 - subq r5, r11, r23 C L0 lo + acc - addq r10, r21, r10 C L0 hi mul + carry - umulh r19, r3, r8 C U1 - cmpult r5, r11, r20 C L0 lo add => carry - subq r23, r10, r28 C U0 hi add => answer - ldq r5, 8(r16) C L1 - mulq r19, r0, r9 C U1 - cmpult r23, r10, r21 C L0 hi add => carry - addq r12, r20, r12 C U0 hi mul + carry - umulh r19, r0, r10 C U1 - subq r6, r13, r13 C L0 lo + acc - stq r22, -32(r16) C L0 - stq r28, -24(r16) C L1 - mulq r19, r1, r11 C U1 - addq r12, r21, r12 C U0 hi mul + carry - cmpult r6, r13, r20 C L0 lo add => carry - subq r13, r12, r22 C U0 hi add => answer - cmpult r13, r12, r21 C L0 hi add => carry - addq r14, r20, r14 C U0 hi mul + carry - subq r7, r15, r23 C L0 lo + acc - addq r14, r21, r14 C U0 hi mul + carry - umulh r19, r1, r12 C U1 - cmpult r7, r15, r20 C L0 lo add => carry - subq r23, r14, r28 C U0 hi add => answer - cmpult r23, r14, r21 C L0 hi add => carry - addq r8, r20, r8 C U0 hi mul + carry - subq r4, r9, r9 C U0 lo + acc - stq r22, -16(r16) C L0 - stq r28, -8(r16) C L1 - addq r8, r21, r8 C L0 hi mul + carry - cmpult r4, r9, r20 C L0 lo add => carry - subq r9, r8, r22 C U0 
hi add => answer - cmpult r9, r8, r21 C L0 hi add => carry - addq r10, r20, r10 C U0 hi mul + carry - subq r5, r11, r23 C L0 lo + acc - addq r10, r21, r10 C L0 hi mul + carry - cmpult r5, r11, r20 C L0 lo add => carry - subq r23, r10, r28 C U0 hi add => answer - cmpult r23, r10, r21 C L0 hi add => carry - addq r12, r20, r12 C U0 hi mul + carry - stq r22, 0(r16) C L0 - stq r28, 8(r16) C L1 - addq r12, r21, r0 C U0 hi mul + carry - - ldq $9, 8($30) - ldq $10, 16($30) - ldq $11, 24($30) - ldq $12, 32($30) - ldq $13, 40($30) - ldq $14, 48($30) - ldq $15, 56($30) - lda $30, 240($30) - ret r31, (r26), 1 -EPILOGUE(mpn_submul_1) -ASM_END() |