author     Torbjorn Granlund <tege@gmplib.org>   2008-12-14 02:50:47 +0100
committer  Torbjorn Granlund <tege@gmplib.org>   2008-12-14 02:50:47 +0100
commit     32a23c292182665b54c2d85054ba2491ffc2d8a9 (patch)
tree       c660b5c592885a79e1ebfafa3a68e5d99b103cd1 /mpn/alpha
parent     eb400c26314d57c9f6192369856796bd7481f00b (diff)
download   gmp-32a23c292182665b54c2d85054ba2491ffc2d8a9.tar.gz
Replace mpn/alpha/ev6/{addmul_1.asm,submul_1.asm} with combined file.
Diffstat (limited to 'mpn/alpha')
-rw-r--r--   mpn/alpha/ev6/addmul_1.asm    388
-rw-r--r--   mpn/alpha/ev6/aorsmul_1.asm   387
-rw-r--r--   mpn/alpha/ev6/submul_1.asm    471
3 files changed, 387 insertions, 859 deletions
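
The new aorsmul_1.asm provides both mpn_addmul_1 and mpn_submul_1 from a single source, selected at build time through OPERATION_addmul_1/OPERATION_submul_1. As a reference for what the two entry points compute (their contract only, not the ev6 code), here is a minimal C sketch; it assumes 64-bit limbs and GCC's unsigned __int128, and the names ref_addmul_1/ref_submul_1 are invented for the example:

#include <stdint.h>

typedef uint64_t limb;   /* one 64-bit alpha limb (mp_limb_t) */

/* rp[0..n-1] += up[0..n-1] * v0; return the carry-out limb.  */
limb ref_addmul_1 (limb *rp, const limb *up, long n, limb v0)
{
  limb cy = 0;
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
      rp[i] = (limb) t;
      cy = (limb) (t >> 64);
    }
  return cy;
}

/* rp[0..n-1] -= up[0..n-1] * v0; return the borrow-out limb.  */
limb ref_submul_1 (limb *rp, const limb *up, long n, limb v0)
{
  limb cy = 0;                                 /* borrow accumulated so far */
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * v0;
      limb lo = (limb) p + cy;
      limb hi = (limb) (p >> 64) + (lo < cy);  /* carry from adding cy in */
      limb r  = rp[i] - lo;
      cy = hi + (rp[i] < lo);                  /* borrow out of the subtract */
      rp[i] = r;
    }
  return cy;
}

The assembly in the diff below implements the same contract with an 8-way unrolled loop scheduled for the ev6 pipeline.
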
diff --git a/mpn/alpha/ev6/addmul_1.asm b/mpn/alpha/ev6/addmul_1.asm
deleted file mode 100644
index 8a30d7a09..000000000
--- a/mpn/alpha/ev6/addmul_1.asm
+++ /dev/null
@@ -1,388 +0,0 @@
-dnl Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
-dnl result to a second limb vector.
-
-dnl Copyright 2000, 2003, 2004, 2005 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of the GNU Lesser General Public License as published
-dnl by the Free Software Foundation; either version 3 of the License, or (at
-dnl your option) any later version.
-
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-dnl License for more details.
-
-dnl You should have received a copy of the GNU Lesser General Public License
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C EV4: 42
-C EV5: 18
-C EV6: 3.5
-
-C INPUT PARAMETERS
-C rp r16
-C up r17
-C n r18
-C vlimb r19
-
-dnl This code was written in cooperation with ev6 pipeline expert Steve Root.
-
-dnl The stores can issue a cycle late so we have paired no-op's to 'catch'
-dnl them, so that further disturbance to the schedule is damped.
-
-dnl We couldn't pair the loads, because the entangled schedule of the carry's
-dnl has to happen on one side {0} of the machine.
-
-dnl This is a great schedule for the d_cache, a poor schedule for the b_cache.
-dnl The lockup on U0 means that any stall can't be recovered from. Consider a
-dnl ldq in L1, say that load gets stalled because it collides with a fill from
-dnl the b_cache. On the next cycle, this load gets priority. If first looks
-dnl at L0, and goes there. The instruction we intended for L0 gets to look at
-dnl L1, which is NOT where we want it. It either stalls 1, because it can't
-dnl go in L0, or goes there, and causes a further instruction to stall.
-
-dnl So for b_cache, we're likely going to want to put one or more cycles back
-dnl into the code! And, of course, put in lds prefetch for the rp[] operand.
-dnl At a place where we have an mt followed by a bookkeeping, put the
-dnl bookkeeping in upper, and the prefetch into lower.
-
-dnl Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd
-dnl like not to have an ldq or an stq to preceded a conditional branch in a
-dnl quadpack. The conditional branch moves the retire pointer one cycle
-dnl later.
-
-
-ASM_START()
-PROLOGUE(mpn_addmul_1)
- ldq r3, 0(r17) C
- and r18, 7, r20 C
- lda r18, -9(r18) C
- cmpeq r20, 1, r21 C
- beq r21, $L1 C
-
-$1mod8: ldq r5, 0(r16) C
- mulq r19, r3, r7 C
- umulh r19, r3, r8 C
- addq r5, r7, r23 C
- cmpult r23, r7, r20 C
- addq r8, r20, r0 C
- stq r23, 0(r16) C
- bge r18, $ent1 C
- ret r31, (r26), 1 C
-
-$L1: lda r8, 0(r31) C zero carry reg
- lda r24, 0(r31) C zero carry reg
- cmpeq r20, 2, r21 C
- bne r21, $2mod8 C
- cmpeq r20, 3, r21 C
- bne r21, $3mod8 C
- cmpeq r20, 4, r21 C
- bne r21, $4mod8 C
- cmpeq r20, 5, r21 C
- bne r21, $5mod8 C
- cmpeq r20, 6, r21 C
- bne r21, $6mod8 C
- cmpeq r20, 7, r21 C
- beq r21, $0mod8 C
-
-$7mod8: ldq r5, 0(r16) C
- lda r17, 8(r17) C
- mulq r19, r3, r7 C
- umulh r19, r3, r24 C
- addq r5, r7, r23 C
- cmpult r23, r7, r20 C
- addq r24, r20, r24 C
- stq r23, 0(r16) C
- lda r16, 8(r16) C
- ldq r3, 0(r17) C
-$6mod8: ldq r1, 8(r17) C
- mulq r19, r3, r25 C
- umulh r19, r3, r3 C
- mulq r19, r1, r28 C
- ldq r0, 16(r17) C
- ldq r4, 0(r16) C
- umulh r19, r1, r8 C
- ldq r1, 24(r17) C
- lda r17, 48(r17) C L1 bookkeeping
- mulq r19, r0, r2 C
- ldq r5, 8(r16) C
- lda r16, -32(r16) C L1 bookkeeping
- umulh r19, r0, r6 C
- addq r4, r25, r4 C lo + acc
- mulq r19, r1, r7 C
- br r31, $ent6 C
-
-$ent1: lda r17, 8(r17) C
- lda r16, 8(r16) C
- lda r8, 0(r0) C
- ldq r3, 0(r17) C
-$0mod8: ldq r1, 8(r17) C
- mulq r19, r3, r2 C
- umulh r19, r3, r6 C
- mulq r19, r1, r7 C
- ldq r0, 16(r17) C
- ldq r4, 0(r16) C
- umulh r19, r1, r24 C
- ldq r1, 24(r17) C
- mulq r19, r0, r25 C
- ldq r5, 8(r16) C
- umulh r19, r0, r3 C
- addq r4, r2, r4 C lo + acc
- mulq r19, r1, r28 C
- lda r16, -16(r16) C
- br r31, $ent0 C
-
-$3mod8: ldq r5, 0(r16) C
- lda r17, 8(r17) C
- mulq r19, r3, r7 C
- umulh r19, r3, r8 C
- addq r5, r7, r23 C
- cmpult r23, r7, r20 C
- addq r8, r20, r24 C
- stq r23, 0(r16) C
- lda r16, 8(r16) C
- ldq r3, 0(r17) C
-$2mod8: ldq r1, 8(r17) C
- mulq r19, r3, r25 C
- umulh r19, r3, r3 C
- mulq r19, r1, r28 C
- ble r18, $n23 C
- ldq r0, 16(r17) C
- ldq r4, 0(r16) C
- umulh r19, r1, r8 C
- ldq r1, 24(r17) C
- lda r17, 16(r17) C L1 bookkeeping
- mulq r19, r0, r2 C
- ldq r5, 8(r16) C
- lda r16, 0(r16) C L1 bookkeeping
- umulh r19, r0, r6 C
- addq r4, r25, r4 C lo + acc
- mulq r19, r1, r7 C
- br r31, $ent2 C
-
-$5mod8: ldq r5, 0(r16) C
- lda r17, 8(r17) C
- mulq r19, r3, r7 C
- umulh r19, r3, r24 C
- addq r5, r7, r23 C
- cmpult r23, r7, r20 C
- addq r24, r20, r8 C
- stq r23, 0(r16) C
- lda r16, 8(r16) C
- ldq r3, 0(r17) C
-$4mod8: ldq r1, 8(r17) C
- mulq r19, r3, r2 C
- umulh r19, r3, r6 C
- mulq r19, r1, r7 C
- ldq r0, 16(r17) C
- ldq r4, 0(r16) C
- umulh r19, r1, r24 C
- ldq r1, 24(r17) C
- lda r17, 32(r17) C L1 bookkeeping
- mulq r19, r0, r25 C
- ldq r5, 8(r16) C
- lda r16, 16(r16) C L1 bookkeeping
- umulh r19, r0, r3 C
- addq r4, r2, r4 C lo + acc
- mulq r19, r1, r28 C
- cmpult r4, r2, r20 C L0 lo add => carry
- addq r4, r8, r22 C U0 hi add => answer
- ble r18, $Lend C
-ALIGN(16)
-$Loop:
- bis r31, r31, r31 C U1 mt
- cmpult r22, r8, r21 C L0 hi add => carry
- addq r6, r20, r6 C U0 hi mul + carry
- ldq r0, 0(r17) C
-
- bis r31, r31, r31 C U1 mt
- addq r5, r7, r23 C L0 lo + acc
- addq r6, r21, r6 C U0 hi mul + carry
- ldq r4, 0(r16) C L1
-
- umulh r19, r1, r8 C U1
- cmpult r23, r7, r20 C L0 lo add => carry
- addq r23, r6, r23 C U0 hi add => answer
- ldq r1, 8(r17) C L1
-
- mulq r19, r0, r2 C U1
- cmpult r23, r6, r21 C L0 hi add => carry
- addq r24, r20, r24 C U0 hi mul + carry
- ldq r5, 8(r16) C L1
-
- umulh r19, r0, r6 C U1
- addq r4, r25, r4 C U0 lo + acc
- stq r22, -16(r16) C L0
- stq r23, -8(r16) C L1
-
- bis r31, r31, r31 C L0 st slosh
- mulq r19, r1, r7 C U1
- bis r31, r31, r31 C L1 st slosh
- addq r24, r21, r24 C U0 hi mul + carry
-$ent2:
- cmpult r4, r25, r20 C L0 lo add => carry
- bis r31, r31, r31 C U1 mt
- lda r18, -8(r18) C L1 bookkeeping
- addq r4, r24, r22 C U0 hi add => answer
-
- bis r31, r31, r31 C U1 mt
- cmpult r22, r24, r21 C L0 hi add => carry
- addq r3, r20, r3 C U0 hi mul + carry
- ldq r0, 16(r17) C L1
-
- bis r31, r31, r31 C U1 mt
- addq r5, r28, r23 C L0 lo + acc
- addq r3, r21, r3 C U0 hi mul + carry
- ldq r4, 16(r16) C L1
-
- umulh r19, r1, r24 C U1
- cmpult r23, r28, r20 C L0 lo add => carry
- addq r23, r3, r23 C U0 hi add => answer
- ldq r1, 24(r17) C L1
-
- mulq r19, r0, r25 C U1
- cmpult r23, r3, r21 C L0 hi add => carry
- addq r8, r20, r8 C U0 hi mul + carry
- ldq r5, 24(r16) C L1
-
- umulh r19, r0, r3 C U1
- addq r4, r2, r4 C U0 lo + acc
- stq r22, 0(r16) C L0
- stq r23, 8(r16) C L1
-
- bis r31, r31, r31 C L0 st slosh
- mulq r19, r1, r28 C U1
- bis r31, r31, r31 C L1 st slosh
- addq r8, r21, r8 C L0 hi mul + carry
-$ent0:
- cmpult r4, r2, r20 C L0 lo add => carry
- bis r31, r31, r31 C U1 mt
- lda r17, 64(r17) C L1 bookkeeping
- addq r4, r8, r22 C U0 hi add => answer
-
- bis r31, r31, r31 C U1 mt
- cmpult r22, r8, r21 C L0 hi add => carry
- addq r6, r20, r6 C U0 hi mul + carry
- ldq r0, -32(r17) C L1
-
- bis r31, r31, r31 C U1 mt
- addq r5, r7, r23 C L0 lo + acc
- addq r6, r21, r6 C U0 hi mul + carry
- ldq r4, 32(r16) C L1
-
- umulh r19, r1, r8 C U1
- cmpult r23, r7, r20 C L0 lo add => carry
- addq r23, r6, r23 C U0 hi add => answer
- ldq r1, -24(r17) C L1
-
- mulq r19, r0, r2 C U1
- cmpult r23, r6, r21 C L0 hi add => carry
- addq r24, r20, r24 C U0 hi mul + carry
- ldq r5, 40(r16) C L1
-
- umulh r19, r0, r6 C U1
- addq r4, r25, r4 C U0 lo + acc
- stq r22, 16(r16) C L0
- stq r23, 24(r16) C L1
-
- bis r31, r31, r31 C L0 st slosh
- mulq r19, r1, r7 C U1
- bis r31, r31, r31 C L1 st slosh
- addq r24, r21, r24 C U0 hi mul + carry
-$ent6:
- cmpult r4, r25, r20 C L0 lo add => carry
- bis r31, r31, r31 C U1 mt
- lda r16, 64(r16) C L1 bookkeeping
- addq r4, r24, r22 C U0 hi add => answer
-
- bis r31, r31, r31 C U1 mt
- cmpult r22, r24, r21 C L0 hi add => carry
- addq r3, r20, r3 C U0 hi mul + carry
- ldq r0, -16(r17) C L1
-
- bis r31, r31, r31 C U1 mt
- addq r5, r28, r23 C L0 lo + acc
- addq r3, r21, r3 C U0 hi mul + carry
- ldq r4, -16(r16) C L1
-
- umulh r19, r1, r24 C U1
- cmpult r23, r28, r20 C L0 lo add => carry
- addq r23, r3, r23 C U0 hi add => answer
- ldq r1, -8(r17) C L1
-
- mulq r19, r0, r25 C U1
- cmpult r23, r3, r21 C L0 hi add => carry
- addq r8, r20, r8 C U0 hi mul + carry
- ldq r5, -8(r16) C L1
-
- umulh r19, r0, r3 C U1
- addq r4, r2, r4 C L0 lo + acc
- stq r22, -32(r16) C L0
- stq r23, -24(r16) C L1
-
- bis r31, r31, r31 C L0 st slosh
- mulq r19, r1, r28 C U1
- bis r31, r31, r31 C L1 st slosh
- addq r8, r21, r8 C U0 hi mul + carry
-
- cmpult r4, r2, r20 C L0 lo add => carry
- addq r4, r8, r22 C U0 hi add => answer
- ldl r31, 256(r17) C prefetch up[]
- bgt r18, $Loop C U1 bookkeeping
-
-$Lend: cmpult r22, r8, r21 C
- addq r6, r20, r6 C
- addq r5, r7, r23 C
- addq r6, r21, r6 C
- ldq r4, 0(r16) C
- umulh r19, r1, r8 C
- cmpult r23, r7, r20 C
- addq r23, r6, r23 C
- cmpult r23, r6, r21 C
- addq r24, r20, r24 C
- ldq r5, 8(r16) C
- addq r4, r25, r4 C
- stq r22, -16(r16) C
- stq r23, -8(r16) C
- addq r24, r21, r24 C
- cmpult r4, r25, r20 C
- addq r4, r24, r22 C
- cmpult r22, r24, r21 C
- addq r3, r20, r3 C
- addq r5, r28, r23 C
- addq r3, r21, r3 C
- cmpult r23, r28, r20 C
- addq r23, r3, r23 C
- cmpult r23, r3, r21 C
- addq r8, r20, r8 C
- stq r22, 0(r16) C
- stq r23, 8(r16) C
- addq r8, r21, r0 C
- ret r31, (r26), 1 C
-
-$n23: ldq r4, 0(r16) C
- ldq r5, 8(r16) C
- umulh r19, r1, r8 C
- addq r4, r25, r4 C
- cmpult r4, r25, r20 C
- addq r4, r24, r22 C
- cmpult r22, r24, r21 C
- addq r3, r20, r3 C
- addq r5, r28, r23 C
- addq r3, r21, r3 C
- cmpult r23, r28, r20 C
- addq r23, r3, r23 C
- cmpult r23, r3, r21 C
- addq r8, r20, r8 C
- stq r22, 0(r16) C
- stq r23, 8(r16) C
- addq r8, r21, r0 C
- ret r31, (r26), 1 C
-EPILOGUE()
-ASM_END()
diff --git a/mpn/alpha/ev6/aorsmul_1.asm b/mpn/alpha/ev6/aorsmul_1.asm
new file mode 100644
index 000000000..eda092b2d
--- /dev/null
+++ b/mpn/alpha/ev6/aorsmul_1.asm
@@ -0,0 +1,387 @@
+dnl Alpha ev6 mpn_addmul_1 and mpn_submul_1.
+
+dnl Copyright 2000, 2003, 2004, 2005, 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 42
+C EV5: 18
+C EV6: 3.5
+
+C INPUT PARAMETERS
+define(`rp', `r16')
+define(`up', `r17')
+define(`n', `r18')
+define(`v0', `r19')
+
+dnl This code was written in cooperation with ev6 pipeline expert Steve Root.
+
+dnl The stores can issue a cycle late so we have paired no-op's to 'catch'
+dnl them, so that further disturbance to the schedule is damped.
+
+dnl We couldn't pair the loads, because the entangled schedule of the carries
+dnl has to happen on one side {0} of the machine.
+
+dnl This is a great schedule for the d_cache, a poor schedule for the b_cache.
+dnl The lockup on U0 means that any stall can't be recovered from. Consider
+dnl an ldq in L1; say that load gets stalled because it collides with a fill
+dnl from the b_cache. On the next cycle, this load gets priority. It first
+dnl looks at L0, and goes there. The instruction we intended for L0 gets to
+dnl look at L1, which is NOT where we want it. It either stalls a cycle,
+dnl because it can't go in L0, or goes there and causes a further instruction
+dnl to stall.
+
+dnl So for b_cache, we're likely going to want to put one or more cycles back
+dnl into the code! And, of course, put in lds prefetch for the rp[] operand.
+dnl At a place where we have an mt followed by a bookkeeping, put the
+dnl bookkeeping in upper, and the prefetch into lower.
+
+dnl Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd
+dnl like not to have an ldq or an stq precede a conditional branch in a
+dnl quadpack. The conditional branch moves the retire pointer one cycle
+dnl later.
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `addq')
+ define(`CMPCY', `cmpult $2,$1')
+ define(`func', `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `subq')
+ define(`CMPCY', `cmpult $1,$2')
+ define(`func', `mpn_submul_1')
+')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+PROLOGUE(func)
+ ldq r3, 0(up) C
+ and r18, 7, r20 C
+ lda r18, -9(r18) C
+ cmpeq r20, 1, r21 C
+ beq r21, $L1 C
+
+$1mod8: ldq r5, 0(rp) C
+ mulq v0, r3, r7 C
+ umulh v0, r3, r8 C
+ ADDSUB r5, r7, r23 C
+ CMPCY( r5, r23), r20 C
+ addq r8, r20, r0 C
+ stq r23, 0(rp) C
+ bge r18, $ent1 C
+ ret r31, (r26), 1 C
+
+$L1: lda r8, 0(r31) C zero carry reg
+ lda r24, 0(r31) C zero carry reg
+ cmpeq r20, 2, r21 C
+ bne r21, $2mod8 C
+ cmpeq r20, 3, r21 C
+ bne r21, $3mod8 C
+ cmpeq r20, 4, r21 C
+ bne r21, $4mod8 C
+ cmpeq r20, 5, r21 C
+ bne r21, $5mod8 C
+ cmpeq r20, 6, r21 C
+ bne r21, $6mod8 C
+ cmpeq r20, 7, r21 C
+ beq r21, $0mod8 C
+
+$7mod8: ldq r5, 0(rp) C
+ lda up, 8(up) C
+ mulq v0, r3, r7 C
+ umulh v0, r3, r24 C
+ ADDSUB r5, r7, r23 C
+ CMPCY( r5, r23), r20 C
+ addq r24, r20, r24 C
+ stq r23, 0(rp) C
+ lda rp, 8(rp) C
+ ldq r3, 0(up) C
+$6mod8: ldq r1, 8(up) C
+ mulq v0, r3, r25 C
+ umulh v0, r3, r3 C
+ mulq v0, r1, r28 C
+ ldq r0, 16(up) C
+ ldq r4, 0(rp) C
+ umulh v0, r1, r8 C
+ ldq r1, 24(up) C
+ lda up, 48(up) C L1 bookkeeping
+ mulq v0, r0, r2 C
+ ldq r5, 8(rp) C
+ lda rp, -32(rp) C L1 bookkeeping
+ umulh v0, r0, r6 C
+ ADDSUB r4, r25, r25 C lo + acc
+ mulq v0, r1, r7 C
+ br r31, $ent6 C
+
+$ent1: lda up, 8(up) C
+ lda rp, 8(rp) C
+ lda r8, 0(r0) C
+ ldq r3, 0(up) C
+$0mod8: ldq r1, 8(up) C
+ mulq v0, r3, r2 C
+ umulh v0, r3, r6 C
+ mulq v0, r1, r7 C
+ ldq r0, 16(up) C
+ ldq r4, 0(rp) C
+ umulh v0, r1, r24 C
+ ldq r1, 24(up) C
+ mulq v0, r0, r25 C
+ ldq r5, 8(rp) C
+ umulh v0, r0, r3 C
+ ADDSUB r4, r2, r2 C lo + acc
+ mulq v0, r1, r28 C
+ lda rp, -16(rp) C
+ br r31, $ent0 C
+
+$3mod8: ldq r5, 0(rp) C
+ lda up, 8(up) C
+ mulq v0, r3, r7 C
+ umulh v0, r3, r8 C
+ ADDSUB r5, r7, r23 C
+ CMPCY( r5, r23), r20 C
+ addq r8, r20, r24 C
+ stq r23, 0(rp) C
+ lda rp, 8(rp) C
+ ldq r3, 0(up) C
+$2mod8: ldq r1, 8(up) C
+ mulq v0, r3, r25 C
+ umulh v0, r3, r3 C
+ mulq v0, r1, r28 C
+ ble r18, $n23 C
+ ldq r0, 16(up) C
+ ldq r4, 0(rp) C
+ umulh v0, r1, r8 C
+ ldq r1, 24(up) C
+ lda up, 16(up) C L1 bookkeeping
+ mulq v0, r0, r2 C
+ ldq r5, 8(rp) C
+ lda rp, 0(rp) C L1 bookkeeping
+ umulh v0, r0, r6 C
+ ADDSUB r4, r25, r25 C lo + acc
+ mulq v0, r1, r7 C
+ br r31, $ent2 C
+
+$5mod8: ldq r5, 0(rp) C
+ lda up, 8(up) C
+ mulq v0, r3, r7 C
+ umulh v0, r3, r24 C
+ ADDSUB r5, r7, r23 C
+ CMPCY( r5, r23), r20 C
+ addq r24, r20, r8 C
+ stq r23, 0(rp) C
+ lda rp, 8(rp) C
+ ldq r3, 0(up) C
+$4mod8: ldq r1, 8(up) C
+ mulq v0, r3, r2 C
+ umulh v0, r3, r6 C
+ mulq v0, r1, r7 C
+ ldq r0, 16(up) C
+ ldq r4, 0(rp) C
+ umulh v0, r1, r24 C
+ ldq r1, 24(up) C
+ lda up, 32(up) C L1 bookkeeping
+ mulq v0, r0, r25 C
+ ldq r5, 8(rp) C
+ lda rp, 16(rp) C L1 bookkeeping
+ umulh v0, r0, r3 C
+ ADDSUB r4, r2, r2 C lo + acc
+ mulq v0, r1, r28 C
+ CMPCY( r4, r2), r20 C L0 lo add => carry
+ ADDSUB r2, r8, r22 C U0 hi add => answer
+ ble r18, $Lend C
+ ALIGN(16)
+$Loop:
+ bis r31, r31, r31 C U1 mt
+ CMPCY( r2, r22), r21 C L0 hi add => carry
+ addq r6, r20, r6 C U0 hi mul + carry
+ ldq r0, 0(up) C
+
+ bis r31, r31, r31 C U1 mt
+ ADDSUB r5, r7, r7 C L0 lo + acc
+ addq r6, r21, r6 C U0 hi mul + carry
+ ldq r4, 0(rp) C L1
+
+ umulh v0, r1, r8 C U1
+ CMPCY( r5, r7), r20 C L0 lo add => carry
+ ADDSUB r7, r6, r23 C U0 hi add => answer
+ ldq r1, 8(up) C L1
+
+ mulq v0, r0, r2 C U1
+ CMPCY( r7, r23), r21 C L0 hi add => carry
+ addq r24, r20, r24 C U0 hi mul + carry
+ ldq r5, 8(rp) C L1
+
+ umulh v0, r0, r6 C U1
+ ADDSUB r4, r25, r25 C U0 lo + acc
+ stq r22, -16(rp) C L0
+ stq r23, -8(rp) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq v0, r1, r7 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r24, r21, r24 C U0 hi mul + carry
+$ent2:
+ CMPCY( r4, r25), r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda r18, -8(r18) C L1 bookkeeping
+ ADDSUB r25, r24, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ CMPCY( r25, r22), r21 C L0 hi add => carry
+ addq r3, r20, r3 C U0 hi mul + carry
+ ldq r0, 16(up) C L1
+
+ bis r31, r31, r31 C U1 mt
+ ADDSUB r5, r28, r28 C L0 lo + acc
+ addq r3, r21, r3 C U0 hi mul + carry
+ ldq r4, 16(rp) C L1
+
+ umulh v0, r1, r24 C U1
+ CMPCY( r5, r28), r20 C L0 lo add => carry
+ ADDSUB r28, r3, r23 C U0 hi add => answer
+ ldq r1, 24(up) C L1
+
+ mulq v0, r0, r25 C U1
+ CMPCY( r28, r23), r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ ldq r5, 24(rp) C L1
+
+ umulh v0, r0, r3 C U1
+ ADDSUB r4, r2, r2 C U0 lo + acc
+ stq r22, 0(rp) C L0
+ stq r23, 8(rp) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq v0, r1, r28 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r8, r21, r8 C U0 hi mul + carry
+$ent0:
+ CMPCY( r4, r2), r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda up, 64(up) C L1 bookkeeping
+ ADDSUB r2, r8, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ CMPCY( r2, r22), r21 C L0 hi add => carry
+ addq r6, r20, r6 C U0 hi mul + carry
+ ldq r0, -32(up) C L1
+
+ bis r31, r31, r31 C U1 mt
+ ADDSUB r5, r7, r7 C L0 lo + acc
+ addq r6, r21, r6 C U0 hi mul + carry
+ ldq r4, 32(rp) C L1
+
+ umulh v0, r1, r8 C U1
+ CMPCY( r5, r7), r20 C L0 lo add => carry
+ ADDSUB r7, r6, r23 C U0 hi add => answer
+ ldq r1, -24(up) C L1
+
+ mulq v0, r0, r2 C U1
+ CMPCY( r7, r23), r21 C L0 hi add => carry
+ addq r24, r20, r24 C U0 hi mul + carry
+ ldq r5, 40(rp) C L1
+
+ umulh v0, r0, r6 C U1
+ ADDSUB r4, r25, r25 C U0 lo + acc
+ stq r22, 16(rp) C L0
+ stq r23, 24(rp) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq v0, r1, r7 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r24, r21, r24 C U0 hi mul + carry
+$ent6:
+ CMPCY( r4, r25), r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda rp, 64(rp) C L1 bookkeeping
+ ADDSUB r25, r24, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ CMPCY( r25, r22), r21 C L0 hi add => carry
+ addq r3, r20, r3 C U0 hi mul + carry
+ ldq r0, -16(up) C L1
+
+ bis r31, r31, r31 C U1 mt
+ ADDSUB r5, r28, r28 C L0 lo + acc
+ addq r3, r21, r3 C U0 hi mul + carry
+ ldq r4, -16(rp) C L1
+
+ umulh v0, r1, r24 C U1
+ CMPCY( r5, r28), r20 C L0 lo add => carry
+ ADDSUB r28, r3, r23 C U0 hi add => answer
+ ldq r1, -8(up) C L1
+
+ mulq v0, r0, r25 C U1
+ CMPCY( r28, r23), r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ ldq r5, -8(rp) C L1
+
+ umulh v0, r0, r3 C U1
+ ADDSUB r4, r2, r2 C U0 lo + acc
+ stq r22, -32(rp) C L0
+ stq r23, -24(rp) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq v0, r1, r28 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r8, r21, r8 C U0 hi mul + carry
+
+ CMPCY( r4, r2), r20 C L0 lo add => carry
+ ADDSUB r2, r8, r22 C U0 hi add => answer
+ ldl r31, 256(up) C prefetch up[]
+ bgt r18, $Loop C U1 bookkeeping
+
+$Lend: CMPCY( r2, r22), r21 C
+ addq r6, r20, r6 C
+ ADDSUB r5, r7, r7 C
+ addq r6, r21, r6 C
+ ldq r4, 0(rp) C
+ umulh v0, r1, r8 C
+ CMPCY( r5, r7), r20 C
+ ADDSUB r7, r6, r23 C
+ CMPCY(r7, r23), r21 C
+ addq r24, r20, r24 C
+ ldq r5, 8(rp) C
+ ADDSUB r4, r25, r25 C
+ stq r22, -16(rp) C
+ stq r23, -8(rp) C
+ addq r24, r21, r24 C
+ br L(x)
+
+ ALIGN(16)
+$n23: ldq r4, 0(rp) C
+ ldq r5, 8(rp) C
+ umulh v0, r1, r8 C
+ ADDSUB r4, r25, r25 C
+L(x): CMPCY( r4, r25), r20 C
+ ADDSUB r25, r24, r22 C
+ CMPCY( r25, r22), r21 C
+ addq r3, r20, r3 C
+ ADDSUB r5, r28, r28 C
+ addq r3, r21, r3 C
+ CMPCY( r5, r28), r20 C
+ ADDSUB r28, r3, r23 C
+ CMPCY( r28, r23), r21 C
+ addq r8, r20, r8 C
+ stq r22, 0(rp) C
+ stq r23, 8(rp) C
+ addq r8, r21, r0 C
+ ret r31, (r26), 1 C
+EPILOGUE()
+ASM_END()
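
The only functional difference between the two operations in the file above is the ADDSUB/CMPCY macro pair: addmul uses addq and detects the carry with `cmpult $2,$1' (did the sum wrap below its addend?), while submul uses subq and detects the borrow with `cmpult $1,$2' (did the difference wrap above its minuend?). A tiny C illustration of those two unsigned-compare conventions, assuming 64-bit limbs (the variable names are invented for the example):

#include <assert.h>
#include <stdint.h>

int main (void)
{
  /* addmul_1 convention: after s = a + b, a carry occurred iff s < a
     (this is what CMPCY(a,s), i.e. cmpult s,a, computes).  */
  uint64_t a = UINT64_MAX - 1, b = 5;
  uint64_t s = a + b;                 /* wraps to 3 */
  assert ((s < a) == 1);

  /* submul_1 convention: after d = a - b, a borrow occurred iff a < d
     (this is what CMPCY(a,d), i.e. cmpult a,d, computes).  */
  a = 3;
  uint64_t d = a - b;                 /* wraps to 2^64 - 2 */
  assert ((a < d) == 1);

  return 0;
}
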
diff --git a/mpn/alpha/ev6/submul_1.asm b/mpn/alpha/ev6/submul_1.asm
deleted file mode 100644
index 01abad6d7..000000000
--- a/mpn/alpha/ev6/submul_1.asm
+++ /dev/null
@@ -1,471 +0,0 @@
-dnl Alpha ev6 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
-dnl the result from a second limb vector.
-
-dnl Copyright 2000, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of the GNU Lesser General Public License as published
-dnl by the Free Software Foundation; either version 3 of the License, or (at
-dnl your option) any later version.
-
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-dnl License for more details.
-
-dnl You should have received a copy of the GNU Lesser General Public License
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-dnl INPUT PARAMETERS
-dnl res_ptr r16
-dnl s1_ptr r17
-dnl size r18
-dnl s2_limb r19
-
-dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
-dnl exactly 3.5 cycles/limb on EV6...
-
-dnl This code was written in close cooperation with ev6 pipeline expert
-dnl Steve Root. Any errors are tege's fault, though.
-dnl
-dnl Register usages for unrolled loop:
-dnl 0-3 mul's
-dnl 4-7 acc's
-dnl 8-15 mul results
-dnl 20,21 carry's
-dnl 22,23 save for stores
-
-dnl Sustains 8 mul-adds in 28 cycles in the unrolled inner loop.
-
-dnl The stores can issue a cycle late so we have paired no-op's to 'catch'
-dnl them, so that further disturbance to the schedule is damped.
-
-dnl We couldn't pair the loads, because the entangled schedule of the
-dnl carry's has to happen on one side {0} of the machine. Note, the total
-dnl use of U0, and the total use of L0 (after attending to the stores).
-dnl which is part of the reason why....
-
-dnl This is a great schedule for the d_cache, a poor schedule for the
-dnl b_cache. The lockup on U0 means that any stall can't be recovered
-dnl from. Consider a ldq in L1. say that load gets stalled because it
-dnl collides with a fill from the b_Cache. On the next cycle, this load
-dnl gets priority. If first looks at L0, and goes there. The instruction
-dnl we intended for L0 gets to look at L1, which is NOT where we want
-dnl it. It either stalls 1, because it can't go in L0, or goes there, and
-dnl causes a further instruction to stall.
-
-dnl So for b_cache, we're likely going to want to put one or more cycles
-dnl back into the code! And, of course, put in prefetches. For the
-dnl accumulator, lds, intent to modify. For the multiplier, you might
-dnl want ldq, evict next, if you're not wanting to use it again soon. Use
-dnl 256 ahead of present pointer value. At a place where we have an mt
-dnl followed by a bookkeeping, put the bookkeeping in upper, and the
-dnl prefetch into lower.
-
-dnl Note, the usage of physical registers per cycle is smoothed off, as
-dnl much as possible.
-
-dnl Note, the ldq's and stq's are at the end of the quadpacks. note, we'd
-dnl like not to have a ldq or stq to preceded a conditional branch in a
-dnl quadpack. The conditional branch moves the retire pointer one cycle
-dnl later.
-
-dnl Optimization notes:
-dnl Callee-saves regs: r9 r10 r11 r12 r13 r14 r15 r26 ?r27?
-dnl Reserved regs: r29 r30 r31
-dnl Free caller-saves regs in unrolled code: r24 r25 r28
-dnl We should swap some of the callee-saves regs for some of the free
-dnl caller-saves regs, saving some overhead cycles.
-dnl Most importantly, we should write fast code for the 0-7 case.
-dnl The code we use there are for the 21164, and runs at 7 cycles/limb
-dnl on the 21264. Should not be hard, if we write specialized code for
-dnl 1-7 limbs (the one for 0 limbs should be straightforward). We then just
-dnl need a jump table indexed by the low 3 bits of the count argument.
-
-
-ASM_START()
-PROLOGUE(mpn_submul_1)
- cmpult r18, 8, r1
- beq r1, $Large
-
- ldq r2, 0(r17) C r2 = s1_limb
- addq r17, 8, r17 C s1_ptr++
- subq r18, 1, r18 C size--
- mulq r2, r19, r3 C r3 = prod_low
- ldq r5, 0(r16) C r5 = *res_ptr
- umulh r2, r19, r0 C r0 = prod_high
- beq r18, $Lend0b C jump if size was == 1
- ldq r2, 0(r17) C r2 = s1_limb
- addq r17, 8, r17 C s1_ptr++
- subq r18, 1, r18 C size--
- subq r5, r3, r3
- cmpult r5, r3, r4
- stq r3, 0(r16)
- addq r16, 8, r16 C res_ptr++
- beq r18, $Lend0a C jump if size was == 2
-
- ALIGN(8)
-$Loop0: mulq r2, r19, r3 C r3 = prod_low
- ldq r5, 0(r16) C r5 = *res_ptr
- addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
- subq r18, 1, r18 C size--
- umulh r2, r19, r4 C r4 = cy_limb
- ldq r2, 0(r17) C r2 = s1_limb
- addq r17, 8, r17 C s1_ptr++
- addq r3, r0, r3 C r3 = cy_limb + prod_low
- cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
- subq r5, r3, r3
- cmpult r5, r3, r5
- stq r3, 0(r16)
- addq r16, 8, r16 C res_ptr++
- addq r5, r0, r0 C combine carries
- bne r18, $Loop0
-$Lend0a:
- mulq r2, r19, r3 C r3 = prod_low
- ldq r5, 0(r16) C r5 = *res_ptr
- addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
- umulh r2, r19, r4 C r4 = cy_limb
- addq r3, r0, r3 C r3 = cy_limb + prod_low
- cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
- subq r5, r3, r3
- cmpult r5, r3, r5
- stq r3, 0(r16)
- addq r5, r0, r0 C combine carries
- addq r4, r0, r0 C cy_limb = prod_high + cy
- ret r31, (r26), 1
-$Lend0b:
- subq r5, r3, r3
- cmpult r5, r3, r5
- stq r3, 0(r16)
- addq r0, r5, r0
- ret r31, (r26), 1
-
-$Large:
- lda $30, -240($30)
- stq $9, 8($30)
- stq $10, 16($30)
- stq $11, 24($30)
- stq $12, 32($30)
- stq $13, 40($30)
- stq $14, 48($30)
- stq $15, 56($30)
-
- and r18, 7, r20 C count for the first loop, 0-7
- srl r18, 3, r18 C count for unrolled loop
- bis r31, r31, r0
- beq r20, $Lunroll
- ldq r2, 0(r17) C r2 = s1_limb
- addq r17, 8, r17 C s1_ptr++
- subq r20, 1, r20 C size--
- mulq r2, r19, r3 C r3 = prod_low
- ldq r5, 0(r16) C r5 = *res_ptr
- umulh r2, r19, r0 C r0 = prod_high
- beq r20, $Lend1b C jump if size was == 1
- ldq r2, 0(r17) C r2 = s1_limb
- addq r17, 8, r17 C s1_ptr++
- subq r20, 1, r20 C size--
- subq r5, r3, r3
- cmpult r5, r3, r4
- stq r3, 0(r16)
- addq r16, 8, r16 C res_ptr++
- beq r20, $Lend1a C jump if size was == 2
-
- ALIGN(8)
-$Loop1: mulq r2, r19, r3 C r3 = prod_low
- ldq r5, 0(r16) C r5 = *res_ptr
- addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
- subq r20, 1, r20 C size--
- umulh r2, r19, r4 C r4 = cy_limb
- ldq r2, 0(r17) C r2 = s1_limb
- addq r17, 8, r17 C s1_ptr++
- addq r3, r0, r3 C r3 = cy_limb + prod_low
- cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
- subq r5, r3, r3
- cmpult r5, r3, r5
- stq r3, 0(r16)
- addq r16, 8, r16 C res_ptr++
- addq r5, r0, r0 C combine carries
- bne r20, $Loop1
-
-$Lend1a:
- mulq r2, r19, r3 C r3 = prod_low
- ldq r5, 0(r16) C r5 = *res_ptr
- addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
- umulh r2, r19, r4 C r4 = cy_limb
- addq r3, r0, r3 C r3 = cy_limb + prod_low
- cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
- subq r5, r3, r3
- cmpult r5, r3, r5
- stq r3, 0(r16)
- addq r16, 8, r16 C res_ptr++
- addq r5, r0, r0 C combine carries
- addq r4, r0, r0 C cy_limb = prod_high + cy
- br r31, $Lunroll
-$Lend1b:
- subq r5, r3, r3
- cmpult r5, r3, r5
- stq r3, 0(r16)
- addq r16, 8, r16 C res_ptr++
- addq r0, r5, r0
-
-$Lunroll:
- lda r17, -16(r17) C L1 bookkeeping
- lda r16, -16(r16) C L1 bookkeeping
- bis r0, r31, r12
-
-C ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
-
- ldq r2, 16(r17) C L1
- ldq r3, 24(r17) C L1
- lda r18, -1(r18) C L1 bookkeeping
- ldq r6, 16(r16) C L1
- ldq r7, 24(r16) C L1
- ldq r0, 32(r17) C L1
- mulq r19, r2, r13 C U1
- ldq r1, 40(r17) C L1
- umulh r19, r2, r14 C U1
- mulq r19, r3, r15 C U1
- lda r17, 64(r17) C L1 bookkeeping
- ldq r4, 32(r16) C L1
- ldq r5, 40(r16) C L1
- umulh r19, r3, r8 C U1
- ldq r2, -16(r17) C L1
- mulq r19, r0, r9 C U1
- ldq r3, -8(r17) C L1
- umulh r19, r0, r10 C U1
- subq r6, r13, r13 C L0 lo + acc
- mulq r19, r1, r11 C U1
- cmpult r6, r13, r20 C L0 lo add => carry
- lda r16, 64(r16) C L1 bookkeeping
- subq r13, r12, r22 C U0 hi add => answer
- cmpult r13, r12, r21 C L0 hi add => carry
- addq r14, r20, r14 C U0 hi mul + carry
- ldq r6, -16(r16) C L1
- subq r7, r15, r28 C L0 lo + acc
- addq r14, r21, r14 C U0 hi mul + carry
- cmpult r7, r15, r20 C L0 lo add => carry
- ldq r7, -8(r16) C L1
- umulh r19, r1, r12 C U1
- subq r28, r14, r23 C U0 hi add => answer
- ldq r0, 0(r17) C L1
- mulq r19, r2, r13 C U1
- cmpult r28, r14, r21 C L0 hi add => carry
- addq r8, r20, r8 C U0 hi mul + carry
- ldq r1, 8(r17) C L1
- umulh r19, r2, r14 C U1
- subq r4, r9, r9 C L0 lo + acc
- stq r22, -48(r16) C L0
- stq r23, -40(r16) C L1
- mulq r19, r3, r15 C U1
- addq r8, r21, r8 C U0 hi mul + carry
- cmpult r4, r9, r20 C L0 lo add => carry
- subq r9, r8, r22 C U0 hi add => answer
- ble r18, $Lend C U1 bookkeeping
-
-C ____ MAIN UNROLLED LOOP ____
- ALIGN(16)
-$Loop:
- bis r31, r31, r31 C U1 mt
- cmpult r9, r8, r21 C L0 hi add => carry
- addq r10, r20, r10 C U0 hi mul + carry
- ldq r4, 0(r16) C L1
-
- bis r31, r31, r31 C U1 mt
- subq r5, r11, r23 C L0 lo + acc
- addq r10, r21, r10 C L0 hi mul + carry
- ldq r2, 16(r17) C L1
-
- umulh r19, r3, r8 C U1
- cmpult r5, r11, r20 C L0 lo add => carry
- subq r23, r10, r28 C U0 hi add => answer
- ldq r5, 8(r16) C L1
-
- mulq r19, r0, r9 C U1
- cmpult r23, r10, r21 C L0 hi add => carry
- addq r12, r20, r12 C U0 hi mul + carry
- ldq r3, 24(r17) C L1
-
- umulh r19, r0, r10 C U1
- subq r6, r13, r13 C U0 lo + acc
- stq r22, -32(r16) C L0
- stq r28, -24(r16) C L1
-
- bis r31, r31, r31 C L0 st slosh
- mulq r19, r1, r11 C U1
- bis r31, r31, r31 C L1 st slosh
- addq r12, r21, r12 C U0 hi mul + carry
-
- cmpult r6, r13, r20 C L0 lo add => carry
- bis r31, r31, r31 C U1 mt
- lda r18, -1(r18) C L1 bookkeeping
- subq r13, r12, r22 C U0 hi add => answer
-
- bis r31, r31, r31 C U1 mt
- cmpult r13, r12, r21 C L0 hi add => carry
- addq r14, r20, r14 C U0 hi mul + carry
- ldq r6, 16(r16) C L1
-
- bis r31, r31, r31 C U1 mt
- subq r7, r15, r23 C L0 lo + acc
- addq r14, r21, r14 C U0 hi mul + carry
- ldq r0, 32(r17) C L1
-
- umulh r19, r1, r12 C U1
- cmpult r7, r15, r20 C L0 lo add => carry
- subq r23, r14, r28 C U0 hi add => answer
- ldq r7, 24(r16) C L1
-
- mulq r19, r2, r13 C U1
- cmpult r23, r14, r21 C L0 hi add => carry
- addq r8, r20, r8 C U0 hi mul + carry
- ldq r1, 40(r17) C L1
-
- umulh r19, r2, r14 C U1
- subq r4, r9, r9 C U0 lo + acc
- stq r22, -16(r16) C L0
- stq r28, -8(r16) C L1
-
- bis r31, r31, r31 C L0 st slosh
- mulq r19, r3, r15 C U1
- bis r31, r31, r31 C L1 st slosh
- addq r8, r21, r8 C L0 hi mul + carry
-
- cmpult r4, r9, r20 C L0 lo add => carry
- bis r31, r31, r31 C U1 mt
- lda r17, 64(r17) C L1 bookkeeping
- subq r9, r8, r22 C U0 hi add => answer
-
- bis r31, r31, r31 C U1 mt
- cmpult r9, r8, r21 C L0 hi add => carry
- addq r10, r20, r10 C U0 hi mul + carry
- ldq r4, 32(r16) C L1
-
- bis r31, r31, r31 C U1 mt
- subq r5, r11, r23 C L0 lo + acc
- addq r10, r21, r10 C L0 hi mul + carry
- ldq r2, -16(r17) C L1
-
- umulh r19, r3, r8 C U1
- cmpult r5, r11, r20 C L0 lo add => carry
- subq r23, r10, r28 C U0 hi add => answer
- ldq r5, 40(r16) C L1
-
- mulq r19, r0, r9 C U1
- cmpult r23, r10, r21 C L0 hi add => carry
- addq r12, r20, r12 C U0 hi mul + carry
- ldq r3, -8(r17) C L1
-
- umulh r19, r0, r10 C U1
- subq r6, r13, r13 C U0 lo + acc
- stq r22, 0(r16) C L0
- stq r28, 8(r16) C L1
-
- bis r31, r31, r31 C L0 st slosh
- mulq r19, r1, r11 C U1
- bis r31, r31, r31 C L1 st slosh
- addq r12, r21, r12 C U0 hi mul + carry
-
- cmpult r6, r13, r20 C L0 lo add => carry
- bis r31, r31, r31 C U1 mt
- lda r16, 64(r16) C L1 bookkeeping
- subq r13, r12, r22 C U0 hi add => answer
-
- bis r31, r31, r31 C U1 mt
- cmpult r13, r12, r21 C L0 hi add => carry
- addq r14, r20, r14 C U0 hi mul + carry
- ldq r6, -16(r16) C L1
-
- bis r31, r31, r31 C U1 mt
- subq r7, r15, r23 C L0 lo + acc
- addq r14, r21, r14 C U0 hi mul + carry
- ldq r0, 0(r17) C L1
-
- umulh r19, r1, r12 C U1
- cmpult r7, r15, r20 C L0 lo add => carry
- subq r23, r14, r28 C U0 hi add => answer
- ldq r7, -8(r16) C L1
-
- mulq r19, r2, r13 C U1
- cmpult r23, r14, r21 C L0 hi add => carry
- addq r8, r20, r8 C U0 hi mul + carry
- ldq r1, 8(r17) C L1
-
- umulh r19, r2, r14 C U1
- subq r4, r9, r9 C U0 lo + acc
- stq r22, -48(r16) C L0
- stq r28, -40(r16) C L1
-
- bis r31, r31, r31 C L0 st slosh
- mulq r19, r3, r15 C U1
- bis r31, r31, r31 C L1 st slosh
- addq r8, r21, r8 C U0 hi mul + carry
-
- cmpult r4, r9, r20 C L0 lo add => carry
- subq r9, r8, r22 C U0 hi add => answer
- bis r31, r31, r31 C L1 mt
- bgt r18, $Loop C U1 bookkeeping
-
-C ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
-$Lend:
- cmpult r9, r8, r21 C L0 hi add => carry
- addq r10, r20, r10 C U0 hi mul + carry
- ldq r4, 0(r16) C L1
- subq r5, r11, r23 C L0 lo + acc
- addq r10, r21, r10 C L0 hi mul + carry
- umulh r19, r3, r8 C U1
- cmpult r5, r11, r20 C L0 lo add => carry
- subq r23, r10, r28 C U0 hi add => answer
- ldq r5, 8(r16) C L1
- mulq r19, r0, r9 C U1
- cmpult r23, r10, r21 C L0 hi add => carry
- addq r12, r20, r12 C U0 hi mul + carry
- umulh r19, r0, r10 C U1
- subq r6, r13, r13 C L0 lo + acc
- stq r22, -32(r16) C L0
- stq r28, -24(r16) C L1
- mulq r19, r1, r11 C U1
- addq r12, r21, r12 C U0 hi mul + carry
- cmpult r6, r13, r20 C L0 lo add => carry
- subq r13, r12, r22 C U0 hi add => answer
- cmpult r13, r12, r21 C L0 hi add => carry
- addq r14, r20, r14 C U0 hi mul + carry
- subq r7, r15, r23 C L0 lo + acc
- addq r14, r21, r14 C U0 hi mul + carry
- umulh r19, r1, r12 C U1
- cmpult r7, r15, r20 C L0 lo add => carry
- subq r23, r14, r28 C U0 hi add => answer
- cmpult r23, r14, r21 C L0 hi add => carry
- addq r8, r20, r8 C U0 hi mul + carry
- subq r4, r9, r9 C U0 lo + acc
- stq r22, -16(r16) C L0
- stq r28, -8(r16) C L1
- addq r8, r21, r8 C L0 hi mul + carry
- cmpult r4, r9, r20 C L0 lo add => carry
- subq r9, r8, r22 C U0 hi add => answer
- cmpult r9, r8, r21 C L0 hi add => carry
- addq r10, r20, r10 C U0 hi mul + carry
- subq r5, r11, r23 C L0 lo + acc
- addq r10, r21, r10 C L0 hi mul + carry
- cmpult r5, r11, r20 C L0 lo add => carry
- subq r23, r10, r28 C U0 hi add => answer
- cmpult r23, r10, r21 C L0 hi add => carry
- addq r12, r20, r12 C U0 hi mul + carry
- stq r22, 0(r16) C L0
- stq r28, 8(r16) C L1
- addq r12, r21, r0 C U0 hi mul + carry
-
- ldq $9, 8($30)
- ldq $10, 16($30)
- ldq $11, 24($30)
- ldq $12, 32($30)
- ldq $13, 40($30)
- ldq $14, 48($30)
- ldq $15, 56($30)
- lda $30, 240($30)
- ret r31, (r26), 1
-EPILOGUE(mpn_submul_1)
-ASM_END()
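
For readers mapping the deleted scalar code back to C, one $Loop0/$Loop1 iteration of the old submul_1.asm corresponds to the step below, with the r-registers renamed after the roles its comments assign them. This is a sketch for orientation only; the helper and variable names are invented here, and mulhi stands in for umulh using GCC's unsigned __int128:

#include <stdint.h>

/* umulh stand-in; assumes GCC's unsigned __int128.  */
static inline uint64_t mulhi (uint64_t a, uint64_t b)
{
  return (uint64_t) (((unsigned __int128) a * b) >> 64);
}

/* One $Loop0/$Loop1 iteration: subtract s1_limb * s2_limb from *res_ptr,
   threading cy_limb (high product of the previous step) and cy (the two
   combined carry/borrow bits) through to the next step.  */
static void submul_step (uint64_t *res_ptr, uint64_t s1_limb, uint64_t s2_limb,
                         uint64_t *cy_limb, uint64_t *cy)
{
  uint64_t prod_low = s1_limb * s2_limb;     /* mulq   r2, r19, r3 */
  uint64_t acc      = *cy_limb + *cy;        /* addq   r4, r0, r0  */
  *cy_limb = mulhi (s1_limb, s2_limb);       /* umulh  r2, r19, r4 */
  prod_low += acc;                           /* addq   r3, r0, r3  */
  uint64_t c1  = prod_low < acc;             /* cmpult r3, r0, r0  */
  uint64_t res = *res_ptr - prod_low;        /* subq   r5, r3, r3  */
  uint64_t bw  = *res_ptr < res;             /* cmpult r5, r3, r5  */
  *res_ptr = res;                            /* stq    r3, 0(r16)  */
  *cy      = c1 + bw;                        /* addq   r5, r0, r0  combine carries */
}
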