summaryrefslogtreecommitdiff
path: root/gmp/mpn/ia64/submul_1.asm
diff options
context:
space:
mode:
Diffstat (limited to 'gmp/mpn/ia64/submul_1.asm')
-rw-r--r--gmp/mpn/ia64/submul_1.asm647
1 files changed, 647 insertions, 0 deletions
diff --git a/gmp/mpn/ia64/submul_1.asm b/gmp/mpn/ia64/submul_1.asm
new file mode 100644
index 0000000000..cb2a5525b5
--- /dev/null
+++ b/gmp/mpn/ia64/submul_1.asm
@@ -0,0 +1,647 @@
+dnl IA-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the
+dnl result from a second limb vector.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2000-2004 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: 4.0
+C Itanium 2: 2.25 (alignment dependent, sometimes it seems to need 3 c/l)
+
+C TODO
+C * Optimize feed-in and wind-down code, both for speed and code size.
+C * Handle low limb input and results specially, using a common stf8 in the
+C epilogue.
+C * Delay r8, r10 initialization, put cmp-p6 in 1st bundle and br .Ldone in
+C 2nd bundle. This will allow the bbb bundle to be one cycle earlier and
+C save a cycle.
+
+C INPUT PARAMETERS
+C rp  limb vector that is read, reduced by the product, and written back
+C up  limb vector that is multiplied by vl
+C n   number of limbs; limb 0 of both vectors is loaded unconditionally,
+C     so the code relies on n being at least 1
+C vl  the single multiplier limb; it is negated on entry
+define(`rp', `r32')
+define(`up', `r33')
+define(`n', `r34')
+define(`vl', `r35')
+
+ASM_START()
+C mpn_submul_1 -- rp[0..n-1] -= up[0..n-1] * vl, one 64-bit limb at a time.
+C
+C Method as implemented below: vl is negated, so for vl != 0 the multiplier
+C in f6 is 2^64 - vl.  For u = up[i] and r = rp[i] the xma.l and xma.hu pair
+C gives the low and high limbs lo, hi of u * (2^64 - vl) + r.  The limb
+C stored to rp[i] is lo - cy, and the next carry limb is u - hi, plus 1 when
+C lo < cy.  That identity does not hold for a zero multiplier, so vl == 0
+C leaves through .Ldone before anything is stored.
+C
+C Register roles:
+C   r8        running carry limb cy; it is what remains in r8 at the return
+C   r9        integer pointer into up, used by ld8 to re-read u
+C   r10       integer pointer into rp, used by st8 for the results
+C   rp, up    FP load pointers, post-incremented by 8 by every ldf8
+C   r2        saved ar.lc, restored before returning
+C   r19       (n-1) >> 2, copied into ar.lc to drive the br.cloop branches
+C   r14       n mod 4 during dispatch, later the scratch result limb
+C   f6        negated vl;  f7, f8 hold up[0], rp[0]
+C
+C The trailing M, I, I0, M2 M3 and B remarks presumably name the execution
+C units each instruction can issue on -- confirm against the Itanium manuals.
+PROLOGUE(mpn_submul_1)
+ .prologue
+ .save ar.lc, r2
+ .body
+
+C 32-bit ABI only: widen the two 32-bit pointers and zero-extend the count.
+ifdef(`HAVE_ABI_32',
` addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ zxt4 n = n C I
+ ;;
+')
+C Copy the pointers for the integer side and negate the multiplier.
+{.mmi
+ mov r10 = rp C M I
+ mov r9 = up C M I
+ sub vl = r0, vl C M I negate vl
+}
+C Load limb 0 of both vectors and start computing the branch count.
+{.mmi
+ ldf8 f8 = [rp], 8 C M
+ ldf8 f7 = [up], 8 C M
+ add r19 = -1, n C M I n - 1
+ ;;
+}
+C p6 = multiplier is zero; clear the carry limb; save the caller ar.lc.
+{.mmi
+ cmp.eq p6, p0 = 0, vl C M I
+ mov r8 = 0 C M I zero cylimb
+ mov r2 = ar.lc C I0
+}
+C Move the negated multiplier to the FP side; r14 = n mod 4, r19 = (n-1) >> 2.
+{.mmi
+ setf.sig f6 = vl C M2 M3
+ and r14 = 3, n C M I
+ shr.u r19 = r19, 2 C I0
+ ;;
+}
+C Early exit for vl == 0; p10 marks n mod 4 == 0.
+{.mmb
+ nop 0
+ cmp.eq p10, p0 = 0, r14 C M I
+ (p6) br.spnt .Ldone C B vl == 0
+}
+C p11 and p12 mark n mod 4 == 2 and 3; install the branch count in ar.lc.
+{.mmi
+ cmp.eq p11, p0 = 2, r14 C M I
+ cmp.eq p12, p0 = 3, r14 C M I
+ mov ar.lc = r19 C I0
+}
+C Dispatch on n mod 4.  None taken means n mod 4 == 1, which falls into .Lb01.
+{.bbb
+ (p10) br.dptk .Lb00 C B
+ (p11) br.dptk .Lb10 C B
+ (p12) br.dptk .Lb11 C B
+ ;;
+}
+
+C Feed-in for n mod 4 == 1.  f7 and f8 already hold up[0] and rp[0].
+C ar.lc was set to (n-1) >> 2.  Each br.cloop is taken while ar.lc is nonzero
+C and decrements it, so .grt1 runs for n >= 5 and .grt5 for n >= 9.
+.Lb01: br.cloop.dptk .grt1
+
+C n == 1: one product only, then the last wind-down stage.
+ xma.l f39 = f7, f6, f8
+ xma.hu f43 = f7, f6, f8
+ ;;
+ getf.sig r27 = f39 C lo
+ getf.sig r31 = f43 C hi
+ ld8 r20 = [r9], 8
+ br .Lcj1
+
+C n >= 5: load limbs 1-4 into f44-f47 (from rp) and f32-f35 (from up) and
+C start the products for limbs 0 and 1.
+.grt1: ldf8 f44 = [rp], 8
+ ldf8 f32 = [up], 8
+ ;;
+ ldf8 f45 = [rp], 8
+ ldf8 f33 = [up], 8
+ ;;
+ ldf8 f46 = [rp], 8
+ xma.l f39 = f7, f6, f8
+ ldf8 f34 = [up], 8
+ xma.hu f43 = f7, f6, f8
+ ;;
+ ldf8 f47 = [rp], 8
+ xma.l f36 = f32, f6, f44
+ ldf8 f35 = [up], 8
+ xma.hu f40 = f32, f6, f44
+ br.cloop.dptk .grt5
+ ;;
+
+C n == 5: no further loads from the FP pointers.  Start the remaining three
+C products, move results to the integer registers, and join at .Lcj5.
+ getf.sig r27 = f39 C lo
+ xma.l f37 = f33, f6, f45
+ ld8 r20 = [r9], 8
+ xma.hu f41 = f33, f6, f45
+ ;;
+ getf.sig r31 = f43 C hi
+ getf.sig r24 = f36 C lo
+ xma.l f38 = f34, f6, f46
+ ld8 r21 = [r9], 8
+ xma.hu f42 = f34, f6, f46
+ ;;
+ getf.sig r28 = f40 C hi
+ getf.sig r25 = f37 C lo
+ xma.l f39 = f35, f6, f47
+ ld8 r22 = [r9], 8
+ xma.hu f43 = f35, f6, f47
+ ;;
+ getf.sig r29 = f41 C hi
+ getf.sig r26 = f38 C lo
+ ld8 r23 = [r9], 8
+ br .Lcj5
+
+C n >= 9: load four more limbs while the earlier products are moved to the
+C integer registers.  The last br.cloop enters .Loop for n >= 13; otherwise
+C n == 9 and the code goes to .Lend.
+.grt5: ldf8 f44 = [rp], 8
+ ldf8 f32 = [up], 8
+ ;;
+ getf.sig r27 = f39 C lo
+ xma.l f37 = f33, f6, f45
+ ld8 r20 = [r9], 8
+ xma.hu f41 = f33, f6, f45
+ ;;
+ ldf8 f45 = [rp], 8
+ getf.sig r31 = f43 C hi
+ ldf8 f33 = [up], 8
+ ;;
+ getf.sig r24 = f36 C lo
+ xma.l f38 = f34, f6, f46
+ ld8 r21 = [r9], 8
+ xma.hu f42 = f34, f6, f46
+ ;;
+ ldf8 f46 = [rp], 8
+ getf.sig r28 = f40 C hi
+ ldf8 f34 = [up], 8
+ ;;
+ getf.sig r25 = f37 C lo
+ xma.l f39 = f35, f6, f47
+ ld8 r22 = [r9], 8
+ xma.hu f43 = f35, f6, f47
+ ;;
+ ldf8 f47 = [rp], 8
+ getf.sig r29 = f41 C hi
+ ldf8 f35 = [up], 8
+ ;;
+ getf.sig r26 = f38 C lo
+ xma.l f36 = f32, f6, f44
+ ld8 r23 = [r9], 8
+ xma.hu f40 = f32, f6, f44
+ br.cloop.dptk .Loop
+ br .Lend
+
+
+C Feed-in for n mod 4 == 2.  Limb 1 is loaded into f47 and f35; limb 0 is
+C already in f8 and f7.  The br.cloop is taken while ar.lc is nonzero, so
+C .grt2 runs for n >= 6.
+.Lb10: ldf8 f47 = [rp], 8
+ ldf8 f35 = [up], 8
+ br.cloop.dptk .grt2
+
+C n == 2: two products, then the last two wind-down stages.
+ xma.l f38 = f7, f6, f8
+ xma.hu f42 = f7, f6, f8
+ ;;
+ xma.l f39 = f35, f6, f47
+ xma.hu f43 = f35, f6, f47
+ ;;
+ getf.sig r26 = f38 C lo
+ getf.sig r30 = f42 C hi
+ ld8 r23 = [r9], 8
+ ;;
+ getf.sig r27 = f39 C lo
+ getf.sig r31 = f43 C hi
+ ld8 r20 = [r9], 8
+ br .Lcj2
+
+C n >= 6: load limbs 2-5 and start the products for limbs 0, 1 and 2.
+.grt2: ldf8 f44 = [rp], 8
+ ldf8 f32 = [up], 8
+ ;;
+ ldf8 f45 = [rp], 8
+ ldf8 f33 = [up], 8
+ xma.l f38 = f7, f6, f8
+ xma.hu f42 = f7, f6, f8
+ ;;
+ ldf8 f46 = [rp], 8
+ ldf8 f34 = [up], 8
+ xma.l f39 = f35, f6, f47
+ xma.hu f43 = f35, f6, f47
+ ;;
+ ldf8 f47 = [rp], 8
+ ldf8 f35 = [up], 8
+ ;;
+ getf.sig r26 = f38 C lo
+ xma.l f36 = f32, f6, f44
+ ld8 r23 = [r9], 8
+ xma.hu f40 = f32, f6, f44
+ br.cloop.dptk .grt6
+
+C n == 6: start the remaining three products and join the wind-down at .Lcj6.
+ getf.sig r30 = f42 C hi
+ ;;
+ getf.sig r27 = f39 C lo
+ xma.l f37 = f33, f6, f45
+ ld8 r20 = [r9], 8
+ xma.hu f41 = f33, f6, f45
+ ;;
+ getf.sig r31 = f43 C hi
+ getf.sig r24 = f36 C lo
+ xma.l f38 = f34, f6, f46
+ ld8 r21 = [r9], 8
+ xma.hu f42 = f34, f6, f46
+ ;;
+ getf.sig r28 = f40 C hi
+ getf.sig r25 = f37 C lo
+ xma.l f39 = f35, f6, f47
+ ld8 r22 = [r9], 8
+ xma.hu f43 = f35, f6, f47
+ br .Lcj6
+
+C n >= 10: keep loading and multiplying, then enter the main loop at its
+C .LL10 entry point.
+.grt6: ldf8 f44 = [rp], 8
+ getf.sig r30 = f42 C hi
+ ldf8 f32 = [up], 8
+ ;;
+ getf.sig r27 = f39 C lo
+ xma.l f37 = f33, f6, f45
+ ld8 r20 = [r9], 8
+ xma.hu f41 = f33, f6, f45
+ ;;
+ ldf8 f45 = [rp], 8
+ getf.sig r31 = f43 C hi
+ ldf8 f33 = [up], 8
+ ;;
+ getf.sig r24 = f36 C lo
+ xma.l f38 = f34, f6, f46
+ ld8 r21 = [r9], 8
+ xma.hu f42 = f34, f6, f46
+ ;;
+ ldf8 f46 = [rp], 8
+ getf.sig r28 = f40 C hi
+ ldf8 f34 = [up], 8
+ ;;
+ getf.sig r25 = f37 C lo
+ xma.l f39 = f35, f6, f47
+ ld8 r22 = [r9], 8
+ xma.hu f43 = f35, f6, f47
+ br .LL10
+
+
+C Feed-in for n mod 4 == 3.  Limbs 1 and 2 are loaded into f46, f34 and
+C f47, f35; limb 0 is already in f8 and f7.  The br.cloop is taken while
+C ar.lc is nonzero, so .grt3 runs for n >= 7.
+.Lb11: ldf8 f46 = [rp], 8
+ ldf8 f34 = [up], 8
+ ;;
+ ldf8 f47 = [rp], 8
+ ldf8 f35 = [up], 8
+ br.cloop.dptk .grt3
+
+C n == 3: three products, then the last three wind-down stages.
+ xma.l f37 = f7, f6, f8
+ xma.hu f41 = f7, f6, f8
+ ;;
+ xma.l f38 = f34, f6, f46
+ xma.hu f42 = f34, f6, f46
+ ;;
+ getf.sig r25 = f37 C lo
+ xma.l f39 = f35, f6, f47
+ xma.hu f43 = f35, f6, f47
+ ;;
+ getf.sig r29 = f41 C hi
+ ld8 r22 = [r9], 8
+ ;;
+ getf.sig r26 = f38 C lo
+ getf.sig r30 = f42 C hi
+ ld8 r23 = [r9], 8
+ ;;
+ getf.sig r27 = f39 C lo
+ getf.sig r31 = f43 C hi
+ ld8 r20 = [r9], 8
+ br .Lcj3
+
+C n >= 7: load limbs 3-6 and start the products for limbs 0 through 3.
+.grt3: ldf8 f44 = [rp], 8
+ xma.l f37 = f7, f6, f8
+ ldf8 f32 = [up], 8
+ xma.hu f41 = f7, f6, f8
+ ;;
+ ldf8 f45 = [rp], 8
+ xma.l f38 = f34, f6, f46
+ ldf8 f33 = [up], 8
+ xma.hu f42 = f34, f6, f46
+ ;;
+ ldf8 f46 = [rp], 8
+ ldf8 f34 = [up], 8
+ ;;
+ getf.sig r25 = f37 C lo
+ xma.l f39 = f35, f6, f47
+ ld8 r22 = [r9], 8
+ xma.hu f43 = f35, f6, f47
+ ;;
+ ldf8 f47 = [rp], 8
+ getf.sig r29 = f41 C hi
+ ldf8 f35 = [up], 8
+ ;;
+ getf.sig r26 = f38 C lo
+ xma.l f36 = f32, f6, f44
+ ld8 r23 = [r9], 8
+ xma.hu f40 = f32, f6, f44
+ br.cloop.dptk .grt7
+ ;;
+
+C n == 7: start two more products and join the wind-down at .Lcj7.
+ getf.sig r30 = f42 C hi
+ getf.sig r27 = f39 C lo
+ xma.l f37 = f33, f6, f45
+ ld8 r20 = [r9], 8
+ xma.hu f41 = f33, f6, f45
+ ;;
+ getf.sig r31 = f43 C hi
+ getf.sig r24 = f36 C lo
+ xma.l f38 = f34, f6, f46
+ ld8 r21 = [r9], 8
+ xma.hu f42 = f34, f6, f46
+ br .Lcj7
+
+C n >= 11: keep loading and multiplying, then enter the main loop at its
+C .LL11 entry point.
+.grt7: ldf8 f44 = [rp], 8
+ getf.sig r30 = f42 C hi
+ ldf8 f32 = [up], 8
+ ;;
+ getf.sig r27 = f39 C lo
+ xma.l f37 = f33, f6, f45
+ ld8 r20 = [r9], 8
+ xma.hu f41 = f33, f6, f45
+ ;;
+ ldf8 f45 = [rp], 8
+ getf.sig r31 = f43 C hi
+ ldf8 f33 = [up], 8
+ ;;
+ getf.sig r24 = f36 C lo
+ xma.l f38 = f34, f6, f46
+ ld8 r21 = [r9], 8
+ xma.hu f42 = f34, f6, f46
+ br .LL11
+
+
+C Feed-in for n mod 4 == 0.  Limbs 1-3 are loaded into f45-f47 (from rp) and
+C f33-f35 (from up); limb 0 is already in f8 and f7.  Here ar.lc starts at
+C (n-1) >> 2 = n/4 - 1, so the br.cloop to .grt4 is taken for n >= 8.
+.Lb00: ldf8 f45 = [rp], 8
+ ldf8 f33 = [up], 8
+ ;;
+ ldf8 f46 = [rp], 8
+ ldf8 f34 = [up], 8
+ ;;
+ ldf8 f47 = [rp], 8
+ xma.l f36 = f7, f6, f8
+ ldf8 f35 = [up], 8
+ xma.hu f40 = f7, f6, f8
+ br.cloop.dptk .grt4
+
+C n == 4: start the other three products, move all results to the integer
+C registers, and join the wind-down at .Lcj4.
+ xma.l f37 = f33, f6, f45
+ xma.hu f41 = f33, f6, f45
+ ;;
+ getf.sig r24 = f36 C lo
+ xma.l f38 = f34, f6, f46
+ ld8 r21 = [r9], 8
+ xma.hu f42 = f34, f6, f46
+ ;;
+ getf.sig r28 = f40 C hi
+ xma.l f39 = f35, f6, f47
+ getf.sig r25 = f37 C lo
+ ld8 r22 = [r9], 8
+ xma.hu f43 = f35, f6, f47
+ ;;
+ getf.sig r29 = f41 C hi
+ getf.sig r26 = f38 C lo
+ ld8 r23 = [r9], 8
+ ;;
+ getf.sig r30 = f42 C hi
+ getf.sig r27 = f39 C lo
+ ld8 r20 = [r9], 8
+ br .Lcj4
+
+C n >= 8: load limbs 4-7 while the products for limbs 1 through 4 are started.
+.grt4: ldf8 f44 = [rp], 8
+ xma.l f37 = f33, f6, f45
+ ldf8 f32 = [up], 8
+ xma.hu f41 = f33, f6, f45
+ ;;
+ ldf8 f45 = [rp], 8
+ ldf8 f33 = [up], 8
+ xma.l f38 = f34, f6, f46
+ getf.sig r24 = f36 C lo
+ ld8 r21 = [r9], 8
+ xma.hu f42 = f34, f6, f46
+ ;;
+ ldf8 f46 = [rp], 8
+ getf.sig r28 = f40 C hi
+ ldf8 f34 = [up], 8
+ xma.l f39 = f35, f6, f47
+ getf.sig r25 = f37 C lo
+ ld8 r22 = [r9], 8
+ xma.hu f43 = f35, f6, f47
+ ;;
+ ldf8 f47 = [rp], 8
+ getf.sig r29 = f41 C hi
+ ldf8 f35 = [up], 8
+ ;;
+ getf.sig r26 = f38 C lo
+ xma.l f36 = f32, f6, f44
+ ld8 r23 = [r9], 8
+ xma.hu f40 = f32, f6, f44
+ br.cloop.dptk .grt8
+ ;;
+
+C n == 8: start one more product and join the wind-down at .Lcj8.
+ getf.sig r30 = f42 C hi
+ getf.sig r27 = f39 C lo
+ xma.l f37 = f33, f6, f45
+ ld8 r20 = [r9], 8
+ xma.hu f41 = f33, f6, f45
+ br .Lcj8
+
+C n >= 12: one more load and product, then enter the main loop at its
+C .LL00 entry point.
+.grt8: ldf8 f44 = [rp], 8
+ getf.sig r30 = f42 C hi
+ ldf8 f32 = [up], 8
+ ;;
+ getf.sig r27 = f39 C lo
+ xma.l f37 = f33, f6, f45
+ ld8 r20 = [r9], 8
+ xma.hu f41 = f33, f6, f45
+ br .LL00
+
+C Main loop: four limbs per iteration, repeated while ar.lc is nonzero.
+C
+C Each quarter of the body does the same work on a rotating register set:
+C   - subtract the carry r8 from a low product limb (r24-r27), store the
+C     difference through r10, and remember in p6 whether it wrapped
+C   - form the next carry as u - hi, with u in r20-r23 and hi in r28-r31,
+C     then add 1 under p6
+C   - move finished products from f36-f43 to the integer registers
+C   - reload the FP operands f32-f35 and f44-f47 and the integer copy of u
+C   - issue the xma.l and xma.hu pair for a later limb
+C
+C Entry points: .Loop for n mod 4 == 1, .LL00 for n mod 4 == 0, .LL11 for
+C n mod 4 == 3 and .LL10 for n mod 4 == 2, each reached from the matching
+C feed-in block.  The C 01 ... C 07 remarks look like cycle numbers within
+C one iteration -- confirm before relying on them.
+ ALIGN(32)
+.Loop:
+{.mmi
+ ldf8 f44 = [rp], 8
+ cmp.ltu p6, p0 = r27, r8 C lo cmp
+ sub r14 = r27, r8 C lo sub
+}
+{.mmi
+ getf.sig r30 = f42 C hi
+ ldf8 f32 = [up], 8
+ sub r8 = r20, r31 C hi sub
+ ;; C 01
+}
+{.mmf
+ getf.sig r27 = f39 C lo
+ st8 [r10] = r14, 8
+ xma.l f37 = f33, f6, f45
+}
+{.mfi
+ ld8 r20 = [r9], 8
+ xma.hu f41 = f33, f6, f45
+ (p6) add r8 = 1, r8
+ ;; C 02
+}
+C Second quarter: lo in r24, hi in r28, u in r21.
+{.mmi
+.LL00: ldf8 f45 = [rp], 8
+ cmp.ltu p6, p0 = r24, r8
+ sub r14 = r24, r8
+}
+{.mmi
+ getf.sig r31 = f43 C hi
+ ldf8 f33 = [up], 8
+ sub r8 = r21, r28
+ ;; C 03
+}
+{.mmf
+ getf.sig r24 = f36 C lo
+ st8 [r10] = r14, 8
+ xma.l f38 = f34, f6, f46
+}
+{.mfi
+ ld8 r21 = [r9], 8
+ xma.hu f42 = f34, f6, f46
+ (p6) add r8 = 1, r8
+ ;; C 04
+}
+C Third quarter: lo in r25, hi in r29, u in r22.
+{.mmi
+.LL11: ldf8 f46 = [rp], 8
+ cmp.ltu p6, p0 = r25, r8
+ sub r14 = r25, r8
+}
+{.mmi
+ getf.sig r28 = f40 C hi
+ ldf8 f34 = [up], 8
+ sub r8 = r22, r29
+ ;; C 05
+}
+{.mmf
+ getf.sig r25 = f37 C lo
+ st8 [r10] = r14, 8
+ xma.l f39 = f35, f6, f47
+}
+{.mfi
+ ld8 r22 = [r9], 8
+ xma.hu f43 = f35, f6, f47
+ (p6) add r8 = 1, r8
+ ;; C 06
+}
+C Fourth quarter: lo in r26, hi in r30, u in r23.
+{.mmi
+.LL10: ldf8 f47 = [rp], 8
+ cmp.ltu p6, p0 = r26, r8
+ sub r14 = r26, r8
+}
+{.mmi
+ getf.sig r29 = f41 C hi
+ ldf8 f35 = [up], 8
+ sub r8 = r23, r30
+ ;; C 07
+}
+{.mmf
+ getf.sig r26 = f38 C lo
+ st8 [r10] = r14, 8
+ xma.l f36 = f32, f6, f44
+}
+{.mfi
+ ld8 r23 = [r9], 8
+ xma.hu f40 = f32, f6, f44
+ (p6) add r8 = 1, r8
+}
+C Repeat while ar.lc is nonzero; otherwise fall into the wind-down at .Lend.
+ br.cloop.dptk .Loop
+ ;;
+
+C Wind-down: drain the pipeline without loading any more FP operands.
+C .Lend is reached by falling out of the main loop or from the .grt5 feed-in
+C and has nine limbs left to store.  Each .LcjK label is entered with K limbs
+C left to store.  Every stage subtracts the carry r8 from a low product limb,
+C stores the result through r10, and forms the next carry as u - hi plus 1
+C if the subtraction wrapped.  The last multiplies are issued in .Lcj7, the
+C last ld8 in .Lcj5, and the last getf.sig in .Lcj4.
+.Lend:
+ cmp.ltu p6, p0 = r27, r8
+ sub r14 = r27, r8
+ getf.sig r30 = f42
+ sub r8 = r20, r31
+ ;;
+ getf.sig r27 = f39
+ st8 [r10] = r14, 8
+ xma.l f37 = f33, f6, f45
+ ld8 r20 = [r9], 8
+ xma.hu f41 = f33, f6, f45
+ (p6) add r8 = 1, r8
+ ;;
+.Lcj8:
+ cmp.ltu p6, p0 = r24, r8
+ sub r14 = r24, r8
+ getf.sig r31 = f43
+ sub r8 = r21, r28
+ ;;
+ getf.sig r24 = f36
+ st8 [r10] = r14, 8
+ xma.l f38 = f34, f6, f46
+ ld8 r21 = [r9], 8
+ xma.hu f42 = f34, f6, f46
+ (p6) add r8 = 1, r8
+ ;;
+.Lcj7:
+ cmp.ltu p6, p0 = r25, r8
+ sub r14 = r25, r8
+ getf.sig r28 = f40
+ sub r8 = r22, r29
+ ;;
+ getf.sig r25 = f37
+ st8 [r10] = r14, 8
+ xma.l f39 = f35, f6, f47
+ ld8 r22 = [r9], 8
+ xma.hu f43 = f35, f6, f47
+ (p6) add r8 = 1, r8
+ ;;
+C From here on all products have been issued; only results are collected.
+.Lcj6:
+ cmp.ltu p6, p0 = r26, r8
+ sub r14 = r26, r8
+ getf.sig r29 = f41
+ sub r8 = r23, r30
+ ;;
+ getf.sig r26 = f38
+ st8 [r10] = r14, 8
+ ld8 r23 = [r9], 8
+ (p6) add r8 = 1, r8
+ ;;
+.Lcj5:
+ cmp.ltu p6, p0 = r27, r8
+ sub r14 = r27, r8
+ getf.sig r30 = f42
+ sub r8 = r20, r31
+ ;;
+ getf.sig r27 = f39
+ st8 [r10] = r14, 8
+ ld8 r20 = [r9], 8
+ (p6) add r8 = 1, r8
+ ;;
+C From here on every remaining u limb is already in an integer register.
+.Lcj4:
+ cmp.ltu p6, p0 = r24, r8
+ sub r14 = r24, r8
+ getf.sig r31 = f43
+ sub r8 = r21, r28
+ ;;
+ st8 [r10] = r14, 8
+ (p6) add r8 = 1, r8
+ ;;
+.Lcj3:
+ cmp.ltu p6, p0 = r25, r8
+ sub r14 = r25, r8
+ sub r8 = r22, r29
+ ;;
+ st8 [r10] = r14, 8
+ (p6) add r8 = 1, r8
+ ;;
+.Lcj2:
+ cmp.ltu p6, p0 = r26, r8
+ sub r14 = r26, r8
+ sub r8 = r23, r30
+ ;;
+ st8 [r10] = r14, 8
+ (p6) add r8 = 1, r8
+ ;;
+C Last limb: store it, restore the caller ar.lc, and return with the final
+C carry limb in r8.
+.Lcj1:
+ cmp.ltu p6, p0 = r27, r8
+ sub r14 = r27, r8
+ sub r8 = r20, r31
+ ;;
+ st8 [r10] = r14, 8
+ mov ar.lc = r2
+ (p6) add r8 = 1, r8
+ br.ret.sptk.many b0
+C vl == 0 exit: nothing has been stored and r8 is still the 0 set on entry.
+.Ldone: mov ar.lc = r2
+ br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()