diff options
Diffstat (limited to 'gmp/mpn/ia64/submul_1.asm')
-rw-r--r-- | gmp/mpn/ia64/submul_1.asm | 647 |
1 files changed, 647 insertions, 0 deletions
diff --git a/gmp/mpn/ia64/submul_1.asm b/gmp/mpn/ia64/submul_1.asm new file mode 100644 index 0000000000..cb2a5525b5 --- /dev/null +++ b/gmp/mpn/ia64/submul_1.asm @@ -0,0 +1,647 @@ +dnl IA-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the +dnl result from a second limb vector. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2000-2004 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 4.0 +C Itanium 2: 2.25 (alignment dependent, sometimes it seems to need 3 c/l) + +C TODO +C * Optimize feed-in and wind-down code, both for speed and code size. +C * Handle low limb input and results specially, using a common stf8 in the +C epilogue. +C * Delay r8, r10 initialization, put cmp-p6 in 1st bundle and br .Ldone in +C 2nd bundle. This will allow the bbb bundle to be one cycle earlier and +C save a cycle. 
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`n', `r34')
+define(`vl', `r35')
+
+C mpn_submul_1
+C
+C Per the file header this routine multiplies a limb vector by a single limb
+C and subtracts the result from a second limb vector.  As coded below, the
+C vector read through up is multiplied by vl, and the vector at rp is both
+C read and overwritten with the result limbs.
+C
+C Register usage in this routine:
+C   r32 rp   pointer for the ldf8 loads of the vector that is updated
+C   r33 up   pointer for the ldf8 loads of the multiplicand vector
+C   r34 n    limb count
+C   r35 vl   multiplier limb, negated on entry and then copied to f6
+C   r10      copy of the incoming rp, used for the st8 result stores
+C   r9       copy of the incoming up, used for the integer ld8 loads
+C   r8       running carry limb, zeroed at entry and left in r8 at return
+C            (presumably the return value, confirm against the ABI)
+C   r2       saved ar.lc, restored before both returns
+C   r19      n-1 shifted right by 2, loaded into ar.lc as the loop count
+C   r14      n AND 3 during dispatch, later the limb about to be stored
+C   r20-r23  limbs loaded through r9
+C   r24-r27  low product words, r28-r31 high product words
+C   f7, f8   first limb from up and first limb from rp
+C   f32-f35  limbs from up, f44-f47 limbs from rp
+C   f36-f39  xma.l results, f40-f43 xma.hu results
+C
+C Every limb is retired by the same four steps, in the loop and in the
+C wind-down code alike:
+C   cmp.ltu p6 = lo, r8     p6 is set when lo is below the incoming carry limb
+C   sub     r14 = lo, r8    result limb, stored through r10
+C   sub     r8 = u, hi      next carry limb from the up limb and the high word
+C   (p6) add r8 = 1, r8     carry limb is bumped when the low subtract wrapped
+C NOTE(review): with f6 holding the negated vl, u*f6 differs from -u*vl by
+C u*2^64, which looks like the reason the up limb takes part in the carry
+C computation.  Confirm against the GMP documentation.
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	.prologue
+	.save	ar.lc, r2
+	.body
+
+ifdef(`HAVE_ABI_32',
+`	addp4	rp = 0, rp		C M I
+	addp4	up = 0, up		C M I
+	zxt4	n = n			C I
+	;;
+')
+{.mmi
+	mov	r10 = rp		C M I
+	mov	r9 = up			C M I
+	sub	vl = r0, vl		C M I	negate vl
+}
+{.mmi
+	ldf8	f8 = [rp], 8		C M
+	ldf8	f7 = [up], 8		C M
+	add	r19 = -1, n		C M I	n - 1
+	;;
+}
+{.mmi
+	cmp.eq	p6, p0 = 0, vl		C M I
+	mov	r8 = 0			C M I	zero cylimb
+	mov	r2 = ar.lc		C I0
+}
+{.mmi
+	setf.sig	f6 = vl		C M2 M3
+	and	r14 = 3, n		C M I
+	shr.u	r19 = r19, 2		C I0
+	;;
+}
+{.mmb
+	nop	0
+	cmp.eq	p10, p0 = 0, r14	C M I
+   (p6)	br.spnt	.Ldone			C B	vl == 0
+}
+{.mmi
+	cmp.eq	p11, p0 = 2, r14	C M I
+	cmp.eq	p12, p0 = 3, r14	C M I
+	mov	ar.lc = r19		C I0
+}
+C Dispatch on n AND 3: 0 goes to .Lb00, 2 to .Lb10, 3 to .Lb11, and the
+C remaining value 1 falls through to .Lb01.
+{.bbb
+  (p10)	br.dptk	.Lb00			C B
+  (p11)	br.dptk	.Lb10			C B
+  (p12)	br.dptk	.Lb11			C B
+	;;
+}
+
+C n AND 3 = 1.  br.cloop is taken while ar.lc is nonzero, so the code that
+C follows it handles n = 1 and finishes at .Lcj1.
+.Lb01:	br.cloop.dptk	.grt1
+
+	xma.l	f39 = f7, f6, f8
+	xma.hu	f43 = f7, f6, f8
+	;;
+	getf.sig	r27 = f39	C lo
+	getf.sig	r31 = f43	C hi
+	ld8	r20 = [r9], 8
+	br	.Lcj1
+
+C Feed-in for n AND 3 = 1 with n above 1.  If the count is used up after the
+C next br.cloop, five limbs are finished through .Lcj5, otherwise continue
+C at .grt5.
+.grt1:	ldf8	f44 = [rp], 8
+	ldf8	f32 = [up], 8
+	;;
+	ldf8	f45 = [rp], 8
+	ldf8	f33 = [up], 8
+	;;
+	ldf8	f46 = [rp], 8
+	xma.l	f39 = f7, f6, f8
+	ldf8	f34 = [up], 8
+	xma.hu	f43 = f7, f6, f8
+	;;
+	ldf8	f47 = [rp], 8
+	xma.l	f36 = f32, f6, f44
+	ldf8	f35 = [up], 8
+	xma.hu	f40 = f32, f6, f44
+	br.cloop.dptk	.grt5
+	;;
+
+	getf.sig	r27 = f39	C lo
+	xma.l	f37 = f33, f6, f45
+	ld8	r20 = [r9], 8
+	xma.hu	f41 = f33, f6, f45
+	;;
+	getf.sig	r31 = f43	C hi
+	getf.sig	r24 = f36	C lo
+	xma.l	f38 = f34, f6, f46
+	ld8	r21 = [r9], 8
+	xma.hu	f42 = f34, f6, f46
+	;;
+	getf.sig	r28 = f40	C hi
+	getf.sig	r25 = f37	C lo
+	xma.l	f39 = f35, f6, f47
+	ld8	r22 = [r9], 8
+	xma.hu	f43 = f35, f6, f47
+	;;
+	getf.sig	r29 = f41	C hi
+	getf.sig	r26 = f38	C lo
+	ld8	r23 = [r9], 8
+	br	.Lcj5
+
+C Fills the pipeline for the top of the main loop.  Enters .Loop if the
+C count is still nonzero, otherwise goes straight to the wind-down at .Lend.
+.grt5:	ldf8	f44 = [rp], 8
+	ldf8	f32 = [up], 8
+	;;
+	getf.sig	r27 = f39	C lo
+	xma.l	f37 = f33, f6, f45
+	ld8	r20 = [r9], 8
+	xma.hu	f41 = f33, f6, f45
+	;;
+	ldf8	f45 = [rp], 8
+	getf.sig	r31 = f43	C hi
+	ldf8	f33 = [up], 8
+	;;
+	getf.sig	r24 = f36	C lo
+	xma.l	f38 = f34, f6, f46
+	ld8	r21 = [r9], 8
+	xma.hu	f42 = f34, f6, f46
+	;;
+	ldf8	f46 = [rp], 8
+	getf.sig	r28 = f40	C hi
+	ldf8	f34 = [up], 8
+	;;
+	getf.sig	r25 = f37	C lo
+	xma.l	f39 = f35, f6, f47
+	ld8	r22 = [r9], 8
+	xma.hu	f43 = f35, f6, f47
+	;;
+	ldf8	f47 = [rp], 8
+	getf.sig	r29 = f41	C hi
+	ldf8	f35 = [up], 8
+	;;
+	getf.sig	r26 = f38	C lo
+	xma.l	f36 = f32, f6, f44
+	ld8	r23 = [r9], 8
+	xma.hu	f40 = f32, f6, f44
+	br.cloop.dptk	.Loop
+	br	.Lend
+
+
+C n AND 3 = 2.  The code after the br.cloop handles n = 2 and finishes at
+C .Lcj2.
+.Lb10:	ldf8	f47 = [rp], 8
+	ldf8	f35 = [up], 8
+	br.cloop.dptk	.grt2
+
+	xma.l	f38 = f7, f6, f8
+	xma.hu	f42 = f7, f6, f8
+	;;
+	xma.l	f39 = f35, f6, f47
+	xma.hu	f43 = f35, f6, f47
+	;;
+	getf.sig	r26 = f38	C lo
+	getf.sig	r30 = f42	C hi
+	ld8	r23 = [r9], 8
+	;;
+	getf.sig	r27 = f39	C lo
+	getf.sig	r31 = f43	C hi
+	ld8	r20 = [r9], 8
+	br	.Lcj2
+
+C Feed-in for n AND 3 = 2 with n above 2.  Six limbs are finished through
+C .Lcj6 when the count is used up, otherwise continue at .grt6.
+.grt2:	ldf8	f44 = [rp], 8
+	ldf8	f32 = [up], 8
+	;;
+	ldf8	f45 = [rp], 8
+	ldf8	f33 = [up], 8
+	xma.l	f38 = f7, f6, f8
+	xma.hu	f42 = f7, f6, f8
+	;;
+	ldf8	f46 = [rp], 8
+	ldf8	f34 = [up], 8
+	xma.l	f39 = f35, f6, f47
+	xma.hu	f43 = f35, f6, f47
+	;;
+	ldf8	f47 = [rp], 8
+	ldf8	f35 = [up], 8
+	;;
+	getf.sig	r26 = f38	C lo
+	xma.l	f36 = f32, f6, f44
+	ld8	r23 = [r9], 8
+	xma.hu	f40 = f32, f6, f44
+	br.cloop.dptk	.grt6
+
+	getf.sig	r30 = f42	C hi
+	;;
+	getf.sig	r27 = f39	C lo
+	xma.l	f37 = f33, f6, f45
+	ld8	r20 = [r9], 8
+	xma.hu	f41 = f33, f6, f45
+	;;
+	getf.sig	r31 = f43	C hi
+	getf.sig	r24 = f36	C lo
+	xma.l	f38 = f34, f6, f46
+	ld8	r21 = [r9], 8
+	xma.hu	f42 = f34, f6, f46
+	;;
+	getf.sig	r28 = f40	C hi
+	getf.sig	r25 = f37	C lo
+	xma.l	f39 = f35, f6, f47
+	ld8	r22 = [r9], 8
+	xma.hu	f43 = f35, f6, f47
+	br	.Lcj6
+
+C Fills the pipeline and joins the main loop at its .LL10 entry point.
+.grt6:	ldf8	f44 = [rp], 8
+	getf.sig	r30 = f42	C hi
+	ldf8	f32 = [up], 8
+	;;
+	getf.sig	r27 = f39	C lo
+	xma.l	f37 = f33, f6, f45
+	ld8	r20 = [r9], 8
+	xma.hu	f41 = f33, f6, f45
+	;;
+	ldf8	f45 = [rp], 8
+	getf.sig	r31 = f43	C hi
+	ldf8	f33 = [up], 8
+	;;
+	getf.sig	r24 = f36	C lo
+	xma.l	f38 = f34, f6, f46
+	ld8	r21 = [r9], 8
+	xma.hu	f42 = f34, f6, f46
+	;;
+	ldf8	f46 = [rp], 8
+	getf.sig	r28 = f40	C hi
+	ldf8	f34 = [up], 8
+	;;
+	getf.sig	r25 = f37	C lo
+	xma.l	f39 = f35, f6, f47
+	ld8	r22 = [r9], 8
+	xma.hu	f43 = f35, f6, f47
+	br	.LL10
+
+
+C n AND 3 = 3.  The code after the br.cloop handles n = 3 and finishes at
+C .Lcj3.
+.Lb11:	ldf8	f46 = [rp], 8
+	ldf8	f34 = [up], 8
+	;;
+	ldf8	f47 = [rp], 8
+	ldf8	f35 = [up], 8
+	br.cloop.dptk	.grt3
+
+	xma.l	f37 = f7, f6, f8
+	xma.hu	f41 = f7, f6, f8
+	;;
+	xma.l	f38 = f34, f6, f46
+	xma.hu	f42 = f34, f6, f46
+	;;
+	getf.sig	r25 = f37	C lo
+	xma.l	f39 = f35, f6, f47
+	xma.hu	f43 = f35, f6, f47
+	;;
+	getf.sig	r29 = f41	C hi
+	ld8	r22 = [r9], 8
+	;;
+	getf.sig	r26 = f38	C lo
+	getf.sig	r30 = f42	C hi
+	ld8	r23 = [r9], 8
+	;;
+	getf.sig	r27 = f39	C lo
+	getf.sig	r31 = f43	C hi
+	ld8	r20 = [r9], 8
+	br	.Lcj3
+
+C Feed-in for n AND 3 = 3 with n above 3.  Seven limbs are finished through
+C .Lcj7 when the count is used up, otherwise continue at .grt7.
+.grt3:	ldf8	f44 = [rp], 8
+	xma.l	f37 = f7, f6, f8
+	ldf8	f32 = [up], 8
+	xma.hu	f41 = f7, f6, f8
+	;;
+	ldf8	f45 = [rp], 8
+	xma.l	f38 = f34, f6, f46
+	ldf8	f33 = [up], 8
+	xma.hu	f42 = f34, f6, f46
+	;;
+	ldf8	f46 = [rp], 8
+	ldf8	f34 = [up], 8
+	;;
+	getf.sig	r25 = f37	C lo
+	xma.l	f39 = f35, f6, f47
+	ld8	r22 = [r9], 8
+	xma.hu	f43 = f35, f6, f47
+	;;
+	ldf8	f47 = [rp], 8
+	getf.sig	r29 = f41	C hi
+	ldf8	f35 = [up], 8
+	;;
+	getf.sig	r26 = f38	C lo
+	xma.l	f36 = f32, f6, f44
+	ld8	r23 = [r9], 8
+	xma.hu	f40 = f32, f6, f44
+	br.cloop.dptk	.grt7
+	;;
+
+	getf.sig	r30 = f42	C hi
+	getf.sig	r27 = f39	C lo
+	xma.l	f37 = f33, f6, f45
+	ld8	r20 = [r9], 8
+	xma.hu	f41 = f33, f6, f45
+	;;
+	getf.sig	r31 = f43	C hi
+	getf.sig	r24 = f36	C lo
+	xma.l	f38 = f34, f6, f46
+	ld8	r21 = [r9], 8
+	xma.hu	f42 = f34, f6, f46
+	br	.Lcj7
+
+C Fills the pipeline and joins the main loop at its .LL11 entry point.
+.grt7:	ldf8	f44 = [rp], 8
+	getf.sig	r30 = f42	C hi
+	ldf8	f32 = [up], 8
+	;;
+	getf.sig	r27 = f39	C lo
+	xma.l	f37 = f33, f6, f45
+	ld8	r20 = [r9], 8
+	xma.hu	f41 = f33, f6, f45
+	;;
+	ldf8	f45 = [rp], 8
+	getf.sig	r31 = f43	C hi
+	ldf8	f33 = [up], 8
+	;;
+	getf.sig	r24 = f36	C lo
+	xma.l	f38 = f34, f6, f46
+	ld8	r21 = [r9], 8
+	xma.hu	f42 = f34, f6, f46
+	br	.LL11
+
+
+C n AND 3 = 0.  The code after the br.cloop handles n = 4 and finishes at
+C .Lcj4.
+.Lb00:	ldf8	f45 = [rp], 8
+	ldf8	f33 = [up], 8
+	;;
+	ldf8	f46 = [rp], 8
+	ldf8	f34 = [up], 8
+	;;
+	ldf8	f47 = [rp], 8
+	xma.l	f36 = f7, f6, f8
+	ldf8	f35 = [up], 8
+	xma.hu	f40 = f7, f6, f8
+	br.cloop.dptk	.grt4
+
+	xma.l	f37 = f33, f6, f45
+	xma.hu	f41 = f33, f6, f45
+	;;
+	getf.sig	r24 = f36	C lo
+	xma.l	f38 = f34, f6, f46
+	ld8	r21 = [r9], 8
+	xma.hu	f42 = f34, f6, f46
+	;;
+	getf.sig	r28 = f40	C hi
+	xma.l	f39 = f35, f6, f47
+	getf.sig	r25 = f37	C lo
+	ld8	r22 = [r9], 8
+	xma.hu	f43 = f35, f6, f47
+	;;
+	getf.sig	r29 = f41	C hi
+	getf.sig	r26 = f38	C lo
+	ld8	r23 = [r9], 8
+	;;
+	getf.sig	r30 = f42	C hi
+	getf.sig	r27 = f39	C lo
+	ld8	r20 = [r9], 8
+	br	.Lcj4
+
+C Feed-in for n AND 3 = 0 with n above 4.  Eight limbs are finished through
+C .Lcj8 when the count is used up, otherwise continue at .grt8.
+.grt4:	ldf8	f44 = [rp], 8
+	xma.l	f37 = f33, f6, f45
+	ldf8	f32 = [up], 8
+	xma.hu	f41 = f33, f6, f45
+	;;
+	ldf8	f45 = [rp], 8
+	ldf8	f33 = [up], 8
+	xma.l	f38 = f34, f6, f46
+	getf.sig	r24 = f36	C lo
+	ld8	r21 = [r9], 8
+	xma.hu	f42 = f34, f6, f46
+	;;
+	ldf8	f46 = [rp], 8
+	getf.sig	r28 = f40	C hi
+	ldf8	f34 = [up], 8
+	xma.l	f39 = f35, f6, f47
+	getf.sig	r25 = f37	C lo
+	ld8	r22 = [r9], 8
+	xma.hu	f43 = f35, f6, f47
+	;;
+	ldf8	f47 = [rp], 8
+	getf.sig	r29 = f41	C hi
+	ldf8	f35 = [up], 8
+	;;
+	getf.sig	r26 = f38	C lo
+	xma.l	f36 = f32, f6, f44
+	ld8	r23 = [r9], 8
+	xma.hu	f40 = f32, f6, f44
+	br.cloop.dptk	.grt8
+	;;
+
+	getf.sig	r30 = f42	C hi
+	getf.sig	r27 = f39	C lo
+	xma.l	f37 = f33, f6, f45
+	ld8	r20 = [r9], 8
+	xma.hu	f41 = f33, f6, f45
+	br	.Lcj8
+
+C Fills the pipeline and joins the main loop at its .LL00 entry point.
+.grt8:	ldf8	f44 = [rp], 8
+	getf.sig	r30 = f42	C hi
+	ldf8	f32 = [up], 8
+	;;
+	getf.sig	r27 = f39	C lo
+	xma.l	f37 = f33, f6, f45
+	ld8	r20 = [r9], 8
+	xma.hu	f41 = f33, f6, f45
+	br	.LL00
+
+C Main loop.  One iteration stores four result limbs through r10 and starts
+C the loads and multiplies for four later limbs.  .LL00, .LL11 and .LL10 are
+C mid-loop entry points used by the feed-in code above.
+	ALIGN(32)
+.Loop:
+{.mmi
+	ldf8	f44 = [rp], 8
+	cmp.ltu	p6, p0 = r27, r8	C lo cmp
+	sub	r14 = r27, r8		C lo sub
+}
+{.mmi
+	getf.sig	r30 = f42	C hi
+	ldf8	f32 = [up], 8
+	sub	r8 = r20, r31		C hi sub
+	;;				C 01
+}
+{.mmf
+	getf.sig	r27 = f39	C lo
+	st8	[r10] = r14, 8
+	xma.l	f37 = f33, f6, f45
+}
+{.mfi
+	ld8	r20 = [r9], 8
+	xma.hu	f41 = f33, f6, f45
+   (p6)	add	r8 = 1, r8
+	;;				C 02
+}
+{.mmi
+.LL00:	ldf8	f45 = [rp], 8
+	cmp.ltu	p6, p0 = r24, r8
+	sub	r14 = r24, r8
+}
+{.mmi
+	getf.sig	r31 = f43	C hi
+	ldf8	f33 = [up], 8
+	sub	r8 = r21, r28
+	;;				C 03
+}
+{.mmf
+	getf.sig	r24 = f36	C lo
+	st8	[r10] = r14, 8
+	xma.l	f38 = f34, f6, f46
+}
+{.mfi
+	ld8	r21 = [r9], 8
+	xma.hu	f42 = f34, f6, f46
+   (p6)	add	r8 = 1, r8
+	;;				C 04
+}
+{.mmi
+.LL11:	ldf8	f46 = [rp], 8
+	cmp.ltu	p6, p0 = r25, r8
+	sub	r14 = r25, r8
+}
+{.mmi
+	getf.sig	r28 = f40	C hi
+	ldf8	f34 = [up], 8
+	sub	r8 = r22, r29
+	;;				C 05
+}
+{.mmf
+	getf.sig	r25 = f37	C lo
+	st8	[r10] = r14, 8
+	xma.l	f39 = f35, f6, f47
+}
+{.mfi
+	ld8	r22 = [r9], 8
+	xma.hu	f43 = f35, f6, f47
+   (p6)	add	r8 = 1, r8
+	;;				C 06
+}
+{.mmi
+.LL10:	ldf8	f47 = [rp], 8
+	cmp.ltu	p6, p0 = r26, r8
+	sub	r14 = r26, r8
+}
+{.mmi
+	getf.sig	r29 = f41	C hi
+	ldf8	f35 = [up], 8
+	sub	r8 = r23, r30
+	;;				C 07
+}
+{.mmf
+	getf.sig	r26 = f38	C lo
+	st8	[r10] = r14, 8
+	xma.l	f36 = f32, f6, f44
+}
+{.mfi
+	ld8	r23 = [r9], 8
+	xma.hu	f40 = f32, f6, f44
+   (p6)	add	r8 = 1, r8
+}
+	br.cloop.dptk	.Loop
+	;;
+
+C Wind-down.  Each stage below stores one limb and falls into the next one.
+C A label .LcjN is entered with N limbs still to be stored, and .Lend is
+C entered from the loop with nine.  No further ldf8 loads are issued here,
+C and the later stages drop the xma, getf.sig and ld8 instructions as the
+C pipeline drains.
+.Lend:
+	cmp.ltu	p6, p0 = r27, r8
+	sub	r14 = r27, r8
+	getf.sig	r30 = f42
+	sub	r8 = r20, r31
+	;;
+	getf.sig	r27 = f39
+	st8	[r10] = r14, 8
+	xma.l	f37 = f33, f6, f45
+	ld8	r20 = [r9], 8
+	xma.hu	f41 = f33, f6, f45
+   (p6)	add	r8 = 1, r8
+	;;
+.Lcj8:
+	cmp.ltu	p6, p0 = r24, r8
+	sub	r14 = r24, r8
+	getf.sig	r31 = f43
+	sub	r8 = r21, r28
+	;;
+	getf.sig	r24 = f36
+	st8	[r10] = r14, 8
+	xma.l	f38 = f34, f6, f46
+	ld8	r21 = [r9], 8
+	xma.hu	f42 = f34, f6, f46
+   (p6)	add	r8 = 1, r8
+	;;
+.Lcj7:
+	cmp.ltu	p6, p0 = r25, r8
+	sub	r14 = r25, r8
+	getf.sig	r28 = f40
+	sub	r8 = r22, r29
+	;;
+	getf.sig	r25 = f37
+	st8	[r10] = r14, 8
+	xma.l	f39 = f35, f6, f47
+	ld8	r22 = [r9], 8
+	xma.hu	f43 = f35, f6, f47
+   (p6)	add	r8 = 1, r8
+	;;
+.Lcj6:
+	cmp.ltu	p6, p0 = r26, r8
+	sub	r14 = r26, r8
+	getf.sig	r29 = f41
+	sub	r8 = r23, r30
+	;;
+	getf.sig	r26 = f38
+	st8	[r10] = r14, 8
+	ld8	r23 = [r9], 8
+   (p6)	add	r8 = 1, r8
+	;;
+.Lcj5:
+	cmp.ltu	p6, p0 = r27, r8
+	sub	r14 = r27, r8
+	getf.sig	r30 = f42
+	sub	r8 = r20, r31
+	;;
+	getf.sig	r27 = f39
+	st8	[r10] = r14, 8
+	ld8	r20 = [r9], 8
+   (p6)	add	r8 = 1, r8
+	;;
+.Lcj4:
+	cmp.ltu	p6, p0 = r24, r8
+	sub	r14 = r24, r8
+	getf.sig	r31 = f43
+	sub	r8 = r21, r28
+	;;
+	st8	[r10] = r14, 8
+   (p6)	add	r8 = 1, r8
+	;;
+.Lcj3:
+	cmp.ltu	p6, p0 = r25, r8
+	sub	r14 = r25, r8
+	sub	r8 = r22, r29
+	;;
+	st8	[r10] = r14, 8
+   (p6)	add	r8 = 1, r8
+	;;
+.Lcj2:
+	cmp.ltu	p6, p0 = r26, r8
+	sub	r14 = r26, r8
+	sub	r8 = r23, r30
+	;;
+	st8	[r10] = r14, 8
+   (p6)	add	r8 = 1, r8
+	;;
+C Last limb: store it, restore the saved ar.lc and return with the final
+C carry limb in r8.
+.Lcj1:
+	cmp.ltu	p6, p0 = r27, r8
+	sub	r14 = r27, r8
+	sub	r8 = r20, r31
+	;;
+	st8	[r10] = r14, 8
+	mov	ar.lc = r2
+   (p6)	add	r8 = 1, r8
+	br.ret.sptk.many	b0
+C Reached when vl is zero, before any st8 has been issued: nothing is stored
+C and r8 still holds the zero written at entry.
+.Ldone:	mov	ar.lc = r2
+	br.ret.sptk.many	b0
+EPILOGUE()
+ASM_END() |