author     Kevin Ryde <user42@zip.com.au>   2004-05-06 03:30:17 +0200
committer  Kevin Ryde <user42@zip.com.au>   2004-05-06 03:30:17 +0200
commit     7cd05c296e91a7aae92de2486d632b9baf58c61f (patch)
tree       7c05f61caf267ae9e64767e0407b9edd7a46e5f3 /mpn
parent     a58bb06a0c53e5efda374f0e1197dc7668fcda44 (diff)
* mpn/alpha/mode1o.asm: New file.
Diffstat (limited to 'mpn')
-rw-r--r--   mpn/alpha/mode1o.asm   200
1 file changed, 200 insertions(+), 0 deletions(-)
diff --git a/mpn/alpha/mode1o.asm b/mpn/alpha/mode1o.asm
new file mode 100644
index 000000000..c8dc9e679
--- /dev/null
+++ b/mpn/alpha/mode1o.asm
@@ -0,0 +1,200 @@
+dnl Alpha mpn_modexact_1c_odd -- mpn exact remainder
+
+dnl Copyright 2003, 2004 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C EV4: 47
+C EV5: 30
+C EV6: 15
+
+
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
+C                                mp_limb_t c)
+C
+C This code follows the "alternate" code in mpn/generic/mode1o.c,
+C eliminating cbit+climb from the dependent chain. This leaves,
+C
+C          ev4    ev5    ev6
+C           1      3      1     subq    y = x - h
+C          23     13      7     mulq    q = y * inverse
+C          23     14      7     umulh   h = high (q * d)
+C          --     --     --
+C          47     30     15
+C
+C In each case, the load latency, loop control, and extra carry bit handling
+C hide under the multiply latencies. Those latencies are long enough that
+C we don't need to worry about alignment or pairing to squeeze out
+C performance.
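+C
+C As a rough C sketch of that chain (mp_limb_t as in gmp.h; umul_ppmm
+C from longlong.h gives the high and low halves of a product, with
+C "dummy" here just taking the unused low half):
+C
+C         y = x - climb;
+C         q = y * inverse;
+C         umul_ppmm (climb, dummy, q, d);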
+C
+C For the first limb, some of the loop code is broken out and scheduled back
+C since it can be done earlier.
+C
+C - The first ldq of src[0] is issued near the start of the routine, to
+C   give that load as much time as possible before its value is needed.
+C
+C - The subq y=x-climb can be done without waiting for the inverse.
+C
+C - The mulq y*inverse is replicated after the final subq for the inverse,
+C instead of branching to the mulq in the main loop. On ev4 a branch
+C there would cost cycles, but we can hide them under the mulq latency.
+C
+C For the last limb, high<divisor is tested and if that's true a subtract
+C and addback is done, as per the main mpn/generic/mode1o.c code. This is a
+C data-dependent branch, but we're waiting for umulh so any penalty should
+C hide there. The multiplies saved would be worth the cost anyway.
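+C
+C One whole loop iteration in C, carry bits included (a sketch in the
+C style of mpn/generic/mode1o.c, not a drop-in equivalent of the code
+C below):
+C
+C         s = *src++;
+C         x = s - cbit;
+C         cbit = (s < cbit);
+C         y = x - climb;
+C         cbit += (x < climb);
+C         q = y * inverse;
+C         umul_ppmm (climb, dummy, q, d);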
+C
+C Enhancements:
+C
+C For size==1, a plain division (done bitwise, say) might be faster than
+C calculating an inverse, the latter taking about 130 cycles on ev4 or
+C 70 on ev5.  A call to the gcc support routine __remqu might be a
+C possibility.
+
+ASM_START()
+PROLOGUE(mpn_modexact_1c_odd,gp)
+
+ C r16 src
+ C r17 size
+ C r18 d
+ C r19 c
+
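+ C The inverse is built as in the modlimb_invert macro: the table
+ C gives 8 good bits, then each Newton step i = 2*i - i*i*d doubles
+ C the number of correct low bits.  Roughly, in C:
+ C
+ C     i = modlimb_invert_table[(d >> 1) & 0x7F];   /*  8 bits */
+ C     i = 2*i - i*i*d;                             /* 16 bits */
+ C     i = 2*i - i*i*d;                             /* 32 bits */
+ C     i = 2*i - i*i*d;                             /* 64 bits */
+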
+ LEA(r0, modlimb_invert_table)
+ srl r18, 1, r20 C d >> 1
+
+ and r20, 127, r20 C idx = d>>1 & 0x7F
+
+ addq r0, r20, r21 C table + idx
+
+ifelse(bwx_available_p,1,
+` ldbu r20, 0(r21) C table[idx], inverse 8 bits
+',`
+ ldq_u r20, 0(r21) C table[idx] qword
+ extbl r20, r21, r20 C table[idx], inverse 8 bits
+')
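+ C (ldbu needs the BWX byte/word extension; on older chips the
+ C unaligned quadword load plus extbl extracts the byte instead.)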
+
+ mull r20, r20, r7 C i*i
+ addq r20, r20, r20 C 2*i
+
+ ldq r2, 0(r16) C x = s = src[0]
+ lda r17, -1(r17) C size--
+ clr r0 C initial cbit=0
+
+ mull r7, r18, r7 C i*i*d
+
+ subq r20, r7, r20 C 2*i-i*i*d, inverse 16 bits
+
+ mull r20, r20, r7 C i*i
+ addq r20, r20, r20 C 2*i
+
+ mull r7, r18, r7 C i*i*d
+
+ subq r20, r7, r20 C 2*i-i*i*d, inverse 32 bits
+
+ mulq r20, r20, r7 C i*i
+ addq r20, r20, r20 C 2*i
+
+ mulq r7, r18, r7 C i*i*d
+ subq r2, r19, r3 C y = x - climb
+
+ subq r20, r7, r20 C inv = 2*i-i*i*d, inverse 64 bits
+
+ASSERT(r7, C should have d*inv==1 mod 2^64
+` mulq r18, r20, r7
+ cmpeq r7, 1, r7')
+
+ mulq r3, r20, r4 C first q = y * inv
+
+ beq r17, L(one) C if size==1
+ br L(entry)
+
+
+L(top):
+ C r0 cbit
+ C r16 src, incrementing
+ C r17 size, decrementing
+ C r18 d
+ C r19 climb
+ C r20 inv
+
+ ldq r1, 0(r16) C s = src[i]
+ subq r1, r0, r2 C x = s - cbit
+ cmpult r1, r0, r0 C new cbit = s < cbit
+
+ subq r2, r19, r3 C y = x - climb
+
+ mulq r3, r20, r4 C q = y * inv
+L(entry):
+ cmpult r2, r19, r5 C cbit2 = x < climb
+ addq r5, r0, r0 C cbit += cbit2
+ lda r16, 8(r16) C src++
+ lda r17, -1(r17) C size--
+
+ umulh r4, r18, r19 C climb = q * d
+ bne r17, L(top) C while 2 or more limbs left
+
+
+
+ C r0 cbit
+ C r18 d
+ C r19 climb
+ C r20 inv
+
+ ldq r1, 0(r16) C s = src[size-1] high limb
+
+ cmpult r1, r18, r2 C test high<divisor
+ bne r2, L(skip) C skip if so
+
+ C can't skip a division, repeat loop code
+
+ subq r1, r0, r2 C x = s - cbit
+ cmpult r1, r0, r0 C new cbit = s < cbit
+
+ subq r2, r19, r3 C y = x - climb
+
+ mulq r3, r20, r4 C q = y * inv
+L(one):
+ cmpult r2, r19, r5 C cbit2 = x < climb
+ addq r5, r0, r0 C cbit += cbit2
+
+ umulh r4, r18, r19 C climb = q * d
+
+ addq r19, r0, r0 C return climb + cbit
+ ret r31, (r26), 1
+
+
+ ALIGN(8)
+L(skip):
+ C with high<divisor, the final step can be just (cbit+climb)-s and
+ C an addback of d if that underflows
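+ C In C, a sketch of this tail:  c = climb + cbit;  r = c - s;
+ C if (c < s) r += d;  return r;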
+
+ addq r19, r0, r19 C c = climb + cbit
+
+ subq r19, r1, r2 C c - s
+ cmpult r19, r1, r3 C c < s
+
+ addq r2, r18, r0 C return c-s + divisor
+
+ cmoveq r3, r2, r0 C return c-s if no underflow
+ ret r31, (r26), 1
+
+EPILOGUE()
+ASM_END()