summaryrefslogtreecommitdiff
path: root/gmp/mpn/alpha/ev6/add_n.asm
diff options
context:
space:
mode:
Diffstat (limited to 'gmp/mpn/alpha/ev6/add_n.asm')
-rw-r--r--gmp/mpn/alpha/ev6/add_n.asm283
1 files changed, 283 insertions, 0 deletions
diff --git a/gmp/mpn/alpha/ev6/add_n.asm b/gmp/mpn/alpha/ev6/add_n.asm
new file mode 100644
index 0000000000..9261f31b8a
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/add_n.asm
@@ -0,0 +1,283 @@
+dnl Alpha ev6 mpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl store sum in a third limb vector.
+
+dnl Copyright 2000, 2003, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: ?
+C EV5: 5.4
+C EV6: 2.125
+
+C INPUT PARAMETERS
+C rp r16
+C up r17
+C vp r18
+C n r19
+C cy r20 (for mpn_add_nc)
+
+C TODO
+C Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
+C Use multi-pronged feed-in.
+C Perform additional micro-tuning
+
+C This code was written in cooperation with ev6 pipeline expert Steve Root.
+
+C Pair loads and stores where possible
+C Store pairs oct-aligned where possible (didn't need it here)
+C Stores are delayed every third cycle
+C Loads and stores are delayed by fills
+C U stays still, put code there where possible (note alternation of U1 and U0)
+C L moves because of loads and stores
+C Note dampers in L to limit damage
+
+C This odd-looking optimization expects that were having random bits in our
+C data, so that a pure zero result is unlikely. so we penalize the unlikely
+C case to help the common case.
+
+define(`u0', `r0') define(`u1', `r3')
+define(`v0', `r1') define(`v1', `r4')
+
+define(`cy0', `r20') define(`cy1', `r21')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc)
+
+ASM_START()
+PROLOGUE(mpn_add_nc)
+ br r31, $entry
+EPILOGUE()
+PROLOGUE(mpn_add_n)
+ bis r31, r31, cy0 C clear carry in
+$entry: cmpult r19, 5, r22 C L1 move counter
+ ldq u1, 0(r17) C L0 get next ones
+ ldq v1, 0(r18) C L1
+ bne r22, $Lsmall
+
+ ldq u0, 8(r17) C L0 get next ones
+ ldq v0, 8(r18) C L1
+ addq u1, v1, r5 C U0 add two data
+
+ cmpult r5, v1, r23 C U0 did it carry
+ ldq u1, 16(r17) C L0 get next ones
+ ldq v1, 16(r18) C L1
+
+ addq u0, v0, r8 C U1 add two data
+ addq r5, cy0, r5 C U0 carry in
+
+ cmpult r8, v0, r22 C U1 did it carry
+ beq r5, $fix5f C U0 fix exact zero
+$ret5f: ldq u0, 24(r17) C L0 get next ones
+ ldq v0, 24(r18) C L1
+
+ addq r8, r23, r8 C U1 carry from last
+ addq u1, v1, r7 C U0 add two data
+
+ beq r8, $fix6f C U1 fix exact zero
+$ret6f: cmpult r7, v1, r23 C U0 did it carry
+ ldq u1, 32(r17) C L0 get next ones
+ ldq v1, 32(r18) C L1
+
+ lda r17, 40(r17) C L0 move pointer
+ lda r18, 40(r18) C L1 move pointer
+
+ lda r16, -8(r16)
+ lda r19, -13(r19) C L1 move counter
+ blt r19, $Lend C U1 loop control
+
+
+C Main loop. 8-way unrolled.
+ ALIGN(16)
+$Loop: addq u0, v0, r2 C U1 add two data
+ addq r7, r22, r7 C U0 add in carry
+ stq r5, 8(r16) C L0 put an answer
+ stq r8, 16(r16) C L1 pair
+
+ cmpult r2, v0, cy1 C U1 did it carry
+ beq r7, $fix7 C U0 fix exact 0
+$ret7: ldq u0, 0(r17) C L0 get next ones
+ ldq v0, 0(r18) C L1
+
+ bis r31, r31, r31 C L damp out
+ addq r2, r23, r2 C U1 carry from last
+ bis r31, r31, r31 C L moves in L !
+ addq u1, v1, r5 C U0 add two data
+
+ beq r2, $fix0 C U1 fix exact zero
+$ret0: cmpult r5, v1, cy0 C U0 did it carry
+ ldq u1, 8(r17) C L0 get next ones
+ ldq v1, 8(r18) C L1
+
+ addq u0, v0, r8 C U1 add two data
+ addq r5, cy1, r5 C U0 carry from last
+ stq r7, 24(r16) C L0 store pair
+ stq r2, 32(r16) C L1
+
+ cmpult r8, v0, r22 C U1 did it carry
+ beq r5, $fix1 C U0 fix exact zero
+$ret1: ldq u0, 16(r17) C L0 get next ones
+ ldq v0, 16(r18) C L1
+
+ lda r16, 64(r16) C L0 move pointer
+ addq r8, cy0, r8 C U1 carry from last
+ lda r19, -8(r19) C L1 move counter
+ addq u1, v1, r7 C U0 add two data
+
+ beq r8, $fix2 C U1 fix exact zero
+$ret2: cmpult r7, v1, r23 C U0 did it carry
+ ldq u1, 24(r17) C L0 get next ones
+ ldq v1, 24(r18) C L1
+
+ addq u0, v0, r2 C U1 add two data
+ addq r7, r22, r7 C U0 add in carry
+ stq r5, -24(r16) C L0 put an answer
+ stq r8, -16(r16) C L1 pair
+
+ cmpult r2, v0, cy1 C U1 did it carry
+ beq r7, $fix3 C U0 fix exact 0
+$ret3: ldq u0, 32(r17) C L0 get next ones
+ ldq v0, 32(r18) C L1
+
+ bis r31, r31, r31 C L damp out
+ addq r2, r23, r2 C U1 carry from last
+ bis r31, r31, r31 C L moves in L !
+ addq u1, v1, r5 C U0 add two data
+
+ beq r2, $fix4 C U1 fix exact zero
+$ret4: cmpult r5, v1, cy0 C U0 did it carry
+ ldq u1, 40(r17) C L0 get next ones
+ ldq v1, 40(r18) C L1
+
+ addq u0, v0, r8 C U1 add two data
+ addq r5, cy1, r5 C U0 carry from last
+ stq r7, -8(r16) C L0 store pair
+ stq r2, 0(r16) C L1
+
+ cmpult r8, v0, r22 C U1 did it carry
+ beq r5, $fix5 C U0 fix exact zero
+$ret5: ldq u0, 48(r17) C L0 get next ones
+ ldq v0, 48(r18) C L1
+
+ ldl r31, 256(r17) C L0 prefetch
+ addq r8, cy0, r8 C U1 carry from last
+ ldl r31, 256(r18) C L1 prefetch
+ addq u1, v1, r7 C U0 add two data
+
+ beq r8, $fix6 C U1 fix exact zero
+$ret6: cmpult r7, v1, r23 C U0 did it carry
+ ldq u1, 56(r17) C L0 get next ones
+ ldq v1, 56(r18) C L1
+
+ lda r17, 64(r17) C L0 move pointer
+ bis r31, r31, r31 C U
+ lda r18, 64(r18) C L1 move pointer
+ bge r19, $Loop C U1 loop control
+C ==== main loop end
+
+$Lend: addq u0, v0, r2 C U1 add two data
+ addq r7, r22, r7 C U0 add in carry
+ stq r5, 8(r16) C L0 put an answer
+ stq r8, 16(r16) C L1 pair
+ cmpult r2, v0, cy1 C U1 did it carry
+ beq r7, $fix7c C U0 fix exact 0
+$ret7c: addq r2, r23, r2 C U1 carry from last
+ addq u1, v1, r5 C U0 add two data
+ beq r2, $fix0c C U1 fix exact zero
+$ret0c: cmpult r5, v1, cy0 C U0 did it carry
+ addq r5, cy1, r5 C U0 carry from last
+ stq r7, 24(r16) C L0 store pair
+ stq r2, 32(r16) C L1
+ beq r5, $fix1c C U0 fix exact zero
+$ret1c: stq r5, 40(r16) C L0 put an answer
+ lda r16, 48(r16) C L0 move pointer
+
+ lda r19, 8(r19)
+ beq r19, $Lret
+
+ ldq u1, 0(r17)
+ ldq v1, 0(r18)
+$Lsmall:
+ lda r19, -1(r19)
+ beq r19, $Lend0
+
+ ALIGN(8)
+$Loop0: addq u1, v1, r2 C main add
+ cmpult r2, v1, r8 C compute cy from last add
+ ldq u1, 8(r17)
+ ldq v1, 8(r18)
+ addq r2, cy0, r5 C carry add
+ lda r17, 8(r17)
+ lda r18, 8(r18)
+ stq r5, 0(r16)
+ cmpult r5, r2, cy0 C compute cy from last add
+ lda r19, -1(r19) C decr loop cnt
+ bis r8, cy0, cy0 C combine cy from the two adds
+ lda r16, 8(r16)
+ bne r19, $Loop0
+$Lend0: addq u1, v1, r2 C main add
+ addq r2, cy0, r5 C carry add
+ cmpult r2, v1, r8 C compute cy from last add
+ cmpult r5, r2, cy0 C compute cy from last add
+ stq r5, 0(r16)
+ bis r8, cy0, r0 C combine cy from the two adds
+ ret r31,(r26),1
+
+ ALIGN(8)
+$Lret: lda r0, 0(cy0) C copy carry into return register
+ ret r31,(r26),1
+
+$fix5f: bis r23, cy0, r23 C bring forward carry
+ br r31, $ret5f
+$fix6f: bis r22, r23, r22 C bring forward carry
+ br r31, $ret6f
+$fix0: bis cy1, r23, cy1 C bring forward carry
+ br r31, $ret0
+$fix1: bis cy0, cy1, cy0 C bring forward carry
+ br r31, $ret1
+$fix2: bis r22, cy0, r22 C bring forward carry
+ br r31, $ret2
+$fix3: bis r23, r22, r23 C bring forward carry
+ br r31, $ret3
+$fix4: bis cy1, r23, cy1 C bring forward carry
+ br r31, $ret4
+$fix5: bis cy1, cy0, cy0 C bring forward carry
+ br r31, $ret5
+$fix6: bis r22, cy0, r22 C bring forward carry
+ br r31, $ret6
+$fix7: bis r23, r22, r23 C bring forward carry
+ br r31, $ret7
+$fix0c: bis cy1, r23, cy1 C bring forward carry
+ br r31, $ret0c
+$fix1c: bis cy0, cy1, cy0 C bring forward carry
+ br r31, $ret1c
+$fix7c: bis r23, r22, r23 C bring forward carry
+ br r31, $ret7c
+
+EPILOGUE()
+ASM_END()