summaryrefslogtreecommitdiff
path: root/gmp/mpn/alpha/ev6/mod_1_4.asm
diff options
context:
space:
mode:
Diffstat (limited to 'gmp/mpn/alpha/ev6/mod_1_4.asm')
-rw-r--r--gmp/mpn/alpha/ev6/mod_1_4.asm337
1 files changed, 337 insertions, 0 deletions
diff --git a/gmp/mpn/alpha/ev6/mod_1_4.asm b/gmp/mpn/alpha/ev6/mod_1_4.asm
new file mode 100644
index 0000000000..836de07c0f
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/mod_1_4.asm
@@ -0,0 +1,337 @@
+dnl Alpha mpn_mod_1s_4p
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2009, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C * Optimise. 2.75 c/l should be possible.
+C * Write a proper mpn_mod_1s_4p_cps. The code below was compiler generated.
+C * Optimise feed-in code, starting the sw pipeline in switch code.
+C * Shorten software pipeline. The mul instructions are scheduled too far
+C from their users. Fixing this will allow us to use fewer registers.
+C * If we cannot reduce register usage, write perhaps small-n basecase.
+C * Does this work for PIC?
+
+C cycles/limb
+C EV4: ?
+C EV5: 23
+C EV6: 3
+
+define(`ap', `r16')
+define(`n', `r17')
+define(`pl', `r24')
+define(`ph', `r25')
+define(`rl', `r6')
+define(`rh', `r7')
+define(`B1modb', `r1')
+define(`B2modb', `r2')
+define(`B3modb', `r3')
+define(`B4modb', `r4')
+define(`B5modb', `r5')
+
+ASM_START()
+PROLOGUE(mpn_mod_1s_4p)
+ lda r30, -64(r30)
+ stq r9, 8(r30)
+ ldq B1modb, 16(r19)
+ stq r10, 16(r30)
+ ldq B2modb, 24(r19)
+ stq r11, 24(r30)
+ ldq B3modb, 32(r19)
+ stq r12, 32(r30)
+ ldq B4modb, 40(r19)
+ stq r13, 40(r30)
+ ldq B5modb, 48(r19)
+ s8addq n, ap, ap C point ap at vector end
+
+ and n, 3, r0
+ lda n, -4(n)
+ beq r0, L(b0)
+ lda r6, -2(r0)
+ blt r6, L(b1)
+ beq r6, L(b2)
+
+L(b3): ldq r21, -16(ap)
+ ldq r22, -8(ap)
+ ldq r20, -24(ap)
+ mulq r21, B1modb, r8
+ umulh r21, B1modb, r12
+ mulq r22, B2modb, r9
+ umulh r22, B2modb, r13
+ addq r8, r20, pl
+ cmpult pl, r8, r0
+ addq r0, r12, ph
+ addq r9, pl, rl
+ cmpult rl, r9, r0
+ addq r13, ph, ph
+ addq r0, ph, rh
+ lda ap, -56(ap)
+ br L(com)
+
+L(b0): ldq r21, -24(ap)
+ ldq r22, -16(ap)
+ ldq r23, -8(ap)
+ ldq r20, -32(ap)
+ mulq r21, B1modb, r8
+ umulh r21, B1modb, r12
+ mulq r22, B2modb, r9
+ umulh r22, B2modb, r13
+ mulq r23, B3modb, r10
+ umulh r23, B3modb, r27
+ addq r8, r20, pl
+ cmpult pl, r8, r0
+ addq r0, r12, ph
+ addq r9, pl, pl
+ cmpult pl, r9, r0
+ addq r13, ph, ph
+ addq r0, ph, ph
+ addq r10, pl, rl
+ cmpult rl, r10, r0
+ addq r27, ph, ph
+ addq r0, ph, rh
+ lda ap, -64(ap)
+ br L(com)
+
+L(b1): bis r31, r31, rh
+ ldq rl, -8(ap)
+ lda ap, -40(ap)
+ br L(com)
+
+L(b2): ldq rh, -8(ap)
+ ldq rl, -16(ap)
+ lda ap, -48(ap)
+
+L(com): ble n, L(ed3)
+ ldq r21, 8(ap)
+ ldq r22, 16(ap)
+ ldq r23, 24(ap)
+ ldq r20, 0(ap)
+ lda n, -4(n)
+ lda ap, -32(ap)
+ mulq r21, B1modb, r8
+ umulh r21, B1modb, r12
+ mulq r22, B2modb, r9
+ umulh r22, B2modb, r13
+ mulq r23, B3modb, r10
+ umulh r23, B3modb, r27
+ mulq rl, B4modb, r11
+ umulh rl, B4modb, r28
+ ble n, L(ed2)
+
+ ALIGN(16)
+L(top): ldq r21, 8(ap)
+ mulq rh, B5modb, rl
+ addq r8, r20, pl
+ ldq r22, 16(ap)
+ cmpult pl, r8, r0
+ umulh rh, B5modb, rh
+ ldq r23, 24(ap)
+ addq r0, r12, ph
+ addq r9, pl, pl
+ mulq r21, B1modb, r8
+ cmpult pl, r9, r0
+ addq r13, ph, ph
+ umulh r21, B1modb, r12
+ lda ap, -32(ap)
+ addq r0, ph, ph
+ addq r10, pl, pl
+ mulq r22, B2modb, r9
+ cmpult pl, r10, r0
+ addq r27, ph, ph
+ addq r11, pl, pl
+ umulh r22, B2modb, r13
+ addq r0, ph, ph
+ cmpult pl, r11, r0
+ addq r28, ph, ph
+ mulq r23, B3modb, r10
+ ldq r20, 32(ap)
+ addq pl, rl, rl
+ umulh r23, B3modb, r27
+ addq r0, ph, ph
+ cmpult rl, pl, r0
+ mulq rl, B4modb, r11
+ addq ph, rh, rh
+ umulh rl, B4modb, r28
+ addq r0, rh, rh
+ lda n, -4(n)
+ bgt n, L(top)
+
+L(ed2): mulq rh, B5modb, rl
+ addq r8, r20, pl
+ umulh rh, B5modb, rh
+ cmpult pl, r8, r0
+ addq r0, r12, ph
+ addq r9, pl, pl
+ cmpult pl, r9, r0
+ addq r13, ph, ph
+ addq r0, ph, ph
+ addq r10, pl, pl
+ cmpult pl, r10, r0
+ addq r27, ph, ph
+ addq r11, pl, pl
+ addq r0, ph, ph
+ cmpult pl, r11, r0
+ addq r28, ph, ph
+ addq pl, rl, rl
+ addq r0, ph, ph
+ cmpult rl, pl, r0
+ addq ph, rh, rh
+ addq r0, rh, rh
+
+L(ed3): mulq rh, B1modb, r8
+ umulh rh, B1modb, rh
+ addq r8, rl, rl
+ cmpult rl, r8, r0
+ addq r0, rh, rh
+
+ ldq r24, 8(r19) C cnt
+ sll rh, r24, rh
+ subq r31, r24, r25
+ srl rl, r25, r2
+ sll rl, r24, rl
+ or r2, rh, rh
+
+ ldq r23, 0(r19) C bi
+ mulq rh, r23, r8
+ umulh rh, r23, r9
+ addq rh, 1, r7
+ addq r8, rl, r8 C ql
+ cmpult r8, rl, r0
+ addq r9, r7, r9
+ addq r0, r9, r9 C qh
+ mulq r9, r18, r21 C qh * b
+ subq rl, r21, rl
+ cmpult r8, rl, r0 C rl > ql
+ negq r0, r0
+ and r0, r18, r0
+ addq rl, r0, rl
+ cmpule r18, rl, r0 C rl >= b
+ negq r0, r0
+ and r0, r18, r0
+ subq rl, r0, rl
+
+ srl rl, r24, r0
+
+ ldq r9, 8(r30)
+ ldq r10, 16(r30)
+ ldq r11, 24(r30)
+ ldq r12, 32(r30)
+ ldq r13, 40(r30)
+ lda r30, 64(r30)
+ ret r31, (r26), 1
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1s_4p_cps,gp)
+ lda r30, -32(r30)
+ stq r26, 0(r30)
+ stq r9, 8(r30)
+ stq r10, 16(r30)
+ stq r11, 24(r30)
+ mov r16, r11
+ LEA( r4, __clz_tab)
+ lda r10, 65(r31)
+ cmpbge r31, r17, r1
+ srl r1, 1, r1
+ xor r1, 127, r1
+ addq r1, r4, r1
+ ldq_u r2, 0(r1)
+ extbl r2, r1, r2
+ s8subq r2, 7, r2
+ srl r17, r2, r3
+ subq r10, r2, r10
+ addq r3, r4, r3
+ ldq_u r1, 0(r3)
+ extbl r1, r3, r1
+ subq r10, r1, r10
+ sll r17, r10, r9
+ mov r9, r16
+ jsr r26, mpn_invert_limb
+ ldah r29, 0(r26)
+ subq r31, r10, r2
+ lda r1, 1(r31)
+ sll r1, r10, r1
+ subq r31, r9, r3
+ srl r0, r2, r2
+ ldq r26, 0(r30)
+ bis r2, r1, r2
+ lda r29, 0(r29)
+ stq r0, 0(r11)
+ stq r10, 8(r11)
+ mulq r2, r3, r2
+ srl r2, r10, r3
+ umulh r2, r0, r1
+ stq r3, 16(r11)
+ mulq r2, r0, r3
+ ornot r31, r1, r1
+ subq r1, r2, r1
+ mulq r1, r9, r1
+ addq r1, r9, r2
+ cmpule r1, r3, r3
+ cmoveq r3, r2, r1
+ srl r1, r10, r3
+ umulh r1, r0, r2
+ stq r3, 24(r11)
+ mulq r1, r0, r3
+ ornot r31, r2, r2
+ subq r2, r1, r2
+ mulq r2, r9, r2
+ addq r2, r9, r1
+ cmpule r2, r3, r3
+ cmoveq r3, r1, r2
+ srl r2, r10, r1
+ umulh r2, r0, r3
+ stq r1, 32(r11)
+ mulq r2, r0, r1
+ ornot r31, r3, r3
+ subq r3, r2, r3
+ mulq r3, r9, r3
+ addq r3, r9, r2
+ cmpule r3, r1, r1
+ cmoveq r1, r2, r3
+ srl r3, r10, r2
+ umulh r3, r0, r1
+ stq r2, 40(r11)
+ mulq r3, r0, r0
+ ornot r31, r1, r1
+ subq r1, r3, r1
+ mulq r1, r9, r1
+ addq r1, r9, r9
+ cmpule r1, r0, r0
+ cmoveq r0, r9, r1
+ ldq r9, 8(r30)
+ srl r1, r10, r1
+ ldq r10, 16(r30)
+ stq r1, 48(r11)
+ ldq r11, 24(r30)
+ lda r30, 32(r30)
+ ret r31, (r26), 1
+EPILOGUE()