Diffstat (limited to 'gmp/mpn/x86_64/addmul_2.asm')
-rw-r--r--  gmp/mpn/x86_64/addmul_2.asm  175
1 file changed, 79 insertions(+), 96 deletions(-)
diff --git a/gmp/mpn/x86_64/addmul_2.asm b/gmp/mpn/x86_64/addmul_2.asm
index 18307d719f..8f133c3b00 100644
--- a/gmp/mpn/x86_64/addmul_2.asm
+++ b/gmp/mpn/x86_64/addmul_2.asm
@@ -1,51 +1,39 @@
dnl AMD64 mpn_addmul_2 -- Multiply an n-limb vector with a 2-limb vector and
dnl add the result to a third limb vector.
-dnl Copyright 2008, 2011, 2012 Free Software Foundation, Inc.
+dnl Copyright 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 2.375
-C AMD K10 2.375
-C Intel P4 15-16
-C Intel core2 4.45
-C Intel NHM 4.32
-C Intel SBR 3.4
-C Intel atom ?
-C VIA nano 4.4
+C K8,K9: 2.375
+C K10: 2.375
+C P4: ?
+C P6-15: 4.45
C This code is the result of running a code generation and optimization tool
C suite written by David Harvey and Torbjorn Granlund.
C TODO
-C * Tune feed-in and wind-down code.
+C * Work on feed-in and wind-down code.
+C * Convert "mov $0" to "xor".
+C * Adjust initial lea to save some bytes.
+C * Perhaps adjust n from n_param&3 value?
C INPUT PARAMETERS
define(`rp', `%rdi')
@@ -61,124 +49,119 @@ define(`w2', `%rbp')
define(`w3', `%r10')
define(`n', `%r11')
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_addmul_2)
- FUNC_ENTRY(4)
- mov n_param, n
push %rbx
push %rbp
- mov 0(vp), v0
+ mov (vp), v0
mov 8(vp), v1
- mov R32(n_param), R32(%rbx)
- mov (up), %rax
- lea -8(up,n_param,8), up
- lea -8(rp,n_param,8), rp
- mul v0
+ mov n_param, n
neg n
- and $3, R32(%rbx)
- jz L(b0)
- cmp $2, R32(%rbx)
- jc L(b1)
- jz L(b2)
-
-L(b3): mov %rax, w1
+ lea -32(up,n_param,8), up
+ lea -32(rp,n_param,8), rp
+
+ and $3, R32(n_param)
+ jz L(am2p0)
+ cmp $2, R32(n_param)
+ jc L(am2p1)
+ jz L(am2p2)
+L(am2p3):
+ mov 32(up,n,8), %rax
+ mul v0
+ mov %rax, w1
+ mov 32(up,n,8), %rax
mov %rdx, w2
xor R32(w3), R32(w3)
- mov 8(up,n,8), %rax
- dec n
- jmp L(lo3)
-
-L(b2): mov %rax, w2
- mov 8(up,n,8), %rax
- mov %rdx, w3
- xor R32(w0), R32(w0)
- add $-2, n
- jmp L(lo2)
-
-L(b1): mov %rax, w3
- mov 8(up,n,8), %rax
- mov %rdx, w0
- xor R32(w1), R32(w1)
- inc n
- jmp L(lo1)
-
-L(b0): mov $0, R32(w3)
+ add $2, n
+ jmp L(am3)
+L(am2p0):
+ mov 32(up,n,8), %rax
+ mul v0
mov %rax, w0
- mov 8(up,n,8), %rax
+ mov 32(up,n,8), %rax
mov %rdx, w1
xor R32(w2), R32(w2)
- jmp L(lo0)
+ add $3, n
+ jmp L(am0)
+L(am2p1):
+ mov 32(up,n,8), %rax
+ mul v0
+ mov %rax, w3
+ mov 32(up,n,8), %rax
+ mov %rdx, w0
+ xor R32(w1), R32(w1)
+ jmp L(am1)
+L(am2p2):
+ mov 32(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov 32(up,n,8), %rax
+ mov %rdx, w3
+ xor R32(w0), R32(w0)
+ xor R32(w1), R32(w1)
+ add $1, n
+ jmp L(am2)
ALIGN(32)
-L(top): mov $0, R32(w1)
- mul v0
- add %rax, w3
- mov (up,n,8), %rax
- adc %rdx, w0
- adc $0, R32(w1)
-L(lo1): mul v1
+L(top):
add w3, (rp,n,8)
- mov $0, R32(w3)
adc %rax, w0
- mov $0, R32(w2)
mov 8(up,n,8), %rax
adc %rdx, w1
+ mov $0, R32(w2)
mul v0
add %rax, w0
mov 8(up,n,8), %rax
adc %rdx, w1
adc $0, R32(w2)
-L(lo0): mul v1
+L(am0): mul v1
add w0, 8(rp,n,8)
adc %rax, w1
adc %rdx, w2
mov 16(up,n,8), %rax
+ mov $0, R32(w3)
mul v0
add %rax, w1
+ mov 16(up,n,8), %rax
adc %rdx, w2
adc $0, R32(w3)
- mov 16(up,n,8), %rax
-L(lo3): mul v1
+L(am3): mul v1
add w1, 16(rp,n,8)
adc %rax, w2
- adc %rdx, w3
- xor R32(w0), R32(w0)
mov 24(up,n,8), %rax
+ adc %rdx, w3
mul v0
+ mov $0, R32(w0)
add %rax, w2
- mov 24(up,n,8), %rax
adc %rdx, w3
+ mov $0, R32(w1)
+ mov 24(up,n,8), %rax
adc $0, R32(w0)
-L(lo2): mul v1
+L(am2): mul v1
add w2, 24(rp,n,8)
adc %rax, w3
adc %rdx, w0
mov 32(up,n,8), %rax
- add $4, n
- js L(top)
-
-L(end): xor R32(w1), R32(w1)
mul v0
add %rax, w3
- mov (up), %rax
+ mov 32(up,n,8), %rax
adc %rdx, w0
- adc R32(w1), R32(w1)
- mul v1
- add w3, (rp)
+ adc $0, R32(w1)
+L(am1): mul v1
+ add $4, n
+ js L(top)
+
+ add w3, (rp,n,8)
adc %rax, w0
adc %rdx, w1
- mov w0, 8(rp)
+ mov w0, 8(rp,n,8)
mov w1, %rax
pop %rbp
pop %rbx
- FUNC_EXIT()
ret
EPILOGUE()
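
For readers coming to this diff cold: the header comment says the routine multiplies an n-limb vector by a 2-limb vector and adds the result to a third limb vector. Below is a minimal, unoptimized C model of that operation, consistent with the wind-down code above, which stores one carry limb just past the n result limbs (mov w0, 8(rp,n,8)) and returns the other in %rax (mov w1, %rax). The name ref_addmul_2, the limb_t typedef, and the use of unsigned __int128 are illustrative assumptions, not GMP's API; the caller must provide n+1 writable limbs at rp.

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb_t;

/* ref_addmul_2: illustrative C model of the job the assembly does,
   assuming 64-bit limbs and a compiler providing unsigned __int128.
   Computes {rp,n} += {up,n} * {vp,2}; the low carry limb is stored
   at rp[n] (a plain store, like the asm) and the high carry limb is
   returned.  Requires n >= 1.  */
limb_t
ref_addmul_2 (limb_t *rp, const limb_t *up, ptrdiff_t n, const limb_t *vp)
{
  limb_t v0 = vp[0], v1 = vp[1];
  limb_t w0 = 0, w1 = 0;		/* two-limb carry window */

  for (ptrdiff_t i = 0; i < n; i++)
    {
      unsigned __int128 p0 = (unsigned __int128) up[i] * v0;
      unsigned __int128 p1 = (unsigned __int128) up[i] * v1;

      /* add low(p0) and the pending low carry limb into rp[i] */
      unsigned __int128 s = (unsigned __int128) rp[i] + (limb_t) p0 + w0;
      rp[i] = (limb_t) s;

      /* roll the carry window forward one limb position */
      unsigned __int128 t = (p0 >> 64) + (limb_t) p1 + w1 + (limb_t) (s >> 64);
      w0 = (limb_t) t;
      w1 = (limb_t) (p1 >> 64) + (limb_t) (t >> 64);
    }

  rp[n] = w0;				/* one limb past the n inputs */
  return w1;
}

The pair of registers w0/w1 here plays the same role as the rotating w0..w3 accumulators in the assembly: a two-limb carry that is shifted down one position per input limb.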
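
The lea -32(up,n_param,8) / neg n pair in the new version is the usual mpn negated-index idiom: bias the pointers past the arrays (the -32 cancels the 0..32 byte displacements used by the four-limb unroll) and run a negative index toward zero, so the loop closes with add $4, n / js L(top) and needs no separate compare. A small C sketch of the same idea, with a hypothetical function name and a trivial one-add body standing in for the real per-limb work:

#include <stdint.h>

/* biased_add_n: sketch of the negated-index loop idiom.  Biasing the
   pointers by n lets every access be rp[i]/up[i] with i running from
   -n up to 0, so the sign of i alone terminates the loop, just as
   "add $4, n / js L(top)" does in the assembly.  */
void
biased_add_n (uint64_t *rp, const uint64_t *up, long n)
{
  rp += n;			/* like: lea ...(rp,n_param,8), rp */
  up += n;
  for (long i = -n; i < 0; i++)	/* like: neg n ... js L(top) */
    rp[i] += up[i];		/* (rp,n,8)-style addressing */
}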
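
Finally, the and $3, R32(n_param) / jz / jc / jz sequence dispatches on n mod 4 into one of four feed-in stubs (L(am2p0)..L(am2p3)), each of which jumps to a matching entry point L(am0)..L(am3) inside the 4x-unrolled loop, whose in-loop label order is am0, am3, am2, am1. Leftover iterations are thus absorbed by entering the unrolled body midway instead of by a cleanup loop. In C the same shape is the classic Duff's-device switch; step() is a hypothetical stand-in for one limb's worth of work, and n >= 1 is assumed as in the assembly:

#include <stdio.h>

/* step: hypothetical per-iteration payload.  */
static void step (long i) { printf ("limb %ld\n", i); }

/* run_4x: Duff's-device model of the feed-in dispatch; n & 3 selects
   the entry point into the 4x-unrolled body, mirroring the jumps to
   L(am0)..L(am3).  */
static void
run_4x (long n)
{
  long i = 0;
  switch (n & 3)
    {
      do
	{
    case 0: step (i++);		/* remainder 0: start a full group */
    case 3: step (i++);		/* remainder 3: three partial steps */
    case 2: step (i++);
    case 1: step (i++);
	}
      while (i < n);
    }
}

int
main (void)
{
  run_4x (7);			/* 7 & 3 = 3, so execution enters at case 3 */
  return 0;
}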