author     Torbjorn Granlund <tege@gmplib.org>  2011-10-24 09:12:00 +0200
committer  Torbjorn Granlund <tege@gmplib.org>  2011-10-24 09:12:00 +0200
commit     8fec7463e2f655a6d996fa6300fb43b259d94901 (patch)
tree       b2fa29bbecd14ae524bad3691feaf99dbea323b1 /mpn
parent     436376764d813477c813b94c7efe4da8e71289ec (diff)
download   gmp-8fec7463e2f655a6d996fa6300fb43b259d94901.tar.gz
Put intermediate result into R, don't allocate any stack space.
Diffstat (limited to 'mpn')
-rw-r--r--  mpn/x86_64/sqr_basecase.asm | 71
1 file changed, 31 insertions(+), 40 deletions(-)
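In portable terms, the patch changes where basecase squaring keeps its intermediate product. The classic scheme first accumulates the off-diagonal (cross) products T = sum_{i<j} u[i]*u[j]*B^(i+j), then forms u^2 = 2*T + sum_i u[i]^2*B^(2i). Before this commit T lived in a dedicated stack buffer; after it, T is written into the upper limbs of the result area {rp, 2n} itself, placed so that the final doubling pass reads each temporary limb just before overwriting its slot. The C sketch below illustrates only that placement idea; the function names are made up, it assumes a compiler providing unsigned __int128 (GCC/Clang on x86-64), and it does not reproduce the assembly's addmul_2-based inner loops or its exact offsets.

#include <stdint.h>
#include <string.h>

typedef uint64_t limb;

/* 64x64 -> 128 bit multiply, a stand-in for the mul instruction.
   Assumes unsigned __int128.  */
static void umul(limb *hi, limb *lo, limb a, limb b)
{
  unsigned __int128 p = (unsigned __int128) a * b;
  *lo = (limb) p;
  *hi = (limb) (p >> 64);
}

/* Square {up,n} into {rp,2n}, n >= 1, with no separate scratch:
   the cross-product sum T (weights 1 .. 2n-2) is accumulated into
   rp[2 .. 2n-1], one limb above its weight, so that the combine
   pass reads each T limb just before overwriting its slot.  */
static void sqr_basecase_sketch(limb *rp, const limb *up, int n)
{
  memset(rp + 2, 0, (2 * n - 2) * sizeof(limb));

  /* T = sum_{i<j} u[i]*u[j] * B^(i+j); limb of weight w at rp[w+1] */
  for (int i = 0; i < n - 1; i++)
    {
      limb cy = 0;
      for (int j = i + 1; j < n; j++)
        {
          limb hi, lo, s, c;
          umul(&hi, &lo, up[i], up[j]);
          s = rp[i + j + 1] + lo;  c = s < lo;
          s += cy;                 c += s < cy;
          rp[i + j + 1] = s;
          cy = hi + c;  /* no wrap: u[i]*u[j] + limb + cy < B^2 */
        }
      rp[i + n + 1] = cy;  /* weight i+n slot, still zero here */
    }

  /* rp = 2*T + sum_i u[i]^2 * B^(2i), low to high; the T limb of
     weight p is read from rp[p+1] before rp[p] is stored.  */
  limb tcy = 0, dcy = 0, sqhi = 0;
  for (int p = 0; p < 2 * n; p++)
    {
      limb t = (p >= 1 && p <= 2 * n - 2) ? rp[p + 1] : 0;
      limb d = (t << 1) | tcy;  /* doubled cross-product limb */
      tcy = t >> 63;
      limb diag;
      if ((p & 1) == 0)
        umul(&sqhi, &diag, up[p / 2], up[p / 2]);
      else
        diag = sqhi;
      limb s = d + diag, c = s < diag;
      s += dcy;  c += s < dcy;
      rp[p] = s;
      dcy = c;
    }
}

The assembly below realizes the same discipline with unrolled addmul_2 passes and an n mod 4 dispatch, so the exact limb offsets vary by case, but the invariant is the one above: the temporaries sit a fixed small distance above the final-result cursor.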
diff --git a/mpn/x86_64/sqr_basecase.asm b/mpn/x86_64/sqr_basecase.asm
index b84019458..93c666599 100644
--- a/mpn/x86_64/sqr_basecase.asm
+++ b/mpn/x86_64/sqr_basecase.asm
@@ -30,11 +30,11 @@ C large trip count. Instead, we should follow the generic/sqr_basecase.c
C code which uses addmul_2s from the start, conditionally leaving a 1x1
C multiply to the end. (In assembly code, one would stop invoking
C addmul_2s loops when perhaps 3x2s respectively a 2x2s remains.)
-C * This code only handles operands up to SQR_TOOM2_THRESHOLD_MAX. That
-C means we can safely use 32-bit operations for all sizes, unlike in e.g.,
-C mpn_addmul_1.
+C * Align more labels, should shave off a few cycles.
+C * We can safely use 32-bit size operations, since operands with (2^32)
+C limbs will lead to non-termination in practice.
C * The jump table could probably be optimized, at least for non-pic.
-C * The special code for n=1,2,3 was quickly written. It is probably too
+C * The special code for n <= 4 was quickly written. It is probably too
C large and unnecessarily slow.
C * Consider combining small cases code so that the n=k-1 code jumps into the
C middle of the n=k code.
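The new second comment item compresses a concrete argument: with 64-bit limbs, a size of 2^32 limbs means operands of 2^32 * 8 bytes = 32 GiB each, and basecase squaring at that size costs on the order of n^2/2 ≈ 2^63 limb multiplications, which would never complete. So 32-bit size arithmetic stays safe even though the SQR_TOOM2_THRESHOLD_MAX cap that used to guarantee it is removed in the next hunk.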
@@ -62,12 +62,6 @@ define(`rp', `%rdi')
define(`up', `%rsi')
define(`n_param', `%rdx')
-C We should really trim this, for better spatial locality. Alternatively,
-C we could grab the upper part of the stack area, leaving the lower part
-C instead of the upper part unused.
-deflit(SQR_TOOM2_THRESHOLD_MAX, 80)
-define(`STACK_ALLOC', eval(8*2*SQR_TOOM2_THRESHOLD_MAX))
-
define(`n', `%r11')
define(`tp', `%r12')
define(`i', `%r8')
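The deleted macros sized the scratch at 8*2*80 = 1280 bytes, room for a full 2n-limb intermediate at the largest n the old code accepted. With the intermediate now written into {rp, 2n} directly, both the cap and the allocation go away, and the deleted locality remark is answered in the strongest way possible: scratch and destination now share the same memory.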
@@ -85,12 +79,12 @@ ASM_START()
ALIGN(16)
PROLOGUE(mpn_sqr_basecase)
- add $-48, %rsp
- mov %rbx, 40(%rsp)
- mov %rbp, 32(%rsp)
- mov %r12, 24(%rsp)
- mov %r13, 16(%rsp)
- mov %r14, 8(%rsp)
+ add $-40, %rsp
+ mov %rbx, 32(%rsp)
+ mov %rbp, 24(%rsp)
+ mov %r12, 16(%rsp)
+ mov %r13, 8(%rsp)
+ mov %r14, (%rsp)
mov R32(n_param), R32(n) C free original n register (rdx)
mov R32(n_param), R32(%rcx)
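The prologue now reserves exactly 5*8 = 40 bytes, one slot per callee-saved register, instead of 48; the old layout left one unused 8-byte slot, so the saves shift down by 8 and every early-exit path below shrinks its matching stack adjustment by the same amount.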
@@ -117,7 +111,7 @@ L(1): mov (up), %rax
mul %rax
mov %rax, (rp)
mov %rdx, 8(rp)
- add $40, %rsp
+ add $32, %rsp
pop %rbx
ret
@@ -141,7 +135,7 @@ L(2): mov (up), %rax
mov %r10, 16(rp)
adc $0, %r11
mov %r11, 24(rp)
- add $40, %rsp
+ add $32, %rsp
pop %rbx
ret
@@ -186,7 +180,7 @@ L(3): mov (up), %rax
adc %r10, 24(rp)
adc %r11, 32(rp)
adc %rbx, 40(rp)
- add $40, %rsp
+ add $32, %rsp
pop %rbx
ret
@@ -256,15 +250,15 @@ L(4): mov (up), %rax
adc %r12, 40(rp)
adc %rbp, 48(rp)
adc %rbx, 56(rp)
- add $24, %rsp
+ add $16, %rsp
pop %r12
pop %rbp
pop %rbx
ret
-L(0m4): add $-STACK_ALLOC, %rsp
- lea -24(%rsp,n,8), tp C point tp in middle of result operand
+L(0m4):
+ lea -16(rp,n,8), tp C point tp in middle of result operand
mov (up), v0
mov 8(up), %rax
lea (up,n,8), up C point up at end of input operand
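In the 0m4 case the temporary now lives inside the result operand: -16(rp,n,8) computes rp + 8n - 16, i.e. tp = rp + (n - 2) in limbs, where the old code used -24(%rsp,n,8) inside the freshly allocated scratch. The 1m4 and 3m4 cases below place tp at 8(rp,n,8) = rp + (n + 1) limbs; the offset still depends on n mod 4, exactly as it did on the stack.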
@@ -321,8 +315,8 @@ L(L3): xor R32(w1), R32(w1)
jmp L(dowhile)
-L(1m4): add $-STACK_ALLOC, %rsp
- lea (%rsp,n,8), tp C point tp in middle of result operand
+L(1m4):
+ lea 8(rp,n,8), tp C point tp in middle of result operand
mov (up), v0 C u0
mov 8(up), %rax C u1
lea 8(up,n,8), up C point up at end of input operand
@@ -336,7 +330,7 @@ C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1)
mul v0 C u0 * u1
mov %rdx, w1
xor R32(w2), R32(w2)
- mov %rax, (%rsp)
+ mov %rax, 8(rp)
jmp L(m0)
ALIGN(16)
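The low limb of u0 * u1, previously parked at the bottom of the scratch area, now goes to 8(rp), i.e. rp[1], both here and in the 3m4 case further down; the mpn_sqr_diag_addlsh1 code near the end fetches it from there (mov 8(rp), %r11) before the combine loop starts overwriting the low part of rp.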
@@ -399,8 +393,8 @@ L(m2x): mov (up,j,8), %rax
jmp L(dowhile_end)
-L(2m4): add $-STACK_ALLOC, %rsp
- lea -24(%rsp,n,8), tp C point tp in middle of result operand
+L(2m4):
+ lea -16(rp,n,8), tp C point tp in middle of result operand
mov (up), v0
mov 8(up), %rax
lea (up,n,8), up C point up at end of input operand
@@ -456,8 +450,8 @@ L(L1): xor R32(w0), R32(w0)
jmp L(dowhile_mid)
-L(3m4): add $-STACK_ALLOC, %rsp
- lea (%rsp,n,8), tp C point tp in middle of result operand
+L(3m4):
+ lea 8(rp,n,8), tp C point tp in middle of result operand
mov (up), v0 C u0
mov 8(up), %rax C u1
lea 8(up,n,8), up C point up at end of input operand
@@ -472,7 +466,7 @@ C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i)
mov %rdx, w3
xor R32(w0), R32(w0)
xor R32(w1), R32(w1)
- mov %rax, (%rsp)
+ mov %rax, 8(rp)
jmp L(m2)
ALIGN(16)
@@ -709,11 +703,9 @@ C Function mpn_addmul_2s_2
C Function mpn_sqr_diag_addlsh1
lea -4(n,n), j
- mov (%rsp), %r11
-
- lea (rp,j,8), rp
+ mov 8(rp), %r11
lea -8(up), up
- lea 8(%rsp,j,8), tp
+ lea (rp,j,8), rp
neg j
mov (up,j,4), %rax
mul %rax
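With tp gone, the diag_addlsh1 phase reads the doubled cross-product limbs from rp itself at small positive offsets: in the loop below, the loads at 16(rp,j,8) and 24(rp,j,8) (then 32 and 40) run two limbs ahead of the stores at (rp,j,8) and 8(rp,j,8), so each temporary limb is consumed just before its slot receives a final-result limb. This is the same read-just-ahead discipline as in the C sketch above.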
@@ -741,9 +733,9 @@ L(top): mov (up,j,4), %rax
adc %rdx, %r11
mov %r10, (rp,j,8)
L(d0): mov %r11, 8(rp,j,8)
- mov (tp,j,8), %r10
+ mov 16(rp,j,8), %r10
adc %r10, %r10
- mov 8(tp,j,8), %r11
+ mov 24(rp,j,8), %r11
adc %r11, %r11
nop
sbb R32(%rbp), R32(%rbp) C save CF
@@ -754,9 +746,9 @@ L(d0): mov %r11, 8(rp,j,8)
adc %rdx, %r11
mov %r10, 16(rp,j,8)
L(d1): mov %r11, 24(rp,j,8)
- mov 16(tp,j,8), %r10
+ mov 32(rp,j,8), %r10
adc %r10, %r10
- mov 24(tp,j,8), %r11
+ mov 40(rp,j,8), %r11
adc %r11, %r11
sbb R32(%rbx), R32(%rbx) C save CF
add $4, j
@@ -769,7 +761,7 @@ L(d1): mov %r11, 24(rp,j,8)
adc %rdx, %r11
mov %r10, (rp)
mov %r11, 8(rp)
- mov (tp), %r10
+ mov 16(rp), %r10
adc %r10, %r10
sbb R32(%rbp), R32(%rbp) C save CF
neg R32(%rbp)
@@ -781,7 +773,6 @@ L(d1): mov %r11, 24(rp,j,8)
mov %r10, 16(rp)
mov %rdx, 24(rp)
- add $eval(8+STACK_ALLOC), %rsp
pop %r14
pop %r13
pop %r12
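The matching epilogue simplification: the removed instruction released eval(8+STACK_ALLOC) = 1288 bytes, the scratch area plus the prologue's spare slot. With neither allocated any more, the callee-saved pops alone restore the stack pointer.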