From 8fec7463e2f655a6d996fa6300fb43b259d94901 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Mon, 24 Oct 2011 09:12:00 +0200 Subject: Put intermediate result into R, don't allocate any stack space. --- mpn/x86_64/sqr_basecase.asm | 71 ++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 40 deletions(-) diff --git a/mpn/x86_64/sqr_basecase.asm b/mpn/x86_64/sqr_basecase.asm index b84019458..93c666599 100644 --- a/mpn/x86_64/sqr_basecase.asm +++ b/mpn/x86_64/sqr_basecase.asm @@ -30,11 +30,11 @@ C large trip count. Instead, we should follow the generic/sqr_basecase.c C code which uses addmul_2s from the start, conditionally leaving a 1x1 C multiply to the end. (In assembly code, one would stop invoking C addmul_2s loops when perhaps 3x2s respectively a 2x2s remains.) -C * This code only handles operands up to SQR_TOOM2_THRESHOLD_MAX. That -C means we can safely use 32-bit operations for all sizes, unlike in e.g., -C mpn_addmul_1. +C * Align more labels, should shave off a few cycles. +C * We can safely use 32-bit size operations, since operands with (2^32) +C limbs will lead to non-termination in practice. C * The jump table could probably be optimized, at least for non-pic. -C * The special code for n=1,2,3 was quickly written. It is probably too +C * The special code for n <= 4 was quickly written. It is probably too C large and unnecessarily slow. C * Consider combining small cases code so that the n=k-1 code jumps into the C middle of the n=k code. @@ -62,12 +62,6 @@ define(`rp', `%rdi') define(`up', `%rsi') define(`n_param', `%rdx') -C We should really trim this, for better spatial locality. Alternatively, -C we could grab the upper part of the stack area, leaving the lower part -C instead of the upper part unused. -deflit(SQR_TOOM2_THRESHOLD_MAX, 80) -define(`STACK_ALLOC', eval(8*2*SQR_TOOM2_THRESHOLD_MAX)) - define(`n', `%r11') define(`tp', `%r12') define(`i', `%r8') @@ -85,12 +79,12 @@ ASM_START() ALIGN(16) PROLOGUE(mpn_sqr_basecase) - add $-48, %rsp - mov %rbx, 40(%rsp) - mov %rbp, 32(%rsp) - mov %r12, 24(%rsp) - mov %r13, 16(%rsp) - mov %r14, 8(%rsp) + add $-40, %rsp + mov %rbx, 32(%rsp) + mov %rbp, 24(%rsp) + mov %r12, 16(%rsp) + mov %r13, 8(%rsp) + mov %r14, (%rsp) mov R32(n_param), R32(n) C free original n register (rdx) mov R32(n_param), R32(%rcx) @@ -117,7 +111,7 @@ L(1): mov (up), %rax mul %rax mov %rax, (rp) mov %rdx, 8(rp) - add $40, %rsp + add $32, %rsp pop %rbx ret @@ -141,7 +135,7 @@ L(2): mov (up), %rax mov %r10, 16(rp) adc $0, %r11 mov %r11, 24(rp) - add $40, %rsp + add $32, %rsp pop %rbx ret @@ -186,7 +180,7 @@ L(3): mov (up), %rax adc %r10, 24(rp) adc %r11, 32(rp) adc %rbx, 40(rp) - add $40, %rsp + add $32, %rsp pop %rbx ret @@ -256,15 +250,15 @@ L(4): mov (up), %rax adc %r12, 40(rp) adc %rbp, 48(rp) adc %rbx, 56(rp) - add $24, %rsp + add $16, %rsp pop %r12 pop %rbp pop %rbx ret -L(0m4): add $-STACK_ALLOC, %rsp - lea -24(%rsp,n,8), tp C point tp in middle of result operand +L(0m4): + lea -16(rp,n,8), tp C point tp in middle of result operand mov (up), v0 mov 8(up), %rax lea (up,n,8), up C point up at end of input operand @@ -321,8 +315,8 @@ L(L3): xor R32(w1), R32(w1) jmp L(dowhile) -L(1m4): add $-STACK_ALLOC, %rsp - lea (%rsp,n,8), tp C point tp in middle of result operand +L(1m4): + lea 8(rp,n,8), tp C point tp in middle of result operand mov (up), v0 C u0 mov 8(up), %rax C u1 lea 8(up,n,8), up C point up at end of input operand @@ -336,7 +330,7 @@ C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1) mul v0 C u0 * u1 mov %rdx, w1 xor R32(w2), R32(w2) - mov %rax, (%rsp) + mov %rax, 8(rp) jmp L(m0) ALIGN(16) @@ -399,8 +393,8 @@ L(m2x): mov (up,j,8), %rax jmp L(dowhile_end) -L(2m4): add $-STACK_ALLOC, %rsp - lea -24(%rsp,n,8), tp C point tp in middle of result operand +L(2m4): + lea -16(rp,n,8), tp C point tp in middle of result operand mov (up), v0 mov 8(up), %rax lea (up,n,8), up C point up at end of input operand @@ -456,8 +450,8 @@ L(L1): xor R32(w0), R32(w0) jmp L(dowhile_mid) -L(3m4): add $-STACK_ALLOC, %rsp - lea (%rsp,n,8), tp C point tp in middle of result operand +L(3m4): + lea 8(rp,n,8), tp C point tp in middle of result operand mov (up), v0 C u0 mov 8(up), %rax C u1 lea 8(up,n,8), up C point up at end of input operand @@ -472,7 +466,7 @@ C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i) mov %rdx, w3 xor R32(w0), R32(w0) xor R32(w1), R32(w1) - mov %rax, (%rsp) + mov %rax, 8(rp) jmp L(m2) ALIGN(16) @@ -709,11 +703,9 @@ C Function mpn_addmul_2s_2 C Function mpn_sqr_diag_addlsh1 lea -4(n,n), j - mov (%rsp), %r11 - - lea (rp,j,8), rp + mov 8(rp), %r11 lea -8(up), up - lea 8(%rsp,j,8), tp + lea (rp,j,8), rp neg j mov (up,j,4), %rax mul %rax @@ -741,9 +733,9 @@ L(top): mov (up,j,4), %rax adc %rdx, %r11 mov %r10, (rp,j,8) L(d0): mov %r11, 8(rp,j,8) - mov (tp,j,8), %r10 + mov 16(rp,j,8), %r10 adc %r10, %r10 - mov 8(tp,j,8), %r11 + mov 24(rp,j,8), %r11 adc %r11, %r11 nop sbb R32(%rbp), R32(%rbp) C save CF @@ -754,9 +746,9 @@ L(d0): mov %r11, 8(rp,j,8) adc %rdx, %r11 mov %r10, 16(rp,j,8) L(d1): mov %r11, 24(rp,j,8) - mov 16(tp,j,8), %r10 + mov 32(rp,j,8), %r10 adc %r10, %r10 - mov 24(tp,j,8), %r11 + mov 40(rp,j,8), %r11 adc %r11, %r11 sbb R32(%rbx), R32(%rbx) C save CF add $4, j @@ -769,7 +761,7 @@ L(d1): mov %r11, 24(rp,j,8) adc %rdx, %r11 mov %r10, (rp) mov %r11, 8(rp) - mov (tp), %r10 + mov 16(rp), %r10 adc %r10, %r10 sbb R32(%rbp), R32(%rbp) C save CF neg R32(%rbp) @@ -781,7 +773,6 @@ L(d1): mov %r11, 24(rp,j,8) mov %r10, 16(rp) mov %rdx, 24(rp) - add $eval(8+STACK_ALLOC), %rsp pop %r14 pop %r13 pop %r12 -- cgit v1.2.1