From 26c75cf8267919f81a1759c9c965a52c660233f9 Mon Sep 17 00:00:00 2001
From: Pedro Alvarez
Date: Fri, 27 May 2016 17:39:31 +0100
Subject: go to gmp 4.3.2

---
 gmp/mpn/x86_64/addmul_2.asm | 175 ++++++++++++++++++++------------------------
 1 file changed, 79 insertions(+), 96 deletions(-)

(limited to 'gmp/mpn/x86_64/addmul_2.asm')

diff --git a/gmp/mpn/x86_64/addmul_2.asm b/gmp/mpn/x86_64/addmul_2.asm
index 18307d719f..8f133c3b00 100644
--- a/gmp/mpn/x86_64/addmul_2.asm
+++ b/gmp/mpn/x86_64/addmul_2.asm
@@ -1,51 +1,39 @@
 dnl  AMD64 mpn_addmul_2 -- Multiply an n-limb vector with a 2-limb vector and
 dnl  add the result to a third limb vector.
 
-dnl  Copyright 2008, 2011, 2012 Free Software Foundation, Inc.
+dnl  Copyright 2008 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
-dnl
+
 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
 C            cycles/limb
-C AMD K8,K9     2.375
-C AMD K10       2.375
-C Intel P4     15-16
-C Intel core2   4.45
-C Intel NHM     4.32
-C Intel SBR     3.4
-C Intel atom    ?
-C VIA nano      4.4
+C K8,K9:     2.375
+C K10:       2.375
+C P4:        ?
+C P6-15:     4.45
 
 C This code is the result of running a code generation and optimization tool
 C suite written by David Harvey and Torbjorn Granlund.
 
 C TODO
-C  * Tune feed-in and wind-down code.
+C  * Work on feed-in and wind-down code.
+C  * Convert "mov $0" to "xor".
+C  * Adjust initial lea to save some bytes.
+C  * Perhaps adjust n from n_param&3 value?
 
 C INPUT PARAMETERS
 define(`rp', `%rdi')
@@ -61,124 +49,119 @@
 define(`w2', `%rbp')
 define(`w3', `%r10')
 define(`n', `%r11')
 
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_addmul_2)
-	FUNC_ENTRY(4)
-	mov	n_param, n
 	push	%rbx
 	push	%rbp
 
-	mov	0(vp), v0
+	mov	(vp), v0
 	mov	8(vp), v1
 
-	mov	R32(n_param), R32(%rbx)
-	mov	(up), %rax
-	lea	-8(up,n_param,8), up
-	lea	-8(rp,n_param,8), rp
-	mul	v0
+	mov	n_param, n
 	neg	n
-	and	$3, R32(%rbx)
-	jz	L(b0)
-	cmp	$2, R32(%rbx)
-	jc	L(b1)
-	jz	L(b2)
-
-L(b3):	mov	%rax, w1
+	lea	-32(up,n_param,8), up
+	lea	-32(rp,n_param,8), rp
+
+	and	$3, R32(n_param)
+	jz	L(am2p0)
+	cmp	$2, R32(n_param)
+	jc	L(am2p1)
+	jz	L(am2p2)
+L(am2p3):
+	mov	32(up,n,8), %rax
+	mul	v0
+	mov	%rax, w1
+	mov	32(up,n,8), %rax
 	mov	%rdx, w2
 	xor	R32(w3), R32(w3)
-	mov	8(up,n,8), %rax
-	dec	n
-	jmp	L(lo3)
-
-L(b2):	mov	%rax, w2
-	mov	8(up,n,8), %rax
-	mov	%rdx, w3
-	xor	R32(w0), R32(w0)
-	add	$-2, n
-	jmp	L(lo2)
-
-L(b1):	mov	%rax, w3
-	mov	8(up,n,8), %rax
-	mov	%rdx, w0
-	xor	R32(w1), R32(w1)
-	inc	n
-	jmp	L(lo1)
-
-L(b0):	mov	$0, R32(w3)
+	add	$2, n
+	jmp	L(am3)
+L(am2p0):
+	mov	32(up,n,8), %rax
+	mul	v0
 	mov	%rax, w0
-	mov	8(up,n,8), %rax
+	mov	32(up,n,8), %rax
 	mov	%rdx, w1
 	xor	R32(w2), R32(w2)
-	jmp	L(lo0)
+	add	$3, n
+	jmp	L(am0)
+L(am2p1):
+	mov	32(up,n,8), %rax
+	mul	v0
+	mov	%rax, w3
+	mov	32(up,n,8), %rax
+	mov	%rdx, w0
+	xor	R32(w1), R32(w1)
+	jmp	L(am1)
+L(am2p2):
+	mov	32(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	32(up,n,8), %rax
+	mov	%rdx, w3
+	xor	R32(w0), R32(w0)
+	xor	R32(w1), R32(w1)
+	add	$1, n
+	jmp	L(am2)
 
 	ALIGN(32)
-L(top):	mov	$0, R32(w1)
-	mul	v0
-	add	%rax, w3
-	mov	(up,n,8), %rax
-	adc	%rdx, w0
-	adc	$0, R32(w1)
-L(lo1):	mul	v1
+L(top):
 	add	w3, (rp,n,8)
-	mov	$0, R32(w3)
 	adc	%rax, w0
-	mov	$0, R32(w2)
 	mov	8(up,n,8), %rax
 	adc	%rdx, w1
+	mov	$0, R32(w2)
 	mul	v0
 	add	%rax, w0
 	mov	8(up,n,8), %rax
 	adc	%rdx, w1
 	adc	$0, R32(w2)
-L(lo0):	mul	v1
+L(am0):	mul	v1
 	add	w0, 8(rp,n,8)
 	adc	%rax, w1
 	adc	%rdx, w2
 	mov	16(up,n,8), %rax
+	mov	$0, R32(w3)
 	mul	v0
 	add	%rax, w1
+	mov	16(up,n,8), %rax
 	adc	%rdx, w2
 	adc	$0, R32(w3)
-	mov	16(up,n,8), %rax
-L(lo3):	mul	v1
+L(am3):	mul	v1
 	add	w1, 16(rp,n,8)
 	adc	%rax, w2
-	adc	%rdx, w3
-	xor	R32(w0), R32(w0)
 	mov	24(up,n,8), %rax
+	adc	%rdx, w3
 	mul	v0
+	mov	$0, R32(w0)
 	add	%rax, w2
-	mov	24(up,n,8), %rax
 	adc	%rdx, w3
+	mov	$0, R32(w1)
+	mov	24(up,n,8), %rax
 	adc	$0, R32(w0)
-L(lo2):	mul	v1
+L(am2):	mul	v1
 	add	w2, 24(rp,n,8)
 	adc	%rax, w3
 	adc	%rdx, w0
 	mov	32(up,n,8), %rax
-	add	$4, n
-	js	L(top)
-
-L(end):	xor	R32(w1), R32(w1)
 	mul	v0
 	add	%rax, w3
-	mov	(up), %rax
+	mov	32(up,n,8), %rax
 	adc	%rdx, w0
-	adc	R32(w1), R32(w1)
-	mul	v1
-	add	w3, (rp)
+	adc	$0, R32(w1)
+L(am1):	mul	v1
+	add	$4, n
+	js	L(top)
+
+	add	w3, (rp,n,8)
 	adc	%rax, w0
 	adc	%rdx, w1
-	mov	w0, 8(rp)
+	mov	w0, 8(rp,n,8)
 	mov	w1, %rax
 	pop	%rbp
 	pop	%rbx
-	FUNC_EXIT()
 	ret
 EPILOGUE()
--
cgit v1.2.1
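Note on semantics: both the removed and the restored code implement the same operation.
mpn_addmul_2 computes {rp, n+1} += {up, n} * {vp, 2} and returns the most significant
(carry) limb; the patch only changes the feed-in dispatch (old L(b0..3) vs. 4.3.2's
L(am2p0..3), selected by n mod 4) and the scheduling of the 4-way unrolled loop. The
following portable C sketch of those semantics can serve as a test oracle against the
assembly. It is not GMP's implementation: the limb_t typedef, the ref_* names, and the
use of the GCC/Clang unsigned __int128 extension are assumptions of this sketch.

#include <stdint.h>

typedef uint64_t limb_t;                 /* assumes 64-bit limbs, as on AMD64 */

/* {rp, n} += {up, n} * v; returns the carry limb.  The 128-bit accumulation
   cannot overflow: (2^64-1)^2 + 2*(2^64-1) < 2^128. */
static limb_t ref_addmul_1(limb_t *rp, const limb_t *up, long n, limb_t v)
{
    limb_t cy = 0;
    for (long i = 0; i < n; i++) {
        unsigned __int128 t = (unsigned __int128)up[i] * v + rp[i] + cy;
        rp[i] = (limb_t)t;
        cy = (limb_t)(t >> 64);
    }
    return cy;
}

/* {rp, n+1} += {up, n} * {vp, 2}; returns the high limb.  Requires n >= 1.
   The assembly fuses the two passes below into one 4-way unrolled loop. */
limb_t ref_addmul_2(limb_t *rp, const limb_t *up, long n, const limb_t *vp)
{
    limb_t c0 = ref_addmul_1(rp, up, n, vp[0]);          /* v0 pass, weight 0 */
    limb_t c1 = ref_addmul_1(rp + 1, up, n, vp[1]);      /* v1 pass, weight 1 */
    unsigned __int128 s = (unsigned __int128)rp[n] + c0; /* fold v0's carry in */
    rp[n] = (limb_t)s;
    return c1 + (limb_t)(s >> 64);   /* mathematically fits in a single limb */
}

Quick check: n = 1, up = {3}, vp = {5, 7}, rp = {1, 0} yields rp = {16, 21} with
return value 0, matching 1 + 3*(5 + 7*2^64) = 16 + 21*2^64.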