diff options
Diffstat (limited to 'gmp/mpn/x86_64/coreihwl')
-rw-r--r-- | gmp/mpn/x86_64/coreihwl/addmul_2.asm | 238 | ||||
-rw-r--r-- | gmp/mpn/x86_64/coreihwl/aorsmul_1.asm | 198 | ||||
-rw-r--r-- | gmp/mpn/x86_64/coreihwl/gmp-mparam.h | 237 | ||||
-rw-r--r-- | gmp/mpn/x86_64/coreihwl/mul_1.asm | 155 | ||||
-rw-r--r-- | gmp/mpn/x86_64/coreihwl/mul_2.asm | 173 | ||||
-rw-r--r-- | gmp/mpn/x86_64/coreihwl/mul_basecase.asm | 441 | ||||
-rw-r--r-- | gmp/mpn/x86_64/coreihwl/mullo_basecase.asm | 426 | ||||
-rw-r--r-- | gmp/mpn/x86_64/coreihwl/redc_1.asm | 433 | ||||
-rw-r--r-- | gmp/mpn/x86_64/coreihwl/sqr_basecase.asm | 506 |
9 files changed, 0 insertions, 2807 deletions
diff --git a/gmp/mpn/x86_64/coreihwl/addmul_2.asm b/gmp/mpn/x86_64/coreihwl/addmul_2.asm deleted file mode 100644 index 54aebc888d..0000000000 --- a/gmp/mpn/x86_64/coreihwl/addmul_2.asm +++ /dev/null @@ -1,238 +0,0 @@ -dnl AMD64 mpn_addmul_2 optimised for Intel Haswell. - -dnl Contributed to the GNU project by Torbjörn Granlund. - -dnl Copyright 2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb -C AMD K8,K9 n/a -C AMD K10 n/a -C AMD bull n/a -C AMD pile n/a -C AMD steam ? -C AMD bobcat n/a -C AMD jaguar ? -C Intel P4 n/a -C Intel core n/a -C Intel NHM n/a -C Intel SBR n/a -C Intel IBR n/a -C Intel HWL 2.15 -C Intel BWL ? -C Intel atom n/a -C VIA nano n/a - -C The loop of this code is the result of running a code generation and -C optimisation tool suite written by David Harvey and Torbjörn Granlund. - -define(`rp', `%rdi') -define(`up', `%rsi') -define(`n_param',`%rdx') -define(`vp', `%rcx') - -define(`v0', `%r8') -define(`v1', `%r9') -define(`w0', `%rbx') -define(`w1', `%rcx') -define(`w2', `%rbp') -define(`w3', `%r10') -define(`n', `%r11') -define(`X0', `%r12') -define(`X1', `%r13') - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -ASM_START() - TEXT - ALIGN(32) -PROLOGUE(mpn_addmul_2) - FUNC_ENTRY(4) - push %rbx - push %rbp - push %r12 - push %r13 - - mov (vp), v0 - mov 8(vp), v1 - - mov n_param, n - shr $2, n - - test $1, R8(n_param) - jnz L(bx1) - -L(bx0): mov (rp), X0 - mov 8(rp), X1 - test $2, R8(n_param) - jnz L(b10) - -L(b00): mov (up), %rdx - lea 16(up), up - mulx( v0, %rax, w1) - add %rax, X0 - mulx( v1, %rax, w2) - adc $0, w1 - mov X0, (rp) - add %rax, X1 - adc $0, w2 - mov -8(up), %rdx - lea 16(rp), rp - jmp L(lo0) - -L(b10): mov (up), %rdx - inc n - mulx( v0, %rax, w1) - add %rax, X0 - adc $0, w1 - mulx( v1, %rax, w2) - mov X0, (rp) - mov 16(rp), X0 - add %rax, X1 - adc $0, w2 - xor w0, w0 - jmp L(lo2) - -L(bx1): mov (rp), X1 - mov 8(rp), X0 - test $2, R8(n_param) - jnz L(b11) - -L(b01): mov (up), %rdx - mulx( v0, %rax, w3) - add %rax, X1 - adc $0, w3 - mulx( v1, %rax, w0) - add %rax, X0 - adc $0, w0 - mov 8(up), %rdx - mov X1, (rp) - mov 16(rp), X1 - mulx( v0, %rax, w1) - lea 24(rp), rp - lea 24(up), up - jmp L(lo1) - -L(b11): mov (up), %rdx - inc n - mulx( v0, %rax, w3) - add %rax, X1 - adc $0, w3 - mulx( v1, %rax, w0) - add %rax, X0 - adc $0, w0 - mov X1, (rp) - mov 8(up), %rdx - mulx( v0, %rax, w1) - lea 8(rp), rp - lea 8(up), up - jmp L(lo3) - - ALIGN(16) -L(top): mulx( v0, %rax, w3) - add w0, X1 - adc $0, w2 - add %rax, X1 - adc $0, w3 - mulx( v1, %rax, w0) - add %rax, X0 - adc $0, w0 - lea 32(rp), rp - add w1, X1 - mov -16(up), %rdx - mov X1, -24(rp) - adc $0, w3 - add w2, X0 - mov -8(rp), X1 - mulx( v0, %rax, w1) - adc $0, w0 -L(lo1): add %rax, X0 - mulx( v1, %rax, w2) - adc $0, w1 - add w3, X0 - mov X0, -16(rp) - adc $0, w1 - add %rax, X1 - adc $0, w2 - add w0, X1 - mov -8(up), %rdx - adc $0, w2 -L(lo0): mulx( v0, %rax, w3) - add %rax, X1 - adc $0, w3 - mov (rp), X0 - mulx( v1, %rax, w0) - add %rax, X0 - adc $0, w0 - add w1, X1 - mov X1, -8(rp) - adc $0, w3 - mov (up), %rdx - add w2, X0 - mulx( v0, %rax, w1) - adc $0, w0 -L(lo3): add %rax, X0 - adc $0, w1 - mulx( v1, %rax, w2) - add w3, X0 - mov 8(rp), X1 - mov X0, (rp) - mov 16(rp), X0 - adc $0, w1 - add %rax, X1 - adc $0, w2 -L(lo2): mov 8(up), %rdx - lea 32(up), up - dec n - jnz L(top) - -L(end): mulx( v0, %rax, w3) - add w0, X1 - adc $0, w2 - add %rax, X1 - adc $0, w3 - mulx( v1, %rdx, %rax) - add w1, X1 - mov X1, 8(rp) - adc $0, w3 - add w2, %rdx - adc $0, %rax - add w3, %rdx - mov %rdx, 16(rp) - adc $0, %rax - - pop %r13 - pop %r12 - pop %rbp - pop %rbx - FUNC_EXIT() - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/coreihwl/aorsmul_1.asm b/gmp/mpn/x86_64/coreihwl/aorsmul_1.asm deleted file mode 100644 index fd5a26d00f..0000000000 --- a/gmp/mpn/x86_64/coreihwl/aorsmul_1.asm +++ /dev/null @@ -1,198 +0,0 @@ -dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Haswell. - -dnl Contributed to the GNU project by Torbjörn Granlund. - -dnl Copyright 2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb -C AMD K8,K9 n/a -C AMD K10 n/a -C AMD bull n/a -C AMD pile n/a -C AMD steam ? -C AMD bobcat n/a -C AMD jaguar ? -C Intel P4 n/a -C Intel core n/a -C Intel NHM n/a -C Intel SBR n/a -C Intel IBR n/a -C Intel HWL 2.32 -C Intel BWL ? -C Intel atom n/a -C VIA nano n/a - -C The loop of this code is the result of running a code generation and -C optimisation tool suite written by David Harvey and Torbjörn Granlund. - -C TODO -C * Handle small n separately, for lower overhead. - -define(`rp', `%rdi') C rcx -define(`up', `%rsi') C rdx -define(`n_param', `%rdx') C r8 -define(`v0_param',`%rcx') C r9 - -define(`n', `%rbp') -define(`v0', `%rdx') - -ifdef(`OPERATION_addmul_1',` - define(`ADDSUB', `add') - define(`ADCSBB', `adc') - define(`func', `mpn_addmul_1') -') -ifdef(`OPERATION_submul_1',` - define(`ADDSUB', `sub') - define(`ADCSBB', `sbb') - define(`func', `mpn_submul_1') -') - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) - -ASM_START() - TEXT - ALIGN(16) -PROLOGUE(func) - FUNC_ENTRY(4) - push %rbx - push %rbp - push %r12 - push %r13 - - mov n_param, n - mov v0_param, v0 - - test $1, R8(n) - jnz L(bx1) - -L(bx0): shr $2, n - jc L(b10) - -L(b00): mulx( (up), %r13, %r12) - mulx( 8,(up), %rbx, %rax) - add %r12, %rbx - adc $0, %rax - mov (rp), %r12 - mov 8(rp), %rcx - mulx( 16,(up), %r9, %r8) - lea -16(rp), rp - lea 16(up), up - ADDSUB %r13, %r12 - jmp L(lo0) - -L(bx1): shr $2, n - jc L(b11) - -L(b01): mulx( (up), %r11, %r10) - jnz L(gt1) -L(n1): ADDSUB %r11, (rp) - mov $0, R32(%rax) - adc %r10, %rax - jmp L(ret) - -L(gt1): mulx( 8,(up), %r13, %r12) - mulx( 16,(up), %rbx, %rax) - lea 24(up), up - add %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov (rp), %r10 - mov 8(rp), %r12 - mov 16(rp), %rcx - lea -8(rp), rp - ADDSUB %r11, %r10 - jmp L(lo1) - -L(b11): mulx( (up), %rbx, %rax) - mov (rp), %rcx - mulx( 8,(up), %r9, %r8) - lea 8(up), up - lea -24(rp), rp - inc n C adjust n - ADDSUB %rbx, %rcx - jmp L(lo3) - -L(b10): mulx( (up), %r9, %r8) - mulx( 8,(up), %r11, %r10) - lea -32(rp), rp - mov $0, R32(%rax) - clc C clear cf - jz L(end) C depends on old shift - - ALIGN(16) -L(top): adc %rax, %r9 - lea 32(rp), rp - adc %r8, %r11 - mulx( 16,(up), %r13, %r12) - mov (rp), %r8 - mulx( 24,(up), %rbx, %rax) - lea 32(up), up - adc %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov 8(rp), %r10 - mov 16(rp), %r12 - ADDSUB %r9, %r8 - mov 24(rp), %rcx - mov %r8, (rp) - ADCSBB %r11, %r10 -L(lo1): mulx( (up), %r9, %r8) - mov %r10, 8(rp) - ADCSBB %r13, %r12 -L(lo0): mov %r12, 16(rp) - ADCSBB %rbx, %rcx -L(lo3): mulx( 8,(up), %r11, %r10) - mov %rcx, 24(rp) - dec n - jnz L(top) - -L(end): adc %rax, %r9 - adc %r8, %r11 - mov 32(rp), %r8 - mov %r10, %rax - adc $0, %rax - mov 40(rp), %r10 - ADDSUB %r9, %r8 - mov %r8, 32(rp) - ADCSBB %r11, %r10 - mov %r10, 40(rp) - adc $0, %rax - -L(ret): pop %r13 - pop %r12 - pop %rbp - pop %rbx - FUNC_EXIT() - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/coreihwl/gmp-mparam.h b/gmp/mpn/x86_64/coreihwl/gmp-mparam.h deleted file mode 100644 index eef44b3a81..0000000000 --- a/gmp/mpn/x86_64/coreihwl/gmp-mparam.h +++ /dev/null @@ -1,237 +0,0 @@ -/* Haswell gmp-mparam.h -- Compiler/machine parameter header file. - -Copyright 1991, 1993, 1994, 2000-2014 Free Software Foundation, Inc. - -This file is part of the GNU MP Library. - -The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of either: - - * the GNU Lesser General Public License as published by the Free - Software Foundation; either version 3 of the License, or (at your - option) any later version. - -or - - * the GNU General Public License as published by the Free Software - Foundation; either version 2 of the License, or (at your option) any - later version. - -or both in parallel, as here. - -The GNU MP Library is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. - -You should have received copies of the GNU General Public License and the -GNU Lesser General Public License along with the GNU MP Library. If not, -see https://www.gnu.org/licenses/. */ - -#define GMP_LIMB_BITS 64 -#define GMP_LIMB_BYTES 8 - -/* 2900 MHz Core i5 Haswell */ -/* FFT tuning limit = 75000000 */ -/* Generated by tuneup.c, 2014-03-12, gcc 4.5 */ - -#define MOD_1_NORM_THRESHOLD 0 /* always */ -#define MOD_1_UNNORM_THRESHOLD 0 /* always */ -#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 -#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 -#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10 -#define MOD_1_2_TO_MOD_1_4_THRESHOLD 26 -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10 -#define USE_PREINV_DIVREM_1 1 /* native */ -#define DIV_QR_1_NORM_THRESHOLD 1 -#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ -#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ -#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ -#define BMOD_1_TO_MOD_1_THRESHOLD 25 - -#define MUL_TOOM22_THRESHOLD 22 -#define MUL_TOOM33_THRESHOLD 74 -#define MUL_TOOM44_THRESHOLD 195 -#define MUL_TOOM6H_THRESHOLD 298 -#define MUL_TOOM8H_THRESHOLD 406 - -#define MUL_TOOM32_TO_TOOM43_THRESHOLD 121 -#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138 -#define MUL_TOOM42_TO_TOOM53_THRESHOLD 128 -#define MUL_TOOM42_TO_TOOM63_THRESHOLD 132 -#define MUL_TOOM43_TO_TOOM54_THRESHOLD 170 - -#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ -#define SQR_TOOM2_THRESHOLD 34 -#define SQR_TOOM3_THRESHOLD 117 -#define SQR_TOOM4_THRESHOLD 336 -#define SQR_TOOM6_THRESHOLD 426 -#define SQR_TOOM8_THRESHOLD 562 - -#define MULMID_TOOM42_THRESHOLD 42 - -#define MULMOD_BNM1_THRESHOLD 13 -#define SQRMOD_BNM1_THRESHOLD 17 - -#define MUL_FFT_MODF_THRESHOLD 376 /* k = 5 */ -#define MUL_FFT_TABLE3 \ - { { 376, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ - { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ - { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \ - { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \ - { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ - { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ - { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ - { 15, 9}, { 39,10}, { 23, 9}, { 55,11}, \ - { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ - { 83,10}, { 47, 9}, { 95,10}, { 55,11}, \ - { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ - { 31,11}, { 63,10}, { 127, 9}, { 255,10}, \ - { 135,11}, { 79,10}, { 159, 9}, { 319,10}, \ - { 167,11}, { 95,10}, { 191, 9}, { 383,11}, \ - { 111,12}, { 63,11}, { 127,10}, { 255, 9}, \ - { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \ - { 287, 9}, { 575,10}, { 303, 9}, { 607,11}, \ - { 159,10}, { 319, 9}, { 639,12}, { 95,11}, \ - { 191,10}, { 383,11}, { 207,10}, { 415,13}, \ - { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ - { 271,10}, { 543, 9}, { 1087,11}, { 287,10}, \ - { 607,12}, { 159,11}, { 319,10}, { 639,11}, \ - { 335,10}, { 671,11}, { 351,10}, { 703,11}, \ - { 367,12}, { 191,11}, { 383,10}, { 767,11}, \ - { 415,10}, { 831,12}, { 223,11}, { 447,10}, \ - { 895,11}, { 479,13}, { 127,12}, { 255,11}, \ - { 511,10}, { 1023,11}, { 543,10}, { 1087,12}, \ - { 287,11}, { 575,10}, { 1151,11}, { 607,12}, \ - { 319,11}, { 639,10}, { 1279,11}, { 671,12}, \ - { 351,11}, { 703,10}, { 1407,11}, { 735,13}, \ - { 191,12}, { 383,11}, { 767,12}, { 415,11}, \ - { 831,10}, { 1663,12}, { 447,11}, { 895,12}, \ - { 479,14}, { 127,12}, { 511,11}, { 1023,12}, \ - { 543,11}, { 1087,12}, { 575,11}, { 1151,12}, \ - { 607,11}, { 1215,13}, { 319,12}, { 671,11}, \ - { 1343,12}, { 703,11}, { 1407,12}, { 735,13}, \ - { 383,12}, { 767,11}, { 1535,12}, { 831,13}, \ - { 447,12}, { 959,11}, { 1919,13}, { 511,12}, \ - { 1087,13}, { 575,12}, { 1215,13}, { 639,12}, \ - { 1343,13}, { 703,12}, { 1407,11}, { 2815,14}, \ - { 383,13}, { 767,12}, { 1535,13}, { 831,12}, \ - { 1727,13}, { 959,12}, { 1919,14}, { 511,13}, \ - { 1023,12}, { 2047,13}, { 1087,12}, { 2175,13}, \ - { 1215,12}, { 2431,14}, { 639,13}, { 1279,12}, \ - { 2559,13}, { 1343,12}, { 2687,13}, { 1407,12}, \ - { 2815,13}, { 1471,12}, { 2943,14}, { 767,13}, \ - { 1535,12}, { 3071,13}, { 1727,14}, { 895,13}, \ - { 1791,12}, { 3583,13}, { 1919,15}, { 511,14}, \ - { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \ - { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ - { 2943,15}, { 767,14}, { 1535,13}, { 3199,14}, \ - { 1663,13}, { 3455,12}, { 6911,14}, { 1791,13}, \ - { 3583,14}, { 1919,16}, { 511,15}, { 1023,14}, \ - { 2175,13}, { 4351,14}, { 2431,13}, { 4863,15}, \ - { 1279,14}, { 2943,13}, { 5887,15}, { 1535,14}, \ - { 3455,13}, { 6911,15}, { 1791,14}, { 3839,13}, \ - { 7679,16}, { 1023,15}, { 2047,14}, { 4351,15}, \ - { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ - { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ - {8388608,24} } -#define MUL_FFT_TABLE3_SIZE 237 -#define MUL_FFT_THRESHOLD 4224 - -#define SQR_FFT_MODF_THRESHOLD 344 /* k = 5 */ -#define SQR_FFT_TABLE3 \ - { { 344, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ - { 10, 5}, { 21, 6}, { 21, 7}, { 11, 6}, \ - { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \ - { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \ - { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ - { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ - { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ - { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ - { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ - { 79,10}, { 55,11}, { 31,10}, { 79,11}, \ - { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ - { 127, 9}, { 255, 8}, { 511,10}, { 135,11}, \ - { 79,10}, { 159, 9}, { 319,11}, { 95,10}, \ - { 191, 9}, { 383,11}, { 111,12}, { 63,11}, \ - { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ - { 543,11}, { 143,10}, { 287, 9}, { 575,10}, \ - { 303, 9}, { 607,11}, { 159,10}, { 319, 9}, \ - { 639,12}, { 95,11}, { 191,10}, { 383, 9}, \ - { 767,11}, { 207,10}, { 415,13}, { 63,12}, \ - { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ - { 543, 9}, { 1087,10}, { 575,11}, { 303,10}, \ - { 607,11}, { 319,10}, { 671,11}, { 351,10}, \ - { 735,11}, { 383,10}, { 767,11}, { 415,10}, \ - { 831,11}, { 447,10}, { 895,11}, { 479,13}, \ - { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \ - { 607,10}, { 1215,11}, { 671,12}, { 351,11}, \ - { 735,12}, { 383,11}, { 767,12}, { 415,11}, \ - { 831,10}, { 1663,12}, { 447,11}, { 895,12}, \ - { 479,14}, { 127,12}, { 511,11}, { 1023,12}, \ - { 543,11}, { 1087,12}, { 607,11}, { 1215,13}, \ - { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \ - { 1343,12}, { 735,13}, { 383,12}, { 767,11}, \ - { 1535,12}, { 831,13}, { 447,12}, { 959,13}, \ - { 511,12}, { 1087,13}, { 575,12}, { 1215,13}, \ - { 639,12}, { 1343,13}, { 703,12}, { 1407,14}, \ - { 383,13}, { 767,12}, { 1535,13}, { 831,12}, \ - { 1663,13}, { 959,14}, { 511,13}, { 1087,12}, \ - { 2175,13}, { 1215,12}, { 2431,14}, { 639,13}, \ - { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \ - { 1471,14}, { 767,13}, { 1599,12}, { 3199,13}, \ - { 1663,14}, { 895,13}, { 1791,12}, { 3583,15}, \ - { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ - { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \ - { 1407,13}, { 2815,15}, { 767,14}, { 1535,13}, \ - { 3199,14}, { 1663,13}, { 3455,12}, { 6911,14}, \ - { 1791,13}, { 3583,16}, { 511,15}, { 1023,14}, \ - { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \ - { 5887,15}, { 1535,14}, { 3455,13}, { 6911,15}, \ - { 1791,14}, { 3839,16}, { 1023,15}, { 2047,14}, \ - { 4223,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ - { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ - {4194304,23}, {8388608,24} } -#define SQR_FFT_TABLE3_SIZE 206 -#define SQR_FFT_THRESHOLD 3712 - -#define MULLO_BASECASE_THRESHOLD 0 /* always */ -#define MULLO_DC_THRESHOLD 78 -#define MULLO_MUL_N_THRESHOLD 8207 - -#define DC_DIV_QR_THRESHOLD 63 -#define DC_DIVAPPR_Q_THRESHOLD 195 -#define DC_BDIV_QR_THRESHOLD 56 -#define DC_BDIV_Q_THRESHOLD 128 - -#define INV_MULMOD_BNM1_THRESHOLD 42 -#define INV_NEWTON_THRESHOLD 199 -#define INV_APPR_THRESHOLD 181 - -#define BINV_NEWTON_THRESHOLD 236 -#define REDC_1_TO_REDC_2_THRESHOLD 47 -#define REDC_2_TO_REDC_N_THRESHOLD 62 - -#define MU_DIV_QR_THRESHOLD 1470 -#define MU_DIVAPPR_Q_THRESHOLD 1589 -#define MUPI_DIV_QR_THRESHOLD 78 -#define MU_BDIV_QR_THRESHOLD 1442 -#define MU_BDIV_Q_THRESHOLD 1470 - -#define POWM_SEC_TABLE 3,22,194,257,1099 - -#define MATRIX22_STRASSEN_THRESHOLD 17 -#define HGCD_THRESHOLD 112 -#define HGCD_APPR_THRESHOLD 52 -#define HGCD_REDUCE_THRESHOLD 2681 -#define GCD_DC_THRESHOLD 807 -#define GCDEXT_DC_THRESHOLD 416 -#define JACOBI_BASE_METHOD 4 - -#define GET_STR_DC_THRESHOLD 12 -#define GET_STR_PRECOMPUTE_THRESHOLD 21 -#define SET_STR_DC_THRESHOLD 1326 -#define SET_STR_PRECOMPUTE_THRESHOLD 2627 - -#define FAC_DSC_THRESHOLD 767 -#define FAC_ODD_THRESHOLD 0 /* always */ diff --git a/gmp/mpn/x86_64/coreihwl/mul_1.asm b/gmp/mpn/x86_64/coreihwl/mul_1.asm deleted file mode 100644 index 1e3c338f4e..0000000000 --- a/gmp/mpn/x86_64/coreihwl/mul_1.asm +++ /dev/null @@ -1,155 +0,0 @@ -dnl AMD64 mpn_mul_1 using mulx optimised for Intel Haswell. - -dnl Contributed to the GNU project by Torbjörn Granlund. - -dnl Copyright 2012, 2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb best -C AMD K8,K9 n/a -C AMD K10 n/a -C AMD bd1 n/a -C AMD bd2 ? -C AMD bobcat n/a -C AMD jaguar ? -C Intel P4 n/a -C Intel PNR n/a -C Intel NHM n/a -C Intel SBR n/a -C Intel IBR n/a -C Intel HWL 1.57 this -C Intel BWL ? -C Intel atom n/a -C VIA nano n/a - -C The loop of this code is the result of running a code generation and -C optimisation tool suite written by David Harvey and Torbjorn Granlund. - -define(`rp', `%rdi') C rcx -define(`up', `%rsi') C rdx -define(`n_param', `%rdx') C r8 -define(`v0_param',`%rcx') C r9 - -define(`n', `%rbp') -define(`v0', `%rdx') - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -ASM_START() - TEXT - ALIGN(32) -PROLOGUE(mpn_mul_1) - FUNC_ENTRY(4) - push %rbx - push %rbp - push %r12 - - mov n_param, n - shr $2, n - - test $1, R8(n_param) - jnz L(bx1) - -L(bx0): test $2, R8(n_param) - mov v0_param, v0 - jnz L(b10) - -L(b00): mulx( (up), %r9, %r8) - mulx( 8,(up), %r11, %r10) - mulx( 16,(up), %rcx, %r12) - lea -32(rp), rp - jmp L(lo0) - -L(b10): mulx( (up), %rcx, %r12) - mulx( 8,(up), %rbx, %rax) - lea -16(rp), rp - test n, n - jz L(cj2) - mulx( 16,(up), %r9, %r8) - lea 16(up), up - jmp L(lo2) - -L(bx1): test $2, R8(n_param) - mov v0_param, v0 - jnz L(b11) - -L(b01): mulx( (up), %rbx, %rax) - lea -24(rp), rp - test n, n - jz L(cj1) - mulx( 8,(up), %r9, %r8) - lea 8(up), up - jmp L(lo1) - -L(b11): mulx( (up), %r11, %r10) - mulx( 8,(up), %rcx, %r12) - mulx( 16,(up), %rbx, %rax) - lea -8(rp), rp - test n, n - jz L(cj3) - lea 24(up), up - jmp L(lo3) - - ALIGN(32) -L(top): lea 32(rp), rp - mov %r9, (rp) - adc %r8, %r11 -L(lo3): mulx( (up), %r9, %r8) - mov %r11, 8(rp) - adc %r10, %rcx -L(lo2): mov %rcx, 16(rp) - adc %r12, %rbx -L(lo1): mulx( 8,(up), %r11, %r10) - adc %rax, %r9 - mulx( 16,(up), %rcx, %r12) - mov %rbx, 24(rp) -L(lo0): mulx( 24,(up), %rbx, %rax) - lea 32(up), up - dec n - jnz L(top) - -L(end): lea 32(rp), rp - mov %r9, (rp) - adc %r8, %r11 -L(cj3): mov %r11, 8(rp) - adc %r10, %rcx -L(cj2): mov %rcx, 16(rp) - adc %r12, %rbx -L(cj1): mov %rbx, 24(rp) - adc $0, %rax - - pop %r12 - pop %rbp - pop %rbx - FUNC_EXIT() - ret -EPILOGUE() -ASM_END() diff --git a/gmp/mpn/x86_64/coreihwl/mul_2.asm b/gmp/mpn/x86_64/coreihwl/mul_2.asm deleted file mode 100644 index 5bdb1aa645..0000000000 --- a/gmp/mpn/x86_64/coreihwl/mul_2.asm +++ /dev/null @@ -1,173 +0,0 @@ -dnl AMD64 mpn_mul_2 optimised for Intel Haswell. - -dnl Contributed to the GNU project by Torbjörn Granlund. - -dnl Copyright 2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb -C AMD K8,K9 n/a -C AMD K10 n/a -C AMD bull n/a -C AMD pile n/a -C AMD steam ? -C AMD bobcat n/a -C AMD jaguar ? -C Intel P4 n/a -C Intel core n/a -C Intel NHM n/a -C Intel SBR n/a -C Intel IBR n/a -C Intel HWL 1.86 -C Intel BWL ? -C Intel atom n/a -C VIA nano n/a - -C The loop of this code is the result of running a code generation and -C optimisation tool suite written by David Harvey and Torbjörn Granlund. - -C TODO -C * Move test and jcc together, for insn fusion. - -define(`rp', `%rdi') -define(`up', `%rsi') -define(`n_param',`%rdx') -define(`vp', `%rcx') - -define(`v0', `%r8') -define(`v1', `%r9') -define(`w0', `%rbx') -define(`w1', `%rcx') -define(`w2', `%rbp') -define(`w3', `%r10') -define(`n', `%r11') - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -ASM_START() - TEXT - ALIGN(32) -PROLOGUE(mpn_mul_2) - FUNC_ENTRY(4) - push %rbx - push %rbp - - mov (vp), v0 - mov 8(vp), v1 - - lea 3(n_param), n - shr $2, n - - test $1, R8(n_param) - jnz L(bx1) - -L(bx0): xor w0, w0 - test $2, R8(n_param) - mov (up), %rdx - mulx( v0, w2, w1) - jz L(lo0) - -L(b10): lea -16(rp), rp - lea -16(up), up - jmp L(lo2) - -L(bx1): xor w2, w2 - test $2, R8(n_param) - mov (up), %rdx - mulx( v0, w0, w3) - jnz L(b11) - -L(b01): lea -24(rp), rp - lea 8(up), up - jmp L(lo1) - -L(b11): lea -8(rp), rp - lea -8(up), up - jmp L(lo3) - - ALIGN(16) -L(top): mulx( v1, %rax, w0) - add %rax, w2 C 0 - mov (up), %rdx - mulx( v0, %rax, w1) - adc $0, w0 C 1 - add %rax, w2 C 0 - adc $0, w1 C 1 - add w3, w2 C 0 -L(lo0): mov w2, (rp) C 0 - adc $0, w1 C 1 - mulx( v1, %rax, w2) - add %rax, w0 C 1 - mov 8(up), %rdx - adc $0, w2 C 2 - mulx( v0, %rax, w3) - add %rax, w0 C 1 - adc $0, w3 C 2 - add w1, w0 C 1 -L(lo3): mov w0, 8(rp) C 1 - adc $0, w3 C 2 - mulx( v1, %rax, w0) - add %rax, w2 C 2 - mov 16(up), %rdx - mulx( v0, %rax, w1) - adc $0, w0 C 3 - add %rax, w2 C 2 - adc $0, w1 C 3 - add w3, w2 C 2 -L(lo2): mov w2, 16(rp) C 2 - adc $0, w1 C 3 - mulx( v1, %rax, w2) - add %rax, w0 C 3 - mov 24(up), %rdx - adc $0, w2 C 4 - mulx( v0, %rax, w3) - add %rax, w0 C 3 - adc $0, w3 C 4 - add w1, w0 C 3 - lea 32(up), up -L(lo1): mov w0, 24(rp) C 3 - adc $0, w3 C 4 - dec n - lea 32(rp), rp - jnz L(top) - -L(end): mulx( v1, %rdx, %rax) - add %rdx, w2 - adc $0, %rax - add w3, w2 - mov w2, (rp) - adc $0, %rax - - pop %rbp - pop %rbx - FUNC_EXIT() - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/coreihwl/mul_basecase.asm b/gmp/mpn/x86_64/coreihwl/mul_basecase.asm deleted file mode 100644 index b2656c8e9b..0000000000 --- a/gmp/mpn/x86_64/coreihwl/mul_basecase.asm +++ /dev/null @@ -1,441 +0,0 @@ -dnl AMD64 mpn_mul_basecase optimised for Intel Haswell. - -dnl Contributed to the GNU project by Torbjörn Granlund. - -dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb mul_1 mul_2 mul_3 addmul_2 -C AMD K8,K9 n/a n/a - n/a -C AMD K10 n/a n/a - n/a -C AMD bull n/a n/a - n/a -C AMD pile n/a n/a - n/a -C AMD steam ? ? - ? -C AMD bobcat n/a n/a - n/a -C AMD jaguar ? ? - ? -C Intel P4 n/a n/a - n/a -C Intel core n/a n/a - n/a -C Intel NHM n/a n/a - n/a -C Intel SBR n/a n/a - n/a -C Intel IBR n/a n/a - n/a -C Intel HWL 1.77 1.86 - 2.15 -C Intel BWL ? ? - ? -C Intel atom n/a n/a - n/a -C VIA nano n/a n/a - n/a - -C The inner loops of this code are the result of running a code generation and -C optimisation tool suite written by David Harvey and Torbjörn Granlund. - -C TODO -C * Adjoin a mul_3. -C * Further micro-optimise. - -define(`rp', `%rdi') -define(`up', `%rsi') -define(`un_param',`%rdx') -define(`vp', `%rcx') -define(`vn', `%r8') - -define(`un', `%rbx') - -define(`w0', `%r10') -define(`w1', `%r11') -define(`w2', `%r12') -define(`w3', `%r13') -define(`n', `%rbp') -define(`v0', `%r9') - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -ASM_START() - TEXT - ALIGN(16) -PROLOGUE(mpn_mul_basecase) - FUNC_ENTRY(4) -IFDOS(` mov 56(%rsp), %r8d ') - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - mov un_param, un C free up rdx - neg un - - mov un_param, n C FIXME: share - sar $2, n C FIXME: share - - test $1, R8(vn) - jz L(do_mul_2) - -define(`w4', `%r9') -define(`w5', `%r14') - - mov (vp), %rdx - -L(do_mul_1): - test $1, R8(un) - jnz L(m1x1) - -L(m1x0):test $2, R8(un) - jnz L(m110) - -L(m100): - mulx( (up), w5, w2) - mulx( 8,(up), w1, w3) - lea -24(rp), rp - jmp L(m1l0) - -L(m110): - mulx( (up), w3, w4) - mulx( 8,(up), w1, w5) - lea -8(rp), rp - test n, n - jz L(cj2) - mulx( 16,(up), w0, w2) - lea 16(up), up - jmp L(m1l2) - -L(m1x1):test $2, R8(un) - jz L(m111) - -L(m101): - mulx( (up), w4, w5) - lea -16(rp), rp - test n, n - jz L(cj1) - mulx( 8,(up), w0, w2) - lea 8(up), up - jmp L(m1l1) - -L(m111): - mulx( (up), w2, w3) - mulx( 8,(up), w0, w4) - mulx( 16,(up), w1, w5) - lea 24(up), up - test n, n - jnz L(gt3) - add w0, w3 - jmp L(cj3) -L(gt3): add w0, w3 - jmp L(m1l3) - - ALIGN(32) -L(m1tp):lea 32(rp), rp -L(m1l3):mov w2, (rp) - mulx( (up), w0, w2) -L(m1l2):mov w3, 8(rp) - adc w1, w4 -L(m1l1):adc w0, w5 - mov w4, 16(rp) - mulx( 8,(up), w1, w3) -L(m1l0):mov w5, 24(rp) - mulx( 16,(up), w0, w4) - adc w1, w2 - mulx( 24,(up), w1, w5) - adc w0, w3 - lea 32(up), up - dec n - jnz L(m1tp) - -L(m1ed):lea 32(rp), rp -L(cj3): mov w2, (rp) -L(cj2): mov w3, 8(rp) - adc w1, w4 -L(cj1): mov w4, 16(rp) - adc $0, w5 - mov w5, 24(rp) - - dec R32(vn) - jz L(ret5) - - lea 8(vp), vp - lea 32(rp), rp -C push %r12 -C push %r13 -C push %r14 - jmp L(do_addmul) - -L(do_mul_2): -define(`v1', `%r14') -C push %r12 -C push %r13 -C push %r14 - - mov (vp), v0 - mov 8(vp), v1 - - lea (un), n - sar $2, n - - test $1, R8(un) - jnz L(m2x1) - -L(m2x0):xor w0, w0 - test $2, R8(un) - mov (up), %rdx - mulx( v0, w2, w1) - jz L(m2l0) - -L(m210):lea -16(rp), rp - lea -16(up), up - jmp L(m2l2) - -L(m2x1):xor w2, w2 - test $2, R8(un) - mov (up), %rdx - mulx( v0, w0, w3) - jz L(m211) - -L(m201):lea -24(rp), rp - lea 8(up), up - jmp L(m2l1) - -L(m211):lea -8(rp), rp - lea -8(up), up - jmp L(m2l3) - - ALIGN(16) -L(m2tp):mulx( v1, %rax, w0) - add %rax, w2 - mov (up), %rdx - mulx( v0, %rax, w1) - adc $0, w0 - add %rax, w2 - adc $0, w1 - add w3, w2 -L(m2l0):mov w2, (rp) - adc $0, w1 - mulx( v1, %rax, w2) - add %rax, w0 - mov 8(up), %rdx - adc $0, w2 - mulx( v0, %rax, w3) - add %rax, w0 - adc $0, w3 - add w1, w0 -L(m2l3):mov w0, 8(rp) - adc $0, w3 - mulx( v1, %rax, w0) - add %rax, w2 - mov 16(up), %rdx - mulx( v0, %rax, w1) - adc $0, w0 - add %rax, w2 - adc $0, w1 - add w3, w2 -L(m2l2):mov w2, 16(rp) - adc $0, w1 - mulx( v1, %rax, w2) - add %rax, w0 - mov 24(up), %rdx - adc $0, w2 - mulx( v0, %rax, w3) - add %rax, w0 - adc $0, w3 - add w1, w0 - lea 32(up), up -L(m2l1):mov w0, 24(rp) - adc $0, w3 - inc n - lea 32(rp), rp - jnz L(m2tp) - -L(m2ed):mulx( v1, %rdx, %rax) - add %rdx, w2 - adc $0, %rax - add w3, w2 - mov w2, (rp) - adc $0, %rax - mov %rax, 8(rp) - - add $-2, R32(vn) - jz L(ret5) - lea 16(vp), vp - lea 16(rp), rp - - -L(do_addmul): - push %r15 - push vn C save vn in new stack slot -define(`vn', `(%rsp)') -define(`X0', `%r14') -define(`X1', `%r15') -define(`v1', `%r8') - - lea (rp,un,8), rp - lea (up,un,8), up - -L(outer): - mov (vp), v0 - mov 8(vp), v1 - - lea 2(un), n - sar $2, n - - mov (up), %rdx - test $1, R8(un) - jnz L(bx1) - -L(bx0): mov (rp), X0 - mov 8(rp), X1 - mulx( v0, %rax, w1) - add %rax, X0 - mulx( v1, %rax, w2) - adc $0, w1 - mov X0, (rp) - add %rax, X1 - adc $0, w2 - mov 8(up), %rdx - test $2, R8(un) - jnz L(b10) - -L(b00): lea 16(up), up - lea 16(rp), rp - jmp L(lo0) - -L(b10): mov 16(rp), X0 - lea 32(up), up - mulx( v0, %rax, w3) - jmp L(lo2) - -L(bx1): mov (rp), X1 - mov 8(rp), X0 - mulx( v0, %rax, w3) - add %rax, X1 - adc $0, w3 - mulx( v1, %rax, w0) - add %rax, X0 - adc $0, w0 - mov 8(up), %rdx - mov X1, (rp) - mulx( v0, %rax, w1) - test $2, R8(un) - jz L(b11) - -L(b01): mov 16(rp), X1 - lea 24(rp), rp - lea 24(up), up - jmp L(lo1) - -L(b11): lea 8(rp), rp - lea 8(up), up - jmp L(lo3) - - ALIGN(16) -L(top): mulx( v0, %rax, w3) - add w0, X1 - adc $0, w2 -L(lo2): add %rax, X1 - adc $0, w3 - mulx( v1, %rax, w0) - add %rax, X0 - adc $0, w0 - lea 32(rp), rp - add w1, X1 - mov -16(up), %rdx - mov X1, -24(rp) - adc $0, w3 - add w2, X0 - mov -8(rp), X1 - mulx( v0, %rax, w1) - adc $0, w0 -L(lo1): add %rax, X0 - mulx( v1, %rax, w2) - adc $0, w1 - add w3, X0 - mov X0, -16(rp) - adc $0, w1 - add %rax, X1 - adc $0, w2 - add w0, X1 - mov -8(up), %rdx - adc $0, w2 -L(lo0): mulx( v0, %rax, w3) - add %rax, X1 - adc $0, w3 - mov (rp), X0 - mulx( v1, %rax, w0) - add %rax, X0 - adc $0, w0 - add w1, X1 - mov X1, -8(rp) - adc $0, w3 - mov (up), %rdx - add w2, X0 - mulx( v0, %rax, w1) - adc $0, w0 -L(lo3): add %rax, X0 - adc $0, w1 - mulx( v1, %rax, w2) - add w3, X0 - mov 8(rp), X1 - mov X0, (rp) - mov 16(rp), X0 - adc $0, w1 - add %rax, X1 - adc $0, w2 - mov 8(up), %rdx - lea 32(up), up - inc n - jnz L(top) - -L(end): mulx( v0, %rax, w3) - add w0, X1 - adc $0, w2 - add %rax, X1 - adc $0, w3 - mulx( v1, %rdx, %rax) - add w1, X1 - mov X1, 8(rp) - adc $0, w3 - add w2, %rdx - adc $0, %rax - add w3, %rdx - mov %rdx, 16(rp) - adc $0, %rax - mov %rax, 24(rp) - - addl $-2, vn - lea 16(vp), vp - lea -16(up,un,8), up - lea 32(rp,un,8), rp - jnz L(outer) - - pop %rax C deallocate vn slot - pop %r15 -L(ret5):pop %r14 -L(ret4):pop %r13 -L(ret3):pop %r12 -L(ret2):pop %rbp - pop %rbx - FUNC_EXIT() - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/coreihwl/mullo_basecase.asm b/gmp/mpn/x86_64/coreihwl/mullo_basecase.asm deleted file mode 100644 index 9986e8bcfa..0000000000 --- a/gmp/mpn/x86_64/coreihwl/mullo_basecase.asm +++ /dev/null @@ -1,426 +0,0 @@ -dnl AMD64 mpn_mullo_basecase optimised for Intel Haswell. - -dnl Contributed to the GNU project by Torbjörn Granlund. - -dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb mul_2 addmul_2 -C AMD K8,K9 n/a n/a -C AMD K10 n/a n/a -C AMD bull n/a n/a -C AMD pile n/a n/a -C AMD steam ? ? -C AMD bobcat n/a n/a -C AMD jaguar ? ? -C Intel P4 n/a n/a -C Intel core n/a n/a -C Intel NHM n/a n/a -C Intel SBR n/a n/a -C Intel IBR n/a n/a -C Intel HWL 1.86 2.15 -C Intel BWL ? ? -C Intel atom n/a n/a -C VIA nano n/a n/a - -C The inner loops of this code are the result of running a code generation and -C optimisation tool suite written by David Harvey and Torbjörn Granlund. - -C TODO -C * Implement proper cor2, replacing current cor0. -C * Micro-optimise. - -C When playing with pointers, set this to $2 to fall back to conservative -C indexing in wind-down code. -define(`I',`$1') - -define(`rp', `%rdi') -define(`up', `%rsi') -define(`vp_param', `%rdx') -define(`n', `%rcx') - -define(`vp', `%r8') -define(`X0', `%r14') -define(`X1', `%r15') - -define(`w0', `%r10') -define(`w1', `%r11') -define(`w2', `%r12') -define(`w3', `%r13') -define(`i', `%rbp') -define(`v0', `%r9') -define(`v1', `%rbx') - -C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -ASM_START() - TEXT - ALIGN(32) -PROLOGUE(mpn_mullo_basecase) - FUNC_ENTRY(4) - - mov vp_param, vp - mov (up), %rdx - - cmp $4, n - jb L(small) - - push %rbx - push %rbp - push %r12 - push %r13 - - mov (vp), v0 - mov 8(vp), v1 - - lea 2(n), i - shr $2, i - neg n - add $2, n - - push up C put entry `up' on stack - - test $1, R8(n) - jnz L(m2x1) - -L(m2x0):mulx( v0, w0, w3) - xor R32(w2), R32(w2) - test $2, R8(n) - jz L(m2b2) - -L(m2b0):lea -8(rp), rp - lea -8(up), up - jmp L(m2e0) - -L(m2b2):lea -24(rp), rp - lea 8(up), up - jmp L(m2e2) - -L(m2x1):mulx( v0, w2, w1) - xor R32(w0), R32(w0) - test $2, R8(n) - jnz L(m2b3) - -L(m2b1):jmp L(m2e1) - -L(m2b3):lea -16(rp), rp - lea -16(up), up - jmp L(m2e3) - - ALIGN(16) -L(m2tp):mulx( v1, %rax, w0) - add %rax, w2 - mov (up), %rdx - mulx( v0, %rax, w1) - adc $0, w0 - add %rax, w2 - adc $0, w1 - add w3, w2 -L(m2e1):mov w2, (rp) - adc $0, w1 - mulx( v1, %rax, w2) - add %rax, w0 - mov 8(up), %rdx - adc $0, w2 - mulx( v0, %rax, w3) - add %rax, w0 - adc $0, w3 - add w1, w0 -L(m2e0):mov w0, 8(rp) - adc $0, w3 - mulx( v1, %rax, w0) - add %rax, w2 - mov 16(up), %rdx - mulx( v0, %rax, w1) - adc $0, w0 - add %rax, w2 - adc $0, w1 - add w3, w2 -L(m2e3):mov w2, 16(rp) - adc $0, w1 - mulx( v1, %rax, w2) - add %rax, w0 - mov 24(up), %rdx - adc $0, w2 - mulx( v0, %rax, w3) - add %rax, w0 - adc $0, w3 - add w1, w0 - lea 32(up), up -L(m2e2):mov w0, 24(rp) - adc $0, w3 - dec i - lea 32(rp), rp - jnz L(m2tp) - -L(m2ed):mulx( v1, %rax, w0) - add %rax, w2 - mov (up), %rdx - mulx( v0, %rax, w1) - add w2, %rax - add w3, %rax - mov %rax, (rp) - - mov (%rsp), up C restore `up' to beginning - lea 16(vp), vp - lea 8(rp,n,8), rp C put back rp to old rp + 2 - add $2, n - jge L(cor1) - - push %r14 - push %r15 - -L(outer): - mov (vp), v0 - mov 8(vp), v1 - - lea (n), i - sar $2, i - - mov (up), %rdx - test $1, R8(n) - jnz L(bx1) - -L(bx0): mov (rp), X1 - mov 8(rp), X0 - mulx( v0, %rax, w3) - add %rax, X1 - adc $0, w3 - mulx( v1, %rax, w0) - add %rax, X0 - adc $0, w0 - mov 8(up), %rdx - mov X1, (rp) - mulx( v0, %rax, w1) - test $2, R8(n) - jz L(b2) - -L(b0): lea 8(rp), rp - lea 8(up), up - jmp L(lo0) - -L(b2): mov 16(rp), X1 - lea 24(rp), rp - lea 24(up), up - jmp L(lo2) - -L(bx1): mov (rp), X0 - mov 8(rp), X1 - mulx( v0, %rax, w1) - add %rax, X0 - mulx( v1, %rax, w2) - adc $0, w1 - mov X0, (rp) - add %rax, X1 - adc $0, w2 - mov 8(up), %rdx - test $2, R8(n) - jnz L(b3) - -L(b1): lea 16(up), up - lea 16(rp), rp - jmp L(lo1) - -L(b3): mov 16(rp), X0 - lea 32(up), up - mulx( v0, %rax, w3) - inc i - jz L(cj3) - jmp L(lo3) - - ALIGN(16) -L(top): mulx( v0, %rax, w3) - add w0, X1 - adc $0, w2 -L(lo3): add %rax, X1 - adc $0, w3 - mulx( v1, %rax, w0) - add %rax, X0 - adc $0, w0 - lea 32(rp), rp - add w1, X1 - mov -16(up), %rdx - mov X1, -24(rp) - adc $0, w3 - add w2, X0 - mov -8(rp), X1 - mulx( v0, %rax, w1) - adc $0, w0 -L(lo2): add %rax, X0 - mulx( v1, %rax, w2) - adc $0, w1 - add w3, X0 - mov X0, -16(rp) - adc $0, w1 - add %rax, X1 - adc $0, w2 - add w0, X1 - mov -8(up), %rdx - adc $0, w2 -L(lo1): mulx( v0, %rax, w3) - add %rax, X1 - adc $0, w3 - mov (rp), X0 - mulx( v1, %rax, w0) - add %rax, X0 - adc $0, w0 - add w1, X1 - mov X1, -8(rp) - adc $0, w3 - mov (up), %rdx - add w2, X0 - mulx( v0, %rax, w1) - adc $0, w0 -L(lo0): add %rax, X0 - adc $0, w1 - mulx( v1, %rax, w2) - add w3, X0 - mov 8(rp), X1 - mov X0, (rp) - mov 16(rp), X0 - adc $0, w1 - add %rax, X1 - adc $0, w2 - mov 8(up), %rdx - lea 32(up), up - inc i - jnz L(top) - -L(end): mulx( v0, %rax, w3) - add w0, X1 - adc $0, w2 -L(cj3): add %rax, X1 - adc $0, w3 - mulx( v1, %rax, w0) - add %rax, X0 - add w1, X1 - mov -16(up), %rdx - mov X1, 8(rp) - adc $0, w3 - add w2, X0 - mulx( v0, %rax, w1) - add X0, %rax - add w3, %rax - mov %rax, 16(rp) - - mov 16(%rsp), up C restore `up' to beginning - lea 16(vp), vp - lea 24(rp,n,8), rp C put back rp to old rp + 2 - add $2, n - jl L(outer) - - pop %r15 - pop %r14 - - jnz L(cor0) - -L(cor1):mov (vp), v0 - mov 8(vp), v1 - mov (up), %rdx - mulx( v0, %r12, %rbp) C u0 x v2 - add (rp), %r12 C FIXME: rp[0] still available in reg? - adc %rax, %rbp - mov 8(up), %r10 - imul v0, %r10 - imul v1, %rdx - mov %r12, (rp) - add %r10, %rdx - add %rbp, %rdx - mov %rdx, 8(rp) - pop %rax C deallocate `up' copy - pop %r13 - pop %r12 - pop %rbp - pop %rbx - FUNC_EXIT() - ret - -L(cor0):mov (vp), %r11 - imul (up), %r11 - add %rax, %r11 - mov %r11, (rp) - pop %rax C deallocate `up' copy - pop %r13 - pop %r12 - pop %rbp - pop %rbx - FUNC_EXIT() - ret - - ALIGN(16) -L(small): - cmp $2, n - jae L(gt1) -L(n1): imul (vp), %rdx - mov %rdx, (rp) - FUNC_EXIT() - ret -L(gt1): ja L(gt2) -L(n2): mov (vp), %r9 - mulx( %r9, %rax, %rdx) - mov %rax, (rp) - mov 8(up), %rax - imul %r9, %rax - add %rax, %rdx - mov 8(vp), %r9 - mov (up), %rcx - imul %r9, %rcx - add %rcx, %rdx - mov %rdx, 8(rp) - FUNC_EXIT() - ret -L(gt2): -L(n3): mov (vp), %r9 - mulx( %r9, %rax, %r10) C u0 x v0 - mov %rax, (rp) - mov 8(up), %rdx - mulx( %r9, %rax, %rdx) C u1 x v0 - imul 16(up), %r9 C u2 x v0 - add %rax, %r10 - adc %rdx, %r9 - mov 8(vp), %r11 - mov (up), %rdx - mulx( %r11, %rax, %rdx) C u0 x v1 - add %rax, %r10 - adc %rdx, %r9 - imul 8(up), %r11 C u1 x v1 - add %r11, %r9 - mov %r10, 8(rp) - mov 16(vp), %r10 - mov (up), %rax - imul %rax, %r10 C u0 x v2 - add %r10, %r9 - mov %r9, 16(rp) - FUNC_EXIT() - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/coreihwl/redc_1.asm b/gmp/mpn/x86_64/coreihwl/redc_1.asm deleted file mode 100644 index f1a475e53c..0000000000 --- a/gmp/mpn/x86_64/coreihwl/redc_1.asm +++ /dev/null @@ -1,433 +0,0 @@ -dnl AMD64 mpn_redc_1 optimised for Intel Haswell. - -dnl Contributed to the GNU project by Torbjörn Granlund. - -dnl Copyright 2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb -C AMD K8,K9 n/a -C AMD K10 n/a -C AMD bull n/a -C AMD pile n/a -C AMD steam ? -C AMD bobcat n/a -C AMD jaguar ? -C Intel P4 n/a -C Intel core n/a -C Intel NHM n/a -C Intel SBR n/a -C Intel IBR n/a -C Intel HWL 2.32 -C Intel BWL ? -C Intel atom n/a -C VIA nano n/a - -C The inner loops of this code are the result of running a code generation and -C optimisation tool suite written by David Harvey and Torbjörn Granlund. - -C TODO -C * Micro-optimise. -C * Consider inlining mpn_add_n. Tests indicate that this saves just 1-2 -C cycles, though. - -define(`rp', `%rdi') C rcx -define(`up', `%rsi') C rdx -define(`mp_param', `%rdx') C r8 -define(`n', `%rcx') C r9 -define(`u0inv_param', `%r8') C stack - -define(`i', `%r14') -define(`j', `%r15') -define(`mp', `%rdi') -define(`u0inv', `(%rsp)') C stack - -ABI_SUPPORT(DOS64) C FIXME: needs verification -ABI_SUPPORT(STD64) - -ASM_START() - TEXT - ALIGN(16) -PROLOGUE(mpn_redc_1) - FUNC_ENTRY(4) -IFDOS(` mov 56(%rsp), %r8 ') - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - push rp - mov mp_param, mp C note that rp and mp shares register - mov (up), %rdx - - neg n - push %r8 C put u0inv on stack - imul u0inv_param, %rdx C first iteration q0 - mov n, j C outer loop induction var - - test $1, R8(n) - jnz L(bx1) - -L(bx0): test $2, R8(n) - jz L(o0b) - - cmp $-2, R32(n) - jnz L(o2) - -C Special code for n = 2 since general code cannot handle it - mov 8(%rsp), %rbx C rp - lea 16(%rsp), %rsp C deallocate two slots - mulx( (mp), %r9, %r12) - mulx( 8,(mp), %r11, %r10) - add %r12, %r11 - adc $0, %r10 - add (up), %r9 C = 0 - adc 8(up), %r11 C r11 = up[1] - adc $0, %r10 C -> up[0] - mov %r11, %rdx - imul u0inv_param, %rdx - mulx( (mp), %r13, %r12) - mulx( 8,(mp), %r14, %r15) - xor R32(%rax), R32(%rax) - add %r12, %r14 - adc $0, %r15 - add %r11, %r13 C = 0 - adc 16(up), %r14 C rp[2] - adc $0, %r15 C -> up[1] - add %r14, %r10 - adc 24(up), %r15 - mov %r10, (%rbx) - mov %r15, 8(%rbx) - setc R8(%rax) - jmp L(ret) - -L(o2): lea 2(n), i C inner loop induction var - mulx( (mp), %r9, %r8) - mulx( 8,(mp), %r11, %r10) - sar $2, i - add %r8, %r11 - jmp L(lo2) - - ALIGN(16) -L(tp2): adc %rax, %r9 - lea 32(up), up - adc %r8, %r11 -L(lo2): mulx( 16,(mp), %r13, %r12) - mov (up), %r8 - mulx( 24,(mp), %rbx, %rax) - lea 32(mp), mp - adc %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov 8(up), %r10 - mov 16(up), %r12 - add %r9, %r8 - mov 24(up), %rbp - mov %r8, (up) - adc %r11, %r10 - mulx( (mp), %r9, %r8) - mov %r10, 8(up) - adc %r13, %r12 - mov %r12, 16(up) - adc %rbx, %rbp - mulx( 8,(mp), %r11, %r10) - mov %rbp, 24(up) - inc i - jnz L(tp2) - -L(ed2): mov 56(up,n,8), %rdx C next iteration up[0] - lea 16(mp,n,8), mp C mp = (last starting mp) - adc %rax, %r9 - adc %r8, %r11 - mov 32(up), %r8 - adc $0, %r10 - imul u0inv, %rdx C next iteration q0 - mov 40(up), %rax - add %r9, %r8 - mov %r8, 32(up) - adc %r11, %rax - mov %rax, 40(up) - lea 56(up,n,8), up C up = (last starting up) + 1 - adc $0, %r10 - mov %r10, -8(up) - inc j - jnz L(o2) - - jmp L(cj) - - -L(bx1): test $2, R8(n) - jz L(o3a) - -L(o1a): cmp $-1, R32(n) - jnz L(o1b) - -C Special code for n = 1 since general code cannot handle it - mov 8(%rsp), %rbx C rp - lea 16(%rsp), %rsp C deallocate two slots - mulx( (mp), %r11, %r10) - add (up), %r11 - adc 8(up), %r10 - mov %r10, (%rbx) - mov $0, R32(%rax) - setc R8(%rax) - jmp L(ret) - -L(o1b): lea 24(mp), mp -L(o1): lea 1(n), i C inner loop induction var - mulx( -24,(mp), %r11, %r10) - mulx( -16,(mp), %r13, %r12) - mulx( -8,(mp), %rbx, %rax) - sar $2, i - add %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov (up), %r10 - mov 8(up), %r12 - mov 16(up), %rbp - add %r11, %r10 - jmp L(lo1) - - ALIGN(16) -L(tp1): adc %rax, %r9 - lea 32(up), up - adc %r8, %r11 - mulx( 16,(mp), %r13, %r12) - mov -8(up), %r8 - mulx( 24,(mp), %rbx, %rax) - lea 32(mp), mp - adc %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov (up), %r10 - mov 8(up), %r12 - add %r9, %r8 - mov 16(up), %rbp - mov %r8, -8(up) - adc %r11, %r10 -L(lo1): mulx( (mp), %r9, %r8) - mov %r10, (up) - adc %r13, %r12 - mov %r12, 8(up) - adc %rbx, %rbp - mulx( 8,(mp), %r11, %r10) - mov %rbp, 16(up) - inc i - jnz L(tp1) - -L(ed1): mov 48(up,n,8), %rdx C next iteration up[0] - lea 40(mp,n,8), mp C mp = (last starting mp) - adc %rax, %r9 - adc %r8, %r11 - mov 24(up), %r8 - adc $0, %r10 - imul u0inv, %rdx C next iteration q0 - mov 32(up), %rax - add %r9, %r8 - mov %r8, 24(up) - adc %r11, %rax - mov %rax, 32(up) - lea 48(up,n,8), up C up = (last starting up) + 1 - adc $0, %r10 - mov %r10, -8(up) - inc j - jnz L(o1) - - jmp L(cj) - -L(o3a): cmp $-3, R32(n) - jnz L(o3b) - -C Special code for n = 3 since general code cannot handle it -L(n3): mulx( (mp), %rbx, %rax) - mulx( 8,(mp), %r9, %r14) - add (up), %rbx - mulx( 16,(mp), %r11, %r10) - adc %rax, %r9 C W 1 - adc %r14, %r11 C W 2 - mov 8(up), %r14 - mov u0inv_param, %rdx - adc $0, %r10 C W 3 - mov 16(up), %rax - add %r9, %r14 C W 1 - mov %r14, 8(up) - mulx( %r14, %rdx, %r13) C next iteration q0 - adc %r11, %rax C W 2 - mov %rax, 16(up) - adc $0, %r10 C W 3 - mov %r10, (up) - lea 8(up), up C up = (last starting up) + 1 - inc j - jnz L(n3) - - jmp L(cj) - -L(o3b): lea 8(mp), mp -L(o3): lea 4(n), i C inner loop induction var - mulx( -8,(mp), %rbx, %rax) - mulx( (mp), %r9, %r8) - mov (up), %rbp - mulx( 8,(mp), %r11, %r10) - sar $2, i - add %rbx, %rbp - nop - adc %rax, %r9 - jmp L(lo3) - - ALIGN(16) -L(tp3): adc %rax, %r9 - lea 32(up), up -L(lo3): adc %r8, %r11 - mulx( 16,(mp), %r13, %r12) - mov 8(up), %r8 - mulx( 24,(mp), %rbx, %rax) - lea 32(mp), mp - adc %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov 16(up), %r10 - mov 24(up), %r12 - add %r9, %r8 - mov 32(up), %rbp - mov %r8, 8(up) - adc %r11, %r10 - mulx( (mp), %r9, %r8) - mov %r10, 16(up) - adc %r13, %r12 - mov %r12, 24(up) - adc %rbx, %rbp - mulx( 8,(mp), %r11, %r10) - mov %rbp, 32(up) - inc i - jnz L(tp3) - -L(ed3): mov 64(up,n,8), %rdx C next iteration up[0] - lea 24(mp,n,8), mp C mp = (last starting mp) - adc %rax, %r9 - adc %r8, %r11 - mov 40(up), %r8 - adc $0, %r10 - imul u0inv, %rdx C next iteration q0 - mov 48(up), %rax - add %r9, %r8 - mov %r8, 40(up) - adc %r11, %rax - mov %rax, 48(up) - lea 64(up,n,8), up C up = (last starting up) + 1 - adc $0, %r10 - mov %r10, -8(up) - inc j - jnz L(o3) - - jmp L(cj) - -L(o0b): lea 16(mp), mp -L(o0): mov n, i C inner loop induction var - mulx( -16,(mp), %r13, %r12) - mulx( -8,(mp), %rbx, %rax) - sar $2, i - add %r12, %rbx - adc $0, %rax - mov (up), %r12 - mov 8(up), %rbp - mulx( (mp), %r9, %r8) - add %r13, %r12 - jmp L(lo0) - - ALIGN(16) -L(tp0): adc %rax, %r9 - lea 32(up), up - adc %r8, %r11 - mulx( 16,(mp), %r13, %r12) - mov -16(up), %r8 - mulx( 24,(mp), %rbx, %rax) - lea 32(mp), mp - adc %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov -8(up), %r10 - mov (up), %r12 - add %r9, %r8 - mov 8(up), %rbp - mov %r8, -16(up) - adc %r11, %r10 - mulx( (mp), %r9, %r8) - mov %r10, -8(up) - adc %r13, %r12 - mov %r12, (up) -L(lo0): adc %rbx, %rbp - mulx( 8,(mp), %r11, %r10) - mov %rbp, 8(up) - inc i - jnz L(tp0) - -L(ed0): mov 40(up,n,8), %rdx C next iteration up[0] - lea 32(mp,n,8), mp C mp = (last starting mp) - adc %rax, %r9 - adc %r8, %r11 - mov 16(up), %r8 - adc $0, %r10 - imul u0inv, %rdx C next iteration q0 - mov 24(up), %rax - add %r9, %r8 - mov %r8, 16(up) - adc %r11, %rax - mov %rax, 24(up) - lea 40(up,n,8), up C up = (last starting up) + 1 - adc $0, %r10 - mov %r10, -8(up) - inc j - jnz L(o0) - -L(cj): -IFSTD(` mov 8(%rsp), %rdi C param 1: rp - lea 16(%rsp), %rsp C deallocate two slots - lea (up,n,8), %rdx C param 3: up - n - neg R32(n) ') C param 4: n - -IFDOS(` mov up, %rdx C param 2: up - lea (up,n,8), %r8 C param 3: up - n - neg R32(n) - mov n, %r9 C param 4: n - mov 8(%rsp), %rcx C param 1: rp - lea 16(%rsp), %rsp ') C deallocate two slots - - CALL( mpn_add_n) - -L(ret): pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - FUNC_EXIT() - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/coreihwl/sqr_basecase.asm b/gmp/mpn/x86_64/coreihwl/sqr_basecase.asm deleted file mode 100644 index 641cdf349a..0000000000 --- a/gmp/mpn/x86_64/coreihwl/sqr_basecase.asm +++ /dev/null @@ -1,506 +0,0 @@ -dnl AMD64 mpn_sqr_basecase optimised for Intel Haswell. - -dnl Contributed to the GNU project by Torbjörn Granlund. - -dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb mul_2 addmul_2 sqr_diag_addlsh1 -C AMD K8,K9 n/a n/a n/a -C AMD K10 n/a n/a n/a -C AMD bull n/a n/a n/a -C AMD pile n/a n/a n/a -C AMD steam ? ? ? -C AMD bobcat n/a n/a n/a -C AMD jaguar ? ? ? -C Intel P4 n/a n/a n/a -C Intel core n/a n/a n/a -C Intel NHM n/a n/a n/a -C Intel SBR n/a n/a n/a -C Intel IBR n/a n/a n/a -C Intel HWL 1.86 2.15 ~2.5 -C Intel BWL ? ? ? -C Intel atom n/a n/a n/a -C VIA nano n/a n/a n/a - -C The inner loops of this code are the result of running a code generation and -C optimisation tool suite written by David Harvey and Torbjörn Granlund, except -C that the sqr_diag_addlsh1 loop was manually written. - -C TODO -C * Replace current unoptimised sqr_diag_addlsh1 loop; 1.75 c/l might be -C possible. -C * Consider splitting outer loop into 2, one for n = 1 (mod 2) and one for -C n = 0 (mod 2). These loops could fall into specific "corner" code. -C * Consider splitting outer loop into 4. -C * Streamline pointer updates. -C * Perhaps suppress a few more xor insns in feed-in code. -C * Make sure we write no dead registers in feed-in code. -C * We might use 32-bit size ops, since n >= 2^32 is non-terminating. Watch -C out for negative sizes being zero-extended, though. -C * Provide straight-line code for n = 4; then look for simplifications in -C main code. - -define(`rp', `%rdi') -define(`up', `%rsi') -define(`un_param',`%rdx') - - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -ASM_START() - TEXT - ALIGN(32) -PROLOGUE(mpn_sqr_basecase) - FUNC_ENTRY(3) - - cmp $2, un_param - jae L(gt1) - - mov (up), %rdx - mulx( %rdx, %rax, %rdx) - mov %rax, (rp) - mov %rdx, 8(rp) - FUNC_EXIT() - ret - -L(gt1): jne L(gt2) - - mov (up), %rdx - mov 8(up), %rcx - mulx( %rcx, %r9, %r10) C v0 * v1 W 1 2 - mulx( %rdx, %rax, %r8) C v0 * v0 W 0 1 - mov %rcx, %rdx - mulx( %rdx, %r11, %rdx) C v1 * v1 W 2 3 - add %r9, %r9 C W 1 - adc %r10, %r10 C W 2 - adc $0, %rdx C W 3 - add %r9, %r8 C W 1 - adc %r11, %r10 C W 2 - adc $0, %rdx C W 3 - mov %rax, (rp) - mov %r8, 8(rp) - mov %r10, 16(rp) - mov %rdx, 24(rp) - FUNC_EXIT() - ret - -L(gt2): cmp $4, un_param - jae L(gt3) -define(`v0', `%r8') -define(`v1', `%r9') -define(`w0', `%r10') -define(`w2', `%r11') - - mov (up), v0 - mov 8(up), %rdx - mov %rdx, v1 - mulx( v0, w2, %rax) - mov 16(up), %rdx - mulx( v0, w0, %rcx) - mov w2, %r8 - add %rax, w0 - adc $0, %rcx - mulx( v1, %rdx, %rax) - add %rcx, %rdx - mov %rdx, 24(rp) - adc $0, %rax - mov %rax, 32(rp) - xor R32(%rcx), R32(%rcx) - mov (up), %rdx - mulx( %rdx, %rax, w2) - mov %rax, (rp) - add %r8, %r8 - adc w0, w0 - setc R8(%rcx) - mov 8(up), %rdx - mulx( %rdx, %rax, %rdx) - add w2, %r8 - adc %rax, w0 - mov %r8, 8(rp) - mov w0, 16(rp) - mov 24(rp), %r8 - mov 32(rp), w0 - lea (%rdx,%rcx), w2 - adc %r8, %r8 - adc w0, w0 - setc R8(%rcx) - mov 16(up), %rdx - mulx( %rdx, %rax, %rdx) - add w2, %r8 - adc %rax, w0 - mov %r8, 24(rp) - mov w0, 32(rp) - adc %rcx, %rdx - mov %rdx, 40(rp) - FUNC_EXIT() - ret - -L(gt3): - -define(`v0', `%r8') -define(`v1', `%r9') -define(`w0', `%r10') -define(`w1', `%r11') -define(`w2', `%rbx') -define(`w3', `%rbp') -define(`un', `%r12') -define(`n', `%rcx') - -define(`X0', `%r13') -define(`X1', `%r14') - -L(do_mul_2): - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - mov $0, R32(un) - sub un_param, un C free up rdx - push un - mov (up), v0 - mov 8(up), %rdx - lea 2(un), n - sar $2, n C FIXME: suppress, change loop? - inc un C decrement |un| - mov %rdx, v1 - - test $1, R8(un) - jnz L(mx1) - -L(mx0): mulx( v0, w2, w1) - mov 16(up), %rdx - mov w2, 8(rp) - xor w2, w2 - mulx( v0, w0, w3) - test $2, R8(un) - jz L(m00) - -L(m10): lea -8(rp), rp - lea -8(up), up - jmp L(mlo2) - -L(m00): lea 8(up), up - lea 8(rp), rp - jmp L(mlo0) - -L(mx1): mulx( v0, w0, w3) - mov 16(up), %rdx - mov w0, 8(rp) - xor w0, w0 - mulx( v0, w2, w1) - test $2, R8(un) - jz L(mlo3) - -L(m01): lea 16(rp), rp - lea 16(up), up - jmp L(mlo1) - - ALIGN(32) -L(mtop):mulx( v1, %rax, w0) - add %rax, w2 C 0 - mov (up), %rdx - mulx( v0, %rax, w1) - adc $0, w0 C 1 - add %rax, w2 C 0 -L(mlo1):adc $0, w1 C 1 - add w3, w2 C 0 - mov w2, (rp) C 0 - adc $0, w1 C 1 - mulx( v1, %rax, w2) - add %rax, w0 C 1 - mov 8(up), %rdx - adc $0, w2 C 2 - mulx( v0, %rax, w3) - add %rax, w0 C 1 - adc $0, w3 C 2 -L(mlo0):add w1, w0 C 1 - mov w0, 8(rp) C 1 - adc $0, w3 C 2 - mulx( v1, %rax, w0) - add %rax, w2 C 2 - mov 16(up), %rdx - mulx( v0, %rax, w1) - adc $0, w0 C 3 - add %rax, w2 C 2 - adc $0, w1 C 3 -L(mlo3):add w3, w2 C 2 - mov w2, 16(rp) C 2 - adc $0, w1 C 3 - mulx( v1, %rax, w2) - add %rax, w0 C 3 - mov 24(up), %rdx - adc $0, w2 C 4 - mulx( v0, %rax, w3) - add %rax, w0 C 3 - adc $0, w3 C 4 -L(mlo2):add w1, w0 C 3 - lea 32(up), up - mov w0, 24(rp) C 3 - adc $0, w3 C 4 - inc n - lea 32(rp), rp - jnz L(mtop) - -L(mend):mulx( v1, %rdx, %rax) - add %rdx, w2 - adc $0, %rax - add w3, w2 - mov w2, (rp) - adc $0, %rax - mov %rax, 8(rp) - - lea 16(up), up - lea -16(rp), rp - -L(do_addmul_2): -L(outer): - lea (up,un,8), up C put back up to 2 positions above last time - lea 48(rp,un,8), rp C put back rp to 4 positions above last time - - mov -8(up), v0 C shared between addmul_2 and corner - - add $2, un C decrease |un| - cmp $-2, un - jge L(corner) - - mov (up), v1 - - lea 1(un), n - sar $2, n C FIXME: suppress, change loop? - - mov v1, %rdx - test $1, R8(un) - jnz L(bx1) - -L(bx0): mov (rp), X0 - mov 8(rp), X1 - mulx( v0, %rax, w1) - add %rax, X0 - adc $0, w1 - mov X0, (rp) - xor w2, w2 - test $2, R8(un) - jnz L(b10) - -L(b00): mov 8(up), %rdx - lea 16(rp), rp - lea 16(up), up - jmp L(lo0) - -L(b10): mov 8(up), %rdx - mov 16(rp), X0 - lea 32(up), up - inc n - mulx( v0, %rax, w3) - jz L(ex) - jmp L(lo2) - -L(bx1): mov (rp), X1 - mov 8(rp), X0 - mulx( v0, %rax, w3) - mov 8(up), %rdx - add %rax, X1 - adc $0, w3 - xor w0, w0 - mov X1, (rp) - mulx( v0, %rax, w1) - test $2, R8(un) - jz L(b11) - -L(b01): mov 16(rp), X1 - lea 24(rp), rp - lea 24(up), up - jmp L(lo1) - -L(b11): lea 8(rp), rp - lea 8(up), up - jmp L(lo3) - - ALIGN(32) -L(top): mulx( v0, %rax, w3) - add w0, X1 - adc $0, w2 -L(lo2): add %rax, X1 - adc $0, w3 - mulx( v1, %rax, w0) - add %rax, X0 - adc $0, w0 - lea 32(rp), rp - add w1, X1 - mov -16(up), %rdx - mov X1, -24(rp) - adc $0, w3 - add w2, X0 - mov -8(rp), X1 - mulx( v0, %rax, w1) - adc $0, w0 -L(lo1): add %rax, X0 - mulx( v1, %rax, w2) - adc $0, w1 - add w3, X0 - mov X0, -16(rp) - adc $0, w1 - add %rax, X1 - adc $0, w2 - add w0, X1 - mov -8(up), %rdx - adc $0, w2 -L(lo0): mulx( v0, %rax, w3) - add %rax, X1 - adc $0, w3 - mov (rp), X0 - mulx( v1, %rax, w0) - add %rax, X0 - adc $0, w0 - add w1, X1 - mov X1, -8(rp) - adc $0, w3 - mov (up), %rdx - add w2, X0 - mulx( v0, %rax, w1) - adc $0, w0 -L(lo3): add %rax, X0 - adc $0, w1 - mulx( v1, %rax, w2) - add w3, X0 - mov 8(rp), X1 - mov X0, (rp) - mov 16(rp), X0 - adc $0, w1 - add %rax, X1 - adc $0, w2 - mov 8(up), %rdx - lea 32(up), up - inc n - jnz L(top) - -L(end): mulx( v0, %rax, w3) - add w0, X1 - adc $0, w2 -L(ex): add %rax, X1 - adc $0, w3 - mulx( v1, %rdx, %rax) - add w1, X1 - mov X1, 8(rp) - adc $0, w3 - add w2, %rdx - adc $0, %rax - add %rdx, w3 - mov w3, 16(rp) - adc $0, %rax - mov %rax, 24(rp) - - jmp L(outer) C loop until a small corner remains - -L(corner): - pop un - mov (up), %rdx - jg L(small_corner) - - mov %rdx, v1 - mov (rp), X0 - mov %rax, X1 C Tricky rax reuse of last iteration - mulx( v0, %rax, w1) - add %rax, X0 - adc $0, w1 - mov X0, (rp) - mov 8(up), %rdx - mulx( v0, %rax, w3) - add %rax, X1 - adc $0, w3 - mulx( v1, %rdx, %rax) - add w1, X1 - mov X1, 8(rp) - adc $0, w3 - add w3, %rdx - mov %rdx, 16(rp) - adc $0, %rax - mov %rax, 24(rp) - lea 32(rp), rp - lea 16(up), up - jmp L(com) - -L(small_corner): - mulx( v0, X1, w3) - add %rax, X1 C Tricky rax reuse of last iteration - adc $0, w3 - mov X1, (rp) - mov w3, 8(rp) - lea 16(rp), rp - lea 8(up), up - -L(com): - -L(sqr_diag_addlsh1): - lea 8(up,un,8), up C put back up at its very beginning - lea (rp,un,8), rp - lea (rp,un,8), rp C put back rp at its very beginning - inc un - - mov -8(up), %rdx - xor R32(%rbx), R32(%rbx) C clear CF as side effect - mulx( %rdx, %rax, %r10) - mov %rax, 8(rp) - mov 16(rp), %r8 - mov 24(rp), %r9 - jmp L(dm) - - ALIGN(16) -L(dtop):mov 32(rp), %r8 - mov 40(rp), %r9 - lea 16(rp), rp - lea (%rdx,%rbx), %r10 -L(dm): adc %r8, %r8 - adc %r9, %r9 - setc R8(%rbx) - mov (up), %rdx - lea 8(up), up - mulx( %rdx, %rax, %rdx) - add %r10, %r8 - adc %rax, %r9 - mov %r8, 16(rp) - mov %r9, 24(rp) - inc un - jnz L(dtop) - -L(dend):adc %rbx, %rdx - mov %rdx, 32(rp) - - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - FUNC_EXIT() - ret -EPILOGUE() |