diff options
Diffstat (limited to 'gmp/mpn/x86_64/bd1')
-rw-r--r-- | gmp/mpn/x86_64/bd1/README | 11 | ||||
-rw-r--r-- | gmp/mpn/x86_64/bd1/aorrlsh1_n.asm | 37 | ||||
-rw-r--r-- | gmp/mpn/x86_64/bd1/aorsmul_1.asm | 181 | ||||
-rw-r--r-- | gmp/mpn/x86_64/bd1/com.asm | 37 | ||||
-rw-r--r-- | gmp/mpn/x86_64/bd1/copyd.asm | 37 | ||||
-rw-r--r-- | gmp/mpn/x86_64/bd1/copyi.asm | 37 | ||||
-rw-r--r-- | gmp/mpn/x86_64/bd1/gcd_1.asm | 37 | ||||
-rw-r--r-- | gmp/mpn/x86_64/bd1/gmp-mparam.h | 236 | ||||
-rw-r--r-- | gmp/mpn/x86_64/bd1/hamdist.asm | 38 | ||||
-rw-r--r-- | gmp/mpn/x86_64/bd1/mul_1.asm | 184 | ||||
-rw-r--r-- | gmp/mpn/x86_64/bd1/mul_2.asm | 192 | ||||
-rw-r--r-- | gmp/mpn/x86_64/bd1/mul_basecase.asm | 416 | ||||
-rw-r--r-- | gmp/mpn/x86_64/bd1/popcount.asm | 38 | ||||
-rw-r--r-- | gmp/mpn/x86_64/bd1/sec_tabselect.asm | 37 | ||||
-rw-r--r-- | gmp/mpn/x86_64/bd1/sublsh1_n.asm | 37 |
15 files changed, 1555 insertions, 0 deletions
diff --git a/gmp/mpn/x86_64/bd1/README b/gmp/mpn/x86_64/bd1/README new file mode 100644 index 0000000000..ccd210e0d6 --- /dev/null +++ b/gmp/mpn/x86_64/bd1/README @@ -0,0 +1,11 @@ +This directory contains code for AMD bulldozer including its piledriver update. + +We currently make limited use of SIMD instructions, both via the MPN_PATH and +via inclusion of x86_64/fastsse files. + +The bd1 cores share one SIMD/FPU pipeline for two integer units. This probably +means that an all-core GMP load (such as an HPC load) might run slower if there +is significant SIMD dependency. + +We should perhaps allow a special 'bd1nosimd' pseudo cpu-name excluding any +SIMD code. diff --git a/gmp/mpn/x86_64/bd1/aorrlsh1_n.asm b/gmp/mpn/x86_64/bd1/aorrlsh1_n.asm new file mode 100644 index 0000000000..c34a5fa134 --- /dev/null +++ b/gmp/mpn/x86_64/bd1/aorrlsh1_n.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_addlsh1_n and mpn_rsblsh1_n + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc) +include_mpn(`x86_64/atom/aorrlsh1_n.asm') diff --git a/gmp/mpn/x86_64/bd1/aorsmul_1.asm b/gmp/mpn/x86_64/bd1/aorsmul_1.asm new file mode 100644 index 0000000000..96fec9f5ac --- /dev/null +++ b/gmp/mpn/x86_64/bd1/aorsmul_1.asm @@ -0,0 +1,181 @@ +dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD Bulldozer. + +dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 4.5-4.7 +C AMD bobcat +C Intel P4 +C Intel core2 +C Intel NHM +C Intel SBR +C Intel atom +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. 
+ +C TODO +C * Try to make loop run closer to 4 c/l. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0', `%rcx') C r9 + +define(`n', `%r11') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`func', `mpn_submul_1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +IFDOS(` define(`up', ``%rsi'') ') dnl +IFDOS(` define(`rp', ``%rcx'') ') dnl +IFDOS(` define(`v0', ``%r9'') ') dnl +IFDOS(` define(`r9', ``rdi'') ') dnl +IFDOS(` define(`n', ``%r8'') ') dnl +IFDOS(` define(`r8', ``r11'') ') dnl + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') + + mov (up), %rax C read first u limb early + push %rbx +IFSTD(` mov n_param, %rbx ') C move away n from rdx, mul uses it +IFDOS(` mov n, %rbx ') + mul v0 + +IFSTD(` mov %rbx, n ') + + and $3, R32(%rbx) + lea -16(rp,n,8), rp + jz L(b0) + cmp $2, R32(%rbx) + jb L(b1) + jz L(b2) + +L(b3): mov $0, R32(%r8) + mov %rax, %rbx + mov $0, R32(%r9) + mov 8(up), %rax + mov %rdx, %r10 + lea (up,n,8), up + not n + jmp L(L3) + +L(b0): mov $0, R32(%r10) + mov %rax, %r8 + mov %rdx, %rbx + mov 8(up), %rax + lea (up,n,8), up + neg n + jmp L(L0) + +L(b1): cmp $1, n + jz L(n1) + mov %rax, %r9 + mov 8(up), %rax + mov %rdx, %r8 + mov $0, R32(%rbx) + lea (up,n,8), up + neg n + inc n + jmp L(L1) + +L(b2): mov $0, R32(%rbx) + mov %rax, %r10 + mov %rdx, %r9 + mov 8(up), %rax + mov $0, R32(%r8) + lea (up,n,8), up + neg n + add $2, n + jns L(end) + + ALIGN(32) +L(top): mul v0 + ADDSUB %r10, (rp,n,8) + adc %rax, %r9 + mov (up,n,8), %rax + adc %rdx, %r8 +L(L1): mul v0 + mov $0, R32(%r10) + ADDSUB %r9, 8(rp,n,8) + adc %rax, %r8 + adc %rdx, %rbx + mov 8(up,n,8), %rax +L(L0): mul v0 + ADDSUB %r8, 16(rp,n,8) + mov $0, R32(%r8) + adc %rax, %rbx + mov $0, R32(%r9) + mov 16(up,n,8), %rax + 
adc %rdx, %r10 +L(L3): mul v0 + ADDSUB %rbx, 24(rp,n,8) + mov $0, R32(%rbx) + adc %rax, %r10 + adc %rdx, %r9 + mov 24(up,n,8), %rax + add $4, n + js L(top) + +L(end): mul v0 + ADDSUB %r10, (rp) + adc %r9, %rax + adc %r8, %rdx +L(n1): ADDSUB %rax, 8(rp) + adc $0, %rdx + mov %rdx, %rax + + pop %rbx +IFDOS(``pop %rdi '') +IFDOS(``pop %rsi '') + ret +EPILOGUE() +ASM_END() diff --git a/gmp/mpn/x86_64/bd1/com.asm b/gmp/mpn/x86_64/bd1/com.asm new file mode 100644 index 0000000000..43f356117a --- /dev/null +++ b/gmp/mpn/x86_64/bd1/com.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_com optimised for AMD bd1. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_com) +include_mpn(`x86_64/fastsse/com-palignr.asm') diff --git a/gmp/mpn/x86_64/bd1/copyd.asm b/gmp/mpn/x86_64/bd1/copyd.asm new file mode 100644 index 0000000000..675cdc3f6b --- /dev/null +++ b/gmp/mpn/x86_64/bd1/copyd.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyd optimised for AMD bd1. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyd) +include_mpn(`x86_64/fastsse/copyd-palignr.asm') diff --git a/gmp/mpn/x86_64/bd1/copyi.asm b/gmp/mpn/x86_64/bd1/copyi.asm new file mode 100644 index 0000000000..ceef036585 --- /dev/null +++ b/gmp/mpn/x86_64/bd1/copyi.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyi optimised for AMD bd1. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyi) +include_mpn(`x86_64/fastsse/copyi-palignr.asm') diff --git a/gmp/mpn/x86_64/bd1/gcd_1.asm b/gmp/mpn/x86_64/bd1/gcd_1.asm new file mode 100644 index 0000000000..3d8e5c7ab1 --- /dev/null +++ b/gmp/mpn/x86_64/bd1/gcd_1.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_gcd_1. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_1) +include_mpn(`x86_64/core2/gcd_1.asm') diff --git a/gmp/mpn/x86_64/bd1/gmp-mparam.h b/gmp/mpn/x86_64/bd1/gmp-mparam.h new file mode 100644 index 0000000000..5014f9f469 --- /dev/null +++ b/gmp/mpn/x86_64/bd1/gmp-mparam.h @@ -0,0 +1,236 @@ +/* AMD bd1 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000-2010, 2012, 2014 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3600 MHz Bulldozer Zambezi */ +/* FFT tuning limit = 40000000 */ +/* Generated by tuneup.c, 2014-03-13, gcc 4.5 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 28 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 22 + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 59 +#define MUL_TOOM44_THRESHOLD 166 +#define MUL_TOOM6H_THRESHOLD 274 +#define MUL_TOOM8H_THRESHOLD 333 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 115 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 150 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 22 +#define SQR_TOOM3_THRESHOLD 85 +#define SQR_TOOM4_THRESHOLD 242 +#define SQR_TOOM6_THRESHOLD 318 +#define SQR_TOOM8_THRESHOLD 478 + +#define MULMID_TOOM42_THRESHOLD 22 + +#define MULMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 14 + +#define MUL_FFT_MODF_THRESHOLD 404 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 404, 5}, { 19, 6}, { 11, 5}, { 23, 6}, \ + { 19, 7}, { 10, 6}, { 25, 7}, { 15, 6}, \ + { 31, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 
79,11}, \ + { 47,10}, { 103,12}, { 31,11}, { 63,10}, \ + { 135,11}, { 79,10}, { 167,11}, { 95,10}, \ + { 191,11}, { 111,12}, { 63,11}, { 127,10}, \ + { 255,11}, { 143,10}, { 287, 9}, { 575,10}, \ + { 303,11}, { 159,12}, { 95,11}, { 191,10}, \ + { 383,11}, { 207,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 271,10}, { 543,11}, \ + { 287,10}, { 575,11}, { 303,12}, { 159,11}, \ + { 319, 9}, { 1279,11}, { 367,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,12}, { 223,11}, \ + { 447,10}, { 895,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \ + { 575,10}, { 1151,11}, { 607,10}, { 1215,12}, \ + { 319,10}, { 1279,11}, { 671,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,10}, { 1663,12}, { 447,11}, \ + { 895,14}, { 127,13}, { 255,12}, { 511,11}, \ + { 1023,12}, { 543,11}, { 1087,10}, { 2175,12}, \ + { 575,11}, { 1151,12}, { 607,11}, { 1215,13}, \ + { 319,11}, { 1279,12}, { 671,11}, { 1343,10}, \ + { 2687,12}, { 703,11}, { 1407,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 799,11}, { 1599,12}, \ + { 831,11}, { 1663,13}, { 447,12}, { 895,11}, \ + { 1791,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \ + { 2431,10}, { 4863,12}, { 1343,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 767,12}, { 1599,13}, \ + { 831,12}, { 1727,11}, { 3455,13}, { 895,12}, \ + { 1791,13}, { 959,15}, { 255,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1151,12}, { 2303,13}, \ + { 1215,12}, { 2431,11}, { 4863,13}, { 1343,12}, \ + { 2687,13}, { 1471,12}, { 2943,11}, { 5887,14}, \ + { 767,13}, { 1599,12}, { 3199,13}, { 1727,14}, \ + { 895,13}, { 1791,12}, { 3583,13}, { 1919,12}, \ + { 3839,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2303,12}, { 4607,13}, { 2431,12}, \ + { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ + { 2943,12}, { 5887,15}, { 767,14}, { 1535,13}, \ + { 3199,14}, { 1663,13}, { 3455,12}, { 6911,14}, \ + { 1791,13}, { 3583,14}, { 1919,13}, { 3839,16}, \ + { 
511,15}, { 1023,14}, { 2175,13}, { 4351,14}, \ + { 2303,13}, { 4607,14}, { 2431,13}, { 4863,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 217 +#define MUL_FFT_THRESHOLD 3712 + +#define SQR_FFT_MODF_THRESHOLD 380 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 380, 5}, { 17, 6}, { 9, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 19, 7}, { 39, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 83,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 135,11}, { 79,10}, { 159,11}, { 95,10}, \ + { 191,11}, { 111,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271,11}, { 143,10}, \ + { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \ + { 319,12}, { 95,11}, { 191,10}, { 383,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543, 9}, { 1087,11}, { 303,10}, \ + { 607,12}, { 159,11}, { 319,10}, { 639,11}, \ + { 335,10}, { 671, 9}, { 1343,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,12}, \ + { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,11}, { 543,10}, { 1087,12}, \ + { 287,11}, { 575,10}, { 1151,11}, { 607,12}, \ + { 319,11}, { 639,10}, { 1279,11}, { 671,12}, \ + { 351,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 447,14}, { 127,13}, \ + { 255,12}, { 511,11}, { 1023,12}, { 543,11}, \ + { 1087,10}, { 2175,12}, { 575,11}, { 1151,12}, \ + { 607,13}, { 319,12}, { 639,11}, { 1279,12}, \ + { 671,11}, { 1343,10}, { 2687,12}, { 703,11}, \ + { 1407,13}, { 383,12}, { 767,11}, { 1599,10}, \ + { 3199,12}, { 831,13}, { 447,12}, { 895,14}, \ + { 255,13}, { 511,12}, { 
1087,11}, { 2175,13}, \ + { 575,12}, { 1215,11}, { 2431,10}, { 4863,13}, \ + { 639,12}, { 1343,11}, { 2687,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 767,12}, { 1599,11}, \ + { 3199,13}, { 831,12}, { 1727,13}, { 895,15}, \ + { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \ + { 1215,12}, { 2431,11}, { 4863,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1471,12}, { 2943,11}, \ + { 5887,14}, { 767,13}, { 1599,12}, { 3199,13}, \ + { 1727,14}, { 895,13}, { 1791,12}, { 3583,13}, \ + { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2303,12}, { 4607,13}, \ + { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \ + { 1407,13}, { 2943,12}, { 5887,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3327,12}, \ + { 6655,13}, { 3455,12}, { 6911,14}, { 1791,13}, \ + { 3583,14}, { 1919,13}, { 3839,16}, { 511,15}, \ + { 1023,14}, { 2175,13}, { 4351,14}, { 2303,13}, \ + { 4607,14}, { 2431,13}, { 4863,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 220 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 39 +#define MULLO_MUL_N_THRESHOLD 7246 + +#define DC_DIV_QR_THRESHOLD 54 +#define DC_DIVAPPR_Q_THRESHOLD 180 +#define DC_BDIV_QR_THRESHOLD 47 +#define DC_BDIV_Q_THRESHOLD 80 + +#define INV_MULMOD_BNM1_THRESHOLD 38 +#define INV_NEWTON_THRESHOLD 226 +#define INV_APPR_THRESHOLD 188 + +#define BINV_NEWTON_THRESHOLD 248 +#define REDC_1_TO_REDC_2_THRESHOLD 52 +#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */ + +#define MU_DIV_QR_THRESHOLD 1334 +#define MU_DIVAPPR_Q_THRESHOLD 1360 +#define MUPI_DIV_QR_THRESHOLD 108 +#define MU_BDIV_QR_THRESHOLD 1142 +#define MU_BDIV_Q_THRESHOLD 1360 + +#define POWM_SEC_TABLE 1,16,194,386,452,2245 + +#define MATRIX22_STRASSEN_THRESHOLD 15 +#define HGCD_THRESHOLD 108 +#define HGCD_APPR_THRESHOLD 51 +#define HGCD_REDUCE_THRESHOLD 2681 +#define 
GCD_DC_THRESHOLD 474 +#define GCDEXT_DC_THRESHOLD 298 +#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 21 +#define SET_STR_DC_THRESHOLD 418 +#define SET_STR_PRECOMPUTE_THRESHOLD 1289 + +#define FAC_DSC_THRESHOLD 252 +#define FAC_ODD_THRESHOLD 23 diff --git a/gmp/mpn/x86_64/bd1/hamdist.asm b/gmp/mpn/x86_64/bd1/hamdist.asm new file mode 100644 index 0000000000..93e1e5632b --- /dev/null +++ b/gmp/mpn/x86_64/bd1/hamdist.asm @@ -0,0 +1,38 @@ +dnl AMD64 mpn_hamdist -- hamming distance. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_hamdist) +include_mpn(`x86_64/k10/hamdist.asm') diff --git a/gmp/mpn/x86_64/bd1/mul_1.asm b/gmp/mpn/x86_64/bd1/mul_1.asm new file mode 100644 index 0000000000..e59667c085 --- /dev/null +++ b/gmp/mpn/x86_64/bd1/mul_1.asm @@ -0,0 +1,184 @@ +dnl AMD64 mpn_mul_1 optimised for AMD Bulldozer. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 4 +C AMD bobcat +C Intel P4 +C Intel core2 +C Intel NHM +C Intel SBR +C Intel atom +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Move loop code into feed-in blocks, to save insn for zeroing regs. 
+ +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0', `%rcx') C r9 + +define(`n', `%rbx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +IFDOS(` define(`up', ``%rsi'') ') dnl +IFDOS(` define(`rp', ``%rcx'') ') dnl +IFDOS(` define(`v0', ``%r9'') ') dnl +IFDOS(` define(`r9', ``rdi'') ') dnl +IFDOS(` define(`n', ``%r8'') ') dnl +IFDOS(` define(`r8', ``rbx'') ') dnl + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_1c) +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') + + mov (up), %rax C read first u limb early + push %rbx +IFSTD(` mov n_param, %r11 ') C move away n from rdx, mul uses it +IFDOS(` mov n, %r11 ') + mul v0 + +IFSTD(` add %r8, %rax ') +IFDOS(` add 64(%rsp), %rax ') C 40 + 3*8 (3 push insns) + adc $0, %rdx + jmp L(common) + +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_mul_1) +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') + + mov (up), %rax C read first u limb early + push %rbx +IFSTD(` mov n_param, %r11 ') C move away n from rdx, mul uses it +IFDOS(` mov n, %r11 ') + mul v0 + +L(common): +IFSTD(` mov %r11, n ') + + and $3, R32(%r11) + lea -16(rp,n,8), rp + jz L(b0) + cmp $2, R32(%r11) + jb L(b1) + jz L(b2) + +L(b3): mov %rax, %r10 + mov %rdx, %r11 + mov 8(up), %rax + mul v0 + lea (up,n,8), up + not n + jmp L(L3) + +L(b0): mov %rax, %r9 + mov %rdx, %r10 + mov 8(up), %rax + lea (up,n,8), up + neg n + jmp L(L0) + +L(b1): mov %rax, %r8 + cmp $1, n + jz L(n1) + mov %rdx, %r9 + lea (up,n,8), up + neg n + mov %r8, 16(rp,n,8) + inc n + jmp L(L1) + +L(b2): mov %rax, %r11 + mov %rdx, %r8 + mov 8(up), %rax + lea (up,n,8), up + neg n + add $2, n + jns L(end) + + ALIGN(16) +L(top): mul v0 + mov %rdx, %r9 + add %rax, %r8 + adc $0, %r9 + mov %r8, 8(rp,n,8) + mov %r11, (rp,n,8) +L(L1): mov (up,n,8), %rax + mul v0 + add %rax, %r9 + mov %rdx, %r10 + mov 8(up,n,8), %rax + adc $0, %r10 +L(L0): mul v0 + add %rax, %r10 + mov %rdx, %r11 + mov 16(up,n,8), %rax + adc $0, %r11 + mul v0 + 
mov %r9, 16(rp,n,8) +L(L3): add %rax, %r11 + mov %r10, 24(rp,n,8) + mov %rdx, %r8 + adc $0, %r8 + add $4, n + mov -8(up,n,8), %rax + js L(top) + +L(end): mul v0 + add %rax, %r8 + adc $0, %rdx + mov %r11, (rp) +L(n1): mov %r8, 8(rp) + mov %rdx, %rax + + pop %rbx +IFDOS(``pop %rdi '') +IFDOS(``pop %rsi '') + ret +EPILOGUE() +ASM_END() diff --git a/gmp/mpn/x86_64/bd1/mul_2.asm b/gmp/mpn/x86_64/bd1/mul_2.asm new file mode 100644 index 0000000000..4ed5f30561 --- /dev/null +++ b/gmp/mpn/x86_64/bd1/mul_2.asm @@ -0,0 +1,192 @@ +dnl AMD64 mpn_mul_2 optimised for AMD Bulldozer. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bull 4.36 average, quite fluctuating +C AMD pile 4.38 slightly fluctuating +C AMD steam +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core +C Intel NHM +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel atom +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. +C Scheme: genxmul --mul + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vp', `%rcx') C r9 + +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') +define(`n', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov (up), %rax + + mov (vp), v0 + mov 8(vp), v1 + + lea (up,n_param,8), up + lea (rp,n_param,8), rp + + mov n_param, n + mul v0 + neg n + + test $1, R8(n) + jnz L(bx1) + +L(bx0): test $2, R8(n) + jnz L(b10) + +L(b00): mov %rax, w0 + mov %rdx, w1 + xor R32(w2), R32(w2) + mov (up,n,8), %rax + jmp L(lo0) + +L(b10): mov %rax, w2 + mov %rdx, w3 + mov (up,n,8), %rax + xor R32(w0), R32(w0) + mul v1 + add $-2, n + jmp L(lo2) + +L(bx1): test $2, R8(n) + jz L(b11) + +L(b01): mov %rax, w3 + mov %rdx, w0 + mov (up,n,8), %rax + mul v1 + xor R32(w1), R32(w1) + inc n + jmp L(lo1) + +L(b11): mov %rax, w1 + mov %rdx, w2 + mov (up,n,8), %rax + xor R32(w3), R32(w3) + dec n + jmp L(lo3) + + ALIGN(32) +L(top): mov -8(up,n,8), %rax + mul v1 + mov w2, -16(rp,n,8) +L(lo1): add %rax, w0 + mov w3, -8(rp,n,8) + adc %rdx, w1 + mov (up,n,8), %rax + mul v0 + mov $0, R32(w2) + add %rax, w0 + adc %rdx, w1 + adc $0, R32(w2) + mov (up,n,8), %rax +L(lo0): mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,n,8), %rax + mul v0 + add %rax, w1 + mov w0, (rp,n,8) + mov $0, R32(w3) + mov 8(up,n,8), %rax + adc %rdx, w2 + adc $0, 
R32(w3) +L(lo3): mul v1 + add %rax, w2 + mov 16(up,n,8), %rax + adc %rdx, w3 + mul v0 + add %rax, w2 + mov 16(up,n,8), %rax + mov $0, R32(w0) + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + mov w1, 8(rp,n,8) +L(lo2): add %rax, w3 + adc %rdx, w0 + mov 24(up,n,8), %rax + mul v0 + add %rax, w3 + adc %rdx, w0 + mov $0, R32(w1) + adc $0, R32(w1) + add $4, n + jnc L(top) + +L(end): mov -8(up,n,8), %rax + mul v1 + mov w2, -16(rp,n,8) + add %rax, w0 + mov w3, -8(rp,n,8) + adc %rdx, w1 + mov w0, (rp,n,8) + mov w1, %rax + + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp/mpn/x86_64/bd1/mul_basecase.asm b/gmp/mpn/x86_64/bd1/mul_basecase.asm new file mode 100644 index 0000000000..e47ba587cd --- /dev/null +++ b/gmp/mpn/x86_64/bd1/mul_basecase.asm @@ -0,0 +1,416 @@ +dnl AMD64 mpn_mul_basecase optimised for AMD Bulldozer and Piledriver. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_1 mul_2 mul_3 addmul_2 +C AMD K8,K9 +C AMD K10 +C AMD bull ~4.8 ~4.55 - ~4.3 +C AMD pile ~4.6 ~4.55 - ~4.55 +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core +C Intel NHM +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel atom +C VIA nano + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Merge bull-specific mul_1, if it is not slower in the TOOM22 range. +C Alternatively, we could tweak the present code (which was loopmixed for a +C different CPU). +C * Merge faster mul_2, such as the one in the same directory as this file. +C * Further micro-optimise. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. +define(`I',`$1') + + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') +define(`vp', `%rcx') +define(`vn', `%r8') + +define(`un', `%rbx') + +define(`w0', `%r10') +define(`w1', `%r11') +define(`w2', `%r12') +define(`w3', `%r13') +define(`n', `%rbp') +define(`v0', `%r9') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + push %rbx + push %rbp + mov un_param, un C free up rdx + neg un C un = -un_param; used as a negative index from the operand end + + mov (up), %rax C shared for mul_1 and mul_2 + lea (up,un_param,8), up C point at operand end + lea (rp,un_param,8), rp C point at rp[un] + + mov (vp), v0 C shared for mul_1 and mul_2 + mul v0 C shared for mul_1 and mul_2 + + test $1, R8(vn) C odd vn: first pass is mul_1; even vn: first pass is mul_2 + jz L(do_mul_2) + +L(do_mul_1): + test $1, R8(un) + jnz L(m1x1) + +L(m1x0):mov %rax, w0 C un = 2, 4, 6, 8, ... + mov %rdx, w1 + mov 8(up,un,8), %rax + test $2, R8(un) + jnz L(m110) + +L(m100):lea 2(un), n C un = 4, 8, 12, ... + jmp L(m1l0) + +L(m110):lea (un), n C un = 2, 6, 10, ... + jmp L(m1l2) + +L(m1x1):mov %rax, w1 C un = 1, 3, 5, 7, ...
+ mov %rdx, w0 + test $2, R8(un) + jz L(m111) + +L(m101):lea 3(un), n C un = 1, 5, 9, ... + test n, n C n = 3 - un_param at this point + js L(m1l1) C negative: more limbs remain, enter the loop + mov %rax, -8(rp) C single-limb u: store the two-limb product and return + mov %rdx, (rp) C NOTE(review): returns without consulting vn; presumably vn is at most un_param, confirm + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(m111):lea 1(un), n C un = 3, 7, 11, ... + mov 8(up,un,8), %rax + jmp L(m1l3) + + ALIGN(16) +L(m1tp):mov %rdx, w0 + add %rax, w1 +L(m1l1):mov -16(up,n,8), %rax + adc $0, w0 + mul v0 + add %rax, w0 + mov w1, -24(rp,n,8) + mov -8(up,n,8), %rax + mov %rdx, w1 + adc $0, w1 +L(m1l0):mul v0 + mov w0, -16(rp,n,8) + add %rax, w1 + mov %rdx, w0 + mov (up,n,8), %rax + adc $0, w0 +L(m1l3):mul v0 + mov w1, -8(rp,n,8) + mov %rdx, w1 + add %rax, w0 + mov 8(up,n,8), %rax + adc $0, w1 +L(m1l2):mul v0 + mov w0, (rp,n,8) + add $4, n + jnc L(m1tp) + +L(m1ed):add %rax, w1 + adc $0, %rdx + mov w1, I(-8(rp),-24(rp,n,8)) + mov %rdx, I((rp),-16(rp,n,8)) + + dec R32(vn) C one v limb consumed by the mul_1 pass + jz L(ret2) C only rbx and rbp have been pushed on this path + + lea 8(vp), vp + lea 8(rp), rp + push %r12 C r12, r13, r14 back w2, w3, X0 in the do_addmul phase + push %r13 + push %r14 + jmp L(do_addmul) + +L(do_mul_2): +define(`v1', `%r14') + push %r12 + push %r13 + push %r14 + + mov 8(vp), v1 C v1 = vp[1] + + test $1, R8(un) + jnz L(m2b1) + +L(m2b0):lea (un), n + mov %rax, w2 C 0 + mov (up,un,8), %rax + mov %rdx, w1 C 1 + mul v1 + mov %rax, w0 C 1 + mov w2, (rp,un,8) C 0 + mov 8(up,un,8), %rax + mov %rdx, w2 C 2 + jmp L(m2l0) + +L(m2b1):lea 1(un), n + mov %rax, w0 C 1 + mov %rdx, w3 C 2 + mov (up,un,8), %rax + mul v1 + mov w0, (rp,un,8) C 1 + mov %rdx, w0 C 3 + mov %rax, w2 C 0 + mov 8(up,un,8), %rax + jmp L(m2l1) + + ALIGN(32) +L(m2tp):add %rax, w2 C 0 + mov (up,n,8), %rax + adc $0, w0 C 1 +L(m2l1):mul v0 + add %rax, w2 C 0 + mov (up,n,8), %rax + mov %rdx, w1 C 1 + adc $0, w1 C 1 + mul v1 + add w3, w2 C 0 + adc $0, w1 C 1 + add %rax, w0 C 1 + mov w2, (rp,n,8) C 0 + mov 8(up,n,8), %rax + mov %rdx, w2 C 2 + adc $0, w2 C 2 +L(m2l0):mul v0 + add %rax, w0 C 1 + mov %rdx, w3 C 2 + adc $0, w3 C 2 + add w1, w0 C 1 + adc $0, w3 C 2 + mov 8(up,n,8), %rax + mul v1 + add $2, n + mov w0, -8(rp,n,8) C 1 + mov %rdx, w0 C 3 + jnc L(m2tp) C tests CF of the add $2 above; the movs in between leave flags alone + +L(m2ed):add
%rax, w2 + adc $0, %rdx + add w3, w2 + adc $0, %rdx + mov w2, I((rp),(rp,n,8)) + mov %rdx, I(8(rp),8(rp,n,8)) + + add $-2, R32(vn) C two v limbs consumed by the mul_2 pass + jz L(ret5) + + lea 16(vp), vp + lea 16(rp), rp + + +L(do_addmul): + push %r15 + push vn C save vn in new stack slot +define(`vn', `(%rsp)') +define(`X0', `%r14') +define(`X1', `%r15') +define(`v1', `%r8') C r8 is free now that vn lives on the stack + +L(outer): + mov (vp), v0 C load the next pair of v limbs + mov 8(vp), v1 + + mov (up,un,8), %rax + mul v0 + + test $1, R8(un) + jnz L(bx1) + +L(bx0): mov %rax, X1 + mov (up,un,8), %rax + mov %rdx, X0 + mul v1 + test $2, R8(un) + jnz L(b10) + +L(b00): lea (un), n C un = 4, 8, 12, ... + mov (rp,un,8), w3 + mov %rax, w0 + mov 8(up,un,8), %rax + mov %rdx, w1 + jmp L(lo0) + +L(b10): lea 2(un), n C un = 2, 6, 10, ... + mov (rp,un,8), w1 + mov %rdx, w3 + mov %rax, w2 + mov 8(up,un,8), %rax + jmp L(lo2) + +L(bx1): mov %rax, X0 + mov (up,un,8), %rax + mov %rdx, X1 + mul v1 + test $2, R8(un) + jz L(b11) + +L(b01): lea 1(un), n C un = 1, 5, 9, ... + mov (rp,un,8), w2 + mov %rdx, w0 + mov %rax, w3 + jmp L(lo1) + +L(b11): lea -1(un), n C un = 3, 7, 11, ...
+ mov (rp,un,8), w0 + mov %rax, w1 + mov 8(up,un,8), %rax + mov %rdx, w2 + jmp L(lo3) + + ALIGN(32) +L(top): +L(lo2): mul v0 + add w1, X1 + mov X1, -16(rp,n,8) + mov %rdx, X1 + adc %rax, X0 + adc $0, X1 + mov -8(up,n,8), %rax + mul v1 + mov -8(rp,n,8), w1 + mov %rdx, w0 + add w1, w2 + adc %rax, w3 + adc $0, w0 +L(lo1): mov (up,n,8), %rax + mul v0 + add w2, X0 + mov X0, -8(rp,n,8) + mov %rdx, X0 + adc %rax, X1 + mov (up,n,8), %rax + adc $0, X0 + mov (rp,n,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + mov 8(up,n,8), %rax + mov %rdx, w1 + adc $0, w1 +L(lo0): mul v0 + add w3, X1 + mov X1, (rp,n,8) + adc %rax, X0 + mov 8(up,n,8), %rax + mov %rdx, X1 + adc $0, X1 + mov 8(rp,n,8), w3 + mul v1 + add w3, w0 + adc %rax, w1 + mov 16(up,n,8), %rax + mov %rdx, w2 + adc $0, w2 +L(lo3): mul v0 + add w0, X0 + mov X0, 8(rp,n,8) + mov %rdx, X0 + adc %rax, X1 + adc $0, X0 + mov 16(up,n,8), %rax + mov 16(rp,n,8), w0 + mul v1 + mov %rdx, w3 + add w0, w1 + adc %rax, w2 + adc $0, w3 + mov 24(up,n,8), %rax + add $4, n + jnc L(top) + +L(end): mul v0 + add w1, X1 + mov X1, I(-16(rp),-16(rp,n,8)) + mov %rdx, X1 + adc %rax, X0 + adc $0, X1 + mov I(-8(up),-8(up,n,8)), %rax + mul v1 + mov I(-8(rp),-8(rp,n,8)), w1 + add w1, w2 + adc %rax, w3 + adc $0, %rdx + add w2, X0 + adc $0, X1 + mov X0, I(-8(rp),-8(rp,n,8)) + add w3, X1 + mov X1, I((rp),(rp,n,8)) + adc $0, %rdx + mov %rdx, I(8(rp),8(rp,n,8)) + + + addl $-2, vn C two more v limbs done; ZF from this add drives the jnz below + lea 16(vp), vp + lea 16(rp), rp + jnz L(outer) C the two lea above do not modify flags + + pop %rax C deallocate vn slot + pop %r15 +L(ret5):pop %r14 + pop %r13 + pop %r12 +L(ret2):pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp/mpn/x86_64/bd1/popcount.asm b/gmp/mpn/x86_64/bd1/popcount.asm new file mode 100644 index 0000000000..8f22a715b6 --- /dev/null +++ b/gmp/mpn/x86_64/bd1/popcount.asm @@ -0,0 +1,38 @@ +dnl AMD64 mpn_popcount -- population count. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_popcount) +include_mpn(`x86_64/k10/popcount.asm') diff --git a/gmp/mpn/x86_64/bd1/sec_tabselect.asm b/gmp/mpn/x86_64/bd1/sec_tabselect.asm new file mode 100644 index 0000000000..e4360341d9 --- /dev/null +++ b/gmp/mpn/x86_64/bd1/sec_tabselect.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sec_tabselect. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sec_tabselect) +include_mpn(`x86_64/fastsse/sec_tabselect.asm') diff --git a/gmp/mpn/x86_64/bd1/sublsh1_n.asm b/gmp/mpn/x86_64/bd1/sublsh1_n.asm new file mode 100644 index 0000000000..4ba673d15a --- /dev/null +++ b/gmp/mpn/x86_64/bd1/sublsh1_n.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_sublsh1_n + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sublsh1_n mpn_sublsh1_nc) +include_mpn(`x86_64/atom/sublsh1_n.asm') |