summaryrefslogtreecommitdiff
path: root/gmp/mpn/x86_64/coreihwl
diff options
context:
space:
mode:
Diffstat (limited to 'gmp/mpn/x86_64/coreihwl')
-rw-r--r--gmp/mpn/x86_64/coreihwl/addmul_2.asm238
-rw-r--r--gmp/mpn/x86_64/coreihwl/aorsmul_1.asm198
-rw-r--r--gmp/mpn/x86_64/coreihwl/gmp-mparam.h237
-rw-r--r--gmp/mpn/x86_64/coreihwl/mul_1.asm155
-rw-r--r--gmp/mpn/x86_64/coreihwl/mul_2.asm173
-rw-r--r--gmp/mpn/x86_64/coreihwl/mul_basecase.asm441
-rw-r--r--gmp/mpn/x86_64/coreihwl/mullo_basecase.asm426
-rw-r--r--gmp/mpn/x86_64/coreihwl/redc_1.asm433
-rw-r--r--gmp/mpn/x86_64/coreihwl/sqr_basecase.asm506
9 files changed, 0 insertions, 2807 deletions
diff --git a/gmp/mpn/x86_64/coreihwl/addmul_2.asm b/gmp/mpn/x86_64/coreihwl/addmul_2.asm
deleted file mode 100644
index 54aebc888d..0000000000
--- a/gmp/mpn/x86_64/coreihwl/addmul_2.asm
+++ /dev/null
@@ -1,238 +0,0 @@
-dnl AMD64 mpn_addmul_2 optimised for Intel Haswell.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 n/a
-C AMD K10 n/a
-C AMD bull n/a
-C AMD pile n/a
-C AMD steam ?
-C AMD bobcat n/a
-C AMD jaguar ?
-C Intel P4 n/a
-C Intel core n/a
-C Intel NHM n/a
-C Intel SBR n/a
-C Intel IBR n/a
-C Intel HWL 2.15
-C Intel BWL ?
-C Intel atom n/a
-C VIA nano n/a
-
-C The loop of this code is the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n_param',`%rdx')
-define(`vp', `%rcx')
-
-define(`v0', `%r8')
-define(`v1', `%r9')
-define(`w0', `%rbx')
-define(`w1', `%rcx')
-define(`w2', `%rbp')
-define(`w3', `%r10')
-define(`n', `%r11')
-define(`X0', `%r12')
-define(`X1', `%r13')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_addmul_2) C add {up,n_param} * {vp,2} into the limbs at rp; the final carry limb is left in %rax
- FUNC_ENTRY(4) C macro defined outside this file; presumably the DOS64 argument shuffle -- confirm
- push %rbx C callee-saved, used as w0
- push %rbp C callee-saved, used as w2
- push %r12 C callee-saved, used as X0
- push %r13 C callee-saved, used as X1
-
- mov (vp), v0 C fetch both multiplier limbs up front: vp and w1 share %rcx
- mov 8(vp), v1
-
- mov n_param, n
- shr $2, n C n = n_param / 4; the main loop consumes four up limbs per pass
-
- test $1, R8(n_param) C dispatch on n_param mod 4, low bit first
- jnz L(bx1)
-
-L(bx0): mov (rp), X0 C n_param even
- mov 8(rp), X1
- test $2, R8(n_param)
- jnz L(b10)
-
-L(b00): mov (up), %rdx C n_param mod 4 == 0; %rdx is the implicit mulx operand, so n_param is dead from here
- lea 16(up), up
- mulx( v0, %rax, w1)
- add %rax, X0
- mulx( v1, %rax, w2)
- adc $0, w1 C relies on mulx leaving the carry from the add above intact
- mov X0, (rp)
- add %rax, X1
- adc $0, w2
- mov -8(up), %rdx
- lea 16(rp), rp
- jmp L(lo0)
-
-L(b10): mov (up), %rdx C n_param mod 4 == 2
- inc n C offsets the dec n that directly follows L(lo2)
- mulx( v0, %rax, w1)
- add %rax, X0
- adc $0, w1
- mulx( v1, %rax, w2)
- mov X0, (rp)
- mov 16(rp), X0
- add %rax, X1
- adc $0, w2
- xor w0, w0 C w0 = 0: it is added into X1 at L(top) or L(end)
- jmp L(lo2)
-
-L(bx1): mov (rp), X1 C n_param odd; X0 and X1 swap roles relative to L(bx0)
- mov 8(rp), X0
- test $2, R8(n_param)
- jnz L(b11)
-
-L(b01): mov (up), %rdx C n_param mod 4 == 1
- mulx( v0, %rax, w3)
- add %rax, X1
- adc $0, w3
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- mov 8(up), %rdx
- mov X1, (rp)
- mov 16(rp), X1
- mulx( v0, %rax, w1)
- lea 24(rp), rp
- lea 24(up), up
- jmp L(lo1)
-
-L(b11): mov (up), %rdx C n_param mod 4 == 3
- inc n C one extra pass through the dec n at the loop tail
- mulx( v0, %rax, w3)
- add %rax, X1
- adc $0, w3
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- mov X1, (rp)
- mov 8(up), %rdx
- mulx( v0, %rax, w1)
- lea 8(rp), rp
- lea 8(up), up
- jmp L(lo3)
-
- ALIGN(16)
-L(top): mulx( v0, %rax, w3) C main loop: four up limbs per pass; entered mid-body at L(lo0)..L(lo3)
- add w0, X1
- adc $0, w2
- add %rax, X1
- adc $0, w3
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- lea 32(rp), rp
- add w1, X1
- mov -16(up), %rdx
- mov X1, -24(rp)
- adc $0, w3
- add w2, X0
- mov -8(rp), X1
- mulx( v0, %rax, w1)
- adc $0, w0
-L(lo1): add %rax, X0
- mulx( v1, %rax, w2)
- adc $0, w1
- add w3, X0
- mov X0, -16(rp)
- adc $0, w1
- add %rax, X1
- adc $0, w2
- add w0, X1
- mov -8(up), %rdx
- adc $0, w2
-L(lo0): mulx( v0, %rax, w3)
- add %rax, X1
- adc $0, w3
- mov (rp), X0
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- add w1, X1
- mov X1, -8(rp)
- adc $0, w3
- mov (up), %rdx
- add w2, X0
- mulx( v0, %rax, w1)
- adc $0, w0
-L(lo3): add %rax, X0
- adc $0, w1
- mulx( v1, %rax, w2)
- add w3, X0
- mov 8(rp), X1
- mov X0, (rp)
- mov 16(rp), X0
- adc $0, w1
- add %rax, X1
- adc $0, w2
-L(lo2): mov 8(up), %rdx C load the up limb for the next pass, or for the wind-down at L(end)
- lea 32(up), up
- dec n
- jnz L(top)
-
-L(end): mulx( v0, %rax, w3) C wind-down: products of the last up limb loaded at L(lo2)
- add w0, X1
- adc $0, w2
- add %rax, X1
- adc $0, w3
- mulx( v1, %rdx, %rax) C expected: low half to %rdx, high half to %rax (mulx macro is defined elsewhere)
- add w1, X1
- mov X1, 8(rp)
- adc $0, w3
- add w2, %rdx
- adc $0, %rax
- add w3, %rdx
- mov %rdx, 16(rp)
- adc $0, %rax C %rax now holds the final carry limb
-
- pop %r13 C restore in reverse push order
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreihwl/aorsmul_1.asm b/gmp/mpn/x86_64/coreihwl/aorsmul_1.asm
deleted file mode 100644
index fd5a26d00f..0000000000
--- a/gmp/mpn/x86_64/coreihwl/aorsmul_1.asm
+++ /dev/null
@@ -1,198 +0,0 @@
-dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Haswell.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 n/a
-C AMD K10 n/a
-C AMD bull n/a
-C AMD pile n/a
-C AMD steam ?
-C AMD bobcat n/a
-C AMD jaguar ?
-C Intel P4 n/a
-C Intel core n/a
-C Intel NHM n/a
-C Intel SBR n/a
-C Intel IBR n/a
-C Intel HWL 2.32
-C Intel BWL ?
-C Intel atom n/a
-C VIA nano n/a
-
-C The loop of this code is the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C TODO
-C * Handle small n separately, for lower overhead.
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`n_param', `%rdx') C r8
-define(`v0_param',`%rcx') C r9
-
-define(`n', `%rbp')
-define(`v0', `%rdx')
-
-ifdef(`OPERATION_addmul_1',`
- define(`ADDSUB', `add')
- define(`ADCSBB', `adc')
- define(`func', `mpn_addmul_1')
-')
-ifdef(`OPERATION_submul_1',`
- define(`ADDSUB', `sub')
- define(`ADCSBB', `sbb')
- define(`func', `mpn_submul_1')
-')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(func) C func is mpn_addmul_1 or mpn_submul_1, chosen by the OPERATION_* ifdefs above
- FUNC_ENTRY(4) C macro defined outside this file; presumably the DOS64 argument shuffle -- confirm
- push %rbx C callee-saved registers used as temporaries below
- push %rbp C becomes n
- push %r12
- push %r13
-
- mov n_param, n C copy n first: n_param and v0 are both %rdx
- mov v0_param, v0 C v0 sits in %rdx, the implicit mulx operand
-
- test $1, R8(n) C dispatch on n mod 4, low bit first
- jnz L(bx1)
-
-L(bx0): shr $2, n C n = n / 4; CF = old bit 1, ZF = (n / 4 == 0)
- jc L(b10)
-
-L(b00): mulx( (up), %r13, %r12) C n mod 4 == 0
- mulx( 8,(up), %rbx, %rax)
- add %r12, %rbx
- adc $0, %rax
- mov (rp), %r12
- mov 8(rp), %rcx
- mulx( 16,(up), %r9, %r8)
- lea -16(rp), rp
- lea 16(up), up
- ADDSUB %r13, %r12 C starts the carry/borrow chain continued by ADCSBB at L(lo0)
- jmp L(lo0)
-
-L(bx1): shr $2, n C same shift as at L(bx0): CF = old bit 1, ZF = (n / 4 == 0)
- jc L(b11)
-
-L(b01): mulx( (up), %r11, %r10) C n mod 4 == 1
- jnz L(gt1) C ZF is still the one from shr; relies on mulx not touching flags
-L(n1): ADDSUB %r11, (rp) C n == 1: single limb updated in memory
- mov $0, R32(%rax) C mov rather than xor: CF from ADDSUB must survive
- adc %r10, %rax C result limb = high product half + carry/borrow
- jmp L(ret)
-
-L(gt1): mulx( 8,(up), %r13, %r12)
- mulx( 16,(up), %rbx, %rax)
- lea 24(up), up
- add %r10, %r13
- adc %r12, %rbx
- adc $0, %rax
- mov (rp), %r10
- mov 8(rp), %r12
- mov 16(rp), %rcx
- lea -8(rp), rp
- ADDSUB %r11, %r10
- jmp L(lo1)
-
-L(b11): mulx( (up), %rbx, %rax) C n mod 4 == 3
- mov (rp), %rcx
- mulx( 8,(up), %r9, %r8)
- lea 8(up), up
- lea -24(rp), rp
- inc n C adjust n
- ADDSUB %rbx, %rcx
- jmp L(lo3)
-
-L(b10): mulx( (up), %r9, %r8) C n mod 4 == 2
- mulx( 8,(up), %r11, %r10)
- lea -32(rp), rp
- mov $0, R32(%rax) C mov rather than xor: ZF from shr is tested below
- clc C clear cf
- jz L(end) C depends on old shift
-
- ALIGN(16)
-L(top): adc %rax, %r9 C main loop: four limbs per pass; CF is carried over from the previous pass
- lea 32(rp), rp
- adc %r8, %r11
- mulx( 16,(up), %r13, %r12)
- mov (rp), %r8
- mulx( 24,(up), %rbx, %rax)
- lea 32(up), up
- adc %r10, %r13
- adc %r12, %rbx
- adc $0, %rax
- mov 8(rp), %r10
- mov 16(rp), %r12
- ADDSUB %r9, %r8
- mov 24(rp), %rcx
- mov %r8, (rp)
- ADCSBB %r11, %r10
-L(lo1): mulx( (up), %r9, %r8)
- mov %r10, 8(rp)
- ADCSBB %r13, %r12
-L(lo0): mov %r12, 16(rp)
- ADCSBB %rbx, %rcx
-L(lo3): mulx( 8,(up), %r11, %r10)
- mov %rcx, 24(rp)
- dec n C dec leaves CF alone, so the ADCSBB carry/borrow reaches L(top) or L(end)
- jnz L(top)
-
-L(end): adc %rax, %r9 C wind-down: fold the last two products into 32(rp) and 40(rp)
- adc %r8, %r11
- mov 32(rp), %r8
- mov %r10, %rax
- adc $0, %rax
- mov 40(rp), %r10
- ADDSUB %r9, %r8
- mov %r8, 32(rp)
- ADCSBB %r11, %r10
- mov %r10, 40(rp)
- adc $0, %rax C %rax = final carry/borrow limb
-
-L(ret): pop %r13 C restore in reverse push order
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreihwl/gmp-mparam.h b/gmp/mpn/x86_64/coreihwl/gmp-mparam.h
deleted file mode 100644
index eef44b3a81..0000000000
--- a/gmp/mpn/x86_64/coreihwl/gmp-mparam.h
+++ /dev/null
@@ -1,237 +0,0 @@
-/* Haswell gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2014 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-/* 2900 MHz Core i5 Haswell */
-/* FFT tuning limit = 75000000 */
-/* Generated by tuneup.c, 2014-03-12, gcc 4.5 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 26
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1_NORM_THRESHOLD 1
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 25
-
-#define MUL_TOOM22_THRESHOLD 22
-#define MUL_TOOM33_THRESHOLD 74
-#define MUL_TOOM44_THRESHOLD 195
-#define MUL_TOOM6H_THRESHOLD 298
-#define MUL_TOOM8H_THRESHOLD 406
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 121
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 128
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 132
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 170
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 34
-#define SQR_TOOM3_THRESHOLD 117
-#define SQR_TOOM4_THRESHOLD 336
-#define SQR_TOOM6_THRESHOLD 426
-#define SQR_TOOM8_THRESHOLD 562
-
-#define MULMID_TOOM42_THRESHOLD 42
-
-#define MULMOD_BNM1_THRESHOLD 13
-#define SQRMOD_BNM1_THRESHOLD 17
-
-#define MUL_FFT_MODF_THRESHOLD 376 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 376, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
- { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
- { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \
- { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
- { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
- { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
- { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
- { 15, 9}, { 39,10}, { 23, 9}, { 55,11}, \
- { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
- { 83,10}, { 47, 9}, { 95,10}, { 55,11}, \
- { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
- { 31,11}, { 63,10}, { 127, 9}, { 255,10}, \
- { 135,11}, { 79,10}, { 159, 9}, { 319,10}, \
- { 167,11}, { 95,10}, { 191, 9}, { 383,11}, \
- { 111,12}, { 63,11}, { 127,10}, { 255, 9}, \
- { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \
- { 287, 9}, { 575,10}, { 303, 9}, { 607,11}, \
- { 159,10}, { 319, 9}, { 639,12}, { 95,11}, \
- { 191,10}, { 383,11}, { 207,10}, { 415,13}, \
- { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
- { 271,10}, { 543, 9}, { 1087,11}, { 287,10}, \
- { 607,12}, { 159,11}, { 319,10}, { 639,11}, \
- { 335,10}, { 671,11}, { 351,10}, { 703,11}, \
- { 367,12}, { 191,11}, { 383,10}, { 767,11}, \
- { 415,10}, { 831,12}, { 223,11}, { 447,10}, \
- { 895,11}, { 479,13}, { 127,12}, { 255,11}, \
- { 511,10}, { 1023,11}, { 543,10}, { 1087,12}, \
- { 287,11}, { 575,10}, { 1151,11}, { 607,12}, \
- { 319,11}, { 639,10}, { 1279,11}, { 671,12}, \
- { 351,11}, { 703,10}, { 1407,11}, { 735,13}, \
- { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
- { 831,10}, { 1663,12}, { 447,11}, { 895,12}, \
- { 479,14}, { 127,12}, { 511,11}, { 1023,12}, \
- { 543,11}, { 1087,12}, { 575,11}, { 1151,12}, \
- { 607,11}, { 1215,13}, { 319,12}, { 671,11}, \
- { 1343,12}, { 703,11}, { 1407,12}, { 735,13}, \
- { 383,12}, { 767,11}, { 1535,12}, { 831,13}, \
- { 447,12}, { 959,11}, { 1919,13}, { 511,12}, \
- { 1087,13}, { 575,12}, { 1215,13}, { 639,12}, \
- { 1343,13}, { 703,12}, { 1407,11}, { 2815,14}, \
- { 383,13}, { 767,12}, { 1535,13}, { 831,12}, \
- { 1727,13}, { 959,12}, { 1919,14}, { 511,13}, \
- { 1023,12}, { 2047,13}, { 1087,12}, { 2175,13}, \
- { 1215,12}, { 2431,14}, { 639,13}, { 1279,12}, \
- { 2559,13}, { 1343,12}, { 2687,13}, { 1407,12}, \
- { 2815,13}, { 1471,12}, { 2943,14}, { 767,13}, \
- { 1535,12}, { 3071,13}, { 1727,14}, { 895,13}, \
- { 1791,12}, { 3583,13}, { 1919,15}, { 511,14}, \
- { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \
- { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \
- { 2943,15}, { 767,14}, { 1535,13}, { 3199,14}, \
- { 1663,13}, { 3455,12}, { 6911,14}, { 1791,13}, \
- { 3583,14}, { 1919,16}, { 511,15}, { 1023,14}, \
- { 2175,13}, { 4351,14}, { 2431,13}, { 4863,15}, \
- { 1279,14}, { 2943,13}, { 5887,15}, { 1535,14}, \
- { 3455,13}, { 6911,15}, { 1791,14}, { 3839,13}, \
- { 7679,16}, { 1023,15}, { 2047,14}, { 4351,15}, \
- { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
- { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
- {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 237
-#define MUL_FFT_THRESHOLD 4224
-
-#define SQR_FFT_MODF_THRESHOLD 344 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 344, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 10, 5}, { 21, 6}, { 21, 7}, { 11, 6}, \
- { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \
- { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \
- { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
- { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
- { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
- { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
- { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
- { 79,10}, { 55,11}, { 31,10}, { 79,11}, \
- { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
- { 127, 9}, { 255, 8}, { 511,10}, { 135,11}, \
- { 79,10}, { 159, 9}, { 319,11}, { 95,10}, \
- { 191, 9}, { 383,11}, { 111,12}, { 63,11}, \
- { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
- { 543,11}, { 143,10}, { 287, 9}, { 575,10}, \
- { 303, 9}, { 607,11}, { 159,10}, { 319, 9}, \
- { 639,12}, { 95,11}, { 191,10}, { 383, 9}, \
- { 767,11}, { 207,10}, { 415,13}, { 63,12}, \
- { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
- { 543, 9}, { 1087,10}, { 575,11}, { 303,10}, \
- { 607,11}, { 319,10}, { 671,11}, { 351,10}, \
- { 735,11}, { 383,10}, { 767,11}, { 415,10}, \
- { 831,11}, { 447,10}, { 895,11}, { 479,13}, \
- { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \
- { 607,10}, { 1215,11}, { 671,12}, { 351,11}, \
- { 735,12}, { 383,11}, { 767,12}, { 415,11}, \
- { 831,10}, { 1663,12}, { 447,11}, { 895,12}, \
- { 479,14}, { 127,12}, { 511,11}, { 1023,12}, \
- { 543,11}, { 1087,12}, { 607,11}, { 1215,13}, \
- { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \
- { 1343,12}, { 735,13}, { 383,12}, { 767,11}, \
- { 1535,12}, { 831,13}, { 447,12}, { 959,13}, \
- { 511,12}, { 1087,13}, { 575,12}, { 1215,13}, \
- { 639,12}, { 1343,13}, { 703,12}, { 1407,14}, \
- { 383,13}, { 767,12}, { 1535,13}, { 831,12}, \
- { 1663,13}, { 959,14}, { 511,13}, { 1087,12}, \
- { 2175,13}, { 1215,12}, { 2431,14}, { 639,13}, \
- { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \
- { 1471,14}, { 767,13}, { 1599,12}, { 3199,13}, \
- { 1663,14}, { 895,13}, { 1791,12}, { 3583,15}, \
- { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
- { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \
- { 1407,13}, { 2815,15}, { 767,14}, { 1535,13}, \
- { 3199,14}, { 1663,13}, { 3455,12}, { 6911,14}, \
- { 1791,13}, { 3583,16}, { 511,15}, { 1023,14}, \
- { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \
- { 5887,15}, { 1535,14}, { 3455,13}, { 6911,15}, \
- { 1791,14}, { 3839,16}, { 1023,15}, { 2047,14}, \
- { 4223,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
- { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
- {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 206
-#define SQR_FFT_THRESHOLD 3712
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 78
-#define MULLO_MUL_N_THRESHOLD 8207
-
-#define DC_DIV_QR_THRESHOLD 63
-#define DC_DIVAPPR_Q_THRESHOLD 195
-#define DC_BDIV_QR_THRESHOLD 56
-#define DC_BDIV_Q_THRESHOLD 128
-
-#define INV_MULMOD_BNM1_THRESHOLD 42
-#define INV_NEWTON_THRESHOLD 199
-#define INV_APPR_THRESHOLD 181
-
-#define BINV_NEWTON_THRESHOLD 236
-#define REDC_1_TO_REDC_2_THRESHOLD 47
-#define REDC_2_TO_REDC_N_THRESHOLD 62
-
-#define MU_DIV_QR_THRESHOLD 1470
-#define MU_DIVAPPR_Q_THRESHOLD 1589
-#define MUPI_DIV_QR_THRESHOLD 78
-#define MU_BDIV_QR_THRESHOLD 1442
-#define MU_BDIV_Q_THRESHOLD 1470
-
-#define POWM_SEC_TABLE 3,22,194,257,1099
-
-#define MATRIX22_STRASSEN_THRESHOLD 17
-#define HGCD_THRESHOLD 112
-#define HGCD_APPR_THRESHOLD 52
-#define HGCD_REDUCE_THRESHOLD 2681
-#define GCD_DC_THRESHOLD 807
-#define GCDEXT_DC_THRESHOLD 416
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 12
-#define GET_STR_PRECOMPUTE_THRESHOLD 21
-#define SET_STR_DC_THRESHOLD 1326
-#define SET_STR_PRECOMPUTE_THRESHOLD 2627
-
-#define FAC_DSC_THRESHOLD 767
-#define FAC_ODD_THRESHOLD 0 /* always */
diff --git a/gmp/mpn/x86_64/coreihwl/mul_1.asm b/gmp/mpn/x86_64/coreihwl/mul_1.asm
deleted file mode 100644
index 1e3c338f4e..0000000000
--- a/gmp/mpn/x86_64/coreihwl/mul_1.asm
+++ /dev/null
@@ -1,155 +0,0 @@
-dnl AMD64 mpn_mul_1 using mulx optimised for Intel Haswell.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb best
-C AMD K8,K9 n/a
-C AMD K10 n/a
-C AMD bd1 n/a
-C AMD bd2 ?
-C AMD bobcat n/a
-C AMD jaguar ?
-C Intel P4 n/a
-C Intel PNR n/a
-C Intel NHM n/a
-C Intel SBR n/a
-C Intel IBR n/a
-C Intel HWL 1.57 this
-C Intel BWL ?
-C Intel atom n/a
-C VIA nano n/a
-
-C The loop of this code is the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`n_param', `%rdx') C r8
-define(`v0_param',`%rcx') C r9
-
-define(`n', `%rbp')
-define(`v0', `%rdx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_mul_1) C store {up,n_param} * v0_param at rp (rp is only written); the top limb is left in %rax
- FUNC_ENTRY(4) C macro defined outside this file; presumably the DOS64 argument shuffle -- confirm
- push %rbx C callee-saved registers used as temporaries below
- push %rbp C becomes n
- push %r12
-
- mov n_param, n
- shr $2, n C n = n_param / 4; the main loop handles four limbs per pass
-
- test $1, R8(n_param) C dispatch on n_param mod 4, low bit first
- jnz L(bx1)
-
-L(bx0): test $2, R8(n_param) C must precede the mov below, which overwrites n_param (%rdx)
- mov v0_param, v0 C mov keeps the flags from test; v0 is %rdx, the implicit mulx operand
- jnz L(b10)
-
-L(b00): mulx( (up), %r9, %r8) C n_param mod 4 == 0
- mulx( 8,(up), %r11, %r10)
- mulx( 16,(up), %rcx, %r12)
- lea -32(rp), rp
- jmp L(lo0)
-
-L(b10): mulx( (up), %rcx, %r12) C n_param mod 4 == 2
- mulx( 8,(up), %rbx, %rax)
- lea -16(rp), rp
- test n, n C n_param == 2? test also clears CF ahead of the adc chain at the target
- jz L(cj2)
- mulx( 16,(up), %r9, %r8)
- lea 16(up), up
- jmp L(lo2)
-
-L(bx1): test $2, R8(n_param) C must precede the mov below, which overwrites n_param (%rdx)
- mov v0_param, v0
- jnz L(b11)
-
-L(b01): mulx( (up), %rbx, %rax) C n_param mod 4 == 1
- lea -24(rp), rp
- test n, n C n_param == 1? then only the final store and carry fold remain
- jz L(cj1)
- mulx( 8,(up), %r9, %r8)
- lea 8(up), up
- jmp L(lo1)
-
-L(b11): mulx( (up), %r11, %r10) C n_param mod 4 == 3
- mulx( 8,(up), %rcx, %r12)
- mulx( 16,(up), %rbx, %rax)
- lea -8(rp), rp
- test n, n C n_param == 3? test also clears CF ahead of the adc chain at the target
- jz L(cj3)
- lea 24(up), up
- jmp L(lo3)
-
- ALIGN(32)
-L(top): lea 32(rp), rp C main loop: four limbs per pass; entered mid-body at L(lo0)..L(lo3)
- mov %r9, (rp)
- adc %r8, %r11
-L(lo3): mulx( (up), %r9, %r8)
- mov %r11, 8(rp)
- adc %r10, %rcx
-L(lo2): mov %rcx, 16(rp)
- adc %r12, %rbx
-L(lo1): mulx( 8,(up), %r11, %r10)
- adc %rax, %r9
- mulx( 16,(up), %rcx, %r12)
- mov %rbx, 24(rp)
-L(lo0): mulx( 24,(up), %rbx, %rax)
- lea 32(up), up
- dec n C dec leaves CF alone, so the adc chain continues into the next pass
- jnz L(top)
-
-L(end): lea 32(rp), rp C wind-down: store the products still held in registers
- mov %r9, (rp)
- adc %r8, %r11
-L(cj3): mov %r11, 8(rp) C entry for n_param == 3
- adc %r10, %rcx
-L(cj2): mov %rcx, 16(rp) C entry for n_param == 2
- adc %r12, %rbx
-L(cj1): mov %rbx, 24(rp) C entry for n_param == 1
- adc $0, %rax C %rax = top limb of the product
-
- pop %r12 C restore in reverse push order
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86_64/coreihwl/mul_2.asm b/gmp/mpn/x86_64/coreihwl/mul_2.asm
deleted file mode 100644
index 5bdb1aa645..0000000000
--- a/gmp/mpn/x86_64/coreihwl/mul_2.asm
+++ /dev/null
@@ -1,173 +0,0 @@
-dnl AMD64 mpn_mul_2 optimised for Intel Haswell.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 n/a
-C AMD K10 n/a
-C AMD bull n/a
-C AMD pile n/a
-C AMD steam ?
-C AMD bobcat n/a
-C AMD jaguar ?
-C Intel P4 n/a
-C Intel core n/a
-C Intel NHM n/a
-C Intel SBR n/a
-C Intel IBR n/a
-C Intel HWL 1.86
-C Intel BWL ?
-C Intel atom n/a
-C VIA nano n/a
-
-C The loop of this code is the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C TODO
-C * Move test and jcc together, for insn fusion.
-
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n_param',`%rdx')
-define(`vp', `%rcx')
-
-define(`v0', `%r8')
-define(`v1', `%r9')
-define(`w0', `%rbx')
-define(`w1', `%rcx')
-define(`w2', `%rbp')
-define(`w3', `%r10')
-define(`n', `%r11')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_mul_2) C store {up,n_param} * {vp,2} at rp (rp is only written); the top limb is left in %rax
- FUNC_ENTRY(4) C macro defined outside this file; presumably the DOS64 argument shuffle -- confirm
- push %rbx C callee-saved, used as w0
- push %rbp C callee-saved, used as w2
-
- mov (vp), v0 C fetch both multiplier limbs up front: vp and w1 share %rcx
- mov 8(vp), v1
-
- lea 3(n_param), n
- shr $2, n C n = (n_param + 3) / 4 loop passes
-
- test $1, R8(n_param) C dispatch on n_param mod 4, low bit first
- jnz L(bx1)
-
-L(bx0): xor w0, w0 C n_param even; w0 = 0
- test $2, R8(n_param) C must precede the mov below, which overwrites n_param (%rdx)
- mov (up), %rdx C mov and mulx are expected to keep the flags from test for the jz below
- mulx( v0, w2, w1)
- jz L(lo0) C n_param mod 4 == 0
-
-L(b10): lea -16(rp), rp C n_param mod 4 == 2: bias rp and up for the L(lo2) entry
- lea -16(up), up
- jmp L(lo2)
-
-L(bx1): xor w2, w2 C n_param odd; w2 = 0
- test $2, R8(n_param)
- mov (up), %rdx
- mulx( v0, w0, w3)
- jnz L(b11)
-
-L(b01): lea -24(rp), rp C n_param mod 4 == 1
- lea 8(up), up
- jmp L(lo1)
-
-L(b11): lea -8(rp), rp C n_param mod 4 == 3
- lea -8(up), up
- jmp L(lo3)
-
- ALIGN(16)
-L(top): mulx( v1, %rax, w0) C main loop: four up limbs per pass; entered mid-body at L(lo0)..L(lo3)
- add %rax, w2 C 0
- mov (up), %rdx
- mulx( v0, %rax, w1)
- adc $0, w0 C 1
- add %rax, w2 C 0
- adc $0, w1 C 1
- add w3, w2 C 0
-L(lo0): mov w2, (rp) C 0
- adc $0, w1 C 1
- mulx( v1, %rax, w2)
- add %rax, w0 C 1
- mov 8(up), %rdx
- adc $0, w2 C 2
- mulx( v0, %rax, w3)
- add %rax, w0 C 1
- adc $0, w3 C 2
- add w1, w0 C 1
-L(lo3): mov w0, 8(rp) C 1
- adc $0, w3 C 2
- mulx( v1, %rax, w0)
- add %rax, w2 C 2
- mov 16(up), %rdx
- mulx( v0, %rax, w1)
- adc $0, w0 C 3
- add %rax, w2 C 2
- adc $0, w1 C 3
- add w3, w2 C 2
-L(lo2): mov w2, 16(rp) C 2
- adc $0, w1 C 3
- mulx( v1, %rax, w2)
- add %rax, w0 C 3
- mov 24(up), %rdx
- adc $0, w2 C 4
- mulx( v0, %rax, w3)
- add %rax, w0 C 3
- adc $0, w3 C 4
- add w1, w0 C 3
- lea 32(up), up
-L(lo1): mov w0, 24(rp) C 3
- adc $0, w3 C 4
- dec n C dec and lea leave CF alone; the jnz below tests ZF from this dec
- lea 32(rp), rp
- jnz L(top)
-
-L(end): mulx( v1, %rdx, %rax) C wind-down: last up limb times v1; expected low half to %rdx, high half to %rax
- add %rdx, w2
- adc $0, %rax
- add w3, w2
- mov w2, (rp)
- adc $0, %rax C %rax = top limb of the product
-
- pop %rbp C restore in reverse push order
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreihwl/mul_basecase.asm b/gmp/mpn/x86_64/coreihwl/mul_basecase.asm
deleted file mode 100644
index b2656c8e9b..0000000000
--- a/gmp/mpn/x86_64/coreihwl/mul_basecase.asm
+++ /dev/null
@@ -1,441 +0,0 @@
-dnl AMD64 mpn_mul_basecase optimised for Intel Haswell.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb mul_1 mul_2 mul_3 addmul_2
-C AMD K8,K9 n/a n/a - n/a
-C AMD K10 n/a n/a - n/a
-C AMD bull n/a n/a - n/a
-C AMD pile n/a n/a - n/a
-C AMD steam ? ? - ?
-C AMD bobcat n/a n/a - n/a
-C AMD jaguar ? ? - ?
-C Intel P4 n/a n/a - n/a
-C Intel core n/a n/a - n/a
-C Intel NHM n/a n/a - n/a
-C Intel SBR n/a n/a - n/a
-C Intel IBR n/a n/a - n/a
-C Intel HWL 1.77 1.86 - 2.15
-C Intel BWL ? ? - ?
-C Intel atom n/a n/a - n/a
-C VIA nano n/a n/a - n/a
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C TODO
-C * Adjoin a mul_3.
-C * Further micro-optimise.
-
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`un_param',`%rdx')
-define(`vp', `%rcx')
-define(`vn', `%r8')
-
-define(`un', `%rbx')
-
-define(`w0', `%r10')
-define(`w1', `%r11')
-define(`w2', `%r12')
-define(`w3', `%r13')
-define(`n', `%rbp')
-define(`v0', `%r9')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mul_basecase) C store {up,un_param} * {vp,vn} at rp: one mul_1 or mul_2 pass, then addmul_2 passes
- FUNC_ENTRY(4) C macro defined outside this file; presumably the DOS64 argument shuffle -- confirm
-IFDOS(` mov 56(%rsp), %r8d ') C presumably DOS64 only: fetch the fifth argument (vn) from the stack
- push %rbx C callee-saved, becomes un
- push %rbp C callee-saved, becomes n
- push %r12 C callee-saved, used as w2
- push %r13 C callee-saved, used as w3
- push %r14 C callee-saved, used as w5, v1 and X0 in the different phases
- mov un_param, un C free up rdx
- neg un C un = -un_param from here on
-
- mov un_param, n C FIXME: share
- sar $2, n C FIXME: share
-
- test $1, R8(vn) C even vn starts with a mul_2 pass, odd vn with a mul_1 pass
- jz L(do_mul_2)
-
-define(`w4', `%r9')
-define(`w5', `%r14')
-
- mov (vp), %rdx C the single multiplier limb goes straight into the implicit mulx operand
-
-L(do_mul_1):
- test $1, R8(un) C un is negated; negation does not change the low bit
- jnz L(m1x1)
-
-L(m1x0):test $2, R8(un)
- jnz L(m110)
-
-L(m100):
- mulx( (up), w5, w2)
- mulx( 8,(up), w1, w3)
- lea -24(rp), rp
- jmp L(m1l0)
-
-L(m110):
- mulx( (up), w3, w4)
- mulx( 8,(up), w1, w5)
- lea -8(rp), rp
- test n, n C no full loop pass needed? test also clears CF
- jz L(cj2)
- mulx( 16,(up), w0, w2)
- lea 16(up), up
- jmp L(m1l2)
-
-L(m1x1):test $2, R8(un) C for odd counts negation flips bit 1, hence jz rather than jnz
- jz L(m111)
-
-L(m101):
- mulx( (up), w4, w5)
- lea -16(rp), rp
- test n, n
- jz L(cj1)
- mulx( 8,(up), w0, w2)
- lea 8(up), up
- jmp L(m1l1)
-
-L(m111):
- mulx( (up), w2, w3)
- mulx( 8,(up), w0, w4)
- mulx( 16,(up), w1, w5)
- lea 24(up), up
- test n, n
- jnz L(gt3)
- add w0, w3 C both branches do this add after the test, so CF comes from the add
- jmp L(cj3)
-L(gt3): add w0, w3
- jmp L(m1l3)
-
- ALIGN(32)
-L(m1tp):lea 32(rp), rp C mul_1 loop: four limbs per pass; entered mid-body at L(m1l0)..L(m1l3)
-L(m1l3):mov w2, (rp)
- mulx( (up), w0, w2)
-L(m1l2):mov w3, 8(rp)
- adc w1, w4
-L(m1l1):adc w0, w5
- mov w4, 16(rp)
- mulx( 8,(up), w1, w3)
-L(m1l0):mov w5, 24(rp)
- mulx( 16,(up), w0, w4)
- adc w1, w2
- mulx( 24,(up), w1, w5)
- adc w0, w3
- lea 32(up), up
- dec n C dec leaves CF alone, so the adc chain continues into the next pass
- jnz L(m1tp)
-
-L(m1ed):lea 32(rp), rp C mul_1 wind-down: store the limbs still held in registers
-L(cj3): mov w2, (rp)
-L(cj2): mov w3, 8(rp)
- adc w1, w4
-L(cj1): mov w4, 16(rp)
- adc $0, w5
- mov w5, 24(rp)
-
- dec R32(vn) C one v limb consumed
- jz L(ret5) C none left: return, five registers to pop
-
- lea 8(vp), vp
- lea 32(rp), rp
-C push %r12
-C push %r13
-C push %r14
- jmp L(do_addmul)
-
-L(do_mul_2):
-define(`v1', `%r14')
-C push %r12
-C push %r13
-C push %r14
-
- mov (vp), v0 C first two v limbs
- mov 8(vp), v1
-
- lea (un), n
- sar $2, n C n = un >> 2 (arithmetic); counted up toward zero by inc n
-
- test $1, R8(un)
- jnz L(m2x1)
-
-L(m2x0):xor w0, w0
- test $2, R8(un)
- mov (up), %rdx
- mulx( v0, w2, w1)
- jz L(m2l0)
-
-L(m210):lea -16(rp), rp
- lea -16(up), up
- jmp L(m2l2)
-
-L(m2x1):xor w2, w2
- test $2, R8(un) C for odd counts negation flips bit 1, hence jz rather than jnz
- mov (up), %rdx
- mulx( v0, w0, w3)
- jz L(m211)
-
-L(m201):lea -24(rp), rp
- lea 8(up), up
- jmp L(m2l1)
-
-L(m211):lea -8(rp), rp
- lea -8(up), up
- jmp L(m2l3)
-
- ALIGN(16)
-L(m2tp):mulx( v1, %rax, w0) C mul_2 loop: four up limbs per pass; entered mid-body at L(m2l0)..L(m2l3)
- add %rax, w2
- mov (up), %rdx
- mulx( v0, %rax, w1)
- adc $0, w0
- add %rax, w2
- adc $0, w1
- add w3, w2
-L(m2l0):mov w2, (rp)
- adc $0, w1
- mulx( v1, %rax, w2)
- add %rax, w0
- mov 8(up), %rdx
- adc $0, w2
- mulx( v0, %rax, w3)
- add %rax, w0
- adc $0, w3
- add w1, w0
-L(m2l3):mov w0, 8(rp)
- adc $0, w3
- mulx( v1, %rax, w0)
- add %rax, w2
- mov 16(up), %rdx
- mulx( v0, %rax, w1)
- adc $0, w0
- add %rax, w2
- adc $0, w1
- add w3, w2
-L(m2l2):mov w2, 16(rp)
- adc $0, w1
- mulx( v1, %rax, w2)
- add %rax, w0
- mov 24(up), %rdx
- adc $0, w2
- mulx( v0, %rax, w3)
- add %rax, w0
- adc $0, w3
- add w1, w0
- lea 32(up), up
-L(m2l1):mov w0, 24(rp)
- adc $0, w3
- inc n C inc and lea leave CF alone; the jnz below tests ZF from this inc
- lea 32(rp), rp
- jnz L(m2tp)
-
-L(m2ed):mulx( v1, %rdx, %rax) C mul_2 wind-down: last up limb times v1
- add %rdx, w2
- adc $0, %rax
- add w3, w2
- mov w2, (rp)
- adc $0, %rax
- mov %rax, 8(rp)
-
- add $-2, R32(vn) C two v limbs consumed
- jz L(ret5) C none left: return, five registers to pop
- lea 16(vp), vp
- lea 16(rp), rp
-
-
-L(do_addmul):
- push %r15 C callee-saved, becomes X1
- push vn C save vn in new stack slot
-define(`vn', `(%rsp)')
-define(`X0', `%r14')
-define(`X1', `%r15')
-define(`v1', `%r8')
-
- lea (rp,un,8), rp C un = -un_param, so this steps rp and up back by un_param limbs
- lea (up,un,8), up
-
-L(outer):
- mov (vp), v0 C next pair of v limbs
- mov 8(vp), v1
-
- lea 2(un), n
- sar $2, n C n = (un + 2) >> 2 (arithmetic); counted up toward zero by inc n
-
- mov (up), %rdx
- test $1, R8(un)
- jnz L(bx1)
-
-L(bx0): mov (rp), X0
- mov 8(rp), X1
- mulx( v0, %rax, w1)
- add %rax, X0
- mulx( v1, %rax, w2)
- adc $0, w1
- mov X0, (rp)
- add %rax, X1
- adc $0, w2
- mov 8(up), %rdx
- test $2, R8(un)
- jnz L(b10)
-
-L(b00): lea 16(up), up
- lea 16(rp), rp
- jmp L(lo0)
-
-L(b10): mov 16(rp), X0
- lea 32(up), up
- mulx( v0, %rax, w3)
- jmp L(lo2)
-
-L(bx1): mov (rp), X1
- mov 8(rp), X0
- mulx( v0, %rax, w3)
- add %rax, X1
- adc $0, w3
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- mov 8(up), %rdx
- mov X1, (rp)
- mulx( v0, %rax, w1)
- test $2, R8(un) C for odd counts negation flips bit 1, hence jz rather than jnz
- jz L(b11)
-
-L(b01): mov 16(rp), X1
- lea 24(rp), rp
- lea 24(up), up
- jmp L(lo1)
-
-L(b11): lea 8(rp), rp
- lea 8(up), up
- jmp L(lo3)
-
- ALIGN(16)
-L(top): mulx( v0, %rax, w3) C addmul_2 loop: four up limbs per pass; entered mid-body at L(lo0)..L(lo3)
- add w0, X1
- adc $0, w2
-L(lo2): add %rax, X1
- adc $0, w3
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- lea 32(rp), rp
- add w1, X1
- mov -16(up), %rdx
- mov X1, -24(rp)
- adc $0, w3
- add w2, X0
- mov -8(rp), X1
- mulx( v0, %rax, w1)
- adc $0, w0
-L(lo1): add %rax, X0
- mulx( v1, %rax, w2)
- adc $0, w1
- add w3, X0
- mov X0, -16(rp)
- adc $0, w1
- add %rax, X1
- adc $0, w2
- add w0, X1
- mov -8(up), %rdx
- adc $0, w2
-L(lo0): mulx( v0, %rax, w3)
- add %rax, X1
- adc $0, w3
- mov (rp), X0
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- add w1, X1
- mov X1, -8(rp)
- adc $0, w3
- mov (up), %rdx
- add w2, X0
- mulx( v0, %rax, w1)
- adc $0, w0
-L(lo3): add %rax, X0
- adc $0, w1
- mulx( v1, %rax, w2)
- add w3, X0
- mov 8(rp), X1
- mov X0, (rp)
- mov 16(rp), X0
- adc $0, w1
- add %rax, X1
- adc $0, w2
- mov 8(up), %rdx
- lea 32(up), up
- inc n
- jnz L(top)
-
-L(end): mulx( v0, %rax, w3) C addmul_2 wind-down for the last up limb
- add w0, X1
- adc $0, w2
- add %rax, X1
- adc $0, w3
- mulx( v1, %rdx, %rax)
- add w1, X1
- mov X1, 8(rp)
- adc $0, w3
- add w2, %rdx
- adc $0, %rax
- add w3, %rdx
- mov %rdx, 16(rp)
- adc $0, %rax
- mov %rax, 24(rp)
-
- addl $-2, vn C vn now lives at (%rsp); this sets ZF for the jnz below
- lea 16(vp), vp C the three lea instructions leave the flags alone
- lea -16(up,un,8), up C rewind up for the next pass (un is negative)
- lea 32(rp,un,8), rp
- jnz L(outer)
-
- pop %rax C deallocate vn slot
- pop %r15
-L(ret5):pop %r14 C exit used by the mul_1 and mul_2 phases, where only five registers were pushed
-L(ret4):pop %r13
-L(ret3):pop %r12
-L(ret2):pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreihwl/mullo_basecase.asm b/gmp/mpn/x86_64/coreihwl/mullo_basecase.asm
deleted file mode 100644
index 9986e8bcfa..0000000000
--- a/gmp/mpn/x86_64/coreihwl/mullo_basecase.asm
+++ /dev/null
@@ -1,426 +0,0 @@
-dnl AMD64 mpn_mullo_basecase optimised for Intel Haswell.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb mul_2 addmul_2
-C AMD K8,K9 n/a n/a
-C AMD K10 n/a n/a
-C AMD bull n/a n/a
-C AMD pile n/a n/a
-C AMD steam ? ?
-C AMD bobcat n/a n/a
-C AMD jaguar ? ?
-C Intel P4 n/a n/a
-C Intel core n/a n/a
-C Intel NHM n/a n/a
-C Intel SBR n/a n/a
-C Intel IBR n/a n/a
-C Intel HWL 1.86 2.15
-C Intel BWL ? ?
-C Intel atom n/a n/a
-C VIA nano n/a n/a
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C TODO
-C * Implement proper cor2, replacing current cor0.
-C * Micro-optimise.
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp_param', `%rdx')
-define(`n', `%rcx')
-
-define(`vp', `%r8')
-define(`X0', `%r14')
-define(`X1', `%r15')
-
-define(`w0', `%r10')
-define(`w1', `%r11')
-define(`w2', `%r12')
-define(`w3', `%r13')
-define(`i', `%rbp')
-define(`v0', `%r9')
-define(`v1', `%rbx')
-
-C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_mullo_basecase)  C stores the low n limbs of up[] * vp[] at rp (compare the n = 1..3 code at L(small))
- FUNC_ENTRY(4)
-
- mov vp_param, vp  C move vp out of rdx, which mulx reads implicitly
- mov (up), %rdx  C rdx = up[0]
-
- cmp $4, n
- jb L(small)  C n < 4 is handled by straight-line code
-
- push %rbx  C callee-saved registers used below as v1, i, w2, w3
- push %rbp
- push %r12
- push %r13
-
- mov (vp), v0  C first pass multiplies by vp[0] and vp[1]
- mov 8(vp), v1
-
- lea 2(n), i
- shr $2, i  C i = (n + 2) >> 2, counter for the mul_2 loop
- neg n
- add $2, n  C n = 2 - n, stepped by 2 toward zero after each pass
-
- push up C put entry `up' on stack
-
- test $1, R8(n)  C pick loop entry point from the two low bits of n
- jnz L(m2x1)
-
-L(m2x0):mulx( v0, w0, w3)
- xor R32(w2), R32(w2)
- test $2, R8(n)
- jz L(m2b2)
-
-L(m2b0):lea -8(rp), rp  C bias pointers to match the offsets used at L(m2e0)
- lea -8(up), up
- jmp L(m2e0)
-
-L(m2b2):lea -24(rp), rp  C bias pointers to match the offsets used at L(m2e2)
- lea 8(up), up
- jmp L(m2e2)
-
-L(m2x1):mulx( v0, w2, w1)
- xor R32(w0), R32(w0)
- test $2, R8(n)
- jnz L(m2b3)
-
-L(m2b1):jmp L(m2e1)
-
-L(m2b3):lea -16(rp), rp  C bias pointers to match the offsets used at L(m2e3)
- lea -16(up), up
- jmp L(m2e3)
-
- ALIGN(16)
-L(m2tp):mulx( v1, %rax, w0)  C mul_2 loop: four limbs of up per iteration, results stored (not added) to rp
- add %rax, w2
- mov (up), %rdx
- mulx( v0, %rax, w1)
- adc $0, w0
- add %rax, w2
- adc $0, w1
- add w3, w2
-L(m2e1):mov w2, (rp)
- adc $0, w1
- mulx( v1, %rax, w2)
- add %rax, w0
- mov 8(up), %rdx
- adc $0, w2
- mulx( v0, %rax, w3)
- add %rax, w0
- adc $0, w3
- add w1, w0
-L(m2e0):mov w0, 8(rp)
- adc $0, w3
- mulx( v1, %rax, w0)
- add %rax, w2
- mov 16(up), %rdx
- mulx( v0, %rax, w1)
- adc $0, w0
- add %rax, w2
- adc $0, w1
- add w3, w2
-L(m2e3):mov w2, 16(rp)
- adc $0, w1
- mulx( v1, %rax, w2)
- add %rax, w0
- mov 24(up), %rdx
- adc $0, w2
- mulx( v0, %rax, w3)
- add %rax, w0
- adc $0, w3
- add w1, w0
- lea 32(up), up
-L(m2e2):mov w0, 24(rp)
- adc $0, w3
- dec i
- lea 32(rp), rp  C lea leaves the flags from dec i intact for jnz
- jnz L(m2tp)
-
-L(m2ed):mulx( v1, %rax, w0)  C wind-down of the mul_2 pass
- add %rax, w2
- mov (up), %rdx
- mulx( v0, %rax, w1)
- add w2, %rax  C plain adds: carries out of this top limb are discarded
- add w3, %rax
- mov %rax, (rp)  C rax keeps a copy of the limb just stored, used again at L(cor1)/L(cor0)
-
- mov (%rsp), up C restore `up' to beginning
- lea 16(vp), vp  C advance to the next pair of vp limbs
- lea 8(rp,n,8), rp C put back rp to old rp + 2
- add $2, n
- jge L(cor1)  C n >= 0: no addmul_2 pass needed, finish in corner code
-
- push %r14  C callee-saved registers used as X0 and X1 in the addmul_2 passes
- push %r15
-
-L(outer):  C one addmul_2 pass per iteration: rp[] += up[] * {v0, v1}
- mov (vp), v0
- mov 8(vp), v1
-
- lea (n), i
- sar $2, i  C i = n >> 2 (arithmetic shift, n is negative here)
-
- mov (up), %rdx
- test $1, R8(n)  C pick loop entry point from the two low bits of n
- jnz L(bx1)
-
-L(bx0): mov (rp), X1
- mov 8(rp), X0
- mulx( v0, %rax, w3)
- add %rax, X1
- adc $0, w3
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- mov 8(up), %rdx
- mov X1, (rp)
- mulx( v0, %rax, w1)
- test $2, R8(n)
- jz L(b2)
-
-L(b0): lea 8(rp), rp
- lea 8(up), up
- jmp L(lo0)
-
-L(b2): mov 16(rp), X1
- lea 24(rp), rp
- lea 24(up), up
- jmp L(lo2)
-
-L(bx1): mov (rp), X0
- mov 8(rp), X1
- mulx( v0, %rax, w1)
- add %rax, X0
- mulx( v1, %rax, w2)
- adc $0, w1
- mov X0, (rp)
- add %rax, X1
- adc $0, w2
- mov 8(up), %rdx
- test $2, R8(n)
- jnz L(b3)
-
-L(b1): lea 16(up), up
- lea 16(rp), rp
- jmp L(lo1)
-
-L(b3): mov 16(rp), X0
- lea 32(up), up
- mulx( v0, %rax, w3)
- inc i
- jz L(cj3)  C counter already zero: skip the loop, join the wind-down
- jmp L(lo3)
-
- ALIGN(16)
-L(top): mulx( v0, %rax, w3)  C addmul_2 loop: four limbs per iteration, accumulated into rp via X0/X1
- add w0, X1
- adc $0, w2
-L(lo3): add %rax, X1
- adc $0, w3
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- lea 32(rp), rp
- add w1, X1
- mov -16(up), %rdx
- mov X1, -24(rp)
- adc $0, w3
- add w2, X0
- mov -8(rp), X1
- mulx( v0, %rax, w1)
- adc $0, w0
-L(lo2): add %rax, X0
- mulx( v1, %rax, w2)
- adc $0, w1
- add w3, X0
- mov X0, -16(rp)
- adc $0, w1
- add %rax, X1
- adc $0, w2
- add w0, X1
- mov -8(up), %rdx
- adc $0, w2
-L(lo1): mulx( v0, %rax, w3)
- add %rax, X1
- adc $0, w3
- mov (rp), X0
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- add w1, X1
- mov X1, -8(rp)
- adc $0, w3
- mov (up), %rdx
- add w2, X0
- mulx( v0, %rax, w1)
- adc $0, w0
-L(lo0): add %rax, X0
- adc $0, w1
- mulx( v1, %rax, w2)
- add w3, X0
- mov 8(rp), X1
- mov X0, (rp)
- mov 16(rp), X0
- adc $0, w1
- add %rax, X1
- adc $0, w2
- mov 8(up), %rdx
- lea 32(up), up
- inc i
- jnz L(top)
-
-L(end): mulx( v0, %rax, w3)  C wind-down of the addmul_2 pass
- add w0, X1
- adc $0, w2
-L(cj3): add %rax, X1
- adc $0, w3
- mulx( v1, %rax, w0)
- add %rax, X0
- add w1, X1
- mov -16(up), %rdx
- mov X1, 8(rp)
- adc $0, w3
- add w2, X0
- mulx( v0, %rax, w1)
- add X0, %rax  C plain adds: carries out of this top limb are discarded
- add w3, %rax
- mov %rax, 16(rp)  C rax keeps a copy of the limb just stored, used again at L(cor1)/L(cor0)
-
- mov 16(%rsp), up C restore `up' to beginning
- lea 16(vp), vp  C advance to the next pair of vp limbs
- lea 24(rp,n,8), rp C put back rp to old rp + 2
- add $2, n
- jl L(outer)  C n still negative: run another addmul_2 pass
-
- pop %r15  C pops do not touch the flags set by add $2, n
- pop %r14
-
- jnz L(cor0)  C n != 0: one-limb corner, else fall into the two-limb corner
-
-L(cor1):mov (vp), v0  C corner writing the last two result limbs, (rp) and 8(rp)
- mov 8(vp), v1
- mov (up), %rdx
- mulx( v0, %r12, %rbp) C u0 x v2
- add (rp), %r12 C FIXME: rp[0] still available in reg?
- adc %rax, %rbp  C rax = copy of the limb stored last by the preceding pass
- mov 8(up), %r10
- imul v0, %r10  C only low halves needed for the top limb
- imul v1, %rdx
- mov %r12, (rp)
- add %r10, %rdx
- add %rbp, %rdx
- mov %rdx, 8(rp)
- pop %rax C deallocate `up' copy
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
-L(cor0):mov (vp), %r11  C corner writing a single result limb at (rp)
- imul (up), %r11
- add %rax, %r11  C rax = copy of the limb stored last by the preceding pass
- mov %r11, (rp)
- pop %rax C deallocate `up' copy
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
- ALIGN(16)
-L(small):  C n = 1, 2 or 3; nothing has been pushed on this path
- cmp $2, n
- jae L(gt1)
-L(n1): imul (vp), %rdx  C rdx = low limb of u0 x v0
- mov %rdx, (rp)
- FUNC_EXIT()
- ret
-L(gt1): ja L(gt2)  C flags still from cmp $2, n
-L(n2): mov (vp), %r9
- mulx( %r9, %rax, %rdx)  C u0 x v0, low in rax, high in rdx
- mov %rax, (rp)
- mov 8(up), %rax
- imul %r9, %rax  C low limb of u1 x v0
- add %rax, %rdx
- mov 8(vp), %r9
- mov (up), %rcx  C n (rcx) is no longer needed
- imul %r9, %rcx  C low limb of u0 x v1
- add %rcx, %rdx
- mov %rdx, 8(rp)
- FUNC_EXIT()
- ret
-L(gt2):
-L(n3): mov (vp), %r9
- mulx( %r9, %rax, %r10) C u0 x v0
- mov %rax, (rp)
- mov 8(up), %rdx
- mulx( %r9, %rax, %rdx) C u1 x v0
- imul 16(up), %r9 C u2 x v0
- add %rax, %r10
- adc %rdx, %r9
- mov 8(vp), %r11
- mov (up), %rdx
- mulx( %r11, %rax, %rdx) C u0 x v1
- add %rax, %r10
- adc %rdx, %r9
- imul 8(up), %r11 C u1 x v1
- add %r11, %r9
- mov %r10, 8(rp)
- mov 16(vp), %r10
- mov (up), %rax
- imul %rax, %r10 C u0 x v2
- add %r10, %r9
- mov %r9, 16(rp)
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreihwl/redc_1.asm b/gmp/mpn/x86_64/coreihwl/redc_1.asm
deleted file mode 100644
index f1a475e53c..0000000000
--- a/gmp/mpn/x86_64/coreihwl/redc_1.asm
+++ /dev/null
@@ -1,433 +0,0 @@
-dnl AMD64 mpn_redc_1 optimised for Intel Haswell.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 n/a
-C AMD K10 n/a
-C AMD bull n/a
-C AMD pile n/a
-C AMD steam ?
-C AMD bobcat n/a
-C AMD jaguar ?
-C Intel P4 n/a
-C Intel core n/a
-C Intel NHM n/a
-C Intel SBR n/a
-C Intel IBR n/a
-C Intel HWL 2.32
-C Intel BWL ?
-C Intel atom n/a
-C VIA nano n/a
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C TODO
-C * Micro-optimise.
-C * Consider inlining mpn_add_n. Tests indicate that this saves just 1-2
-C cycles, though.
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`mp_param', `%rdx') C r8
-define(`n', `%rcx') C r9
-define(`u0inv_param', `%r8') C stack
-
-define(`i', `%r14')
-define(`j', `%r15')
-define(`mp', `%rdi')
-define(`u0inv', `(%rsp)') C stack
-
-ABI_SUPPORT(DOS64) C FIXME: needs verification
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_redc_1)  C arguments: rp, up, mp, n, u0inv; updates up[] in place, result goes to rp, rax returns a carry
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
- push %rbx  C save all callee-saved registers used below
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- push rp  C keep rp on the stack, its register is reused as mp
- mov mp_param, mp C note that rp and mp share register
- mov (up), %rdx
-
- neg n  C n = -n from here on
- push %r8 C put u0inv on stack
- imul u0inv_param, %rdx C first iteration q0
- mov n, j C outer loop induction var
-
- test $1, R8(n)  C dispatch on the two low bits of n to one of four loop variants
- jnz L(bx1)
-
-L(bx0): test $2, R8(n)
- jz L(o0b)
-
- cmp $-2, R32(n)  C n is negated, so this tests for a size of 2
- jnz L(o2)
-
-C Special code for n = 2 since general code cannot handle it
- mov 8(%rsp), %rbx C rp
- lea 16(%rsp), %rsp C deallocate two slots
- mulx( (mp), %r9, %r12)
- mulx( 8,(mp), %r11, %r10)
- add %r12, %r11
- adc $0, %r10
- add (up), %r9 C = 0
- adc 8(up), %r11 C r11 = up[1]
- adc $0, %r10 C -> up[0]
- mov %r11, %rdx
- imul u0inv_param, %rdx  C second q; r8 still holds u0inv (stack copy was just released)
- mulx( (mp), %r13, %r12)
- mulx( 8,(mp), %r14, %r15)
- xor R32(%rax), R32(%rax)  C clear rax before the carry chain, setc below fills the low byte
- add %r12, %r14
- adc $0, %r15
- add %r11, %r13 C = 0
- adc 16(up), %r14 C + up[2]
- adc $0, %r15 C -> up[1]
- add %r14, %r10
- adc 24(up), %r15
- mov %r10, (%rbx)
- mov %r15, 8(%rbx)
- setc R8(%rax)  C return value = carry out of the final adc (mov does not alter flags)
- jmp L(ret)
-
-L(o2): lea 2(n), i C inner loop induction var
- mulx( (mp), %r9, %r8)
- mulx( 8,(mp), %r11, %r10)
- sar $2, i
- add %r8, %r11
- jmp L(lo2)
-
- ALIGN(16)
-L(tp2): adc %rax, %r9  C inner loop: up[] += q * mp[], four limbs per iteration
- lea 32(up), up
- adc %r8, %r11
-L(lo2): mulx( 16,(mp), %r13, %r12)
- mov (up), %r8
- mulx( 24,(mp), %rbx, %rax)
- lea 32(mp), mp
- adc %r10, %r13
- adc %r12, %rbx
- adc $0, %rax
- mov 8(up), %r10
- mov 16(up), %r12
- add %r9, %r8
- mov 24(up), %rbp
- mov %r8, (up)
- adc %r11, %r10
- mulx( (mp), %r9, %r8)
- mov %r10, 8(up)
- adc %r13, %r12
- mov %r12, 16(up)
- adc %rbx, %rbp
- mulx( 8,(mp), %r11, %r10)
- mov %rbp, 24(up)
- inc i  C inc leaves CF alone, so the carry chain continues at L(tp2)
- jnz L(tp2)
-
-L(ed2): mov 56(up,n,8), %rdx C next iteration up[0]
- lea 16(mp,n,8), mp C mp = (last starting mp)
- adc %rax, %r9
- adc %r8, %r11
- mov 32(up), %r8
- adc $0, %r10
- imul u0inv, %rdx C next iteration q0
- mov 40(up), %rax
- add %r9, %r8
- mov %r8, 32(up)
- adc %r11, %rax
- mov %rax, 40(up)
- lea 56(up,n,8), up C up = (last starting up) + 1
- adc $0, %r10
- mov %r10, -8(up)  C carry limb goes to the previous starting up[0]
- inc j  C j started at -n: the outer loop runs n times
- jnz L(o2)
-
- jmp L(cj)
-
-
-L(bx1): test $2, R8(n)
- jz L(o3a)
-
-L(o1a): cmp $-1, R32(n)  C n is negated, so this tests for a size of 1
- jnz L(o1b)
-
-C Special code for n = 1 since general code cannot handle it
- mov 8(%rsp), %rbx C rp
- lea 16(%rsp), %rsp C deallocate two slots
- mulx( (mp), %r11, %r10)
- add (up), %r11
- adc 8(up), %r10
- mov %r10, (%rbx)
- mov $0, R32(%rax)  C mov rather than xor: CF from the adc must survive for setc
- setc R8(%rax)
- jmp L(ret)
-
-L(o1b): lea 24(mp), mp  C bias mp once, the loop below uses negative offsets
-L(o1): lea 1(n), i C inner loop induction var
- mulx( -24,(mp), %r11, %r10)
- mulx( -16,(mp), %r13, %r12)
- mulx( -8,(mp), %rbx, %rax)
- sar $2, i
- add %r10, %r13
- adc %r12, %rbx
- adc $0, %rax
- mov (up), %r10
- mov 8(up), %r12
- mov 16(up), %rbp
- add %r11, %r10
- jmp L(lo1)
-
- ALIGN(16)
-L(tp1): adc %rax, %r9  C inner loop: up[] += q * mp[], four limbs per iteration
- lea 32(up), up
- adc %r8, %r11
- mulx( 16,(mp), %r13, %r12)
- mov -8(up), %r8
- mulx( 24,(mp), %rbx, %rax)
- lea 32(mp), mp
- adc %r10, %r13
- adc %r12, %rbx
- adc $0, %rax
- mov (up), %r10
- mov 8(up), %r12
- add %r9, %r8
- mov 16(up), %rbp
- mov %r8, -8(up)
- adc %r11, %r10
-L(lo1): mulx( (mp), %r9, %r8)
- mov %r10, (up)
- adc %r13, %r12
- mov %r12, 8(up)
- adc %rbx, %rbp
- mulx( 8,(mp), %r11, %r10)
- mov %rbp, 16(up)
- inc i  C inc leaves CF alone, so the carry chain continues at L(tp1)
- jnz L(tp1)
-
-L(ed1): mov 48(up,n,8), %rdx C next iteration up[0]
- lea 40(mp,n,8), mp C mp = (last starting mp)
- adc %rax, %r9
- adc %r8, %r11
- mov 24(up), %r8
- adc $0, %r10
- imul u0inv, %rdx C next iteration q0
- mov 32(up), %rax
- add %r9, %r8
- mov %r8, 24(up)
- adc %r11, %rax
- mov %rax, 32(up)
- lea 48(up,n,8), up C up = (last starting up) + 1
- adc $0, %r10
- mov %r10, -8(up)  C carry limb goes to the previous starting up[0]
- inc j
- jnz L(o1)
-
- jmp L(cj)
-
-L(o3a): cmp $-3, R32(n)  C n is negated, so this tests for a size of 3
- jnz L(o3b)
-
-C Special code for n = 3 since general code cannot handle it
-L(n3): mulx( (mp), %rbx, %rax)  C loop body runs once per outer iteration (j counts up to 0)
- mulx( 8,(mp), %r9, %r14)
- add (up), %rbx
- mulx( 16,(mp), %r11, %r10)
- adc %rax, %r9 C W 1
- adc %r14, %r11 C W 2
- mov 8(up), %r14
- mov u0inv_param, %rdx  C r8 is not written in this loop, so it still holds u0inv
- adc $0, %r10 C W 3
- mov 16(up), %rax
- add %r9, %r14 C W 1
- mov %r14, 8(up)
- mulx( %r14, %rdx, %r13) C next iteration q0
- adc %r11, %rax C W 2
- mov %rax, 16(up)
- adc $0, %r10 C W 3
- mov %r10, (up)
- lea 8(up), up C up = (last starting up) + 1
- inc j
- jnz L(n3)
-
- jmp L(cj)
-
-L(o3b): lea 8(mp), mp  C bias mp once, the loop below uses a negative offset
-L(o3): lea 4(n), i C inner loop induction var
- mulx( -8,(mp), %rbx, %rax)
- mulx( (mp), %r9, %r8)
- mov (up), %rbp
- mulx( 8,(mp), %r11, %r10)
- sar $2, i
- add %rbx, %rbp
- nop  C NOTE(review): presumably scheduling or alignment padding, confirm before removing
- adc %rax, %r9
- jmp L(lo3)
-
- ALIGN(16)
-L(tp3): adc %rax, %r9  C inner loop: up[] += q * mp[], four limbs per iteration
- lea 32(up), up
-L(lo3): adc %r8, %r11
- mulx( 16,(mp), %r13, %r12)
- mov 8(up), %r8
- mulx( 24,(mp), %rbx, %rax)
- lea 32(mp), mp
- adc %r10, %r13
- adc %r12, %rbx
- adc $0, %rax
- mov 16(up), %r10
- mov 24(up), %r12
- add %r9, %r8
- mov 32(up), %rbp
- mov %r8, 8(up)
- adc %r11, %r10
- mulx( (mp), %r9, %r8)
- mov %r10, 16(up)
- adc %r13, %r12
- mov %r12, 24(up)
- adc %rbx, %rbp
- mulx( 8,(mp), %r11, %r10)
- mov %rbp, 32(up)
- inc i  C inc leaves CF alone, so the carry chain continues at L(tp3)
- jnz L(tp3)
-
-L(ed3): mov 64(up,n,8), %rdx C next iteration up[0]
- lea 24(mp,n,8), mp C mp = (last starting mp)
- adc %rax, %r9
- adc %r8, %r11
- mov 40(up), %r8
- adc $0, %r10
- imul u0inv, %rdx C next iteration q0
- mov 48(up), %rax
- add %r9, %r8
- mov %r8, 40(up)
- adc %r11, %rax
- mov %rax, 48(up)
- lea 64(up,n,8), up C up = (last starting up) + 1
- adc $0, %r10
- mov %r10, -8(up)  C carry limb goes to the previous starting up[0]
- inc j
- jnz L(o3)
-
- jmp L(cj)
-
-L(o0b): lea 16(mp), mp  C bias mp once, the loop below uses negative offsets
-L(o0): mov n, i C inner loop induction var
- mulx( -16,(mp), %r13, %r12)
- mulx( -8,(mp), %rbx, %rax)
- sar $2, i
- add %r12, %rbx
- adc $0, %rax
- mov (up), %r12
- mov 8(up), %rbp
- mulx( (mp), %r9, %r8)
- add %r13, %r12
- jmp L(lo0)
-
- ALIGN(16)
-L(tp0): adc %rax, %r9  C inner loop: up[] += q * mp[], four limbs per iteration
- lea 32(up), up
- adc %r8, %r11
- mulx( 16,(mp), %r13, %r12)
- mov -16(up), %r8
- mulx( 24,(mp), %rbx, %rax)
- lea 32(mp), mp
- adc %r10, %r13
- adc %r12, %rbx
- adc $0, %rax
- mov -8(up), %r10
- mov (up), %r12
- add %r9, %r8
- mov 8(up), %rbp
- mov %r8, -16(up)
- adc %r11, %r10
- mulx( (mp), %r9, %r8)
- mov %r10, -8(up)
- adc %r13, %r12
- mov %r12, (up)
-L(lo0): adc %rbx, %rbp
- mulx( 8,(mp), %r11, %r10)
- mov %rbp, 8(up)
- inc i  C inc leaves CF alone, so the carry chain continues at L(tp0)
- jnz L(tp0)
-
-L(ed0): mov 40(up,n,8), %rdx C next iteration up[0]
- lea 32(mp,n,8), mp C mp = (last starting mp)
- adc %rax, %r9
- adc %r8, %r11
- mov 16(up), %r8
- adc $0, %r10
- imul u0inv, %rdx C next iteration q0
- mov 24(up), %rax
- add %r9, %r8
- mov %r8, 16(up)
- adc %r11, %rax
- mov %rax, 24(up)
- lea 40(up,n,8), up C up = (last starting up) + 1
- adc $0, %r10
- mov %r10, -8(up)  C carry limb goes to the previous starting up[0]
- inc j
- jnz L(o0)
-
-L(cj):  C common tail for all general loop variants: set up arguments and call mpn_add_n
-IFSTD(` mov 8(%rsp), %rdi C param 1: rp
- lea 16(%rsp), %rsp C deallocate two slots
- lea (up,n,8), %rdx C param 3: up - n
- neg R32(n) ') C param 4: n
-
-IFDOS(` mov up, %rdx C param 2: up
- lea (up,n,8), %r8 C param 3: up - n
- neg R32(n)
- mov n, %r9 C param 4: n
- mov 8(%rsp), %rcx C param 1: rp
- lea 16(%rsp), %rsp ') C deallocate two slots
-
- CALL( mpn_add_n)  C rax is not written after this call, so the callee value in rax is what this function returns
-
-L(ret): pop %r15  C restore callee-saved registers in reverse push order
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreihwl/sqr_basecase.asm b/gmp/mpn/x86_64/coreihwl/sqr_basecase.asm
deleted file mode 100644
index 641cdf349a..0000000000
--- a/gmp/mpn/x86_64/coreihwl/sqr_basecase.asm
+++ /dev/null
@@ -1,506 +0,0 @@
-dnl AMD64 mpn_sqr_basecase optimised for Intel Haswell.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb mul_2 addmul_2 sqr_diag_addlsh1
-C AMD K8,K9 n/a n/a n/a
-C AMD K10 n/a n/a n/a
-C AMD bull n/a n/a n/a
-C AMD pile n/a n/a n/a
-C AMD steam ? ? ?
-C AMD bobcat n/a n/a n/a
-C AMD jaguar ? ? ?
-C Intel P4 n/a n/a n/a
-C Intel core n/a n/a n/a
-C Intel NHM n/a n/a n/a
-C Intel SBR n/a n/a n/a
-C Intel IBR n/a n/a n/a
-C Intel HWL 1.86 2.15 ~2.5
-C Intel BWL ? ? ?
-C Intel atom n/a n/a n/a
-C VIA nano n/a n/a n/a
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund, except
-C that the sqr_diag_addlsh1 loop was manually written.
-
-C TODO
-C * Replace current unoptimised sqr_diag_addlsh1 loop; 1.75 c/l might be
-C possible.
-C * Consider splitting outer loop into 2, one for n = 1 (mod 2) and one for
-C n = 0 (mod 2). These loops could fall into specific "corner" code.
-C * Consider splitting outer loop into 4.
-C * Streamline pointer updates.
-C * Perhaps suppress a few more xor insns in feed-in code.
-C * Make sure we write no dead registers in feed-in code.
-C * We might use 32-bit size ops, since n >= 2^32 is non-terminating. Watch
-C out for negative sizes being zero-extended, though.
-C * Provide straight-line code for n = 4; then look for simplifications in
-C main code.
-
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`un_param',`%rdx')
-
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_sqr_basecase)  C arguments: rp, up, un; writes the 2*un limb square of up[] at rp
- FUNC_ENTRY(3)
-
- cmp $2, un_param
- jae L(gt1)
-
- mov (up), %rdx  C un = 1: single limb square
- mulx( %rdx, %rax, %rdx)  C u0^2, low in rax, high in rdx
- mov %rax, (rp)
- mov %rdx, 8(rp)
- FUNC_EXIT()
- ret
-
-L(gt1): jne L(gt2)  C flags still from cmp $2, un_param; fall through for un = 2
-
- mov (up), %rdx
- mov 8(up), %rcx
- mulx( %rcx, %r9, %r10) C v0 * v1 W 1 2
- mulx( %rdx, %rax, %r8) C v0 * v0 W 0 1
- mov %rcx, %rdx
- mulx( %rdx, %r11, %rdx) C v1 * v1 W 2 3
- add %r9, %r9 C W 1
- adc %r10, %r10 C W 2
- adc $0, %rdx C W 3
- add %r9, %r8 C W 1
- adc %r11, %r10 C W 2
- adc $0, %rdx C W 3
- mov %rax, (rp)
- mov %r8, 8(rp)
- mov %r10, 16(rp)
- mov %rdx, 24(rp)
- FUNC_EXIT()
- ret
-
-L(gt2): cmp $4, un_param
- jae L(gt3)  C fall through for un = 3, straight-line code
-define(`v0', `%r8')
-define(`v1', `%r9')
-define(`w0', `%r10')
-define(`w2', `%r11')
-
- mov (up), v0
- mov 8(up), %rdx
- mov %rdx, v1
- mulx( v0, w2, %rax)  C u1 x u0
- mov 16(up), %rdx
- mulx( v0, w0, %rcx)  C u2 x u0
- mov w2, %r8  C r8 (v0) is reused to hold the low limb of u1 x u0
- add %rax, w0
- adc $0, %rcx
- mulx( v1, %rdx, %rax)  C u2 x u1
- add %rcx, %rdx
- mov %rdx, 24(rp)  C park the two upper cross-product limbs in rp for now
- adc $0, %rax
- mov %rax, 32(rp)
- xor R32(%rcx), R32(%rcx)
- mov (up), %rdx
- mulx( %rdx, %rax, w2)  C u0^2
- mov %rax, (rp)
- add %r8, %r8  C double the cross products
- adc w0, w0
- setc R8(%rcx)  C remember the carry out of the doubling
- mov 8(up), %rdx
- mulx( %rdx, %rax, %rdx)  C u1^2
- add w2, %r8
- adc %rax, w0
- mov %r8, 8(rp)
- mov w0, 16(rp)
- mov 24(rp), %r8  C reload the parked cross-product limbs
- mov 32(rp), w0
- lea (%rdx,%rcx), w2  C w2 = high(u1^2) + saved carry; lea keeps CF for the adc below
- adc %r8, %r8
- adc w0, w0
- setc R8(%rcx)
- mov 16(up), %rdx
- mulx( %rdx, %rax, %rdx)  C u2^2
- add w2, %r8
- adc %rax, w0
- mov %r8, 24(rp)
- mov w0, 32(rp)
- adc %rcx, %rdx
- mov %rdx, 40(rp)
- FUNC_EXIT()
- ret
-
-L(gt3):  C un >= 4: general code
-
-define(`v0', `%r8')
-define(`v1', `%r9')
-define(`w0', `%r10')
-define(`w1', `%r11')
-define(`w2', `%rbx')
-define(`w3', `%rbp')
-define(`un', `%r12')
-define(`n', `%rcx')
-
-define(`X0', `%r13')
-define(`X1', `%r14')
-
-L(do_mul_2):
- push %rbx  C callee-saved registers used as w2, w3, un, X0, X1
- push %rbp
- push %r12
- push %r13
- push %r14
- mov $0, R32(un)
- sub un_param, un C free up rdx
- push un  C save -un; popped again at L(corner) for the diagonal pass
- mov (up), v0
- mov 8(up), %rdx
- lea 2(un), n
- sar $2, n C FIXME: suppress, change loop?
- inc un C decrement |un|
- mov %rdx, v1
-
- test $1, R8(un)  C pick loop entry point from the two low bits of un
- jnz L(mx1)
-
-L(mx0): mulx( v0, w2, w1)  C up[1] x up[0]
- mov 16(up), %rdx
- mov w2, 8(rp)
- xor w2, w2
- mulx( v0, w0, w3)
- test $2, R8(un)
- jz L(m00)
-
-L(m10): lea -8(rp), rp
- lea -8(up), up
- jmp L(mlo2)
-
-L(m00): lea 8(up), up
- lea 8(rp), rp
- jmp L(mlo0)
-
-L(mx1): mulx( v0, w0, w3)  C up[1] x up[0]
- mov 16(up), %rdx
- mov w0, 8(rp)
- xor w0, w0
- mulx( v0, w2, w1)
- test $2, R8(un)
- jz L(mlo3)
-
-L(m01): lea 16(rp), rp
- lea 16(up), up
- jmp L(mlo1)
-
- ALIGN(32)
-L(mtop):mulx( v1, %rax, w0)  C mul_2 loop: four limbs per iteration, results stored (not added) to rp
- add %rax, w2 C 0
- mov (up), %rdx
- mulx( v0, %rax, w1)
- adc $0, w0 C 1
- add %rax, w2 C 0
-L(mlo1):adc $0, w1 C 1
- add w3, w2 C 0
- mov w2, (rp) C 0
- adc $0, w1 C 1
- mulx( v1, %rax, w2)
- add %rax, w0 C 1
- mov 8(up), %rdx
- adc $0, w2 C 2
- mulx( v0, %rax, w3)
- add %rax, w0 C 1
- adc $0, w3 C 2
-L(mlo0):add w1, w0 C 1
- mov w0, 8(rp) C 1
- adc $0, w3 C 2
- mulx( v1, %rax, w0)
- add %rax, w2 C 2
- mov 16(up), %rdx
- mulx( v0, %rax, w1)
- adc $0, w0 C 3
- add %rax, w2 C 2
- adc $0, w1 C 3
-L(mlo3):add w3, w2 C 2
- mov w2, 16(rp) C 2
- adc $0, w1 C 3
- mulx( v1, %rax, w2)
- add %rax, w0 C 3
- mov 24(up), %rdx
- adc $0, w2 C 4
- mulx( v0, %rax, w3)
- add %rax, w0 C 3
- adc $0, w3 C 4
-L(mlo2):add w1, w0 C 3
- lea 32(up), up
- mov w0, 24(rp) C 3
- adc $0, w3 C 4
- inc n
- lea 32(rp), rp  C lea leaves the flags from inc n intact for jnz
- jnz L(mtop)
-
-L(mend):mulx( v1, %rdx, %rax)  C wind-down of the mul_2 pass
- add %rdx, w2
- adc $0, %rax
- add w3, w2
- mov w2, (rp)
- adc $0, %rax
- mov %rax, 8(rp)  C rax keeps this top limb; the corner code below reuses it
-
- lea 16(up), up
- lea -16(rp), rp
-
-L(do_addmul_2):
-L(outer):
- lea (up,un,8), up C put back up to 2 positions above last time
- lea 48(rp,un,8), rp C put back rp to 4 positions above last time
-
- mov -8(up), v0 C shared between addmul_2 and corner
-
- add $2, un C decrease |un|
- cmp $-2, un
- jge L(corner)  C too few limbs left for another addmul_2 pass
-
- mov (up), v1
-
- lea 1(un), n
- sar $2, n C FIXME: suppress, change loop?
-
- mov v1, %rdx
- test $1, R8(un)  C pick loop entry point from the two low bits of un
- jnz L(bx1)
-
-L(bx0): mov (rp), X0
- mov 8(rp), X1
- mulx( v0, %rax, w1)
- add %rax, X0
- adc $0, w1
- mov X0, (rp)
- xor w2, w2
- test $2, R8(un)
- jnz L(b10)
-
-L(b00): mov 8(up), %rdx
- lea 16(rp), rp
- lea 16(up), up
- jmp L(lo0)
-
-L(b10): mov 8(up), %rdx
- mov 16(rp), X0
- lea 32(up), up
- inc n
- mulx( v0, %rax, w3)  C mulx writes no flags, so jz still tests inc n
- jz L(ex)
- jmp L(lo2)
-
-L(bx1): mov (rp), X1
- mov 8(rp), X0
- mulx( v0, %rax, w3)
- mov 8(up), %rdx
- add %rax, X1
- adc $0, w3
- xor w0, w0
- mov X1, (rp)
- mulx( v0, %rax, w1)
- test $2, R8(un)
- jz L(b11)
-
-L(b01): mov 16(rp), X1
- lea 24(rp), rp
- lea 24(up), up
- jmp L(lo1)
-
-L(b11): lea 8(rp), rp
- lea 8(up), up
- jmp L(lo3)
-
- ALIGN(32)
-L(top): mulx( v0, %rax, w3)  C addmul_2 loop: four limbs per iteration, accumulated into rp via X0/X1
- add w0, X1
- adc $0, w2
-L(lo2): add %rax, X1
- adc $0, w3
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- lea 32(rp), rp
- add w1, X1
- mov -16(up), %rdx
- mov X1, -24(rp)
- adc $0, w3
- add w2, X0
- mov -8(rp), X1
- mulx( v0, %rax, w1)
- adc $0, w0
-L(lo1): add %rax, X0
- mulx( v1, %rax, w2)
- adc $0, w1
- add w3, X0
- mov X0, -16(rp)
- adc $0, w1
- add %rax, X1
- adc $0, w2
- add w0, X1
- mov -8(up), %rdx
- adc $0, w2
-L(lo0): mulx( v0, %rax, w3)
- add %rax, X1
- adc $0, w3
- mov (rp), X0
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- add w1, X1
- mov X1, -8(rp)
- adc $0, w3
- mov (up), %rdx
- add w2, X0
- mulx( v0, %rax, w1)
- adc $0, w0
-L(lo3): add %rax, X0
- adc $0, w1
- mulx( v1, %rax, w2)
- add w3, X0
- mov 8(rp), X1
- mov X0, (rp)
- mov 16(rp), X0
- adc $0, w1
- add %rax, X1
- adc $0, w2
- mov 8(up), %rdx
- lea 32(up), up
- inc n
- jnz L(top)
-
-L(end): mulx( v0, %rax, w3)  C wind-down of the addmul_2 pass
- add w0, X1
- adc $0, w2
-L(ex): add %rax, X1
- adc $0, w3
- mulx( v1, %rdx, %rax)
- add w1, X1
- mov X1, 8(rp)
- adc $0, w3
- add w2, %rdx
- adc $0, %rax
- add %rdx, w3
- mov w3, 16(rp)
- adc $0, %rax
- mov %rax, 24(rp)  C rax keeps this top limb; the next pass or the corner code reuses it
-
- jmp L(outer) C loop until a small corner remains
-
-L(corner):
- pop un  C un = -un_param again (pushed at L(do_mul_2)); pop and mov leave flags alone
- mov (up), %rdx
- jg L(small_corner)  C flags still from cmp $-2, un at L(outer)
-
- mov %rdx, v1
- mov (rp), X0
- mov %rax, X1 C Tricky rax reuse of last iteration
- mulx( v0, %rax, w1)
- add %rax, X0
- adc $0, w1
- mov X0, (rp)
- mov 8(up), %rdx
- mulx( v0, %rax, w3)
- add %rax, X1
- adc $0, w3
- mulx( v1, %rdx, %rax)
- add w1, X1
- mov X1, 8(rp)
- adc $0, w3
- add w3, %rdx
- mov %rdx, 16(rp)
- adc $0, %rax
- mov %rax, 24(rp)
- lea 32(rp), rp
- lea 16(up), up
- jmp L(com)
-
-L(small_corner):
- mulx( v0, X1, w3)
- add %rax, X1 C Tricky rax reuse of last iteration
- adc $0, w3
- mov X1, (rp)
- mov w3, 8(rp)
- lea 16(rp), rp
- lea 8(up), up
-
-L(com):  C both corner variants join here
-
-L(sqr_diag_addlsh1):  C final pass: double the limbs accumulated in rp and add the squares up[i]^2
- lea 8(up,un,8), up C put back up at its very beginning
- lea (rp,un,8), rp
- lea (rp,un,8), rp C put back rp at its very beginning
- inc un
-
- mov -8(up), %rdx
- xor R32(%rbx), R32(%rbx) C clear CF as side effect
- mulx( %rdx, %rax, %r10)  C square of the limb at -8(up): low limb stored below, high limb carried in r10
- mov %rax, 8(rp)
- mov 16(rp), %r8
- mov 24(rp), %r9
- jmp L(dm)
-
- ALIGN(16)
-L(dtop):mov 32(rp), %r8  C each iteration doubles two limbs of rp and adds one square
- mov 40(rp), %r9
- lea 16(rp), rp
- lea (%rdx,%rbx), %r10  C r10 = high limb of previous square + saved doubling carry; lea keeps CF
-L(dm): adc %r8, %r8
- adc %r9, %r9
- setc R8(%rbx)  C save the carry out of the doubling
- mov (up), %rdx
- lea 8(up), up
- mulx( %rdx, %rax, %rdx)
- add %r10, %r8
- adc %rax, %r9
- mov %r8, 16(rp)
- mov %r9, 24(rp)
- inc un  C inc leaves CF alone, so the adc at L(dm) continues the carry chain
- jnz L(dtop)
-
-L(dend):adc %rbx, %rdx  C fold remaining carries into the top limb
- mov %rdx, 32(rp)
-
- pop %r14  C restore callee-saved registers in reverse push order
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()