summaryrefslogtreecommitdiff
path: root/gmp/mpn/x86_64/bd1
diff options
context:
space:
mode:
Diffstat (limited to 'gmp/mpn/x86_64/bd1')
-rw-r--r--gmp/mpn/x86_64/bd1/README11
-rw-r--r--gmp/mpn/x86_64/bd1/aorrlsh1_n.asm37
-rw-r--r--gmp/mpn/x86_64/bd1/aorsmul_1.asm181
-rw-r--r--gmp/mpn/x86_64/bd1/com.asm37
-rw-r--r--gmp/mpn/x86_64/bd1/copyd.asm37
-rw-r--r--gmp/mpn/x86_64/bd1/copyi.asm37
-rw-r--r--gmp/mpn/x86_64/bd1/gcd_1.asm37
-rw-r--r--gmp/mpn/x86_64/bd1/gmp-mparam.h236
-rw-r--r--gmp/mpn/x86_64/bd1/hamdist.asm38
-rw-r--r--gmp/mpn/x86_64/bd1/mul_1.asm184
-rw-r--r--gmp/mpn/x86_64/bd1/mul_2.asm192
-rw-r--r--gmp/mpn/x86_64/bd1/mul_basecase.asm416
-rw-r--r--gmp/mpn/x86_64/bd1/popcount.asm38
-rw-r--r--gmp/mpn/x86_64/bd1/sec_tabselect.asm37
-rw-r--r--gmp/mpn/x86_64/bd1/sublsh1_n.asm37
15 files changed, 1555 insertions, 0 deletions
diff --git a/gmp/mpn/x86_64/bd1/README b/gmp/mpn/x86_64/bd1/README
new file mode 100644
index 0000000000..ccd210e0d6
--- /dev/null
+++ b/gmp/mpn/x86_64/bd1/README
@@ -0,0 +1,11 @@
+This directory contains code for AMD bulldozer including its piledriver update.
+
+We currently make limited use of SIMD instructions, both via the MPN_PATH and
+via inclusion of x86_64/fastsse files.
+
+The bd1 cores share one SIMD/FPU pipeline for two integer units. This probably
+means that an all-core GMP load (such as a HPC load) might run slower if there
+is significant SIMD dependency.
+
+We should perhaps allow a special 'bd1nosimd' pseudo cpu-name excluding any
+SIMD code.
diff --git a/gmp/mpn/x86_64/bd1/aorrlsh1_n.asm b/gmp/mpn/x86_64/bd1/aorrlsh1_n.asm
new file mode 100644
index 0000000000..c34a5fa134
--- /dev/null
+++ b/gmp/mpn/x86_64/bd1/aorrlsh1_n.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_addlsh1_n and mpn_rsblsh1_n
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)
+include_mpn(`x86_64/atom/aorrlsh1_n.asm')
diff --git a/gmp/mpn/x86_64/bd1/aorsmul_1.asm b/gmp/mpn/x86_64/bd1/aorsmul_1.asm
new file mode 100644
index 0000000000..96fec9f5ac
--- /dev/null
+++ b/gmp/mpn/x86_64/bd1/aorsmul_1.asm
@@ -0,0 +1,181 @@
+dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD Bulldozer.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bd1 4.5-4.7
+C AMD bobcat
+C Intel P4
+C Intel core2
+C Intel NHM
+C Intel SBR
+C Intel atom
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Try to make loop run closer to 4 c/l.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0', `%rcx') C r9
+
+define(`n', `%r11')
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `add')
+ define(`func', `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `sub')
+ define(`func', `mpn_submul_1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+IFDOS(` define(`up', ``%rsi'') ') dnl
+IFDOS(` define(`rp', ``%rcx'') ') dnl
+IFDOS(` define(`v0', ``%r9'') ') dnl
+IFDOS(` define(`r9', ``rdi'') ') dnl
+IFDOS(` define(`n', ``%r8'') ') dnl
+IFDOS(` define(`r8', ``r11'') ') dnl
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+IFDOS(``push %rsi '')
+IFDOS(``push %rdi '')
+IFDOS(``mov %rdx, %rsi '')
+
+ mov (up), %rax C read first u limb early
+ push %rbx
+IFSTD(` mov n_param, %rbx ') C move away n from rdx, mul uses it
+IFDOS(` mov n, %rbx ')
+ mul v0
+
+IFSTD(` mov %rbx, n ')
+
+ and $3, R32(%rbx)
+ lea -16(rp,n,8), rp
+ jz L(b0)
+ cmp $2, R32(%rbx)
+ jb L(b1)
+ jz L(b2)
+
+L(b3): mov $0, R32(%r8)
+ mov %rax, %rbx
+ mov $0, R32(%r9)
+ mov 8(up), %rax
+ mov %rdx, %r10
+ lea (up,n,8), up
+ not n
+ jmp L(L3)
+
+L(b0): mov $0, R32(%r10)
+ mov %rax, %r8
+ mov %rdx, %rbx
+ mov 8(up), %rax
+ lea (up,n,8), up
+ neg n
+ jmp L(L0)
+
+L(b1): cmp $1, n
+ jz L(n1)
+ mov %rax, %r9
+ mov 8(up), %rax
+ mov %rdx, %r8
+ mov $0, R32(%rbx)
+ lea (up,n,8), up
+ neg n
+ inc n
+ jmp L(L1)
+
+L(b2): mov $0, R32(%rbx)
+ mov %rax, %r10
+ mov %rdx, %r9
+ mov 8(up), %rax
+ mov $0, R32(%r8)
+ lea (up,n,8), up
+ neg n
+ add $2, n
+ jns L(end)
+
+ ALIGN(32)
+L(top): mul v0
+ ADDSUB %r10, (rp,n,8)
+ adc %rax, %r9
+ mov (up,n,8), %rax
+ adc %rdx, %r8
+L(L1): mul v0
+ mov $0, R32(%r10)
+ ADDSUB %r9, 8(rp,n,8)
+ adc %rax, %r8
+ adc %rdx, %rbx
+ mov 8(up,n,8), %rax
+L(L0): mul v0
+ ADDSUB %r8, 16(rp,n,8)
+ mov $0, R32(%r8)
+ adc %rax, %rbx
+ mov $0, R32(%r9)
+ mov 16(up,n,8), %rax
+ adc %rdx, %r10
+L(L3): mul v0
+ ADDSUB %rbx, 24(rp,n,8)
+ mov $0, R32(%rbx)
+ adc %rax, %r10
+ adc %rdx, %r9
+ mov 24(up,n,8), %rax
+ add $4, n
+ js L(top)
+
+L(end): mul v0
+ ADDSUB %r10, (rp)
+ adc %r9, %rax
+ adc %r8, %rdx
+L(n1): ADDSUB %rax, 8(rp)
+ adc $0, %rdx
+ mov %rdx, %rax
+
+ pop %rbx
+IFDOS(``pop %rdi '')
+IFDOS(``pop %rsi '')
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/x86_64/bd1/com.asm b/gmp/mpn/x86_64/bd1/com.asm
new file mode 100644
index 0000000000..43f356117a
--- /dev/null
+++ b/gmp/mpn/x86_64/bd1/com.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_com optimised for AMD bd1.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_com)
+include_mpn(`x86_64/fastsse/com-palignr.asm')
diff --git a/gmp/mpn/x86_64/bd1/copyd.asm b/gmp/mpn/x86_64/bd1/copyd.asm
new file mode 100644
index 0000000000..675cdc3f6b
--- /dev/null
+++ b/gmp/mpn/x86_64/bd1/copyd.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_copyd optimised for AMD bd1.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyd)
+include_mpn(`x86_64/fastsse/copyd-palignr.asm')
diff --git a/gmp/mpn/x86_64/bd1/copyi.asm b/gmp/mpn/x86_64/bd1/copyi.asm
new file mode 100644
index 0000000000..ceef036585
--- /dev/null
+++ b/gmp/mpn/x86_64/bd1/copyi.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_copyi optimised for AMD bd1.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyi)
+include_mpn(`x86_64/fastsse/copyi-palignr.asm')
diff --git a/gmp/mpn/x86_64/bd1/gcd_1.asm b/gmp/mpn/x86_64/bd1/gcd_1.asm
new file mode 100644
index 0000000000..3d8e5c7ab1
--- /dev/null
+++ b/gmp/mpn/x86_64/bd1/gcd_1.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_gcd_1.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_1)
+include_mpn(`x86_64/core2/gcd_1.asm')
diff --git a/gmp/mpn/x86_64/bd1/gmp-mparam.h b/gmp/mpn/x86_64/bd1/gmp-mparam.h
new file mode 100644
index 0000000000..5014f9f469
--- /dev/null
+++ b/gmp/mpn/x86_64/bd1/gmp-mparam.h
@@ -0,0 +1,236 @@
+/* AMD bd1 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2010, 2012, 2014 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 3600 MHz Bulldozer Zambezi */
+/* FFT tuning limit = 40000000 */
+/* Generated by tuneup.c, 2014-03-13, gcc 4.5 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 28
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 22
+
+#define MUL_TOOM22_THRESHOLD 20
+#define MUL_TOOM33_THRESHOLD 59
+#define MUL_TOOM44_THRESHOLD 166
+#define MUL_TOOM6H_THRESHOLD 274
+#define MUL_TOOM8H_THRESHOLD 333
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 115
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 150
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 22
+#define SQR_TOOM3_THRESHOLD 85
+#define SQR_TOOM4_THRESHOLD 242
+#define SQR_TOOM6_THRESHOLD 318
+#define SQR_TOOM8_THRESHOLD 478
+
+#define MULMID_TOOM42_THRESHOLD 22
+
+#define MULMOD_BNM1_THRESHOLD 11
+#define SQRMOD_BNM1_THRESHOLD 14
+
+#define MUL_FFT_MODF_THRESHOLD 404 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 404, 5}, { 19, 6}, { 11, 5}, { 23, 6}, \
+ { 19, 7}, { 10, 6}, { 25, 7}, { 15, 6}, \
+ { 31, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \
+ { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \
+ { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \
+ { 95,10}, { 55,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 103,12}, { 31,11}, { 63,10}, \
+ { 135,11}, { 79,10}, { 167,11}, { 95,10}, \
+ { 191,11}, { 111,12}, { 63,11}, { 127,10}, \
+ { 255,11}, { 143,10}, { 287, 9}, { 575,10}, \
+ { 303,11}, { 159,12}, { 95,11}, { 191,10}, \
+ { 383,11}, { 207,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 271,10}, { 543,11}, \
+ { 287,10}, { 575,11}, { 303,12}, { 159,11}, \
+ { 319, 9}, { 1279,11}, { 367,12}, { 191,11}, \
+ { 383,10}, { 767,11}, { 415,12}, { 223,11}, \
+ { 447,10}, { 895,13}, { 127,12}, { 255,11}, \
+ { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \
+ { 575,10}, { 1151,11}, { 607,10}, { 1215,12}, \
+ { 319,10}, { 1279,11}, { 671,12}, { 351,11}, \
+ { 703,13}, { 191,12}, { 383,11}, { 767,12}, \
+ { 415,11}, { 831,10}, { 1663,12}, { 447,11}, \
+ { 895,14}, { 127,13}, { 255,12}, { 511,11}, \
+ { 1023,12}, { 543,11}, { 1087,10}, { 2175,12}, \
+ { 575,11}, { 1151,12}, { 607,11}, { 1215,13}, \
+ { 319,11}, { 1279,12}, { 671,11}, { 1343,10}, \
+ { 2687,12}, { 703,11}, { 1407,13}, { 383,12}, \
+ { 767,11}, { 1535,12}, { 799,11}, { 1599,12}, \
+ { 831,11}, { 1663,13}, { 447,12}, { 895,11}, \
+ { 1791,12}, { 959,14}, { 255,13}, { 511,12}, \
+ { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \
+ { 2431,10}, { 4863,12}, { 1343,13}, { 703,12}, \
+ { 1407,14}, { 383,13}, { 767,12}, { 1599,13}, \
+ { 831,12}, { 1727,11}, { 3455,13}, { 895,12}, \
+ { 1791,13}, { 959,15}, { 255,14}, { 511,13}, \
+ { 1087,12}, { 2175,13}, { 1151,12}, { 2303,13}, \
+ { 1215,12}, { 2431,11}, { 4863,13}, { 1343,12}, \
+ { 2687,13}, { 1471,12}, { 2943,11}, { 5887,14}, \
+ { 767,13}, { 1599,12}, { 3199,13}, { 1727,14}, \
+ { 895,13}, { 1791,12}, { 3583,13}, { 1919,12}, \
+ { 3839,15}, { 511,14}, { 1023,13}, { 2175,14}, \
+ { 1151,13}, { 2303,12}, { 4607,13}, { 2431,12}, \
+ { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \
+ { 2943,12}, { 5887,15}, { 767,14}, { 1535,13}, \
+ { 3199,14}, { 1663,13}, { 3455,12}, { 6911,14}, \
+ { 1791,13}, { 3583,14}, { 1919,13}, { 3839,16}, \
+ { 511,15}, { 1023,14}, { 2175,13}, { 4351,14}, \
+ { 2303,13}, { 4607,14}, { 2431,13}, { 4863,15}, \
+ { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
+ { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+ {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 217
+#define MUL_FFT_THRESHOLD 3712
+
+#define SQR_FFT_MODF_THRESHOLD 380 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 380, 5}, { 17, 6}, { 9, 5}, { 23, 6}, \
+ { 21, 7}, { 11, 6}, { 25, 7}, { 13, 6}, \
+ { 27, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \
+ { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
+ { 19, 7}, { 39, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 83,10}, { 47, 9}, { 95,10}, { 55,11}, \
+ { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
+ { 31,11}, { 63,10}, { 127, 9}, { 255,10}, \
+ { 135,11}, { 79,10}, { 159,11}, { 95,10}, \
+ { 191,11}, { 111,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271,11}, { 143,10}, \
+ { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \
+ { 319,12}, { 95,11}, { 191,10}, { 383,13}, \
+ { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 271,10}, { 543, 9}, { 1087,11}, { 303,10}, \
+ { 607,12}, { 159,11}, { 319,10}, { 639,11}, \
+ { 335,10}, { 671, 9}, { 1343,11}, { 351,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,12}, \
+ { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
+ { 511,10}, { 1023,11}, { 543,10}, { 1087,12}, \
+ { 287,11}, { 575,10}, { 1151,11}, { 607,12}, \
+ { 319,11}, { 639,10}, { 1279,11}, { 671,12}, \
+ { 351,13}, { 191,12}, { 383,11}, { 767,12}, \
+ { 415,11}, { 831,12}, { 447,14}, { 127,13}, \
+ { 255,12}, { 511,11}, { 1023,12}, { 543,11}, \
+ { 1087,10}, { 2175,12}, { 575,11}, { 1151,12}, \
+ { 607,13}, { 319,12}, { 639,11}, { 1279,12}, \
+ { 671,11}, { 1343,10}, { 2687,12}, { 703,11}, \
+ { 1407,13}, { 383,12}, { 767,11}, { 1599,10}, \
+ { 3199,12}, { 831,13}, { 447,12}, { 895,14}, \
+ { 255,13}, { 511,12}, { 1087,11}, { 2175,13}, \
+ { 575,12}, { 1215,11}, { 2431,10}, { 4863,13}, \
+ { 639,12}, { 1343,11}, { 2687,13}, { 703,12}, \
+ { 1407,14}, { 383,13}, { 767,12}, { 1599,11}, \
+ { 3199,13}, { 831,12}, { 1727,13}, { 895,15}, \
+ { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \
+ { 1215,12}, { 2431,11}, { 4863,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1471,12}, { 2943,11}, \
+ { 5887,14}, { 767,13}, { 1599,12}, { 3199,13}, \
+ { 1727,14}, { 895,13}, { 1791,12}, { 3583,13}, \
+ { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2303,12}, { 4607,13}, \
+ { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \
+ { 1407,13}, { 2943,12}, { 5887,15}, { 767,14}, \
+ { 1535,13}, { 3199,14}, { 1663,13}, { 3327,12}, \
+ { 6655,13}, { 3455,12}, { 6911,14}, { 1791,13}, \
+ { 3583,14}, { 1919,13}, { 3839,16}, { 511,15}, \
+ { 1023,14}, { 2175,13}, { 4351,14}, { 2303,13}, \
+ { 4607,14}, { 2431,13}, { 4863,15}, { 32768,16}, \
+ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 220
+#define SQR_FFT_THRESHOLD 3264
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 39
+#define MULLO_MUL_N_THRESHOLD 7246
+
+#define DC_DIV_QR_THRESHOLD 54
+#define DC_DIVAPPR_Q_THRESHOLD 180
+#define DC_BDIV_QR_THRESHOLD 47
+#define DC_BDIV_Q_THRESHOLD 80
+
+#define INV_MULMOD_BNM1_THRESHOLD 38
+#define INV_NEWTON_THRESHOLD 226
+#define INV_APPR_THRESHOLD 188
+
+#define BINV_NEWTON_THRESHOLD 248
+#define REDC_1_TO_REDC_2_THRESHOLD 52
+#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */
+
+#define MU_DIV_QR_THRESHOLD 1334
+#define MU_DIVAPPR_Q_THRESHOLD 1360
+#define MUPI_DIV_QR_THRESHOLD 108
+#define MU_BDIV_QR_THRESHOLD 1142
+#define MU_BDIV_Q_THRESHOLD 1360
+
+#define POWM_SEC_TABLE 1,16,194,386,452,2245
+
+#define MATRIX22_STRASSEN_THRESHOLD 15
+#define HGCD_THRESHOLD 108
+#define HGCD_APPR_THRESHOLD 51
+#define HGCD_REDUCE_THRESHOLD 2681
+#define GCD_DC_THRESHOLD 474
+#define GCDEXT_DC_THRESHOLD 298
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 21
+#define SET_STR_DC_THRESHOLD 418
+#define SET_STR_PRECOMPUTE_THRESHOLD 1289
+
+#define FAC_DSC_THRESHOLD 252
+#define FAC_ODD_THRESHOLD 23
diff --git a/gmp/mpn/x86_64/bd1/hamdist.asm b/gmp/mpn/x86_64/bd1/hamdist.asm
new file mode 100644
index 0000000000..93e1e5632b
--- /dev/null
+++ b/gmp/mpn/x86_64/bd1/hamdist.asm
@@ -0,0 +1,38 @@
+dnl AMD64 mpn_hamdist -- hamming distance.
+
+dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_hamdist)
+include_mpn(`x86_64/k10/hamdist.asm')
diff --git a/gmp/mpn/x86_64/bd1/mul_1.asm b/gmp/mpn/x86_64/bd1/mul_1.asm
new file mode 100644
index 0000000000..e59667c085
--- /dev/null
+++ b/gmp/mpn/x86_64/bd1/mul_1.asm
@@ -0,0 +1,184 @@
+dnl AMD64 mpn_mul_1 optimised for AMD Bulldozer.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bd1 4
+C AMD bobcat
+C Intel P4
+C Intel core2
+C Intel NHM
+C Intel SBR
+C Intel atom
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Move loop code into feed-in blocks, to save insn for zeroing regs.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0', `%rcx') C r9
+
+define(`n', `%rbx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+IFDOS(` define(`up', ``%rsi'') ') dnl
+IFDOS(` define(`rp', ``%rcx'') ') dnl
+IFDOS(` define(`v0', ``%r9'') ') dnl
+IFDOS(` define(`r9', ``rdi'') ') dnl
+IFDOS(` define(`n', ``%r8'') ') dnl
+IFDOS(` define(`r8', ``rbx'') ') dnl
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_1c)
+IFDOS(``push %rsi '')
+IFDOS(``push %rdi '')
+IFDOS(``mov %rdx, %rsi '')
+
+ mov (up), %rax C read first u limb early
+ push %rbx
+IFSTD(` mov n_param, %r11 ') C move away n from rdx, mul uses it
+IFDOS(` mov n, %r11 ')
+ mul v0
+
+IFSTD(` add %r8, %rax ')
+IFDOS(` add 64(%rsp), %rax ') C 40 + 3*8 (3 push insns)
+ adc $0, %rdx
+ jmp L(common)
+
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(mpn_mul_1)
+IFDOS(``push %rsi '')
+IFDOS(``push %rdi '')
+IFDOS(``mov %rdx, %rsi '')
+
+ mov (up), %rax C read first u limb early
+ push %rbx
+IFSTD(` mov n_param, %r11 ') C move away n from rdx, mul uses it
+IFDOS(` mov n, %r11 ')
+ mul v0
+
+L(common):
+IFSTD(` mov %r11, n ')
+
+ and $3, R32(%r11)
+ lea -16(rp,n,8), rp
+ jz L(b0)
+ cmp $2, R32(%r11)
+ jb L(b1)
+ jz L(b2)
+
+L(b3): mov %rax, %r10
+ mov %rdx, %r11
+ mov 8(up), %rax
+ mul v0
+ lea (up,n,8), up
+ not n
+ jmp L(L3)
+
+L(b0): mov %rax, %r9
+ mov %rdx, %r10
+ mov 8(up), %rax
+ lea (up,n,8), up
+ neg n
+ jmp L(L0)
+
+L(b1): mov %rax, %r8
+ cmp $1, n
+ jz L(n1)
+ mov %rdx, %r9
+ lea (up,n,8), up
+ neg n
+ mov %r8, 16(rp,n,8)
+ inc n
+ jmp L(L1)
+
+L(b2): mov %rax, %r11
+ mov %rdx, %r8
+ mov 8(up), %rax
+ lea (up,n,8), up
+ neg n
+ add $2, n
+ jns L(end)
+
+ ALIGN(16)
+L(top): mul v0
+ mov %rdx, %r9
+ add %rax, %r8
+ adc $0, %r9
+ mov %r8, 8(rp,n,8)
+ mov %r11, (rp,n,8)
+L(L1): mov (up,n,8), %rax
+ mul v0
+ add %rax, %r9
+ mov %rdx, %r10
+ mov 8(up,n,8), %rax
+ adc $0, %r10
+L(L0): mul v0
+ add %rax, %r10
+ mov %rdx, %r11
+ mov 16(up,n,8), %rax
+ adc $0, %r11
+ mul v0
+ mov %r9, 16(rp,n,8)
+L(L3): add %rax, %r11
+ mov %r10, 24(rp,n,8)
+ mov %rdx, %r8
+ adc $0, %r8
+ add $4, n
+ mov -8(up,n,8), %rax
+ js L(top)
+
+L(end): mul v0
+ add %rax, %r8
+ adc $0, %rdx
+ mov %r11, (rp)
+L(n1): mov %r8, 8(rp)
+ mov %rdx, %rax
+
+ pop %rbx
+IFDOS(``pop %rdi '')
+IFDOS(``pop %rsi '')
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/x86_64/bd1/mul_2.asm b/gmp/mpn/x86_64/bd1/mul_2.asm
new file mode 100644
index 0000000000..4ed5f30561
--- /dev/null
+++ b/gmp/mpn/x86_64/bd1/mul_2.asm
@@ -0,0 +1,192 @@
+dnl AMD64 mpn_mul_2 optimised for AMD Bulldozer.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bull 4.36 average, quite fluctuating
+C AMD pile 4.38 slighty fluctuating
+C AMD steam
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core
+C Intel NHM
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+C Scheme: genxmul --mul
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`vp', `%rcx') C r9
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mul_2)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+
+ mov (up), %rax
+
+ mov (vp), v0
+ mov 8(vp), v1
+
+ lea (up,n_param,8), up
+ lea (rp,n_param,8), rp
+
+ mov n_param, n
+ mul v0
+ neg n
+
+ test $1, R8(n)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(n)
+ jnz L(b10)
+
+L(b00): mov %rax, w0
+ mov %rdx, w1
+ xor R32(w2), R32(w2)
+ mov (up,n,8), %rax
+ jmp L(lo0)
+
+L(b10): mov %rax, w2
+ mov %rdx, w3
+ mov (up,n,8), %rax
+ xor R32(w0), R32(w0)
+ mul v1
+ add $-2, n
+ jmp L(lo2)
+
+L(bx1): test $2, R8(n)
+ jz L(b11)
+
+L(b01): mov %rax, w3
+ mov %rdx, w0
+ mov (up,n,8), %rax
+ mul v1
+ xor R32(w1), R32(w1)
+ inc n
+ jmp L(lo1)
+
+L(b11): mov %rax, w1
+ mov %rdx, w2
+ mov (up,n,8), %rax
+ xor R32(w3), R32(w3)
+ dec n
+ jmp L(lo3)
+
+ ALIGN(32)
+L(top): mov -8(up,n,8), %rax
+ mul v1
+ mov w2, -16(rp,n,8)
+L(lo1): add %rax, w0
+ mov w3, -8(rp,n,8)
+ adc %rdx, w1
+ mov (up,n,8), %rax
+ mul v0
+ mov $0, R32(w2)
+ add %rax, w0
+ adc %rdx, w1
+ adc $0, R32(w2)
+ mov (up,n,8), %rax
+L(lo0): mul v1
+ add %rax, w1
+ adc %rdx, w2
+ mov 8(up,n,8), %rax
+ mul v0
+ add %rax, w1
+ mov w0, (rp,n,8)
+ mov $0, R32(w3)
+ mov 8(up,n,8), %rax
+ adc %rdx, w2
+ adc $0, R32(w3)
+L(lo3): mul v1
+ add %rax, w2
+ mov 16(up,n,8), %rax
+ adc %rdx, w3
+ mul v0
+ add %rax, w2
+ mov 16(up,n,8), %rax
+ mov $0, R32(w0)
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ mov w1, 8(rp,n,8)
+L(lo2): add %rax, w3
+ adc %rdx, w0
+ mov 24(up,n,8), %rax
+ mul v0
+ add %rax, w3
+ adc %rdx, w0
+ mov $0, R32(w1)
+ adc $0, R32(w1)
+ add $4, n
+ jnc L(top)
+
+L(end): mov -8(up,n,8), %rax
+ mul v1
+ mov w2, -16(rp,n,8)
+ add %rax, w0
+ mov w3, -8(rp,n,8)
+ adc %rdx, w1
+ mov w0, (rp,n,8)
+ mov w1, %rax
+
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp/mpn/x86_64/bd1/mul_basecase.asm b/gmp/mpn/x86_64/bd1/mul_basecase.asm
new file mode 100644
index 0000000000..e47ba587cd
--- /dev/null
+++ b/gmp/mpn/x86_64/bd1/mul_basecase.asm
@@ -0,0 +1,416 @@
+dnl AMD64 mpn_mul_basecase optimised for AMD Bulldozer and Piledriver.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_1 mul_2 mul_3 addmul_2
+C AMD K8,K9
+C AMD K10
+C AMD bull ~4.8 ~4.55 - ~4.3
+C AMD pile ~4.6 ~4.55 - ~4.55
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core
+C Intel NHM
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Merge bull-specific mul_1, if it is not slower the TOOM22 range.
+C Alternatively, we could tweak the present code (which was loopmixed for a
+C different CPU).
+C * Merge faster mul_2, such as the one in the same directory as this file.
+C * Further micro-optimise.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param',`%rdx')
+define(`vp', `%rcx')
+define(`vn', `%r8')
+
+define(`un', `%rbx')
+
+define(`w0', `%r10')
+define(`w1', `%r11')
+define(`w2', `%r12')
+define(`w3', `%r13')
+define(`n', `%rbp')
+define(`v0', `%r9')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+ push %rbx
+ push %rbp
+ mov un_param, un C free up rdx
+ neg un
+
+ mov (up), %rax C shared for mul_1 and mul_2
+ lea (up,un_param,8), up C point at operand end
+ lea (rp,un_param,8), rp C point at rp[un-1]
+
+ mov (vp), v0 C shared for mul_1 and mul_2
+ mul v0 C shared for mul_1 and mul_2
+
+ test $1, R8(vn)
+ jz L(do_mul_2)
+
+L(do_mul_1):
+ test $1, R8(un)
+ jnz L(m1x1)
+
+L(m1x0):mov %rax, w0 C un = 2, 4, 6, 8, ...
+ mov %rdx, w1
+ mov 8(up,un,8), %rax
+ test $2, R8(un)
+ jnz L(m110)
+
+L(m100):lea 2(un), n C un = 4, 8, 12, ...
+ jmp L(m1l0)
+
+L(m110):lea (un), n C un = 2, 6, 10, ...
+ jmp L(m1l2)
+
+L(m1x1):mov %rax, w1 C un = 1, 3, 5, 7, ...
+ mov %rdx, w0
+ test $2, R8(un)
+ jz L(m111)
+
+L(m101):lea 3(un), n C un = 1, 5, 9, ...
+ test n, n
+ js L(m1l1)
+ mov %rax, -8(rp)
+ mov %rdx, (rp)
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(m111):lea 1(un), n C un = 3, 7, 11, ...
+ mov 8(up,un,8), %rax
+ jmp L(m1l3)
+
+ ALIGN(16)
+L(m1tp):mov %rdx, w0
+ add %rax, w1
+L(m1l1):mov -16(up,n,8), %rax
+ adc $0, w0
+ mul v0
+ add %rax, w0
+ mov w1, -24(rp,n,8)
+ mov -8(up,n,8), %rax
+ mov %rdx, w1
+ adc $0, w1
+L(m1l0):mul v0
+ mov w0, -16(rp,n,8)
+ add %rax, w1
+ mov %rdx, w0
+ mov (up,n,8), %rax
+ adc $0, w0
+L(m1l3):mul v0
+ mov w1, -8(rp,n,8)
+ mov %rdx, w1
+ add %rax, w0
+ mov 8(up,n,8), %rax
+ adc $0, w1
+L(m1l2):mul v0
+ mov w0, (rp,n,8)
+ add $4, n
+ jnc L(m1tp)
+
+L(m1ed):add %rax, w1
+ adc $0, %rdx
+ mov w1, I(-8(rp),-24(rp,n,8))
+ mov %rdx, I((rp),-16(rp,n,8))
+
+ dec R32(vn)
+ jz L(ret2)
+
+ lea 8(vp), vp
+ lea 8(rp), rp
+ push %r12
+ push %r13
+ push %r14
+ jmp L(do_addmul)
+
+L(do_mul_2):
+define(`v1', `%r14')
+ push %r12
+ push %r13
+ push %r14
+
+ mov 8(vp), v1
+
+ test $1, R8(un)
+ jnz L(m2b1)
+
+L(m2b0):lea (un), n
+ mov %rax, w2 C 0
+ mov (up,un,8), %rax
+ mov %rdx, w1 C 1
+ mul v1
+ mov %rax, w0 C 1
+ mov w2, (rp,un,8) C 0
+ mov 8(up,un,8), %rax
+ mov %rdx, w2 C 2
+ jmp L(m2l0)
+
+L(m2b1):lea 1(un), n
+ mov %rax, w0 C 1
+ mov %rdx, w3 C 2
+ mov (up,un,8), %rax
+ mul v1
+ mov w0, (rp,un,8) C 1
+ mov %rdx, w0 C 3
+ mov %rax, w2 C 0
+ mov 8(up,un,8), %rax
+ jmp L(m2l1)
+
+ ALIGN(32)
+L(m2tp):add %rax, w2 C 0
+ mov (up,n,8), %rax
+ adc $0, w0 C 1
+L(m2l1):mul v0
+ add %rax, w2 C 0
+ mov (up,n,8), %rax
+ mov %rdx, w1 C 1
+ adc $0, w1 C 1
+ mul v1
+ add w3, w2 C 0
+ adc $0, w1 C 1
+ add %rax, w0 C 1
+ mov w2, (rp,n,8) C 0
+ mov 8(up,n,8), %rax
+ mov %rdx, w2 C 2
+ adc $0, w2 C 2
+L(m2l0):mul v0
+ add %rax, w0 C 1
+ mov %rdx, w3 C 2
+ adc $0, w3 C 2
+ add w1, w0 C 1
+ adc $0, w3 C 2
+ mov 8(up,n,8), %rax
+ mul v1
+ add $2, n
+ mov w0, -8(rp,n,8) C 1
+ mov %rdx, w0 C 3
+ jnc L(m2tp)
+
+L(m2ed):add %rax, w2
+ adc $0, %rdx
+ add w3, w2
+ adc $0, %rdx
+ mov w2, I((rp),(rp,n,8))
+ mov %rdx, I(8(rp),8(rp,n,8))
+
+ add $-2, R32(vn)
+ jz L(ret5)
+
+ lea 16(vp), vp
+ lea 16(rp), rp
+
+
+L(do_addmul):
+ push %r15
+ push vn C save vn in new stack slot
+define(`vn', `(%rsp)')
+define(`X0', `%r14')
+define(`X1', `%r15')
+define(`v1', `%r8')
+
+L(outer):
+ mov (vp), v0
+ mov 8(vp), v1
+
+ mov (up,un,8), %rax
+ mul v0
+
+ test $1, R8(un)
+ jnz L(bx1)
+
+L(bx0): mov %rax, X1
+ mov (up,un,8), %rax
+ mov %rdx, X0
+ mul v1
+ test $2, R8(un)
+ jnz L(b10)
+
+L(b00): lea (un), n C un = 4, 8, 12, ...
+ mov (rp,un,8), w3
+ mov %rax, w0
+ mov 8(up,un,8), %rax
+ mov %rdx, w1
+ jmp L(lo0)
+
+L(b10): lea 2(un), n C un = 2, 6, 10, ...
+ mov (rp,un,8), w1
+ mov %rdx, w3
+ mov %rax, w2
+ mov 8(up,un,8), %rax
+ jmp L(lo2)
+
+L(bx1): mov %rax, X0
+ mov (up,un,8), %rax
+ mov %rdx, X1
+ mul v1
+ test $2, R8(un)
+ jz L(b11)
+
+L(b01): lea 1(un), n C un = 1, 5, 9, ...
+ mov (rp,un,8), w2
+ mov %rdx, w0
+ mov %rax, w3
+ jmp L(lo1)
+
+L(b11): lea -1(un), n C un = 3, 7, 11, ...
+ mov (rp,un,8), w0
+ mov %rax, w1
+ mov 8(up,un,8), %rax
+ mov %rdx, w2
+ jmp L(lo3)
+
+ ALIGN(32)
+L(top):
+L(lo2): mul v0
+ add w1, X1
+ mov X1, -16(rp,n,8)
+ mov %rdx, X1
+ adc %rax, X0
+ adc $0, X1
+ mov -8(up,n,8), %rax
+ mul v1
+ mov -8(rp,n,8), w1
+ mov %rdx, w0
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+L(lo1): mov (up,n,8), %rax
+ mul v0
+ add w2, X0
+ mov X0, -8(rp,n,8)
+ mov %rdx, X0
+ adc %rax, X1
+ mov (up,n,8), %rax
+ adc $0, X0
+ mov (rp,n,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ mov 8(up,n,8), %rax
+ mov %rdx, w1
+ adc $0, w1
+L(lo0): mul v0
+ add w3, X1
+ mov X1, (rp,n,8)
+ adc %rax, X0
+ mov 8(up,n,8), %rax
+ mov %rdx, X1
+ adc $0, X1
+ mov 8(rp,n,8), w3
+ mul v1
+ add w3, w0
+ adc %rax, w1
+ mov 16(up,n,8), %rax
+ mov %rdx, w2
+ adc $0, w2
+L(lo3): mul v0
+ add w0, X0
+ mov X0, 8(rp,n,8)
+ mov %rdx, X0
+ adc %rax, X1
+ adc $0, X0
+ mov 16(up,n,8), %rax
+ mov 16(rp,n,8), w0
+ mul v1
+ mov %rdx, w3
+ add w0, w1
+ adc %rax, w2
+ adc $0, w3
+ mov 24(up,n,8), %rax
+ add $4, n
+ jnc L(top)
+
+L(end): mul v0
+ add w1, X1
+ mov X1, I(-16(rp),-16(rp,n,8))
+ mov %rdx, X1
+ adc %rax, X0
+ adc $0, X1
+ mov I(-8(up),-8(up,n,8)), %rax
+ mul v1
+ mov I(-8(rp),-8(rp,n,8)), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, %rdx
+ add w2, X0
+ adc $0, X1
+ mov X0, I(-8(rp),-8(rp,n,8))
+ add w3, X1
+ mov X1, I((rp),(rp,n,8))
+ adc $0, %rdx
+ mov %rdx, I(8(rp),8(rp,n,8))
+
+
+ addl $-2, vn
+ lea 16(vp), vp
+ lea 16(rp), rp
+ jnz L(outer)
+
+ pop %rax C deallocate vn slot
+ pop %r15
+L(ret5):pop %r14
+ pop %r13
+ pop %r12
+L(ret2):pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp/mpn/x86_64/bd1/popcount.asm b/gmp/mpn/x86_64/bd1/popcount.asm
new file mode 100644
index 0000000000..8f22a715b6
--- /dev/null
+++ b/gmp/mpn/x86_64/bd1/popcount.asm
@@ -0,0 +1,38 @@
+dnl AMD64 mpn_popcount -- population count.
+
+dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86_64/k10/popcount.asm')
diff --git a/gmp/mpn/x86_64/bd1/sec_tabselect.asm b/gmp/mpn/x86_64/bd1/sec_tabselect.asm
new file mode 100644
index 0000000000..e4360341d9
--- /dev/null
+++ b/gmp/mpn/x86_64/bd1/sec_tabselect.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_sec_tabselect.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sec_tabselect)
+include_mpn(`x86_64/fastsse/sec_tabselect.asm')
diff --git a/gmp/mpn/x86_64/bd1/sublsh1_n.asm b/gmp/mpn/x86_64/bd1/sublsh1_n.asm
new file mode 100644
index 0000000000..4ba673d15a
--- /dev/null
+++ b/gmp/mpn/x86_64/bd1/sublsh1_n.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_sublsh1_n
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sublsh1_n mpn_sublsh1_nc)
+include_mpn(`x86_64/atom/sublsh1_n.asm')