summaryrefslogtreecommitdiff
path: root/gmp/mpn/x86/atom
diff options
context:
space:
mode:
Diffstat (limited to 'gmp/mpn/x86/atom')
-rw-r--r--gmp/mpn/x86/atom/aorrlsh1_n.asm53
-rw-r--r--gmp/mpn/x86/atom/aorrlsh2_n.asm53
-rw-r--r--gmp/mpn/x86/atom/aorrlshC_n.asm156
-rw-r--r--gmp/mpn/x86/atom/aors_n.asm159
-rw-r--r--gmp/mpn/x86/atom/aorslshC_n.asm247
-rw-r--r--gmp/mpn/x86/atom/bdiv_q_1.asm35
-rw-r--r--gmp/mpn/x86/atom/cnd_add_n.asm113
-rw-r--r--gmp/mpn/x86/atom/cnd_sub_n.asm124
-rw-r--r--gmp/mpn/x86/atom/dive_1.asm34
-rw-r--r--gmp/mpn/x86/atom/gmp-mparam.h201
-rw-r--r--gmp/mpn/x86/atom/logops_n.asm151
-rw-r--r--gmp/mpn/x86/atom/lshift.asm218
-rw-r--r--gmp/mpn/x86/atom/lshiftc.asm159
-rw-r--r--gmp/mpn/x86/atom/mmx/copyd.asm34
-rw-r--r--gmp/mpn/x86/atom/mmx/copyi.asm34
-rw-r--r--gmp/mpn/x86/atom/mmx/hamdist.asm34
-rw-r--r--gmp/mpn/x86/atom/mod_34lsub1.asm34
-rw-r--r--gmp/mpn/x86/atom/mode1o.asm34
-rw-r--r--gmp/mpn/x86/atom/rshift.asm152
-rw-r--r--gmp/mpn/x86/atom/sse2/aorsmul_1.asm174
-rw-r--r--gmp/mpn/x86/atom/sse2/bdiv_dbm1c.asm34
-rw-r--r--gmp/mpn/x86/atom/sse2/divrem_1.asm34
-rw-r--r--gmp/mpn/x86/atom/sse2/mod_1_1.asm34
-rw-r--r--gmp/mpn/x86/atom/sse2/mod_1_4.asm34
-rw-r--r--gmp/mpn/x86/atom/sse2/mul_1.asm124
-rw-r--r--gmp/mpn/x86/atom/sse2/mul_basecase.asm501
-rw-r--r--gmp/mpn/x86/atom/sse2/popcount.asm35
-rw-r--r--gmp/mpn/x86/atom/sse2/sqr_basecase.asm634
-rw-r--r--gmp/mpn/x86/atom/sublsh1_n.asm34
-rw-r--r--gmp/mpn/x86/atom/sublsh2_n.asm57
30 files changed, 3720 insertions, 0 deletions
diff --git a/gmp/mpn/x86/atom/aorrlsh1_n.asm b/gmp/mpn/x86/atom/aorrlsh1_n.asm
new file mode 100644
index 0000000000..cd1a650022
--- /dev/null
+++ b/gmp/mpn/x86/atom/aorrlsh1_n.asm
@@ -0,0 +1,53 @@
+dnl Intel Atom mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
+
+dnl Contributed to the GNU project by Marco Bodrato.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 1)
+define(RSH, 31)
+
+ifdef(`OPERATION_addlsh1_n', `
+ define(M4_inst, adc)
+ define(M4_opp, sub)
+ define(M4_function, mpn_addlsh1_n)
+ define(M4_function_c, mpn_addlsh1_nc)
+',`ifdef(`OPERATION_rsblsh1_n', `
+ define(M4_inst, sbb)
+ define(M4_opp, add)
+ define(M4_function, mpn_rsblsh1_n)
+ define(M4_function_c, mpn_rsblsh1_nc)
+',`m4_error(`Need OPERATION_addlsh1_n or OPERATION_rsblsh1_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)
+
+include_mpn(`x86/atom/aorrlshC_n.asm')
diff --git a/gmp/mpn/x86/atom/aorrlsh2_n.asm b/gmp/mpn/x86/atom/aorrlsh2_n.asm
new file mode 100644
index 0000000000..10f4419de9
--- /dev/null
+++ b/gmp/mpn/x86/atom/aorrlsh2_n.asm
@@ -0,0 +1,53 @@
+dnl Intel Atom mpn_addlsh2_n/mpn_rsblsh2_n -- rp[] = (vp[] << 2) +- up[]
+
+dnl Contributed to the GNU project by Marco Bodrato.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 30)
+
+ifdef(`OPERATION_addlsh2_n', `
+ define(M4_inst, adcl)
+ define(M4_opp, subl)
+ define(M4_function, mpn_addlsh2_n)
+ define(M4_function_c, mpn_addlsh2_nc)
+',`ifdef(`OPERATION_rsblsh2_n', `
+ define(M4_inst, sbbl)
+ define(M4_opp, addl)
+ define(M4_function, mpn_rsblsh2_n)
+ define(M4_function_c, mpn_rsblsh2_nc)
+',`m4_error(`Need OPERATION_addlsh2_n or OPERATION_rsblsh2_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_addlsh2_nc mpn_rsblsh2_n mpn_rsblsh2_nc)
+
+include_mpn(`x86/atom/aorrlshC_n.asm')
diff --git a/gmp/mpn/x86/atom/aorrlshC_n.asm b/gmp/mpn/x86/atom/aorrlshC_n.asm
new file mode 100644
index 0000000000..71cfe490d6
--- /dev/null
+++ b/gmp/mpn/x86/atom/aorrlshC_n.asm
@@ -0,0 +1,156 @@
+dnl Intel Atom mpn_addlshC_n/mpn_rsblshC_n -- rp[] = (vp[] << C) +- up[]
+
+dnl Contributed to the GNU project by Marco Bodrato.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mp_limb_t mpn_addlshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t mpn_addlshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+C mp_limb_t mpn_rsblshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t mpn_rsblshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_signed_limb_t carry);
+
+C cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 6
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+defframe(PARAM_CORB, 20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_DBLD, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(VAR_COUNT,`PARAM_SIZE')
+define(SAVE_EBP,`PARAM_DBLD')
+define(SAVE_VP,`PARAM_SRC')
+define(SAVE_UP,`PARAM_DST')
+
+define(M, eval(m4_lshift(1,LSH)))
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`vp', `%ebx')
+
+ASM_START()
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(M4_function_c)
+deflit(`FRAME',0)
+ movl PARAM_CORB, %eax
+ movl %eax, %edx
+ shr $LSH, %edx
+ andl $1, %edx
+ M4_opp %edx, %eax
+ jmp L(start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+ xor %eax, %eax
+ xor %edx, %edx
+L(start_nc):
+ push rp FRAME_pushl()
+
+ mov PARAM_SIZE, %ecx C size
+ mov PARAM_DST, rp
+ mov up, SAVE_UP
+ incl %ecx C size + 1
+ mov PARAM_SRC, up
+ mov vp, SAVE_VP
+ shr %ecx C (size+1)\2
+ mov PARAM_DBLD, vp
+ mov %ebp, SAVE_EBP
+ mov %ecx, VAR_COUNT
+ jnc L(entry) C size odd
+
+ shr %edx C size even
+ mov (vp), %ecx
+ lea 4(vp), vp
+ lea (%eax,%ecx,M), %edx
+ mov %ecx, %eax
+ lea -4(up), up
+ lea -4(rp), rp
+ jmp L(enteven)
+
+ ALIGN(16)
+L(oop):
+ lea (%eax,%ecx,M), %ebp
+ shr $RSH, %ecx
+ mov 4(vp), %eax
+ shr %edx
+ lea 8(vp), vp
+ M4_inst (up), %ebp
+ lea (%ecx,%eax,M), %edx
+ mov %ebp, (rp)
+L(enteven):
+ M4_inst 4(up), %edx
+ lea 8(up), up
+ mov %edx, 4(rp)
+ adc %edx, %edx
+ shr $RSH, %eax
+ lea 8(rp), rp
+L(entry):
+ mov (vp), %ecx
+ decl VAR_COUNT
+ jnz L(oop)
+
+ lea (%eax,%ecx,M), %ebp
+ shr $RSH, %ecx
+ shr %edx
+ mov SAVE_VP, vp
+ M4_inst (up), %ebp
+ mov %ecx, %eax
+ mov SAVE_UP, up
+ M4_inst $0, %eax
+ mov %ebp, (rp)
+ mov SAVE_EBP, %ebp
+ pop rp FRAME_popl()
+ ret
+EPILOGUE()
+
+ASM_END()
diff --git a/gmp/mpn/x86/atom/aors_n.asm b/gmp/mpn/x86/atom/aors_n.asm
new file mode 100644
index 0000000000..45ec287c3a
--- /dev/null
+++ b/gmp/mpn/x86/atom/aors_n.asm
@@ -0,0 +1,159 @@
+dnl Intel Atom mpn_add_n/mpn_sub_n -- rp[] = up[] +- vp[].
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Marco Bodrato.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 3
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+ifdef(`OPERATION_add_n', `
+ define(M4_inst, adcl)
+ define(M4_function_n, mpn_add_n)
+ define(M4_function_nc, mpn_add_nc)
+ define(M4_description, add)
+',`ifdef(`OPERATION_sub_n', `
+ define(M4_inst, sbbl)
+ define(M4_function_n, mpn_sub_n)
+ define(M4_function_nc, mpn_sub_nc)
+ define(M4_description, subtract)
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+C
+C Calculate src1,size M4_description src2,size, and store the result in
+C dst,size. The return value is the carry bit from the top of the result (1
+C or 0).
+C
+C The _nc version accepts 1 or 0 for an initial carry into the low limb of
+C the calculation. Note values other than 1 or 0 here will lead to garbage
+C results.
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(SAVE_RP,`PARAM_SIZE')
+define(SAVE_VP,`PARAM_SRC1')
+define(SAVE_UP,`PARAM_DST')
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`vp', `%ebx')
+define(`cy', `%ecx')
+define(`r1', `%ecx')
+define(`r2', `%edx')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+deflit(`FRAME',0)
+
+PROLOGUE(M4_function_n)
+ xor cy, cy C carry
+L(start):
+ mov PARAM_SIZE, %eax C size
+ mov rp, SAVE_RP
+ mov PARAM_DST, rp
+ mov up, SAVE_UP
+ mov PARAM_SRC1, up
+ shr %eax C size >> 1
+ mov vp, SAVE_VP
+ mov PARAM_SRC2, vp
+ jz L(one) C size == 1
+ jc L(three) C size % 2 == 1
+
+ shr cy
+ mov (up), r2
+ lea 4(up), up
+ lea 4(vp), vp
+ lea -4(rp), rp
+ jmp L(entry)
+L(one):
+ shr cy
+ mov (up), r1
+ jmp L(end)
+L(three):
+ shr cy
+ mov (up), r1
+
+ ALIGN(16)
+L(oop):
+ M4_inst (vp), r1
+ lea 8(up), up
+ mov -4(up), r2
+ lea 8(vp), vp
+ mov r1, (rp)
+L(entry):
+ M4_inst -4(vp), r2
+ lea 8(rp), rp
+ dec %eax
+ mov (up), r1
+ mov r2, -4(rp)
+ jnz L(oop)
+
+L(end): C %eax is zero here
+ mov SAVE_UP, up
+ M4_inst (vp), r1
+ mov SAVE_VP, vp
+ mov r1, (rp)
+ adc %eax, %eax
+ mov SAVE_RP, rp
+ ret
+EPILOGUE()
+
+PROLOGUE(M4_function_nc)
+ mov PARAM_CARRY, cy C carry
+ jmp L(start)
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/x86/atom/aorslshC_n.asm b/gmp/mpn/x86/atom/aorslshC_n.asm
new file mode 100644
index 0000000000..75ace65e51
--- /dev/null
+++ b/gmp/mpn/x86/atom/aorslshC_n.asm
@@ -0,0 +1,247 @@
+dnl Intel Atom mpn_addlshC_n/mpn_sublshC_n -- rp[] = up[] +- (vp[] << C)
+
+dnl Contributed to the GNU project by Marco Bodrato.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mp_limb_t mpn_addlshC_n_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C mp_limb_t mpn_addlshC_nc_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t carry);
+C mp_limb_t mpn_sublshC_n_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size,);
+C mp_limb_t mpn_sublshC_nc_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_signed_limb_t borrow);
+
+defframe(PARAM_CORB, 16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+C mp_limb_t mpn_addlshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size,);
+C mp_limb_t mpn_addlshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+C mp_limb_t mpn_sublshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size,);
+C mp_limb_t mpn_sublshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t borrow);
+
+C if src1 == dst, _ip1 is used
+
+C cycles/limb
+C dst!=src1,src2 dst==src1
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 7 6
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+defframe(GPARAM_CORB, 20)
+defframe(GPARAM_SIZE, 16)
+defframe(GPARAM_SRC2, 12)
+
+dnl re-use parameter space
+define(SAVE_EBP,`PARAM_SIZE')
+define(SAVE_EBX,`PARAM_SRC')
+define(SAVE_UP,`PARAM_DST')
+
+define(M, eval(m4_lshift(1,LSH)))
+define(`rp', `%edi')
+define(`up', `%esi')
+
+ASM_START()
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(M4_ip_function_c)
+deflit(`FRAME',0)
+ movl PARAM_CORB, %ecx
+ movl %ecx, %edx
+ shr $LSH, %edx
+ andl $1, %edx
+ M4_opp %edx, %ecx
+ jmp L(start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_ip_function)
+deflit(`FRAME',0)
+
+ xor %ecx, %ecx
+ xor %edx, %edx
+L(start_nc):
+ push rp FRAME_pushl()
+ mov PARAM_DST, rp
+ mov up, SAVE_UP
+ mov PARAM_SRC, up
+ mov %ebx, SAVE_EBX
+ mov PARAM_SIZE, %ebx C size
+L(inplace):
+ incl %ebx C size + 1
+ shr %ebx C (size+1)\2
+ mov %ebp, SAVE_EBP
+ jnc L(entry) C size odd
+
+ add %edx, %edx C size even
+ mov %ecx, %ebp
+ mov (up), %ecx
+ lea -4(rp), rp
+ lea (%ebp,%ecx,M), %eax
+ lea 4(up), up
+ jmp L(enteven)
+
+ ALIGN(16)
+L(oop):
+ lea (%ecx,%eax,M), %ebp
+ shr $RSH, %eax
+ mov 4(up), %ecx
+ add %edx, %edx
+ lea 8(up), up
+ M4_inst %ebp, (rp)
+ lea (%eax,%ecx,M), %eax
+
+L(enteven):
+ M4_inst %eax, 4(rp)
+ lea 8(rp), rp
+
+ sbb %edx, %edx
+ shr $RSH, %ecx
+
+L(entry):
+ mov (up), %eax
+ decl %ebx
+ jnz L(oop)
+
+ lea (%ecx,%eax,M), %ebp
+ shr $RSH, %eax
+ shr %edx
+ M4_inst %ebp, (rp)
+ mov SAVE_UP, up
+ adc $0, %eax
+ mov SAVE_EBP, %ebp
+ mov SAVE_EBX, %ebx
+ pop rp FRAME_popl()
+ ret
+EPILOGUE()
+
+PROLOGUE(M4_function_c)
+deflit(`FRAME',0)
+ movl GPARAM_CORB, %ecx
+ movl %ecx, %edx
+ shr $LSH, %edx
+ andl $1, %edx
+ M4_opp %edx, %ecx
+ jmp L(generic_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+ xor %ecx, %ecx
+ xor %edx, %edx
+L(generic_nc):
+ push rp FRAME_pushl()
+ mov PARAM_DST, rp
+ mov up, SAVE_UP
+ mov PARAM_SRC, up
+ cmp rp, up
+ mov %ebx, SAVE_EBX
+ jne L(general)
+ mov GPARAM_SIZE, %ebx C size
+ mov GPARAM_SRC2, up
+ jmp L(inplace)
+
+L(general):
+ mov GPARAM_SIZE, %eax C size
+ mov %ebx, SAVE_EBX
+ incl %eax C size + 1
+ mov up, %ebx C vp
+ mov GPARAM_SRC2, up C up
+ shr %eax C (size+1)\2
+ mov %ebp, SAVE_EBP
+ mov %eax, GPARAM_SIZE
+ jnc L(entry2) C size odd
+
+ add %edx, %edx C size even
+ mov %ecx, %ebp
+ mov (up), %ecx
+ lea -4(rp), rp
+ lea -4(%ebx), %ebx
+ lea (%ebp,%ecx,M), %eax
+ lea 4(up), up
+ jmp L(enteven2)
+
+ ALIGN(16)
+L(oop2):
+ lea (%ecx,%eax,M), %ebp
+ shr $RSH, %eax
+ mov 4(up), %ecx
+ add %edx, %edx
+ lea 8(up), up
+ mov (%ebx), %edx
+ M4_inst %ebp, %edx
+ lea (%eax,%ecx,M), %eax
+ mov %edx, (rp)
+L(enteven2):
+ mov 4(%ebx), %edx
+ lea 8(%ebx), %ebx
+ M4_inst %eax, %edx
+ mov %edx, 4(rp)
+ sbb %edx, %edx
+ shr $RSH, %ecx
+ lea 8(rp), rp
+L(entry2):
+ mov (up), %eax
+ decl GPARAM_SIZE
+ jnz L(oop2)
+
+ lea (%ecx,%eax,M), %ebp
+ shr $RSH, %eax
+ shr %edx
+ mov (%ebx), %edx
+ M4_inst %ebp, %edx
+ mov %edx, (rp)
+ mov SAVE_UP, up
+ adc $0, %eax
+ mov SAVE_EBP, %ebp
+ mov SAVE_EBX, %ebx
+ pop rp FRAME_popl()
+ ret
+EPILOGUE()
+
+ASM_END()
diff --git a/gmp/mpn/x86/atom/bdiv_q_1.asm b/gmp/mpn/x86/atom/bdiv_q_1.asm
new file mode 100644
index 0000000000..31e908ec44
--- /dev/null
+++ b/gmp/mpn/x86/atom/bdiv_q_1.asm
@@ -0,0 +1,35 @@
+dnl Intel Atom mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- schoolbook Hensel
+dnl division by 1-limb divisor, returning quotient only.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+include_mpn(`x86/pentium/bdiv_q_1.asm')
diff --git a/gmp/mpn/x86/atom/cnd_add_n.asm b/gmp/mpn/x86/atom/cnd_add_n.asm
new file mode 100644
index 0000000000..50bf2ad64b
--- /dev/null
+++ b/gmp/mpn/x86/atom/cnd_add_n.asm
@@ -0,0 +1,113 @@
+dnl X86 mpn_cnd_add_n optimised for Intel Atom.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C P5 ?
+C P6 model 0-8,10-12 ?
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) ?
+C P4 model 0-1 (Willamette) ?
+C P4 model 2 (Northwood) ?
+C P4 model 3-4 (Prescott) ?
+C Intel atom 4.67
+C AMD K6 ?
+C AMD K7 ?
+C AMD K8 ?
+
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`vp', `%ebp')
+define(`n', `%ecx')
+define(`cnd', `20(%esp)')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_cnd_add_n)
+ push %edi
+ push %esi
+ push %ebx
+ push %ebp
+
+ mov cnd, %eax C make cnd into a mask (1)
+ mov 24(%esp), rp
+ neg %eax C make cnd into a mask (1)
+ mov 28(%esp), up
+ sbb %eax, %eax C make cnd into a mask (1)
+ mov 32(%esp), vp
+ mov %eax, cnd C make cnd into a mask (1)
+ mov 36(%esp), n
+
+ xor %edx, %edx
+
+ shr $1, n
+ jnc L(top)
+
+ mov 0(vp), %eax
+ and cnd, %eax
+ lea 4(vp), vp
+ add 0(up), %eax
+ lea 4(rp), rp
+ lea 4(up), up
+ sbb %edx, %edx
+ mov %eax, -4(rp)
+ inc n
+ dec n
+ je L(end)
+
+L(top): sbb %edx, %edx
+ mov 0(vp), %eax
+ and cnd, %eax
+ lea 8(vp), vp
+ lea 8(rp), rp
+ mov -4(vp), %ebx
+ and cnd, %ebx
+ add %edx, %edx
+ adc 0(up), %eax
+ lea 8(up), up
+ mov %eax, -8(rp)
+ adc -4(up), %ebx
+ dec n
+ mov %ebx, -4(rp)
+ jne L(top)
+
+L(end): mov $0, %eax
+ adc %eax, %eax
+
+ pop %ebp
+ pop %ebx
+ pop %esi
+ pop %edi
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/x86/atom/cnd_sub_n.asm b/gmp/mpn/x86/atom/cnd_sub_n.asm
new file mode 100644
index 0000000000..221bedca37
--- /dev/null
+++ b/gmp/mpn/x86/atom/cnd_sub_n.asm
@@ -0,0 +1,124 @@
+dnl X86 mpn_cnd_sub_n optimised for Intel Atom.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C P5 ?
+C P6 model 0-8,10-12 ?
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) ?
+C P4 model 0-1 (Willamette) ?
+C P4 model 2 (Northwood) ?
+C P4 model 3-4 (Prescott) ?
+C Intel atom 5.67
+C AMD K6 ?
+C AMD K7 ?
+C AMD K8 ?
+
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`vp', `%ebp')
+define(`n', `%ecx')
+define(`cnd', `20(%esp)')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_cnd_sub_n)
+ push %edi
+ push %esi
+ push %ebx
+ push %ebp
+
+ mov cnd, %eax C make cnd into a mask (1)
+ mov 24(%esp), rp
+ neg %eax C make cnd into a mask (1)
+ mov 28(%esp), up
+ sbb %eax, %eax C make cnd into a mask (1)
+ mov 32(%esp), vp
+ mov %eax, cnd C make cnd into a mask (1)
+ mov 36(%esp), n
+
+ xor %edx, %edx
+
+ inc n
+ shr n
+ jnc L(ent)
+
+ mov 0(vp), %eax
+ and cnd, %eax
+ lea 4(vp), vp
+ mov 0(up), %edx
+ sub %eax, %edx
+ lea 4(rp), rp
+ lea 4(up), up
+ mov %edx, -4(rp)
+ sbb %edx, %edx C save cy
+
+L(ent): mov 0(vp), %ebx
+ and cnd, %ebx
+ add %edx, %edx C restore cy
+ mov 0(up), %edx
+ dec n
+ je L(end)
+
+L(top): sbb %ebx, %edx
+ mov 4(vp), %eax
+ mov %edx, 0(rp)
+ sbb %edx, %edx C save cy
+ mov 8(vp), %ebx
+ lea 8(up), up
+ and cnd, %ebx
+ and cnd, %eax
+ add %edx, %edx C restore cy
+ mov -4(up), %edx
+ lea 8(rp), rp
+ sbb %eax, %edx
+ mov %edx, -4(rp)
+ dec n
+ mov 0(up), %edx
+ lea 8(vp), vp
+ jne L(top)
+
+L(end): sbb %ebx, %edx
+ mov %edx, 0(rp)
+
+ mov $0, %eax
+ adc %eax, %eax
+
+ pop %ebp
+ pop %ebx
+ pop %esi
+ pop %edi
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/x86/atom/dive_1.asm b/gmp/mpn/x86/atom/dive_1.asm
new file mode 100644
index 0000000000..71036a15a4
--- /dev/null
+++ b/gmp/mpn/x86/atom/dive_1.asm
@@ -0,0 +1,34 @@
+dnl Intel Atom mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_divexact_1)
+include_mpn(`x86/pentium/dive_1.asm')
diff --git a/gmp/mpn/x86/atom/gmp-mparam.h b/gmp/mpn/x86/atom/gmp-mparam.h
new file mode 100644
index 0000000000..45df12806c
--- /dev/null
+++ b/gmp/mpn/x86/atom/gmp-mparam.h
@@ -0,0 +1,201 @@
+/* Intel Atom/32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2011, 2014 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 1667 MHz Pineview (Atom D510) */
+/* FFT tuning limit = 25000000 */
+/* Generated by tuneup.c, 2014-03-14, gcc 4.5 */
+
+#define MOD_1_NORM_THRESHOLD 3
+#define MOD_1_UNNORM_THRESHOLD 5
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 11
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1
+#define DIV_QR_1_NORM_THRESHOLD 4
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 31
+
+#define MUL_TOOM22_THRESHOLD 20
+#define MUL_TOOM33_THRESHOLD 74
+#define MUL_TOOM44_THRESHOLD 178
+#define MUL_TOOM6H_THRESHOLD 270
+#define MUL_TOOM8H_THRESHOLD 399
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 115
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 127
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 106
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 30
+#define SQR_TOOM3_THRESHOLD 105
+#define SQR_TOOM4_THRESHOLD 178
+#define SQR_TOOM6_THRESHOLD 303
+#define SQR_TOOM8_THRESHOLD 527
+
+#define MULMID_TOOM42_THRESHOLD 54
+
+#define MULMOD_BNM1_THRESHOLD 13
+#define SQRMOD_BNM1_THRESHOLD 18
+
+#define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 380, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
+ { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
+ { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
+ { 47, 8}, { 95,10}, { 31, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 127, 8}, { 255, 9}, { 135,10}, { 79, 9}, \
+ { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \
+ { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \
+ { 143, 9}, { 287, 8}, { 575,10}, { 159,11}, \
+ { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
+ { 543,10}, { 287, 9}, { 575,10}, { 303,11}, \
+ { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \
+ { 671,10}, { 351, 9}, { 703,11}, { 191,10}, \
+ { 383, 9}, { 767,10}, { 415, 9}, { 831,11}, \
+ { 223,10}, { 447,12}, { 127,11}, { 255,10}, \
+ { 543,11}, { 287,10}, { 607, 9}, { 1215,11}, \
+ { 319,10}, { 671,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
+ { 831,11}, { 447,13}, { 127,12}, { 255,11}, \
+ { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \
+ { 319,11}, { 735,12}, { 383,11}, { 831,12}, \
+ { 447,11}, { 959,13}, { 255,12}, { 511,11}, \
+ { 1087,12}, { 575,11}, { 1151,12}, { 703,11}, \
+ { 1471,13}, { 383,12}, { 831,11}, { 1663,12}, \
+ { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \
+ { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \
+ { 1663,13}, { 895,12}, { 1919,14}, { 511,13}, \
+ { 1023,12}, { 2111,13}, { 1151,12}, { 2431,13}, \
+ { 1407,12}, { 2943,14}, { 767,13}, { 1663,12}, \
+ { 3455,13}, { 1919,15}, { 511,14}, { 1023,13}, \
+ { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \
+ { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 150
+#define MUL_FFT_THRESHOLD 4544
+
+#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 340, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 12, 5}, { 25, 6}, { 21, 7}, { 11, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
+ { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
+ { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \
+ { 47,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 127, 8}, \
+ { 255,10}, { 79, 9}, { 159, 8}, { 319,10}, \
+ { 95, 9}, { 191,11}, { 63,10}, { 127, 9}, \
+ { 255, 8}, { 511, 9}, { 271,10}, { 143, 9}, \
+ { 287, 8}, { 575, 9}, { 303, 8}, { 607,10}, \
+ { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \
+ { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,10}, { 287, 9}, \
+ { 575,10}, { 303, 9}, { 607,10}, { 319, 9}, \
+ { 639,10}, { 335, 9}, { 671,10}, { 351, 9}, \
+ { 703,11}, { 191,10}, { 383, 9}, { 767,10}, \
+ { 415,11}, { 223,10}, { 447,12}, { 127,11}, \
+ { 255,10}, { 543,11}, { 287,10}, { 607,11}, \
+ { 319,10}, { 671,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
+ { 831,11}, { 479,13}, { 127,12}, { 255,11}, \
+ { 543,10}, { 1087,11}, { 607,12}, { 319,11}, \
+ { 671,10}, { 1343,11}, { 735,12}, { 383,11}, \
+ { 831,12}, { 447,11}, { 959,13}, { 255,12}, \
+ { 511,11}, { 1087,12}, { 575,11}, { 1215,12}, \
+ { 639,11}, { 1343,12}, { 703,11}, { 1407,13}, \
+ { 383,12}, { 831,11}, { 1663,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1215,13}, { 639,12}, \
+ { 1471,13}, { 767,12}, { 1663,13}, { 895,12}, \
+ { 1791,14}, { 511,13}, { 1023,12}, { 2111,13}, \
+ { 1151,12}, { 2431,13}, { 1407,14}, { 767,13}, \
+ { 1663,12}, { 3455,13}, { 1791,15}, { 511,14}, \
+ { 1023,13}, { 2431,14}, { 1279,13}, { 2943,12}, \
+ { 5887,14}, { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 151
+#define SQR_FFT_THRESHOLD 2880
+
+#define MULLO_BASECASE_THRESHOLD 6
+#define MULLO_DC_THRESHOLD 48
+#define MULLO_MUL_N_THRESHOLD 8907
+
+#define DC_DIV_QR_THRESHOLD 59
+#define DC_DIVAPPR_Q_THRESHOLD 250
+#define DC_BDIV_QR_THRESHOLD 59
+#define DC_BDIV_Q_THRESHOLD 169
+
+#define INV_MULMOD_BNM1_THRESHOLD 38
+#define INV_NEWTON_THRESHOLD 246
+#define INV_APPR_THRESHOLD 246
+
+#define BINV_NEWTON_THRESHOLD 276
+#define REDC_1_TO_REDC_N_THRESHOLD 67
+
+#define MU_DIV_QR_THRESHOLD 1334
+#define MU_DIVAPPR_Q_THRESHOLD 1442
+#define MUPI_DIV_QR_THRESHOLD 114
+#define MU_BDIV_QR_THRESHOLD 1142
+#define MU_BDIV_Q_THRESHOLD 1334
+
+#define POWM_SEC_TABLE 1,22,98,416,1378
+
+#define MATRIX22_STRASSEN_THRESHOLD 13
+#define HGCD_THRESHOLD 133
+#define HGCD_APPR_THRESHOLD 169
+#define HGCD_REDUCE_THRESHOLD 2479
+#define GCD_DC_THRESHOLD 460
+#define GCDEXT_DC_THRESHOLD 342
+#define JACOBI_BASE_METHOD 3
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 23
+#define SET_STR_DC_THRESHOLD 321
+#define SET_STR_PRECOMPUTE_THRESHOLD 1099
+
+#define FAC_DSC_THRESHOLD 198
+#define FAC_ODD_THRESHOLD 34
diff --git a/gmp/mpn/x86/atom/logops_n.asm b/gmp/mpn/x86/atom/logops_n.asm
new file mode 100644
index 0000000000..3cb6d7310c
--- /dev/null
+++ b/gmp/mpn/x86/atom/logops_n.asm
@@ -0,0 +1,151 @@
+dnl Intel Atom mpn_and_n,...,mpn_xnor_n -- bitwise logical operations.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Marco Bodrato.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C op nop opn
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 3 3.5 3.5
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+define(M4_choose_op,
+`ifdef(`OPERATION_$1',`
+define(`M4_function', `mpn_$1')
+define(`M4_want_pre', `$4')
+define(`M4_inst', `$3')
+define(`M4_want_post',`$2')
+')')
+define(M4pre, `ifelse(M4_want_pre, yes,`$1')')
+define(M4post,`ifelse(M4_want_post,yes,`$1')')
+
+M4_choose_op( and_n, , andl, )
+M4_choose_op( andn_n, , andl, yes)
+M4_choose_op( nand_n, yes, andl, )
+M4_choose_op( ior_n, , orl, )
+M4_choose_op( iorn_n, , orl, yes)
+M4_choose_op( nior_n, yes, orl, )
+M4_choose_op( xor_n, , xorl, )
+M4_choose_op( xnor_n, yes, xorl, )
+
+ifdef(`M4_function',,
+`m4_error(`Unrecognised or undefined OPERATION symbol
+')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+C void M4_function (mp_ptr dst, mp_srcptr src2, mp_srcptr src1, mp_size_t size);
+C
+
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC1, 12)
+defframe(PARAM_SRC2, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(SAVE_RP,`PARAM_SIZE')
+define(SAVE_VP,`PARAM_SRC1')
+define(SAVE_UP,`PARAM_DST')
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`vp', `%ebx')
+define(`cnt', `%eax')
+define(`r1', `%ecx')
+define(`r2', `%edx')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+deflit(`FRAME',0)
+
+PROLOGUE(M4_function)
+ mov PARAM_SIZE, cnt C size
+ mov rp, SAVE_RP
+ mov PARAM_DST, rp
+ mov up, SAVE_UP
+ mov PARAM_SRC1, up
+ shr cnt C size >> 1
+ mov vp, SAVE_VP
+ mov PARAM_SRC2, vp
+ mov (up), r1
+ jz L(end) C size == 1
+ jnc L(even) C size % 2 == 0
+
+ ALIGN(16)
+L(oop):
+M4pre(` notl_or_xorl_GMP_NUMB_MASK(r1)')
+ M4_inst (vp), r1
+ lea 8(up), up
+ mov -4(up), r2
+M4post(` notl_or_xorl_GMP_NUMB_MASK(r1)')
+ lea 8(vp), vp
+ mov r1, (rp)
+L(entry):
+M4pre(` notl_or_xorl_GMP_NUMB_MASK(r2)')
+ M4_inst -4(vp), r2
+ lea 8(rp), rp
+M4post(` notl_or_xorl_GMP_NUMB_MASK(r2)')
+ dec cnt
+ mov (up), r1
+ mov r2, -4(rp)
+ jnz L(oop)
+
+L(end):
+M4pre(` notl_or_xorl_GMP_NUMB_MASK(r1)')
+ mov SAVE_UP, up
+ M4_inst (vp), r1
+M4post(`notl_or_xorl_GMP_NUMB_MASK(r1)')
+ mov SAVE_VP, vp
+ mov r1, (rp)
+ mov SAVE_RP, rp
+ ret
+
+L(even):
+ mov r1, r2
+ lea 4(up), up
+ lea 4(vp), vp
+ lea -4(rp), rp
+ jmp L(entry)
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/x86/atom/lshift.asm b/gmp/mpn/x86/atom/lshift.asm
new file mode 100644
index 0000000000..f2c70dd3e8
--- /dev/null
+++ b/gmp/mpn/x86/atom/lshift.asm
@@ -0,0 +1,218 @@
+dnl Intel Atom mpn_lshift -- mpn left shift.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned cnt);
+
+C cycles/limb
+C cnt!=1 cnt==1
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 5 2.5
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+defframe(PARAM_CNT, 16)
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(SAVE_UP,`PARAM_CNT')
+define(VAR_COUNT,`PARAM_SIZE')
+define(SAVE_EBX,`PARAM_SRC')
+define(SAVE_EBP,`PARAM_DST')
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`cnt', `%ecx')
+
+ASM_START()
+ TEXT
+ ALIGN(8)
+deflit(`FRAME',0)
+PROLOGUE(mpn_lshift)
+ mov PARAM_CNT, cnt
+ mov PARAM_SIZE, %edx
+ mov up, SAVE_UP
+ mov PARAM_SRC, up
+ push rp FRAME_pushl()
+ mov PARAM_DST, rp
+
+C We can use faster code for shift-by-1 under certain conditions.
+ cmp $1,cnt
+ jne L(normal)
+ cmpl rp, up
+ jnc L(special) C jump if s_ptr + 1 >= res_ptr
+ leal (up,%edx,4),%eax
+ cmpl %eax,rp
+ jnc L(special) C jump if res_ptr >= s_ptr + size
+
+L(normal):
+ lea -4(up,%edx,4), up
+ mov %ebx, SAVE_EBX
+ lea -4(rp,%edx,4), rp
+
+ shr %edx
+ mov (up), %eax
+ mov %edx, VAR_COUNT
+ jnc L(evn)
+
+ mov %eax, %ebx
+ shl %cl, %ebx
+ neg cnt
+ shr %cl, %eax
+ test %edx, %edx
+ jnz L(gt1)
+ mov %ebx, (rp)
+ jmp L(quit)
+
+L(gt1): mov %ebp, SAVE_EBP
+ push %eax
+ mov -4(up), %eax
+ mov %eax, %ebp
+ shr %cl, %eax
+ jmp L(lo1)
+
+L(evn): mov %ebp, SAVE_EBP
+ neg cnt
+ mov %eax, %ebp
+ mov -4(up), %edx
+ shr %cl, %eax
+ mov %edx, %ebx
+ shr %cl, %edx
+ neg cnt
+ decl VAR_COUNT
+ lea 4(rp), rp
+ lea -4(up), up
+ jz L(end)
+ push %eax FRAME_pushl()
+
+ ALIGN(8)
+L(top): shl %cl, %ebp
+ or %ebp, %edx
+ shl %cl, %ebx
+ neg cnt
+ mov -4(up), %eax
+ mov %eax, %ebp
+ mov %edx, -4(rp)
+ shr %cl, %eax
+ lea -8(rp), rp
+L(lo1): mov -8(up), %edx
+ or %ebx, %eax
+ mov %edx, %ebx
+ shr %cl, %edx
+ lea -8(up), up
+ neg cnt
+ mov %eax, (rp)
+ decl VAR_COUNT
+ jg L(top)
+
+ pop %eax FRAME_popl()
+L(end):
+ shl %cl, %ebp
+ shl %cl, %ebx
+ or %ebp, %edx
+ mov SAVE_EBP, %ebp
+ mov %edx, -4(rp)
+ mov %ebx, -8(rp)
+
+L(quit):
+ mov SAVE_UP, up
+ mov SAVE_EBX, %ebx
+ pop rp FRAME_popl()
+ ret
+
+L(special):
+deflit(`FRAME',4)
+ lea 3(%edx), %eax C size + 3
+ dec %edx C size - 1
+ mov (up), %ecx
+ shr $2, %eax C (size + 3) / 4
+ and $3, %edx C (size - 1) % 4
+ jz L(goloop) C jmp if size == 1 (mod 4)
+ shr %edx
+ jnc L(odd) C jum if size == 3 (mod 4)
+
+ add %ecx, %ecx
+ lea 4(up), up
+ mov %ecx, (rp)
+ mov (up), %ecx
+ lea 4(rp), rp
+
+ dec %edx
+ jnz L(goloop) C jump if size == 0 (mod 4)
+L(odd): lea -8(up), up
+ lea -8(rp), rp
+ jmp L(sentry) C reached if size == 2 or 3 (mod 4)
+
+L(sloop):
+ adc %ecx, %ecx
+ mov 4(up), %edx
+ mov %ecx, (rp)
+ adc %edx, %edx
+ mov 8(up), %ecx
+ mov %edx, 4(rp)
+L(sentry):
+ adc %ecx, %ecx
+ mov 12(up), %edx
+ mov %ecx, 8(rp)
+ adc %edx, %edx
+ lea 16(up), up
+ mov %edx, 12(rp)
+ lea 16(rp), rp
+ mov (up), %ecx
+L(goloop):
+ decl %eax
+ jnz L(sloop)
+
+L(squit):
+ adc %ecx, %ecx
+ mov %ecx, (rp)
+ adc %eax, %eax
+
+ mov SAVE_UP, up
+ pop rp FRAME_popl()
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/x86/atom/lshiftc.asm b/gmp/mpn/x86/atom/lshiftc.asm
new file mode 100644
index 0000000000..5be53ed19d
--- /dev/null
+++ b/gmp/mpn/x86/atom/lshiftc.asm
@@ -0,0 +1,159 @@
+dnl Intel Atom mpn_lshiftc -- mpn left shift with complement.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mp_limb_t mpn_lshiftc (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned cnt);
+
+C cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 5.5
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+defframe(PARAM_CNT, 16)
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(SAVE_UP,`PARAM_CNT')
+define(VAR_COUNT,`PARAM_SIZE')
+define(SAVE_EBX,`PARAM_SRC')
+define(SAVE_EBP,`PARAM_DST')
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`cnt', `%ecx')
+
+ASM_START()
+ TEXT
+
+PROLOGUE(mpn_lshiftc)
+deflit(`FRAME',0)
+ mov PARAM_CNT, cnt
+ mov PARAM_SIZE, %edx
+ mov up, SAVE_UP
+ mov PARAM_SRC, up
+ push rp FRAME_pushl()
+ mov PARAM_DST, rp
+
+ lea -4(up,%edx,4), up
+ mov %ebx, SAVE_EBX
+ lea -4(rp,%edx,4), rp
+
+ shr %edx
+ mov (up), %eax
+ mov %edx, VAR_COUNT
+ jnc L(evn)
+
+ mov %eax, %ebx
+ shl %cl, %ebx
+ neg cnt
+ shr %cl, %eax
+ test %edx, %edx
+ jnz L(gt1)
+ not %ebx
+ mov %ebx, (rp)
+ jmp L(quit)
+
+L(gt1): mov %ebp, SAVE_EBP
+ push %eax
+ mov -4(up), %eax
+ mov %eax, %ebp
+ shr %cl, %eax
+ jmp L(lo1)
+
+L(evn): mov %ebp, SAVE_EBP
+ neg cnt
+ mov %eax, %ebp
+ mov -4(up), %edx
+ shr %cl, %eax
+ mov %edx, %ebx
+ shr %cl, %edx
+ neg cnt
+ decl VAR_COUNT
+ lea 4(rp), rp
+ lea -4(up), up
+ jz L(end)
+ push %eax FRAME_pushl()
+
+L(top): shl %cl, %ebp
+ or %ebp, %edx
+ shl %cl, %ebx
+ neg cnt
+ not %edx
+ mov -4(up), %eax
+ mov %eax, %ebp
+ mov %edx, -4(rp)
+ shr %cl, %eax
+ lea -8(rp), rp
+L(lo1): mov -8(up), %edx
+ or %ebx, %eax
+ mov %edx, %ebx
+ shr %cl, %edx
+ not %eax
+ lea -8(up), up
+ neg cnt
+ mov %eax, (rp)
+ decl VAR_COUNT
+ jg L(top)
+
+ pop %eax FRAME_popl()
+L(end):
+ shl %cl, %ebp
+ shl %cl, %ebx
+ or %ebp, %edx
+ mov SAVE_EBP, %ebp
+ not %edx
+ not %ebx
+ mov %edx, -4(rp)
+ mov %ebx, -8(rp)
+
+L(quit):
+ mov SAVE_UP, up
+ mov SAVE_EBX, %ebx
+ pop rp FRAME_popl()
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/x86/atom/mmx/copyd.asm b/gmp/mpn/x86/atom/mmx/copyd.asm
new file mode 100644
index 0000000000..b80fb033fe
--- /dev/null
+++ b/gmp/mpn/x86/atom/mmx/copyd.asm
@@ -0,0 +1,34 @@
+dnl Intel Atom mpn_copyd -- copy limb vector, decrementing.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_copyd)
+include_mpn(`x86/k7/mmx/copyd.asm')
diff --git a/gmp/mpn/x86/atom/mmx/copyi.asm b/gmp/mpn/x86/atom/mmx/copyi.asm
new file mode 100644
index 0000000000..49b6b8d662
--- /dev/null
+++ b/gmp/mpn/x86/atom/mmx/copyi.asm
@@ -0,0 +1,34 @@
+dnl Intel Atom mpn_copyi -- copy limb vector, incrementing.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_copyi)
+include_mpn(`x86/k7/mmx/copyi.asm')
diff --git a/gmp/mpn/x86/atom/mmx/hamdist.asm b/gmp/mpn/x86/atom/mmx/hamdist.asm
new file mode 100644
index 0000000000..3fe8253240
--- /dev/null
+++ b/gmp/mpn/x86/atom/mmx/hamdist.asm
@@ -0,0 +1,34 @@
+dnl Intel Atom mpn_hamdist -- hamming distance.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_hamdist)
+include_mpn(`x86/k7/mmx/popham.asm')
diff --git a/gmp/mpn/x86/atom/mod_34lsub1.asm b/gmp/mpn/x86/atom/mod_34lsub1.asm
new file mode 100644
index 0000000000..6d57ba385d
--- /dev/null
+++ b/gmp/mpn/x86/atom/mod_34lsub1.asm
@@ -0,0 +1,34 @@
+dnl Intel Atom mpn_mod_34lsub1 -- remainder modulo 2^24-1.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_mod_34lsub1)
+include_mpn(`x86/p6/mod_34lsub1.asm')
diff --git a/gmp/mpn/x86/atom/mode1o.asm b/gmp/mpn/x86/atom/mode1o.asm
new file mode 100644
index 0000000000..c9ee6bd2db
--- /dev/null
+++ b/gmp/mpn/x86/atom/mode1o.asm
@@ -0,0 +1,34 @@
+dnl Intel Atom mpn_modexact_1_odd -- exact division style remainder.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_modexact_1_odd mpn_modexact_1c_odd)
+include_mpn(`x86/pentium/mode1o.asm')
diff --git a/gmp/mpn/x86/atom/rshift.asm b/gmp/mpn/x86/atom/rshift.asm
new file mode 100644
index 0000000000..1cb5dbefe9
--- /dev/null
+++ b/gmp/mpn/x86/atom/rshift.asm
@@ -0,0 +1,152 @@
+dnl Intel Atom mpn_rshift -- mpn right shift.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl Converted from AMD64 by Marco Bodrato.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned cnt);
+
+C cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 5
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+defframe(PARAM_CNT, 16)
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(SAVE_UP,`PARAM_CNT')
+define(VAR_COUNT,`PARAM_SIZE')
+define(SAVE_EBX,`PARAM_SRC')
+define(SAVE_EBP,`PARAM_DST')
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`cnt', `%ecx')
+
+ASM_START()
+ TEXT
+ ALIGN(8)
+deflit(`FRAME',0)
+PROLOGUE(mpn_rshift)
+ mov PARAM_CNT, cnt
+ mov PARAM_SIZE, %edx
+ mov up, SAVE_UP
+ mov PARAM_SRC, up
+ push rp FRAME_pushl()
+ mov PARAM_DST, rp
+ mov %ebx, SAVE_EBX
+
+ shr %edx
+ mov (up), %eax
+ mov %edx, VAR_COUNT
+ jnc L(evn)
+
+ mov %eax, %ebx
+ shr %cl, %ebx
+ neg cnt
+ shl %cl, %eax
+ test %edx, %edx
+ jnz L(gt1)
+ mov %ebx, (rp)
+ jmp L(quit)
+
+L(gt1): mov %ebp, SAVE_EBP
+ push %eax
+ mov 4(up), %eax
+ mov %eax, %ebp
+ shl %cl, %eax
+ jmp L(lo1)
+
+L(evn): mov %ebp, SAVE_EBP
+ neg cnt
+ mov %eax, %ebp
+ mov 4(up), %edx
+ shl %cl, %eax
+ mov %edx, %ebx
+ shl %cl, %edx
+ neg cnt
+ decl VAR_COUNT
+ lea -4(rp), rp
+ lea 4(up), up
+ jz L(end)
+ push %eax FRAME_pushl()
+
+ ALIGN(8)
+L(top): shr %cl, %ebp
+ or %ebp, %edx
+ shr %cl, %ebx
+ neg cnt
+ mov 4(up), %eax
+ mov %eax, %ebp
+ mov %edx, 4(rp)
+ shl %cl, %eax
+ lea 8(rp), rp
+L(lo1): mov 8(up), %edx
+ or %ebx, %eax
+ mov %edx, %ebx
+ shl %cl, %edx
+ lea 8(up), up
+ neg cnt
+ mov %eax, (rp)
+ decl VAR_COUNT
+ jg L(top)
+
+ pop %eax FRAME_popl()
+L(end):
+ shr %cl, %ebp
+ shr %cl, %ebx
+ or %ebp, %edx
+ mov SAVE_EBP, %ebp
+ mov %edx, 4(rp)
+ mov %ebx, 8(rp)
+
+L(quit):
+ mov SAVE_UP, up
+ mov SAVE_EBX, %ebx
+ pop rp FRAME_popl()
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/x86/atom/sse2/aorsmul_1.asm b/gmp/mpn/x86/atom/sse2/aorsmul_1.asm
new file mode 100644
index 0000000000..969a14a919
--- /dev/null
+++ b/gmp/mpn/x86/atom/sse2/aorsmul_1.asm
@@ -0,0 +1,174 @@
+dnl x86-32 mpn_addmul_1 and mpn_submul_1 optimised for Intel Atom.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C cycles/limb
+C P5 -
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 8
+C AMD K6
+C AMD K7 -
+C AMD K8
+C AMD K10
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`n', `%ecx')
+
+ifdef(`OPERATION_addmul_1',`
+ define(ADDSUB, add)
+ define(func_1, mpn_addmul_1)
+ define(func_1c, mpn_addmul_1c)')
+ifdef(`OPERATION_submul_1',`
+ define(ADDSUB, sub)
+ define(func_1, mpn_submul_1)
+ define(func_1c, mpn_submul_1c)')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(func_1)
+ xor %edx, %edx
+L(ent): push %edi
+ push %esi
+ push %ebx
+ mov 16(%esp), rp
+ mov 20(%esp), up
+ mov 24(%esp), n
+ movd 28(%esp), %mm7
+ test $1, n
+ jz L(fi0or2)
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ shr $2, n
+ jnc L(fi1)
+
+L(fi3): lea -8(up), up
+ lea -8(rp), rp
+ movd 12(up), %mm1
+ movd %mm0, %ebx
+ pmuludq %mm7, %mm1
+ add $1, n C increment and clear carry
+ jmp L(lo3)
+
+L(fi1): movd %mm0, %ebx
+ jz L(wd1)
+ movd 4(up), %mm1
+ pmuludq %mm7, %mm1
+ jmp L(lo1)
+
+L(fi0or2):
+ movd (up), %mm1
+ pmuludq %mm7, %mm1
+ shr $2, n
+ movd 4(up), %mm0
+ jc L(fi2)
+ lea -4(up), up
+ lea -4(rp), rp
+ movd %mm1, %eax
+ pmuludq %mm7, %mm0
+ jmp L(lo0)
+
+L(fi2): lea 4(up), up
+ add $1, n C increment and clear carry
+ movd %mm1, %eax
+ lea -12(rp), rp
+ jmp L(lo2)
+
+C ALIGN(16) C alignment seems irrelevant
+L(top): movd 4(up), %mm1
+ adc $0, %edx
+ ADDSUB %eax, 12(rp)
+ movd %mm0, %ebx
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+L(lo1): psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %edx
+ movd %mm1, %eax
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ ADDSUB %ebx, (rp)
+L(lo0): psrlq $32, %mm1
+ adc %edx, %eax
+ movd %mm1, %edx
+ movd %mm0, %ebx
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ ADDSUB %eax, 4(rp)
+L(lo3): psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %edx
+ movd %mm1, %eax
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ ADDSUB %ebx, 8(rp)
+L(lo2): psrlq $32, %mm1
+ adc %edx, %eax
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ dec n
+ jnz L(top)
+
+L(end): adc n, %edx C n is zero here
+ ADDSUB %eax, 12(rp)
+ movd %mm0, %ebx
+ lea 16(rp), rp
+L(wd1): psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %eax
+ adc n, %eax
+ ADDSUB %ebx, (rp)
+ emms
+ adc n, %eax
+ pop %ebx
+ pop %esi
+ pop %edi
+ ret
+EPILOGUE()
+PROLOGUE(func_1c)
+ mov 20(%esp), %edx C carry
+ jmp L(ent)
+EPILOGUE()
diff --git a/gmp/mpn/x86/atom/sse2/bdiv_dbm1c.asm b/gmp/mpn/x86/atom/sse2/bdiv_dbm1c.asm
new file mode 100644
index 0000000000..782e914019
--- /dev/null
+++ b/gmp/mpn/x86/atom/sse2/bdiv_dbm1c.asm
@@ -0,0 +1,34 @@
+dnl Intel Atom mpn_bdiv_dbm1.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_bdiv_dbm1c)
+include_mpn(`x86/pentium4/sse2/bdiv_dbm1c.asm')
diff --git a/gmp/mpn/x86/atom/sse2/divrem_1.asm b/gmp/mpn/x86/atom/sse2/divrem_1.asm
new file mode 100644
index 0000000000..f84709a22e
--- /dev/null
+++ b/gmp/mpn/x86/atom/sse2/divrem_1.asm
@@ -0,0 +1,34 @@
+dnl Intel Atom mpn_divrem_1 -- mpn by limb division.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_preinv_divrem_1 mpn_divrem_1c mpn_divrem_1)
+include_mpn(`x86/pentium4/sse2/divrem_1.asm')
diff --git a/gmp/mpn/x86/atom/sse2/mod_1_1.asm b/gmp/mpn/x86/atom/sse2/mod_1_1.asm
new file mode 100644
index 0000000000..ae6581d9b6
--- /dev/null
+++ b/gmp/mpn/x86/atom/sse2/mod_1_1.asm
@@ -0,0 +1,34 @@
+dnl Intel Atom/SSE2 mpn_mod_1_1.
+
+dnl Copyright 2009, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_mod_1_1p)
+include_mpn(`x86/pentium4/sse2/mod_1_1.asm')
diff --git a/gmp/mpn/x86/atom/sse2/mod_1_4.asm b/gmp/mpn/x86/atom/sse2/mod_1_4.asm
new file mode 100644
index 0000000000..31faa3f0a3
--- /dev/null
+++ b/gmp/mpn/x86/atom/sse2/mod_1_4.asm
@@ -0,0 +1,34 @@
+dnl Intel Atom/SSE2 mpn_mod_1_4.
+
+dnl Copyright 2009, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_mod_1s_4p)
+include_mpn(`x86/pentium4/sse2/mod_1_4.asm')
diff --git a/gmp/mpn/x86/atom/sse2/mul_1.asm b/gmp/mpn/x86/atom/sse2/mul_1.asm
new file mode 100644
index 0000000000..aa3bb974bb
--- /dev/null
+++ b/gmp/mpn/x86/atom/sse2/mul_1.asm
@@ -0,0 +1,124 @@
+dnl Intel Atom mpn_mul_1.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C cycles/limb
+C P5 -
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 7.5
+C AMD K6 -
+C AMD K7 -
+C AMD K8
+C AMD K10
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_MUL, 16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+define(`rp', `%edx')
+define(`up', `%esi')
+define(`n', `%ecx')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+deflit(`FRAME',0)
+
+PROLOGUE(mpn_mul_1c)
+ movd PARAM_CARRY, %mm6 C carry
+ jmp L(ent)
+EPILOGUE()
+
+ ALIGN(8) C for compact code
+PROLOGUE(mpn_mul_1)
+ pxor %mm6, %mm6
+L(ent): push %esi FRAME_pushl()
+ mov PARAM_SRC, up
+ mov PARAM_SIZE, %eax C size
+ movd PARAM_MUL, %mm7
+ movd (up), %mm0
+ mov %eax, n
+ and $3, %eax
+ pmuludq %mm7, %mm0
+ mov PARAM_DST, rp
+ jz L(lo0)
+ cmp $2, %eax
+ lea -16(up,%eax,4),up
+ lea -16(rp,%eax,4),rp
+ jc L(lo1)
+ jz L(lo2)
+ jmp L(lo3)
+
+ ALIGN(16)
+L(top): movd (up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+L(lo0): paddq %mm0, %mm6
+ movd 4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, (rp)
+ psrlq $32, %mm6
+L(lo3): paddq %mm0, %mm6
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 4(rp)
+ psrlq $32, %mm6
+L(lo2): paddq %mm0, %mm6
+ movd 12(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 8(rp)
+ psrlq $32, %mm6
+L(lo1): paddq %mm0, %mm6
+ sub $4, n
+ movd %mm6, 12(rp)
+ lea 16(up), up
+ ja L(top)
+
+ psrlq $32, %mm6
+ movd %mm6, %eax
+ emms
+ pop %esi FRAME_popl()
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/x86/atom/sse2/mul_basecase.asm b/gmp/mpn/x86/atom/sse2/mul_basecase.asm
new file mode 100644
index 0000000000..97d3aeb5ad
--- /dev/null
+++ b/gmp/mpn/x86/atom/sse2/mul_basecase.asm
@@ -0,0 +1,501 @@
+dnl x86 mpn_mul_basecase -- Multiply two limb vectors and store the result in
+dnl a third limb vector.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
+C 4 large loops into one; we could use it for the outer loop branch.
+C * Optimise code outside of inner loops.
+C * Write combined addmul_1 feed-in a wind-down code, and use when iterating
+C outer each loop. ("Overlapping software pipelining")
+C * Postpone push of ebx until we know vn > 1. Perhaps use caller-saves regs
+C for inlined mul_1, allowing us to postpone all pushes.
+C * Perhaps write special code for vn <= un < M, for some small M.
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xn,
+C mp_srcptr yp, mp_size_t yn);
+C
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`un', `%ecx')
+define(`vp', `%ebp')
+define(`vn', `36(%esp)')
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+ push %edi
+ push %esi
+ push %ebx
+ push %ebp
+ mov 20(%esp), rp
+ mov 24(%esp), up
+ mov 28(%esp), un
+ mov 32(%esp), vp
+
+ movd (up), %mm0
+ movd (vp), %mm7
+ pmuludq %mm7, %mm0
+ pxor %mm6, %mm6
+
+ mov un, %eax
+ and $3, %eax
+ jz L(of0)
+ cmp $2, %eax
+ jc L(of1)
+ jz L(of2)
+
+C ================================================================
+ jmp L(m3)
+ ALIGN(16)
+L(lm3): movd -4(up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+ paddq %mm0, %mm6
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -4(rp)
+ psrlq $32, %mm6
+L(m3): paddq %mm0, %mm6
+ movd 4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, (rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 4(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ sub $4, un
+ movd %mm6, 8(rp)
+ lea 16(up), up
+ ja L(lm3)
+
+ psrlq $32, %mm6
+ movd %mm6, 12(rp)
+
+ decl vn
+ jz L(done)
+ lea -8(rp), rp
+
+L(ol3): mov 28(%esp), un
+ neg un
+ lea 4(vp), vp
+ movd (vp), %mm7 C read next V limb
+ mov 24(%esp), up
+ lea 16(rp,un,4), rp
+
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ sar $2, un
+ movd 4(up), %mm1
+ movd %mm0, %ebx
+ pmuludq %mm7, %mm1
+ lea -8(up), up
+ xor %edx, %edx C zero edx and CF
+ jmp L(a3)
+
+L(la3): movd 4(up), %mm1
+ adc $0, %edx
+ add %eax, 12(rp)
+ movd %mm0, %ebx
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %edx
+ movd %mm1, %eax
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ add %ebx, (rp)
+ psrlq $32, %mm1
+ adc %edx, %eax
+ movd %mm1, %edx
+ movd %mm0, %ebx
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ add %eax, 4(rp)
+L(a3): psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %edx
+ movd %mm1, %eax
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ add %ebx, 8(rp)
+ psrlq $32, %mm1
+ adc %edx, %eax
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ inc un
+ jnz L(la3)
+
+ adc un, %edx C un is zero here
+ add %eax, 12(rp)
+ movd %mm0, %ebx
+ psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %eax
+ adc un, %eax
+ add %ebx, 16(rp)
+ adc un, %eax
+ mov %eax, 20(rp)
+
+ decl vn
+ jnz L(ol3)
+ jmp L(done)
+
+C ================================================================
+ ALIGN(16)
+L(lm0): movd (up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+L(of0): paddq %mm0, %mm6
+ movd 4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, (rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 4(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd 12(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 8(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ sub $4, un
+ movd %mm6, 12(rp)
+ lea 16(up), up
+ ja L(lm0)
+
+ psrlq $32, %mm6
+ movd %mm6, 16(rp)
+
+ decl vn
+ jz L(done)
+ lea -4(rp), rp
+
+L(ol0): mov 28(%esp), un
+ neg un
+ lea 4(vp), vp
+ movd (vp), %mm7 C read next V limb
+ mov 24(%esp), up
+ lea 20(rp,un,4), rp
+
+ movd (up), %mm1
+ pmuludq %mm7, %mm1
+ sar $2, un
+ movd 4(up), %mm0
+ lea -4(up), up
+ movd %mm1, %eax
+ pmuludq %mm7, %mm0
+ xor %edx, %edx C zero edx and CF
+ jmp L(a0)
+
+L(la0): movd 4(up), %mm1
+ adc $0, %edx
+ add %eax, 12(rp)
+ movd %mm0, %ebx
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %edx
+ movd %mm1, %eax
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ add %ebx, (rp)
+L(a0): psrlq $32, %mm1
+ adc %edx, %eax
+ movd %mm1, %edx
+ movd %mm0, %ebx
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ add %eax, 4(rp)
+ psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %edx
+ movd %mm1, %eax
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ add %ebx, 8(rp)
+ psrlq $32, %mm1
+ adc %edx, %eax
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ inc un
+ jnz L(la0)
+
+ adc un, %edx C un is zero here
+ add %eax, 12(rp)
+ movd %mm0, %ebx
+ psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %eax
+ adc un, %eax
+ add %ebx, 16(rp)
+ adc un, %eax
+ mov %eax, 20(rp)
+
+ decl vn
+ jnz L(ol0)
+ jmp L(done)
+
+C ================================================================
+ ALIGN(16)
+L(lm1): movd -12(up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+ paddq %mm0, %mm6
+ movd -8(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -12(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd -4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -8(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -4(rp)
+ psrlq $32, %mm6
+L(of1): paddq %mm0, %mm6
+ sub $4, un
+ movd %mm6, (rp)
+ lea 16(up), up
+ ja L(lm1)
+
+ psrlq $32, %mm6
+ movd %mm6, 4(rp)
+
+ decl vn
+ jz L(done)
+ lea -16(rp), rp
+
+L(ol1): mov 28(%esp), un
+ neg un
+ lea 4(vp), vp
+ movd (vp), %mm7 C read next V limb
+ mov 24(%esp), up
+ lea 24(rp,un,4), rp
+
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ sar $2, un
+ movd %mm0, %ebx
+ movd 4(up), %mm1
+ pmuludq %mm7, %mm1
+ xor %edx, %edx C zero edx and CF
+ inc un
+ jmp L(a1)
+
+L(la1): movd 4(up), %mm1
+ adc $0, %edx
+ add %eax, 12(rp)
+ movd %mm0, %ebx
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+L(a1): psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %edx
+ movd %mm1, %eax
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ add %ebx, (rp)
+ psrlq $32, %mm1
+ adc %edx, %eax
+ movd %mm1, %edx
+ movd %mm0, %ebx
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ add %eax, 4(rp)
+ psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %edx
+ movd %mm1, %eax
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ add %ebx, 8(rp)
+ psrlq $32, %mm1
+ adc %edx, %eax
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ inc un
+ jnz L(la1)
+
+ adc un, %edx C un is zero here
+ add %eax, 12(rp)
+ movd %mm0, %ebx
+ psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %eax
+ adc un, %eax
+ add %ebx, 16(rp)
+ adc un, %eax
+ mov %eax, 20(rp)
+
+ decl vn
+ jnz L(ol1)
+ jmp L(done)
+
+C ================================================================
+ ALIGN(16)
+L(lm2): movd -8(up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+ paddq %mm0, %mm6
+ movd -4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -8(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -4(rp)
+ psrlq $32, %mm6
+L(of2): paddq %mm0, %mm6
+ movd 4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, (rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ sub $4, un
+ movd %mm6, 4(rp)
+ lea 16(up), up
+ ja L(lm2)
+
+ psrlq $32, %mm6
+ movd %mm6, 8(rp)
+
+ decl vn
+ jz L(done)
+ lea -12(rp), rp
+
+L(ol2): mov 28(%esp), un
+ neg un
+ lea 4(vp), vp
+ movd (vp), %mm7 C read next V limb
+ mov 24(%esp), up
+ lea 12(rp,un,4), rp
+
+ movd (up), %mm1
+ pmuludq %mm7, %mm1
+ sar $2, un
+ movd 4(up), %mm0
+ lea 4(up), up
+ movd %mm1, %eax
+ xor %edx, %edx C zero edx and CF
+ jmp L(lo2)
+
+L(la2): movd 4(up), %mm1
+ adc $0, %edx
+ add %eax, 12(rp)
+ movd %mm0, %ebx
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %edx
+ movd %mm1, %eax
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ add %ebx, (rp)
+ psrlq $32, %mm1
+ adc %edx, %eax
+ movd %mm1, %edx
+ movd %mm0, %ebx
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ add %eax, 4(rp)
+ psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %edx
+ movd %mm1, %eax
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ add %ebx, 8(rp)
+L(lo2): psrlq $32, %mm1
+ adc %edx, %eax
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ inc un
+ jnz L(la2)
+
+ adc un, %edx C un is zero here
+ add %eax, 12(rp)
+ movd %mm0, %ebx
+ psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %eax
+ adc un, %eax
+ add %ebx, 16(rp)
+ adc un, %eax
+ mov %eax, 20(rp)
+
+ decl vn
+ jnz L(ol2)
+C jmp L(done)
+
+C ================================================================
+L(done):
+ emms
+ pop %ebp
+ pop %ebx
+ pop %esi
+ pop %edi
+ ret
+EPILOGUE()
diff --git a/gmp/mpn/x86/atom/sse2/popcount.asm b/gmp/mpn/x86/atom/sse2/popcount.asm
new file mode 100644
index 0000000000..7847aec8e6
--- /dev/null
+++ b/gmp/mpn/x86/atom/sse2/popcount.asm
@@ -0,0 +1,35 @@
+dnl Intel Atom mpn_popcount -- population count.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86/pentium4/sse2/popcount.asm')
diff --git a/gmp/mpn/x86/atom/sse2/sqr_basecase.asm b/gmp/mpn/x86/atom/sse2/sqr_basecase.asm
new file mode 100644
index 0000000000..af19ed854d
--- /dev/null
+++ b/gmp/mpn/x86/atom/sse2/sqr_basecase.asm
@@ -0,0 +1,634 @@
+dnl x86 mpn_sqr_basecase -- square an mpn number, optimised for atom.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
+C 4 large loops into one; we could use it for the outer loop branch.
+C * Optimise code outside of inner loops.
+C * Write combined addmul_1 feed-in a wind-down code, and use when iterating
+C outer each loop. ("Overlapping software pipelining")
+C * Perhaps use caller-saves regs for inlined mul_1, allowing us to postpone
+C all pushes.
+C * Perhaps write special code for n < M, for some small M.
+C * Replace inlined addmul_1 with smaller code from aorsmul_1.asm, or perhaps
+C with even less pipelined code.
+C * We run the outer loop until we have a 2-limb by 1-limb addmul_1 left.
+C Consider breaking out earlier, saving high the cost of short loops.
+
+C void mpn_sqr_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xn);
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`n', `%ecx')
+
+define(`un', `%ebp')
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sqr_basecase)
+ push %edi
+ push %esi
+ mov 12(%esp), rp
+ mov 16(%esp), up
+ mov 20(%esp), n
+
+ lea 4(rp), rp C write triangular product starting at rp[1]
+ dec n
+ movd (up), %mm7
+
+ jz L(one)
+ lea 4(up), up
+ push %ebx
+ push %ebp
+ mov n, %eax
+
+ movd (up), %mm0
+ neg n
+ pmuludq %mm7, %mm0
+ pxor %mm6, %mm6
+ mov n, un
+
+ and $3, %eax
+ jz L(of0)
+ cmp $2, %eax
+ jc L(of1)
+ jz L(of2)
+
+C ================================================================
+ jmp L(m3)
+ ALIGN(16)
+L(lm3): movd -4(up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+ paddq %mm0, %mm6
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -4(rp)
+ psrlq $32, %mm6
+L(m3): paddq %mm0, %mm6
+ movd 4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, (rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 4(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ add $4, un
+ movd %mm6, 8(rp)
+ lea 16(up), up
+ js L(lm3)
+
+ psrlq $32, %mm6
+ movd %mm6, 12(rp)
+
+ inc n
+C jz L(done)
+ lea -12(up), up
+ lea 4(rp), rp
+ jmp L(ol2)
+
+C ================================================================
+ ALIGN(16)
+L(lm0): movd (up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+L(of0): paddq %mm0, %mm6
+ movd 4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, (rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 4(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd 12(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 8(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ add $4, un
+ movd %mm6, 12(rp)
+ lea 16(up), up
+ js L(lm0)
+
+ psrlq $32, %mm6
+ movd %mm6, 16(rp)
+
+ inc n
+C jz L(done)
+ lea -8(up), up
+ lea 8(rp), rp
+ jmp L(ol3)
+
+C ================================================================
+ ALIGN(16)
+L(lm1): movd -12(up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+ paddq %mm0, %mm6
+ movd -8(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -12(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd -4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -8(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -4(rp)
+ psrlq $32, %mm6
+L(of1): paddq %mm0, %mm6
+ add $4, un
+ movd %mm6, (rp)
+ lea 16(up), up
+ js L(lm1)
+
+ psrlq $32, %mm6
+ movd %mm6, 4(rp)
+
+ inc n
+ jz L(done) C goes away when we add special n=2 code
+ lea -20(up), up
+ lea -4(rp), rp
+ jmp L(ol0)
+
+C ================================================================
+ ALIGN(16)
+L(lm2): movd -8(up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+ paddq %mm0, %mm6
+ movd -4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -8(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -4(rp)
+ psrlq $32, %mm6
+L(of2): paddq %mm0, %mm6
+ movd 4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, (rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ add $4, un
+ movd %mm6, 4(rp)
+ lea 16(up), up
+ js L(lm2)
+
+ psrlq $32, %mm6
+ movd %mm6, 8(rp)
+
+ inc n
+C jz L(done)
+ lea -16(up), up
+C lea (rp), rp
+C jmp L(ol1)
+
+C ================================================================
+
+L(ol1): lea 4(up,n,4), up
+ movd (up), %mm7 C read next U invariant limb
+ lea 8(rp,n,4), rp
+ mov n, un
+
+ movd 4(up), %mm1
+ pmuludq %mm7, %mm1
+ sar $2, un
+ movd %mm1, %ebx
+ inc un
+ jz L(re1)
+
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ xor %edx, %edx C zero edx and CF
+ jmp L(a1)
+
+L(la1): adc $0, %edx
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ add %eax, (rp)
+L(a1): psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ movd %mm0, %eax
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ add %ebx, 4(rp)
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ add %eax, 8(rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ inc un
+ movd 4(up), %mm1
+ jnz L(la1)
+
+ adc un, %edx C un is zero here
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ adc un, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %eax
+ adc un, %eax
+ add %ebx, 4(rp)
+ adc un, %eax
+ mov %eax, 8(rp)
+
+ inc n
+
+C ================================================================
+
+L(ol0): lea (up,n,4), up
+ movd 4(up), %mm7 C read next U invariant limb
+ lea 4(rp,n,4), rp
+ mov n, un
+
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ sar $2, un
+ movd 12(up), %mm1
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ xor %edx, %edx C zero edx and CF
+ jmp L(a0)
+
+L(la0): adc $0, %edx
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ movd %mm0, %eax
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ add %ebx, 4(rp)
+L(a0): psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ add %eax, 8(rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ inc un
+ movd 4(up), %mm1
+ jnz L(la0)
+
+ adc un, %edx C un is zero here
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ adc un, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %eax
+ adc un, %eax
+ add %ebx, 4(rp)
+ adc un, %eax
+ mov %eax, 8(rp)
+
+ inc n
+
+C ================================================================
+
+L(ol3): lea 12(up,n,4), up
+ movd -8(up), %mm7 C read next U invariant limb
+ lea (rp,n,4), rp C put rp back
+ mov n, un
+
+ movd -4(up), %mm1
+ pmuludq %mm7, %mm1
+ sar $2, un
+ movd %mm1, %ebx
+ movd (up), %mm0
+ xor %edx, %edx C zero edx and CF
+ jmp L(a3)
+
+L(la3): adc $0, %edx
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ movd %mm0, %eax
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ add %ebx, 4(rp)
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ add %eax, 8(rp)
+L(a3): psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ inc un
+ movd 4(up), %mm1
+ jnz L(la3)
+
+ adc un, %edx C un is zero here
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ adc un, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %eax
+ adc un, %eax
+ add %ebx, 4(rp)
+ adc un, %eax
+ mov %eax, 8(rp)
+
+ inc n
+
+C ================================================================
+
+L(ol2): lea 8(up,n,4), up
+ movd -4(up), %mm7 C read next U invariant limb
+ lea 12(rp,n,4), rp
+ mov n, un
+
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ xor %edx, %edx
+ sar $2, un
+ movd 4(up), %mm1
+ test un, un C clear carry
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ inc un
+ jnz L(a2)
+ jmp L(re2)
+
+L(la2): adc $0, %edx
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+L(a2): psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ movd %mm0, %eax
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ add %ebx, 4(rp)
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ add %eax, 8(rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ inc un
+ movd 4(up), %mm1
+ jnz L(la2)
+
+ adc un, %edx C un is zero here
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ adc un, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %eax
+ adc un, %eax
+ add %ebx, 4(rp)
+ adc un, %eax
+ mov %eax, 8(rp)
+
+ inc n
+ jmp L(ol1)
+
+C ================================================================
+L(re2): psrlq $32, %mm0
+ movd (up), %mm7 C read next U invariant limb
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ adc un, %edx
+ add %eax, (rp)
+ lea 4(rp), rp
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %eax
+ movd 4(up), %mm1
+ adc un, %eax
+ add %ebx, (rp)
+ pmuludq %mm7, %mm1
+ adc un, %eax
+ mov %eax, 4(rp)
+ movd %mm1, %ebx
+
+L(re1): psrlq $32, %mm1
+ add %ebx, 4(rp)
+ movd %mm1, %eax
+ adc un, %eax
+ xor n, n C make n zeroness assumption below true
+ mov %eax, 8(rp)
+
+L(done): C n is zero here
+ mov 24(%esp), up
+ mov 28(%esp), %eax
+
+ movd (up), %mm0
+ inc %eax
+ pmuludq %mm0, %mm0
+ lea 4(up), up
+ mov 20(%esp), rp
+ shr %eax
+ movd %mm0, (rp)
+ psrlq $32, %mm0
+ lea -12(rp), rp
+ mov %eax, 28(%esp)
+ jnc L(odd)
+
+ movd %mm0, %ebp
+ movd (up), %mm0
+ lea 8(rp), rp
+ pmuludq %mm0, %mm0
+ lea -4(up), up
+ add 8(rp), %ebp
+ movd %mm0, %edx
+ adc 12(rp), %edx
+ rcr n
+ jmp L(ent)
+
+C ALIGN(16) C alignment seems irrelevant
+L(top): movd (up), %mm1
+ adc n, n
+ movd %mm0, %eax
+ pmuludq %mm1, %mm1
+ movd 4(up), %mm0
+ adc (rp), %eax
+ movd %mm1, %ebx
+ pmuludq %mm0, %mm0
+ psrlq $32, %mm1
+ adc 4(rp), %ebx
+ movd %mm1, %ebp
+ movd %mm0, %edx
+ adc 8(rp), %ebp
+ adc 12(rp), %edx
+ rcr n C FIXME: isn't this awfully slow on atom???
+ adc %eax, (rp)
+ adc %ebx, 4(rp)
+L(ent): lea 8(up), up
+ adc %ebp, 8(rp)
+ psrlq $32, %mm0
+ adc %edx, 12(rp)
+L(odd): decl 28(%esp)
+ lea 16(rp), rp
+ jnz L(top)
+
+L(end): adc n, n
+ movd %mm0, %eax
+ adc n, %eax
+ mov %eax, (rp)
+
+L(rtn): emms
+ pop %ebp
+ pop %ebx
+ pop %esi
+ pop %edi
+ ret
+
+L(one): pmuludq %mm7, %mm7
+ movq %mm7, -4(rp)
+ emms
+ pop %esi
+ pop %edi
+ ret
+EPILOGUE()
diff --git a/gmp/mpn/x86/atom/sublsh1_n.asm b/gmp/mpn/x86/atom/sublsh1_n.asm
new file mode 100644
index 0000000000..d3e7e5b5cb
--- /dev/null
+++ b/gmp/mpn/x86/atom/sublsh1_n.asm
@@ -0,0 +1,34 @@
+dnl Intel Atom mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_sublsh1_n_ip1)
+include_mpn(`x86/k7/sublsh1_n.asm')
diff --git a/gmp/mpn/x86/atom/sublsh2_n.asm b/gmp/mpn/x86/atom/sublsh2_n.asm
new file mode 100644
index 0000000000..79405cf9f4
--- /dev/null
+++ b/gmp/mpn/x86/atom/sublsh2_n.asm
@@ -0,0 +1,57 @@
+dnl Intel Atom mpn_addlsh2_n/mpn_sublsh2_n -- rp[] = up[] +- (vp[] << 2).
+
+dnl Contributed to the GNU project by Marco Bodrato.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 30)
+
+ifdef(`OPERATION_addlsh2_n', `
+ define(M4_inst, adcl)
+ define(M4_opp, subl)
+ define(M4_function, mpn_addlsh2_n)
+ define(M4_function_c, mpn_addlsh2_nc)
+ define(M4_ip_function_c, mpn_addlsh2_nc_ip1)
+ define(M4_ip_function, mpn_addlsh2_n_ip1)
+',`ifdef(`OPERATION_sublsh2_n', `
+ define(M4_inst, sbbl)
+ define(M4_opp, addl)
+ define(M4_function, mpn_sublsh2_n)
+ define(M4_function_c, mpn_sublsh2_nc)
+ define(M4_ip_function_c, mpn_sublsh2_nc_ip1)
+ define(M4_ip_function, mpn_sublsh2_n_ip1)
+',`m4_error(`Need OPERATION_addlsh2_n or OPERATION_sublsh2_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_sublsh2_n mpn_sublsh2_nc mpn_sublsh2_n_ip1 mpn_sublsh2_nc_ip1)
+
+include_mpn(`x86/atom/aorslshC_n.asm')