Diffstat (limited to 'gmp/mpn/x86/k7')
-rw-r--r--  gmp/mpn/x86/k7/README             174
-rw-r--r--  gmp/mpn/x86/k7/addlsh1_n.asm      196
-rw-r--r--  gmp/mpn/x86/k7/aors_n.asm         258
-rw-r--r--  gmp/mpn/x86/k7/aorsmul_1.asm      167
-rw-r--r--  gmp/mpn/x86/k7/bdiv_q_1.asm       244
-rw-r--r--  gmp/mpn/x86/k7/dive_1.asm         207
-rw-r--r--  gmp/mpn/x86/k7/gcd_1.asm          186
-rw-r--r--  gmp/mpn/x86/k7/gmp-mparam.h       241
-rw-r--r--  gmp/mpn/x86/k7/invert_limb.asm    193
-rw-r--r--  gmp/mpn/x86/k7/mmx/com.asm        125
-rw-r--r--  gmp/mpn/x86/k7/mmx/copyd.asm      144
-rw-r--r--  gmp/mpn/x86/k7/mmx/copyi.asm      157
-rw-r--r--  gmp/mpn/x86/k7/mmx/divrem_1.asm   832
-rw-r--r--  gmp/mpn/x86/k7/mmx/lshift.asm     481
-rw-r--r--  gmp/mpn/x86/k7/mmx/popham.asm     213
-rw-r--r--  gmp/mpn/x86/k7/mmx/rshift.asm     480
-rw-r--r--  gmp/mpn/x86/k7/mod_1_1.asm        221
-rw-r--r--  gmp/mpn/x86/k7/mod_1_4.asm        260
-rw-r--r--  gmp/mpn/x86/k7/mod_34lsub1.asm    188
-rw-r--r--  gmp/mpn/x86/k7/mode1o.asm         180
-rw-r--r--  gmp/mpn/x86/k7/mul_1.asm          237
-rw-r--r--  gmp/mpn/x86/k7/mul_basecase.asm   602
-rw-r--r--  gmp/mpn/x86/k7/sqr_basecase.asm   635
-rw-r--r--  gmp/mpn/x86/k7/sublsh1_n.asm      173
24 files changed, 6794 insertions, 0 deletions
diff --git a/gmp/mpn/x86/k7/README b/gmp/mpn/x86/k7/README
new file mode 100644
index 0000000000..5711b612c5
--- /dev/null
+++ b/gmp/mpn/x86/k7/README
@@ -0,0 +1,174 @@
+Copyright 2000, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+ AMD K7 MPN SUBROUTINES
+
+
+This directory contains code optimized for the AMD Athlon CPU.
+
+The mmx subdirectory has routines using MMX instructions. All Athlons have
+MMX; the separate directory is just so that configure can omit it if the
+assembler doesn't support MMX.
+
+
+
+STATUS
+
+Times for the loops, with all code and data in L1 cache.
+
+ cycles/limb
+ mpn_add/sub_n 1.6
+
+ mpn_copyi 0.75 or 1.0 \ varying with data alignment
+ mpn_copyd 0.75 or 1.0 /
+
+ mpn_divrem_1 17.0 integer part, 15.0 fractional part
+ mpn_mod_1 17.0
+ mpn_divexact_by3 8.0
+
+ mpn_l/rshift 1.2
+
+ mpn_mul_1 3.4
+ mpn_addmul/submul_1 3.9
+
+ mpn_mul_basecase 4.42 cycles/crossproduct (approx)
+ mpn_sqr_basecase 2.3 cycles/crossproduct (approx)
+ or 4.55 cycles/triangleproduct (approx)
+
+Prefetching of sources hasn't yet been tried.
+
+
+
+NOTES
+
+cmov, MMX, 3DNow and some extensions to MMX and 3DNow are available.
+
+Write-allocate L1 data cache means prefetching of destinations is unnecessary.
+
+Floating point multiplications can be done in parallel with integer
+multiplications, but there doesn't seem to be any way to make use of this.
+
+Unsigned "mul"s can be issued every 3 cycles. This suggests 3 cycles/limb
+is a limit on the speed of the multiplication routines. The documentation
+shows mul executing in IEU0 (or maybe in IEU0 and IEU1 together), so it
+might be that, to get near 3 cycles/limb, code has to be arranged so that
+nothing else is issued to IEU0. A busy IEU0 could explain why some code
+takes 4 cycles and other apparently equivalent code takes 5.
+
+
+
+OPTIMIZATIONS
+
+Unrolled loops are used to reduce looping overhead. The unrolling is
+configurable up to 32 limbs/loop for most routines and up to 64 for some.
+The K7 has a 64k L1 code cache, so quite big unrolling is allowable.
+
+Computed jumps into the unrolling are used to handle sizes not a multiple of
+the unrolling. An attractive feature of this is that times increase
+smoothly with operand size, but it may be that some routines should just
+have simple loops to finish up, especially when PIC adds between 2 and 16
+cycles to get %eip.
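
As a rough C sketch of the computed-jump idea (hypothetical code, using a
switch where the assembly uses an indirect jump and much deeper unrolling):
the loop body is replicated and execution enters part-way through the first
pass, so the leftover size modulo the unrolling factor is absorbed without a
separate fix-up loop.

  #include <stdint.h>

  /* Duff's-device style sketch, 4-way unrolled; assumes n > 0.  */
  static void
  sketch_copy_unrolled (uint32_t *dst, const uint32_t *src, long n)
  {
    long i = 0;
    switch (n % 4)             /* enter part-way through the first pass */
      {
        do {
        case 0: dst[i] = src[i]; i++;
        case 3: dst[i] = src[i]; i++;
        case 2: dst[i] = src[i]; i++;
        case 1: dst[i] = src[i]; i++;
        } while (i < n);
      }
  }
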
+
+Position independent code is implemented using a call to get %eip for the
+computed jumps and a ret is always done, rather than an addl $4,%esp or a
+popl, so the CPU return address branch prediction stack stays synchronised
+with the actual stack in memory.
+
+Branch prediction, in absence of any history, will guess forward jumps are
+not taken and backward jumps are taken. Where possible it's arranged that
+the less likely or less important case is under a taken forward jump.
+
+
+
+CODING
+
+Instructions in general code have been shown grouped if they can execute
+together, which means up to three direct-path instructions which have no
+successive dependencies. K7 always decodes three and has out-of-order
+execution, but the groupings show what slots might be available and what
+dependency chains exist.
+
+When there are vector-path instructions, an effort is made to get triplets
+of direct-path instructions in between them, even if there are
+dependencies, since this maximizes decoding throughput and might save a
+cycle or two if decoding is the limiting factor.
+
+
+
+INSTRUCTIONS
+
+adcl direct
+divl 39 cycles back-to-back
+lodsl,etc vector
+loop 1 cycle vector (decl/jnz opens up one decode slot)
+movd reg vector
+movd mem direct
+mull issue every 3 cycles, latency 4 cycles low word, 6 cycles high word
+popl vector (use movl for more than one pop)
+pushl direct, will pair with a load
+shrdl %cl vector, 3 cycles, seems to be 3 decode too
+xorl r,r false read dependency recognised
+
+
+
+REFERENCES
+
+"AMD Athlon Processor X86 Code Optimization Guide", AMD publication number
+22007, revision K, February 2002. Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/22007.pdf
+
+"3DNow Technology Manual", AMD publication number 21928G/0-March 2000.
+This describes the femms and prefetch instructions. Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/21928.pdf
+
+"AMD Extensions to the 3DNow and MMX Instruction Sets Manual", AMD
+publication number 22466, revision D, March 2000. This describes
+instructions added in the Athlon processor, such as pswapd and the extra
+prefetch forms. Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/22466.pdf
+
+"3DNow Instruction Porting Guide", AMD publication number 22621, revision B,
+August 1999. This has some notes on general Athlon optimizations as well as
+3DNow. Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/22621.pdf
+
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/gmp/mpn/x86/k7/addlsh1_n.asm b/gmp/mpn/x86/k7/addlsh1_n.asm
new file mode 100644
index 0000000000..a957b6f78e
--- /dev/null
+++ b/gmp/mpn/x86/k7/addlsh1_n.asm
@@ -0,0 +1,196 @@
+dnl AMD K7 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C This is an attempt at an addlsh1_n for x86-32, not relying on sse2 insns.
+C The innerloop is 2*3-way unrolled, which is the best we can do with the
+C available registers. It seems tricky to use the same structure for
+C rsblsh1_n, since we cannot feed carry between operations there.
+
+C cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan) 5.4 (worse than add_n + lshift)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 6
+C AMD K6 ?
+C AMD K7 2.5
+C AMD K8
+
+C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32
+C processors. It uses 2*3-way unrolling, for good reasons. Unfortunately,
+C that means we need an initial magic multiply.
+C
+C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern. We
+C cannot do rsblsh1_n since we feed carry from the shift blocks to the
+C add/subtract blocks, which is right for addition but reversed for
+C subtraction. We could perhaps do sublsh1_n, with some extra move insns,
+C without losing any time, since we're not issue limited but limited by the
+C carry recurrency latency.
+C
+C Breaking carry recurrency might be a good idea. We would then need separate
+C registers for the shift carry and add/subtract carry, which in turn would
+C force us to 2*2-way unrolling.
+
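
For reference, what the routine computes, as a plain C sketch (hypothetical
name, 32-bit limbs, n > 0). It keeps the shift carry and the add carry in
separate variables, the two chains discussed in the comments above; the
return value is the total carry out, 0 to 2.

  #include <stdint.h>

  static uint32_t
  ref_addlsh1_n (uint32_t *rp, const uint32_t *up, const uint32_t *vp, long n)
  {
    uint32_t shift_carry = 0, add_carry = 0;
    for (long i = 0; i < n; i++)
      {
        uint32_t s = (vp[i] << 1) | shift_carry;  /* doubled vp limb */
        shift_carry = vp[i] >> 31;                /* bit shifted out the top */
        uint32_t r = up[i] + s;
        uint32_t c1 = r < up[i];                  /* carry from the add */
        uint32_t r2 = r + add_carry;
        uint32_t c2 = r2 < r;                     /* carry from the carry-in */
        rp[i] = r2;
        add_carry = c1 + c2;                      /* never exceeds 1 */
      }
    return shift_carry + add_carry;
  }
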
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_DBLD, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(VAR_COUNT,`PARAM_DST')
+define(VAR_TMP,`PARAM_DBLD')
+
+ASM_START()
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_addlsh1_n)
+deflit(`FRAME',0)
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`vp', `%ebp')
+
+ mov $0x2aaaaaab, %eax
+
+ push %ebx FRAME_pushl()
+ mov PARAM_SIZE, %ebx C size
+
+ push rp FRAME_pushl()
+ mov PARAM_DST, rp
+
+ mul %ebx
+
+ push up FRAME_pushl()
+ mov PARAM_SRC, up
+
+	not	%edx			C count = -(size\6)-1
+ mov %edx, VAR_COUNT
+
+ push vp FRAME_pushl()
+ mov PARAM_DBLD, vp
+
+ lea 3(%edx,%edx,2), %ecx C count*3+3 = -(size\6)*3
+ xor %edx, %edx
+ lea (%ebx,%ecx,2), %ebx C size + (count*3+3)*2 = size % 6
+ or %ebx, %ebx
+ jz L(exact)
+
+L(oop):
+ifdef(`CPU_P6',`
+ shr %edx ') C restore 2nd saved carry bit
+ mov (vp), %eax
+ adc %eax, %eax
+ rcr %edx C restore 1st saved carry bit
+ lea 4(vp), vp
+ adc (up), %eax
+ lea 4(up), up
+ adc %edx, %edx C save a carry bit in edx
+ifdef(`CPU_P6',`
+ adc %edx, %edx ') C save another carry bit in edx
+ dec %ebx
+ mov %eax, (rp)
+ lea 4(rp), rp
+ jnz L(oop)
+ mov vp, VAR_TMP
+L(exact):
+ incl VAR_COUNT
+ jz L(end)
+
+ ALIGN(16)
+L(top):
+ifdef(`CPU_P6',`
+ shr %edx ') C restore 2nd saved carry bit
+ mov (vp), %eax
+ adc %eax, %eax
+ mov 4(vp), %ebx
+ adc %ebx, %ebx
+ mov 8(vp), %ecx
+ adc %ecx, %ecx
+
+ rcr %edx C restore 1st saved carry bit
+
+ adc (up), %eax
+ mov %eax, (rp)
+ adc 4(up), %ebx
+ mov %ebx, 4(rp)
+ adc 8(up), %ecx
+ mov %ecx, 8(rp)
+
+ mov 12(vp), %eax
+ adc %eax, %eax
+ mov 16(vp), %ebx
+ adc %ebx, %ebx
+ mov 20(vp), %ecx
+ adc %ecx, %ecx
+
+ lea 24(vp), vp
+ adc %edx, %edx C save a carry bit in edx
+
+ adc 12(up), %eax
+ mov %eax, 12(rp)
+ adc 16(up), %ebx
+ mov %ebx, 16(rp)
+ adc 20(up), %ecx
+
+ lea 24(up), up
+
+ifdef(`CPU_P6',`
+ adc %edx, %edx ') C save another carry bit in edx
+ mov %ecx, 20(rp)
+ incl VAR_COUNT
+ lea 24(rp), rp
+ jne L(top)
+
+L(end):
+ pop vp FRAME_popl()
+ pop up FRAME_popl()
+
+ifdef(`CPU_P6',`
+ xor %eax, %eax
+ shr $1, %edx
+ adc %edx, %eax
+',`
+ adc $0, %edx
+ mov %edx, %eax
+')
+ pop rp FRAME_popl()
+ pop %ebx FRAME_popl()
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/x86/k7/aors_n.asm b/gmp/mpn/x86/k7/aors_n.asm
new file mode 100644
index 0000000000..1a08072029
--- /dev/null
+++ b/gmp/mpn/x86/k7/aors_n.asm
@@ -0,0 +1,258 @@
+dnl AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
+
+dnl Copyright 1999-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 1.64 cycles/limb (at 16 limbs/loop).
+
+
+
+dnl K7: UNROLL_COUNT cycles/limb
+dnl 8 1.9
+dnl 16 1.64
+dnl 32 1.7
+dnl 64 2.0
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_add_n', `
+ define(M4_inst, adcl)
+ define(M4_function_n, mpn_add_n)
+ define(M4_function_nc, mpn_add_nc)
+ define(M4_description, add)
+',`ifdef(`OPERATION_sub_n', `
+ define(M4_inst, sbbl)
+ define(M4_function_n, mpn_sub_n)
+ define(M4_function_nc, mpn_sub_nc)
+ define(M4_description, subtract)
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+C
+C Calculate src1,size M4_description src2,size, and store the result in
+C dst,size. The return value is the carry bit from the top of the result (1
+C or 0).
+C
+C The _nc version accepts 1 or 0 for an initial carry into the low limb of
+C the calculation. Note values other than 1 or 0 here will lead to garbage
+C results.
+C
+C This code runs at 1.64 cycles/limb, which might be the best possible with
+C plain integer operations. Each limb is 2 loads and 1 store, any 2 of
+C which can be done each cycle, leading to 1.5 c/l.
+
+dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 8)
+',`
+deflit(UNROLL_THRESHOLD, 8)
+')
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBP, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EBX, -12)
+defframe(SAVE_EDI, -16)
+deflit(STACK_SPACE, 16)
+
+ TEXT
+ ALIGN(32)
+deflit(`FRAME',0)
+
+PROLOGUE(M4_function_nc)
+ movl PARAM_CARRY, %eax
+ jmp L(start)
+EPILOGUE()
+
+PROLOGUE(M4_function_n)
+
+ xorl %eax, %eax C carry
+L(start):
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %edi, SAVE_EDI
+ movl %ebx, SAVE_EBX
+ cmpl $UNROLL_THRESHOLD, %ecx
+
+ movl PARAM_SRC2, %edx
+ movl PARAM_SRC1, %ebx
+ jae L(unroll)
+
+ movl PARAM_DST, %edi
+ leal (%ebx,%ecx,4), %ebx
+ leal (%edx,%ecx,4), %edx
+
+ leal (%edi,%ecx,4), %edi
+ negl %ecx
+ shrl %eax
+
+	C This loop is in a single 16 byte code block already, so no
+ C alignment necessary.
+L(simple):
+ C eax scratch
+ C ebx src1
+ C ecx counter
+ C edx src2
+ C esi
+ C edi dst
+ C ebp
+
+ movl (%ebx,%ecx,4), %eax
+ M4_inst (%edx,%ecx,4), %eax
+ movl %eax, (%edi,%ecx,4)
+ incl %ecx
+ jnz L(simple)
+
+ movl $0, %eax
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBX, %ebx
+ setc %al
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ C This is at 0x55, close enough to aligned.
+L(unroll):
+deflit(`FRAME',STACK_SPACE)
+ movl %ebp, SAVE_EBP
+ andl $-2, %ecx C size low bit masked out
+ andl $1, PARAM_SIZE C size low bit kept
+
+ movl %ecx, %edi
+ decl %ecx
+ movl PARAM_DST, %ebp
+
+ shrl $UNROLL_LOG2, %ecx
+ negl %edi
+ movl %esi, SAVE_ESI
+
+ andl $UNROLL_MASK, %edi
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(entry) (%edi,%edi,8), %esi C 9 bytes per
+')
+ negl %edi
+ shrl %eax
+
+ leal ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
+ leal ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
+ leal ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
+
+ jmp *%esi
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See mpn/x86/README about old gas bugs
+ leal (%edi,%edi,8), %esi
+ addl $L(entry)-L(here), %esi
+ addl (%esp), %esi
+ ret_internal
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(32)
+L(top):
+ C eax zero
+ C ebx src1
+ C ecx counter
+ C edx src2
+ C esi scratch (was computed jump)
+ C edi dst
+ C ebp scratch
+
+ leal UNROLL_BYTES(%edx), %edx
+
+L(entry):
+deflit(CHUNK_COUNT, 2)
+forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 4))
+
+Zdisp( movl, disp0,(%ebx), %esi)
+ movl disp1(%ebx), %ebp
+Zdisp( M4_inst,disp0,(%edx), %esi)
+Zdisp( movl, %esi, disp0,(%edi))
+ M4_inst disp1(%edx), %ebp
+ movl %ebp, disp1(%edi)
+')
+
+ decl %ecx
+ leal UNROLL_BYTES(%ebx), %ebx
+ leal UNROLL_BYTES(%edi), %edi
+ jns L(top)
+
+
+ mov PARAM_SIZE, %esi
+ movl SAVE_EBP, %ebp
+ movl $0, %eax
+
+ decl %esi
+ js L(even)
+
+ movl (%ebx), %ecx
+ M4_inst UNROLL_BYTES(%edx), %ecx
+ movl %ecx, (%edi)
+L(even):
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBX, %ebx
+ setc %al
+
+ movl SAVE_ESI, %esi
+ addl $STACK_SPACE, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/gmp/mpn/x86/k7/aorsmul_1.asm b/gmp/mpn/x86/k7/aorsmul_1.asm
new file mode 100644
index 0000000000..eec8df6de2
--- /dev/null
+++ b/gmp/mpn/x86/k7/aorsmul_1.asm
@@ -0,0 +1,167 @@
+dnl AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
+
+dnl Copyright 1999-2002, 2005, 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias) 6.5
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C AMD K6
+C AMD K7 3.75
+C AMD K8
+
+C TODO
+C * Improve feed-in and wind-down code. We beat the old code for all n != 1,
+C but lose by 2x for n == 1.
+
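
For reference, the addmul_1 case in plain C (hypothetical name, 32-bit
limbs): {rp,n} gets {up,n} * v added to it, and the limb that falls out of
the top is returned. The submul_1 case subtracts the product instead.

  #include <stdint.h>

  static uint32_t
  ref_addmul_1 (uint32_t *rp, const uint32_t *up, long n, uint32_t v)
  {
    uint32_t carry = 0;
    for (long i = 0; i < n; i++)
      {
        uint64_t t = (uint64_t) up[i] * v + rp[i] + carry;
        rp[i] = (uint32_t) t;               /* low half */
        carry = (uint32_t) (t >> 32);       /* high half carried along */
      }
    return carry;
  }
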
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `add')
+ define(`func', `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `sub')
+ define(`func', `mpn_submul_1')
+')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ add $-16, %esp
+ mov %ebp, (%esp)
+ mov %ebx, 4(%esp)
+ mov %esi, 8(%esp)
+ mov %edi, 12(%esp)
+
+ mov 20(%esp), %edi
+ mov 24(%esp), %esi
+ mov 28(%esp), %eax
+ mov 32(%esp), %ecx
+ mov %eax, %ebx
+ shr $2, %eax
+ mov %eax, 28(%esp)
+ mov (%esi), %eax
+ and $3, %ebx
+ jz L(b0)
+ cmp $2, %ebx
+ jz L(b2)
+ jg L(b3)
+
+L(b1): lea -4(%esi), %esi
+ lea -4(%edi), %edi
+ mul %ecx
+ mov %eax, %ebx
+ mov %edx, %ebp
+ cmpl $0, 28(%esp)
+ jz L(cj1)
+ mov 8(%esi), %eax
+ jmp L(1)
+
+L(b2): mul %ecx
+ mov %eax, %ebp
+ mov 4(%esi), %eax
+ mov %edx, %ebx
+ cmpl $0, 28(%esp)
+ jne L(2)
+ jmp L(cj2)
+
+L(b3): lea -12(%esi), %esi
+ lea -12(%edi), %edi
+ mul %ecx
+ mov %eax, %ebx
+ mov %edx, %ebp
+ mov 16(%esi), %eax
+ incl 28(%esp)
+ jmp L(3)
+
+L(b0): lea -8(%esi), %esi
+ lea -8(%edi), %edi
+ mul %ecx
+ mov %eax, %ebp
+ mov 12(%esi), %eax
+ mov %edx, %ebx
+ jmp L(0)
+
+ ALIGN(16)
+L(top): lea 16(%edi), %edi
+L(2): mul %ecx
+ ADDSUB %ebp, 0(%edi)
+ mov $0, %ebp
+ adc %eax, %ebx
+ mov 8(%esi), %eax
+ adc %edx, %ebp
+L(1): mul %ecx
+ ADDSUB %ebx, 4(%edi)
+ mov $0, %ebx
+ adc %eax, %ebp
+ mov 12(%esi), %eax
+ adc %edx, %ebx
+L(0): mul %ecx
+ ADDSUB %ebp, 8(%edi)
+ mov $0, %ebp
+ adc %eax, %ebx
+ adc %edx, %ebp
+ mov 16(%esi), %eax
+L(3): mul %ecx
+ ADDSUB %ebx, 12(%edi)
+ adc %eax, %ebp
+ mov 20(%esi), %eax
+ lea 16(%esi), %esi
+ mov $0, %ebx
+ adc %edx, %ebx
+ decl 28(%esp)
+ jnz L(top)
+
+L(end): lea 16(%edi), %edi
+L(cj2): mul %ecx
+ ADDSUB %ebp, (%edi)
+ adc %eax, %ebx
+ adc $0, %edx
+L(cj1): ADDSUB %ebx, 4(%edi)
+ adc $0, %edx
+ mov %edx, %eax
+ mov (%esp), %ebp
+ mov 4(%esp), %ebx
+ mov 8(%esp), %esi
+ mov 12(%esp), %edi
+ add $16, %esp
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/x86/k7/bdiv_q_1.asm b/gmp/mpn/x86/k7/bdiv_q_1.asm
new file mode 100644
index 0000000000..df3477f539
--- /dev/null
+++ b/gmp/mpn/x86/k7/bdiv_q_1.asm
@@ -0,0 +1,244 @@
+dnl AMD K7 mpn_bdiv_q_1 -- mpn by limb exact division.
+
+dnl Rearranged from mpn/x86/k7/dive_1.asm by Marco Bodrato.
+
+dnl Copyright 2001, 2002, 2004, 2007, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C Athlon: 11.0
+C Hammer: 9.0
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+C The dependent chain is mul+imul+sub for 11 cycles and that speed is
+C achieved with no special effort. The load and shrdl latencies are hidden
+C by out of order execution.
+C
+C It's a touch faster on size==1 to use the mul-by-inverse than divl.
+
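
A plain C sketch of the mul-by-inverse exact division for an odd divisor
(hypothetical name, 32-bit limbs, shift == 0 case only; `dinv' is the
inverse of d modulo 2^32). Each quotient limb costs one imul, and the next
iteration subtracts the high half of q*d, which is the mul+imul+sub
dependent chain mentioned above. The real code also folds in the right
shift that strips trailing zero bits from even divisors.

  #include <stdint.h>

  static void
  ref_divexact_odd_1 (uint32_t *qp, const uint32_t *up, long n,
                      uint32_t d, uint32_t dinv)
  {
    uint32_t carry_limb = 0, carry_bit = 0;
    for (long i = 0; i < n; i++)
      {
        uint32_t s = up[i] - carry_bit;
        carry_bit = s > up[i];               /* borrow from the carry bit */
        uint32_t t = s - carry_limb;
        carry_bit += t > s;                  /* borrow from the carry limb */
        uint32_t q = t * dinv;               /* quotient limb, mod 2^32 */
        qp[i] = q;
        carry_limb = (uint32_t) (((uint64_t) q * d) >> 32);
      }
  }
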
+defframe(PARAM_SHIFT, 24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+defframe(VAR_INVERSE, -20)
+defframe(VAR_DST_END, -24)
+
+deflit(STACK_SPACE, 24)
+
+ TEXT
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t inverse, int shift)
+ ALIGN(16)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+ subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
+ movl PARAM_SHIFT, %ecx C shift count
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_SIZE, %ebp
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl %ebx, SAVE_EBX
+
+ leal (%esi,%ebp,4), %esi C src end
+ leal (%edi,%ebp,4), %edi C dst end
+ negl %ebp C -size
+
+ movl PARAM_INVERSE, %eax C inv
+
+L(common):
+ movl %eax, VAR_INVERSE
+ movl (%esi,%ebp,4), %eax C src[0]
+
+ incl %ebp
+ jz L(one)
+
+ movl (%esi,%ebp,4), %edx C src[1]
+
+ shrdl( %cl, %edx, %eax)
+
+ movl %edi, VAR_DST_END
+ xorl %ebx, %ebx
+ jmp L(entry)
+
+ ALIGN(8)
+L(top):
+ C eax q
+ C ebx carry bit, 0 or 1
+ C ecx shift
+ C edx
+ C esi src end
+ C edi dst end
+ C ebp counter, limbs, negative
+
+ mull PARAM_DIVISOR C carry limb in edx
+
+ movl -4(%esi,%ebp,4), %eax
+ movl (%esi,%ebp,4), %edi
+
+ shrdl( %cl, %edi, %eax)
+
+ subl %ebx, %eax C apply carry bit
+ setc %bl
+ movl VAR_DST_END, %edi
+
+ subl %edx, %eax C apply carry limb
+ adcl $0, %ebx
+
+L(entry):
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi,%ebp,4)
+ incl %ebp
+ jnz L(top)
+
+
+ mull PARAM_DIVISOR C carry limb in edx
+
+ movl -4(%esi), %eax C src high limb
+ shrl %cl, %eax
+ movl SAVE_ESI, %esi
+
+ subl %ebx, %eax C apply carry bit
+ movl SAVE_EBX, %ebx
+ movl SAVE_EBP, %ebp
+
+ subl %edx, %eax C apply carry limb
+
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi)
+ movl SAVE_EDI, %edi
+ addl $STACK_SPACE, %esp
+
+ ret
+
+L(one):
+ shrl %cl, %eax
+ movl SAVE_ESI, %esi
+ movl SAVE_EBX, %ebx
+
+ imull VAR_INVERSE, %eax
+
+ movl SAVE_EBP, %ebp
+
+ movl %eax, -4(%edi)
+ movl SAVE_EDI, %edi
+ addl $STACK_SPACE, %esp
+
+ ret
+EPILOGUE()
+
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+
+ ALIGN(16)
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+ movl PARAM_DIVISOR, %eax
+ subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
+ movl $-1, %ecx C shift count
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_SIZE, %ebp
+
+ movl %esi, SAVE_ESI
+ movl %edi, SAVE_EDI
+
+ C If there's usually only one or two trailing zero bits then this
+ C should be faster than bsfl.
+L(strip_twos):
+ incl %ecx
+ shrl %eax
+ jnc L(strip_twos)
+
+ movl %ebx, SAVE_EBX
+ leal 1(%eax,%eax), %ebx C d without twos
+ andl $127, %eax C d/2, 7 bits
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %edx)
+ movzbl (%eax,%edx), %eax C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %eax C inv 8 bits
+')
+
+ leal (%eax,%eax), %edx C 2*inv
+ movl %ebx, PARAM_DIVISOR C d without twos
+
+ imull %eax, %eax C inv*inv
+
+ movl PARAM_SRC, %esi
+ movl PARAM_DST, %edi
+
+ imull %ebx, %eax C inv*inv*d
+
+ subl %eax, %edx C inv = 2*inv - inv*inv*d
+ leal (%edx,%edx), %eax C 2*inv
+
+ imull %edx, %edx C inv*inv
+
+ leal (%esi,%ebp,4), %esi C src end
+ leal (%edi,%ebp,4), %edi C dst end
+ negl %ebp C -size
+
+ imull %ebx, %edx C inv*inv*d
+
+ subl %edx, %eax C inv = 2*inv - inv*inv*d
+
+ ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ pushl %eax FRAME_pushl()
+ imull PARAM_DIVISOR, %eax
+ cmpl $1, %eax
+ popl %eax FRAME_popl()')
+
+ jmp L(common)
+EPILOGUE()
diff --git a/gmp/mpn/x86/k7/dive_1.asm b/gmp/mpn/x86/k7/dive_1.asm
new file mode 100644
index 0000000000..8eb4f45ac0
--- /dev/null
+++ b/gmp/mpn/x86/k7/dive_1.asm
@@ -0,0 +1,207 @@
+dnl AMD K7 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C Athlon: 11.0
+C Hammer: 9.0
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+C The dependent chain is mul+imul+sub for 11 cycles and that speed is
+C achieved with no special effort. The load and shrdl latencies are hidden
+C by out of order execution.
+C
+C It's a touch faster on size==1 to use the mul-by-inverse than divl.
+
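
The inverse setup performed by the annotated code below, as a self-contained
C sketch. The real code seeds 8 correct bits from binvert_limb_table and
then needs only the two Newton steps commented as "inv = 2*inv - inv*inv*d";
this sketch seeds 5 bits with the (3*d) xor 2 identity instead, so it takes
three steps. Each step doubles the number of correct low bits.

  #include <stdint.h>

  static uint32_t
  ref_binvert_limb (uint32_t d)       /* d must be odd */
  {
    uint32_t inv = (3 * d) ^ 2;       /*  5 bits: d*inv == 1 mod 2^5 */
    inv = 2 * inv - inv * inv * d;    /* 10 bits */
    inv = 2 * inv - inv * inv * d;    /* 20 bits */
    inv = 2 * inv - inv * inv * d;    /* 32 bits: d*inv == 1 mod 2^32 */
    return inv;
  }
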
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+defframe(VAR_INVERSE, -20)
+defframe(VAR_DST_END, -24)
+
+deflit(STACK_SPACE, 24)
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+ movl PARAM_DIVISOR, %eax
+ subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
+ movl $-1, %ecx C shift count
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_SIZE, %ebp
+
+ movl %esi, SAVE_ESI
+ movl %edi, SAVE_EDI
+
+ C If there's usually only one or two trailing zero bits then this
+ C should be faster than bsfl.
+L(strip_twos):
+ incl %ecx
+ shrl %eax
+ jnc L(strip_twos)
+
+ movl %ebx, SAVE_EBX
+ leal 1(%eax,%eax), %ebx C d without twos
+ andl $127, %eax C d/2, 7 bits
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %edx)
+ movzbl (%eax,%edx), %eax C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %eax C inv 8 bits
+')
+
+ leal (%eax,%eax), %edx C 2*inv
+ movl %ebx, PARAM_DIVISOR C d without twos
+
+ imull %eax, %eax C inv*inv
+
+ movl PARAM_SRC, %esi
+ movl PARAM_DST, %edi
+
+ imull %ebx, %eax C inv*inv*d
+
+ subl %eax, %edx C inv = 2*inv - inv*inv*d
+ leal (%edx,%edx), %eax C 2*inv
+
+ imull %edx, %edx C inv*inv
+
+ leal (%esi,%ebp,4), %esi C src end
+ leal (%edi,%ebp,4), %edi C dst end
+ negl %ebp C -size
+
+ imull %ebx, %edx C inv*inv*d
+
+ subl %edx, %eax C inv = 2*inv - inv*inv*d
+
+ ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ pushl %eax FRAME_pushl()
+ imull PARAM_DIVISOR, %eax
+ cmpl $1, %eax
+ popl %eax FRAME_popl()')
+
+ movl %eax, VAR_INVERSE
+ movl (%esi,%ebp,4), %eax C src[0]
+
+ incl %ebp
+ jz L(one)
+
+ movl (%esi,%ebp,4), %edx C src[1]
+
+ shrdl( %cl, %edx, %eax)
+
+ movl %edi, VAR_DST_END
+ xorl %ebx, %ebx
+ jmp L(entry)
+
+ ALIGN(8)
+L(top):
+ C eax q
+ C ebx carry bit, 0 or 1
+ C ecx shift
+ C edx
+ C esi src end
+ C edi dst end
+ C ebp counter, limbs, negative
+
+ mull PARAM_DIVISOR C carry limb in edx
+
+ movl -4(%esi,%ebp,4), %eax
+ movl (%esi,%ebp,4), %edi
+
+ shrdl( %cl, %edi, %eax)
+
+ subl %ebx, %eax C apply carry bit
+ setc %bl
+ movl VAR_DST_END, %edi
+
+ subl %edx, %eax C apply carry limb
+ adcl $0, %ebx
+
+L(entry):
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi,%ebp,4)
+ incl %ebp
+ jnz L(top)
+
+
+ mull PARAM_DIVISOR C carry limb in edx
+
+ movl -4(%esi), %eax C src high limb
+ shrl %cl, %eax
+ movl SAVE_ESI, %esi
+
+ subl %ebx, %eax C apply carry bit
+ movl SAVE_EBX, %ebx
+ movl SAVE_EBP, %ebp
+
+ subl %edx, %eax C apply carry limb
+
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi)
+ movl SAVE_EDI, %edi
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+L(one):
+ shrl %cl, %eax
+ movl SAVE_ESI, %esi
+ movl SAVE_EBX, %ebx
+
+ imull VAR_INVERSE, %eax
+
+ movl SAVE_EBP, %ebp
+ movl %eax, -4(%edi)
+
+ movl SAVE_EDI, %edi
+ addl $STACK_SPACE, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/gmp/mpn/x86/k7/gcd_1.asm b/gmp/mpn/x86/k7/gcd_1.asm
new file mode 100644
index 0000000000..c7d12c83c0
--- /dev/null
+++ b/gmp/mpn/x86/k7/gcd_1.asm
@@ -0,0 +1,186 @@
+dnl x86 mpn_gcd_1 optimised for AMD K7.
+
+dnl Contributed to the GNU project by Kevin Ryde. Rehacked by Torbjorn
+dnl Granlund.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/bit (approx)
+C AMD K7 5.31
+C AMD K8,K9 5.33
+C AMD K10 5.30
+C AMD bd1 ?
+C AMD bobcat 7.02
+C Intel P4-2 10.1
+C Intel P4-3/4 10.0
+C Intel P6/13 5.88
+C Intel core2 6.26
+C Intel NHM 6.83
+C Intel SBR 8.50
+C Intel atom 8.90
+C VIA nano ?
+C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
+
+C TODO
+C * Tune overhead, this takes 2-3 cycles more than old code when v0 is tiny.
+C * Stream things better through registers, avoiding some copying.
+
+C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
+
+deflit(MAXSHIFT, 6)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+DEF_OBJECT(ctz_table,64)
+ .byte MAXSHIFT
+forloop(i,1,MASK,
+` .byte m4_count_trailing_zeros(i)
+')
+END_OBJECT(ctz_table)
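
A C sketch of the reduction that the table drives (hypothetical names; u and
v odd and nonzero). The low MAXSHIFT bits of the difference index the
trailing-zero table, so several factors of two are stripped per iteration
without a bsf instruction; an index of zero means shift by MAXSHIFT and look
again, which is what L(shift_alot) below does.

  #include <stdint.h>

  #define SK_MAXSHIFT 6
  #define SK_MASK     ((1 << SK_MAXSHIFT) - 1)

  static unsigned char sk_ctz_tab[SK_MASK + 1];

  static void
  sk_ctz_init (void)
  {
    sk_ctz_tab[0] = SK_MAXSHIFT;             /* all low bits zero */
    for (int i = 1; i <= SK_MASK; i++)
      {
        int c = 0;
        while (((i >> c) & 1) == 0)
          c++;
        sk_ctz_tab[i] = c;
      }
  }

  static uint32_t
  sk_gcd_odd (uint32_t u, uint32_t v)
  {
    while (u != v)
      {
        uint32_t d = u > v ? u - v : v - u;  /* even and nonzero */
        if (u > v)
          u = v;                             /* keep the smaller operand */
        do
          d >>= sk_ctz_tab[d & SK_MASK];
        while ((d & 1) == 0);
        v = d;
      }
    return u;
  }
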
+
+C Threshold of when to call bmod when U is one limb. Should be about
+C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
+define(`DIV_THRES_LOG2', 7)
+
+
+define(`up', `%edi')
+define(`n', `%esi')
+define(`v0', `%edx')
+
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_gcd_1)
+ push %edi
+ push %esi
+
+ mov 12(%esp), up
+ mov 16(%esp), n
+ mov 20(%esp), v0
+
+ mov (up), %eax C U low limb
+ or v0, %eax C x | y
+ mov $-1, %ecx
+
+L(twos):
+ inc %ecx
+ shr %eax
+ jnc L(twos)
+
+ shr %cl, v0
+ mov %ecx, %eax C common twos
+
+L(divide_strip_y):
+ shr v0
+ jnc L(divide_strip_y)
+ adc v0, v0
+
+ push %eax
+ push v0
+
+ cmp $1, n
+ jnz L(reduce_nby1)
+
+C Both U and V are single limbs, reduce with bmod if u0 >> v0.
+ mov (up), %ecx
+ mov %ecx, %eax
+ shr $DIV_THRES_LOG2, %ecx
+ cmp %ecx, v0
+ ja L(reduced)
+
+ mov v0, %esi
+ xor %edx, %edx
+ div %esi
+ mov %edx, %eax
+ jmp L(reduced)
+
+L(reduce_nby1):
+ifdef(`PIC_WITH_EBX',`
+ push %ebx
+ call L(movl_eip_to_ebx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+')
+ push v0 C param 3
+ push n C param 2
+ push up C param 1
+ cmp $BMOD_1_TO_MOD_1_THRESHOLD, n
+ jl L(bmod)
+ CALL( mpn_mod_1)
+ jmp L(called)
+L(bmod):
+ CALL( mpn_modexact_1_odd)
+
+L(called):
+ add $12, %esp C deallocate params
+ifdef(`PIC_WITH_EBX',`
+ pop %ebx
+')
+L(reduced):
+ pop %edx
+
+ LEA( ctz_table, %esi)
+ test %eax, %eax
+ mov %eax, %ecx
+ jnz L(mid)
+ jmp L(end)
+
+ ALIGN(16) C K8 BC P4 NHM SBR
+L(top): cmovc( %ecx, %eax) C if x-y < 0 0
+ cmovc( %edi, %edx) C use x,y-x 0
+L(mid): and $MASK, %ecx C 0
+ movzbl (%esi,%ecx), %ecx C 1
+ jz L(shift_alot) C 1
+ shr %cl, %eax C 3
+ mov %eax, %edi C 4
+ mov %edx, %ecx C 3
+ sub %eax, %ecx C 4
+ sub %edx, %eax C 4
+ jnz L(top) C 5
+
+L(end): pop %ecx
+ mov %edx, %eax
+ shl %cl, %eax
+ pop %esi
+ pop %edi
+ ret
+
+L(shift_alot):
+ shr $MAXSHIFT, %eax
+ mov %eax, %ecx
+ jmp L(mid)
+
+ifdef(`PIC_WITH_EBX',`
+L(movl_eip_to_ebx):
+ mov (%esp), %ebx
+ ret
+')
+EPILOGUE()
diff --git a/gmp/mpn/x86/k7/gmp-mparam.h b/gmp/mpn/x86/k7/gmp-mparam.h
new file mode 100644
index 0000000000..9977a113e2
--- /dev/null
+++ b/gmp/mpn/x86/k7/gmp-mparam.h
@@ -0,0 +1,241 @@
+/* AMD K7 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2005, 2008-2010, 2014 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 2083 MHz K7 Barton */
+/* FFT tuning limit = 25000000 */
+/* Generated by tuneup.c, 2014-03-13, gcc 4.2 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 3
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 24
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1
+#define DIV_QR_1_NORM_THRESHOLD 3
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 24
+
+#define MUL_TOOM22_THRESHOLD 28
+#define MUL_TOOM33_THRESHOLD 85
+#define MUL_TOOM44_THRESHOLD 147
+#define MUL_TOOM6H_THRESHOLD 216
+#define MUL_TOOM8H_THRESHOLD 309
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 85
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 99
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 98
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 102
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 124
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 50
+#define SQR_TOOM3_THRESHOLD 81
+#define SQR_TOOM4_THRESHOLD 216
+#define SQR_TOOM6_THRESHOLD 306
+#define SQR_TOOM8_THRESHOLD 446
+
+#define MULMID_TOOM42_THRESHOLD 56
+
+#define MULMOD_BNM1_THRESHOLD 17
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define MUL_FFT_MODF_THRESHOLD 904 /* k = 6 */
+#define MUL_FFT_TABLE3 \
+ { { 904, 6}, { 21, 7}, { 11, 6}, { 25, 7}, \
+ { 13, 6}, { 27, 7}, { 15, 6}, { 31, 7}, \
+ { 17, 6}, { 35, 7}, { 19, 6}, { 39, 7}, \
+ { 23, 6}, { 47, 7}, { 27, 8}, { 15, 7}, \
+ { 31, 6}, { 63, 7}, { 35, 8}, { 19, 7}, \
+ { 39, 8}, { 23, 7}, { 47, 8}, { 31, 7}, \
+ { 63, 8}, { 39, 7}, { 79, 9}, { 23, 8}, \
+ { 47, 7}, { 95, 8}, { 51, 9}, { 31, 8}, \
+ { 71, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 95, 9}, { 55,10}, { 31, 9}, { 63, 8}, \
+ { 127, 9}, { 71, 8}, { 143, 9}, { 79, 8}, \
+ { 159,10}, { 47, 9}, { 95, 8}, { 191, 9}, \
+ { 103,11}, { 31,10}, { 63, 9}, { 127, 8}, \
+ { 255, 9}, { 143,10}, { 79, 9}, { 167,10}, \
+ { 95, 9}, { 199,10}, { 111,11}, { 63,10}, \
+ { 127, 9}, { 255,10}, { 143, 9}, { 287,10}, \
+ { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \
+ { 383,10}, { 207,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271, 8}, { 1087,10}, \
+ { 287,11}, { 159,10}, { 319, 9}, { 639,11}, \
+ { 191,10}, { 383, 9}, { 767, 8}, { 1535, 9}, \
+ { 799, 8}, { 1599,11}, { 223,12}, { 127,11}, \
+ { 255,10}, { 511, 9}, { 1023,10}, { 543, 9}, \
+ { 1087,11}, { 287,10}, { 575, 9}, { 1151,10}, \
+ { 607, 9}, { 1215, 8}, { 2431,11}, { 319,10}, \
+ { 639, 9}, { 1279,10}, { 671, 9}, { 1343,12}, \
+ { 191,11}, { 383,10}, { 767, 9}, { 1535,10}, \
+ { 799, 9}, { 1599,10}, { 831, 9}, { 1663,10}, \
+ { 863,13}, { 127,12}, { 255,11}, { 511,10}, \
+ { 1023,11}, { 543,10}, { 1087,11}, { 575,10}, \
+ { 1151,11}, { 607,10}, { 1215, 9}, { 2431,12}, \
+ { 319,11}, { 639,10}, { 1407,11}, { 735,10}, \
+ { 1471, 9}, { 2943,12}, { 383,11}, { 767,10}, \
+ { 1535,11}, { 799,10}, { 1599,11}, { 831,10}, \
+ { 1663,11}, { 895,10}, { 1791,11}, { 959,10}, \
+ { 1919,13}, { 255,12}, { 511,11}, { 1023,10}, \
+ { 2047,11}, { 1087,12}, { 575,11}, { 1151,10}, \
+ { 2303,11}, { 1215,10}, { 2431,12}, { 639,11}, \
+ { 1279,10}, { 2559,11}, { 1407,10}, { 2815,11}, \
+ { 1471,10}, { 2943,13}, { 383,12}, { 767,11}, \
+ { 1599,12}, { 831,11}, { 1663,12}, { 895,11}, \
+ { 1791,10}, { 3583,12}, { 959,11}, { 1919,10}, \
+ { 3839,14}, { 255,13}, { 511,12}, { 1023,11}, \
+ { 2047,12}, { 1087,11}, { 2175,12}, { 1151,11}, \
+ { 2303,12}, { 1215,11}, { 2431,13}, { 639,12}, \
+ { 1407,11}, { 2815,12}, { 1471,11}, { 2943,13}, \
+ { 767,12}, { 1663,11}, { 3327,13}, { 895,12}, \
+ { 1791,11}, { 3583,12}, { 1919,11}, { 3839,12}, \
+ { 1983,11}, { 3967,14}, { 511,13}, { 1023,12}, \
+ { 2239,13}, { 1151,12}, { 2495,13}, { 1279,12}, \
+ { 2559,13}, { 1407,12}, { 2943,11}, { 5887,14}, \
+ { 767,13}, { 1535,12}, { 3071,13}, { 1663,12}, \
+ { 3327,13}, { 1791,12}, { 3583,13}, { 1919,12}, \
+ { 3967,15}, { 511,14}, { 1023,13}, { 2047,12}, \
+ { 4095,13}, { 2175,12}, { 4351,13}, { 2431,12}, \
+ { 4863,14}, { 1279,13}, { 2559,12}, { 5119,13}, \
+ { 2943,12}, { 5887,14}, { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 228
+#define MUL_FFT_THRESHOLD 7808
+
+#define SQR_FFT_MODF_THRESHOLD 888 /* k = 6 */
+#define SQR_FFT_TABLE3 \
+ { { 888, 6}, { 21, 7}, { 11, 6}, { 25, 7}, \
+ { 13, 6}, { 27, 7}, { 15, 6}, { 31, 7}, \
+ { 17, 6}, { 35, 7}, { 19, 6}, { 39, 7}, \
+ { 23, 6}, { 47, 7}, { 27, 8}, { 15, 7}, \
+ { 31, 6}, { 63, 7}, { 35, 8}, { 19, 7}, \
+ { 39, 8}, { 23, 7}, { 47, 8}, { 31, 7}, \
+ { 63, 8}, { 39, 9}, { 23, 8}, { 47, 7}, \
+ { 95, 8}, { 51, 9}, { 31, 8}, { 67, 9}, \
+ { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \
+ { 55,10}, { 31, 9}, { 63, 8}, { 127, 9}, \
+ { 79,10}, { 47, 9}, { 95, 8}, { 191,11}, \
+ { 31,10}, { 63, 9}, { 127, 8}, { 255, 9}, \
+ { 143,10}, { 79, 9}, { 167,10}, { 95, 9}, \
+ { 191,10}, { 111,11}, { 63,10}, { 127, 9}, \
+ { 255, 8}, { 511,10}, { 143, 9}, { 287, 8}, \
+ { 575,10}, { 159,11}, { 95,10}, { 191, 9}, \
+ { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543, 8}, { 1087,10}, \
+ { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \
+ { 639, 8}, { 1279, 9}, { 671,11}, { 191,10}, \
+ { 383, 9}, { 799, 8}, { 1599, 9}, { 831,11}, \
+ { 223,12}, { 127,11}, { 255,10}, { 543, 9}, \
+ { 1087,11}, { 287,10}, { 575, 9}, { 1215, 8}, \
+ { 2431,11}, { 319,10}, { 639, 9}, { 1279,10}, \
+ { 671, 9}, { 1407,12}, { 191,10}, { 799, 9}, \
+ { 1599,10}, { 831, 9}, { 1663,10}, { 863, 9}, \
+ { 1727,11}, { 447,13}, { 127,12}, { 255,11}, \
+ { 511,10}, { 1023,11}, { 543,10}, { 1087, 9}, \
+ { 2175,10}, { 1119,11}, { 575,10}, { 1151,11}, \
+ { 607,10}, { 1215, 9}, { 2431,12}, { 319,11}, \
+ { 639,10}, { 1279,11}, { 671,10}, { 1343, 9}, \
+ { 2687,11}, { 703,10}, { 1407,11}, { 735,10}, \
+ { 1471, 9}, { 2943,10}, { 1503,12}, { 383,11}, \
+ { 767,10}, { 1535,11}, { 799,10}, { 1599,11}, \
+ { 863,10}, { 1727,12}, { 447,11}, { 895,10}, \
+ { 1791,11}, { 959,10}, { 1919,13}, { 255,12}, \
+ { 511,11}, { 1023,10}, { 2047,11}, { 1087,10}, \
+ { 2175,11}, { 1119,12}, { 575,11}, { 1151,10}, \
+ { 2303,11}, { 1215,10}, { 2431,12}, { 639,11}, \
+ { 1407,10}, { 2815,11}, { 1471,10}, { 2943,12}, \
+ { 767,11}, { 1599,12}, { 831,11}, { 1663,10}, \
+ { 3327,12}, { 895,11}, { 1791,10}, { 3583,12}, \
+ { 959,11}, { 1919,10}, { 3839,11}, { 1983,14}, \
+ { 255,13}, { 511,12}, { 1023,11}, { 2047,12}, \
+ { 1087,11}, { 2175,12}, { 1151,11}, { 2303,12}, \
+ { 1215,11}, { 2431,13}, { 639,12}, { 1407,11}, \
+ { 2815,12}, { 1471,11}, { 2943,13}, { 767,12}, \
+ { 1663,11}, { 3327,12}, { 1727,13}, { 895,12}, \
+ { 1791,11}, { 3583,12}, { 1919,11}, { 3839,12}, \
+ { 1983,11}, { 3967,14}, { 511,13}, { 1023,12}, \
+ { 2175,13}, { 1151,12}, { 2495,13}, { 1279,12}, \
+ { 2559,13}, { 1407,12}, { 2943,11}, { 5887,14}, \
+ { 767,13}, { 1535,12}, { 3071,13}, { 1663,12}, \
+ { 3327,13}, { 1791,12}, { 3583,13}, { 1919,12}, \
+ { 3967,15}, { 511,14}, { 1023,13}, { 2047,12}, \
+ { 4095,13}, { 2175,12}, { 4351,13}, { 2431,14}, \
+ { 1279,13}, { 2943,12}, { 5887,14}, { 16384,15}, \
+ { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 229
+#define SQR_FFT_THRESHOLD 7552
+
+#define MULLO_BASECASE_THRESHOLD 8
+#define MULLO_DC_THRESHOLD 36
+#define MULLO_MUL_N_THRESHOLD 13463
+
+#define DC_DIV_QR_THRESHOLD 45
+#define DC_DIVAPPR_Q_THRESHOLD 208
+#define DC_BDIV_QR_THRESHOLD 43
+#define DC_BDIV_Q_THRESHOLD 140
+
+#define INV_MULMOD_BNM1_THRESHOLD 62
+#define INV_NEWTON_THRESHOLD 204
+#define INV_APPR_THRESHOLD 204
+
+#define BINV_NEWTON_THRESHOLD 230
+#define REDC_1_TO_REDC_N_THRESHOLD 59
+
+#define MU_DIV_QR_THRESHOLD 1752
+#define MU_DIVAPPR_Q_THRESHOLD 1528
+#define MUPI_DIV_QR_THRESHOLD 82
+#define MU_BDIV_QR_THRESHOLD 1360
+#define MU_BDIV_Q_THRESHOLD 1470
+
+#define POWM_SEC_TABLE 1,16,102,336,1221
+
+#define MATRIX22_STRASSEN_THRESHOLD 16
+#define HGCD_THRESHOLD 120
+#define HGCD_APPR_THRESHOLD 143
+#define HGCD_REDUCE_THRESHOLD 4818
+#define GCD_DC_THRESHOLD 474
+#define GCDEXT_DC_THRESHOLD 345
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 15
+#define GET_STR_PRECOMPUTE_THRESHOLD 33
+#define SET_STR_DC_THRESHOLD 298
+#define SET_STR_PRECOMPUTE_THRESHOLD 1187
+
+#define FAC_DSC_THRESHOLD 602
+#define FAC_ODD_THRESHOLD 29
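
Illustrative only, not GMP's real dispatch code: the tuned values above are
limb counts, and a balanced n-limb multiply is routed roughly along a chain
like this, shown here with this file's K7 numbers.

  static const char *
  sketch_mul_choice (long n)
  {
    if (n < 28)   return "schoolbook basecase"; /* MUL_TOOM22_THRESHOLD */
    if (n < 85)   return "toom22 (Karatsuba)";  /* MUL_TOOM33_THRESHOLD */
    if (n < 147)  return "toom33";              /* MUL_TOOM44_THRESHOLD */
    if (n < 216)  return "toom44";              /* MUL_TOOM6H_THRESHOLD */
    if (n < 309)  return "toom6h";              /* MUL_TOOM8H_THRESHOLD */
    if (n < 7808) return "toom8h";              /* MUL_FFT_THRESHOLD */
    return "fft range";
  }
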
diff --git a/gmp/mpn/x86/k7/invert_limb.asm b/gmp/mpn/x86/k7/invert_limb.asm
new file mode 100644
index 0000000000..6cce455a9d
--- /dev/null
+++ b/gmp/mpn/x86/k7/invert_limb.asm
@@ -0,0 +1,193 @@
+dnl x86 mpn_invert_limb
+
+dnl Contributed to the GNU project by Niels Möller
+
+dnl Copyright 2009, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles (approx) div
+C P5 ?
+C P6 model 0-8,10-12 ?
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) ?
+C P4 model 0 (Willamette) ?
+C P4 model 1 (?) ?
+C P4 model 2 (Northwood) ?
+C P4 model 3 (Prescott) ?
+C P4 model 4 (Nocona) ?
+C AMD K6 ?
+C AMD K7 41 53
+C AMD K8 ?
+
+C TODO
+C * These c/l numbers are for a non-PIC build. Consider falling back to using
+C the 'div' instruction for PIC builds.
+C * Perhaps use this file--or at least the algorithm--for more machines than k7.
+
+C Register usage:
+C Input D in %edi
+C Current approximation is in %eax and/or %ecx
+C %ebx and %edx are temporaries
+C %esi and %ebp are unused
+
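
What the routine computes, stated as a one-line C reference (hypothetical
name; not the table-plus-refinement method used below): for a normalized
divisor d, i.e. one with the top bit set, the result is the reciprocal limb
floor((2^64 - 1) / d) - 2^32 used by the preinv division code.

  #include <stdint.h>

  static uint32_t
  ref_invert_limb (uint32_t d)        /* requires d >= 0x80000000 */
  {
    return (uint32_t) (UINT64_MAX / d - (1ULL << 32));
  }
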
+defframe(PARAM_DIVISOR,4)
+
+ASM_START()
+
+C Make approx_tab global to work around Apple relocation bug.
+ifdef(`DARWIN',`
+ deflit(`approx_tab', MPN(invert_limb_tab))
+ GLOBL approx_tab')
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_invert_limb)
+deflit(`FRAME', 0)
+ mov PARAM_DIVISOR, %eax
+ C Avoid push/pop on k7.
+ sub $8, %esp FRAME_subl_esp(8)
+ mov %ebx, (%esp)
+ mov %edi, 4(%esp)
+
+ mov %eax, %edi
+ shr $22, %eax
+ifdef(`PIC',`
+ LEA( approx_tab, %ebx)
+ movzwl -1024(%ebx, %eax, 2), %eax
+',`
+ movzwl -1024+approx_tab(%eax, %eax), %eax C %eax = v0
+')
+
+ C v1 = (v0 << 4) - ((v0*v0*d_21) >> 32) - 1
+ mov %eax, %ecx
+ imul %eax, %eax
+ mov %edi, %ebx
+ shr $11, %ebx
+ inc %ebx
+ mul %ebx
+ mov %edi, %ebx C Prepare
+ shr %ebx
+ sbb %eax, %eax
+ sub %eax, %ebx C %ebx = d_31, %eax = mask
+ shl $4, %ecx
+ dec %ecx
+ sub %edx, %ecx C %ecx = v1
+
+ C v_2 = (v1 << 15) + ((v1 *(2^48 - v1 * d31 + (v1 >> 1) & mask)) >> 33)
+ imul %ecx, %ebx
+ and %ecx, %eax
+ shr %eax
+ sub %ebx, %eax
+ mul %ecx
+ mov %edi, %eax C Prepare for next mul
+ shl $15, %ecx
+ shr %edx
+ add %edx, %ecx C %ecx = v2
+
+ mul %ecx
+ add %edi, %eax
+ mov %ecx, %eax
+ adc %edi, %edx
+ sub %edx, %eax C %eax = v3
+
+ mov (%esp), %ebx
+ mov 4(%esp), %edi
+ add $8, %esp
+
+ ret
+
+EPILOGUE()
+
+DEF_OBJECT(approx_tab,2)
+ .value 0x7fe1,0x7fa1,0x7f61,0x7f22,0x7ee3,0x7ea4,0x7e65,0x7e27
+ .value 0x7de9,0x7dab,0x7d6d,0x7d30,0x7cf3,0x7cb6,0x7c79,0x7c3d
+ .value 0x7c00,0x7bc4,0x7b89,0x7b4d,0x7b12,0x7ad7,0x7a9c,0x7a61
+ .value 0x7a27,0x79ec,0x79b2,0x7979,0x793f,0x7906,0x78cc,0x7894
+ .value 0x785b,0x7822,0x77ea,0x77b2,0x777a,0x7742,0x770b,0x76d3
+ .value 0x769c,0x7665,0x762f,0x75f8,0x75c2,0x758c,0x7556,0x7520
+ .value 0x74ea,0x74b5,0x7480,0x744b,0x7416,0x73e2,0x73ad,0x7379
+ .value 0x7345,0x7311,0x72dd,0x72aa,0x7277,0x7243,0x7210,0x71de
+ .value 0x71ab,0x7179,0x7146,0x7114,0x70e2,0x70b1,0x707f,0x704e
+ .value 0x701c,0x6feb,0x6fba,0x6f8a,0x6f59,0x6f29,0x6ef9,0x6ec8
+ .value 0x6e99,0x6e69,0x6e39,0x6e0a,0x6ddb,0x6dab,0x6d7d,0x6d4e
+ .value 0x6d1f,0x6cf1,0x6cc2,0x6c94,0x6c66,0x6c38,0x6c0a,0x6bdd
+ .value 0x6bb0,0x6b82,0x6b55,0x6b28,0x6afb,0x6acf,0x6aa2,0x6a76
+ .value 0x6a49,0x6a1d,0x69f1,0x69c6,0x699a,0x696e,0x6943,0x6918
+ .value 0x68ed,0x68c2,0x6897,0x686c,0x6842,0x6817,0x67ed,0x67c3
+ .value 0x6799,0x676f,0x6745,0x671b,0x66f2,0x66c8,0x669f,0x6676
+ .value 0x664d,0x6624,0x65fc,0x65d3,0x65aa,0x6582,0x655a,0x6532
+ .value 0x650a,0x64e2,0x64ba,0x6493,0x646b,0x6444,0x641c,0x63f5
+ .value 0x63ce,0x63a7,0x6381,0x635a,0x6333,0x630d,0x62e7,0x62c1
+ .value 0x629a,0x6275,0x624f,0x6229,0x6203,0x61de,0x61b8,0x6193
+ .value 0x616e,0x6149,0x6124,0x60ff,0x60da,0x60b6,0x6091,0x606d
+ .value 0x6049,0x6024,0x6000,0x5fdc,0x5fb8,0x5f95,0x5f71,0x5f4d
+ .value 0x5f2a,0x5f07,0x5ee3,0x5ec0,0x5e9d,0x5e7a,0x5e57,0x5e35
+ .value 0x5e12,0x5def,0x5dcd,0x5dab,0x5d88,0x5d66,0x5d44,0x5d22
+ .value 0x5d00,0x5cde,0x5cbd,0x5c9b,0x5c7a,0x5c58,0x5c37,0x5c16
+ .value 0x5bf5,0x5bd4,0x5bb3,0x5b92,0x5b71,0x5b51,0x5b30,0x5b10
+ .value 0x5aef,0x5acf,0x5aaf,0x5a8f,0x5a6f,0x5a4f,0x5a2f,0x5a0f
+ .value 0x59ef,0x59d0,0x59b0,0x5991,0x5972,0x5952,0x5933,0x5914
+ .value 0x58f5,0x58d6,0x58b7,0x5899,0x587a,0x585b,0x583d,0x581f
+ .value 0x5800,0x57e2,0x57c4,0x57a6,0x5788,0x576a,0x574c,0x572e
+ .value 0x5711,0x56f3,0x56d5,0x56b8,0x569b,0x567d,0x5660,0x5643
+ .value 0x5626,0x5609,0x55ec,0x55cf,0x55b2,0x5596,0x5579,0x555d
+ .value 0x5540,0x5524,0x5507,0x54eb,0x54cf,0x54b3,0x5497,0x547b
+ .value 0x545f,0x5443,0x5428,0x540c,0x53f0,0x53d5,0x53b9,0x539e
+ .value 0x5383,0x5368,0x534c,0x5331,0x5316,0x52fb,0x52e0,0x52c6
+ .value 0x52ab,0x5290,0x5276,0x525b,0x5240,0x5226,0x520c,0x51f1
+ .value 0x51d7,0x51bd,0x51a3,0x5189,0x516f,0x5155,0x513b,0x5121
+ .value 0x5108,0x50ee,0x50d5,0x50bb,0x50a2,0x5088,0x506f,0x5056
+ .value 0x503c,0x5023,0x500a,0x4ff1,0x4fd8,0x4fbf,0x4fa6,0x4f8e
+ .value 0x4f75,0x4f5c,0x4f44,0x4f2b,0x4f13,0x4efa,0x4ee2,0x4eca
+ .value 0x4eb1,0x4e99,0x4e81,0x4e69,0x4e51,0x4e39,0x4e21,0x4e09
+ .value 0x4df1,0x4dda,0x4dc2,0x4daa,0x4d93,0x4d7b,0x4d64,0x4d4d
+ .value 0x4d35,0x4d1e,0x4d07,0x4cf0,0x4cd8,0x4cc1,0x4caa,0x4c93
+ .value 0x4c7d,0x4c66,0x4c4f,0x4c38,0x4c21,0x4c0b,0x4bf4,0x4bde
+ .value 0x4bc7,0x4bb1,0x4b9a,0x4b84,0x4b6e,0x4b58,0x4b41,0x4b2b
+ .value 0x4b15,0x4aff,0x4ae9,0x4ad3,0x4abd,0x4aa8,0x4a92,0x4a7c
+ .value 0x4a66,0x4a51,0x4a3b,0x4a26,0x4a10,0x49fb,0x49e5,0x49d0
+ .value 0x49bb,0x49a6,0x4990,0x497b,0x4966,0x4951,0x493c,0x4927
+ .value 0x4912,0x48fe,0x48e9,0x48d4,0x48bf,0x48ab,0x4896,0x4881
+ .value 0x486d,0x4858,0x4844,0x482f,0x481b,0x4807,0x47f3,0x47de
+ .value 0x47ca,0x47b6,0x47a2,0x478e,0x477a,0x4766,0x4752,0x473e
+ .value 0x472a,0x4717,0x4703,0x46ef,0x46db,0x46c8,0x46b4,0x46a1
+ .value 0x468d,0x467a,0x4666,0x4653,0x4640,0x462c,0x4619,0x4606
+ .value 0x45f3,0x45e0,0x45cd,0x45ba,0x45a7,0x4594,0x4581,0x456e
+ .value 0x455b,0x4548,0x4536,0x4523,0x4510,0x44fe,0x44eb,0x44d8
+ .value 0x44c6,0x44b3,0x44a1,0x448f,0x447c,0x446a,0x4458,0x4445
+ .value 0x4433,0x4421,0x440f,0x43fd,0x43eb,0x43d9,0x43c7,0x43b5
+ .value 0x43a3,0x4391,0x437f,0x436d,0x435c,0x434a,0x4338,0x4327
+ .value 0x4315,0x4303,0x42f2,0x42e0,0x42cf,0x42bd,0x42ac,0x429b
+ .value 0x4289,0x4278,0x4267,0x4256,0x4244,0x4233,0x4222,0x4211
+ .value 0x4200,0x41ef,0x41de,0x41cd,0x41bc,0x41ab,0x419a,0x418a
+ .value 0x4179,0x4168,0x4157,0x4147,0x4136,0x4125,0x4115,0x4104
+ .value 0x40f4,0x40e3,0x40d3,0x40c2,0x40b2,0x40a2,0x4091,0x4081
+ .value 0x4071,0x4061,0x4050,0x4040,0x4030,0x4020,0x4010,0x4000
+END_OBJECT(approx_tab)
diff --git a/gmp/mpn/x86/k7/mmx/com.asm b/gmp/mpn/x86/k7/mmx/com.asm
new file mode 100644
index 0000000000..a258c224f1
--- /dev/null
+++ b/gmp/mpn/x86/k7/mmx/com.asm
@@ -0,0 +1,125 @@
+dnl AMD Athlon mpn_com -- mpn bitwise one's complement.
+
+dnl Copyright 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 1.0 cycles/limb
+
+
+C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The loop form below is necessary for the claimed speed. It needs to be
+C aligned to a 16 byte boundary and only 16 bytes long. Maybe that's so it
+C fits in a BTB entry. The adjustments to %eax and %edx avoid offsets on
+C the movq's and achieve the necessary size.
+C
+C If both src and dst are 4mod8, the loop runs at 1.5 c/l. So long as one
+C of the two is 0mod8, it runs at 1.0 c/l. On that basis dst is checked
+C (offset by the size, as per the loop addressing) and one high limb
+C processed separately to get alignment.
+C
+C The padding for the nails case is unattractive, but shouldn't cost any
+C cycles. Explicit .byte's guarantee the desired instructions, at a point
+C where we're probably stalled waiting for loads anyway.
+C
+C Enhancements:
+C
+C The combination load/pxor/store might be able to be unrolled to approach
+C 0.5 c/l if desired.
+
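
Reference semantics in C (hypothetical name, 32-bit limbs, no nails): each
destination limb is the bitwise complement of the corresponding source limb.
The MMX loop below does the same two limbs at a time by pxor'ing against an
all-ones register.

  #include <stdint.h>

  static void
  ref_com (uint32_t *dst, const uint32_t *src, long size)
  {
    for (long i = 0; i < size; i++)
      dst[i] = ~src[i];
  }
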
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(16)
+
+PROLOGUE(mpn_com)
+deflit(`FRAME',0)
+
+ movl PARAM_DST, %edx
+ movl PARAM_SIZE, %ecx
+ pcmpeqd %mm7, %mm7
+
+ leal (%edx,%ecx,4), %eax
+ andl $4, %eax
+ifelse(GMP_NAIL_BITS,0,,
+` psrld $GMP_NAIL_BITS, %mm7') C GMP_NUMB_MASK
+
+ movl PARAM_SRC, %eax
+ movd -4(%eax,%ecx,4), %mm0 C src high limb
+
+ifelse(GMP_NAIL_BITS,0,,
+` C padding for alignment below
+ .byte 0x8d, 0xb6, 0x00, 0x00, 0x00, 0x00 C lea 0(%esi),%esi
+ .byte 0x8d, 0xbf, 0x00, 0x00, 0x00, 0x00 C lea 0(%edi),%edi
+')
+
+ jz L(aligned)
+
+ pxor %mm7, %mm0
+ movd %mm0, -4(%edx,%ecx,4) C dst high limb
+ decl %ecx
+ jz L(done)
+L(aligned):
+
+ addl $4, %eax
+ addl $4, %edx
+ decl %ecx
+ jz L(one)
+
+ C offset 0x30 for no nails, or 0x40 for nails
+ ALIGN(16)
+L(top):
+ C eax src
+ C ebx
+ C ecx counter
+ C edx dst
+
+ subl $2, %ecx
+ movq (%eax,%ecx,4), %mm0
+ pxor %mm7, %mm0
+ movq %mm0, (%edx,%ecx,4)
+ jg L(top)
+
+ jnz L(done) C if size even
+
+L(one):
+ movd -4(%eax), %mm0 C src low limb
+ pxor %mm7, %mm0
+ movd %mm0, -4(%edx) C dst low limb
+
+L(done):
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/gmp/mpn/x86/k7/mmx/copyd.asm b/gmp/mpn/x86/k7/mmx/copyd.asm
new file mode 100644
index 0000000000..59ece40920
--- /dev/null
+++ b/gmp/mpn/x86/k7/mmx/copyd.asm
@@ -0,0 +1,144 @@
+dnl AMD K7 mpn_copyd -- copy limb vector, decrementing.
+
+dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C alignment dst/src, A=0mod8 N=4mod8
+C A/A A/N N/A N/N
+C K7 0.75 1.0 1.0 0.75
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The various comments in mpn/x86/k7/mmx/copyi.asm apply here too.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl parameter space reused
+define(SAVE_EBX,`PARAM_SIZE')
+define(SAVE_ESI,`PARAM_SRC')
+
+dnl minimum 5 since the unrolled code can't handle less than 5
+deflit(UNROLL_THRESHOLD, 5)
+
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_copyd)
+
+ movl PARAM_SIZE, %ecx
+ movl %ebx, SAVE_EBX
+
+ movl PARAM_SRC, %eax
+ movl PARAM_DST, %edx
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ jae L(unroll)
+
+ orl %ecx, %ecx
+ jz L(simple_done)
+
+L(simple):
+ C eax src
+ C ebx scratch
+ C ecx counter
+ C edx dst
+ C
+ C this loop is 2 cycles/limb
+
+ movl -4(%eax,%ecx,4), %ebx
+ movl %ebx, -4(%edx,%ecx,4)
+ decl %ecx
+ jnz L(simple)
+
+L(simple_done):
+ movl SAVE_EBX, %ebx
+ ret
+
+
+L(unroll):
+ movl %esi, SAVE_ESI
+ leal (%eax,%ecx,4), %ebx
+ leal (%edx,%ecx,4), %esi
+
+ andl %esi, %ebx
+ movl SAVE_ESI, %esi
+ subl $4, %ecx C size-4
+
+ testl $4, %ebx C testl to pad code closer to 16 bytes for L(top)
+ jz L(aligned)
+
+ C both src and dst unaligned, process one limb to align them
+ movl 12(%eax,%ecx,4), %ebx
+ movl %ebx, 12(%edx,%ecx,4)
+ decl %ecx
+L(aligned):
+
+
+ ALIGN(16)
+L(top):
+ C eax src
+ C ebx
+ C ecx counter, limbs
+ C edx dst
+
+ movq 8(%eax,%ecx,4), %mm0
+ movq (%eax,%ecx,4), %mm1
+ subl $4, %ecx
+ movq %mm0, 16+8(%edx,%ecx,4)
+ movq %mm1, 16(%edx,%ecx,4)
+ jns L(top)
+
+
+ C now %ecx is -4 to -1 representing respectively 0 to 3 limbs remaining
+
+ testb $2, %cl
+ jz L(finish_not_two)
+
+ movq 8(%eax,%ecx,4), %mm0
+ movq %mm0, 8(%edx,%ecx,4)
+L(finish_not_two):
+
+ testb $1, %cl
+ jz L(done)
+
+ movl (%eax), %ebx
+ movl %ebx, (%edx)
+
+L(done):
+ movl SAVE_EBX, %ebx
+ emms
+ ret
+
+
+EPILOGUE()
diff --git a/gmp/mpn/x86/k7/mmx/copyi.asm b/gmp/mpn/x86/k7/mmx/copyi.asm
new file mode 100644
index 0000000000..9a28f927ec
--- /dev/null
+++ b/gmp/mpn/x86/k7/mmx/copyi.asm
@@ -0,0 +1,157 @@
+dnl AMD K7 mpn_copyi -- copy limb vector, incrementing.
+
+dnl Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C alignment dst/src, A=0mod8 N=4mod8
+C A/A A/N N/A N/N
+C K7 0.75 1.0 1.0 0.75
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size.
+C
+C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at
+C 1.33 c/l.
+C
+C The K7 can do a 64-bit load and 64-bit store in one cycle (optimization
+C guide 22007 appendix B), so 0.5 c/l should be possible, though nothing
+C under 0.7 c/l is known. Apparently only two 32-bit stores can be done in
+C one cycle, so perhaps some scheduling is needed to ensure it's a
+C load+store in each cycle, not store+store.
+C
+C If both source and destination are unaligned then one limb is processed at
+C the start to make them aligned and so get 0.75 c/l, whereas if they'd been
+C used unaligned it would be 1.5 c/l.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl parameter space reused
+define(SAVE_EBX,`PARAM_SIZE')
+
+dnl minimum 5 since the unrolled code can't handle less than 5
+deflit(UNROLL_THRESHOLD, 5)
+
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_copyi)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl %ebx, SAVE_EBX
+
+ movl PARAM_SRC, %eax
+ movl PARAM_DST, %edx
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ jae L(unroll)
+
+ orl %ecx, %ecx
+ jz L(simple_done)
+
+L(simple):
+ C eax src, incrementing
+ C ebx scratch
+ C ecx counter
+ C edx dst, incrementing
+ C
+ C this loop is 2 cycles/limb
+
+ movl (%eax), %ebx
+ movl %ebx, (%edx)
+ decl %ecx
+ leal 4(%eax), %eax
+ leal 4(%edx), %edx
+ jnz L(simple)
+
+L(simple_done):
+ movl SAVE_EBX, %ebx
+ ret
+
+
+L(unroll):
+ movl %eax, %ebx
+ leal -12(%eax,%ecx,4), %eax C src end - 12
+ subl $3, %ecx C size-3
+
+ andl %edx, %ebx
+ leal (%edx,%ecx,4), %edx C dst end - 12
+ negl %ecx
+
+ testl $4, %ebx C testl to pad code closer to 16 bytes for L(top)
+ jz L(aligned)
+
+ C both src and dst unaligned, process one limb to align them
+ movl (%eax,%ecx,4), %ebx
+ movl %ebx, (%edx,%ecx,4)
+ incl %ecx
+L(aligned):
+
+
+ ALIGN(16)
+L(top):
+ C eax src end - 12
+ C ebx
+ C ecx counter, negative, limbs
+ C edx dst end - 12
+
+ movq (%eax,%ecx,4), %mm0
+ movq 8(%eax,%ecx,4), %mm1
+ addl $4, %ecx
+ movq %mm0, -16(%edx,%ecx,4)
+ movq %mm1, -16+8(%edx,%ecx,4)
+ ja L(top) C jump no carry and not zero
+
+
+ C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining
+
+ testb $2, %cl
+ jnz L(finish_not_two)
+
+ movq (%eax,%ecx,4), %mm0
+ movq %mm0, (%edx,%ecx,4)
+L(finish_not_two):
+
+ testb $1, %cl
+ jnz L(done)
+
+ movl 8(%eax), %ebx
+ movl %ebx, 8(%edx)
+
+L(done):
+ movl SAVE_EBX, %ebx
+ emms
+ ret
+
+EPILOGUE()
diff --git a/gmp/mpn/x86/k7/mmx/divrem_1.asm b/gmp/mpn/x86/k7/mmx/divrem_1.asm
new file mode 100644
index 0000000000..cf343280bb
--- /dev/null
+++ b/gmp/mpn/x86/k7/mmx/divrem_1.asm
@@ -0,0 +1,832 @@
+dnl AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb
+dnl division.
+
+dnl Copyright 1999-2002, 2004 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part.
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t carry);
+C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t inverse,
+C unsigned shift);
+C
+C Algorithm:
+C
+C The method and nomenclature follow part 8 of "Division by Invariant
+C Integers using Multiplication" by Granlund and Montgomery, reference in
+C gmp.texi.
+C
+C The "and"s shown in the paper are done here with "cmov"s. "m" is written
+C for m', and "d" for d_norm, which won't cause any confusion since it's
+C only the normalized divisor that's of any use in the code. "b" is written
+C for 2^N, the size of a limb, N being 32 here.
+C
+C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
+C "n-(q1+1)*d"; this rearrangement gives the same two-limb answer. If
+C q1==0xFFFFFFFF, then q1+1 would overflow. We branch to a special case
+C "q1_ff" if this occurs. Since the true quotient is either q1 or q1+1 then
+C if q1==0xFFFFFFFF that must be the right value.
+C
+C For the last and second last steps q1==0xFFFFFFFF is instead handled by an
+C sbbl to go back to 0xFFFFFFFF if an overflow occurs when adding 1. This
+C then goes through as normal, finding no addback required.  The sbbl costs
+C an extra cycle over what the main loop code does, but it keeps code size
+C and complexity down.
+C
+C Notes:
+C
+C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
+C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero
+C carry, since in normal circumstances that will be a very rare event.
+C
+C The test for skipping a division is branch free (once size>=1 is tested).
+C The store to the destination high limb is 0 when a divide is skipped, or
+C if it's not skipped then a copy of the src high limb is used. The latter
+C is in case src==dst.
+C
+C There's a small bias towards expecting xsize==0, by having code for
+C xsize==0 in a straight line and xsize!=0 under forward jumps.
+C
+C Alternatives:
+C
+C If the divisor is normalized (high bit set) then a division step can
+C always be skipped, since the high destination limb is always 0 or 1 in
+C that case. It doesn't seem worth checking for this though, since it
+C probably occurs infrequently, in particular note that big_base for a
+C decimal mpn_get_str is not normalized in a 32-bit limb.
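+C
+C Reference:
+C
+C For reference, the function divides {src,size} extended by xsize zero
+C (fraction) limbs by divisor, storing size+xsize quotient limbs at dst
+C (least significant first) and returning the remainder.  A naive
+C equivalent, as an illustrative sketch only (ref_divrem_1 is just a name
+C for this comment, 32-bit limbs, plain C types rather than the GMP ones):
+C
+C	uint32_t ref_divrem_1 (uint32_t *dst, long xsize,
+C	                       const uint32_t *src, long size, uint32_t d)
+C	{
+C	  uint64_t n, r = 0;   /* r = carry for the divrem_1c entry point */
+C	  long i;
+C	  for (i = size - 1; i >= 0; i--)
+C	    {
+C	      n = (r << 32) | src[i];            /* integer part */
+C	      dst[xsize + i] = (uint32_t) (n / d);
+C	      r = n % d;
+C	    }
+C	  for (i = xsize - 1; i >= 0; i--)
+C	    {
+C	      n = r << 32;                       /* fraction part */
+C	      dst[i] = (uint32_t) (n / d);
+C	      r = n % d;
+C	    }
+C	  return (uint32_t) r;
+C	}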
+
+
+dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
+dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
+dnl
+dnl The inverse takes about 50 cycles to calculate, but after that the
+dnl multiply is 17 c/l versus division at 42 c/l.
+dnl
+dnl At 3 limbs the mul is a touch faster than div on the integer part, and
+dnl even more so on the fractional part.
+
+deflit(MUL_THRESHOLD, 3)
+
+
+defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1
+defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1
+defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC, 12)
+defframe(PARAM_XSIZE, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+
+defframe(VAR_NORM, -20)
+defframe(VAR_INVERSE, -24)
+defframe(VAR_SRC, -28)
+defframe(VAR_DST, -32)
+defframe(VAR_DST_STOP,-36)
+
+deflit(STACK_SPACE, 36)
+
+ TEXT
+ ALIGN(32)
+
+PROLOGUE(mpn_preinv_divrem_1)
+deflit(`FRAME',0)
+ movl PARAM_XSIZE, %ecx
+ movl PARAM_DST, %edx
+ subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_SIZE, %ebx
+
+ leal 8(%edx,%ecx,4), %edx C &dst[xsize+2]
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %edx, VAR_DST_STOP C &dst[xsize+2]
+ movl %edi, SAVE_EDI
+ xorl %edi, %edi C carry
+
+ movl -4(%esi,%ebx,4), %eax C src high limb
+ xor %ecx, %ecx
+
+ C
+
+ C
+
+ cmpl %ebp, %eax C high cmp divisor
+
+ cmovc( %eax, %edi) C high is carry if high<divisor
+ cmovnc( %eax, %ecx) C 0 if skip div, src high if not
+ C (the latter in case src==dst)
+
+ movl %ecx, -12(%edx,%ebx,4) C dst high limb
+ sbbl $0, %ebx C skip one division if high<divisor
+ movl PARAM_PREINV_SHIFT, %ecx
+
+ leal -8(%edx,%ebx,4), %edx C &dst[xsize+size]
+ movl $32, %eax
+
+ movl %edx, VAR_DST C &dst[xsize+size]
+
+ shll %cl, %ebp C d normalized
+ subl %ecx, %eax
+ movl %ecx, VAR_NORM
+
+ movd %eax, %mm7 C rshift
+ movl PARAM_PREINV_INVERSE, %eax
+ jmp L(start_preinv)
+
+EPILOGUE()
+
+
+ ALIGN(16)
+
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+ movl PARAM_CARRY, %edx
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
+ jmp L(start_1c)
+
+EPILOGUE()
+
+
+ C offset 0xa1, close enough to aligned
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl $0, %edx C initial carry (if can't skip a div)
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+ orl %ecx, %ecx C size
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+ leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
+
+ jz L(no_skip_div) C if size==0
+ movl -4(%esi,%ecx,4), %eax C src high limb
+ xorl %esi, %esi
+
+ cmpl %ebp, %eax C high cmp divisor
+
+ cmovc( %eax, %edx) C high is carry if high<divisor
+ cmovnc( %eax, %esi) C 0 if skip div, src high if not
+
+ movl %esi, (%edi,%ecx,4) C dst high limb
+ sbbl $0, %ecx C size-1 if high<divisor
+ movl PARAM_SRC, %esi C reload
+L(no_skip_div):
+
+
+L(start_1c):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ leal (%ebx,%ecx), %eax C size+xsize
+ cmpl $MUL_THRESHOLD, %eax
+ jae L(mul_by_inverse)
+
+
+C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
+C It'd be possible to write them out without the looping, but no speedup
+C would be expected.
+C
+C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
+C integer part, but curiously not on the fractional part, where %ebp is a
+C (fixed) couple of cycles faster.
+
+ orl %ecx, %ecx
+ jz L(divide_no_integer)
+
+L(divide_integer):
+ C eax scratch (quotient)
+ C ebx xsize
+ C ecx counter
+ C edx scratch (remainder)
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ movl -4(%esi,%ecx,4), %eax
+
+ divl PARAM_DIVISOR
+
+ movl %eax, (%edi,%ecx,4)
+ decl %ecx
+ jnz L(divide_integer)
+
+
+L(divide_no_integer):
+ movl PARAM_DST, %edi
+ orl %ebx, %ebx
+ jnz L(divide_fraction)
+
+L(divide_done):
+ movl SAVE_ESI, %esi
+ movl SAVE_EDI, %edi
+ movl %edx, %eax
+
+ movl SAVE_EBX, %ebx
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+L(divide_fraction):
+ C eax scratch (quotient)
+ C ebx counter
+ C ecx
+ C edx scratch (remainder)
+ C esi
+ C edi dst
+ C ebp divisor
+
+ movl $0, %eax
+
+ divl %ebp
+
+ movl %eax, -4(%edi,%ebx,4)
+ decl %ebx
+ jnz L(divide_fraction)
+
+ jmp L(divide_done)
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ bsrl %ebp, %eax C 31-l
+
+ leal 12(%edi), %ebx C &dst[xsize+2], loop dst stop
+ leal 4(%edi,%ecx,4), %edi C &dst[xsize+size]
+
+ movl %edi, VAR_DST
+ movl %ebx, VAR_DST_STOP
+
+ movl %ecx, %ebx C size
+ movl $31, %ecx
+
+ movl %edx, %edi C carry
+ movl $-1, %edx
+
+ C
+
+ xorl %eax, %ecx C l
+ incl %eax C 32-l
+
+ shll %cl, %ebp C d normalized
+ movl %ecx, VAR_NORM
+
+ movd %eax, %mm7
+
+ movl $-1, %eax
+ subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1
+
+ divl %ebp C floor (b*(b-d)-1) / d
+
+L(start_preinv):
+ C eax inverse
+ C ebx size
+ C ecx shift
+ C edx
+ C esi src
+ C edi carry
+ C ebp divisor
+ C
+ C mm7 rshift
+
+ orl %ebx, %ebx C size
+ movl %eax, VAR_INVERSE
+ leal -12(%esi,%ebx,4), %eax C &src[size-3]
+
+ jz L(start_zero)
+ movl %eax, VAR_SRC
+ cmpl $1, %ebx
+
+ movl 8(%eax), %esi C src high limb
+ jz L(start_one)
+
+L(start_two_or_more):
+ movl 4(%eax), %edx C src second highest limb
+
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shldl( %cl, %edx, %esi) C n10 = high,second << l
+
+ cmpl $2, %ebx
+ je L(integer_two_left)
+ jmp L(integer_top)
+
+
+L(start_one):
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shll %cl, %esi C n10 = high << l
+ movl %eax, VAR_SRC
+ jmp L(integer_one_left)
+
+
+L(start_zero):
+ C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
+ C skipped a division.
+
+ shll %cl, %edi C n2 = carry << l
+ movl %edi, %eax C return value for zero_done
+ cmpl $0, PARAM_XSIZE
+
+ je L(zero_done)
+ jmp L(fraction_some)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
+C execution. The instruction scheduling is important, with various
+C apparently equivalent forms running 1 to 5 cycles slower.
+C
+C A lower bound for the time would seem to be 16 cycles, based on the
+C following successive dependencies.
+C
+C cycles
+C n2+n1 1
+C mul 6
+C q1+1 1
+C mul 6
+C sub 1
+C addback 1
+C ---
+C 16
+C
+C This chain is what the loop has already, but 16 cycles isn't achieved.
+C K7 has enough decode, and probably enough execute (depending maybe on what
+C a mul actually consumes), but nothing running under 17 has been found.
+C
+C In theory n2+n1 could be done in the sub and addback stages (by
+C calculating both n2 and n2+n1 there), but lack of registers makes this an
+C unlikely proposition.
+C
+C The jz in the loop keeps the q1+1 stage to 1 cycle. Handling an overflow
+C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
+C chain, and nothing better than 18 cycles has been found when using it.
+C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
+C be an extremely rare event.
+C
+C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
+C if some special data is coming out with this always, the q1_ff special
+C case actually runs at 15 c/l. 0x2FFF...FFFD divided by 3 is a good way to
+C induce the q1_ff case, for speed measurements or testing. Note that
+C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
+C
+C The instruction groupings and empty comments show the cycles for a naive
+C in-order view of the code (conveniently ignoring the load latency on
+C VAR_INVERSE). This shows some of where the time is going, but is nonsense
+C to the extent that out-of-order execution rearranges it. In this case
+C there's 19 cycles shown, but it executes at 17.
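+C
+C In C terms, one step of the loop is roughly the following (an
+C illustrative sketch only, mirroring the register comments below rather
+C than defining the method; ref_div_step is just a name for this comment;
+C 32-bit limbs, d the normalized divisor, m = floor((b*(b-d)-1)/d) the
+C inverse, and n2:n10 the current two-limb value with n2<d):
+C
+C	uint32_t ref_div_step (uint32_t n2, uint32_t n10, uint32_t d,
+C	                       uint32_t m, uint32_t *r)
+C	{
+C	  uint32_t n1   = n10 >> 31;            /* high bit of n10 */
+C	  uint32_t nadj = n10 + (n1 ? d : 0);   /* low 32 bits only */
+C	  uint32_t q1   = n2
+C	    + (uint32_t) (((uint64_t) m * (n2 + n1) + nadj) >> 32);
+C	  uint64_t n  = ((uint64_t) n2 << 32) | n10;
+C	  uint64_t qd = ((uint64_t) q1 + 1) * d;
+C	  if (n < qd)                           /* addback case */
+C	    { *r = (uint32_t) (n - qd + d);  return q1; }
+C	  *r = (uint32_t) (n - qd);
+C	  return q1 + 1;
+C	}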
+
+ ALIGN(16)
+L(integer_top):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (src, dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 scratch (src qword)
+ C mm7 rshift for normalization
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+ movl VAR_SRC, %ecx
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movq (%ecx), %mm0 C next limb and the one below it
+ subl $4, %ecx
+
+ movl %ecx, VAR_SRC
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2+1
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+ jz L(q1_ff)
+ movl VAR_DST, %ecx
+
+ mull %ebx C (q1+1)*d
+
+ psrlq %mm7, %mm0
+
+ leal -4(%ecx), %ecx
+
+ C
+
+ subl %eax, %esi
+ movl VAR_DST_STOP, %eax
+
+ C
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ movd %mm0, %esi
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ sbbl $0, %ebx C q
+ cmpl %eax, %ecx
+
+ movl %ebx, (%ecx)
+ movl %ecx, VAR_DST
+ jne L(integer_top)
+
+
+L(integer_loop_done):
+
+
+C -----------------------------------------------------------------------------
+C
+C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
+C q1_ff special case.  This makes the code a bit smaller and simpler, and
+C costs only 1 cycle (each).
+
+L(integer_two_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (src, dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm7 rshift
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+ movl PARAM_SRC, %ecx
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movd (%ecx), %mm0 C src low limb
+
+ movl VAR_DST_STOP, %ecx
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2+1
+ movl %ebp, %eax C d
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx
+
+ mull %ebx C (q1+1)*d
+
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+
+ C
+
+ subl %eax, %esi
+
+ C
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ movd %mm0, %esi
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ sbbl $0, %ebx C q
+
+ movl %ebx, -4(%ecx)
+
+
+C -----------------------------------------------------------------------------
+L(integer_one_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx dst
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm7 rshift
+
+ movl VAR_DST_STOP, %ecx
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ C
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2+1
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx C q1 if q1+1 overflowed
+
+ mull %ebx
+
+ C
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ C
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ sbbl $0, %ebx C q
+
+ movl %ebx, -8(%ecx)
+ subl $8, %ecx
+
+
+
+L(integer_none):
+ cmpl $0, PARAM_XSIZE
+ jne L(fraction_some)
+
+ movl %edi, %eax
+L(fraction_done):
+ movl VAR_NORM, %ecx
+L(zero_done):
+ movl SAVE_EBP, %ebp
+
+ movl SAVE_EDI, %edi
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EBX, %ebx
+ addl $STACK_SPACE, %esp
+
+ shrl %cl, %eax
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d
+
+L(q1_ff):
+ C eax (divisor)
+ C ebx (q1+1 == 0)
+ C ecx
+ C edx
+ C esi n10
+ C edi n2
+ C ebp divisor
+
+ movl VAR_DST, %ecx
+ movl VAR_DST_STOP, %edx
+ subl $4, %ecx
+
+ psrlq %mm7, %mm0
+ leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
+ movl %ecx, VAR_DST
+
+ movd %mm0, %esi C next n10
+
+ movl $-1, (%ecx)
+ cmpl %ecx, %edx
+ jne L(integer_top)
+
+ jmp L(integer_loop_done)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C Being the fractional part, the "source" limbs are all zero, meaning
+C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
+C
+C The loop runs at 15 cycles. The dependent chain is the same as the
+C general case above, but without the n2+n1 stage (due to n1==0), so 15
+C would seem to be the lower bound.
+C
+C A not entirely obvious simplification is that q1+1 never overflows a limb,
+C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
+C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
+C rnd() means rounding down to a multiple of d.
+C
+C m*n2 + b*n2 <= m*(d-1) + b*(d-1)
+C = m*d + b*d - m - b
+C = floor((b(b-d)-1)/d)*d + b*d - m - b
+C = rnd(b(b-d)-1) + b*d - m - b
+C = rnd(b(b-d)-1 + b*d) - m - b
+C = rnd(b*b-1) - m - b
+C <= (b-2)*b
+C
+C Unchanged from the general case is that the final quotient limb q can be
+C either q1 or q1+1, and the q1+1 case occurs often. This can be seen from
+C equation 8.4 of the paper which simplifies as follows when n1==0 and
+C n0==0.
+C
+C n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
+C
+C As before, the instruction groupings and empty comments show a naive
+C in-order view of the code, which is made nonsense by out-of-order
+C execution.  There are 17 cycles shown, but it executes at 15.
+C
+C Rotating the store q and remainder->n2 instructions up to the top of the
+C loop gets the run time down from 16 to 15.
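+C
+C In the terms of the ref_div_step sketch given before the integer loop,
+C each step here has n10 = 0 (hence n1 = 0 and nadj = 0), so it reduces
+C to roughly (again only an illustration, 32-bit limbs):
+C
+C	q1 = n2 + (uint32_t) (((uint64_t) m * n2) >> 32);
+C	n  = (uint64_t) n2 << 32;
+C	qd = ((uint64_t) q1 + 1) * d;    /* q1+1 never overflows, as shown */
+C	q  = (n < qd ? q1 : q1 + 1);     /* addback of d in the n < qd case */
+C	r  = (uint32_t) (n < qd ? n - qd + d : n - qd);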
+
+ ALIGN(16)
+L(fraction_some):
+ C eax
+ C ebx
+ C ecx
+ C edx
+ C esi
+ C edi carry
+ C ebp divisor
+
+ movl PARAM_DST, %esi
+ movl VAR_DST_STOP, %ecx C &dst[xsize+2]
+ movl %edi, %eax
+
+ subl $8, %ecx C &dst[xsize]
+ jmp L(fraction_entry)
+
+
+ ALIGN(16)
+L(fraction_top):
+ C eax n2 carry, then scratch
+ C ebx scratch (nadj, q1)
+ C ecx dst, decrementing
+ C edx scratch
+ C esi dst stop point
+ C edi (will be n2)
+ C ebp divisor
+
+ movl %ebx, (%ecx) C previous q
+ movl %eax, %edi C remainder->n2
+
+L(fraction_entry):
+ mull VAR_INVERSE C m*n2
+
+ movl %ebp, %eax C d
+ subl $4, %ecx C dst
+ leal 1(%edi), %ebx
+
+ C
+
+ C
+
+ C
+
+ C
+
+ addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1
+
+ mull %ebx C (q1+1)*d
+
+ C
+
+ C
+
+ C
+
+ negl %eax C low of n - (q1+1)*d
+
+ C
+
+ sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry
+ leal (%ebp,%eax), %edx
+
+ cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
+ sbbl $0, %ebx C q
+ cmpl %esi, %ecx
+
+ jne L(fraction_top)
+
+
+ movl %ebx, (%ecx)
+ jmp L(fraction_done)
+
+EPILOGUE()
diff --git a/gmp/mpn/x86/k7/mmx/lshift.asm b/gmp/mpn/x86/k7/mmx/lshift.asm
new file mode 100644
index 0000000000..b3383cf2c3
--- /dev/null
+++ b/gmp/mpn/x86/k7/mmx/lshift.asm
@@ -0,0 +1,481 @@
+dnl AMD K7 mpn_lshift -- mpn left shift.
+
+dnl Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 1.21 cycles/limb (at 16 limbs/loop).
+
+
+
+dnl K7: UNROLL_COUNT cycles/limb
+dnl 4 1.51
+dnl 8 1.26
+dnl 16 1.21
+dnl 32 1.2
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C Shift src,size left by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the right. The bits shifted out at the left are
+C the return value.
+C
+C The comments in mpn_rshift apply here too.
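+C
+C For reference, the operation is the following (an illustrative sketch
+C only, not the GMP reference code; ref_lshift is just a name for this
+C comment, 32-bit limbs, and 1 <= shift <= 31 as the mpn interface
+C requires):
+C
+C	mp_limb_t ref_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C	                      unsigned shift)
+C	{
+C	  mp_limb_t retval = src[size-1] >> (32 - shift);
+C	  mp_size_t i;
+C	  for (i = size-1; i > 0; i--)
+C	    dst[i] = (src[i] << shift) | (src[i-1] >> (32 - shift));
+C	  dst[0] = src[0] << shift;
+C	  return retval;
+C	}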
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 10)
+',`
+deflit(UNROLL_THRESHOLD, 10)
+')
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EDI, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EBX, -12)
+deflit(SAVE_SIZE, 12)
+
+ TEXT
+ ALIGN(32)
+
+PROLOGUE(mpn_lshift)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %eax
+ movl PARAM_SRC, %edx
+ subl $SAVE_SIZE, %esp
+deflit(`FRAME',SAVE_SIZE)
+
+ movl PARAM_SHIFT, %ecx
+ movl %edi, SAVE_EDI
+
+ movl PARAM_DST, %edi
+ decl %eax
+ jnz L(more_than_one_limb)
+
+ movl (%edx), %edx
+
+ shldl( %cl, %edx, %eax) C eax was decremented to zero
+
+ shll %cl, %edx
+
+ movl %edx, (%edi)
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(more_than_one_limb):
+ C eax size-1
+ C ebx
+ C ecx shift
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+
+ movd PARAM_SHIFT, %mm6
+ movd (%edx,%eax,4), %mm5 C src high limb
+ cmp $UNROLL_THRESHOLD-1, %eax
+
+ jae L(unroll)
+ negl %ecx
+ movd (%edx), %mm4 C src low limb
+
+ addl $32, %ecx
+
+ movd %ecx, %mm7
+
+L(simple_top):
+ C eax loop counter, limbs
+ C ebx
+ C ecx
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+ C
+ C mm0 scratch
+ C mm4 src low limb
+ C mm5 src high limb
+ C mm6 shift
+ C mm7 32-shift
+
+ movq -4(%edx,%eax,4), %mm0
+ decl %eax
+
+ psrlq %mm7, %mm0
+
+ movd %mm0, 4(%edi,%eax,4)
+ jnz L(simple_top)
+
+
+ psllq %mm6, %mm5
+ psllq %mm6, %mm4
+
+ psrlq $32, %mm5
+ movd %mm4, (%edi) C dst low limb
+
+ movd %mm5, %eax C return value
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax size-1
+ C ebx (saved)
+ C ecx shift
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+ C
+ C mm5 src high limb, for return value
+ C mm6 lshift
+
+ movl %esi, SAVE_ESI
+ movl %ebx, SAVE_EBX
+ leal -4(%edx,%eax,4), %edx C &src[size-2]
+
+ testb $4, %dl
+ movq (%edx), %mm1 C src high qword
+
+ jz L(start_src_aligned)
+
+
+ C src isn't aligned, process high limb (marked xxx) separately to
+ C make it so
+ C
+ C source -4(edx,%eax,4)
+ C |
+ C +-------+-------+-------+--
+ C | xxx |
+ C +-------+-------+-------+--
+ C 0mod8 4mod8 0mod8
+ C
+ C dest -4(edi,%eax,4)
+ C |
+ C +-------+-------+--
+ C | xxx | |
+ C +-------+-------+--
+
+ psllq %mm6, %mm1
+ subl $4, %edx
+ movl %eax, PARAM_SIZE C size-1
+
+ psrlq $32, %mm1
+ decl %eax C size-2 is new size-1
+
+ movd %mm1, 4(%edi,%eax,4)
+ movq (%edx), %mm1 C new src high qword
+L(start_src_aligned):
+
+
+ leal -4(%edi,%eax,4), %edi C &dst[size-2]
+ psllq %mm6, %mm5
+
+ testl $4, %edi
+ psrlq $32, %mm5 C return value
+
+ jz L(start_dst_aligned)
+
+
+ C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
+ C shift is 32 bits extra. High limb of dst (marked xxx) handled
+ C here separately.
+ C
+ C source %edx
+ C +-------+-------+--
+ C | mm1 |
+ C +-------+-------+--
+ C 0mod8 4mod8
+ C
+ C dest %edi
+ C +-------+-------+-------+--
+ C | xxx |
+ C +-------+-------+-------+--
+ C 0mod8 4mod8 0mod8
+
+ movq %mm1, %mm0
+ psllq %mm6, %mm1
+ addl $32, %ecx C shift+32
+
+ psrlq $32, %mm1
+
+ movd %mm1, 4(%edi)
+ movq %mm0, %mm1
+ subl $4, %edi
+
+ movd %ecx, %mm6 C new lshift
+L(start_dst_aligned):
+
+ decl %eax C size-2, two last limbs handled at end
+ movq %mm1, %mm2 C copy of src high qword
+ negl %ecx
+
+ andl $-2, %eax C round size down to even
+ addl $64, %ecx
+
+ movl %eax, %ebx
+ negl %eax
+
+ andl $UNROLL_MASK, %eax
+ decl %ebx
+
+ shll %eax
+
+ movd %ecx, %mm7 C rshift = 64-lshift
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(entry) (%eax,%eax,4), %esi
+')
+ shrl $UNROLL_LOG2, %ebx C loop counter
+
+ leal ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
+ leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
+ movl PARAM_SIZE, %eax C for use at end
+ jmp *%esi
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See mpn/x86/README about old gas bugs
+ leal (%eax,%eax,4), %esi
+ addl $L(entry)-L(here), %esi
+ addl (%esp), %esi
+
+ ret_internal
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(32)
+L(top):
+ C eax size (for use at end)
+ C ebx loop counter
+ C ecx rshift
+ C edx src
+ C esi computed jump
+ C edi dst
+ C ebp
+ C
+ C mm0 scratch
+ C mm1 \ carry (alternating, mm2 first)
+ C mm2 /
+ C mm6 lshift
+ C mm7 rshift
+ C
+ C 10 code bytes/limb
+ C
+ C The two chunks differ in whether mm1 or mm2 hold the carry.
+ C The computed jump puts the initial carry in both mm1 and mm2.
+
+L(entry):
+deflit(CHUNK_COUNT, 4)
+forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 - 8))
+
+Zdisp( movq, disp0,(%edx), %mm0)
+ psllq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ por %mm2, %mm0
+Zdisp( movq, %mm0, disp0,(%edi))
+
+
+Zdisp( movq, disp1,(%edx), %mm0)
+ psllq %mm6, %mm1
+
+ movq %mm0, %mm2
+ psrlq %mm7, %mm0
+
+ por %mm1, %mm0
+Zdisp( movq, %mm0, disp1,(%edi))
+')
+
+ subl $UNROLL_BYTES, %edx
+ subl $UNROLL_BYTES, %edi
+ decl %ebx
+
+ jns L(top)
+
+
+
+define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')
+
+L(end):
+ testb $1, %al
+ movl SAVE_EBX, %ebx
+ psllq %mm6, %mm2 C wanted left shifted in all cases below
+
+ movd %mm5, %eax
+
+ movl SAVE_ESI, %esi
+ jz L(end_even)
+
+
+L(end_odd):
+
+ C Size odd, destination was aligned.
+ C
+ C source edx+8 edx+4
+ C --+---------------+-------+
+ C | mm2 | |
+ C --+---------------+-------+
+ C
+ C dest edi
+ C --+---------------+---------------+-------+
+ C | written | | |
+ C --+---------------+---------------+-------+
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C Size odd, destination was unaligned.
+ C
+ C source edx+8 edx+4
+ C --+---------------+-------+
+ C | mm2 | |
+ C --+---------------+-------+
+ C
+ C dest edi
+ C --+---------------+---------------+
+ C | written | |
+ C --+---------------+---------------+
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C In both cases there's one extra limb of src to fetch and combine
+ C with mm2 to make a qword at (%edi), and in the aligned case
+ C there's an extra limb of dst to be formed from that extra src limb
+ C left shifted.
+
+ movd disp(4) (%edx), %mm0
+ testb $32, %cl
+
+ movq %mm0, %mm1
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+ psllq %mm6, %mm1
+
+ por %mm2, %mm0
+
+ movq %mm0, disp(0) (%edi)
+ jz L(end_odd_unaligned)
+ movd %mm1, disp(-4) (%edi)
+L(end_odd_unaligned):
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+
+L(end_even):
+
+ C Size even, destination was aligned.
+ C
+ C source edx+8
+ C --+---------------+
+ C | mm2 |
+ C --+---------------+
+ C
+ C dest edi
+ C --+---------------+---------------+
+ C | written | |
+ C --+---------------+---------------+
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C Size even, destination was unaligned.
+ C
+ C source edx+8
+ C --+---------------+
+ C | mm2 |
+ C --+---------------+
+ C
+ C dest edi+4
+ C --+---------------+-------+
+ C | written | |
+ C --+---------------+-------+
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C The movq for the aligned case overwrites the movd for the
+ C unaligned case.
+
+ movq %mm2, %mm0
+ psrlq $32, %mm2
+
+ testb $32, %cl
+ movd %mm2, disp(4) (%edi)
+
+ jz L(end_even_unaligned)
+ movq %mm0, disp(0) (%edi)
+L(end_even_unaligned):
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/gmp/mpn/x86/k7/mmx/popham.asm b/gmp/mpn/x86/k7/mmx/popham.asm
new file mode 100644
index 0000000000..95965b74d4
--- /dev/null
+++ b/gmp/mpn/x86/k7/mmx/popham.asm
@@ -0,0 +1,213 @@
+dnl AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
+dnl distance.
+
+dnl Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C popcount hamdist
+C P3 generic 6.5 7
+C P3 model 9 (Banias) 5.7 6.1
+C P3 model 13 (Dothan) 5.75 6
+C K7 5 6
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
+C
+C The code here is almost certainly not optimal, but is already a 3x speedup
+C over the generic C code. The main improvement would be to interleave
+C processing of two qwords in the loop so as to fully exploit the available
+C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs).
+C
+C The loop is based on the example "Efficient 64-bit population count using
+C MMX instructions" in the Athlon Optimization Guide, AMD document 22007,
+C page 158 of rev E (reference in mpn/x86/k7/README).
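+C
+C In C terms, each 64-bit chunk x is reduced with the same mask-and-add
+C steps as the loop below (a sketch only; for hamdist, x is first xored
+C with the corresponding chunk of src2):
+C
+C	x -= (x & 0xAAAAAAAAAAAAAAAA) >> 1;                  /* bit pairs */
+C	x  = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333);
+C	x  = (x & 0x0F0F0F0F0F0F0F0F) + ((x >> 4) & 0x0F0F0F0F0F0F0F0F);
+C	total += sum of the 8 bytes of x;    /* psadbw does this directly */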
+
+ifdef(`OPERATION_popcount',,
+`ifdef(`OPERATION_hamdist',,
+`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
+')')')
+
+define(HAM,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_hamdist',`$1')')
+
+define(POP,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_popcount',`$1')')
+
+HAM(`
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC2, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_hamdist)
+')
+POP(`
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_popcount)
+')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+
+
+ifdef(`PIC',,`
+ dnl non-PIC
+
+ RODATA
+ ALIGN(8)
+
+L(rodata_AAAAAAAAAAAAAAAA):
+ .long 0xAAAAAAAA
+ .long 0xAAAAAAAA
+
+L(rodata_3333333333333333):
+ .long 0x33333333
+ .long 0x33333333
+
+L(rodata_0F0F0F0F0F0F0F0F):
+ .long 0x0F0F0F0F
+ .long 0x0F0F0F0F
+')
+
+ TEXT
+ ALIGN(32)
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+
+ifdef(`PIC',`
+ movl $0xAAAAAAAA, %eax
+ movl $0x33333333, %edx
+
+ movd %eax, %mm7
+ movd %edx, %mm6
+
+ movl $0x0F0F0F0F, %eax
+
+ punpckldq %mm7, %mm7
+ punpckldq %mm6, %mm6
+
+ movd %eax, %mm5
+ movd %edx, %mm4
+
+ punpckldq %mm5, %mm5
+
+',`
+ movq L(rodata_AAAAAAAAAAAAAAAA), %mm7
+ movq L(rodata_3333333333333333), %mm6
+ movq L(rodata_0F0F0F0F0F0F0F0F), %mm5
+')
+ pxor %mm4, %mm4
+
+define(REG_AAAAAAAAAAAAAAAA,%mm7)
+define(REG_3333333333333333,%mm6)
+define(REG_0F0F0F0F0F0F0F0F,%mm5)
+define(REG_0000000000000000,%mm4)
+
+
+ movl PARAM_SRC, %eax
+HAM(` movl PARAM_SRC2, %edx')
+
+ pxor %mm2, %mm2 C total
+
+ shrl %ecx
+ jnc L(top)
+
+ movd (%eax,%ecx,8), %mm1
+
+HAM(` movd (%edx,%ecx,8), %mm0
+ pxor %mm0, %mm1
+')
+ orl %ecx, %ecx
+ jmp L(loaded)
+
+
+ ALIGN(16)
+L(top):
+ C eax src
+ C ebx
+ C ecx counter, qwords, decrementing
+ C edx [hamdist] src2
+ C
+ C mm0 (scratch)
+ C mm1 (scratch)
+ C mm2 total (low dword)
+ C mm3
+ C mm4 \
+ C mm5 | special constants
+ C mm6 |
+ C mm7 /
+
+ movq -8(%eax,%ecx,8), %mm1
+
+HAM(` pxor -8(%edx,%ecx,8), %mm1')
+ decl %ecx
+
+L(loaded):
+ movq %mm1, %mm0
+ pand REG_AAAAAAAAAAAAAAAA, %mm1
+
+ psrlq $1, %mm1
+
+ psubd %mm1, %mm0 C bit pairs
+
+
+ movq %mm0, %mm1
+ psrlq $2, %mm0
+
+ pand REG_3333333333333333, %mm0
+ pand REG_3333333333333333, %mm1
+
+ paddd %mm1, %mm0 C nibbles
+
+
+ movq %mm0, %mm1
+ psrlq $4, %mm0
+
+ pand REG_0F0F0F0F0F0F0F0F, %mm0
+ pand REG_0F0F0F0F0F0F0F0F, %mm1
+
+ paddd %mm1, %mm0 C bytes
+
+
+ psadbw( %mm4, %mm0)
+
+ paddd %mm0, %mm2 C add to total
+ jnz L(top)
+
+
+ movd %mm2, %eax
+ emms
+ ret
+
+EPILOGUE()
diff --git a/gmp/mpn/x86/k7/mmx/rshift.asm b/gmp/mpn/x86/k7/mmx/rshift.asm
new file mode 100644
index 0000000000..345d23a25e
--- /dev/null
+++ b/gmp/mpn/x86/k7/mmx/rshift.asm
@@ -0,0 +1,480 @@
+dnl AMD K7 mpn_rshift -- mpn right shift.
+
+dnl Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 1.21 cycles/limb (at 16 limbs/loop).
+
+
+
+dnl K7: UNROLL_COUNT cycles/limb
+dnl 4 1.51
+dnl 8 1.26
+dnl 16 1.21
+dnl 32 1.2
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C Shift src,size right by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the left. The bits shifted out at the right are
+C the return value.
+C
+C This code uses 64-bit MMX operations, which makes it possible to handle
+C two limbs at a time, for a theoretical 1.0 cycles/limb. Plain integer
+C code, on the other hand, suffers from shrd being a vector path decode and
+C running at 3 cycles back-to-back.
+C
+C Full speed depends on source and destination being aligned, and some hairy
+C setups and finish-ups are done to arrange this for the loop.
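+C
+C For reference, the operation is the following (an illustrative sketch
+C only, not the GMP reference code; ref_rshift is just a name for this
+C comment, 32-bit limbs, and 1 <= shift <= 31 as the mpn interface
+C requires):
+C
+C	mp_limb_t ref_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C	                      unsigned shift)
+C	{
+C	  mp_limb_t retval = src[0] << (32 - shift);
+C	  mp_size_t i;
+C	  for (i = 0; i < size-1; i++)
+C	    dst[i] = (src[i] >> shift) | (src[i+1] << (32 - shift));
+C	  dst[size-1] = src[size-1] >> shift;
+C	  return retval;
+C	}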
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 10)
+',`
+deflit(UNROLL_THRESHOLD, 10)
+')
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EDI, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EBX, -12)
+deflit(SAVE_SIZE, 12)
+
+ TEXT
+ ALIGN(32)
+
+PROLOGUE(mpn_rshift)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %eax
+ movl PARAM_SRC, %edx
+ subl $SAVE_SIZE, %esp
+deflit(`FRAME',SAVE_SIZE)
+
+ movl PARAM_SHIFT, %ecx
+ movl %edi, SAVE_EDI
+
+ movl PARAM_DST, %edi
+ decl %eax
+ jnz L(more_than_one_limb)
+
+ movl (%edx), %edx C src limb
+
+ shrdl( %cl, %edx, %eax) C eax was decremented to zero
+
+ shrl %cl, %edx
+
+ movl %edx, (%edi) C dst limb
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(more_than_one_limb):
+ C eax size-1
+ C ebx
+ C ecx shift
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+
+ movd PARAM_SHIFT, %mm6 C rshift
+ movd (%edx), %mm5 C src low limb
+ cmp $UNROLL_THRESHOLD-1, %eax
+
+ jae L(unroll)
+ leal (%edx,%eax,4), %edx C &src[size-1]
+ leal -4(%edi,%eax,4), %edi C &dst[size-2]
+
+ movd (%edx), %mm4 C src high limb
+ negl %eax
+
+
+L(simple_top):
+ C eax loop counter, limbs, negative
+ C ebx
+ C ecx shift
+ C edx carry
+ C edx &src[size-1]
+ C edi &dst[size-2]
+ C ebp
+ C
+ C mm0 scratch
+ C mm4 src high limb
+ C mm5 src low limb
+ C mm6 shift
+
+ movq (%edx,%eax,4), %mm0
+ incl %eax
+
+ psrlq %mm6, %mm0
+
+ movd %mm0, (%edi,%eax,4)
+ jnz L(simple_top)
+
+
+ psllq $32, %mm5
+ psrlq %mm6, %mm4
+
+ psrlq %mm6, %mm5
+ movd %mm4, 4(%edi) C dst high limb
+
+ movd %mm5, %eax C return value
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax size-1
+ C ebx
+ C ecx shift
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+ C
+ C mm5 src low limb
+ C mm6 rshift
+
+ testb $4, %dl
+ movl %esi, SAVE_ESI
+ movl %ebx, SAVE_EBX
+
+ psllq $32, %mm5
+ jz L(start_src_aligned)
+
+
+ C src isn't aligned, process low limb separately (marked xxx) and
+ C step src and dst by one limb, making src aligned.
+ C
+ C source edx
+ C --+-------+-------+-------+
+ C | xxx |
+ C --+-------+-------+-------+
+ C 4mod8 0mod8 4mod8
+ C
+ C dest edi
+ C --+-------+-------+
+ C | | xxx |
+ C --+-------+-------+
+
+ movq (%edx), %mm0 C src low two limbs
+ addl $4, %edx
+ movl %eax, PARAM_SIZE C size-1
+
+ addl $4, %edi
+ decl %eax C size-2 is new size-1
+
+ psrlq %mm6, %mm0
+ movl %edi, PARAM_DST C new dst
+
+ movd %mm0, -4(%edi)
+L(start_src_aligned):
+
+
+ movq (%edx), %mm1 C src low two limbs
+ decl %eax C size-2, two last limbs handled at end
+ testl $4, %edi
+
+ psrlq %mm6, %mm5
+ jz L(start_dst_aligned)
+
+
+ C dst isn't aligned, add 4 to make it so, and pretend the shift is
+ C 32 bits extra. Low limb of dst (marked xxx) handled here separately.
+ C
+ C source edx
+ C --+-------+-------+
+ C | mm1 |
+ C --+-------+-------+
+ C 4mod8 0mod8
+ C
+ C dest edi
+ C --+-------+-------+-------+
+ C | xxx |
+ C --+-------+-------+-------+
+ C 4mod8 0mod8 4mod8
+
+ movq %mm1, %mm0
+ psrlq %mm6, %mm1
+ addl $32, %ecx C shift+32
+
+ movd %mm1, (%edi)
+ movq %mm0, %mm1
+ addl $4, %edi C new dst
+
+ movd %ecx, %mm6
+L(start_dst_aligned):
+
+
+ movq %mm1, %mm2 C copy of src low two limbs
+ negl %ecx
+ andl $-2, %eax C round size down to even
+
+ movl %eax, %ebx
+ negl %eax
+ addl $64, %ecx
+
+ andl $UNROLL_MASK, %eax
+ decl %ebx
+
+ shll %eax
+
+ movd %ecx, %mm7 C lshift = 64-rshift
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(entry) (%eax,%eax,4), %esi
+ negl %eax
+')
+ shrl $UNROLL_LOG2, %ebx C loop counter
+
+ leal ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx
+ leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
+ movl PARAM_SIZE, %eax C for use at end
+
+ jmp *%esi
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See mpn/x86/README about old gas bugs
+ leal (%eax,%eax,4), %esi
+ addl $L(entry)-L(here), %esi
+ addl (%esp), %esi
+ negl %eax
+
+ ret_internal
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(64)
+L(top):
+ C eax size, for use at end
+ C ebx loop counter
+ C ecx lshift
+ C edx src
+ C esi was computed jump
+ C edi dst
+ C ebp
+ C
+ C mm0 scratch
+ C mm1 \ carry (alternating)
+ C mm2 /
+ C mm6 rshift
+ C mm7 lshift
+ C
+ C 10 code bytes/limb
+ C
+ C The two chunks differ in whether mm1 or mm2 hold the carry.
+ C The computed jump puts the initial carry in both mm1 and mm2.
+
+L(entry):
+deflit(CHUNK_COUNT, 4)
+forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 8))
+
+Zdisp( movq, disp0,(%edx), %mm0)
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ por %mm2, %mm0
+Zdisp( movq, %mm0, disp0,(%edi))
+
+
+Zdisp( movq, disp1,(%edx), %mm0)
+ psrlq %mm6, %mm1
+
+ movq %mm0, %mm2
+ psllq %mm7, %mm0
+
+ por %mm1, %mm0
+Zdisp( movq, %mm0, disp1,(%edi))
+')
+
+ addl $UNROLL_BYTES, %edx
+ addl $UNROLL_BYTES, %edi
+ decl %ebx
+
+ jns L(top)
+
+
+deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
+deflit(`disp1', eval(disp0-0 + 8))
+
+ testb $1, %al
+ psrlq %mm6, %mm2 C wanted rshifted in all cases below
+ movl SAVE_ESI, %esi
+
+ movd %mm5, %eax C return value
+
+ movl SAVE_EBX, %ebx
+ jz L(end_even)
+
+
+ C Size odd, destination was aligned.
+ C
+ C source
+ C edx
+ C +-------+---------------+--
+ C | | mm2 |
+ C +-------+---------------+--
+ C
+ C dest edi
+ C +-------+---------------+---------------+--
+ C | | | written |
+ C +-------+---------------+---------------+--
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C Size odd, destination was unaligned.
+ C
+ C source
+ C edx
+ C +-------+---------------+--
+ C | | mm2 |
+ C +-------+---------------+--
+ C
+ C dest edi
+ C +---------------+---------------+--
+ C | | written |
+ C +---------------+---------------+--
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C In both cases there's one extra limb of src to fetch and combine
+ C with mm2 to make a qword to store, and in the aligned case there's
+ C a further extra limb of dst to be formed.
+
+
+ movd disp0(%edx), %mm0
+ movq %mm0, %mm1
+
+ psllq %mm7, %mm0
+ testb $32, %cl
+
+ por %mm2, %mm0
+ psrlq %mm6, %mm1
+
+ movq %mm0, disp0(%edi)
+ jz L(finish_odd_unaligned)
+
+ movd %mm1, disp1(%edi)
+L(finish_odd_unaligned):
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+
+L(end_even):
+
+ C Size even, destination was aligned.
+ C
+ C source
+ C +---------------+--
+ C | mm2 |
+ C +---------------+--
+ C
+ C dest edi
+ C +---------------+---------------+--
+ C | | mm3 |
+ C +---------------+---------------+--
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C Size even, destination was unaligned.
+ C
+ C source
+ C +---------------+--
+ C | mm2 |
+ C +---------------+--
+ C
+ C dest edi
+ C +-------+---------------+--
+ C | | mm3 |
+ C +-------+---------------+--
+ C
+ C mm6 = shift+32
+ C mm7 = 64-(shift+32)
+
+
+ C The movd for the unaligned case is the same data as the movq for
+ C the aligned case, it's just a choice between whether one or two
+ C limbs should be written.
+
+
+ testb $32, %cl
+ movd %mm2, disp0(%edi)
+
+ jz L(end_even_unaligned)
+
+ movq %mm2, disp0(%edi)
+L(end_even_unaligned):
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/gmp/mpn/x86/k7/mod_1_1.asm b/gmp/mpn/x86/k7/mod_1_1.asm
new file mode 100644
index 0000000000..1bbe6f92d7
--- /dev/null
+++ b/gmp/mpn/x86/k7/mod_1_1.asm
@@ -0,0 +1,221 @@
+dnl x86-32 mpn_mod_1_1p, requiring cmov.
+
+dnl Contributed to the GNU project by Niels Möller and Torbjorn Granlund.
+
+dnl Copyright 2010, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C P5 ?
+C P6 model 0-8,10-12 ?
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) ?
+C P4 model 0 (Willamette) ?
+C P4 model 1 (?) ?
+C P4 model 2 (Northwood) ?
+C P4 model 3 (Prescott) ?
+C P4 model 4 (Nocona) ?
+C AMD K6 ?
+C AMD K7 7
+C AMD K8 ?
+
+define(`B2mb', `%ebx')
+define(`r0', `%esi')
+define(`r2', `%ebp')
+define(`t0', `%edi')
+define(`ap', `%ecx') C Also shift count
+
+C Stack frame
+C pre 36(%esp)
+C b 32(%esp)
+C n 28(%esp)
+C ap 24(%esp)
+C return 20(%esp)
+C %ebp 16(%esp)
+C %edi 12(%esp)
+C %esi 8(%esp)
+C %ebx 4(%esp)
+C B2mod (%esp)
+
+define(`B2modb', `(%esp)')
+define(`n', `28(%esp)')
+define(`b', `32(%esp)')
+define(`pre', `36(%esp)')
+
+C mp_limb_t
+C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t pre[4])
+C
+C The pre array contains bi, cnt, B1modb, B2modb
+C Note: This implementation needs B1modb only when cnt > 0
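+C
+C For reference, the return value is simply the remainder of {ap,n} mod b,
+C i.e. the following (an illustrative sketch only; ref_mod_1 is just a
+C name for this comment, 32-bit limbs, and the real code of course avoids
+C the divisions by using the precomputed bi, B1modb and B2modb):
+C
+C	uint32_t ref_mod_1 (const uint32_t *ap, long n, uint32_t b)
+C	{
+C	  uint64_t r = 0;
+C	  long i;
+C	  for (i = n - 1; i >= 0; i--)
+C	    r = ((r << 32) | ap[i]) % b;
+C	  return (uint32_t) r;
+C	}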
+
+ASM_START()
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_mod_1_1p)
+ push %ebp
+ push %edi
+ push %esi
+ push %ebx
+ mov 32(%esp), %ebp C pre[]
+
+ mov 12(%ebp), %eax C B2modb
+ push %eax C Put it on stack
+
+ mov n, %edx
+ mov 24(%esp), ap
+
+ lea (ap, %edx, 4), ap
+ mov -4(ap), %eax
+ cmp $3, %edx
+ jnc L(first)
+ mov -8(ap), r0
+ jmp L(reduce_two)
+
+L(first):
+ C First iteration, no r2
+ mull B2modb
+ mov -12(ap), r0
+ add %eax, r0
+ mov -8(ap), %eax
+ adc %edx, %eax
+ sbb r2, r2
+ subl $3, n
+ lea -16(ap), ap
+ jz L(reduce_three)
+
+ mov B2modb, B2mb
+ sub b, B2mb
+ lea (B2mb, r0), t0
+ jmp L(mid)
+
+ ALIGN(16)
+L(top): C Loopmixed to 7 c/l on k7
+ add %eax, r0
+ lea (B2mb, r0), t0
+ mov r2, %eax
+ adc %edx, %eax
+ sbb r2, r2
+L(mid): mull B2modb
+ and B2modb, r2
+ add r0, r2
+ decl n
+ mov (ap), r0
+ cmovc( t0, r2)
+ lea -4(ap), ap
+ jnz L(top)
+
+ add %eax, r0
+ mov r2, %eax
+ adc %edx, %eax
+ sbb r2, r2
+
+L(reduce_three):
+ C Eliminate r2
+ and b, r2
+ sub r2, %eax
+
+L(reduce_two):
+ mov pre, %ebp
+ movb 4(%ebp), %cl
+ test %cl, %cl
+ jz L(normalized)
+
+ C Unnormalized, use B1modb to reduce to size < B b
+ mull 8(%ebp)
+ xor t0, t0
+ add %eax, r0
+ adc %edx, t0
+ mov t0, %eax
+
+ C Left-shift to normalize
+ shld %cl, r0, %eax C Always use shld?
+
+ shl %cl, r0
+ jmp L(udiv)
+
+L(normalized):
+ mov %eax, t0
+ sub b, t0
+ cmovnc( t0, %eax)
+
+L(udiv):
+ lea 1(%eax), t0
+ mull (%ebp)
+ mov b, %ebx C Needed in register for lea
+ add r0, %eax
+ adc t0, %edx
+ imul %ebx, %edx
+ sub %edx, r0
+ cmp r0, %eax
+ lea (%ebx, r0), %eax
+ cmovnc( r0, %eax)
+ cmp %ebx, %eax
+ jnc L(fix)
+L(ok): shr %cl, %eax
+
+ add $4, %esp
+ pop %ebx
+ pop %esi
+ pop %edi
+ pop %ebp
+
+ ret
+L(fix): sub %ebx, %eax
+ jmp L(ok)
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1_1p_cps)
+ push %ebp
+ mov 12(%esp), %ebp
+ push %esi
+ bsr %ebp, %ecx
+ push %ebx
+ xor $31, %ecx
+ mov 16(%esp), %esi
+ sal %cl, %ebp
+ mov %ebp, %edx
+ not %edx
+ mov $-1, %eax
+ div %ebp C On K7, invert_limb would be a few cycles faster.
+ mov %eax, (%esi) C store bi
+ mov %ecx, 4(%esi) C store cnt
+ neg %ebp
+ mov $1, %edx
+ shld %cl, %eax, %edx
+ imul %ebp, %edx
+ shr %cl, %edx
+ imul %ebp, %eax
+ mov %edx, 8(%esi) C store B1modb
+ mov %eax, 12(%esi) C store B2modb
+ pop %ebx
+ pop %esi
+ pop %ebp
+ ret
+EPILOGUE()
diff --git a/gmp/mpn/x86/k7/mod_1_4.asm b/gmp/mpn/x86/k7/mod_1_4.asm
new file mode 100644
index 0000000000..bb7597edd2
--- /dev/null
+++ b/gmp/mpn/x86/k7/mod_1_4.asm
@@ -0,0 +1,260 @@
+dnl x86-32 mpn_mod_1s_4p, requiring cmov.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2009, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C P5 ?
+C P6 model 0-8,10-12 ?
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) 6
+C P4 model 0 (Willamette) ?
+C P4 model 1 (?) ?
+C P4 model 2 (Northwood) 15.5
+C P4 model 3 (Prescott) ?
+C P4 model 4 (Nocona) ?
+C AMD K6 ?
+C AMD K7 4.75
+C AMD K8 ?
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p)
+ push %ebp
+ push %edi
+ push %esi
+ push %ebx
+ sub $28, %esp
+ mov 60(%esp), %edi C cps[]
+ mov 8(%edi), %eax
+ mov 12(%edi), %edx
+ mov 16(%edi), %ecx
+ mov 20(%edi), %esi
+ mov 24(%edi), %edi
+ mov %eax, 4(%esp)
+ mov %edx, 8(%esp)
+ mov %ecx, 12(%esp)
+ mov %esi, 16(%esp)
+ mov %edi, 20(%esp)
+ mov 52(%esp), %eax C n
+ xor %edi, %edi
+ mov 48(%esp), %esi C up
+ lea -12(%esi,%eax,4), %esi
+ and $3, %eax
+ je L(b0)
+ cmp $2, %eax
+ jc L(b1)
+ je L(b2)
+
+L(b3): mov 4(%esi), %eax
+ mull 4(%esp)
+ mov (%esi), %ebp
+ add %eax, %ebp
+ adc %edx, %edi
+ mov 8(%esi), %eax
+ mull 8(%esp)
+ lea -12(%esi), %esi
+ jmp L(m0)
+
+L(b0): mov (%esi), %eax
+ mull 4(%esp)
+ mov -4(%esi), %ebp
+ add %eax, %ebp
+ adc %edx, %edi
+ mov 4(%esi), %eax
+ mull 8(%esp)
+ add %eax, %ebp
+ adc %edx, %edi
+ mov 8(%esi), %eax
+ mull 12(%esp)
+ lea -16(%esi), %esi
+ jmp L(m0)
+
+L(b1): mov 8(%esi), %ebp
+ lea -4(%esi), %esi
+ jmp L(m1)
+
+L(b2): mov 8(%esi), %edi
+ mov 4(%esi), %ebp
+ lea -8(%esi), %esi
+ jmp L(m1)
+
+ ALIGN(16)
+L(top): mov (%esi), %eax
+ mull 4(%esp)
+ mov -4(%esi), %ebx
+ xor %ecx, %ecx
+ add %eax, %ebx
+ adc %edx, %ecx
+ mov 4(%esi), %eax
+ mull 8(%esp)
+ add %eax, %ebx
+ adc %edx, %ecx
+ mov 8(%esi), %eax
+ mull 12(%esp)
+ add %eax, %ebx
+ adc %edx, %ecx
+ lea -16(%esi), %esi
+ mov 16(%esp), %eax
+ mul %ebp
+ add %eax, %ebx
+ adc %edx, %ecx
+ mov 20(%esp), %eax
+ mul %edi
+ mov %ebx, %ebp
+ mov %ecx, %edi
+L(m0): add %eax, %ebp
+ adc %edx, %edi
+L(m1): subl $4, 52(%esp)
+ ja L(top)
+
+L(end): mov 4(%esp), %eax
+ mul %edi
+ mov 60(%esp), %edi
+ add %eax, %ebp
+ adc $0, %edx
+ mov 4(%edi), %ecx
+ mov %edx, %esi
+ mov %ebp, %eax
+ sal %cl, %esi
+ mov %ecx, %ebx
+ neg %ecx
+ shr %cl, %eax
+ or %esi, %eax
+ lea 1(%eax), %esi
+ mull (%edi)
+ mov %ebx, %ecx
+ mov %eax, %ebx
+ mov %ebp, %eax
+ mov 56(%esp), %ebp
+ sal %cl, %eax
+ add %eax, %ebx
+ adc %esi, %edx
+ imul %ebp, %edx
+ sub %edx, %eax
+ lea (%eax,%ebp), %edx
+ cmp %eax, %ebx
+ cmovc( %edx, %eax)
+ mov %eax, %edx
+ sub %ebp, %eax
+ cmovc( %edx, %eax)
+ add $28, %esp
+ pop %ebx
+ pop %esi
+ pop %edi
+ pop %ebp
+ shr %cl, %eax
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p_cps)
+C CAUTION: This is the same code as in pentium4/sse2/mod_1_4.asm
+ push %ebp
+ push %edi
+ push %esi
+ push %ebx
+ mov 20(%esp), %ebp C FIXME: avoid bp for 0-idx
+ mov 24(%esp), %ebx
+ bsr %ebx, %ecx
+ xor $31, %ecx
+ sal %cl, %ebx C b << cnt
+ mov %ebx, %edx
+ not %edx
+ mov $-1, %eax
+ div %ebx
+ xor %edi, %edi
+ sub %ebx, %edi
+ mov $1, %esi
+ mov %eax, (%ebp) C store bi
+ mov %ecx, 4(%ebp) C store cnt
+ shld %cl, %eax, %esi
+ imul %edi, %esi
+ mov %eax, %edi
+ mul %esi
+
+ add %esi, %edx
+ shr %cl, %esi
+ mov %esi, 8(%ebp) C store B1modb
+
+ not %edx
+ imul %ebx, %edx
+ lea (%edx,%ebx), %esi
+ cmp %edx, %eax
+ cmovnc( %edx, %esi)
+ mov %edi, %eax
+ mul %esi
+
+ add %esi, %edx
+ shr %cl, %esi
+ mov %esi, 12(%ebp) C store B2modb
+
+ not %edx
+ imul %ebx, %edx
+ lea (%edx,%ebx), %esi
+ cmp %edx, %eax
+ cmovnc( %edx, %esi)
+ mov %edi, %eax
+ mul %esi
+
+ add %esi, %edx
+ shr %cl, %esi
+ mov %esi, 16(%ebp) C store B3modb
+
+ not %edx
+ imul %ebx, %edx
+ lea (%edx,%ebx), %esi
+ cmp %edx, %eax
+ cmovnc( %edx, %esi)
+ mov %edi, %eax
+ mul %esi
+
+ add %esi, %edx
+ shr %cl, %esi
+ mov %esi, 20(%ebp) C store B4modb
+
+ not %edx
+ imul %ebx, %edx
+ add %edx, %ebx
+ cmp %edx, %eax
+ cmovnc( %edx, %ebx)
+
+ shr %cl, %ebx
+ mov %ebx, 24(%ebp) C store B5modb
+
+ pop %ebx
+ pop %esi
+ pop %edi
+ pop %ebp
+ ret
+EPILOGUE()
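The 4.75 c/l of the K7 loop comes from having several independent mull chains
per pass: each iteration folds four source limbs plus the previous two-limb
sum by multiplying with cached residues of powers of the limb base, and only
the final two-limb value is reduced with the reciprocal. Here is a C sketch of
that reduction, under stated assumptions: 32-bit limbs, n a multiple of 4 (the
asm has feed-in paths for the other cases), and a divisor small enough -- say
5*b <= 2^32 -- that the running sum fits in 64 bits; divisors near the limb
size are served by mod_1_1 instead. The sketch recomputes the residue table
that the cps routine above caches, and its name is made up.

#include <stdint.h>
#include <stddef.h>

static uint32_t mod_1s_4p_sketch (const uint32_t *up, size_t n, uint32_t b)
{
  uint32_t Bmod[6];                     /* Bmod[k] = B^k mod b, B = 2^32  */
  Bmod[0] = 1 % b;
  for (int k = 1; k <= 5; k++)
    Bmod[k] = (uint32_t) (((uint64_t) Bmod[k-1] << 32) % b);

  uint64_t r = 0;                       /* the two-limb accumulator       */
  for (size_t i = n; i >= 4; i -= 4)    /* high limbs first               */
    {
      uint32_t rl = (uint32_t) r;
      uint32_t rh = (uint32_t) (r >> 32);
      r = (uint64_t) up[i-4]
        + (uint64_t) up[i-3] * Bmod[1]
        + (uint64_t) up[i-2] * Bmod[2]
        + (uint64_t) up[i-1] * Bmod[3]
        + (uint64_t) rl * Bmod[4]       /* old sum is rh*B + rl, and      */
        + (uint64_t) rh * Bmod[5];      /* rl*B^4 + rh*B^5 replaces r*B^4 */
    }
  return (uint32_t) (r % b);            /* the asm folds with bi and cnt  */
}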
diff --git a/gmp/mpn/x86/k7/mod_34lsub1.asm b/gmp/mpn/x86/k7/mod_34lsub1.asm
new file mode 100644
index 0000000000..ee3ad04099
--- /dev/null
+++ b/gmp/mpn/x86/k7/mod_34lsub1.asm
@@ -0,0 +1,188 @@
+dnl AMD K7 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
+
+dnl Copyright 2000-2002, 2004, 2005, 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C Athlon: 1
+C Hammer: 1
+
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+C The loop form below and the 64 byte code alignment seem necessary for the
+C claimed speed. This is a bit strange, since normally k7 isn't very
+C sensitive to such things. Perhaps there have to be 6 instructions in the
+C first 16 bytes for the BTB entry or something.
+
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+dnl re-use parameter space
+define(SAVE_EDI, `PARAM_SIZE')
+
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %edx
+
+ subl $2, %ecx
+ ja L(three_or_more)
+
+ movl (%edx), %eax
+ jb L(one)
+
+ movl 4(%edx), %ecx
+ movl %eax, %edx
+ shrl $24, %eax C src[0] low
+
+ andl $0xFFFFFF, %edx C src[0] high
+ addl %edx, %eax
+ movl %ecx, %edx
+
+ andl $0xFFFF, %ecx
+ shrl $16, %edx C src[1] high
+ addl %edx, %eax
+
+ shll $8, %ecx C src[1] low
+ addl %ecx, %eax
+
+L(one):
+ ret
+
+
+L(three_or_more):
+ C eax
+ C ebx
+ C ecx size-2
+ C edx src
+ C esi
+ C edi
+
+ pushl %ebx FRAME_pushl()
+ xorl %eax, %eax
+ xorl %ebx, %ebx
+
+ movl %edi, SAVE_EDI
+ pushl %esi FRAME_pushl()
+ xorl %esi, %esi C and clear carry flag
+
+
+ C code offset 0x40 at this point
+L(top):
+ C eax acc 0mod3
+ C ebx acc 1mod3
+ C ecx counter, limbs
+ C edx src
+ C esi acc 2mod3
+ C edi
+
+ leal 24(%edx), %edx
+ leal -2(%ecx), %ecx
+ adcl -24(%edx), %eax
+ adcl -20(%edx), %ebx
+ adcl -16(%edx), %esi
+
+ decl %ecx
+ jng L(done_loop)
+
+ leal -2(%ecx), %ecx
+ adcl -12(%edx), %eax
+ adcl -8(%edx), %ebx
+ adcl -4(%edx), %esi
+
+ decl %ecx
+ jg L(top)
+
+
+ leal 12(%edx), %edx
+
+
+L(done_loop):
+ C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
+
+ incl %ecx
+ movl $0xFFFFFFFF, %edi
+ js L(combine)
+
+ adcl -12(%edx), %eax
+ decl %ecx
+ movl $0xFFFFFF00, %edi
+ js L(combine)
+
+ adcl -8(%edx), %ebx
+ movl $0xFFFF0000, %edi
+
+
+L(combine):
+ C eax acc 0mod3
+ C ebx acc 1mod3
+ C ecx
+ C edx
+ C esi acc 2mod3
+ C edi mask
+
+ sbbl %ecx, %ecx C carry
+ movl %eax, %edx C 0mod3
+ shrl $24, %eax C 0mod3 high
+
+ andl %edi, %ecx C carry masked
+ andl $0x00FFFFFF, %edx C 0mod3 low
+ movl %ebx, %edi C 1mod3
+
+ subl %ecx, %eax C apply carry
+ shrl $16, %ebx C 1mod3 high
+ andl $0xFFFF, %edi
+
+ addl %edx, %eax C apply 0mod3 low
+ movl %esi, %edx C 2mod3
+ shll $8, %edi C 1mod3 low
+
+ addl %ebx, %eax C apply 1mod3 high
+ shrl $8, %esi C 2mod3 high
+ movzbl %dl, %edx C 2mod3 low
+
+ addl %edi, %eax C apply 1mod3 low
+ shll $16, %edx C 2mod3 low
+
+ addl %esi, %eax C apply 2mod3 high
+ popl %esi FRAME_popl()
+
+ movl SAVE_EDI, %edi
+ addl %edx, %eax C apply 2mod3 low
+ popl %ebx FRAME_popl()
+
+ ret
+
+EPILOGUE()
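The three accumulators work because 2^96 == 1 (mod 2^24-1) when limbs are 32
bits: limbs can be summed by position mod 3 and the three sums folded 24 bits
at a time (2^32 == 2^8 and 2^64 == 2^16 mod 2^24-1). A small C sketch of the
same computation follows; it absorbs carries into 64-bit sums instead of
re-adding them with adc/sbb, and, like the asm, it only promises a value
congruent to {src,n} mod 2^24-1, not a fully reduced remainder. The function
name is hypothetical.

#include <stdint.h>
#include <stddef.h>

static uint32_t mod_34lsub1_sketch (const uint32_t *src, size_t n)
{
  uint64_t acc[3] = { 0, 0, 0 };
  for (size_t i = 0; i < n; i++)        /* fine for n well below 2^32    */
    acc[i % 3] += src[i];               /* weight of limb i is 2^(32i)   */

  uint64_t r = 0;
  for (int k = 0; k < 3; k++)
    {
      uint64_t a = acc[k];
      while (a >> 24)                   /* hi*2^24 + lo == hi + lo       */
        a = (a & 0xFFFFFF) + (a >> 24);
      r += a << (8 * k);                /* 2^(32k) == 2^(8k) mod 2^24-1  */
    }
  while (r >> 24)
    r = (r & 0xFFFFFF) + (r >> 24);
  return (uint32_t) r;
}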
diff --git a/gmp/mpn/x86/k7/mode1o.asm b/gmp/mpn/x86/k7/mode1o.asm
new file mode 100644
index 0000000000..6472ec5949
--- /dev/null
+++ b/gmp/mpn/x86/k7/mode1o.asm
@@ -0,0 +1,180 @@
+dnl AMD K7 mpn_modexact_1_odd -- exact division style remainder.
+
+dnl Copyright 2000-2002, 2004, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C Athlon: 11.0
+C Hammer: 7.0
+
+
+C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t carry);
+C
+C With the loop running at just 11 cycles it doesn't seem worth bothering to
+C check for high<divisor to save one step.
+C
+C Using a divl for size==1 measures slower than the modexact method, which
+C is not too surprising since for the latter it's only about 24 cycles to
+C calculate the modular inverse.
+
+defframe(PARAM_CARRY, 16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+
+deflit(STACK_SPACE, 16)
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(mpn_modexact_1c_odd)
+deflit(`FRAME',0)
+
+ movl PARAM_CARRY, %ecx
+ jmp L(start_1c)
+
+EPILOGUE()
+
+
+ ALIGN(16)
+PROLOGUE(mpn_modexact_1_odd)
+deflit(`FRAME',0)
+
+ xorl %ecx, %ecx
+L(start_1c):
+ movl PARAM_DIVISOR, %eax
+ subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_DIVISOR, %esi
+
+ movl %edi, SAVE_EDI
+
+ shrl %eax C d/2
+
+ andl $127, %eax
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %edi)
+ movzbl (%eax,%edi), %edi C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %edi C inv 8 bits
+')
+
+ xorl %edx, %edx C initial extra carry
+ leal (%edi,%edi), %eax C 2*inv
+
+ imull %edi, %edi C inv*inv
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_SIZE, %ebp
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_SRC, %ebx
+
+ imull %esi, %edi C inv*inv*d
+
+ subl %edi, %eax C inv = 2*inv - inv*inv*d
+ leal (%eax,%eax), %edi C 2*inv
+
+ imull %eax, %eax C inv*inv
+
+ imull %esi, %eax C inv*inv*d
+
+ leal (%ebx,%ebp,4), %ebx C src end
+ negl %ebp C -size
+
+ subl %eax, %edi C inv = 2*inv - inv*inv*d
+
+ ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
+ movl %esi, %eax
+ imull %edi, %eax
+ cmpl $1, %eax')
+
+
+C The dependent chain here is
+C
+C cycles
+C subl %edx, %eax 1
+C imull %edi, %eax 4
+C mull %esi 6 (high limb)
+C ----
+C total 11
+C
+C Out of order execution hides the load latency for the source data, so no
+C special scheduling is required.
+
+L(top):
+ C eax src limb
+ C ebx src end ptr
+ C ecx next carry bit, 0 or 1 (or initial carry param)
+ C edx carry limb, high of last product
+ C esi divisor
+ C edi inverse
+ C ebp counter, limbs, negative
+
+ movl (%ebx,%ebp,4), %eax
+
+ subl %ecx, %eax C apply carry bit
+ movl $0, %ecx
+
+ setc %cl C new carry bit
+
+ subl %edx, %eax C apply carry limb
+ adcl $0, %ecx
+
+ imull %edi, %eax
+
+ mull %esi
+
+ incl %ebp
+ jnz L(top)
+
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EDI, %edi
+ leal (%ecx,%edx), %eax
+
+ movl SAVE_EBX, %ebx
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ ret
+
+EPILOGUE()
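The loop is the usual exact-division-style remainder: each limb, after the
pending borrow bit and carry limb are subtracted, is multiplied by the inverse
of the divisor mod 2^32, and the high half of that quotient times the divisor
becomes the next carry limb. Below is a C sketch of both pieces, assuming
32-bit limbs and ad hoc names: the Newton iteration inv = 2*inv - inv*inv*d
that the startup code performs (the asm takes two steps from an 8-bit table
lookup; the sketch uses a self-contained 4-bit seed and three steps), and the
per-limb loop.

#include <stdint.h>
#include <stddef.h>

static uint32_t binvert_sketch (uint32_t d)       /* d must be odd */
{
  uint32_t inv = (3 * d) ^ 2;                     /*  4 low bits correct */
  inv = 2 * inv - inv * inv * d;                  /*  8 bits             */
  inv = 2 * inv - inv * inv * d;                  /* 16 bits             */
  inv = 2 * inv - inv * inv * d;                  /* 32 bits: d*inv == 1 */
  return inv;
}

static uint32_t modexact_1_odd_sketch (const uint32_t *src, size_t n,
                                       uint32_t d)
{
  uint32_t inv  = binvert_sketch (d);
  uint32_t cbit = 0;          /* borrow bit (the 1c entry point seeds this
                                 with the caller's carry)                 */
  uint32_t clim = 0;          /* carry limb: high half of the last q*d    */

  for (size_t i = 0; i < n; i++)
    {
      uint32_t a  = src[i];
      uint32_t t  = a - cbit;
      uint32_t b1 = a < cbit;
      uint32_t s  = t - clim;
      uint32_t b2 = t < clim;
      cbit = b1 + b2;                   /* at most one borrow can occur   */
      uint32_t q = s * inv;             /* q*d has s as its low limb      */
      clim = (uint32_t) (((uint64_t) q * d) >> 32);
    }
  /* Zero mod d exactly when d divides {src,n}; in general congruent to
     the input times B^(-n) mod d.  */
  return cbit + clim;
}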
diff --git a/gmp/mpn/x86/k7/mul_1.asm b/gmp/mpn/x86/k7/mul_1.asm
new file mode 100644
index 0000000000..755cd2ed50
--- /dev/null
+++ b/gmp/mpn/x86/k7/mul_1.asm
@@ -0,0 +1,237 @@
+dnl AMD K7 mpn_mul_1.
+
+dnl Copyright 1999-2002, 2005, 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C AMD K6
+C AMD K7 3.25
+C AMD K8
+
+C TODO
+C * Improve feed-in and wind-down code. We beat the old code for all n != 1,
+C but we might be able to do even better.
+C * The feed-in code for mul_1c is crude.
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_1c)
+ add $-16, %esp
+ mov %ebp, (%esp)
+ mov %ebx, 4(%esp)
+ mov %esi, 8(%esp)
+ mov %edi, 12(%esp)
+
+ mov 20(%esp), %edi
+ mov 24(%esp), %esi
+ mov 28(%esp), %ebp
+ mov 32(%esp), %ecx
+ mov %ebp, %ebx
+ shr $2, %ebp
+ mov %ebp, 28(%esp)
+ mov (%esi), %eax
+ and $3, %ebx
+ jz L(c0)
+ cmp $2, %ebx
+ mov 36(%esp), %ebx
+ jz L(c2)
+ jg L(c3)
+
+L(c1): lea -4(%edi), %edi
+ mul %ecx
+ test %ebp, %ebp
+ jnz 1f
+ add %ebx, %eax
+ mov %eax, 4(%edi)
+ mov %edx, %eax
+ adc %ebp, %eax
+ jmp L(rt)
+1: add %eax, %ebx
+ mov $0, %ebp
+ adc %edx, %ebp
+ mov 4(%esi), %eax
+ jmp L(1)
+
+L(c2): lea 4(%esi), %esi
+ mul %ecx
+ test %ebp, %ebp
+ mov %ebx, %ebp
+ jnz 2f
+ add %eax, %ebp
+ mov $0, %ebx
+ adc %edx, %ebx
+ mov (%esi), %eax
+ jmp L(cj2)
+2: add %eax, %ebp
+ mov $0, %ebx
+ adc %edx, %ebx
+ mov (%esi), %eax
+ jmp L(2)
+
+L(c3): lea 8(%esi), %esi
+ lea -12(%edi), %edi
+ mul %ecx
+ add %eax, %ebx
+ mov $0, %ebp
+ adc %edx, %ebp
+ mov -4(%esi), %eax
+ incl 28(%esp)
+ jmp L(3)
+
+L(c0): mov 36(%esp), %ebx
+ lea -4(%esi), %esi
+ lea -8(%edi), %edi
+ mul %ecx
+ mov %ebx, %ebp
+ add %eax, %ebp
+ mov $0, %ebx
+ adc %edx, %ebx
+ mov 8(%esi), %eax
+ jmp L(0)
+
+EPILOGUE()
+ ALIGN(16)
+PROLOGUE(mpn_mul_1)
+ add $-16, %esp
+ mov %ebp, (%esp)
+ mov %ebx, 4(%esp)
+ mov %esi, 8(%esp)
+ mov %edi, 12(%esp)
+
+ mov 20(%esp), %edi
+ mov 24(%esp), %esi
+ mov 28(%esp), %ebp
+ mov 32(%esp), %ecx
+ mov %ebp, %ebx
+ shr $2, %ebp
+ mov %ebp, 28(%esp)
+ mov (%esi), %eax
+ and $3, %ebx
+ jz L(b0)
+ cmp $2, %ebx
+ jz L(b2)
+ jg L(b3)
+
+L(b1): lea -4(%edi), %edi
+ mul %ecx
+ test %ebp, %ebp
+ jnz L(gt1)
+ mov %eax, 4(%edi)
+ mov %edx, %eax
+ jmp L(rt)
+L(gt1): mov %eax, %ebx
+ mov %edx, %ebp
+ mov 4(%esi), %eax
+ jmp L(1)
+
+L(b2): lea 4(%esi), %esi
+ mul %ecx
+ test %ebp, %ebp
+ mov %eax, %ebp
+ mov %edx, %ebx
+ mov (%esi), %eax
+ jnz L(2)
+ jmp L(cj2)
+
+L(b3): lea 8(%esi), %esi
+ lea -12(%edi), %edi
+ mul %ecx
+ mov %eax, %ebx
+ mov %edx, %ebp
+ mov -4(%esi), %eax
+ incl 28(%esp)
+ jmp L(3)
+
+L(b0): lea -4(%esi), %esi
+ lea -8(%edi), %edi
+ mul %ecx
+ mov %eax, %ebp
+ mov %edx, %ebx
+ mov 8(%esi), %eax
+ jmp L(0)
+
+ ALIGN(16)
+L(top): mov $0, %ebx
+ adc %edx, %ebx
+L(2): mul %ecx
+ add %eax, %ebx
+ mov %ebp, 0(%edi)
+ mov 4(%esi), %eax
+ mov $0, %ebp
+ adc %edx, %ebp
+L(1): mul %ecx
+ add %eax, %ebp
+ mov 8(%esi), %eax
+ mov %ebx, 4(%edi)
+ mov $0, %ebx
+ adc %edx, %ebx
+L(0): mov %ebp, 8(%edi)
+ mul %ecx
+ add %eax, %ebx
+ mov 12(%esi), %eax
+ lea 16(%esi), %esi
+ mov $0, %ebp
+ adc %edx, %ebp
+L(3): mov %ebx, 12(%edi)
+ mul %ecx
+ lea 16(%edi), %edi
+ add %eax, %ebp
+ decl 28(%esp)
+ mov 0(%esi), %eax
+ jnz L(top)
+
+L(end): mov $0, %ebx
+ adc %edx, %ebx
+L(cj2): mul %ecx
+ add %eax, %ebx
+ mov %ebp, (%edi)
+L(cj1): mov %ebx, 4(%edi)
+ adc $0, %edx
+ mov %edx, %eax
+
+L(rt): mov (%esp), %ebp
+ mov 4(%esp), %ebx
+ mov 8(%esp), %esi
+ mov 12(%esp), %edi
+ add $16, %esp
+ ret
+EPILOGUE()
+ASM_END()
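For reference, the operation implemented above, stripped of the 4-way
unrolling and the feed-in paths: multiply the n-limb vector by a single limb,
store the low halves, and return the final carry; mpn_mul_1c is the same but
starts from the supplied carry limb. A plain C sketch, 32-bit limbs assumed,
name illustrative only.

#include <stdint.h>
#include <stddef.h>

static uint32_t mul_1_sketch (uint32_t *rp, const uint32_t *up,
                              size_t n, uint32_t v)
{
  uint32_t carry = 0;                 /* mul_1c would start from its carry-in */
  for (size_t i = 0; i < n; i++)
    {
      uint64_t p = (uint64_t) up[i] * v + carry;   /* cannot overflow 64 bits */
      rp[i] = (uint32_t) p;
      carry = (uint32_t) (p >> 32);
    }
  return carry;
}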
diff --git a/gmp/mpn/x86/k7/mul_basecase.asm b/gmp/mpn/x86/k7/mul_basecase.asm
new file mode 100644
index 0000000000..4dfb500885
--- /dev/null
+++ b/gmp/mpn/x86/k7/mul_basecase.asm
@@ -0,0 +1,602 @@
+dnl AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
+
+dnl Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: approx 4.42 cycles per cross product at around 20x20 limbs (32
+C limbs/loop unrolling).
+
+
+
+dnl K7 UNROLL_COUNT cycles/product (at around 20x20)
+dnl 8 4.67
+dnl 16 4.59
+dnl 32 4.42
+dnl Maximum possible with the current code is 32.
+dnl
+dnl At 32 the typical 13-26 limb sizes from the karatsuba code will get
+dnl done with a straight run through a block of code, no inner loop. Using
+dnl 32 gives 1k of code, but the k7 has a 64k L1 code cache.
+
+deflit(UNROLL_COUNT, 32)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xsize,
+C mp_srcptr yp, mp_size_t ysize);
+C
+C Calculate xp,xsize multiplied by yp,ysize, storing the result in
+C wp,xsize+ysize.
+C
+C This routine is essentially the same as mpn/generic/mul_basecase.c, but
+C it's faster because it does most of the mpn_addmul_1() startup
+C calculations only once. The saving is 15-25% on typical sizes coming from
+C the Karatsuba multiply code.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 5)
+',`
+deflit(UNROLL_THRESHOLD, 5)
+')
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP, 16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_XSIZE, %ecx
+ movl PARAM_YP, %eax
+
+ movl PARAM_XP, %edx
+ movl (%eax), %eax C yp low limb
+
+ cmpl $2, %ecx
+ ja L(xsize_more_than_two)
+ je L(two_by_something)
+
+
+ C one limb by one limb
+
+ mull (%edx)
+
+ movl PARAM_WP, %ecx
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(two_by_something):
+deflit(`FRAME',0)
+ decl PARAM_YSIZE
+ pushl %ebx defframe_pushl(`SAVE_EBX')
+ movl %eax, %ecx C yp low limb
+
+ movl PARAM_WP, %ebx
+ pushl %esi defframe_pushl(`SAVE_ESI')
+ movl %edx, %esi C xp
+
+ movl (%edx), %eax C xp low limb
+ jnz L(two_by_two)
+
+
+ C two limbs by one limb
+
+ mull %ecx
+
+ movl %eax, (%ebx)
+ movl 4(%esi), %eax
+ movl %edx, %esi C carry
+
+ mull %ecx
+
+ addl %eax, %esi
+
+ movl %esi, 4(%ebx)
+ movl SAVE_ESI, %esi
+
+ adcl $0, %edx
+
+ movl %edx, 8(%ebx)
+ movl SAVE_EBX, %ebx
+ addl $FRAME, %esp
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+C Could load yp earlier into another register.
+
+ ALIGN(16)
+L(two_by_two):
+ C eax xp low limb
+ C ebx wp
+ C ecx yp low limb
+ C edx
+ C esi xp
+ C edi
+ C ebp
+
+dnl FRAME carries on from previous
+
+ mull %ecx C xp[0] * yp[0]
+
+ push %edi defframe_pushl(`SAVE_EDI')
+ movl %edx, %edi C carry, for wp[1]
+
+ movl %eax, (%ebx)
+ movl 4(%esi), %eax
+
+ mull %ecx C xp[1] * yp[0]
+
+ addl %eax, %edi
+ movl PARAM_YP, %ecx
+
+ adcl $0, %edx
+ movl 4(%ecx), %ecx C yp[1]
+ movl %edi, 4(%ebx)
+
+ movl 4(%esi), %eax C xp[1]
+ movl %edx, %edi C carry, for wp[2]
+
+ mull %ecx C xp[1] * yp[1]
+
+ addl %eax, %edi
+
+ adcl $0, %edx
+ movl (%esi), %eax C xp[0]
+
+ movl %edx, %esi C carry, for wp[3]
+
+ mull %ecx C xp[0] * yp[1]
+
+ addl %eax, 4(%ebx)
+ adcl %edx, %edi
+ movl %edi, 8(%ebx)
+
+ adcl $0, %esi
+ movl SAVE_EDI, %edi
+ movl %esi, 12(%ebx)
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBX, %ebx
+ addl $FRAME, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(xsize_more_than_two):
+
+C The first limb of yp is processed with a simple mpn_mul_1 style loop
+C inline. Unrolling this doesn't seem worthwhile since it's only run once
+C (whereas the addmul below is run ysize-1 many times). A call to the
+C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
+C popping, and doesn't seem likely to be worthwhile on the typical 13-26
+C limb operations the Karatsuba code calls here with.
+
+ C eax yp[0]
+ C ebx
+ C ecx xsize
+ C edx xp
+ C esi
+ C edi
+ C ebp
+
+dnl FRAME doesn't carry on from previous, no pushes yet here
+defframe(`SAVE_EBX',-4)
+defframe(`SAVE_ESI',-8)
+defframe(`SAVE_EDI',-12)
+defframe(`SAVE_EBP',-16)
+deflit(`FRAME',0)
+
+ subl $16, %esp
+deflit(`FRAME',16)
+
+ movl %edi, SAVE_EDI
+ movl PARAM_WP, %edi
+
+ movl %ebx, SAVE_EBX
+ movl %ebp, SAVE_EBP
+ movl %eax, %ebp
+
+ movl %esi, SAVE_ESI
+ xorl %ebx, %ebx
+ leal (%edx,%ecx,4), %esi C xp end
+
+ leal (%edi,%ecx,4), %edi C wp end of mul1
+ negl %ecx
+
+
+L(mul1):
+ C eax scratch
+ C ebx carry
+ C ecx counter, negative
+ C edx scratch
+ C esi xp end
+ C edi wp end of mul1
+ C ebp multiplier
+
+ movl (%esi,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %ebx, %eax
+ movl %eax, (%edi,%ecx,4)
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+ incl %ecx
+ jnz L(mul1)
+
+
+ movl PARAM_YSIZE, %edx
+ movl PARAM_XSIZE, %ecx
+
+ movl %ebx, (%edi) C final carry
+ decl %edx
+
+ jnz L(ysize_more_than_one)
+
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBX, %ebx
+
+ movl SAVE_EBP, %ebp
+ movl SAVE_ESI, %esi
+ addl $FRAME, %esp
+
+ ret
+
+
+L(ysize_more_than_one):
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_YP, %eax
+
+ jae L(unroll)
+
+
+C -----------------------------------------------------------------------------
+ C simple addmul looping
+ C
+ C eax yp
+ C ebx
+ C ecx xsize
+ C edx ysize-1
+ C esi xp end
+ C edi wp end of mul1
+ C ebp
+
+ leal 4(%eax,%edx,4), %ebp C yp end
+ negl %ecx
+ negl %edx
+
+ movl (%esi,%ecx,4), %eax C xp low limb
+ movl %edx, PARAM_YSIZE C -(ysize-1)
+ incl %ecx
+
+ xorl %ebx, %ebx C initial carry
+ movl %ecx, PARAM_XSIZE C -(xsize-1)
+ movl %ebp, PARAM_YP
+
+ movl (%ebp,%edx,4), %ebp C yp second lowest limb - multiplier
+ jmp L(simple_outer_entry)
+
+
+ C this is offset 0x121 so close enough to aligned
+L(simple_outer_top):
+ C ebp ysize counter, negative
+
+ movl PARAM_YP, %edx
+ movl PARAM_XSIZE, %ecx C -(xsize-1)
+ xorl %ebx, %ebx C carry
+
+ movl %ebp, PARAM_YSIZE
+ addl $4, %edi C next position in wp
+
+ movl (%edx,%ebp,4), %ebp C yp limb - multiplier
+ movl -4(%esi,%ecx,4), %eax C xp low limb
+
+
+L(simple_outer_entry):
+
+L(simple_inner):
+ C eax xp limb
+ C ebx carry limb
+ C ecx loop counter (negative)
+ C edx scratch
+ C esi xp end
+ C edi wp end
+ C ebp multiplier
+
+ mull %ebp
+
+ addl %eax, %ebx
+ adcl $0, %edx
+
+ addl %ebx, (%edi,%ecx,4)
+ movl (%esi,%ecx,4), %eax
+ adcl $0, %edx
+
+ incl %ecx
+ movl %edx, %ebx
+ jnz L(simple_inner)
+
+
+ mull %ebp
+
+ movl PARAM_YSIZE, %ebp
+ addl %eax, %ebx
+
+ adcl $0, %edx
+ addl %ebx, (%edi)
+
+ adcl $0, %edx
+ incl %ebp
+
+ movl %edx, 4(%edi)
+ jnz L(simple_outer_top)
+
+
+ movl SAVE_EBX, %ebx
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+C
+C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
+C comments.
+C
+C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
+C increment xp and wp. This is used to adjust back xp and wp, and rshifted
+C to give an initial VAR_COUNTER at the top of the outer loop.
+C
+C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
+C up to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled loop.
+C
+C VAR_XP_LOW is the least significant limb of xp, which is needed at the
+C start of the unrolled loop.
+C
+C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
+C inclusive.
+C
+C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
+C added to give the location of the next limb of yp, which is the multiplier
+C in the unrolled loop.
+C
+C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
+C outer loop to take care of xp, wp and the inner loop counter.
+
+defframe(VAR_COUNTER, -20)
+defframe(VAR_ADJUST, -24)
+defframe(VAR_JMP, -28)
+defframe(VAR_XP_LOW, -32)
+deflit(VAR_EXTRA_SPACE, 16)
+
+
+L(unroll):
+ C eax yp
+ C ebx
+ C ecx xsize
+ C edx ysize-1
+ C esi xp end
+ C edi wp end of mul1
+ C ebp
+
+ movl PARAM_XP, %esi
+ movl 4(%eax), %ebp C multiplier (yp second limb)
+ leal 4(%eax,%edx,4), %eax C yp adjust for ysize indexing
+
+ movl PARAM_WP, %edi
+ movl %eax, PARAM_YP
+ negl %edx
+
+ movl %edx, PARAM_YSIZE
+ leal UNROLL_COUNT-2(%ecx), %ebx C (xsize-1)+UNROLL_COUNT-1
+ decl %ecx C xsize-1
+
+ movl (%esi), %eax C xp low limb
+ andl $-UNROLL_MASK-1, %ebx
+ negl %ecx
+
+ subl $VAR_EXTRA_SPACE, %esp
+deflit(`FRAME',16+VAR_EXTRA_SPACE)
+ negl %ebx
+ andl $UNROLL_MASK, %ecx
+
+ movl %ebx, VAR_ADJUST
+ movl %ecx, %edx
+ shll $4, %ecx
+
+ sarl $UNROLL_LOG2, %ebx
+
+ C 17 code bytes per limb
+ifdef(`PIC',`
+ call L(pic_calc)
+L(unroll_here):
+',`
+ leal L(unroll_entry) (%ecx,%edx,1), %ecx
+')
+ negl %edx
+
+ movl %eax, VAR_XP_LOW
+ movl %ecx, VAR_JMP
+ leal 4(%edi,%edx,4), %edi C wp and xp, adjust for unrolling,
+ leal 4(%esi,%edx,4), %esi C and start at second limb
+ jmp L(unroll_outer_entry)
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See mpn/x86/README about old gas bugs
+ leal (%ecx,%edx,1), %ecx
+ addl $L(unroll_entry)-L(unroll_here), %ecx
+ addl (%esp), %ecx
+ ret_internal
+')
+
+
+C --------------------------------------------------------------------------
+ ALIGN(32)
+L(unroll_outer_top):
+ C ebp ysize counter, negative
+
+ movl VAR_ADJUST, %ebx
+ movl PARAM_YP, %edx
+
+ movl VAR_XP_LOW, %eax
+ movl %ebp, PARAM_YSIZE C store incremented ysize counter
+
+ leal 4(%edi,%ebx,4), %edi
+ leal (%esi,%ebx,4), %esi
+ sarl $UNROLL_LOG2, %ebx
+
+ movl (%edx,%ebp,4), %ebp C yp next multiplier
+ movl VAR_JMP, %ecx
+
+L(unroll_outer_entry):
+ mull %ebp
+
+ testb $1, %cl C and clear carry bit
+ movl %ebx, VAR_COUNTER
+ movl $0, %ebx
+
+ movl $0, %ecx
+ cmovz( %eax, %ecx) C eax into low carry, zero into high carry limb
+ cmovnz( %eax, %ebx)
+
+ C Extra fetch of VAR_JMP is bad, but registers are tight
+ jmp *VAR_JMP
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(32)
+L(unroll_top):
+ C eax xp limb
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi xp+8
+ C edi wp
+ C ebp yp multiplier limb
+ C
+ C VAR_COUNTER loop counter, negative
+ C
+ C 17 bytes each limb
+
+L(unroll_entry):
+
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 4))
+
+Zdisp( movl, disp0,(%esi), %eax)
+ adcl %edx, %ebx
+
+ mull %ebp
+
+Zdisp( addl, %ecx, disp0,(%edi))
+ movl $0, %ecx
+
+ adcl %eax, %ebx
+
+
+ movl disp1(%esi), %eax
+ adcl %edx, %ecx
+
+ mull %ebp
+
+ addl %ebx, disp1(%edi)
+ movl $0, %ebx
+
+ adcl %eax, %ecx
+')
+
+
+ incl VAR_COUNTER
+ leal UNROLL_BYTES(%esi), %esi
+ leal UNROLL_BYTES(%edi), %edi
+
+ jnz L(unroll_top)
+
+
+ C eax
+ C ebx zero
+ C ecx low
+ C edx high
+ C esi
+	C edi	wp, pointing at second last limb
+ C ebp
+ C
+ C carry flag to be added to high
+
+deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
+deflit(`disp1', eval(disp0-0 + 4))
+
+ movl PARAM_YSIZE, %ebp
+ adcl $0, %edx
+ addl %ecx, disp0(%edi)
+
+ adcl $0, %edx
+ incl %ebp
+
+ movl %edx, disp1(%edi)
+ jnz L(unroll_outer_top)
+
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBX, %ebx
+ addl $FRAME, %esp
+
+ ret
+
+EPILOGUE()
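The structure is exactly what the comments describe: one inline mul_1 pass for
yp[0], then one addmul pass per remaining yp limb; the asm's gain comes from
doing the addmul feed-in work once and unrolling the inner loop. A compact C
sketch of that structure follows, as a reference for what the unrolled code
must compute, assuming 32-bit limbs and xsize >= ysize >= 1; the helper names
are ad hoc, not GMP's.

#include <stdint.h>
#include <stddef.h>

static uint32_t addmul_1_sketch (uint32_t *rp, const uint32_t *up,
                                 size_t n, uint32_t v)
{
  uint32_t carry = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint64_t p = (uint64_t) up[i] * v + rp[i] + carry;
      rp[i] = (uint32_t) p;
      carry = (uint32_t) (p >> 32);
    }
  return carry;
}

static void mul_basecase_sketch (uint32_t *wp,
                                 const uint32_t *xp, size_t xn,
                                 const uint32_t *yp, size_t yn)
{
  uint32_t carry = 0;
  for (size_t i = 0; i < xn; i++)       /* the inline mul_1 loop          */
    {
      uint64_t p = (uint64_t) xp[i] * yp[0] + carry;
      wp[i] = (uint32_t) p;
      carry = (uint32_t) (p >> 32);
    }
  wp[xn] = carry;

  for (size_t j = 1; j < yn; j++)       /* one addmul per further yp limb */
    wp[xn + j] = addmul_1_sketch (wp + j, xp, xn, yp[j]);
}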
diff --git a/gmp/mpn/x86/k7/sqr_basecase.asm b/gmp/mpn/x86/k7/sqr_basecase.asm
new file mode 100644
index 0000000000..7b6a97e0df
--- /dev/null
+++ b/gmp/mpn/x86/k7/sqr_basecase.asm
@@ -0,0 +1,635 @@
+dnl AMD K7 mpn_sqr_basecase -- square an mpn number.
+
+dnl Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product
+C (measured on the speed difference between 25 and 50 limbs, which is
+C roughly the Karatsuba recursing range).
+
+
+dnl These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for
+dnl some comments.
+
+deflit(SQR_TOOM2_THRESHOLD_MAX, 66)
+
+ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
+`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
+
+m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
+deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C With a SQR_TOOM2_THRESHOLD around 50 this code is about 1500 bytes,
+C which is quite a bit, but is considered good value since squares big
+C enough to use most of the code will be spending quite a few cycles in it.
+
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %eax
+ cmpl $2, %ecx
+
+ movl PARAM_DST, %edx
+ je L(two_limbs)
+ ja L(three_or_more)
+
+
+C------------------------------------------------------------------------------
+C one limb only
+ C eax src
+ C ecx size
+ C edx dst
+
+ movl (%eax), %eax
+ movl %edx, %ecx
+
+ mull %eax
+
+ movl %edx, 4(%ecx)
+ movl %eax, (%ecx)
+ ret
+
+
+C------------------------------------------------------------------------------
+C
+C Using the read/modify/write "add"s seems to be faster than saving and
+C restoring registers. Perhaps the loads for the first set hide under the
+C mul latency and the second gets store to load forwarding.
+
+ ALIGN(16)
+L(two_limbs):
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+deflit(`FRAME',0)
+
+ pushl %ebx FRAME_pushl()
+ movl %eax, %ebx C src
+ movl (%eax), %eax
+
+ movl %edx, %ecx C dst
+
+ mull %eax C src[0]^2
+
+ movl %eax, (%ecx) C dst[0]
+ movl 4(%ebx), %eax
+
+ movl %edx, 4(%ecx) C dst[1]
+
+ mull %eax C src[1]^2
+
+ movl %eax, 8(%ecx) C dst[2]
+ movl (%ebx), %eax
+
+ movl %edx, 12(%ecx) C dst[3]
+
+ mull 4(%ebx) C src[0]*src[1]
+
+ popl %ebx
+
+ addl %eax, 4(%ecx)
+ adcl %edx, 8(%ecx)
+ adcl $0, 12(%ecx)
+ ASSERT(nc)
+
+ addl %eax, 4(%ecx)
+ adcl %edx, 8(%ecx)
+ adcl $0, 12(%ecx)
+ ASSERT(nc)
+
+ ret
+
+
+C------------------------------------------------------------------------------
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+deflit(STACK_SPACE, 16)
+
+L(three_or_more):
+ subl $STACK_SPACE, %esp
+ cmpl $4, %ecx
+ jae L(four_or_more)
+deflit(`FRAME',STACK_SPACE)
+
+
+C------------------------------------------------------------------------------
+C Three limbs
+C
+C Writing out the loads and stores separately at the end of this code comes
+C out about 10 cycles faster than using adcls to memory.
+
+ C eax src
+ C ecx size
+ C edx dst
+
+ movl %ebx, SAVE_EBX
+ movl %eax, %ebx C src
+ movl (%eax), %eax
+
+ movl %edx, %ecx C dst
+ movl %esi, SAVE_ESI
+ movl %edi, SAVE_EDI
+
+ mull %eax C src[0] ^ 2
+
+ movl %eax, (%ecx)
+ movl 4(%ebx), %eax
+ movl %edx, 4(%ecx)
+
+ mull %eax C src[1] ^ 2
+
+ movl %eax, 8(%ecx)
+ movl 8(%ebx), %eax
+ movl %edx, 12(%ecx)
+
+ mull %eax C src[2] ^ 2
+
+ movl %eax, 16(%ecx)
+ movl (%ebx), %eax
+ movl %edx, 20(%ecx)
+
+ mull 4(%ebx) C src[0] * src[1]
+
+ movl %eax, %esi
+ movl (%ebx), %eax
+ movl %edx, %edi
+
+ mull 8(%ebx) C src[0] * src[2]
+
+ addl %eax, %edi
+ movl %ebp, SAVE_EBP
+ movl $0, %ebp
+
+ movl 4(%ebx), %eax
+ adcl %edx, %ebp
+
+ mull 8(%ebx) C src[1] * src[2]
+
+ xorl %ebx, %ebx
+ addl %eax, %ebp
+
+ adcl $0, %edx
+
+ C eax
+ C ebx zero, will be dst[5]
+ C ecx dst
+ C edx dst[4]
+ C esi dst[1]
+ C edi dst[2]
+ C ebp dst[3]
+
+ adcl $0, %edx
+ addl %esi, %esi
+
+ adcl %edi, %edi
+ movl 4(%ecx), %eax
+
+ adcl %ebp, %ebp
+
+ adcl %edx, %edx
+
+ adcl $0, %ebx
+ addl %eax, %esi
+ movl 8(%ecx), %eax
+
+ adcl %eax, %edi
+ movl 12(%ecx), %eax
+ movl %esi, 4(%ecx)
+
+ adcl %eax, %ebp
+ movl 16(%ecx), %eax
+ movl %edi, 8(%ecx)
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EDI, %edi
+
+ adcl %eax, %edx
+ movl 20(%ecx), %eax
+ movl %ebp, 12(%ecx)
+
+ adcl %ebx, %eax
+ ASSERT(nc)
+ movl SAVE_EBX, %ebx
+ movl SAVE_EBP, %ebp
+
+ movl %edx, 16(%ecx)
+ movl %eax, 20(%ecx)
+ addl $FRAME, %esp
+
+ ret
+
+
+C------------------------------------------------------------------------------
+L(four_or_more):
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+C Further products are added in rather than stored.
+
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+ C esi
+ C edi
+ C ebp
+
+defframe(`VAR_COUNTER',-20)
+defframe(`VAR_JMP', -24)
+deflit(EXTRA_STACK_SPACE, 8)
+
+ movl %ebx, SAVE_EBX
+ movl %edi, SAVE_EDI
+ leal (%edx,%ecx,4), %edi C &dst[size]
+
+ movl %esi, SAVE_ESI
+ movl %ebp, SAVE_EBP
+ leal (%eax,%ecx,4), %esi C &src[size]
+
+ movl (%eax), %ebp C multiplier
+ movl $0, %ebx
+ decl %ecx
+
+ negl %ecx
+ subl $EXTRA_STACK_SPACE, %esp
+FRAME_subl_esp(EXTRA_STACK_SPACE)
+
+L(mul_1):
+ C eax scratch
+ C ebx carry
+ C ecx counter
+ C edx scratch
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp multiplier
+
+ movl (%esi,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %ebx, %eax
+ movl %eax, (%edi,%ecx,4)
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+ incl %ecx
+ jnz L(mul_1)
+
+
+C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
+C
+C The last two products, which are the bottom right corner of the product
+C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
+C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
+C cases that need to be done.
+C
+C The unrolled code is the same as in mpn_addmul_1, see that routine for
+C some comments.
+C
+C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled code, stepped by one code
+C chunk each outer loop.
+C
+C K7 does branch prediction on indirect jumps, which is bad since it's a
+C different target each time. There seems no way to avoid this.
+
+dnl This value also hard coded in some shifts and adds
+deflit(CODE_BYTES_PER_LIMB, 17)
+
+dnl With the unmodified &src[size] and &dst[size] pointers, the
+dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT
+dnl values up to 31, but above that an offset must be added to them.
+
+deflit(OFFSET,
+ifelse(eval(UNROLL_COUNT>31),1,
+eval((UNROLL_COUNT-31)*4),
+0))
+
+dnl Because the last chunk of code is generated differently, a label placed
+dnl at the end doesn't work. Instead calculate the implied end using the
+dnl start and how many chunks of code there are.
+
+deflit(UNROLL_INNER_END,
+`L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)')
+
+ C eax
+ C ebx carry
+ C ecx
+ C edx
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp
+
+ movl PARAM_SIZE, %ecx
+ movl %ebx, (%edi)
+
+ subl $4, %ecx
+ jz L(corner)
+
+ negl %ecx
+ifelse(OFFSET,0,,`subl $OFFSET, %edi')
+ifelse(OFFSET,0,,`subl $OFFSET, %esi')
+
+ movl %ecx, %edx
+ shll $4, %ecx
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
+')
+
+
+ C The calculated jump mustn't come out to before the start of the
+ C code available. This is the limit UNROLL_COUNT puts on the src
+ C operand size, but checked here directly using the jump address.
+ ASSERT(ae,
+ `movl_text_address(L(unroll_inner_start), %eax)
+ cmpl %eax, %ecx')
+
+
+C------------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll_outer_top):
+ C eax
+ C ebx high limb to store
+ C ecx VAR_JMP
+ C edx VAR_COUNTER, limbs, negative
+ C esi &src[size], constant
+ C edi dst ptr, high of last addmul
+ C ebp
+
+ movl -12+OFFSET(%esi,%edx,4), %ebp C next multiplier
+ movl -8+OFFSET(%esi,%edx,4), %eax C first of multiplicand
+
+ movl %edx, VAR_COUNTER
+
+ mull %ebp
+
+define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')')
+
+ testb $1, %cl
+ movl %edx, %ebx C high carry
+ movl %ecx, %edx C jump
+
+ movl %eax, %ecx C low carry
+ cmovX( %ebx, %ecx) C high carry reverse
+ cmovX( %eax, %ebx) C low carry reverse
+
+ leal CODE_BYTES_PER_LIMB(%edx), %eax
+ xorl %edx, %edx
+ leal 4(%edi), %edi
+
+ movl %eax, VAR_JMP
+
+ jmp *%eax
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ addl (%esp), %ecx
+ addl $UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx
+ addl %edx, %ecx
+ ret_internal
+')
+
+
+ C Must be an even address to preserve the significance of the low
+ C bit of the jump address indicating which way around ecx/ebx should
+ C start.
+ ALIGN(2)
+
+L(unroll_inner_start):
+ C eax next limb
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+forloop(`i', UNROLL_COUNT, 1, `
+ deflit(`disp_src', eval(-i*4 + OFFSET))
+ deflit(`disp_dst', eval(disp_src - 4))
+
+ m4_assert(`disp_src>=-128 && disp_src<128')
+ m4_assert(`disp_dst>=-128 && disp_dst<128')
+
+ifelse(eval(i%2),0,`
+Zdisp( movl, disp_src,(%esi), %eax)
+ adcl %edx, %ebx
+
+ mull %ebp
+
+Zdisp( addl, %ecx, disp_dst,(%edi))
+ movl $0, %ecx
+
+ adcl %eax, %ebx
+
+',`
+ dnl this bit comes out last
+Zdisp( movl, disp_src,(%esi), %eax)
+ adcl %edx, %ecx
+
+ mull %ebp
+
+Zdisp( addl, %ebx, disp_dst,(%edi))
+
+ifelse(forloop_last,0,
+` movl $0, %ebx')
+
+ adcl %eax, %ecx
+')
+')
+
+ C eax next limb
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+ adcl $0, %edx
+ addl %ecx, -4+OFFSET(%edi)
+ movl VAR_JMP, %ecx
+
+ adcl $0, %edx
+
+ movl %edx, m4_empty_if_zero(OFFSET) (%edi)
+ movl VAR_COUNTER, %edx
+
+ incl %edx
+ jnz L(unroll_outer_top)
+
+
+ifelse(OFFSET,0,,`
+ addl $OFFSET, %esi
+ addl $OFFSET, %edi
+')
+
+
+C------------------------------------------------------------------------------
+L(corner):
+ C esi &src[size]
+ C edi &dst[2*size-5]
+
+ movl -12(%esi), %ebp
+ movl -8(%esi), %eax
+ movl %eax, %ecx
+
+ mull %ebp
+
+ addl %eax, -4(%edi)
+ movl -4(%esi), %eax
+
+ adcl $0, %edx
+ movl %edx, %ebx
+ movl %eax, %esi
+
+ mull %ebp
+
+ addl %ebx, %eax
+
+ adcl $0, %edx
+ addl %eax, (%edi)
+ movl %esi, %eax
+
+ adcl $0, %edx
+ movl %edx, %ebx
+
+ mull %ecx
+
+ addl %ebx, %eax
+ movl %eax, 4(%edi)
+
+ adcl $0, %edx
+ movl %edx, 8(%edi)
+
+
+
+C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
+
+L(lshift_start):
+ movl PARAM_SIZE, %eax
+ movl PARAM_DST, %edi
+ xorl %ecx, %ecx C clear carry
+
+ leal (%edi,%eax,8), %edi
+ notl %eax C -size-1, preserve carry
+
+ leal 2(%eax), %eax C -(size-1)
+
+L(lshift):
+ C eax counter, negative
+ C ebx
+ C ecx
+ C edx
+ C esi
+ C edi dst, pointing just after last limb
+ C ebp
+
+ rcll -4(%edi,%eax,8)
+ rcll (%edi,%eax,8)
+ incl %eax
+ jnz L(lshift)
+
+ setc %al
+
+ movl PARAM_SRC, %esi
+ movl %eax, -4(%edi) C dst most significant limb
+
+ movl PARAM_SIZE, %ecx
+
+
+C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
+C src[size-1]^2. dst[0] hasn't been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+ movl (%esi), %eax C src[0]
+
+ mull %eax
+
+	leal	(%esi,%ecx,4), %esi	C src points just after last limb
+ negl %ecx
+
+ movl %eax, (%edi,%ecx,8) C dst[0]
+ incl %ecx
+
+L(diag):
+ C eax scratch
+ C ebx scratch
+ C ecx counter, negative
+ C edx carry
+ C esi src just after last limb
+ C edi dst just after last limb
+ C ebp
+
+ movl (%esi,%ecx,4), %eax
+ movl %edx, %ebx
+
+ mull %eax
+
+ addl %ebx, -4(%edi,%ecx,8)
+ adcl %eax, (%edi,%ecx,8)
+ adcl $0, %edx
+
+ incl %ecx
+ jnz L(diag)
+
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBX, %ebx
+
+ addl %edx, -4(%edi) C dst most significant limb
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+
+ ret
+
+EPILOGUE()
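The three phases spelled out in the comments -- build the off-diagonal
triangle, double it with a one-bit left shift, then add the diagonal squares
-- look like this in plain C. A sketch only: it zero-initializes the
destination and treats every row uniformly, whereas the asm stores the first
row directly and special-cases the bottom-right corner; 32-bit limbs assumed,
name ad hoc.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

static void sqr_basecase_sketch (uint32_t *dst, const uint32_t *src, size_t n)
{
  memset (dst, 0, 2 * n * sizeof *dst);

  /* Triangle: add src[i] * src[i+1..n-1] at dst[2i+1..], i = 0..n-2.  */
  for (size_t i = 0; i + 1 < n; i++)
    {
      uint32_t carry = 0;
      for (size_t j = i + 1; j < n; j++)
        {
          uint64_t p = (uint64_t) src[i] * src[j] + dst[i + j] + carry;
          dst[i + j] = (uint32_t) p;
          carry = (uint32_t) (p >> 32);
        }
      dst[i + n] = carry;
    }

  /* Double: shift dst[1..2n-2] left one bit, top bit into dst[2n-1].  */
  uint32_t cy = 0;
  for (size_t k = 1; k + 1 < 2 * n; k++)
    {
      uint32_t hi = dst[k] >> 31;
      dst[k] = (dst[k] << 1) | cy;
      cy = hi;
    }
  dst[2 * n - 1] = cy;

  /* Diagonal: add src[i]^2 at dst[2i], dst[2i+1].  */
  uint32_t carry = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint64_t sq = (uint64_t) src[i] * src[i];
      uint64_t lo = (uint64_t) dst[2*i] + (uint32_t) sq + carry;
      dst[2*i] = (uint32_t) lo;
      uint64_t hi = (uint64_t) dst[2*i + 1] + (uint32_t) (sq >> 32) + (lo >> 32);
      dst[2*i + 1] = (uint32_t) hi;
      carry = (uint32_t) (hi >> 32);
    }
}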
diff --git a/gmp/mpn/x86/k7/sublsh1_n.asm b/gmp/mpn/x86/k7/sublsh1_n.asm
new file mode 100644
index 0000000000..523b01218d
--- /dev/null
+++ b/gmp/mpn/x86/k7/sublsh1_n.asm
@@ -0,0 +1,173 @@
+dnl AMD K7 mpn_sublsh1_n_ip1 -- rp[] = rp[] - (up[] << 1)
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C This is an attempt at a sublsh1_n for x86-32, not relying on sse2 insns. The
+C inner loop is 2*4-way unrolled, which is the best we can do with the
+C available registers. It seems tricky to use the same structure for
+C rsblsh1_n, since we cannot feed carry between operations there.
+
+C cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 6.75
+C AMD K6
+C AMD K7
+C AMD K8
+
+C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32
+C processors. It uses 2*4-way unrolling, for good reasons.
+C
+C Breaking the carry recurrence might be a good idea. We would then need
+C separate registers for the shift carry and add/subtract carry, which in turn
+C would force us to 2*2-way unrolling.
+
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(VAR_COUNT,`PARAM_SIZE')
+define(SAVE_EBX,`PARAM_SRC')
+define(SAVE_EBP,`PARAM_DST')
+
+ASM_START()
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_sublsh1_n_ip1)
+deflit(`FRAME',0)
+
+define(`rp', `%edi')
+define(`up', `%esi')
+
+ mov PARAM_SIZE, %eax C size
+ push up FRAME_pushl()
+ push rp FRAME_pushl()
+ xor %edx, %edx
+ mov PARAM_SRC, up
+ mov PARAM_DST, rp
+ mov %ebx, SAVE_EBX
+ mov %eax, %ebx
+ shr $3, %eax
+
+	not	%eax			C count = -(size\8)-1
+ and $7, %ebx C size % 8
+ jz L(exact)
+
+L(oop):
+ifdef(`CPU_P6',`
+ shr %edx ') C restore 2nd saved carry bit
+ mov (up), %ecx
+ adc %ecx, %ecx
+ rcr %edx C restore 1st saved carry bit
+ lea 4(up), up
+ sbb %ecx, (rp)
+ lea 4(rp), rp
+ adc %edx, %edx C save a carry bit in edx
+ifdef(`CPU_P6',`
+ adc %edx, %edx ') C save another carry bit in edx
+ dec %ebx
+ jnz L(oop)
+L(exact):
+ inc %eax
+ jz L(end)
+ mov %eax, VAR_COUNT
+ mov %ebp, SAVE_EBP
+
+ ALIGN(16)
+L(top):
+ifdef(`CPU_P6',`
+ shr %edx ') C restore 2nd saved carry bit
+ mov (up), %eax
+ adc %eax, %eax
+ mov 4(up), %ebx
+ adc %ebx, %ebx
+ mov 8(up), %ecx
+ adc %ecx, %ecx
+ mov 12(up), %ebp
+ adc %ebp, %ebp
+
+ rcr %edx C restore 1st saved carry bit
+
+ sbb %eax, (rp)
+ sbb %ebx, 4(rp)
+ sbb %ecx, 8(rp)
+ sbb %ebp, 12(rp)
+
+ mov 16(up), %eax
+ adc %eax, %eax
+ mov 20(up), %ebx
+ adc %ebx, %ebx
+ mov 24(up), %ecx
+ adc %ecx, %ecx
+ mov 28(up), %ebp
+ adc %ebp, %ebp
+
+ lea 32(up), up
+ adc %edx, %edx C save a carry bit in edx
+
+ sbb %eax, 16(rp)
+ sbb %ebx, 20(rp)
+ sbb %ecx, 24(rp)
+ sbb %ebp, 28(rp)
+
+ifdef(`CPU_P6',`
+ adc %edx, %edx ') C save another carry bit in edx
+ incl VAR_COUNT
+ lea 32(rp), rp
+ jne L(top)
+
+ mov SAVE_EBP, %ebp
+L(end):
+ mov SAVE_EBX, %ebx
+
+ifdef(`CPU_P6',`
+ xor %eax, %eax
+ shr $1, %edx
+ adc %edx, %eax
+',`
+ adc $0, %edx
+ mov %edx, %eax
+')
+ pop rp FRAME_popl()
+ pop up FRAME_popl()
+ ret
+EPILOGUE()
+ASM_END()
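For reference, the operation itself in plain C: rp[] -= 2*up[] over n limbs,
returning the net borrow in 0..2 -- the bit shifted out of the top of up plus
the borrow out of the subtraction. The asm keeps those two carries alive
across iterations with adc/sbb/rcr (and the extra save slot on P6); the sketch
below just uses two locals. The name and the 32-bit limb width are
assumptions.

#include <stdint.h>
#include <stddef.h>

static uint32_t sublsh1_n_ip1_sketch (uint32_t *rp, const uint32_t *up,
                                      size_t n)
{
  uint32_t shift_cy = 0;                /* bit shifted out of previous limb */
  uint32_t borrow   = 0;                /* borrow from previous subtraction */

  for (size_t i = 0; i < n; i++)
    {
      uint32_t twice = (up[i] << 1) | shift_cy;
      shift_cy = up[i] >> 31;

      uint64_t d = (uint64_t) rp[i] - twice - borrow;  /* wraps if negative */
      rp[i]  = (uint32_t) d;
      borrow = (uint32_t) (d >> 63);                   /* 1 iff it wrapped  */
    }
  return shift_cy + borrow;
}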