summary | refs | log | tree | commit | diff
path: root/gmp/mpn/x86/k7
diff options
context:
space:
mode:
author Pedro Alvarez <pedro.alvarez@codethink.co.uk> 2016-05-27 17:39:31 +0100
committer Pedro Alvarez <pedro.alvarez@codethink.co.uk> 2016-05-27 17:53:32 +0100
commit 26c75cf8267919f81a1759c9c965a52c660233f9 (patch)
tree cf2a39cf56c2c8ac45760854413ab233e6263974 /gmp/mpn/x86/k7
parent 56892c1d217baea02092b51a09bbc924130ca84c (diff)
download gcc-tarball-26c75cf8267919f81a1759c9c965a52c660233f9.tar.gz
Diffstat (limited to 'gmp/mpn/x86/k7')
-rw-r--r-- gmp/mpn/x86/k7/README 25
-rw-r--r-- gmp/mpn/x86/k7/addlsh1_n.asm 196
-rw-r--r-- gmp/mpn/x86/k7/aors_n.asm 35
-rw-r--r-- gmp/mpn/x86/k7/aorsmul_1.asm 50
-rw-r--r-- gmp/mpn/x86/k7/bdiv_q_1.asm 244
-rw-r--r-- gmp/mpn/x86/k7/dive_1.asm 35
-rw-r--r-- gmp/mpn/x86/k7/gcd_1.asm 481
-rw-r--r-- gmp/mpn/x86/k7/gmp-mparam.h 292
-rw-r--r-- gmp/mpn/x86/k7/invert_limb.asm 193
-rw-r--r-- gmp/mpn/x86/k7/mmx/com_n.asm (renamed from gmp/mpn/x86/k7/mmx/com.asm) 39
-rw-r--r-- gmp/mpn/x86/k7/mmx/copyd.asm 33
-rw-r--r-- gmp/mpn/x86/k7/mmx/copyi.asm 33
-rw-r--r-- gmp/mpn/x86/k7/mmx/divrem_1.asm 49
-rw-r--r-- gmp/mpn/x86/k7/mmx/lshift.asm 35
-rw-r--r-- gmp/mpn/x86/k7/mmx/mod_1.asm 509
-rw-r--r-- gmp/mpn/x86/k7/mmx/popham.asm 37
-rw-r--r-- gmp/mpn/x86/k7/mmx/rshift.asm 35
-rw-r--r-- gmp/mpn/x86/k7/mod_1_1.asm 221
-rw-r--r-- gmp/mpn/x86/k7/mod_1_4.asm 260
-rw-r--r-- gmp/mpn/x86/k7/mod_34lsub1.asm 36
-rw-r--r-- gmp/mpn/x86/k7/mode1o.asm 37
-rw-r--r-- gmp/mpn/x86/k7/mul_1.asm 46
-rw-r--r-- gmp/mpn/x86/k7/mul_basecase.asm 35
-rw-r--r-- gmp/mpn/x86/k7/sqr_basecase.asm 47
-rw-r--r-- gmp/mpn/x86/k7/sublsh1_n.asm 173
25 files changed, 1120 insertions, 2056 deletions
diff --git a/gmp/mpn/x86/k7/README b/gmp/mpn/x86/k7/README
index 5711b612c5..e2c5e0c18d 100644
--- a/gmp/mpn/x86/k7/README
+++ b/gmp/mpn/x86/k7/README
@@ -3,28 +3,17 @@ Copyright 2000, 2001 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
diff --git a/gmp/mpn/x86/k7/addlsh1_n.asm b/gmp/mpn/x86/k7/addlsh1_n.asm
deleted file mode 100644
index a957b6f78e..0000000000
--- a/gmp/mpn/x86/k7/addlsh1_n.asm
+++ /dev/null
@@ -1,196 +0,0 @@
-dnl AMD K7 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C This is an attempt at an addlsh1_n for x86-32, not relying on sse2 insns.
-C The innerloop is 2*3-way unrolled, which is best we can do with the available
-C registers. It seems tricky to use the same structure for rsblsh1_n, since we
-C cannot feed carry between operations there.
-
-C cycles/limb
-C P5
-C P6 model 0-8,10-12
-C P6 model 9 (Banias)
-C P6 model 13 (Dothan) 5.4 (worse than add_n + lshift)
-C P4 model 0 (Willamette)
-C P4 model 1 (?)
-C P4 model 2 (Northwood)
-C P4 model 3 (Prescott)
-C P4 model 4 (Nocona)
-C Intel Atom 6
-C AMD K6 ?
-C AMD K7 2.5
-C AMD K8
-
-C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32
-C processors. It uses 2*3-way unrolling, for good reasons. Unfortunately,
-C that means we need an initial magic multiply.
-C
-C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern. We
-C cannot do rsblsh1_n since we feed carry from the shift blocks to the
-C add/subtract blocks, which is right for addition but reversed for
-C subtraction. We could perhaps do sublsh1_n, with some extra move insns,
-C without losing any time, since we're not issue limited but carry recurrency
-C latency.
-C
-C Breaking carry recurrency might be a good idea. We would then need separate
-C registers for the shift carry and add/subtract carry, which in turn would
-C force us to 2*2-way unrolling.
-
-defframe(PARAM_SIZE, 16)
-defframe(PARAM_DBLD, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-dnl re-use parameter space
-define(VAR_COUNT,`PARAM_DST')
-define(VAR_TMP,`PARAM_DBLD')
-
-ASM_START()
- TEXT
- ALIGN(8)
-PROLOGUE(mpn_addlsh1_n)
-deflit(`FRAME',0)
-
-define(`rp', `%edi')
-define(`up', `%esi')
-define(`vp', `%ebp')
-
- mov $0x2aaaaaab, %eax
-
- push %ebx FRAME_pushl()
- mov PARAM_SIZE, %ebx C size
-
- push rp FRAME_pushl()
- mov PARAM_DST, rp
-
- mul %ebx
-
- push up FRAME_pushl()
- mov PARAM_SRC, up
-
- not %edx C count = -(size\6)-1
- mov %edx, VAR_COUNT
-
- push vp FRAME_pushl()
- mov PARAM_DBLD, vp
-
- lea 3(%edx,%edx,2), %ecx C count*3+3 = -(size\6)*3
- xor %edx, %edx
- lea (%ebx,%ecx,2), %ebx C size + (count*3+3)*2 = size % 6
- or %ebx, %ebx
- jz L(exact)
-
-L(oop):
-ifdef(`CPU_P6',`
- shr %edx ') C restore 2nd saved carry bit
- mov (vp), %eax
- adc %eax, %eax
- rcr %edx C restore 1st saved carry bit
- lea 4(vp), vp
- adc (up), %eax
- lea 4(up), up
- adc %edx, %edx C save a carry bit in edx
-ifdef(`CPU_P6',`
- adc %edx, %edx ') C save another carry bit in edx
- dec %ebx
- mov %eax, (rp)
- lea 4(rp), rp
- jnz L(oop)
- mov vp, VAR_TMP
-L(exact):
- incl VAR_COUNT
- jz L(end)
-
- ALIGN(16)
-L(top):
-ifdef(`CPU_P6',`
- shr %edx ') C restore 2nd saved carry bit
- mov (vp), %eax
- adc %eax, %eax
- mov 4(vp), %ebx
- adc %ebx, %ebx
- mov 8(vp), %ecx
- adc %ecx, %ecx
-
- rcr %edx C restore 1st saved carry bit
-
- adc (up), %eax
- mov %eax, (rp)
- adc 4(up), %ebx
- mov %ebx, 4(rp)
- adc 8(up), %ecx
- mov %ecx, 8(rp)
-
- mov 12(vp), %eax
- adc %eax, %eax
- mov 16(vp), %ebx
- adc %ebx, %ebx
- mov 20(vp), %ecx
- adc %ecx, %ecx
-
- lea 24(vp), vp
- adc %edx, %edx C save a carry bit in edx
-
- adc 12(up), %eax
- mov %eax, 12(rp)
- adc 16(up), %ebx
- mov %ebx, 16(rp)
- adc 20(up), %ecx
-
- lea 24(up), up
-
-ifdef(`CPU_P6',`
- adc %edx, %edx ') C save another carry bit in edx
- mov %ecx, 20(rp)
- incl VAR_COUNT
- lea 24(rp), rp
- jne L(top)
-
-L(end):
- pop vp FRAME_popl()
- pop up FRAME_popl()
-
-ifdef(`CPU_P6',`
- xor %eax, %eax
- shr $1, %edx
- adc %edx, %eax
-',`
- adc $0, %edx
- mov %edx, %eax
-')
- pop rp FRAME_popl()
- pop %ebx FRAME_popl()
- ret
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86/k7/aors_n.asm b/gmp/mpn/x86/k7/aors_n.asm
index 1a08072029..d84de3ee98 100644
--- a/gmp/mpn/x86/k7/aors_n.asm
+++ b/gmp/mpn/x86/k7/aors_n.asm
@@ -1,32 +1,21 @@
dnl AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
-dnl Copyright 1999-2003 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k7/aorsmul_1.asm b/gmp/mpn/x86/k7/aorsmul_1.asm
index eec8df6de2..b247c29131 100644
--- a/gmp/mpn/x86/k7/aorsmul_1.asm
+++ b/gmp/mpn/x86/k7/aorsmul_1.asm
@@ -1,49 +1,39 @@
dnl AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
-dnl Copyright 1999-2002, 2005, 2008 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002, 2005, 2008 Free Software Foundation,
+dnl Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C P5
-C P6 model 0-8,10-12
-C P6 model 9 (Banias) 6.5
+C cycles/limb
+C P5:
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
C P6 model 13 (Dothan)
C P4 model 0 (Willamette)
C P4 model 1 (?)
C P4 model 2 (Northwood)
C P4 model 3 (Prescott)
C P4 model 4 (Nocona)
-C AMD K6
-C AMD K7 3.75
-C AMD K8
+C K6:
+C K7: 3.75
+C K8:
C TODO
C * Improve feed-in and wind-down code. We beat the old code for all n != 1,
diff --git a/gmp/mpn/x86/k7/bdiv_q_1.asm b/gmp/mpn/x86/k7/bdiv_q_1.asm
deleted file mode 100644
index df3477f539..0000000000
--- a/gmp/mpn/x86/k7/bdiv_q_1.asm
+++ /dev/null
@@ -1,244 +0,0 @@
-dnl AMD K7 mpn_bdiv_q_1 -- mpn by limb exact division.
-
-dnl Rearranged from mpn/x86/k7/dive_1.asm by Marco Bodrato.
-
-dnl Copyright 2001, 2002, 2004, 2007, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb
-C Athlon: 11.0
-C Hammer: 9.0
-
-
-C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t divisor);
-C
-C The dependent chain is mul+imul+sub for 11 cycles and that speed is
-C achieved with no special effort. The load and shrld latencies are hidden
-C by out of order execution.
-C
-C It's a touch faster on size==1 to use the mul-by-inverse than divl.
-
-defframe(PARAM_SHIFT, 24)
-defframe(PARAM_INVERSE,20)
-defframe(PARAM_DIVISOR,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-defframe(SAVE_EBX, -4)
-defframe(SAVE_ESI, -8)
-defframe(SAVE_EDI, -12)
-defframe(SAVE_EBP, -16)
-defframe(VAR_INVERSE, -20)
-defframe(VAR_DST_END, -24)
-
-deflit(STACK_SPACE, 24)
-
- TEXT
-
-C mp_limb_t
-C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
-C mp_limb_t inverse, int shift)
- ALIGN(16)
-PROLOGUE(mpn_pi1_bdiv_q_1)
-deflit(`FRAME',0)
-
- subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
- movl PARAM_SHIFT, %ecx C shift count
-
- movl %ebp, SAVE_EBP
- movl PARAM_SIZE, %ebp
-
- movl %esi, SAVE_ESI
- movl PARAM_SRC, %esi
-
- movl %edi, SAVE_EDI
- movl PARAM_DST, %edi
-
- movl %ebx, SAVE_EBX
-
- leal (%esi,%ebp,4), %esi C src end
- leal (%edi,%ebp,4), %edi C dst end
- negl %ebp C -size
-
- movl PARAM_INVERSE, %eax C inv
-
-L(common):
- movl %eax, VAR_INVERSE
- movl (%esi,%ebp,4), %eax C src[0]
-
- incl %ebp
- jz L(one)
-
- movl (%esi,%ebp,4), %edx C src[1]
-
- shrdl( %cl, %edx, %eax)
-
- movl %edi, VAR_DST_END
- xorl %ebx, %ebx
- jmp L(entry)
-
- ALIGN(8)
-L(top):
- C eax q
- C ebx carry bit, 0 or 1
- C ecx shift
- C edx
- C esi src end
- C edi dst end
- C ebp counter, limbs, negative
-
- mull PARAM_DIVISOR C carry limb in edx
-
- movl -4(%esi,%ebp,4), %eax
- movl (%esi,%ebp,4), %edi
-
- shrdl( %cl, %edi, %eax)
-
- subl %ebx, %eax C apply carry bit
- setc %bl
- movl VAR_DST_END, %edi
-
- subl %edx, %eax C apply carry limb
- adcl $0, %ebx
-
-L(entry):
- imull VAR_INVERSE, %eax
-
- movl %eax, -4(%edi,%ebp,4)
- incl %ebp
- jnz L(top)
-
-
- mull PARAM_DIVISOR C carry limb in edx
-
- movl -4(%esi), %eax C src high limb
- shrl %cl, %eax
- movl SAVE_ESI, %esi
-
- subl %ebx, %eax C apply carry bit
- movl SAVE_EBX, %ebx
- movl SAVE_EBP, %ebp
-
- subl %edx, %eax C apply carry limb
-
- imull VAR_INVERSE, %eax
-
- movl %eax, -4(%edi)
- movl SAVE_EDI, %edi
- addl $STACK_SPACE, %esp
-
- ret
-
-L(one):
- shrl %cl, %eax
- movl SAVE_ESI, %esi
- movl SAVE_EBX, %ebx
-
- imull VAR_INVERSE, %eax
-
- movl SAVE_EBP, %ebp
-
- movl %eax, -4(%edi)
- movl SAVE_EDI, %edi
- addl $STACK_SPACE, %esp
-
- ret
-EPILOGUE()
-
-C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t divisor);
-C
-
- ALIGN(16)
-PROLOGUE(mpn_bdiv_q_1)
-deflit(`FRAME',0)
-
- movl PARAM_DIVISOR, %eax
- subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
- movl $-1, %ecx C shift count
-
- movl %ebp, SAVE_EBP
- movl PARAM_SIZE, %ebp
-
- movl %esi, SAVE_ESI
- movl %edi, SAVE_EDI
-
- C If there's usually only one or two trailing zero bits then this
- C should be faster than bsfl.
-L(strip_twos):
- incl %ecx
- shrl %eax
- jnc L(strip_twos)
-
- movl %ebx, SAVE_EBX
- leal 1(%eax,%eax), %ebx C d without twos
- andl $127, %eax C d/2, 7 bits
-
-ifdef(`PIC',`
- LEA( binvert_limb_table, %edx)
- movzbl (%eax,%edx), %eax C inv 8 bits
-',`
- movzbl binvert_limb_table(%eax), %eax C inv 8 bits
-')
-
- leal (%eax,%eax), %edx C 2*inv
- movl %ebx, PARAM_DIVISOR C d without twos
-
- imull %eax, %eax C inv*inv
-
- movl PARAM_SRC, %esi
- movl PARAM_DST, %edi
-
- imull %ebx, %eax C inv*inv*d
-
- subl %eax, %edx C inv = 2*inv - inv*inv*d
- leal (%edx,%edx), %eax C 2*inv
-
- imull %edx, %edx C inv*inv
-
- leal (%esi,%ebp,4), %esi C src end
- leal (%edi,%ebp,4), %edi C dst end
- negl %ebp C -size
-
- imull %ebx, %edx C inv*inv*d
-
- subl %edx, %eax C inv = 2*inv - inv*inv*d
-
- ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
- pushl %eax FRAME_pushl()
- imull PARAM_DIVISOR, %eax
- cmpl $1, %eax
- popl %eax FRAME_popl()')
-
- jmp L(common)
-EPILOGUE()
diff --git a/gmp/mpn/x86/k7/dive_1.asm b/gmp/mpn/x86/k7/dive_1.asm
index 8eb4f45ac0..c994e0fb06 100644
--- a/gmp/mpn/x86/k7/dive_1.asm
+++ b/gmp/mpn/x86/k7/dive_1.asm
@@ -1,32 +1,21 @@
dnl AMD K7 mpn_divexact_1 -- mpn by limb exact division.
dnl Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -116,7 +105,7 @@ ifdef(`PIC',`
subl %edx, %eax C inv = 2*inv - inv*inv*d
- ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
pushl %eax FRAME_pushl()
imull PARAM_DIVISOR, %eax
cmpl $1, %eax
diff --git a/gmp/mpn/x86/k7/gcd_1.asm b/gmp/mpn/x86/k7/gcd_1.asm
index c7d12c83c0..f912f43730 100644
--- a/gmp/mpn/x86/k7/gcd_1.asm
+++ b/gmp/mpn/x86/k7/gcd_1.asm
@@ -1,186 +1,369 @@
-dnl x86 mpn_gcd_1 optimised for AMD K7.
+dnl AMD K7 mpn_gcd_1 -- mpn by 1 gcd.
-dnl Contributed to the GNU project by Kevin Ryde. Rehacked by Torbjorn
-dnl Granlund.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
+dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/bit (approx)
-C AMD K7 5.31
-C AMD K8,K9 5.33
-C AMD K10 5.30
-C AMD bd1 ?
-C AMD bobcat 7.02
-C Intel P4-2 10.1
-C Intel P4-3/4 10.0
-C Intel P6/13 5.88
-C Intel core2 6.26
-C Intel NHM 6.83
-C Intel SBR 8.50
-C Intel atom 8.90
-C VIA nano ?
-C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
-
-C TODO
-C * Tune overhead, this takes 2-3 cycles more than old code when v0 is tiny.
-C * Stream things better through registers, avoiding some copying.
-
-C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
+C K7: 6.75 cycles/bit (approx) 1x1 gcd
+C 11.0 cycles/limb Nx1 reduction (modexact_1_odd)
+
+
+dnl Reduce using x%y if x is more than DIV_THRESHOLD bits bigger than y,
+dnl where x is the larger of the two. See tune/README for more.
+dnl
+dnl divl at 40 cycles compared to the gcd at about 7 cycles/bitpair
+dnl suggests 40/7*2=11.4 but 7 seems to be about right.
+
+deflit(DIV_THRESHOLD, 7)
+
+C table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
+C
+C This is mixed in with the code, but as per the k7 optimization manual it's
+C a full cache line and suitably aligned so it won't get swapped between
+C code and data. Having it in TEXT rather than RODATA saves needing a GOT
+C entry when PIC.
+C
+C Actually, there doesn't seem to be a measurable difference between this in
+C its own cache line or plonked in the middle of the code. Presumably
+C since TEXT is read-only there's no worries about coherency.
+
+deflit(MASK, 63)
deflit(MAXSHIFT, 6)
-deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
-DEF_OBJECT(ctz_table,64)
+ TEXT
+ ALIGN(64)
+L(table):
.byte MAXSHIFT
forloop(i,1,MASK,
` .byte m4_count_trailing_zeros(i)
')
-END_OBJECT(ctz_table)
-C Threshold of when to call bmod when U is one limb. Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`DIV_THRES_LOG2', 7)
+C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t limb);
+C
+
+defframe(PARAM_LIMB, 12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
-define(`up', `%edi')
-define(`n', `%esi')
-define(`v0', `%edx')
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+defframe(CALL_DIVISOR,-20)
+defframe(CALL_SIZE, -24)
+defframe(CALL_SRC, -28)
+deflit(STACK_SPACE, 28)
-ASM_START()
TEXT
ALIGN(16)
+
PROLOGUE(mpn_gcd_1)
- push %edi
- push %esi
+deflit(`FRAME',0)
+
+ ASSERT(ne, `cmpl $0, PARAM_LIMB') C y!=0
+ ASSERT(ae, `cmpl $1, PARAM_SIZE') C size>=1
+
+ movl PARAM_SRC, %eax
+ movl PARAM_LIMB, %edx
+ subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
- mov 12(%esp), up
- mov 16(%esp), n
- mov 20(%esp), v0
+ movl %esi, SAVE_ESI
+ movl %ebx, SAVE_EBX
- mov (up), %eax C U low limb
- or v0, %eax C x | y
- mov $-1, %ecx
+ movl (%eax), %esi C src low limb
+
+ifdef(`PIC',`
+ movl %edi, SAVE_EDI
+ call L(movl_eip_to_edi)
+L(here):
+ addl $L(table)-L(here), %edi
+')
+
+ movl %esi, %ebx
+ orl %edx, %esi C x|y
+ movl $-1, %ecx
L(twos):
- inc %ecx
- shr %eax
- jnc L(twos)
+ incl %ecx
+ shrl %esi
+ jnc L(twos) C 3/4 chance of x or y odd already
- shr %cl, v0
- mov %ecx, %eax C common twos
+ shrl %cl, %ebx
+ shrl %cl, %edx
+ movl %ecx, %esi C common twos
-L(divide_strip_y):
- shr v0
- jnc L(divide_strip_y)
- adc v0, v0
-
- push %eax
- push v0
-
- cmp $1, n
- jnz L(reduce_nby1)
-
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
- mov (up), %ecx
- mov %ecx, %eax
- shr $DIV_THRES_LOG2, %ecx
- cmp %ecx, v0
- ja L(reduced)
-
- mov v0, %esi
- xor %edx, %edx
- div %esi
- mov %edx, %eax
- jmp L(reduced)
-
-L(reduce_nby1):
-ifdef(`PIC_WITH_EBX',`
- push %ebx
- call L(movl_eip_to_ebx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
+ movl PARAM_SIZE, %ecx
+ cmpl $1, %ecx
+ ja L(divide)
+
+
+ C eax
+ C ebx x
+ C ecx
+ C edx y
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp
+
+ movl %edx, %eax
+ cmpl %ebx, %edx
+
+ cmovb( %ebx, %eax) C swap to make x bigger than y
+ cmovb( %edx, %ebx)
+
+
+L(strip_y):
+ C eax x
+ C ebx y
+ C ecx
+ C edx
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp
+
+ ASSERT(nz,`orl %ebx,%ebx')
+ shrl %ebx
+ jnc L(strip_y)
+ rcll %ebx
+
+
+ C eax x
+ C ebx y (odd)
+ C ecx
+ C edx
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp
+
+ movl %eax, %ecx
+ movl %ebx, %edx
+ shrl $DIV_THRESHOLD, %eax
+
+ cmpl %eax, %ebx
+ movl %ecx, %eax
+ ja L(strip_x_entry) C do x%y if x much bigger than y
+
+
+ xorl %edx, %edx
+
+ divl %ebx
+
+ orl %edx, %edx
+ movl %edx, %eax C remainder -> x
+ movl %ebx, %edx C y
+
+ jz L(done_ebx)
+ jmp L(strip_x)
+
+
+ C Offset 0x9D here for non-PIC. About 0.4 cycles/bit is saved by
+ C ensuring the end of the jnz at the end of this loop doesn't cross
+ C into the next cache line at 0xC0.
+ C
+ C PIC on the other hand is offset 0xAC here and extends to 0xC9, so
+ C it crosses but doesn't suffer any measurable slowdown.
+
+L(top):
+ C eax x
+ C ebx y-x
+ C ecx x-y
+ C edx y
+ C esi twos, for use at end
+ C edi [PIC] L(table)
+
+ cmovc( %ebx, %ecx) C if x-y gave carry, use x and y-x
+ cmovc( %eax, %edx)
+
+L(strip_x):
+ movl %ecx, %eax
+L(strip_x_entry):
+ andl $MASK, %ecx
+
+ ASSERT(nz, `orl %eax, %eax')
+
+ifdef(`PIC',`
+ movb (%ecx,%edi), %cl
+',`
+ movb L(table) (%ecx), %cl
')
- push v0 C param 3
- push n C param 2
- push up C param 1
- cmp $BMOD_1_TO_MOD_1_THRESHOLD, n
- jl L(bmod)
- CALL( mpn_mod_1)
- jmp L(called)
-L(bmod):
- CALL( mpn_modexact_1_odd)
-
-L(called):
- add $12, %esp C deallocate params
-ifdef(`PIC_WITH_EBX',`
- pop %ebx
+
+ shrl %cl, %eax
+ cmpb $MAXSHIFT, %cl
+
+ movl %eax, %ecx
+ movl %edx, %ebx
+ je L(strip_x)
+
+ ASSERT(nz, `testl $1, %eax') C both odd
+ ASSERT(nz, `testl $1, %edx')
+
+ subl %eax, %ebx
+ subl %edx, %ecx
+ jnz L(top)
+
+
+L(done):
+ movl %esi, %ecx
+ movl SAVE_ESI, %esi
+ifdef(`PIC',`
+ movl SAVE_EDI, %edi
')
-L(reduced):
- pop %edx
-
- LEA( ctz_table, %esi)
- test %eax, %eax
- mov %eax, %ecx
- jnz L(mid)
- jmp L(end)
-
- ALIGN(16) C K8 BC P4 NHM SBR
-L(top): cmovc( %ecx, %eax) C if x-y < 0 0
- cmovc( %edi, %edx) C use x,y-x 0
-L(mid): and $MASK, %ecx C 0
- movzbl (%esi,%ecx), %ecx C 1
- jz L(shift_alot) C 1
- shr %cl, %eax C 3
- mov %eax, %edi C 4
- mov %edx, %ecx C 3
- sub %eax, %ecx C 4
- sub %edx, %eax C 4
- jnz L(top) C 5
-
-L(end): pop %ecx
- mov %edx, %eax
- shl %cl, %eax
- pop %esi
- pop %edi
- ret
-L(shift_alot):
- shr $MAXSHIFT, %eax
- mov %eax, %ecx
- jmp L(mid)
+ shll %cl, %eax
+ movl SAVE_EBX, %ebx
+ addl $FRAME, %esp
-ifdef(`PIC_WITH_EBX',`
-L(movl_eip_to_ebx):
- mov (%esp), %ebx
ret
+
+
+
+C -----------------------------------------------------------------------------
+C two or more limbs
+
+dnl MODEXACT_THRESHOLD is the size at which it's better to call
+dnl mpn_modexact_1_odd than do an inline loop.
+
+deflit(MODEXACT_THRESHOLD, ifdef(`PIC',6,5))
+
+L(divide):
+ C eax src
+ C ebx
+ C ecx size
+ C edx y
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp
+
+L(divide_strip_y):
+ ASSERT(nz,`orl %edx,%edx')
+ shrl %edx
+ jnc L(divide_strip_y)
+ leal 1(%edx,%edx), %ebx C y now odd
+
+ movl %ebp, SAVE_EBP
+ movl %eax, %ebp
+ movl -4(%eax,%ecx,4), %eax C src high limb
+
+ cmp $MODEXACT_THRESHOLD, %ecx
+ jae L(modexact)
+
+ cmpl %ebx, %eax C high cmp divisor
+ movl $0, %edx
+
+ cmovc( %eax, %edx) C skip a div if high<divisor
+ sbbl $0, %ecx
+
+
+L(divide_top):
+ C eax scratch (quotient)
+ C ebx y
+ C ecx counter (size to 1, inclusive)
+ C edx carry (remainder)
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp src
+
+ movl -4(%ebp,%ecx,4), %eax
+
+ divl %ebx
+
+ decl %ecx
+ jnz L(divide_top)
+
+
+ C eax
+ C ebx y (odd)
+ C ecx
+ C edx x
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp
+
+ orl %edx, %edx
+ movl SAVE_EBP, %ebp
+ movl %edx, %eax
+
+ movl %edx, %ecx
+ movl %ebx, %edx
+ jnz L(strip_x_entry)
+
+
+L(done_ebx):
+ movl %ebx, %eax
+ jmp L(done)
+
+
+
+L(modexact):
+ C eax
+ C ebx y
+ C ecx size
+ C edx
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp src
+
+ifdef(`PIC',`
+ movl %ebp, CALL_SRC
+ movl %ebx, %ebp C y
+ movl %edi, %ebx C L(table)
+
+ addl $_GLOBAL_OFFSET_TABLE_+[.-L(table)], %ebx
+ movl %ebp, CALL_DIVISOR
+ movl %ecx, CALL_SIZE
+
+ call GSYM_PREFIX`'mpn_modexact_1_odd@PLT
+',`
+dnl non-PIC
+ movl %ebx, CALL_DIVISOR
+ movl %ebp, CALL_SRC
+ movl %ecx, CALL_SIZE
+
+ call GSYM_PREFIX`'mpn_modexact_1_odd
')
+
+ C eax x
+ C ebx [non-PIC] y
+ C ecx
+ C edx
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp [PIC] y
+
+ orl %eax, %eax
+ movl ifdef(`PIC',`%ebp',`%ebx'), %edx
+ movl SAVE_EBP, %ebp
+
+ movl %eax, %ecx
+ jnz L(strip_x_entry)
+
+ movl %edx, %eax
+ jmp L(done)
+
+
+ifdef(`PIC', `
+L(movl_eip_to_edi):
+ movl (%esp), %edi
+ ret_internal
+')
+
EPILOGUE()
diff --git a/gmp/mpn/x86/k7/gmp-mparam.h b/gmp/mpn/x86/k7/gmp-mparam.h
index 9977a113e2..ced0c020f7 100644
--- a/gmp/mpn/x86/k7/gmp-mparam.h
+++ b/gmp/mpn/x86/k7/gmp-mparam.h
@@ -1,241 +1,73 @@
/* AMD K7 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 2000-2005, 2008-2010, 2014 Free Software
-Foundation, Inc.
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2008 Free
+Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-or
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
-or both in parallel, as here.
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 2083 MHz K7 Barton */
-/* FFT tuning limit = 25000000 */
-/* Generated by tuneup.c, 2014-03-13, gcc 4.2 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 3
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 24
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD 3
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 24
-
-#define MUL_TOOM22_THRESHOLD 28
-#define MUL_TOOM33_THRESHOLD 85
-#define MUL_TOOM44_THRESHOLD 147
-#define MUL_TOOM6H_THRESHOLD 216
-#define MUL_TOOM8H_THRESHOLD 309
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 85
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 99
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 98
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 102
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 124
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 50
-#define SQR_TOOM3_THRESHOLD 81
-#define SQR_TOOM4_THRESHOLD 216
-#define SQR_TOOM6_THRESHOLD 306
-#define SQR_TOOM8_THRESHOLD 446
-
-#define MULMID_TOOM42_THRESHOLD 56
-
-#define MULMOD_BNM1_THRESHOLD 17
-#define SQRMOD_BNM1_THRESHOLD 17
-
-#define MUL_FFT_MODF_THRESHOLD 904 /* k = 6 */
-#define MUL_FFT_TABLE3 \
- { { 904, 6}, { 21, 7}, { 11, 6}, { 25, 7}, \
- { 13, 6}, { 27, 7}, { 15, 6}, { 31, 7}, \
- { 17, 6}, { 35, 7}, { 19, 6}, { 39, 7}, \
- { 23, 6}, { 47, 7}, { 27, 8}, { 15, 7}, \
- { 31, 6}, { 63, 7}, { 35, 8}, { 19, 7}, \
- { 39, 8}, { 23, 7}, { 47, 8}, { 31, 7}, \
- { 63, 8}, { 39, 7}, { 79, 9}, { 23, 8}, \
- { 47, 7}, { 95, 8}, { 51, 9}, { 31, 8}, \
- { 71, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
- { 95, 9}, { 55,10}, { 31, 9}, { 63, 8}, \
- { 127, 9}, { 71, 8}, { 143, 9}, { 79, 8}, \
- { 159,10}, { 47, 9}, { 95, 8}, { 191, 9}, \
- { 103,11}, { 31,10}, { 63, 9}, { 127, 8}, \
- { 255, 9}, { 143,10}, { 79, 9}, { 167,10}, \
- { 95, 9}, { 199,10}, { 111,11}, { 63,10}, \
- { 127, 9}, { 255,10}, { 143, 9}, { 287,10}, \
- { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \
- { 383,10}, { 207,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 511,10}, { 271, 8}, { 1087,10}, \
- { 287,11}, { 159,10}, { 319, 9}, { 639,11}, \
- { 191,10}, { 383, 9}, { 767, 8}, { 1535, 9}, \
- { 799, 8}, { 1599,11}, { 223,12}, { 127,11}, \
- { 255,10}, { 511, 9}, { 1023,10}, { 543, 9}, \
- { 1087,11}, { 287,10}, { 575, 9}, { 1151,10}, \
- { 607, 9}, { 1215, 8}, { 2431,11}, { 319,10}, \
- { 639, 9}, { 1279,10}, { 671, 9}, { 1343,12}, \
- { 191,11}, { 383,10}, { 767, 9}, { 1535,10}, \
- { 799, 9}, { 1599,10}, { 831, 9}, { 1663,10}, \
- { 863,13}, { 127,12}, { 255,11}, { 511,10}, \
- { 1023,11}, { 543,10}, { 1087,11}, { 575,10}, \
- { 1151,11}, { 607,10}, { 1215, 9}, { 2431,12}, \
- { 319,11}, { 639,10}, { 1407,11}, { 735,10}, \
- { 1471, 9}, { 2943,12}, { 383,11}, { 767,10}, \
- { 1535,11}, { 799,10}, { 1599,11}, { 831,10}, \
- { 1663,11}, { 895,10}, { 1791,11}, { 959,10}, \
- { 1919,13}, { 255,12}, { 511,11}, { 1023,10}, \
- { 2047,11}, { 1087,12}, { 575,11}, { 1151,10}, \
- { 2303,11}, { 1215,10}, { 2431,12}, { 639,11}, \
- { 1279,10}, { 2559,11}, { 1407,10}, { 2815,11}, \
- { 1471,10}, { 2943,13}, { 383,12}, { 767,11}, \
- { 1599,12}, { 831,11}, { 1663,12}, { 895,11}, \
- { 1791,10}, { 3583,12}, { 959,11}, { 1919,10}, \
- { 3839,14}, { 255,13}, { 511,12}, { 1023,11}, \
- { 2047,12}, { 1087,11}, { 2175,12}, { 1151,11}, \
- { 2303,12}, { 1215,11}, { 2431,13}, { 639,12}, \
- { 1407,11}, { 2815,12}, { 1471,11}, { 2943,13}, \
- { 767,12}, { 1663,11}, { 3327,13}, { 895,12}, \
- { 1791,11}, { 3583,12}, { 1919,11}, { 3839,12}, \
- { 1983,11}, { 3967,14}, { 511,13}, { 1023,12}, \
- { 2239,13}, { 1151,12}, { 2495,13}, { 1279,12}, \
- { 2559,13}, { 1407,12}, { 2943,11}, { 5887,14}, \
- { 767,13}, { 1535,12}, { 3071,13}, { 1663,12}, \
- { 3327,13}, { 1791,12}, { 3583,13}, { 1919,12}, \
- { 3967,15}, { 511,14}, { 1023,13}, { 2047,12}, \
- { 4095,13}, { 2175,12}, { 4351,13}, { 2431,12}, \
- { 4863,14}, { 1279,13}, { 2559,12}, { 5119,13}, \
- { 2943,12}, { 5887,14}, { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 228
-#define MUL_FFT_THRESHOLD 7808
-
-#define SQR_FFT_MODF_THRESHOLD 888 /* k = 6 */
-#define SQR_FFT_TABLE3 \
- { { 888, 6}, { 21, 7}, { 11, 6}, { 25, 7}, \
- { 13, 6}, { 27, 7}, { 15, 6}, { 31, 7}, \
- { 17, 6}, { 35, 7}, { 19, 6}, { 39, 7}, \
- { 23, 6}, { 47, 7}, { 27, 8}, { 15, 7}, \
- { 31, 6}, { 63, 7}, { 35, 8}, { 19, 7}, \
- { 39, 8}, { 23, 7}, { 47, 8}, { 31, 7}, \
- { 63, 8}, { 39, 9}, { 23, 8}, { 47, 7}, \
- { 95, 8}, { 51, 9}, { 31, 8}, { 67, 9}, \
- { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \
- { 55,10}, { 31, 9}, { 63, 8}, { 127, 9}, \
- { 79,10}, { 47, 9}, { 95, 8}, { 191,11}, \
- { 31,10}, { 63, 9}, { 127, 8}, { 255, 9}, \
- { 143,10}, { 79, 9}, { 167,10}, { 95, 9}, \
- { 191,10}, { 111,11}, { 63,10}, { 127, 9}, \
- { 255, 8}, { 511,10}, { 143, 9}, { 287, 8}, \
- { 575,10}, { 159,11}, { 95,10}, { 191, 9}, \
- { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \
- { 511,10}, { 271, 9}, { 543, 8}, { 1087,10}, \
- { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \
- { 639, 8}, { 1279, 9}, { 671,11}, { 191,10}, \
- { 383, 9}, { 799, 8}, { 1599, 9}, { 831,11}, \
- { 223,12}, { 127,11}, { 255,10}, { 543, 9}, \
- { 1087,11}, { 287,10}, { 575, 9}, { 1215, 8}, \
- { 2431,11}, { 319,10}, { 639, 9}, { 1279,10}, \
- { 671, 9}, { 1407,12}, { 191,10}, { 799, 9}, \
- { 1599,10}, { 831, 9}, { 1663,10}, { 863, 9}, \
- { 1727,11}, { 447,13}, { 127,12}, { 255,11}, \
- { 511,10}, { 1023,11}, { 543,10}, { 1087, 9}, \
- { 2175,10}, { 1119,11}, { 575,10}, { 1151,11}, \
- { 607,10}, { 1215, 9}, { 2431,12}, { 319,11}, \
- { 639,10}, { 1279,11}, { 671,10}, { 1343, 9}, \
- { 2687,11}, { 703,10}, { 1407,11}, { 735,10}, \
- { 1471, 9}, { 2943,10}, { 1503,12}, { 383,11}, \
- { 767,10}, { 1535,11}, { 799,10}, { 1599,11}, \
- { 863,10}, { 1727,12}, { 447,11}, { 895,10}, \
- { 1791,11}, { 959,10}, { 1919,13}, { 255,12}, \
- { 511,11}, { 1023,10}, { 2047,11}, { 1087,10}, \
- { 2175,11}, { 1119,12}, { 575,11}, { 1151,10}, \
- { 2303,11}, { 1215,10}, { 2431,12}, { 639,11}, \
- { 1407,10}, { 2815,11}, { 1471,10}, { 2943,12}, \
- { 767,11}, { 1599,12}, { 831,11}, { 1663,10}, \
- { 3327,12}, { 895,11}, { 1791,10}, { 3583,12}, \
- { 959,11}, { 1919,10}, { 3839,11}, { 1983,14}, \
- { 255,13}, { 511,12}, { 1023,11}, { 2047,12}, \
- { 1087,11}, { 2175,12}, { 1151,11}, { 2303,12}, \
- { 1215,11}, { 2431,13}, { 639,12}, { 1407,11}, \
- { 2815,12}, { 1471,11}, { 2943,13}, { 767,12}, \
- { 1663,11}, { 3327,12}, { 1727,13}, { 895,12}, \
- { 1791,11}, { 3583,12}, { 1919,11}, { 3839,12}, \
- { 1983,11}, { 3967,14}, { 511,13}, { 1023,12}, \
- { 2175,13}, { 1151,12}, { 2495,13}, { 1279,12}, \
- { 2559,13}, { 1407,12}, { 2943,11}, { 5887,14}, \
- { 767,13}, { 1535,12}, { 3071,13}, { 1663,12}, \
- { 3327,13}, { 1791,12}, { 3583,13}, { 1919,12}, \
- { 3967,15}, { 511,14}, { 1023,13}, { 2047,12}, \
- { 4095,13}, { 2175,12}, { 4351,13}, { 2431,14}, \
- { 1279,13}, { 2943,12}, { 5887,14}, { 16384,15}, \
- { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 229
-#define SQR_FFT_THRESHOLD 7552
-
-#define MULLO_BASECASE_THRESHOLD 8
-#define MULLO_DC_THRESHOLD 36
-#define MULLO_MUL_N_THRESHOLD 13463
-
-#define DC_DIV_QR_THRESHOLD 45
-#define DC_DIVAPPR_Q_THRESHOLD 208
-#define DC_BDIV_QR_THRESHOLD 43
-#define DC_BDIV_Q_THRESHOLD 140
-
-#define INV_MULMOD_BNM1_THRESHOLD 62
-#define INV_NEWTON_THRESHOLD 204
-#define INV_APPR_THRESHOLD 204
-
-#define BINV_NEWTON_THRESHOLD 230
-#define REDC_1_TO_REDC_N_THRESHOLD 59
-
-#define MU_DIV_QR_THRESHOLD 1752
-#define MU_DIVAPPR_Q_THRESHOLD 1528
-#define MUPI_DIV_QR_THRESHOLD 82
-#define MU_BDIV_QR_THRESHOLD 1360
-#define MU_BDIV_Q_THRESHOLD 1470
-
-#define POWM_SEC_TABLE 1,16,102,336,1221
-
-#define MATRIX22_STRASSEN_THRESHOLD 16
-#define HGCD_THRESHOLD 120
-#define HGCD_APPR_THRESHOLD 143
-#define HGCD_REDUCE_THRESHOLD 4818
-#define GCD_DC_THRESHOLD 474
-#define GCDEXT_DC_THRESHOLD 345
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 15
-#define GET_STR_PRECOMPUTE_THRESHOLD 33
-#define SET_STR_DC_THRESHOLD 298
-#define SET_STR_PRECOMPUTE_THRESHOLD 1187
-
-#define FAC_DSC_THRESHOLD 602
-#define FAC_ODD_THRESHOLD 29
+/* 2083 MHz Athlon */
+
+/* Generated by tuneup.c, 2008-12-23, gcc 3.4 */
+
+#define MUL_KARATSUBA_THRESHOLD 28
+#define MUL_TOOM3_THRESHOLD 89
+#define MUL_TOOM44_THRESHOLD 130
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_KARATSUBA_THRESHOLD 52
+#define SQR_TOOM3_THRESHOLD 89
+#define SQR_TOOM4_THRESHOLD 196
+
+#define MULLOW_BASECASE_THRESHOLD 10
+#define MULLOW_DC_THRESHOLD 96
+#define MULLOW_MUL_N_THRESHOLD 234
+
+#define DIV_SB_PREINV_THRESHOLD 0 /* always */
+#define DIV_DC_THRESHOLD 86
+#define POWM_THRESHOLD 134
+#define MATRIX22_STRASSEN_THRESHOLD 18
+#define HGCD_THRESHOLD 163
+#define GCD_DC_THRESHOLD 665
+#define GCDEXT_DC_THRESHOLD 605
+#define JACOBI_BASE_METHOD 1
+
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define USE_PREINV_MOD_1 1 /* native */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */
+
+#define GET_STR_DC_THRESHOLD 19
+#define GET_STR_PRECOMPUTE_THRESHOLD 35
+#define SET_STR_DC_THRESHOLD 826
+#define SET_STR_PRECOMPUTE_THRESHOLD 1691
+
+#define MUL_FFT_TABLE { 432, 864, 1664, 4608, 10240, 40960, 163840, 655360, 0 }
+#define MUL_FFT_MODF_THRESHOLD 496
+#define MUL_FFT_THRESHOLD 4864
+
+#define SQR_FFT_TABLE { 432, 864, 1664, 4608, 10240, 40960, 98304, 655360, 0 }
+#define SQR_FFT_MODF_THRESHOLD 432
+#define SQR_FFT_THRESHOLD 3840
+
+/* These tables need to be updated. */
+
+#define MUL_FFT_TABLE2 {{1, 4}, {401, 5}, {801, 6}, {817, 5}, {865, 6}, {1025, 5}, {1057, 6}, {1601, 7}, {1633, 6}, {1729, 7}, {1921, 6}, {2113, 7}, {2177, 6}, {2241, 7}, {2433, 6}, {2497, 7}, {2945, 6}, {3009, 7}, {3457, 8}, {3521, 7}, {4481, 8}, {4865, 7}, {5249, 8}, {5889, 7}, {6017, 8}, {7553, 9}, {7681, 8}, {9985, 9}, {11777, 8}, {13057, 9}, {13825, 8}, {14081, 9}, {15873, 8}, {16641, 9}, {16897, 8}, {17153, 9}, {19969, 8}, {20225, 9}, {20737, 8}, {20993, 9}, {24065, 8}, {24577, 9}, {25089, 8}, {25345, 9}, {27393, 10}, {27649, 9}, {28161, 10}, {31745, 9}, {38913, 10}, {39425, 9}, {40449, 10}, {48129, 9}, {48641, 11}, {63489, 10}, {98305, 11}, {99329, 10}, {100353, 11}, {101377, 10}, {103425, 11}, {104449, 10}, {110593, 11}, {112641, 10}, {113665, 11}, {129025, 10}, {162817, 11}, {194561, 10}, {195585, 12}, {258049, 11}, {391169, 12}, {520193, 11}, {718849, 12}, {782337, 11}, {849921, 13}, {1040385, 12}, {2879489, 13}, {3137537, 12}, {3928065, 13}, {4186113, 12}, {4976641, 13}, {5234689, 12}, {6025217, 13}, {6283265, 12}, {MP_SIZE_T_MAX,0}}
+
+#define SQR_FFT_TABLE2 {{1, 4}, {401, 5}, {417, 4}, {433, 5}, {881, 6}, {961, 5}, {993, 6}, {1857, 7}, {1921, 6}, {2049, 7}, {2177, 6}, {2241, 7}, {2433, 6}, {2497, 7}, {3457, 8}, {3841, 7}, {4481, 8}, {4609, 7}, {4737, 8}, {4865, 7}, {5249, 8}, {5889, 7}, {6273, 8}, {7041, 9}, {7681, 8}, {9985, 9}, {10241, 8}, {10497, 9}, {11777, 8}, {13057, 9}, {15873, 8}, {16385, 9}, {16897, 8}, {17153, 9}, {19969, 8}, {20225, 9}, {20737, 8}, {20993, 9}, {24065, 8}, {24321, 9}, {24577, 10}, {24833, 9}, {25601, 10}, {27137, 9}, {27649, 10}, {31745, 9}, {38401, 10}, {38913, 9}, {40449, 10}, {48129, 9}, {48641, 11}, {63489, 10}, {99329, 11}, {101377, 10}, {103425, 11}, {104449, 10}, {107521, 11}, {110593, 10}, {113665, 11}, {129025, 10}, {154625, 11}, {155649, 10}, {162817, 11}, {194561, 12}, {258049, 11}, {391169, 12}, {520193, 11}, {718849, 12}, {727041, 11}, {729089, 12}, {782337, 11}, {849921, 13}, {1040385, 12}, {2879489, 13}, {3137537, 12}, {3928065, 13}, {4186113, 12}, {4714497, 13}, {5234689, 12}, {6025217, 13}, {6283265, 12}, {7073793, 13}, {7331841, 12}, {MP_SIZE_T_MAX,0}}
diff --git a/gmp/mpn/x86/k7/invert_limb.asm b/gmp/mpn/x86/k7/invert_limb.asm
deleted file mode 100644
index 6cce455a9d..0000000000
--- a/gmp/mpn/x86/k7/invert_limb.asm
+++ /dev/null
@@ -1,193 +0,0 @@
-dnl x86 mpn_invert_limb
-
-dnl Contributed to the GNU project by Niels Möller
-
-dnl Copyright 2009, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles (approx) div
-C P5 ?
-C P6 model 0-8,10-12 ?
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) ?
-C P4 model 0 (Willamette) ?
-C P4 model 1 (?) ?
-C P4 model 2 (Northwood) ?
-C P4 model 3 (Prescott) ?
-C P4 model 4 (Nocona) ?
-C AMD K6 ?
-C AMD K7 41 53
-C AMD K8 ?
-
-C TODO
-C * These c/l numbers are for a non-PIC build. Consider falling back to using
-C the 'div' instruction for PIC builds.
-C * Perhaps use this file--or at least the algorithm--for more machines than k7.
-
-C Register usage:
-C Input D in %edi
-C Current approximation is in %eax and/or %ecx
-C %ebx and %edx are temporaries
-C %esi and %ebp are unused
-
-defframe(PARAM_DIVISOR,4)
-
-ASM_START()
-
-C Make approx_tab global to work around Apple relocation bug.
-ifdef(`DARWIN',`
- deflit(`approx_tab', MPN(invert_limb_tab))
- GLOBL approx_tab')
-
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_invert_limb)
-deflit(`FRAME', 0)
- mov PARAM_DIVISOR, %eax
- C Avoid push/pop on k7.
- sub $8, %esp FRAME_subl_esp(8)
- mov %ebx, (%esp)
- mov %edi, 4(%esp)
-
- mov %eax, %edi
- shr $22, %eax
-ifdef(`PIC',`
- LEA( approx_tab, %ebx)
- movzwl -1024(%ebx, %eax, 2), %eax
-',`
- movzwl -1024+approx_tab(%eax, %eax), %eax C %eax = v0
-')
-
- C v1 = (v0 << 4) - ((v0*v0*d_21) >> 32) - 1
- mov %eax, %ecx
- imul %eax, %eax
- mov %edi, %ebx
- shr $11, %ebx
- inc %ebx
- mul %ebx
- mov %edi, %ebx C Prepare
- shr %ebx
- sbb %eax, %eax
- sub %eax, %ebx C %ebx = d_31, %eax = mask
- shl $4, %ecx
- dec %ecx
- sub %edx, %ecx C %ecx = v1
-
- C v_2 = (v1 << 15) + ((v1 *(2^48 - v1 * d31 + (v1 >> 1) & mask)) >> 33)
- imul %ecx, %ebx
- and %ecx, %eax
- shr %eax
- sub %ebx, %eax
- mul %ecx
- mov %edi, %eax C Prepare for next mul
- shl $15, %ecx
- shr %edx
- add %edx, %ecx C %ecx = v2
-
- mul %ecx
- add %edi, %eax
- mov %ecx, %eax
- adc %edi, %edx
- sub %edx, %eax C %eax = v3
-
- mov (%esp), %ebx
- mov 4(%esp), %edi
- add $8, %esp
-
- ret
-
-EPILOGUE()
-
-DEF_OBJECT(approx_tab,2)
- .value 0x7fe1,0x7fa1,0x7f61,0x7f22,0x7ee3,0x7ea4,0x7e65,0x7e27
- .value 0x7de9,0x7dab,0x7d6d,0x7d30,0x7cf3,0x7cb6,0x7c79,0x7c3d
- .value 0x7c00,0x7bc4,0x7b89,0x7b4d,0x7b12,0x7ad7,0x7a9c,0x7a61
- .value 0x7a27,0x79ec,0x79b2,0x7979,0x793f,0x7906,0x78cc,0x7894
- .value 0x785b,0x7822,0x77ea,0x77b2,0x777a,0x7742,0x770b,0x76d3
- .value 0x769c,0x7665,0x762f,0x75f8,0x75c2,0x758c,0x7556,0x7520
- .value 0x74ea,0x74b5,0x7480,0x744b,0x7416,0x73e2,0x73ad,0x7379
- .value 0x7345,0x7311,0x72dd,0x72aa,0x7277,0x7243,0x7210,0x71de
- .value 0x71ab,0x7179,0x7146,0x7114,0x70e2,0x70b1,0x707f,0x704e
- .value 0x701c,0x6feb,0x6fba,0x6f8a,0x6f59,0x6f29,0x6ef9,0x6ec8
- .value 0x6e99,0x6e69,0x6e39,0x6e0a,0x6ddb,0x6dab,0x6d7d,0x6d4e
- .value 0x6d1f,0x6cf1,0x6cc2,0x6c94,0x6c66,0x6c38,0x6c0a,0x6bdd
- .value 0x6bb0,0x6b82,0x6b55,0x6b28,0x6afb,0x6acf,0x6aa2,0x6a76
- .value 0x6a49,0x6a1d,0x69f1,0x69c6,0x699a,0x696e,0x6943,0x6918
- .value 0x68ed,0x68c2,0x6897,0x686c,0x6842,0x6817,0x67ed,0x67c3
- .value 0x6799,0x676f,0x6745,0x671b,0x66f2,0x66c8,0x669f,0x6676
- .value 0x664d,0x6624,0x65fc,0x65d3,0x65aa,0x6582,0x655a,0x6532
- .value 0x650a,0x64e2,0x64ba,0x6493,0x646b,0x6444,0x641c,0x63f5
- .value 0x63ce,0x63a7,0x6381,0x635a,0x6333,0x630d,0x62e7,0x62c1
- .value 0x629a,0x6275,0x624f,0x6229,0x6203,0x61de,0x61b8,0x6193
- .value 0x616e,0x6149,0x6124,0x60ff,0x60da,0x60b6,0x6091,0x606d
- .value 0x6049,0x6024,0x6000,0x5fdc,0x5fb8,0x5f95,0x5f71,0x5f4d
- .value 0x5f2a,0x5f07,0x5ee3,0x5ec0,0x5e9d,0x5e7a,0x5e57,0x5e35
- .value 0x5e12,0x5def,0x5dcd,0x5dab,0x5d88,0x5d66,0x5d44,0x5d22
- .value 0x5d00,0x5cde,0x5cbd,0x5c9b,0x5c7a,0x5c58,0x5c37,0x5c16
- .value 0x5bf5,0x5bd4,0x5bb3,0x5b92,0x5b71,0x5b51,0x5b30,0x5b10
- .value 0x5aef,0x5acf,0x5aaf,0x5a8f,0x5a6f,0x5a4f,0x5a2f,0x5a0f
- .value 0x59ef,0x59d0,0x59b0,0x5991,0x5972,0x5952,0x5933,0x5914
- .value 0x58f5,0x58d6,0x58b7,0x5899,0x587a,0x585b,0x583d,0x581f
- .value 0x5800,0x57e2,0x57c4,0x57a6,0x5788,0x576a,0x574c,0x572e
- .value 0x5711,0x56f3,0x56d5,0x56b8,0x569b,0x567d,0x5660,0x5643
- .value 0x5626,0x5609,0x55ec,0x55cf,0x55b2,0x5596,0x5579,0x555d
- .value 0x5540,0x5524,0x5507,0x54eb,0x54cf,0x54b3,0x5497,0x547b
- .value 0x545f,0x5443,0x5428,0x540c,0x53f0,0x53d5,0x53b9,0x539e
- .value 0x5383,0x5368,0x534c,0x5331,0x5316,0x52fb,0x52e0,0x52c6
- .value 0x52ab,0x5290,0x5276,0x525b,0x5240,0x5226,0x520c,0x51f1
- .value 0x51d7,0x51bd,0x51a3,0x5189,0x516f,0x5155,0x513b,0x5121
- .value 0x5108,0x50ee,0x50d5,0x50bb,0x50a2,0x5088,0x506f,0x5056
- .value 0x503c,0x5023,0x500a,0x4ff1,0x4fd8,0x4fbf,0x4fa6,0x4f8e
- .value 0x4f75,0x4f5c,0x4f44,0x4f2b,0x4f13,0x4efa,0x4ee2,0x4eca
- .value 0x4eb1,0x4e99,0x4e81,0x4e69,0x4e51,0x4e39,0x4e21,0x4e09
- .value 0x4df1,0x4dda,0x4dc2,0x4daa,0x4d93,0x4d7b,0x4d64,0x4d4d
- .value 0x4d35,0x4d1e,0x4d07,0x4cf0,0x4cd8,0x4cc1,0x4caa,0x4c93
- .value 0x4c7d,0x4c66,0x4c4f,0x4c38,0x4c21,0x4c0b,0x4bf4,0x4bde
- .value 0x4bc7,0x4bb1,0x4b9a,0x4b84,0x4b6e,0x4b58,0x4b41,0x4b2b
- .value 0x4b15,0x4aff,0x4ae9,0x4ad3,0x4abd,0x4aa8,0x4a92,0x4a7c
- .value 0x4a66,0x4a51,0x4a3b,0x4a26,0x4a10,0x49fb,0x49e5,0x49d0
- .value 0x49bb,0x49a6,0x4990,0x497b,0x4966,0x4951,0x493c,0x4927
- .value 0x4912,0x48fe,0x48e9,0x48d4,0x48bf,0x48ab,0x4896,0x4881
- .value 0x486d,0x4858,0x4844,0x482f,0x481b,0x4807,0x47f3,0x47de
- .value 0x47ca,0x47b6,0x47a2,0x478e,0x477a,0x4766,0x4752,0x473e
- .value 0x472a,0x4717,0x4703,0x46ef,0x46db,0x46c8,0x46b4,0x46a1
- .value 0x468d,0x467a,0x4666,0x4653,0x4640,0x462c,0x4619,0x4606
- .value 0x45f3,0x45e0,0x45cd,0x45ba,0x45a7,0x4594,0x4581,0x456e
- .value 0x455b,0x4548,0x4536,0x4523,0x4510,0x44fe,0x44eb,0x44d8
- .value 0x44c6,0x44b3,0x44a1,0x448f,0x447c,0x446a,0x4458,0x4445
- .value 0x4433,0x4421,0x440f,0x43fd,0x43eb,0x43d9,0x43c7,0x43b5
- .value 0x43a3,0x4391,0x437f,0x436d,0x435c,0x434a,0x4338,0x4327
- .value 0x4315,0x4303,0x42f2,0x42e0,0x42cf,0x42bd,0x42ac,0x429b
- .value 0x4289,0x4278,0x4267,0x4256,0x4244,0x4233,0x4222,0x4211
- .value 0x4200,0x41ef,0x41de,0x41cd,0x41bc,0x41ab,0x419a,0x418a
- .value 0x4179,0x4168,0x4157,0x4147,0x4136,0x4125,0x4115,0x4104
- .value 0x40f4,0x40e3,0x40d3,0x40c2,0x40b2,0x40a2,0x4091,0x4081
- .value 0x4071,0x4061,0x4050,0x4040,0x4030,0x4020,0x4010,0x4000
-END_OBJECT(approx_tab)
diff --git a/gmp/mpn/x86/k7/mmx/com.asm b/gmp/mpn/x86/k7/mmx/com_n.asm
index a258c224f1..068c01f076 100644
--- a/gmp/mpn/x86/k7/mmx/com.asm
+++ b/gmp/mpn/x86/k7/mmx/com_n.asm
@@ -1,32 +1,21 @@
-dnl AMD Athlon mpn_com -- mpn bitwise one's complement.
+dnl AMD Athlon mpn_com_n -- mpn bitwise one's complement.
dnl Copyright 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -34,7 +23,7 @@ include(`../config.m4')
C K7: 1.0 cycles/limb
-C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C void mpn_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C The loop form below is necessary for the claimed speed. It needs to be
C aligned to a 16 byte boundary and only 16 bytes long. Maybe that's so it
@@ -62,7 +51,7 @@ defframe(PARAM_DST, 4)
TEXT
ALIGN(16)
-PROLOGUE(mpn_com)
+PROLOGUE(mpn_com_n)
deflit(`FRAME',0)
movl PARAM_DST, %edx
diff --git a/gmp/mpn/x86/k7/mmx/copyd.asm b/gmp/mpn/x86/k7/mmx/copyd.asm
index 59ece40920..4601fcd75a 100644
--- a/gmp/mpn/x86/k7/mmx/copyd.asm
+++ b/gmp/mpn/x86/k7/mmx/copyd.asm
@@ -1,32 +1,21 @@
dnl AMD K7 mpn_copyd -- copy limb vector, decrementing.
dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k7/mmx/copyi.asm b/gmp/mpn/x86/k7/mmx/copyi.asm
index 9a28f927ec..a17d575ff4 100644
--- a/gmp/mpn/x86/k7/mmx/copyi.asm
+++ b/gmp/mpn/x86/k7/mmx/copyi.asm
@@ -1,32 +1,21 @@
dnl AMD K7 mpn_copyi -- copy limb vector, incrementing.
dnl Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k7/mmx/divrem_1.asm b/gmp/mpn/x86/k7/mmx/divrem_1.asm
index cf343280bb..fa5824c7b9 100644
--- a/gmp/mpn/x86/k7/mmx/divrem_1.asm
+++ b/gmp/mpn/x86/k7/mmx/divrem_1.asm
@@ -1,33 +1,22 @@
dnl AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb
dnl division.
-dnl Copyright 1999-2002, 2004 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002, 2004 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -456,7 +445,7 @@ C chain, and nothing better than 18 cycles has been found when using it.
C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
C be an extremely rare event.
C
-C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
+C Branch mispredictions will hit random occurrances of q1==0xFFFFFFFF, but
C if some special data is coming out with this always, the q1_ff special
C case actually runs at 15 c/l. 0x2FFF...FFFD divided by 3 is a good way to
C induce the q1_ff case, for speed measurements or testing. Note that
@@ -735,12 +724,12 @@ C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
C rnd() means rounding down to a multiple of d.
C
C m*n2 + b*n2 <= m*(d-1) + b*(d-1)
-C = m*d + b*d - m - b
-C = floor((b(b-d)-1)/d)*d + b*d - m - b
-C = rnd(b(b-d)-1) + b*d - m - b
-C = rnd(b(b-d)-1 + b*d) - m - b
-C = rnd(b*b-1) - m - b
-C <= (b-2)*b
+C = m*d + b*d - m - b
+C = floor((b(b-d)-1)/d)*d + b*d - m - b
+C = rnd(b(b-d)-1) + b*d - m - b
+C = rnd(b(b-d)-1 + b*d) - m - b
+C = rnd(b*b-1) - m - b
+C <= (b-2)*b
C
C Unchanged from the general case is that the final quotient limb q can be
C either q1 or q1+1, and the q1+1 case occurs often. This can be seen from
diff --git a/gmp/mpn/x86/k7/mmx/lshift.asm b/gmp/mpn/x86/k7/mmx/lshift.asm
index b3383cf2c3..b3bff8ffd1 100644
--- a/gmp/mpn/x86/k7/mmx/lshift.asm
+++ b/gmp/mpn/x86/k7/mmx/lshift.asm
@@ -1,32 +1,21 @@
dnl AMD K7 mpn_lshift -- mpn left shift.
-dnl Copyright 1999-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k7/mmx/mod_1.asm b/gmp/mpn/x86/k7/mmx/mod_1.asm
new file mode 100644
index 0000000000..2b42e55caf
--- /dev/null
+++ b/gmp/mpn/x86/k7/mmx/mod_1.asm
@@ -0,0 +1,509 @@
+dnl AMD K7 mpn_mod_1 -- mpn by limb remainder.
+
+dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 17.0 cycles/limb.
+
+
+C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t carry);
+C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t inverse);
+C
+C The code here is the same as mpn_divrem_1, but with the quotient
+C discarded. See mpn/x86/k7/mmx/divrem_1.asm for some comments.
+
+
+dnl MUL_THRESHOLD is the size at which the multiply by inverse method is
+dnl used, rather than plain "divl"s. Minimum value 2.
+dnl
+dnl The inverse takes about 50 cycles to calculate, but after that the
+dnl multiply is 17 c/l versus division at 41 c/l.
+dnl
+dnl Using mul or div is about the same speed at 3 limbs, so the threshold
+dnl is set to 4 to get the smaller div code used at 3.
+
+deflit(MUL_THRESHOLD, 4)
+
+
+defframe(PARAM_INVERSE,16) dnl mpn_preinv_mod_1
+defframe(PARAM_CARRY, 16) dnl mpn_mod_1c
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+
+defframe(VAR_NORM, -20)
+defframe(VAR_INVERSE, -24)
+defframe(VAR_SRC_STOP,-28)
+
+deflit(STACK_SPACE, 28)
+
+ TEXT
+
+C mpn_preinv_mod_1 -- entry point taking a caller-supplied inverse.
+C
+C No inverse is calculated and no shift count is derived here: VAR_NORM is
+C stored as 0 and, for size>=3, mm7 is loaded with 32. (Presumably the
+C caller passes a divisor with its high bit set, matching l==0 -- confirm
+C against callers.) The high limb is reduced by one conditional subtract
+C of the divisor, then the code joins the shared inverse_top,
+C inverse_two_left or inverse_one_left code in mpn_mod_1 according to how
+C many limbs remain.
+C
+C State when joining: edi = reduced high limb (n2), esi = second high limb
+C (n10), ecx = &src[size-4], ebp = divisor, VAR_INVERSE = PARAM_INVERSE.
+
+ ALIGN(32)
+PROLOGUE(mpn_preinv_mod_1)
+deflit(`FRAME',0)
+ movl PARAM_SRC, %ecx
+ movl PARAM_SIZE, %eax
+ subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %edi, SAVE_EDI
+ movl PARAM_INVERSE, %edx
+
+ movl %esi, SAVE_ESI
+ movl -4(%ecx,%eax,4), %edi C src high limb
+ leal -16(%ecx,%eax,4), %ecx C &src[size-4]
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_INVERSE, %edx C edx already holds this from the load above
+
+ movl $0, VAR_NORM C l==0
+
+ movl %edi, %esi
+ subl %ebp, %edi C high-divisor
+
+ cmovc( %esi, %edi) C restore if underflow
+ decl %eax
+ jz L(done_edi) C size==1, high-divisor only
+
+ movl 8(%ecx), %esi C src second high limb
+ movl %edx, VAR_INVERSE
+
+ movl $32, %ebx C 32-l
+ decl %eax
+ jz L(inverse_one_left) C size==2, one divide
+
+ movd %ebx, %mm7 C 32-l
+ decl %eax
+ jz L(inverse_two_left) C size==3, two divides
+
+ jmp L(inverse_top) C size>=4
+
+
+C size==1: the conditionally reduced high limb in edi is the result.
+C ebx has been saved but not yet modified on this path, and no MMX
+C instruction has been executed, so there is no ebx reload and no emms.
+
+L(done_edi):
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+ movl %edi, %eax
+
+ movl SAVE_EDI, %edi
+ addl $STACK_SPACE, %esp
+
+ ret
+
+EPILOGUE()
+
+
+C mpn_mod_1c -- entry point with a caller-supplied initial carry.
+C
+C Sets up the registers expected at start_1c (ecx size, edx carry, esi src,
+C ebp divisor), saving ebp and esi first, and jumps there. Unlike
+C mpn_mod_1 there is no test of the high limb against the divisor here;
+C size==0 is checked at start_1c.
+
+ ALIGN(32)
+PROLOGUE(mpn_mod_1c)
+deflit(`FRAME',0)
+ movl PARAM_CARRY, %edx
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+ jmp L(start_1c)
+
+EPILOGUE()
+
+
+C mpn_mod_1 -- entry point with no caller-supplied carry.
+C
+C Loads size, src and divisor into the registers start_1c expects, with a
+C zero initial carry. If the high limb is below the divisor it becomes the
+C carry directly and size is reduced by one, saving a division. Execution
+C then falls through into start_1c.
+
+ ALIGN(32)
+PROLOGUE(mpn_mod_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl $0, %edx C initial carry (if can't skip a div)
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ C size==0: return the zero carry
+ orl %ecx, %ecx
+ jz L(divide_done)
+
+ movl -4(%esi,%ecx,4), %eax C src high limb
+
+ cmpl %ebp, %eax C carry flag if high<divisor
+
+ cmovc( %eax, %edx) C src high limb as initial carry
+ sbbl $0, %ecx C size-1 to skip one div
+ C zero here means size was 1 and high<divisor: edx is the remainder
+ jz L(divide_done)
+
+
+C Common continuation for mpn_mod_1 (falling in from above) and mpn_mod_1c
+C (jumping here). Sizes of MUL_THRESHOLD or more go to the multiply by
+C inverse code, smaller sizes use the divl loop below.
+
+ ALIGN(16)
+L(start_1c):
+ C eax
+ C ebx
+ C ecx size
+ C edx carry
+ C esi src
+ C edi
+ C ebp divisor
+
+ cmpl $MUL_THRESHOLD, %ecx
+ jae L(mul_by_inverse)
+
+
+
+C With a MUL_THRESHOLD of 4, this "loop" only ever does 1 to 3 iterations,
+C but it's already fast and compact, and there's nothing to gain by
+C expanding it out.
+C
+C Using PARAM_DIVISOR in the divl is a couple of cycles faster than %ebp.
+
+ C size==0: the carry is returned unchanged
+ orl %ecx, %ecx
+ jz L(divide_done)
+
+
+L(divide_top):
+ C eax scratch (quotient)
+ C ebx
+ C ecx counter, limbs, decrementing
+ C edx scratch (remainder)
+ C esi src
+ C edi
+ C ebp
+
+ C Limbs are taken from the high end down. Each divl divides
+ C edx:eax, leaving the remainder in edx as the high part for
+ C the next limb; the quotient in eax is discarded.
+
+ movl -4(%esi,%ecx,4), %eax
+
+ divl PARAM_DIVISOR
+
+ decl %ecx
+ jnz L(divide_top)
+
+
+C Return the carry/remainder held in edx. Only esi and ebp have been saved
+C on the paths reaching here, so only they are restored.
+
+L(divide_done):
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ movl %edx, %eax
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+
+C Setup for the multiply by inverse method, reached from start_1c when
+C size >= MUL_THRESHOLD.
+C
+C ebx and edi are saved here since the entry points saved only esi and ebp.
+C bsrl gives 31-l, from which l (the left shift that normalizes the
+C divisor) and 32-l (the right shift count kept in mm7) are derived. l is
+C stored in VAR_NORM for the final denormalization. The inverse is obtained
+C from a single divl of b*(b-d)-1 by the normalized d, where b is 2^32 as
+C the edx:eax setup shows. Finally the carry and the two high src limbs are
+C shifted left by l to form the initial n2 and n10, and ecx is left at
+C &src[size-3] for the loop.
+
+L(mul_by_inverse):
+ C eax
+ C ebx
+ C ecx size
+ C edx carry
+ C esi src
+ C edi
+ C ebp divisor
+
+ bsrl %ebp, %eax C 31-l
+
+ movl %ebx, SAVE_EBX
+ movl %ecx, %ebx C size
+
+ movl %edi, SAVE_EDI
+ movl $31, %ecx
+
+ movl %edx, %edi C carry
+ movl $-1, %edx
+
+ C
+
+ xorl %eax, %ecx C l
+ incl %eax C 32-l
+
+ shll %cl, %ebp C d normalized
+ movl %ecx, VAR_NORM
+
+ movd %eax, %mm7 C 32-l
+
+ movl $-1, %eax
+ subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
+
+ divl %ebp C floor (b*(b-d)-1) / d
+
+ C
+
+ movl %eax, VAR_INVERSE
+ leal -12(%esi,%ebx,4), %eax C &src[size-3]
+
+ movl 8(%eax), %esi C src high limb
+ movl 4(%eax), %edx C src second highest limb
+
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shldl( %cl, %edx, %esi) C n10 = high,second << l
+
+ movl %eax, %ecx C &src[size-3]
+
+
+C The test below is only assembled when MUL_THRESHOLD is 2, in which case a
+C size of 2 bypasses the loop and goes straight to inverse_two_left.
+
+ifelse(MUL_THRESHOLD,2,`
+ cmpl $2, %ebx
+ je L(inverse_two_left)
+')
+
+
+C The dependent chain here is the same as in mpn_divrem_1, but a few
+C instructions are saved by not needing to store the quotient limbs.
+C Unfortunately this doesn't get the code down to the theoretical 16 c/l.
+C
+C There's four dummy instructions in the loop, all of which are necessary
+C for the claimed 17 c/l. It's a 1 to 3 cycle slowdown if any are removed,
+C or changed from load to store or vice versa. They're not completely
+C random, since they correspond to what mpn_divrem_1 has, but there's no
+C obvious reason why they're necessary. Presumably they induce something
+C good in the out of order execution, perhaps through some load/store
+C ordering and/or decoding effects.
+C
+C The q1==0xFFFFFFFF case is handled here the same as in mpn_divrem_1. On
+C special data that comes out as q1==0xFFFFFFFF always, the loop runs at
+C about 13.5 c/l.
+
+C Main loop, one src limb per iteration (ecx steps down by 4 each time).
+C
+C edi:esi (n2:n10) is divided by the normalized divisor using the inverse:
+C q1+1 is formed from n2+1 plus the high half of m*(n2+n1) + nadj, then
+C n - (q1+1)*d is calculated and d is added back if that underflowed. The
+C result becomes the next n2, and the low dword of mm0 after the psrlq
+C becomes the next n10. The loop continues while ecx >= PARAM_SRC.
+C
+C The instructions marked dummy have no effect on the result: the value
+C loaded from PARAM_SIZE into ebx is overwritten by the following leal, and
+C the store of ecx into PARAM_SIZE only overwrites that same stack slot.
+
+ ALIGN(32)
+L(inverse_top):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx src pointer, decrementing
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 scratch (src qword)
+ C mm7 rshift for normalization
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+ movl PARAM_SIZE, %ebx C dummy
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movq (%ecx), %mm0 C next src limb and the one below it
+ subl $4, %ecx
+
+ movl %ecx, PARAM_SIZE C dummy
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2+1
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+ jz L(q1_ff)
+ nop C dummy
+
+ mull %ebx C (q1+1)*d
+
+ psrlq %mm7, %mm0
+ leal (%ecx), %ecx C dummy
+
+ C
+
+ C
+
+ subl %eax, %esi C low n - (q1+1)*d
+ movl PARAM_SRC, %eax
+
+ C
+
+ C The sbbl result in edi is overwritten straight away; only its
+ C carry flag is used, by the cmovc below (the intervening movl,
+ C leal and movd leave the flags alone).
+
+ sbbl %edx, %edi C high n - (q1+1)*d, 0 or -1
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ movd %mm0, %esi
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ cmpl %eax, %ecx
+ jae L(inverse_top)
+
+
+C Loop exit, also jumped to by q1_ff when its loop test fails. Execution
+C falls through into inverse_two_left.
+
+L(inverse_loop_done):
+
+
+C -----------------------------------------------------------------------------
+C
+C Second to last divide step. Reached by falling out of the loop, from
+C mpn_preinv_mod_1 when size==3, and from mul_by_inverse when MUL_THRESHOLD
+C is 2 and size==2.
+C
+C This is the same step as the loop body, but without the dummy
+C instructions and with the jump to q1_ff replaced by an sbbl. Only the
+C single low src limb is fetched (movd), and psllq moves it to the high
+C half of mm0 so that the psrlq brings in zero bits below it.
+
+L(inverse_two_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx &src[-1]
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 scratch (src dword)
+ C mm7 rshift
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movd 4(%ecx), %mm0 C src low limb
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2+1
+ movl %ebp, %eax C d
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ C Subtract the carry out of the adcl, so a q1+1 which wrapped
+ C to 0 becomes 0xFFFFFFFF (the branch-free counterpart of the
+ C q1_ff case in the loop).
+
+ sbbl $0, %ebx
+
+ mull %ebx C (q1+1)*d
+
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+
+ C
+
+ subl %eax, %esi
+
+ C
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ movd %mm0, %esi
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+
+
+C Last divide step. Reached by falling through from inverse_two_left, or
+C from mpn_preinv_mod_1 when size==2.
+C
+C The same step once more, interleaved with reloading the saved ebx, esi,
+C ebp and edi. The remainder is shifted right by the count in VAR_NORM to
+C undo the divisor normalization and is returned in eax. emms is executed
+C on every path through here.
+
+L(inverse_one_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 src limb, shifted
+ C mm7 rshift
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movl VAR_NORM, %ecx C for final denorm
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2+1
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ C Subtract the carry out of the adcl, so a q1+1 which wrapped
+ C to 0 becomes 0xFFFFFFFF.
+
+ sbbl $0, %ebx
+
+ mull %ebx C (q1+1)*d
+
+ movl SAVE_EBX, %ebx
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ movl %esi, %eax C remainder
+ movl SAVE_ESI, %esi
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ leal (%ebp,%eax), %edx
+ movl SAVE_EBP, %ebp
+
+ cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
+ movl SAVE_EDI, %edi
+
+ shrl %cl, %eax C denorm remainder
+ addl $STACK_SPACE, %esp
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d
+
+C Reached only from the jz in inverse_top, which comes before that loop's
+C psrlq, so mm0 still holds the unshifted qword and is shifted here
+C instead. No multiply is done: the next n2 is simply n10+d. The loop
+C test is then repeated, going back to inverse_top or on to
+C inverse_loop_done.
+
+L(q1_ff):
+ C eax (divisor)
+ C ebx (q1+1 == 0)
+ C ecx src pointer
+ C edx
+ C esi n10
+ C edi (n2)
+ C ebp divisor
+
+ movl PARAM_SRC, %edx
+ leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
+ psrlq %mm7, %mm0
+
+ movd %mm0, %esi C next n10
+
+ cmpl %edx, %ecx
+ jae L(inverse_top)
+ jmp L(inverse_loop_done)
+
+EPILOGUE()
diff --git a/gmp/mpn/x86/k7/mmx/popham.asm b/gmp/mpn/x86/k7/mmx/popham.asm
index 95965b74d4..5dc0a78c42 100644
--- a/gmp/mpn/x86/k7/mmx/popham.asm
+++ b/gmp/mpn/x86/k7/mmx/popham.asm
@@ -1,40 +1,29 @@
dnl AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
dnl distance.
-dnl Copyright 2000-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C popcount hamdist
C P3 generic 6.5 7
-C P3 model 9 (Banias) 5.7 6.1
+C P3 model 9 (Banias) ? ?
C P3 model 13 (Dothan) 5.75 6
C K7 5 6
diff --git a/gmp/mpn/x86/k7/mmx/rshift.asm b/gmp/mpn/x86/k7/mmx/rshift.asm
index 345d23a25e..3566ce85d7 100644
--- a/gmp/mpn/x86/k7/mmx/rshift.asm
+++ b/gmp/mpn/x86/k7/mmx/rshift.asm
@@ -1,32 +1,21 @@
dnl AMD K7 mpn_rshift -- mpn right shift.
-dnl Copyright 1999-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k7/mod_1_1.asm b/gmp/mpn/x86/k7/mod_1_1.asm
deleted file mode 100644
index 1bbe6f92d7..0000000000
--- a/gmp/mpn/x86/k7/mod_1_1.asm
+++ /dev/null
@@ -1,221 +0,0 @@
-dnl x86-32 mpn_mod_1_1p, requiring cmov.
-
-dnl Contributed to the GNU project by Niels Möller and Torbjorn Granlund.
-
-dnl Copyright 2010, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C P5 ?
-C P6 model 0-8,10-12 ?
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) ?
-C P4 model 0 (Willamette) ?
-C P4 model 1 (?) ?
-C P4 model 2 (Northwood) ?
-C P4 model 3 (Prescott) ?
-C P4 model 4 (Nocona) ?
-C AMD K6 ?
-C AMD K7 7
-C AMD K8 ?
-
-define(`B2mb', `%ebx')
-define(`r0', `%esi')
-define(`r2', `%ebp')
-define(`t0', `%edi')
-define(`ap', `%ecx') C Also shift count
-
-C Stack frame
-C pre 36(%esp)
-C b 32(%esp)
-C n 28(%esp)
-C ap 24(%esp)
-C return 20(%esp)
-C %ebp 16(%esp)
-C %edi 12(%esp)
-C %esi 8(%esp)
-C %ebx 4(%esp)
-C B2mod (%esp)
-
-define(`B2modb', `(%esp)')
-define(`n', `28(%esp)')
-define(`b', `32(%esp)')
-define(`pre', `36(%esp)')
-
-C mp_limb_t
-C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t pre[4])
-C
-C The pre array contains bi, cnt, B1modb, B2modb
-C Note: This implementation needs B1modb only when cnt > 0
-
-ASM_START()
- TEXT
- ALIGN(8)
-PROLOGUE(mpn_mod_1_1p)
- push %ebp
- push %edi
- push %esi
- push %ebx
- mov 32(%esp), %ebp C pre[]
-
- mov 12(%ebp), %eax C B2modb
- push %eax C Put it on stack
-
- mov n, %edx
- mov 24(%esp), ap
-
- lea (ap, %edx, 4), ap
- mov -4(ap), %eax
- cmp $3, %edx
- jnc L(first)
- mov -8(ap), r0
- jmp L(reduce_two)
-
-L(first):
- C First iteration, no r2
- mull B2modb
- mov -12(ap), r0
- add %eax, r0
- mov -8(ap), %eax
- adc %edx, %eax
- sbb r2, r2
- subl $3, n
- lea -16(ap), ap
- jz L(reduce_three)
-
- mov B2modb, B2mb
- sub b, B2mb
- lea (B2mb, r0), t0
- jmp L(mid)
-
- ALIGN(16)
-L(top): C Loopmixed to 7 c/l on k7
- add %eax, r0
- lea (B2mb, r0), t0
- mov r2, %eax
- adc %edx, %eax
- sbb r2, r2
-L(mid): mull B2modb
- and B2modb, r2
- add r0, r2
- decl n
- mov (ap), r0
- cmovc( t0, r2)
- lea -4(ap), ap
- jnz L(top)
-
- add %eax, r0
- mov r2, %eax
- adc %edx, %eax
- sbb r2, r2
-
-L(reduce_three):
- C Eliminate r2
- and b, r2
- sub r2, %eax
-
-L(reduce_two):
- mov pre, %ebp
- movb 4(%ebp), %cl
- test %cl, %cl
- jz L(normalized)
-
- C Unnormalized, use B1modb to reduce to size < B b
- mull 8(%ebp)
- xor t0, t0
- add %eax, r0
- adc %edx, t0
- mov t0, %eax
-
- C Left-shift to normalize
- shld %cl, r0, %eax C Always use shld?
-
- shl %cl, r0
- jmp L(udiv)
-
-L(normalized):
- mov %eax, t0
- sub b, t0
- cmovnc( t0, %eax)
-
-L(udiv):
- lea 1(%eax), t0
- mull (%ebp)
- mov b, %ebx C Needed in register for lea
- add r0, %eax
- adc t0, %edx
- imul %ebx, %edx
- sub %edx, r0
- cmp r0, %eax
- lea (%ebx, r0), %eax
- cmovnc( r0, %eax)
- cmp %ebx, %eax
- jnc L(fix)
-L(ok): shr %cl, %eax
-
- add $4, %esp
- pop %ebx
- pop %esi
- pop %edi
- pop %ebp
-
- ret
-L(fix): sub %ebx, %eax
- jmp L(ok)
-EPILOGUE()
-
-PROLOGUE(mpn_mod_1_1p_cps)
- push %ebp
- mov 12(%esp), %ebp
- push %esi
- bsr %ebp, %ecx
- push %ebx
- xor $31, %ecx
- mov 16(%esp), %esi
- sal %cl, %ebp
- mov %ebp, %edx
- not %edx
- mov $-1, %eax
- div %ebp C On K7, invert_limb would be a few cycles faster.
- mov %eax, (%esi) C store bi
- mov %ecx, 4(%esi) C store cnt
- neg %ebp
- mov $1, %edx
- shld %cl, %eax, %edx
- imul %ebp, %edx
- shr %cl, %edx
- imul %ebp, %eax
- mov %edx, 8(%esi) C store B1modb
- mov %eax, 12(%esi) C store B2modb
- pop %ebx
- pop %esi
- pop %ebp
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86/k7/mod_1_4.asm b/gmp/mpn/x86/k7/mod_1_4.asm
deleted file mode 100644
index bb7597edd2..0000000000
--- a/gmp/mpn/x86/k7/mod_1_4.asm
+++ /dev/null
@@ -1,260 +0,0 @@
-dnl x86-32 mpn_mod_1s_4p, requiring cmov.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2009, 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C P5 ?
-C P6 model 0-8,10-12 ?
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) 6
-C P4 model 0 (Willamette) ?
-C P4 model 1 (?) ?
-C P4 model 2 (Northwood) 15.5
-C P4 model 3 (Prescott) ?
-C P4 model 4 (Nocona) ?
-C AMD K6 ?
-C AMD K7 4.75
-C AMD K8 ?
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mod_1s_4p)
- push %ebp
- push %edi
- push %esi
- push %ebx
- sub $28, %esp
- mov 60(%esp), %edi C cps[]
- mov 8(%edi), %eax
- mov 12(%edi), %edx
- mov 16(%edi), %ecx
- mov 20(%edi), %esi
- mov 24(%edi), %edi
- mov %eax, 4(%esp)
- mov %edx, 8(%esp)
- mov %ecx, 12(%esp)
- mov %esi, 16(%esp)
- mov %edi, 20(%esp)
- mov 52(%esp), %eax C n
- xor %edi, %edi
- mov 48(%esp), %esi C up
- lea -12(%esi,%eax,4), %esi
- and $3, %eax
- je L(b0)
- cmp $2, %eax
- jc L(b1)
- je L(b2)
-
-L(b3): mov 4(%esi), %eax
- mull 4(%esp)
- mov (%esi), %ebp
- add %eax, %ebp
- adc %edx, %edi
- mov 8(%esi), %eax
- mull 8(%esp)
- lea -12(%esi), %esi
- jmp L(m0)
-
-L(b0): mov (%esi), %eax
- mull 4(%esp)
- mov -4(%esi), %ebp
- add %eax, %ebp
- adc %edx, %edi
- mov 4(%esi), %eax
- mull 8(%esp)
- add %eax, %ebp
- adc %edx, %edi
- mov 8(%esi), %eax
- mull 12(%esp)
- lea -16(%esi), %esi
- jmp L(m0)
-
-L(b1): mov 8(%esi), %ebp
- lea -4(%esi), %esi
- jmp L(m1)
-
-L(b2): mov 8(%esi), %edi
- mov 4(%esi), %ebp
- lea -8(%esi), %esi
- jmp L(m1)
-
- ALIGN(16)
-L(top): mov (%esi), %eax
- mull 4(%esp)
- mov -4(%esi), %ebx
- xor %ecx, %ecx
- add %eax, %ebx
- adc %edx, %ecx
- mov 4(%esi), %eax
- mull 8(%esp)
- add %eax, %ebx
- adc %edx, %ecx
- mov 8(%esi), %eax
- mull 12(%esp)
- add %eax, %ebx
- adc %edx, %ecx
- lea -16(%esi), %esi
- mov 16(%esp), %eax
- mul %ebp
- add %eax, %ebx
- adc %edx, %ecx
- mov 20(%esp), %eax
- mul %edi
- mov %ebx, %ebp
- mov %ecx, %edi
-L(m0): add %eax, %ebp
- adc %edx, %edi
-L(m1): subl $4, 52(%esp)
- ja L(top)
-
-L(end): mov 4(%esp), %eax
- mul %edi
- mov 60(%esp), %edi
- add %eax, %ebp
- adc $0, %edx
- mov 4(%edi), %ecx
- mov %edx, %esi
- mov %ebp, %eax
- sal %cl, %esi
- mov %ecx, %ebx
- neg %ecx
- shr %cl, %eax
- or %esi, %eax
- lea 1(%eax), %esi
- mull (%edi)
- mov %ebx, %ecx
- mov %eax, %ebx
- mov %ebp, %eax
- mov 56(%esp), %ebp
- sal %cl, %eax
- add %eax, %ebx
- adc %esi, %edx
- imul %ebp, %edx
- sub %edx, %eax
- lea (%eax,%ebp), %edx
- cmp %eax, %ebx
- cmovc( %edx, %eax)
- mov %eax, %edx
- sub %ebp, %eax
- cmovc( %edx, %eax)
- add $28, %esp
- pop %ebx
- pop %esi
- pop %edi
- pop %ebp
- shr %cl, %eax
- ret
-EPILOGUE()
-
- ALIGN(16)
-PROLOGUE(mpn_mod_1s_4p_cps)
-C CAUTION: This is the same code as in pentium4/sse2/mod_1_4.asm
- push %ebp
- push %edi
- push %esi
- push %ebx
- mov 20(%esp), %ebp C FIXME: avoid bp for 0-idx
- mov 24(%esp), %ebx
- bsr %ebx, %ecx
- xor $31, %ecx
- sal %cl, %ebx C b << cnt
- mov %ebx, %edx
- not %edx
- mov $-1, %eax
- div %ebx
- xor %edi, %edi
- sub %ebx, %edi
- mov $1, %esi
- mov %eax, (%ebp) C store bi
- mov %ecx, 4(%ebp) C store cnt
- shld %cl, %eax, %esi
- imul %edi, %esi
- mov %eax, %edi
- mul %esi
-
- add %esi, %edx
- shr %cl, %esi
- mov %esi, 8(%ebp) C store B1modb
-
- not %edx
- imul %ebx, %edx
- lea (%edx,%ebx), %esi
- cmp %edx, %eax
- cmovnc( %edx, %esi)
- mov %edi, %eax
- mul %esi
-
- add %esi, %edx
- shr %cl, %esi
- mov %esi, 12(%ebp) C store B2modb
-
- not %edx
- imul %ebx, %edx
- lea (%edx,%ebx), %esi
- cmp %edx, %eax
- cmovnc( %edx, %esi)
- mov %edi, %eax
- mul %esi
-
- add %esi, %edx
- shr %cl, %esi
- mov %esi, 16(%ebp) C store B3modb
-
- not %edx
- imul %ebx, %edx
- lea (%edx,%ebx), %esi
- cmp %edx, %eax
- cmovnc( %edx, %esi)
- mov %edi, %eax
- mul %esi
-
- add %esi, %edx
- shr %cl, %esi
- mov %esi, 20(%ebp) C store B4modb
-
- not %edx
- imul %ebx, %edx
- add %edx, %ebx
- cmp %edx, %eax
- cmovnc( %edx, %ebx)
-
- shr %cl, %ebx
- mov %ebx, 24(%ebp) C store B5modb
-
- pop %ebx
- pop %esi
- pop %edi
- pop %ebp
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86/k7/mod_34lsub1.asm b/gmp/mpn/x86/k7/mod_34lsub1.asm
index ee3ad04099..f00e84dc42 100644
--- a/gmp/mpn/x86/k7/mod_34lsub1.asm
+++ b/gmp/mpn/x86/k7/mod_34lsub1.asm
@@ -1,32 +1,22 @@
dnl AMD K7 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
-dnl Copyright 2000-2002, 2004, 2005, 2008 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2000, 2001, 2002, 2004, 2005, 2008 Free Software Foundation,
+dnl Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k7/mode1o.asm b/gmp/mpn/x86/k7/mode1o.asm
index 6472ec5949..ef858049a6 100644
--- a/gmp/mpn/x86/k7/mode1o.asm
+++ b/gmp/mpn/x86/k7/mode1o.asm
@@ -1,32 +1,21 @@
dnl AMD K7 mpn_modexact_1_odd -- exact division style remainder.
-dnl Copyright 2000-2002, 2004, 2007 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2000, 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -122,7 +111,7 @@ ifdef(`PIC',`
subl %eax, %edi C inv = 2*inv - inv*inv*d
- ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
+ ASSERT(e,` C d*inv == 1 mod 2^BITS_PER_MP_LIMB
movl %esi, %eax
imull %edi, %eax
cmpl $1, %eax')
diff --git a/gmp/mpn/x86/k7/mul_1.asm b/gmp/mpn/x86/k7/mul_1.asm
index 755cd2ed50..016262d594 100644
--- a/gmp/mpn/x86/k7/mul_1.asm
+++ b/gmp/mpn/x86/k7/mul_1.asm
@@ -1,38 +1,28 @@
dnl AMD K7 mpn_mul_1.
-dnl Copyright 1999-2002, 2005, 2008 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002, 2005, 2008 Free Software Foundation,
+dnl Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C P5
+C cycles/limb
+C P5:
C P6 model 0-8,10-12)
C P6 model 9 (Banias)
C P6 model 13 (Dothan)
@@ -41,9 +31,9 @@ C P4 model 1 (?)
C P4 model 2 (Northwood)
C P4 model 3 (Prescott)
C P4 model 4 (Nocona)
-C AMD K6
-C AMD K7 3.25
-C AMD K8
+C K6:
+C K7: 3.25
+C K8:
C TODO
C * Improve feed-in and wind-down code. We beat the old code for all n != 1,
diff --git a/gmp/mpn/x86/k7/mul_basecase.asm b/gmp/mpn/x86/k7/mul_basecase.asm
index 4dfb500885..7f4c0002f7 100644
--- a/gmp/mpn/x86/k7/mul_basecase.asm
+++ b/gmp/mpn/x86/k7/mul_basecase.asm
@@ -1,32 +1,21 @@
dnl AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
-dnl Copyright 1999-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k7/sqr_basecase.asm b/gmp/mpn/x86/k7/sqr_basecase.asm
index 7b6a97e0df..408a13dc9b 100644
--- a/gmp/mpn/x86/k7/sqr_basecase.asm
+++ b/gmp/mpn/x86/k7/sqr_basecase.asm
@@ -1,32 +1,21 @@
dnl AMD K7 mpn_sqr_basecase -- square an mpn number.
-dnl Copyright 1999-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -39,18 +28,18 @@ C roughly the Karatsuba recursing range).
dnl These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for
dnl some comments.
-deflit(SQR_TOOM2_THRESHOLD_MAX, 66)
+deflit(SQR_KARATSUBA_THRESHOLD_MAX, 66)
-ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
-`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
+ifdef(`SQR_KARATSUBA_THRESHOLD_OVERRIDE',
+`define(`SQR_KARATSUBA_THRESHOLD',SQR_KARATSUBA_THRESHOLD_OVERRIDE)')
-m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
-deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
+m4_config_gmp_mparam(`SQR_KARATSUBA_THRESHOLD')
+deflit(UNROLL_COUNT, eval(SQR_KARATSUBA_THRESHOLD-3))
C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
-C With a SQR_TOOM2_THRESHOLD around 50 this code is about 1500 bytes,
+C With a SQR_KARATSUBA_THRESHOLD around 50 this code is about 1500 bytes,
C which is quite a bit, but is considered good value since squares big
C enough to use most of the code will be spending quite a few cycles in it.
diff --git a/gmp/mpn/x86/k7/sublsh1_n.asm b/gmp/mpn/x86/k7/sublsh1_n.asm
deleted file mode 100644
index 523b01218d..0000000000
--- a/gmp/mpn/x86/k7/sublsh1_n.asm
+++ /dev/null
@@ -1,173 +0,0 @@
-dnl AMD K7 mpn_sublsh1_n_ip1 -- rp[] = rp[] - (up[] << 1)
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C This is an attempt at a sublsh1_n for x86-32, not relying on sse2 insns. The
-C innerloop is 2*3-way unrolled, which is best we can do with the available
-C registers. It seems tricky to use the same structure for rsblsh1_n, since we
-C cannot feed carry between operations there.
-
-C cycles/limb
-C P5
-C P6 model 0-8,10-12
-C P6 model 9 (Banias)
-C P6 model 13 (Dothan)
-C P4 model 0 (Willamette)
-C P4 model 1 (?)
-C P4 model 2 (Northwood)
-C P4 model 3 (Prescott)
-C P4 model 4 (Nocona)
-C Intel Atom 6.75
-C AMD K6
-C AMD K7
-C AMD K8
-
-C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32
-C processors. It uses 2*4-way unrolling, for good reasons.
-C
-C Breaking carry recurrency might be a good idea. We would then need separate
-C registers for the shift carry and add/subtract carry, which in turn would
-C force is to 2*2-way unrolling.
-
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-dnl re-use parameter space
-define(VAR_COUNT,`PARAM_SIZE')
-define(SAVE_EBX,`PARAM_SRC')
-define(SAVE_EBP,`PARAM_DST')
-
-ASM_START()
- TEXT
- ALIGN(8)
-PROLOGUE(mpn_sublsh1_n_ip1)
-deflit(`FRAME',0)
-
-define(`rp', `%edi')
-define(`up', `%esi')
-
- mov PARAM_SIZE, %eax C size
- push up FRAME_pushl()
- push rp FRAME_pushl()
- xor %edx, %edx
- mov PARAM_SRC, up
- mov PARAM_DST, rp
- mov %ebx, SAVE_EBX
- mov %eax, %ebx
- shr $3, %eax
-
- not %eax C count = -(size\8)-i
- and $7, %ebx C size % 8
- jz L(exact)
-
-L(oop):
-ifdef(`CPU_P6',`
- shr %edx ') C restore 2nd saved carry bit
- mov (up), %ecx
- adc %ecx, %ecx
- rcr %edx C restore 1st saved carry bit
- lea 4(up), up
- sbb %ecx, (rp)
- lea 4(rp), rp
- adc %edx, %edx C save a carry bit in edx
-ifdef(`CPU_P6',`
- adc %edx, %edx ') C save another carry bit in edx
- dec %ebx
- jnz L(oop)
-L(exact):
- inc %eax
- jz L(end)
- mov %eax, VAR_COUNT
- mov %ebp, SAVE_EBP
-
- ALIGN(16)
-L(top):
-ifdef(`CPU_P6',`
- shr %edx ') C restore 2nd saved carry bit
- mov (up), %eax
- adc %eax, %eax
- mov 4(up), %ebx
- adc %ebx, %ebx
- mov 8(up), %ecx
- adc %ecx, %ecx
- mov 12(up), %ebp
- adc %ebp, %ebp
-
- rcr %edx C restore 1st saved carry bit
-
- sbb %eax, (rp)
- sbb %ebx, 4(rp)
- sbb %ecx, 8(rp)
- sbb %ebp, 12(rp)
-
- mov 16(up), %eax
- adc %eax, %eax
- mov 20(up), %ebx
- adc %ebx, %ebx
- mov 24(up), %ecx
- adc %ecx, %ecx
- mov 28(up), %ebp
- adc %ebp, %ebp
-
- lea 32(up), up
- adc %edx, %edx C save a carry bit in edx
-
- sbb %eax, 16(rp)
- sbb %ebx, 20(rp)
- sbb %ecx, 24(rp)
- sbb %ebp, 28(rp)
-
-ifdef(`CPU_P6',`
- adc %edx, %edx ') C save another carry bit in edx
- incl VAR_COUNT
- lea 32(rp), rp
- jne L(top)
-
- mov SAVE_EBP, %ebp
-L(end):
- mov SAVE_EBX, %ebx
-
-ifdef(`CPU_P6',`
- xor %eax, %eax
- shr $1, %edx
- adc %edx, %eax
-',`
- adc $0, %edx
- mov %edx, %eax
-')
- pop rp FRAME_popl()
- pop up FRAME_popl()
- ret
-EPILOGUE()
-ASM_END()