diff options
author | Pedro Alvarez <pedro.alvarez@codethink.co.uk> | 2016-05-27 17:39:31 +0100 |
---|---|---|
committer | Pedro Alvarez <pedro.alvarez@codethink.co.uk> | 2016-05-27 17:53:32 +0100 |
commit | 26c75cf8267919f81a1759c9c965a52c660233f9 (patch) | |
tree | cf2a39cf56c2c8ac45760854413ab233e6263974 /gmp/mpn/x86/k7 | |
parent | 56892c1d217baea02092b51a09bbc924130ca84c (diff) | |
download | gcc-tarball-26c75cf8267919f81a1759c9c965a52c660233f9.tar.gz |
go to gmp 4.3.2 (baserock/pedroalvarez/gcc-5.3.0-gmp432)
Diffstat (limited to 'gmp/mpn/x86/k7')
25 files changed, 1120 insertions, 2056 deletions
diff --git a/gmp/mpn/x86/k7/README b/gmp/mpn/x86/k7/README index 5711b612c5..e2c5e0c18d 100644 --- a/gmp/mpn/x86/k7/README +++ b/gmp/mpn/x86/k7/README @@ -3,28 +3,17 @@ Copyright 2000, 2001 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of either: - - * the GNU Lesser General Public License as published by the Free - Software Foundation; either version 3 of the License, or (at your - option) any later version. - -or - - * the GNU General Public License as published by the Free Software - Foundation; either version 2 of the License, or (at your option) any - later version. - -or both in parallel, as here. +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. -You should have received copies of the GNU General Public License and the -GNU Lesser General Public License along with the GNU MP Library. If not, -see https://www.gnu.org/licenses/. +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. diff --git a/gmp/mpn/x86/k7/addlsh1_n.asm b/gmp/mpn/x86/k7/addlsh1_n.asm deleted file mode 100644 index a957b6f78e..0000000000 --- a/gmp/mpn/x86/k7/addlsh1_n.asm +++ /dev/null @@ -1,196 +0,0 @@ -dnl AMD K7 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1) - -dnl Copyright 2011 Free Software Foundation, Inc. - -dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. 
- -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C This is an attempt at an addlsh1_n for x86-32, not relying on sse2 insns. -C The innerloop is 2*3-way unrolled, which is best we can do with the available -C registers. It seems tricky to use the same structure for rsblsh1_n, since we -C cannot feed carry between operations there. - -C cycles/limb -C P5 -C P6 model 0-8,10-12 -C P6 model 9 (Banias) -C P6 model 13 (Dothan) 5.4 (worse than add_n + lshift) -C P4 model 0 (Willamette) -C P4 model 1 (?) -C P4 model 2 (Northwood) -C P4 model 3 (Prescott) -C P4 model 4 (Nocona) -C Intel Atom 6 -C AMD K6 ? -C AMD K7 2.5 -C AMD K8 - -C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32 -C processors. It uses 2*3-way unrolling, for good reasons. Unfortunately, -C that means we need an initial magic multiply. -C -C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern. 
We -C cannot do rsblsh1_n since we feed carry from the shift blocks to the -C add/subtract blocks, which is right for addition but reversed for -C subtraction. We could perhaps do sublsh1_n, with some extra move insns, -C without losing any time, since we're not issue limited but carry recurrency -C latency. -C -C Breaking carry recurrency might be a good idea. We would then need separate -C registers for the shift carry and add/subtract carry, which in turn would -C force is to 2*2-way unrolling. - -defframe(PARAM_SIZE, 16) -defframe(PARAM_DBLD, 12) -defframe(PARAM_SRC, 8) -defframe(PARAM_DST, 4) - -dnl re-use parameter space -define(VAR_COUNT,`PARAM_DST') -define(VAR_TMP,`PARAM_DBLD') - -ASM_START() - TEXT - ALIGN(8) -PROLOGUE(mpn_addlsh1_n) -deflit(`FRAME',0) - -define(`rp', `%edi') -define(`up', `%esi') -define(`vp', `%ebp') - - mov $0x2aaaaaab, %eax - - push %ebx FRAME_pushl() - mov PARAM_SIZE, %ebx C size - - push rp FRAME_pushl() - mov PARAM_DST, rp - - mul %ebx - - push up FRAME_pushl() - mov PARAM_SRC, up - - not %edx C count = -(size\8)-1 - mov %edx, VAR_COUNT - - push vp FRAME_pushl() - mov PARAM_DBLD, vp - - lea 3(%edx,%edx,2), %ecx C count*3+3 = -(size\6)*3 - xor %edx, %edx - lea (%ebx,%ecx,2), %ebx C size + (count*3+3)*2 = size % 6 - or %ebx, %ebx - jz L(exact) - -L(oop): -ifdef(`CPU_P6',` - shr %edx ') C restore 2nd saved carry bit - mov (vp), %eax - adc %eax, %eax - rcr %edx C restore 1st saved carry bit - lea 4(vp), vp - adc (up), %eax - lea 4(up), up - adc %edx, %edx C save a carry bit in edx -ifdef(`CPU_P6',` - adc %edx, %edx ') C save another carry bit in edx - dec %ebx - mov %eax, (rp) - lea 4(rp), rp - jnz L(oop) - mov vp, VAR_TMP -L(exact): - incl VAR_COUNT - jz L(end) - - ALIGN(16) -L(top): -ifdef(`CPU_P6',` - shr %edx ') C restore 2nd saved carry bit - mov (vp), %eax - adc %eax, %eax - mov 4(vp), %ebx - adc %ebx, %ebx - mov 8(vp), %ecx - adc %ecx, %ecx - - rcr %edx C restore 1st saved carry bit - - adc (up), %eax - mov %eax, (rp) - adc 
4(up), %ebx - mov %ebx, 4(rp) - adc 8(up), %ecx - mov %ecx, 8(rp) - - mov 12(vp), %eax - adc %eax, %eax - mov 16(vp), %ebx - adc %ebx, %ebx - mov 20(vp), %ecx - adc %ecx, %ecx - - lea 24(vp), vp - adc %edx, %edx C save a carry bit in edx - - adc 12(up), %eax - mov %eax, 12(rp) - adc 16(up), %ebx - mov %ebx, 16(rp) - adc 20(up), %ecx - - lea 24(up), up - -ifdef(`CPU_P6',` - adc %edx, %edx ') C save another carry bit in edx - mov %ecx, 20(rp) - incl VAR_COUNT - lea 24(rp), rp - jne L(top) - -L(end): - pop vp FRAME_popl() - pop up FRAME_popl() - -ifdef(`CPU_P6',` - xor %eax, %eax - shr $1, %edx - adc %edx, %eax -',` - adc $0, %edx - mov %edx, %eax -') - pop rp FRAME_popl() - pop %ebx FRAME_popl() - ret -EPILOGUE() -ASM_END() diff --git a/gmp/mpn/x86/k7/aors_n.asm b/gmp/mpn/x86/k7/aors_n.asm index 1a08072029..d84de3ee98 100644 --- a/gmp/mpn/x86/k7/aors_n.asm +++ b/gmp/mpn/x86/k7/aors_n.asm @@ -1,32 +1,21 @@ dnl AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract. -dnl Copyright 1999-2003 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: +dnl Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc. dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. 
dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') diff --git a/gmp/mpn/x86/k7/aorsmul_1.asm b/gmp/mpn/x86/k7/aorsmul_1.asm index eec8df6de2..b247c29131 100644 --- a/gmp/mpn/x86/k7/aorsmul_1.asm +++ b/gmp/mpn/x86/k7/aorsmul_1.asm @@ -1,49 +1,39 @@ dnl AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple. -dnl Copyright 1999-2002, 2005, 2008 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: +dnl Copyright 1999, 2000, 2001, 2002, 2005, 2008 Free Software Foundation, +dnl Inc. dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. 
+dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C P5 -C P6 model 0-8,10-12 -C P6 model 9 (Banias) 6.5 +C cycles/limb +C P5: +C P6 model 0-8,10-12) +C P6 model 9 (Banias) C P6 model 13 (Dothan) C P4 model 0 (Willamette) C P4 model 1 (?) C P4 model 2 (Northwood) C P4 model 3 (Prescott) C P4 model 4 (Nocona) -C AMD K6 -C AMD K7 3.75 -C AMD K8 +C K6: +C K7: 3.75 +C K8: C TODO C * Improve feed-in and wind-down code. We beat the old code for all n != 1, diff --git a/gmp/mpn/x86/k7/bdiv_q_1.asm b/gmp/mpn/x86/k7/bdiv_q_1.asm deleted file mode 100644 index df3477f539..0000000000 --- a/gmp/mpn/x86/k7/bdiv_q_1.asm +++ /dev/null @@ -1,244 +0,0 @@ -dnl AMD K7 mpn_bdiv_q_1 -- mpn by limb exact division. - -dnl Rearranged from mpn/x86/k7/dive_1.asm by Marco Bodrato. - -dnl Copyright 2001, 2002, 2004, 2007, 2011 Free Software Foundation, Inc. 
- -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C cycles/limb -C Athlon: 11.0 -C Hammer: 9.0 - - -C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, -C mp_limb_t divisor); -C -C The dependent chain is mul+imul+sub for 11 cycles and that speed is -C achieved with no special effort. The load and shrld latencies are hidden -C by out of order execution. -C -C It's a touch faster on size==1 to use the mul-by-inverse than divl. 
- -defframe(PARAM_SHIFT, 24) -defframe(PARAM_INVERSE,20) -defframe(PARAM_DIVISOR,16) -defframe(PARAM_SIZE, 12) -defframe(PARAM_SRC, 8) -defframe(PARAM_DST, 4) - -defframe(SAVE_EBX, -4) -defframe(SAVE_ESI, -8) -defframe(SAVE_EDI, -12) -defframe(SAVE_EBP, -16) -defframe(VAR_INVERSE, -20) -defframe(VAR_DST_END, -24) - -deflit(STACK_SPACE, 24) - - TEXT - -C mp_limb_t -C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor, -C mp_limb_t inverse, int shift) - ALIGN(16) -PROLOGUE(mpn_pi1_bdiv_q_1) -deflit(`FRAME',0) - - subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE) - movl PARAM_SHIFT, %ecx C shift count - - movl %ebp, SAVE_EBP - movl PARAM_SIZE, %ebp - - movl %esi, SAVE_ESI - movl PARAM_SRC, %esi - - movl %edi, SAVE_EDI - movl PARAM_DST, %edi - - movl %ebx, SAVE_EBX - - leal (%esi,%ebp,4), %esi C src end - leal (%edi,%ebp,4), %edi C dst end - negl %ebp C -size - - movl PARAM_INVERSE, %eax C inv - -L(common): - movl %eax, VAR_INVERSE - movl (%esi,%ebp,4), %eax C src[0] - - incl %ebp - jz L(one) - - movl (%esi,%ebp,4), %edx C src[1] - - shrdl( %cl, %edx, %eax) - - movl %edi, VAR_DST_END - xorl %ebx, %ebx - jmp L(entry) - - ALIGN(8) -L(top): - C eax q - C ebx carry bit, 0 or 1 - C ecx shift - C edx - C esi src end - C edi dst end - C ebp counter, limbs, negative - - mull PARAM_DIVISOR C carry limb in edx - - movl -4(%esi,%ebp,4), %eax - movl (%esi,%ebp,4), %edi - - shrdl( %cl, %edi, %eax) - - subl %ebx, %eax C apply carry bit - setc %bl - movl VAR_DST_END, %edi - - subl %edx, %eax C apply carry limb - adcl $0, %ebx - -L(entry): - imull VAR_INVERSE, %eax - - movl %eax, -4(%edi,%ebp,4) - incl %ebp - jnz L(top) - - - mull PARAM_DIVISOR C carry limb in edx - - movl -4(%esi), %eax C src high limb - shrl %cl, %eax - movl SAVE_ESI, %esi - - subl %ebx, %eax C apply carry bit - movl SAVE_EBX, %ebx - movl SAVE_EBP, %ebp - - subl %edx, %eax C apply carry limb - - imull VAR_INVERSE, %eax - - movl %eax, -4(%edi) - movl SAVE_EDI, %edi - addl $STACK_SPACE, 
%esp - - ret - -L(one): - shrl %cl, %eax - movl SAVE_ESI, %esi - movl SAVE_EBX, %ebx - - imull VAR_INVERSE, %eax - - movl SAVE_EBP, %ebp - - movl %eax, -4(%edi) - movl SAVE_EDI, %edi - addl $STACK_SPACE, %esp - - ret -EPILOGUE() - -C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, -C mp_limb_t divisor); -C - - ALIGN(16) -PROLOGUE(mpn_bdiv_q_1) -deflit(`FRAME',0) - - movl PARAM_DIVISOR, %eax - subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE) - movl $-1, %ecx C shift count - - movl %ebp, SAVE_EBP - movl PARAM_SIZE, %ebp - - movl %esi, SAVE_ESI - movl %edi, SAVE_EDI - - C If there's usually only one or two trailing zero bits then this - C should be faster than bsfl. -L(strip_twos): - incl %ecx - shrl %eax - jnc L(strip_twos) - - movl %ebx, SAVE_EBX - leal 1(%eax,%eax), %ebx C d without twos - andl $127, %eax C d/2, 7 bits - -ifdef(`PIC',` - LEA( binvert_limb_table, %edx) - movzbl (%eax,%edx), %eax C inv 8 bits -',` - movzbl binvert_limb_table(%eax), %eax C inv 8 bits -') - - leal (%eax,%eax), %edx C 2*inv - movl %ebx, PARAM_DIVISOR C d without twos - - imull %eax, %eax C inv*inv - - movl PARAM_SRC, %esi - movl PARAM_DST, %edi - - imull %ebx, %eax C inv*inv*d - - subl %eax, %edx C inv = 2*inv - inv*inv*d - leal (%edx,%edx), %eax C 2*inv - - imull %edx, %edx C inv*inv - - leal (%esi,%ebp,4), %esi C src end - leal (%edi,%ebp,4), %edi C dst end - negl %ebp C -size - - imull %ebx, %edx C inv*inv*d - - subl %edx, %eax C inv = 2*inv - inv*inv*d - - ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS - pushl %eax FRAME_pushl() - imull PARAM_DIVISOR, %eax - cmpl $1, %eax - popl %eax FRAME_popl()') - - jmp L(common) -EPILOGUE() diff --git a/gmp/mpn/x86/k7/dive_1.asm b/gmp/mpn/x86/k7/dive_1.asm index 8eb4f45ac0..c994e0fb06 100644 --- a/gmp/mpn/x86/k7/dive_1.asm +++ b/gmp/mpn/x86/k7/dive_1.asm @@ -1,32 +1,21 @@ dnl AMD K7 mpn_divexact_1 -- mpn by limb exact division. dnl Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc. 
- -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
include(`../config.m4') @@ -116,7 +105,7 @@ ifdef(`PIC',` subl %edx, %eax C inv = 2*inv - inv*inv*d - ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS + ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB pushl %eax FRAME_pushl() imull PARAM_DIVISOR, %eax cmpl $1, %eax diff --git a/gmp/mpn/x86/k7/gcd_1.asm b/gmp/mpn/x86/k7/gcd_1.asm index c7d12c83c0..f912f43730 100644 --- a/gmp/mpn/x86/k7/gcd_1.asm +++ b/gmp/mpn/x86/k7/gcd_1.asm @@ -1,186 +1,369 @@ -dnl x86 mpn_gcd_1 optimised for AMD K7. +dnl AMD K7 mpn_gcd_1 -- mpn by 1 gcd. -dnl Contributed to the GNU project by by Kevin Ryde. Rehacked by Torbjorn -dnl Granlund. - -dnl Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or +dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc. dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. 
+dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/bit (approx) -C AMD K7 5.31 -C AMD K8,K9 5.33 -C AMD K10 5.30 -C AMD bd1 ? -C AMD bobcat 7.02 -C Intel P4-2 10.1 -C Intel P4-3/4 10.0 -C Intel P6/13 5.88 -C Intel core2 6.26 -C Intel NHM 6.83 -C Intel SBR 8.50 -C Intel atom 8.90 -C VIA nano ? -C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1 - -C TODO -C * Tune overhead, this takes 2-3 cycles more than old code when v0 is tiny. -C * Stream things better through registers, avoiding some copying. - -C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. +C K7: 6.75 cycles/bit (approx) 1x1 gcd +C 11.0 cycles/limb Nx1 reduction (modexact_1_odd) + + +dnl Reduce using x%y if x is more than DIV_THRESHOLD bits bigger than y, +dnl where x is the larger of the two. See tune/README for more. +dnl +dnl divl at 40 cycles compared to the gcd at about 7 cycles/bitpair +dnl suggests 40/7*2=11.4 but 7 seems to be about right. + +deflit(DIV_THRESHOLD, 7) + +C table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. +C +C This is mixed in with the code, but as per the k7 optimization manual it's +C a full cache line and suitably aligned so it won't get swapped between +C code and data. Having it in TEXT rather than RODATA saves needing a GOT +C entry when PIC. 
+C +C Actually, there doesn't seem to be a measurable difference between this in +C it's own cache line or plonked in the middle of the code. Presumably +C since TEXT is read-only there's no worries about coherency. + +deflit(MASK, 63) deflit(MAXSHIFT, 6) -deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) -DEF_OBJECT(ctz_table,64) + TEXT + ALIGN(64) +L(table): .byte MAXSHIFT forloop(i,1,MASK, ` .byte m4_count_trailing_zeros(i) ') -END_OBJECT(ctz_table) -C Threshold of when to call bmod when U is one limb. Should be about -C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit). -define(`DIV_THRES_LOG2', 7) +C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t limb); +C + +defframe(PARAM_LIMB, 12) +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) -define(`up', `%edi') -define(`n', `%esi') -define(`v0', `%edx') +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +defframe(CALL_DIVISOR,-20) +defframe(CALL_SIZE, -24) +defframe(CALL_SRC, -28) +deflit(STACK_SPACE, 28) -ASM_START() TEXT ALIGN(16) + PROLOGUE(mpn_gcd_1) - push %edi - push %esi +deflit(`FRAME',0) + + ASSERT(ne, `cmpl $0, PARAM_LIMB') C y!=0 + ASSERT(ae, `cmpl $1, PARAM_SIZE') C size>=1 + + movl PARAM_SRC, %eax + movl PARAM_LIMB, %edx + subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE) - mov 12(%esp), up - mov 16(%esp), n - mov 20(%esp), v0 + movl %esi, SAVE_ESI + movl %ebx, SAVE_EBX - mov (up), %eax C U low limb - or v0, %eax C x | y - mov $-1, %ecx + movl (%eax), %esi C src low limb + +ifdef(`PIC',` + movl %edi, SAVE_EDI + call L(movl_eip_to_edi) +L(here): + addl $L(table)-L(here), %edi +') + + movl %esi, %ebx + orl %edx, %esi C x|y + movl $-1, %ecx L(twos): - inc %ecx - shr %eax - jnc L(twos) + incl %ecx + shrl %esi + jnc L(twos) C 3/4 chance of x or y odd already - shr %cl, v0 - mov %ecx, %eax C common twos + shrl %cl, %ebx + shrl %cl, %edx + movl %ecx, %esi C common twos -L(divide_strip_y): - shr v0 - jnc L(divide_strip_y) - adc v0, v0 - - push 
%eax - push v0 - - cmp $1, n - jnz L(reduce_nby1) - -C Both U and V are single limbs, reduce with bmod if u0 >> v0. - mov (up), %ecx - mov %ecx, %eax - shr $DIV_THRES_LOG2, %ecx - cmp %ecx, v0 - ja L(reduced) - - mov v0, %esi - xor %edx, %edx - div %esi - mov %edx, %eax - jmp L(reduced) - -L(reduce_nby1): -ifdef(`PIC_WITH_EBX',` - push %ebx - call L(movl_eip_to_ebx) - add $_GLOBAL_OFFSET_TABLE_, %ebx + movl PARAM_SIZE, %ecx + cmpl $1, %ecx + ja L(divide) + + + C eax + C ebx x + C ecx + C edx y + C esi common twos + C edi [PIC] L(table) + C ebp + + movl %edx, %eax + cmpl %ebx, %edx + + cmovb( %ebx, %eax) C swap to make x bigger than y + cmovb( %edx, %ebx) + + +L(strip_y): + C eax x + C ebx y + C ecx + C edx + C esi common twos + C edi [PIC] L(table) + C ebp + + ASSERT(nz,`orl %ebx,%ebx') + shrl %ebx + jnc L(strip_y) + rcll %ebx + + + C eax x + C ebx y (odd) + C ecx + C edx + C esi common twos + C edi [PIC] L(table) + C ebp + + movl %eax, %ecx + movl %ebx, %edx + shrl $DIV_THRESHOLD, %eax + + cmpl %eax, %ebx + movl %ecx, %eax + ja L(strip_x_entry) C do x%y if x much bigger than y + + + xorl %edx, %edx + + divl %ebx + + orl %edx, %edx + movl %edx, %eax C remainder -> x + movl %ebx, %edx C y + + jz L(done_ebx) + jmp L(strip_x) + + + C Offset 0x9D here for non-PIC. About 0.4 cycles/bit is saved by + C ensuring the end of the jnz at the end of this loop doesn't cross + C into the next cache line at 0xC0. + C + C PIC on the other hand is offset 0xAC here and extends to 0xC9, so + C it crosses but doesn't suffer any measurable slowdown. 
+ +L(top): + C eax x + C ebx y-x + C ecx x-y + C edx y + C esi twos, for use at end + C edi [PIC] L(table) + + cmovc( %ebx, %ecx) C if x-y gave carry, use x and y-x + cmovc( %eax, %edx) + +L(strip_x): + movl %ecx, %eax +L(strip_x_entry): + andl $MASK, %ecx + + ASSERT(nz, `orl %eax, %eax') + +ifdef(`PIC',` + movb (%ecx,%edi), %cl +',` + movb L(table) (%ecx), %cl ') - push v0 C param 3 - push n C param 2 - push up C param 1 - cmp $BMOD_1_TO_MOD_1_THRESHOLD, n - jl L(bmod) - CALL( mpn_mod_1) - jmp L(called) -L(bmod): - CALL( mpn_modexact_1_odd) - -L(called): - add $12, %esp C deallocate params -ifdef(`PIC_WITH_EBX',` - pop %ebx + + shrl %cl, %eax + cmpb $MAXSHIFT, %cl + + movl %eax, %ecx + movl %edx, %ebx + je L(strip_x) + + ASSERT(nz, `testl $1, %eax') C both odd + ASSERT(nz, `testl $1, %edx') + + subl %eax, %ebx + subl %edx, %ecx + jnz L(top) + + +L(done): + movl %esi, %ecx + movl SAVE_ESI, %esi +ifdef(`PIC',` + movl SAVE_EDI, %edi ') -L(reduced): - pop %edx - - LEA( ctz_table, %esi) - test %eax, %eax - mov %eax, %ecx - jnz L(mid) - jmp L(end) - - ALIGN(16) C K8 BC P4 NHM SBR -L(top): cmovc( %ecx, %eax) C if x-y < 0 0 - cmovc( %edi, %edx) C use x,y-x 0 -L(mid): and $MASK, %ecx C 0 - movzbl (%esi,%ecx), %ecx C 1 - jz L(shift_alot) C 1 - shr %cl, %eax C 3 - mov %eax, %edi C 4 - mov %edx, %ecx C 3 - sub %eax, %ecx C 4 - sub %edx, %eax C 4 - jnz L(top) C 5 - -L(end): pop %ecx - mov %edx, %eax - shl %cl, %eax - pop %esi - pop %edi - ret -L(shift_alot): - shr $MAXSHIFT, %eax - mov %eax, %ecx - jmp L(mid) + shll %cl, %eax + movl SAVE_EBX, %ebx + addl $FRAME, %esp -ifdef(`PIC_WITH_EBX',` -L(movl_eip_to_ebx): - mov (%esp), %ebx ret + + + +C ----------------------------------------------------------------------------- +C two or more limbs + +dnl MODEXACT_THRESHOLD is the size at which it's better to call +dnl mpn_modexact_1_odd than do an inline loop. 
+ +deflit(MODEXACT_THRESHOLD, ifdef(`PIC',6,5)) + +L(divide): + C eax src + C ebx + C ecx size + C edx y + C esi common twos + C edi [PIC] L(table) + C ebp + +L(divide_strip_y): + ASSERT(nz,`orl %edx,%edx') + shrl %edx + jnc L(divide_strip_y) + leal 1(%edx,%edx), %ebx C y now odd + + movl %ebp, SAVE_EBP + movl %eax, %ebp + movl -4(%eax,%ecx,4), %eax C src high limb + + cmp $MODEXACT_THRESHOLD, %ecx + jae L(modexact) + + cmpl %ebx, %eax C high cmp divisor + movl $0, %edx + + cmovc( %eax, %edx) C skip a div if high<divisor + sbbl $0, %ecx + + +L(divide_top): + C eax scratch (quotient) + C ebx y + C ecx counter (size to 1, inclusive) + C edx carry (remainder) + C esi common twos + C edi [PIC] L(table) + C ebp src + + movl -4(%ebp,%ecx,4), %eax + + divl %ebx + + decl %ecx + jnz L(divide_top) + + + C eax + C ebx y (odd) + C ecx + C edx x + C esi common twos + C edi [PIC] L(table) + C ebp + + orl %edx, %edx + movl SAVE_EBP, %ebp + movl %edx, %eax + + movl %edx, %ecx + movl %ebx, %edx + jnz L(strip_x_entry) + + +L(done_ebx): + movl %ebx, %eax + jmp L(done) + + + +L(modexact): + C eax + C ebx y + C ecx size + C edx + C esi common twos + C edi [PIC] L(table) + C ebp src + +ifdef(`PIC',` + movl %ebp, CALL_SRC + movl %ebx, %ebp C y + movl %edi, %ebx C L(table) + + addl $_GLOBAL_OFFSET_TABLE_+[.-L(table)], %ebx + movl %ebp, CALL_DIVISOR + movl %ecx, CALL_SIZE + + call GSYM_PREFIX`'mpn_modexact_1_odd@PLT +',` +dnl non-PIC + movl %ebx, CALL_DIVISOR + movl %ebp, CALL_SRC + movl %ecx, CALL_SIZE + + call GSYM_PREFIX`'mpn_modexact_1_odd ') + + C eax x + C ebx [non-PIC] y + C ecx + C edx + C esi common twos + C edi [PIC] L(table) + C ebp [PIC] y + + orl %eax, %eax + movl ifdef(`PIC',`%ebp',`%ebx'), %edx + movl SAVE_EBP, %ebp + + movl %eax, %ecx + jnz L(strip_x_entry) + + movl %edx, %eax + jmp L(done) + + +ifdef(`PIC', ` +L(movl_eip_to_edi): + movl (%esp), %edi + ret_internal +') + EPILOGUE() diff --git a/gmp/mpn/x86/k7/gmp-mparam.h b/gmp/mpn/x86/k7/gmp-mparam.h index 
9977a113e2..ced0c020f7 100644 --- a/gmp/mpn/x86/k7/gmp-mparam.h +++ b/gmp/mpn/x86/k7/gmp-mparam.h @@ -1,241 +1,73 @@ /* AMD K7 gmp-mparam.h -- Compiler/machine parameter header file. -Copyright 1991, 1993, 1994, 2000-2005, 2008-2010, 2014 Free Software -Foundation, Inc. +Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2008 Free +Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of either: +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. - * the GNU Lesser General Public License as published by the Free - Software Foundation; either version 3 of the License, or (at your - option) any later version. +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. -or +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ - * the GNU General Public License as published by the Free Software - Foundation; either version 2 of the License, or (at your option) any - later version. +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 -or both in parallel, as here. -The GNU MP Library is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. - -You should have received copies of the GNU General Public License and the -GNU Lesser General Public License along with the GNU MP Library. If not, -see https://www.gnu.org/licenses/. 
*/ - -#define GMP_LIMB_BITS 32 -#define GMP_LIMB_BYTES 4 - -/* 2083 MHz K7 Barton */ -/* FFT tuning limit = 25000000 */ -/* Generated by tuneup.c, 2014-03-13, gcc 4.2 */ - -#define MOD_1_NORM_THRESHOLD 0 /* always */ -#define MOD_1_UNNORM_THRESHOLD 3 -#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 -#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 -#define MOD_1_1_TO_MOD_1_2_THRESHOLD 24 -#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10 -#define USE_PREINV_DIVREM_1 1 /* native */ -#define DIV_QR_1N_PI1_METHOD 1 -#define DIV_QR_1_NORM_THRESHOLD 3 -#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ -#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ -#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ -#define BMOD_1_TO_MOD_1_THRESHOLD 24 - -#define MUL_TOOM22_THRESHOLD 28 -#define MUL_TOOM33_THRESHOLD 85 -#define MUL_TOOM44_THRESHOLD 147 -#define MUL_TOOM6H_THRESHOLD 216 -#define MUL_TOOM8H_THRESHOLD 309 - -#define MUL_TOOM32_TO_TOOM43_THRESHOLD 85 -#define MUL_TOOM32_TO_TOOM53_THRESHOLD 99 -#define MUL_TOOM42_TO_TOOM53_THRESHOLD 98 -#define MUL_TOOM42_TO_TOOM63_THRESHOLD 102 -#define MUL_TOOM43_TO_TOOM54_THRESHOLD 124 - -#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ -#define SQR_TOOM2_THRESHOLD 50 -#define SQR_TOOM3_THRESHOLD 81 -#define SQR_TOOM4_THRESHOLD 216 -#define SQR_TOOM6_THRESHOLD 306 -#define SQR_TOOM8_THRESHOLD 446 - -#define MULMID_TOOM42_THRESHOLD 56 - -#define MULMOD_BNM1_THRESHOLD 17 -#define SQRMOD_BNM1_THRESHOLD 17 - -#define MUL_FFT_MODF_THRESHOLD 904 /* k = 6 */ -#define MUL_FFT_TABLE3 \ - { { 904, 6}, { 21, 7}, { 11, 6}, { 25, 7}, \ - { 13, 6}, { 27, 7}, { 15, 6}, { 31, 7}, \ - { 17, 6}, { 35, 7}, { 19, 6}, { 39, 7}, \ - { 23, 6}, { 47, 7}, { 27, 8}, { 15, 7}, \ - { 31, 6}, { 63, 7}, { 35, 8}, { 19, 7}, \ - { 39, 8}, { 23, 7}, { 47, 8}, { 31, 7}, \ - { 63, 8}, { 39, 7}, { 79, 9}, { 23, 8}, \ - { 47, 7}, { 95, 8}, { 51, 9}, { 31, 8}, \ - { 71, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ - { 
95, 9}, { 55,10}, { 31, 9}, { 63, 8}, \ - { 127, 9}, { 71, 8}, { 143, 9}, { 79, 8}, \ - { 159,10}, { 47, 9}, { 95, 8}, { 191, 9}, \ - { 103,11}, { 31,10}, { 63, 9}, { 127, 8}, \ - { 255, 9}, { 143,10}, { 79, 9}, { 167,10}, \ - { 95, 9}, { 199,10}, { 111,11}, { 63,10}, \ - { 127, 9}, { 255,10}, { 143, 9}, { 287,10}, \ - { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ - { 383,10}, { 207,12}, { 63,11}, { 127,10}, \ - { 255, 9}, { 511,10}, { 271, 8}, { 1087,10}, \ - { 287,11}, { 159,10}, { 319, 9}, { 639,11}, \ - { 191,10}, { 383, 9}, { 767, 8}, { 1535, 9}, \ - { 799, 8}, { 1599,11}, { 223,12}, { 127,11}, \ - { 255,10}, { 511, 9}, { 1023,10}, { 543, 9}, \ - { 1087,11}, { 287,10}, { 575, 9}, { 1151,10}, \ - { 607, 9}, { 1215, 8}, { 2431,11}, { 319,10}, \ - { 639, 9}, { 1279,10}, { 671, 9}, { 1343,12}, \ - { 191,11}, { 383,10}, { 767, 9}, { 1535,10}, \ - { 799, 9}, { 1599,10}, { 831, 9}, { 1663,10}, \ - { 863,13}, { 127,12}, { 255,11}, { 511,10}, \ - { 1023,11}, { 543,10}, { 1087,11}, { 575,10}, \ - { 1151,11}, { 607,10}, { 1215, 9}, { 2431,12}, \ - { 319,11}, { 639,10}, { 1407,11}, { 735,10}, \ - { 1471, 9}, { 2943,12}, { 383,11}, { 767,10}, \ - { 1535,11}, { 799,10}, { 1599,11}, { 831,10}, \ - { 1663,11}, { 895,10}, { 1791,11}, { 959,10}, \ - { 1919,13}, { 255,12}, { 511,11}, { 1023,10}, \ - { 2047,11}, { 1087,12}, { 575,11}, { 1151,10}, \ - { 2303,11}, { 1215,10}, { 2431,12}, { 639,11}, \ - { 1279,10}, { 2559,11}, { 1407,10}, { 2815,11}, \ - { 1471,10}, { 2943,13}, { 383,12}, { 767,11}, \ - { 1599,12}, { 831,11}, { 1663,12}, { 895,11}, \ - { 1791,10}, { 3583,12}, { 959,11}, { 1919,10}, \ - { 3839,14}, { 255,13}, { 511,12}, { 1023,11}, \ - { 2047,12}, { 1087,11}, { 2175,12}, { 1151,11}, \ - { 2303,12}, { 1215,11}, { 2431,13}, { 639,12}, \ - { 1407,11}, { 2815,12}, { 1471,11}, { 2943,13}, \ - { 767,12}, { 1663,11}, { 3327,13}, { 895,12}, \ - { 1791,11}, { 3583,12}, { 1919,11}, { 3839,12}, \ - { 1983,11}, { 3967,14}, { 511,13}, { 1023,12}, \ - { 2239,13}, { 1151,12}, { 
2495,13}, { 1279,12}, \ - { 2559,13}, { 1407,12}, { 2943,11}, { 5887,14}, \ - { 767,13}, { 1535,12}, { 3071,13}, { 1663,12}, \ - { 3327,13}, { 1791,12}, { 3583,13}, { 1919,12}, \ - { 3967,15}, { 511,14}, { 1023,13}, { 2047,12}, \ - { 4095,13}, { 2175,12}, { 4351,13}, { 2431,12}, \ - { 4863,14}, { 1279,13}, { 2559,12}, { 5119,13}, \ - { 2943,12}, { 5887,14}, { 16384,15}, { 32768,16} } -#define MUL_FFT_TABLE3_SIZE 228 -#define MUL_FFT_THRESHOLD 7808 - -#define SQR_FFT_MODF_THRESHOLD 888 /* k = 6 */ -#define SQR_FFT_TABLE3 \ - { { 888, 6}, { 21, 7}, { 11, 6}, { 25, 7}, \ - { 13, 6}, { 27, 7}, { 15, 6}, { 31, 7}, \ - { 17, 6}, { 35, 7}, { 19, 6}, { 39, 7}, \ - { 23, 6}, { 47, 7}, { 27, 8}, { 15, 7}, \ - { 31, 6}, { 63, 7}, { 35, 8}, { 19, 7}, \ - { 39, 8}, { 23, 7}, { 47, 8}, { 31, 7}, \ - { 63, 8}, { 39, 9}, { 23, 8}, { 47, 7}, \ - { 95, 8}, { 51, 9}, { 31, 8}, { 67, 9}, \ - { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \ - { 55,10}, { 31, 9}, { 63, 8}, { 127, 9}, \ - { 79,10}, { 47, 9}, { 95, 8}, { 191,11}, \ - { 31,10}, { 63, 9}, { 127, 8}, { 255, 9}, \ - { 143,10}, { 79, 9}, { 167,10}, { 95, 9}, \ - { 191,10}, { 111,11}, { 63,10}, { 127, 9}, \ - { 255, 8}, { 511,10}, { 143, 9}, { 287, 8}, \ - { 575,10}, { 159,11}, { 95,10}, { 191, 9}, \ - { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \ - { 511,10}, { 271, 9}, { 543, 8}, { 1087,10}, \ - { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \ - { 639, 8}, { 1279, 9}, { 671,11}, { 191,10}, \ - { 383, 9}, { 799, 8}, { 1599, 9}, { 831,11}, \ - { 223,12}, { 127,11}, { 255,10}, { 543, 9}, \ - { 1087,11}, { 287,10}, { 575, 9}, { 1215, 8}, \ - { 2431,11}, { 319,10}, { 639, 9}, { 1279,10}, \ - { 671, 9}, { 1407,12}, { 191,10}, { 799, 9}, \ - { 1599,10}, { 831, 9}, { 1663,10}, { 863, 9}, \ - { 1727,11}, { 447,13}, { 127,12}, { 255,11}, \ - { 511,10}, { 1023,11}, { 543,10}, { 1087, 9}, \ - { 2175,10}, { 1119,11}, { 575,10}, { 1151,11}, \ - { 607,10}, { 1215, 9}, { 2431,12}, { 319,11}, \ - { 639,10}, { 1279,11}, { 671,10}, { 1343, 9}, \ - { 
2687,11}, { 703,10}, { 1407,11}, { 735,10}, \ - { 1471, 9}, { 2943,10}, { 1503,12}, { 383,11}, \ - { 767,10}, { 1535,11}, { 799,10}, { 1599,11}, \ - { 863,10}, { 1727,12}, { 447,11}, { 895,10}, \ - { 1791,11}, { 959,10}, { 1919,13}, { 255,12}, \ - { 511,11}, { 1023,10}, { 2047,11}, { 1087,10}, \ - { 2175,11}, { 1119,12}, { 575,11}, { 1151,10}, \ - { 2303,11}, { 1215,10}, { 2431,12}, { 639,11}, \ - { 1407,10}, { 2815,11}, { 1471,10}, { 2943,12}, \ - { 767,11}, { 1599,12}, { 831,11}, { 1663,10}, \ - { 3327,12}, { 895,11}, { 1791,10}, { 3583,12}, \ - { 959,11}, { 1919,10}, { 3839,11}, { 1983,14}, \ - { 255,13}, { 511,12}, { 1023,11}, { 2047,12}, \ - { 1087,11}, { 2175,12}, { 1151,11}, { 2303,12}, \ - { 1215,11}, { 2431,13}, { 639,12}, { 1407,11}, \ - { 2815,12}, { 1471,11}, { 2943,13}, { 767,12}, \ - { 1663,11}, { 3327,12}, { 1727,13}, { 895,12}, \ - { 1791,11}, { 3583,12}, { 1919,11}, { 3839,12}, \ - { 1983,11}, { 3967,14}, { 511,13}, { 1023,12}, \ - { 2175,13}, { 1151,12}, { 2495,13}, { 1279,12}, \ - { 2559,13}, { 1407,12}, { 2943,11}, { 5887,14}, \ - { 767,13}, { 1535,12}, { 3071,13}, { 1663,12}, \ - { 3327,13}, { 1791,12}, { 3583,13}, { 1919,12}, \ - { 3967,15}, { 511,14}, { 1023,13}, { 2047,12}, \ - { 4095,13}, { 2175,12}, { 4351,13}, { 2431,14}, \ - { 1279,13}, { 2943,12}, { 5887,14}, { 16384,15}, \ - { 32768,16} } -#define SQR_FFT_TABLE3_SIZE 229 -#define SQR_FFT_THRESHOLD 7552 - -#define MULLO_BASECASE_THRESHOLD 8 -#define MULLO_DC_THRESHOLD 36 -#define MULLO_MUL_N_THRESHOLD 13463 - -#define DC_DIV_QR_THRESHOLD 45 -#define DC_DIVAPPR_Q_THRESHOLD 208 -#define DC_BDIV_QR_THRESHOLD 43 -#define DC_BDIV_Q_THRESHOLD 140 - -#define INV_MULMOD_BNM1_THRESHOLD 62 -#define INV_NEWTON_THRESHOLD 204 -#define INV_APPR_THRESHOLD 204 - -#define BINV_NEWTON_THRESHOLD 230 -#define REDC_1_TO_REDC_N_THRESHOLD 59 - -#define MU_DIV_QR_THRESHOLD 1752 -#define MU_DIVAPPR_Q_THRESHOLD 1528 -#define MUPI_DIV_QR_THRESHOLD 82 -#define MU_BDIV_QR_THRESHOLD 1360 -#define MU_BDIV_Q_THRESHOLD 
1470 - -#define POWM_SEC_TABLE 1,16,102,336,1221 - -#define MATRIX22_STRASSEN_THRESHOLD 16 -#define HGCD_THRESHOLD 120 -#define HGCD_APPR_THRESHOLD 143 -#define HGCD_REDUCE_THRESHOLD 4818 -#define GCD_DC_THRESHOLD 474 -#define GCDEXT_DC_THRESHOLD 345 -#define JACOBI_BASE_METHOD 4 - -#define GET_STR_DC_THRESHOLD 15 -#define GET_STR_PRECOMPUTE_THRESHOLD 33 -#define SET_STR_DC_THRESHOLD 298 -#define SET_STR_PRECOMPUTE_THRESHOLD 1187 - -#define FAC_DSC_THRESHOLD 602 -#define FAC_ODD_THRESHOLD 29 +/* 2083 MHz Athlon */ + +/* Generated by tuneup.c, 2008-12-23, gcc 3.4 */ + +#define MUL_KARATSUBA_THRESHOLD 28 +#define MUL_TOOM3_THRESHOLD 89 +#define MUL_TOOM44_THRESHOLD 130 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_KARATSUBA_THRESHOLD 52 +#define SQR_TOOM3_THRESHOLD 89 +#define SQR_TOOM4_THRESHOLD 196 + +#define MULLOW_BASECASE_THRESHOLD 10 +#define MULLOW_DC_THRESHOLD 96 +#define MULLOW_MUL_N_THRESHOLD 234 + +#define DIV_SB_PREINV_THRESHOLD 0 /* always */ +#define DIV_DC_THRESHOLD 86 +#define POWM_THRESHOLD 134 +#define MATRIX22_STRASSEN_THRESHOLD 18 +#define HGCD_THRESHOLD 163 +#define GCD_DC_THRESHOLD 665 +#define GCDEXT_DC_THRESHOLD 605 +#define JACOBI_BASE_METHOD 1 + +#define USE_PREINV_DIVREM_1 1 /* native */ +#define USE_PREINV_MOD_1 1 /* native */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */ + +#define GET_STR_DC_THRESHOLD 19 +#define GET_STR_PRECOMPUTE_THRESHOLD 35 +#define SET_STR_DC_THRESHOLD 826 +#define SET_STR_PRECOMPUTE_THRESHOLD 1691 + +#define MUL_FFT_TABLE { 432, 864, 1664, 4608, 10240, 40960, 163840, 655360, 0 } +#define MUL_FFT_MODF_THRESHOLD 496 +#define MUL_FFT_THRESHOLD 4864 + +#define SQR_FFT_TABLE { 432, 864, 1664, 4608, 10240, 40960, 98304, 655360, 0 } +#define SQR_FFT_MODF_THRESHOLD 432 +#define SQR_FFT_THRESHOLD 3840 + +/* These tables need to be updated. 
*/ + +#define MUL_FFT_TABLE2 {{1, 4}, {401, 5}, {801, 6}, {817, 5}, {865, 6}, {1025, 5}, {1057, 6}, {1601, 7}, {1633, 6}, {1729, 7}, {1921, 6}, {2113, 7}, {2177, 6}, {2241, 7}, {2433, 6}, {2497, 7}, {2945, 6}, {3009, 7}, {3457, 8}, {3521, 7}, {4481, 8}, {4865, 7}, {5249, 8}, {5889, 7}, {6017, 8}, {7553, 9}, {7681, 8}, {9985, 9}, {11777, 8}, {13057, 9}, {13825, 8}, {14081, 9}, {15873, 8}, {16641, 9}, {16897, 8}, {17153, 9}, {19969, 8}, {20225, 9}, {20737, 8}, {20993, 9}, {24065, 8}, {24577, 9}, {25089, 8}, {25345, 9}, {27393, 10}, {27649, 9}, {28161, 10}, {31745, 9}, {38913, 10}, {39425, 9}, {40449, 10}, {48129, 9}, {48641, 11}, {63489, 10}, {98305, 11}, {99329, 10}, {100353, 11}, {101377, 10}, {103425, 11}, {104449, 10}, {110593, 11}, {112641, 10}, {113665, 11}, {129025, 10}, {162817, 11}, {194561, 10}, {195585, 12}, {258049, 11}, {391169, 12}, {520193, 11}, {718849, 12}, {782337, 11}, {849921, 13}, {1040385, 12}, {2879489, 13}, {3137537, 12}, {3928065, 13}, {4186113, 12}, {4976641, 13}, {5234689, 12}, {6025217, 13}, {6283265, 12}, {MP_SIZE_T_MAX,0}} + +#define SQR_FFT_TABLE2 {{1, 4}, {401, 5}, {417, 4}, {433, 5}, {881, 6}, {961, 5}, {993, 6}, {1857, 7}, {1921, 6}, {2049, 7}, {2177, 6}, {2241, 7}, {2433, 6}, {2497, 7}, {3457, 8}, {3841, 7}, {4481, 8}, {4609, 7}, {4737, 8}, {4865, 7}, {5249, 8}, {5889, 7}, {6273, 8}, {7041, 9}, {7681, 8}, {9985, 9}, {10241, 8}, {10497, 9}, {11777, 8}, {13057, 9}, {15873, 8}, {16385, 9}, {16897, 8}, {17153, 9}, {19969, 8}, {20225, 9}, {20737, 8}, {20993, 9}, {24065, 8}, {24321, 9}, {24577, 10}, {24833, 9}, {25601, 10}, {27137, 9}, {27649, 10}, {31745, 9}, {38401, 10}, {38913, 9}, {40449, 10}, {48129, 9}, {48641, 11}, {63489, 10}, {99329, 11}, {101377, 10}, {103425, 11}, {104449, 10}, {107521, 11}, {110593, 10}, {113665, 11}, {129025, 10}, {154625, 11}, {155649, 10}, {162817, 11}, {194561, 12}, {258049, 11}, {391169, 12}, {520193, 11}, {718849, 12}, {727041, 11}, {729089, 12}, {782337, 11}, {849921, 13}, {1040385, 12}, {2879489, 13}, 
{3137537, 12}, {3928065, 13}, {4186113, 12}, {4714497, 13}, {5234689, 12}, {6025217, 13}, {6283265, 12}, {7073793, 13}, {7331841, 12}, {MP_SIZE_T_MAX,0}} diff --git a/gmp/mpn/x86/k7/invert_limb.asm b/gmp/mpn/x86/k7/invert_limb.asm deleted file mode 100644 index 6cce455a9d..0000000000 --- a/gmp/mpn/x86/k7/invert_limb.asm +++ /dev/null @@ -1,193 +0,0 @@ -dnl x86 mpn_invert_limb - -dnl Contributed to the GNU project by Niels Möller - -dnl Copyright 2009, 2011 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles (approx) div -C P5 ? -C P6 model 0-8,10-12 ? -C P6 model 9 (Banias) ? -C P6 model 13 (Dothan) ? -C P4 model 0 (Willamette) ? -C P4 model 1 (?) ? -C P4 model 2 (Northwood) ? -C P4 model 3 (Prescott) ? -C P4 model 4 (Nocona) ? -C AMD K6 ? -C AMD K7 41 53 -C AMD K8 ? - -C TODO -C * These c/l numbers are for a non-PIC build. Consider falling back to using -C the 'div' instruction for PIC builds. 
-C * Perhaps use this file--or at least the algorithm--for more machines than k7. - -C Register usage: -C Input D in %edi -C Current approximation is in %eax and/or %ecx -C %ebx and %edx are temporaries -C %esi and %ebp are unused - -defframe(PARAM_DIVISOR,4) - -ASM_START() - -C Make approx_tab global to work around Apple relocation bug. -ifdef(`DARWIN',` - deflit(`approx_tab', MPN(invert_limb_tab)) - GLOBL approx_tab') - - TEXT - ALIGN(16) -PROLOGUE(mpn_invert_limb) -deflit(`FRAME', 0) - mov PARAM_DIVISOR, %eax - C Avoid push/pop on k7. - sub $8, %esp FRAME_subl_esp(8) - mov %ebx, (%esp) - mov %edi, 4(%esp) - - mov %eax, %edi - shr $22, %eax -ifdef(`PIC',` - LEA( approx_tab, %ebx) - movzwl -1024(%ebx, %eax, 2), %eax -',` - movzwl -1024+approx_tab(%eax, %eax), %eax C %eax = v0 -') - - C v1 = (v0 << 4) - ((v0*v0*d_21) >> 32) - 1 - mov %eax, %ecx - imul %eax, %eax - mov %edi, %ebx - shr $11, %ebx - inc %ebx - mul %ebx - mov %edi, %ebx C Prepare - shr %ebx - sbb %eax, %eax - sub %eax, %ebx C %ebx = d_31, %eax = mask - shl $4, %ecx - dec %ecx - sub %edx, %ecx C %ecx = v1 - - C v_2 = (v1 << 15) + ((v1 *(2^48 - v1 * d31 + (v1 >> 1) & mask)) >> 33) - imul %ecx, %ebx - and %ecx, %eax - shr %eax - sub %ebx, %eax - mul %ecx - mov %edi, %eax C Prepare for next mul - shl $15, %ecx - shr %edx - add %edx, %ecx C %ecx = v2 - - mul %ecx - add %edi, %eax - mov %ecx, %eax - adc %edi, %edx - sub %edx, %eax C %eax = v3 - - mov (%esp), %ebx - mov 4(%esp), %edi - add $8, %esp - - ret - -EPILOGUE() - -DEF_OBJECT(approx_tab,2) - .value 0x7fe1,0x7fa1,0x7f61,0x7f22,0x7ee3,0x7ea4,0x7e65,0x7e27 - .value 0x7de9,0x7dab,0x7d6d,0x7d30,0x7cf3,0x7cb6,0x7c79,0x7c3d - .value 0x7c00,0x7bc4,0x7b89,0x7b4d,0x7b12,0x7ad7,0x7a9c,0x7a61 - .value 0x7a27,0x79ec,0x79b2,0x7979,0x793f,0x7906,0x78cc,0x7894 - .value 0x785b,0x7822,0x77ea,0x77b2,0x777a,0x7742,0x770b,0x76d3 - .value 0x769c,0x7665,0x762f,0x75f8,0x75c2,0x758c,0x7556,0x7520 - .value 0x74ea,0x74b5,0x7480,0x744b,0x7416,0x73e2,0x73ad,0x7379 - .value 
0x7345,0x7311,0x72dd,0x72aa,0x7277,0x7243,0x7210,0x71de - .value 0x71ab,0x7179,0x7146,0x7114,0x70e2,0x70b1,0x707f,0x704e - .value 0x701c,0x6feb,0x6fba,0x6f8a,0x6f59,0x6f29,0x6ef9,0x6ec8 - .value 0x6e99,0x6e69,0x6e39,0x6e0a,0x6ddb,0x6dab,0x6d7d,0x6d4e - .value 0x6d1f,0x6cf1,0x6cc2,0x6c94,0x6c66,0x6c38,0x6c0a,0x6bdd - .value 0x6bb0,0x6b82,0x6b55,0x6b28,0x6afb,0x6acf,0x6aa2,0x6a76 - .value 0x6a49,0x6a1d,0x69f1,0x69c6,0x699a,0x696e,0x6943,0x6918 - .value 0x68ed,0x68c2,0x6897,0x686c,0x6842,0x6817,0x67ed,0x67c3 - .value 0x6799,0x676f,0x6745,0x671b,0x66f2,0x66c8,0x669f,0x6676 - .value 0x664d,0x6624,0x65fc,0x65d3,0x65aa,0x6582,0x655a,0x6532 - .value 0x650a,0x64e2,0x64ba,0x6493,0x646b,0x6444,0x641c,0x63f5 - .value 0x63ce,0x63a7,0x6381,0x635a,0x6333,0x630d,0x62e7,0x62c1 - .value 0x629a,0x6275,0x624f,0x6229,0x6203,0x61de,0x61b8,0x6193 - .value 0x616e,0x6149,0x6124,0x60ff,0x60da,0x60b6,0x6091,0x606d - .value 0x6049,0x6024,0x6000,0x5fdc,0x5fb8,0x5f95,0x5f71,0x5f4d - .value 0x5f2a,0x5f07,0x5ee3,0x5ec0,0x5e9d,0x5e7a,0x5e57,0x5e35 - .value 0x5e12,0x5def,0x5dcd,0x5dab,0x5d88,0x5d66,0x5d44,0x5d22 - .value 0x5d00,0x5cde,0x5cbd,0x5c9b,0x5c7a,0x5c58,0x5c37,0x5c16 - .value 0x5bf5,0x5bd4,0x5bb3,0x5b92,0x5b71,0x5b51,0x5b30,0x5b10 - .value 0x5aef,0x5acf,0x5aaf,0x5a8f,0x5a6f,0x5a4f,0x5a2f,0x5a0f - .value 0x59ef,0x59d0,0x59b0,0x5991,0x5972,0x5952,0x5933,0x5914 - .value 0x58f5,0x58d6,0x58b7,0x5899,0x587a,0x585b,0x583d,0x581f - .value 0x5800,0x57e2,0x57c4,0x57a6,0x5788,0x576a,0x574c,0x572e - .value 0x5711,0x56f3,0x56d5,0x56b8,0x569b,0x567d,0x5660,0x5643 - .value 0x5626,0x5609,0x55ec,0x55cf,0x55b2,0x5596,0x5579,0x555d - .value 0x5540,0x5524,0x5507,0x54eb,0x54cf,0x54b3,0x5497,0x547b - .value 0x545f,0x5443,0x5428,0x540c,0x53f0,0x53d5,0x53b9,0x539e - .value 0x5383,0x5368,0x534c,0x5331,0x5316,0x52fb,0x52e0,0x52c6 - .value 0x52ab,0x5290,0x5276,0x525b,0x5240,0x5226,0x520c,0x51f1 - .value 0x51d7,0x51bd,0x51a3,0x5189,0x516f,0x5155,0x513b,0x5121 - .value 
0x5108,0x50ee,0x50d5,0x50bb,0x50a2,0x5088,0x506f,0x5056 - .value 0x503c,0x5023,0x500a,0x4ff1,0x4fd8,0x4fbf,0x4fa6,0x4f8e - .value 0x4f75,0x4f5c,0x4f44,0x4f2b,0x4f13,0x4efa,0x4ee2,0x4eca - .value 0x4eb1,0x4e99,0x4e81,0x4e69,0x4e51,0x4e39,0x4e21,0x4e09 - .value 0x4df1,0x4dda,0x4dc2,0x4daa,0x4d93,0x4d7b,0x4d64,0x4d4d - .value 0x4d35,0x4d1e,0x4d07,0x4cf0,0x4cd8,0x4cc1,0x4caa,0x4c93 - .value 0x4c7d,0x4c66,0x4c4f,0x4c38,0x4c21,0x4c0b,0x4bf4,0x4bde - .value 0x4bc7,0x4bb1,0x4b9a,0x4b84,0x4b6e,0x4b58,0x4b41,0x4b2b - .value 0x4b15,0x4aff,0x4ae9,0x4ad3,0x4abd,0x4aa8,0x4a92,0x4a7c - .value 0x4a66,0x4a51,0x4a3b,0x4a26,0x4a10,0x49fb,0x49e5,0x49d0 - .value 0x49bb,0x49a6,0x4990,0x497b,0x4966,0x4951,0x493c,0x4927 - .value 0x4912,0x48fe,0x48e9,0x48d4,0x48bf,0x48ab,0x4896,0x4881 - .value 0x486d,0x4858,0x4844,0x482f,0x481b,0x4807,0x47f3,0x47de - .value 0x47ca,0x47b6,0x47a2,0x478e,0x477a,0x4766,0x4752,0x473e - .value 0x472a,0x4717,0x4703,0x46ef,0x46db,0x46c8,0x46b4,0x46a1 - .value 0x468d,0x467a,0x4666,0x4653,0x4640,0x462c,0x4619,0x4606 - .value 0x45f3,0x45e0,0x45cd,0x45ba,0x45a7,0x4594,0x4581,0x456e - .value 0x455b,0x4548,0x4536,0x4523,0x4510,0x44fe,0x44eb,0x44d8 - .value 0x44c6,0x44b3,0x44a1,0x448f,0x447c,0x446a,0x4458,0x4445 - .value 0x4433,0x4421,0x440f,0x43fd,0x43eb,0x43d9,0x43c7,0x43b5 - .value 0x43a3,0x4391,0x437f,0x436d,0x435c,0x434a,0x4338,0x4327 - .value 0x4315,0x4303,0x42f2,0x42e0,0x42cf,0x42bd,0x42ac,0x429b - .value 0x4289,0x4278,0x4267,0x4256,0x4244,0x4233,0x4222,0x4211 - .value 0x4200,0x41ef,0x41de,0x41cd,0x41bc,0x41ab,0x419a,0x418a - .value 0x4179,0x4168,0x4157,0x4147,0x4136,0x4125,0x4115,0x4104 - .value 0x40f4,0x40e3,0x40d3,0x40c2,0x40b2,0x40a2,0x4091,0x4081 - .value 0x4071,0x4061,0x4050,0x4040,0x4030,0x4020,0x4010,0x4000 -END_OBJECT(approx_tab) diff --git a/gmp/mpn/x86/k7/mmx/com.asm b/gmp/mpn/x86/k7/mmx/com_n.asm index a258c224f1..068c01f076 100644 --- a/gmp/mpn/x86/k7/mmx/com.asm +++ b/gmp/mpn/x86/k7/mmx/com_n.asm @@ -1,32 +1,21 @@ -dnl AMD Athlon mpn_com -- mpn 
bitwise one's complement. +dnl AMD Athlon mpn_com_n -- mpn bitwise one's complement. dnl Copyright 2002 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
include(`../config.m4') @@ -34,7 +23,7 @@ include(`../config.m4') C K7: 1.0 cycles/limb -C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size); +C void mpn_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size); C C The loop form below is necessary for the claimed speed. It needs to be C aligned to a 16 byte boundary and only 16 bytes long. Maybe that's so it @@ -62,7 +51,7 @@ defframe(PARAM_DST, 4) TEXT ALIGN(16) -PROLOGUE(mpn_com) +PROLOGUE(mpn_com_n) deflit(`FRAME',0) movl PARAM_DST, %edx diff --git a/gmp/mpn/x86/k7/mmx/copyd.asm b/gmp/mpn/x86/k7/mmx/copyd.asm index 59ece40920..4601fcd75a 100644 --- a/gmp/mpn/x86/k7/mmx/copyd.asm +++ b/gmp/mpn/x86/k7/mmx/copyd.asm @@ -1,32 +1,21 @@ dnl AMD K7 mpn_copyd -- copy limb vector, decrementing. dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. 
+dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') diff --git a/gmp/mpn/x86/k7/mmx/copyi.asm b/gmp/mpn/x86/k7/mmx/copyi.asm index 9a28f927ec..a17d575ff4 100644 --- a/gmp/mpn/x86/k7/mmx/copyi.asm +++ b/gmp/mpn/x86/k7/mmx/copyi.asm @@ -1,32 +1,21 @@ dnl AMD K7 mpn_copyi -- copy limb vector, incrementing. dnl Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. 
dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') diff --git a/gmp/mpn/x86/k7/mmx/divrem_1.asm b/gmp/mpn/x86/k7/mmx/divrem_1.asm index cf343280bb..fa5824c7b9 100644 --- a/gmp/mpn/x86/k7/mmx/divrem_1.asm +++ b/gmp/mpn/x86/k7/mmx/divrem_1.asm @@ -1,33 +1,22 @@ dnl AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb dnl division. -dnl Copyright 1999-2002, 2004 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: +dnl Copyright 1999, 2000, 2001, 2002, 2004 Free Software Foundation, Inc. dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. 
+dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') @@ -456,7 +445,7 @@ C chain, and nothing better than 18 cycles has been found when using it. C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will C be an extremely rare event. C -C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but +C Branch mispredictions will hit random occurrances of q1==0xFFFFFFFF, but C if some special data is coming out with this always, the q1_ff special C case actually runs at 15 c/l. 0x2FFF...FFFD divided by 3 is a good way to C induce the q1_ff case, for speed measurements or testing. Note that @@ -735,12 +724,12 @@ C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always. C rnd() means rounding down to a multiple of d. 
C C m*n2 + b*n2 <= m*(d-1) + b*(d-1) -C = m*d + b*d - m - b -C = floor((b(b-d)-1)/d)*d + b*d - m - b -C = rnd(b(b-d)-1) + b*d - m - b -C = rnd(b(b-d)-1 + b*d) - m - b -C = rnd(b*b-1) - m - b -C <= (b-2)*b +C = m*d + b*d - m - b +C = floor((b(b-d)-1)/d)*d + b*d - m - b +C = rnd(b(b-d)-1) + b*d - m - b +C = rnd(b(b-d)-1 + b*d) - m - b +C = rnd(b*b-1) - m - b +C <= (b-2)*b C C Unchanged from the general case is that the final quotient limb q can be C either q1 or q1+1, and the q1+1 case occurs often. This can be seen from diff --git a/gmp/mpn/x86/k7/mmx/lshift.asm b/gmp/mpn/x86/k7/mmx/lshift.asm index b3383cf2c3..b3bff8ffd1 100644 --- a/gmp/mpn/x86/k7/mmx/lshift.asm +++ b/gmp/mpn/x86/k7/mmx/lshift.asm @@ -1,32 +1,21 @@ dnl AMD K7 mpn_lshift -- mpn left shift. -dnl Copyright 1999-2002 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: +dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc. dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') diff --git a/gmp/mpn/x86/k7/mmx/mod_1.asm b/gmp/mpn/x86/k7/mmx/mod_1.asm new file mode 100644 index 0000000000..2b42e55caf --- /dev/null +++ b/gmp/mpn/x86/k7/mmx/mod_1.asm @@ -0,0 +1,509 @@ +dnl AMD K7 mpn_mod_1 -- mpn by limb remainder. + +dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K7: 17.0 cycles/limb. 
+ + +C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor); +C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t carry); +C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t inverse); +C +C The code here is the same as mpn_divrem_1, but with the quotient +C discarded. See mpn/x86/k7/mmx/divrem_1.c for some comments. + + +dnl MUL_THRESHOLD is the size at which the multiply by inverse method is +dnl used, rather than plain "divl"s. Minimum value 2. +dnl +dnl The inverse takes about 50 cycles to calculate, but after that the +dnl multiply is 17 c/l versus division at 41 c/l. +dnl +dnl Using mul or div is about the same speed at 3 limbs, so the threshold +dnl is set to 4 to get the smaller div code used at 3. + +deflit(MUL_THRESHOLD, 4) + + +defframe(PARAM_INVERSE,16) dnl mpn_preinv_mod_1 +defframe(PARAM_CARRY, 16) dnl mpn_mod_1c +defframe(PARAM_DIVISOR,12) +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) + +defframe(VAR_NORM, -20) +defframe(VAR_INVERSE, -24) +defframe(VAR_SRC_STOP,-28) + +deflit(STACK_SPACE, 28) + + TEXT + + ALIGN(32) +PROLOGUE(mpn_preinv_mod_1) +deflit(`FRAME',0) + movl PARAM_SRC, %ecx + movl PARAM_SIZE, %eax + subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %edi, SAVE_EDI + movl PARAM_INVERSE, %edx + + movl %esi, SAVE_ESI + movl -4(%ecx,%eax,4), %edi C src high limb + leal -16(%ecx,%eax,4), %ecx C &src[size-4] + + movl %ebx, SAVE_EBX + movl PARAM_INVERSE, %edx + + movl $0, VAR_NORM C l==0 + + movl %edi, %esi + subl %ebp, %edi C high-divisor + + cmovc( %esi, %edi) C restore if underflow + decl %eax + jz L(done_edi) C size==1, high-divisor only + + movl 8(%ecx), %esi C src second high limb + movl %edx, VAR_INVERSE + + movl $32, %ebx C 32-l + decl %eax + jz L(inverse_one_left) C size==2, one divide 
+ + movd %ebx, %mm7 C 32-l + decl %eax + jz L(inverse_two_left) C size==3, two divides + + jmp L(inverse_top) C size>=4 + + +L(done_edi): + movl SAVE_ESI, %esi + movl SAVE_EBP, %ebp + movl %edi, %eax + + movl SAVE_EDI, %edi + addl $STACK_SPACE, %esp + + ret + +EPILOGUE() + + + ALIGN(32) +PROLOGUE(mpn_mod_1c) +deflit(`FRAME',0) + movl PARAM_CARRY, %edx + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + jmp L(start_1c) + +EPILOGUE() + + + ALIGN(32) +PROLOGUE(mpn_mod_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl $0, %edx C initial carry (if can't skip a div) + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + orl %ecx, %ecx + jz L(divide_done) + + movl -4(%esi,%ecx,4), %eax C src high limb + + cmpl %ebp, %eax C carry flag if high<divisor + + cmovc( %eax, %edx) C src high limb as initial carry + sbbl $0, %ecx C size-1 to skip one div + jz L(divide_done) + + + ALIGN(16) +L(start_1c): + C eax + C ebx + C ecx size + C edx carry + C esi src + C edi + C ebp divisor + + cmpl $MUL_THRESHOLD, %ecx + jae L(mul_by_inverse) + + + +C With a MUL_THRESHOLD of 4, this "loop" only ever does 1 to 3 iterations, +C but it's already fast and compact, and there's nothing to gain by +C expanding it out. +C +C Using PARAM_DIVISOR in the divl is a couple of cycles faster than %ebp. 
+ + orl %ecx, %ecx + jz L(divide_done) + + +L(divide_top): + C eax scratch (quotient) + C ebx + C ecx counter, limbs, decrementing + C edx scratch (remainder) + C esi src + C edi + C ebp + + movl -4(%esi,%ecx,4), %eax + + divl PARAM_DIVISOR + + decl %ecx + jnz L(divide_top) + + +L(divide_done): + movl SAVE_ESI, %esi + movl SAVE_EBP, %ebp + addl $STACK_SPACE, %esp + + movl %edx, %eax + + ret + + + +C ----------------------------------------------------------------------------- + +L(mul_by_inverse): + C eax + C ebx + C ecx size + C edx carry + C esi src + C edi + C ebp divisor + + bsrl %ebp, %eax C 31-l + + movl %ebx, SAVE_EBX + movl %ecx, %ebx C size + + movl %edi, SAVE_EDI + movl $31, %ecx + + movl %edx, %edi C carry + movl $-1, %edx + + C + + xorl %eax, %ecx C l + incl %eax C 32-l + + shll %cl, %ebp C d normalized + movl %ecx, VAR_NORM + + movd %eax, %mm7 C 32-l + + movl $-1, %eax + subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1 + + divl %ebp C floor (b*(b-d)-1) / d + + C + + movl %eax, VAR_INVERSE + leal -12(%esi,%ebx,4), %eax C &src[size-3] + + movl 8(%eax), %esi C src high limb + movl 4(%eax), %edx C src second highest limb + + shldl( %cl, %esi, %edi) C n2 = carry,high << l + + shldl( %cl, %edx, %esi) C n10 = high,second << l + + movl %eax, %ecx C &src[size-3] + + +ifelse(MUL_THRESHOLD,2,` + cmpl $2, %ebx + je L(inverse_two_left) +') + + +C The dependent chain here is the same as in mpn_divrem_1, but a few +C instructions are saved by not needing to store the quotient limbs. +C Unfortunately this doesn't get the code down to the theoretical 16 c/l. +C +C There's four dummy instructions in the loop, all of which are necessary +C for the claimed 17 c/l. It's a 1 to 3 cycle slowdown if any are removed, +C or changed from load to store or vice versa. They're not completely +C random, since they correspond to what mpn_divrem_1 has, but there's no +C obvious reason why they're necessary. 
Presumably they induce something +C good in the out of order execution, perhaps through some load/store +C ordering and/or decoding effects. +C +C The q1==0xFFFFFFFF case is handled here the same as in mpn_divrem_1. On +C on special data that comes out as q1==0xFFFFFFFF always, the loop runs at +C about 13.5 c/l. + + ALIGN(32) +L(inverse_top): + C eax scratch + C ebx scratch (nadj, q1) + C ecx src pointer, decrementing + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 scratch (src qword) + C mm7 rshift for normalization + + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + movl PARAM_SIZE, %ebx C dummy + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movq (%ecx), %mm0 C next src limb and the one below it + subl $4, %ecx + + movl %ecx, PARAM_SIZE C dummy + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2+1 + movl %ebp, %eax C d + + C + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + jz L(q1_ff) + nop C dummy + + mull %ebx C (q1+1)*d + + psrlq %mm7, %mm0 + leal (%ecx), %ecx C dummy + + C + + C + + subl %eax, %esi C low n - (q1+1)*d + movl PARAM_SRC, %eax + + C + + sbbl %edx, %edi C high n - (q1+1)*d, 0 or -1 + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + movd %mm0, %esi + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + cmpl %eax, %ecx + jae L(inverse_top) + + +L(inverse_loop_done): + + +C ----------------------------------------------------------------------------- + +L(inverse_two_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx &src[-1] + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 scratch (src dword) + C mm7 rshift + + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull 
VAR_INVERSE C m*(n2+n1) + + movd 4(%ecx), %mm0 C src low limb + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2+1 + movl %ebp, %eax C d + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + + mull %ebx C (q1+1)*d + + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + C + + subl %eax, %esi + + C + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + movd %mm0, %esi + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + + +L(inverse_one_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 src limb, shifted + C mm7 rshift + + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movl VAR_NORM, %ecx C for final denorm + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2+1 + movl %ebp, %eax C d + + C + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + + mull %ebx C (q1+1)*d + + movl SAVE_EBX, %ebx + + C + + C + + subl %eax, %esi + + movl %esi, %eax C remainder + movl SAVE_ESI, %esi + + sbbl %edx, %edi C n - (q1+1)*d + leal (%ebp,%eax), %edx + movl SAVE_EBP, %ebp + + cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1 + movl SAVE_EDI, %edi + + shrl %cl, %eax C denorm remainder + addl $STACK_SPACE, %esp + emms + + ret + + +C ----------------------------------------------------------------------------- +C +C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword +C of q*d is simply -d and the remainder n-q*d = n10+d + +L(q1_ff): + C eax (divisor) + C ebx (q1+1 == 0) + C ecx src pointer + C edx + C esi n10 + C edi (n2) + C ebp divisor + + movl PARAM_SRC, %edx + leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 + 
psrlq %mm7, %mm0 + + movd %mm0, %esi C next n10 + + cmpl %edx, %ecx + jae L(inverse_top) + jmp L(inverse_loop_done) + +EPILOGUE() diff --git a/gmp/mpn/x86/k7/mmx/popham.asm b/gmp/mpn/x86/k7/mmx/popham.asm index 95965b74d4..5dc0a78c42 100644 --- a/gmp/mpn/x86/k7/mmx/popham.asm +++ b/gmp/mpn/x86/k7/mmx/popham.asm @@ -1,40 +1,29 @@ dnl AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming dnl distance. -dnl Copyright 2000-2002 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: +dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc. dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. 
dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C popcount hamdist C P3 generic 6.5 7 -C P3 model 9 (Banias) 5.7 6.1 +C P3 model 9 (Banias) ? ? C P3 model 13 (Dothan) 5.75 6 C K7 5 6 diff --git a/gmp/mpn/x86/k7/mmx/rshift.asm b/gmp/mpn/x86/k7/mmx/rshift.asm index 345d23a25e..3566ce85d7 100644 --- a/gmp/mpn/x86/k7/mmx/rshift.asm +++ b/gmp/mpn/x86/k7/mmx/rshift.asm @@ -1,32 +1,21 @@ dnl AMD K7 mpn_rshift -- mpn right shift. -dnl Copyright 1999-2002 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: +dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc. dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') diff --git a/gmp/mpn/x86/k7/mod_1_1.asm b/gmp/mpn/x86/k7/mod_1_1.asm deleted file mode 100644 index 1bbe6f92d7..0000000000 --- a/gmp/mpn/x86/k7/mod_1_1.asm +++ /dev/null @@ -1,221 +0,0 @@ -dnl x86-32 mpn_mod_1_1p, requiring cmov. - -dnl Contributed to the GNU project by Niels Möller and Torbjorn Granlund. - -dnl Copyright 2010, 2011 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. 
-dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb -C P5 ? -C P6 model 0-8,10-12 ? -C P6 model 9 (Banias) ? -C P6 model 13 (Dothan) ? -C P4 model 0 (Willamette) ? -C P4 model 1 (?) ? -C P4 model 2 (Northwood) ? -C P4 model 3 (Prescott) ? -C P4 model 4 (Nocona) ? -C AMD K6 ? -C AMD K7 7 -C AMD K8 ? - -define(`B2mb', `%ebx') -define(`r0', `%esi') -define(`r2', `%ebp') -define(`t0', `%edi') -define(`ap', `%ecx') C Also shift count - -C Stack frame -C pre 36(%esp) -C b 32(%esp) -C n 28(%esp) -C ap 24(%esp) -C return 20(%esp) -C %ebp 16(%esp) -C %edi 12(%esp) -C %esi 8(%esp) -C %ebx 4(%esp) -C B2mod (%esp) - -define(`B2modb', `(%esp)') -define(`n', `28(%esp)') -define(`b', `32(%esp)') -define(`pre', `36(%esp)') - -C mp_limb_t -C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t pre[4]) -C -C The pre array contains bi, cnt, B1modb, B2modb -C Note: This implementation needs B1modb only when cnt > 0 - -ASM_START() - TEXT - ALIGN(8) -PROLOGUE(mpn_mod_1_1p) - push %ebp - push %edi - push %esi - push %ebx - mov 32(%esp), %ebp C pre[] - - mov 12(%ebp), %eax C B2modb - push %eax C Put it on stack - - mov n, %edx - mov 24(%esp), ap - - lea (ap, %edx, 4), ap - mov -4(ap), %eax - cmp $3, %edx - jnc L(first) - mov -8(ap), r0 - jmp L(reduce_two) - -L(first): - C First iteration, no r2 - mull B2modb - mov -12(ap), r0 - add %eax, r0 - mov -8(ap), %eax - adc %edx, %eax - sbb r2, r2 - subl $3, n - lea -16(ap), ap - jz L(reduce_three) - - mov B2modb, B2mb - sub b, B2mb - lea (B2mb, r0), t0 - jmp L(mid) - - ALIGN(16) -L(top): C Loopmixed to 7 c/l on k7 - add %eax, r0 - lea (B2mb, r0), t0 - mov r2, %eax - adc %edx, %eax - sbb r2, r2 -L(mid): mull B2modb - and B2modb, r2 - add r0, r2 - decl n - mov (ap), r0 - cmovc( t0, r2) - lea -4(ap), ap - jnz L(top) - - add %eax, r0 - mov r2, %eax 
- adc %edx, %eax - sbb r2, r2 - -L(reduce_three): - C Eliminate r2 - and b, r2 - sub r2, %eax - -L(reduce_two): - mov pre, %ebp - movb 4(%ebp), %cl - test %cl, %cl - jz L(normalized) - - C Unnormalized, use B1modb to reduce to size < B b - mull 8(%ebp) - xor t0, t0 - add %eax, r0 - adc %edx, t0 - mov t0, %eax - - C Left-shift to normalize - shld %cl, r0, %eax C Always use shld? - - shl %cl, r0 - jmp L(udiv) - -L(normalized): - mov %eax, t0 - sub b, t0 - cmovnc( t0, %eax) - -L(udiv): - lea 1(%eax), t0 - mull (%ebp) - mov b, %ebx C Needed in register for lea - add r0, %eax - adc t0, %edx - imul %ebx, %edx - sub %edx, r0 - cmp r0, %eax - lea (%ebx, r0), %eax - cmovnc( r0, %eax) - cmp %ebx, %eax - jnc L(fix) -L(ok): shr %cl, %eax - - add $4, %esp - pop %ebx - pop %esi - pop %edi - pop %ebp - - ret -L(fix): sub %ebx, %eax - jmp L(ok) -EPILOGUE() - -PROLOGUE(mpn_mod_1_1p_cps) - push %ebp - mov 12(%esp), %ebp - push %esi - bsr %ebp, %ecx - push %ebx - xor $31, %ecx - mov 16(%esp), %esi - sal %cl, %ebp - mov %ebp, %edx - not %edx - mov $-1, %eax - div %ebp C On K7, invert_limb would be a few cycles faster. - mov %eax, (%esi) C store bi - mov %ecx, 4(%esi) C store cnt - neg %ebp - mov $1, %edx - shld %cl, %eax, %edx - imul %ebp, %edx - shr %cl, %edx - imul %ebp, %eax - mov %edx, 8(%esi) C store B1modb - mov %eax, 12(%esi) C store B2modb - pop %ebx - pop %esi - pop %ebp - ret -EPILOGUE() diff --git a/gmp/mpn/x86/k7/mod_1_4.asm b/gmp/mpn/x86/k7/mod_1_4.asm deleted file mode 100644 index bb7597edd2..0000000000 --- a/gmp/mpn/x86/k7/mod_1_4.asm +++ /dev/null @@ -1,260 +0,0 @@ -dnl x86-32 mpn_mod_1s_4p, requiring cmov. - -dnl Contributed to the GNU project by Torbjorn Granlund. - -dnl Copyright 2009, 2010 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. 
-dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb -C P5 ? -C P6 model 0-8,10-12 ? -C P6 model 9 (Banias) ? -C P6 model 13 (Dothan) 6 -C P4 model 0 (Willamette) ? -C P4 model 1 (?) ? -C P4 model 2 (Northwood) 15.5 -C P4 model 3 (Prescott) ? -C P4 model 4 (Nocona) ? -C AMD K6 ? -C AMD K7 4.75 -C AMD K8 ? 
- -ASM_START() - TEXT - ALIGN(16) -PROLOGUE(mpn_mod_1s_4p) - push %ebp - push %edi - push %esi - push %ebx - sub $28, %esp - mov 60(%esp), %edi C cps[] - mov 8(%edi), %eax - mov 12(%edi), %edx - mov 16(%edi), %ecx - mov 20(%edi), %esi - mov 24(%edi), %edi - mov %eax, 4(%esp) - mov %edx, 8(%esp) - mov %ecx, 12(%esp) - mov %esi, 16(%esp) - mov %edi, 20(%esp) - mov 52(%esp), %eax C n - xor %edi, %edi - mov 48(%esp), %esi C up - lea -12(%esi,%eax,4), %esi - and $3, %eax - je L(b0) - cmp $2, %eax - jc L(b1) - je L(b2) - -L(b3): mov 4(%esi), %eax - mull 4(%esp) - mov (%esi), %ebp - add %eax, %ebp - adc %edx, %edi - mov 8(%esi), %eax - mull 8(%esp) - lea -12(%esi), %esi - jmp L(m0) - -L(b0): mov (%esi), %eax - mull 4(%esp) - mov -4(%esi), %ebp - add %eax, %ebp - adc %edx, %edi - mov 4(%esi), %eax - mull 8(%esp) - add %eax, %ebp - adc %edx, %edi - mov 8(%esi), %eax - mull 12(%esp) - lea -16(%esi), %esi - jmp L(m0) - -L(b1): mov 8(%esi), %ebp - lea -4(%esi), %esi - jmp L(m1) - -L(b2): mov 8(%esi), %edi - mov 4(%esi), %ebp - lea -8(%esi), %esi - jmp L(m1) - - ALIGN(16) -L(top): mov (%esi), %eax - mull 4(%esp) - mov -4(%esi), %ebx - xor %ecx, %ecx - add %eax, %ebx - adc %edx, %ecx - mov 4(%esi), %eax - mull 8(%esp) - add %eax, %ebx - adc %edx, %ecx - mov 8(%esi), %eax - mull 12(%esp) - add %eax, %ebx - adc %edx, %ecx - lea -16(%esi), %esi - mov 16(%esp), %eax - mul %ebp - add %eax, %ebx - adc %edx, %ecx - mov 20(%esp), %eax - mul %edi - mov %ebx, %ebp - mov %ecx, %edi -L(m0): add %eax, %ebp - adc %edx, %edi -L(m1): subl $4, 52(%esp) - ja L(top) - -L(end): mov 4(%esp), %eax - mul %edi - mov 60(%esp), %edi - add %eax, %ebp - adc $0, %edx - mov 4(%edi), %ecx - mov %edx, %esi - mov %ebp, %eax - sal %cl, %esi - mov %ecx, %ebx - neg %ecx - shr %cl, %eax - or %esi, %eax - lea 1(%eax), %esi - mull (%edi) - mov %ebx, %ecx - mov %eax, %ebx - mov %ebp, %eax - mov 56(%esp), %ebp - sal %cl, %eax - add %eax, %ebx - adc %esi, %edx - imul %ebp, %edx - sub %edx, %eax - lea (%eax,%ebp), %edx - 
cmp %eax, %ebx - cmovc( %edx, %eax) - mov %eax, %edx - sub %ebp, %eax - cmovc( %edx, %eax) - add $28, %esp - pop %ebx - pop %esi - pop %edi - pop %ebp - shr %cl, %eax - ret -EPILOGUE() - - ALIGN(16) -PROLOGUE(mpn_mod_1s_4p_cps) -C CAUTION: This is the same code as in pentium4/sse2/mod_1_4.asm - push %ebp - push %edi - push %esi - push %ebx - mov 20(%esp), %ebp C FIXME: avoid bp for 0-idx - mov 24(%esp), %ebx - bsr %ebx, %ecx - xor $31, %ecx - sal %cl, %ebx C b << cnt - mov %ebx, %edx - not %edx - mov $-1, %eax - div %ebx - xor %edi, %edi - sub %ebx, %edi - mov $1, %esi - mov %eax, (%ebp) C store bi - mov %ecx, 4(%ebp) C store cnt - shld %cl, %eax, %esi - imul %edi, %esi - mov %eax, %edi - mul %esi - - add %esi, %edx - shr %cl, %esi - mov %esi, 8(%ebp) C store B1modb - - not %edx - imul %ebx, %edx - lea (%edx,%ebx), %esi - cmp %edx, %eax - cmovnc( %edx, %esi) - mov %edi, %eax - mul %esi - - add %esi, %edx - shr %cl, %esi - mov %esi, 12(%ebp) C store B2modb - - not %edx - imul %ebx, %edx - lea (%edx,%ebx), %esi - cmp %edx, %eax - cmovnc( %edx, %esi) - mov %edi, %eax - mul %esi - - add %esi, %edx - shr %cl, %esi - mov %esi, 16(%ebp) C store B3modb - - not %edx - imul %ebx, %edx - lea (%edx,%ebx), %esi - cmp %edx, %eax - cmovnc( %edx, %esi) - mov %edi, %eax - mul %esi - - add %esi, %edx - shr %cl, %esi - mov %esi, 20(%ebp) C store B4modb - - not %edx - imul %ebx, %edx - add %edx, %ebx - cmp %edx, %eax - cmovnc( %edx, %ebx) - - shr %cl, %ebx - mov %ebx, 24(%ebp) C store B5modb - - pop %ebx - pop %esi - pop %edi - pop %ebp - ret -EPILOGUE() diff --git a/gmp/mpn/x86/k7/mod_34lsub1.asm b/gmp/mpn/x86/k7/mod_34lsub1.asm index ee3ad04099..f00e84dc42 100644 --- a/gmp/mpn/x86/k7/mod_34lsub1.asm +++ b/gmp/mpn/x86/k7/mod_34lsub1.asm @@ -1,32 +1,22 @@ dnl AMD K7 mpn_mod_34lsub1 -- remainder modulo 2^24-1. -dnl Copyright 2000-2002, 2004, 2005, 2008 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. 
-dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: +dnl Copyright 2000, 2001, 2002, 2004, 2005, 2008 Free Software Foundation, +dnl Inc. dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
include(`../config.m4') diff --git a/gmp/mpn/x86/k7/mode1o.asm b/gmp/mpn/x86/k7/mode1o.asm index 6472ec5949..ef858049a6 100644 --- a/gmp/mpn/x86/k7/mode1o.asm +++ b/gmp/mpn/x86/k7/mode1o.asm @@ -1,32 +1,21 @@ dnl AMD K7 mpn_modexact_1_odd -- exact division style remainder. -dnl Copyright 2000-2002, 2004, 2007 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: +dnl Copyright 2000, 2001, 2002, 2004, 2007 Free Software Foundation, Inc. dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') @@ -122,7 +111,7 @@ ifdef(`PIC',` subl %eax, %edi C inv = 2*inv - inv*inv*d - ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS + ASSERT(e,` C d*inv == 1 mod 2^BITS_PER_MP_LIMB movl %esi, %eax imull %edi, %eax cmpl $1, %eax') diff --git a/gmp/mpn/x86/k7/mul_1.asm b/gmp/mpn/x86/k7/mul_1.asm index 755cd2ed50..016262d594 100644 --- a/gmp/mpn/x86/k7/mul_1.asm +++ b/gmp/mpn/x86/k7/mul_1.asm @@ -1,38 +1,28 @@ dnl AMD K7 mpn_mul_1. -dnl Copyright 1999-2002, 2005, 2008 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: +dnl Copyright 1999, 2000, 2001, 2002, 2005, 2008 Free Software Foundation, +dnl Inc. dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. 
+dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C P5 +C cycles/limb +C P5: C P6 model 0-8,10-12) C P6 model 9 (Banias) C P6 model 13 (Dothan) @@ -41,9 +31,9 @@ C P4 model 1 (?) C P4 model 2 (Northwood) C P4 model 3 (Prescott) C P4 model 4 (Nocona) -C AMD K6 -C AMD K7 3.25 -C AMD K8 +C K6: +C K7: 3.25 +C K8: C TODO C * Improve feed-in and wind-down code. We beat the old code for all n != 1, diff --git a/gmp/mpn/x86/k7/mul_basecase.asm b/gmp/mpn/x86/k7/mul_basecase.asm index 4dfb500885..7f4c0002f7 100644 --- a/gmp/mpn/x86/k7/mul_basecase.asm +++ b/gmp/mpn/x86/k7/mul_basecase.asm @@ -1,32 +1,21 @@ dnl AMD K7 mpn_mul_basecase -- multiply two mpn numbers. -dnl Copyright 1999-2002 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: +dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc. dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. 
dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') diff --git a/gmp/mpn/x86/k7/sqr_basecase.asm b/gmp/mpn/x86/k7/sqr_basecase.asm index 7b6a97e0df..408a13dc9b 100644 --- a/gmp/mpn/x86/k7/sqr_basecase.asm +++ b/gmp/mpn/x86/k7/sqr_basecase.asm @@ -1,32 +1,21 @@ dnl AMD K7 mpn_sqr_basecase -- square an mpn number. -dnl Copyright 1999-2002 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: +dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc. dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. 
-dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') @@ -39,18 +28,18 @@ C roughly the Karatsuba recursing range). dnl These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for dnl some comments. 
-deflit(SQR_TOOM2_THRESHOLD_MAX, 66) +deflit(SQR_KARATSUBA_THRESHOLD_MAX, 66) -ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE', -`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)') +ifdef(`SQR_KARATSUBA_THRESHOLD_OVERRIDE', +`define(`SQR_KARATSUBA_THRESHOLD',SQR_KARATSUBA_THRESHOLD_OVERRIDE)') -m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD') -deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3)) +m4_config_gmp_mparam(`SQR_KARATSUBA_THRESHOLD') +deflit(UNROLL_COUNT, eval(SQR_KARATSUBA_THRESHOLD-3)) C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); C -C With a SQR_TOOM2_THRESHOLD around 50 this code is about 1500 bytes, +C With a SQR_KARATSUBA_THRESHOLD around 50 this code is about 1500 bytes, C which is quite a bit, but is considered good value since squares big C enough to use most of the code will be spending quite a few cycles in it. diff --git a/gmp/mpn/x86/k7/sublsh1_n.asm b/gmp/mpn/x86/k7/sublsh1_n.asm deleted file mode 100644 index 523b01218d..0000000000 --- a/gmp/mpn/x86/k7/sublsh1_n.asm +++ /dev/null @@ -1,173 +0,0 @@ -dnl AMD K7 mpn_sublsh1_n_ip1 -- rp[] = rp[] - (up[] << 1) - -dnl Copyright 2011 Free Software Foundation, Inc. - -dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. 
-dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C This is an attempt at a sublsh1_n for x86-32, not relying on sse2 insns. The -C innerloop is 2*3-way unrolled, which is best we can do with the available -C registers. It seems tricky to use the same structure for rsblsh1_n, since we -C cannot feed carry between operations there. - -C cycles/limb -C P5 -C P6 model 0-8,10-12 -C P6 model 9 (Banias) -C P6 model 13 (Dothan) -C P4 model 0 (Willamette) -C P4 model 1 (?) -C P4 model 2 (Northwood) -C P4 model 3 (Prescott) -C P4 model 4 (Nocona) -C Intel Atom 6.75 -C AMD K6 -C AMD K7 -C AMD K8 - -C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32 -C processors. It uses 2*4-way unrolling, for good reasons. -C -C Breaking carry recurrency might be a good idea. We would then need separate -C registers for the shift carry and add/subtract carry, which in turn would -C force is to 2*2-way unrolling. 
- -defframe(PARAM_SIZE, 12) -defframe(PARAM_SRC, 8) -defframe(PARAM_DST, 4) - -dnl re-use parameter space -define(VAR_COUNT,`PARAM_SIZE') -define(SAVE_EBX,`PARAM_SRC') -define(SAVE_EBP,`PARAM_DST') - -ASM_START() - TEXT - ALIGN(8) -PROLOGUE(mpn_sublsh1_n_ip1) -deflit(`FRAME',0) - -define(`rp', `%edi') -define(`up', `%esi') - - mov PARAM_SIZE, %eax C size - push up FRAME_pushl() - push rp FRAME_pushl() - xor %edx, %edx - mov PARAM_SRC, up - mov PARAM_DST, rp - mov %ebx, SAVE_EBX - mov %eax, %ebx - shr $3, %eax - - not %eax C count = -(size\8)-i - and $7, %ebx C size % 8 - jz L(exact) - -L(oop): -ifdef(`CPU_P6',` - shr %edx ') C restore 2nd saved carry bit - mov (up), %ecx - adc %ecx, %ecx - rcr %edx C restore 1st saved carry bit - lea 4(up), up - sbb %ecx, (rp) - lea 4(rp), rp - adc %edx, %edx C save a carry bit in edx -ifdef(`CPU_P6',` - adc %edx, %edx ') C save another carry bit in edx - dec %ebx - jnz L(oop) -L(exact): - inc %eax - jz L(end) - mov %eax, VAR_COUNT - mov %ebp, SAVE_EBP - - ALIGN(16) -L(top): -ifdef(`CPU_P6',` - shr %edx ') C restore 2nd saved carry bit - mov (up), %eax - adc %eax, %eax - mov 4(up), %ebx - adc %ebx, %ebx - mov 8(up), %ecx - adc %ecx, %ecx - mov 12(up), %ebp - adc %ebp, %ebp - - rcr %edx C restore 1st saved carry bit - - sbb %eax, (rp) - sbb %ebx, 4(rp) - sbb %ecx, 8(rp) - sbb %ebp, 12(rp) - - mov 16(up), %eax - adc %eax, %eax - mov 20(up), %ebx - adc %ebx, %ebx - mov 24(up), %ecx - adc %ecx, %ecx - mov 28(up), %ebp - adc %ebp, %ebp - - lea 32(up), up - adc %edx, %edx C save a carry bit in edx - - sbb %eax, 16(rp) - sbb %ebx, 20(rp) - sbb %ecx, 24(rp) - sbb %ebp, 28(rp) - -ifdef(`CPU_P6',` - adc %edx, %edx ') C save another carry bit in edx - incl VAR_COUNT - lea 32(rp), rp - jne L(top) - - mov SAVE_EBP, %ebp -L(end): - mov SAVE_EBX, %ebx - -ifdef(`CPU_P6',` - xor %eax, %eax - shr $1, %edx - adc %edx, %eax -',` - adc $0, %edx - mov %edx, %eax -') - pop rp FRAME_popl() - pop up FRAME_popl() - ret -EPILOGUE() -ASM_END() |