diff options
Diffstat (limited to 'gmp/mpn/x86_64/core2')
24 files changed, 330 insertions, 4679 deletions
diff --git a/gmp/mpn/x86_64/core2/aorrlsh1_n.asm b/gmp/mpn/x86_64/core2/aorrlsh1_n.asm deleted file mode 100644 index 7066bb4372..0000000000 --- a/gmp/mpn/x86_64/core2/aorrlsh1_n.asm +++ /dev/null @@ -1,53 +0,0 @@ -dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1) -dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[] - -dnl Contributed to the GNU project by Torbjorn Granlund. - -dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -define(LSH, 1) -define(RSH, 63) - -ifdef(`OPERATION_addlsh1_n', ` - define(ADDSUB, add) - define(ADCSBB, adc) - define(func, mpn_addlsh1_n)') -ifdef(`OPERATION_rsblsh1_n', ` - define(ADDSUB, sub) - define(ADCSBB, sbb) - define(func, mpn_rsblsh1_n)') - -MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n) - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -include_mpn(`x86_64/aorrlshC_n.asm') diff --git a/gmp/mpn/x86_64/core2/aorrlsh2_n.asm b/gmp/mpn/x86_64/core2/aorrlsh2_n.asm deleted file mode 100644 index 5065120857..0000000000 --- a/gmp/mpn/x86_64/core2/aorrlsh2_n.asm +++ /dev/null @@ -1,53 +0,0 @@ -dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2) -dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[] - -dnl Contributed to the GNU project by Torbjorn Granlund. - -dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -define(LSH, 2) -define(RSH, 62) - -ifdef(`OPERATION_addlsh2_n', ` - define(ADDSUB, add) - define(ADCSBB, adc) - define(func, mpn_addlsh2_n)') -ifdef(`OPERATION_rsblsh2_n', ` - define(ADDSUB, sub) - define(ADCSBB, sbb) - define(func, mpn_rsblsh2_n)') - -MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n) - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -include_mpn(`x86_64/aorrlshC_n.asm') diff --git a/gmp/mpn/x86_64/core2/aorrlsh_n.asm b/gmp/mpn/x86_64/core2/aorrlsh_n.asm deleted file mode 100644 index 57abf31579..0000000000 --- a/gmp/mpn/x86_64/core2/aorrlsh_n.asm +++ /dev/null @@ -1,38 +0,0 @@ -dnl AMD64 mpn_addlsh_n and mpn_rsblsh_n. R = V2^k +- U. - -dnl Copyright 2011, 2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n) - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -include_mpn(`x86_64/coreinhm/aorrlsh_n.asm') diff --git a/gmp/mpn/x86_64/core2/aors_err1_n.asm b/gmp/mpn/x86_64/core2/aors_err1_n.asm deleted file mode 100644 index 3f875aefa4..0000000000 --- a/gmp/mpn/x86_64/core2/aors_err1_n.asm +++ /dev/null @@ -1,225 +0,0 @@ -dnl Core 2 mpn_add_err1_n, mpn_sub_err1_n - -dnl Contributed by David Harvey. - -dnl Copyright 2011 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb -C AMD K8,K9 ? -C AMD K10 ? -C Intel P4 ? -C Intel core2 4.14 -C Intel corei ? -C Intel atom ? -C VIA nano ? - - -C INPUT PARAMETERS -define(`rp', `%rdi') -define(`up', `%rsi') -define(`vp', `%rdx') -define(`ep', `%rcx') -define(`yp', `%r8') -define(`n', `%r9') -define(`cy_param', `8(%rsp)') - -define(`el', `%rbx') -define(`eh', `%rbp') -define(`t0', `%r10') -define(`t1', `%r11') -define(`t2', `%r12') -define(`t3', `%r13') -define(`w0', `%r14') -define(`w1', `%r15') - -ifdef(`OPERATION_add_err1_n', ` - define(ADCSBB, adc) - define(func, mpn_add_err1_n)') -ifdef(`OPERATION_sub_err1_n', ` - define(ADCSBB, sbb) - define(func, mpn_sub_err1_n)') - -MULFUNC_PROLOGUE(mpn_add_err1_n mpn_sub_err1_n) - - -ASM_START() - TEXT - ALIGN(16) -PROLOGUE(func) - mov cy_param, %rax - - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - - lea (up,n,8), up - lea (vp,n,8), vp - lea (rp,n,8), rp - - mov R32(n), R32(%r10) - and $3, R32(%r10) - jz L(0mod4) - cmp $2, R32(%r10) - jc L(1mod4) - jz L(2mod4) -L(3mod4): - xor R32(el), R32(el) - xor R32(eh), R32(eh) - xor R32(t0), R32(t0) - xor R32(t1), R32(t1) - lea -24(yp,n,8), yp - neg n - - shr $1, %al C restore carry - mov (up,n,8), w0 - mov 8(up,n,8), w1 - ADCSBB (vp,n,8), w0 - mov w0, (rp,n,8) - cmovc 16(yp), el - ADCSBB 8(vp,n,8), w1 - mov w1, 8(rp,n,8) - cmovc 8(yp), t0 - mov 16(up,n,8), w0 - ADCSBB 16(vp,n,8), w0 - mov w0, 16(rp,n,8) - cmovc (yp), t1 - setc %al C save carry - add t0, el - adc $0, eh - add t1, el - adc $0, eh - - add $3, n - jnz L(loop) - jmp L(end) - - ALIGN(16) -L(0mod4): - xor R32(el), R32(el) - xor R32(eh), R32(eh) - lea (yp,n,8), yp - neg n - jmp L(loop) - - ALIGN(16) -L(1mod4): - xor R32(el), R32(el) - xor R32(eh), R32(eh) - lea -8(yp,n,8), yp - neg n - - shr $1, %al C restore carry - mov (up,n,8), w0 - ADCSBB (vp,n,8), w0 - mov w0, (rp,n,8) - cmovc (yp), el - setc %al C save carry - - add $1, n - jnz L(loop) - jmp L(end) - - ALIGN(16) -L(2mod4): - xor R32(el), R32(el) - xor R32(eh), R32(eh) - xor R32(t0), R32(t0) - lea -16(yp,n,8), yp - neg n - - shr $1, %al C restore carry - mov (up,n,8), w0 - mov 8(up,n,8), w1 - ADCSBB (vp,n,8), w0 - mov w0, (rp,n,8) - cmovc 8(yp), el - ADCSBB 8(vp,n,8), w1 - mov w1, 8(rp,n,8) - cmovc (yp), t0 - setc %al C save carry - add t0, el - adc $0, eh - - add $2, n - jnz L(loop) - jmp L(end) - - ALIGN(32) -L(loop): - mov (up,n,8), w0 - shr $1, %al C restore carry - mov -8(yp), t0 - mov $0, R32(t3) - ADCSBB (vp,n,8), w0 - cmovnc t3, t0 - mov w0, (rp,n,8) - mov 8(up,n,8), w1 - mov 16(up,n,8), w0 - ADCSBB 8(vp,n,8), w1 - mov -16(yp), t1 - cmovnc t3, t1 - mov -24(yp), t2 - mov w1, 8(rp,n,8) - ADCSBB 16(vp,n,8), w0 - cmovnc t3, t2 - mov 24(up,n,8), w1 - ADCSBB 24(vp,n,8), w1 - cmovc -32(yp), t3 - setc %al C save carry - add t0, el - adc $0, eh - add t1, el - adc $0, eh - add t2, el - adc $0, eh - lea -32(yp), yp - mov w0, 16(rp,n,8) - add t3, el - adc $0, eh - add $4, n - mov w1, -8(rp,n,8) - jnz L(loop) - -L(end): - mov el, (ep) - mov eh, 8(ep) - - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/core2/aors_n.asm b/gmp/mpn/x86_64/core2/aors_n.asm index 74a1bce48a..d26af866f9 100644 --- a/gmp/mpn/x86_64/core2/aors_n.asm +++ b/gmp/mpn/x86_64/core2/aors_n.asm @@ -1,45 +1,30 @@ -dnl Intel mpn_add_n/mpn_sub_n optimised for Conroe, Nehalem. +dnl Intel P6-15 mpn_add_n/mpn_sub_n -- mpn add or subtract. -dnl Copyright 2006, 2007, 2011-2013 Free Software Foundation, Inc. +dnl Copyright 2006, 2007 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. -dnl + dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + dnl The GNU MP Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb -C AMD K8,K9 2 -C AMD K10 2 -C Intel P4 10 -C Intel core2 2 -C Intel NHM 2 -C Intel SBR 2 -C Intel atom 9 -C VIA nano 3 +C K8,K9: 2.25 +C K10: 2 +C P4: 10 +C P6-15: 2.05 C INPUT PARAMETERS define(`rp', `%rdi') @@ -59,83 +44,80 @@ ifdef(`OPERATION_sub_n', ` MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - ASM_START() + TEXT ALIGN(16) + +PROLOGUE(func_nc) + jmp L(start) +EPILOGUE() + PROLOGUE(func) - FUNC_ENTRY(4) xor %r8, %r8 L(start): mov (up), %r10 mov (vp), %r11 - lea (up,n,8), up - lea (vp,n,8), vp - lea (rp,n,8), rp - mov R32(n), R32(%rax) + lea -8(up,n,8), up + lea -8(vp,n,8), vp + lea -16(rp,n,8), rp + mov %ecx, %eax neg n - and $3, R32(%rax) + and $3, %eax je L(b00) - add %rax, n C clear low rcx bits for jrcxz - cmp $2, R32(%rax) + add %rax, n C clear low rcx bits for jrcxz + cmp $2, %eax jl L(b01) je L(b10) -L(b11): neg %r8 C set cy +L(b11): shr %r8 C set cy jmp L(e11) -L(b00): neg %r8 C set cy +L(b00): shr %r8 C set cy mov %r10, %r8 mov %r11, %r9 lea 4(n), n jmp L(e00) - nop - nop - nop -L(b01): neg %r8 C set cy - jmp L(top) +L(b01): shr %r8 C set cy + jmp L(e01) -L(b10): neg %r8 C set cy +L(b10): shr %r8 C set cy mov %r10, %r8 mov %r11, %r9 jmp L(e10) L(end): ADCSBB %r11, %r10 - mov %r10, -8(rp) - mov R32(%rcx), R32(%rax) C clear eax, ecx contains 0 - adc R32(%rax), R32(%rax) - FUNC_EXIT() + mov %r10, 8(rp) + mov %ecx, %eax C clear eax, ecx contains 0 + adc %eax, %eax ret ALIGN(16) -L(top): jrcxz L(end) - mov (up,n,8), %r8 - mov (vp,n,8), %r9 - lea 4(n), n - ADCSBB %r11, %r10 - mov %r10, -40(rp,n,8) -L(e00): mov -24(up,n,8), %r10 - mov -24(vp,n,8), %r11 - ADCSBB %r9, %r8 - mov %r8, -32(rp,n,8) -L(e11): mov -16(up,n,8), %r8 - mov -16(vp,n,8), %r9 +L(top): + mov -24(up,n,8), %r8 + mov -24(vp,n,8), %r9 ADCSBB %r11, %r10 mov %r10, -24(rp,n,8) -L(e10): mov -8(up,n,8), %r10 - mov -8(vp,n,8), %r11 +L(e00): + mov -16(up,n,8), %r10 + mov -16(vp,n,8), %r11 ADCSBB %r9, %r8 mov %r8, -16(rp,n,8) +L(e11): + mov -8(up,n,8), %r8 + mov -8(vp,n,8), %r9 + ADCSBB %r11, %r10 + mov %r10, -8(rp,n,8) +L(e10): + mov (up,n,8), %r10 + mov (vp,n,8), %r11 + ADCSBB %r9, %r8 + mov %r8, (rp,n,8) +L(e01): + jrcxz L(end) + lea 4(n), n jmp L(top) -EPILOGUE() -PROLOGUE(func_nc) - FUNC_ENTRY(4) -IFDOS(` mov 56(%rsp), %r8 ') - jmp L(start) EPILOGUE() - diff --git a/gmp/mpn/x86_64/core2/sublshC_n.asm b/gmp/mpn/x86_64/core2/aorslsh1_n.asm index 5acc46b032..18db7c96f8 100644 --- a/gmp/mpn/x86_64/core2/sublshC_n.asm +++ b/gmp/mpn/x86_64/core2/aorslsh1_n.asm @@ -1,45 +1,29 @@ -dnl AMD64 mpn_sublshC_n -- rp[] = up[] - (vp[] << 1), optimised for Core 2 and -dnl Core iN. +dnl x86-64 mpn_addlsh1_n and mpn_sublsh1_n, optimized for "Core" 2. -dnl Contributed to the GNU project by Torbjorn Granlund. - -dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. +dnl Copyright 2008 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. -dnl + dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + dnl The GNU MP Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') C cycles/limb -C AMD K8,K9 4.25 -C AMD K10 ? -C Intel P4 ? -C Intel core2 3 -C Intel NHM 3.1 -C Intel SBR 2.47 -C Intel atom ? -C VIA nano ? +C K8,K9: 4.25 +C K10: ? +C P4: ? +C P6-15: 3 C INPUT PARAMETERS define(`rp',`%rdi') @@ -47,11 +31,21 @@ define(`up',`%rsi') define(`vp',`%rdx') define(`n', `%rcx') +ifdef(`OPERATION_addlsh1_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_addlsh1_n)') +ifdef(`OPERATION_sublsh1_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_sublsh1_n)') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n) + ASM_START() TEXT ALIGN(8) PROLOGUE(func) - FUNC_ENTRY(4) push %rbx push %r12 @@ -64,7 +58,7 @@ PROLOGUE(func) xor R32(%r11), R32(%r11) mov -24(vp,n,8), %r8 C do first limb early - shrd $RSH, %r8, %r11 + shrd $63, %r8, %r11 and $3, R32(%rax) je L(b0) @@ -73,9 +67,9 @@ PROLOGUE(func) je L(b2) L(b3): mov -16(vp,n,8), %r9 - shrd $RSH, %r9, %r8 + shrd $63, %r9, %r8 mov -8(vp,n,8), %r10 - shrd $RSH, %r10, %r9 + shrd $63, %r10, %r9 mov -24(up,n,8), %r12 ADDSUB %r11, %r12 mov %r12, -24(rp,n,8) @@ -101,7 +95,7 @@ L(b1): mov -24(up,n,8), %r12 jmp L(end) L(b2): mov -16(vp,n,8), %r9 - shrd $RSH, %r9, %r8 + shrd $63, %r9, %r8 mov -24(up,n,8), %r12 ADDSUB %r11, %r12 mov %r12, -24(rp,n,8) @@ -116,13 +110,13 @@ L(b2): mov -16(vp,n,8), %r9 ALIGN(16) L(top): mov -24(vp,n,8), %r8 - shrd $RSH, %r8, %r11 + shrd $63, %r8, %r11 L(b0): mov -16(vp,n,8), %r9 - shrd $RSH, %r9, %r8 + shrd $63, %r9, %r8 mov -8(vp,n,8), %r10 - shrd $RSH, %r10, %r9 + shrd $63, %r10, %r9 mov (vp,n,8), %rbx - shrd $RSH, %rbx, %r10 + shrd $63, %rbx, %r10 add R32(%rax), R32(%rax) C restore cy @@ -148,11 +142,10 @@ L(b0): mov -16(vp,n,8), %r9 add $4, n js L(top) -L(end): shr $RSH, %r11 +L(end): add %r11, %r11 pop %r12 pop %rbx - sub R32(%r11), R32(%rax) + sbb $0, R32(%rax) neg R32(%rax) - FUNC_EXIT() ret EPILOGUE() diff --git a/gmp/mpn/x86_64/core2/aorsmul_1.asm b/gmp/mpn/x86_64/core2/aorsmul_1.asm index 6b313dd836..1d05b30b59 100644 --- a/gmp/mpn/x86_64/core2/aorsmul_1.asm +++ b/gmp/mpn/x86_64/core2/aorsmul_1.asm @@ -1,46 +1,29 @@ dnl x86-64 mpn_addmul_1 and mpn_submul_1, optimized for "Core 2". -dnl Copyright 2003-2005, 2007-2009, 2011, 2012 Free Software Foundation, Inc. +dnl Copyright 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. -dnl + dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + dnl The GNU MP Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb -C AMD K8,K9 4 -C AMD K10 4 -C AMD bd1 5.1 -C AMD bobcat -C Intel P4 ? -C Intel core2 4.3-4.5 (fluctuating) -C Intel NHM 5.0 -C Intel SBR 4.1 -C Intel atom ? -C VIA nano 5.25 +C K8,K9: 4 +C K10: 4 +C P4: ? +C P6-15: 4.3-4.7 (fluctuating) C INPUT PARAMETERS define(`rp', `%rdi') @@ -50,129 +33,111 @@ define(`v0', `%rcx') ifdef(`OPERATION_addmul_1',` define(`ADDSUB', `add') - define(`func', `mpn_addmul_1') - define(`func_1c', `mpn_addmul_1c') + define(`func', `mpn_addmul_1') ') ifdef(`OPERATION_submul_1',` define(`ADDSUB', `sub') - define(`func', `mpn_submul_1') - define(`func_1c', `mpn_submul_1c') + define(`func', `mpn_submul_1') ') -MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - - C For DOS, on the stack we have four saved registers, return address, - C space for four register arguments, and finally the carry input. - -IFDOS(` define(`carry_in', `72(%rsp)')') dnl -IFSTD(` define(`carry_in', `%r8')') dnl +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) ASM_START() TEXT ALIGN(16) -PROLOGUE(func_1c) - FUNC_ENTRY(4) - push %rbx - push %rbp - lea (%rdx), %rbx - neg %rbx - - mov (up), %rax - mov (rp), %r10 - - lea -16(rp,%rdx,8), rp - lea (up,%rdx,8), up - mul %rcx - add carry_in, %rax - adc $0, %rdx - jmp L(start_nc) -EPILOGUE() - - ALIGN(16) PROLOGUE(func) - FUNC_ENTRY(4) - push %rbx - push %rbp - lea (%rdx), %rbx - neg %rbx + push %r15 + push %r12 + push %r13 + lea (%rdx), %r15 + neg %r15 mov (up), %rax - mov (rp), %r10 - lea -16(rp,%rdx,8), rp + bt $0, %r15 + jc L(odd) + + lea (rp,%rdx,8), rp lea (up,%rdx,8), up mul %rcx -L(start_nc): - bt $0, R32(%rbx) - jc L(odd) - lea (%rax), %r11 - mov 8(up,%rbx,8), %rax - lea (%rdx), %rbp - mul %rcx - add $2, %rbx + mov 8(up,%r15,8), %rax + mov (rp,%r15,8), %r13 + lea (%rdx), %r12 + + add $2, %r15 jns L(n2) + mul %rcx lea (%rax), %r8 - mov (up,%rbx,8), %rax + mov (up,%r15,8), %rax + mov -8(rp,%r15,8), %r10 lea (%rdx), %r9 - jmp L(mid) + jmp L(m) -L(odd): add $1, %rbx +L(odd): lea (rp,%rdx,8), rp + lea (up,%rdx,8), up + mul %rcx + add $1, %r15 jns L(n1) - lea (%rax), %r8 - mov (up,%rbx,8), %rax +L(gt1): lea (%rax), %r8 + mov (up,%r15,8), %rax + mov -8(rp,%r15,8), %r10 lea (%rdx), %r9 mul %rcx lea (%rax), %r11 - mov 8(up,%rbx,8), %rax - lea (%rdx), %rbp - jmp L(e) + mov 8(up,%r15,8), %rax + mov (rp,%r15,8), %r13 + lea (%rdx), %r12 + add $2, %r15 + jns L(end) ALIGN(16) L(top): mul %rcx ADDSUB %r8, %r10 lea (%rax), %r8 - mov (up,%rbx,8), %rax + mov 0(up,%r15,8), %rax adc %r9, %r11 - mov %r10, -8(rp,%rbx,8) - mov (rp,%rbx,8), %r10 + mov %r10, -24(rp,%r15,8) + mov -8(rp,%r15,8), %r10 lea (%rdx), %r9 - adc $0, %rbp -L(mid): mul %rcx - ADDSUB %r11, %r10 + adc $0, %r12 +L(m): mul %rcx + ADDSUB %r11, %r13 lea (%rax), %r11 - mov 8(up,%rbx,8), %rax - adc %rbp, %r8 - mov %r10, (rp,%rbx,8) - mov 8(rp,%rbx,8), %r10 - lea (%rdx), %rbp + mov 8(up,%r15,8), %rax + adc %r12, %r8 + mov %r13, -16(rp,%r15,8) + mov 0(rp,%r15,8), %r13 + lea (%rdx), %r12 adc $0, %r9 -L(e): add $2, %rbx + + add $2, %r15 js L(top) - mul %rcx +L(end): mul %rcx ADDSUB %r8, %r10 adc %r9, %r11 - mov %r10, -8(rp) - adc $0, %rbp -L(n2): mov (rp), %r10 - ADDSUB %r11, %r10 - adc %rbp, %rax - mov %r10, (rp) + mov %r10, -24(rp,%r15,8) + mov -8(rp,%r15,8), %r10 + adc $0, %r12 +L(r): ADDSUB %r11, %r13 + adc %r12, %rax + mov %r13, -16(rp,%r15,8) adc $0, %rdx -L(n1): mov 8(rp), %r10 - ADDSUB %rax, %r10 - mov %r10, 8(rp) - mov R32(%rbx), R32(%rax) C zero rax +L(x): ADDSUB %rax, %r10 + mov %r10, -8(rp,%r15,8) + mov $0, %eax adc %rdx, %rax - pop %rbp - pop %rbx - FUNC_EXIT() +L(ret): pop %r13 + pop %r12 + pop %r15 ret +L(n2): mul %rcx + mov -8(rp,%r15,8), %r10 + jmp L(r) +L(n1): mov -8(rp,%r15,8), %r10 + jmp L(x) EPILOGUE() diff --git a/gmp/mpn/x86_64/core2/copyd.asm b/gmp/mpn/x86_64/core2/copyd.asm deleted file mode 100644 index f0dc54a55e..0000000000 --- a/gmp/mpn/x86_64/core2/copyd.asm +++ /dev/null @@ -1,37 +0,0 @@ -dnl X86-64 mpn_copyd optimised for Intel Sandy Bridge. - -dnl Copyright 2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -MULFUNC_PROLOGUE(mpn_copyd) -include_mpn(`x86_64/fastsse/copyd-palignr.asm') diff --git a/gmp/mpn/x86_64/core2/copyi.asm b/gmp/mpn/x86_64/core2/copyi.asm deleted file mode 100644 index 9c26e00c52..0000000000 --- a/gmp/mpn/x86_64/core2/copyi.asm +++ /dev/null @@ -1,37 +0,0 @@ -dnl X86-64 mpn_copyi optimised for Intel Sandy Bridge. - -dnl Copyright 2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -MULFUNC_PROLOGUE(mpn_copyi) -include_mpn(`x86_64/fastsse/copyi-palignr.asm') diff --git a/gmp/mpn/x86_64/core2/divrem_1.asm b/gmp/mpn/x86_64/core2/divrem_1.asm deleted file mode 100644 index 623bea386c..0000000000 --- a/gmp/mpn/x86_64/core2/divrem_1.asm +++ /dev/null @@ -1,237 +0,0 @@ -dnl x86-64 mpn_divrem_1 -- mpn by limb division. - -dnl Copyright 2004, 2005, 2007-2010, 2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C norm unorm frac -C AMD K8,K9 15 15 12 -C AMD K10 15 15 12 -C Intel P4 44 44 43 -C Intel core2 24 24 19.5 -C Intel corei 19 19 18 -C Intel atom 51 51 36 -C VIA nano 46 44 22.5 - -C mp_limb_t -C mpn_divrem_1 (mp_ptr qp, mp_size_t fn, -C mp_srcptr np, mp_size_t nn, mp_limb_t d) - -C mp_limb_t -C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn, -C mp_srcptr np, mp_size_t nn, mp_limb_t d, -C mp_limb_t dinv, int cnt) - -C INPUT PARAMETERS -define(`qp', `%rdi') -define(`fn_param', `%rsi') -define(`up_param', `%rdx') -define(`un_param', `%rcx') -define(`d', `%r8') -define(`dinv', `%r9') C only for mpn_preinv_divrem_1 -C shift passed on stack C only for mpn_preinv_divrem_1 - -define(`cnt', `%rcx') -define(`up', `%rsi') -define(`fn', `%r12') -define(`un', `%rbx') - - -C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15 -C cnt qp d dinv - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -IFSTD(`define(`CNTOFF', `40($1)')') -IFDOS(`define(`CNTOFF', `104($1)')') - -ASM_START() - TEXT - ALIGN(16) -PROLOGUE(mpn_preinv_divrem_1) - FUNC_ENTRY(4) -IFDOS(` mov 56(%rsp), %r8 ') -IFDOS(` mov 64(%rsp), %r9 ') - xor R32(%rax), R32(%rax) - push %r13 - push %r12 - push %rbp - push %rbx - - mov fn_param, fn - mov un_param, un - add fn_param, un_param - mov up_param, up - - lea -8(qp,un_param,8), qp - - mov CNTOFF(%rsp), R8(cnt) - shl R8(cnt), d - jmp L(ent) -EPILOGUE() - - ALIGN(16) -PROLOGUE(mpn_divrem_1) - FUNC_ENTRY(4) -IFDOS(` mov 56(%rsp), %r8 ') - xor R32(%rax), R32(%rax) - push %r13 - push %r12 - push %rbp - push %rbx - - mov fn_param, fn - mov un_param, un - add fn_param, un_param - mov up_param, up - je L(ret) - - lea -8(qp,un_param,8), qp - xor R32(%rbp), R32(%rbp) - -L(unnormalized): - test un, un - je L(44) - mov -8(up,un,8), %rax - cmp d, %rax - jae L(44) - mov %rbp, (qp) - mov %rax, %rbp - lea -8(qp), qp - je L(ret) - dec un -L(44): - bsr d, %rcx - not R32(%rcx) - sal R8(%rcx), d - sal R8(%rcx), %rbp - - push %rcx -IFSTD(` push %rdi ') -IFSTD(` push %rsi ') - push %r8 -IFSTD(` mov d, %rdi ') -IFDOS(` mov d, %rcx ') - CALL( mpn_invert_limb) - pop %r8 -IFSTD(` pop %rsi ') -IFSTD(` pop %rdi ') - pop %rcx - - mov %rax, dinv - mov %rbp, %rax - test un, un - je L(frac) -L(ent): mov -8(up,un,8), %rbp - shr R8(%rcx), %rax - shld R8(%rcx), %rbp, %rax - sub $2, un - js L(end) - - ALIGN(16) -L(top): lea 1(%rax), %r11 - mul dinv - mov (up,un,8), %r10 - shld R8(%rcx), %r10, %rbp - mov %rbp, %r13 - add %rax, %r13 - adc %r11, %rdx - mov %rdx, %r11 - imul d, %rdx - sub %rdx, %rbp - lea (d,%rbp), %rax - sub $8, qp - cmp %r13, %rbp - cmovc %rbp, %rax - adc $-1, %r11 - cmp d, %rax - jae L(ufx) -L(uok): dec un - mov %r11, 8(qp) - mov %r10, %rbp - jns L(top) - -L(end): lea 1(%rax), %r11 - sal R8(%rcx), %rbp - mul dinv - add %rbp, %rax - adc %r11, %rdx - mov %rax, %r11 - mov %rdx, %r13 - imul d, %rdx - sub %rdx, %rbp - mov d, %rax - add %rbp, %rax - cmp %r11, %rbp - cmovc %rbp, %rax - adc $-1, %r13 - cmp d, %rax - jae L(efx) -L(eok): mov %r13, (qp) - sub $8, qp - jmp L(frac) - -L(ufx): sub d, %rax - inc %r11 - jmp L(uok) -L(efx): sub d, %rax - inc %r13 - jmp L(eok) - -L(frac):mov d, %rbp - neg %rbp - jmp L(fent) - - ALIGN(16) C K8-K10 P6-CNR P6-NHM P4 -L(ftop):mul dinv C 0,12 0,17 0,17 - add %r11, %rdx C 5 8 10 - mov %rax, %r11 C 4 8 3 - mov %rdx, %r13 C 6 9 11 - imul %rbp, %rdx C 6 9 11 - mov d, %rax C - add %rdx, %rax C 10 14 14 - cmp %r11, %rdx C 10 14 14 - cmovc %rdx, %rax C 11 15 15 - adc $-1, %r13 C - mov %r13, (qp) C - sub $8, qp C -L(fent):lea 1(%rax), %r11 C - dec fn C - jns L(ftop) C - - shr R8(%rcx), %rax -L(ret): pop %rbx - pop %rbp - pop %r12 - pop %r13 - FUNC_EXIT() - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/core2/gcd_1.asm b/gmp/mpn/x86_64/core2/gcd_1.asm deleted file mode 100644 index e0cab9b4e4..0000000000 --- a/gmp/mpn/x86_64/core2/gcd_1.asm +++ /dev/null @@ -1,144 +0,0 @@ -dnl AMD64 mpn_gcd_1 optimised for Intel C2, NHM, SBR and AMD K10, BD. - -dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn -dnl Granlund. - -dnl Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C cycles/bit (approx) -C AMD K8,K9 8.50 -C AMD K10 4.30 -C AMD bd1 5.00 -C AMD bobcat 10.0 -C Intel P4 18.6 -C Intel core2 3.83 -C Intel NHM 5.17 -C Intel SBR 4.69 -C Intel atom 17.0 -C VIA nano 5.44 -C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1 - -C TODO -C * Optimise inner-loop for specific CPUs. -C * Use DIV for 1-by-1 reductions, at least for some CPUs. - -C Threshold of when to call bmod when U is one limb. Should be about -C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit). -define(`BMOD_THRES_LOG2', 6) - -C INPUT PARAMETERS -define(`up', `%rdi') -define(`n', `%rsi') -define(`v0', `%rdx') - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -IFDOS(`define(`STACK_ALLOC', 40)') -IFSTD(`define(`STACK_ALLOC', 8)') - -C Undo some configure cleverness. -C The problem is that C only defines the '1c' variant, and that configure -C therefore considers modexact_1c to be the base function. It then adds a -C special fat rule for mpn_modexact_1_odd, messing up things when a cpudep -C gcd_1 exists without a corresponding cpudep mode1o. -ifdef(`WANT_FAT_BINARY', ` - define(`mpn_modexact_1_odd', `MPN_PREFIX`modexact_1_odd_x86_64'')') - - -ASM_START() - TEXT - ALIGN(16) -PROLOGUE(mpn_gcd_1) - FUNC_ENTRY(3) - mov (up), %rax C U low limb - or v0, %rax - bsf %rax, %rax C min(ctz(u0),ctz(v0)) - - bsf v0, %rcx - shr R8(%rcx), v0 - - push %rax C preserve common twos over call - push v0 C preserve v0 argument over call - sub $STACK_ALLOC, %rsp C maintain ABI required rsp alignment - - cmp $1, n - jnz L(reduce_nby1) - -C Both U and V are single limbs, reduce with bmod if u0 >> v0. - mov (up), %r8 - mov %r8, %rax - shr $BMOD_THRES_LOG2, %r8 - cmp %r8, v0 - ja L(reduced) - jmp L(bmod) - -L(reduce_nby1): - cmp $BMOD_1_TO_MOD_1_THRESHOLD, n - jl L(bmod) -IFDOS(` mov %rdx, %r8 ') -IFDOS(` mov %rsi, %rdx ') -IFDOS(` mov %rdi, %rcx ') - CALL( mpn_mod_1) - jmp L(reduced) -L(bmod): -IFDOS(` mov %rdx, %r8 ') -IFDOS(` mov %rsi, %rdx ') -IFDOS(` mov %rdi, %rcx ') - CALL( mpn_modexact_1_odd) -L(reduced): - - add $STACK_ALLOC, %rsp - pop %rdx - - bsf %rax, %rcx -C test %rax, %rax C FIXME: does this lower latency? - jnz L(mid) - jmp L(end) - - ALIGN(16) C K10 BD C2 NHM SBR -L(top): cmovc %r10, %rax C if x-y < 0 0,3 0,3 0,6 0,5 0,5 - cmovc %r9, %rdx C use x,y-x 0,3 0,3 2,8 1,7 1,7 -L(mid): shr R8(%rcx), %rax C 1,7 1,6 2,8 2,8 2,8 - mov %rdx, %r10 C 1 1 4 3 3 - sub %rax, %r10 C 2 2 5 4 4 - bsf %r10, %rcx C 3 3 6 5 5 - mov %rax, %r9 C 2 2 3 3 4 - sub %rdx, %rax C 2 2 4 3 4 - jnz L(top) C - -L(end): pop %rcx - mov %rdx, %rax - shl R8(%rcx), %rax - FUNC_EXIT() - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/core2/gmp-mparam.h b/gmp/mpn/x86_64/core2/gmp-mparam.h index 0f4f88f780..8207da4895 100644 --- a/gmp/mpn/x86_64/core2/gmp-mparam.h +++ b/gmp/mpn/x86_64/core2/gmp-mparam.h @@ -1,217 +1,78 @@ -/* Core 2 gmp-mparam.h -- Compiler/machine parameter header file. +/* "Core 2" gmp-mparam.h -- Compiler/machine parameter header file. -Copyright 1991, 1993, 1994, 2000-2010, 2012, 2014 Free Software Foundation, -Inc. +Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, +2008, 2009 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of either: +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. - * the GNU Lesser General Public License as published by the Free - Software Foundation; either version 3 of the License, or (at your - option) any later version. +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. -or +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ - * the GNU General Public License as published by the Free Software - Foundation; either version 2 of the License, or (at your option) any - later version. +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 -or both in parallel, as here. +/* 2133 MHz "Core 2" / 65nm / 4096 Kibyte cache / socket 775 */ -The GNU MP Library is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. - -You should have received copies of the GNU General Public License and the -GNU Lesser General Public License along with the GNU MP Library. If not, -see https://www.gnu.org/licenses/. */ - -#define GMP_LIMB_BITS 64 -#define GMP_LIMB_BYTES 8 - -/* 2133 MHz Core 2 (65nm) */ -/* FFT tuning limit = 60000000 */ -/* Generated by tuneup.c, 2014-03-13, gcc 4.5 */ - -#define MOD_1_NORM_THRESHOLD 0 /* always */ -#define MOD_1_UNNORM_THRESHOLD 0 /* always */ -#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 -#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 -#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 -#define MOD_1_2_TO_MOD_1_4_THRESHOLD 16 -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 -#define USE_PREINV_DIVREM_1 1 /* native */ -#define DIV_QR_1_NORM_THRESHOLD 1 -#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ -#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ -#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ -#define BMOD_1_TO_MOD_1_THRESHOLD 24 - -#define MUL_TOOM22_THRESHOLD 23 -#define MUL_TOOM33_THRESHOLD 65 -#define MUL_TOOM44_THRESHOLD 179 -#define MUL_TOOM6H_THRESHOLD 268 -#define MUL_TOOM8H_THRESHOLD 357 - -#define MUL_TOOM32_TO_TOOM43_THRESHOLD 69 -#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114 -#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73 -#define MUL_TOOM42_TO_TOOM63_THRESHOLD 78 -#define MUL_TOOM43_TO_TOOM54_THRESHOLD 100 - -#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ -#define SQR_TOOM2_THRESHOLD 28 -#define SQR_TOOM3_THRESHOLD 102 -#define SQR_TOOM4_THRESHOLD 160 -#define SQR_TOOM6_THRESHOLD 222 -#define SQR_TOOM8_THRESHOLD 296 - -#define MULMID_TOOM42_THRESHOLD 28 - -#define MULMOD_BNM1_THRESHOLD 12 -#define SQRMOD_BNM1_THRESHOLD 13 - -#define MUL_FFT_MODF_THRESHOLD 372 /* k = 5 */ -#define MUL_FFT_TABLE3 \ - { { 372, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ - { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \ - { 25, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ - { 13, 7}, { 27, 8}, { 15, 7}, { 32, 8}, \ - { 17, 7}, { 36, 8}, { 19, 7}, { 40, 8}, \ - { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ - { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ - { 47, 9}, { 27,10}, { 15, 9}, { 43,10}, \ - { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ - { 63,10}, { 39, 9}, { 83,10}, { 47, 9}, \ - { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ - { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ - { 127, 9}, { 255,10}, { 135, 9}, { 271,11}, \ - { 79,10}, { 159, 9}, { 319,10}, { 167,11}, \ - { 95,10}, { 191, 9}, { 383,10}, { 207,11}, \ - { 111,12}, { 63,11}, { 127,10}, { 271,11}, \ - { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \ - { 159,10}, { 319,12}, { 95,11}, { 191,10}, \ - { 383,11}, { 207,10}, { 415,11}, { 223,13}, \ - { 63,12}, { 127,11}, { 271,10}, { 543,11}, \ - { 287,10}, { 575,11}, { 303,10}, { 607,12}, \ - { 159,11}, { 319,10}, { 639,11}, { 351,12}, \ - { 191,11}, { 415,12}, { 223,11}, { 479,13}, \ - { 127,12}, { 255,11}, { 543,12}, { 287,11}, \ - { 607,12}, { 319,11}, { 639,12}, { 351,11}, \ - { 703,13}, { 191,12}, { 415,11}, { 831,12}, \ - { 479,14}, { 127,13}, { 255,12}, { 607,13}, \ - { 319,12}, { 703,13}, { 383,12}, { 831,13}, \ - { 447,12}, { 959,14}, { 255,13}, { 511,12}, \ - { 1023,13}, { 575,12}, { 1215,13}, { 639,12}, \ - { 1279,13}, { 703,14}, { 383,13}, { 831,12}, \ - { 1663,13}, { 895,15}, { 255,14}, { 511,13}, \ - { 1151,14}, { 639,13}, { 1343,14}, { 767,13}, \ - { 1599,14}, { 895,15}, { 511,14}, { 1279,13}, \ - { 2687,14}, { 1407,13}, { 2815,15}, { 767,14}, \ - { 1535,13}, { 3199,14}, { 1663,13}, { 3455,16}, \ - { 511,15}, { 1023,14}, { 2047,13}, { 4095,14}, \ - { 2175,12}, { 8959,14}, { 2303,13}, { 4607,12}, \ - { 9471,14}, { 2431,13}, { 4863,12}, { 9983,15}, \ - { 1279,14}, { 2559,12}, { 10239,14}, { 2687,12}, \ - { 11775,15}, { 1535,14}, { 3327,13}, { 6655,14}, \ - { 3455,13}, { 6911,14}, { 3583,12}, { 14335,11}, \ - { 28671,10}, { 57343,11}, { 2048,12}, { 4096,13}, \ - { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ - { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ - {2097152,22}, {4194304,23}, {8388608,24} } -#define MUL_FFT_TABLE3_SIZE 183 -#define MUL_FFT_THRESHOLD 4736 - -#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ -#define SQR_FFT_TABLE3 \ - { { 340, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ - { 9, 5}, { 19, 6}, { 23, 7}, { 12, 6}, \ - { 25, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ - { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ - { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ - { 33, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ - { 47, 9}, { 27,10}, { 15, 9}, { 43,10}, \ - { 23, 9}, { 55,10}, { 31, 9}, { 67,10}, \ - { 39, 9}, { 79,10}, { 47,11}, { 31,10}, \ - { 79,11}, { 47,10}, { 95,12}, { 31,11}, \ - { 63,10}, { 127, 9}, { 255, 8}, { 511, 9}, \ - { 271, 8}, { 543,11}, { 79, 9}, { 319, 8}, \ - { 639,11}, { 95,10}, { 191, 9}, { 383,10}, \ - { 207, 9}, { 415,12}, { 63,11}, { 127,10}, \ - { 271, 9}, { 543,10}, { 287, 9}, { 575,10}, \ - { 303, 9}, { 607,10}, { 319, 9}, { 639,11}, \ - { 175,12}, { 95,11}, { 191,10}, { 383,11}, \ - { 207,10}, { 415,13}, { 63,12}, { 127,11}, \ - { 271,10}, { 543,11}, { 287,10}, { 575,11}, \ - { 303,10}, { 607,11}, { 319,10}, { 639,11}, \ - { 351,12}, { 191,11}, { 415,10}, { 831,12}, \ - { 223,11}, { 447,10}, { 895,11}, { 479,13}, \ - { 127,12}, { 255,11}, { 543,12}, { 287,11}, \ - { 607,12}, { 319,11}, { 639,12}, { 351,13}, \ - { 191,12}, { 415,11}, { 831,12}, { 479,14}, \ - { 127,13}, { 255,12}, { 607,13}, { 319,12}, \ - { 703,13}, { 383,12}, { 831,13}, { 447,12}, \ - { 959,14}, { 255,13}, { 511,12}, { 1023,13}, \ - { 575,12}, { 1215,13}, { 639,12}, { 1279,13}, \ - { 703,14}, { 383,13}, { 831,12}, { 1663,13}, \ - { 959,15}, { 255,14}, { 511,13}, { 1087,12}, \ - { 2175,13}, { 1215,14}, { 639,13}, { 1343,12}, \ - { 2687,13}, { 1407,12}, { 2815,14}, { 767,13}, \ - { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \ - { 2175,14}, { 1151,13}, { 2303,12}, { 4607,13}, \ - { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \ - { 1407,15}, { 767,14}, { 1535,13}, { 3071,14}, \ - { 1663,13}, { 3455,12}, { 6911,14}, { 1791,13}, \ - { 3583,16}, { 511,15}, { 1023,14}, { 2175,13}, \ - { 4351,14}, { 2303,13}, { 4607,14}, { 2431,13}, \ - { 4863,15}, { 1279,14}, { 2815,13}, { 5631,14}, \ - { 2943,13}, { 5887,15}, { 1535,14}, { 3455,13}, \ - { 6911,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ - { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ - {2097152,22}, {4194304,23}, {8388608,24} } -#define SQR_FFT_TABLE3_SIZE 179 -#define SQR_FFT_THRESHOLD 3008 - -#define MULLO_BASECASE_THRESHOLD 0 /* always */ -#define MULLO_DC_THRESHOLD 62 -#define MULLO_MUL_N_THRESHOLD 9174 - -#define DC_DIV_QR_THRESHOLD 46 -#define DC_DIVAPPR_Q_THRESHOLD 155 -#define DC_BDIV_QR_THRESHOLD 50 -#define DC_BDIV_Q_THRESHOLD 94 - -#define INV_MULMOD_BNM1_THRESHOLD 48 -#define INV_NEWTON_THRESHOLD 156 -#define INV_APPR_THRESHOLD 155 - -#define BINV_NEWTON_THRESHOLD 234 -#define REDC_1_TO_REDC_2_THRESHOLD 22 -#define REDC_2_TO_REDC_N_THRESHOLD 48 - -#define MU_DIV_QR_THRESHOLD 1187 -#define MU_DIVAPPR_Q_THRESHOLD 1142 -#define MUPI_DIV_QR_THRESHOLD 74 -#define MU_BDIV_QR_THRESHOLD 1017 -#define MU_BDIV_Q_THRESHOLD 1187 - -#define POWM_SEC_TABLE 1,64,131,269,466 - -#define MATRIX22_STRASSEN_THRESHOLD 19 -#define HGCD_THRESHOLD 117 -#define HGCD_APPR_THRESHOLD 151 -#define HGCD_REDUCE_THRESHOLD 2121 -#define GCD_DC_THRESHOLD 427 -#define GCDEXT_DC_THRESHOLD 342 -#define JACOBI_BASE_METHOD 4 - -#define GET_STR_DC_THRESHOLD 11 -#define GET_STR_PRECOMPUTE_THRESHOLD 18 -#define SET_STR_DC_THRESHOLD 552 -#define SET_STR_PRECOMPUTE_THRESHOLD 1561 - -#define FAC_DSC_THRESHOLD 656 -#define FAC_ODD_THRESHOLD 23 +/* Generated by tuneup.c, 2009-01-14, gcc 4.2 */ + +#define MUL_KARATSUBA_THRESHOLD 18 +#define MUL_TOOM3_THRESHOLD 65 +#define MUL_TOOM44_THRESHOLD 166 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_KARATSUBA_THRESHOLD 32 +#define SQR_TOOM3_THRESHOLD 97 +#define SQR_TOOM4_THRESHOLD 163 + +#define MULLOW_BASECASE_THRESHOLD 0 /* always */ +#define MULLOW_DC_THRESHOLD 20 +#define MULLOW_MUL_N_THRESHOLD 232 + +#define DIV_SB_PREINV_THRESHOLD 0 /* always */ +#define DIV_DC_THRESHOLD 60 +#define POWM_THRESHOLD 77 + +#define MATRIX22_STRASSEN_THRESHOLD 25 +#define HGCD_THRESHOLD 140 +#define GCD_DC_THRESHOLD 691 +#define GCDEXT_DC_THRESHOLD 760 +#define JACOBI_BASE_METHOD 1 + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1_THRESHOLD 3 +#define MOD_1_2_THRESHOLD 5 +#define MOD_1_4_THRESHOLD 20 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define USE_PREINV_MOD_1 1 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */ + +#define GET_STR_DC_THRESHOLD 10 +#define GET_STR_PRECOMPUTE_THRESHOLD 16 +#define SET_STR_DC_THRESHOLD 668 +#define SET_STR_PRECOMPUTE_THRESHOLD 2052 + +#define MUL_FFT_TABLE { 336, 672, 1600, 2816, 7168, 20480, 81920, 327680, 786432, 0 } +#define MUL_FFT_MODF_THRESHOLD 352 +#define MUL_FFT_THRESHOLD 3456 + +#define SQR_FFT_TABLE { 336, 736, 1728, 3328, 7168, 20480, 81920, 327680, 0 } +#define SQR_FFT_MODF_THRESHOLD 352 +#define SQR_FFT_THRESHOLD 2432 + +/* Generated 2009-01-12, gcc 4.2 */ + +#define MUL_FFT_TABLE2 {{1,4}, {273,5}, {545,6}, {1217,7}, {3201,8}, {6913,9}, {7681,8}, {8449,9}, {9729,8}, {10497,9}, {13825,10}, {15361,9}, {19969,10}, {23553,9}, {28161,11}, {30721,10}, {31745,9}, {34305,10}, {39937,9}, {42497,10}, {56321,11}, {63489,10}, {81409,11}, {92161,10}, {93185,11}, {96257,12}, {126977,11}, {131073,10}, {138241,11}, {167937,10}, {169473,11}, {169985,10}, {172033,11}, {195585,9}, {196097,11}, {198657,10}, {208897,11}, {217089,12}, {258049,11}, {261121,9}, {262657,10}, {275457,11}, {302081,10}, {307201,11}, {331777,12}, {389121,11}, {425985,13}, {516097,12}, {520193,11}, {598017,12}, {610305,11}, {614401,12}, {651265,11}, {653313,10}, {654337,11}, {673793,10}, {674817,11}, {677889,10}, {679937,11}, {718849,10}, {719873,12}, {782337,11}, {850945,12}, {913409,11}, {925697,13}, {1040385,12}, {1044481,11}, {1112065,12}, {1175553,11}, {1244161,12}, {1306625,11}, {1310721,12}, {1327105,11}, {1347585,12}, {1355777,11}, {1366017,12}, {1439745,13}, {1564673,12}, {1835009,14}, {1900545,12}, {1904641,14}, {2080769,13}, {2088961,12}, {2488321,13}, {2613249,12}, {2879489,13}, {2932737,12}, {2940929,13}, {3137537,12}, {3403777,13}, {3661825,12}, {3928065,14}, {4177921,13}, {4186113,12}, {4452353,13}, {4710401,12}, {4978689,13}, {5234689,12}, {5500929,13}, {5758977,14}, {6275073,13}, {7856129,15}, {8355841,14}, {8372225,13}, {9957377,14}, {MP_SIZE_T_MAX, 0}} + +#define SQR_FFT_TABLE2 {{1,4}, {241,5}, {545,6}, {1345,7}, {3201,8}, {6913,9}, {7681,8}, {8961,9}, {9729,8}, {10497,9}, {13825,10}, {15361,9}, {19969,10}, {23553,9}, {28161,11}, {30721,10}, {31745,9}, {34305,10}, {55297,11}, {63489,10}, {80897,11}, {94209,10}, {97281,12}, {126977,11}, {129025,9}, {130049,10}, {138753,11}, {162817,9}, {164353,11}, {170497,10}, {178177,11}, {183297,10}, {184321,11}, {194561,10}, {208897,12}, {219137,11}, {221185,12}, {258049,11}, {261121,9}, {261633,10}, {267777,9}, {268289,11}, {270337,10}, {274945,9}, {276481,10}, {278529,11}, {292865,9}, {293377,10}, {295937,9}, {296449,10}, {306177,9}, {309249,10}, {310273,11}, {328705,12}, {331777,11}, {335873,12}, {344065,11}, {346113,12}, {352257,11}, {356353,12}, {389121,11}, {395265,10}, {398337,11}, {419841,10}, {421889,11}, {423937,13}, {516097,12}, {520193,11}, {546817,10}, {550913,11}, {561153,10}, {563201,11}, {579585,10}, {585729,11}, {621569,12}, {636929,11}, {638977,12}, {651265,11}, {714753,10}, {716801,11}, {718849,12}, {782337,11}, {849921,12}, {913409,11}, {954369,13}, {1040385,12}, {1044481,11}, {1112065,12}, {1175553,11}, {1243137,12}, {1306625,11}, {1374209,12}, {1437697,13}, {1564673,12}, {1961985,14}, {2080769,13}, {2088961,12}, {2486273,13}, {2613249,12}, {2879489,13}, {3137537,12}, {3272705,13}, {3661825,12}, {3928065,14}, {4177921,13}, {4186113,12}, {4452353,13}, {4710401,12}, {4976641,13}, {5234689,12}, {5320705,13}, {5324801,12}, {5447681,13}, {5455873,12}, {5500929,13}, {5758977,14}, {6275073,13}, {6283265,12}, {6549505,13}, {7856129,15}, {8355841,14}, {8372225,13}, {9953281,14}, {MP_SIZE_T_MAX, 0}} diff --git a/gmp/mpn/x86_64/core2/lshift.asm b/gmp/mpn/x86_64/core2/lshift.asm index 8ccafeca6c..60518901eb 100644 --- a/gmp/mpn/x86_64/core2/lshift.asm +++ b/gmp/mpn/x86_64/core2/lshift.asm @@ -1,83 +1,64 @@ dnl x86-64 mpn_lshift optimized for "Core 2". -dnl Copyright 2007, 2009, 2011, 2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. +dnl Copyright 2007 Free Software Foundation, Inc. dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb -C AMD K8,K9 4.25 -C AMD K10 4.25 -C Intel P4 14.7 -C Intel core2 1.27 -C Intel NHM 1.375 (up to about n = 260, then 1.5) -C Intel SBR 1.87 -C Intel atom ? -C VIA nano ? +C K8,K9: 4.25 +C K10: 4.25 +C P4: 14.7 +C P6-15: 1.27 C INPUT PARAMETERS define(`rp', `%rdi') define(`up', `%rsi') define(`n', `%rdx') -define(`cnt', `%rcx') - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) +define(`cnt', `%cl') ASM_START() TEXT ALIGN(16) PROLOGUE(mpn_lshift) - FUNC_ENTRY(4) lea -8(rp,n,8), rp lea -8(up,n,8), up - mov R32(%rdx), R32(%rax) - and $3, R32(%rax) + mov %edx, %eax + and $3, %eax jne L(nb00) L(b00): C n = 4, 8, 12, ... mov (up), %r10 mov -8(up), %r11 - xor R32(%rax), R32(%rax) - shld R8(cnt), %r10, %rax + xor %eax, %eax + shld %cl, %r10, %rax mov -16(up), %r8 lea 24(rp), rp sub $4, n jmp L(00) L(nb00):C n = 1, 5, 9, ... - cmp $2, R32(%rax) + cmp $2, %eax jae L(nb01) L(b01): mov (up), %r9 - xor R32(%rax), R32(%rax) - shld R8(cnt), %r9, %rax + xor %eax, %eax + shld %cl, %r9, %rax sub $2, n jb L(le1) mov -8(up), %r10 @@ -85,65 +66,62 @@ L(b01): mov (up), %r9 lea -8(up), up lea 16(rp), rp jmp L(01) -L(le1): shl R8(cnt), %r9 +L(le1): shl %cl, %r9 mov %r9, (rp) - FUNC_EXIT() ret L(nb01):C n = 2, 6, 10, ... jne L(b11) L(b10): mov (up), %r8 mov -8(up), %r9 - xor R32(%rax), R32(%rax) - shld R8(cnt), %r8, %rax + xor %eax, %eax + shld %cl, %r8, %rax sub $3, n jb L(le2) mov -16(up), %r10 lea -16(up), up lea 8(rp), rp jmp L(10) -L(le2): shld R8(cnt), %r9, %r8 +L(le2): shld %cl, %r9, %r8 mov %r8, (rp) - shl R8(cnt), %r9 + shl %cl, %r9 mov %r9, -8(rp) - FUNC_EXIT() ret ALIGN(16) C performance critical! L(b11): C n = 3, 7, 11, ... mov (up), %r11 mov -8(up), %r8 - xor R32(%rax), R32(%rax) - shld R8(cnt), %r11, %rax + xor %eax, %eax + shld %cl, %r11, %rax mov -16(up), %r9 lea -24(up), up sub $4, n jb L(end) ALIGN(16) -L(top): shld R8(cnt), %r8, %r11 +L(top): shld %cl, %r8, %r11 mov (up), %r10 mov %r11, (rp) -L(10): shld R8(cnt), %r9, %r8 +L(10): shld %cl, %r9, %r8 mov -8(up), %r11 mov %r8, -8(rp) -L(01): shld R8(cnt), %r10, %r9 +L(01): shld %cl, %r10, %r9 mov -16(up), %r8 mov %r9, -16(rp) -L(00): shld R8(cnt), %r11, %r10 +L(00): shld %cl, %r11, %r10 mov -24(up), %r9 + lea -32(up), up mov %r10, -24(rp) - add $-32, up lea -32(rp), rp sub $4, n jnc L(top) -L(end): shld R8(cnt), %r8, %r11 +L(end): shld %cl, %r8, %r11 mov %r11, (rp) - shld R8(cnt), %r9, %r8 + shld %cl, %r9, %r8 mov %r8, -8(rp) - shl R8(cnt), %r9 + shl %cl, %r9 mov %r9, -16(rp) - FUNC_EXIT() ret EPILOGUE() diff --git a/gmp/mpn/x86_64/core2/lshiftc.asm b/gmp/mpn/x86_64/core2/lshiftc.asm deleted file mode 100644 index 65c7b2f1b8..0000000000 --- a/gmp/mpn/x86_64/core2/lshiftc.asm +++ /dev/null @@ -1,159 +0,0 @@ -dnl x86-64 mpn_lshiftc optimized for "Core 2". - -dnl Copyright 2007, 2009, 2011, 2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C cycles/limb -C AMD K8,K9 ? -C AMD K10 ? -C Intel P4 ? -C Intel core2 1.5 -C Intel NHM 2.25 (up to about n = 260, then 1.875) -C Intel SBR 2.25 -C Intel atom ? -C VIA nano ? - - -C INPUT PARAMETERS -define(`rp', `%rdi') -define(`up', `%rsi') -define(`n', `%rdx') -define(`cnt', `%rcx') - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -ASM_START() - TEXT - ALIGN(16) -PROLOGUE(mpn_lshiftc) - FUNC_ENTRY(4) - lea -8(rp,n,8), rp - lea -8(up,n,8), up - - mov R32(%rdx), R32(%rax) - and $3, R32(%rax) - jne L(nb00) -L(b00): C n = 4, 8, 12, ... - mov (up), %r10 - mov -8(up), %r11 - xor R32(%rax), R32(%rax) - shld R8(cnt), %r10, %rax - mov -16(up), %r8 - lea 24(rp), rp - sub $4, n - jmp L(00) - -L(nb00):C n = 1, 5, 9, ... - cmp $2, R32(%rax) - jae L(nb01) -L(b01): mov (up), %r9 - xor R32(%rax), R32(%rax) - shld R8(cnt), %r9, %rax - sub $2, n - jb L(le1) - mov -8(up), %r10 - mov -16(up), %r11 - lea -8(up), up - lea 16(rp), rp - jmp L(01) -L(le1): shl R8(cnt), %r9 - not %r9 - mov %r9, (rp) - FUNC_EXIT() - ret - -L(nb01):C n = 2, 6, 10, ... - jne L(b11) -L(b10): mov (up), %r8 - mov -8(up), %r9 - xor R32(%rax), R32(%rax) - shld R8(cnt), %r8, %rax - sub $3, n - jb L(le2) - mov -16(up), %r10 - lea -16(up), up - lea 8(rp), rp - jmp L(10) -L(le2): shld R8(cnt), %r9, %r8 - not %r8 - mov %r8, (rp) - shl R8(cnt), %r9 - not %r9 - mov %r9, -8(rp) - FUNC_EXIT() - ret - - ALIGN(16) C performance critical! -L(b11): C n = 3, 7, 11, ... - mov (up), %r11 - mov -8(up), %r8 - xor R32(%rax), R32(%rax) - shld R8(cnt), %r11, %rax - mov -16(up), %r9 - lea -24(up), up - sub $4, n - jb L(end) - - ALIGN(16) -L(top): shld R8(cnt), %r8, %r11 - mov (up), %r10 - not %r11 - mov %r11, (rp) -L(10): shld R8(cnt), %r9, %r8 - mov -8(up), %r11 - not %r8 - mov %r8, -8(rp) -L(01): shld R8(cnt), %r10, %r9 - mov -16(up), %r8 - not %r9 - mov %r9, -16(rp) -L(00): shld R8(cnt), %r11, %r10 - mov -24(up), %r9 - not %r10 - mov %r10, -24(rp) - add $-32, up - lea -32(rp), rp - sub $4, n - jnc L(top) - -L(end): shld R8(cnt), %r8, %r11 - not %r11 - mov %r11, (rp) - shld R8(cnt), %r9, %r8 - not %r8 - mov %r8, -8(rp) - shl R8(cnt), %r9 - not %r9 - mov %r9, -16(rp) - FUNC_EXIT() - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/core2/mul_basecase.asm b/gmp/mpn/x86_64/core2/mul_basecase.asm deleted file mode 100644 index d16be852f7..0000000000 --- a/gmp/mpn/x86_64/core2/mul_basecase.asm +++ /dev/null @@ -1,975 +0,0 @@ -dnl X86-64 mpn_mul_basecase optimised for Intel Nehalem/Westmere. -dnl It also seems good for Conroe/Wolfdale. - -dnl Contributed to the GNU project by Torbjörn Granlund. - -dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb mul_1 mul_2 mul_3 addmul_2 -C AMD K8,K9 -C AMD K10 -C AMD bull -C AMD pile -C AMD steam -C AMD bobcat -C AMD jaguar -C Intel P4 -C Intel core 4.0 4.0 - 4.18-4.25 -C Intel NHM 3.75 3.8 - 4.06-4.2 -C Intel SBR -C Intel IBR -C Intel HWL -C Intel BWL -C Intel atom -C VIA nano - -C The inner loops of this code are the result of running a code generation and -C optimisation tool suite written by David Harvey and Torbjörn Granlund. - -C Code structure: -C -C -C m_1(0m4) m_1(1m4) m_1(2m4) m_1(3m4) -C | | | | -C m_2(0m4) | m_2(1m4) | m_2(2m4) | m_2(3m4) | -C | / | / | / | / -C | / | / | / | / -C | / | / | / | / -C \|/ |/_ \|/ |/_ \|/ |/_ \|/ |/_ -C _____ _____ _____ _____ -C / \ / \ / \ / \ -C \|/ | \|/ | \|/ | \|/ | -C am_2(0m4) | am_2(1m4) | am_2(2m4) | am_2(3m4) | -C \ /|\ \ /|\ \ /|\ \ /|\ -C \_____/ \_____/ \_____/ \_____/ - -C TODO -C * Tune. None done so far. -C * Currently 2687 bytes, making it smaller would be nice. -C * Implement some basecases, say for un < 4. -C * Try zeroing with xor in m2 loops. -C * Try re-rolling the m2 loops to avoid the current 9 insn code duplication -C between loop header and wind-down code. -C * Consider adc reg,reg instead of adc $0,reg in m2 loops. This save a byte. - -C When playing with pointers, set this to $2 to fall back to conservative -C indexing in wind-down code. -define(`I',`$1') - -C Define this to $1 to use late loop index variable as zero, $2 to use an -C explicit $0. -define(`Z',`$1') - -define(`rp', `%rdi') -define(`up', `%rsi') -define(`un_param', `%rdx') -define(`vp_param', `%rcx') C FIXME reallocate vp to rcx but watch performance! -define(`vn_param', `%r8') - -define(`un', `%r9') -define(`vn', `(%rsp)') - -define(`v0', `%r10') -define(`v1', `%r11') -define(`w0', `%rbx') -define(`w1', `%rcx') -define(`w2', `%rbp') -define(`w3', `%r12') -define(`i', `%r13') -define(`vp', `%r14') - -define(`X0', `%r8') -define(`X1', `%r15') - -C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -define(`ALIGNx', `ALIGN(16)') - -define(`N', 85) -ifdef(`N',,`define(`N',0)') -define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')') - -ASM_START() - TEXT - ALIGN(32) -PROLOGUE(mpn_mul_basecase) - FUNC_ENTRY(4) -IFDOS(` mov 56(%rsp), %r8d ') - mov (up), %rax C shared for mul_1 and mul_2 - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - - mov (vp_param), v0 C shared for mul_1 and mul_2 - - xor un, un - sub un_param, un C un = -un_param - - lea (up,un_param,8), up - lea (rp,un_param,8), rp - - mul v0 C shared for mul_1 and mul_2 - - test $1, R8(vn_param) - jz L(m2) - - lea 8(vp_param), vp C FIXME: delay until known needed - - test $1, R8(un) - jnz L(m1x1) - -L(m1x0):test $2, R8(un) - jnz L(m1s2) - -L(m1s0): - lea (un), i - mov %rax, (rp,un,8) - mov 8(up,un,8), %rax - mov %rdx, w0 C FIXME: Use lea? - lea L(do_am0)(%rip), %rbp - jmp L(m1e0) - -L(m1s2): - lea 2(un), i - mov %rax, (rp,un,8) - mov 8(up,un,8), %rax - mov %rdx, w0 C FIXME: Use lea? - mul v0 - lea L(do_am2)(%rip), %rbp - test i, i - jnz L(m1e2) - add %rax, w0 - adc $0, %rdx - mov w0, I(-8(rp),8(rp,un,8)) - mov %rdx, I((rp),16(rp,un,8)) - jmp L(ret2) - -L(m1x1):test $2, R8(un) - jz L(m1s3) - -L(m1s1): - lea 1(un), i - mov %rax, (rp,un,8) - test i, i - jz L(1) - mov 8(up,un,8), %rax - mov %rdx, w1 C FIXME: Use lea? - lea L(do_am1)(%rip), %rbp - jmp L(m1e1) -L(1): mov %rdx, I((rp),8(rp,un,8)) - jmp L(ret2) - -L(m1s3): - lea -1(un), i - mov %rax, (rp,un,8) - mov 8(up,un,8), %rax - mov %rdx, w1 C FIXME: Use lea? - lea L(do_am3)(%rip), %rbp - jmp L(m1e3) - - ALIGNx -L(m1top): - mul v0 - mov w1, -16(rp,i,8) -L(m1e2):xor R32(w1), R32(w1) - add %rax, w0 - mov (up,i,8), %rax - adc %rdx, w1 - mov w0, -8(rp,i,8) -L(m1e1):xor R32(w0), R32(w0) - mul v0 - add %rax, w1 - mov 8(up,i,8), %rax - adc %rdx, w0 - mov w1, (rp,i,8) -L(m1e0):xor R32(w1), R32(w1) - mul v0 - add %rax, w0 - mov 16(up,i,8), %rax - adc %rdx, w1 - mov w0, 8(rp,i,8) -L(m1e3):xor R32(w0), R32(w0) - mul v0 - add %rax, w1 - mov 24(up,i,8), %rax - adc %rdx, w0 - add $4, i - js L(m1top) - - mul v0 - mov w1, I(-16(rp),-16(rp,i,8)) - add %rax, w0 - adc $0, %rdx - mov w0, I(-8(rp),-8(rp,i,8)) - mov %rdx, I((rp),(rp,i,8)) - - dec vn_param - jz L(ret2) - lea -8(rp), rp - jmp *%rbp - -L(m2): - mov 8(vp_param), v1 - lea 16(vp_param), vp C FIXME: delay until known needed - - test $1, R8(un) - jnz L(bx1) - -L(bx0): test $2, R8(un) - jnz L(b10) - -L(b00): lea (un), i - mov %rax, (rp,un,8) - mov %rdx, w1 C FIXME: Use lea? - mov (up,un,8), %rax - mov $0, R32(w2) - jmp L(m2e0) - -L(b10): lea -2(un), i - mov %rax, w2 C FIXME: Use lea? - mov (up,un,8), %rax - mov %rdx, w3 C FIXME: Use lea? - mov $0, R32(w0) - jmp L(m2e2) - -L(bx1): test $2, R8(un) - jz L(b11) - -L(b01): lea 1(un), i - mov %rax, (rp,un,8) - mov (up,un,8), %rax - mov %rdx, w0 C FIXME: Use lea? - mov $0, R32(w1) - jmp L(m2e1) - -L(b11): lea -1(un), i - mov %rax, w1 C FIXME: Use lea? - mov (up,un,8), %rax - mov %rdx, w2 C FIXME: Use lea? - mov $0, R32(w3) - jmp L(m2e3) - - ALIGNx -L(m2top0): - mul v0 - add %rax, w3 - mov -8(up,i,8), %rax - mov w3, -8(rp,i,8) - adc %rdx, w0 - adc $0, R32(w1) - mul v1 - add %rax, w0 - adc %rdx, w1 - mov $0, R32(w2) - mov (up,i,8), %rax - mul v0 - add %rax, w0 - mov w0, (rp,i,8) - adc %rdx, w1 - mov (up,i,8), %rax - adc $0, R32(w2) -L(m2e0):mul v1 - add %rax, w1 - adc %rdx, w2 - mov 8(up,i,8), %rax - mul v0 - mov $0, R32(w3) - add %rax, w1 - adc %rdx, w2 - adc $0, R32(w3) - mov 8(up,i,8), %rax - mul v1 - add %rax, w2 - mov w1, 8(rp,i,8) - adc %rdx, w3 - mov $0, R32(w0) - mov 16(up,i,8), %rax - mul v0 - add %rax, w2 - mov 16(up,i,8), %rax - adc %rdx, w3 - adc $0, R32(w0) - mul v1 - mov $0, R32(w1) - add %rax, w3 - mov 24(up,i,8), %rax - mov w2, 16(rp,i,8) - adc %rdx, w0 - add $4, i - js L(m2top0) - - mul v0 - add %rax, w3 - mov I(-8(up),-8(up,i,8)), %rax - mov w3, I(-8(rp),-8(rp,i,8)) - adc %rdx, w0 - adc R32(w1), R32(w1) - mul v1 - add %rax, w0 - adc %rdx, w1 - mov w0, I((rp),(rp,i,8)) - mov w1, I(8(rp),8(rp,i,8)) - - add $-2, vn_param - jz L(ret2) - -L(do_am0): - push %r15 - push vn_param - -L(olo0): - mov (vp), v0 - mov 8(vp), v1 - lea 16(vp), vp - lea 16(rp), rp - mov (up,un,8), %rax -C lea 0(un), i - mov un, i - mul v0 - mov %rax, X0 - mov (up,un,8), %rax - MOV( %rdx, X1, 2) - mul v1 - MOV( %rdx, w0, 4) - mov (rp,un,8), w2 - mov %rax, w3 - jmp L(lo0) - - ALIGNx -L(am2top0): - mul v1 - add w0, w1 - adc %rax, w2 - mov (up,i,8), %rax - MOV( %rdx, w3, 1) - adc $0, w3 - mul v0 - add w1, X1 - mov X1, -8(rp,i,8) - adc %rax, X0 - MOV( %rdx, X1, 2) - adc $0, X1 - mov (up,i,8), %rax - mul v1 - MOV( %rdx, w0, 4) - mov (rp,i,8), w1 - add w1, w2 - adc %rax, w3 - adc $0, w0 -L(lo0): mov 8(up,i,8), %rax - mul v0 - add w2, X0 - adc %rax, X1 - mov X0, (rp,i,8) - MOV( %rdx, X0, 8) - adc $0, X0 - mov 8(up,i,8), %rax - mov 8(rp,i,8), w2 - mul v1 - add w2, w3 - adc %rax, w0 - MOV( %rdx, w1, 16) - adc $0, w1 - mov 16(up,i,8), %rax - mul v0 - add w3, X1 - mov X1, 8(rp,i,8) - adc %rax, X0 - MOV( %rdx, X1, 32) - mov 16(rp,i,8), w3 - adc $0, X1 - mov 16(up,i,8), %rax - mul v1 - add w3, w0 - MOV( %rdx, w2, 64) - adc %rax, w1 - mov 24(up,i,8), %rax - adc $0, w2 - mul v0 - add w0, X0 - mov X0, 16(rp,i,8) - MOV( %rdx, X0, 128) - adc %rax, X1 - mov 24(up,i,8), %rax - mov 24(rp,i,8), w0 - adc $0, X0 - add $4, i - jnc L(am2top0) - - mul v1 - add w0, w1 - adc %rax, w2 - adc Z(i,$0), %rdx - add w1, X1 - adc Z(i,$0), X0 - mov X1, I(-8(rp),-8(rp,i,8)) - add w2, X0 - mov X0, I((rp),(rp,i,8)) - adc Z(i,$0), %rdx - mov %rdx, I(8(rp),8(rp,i,8)) - - addl $-2, vn - jnz L(olo0) - -L(ret): pop %rax - pop %r15 -L(ret2):pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - FUNC_EXIT() - ret - - - ALIGNx -L(m2top1): - mul v0 - add %rax, w3 - mov -8(up,i,8), %rax - mov w3, -8(rp,i,8) - adc %rdx, w0 - adc $0, R32(w1) -L(m2e1):mul v1 - add %rax, w0 - adc %rdx, w1 - mov $0, R32(w2) - mov (up,i,8), %rax - mul v0 - add %rax, w0 - mov w0, (rp,i,8) - adc %rdx, w1 - mov (up,i,8), %rax - adc $0, R32(w2) - mul v1 - add %rax, w1 - adc %rdx, w2 - mov 8(up,i,8), %rax - mul v0 - mov $0, R32(w3) - add %rax, w1 - adc %rdx, w2 - adc $0, R32(w3) - mov 8(up,i,8), %rax - mul v1 - add %rax, w2 - mov w1, 8(rp,i,8) - adc %rdx, w3 - mov $0, R32(w0) - mov 16(up,i,8), %rax - mul v0 - add %rax, w2 - mov 16(up,i,8), %rax - adc %rdx, w3 - adc $0, R32(w0) - mul v1 - mov $0, R32(w1) - add %rax, w3 - mov 24(up,i,8), %rax - mov w2, 16(rp,i,8) - adc %rdx, w0 - add $4, i - js L(m2top1) - - mul v0 - add %rax, w3 - mov I(-8(up),-8(up,i,8)), %rax - mov w3, I(-8(rp),-8(rp,i,8)) - adc %rdx, w0 - adc R32(w1), R32(w1) - mul v1 - add %rax, w0 - adc %rdx, w1 - mov w0, I((rp),(rp,i,8)) - mov w1, I(8(rp),8(rp,i,8)) - - add $-2, vn_param - jz L(ret2) - -L(do_am1): - push %r15 - push vn_param - -L(olo1): - mov (vp), v0 - mov 8(vp), v1 - lea 16(vp), vp - lea 16(rp), rp - mov (up,un,8), %rax - lea 1(un), i - mul v0 - mov %rax, X1 - MOV( %rdx, X0, 128) - mov (up,un,8), %rax - mov (rp,un,8), w1 - mul v1 - mov %rax, w2 - mov 8(up,un,8), %rax - MOV( %rdx, w3, 1) - jmp L(lo1) - - ALIGNx -L(am2top1): - mul v1 - add w0, w1 - adc %rax, w2 - mov (up,i,8), %rax - MOV( %rdx, w3, 1) - adc $0, w3 -L(lo1): mul v0 - add w1, X1 - mov X1, -8(rp,i,8) - adc %rax, X0 - MOV( %rdx, X1, 2) - adc $0, X1 - mov (up,i,8), %rax - mul v1 - MOV( %rdx, w0, 4) - mov (rp,i,8), w1 - add w1, w2 - adc %rax, w3 - adc $0, w0 - mov 8(up,i,8), %rax - mul v0 - add w2, X0 - adc %rax, X1 - mov X0, (rp,i,8) - MOV( %rdx, X0, 8) - adc $0, X0 - mov 8(up,i,8), %rax - mov 8(rp,i,8), w2 - mul v1 - add w2, w3 - adc %rax, w0 - MOV( %rdx, w1, 16) - adc $0, w1 - mov 16(up,i,8), %rax - mul v0 - add w3, X1 - mov X1, 8(rp,i,8) - adc %rax, X0 - MOV( %rdx, X1, 32) - mov 16(rp,i,8), w3 - adc $0, X1 - mov 16(up,i,8), %rax - mul v1 - add w3, w0 - MOV( %rdx, w2, 64) - adc %rax, w1 - mov 24(up,i,8), %rax - adc $0, w2 - mul v0 - add w0, X0 - mov X0, 16(rp,i,8) - MOV( %rdx, X0, 128) - adc %rax, X1 - mov 24(up,i,8), %rax - mov 24(rp,i,8), w0 - adc $0, X0 - add $4, i - jnc L(am2top1) - - mul v1 - add w0, w1 - adc %rax, w2 - adc Z(i,$0), %rdx - add w1, X1 - adc Z(i,$0), X0 - mov X1, I(-8(rp),-8(rp,i,8)) - add w2, X0 - mov X0, I((rp),(rp,i,8)) - adc Z(i,$0), %rdx - mov %rdx, I(8(rp),8(rp,i,8)) - - addl $-2, vn - jnz L(olo1) - - pop %rax - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - FUNC_EXIT() - ret - - - ALIGNx -L(m2top2): - mul v0 - add %rax, w3 - mov -8(up,i,8), %rax - mov w3, -8(rp,i,8) - adc %rdx, w0 - adc $0, R32(w1) - mul v1 - add %rax, w0 - adc %rdx, w1 - mov $0, R32(w2) - mov (up,i,8), %rax - mul v0 - add %rax, w0 - mov w0, (rp,i,8) - adc %rdx, w1 - mov (up,i,8), %rax - adc $0, R32(w2) - mul v1 - add %rax, w1 - adc %rdx, w2 - mov 8(up,i,8), %rax - mul v0 - mov $0, R32(w3) - add %rax, w1 - adc %rdx, w2 - adc $0, R32(w3) - mov 8(up,i,8), %rax - mul v1 - add %rax, w2 - mov w1, 8(rp,i,8) - adc %rdx, w3 - mov $0, R32(w0) - mov 16(up,i,8), %rax - mul v0 - add %rax, w2 - mov 16(up,i,8), %rax - adc %rdx, w3 - adc $0, R32(w0) -L(m2e2):mul v1 - mov $0, R32(w1) - add %rax, w3 - mov 24(up,i,8), %rax - mov w2, 16(rp,i,8) - adc %rdx, w0 - add $4, i - js L(m2top2) - - mul v0 - add %rax, w3 - mov I(-8(up),-8(up,i,8)), %rax - mov w3, I(-8(rp),-8(rp,i,8)) - adc %rdx, w0 - adc R32(w1), R32(w1) - mul v1 - add %rax, w0 - adc %rdx, w1 - mov w0, I((rp),(rp,i,8)) - mov w1, I(8(rp),8(rp,i,8)) - - add $-2, vn_param - jz L(ret2) - -L(do_am2): - push %r15 - push vn_param - -L(olo2): - mov (vp), v0 - mov 8(vp), v1 - lea 16(vp), vp - lea 16(rp), rp - mov (up,un,8), %rax - lea -2(un), i - mul v0 - mov %rax, X0 - MOV( %rdx, X1, 32) - mov (up,un,8), %rax - mov (rp,un,8), w0 - mul v1 - mov %rax, w1 - lea (%rdx), w2 - mov 8(up,un,8), %rax - jmp L(lo2) - - ALIGNx -L(am2top2): - mul v1 - add w0, w1 - adc %rax, w2 - mov (up,i,8), %rax - MOV( %rdx, w3, 1) - adc $0, w3 - mul v0 - add w1, X1 - mov X1, -8(rp,i,8) - adc %rax, X0 - MOV( %rdx, X1, 2) - adc $0, X1 - mov (up,i,8), %rax - mul v1 - MOV( %rdx, w0, 4) - mov (rp,i,8), w1 - add w1, w2 - adc %rax, w3 - adc $0, w0 - mov 8(up,i,8), %rax - mul v0 - add w2, X0 - adc %rax, X1 - mov X0, (rp,i,8) - MOV( %rdx, X0, 8) - adc $0, X0 - mov 8(up,i,8), %rax - mov 8(rp,i,8), w2 - mul v1 - add w2, w3 - adc %rax, w0 - MOV( %rdx, w1, 16) - adc $0, w1 - mov 16(up,i,8), %rax - mul v0 - add w3, X1 - mov X1, 8(rp,i,8) - adc %rax, X0 - MOV( %rdx, X1, 32) - mov 16(rp,i,8), w3 - adc $0, X1 - mov 16(up,i,8), %rax - mul v1 - add w3, w0 - MOV( %rdx, w2, 64) - adc %rax, w1 - mov 24(up,i,8), %rax - adc $0, w2 -L(lo2): mul v0 - add w0, X0 - mov X0, 16(rp,i,8) - MOV( %rdx, X0, 128) - adc %rax, X1 - mov 24(up,i,8), %rax - mov 24(rp,i,8), w0 - adc $0, X0 - add $4, i - jnc L(am2top2) - - mul v1 - add w0, w1 - adc %rax, w2 - adc Z(i,$0), %rdx - add w1, X1 - adc Z(i,$0), X0 - mov X1, I(-8(rp),-8(rp,i,8)) - add w2, X0 - mov X0, I((rp),(rp,i,8)) - adc Z(i,$0), %rdx - mov %rdx, I(8(rp),8(rp,i,8)) - - addl $-2, vn - jnz L(olo2) - - pop %rax - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - FUNC_EXIT() - ret - - - ALIGNx -L(m2top3): - mul v0 - add %rax, w3 - mov -8(up,i,8), %rax - mov w3, -8(rp,i,8) - adc %rdx, w0 - adc $0, R32(w1) - mul v1 - add %rax, w0 - adc %rdx, w1 - mov $0, R32(w2) - mov (up,i,8), %rax - mul v0 - add %rax, w0 - mov w0, (rp,i,8) - adc %rdx, w1 - mov (up,i,8), %rax - adc $0, R32(w2) - mul v1 - add %rax, w1 - adc %rdx, w2 - mov 8(up,i,8), %rax - mul v0 - mov $0, R32(w3) - add %rax, w1 - adc %rdx, w2 - adc $0, R32(w3) - mov 8(up,i,8), %rax -L(m2e3):mul v1 - add %rax, w2 - mov w1, 8(rp,i,8) - adc %rdx, w3 - mov $0, R32(w0) - mov 16(up,i,8), %rax - mul v0 - add %rax, w2 - mov 16(up,i,8), %rax - adc %rdx, w3 - adc $0, R32(w0) - mul v1 - mov $0, R32(w1) - add %rax, w3 - mov 24(up,i,8), %rax - mov w2, 16(rp,i,8) - adc %rdx, w0 - add $4, i - js L(m2top3) - - mul v0 - add %rax, w3 - mov I(-8(up),-8(up,i,8)), %rax - mov w3, I(-8(rp),-8(rp,i,8)) - adc %rdx, w0 - adc $0, R32(w1) - mul v1 - add %rax, w0 - adc %rdx, w1 - mov w0, I((rp),(rp,i,8)) - mov w1, I(8(rp),8(rp,i,8)) - - add $-2, vn_param - jz L(ret2) - -L(do_am3): - push %r15 - push vn_param - -L(olo3): - mov (vp), v0 - mov 8(vp), v1 - lea 16(vp), vp - lea 16(rp), rp - mov (up,un,8), %rax - lea -1(un), i - mul v0 - mov %rax, X1 - MOV( %rdx, X0, 8) - mov (up,un,8), %rax - mov (rp,un,8), w3 - mul v1 - mov %rax, w0 - MOV( %rdx, w1, 16) - mov 8(up,un,8), %rax - jmp L(lo3) - - ALIGNx -L(am2top3): - mul v1 - add w0, w1 - adc %rax, w2 - mov (up,i,8), %rax - MOV( %rdx, w3, 1) - adc $0, w3 - mul v0 - add w1, X1 - mov X1, -8(rp,i,8) - adc %rax, X0 - MOV( %rdx, X1, 2) - adc $0, X1 - mov (up,i,8), %rax - mul v1 - MOV( %rdx, w0, 4) - mov (rp,i,8), w1 - add w1, w2 - adc %rax, w3 - adc $0, w0 - mov 8(up,i,8), %rax - mul v0 - add w2, X0 - adc %rax, X1 - mov X0, (rp,i,8) - MOV( %rdx, X0, 8) - adc $0, X0 - mov 8(up,i,8), %rax - mov 8(rp,i,8), w2 - mul v1 - add w2, w3 - adc %rax, w0 - MOV( %rdx, w1, 16) - adc $0, w1 - mov 16(up,i,8), %rax -L(lo3): mul v0 - add w3, X1 - mov X1, 8(rp,i,8) - adc %rax, X0 - MOV( %rdx, X1, 32) - mov 16(rp,i,8), w3 - adc $0, X1 - mov 16(up,i,8), %rax - mul v1 - add w3, w0 - MOV( %rdx, w2, 64) - adc %rax, w1 - mov 24(up,i,8), %rax - adc $0, w2 - mul v0 - add w0, X0 - mov X0, 16(rp,i,8) - MOV( %rdx, X0, 128) - adc %rax, X1 - mov 24(up,i,8), %rax - mov 24(rp,i,8), w0 - adc $0, X0 - add $4, i - jnc L(am2top3) - - mul v1 - add w0, w1 - adc %rax, w2 - adc Z(i,$0), %rdx - add w1, X1 - adc Z(i,$0), X0 - mov X1, I(-8(rp),-8(rp,i,8)) - add w2, X0 - mov X0, I((rp),(rp,i,8)) - adc Z(i,$0), %rdx - mov %rdx, I(8(rp),8(rp,i,8)) - - addl $-2, vn - jnz L(olo3) - - pop %rax - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - FUNC_EXIT() - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/core2/mullo_basecase.asm b/gmp/mpn/x86_64/core2/mullo_basecase.asm deleted file mode 100644 index 0f03d867f6..0000000000 --- a/gmp/mpn/x86_64/core2/mullo_basecase.asm +++ /dev/null @@ -1,427 +0,0 @@ -dnl AMD64 mpn_mullo_basecase optimised for Conroe/Wolfdale/Nehalem/Westmere. - -dnl Contributed to the GNU project by Torbjörn Granlund. - -dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb mul_2 addmul_2 -C AMD K8,K9 -C AMD K10 -C AMD bull -C AMD pile -C AMD steam -C AMD bobcat -C AMD jaguar -C Intel P4 -C Intel core 4.0 4.18-4.25 -C Intel NHM 3.75 4.06-4.2 -C Intel SBR -C Intel IBR -C Intel HWL -C Intel BWL -C Intel atom -C VIA nano - -C The inner loops of this code are the result of running a code generation and -C optimisation tool suite written by David Harvey and Torbjörn Granlund. - -C TODO -C * Implement proper cor2, replacing current cor0. -C * Offset n by 2 in order to avoid the outer loop cmp. (And sqr_basecase?) -C * Micro-optimise. - -C When playing with pointers, set this to $2 to fall back to conservative -C indexing in wind-down code. -define(`I',`$1') - -define(`rp', `%rdi') -define(`up', `%rsi') -define(`vp_param', `%rdx') -define(`n_param', `%rcx') - -define(`v0', `%r10') -define(`v1', `%r11') -define(`w0', `%rbx') -define(`w1', `%rcx') -define(`w2', `%rbp') -define(`w3', `%r12') -define(`n', `%r9') -define(`i', `%r13') -define(`vp', `%r8') - -define(`X0', `%r14') -define(`X1', `%r15') - -C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -define(`ALIGNx', `ALIGN(16)') - -define(`N', 85) -ifdef(`N',,`define(`N',0)') -define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')') - -ASM_START() - TEXT - ALIGN(32) -PROLOGUE(mpn_mullo_basecase) - FUNC_ENTRY(4) - - mov (up), %rax - mov vp_param, vp - - cmp $4, n_param - jb L(small) - - mov (vp_param), v0 - push %rbx - lea (rp,n_param,8), rp C point rp at R[un] - push %rbp - lea (up,n_param,8), up C point up right after U's end - push %r12 - mov $0, R32(n) C FIXME - sub n_param, n - push %r13 - mul v0 - mov 8(vp), v1 - - test $1, R8(n_param) - jnz L(m2x1) - -L(m2x0):test $2, R8(n_param) - jnz L(m2b2) - -L(m2b0):lea (n), i - mov %rax, (rp,n,8) - mov %rdx, w1 - mov (up,n,8), %rax - xor R32(w2), R32(w2) - jmp L(m2e0) - -L(m2b2):lea -2(n), i - mov %rax, w2 - mov (up,n,8), %rax - mov %rdx, w3 - xor R32(w0), R32(w0) - jmp L(m2e2) - -L(m2x1):test $2, R8(n_param) - jnz L(m2b3) - -L(m2b1):lea 1(n), i - mov %rax, (rp,n,8) - mov (up,n,8), %rax - mov %rdx, w0 - xor R32(w1), R32(w1) - jmp L(m2e1) - -L(m2b3):lea -1(n), i - xor R32(w3), R32(w3) - mov %rax, w1 - mov %rdx, w2 - mov (up,n,8), %rax - jmp L(m2e3) - - ALIGNx -L(m2tp):mul v0 - add %rax, w3 - mov -8(up,i,8), %rax - mov w3, -8(rp,i,8) - adc %rdx, w0 - adc $0, R32(w1) -L(m2e1):mul v1 - add %rax, w0 - adc %rdx, w1 - mov $0, R32(w2) - mov (up,i,8), %rax - mul v0 - add %rax, w0 - mov w0, (rp,i,8) - adc %rdx, w1 - mov (up,i,8), %rax - adc $0, R32(w2) -L(m2e0):mul v1 - add %rax, w1 - adc %rdx, w2 - mov 8(up,i,8), %rax - mul v0 - mov $0, R32(w3) - add %rax, w1 - adc %rdx, w2 - adc $0, R32(w3) - mov 8(up,i,8), %rax -L(m2e3):mul v1 - add %rax, w2 - mov w1, 8(rp,i,8) - adc %rdx, w3 - mov $0, R32(w0) - mov 16(up,i,8), %rax - mul v0 - add %rax, w2 - mov 16(up,i,8), %rax - adc %rdx, w3 - adc $0, R32(w0) -L(m2e2):mul v1 - mov $0, R32(w1) C FIXME: dead in last iteration - add %rax, w3 - mov 24(up,i,8), %rax - mov w2, 16(rp,i,8) - adc %rdx, w0 C FIXME: dead in last iteration - add $4, i - js L(m2tp) - -L(m2ed):imul v0, %rax - add w3, %rax - mov %rax, I(-8(rp),-8(rp,i,8)) - - add $2, n - lea 16(vp), vp - lea -16(up), up - cmp $-2, n - jge L(cor1) - - push %r14 - push %r15 - -L(outer): - mov (vp), v0 - mov 8(vp), v1 - mov (up,n,8), %rax - mul v0 - test $1, R8(n) - jnz L(a1x1) - -L(a1x0):mov %rax, X1 - MOV( %rdx, X0, 8) - mov (up,n,8), %rax - mul v1 - test $2, R8(n) - jnz L(a110) - -L(a100):lea (n), i - mov (rp,n,8), w3 - mov %rax, w0 - MOV( %rdx, w1, 16) - jmp L(lo0) - -L(a110):lea 2(n), i - mov (rp,n,8), w1 - mov %rax, w2 - mov 8(up,n,8), %rax - MOV( %rdx, w3, 1) - jmp L(lo2) - -L(a1x1):mov %rax, X0 - MOV( %rdx, X1, 2) - mov (up,n,8), %rax - mul v1 - test $2, R8(n) - jz L(a111) - -L(a101):lea 1(n), i - MOV( %rdx, w0, 4) - mov (rp,n,8), w2 - mov %rax, w3 - jmp L(lo1) - -L(a111):lea -1(n), i - MOV( %rdx, w2, 64) - mov %rax, w1 - mov (rp,n,8), w0 - mov 8(up,n,8), %rax - jmp L(lo3) - - ALIGNx -L(top): mul v1 - add w0, w1 - adc %rax, w2 - mov -8(up,i,8), %rax - MOV( %rdx, w3, 1) - adc $0, w3 -L(lo2): mul v0 - add w1, X1 - mov X1, -16(rp,i,8) - adc %rax, X0 - MOV( %rdx, X1, 2) - adc $0, X1 - mov -8(up,i,8), %rax - mul v1 - MOV( %rdx, w0, 4) - mov -8(rp,i,8), w1 - add w1, w2 - adc %rax, w3 - adc $0, w0 -L(lo1): mov (up,i,8), %rax - mul v0 - add w2, X0 - adc %rax, X1 - mov X0, -8(rp,i,8) - MOV( %rdx, X0, 8) - adc $0, X0 - mov (up,i,8), %rax - mov (rp,i,8), w2 - mul v1 - add w2, w3 - adc %rax, w0 - MOV( %rdx, w1, 16) - adc $0, w1 -L(lo0): mov 8(up,i,8), %rax - mul v0 - add w3, X1 - mov X1, (rp,i,8) - adc %rax, X0 - MOV( %rdx, X1, 32) - mov 8(rp,i,8), w3 - adc $0, X1 - mov 8(up,i,8), %rax - mul v1 - add w3, w0 - MOV( %rdx, w2, 64) - adc %rax, w1 - mov 16(up,i,8), %rax - adc $0, w2 -L(lo3): mul v0 - add w0, X0 - mov X0, 8(rp,i,8) - MOV( %rdx, X0, 128) - adc %rax, X1 - mov 16(up,i,8), %rax - mov 16(rp,i,8), w0 - adc $0, X0 - add $4, i - jnc L(top) - -L(end): imul v1, %rax - add w0, w1 - adc %rax, w2 - mov I(-8(up),-8(up,i,8)), %rax - imul v0, %rax - add w1, X1 - mov X1, I(-16(rp),-16(rp,i,8)) - adc X0, %rax - mov I(-8(rp),-8(rp,i,8)), w1 - add w1, w2 - add w2, %rax - mov %rax, I(-8(rp),-8(rp,i,8)) - - add $2, n - lea 16(vp), vp - lea -16(up), up - cmp $-2, n - jl L(outer) - - pop %r15 - pop %r14 - - jnz L(cor0) - -L(cor1):mov (vp), v0 - mov 8(vp), v1 - mov -16(up), %rax - mul v0 C u0 x v2 - add -16(rp), %rax C FIXME: rp[0] still available in reg? - adc -8(rp), %rdx C FIXME: rp[1] still available in reg? - mov -8(up), %rbx - imul v0, %rbx - mov -16(up), %rcx - imul v1, %rcx - mov %rax, -16(rp) - add %rbx, %rcx - add %rdx, %rcx - mov %rcx, -8(rp) - pop %r13 - pop %r12 - pop %rbp - pop %rbx - FUNC_EXIT() - ret - -L(cor0):mov (vp), %r11 - imul -8(up), %r11 - add %rax, %r11 - mov %r11, -8(rp) - pop %r13 - pop %r12 - pop %rbp - pop %rbx - FUNC_EXIT() - ret - - ALIGN(16) -L(small): - cmp $2, n_param - jae L(gt1) -L(n1): imul (vp_param), %rax - mov %rax, (rp) - FUNC_EXIT() - ret -L(gt1): ja L(gt2) -L(n2): mov (vp_param), %r9 - mul %r9 - mov %rax, (rp) - mov 8(up), %rax - imul %r9, %rax - add %rax, %rdx - mov 8(vp), %r9 - mov (up), %rcx - imul %r9, %rcx - add %rcx, %rdx - mov %rdx, 8(rp) - FUNC_EXIT() - ret -L(gt2): -L(n3): mov (vp_param), %r9 - mul %r9 C u0 x v0 - mov %rax, (rp) - mov %rdx, %r10 - mov 8(up), %rax - mul %r9 C u1 x v0 - imul 16(up), %r9 C u2 x v0 - add %rax, %r10 - adc %rdx, %r9 - mov 8(vp), %r11 - mov (up), %rax - mul %r11 C u0 x v1 - add %rax, %r10 - adc %rdx, %r9 - imul 8(up), %r11 C u1 x v1 - add %r11, %r9 - mov %r10, 8(rp) - mov 16(vp), %r10 - mov (up), %rax - imul %rax, %r10 C u0 x v2 - add %r10, %r9 - mov %r9, 16(rp) - FUNC_EXIT() - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/core2/popcount.asm b/gmp/mpn/x86_64/core2/popcount.asm index e935cf1892..6c22999ff4 100644 --- a/gmp/mpn/x86_64/core2/popcount.asm +++ b/gmp/mpn/x86_64/core2/popcount.asm @@ -3,33 +3,21 @@ dnl x86-64 mpn_popcount optimized for "Core 2". dnl Copyright 2007 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. -dnl + dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + dnl The GNU MP Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -MULFUNC_PROLOGUE(mpn_popcount) include_mpn(`x86/pentium4/sse2/popcount.asm') diff --git a/gmp/mpn/x86_64/core2/redc_1.asm b/gmp/mpn/x86_64/core2/redc_1.asm deleted file mode 100644 index d0e96ef1cb..0000000000 --- a/gmp/mpn/x86_64/core2/redc_1.asm +++ /dev/null @@ -1,425 +0,0 @@ -dnl X86-64 mpn_redc_1 optimised for Intel Conroe and Wolfdale. - -dnl Contributed to the GNU project by Torbjörn Granlund. - -dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb -C AMD K8,K9 ? -C AMD K10 ? -C AMD bull ? -C AMD pile ? -C AMD steam ? -C AMD bobcat ? -C AMD jaguar ? -C Intel P4 ? -C Intel core 4.5 (fluctuating) -C Intel NHM ? -C Intel SBR ? -C Intel IBR ? -C Intel HWL ? -C Intel BWL ? -C Intel atom ? -C VIA nano ? - -C The inner loops of this code are the result of running a code generation and -C optimisation tool suite written by David Harvey and Torbjörn Granlund. - -C TODO -C * Micro-optimise, none performed thus far. -C * Consider inlining mpn_add_n. -C * Single basecases out before the pushes. -C * Keep up[i] in registers for basecases (might require pushes). - -C When playing with pointers, set this to $2 to fall back to conservative -C indexing in wind-down code. -define(`I',`$1') - -define(`rp', `%rdi') C rcx -define(`up', `%rsi') C rdx -define(`mp_param', `%rdx') C r8 -define(`n', `%rcx') C r9 -define(`u0inv', `%r8') C stack - -define(`i', `%r14') -define(`j', `%r15') -define(`mp', `%r12') -define(`q0', `%r13') - -C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 -C X q0' n X rp up u0i mp q0 i j - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -define(`ALIGNx', `ALIGN(16)') - -ASM_START() - TEXT - ALIGN(32) -PROLOGUE(mpn_redc_1) - FUNC_ENTRY(4) -IFDOS(` mov 56(%rsp), %r8 ') - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - - mov (up), q0 - mov n, j C outer loop induction var - lea (mp_param,n,8), mp - lea -16(up,n,8), up - neg n - imul u0inv, q0 C first iteration q0 - - test $1, R8(n) - jz L(b0) - -L(b1): cmp $-1, R32(n) - jz L(n1) - cmp $-3, R32(n) - jz L(n3) - - push rp - -L(otp1):lea 3(n), i - mov (mp,n,8), %rax - mul q0 - lea (%rax), %rbp - mov 8(mp,n,8), %rax - lea (%rdx), %r9 - mul q0 - lea (%rax), %r11 - mov 16(mp,n,8), %rax - mov 16(up,n,8), %r10 - lea (%rdx), %rdi - mul q0 - add %rbp, %r10 - lea (%rax), %rbp - mov 24(mp,n,8), %rax - adc %r9, %r11 - mov 24(up,n,8), %rbx - lea (%rdx), %r9 - adc $0, %rdi - mul q0 - add %r11, %rbx - lea (%rax), %r11 - mov 32(mp,n,8), %rax - adc %rdi, %rbp - mov %rbx, 24(up,n,8) - mov 32(up,n,8), %r10 - lea (%rdx), %rdi - adc $0, %r9 - imul u0inv, %rbx C next q limb - add $2, i - jns L(ed1) - - ALIGNx -L(tp1): mul q0 - add %rbp, %r10 - lea (%rax), %rbp - mov (mp,i,8), %rax - adc %r9, %r11 - mov %r10, -8(up,i,8) - mov (up,i,8), %r10 - lea (%rdx), %r9 - adc $0, %rdi - mul q0 - add %r11, %r10 - lea (%rax), %r11 - mov 8(mp,i,8), %rax - adc %rdi, %rbp - mov %r10, (up,i,8) - mov 8(up,i,8), %r10 - lea (%rdx), %rdi - adc $0, %r9 - add $2, i - js L(tp1) - -L(ed1): mul q0 - add %rbp, %r10 - adc %r9, %r11 - mov %r10, I(-8(up),-8(up,i,8)) - mov I((up),(up,i,8)), %r10 - adc $0, %rdi - add %r11, %r10 - adc %rdi, %rax - mov %r10, I((up),(up,i,8)) - mov I(8(up),8(up,i,8)), %r10 - adc $0, %rdx - add %rax, %r10 - mov %r10, I(8(up),8(up,i,8)) - adc $0, %rdx - mov %rdx, 16(up,n,8) C up[0] - mov %rbx, q0 C previously computed q limb -> q0 - lea 8(up), up C up++ - dec j - jnz L(otp1) - jmp L(cj) - -L(b0): cmp $-2, R32(n) - jz L(n2) - cmp $-4, R32(n) - jz L(n4) - - push rp - -L(otp0):lea 4(n), i - mov (mp,n,8), %rax - mul q0 - lea (%rax), %r11 - mov 8(mp,n,8), %rax - lea (%rdx), %rdi - mul q0 - lea (%rax), %rbp - mov 16(mp,n,8), %rax - mov 16(up,n,8), %r10 - lea (%rdx), %r9 - mul q0 - add %r11, %r10 - lea (%rax), %r11 - mov 24(mp,n,8), %rax - adc %rdi, %rbp - mov 24(up,n,8), %rbx - lea (%rdx), %rdi - adc $0, %r9 - mul q0 - add %rbp, %rbx - lea (%rax), %rbp - mov 32(mp,n,8), %rax - adc %r9, %r11 - mov %rbx, 24(up,n,8) - mov 32(up,n,8), %r10 - lea (%rdx), %r9 - adc $0, %rdi - imul u0inv, %rbx C next q limb - jmp L(e0) - - ALIGNx -L(tp0): mul q0 - add %rbp, %r10 - lea (%rax), %rbp - mov (mp,i,8), %rax - adc %r9, %r11 - mov %r10, -8(up,i,8) - mov (up,i,8), %r10 - lea (%rdx), %r9 - adc $0, %rdi -L(e0): mul q0 - add %r11, %r10 - lea (%rax), %r11 - mov 8(mp,i,8), %rax - adc %rdi, %rbp - mov %r10, (up,i,8) - mov 8(up,i,8), %r10 - lea (%rdx), %rdi - adc $0, %r9 - add $2, i - js L(tp0) - -L(ed0): mul q0 - add %rbp, %r10 - adc %r9, %r11 - mov %r10, I(-8(up),-8(up,i,8)) - mov I((up),(up,i,8)), %r10 - adc $0, %rdi - add %r11, %r10 - adc %rdi, %rax - mov %r10, I((up),(up,i,8)) - mov I(8(up),8(up,i,8)), %r10 - adc $0, %rdx - add %rax, %r10 - mov %r10, I(8(up),8(up,i,8)) - adc $0, %rdx - mov %rdx, 16(up,n,8) C up[0] - mov %rbx, q0 C previously computed q limb -> q0 - lea 8(up), up C up++ - dec j - jnz L(otp0) - -L(cj): lea 16(up), up C FIXME - pop rp -L(add_n): -IFSTD(` lea (up,n,8), up C param 2: up - lea (up,n,8), %rdx C param 3: up - n - neg R32(n) ') C param 4: n - -IFDOS(` lea (up,n,8), %rdx C param 2: up - lea (%rdx,n,8), %r8 C param 3: up - n - neg R32(n) - mov n, %r9 C param 4: n - mov rp, %rcx ') C param 1: rp - - CALL( mpn_add_n) - -L(ret): pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - FUNC_EXIT() - ret - -L(n1): mov (mp_param), %rax - mul q0 - add 8(up), %rax - adc 16(up), %rdx - mov %rdx, (rp) - mov $0, R32(%rax) - adc R32(%rax), R32(%rax) - jmp L(ret) - -L(n2): mov (mp_param), %rax - mov (up), %rbp - mul q0 - add %rax, %rbp - mov %rdx, %r9 - adc $0, %r9 - mov -8(mp), %rax - mov 8(up), %r10 - mul q0 - add %rax, %r10 - mov %rdx, %r11 - adc $0, %r11 - add %r9, %r10 - adc $0, %r11 - mov %r10, q0 - imul u0inv, q0 C next q0 - mov -16(mp), %rax - mul q0 - add %rax, %r10 - mov %rdx, %r9 - adc $0, %r9 - mov -8(mp), %rax - mov 16(up), %r14 - mul q0 - add %rax, %r14 - adc $0, %rdx - add %r9, %r14 - adc $0, %rdx - xor R32(%rax), R32(%rax) - add %r11, %r14 - adc 24(up), %rdx - mov %r14, (rp) - mov %rdx, 8(rp) - adc R32(%rax), R32(%rax) - jmp L(ret) - - ALIGNx -L(n3): mov -24(mp), %rax - mov -8(up), %r10 - mul q0 - add %rax, %r10 - mov -16(mp), %rax - mov %rdx, %r11 - adc $0, %r11 - mov (up), %rbp - mul q0 - add %rax, %rbp - mov %rdx, %r9 - adc $0, %r9 - mov -8(mp), %rax - add %r11, %rbp - mov 8(up), %r10 - adc $0, %r9 - mul q0 - mov %rbp, q0 - imul u0inv, q0 C next q0 - add %rax, %r10 - mov %rdx, %r11 - adc $0, %r11 - mov %rbp, (up) - add %r9, %r10 - adc $0, %r11 - mov %r10, 8(up) - mov %r11, -8(up) C up[0] - lea 8(up), up C up++ - dec j - jnz L(n3) - - mov -32(up), %rdx - mov -24(up), %rbx - xor R32(%rax), R32(%rax) - add %rbp, %rdx - adc %r10, %rbx - adc 8(up), %r11 - mov %rdx, (rp) - mov %rbx, 8(rp) - mov %r11, 16(rp) - adc R32(%rax), R32(%rax) - jmp L(ret) - - ALIGNx -L(n4): mov -32(mp), %rax - mul q0 - lea (%rax), %r11 - mov -24(mp), %rax - lea (%rdx), %r14 - mul q0 - lea (%rax), %rbp - mov -16(mp), %rax - mov -16(up), %r10 - lea (%rdx), %r9 - mul q0 - add %r11, %r10 - lea (%rax), %r11 - mov -8(mp), %rax - adc %r14, %rbp - mov -8(up), %rbx - lea (%rdx), %r14 - adc $0, %r9 - mul q0 - add %rbp, %rbx - adc %r9, %r11 - mov %rbx, -8(up) - mov (up), %r10 - adc $0, %r14 - imul u0inv, %rbx C next q limb - add %r11, %r10 - adc %r14, %rax - mov %r10, (up) - mov 8(up), %r10 - adc $0, %rdx - add %rax, %r10 - mov %r10, 8(up) - adc $0, %rdx - mov %rdx, -16(up) C up[0] - mov %rbx, q0 C previously computed q limb -> q0 - lea 8(up), up C up++ - dec j - jnz L(n4) - lea 16(up), up - jmp L(add_n) -EPILOGUE() -ASM_END() diff --git a/gmp/mpn/x86_64/core2/rsh1aors_n.asm b/gmp/mpn/x86_64/core2/rsh1aors_n.asm deleted file mode 100644 index 27eed3712d..0000000000 --- a/gmp/mpn/x86_64/core2/rsh1aors_n.asm +++ /dev/null @@ -1,169 +0,0 @@ -dnl X86-64 mpn_rsh1add_n, mpn_rsh1sub_n optimised for Intel Conroe/Penryn. - -dnl Copyright 2003, 2005, 2009, 2011, 2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb -C AMD K8,K9 ? -C AMD K10 ? -C Intel P4 ? -C Intel core2 3.05 -C Intel NHM 3.3 -C Intel SBR 2.5 -C Intel atom ? -C VIA nano ? - -C TODO -C * Loopmix to approach 2.5 c/l on NHM. - -C INPUT PARAMETERS -define(`rp', `%rdi') -define(`up', `%rsi') -define(`vp', `%rdx') -define(`n', `%rcx') - -ifdef(`OPERATION_rsh1add_n', ` - define(ADDSUB, add) - define(ADCSBB, adc) - define(func_n, mpn_rsh1add_n) - define(func_nc, mpn_rsh1add_nc)') -ifdef(`OPERATION_rsh1sub_n', ` - define(ADDSUB, sub) - define(ADCSBB, sbb) - define(func_n, mpn_rsh1sub_n) - define(func_nc, mpn_rsh1sub_nc)') - -MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc) - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -ASM_START() - TEXT - ALIGN(16) -PROLOGUE(func_nc) - FUNC_ENTRY(4) -IFDOS(` mov 56(%rsp), %r8 ') - push %rbx - push %rbp - - neg %r8 C set C flag from parameter - mov (up), %r8 - ADCSBB (vp), %r8 - jmp L(ent) -EPILOGUE() - - ALIGN(16) -PROLOGUE(func_n) - FUNC_ENTRY(4) - push %rbx - push %rbp - - mov (up), %r8 - ADDSUB (vp), %r8 -L(ent): sbb R32(%rbx), R32(%rbx) C save cy - mov %r8, %rax - and $1, R32(%rax) C return value - - lea (up,n,8), up - lea (vp,n,8), vp - lea (rp,n,8), rp - mov R32(n), R32(%rbp) - neg n - and $3, R32(%rbp) - jz L(b0) - cmp $2, R32(%rbp) - jae L(n1) - -L(b1): mov %r8, %rbp - inc n - js L(top) - jmp L(end) - -L(n1): jnz L(b3) - add R32(%rbx), R32(%rbx) C restore cy - mov 8(up,n,8), %r11 - ADCSBB 8(vp,n,8), %r11 - sbb R32(%rbx), R32(%rbx) C save cy - mov %r8, %r10 - add $-2, n - jmp L(2) - -L(b3): add R32(%rbx), R32(%rbx) C restore cy - mov 8(up,n,8), %r10 - mov 16(up,n,8), %r11 - ADCSBB 8(vp,n,8), %r10 - ADCSBB 16(vp,n,8), %r11 - sbb R32(%rbx), R32(%rbx) C save cy - mov %r8, %r9 - dec n - jmp L(3) - -L(b0): add R32(%rbx), R32(%rbx) C restore cy - mov 8(up,n,8), %r9 - mov 16(up,n,8), %r10 - mov 24(up,n,8), %r11 - ADCSBB 8(vp,n,8), %r9 - ADCSBB 16(vp,n,8), %r10 - ADCSBB 24(vp,n,8), %r11 - sbb R32(%rbx), R32(%rbx) C save cy - jmp L(4) - - ALIGN(16) - -L(top): add R32(%rbx), R32(%rbx) C restore cy - mov (up,n,8), %r8 - mov 8(up,n,8), %r9 - mov 16(up,n,8), %r10 - mov 24(up,n,8), %r11 - ADCSBB (vp,n,8), %r8 - ADCSBB 8(vp,n,8), %r9 - ADCSBB 16(vp,n,8), %r10 - ADCSBB 24(vp,n,8), %r11 - sbb R32(%rbx), R32(%rbx) C save cy - shrd $1, %r8, %rbp - mov %rbp, -8(rp,n,8) -L(4): shrd $1, %r9, %r8 - mov %r8, (rp,n,8) -L(3): shrd $1, %r10, %r9 - mov %r9, 8(rp,n,8) -L(2): shrd $1, %r11, %r10 - mov %r10, 16(rp,n,8) -L(1): add $4, n - mov %r11, %rbp - js L(top) - -L(end): shrd $1, %rbx, %rbp - mov %rbp, -8(rp) - pop %rbp - pop %rbx - FUNC_EXIT() - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/core2/rshift.asm b/gmp/mpn/x86_64/core2/rshift.asm index ab32ec85df..9a3fc46f9a 100644 --- a/gmp/mpn/x86_64/core2/rshift.asm +++ b/gmp/mpn/x86_64/core2/rshift.asm @@ -1,69 +1,50 @@ dnl x86-64 mpn_rshift optimized for "Core 2". -dnl Copyright 2007, 2009, 2011, 2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. +dnl Copyright 2007 Free Software Foundation, Inc. dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb -C AMD K8,K9 4.25 -C AMD K10 4.25 -C Intel P4 14.7 -C Intel core2 1.27 -C Intel NHM 1.375 (up to about n = 260, then 1.5) -C Intel SBR 1.77 -C Intel atom ? -C VIA nano ? +C K8,K9: 4.25 +C K10: 4.25 +C P4: 14.7 +C P6-15: 1.27 C INPUT PARAMETERS define(`rp', `%rdi') define(`up', `%rsi') define(`n', `%rdx') -define(`cnt', `%rcx') - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) +define(`cnt', `%cl') ASM_START() TEXT ALIGN(16) PROLOGUE(mpn_rshift) - FUNC_ENTRY(4) - mov R32(%rdx), R32(%rax) - and $3, R32(%rax) + mov %edx, %eax + and $3, %eax jne L(nb00) L(b00): C n = 4, 8, 12, ... mov (up), %r10 mov 8(up), %r11 - xor R32(%rax), R32(%rax) - shrd R8(cnt), %r10, %rax + xor %eax, %eax + shrd %cl, %r10, %rax mov 16(up), %r8 lea 8(up), up lea -24(rp), rp @@ -71,11 +52,11 @@ L(b00): C n = 4, 8, 12, ... jmp L(00) L(nb00):C n = 1, 5, 9, ... - cmp $2, R32(%rax) + cmp $2, %eax jae L(nb01) L(b01): mov (up), %r9 - xor R32(%rax), R32(%rax) - shrd R8(cnt), %r9, %rax + xor %eax, %eax + shrd %cl, %r9, %rax sub $2, n jb L(le1) mov 8(up), %r10 @@ -83,65 +64,62 @@ L(b01): mov (up), %r9 lea 16(up), up lea -16(rp), rp jmp L(01) -L(le1): shr R8(cnt), %r9 +L(le1): shr %cl, %r9 mov %r9, (rp) - FUNC_EXIT() ret L(nb01):C n = 2, 6, 10, ... jne L(b11) L(b10): mov (up), %r8 mov 8(up), %r9 - xor R32(%rax), R32(%rax) - shrd R8(cnt), %r8, %rax + xor %eax, %eax + shrd %cl, %r8, %rax sub $3, n jb L(le2) mov 16(up), %r10 lea 24(up), up lea -8(rp), rp jmp L(10) -L(le2): shrd R8(cnt), %r9, %r8 +L(le2): shrd %cl, %r9, %r8 mov %r8, (rp) - shr R8(cnt), %r9 + shr %cl, %r9 mov %r9, 8(rp) - FUNC_EXIT() ret ALIGN(16) L(b11): C n = 3, 7, 11, ... mov (up), %r11 mov 8(up), %r8 - xor R32(%rax), R32(%rax) - shrd R8(cnt), %r11, %rax + xor %eax, %eax + shrd %cl, %r11, %rax mov 16(up), %r9 lea 32(up), up sub $4, n jb L(end) ALIGN(16) -L(top): shrd R8(cnt), %r8, %r11 +L(top): shrd %cl, %r8, %r11 mov -8(up), %r10 mov %r11, (rp) -L(10): shrd R8(cnt), %r9, %r8 +L(10): shrd %cl, %r9, %r8 mov (up), %r11 mov %r8, 8(rp) -L(01): shrd R8(cnt), %r10, %r9 +L(01): shrd %cl, %r10, %r9 mov 8(up), %r8 mov %r9, 16(rp) -L(00): shrd R8(cnt), %r11, %r10 +L(00): shrd %cl, %r11, %r10 mov 16(up), %r9 + lea 32(up), up mov %r10, 24(rp) - add $32, up lea 32(rp), rp sub $4, n jnc L(top) -L(end): shrd R8(cnt), %r8, %r11 +L(end): shrd %cl, %r8, %r11 mov %r11, (rp) - shrd R8(cnt), %r9, %r8 + shrd %cl, %r9, %r8 mov %r8, 8(rp) - shr R8(cnt), %r9 + shr %cl, %r9 mov %r9, 16(rp) - FUNC_EXIT() ret EPILOGUE() diff --git a/gmp/mpn/x86_64/core2/sec_tabselect.asm b/gmp/mpn/x86_64/core2/sec_tabselect.asm deleted file mode 100644 index e4360341d9..0000000000 --- a/gmp/mpn/x86_64/core2/sec_tabselect.asm +++ /dev/null @@ -1,37 +0,0 @@ -dnl X86-64 mpn_sec_tabselect. - -dnl Copyright 2012, 2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -MULFUNC_PROLOGUE(mpn_sec_tabselect) -include_mpn(`x86_64/fastsse/sec_tabselect.asm') diff --git a/gmp/mpn/x86_64/core2/sqr_basecase.asm b/gmp/mpn/x86_64/core2/sqr_basecase.asm deleted file mode 100644 index a112c1b52e..0000000000 --- a/gmp/mpn/x86_64/core2/sqr_basecase.asm +++ /dev/null @@ -1,984 +0,0 @@ -dnl X86-64 mpn_sqr_basecase optimised for Intel Nehalem/Westmere. -dnl It also seems good for Conroe/Wolfdale. - -dnl Contributed to the GNU project by Torbjörn Granlund. - -dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb mul_2 addmul_2 sqr_diag_addlsh1 -C AMD K8,K9 -C AMD K10 -C AMD bull -C AMD pile -C AMD steam -C AMD bobcat -C AMD jaguar -C Intel P4 -C Intel core 4.9 4.18-4.25 3.87 -C Intel NHM 3.8 4.06-4.2 3.5 -C Intel SBR -C Intel IBR -C Intel HWL -C Intel BWL -C Intel atom -C VIA nano - -C The inner loops of this code are the result of running a code generation and -C optimisation tool suite written by David Harvey and Torbjörn Granlund. - -C Code structure: -C -C -C m_2(0m4) m_2(2m4) m_2(1m4) m_2(3m4) -C | | | | -C | | | | -C | | | | -C \|/ \|/ \|/ \|/ -C ____________ ____________ -C / \ / \ -C \|/ \ \|/ \ -C am_2(3m4) am_2(1m4) am_2(0m4) am_2(2m4) -C \ /|\ \ /|\ -C \____________/ \____________/ -C \ / -C \ / -C \ / -C tail(0m2) tail(1m2) -C \ / -C \ / -C sqr_diag_addlsh1 - -C TODO -C * Tune. None done so far. -C * Currently 2761 bytes, making it smaller would be nice. -C * Consider using a jumptab-based entry sequence. One might even use a mask- -C less sequence, if the table is large enough to support tuneup's needs. -C The code would be, using non-PIC code, -C lea tab(%rip),%rax; jmp *(n,%rax) -C or, -C lea tab(%rip),%rax; lea (%rip),%rbx; add (n,%rax),%rbx; jmp *%rbx -C using PIC code. The table entries would be Ln1,Ln2,Ln3,Lm0,Lm1,Lm2,Lm3,.. -C with the last four entries repeated a safe number of times. -C * Consider expanding feed-in code in order to avoid zeroing registers. -C * Zero consistently with xor. -C * Check if using "lea (reg),reg" should be done in more places; we have some -C explicit "mov %rax,reg" now. -C * Try zeroing with xor in m2 loops. -C * Try re-rolling the m2 loops to avoid the current 9 insn code duplication -C between loop header and wind-down code. -C * Consider adc reg,reg instead of adc $0,reg in m2 loops. This save a byte. - -C When playing with pointers, set this to $2 to fall back to conservative -C indexing in wind-down code. -define(`I',`$1') - -C Define this to $1 to use late loop index variable as zero, $2 to use an -C explicit $0. -define(`Z',`$1') - -define(`rp', `%rdi') -define(`up', `%rsi') -define(`n_param', `%rdx') - -define(`n', `%r8') - -define(`v0', `%r10') -define(`v1', `%r11') -define(`w0', `%rbx') -define(`w1', `%rcx') -define(`w2', `%rbp') -define(`w3', `%r9') -define(`i', `%r13') - -define(`X0', `%r12') -define(`X1', `%r14') - -C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -define(`ALIGNx', `ALIGN(16)') - -define(`N', 85) -ifdef(`N',,`define(`N',0)') -define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')') - -ASM_START() - TEXT - ALIGN(32) -PROLOGUE(mpn_sqr_basecase) - FUNC_ENTRY(3) - - cmp $4, n_param - jl L(small) - - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - - mov (up), v0 - mov 8(up), %rax - mov %rax, v1 - - mov $1, R32(n) - sub n_param, n C n = -n_param+1 - push n - - lea (up,n_param,8), up - lea (rp,n_param,8), rp - - mul v0 - - test $1, R8(n) - jnz L(bx1) - -L(bx0): test $2, R8(n) - mov %rax, (rp,n,8) - jnz L(b10) - -L(b00): lea (n), i C n = 5, 9, ... - mov %rdx, w1 C FIXME: Use lea? - xor R32(w2), R32(w2) - jmp L(m2e0) - -L(b10): lea 2(n), i C n = 7, 11, ... - mov 8(up,n,8), %rax - mov %rdx, w3 C FIXME: Use lea? - xor R32(w0), R32(w0) - xor R32(w1), R32(w1) - jmp L(m2e2) - -L(bx1): test $2, R8(n) - mov %rax, (rp,n,8) - jz L(b11) - -L(b01): lea 1(n), i C n = 6, 10, ... - mov %rdx, w0 C FIXME: Use lea? - xor R32(w1), R32(w1) - jmp L(m2e1) - -L(b11): lea -1(n), i C n = 4, 8, 12, ... - mov %rdx, w2 C FIXME: Use lea? - xor R32(w3), R32(w3) - jmp L(m2e3) - - - ALIGNx -L(m2top1): - mul v0 - add %rax, w3 - mov -8(up,i,8), %rax - mov w3, -8(rp,i,8) - adc %rdx, w0 - adc $0, R32(w1) - mul v1 - add %rax, w0 - adc %rdx, w1 -L(m2e1):mov $0, R32(w2) - mov (up,i,8), %rax - mul v0 - add %rax, w0 - mov w0, (rp,i,8) - adc %rdx, w1 - mov (up,i,8), %rax - adc $0, R32(w2) - mul v1 - add %rax, w1 - adc %rdx, w2 - mov 8(up,i,8), %rax - mul v0 - mov $0, R32(w3) - add %rax, w1 - adc %rdx, w2 - adc $0, R32(w3) - mov 8(up,i,8), %rax - mul v1 - add %rax, w2 - mov w1, 8(rp,i,8) - adc %rdx, w3 - mov $0, R32(w0) - mov 16(up,i,8), %rax - mul v0 - add %rax, w2 - mov 16(up,i,8), %rax - adc %rdx, w3 - adc $0, R32(w0) - mul v1 - mov $0, R32(w1) - add %rax, w3 - mov 24(up,i,8), %rax - mov w2, 16(rp,i,8) - adc %rdx, w0 - add $4, i - js L(m2top1) - - mul v0 - add %rax, w3 - mov I(-8(up),-8(up,i,8)), %rax - mov w3, I(-8(rp),-8(rp,i,8)) - adc %rdx, w0 - adc R32(w1), R32(w1) - mul v1 - add w0, %rax - adc w1, %rdx - mov %rax, I((rp),(rp,i,8)) - mov %rdx, I(8(rp),8(rp,i,8)) - - lea 16(rp), rp - add $2, n C decrease |n| - jmp L(am2o3) - - ALIGNx -L(m2top3): - mul v0 - add %rax, w3 - mov -8(up,i,8), %rax - mov w3, -8(rp,i,8) - adc %rdx, w0 - adc $0, R32(w1) - mul v1 - add %rax, w0 - adc %rdx, w1 - mov $0, R32(w2) - mov (up,i,8), %rax - mul v0 - add %rax, w0 - mov w0, (rp,i,8) - adc %rdx, w1 - mov (up,i,8), %rax - adc $0, R32(w2) - mul v1 - add %rax, w1 - adc %rdx, w2 - mov 8(up,i,8), %rax - mul v0 - mov $0, R32(w3) - add %rax, w1 - adc %rdx, w2 - adc $0, R32(w3) - mov 8(up,i,8), %rax - mul v1 - add %rax, w2 - mov w1, 8(rp,i,8) - adc %rdx, w3 -L(m2e3):mov $0, R32(w0) - mov 16(up,i,8), %rax - mul v0 - add %rax, w2 - mov 16(up,i,8), %rax - adc %rdx, w3 - adc $0, R32(w0) - mul v1 - mov $0, R32(w1) - add %rax, w3 - mov 24(up,i,8), %rax - mov w2, 16(rp,i,8) - adc %rdx, w0 - add $4, i - js L(m2top3) - - mul v0 - add %rax, w3 - mov I(-8(up),-8(up,i,8)), %rax - mov w3, I(-8(rp),-8(rp,i,8)) - adc %rdx, w0 - adc R32(w1), R32(w1) - mul v1 - add w0, %rax - adc w1, %rdx - mov %rax, I((rp),(rp,i,8)) - mov %rdx, I(8(rp),8(rp,i,8)) - - lea 16(rp), rp - add $2, n C decrease |n| - cmp $-1, n - jz L(cor1) C jumps iff entry n = 4 - -L(am2o1): - mov -8(up,n,8), v0 - mov (up,n,8), %rax - mov %rax, v1 - lea 1(n), i - mul v0 - mov %rax, X1 - MOV( %rdx, X0, 128) - mov (rp,n,8), w1 - xor R32(w2), R32(w2) - mov 8(up,n,8), %rax - xor R32(w3), R32(w3) - jmp L(lo1) - - ALIGNx -L(am2top1): - mul v1 - add w0, w1 - adc %rax, w2 - mov (up,i,8), %rax - MOV( %rdx, w3, 1) - adc $0, w3 -L(lo1): mul v0 - add w1, X1 - mov X1, -8(rp,i,8) - adc %rax, X0 - MOV( %rdx, X1, 2) - adc $0, X1 - mov (up,i,8), %rax - mul v1 - MOV( %rdx, w0, 4) - mov (rp,i,8), w1 - add w1, w2 - adc %rax, w3 - adc $0, w0 - mov 8(up,i,8), %rax - mul v0 - add w2, X0 - adc %rax, X1 - mov X0, (rp,i,8) - MOV( %rdx, X0, 8) - adc $0, X0 - mov 8(up,i,8), %rax - mov 8(rp,i,8), w2 - mul v1 - add w2, w3 - adc %rax, w0 - MOV( %rdx, w1, 16) - adc $0, w1 - mov 16(up,i,8), %rax - mul v0 - add w3, X1 - mov X1, 8(rp,i,8) - adc %rax, X0 - MOV( %rdx, X1, 32) - mov 16(rp,i,8), w3 - adc $0, X1 - mov 16(up,i,8), %rax - mul v1 - add w3, w0 - MOV( %rdx, w2, 64) - adc %rax, w1 - mov 24(up,i,8), %rax - adc $0, w2 - mul v0 - add w0, X0 - mov X0, 16(rp,i,8) - MOV( %rdx, X0, 128) - adc %rax, X1 - mov 24(up,i,8), %rax - mov 24(rp,i,8), w0 - adc $0, X0 - add $4, i - jnc L(am2top1) - - mul v1 - add w0, w1 - adc w2, %rax - adc Z(i,$0), %rdx - add w1, X1 - adc Z(i,$0), X0 - mov X1, I(-8(rp),-8(rp,i,8)) - add X0, %rax - mov %rax, I((rp),(rp,i,8)) - adc Z(i,$0), %rdx - mov %rdx, I(8(rp),8(rp,i,8)) - - lea 16(rp), rp - add $2, n - -L(am2o3): - mov -8(up,n,8), v0 - mov (up,n,8), %rax - mov %rax, v1 - lea -1(n), i - mul v0 - mov %rax, X1 - MOV( %rdx, X0, 8) - mov (rp,n,8), w3 - xor R32(w0), R32(w0) - xor R32(w1), R32(w1) - mov 8(up,n,8), %rax - jmp L(lo3) - - ALIGNx -L(am2top3): - mul v1 - add w0, w1 - adc %rax, w2 - mov (up,i,8), %rax - MOV( %rdx, w3, 1) - adc $0, w3 - mul v0 - add w1, X1 - mov X1, -8(rp,i,8) - adc %rax, X0 - MOV( %rdx, X1, 2) - adc $0, X1 - mov (up,i,8), %rax - mul v1 - MOV( %rdx, w0, 4) - mov (rp,i,8), w1 - add w1, w2 - adc %rax, w3 - adc $0, w0 - mov 8(up,i,8), %rax - mul v0 - add w2, X0 - adc %rax, X1 - mov X0, (rp,i,8) - MOV( %rdx, X0, 8) - adc $0, X0 - mov 8(up,i,8), %rax - mov 8(rp,i,8), w2 - mul v1 - add w2, w3 - adc %rax, w0 - MOV( %rdx, w1, 16) - adc $0, w1 - mov 16(up,i,8), %rax -L(lo3): mul v0 - add w3, X1 - mov X1, 8(rp,i,8) - adc %rax, X0 - MOV( %rdx, X1, 32) - mov 16(rp,i,8), w3 - adc $0, X1 - mov 16(up,i,8), %rax - mul v1 - add w3, w0 - MOV( %rdx, w2, 64) - adc %rax, w1 - mov 24(up,i,8), %rax - adc $0, w2 - mul v0 - add w0, X0 - mov X0, 16(rp,i,8) - MOV( %rdx, X0, 128) - adc %rax, X1 - mov 24(up,i,8), %rax - mov 24(rp,i,8), w0 - adc $0, X0 - add $4, i - jnc L(am2top3) - - mul v1 - add w0, w1 - adc w2, %rax - adc Z(i,$0), %rdx - add w1, X1 - adc Z(i,$0), X0 - mov X1, I(-8(rp),-8(rp,i,8)) - add X0, %rax - mov %rax, I((rp),(rp,i,8)) - adc Z(i,$0), %rdx - mov %rdx, I(8(rp),8(rp,i,8)) - - lea 16(rp), rp - add $2, n - cmp $-1, n - jnz L(am2o1) - -L(cor1):pop n - mov %rdx, w3 - mov -16(up), v0 - mov -8(up), %rax - mul v0 - add w3, %rax - adc $0, %rdx - mov %rax, -8(rp) - mov %rdx, (rp) - jmp L(sqr_diag_addlsh1) - - ALIGNx -L(m2top2): -L(m2e2):mul v0 - add %rax, w3 - mov -8(up,i,8), %rax - mov w3, -8(rp,i,8) - adc %rdx, w0 - adc $0, R32(w1) - mul v1 - add %rax, w0 - adc %rdx, w1 - mov $0, R32(w2) - mov (up,i,8), %rax - mul v0 - add %rax, w0 - mov w0, (rp,i,8) - adc %rdx, w1 - mov (up,i,8), %rax - adc $0, R32(w2) - mul v1 - add %rax, w1 - adc %rdx, w2 - mov 8(up,i,8), %rax - mul v0 - mov $0, R32(w3) - add %rax, w1 - adc %rdx, w2 - adc $0, R32(w3) - mov 8(up,i,8), %rax - mul v1 - add %rax, w2 - mov w1, 8(rp,i,8) - adc %rdx, w3 - mov $0, R32(w0) - mov 16(up,i,8), %rax - mul v0 - add %rax, w2 - mov 16(up,i,8), %rax - adc %rdx, w3 - adc $0, R32(w0) - mul v1 - mov $0, R32(w1) - add %rax, w3 - mov 24(up,i,8), %rax - mov w2, 16(rp,i,8) - adc %rdx, w0 - add $4, i - js L(m2top2) - - mul v0 - add %rax, w3 - mov I(-8(up),-8(up,i,8)), %rax - mov w3, I(-8(rp),-8(rp,i,8)) - adc %rdx, w0 - adc R32(w1), R32(w1) - mul v1 - add w0, %rax - adc w1, %rdx - mov %rax, I((rp),(rp,i,8)) - mov %rdx, I(8(rp),8(rp,i,8)) - - lea 16(rp), rp - add $2, n C decrease |n| - jmp L(am2o0) - - ALIGNx -L(m2top0): - mul v0 - add %rax, w3 - mov -8(up,i,8), %rax - mov w3, -8(rp,i,8) - adc %rdx, w0 - adc $0, R32(w1) - mul v1 - add %rax, w0 - adc %rdx, w1 - mov $0, R32(w2) - mov (up,i,8), %rax - mul v0 - add %rax, w0 - mov w0, (rp,i,8) - adc %rdx, w1 - mov (up,i,8), %rax - adc $0, R32(w2) - mul v1 - add %rax, w1 - adc %rdx, w2 -L(m2e0):mov 8(up,i,8), %rax - mul v0 - mov $0, R32(w3) - add %rax, w1 - adc %rdx, w2 - adc $0, R32(w3) - mov 8(up,i,8), %rax - mul v1 - add %rax, w2 - mov w1, 8(rp,i,8) - adc %rdx, w3 - mov $0, R32(w0) - mov 16(up,i,8), %rax - mul v0 - add %rax, w2 - mov 16(up,i,8), %rax - adc %rdx, w3 - adc $0, R32(w0) - mul v1 - mov $0, R32(w1) - add %rax, w3 - mov 24(up,i,8), %rax - mov w2, 16(rp,i,8) - adc %rdx, w0 - add $4, i - js L(m2top0) - - mul v0 - add %rax, w3 - mov I(-8(up),-8(up,i,8)), %rax - mov w3, I(-8(rp),-8(rp,i,8)) - adc %rdx, w0 - adc R32(w1), R32(w1) - mul v1 - add w0, %rax - adc w1, %rdx - mov %rax, I((rp),(rp,i,8)) - mov %rdx, I(8(rp),8(rp,i,8)) - - lea 16(rp), rp - add $2, n C decrease |n| - cmp $-2, n - jz L(cor2) C jumps iff entry n = 5 - -L(am2o2): - mov -8(up,n,8), v0 - mov (up,n,8), %rax - mov %rax, v1 - lea -2(n), i - mul v0 - mov %rax, X0 - MOV( %rdx, X1, 32) - mov (rp,n,8), w0 - xor R32(w1), R32(w1) - xor R32(w2), R32(w2) - mov 8(up,n,8), %rax - jmp L(lo2) - - ALIGNx -L(am2top2): - mul v1 - add w0, w1 - adc %rax, w2 - mov (up,i,8), %rax - MOV( %rdx, w3, 1) - adc $0, w3 - mul v0 - add w1, X1 - mov X1, -8(rp,i,8) - adc %rax, X0 - MOV( %rdx, X1, 2) - adc $0, X1 - mov (up,i,8), %rax - mul v1 - MOV( %rdx, w0, 4) - mov (rp,i,8), w1 - add w1, w2 - adc %rax, w3 - adc $0, w0 - mov 8(up,i,8), %rax - mul v0 - add w2, X0 - adc %rax, X1 - mov X0, (rp,i,8) - MOV( %rdx, X0, 8) - adc $0, X0 - mov 8(up,i,8), %rax - mov 8(rp,i,8), w2 - mul v1 - add w2, w3 - adc %rax, w0 - MOV( %rdx, w1, 16) - adc $0, w1 - mov 16(up,i,8), %rax - mul v0 - add w3, X1 - mov X1, 8(rp,i,8) - adc %rax, X0 - MOV( %rdx, X1, 32) - mov 16(rp,i,8), w3 - adc $0, X1 - mov 16(up,i,8), %rax - mul v1 - add w3, w0 - MOV( %rdx, w2, 64) - adc %rax, w1 - mov 24(up,i,8), %rax - adc $0, w2 -L(lo2): mul v0 - add w0, X0 - mov X0, 16(rp,i,8) - MOV( %rdx, X0, 128) - adc %rax, X1 - mov 24(up,i,8), %rax - mov 24(rp,i,8), w0 - adc $0, X0 - add $4, i - jnc L(am2top2) - - mul v1 - add w0, w1 - adc w2, %rax - adc Z(i,$0), %rdx - add w1, X1 - adc Z(i,$0), X0 - mov X1, I(-8(rp),-8(rp,i,8)) - add X0, %rax - mov %rax, I((rp),(rp,i,8)) - adc Z(i,$0), %rdx - mov %rdx, I(8(rp),8(rp,i,8)) - - lea 16(rp), rp - add $2, n - -L(am2o0): - mov -8(up,n,8), v0 - mov (up,n,8), %rax - mov %rax, v1 - lea 0(n), i - mul v0 - mov %rax, X0 - MOV( %rdx, X1, 2) - xor R32(w0), R32(w0) - mov (rp,n,8), w2 - xor R32(w3), R32(w3) - jmp L(lo0) - - ALIGNx -L(am2top0): - mul v1 - add w0, w1 - adc %rax, w2 - mov (up,i,8), %rax - MOV( %rdx, w3, 1) - adc $0, w3 - mul v0 - add w1, X1 - mov X1, -8(rp,i,8) - adc %rax, X0 - MOV( %rdx, X1, 2) - adc $0, X1 - mov (up,i,8), %rax - mul v1 - MOV( %rdx, w0, 4) - mov (rp,i,8), w1 - add w1, w2 - adc %rax, w3 - adc $0, w0 -L(lo0): mov 8(up,i,8), %rax - mul v0 - add w2, X0 - adc %rax, X1 - mov X0, (rp,i,8) - MOV( %rdx, X0, 8) - adc $0, X0 - mov 8(up,i,8), %rax - mov 8(rp,i,8), w2 - mul v1 - add w2, w3 - adc %rax, w0 - MOV( %rdx, w1, 16) - adc $0, w1 - mov 16(up,i,8), %rax - mul v0 - add w3, X1 - mov X1, 8(rp,i,8) - adc %rax, X0 - MOV( %rdx, X1, 32) - mov 16(rp,i,8), w3 - adc $0, X1 - mov 16(up,i,8), %rax - mul v1 - add w3, w0 - MOV( %rdx, w2, 64) - adc %rax, w1 - mov 24(up,i,8), %rax - adc $0, w2 - mul v0 - add w0, X0 - mov X0, 16(rp,i,8) - MOV( %rdx, X0, 128) - adc %rax, X1 - mov 24(up,i,8), %rax - mov 24(rp,i,8), w0 - adc $0, X0 - add $4, i - jnc L(am2top0) - - mul v1 - add w0, w1 - adc w2, %rax - adc Z(i,$0), %rdx - add w1, X1 - adc Z(i,$0), X0 - mov X1, I(-8(rp),-8(rp,i,8)) - add X0, %rax - mov %rax, I((rp),(rp,i,8)) - adc Z(i,$0), %rdx - mov %rdx, I(8(rp),8(rp,i,8)) - - lea 16(rp), rp - add $2, n - cmp $-2, n - jnz L(am2o2) - -L(cor2):pop n - mov -24(up), v0 - mov %rax, w2 - mov %rdx, w0 - mov -16(up), %rax - mov %rax, v1 - mul v0 - mov %rax, X0 - MOV( %rdx, X1, 32) - mov -8(up), %rax - mul v0 - add w2, X0 - mov X0, -16(rp) - MOV( %rdx, X0, 128) - adc %rax, X1 - mov -8(up), %rax - adc $0, X0 - mul v1 - add w0, X1 - adc $0, X0 - mov X1, -8(rp) - add X0, %rax - mov %rax, (rp) - adc $0, %rdx - mov %rdx, 8(rp) - lea 8(rp), rp - -L(sqr_diag_addlsh1): - mov -8(up,n,8), %rax - shl n - xor R32(%rbx), R32(%rbx) - mul %rax - mov 8(rp,n,8), %r11 - lea (%rdx), %r10 - mov 16(rp,n,8), %r9 - add %r11, %r11 - jmp L(dm) - - ALIGNx -L(dtop):mul %rax - add %r11, %r10 - mov 8(rp,n,8), %r11 - mov %r10, -8(rp,n,8) - adc %r9, %rax - lea (%rdx,%rbx), %r10 - mov 16(rp,n,8), %r9 - adc %r11, %r11 -L(dm): mov %rax, (rp,n,8) - mov (up,n,4), %rax - adc %r9, %r9 - setc R8(%rbx) - add $2, n - js L(dtop) - - mul %rax - add %r11, %r10 - mov %r10, -8(rp) - adc %r9, %rax - lea (%rdx,%rbx), %r10 - mov %rax, (rp) - adc $0, %r10 - mov %r10, 8(rp) - - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - FUNC_EXIT() - ret - - ALIGN(16) -L(small): - mov (up), %rax - cmp $2, n_param - jae L(gt1) -L(n1): - mul %rax - mov %rax, (rp) - mov %rdx, 8(rp) - FUNC_EXIT() - ret - -L(gt1): jne L(gt2) -L(n2): mov %rax, %r8 - mul %rax - mov 8(up), %r11 - mov %rax, (rp) - mov %r11, %rax - mov %rdx, %r9 - mul %rax - mov %rax, %r10 - mov %r11, %rax - mov %rdx, %r11 - mul %r8 - xor %r8, %r8 - add %rax, %r9 - adc %rdx, %r10 - adc %r8, %r11 - add %rax, %r9 - mov %r9, 8(rp) - adc %rdx, %r10 - mov %r10, 16(rp) - adc %r8, %r11 - mov %r11, 24(rp) - FUNC_EXIT() - ret - -L(gt2): -L(n3): mov %rax, %r10 - mul %rax - mov 8(up), %r11 - mov %rax, (rp) - mov %r11, %rax - mov %rdx, 8(rp) - mul %rax - mov 16(up), %rcx - mov %rax, 16(rp) - mov %rcx, %rax - mov %rdx, 24(rp) - mul %rax - mov %rax, 32(rp) - mov %rdx, 40(rp) - - mov %r11, %rax - mul %r10 - mov %rax, %r8 - mov %rcx, %rax - mov %rdx, %r9 - mul %r10 - xor %r10, %r10 - add %rax, %r9 - mov %r11, %rax - mov %r10, %r11 - adc %rdx, %r10 - - mul %rcx - add %rax, %r10 - adc %r11, %rdx - add %r8, %r8 - adc %r9, %r9 - adc %r10, %r10 - adc %rdx, %rdx - adc %r11, %r11 - add %r8, 8(rp) - adc %r9, 16(rp) - adc %r10, 24(rp) - adc %rdx, 32(rp) - adc %r11, 40(rp) - FUNC_EXIT() - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/core2/sublsh1_n.asm b/gmp/mpn/x86_64/core2/sublsh1_n.asm deleted file mode 100644 index 46488fcafe..0000000000 --- a/gmp/mpn/x86_64/core2/sublsh1_n.asm +++ /dev/null @@ -1,47 +0,0 @@ -dnl AMD64 mpn_sublsh1_n optimised for Core 2 and Core iN. - -dnl Contributed to the GNU project by Torbjorn Granlund. - -dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -define(LSH, 1) -define(RSH, 63) - -define(ADDSUB, sub) -define(ADCSBB, sbb) -define(func, mpn_sublsh1_n) - -MULFUNC_PROLOGUE(mpn_sublsh1_n) - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -include_mpn(`x86_64/core2/sublshC_n.asm') diff --git a/gmp/mpn/x86_64/core2/sublsh2_n.asm b/gmp/mpn/x86_64/core2/sublsh2_n.asm deleted file mode 100644 index f3b1e28464..0000000000 --- a/gmp/mpn/x86_64/core2/sublsh2_n.asm +++ /dev/null @@ -1,47 +0,0 @@ -dnl AMD64 mpn_sublsh2_n optimised for Core 2 and Core iN. - -dnl Contributed to the GNU project by Torbjorn Granlund. - -dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -define(LSH, 2) -define(RSH, 62) - -define(ADDSUB, sub) -define(ADCSBB, sbb) -define(func, mpn_sublsh2_n) - -MULFUNC_PROLOGUE(mpn_sublsh2_n) - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -include_mpn(`x86_64/core2/sublshC_n.asm') |