Diffstat (limited to 'gmp/mpn/x86_64/core2')
-rw-r--r--  gmp/mpn/x86_64/core2/aorrlsh1_n.asm     53
-rw-r--r--  gmp/mpn/x86_64/core2/aorrlsh2_n.asm     53
-rw-r--r--  gmp/mpn/x86_64/core2/aorrlsh_n.asm      38
-rw-r--r--  gmp/mpn/x86_64/core2/aors_err1_n.asm    225
-rw-r--r--  gmp/mpn/x86_64/core2/aors_n.asm         130
-rw-r--r--  gmp/mpn/x86_64/core2/aorslsh1_n.asm (renamed from gmp/mpn/x86_64/core2/sublshC_n.asm)  85
-rw-r--r--  gmp/mpn/x86_64/core2/aorsmul_1.asm      193
-rw-r--r--  gmp/mpn/x86_64/core2/copyd.asm          37
-rw-r--r--  gmp/mpn/x86_64/core2/copyi.asm          37
-rw-r--r--  gmp/mpn/x86_64/core2/divrem_1.asm       237
-rw-r--r--  gmp/mpn/x86_64/core2/gcd_1.asm          144
-rw-r--r--  gmp/mpn/x86_64/core2/gmp-mparam.h       275
-rw-r--r--  gmp/mpn/x86_64/core2/lshift.asm         100
-rw-r--r--  gmp/mpn/x86_64/core2/lshiftc.asm        159
-rw-r--r--  gmp/mpn/x86_64/core2/mul_basecase.asm   975
-rw-r--r--  gmp/mpn/x86_64/core2/mullo_basecase.asm 427
-rw-r--r--  gmp/mpn/x86_64/core2/popcount.asm       32
-rw-r--r--  gmp/mpn/x86_64/core2/redc_1.asm         425
-rw-r--r--  gmp/mpn/x86_64/core2/rsh1aors_n.asm     169
-rw-r--r--  gmp/mpn/x86_64/core2/rshift.asm         100
-rw-r--r--  gmp/mpn/x86_64/core2/sec_tabselect.asm  37
-rw-r--r--  gmp/mpn/x86_64/core2/sqr_basecase.asm   984
-rw-r--r--  gmp/mpn/x86_64/core2/sublsh1_n.asm      47
-rw-r--r--  gmp/mpn/x86_64/core2/sublsh2_n.asm      47
24 files changed, 330 insertions, 4679 deletions
diff --git a/gmp/mpn/x86_64/core2/aorrlsh1_n.asm b/gmp/mpn/x86_64/core2/aorrlsh1_n.asm
deleted file mode 100644
index 7066bb4372..0000000000
--- a/gmp/mpn/x86_64/core2/aorrlsh1_n.asm
+++ /dev/null
@@ -1,53 +0,0 @@
-dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
-dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-define(LSH, 1)
-define(RSH, 63)
-
-ifdef(`OPERATION_addlsh1_n', `
- define(ADDSUB, add)
- define(ADCSBB, adc)
- define(func, mpn_addlsh1_n)')
-ifdef(`OPERATION_rsblsh1_n', `
- define(ADDSUB, sub)
- define(ADCSBB, sbb)
- define(func, mpn_rsblsh1_n)')
-
-MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-include_mpn(`x86_64/aorrlshC_n.asm')
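
For reference, the operation these thin wrapper files route to x86_64/aorrlshC_n.asm has a compact contract; the LSH=2 and general-k variants below differ only in the shift counts. A minimal C sketch (illustrative only, not GMP source; the name ref_addlsh1_n is hypothetical, and mp_limb_t, mp_size_t, GMP_LIMB_BITS are taken from a recent gmp.h):

    #include <gmp.h>

    /* rp[] = up[] + (vp[] << 1), n > 0; returns the carry out, in {0,1,2}.
       mpn_rsblsh1_n computes (vp[] << 1) - up[], with sbb for adc.  */
    mp_limb_t
    ref_addlsh1_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
                   mp_size_t n)
    {
      mp_limb_t cy = 0, hi = 0;          /* hi = bit shifted out of vp[i] */
      for (mp_size_t i = 0; i < n; i++)
        {
          mp_limb_t v2 = (vp[i] << 1) | hi;
          hi = vp[i] >> (GMP_LIMB_BITS - 1);
          mp_limb_t s = up[i] + v2;
          mp_limb_t c = s < v2;          /* carry from up + 2*vp */
          rp[i] = s + cy;
          cy = c + (rp[i] < s);          /* fold in the incoming carry */
        }
      return cy + hi;
    }
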
diff --git a/gmp/mpn/x86_64/core2/aorrlsh2_n.asm b/gmp/mpn/x86_64/core2/aorrlsh2_n.asm
deleted file mode 100644
index 5065120857..0000000000
--- a/gmp/mpn/x86_64/core2/aorrlsh2_n.asm
+++ /dev/null
@@ -1,53 +0,0 @@
-dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2)
-dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-define(LSH, 2)
-define(RSH, 62)
-
-ifdef(`OPERATION_addlsh2_n', `
- define(ADDSUB, add)
- define(ADCSBB, adc)
- define(func, mpn_addlsh2_n)')
-ifdef(`OPERATION_rsblsh2_n', `
- define(ADDSUB, sub)
- define(ADCSBB, sbb)
- define(func, mpn_rsblsh2_n)')
-
-MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-include_mpn(`x86_64/aorrlshC_n.asm')
diff --git a/gmp/mpn/x86_64/core2/aorrlsh_n.asm b/gmp/mpn/x86_64/core2/aorrlsh_n.asm
deleted file mode 100644
index 57abf31579..0000000000
--- a/gmp/mpn/x86_64/core2/aorrlsh_n.asm
+++ /dev/null
@@ -1,38 +0,0 @@
-dnl AMD64 mpn_addlsh_n and mpn_rsblsh_n. R = V2^k +- U.
-
-dnl Copyright 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-include_mpn(`x86_64/coreinhm/aorrlsh_n.asm')
diff --git a/gmp/mpn/x86_64/core2/aors_err1_n.asm b/gmp/mpn/x86_64/core2/aors_err1_n.asm
deleted file mode 100644
index 3f875aefa4..0000000000
--- a/gmp/mpn/x86_64/core2/aors_err1_n.asm
+++ /dev/null
@@ -1,225 +0,0 @@
-dnl Core 2 mpn_add_err1_n, mpn_sub_err1_n
-
-dnl Contributed by David Harvey.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 ?
-C Intel P4 ?
-C Intel core2 4.14
-C Intel corei ?
-C Intel atom ?
-C VIA nano ?
-
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp', `%rdx')
-define(`ep', `%rcx')
-define(`yp', `%r8')
-define(`n', `%r9')
-define(`cy_param', `8(%rsp)')
-
-define(`el', `%rbx')
-define(`eh', `%rbp')
-define(`t0', `%r10')
-define(`t1', `%r11')
-define(`t2', `%r12')
-define(`t3', `%r13')
-define(`w0', `%r14')
-define(`w1', `%r15')
-
-ifdef(`OPERATION_add_err1_n', `
- define(ADCSBB, adc)
- define(func, mpn_add_err1_n)')
-ifdef(`OPERATION_sub_err1_n', `
- define(ADCSBB, sbb)
- define(func, mpn_sub_err1_n)')
-
-MULFUNC_PROLOGUE(mpn_add_err1_n mpn_sub_err1_n)
-
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(func)
- mov cy_param, %rax
-
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
-
- lea (up,n,8), up
- lea (vp,n,8), vp
- lea (rp,n,8), rp
-
- mov R32(n), R32(%r10)
- and $3, R32(%r10)
- jz L(0mod4)
- cmp $2, R32(%r10)
- jc L(1mod4)
- jz L(2mod4)
-L(3mod4):
- xor R32(el), R32(el)
- xor R32(eh), R32(eh)
- xor R32(t0), R32(t0)
- xor R32(t1), R32(t1)
- lea -24(yp,n,8), yp
- neg n
-
- shr $1, %al C restore carry
- mov (up,n,8), w0
- mov 8(up,n,8), w1
- ADCSBB (vp,n,8), w0
- mov w0, (rp,n,8)
- cmovc 16(yp), el
- ADCSBB 8(vp,n,8), w1
- mov w1, 8(rp,n,8)
- cmovc 8(yp), t0
- mov 16(up,n,8), w0
- ADCSBB 16(vp,n,8), w0
- mov w0, 16(rp,n,8)
- cmovc (yp), t1
- setc %al C save carry
- add t0, el
- adc $0, eh
- add t1, el
- adc $0, eh
-
- add $3, n
- jnz L(loop)
- jmp L(end)
-
- ALIGN(16)
-L(0mod4):
- xor R32(el), R32(el)
- xor R32(eh), R32(eh)
- lea (yp,n,8), yp
- neg n
- jmp L(loop)
-
- ALIGN(16)
-L(1mod4):
- xor R32(el), R32(el)
- xor R32(eh), R32(eh)
- lea -8(yp,n,8), yp
- neg n
-
- shr $1, %al C restore carry
- mov (up,n,8), w0
- ADCSBB (vp,n,8), w0
- mov w0, (rp,n,8)
- cmovc (yp), el
- setc %al C save carry
-
- add $1, n
- jnz L(loop)
- jmp L(end)
-
- ALIGN(16)
-L(2mod4):
- xor R32(el), R32(el)
- xor R32(eh), R32(eh)
- xor R32(t0), R32(t0)
- lea -16(yp,n,8), yp
- neg n
-
- shr $1, %al C restore carry
- mov (up,n,8), w0
- mov 8(up,n,8), w1
- ADCSBB (vp,n,8), w0
- mov w0, (rp,n,8)
- cmovc 8(yp), el
- ADCSBB 8(vp,n,8), w1
- mov w1, 8(rp,n,8)
- cmovc (yp), t0
- setc %al C save carry
- add t0, el
- adc $0, eh
-
- add $2, n
- jnz L(loop)
- jmp L(end)
-
- ALIGN(32)
-L(loop):
- mov (up,n,8), w0
- shr $1, %al C restore carry
- mov -8(yp), t0
- mov $0, R32(t3)
- ADCSBB (vp,n,8), w0
- cmovnc t3, t0
- mov w0, (rp,n,8)
- mov 8(up,n,8), w1
- mov 16(up,n,8), w0
- ADCSBB 8(vp,n,8), w1
- mov -16(yp), t1
- cmovnc t3, t1
- mov -24(yp), t2
- mov w1, 8(rp,n,8)
- ADCSBB 16(vp,n,8), w0
- cmovnc t3, t2
- mov 24(up,n,8), w1
- ADCSBB 24(vp,n,8), w1
- cmovc -32(yp), t3
- setc %al C save carry
- add t0, el
- adc $0, eh
- add t1, el
- adc $0, eh
- add t2, el
- adc $0, eh
- lea -32(yp), yp
- mov w0, 16(rp,n,8)
- add t3, el
- adc $0, eh
- add $4, n
- mov w1, -8(rp,n,8)
- jnz L(loop)
-
-L(end):
- mov el, (ep)
- mov eh, 8(ep)
-
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- ret
-EPILOGUE()
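
The error-term convention of this deleted routine is easier to see in C than in its cmovc chain: every limb position that produces a carry contributes the yp word taken from the opposite end of the array, accumulated into a two-limb sum. A sketch of the contract (illustrative, not GMP source; ref_add_err1_n is a hypothetical name):

    #include <gmp.h>

    /* rp[] = up[] + vp[] + cy, and ep[0..1] accumulates yp[n-1-i] for
       every limb position i whose addition carried out.  Returns the
       final carry.  mpn_sub_err1_n is the same with borrows.  */
    mp_limb_t
    ref_add_err1_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
                    mp_limb_t *ep, const mp_limb_t *yp, mp_size_t n,
                    mp_limb_t cy)
    {
      mp_limb_t el = 0, eh = 0;
      for (mp_size_t i = 0; i < n; i++)
        {
          mp_limb_t s = up[i] + cy;
          mp_limb_t c = s < cy;
          rp[i] = s + vp[i];
          c += rp[i] < s;            /* c in {0,1}: carry out of limb i */
          if (c)                     /* the asm selects this with cmovc */
            {
              el += yp[n - 1 - i];
              eh += el < yp[n - 1 - i];
            }
          cy = c;
        }
      ep[0] = el;
      ep[1] = eh;
      return cy;
    }
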
diff --git a/gmp/mpn/x86_64/core2/aors_n.asm b/gmp/mpn/x86_64/core2/aors_n.asm
index 74a1bce48a..d26af866f9 100644
--- a/gmp/mpn/x86_64/core2/aors_n.asm
+++ b/gmp/mpn/x86_64/core2/aors_n.asm
@@ -1,45 +1,30 @@
-dnl Intel mpn_add_n/mpn_sub_n optimised for Conroe, Nehalem.
+dnl Intel P6-15 mpn_add_n/mpn_sub_n -- mpn add or subtract.
-dnl Copyright 2006, 2007, 2011-2013 Free Software Foundation, Inc.
+dnl Copyright 2006, 2007 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 2
-C AMD K10 2
-C Intel P4 10
-C Intel core2 2
-C Intel NHM 2
-C Intel SBR 2
-C Intel atom 9
-C VIA nano 3
+C K8,K9: 2.25
+C K10: 2
+C P4: 10
+C P6-15: 2.05
C INPUT PARAMETERS
define(`rp', `%rdi')
@@ -59,83 +44,80 @@ ifdef(`OPERATION_sub_n', `
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
ASM_START()
+
TEXT
ALIGN(16)
+
+PROLOGUE(func_nc)
+ jmp L(start)
+EPILOGUE()
+
PROLOGUE(func)
- FUNC_ENTRY(4)
xor %r8, %r8
L(start):
mov (up), %r10
mov (vp), %r11
- lea (up,n,8), up
- lea (vp,n,8), vp
- lea (rp,n,8), rp
- mov R32(n), R32(%rax)
+ lea -8(up,n,8), up
+ lea -8(vp,n,8), vp
+ lea -16(rp,n,8), rp
+ mov %ecx, %eax
neg n
- and $3, R32(%rax)
+ and $3, %eax
je L(b00)
- add %rax, n C clear low rcx bits for jrcxz
- cmp $2, R32(%rax)
+ add %rax, n C clear low rcx bits for jrcxz
+ cmp $2, %eax
jl L(b01)
je L(b10)
-L(b11): neg %r8 C set cy
+L(b11): shr %r8 C set cy
jmp L(e11)
-L(b00): neg %r8 C set cy
+L(b00): shr %r8 C set cy
mov %r10, %r8
mov %r11, %r9
lea 4(n), n
jmp L(e00)
- nop
- nop
- nop
-L(b01): neg %r8 C set cy
- jmp L(top)
+L(b01): shr %r8 C set cy
+ jmp L(e01)
-L(b10): neg %r8 C set cy
+L(b10): shr %r8 C set cy
mov %r10, %r8
mov %r11, %r9
jmp L(e10)
L(end): ADCSBB %r11, %r10
- mov %r10, -8(rp)
- mov R32(%rcx), R32(%rax) C clear eax, ecx contains 0
- adc R32(%rax), R32(%rax)
- FUNC_EXIT()
+ mov %r10, 8(rp)
+ mov %ecx, %eax C clear eax, ecx contains 0
+ adc %eax, %eax
ret
ALIGN(16)
-L(top): jrcxz L(end)
- mov (up,n,8), %r8
- mov (vp,n,8), %r9
- lea 4(n), n
- ADCSBB %r11, %r10
- mov %r10, -40(rp,n,8)
-L(e00): mov -24(up,n,8), %r10
- mov -24(vp,n,8), %r11
- ADCSBB %r9, %r8
- mov %r8, -32(rp,n,8)
-L(e11): mov -16(up,n,8), %r8
- mov -16(vp,n,8), %r9
+L(top):
+ mov -24(up,n,8), %r8
+ mov -24(vp,n,8), %r9
ADCSBB %r11, %r10
mov %r10, -24(rp,n,8)
-L(e10): mov -8(up,n,8), %r10
- mov -8(vp,n,8), %r11
+L(e00):
+ mov -16(up,n,8), %r10
+ mov -16(vp,n,8), %r11
ADCSBB %r9, %r8
mov %r8, -16(rp,n,8)
+L(e11):
+ mov -8(up,n,8), %r8
+ mov -8(vp,n,8), %r9
+ ADCSBB %r11, %r10
+ mov %r10, -8(rp,n,8)
+L(e10):
+ mov (up,n,8), %r10
+ mov (vp,n,8), %r11
+ ADCSBB %r9, %r8
+ mov %r8, (rp,n,8)
+L(e01):
+ jrcxz L(end)
+ lea 4(n), n
jmp L(top)
-EPILOGUE()
-PROLOGUE(func_nc)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
- jmp L(start)
EPILOGUE()
-
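
The loop in this file is a four-way unrolled adc chain: lea updates the index and jrcxz tests it, neither touching the flags, so CF survives from one iteration to the next, and shr %r8 converts the carry-in parameter into CF up front. The underlying contract in portable C (a sketch, not GMP source; ref_add_nc is a hypothetical name):

    #include <gmp.h>

    /* rp[] = up[] + vp[] + cy, cy in {0,1}; returns the carry out.
       mpn_sub_nc is identical with borrows in place of carries.  */
    mp_limb_t
    ref_add_nc (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
                mp_size_t n, mp_limb_t cy)
    {
      for (mp_size_t i = 0; i < n; i++)
        {
          mp_limb_t s = up[i] + cy;
          cy = s < cy;
          rp[i] = s + vp[i];
          cy += rp[i] < s;           /* cy stays in {0,1} */
        }
      return cy;
    }
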
diff --git a/gmp/mpn/x86_64/core2/sublshC_n.asm b/gmp/mpn/x86_64/core2/aorslsh1_n.asm
index 5acc46b032..18db7c96f8 100644
--- a/gmp/mpn/x86_64/core2/sublshC_n.asm
+++ b/gmp/mpn/x86_64/core2/aorslsh1_n.asm
@@ -1,45 +1,29 @@
-dnl AMD64 mpn_sublshC_n -- rp[] = up[] - (vp[] << 1), optimised for Core 2 and
-dnl Core iN.
+dnl x86-64 mpn_addlsh1_n and mpn_sublsh1_n, optimized for "Core" 2.
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+dnl Copyright 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
C cycles/limb
-C AMD K8,K9 4.25
-C AMD K10 ?
-C Intel P4 ?
-C Intel core2 3
-C Intel NHM 3.1
-C Intel SBR 2.47
-C Intel atom ?
-C VIA nano ?
+C K8,K9: 4.25
+C K10: ?
+C P4: ?
+C P6-15: 3
C INPUT PARAMETERS
define(`rp',`%rdi')
@@ -47,11 +31,21 @@ define(`up',`%rsi')
define(`vp',`%rdx')
define(`n', `%rcx')
+ifdef(`OPERATION_addlsh1_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func, mpn_addlsh1_n)')
+ifdef(`OPERATION_sublsh1_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func, mpn_sublsh1_n)')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
+
ASM_START()
TEXT
ALIGN(8)
PROLOGUE(func)
- FUNC_ENTRY(4)
push %rbx
push %r12
@@ -64,7 +58,7 @@ PROLOGUE(func)
xor R32(%r11), R32(%r11)
mov -24(vp,n,8), %r8 C do first limb early
- shrd $RSH, %r8, %r11
+ shrd $63, %r8, %r11
and $3, R32(%rax)
je L(b0)
@@ -73,9 +67,9 @@ PROLOGUE(func)
je L(b2)
L(b3): mov -16(vp,n,8), %r9
- shrd $RSH, %r9, %r8
+ shrd $63, %r9, %r8
mov -8(vp,n,8), %r10
- shrd $RSH, %r10, %r9
+ shrd $63, %r10, %r9
mov -24(up,n,8), %r12
ADDSUB %r11, %r12
mov %r12, -24(rp,n,8)
@@ -101,7 +95,7 @@ L(b1): mov -24(up,n,8), %r12
jmp L(end)
L(b2): mov -16(vp,n,8), %r9
- shrd $RSH, %r9, %r8
+ shrd $63, %r9, %r8
mov -24(up,n,8), %r12
ADDSUB %r11, %r12
mov %r12, -24(rp,n,8)
@@ -116,13 +110,13 @@ L(b2): mov -16(vp,n,8), %r9
ALIGN(16)
L(top): mov -24(vp,n,8), %r8
- shrd $RSH, %r8, %r11
+ shrd $63, %r8, %r11
L(b0): mov -16(vp,n,8), %r9
- shrd $RSH, %r9, %r8
+ shrd $63, %r9, %r8
mov -8(vp,n,8), %r10
- shrd $RSH, %r10, %r9
+ shrd $63, %r10, %r9
mov (vp,n,8), %rbx
- shrd $RSH, %rbx, %r10
+ shrd $63, %rbx, %r10
add R32(%rax), R32(%rax) C restore cy
@@ -148,11 +142,10 @@ L(b0): mov -16(vp,n,8), %r9
add $4, n
js L(top)
-L(end): shr $RSH, %r11
+L(end): add %r11, %r11
pop %r12
pop %rbx
- sub R32(%r11), R32(%rax)
+ sbb $0, R32(%rax)
neg R32(%rax)
- FUNC_EXIT()
ret
EPILOGUE()
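
The shrd instructions in this file build the doubled vp stream on the fly: each one concatenates the next limb with the previous one and shifts, yielding (vp[i] << 1) | (vp[i-1] >> 63) with no separate shift pass. The sublsh1 contract in portable C (a sketch, not GMP source; ref_sublsh1_n is a hypothetical name):

    #include <gmp.h>

    /* rp[] = up[] - (vp[] << 1), n > 0; returns the borrow, in {0,1,2}.
       mpn_addlsh1_n is the same with adc in place of sbb.  */
    mp_limb_t
    ref_sublsh1_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
                   mp_size_t n)
    {
      mp_limb_t bw = 0, hi = 0;          /* hi = bit shifted out of vp[i] */
      for (mp_size_t i = 0; i < n; i++)
        {
          mp_limb_t v2 = (vp[i] << 1) | hi;
          hi = vp[i] >> (GMP_LIMB_BITS - 1);
          mp_limb_t d = up[i] - v2;
          mp_limb_t b = up[i] < v2;
          rp[i] = d - bw;
          bw = b + (d < bw);
        }
      return bw + hi;
    }
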
diff --git a/gmp/mpn/x86_64/core2/aorsmul_1.asm b/gmp/mpn/x86_64/core2/aorsmul_1.asm
index 6b313dd836..1d05b30b59 100644
--- a/gmp/mpn/x86_64/core2/aorsmul_1.asm
+++ b/gmp/mpn/x86_64/core2/aorsmul_1.asm
@@ -1,46 +1,29 @@
dnl x86-64 mpn_addmul_1 and mpn_submul_1, optimized for "Core 2".
-dnl Copyright 2003-2005, 2007-2009, 2011, 2012 Free Software Foundation, Inc.
+dnl Copyright 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 4
-C AMD K10 4
-C AMD bd1 5.1
-C AMD bobcat
-C Intel P4 ?
-C Intel core2 4.3-4.5 (fluctuating)
-C Intel NHM 5.0
-C Intel SBR 4.1
-C Intel atom ?
-C VIA nano 5.25
+C K8,K9: 4
+C K10: 4
+C P4: ?
+C P6-15: 4.3-4.7 (fluctuating)
C INPUT PARAMETERS
define(`rp', `%rdi')
@@ -50,129 +33,111 @@ define(`v0', `%rcx')
ifdef(`OPERATION_addmul_1',`
define(`ADDSUB', `add')
- define(`func', `mpn_addmul_1')
- define(`func_1c', `mpn_addmul_1c')
+ define(`func', `mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
define(`ADDSUB', `sub')
- define(`func', `mpn_submul_1')
- define(`func_1c', `mpn_submul_1c')
+ define(`func', `mpn_submul_1')
')
-MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
- C For DOS, on the stack we have four saved registers, return address,
- C space for four register arguments, and finally the carry input.
-
-IFDOS(` define(`carry_in', `72(%rsp)')') dnl
-IFSTD(` define(`carry_in', `%r8')') dnl
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
ASM_START()
TEXT
ALIGN(16)
-PROLOGUE(func_1c)
- FUNC_ENTRY(4)
- push %rbx
- push %rbp
- lea (%rdx), %rbx
- neg %rbx
-
- mov (up), %rax
- mov (rp), %r10
-
- lea -16(rp,%rdx,8), rp
- lea (up,%rdx,8), up
- mul %rcx
- add carry_in, %rax
- adc $0, %rdx
- jmp L(start_nc)
-EPILOGUE()
-
- ALIGN(16)
PROLOGUE(func)
- FUNC_ENTRY(4)
- push %rbx
- push %rbp
- lea (%rdx), %rbx
- neg %rbx
+ push %r15
+ push %r12
+ push %r13
+ lea (%rdx), %r15
+ neg %r15
mov (up), %rax
- mov (rp), %r10
- lea -16(rp,%rdx,8), rp
+ bt $0, %r15
+ jc L(odd)
+
+ lea (rp,%rdx,8), rp
lea (up,%rdx,8), up
mul %rcx
-L(start_nc):
- bt $0, R32(%rbx)
- jc L(odd)
-
lea (%rax), %r11
- mov 8(up,%rbx,8), %rax
- lea (%rdx), %rbp
- mul %rcx
- add $2, %rbx
+ mov 8(up,%r15,8), %rax
+ mov (rp,%r15,8), %r13
+ lea (%rdx), %r12
+
+ add $2, %r15
jns L(n2)
+ mul %rcx
lea (%rax), %r8
- mov (up,%rbx,8), %rax
+ mov (up,%r15,8), %rax
+ mov -8(rp,%r15,8), %r10
lea (%rdx), %r9
- jmp L(mid)
+ jmp L(m)
-L(odd): add $1, %rbx
+L(odd): lea (rp,%rdx,8), rp
+ lea (up,%rdx,8), up
+ mul %rcx
+ add $1, %r15
jns L(n1)
- lea (%rax), %r8
- mov (up,%rbx,8), %rax
+L(gt1): lea (%rax), %r8
+ mov (up,%r15,8), %rax
+ mov -8(rp,%r15,8), %r10
lea (%rdx), %r9
mul %rcx
lea (%rax), %r11
- mov 8(up,%rbx,8), %rax
- lea (%rdx), %rbp
- jmp L(e)
+ mov 8(up,%r15,8), %rax
+ mov (rp,%r15,8), %r13
+ lea (%rdx), %r12
+ add $2, %r15
+ jns L(end)
ALIGN(16)
L(top): mul %rcx
ADDSUB %r8, %r10
lea (%rax), %r8
- mov (up,%rbx,8), %rax
+ mov 0(up,%r15,8), %rax
adc %r9, %r11
- mov %r10, -8(rp,%rbx,8)
- mov (rp,%rbx,8), %r10
+ mov %r10, -24(rp,%r15,8)
+ mov -8(rp,%r15,8), %r10
lea (%rdx), %r9
- adc $0, %rbp
-L(mid): mul %rcx
- ADDSUB %r11, %r10
+ adc $0, %r12
+L(m): mul %rcx
+ ADDSUB %r11, %r13
lea (%rax), %r11
- mov 8(up,%rbx,8), %rax
- adc %rbp, %r8
- mov %r10, (rp,%rbx,8)
- mov 8(rp,%rbx,8), %r10
- lea (%rdx), %rbp
+ mov 8(up,%r15,8), %rax
+ adc %r12, %r8
+ mov %r13, -16(rp,%r15,8)
+ mov 0(rp,%r15,8), %r13
+ lea (%rdx), %r12
adc $0, %r9
-L(e): add $2, %rbx
+
+ add $2, %r15
js L(top)
- mul %rcx
+L(end): mul %rcx
ADDSUB %r8, %r10
adc %r9, %r11
- mov %r10, -8(rp)
- adc $0, %rbp
-L(n2): mov (rp), %r10
- ADDSUB %r11, %r10
- adc %rbp, %rax
- mov %r10, (rp)
+ mov %r10, -24(rp,%r15,8)
+ mov -8(rp,%r15,8), %r10
+ adc $0, %r12
+L(r): ADDSUB %r11, %r13
+ adc %r12, %rax
+ mov %r13, -16(rp,%r15,8)
adc $0, %rdx
-L(n1): mov 8(rp), %r10
- ADDSUB %rax, %r10
- mov %r10, 8(rp)
- mov R32(%rbx), R32(%rax) C zero rax
+L(x): ADDSUB %rax, %r10
+ mov %r10, -8(rp,%r15,8)
+ mov $0, %eax
adc %rdx, %rax
- pop %rbp
- pop %rbx
- FUNC_EXIT()
+L(ret): pop %r13
+ pop %r12
+ pop %r15
ret
+L(n2): mul %rcx
+ mov -8(rp,%r15,8), %r10
+ jmp L(r)
+L(n1): mov -8(rp,%r15,8), %r10
+ jmp L(x)
EPILOGUE()
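
Both variants of this loop implement the same single-limb multiply-accumulate. A portable model (a sketch, not GMP source; it assumes 64-bit limbs and a compiler with unsigned __int128, standing in for mul's 128-bit rdx:rax product; ref_addmul_1 is a hypothetical name):

    #include <gmp.h>

    /* rp[] += up[] * v0 (mpn_submul_1: rp[] -= up[] * v0); returns the
       high limb that does not fit in rp[].  */
    mp_limb_t
    ref_addmul_1 (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
                  mp_limb_t v0)
    {
      mp_limb_t cy = 0;
      for (mp_size_t i = 0; i < n; i++)
        {
          unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
          rp[i] = (mp_limb_t) t;         /* low limb back to rp */
          cy = (mp_limb_t) (t >> 64);    /* high limb becomes next carry */
        }
      return cy;
    }
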
diff --git a/gmp/mpn/x86_64/core2/copyd.asm b/gmp/mpn/x86_64/core2/copyd.asm
deleted file mode 100644
index f0dc54a55e..0000000000
--- a/gmp/mpn/x86_64/core2/copyd.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_copyd optimised for Intel Sandy Bridge.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_copyd)
-include_mpn(`x86_64/fastsse/copyd-palignr.asm')
diff --git a/gmp/mpn/x86_64/core2/copyi.asm b/gmp/mpn/x86_64/core2/copyi.asm
deleted file mode 100644
index 9c26e00c52..0000000000
--- a/gmp/mpn/x86_64/core2/copyi.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_copyi optimised for Intel Sandy Bridge.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_copyi)
-include_mpn(`x86_64/fastsse/copyi-palignr.asm')
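
Both copy wrappers pointed at the SSE palignr code; the direction matters for overlap: copyi runs low-to-high (safe when rp <= up), copyd high-to-low (safe when rp >= up). In C (a sketch, not GMP source; ref_copyd is a hypothetical name):

    #include <gmp.h>

    /* Decreasing-order copy: safe when rp >= up even if ranges overlap. */
    void
    ref_copyd (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
    {
      for (mp_size_t i = n - 1; i >= 0; i--)
        rp[i] = up[i];
    }
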
diff --git a/gmp/mpn/x86_64/core2/divrem_1.asm b/gmp/mpn/x86_64/core2/divrem_1.asm
deleted file mode 100644
index 623bea386c..0000000000
--- a/gmp/mpn/x86_64/core2/divrem_1.asm
+++ /dev/null
@@ -1,237 +0,0 @@
-dnl x86-64 mpn_divrem_1 -- mpn by limb division.
-
-dnl Copyright 2004, 2005, 2007-2010, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C norm unorm frac
-C AMD K8,K9 15 15 12
-C AMD K10 15 15 12
-C Intel P4 44 44 43
-C Intel core2 24 24 19.5
-C Intel corei 19 19 18
-C Intel atom 51 51 36
-C VIA nano 46 44 22.5
-
-C mp_limb_t
-C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
-C mp_srcptr np, mp_size_t nn, mp_limb_t d)
-
-C mp_limb_t
-C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
-C mp_srcptr np, mp_size_t nn, mp_limb_t d,
-C mp_limb_t dinv, int cnt)
-
-C INPUT PARAMETERS
-define(`qp', `%rdi')
-define(`fn_param', `%rsi')
-define(`up_param', `%rdx')
-define(`un_param', `%rcx')
-define(`d', `%r8')
-define(`dinv', `%r9') C only for mpn_preinv_divrem_1
-C shift passed on stack C only for mpn_preinv_divrem_1
-
-define(`cnt', `%rcx')
-define(`up', `%rsi')
-define(`fn', `%r12')
-define(`un', `%rbx')
-
-
-C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
-C cnt qp d dinv
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-IFSTD(`define(`CNTOFF', `40($1)')')
-IFDOS(`define(`CNTOFF', `104($1)')')
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_preinv_divrem_1)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
-IFDOS(` mov 64(%rsp), %r9 ')
- xor R32(%rax), R32(%rax)
- push %r13
- push %r12
- push %rbp
- push %rbx
-
- mov fn_param, fn
- mov un_param, un
- add fn_param, un_param
- mov up_param, up
-
- lea -8(qp,un_param,8), qp
-
- mov CNTOFF(%rsp), R8(cnt)
- shl R8(cnt), d
- jmp L(ent)
-EPILOGUE()
-
- ALIGN(16)
-PROLOGUE(mpn_divrem_1)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
- xor R32(%rax), R32(%rax)
- push %r13
- push %r12
- push %rbp
- push %rbx
-
- mov fn_param, fn
- mov un_param, un
- add fn_param, un_param
- mov up_param, up
- je L(ret)
-
- lea -8(qp,un_param,8), qp
- xor R32(%rbp), R32(%rbp)
-
-L(unnormalized):
- test un, un
- je L(44)
- mov -8(up,un,8), %rax
- cmp d, %rax
- jae L(44)
- mov %rbp, (qp)
- mov %rax, %rbp
- lea -8(qp), qp
- je L(ret)
- dec un
-L(44):
- bsr d, %rcx
- not R32(%rcx)
- sal R8(%rcx), d
- sal R8(%rcx), %rbp
-
- push %rcx
-IFSTD(` push %rdi ')
-IFSTD(` push %rsi ')
- push %r8
-IFSTD(` mov d, %rdi ')
-IFDOS(` mov d, %rcx ')
- CALL( mpn_invert_limb)
- pop %r8
-IFSTD(` pop %rsi ')
-IFSTD(` pop %rdi ')
- pop %rcx
-
- mov %rax, dinv
- mov %rbp, %rax
- test un, un
- je L(frac)
-L(ent): mov -8(up,un,8), %rbp
- shr R8(%rcx), %rax
- shld R8(%rcx), %rbp, %rax
- sub $2, un
- js L(end)
-
- ALIGN(16)
-L(top): lea 1(%rax), %r11
- mul dinv
- mov (up,un,8), %r10
- shld R8(%rcx), %r10, %rbp
- mov %rbp, %r13
- add %rax, %r13
- adc %r11, %rdx
- mov %rdx, %r11
- imul d, %rdx
- sub %rdx, %rbp
- lea (d,%rbp), %rax
- sub $8, qp
- cmp %r13, %rbp
- cmovc %rbp, %rax
- adc $-1, %r11
- cmp d, %rax
- jae L(ufx)
-L(uok): dec un
- mov %r11, 8(qp)
- mov %r10, %rbp
- jns L(top)
-
-L(end): lea 1(%rax), %r11
- sal R8(%rcx), %rbp
- mul dinv
- add %rbp, %rax
- adc %r11, %rdx
- mov %rax, %r11
- mov %rdx, %r13
- imul d, %rdx
- sub %rdx, %rbp
- mov d, %rax
- add %rbp, %rax
- cmp %r11, %rbp
- cmovc %rbp, %rax
- adc $-1, %r13
- cmp d, %rax
- jae L(efx)
-L(eok): mov %r13, (qp)
- sub $8, qp
- jmp L(frac)
-
-L(ufx): sub d, %rax
- inc %r11
- jmp L(uok)
-L(efx): sub d, %rax
- inc %r13
- jmp L(eok)
-
-L(frac):mov d, %rbp
- neg %rbp
- jmp L(fent)
-
- ALIGN(16) C K8-K10 P6-CNR P6-NHM P4
-L(ftop):mul dinv C 0,12 0,17 0,17
- add %r11, %rdx C 5 8 10
- mov %rax, %r11 C 4 8 3
- mov %rdx, %r13 C 6 9 11
- imul %rbp, %rdx C 6 9 11
- mov d, %rax C
- add %rdx, %rax C 10 14 14
- cmp %r11, %rdx C 10 14 14
- cmovc %rdx, %rax C 11 15 15
- adc $-1, %r13 C
- mov %r13, (qp) C
- sub $8, qp C
-L(fent):lea 1(%rax), %r11 C
- dec fn C
- jns L(ftop) C
-
- shr R8(%rcx), %rax
-L(ret): pop %rbx
- pop %rbp
- pop %r12
- pop %r13
- FUNC_EXIT()
- ret
-EPILOGUE()
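
What this deleted file computes, stripped of the mpn_invert_limb reciprocal trick, is schoolbook division developing fn extra fraction limbs. A naive model (a sketch, not GMP source; it assumes 64-bit limbs and unsigned __int128, and ref_divrem_1 is a hypothetical name; the real code replaces every / and % below with a multiply by the precomputed inverse):

    #include <gmp.h>

    /* Divide {np,nn} by d, storing nn integer quotient limbs at qp[fn..]
       and fn fraction limbs at qp[0..fn-1]; returns the remainder r, so
       that N * 2^(64*fn) == Q * d + r.  */
    mp_limb_t
    ref_divrem_1 (mp_limb_t *qp, mp_size_t fn,
                  const mp_limb_t *np, mp_size_t nn, mp_limb_t d)
    {
      unsigned __int128 r = 0;                   /* always < d */
      for (mp_size_t i = nn - 1; i >= 0; i--)
        {
          unsigned __int128 x = (r << 64) | np[i];
          qp[fn + i] = (mp_limb_t) (x / d);
          r = x % d;
        }
      for (mp_size_t i = fn - 1; i >= 0; i--)    /* fraction limbs */
        {
          unsigned __int128 x = r << 64;
          qp[i] = (mp_limb_t) (x / d);
          r = x % d;
        }
      return (mp_limb_t) r;
    }
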
diff --git a/gmp/mpn/x86_64/core2/gcd_1.asm b/gmp/mpn/x86_64/core2/gcd_1.asm
deleted file mode 100644
index e0cab9b4e4..0000000000
--- a/gmp/mpn/x86_64/core2/gcd_1.asm
+++ /dev/null
@@ -1,144 +0,0 @@
-dnl AMD64 mpn_gcd_1 optimised for Intel C2, NHM, SBR and AMD K10, BD.
-
-dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn
-dnl Granlund.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/bit (approx)
-C AMD K8,K9 8.50
-C AMD K10 4.30
-C AMD bd1 5.00
-C AMD bobcat 10.0
-C Intel P4 18.6
-C Intel core2 3.83
-C Intel NHM 5.17
-C Intel SBR 4.69
-C Intel atom 17.0
-C VIA nano 5.44
-C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
-
-C TODO
-C * Optimise inner-loop for specific CPUs.
-C * Use DIV for 1-by-1 reductions, at least for some CPUs.
-
-C Threshold of when to call bmod when U is one limb. Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 6)
-
-C INPUT PARAMETERS
-define(`up', `%rdi')
-define(`n', `%rsi')
-define(`v0', `%rdx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-IFDOS(`define(`STACK_ALLOC', 40)')
-IFSTD(`define(`STACK_ALLOC', 8)')
-
-C Undo some configure cleverness.
-C The problem is that C only defines the '1c' variant, and that configure
-C therefore considers modexact_1c to be the base function. It then adds a
-C special fat rule for mpn_modexact_1_odd, messing up things when a cpudep
-C gcd_1 exists without a corresponding cpudep mode1o.
-ifdef(`WANT_FAT_BINARY', `
- define(`mpn_modexact_1_odd', `MPN_PREFIX`modexact_1_odd_x86_64'')')
-
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_gcd_1)
- FUNC_ENTRY(3)
- mov (up), %rax C U low limb
- or v0, %rax
- bsf %rax, %rax C min(ctz(u0),ctz(v0))
-
- bsf v0, %rcx
- shr R8(%rcx), v0
-
- push %rax C preserve common twos over call
- push v0 C preserve v0 argument over call
- sub $STACK_ALLOC, %rsp C maintain ABI required rsp alignment
-
- cmp $1, n
- jnz L(reduce_nby1)
-
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
- mov (up), %r8
- mov %r8, %rax
- shr $BMOD_THRES_LOG2, %r8
- cmp %r8, v0
- ja L(reduced)
- jmp L(bmod)
-
-L(reduce_nby1):
- cmp $BMOD_1_TO_MOD_1_THRESHOLD, n
- jl L(bmod)
-IFDOS(` mov %rdx, %r8 ')
-IFDOS(` mov %rsi, %rdx ')
-IFDOS(` mov %rdi, %rcx ')
- CALL( mpn_mod_1)
- jmp L(reduced)
-L(bmod):
-IFDOS(` mov %rdx, %r8 ')
-IFDOS(` mov %rsi, %rdx ')
-IFDOS(` mov %rdi, %rcx ')
- CALL( mpn_modexact_1_odd)
-L(reduced):
-
- add $STACK_ALLOC, %rsp
- pop %rdx
-
- bsf %rax, %rcx
-C test %rax, %rax C FIXME: does this lower latency?
- jnz L(mid)
- jmp L(end)
-
- ALIGN(16) C K10 BD C2 NHM SBR
-L(top): cmovc %r10, %rax C if x-y < 0 0,3 0,3 0,6 0,5 0,5
- cmovc %r9, %rdx C use x,y-x 0,3 0,3 2,8 1,7 1,7
-L(mid): shr R8(%rcx), %rax C 1,7 1,6 2,8 2,8 2,8
- mov %rdx, %r10 C 1 1 4 3 3
- sub %rax, %r10 C 2 2 5 4 4
- bsf %r10, %rcx C 3 3 6 5 5
- mov %rax, %r9 C 2 2 3 3 4
- sub %rdx, %rax C 2 2 4 3 4
- jnz L(top) C
-
-L(end): pop %rcx
- mov %rdx, %rax
- shl R8(%rcx), %rax
- FUNC_EXIT()
- ret
-EPILOGUE()
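
The deleted gcd_1 works in two phases: it reduces the n-limb U to a single limb with mpn_mod_1 or mpn_modexact_1_odd (chosen by BMOD_1_TO_MOD_1_THRESHOLD), then runs a branch-light binary GCD whose inner loop is the cmovc/bsf block above. The single-limb phase, modeled in C (a sketch, not GMP source; ref_gcd_1 is a hypothetical name and __builtin_ctzll assumes GCC or Clang):

    #include <gmp.h>

    /* Binary GCD of two nonzero limbs, using ctz (bsf) to strip twos. */
    mp_limb_t
    ref_gcd_1 (mp_limb_t u, mp_limb_t v)
    {
      int twos = __builtin_ctzll (u | v);  /* common factor 2^twos */
      u >>= __builtin_ctzll (u);
      v >>= __builtin_ctzll (v);           /* u, v now odd */
      while (u != v)
        {
          if (u < v)
            { mp_limb_t t = u; u = v; v = t; }
          u -= v;                          /* even and nonzero */
          u >>= __builtin_ctzll (u);
        }
      return u << twos;
    }
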
diff --git a/gmp/mpn/x86_64/core2/gmp-mparam.h b/gmp/mpn/x86_64/core2/gmp-mparam.h
index 0f4f88f780..8207da4895 100644
--- a/gmp/mpn/x86_64/core2/gmp-mparam.h
+++ b/gmp/mpn/x86_64/core2/gmp-mparam.h
@@ -1,217 +1,78 @@
-/* Core 2 gmp-mparam.h -- Compiler/machine parameter header file.
+/* "Core 2" gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 2000-2010, 2012, 2014 Free Software Foundation,
-Inc.
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-or
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
-or both in parallel, as here.
+/* 2133 MHz "Core 2" / 65nm / 4096 Kibyte cache / socket 775 */
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-/* 2133 MHz Core 2 (65nm) */
-/* FFT tuning limit = 60000000 */
-/* Generated by tuneup.c, 2014-03-13, gcc 4.5 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 16
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1_NORM_THRESHOLD 1
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 24
-
-#define MUL_TOOM22_THRESHOLD 23
-#define MUL_TOOM33_THRESHOLD 65
-#define MUL_TOOM44_THRESHOLD 179
-#define MUL_TOOM6H_THRESHOLD 268
-#define MUL_TOOM8H_THRESHOLD 357
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 69
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 78
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 100
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 28
-#define SQR_TOOM3_THRESHOLD 102
-#define SQR_TOOM4_THRESHOLD 160
-#define SQR_TOOM6_THRESHOLD 222
-#define SQR_TOOM8_THRESHOLD 296
-
-#define MULMID_TOOM42_THRESHOLD 28
-
-#define MULMOD_BNM1_THRESHOLD 12
-#define SQRMOD_BNM1_THRESHOLD 13
-
-#define MUL_FFT_MODF_THRESHOLD 372 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 372, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \
- { 25, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \
- { 13, 7}, { 27, 8}, { 15, 7}, { 32, 8}, \
- { 17, 7}, { 36, 8}, { 19, 7}, { 40, 8}, \
- { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
- { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
- { 47, 9}, { 27,10}, { 15, 9}, { 43,10}, \
- { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
- { 63,10}, { 39, 9}, { 83,10}, { 47, 9}, \
- { 95,10}, { 55,11}, { 31,10}, { 79,11}, \
- { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
- { 127, 9}, { 255,10}, { 135, 9}, { 271,11}, \
- { 79,10}, { 159, 9}, { 319,10}, { 167,11}, \
- { 95,10}, { 191, 9}, { 383,10}, { 207,11}, \
- { 111,12}, { 63,11}, { 127,10}, { 271,11}, \
- { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \
- { 159,10}, { 319,12}, { 95,11}, { 191,10}, \
- { 383,11}, { 207,10}, { 415,11}, { 223,13}, \
- { 63,12}, { 127,11}, { 271,10}, { 543,11}, \
- { 287,10}, { 575,11}, { 303,10}, { 607,12}, \
- { 159,11}, { 319,10}, { 639,11}, { 351,12}, \
- { 191,11}, { 415,12}, { 223,11}, { 479,13}, \
- { 127,12}, { 255,11}, { 543,12}, { 287,11}, \
- { 607,12}, { 319,11}, { 639,12}, { 351,11}, \
- { 703,13}, { 191,12}, { 415,11}, { 831,12}, \
- { 479,14}, { 127,13}, { 255,12}, { 607,13}, \
- { 319,12}, { 703,13}, { 383,12}, { 831,13}, \
- { 447,12}, { 959,14}, { 255,13}, { 511,12}, \
- { 1023,13}, { 575,12}, { 1215,13}, { 639,12}, \
- { 1279,13}, { 703,14}, { 383,13}, { 831,12}, \
- { 1663,13}, { 895,15}, { 255,14}, { 511,13}, \
- { 1151,14}, { 639,13}, { 1343,14}, { 767,13}, \
- { 1599,14}, { 895,15}, { 511,14}, { 1279,13}, \
- { 2687,14}, { 1407,13}, { 2815,15}, { 767,14}, \
- { 1535,13}, { 3199,14}, { 1663,13}, { 3455,16}, \
- { 511,15}, { 1023,14}, { 2047,13}, { 4095,14}, \
- { 2175,12}, { 8959,14}, { 2303,13}, { 4607,12}, \
- { 9471,14}, { 2431,13}, { 4863,12}, { 9983,15}, \
- { 1279,14}, { 2559,12}, { 10239,14}, { 2687,12}, \
- { 11775,15}, { 1535,14}, { 3327,13}, { 6655,14}, \
- { 3455,13}, { 6911,14}, { 3583,12}, { 14335,11}, \
- { 28671,10}, { 57343,11}, { 2048,12}, { 4096,13}, \
- { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
- { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
- {2097152,22}, {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 183
-#define MUL_FFT_THRESHOLD 4736
-
-#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 340, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \
- { 9, 5}, { 19, 6}, { 23, 7}, { 12, 6}, \
- { 25, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \
- { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
- { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
- { 33, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
- { 47, 9}, { 27,10}, { 15, 9}, { 43,10}, \
- { 23, 9}, { 55,10}, { 31, 9}, { 67,10}, \
- { 39, 9}, { 79,10}, { 47,11}, { 31,10}, \
- { 79,11}, { 47,10}, { 95,12}, { 31,11}, \
- { 63,10}, { 127, 9}, { 255, 8}, { 511, 9}, \
- { 271, 8}, { 543,11}, { 79, 9}, { 319, 8}, \
- { 639,11}, { 95,10}, { 191, 9}, { 383,10}, \
- { 207, 9}, { 415,12}, { 63,11}, { 127,10}, \
- { 271, 9}, { 543,10}, { 287, 9}, { 575,10}, \
- { 303, 9}, { 607,10}, { 319, 9}, { 639,11}, \
- { 175,12}, { 95,11}, { 191,10}, { 383,11}, \
- { 207,10}, { 415,13}, { 63,12}, { 127,11}, \
- { 271,10}, { 543,11}, { 287,10}, { 575,11}, \
- { 303,10}, { 607,11}, { 319,10}, { 639,11}, \
- { 351,12}, { 191,11}, { 415,10}, { 831,12}, \
- { 223,11}, { 447,10}, { 895,11}, { 479,13}, \
- { 127,12}, { 255,11}, { 543,12}, { 287,11}, \
- { 607,12}, { 319,11}, { 639,12}, { 351,13}, \
- { 191,12}, { 415,11}, { 831,12}, { 479,14}, \
- { 127,13}, { 255,12}, { 607,13}, { 319,12}, \
- { 703,13}, { 383,12}, { 831,13}, { 447,12}, \
- { 959,14}, { 255,13}, { 511,12}, { 1023,13}, \
- { 575,12}, { 1215,13}, { 639,12}, { 1279,13}, \
- { 703,14}, { 383,13}, { 831,12}, { 1663,13}, \
- { 959,15}, { 255,14}, { 511,13}, { 1087,12}, \
- { 2175,13}, { 1215,14}, { 639,13}, { 1343,12}, \
- { 2687,13}, { 1407,12}, { 2815,14}, { 767,13}, \
- { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \
- { 2175,14}, { 1151,13}, { 2303,12}, { 4607,13}, \
- { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \
- { 1407,15}, { 767,14}, { 1535,13}, { 3071,14}, \
- { 1663,13}, { 3455,12}, { 6911,14}, { 1791,13}, \
- { 3583,16}, { 511,15}, { 1023,14}, { 2175,13}, \
- { 4351,14}, { 2303,13}, { 4607,14}, { 2431,13}, \
- { 4863,15}, { 1279,14}, { 2815,13}, { 5631,14}, \
- { 2943,13}, { 5887,15}, { 1535,14}, { 3455,13}, \
- { 6911,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
- { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
- {2097152,22}, {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 179
-#define SQR_FFT_THRESHOLD 3008
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 62
-#define MULLO_MUL_N_THRESHOLD 9174
-
-#define DC_DIV_QR_THRESHOLD 46
-#define DC_DIVAPPR_Q_THRESHOLD 155
-#define DC_BDIV_QR_THRESHOLD 50
-#define DC_BDIV_Q_THRESHOLD 94
-
-#define INV_MULMOD_BNM1_THRESHOLD 48
-#define INV_NEWTON_THRESHOLD 156
-#define INV_APPR_THRESHOLD 155
-
-#define BINV_NEWTON_THRESHOLD 234
-#define REDC_1_TO_REDC_2_THRESHOLD 22
-#define REDC_2_TO_REDC_N_THRESHOLD 48
-
-#define MU_DIV_QR_THRESHOLD 1187
-#define MU_DIVAPPR_Q_THRESHOLD 1142
-#define MUPI_DIV_QR_THRESHOLD 74
-#define MU_BDIV_QR_THRESHOLD 1017
-#define MU_BDIV_Q_THRESHOLD 1187
-
-#define POWM_SEC_TABLE 1,64,131,269,466
-
-#define MATRIX22_STRASSEN_THRESHOLD 19
-#define HGCD_THRESHOLD 117
-#define HGCD_APPR_THRESHOLD 151
-#define HGCD_REDUCE_THRESHOLD 2121
-#define GCD_DC_THRESHOLD 427
-#define GCDEXT_DC_THRESHOLD 342
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 11
-#define GET_STR_PRECOMPUTE_THRESHOLD 18
-#define SET_STR_DC_THRESHOLD 552
-#define SET_STR_PRECOMPUTE_THRESHOLD 1561
-
-#define FAC_DSC_THRESHOLD 656
-#define FAC_ODD_THRESHOLD 23
+/* Generated by tuneup.c, 2009-01-14, gcc 4.2 */
+
+#define MUL_KARATSUBA_THRESHOLD 18
+#define MUL_TOOM3_THRESHOLD 65
+#define MUL_TOOM44_THRESHOLD 166
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_KARATSUBA_THRESHOLD 32
+#define SQR_TOOM3_THRESHOLD 97
+#define SQR_TOOM4_THRESHOLD 163
+
+#define MULLOW_BASECASE_THRESHOLD 0 /* always */
+#define MULLOW_DC_THRESHOLD 20
+#define MULLOW_MUL_N_THRESHOLD 232
+
+#define DIV_SB_PREINV_THRESHOLD 0 /* always */
+#define DIV_DC_THRESHOLD 60
+#define POWM_THRESHOLD 77
+
+#define MATRIX22_STRASSEN_THRESHOLD 25
+#define HGCD_THRESHOLD 140
+#define GCD_DC_THRESHOLD 691
+#define GCDEXT_DC_THRESHOLD 760
+#define JACOBI_BASE_METHOD 1
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1_THRESHOLD 3
+#define MOD_1_2_THRESHOLD 5
+#define MOD_1_4_THRESHOLD 20
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define USE_PREINV_MOD_1 1
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */
+
+#define GET_STR_DC_THRESHOLD 10
+#define GET_STR_PRECOMPUTE_THRESHOLD 16
+#define SET_STR_DC_THRESHOLD 668
+#define SET_STR_PRECOMPUTE_THRESHOLD 2052
+
+#define MUL_FFT_TABLE { 336, 672, 1600, 2816, 7168, 20480, 81920, 327680, 786432, 0 }
+#define MUL_FFT_MODF_THRESHOLD 352
+#define MUL_FFT_THRESHOLD 3456
+
+#define SQR_FFT_TABLE { 336, 736, 1728, 3328, 7168, 20480, 81920, 327680, 0 }
+#define SQR_FFT_MODF_THRESHOLD 352
+#define SQR_FFT_THRESHOLD 2432
+
+/* Generated 2009-01-12, gcc 4.2 */
+
+#define MUL_FFT_TABLE2 {{1,4}, {273,5}, {545,6}, {1217,7}, {3201,8}, {6913,9}, {7681,8}, {8449,9}, {9729,8}, {10497,9}, {13825,10}, {15361,9}, {19969,10}, {23553,9}, {28161,11}, {30721,10}, {31745,9}, {34305,10}, {39937,9}, {42497,10}, {56321,11}, {63489,10}, {81409,11}, {92161,10}, {93185,11}, {96257,12}, {126977,11}, {131073,10}, {138241,11}, {167937,10}, {169473,11}, {169985,10}, {172033,11}, {195585,9}, {196097,11}, {198657,10}, {208897,11}, {217089,12}, {258049,11}, {261121,9}, {262657,10}, {275457,11}, {302081,10}, {307201,11}, {331777,12}, {389121,11}, {425985,13}, {516097,12}, {520193,11}, {598017,12}, {610305,11}, {614401,12}, {651265,11}, {653313,10}, {654337,11}, {673793,10}, {674817,11}, {677889,10}, {679937,11}, {718849,10}, {719873,12}, {782337,11}, {850945,12}, {913409,11}, {925697,13}, {1040385,12}, {1044481,11}, {1112065,12}, {1175553,11}, {1244161,12}, {1306625,11}, {1310721,12}, {1327105,11}, {1347585,12}, {1355777,11}, {1366017,12}, {1439745,13}, {1564673,12}, {1835009,14}, {1900545,12}, {1904641,14}, {2080769,13}, {2088961,12}, {2488321,13}, {2613249,12}, {2879489,13}, {2932737,12}, {2940929,13}, {3137537,12}, {3403777,13}, {3661825,12}, {3928065,14}, {4177921,13}, {4186113,12}, {4452353,13}, {4710401,12}, {4978689,13}, {5234689,12}, {5500929,13}, {5758977,14}, {6275073,13}, {7856129,15}, {8355841,14}, {8372225,13}, {9957377,14}, {MP_SIZE_T_MAX, 0}}
+
+#define SQR_FFT_TABLE2 {{1,4}, {241,5}, {545,6}, {1345,7}, {3201,8}, {6913,9}, {7681,8}, {8961,9}, {9729,8}, {10497,9}, {13825,10}, {15361,9}, {19969,10}, {23553,9}, {28161,11}, {30721,10}, {31745,9}, {34305,10}, {55297,11}, {63489,10}, {80897,11}, {94209,10}, {97281,12}, {126977,11}, {129025,9}, {130049,10}, {138753,11}, {162817,9}, {164353,11}, {170497,10}, {178177,11}, {183297,10}, {184321,11}, {194561,10}, {208897,12}, {219137,11}, {221185,12}, {258049,11}, {261121,9}, {261633,10}, {267777,9}, {268289,11}, {270337,10}, {274945,9}, {276481,10}, {278529,11}, {292865,9}, {293377,10}, {295937,9}, {296449,10}, {306177,9}, {309249,10}, {310273,11}, {328705,12}, {331777,11}, {335873,12}, {344065,11}, {346113,12}, {352257,11}, {356353,12}, {389121,11}, {395265,10}, {398337,11}, {419841,10}, {421889,11}, {423937,13}, {516097,12}, {520193,11}, {546817,10}, {550913,11}, {561153,10}, {563201,11}, {579585,10}, {585729,11}, {621569,12}, {636929,11}, {638977,12}, {651265,11}, {714753,10}, {716801,11}, {718849,12}, {782337,11}, {849921,12}, {913409,11}, {954369,13}, {1040385,12}, {1044481,11}, {1112065,12}, {1175553,11}, {1243137,12}, {1306625,11}, {1374209,12}, {1437697,13}, {1564673,12}, {1961985,14}, {2080769,13}, {2088961,12}, {2486273,13}, {2613249,12}, {2879489,13}, {3137537,12}, {3272705,13}, {3661825,12}, {3928065,14}, {4177921,13}, {4186113,12}, {4452353,13}, {4710401,12}, {4976641,13}, {5234689,12}, {5320705,13}, {5324801,12}, {5447681,13}, {5455873,12}, {5500929,13}, {5758977,14}, {6275073,13}, {6283265,12}, {6549505,13}, {7856129,15}, {8355841,14}, {8372225,13}, {9953281,14}, {MP_SIZE_T_MAX, 0}}
diff --git a/gmp/mpn/x86_64/core2/lshift.asm b/gmp/mpn/x86_64/core2/lshift.asm
index 8ccafeca6c..60518901eb 100644
--- a/gmp/mpn/x86_64/core2/lshift.asm
+++ b/gmp/mpn/x86_64/core2/lshift.asm
@@ -1,83 +1,64 @@
dnl x86-64 mpn_lshift optimized for "Core 2".
-dnl Copyright 2007, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
+dnl Copyright 2007 Free Software Foundation, Inc.
dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 4.25
-C AMD K10 4.25
-C Intel P4 14.7
-C Intel core2 1.27
-C Intel NHM 1.375 (up to about n = 260, then 1.5)
-C Intel SBR 1.87
-C Intel atom ?
-C VIA nano ?
+C K8,K9: 4.25
+C K10: 4.25
+C P4: 14.7
+C P6-15: 1.27
C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n', `%rdx')
-define(`cnt', `%rcx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
+define(`cnt', `%cl')
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_lshift)
- FUNC_ENTRY(4)
lea -8(rp,n,8), rp
lea -8(up,n,8), up
- mov R32(%rdx), R32(%rax)
- and $3, R32(%rax)
+ mov %edx, %eax
+ and $3, %eax
jne L(nb00)
L(b00): C n = 4, 8, 12, ...
mov (up), %r10
mov -8(up), %r11
- xor R32(%rax), R32(%rax)
- shld R8(cnt), %r10, %rax
+ xor %eax, %eax
+ shld %cl, %r10, %rax
mov -16(up), %r8
lea 24(rp), rp
sub $4, n
jmp L(00)
L(nb00):C n = 1, 5, 9, ...
- cmp $2, R32(%rax)
+ cmp $2, %eax
jae L(nb01)
L(b01): mov (up), %r9
- xor R32(%rax), R32(%rax)
- shld R8(cnt), %r9, %rax
+ xor %eax, %eax
+ shld %cl, %r9, %rax
sub $2, n
jb L(le1)
mov -8(up), %r10
@@ -85,65 +66,62 @@ L(b01): mov (up), %r9
lea -8(up), up
lea 16(rp), rp
jmp L(01)
-L(le1): shl R8(cnt), %r9
+L(le1): shl %cl, %r9
mov %r9, (rp)
- FUNC_EXIT()
ret
L(nb01):C n = 2, 6, 10, ...
jne L(b11)
L(b10): mov (up), %r8
mov -8(up), %r9
- xor R32(%rax), R32(%rax)
- shld R8(cnt), %r8, %rax
+ xor %eax, %eax
+ shld %cl, %r8, %rax
sub $3, n
jb L(le2)
mov -16(up), %r10
lea -16(up), up
lea 8(rp), rp
jmp L(10)
-L(le2): shld R8(cnt), %r9, %r8
+L(le2): shld %cl, %r9, %r8
mov %r8, (rp)
- shl R8(cnt), %r9
+ shl %cl, %r9
mov %r9, -8(rp)
- FUNC_EXIT()
ret
ALIGN(16) C performance critical!
L(b11): C n = 3, 7, 11, ...
mov (up), %r11
mov -8(up), %r8
- xor R32(%rax), R32(%rax)
- shld R8(cnt), %r11, %rax
+ xor %eax, %eax
+ shld %cl, %r11, %rax
mov -16(up), %r9
lea -24(up), up
sub $4, n
jb L(end)
ALIGN(16)
-L(top): shld R8(cnt), %r8, %r11
+L(top): shld %cl, %r8, %r11
mov (up), %r10
mov %r11, (rp)
-L(10): shld R8(cnt), %r9, %r8
+L(10): shld %cl, %r9, %r8
mov -8(up), %r11
mov %r8, -8(rp)
-L(01): shld R8(cnt), %r10, %r9
+L(01): shld %cl, %r10, %r9
mov -16(up), %r8
mov %r9, -16(rp)
-L(00): shld R8(cnt), %r11, %r10
+L(00): shld %cl, %r11, %r10
mov -24(up), %r9
+ lea -32(up), up
mov %r10, -24(rp)
- add $-32, up
lea -32(rp), rp
sub $4, n
jnc L(top)
-L(end): shld R8(cnt), %r8, %r11
+L(end): shld %cl, %r8, %r11
mov %r11, (rp)
- shld R8(cnt), %r9, %r8
+ shld %cl, %r9, %r8
mov %r8, -8(rp)
- shl R8(cnt), %r9
+ shl %cl, %r9
mov %r9, -16(rp)
- FUNC_EXIT()
ret
EPILOGUE()
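
The shld chain in this file is a plain limb-granular left shift, walking from the high end so the destination may overlap the source upward. The contract in C (a sketch, not GMP source; ref_lshift is a hypothetical name; 0 < cnt < GMP_LIMB_BITS is required, since shifting by the full limb width is undefined in C):

    #include <gmp.h>

    /* rp[] = up[] << cnt; returns the bits shifted out of the top limb. */
    mp_limb_t
    ref_lshift (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
                unsigned cnt)
    {
      mp_limb_t ret = up[n - 1] >> (GMP_LIMB_BITS - cnt);
      for (mp_size_t i = n - 1; i > 0; i--)
        rp[i] = (up[i] << cnt) | (up[i - 1] >> (GMP_LIMB_BITS - cnt));
      rp[0] = up[0] << cnt;
      return ret;
    }
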
diff --git a/gmp/mpn/x86_64/core2/lshiftc.asm b/gmp/mpn/x86_64/core2/lshiftc.asm
deleted file mode 100644
index 65c7b2f1b8..0000000000
--- a/gmp/mpn/x86_64/core2/lshiftc.asm
+++ /dev/null
@@ -1,159 +0,0 @@
-dnl x86-64 mpn_lshiftc optimized for "Core 2".
-
-dnl Copyright 2007, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 ?
-C Intel P4 ?
-C Intel core2 1.5
-C Intel NHM 2.25 (up to about n = 260, then 1.875)
-C Intel SBR 2.25
-C Intel atom ?
-C VIA nano ?
-
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n', `%rdx')
-define(`cnt', `%rcx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_lshiftc)
- FUNC_ENTRY(4)
- lea -8(rp,n,8), rp
- lea -8(up,n,8), up
-
- mov R32(%rdx), R32(%rax)
- and $3, R32(%rax)
- jne L(nb00)
-L(b00): C n = 4, 8, 12, ...
- mov (up), %r10
- mov -8(up), %r11
- xor R32(%rax), R32(%rax)
- shld R8(cnt), %r10, %rax
- mov -16(up), %r8
- lea 24(rp), rp
- sub $4, n
- jmp L(00)
-
-L(nb00):C n = 1, 5, 9, ...
- cmp $2, R32(%rax)
- jae L(nb01)
-L(b01): mov (up), %r9
- xor R32(%rax), R32(%rax)
- shld R8(cnt), %r9, %rax
- sub $2, n
- jb L(le1)
- mov -8(up), %r10
- mov -16(up), %r11
- lea -8(up), up
- lea 16(rp), rp
- jmp L(01)
-L(le1): shl R8(cnt), %r9
- not %r9
- mov %r9, (rp)
- FUNC_EXIT()
- ret
-
-L(nb01):C n = 2, 6, 10, ...
- jne L(b11)
-L(b10): mov (up), %r8
- mov -8(up), %r9
- xor R32(%rax), R32(%rax)
- shld R8(cnt), %r8, %rax
- sub $3, n
- jb L(le2)
- mov -16(up), %r10
- lea -16(up), up
- lea 8(rp), rp
- jmp L(10)
-L(le2): shld R8(cnt), %r9, %r8
- not %r8
- mov %r8, (rp)
- shl R8(cnt), %r9
- not %r9
- mov %r9, -8(rp)
- FUNC_EXIT()
- ret
-
- ALIGN(16) C performance critical!
-L(b11): C n = 3, 7, 11, ...
- mov (up), %r11
- mov -8(up), %r8
- xor R32(%rax), R32(%rax)
- shld R8(cnt), %r11, %rax
- mov -16(up), %r9
- lea -24(up), up
- sub $4, n
- jb L(end)
-
- ALIGN(16)
-L(top): shld R8(cnt), %r8, %r11
- mov (up), %r10
- not %r11
- mov %r11, (rp)
-L(10): shld R8(cnt), %r9, %r8
- mov -8(up), %r11
- not %r8
- mov %r8, -8(rp)
-L(01): shld R8(cnt), %r10, %r9
- mov -16(up), %r8
- not %r9
- mov %r9, -16(rp)
-L(00): shld R8(cnt), %r11, %r10
- mov -24(up), %r9
- not %r10
- mov %r10, -24(rp)
- add $-32, up
- lea -32(rp), rp
- sub $4, n
- jnc L(top)
-
-L(end): shld R8(cnt), %r8, %r11
- not %r11
- mov %r11, (rp)
- shld R8(cnt), %r9, %r8
- not %r8
- mov %r8, -8(rp)
- shl R8(cnt), %r9
- not %r9
- mov %r9, -16(rp)
- FUNC_EXIT()
- ret
-EPILOGUE()
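
The deleted mpn_lshiftc is the same shift with each stored limb complemented; note that the return value (the out-shifted bits) is not complemented, matching the asm's %rax. A C model under the same illustrative limb type as in the sketch above:

    /* rp[] = ~(up[] << cnt), 0 < cnt < 64; returns the uncomplemented
       out-shifted high bits. */
    limb ref_lshiftc(limb *rp, const limb *up, size_t n, unsigned cnt)
    {
        limb retval = up[n - 1] >> (64 - cnt);
        for (size_t i = n - 1; i > 0; i--)
            rp[i] = ~((up[i] << cnt) | (up[i - 1] >> (64 - cnt)));
        rp[0] = ~(up[0] << cnt);
        return retval;
    }
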
diff --git a/gmp/mpn/x86_64/core2/mul_basecase.asm b/gmp/mpn/x86_64/core2/mul_basecase.asm
deleted file mode 100644
index d16be852f7..0000000000
--- a/gmp/mpn/x86_64/core2/mul_basecase.asm
+++ /dev/null
@@ -1,975 +0,0 @@
-dnl X86-64 mpn_mul_basecase optimised for Intel Nehalem/Westmere.
-dnl It also seems good for Conroe/Wolfdale.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb mul_1 mul_2 mul_3 addmul_2
-C AMD K8,K9
-C AMD K10
-C AMD bull
-C AMD pile
-C AMD steam
-C AMD bobcat
-C AMD jaguar
-C Intel P4
-C Intel core 4.0 4.0 - 4.18-4.25
-C Intel NHM 3.75 3.8 - 4.06-4.2
-C Intel SBR
-C Intel IBR
-C Intel HWL
-C Intel BWL
-C Intel atom
-C VIA nano
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C Code structure:
-C
-C
-C m_1(0m4) m_1(1m4) m_1(2m4) m_1(3m4)
-C | | | |
-C m_2(0m4) | m_2(1m4) | m_2(2m4) | m_2(3m4) |
-C | / | / | / | /
-C | / | / | / | /
-C | / | / | / | /
-C \|/ |/_ \|/ |/_ \|/ |/_ \|/ |/_
-C _____ _____ _____ _____
-C / \ / \ / \ / \
-C \|/ | \|/ | \|/ | \|/ |
-C am_2(0m4) | am_2(1m4) | am_2(2m4) | am_2(3m4) |
-C \ /|\ \ /|\ \ /|\ \ /|\
-C \_____/ \_____/ \_____/ \_____/
-
-C TODO
-C * Tune. None done so far.
-C * Currently 2687 bytes, making it smaller would be nice.
-C * Implement some basecases, say for un < 4.
-C * Try zeroing with xor in m2 loops.
-C * Try re-rolling the m2 loops to avoid the current 9 insn code duplication
-C between loop header and wind-down code.
-C * Consider adc reg,reg instead of adc $0,reg in m2 loops. This saves a byte.
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-C Define this to $1 to use late loop index variable as zero, $2 to use an
-C explicit $0.
-define(`Z',`$1')
-
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`un_param', `%rdx')
-define(`vp_param', `%rcx') C FIXME reallocate vp to rcx but watch performance!
-define(`vn_param', `%r8')
-
-define(`un', `%r9')
-define(`vn', `(%rsp)')
-
-define(`v0', `%r10')
-define(`v1', `%r11')
-define(`w0', `%rbx')
-define(`w1', `%rcx')
-define(`w2', `%rbp')
-define(`w3', `%r12')
-define(`i', `%r13')
-define(`vp', `%r14')
-
-define(`X0', `%r8')
-define(`X1', `%r15')
-
-C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-define(`ALIGNx', `ALIGN(16)')
-
-define(`N', 85)
-ifdef(`N',,`define(`N',0)')
-define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')')
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_mul_basecase)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8d ')
- mov (up), %rax C shared for mul_1 and mul_2
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
-
- mov (vp_param), v0 C shared for mul_1 and mul_2
-
- xor un, un
- sub un_param, un C un = -un_param
-
- lea (up,un_param,8), up
- lea (rp,un_param,8), rp
-
- mul v0 C shared for mul_1 and mul_2
-
- test $1, R8(vn_param)
- jz L(m2)
-
- lea 8(vp_param), vp C FIXME: delay until known needed
-
- test $1, R8(un)
- jnz L(m1x1)
-
-L(m1x0):test $2, R8(un)
- jnz L(m1s2)
-
-L(m1s0):
- lea (un), i
- mov %rax, (rp,un,8)
- mov 8(up,un,8), %rax
- mov %rdx, w0 C FIXME: Use lea?
- lea L(do_am0)(%rip), %rbp
- jmp L(m1e0)
-
-L(m1s2):
- lea 2(un), i
- mov %rax, (rp,un,8)
- mov 8(up,un,8), %rax
- mov %rdx, w0 C FIXME: Use lea?
- mul v0
- lea L(do_am2)(%rip), %rbp
- test i, i
- jnz L(m1e2)
- add %rax, w0
- adc $0, %rdx
- mov w0, I(-8(rp),8(rp,un,8))
- mov %rdx, I((rp),16(rp,un,8))
- jmp L(ret2)
-
-L(m1x1):test $2, R8(un)
- jz L(m1s3)
-
-L(m1s1):
- lea 1(un), i
- mov %rax, (rp,un,8)
- test i, i
- jz L(1)
- mov 8(up,un,8), %rax
- mov %rdx, w1 C FIXME: Use lea?
- lea L(do_am1)(%rip), %rbp
- jmp L(m1e1)
-L(1): mov %rdx, I((rp),8(rp,un,8))
- jmp L(ret2)
-
-L(m1s3):
- lea -1(un), i
- mov %rax, (rp,un,8)
- mov 8(up,un,8), %rax
- mov %rdx, w1 C FIXME: Use lea?
- lea L(do_am3)(%rip), %rbp
- jmp L(m1e3)
-
- ALIGNx
-L(m1top):
- mul v0
- mov w1, -16(rp,i,8)
-L(m1e2):xor R32(w1), R32(w1)
- add %rax, w0
- mov (up,i,8), %rax
- adc %rdx, w1
- mov w0, -8(rp,i,8)
-L(m1e1):xor R32(w0), R32(w0)
- mul v0
- add %rax, w1
- mov 8(up,i,8), %rax
- adc %rdx, w0
- mov w1, (rp,i,8)
-L(m1e0):xor R32(w1), R32(w1)
- mul v0
- add %rax, w0
- mov 16(up,i,8), %rax
- adc %rdx, w1
- mov w0, 8(rp,i,8)
-L(m1e3):xor R32(w0), R32(w0)
- mul v0
- add %rax, w1
- mov 24(up,i,8), %rax
- adc %rdx, w0
- add $4, i
- js L(m1top)
-
- mul v0
- mov w1, I(-16(rp),-16(rp,i,8))
- add %rax, w0
- adc $0, %rdx
- mov w0, I(-8(rp),-8(rp,i,8))
- mov %rdx, I((rp),(rp,i,8))
-
- dec vn_param
- jz L(ret2)
- lea -8(rp), rp
- jmp *%rbp
-
-L(m2):
- mov 8(vp_param), v1
- lea 16(vp_param), vp C FIXME: delay until known needed
-
- test $1, R8(un)
- jnz L(bx1)
-
-L(bx0): test $2, R8(un)
- jnz L(b10)
-
-L(b00): lea (un), i
- mov %rax, (rp,un,8)
- mov %rdx, w1 C FIXME: Use lea?
- mov (up,un,8), %rax
- mov $0, R32(w2)
- jmp L(m2e0)
-
-L(b10): lea -2(un), i
- mov %rax, w2 C FIXME: Use lea?
- mov (up,un,8), %rax
- mov %rdx, w3 C FIXME: Use lea?
- mov $0, R32(w0)
- jmp L(m2e2)
-
-L(bx1): test $2, R8(un)
- jz L(b11)
-
-L(b01): lea 1(un), i
- mov %rax, (rp,un,8)
- mov (up,un,8), %rax
- mov %rdx, w0 C FIXME: Use lea?
- mov $0, R32(w1)
- jmp L(m2e1)
-
-L(b11): lea -1(un), i
- mov %rax, w1 C FIXME: Use lea?
- mov (up,un,8), %rax
- mov %rdx, w2 C FIXME: Use lea?
- mov $0, R32(w3)
- jmp L(m2e3)
-
- ALIGNx
-L(m2top0):
- mul v0
- add %rax, w3
- mov -8(up,i,8), %rax
- mov w3, -8(rp,i,8)
- adc %rdx, w0
- adc $0, R32(w1)
- mul v1
- add %rax, w0
- adc %rdx, w1
- mov $0, R32(w2)
- mov (up,i,8), %rax
- mul v0
- add %rax, w0
- mov w0, (rp,i,8)
- adc %rdx, w1
- mov (up,i,8), %rax
- adc $0, R32(w2)
-L(m2e0):mul v1
- add %rax, w1
- adc %rdx, w2
- mov 8(up,i,8), %rax
- mul v0
- mov $0, R32(w3)
- add %rax, w1
- adc %rdx, w2
- adc $0, R32(w3)
- mov 8(up,i,8), %rax
- mul v1
- add %rax, w2
- mov w1, 8(rp,i,8)
- adc %rdx, w3
- mov $0, R32(w0)
- mov 16(up,i,8), %rax
- mul v0
- add %rax, w2
- mov 16(up,i,8), %rax
- adc %rdx, w3
- adc $0, R32(w0)
- mul v1
- mov $0, R32(w1)
- add %rax, w3
- mov 24(up,i,8), %rax
- mov w2, 16(rp,i,8)
- adc %rdx, w0
- add $4, i
- js L(m2top0)
-
- mul v0
- add %rax, w3
- mov I(-8(up),-8(up,i,8)), %rax
- mov w3, I(-8(rp),-8(rp,i,8))
- adc %rdx, w0
- adc R32(w1), R32(w1)
- mul v1
- add %rax, w0
- adc %rdx, w1
- mov w0, I((rp),(rp,i,8))
- mov w1, I(8(rp),8(rp,i,8))
-
- add $-2, vn_param
- jz L(ret2)
-
-L(do_am0):
- push %r15
- push vn_param
-
-L(olo0):
- mov (vp), v0
- mov 8(vp), v1
- lea 16(vp), vp
- lea 16(rp), rp
- mov (up,un,8), %rax
-C lea 0(un), i
- mov un, i
- mul v0
- mov %rax, X0
- mov (up,un,8), %rax
- MOV( %rdx, X1, 2)
- mul v1
- MOV( %rdx, w0, 4)
- mov (rp,un,8), w2
- mov %rax, w3
- jmp L(lo0)
-
- ALIGNx
-L(am2top0):
- mul v1
- add w0, w1
- adc %rax, w2
- mov (up,i,8), %rax
- MOV( %rdx, w3, 1)
- adc $0, w3
- mul v0
- add w1, X1
- mov X1, -8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 2)
- adc $0, X1
- mov (up,i,8), %rax
- mul v1
- MOV( %rdx, w0, 4)
- mov (rp,i,8), w1
- add w1, w2
- adc %rax, w3
- adc $0, w0
-L(lo0): mov 8(up,i,8), %rax
- mul v0
- add w2, X0
- adc %rax, X1
- mov X0, (rp,i,8)
- MOV( %rdx, X0, 8)
- adc $0, X0
- mov 8(up,i,8), %rax
- mov 8(rp,i,8), w2
- mul v1
- add w2, w3
- adc %rax, w0
- MOV( %rdx, w1, 16)
- adc $0, w1
- mov 16(up,i,8), %rax
- mul v0
- add w3, X1
- mov X1, 8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 32)
- mov 16(rp,i,8), w3
- adc $0, X1
- mov 16(up,i,8), %rax
- mul v1
- add w3, w0
- MOV( %rdx, w2, 64)
- adc %rax, w1
- mov 24(up,i,8), %rax
- adc $0, w2
- mul v0
- add w0, X0
- mov X0, 16(rp,i,8)
- MOV( %rdx, X0, 128)
- adc %rax, X1
- mov 24(up,i,8), %rax
- mov 24(rp,i,8), w0
- adc $0, X0
- add $4, i
- jnc L(am2top0)
-
- mul v1
- add w0, w1
- adc %rax, w2
- adc Z(i,$0), %rdx
- add w1, X1
- adc Z(i,$0), X0
- mov X1, I(-8(rp),-8(rp,i,8))
- add w2, X0
- mov X0, I((rp),(rp,i,8))
- adc Z(i,$0), %rdx
- mov %rdx, I(8(rp),8(rp,i,8))
-
- addl $-2, vn
- jnz L(olo0)
-
-L(ret): pop %rax
- pop %r15
-L(ret2):pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
-
- ALIGNx
-L(m2top1):
- mul v0
- add %rax, w3
- mov -8(up,i,8), %rax
- mov w3, -8(rp,i,8)
- adc %rdx, w0
- adc $0, R32(w1)
-L(m2e1):mul v1
- add %rax, w0
- adc %rdx, w1
- mov $0, R32(w2)
- mov (up,i,8), %rax
- mul v0
- add %rax, w0
- mov w0, (rp,i,8)
- adc %rdx, w1
- mov (up,i,8), %rax
- adc $0, R32(w2)
- mul v1
- add %rax, w1
- adc %rdx, w2
- mov 8(up,i,8), %rax
- mul v0
- mov $0, R32(w3)
- add %rax, w1
- adc %rdx, w2
- adc $0, R32(w3)
- mov 8(up,i,8), %rax
- mul v1
- add %rax, w2
- mov w1, 8(rp,i,8)
- adc %rdx, w3
- mov $0, R32(w0)
- mov 16(up,i,8), %rax
- mul v0
- add %rax, w2
- mov 16(up,i,8), %rax
- adc %rdx, w3
- adc $0, R32(w0)
- mul v1
- mov $0, R32(w1)
- add %rax, w3
- mov 24(up,i,8), %rax
- mov w2, 16(rp,i,8)
- adc %rdx, w0
- add $4, i
- js L(m2top1)
-
- mul v0
- add %rax, w3
- mov I(-8(up),-8(up,i,8)), %rax
- mov w3, I(-8(rp),-8(rp,i,8))
- adc %rdx, w0
- adc R32(w1), R32(w1)
- mul v1
- add %rax, w0
- adc %rdx, w1
- mov w0, I((rp),(rp,i,8))
- mov w1, I(8(rp),8(rp,i,8))
-
- add $-2, vn_param
- jz L(ret2)
-
-L(do_am1):
- push %r15
- push vn_param
-
-L(olo1):
- mov (vp), v0
- mov 8(vp), v1
- lea 16(vp), vp
- lea 16(rp), rp
- mov (up,un,8), %rax
- lea 1(un), i
- mul v0
- mov %rax, X1
- MOV( %rdx, X0, 128)
- mov (up,un,8), %rax
- mov (rp,un,8), w1
- mul v1
- mov %rax, w2
- mov 8(up,un,8), %rax
- MOV( %rdx, w3, 1)
- jmp L(lo1)
-
- ALIGNx
-L(am2top1):
- mul v1
- add w0, w1
- adc %rax, w2
- mov (up,i,8), %rax
- MOV( %rdx, w3, 1)
- adc $0, w3
-L(lo1): mul v0
- add w1, X1
- mov X1, -8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 2)
- adc $0, X1
- mov (up,i,8), %rax
- mul v1
- MOV( %rdx, w0, 4)
- mov (rp,i,8), w1
- add w1, w2
- adc %rax, w3
- adc $0, w0
- mov 8(up,i,8), %rax
- mul v0
- add w2, X0
- adc %rax, X1
- mov X0, (rp,i,8)
- MOV( %rdx, X0, 8)
- adc $0, X0
- mov 8(up,i,8), %rax
- mov 8(rp,i,8), w2
- mul v1
- add w2, w3
- adc %rax, w0
- MOV( %rdx, w1, 16)
- adc $0, w1
- mov 16(up,i,8), %rax
- mul v0
- add w3, X1
- mov X1, 8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 32)
- mov 16(rp,i,8), w3
- adc $0, X1
- mov 16(up,i,8), %rax
- mul v1
- add w3, w0
- MOV( %rdx, w2, 64)
- adc %rax, w1
- mov 24(up,i,8), %rax
- adc $0, w2
- mul v0
- add w0, X0
- mov X0, 16(rp,i,8)
- MOV( %rdx, X0, 128)
- adc %rax, X1
- mov 24(up,i,8), %rax
- mov 24(rp,i,8), w0
- adc $0, X0
- add $4, i
- jnc L(am2top1)
-
- mul v1
- add w0, w1
- adc %rax, w2
- adc Z(i,$0), %rdx
- add w1, X1
- adc Z(i,$0), X0
- mov X1, I(-8(rp),-8(rp,i,8))
- add w2, X0
- mov X0, I((rp),(rp,i,8))
- adc Z(i,$0), %rdx
- mov %rdx, I(8(rp),8(rp,i,8))
-
- addl $-2, vn
- jnz L(olo1)
-
- pop %rax
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
-
- ALIGNx
-L(m2top2):
- mul v0
- add %rax, w3
- mov -8(up,i,8), %rax
- mov w3, -8(rp,i,8)
- adc %rdx, w0
- adc $0, R32(w1)
- mul v1
- add %rax, w0
- adc %rdx, w1
- mov $0, R32(w2)
- mov (up,i,8), %rax
- mul v0
- add %rax, w0
- mov w0, (rp,i,8)
- adc %rdx, w1
- mov (up,i,8), %rax
- adc $0, R32(w2)
- mul v1
- add %rax, w1
- adc %rdx, w2
- mov 8(up,i,8), %rax
- mul v0
- mov $0, R32(w3)
- add %rax, w1
- adc %rdx, w2
- adc $0, R32(w3)
- mov 8(up,i,8), %rax
- mul v1
- add %rax, w2
- mov w1, 8(rp,i,8)
- adc %rdx, w3
- mov $0, R32(w0)
- mov 16(up,i,8), %rax
- mul v0
- add %rax, w2
- mov 16(up,i,8), %rax
- adc %rdx, w3
- adc $0, R32(w0)
-L(m2e2):mul v1
- mov $0, R32(w1)
- add %rax, w3
- mov 24(up,i,8), %rax
- mov w2, 16(rp,i,8)
- adc %rdx, w0
- add $4, i
- js L(m2top2)
-
- mul v0
- add %rax, w3
- mov I(-8(up),-8(up,i,8)), %rax
- mov w3, I(-8(rp),-8(rp,i,8))
- adc %rdx, w0
- adc R32(w1), R32(w1)
- mul v1
- add %rax, w0
- adc %rdx, w1
- mov w0, I((rp),(rp,i,8))
- mov w1, I(8(rp),8(rp,i,8))
-
- add $-2, vn_param
- jz L(ret2)
-
-L(do_am2):
- push %r15
- push vn_param
-
-L(olo2):
- mov (vp), v0
- mov 8(vp), v1
- lea 16(vp), vp
- lea 16(rp), rp
- mov (up,un,8), %rax
- lea -2(un), i
- mul v0
- mov %rax, X0
- MOV( %rdx, X1, 32)
- mov (up,un,8), %rax
- mov (rp,un,8), w0
- mul v1
- mov %rax, w1
- lea (%rdx), w2
- mov 8(up,un,8), %rax
- jmp L(lo2)
-
- ALIGNx
-L(am2top2):
- mul v1
- add w0, w1
- adc %rax, w2
- mov (up,i,8), %rax
- MOV( %rdx, w3, 1)
- adc $0, w3
- mul v0
- add w1, X1
- mov X1, -8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 2)
- adc $0, X1
- mov (up,i,8), %rax
- mul v1
- MOV( %rdx, w0, 4)
- mov (rp,i,8), w1
- add w1, w2
- adc %rax, w3
- adc $0, w0
- mov 8(up,i,8), %rax
- mul v0
- add w2, X0
- adc %rax, X1
- mov X0, (rp,i,8)
- MOV( %rdx, X0, 8)
- adc $0, X0
- mov 8(up,i,8), %rax
- mov 8(rp,i,8), w2
- mul v1
- add w2, w3
- adc %rax, w0
- MOV( %rdx, w1, 16)
- adc $0, w1
- mov 16(up,i,8), %rax
- mul v0
- add w3, X1
- mov X1, 8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 32)
- mov 16(rp,i,8), w3
- adc $0, X1
- mov 16(up,i,8), %rax
- mul v1
- add w3, w0
- MOV( %rdx, w2, 64)
- adc %rax, w1
- mov 24(up,i,8), %rax
- adc $0, w2
-L(lo2): mul v0
- add w0, X0
- mov X0, 16(rp,i,8)
- MOV( %rdx, X0, 128)
- adc %rax, X1
- mov 24(up,i,8), %rax
- mov 24(rp,i,8), w0
- adc $0, X0
- add $4, i
- jnc L(am2top2)
-
- mul v1
- add w0, w1
- adc %rax, w2
- adc Z(i,$0), %rdx
- add w1, X1
- adc Z(i,$0), X0
- mov X1, I(-8(rp),-8(rp,i,8))
- add w2, X0
- mov X0, I((rp),(rp,i,8))
- adc Z(i,$0), %rdx
- mov %rdx, I(8(rp),8(rp,i,8))
-
- addl $-2, vn
- jnz L(olo2)
-
- pop %rax
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
-
- ALIGNx
-L(m2top3):
- mul v0
- add %rax, w3
- mov -8(up,i,8), %rax
- mov w3, -8(rp,i,8)
- adc %rdx, w0
- adc $0, R32(w1)
- mul v1
- add %rax, w0
- adc %rdx, w1
- mov $0, R32(w2)
- mov (up,i,8), %rax
- mul v0
- add %rax, w0
- mov w0, (rp,i,8)
- adc %rdx, w1
- mov (up,i,8), %rax
- adc $0, R32(w2)
- mul v1
- add %rax, w1
- adc %rdx, w2
- mov 8(up,i,8), %rax
- mul v0
- mov $0, R32(w3)
- add %rax, w1
- adc %rdx, w2
- adc $0, R32(w3)
- mov 8(up,i,8), %rax
-L(m2e3):mul v1
- add %rax, w2
- mov w1, 8(rp,i,8)
- adc %rdx, w3
- mov $0, R32(w0)
- mov 16(up,i,8), %rax
- mul v0
- add %rax, w2
- mov 16(up,i,8), %rax
- adc %rdx, w3
- adc $0, R32(w0)
- mul v1
- mov $0, R32(w1)
- add %rax, w3
- mov 24(up,i,8), %rax
- mov w2, 16(rp,i,8)
- adc %rdx, w0
- add $4, i
- js L(m2top3)
-
- mul v0
- add %rax, w3
- mov I(-8(up),-8(up,i,8)), %rax
- mov w3, I(-8(rp),-8(rp,i,8))
- adc %rdx, w0
- adc $0, R32(w1)
- mul v1
- add %rax, w0
- adc %rdx, w1
- mov w0, I((rp),(rp,i,8))
- mov w1, I(8(rp),8(rp,i,8))
-
- add $-2, vn_param
- jz L(ret2)
-
-L(do_am3):
- push %r15
- push vn_param
-
-L(olo3):
- mov (vp), v0
- mov 8(vp), v1
- lea 16(vp), vp
- lea 16(rp), rp
- mov (up,un,8), %rax
- lea -1(un), i
- mul v0
- mov %rax, X1
- MOV( %rdx, X0, 8)
- mov (up,un,8), %rax
- mov (rp,un,8), w3
- mul v1
- mov %rax, w0
- MOV( %rdx, w1, 16)
- mov 8(up,un,8), %rax
- jmp L(lo3)
-
- ALIGNx
-L(am2top3):
- mul v1
- add w0, w1
- adc %rax, w2
- mov (up,i,8), %rax
- MOV( %rdx, w3, 1)
- adc $0, w3
- mul v0
- add w1, X1
- mov X1, -8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 2)
- adc $0, X1
- mov (up,i,8), %rax
- mul v1
- MOV( %rdx, w0, 4)
- mov (rp,i,8), w1
- add w1, w2
- adc %rax, w3
- adc $0, w0
- mov 8(up,i,8), %rax
- mul v0
- add w2, X0
- adc %rax, X1
- mov X0, (rp,i,8)
- MOV( %rdx, X0, 8)
- adc $0, X0
- mov 8(up,i,8), %rax
- mov 8(rp,i,8), w2
- mul v1
- add w2, w3
- adc %rax, w0
- MOV( %rdx, w1, 16)
- adc $0, w1
- mov 16(up,i,8), %rax
-L(lo3): mul v0
- add w3, X1
- mov X1, 8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 32)
- mov 16(rp,i,8), w3
- adc $0, X1
- mov 16(up,i,8), %rax
- mul v1
- add w3, w0
- MOV( %rdx, w2, 64)
- adc %rax, w1
- mov 24(up,i,8), %rax
- adc $0, w2
- mul v0
- add w0, X0
- mov X0, 16(rp,i,8)
- MOV( %rdx, X0, 128)
- adc %rax, X1
- mov 24(up,i,8), %rax
- mov 24(rp,i,8), w0
- adc $0, X0
- add $4, i
- jnc L(am2top3)
-
- mul v1
- add w0, w1
- adc %rax, w2
- adc Z(i,$0), %rdx
- add w1, X1
- adc Z(i,$0), X0
- mov X1, I(-8(rp),-8(rp,i,8))
- add w2, X0
- mov X0, I((rp),(rp,i,8))
- adc Z(i,$0), %rdx
- mov %rdx, I(8(rp),8(rp,i,8))
-
- addl $-2, vn
- jnz L(olo3)
-
- pop %rax
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
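
The deleted mul_basecase is schoolbook multiplication with the v operand consumed two limbs per pass (the mul_1/mul_2 feed-in and addmul_2 outer loop shown in the structure diagram). A plain one-limb-at-a-time C sketch computing the same rp[un+vn] = up[] * vp[] (dlimb relies on the GCC/Clang __int128 extension; limb is as in the earlier sketches, and all names are illustrative):

    typedef unsigned __int128 dlimb;

    void ref_mul_basecase(limb *rp, const limb *up, size_t un,
                          const limb *vp, size_t vn)
    {
        for (size_t i = 0; i < un + vn; i++)
            rp[i] = 0;
        for (size_t j = 0; j < vn; j++) {   /* one v limb per pass */
            limb cy = 0;
            for (size_t i = 0; i < un; i++) {
                dlimb t = (dlimb)up[i] * vp[j] + rp[i + j] + cy;
                rp[i + j] = (limb)t;
                cy = (limb)(t >> 64);
            }
            rp[un + j] = cy;
        }
    }

Handling two v limbs per pass, as the asm does, halves the number of times the u operand and the partial product are streamed through, which is the point of the mul_2/addmul_2 structure.
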
diff --git a/gmp/mpn/x86_64/core2/mullo_basecase.asm b/gmp/mpn/x86_64/core2/mullo_basecase.asm
deleted file mode 100644
index 0f03d867f6..0000000000
--- a/gmp/mpn/x86_64/core2/mullo_basecase.asm
+++ /dev/null
@@ -1,427 +0,0 @@
-dnl AMD64 mpn_mullo_basecase optimised for Conroe/Wolfdale/Nehalem/Westmere.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb mul_2 addmul_2
-C AMD K8,K9
-C AMD K10
-C AMD bull
-C AMD pile
-C AMD steam
-C AMD bobcat
-C AMD jaguar
-C Intel P4
-C Intel core 4.0 4.18-4.25
-C Intel NHM 3.75 4.06-4.2
-C Intel SBR
-C Intel IBR
-C Intel HWL
-C Intel BWL
-C Intel atom
-C VIA nano
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C TODO
-C * Implement proper cor2, replacing current cor0.
-C * Offset n by 2 in order to avoid the outer loop cmp. (And sqr_basecase?)
-C * Micro-optimise.
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp_param', `%rdx')
-define(`n_param', `%rcx')
-
-define(`v0', `%r10')
-define(`v1', `%r11')
-define(`w0', `%rbx')
-define(`w1', `%rcx')
-define(`w2', `%rbp')
-define(`w3', `%r12')
-define(`n', `%r9')
-define(`i', `%r13')
-define(`vp', `%r8')
-
-define(`X0', `%r14')
-define(`X1', `%r15')
-
-C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-define(`ALIGNx', `ALIGN(16)')
-
-define(`N', 85)
-ifdef(`N',,`define(`N',0)')
-define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')')
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_mullo_basecase)
- FUNC_ENTRY(4)
-
- mov (up), %rax
- mov vp_param, vp
-
- cmp $4, n_param
- jb L(small)
-
- mov (vp_param), v0
- push %rbx
- lea (rp,n_param,8), rp C point rp at R[un]
- push %rbp
- lea (up,n_param,8), up C point up right after U's end
- push %r12
- mov $0, R32(n) C FIXME
- sub n_param, n
- push %r13
- mul v0
- mov 8(vp), v1
-
- test $1, R8(n_param)
- jnz L(m2x1)
-
-L(m2x0):test $2, R8(n_param)
- jnz L(m2b2)
-
-L(m2b0):lea (n), i
- mov %rax, (rp,n,8)
- mov %rdx, w1
- mov (up,n,8), %rax
- xor R32(w2), R32(w2)
- jmp L(m2e0)
-
-L(m2b2):lea -2(n), i
- mov %rax, w2
- mov (up,n,8), %rax
- mov %rdx, w3
- xor R32(w0), R32(w0)
- jmp L(m2e2)
-
-L(m2x1):test $2, R8(n_param)
- jnz L(m2b3)
-
-L(m2b1):lea 1(n), i
- mov %rax, (rp,n,8)
- mov (up,n,8), %rax
- mov %rdx, w0
- xor R32(w1), R32(w1)
- jmp L(m2e1)
-
-L(m2b3):lea -1(n), i
- xor R32(w3), R32(w3)
- mov %rax, w1
- mov %rdx, w2
- mov (up,n,8), %rax
- jmp L(m2e3)
-
- ALIGNx
-L(m2tp):mul v0
- add %rax, w3
- mov -8(up,i,8), %rax
- mov w3, -8(rp,i,8)
- adc %rdx, w0
- adc $0, R32(w1)
-L(m2e1):mul v1
- add %rax, w0
- adc %rdx, w1
- mov $0, R32(w2)
- mov (up,i,8), %rax
- mul v0
- add %rax, w0
- mov w0, (rp,i,8)
- adc %rdx, w1
- mov (up,i,8), %rax
- adc $0, R32(w2)
-L(m2e0):mul v1
- add %rax, w1
- adc %rdx, w2
- mov 8(up,i,8), %rax
- mul v0
- mov $0, R32(w3)
- add %rax, w1
- adc %rdx, w2
- adc $0, R32(w3)
- mov 8(up,i,8), %rax
-L(m2e3):mul v1
- add %rax, w2
- mov w1, 8(rp,i,8)
- adc %rdx, w3
- mov $0, R32(w0)
- mov 16(up,i,8), %rax
- mul v0
- add %rax, w2
- mov 16(up,i,8), %rax
- adc %rdx, w3
- adc $0, R32(w0)
-L(m2e2):mul v1
- mov $0, R32(w1) C FIXME: dead in last iteration
- add %rax, w3
- mov 24(up,i,8), %rax
- mov w2, 16(rp,i,8)
- adc %rdx, w0 C FIXME: dead in last iteration
- add $4, i
- js L(m2tp)
-
-L(m2ed):imul v0, %rax
- add w3, %rax
- mov %rax, I(-8(rp),-8(rp,i,8))
-
- add $2, n
- lea 16(vp), vp
- lea -16(up), up
- cmp $-2, n
- jge L(cor1)
-
- push %r14
- push %r15
-
-L(outer):
- mov (vp), v0
- mov 8(vp), v1
- mov (up,n,8), %rax
- mul v0
- test $1, R8(n)
- jnz L(a1x1)
-
-L(a1x0):mov %rax, X1
- MOV( %rdx, X0, 8)
- mov (up,n,8), %rax
- mul v1
- test $2, R8(n)
- jnz L(a110)
-
-L(a100):lea (n), i
- mov (rp,n,8), w3
- mov %rax, w0
- MOV( %rdx, w1, 16)
- jmp L(lo0)
-
-L(a110):lea 2(n), i
- mov (rp,n,8), w1
- mov %rax, w2
- mov 8(up,n,8), %rax
- MOV( %rdx, w3, 1)
- jmp L(lo2)
-
-L(a1x1):mov %rax, X0
- MOV( %rdx, X1, 2)
- mov (up,n,8), %rax
- mul v1
- test $2, R8(n)
- jz L(a111)
-
-L(a101):lea 1(n), i
- MOV( %rdx, w0, 4)
- mov (rp,n,8), w2
- mov %rax, w3
- jmp L(lo1)
-
-L(a111):lea -1(n), i
- MOV( %rdx, w2, 64)
- mov %rax, w1
- mov (rp,n,8), w0
- mov 8(up,n,8), %rax
- jmp L(lo3)
-
- ALIGNx
-L(top): mul v1
- add w0, w1
- adc %rax, w2
- mov -8(up,i,8), %rax
- MOV( %rdx, w3, 1)
- adc $0, w3
-L(lo2): mul v0
- add w1, X1
- mov X1, -16(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 2)
- adc $0, X1
- mov -8(up,i,8), %rax
- mul v1
- MOV( %rdx, w0, 4)
- mov -8(rp,i,8), w1
- add w1, w2
- adc %rax, w3
- adc $0, w0
-L(lo1): mov (up,i,8), %rax
- mul v0
- add w2, X0
- adc %rax, X1
- mov X0, -8(rp,i,8)
- MOV( %rdx, X0, 8)
- adc $0, X0
- mov (up,i,8), %rax
- mov (rp,i,8), w2
- mul v1
- add w2, w3
- adc %rax, w0
- MOV( %rdx, w1, 16)
- adc $0, w1
-L(lo0): mov 8(up,i,8), %rax
- mul v0
- add w3, X1
- mov X1, (rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 32)
- mov 8(rp,i,8), w3
- adc $0, X1
- mov 8(up,i,8), %rax
- mul v1
- add w3, w0
- MOV( %rdx, w2, 64)
- adc %rax, w1
- mov 16(up,i,8), %rax
- adc $0, w2
-L(lo3): mul v0
- add w0, X0
- mov X0, 8(rp,i,8)
- MOV( %rdx, X0, 128)
- adc %rax, X1
- mov 16(up,i,8), %rax
- mov 16(rp,i,8), w0
- adc $0, X0
- add $4, i
- jnc L(top)
-
-L(end): imul v1, %rax
- add w0, w1
- adc %rax, w2
- mov I(-8(up),-8(up,i,8)), %rax
- imul v0, %rax
- add w1, X1
- mov X1, I(-16(rp),-16(rp,i,8))
- adc X0, %rax
- mov I(-8(rp),-8(rp,i,8)), w1
- add w1, w2
- add w2, %rax
- mov %rax, I(-8(rp),-8(rp,i,8))
-
- add $2, n
- lea 16(vp), vp
- lea -16(up), up
- cmp $-2, n
- jl L(outer)
-
- pop %r15
- pop %r14
-
- jnz L(cor0)
-
-L(cor1):mov (vp), v0
- mov 8(vp), v1
- mov -16(up), %rax
- mul v0 C u0 x v2
- add -16(rp), %rax C FIXME: rp[0] still available in reg?
- adc -8(rp), %rdx C FIXME: rp[1] still available in reg?
- mov -8(up), %rbx
- imul v0, %rbx
- mov -16(up), %rcx
- imul v1, %rcx
- mov %rax, -16(rp)
- add %rbx, %rcx
- add %rdx, %rcx
- mov %rcx, -8(rp)
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
-L(cor0):mov (vp), %r11
- imul -8(up), %r11
- add %rax, %r11
- mov %r11, -8(rp)
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
- ALIGN(16)
-L(small):
- cmp $2, n_param
- jae L(gt1)
-L(n1): imul (vp_param), %rax
- mov %rax, (rp)
- FUNC_EXIT()
- ret
-L(gt1): ja L(gt2)
-L(n2): mov (vp_param), %r9
- mul %r9
- mov %rax, (rp)
- mov 8(up), %rax
- imul %r9, %rax
- add %rax, %rdx
- mov 8(vp), %r9
- mov (up), %rcx
- imul %r9, %rcx
- add %rcx, %rdx
- mov %rdx, 8(rp)
- FUNC_EXIT()
- ret
-L(gt2):
-L(n3): mov (vp_param), %r9
- mul %r9 C u0 x v0
- mov %rax, (rp)
- mov %rdx, %r10
- mov 8(up), %rax
- mul %r9 C u1 x v0
- imul 16(up), %r9 C u2 x v0
- add %rax, %r10
- adc %rdx, %r9
- mov 8(vp), %r11
- mov (up), %rax
- mul %r11 C u0 x v1
- add %rax, %r10
- adc %rdx, %r9
- imul 8(up), %r11 C u1 x v1
- add %r11, %r9
- mov %r10, 8(rp)
- mov 16(vp), %r10
- mov (up), %rax
- imul %rax, %r10 C u0 x v2
- add %r10, %r9
- mov %r9, 16(rp)
- FUNC_EXIT()
- ret
-EPILOGUE()
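
mullo_basecase computes only the low n limbs of the product, so partial products in columns n and above are never formed; the asm's imul wind-downs (L(m2ed), L(end)) and the L(cor1)/L(cor0) corrections exist to serve that truncation. A C model using the limb/dlimb types from the sketches above:

    void ref_mullo_basecase(limb *rp, const limb *up,
                            const limb *vp, size_t n)
    {
        for (size_t i = 0; i < n; i++)
            rp[i] = 0;
        for (size_t j = 0; j < n; j++) {
            limb cy = 0;
            for (size_t i = 0; i + j < n; i++) {  /* drop columns >= n */
                dlimb t = (dlimb)up[i] * vp[j] + rp[i + j] + cy;
                rp[i + j] = (limb)t;
                cy = (limb)(t >> 64);             /* discarded at column n */
            }
        }
    }
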
diff --git a/gmp/mpn/x86_64/core2/popcount.asm b/gmp/mpn/x86_64/core2/popcount.asm
index e935cf1892..6c22999ff4 100644
--- a/gmp/mpn/x86_64/core2/popcount.asm
+++ b/gmp/mpn/x86_64/core2/popcount.asm
@@ -3,33 +3,21 @@ dnl x86-64 mpn_popcount optimized for "Core 2".
dnl Copyright 2007 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-MULFUNC_PROLOGUE(mpn_popcount)
include_mpn(`x86/pentium4/sse2/popcount.asm')
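
The file now simply maps onto the shared SSE2 implementation; semantically mpn_popcount is just the total number of set bits across up[0..n-1]. A scalar model (limb as in the sketches above; GMP itself returns the count as mp_bitcnt_t):

    limb ref_popcount(const limb *up, size_t n)
    {
        limb cnt = 0;
        for (size_t i = 0; i < n; i++) {
            limb x = up[i];
            while (x != 0) {
                x &= x - 1;      /* clear the lowest set bit */
                cnt++;
            }
        }
        return cnt;
    }
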
diff --git a/gmp/mpn/x86_64/core2/redc_1.asm b/gmp/mpn/x86_64/core2/redc_1.asm
deleted file mode 100644
index d0e96ef1cb..0000000000
--- a/gmp/mpn/x86_64/core2/redc_1.asm
+++ /dev/null
@@ -1,425 +0,0 @@
-dnl X86-64 mpn_redc_1 optimised for Intel Conroe and Wolfdale.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 ?
-C AMD bull ?
-C AMD pile ?
-C AMD steam ?
-C AMD bobcat ?
-C AMD jaguar ?
-C Intel P4 ?
-C Intel core 4.5 (fluctuating)
-C Intel NHM ?
-C Intel SBR ?
-C Intel IBR ?
-C Intel HWL ?
-C Intel BWL ?
-C Intel atom ?
-C VIA nano ?
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C TODO
-C * Micro-optimise, none performed thus far.
-C * Consider inlining mpn_add_n.
-C * Single basecases out before the pushes.
-C * Keep up[i] in registers for basecases (might require pushes).
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`mp_param', `%rdx') C r8
-define(`n', `%rcx') C r9
-define(`u0inv', `%r8') C stack
-
-define(`i', `%r14')
-define(`j', `%r15')
-define(`mp', `%r12')
-define(`q0', `%r13')
-
-C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
-C X q0' n X rp up u0i mp q0 i j
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-define(`ALIGNx', `ALIGN(16)')
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_redc_1)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
-
- mov (up), q0
- mov n, j C outer loop induction var
- lea (mp_param,n,8), mp
- lea -16(up,n,8), up
- neg n
- imul u0inv, q0 C first iteration q0
-
- test $1, R8(n)
- jz L(b0)
-
-L(b1): cmp $-1, R32(n)
- jz L(n1)
- cmp $-3, R32(n)
- jz L(n3)
-
- push rp
-
-L(otp1):lea 3(n), i
- mov (mp,n,8), %rax
- mul q0
- lea (%rax), %rbp
- mov 8(mp,n,8), %rax
- lea (%rdx), %r9
- mul q0
- lea (%rax), %r11
- mov 16(mp,n,8), %rax
- mov 16(up,n,8), %r10
- lea (%rdx), %rdi
- mul q0
- add %rbp, %r10
- lea (%rax), %rbp
- mov 24(mp,n,8), %rax
- adc %r9, %r11
- mov 24(up,n,8), %rbx
- lea (%rdx), %r9
- adc $0, %rdi
- mul q0
- add %r11, %rbx
- lea (%rax), %r11
- mov 32(mp,n,8), %rax
- adc %rdi, %rbp
- mov %rbx, 24(up,n,8)
- mov 32(up,n,8), %r10
- lea (%rdx), %rdi
- adc $0, %r9
- imul u0inv, %rbx C next q limb
- add $2, i
- jns L(ed1)
-
- ALIGNx
-L(tp1): mul q0
- add %rbp, %r10
- lea (%rax), %rbp
- mov (mp,i,8), %rax
- adc %r9, %r11
- mov %r10, -8(up,i,8)
- mov (up,i,8), %r10
- lea (%rdx), %r9
- adc $0, %rdi
- mul q0
- add %r11, %r10
- lea (%rax), %r11
- mov 8(mp,i,8), %rax
- adc %rdi, %rbp
- mov %r10, (up,i,8)
- mov 8(up,i,8), %r10
- lea (%rdx), %rdi
- adc $0, %r9
- add $2, i
- js L(tp1)
-
-L(ed1): mul q0
- add %rbp, %r10
- adc %r9, %r11
- mov %r10, I(-8(up),-8(up,i,8))
- mov I((up),(up,i,8)), %r10
- adc $0, %rdi
- add %r11, %r10
- adc %rdi, %rax
- mov %r10, I((up),(up,i,8))
- mov I(8(up),8(up,i,8)), %r10
- adc $0, %rdx
- add %rax, %r10
- mov %r10, I(8(up),8(up,i,8))
- adc $0, %rdx
- mov %rdx, 16(up,n,8) C up[0]
- mov %rbx, q0 C previously computed q limb -> q0
- lea 8(up), up C up++
- dec j
- jnz L(otp1)
- jmp L(cj)
-
-L(b0): cmp $-2, R32(n)
- jz L(n2)
- cmp $-4, R32(n)
- jz L(n4)
-
- push rp
-
-L(otp0):lea 4(n), i
- mov (mp,n,8), %rax
- mul q0
- lea (%rax), %r11
- mov 8(mp,n,8), %rax
- lea (%rdx), %rdi
- mul q0
- lea (%rax), %rbp
- mov 16(mp,n,8), %rax
- mov 16(up,n,8), %r10
- lea (%rdx), %r9
- mul q0
- add %r11, %r10
- lea (%rax), %r11
- mov 24(mp,n,8), %rax
- adc %rdi, %rbp
- mov 24(up,n,8), %rbx
- lea (%rdx), %rdi
- adc $0, %r9
- mul q0
- add %rbp, %rbx
- lea (%rax), %rbp
- mov 32(mp,n,8), %rax
- adc %r9, %r11
- mov %rbx, 24(up,n,8)
- mov 32(up,n,8), %r10
- lea (%rdx), %r9
- adc $0, %rdi
- imul u0inv, %rbx C next q limb
- jmp L(e0)
-
- ALIGNx
-L(tp0): mul q0
- add %rbp, %r10
- lea (%rax), %rbp
- mov (mp,i,8), %rax
- adc %r9, %r11
- mov %r10, -8(up,i,8)
- mov (up,i,8), %r10
- lea (%rdx), %r9
- adc $0, %rdi
-L(e0): mul q0
- add %r11, %r10
- lea (%rax), %r11
- mov 8(mp,i,8), %rax
- adc %rdi, %rbp
- mov %r10, (up,i,8)
- mov 8(up,i,8), %r10
- lea (%rdx), %rdi
- adc $0, %r9
- add $2, i
- js L(tp0)
-
-L(ed0): mul q0
- add %rbp, %r10
- adc %r9, %r11
- mov %r10, I(-8(up),-8(up,i,8))
- mov I((up),(up,i,8)), %r10
- adc $0, %rdi
- add %r11, %r10
- adc %rdi, %rax
- mov %r10, I((up),(up,i,8))
- mov I(8(up),8(up,i,8)), %r10
- adc $0, %rdx
- add %rax, %r10
- mov %r10, I(8(up),8(up,i,8))
- adc $0, %rdx
- mov %rdx, 16(up,n,8) C up[0]
- mov %rbx, q0 C previously computed q limb -> q0
- lea 8(up), up C up++
- dec j
- jnz L(otp0)
-
-L(cj): lea 16(up), up C FIXME
- pop rp
-L(add_n):
-IFSTD(` lea (up,n,8), up C param 2: up
- lea (up,n,8), %rdx C param 3: up - n
- neg R32(n) ') C param 4: n
-
-IFDOS(` lea (up,n,8), %rdx C param 2: up
- lea (%rdx,n,8), %r8 C param 3: up - n
- neg R32(n)
- mov n, %r9 C param 4: n
- mov rp, %rcx ') C param 1: rp
-
- CALL( mpn_add_n)
-
-L(ret): pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
-L(n1): mov (mp_param), %rax
- mul q0
- add 8(up), %rax
- adc 16(up), %rdx
- mov %rdx, (rp)
- mov $0, R32(%rax)
- adc R32(%rax), R32(%rax)
- jmp L(ret)
-
-L(n2): mov (mp_param), %rax
- mov (up), %rbp
- mul q0
- add %rax, %rbp
- mov %rdx, %r9
- adc $0, %r9
- mov -8(mp), %rax
- mov 8(up), %r10
- mul q0
- add %rax, %r10
- mov %rdx, %r11
- adc $0, %r11
- add %r9, %r10
- adc $0, %r11
- mov %r10, q0
- imul u0inv, q0 C next q0
- mov -16(mp), %rax
- mul q0
- add %rax, %r10
- mov %rdx, %r9
- adc $0, %r9
- mov -8(mp), %rax
- mov 16(up), %r14
- mul q0
- add %rax, %r14
- adc $0, %rdx
- add %r9, %r14
- adc $0, %rdx
- xor R32(%rax), R32(%rax)
- add %r11, %r14
- adc 24(up), %rdx
- mov %r14, (rp)
- mov %rdx, 8(rp)
- adc R32(%rax), R32(%rax)
- jmp L(ret)
-
- ALIGNx
-L(n3): mov -24(mp), %rax
- mov -8(up), %r10
- mul q0
- add %rax, %r10
- mov -16(mp), %rax
- mov %rdx, %r11
- adc $0, %r11
- mov (up), %rbp
- mul q0
- add %rax, %rbp
- mov %rdx, %r9
- adc $0, %r9
- mov -8(mp), %rax
- add %r11, %rbp
- mov 8(up), %r10
- adc $0, %r9
- mul q0
- mov %rbp, q0
- imul u0inv, q0 C next q0
- add %rax, %r10
- mov %rdx, %r11
- adc $0, %r11
- mov %rbp, (up)
- add %r9, %r10
- adc $0, %r11
- mov %r10, 8(up)
- mov %r11, -8(up) C up[0]
- lea 8(up), up C up++
- dec j
- jnz L(n3)
-
- mov -32(up), %rdx
- mov -24(up), %rbx
- xor R32(%rax), R32(%rax)
- add %rbp, %rdx
- adc %r10, %rbx
- adc 8(up), %r11
- mov %rdx, (rp)
- mov %rbx, 8(rp)
- mov %r11, 16(rp)
- adc R32(%rax), R32(%rax)
- jmp L(ret)
-
- ALIGNx
-L(n4): mov -32(mp), %rax
- mul q0
- lea (%rax), %r11
- mov -24(mp), %rax
- lea (%rdx), %r14
- mul q0
- lea (%rax), %rbp
- mov -16(mp), %rax
- mov -16(up), %r10
- lea (%rdx), %r9
- mul q0
- add %r11, %r10
- lea (%rax), %r11
- mov -8(mp), %rax
- adc %r14, %rbp
- mov -8(up), %rbx
- lea (%rdx), %r14
- adc $0, %r9
- mul q0
- add %rbp, %rbx
- adc %r9, %r11
- mov %rbx, -8(up)
- mov (up), %r10
- adc $0, %r14
- imul u0inv, %rbx C next q limb
- add %r11, %r10
- adc %r14, %rax
- mov %r10, (up)
- mov 8(up), %r10
- adc $0, %rdx
- add %rax, %r10
- mov %r10, 8(up)
- adc $0, %rdx
- mov %rdx, -16(up) C up[0]
- mov %rbx, q0 C previously computed q limb -> q0
- lea 8(up), up C up++
- dec j
- jnz L(n4)
- lea 16(up), up
- jmp L(add_n)
-EPILOGUE()
-ASM_END()
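
The deleted redc_1 is one-limb Montgomery reduction: each outer pass computes q0 = up[0] * u0inv mod B, adds q0*m to kill the low limb, parks the addmul carry in the freed position, and the final mpn_add_n folds the parked carries into the high half. A C model, assuming up[] holds 2n limbs, u0inv = -1/mp[0] mod B, and limb/dlimb as in the earlier sketches; the returned carry corresponds to the carry the asm gets back from mpn_add_n:

    limb ref_redc_1(limb *rp, limb *up, const limb *mp,
                    size_t n, limb u0inv)
    {
        for (size_t j = 0; j < n; j++) {
            limb q0 = up[0] * u0inv;          /* q0*mp[0] == -up[0] mod B */
            limb cy = 0;
            for (size_t i = 0; i < n; i++) {  /* up[0..n-1] += q0 * mp[] */
                dlimb t = (dlimb)q0 * mp[i] + up[i] + cy;
                up[i] = (limb)t;              /* up[0] becomes zero */
                cy = (limb)(t >> 64);
            }
            up[0] = cy;                       /* park carry in the dead limb */
            up++;                             /* slide the window one limb */
        }
        const limb *cp = up - n;              /* the n parked carry limbs */
        limb cy = 0;                          /* rp = high half + carries */
        for (size_t i = 0; i < n; i++) {
            dlimb t = (dlimb)up[i] + cp[i] + cy;
            rp[i] = (limb)t;
            cy = (limb)(t >> 64);
        }
        return cy;                            /* caller subtracts m if set */
    }
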
diff --git a/gmp/mpn/x86_64/core2/rsh1aors_n.asm b/gmp/mpn/x86_64/core2/rsh1aors_n.asm
deleted file mode 100644
index 27eed3712d..0000000000
--- a/gmp/mpn/x86_64/core2/rsh1aors_n.asm
+++ /dev/null
@@ -1,169 +0,0 @@
-dnl X86-64 mpn_rsh1add_n, mpn_rsh1sub_n optimised for Intel Conroe/Penryn.
-
-dnl Copyright 2003, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 ?
-C Intel P4 ?
-C Intel core2 3.05
-C Intel NHM 3.3
-C Intel SBR 2.5
-C Intel atom ?
-C VIA nano ?
-
-C TODO
-C * Loopmix to approach 2.5 c/l on NHM.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp', `%rdx')
-define(`n', `%rcx')
-
-ifdef(`OPERATION_rsh1add_n', `
- define(ADDSUB, add)
- define(ADCSBB, adc)
- define(func_n, mpn_rsh1add_n)
- define(func_nc, mpn_rsh1add_nc)')
-ifdef(`OPERATION_rsh1sub_n', `
- define(ADDSUB, sub)
- define(ADCSBB, sbb)
- define(func_n, mpn_rsh1sub_n)
- define(func_nc, mpn_rsh1sub_nc)')
-
-MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(func_nc)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
- push %rbx
- push %rbp
-
- neg %r8 C set C flag from parameter
- mov (up), %r8
- ADCSBB (vp), %r8
- jmp L(ent)
-EPILOGUE()
-
- ALIGN(16)
-PROLOGUE(func_n)
- FUNC_ENTRY(4)
- push %rbx
- push %rbp
-
- mov (up), %r8
- ADDSUB (vp), %r8
-L(ent): sbb R32(%rbx), R32(%rbx) C save cy
- mov %r8, %rax
- and $1, R32(%rax) C return value
-
- lea (up,n,8), up
- lea (vp,n,8), vp
- lea (rp,n,8), rp
- mov R32(n), R32(%rbp)
- neg n
- and $3, R32(%rbp)
- jz L(b0)
- cmp $2, R32(%rbp)
- jae L(n1)
-
-L(b1): mov %r8, %rbp
- inc n
- js L(top)
- jmp L(end)
-
-L(n1): jnz L(b3)
- add R32(%rbx), R32(%rbx) C restore cy
- mov 8(up,n,8), %r11
- ADCSBB 8(vp,n,8), %r11
- sbb R32(%rbx), R32(%rbx) C save cy
- mov %r8, %r10
- add $-2, n
- jmp L(2)
-
-L(b3): add R32(%rbx), R32(%rbx) C restore cy
- mov 8(up,n,8), %r10
- mov 16(up,n,8), %r11
- ADCSBB 8(vp,n,8), %r10
- ADCSBB 16(vp,n,8), %r11
- sbb R32(%rbx), R32(%rbx) C save cy
- mov %r8, %r9
- dec n
- jmp L(3)
-
-L(b0): add R32(%rbx), R32(%rbx) C restore cy
- mov 8(up,n,8), %r9
- mov 16(up,n,8), %r10
- mov 24(up,n,8), %r11
- ADCSBB 8(vp,n,8), %r9
- ADCSBB 16(vp,n,8), %r10
- ADCSBB 24(vp,n,8), %r11
- sbb R32(%rbx), R32(%rbx) C save cy
- jmp L(4)
-
- ALIGN(16)
-
-L(top): add R32(%rbx), R32(%rbx) C restore cy
- mov (up,n,8), %r8
- mov 8(up,n,8), %r9
- mov 16(up,n,8), %r10
- mov 24(up,n,8), %r11
- ADCSBB (vp,n,8), %r8
- ADCSBB 8(vp,n,8), %r9
- ADCSBB 16(vp,n,8), %r10
- ADCSBB 24(vp,n,8), %r11
- sbb R32(%rbx), R32(%rbx) C save cy
- shrd $1, %r8, %rbp
- mov %rbp, -8(rp,n,8)
-L(4): shrd $1, %r9, %r8
- mov %r8, (rp,n,8)
-L(3): shrd $1, %r10, %r9
- mov %r9, 8(rp,n,8)
-L(2): shrd $1, %r11, %r10
- mov %r10, 16(rp,n,8)
-L(1): add $4, n
- mov %r11, %rbp
- js L(top)
-
-L(end): shrd $1, %rbx, %rbp
- mov %rbp, -8(rp)
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
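
rsh1add_n/rsh1sub_n compute rp[] = (up[] +- vp[]) >> 1 over n limbs, returning the bit shifted out at the right; the carry saved in %rbx is what the final shrd feeds into the top limb. A C model of the add flavour (limb/dlimb as in the sketches above; the sub flavour is identical with subtraction and borrow):

    limb ref_rsh1add_n(limb *rp, const limb *up, const limb *vp, size_t n)
    {
        dlimb t = (dlimb)up[0] + vp[0];
        limb prev = (limb)t;
        limb cy = (limb)(t >> 64);
        limb retval = prev & 1;               /* bit shifted out */
        for (size_t i = 1; i < n; i++) {
            t = (dlimb)up[i] + vp[i] + cy;
            limb s = (limb)t;
            cy = (limb)(t >> 64);
            rp[i - 1] = (prev >> 1) | (s << 63);
            prev = s;
        }
        rp[n - 1] = (prev >> 1) | (cy << 63); /* carry becomes the top bit */
        return retval;
    }
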
diff --git a/gmp/mpn/x86_64/core2/rshift.asm b/gmp/mpn/x86_64/core2/rshift.asm
index ab32ec85df..9a3fc46f9a 100644
--- a/gmp/mpn/x86_64/core2/rshift.asm
+++ b/gmp/mpn/x86_64/core2/rshift.asm
@@ -1,69 +1,50 @@
dnl x86-64 mpn_rshift optimized for "Core 2".
-dnl Copyright 2007, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
+dnl Copyright 2007 Free Software Foundation, Inc.
dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 4.25
-C AMD K10 4.25
-C Intel P4 14.7
-C Intel core2 1.27
-C Intel NHM 1.375 (up to about n = 260, then 1.5)
-C Intel SBR 1.77
-C Intel atom ?
-C VIA nano ?
+C K8,K9: 4.25
+C K10: 4.25
+C P4: 14.7
+C P6-15: 1.27
C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n', `%rdx')
-define(`cnt', `%rcx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
+define(`cnt', `%cl')
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_rshift)
- FUNC_ENTRY(4)
- mov R32(%rdx), R32(%rax)
- and $3, R32(%rax)
+ mov %edx, %eax
+ and $3, %eax
jne L(nb00)
L(b00): C n = 4, 8, 12, ...
mov (up), %r10
mov 8(up), %r11
- xor R32(%rax), R32(%rax)
- shrd R8(cnt), %r10, %rax
+ xor %eax, %eax
+ shrd %cl, %r10, %rax
mov 16(up), %r8
lea 8(up), up
lea -24(rp), rp
@@ -71,11 +52,11 @@ L(b00): C n = 4, 8, 12, ...
jmp L(00)
L(nb00):C n = 1, 5, 9, ...
- cmp $2, R32(%rax)
+ cmp $2, %eax
jae L(nb01)
L(b01): mov (up), %r9
- xor R32(%rax), R32(%rax)
- shrd R8(cnt), %r9, %rax
+ xor %eax, %eax
+ shrd %cl, %r9, %rax
sub $2, n
jb L(le1)
mov 8(up), %r10
@@ -83,65 +64,62 @@ L(b01): mov (up), %r9
lea 16(up), up
lea -16(rp), rp
jmp L(01)
-L(le1): shr R8(cnt), %r9
+L(le1): shr %cl, %r9
mov %r9, (rp)
- FUNC_EXIT()
ret
L(nb01):C n = 2, 6, 10, ...
jne L(b11)
L(b10): mov (up), %r8
mov 8(up), %r9
- xor R32(%rax), R32(%rax)
- shrd R8(cnt), %r8, %rax
+ xor %eax, %eax
+ shrd %cl, %r8, %rax
sub $3, n
jb L(le2)
mov 16(up), %r10
lea 24(up), up
lea -8(rp), rp
jmp L(10)
-L(le2): shrd R8(cnt), %r9, %r8
+L(le2): shrd %cl, %r9, %r8
mov %r8, (rp)
- shr R8(cnt), %r9
+ shr %cl, %r9
mov %r9, 8(rp)
- FUNC_EXIT()
ret
ALIGN(16)
L(b11): C n = 3, 7, 11, ...
mov (up), %r11
mov 8(up), %r8
- xor R32(%rax), R32(%rax)
- shrd R8(cnt), %r11, %rax
+ xor %eax, %eax
+ shrd %cl, %r11, %rax
mov 16(up), %r9
lea 32(up), up
sub $4, n
jb L(end)
ALIGN(16)
-L(top): shrd R8(cnt), %r8, %r11
+L(top): shrd %cl, %r8, %r11
mov -8(up), %r10
mov %r11, (rp)
-L(10): shrd R8(cnt), %r9, %r8
+L(10): shrd %cl, %r9, %r8
mov (up), %r11
mov %r8, 8(rp)
-L(01): shrd R8(cnt), %r10, %r9
+L(01): shrd %cl, %r10, %r9
mov 8(up), %r8
mov %r9, 16(rp)
-L(00): shrd R8(cnt), %r11, %r10
+L(00): shrd %cl, %r11, %r10
mov 16(up), %r9
+ lea 32(up), up
mov %r10, 24(rp)
- add $32, up
lea 32(rp), rp
sub $4, n
jnc L(top)
-L(end): shrd R8(cnt), %r8, %r11
+L(end): shrd %cl, %r8, %r11
mov %r11, (rp)
- shrd R8(cnt), %r9, %r8
+ shrd %cl, %r9, %r8
mov %r8, 8(rp)
- shr R8(cnt), %r9
+ shr %cl, %r9
mov %r9, 16(rp)
- FUNC_EXIT()
ret
EPILOGUE()
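
rshift mirrors the lshift change above: the loop walks from the low limb up, and each shrd supplies a limb's high bits from its neighbour. A C model (same illustrative limb type as before):

    /* rp[] = up[] >> cnt, 0 < cnt < 64; returns the out-shifted bits,
       left-adjusted in the result limb, matching the asm's %rax. */
    limb ref_rshift(limb *rp, const limb *up, size_t n, unsigned cnt)
    {
        limb retval = up[0] << (64 - cnt);
        for (size_t i = 0; i + 1 < n; i++)
            rp[i] = (up[i] >> cnt) | (up[i + 1] << (64 - cnt));
        rp[n - 1] = up[n - 1] >> cnt;
        return retval;
    }
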
diff --git a/gmp/mpn/x86_64/core2/sec_tabselect.asm b/gmp/mpn/x86_64/core2/sec_tabselect.asm
deleted file mode 100644
index e4360341d9..0000000000
--- a/gmp/mpn/x86_64/core2/sec_tabselect.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_sec_tabselect.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_sec_tabselect)
-include_mpn(`x86_64/fastsse/sec_tabselect.asm')
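
sec_tabselect reads every one of the nents table entries and combines them under an all-ones/all-zeros mask, so the memory access pattern is independent of which entry is wanted. A C model of that contract (the mask derivation here leans on the compiler emitting a branch-free comparison; production code builds the mask arithmetically):

    void ref_sec_tabselect(limb *rp, const limb *tab,
                           size_t n, size_t nents, size_t which)
    {
        for (size_t i = 0; i < n; i++)
            rp[i] = 0;
        for (size_t k = 0; k < nents; k++) {
            limb mask = -(limb)(k == which);   /* all ones iff selected */
            for (size_t i = 0; i < n; i++)
                rp[i] |= tab[k * n + i] & mask;
        }
    }
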
diff --git a/gmp/mpn/x86_64/core2/sqr_basecase.asm b/gmp/mpn/x86_64/core2/sqr_basecase.asm
deleted file mode 100644
index a112c1b52e..0000000000
--- a/gmp/mpn/x86_64/core2/sqr_basecase.asm
+++ /dev/null
@@ -1,984 +0,0 @@
-dnl X86-64 mpn_sqr_basecase optimised for Intel Nehalem/Westmere.
-dnl It also seems good for Conroe/Wolfdale.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb mul_2 addmul_2 sqr_diag_addlsh1
-C AMD K8,K9
-C AMD K10
-C AMD bull
-C AMD pile
-C AMD steam
-C AMD bobcat
-C AMD jaguar
-C Intel P4
-C Intel core 4.9 4.18-4.25 3.87
-C Intel NHM 3.8 4.06-4.2 3.5
-C Intel SBR
-C Intel IBR
-C Intel HWL
-C Intel BWL
-C Intel atom
-C VIA nano
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C Code structure:
-C
-C
-C m_2(0m4) m_2(2m4) m_2(1m4) m_2(3m4)
-C | | | |
-C | | | |
-C | | | |
-C \|/ \|/ \|/ \|/
-C ____________ ____________
-C / \ / \
-C \|/ \ \|/ \
-C am_2(3m4) am_2(1m4) am_2(0m4) am_2(2m4)
-C \ /|\ \ /|\
-C \____________/ \____________/
-C \ /
-C \ /
-C \ /
-C tail(0m2) tail(1m2)
-C \ /
-C \ /
-C sqr_diag_addlsh1
-
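
A C sketch of the decomposition named in the diagram, assuming the limb/dlimb types from the earlier sketches: the mul_2/addmul_2 passes accumulate the strictly off-diagonal products u[i]*u[j] (i < j), and sqr_diag_addlsh1 then forms rp = 2*rp + the diagonal squares.

    void ref_sqr_basecase(limb *rp, const limb *up, size_t n)
    {
        for (size_t i = 0; i < 2 * n; i++)
            rp[i] = 0;
        for (size_t j = 1; j < n; j++) {      /* off-diagonal products */
            limb cy = 0;
            for (size_t i = 0; i < j; i++) {
                dlimb t = (dlimb)up[i] * up[j] + rp[i + j] + cy;
                rp[i + j] = (limb)t;
                cy = (limb)(t >> 64);
            }
            rp[2 * j] = cy;
        }
        limb cy = 0;                          /* rp = 2*rp ... */
        for (size_t i = 0; i < 2 * n; i++) {
            limb t = rp[i];
            rp[i] = (t << 1) | cy;
            cy = t >> 63;
        }
        cy = 0;                               /* ... + diagonal squares */
        for (size_t i = 0; i < n; i++) {
            dlimb sq = (dlimb)up[i] * up[i];
            dlimb lo = (dlimb)rp[2 * i] + (limb)sq + cy;
            rp[2 * i] = (limb)lo;
            dlimb hi = (dlimb)rp[2 * i + 1] + (limb)(sq >> 64)
                       + (limb)(lo >> 64);
            rp[2 * i + 1] = (limb)hi;
            cy = (limb)(hi >> 64);
        }
    }
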
-C TODO
-C * Tune. None done so far.
-C * Currently 2761 bytes, making it smaller would be nice.
-C * Consider using a jumptab-based entry sequence. One might even use a mask-
-C less sequence, if the table is large enough to support tuneup's needs.
-C The code would be, using non-PIC code,
-C lea tab(%rip),%rax; jmp *(n,%rax)
-C or,
-C lea tab(%rip),%rax; lea (%rip),%rbx; add (n,%rax),%rbx; jmp *%rbx
-C using PIC code. The table entries would be Ln1,Ln2,Ln3,Lm0,Lm1,Lm2,Lm3,..
-C with the last four entries repeated a safe number of times.
-C * Consider expanding feed-in code in order to avoid zeroing registers.
-C * Zero consistently with xor.
-C * Check if using "lea (reg),reg" should be done in more places; we have some
-C explicit "mov %rax,reg" now.
-C * Try zeroing with xor in m2 loops.
-C * Try re-rolling the m2 loops to avoid the current 9 insn code duplication
-C between loop header and wind-down code.
-C * Consider adc reg,reg instead of adc $0,reg in m2 loops. This saves a byte.
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-C Define this to $1 to use late loop index variable as zero, $2 to use an
-C explicit $0.
-define(`Z',`$1')
-
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n_param', `%rdx')
-
-define(`n', `%r8')
-
-define(`v0', `%r10')
-define(`v1', `%r11')
-define(`w0', `%rbx')
-define(`w1', `%rcx')
-define(`w2', `%rbp')
-define(`w3', `%r9')
-define(`i', `%r13')
-
-define(`X0', `%r12')
-define(`X1', `%r14')
-
-C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-define(`ALIGNx', `ALIGN(16)')
-
-define(`N', 85)
-ifdef(`N',,`define(`N',0)')
-define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')')
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_sqr_basecase)
- FUNC_ENTRY(3)
-
- cmp $4, n_param
- jl L(small)
-
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
-
- mov (up), v0
- mov 8(up), %rax
- mov %rax, v1
-
- mov $1, R32(n)
- sub n_param, n C n = -n_param+1
- push n
-
- lea (up,n_param,8), up
- lea (rp,n_param,8), rp
-
- mul v0
-
- test $1, R8(n)
- jnz L(bx1)
-
-L(bx0): test $2, R8(n)
- mov %rax, (rp,n,8)
- jnz L(b10)
-
-L(b00): lea (n), i C n = 5, 9, ...
- mov %rdx, w1 C FIXME: Use lea?
- xor R32(w2), R32(w2)
- jmp L(m2e0)
-
-L(b10): lea 2(n), i C n = 7, 11, ...
- mov 8(up,n,8), %rax
- mov %rdx, w3 C FIXME: Use lea?
- xor R32(w0), R32(w0)
- xor R32(w1), R32(w1)
- jmp L(m2e2)
-
-L(bx1): test $2, R8(n)
- mov %rax, (rp,n,8)
- jz L(b11)
-
-L(b01): lea 1(n), i C n = 6, 10, ...
- mov %rdx, w0 C FIXME: Use lea?
- xor R32(w1), R32(w1)
- jmp L(m2e1)
-
-L(b11): lea -1(n), i C n = 4, 8, 12, ...
- mov %rdx, w2 C FIXME: Use lea?
- xor R32(w3), R32(w3)
- jmp L(m2e3)
-
-
- ALIGNx
-L(m2top1):
- mul v0
- add %rax, w3
- mov -8(up,i,8), %rax
- mov w3, -8(rp,i,8)
- adc %rdx, w0
- adc $0, R32(w1)
- mul v1
- add %rax, w0
- adc %rdx, w1
-L(m2e1):mov $0, R32(w2)
- mov (up,i,8), %rax
- mul v0
- add %rax, w0
- mov w0, (rp,i,8)
- adc %rdx, w1
- mov (up,i,8), %rax
- adc $0, R32(w2)
- mul v1
- add %rax, w1
- adc %rdx, w2
- mov 8(up,i,8), %rax
- mul v0
- mov $0, R32(w3)
- add %rax, w1
- adc %rdx, w2
- adc $0, R32(w3)
- mov 8(up,i,8), %rax
- mul v1
- add %rax, w2
- mov w1, 8(rp,i,8)
- adc %rdx, w3
- mov $0, R32(w0)
- mov 16(up,i,8), %rax
- mul v0
- add %rax, w2
- mov 16(up,i,8), %rax
- adc %rdx, w3
- adc $0, R32(w0)
- mul v1
- mov $0, R32(w1)
- add %rax, w3
- mov 24(up,i,8), %rax
- mov w2, 16(rp,i,8)
- adc %rdx, w0
- add $4, i
- js L(m2top1)
-
- mul v0
- add %rax, w3
- mov I(-8(up),-8(up,i,8)), %rax
- mov w3, I(-8(rp),-8(rp,i,8))
- adc %rdx, w0
- adc R32(w1), R32(w1)
- mul v1
- add w0, %rax
- adc w1, %rdx
- mov %rax, I((rp),(rp,i,8))
- mov %rdx, I(8(rp),8(rp,i,8))
-
- lea 16(rp), rp
- add $2, n C decrease |n|
- jmp L(am2o3)
-
- ALIGNx
-L(m2top3):
- mul v0
- add %rax, w3
- mov -8(up,i,8), %rax
- mov w3, -8(rp,i,8)
- adc %rdx, w0
- adc $0, R32(w1)
- mul v1
- add %rax, w0
- adc %rdx, w1
- mov $0, R32(w2)
- mov (up,i,8), %rax
- mul v0
- add %rax, w0
- mov w0, (rp,i,8)
- adc %rdx, w1
- mov (up,i,8), %rax
- adc $0, R32(w2)
- mul v1
- add %rax, w1
- adc %rdx, w2
- mov 8(up,i,8), %rax
- mul v0
- mov $0, R32(w3)
- add %rax, w1
- adc %rdx, w2
- adc $0, R32(w3)
- mov 8(up,i,8), %rax
- mul v1
- add %rax, w2
- mov w1, 8(rp,i,8)
- adc %rdx, w3
-L(m2e3):mov $0, R32(w0)
- mov 16(up,i,8), %rax
- mul v0
- add %rax, w2
- mov 16(up,i,8), %rax
- adc %rdx, w3
- adc $0, R32(w0)
- mul v1
- mov $0, R32(w1)
- add %rax, w3
- mov 24(up,i,8), %rax
- mov w2, 16(rp,i,8)
- adc %rdx, w0
- add $4, i
- js L(m2top3)
-
- mul v0
- add %rax, w3
- mov I(-8(up),-8(up,i,8)), %rax
- mov w3, I(-8(rp),-8(rp,i,8))
- adc %rdx, w0
- adc R32(w1), R32(w1)
- mul v1
- add w0, %rax
- adc w1, %rdx
- mov %rax, I((rp),(rp,i,8))
- mov %rdx, I(8(rp),8(rp,i,8))
-
- lea 16(rp), rp
- add $2, n C decrease |n|
- cmp $-1, n
- jz L(cor1) C jumps iff entry n = 4
-
-L(am2o1):
- mov -8(up,n,8), v0
- mov (up,n,8), %rax
- mov %rax, v1
- lea 1(n), i
- mul v0
- mov %rax, X1
- MOV( %rdx, X0, 128)
- mov (rp,n,8), w1
- xor R32(w2), R32(w2)
- mov 8(up,n,8), %rax
- xor R32(w3), R32(w3)
- jmp L(lo1)
-
- ALIGNx
-L(am2top1):
- mul v1
- add w0, w1
- adc %rax, w2
- mov (up,i,8), %rax
- MOV( %rdx, w3, 1)
- adc $0, w3
-L(lo1): mul v0
- add w1, X1
- mov X1, -8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 2)
- adc $0, X1
- mov (up,i,8), %rax
- mul v1
- MOV( %rdx, w0, 4)
- mov (rp,i,8), w1
- add w1, w2
- adc %rax, w3
- adc $0, w0
- mov 8(up,i,8), %rax
- mul v0
- add w2, X0
- adc %rax, X1
- mov X0, (rp,i,8)
- MOV( %rdx, X0, 8)
- adc $0, X0
- mov 8(up,i,8), %rax
- mov 8(rp,i,8), w2
- mul v1
- add w2, w3
- adc %rax, w0
- MOV( %rdx, w1, 16)
- adc $0, w1
- mov 16(up,i,8), %rax
- mul v0
- add w3, X1
- mov X1, 8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 32)
- mov 16(rp,i,8), w3
- adc $0, X1
- mov 16(up,i,8), %rax
- mul v1
- add w3, w0
- MOV( %rdx, w2, 64)
- adc %rax, w1
- mov 24(up,i,8), %rax
- adc $0, w2
- mul v0
- add w0, X0
- mov X0, 16(rp,i,8)
- MOV( %rdx, X0, 128)
- adc %rax, X1
- mov 24(up,i,8), %rax
- mov 24(rp,i,8), w0
- adc $0, X0
- add $4, i
- jnc L(am2top1)
-
- mul v1
- add w0, w1
- adc w2, %rax
- adc Z(i,$0), %rdx
- add w1, X1
- adc Z(i,$0), X0
- mov X1, I(-8(rp),-8(rp,i,8))
- add X0, %rax
- mov %rax, I((rp),(rp,i,8))
- adc Z(i,$0), %rdx
- mov %rdx, I(8(rp),8(rp,i,8))
-
- lea 16(rp), rp
- add $2, n
-
-L(am2o3):
- mov -8(up,n,8), v0
- mov (up,n,8), %rax
- mov %rax, v1
- lea -1(n), i
- mul v0
- mov %rax, X1
- MOV( %rdx, X0, 8)
- mov (rp,n,8), w3
- xor R32(w0), R32(w0)
- xor R32(w1), R32(w1)
- mov 8(up,n,8), %rax
- jmp L(lo3)
-
- ALIGNx
-L(am2top3):
- mul v1
- add w0, w1
- adc %rax, w2
- mov (up,i,8), %rax
- MOV( %rdx, w3, 1)
- adc $0, w3
- mul v0
- add w1, X1
- mov X1, -8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 2)
- adc $0, X1
- mov (up,i,8), %rax
- mul v1
- MOV( %rdx, w0, 4)
- mov (rp,i,8), w1
- add w1, w2
- adc %rax, w3
- adc $0, w0
- mov 8(up,i,8), %rax
- mul v0
- add w2, X0
- adc %rax, X1
- mov X0, (rp,i,8)
- MOV( %rdx, X0, 8)
- adc $0, X0
- mov 8(up,i,8), %rax
- mov 8(rp,i,8), w2
- mul v1
- add w2, w3
- adc %rax, w0
- MOV( %rdx, w1, 16)
- adc $0, w1
- mov 16(up,i,8), %rax
-L(lo3): mul v0
- add w3, X1
- mov X1, 8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 32)
- mov 16(rp,i,8), w3
- adc $0, X1
- mov 16(up,i,8), %rax
- mul v1
- add w3, w0
- MOV( %rdx, w2, 64)
- adc %rax, w1
- mov 24(up,i,8), %rax
- adc $0, w2
- mul v0
- add w0, X0
- mov X0, 16(rp,i,8)
- MOV( %rdx, X0, 128)
- adc %rax, X1
- mov 24(up,i,8), %rax
- mov 24(rp,i,8), w0
- adc $0, X0
- add $4, i
- jnc L(am2top3)
-
- mul v1
- add w0, w1
- adc w2, %rax
- adc Z(i,$0), %rdx
- add w1, X1
- adc Z(i,$0), X0
- mov X1, I(-8(rp),-8(rp,i,8))
- add X0, %rax
- mov %rax, I((rp),(rp,i,8))
- adc Z(i,$0), %rdx
- mov %rdx, I(8(rp),8(rp,i,8))
-
- lea 16(rp), rp
- add $2, n
- cmp $-1, n
- jnz L(am2o1)
-
-L(cor1):pop n
- mov %rdx, w3
- mov -16(up), v0
- mov -8(up), %rax
- mul v0
- add w3, %rax
- adc $0, %rdx
- mov %rax, -8(rp)
- mov %rdx, (rp)
- jmp L(sqr_diag_addlsh1)
-
- ALIGNx
-L(m2top2):
-L(m2e2):mul v0
- add %rax, w3
- mov -8(up,i,8), %rax
- mov w3, -8(rp,i,8)
- adc %rdx, w0
- adc $0, R32(w1)
- mul v1
- add %rax, w0
- adc %rdx, w1
- mov $0, R32(w2)
- mov (up,i,8), %rax
- mul v0
- add %rax, w0
- mov w0, (rp,i,8)
- adc %rdx, w1
- mov (up,i,8), %rax
- adc $0, R32(w2)
- mul v1
- add %rax, w1
- adc %rdx, w2
- mov 8(up,i,8), %rax
- mul v0
- mov $0, R32(w3)
- add %rax, w1
- adc %rdx, w2
- adc $0, R32(w3)
- mov 8(up,i,8), %rax
- mul v1
- add %rax, w2
- mov w1, 8(rp,i,8)
- adc %rdx, w3
- mov $0, R32(w0)
- mov 16(up,i,8), %rax
- mul v0
- add %rax, w2
- mov 16(up,i,8), %rax
- adc %rdx, w3
- adc $0, R32(w0)
- mul v1
- mov $0, R32(w1)
- add %rax, w3
- mov 24(up,i,8), %rax
- mov w2, 16(rp,i,8)
- adc %rdx, w0
- add $4, i
- js L(m2top2)
-
- mul v0
- add %rax, w3
- mov I(-8(up),-8(up,i,8)), %rax
- mov w3, I(-8(rp),-8(rp,i,8))
- adc %rdx, w0
- adc R32(w1), R32(w1)
- mul v1
- add w0, %rax
- adc w1, %rdx
- mov %rax, I((rp),(rp,i,8))
- mov %rdx, I(8(rp),8(rp,i,8))
-
- lea 16(rp), rp
- add $2, n C decrease |n|
- jmp L(am2o0)
-
- ALIGNx
-L(m2top0):
- mul v0
- add %rax, w3
- mov -8(up,i,8), %rax
- mov w3, -8(rp,i,8)
- adc %rdx, w0
- adc $0, R32(w1)
- mul v1
- add %rax, w0
- adc %rdx, w1
- mov $0, R32(w2)
- mov (up,i,8), %rax
- mul v0
- add %rax, w0
- mov w0, (rp,i,8)
- adc %rdx, w1
- mov (up,i,8), %rax
- adc $0, R32(w2)
- mul v1
- add %rax, w1
- adc %rdx, w2
-L(m2e0):mov 8(up,i,8), %rax
- mul v0
- mov $0, R32(w3)
- add %rax, w1
- adc %rdx, w2
- adc $0, R32(w3)
- mov 8(up,i,8), %rax
- mul v1
- add %rax, w2
- mov w1, 8(rp,i,8)
- adc %rdx, w3
- mov $0, R32(w0)
- mov 16(up,i,8), %rax
- mul v0
- add %rax, w2
- mov 16(up,i,8), %rax
- adc %rdx, w3
- adc $0, R32(w0)
- mul v1
- mov $0, R32(w1)
- add %rax, w3
- mov 24(up,i,8), %rax
- mov w2, 16(rp,i,8)
- adc %rdx, w0
- add $4, i
- js L(m2top0)
-
- mul v0
- add %rax, w3
- mov I(-8(up),-8(up,i,8)), %rax
- mov w3, I(-8(rp),-8(rp,i,8))
- adc %rdx, w0
- adc R32(w1), R32(w1)
- mul v1
- add w0, %rax
- adc w1, %rdx
- mov %rax, I((rp),(rp,i,8))
- mov %rdx, I(8(rp),8(rp,i,8))
-
- lea 16(rp), rp
- add $2, n C decrease |n|
- cmp $-2, n
- jz L(cor2) C jumps iff entry n = 5
-
-L(am2o2):
- mov -8(up,n,8), v0
- mov (up,n,8), %rax
- mov %rax, v1
- lea -2(n), i
- mul v0
- mov %rax, X0
- MOV( %rdx, X1, 32)
- mov (rp,n,8), w0
- xor R32(w1), R32(w1)
- xor R32(w2), R32(w2)
- mov 8(up,n,8), %rax
- jmp L(lo2)
-
- ALIGNx
-L(am2top2):
- mul v1
- add w0, w1
- adc %rax, w2
- mov (up,i,8), %rax
- MOV( %rdx, w3, 1)
- adc $0, w3
- mul v0
- add w1, X1
- mov X1, -8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 2)
- adc $0, X1
- mov (up,i,8), %rax
- mul v1
- MOV( %rdx, w0, 4)
- mov (rp,i,8), w1
- add w1, w2
- adc %rax, w3
- adc $0, w0
- mov 8(up,i,8), %rax
- mul v0
- add w2, X0
- adc %rax, X1
- mov X0, (rp,i,8)
- MOV( %rdx, X0, 8)
- adc $0, X0
- mov 8(up,i,8), %rax
- mov 8(rp,i,8), w2
- mul v1
- add w2, w3
- adc %rax, w0
- MOV( %rdx, w1, 16)
- adc $0, w1
- mov 16(up,i,8), %rax
- mul v0
- add w3, X1
- mov X1, 8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 32)
- mov 16(rp,i,8), w3
- adc $0, X1
- mov 16(up,i,8), %rax
- mul v1
- add w3, w0
- MOV( %rdx, w2, 64)
- adc %rax, w1
- mov 24(up,i,8), %rax
- adc $0, w2
-L(lo2): mul v0
- add w0, X0
- mov X0, 16(rp,i,8)
- MOV( %rdx, X0, 128)
- adc %rax, X1
- mov 24(up,i,8), %rax
- mov 24(rp,i,8), w0
- adc $0, X0
- add $4, i
- jnc L(am2top2)
-
- mul v1
- add w0, w1
- adc w2, %rax
- adc Z(i,$0), %rdx
- add w1, X1
- adc Z(i,$0), X0
- mov X1, I(-8(rp),-8(rp,i,8))
- add X0, %rax
- mov %rax, I((rp),(rp,i,8))
- adc Z(i,$0), %rdx
- mov %rdx, I(8(rp),8(rp,i,8))
-
- lea 16(rp), rp
- add $2, n
-
-L(am2o0):
- mov -8(up,n,8), v0
- mov (up,n,8), %rax
- mov %rax, v1
- lea 0(n), i
- mul v0
- mov %rax, X0
- MOV( %rdx, X1, 2)
- xor R32(w0), R32(w0)
- mov (rp,n,8), w2
- xor R32(w3), R32(w3)
- jmp L(lo0)
-
- ALIGNx
-L(am2top0):
- mul v1
- add w0, w1
- adc %rax, w2
- mov (up,i,8), %rax
- MOV( %rdx, w3, 1)
- adc $0, w3
- mul v0
- add w1, X1
- mov X1, -8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 2)
- adc $0, X1
- mov (up,i,8), %rax
- mul v1
- MOV( %rdx, w0, 4)
- mov (rp,i,8), w1
- add w1, w2
- adc %rax, w3
- adc $0, w0
-L(lo0): mov 8(up,i,8), %rax
- mul v0
- add w2, X0
- adc %rax, X1
- mov X0, (rp,i,8)
- MOV( %rdx, X0, 8)
- adc $0, X0
- mov 8(up,i,8), %rax
- mov 8(rp,i,8), w2
- mul v1
- add w2, w3
- adc %rax, w0
- MOV( %rdx, w1, 16)
- adc $0, w1
- mov 16(up,i,8), %rax
- mul v0
- add w3, X1
- mov X1, 8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 32)
- mov 16(rp,i,8), w3
- adc $0, X1
- mov 16(up,i,8), %rax
- mul v1
- add w3, w0
- MOV( %rdx, w2, 64)
- adc %rax, w1
- mov 24(up,i,8), %rax
- adc $0, w2
- mul v0
- add w0, X0
- mov X0, 16(rp,i,8)
- MOV( %rdx, X0, 128)
- adc %rax, X1
- mov 24(up,i,8), %rax
- mov 24(rp,i,8), w0
- adc $0, X0
- add $4, i
- jnc L(am2top0)
-
- mul v1
- add w0, w1
- adc w2, %rax
- adc Z(i,$0), %rdx
- add w1, X1
- adc Z(i,$0), X0
- mov X1, I(-8(rp),-8(rp,i,8))
- add X0, %rax
- mov %rax, I((rp),(rp,i,8))
- adc Z(i,$0), %rdx
- mov %rdx, I(8(rp),8(rp,i,8))
-
- lea 16(rp), rp
- add $2, n
- cmp $-2, n
- jnz L(am2o2)
-
-L(cor2):pop n
- mov -24(up), v0
- mov %rax, w2
- mov %rdx, w0
- mov -16(up), %rax
- mov %rax, v1
- mul v0
- mov %rax, X0
- MOV( %rdx, X1, 32)
- mov -8(up), %rax
- mul v0
- add w2, X0
- mov X0, -16(rp)
- MOV( %rdx, X0, 128)
- adc %rax, X1
- mov -8(up), %rax
- adc $0, X0
- mul v1
- add w0, X1
- adc $0, X0
- mov X1, -8(rp)
- add X0, %rax
- mov %rax, (rp)
- adc $0, %rdx
- mov %rdx, 8(rp)
- lea 8(rp), rp
-
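-C Fused final pass: rp[] = 2*rp[] + up[i]^2 terms.  Each rp limb is
-C doubled with add/adc, the bit shifted out is saved in %rbx with setc
-C and folded back in via lea on the next iteration.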
-L(sqr_diag_addlsh1):
- mov -8(up,n,8), %rax
- shl n
- xor R32(%rbx), R32(%rbx)
- mul %rax
- mov 8(rp,n,8), %r11
- lea (%rdx), %r10
- mov 16(rp,n,8), %r9
- add %r11, %r11
- jmp L(dm)
-
- ALIGNx
-L(dtop):mul %rax
- add %r11, %r10
- mov 8(rp,n,8), %r11
- mov %r10, -8(rp,n,8)
- adc %r9, %rax
- lea (%rdx,%rbx), %r10
- mov 16(rp,n,8), %r9
- adc %r11, %r11
-L(dm): mov %rax, (rp,n,8)
- mov (up,n,4), %rax
- adc %r9, %r9
- setc R8(%rbx)
- add $2, n
- js L(dtop)
-
- mul %rax
- add %r11, %r10
- mov %r10, -8(rp)
- adc %r9, %rax
- lea (%rdx,%rbx), %r10
- mov %rax, (rp)
- adc $0, %r10
- mov %r10, 8(rp)
-
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
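-C Straight-line basecases for n = 1, 2 and 3 (L(n1), L(n2), L(n3)).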
- ALIGN(16)
-L(small):
- mov (up), %rax
- cmp $2, n_param
- jae L(gt1)
-L(n1):
- mul %rax
- mov %rax, (rp)
- mov %rdx, 8(rp)
- FUNC_EXIT()
- ret
-
-L(gt1): jne L(gt2)
-L(n2): mov %rax, %r8
- mul %rax
- mov 8(up), %r11
- mov %rax, (rp)
- mov %r11, %rax
- mov %rdx, %r9
- mul %rax
- mov %rax, %r10
- mov %r11, %rax
- mov %rdx, %r11
- mul %r8
- xor %r8, %r8
- add %rax, %r9
- adc %rdx, %r10
- adc %r8, %r11
- add %rax, %r9
- mov %r9, 8(rp)
- adc %rdx, %r10
- mov %r10, 16(rp)
- adc %r8, %r11
- mov %r11, 24(rp)
- FUNC_EXIT()
- ret
-
-L(gt2):
-L(n3): mov %rax, %r10
- mul %rax
- mov 8(up), %r11
- mov %rax, (rp)
- mov %r11, %rax
- mov %rdx, 8(rp)
- mul %rax
- mov 16(up), %rcx
- mov %rax, 16(rp)
- mov %rcx, %rax
- mov %rdx, 24(rp)
- mul %rax
- mov %rax, 32(rp)
- mov %rdx, 40(rp)
-
- mov %r11, %rax
- mul %r10
- mov %rax, %r8
- mov %rcx, %rax
- mov %rdx, %r9
- mul %r10
- xor %r10, %r10
- add %rax, %r9
- mov %r11, %rax
- mov %r10, %r11
- adc %rdx, %r10
-
- mul %rcx
- add %rax, %r10
- adc %r11, %rdx
- add %r8, %r8
- adc %r9, %r9
- adc %r10, %r10
- adc %rdx, %rdx
- adc %r11, %r11
- add %r8, 8(rp)
- adc %r9, 16(rp)
- adc %r10, 24(rp)
- adc %rdx, 32(rp)
- adc %r11, 40(rp)
- FUNC_EXIT()
- ret
-EPILOGUE()
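For reference, the scheme the deleted sqr_basecase.asm implements is the
classical two-pass basecase squaring: first accumulate the strictly
off-diagonal products up[i]*up[j] (i < j), then finish with
rp[] = 2*(that sum) + the diagonal squares up[i]^2*B^(2i), which is what
the L(sqr_diag_addlsh1) pass above fuses into one loop.  A minimal
portable C sketch of the same arithmetic (limb, dlimb, the function name
and the separate tp[] scratch are illustrative, not GMP's API -- the asm
keeps the triangle in rp[] itself):

    #include <stdint.h>
    #include <string.h>

    typedef uint64_t limb;
    typedef unsigned __int128 dlimb;

    /* rp[0..2n-1] = up[0..n-1]^2, for n >= 1; tp[] is 2n limbs of
       scratch.  B denotes 2^64. */
    static void
    sqr_basecase_model (limb *rp, const limb *up, int n, limb *tp)
    {
      memset (tp, 0, 2 * n * sizeof (limb));

      /* Pass 1: off-diagonal triangle, tp = sum_{i<j} up[i]*up[j]*B^(i+j).
         The asm does this two rows at a time (the mul_2 and addmul_2
         loops above). */
      for (int i = 0; i < n - 1; i++)
        {
          limb cy = 0;
          for (int j = i + 1; j < n; j++)
            {
              dlimb t = (dlimb) up[i] * up[j] + tp[i + j] + cy;
              tp[i + j] = (limb) t;
              cy = (limb) (t >> 64);
            }
          tp[i + n] = cy;
        }

      /* Pass 2: rp = 2*tp + sum up[i]^2*B^(2i), fused as in
         L(sqr_diag_addlsh1): one bit saved from the doubling, one carry
         from the addition. */
      limb top = 0, cy = 0;
      dlimb sq = 0;
      for (int k = 0; k < 2 * n; k++)
        {
          if ((k & 1) == 0)
            sq = (dlimb) up[k / 2] * up[k / 2];
          limb dbl = (tp[k] << 1) | top;   /* limb k of 2*tp */
          top = tp[k] >> 63;
          dlimb s = (dlimb) dbl + ((k & 1) ? (limb) (sq >> 64) : (limb) sq) + cy;
          rp[k] = (limb) s;
          cy = (limb) (s >> 64);
        }
    }

The unrolling, the mul_2/addmul_2 pairing and the carry scheduling in the
asm exist to keep the Core 2 multiplier saturated; the arithmetic it
performs is exactly the above.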
diff --git a/gmp/mpn/x86_64/core2/sublsh1_n.asm b/gmp/mpn/x86_64/core2/sublsh1_n.asm
deleted file mode 100644
index 46488fcafe..0000000000
--- a/gmp/mpn/x86_64/core2/sublsh1_n.asm
+++ /dev/null
@@ -1,47 +0,0 @@
-dnl AMD64 mpn_sublsh1_n optimised for Core 2 and Core iN.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-define(LSH, 1)
-define(RSH, 63)
-
-define(ADDSUB, sub)
-define(ADCSBB, sbb)
-define(func, mpn_sublsh1_n)
-
-MULFUNC_PROLOGUE(mpn_sublsh1_n)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-include_mpn(`x86_64/core2/sublshC_n.asm')
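In C, the operation this wrapper instantiates behaves as follows (a
hedged sketch; sublsh1_n_model is an illustrative name and GMP's real
entry point is the asm function mpn_sublsh1_n):

    #include <stdint.h>

    typedef uint64_t limb;

    /* rp[] = up[] - (vp[] << 1), n > 0 limbs.  The return value folds
       the subtraction borrow together with the bit shifted out of
       vp[n-1], so it lies in 0..2. */
    static limb
    sublsh1_n_model (limb *rp, const limb *up, const limb *vp, long n)
    {
      limb shift_in = 0, borrow = 0;
      for (long i = 0; i < n; i++)
        {
          limb twice = (vp[i] << 1) | shift_in;   /* LSH = 1 */
          shift_in = vp[i] >> 63;                 /* RSH = 63 */
          limb t = up[i] - twice;
          limb b = up[i] < twice;                 /* borrow out of this limb */
          rp[i] = t - borrow;
          borrow = b + (t < borrow);
        }
      return borrow + shift_in;
    }

With LSH = 2 and RSH = 62 the same loop describes mpn_sublsh2_n, whose
wrapper is removed next.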
diff --git a/gmp/mpn/x86_64/core2/sublsh2_n.asm b/gmp/mpn/x86_64/core2/sublsh2_n.asm
deleted file mode 100644
index f3b1e28464..0000000000
--- a/gmp/mpn/x86_64/core2/sublsh2_n.asm
+++ /dev/null
@@ -1,47 +0,0 @@
-dnl AMD64 mpn_sublsh2_n optimised for Core 2 and Core iN.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-define(LSH, 2)
-define(RSH, 62)
-
-define(ADDSUB, sub)
-define(ADCSBB, sbb)
-define(func, mpn_sublsh2_n)
-
-MULFUNC_PROLOGUE(mpn_sublsh2_n)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-include_mpn(`x86_64/core2/sublshC_n.asm')
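Note the pattern the two wrappers above share: each defines only LSH (the
shift count), RSH = 64 - LSH (the complementary shift that recovers the
bits crossing each limb boundary), the ADDSUB/ADCSBB instruction pair and
the exported name, then includes the common loop body from
x86_64/core2/sublshC_n.asm.  That shared body is written once against
those m4 symbols, so mpn_sublsh1_n and mpn_sublsh2_n differ only in the
two shift constants.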