path: root/gmp/mpn/x86_64/fastsse
Diffstat (limited to 'gmp/mpn/x86_64/fastsse')
-rw-r--r--  gmp/mpn/x86_64/fastsse/README                    21
-rw-r--r--  gmp/mpn/x86_64/fastsse/com-palignr.asm          302
-rw-r--r--  gmp/mpn/x86_64/fastsse/com.asm                  161
-rw-r--r--  gmp/mpn/x86_64/fastsse/copyd-palignr.asm        251
-rw-r--r--  gmp/mpn/x86_64/fastsse/copyd.asm                145
-rw-r--r--  gmp/mpn/x86_64/fastsse/copyi-palignr.asm        295
-rw-r--r--  gmp/mpn/x86_64/fastsse/copyi.asm                166
-rw-r--r--  gmp/mpn/x86_64/fastsse/lshift-movdqu2.asm       182
-rw-r--r--  gmp/mpn/x86_64/fastsse/lshift.asm               169
-rw-r--r--  gmp/mpn/x86_64/fastsse/lshiftc-movdqu2.asm      193
-rw-r--r--  gmp/mpn/x86_64/fastsse/lshiftc.asm              179
-rw-r--r--  gmp/mpn/x86_64/fastsse/rshift-movdqu2.asm       201
-rw-r--r--  gmp/mpn/x86_64/fastsse/sec_tabselect.asm        192
13 files changed, 0 insertions, 2457 deletions
diff --git a/gmp/mpn/x86_64/fastsse/README b/gmp/mpn/x86_64/fastsse/README
deleted file mode 100644
index 520551ed99..0000000000
--- a/gmp/mpn/x86_64/fastsse/README
+++ /dev/null
@@ -1,21 +0,0 @@
-This directory contains code for x86-64 processors with fast
-implementations of SSE operations, hence the name "fastsse".
-
-Current processors that might benefit from this code are:
-
- AMD K10
- AMD Bulldozer
- Intel Nocona
- Intel Nehalem/Westmere
- Intel Sandybridge/Ivybridge
- VIA Nano
-
-Current processors that do not benefit from this code are:
-
- AMD K8
- AMD Bobcat
- Intel Atom
-
-Intel Conroe/Penryn is a border case; its handling of non-aligned
-128-bit memory operands is poor. VIA Nano also has poor handling of
-non-aligned operands.
diff --git a/gmp/mpn/x86_64/fastsse/com-palignr.asm b/gmp/mpn/x86_64/fastsse/com-palignr.asm
deleted file mode 100644
index d9641e890d..0000000000
--- a/gmp/mpn/x86_64/fastsse/com-palignr.asm
+++ /dev/null
@@ -1,302 +0,0 @@
-dnl AMD64 mpn_com optimised for CPUs with fast SSE copying and SSSE3.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb cycles/limb cycles/limb good
-C aligned unaligned best seen for cpu?
-C AMD K8,K9 2.0 illop 1.0/1.0 N
-C AMD K10 0.85 illop Y/N
-C AMD bd1 1.39 ? 1.45 Y/N
-C AMD bobcat 1.97 ? 8.17 1.5/1.5 N
-C Intel P4 2.26 illop Y/N
-C Intel core2 0.52 0.82 opt/0.74 Y
-C Intel NHM 0.52 0.65 opt/opt Y
-C Intel SBR 0.51 0.55 opt/0.51 Y
-C Intel atom 1.16 1.70 opt/opt Y
-C VIA nano 1.09 1.10 opt/opt Y
-
-C We use only 16-byte operations, except for unaligned top-most and bottom-most
-C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). That
-C instruction is better adapted to mpn_copyd's needs, so we need to contort the
-C code to use it here.
-C
-C For operands of < COM_SSE_THRESHOLD limbs, we use a plain 64-bit loop, taken
-C from the x86_64 default code.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n', `%rdx')
-
-C There are three instructions for loading an aligned 128-bit quantity. We use
-C movaps, since it has the shortest coding.
-define(`movdqa', ``movaps'')
-
-ifdef(`COM_SSE_THRESHOLD',`',`define(`COM_SSE_THRESHOLD', 7)')
-
-ASM_START()
- TEXT
- ALIGN(64)
-PROLOGUE(mpn_com)
- FUNC_ENTRY(3)
-
- cmp $COM_SSE_THRESHOLD, n
- jbe L(bc)
-
- pcmpeqb %xmm7, %xmm7 C set to 111...111
-
- test $8, R8(rp) C is rp 16-byte aligned?
- jz L(rp_aligned) C jump if rp aligned
-
- mov (up), %r8
- lea 8(up), up
- not %r8
- mov %r8, (rp)
- lea 8(rp), rp
- dec n
-
-L(rp_aligned):
- test $8, R8(up)
- jnz L(uent)
-
-ifelse(eval(COM_SSE_THRESHOLD >= 8),1,
-` sub $8, n',
-` jmp L(am)')
-
- ALIGN(16)
-L(atop):movdqa 0(up), %xmm0
- movdqa 16(up), %xmm1
- movdqa 32(up), %xmm2
- movdqa 48(up), %xmm3
- lea 64(up), up
- pxor %xmm7, %xmm0
- pxor %xmm7, %xmm1
- pxor %xmm7, %xmm2
- pxor %xmm7, %xmm3
- movdqa %xmm0, (rp)
- movdqa %xmm1, 16(rp)
- movdqa %xmm2, 32(rp)
- movdqa %xmm3, 48(rp)
- lea 64(rp), rp
-L(am): sub $8, n
- jnc L(atop)
-
- test $4, R8(n)
- jz 1f
- movdqa (up), %xmm0
- movdqa 16(up), %xmm1
- lea 32(up), up
- pxor %xmm7, %xmm0
- pxor %xmm7, %xmm1
- movdqa %xmm0, (rp)
- movdqa %xmm1, 16(rp)
- lea 32(rp), rp
-
-1: test $2, R8(n)
- jz 1f
- movdqa (up), %xmm0
- lea 16(up), up
- pxor %xmm7, %xmm0
- movdqa %xmm0, (rp)
- lea 16(rp), rp
-
-1: test $1, R8(n)
- jz 1f
- mov (up), %r8
- not %r8
- mov %r8, (rp)
-
-1: FUNC_EXIT()
- ret
-
-L(uent):
-C Code handling up - rp = 8 (mod 16)
-
-C FIXME: The code below only handles overlap if it is close to complete, or
-C quite separate: up-rp < 5 or up-rp > 15 limbs
- lea -40(up), %rax C 40 = 5 * GMP_LIMB_BYTES
- sub rp, %rax
- cmp $80, %rax C 80 = (15-5) * GMP_LIMB_BYTES
- jbe L(bc) C deflect to plain loop
-
- sub $16, n
- jc L(uend)
-
- movdqa 120(up), %xmm3
-
- sub $16, n
- jmp L(um)
-
- ALIGN(16)
-L(utop):movdqa 120(up), %xmm3
- pxor %xmm7, %xmm0
- movdqa %xmm0, -128(rp)
- sub $16, n
-L(um): movdqa 104(up), %xmm2
- palignr($8, %xmm2, %xmm3)
- movdqa 88(up), %xmm1
- pxor %xmm7, %xmm3
- movdqa %xmm3, 112(rp)
- palignr($8, %xmm1, %xmm2)
- movdqa 72(up), %xmm0
- pxor %xmm7, %xmm2
- movdqa %xmm2, 96(rp)
- palignr($8, %xmm0, %xmm1)
- movdqa 56(up), %xmm3
- pxor %xmm7, %xmm1
- movdqa %xmm1, 80(rp)
- palignr($8, %xmm3, %xmm0)
- movdqa 40(up), %xmm2
- pxor %xmm7, %xmm0
- movdqa %xmm0, 64(rp)
- palignr($8, %xmm2, %xmm3)
- movdqa 24(up), %xmm1
- pxor %xmm7, %xmm3
- movdqa %xmm3, 48(rp)
- palignr($8, %xmm1, %xmm2)
- movdqa 8(up), %xmm0
- pxor %xmm7, %xmm2
- movdqa %xmm2, 32(rp)
- palignr($8, %xmm0, %xmm1)
- movdqa -8(up), %xmm3
- pxor %xmm7, %xmm1
- movdqa %xmm1, 16(rp)
- palignr($8, %xmm3, %xmm0)
- lea 128(up), up
- lea 128(rp), rp
- jnc L(utop)
-
- pxor %xmm7, %xmm0
- movdqa %xmm0, -128(rp)
-
-L(uend):test $8, R8(n)
- jz 1f
- movdqa 56(up), %xmm3
- movdqa 40(up), %xmm2
- palignr($8, %xmm2, %xmm3)
- movdqa 24(up), %xmm1
- pxor %xmm7, %xmm3
- movdqa %xmm3, 48(rp)
- palignr($8, %xmm1, %xmm2)
- movdqa 8(up), %xmm0
- pxor %xmm7, %xmm2
- movdqa %xmm2, 32(rp)
- palignr($8, %xmm0, %xmm1)
- movdqa -8(up), %xmm3
- pxor %xmm7, %xmm1
- movdqa %xmm1, 16(rp)
- palignr($8, %xmm3, %xmm0)
- lea 64(up), up
- pxor %xmm7, %xmm0
- movdqa %xmm0, (rp)
- lea 64(rp), rp
-
-1: test $4, R8(n)
- jz 1f
- movdqa 24(up), %xmm1
- movdqa 8(up), %xmm0
- palignr($8, %xmm0, %xmm1)
- movdqa -8(up), %xmm3
- pxor %xmm7, %xmm1
- movdqa %xmm1, 16(rp)
- palignr($8, %xmm3, %xmm0)
- lea 32(up), up
- pxor %xmm7, %xmm0
- movdqa %xmm0, (rp)
- lea 32(rp), rp
-
-1: test $2, R8(n)
- jz 1f
- movdqa 8(up), %xmm0
- movdqa -8(up), %xmm3
- palignr($8, %xmm3, %xmm0)
- lea 16(up), up
- pxor %xmm7, %xmm0
- movdqa %xmm0, (rp)
- lea 16(rp), rp
-
-1: test $1, R8(n)
- jz 1f
- mov (up), %r8
- not %r8
- mov %r8, (rp)
-
-1: FUNC_EXIT()
- ret
-
-C Basecase code. Needed for good small operands speed, not for
-C correctness as the above code is currently written.
-
-L(bc): lea -8(rp), rp
- sub $4, R32(n)
- jc L(end)
-
-ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
-` ALIGN(16)')
-L(top): mov (up), %r8
- mov 8(up), %r9
- lea 32(rp), rp
- mov 16(up), %r10
- mov 24(up), %r11
- lea 32(up), up
- not %r8
- not %r9
- not %r10
- not %r11
- mov %r8, -24(rp)
- mov %r9, -16(rp)
-ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
-` sub $4, R32(n)')
- mov %r10, -8(rp)
- mov %r11, (rp)
-ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
-` jnc L(top)')
-
-L(end): test $1, R8(n)
- jz 1f
- mov (up), %r8
- not %r8
- mov %r8, 8(rp)
- lea 8(rp), rp
- lea 8(up), up
-1: test $2, R8(n)
- jz 1f
- mov (up), %r8
- mov 8(up), %r9
- not %r8
- not %r9
- mov %r8, 8(rp)
- mov %r9, 16(rp)
-1: FUNC_EXIT()
- ret
-EPILOGUE()
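For reference, mpn_com stores the one's complement of an n-limb operand. A
minimal C sketch of the operation the deleted routine implements (uint64_t
stands in for mp_limb_t, and ref_com is a made-up name); the SSE code above
does the same thing 16 bytes at a time by pxor-ing against an all-ones
register:

    #include <stddef.h>
    #include <stdint.h>

    /* Reference semantics only: complement n limbs from up into rp. */
    static void ref_com(uint64_t *rp, const uint64_t *up, size_t n)
    {
        for (size_t i = 0; i < n; i++)
            rp[i] = ~up[i];
    }
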
diff --git a/gmp/mpn/x86_64/fastsse/com.asm b/gmp/mpn/x86_64/fastsse/com.asm
deleted file mode 100644
index 4abb076d3f..0000000000
--- a/gmp/mpn/x86_64/fastsse/com.asm
+++ /dev/null
@@ -1,161 +0,0 @@
-dnl AMD64 mpn_com optimised for CPUs with fast SSE.
-
-dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb cycles/limb cycles/limb good
-C aligned unaligned best seen for cpu?
-C AMD K8,K9 2.0 2.0 N
-C AMD K10 0.85 1.3 Y/N
-C AMD bd1 1.40 1.40 Y
-C AMD bobcat 3.1 3.1 N
-C Intel P4 2.28 illop Y
-C Intel core2 1.02 1.02 N
-C Intel NHM 0.53 0.68 Y
-C Intel SBR 0.51 0.75 Y
-C Intel atom 3.68 3.68 N
-C VIA nano 1.17 5.09 Y/N
-
-C We try to do as many 16-byte operations as possible. The top-most and
-C bottom-most writes might need 8-byte operations. We can always write using
-C aligned 16-byte operations, we read with both aligned and unaligned 16-byte
-C operations.
-
-C Instead of having separate loops for reading aligned and unaligned, we read
-C using MOVDQU. This seems to work great except for core2; there performance
-C doubles when reading using MOVDQA (for aligned source). It is unclear how to
-C best handle the unaligned case there.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n', `%rdx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_com)
- FUNC_ENTRY(3)
-
- test n, n
- jz L(don)
-
- pcmpeqb %xmm7, %xmm7 C set to 111...111
-
- test $8, R8(rp) C is rp 16-byte aligned?
- jz L(ali) C jump if rp aligned
- mov (up), %rax
- lea 8(up), up
- not %rax
- mov %rax, (rp)
- lea 8(rp), rp
- dec n
-
- sub $14, n
- jc L(sma)
-
- ALIGN(16)
-L(top): movdqu (up), %xmm0
- movdqu 16(up), %xmm1
- movdqu 32(up), %xmm2
- movdqu 48(up), %xmm3
- movdqu 64(up), %xmm4
- movdqu 80(up), %xmm5
- movdqu 96(up), %xmm6
- lea 112(up), up
- pxor %xmm7, %xmm0
- pxor %xmm7, %xmm1
- pxor %xmm7, %xmm2
- pxor %xmm7, %xmm3
- pxor %xmm7, %xmm4
- pxor %xmm7, %xmm5
- pxor %xmm7, %xmm6
- movdqa %xmm0, (rp)
- movdqa %xmm1, 16(rp)
- movdqa %xmm2, 32(rp)
- movdqa %xmm3, 48(rp)
- movdqa %xmm4, 64(rp)
- movdqa %xmm5, 80(rp)
- movdqa %xmm6, 96(rp)
- lea 112(rp), rp
-L(ali): sub $14, n
- jnc L(top)
-
-L(sma): add $14, n
- test $8, R8(n)
- jz 1f
- movdqu (up), %xmm0
- movdqu 16(up), %xmm1
- movdqu 32(up), %xmm2
- movdqu 48(up), %xmm3
- lea 64(up), up
- pxor %xmm7, %xmm0
- pxor %xmm7, %xmm1
- pxor %xmm7, %xmm2
- pxor %xmm7, %xmm3
- movdqa %xmm0, (rp)
- movdqa %xmm1, 16(rp)
- movdqa %xmm2, 32(rp)
- movdqa %xmm3, 48(rp)
- lea 64(rp), rp
-1:
- test $4, R8(n)
- jz 1f
- movdqu (up), %xmm0
- movdqu 16(up), %xmm1
- lea 32(up), up
- pxor %xmm7, %xmm0
- pxor %xmm7, %xmm1
- movdqa %xmm0, (rp)
- movdqa %xmm1, 16(rp)
- lea 32(rp), rp
-1:
- test $2, R8(n)
- jz 1f
- movdqu (up), %xmm0
- lea 16(up), up
- pxor %xmm7, %xmm0
- movdqa %xmm0, (rp)
- lea 16(rp), rp
-1:
- test $1, R8(n)
- jz 1f
- mov (up), %rax
- not %rax
- mov %rax, (rp)
-1:
-L(don): FUNC_EXIT()
- ret
-EPILOGUE()
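The comments above describe the com.asm strategy: peel one limb if needed so
the destination is 16-byte aligned, then loop with unaligned loads (movdqu),
pxor against all-ones, and aligned stores (movdqa). A rough SSE2-intrinsics
sketch of that shape, assuming 8-byte-aligned limb pointers and n > 0; this
is an illustration, not the deleted code:

    #include <emmintrin.h>          /* SSE2 intrinsics */
    #include <stddef.h>
    #include <stdint.h>

    static void sketch_com_sse2(uint64_t *rp, const uint64_t *up, size_t n)
    {
        if (((uintptr_t)rp & 15) != 0) {    /* rp not 16-byte aligned: */
            *rp++ = ~*up++;                 /* handle one limb, like the prologue */
            n--;
        }
        __m128i ones = _mm_set1_epi32(-1);  /* all-ones mask, like pcmpeqb %xmm7 */
        while (n >= 2) {
            __m128i x = _mm_loadu_si128((const __m128i *)up);        /* movdqu */
            _mm_store_si128((__m128i *)rp, _mm_xor_si128(x, ones));  /* pxor + movdqa */
            up += 2;  rp += 2;  n -= 2;
        }
        if (n)                              /* odd trailing limb */
            *rp = ~*up;
    }
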
diff --git a/gmp/mpn/x86_64/fastsse/copyd-palignr.asm b/gmp/mpn/x86_64/fastsse/copyd-palignr.asm
deleted file mode 100644
index 7430cadc09..0000000000
--- a/gmp/mpn/x86_64/fastsse/copyd-palignr.asm
+++ /dev/null
@@ -1,251 +0,0 @@
-dnl AMD64 mpn_copyd optimised for CPUs with fast SSE copying and SSSE3.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb cycles/limb cycles/limb good
-C aligned unaligned best seen for cpu?
-C AMD K8,K9 2.0 illop 1.0/1.0 N
-C AMD K10 0.85 illop Y/N
-C AMD bull 0.70 0.70 Y
-C AMD pile 0.68 0.68 Y
-C AMD steam ? ?
-C AMD bobcat 1.97 8.24 1.5/1.5 N
-C AMD jaguar ? ?
-C Intel P4 2.26 illop Y/N
-C Intel core 0.52 0.68-0.80 opt/0.64 Y
-C Intel NHM 0.52 0.64 opt/opt Y
-C Intel SBR 0.51 0.51 opt/0.51 Y
-C Intel IBR ? ? Y
-C Intel HWL 0.51 0.51 0.25/0.25 N
-C Intel atom 1.16 1.66 opt/opt Y
-C VIA nano 1.08 1.06 opt/opt Y
-
-C We use only 16-byte operations, except for unaligned top-most and bottom-most
-C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).
-C
-C For operands of < COPYD_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
-C taken from the x86_64 default code.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n', `%rdx')
-
-C There are three instructions for loading an aligned 128-bit quantity. We use
-C movaps, since it has the shortest coding.
-define(`movdqa', ``movaps'')
-
-ifdef(`COPYD_SSE_THRESHOLD',`',`define(`COPYD_SSE_THRESHOLD', 7)')
-
-ASM_START()
- TEXT
- ALIGN(64)
-PROLOGUE(mpn_copyd)
- FUNC_ENTRY(3)
-
- lea -8(up,n,8), up
- lea -8(rp,n,8), rp
-
- cmp $COPYD_SSE_THRESHOLD, n
- jbe L(bc)
-
- test $8, R8(rp) C is rp 16-byte aligned?
- jnz L(rp_aligned) C jump if rp aligned
-
- mov (up), %rax C copy one limb
- mov %rax, (rp)
- lea -8(up), up
- lea -8(rp), rp
- dec n
-
-L(rp_aligned):
- test $8, R8(up)
- jz L(uent)
-
-ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
-` sub $8, n',
-` jmp L(am)')
-
- ALIGN(16)
-L(atop):movdqa -8(up), %xmm0
- movdqa -24(up), %xmm1
- movdqa -40(up), %xmm2
- movdqa -56(up), %xmm3
- lea -64(up), up
- movdqa %xmm0, -8(rp)
- movdqa %xmm1, -24(rp)
- movdqa %xmm2, -40(rp)
- movdqa %xmm3, -56(rp)
- lea -64(rp), rp
-L(am): sub $8, n
- jnc L(atop)
-
- test $4, R8(n)
- jz 1f
- movdqa -8(up), %xmm0
- movdqa -24(up), %xmm1
- lea -32(up), up
- movdqa %xmm0, -8(rp)
- movdqa %xmm1, -24(rp)
- lea -32(rp), rp
-
-1: test $2, R8(n)
- jz 1f
- movdqa -8(up), %xmm0
- lea -16(up), up
- movdqa %xmm0, -8(rp)
- lea -16(rp), rp
-
-1: test $1, R8(n)
- jz 1f
- mov (up), %r8
- mov %r8, (rp)
-
-1: FUNC_EXIT()
- ret
-
-L(uent):sub $16, n
- movdqa (up), %xmm0
- jc L(uend)
-
- ALIGN(16)
-L(utop):sub $16, n
- movdqa -16(up), %xmm1
- palignr($8, %xmm1, %xmm0)
- movdqa %xmm0, -8(rp)
- movdqa -32(up), %xmm2
- palignr($8, %xmm2, %xmm1)
- movdqa %xmm1, -24(rp)
- movdqa -48(up), %xmm3
- palignr($8, %xmm3, %xmm2)
- movdqa %xmm2, -40(rp)
- movdqa -64(up), %xmm0
- palignr($8, %xmm0, %xmm3)
- movdqa %xmm3, -56(rp)
- movdqa -80(up), %xmm1
- palignr($8, %xmm1, %xmm0)
- movdqa %xmm0, -72(rp)
- movdqa -96(up), %xmm2
- palignr($8, %xmm2, %xmm1)
- movdqa %xmm1, -88(rp)
- movdqa -112(up), %xmm3
- palignr($8, %xmm3, %xmm2)
- movdqa %xmm2, -104(rp)
- movdqa -128(up), %xmm0
- palignr($8, %xmm0, %xmm3)
- movdqa %xmm3, -120(rp)
- lea -128(up), up
- lea -128(rp), rp
- jnc L(utop)
-
-L(uend):test $8, R8(n)
- jz 1f
- movdqa -16(up), %xmm1
- palignr($8, %xmm1, %xmm0)
- movdqa %xmm0, -8(rp)
- movdqa -32(up), %xmm0
- palignr($8, %xmm0, %xmm1)
- movdqa %xmm1, -24(rp)
- movdqa -48(up), %xmm1
- palignr($8, %xmm1, %xmm0)
- movdqa %xmm0, -40(rp)
- movdqa -64(up), %xmm0
- palignr($8, %xmm0, %xmm1)
- movdqa %xmm1, -56(rp)
- lea -64(up), up
- lea -64(rp), rp
-
-1: test $4, R8(n)
- jz 1f
- movdqa -16(up), %xmm1
- palignr($8, %xmm1, %xmm0)
- movdqa %xmm0, -8(rp)
- movdqa -32(up), %xmm0
- palignr($8, %xmm0, %xmm1)
- movdqa %xmm1, -24(rp)
- lea -32(up), up
- lea -32(rp), rp
-
-1: test $2, R8(n)
- jz 1f
- movdqa -16(up), %xmm1
- palignr($8, %xmm1, %xmm0)
- movdqa %xmm0, -8(rp)
- lea -16(up), up
- lea -16(rp), rp
-
-1: test $1, R8(n)
- jz 1f
- mov (up), %r8
- mov %r8, (rp)
-
-1: FUNC_EXIT()
- ret
-
-C Basecase code. Needed for good small operands speed, not for
-C correctness as the above code is currently written.
-
-L(bc): sub $4, R32(n)
- jc L(end)
-
- ALIGN(16)
-L(top): mov (up), %r8
- mov -8(up), %r9
- lea -32(rp), rp
- mov -16(up), %r10
- mov -24(up), %r11
- lea -32(up), up
- mov %r8, 32(rp)
- mov %r9, 24(rp)
-ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
-` sub $4, R32(n)')
- mov %r10, 16(rp)
- mov %r11, 8(rp)
-ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
-` jnc L(top)')
-
-L(end): test $1, R8(n)
- jz 1f
- mov (up), %r8
- mov %r8, (rp)
- lea -8(rp), rp
- lea -8(up), up
-1: test $2, R8(n)
- jz 1f
- mov (up), %r8
- mov -8(up), %r9
- mov %r8, (rp)
- mov %r9, -8(rp)
-1: FUNC_EXIT()
- ret
-EPILOGUE()
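The palignr trick used above combines two aligned 16-byte loads into the 16
bytes that straddle them, which is what the loops do when rp and up are
mutually offset by 8 bytes. A hedged SSSE3-intrinsics sketch of one forward
pass (the deleted mpn_copyd code runs the same idea from the top down); it
assumes rp is 16-byte aligned, up is 8 bytes past a 16-byte boundary, n > 0,
and a compiler flag such as -mssse3:

    #include <tmmintrin.h>          /* SSSE3: _mm_alignr_epi8 == palignr */
    #include <stddef.h>
    #include <stdint.h>

    static void sketch_copy_palignr(uint64_t *rp, const uint64_t *up, size_t n)
    {
        /* Seed with up[0] in the high half; palignr below only uses that half. */
        __m128i prev = _mm_slli_si128(_mm_loadl_epi64((const __m128i *)up), 8);
        size_t i = 0;
        for (; i + 3 <= n; i += 2) {
            __m128i next = _mm_load_si128((const __m128i *)(up + i + 1)); /* aligned */
            /* (next:prev) >> 8 bytes  ==  { up[i], up[i+1] } */
            _mm_store_si128((__m128i *)(rp + i), _mm_alignr_epi8(next, prev, 8));
            prev = next;
        }
        for (; i < n; i++)          /* one or two trailing limbs */
            rp[i] = up[i];
    }
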
diff --git a/gmp/mpn/x86_64/fastsse/copyd.asm b/gmp/mpn/x86_64/fastsse/copyd.asm
deleted file mode 100644
index 5c6094c7e2..0000000000
--- a/gmp/mpn/x86_64/fastsse/copyd.asm
+++ /dev/null
@@ -1,145 +0,0 @@
-dnl AMD64 mpn_copyd optimised for CPUs with fast SSE.
-
-dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb good for cpu?
-C AMD K8,K9
-C AMD K10 0.85 Y
-C AMD bd1 0.8 Y
-C AMD bobcat
-C Intel P4 2.28 Y
-C Intel core2 1
-C Intel NHM 0.5 Y
-C Intel SBR 0.5 Y
-C Intel atom
-C VIA nano 1.1 Y
-
-C We try to do as many 16-byte operations as possible. The top-most and
-C bottom-most writes might need 8-byte operations. We can always write using
-C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
-C operations.
-
-C Instead of having separate loops for reading aligned and unaligned, we read
-C using MOVDQU. This seems to work great except for core2, where performance
-C doubles when reading using MOVDQA (for aligned source). It is unclear how to
-C best handle the unaligned case there.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n', `%rdx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_copyd)
- FUNC_ENTRY(3)
-
- test n, n
- jz L(don)
-
- lea -16(rp,n,8), rp
- lea -16(up,n,8), up
-
- test $8, R8(rp) C is rp 16-byte aligned?
- jz L(ali) C jump if rp aligned
- mov 8(up), %rax
- lea -8(up), up
- mov %rax, 8(rp)
- lea -8(rp), rp
- dec n
-
- sub $16, n
- jc L(sma)
-
- ALIGN(16)
-L(top): movdqu (up), %xmm0
- movdqu -16(up), %xmm1
- movdqu -32(up), %xmm2
- movdqu -48(up), %xmm3
- movdqu -64(up), %xmm4
- movdqu -80(up), %xmm5
- movdqu -96(up), %xmm6
- movdqu -112(up), %xmm7
- lea -128(up), up
- movdqa %xmm0, (rp)
- movdqa %xmm1, -16(rp)
- movdqa %xmm2, -32(rp)
- movdqa %xmm3, -48(rp)
- movdqa %xmm4, -64(rp)
- movdqa %xmm5, -80(rp)
- movdqa %xmm6, -96(rp)
- movdqa %xmm7, -112(rp)
- lea -128(rp), rp
-L(ali): sub $16, n
- jnc L(top)
-
-L(sma): test $8, R8(n)
- jz 1f
- movdqu (up), %xmm0
- movdqu -16(up), %xmm1
- movdqu -32(up), %xmm2
- movdqu -48(up), %xmm3
- lea -64(up), up
- movdqa %xmm0, (rp)
- movdqa %xmm1, -16(rp)
- movdqa %xmm2, -32(rp)
- movdqa %xmm3, -48(rp)
- lea -64(rp), rp
-1:
- test $4, R8(n)
- jz 1f
- movdqu (up), %xmm0
- movdqu -16(up), %xmm1
- lea -32(up), up
- movdqa %xmm0, (rp)
- movdqa %xmm1, -16(rp)
- lea -32(rp), rp
-1:
- test $2, R8(n)
- jz 1f
- movdqu (up), %xmm0
- lea -16(up), up
- movdqa %xmm0, (rp)
- lea -16(rp), rp
-1:
- test $1, R8(n)
- jz 1f
- mov 8(up), %r8
- mov %r8, 8(rp)
-1:
-L(don): FUNC_EXIT()
- ret
-EPILOGUE()
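For reference, mpn_copyd copies n limbs starting from the most significant
end, which is the overlap-safe direction when rp >= up. A trivial C statement
of that semantics (illustration only, uint64_t in place of mp_limb_t):

    #include <stddef.h>
    #include <stdint.h>

    /* Reference semantics only: high-to-low limb copy. */
    static void ref_copyd(uint64_t *rp, const uint64_t *up, size_t n)
    {
        for (size_t i = n; i-- > 0; )
            rp[i] = up[i];
    }
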
diff --git a/gmp/mpn/x86_64/fastsse/copyi-palignr.asm b/gmp/mpn/x86_64/fastsse/copyi-palignr.asm
deleted file mode 100644
index fda3c3500f..0000000000
--- a/gmp/mpn/x86_64/fastsse/copyi-palignr.asm
+++ /dev/null
@@ -1,295 +0,0 @@
-dnl AMD64 mpn_copyi optimised for CPUs with fast SSE copying and SSSE3.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb cycles/limb cycles/limb good
-C aligned unaligned best seen for cpu?
-C AMD K8,K9 2.0 illop 1.0/1.0 N
-C AMD K10 0.85 illop Y/N
-C AMD bull 0.70 0.66 Y
-C AMD pile 0.68 0.66 Y
-C AMD steam ? ?
-C AMD bobcat 1.97 8.16 1.5/1.5 N
-C AMD jaguar ? ?
-C Intel P4 2.26 illop Y/N
-C Intel core 0.52 0.64 opt/opt Y
-C Intel NHM 0.52 0.71 opt/opt Y
-C Intel SBR 0.51 0.54 opt/0.51 Y
-C Intel IBR ? ? Y
-C Intel HWL 0.51 0.52 0.25/0.25 N
-C Intel atom 1.16 1.61 opt/opt Y
-C VIA nano 1.09 1.08 opt/opt Y
-
-C We use only 16-byte operations, except for unaligned top-most and bottom-most
-C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). That
-C instruction is better adapted to mpn_copyd's needs, so we need to contort the
-C code to use it here.
-C
-C For operands of < COPYI_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
-C taken from the x86_64 default code.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n', `%rdx')
-
-C There are three instructions for loading an aligned 128-bit quantity. We use
-C movaps, since it has the shortest coding.
-dnl define(`movdqa', ``movaps'')
-
-ifdef(`COPYI_SSE_THRESHOLD',`',`define(`COPYI_SSE_THRESHOLD', 7)')
-
-ASM_START()
- TEXT
- ALIGN(64)
-PROLOGUE(mpn_copyi)
- FUNC_ENTRY(3)
-
- cmp $COPYI_SSE_THRESHOLD, n
- jbe L(bc)
-
- test $8, R8(rp) C is rp 16-byte aligned?
- jz L(rp_aligned) C jump if rp aligned
-
- movsq C copy one limb
- dec n
-
-L(rp_aligned):
- test $8, R8(up)
- jnz L(uent)
-
-ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
-` sub $8, n',
-` jmp L(am)')
-
- ALIGN(16)
-L(atop):movdqa 0(up), %xmm0
- movdqa 16(up), %xmm1
- movdqa 32(up), %xmm2
- movdqa 48(up), %xmm3
- lea 64(up), up
- movdqa %xmm0, (rp)
- movdqa %xmm1, 16(rp)
- movdqa %xmm2, 32(rp)
- movdqa %xmm3, 48(rp)
- lea 64(rp), rp
-L(am): sub $8, n
- jnc L(atop)
-
- test $4, R8(n)
- jz 1f
- movdqa (up), %xmm0
- movdqa 16(up), %xmm1
- lea 32(up), up
- movdqa %xmm0, (rp)
- movdqa %xmm1, 16(rp)
- lea 32(rp), rp
-
-1: test $2, R8(n)
- jz 1f
- movdqa (up), %xmm0
- lea 16(up), up
- movdqa %xmm0, (rp)
- lea 16(rp), rp
-
-1: test $1, R8(n)
- jz 1f
- mov (up), %r8
- mov %r8, (rp)
-
-1: FUNC_EXIT()
- ret
-
-L(uent):
-C Code handling up - rp = 8 (mod 16)
-
- cmp $16, n
- jc L(ued0)
-
-IFDOS(` add $-56, %rsp ')
-IFDOS(` movdqa %xmm6, (%rsp) ')
-IFDOS(` movdqa %xmm7, 16(%rsp) ')
-IFDOS(` movdqa %xmm8, 32(%rsp) ')
-
- movaps 120(up), %xmm7
- movaps 104(up), %xmm6
- movaps 88(up), %xmm5
- movaps 72(up), %xmm4
- movaps 56(up), %xmm3
- movaps 40(up), %xmm2
- lea 128(up), up
- sub $32, n
- jc L(ued1)
-
- ALIGN(16)
-L(utop):movaps -104(up), %xmm1
- sub $16, n
- movaps -120(up), %xmm0
- palignr($8, %xmm6, %xmm7)
- movaps -136(up), %xmm8
- movdqa %xmm7, 112(rp)
- palignr($8, %xmm5, %xmm6)
- movaps 120(up), %xmm7
- movdqa %xmm6, 96(rp)
- palignr($8, %xmm4, %xmm5)
- movaps 104(up), %xmm6
- movdqa %xmm5, 80(rp)
- palignr($8, %xmm3, %xmm4)
- movaps 88(up), %xmm5
- movdqa %xmm4, 64(rp)
- palignr($8, %xmm2, %xmm3)
- movaps 72(up), %xmm4
- movdqa %xmm3, 48(rp)
- palignr($8, %xmm1, %xmm2)
- movaps 56(up), %xmm3
- movdqa %xmm2, 32(rp)
- palignr($8, %xmm0, %xmm1)
- movaps 40(up), %xmm2
- movdqa %xmm1, 16(rp)
- palignr($8, %xmm8, %xmm0)
- lea 128(up), up
- movdqa %xmm0, (rp)
- lea 128(rp), rp
- jnc L(utop)
-
-L(ued1):movaps -104(up), %xmm1
- movaps -120(up), %xmm0
- movaps -136(up), %xmm8
- palignr($8, %xmm6, %xmm7)
- movdqa %xmm7, 112(rp)
- palignr($8, %xmm5, %xmm6)
- movdqa %xmm6, 96(rp)
- palignr($8, %xmm4, %xmm5)
- movdqa %xmm5, 80(rp)
- palignr($8, %xmm3, %xmm4)
- movdqa %xmm4, 64(rp)
- palignr($8, %xmm2, %xmm3)
- movdqa %xmm3, 48(rp)
- palignr($8, %xmm1, %xmm2)
- movdqa %xmm2, 32(rp)
- palignr($8, %xmm0, %xmm1)
- movdqa %xmm1, 16(rp)
- palignr($8, %xmm8, %xmm0)
- movdqa %xmm0, (rp)
- lea 128(rp), rp
-
-IFDOS(` movdqa (%rsp), %xmm6 ')
-IFDOS(` movdqa 16(%rsp), %xmm7 ')
-IFDOS(` movdqa 32(%rsp), %xmm8 ')
-IFDOS(` add $56, %rsp ')
-
-L(ued0):test $8, R8(n)
- jz 1f
- movaps 56(up), %xmm3
- movaps 40(up), %xmm2
- movaps 24(up), %xmm1
- movaps 8(up), %xmm0
- movaps -8(up), %xmm4
- palignr($8, %xmm2, %xmm3)
- movdqa %xmm3, 48(rp)
- palignr($8, %xmm1, %xmm2)
- movdqa %xmm2, 32(rp)
- palignr($8, %xmm0, %xmm1)
- movdqa %xmm1, 16(rp)
- palignr($8, %xmm4, %xmm0)
- lea 64(up), up
- movdqa %xmm0, (rp)
- lea 64(rp), rp
-
-1: test $4, R8(n)
- jz 1f
- movaps 24(up), %xmm1
- movaps 8(up), %xmm0
- palignr($8, %xmm0, %xmm1)
- movaps -8(up), %xmm3
- movdqa %xmm1, 16(rp)
- palignr($8, %xmm3, %xmm0)
- lea 32(up), up
- movdqa %xmm0, (rp)
- lea 32(rp), rp
-
-1: test $2, R8(n)
- jz 1f
- movdqa 8(up), %xmm0
- movdqa -8(up), %xmm3
- palignr($8, %xmm3, %xmm0)
- lea 16(up), up
- movdqa %xmm0, (rp)
- lea 16(rp), rp
-
-1: test $1, R8(n)
- jz 1f
- mov (up), %r8
- mov %r8, (rp)
-
-1: FUNC_EXIT()
- ret
-
-C Basecase code. Needed for good small operands speed, not for
-C correctness as the above code is currently written.
-
-L(bc): lea -8(rp), rp
- sub $4, R32(n)
- jc L(end)
-
- ALIGN(16)
-L(top): mov (up), %r8
- mov 8(up), %r9
- lea 32(rp), rp
- mov 16(up), %r10
- mov 24(up), %r11
- lea 32(up), up
- mov %r8, -24(rp)
- mov %r9, -16(rp)
-ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
-` sub $4, R32(n)')
- mov %r10, -8(rp)
- mov %r11, (rp)
-ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
-` jnc L(top)')
-
-L(end): test $1, R8(n)
- jz 1f
- mov (up), %r8
- mov %r8, 8(rp)
- lea 8(rp), rp
- lea 8(up), up
-1: test $2, R8(n)
- jz 1f
- mov (up), %r8
- mov 8(up), %r9
- mov %r8, 8(rp)
- mov %r9, 16(rp)
-1: FUNC_EXIT()
- ret
-EPILOGUE()
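The control flow described in the comments above (basecase below
COPYI_SSE_THRESHOLD, then a one-limb movsq to align rp, then either the
movdqa loop or the palignr loop) can be summarised as the C-shaped dispatch
below. The helper names are hypothetical and the three paths are scalar
stand-ins; only the shape of the dispatch mirrors the deleted file:

    #include <stddef.h>
    #include <stdint.h>

    #define COPYI_SSE_THRESHOLD 7   /* the default chosen in the deleted file */

    /* Scalar stand-ins for the three real code paths. */
    static void path_basecase(uint64_t *rp, const uint64_t *up, size_t n)
    { for (size_t i = 0; i < n; i++) rp[i] = up[i]; }
    static void path_aligned(uint64_t *rp, const uint64_t *up, size_t n)
    { for (size_t i = 0; i < n; i++) rp[i] = up[i]; }
    static void path_palignr(uint64_t *rp, const uint64_t *up, size_t n)
    { for (size_t i = 0; i < n; i++) rp[i] = up[i]; }

    static void sketch_copyi_dispatch(uint64_t *rp, const uint64_t *up, size_t n)
    {
        if (n <= COPYI_SSE_THRESHOLD) {     /* cmp $COPYI_SSE_THRESHOLD; jbe L(bc) */
            path_basecase(rp, up, n);
            return;
        }
        if ((uintptr_t)rp & 8) {            /* test $8, R8(rp) */
            *rp++ = *up++;                  /* the movsq that aligns rp */
            n--;
        }
        if ((uintptr_t)up & 8)              /* test $8, R8(up); jnz L(uent) */
            path_palignr(rp, up, n);        /* rp - up = 8 (mod 16) */
        else
            path_aligned(rp, up, n);        /* rp = up (mod 16) */
    }
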
diff --git a/gmp/mpn/x86_64/fastsse/copyi.asm b/gmp/mpn/x86_64/fastsse/copyi.asm
deleted file mode 100644
index a1a1c231dc..0000000000
--- a/gmp/mpn/x86_64/fastsse/copyi.asm
+++ /dev/null
@@ -1,166 +0,0 @@
-dnl AMD64 mpn_copyi optimised for CPUs with fast SSE.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb good for cpu?
-C AMD K8,K9
-C AMD K10 0.85 1.64 Y/N
-C AMD bd1 1.4 1.4 Y
-C AMD bobcat
-C Intel P4 2.3 2.3 Y
-C Intel core2 1.0 1.0
-C Intel NHM 0.5 0.67 Y
-C Intel SBR 0.5 0.75 Y
-C Intel atom
-C VIA nano 1.16 5.16 Y/N
-
-C We try to do as many 16-byte operations as possible. The top-most and
-C bottom-most writes might need 8-byte operations. We can always write using
-C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
-C operations.
-
-C Instead of having separate loops for reading aligned and unaligned, we read
-C using MOVDQU. This seems to work great except for core2, where performance
-C doubles when reading using MOVDQA (for aligned source). It is unclear how to
-C best handle the unaligned case there.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n', `%rdx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-dnl define(`movdqu', lddqu)
-
-ASM_START()
- TEXT
- ALIGN(64)
-PROLOGUE(mpn_copyi)
- FUNC_ENTRY(3)
-
- cmp $3, n
- jc L(bc)
-
- test $8, R8(rp) C is rp 16-byte aligned?
- jz L(ali) C jump if rp aligned
- movsq C copy single limb
- dec n
-
- sub $16, n
- jc L(sma)
-
- ALIGN(16)
-L(top): movdqu (up), %xmm0
- movdqu 16(up), %xmm1
- movdqu 32(up), %xmm2
- movdqu 48(up), %xmm3
- movdqu 64(up), %xmm4
- movdqu 80(up), %xmm5
- movdqu 96(up), %xmm6
- movdqu 112(up), %xmm7
- lea 128(up), up
- movdqa %xmm0, (rp)
- movdqa %xmm1, 16(rp)
- movdqa %xmm2, 32(rp)
- movdqa %xmm3, 48(rp)
- movdqa %xmm4, 64(rp)
- movdqa %xmm5, 80(rp)
- movdqa %xmm6, 96(rp)
- movdqa %xmm7, 112(rp)
- lea 128(rp), rp
-L(ali): sub $16, n
- jnc L(top)
-
-L(sma): test $8, R8(n)
- jz 1f
- movdqu (up), %xmm0
- movdqu 16(up), %xmm1
- movdqu 32(up), %xmm2
- movdqu 48(up), %xmm3
- lea 64(up), up
- movdqa %xmm0, (rp)
- movdqa %xmm1, 16(rp)
- movdqa %xmm2, 32(rp)
- movdqa %xmm3, 48(rp)
- lea 64(rp), rp
-1:
- test $4, R8(n)
- jz 1f
- movdqu (up), %xmm0
- movdqu 16(up), %xmm1
- lea 32(up), up
- movdqa %xmm0, (rp)
- movdqa %xmm1, 16(rp)
- lea 32(rp), rp
-1:
- test $2, R8(n)
- jz 1f
- movdqu (up), %xmm0
- lea 16(up), up
- movdqa %xmm0, (rp)
- lea 16(rp), rp
- ALIGN(16)
-1:
-L(end): test $1, R8(n)
- jz 1f
- mov (up), %r8
- mov %r8, (rp)
-1:
- FUNC_EXIT()
- ret
-
-C Basecase code. Needed for good small operands speed, not for
-C correctness as the above code is currently written.
-
-L(bc): sub $2, n
- jc L(end)
- ALIGN(16)
-1: mov (up), %rax
- mov 8(up), %rcx
- lea 16(up), up
- mov %rax, (rp)
- mov %rcx, 8(rp)
- lea 16(rp), rp
- sub $2, n
- jnc 1b
-
- test $1, R8(n)
- jz L(ret)
- mov (up), %rax
- mov %rax, (rp)
-L(ret): FUNC_EXIT()
- ret
-EPILOGUE()
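For completeness, mpn_copyi is the mirror image of mpn_copyd: it copies n
limbs starting from the least significant end, the overlap-safe direction
when rp <= up. Reference semantics only:

    #include <stddef.h>
    #include <stdint.h>

    /* Reference semantics only: low-to-high limb copy. */
    static void ref_copyi(uint64_t *rp, const uint64_t *up, size_t n)
    {
        for (size_t i = 0; i < n; i++)
            rp[i] = up[i];
    }
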
diff --git a/gmp/mpn/x86_64/fastsse/lshift-movdqu2.asm b/gmp/mpn/x86_64/fastsse/lshift-movdqu2.asm
deleted file mode 100644
index a05e850a1f..0000000000
--- a/gmp/mpn/x86_64/fastsse/lshift-movdqu2.asm
+++ /dev/null
@@ -1,182 +0,0 @@
-dnl AMD64 mpn_lshift optimised for CPUs with fast SSE including fast movdqu.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb cycles/limb cycles/limb good
-C aligned unaligned best seen for cpu?
-C AMD K8,K9 3 3 2.35 no, use shl/shr
-C AMD K10 1.5-1.8 1.5-1.8 1.33 yes
-C AMD bd1 1.7-1.9 1.7-1.9 1.33 yes
-C AMD bobcat 3.17 3.17 yes, bad for n < 20
-C Intel P4 4.67 4.67 2.7 no, slow movdqu
-C Intel core2 2.15 2.15 1.25 no, use shld/shrd
-C Intel NHM 1.66 1.66 1.25 no, use shld/shrd
-C Intel SBR 1.3 1.3 1.25 yes, bad for n = 4-6
-C Intel atom 11.7 11.7 4.5 no
-C VIA nano 5.7 5.95 2.0 no, slow movdqu
-
-C We try to do as many aligned 16-byte operations as possible. The top-most
-C and bottom-most writes might need 8-byte operations.
-C
-C This variant relies on fast movdqu loads, and uses them even for aligned operands,
-C in order to avoid the need for two separate loops.
-C
-C TODO
-C * Could 2-limb wind-down code be simplified?
-C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
-C for other affected CPUs.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`ap', `%rsi')
-define(`n', `%rdx')
-define(`cnt', `%rcx')
-
-ASM_START()
- TEXT
- ALIGN(64)
-PROLOGUE(mpn_lshift)
- FUNC_ENTRY(4)
- movd R32(%rcx), %xmm4
- mov $64, R32(%rax)
- sub R32(%rcx), R32(%rax)
- movd R32(%rax), %xmm5
-
- neg R32(%rcx)
- mov -8(ap,n,8), %rax
- shr R8(%rcx), %rax
-
- cmp $3, n
- jle L(bc)
-
- lea (rp,n,8), R32(%rcx)
- test $8, R8(%rcx)
- jz L(rp_aligned)
-
-C Do one initial limb in order to make rp aligned
- movq -8(ap,n,8), %xmm0
- movq -16(ap,n,8), %xmm1
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- movq %xmm0, -8(rp,n,8)
- dec n
-
-L(rp_aligned):
- lea 1(n), %r8d
-
- and $6, R32(%r8)
- jz L(ba0)
- cmp $4, R32(%r8)
- jz L(ba4)
- jc L(ba2)
-L(ba6): add $-4, n
- jmp L(i56)
-L(ba0): add $-6, n
- jmp L(i70)
-L(ba4): add $-2, n
- jmp L(i34)
-L(ba2): add $-8, n
- jle L(end)
-
- ALIGN(16)
-L(top): movdqu 40(ap,n,8), %xmm1
- movdqu 48(ap,n,8), %xmm0
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- movdqa %xmm0, 48(rp,n,8)
-L(i70):
- movdqu 24(ap,n,8), %xmm1
- movdqu 32(ap,n,8), %xmm0
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- movdqa %xmm0, 32(rp,n,8)
-L(i56):
- movdqu 8(ap,n,8), %xmm1
- movdqu 16(ap,n,8), %xmm0
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- movdqa %xmm0, 16(rp,n,8)
-L(i34):
- movdqu -8(ap,n,8), %xmm1
- movdqu (ap,n,8), %xmm0
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- movdqa %xmm0, (rp,n,8)
- sub $8, n
- jg L(top)
-
-L(end): test $1, R8(n)
- jnz L(end8)
-
- movdqu (ap), %xmm1
- pxor %xmm0, %xmm0
- punpcklqdq %xmm1, %xmm0
- psllq %xmm4, %xmm1
- psrlq %xmm5, %xmm0
- por %xmm1, %xmm0
- movdqa %xmm0, (rp)
- FUNC_EXIT()
- ret
-
-C Basecase
- ALIGN(16)
-L(bc): dec R32(n)
- jz L(end8)
-
- movq (ap,n,8), %xmm1
- movq -8(ap,n,8), %xmm0
- psllq %xmm4, %xmm1
- psrlq %xmm5, %xmm0
- por %xmm1, %xmm0
- movq %xmm0, (rp,n,8)
- sub $2, R32(n)
- jl L(end8)
- movq 8(ap), %xmm1
- movq (ap), %xmm0
- psllq %xmm4, %xmm1
- psrlq %xmm5, %xmm0
- por %xmm1, %xmm0
- movq %xmm0, 8(rp)
-
-L(end8):movq (ap), %xmm0
- psllq %xmm4, %xmm0
- movq %xmm0, (rp)
- FUNC_EXIT()
- ret
-EPILOGUE()
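For reference, mpn_lshift shifts n limbs left by cnt bits (0 < cnt < 64) and
returns the bits shifted out of the top limb in the low cnt bits of the
return value; the shr on %rax in the prologue computes exactly that. A plain
C statement of those semantics, working from the top down as the shift
routines do (illustration only, uint64_t for mp_limb_t):

    #include <stddef.h>
    #include <stdint.h>

    static uint64_t ref_lshift(uint64_t *rp, const uint64_t *up, size_t n,
                               unsigned cnt)           /* requires 0 < cnt < 64 */
    {
        uint64_t retval = up[n - 1] >> (64 - cnt);     /* the %rax the asm returns */
        for (size_t i = n - 1; i > 0; i--)
            rp[i] = (up[i] << cnt) | (up[i - 1] >> (64 - cnt));
        rp[0] = up[0] << cnt;
        return retval;
    }
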
diff --git a/gmp/mpn/x86_64/fastsse/lshift.asm b/gmp/mpn/x86_64/fastsse/lshift.asm
deleted file mode 100644
index f76972a22f..0000000000
--- a/gmp/mpn/x86_64/fastsse/lshift.asm
+++ /dev/null
@@ -1,169 +0,0 @@
-dnl AMD64 mpn_lshift optimised for CPUs with fast SSE.
-
-dnl Contributed to the GNU project by David Harvey and Torbjorn Granlund.
-
-dnl Copyright 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb cycles/limb good
-C 16-byte aligned 16-byte unaligned for cpu?
-C AMD K8,K9 ? ?
-C AMD K10 1.68 (1.45) 1.75 (1.49) Y
-C AMD bd1 1.82 (1.75) 1.82 (1.75) Y
-C AMD bobcat 4 4
-C Intel P4 3 (2.7) 3 (2.7) Y
-C Intel core2 2.05 (1.67) 2.55 (1.75)
-C Intel NHM 2.05 (1.75) 2.09 (2)
-C Intel SBR 1.5 (1.3125) 1.5 (1.4375) Y
-C Intel atom ? ?
-C VIA nano 2.25 (2) 2.5 (2) Y
-
-C We try to do as many 16-byte operations as possible. The top-most and
-C bottom-most writes might need 8-byte operations.
-
-C There are two inner-loops, one for when rp = ap (mod 16) and one when this is
-C not true. The aligned case reads 16+8 bytes, the unaligned case reads
-C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.
-
-C This is not yet great code:
-C (1) The unaligned case makes many reads.
-C (2) We should do some unrolling, at least 2-way.
-C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
-C Nano.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`ap', `%rsi')
-define(`n', `%rdx')
-define(`cnt', `%rcx')
-
-ASM_START()
- TEXT
- ALIGN(64)
-PROLOGUE(mpn_lshift)
- movd R32(%rcx), %xmm4
- mov $64, R32(%rax)
- sub R32(%rcx), R32(%rax)
- movd R32(%rax), %xmm5
-
- neg R32(%rcx)
- mov -8(ap,n,8), %rax
- shr R8(%rcx), %rax
-
- cmp $2, n
- jle L(le2)
-
- lea (rp,n,8), R32(%rcx)
- test $8, R8(%rcx)
- je L(rp_aligned)
-
-C Do one initial limb in order to make rp aligned
- movq -8(ap,n,8), %xmm0
- movq -16(ap,n,8), %xmm1
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- movq %xmm0, -8(rp,n,8)
- dec n
-
-L(rp_aligned):
- lea (ap,n,8), R32(%rcx)
- test $8, R8(%rcx)
- je L(aent)
- jmp L(uent)
-C *****************************************************************************
-
-C Handle the case when ap != rp (mod 16).
-
- ALIGN(16)
-L(utop):movdqa -8(ap,n,8), %xmm0
- movq (ap,n,8), %xmm1
- punpcklqdq 8(ap,n,8), %xmm1
- psllq %xmm4, %xmm1
- psrlq %xmm5, %xmm0
- por %xmm1, %xmm0
- movdqa %xmm0, (rp,n,8)
-L(uent):sub $2, n
- ja L(utop)
-
- jne L(end8)
-
- movq (ap), %xmm1
- pxor %xmm0, %xmm0
- punpcklqdq %xmm1, %xmm0
- punpcklqdq 8(ap), %xmm1
- psllq %xmm4, %xmm1
- psrlq %xmm5, %xmm0
- por %xmm1, %xmm0
- movdqa %xmm0, (rp)
- ret
-C *****************************************************************************
-
-C Handle the case when ap = rp (mod 16).
-
- ALIGN(16)
-L(atop):movdqa (ap,n,8), %xmm0 C xmm0 = B*ap[n-1] + ap[n-2]
- movq -8(ap,n,8), %xmm1 C xmm1 = ap[n-3]
- punpcklqdq %xmm0, %xmm1 C xmm1 = B*ap[n-2] + ap[n-3]
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- movdqa %xmm0, (rp,n,8)
-L(aent):
- sub $2, n
- ja L(atop)
- jne L(end8)
-
- movdqa (ap), %xmm1
- pxor %xmm0, %xmm0
- punpcklqdq %xmm1, %xmm0
- psllq %xmm4, %xmm1
- psrlq %xmm5, %xmm0
- por %xmm1, %xmm0
- movdqa %xmm0, (rp)
- ret
-C *****************************************************************************
-
- ALIGN(16)
-L(le2): jne L(end8)
-
- movq 8(ap), %xmm0
- movq (ap), %xmm1
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- movq %xmm0, 8(rp)
-
-L(end8):movq (ap), %xmm0
- psllq %xmm4, %xmm0
- movq %xmm0, (rp)
- ret
-EPILOGUE()
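Each iteration of the inner loops above forms two result limbs at once as
(hi << cnt) | (lo >> (64-cnt)) using psllq, psrlq and por. A small SSE2
intrinsics sketch of that single step, with hypothetical names (hi holds
{u[i], u[i+1]}, lo holds {u[i-1], u[i]}):

    #include <emmintrin.h>
    #include <stdint.h>

    static __m128i lshift_step(__m128i hi, __m128i lo, unsigned cnt)
    {
        __m128i c  = _mm_cvtsi32_si128((int)cnt);          /* shift count, like xmm4 */
        __m128i rc = _mm_cvtsi32_si128((int)(64 - cnt));   /* 64-cnt, like xmm5 */
        return _mm_or_si128(_mm_sll_epi64(hi, c),          /* psllq */
                            _mm_srl_epi64(lo, rc));        /* psrlq + por */
    }
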
diff --git a/gmp/mpn/x86_64/fastsse/lshiftc-movdqu2.asm b/gmp/mpn/x86_64/fastsse/lshiftc-movdqu2.asm
deleted file mode 100644
index 8250910c52..0000000000
--- a/gmp/mpn/x86_64/fastsse/lshiftc-movdqu2.asm
+++ /dev/null
@@ -1,193 +0,0 @@
-dnl AMD64 mpn_lshiftc optimised for CPUs with fast SSE including fast movdqu.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb cycles/limb cycles/limb good
-C aligned unaligned best seen for cpu?
-C AMD K8,K9 3 3 ? no, use shl/shr
-C AMD K10 1.8-2.0 1.8-2.0 ? yes
-C AMD bd1 1.9 1.9 ? yes
-C AMD bobcat 3.67 3.67 yes, bad for n < 20
-C Intel P4 4.75 4.75 ? no, slow movdqu
-C Intel core2 2.27 2.27 ? no, use shld/shrd
-C Intel NHM 2.15 2.15 ? no, use shld/shrd
-C Intel SBR 1.45 1.45 ? yes, bad for n = 4-6
-C Intel atom 12.9 12.9 ? no
-C VIA nano 6.18 6.44 ? no, slow movdqu
-
-C We try to do as many aligned 16-byte operations as possible. The top-most
-C and bottom-most writes might need 8-byte operations.
-C
-C This variant relies on fast movdqu loads, and uses them even for aligned operands,
-C in order to avoid the need for two separate loops.
-C
-C TODO
-C * Could 2-limb wind-down code be simplified?
-C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
-C for other affected CPUs.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`ap', `%rsi')
-define(`n', `%rdx')
-define(`cnt', `%rcx')
-
-ASM_START()
- TEXT
- ALIGN(64)
-PROLOGUE(mpn_lshiftc)
- FUNC_ENTRY(4)
- movd R32(%rcx), %xmm4
- mov $64, R32(%rax)
- sub R32(%rcx), R32(%rax)
- movd R32(%rax), %xmm5
-
- neg R32(%rcx)
- mov -8(ap,n,8), %rax
- shr R8(%rcx), %rax
-
- pcmpeqb %xmm3, %xmm3 C set to 111...111
-
- cmp $3, n
- jle L(bc)
-
- lea (rp,n,8), R32(%rcx)
- test $8, R8(%rcx)
- jz L(rp_aligned)
-
-C Do one initial limb in order to make rp aligned
- movq -8(ap,n,8), %xmm0
- movq -16(ap,n,8), %xmm1
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- pxor %xmm3, %xmm0
- movq %xmm0, -8(rp,n,8)
- dec n
-
-L(rp_aligned):
- lea 1(n), %r8d
-
- and $6, R32(%r8)
- jz L(ba0)
- cmp $4, R32(%r8)
- jz L(ba4)
- jc L(ba2)
-L(ba6): add $-4, n
- jmp L(i56)
-L(ba0): add $-6, n
- jmp L(i70)
-L(ba4): add $-2, n
- jmp L(i34)
-L(ba2): add $-8, n
- jle L(end)
-
- ALIGN(16)
-L(top): movdqu 40(ap,n,8), %xmm1
- movdqu 48(ap,n,8), %xmm0
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- pxor %xmm3, %xmm0
- movdqa %xmm0, 48(rp,n,8)
-L(i70):
- movdqu 24(ap,n,8), %xmm1
- movdqu 32(ap,n,8), %xmm0
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- pxor %xmm3, %xmm0
- movdqa %xmm0, 32(rp,n,8)
-L(i56):
- movdqu 8(ap,n,8), %xmm1
- movdqu 16(ap,n,8), %xmm0
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- pxor %xmm3, %xmm0
- movdqa %xmm0, 16(rp,n,8)
-L(i34):
- movdqu -8(ap,n,8), %xmm1
- movdqu (ap,n,8), %xmm0
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- pxor %xmm3, %xmm0
- movdqa %xmm0, (rp,n,8)
- sub $8, n
- jg L(top)
-
-L(end): test $1, R8(n)
- jnz L(end8)
-
- movdqu (ap), %xmm1
- pxor %xmm0, %xmm0
- punpcklqdq %xmm1, %xmm0
- psllq %xmm4, %xmm1
- psrlq %xmm5, %xmm0
- por %xmm1, %xmm0
- pxor %xmm3, %xmm0
- movdqa %xmm0, (rp)
- FUNC_EXIT()
- ret
-
-C Basecase
- ALIGN(16)
-L(bc): dec R32(n)
- jz L(end8)
-
- movq (ap,n,8), %xmm1
- movq -8(ap,n,8), %xmm0
- psllq %xmm4, %xmm1
- psrlq %xmm5, %xmm0
- por %xmm1, %xmm0
- pxor %xmm3, %xmm0
- movq %xmm0, (rp,n,8)
- sub $2, R32(n)
- jl L(end8)
- movq 8(ap), %xmm1
- movq (ap), %xmm0
- psllq %xmm4, %xmm1
- psrlq %xmm5, %xmm0
- por %xmm1, %xmm0
- pxor %xmm3, %xmm0
- movq %xmm0, 8(rp)
-
-L(end8):movq (ap), %xmm0
- psllq %xmm4, %xmm0
- pxor %xmm3, %xmm0
- movq %xmm0, (rp)
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/fastsse/lshiftc.asm b/gmp/mpn/x86_64/fastsse/lshiftc.asm
deleted file mode 100644
index d2520690e2..0000000000
--- a/gmp/mpn/x86_64/fastsse/lshiftc.asm
+++ /dev/null
@@ -1,179 +0,0 @@
-dnl AMD64 mpn_lshiftc optimised for CPUs with fast SSE.
-
-dnl Contributed to the GNU project by David Harvey and Torbjorn Granlund.
-
-dnl Copyright 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb cycles/limb good
-C 16-byte aligned 16-byte unaligned for cpu?
-C AMD K8,K9 ? ?
-C AMD K10 1.85 (1.635) 1.9 (1.67) Y
-C AMD bd1 1.82 (1.75) 1.82 (1.75) Y
-C AMD bobcat 4.5 4.5
-C Intel P4 3.6 (3.125) 3.6 (3.125) Y
-C Intel core2 2.05 (1.67) 2.55 (1.75)
-C Intel NHM 2.05 (1.875) 2.6 (2.25)
-C Intel SBR 1.55 (1.44) 2 (1.57) Y
-C Intel atom ? ?
-C VIA nano 2.5 (2.5) 2.5 (2.5) Y
-
-C We try to do as many 16-byte operations as possible. The top-most and
-C bottom-most writes might need 8-byte operations. We always write using
-C 16-byte operations; we read with both 8-byte and 16-byte operations.
-
-C There are two inner-loops, one for when rp = ap (mod 16) and one when this is
-C not true. The aligned case reads 16+8 bytes, the unaligned case reads
-C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.
-
-C This is not yet great code:
-C (1) The unaligned case makes too many reads.
-C (2) We should do some unrolling, at least 2-way.
-C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
-C Nano.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`ap', `%rsi')
-define(`n', `%rdx')
-define(`cnt', `%rcx')
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_lshiftc)
- movd R32(%rcx), %xmm4
- mov $64, R32(%rax)
- sub R32(%rcx), R32(%rax)
- movd R32(%rax), %xmm5
-
- neg R32(%rcx)
- mov -8(ap,n,8), %rax
- shr R8(%rcx), %rax
-
- pcmpeqb %xmm7, %xmm7 C set to 111...111
-
- cmp $2, n
- jle L(le2)
-
- lea (rp,n,8), R32(%rcx)
- test $8, R8(%rcx)
- je L(rp_aligned)
-
-C Do one initial limb in order to make rp aligned
- movq -8(ap,n,8), %xmm0
- movq -16(ap,n,8), %xmm1
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- pxor %xmm7, %xmm0
- movq %xmm0, -8(rp,n,8)
- dec n
-
-L(rp_aligned):
- lea (ap,n,8), R32(%rcx)
- test $8, R8(%rcx)
- je L(aent)
- jmp L(uent)
-C *****************************************************************************
-
-C Handle the case when ap != rp (mod 16).
-
- ALIGN(16)
-L(utop):movq (ap,n,8), %xmm1
- punpcklqdq 8(ap,n,8), %xmm1
- movdqa -8(ap,n,8), %xmm0
- psllq %xmm4, %xmm1
- psrlq %xmm5, %xmm0
- por %xmm1, %xmm0
- pxor %xmm7, %xmm0
- movdqa %xmm0, (rp,n,8)
-L(uent):sub $2, n
- ja L(utop)
-
- jne L(end8)
-
- movq (ap), %xmm1
- pxor %xmm0, %xmm0
- punpcklqdq %xmm1, %xmm0
- punpcklqdq 8(ap), %xmm1
- psllq %xmm4, %xmm1
- psrlq %xmm5, %xmm0
- por %xmm1, %xmm0
- pxor %xmm7, %xmm0
- movdqa %xmm0, (rp)
- ret
-C *****************************************************************************
-
-C Handle the case when ap = rp (mod 16).
-
- ALIGN(16)
-L(atop):movdqa (ap,n,8), %xmm0 C xmm0 = B*ap[n-1] + ap[n-2]
- movq -8(ap,n,8), %xmm1 C xmm1 = ap[n-3]
- punpcklqdq %xmm0, %xmm1 C xmm1 = B*ap[n-2] + ap[n-3]
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- pxor %xmm7, %xmm0
- movdqa %xmm0, (rp,n,8)
-L(aent):sub $2, n
- ja L(atop)
-
- jne L(end8)
-
- movdqa (ap), %xmm0
- pxor %xmm1, %xmm1
- punpcklqdq %xmm0, %xmm1
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- pxor %xmm7, %xmm0
- movdqa %xmm0, (rp)
- ret
-C *****************************************************************************
-
- ALIGN(16)
-L(le2): jne L(end8)
-
- movq 8(ap), %xmm0
- movq (ap), %xmm1
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- pxor %xmm7, %xmm0
- movq %xmm0, 8(rp)
-
-L(end8):movq (ap), %xmm0
- psllq %xmm4, %xmm0
- pxor %xmm7, %xmm0
- movq %xmm0, (rp)
- ret
-EPILOGUE()
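mpn_lshiftc is mpn_lshift with the stored limbs complemented (the extra pxor
against the all-ones register before every store); the return value is the
same as mpn_lshift's and is not complemented. Reference semantics only,
requiring 0 < cnt < 64:

    #include <stddef.h>
    #include <stdint.h>

    static uint64_t ref_lshiftc(uint64_t *rp, const uint64_t *up, size_t n,
                                unsigned cnt)
    {
        uint64_t retval = up[n - 1] >> (64 - cnt);      /* not complemented */
        for (size_t i = n - 1; i > 0; i--)
            rp[i] = ~((up[i] << cnt) | (up[i - 1] >> (64 - cnt)));
        rp[0] = ~(up[0] << cnt);
        return retval;
    }
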
diff --git a/gmp/mpn/x86_64/fastsse/rshift-movdqu2.asm b/gmp/mpn/x86_64/fastsse/rshift-movdqu2.asm
deleted file mode 100644
index 1e270b13c3..0000000000
--- a/gmp/mpn/x86_64/fastsse/rshift-movdqu2.asm
+++ /dev/null
@@ -1,201 +0,0 @@
-dnl AMD64 mpn_rshift optimised for CPUs with fast SSE including fast movdqu.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb cycles/limb cycles/limb good
-C aligned unaligned best seen for cpu?
-C AMD K8,K9 3 3 2.35 no, use shl/shr
-C AMD K10 1.5-1.8 1.5-1.8 1.33 yes
-C AMD bd1 1.7-1.9 1.7-1.9 1.33 yes
-C AMD bobcat 3.17 3.17 yes, bad for n < 20
-C Intel P4 4.67 4.67 2.7 no, slow movdqu
-C Intel core2 2.15 2.15 1.25 no, use shld/shrd
-C Intel NHM 1.66 1.66 1.25 no, use shld/shrd
-C Intel SBR 1.3 1.3 1.25 yes, bad for n = 4-6
-C Intel atom 11.7 11.7 4.5 no
-C VIA nano 5.7 5.95 2.0 no, slow movdqu
-
-C We try to do as many aligned 16-byte operations as possible. The top-most
-C and bottom-most writes might need 8-byte operations.
-C
-C This variant relies on fast movdqu loads, and uses them even for aligned operands,
-C in order to avoid the need for two separate loops.
-C
-C TODO
-C * Could 2-limb wind-down code be simplified?
-C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
-C for other affected CPUs.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`ap', `%rsi')
-define(`n', `%rdx')
-define(`cnt', `%rcx')
-
-ASM_START()
- TEXT
- ALIGN(64)
-PROLOGUE(mpn_rshift)
- FUNC_ENTRY(4)
- movd R32(%rcx), %xmm4
- mov $64, R32(%rax)
- sub R32(%rcx), R32(%rax)
- movd R32(%rax), %xmm5
-
- neg R32(%rcx)
- mov (ap), %rax
- shl R8(%rcx), %rax
-
- cmp $3, n
- jle L(bc)
-
- test $8, R8(rp)
- jz L(rp_aligned)
-
-C Do one initial limb in order to make rp aligned
- movq (ap), %xmm0
- movq 8(ap), %xmm1
- psrlq %xmm4, %xmm0
- psllq %xmm5, %xmm1
- por %xmm1, %xmm0
- movq %xmm0, (rp)
- lea 8(ap), ap
- lea 8(rp), rp
- dec n
-
-L(rp_aligned):
- lea 1(n), %r8d
- lea (ap,n,8), ap
- lea (rp,n,8), rp
- neg n
-
- and $6, R32(%r8)
- jz L(bu0)
- cmp $4, R32(%r8)
- jz L(bu4)
- jc L(bu2)
-L(bu6): add $4, n
- jmp L(i56)
-L(bu0): add $6, n
- jmp L(i70)
-L(bu4): add $2, n
- jmp L(i34)
-L(bu2): add $8, n
- jge L(end)
-
- ALIGN(16)
-L(top): movdqu -64(ap,n,8), %xmm1
- movdqu -56(ap,n,8), %xmm0
- psllq %xmm5, %xmm0
- psrlq %xmm4, %xmm1
- por %xmm1, %xmm0
- movdqa %xmm0, -64(rp,n,8)
-L(i70):
- movdqu -48(ap,n,8), %xmm1
- movdqu -40(ap,n,8), %xmm0
- psllq %xmm5, %xmm0
- psrlq %xmm4, %xmm1
- por %xmm1, %xmm0
- movdqa %xmm0, -48(rp,n,8)
-L(i56):
- movdqu -32(ap,n,8), %xmm1
- movdqu -24(ap,n,8), %xmm0
- psllq %xmm5, %xmm0
- psrlq %xmm4, %xmm1
- por %xmm1, %xmm0
- movdqa %xmm0, -32(rp,n,8)
-L(i34):
- movdqu -16(ap,n,8), %xmm1
- movdqu -8(ap,n,8), %xmm0
- psllq %xmm5, %xmm0
- psrlq %xmm4, %xmm1
- por %xmm1, %xmm0
- movdqa %xmm0, -16(rp,n,8)
- add $8, n
- jl L(top)
-
-L(end): test $1, R8(n)
- jnz L(e1)
-
- movdqu -16(ap), %xmm1
- movq -8(ap), %xmm0
- psrlq %xmm4, %xmm1
- psllq %xmm5, %xmm0
- por %xmm1, %xmm0
- movdqa %xmm0, -16(rp)
- FUNC_EXIT()
- ret
-
-L(e1): movq -8(ap), %xmm0
- psrlq %xmm4, %xmm0
- movq %xmm0, -8(rp)
- FUNC_EXIT()
- ret
-
-C Basecase
- ALIGN(16)
-L(bc): dec R32(n)
- jnz 1f
- movq (ap), %xmm0
- psrlq %xmm4, %xmm0
- movq %xmm0, (rp)
- FUNC_EXIT()
- ret
-
-1: movq (ap), %xmm1
- movq 8(ap), %xmm0
- psrlq %xmm4, %xmm1
- psllq %xmm5, %xmm0
- por %xmm1, %xmm0
- movq %xmm0, (rp)
- dec R32(n)
- jnz 1f
- movq 8(ap), %xmm0
- psrlq %xmm4, %xmm0
- movq %xmm0, 8(rp)
- FUNC_EXIT()
- ret
-
-1: movq 8(ap), %xmm1
- movq 16(ap), %xmm0
- psrlq %xmm4, %xmm1
- psllq %xmm5, %xmm0
- por %xmm1, %xmm0
- movq %xmm0, 8(rp)
- movq 16(ap), %xmm0
- psrlq %xmm4, %xmm0
- movq %xmm0, 16(rp)
- FUNC_EXIT()
- ret
-EPILOGUE()
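mpn_rshift is the mirror of mpn_lshift: it shifts n limbs right by cnt bits
(0 < cnt < 64), works from the bottom up, and returns the bits shifted out of
the low limb in the high cnt bits of the return value (the shl on %rax in the
prologue). Reference semantics only:

    #include <stddef.h>
    #include <stdint.h>

    static uint64_t ref_rshift(uint64_t *rp, const uint64_t *up, size_t n,
                               unsigned cnt)            /* requires 0 < cnt < 64 */
    {
        uint64_t retval = up[0] << (64 - cnt);          /* the %rax the asm returns */
        for (size_t i = 0; i < n - 1; i++)
            rp[i] = (up[i] >> cnt) | (up[i + 1] << (64 - cnt));
        rp[n - 1] = up[n - 1] >> cnt;
        return retval;
    }
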
diff --git a/gmp/mpn/x86_64/fastsse/sec_tabselect.asm b/gmp/mpn/x86_64/fastsse/sec_tabselect.asm
deleted file mode 100644
index e3df110be4..0000000000
--- a/gmp/mpn/x86_64/fastsse/sec_tabselect.asm
+++ /dev/null
@@ -1,192 +0,0 @@
-dnl AMD64 SSE mpn_sec_tabselect.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb cycles/limb cycles/limb
-C ali,evn n unal,evn n other cases
-C AMD K8,K9 1.65 1.65 1.8
-C AMD K10 0.78 0.78 0.85
-C AMD bd1 0.80 0.91 1.25
-C AMD bobcat 2.15 2.15 2.37
-C Intel P4 2.5 2.5 2.95
-C Intel core2 1.17 1.25 1.25
-C Intel NHM 0.87 0.90 0.90
-C Intel SBR 0.63 0.79 0.77
-C Intel atom 4.3 4.3 4.3 slower than plain code
-C VIA nano 1.4 5.1 3.14 too alignment dependent
-
-C NOTES
-C * We only honour the least significant 32 bits of the `which' and `nents'
-C arguments, to allow efficient code using just SSE2. Honouring all 64 bits
-C would require either the SSE4_1 pcmpeqq or some other SSE2 sequence.
-C * We use movd for copying between xmm and plain registers, since old gas
-C rejects movq. But gas assembles movd as movq when given a 64-bit greg.
-
-define(`rp', `%rdi')
-define(`tp', `%rsi')
-define(`n', `%rdx')
-define(`nents', `%rcx')
-define(`which', `%r8')
-
-define(`i', `%r10')
-define(`j', `%r9')
-
-C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
-C nents n rp tab which j i temp * * * *
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_sec_tabselect)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8d ')
-
- movd which, %xmm8
- pshufd $0, %xmm8, %xmm8 C 4 `which' copies
- mov $1, R32(%rax)
- movd %rax, %xmm9
- pshufd $0, %xmm9, %xmm9 C 4 copies of 1
-
- mov n, j
- add $-8, j
- js L(outer_end)
-
-L(outer_top):
- mov nents, i
- mov tp, %r11
- pxor %xmm13, %xmm13
- pxor %xmm4, %xmm4
- pxor %xmm5, %xmm5
- pxor %xmm6, %xmm6
- pxor %xmm7, %xmm7
- ALIGN(16)
-L(top): movdqa %xmm8, %xmm0
- pcmpeqd %xmm13, %xmm0
- paddd %xmm9, %xmm13
- movdqu 0(tp), %xmm2
- movdqu 16(tp), %xmm3
- pand %xmm0, %xmm2
- pand %xmm0, %xmm3
- por %xmm2, %xmm4
- por %xmm3, %xmm5
- movdqu 32(tp), %xmm2
- movdqu 48(tp), %xmm3
- pand %xmm0, %xmm2
- pand %xmm0, %xmm3
- por %xmm2, %xmm6
- por %xmm3, %xmm7
- lea (tp,n,8), tp
- add $-1, i
- jne L(top)
-
- movdqu %xmm4, 0(rp)
- movdqu %xmm5, 16(rp)
- movdqu %xmm6, 32(rp)
- movdqu %xmm7, 48(rp)
-
- lea 64(%r11), tp
- lea 64(rp), rp
- add $-8, j
- jns L(outer_top)
-L(outer_end):
-
- test $4, R8(n)
- je L(b0xx)
-L(b1xx):mov nents, i
- mov tp, %r11
- pxor %xmm13, %xmm13
- pxor %xmm4, %xmm4
- pxor %xmm5, %xmm5
- ALIGN(16)
-L(tp4): movdqa %xmm8, %xmm0
- pcmpeqd %xmm13, %xmm0
- paddd %xmm9, %xmm13
- movdqu 0(tp), %xmm2
- movdqu 16(tp), %xmm3
- pand %xmm0, %xmm2
- pand %xmm0, %xmm3
- por %xmm2, %xmm4
- por %xmm3, %xmm5
- lea (tp,n,8), tp
- add $-1, i
- jne L(tp4)
- movdqu %xmm4, 0(rp)
- movdqu %xmm5, 16(rp)
- lea 32(%r11), tp
- lea 32(rp), rp
-
-L(b0xx):test $2, R8(n)
- je L(b00x)
-L(b01x):mov nents, i
- mov tp, %r11
- pxor %xmm13, %xmm13
- pxor %xmm4, %xmm4
- ALIGN(16)
-L(tp2): movdqa %xmm8, %xmm0
- pcmpeqd %xmm13, %xmm0
- paddd %xmm9, %xmm13
- movdqu 0(tp), %xmm2
- pand %xmm0, %xmm2
- por %xmm2, %xmm4
- lea (tp,n,8), tp
- add $-1, i
- jne L(tp2)
- movdqu %xmm4, 0(rp)
- lea 16(%r11), tp
- lea 16(rp), rp
-
-L(b00x):test $1, R8(n)
- je L(b000)
-L(b001):mov nents, i
- mov tp, %r11
- pxor %xmm13, %xmm13
- pxor %xmm4, %xmm4
- ALIGN(16)
-L(tp1): movdqa %xmm8, %xmm0
- pcmpeqd %xmm13, %xmm0
- paddd %xmm9, %xmm13
- movq 0(tp), %xmm2
- pand %xmm0, %xmm2
- por %xmm2, %xmm4
- lea (tp,n,8), tp
- add $-1, i
- jne L(tp1)
- movq %xmm4, 0(rp)
-
-L(b000):FUNC_EXIT()
- ret
-EPILOGUE()
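mpn_sec_tabselect copies entry number which (of nents entries, each n limbs)
from tab into rp while reading the whole table, so that neither branches nor
memory addresses depend on which. The deleted code does this with pcmpeqd
masks and pand/por accumulation; the scalar sketch below shows the same
mask-and-accumulate idea (illustration only; a production version must also
ensure the k == which comparison compiles branch-free):

    #include <stddef.h>
    #include <stdint.h>

    static void ref_sec_tabselect(uint64_t *rp, const uint64_t *tab,
                                  size_t n, size_t nents, size_t which)
    {
        for (size_t i = 0; i < n; i++)
            rp[i] = 0;
        for (size_t k = 0; k < nents; k++) {
            uint64_t mask = (uint64_t)0 - (uint64_t)(k == which);  /* all-ones or zero */
            for (size_t i = 0; i < n; i++)
                rp[i] |= tab[k * n + i] & mask;
        }
    }
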