Diffstat (limited to 'gmp/mpn/x86_64/fastsse')
-rw-r--r--   gmp/mpn/x86_64/fastsse/README               |  21
-rw-r--r--   gmp/mpn/x86_64/fastsse/com-palignr.asm      | 302
-rw-r--r--   gmp/mpn/x86_64/fastsse/com.asm              | 161
-rw-r--r--   gmp/mpn/x86_64/fastsse/copyd-palignr.asm    | 251
-rw-r--r--   gmp/mpn/x86_64/fastsse/copyd.asm            | 145
-rw-r--r--   gmp/mpn/x86_64/fastsse/copyi-palignr.asm    | 295
-rw-r--r--   gmp/mpn/x86_64/fastsse/copyi.asm            | 166
-rw-r--r--   gmp/mpn/x86_64/fastsse/lshift-movdqu2.asm   | 182
-rw-r--r--   gmp/mpn/x86_64/fastsse/lshift.asm           | 169
-rw-r--r--   gmp/mpn/x86_64/fastsse/lshiftc-movdqu2.asm  | 193
-rw-r--r--   gmp/mpn/x86_64/fastsse/lshiftc.asm          | 179
-rw-r--r--   gmp/mpn/x86_64/fastsse/rshift-movdqu2.asm   | 201
-rw-r--r--   gmp/mpn/x86_64/fastsse/sec_tabselect.asm    | 192
13 files changed, 0 insertions, 2457 deletions
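The deleted files are SSE-accelerated variants of a handful of low-level mpn routines. For orientation, the C sketch below gives the plain limb-wise semantics of the simplest ones (mpn_com, mpn_copyi, mpn_copyd). It is a reference model of what the vectorised assembly computes, not a reconstruction of the deleted code; the type definitions are simplified and assume a 64-bit limb, whereas GMP's real mp_limb_t and mp_size_t come from gmp.h.

```c
#include <stddef.h>
#include <stdint.h>

typedef uint64_t mp_limb_t;   /* simplified: GMP limb, assumed 64-bit here */
typedef size_t   mp_size_t;   /* simplified: GMP uses a signed type */

/* One's-complement n limbs from up[] into rp[] (what mpn_com computes). */
static void ref_com(mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
{
    for (mp_size_t i = 0; i < n; i++)
        rp[i] = ~up[i];
}

/* Incrementing copy (mpn_copyi): safe for overlap when rp <= up. */
static void ref_copyi(mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
{
    for (mp_size_t i = 0; i < n; i++)
        rp[i] = up[i];
}

/* Decrementing copy (mpn_copyd): safe for overlap when rp >= up. */
static void ref_copyd(mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
{
    for (mp_size_t i = n; i-- > 0; )
        rp[i] = up[i];
}
```

The point of the fastsse variants is to do the same work 16 bytes at a time with aligned stores, falling back to 8-byte operations only at the unaligned ends, as the in-file comments below describe.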
diff --git a/gmp/mpn/x86_64/fastsse/README b/gmp/mpn/x86_64/fastsse/README deleted file mode 100644 index 520551ed99..0000000000 --- a/gmp/mpn/x86_64/fastsse/README +++ /dev/null @@ -1,21 +0,0 @@ -This directory contains code for x86-64 processors with fast -implementations of SSE operations, hence the name "fastsse". - -Current processors that might benefit from this code are: - - AMD K10 - AMD Bulldozer - Intel Nocona - Intel Nehalem/Westmere - Intel Sandybridge/Ivybridge - VIA Nano - -Current processors that do not benefit from this code are: - - AMD K8 - AMD Bobcat - Intel Atom - -Intel Conroe/Penryn is a border case; its handling of non-aligned -128-bit memory operands is poor. VIA Nano also have poor handling of -non-aligned operands. diff --git a/gmp/mpn/x86_64/fastsse/com-palignr.asm b/gmp/mpn/x86_64/fastsse/com-palignr.asm deleted file mode 100644 index d9641e890d..0000000000 --- a/gmp/mpn/x86_64/fastsse/com-palignr.asm +++ /dev/null @@ -1,302 +0,0 @@ -dnl AMD64 mpn_com optimised for CPUs with fast SSE copying and SSSE3. - -dnl Copyright 2012, 2013 Free Software Foundation, Inc. - -dnl Contributed to the GNU project by Torbjorn Granlund. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb cycles/limb cycles/limb good -C aligned unaligned best seen for cpu? -C AMD K8,K9 2.0 illop 1.0/1.0 N -C AMD K10 0.85 illop Y/N -C AMD bd1 1.39 ? 1.45 Y/N -C AMD bobcat 1.97 ? 8.17 1.5/1.5 N -C Intel P4 2.26 illop Y/N -C Intel core2 0.52 0.82 opt/0.74 Y -C Intel NHM 0.52 0.65 opt/opt Y -C Intel SBR 0.51 0.55 opt/0.51 Y -C Intel atom 1.16 1.70 opt/opt Y -C VIA nano 1.09 1.10 opt/opt Y - -C We use only 16-byte operations, except for unaligned top-most and bottom-most -C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). That -C instruction is better adapted to mpn_copyd's needs, we need to contort the -C code to use it here. -C -C For operands of < COM_SSE_THRESHOLD limbs, we use a plain 64-bit loop, taken -C from the x86_64 default code. - -C INPUT PARAMETERS -define(`rp', `%rdi') -define(`up', `%rsi') -define(`n', `%rdx') - -C There are three instructions for loading an aligned 128-bit quantity. We use -C movaps, since it has the shortest coding. -define(`movdqa', ``movaps'') - -ifdef(`COM_SSE_THRESHOLD',`',`define(`COM_SSE_THRESHOLD', 7)') - -ASM_START() - TEXT - ALIGN(64) -PROLOGUE(mpn_com) - FUNC_ENTRY(3) - - cmp $COM_SSE_THRESHOLD, n - jbe L(bc) - - pcmpeqb %xmm7, %xmm7 C set to 111...111 - - test $8, R8(rp) C is rp 16-byte aligned? 
- jz L(rp_aligned) C jump if rp aligned - - mov (up), %r8 - lea 8(up), up - not %r8 - mov %r8, (rp) - lea 8(rp), rp - dec n - -L(rp_aligned): - test $8, R8(up) - jnz L(uent) - -ifelse(eval(COM_SSE_THRESHOLD >= 8),1, -` sub $8, n', -` jmp L(am)') - - ALIGN(16) -L(atop):movdqa 0(up), %xmm0 - movdqa 16(up), %xmm1 - movdqa 32(up), %xmm2 - movdqa 48(up), %xmm3 - lea 64(up), up - pxor %xmm7, %xmm0 - pxor %xmm7, %xmm1 - pxor %xmm7, %xmm2 - pxor %xmm7, %xmm3 - movdqa %xmm0, (rp) - movdqa %xmm1, 16(rp) - movdqa %xmm2, 32(rp) - movdqa %xmm3, 48(rp) - lea 64(rp), rp -L(am): sub $8, n - jnc L(atop) - - test $4, R8(n) - jz 1f - movdqa (up), %xmm0 - movdqa 16(up), %xmm1 - lea 32(up), up - pxor %xmm7, %xmm0 - pxor %xmm7, %xmm1 - movdqa %xmm0, (rp) - movdqa %xmm1, 16(rp) - lea 32(rp), rp - -1: test $2, R8(n) - jz 1f - movdqa (up), %xmm0 - lea 16(up), up - pxor %xmm7, %xmm0 - movdqa %xmm0, (rp) - lea 16(rp), rp - -1: test $1, R8(n) - jz 1f - mov (up), %r8 - not %r8 - mov %r8, (rp) - -1: FUNC_EXIT() - ret - -L(uent): -C Code handling up - rp = 8 (mod 16) - -C FIXME: The code below only handles overlap if it is close to complete, or -C quite separate: up-rp < 5 or up-up > 15 limbs - lea -40(up), %rax C 40 = 5 * GMP_LIMB_BYTES - sub rp, %rax - cmp $80, %rax C 80 = (15-5) * GMP_LIMB_BYTES - jbe L(bc) C deflect to plain loop - - sub $16, n - jc L(uend) - - movdqa 120(up), %xmm3 - - sub $16, n - jmp L(um) - - ALIGN(16) -L(utop):movdqa 120(up), %xmm3 - pxor %xmm7, %xmm0 - movdqa %xmm0, -128(rp) - sub $16, n -L(um): movdqa 104(up), %xmm2 - palignr($8, %xmm2, %xmm3) - movdqa 88(up), %xmm1 - pxor %xmm7, %xmm3 - movdqa %xmm3, 112(rp) - palignr($8, %xmm1, %xmm2) - movdqa 72(up), %xmm0 - pxor %xmm7, %xmm2 - movdqa %xmm2, 96(rp) - palignr($8, %xmm0, %xmm1) - movdqa 56(up), %xmm3 - pxor %xmm7, %xmm1 - movdqa %xmm1, 80(rp) - palignr($8, %xmm3, %xmm0) - movdqa 40(up), %xmm2 - pxor %xmm7, %xmm0 - movdqa %xmm0, 64(rp) - palignr($8, %xmm2, %xmm3) - movdqa 24(up), %xmm1 - pxor %xmm7, %xmm3 - movdqa %xmm3, 48(rp) - palignr($8, %xmm1, %xmm2) - movdqa 8(up), %xmm0 - pxor %xmm7, %xmm2 - movdqa %xmm2, 32(rp) - palignr($8, %xmm0, %xmm1) - movdqa -8(up), %xmm3 - pxor %xmm7, %xmm1 - movdqa %xmm1, 16(rp) - palignr($8, %xmm3, %xmm0) - lea 128(up), up - lea 128(rp), rp - jnc L(utop) - - pxor %xmm7, %xmm0 - movdqa %xmm0, -128(rp) - -L(uend):test $8, R8(n) - jz 1f - movdqa 56(up), %xmm3 - movdqa 40(up), %xmm2 - palignr($8, %xmm2, %xmm3) - movdqa 24(up), %xmm1 - pxor %xmm7, %xmm3 - movdqa %xmm3, 48(rp) - palignr($8, %xmm1, %xmm2) - movdqa 8(up), %xmm0 - pxor %xmm7, %xmm2 - movdqa %xmm2, 32(rp) - palignr($8, %xmm0, %xmm1) - movdqa -8(up), %xmm3 - pxor %xmm7, %xmm1 - movdqa %xmm1, 16(rp) - palignr($8, %xmm3, %xmm0) - lea 64(up), up - pxor %xmm7, %xmm0 - movdqa %xmm0, (rp) - lea 64(rp), rp - -1: test $4, R8(n) - jz 1f - movdqa 24(up), %xmm1 - movdqa 8(up), %xmm0 - palignr($8, %xmm0, %xmm1) - movdqa -8(up), %xmm3 - pxor %xmm7, %xmm1 - movdqa %xmm1, 16(rp) - palignr($8, %xmm3, %xmm0) - lea 32(up), up - pxor %xmm7, %xmm0 - movdqa %xmm0, (rp) - lea 32(rp), rp - -1: test $2, R8(n) - jz 1f - movdqa 8(up), %xmm0 - movdqa -8(up), %xmm3 - palignr($8, %xmm3, %xmm0) - lea 16(up), up - pxor %xmm7, %xmm0 - movdqa %xmm0, (rp) - lea 16(rp), rp - -1: test $1, R8(n) - jz 1f - mov (up), %r8 - not %r8 - mov %r8, (rp) - -1: FUNC_EXIT() - ret - -C Basecase code. Needed for good small operands speed, not for -C correctness as the above code is currently written. 
- -L(bc): lea -8(rp), rp - sub $4, R32(n) - jc L(end) - -ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1, -` ALIGN(16)') -L(top): mov (up), %r8 - mov 8(up), %r9 - lea 32(rp), rp - mov 16(up), %r10 - mov 24(up), %r11 - lea 32(up), up - not %r8 - not %r9 - not %r10 - not %r11 - mov %r8, -24(rp) - mov %r9, -16(rp) -ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1, -` sub $4, R32(n)') - mov %r10, -8(rp) - mov %r11, (rp) -ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1, -` jnc L(top)') - -L(end): test $1, R8(n) - jz 1f - mov (up), %r8 - not %r8 - mov %r8, 8(rp) - lea 8(rp), rp - lea 8(up), up -1: test $2, R8(n) - jz 1f - mov (up), %r8 - mov 8(up), %r9 - not %r8 - not %r9 - mov %r8, 8(rp) - mov %r9, 16(rp) -1: FUNC_EXIT() - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/fastsse/com.asm b/gmp/mpn/x86_64/fastsse/com.asm deleted file mode 100644 index 4abb076d3f..0000000000 --- a/gmp/mpn/x86_64/fastsse/com.asm +++ /dev/null @@ -1,161 +0,0 @@ -dnl AMD64 mpn_com optimised for CPUs with fast SSE. - -dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc. - -dnl Contributed to the GNU project by Torbjorn Granlund. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb cycles/limb cycles/limb good -C aligned unaligned best seen for cpu? -C AMD K8,K9 2.0 2.0 N -C AMD K10 0.85 1.3 Y/N -C AMD bd1 1.40 1.40 Y -C AMD bobcat 3.1 3.1 N -C Intel P4 2.28 illop Y -C Intel core2 1.02 1.02 N -C Intel NHM 0.53 0.68 Y -C Intel SBR 0.51 0.75 Y -C Intel atom 3.68 3.68 N -C VIA nano 1.17 5.09 Y/N - -C We try to do as many 16-byte operations as possible. The top-most and -C bottom-most writes might need 8-byte operations. We can always write using -C aligned 16-byte operations, we read with both aligned and unaligned 16-byte -C operations. - -C Instead of having separate loops for reading aligned and unaligned, we read -C using MOVDQU. This seems to work great except for core2; there performance -C doubles when reading using MOVDQA (for aligned source). It is unclear how to -C best handle the unaligned case there. - -C INPUT PARAMETERS -define(`rp', `%rdi') -define(`up', `%rsi') -define(`n', `%rdx') - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -ASM_START() - TEXT - ALIGN(16) -PROLOGUE(mpn_com) - FUNC_ENTRY(3) - - test n, n - jz L(don) - - pcmpeqb %xmm7, %xmm7 C set to 111...111 - - test $8, R8(rp) C is rp 16-byte aligned? 
- jz L(ali) C jump if rp aligned - mov (up), %rax - lea 8(up), up - not %rax - mov %rax, (rp) - lea 8(rp), rp - dec n - - sub $14, n - jc L(sma) - - ALIGN(16) -L(top): movdqu (up), %xmm0 - movdqu 16(up), %xmm1 - movdqu 32(up), %xmm2 - movdqu 48(up), %xmm3 - movdqu 64(up), %xmm4 - movdqu 80(up), %xmm5 - movdqu 96(up), %xmm6 - lea 112(up), up - pxor %xmm7, %xmm0 - pxor %xmm7, %xmm1 - pxor %xmm7, %xmm2 - pxor %xmm7, %xmm3 - pxor %xmm7, %xmm4 - pxor %xmm7, %xmm5 - pxor %xmm7, %xmm6 - movdqa %xmm0, (rp) - movdqa %xmm1, 16(rp) - movdqa %xmm2, 32(rp) - movdqa %xmm3, 48(rp) - movdqa %xmm4, 64(rp) - movdqa %xmm5, 80(rp) - movdqa %xmm6, 96(rp) - lea 112(rp), rp -L(ali): sub $14, n - jnc L(top) - -L(sma): add $14, n - test $8, R8(n) - jz 1f - movdqu (up), %xmm0 - movdqu 16(up), %xmm1 - movdqu 32(up), %xmm2 - movdqu 48(up), %xmm3 - lea 64(up), up - pxor %xmm7, %xmm0 - pxor %xmm7, %xmm1 - pxor %xmm7, %xmm2 - pxor %xmm7, %xmm3 - movdqa %xmm0, (rp) - movdqa %xmm1, 16(rp) - movdqa %xmm2, 32(rp) - movdqa %xmm3, 48(rp) - lea 64(rp), rp -1: - test $4, R8(n) - jz 1f - movdqu (up), %xmm0 - movdqu 16(up), %xmm1 - lea 32(up), up - pxor %xmm7, %xmm0 - pxor %xmm7, %xmm1 - movdqa %xmm0, (rp) - movdqa %xmm1, 16(rp) - lea 32(rp), rp -1: - test $2, R8(n) - jz 1f - movdqu (up), %xmm0 - lea 16(up), up - pxor %xmm7, %xmm0 - movdqa %xmm0, (rp) - lea 16(rp), rp -1: - test $1, R8(n) - jz 1f - mov (up), %rax - not %rax - mov %rax, (rp) -1: -L(don): FUNC_EXIT() - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/fastsse/copyd-palignr.asm b/gmp/mpn/x86_64/fastsse/copyd-palignr.asm deleted file mode 100644 index 7430cadc09..0000000000 --- a/gmp/mpn/x86_64/fastsse/copyd-palignr.asm +++ /dev/null @@ -1,251 +0,0 @@ -dnl AMD64 mpn_copyd optimised for CPUs with fast SSE copying and SSSE3. - -dnl Copyright 2012 Free Software Foundation, Inc. - -dnl Contributed to the GNU project by Torbjorn Granlund. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb cycles/limb cycles/limb good -C aligned unaligned best seen for cpu? -C AMD K8,K9 2.0 illop 1.0/1.0 N -C AMD K10 0.85 illop Y/N -C AMD bull 0.70 0.70 Y -C AMD pile 0.68 0.68 Y -C AMD steam ? ? -C AMD bobcat 1.97 8.24 1.5/1.5 N -C AMD jaguar ? ? -C Intel P4 2.26 illop Y/N -C Intel core 0.52 0.68-0.80 opt/0.64 Y -C Intel NHM 0.52 0.64 opt/opt Y -C Intel SBR 0.51 0.51 opt/0.51 Y -C Intel IBR ? ? Y -C Intel HWL 0.51 0.51 0.25/0.25 N -C Intel atom 1.16 1.66 opt/opt Y -C VIA nano 1.08 1.06 opt/opt Y - -C We use only 16-byte operations, except for unaligned top-most and bottom-most -C limbs. 
We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). -C -C For operands of < COPYD_SSE_THRESHOLD limbs, we use a plain 64-bit loop, -C taken from the x86_64 default code. - -C INPUT PARAMETERS -define(`rp', `%rdi') -define(`up', `%rsi') -define(`n', `%rdx') - -C There are three instructions for loading an aligned 128-bit quantity. We use -C movaps, since it has the shortest coding. -define(`movdqa', ``movaps'') - -ifdef(`COPYD_SSE_THRESHOLD',`',`define(`COPYD_SSE_THRESHOLD', 7)') - -ASM_START() - TEXT - ALIGN(64) -PROLOGUE(mpn_copyd) - FUNC_ENTRY(3) - - lea -8(up,n,8), up - lea -8(rp,n,8), rp - - cmp $COPYD_SSE_THRESHOLD, n - jbe L(bc) - - test $8, R8(rp) C is rp 16-byte aligned? - jnz L(rp_aligned) C jump if rp aligned - - mov (up), %rax C copy one limb - mov %rax, (rp) - lea -8(up), up - lea -8(rp), rp - dec n - -L(rp_aligned): - test $8, R8(up) - jz L(uent) - -ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1, -` sub $8, n', -` jmp L(am)') - - ALIGN(16) -L(atop):movdqa -8(up), %xmm0 - movdqa -24(up), %xmm1 - movdqa -40(up), %xmm2 - movdqa -56(up), %xmm3 - lea -64(up), up - movdqa %xmm0, -8(rp) - movdqa %xmm1, -24(rp) - movdqa %xmm2, -40(rp) - movdqa %xmm3, -56(rp) - lea -64(rp), rp -L(am): sub $8, n - jnc L(atop) - - test $4, R8(n) - jz 1f - movdqa -8(up), %xmm0 - movdqa -24(up), %xmm1 - lea -32(up), up - movdqa %xmm0, -8(rp) - movdqa %xmm1, -24(rp) - lea -32(rp), rp - -1: test $2, R8(n) - jz 1f - movdqa -8(up), %xmm0 - lea -16(up), up - movdqa %xmm0, -8(rp) - lea -16(rp), rp - -1: test $1, R8(n) - jz 1f - mov (up), %r8 - mov %r8, (rp) - -1: FUNC_EXIT() - ret - -L(uent):sub $16, n - movdqa (up), %xmm0 - jc L(uend) - - ALIGN(16) -L(utop):sub $16, n - movdqa -16(up), %xmm1 - palignr($8, %xmm1, %xmm0) - movdqa %xmm0, -8(rp) - movdqa -32(up), %xmm2 - palignr($8, %xmm2, %xmm1) - movdqa %xmm1, -24(rp) - movdqa -48(up), %xmm3 - palignr($8, %xmm3, %xmm2) - movdqa %xmm2, -40(rp) - movdqa -64(up), %xmm0 - palignr($8, %xmm0, %xmm3) - movdqa %xmm3, -56(rp) - movdqa -80(up), %xmm1 - palignr($8, %xmm1, %xmm0) - movdqa %xmm0, -72(rp) - movdqa -96(up), %xmm2 - palignr($8, %xmm2, %xmm1) - movdqa %xmm1, -88(rp) - movdqa -112(up), %xmm3 - palignr($8, %xmm3, %xmm2) - movdqa %xmm2, -104(rp) - movdqa -128(up), %xmm0 - palignr($8, %xmm0, %xmm3) - movdqa %xmm3, -120(rp) - lea -128(up), up - lea -128(rp), rp - jnc L(utop) - -L(uend):test $8, R8(n) - jz 1f - movdqa -16(up), %xmm1 - palignr($8, %xmm1, %xmm0) - movdqa %xmm0, -8(rp) - movdqa -32(up), %xmm0 - palignr($8, %xmm0, %xmm1) - movdqa %xmm1, -24(rp) - movdqa -48(up), %xmm1 - palignr($8, %xmm1, %xmm0) - movdqa %xmm0, -40(rp) - movdqa -64(up), %xmm0 - palignr($8, %xmm0, %xmm1) - movdqa %xmm1, -56(rp) - lea -64(up), up - lea -64(rp), rp - -1: test $4, R8(n) - jz 1f - movdqa -16(up), %xmm1 - palignr($8, %xmm1, %xmm0) - movdqa %xmm0, -8(rp) - movdqa -32(up), %xmm0 - palignr($8, %xmm0, %xmm1) - movdqa %xmm1, -24(rp) - lea -32(up), up - lea -32(rp), rp - -1: test $2, R8(n) - jz 1f - movdqa -16(up), %xmm1 - palignr($8, %xmm1, %xmm0) - movdqa %xmm0, -8(rp) - lea -16(up), up - lea -16(rp), rp - -1: test $1, R8(n) - jz 1f - mov (up), %r8 - mov %r8, (rp) - -1: FUNC_EXIT() - ret - -C Basecase code. Needed for good small operands speed, not for -C correctness as the above code is currently written. 
- -L(bc): sub $4, R32(n) - jc L(end) - - ALIGN(16) -L(top): mov (up), %r8 - mov -8(up), %r9 - lea -32(rp), rp - mov -16(up), %r10 - mov -24(up), %r11 - lea -32(up), up - mov %r8, 32(rp) - mov %r9, 24(rp) -ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1, -` sub $4, R32(n)') - mov %r10, 16(rp) - mov %r11, 8(rp) -ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1, -` jnc L(top)') - -L(end): test $1, R8(n) - jz 1f - mov (up), %r8 - mov %r8, (rp) - lea -8(rp), rp - lea -8(up), up -1: test $2, R8(n) - jz 1f - mov (up), %r8 - mov -8(up), %r9 - mov %r8, (rp) - mov %r9, -8(rp) -1: FUNC_EXIT() - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/fastsse/copyd.asm b/gmp/mpn/x86_64/fastsse/copyd.asm deleted file mode 100644 index 5c6094c7e2..0000000000 --- a/gmp/mpn/x86_64/fastsse/copyd.asm +++ /dev/null @@ -1,145 +0,0 @@ -dnl AMD64 mpn_copyd optimised for CPUs with fast SSE. - -dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C cycles/limb good for cpu? -C AMD K8,K9 -C AMD K10 0.85 Y -C AMD bd1 0.8 Y -C AMD bobcat -C Intel P4 2.28 Y -C Intel core2 1 -C Intel NHM 0.5 Y -C Intel SBR 0.5 Y -C Intel atom -C VIA nano 1.1 Y - -C We try to do as many 16-byte operations as possible. The top-most and -C bottom-most writes might need 8-byte operations. We can always write using -C aligned 16-byte operations, we read with both aligned and unaligned 16-byte -C operations. - -C Instead of having separate loops for reading aligned and unaligned, we read -C using MOVDQU. This seems to work great except for core2; there performance -C doubles when reading using MOVDQA (for aligned source). It is unclear how to -C best handle the unaligned case there. - -C INPUT PARAMETERS -define(`rp', `%rdi') -define(`up', `%rsi') -define(`n', `%rdx') - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -ASM_START() - TEXT - ALIGN(16) -PROLOGUE(mpn_copyd) - FUNC_ENTRY(3) - - test n, n - jz L(don) - - lea -16(rp,n,8), rp - lea -16(up,n,8), up - - test $8, R8(rp) C is rp 16-byte aligned? 
- jz L(ali) C jump if rp aligned - mov 8(up), %rax - lea -8(up), up - mov %rax, 8(rp) - lea -8(rp), rp - dec n - - sub $16, n - jc L(sma) - - ALIGN(16) -L(top): movdqu (up), %xmm0 - movdqu -16(up), %xmm1 - movdqu -32(up), %xmm2 - movdqu -48(up), %xmm3 - movdqu -64(up), %xmm4 - movdqu -80(up), %xmm5 - movdqu -96(up), %xmm6 - movdqu -112(up), %xmm7 - lea -128(up), up - movdqa %xmm0, (rp) - movdqa %xmm1, -16(rp) - movdqa %xmm2, -32(rp) - movdqa %xmm3, -48(rp) - movdqa %xmm4, -64(rp) - movdqa %xmm5, -80(rp) - movdqa %xmm6, -96(rp) - movdqa %xmm7, -112(rp) - lea -128(rp), rp -L(ali): sub $16, n - jnc L(top) - -L(sma): test $8, R8(n) - jz 1f - movdqu (up), %xmm0 - movdqu -16(up), %xmm1 - movdqu -32(up), %xmm2 - movdqu -48(up), %xmm3 - lea -64(up), up - movdqa %xmm0, (rp) - movdqa %xmm1, -16(rp) - movdqa %xmm2, -32(rp) - movdqa %xmm3, -48(rp) - lea -64(rp), rp -1: - test $4, R8(n) - jz 1f - movdqu (up), %xmm0 - movdqu -16(up), %xmm1 - lea -32(up), up - movdqa %xmm0, (rp) - movdqa %xmm1, -16(rp) - lea -32(rp), rp -1: - test $2, R8(n) - jz 1f - movdqu (up), %xmm0 - lea -16(up), up - movdqa %xmm0, (rp) - lea -16(rp), rp -1: - test $1, R8(n) - jz 1f - mov 8(up), %r8 - mov %r8, 8(rp) -1: -L(don): FUNC_EXIT() - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/fastsse/copyi-palignr.asm b/gmp/mpn/x86_64/fastsse/copyi-palignr.asm deleted file mode 100644 index fda3c3500f..0000000000 --- a/gmp/mpn/x86_64/fastsse/copyi-palignr.asm +++ /dev/null @@ -1,295 +0,0 @@ -dnl AMD64 mpn_copyi optimised for CPUs with fast SSE copying and SSSE3. - -dnl Copyright 2012, 2013 Free Software Foundation, Inc. - -dnl Contributed to the GNU project by Torbjörn Granlund. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb cycles/limb cycles/limb good -C aligned unaligned best seen for cpu? -C AMD K8,K9 2.0 illop 1.0/1.0 N -C AMD K10 0.85 illop Y/N -C AMD bull 0.70 0.66 Y -C AMD pile 0.68 0.66 Y -C AMD steam ? ? -C AMD bobcat 1.97 8.16 1.5/1.5 N -C AMD jaguar ? ? -C Intel P4 2.26 illop Y/N -C Intel core 0.52 0.64 opt/opt Y -C Intel NHM 0.52 0.71 opt/opt Y -C Intel SBR 0.51 0.54 opt/0.51 Y -C Intel IBR ? ? Y -C Intel HWL 0.51 0.52 0.25/0.25 N -C Intel atom 1.16 1.61 opt/opt Y -C VIA nano 1.09 1.08 opt/opt Y - -C We use only 16-byte operations, except for unaligned top-most and bottom-most -C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). That -C instruction is better adapted to mpn_copyd's needs, we need to contort the -C code to use it here. 
-C -C For operands of < COPYI_SSE_THRESHOLD limbs, we use a plain 64-bit loop, -C taken from the x86_64 default code. - -C INPUT PARAMETERS -define(`rp', `%rdi') -define(`up', `%rsi') -define(`n', `%rdx') - -C There are three instructions for loading an aligned 128-bit quantity. We use -C movaps, since it has the shortest coding. -dnl define(`movdqa', ``movaps'') - -ifdef(`COPYI_SSE_THRESHOLD',`',`define(`COPYI_SSE_THRESHOLD', 7)') - -ASM_START() - TEXT - ALIGN(64) -PROLOGUE(mpn_copyi) - FUNC_ENTRY(3) - - cmp $COPYI_SSE_THRESHOLD, n - jbe L(bc) - - test $8, R8(rp) C is rp 16-byte aligned? - jz L(rp_aligned) C jump if rp aligned - - movsq C copy one limb - dec n - -L(rp_aligned): - test $8, R8(up) - jnz L(uent) - -ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1, -` sub $8, n', -` jmp L(am)') - - ALIGN(16) -L(atop):movdqa 0(up), %xmm0 - movdqa 16(up), %xmm1 - movdqa 32(up), %xmm2 - movdqa 48(up), %xmm3 - lea 64(up), up - movdqa %xmm0, (rp) - movdqa %xmm1, 16(rp) - movdqa %xmm2, 32(rp) - movdqa %xmm3, 48(rp) - lea 64(rp), rp -L(am): sub $8, n - jnc L(atop) - - test $4, R8(n) - jz 1f - movdqa (up), %xmm0 - movdqa 16(up), %xmm1 - lea 32(up), up - movdqa %xmm0, (rp) - movdqa %xmm1, 16(rp) - lea 32(rp), rp - -1: test $2, R8(n) - jz 1f - movdqa (up), %xmm0 - lea 16(up), up - movdqa %xmm0, (rp) - lea 16(rp), rp - -1: test $1, R8(n) - jz 1f - mov (up), %r8 - mov %r8, (rp) - -1: FUNC_EXIT() - ret - -L(uent): -C Code handling up - rp = 8 (mod 16) - - cmp $16, n - jc L(ued0) - -IFDOS(` add $-56, %rsp ') -IFDOS(` movdqa %xmm6, (%rsp) ') -IFDOS(` movdqa %xmm7, 16(%rsp) ') -IFDOS(` movdqa %xmm8, 32(%rsp) ') - - movaps 120(up), %xmm7 - movaps 104(up), %xmm6 - movaps 88(up), %xmm5 - movaps 72(up), %xmm4 - movaps 56(up), %xmm3 - movaps 40(up), %xmm2 - lea 128(up), up - sub $32, n - jc L(ued1) - - ALIGN(16) -L(utop):movaps -104(up), %xmm1 - sub $16, n - movaps -120(up), %xmm0 - palignr($8, %xmm6, %xmm7) - movaps -136(up), %xmm8 - movdqa %xmm7, 112(rp) - palignr($8, %xmm5, %xmm6) - movaps 120(up), %xmm7 - movdqa %xmm6, 96(rp) - palignr($8, %xmm4, %xmm5) - movaps 104(up), %xmm6 - movdqa %xmm5, 80(rp) - palignr($8, %xmm3, %xmm4) - movaps 88(up), %xmm5 - movdqa %xmm4, 64(rp) - palignr($8, %xmm2, %xmm3) - movaps 72(up), %xmm4 - movdqa %xmm3, 48(rp) - palignr($8, %xmm1, %xmm2) - movaps 56(up), %xmm3 - movdqa %xmm2, 32(rp) - palignr($8, %xmm0, %xmm1) - movaps 40(up), %xmm2 - movdqa %xmm1, 16(rp) - palignr($8, %xmm8, %xmm0) - lea 128(up), up - movdqa %xmm0, (rp) - lea 128(rp), rp - jnc L(utop) - -L(ued1):movaps -104(up), %xmm1 - movaps -120(up), %xmm0 - movaps -136(up), %xmm8 - palignr($8, %xmm6, %xmm7) - movdqa %xmm7, 112(rp) - palignr($8, %xmm5, %xmm6) - movdqa %xmm6, 96(rp) - palignr($8, %xmm4, %xmm5) - movdqa %xmm5, 80(rp) - palignr($8, %xmm3, %xmm4) - movdqa %xmm4, 64(rp) - palignr($8, %xmm2, %xmm3) - movdqa %xmm3, 48(rp) - palignr($8, %xmm1, %xmm2) - movdqa %xmm2, 32(rp) - palignr($8, %xmm0, %xmm1) - movdqa %xmm1, 16(rp) - palignr($8, %xmm8, %xmm0) - movdqa %xmm0, (rp) - lea 128(rp), rp - -IFDOS(` movdqa (%rsp), %xmm6 ') -IFDOS(` movdqa 16(%rsp), %xmm7 ') -IFDOS(` movdqa 32(%rsp), %xmm8 ') -IFDOS(` add $56, %rsp ') - -L(ued0):test $8, R8(n) - jz 1f - movaps 56(up), %xmm3 - movaps 40(up), %xmm2 - movaps 24(up), %xmm1 - movaps 8(up), %xmm0 - movaps -8(up), %xmm4 - palignr($8, %xmm2, %xmm3) - movdqa %xmm3, 48(rp) - palignr($8, %xmm1, %xmm2) - movdqa %xmm2, 32(rp) - palignr($8, %xmm0, %xmm1) - movdqa %xmm1, 16(rp) - palignr($8, %xmm4, %xmm0) - lea 64(up), up - movdqa %xmm0, (rp) - lea 64(rp), rp - -1: test $4, R8(n) - jz 
1f - movaps 24(up), %xmm1 - movaps 8(up), %xmm0 - palignr($8, %xmm0, %xmm1) - movaps -8(up), %xmm3 - movdqa %xmm1, 16(rp) - palignr($8, %xmm3, %xmm0) - lea 32(up), up - movdqa %xmm0, (rp) - lea 32(rp), rp - -1: test $2, R8(n) - jz 1f - movdqa 8(up), %xmm0 - movdqa -8(up), %xmm3 - palignr($8, %xmm3, %xmm0) - lea 16(up), up - movdqa %xmm0, (rp) - lea 16(rp), rp - -1: test $1, R8(n) - jz 1f - mov (up), %r8 - mov %r8, (rp) - -1: FUNC_EXIT() - ret - -C Basecase code. Needed for good small operands speed, not for -C correctness as the above code is currently written. - -L(bc): lea -8(rp), rp - sub $4, R32(n) - jc L(end) - - ALIGN(16) -L(top): mov (up), %r8 - mov 8(up), %r9 - lea 32(rp), rp - mov 16(up), %r10 - mov 24(up), %r11 - lea 32(up), up - mov %r8, -24(rp) - mov %r9, -16(rp) -ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1, -` sub $4, R32(n)') - mov %r10, -8(rp) - mov %r11, (rp) -ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1, -` jnc L(top)') - -L(end): test $1, R8(n) - jz 1f - mov (up), %r8 - mov %r8, 8(rp) - lea 8(rp), rp - lea 8(up), up -1: test $2, R8(n) - jz 1f - mov (up), %r8 - mov 8(up), %r9 - mov %r8, 8(rp) - mov %r9, 16(rp) -1: FUNC_EXIT() - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/fastsse/copyi.asm b/gmp/mpn/x86_64/fastsse/copyi.asm deleted file mode 100644 index a1a1c231dc..0000000000 --- a/gmp/mpn/x86_64/fastsse/copyi.asm +++ /dev/null @@ -1,166 +0,0 @@ -dnl AMD64 mpn_copyi optimised for CPUs with fast SSE. - -dnl Contributed to the GNU project by Torbjörn Granlund. - -dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C cycles/limb good for cpu? -C AMD K8,K9 -C AMD K10 0.85 1.64 Y/N -C AMD bd1 1.4 1.4 Y -C AMD bobcat -C Intel P4 2.3 2.3 Y -C Intel core2 1.0 1.0 -C Intel NHM 0.5 0.67 Y -C Intel SBR 0.5 0.75 Y -C Intel atom -C VIA nano 1.16 5.16 Y/N - -C We try to do as many 16-byte operations as possible. The top-most and -C bottom-most writes might need 8-byte operations. We can always write using -C aligned 16-byte operations, we read with both aligned and unaligned 16-byte -C operations. - -C Instead of having separate loops for reading aligned and unaligned, we read -C using MOVDQU. This seems to work great except for core2; there performance -C doubles when reading using MOVDQA (for aligned source). It is unclear how to -C best handle the unaligned case there. 
- -C INPUT PARAMETERS -define(`rp', `%rdi') -define(`up', `%rsi') -define(`n', `%rdx') - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -dnl define(`movdqu', lddqu) - -ASM_START() - TEXT - ALIGN(64) -PROLOGUE(mpn_copyi) - FUNC_ENTRY(3) - - cmp $3, n - jc L(bc) - - test $8, R8(rp) C is rp 16-byte aligned? - jz L(ali) C jump if rp aligned - movsq C copy single limb - dec n - - sub $16, n - jc L(sma) - - ALIGN(16) -L(top): movdqu (up), %xmm0 - movdqu 16(up), %xmm1 - movdqu 32(up), %xmm2 - movdqu 48(up), %xmm3 - movdqu 64(up), %xmm4 - movdqu 80(up), %xmm5 - movdqu 96(up), %xmm6 - movdqu 112(up), %xmm7 - lea 128(up), up - movdqa %xmm0, (rp) - movdqa %xmm1, 16(rp) - movdqa %xmm2, 32(rp) - movdqa %xmm3, 48(rp) - movdqa %xmm4, 64(rp) - movdqa %xmm5, 80(rp) - movdqa %xmm6, 96(rp) - movdqa %xmm7, 112(rp) - lea 128(rp), rp -L(ali): sub $16, n - jnc L(top) - -L(sma): test $8, R8(n) - jz 1f - movdqu (up), %xmm0 - movdqu 16(up), %xmm1 - movdqu 32(up), %xmm2 - movdqu 48(up), %xmm3 - lea 64(up), up - movdqa %xmm0, (rp) - movdqa %xmm1, 16(rp) - movdqa %xmm2, 32(rp) - movdqa %xmm3, 48(rp) - lea 64(rp), rp -1: - test $4, R8(n) - jz 1f - movdqu (up), %xmm0 - movdqu 16(up), %xmm1 - lea 32(up), up - movdqa %xmm0, (rp) - movdqa %xmm1, 16(rp) - lea 32(rp), rp -1: - test $2, R8(n) - jz 1f - movdqu (up), %xmm0 - lea 16(up), up - movdqa %xmm0, (rp) - lea 16(rp), rp - ALIGN(16) -1: -L(end): test $1, R8(n) - jz 1f - mov (up), %r8 - mov %r8, (rp) -1: - FUNC_EXIT() - ret - -C Basecase code. Needed for good small operands speed, not for -C correctness as the above code is currently written. - -L(bc): sub $2, n - jc L(end) - ALIGN(16) -1: mov (up), %rax - mov 8(up), %rcx - lea 16(up), up - mov %rax, (rp) - mov %rcx, 8(rp) - lea 16(rp), rp - sub $2, n - jnc 1b - - test $1, R8(n) - jz L(ret) - mov (up), %rax - mov %rax, (rp) -L(ret): FUNC_EXIT() - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/fastsse/lshift-movdqu2.asm b/gmp/mpn/x86_64/fastsse/lshift-movdqu2.asm deleted file mode 100644 index a05e850a1f..0000000000 --- a/gmp/mpn/x86_64/fastsse/lshift-movdqu2.asm +++ /dev/null @@ -1,182 +0,0 @@ -dnl AMD64 mpn_lshift optimised for CPUs with fast SSE including fast movdqu. - -dnl Contributed to the GNU project by Torbjorn Granlund. - -dnl Copyright 2010-2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C cycles/limb cycles/limb cycles/limb good -C aligned unaligned best seen for cpu? 
-C AMD K8,K9 3 3 2.35 no, use shl/shr -C AMD K10 1.5-1.8 1.5-1.8 1.33 yes -C AMD bd1 1.7-1.9 1.7-1.9 1.33 yes -C AMD bobcat 3.17 3.17 yes, bad for n < 20 -C Intel P4 4.67 4.67 2.7 no, slow movdqu -C Intel core2 2.15 2.15 1.25 no, use shld/shrd -C Intel NHM 1.66 1.66 1.25 no, use shld/shrd -C Intel SBR 1.3 1.3 1.25 yes, bad for n = 4-6 -C Intel atom 11.7 11.7 4.5 no -C VIA nano 5.7 5.95 2.0 no, slow movdqu - -C We try to do as many aligned 16-byte operations as possible. The top-most -C and bottom-most writes might need 8-byte operations. -C -C This variant rely on fast load movdqu, and uses it even for aligned operands, -C in order to avoid the need for two separate loops. -C -C TODO -C * Could 2-limb wind-down code be simplified? -C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts -C for other affected CPUs. - -C INPUT PARAMETERS -define(`rp', `%rdi') -define(`ap', `%rsi') -define(`n', `%rdx') -define(`cnt', `%rcx') - -ASM_START() - TEXT - ALIGN(64) -PROLOGUE(mpn_lshift) - FUNC_ENTRY(4) - movd R32(%rcx), %xmm4 - mov $64, R32(%rax) - sub R32(%rcx), R32(%rax) - movd R32(%rax), %xmm5 - - neg R32(%rcx) - mov -8(ap,n,8), %rax - shr R8(%rcx), %rax - - cmp $3, n - jle L(bc) - - lea (rp,n,8), R32(%rcx) - test $8, R8(%rcx) - jz L(rp_aligned) - -C Do one initial limb in order to make rp aligned - movq -8(ap,n,8), %xmm0 - movq -16(ap,n,8), %xmm1 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - movq %xmm0, -8(rp,n,8) - dec n - -L(rp_aligned): - lea 1(n), %r8d - - and $6, R32(%r8) - jz L(ba0) - cmp $4, R32(%r8) - jz L(ba4) - jc L(ba2) -L(ba6): add $-4, n - jmp L(i56) -L(ba0): add $-6, n - jmp L(i70) -L(ba4): add $-2, n - jmp L(i34) -L(ba2): add $-8, n - jle L(end) - - ALIGN(16) -L(top): movdqu 40(ap,n,8), %xmm1 - movdqu 48(ap,n,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, 48(rp,n,8) -L(i70): - movdqu 24(ap,n,8), %xmm1 - movdqu 32(ap,n,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, 32(rp,n,8) -L(i56): - movdqu 8(ap,n,8), %xmm1 - movdqu 16(ap,n,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, 16(rp,n,8) -L(i34): - movdqu -8(ap,n,8), %xmm1 - movdqu (ap,n,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, (rp,n,8) - sub $8, n - jg L(top) - -L(end): test $1, R8(n) - jnz L(end8) - - movdqu (ap), %xmm1 - pxor %xmm0, %xmm0 - punpcklqdq %xmm1, %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - movdqa %xmm0, (rp) - FUNC_EXIT() - ret - -C Basecase - ALIGN(16) -L(bc): dec R32(n) - jz L(end8) - - movq (ap,n,8), %xmm1 - movq -8(ap,n,8), %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - movq %xmm0, (rp,n,8) - sub $2, R32(n) - jl L(end8) - movq 8(ap), %xmm1 - movq (ap), %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - movq %xmm0, 8(rp) - -L(end8):movq (ap), %xmm0 - psllq %xmm4, %xmm0 - movq %xmm0, (rp) - FUNC_EXIT() - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/fastsse/lshift.asm b/gmp/mpn/x86_64/fastsse/lshift.asm deleted file mode 100644 index f76972a22f..0000000000 --- a/gmp/mpn/x86_64/fastsse/lshift.asm +++ /dev/null @@ -1,169 +0,0 @@ -dnl AMD64 mpn_lshift optimised for CPUs with fast SSE. - -dnl Contributed to the GNU project by David Harvey and Torbjorn Granlund. - -dnl Copyright 2010-2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. 
-dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C cycles/limb cycles/limb good -C 16-byte aligned 16-byte unaligned for cpu? -C AMD K8,K9 ? ? -C AMD K10 1.68 (1.45) 1.75 (1.49) Y -C AMD bd1 1.82 (1.75) 1.82 (1.75) Y -C AMD bobcat 4 4 -C Intel P4 3 (2.7) 3 (2.7) Y -C Intel core2 2.05 (1.67) 2.55 (1.75) -C Intel NHM 2.05 (1.75) 2.09 (2) -C Intel SBR 1.5 (1.3125) 1.5 (1.4375) Y -C Intel atom ? ? -C VIA nano 2.25 (2) 2.5 (2) Y - -C We try to do as many 16-byte operations as possible. The top-most and -C bottom-most writes might need 8-byte operations. - -C There are two inner-loops, one for when rp = ap (mod 16) and one when this is -C not true. The aligned case reads 16+8 bytes, the unaligned case reads -C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented. - -C This is not yet great code: -C (1) The unaligned case makes many reads. -C (2) We should do some unrolling, at least 2-way. -C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on -C Nano. - -C INPUT PARAMETERS -define(`rp', `%rdi') -define(`ap', `%rsi') -define(`n', `%rdx') -define(`cnt', `%rcx') - -ASM_START() - TEXT - ALIGN(64) -PROLOGUE(mpn_lshift) - movd R32(%rcx), %xmm4 - mov $64, R32(%rax) - sub R32(%rcx), R32(%rax) - movd R32(%rax), %xmm5 - - neg R32(%rcx) - mov -8(ap,n,8), %rax - shr R8(%rcx), %rax - - cmp $2, n - jle L(le2) - - lea (rp,n,8), R32(%rcx) - test $8, R8(%rcx) - je L(rp_aligned) - -C Do one initial limb in order to make rp aligned - movq -8(ap,n,8), %xmm0 - movq -16(ap,n,8), %xmm1 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - movq %xmm0, -8(rp,n,8) - dec n - -L(rp_aligned): - lea (ap,n,8), R32(%rcx) - test $8, R8(%rcx) - je L(aent) - jmp L(uent) -C ***************************************************************************** - -C Handle the case when ap != rp (mod 16). - - ALIGN(16) -L(utop):movdqa -8(ap,n,8), %xmm0 - movq (ap,n,8), %xmm1 - punpcklqdq 8(ap,n,8), %xmm1 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - movdqa %xmm0, (rp,n,8) -L(uent):sub $2, n - ja L(utop) - - jne L(end8) - - movq (ap), %xmm1 - pxor %xmm0, %xmm0 - punpcklqdq %xmm1, %xmm0 - punpcklqdq 8(ap), %xmm1 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - movdqa %xmm0, (rp) - ret -C ***************************************************************************** - -C Handle the case when ap = rp (mod 16). 
- - ALIGN(16) -L(atop):movdqa (ap,n,8), %xmm0 C xmm0 = B*ap[n-1] + ap[n-2] - movq -8(ap,n,8), %xmm1 C xmm1 = ap[n-3] - punpcklqdq %xmm0, %xmm1 C xmm1 = B*ap[n-2] + ap[n-3] - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, (rp,n,8) -L(aent): - sub $2, n - ja L(atop) - jne L(end8) - - movdqa (ap), %xmm1 - pxor %xmm0, %xmm0 - punpcklqdq %xmm1, %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - movdqa %xmm0, (rp) - ret -C ***************************************************************************** - - ALIGN(16) -L(le2): jne L(end8) - - movq 8(ap), %xmm0 - movq (ap), %xmm1 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - movq %xmm0, 8(rp) - -L(end8):movq (ap), %xmm0 - psllq %xmm4, %xmm0 - movq %xmm0, (rp) - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/fastsse/lshiftc-movdqu2.asm b/gmp/mpn/x86_64/fastsse/lshiftc-movdqu2.asm deleted file mode 100644 index 8250910c52..0000000000 --- a/gmp/mpn/x86_64/fastsse/lshiftc-movdqu2.asm +++ /dev/null @@ -1,193 +0,0 @@ -dnl AMD64 mpn_lshiftc optimised for CPUs with fast SSE including fast movdqu. - -dnl Contributed to the GNU project by Torbjorn Granlund. - -dnl Copyright 2010-2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C cycles/limb cycles/limb cycles/limb good -C aligned unaligned best seen for cpu? -C AMD K8,K9 3 3 ? no, use shl/shr -C AMD K10 1.8-2.0 1.8-2.0 ? yes -C AMD bd1 1.9 1.9 ? yes -C AMD bobcat 3.67 3.67 yes, bad for n < 20 -C Intel P4 4.75 4.75 ? no, slow movdqu -C Intel core2 2.27 2.27 ? no, use shld/shrd -C Intel NHM 2.15 2.15 ? no, use shld/shrd -C Intel SBR 1.45 1.45 ? yes, bad for n = 4-6 -C Intel atom 12.9 12.9 ? no -C VIA nano 6.18 6.44 ? no, slow movdqu - -C We try to do as many aligned 16-byte operations as possible. The top-most -C and bottom-most writes might need 8-byte operations. -C -C This variant rely on fast load movdqu, and uses it even for aligned operands, -C in order to avoid the need for two separate loops. -C -C TODO -C * Could 2-limb wind-down code be simplified? -C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts -C for other affected CPUs. 
- -C INPUT PARAMETERS -define(`rp', `%rdi') -define(`ap', `%rsi') -define(`n', `%rdx') -define(`cnt', `%rcx') - -ASM_START() - TEXT - ALIGN(64) -PROLOGUE(mpn_lshiftc) - FUNC_ENTRY(4) - movd R32(%rcx), %xmm4 - mov $64, R32(%rax) - sub R32(%rcx), R32(%rax) - movd R32(%rax), %xmm5 - - neg R32(%rcx) - mov -8(ap,n,8), %rax - shr R8(%rcx), %rax - - pcmpeqb %xmm3, %xmm3 C set to 111...111 - - cmp $3, n - jle L(bc) - - lea (rp,n,8), R32(%rcx) - test $8, R8(%rcx) - jz L(rp_aligned) - -C Do one initial limb in order to make rp aligned - movq -8(ap,n,8), %xmm0 - movq -16(ap,n,8), %xmm1 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movq %xmm0, -8(rp,n,8) - dec n - -L(rp_aligned): - lea 1(n), %r8d - - and $6, R32(%r8) - jz L(ba0) - cmp $4, R32(%r8) - jz L(ba4) - jc L(ba2) -L(ba6): add $-4, n - jmp L(i56) -L(ba0): add $-6, n - jmp L(i70) -L(ba4): add $-2, n - jmp L(i34) -L(ba2): add $-8, n - jle L(end) - - ALIGN(16) -L(top): movdqu 40(ap,n,8), %xmm1 - movdqu 48(ap,n,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movdqa %xmm0, 48(rp,n,8) -L(i70): - movdqu 24(ap,n,8), %xmm1 - movdqu 32(ap,n,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movdqa %xmm0, 32(rp,n,8) -L(i56): - movdqu 8(ap,n,8), %xmm1 - movdqu 16(ap,n,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movdqa %xmm0, 16(rp,n,8) -L(i34): - movdqu -8(ap,n,8), %xmm1 - movdqu (ap,n,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movdqa %xmm0, (rp,n,8) - sub $8, n - jg L(top) - -L(end): test $1, R8(n) - jnz L(end8) - - movdqu (ap), %xmm1 - pxor %xmm0, %xmm0 - punpcklqdq %xmm1, %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movdqa %xmm0, (rp) - FUNC_EXIT() - ret - -C Basecase - ALIGN(16) -L(bc): dec R32(n) - jz L(end8) - - movq (ap,n,8), %xmm1 - movq -8(ap,n,8), %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movq %xmm0, (rp,n,8) - sub $2, R32(n) - jl L(end8) - movq 8(ap), %xmm1 - movq (ap), %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movq %xmm0, 8(rp) - -L(end8):movq (ap), %xmm0 - psllq %xmm4, %xmm0 - pxor %xmm3, %xmm0 - movq %xmm0, (rp) - FUNC_EXIT() - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/fastsse/lshiftc.asm b/gmp/mpn/x86_64/fastsse/lshiftc.asm deleted file mode 100644 index d2520690e2..0000000000 --- a/gmp/mpn/x86_64/fastsse/lshiftc.asm +++ /dev/null @@ -1,179 +0,0 @@ -dnl AMD64 mpn_lshiftc optimised for CPUs with fast SSE. - -dnl Contributed to the GNU project by David Harvey and Torbjorn Granlund. - -dnl Copyright 2010-2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. 
-dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C cycles/limb cycles/limb good -C 16-byte aligned 16-byte unaligned for cpu? -C AMD K8,K9 ? ? -C AMD K10 1.85 (1.635) 1.9 (1.67) Y -C AMD bd1 1.82 (1.75) 1.82 (1.75) Y -C AMD bobcat 4.5 4.5 -C Intel P4 3.6 (3.125) 3.6 (3.125) Y -C Intel core2 2.05 (1.67) 2.55 (1.75) -C Intel NHM 2.05 (1.875) 2.6 (2.25) -C Intel SBR 1.55 (1.44) 2 (1.57) Y -C Intel atom ? ? -C VIA nano 2.5 (2.5) 2.5 (2.5) Y - -C We try to do as many 16-byte operations as possible. The top-most and -C bottom-most writes might need 8-byte operations. We always write using -C 16-byte operations, we read with both 8-byte and 16-byte operations. - -C There are two inner-loops, one for when rp = ap (mod 16) and one when this is -C not true. The aligned case reads 16+8 bytes, the unaligned case reads -C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented. - -C This is not yet great code: -C (1) The unaligned case makes too many reads. -C (2) We should do some unrolling, at least 2-way. -C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on -C Nano. - -C INPUT PARAMETERS -define(`rp', `%rdi') -define(`ap', `%rsi') -define(`n', `%rdx') -define(`cnt', `%rcx') - -ASM_START() - TEXT - ALIGN(16) -PROLOGUE(mpn_lshiftc) - movd R32(%rcx), %xmm4 - mov $64, R32(%rax) - sub R32(%rcx), R32(%rax) - movd R32(%rax), %xmm5 - - neg R32(%rcx) - mov -8(ap,n,8), %rax - shr R8(%rcx), %rax - - pcmpeqb %xmm7, %xmm7 C set to 111...111 - - cmp $2, n - jle L(le2) - - lea (rp,n,8), R32(%rcx) - test $8, R8(%rcx) - je L(rp_aligned) - -C Do one initial limb in order to make rp aligned - movq -8(ap,n,8), %xmm0 - movq -16(ap,n,8), %xmm1 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm7, %xmm0 - movq %xmm0, -8(rp,n,8) - dec n - -L(rp_aligned): - lea (ap,n,8), R32(%rcx) - test $8, R8(%rcx) - je L(aent) - jmp L(uent) -C ***************************************************************************** - -C Handle the case when ap != rp (mod 16). - - ALIGN(16) -L(utop):movq (ap,n,8), %xmm1 - punpcklqdq 8(ap,n,8), %xmm1 - movdqa -8(ap,n,8), %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - pxor %xmm7, %xmm0 - movdqa %xmm0, (rp,n,8) -L(uent):sub $2, n - ja L(utop) - - jne L(end8) - - movq (ap), %xmm1 - pxor %xmm0, %xmm0 - punpcklqdq %xmm1, %xmm0 - punpcklqdq 8(ap), %xmm1 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - pxor %xmm7, %xmm0 - movdqa %xmm0, (rp) - ret -C ***************************************************************************** - -C Handle the case when ap = rp (mod 16). 
- - ALIGN(16) -L(atop):movdqa (ap,n,8), %xmm0 C xmm0 = B*ap[n-1] + ap[n-2] - movq -8(ap,n,8), %xmm1 C xmm1 = ap[n-3] - punpcklqdq %xmm0, %xmm1 C xmm1 = B*ap[n-2] + ap[n-3] - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm7, %xmm0 - movdqa %xmm0, (rp,n,8) -L(aent):sub $2, n - ja L(atop) - - jne L(end8) - - movdqa (ap), %xmm0 - pxor %xmm1, %xmm1 - punpcklqdq %xmm0, %xmm1 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm7, %xmm0 - movdqa %xmm0, (rp) - ret -C ***************************************************************************** - - ALIGN(16) -L(le2): jne L(end8) - - movq 8(ap), %xmm0 - movq (ap), %xmm1 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm7, %xmm0 - movq %xmm0, 8(rp) - -L(end8):movq (ap), %xmm0 - psllq %xmm4, %xmm0 - pxor %xmm7, %xmm0 - movq %xmm0, (rp) - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/fastsse/rshift-movdqu2.asm b/gmp/mpn/x86_64/fastsse/rshift-movdqu2.asm deleted file mode 100644 index 1e270b13c3..0000000000 --- a/gmp/mpn/x86_64/fastsse/rshift-movdqu2.asm +++ /dev/null @@ -1,201 +0,0 @@ -dnl AMD64 mpn_rshift optimised for CPUs with fast SSE including fast movdqu. - -dnl Contributed to the GNU project by Torbjorn Granlund. - -dnl Copyright 2010-2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C cycles/limb cycles/limb cycles/limb good -C aligned unaligned best seen for cpu? -C AMD K8,K9 3 3 2.35 no, use shl/shr -C AMD K10 1.5-1.8 1.5-1.8 1.33 yes -C AMD bd1 1.7-1.9 1.7-1.9 1.33 yes -C AMD bobcat 3.17 3.17 yes, bad for n < 20 -C Intel P4 4.67 4.67 2.7 no, slow movdqu -C Intel core2 2.15 2.15 1.25 no, use shld/shrd -C Intel NHM 1.66 1.66 1.25 no, use shld/shrd -C Intel SBR 1.3 1.3 1.25 yes, bad for n = 4-6 -C Intel atom 11.7 11.7 4.5 no -C VIA nano 5.7 5.95 2.0 no, slow movdqu - -C We try to do as many aligned 16-byte operations as possible. The top-most -C and bottom-most writes might need 8-byte operations. -C -C This variant rely on fast load movdqu, and uses it even for aligned operands, -C in order to avoid the need for two separate loops. -C -C TODO -C * Could 2-limb wind-down code be simplified? -C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts -C for other affected CPUs. 
- -C INPUT PARAMETERS -define(`rp', `%rdi') -define(`ap', `%rsi') -define(`n', `%rdx') -define(`cnt', `%rcx') - -ASM_START() - TEXT - ALIGN(64) -PROLOGUE(mpn_rshift) - FUNC_ENTRY(4) - movd R32(%rcx), %xmm4 - mov $64, R32(%rax) - sub R32(%rcx), R32(%rax) - movd R32(%rax), %xmm5 - - neg R32(%rcx) - mov (ap), %rax - shl R8(%rcx), %rax - - cmp $3, n - jle L(bc) - - test $8, R8(rp) - jz L(rp_aligned) - -C Do one initial limb in order to make rp aligned - movq (ap), %xmm0 - movq 8(ap), %xmm1 - psrlq %xmm4, %xmm0 - psllq %xmm5, %xmm1 - por %xmm1, %xmm0 - movq %xmm0, (rp) - lea 8(ap), ap - lea 8(rp), rp - dec n - -L(rp_aligned): - lea 1(n), %r8d - lea (ap,n,8), ap - lea (rp,n,8), rp - neg n - - and $6, R32(%r8) - jz L(bu0) - cmp $4, R32(%r8) - jz L(bu4) - jc L(bu2) -L(bu6): add $4, n - jmp L(i56) -L(bu0): add $6, n - jmp L(i70) -L(bu4): add $2, n - jmp L(i34) -L(bu2): add $8, n - jge L(end) - - ALIGN(16) -L(top): movdqu -64(ap,n,8), %xmm1 - movdqu -56(ap,n,8), %xmm0 - psllq %xmm5, %xmm0 - psrlq %xmm4, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, -64(rp,n,8) -L(i70): - movdqu -48(ap,n,8), %xmm1 - movdqu -40(ap,n,8), %xmm0 - psllq %xmm5, %xmm0 - psrlq %xmm4, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, -48(rp,n,8) -L(i56): - movdqu -32(ap,n,8), %xmm1 - movdqu -24(ap,n,8), %xmm0 - psllq %xmm5, %xmm0 - psrlq %xmm4, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, -32(rp,n,8) -L(i34): - movdqu -16(ap,n,8), %xmm1 - movdqu -8(ap,n,8), %xmm0 - psllq %xmm5, %xmm0 - psrlq %xmm4, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, -16(rp,n,8) - add $8, n - jl L(top) - -L(end): test $1, R8(n) - jnz L(e1) - - movdqu -16(ap), %xmm1 - movq -8(ap), %xmm0 - psrlq %xmm4, %xmm1 - psllq %xmm5, %xmm0 - por %xmm1, %xmm0 - movdqa %xmm0, -16(rp) - FUNC_EXIT() - ret - -L(e1): movq -8(ap), %xmm0 - psrlq %xmm4, %xmm0 - movq %xmm0, -8(rp) - FUNC_EXIT() - ret - -C Basecase - ALIGN(16) -L(bc): dec R32(n) - jnz 1f - movq (ap), %xmm0 - psrlq %xmm4, %xmm0 - movq %xmm0, (rp) - FUNC_EXIT() - ret - -1: movq (ap), %xmm1 - movq 8(ap), %xmm0 - psrlq %xmm4, %xmm1 - psllq %xmm5, %xmm0 - por %xmm1, %xmm0 - movq %xmm0, (rp) - dec R32(n) - jnz 1f - movq 8(ap), %xmm0 - psrlq %xmm4, %xmm0 - movq %xmm0, 8(rp) - FUNC_EXIT() - ret - -1: movq 8(ap), %xmm1 - movq 16(ap), %xmm0 - psrlq %xmm4, %xmm1 - psllq %xmm5, %xmm0 - por %xmm1, %xmm0 - movq %xmm0, 8(rp) - movq 16(ap), %xmm0 - psrlq %xmm4, %xmm0 - movq %xmm0, 16(rp) - FUNC_EXIT() - ret -EPILOGUE() diff --git a/gmp/mpn/x86_64/fastsse/sec_tabselect.asm b/gmp/mpn/x86_64/fastsse/sec_tabselect.asm deleted file mode 100644 index e3df110be4..0000000000 --- a/gmp/mpn/x86_64/fastsse/sec_tabselect.asm +++ /dev/null @@ -1,192 +0,0 @@ -dnl AMD64 SSE mpn_sec_tabselect. - -dnl Contributed to the GNU project by Torbjörn Granlund. - -dnl Copyright 2011-2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. 
-dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C cycles/limb cycles/limb cycles/limb -C ali,evn n unal,evn n other cases -C AMD K8,K9 1.65 1.65 1.8 -C AMD K10 0.78 0.78 0.85 -C AMD bd1 0.80 0.91 1.25 -C AMD bobcat 2.15 2.15 2.37 -C Intel P4 2.5 2.5 2.95 -C Intel core2 1.17 1.25 1.25 -C Intel NHM 0.87 0.90 0.90 -C Intel SBR 0.63 0.79 0.77 -C Intel atom 4.3 4.3 4.3 slower than plain code -C VIA nano 1.4 5.1 3.14 too alignment dependent - -C NOTES -C * We only honour the least significant 32 bits of the `which' and `nents' -C arguments to allow efficient code using just SSE2. We would need to -C either use the SSE4_1 pcmpeqq, or find some other SSE2 sequence. -C * We use movd for copying between xmm and plain registers, since old gas -C rejects movq. But gas assembles movd as movq when given a 64-bit greg. - -define(`rp', `%rdi') -define(`tp', `%rsi') -define(`n', `%rdx') -define(`nents', `%rcx') -define(`which', `%r8') - -define(`i', `%r10') -define(`j', `%r9') - -C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 -C nents n rp tab which j i temp * * * * - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -ASM_START() - TEXT - ALIGN(16) -PROLOGUE(mpn_sec_tabselect) - FUNC_ENTRY(4) -IFDOS(` mov 56(%rsp), %r8d ') - - movd which, %xmm8 - pshufd $0, %xmm8, %xmm8 C 4 `which' copies - mov $1, R32(%rax) - movd %rax, %xmm9 - pshufd $0, %xmm9, %xmm9 C 4 copies of 1 - - mov n, j - add $-8, j - js L(outer_end) - -L(outer_top): - mov nents, i - mov tp, %r11 - pxor %xmm13, %xmm13 - pxor %xmm4, %xmm4 - pxor %xmm5, %xmm5 - pxor %xmm6, %xmm6 - pxor %xmm7, %xmm7 - ALIGN(16) -L(top): movdqa %xmm8, %xmm0 - pcmpeqd %xmm13, %xmm0 - paddd %xmm9, %xmm13 - movdqu 0(tp), %xmm2 - movdqu 16(tp), %xmm3 - pand %xmm0, %xmm2 - pand %xmm0, %xmm3 - por %xmm2, %xmm4 - por %xmm3, %xmm5 - movdqu 32(tp), %xmm2 - movdqu 48(tp), %xmm3 - pand %xmm0, %xmm2 - pand %xmm0, %xmm3 - por %xmm2, %xmm6 - por %xmm3, %xmm7 - lea (tp,n,8), tp - add $-1, i - jne L(top) - - movdqu %xmm4, 0(rp) - movdqu %xmm5, 16(rp) - movdqu %xmm6, 32(rp) - movdqu %xmm7, 48(rp) - - lea 64(%r11), tp - lea 64(rp), rp - add $-8, j - jns L(outer_top) -L(outer_end): - - test $4, R8(n) - je L(b0xx) -L(b1xx):mov nents, i - mov tp, %r11 - pxor %xmm13, %xmm13 - pxor %xmm4, %xmm4 - pxor %xmm5, %xmm5 - ALIGN(16) -L(tp4): movdqa %xmm8, %xmm0 - pcmpeqd %xmm13, %xmm0 - paddd %xmm9, %xmm13 - movdqu 0(tp), %xmm2 - movdqu 16(tp), %xmm3 - pand %xmm0, %xmm2 - pand %xmm0, %xmm3 - por %xmm2, %xmm4 - por %xmm3, %xmm5 - lea (tp,n,8), tp - add $-1, i - jne L(tp4) - movdqu %xmm4, 0(rp) - movdqu %xmm5, 16(rp) - lea 32(%r11), tp - lea 32(rp), rp - -L(b0xx):test $2, R8(n) - je L(b00x) -L(b01x):mov nents, i - mov tp, %r11 - pxor %xmm13, %xmm13 - pxor %xmm4, %xmm4 - ALIGN(16) -L(tp2): movdqa %xmm8, %xmm0 - pcmpeqd %xmm13, %xmm0 - paddd %xmm9, %xmm13 - movdqu 0(tp), %xmm2 - pand %xmm0, %xmm2 - por %xmm2, %xmm4 - lea (tp,n,8), tp - add $-1, i - jne L(tp2) - movdqu %xmm4, 0(rp) - lea 16(%r11), tp - lea 16(rp), rp - -L(b00x):test $1, R8(n) - je L(b000) -L(b001):mov nents, i - mov tp, %r11 - pxor %xmm13, %xmm13 - pxor %xmm4, %xmm4 - ALIGN(16) -L(tp1): 
movdqa %xmm8, %xmm0 - pcmpeqd %xmm13, %xmm0 - paddd %xmm9, %xmm13 - movq 0(tp), %xmm2 - pand %xmm0, %xmm2 - por %xmm2, %xmm4 - lea (tp,n,8), tp - add $-1, i - jne L(tp1) - movq %xmm4, 0(rp) - -L(b000):FUNC_EXIT() - ret -EPILOGUE() |
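For completeness, the shift and table-select routines removed above also have simple reference semantics. The following C sketch (same simplified 64-bit limb assumption as in the earlier sketch, again not the deleted SSE code) shows what mpn_lshift, mpn_rshift and mpn_sec_tabselect compute; the assembly's contribution is doing this with 16-byte loads and stores, and, for sec_tabselect, touching every table entry so the memory access pattern is independent of `which`.

```c
#include <stddef.h>
#include <stdint.h>

typedef uint64_t mp_limb_t;   /* simplified: assumed 64-bit limb */
typedef size_t   mp_size_t;

/* Shift {up, n} left by cnt bits (1 <= cnt < 64), store at rp,
   return the bits shifted out in the low cnt bits of the result.
   mpn_lshiftc is the same except the stored limbs are one's-complemented. */
static mp_limb_t ref_lshift(mp_limb_t *rp, const mp_limb_t *up,
                            mp_size_t n, unsigned cnt)
{
    mp_limb_t ret = up[n - 1] >> (64 - cnt);
    for (mp_size_t i = n - 1; i > 0; i--)
        rp[i] = (up[i] << cnt) | (up[i - 1] >> (64 - cnt));
    rp[0] = up[0] << cnt;
    return ret;
}

/* Shift {up, n} right by cnt bits (1 <= cnt < 64), store at rp,
   return the bits shifted out in the high cnt bits of the result. */
static mp_limb_t ref_rshift(mp_limb_t *rp, const mp_limb_t *up,
                            mp_size_t n, unsigned cnt)
{
    mp_limb_t ret = up[0] << (64 - cnt);
    for (mp_size_t i = 0; i + 1 < n; i++)
        rp[i] = (up[i] >> cnt) | (up[i + 1] << (64 - cnt));
    rp[n - 1] = up[n - 1] >> cnt;
    return ret;
}

/* Select entry `which` from a table of `nents` entries of n limbs each,
   reading every entry so the access pattern leaks nothing about `which`. */
static void ref_sec_tabselect(mp_limb_t *rp, const mp_limb_t *tab,
                              mp_size_t n, mp_size_t nents, mp_size_t which)
{
    for (mp_size_t i = 0; i < n; i++)
        rp[i] = 0;
    for (mp_size_t k = 0; k < nents; k++) {
        /* all-ones mask when k == which, zero otherwise */
        mp_limb_t mask = -(mp_limb_t)(k == which);
        for (mp_size_t i = 0; i < n; i++)
            rp[i] |= tab[k * n + i] & mask;
    }
    /* The comparison is written plainly here for clarity; hardened code
       derives the mask without a data-dependent branch, as the SSE version
       does with pcmpeqd. */
}
```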