diff options
author | Ulrich Drepper <drepper@gmail.com> | 2011-12-17 14:39:23 -0500 |
---|---|---|
committer | Ulrich Drepper <drepper@gmail.com> | 2011-12-17 14:39:23 -0500 |
commit | 1d3e4b618ae0217f1736753f3085f9c4fcc827bf (patch) | |
tree | 90a3f8d19f941a684e1482b8813c534d82cfb19e /sysdeps/i386/i686 | |
parent | a2d18b64edb486825fb5946eefc2131426ccfec9 (diff) | |
download | glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.tar.gz |
Optimized wcschr and wcscpy for x86-64 and x86-32
Diffstat (limited to 'sysdeps/i386/i686')
-rw-r--r-- | sysdeps/i386/i686/multiarch/Makefile | 3 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/wcschr-c.c | 8 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/wcschr-sse2.S | 220 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/wcschr.S | 54 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/wcscpy-c.c | 5 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/wcscpy-ssse3.S | 621 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/wcscpy.S | 46 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/wcsrchr-c.c | 5 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/wcsrchr-sse2.S | 355 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/wcsrchr.S | 54 |
10 files changed, 1370 insertions, 1 deletions
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile index 426b718e47..b764e5b825 100644 --- a/sysdeps/i386/i686/multiarch/Makefile +++ b/sysdeps/i386/i686/multiarch/Makefile @@ -37,7 +37,8 @@ endif ifeq ($(subdir),wcsmbs) sysdep_routines += wcscmp-sse2 wcscmp-c wcslen-sse2 wcslen-c \ - wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c + wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcschr-sse2 \ + wcschr-c wcsrchr-sse2 wcsrchr-c wcscpy-ssse3 wcscpy-c endif ifeq (mathyes,$(subdir)$(config-cflags-avx)) diff --git a/sysdeps/i386/i686/multiarch/wcschr-c.c b/sysdeps/i386/i686/multiarch/wcschr-c.c new file mode 100644 index 0000000000..a63e50e283 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wcschr-c.c @@ -0,0 +1,8 @@ +#ifndef NOT_IN_libc +# undef libc_hidden_def +# define libc_hidden_def(name) \ + __hidden_ver1 (__wcschr_ia32, __GI_wcschr, __wcschr_ia32); +# define WCSCHR __wcschr_ia32 +#endif + +#include "wcsmbs/wcschr.c" diff --git a/sysdeps/i386/i686/multiarch/wcschr-sse2.S b/sysdeps/i386/i686/multiarch/wcschr-sse2.S new file mode 100644 index 0000000000..cc8204cfe3 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wcschr-sse2.S @@ -0,0 +1,220 @@ +/* wcschr with SSE2, without using bsf instructions + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef NOT_IN_libc +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 + + atom_text_section +ENTRY (__wcschr_sse2) + + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + + mov %ecx, %eax + punpckldq %xmm1, %xmm1 + pxor %xmm2, %xmm2 + punpckldq %xmm1, %xmm1 + + and $63, %eax + cmp $48, %eax + ja L(cross_cache) + + movdqu (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + and $-16, %ecx + jmp L(loop) + + .p2align 4 +L(cross_cache): + PUSH (%edi) + mov %ecx, %edi + mov %eax, %ecx + and $-16, %edi + and $15, %ecx + movdqa (%edi), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + + sarl %cl, %edx + sarl %cl, %eax + test %eax, %eax + jz L(unaligned_no_match) + + add %edi, %ecx + POP (%edi) + + test %edx, %edx + jz L(match_case1) + test %al, %al + jz L(match_higth_case2) + test $15, %al + jnz L(match_case2_4) + test $15, %dl + jnz L(return_null) + lea 4(%ecx), %eax + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(unaligned_no_match): + mov %edi, %ecx + POP (%edi) + + test %edx, %edx + jnz L(return_null) + + pxor %xmm2, %xmm2 + +/* Loop start on aligned string. */ + .p2align 4 +L(loop): + add $16, %ecx + movdqa (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + add $16, %ecx + + movdqa (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + add $16, %ecx + + movdqa (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + add $16, %ecx + + movdqa (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jz L(loop) + + .p2align 4 +L(matches): + pmovmskb %xmm2, %edx + test %eax, %eax + jz L(return_null) + test %edx, %edx + jz L(match_case1) + + .p2align 4 +L(match_case2): + test %al, %al + jz L(match_higth_case2) + test $15, %al + jnz L(match_case2_4) + test $15, %dl + jnz L(return_null) + lea 4(%ecx), %eax + ret + + .p2align 4 +L(match_case2_4): + mov %ecx, %eax + ret + + .p2align 4 +L(match_higth_case2): + test %dl, %dl + jnz L(return_null) + test $15, %ah + jnz L(match_case2_12) + test $15, %dh + jnz L(return_null) + lea 12(%ecx), %eax + ret + + .p2align 4 +L(match_case2_12): + lea 8(%ecx), %eax + ret + + .p2align 4 +L(match_case1): + test %al, %al + jz L(match_higth_case1) + + test $0x01, %al + jnz L(exit0) + lea 4(%ecx), %eax + ret + + .p2align 4 +L(match_higth_case1): + test $0x01, %ah + jnz L(exit3) + lea 12(%ecx), %eax + ret + + .p2align 4 +L(exit0): + mov %ecx, %eax + ret + + .p2align 4 +L(exit3): + lea 8(%ecx), %eax + ret + + .p2align 4 +L(return_null): + xor %eax, %eax + ret + +END (__wcschr_sse2) +#endif diff --git a/sysdeps/i386/i686/multiarch/wcschr.S b/sysdeps/i386/i686/multiarch/wcschr.S new file mode 100644 index 0000000000..bf0d6d5754 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wcschr.S @@ -0,0 +1,54 @@ +/* Multiple versions of wcschr + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + +#ifndef NOT_IN_libc + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + + .text +ENTRY(wcschr) + .type wcschr, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal __wcschr_ia32@GOTOFF(%ebx), %eax + testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __wcschr_sse2@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4); + cfi_restore (ebx) + ret +END(wcschr) +#endif diff --git a/sysdeps/i386/i686/multiarch/wcscpy-c.c b/sysdeps/i386/i686/multiarch/wcscpy-c.c new file mode 100644 index 0000000000..a3c4024c01 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wcscpy-c.c @@ -0,0 +1,5 @@ +#ifndef NOT_IN_libc +# define wcscpy __wcscpy_ia32 +#endif + +#include "wcsmbs/wcscpy.c" diff --git a/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S b/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S new file mode 100644 index 0000000000..84d92a8bde --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S @@ -0,0 +1,621 @@ +/* wcscpy with SSSE3 + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef NOT_IN_libc +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define RETURN POP (%edi); ret; CFI_PUSH (%edi) +# define STR1 PARMS +# define STR2 STR1+4 +# define LEN STR2+4 + + atom_text_section +ENTRY (__wcscpy_ssse3) + mov STR1(%esp), %edx + mov STR2(%esp), %ecx + + cmp $0, (%ecx) + jz L(ExitTail4) + cmp $0, 4(%ecx) + jz L(ExitTail8) + cmp $0, 8(%ecx) + jz L(ExitTail12) + cmp $0, 12(%ecx) + jz L(ExitTail16) + + PUSH (%edi) + mov %edx, %edi + + PUSH (%esi) + lea 16(%ecx), %esi + + and $-16, %esi + + pxor %xmm0, %xmm0 + pcmpeqd (%esi), %xmm0 + movdqu (%ecx), %xmm1 + movdqu %xmm1, (%edx) + + pmovmskb %xmm0, %eax + sub %ecx, %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + mov %edx, %eax + lea 16(%edx), %edx + and $-16, %edx + sub %edx, %eax + + sub %eax, %ecx + mov %ecx, %eax + and $0xf, %eax + mov $0, %esi + + jz L(Align16Both) + cmp $4, %eax + je L(Shl4) + cmp $8, %eax + je L(Shl8) + jmp L(Shl12) + +L(Align16Both): + movaps (%ecx), %xmm1 + movaps 16(%ecx), %xmm2 + movaps %xmm1, (%edx) + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm3 + movaps %xmm2, (%edx, %esi) + pcmpeqd %xmm3, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm4 + movaps %xmm3, (%edx, %esi) + pcmpeqd %xmm4, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm1 + movaps %xmm4, (%edx, %esi) + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm2 + movaps %xmm1, (%edx, %esi) + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm3 + movaps %xmm2, (%edx, %esi) + pcmpeqd %xmm3, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps %xmm3, (%edx, %esi) + mov %ecx, %eax + lea 16(%ecx, %esi), %ecx + and $-0x40, %ecx + sub %ecx, %eax + sub %eax, %edx + + mov $-0x40, %esi + +L(Aligned64Loop): + movaps (%ecx), %xmm2 + movaps 32(%ecx), %xmm3 + movaps %xmm2, %xmm4 + movaps 16(%ecx), %xmm5 + movaps %xmm3, %xmm6 + movaps 48(%ecx), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + lea 64(%edx), %edx + pcmpeqd %xmm0, %xmm3 + lea 64(%ecx), %ecx + pmovmskb %xmm3, %eax + + test %eax, %eax + jnz L(Aligned64Leave) + movaps %xmm4, -64(%edx) + movaps %xmm5, -48(%edx) + movaps %xmm6, -32(%edx) + movaps %xmm7, -16(%edx) + jmp L(Aligned64Loop) + +L(Aligned64Leave): + pcmpeqd %xmm4, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqd %xmm5, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm4, -64(%edx) + test %eax, %eax + lea 16(%esi), %esi + jnz L(CopyFrom1To16Bytes) + + pcmpeqd %xmm6, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm5, -48(%edx) + test %eax, %eax + lea 16(%esi), %esi + jnz L(CopyFrom1To16Bytes) + + movaps %xmm6, -32(%edx) + pcmpeqd %xmm7, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + lea 16(%esi), %esi + jnz L(CopyFrom1To16Bytes) + + mov $-0x40, %esi + movaps %xmm7, -16(%edx) + jmp L(Aligned64Loop) + + .p2align 4 +L(Shl4): + movaps -4(%ecx), %xmm1 + movaps 12(%ecx), %xmm2 +L(Shl4Start): + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + lea 28(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -12(%ecx), %ecx + sub %eax, %edx + + movaps -4(%ecx), %xmm1 + +L(Shl4LoopStart): + movaps 12(%ecx), %xmm2 + movaps 28(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 44(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 60(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqd %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + test %eax, %eax + palignr $4, %xmm3, %xmm4 + jnz L(Shl4Start) + + palignr $4, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $4, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl4LoopStart) + +L(Shl4LoopExit): + movaps (%edx), %xmm6 + psrldq $12, %xmm6 + palignr $4, %xmm1, %xmm6 + movaps %xmm6, (%edx) + add $12, %edx + add $12, %ecx + + POP (%esi) + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit4) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edi, %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(Shl8): + movaps -8(%ecx), %xmm1 + movaps 8(%ecx), %xmm2 +L(Shl8Start): + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + lea 24(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -8(%ecx), %ecx + sub %eax, %edx + + movaps -8(%ecx), %xmm1 + +L(Shl8LoopStart): + movaps 8(%ecx), %xmm2 + movaps 24(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 40(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 56(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqd %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $8, %xmm4, %xmm5 + test %eax, %eax + palignr $8, %xmm3, %xmm4 + jnz L(Shl8Start) + + palignr $8, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $8, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl8LoopStart) + +L(Shl8LoopExit): + movaps (%edx), %xmm6 + psrldq $8, %xmm6 + palignr $8, %xmm1, %xmm6 + movaps %xmm6, (%edx) + add $8, %edx + add $8, %ecx + + POP (%esi) + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit4) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edi, %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(Shl12): + movaps -12(%ecx), %xmm1 + movaps 4(%ecx), %xmm2 +L(Shl12Start): + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + lea 20(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -4(%ecx), %ecx + sub %eax, %edx + + movaps -12(%ecx), %xmm1 + +L(Shl12LoopStart): + movaps 4(%ecx), %xmm2 + movaps 20(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 36(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 52(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqd %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $12, %xmm4, %xmm5 + test %eax, %eax + palignr $12, %xmm3, %xmm4 + jnz L(Shl12Start) + + palignr $12, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $12, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl12LoopStart) + +L(Shl12LoopExit): + movaps (%edx), %xmm6 + psrldq $4, %xmm6 + mov $4, %esi + palignr $12, %xmm1, %xmm6 + movaps %xmm6, (%edx) + + .p2align 4 +L(CopyFrom1To16Bytes): + add %esi, %edx + add %esi, %ecx + + POP (%esi) + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit4) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edi, %eax + RETURN + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit12) + movdqu (%ecx), %xmm0 + movdqu %xmm0, (%edx) + movl %edi, %eax + RETURN + + .p2align 4 +L(Exit4): + movl (%ecx), %eax + movl %eax, (%edx) + movl %edi, %eax + RETURN + + .p2align 4 +L(Exit12): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl 8(%ecx), %eax + movl %eax, 8(%edx) + movl %edi, %eax + RETURN + +CFI_POP (%edi) + + .p2align 4 +L(ExitTail4): + movl (%ecx), %eax + movl %eax, (%edx) + movl %edx, %eax + ret + + .p2align 4 +L(ExitTail8): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edx, %eax + ret + + .p2align 4 +L(ExitTail12): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl 8(%ecx), %eax + movl %eax, 8(%edx) + movl %edx, %eax + ret + + .p2align 4 +L(ExitTail16): + movdqu (%ecx), %xmm0 + movdqu %xmm0, (%edx) + movl %edx, %eax + ret + +END (__wcscpy_ssse3) +#endif diff --git a/sysdeps/i386/i686/multiarch/wcscpy.S b/sysdeps/i386/i686/multiarch/wcscpy.S new file mode 100644 index 0000000000..c7bafbe82a --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wcscpy.S @@ -0,0 +1,46 @@ +/* Multiple versions of wcscpy + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. */ +#ifndef NOT_IN_libc + .text +ENTRY(wcscpy) + .type wcscpy, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal __wcscpy_ia32@GOTOFF(%ebx), %eax + testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __wcscpy_ssse3@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + ret +END(wcscpy) +#endif diff --git a/sysdeps/i386/i686/multiarch/wcsrchr-c.c b/sysdeps/i386/i686/multiarch/wcsrchr-c.c new file mode 100644 index 0000000000..c7444ce89b --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wcsrchr-c.c @@ -0,0 +1,5 @@ +#ifndef NOT_IN_libc +# define wcsrchr __wcsrchr_ia32 +#endif + +#include "wcsmbs/wcsrchr.c" diff --git a/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S b/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S new file mode 100644 index 0000000000..2859f7e9f3 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S @@ -0,0 +1,355 @@ +/* wcsrchr with SSE2, without using bsf instructions. + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef NOT_IN_libc +# include <sysdep.h> +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 8 +# define ENTRANCE PUSH (%edi); +# define RETURN POP (%edi); ret; CFI_PUSH (%edi); +# define STR1 PARMS +# define STR2 STR1+4 + + atom_text_section +ENTRY (__wcsrchr_sse2) + + ENTRANCE + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + + mov %ecx, %edi + punpckldq %xmm1, %xmm1 + pxor %xmm2, %xmm2 + punpckldq %xmm1, %xmm1 + +/* ECX has OFFSET. */ + and $63, %ecx + cmp $48, %ecx + ja L(crosscache) + +/* unaligned string. */ + movdqu (%edi), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 +/* Find where NULL is. */ + pmovmskb %xmm2, %ecx +/* Check if there is a match. */ + pmovmskb %xmm0, %eax + add $16, %edi + + test %eax, %eax + jnz L(unaligned_match1) + + test %ecx, %ecx + jnz L(return_null) + + and $-16, %edi + + PUSH (%esi) + + xor %edx, %edx + jmp L(loop) + + CFI_POP (%esi) + + .p2align 4 +L(unaligned_match1): + test %ecx, %ecx + jnz L(prolog_find_zero_1) + + PUSH (%esi) + +/* Save current match */ + mov %eax, %edx + mov %edi, %esi + and $-16, %edi + jmp L(loop) + + CFI_POP (%esi) + + .p2align 4 +L(crosscache): +/* Hancle unaligned string. */ + and $15, %ecx + and $-16, %edi + pxor %xmm3, %xmm3 + movdqa (%edi), %xmm0 + pcmpeqd %xmm0, %xmm3 + pcmpeqd %xmm1, %xmm0 +/* Find where NULL is. */ + pmovmskb %xmm3, %edx +/* Check if there is a match. */ + pmovmskb %xmm0, %eax +/* Remove the leading bytes. */ + shr %cl, %edx + shr %cl, %eax + add $16, %edi + + test %eax, %eax + jnz L(unaligned_match) + + test %edx, %edx + jnz L(return_null) + + PUSH (%esi) + + xor %edx, %edx + jmp L(loop) + + CFI_POP (%esi) + + .p2align 4 +L(unaligned_match): + test %edx, %edx + jnz L(prolog_find_zero) + + PUSH (%esi) + + mov %eax, %edx + lea (%edi, %ecx), %esi + +/* Loop start on aligned string. */ + .p2align 4 +L(loop): + movdqa (%edi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %edi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm3 + pcmpeqd %xmm3, %xmm2 + add $16, %edi + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm2, %ecx + pmovmskb %xmm3, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm4 + pcmpeqd %xmm4, %xmm2 + add $16, %edi + pcmpeqd %xmm1, %xmm4 + pmovmskb %xmm2, %ecx + pmovmskb %xmm4, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm5 + pcmpeqd %xmm5, %xmm2 + add $16, %edi + pcmpeqd %xmm1, %xmm5 + pmovmskb %xmm2, %ecx + pmovmskb %xmm5, %eax + or %eax, %ecx + jz L(loop) + + .p2align 4 +L(matches): + test %eax, %eax + jnz L(match) +L(return_value): + test %edx, %edx + jz L(return_null_1) + mov %edx, %eax + mov %esi, %edi + + POP (%esi) + + test %ah, %ah + jnz L(match_third_or_fourth_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(return_null_1): + POP (%esi) + + xor %eax, %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(match): + pmovmskb %xmm2, %ecx + test %ecx, %ecx + jnz L(find_zero) +/* save match info */ + mov %eax, %edx + mov %edi, %esi + jmp L(loop) + + .p2align 4 +L(find_zero): + test %cl, %cl + jz L(find_zero_in_third_or_fourth_wchar) + test $15, %cl + jz L(find_zero_in_second_wchar) + and $1, %eax + jz L(return_value) + + POP (%esi) + + lea -16(%edi), %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(find_zero_in_second_wchar): + and $1 << 5 - 1, %eax + jz L(return_value) + + POP (%esi) + + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(find_zero_in_third_or_fourth_wchar): + test $15, %ch + jz L(find_zero_in_fourth_wchar) + and $1 << 9 - 1, %eax + jz L(return_value) + + POP (%esi) + + test %ah, %ah + jnz L(match_third_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(find_zero_in_fourth_wchar): + + POP (%esi) + + test %ah, %ah + jnz L(match_third_or_fourth_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(match_second_wchar): + lea -12(%edi), %eax + RETURN + + .p2align 4 +L(match_third_or_fourth_wchar): + test $15 << 4, %ah + jnz L(match_fourth_wchar) + lea -8(%edi), %eax + RETURN + + .p2align 4 +L(match_third_wchar): + lea -8(%edi), %eax + RETURN + + .p2align 4 +L(match_fourth_wchar): + lea -4(%edi), %eax + RETURN + + .p2align 4 +L(return_null): + xor %eax, %eax + RETURN + + .p2align 4 +L(prolog_find_zero): + add %ecx, %edi + mov %edx, %ecx +L(prolog_find_zero_1): + test %cl, %cl + jz L(prolog_find_zero_in_third_or_fourth_wchar) + test $15, %cl + jz L(prolog_find_zero_in_second_wchar) + and $1, %eax + jz L(return_null) + + lea -16(%edi), %eax + RETURN + + .p2align 4 +L(prolog_find_zero_in_second_wchar): + and $1 << 5 - 1, %eax + jz L(return_null) + + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + .p2align 4 +L(prolog_find_zero_in_third_or_fourth_wchar): + test $15, %ch + jz L(prolog_find_zero_in_fourth_wchar) + and $1 << 9 - 1, %eax + jz L(return_null) + + test %ah, %ah + jnz L(match_third_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + .p2align 4 +L(prolog_find_zero_in_fourth_wchar): + test %ah, %ah + jnz L(match_third_or_fourth_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + +END (__wcsrchr_sse2) +#endif diff --git a/sysdeps/i386/i686/multiarch/wcsrchr.S b/sysdeps/i386/i686/multiarch/wcsrchr.S new file mode 100644 index 0000000000..8240063dd6 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wcsrchr.S @@ -0,0 +1,54 @@ +/* Multiple versions of wcsrchr + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + +#ifndef NOT_IN_libc + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + + .text +ENTRY(wcsrchr) + .type wcsrchr, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal __wcsrchr_ia32@GOTOFF(%ebx), %eax + testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __wcsrchr_sse2@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4); + cfi_restore (ebx) + ret +END(wcsrchr) +#endif |