diff options
author | Liubov Dmitrieva <liubov.dmitrieva@gmail.com> | 2011-10-23 15:17:23 -0400 |
---|---|---|
committer | Ulrich Drepper <drepper@gmail.com> | 2011-10-23 15:17:23 -0400 |
commit | fc2ee42abe595bbf6b8bbf0637648ad8b5d4faab (patch) | |
tree | d8083a4d8e42366a82963d3525140c53fbb49ede /sysdeps/i386/i686 | |
parent | 09229f3e1b617d9dcfa3227f32bb72436d7fcac4 (diff) | |
download | glibc-fc2ee42abe595bbf6b8bbf0637648ad8b5d4faab.tar.gz |
Add optimized wcslen and strnlen for x86-32
Diffstat (limited to 'sysdeps/i386/i686')
-rw-r--r-- | sysdeps/i386/i686/multiarch/Makefile | 1 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/strlen-sse2.S | 440 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/strnlen-c.c | 8 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/strnlen-sse2.S | 3 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/strnlen.S | 56 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/wcslen-c.c | 5 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/wcslen-sse2.S | 194 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/wcslen.S | 56 |
8 files changed, 723 insertions, 40 deletions
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile index 98d1ad6d54..5f18538776 100644 --- a/sysdeps/i386/i686/multiarch/Makefile +++ b/sysdeps/i386/i686/multiarch/Makefile @@ -18,6 +18,7 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \ wcscmp-sse2 wcscmp-c memchr-sse2 memchr-sse2-bsf \ memrchr-sse2 memrchr-sse2-bsf memrchr-c \ rawmemchr-sse2 rawmemchr-sse2-bsf \ + strnlen-sse2 strnlen-c wcslen-sse2 wcslen-c \ wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c diff --git a/sysdeps/i386/i686/multiarch/strlen-sse2.S b/sysdeps/i386/i686/multiarch/strlen-sse2.S index 2dbc4a9e76..91b6d799cd 100644 --- a/sysdeps/i386/i686/multiarch/strlen-sse2.S +++ b/sysdeps/i386/i686/multiarch/strlen-sse2.S @@ -18,29 +18,46 @@ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ -#if (defined USE_AS_STRCAT || defined SHARED) && !defined NOT_IN_libc +/* for strlen only SHARED version is optimized, for strcat, strncat, strnlen both STATIC and SHARED are optimized */ + +#if (defined USE_AS_STRNLEN || defined USE_AS_STRCAT || defined SHARED) && !defined NOT_IN_libc + # ifndef USE_AS_STRCAT # include <sysdep.h> -# define CFI_PUSH(REG) \ +# define PARMS 4 +# define STR PARMS +# define RETURN ret + +# ifdef USE_AS_STRNLEN +# define LEN PARMS + 8 +# define CFI_PUSH(REG) \ cfi_adjust_cfa_offset (4); \ cfi_rel_offset (REG, 0) -# define CFI_POP(REG) \ +# define CFI_POP(REG) \ cfi_adjust_cfa_offset (-4); \ cfi_restore (REG) -# define PUSH(REG) pushl REG; CFI_PUSH (REG) -# define POP(REG) popl REG; CFI_POP (REG) -# define PARMS 4 -# define STR PARMS -# define ENTRANCE -# define RETURN ret +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) +# undef RETURN +# define RETURN POP (%edi); CFI_PUSH(%edi); ret +# endif + +# ifndef STRLEN +# define STRLEN __strlen_sse2 +# endif atom_text_section -ENTRY (__strlen_sse2) - ENTRANCE +ENTRY (STRLEN) mov STR(%esp), %edx +# ifdef USE_AS_STRNLEN + PUSH (%edi) + movl LEN(%esp), %edi + sub $4, %edi + jbe L(len_less4_prolog) +# endif # endif xor %eax, %eax cmpb $0, (%edx) @@ -51,6 +68,12 @@ ENTRY (__strlen_sse2) jz L(exit_tail2) cmpb $0, 3(%edx) jz L(exit_tail3) + +# ifdef USE_AS_STRNLEN + sub $4, %edi + jbe L(len_less8_prolog) +# endif + cmpb $0, 4(%edx) jz L(exit_tail4) cmpb $0, 5(%edx) @@ -59,6 +82,12 @@ ENTRY (__strlen_sse2) jz L(exit_tail6) cmpb $0, 7(%edx) jz L(exit_tail7) + +# ifdef USE_AS_STRNLEN + sub $4, %edi + jbe L(len_less12_prolog) +# endif + cmpb $0, 8(%edx) jz L(exit_tail8) cmpb $0, 9(%edx) @@ -67,6 +96,12 @@ ENTRY (__strlen_sse2) jz L(exit_tail10) cmpb $0, 11(%edx) jz L(exit_tail11) + +# ifdef USE_AS_STRNLEN + sub $4, %edi + jbe L(len_less16_prolog) +# endif + cmpb $0, 12(%edx) jz L(exit_tail12) cmpb $0, 13(%edx) @@ -75,11 +110,18 @@ ENTRY (__strlen_sse2) jz L(exit_tail14) cmpb $0, 15(%edx) jz L(exit_tail15) + pxor %xmm0, %xmm0 - mov %edx, %eax - lea 16(%edx), %ecx + lea 16(%edx), %eax + mov %eax, %ecx and $-16, %eax - add $16, %eax + +# ifdef USE_AS_STRNLEN + and $15, %edx + add %edx, %edi + sub $64, %edi + jbe L(len_less64) +# endif pcmpeqb (%eax), %xmm0 pmovmskb %xmm0, %edx @@ -95,7 +137,6 @@ ENTRY (__strlen_sse2) lea 16(%eax), %eax jnz L(exit) - pcmpeqb (%eax), %xmm2 pmovmskb %xmm2, %edx pxor %xmm3, %xmm3 @@ -109,6 +150,11 @@ ENTRY (__strlen_sse2) lea 16(%eax), %eax jnz L(exit) +# ifdef USE_AS_STRNLEN + sub $64, %edi + jbe L(len_less64) +# endif + pcmpeqb (%eax), %xmm0 pmovmskb %xmm0, %edx test %edx, %edx @@ -133,6 +179,11 @@ ENTRY (__strlen_sse2) lea 16(%eax), %eax jnz L(exit) +# ifdef USE_AS_STRNLEN + sub $64, %edi + jbe L(len_less64) +# endif + pcmpeqb (%eax), %xmm0 pmovmskb %xmm0, %edx test %edx, %edx @@ -157,6 +208,11 @@ ENTRY (__strlen_sse2) lea 16(%eax), %eax jnz L(exit) +# ifdef USE_AS_STRNLEN + sub $64, %edi + jbe L(len_less64) +# endif + pcmpeqb (%eax), %xmm0 pmovmskb %xmm0, %edx test %edx, %edx @@ -181,8 +237,20 @@ ENTRY (__strlen_sse2) lea 16(%eax), %eax jnz L(exit) +# ifdef USE_AS_STRNLEN + mov %eax, %edx + and $63, %edx + add %edx, %edi +# endif + and $-0x40, %eax -L(aligned_64): + + .p2align 4 +L(aligned_64_loop): +# ifdef USE_AS_STRNLEN + sub $64, %edi + jbe L(len_less64) +# endif movaps (%eax), %xmm0 movaps 16(%eax), %xmm1 movaps 32(%eax), %xmm2 @@ -194,7 +262,7 @@ L(aligned_64): pmovmskb %xmm2, %edx test %edx, %edx lea 64(%eax), %eax - jz L(aligned_64) + jz L(aligned_64_loop) pcmpeqb -64(%eax), %xmm3 pmovmskb %xmm3, %edx @@ -221,56 +289,348 @@ L(exit): sub %ecx, %eax test %dl, %dl jz L(exit_high) + + mov %dl, %cl + and $15, %cl + jz L(exit_8) test $0x01, %dl jnz L(exit_tail0) - test $0x02, %dl jnz L(exit_tail1) - test $0x04, %dl jnz L(exit_tail2) + add $3, %eax + RETURN - test $0x08, %dl - jnz L(exit_tail3) - + .p2align 4 +L(exit_8): test $0x10, %dl jnz L(exit_tail4) - test $0x20, %dl jnz L(exit_tail5) - test $0x40, %dl jnz L(exit_tail6) add $7, %eax -L(exit_tail0): RETURN + .p2align 4 L(exit_high): - add $8, %eax + mov %dh, %ch + and $15, %ch + jz L(exit_high_8) test $0x01, %dh + jnz L(exit_tail8) + test $0x02, %dh + jnz L(exit_tail9) + test $0x04, %dh + jnz L(exit_tail10) + add $11, %eax + RETURN + + .p2align 4 +L(exit_high_8): + test $0x10, %dh + jnz L(exit_tail12) + test $0x20, %dh + jnz L(exit_tail13) + test $0x40, %dh + jnz L(exit_tail14) + add $15, %eax +L(exit_tail0): + RETURN + +# ifdef USE_AS_STRNLEN + + .p2align 4 +L(len_less64): + pxor %xmm0, %xmm0 + add $64, %edi + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + lea 16(%eax), %eax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %edi + jbe L(return_start_len) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + lea 16(%eax), %eax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %edi + jbe L(return_start_len) + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + lea 16(%eax), %eax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %edi + jbe L(return_start_len) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + lea 16(%eax), %eax + test %edx, %edx + jnz L(strnlen_exit) + + movl LEN(%esp), %eax + RETURN + + .p2align 4 +L(strnlen_exit): + sub %ecx, %eax + + test %dl, %dl + jz L(strnlen_exit_high) + mov %dl, %cl + and $15, %cl + jz L(strnlen_exit_8) + test $0x01, %dl jnz L(exit_tail0) + test $0x02, %dl + jnz L(strnlen_exit_tail1) + test $0x04, %dl + jnz L(strnlen_exit_tail2) + sub $4, %edi + jb L(return_start_len) + lea 3(%eax), %eax + RETURN - test $0x02, %dh - jnz L(exit_tail1) + .p2align 4 +L(strnlen_exit_8): + test $0x10, %dl + jnz L(strnlen_exit_tail4) + test $0x20, %dl + jnz L(strnlen_exit_tail5) + test $0x40, %dl + jnz L(strnlen_exit_tail6) + sub $8, %edi + jb L(return_start_len) + lea 7(%eax), %eax + RETURN + .p2align 4 +L(strnlen_exit_high): + mov %dh, %ch + and $15, %ch + jz L(strnlen_exit_high_8) + test $0x01, %dh + jnz L(strnlen_exit_tail8) + test $0x02, %dh + jnz L(strnlen_exit_tail9) test $0x04, %dh - jnz L(exit_tail2) - - test $0x08, %dh - jnz L(exit_tail3) + jnz L(strnlen_exit_tail10) + sub $12, %edi + jb L(return_start_len) + lea 11(%eax), %eax + RETURN + .p2align 4 +L(strnlen_exit_high_8): test $0x10, %dh - jnz L(exit_tail4) - + jnz L(strnlen_exit_tail12) test $0x20, %dh - jnz L(exit_tail5) - + jnz L(strnlen_exit_tail13) test $0x40, %dh - jnz L(exit_tail6) - add $7, %eax + jnz L(strnlen_exit_tail14) + sub $16, %edi + jb L(return_start_len) + lea 15(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail1): + sub $2, %edi + jb L(return_start_len) + lea 1(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail2): + sub $3, %edi + jb L(return_start_len) + lea 2(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail4): + sub $5, %edi + jb L(return_start_len) + lea 4(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail5): + sub $6, %edi + jb L(return_start_len) + lea 5(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail6): + sub $7, %edi + jb L(return_start_len) + lea 6(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail8): + sub $9, %edi + jb L(return_start_len) + lea 8(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail9): + sub $10, %edi + jb L(return_start_len) + lea 9(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail10): + sub $11, %edi + jb L(return_start_len) + lea 10(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail12): + sub $13, %edi + jb L(return_start_len) + lea 12(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail13): + sub $14, %edi + jb L(return_start_len) + lea 13(%eax), %eax RETURN .p2align 4 +L(strnlen_exit_tail14): + sub $15, %edi + jb L(return_start_len) + lea 14(%eax), %eax + RETURN + + .p2align 4 +L(return_start_len): + movl LEN(%esp), %eax + RETURN + +/* for prolog only */ + + .p2align 4 +L(len_less4_prolog): + xor %eax, %eax + + add $4, %edi + jz L(exit_tail0) + + cmpb $0, (%edx) + jz L(exit_tail0) + cmp $1, %edi + je L(exit_tail1) + + cmpb $0, 1(%edx) + jz L(exit_tail1) + cmp $2, %edi + je L(exit_tail2) + + cmpb $0, 2(%edx) + jz L(exit_tail2) + cmp $3, %edi + je L(exit_tail3) + + cmpb $0, 3(%edx) + jz L(exit_tail3) + mov $4, %eax + RETURN + + .p2align 4 +L(len_less8_prolog): + add $4, %edi + + cmpb $0, 4(%edx) + jz L(exit_tail4) + cmp $1, %edi + je L(exit_tail5) + + cmpb $0, 5(%edx) + jz L(exit_tail5) + cmp $2, %edi + je L(exit_tail6) + + cmpb $0, 6(%edx) + jz L(exit_tail6) + cmp $3, %edi + je L(exit_tail7) + + cmpb $0, 7(%edx) + jz L(exit_tail7) + mov $8, %eax + RETURN + + + .p2align 4 +L(len_less12_prolog): + add $4, %edi + + cmpb $0, 8(%edx) + jz L(exit_tail8) + cmp $1, %edi + je L(exit_tail9) + + cmpb $0, 9(%edx) + jz L(exit_tail9) + cmp $2, %edi + je L(exit_tail10) + + cmpb $0, 10(%edx) + jz L(exit_tail10) + cmp $3, %edi + je L(exit_tail11) + + cmpb $0, 11(%edx) + jz L(exit_tail11) + mov $12, %eax + RETURN + + .p2align 4 +L(len_less16_prolog): + add $4, %edi + + cmpb $0, 12(%edx) + jz L(exit_tail12) + cmp $1, %edi + je L(exit_tail13) + + cmpb $0, 13(%edx) + jz L(exit_tail13) + cmp $2, %edi + je L(exit_tail14) + + cmpb $0, 14(%edx) + jz L(exit_tail14) + cmp $3, %edi + je L(exit_tail15) + + cmpb $0, 15(%edx) + jz L(exit_tail15) + mov $16, %eax + RETURN +# endif + + .p2align 4 L(exit_tail1): add $1, %eax RETURN @@ -330,7 +690,7 @@ L(exit_tail14): L(exit_tail15): add $15, %eax # ifndef USE_AS_STRCAT - ret -END (__strlen_sse2) + RETURN +END (STRLEN) # endif #endif diff --git a/sysdeps/i386/i686/multiarch/strnlen-c.c b/sysdeps/i386/i686/multiarch/strnlen-c.c new file mode 100644 index 0000000000..567af2c815 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strnlen-c.c @@ -0,0 +1,8 @@ +#ifndef NOT_IN_libc +# define STRNLEN __strnlen_ia32 +# undef libc_hidden_builtin_def +# define libc_hidden_def(name) \ + __hidden_ver1 (__strnlen_ia32, __GI_strnlen, __strnlen_ia32); +#endif + +#include "string/strnlen.c" diff --git a/sysdeps/i386/i686/multiarch/strnlen-sse2.S b/sysdeps/i386/i686/multiarch/strnlen-sse2.S new file mode 100644 index 0000000000..56b6ae2a5c --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strnlen-sse2.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNLEN +#define STRLEN __strnlen_sse2 +#include "strlen-sse2.S" diff --git a/sysdeps/i386/i686/multiarch/strnlen.S b/sysdeps/i386/i686/multiarch/strnlen.S new file mode 100644 index 0000000000..7e542d9b7c --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strnlen.S @@ -0,0 +1,56 @@ +/* Multiple versions of strnlen + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + +#ifndef NOT_IN_libc + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + + .text +ENTRY(__strnlen) + .type __strnlen, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal __strnlen_ia32@GOTOFF(%ebx), %eax + testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __strnlen_sse2@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4); + cfi_restore (ebx) + ret +END(__strnlen) + +weak_alias(__strnlen, strnlen) +#endif diff --git a/sysdeps/i386/i686/multiarch/wcslen-c.c b/sysdeps/i386/i686/multiarch/wcslen-c.c new file mode 100644 index 0000000000..49f32a25e4 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wcslen-c.c @@ -0,0 +1,5 @@ +#ifndef NOT_IN_libc +# define WCSLEN __wcslen_ia32 +#endif + +#include "wcsmbs/wcslen.c" diff --git a/sysdeps/i386/i686/multiarch/wcslen-sse2.S b/sysdeps/i386/i686/multiarch/wcslen-sse2.S new file mode 100644 index 0000000000..d41d623098 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wcslen-sse2.S @@ -0,0 +1,194 @@ +/* wcslen with SSE2 + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef NOT_IN_libc +# include <sysdep.h> +# define STR 4 + + .text +ENTRY (__wcslen_sse2) + mov STR(%esp), %edx + + cmp $0, (%edx) + jz L(exit_tail0) + cmp $0, 4(%edx) + jz L(exit_tail1) + cmp $0, 8(%edx) + jz L(exit_tail2) + cmp $0, 12(%edx) + jz L(exit_tail3) + cmp $0, 16(%edx) + jz L(exit_tail4) + cmp $0, 20(%edx) + jz L(exit_tail5) + cmp $0, 24(%edx) + jz L(exit_tail6) + cmp $0, 28(%edx) + jz L(exit_tail7) + + pxor %xmm0, %xmm0 + + lea 32(%edx), %eax + lea 16(%edx), %ecx + and $-16, %eax + + pcmpeqd (%eax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqd (%eax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqd (%eax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqd (%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + and $-0x40, %eax + + .p2align 4 +L(aligned_64_loop): + movaps (%eax), %xmm0 + movaps 16(%eax), %xmm1 + movaps 32(%eax), %xmm2 + movaps 48(%eax), %xmm6 + + pminub %xmm1, %xmm0 + pminub %xmm6, %xmm2 + pminub %xmm0, %xmm2 + pcmpeqd %xmm3, %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 64(%eax), %eax + jz L(aligned_64_loop) + + pcmpeqd -64(%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 48(%ecx), %ecx + jnz L(exit) + + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%ecx), %ecx + jnz L(exit) + + pcmpeqd -32(%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%ecx), %ecx + jnz L(exit) + + pcmpeqd %xmm6, %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%ecx), %ecx + jnz L(exit) + + jmp L(aligned_64_loop) + + .p2align 4 +L(exit): + sub %ecx, %eax + shr $2, %eax + test %dl, %dl + jz L(exit_high) + + mov %dl, %cl + and $15, %cl + jz L(exit_1) + ret + + .p2align 4 +L(exit_high): + mov %dh, %ch + and $15, %ch + jz L(exit_3) + add $2, %eax + ret + + .p2align 4 +L(exit_1): + add $1, %eax + ret + + .p2align 4 +L(exit_3): + add $3, %eax + ret + + .p2align 4 +L(exit_tail0): + xor %eax, %eax + ret + + .p2align 4 +L(exit_tail1): + mov $1, %eax + ret + + .p2align 4 +L(exit_tail2): + mov $2, %eax + ret + + .p2align 4 +L(exit_tail3): + mov $3, %eax + ret + + .p2align 4 +L(exit_tail4): + mov $4, %eax + ret + + .p2align 4 +L(exit_tail5): + mov $5, %eax + ret + + .p2align 4 +L(exit_tail6): + mov $6, %eax + ret + + .p2align 4 +L(exit_tail7): + mov $7, %eax + ret + +END (__wcslen_sse2) +#endif diff --git a/sysdeps/i386/i686/multiarch/wcslen.S b/sysdeps/i386/i686/multiarch/wcslen.S new file mode 100644 index 0000000000..58670377e0 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wcslen.S @@ -0,0 +1,56 @@ +/* Multiple versions of wcslen + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + +#ifndef NOT_IN_libc + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + + .text +ENTRY(__wcslen) + .type __wcslen, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal __wcslen_ia32@GOTOFF(%ebx), %eax + testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __wcslen_sse2@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4); + cfi_restore (ebx) + ret +END(__wcslen) + +weak_alias(__wcslen, wcslen) +#endif |