From 165308eb2c66542c88d002d63dc68df112f5c818 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20B=C3=ADlka?= Date: Fri, 26 Jun 2015 22:19:29 +0200 Subject: Optimize sse4 strspn/strcspn/strpbrk --- sysdeps/x86_64/multiarch/Makefile | 6 +- sysdeps/x86_64/multiarch/strcspn-c.c | 173 -------------------------- sysdeps/x86_64/multiarch/strcspn_sse42.S | 3 + sysdeps/x86_64/multiarch/strpbrk-c.c | 8 -- sysdeps/x86_64/multiarch/strpbrk_sse42.S | 204 +++++++++++++++++++++++++++++++ sysdeps/x86_64/multiarch/strspn-c.c | 145 ---------------------- sysdeps/x86_64/multiarch/strspn_sse42.S | 3 + sysdeps/x86_64/multiarch/varshift.c | 25 ---- sysdeps/x86_64/multiarch/varshift.h | 30 ----- 9 files changed, 211 insertions(+), 386 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/strcspn-c.c create mode 100644 sysdeps/x86_64/multiarch/strcspn_sse42.S delete mode 100644 sysdeps/x86_64/multiarch/strpbrk-c.c create mode 100644 sysdeps/x86_64/multiarch/strpbrk_sse42.S delete mode 100644 sysdeps/x86_64/multiarch/strspn-c.c create mode 100644 sysdeps/x86_64/multiarch/strspn_sse42.S delete mode 100644 sysdeps/x86_64/multiarch/varshift.c delete mode 100644 sysdeps/x86_64/multiarch/varshift.h diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 80941621eb..05d5c9ba30 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -22,11 +22,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned ifeq (yes,$(config-cflags-sse4)) -sysdep_routines += strcspn-c strpbrk-c strspn-c varshift -CFLAGS-varshift.c += -msse4 -CFLAGS-strcspn-c.c += -msse4 -CFLAGS-strpbrk-c.c += -msse4 -CFLAGS-strspn-c.c += -msse4 +sysdep_routines += strcspn_sse42 strpbrk_sse42 strspn_sse42 endif ifeq (yes,$(config-cflags-avx2)) diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c deleted file mode 100644 index 60b2ed7a3f..0000000000 --- 
a/sysdeps/x86_64/multiarch/strcspn-c.c +++ /dev/null @@ -1,173 +0,0 @@ -/* strcspn with SSE4.2 intrinsics - Copyright (C) 2009-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include -#include -#include "varshift.h" - -/* We use 0x2: - _SIDD_SBYTE_OPS - | _SIDD_CMP_EQUAL_ANY - | _SIDD_POSITIVE_POLARITY - | _SIDD_LEAST_SIGNIFICANT - on pcmpistri to compare xmm/mem128 - - 0 1 2 3 4 5 6 7 8 9 A B C D E F - X X X X X X X X X X X X X X X X - - against xmm - - 0 1 2 3 4 5 6 7 8 9 A B C D E F - A A A A A A A A A A A A A A A A - - to find out if the first 16byte data element has any byte A and - the offset of the first byte. There are 3 cases: - - 1. The first 16byte data element has the byte A at the offset X. - 2. The first 16byte data element has EOS and doesn't have the byte A. - 3. The first 16byte data element is valid and doesn't have the byte A. - - Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: - - 1 X 1 0/1 0 - 2 16 0 1 0 - 3 16 0 0 0 - - We exit from the loop for cases 1 and 2 with jbe which branches - when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset - X for case 1. 
*/ - -#ifndef STRCSPN_SSE2 -# define STRCSPN_SSE2 __strcspn_sse2 -# define STRCSPN_SSE42 __strcspn_sse42 -#endif - -#ifdef USE_AS_STRPBRK -# define RETURN(val1, val2) return val1 -#else -# define RETURN(val1, val2) return val2 -#endif - -extern -#ifdef USE_AS_STRPBRK -char * -#else -size_t -#endif -STRCSPN_SSE2 (const char *, const char *); - - -#ifdef USE_AS_STRPBRK -char * -#else -size_t -#endif -__attribute__ ((section (".text.sse4.2"))) -STRCSPN_SSE42 (const char *s, const char *a) -{ - if (*a == 0) - RETURN (NULL, strlen (s)); - - const char *aligned; - __m128i mask; - int offset = (int) ((size_t) a & 15); - if (offset != 0) - { - /* Load masks. */ - aligned = (const char *) ((size_t) a & -16L); - __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); - - mask = __m128i_shift_right (mask0, offset); - - /* Find where the NULL terminator is. */ - int length = _mm_cmpistri (mask, mask, 0x3a); - if (length == 16 - offset) - { - /* There is no NULL terminator. */ - __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); - int index = _mm_cmpistri (mask1, mask1, 0x3a); - length += index; - - /* Don't use SSE4.2 if the length of A > 16. */ - if (length > 16) - return STRCSPN_SSE2 (s, a); - - if (index != 0) - { - /* Combine mask0 and mask1. We could play games with - palignr, but frankly this data should be in L1 now - so do the merge via an unaligned load. */ - mask = _mm_loadu_si128 ((__m128i *) a); - } - } - } - else - { - /* A is aligned. */ - mask = _mm_load_si128 ((__m128i *) a); - - /* Find where the NULL terminator is. */ - int length = _mm_cmpistri (mask, mask, 0x3a); - if (length == 16) - { - /* There is no NULL terminator. Don't use SSE4.2 if the length - of A > 16. */ - if (a[16] != 0) - return STRCSPN_SSE2 (s, a); - } - } - - offset = (int) ((size_t) s & 15); - if (offset != 0) - { - /* Check partial string. 
*/ - aligned = (const char *) ((size_t) s & -16L); - __m128i value = _mm_load_si128 ((__m128i *) aligned); - - value = __m128i_shift_right (value, offset); - - int length = _mm_cmpistri (mask, value, 0x2); - /* No need to check ZFlag since ZFlag is always 1. */ - int cflag = _mm_cmpistrc (mask, value, 0x2); - if (cflag) - RETURN ((char *) (s + length), length); - /* Find where the NULL terminator is. */ - int index = _mm_cmpistri (value, value, 0x3a); - if (index < 16 - offset) - RETURN (NULL, index); - aligned += 16; - } - else - aligned = s; - - while (1) - { - __m128i value = _mm_load_si128 ((__m128i *) aligned); - int index = _mm_cmpistri (mask, value, 0x2); - int cflag = _mm_cmpistrc (mask, value, 0x2); - int zflag = _mm_cmpistrz (mask, value, 0x2); - if (cflag) - RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); - if (zflag) - RETURN (NULL, - /* Find where the NULL terminator is. */ - (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s)); - aligned += 16; - } -} diff --git a/sysdeps/x86_64/multiarch/strcspn_sse42.S b/sysdeps/x86_64/multiarch/strcspn_sse42.S new file mode 100644 index 0000000000..3e4e659937 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcspn_sse42.S @@ -0,0 +1,3 @@ +#define AS_STRCSPN +#define __strpbrk_sse42 __strcspn_sse42 +#include "strpbrk_sse42.S" diff --git a/sysdeps/x86_64/multiarch/strpbrk-c.c b/sysdeps/x86_64/multiarch/strpbrk-c.c deleted file mode 100644 index bbf5c49d89..0000000000 --- a/sysdeps/x86_64/multiarch/strpbrk-c.c +++ /dev/null @@ -1,8 +0,0 @@ -/* Don't define multiple versions for strpbrk in static library since we - need strpbrk before the initialization happened. 
*/
-#ifdef SHARED
-# define USE_AS_STRPBRK
-# define STRCSPN_SSE2 __strpbrk_sse2
-# define STRCSPN_SSE42 __strpbrk_sse42
-# include "strcspn-c.c"
-#endif
diff --git a/sysdeps/x86_64/multiarch/strpbrk_sse42.S b/sysdeps/x86_64/multiarch/strpbrk_sse42.S
new file mode 100644
index 0000000000..512ac19d68
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strpbrk_sse42.S
@@ -0,0 +1,204 @@
+/* strpbrk, strcspn and strspn (S, A) using the SSE4.2 pcmpistri instruction.
+   strcspn_sse42.S and strspn_sse42.S include this file with AS_* defined.
+   Copyright (C) 1994-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* pcmpistri mode: $2 finds the first byte in A, $18 the first not in A.  */
+#ifdef AS_STRSPN
+# define AS_STRCSPN		/* strspn returns a length as well.  */
+# define MATCH_ALL $18
+#else
+# define MATCH_ALL $2
+#endif
+
+	.section .text.sse4.2,"ax",@progbits
+ENTRY(__strpbrk_sse42)
+	movq	%rdi, %rax		/* %rdi = S, %rsi = A.  */
+	andl	$4095, %eax
+	cmp	$4032, %eax		/* Would 64 bytes of S cross a page?  */
+	ja	L(cross_page)
+	movq	%rsi, %rax
+	andl	$4095, %eax
+	cmp	$4080, %eax		/* Would 16 bytes of A cross a page?  */
+	ja	L(cross_page)
+	movdqu	(%rsi), %xmm4
+	movdqu	(%rdi), %xmm1
+	movdqu	16(%rdi), %xmm5
+	movdqu	32(%rdi), %xmm6
+	movdqu	48(%rdi), %xmm7
+
+L(back_from_crosspage):	/* %xmm4 = A; %xmm1, %xmm5-%xmm7 = 64 bytes of S.  */
+	pxor	%xmm2, %xmm2
+
+	pcmpeqb	%xmm4, %xmm2
+	pmovmskb %xmm2, %eax		/* Bit mask of the NUL bytes in %xmm4.  */
+	testl	%eax, %eax
+	je	L(call)			/* A does not end within 16 bytes.  */
+	pcmpistri MATCH_ALL, %xmm1, %xmm4
+	jc	L(rx0)			/* CF: hit, its index is in %ecx.  */
+	je	L(ret0)			/* ZF: S ends in this chunk.  */
+	pcmpistri MATCH_ALL, %xmm5, %xmm4
+	jc	L(rx16)
+	je	L(ret16)
+	pcmpistri MATCH_ALL, %xmm6, %xmm4
+	jc	L(rx32)
+	je	L(ret32)
+	pcmpistri MATCH_ALL, %xmm7, %xmm4
+	jc	L(rx48)
+	je	L(ret48)
+
+	movq	%rdi, %rax		/* Go on with aligned 16-byte chunks.  */
+	andq	$-16, %rax
+	addq	$16, %rax
+	.p2align 4,,10
+	.p2align 3
+L(loop):
+	pcmpistri MATCH_ALL, (%rax), %xmm4
+	lea	16(%rax), %rax		/* lea leaves the flags untouched.  */
+	jc	L(rx_loop)
+	jne	L(loop)
+#ifdef AS_STRCSPN
+	movdqa	-16(%rax), %xmm1	/* Chunk that holds the end of S.  */
+	pcmpistri $58, %xmm1, %xmm1	/* %ecx = index of its NUL byte.  */
+	lea	-16(%rcx, %rax), %rax
+	sub	%rdi, %rax		/* Return the length of S.  */
+#else
+	xor	%eax, %eax		/* strpbrk: nothing found, return NULL.  */
+#endif
+	ret
+L(rx_loop):
+	lea	-16(%rcx, %rax), %rax	/* Address of the hit.  */
+#ifdef AS_STRCSPN
+	sub	%rdi, %rax		/* Turn it into an offset from S.  */
+#endif
+	ret
+	.p2align 4,,10
+	.p2align 3
+#ifndef AS_STRCSPN
+L(ret0):
+L(ret16):
+L(ret32):
+L(ret48):
+	xorl	%eax, %eax		/* strpbrk: S ended without a hit.  */
+	ret
+#endif
+L(call):				/* A is 16 bytes or longer.  */
+#ifdef AS_STRCSPN
+# ifdef AS_STRSPN
+	jmp	__strspn_sse2
+# else
+	jmp	__strcspn_sse2
+# endif
+#else
+	jmp	__strpbrk_sse2
+#endif
+	.p2align 4,,10
+	.p2align 3
+#ifdef AS_STRCSPN
+L(ret0):
+	pcmpistri $58, %xmm1, %xmm1	/* %ecx = index of the NUL byte.  */
+L(rx0):
+	lea	0(%rcx), %rax
+#else
+L(rx0):
+	leaq	(%rdi,%rcx), %rax
+#endif
+	ret
+#ifdef AS_STRCSPN
+L(ret16):
+	pcmpistri $58, %xmm5, %xmm5
+L(rx16):
+	lea	16(%rcx), %rax
+#else
+L(rx16):
+	leaq	16(%rdi,%rcx), %rax
+#endif
+	ret
+#ifdef AS_STRCSPN
+L(ret32):
+	pcmpistri $58, %xmm6, %xmm6
+L(rx32):
+	lea	32(%rcx), %rax
+#else
+L(rx32):
+	leaq	32(%rdi,%rcx), %rax
+#endif
+	ret
+#ifdef AS_STRCSPN
+L(ret48):
+	pcmpistri $58, %xmm7, %xmm7
+L(rx48):
+	lea	48(%rcx), %rax
+#else
+L(rx48):
+	leaq	48(%rdi,%rcx), %rax
+#endif
+	ret
+
+/* S or A is close to a page end: work on copies placed below %rsp.  */
+	.p2align 4,,10
+	.p2align 3
+L(cross_page):
+	movzbl	(%rdi), %ecx
+	xorl	%eax, %eax		/* %eax counts the bytes copied.  */
+	leaq	-80(%rsp), %r8		/* Scratch buffer in the red zone.  */
+	testb	%cl, %cl
+	je	L(sloop_end)
+	xorl	%edx, %edx
+	.p2align 4,,10
+	.p2align 3
+L(sloop):				/* Copy S up to its NUL or 64 bytes.  */
+	movb	%cl, (%r8,%rdx)
+	movzbl	1(%rdi,%rdx), %ecx
+	addl	$1, %eax
+	testb	%cl, %cl
+	je	L(sloop_end)
+	addq	$1, %rdx
+	cmpl	$64, %eax
+	jne	L(sloop)
+L(sloop_end):
+	movzbl	(%rsi), %ecx		/* First byte of A.  */
+	cltq
+	movb	$0, -80(%rsp,%rax)	/* Terminate the copy of S.  */
+	movdqu	(%r8), %xmm1
+	movdqu	16(%r8), %xmm5
+	movdqu	32(%r8), %xmm6
+	movdqu	48(%r8), %xmm7
+
+	xorl	%eax, %eax		/* S is in registers; reuse the buffer.  */
+	testb	%cl, %cl
+	je	L(aloop_end)
+	xorl	%edx, %edx
+	.p2align 4,,10
+	.p2align 3
+L(aloop):				/* Copy A up to its NUL or 16 bytes.  */
+	movb	%cl, (%r8,%rdx)
+	movzbl	1(%rsi,%rdx), %ecx
+	addl	$1, %eax
+	testb	%cl, %cl
+	je	L(aloop_end)
+	addq	$1, %rdx
+	cmpl	$16, %eax
+	jne	L(aloop)
+L(aloop_end):
+	cltq
+	movb	$0, -80(%rsp,%rax)	/* Harmless when 16 bytes were copied.  */
+	movdqu	(%r8), %xmm4
+	jmp	L(back_from_crosspage)
+END(__strpbrk_sse42)
diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
deleted file mode 100644
index 6b0c80aa43..0000000000
--- a/sysdeps/x86_64/multiarch/strspn-c.c
+++ /dev/null
@@ -1,145 +0,0 @@
-/* strspn with SSE4.2 intrinsics
-   Copyright (C) 2009-2015 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
- - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include -#include -#include "varshift.h" - -/* We use 0x12: - _SIDD_SBYTE_OPS - | _SIDD_CMP_EQUAL_ANY - | _SIDD_NEGATIVE_POLARITY - | _SIDD_LEAST_SIGNIFICANT - on pcmpistri to compare xmm/mem128 - - 0 1 2 3 4 5 6 7 8 9 A B C D E F - X X X X X X X X X X X X X X X X - - against xmm - - 0 1 2 3 4 5 6 7 8 9 A B C D E F - A A A A A A A A A A A A A A A A - - to find out if the first 16byte data element has any non-A byte and - the offset of the first byte. There are 2 cases: - - 1. The first 16byte data element has the non-A byte, including - EOS, at the offset X. - 2. The first 16byte data element is valid and doesn't have the non-A - byte. - - Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: - - case ECX CFlag ZFlag SFlag - 1 X 1 0/1 0 - 2 16 0 0 0 - - We exit from the loop for case 1. */ - -extern size_t __strspn_sse2 (const char *, const char *); - - -size_t -__attribute__ ((section (".text.sse4.2"))) -__strspn_sse42 (const char *s, const char *a) -{ - if (*a == 0) - return 0; - - const char *aligned; - __m128i mask; - int offset = (int) ((size_t) a & 15); - if (offset != 0) - { - /* Load masks. */ - aligned = (const char *) ((size_t) a & -16L); - __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); - - mask = __m128i_shift_right (mask0, offset); - - /* Find where the NULL terminator is. */ - int length = _mm_cmpistri (mask, mask, 0x3a); - if (length == 16 - offset) - { - /* There is no NULL terminator. */ - __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); - int index = _mm_cmpistri (mask1, mask1, 0x3a); - length += index; - - /* Don't use SSE4.2 if the length of A > 16. */ - if (length > 16) - return __strspn_sse2 (s, a); - - if (index != 0) - { - /* Combine mask0 and mask1. We could play games with - palignr, but frankly this data should be in L1 now - so do the merge via an unaligned load. 
*/ - mask = _mm_loadu_si128 ((__m128i *) a); - } - } - } - else - { - /* A is aligned. */ - mask = _mm_load_si128 ((__m128i *) a); - - /* Find where the NULL terminator is. */ - int length = _mm_cmpistri (mask, mask, 0x3a); - if (length == 16) - { - /* There is no NULL terminator. Don't use SSE4.2 if the length - of A > 16. */ - if (a[16] != 0) - return __strspn_sse2 (s, a); - } - } - - offset = (int) ((size_t) s & 15); - if (offset != 0) - { - /* Check partial string. */ - aligned = (const char *) ((size_t) s & -16L); - __m128i value = _mm_load_si128 ((__m128i *) aligned); - - value = __m128i_shift_right (value, offset); - - int length = _mm_cmpistri (mask, value, 0x12); - /* No need to check CFlag since it is always 1. */ - if (length < 16 - offset) - return length; - /* Find where the NULL terminator is. */ - int index = _mm_cmpistri (value, value, 0x3a); - if (index < 16 - offset) - return length; - aligned += 16; - } - else - aligned = s; - - while (1) - { - __m128i value = _mm_load_si128 ((__m128i *) aligned); - int index = _mm_cmpistri (mask, value, 0x12); - int cflag = _mm_cmpistrc (mask, value, 0x12); - if (cflag) - return (size_t) (aligned + index - s); - aligned += 16; - } -} diff --git a/sysdeps/x86_64/multiarch/strspn_sse42.S b/sysdeps/x86_64/multiarch/strspn_sse42.S new file mode 100644 index 0000000000..d460167773 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strspn_sse42.S @@ -0,0 +1,3 @@ +#define AS_STRSPN +#define __strpbrk_sse42 __strspn_sse42 +#include "strpbrk_sse42.S" diff --git a/sysdeps/x86_64/multiarch/varshift.c b/sysdeps/x86_64/multiarch/varshift.c deleted file mode 100644 index 0007ef79e5..0000000000 --- a/sysdeps/x86_64/multiarch/varshift.c +++ /dev/null @@ -1,25 +0,0 @@ -/* Helper for variable shifts of SSE registers. - Copyright (C) 2010-2015 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include "varshift.h" - -const int8_t ___m128i_shift_right[31] attribute_hidden = - { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 - }; diff --git a/sysdeps/x86_64/multiarch/varshift.h b/sysdeps/x86_64/multiarch/varshift.h deleted file mode 100644 index 30ace3d914..0000000000 --- a/sysdeps/x86_64/multiarch/varshift.h +++ /dev/null @@ -1,30 +0,0 @@ -/* Helper for variable shifts of SSE registers. - Copyright (C) 2010-2015 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . 
*/ - -#include -#include - -extern const int8_t ___m128i_shift_right[31] attribute_hidden; - -static __inline__ __m128i -__m128i_shift_right (__m128i value, unsigned long int offset) -{ - return _mm_shuffle_epi8 (value, - _mm_loadu_si128 ((__m128i *) (___m128i_shift_right - + offset))); -} -- cgit v1.2.1