diff options
author | H.J. Lu <hjl.tools@gmail.com> | 2017-05-20 06:48:04 -0700 |
---|---|---|
committer | H.J. Lu <hjl.tools@gmail.com> | 2017-05-21 10:02:58 -0700 |
commit | 2e9bca4211cfb79b86d315d12f6d9d4a41bb2dc1 (patch) | |
tree | 1fabc0d83269e899abac8dcfdf42f560711bbaac /sysdeps | |
parent | 2cdfa9e84886535cf7586bc4449850cdce427c64 (diff) | |
download | glibc-2e9bca4211cfb79b86d315d12f6d9d4a41bb2dc1.tar.gz |
x86-64: Update strlen.S to support wcslen/wcsnlen
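The only difference between strlen and wcslen is the element size: byte
vs. 4-byte int. We can replace pminub and pcmpeqb with pminud and pcmpeqd
to turn strlen into wcslen, shifting the resulting byte offset right by 2
(SHIFT_RETURN) so that a count of wide characters is returned; for
wcsnlen, the maximum length in %rsi is likewise shifted left by 2 to
convert it to bytes. Note that pminud is an SSE4.1 instruction, whereas
pminub, pcmpeqb and pcmpeqd are available in SSE2. Tested on Ivy Bridge
with benchtests/bench-wcslen.c: the new strlen-based wcslen is as fast as
the old wcslen.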
* sysdeps/x86_64/strlen.S (PMINU): New.
(PCMPEQ): Likewise.
(SHIFT_RETURN): Likewise.
(FIND_ZERO): Replace pcmpeqb with PCMPEQ.
(strlen): Add SHIFT_RETURN before ret. Replace pcmpeqb and
pminub with PCMPEQ and PMINU.
* sysdeps/x86_64/wcslen.S: Define AS_WCSLEN and strlen.
Include "strlen.S".
* sysdeps/x86_64/wcsnlen.S: New file.
Diffstat (limited to 'sysdeps')
-rw-r--r-- | sysdeps/x86_64/strlen.S | 61 | ||||
-rw-r--r-- | sysdeps/x86_64/wcslen.S | 238 | ||||
-rw-r--r-- | sysdeps/x86_64/wcsnlen.S | 7 |
3 files changed, 50 insertions, 256 deletions
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S index 5896e6b9ee..b5ab117c79 100644 --- a/sysdeps/x86_64/strlen.S +++ b/sysdeps/x86_64/strlen.S @@ -1,4 +1,4 @@ -/* SSE2 version of strlen. +/* SSE2 version of strlen/wcslen. Copyright (C) 2012-2017 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -18,6 +18,16 @@ #include <sysdep.h> +#ifdef AS_WCSLEN +# define PMINU pminud +# define PCMPEQ pcmpeqd +# define SHIFT_RETURN shrq $2, %rax +#else +# define PMINU pminub +# define PCMPEQ pcmpeqb +# define SHIFT_RETURN +#endif + /* Long lived register in strlen(s), strnlen(s, n) are: %xmm3 - zero @@ -32,10 +42,10 @@ ENTRY(strlen) /* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ #define FIND_ZERO \ - pcmpeqb (%rax), %xmm0; \ - pcmpeqb 16(%rax), %xmm1; \ - pcmpeqb 32(%rax), %xmm2; \ - pcmpeqb 48(%rax), %xmm3; \ + PCMPEQ (%rax), %xmm0; \ + PCMPEQ 16(%rax), %xmm1; \ + PCMPEQ 32(%rax), %xmm2; \ + PCMPEQ 48(%rax), %xmm3; \ pmovmskb %xmm0, %esi; \ pmovmskb %xmm1, %edx; \ pmovmskb %xmm2, %r8d; \ @@ -54,6 +64,9 @@ ENTRY(strlen) xor %rax, %rax ret L(n_nonzero): +# ifdef AS_WCSLEN + shlq $2, %rsi +# endif /* Initialize long lived registers. */ @@ -96,6 +109,7 @@ L(n_nonzero): test %rdx, %rdx; \ je L(lab); \ bsfq %rdx, %rax; \ + SHIFT_RETURN; \ ret #ifdef AS_STRNLEN @@ -104,19 +118,20 @@ L(n_nonzero): #else /* Test first 16 bytes unaligned. */ movdqu (%rax), %xmm4 - pcmpeqb %xmm0, %xmm4 + PCMPEQ %xmm0, %xmm4 pmovmskb %xmm4, %edx test %edx, %edx je L(next48_bytes) bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ + SHIFT_RETURN ret L(next48_bytes): /* Same as FIND_ZERO except we do not check first 16 bytes. 
*/ andq $-16, %rax - pcmpeqb 16(%rax), %xmm1 - pcmpeqb 32(%rax), %xmm2 - pcmpeqb 48(%rax), %xmm3 + PCMPEQ 16(%rax), %xmm1 + PCMPEQ 32(%rax), %xmm2 + PCMPEQ 48(%rax), %xmm3 pmovmskb %xmm1, %edx pmovmskb %xmm2, %r8d pmovmskb %xmm3, %ecx @@ -145,6 +160,7 @@ L(strnlen_ret): test %rdx, %rdx je L(loop_init) bsfq %rdx, %rax + SHIFT_RETURN ret #endif .p2align 4 @@ -161,10 +177,10 @@ L(loop): je L(exit_end) movdqa (%rax), %xmm0 - pminub 16(%rax), %xmm0 - pminub 32(%rax), %xmm0 - pminub 48(%rax), %xmm0 - pcmpeqb %xmm3, %xmm0 + PMINU 16(%rax), %xmm0 + PMINU 32(%rax), %xmm0 + PMINU 48(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 pmovmskb %xmm0, %edx testl %edx, %edx jne L(exit) @@ -182,6 +198,7 @@ L(first): bsfq %rdx, %rdx addq %rdx, %rax subq %rdi, %rax + SHIFT_RETURN ret .p2align 4 @@ -192,6 +209,7 @@ L(exit): bsfq %rdx, %rdx addq %rdx, %rax subq %rdi, %rax + SHIFT_RETURN ret #else @@ -201,10 +219,10 @@ L(exit): L(loop): movdqa 64(%rax), %xmm0 - pminub 80(%rax), %xmm0 - pminub 96(%rax), %xmm0 - pminub 112(%rax), %xmm0 - pcmpeqb %xmm3, %xmm0 + PMINU 80(%rax), %xmm0 + PMINU 96(%rax), %xmm0 + PMINU 112(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 pmovmskb %xmm0, %edx testl %edx, %edx jne L(exit64) @@ -212,10 +230,10 @@ L(loop): subq $-128, %rax movdqa (%rax), %xmm0 - pminub 16(%rax), %xmm0 - pminub 32(%rax), %xmm0 - pminub 48(%rax), %xmm0 - pcmpeqb %xmm3, %xmm0 + PMINU 16(%rax), %xmm0 + PMINU 32(%rax), %xmm0 + PMINU 48(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 pmovmskb %xmm0, %edx testl %edx, %edx jne L(exit0) @@ -231,6 +249,7 @@ L(exit0): bsfq %rdx, %rdx addq %rdx, %rax subq %rdi, %rax + SHIFT_RETURN ret #endif diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S index c6081a482f..88ecdb2082 100644 --- a/sysdeps/x86_64/wcslen.S +++ b/sysdeps/x86_64/wcslen.S @@ -1,238 +1,6 @@ -/* Optimized wcslen for x86-64 with SSE2. - Copyright (C) 2011-2017 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. 
+#define AS_WCSLEN +#define strlen __wcslen - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - - .text -ENTRY (__wcslen) - cmpl $0, (%rdi) - jz L(exit_tail0) - cmpl $0, 4(%rdi) - jz L(exit_tail1) - cmpl $0, 8(%rdi) - jz L(exit_tail2) - cmpl $0, 12(%rdi) - jz L(exit_tail3) - cmpl $0, 16(%rdi) - jz L(exit_tail4) - cmpl $0, 20(%rdi) - jz L(exit_tail5) - cmpl $0, 24(%rdi) - jz L(exit_tail6) - cmpl $0, 28(%rdi) - jz L(exit_tail7) - - pxor %xmm0, %xmm0 - - lea 32(%rdi), %rax - lea 16(%rdi), %rcx - and $-16, %rax - - pcmpeqd (%rax), %xmm0 - pmovmskb %xmm0, %edx - pxor %xmm1, %xmm1 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqd (%rax), %xmm1 - pmovmskb %xmm1, %edx - pxor %xmm2, %xmm2 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqd (%rax), %xmm2 - pmovmskb %xmm2, %edx - pxor %xmm3, %xmm3 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqd (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqd (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqd (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqd (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqd (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, 
%edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqd (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqd (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqd (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqd (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - and $-0x40, %rax - - .p2align 4 -L(aligned_64_loop): - movaps (%rax), %xmm0 - movaps 16(%rax), %xmm1 - movaps 32(%rax), %xmm2 - movaps 48(%rax), %xmm6 - - pminub %xmm1, %xmm0 - pminub %xmm6, %xmm2 - pminub %xmm0, %xmm2 - pcmpeqd %xmm3, %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 64(%rax), %rax - jz L(aligned_64_loop) - - pcmpeqd -64(%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 48(%rcx), %rcx - jnz L(exit) - - pcmpeqd %xmm1, %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea -16(%rcx), %rcx - jnz L(exit) - - pcmpeqd -32(%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea -16(%rcx), %rcx - jnz L(exit) - - pcmpeqd %xmm6, %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea -16(%rcx), %rcx - jnz L(exit) - - jmp L(aligned_64_loop) - - .p2align 4 -L(exit): - sub %rcx, %rax - shr $2, %rax - test %dl, %dl - jz L(exit_high) - - mov %dl, %cl - and $15, %cl - jz L(exit_1) - ret - - .p2align 4 -L(exit_high): - mov %dh, %ch - and $15, %ch - jz L(exit_3) - add $2, %rax - ret - - .p2align 4 -L(exit_1): - add $1, %rax - ret - - .p2align 4 -L(exit_3): - add $3, %rax - ret - - .p2align 4 -L(exit_tail0): - xor %rax, %rax - ret - - .p2align 4 -L(exit_tail1): - mov $1, %rax - ret - - .p2align 4 -L(exit_tail2): - mov $2, %rax - ret - - .p2align 4 -L(exit_tail3): - mov $3, %rax - ret - - .p2align 4 -L(exit_tail4): - mov $4, %rax - ret - - .p2align 4 -L(exit_tail5): - mov $5, %rax - ret - - .p2align 4 -L(exit_tail6): - mov $6, %rax - ret - - .p2align 4 -L(exit_tail7): - mov $7, %rax - ret - -END (__wcslen) 
+#include "strlen.S" weak_alias(__wcslen, wcslen) diff --git a/sysdeps/x86_64/wcsnlen.S b/sysdeps/x86_64/wcsnlen.S new file mode 100644 index 0000000000..968bb693b4 --- /dev/null +++ b/sysdeps/x86_64/wcsnlen.S @@ -0,0 +1,7 @@ +#define AS_WCSLEN +#define AS_STRNLEN +#define strlen __wcsnlen + +#include "strlen.S" + +weak_alias(__wcsnlen, wcsnlen) |