author     H.J. Lu <hjl.tools@gmail.com>    2017-05-20 06:48:04 -0700
committer  H.J. Lu <hjl.tools@gmail.com>    2017-05-21 10:02:58 -0700
commit     2e9bca4211cfb79b86d315d12f6d9d4a41bb2dc1 (patch)
tree       1fabc0d83269e899abac8dcfdf42f560711bbaac /sysdeps
parent     2cdfa9e84886535cf7586bc4449850cdce427c64 (diff)
x86-64: Update strlen.S to support wcslen/wcsnlen
The difference between strlen and wcslen is byte vs int.  We can replace
pminub and pcmpeqb with pminud and pcmpeqd to turn strlen into wcslen.

Tested on Ivy Bridge with benchtests/bench-wcslen.c; the new strlen-based
wcslen is as fast as the old wcslen.

	* sysdeps/x86_64/strlen.S (PMINU): New.
	(PCMPEQ): Likewise.
	(SHIFT_RETURN): Likewise.
	(FIND_ZERO): Replace pcmpeqb with PCMPEQ.
	(strlen): Add SHIFT_RETURN before ret.  Replace pcmpeqb and pminub
	with PCMPEQ and PMINU.
	* sysdeps/x86_64/wcslen.S: Define AS_WCSLEN and strlen.
	Include "strlen.S".
	* sysdeps/x86_64/wcsnlen.S: New file.
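For illustration only, a minimal C sketch (not part of this patch or of
glibc) of the idea described above: one scanning kernel parameterized by
element width serves both strlen and wcslen, and the byte offset it finds
is shifted right by 2 for wcslen, mirroring what the PMINU, PCMPEQ and
SHIFT_RETURN macros do in the assembly below.  The names sketch_strlen,
sketch_wcslen and first_zero_element are made up for this sketch, and it
assumes sizeof (wchar_t) == 4, as on x86-64 glibc.

    /* Minimal sketch, not the glibc implementation.  */
    #include <stddef.h>
    #include <wchar.h>

    /* Return the byte offset of the first all-zero element of width
       elem_size in s.  Stand-in for the vectorized FIND_ZERO loop,
       which does this 64 bytes at a time with PCMPEQ/PMINU and
       pmovmskb.  */
    static size_t
    first_zero_element (const unsigned char *s, size_t elem_size)
    {
      size_t byte_off = 0;
      for (;;)
        {
          size_t i, zeros = 0;
          for (i = 0; i < elem_size; i++)
            zeros += (s[byte_off + i] == 0);
          if (zeros == elem_size)
            return byte_off;
          byte_off += elem_size;
        }
    }

    size_t
    sketch_strlen (const char *s)
    {
      /* 1-byte elements: pcmpeqb/pminub, no shift of the result.  */
      return first_zero_element ((const unsigned char *) s, 1);
    }

    size_t
    sketch_wcslen (const wchar_t *s)
    {
      /* 4-byte elements: pcmpeqd/pminud.  The kernel reports a byte
         offset, so divide by sizeof (wchar_t) == 4 -- the shrq $2 that
         SHIFT_RETURN expands to.  */
      return first_zero_element ((const unsigned char *) s,
                                 sizeof (wchar_t)) >> 2;
    }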
Diffstat (limited to 'sysdeps')
-rw-r--r--   sysdeps/x86_64/strlen.S     61
-rw-r--r--   sysdeps/x86_64/wcslen.S    238
-rw-r--r--   sysdeps/x86_64/wcsnlen.S     7
3 files changed, 50 insertions, 256 deletions
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index 5896e6b9ee..b5ab117c79 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,4 +1,4 @@
-/* SSE2 version of strlen.
+/* SSE2 version of strlen/wcslen.
Copyright (C) 2012-2017 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -18,6 +18,16 @@
#include <sysdep.h>
+#ifdef AS_WCSLEN
+# define PMINU pminud
+# define PCMPEQ pcmpeqd
+# define SHIFT_RETURN shrq $2, %rax
+#else
+# define PMINU pminub
+# define PCMPEQ pcmpeqb
+# define SHIFT_RETURN
+#endif
+
/* Long lived register in strlen(s), strnlen(s, n) are:
%xmm3 - zero
@@ -32,10 +42,10 @@ ENTRY(strlen)
/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
#define FIND_ZERO \
- pcmpeqb (%rax), %xmm0; \
- pcmpeqb 16(%rax), %xmm1; \
- pcmpeqb 32(%rax), %xmm2; \
- pcmpeqb 48(%rax), %xmm3; \
+ PCMPEQ (%rax), %xmm0; \
+ PCMPEQ 16(%rax), %xmm1; \
+ PCMPEQ 32(%rax), %xmm2; \
+ PCMPEQ 48(%rax), %xmm3; \
pmovmskb %xmm0, %esi; \
pmovmskb %xmm1, %edx; \
pmovmskb %xmm2, %r8d; \
@@ -54,6 +64,9 @@ ENTRY(strlen)
xor %rax, %rax
ret
L(n_nonzero):
+# ifdef AS_WCSLEN
+ shlq $2, %rsi
+# endif
/* Initialize long lived registers. */
@@ -96,6 +109,7 @@ L(n_nonzero):
test %rdx, %rdx; \
je L(lab); \
bsfq %rdx, %rax; \
+ SHIFT_RETURN; \
ret
#ifdef AS_STRNLEN
@@ -104,19 +118,20 @@ L(n_nonzero):
#else
/* Test first 16 bytes unaligned. */
movdqu (%rax), %xmm4
- pcmpeqb %xmm0, %xmm4
+ PCMPEQ %xmm0, %xmm4
pmovmskb %xmm4, %edx
test %edx, %edx
je L(next48_bytes)
bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
+ SHIFT_RETURN
ret
L(next48_bytes):
/* Same as FIND_ZERO except we do not check first 16 bytes. */
andq $-16, %rax
- pcmpeqb 16(%rax), %xmm1
- pcmpeqb 32(%rax), %xmm2
- pcmpeqb 48(%rax), %xmm3
+ PCMPEQ 16(%rax), %xmm1
+ PCMPEQ 32(%rax), %xmm2
+ PCMPEQ 48(%rax), %xmm3
pmovmskb %xmm1, %edx
pmovmskb %xmm2, %r8d
pmovmskb %xmm3, %ecx
@@ -145,6 +160,7 @@ L(strnlen_ret):
test %rdx, %rdx
je L(loop_init)
bsfq %rdx, %rax
+ SHIFT_RETURN
ret
#endif
.p2align 4
@@ -161,10 +177,10 @@ L(loop):
je L(exit_end)
movdqa (%rax), %xmm0
- pminub 16(%rax), %xmm0
- pminub 32(%rax), %xmm0
- pminub 48(%rax), %xmm0
- pcmpeqb %xmm3, %xmm0
+ PMINU 16(%rax), %xmm0
+ PMINU 32(%rax), %xmm0
+ PMINU 48(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
pmovmskb %xmm0, %edx
testl %edx, %edx
jne L(exit)
@@ -182,6 +198,7 @@ L(first):
bsfq %rdx, %rdx
addq %rdx, %rax
subq %rdi, %rax
+ SHIFT_RETURN
ret
.p2align 4
@@ -192,6 +209,7 @@ L(exit):
bsfq %rdx, %rdx
addq %rdx, %rax
subq %rdi, %rax
+ SHIFT_RETURN
ret
#else
@@ -201,10 +219,10 @@ L(exit):
L(loop):
movdqa 64(%rax), %xmm0
- pminub 80(%rax), %xmm0
- pminub 96(%rax), %xmm0
- pminub 112(%rax), %xmm0
- pcmpeqb %xmm3, %xmm0
+ PMINU 80(%rax), %xmm0
+ PMINU 96(%rax), %xmm0
+ PMINU 112(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
pmovmskb %xmm0, %edx
testl %edx, %edx
jne L(exit64)
@@ -212,10 +230,10 @@ L(loop):
subq $-128, %rax
movdqa (%rax), %xmm0
- pminub 16(%rax), %xmm0
- pminub 32(%rax), %xmm0
- pminub 48(%rax), %xmm0
- pcmpeqb %xmm3, %xmm0
+ PMINU 16(%rax), %xmm0
+ PMINU 32(%rax), %xmm0
+ PMINU 48(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
pmovmskb %xmm0, %edx
testl %edx, %edx
jne L(exit0)
@@ -231,6 +249,7 @@ L(exit0):
bsfq %rdx, %rdx
addq %rdx, %rax
subq %rdi, %rax
+ SHIFT_RETURN
ret
#endif
diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
index c6081a482f..88ecdb2082 100644
--- a/sysdeps/x86_64/wcslen.S
+++ b/sysdeps/x86_64/wcslen.S
@@ -1,238 +1,6 @@
-/* Optimized wcslen for x86-64 with SSE2.
- Copyright (C) 2011-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
+#define AS_WCSLEN
+#define strlen __wcslen
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
- .text
-ENTRY (__wcslen)
- cmpl $0, (%rdi)
- jz L(exit_tail0)
- cmpl $0, 4(%rdi)
- jz L(exit_tail1)
- cmpl $0, 8(%rdi)
- jz L(exit_tail2)
- cmpl $0, 12(%rdi)
- jz L(exit_tail3)
- cmpl $0, 16(%rdi)
- jz L(exit_tail4)
- cmpl $0, 20(%rdi)
- jz L(exit_tail5)
- cmpl $0, 24(%rdi)
- jz L(exit_tail6)
- cmpl $0, 28(%rdi)
- jz L(exit_tail7)
-
- pxor %xmm0, %xmm0
-
- lea 32(%rdi), %rax
- lea 16(%rdi), %rcx
- and $-16, %rax
-
- pcmpeqd (%rax), %xmm0
- pmovmskb %xmm0, %edx
- pxor %xmm1, %xmm1
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm1
- pmovmskb %xmm1, %edx
- pxor %xmm2, %xmm2
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm2
- pmovmskb %xmm2, %edx
- pxor %xmm3, %xmm3
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- and $-0x40, %rax
-
- .p2align 4
-L(aligned_64_loop):
- movaps (%rax), %xmm0
- movaps 16(%rax), %xmm1
- movaps 32(%rax), %xmm2
- movaps 48(%rax), %xmm6
-
- pminub %xmm1, %xmm0
- pminub %xmm6, %xmm2
- pminub %xmm0, %xmm2
- pcmpeqd %xmm3, %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 64(%rax), %rax
- jz L(aligned_64_loop)
-
- pcmpeqd -64(%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 48(%rcx), %rcx
- jnz L(exit)
-
- pcmpeqd %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea -16(%rcx), %rcx
- jnz L(exit)
-
- pcmpeqd -32(%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea -16(%rcx), %rcx
- jnz L(exit)
-
- pcmpeqd %xmm6, %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea -16(%rcx), %rcx
- jnz L(exit)
-
- jmp L(aligned_64_loop)
-
- .p2align 4
-L(exit):
- sub %rcx, %rax
- shr $2, %rax
- test %dl, %dl
- jz L(exit_high)
-
- mov %dl, %cl
- and $15, %cl
- jz L(exit_1)
- ret
-
- .p2align 4
-L(exit_high):
- mov %dh, %ch
- and $15, %ch
- jz L(exit_3)
- add $2, %rax
- ret
-
- .p2align 4
-L(exit_1):
- add $1, %rax
- ret
-
- .p2align 4
-L(exit_3):
- add $3, %rax
- ret
-
- .p2align 4
-L(exit_tail0):
- xor %rax, %rax
- ret
-
- .p2align 4
-L(exit_tail1):
- mov $1, %rax
- ret
-
- .p2align 4
-L(exit_tail2):
- mov $2, %rax
- ret
-
- .p2align 4
-L(exit_tail3):
- mov $3, %rax
- ret
-
- .p2align 4
-L(exit_tail4):
- mov $4, %rax
- ret
-
- .p2align 4
-L(exit_tail5):
- mov $5, %rax
- ret
-
- .p2align 4
-L(exit_tail6):
- mov $6, %rax
- ret
-
- .p2align 4
-L(exit_tail7):
- mov $7, %rax
- ret
-
-END (__wcslen)
+#include "strlen.S"
weak_alias(__wcslen, wcslen)
diff --git a/sysdeps/x86_64/wcsnlen.S b/sysdeps/x86_64/wcsnlen.S
new file mode 100644
index 0000000000..968bb693b4
--- /dev/null
+++ b/sysdeps/x86_64/wcsnlen.S
@@ -0,0 +1,7 @@
+#define AS_WCSLEN
+#define AS_STRNLEN
+#define strlen __wcsnlen
+
+#include "strlen.S"
+
+weak_alias(__wcsnlen, wcsnlen)