author		H.J. Lu <hjl.tools@gmail.com>	2017-05-21 09:22:44 -0700
committer	H.J. Lu <hjl.tools@gmail.com>	2017-05-21 13:29:49 -0700
commit		9360ab1ce97239666600b9a84a71eb858a8792c9 (patch)
tree		37270befc941588b574bb3504eb0de93a16eab7f
parent		2e9bca4211cfb79b86d315d12f6d9d4a41bb2dc1 (diff)
x86-64: Add wmemset optimized with SSE2/AVX2/AVX512
The difference between memset and wmemset is byte vs int.  Add stubs
to SSE2/AVX2/AVX512 memset for wmemset with updated constant and size:

SSE2 wmemset:
	shl    $0x2,%rdx
	movd   %esi,%xmm0
	mov    %rdi,%rax
	pshufd $0x0,%xmm0,%xmm0
	jmp    entry_from_wmemset

SSE2 memset:
	movd   %esi,%xmm0
	mov    %rdi,%rax
	punpcklbw %xmm0,%xmm0
	punpcklwd %xmm0,%xmm0
	pshufd $0x0,%xmm0,%xmm0
entry_from_wmemset:

Since the ERMS versions of wmemset require "rep stosl" instead of
"rep stosb", only the vector store stubs of SSE2/AVX2/AVX512 wmemset
are added.  The SSE2 wmemset is about 3X faster and the AVX2 wmemset
is about 6X faster on Haswell.

	* include/wchar.h (__wmemset_chk): New.
	* sysdeps/x86_64/memset.S (VDUP_TO_VEC0_AND_SET_RETURN): Renamed
	to MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
	(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
	(WMEMSET_CHK_SYMBOL): Likewise.
	(WMEMSET_SYMBOL): Likewise.
	(__wmemset): Add hidden definition.
	(wmemset): Add weak hidden definition.
	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Add __wmemset_sse2_unaligned,
	__wmemset_avx2_unaligned, __wmemset_avx512_unaligned,
	__wmemset_chk_sse2_unaligned, __wmemset_chk_avx2_unaligned
	and __wmemset_chk_avx512_unaligned.
	* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
	(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
	(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
	(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
	(WMEMSET_SYMBOL): Likewise.
	* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
	(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
	(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
	(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
	(WMEMSET_SYMBOL): Likewise.
	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Updated.
	(WMEMSET_CHK_SYMBOL): New.
	(WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)): Likewise.
	(WMEMSET_SYMBOL (__wmemset, unaligned)): Likewise.
	* sysdeps/x86_64/multiarch/memset.S (WMEMSET_SYMBOL): New.
	(libc_hidden_builtin_def): Also define __GI_wmemset and
	__GI___wmemset.
	(weak_alias): New.
	* sysdeps/x86_64/multiarch/wmemset.S: New file.
	* sysdeps/x86_64/multiarch/wmemset_chk.S: Likewise.
	* sysdeps/x86_64/wmemset.S: Likewise.
	* sysdeps/x86_64/wmemset_chk.S: Likewise.
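In C terms, the stubs quoted in the log above cover the two semantic
differences: the fill value is broadcast as a 32-bit wchar_t rather than
replicated as a byte (pshufd alone instead of punpcklbw/punpcklwd/pshufd),
and the count is in wide characters, so it is first scaled to bytes (the
shl $0x2,%rdx).  A minimal reference sketch of the semantics follows;
wmemset_ref is a hypothetical name for illustration, not the glibc
implementation:

	#include <stddef.h>
	#include <wchar.h>

	/* Reference wmemset semantics: n counts wchar_t elements,
	   i.e. n * sizeof (wchar_t) == 4 * n bytes on x86-64.  */
	wchar_t *
	wmemset_ref (wchar_t *s, wchar_t c, size_t n)
	{
	  for (size_t i = 0; i < n; i++)
	    s[i] = c;		/* 32-bit store; memset stores bytes.  */
	  return s;		/* Like memset, returns the destination.  */
	}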
-rw-r--r--	include/wchar.h	3
-rw-r--r--	sysdeps/x86_64/memset.S	18
-rw-r--r--	sysdeps/x86_64/multiarch/ifunc-impl-list.c	22
-rw-r--r--	sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S	8
-rw-r--r--	sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S	9
-rw-r--r--	sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S	24
-rw-r--r--	sysdeps/x86_64/multiarch/memset.S	13
-rw-r--r--	sysdeps/x86_64/multiarch/wmemset.S	47
-rw-r--r--	sysdeps/x86_64/multiarch/wmemset_chk.S	46
-rw-r--r--	sysdeps/x86_64/wmemset.S	1
-rw-r--r--	sysdeps/x86_64/wmemset_chk.S	33
11 files changed, 215 insertions, 9 deletions
diff --git a/include/wchar.h b/include/wchar.h
index e2579a176a..a773d56b36 100644
--- a/include/wchar.h
+++ b/include/wchar.h
@@ -157,6 +157,9 @@ extern wchar_t *__wmemmove (wchar_t *__s1, const wchar_t *__s2,
extern wchar_t *__wcschrnul (const wchar_t *__s, wchar_t __wc)
__attribute_pure__;
+extern wchar_t *__wmemset_chk (wchar_t *__s, wchar_t __c, size_t __n,
+ size_t __ns) __THROW;
+
extern int __vfwscanf (__FILE *__restrict __s,
const wchar_t *__restrict __format,
__gnuc_va_list __arg)
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 69ed509c28..41278787fe 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -26,13 +26,18 @@
#define VMOVU movdqu
#define VMOVA movdqa
-#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
movd d, %xmm0; \
movq r, %rax; \
punpcklbw %xmm0, %xmm0; \
punpcklwd %xmm0, %xmm0; \
pshufd $0, %xmm0, %xmm0
+#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ movd d, %xmm0; \
+ movq r, %rax; \
+ pshufd $0, %xmm0, %xmm0
+
#define SECTION(p) p
#ifndef MEMSET_SYMBOL
@@ -40,10 +45,21 @@
# define MEMSET_SYMBOL(p,s) memset
#endif
+#ifndef WMEMSET_SYMBOL
+# define WMEMSET_CHK_SYMBOL(p,s) p
+# define WMEMSET_SYMBOL(p,s) __wmemset
+#endif
+
#include "multiarch/memset-vec-unaligned-erms.S"
libc_hidden_builtin_def (memset)
+#if IS_IN (libc)
+libc_hidden_def (__wmemset)
+weak_alias (__wmemset, wmemset)
+libc_hidden_weak (wmemset)
+#endif
+
#if defined SHARED && IS_IN (libc) && !defined USE_MULTIARCH
strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
.section .gnu.warning.__memset_zero_constant_len_parameter
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 06d9a9d7f7..a91d2f9efb 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -300,6 +300,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__wmemcmp_ssse3)
IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
+ /* Support sysdeps/x86_64/multiarch/wmemset.S. */
+ IFUNC_IMPL (i, name, wmemset,
+ IFUNC_IMPL_ADD (array, i, wmemset, 1,
+ __wmemset_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, wmemset,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wmemset_avx2_unaligned)
+ IFUNC_IMPL_ADD (array, i, wmemset,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __wmemset_avx512_unaligned))
+
#ifdef SHARED
/* Support sysdeps/x86_64/multiarch/memcpy_chk.S. */
IFUNC_IMPL (i, name, __memcpy_chk,
@@ -417,6 +428,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
__strncmp_ssse3)
IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
+
+ /* Support sysdeps/x86_64/multiarch/wmemset_chk.S. */
+ IFUNC_IMPL (i, name, __wmemset_chk,
+ IFUNC_IMPL_ADD (array, i, __wmemset_chk, 1,
+ __wmemset_chk_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wmemset_chk_avx2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __wmemset_chk_avx512_unaligned))
#endif
return i;
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 79975e0825..7ab3d89849 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -4,13 +4,19 @@
# define VMOVU vmovdqu
# define VMOVA vmovdqa
-# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
vmovd d, %xmm0; \
movq r, %rax; \
vpbroadcastb %xmm0, %ymm0
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ vmovd d, %xmm0; \
+ movq r, %rax; \
+ vpbroadcastd %xmm0, %ymm0
+
# define SECTION(p) p##.avx
# define MEMSET_SYMBOL(p,s) p##_avx2_##s
+# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
# include "memset-vec-unaligned-erms.S"
#endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index a5ec349198..0783979ca5 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -4,14 +4,21 @@
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
-# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
vmovd d, %xmm0; \
movq r, %rax; \
vpbroadcastb %xmm0, %xmm0; \
vpbroadcastq %xmm0, %zmm0
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ vmovd d, %xmm0; \
+ movq r, %rax; \
+ vpbroadcastd %xmm0, %xmm0; \
+ vpbroadcastq %xmm0, %zmm0
+
# define SECTION(p) p##.avx512
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
+# define WMEMSET_SYMBOL(p,s) p##_avx512_##s
# include "memset-vec-unaligned-erms.S"
#endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 704eed9a3f..2eb9e3744e 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -30,6 +30,10 @@
# define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s)
#endif
+#ifndef WMEMSET_CHK_SYMBOL
+# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
+#endif
+
#ifndef VZEROUPPER
# if VEC_SIZE > 16
# define VZEROUPPER vzeroupper
@@ -79,6 +83,21 @@ END (__bzero)
weak_alias (__bzero, bzero)
#endif
+#if IS_IN (libc)
+# if defined SHARED
+ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+# endif
+
+ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
+ shlq $2, %rdx
+ WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ jmp L(entry_from_bzero)
+END (WMEMSET_SYMBOL (__wmemset, unaligned))
+#endif
+
#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
cmpq %rdx, %rcx
@@ -87,8 +106,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif
ENTRY (MEMSET_SYMBOL (__memset, unaligned))
-L(memset_entry):
- VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
L(entry_from_bzero):
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
@@ -132,7 +150,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif
ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
- VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
cmpq $(VEC_SIZE * 2), %rdx
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
index 9d33118cf8..11f27378b0 100644
--- a/sysdeps/x86_64/multiarch/memset.S
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -58,16 +58,23 @@ END(memset)
#if IS_IN (libc)
# define MEMSET_SYMBOL(p,s) p##_sse2_##s
+# define WMEMSET_SYMBOL(p,s) p##_sse2_##s
# ifdef SHARED
-# undef libc_hidden_builtin_def
+# undef libc_hidden_builtin_def
/* It doesn't make sense to send libc-internal memset calls through a PLT.
The speedup we get from using SSE2 instructions is likely eaten away
by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI_memset; __GI_memset = __memset_sse2_unaligned
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memset; __GI_memset = __memset_sse2_unaligned; \
+ .globl __GI_wmemset; __GI_wmemset = __wmemset_sse2_unaligned; \
+ .globl __GI___wmemset; __GI___wmemset = __wmemset_sse2_unaligned
# endif
+# undef weak_alias
+# define weak_alias(original, alias) \
+ .weak bzero; bzero = __bzero
+
# undef strong_alias
# define strong_alias(original, alias)
#endif
diff --git a/sysdeps/x86_64/multiarch/wmemset.S b/sysdeps/x86_64/multiarch/wmemset.S
new file mode 100644
index 0000000000..3bd7ca2092
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemset.S
@@ -0,0 +1,47 @@
+/* Multiple versions of wmemset
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include <shlib-compat.h>
+#include <init-arch.h>
+
+ENTRY(__wmemset)
+ .type __wmemset, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ lea __wmemset_sse2_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ lea __wmemset_avx2_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Prefer_No_AVX512)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX512F_Usable)
+ jz 1f
+ lea __wmemset_avx512_unaligned(%rip), %RAX_LP
+1: ret
+END(__wmemset)
+
+weak_alias (__wmemset, wmemset)
+#endif
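Read as C-like pseudocode, the IFUNC resolver added above selects an
implementation as follows.  This is only an illustrative sketch of the
branch logic; has_arch_feature () is a stand-in for the HAS_ARCH_FEATURE
macro and is not a real glibc function:

	/* Sketch of __wmemset's resolver: fall back to SSE2 unless the
	   CPU prefers avoiding VZEROUPPER, then step up to AVX2 and
	   AVX-512 as the feature checks allow.  */
	static void *
	select_wmemset (void)
	{
	  void *impl = __wmemset_sse2_unaligned;
	  if (!has_arch_feature (Prefer_No_VZEROUPPER)
	      && has_arch_feature (AVX_Fast_Unaligned_Load)
	      && has_arch_feature (AVX2_Usable))
	    {
	      impl = __wmemset_avx2_unaligned;
	      if (!has_arch_feature (Prefer_No_AVX512)
		  && has_arch_feature (AVX512F_Usable))
		impl = __wmemset_avx512_unaligned;
	    }
	  return impl;
	}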
diff --git a/sysdeps/x86_64/multiarch/wmemset_chk.S b/sysdeps/x86_64/multiarch/wmemset_chk.S
new file mode 100644
index 0000000000..c76fcb1587
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemset_chk.S
@@ -0,0 +1,46 @@
+/* Multiple versions of wmemset_chk
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <shlib-compat.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# ifdef SHARED
+ENTRY(__wmemset_chk)
+ .type __wmemset_chk, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ lea __wmemset_chk_sse2_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ lea __wmemset_chk_avx2_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Prefer_No_AVX512)
+ jnz 1f
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX512F_Usable)
+ jz 1f
+ lea __wmemset_chk_avx512_unaligned(%rip), %RAX_LP
+1: ret
+END(__wmemset_chk)
+# else
+# include "../wmemset_chk.S"
+# endif
+#endif
diff --git a/sysdeps/x86_64/wmemset.S b/sysdeps/x86_64/wmemset.S
new file mode 100644
index 0000000000..f96d567fd8
--- /dev/null
+++ b/sysdeps/x86_64/wmemset.S
@@ -0,0 +1 @@
+/* Implemented in memset.S. */
diff --git a/sysdeps/x86_64/wmemset_chk.S b/sysdeps/x86_64/wmemset_chk.S
new file mode 100644
index 0000000000..64c277413f
--- /dev/null
+++ b/sysdeps/x86_64/wmemset_chk.S
@@ -0,0 +1,33 @@
+/* Checking wmemset for x86-64.
+ Copyright (C) 2004-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef SHARED
+ /* For libc.so this is defined in wmemset.S.
+ For libc.a, this is a separate source to avoid
+ wmemset bringing in __chk_fail and all routines
+ it calls. */
+ .text
+ENTRY (__wmemset_chk)
+ cmpq %rdx, %rcx
+ jb __chk_fail
+ jmp wmemset
+END (__wmemset_chk)
+#endif
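The check performed above (cmpq %rdx, %rcx; jb __chk_fail) corresponds
to the following C, matching the __wmemset_chk prototype added to
include/wchar.h.  This is a sketch of the semantics, not glibc's generic
implementation verbatim:

	/* n (%rdx) is the element count to write; ns (%rcx) is the
	   destination size in wchar_t elements, as computed by the
	   compiler for _FORTIFY_SOURCE callers.  */
	wchar_t *
	__wmemset_chk (wchar_t *s, wchar_t c, size_t n, size_t ns)
	{
	  if (ns < n)
	    __chk_fail ();	/* Buffer overflow detected; aborts.  */
	  return wmemset (s, c, n);
	}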