From 9360ab1ce97239666600b9a84a71eb858a8792c9 Mon Sep 17 00:00:00 2001
From: "H.J. Lu"
Date: Sun, 21 May 2017 09:22:44 -0700
Subject: x86-64: Add wmemset optimized with SSE2/AVX2/AVX512

The difference between memset and wmemset is byte vs int: wchar_t is
int on x86-64, so wmemset stores 4-byte elements where memset stores
single bytes.  Add stubs to the SSE2/AVX2/AVX512 memset for wmemset
with updated constant and size:

SSE2 wmemset:
	shl    $0x2,%rdx
	movd   %esi,%xmm0
	mov    %rdi,%rax
	pshufd $0x0,%xmm0,%xmm0
	jmp    entry_from_wmemset

SSE2 memset:
	movd   %esi,%xmm0
	mov    %rdi,%rax
	punpcklbw %xmm0,%xmm0
	punpcklwd %xmm0,%xmm0
	pshufd $0x0,%xmm0,%xmm0
entry_from_wmemset:

Since the ERMS version of wmemset requires "rep stosl" instead of
"rep stosb", only the vector store stubs of SSE2/AVX2/AVX512 wmemset
are added.  The SSE2 wmemset is about 3X faster and the AVX2 wmemset
is about 6X faster on Haswell.  A C model of the stub trick follows
the ChangeLog below.

	* include/wchar.h (__wmemset_chk): New.
	* sysdeps/x86_64/memset.S (VDUP_TO_VEC0_AND_SET_RETURN): Renamed
	to MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
	(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
	(WMEMSET_CHK_SYMBOL): Likewise.
	(WMEMSET_SYMBOL): Likewise.
	(__wmemset): Add hidden definition.
	(wmemset): Add weak hidden definition.
	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Add __wmemset_sse2_unaligned,
	__wmemset_avx2_unaligned, __wmemset_avx512_unaligned,
	__wmemset_chk_sse2_unaligned, __wmemset_chk_avx2_unaligned
	and __wmemset_chk_avx512_unaligned.
	* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
	(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
	(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
	(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
	(WMEMSET_SYMBOL): Likewise.
	* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
	(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
	(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
	(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
	(WMEMSET_SYMBOL): Likewise.
	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Updated.
	(WMEMSET_CHK_SYMBOL): New.
	(WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)): Likewise.
	(WMEMSET_SYMBOL (__wmemset, unaligned)): Likewise.
	* sysdeps/x86_64/multiarch/memset.S (WMEMSET_SYMBOL): New.
	(libc_hidden_builtin_def): Also define __GI_wmemset and
	__GI___wmemset.
	(weak_alias): New.
	* sysdeps/x86_64/multiarch/wmemset.S: New file.
	* sysdeps/x86_64/multiarch/wmemset_chk.S: Likewise.
	* sysdeps/x86_64/wmemset.S: Likewise.
	* sysdeps/x86_64/wmemset_chk.S: Likewise.
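As a minimal C sketch of what the stub buys us (the name wmemset_ref
and the plain byte loop are illustrative only, not part of this patch;
the real code reuses the shared vector store loop): the wmemset entry
scales the element count to a byte count (shl $0x2,%rdx) and broadcasts
a 4-byte pattern with pshufd, after which the byte-oriented memset code
can finish the job unchanged.

	#include <stddef.h>
	#include <wchar.h>

	wchar_t *
	wmemset_ref (wchar_t *s, wchar_t c, size_t n)
	{
	  unsigned char *p = (unsigned char *) s;
	  /* The "shl $0x2,%rdx" step: wchar_t is 4 bytes on x86-64.  */
	  size_t nbytes = n * sizeof (wchar_t);

	  /* Replicate the 4-byte pattern of C across the buffer; memset
	     instead replicates a single byte (punpcklbw/punpcklwd)
	     before the same pshufd broadcast.  */
	  for (size_t i = 0; i < nbytes; i++)
	    p[i] = ((unsigned char *) &c)[i % sizeof (wchar_t)];
	  return s;
	}

This also illustrates why the ERMS path cannot be shared: "rep stosb"
replicates a single byte, which matches wmemset only when all four
bytes of the wchar_t pattern happen to be equal, so wmemset would need
"rep stosl" instead.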
---
 include/wchar.h                                    |  3 ++
 sysdeps/x86_64/memset.S                            | 18 ++++++++-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c         | 22 ++++++++++
 .../x86_64/multiarch/memset-avx2-unaligned-erms.S  |  8 +++-
 .../multiarch/memset-avx512-unaligned-erms.S       |  9 ++++-
 .../x86_64/multiarch/memset-vec-unaligned-erms.S   | 24 +++++++++--
 sysdeps/x86_64/multiarch/memset.S                  | 13 ++++--
 sysdeps/x86_64/multiarch/wmemset.S                 | 47 ++++++++++++++++++++++
 sysdeps/x86_64/multiarch/wmemset_chk.S             | 46 +++++++++++++++++++++
 sysdeps/x86_64/wmemset.S                           |  1 +
 sysdeps/x86_64/wmemset_chk.S                       | 33 +++++++++++++++
 11 files changed, 215 insertions(+), 9 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/wmemset.S
 create mode 100644 sysdeps/x86_64/multiarch/wmemset_chk.S
 create mode 100644 sysdeps/x86_64/wmemset.S
 create mode 100644 sysdeps/x86_64/wmemset_chk.S

diff --git a/include/wchar.h b/include/wchar.h
index e2579a176a..a773d56b36 100644
--- a/include/wchar.h
+++ b/include/wchar.h
@@ -157,6 +157,9 @@ extern wchar_t *__wmemmove (wchar_t *__s1, const wchar_t *__s2,
 extern wchar_t *__wcschrnul (const wchar_t *__s, wchar_t __wc)
      __attribute_pure__;
 
+extern wchar_t *__wmemset_chk (wchar_t *__s, wchar_t __c, size_t __n,
+			       size_t __ns) __THROW;
+
 extern int __vfwscanf (__FILE *__restrict __s,
		       const wchar_t *__restrict __format,
		       __gnuc_va_list __arg)
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 69ed509c28..41278787fe 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -26,13 +26,18 @@
 #define VMOVU		movdqu
 #define VMOVA		movdqa
 
-#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
   movq r, %rax; \
   punpcklbw %xmm0, %xmm0; \
   punpcklwd %xmm0, %xmm0; \
   pshufd $0, %xmm0, %xmm0
 
+#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  movd d, %xmm0; \
+  movq r, %rax; \
+  pshufd $0, %xmm0, %xmm0
+
 #define SECTION(p)		p
 
 #ifndef MEMSET_SYMBOL
@@ -40,10 +45,21 @@
 # define MEMSET_SYMBOL(p,s)	memset
 #endif
 
+#ifndef WMEMSET_SYMBOL
+# define WMEMSET_CHK_SYMBOL(p,s) p
+# define WMEMSET_SYMBOL(p,s)	__wmemset
+#endif
+
 #include "multiarch/memset-vec-unaligned-erms.S"
 
 libc_hidden_builtin_def (memset)
 
+#if IS_IN (libc)
+libc_hidden_def (__wmemset)
+weak_alias (__wmemset, wmemset)
+libc_hidden_weak (wmemset)
+#endif
+
 #if defined SHARED && IS_IN (libc) && !defined USE_MULTIARCH
 strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
 .section .gnu.warning.__memset_zero_constant_len_parameter
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 06d9a9d7f7..a91d2f9efb 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -300,6 +300,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
			      __wmemcmp_ssse3)
	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
 
+  /* Support sysdeps/x86_64/multiarch/wmemset.S.  */
+  IFUNC_IMPL (i, name, wmemset,
+	      IFUNC_IMPL_ADD (array, i, wmemset, 1,
+			      __wmemset_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, wmemset,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __wmemset_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, wmemset,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __wmemset_avx512_unaligned))
+
 #ifdef SHARED
   /* Support sysdeps/x86_64/multiarch/memcpy_chk.S.  */
   IFUNC_IMPL (i, name, __memcpy_chk,
@@ -417,6 +428,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
			      __strncmp_ssse3)
	      IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/wmemset_chk.S.  */
+  IFUNC_IMPL (i, name, __wmemset_chk,
+	      IFUNC_IMPL_ADD (array, i, __wmemset_chk, 1,
+			      __wmemset_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __wmemset_chk_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __wmemset_chk_avx512_unaligned))
 #endif
 
   return i;
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 79975e0825..7ab3d89849 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -4,13 +4,19 @@
 # define VMOVU vmovdqu
 # define VMOVA vmovdqa
 
-# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
   movq r, %rax; \
   vpbroadcastb %xmm0, %ymm0
 
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  vmovd d, %xmm0; \
+  movq r, %rax; \
+  vpbroadcastd %xmm0, %ymm0
+
 # define SECTION(p)		p##.avx
 # define MEMSET_SYMBOL(p,s)	p##_avx2_##s
+# define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
 
 # include "memset-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index a5ec349198..0783979ca5 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -4,14 +4,21 @@
 # define VMOVU vmovdqu64
 # define VMOVA vmovdqa64
 
-# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
   movq r, %rax; \
   vpbroadcastb %xmm0, %xmm0; \
   vpbroadcastq %xmm0, %zmm0
 
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  vmovd d, %xmm0; \
+  movq r, %rax; \
+  vpbroadcastd %xmm0, %xmm0; \
+  vpbroadcastq %xmm0, %zmm0
+
 # define SECTION(p)		p##.avx512
 # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
+# define WMEMSET_SYMBOL(p,s)	p##_avx512_##s
 
 # include "memset-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 704eed9a3f..2eb9e3744e 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -30,6 +30,10 @@
 # define MEMSET_CHK_SYMBOL(p,s)	MEMSET_SYMBOL(p, s)
 #endif
 
+#ifndef WMEMSET_CHK_SYMBOL
+# define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
+#endif
+
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER			vzeroupper
@@ -79,6 +83,21 @@ END (__bzero)
 weak_alias (__bzero, bzero)
 #endif
 
+#if IS_IN (libc)
+# if defined SHARED
+ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+# endif
+
+ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
+	shlq	$2, %rdx
+	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	jmp	L(entry_from_bzero)
+END (WMEMSET_SYMBOL (__wmemset, unaligned))
+#endif
+
 #if defined SHARED && IS_IN (libc)
 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
	cmpq	%rdx, %rcx
@@ -87,8 +106,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 #endif
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned))
-L(memset_entry):
-	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
 L(entry_from_bzero):
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
@@ -132,7 +150,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 # endif
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
-	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
index 9d33118cf8..11f27378b0 100644
--- a/sysdeps/x86_64/multiarch/memset.S
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -58,16 +58,23 @@ END(memset)
 
 #if IS_IN (libc)
 # define MEMSET_SYMBOL(p,s)	p##_sse2_##s
+# define WMEMSET_SYMBOL(p,s)	p##_sse2_##s
 
 # ifdef SHARED
-# undef libc_hidden_builtin_def
+#  undef libc_hidden_builtin_def
 /* It doesn't make sense to send libc-internal memset calls through
    a PLT. The speedup we get from using SSE2 instructions is likely
    eaten away by the indirect call in the PLT.  */
-# define libc_hidden_builtin_def(name) \
-  .globl __GI_memset; __GI_memset = __memset_sse2_unaligned
+#  define libc_hidden_builtin_def(name) \
+  .globl __GI_memset; __GI_memset = __memset_sse2_unaligned; \
+  .globl __GI_wmemset; __GI_wmemset = __wmemset_sse2_unaligned; \
+  .globl __GI___wmemset; __GI___wmemset = __wmemset_sse2_unaligned
 # endif
 
+# undef weak_alias
+# define weak_alias(original, alias) \
+  .weak bzero; bzero = __bzero
+
 # undef strong_alias
 # define strong_alias(original, alias)
 #endif
diff --git a/sysdeps/x86_64/multiarch/wmemset.S b/sysdeps/x86_64/multiarch/wmemset.S
new file mode 100644
index 0000000000..3bd7ca2092
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemset.S
@@ -0,0 +1,47 @@
+/* Multiple versions of wmemset
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include <shlib-compat.h>
+#include <init-arch.h>
+
+ENTRY(__wmemset)
+	.type	__wmemset, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	lea	__wmemset_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	lea	__wmemset_avx2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_No_AVX512)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz	1f
+	lea	__wmemset_avx512_unaligned(%rip), %RAX_LP
+1:	ret
+END(__wmemset)
+
+weak_alias (__wmemset, wmemset)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wmemset_chk.S b/sysdeps/x86_64/multiarch/wmemset_chk.S
new file mode 100644
index 0000000000..c76fcb1587
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemset_chk.S
@@ -0,0 +1,46 @@
+/* Multiple versions of wmemset_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <shlib-compat.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# ifdef SHARED
+ENTRY(__wmemset_chk)
+	.type	__wmemset_chk, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	lea	__wmemset_chk_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	lea	__wmemset_chk_avx2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_No_AVX512)
+	jnz	1f
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz	1f
+	lea	__wmemset_chk_avx512_unaligned(%rip), %RAX_LP
+1:	ret
+END(__wmemset_chk)
+# else
+#  include "../wmemset_chk.S"
+# endif
+#endif
diff --git a/sysdeps/x86_64/wmemset.S b/sysdeps/x86_64/wmemset.S
new file mode 100644
index 0000000000..f96d567fd8
--- /dev/null
+++ b/sysdeps/x86_64/wmemset.S
@@ -0,0 +1 @@
+/* Implemented in memset.S.  */
diff --git a/sysdeps/x86_64/wmemset_chk.S b/sysdeps/x86_64/wmemset_chk.S
new file mode 100644
index 0000000000..64c277413f
--- /dev/null
+++ b/sysdeps/x86_64/wmemset_chk.S
@@ -0,0 +1,33 @@
+/* Checking wmemset for x86-64.
+   Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef SHARED
+	/* For libc.so this is defined in wmemset.S.
+	   For libc.a, this is a separate source to avoid
+	   wmemset bringing in __chk_fail and all routines
+	   it calls.  */
+	.text
+ENTRY (__wmemset_chk)
+	cmpq	%rdx, %rcx
+	jb	__chk_fail
+	jmp	wmemset
+END (__wmemset_chk)
+#endif
-- 
cgit v1.2.1