summaryrefslogtreecommitdiff
path: root/sysdeps/x86_64/multiarch/init-arch.h
diff options
context:
space:
mode:
authorH.J. Lu <hjl.tools@gmail.com>2015-01-30 06:50:20 -0800
committerH.J. Lu <hjl.tools@gmail.com>2015-01-30 12:26:33 -0800
commit328fc20e5e334a642f0152d9662474789381a897 (patch)
tree6efc6b6b150d1373c36c8bdaf4a606989c152a9a /sysdeps/x86_64/multiarch/init-arch.h
parentf80af76648ed97a76745fad6caa3315a79cb1c7c (diff)
downloadglibc-hjl/release/2.20/master.tar.gz
Use AVX unaligned memcpy only if AVX2 is availablehjl/release/2.20/master
memcpy with unaligned 256-bit AVX register loads/stores are slow on older processorsl like Sandy Bridge. This patch adds bit_AVX_Fast_Unaligned_Load and sets it only when AVX2 is available. [BZ #17801] * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features): Set the bit_AVX_Fast_Unaligned_Load bit for AVX2. * sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load): New. (index_AVX_Fast_Unaligned_Load): Likewise. (HAS_AVX_FAST_UNALIGNED_LOAD): Likewise. * sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit. * sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise. * sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise. * sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise. * sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD. * sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise. [cherry picked from commit 56d25c11b64a97255a115901d136d753c86de24e]
Diffstat (limited to 'sysdeps/x86_64/multiarch/init-arch.h')
-rw-r--r--sysdeps/x86_64/multiarch/init-arch.h4
1 files changed, 4 insertions, 0 deletions
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index ef0abbd226..2fc7c7ceec 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -25,6 +25,7 @@
#define bit_FMA4_Usable (1 << 8)
#define bit_Slow_SSE4_2 (1 << 9)
#define bit_AVX2_Usable (1 << 10)
+#define bit_AVX_Fast_Unaligned_Load (1 << 11)
/* CPUID Feature flags. */
@@ -74,6 +75,7 @@
# define index_FMA4_Usable FEATURE_INDEX_1*FEATURE_SIZE
# define index_Slow_SSE4_2 FEATURE_INDEX_1*FEATURE_SIZE
# define index_AVX2_Usable FEATURE_INDEX_1*FEATURE_SIZE
+# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1*FEATURE_SIZE
#else /* __ASSEMBLER__ */
@@ -169,6 +171,7 @@ extern const struct cpu_features *__get_cpu_features (void)
# define index_FMA4_Usable FEATURE_INDEX_1
# define index_Slow_SSE4_2 FEATURE_INDEX_1
# define index_AVX2_Usable FEATURE_INDEX_1
+# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1
# define HAS_ARCH_FEATURE(name) \
((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)
@@ -181,5 +184,6 @@ extern const struct cpu_features *__get_cpu_features (void)
# define HAS_AVX2 HAS_ARCH_FEATURE (AVX2_Usable)
# define HAS_FMA HAS_ARCH_FEATURE (FMA_Usable)
# define HAS_FMA4 HAS_ARCH_FEATURE (FMA4_Usable)
+# define HAS_AVX_FAST_UNALIGNED_LOAD HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
#endif /* __ASSEMBLER__ */