diff options
author | H.J. Lu <hjl.tools@gmail.com> | 2016-04-06 10:19:16 -0700 |
---|---|---|
committer | H.J. Lu <hjl.tools@gmail.com> | 2016-04-06 10:19:16 -0700 |
commit | a7d1c51482d15ab6c07e2ee0ae5e007067b18bfb (patch) | |
tree | 66c861f9ff408054d67a5d36c43e172e72bbf5a0 | |
parent | 4af1bb06c59d24f35bf8dc55897838d926c05892 (diff) | |
download | glibc-a7d1c51482d15ab6c07e2ee0ae5e007067b18bfb.tar.gz |
X86-64: Prepare memmove-vec-unaligned-erms.S
Prepare memmove-vec-unaligned-erms.S so that the SSE2 version can be used
as the default memcpy, mempcpy and memmove.
* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
(MEMCPY_SYMBOL): New.
(MEMPCPY_SYMBOL): Likewise.
(MEMMOVE_CHK_SYMBOL): Likewise.
Replace MEMMOVE_SYMBOL with MEMMOVE_CHK_SYMBOL on __mempcpy_chk
symbols. Replace MEMMOVE_SYMBOL with MEMPCPY_SYMBOL on
__mempcpy symbols. Provide alias for __memcpy_chk in libc.a.
Provide alias for memcpy in libc.a and ld.so.
-rw-r--r-- | ChangeLog | 11 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 138 |
2 files changed, 95 insertions, 54 deletions
@@ -1,5 +1,16 @@ 2016-04-06 H.J. Lu <hongjiu.lu@intel.com> + * sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S + (MEMCPY_SYMBOL): New. + (MEMPCPY_SYMBOL): Likewise. + (MEMMOVE_CHK_SYMBOL): Likewise. + Replace MEMMOVE_SYMBOL with MEMMOVE_CHK_SYMBOL on __mempcpy_chk + symbols. Replace MEMMOVE_SYMBOL with MEMPCPY_SYMBOL on + __mempcpy symbols. Provide alias for __memcpy_chk in libc.a. + Provide alias for memcpy in libc.a and ld.so. + +2016-04-06 H.J. Lu <hongjiu.lu@intel.com> + * sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S (MEMSET_CHK_SYMBOL): New. Define if not defined. (__bzero): Check VEC_SIZE == 16 instead of USE_MULTIARCH. diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S index 66779a3bec..8a60d0ff02 100644 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S @@ -32,18 +32,27 @@ 8 * VEC_SIZE at a time. 8. Otherwise, forward copy 8 * VEC_SIZE at a time. */ -#if IS_IN (libc) +#include <sysdep.h> -# include <sysdep.h> -# include "asm-syntax.h" +#ifndef MEMCPY_SYMBOL +# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) +#endif -# ifndef VZEROUPPER -# if VEC_SIZE > 16 -# define VZEROUPPER vzeroupper -# else -# define VZEROUPPER -# endif +#ifndef MEMPCPY_SYMBOL +# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) +#endif + +#ifndef MEMMOVE_CHK_SYMBOL +# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) +#endif + +#ifndef VZEROUPPER +# if VEC_SIZE > 16 +# define VZEROUPPER vzeroupper +# else +# define VZEROUPPER # endif +#endif /* Threshold to use Enhanced REP MOVSB. Since there is overhead to set up REP MOVSB operation, REP MOVSB isn't faster on short data. The @@ -52,32 +61,36 @@ on processors with Enhanced REP MOVSB. Since larger register size can move more data with a single load and store, the threshold is higher with larger register size. 
*/ -# ifndef REP_MOVSB_THRESHOLD -# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16)) -# endif +#ifndef REP_MOVSB_THRESHOLD +# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16)) +#endif -# ifndef SECTION -# error SECTION is not defined! -# endif - .section SECTION(.text),"ax",@progbits +#ifndef SECTION +# error SECTION is not defined! +#endif -# ifdef SHARED -ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2)) + .section SECTION(.text),"ax",@progbits +#if defined SHARED && IS_IN (libc) +ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_2)) cmpq %rdx, %rcx jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2)) +END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_2)) +#endif -ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_2)) +#if VEC_SIZE == 16 || defined SHARED +ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned_2)) movq %rdi, %rax addq %rdx, %rax jmp L(start) -END (MEMMOVE_SYMBOL (__mempcpy, unaligned_2)) +END (MEMPCPY_SYMBOL (__mempcpy, unaligned_2)) +#endif -ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2)) +#if defined SHARED && IS_IN (libc) +ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2)) cmpq %rdx, %rcx jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2)) -# endif +END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2)) +#endif ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_2)) movq %rdi, %rax @@ -86,24 +99,29 @@ L(start): jb L(less_vec) cmpq $(VEC_SIZE * 2), %rdx ja L(more_2x_vec) +#if !defined USE_MULTIARCH || !IS_IN (libc) +L(last_2x_vec): +#endif /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ VMOVU (%rsi), %VEC(0) VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) VMOVU %VEC(0), (%rdi) VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) VZEROUPPER +#if !defined USE_MULTIARCH || !IS_IN (libc) +L(nop): +#endif ret +#if defined USE_MULTIARCH && IS_IN (libc) END (MEMMOVE_SYMBOL (__memmove, unaligned_2)) -# if VEC_SIZE == 16 +# if VEC_SIZE == 16 && defined SHARED /* Only used to measure performance of REP MOVSB. 
*/ -# ifdef SHARED ENTRY (__mempcpy_erms) movq %rdi, %rax addq %rdx, %rax jmp L(start_movsb) END (__mempcpy_erms) -# endif ENTRY (__memmove_erms) movq %rdi, %rax @@ -132,11 +150,10 @@ strong_alias (__memmove_erms, __memcpy_erms) # endif # ifdef SHARED -ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms)) +ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) cmpq %rdx, %rcx jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms)) -# endif +END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) movq %rdi, %rax @@ -144,11 +161,10 @@ ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) jmp L(start_erms) END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) -# ifdef SHARED -ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms)) +ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) cmpq %rdx, %rcx jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms)) +END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) # endif ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) @@ -192,6 +208,7 @@ L(movsb_more_2x_vec): /* Force 32-bit displacement to avoid long nop between instructions. */ ja.d32 L(movsb) +#endif .p2align 4 L(more_2x_vec): /* More than 2 * VEC. */ @@ -227,13 +244,19 @@ L(copy_forward): VMOVU %VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx) VMOVU %VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx) cmpq $(VEC_SIZE * 8), %rdx -# if VEC_SIZE == 16 +#if VEC_SIZE == 16 +# if defined USE_MULTIARCH && IS_IN (libc) jbe L(return) # else + /* Use 32-bit displacement to avoid long nop between + instructions. */ + jbe.d32 L(return) +# endif +#else /* Use 8-bit displacement to avoid long nop between instructions. 
*/ jbe L(return_disp8) -# endif +#endif leaq (VEC_SIZE * 4)(%rdi), %rcx addq %rdi, %rdx andq $-(VEC_SIZE * 4), %rdx @@ -263,22 +286,25 @@ L(loop): addq $(VEC_SIZE * 4), %rcx cmpq %rcx, %rdx jne L(loop) +#if !defined USE_MULTIARCH || !IS_IN (libc) +L(return): +#endif L(return_disp8): VZEROUPPER ret L(less_vec): /* Less than 1 VEC. */ -# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 -# error Unsupported VEC_SIZE! -# endif -# if VEC_SIZE > 32 +#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 +# error Unsupported VEC_SIZE! +#endif +#if VEC_SIZE > 32 cmpb $32, %dl jae L(between_32_63) -# endif -# if VEC_SIZE > 16 +#endif +#if VEC_SIZE > 16 cmpb $16, %dl jae L(between_16_31) -# endif +#endif cmpb $8, %dl jae L(between_8_15) cmpb $4, %dl @@ -290,7 +316,7 @@ L(less_vec): movb %cl, (%rdi) 1: ret -# if VEC_SIZE > 32 +#if VEC_SIZE > 32 L(between_32_63): /* From 32 to 63. No branch when size == 32. */ vmovdqu (%rsi), %ymm0 @@ -299,8 +325,8 @@ L(between_32_63): vmovdqu %ymm1, -32(%rdi,%rdx) VZEROUPPER ret -# endif -# if VEC_SIZE > 16 +#endif +#if VEC_SIZE > 16 /* From 16 to 31. No branch when size == 16. */ L(between_16_31): vmovdqu (%rsi), %xmm0 @@ -308,7 +334,7 @@ L(between_16_31): vmovdqu %xmm0, (%rdi) vmovdqu %xmm1, -16(%rdi,%rdx) ret -# endif +#endif L(between_8_15): /* From 8 to 15. No branch when size == 8. */ movq -8(%rsi,%rdx), %rcx @@ -331,10 +357,10 @@ L(between_2_3): movw %si, (%rdi) ret -# if VEC_SIZE > 16 +#if VEC_SIZE > 16 /* Align to 16 bytes to avoid long nop between instructions. */ .p2align 4 -# endif +#endif L(more_2x_vec_overlap): /* More than 2 * VEC and there is overlap bewteen destination and source. 
*/ @@ -454,15 +480,19 @@ L(loop_8x_vec_backward): jmp L(between_4x_vec_and_8x_vec) END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) -# ifdef SHARED +#ifdef SHARED +# if IS_IN (libc) +# ifdef USE_MULTIARCH strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms), MEMMOVE_SYMBOL (__memcpy, unaligned_erms)) strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms), MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms)) -strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_2), - MEMMOVE_SYMBOL (__memcpy, unaligned_2)) -strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2), - MEMMOVE_SYMBOL (__memcpy_chk, unaligned_2)) +# endif +strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2), + MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned_2)) # endif - +#endif +#if VEC_SIZE == 16 || defined SHARED +strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_2), + MEMCPY_SYMBOL (__memcpy, unaligned_2)) #endif |