author     H.J. Lu <hjl.tools@gmail.com>   2016-04-24 10:53:25 -0700
committer  H.J. Lu <hjl.tools@gmail.com>   2016-04-24 11:13:34 -0700
commit     9cdb7e72b694274d76fce00a23b87efde2e0d28b (patch)
tree       e18696d97e56020687f67f48435b34e37eb4e998
parent     8dd19b0b3ca334060eec990f0afa502700939ad3 (diff)
download   glibc-hjl/erms/cacheline.tar.gz

Align to cacheline   [branch: hjl/erms/cacheline]

-rw-r--r--   sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S   94
1 file changed, 94 insertions, 0 deletions
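What the patch does: the main copy loops store 4 * VEC_SIZE per iteration with aligned VMOVA instructions, and this change peels one to three extra vectors beforehand so that the forward loop's stores start on a cacheline boundary (the backward path is mirrored with the end pointers). Two geometries are accepted: CACHELINE_SIZE == VEC_SIZE * 4, which needs the three-way dispatch added below, and CACHELINE_SIZE == VEC_SIZE * 2, where the only possible residual misalignment is VEC_SIZE and the single-vector fall-through suffices; anything else trips the #error. Both macros are defined by the per-flavor wrappers rather than in this file. As an assumption for the C sketches that follow:

/* Assumed per-flavor configuration (defined outside this diff);
   the values are illustrative for an SSE2 build with 64-byte lines.  */
#define VEC_SIZE        16              /* one XMM register */
#define CACHELINE_SIZE  (VEC_SIZE * 4)  /* 64-byte cacheline */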
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 0a2bf4108f..c140cb349e 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -376,6 +376,27 @@ L(more_8x_vec):
subq %r8, %rdi
/* Adjust length. */
addq %r8, %rdx
+#if CACHELINE_SIZE != VEC_SIZE
+ movl %edi, %r8d
+ andl $(CACHELINE_SIZE - 1), %r8d
+ je L(loop_4x_vec_forward_pre)
+# if CACHELINE_SIZE == (VEC_SIZE * 4)
+ /* Cacheline misaligned by VEC_SIZE, 2 * VEC_SIZE, or
+ 3 * VEC_SIZE. */
+ cmpl $(VEC_SIZE * 2), %r8d
+ je L(misaligned_by_2x_vec_forward)
+ jb L(misaligned_by_3x_vec_forward)
+# elif CACHELINE_SIZE != (VEC_SIZE * 2)
+# error Unsupported CACHELINE_SIZE!
+# endif
+ /* Destination misaligned by 3 * VEC_SIZE: copy one vector to
+ reach the next cacheline boundary. */
+ VMOVU (%rsi), %VEC(0)
+ addq $VEC_SIZE, %rsi
+ subq $VEC_SIZE, %rdx
+ VMOVA %VEC(0), (%rdi)
+ addq $VEC_SIZE, %rdi
+#endif
+L(loop_4x_vec_forward_pre):
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
/* Check non-temporal store threshold. */
cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
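On entry to this first hunk %rdi has just been aligned to VEC_SIZE, so within its cacheline it can sit at offset 0, VEC_SIZE, 2 * VEC_SIZE, or 3 * VEC_SIZE. A minimal C model of the dispatch, using the assumed geometry above (peel_to_cacheline is a hypothetical name, not a glibc function):

#include <stdint.h>
#include <string.h>

#define VEC_SIZE        16
#define CACHELINE_SIZE  (VEC_SIZE * 4)

/* Copy whole vectors until dst reaches a cacheline boundary and return
   the number of bytes peeled.  dst must already be VEC_SIZE-aligned,
   as the code just before this hunk guarantees for %rdi.  */
static size_t
peel_to_cacheline (unsigned char **dst, const unsigned char **src,
                   size_t *len)
{
  size_t misalign = (uintptr_t) *dst & (CACHELINE_SIZE - 1);
  if (misalign == 0)
    return 0;                   /* je L(loop_4x_vec_forward_pre) */
  /* misalign == VEC_SIZE     -> 3 vectors, L(misaligned_by_3x_vec_forward)
     misalign == 2 * VEC_SIZE -> 2 vectors, L(misaligned_by_2x_vec_forward)
     misalign == 3 * VEC_SIZE -> 1 vector,  the fall-through path.  */
  size_t peel = CACHELINE_SIZE - misalign;
  unsigned char tmp[CACHELINE_SIZE];
  memcpy (tmp, *src, peel);     /* the VMOVU loads into %VEC(0..2) */
  memcpy (*dst, tmp, peel);     /* the VMOVA stores */
  *dst += peel;
  *src += peel;
  *len -= peel;
  return peel;
}

Note that the misaligned_by_Nx labels count the vectors peeled, not the byte offset: L(misaligned_by_3x_vec_forward) is the jb target because a destination misaligned by a single VEC_SIZE needs three more vectors to reach the next boundary.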
@@ -406,6 +427,32 @@ L(loop_4x_vec_forward):
VZEROUPPER
ret
+#if CACHELINE_SIZE == (VEC_SIZE * 4)
+L(misaligned_by_2x_vec_forward):
+ /* Cacheline misaligned by 2 * VEC_SIZE. */
+ VMOVU (%rsi), %VEC(0)
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+ addq $(VEC_SIZE * 2), %rsi
+ subq $(VEC_SIZE * 2), %rdx
+ VMOVA %VEC(0), (%rdi)
+ VMOVA %VEC(1), VEC_SIZE(%rdi)
+ addq $(VEC_SIZE * 2), %rdi
+ jmp L(loop_4x_vec_forward_pre)
+
+L(misaligned_by_3x_vec_forward):
+ /* Destination misaligned by VEC_SIZE: copy three vectors to
+ reach the next cacheline boundary. */
+ VMOVU (%rsi), %VEC(0)
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
+ addq $(VEC_SIZE * 3), %rsi
+ subq $(VEC_SIZE * 3), %rdx
+ VMOVA %VEC(0), (%rdi)
+ VMOVA %VEC(1), VEC_SIZE(%rdi)
+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
+ addq $(VEC_SIZE * 3), %rdi
+ jmp L(loop_4x_vec_forward_pre)
+#endif
+
L(more_8x_vec_backward):
/* Load the first 4 * VEC and last VEC to support overlapping
addresses. */
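The likely payoff of the peeling, given the terse commit subject: once %rdi is cacheline-aligned, every iteration of L(loop_4x_vec_forward) covers exactly one 64-byte line with its four aligned stores instead of straddling two. A simplified model of that steady state, under the same assumptions as above (tail and overlap handling omitted; the real loop finishes with the first VEC and last 4 * VEC loaded up front):

#include <string.h>

#define VEC_SIZE 16   /* same assumption as the earlier sketch */

static void
loop_4x_vec_forward (unsigned char *dst, const unsigned char *src,
                     size_t len)
{
  /* dst is cacheline-aligned here, so each iteration's four VMOVA
     stores fill one whole line.  */
  while (len > 4 * VEC_SIZE)
    {
      unsigned char line[4 * VEC_SIZE];
      memcpy (line, src, sizeof line);  /* four VMOVU loads  */
      memcpy (dst, line, sizeof line);  /* four VMOVA stores */
      src += sizeof line;
      dst += sizeof line;
      len -= sizeof line;
    }
}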
@@ -428,6 +475,27 @@ L(more_8x_vec_backward):
subq %r8, %r9
/* Adjust length. */
subq %r8, %rdx
+#if CACHELINE_SIZE != VEC_SIZE
+ movl %r9d, %r8d
+ andl $(CACHELINE_SIZE - 1), %r8d
+ je L(loop_4x_vec_backward_pre)
+# if CACHELINE_SIZE == (VEC_SIZE * 4)
+ /* Cacheline misaligned by VEC_SIZE, 2 * VEC_SIZE, or
+ 3 * VEC_SIZE. */
+ cmpl $(VEC_SIZE * 2), %r8d
+ je L(misaligned_by_2x_vec_backward)
+ jb L(misaligned_by_3x_vec_backward)
+# elif CACHELINE_SIZE != (VEC_SIZE * 2)
+# error Unsupported CACHELINE_SIZE!
+# endif
+ /* Destination end misaligned by 3 * VEC_SIZE: copy one vector
+ downward before entering the loop. */
+ VMOVU (%rcx), %VEC(0)
+ subq $VEC_SIZE, %rcx
+ subq $VEC_SIZE, %rdx
+ VMOVA %VEC(0), (%r9)
+ subq $VEC_SIZE, %r9
+#endif
+L(loop_4x_vec_backward_pre):
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
/* Check non-temporal store threshold. */
cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
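The backward prologue mirrors the forward one with end pointers: %r9 holds the VEC_SIZE-aligned address of the highest full-vector store and %rcx the matching source address, and both walk downward. A hypothetical C model that follows the dispatch exactly as written (three vectors peeled when the end is misaligned by VEC_SIZE, two for 2 * VEC_SIZE, one for 3 * VEC_SIZE), same assumed geometry as before:

#include <stdint.h>
#include <string.h>

#define VEC_SIZE        16
#define CACHELINE_SIZE  (VEC_SIZE * 4)

/* 'end' plays %r9 and 'src_end' plays %rcx.  Returns bytes peeled.  */
static size_t
peel_backward (unsigned char **end, const unsigned char **src_end,
               size_t *len)
{
  size_t misalign = (uintptr_t) *end & (CACHELINE_SIZE - 1);
  if (misalign == 0)
    return 0;                   /* je L(loop_4x_vec_backward_pre) */
  size_t peel = CACHELINE_SIZE - misalign;   /* 1, 2 or 3 vectors */
  for (size_t i = 0; i < peel; i += VEC_SIZE)
    {
      unsigned char tmp[VEC_SIZE];
      memcpy (tmp, *src_end - i, VEC_SIZE);  /* VMOVU load  */
      memcpy (*end - i, tmp, VEC_SIZE);      /* VMOVA store */
    }
  *end -= peel;
  *src_end -= peel;
  *len -= peel;
  return peel;
}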
@@ -458,6 +526,32 @@ L(loop_4x_vec_backward):
VZEROUPPER
ret
+#if CACHELINE_SIZE == (VEC_SIZE * 4)
+L(misaligned_by_2x_vec_backward):
+ /* Cacheline misaligned by 2 * VEC_SIZE. */
+ VMOVU (%rcx), %VEC(0)
+ VMOVU -VEC_SIZE(%rcx), %VEC(1)
+ subq $(VEC_SIZE * 2), %rcx
+ subq $(VEC_SIZE * 2), %rdx
+ VMOVA %VEC(0), (%r9)
+ VMOVA %VEC(1), -VEC_SIZE(%r9)
+ subq $(VEC_SIZE * 2), %r9
+ jmp L(loop_4x_vec_backward_pre)
+
+L(misaligned_by_3x_vec_backward):
+ /* Destination end misaligned by VEC_SIZE: copy three vectors
+ downward before entering the loop. */
+ VMOVU (%rcx), %VEC(0)
+ VMOVU -VEC_SIZE(%rcx), %VEC(1)
+ VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
+ subq $(VEC_SIZE * 3), %rcx
+ subq $(VEC_SIZE * 3), %rdx
+ VMOVA %VEC(0), (%r9)
+ VMOVA %VEC(1), -VEC_SIZE(%r9)
+ VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
+ subq $(VEC_SIZE * 3), %r9
+ jmp L(loop_4x_vec_backward_pre)
+#endif
+
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
/* Don't use non-temporal store if there is overlap between
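Both L(loop_4x_vec_*_pre) entry points still begin by comparing the remaining length against __x86_shared_non_temporal_threshold: above it, and only when source and destination do not overlap (the condition the comment above is spelling out), the copy takes the L(large_forward)/L(large_backward) paths, whose non-temporal stores bypass the shared cache instead of evicting it. A rough SSE2-intrinsics sketch of that split; the threshold constant and function name are invented for illustration, while glibc derives the real threshold from the detected cache hierarchy:

#include <emmintrin.h>   /* _mm_loadu_si128, _mm_stream_si128, _mm_sfence */
#include <stddef.h>

static const size_t nt_threshold = 3 * 1024 * 1024;  /* assumption */

static void
copy_forward_large (unsigned char *dst, const unsigned char *src,
                    size_t len)
{
  if (len < nt_threshold)
    return;   /* below the threshold: stay in L(loop_4x_vec_forward) */
  /* dst is cacheline-aligned by the prologue, satisfying MOVNTDQ's
     16-byte alignment requirement and streaming out whole lines.  */
  for (; len >= 16; dst += 16, src += 16, len -= 16)
    _mm_stream_si128 ((__m128i *) dst,
                      _mm_loadu_si128 ((const __m128i *) src));
  _mm_sfence ();   /* order the non-temporal stores before returning */
}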