summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorH.J. Lu <hjl.tools@gmail.com>2016-03-27 17:38:17 -0700
committerH.J. Lu <hjl.tools@gmail.com>2016-03-27 17:38:17 -0700
commitbf09fc11484f364096b5d5f0ae22037d2f9c9e4f (patch)
treec47e3500953a664217f334e8f9fd871989cf96ce
parent101aa97f04af9b9f89e0c52fb266188f6ea8a546 (diff)
downloadglibc-hjl/erms/hybrid.tar.gz
Align/displacement to avoid long nop — memmove-vec-unaligned-erms.S [branch: hjl/erms/hybrid]
-rw-r--r--sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S25
1 files changed, 23 insertions, 2 deletions
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 385f2c2211..f2557bcea8 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -178,7 +178,8 @@ L(movsb):
.p2align 4
L(movsb_more_2x_vec):
cmpq $REP_MOVSB_THRESHOLD, %rdx
- /* Force 32-bit displacement to avoid long nop. */
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
ja.d32 L(movsb)
.p2align 4
L(more_2x_vec):
@@ -201,7 +202,9 @@ L(copy_forward):
VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
cmpq $(VEC_SIZE * 4), %rdx
- jbe L(return)
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ jbe.d32 L(return)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(0)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1)
VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(2)
@@ -211,7 +214,13 @@ L(copy_forward):
VMOVU %VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx)
VMOVU %VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx)
cmpq $(VEC_SIZE * 8), %rdx
+# if VEC_SIZE == 16
jbe L(return)
+# else
+ /* Use 8-bit displacement to avoid long nop between
+ instructions. */
+ jbe L(return_disp8)
+# endif
leaq (VEC_SIZE * 4)(%rdi), %rcx
addq %rdi, %rdx
andq $-(VEC_SIZE * 4), %rdx
@@ -220,7 +229,13 @@ L(copy_forward):
subq %rdi, %r11
addq %r11, %rsi
cmpq %rdx, %rcx
+# if VEC_SIZE == 16
je L(return)
+# else
+ /* Use 8-bit displacement to avoid long nop between
+ instructions. */
+ je L(return_disp8)
+# endif
movq %rsi, %r10
subq %rcx, %r10
leaq VEC_SIZE(%r10), %r9
@@ -239,6 +254,7 @@ L(loop):
addq $(VEC_SIZE * 4), %rcx
cmpq %rcx, %rdx
jne L(loop)
+L(return_disp8):
VZEROUPPER
ret
L(less_vec):
@@ -306,6 +322,10 @@ L(between_2_3):
movw %si, (%rdi)
ret
+# if VEC_SIZE > 16
+ /* Align to 16 bytes to avoid long nop between instructions. */
+ .p2align 4
+# endif
L(more_2x_vec_overlap):
/* More than 2 * VEC and there is overlap between destination
and source. */
@@ -389,6 +409,7 @@ L(loop_8x_vec_forward):
jb L(between_0_and_4x_vec)
jmp L(between_4x_vec_and_8x_vec)
+ .p2align 4
L(more_8x_vec_backward):
leaq -VEC_SIZE(%rsi, %rdx), %rcx
leaq -VEC_SIZE(%rdi, %rdx), %r9