diff options
author | H.J. Lu <hjl.tools@gmail.com> | 2017-05-30 12:39:14 -0700 |
---|---|---|
committer | H.J. Lu <hjl.tools@gmail.com> | 2017-05-30 12:39:14 -0700 |
commit | 4f26ef1b67287d1f2c32865f7d79c13abda81915 (patch) | |
tree | bc9cb0c0b5c581cee6aba4f2fc138b48783ab565 | |
parent | 542a34783ce1cfc63929ec50ab1f9f738711b815 (diff) | |
download | glibc-4f26ef1b67287d1f2c32865f7d79c13abda81915.tar.gz |
x86_64: Remove redundant REX bytes from memchr.S
Per the x86-64 specification, writing to a 32-bit destination register
zero-extends the result to 64 bits. There is no need to use 64-bit
registers when only the lower 32 bits of a value are significant.
* sysdeps/x86_64/memchr.S (memchr): Use 32-bit registers for
the lower 32 bits.
-rw-r--r-- | ChangeLog | 5 | ||||
-rw-r--r-- | sysdeps/x86_64/memchr.S | 28 |
2 files changed, 19 insertions, 14 deletions
```diff
diff --git a/ChangeLog b/ChangeLog
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2017-05-30  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/x86_64/memchr.S (memchr): Use 32-bit registers for
+	the lower 32 bits.
+
 2017-05-29  Andreas Schwab  <schwab@linux-m68k.org>
 
 	* sysdeps/m68k/Makefile (ASFLAGS-.o) [$(subdir) = csu &&
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index f82e1c5bf7..d3be012424 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -22,18 +22,18 @@
 
 	.text
 ENTRY(memchr)
-	movd	%rsi, %xmm1
-	mov	%rdi, %rcx
+	movd	%esi, %xmm1
+	mov	%edi, %ecx
 
 	punpcklbw %xmm1, %xmm1
 	test	%rdx, %rdx
 	jz	L(return_null)
 	punpcklbw %xmm1, %xmm1
 
-	and	$63, %rcx
+	and	$63, %ecx
 	pshufd	$0, %xmm1, %xmm1
 
-	cmp	$48, %rcx
+	cmp	$48, %ecx
 	ja	L(crosscache)
 
 	movdqu	(%rdi), %xmm0
@@ -45,7 +45,7 @@ ENTRY(memchr)
 	sub	$16, %rdx
 	jbe	L(return_null)
 	add	$16, %rdi
-	and	$15, %rcx
+	and	$15, %ecx
 	and	$-16, %rdi
 	add	%rcx, %rdx
 	sub	$64, %rdx
@@ -54,7 +54,7 @@ ENTRY(memchr)
 
 	.p2align 4
 L(crosscache):
-	and	$15, %rcx
+	and	$15, %ecx
 	and	$-16, %rdi
 	movdqa	(%rdi), %xmm0
 
@@ -148,7 +148,7 @@ L(loop_prolog):
 
 	mov	%rdi, %rcx
 	and	$-64, %rdi
-	and	$63, %rcx
+	and	$63, %ecx
 	add	%rcx, %rdx
 
 	.p2align 4
@@ -200,7 +200,7 @@ L(align64_loop):
 
 	.p2align 4
 L(exit_loop):
-	add	$32, %rdx
+	add	$32, %edx
 	jle	L(exit_loop_32)
 
 	movdqa	(%rdi), %xmm0
@@ -220,32 +220,32 @@ L(exit_loop):
 	pmovmskb %xmm3, %eax
 	test	%eax, %eax
 	jnz	L(matches32_1)
-	sub	$16, %rdx
+	sub	$16, %edx
 	jle	L(return_null)
 
 	pcmpeqb	48(%rdi), %xmm1
 	pmovmskb %xmm1, %eax
 	test	%eax, %eax
 	jnz	L(matches48_1)
-	xor	%rax, %rax
+	xor	%eax, %eax
 	ret
 
 	.p2align 4
 L(exit_loop_32):
-	add	$32, %rdx
+	add	$32, %edx
 	movdqa	(%rdi), %xmm0
 	pcmpeqb	%xmm1, %xmm0
 	pmovmskb %xmm0, %eax
 	test	%eax, %eax
 	jnz	L(matches_1)
-	sub	$16, %rdx
+	sub	$16, %edx
 	jbe	L(return_null)
 
 	pcmpeqb	16(%rdi), %xmm1
 	pmovmskb %xmm1, %eax
 	test	%eax, %eax
 	jnz	L(matches16_1)
-	xor	%rax, %rax
+	xor	%eax, %eax
 	ret
 
 	.p2align 4
@@ -306,7 +306,7 @@ L(matches48_1):
 
 	.p2align 4
 L(return_null):
-	xor	%rax, %rax
+	xor	%eax, %eax
 	ret
 END(memchr)
 
```