summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--sysdeps/x86_64/memcpy.S692
-rw-r--r--sysdeps/x86_64/multiarch/Makefile2
-rw-r--r--sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S175
-rw-r--r--sysdeps/x86_64/multiarch/memcpy-sse2.S569
-rw-r--r--sysdeps/x86_64/multiarch/memcpy.S37
-rw-r--r--sysdeps/x86_64/multiarch/memcpy_chk.S17
-rw-r--r--sysdeps/x86_64/multiarch/mempcpy-sse2.S4
-rw-r--r--sysdeps/x86_64/multiarch/mempcpy.S39
-rw-r--r--sysdeps/x86_64/multiarch/mempcpy_chk.S17
9 files changed, 789 insertions, 763 deletions
diff --git a/sysdeps/x86_64/memcpy.S b/sysdeps/x86_64/memcpy.S
index eea8c2a5af..00c17a4287 100644
--- a/sysdeps/x86_64/memcpy.S
+++ b/sysdeps/x86_64/memcpy.S
@@ -1,9 +1,5 @@
-/*
- Optimized memcpy for x86-64.
-
- Copyright (C) 2007-2015 Free Software Foundation, Inc.
- Contributed by Evandro Menezes <evandro.menezes@amd.com>, 2007.
-
+/* Optimized memcpy for x86-64
+ Copyright (C) 2013-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -18,33 +14,20 @@
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>.
-*/
+ <http://www.gnu.org/licenses/>. */
#include <sysdep.h>
-#include "asm-syntax.h"
-/* Stack slots in the red-zone. */
+#include "asm-syntax.h"
-#ifdef USE_AS_MEMPCPY
-# define RETVAL (0)
-#else
-# define RETVAL (-8)
-# if defined SHARED && !defined USE_MULTIARCH && IS_IN (libc)
-# define memcpy __memcpy
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(name) \
+#if defined SHARED && IS_IN (libc)
+# if !defined USE_AS_MEMPCPY && !defined USE_MULTIARCH
+# define memcpy __memcpy
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name) \
.globl __GI_memcpy; __GI_memcpy = __memcpy
-# endif
-#endif
-#define SAVE0 (RETVAL - 8)
-#define SAVE1 (SAVE0 - 8)
-#define SAVE2 (SAVE1 - 8)
-#define SAVE3 (SAVE2 - 8)
-
- .text
+# endif
-#if defined PIC && IS_IN (libc)
ENTRY_CHK (__memcpy_chk)
cmpq %rdx, %rcx
@@ -53,525 +36,160 @@ ENTRY_CHK (__memcpy_chk)
END_CHK (__memcpy_chk)
#endif
-ENTRY(memcpy) /* (void *, const void*, size_t) */
-
-/* Handle tiny blocks. */
-
-L(1try): /* up to 32B */
- cmpq $32, %rdx
-#ifndef USE_AS_MEMPCPY
- movq %rdi, %rax /* save return value */
-#endif
- jae L(1after)
-
-L(1): /* 1-byte once */
- testb $1, %dl
- jz L(1a)
-
- movzbl (%rsi), %ecx
- movb %cl, (%rdi)
-
- incq %rsi
- incq %rdi
-
- .p2align 4,, 4
-
-L(1a): /* 2-byte once */
- testb $2, %dl
- jz L(1b)
-
- movzwl (%rsi), %ecx
- movw %cx, (%rdi)
-
- addq $2, %rsi
- addq $2, %rdi
-
- .p2align 4,, 4
-
-L(1b): /* 4-byte once */
- testb $4, %dl
- jz L(1c)
-
- movl (%rsi), %ecx
- movl %ecx, (%rdi)
-
- addq $4, %rsi
- addq $4, %rdi
-
- .p2align 4,, 4
-
-L(1c): /* 8-byte once */
- testb $8, %dl
- jz L(1d)
-
- movq (%rsi), %rcx
- movq %rcx, (%rdi)
-
- addq $8, %rsi
- addq $8, %rdi
-
- .p2align 4,, 4
-
-L(1d): /* 16-byte loop */
- andl $0xf0, %edx
- jz L(exit)
-
- .p2align 4
-
-L(1loop):
- movq (%rsi), %rcx
- movq 8(%rsi), %r8
- movq %rcx, (%rdi)
- movq %r8, 8(%rdi)
-
- subl $16, %edx
-
- leaq 16(%rsi), %rsi
- leaq 16(%rdi), %rdi
-
- jnz L(1loop)
-
- .p2align 4,, 4
-
-L(exit): /* exit */
+ENTRY(memcpy)
+ movq %rsi, %rax
#ifdef USE_AS_MEMPCPY
- movq %rdi, %rax /* return value */
-#else
- rep
+ leaq (%rdi,%rdx), %r11
#endif
- retq
-
- .p2align 4
-
-L(1after):
-#ifndef USE_AS_MEMPCPY
- movq %rax, RETVAL(%rsp) /* save return value */
-#endif
-
-/* Align to the natural word size. */
-
-L(aligntry):
- movl %esi, %ecx /* align by source */
-
- andl $7, %ecx
- jz L(alignafter) /* already aligned */
-
-L(align): /* align */
- leaq -8(%rcx, %rdx), %rdx /* calculate remaining bytes */
- subl $8, %ecx
-
- .p2align 4
-
-L(alignloop): /* 1-byte alignment loop */
- movzbl (%rsi), %eax
- movb %al, (%rdi)
-
- incl %ecx
-
- leaq 1(%rsi), %rsi
- leaq 1(%rdi), %rdi
-
- jnz L(alignloop)
-
- .p2align 4
-
-L(alignafter):
-
-/* Handle mid-sized blocks. */
-
-L(32try): /* up to 1KB */
- cmpq $1024, %rdx
- ja L(32after)
-
-L(32): /* 32-byte loop */
- movl %edx, %ecx
- shrl $5, %ecx
- jz L(32skip)
-
- .p2align 4
-
-L(32loop):
- decl %ecx
-
- movq (%rsi), %rax
- movq 8(%rsi), %r8
- movq 16(%rsi), %r9
- movq 24(%rsi), %r10
-
- movq %rax, (%rdi)
- movq %r8, 8(%rdi)
- movq %r9, 16(%rdi)
- movq %r10, 24(%rdi)
-
- leaq 32(%rsi), %rsi
- leaq 32(%rdi), %rdi
-
- jz L(32skip) /* help out smaller blocks */
-
- decl %ecx
-
- movq (%rsi), %rax
- movq 8(%rsi), %r8
- movq 16(%rsi), %r9
- movq 24(%rsi), %r10
-
- movq %rax, (%rdi)
- movq %r8, 8(%rdi)
- movq %r9, 16(%rdi)
- movq %r10, 24(%rdi)
-
- leaq 32(%rsi), %rsi
- leaq 32(%rdi), %rdi
-
- jnz L(32loop)
-
- .p2align 4
-
-L(32skip):
- andl $31, %edx /* check for left overs */
+ leaq (%rdx,%rdx), %rcx
+ subq %rdi, %rax
+ subq %rdx, %rax
+ cmpq %rcx, %rax
+ jb L(overlapping)
+ cmpq $16, %rdx
+ jbe L(less_16)
+ movdqu (%rsi), %xmm8
+ cmpq $32, %rdx
+ movdqu %xmm8, (%rdi)
+ movdqu -16(%rsi,%rdx), %xmm8
+ movdqu %xmm8, -16(%rdi,%rdx)
+ ja .L31
+L(return):
#ifdef USE_AS_MEMPCPY
- jnz L(1)
-
- movq %rdi, %rax
-#else
- movq RETVAL(%rsp), %rax
- jnz L(1)
-
- rep
-#endif
- retq /* exit */
-
- .p2align 4
-
-L(32after):
-
-/*
- In order to minimize code-size in RTLD, algorithms specific for
- larger blocks are excluded when building for RTLD.
-*/
-
-/* Handle blocks smaller than 1/2 L1. */
-
-L(fasttry): /* first 1/2 L1 */
-#if IS_IN (libc) /* only up to this algorithm outside of libc.so */
- mov __x86_data_cache_size_half(%rip), %R11_LP
- cmpq %rdx, %r11 /* calculate the smaller of */
- cmovaq %rdx, %r11 /* remaining bytes and 1/2 L1 */
-#endif
-
-L(fast): /* good ol' MOVS */
-#if IS_IN (libc)
- movq %r11, %rcx
- andq $-8, %r11
+ movq %r11, %rax
#else
- movq %rdx, %rcx
-#endif
- shrq $3, %rcx
- jz L(fastskip)
-
- rep
- movsq
-
- .p2align 4,, 4
-
-L(fastskip):
-#if IS_IN (libc)
- subq %r11, %rdx /* check for more */
- testq $-8, %rdx
- jnz L(fastafter)
-#endif
-
- andl $7, %edx /* check for left overs */
-#ifdef USE_AS_MEMPCPY
- jnz L(1)
-
movq %rdi, %rax
-#else
- movq RETVAL(%rsp), %rax
- jnz L(1)
-
- rep
#endif
- retq /* exit */
-
-#if IS_IN (libc) /* none of the algorithms below for RTLD */
-
- .p2align 4
-
-L(fastafter):
-
-/* Handle large blocks smaller than 1/2 L2. */
-
-L(pretry): /* first 1/2 L2 */
- mov __x86_shared_cache_size_half (%rip), %R8_LP
- cmpq %rdx, %r8 /* calculate the lesser of */
- cmovaq %rdx, %r8 /* remaining bytes and 1/2 L2 */
-
-L(pre): /* 64-byte with prefetching */
- movq %r8, %rcx
- andq $-64, %r8
- shrq $6, %rcx
- jz L(preskip)
-
- movq %r14, SAVE0(%rsp)
- cfi_rel_offset (%r14, SAVE0)
- movq %r13, SAVE1(%rsp)
- cfi_rel_offset (%r13, SAVE1)
- movq %r12, SAVE2(%rsp)
- cfi_rel_offset (%r12, SAVE2)
- movq %rbx, SAVE3(%rsp)
- cfi_rel_offset (%rbx, SAVE3)
-
- cmpl $0, __x86_prefetchw(%rip)
- jz L(preloop) /* check if PREFETCHW OK */
-
+ ret
+ .p2align 4,,10
.p2align 4
-
-/* ... when PREFETCHW is available (less cache-probe traffic in MP systems). */
-
-L(prewloop): /* cache-line in state M */
- decq %rcx
-
- movq (%rsi), %rax
- movq 8 (%rsi), %rbx
- movq 16 (%rsi), %r9
- movq 24 (%rsi), %r10
- movq 32 (%rsi), %r11
- movq 40 (%rsi), %r12
- movq 48 (%rsi), %r13
- movq 56 (%rsi), %r14
-
- prefetcht0 0 + 896 (%rsi)
- prefetcht0 64 + 896 (%rsi)
-
- movq %rax, (%rdi)
- movq %rbx, 8(%rdi)
- movq %r9, 16(%rdi)
- movq %r10, 24(%rdi)
- movq %r11, 32(%rdi)
- movq %r12, 40(%rdi)
- movq %r13, 48(%rdi)
- movq %r14, 56(%rdi)
-
- leaq 64(%rsi), %rsi
- leaq 64(%rdi), %rdi
-
- jz L(prebail)
-
- decq %rcx
-
- movq (%rsi), %rax
- movq 8(%rsi), %rbx
- movq 16(%rsi), %r9
- movq 24(%rsi), %r10
- movq 32(%rsi), %r11
- movq 40(%rsi), %r12
- movq 48(%rsi), %r13
- movq 56(%rsi), %r14
-
- movq %rax, (%rdi)
- movq %rbx, 8(%rdi)
- movq %r9, 16(%rdi)
- movq %r10, 24(%rdi)
- movq %r11, 32(%rdi)
- movq %r12, 40(%rdi)
- movq %r13, 48(%rdi)
- movq %r14, 56(%rdi)
-
- prefetchw 896 - 64(%rdi)
- prefetchw 896 - 0(%rdi)
-
- leaq 64(%rsi), %rsi
- leaq 64(%rdi), %rdi
-
- jnz L(prewloop)
- jmp L(prebail)
-
+.L31:
+ movdqu 16(%rsi), %xmm8
+ cmpq $64, %rdx
+ movdqu %xmm8, 16(%rdi)
+ movdqu -32(%rsi,%rdx), %xmm8
+ movdqu %xmm8, -32(%rdi,%rdx)
+ jbe L(return)
+ movdqu 32(%rsi), %xmm8
+ cmpq $128, %rdx
+ movdqu %xmm8, 32(%rdi)
+ movdqu -48(%rsi,%rdx), %xmm8
+ movdqu %xmm8, -48(%rdi,%rdx)
+ movdqu 48(%rsi), %xmm8
+ movdqu %xmm8, 48(%rdi)
+ movdqu -64(%rsi,%rdx), %xmm8
+ movdqu %xmm8, -64(%rdi,%rdx)
+ jbe L(return)
+ leaq 64(%rdi), %rcx
+ addq %rdi, %rdx
+ andq $-64, %rdx
+ andq $-64, %rcx
+ movq %rcx, %rax
+ subq %rdi, %rax
+ addq %rax, %rsi
+ cmpq %rdx, %rcx
+ je L(return)
+ movq %rsi, %r10
+ subq %rcx, %r10
+ leaq 16(%r10), %r9
+ leaq 32(%r10), %r8
+ leaq 48(%r10), %rax
+ .p2align 4,,10
.p2align 4
-
-/* ... when PREFETCHW is not available. */
-
-L(preloop): /* cache-line in state E */
- decq %rcx
-
- movq (%rsi), %rax
- movq 8(%rsi), %rbx
- movq 16(%rsi), %r9
- movq 24(%rsi), %r10
- movq 32(%rsi), %r11
- movq 40(%rsi), %r12
- movq 48(%rsi), %r13
- movq 56(%rsi), %r14
-
- prefetcht0 896 + 0(%rsi)
- prefetcht0 896 + 64(%rsi)
-
- movq %rax, (%rdi)
- movq %rbx, 8(%rdi)
- movq %r9, 16(%rdi)
- movq %r10, 24(%rdi)
- movq %r11, 32(%rdi)
- movq %r12, 40(%rdi)
- movq %r13, 48(%rdi)
- movq %r14, 56(%rdi)
-
- leaq 64 (%rsi), %rsi
- leaq 64 (%rdi), %rdi
-
- jz L(prebail)
-
- decq %rcx
-
- movq (%rsi), %rax
- movq 8(%rsi), %rbx
- movq 16(%rsi), %r9
- movq 24(%rsi), %r10
- movq 32(%rsi), %r11
- movq 40(%rsi), %r12
- movq 48(%rsi), %r13
- movq 56(%rsi), %r14
-
- prefetcht0 896 - 64(%rdi)
- prefetcht0 896 - 0(%rdi)
-
- movq %rax, (%rdi)
- movq %rbx, 8(%rdi)
- movq %r9, 16(%rdi)
- movq %r10, 24(%rdi)
- movq %r11, 32(%rdi)
- movq %r12, 40(%rdi)
- movq %r13, 48(%rdi)
- movq %r14, 56(%rdi)
-
- leaq 64(%rsi), %rsi
- leaq 64(%rdi), %rdi
-
- jnz L(preloop)
-
-L(prebail):
- movq SAVE3(%rsp), %rbx
- cfi_restore (%rbx)
- movq SAVE2(%rsp), %r12
- cfi_restore (%r12)
- movq SAVE1(%rsp), %r13
- cfi_restore (%r13)
- movq SAVE0(%rsp), %r14
- cfi_restore (%r14)
-
-/* .p2align 4 */
-
-L(preskip):
- subq %r8, %rdx /* check for more */
- testq $-64, %rdx
- jnz L(preafter)
-
- andl $63, %edx /* check for left overs */
-#ifdef USE_AS_MEMPCPY
- jnz L(1)
-
- movq %rdi, %rax
-#else
- movq RETVAL(%rsp), %rax
- jnz L(1)
-
- rep
-#endif
- retq /* exit */
-
+L(loop):
+ movdqu (%rcx,%r10), %xmm8
+ movdqa %xmm8, (%rcx)
+ movdqu (%rcx,%r9), %xmm8
+ movdqa %xmm8, 16(%rcx)
+ movdqu (%rcx,%r8), %xmm8
+ movdqa %xmm8, 32(%rcx)
+ movdqu (%rcx,%rax), %xmm8
+ movdqa %xmm8, 48(%rcx)
+ addq $64, %rcx
+ cmpq %rcx, %rdx
+ jne L(loop)
+ jmp L(return)
+L(overlapping):
+ cmpq %rsi, %rdi
+ jae .L3
+ testq %rdx, %rdx
+ .p2align 4,,5
+ je L(return)
+ movq %rdx, %r9
+ leaq 16(%rsi), %rcx
+ leaq 16(%rdi), %r8
+ shrq $4, %r9
+ movq %r9, %rax
+ salq $4, %rax
+ cmpq %rcx, %rdi
+ setae %cl
+ cmpq %r8, %rsi
+ setae %r8b
+ orl %r8d, %ecx
+ cmpq $15, %rdx
+ seta %r8b
+ testb %r8b, %cl
+ je .L16
+ testq %rax, %rax
+ je .L16
+ xorl %ecx, %ecx
+ xorl %r8d, %r8d
+.L7:
+ movdqu (%rsi,%rcx), %xmm8
+ addq $1, %r8
+ movdqu %xmm8, (%rdi,%rcx)
+ addq $16, %rcx
+ cmpq %r8, %r9
+ ja .L7
+ cmpq %rax, %rdx
+ je L(return)
+.L21:
+ movzbl (%rsi,%rax), %ecx
+ movb %cl, (%rdi,%rax)
+ addq $1, %rax
+ cmpq %rax, %rdx
+ ja .L21
+ jmp L(return)
+L(less_16):
+ testb $24, %dl
+ jne L(between_9_16)
+ testb $4, %dl
+ .p2align 4,,5
+ jne L(between_5_8)
+ testq %rdx, %rdx
+ .p2align 4,,2
+ je L(return)
+ movzbl (%rsi), %eax
+ testb $2, %dl
+ movb %al, (%rdi)
+ je L(return)
+ movzwl -2(%rsi,%rdx), %eax
+ movw %ax, -2(%rdi,%rdx)
+ jmp L(return)
+.L3:
+ leaq -1(%rdx), %rax
+ .p2align 4,,10
.p2align 4
-
-L(preafter):
-
-/* Handle huge blocks. */
-
-L(NTtry):
-
-L(NT): /* non-temporal 128-byte */
- movq %rdx, %rcx
- shrq $7, %rcx
- jz L(NTskip)
-
- movq %r14, SAVE0(%rsp)
- cfi_rel_offset (%r14, SAVE0)
- movq %r13, SAVE1(%rsp)
- cfi_rel_offset (%r13, SAVE1)
- movq %r12, SAVE2(%rsp)
- cfi_rel_offset (%r12, SAVE2)
-
- .p2align 4
-
-L(NTloop):
- prefetchnta 768(%rsi)
- prefetchnta 832(%rsi)
-
- decq %rcx
-
- movq (%rsi), %rax
- movq 8(%rsi), %r8
- movq 16(%rsi), %r9
- movq 24(%rsi), %r10
- movq 32(%rsi), %r11
- movq 40(%rsi), %r12
- movq 48(%rsi), %r13
- movq 56(%rsi), %r14
-
- movntiq %rax, (%rdi)
- movntiq %r8, 8(%rdi)
- movntiq %r9, 16(%rdi)
- movntiq %r10, 24(%rdi)
- movntiq %r11, 32(%rdi)
- movntiq %r12, 40(%rdi)
- movntiq %r13, 48(%rdi)
- movntiq %r14, 56(%rdi)
-
- movq 64(%rsi), %rax
- movq 72(%rsi), %r8
- movq 80(%rsi), %r9
- movq 88(%rsi), %r10
- movq 96(%rsi), %r11
- movq 104(%rsi), %r12
- movq 112(%rsi), %r13
- movq 120(%rsi), %r14
-
- movntiq %rax, 64(%rdi)
- movntiq %r8, 72(%rdi)
- movntiq %r9, 80(%rdi)
- movntiq %r10, 88(%rdi)
- movntiq %r11, 96(%rdi)
- movntiq %r12, 104(%rdi)
- movntiq %r13, 112(%rdi)
- movntiq %r14, 120(%rdi)
-
- leaq 128(%rsi), %rsi
- leaq 128(%rdi), %rdi
-
- jnz L(NTloop)
-
- sfence /* serialize memory stores */
-
- movq SAVE2(%rsp), %r12
- cfi_restore (%r12)
- movq SAVE1(%rsp), %r13
- cfi_restore (%r13)
- movq SAVE0(%rsp), %r14
- cfi_restore (%r14)
-
-L(NTskip):
- andl $127, %edx /* check for left overs */
-#ifdef USE_AS_MEMPCPY
- jnz L(1)
-
- movq %rdi, %rax
-#else
- movq RETVAL(%rsp), %rax
- jnz L(1)
-
- rep
-#endif
- retq /* exit */
-
-#endif /* IS_IN (libc) */
-
+.L11:
+ movzbl (%rsi,%rax), %edx
+ movb %dl, (%rdi,%rax)
+ subq $1, %rax
+ jmp .L11
+L(between_9_16):
+ movq (%rsi), %rax
+ movq %rax, (%rdi)
+ movq -8(%rsi,%rdx), %rax
+ movq %rax, -8(%rdi,%rdx)
+ jmp L(return)
+.L16:
+ xorl %eax, %eax
+ jmp .L21
+L(between_5_8):
+ movl (%rsi), %eax
+ movl %eax, (%rdi)
+ movl -4(%rsi,%rdx), %eax
+ movl %eax, -4(%rdi,%rdx)
+ jmp L(return)
END(memcpy)
#ifndef USE_AS_MEMPCPY
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index d10b4d4fb3..917e1a0c79 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -8,7 +8,7 @@ ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcmp-sse2-unaligned strncmp-ssse3 \
memcmp-sse4 memcpy-ssse3 \
- memcpy-sse2-unaligned mempcpy-ssse3 \
+ memcpy-sse2 mempcpy-sse2 mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
memmove-ssse3-back strcasecmp_l-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
deleted file mode 100644
index 5693ba7395..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ /dev/null
@@ -1,175 +0,0 @@
-/* memcpy with unaliged loads
- Copyright (C) 2013-2015 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-#include <sysdep.h>
-
-#include "asm-syntax.h"
-
-
-ENTRY(__memcpy_sse2_unaligned)
- movq %rsi, %rax
- leaq (%rdx,%rdx), %rcx
- subq %rdi, %rax
- subq %rdx, %rax
- cmpq %rcx, %rax
- jb L(overlapping)
- cmpq $16, %rdx
- jbe L(less_16)
- movdqu (%rsi), %xmm8
- cmpq $32, %rdx
- movdqu %xmm8, (%rdi)
- movdqu -16(%rsi,%rdx), %xmm8
- movdqu %xmm8, -16(%rdi,%rdx)
- ja .L31
-L(return):
- movq %rdi, %rax
- ret
- .p2align 4,,10
- .p2align 4
-.L31:
- movdqu 16(%rsi), %xmm8
- cmpq $64, %rdx
- movdqu %xmm8, 16(%rdi)
- movdqu -32(%rsi,%rdx), %xmm8
- movdqu %xmm8, -32(%rdi,%rdx)
- jbe L(return)
- movdqu 32(%rsi), %xmm8
- cmpq $128, %rdx
- movdqu %xmm8, 32(%rdi)
- movdqu -48(%rsi,%rdx), %xmm8
- movdqu %xmm8, -48(%rdi,%rdx)
- movdqu 48(%rsi), %xmm8
- movdqu %xmm8, 48(%rdi)
- movdqu -64(%rsi,%rdx), %xmm8
- movdqu %xmm8, -64(%rdi,%rdx)
- jbe L(return)
- leaq 64(%rdi), %rcx
- addq %rdi, %rdx
- andq $-64, %rdx
- andq $-64, %rcx
- movq %rcx, %rax
- subq %rdi, %rax
- addq %rax, %rsi
- cmpq %rdx, %rcx
- je L(return)
- movq %rsi, %r10
- subq %rcx, %r10
- leaq 16(%r10), %r9
- leaq 32(%r10), %r8
- leaq 48(%r10), %rax
- .p2align 4,,10
- .p2align 4
-L(loop):
- movdqu (%rcx,%r10), %xmm8
- movdqa %xmm8, (%rcx)
- movdqu (%rcx,%r9), %xmm8
- movdqa %xmm8, 16(%rcx)
- movdqu (%rcx,%r8), %xmm8
- movdqa %xmm8, 32(%rcx)
- movdqu (%rcx,%rax), %xmm8
- movdqa %xmm8, 48(%rcx)
- addq $64, %rcx
- cmpq %rcx, %rdx
- jne L(loop)
- jmp L(return)
-L(overlapping):
- cmpq %rsi, %rdi
- jae .L3
- testq %rdx, %rdx
- .p2align 4,,5
- je L(return)
- movq %rdx, %r9
- leaq 16(%rsi), %rcx
- leaq 16(%rdi), %r8
- shrq $4, %r9
- movq %r9, %rax
- salq $4, %rax
- cmpq %rcx, %rdi
- setae %cl
- cmpq %r8, %rsi
- setae %r8b
- orl %r8d, %ecx
- cmpq $15, %rdx
- seta %r8b
- testb %r8b, %cl
- je .L16
- testq %rax, %rax
- je .L16
- xorl %ecx, %ecx
- xorl %r8d, %r8d
-.L7:
- movdqu (%rsi,%rcx), %xmm8
- addq $1, %r8
- movdqu %xmm8, (%rdi,%rcx)
- addq $16, %rcx
- cmpq %r8, %r9
- ja .L7
- cmpq %rax, %rdx
- je L(return)
-.L21:
- movzbl (%rsi,%rax), %ecx
- movb %cl, (%rdi,%rax)
- addq $1, %rax
- cmpq %rax, %rdx
- ja .L21
- jmp L(return)
-L(less_16):
- testb $24, %dl
- jne L(between_9_16)
- testb $4, %dl
- .p2align 4,,5
- jne L(between_5_8)
- testq %rdx, %rdx
- .p2align 4,,2
- je L(return)
- movzbl (%rsi), %eax
- testb $2, %dl
- movb %al, (%rdi)
- je L(return)
- movzwl -2(%rsi,%rdx), %eax
- movw %ax, -2(%rdi,%rdx)
- jmp L(return)
-.L3:
- leaq -1(%rdx), %rax
- .p2align 4,,10
- .p2align 4
-.L11:
- movzbl (%rsi,%rax), %edx
- movb %dl, (%rdi,%rax)
- subq $1, %rax
- jmp .L11
-L(between_9_16):
- movq (%rsi), %rax
- movq %rax, (%rdi)
- movq -8(%rsi,%rdx), %rax
- movq %rax, -8(%rdi,%rdx)
- jmp L(return)
-.L16:
- xorl %eax, %eax
- jmp .L21
-L(between_5_8):
- movl (%rsi), %eax
- movl %eax, (%rdi)
- movl -4(%rsi,%rdx), %eax
- movl %eax, -4(%rdi,%rdx)
- jmp L(return)
-END(__memcpy_sse2_unaligned)
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2.S b/sysdeps/x86_64/multiarch/memcpy-sse2.S
new file mode 100644
index 0000000000..9585b1fd03
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2.S
@@ -0,0 +1,569 @@
+/*
+ Optimized memcpy for x86-64.
+
+ Copyright (C) 2007-2015 Free Software Foundation, Inc.
+ Contributed by Evandro Menezes <evandro.menezes@amd.com>, 2007.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/* Stack slots in the red-zone. */
+
+#ifdef USE_AS_MEMPCPY
+# define RETVAL (0)
+#else
+# define RETVAL (-8)
+#endif
+#define SAVE0 (RETVAL - 8)
+#define SAVE1 (SAVE0 - 8)
+#define SAVE2 (SAVE1 - 8)
+#define SAVE3 (SAVE2 - 8)
+
+ .text
+
+#if defined SHARED && IS_IN (libc)
+ENTRY_CHK (__memcpy_chk_sse2)
+
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+
+END_CHK (__memcpy_chk_sse2)
+#endif
+
+ENTRY(__memcpy_sse2) /* (void *, const void*, size_t) */
+
+/* Handle tiny blocks. */
+
+L(1try): /* up to 32B */
+ cmpq $32, %rdx
+#ifndef USE_AS_MEMPCPY
+ movq %rdi, %rax /* save return value */
+#endif
+ jae L(1after)
+
+L(1): /* 1-byte once */
+ testb $1, %dl
+ jz L(1a)
+
+ movzbl (%rsi), %ecx
+ movb %cl, (%rdi)
+
+ incq %rsi
+ incq %rdi
+
+ .p2align 4,, 4
+
+L(1a): /* 2-byte once */
+ testb $2, %dl
+ jz L(1b)
+
+ movzwl (%rsi), %ecx
+ movw %cx, (%rdi)
+
+ addq $2, %rsi
+ addq $2, %rdi
+
+ .p2align 4,, 4
+
+L(1b): /* 4-byte once */
+ testb $4, %dl
+ jz L(1c)
+
+ movl (%rsi), %ecx
+ movl %ecx, (%rdi)
+
+ addq $4, %rsi
+ addq $4, %rdi
+
+ .p2align 4,, 4
+
+L(1c): /* 8-byte once */
+ testb $8, %dl
+ jz L(1d)
+
+ movq (%rsi), %rcx
+ movq %rcx, (%rdi)
+
+ addq $8, %rsi
+ addq $8, %rdi
+
+ .p2align 4,, 4
+
+L(1d): /* 16-byte loop */
+ andl $0xf0, %edx
+ jz L(exit)
+
+ .p2align 4
+
+L(1loop):
+ movq (%rsi), %rcx
+ movq 8(%rsi), %r8
+ movq %rcx, (%rdi)
+ movq %r8, 8(%rdi)
+
+ subl $16, %edx
+
+ leaq 16(%rsi), %rsi
+ leaq 16(%rdi), %rdi
+
+ jnz L(1loop)
+
+ .p2align 4,, 4
+
+L(exit): /* exit */
+#ifdef USE_AS_MEMPCPY
+ movq %rdi, %rax /* return value */
+#else
+ rep
+#endif
+ retq
+
+ .p2align 4
+
+L(1after):
+#ifndef USE_AS_MEMPCPY
+ movq %rax, RETVAL(%rsp) /* save return value */
+#endif
+
+/* Align to the natural word size. */
+
+L(aligntry):
+ movl %esi, %ecx /* align by source */
+
+ andl $7, %ecx
+ jz L(alignafter) /* already aligned */
+
+L(align): /* align */
+ leaq -8(%rcx, %rdx), %rdx /* calculate remaining bytes */
+ subl $8, %ecx
+
+ .p2align 4
+
+L(alignloop): /* 1-byte alignment loop */
+ movzbl (%rsi), %eax
+ movb %al, (%rdi)
+
+ incl %ecx
+
+ leaq 1(%rsi), %rsi
+ leaq 1(%rdi), %rdi
+
+ jnz L(alignloop)
+
+ .p2align 4
+
+L(alignafter):
+
+/* Handle mid-sized blocks. */
+
+L(32try): /* up to 1KB */
+ cmpq $1024, %rdx
+ ja L(32after)
+
+L(32): /* 32-byte loop */
+ movl %edx, %ecx
+ shrl $5, %ecx
+ jz L(32skip)
+
+ .p2align 4
+
+L(32loop):
+ decl %ecx
+
+ movq (%rsi), %rax
+ movq 8(%rsi), %r8
+ movq 16(%rsi), %r9
+ movq 24(%rsi), %r10
+
+ movq %rax, (%rdi)
+ movq %r8, 8(%rdi)
+ movq %r9, 16(%rdi)
+ movq %r10, 24(%rdi)
+
+ leaq 32(%rsi), %rsi
+ leaq 32(%rdi), %rdi
+
+ jz L(32skip) /* help out smaller blocks */
+
+ decl %ecx
+
+ movq (%rsi), %rax
+ movq 8(%rsi), %r8
+ movq 16(%rsi), %r9
+ movq 24(%rsi), %r10
+
+ movq %rax, (%rdi)
+ movq %r8, 8(%rdi)
+ movq %r9, 16(%rdi)
+ movq %r10, 24(%rdi)
+
+ leaq 32(%rsi), %rsi
+ leaq 32(%rdi), %rdi
+
+ jnz L(32loop)
+
+ .p2align 4
+
+L(32skip):
+ andl $31, %edx /* check for left overs */
+#ifdef USE_AS_MEMPCPY
+ jnz L(1)
+
+ movq %rdi, %rax
+#else
+ movq RETVAL(%rsp), %rax
+ jnz L(1)
+
+ rep
+#endif
+ retq /* exit */
+
+ .p2align 4
+
+L(32after):
+
+/*
+ In order to minimize code-size in RTLD, algorithms specific for
+ larger blocks are excluded when building for RTLD.
+*/
+
+/* Handle blocks smaller than 1/2 L1. */
+
+L(fasttry): /* first 1/2 L1 */
+#if IS_IN (libc) /* only up to this algorithm outside of libc.so */
+ mov __x86_data_cache_size_half(%rip), %R11_LP
+ cmpq %rdx, %r11 /* calculate the smaller of */
+ cmovaq %rdx, %r11 /* remaining bytes and 1/2 L1 */
+#endif
+
+L(fast): /* good ol' MOVS */
+#if IS_IN (libc)
+ movq %r11, %rcx
+ andq $-8, %r11
+#else
+ movq %rdx, %rcx
+#endif
+ shrq $3, %rcx
+ jz L(fastskip)
+
+ rep
+ movsq
+
+ .p2align 4,, 4
+
+L(fastskip):
+#if IS_IN (libc)
+ subq %r11, %rdx /* check for more */
+ testq $-8, %rdx
+ jnz L(fastafter)
+#endif
+
+ andl $7, %edx /* check for left overs */
+#ifdef USE_AS_MEMPCPY
+ jnz L(1)
+
+ movq %rdi, %rax
+#else
+ movq RETVAL(%rsp), %rax
+ jnz L(1)
+
+ rep
+#endif
+ retq /* exit */
+
+#if IS_IN (libc) /* none of the algorithms below for RTLD */
+
+ .p2align 4
+
+L(fastafter):
+
+/* Handle large blocks smaller than 1/2 L2. */
+
+L(pretry): /* first 1/2 L2 */
+ mov __x86_shared_cache_size_half (%rip), %R8_LP
+ cmpq %rdx, %r8 /* calculate the lesser of */
+ cmovaq %rdx, %r8 /* remaining bytes and 1/2 L2 */
+
+L(pre): /* 64-byte with prefetching */
+ movq %r8, %rcx
+ andq $-64, %r8
+ shrq $6, %rcx
+ jz L(preskip)
+
+ movq %r14, SAVE0(%rsp)
+ cfi_rel_offset (%r14, SAVE0)
+ movq %r13, SAVE1(%rsp)
+ cfi_rel_offset (%r13, SAVE1)
+ movq %r12, SAVE2(%rsp)
+ cfi_rel_offset (%r12, SAVE2)
+ movq %rbx, SAVE3(%rsp)
+ cfi_rel_offset (%rbx, SAVE3)
+
+ cmpl $0, __x86_prefetchw(%rip)
+ jz L(preloop) /* check if PREFETCHW OK */
+
+ .p2align 4
+
+/* ... when PREFETCHW is available (less cache-probe traffic in MP systems). */
+
+L(prewloop): /* cache-line in state M */
+ decq %rcx
+
+ movq (%rsi), %rax
+ movq 8 (%rsi), %rbx
+ movq 16 (%rsi), %r9
+ movq 24 (%rsi), %r10
+ movq 32 (%rsi), %r11
+ movq 40 (%rsi), %r12
+ movq 48 (%rsi), %r13
+ movq 56 (%rsi), %r14
+
+ prefetcht0 0 + 896 (%rsi)
+ prefetcht0 64 + 896 (%rsi)
+
+ movq %rax, (%rdi)
+ movq %rbx, 8(%rdi)
+ movq %r9, 16(%rdi)
+ movq %r10, 24(%rdi)
+ movq %r11, 32(%rdi)
+ movq %r12, 40(%rdi)
+ movq %r13, 48(%rdi)
+ movq %r14, 56(%rdi)
+
+ leaq 64(%rsi), %rsi
+ leaq 64(%rdi), %rdi
+
+ jz L(prebail)
+
+ decq %rcx
+
+ movq (%rsi), %rax
+ movq 8(%rsi), %rbx
+ movq 16(%rsi), %r9
+ movq 24(%rsi), %r10
+ movq 32(%rsi), %r11
+ movq 40(%rsi), %r12
+ movq 48(%rsi), %r13
+ movq 56(%rsi), %r14
+
+ movq %rax, (%rdi)
+ movq %rbx, 8(%rdi)
+ movq %r9, 16(%rdi)
+ movq %r10, 24(%rdi)
+ movq %r11, 32(%rdi)
+ movq %r12, 40(%rdi)
+ movq %r13, 48(%rdi)
+ movq %r14, 56(%rdi)
+
+ prefetchw 896 - 64(%rdi)
+ prefetchw 896 - 0(%rdi)
+
+ leaq 64(%rsi), %rsi
+ leaq 64(%rdi), %rdi
+
+ jnz L(prewloop)
+ jmp L(prebail)
+
+ .p2align 4
+
+/* ... when PREFETCHW is not available. */
+
+L(preloop): /* cache-line in state E */
+ decq %rcx
+
+ movq (%rsi), %rax
+ movq 8(%rsi), %rbx
+ movq 16(%rsi), %r9
+ movq 24(%rsi), %r10
+ movq 32(%rsi), %r11
+ movq 40(%rsi), %r12
+ movq 48(%rsi), %r13
+ movq 56(%rsi), %r14
+
+ prefetcht0 896 + 0(%rsi)
+ prefetcht0 896 + 64(%rsi)
+
+ movq %rax, (%rdi)
+ movq %rbx, 8(%rdi)
+ movq %r9, 16(%rdi)
+ movq %r10, 24(%rdi)
+ movq %r11, 32(%rdi)
+ movq %r12, 40(%rdi)
+ movq %r13, 48(%rdi)
+ movq %r14, 56(%rdi)
+
+ leaq 64 (%rsi), %rsi
+ leaq 64 (%rdi), %rdi
+
+ jz L(prebail)
+
+ decq %rcx
+
+ movq (%rsi), %rax
+ movq 8(%rsi), %rbx
+ movq 16(%rsi), %r9
+ movq 24(%rsi), %r10
+ movq 32(%rsi), %r11
+ movq 40(%rsi), %r12
+ movq 48(%rsi), %r13
+ movq 56(%rsi), %r14
+
+ prefetcht0 896 - 64(%rdi)
+ prefetcht0 896 - 0(%rdi)
+
+ movq %rax, (%rdi)
+ movq %rbx, 8(%rdi)
+ movq %r9, 16(%rdi)
+ movq %r10, 24(%rdi)
+ movq %r11, 32(%rdi)
+ movq %r12, 40(%rdi)
+ movq %r13, 48(%rdi)
+ movq %r14, 56(%rdi)
+
+ leaq 64(%rsi), %rsi
+ leaq 64(%rdi), %rdi
+
+ jnz L(preloop)
+
+L(prebail):
+ movq SAVE3(%rsp), %rbx
+ cfi_restore (%rbx)
+ movq SAVE2(%rsp), %r12
+ cfi_restore (%r12)
+ movq SAVE1(%rsp), %r13
+ cfi_restore (%r13)
+ movq SAVE0(%rsp), %r14
+ cfi_restore (%r14)
+
+/* .p2align 4 */
+
+L(preskip):
+ subq %r8, %rdx /* check for more */
+ testq $-64, %rdx
+ jnz L(preafter)
+
+ andl $63, %edx /* check for left overs */
+#ifdef USE_AS_MEMPCPY
+ jnz L(1)
+
+ movq %rdi, %rax
+#else
+ movq RETVAL(%rsp), %rax
+ jnz L(1)
+
+ rep
+#endif
+ retq /* exit */
+
+ .p2align 4
+
+L(preafter):
+
+/* Handle huge blocks. */
+
+L(NTtry):
+
+L(NT): /* non-temporal 128-byte */
+ movq %rdx, %rcx
+ shrq $7, %rcx
+ jz L(NTskip)
+
+ movq %r14, SAVE0(%rsp)
+ cfi_rel_offset (%r14, SAVE0)
+ movq %r13, SAVE1(%rsp)
+ cfi_rel_offset (%r13, SAVE1)
+ movq %r12, SAVE2(%rsp)
+ cfi_rel_offset (%r12, SAVE2)
+
+ .p2align 4
+
+L(NTloop):
+ prefetchnta 768(%rsi)
+ prefetchnta 832(%rsi)
+
+ decq %rcx
+
+ movq (%rsi), %rax
+ movq 8(%rsi), %r8
+ movq 16(%rsi), %r9
+ movq 24(%rsi), %r10
+ movq 32(%rsi), %r11
+ movq 40(%rsi), %r12
+ movq 48(%rsi), %r13
+ movq 56(%rsi), %r14
+
+ movntiq %rax, (%rdi)
+ movntiq %r8, 8(%rdi)
+ movntiq %r9, 16(%rdi)
+ movntiq %r10, 24(%rdi)
+ movntiq %r11, 32(%rdi)
+ movntiq %r12, 40(%rdi)
+ movntiq %r13, 48(%rdi)
+ movntiq %r14, 56(%rdi)
+
+ movq 64(%rsi), %rax
+ movq 72(%rsi), %r8
+ movq 80(%rsi), %r9
+ movq 88(%rsi), %r10
+ movq 96(%rsi), %r11
+ movq 104(%rsi), %r12
+ movq 112(%rsi), %r13
+ movq 120(%rsi), %r14
+
+ movntiq %rax, 64(%rdi)
+ movntiq %r8, 72(%rdi)
+ movntiq %r9, 80(%rdi)
+ movntiq %r10, 88(%rdi)
+ movntiq %r11, 96(%rdi)
+ movntiq %r12, 104(%rdi)
+ movntiq %r13, 112(%rdi)
+ movntiq %r14, 120(%rdi)
+
+ leaq 128(%rsi), %rsi
+ leaq 128(%rdi), %rdi
+
+ jnz L(NTloop)
+
+ sfence /* serialize memory stores */
+
+ movq SAVE2(%rsp), %r12
+ cfi_restore (%r12)
+ movq SAVE1(%rsp), %r13
+ cfi_restore (%r13)
+ movq SAVE0(%rsp), %r14
+ cfi_restore (%r14)
+
+L(NTskip):
+ andl $127, %edx /* check for left overs */
+#ifdef USE_AS_MEMPCPY
+ jnz L(1)
+
+ movq %rdi, %rax
+#else
+ movq RETVAL(%rsp), %rax
+ jnz L(1)
+
+ rep
+#endif
+ retq /* exit */
+
+#endif /* IS_IN (libc) */
+
+END(__memcpy_sse2)
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index 7e119d30e5..99c481c866 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -32,48 +32,49 @@ ENTRY(__new_memcpy)
LOAD_RTLD_GLOBAL_RO_RDX
leaq __memcpy_avx_unaligned(%rip), %rax
HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz 1f
- ret
-1: leaq __memcpy_sse2(%rip), %rax
- HAS_ARCH_FEATURE (Slow_BSF)
- jnz 2f
+ jnz 3f
leaq __memcpy_sse2_unaligned(%rip), %rax
- ret
-2: HAS_CPU_FEATURE (SSSE3)
- jz 3f
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 3f
+ leaq __memcpy_sse2(%rip), %rax
+ HAS_CPU_FEATURE (SSSE3)
+ jz 3f
+ leaq __memcpy_ssse3_back(%rip), %rax
+ HAS_CPU_FEATURE (Fast_Copy_Backward)
+ jnz 3f
leaq __memcpy_ssse3(%rip), %rax
3: ret
END(__new_memcpy)
# undef ENTRY
# define ENTRY(name) \
- .type __memcpy_sse2, @function; \
- .globl __memcpy_sse2; \
- .hidden __memcpy_sse2; \
+ .type __memcpy_sse2_unaligned, @function; \
+ .globl __memcpy_sse2_unaligned; \
+ .hidden __memcpy_sse2_unaligned; \
.p2align 4; \
- __memcpy_sse2: cfi_startproc; \
+ __memcpy_sse2_unaligned: cfi_startproc; \
CALL_MCOUNT
# undef END
# define END(name) \
- cfi_endproc; .size __memcpy_sse2, .-__memcpy_sse2
+ cfi_endproc; .size __memcpy_sse2_unaligned, .-__memcpy_sse2_unaligned
# undef ENTRY_CHK
# define ENTRY_CHK(name) \
- .type __memcpy_chk_sse2, @function; \
- .globl __memcpy_chk_sse2; \
+ .type __memcpy_chk_sse2_unaligned, @function; \
+ .globl __memcpy_chk_sse2_unaligned; \
.p2align 4; \
- __memcpy_chk_sse2: cfi_startproc; \
+ __memcpy_chk_sse2_unaligned: cfi_startproc; \
CALL_MCOUNT
# undef END_CHK
# define END_CHK(name) \
- cfi_endproc; .size __memcpy_chk_sse2, .-__memcpy_chk_sse2
+ cfi_endproc; .size __memcpy_chk_sse2_unaligned, .-__memcpy_chk_sse2_unaligned
# undef libc_hidden_builtin_def
/* It doesn't make sense to send libc-internal memcpy calls through a PLT.
The speedup we get from using SSSE3 instruction is likely eaten away
by the indirect call in the PLT. */
# define libc_hidden_builtin_def(name) \
- .globl __GI_memcpy; __GI_memcpy = __memcpy_sse2
+ .globl __GI_memcpy; __GI_memcpy = __memcpy_sse2_unaligned
versioned_symbol (libc, __new_memcpy, memcpy, GLIBC_2_14);
#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index 81f83ddb71..591d18a0c1 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -30,16 +30,19 @@
ENTRY(__memcpy_chk)
.type __memcpy_chk, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ leaq __memcpy_chk_avx_unaligned(%rip), %rax
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jnz 2f
+ leaq __memcpy_chk_sse2_unaligned(%rip), %rax
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
leaq __memcpy_chk_sse2(%rip), %rax
HAS_CPU_FEATURE (SSSE3)
jz 2f
- leaq __memcpy_chk_ssse3(%rip), %rax
- HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jz 2f
- leaq __memcpy_chk_ssse3_back(%rip), %rax
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz 2f
- leaq __memcpy_chk_avx_unaligned(%rip), %rax
+ leaq __memcpy_chk_ssse3_back(%rip), %rax
+ HAS_CPU_FEATURE (Fast_Copy_Backward)
+ jnz 2f
+ leaq __memcpy_chk_ssse3(%rip), %rax
2: ret
END(__memcpy_chk)
# else
diff --git a/sysdeps/x86_64/multiarch/mempcpy-sse2.S b/sysdeps/x86_64/multiarch/mempcpy-sse2.S
new file mode 100644
index 0000000000..e8bde29dc1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy-sse2.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define __memcpy_sse2 __mempcpy_sse2
+#define __memcpy_chk_sse2 __mempcpy_chk_sse2
+#include "memcpy-sse2.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index ad36840d54..450915f60f 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -28,41 +28,44 @@
ENTRY(__mempcpy)
.type __mempcpy, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ leaq __mempcpy_avx_unaligned(%rip), %rax
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jnz 2f
+ leaq __mempcpy_sse2_unaligned(%rip), %rax
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
leaq __mempcpy_sse2(%rip), %rax
HAS_CPU_FEATURE (SSSE3)
jz 2f
- leaq __mempcpy_ssse3(%rip), %rax
- HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jz 2f
- leaq __mempcpy_ssse3_back(%rip), %rax
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz 2f
- leaq __mempcpy_avx_unaligned(%rip), %rax
+ leaq __mempcpy_ssse3_back(%rip), %rax
+ HAS_CPU_FEATURE (Fast_Copy_Backward)
+ jnz 2f
+ leaq __mempcpy_ssse3(%rip), %rax
2: ret
END(__mempcpy)
# undef ENTRY
# define ENTRY(name) \
- .type __mempcpy_sse2, @function; \
+ .type __mempcpy_sse2_unaligned, @function; \
.p2align 4; \
- .globl __mempcpy_sse2; \
- .hidden __mempcpy_sse2; \
- __mempcpy_sse2: cfi_startproc; \
+ .globl __mempcpy_sse2_unaligned; \
+ .hidden __mempcpy_sse2_unaligned; \
+ __mempcpy_sse2_unaligned: cfi_startproc; \
CALL_MCOUNT
# undef END
# define END(name) \
- cfi_endproc; .size __mempcpy_sse2, .-__mempcpy_sse2
+ cfi_endproc; .size __mempcpy_sse2_unaligned, .-__mempcpy_sse2_unaligned
# undef ENTRY_CHK
# define ENTRY_CHK(name) \
- .type __mempcpy_chk_sse2, @function; \
- .globl __mempcpy_chk_sse2; \
+ .type __mempcpy_chk_sse2_unaligned, @function; \
+ .globl __mempcpy_chk_sse2_unaligned; \
.p2align 4; \
- __mempcpy_chk_sse2: cfi_startproc; \
+ __mempcpy_chk_sse2_unaligned: cfi_startproc; \
CALL_MCOUNT
# undef END_CHK
# define END_CHK(name) \
- cfi_endproc; .size __mempcpy_chk_sse2, .-__mempcpy_chk_sse2
+ cfi_endproc; .size __mempcpy_chk_sse2_unaligned, .-__mempcpy_chk_sse2_unaligned
# undef libc_hidden_def
# undef libc_hidden_builtin_def
@@ -70,9 +73,9 @@ END(__mempcpy)
The speedup we get from using SSSE3 instruction is likely eaten away
by the indirect call in the PLT. */
# define libc_hidden_def(name) \
- .globl __GI_mempcpy; __GI_mempcpy = __mempcpy_sse2
+ .globl __GI_mempcpy; __GI_mempcpy = __mempcpy_sse2_unaligned
# define libc_hidden_builtin_def(name) \
- .globl __GI___mempcpy; __GI___mempcpy = __mempcpy_sse2
+ .globl __GI___mempcpy; __GI___mempcpy = __mempcpy_sse2_unaligned
#endif
#include "../mempcpy.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index 0a46b567ec..9dc7dc80b7 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -30,16 +30,19 @@
ENTRY(__mempcpy_chk)
.type __mempcpy_chk, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ leaq __mempcpy_chk_avx_unaligned(%rip), %rax
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jnz 2f
+ leaq __mempcpy_chk_sse2_unaligned(%rip), %rax
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
leaq __mempcpy_chk_sse2(%rip), %rax
HAS_CPU_FEATURE (SSSE3)
jz 2f
- leaq __mempcpy_chk_ssse3(%rip), %rax
- HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jz 2f
- leaq __mempcpy_chk_ssse3_back(%rip), %rax
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz 2f
- leaq __mempcpy_chk_avx_unaligned(%rip), %rax
+ leaq __mempcpy_chk_ssse3_back(%rip), %rax
+ HAS_CPU_FEATURE (Fast_Copy_Backward)
+ jnz 2f
+ leaq __mempcpy_chk_ssse3(%rip), %rax
2: ret
END(__mempcpy_chk)
# else