Diffstat (limited to 'sysdeps/arm/armv7')
-rw-r--r--  sysdeps/arm/armv7/Implies                      |   2
-rw-r--r--  sysdeps/arm/armv7/configure                    |  72
-rw-r--r--  sysdeps/arm/armv7/configure.ac                 |  12
-rw-r--r--  sysdeps/arm/armv7/multiarch/Makefile           |   3
-rw-r--r--  sysdeps/arm/armv7/multiarch/aeabi_memcpy.c     |   2
-rw-r--r--  sysdeps/arm/armv7/multiarch/ifunc-impl-list.c  |  56
-rw-r--r--  sysdeps/arm/armv7/multiarch/memcpy.S           |  76
-rw-r--r--  sysdeps/arm/armv7/multiarch/memcpy_impl.S      | 917
-rw-r--r--  sysdeps/arm/armv7/multiarch/memcpy_neon.S      |   9
-rw-r--r--  sysdeps/arm/armv7/multiarch/memcpy_vfp.S       |   7
10 files changed, 1156 insertions(+), 0 deletions(-)
diff --git a/sysdeps/arm/armv7/Implies b/sysdeps/arm/armv7/Implies
new file mode 100644
index 0000000000..c6cd0eb877
--- /dev/null
+++ b/sysdeps/arm/armv7/Implies
@@ -0,0 +1,2 @@
+# We can do everything that 6T2 can
+arm/armv6t2
diff --git a/sysdeps/arm/armv7/configure b/sysdeps/arm/armv7/configure
new file mode 100644
index 0000000000..46e5d52df4
--- /dev/null
+++ b/sysdeps/arm/armv7/configure
@@ -0,0 +1,72 @@
+# This file is generated from configure.ac by Autoconf. DO NOT EDIT!
+ # Local configure fragment for sysdeps/arm/armv7.
+
+# We need binutils 2.21 to ensure that NEON alignments are assembled correctly.
+libc_cv_arm_as_version_ok=yes
+for ac_prog in $AS
+do
+ # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_AS+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$AS"; then
+ ac_cv_prog_AS="$AS" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+ ac_cv_prog_AS="$ac_prog"
+ $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+ done
+IFS=$as_save_IFS
+
+fi
+fi
+AS=$ac_cv_prog_AS
+if test -n "$AS"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $AS" >&5
+$as_echo "$AS" >&6; }
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+ test -n "$AS" && break
+done
+
+if test -z "$AS"; then
+ ac_verc_fail=yes
+else
+ # Found it, now check the version.
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking version of $AS" >&5
+$as_echo_n "checking version of $AS... " >&6; }
+ ac_prog_version=`$AS --version 2>&1 | sed -n 's/^.*GNU assembler.* \([0-9]*\.[0-9.]*\).*$/\1/p'`
+ case $ac_prog_version in
+ '') ac_prog_version="v. ?.??, bad"; ac_verc_fail=yes;;
+ 2.1[0-9][0-9]*|2.2[1-9]*|2.[3-9][0-9]*|[3-9].*|[1-9][0-9]*)
+ ac_prog_version="$ac_prog_version, ok"; ac_verc_fail=no;;
+ *) ac_prog_version="$ac_prog_version, bad"; ac_verc_fail=yes;;
+
+ esac
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_prog_version" >&5
+$as_echo "$ac_prog_version" >&6; }
+fi
+if test $ac_verc_fail = yes; then
+ libc_cv_arm_as_version_ok=no
+fi
+
+
+if test $libc_cv_arm_as_version_ok != yes; then
+ as_fn_error $? "as version too old, at least 2.21 is required" "$LINENO" 5
+fi
diff --git a/sysdeps/arm/armv7/configure.ac b/sysdeps/arm/armv7/configure.ac
new file mode 100644
index 0000000000..01e93ecd36
--- /dev/null
+++ b/sysdeps/arm/armv7/configure.ac
@@ -0,0 +1,12 @@
+GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
+# Local configure fragment for sysdeps/arm/armv7.
+
+# We need binutils 2.21 to ensure that NEON alignments are assembled correctly.
+libc_cv_arm_as_version_ok=yes
+AC_CHECK_PROG_VER(AS, $AS, --version,
+ [GNU assembler.* \([0-9]*\.[0-9.]*\)],
+ [2.1[0-9][0-9]*|2.2[1-9]*|2.[3-9][0-9]*|[3-9].*|[1-9][0-9]*], libc_cv_arm_as_version_ok=no)
+
+if test $libc_cv_arm_as_version_ok != yes; then
+ AC_MSG_ERROR([as version too old, at least 2.21 is required])
+fi
diff --git a/sysdeps/arm/armv7/multiarch/Makefile b/sysdeps/arm/armv7/multiarch/Makefile
new file mode 100644
index 0000000000..e834cc937f
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/Makefile
@@ -0,0 +1,3 @@
+ifeq ($(subdir),string)
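+# memcpy.S in this directory already overrides the generic memcpy via the
+# normal sysdeps search; only the extra IFUNC variants need listing here.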
+sysdep_routines += memcpy_neon memcpy_vfp
+endif
diff --git a/sysdeps/arm/armv7/multiarch/aeabi_memcpy.c b/sysdeps/arm/armv7/multiarch/aeabi_memcpy.c
new file mode 100644
index 0000000000..c6a2a98a55
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/aeabi_memcpy.c
@@ -0,0 +1,2 @@
+/* Empty file to override sysdeps/arm version. See memcpy.S for definitions
+ of these functions. */
diff --git a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
new file mode 100644
index 0000000000..2515418eda
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
@@ -0,0 +1,56 @@
+/* Enumerate available IFUNC implementations of a function. ARM version.
+ Copyright (C) 2013-2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <stdbool.h>
+#include <string.h>
+#include <ldsodefs.h>
+#include <sysdep.h>
+#include <ifunc-impl-list.h>
+
+/* Fill ARRAY of MAX elements with IFUNC implementations for function
+ NAME and return the number of valid entries. */
+
+size_t
+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ size_t max)
+{
+ size_t i = 0;
+
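+  /* When glibc itself is built with -mfpu=neon (__ARM_NEON__),
+     memcpy_neon.S defines plain memcpy and no separate __memcpy_neon or
+     __memcpy_vfp symbols exist, so the list below only names symbols
+     that are actually present in that configuration.  */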
+ bool use_neon = true;
+#ifdef __ARM_NEON__
+# define __memcpy_neon memcpy
+#else
+ use_neon = (GLRO(dl_hwcap) & HWCAP_ARM_NEON) != 0;
+#endif
+
+#ifndef __ARM_NEON__
+ bool use_vfp = true;
+# ifdef __SOFTFP__
+ use_vfp = (GLRO(dl_hwcap) & HWCAP_ARM_VFP) != 0;
+# endif
+#endif
+
+ IFUNC_IMPL (i, name, memcpy,
+ IFUNC_IMPL_ADD (array, i, memcpy, use_neon, __memcpy_neon)
+#ifndef __ARM_NEON__
+ IFUNC_IMPL_ADD (array, i, memcpy, use_vfp, __memcpy_vfp)
+#endif
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_arm));
+
+ return i;
+}
diff --git a/sysdeps/arm/armv7/multiarch/memcpy.S b/sysdeps/arm/armv7/multiarch/memcpy.S
new file mode 100644
index 0000000000..c4f4e80fb0
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memcpy.S
@@ -0,0 +1,76 @@
+/* Multiple versions of memcpy.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2013-2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Thumb requires excess IT instructions here. */
+#define NO_THUMB
+#include <sysdep.h>
+#include <rtld-global-offsets.h>
+
+#ifndef NOT_IN_libc
+/* Under __ARM_NEON__, memcpy_neon.S defines the name memcpy. */
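+/* The IFUNC resolver below is entered with GLRO(dl_hwcap) in r0
+   (sysdeps/arm/dl-irel.h passes it to the resolver), so the HWCAP bits
+   can be tested directly to select an implementation.  */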
+# ifndef __ARM_NEON__
+ .text
+ENTRY(memcpy)
+ .type memcpy, %gnu_indirect_function
+# ifdef __SOFTFP__
+ ldr r1, .Lmemcpy_arm
+ tst r0, #HWCAP_ARM_VFP
+ ldrne r1, .Lmemcpy_vfp
+# else
+ ldr r1, .Lmemcpy_vfp
+# endif
+ tst r0, #HWCAP_ARM_NEON
+ ldrne r1, .Lmemcpy_neon
+1:
+ add r0, r1, pc
+ DO_RET(lr)
+
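+/* Each literal below holds an implementation's address as an offset from
+   the 1: label, biased by -PC_OFS; the ADD at 1: adds back the PC value
+   it reads (1b + PC_OFS), reconstructing the absolute address and keeping
+   the resolver position-independent.  */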
+# ifdef __SOFTFP__
+.Lmemcpy_arm:
+ .long C_SYMBOL_NAME(__memcpy_arm) - 1b - PC_OFS
+# endif
+.Lmemcpy_neon:
+ .long C_SYMBOL_NAME(__memcpy_neon) - 1b - PC_OFS
+.Lmemcpy_vfp:
+ .long C_SYMBOL_NAME(__memcpy_vfp) - 1b - PC_OFS
+
+END(memcpy)
+
+libc_hidden_builtin_def (memcpy)
+#endif /* Not __ARM_NEON__. */
+
+/* These versions of memcpy are defined not to clobber any VFP or NEON
+   registers, so they must always call the ARM variant of the memcpy code. */
+strong_alias (__memcpy_arm, __aeabi_memcpy)
+strong_alias (__memcpy_arm, __aeabi_memcpy4)
+strong_alias (__memcpy_arm, __aeabi_memcpy8)
+libc_hidden_def (__memcpy_arm)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+#undef weak_alias
+#define weak_alias(x, y)
+#undef libc_hidden_def
+#define libc_hidden_def(name)
+
+#define memcpy __memcpy_arm
+
+#endif
+
+#include "memcpy_impl.S"
diff --git a/sysdeps/arm/armv7/multiarch/memcpy_impl.S b/sysdeps/arm/armv7/multiarch/memcpy_impl.S
new file mode 100644
index 0000000000..1562416cf6
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memcpy_impl.S
@@ -0,0 +1,917 @@
+/* NEON/VFP/ARM version of memcpy optimized for Cortex-A15.
+ Copyright (C) 2013-2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>.
+
+ This memcpy routine is optimised for Cortex-A15 cores and takes advantage
+ of VFP or NEON when built with the appropriate flags.
+
+ Assumptions:
+
+ ARMv6 (ARMv7-a if using Neon)
+ ARM state
+ Unaligned accesses
+
+ */
+
+/* Thumb cannot encode negative immediate offsets in memory operations. */
+#ifndef NO_THUMB
+#define NO_THUMB
+#endif
+#include <sysdep.h>
+#include <arm-features.h>
+
+ .syntax unified
+ /* This implementation requires ARM state. */
+ .arm
+
+#ifdef MEMCPY_NEON
+
+ .fpu neon
+ .arch armv7-a
+# define FRAME_SIZE 4
+# define USE_VFP
+# define USE_NEON
+
+#elif defined (MEMCPY_VFP)
+
+ .arch armv6
+ .fpu vfpv2
+# define FRAME_SIZE 32
+# define USE_VFP
+
+#else
+ .arch armv6
+# define FRAME_SIZE 32
+
+#endif
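+
+/* Only TMP2 lives in the frame for the NEON build; the VFP and plain ARM
+   builds also spill the B/C/D register pairs at [sp, #8] through
+   [sp, #31] in their copy loops, hence the larger frame.  */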
+
+#define ALIGN(addr, align) addr:align
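+/* ALIGN (reg, n) expands to the "reg:n" alignment-hint form used in the
+   NEON store sequence (neon_store_multi) below; this is presumably the
+   NEON alignment syntax behind the binutils >= 2.21 check in configure.ac.  */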
+
+#define INSN_SIZE 4
+
+/* Call parameters. */
+#define dstin r0
+#define src r1
+#define count r2
+
+/* Locals. */
+#define tmp1 r3
+#define dst ip
+#define tmp2 r8
+
+/* These two macros both work by repeated invocation of the macro
+ dispatch_step (not defined here). That macro performs one "step",
+ doing one load instruction and one store instruction to copy one
+ "unit". On entry, TMP1 contains the number of bytes to be copied,
+ a multiple of the unit size. The macro clobbers TMP1 in the
+ process of doing a computed jump to the tail containing the
+ appropriate number of steps.
+
+ In dispatch_7_dword, dispatch_step is invoked seven times, with an
+ argument that is 7 for the first and 1 for the last. Units are
+ double-words (8 bytes). TMP1 is at most 56.
+
+ In dispatch_15_word, dispatch_step is invoked fifteen times,
+ with an argument that is 15 for the first and 1 for the last.
+ Units are words (4 bytes). TMP1 is at most 60. */
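+
+/* For example, in the non-ARM_ALWAYS_BX form of dispatch_7_dword below,
+   TMP1 == 56 makes the computed jump land on the first dispatch_step so
+   all seven double-words are copied, while TMP1 == 8 lands on the last
+   step.  dispatch_15_word shifts TMP1 left by one because each 4-byte
+   unit is copied by 8 bytes of code (one load plus one store).  */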
+
+#ifndef ARM_ALWAYS_BX
+# if ARM_BX_ALIGN_LOG2 != 2
+# error case not handled
+# endif
+ .macro dispatch_7_dword
+ rsb tmp1, tmp1, #((7 * 8) - PC_OFS + INSN_SIZE)
+ add pc, pc, tmp1
+ dispatch_step 7
+ dispatch_step 6
+ dispatch_step 5
+ dispatch_step 4
+ dispatch_step 3
+ dispatch_step 2
+ dispatch_step 1
+ .purgem dispatch_step
+ .endm
+
+ .macro dispatch_15_word
+ rsb tmp1, tmp1, #((15 * 4) - PC_OFS/2 + INSN_SIZE/2)
+ add pc, pc, tmp1, lsl #1
+ dispatch_step 15
+ dispatch_step 14
+ dispatch_step 13
+ dispatch_step 12
+ dispatch_step 11
+ dispatch_step 10
+ dispatch_step 9
+ dispatch_step 8
+ dispatch_step 7
+ dispatch_step 6
+ dispatch_step 5
+ dispatch_step 4
+ dispatch_step 3
+ dispatch_step 2
+ dispatch_step 1
+ .purgem dispatch_step
+ .endm
+#else
+# if ARM_BX_ALIGN_LOG2 < 3
+# error case not handled
+# endif
+ .macro dispatch_helper steps, log2_bytes_per_step
+ /* TMP1 gets (max_bytes - bytes_to_copy), where max_bytes is
+ (STEPS << LOG2_BYTES_PER_STEP).
+ So this is (steps_to_skip << LOG2_BYTES_PER_STEP).
+ Then it needs further adjustment to compensate for the
+ distance between the PC value taken below (0f + PC_OFS)
+ and the first step's instructions (1f). */
+ rsb tmp1, tmp1, #((\steps << \log2_bytes_per_step) \
+ + ((1f - PC_OFS - 0f) \
+ >> (ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)))
+ /* Shifting down LOG2_BYTES_PER_STEP gives us the number of
+ steps to skip, then shifting up ARM_BX_ALIGN_LOG2 gives us
+ the (byte) distance to add to the PC. */
+0: add tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)
+ bx tmp1
+ .p2align ARM_BX_ALIGN_LOG2
+1:
+ .endm
+
+ .macro dispatch_7_dword
+ dispatch_helper 7, 3
+ .p2align ARM_BX_ALIGN_LOG2
+ dispatch_step 7
+ .p2align ARM_BX_ALIGN_LOG2
+ dispatch_step 6
+ .p2align ARM_BX_ALIGN_LOG2
+ dispatch_step 5
+ .p2align ARM_BX_ALIGN_LOG2
+ dispatch_step 4
+ .p2align ARM_BX_ALIGN_LOG2
+ dispatch_step 3
+ .p2align ARM_BX_ALIGN_LOG2
+ dispatch_step 2
+ .p2align ARM_BX_ALIGN_LOG2
+ dispatch_step 1
+ .p2align ARM_BX_ALIGN_LOG2
+ .purgem dispatch_step
+ .endm
+
+ .macro dispatch_15_word
+ dispatch_helper 15, 2
+ dispatch_step 15
+ .p2align ARM_BX_ALIGN_LOG2
+ dispatch_step 14
+ .p2align ARM_BX_ALIGN_LOG2
+ dispatch_step 13
+ .p2align ARM_BX_ALIGN_LOG2
+ dispatch_step 12
+ .p2align ARM_BX_ALIGN_LOG2
+ dispatch_step 11
+ .p2align ARM_BX_ALIGN_LOG2
+ dispatch_step 10
+ .p2align ARM_BX_ALIGN_LOG2
+ dispatch_step 9
+ .p2align ARM_BX_ALIGN_LOG2
+ dispatch_step 8
+ .p2align ARM_BX_ALIGN_LOG2
+ dispatch_step 7
+ .p2align ARM_BX_ALIGN_LOG2
+ dispatch_step 6
+ .p2align ARM_BX_ALIGN_LOG2
+ dispatch_step 5
+ .p2align ARM_BX_ALIGN_LOG2
+ dispatch_step 4
+ .p2align ARM_BX_ALIGN_LOG2
+ dispatch_step 3
+ .p2align ARM_BX_ALIGN_LOG2
+ dispatch_step 2
+ .p2align ARM_BX_ALIGN_LOG2
+ dispatch_step 1
+ .p2align ARM_BX_ALIGN_LOG2
+ .purgem dispatch_step
+ .endm
+
+#endif
+
+#ifndef USE_NEON
+/* For bulk copies using GP registers. */
+#define A_l r2 /* Call-clobbered. */
+#define A_h r3 /* Call-clobbered. */
+#define B_l r4
+#define B_h r5
+#define C_l r6
+#define C_h r7
+/* Don't use the pair r8,r9 because in some EABI variants r9 is reserved. */
+#define D_l r10
+#define D_h r11
+#endif
+
+/* Number of lines ahead to pre-fetch data. If you change this the code
+ below will need adjustment to compensate. */
+
+#define prefetch_lines 5
+
+#ifdef USE_VFP
+ .macro cpy_line_vfp vreg, base
+ sfi_breg dst, \
+ vstr \vreg, [\B, #\base]
+ sfi_breg src, \
+ vldr \vreg, [\B, #\base]
+ sfi_breg dst, \
+ vstr d0, [\B, #\base + 8]
+ sfi_breg src, \
+ vldr d0, [\B, #\base + 8]
+ sfi_breg dst, \
+ vstr d1, [\B, #\base + 16]
+ sfi_breg src, \
+ vldr d1, [\B, #\base + 16]
+ sfi_breg dst, \
+ vstr d2, [\B, #\base + 24]
+ sfi_breg src, \
+ vldr d2, [\B, #\base + 24]
+ sfi_breg dst, \
+ vstr \vreg, [\B, #\base + 32]
+ sfi_breg src, \
+ vldr \vreg, [\B, #\base + prefetch_lines * 64 - 32]
+ sfi_breg dst, \
+ vstr d0, [\B, #\base + 40]
+ sfi_breg src, \
+ vldr d0, [\B, #\base + 40]
+ sfi_breg dst, \
+ vstr d1, [\B, #\base + 48]
+ sfi_breg src, \
+ vldr d1, [\B, #\base + 48]
+ sfi_breg dst, \
+ vstr d2, [\B, #\base + 56]
+ sfi_breg src, \
+ vldr d2, [\B, #\base + 56]
+ .endm
+
+ .macro cpy_tail_vfp vreg, base
+ sfi_breg dst, \
+ vstr \vreg, [\B, #\base]
+ sfi_breg src, \
+ vldr \vreg, [\B, #\base]
+ sfi_breg dst, \
+ vstr d0, [\B, #\base + 8]
+ sfi_breg src, \
+ vldr d0, [\B, #\base + 8]
+ sfi_breg dst, \
+ vstr d1, [\B, #\base + 16]
+ sfi_breg src, \
+ vldr d1, [\B, #\base + 16]
+ sfi_breg dst, \
+ vstr d2, [\B, #\base + 24]
+ sfi_breg src, \
+ vldr d2, [\B, #\base + 24]
+ sfi_breg dst, \
+ vstr \vreg, [\B, #\base + 32]
+ sfi_breg dst, \
+ vstr d0, [\B, #\base + 40]
+ sfi_breg src, \
+ vldr d0, [\B, #\base + 40]
+ sfi_breg dst, \
+ vstr d1, [\B, #\base + 48]
+ sfi_breg src, \
+ vldr d1, [\B, #\base + 48]
+ sfi_breg dst, \
+ vstr d2, [\B, #\base + 56]
+ sfi_breg src, \
+ vldr d2, [\B, #\base + 56]
+ .endm
+#endif
+
+ .p2align 6
+ENTRY(memcpy)
+
+ mov dst, dstin /* Preserve dstin, we need to return it. */
+ cmp count, #64
+ bge .Lcpy_not_short
+ /* Deal with small copies quickly by dropping straight into the
+ exit block. */
+
+.Ltail63unaligned:
+#ifdef USE_NEON
+ /* These need an extra layer of macro just to work around a
+ bug in the assembler's parser when an operand starts with
+ a {...}. http://sourceware.org/bugzilla/show_bug.cgi?id=15647
+ tracks that bug; it was not fixed as of binutils-2.23.2. */
+ .macro neon_load_d0 reg
+ vld1.8 {d0}, [\reg]!
+ .endm
+ .macro neon_store_d0 reg
+ vst1.8 {d0}, [\reg]!
+ .endm
+
+ /* These are used by the NaCl sfi_breg macro. */
+ .macro _sfi_breg_dmask_neon_load_d0 reg
+ _sfi_dmask \reg
+ .endm
+ .macro _sfi_breg_dmask_neon_store_d0 reg
+ _sfi_dmask \reg
+ .endm
+
+ and tmp1, count, #0x38
+ .macro dispatch_step i
+ sfi_breg src, neon_load_d0 \B
+ sfi_breg dst, neon_store_d0 \B
+ .endm
+ dispatch_7_dword
+
+ tst count, #4
+ sfi_breg src, \
+ ldrne tmp1, [\B], #4
+ sfi_breg dst, \
+ strne tmp1, [\B], #4
+#else
+ /* Copy up to 15 full words of data. May not be aligned. */
+ /* Cannot use VFP for unaligned data. */
+ and tmp1, count, #0x3c
+ add dst, dst, tmp1
+ add src, src, tmp1
+ /* Jump directly into the sequence below at the correct offset. */
+ .macro dispatch_step i
+ sfi_breg src, \
+ ldr tmp1, [\B, #-(\i * 4)]
+ sfi_breg dst, \
+ str tmp1, [\B, #-(\i * 4)]
+ .endm
+ dispatch_15_word
+#endif
+
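+	/* COUNT << 31 puts bit 1 of COUNT into the carry flag and leaves Z
+	   clear iff bit 0 is set, so CS copies a trailing halfword and NE a
+	   trailing byte.  */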
+ lsls count, count, #31
+ sfi_breg src, \
+ ldrhcs tmp1, [\B], #2
+ sfi_breg src, \
+ ldrbne src, [\B] /* Src is dead, use as a scratch. */
+ sfi_breg dst, \
+ strhcs tmp1, [\B], #2
+ sfi_breg dst, \
+ strbne src, [\B]
+ bx lr
+
+.Lcpy_not_short:
+ /* At least 64 bytes to copy, but don't know the alignment yet. */
+ str tmp2, [sp, #-FRAME_SIZE]!
+ cfi_adjust_cfa_offset (FRAME_SIZE)
+ cfi_rel_offset (tmp2, 0)
+ cfi_remember_state
+ and tmp2, src, #7
+ and tmp1, dst, #7
+ cmp tmp1, tmp2
+ bne .Lcpy_notaligned
+
+#ifdef USE_VFP
+ /* Magic dust alert! Force VFP on Cortex-A9. Experiments show
+ that the FP pipeline is much better at streaming loads and
+ stores. This is outside the critical loop. */
+ vmov.f32 s0, s0
+#endif
+
+ /* SRC and DST have the same mutual 64-bit alignment, but we may
+ still need to pre-copy some bytes to get to natural alignment.
+ We bring SRC and DST into full 64-bit alignment. */
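+	/* The LSLS below moves the low three bits of DST to the top of TMP2:
+	   Z is set when DST is already 8-byte aligned; after the RSBS, MI
+	   means at least a word must be copied, and the further LSLS #2
+	   leaves CS selecting the halfword copy and NE the byte copy.  */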
+ lsls tmp2, dst, #29
+ beq 1f
+ rsbs tmp2, tmp2, #0
+ sub count, count, tmp2, lsr #29
+ sfi_breg src, \
+ ldrmi tmp1, [\B], #4
+ sfi_breg dst, \
+ strmi tmp1, [\B], #4
+ lsls tmp2, tmp2, #2
+ sfi_breg src, \
+ ldrhcs tmp1, [\B], #2
+ sfi_breg src, \
+ ldrbne tmp2, [\B], #1
+ sfi_breg dst, \
+ strhcs tmp1, [\B], #2
+ sfi_breg dst, \
+ strbne tmp2, [\B], #1
+
+1:
+ subs tmp2, count, #64 /* Use tmp2 for count. */
+ blt .Ltail63aligned
+
+ cmp tmp2, #512
+ bge .Lcpy_body_long
+
+.Lcpy_body_medium: /* Count in tmp2. */
+#ifdef USE_VFP
+1:
+ sfi_breg src, \
+ vldr d0, [\B, #0]
+ subs tmp2, tmp2, #64
+ sfi_breg src, \
+ vldr d1, [\B, #8]
+ sfi_breg dst, \
+ vstr d0, [\B, #0]
+ sfi_breg src, \
+ vldr d0, [\B, #16]
+ sfi_breg dst, \
+ vstr d1, [\B, #8]
+ sfi_breg src, \
+ vldr d1, [\B, #24]
+ sfi_breg dst, \
+ vstr d0, [\B, #16]
+ sfi_breg src, \
+ vldr d0, [\B, #32]
+ sfi_breg dst, \
+ vstr d1, [\B, #24]
+ sfi_breg src, \
+ vldr d1, [\B, #40]
+ sfi_breg dst, \
+ vstr d0, [\B, #32]
+ sfi_breg src, \
+ vldr d0, [\B, #48]
+ sfi_breg dst, \
+ vstr d1, [\B, #40]
+ sfi_breg src, \
+ vldr d1, [\B, #56]
+ sfi_breg dst, \
+ vstr d0, [\B, #48]
+ add src, src, #64
+ sfi_breg dst, \
+ vstr d1, [\B, #56]
+ add dst, dst, #64
+ bge 1b
+ tst tmp2, #0x3f
+ beq .Ldone
+
+.Ltail63aligned: /* Count in tmp2. */
+ and tmp1, tmp2, #0x38
+ add dst, dst, tmp1
+ add src, src, tmp1
+ .macro dispatch_step i
+ sfi_breg src, \
+ vldr d0, [\B, #-(\i * 8)]
+ sfi_breg dst, \
+ vstr d0, [\B, #-(\i * 8)]
+ .endm
+ dispatch_7_dword
+#else
+ sub src, src, #8
+ sub dst, dst, #8
+1:
+ sfi_breg src, \
+ ldrd A_l, A_h, [\B, #8]
+ sfi_breg dst, \
+ strd A_l, A_h, [\B, #8]
+ sfi_breg src, \
+ ldrd A_l, A_h, [\B, #16]
+ sfi_breg dst, \
+ strd A_l, A_h, [\B, #16]
+ sfi_breg src, \
+ ldrd A_l, A_h, [\B, #24]
+ sfi_breg dst, \
+ strd A_l, A_h, [\B, #24]
+ sfi_breg src, \
+ ldrd A_l, A_h, [\B, #32]
+ sfi_breg dst, \
+ strd A_l, A_h, [\B, #32]
+ sfi_breg src, \
+ ldrd A_l, A_h, [\B, #40]
+ sfi_breg dst, \
+ strd A_l, A_h, [\B, #40]
+ sfi_breg src, \
+ ldrd A_l, A_h, [\B, #48]
+ sfi_breg dst, \
+ strd A_l, A_h, [\B, #48]
+ sfi_breg src, \
+ ldrd A_l, A_h, [\B, #56]
+ sfi_breg dst, \
+ strd A_l, A_h, [\B, #56]
+ sfi_breg src, \
+ ldrd A_l, A_h, [\B, #64]!
+ sfi_breg dst, \
+ strd A_l, A_h, [\B, #64]!
+ subs tmp2, tmp2, #64
+ bge 1b
+ tst tmp2, #0x3f
+ bne 1f
+ ldr tmp2,[sp], #FRAME_SIZE
+ cfi_adjust_cfa_offset (-FRAME_SIZE)
+ cfi_restore (tmp2)
+ bx lr
+
+ cfi_restore_state
+ cfi_remember_state
+1:
+ add src, src, #8
+ add dst, dst, #8
+
+.Ltail63aligned: /* Count in tmp2. */
+ /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
+ we know that the src and dest are 64-bit aligned so we can use
+ LDRD/STRD to improve efficiency. */
+ /* TMP2 is now negative, but we don't care about that. The bottom
+ six bits still tell us how many bytes are left to copy. */
+
+ and tmp1, tmp2, #0x38
+ add dst, dst, tmp1
+ add src, src, tmp1
+ .macro dispatch_step i
+ sfi_breg src, \
+ ldrd A_l, A_h, [\B, #-(\i * 8)]
+ sfi_breg dst, \
+ strd A_l, A_h, [\B, #-(\i * 8)]
+ .endm
+ dispatch_7_dword
+#endif
+
+ tst tmp2, #4
+ sfi_breg src, \
+ ldrne tmp1, [\B], #4
+ sfi_breg dst, \
+ strne tmp1, [\B], #4
+ lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */
+ sfi_breg src, \
+ ldrhcs tmp1, [\B], #2
+ sfi_breg src, \
+ ldrbne tmp2, [\B]
+ sfi_breg dst, \
+ strhcs tmp1, [\B], #2
+ sfi_breg dst, \
+ strbne tmp2, [\B]
+
+.Ldone:
+ ldr tmp2, [sp], #FRAME_SIZE
+ cfi_adjust_cfa_offset (-FRAME_SIZE)
+ cfi_restore (tmp2)
+ bx lr
+
+ cfi_restore_state
+ cfi_remember_state
+
+.Lcpy_body_long: /* Count in tmp2. */
+
+	/* Long copy.  We know that there are at least (prefetch_lines * 64)
+	   bytes to go.  */
+#ifdef USE_VFP
+ /* Don't use PLD. Instead, read some data in advance of the current
+ copy position into a register. This should act like a PLD
+ operation but we won't have to repeat the transfer. */
+
+ sfi_breg src, \
+ vldr d3, [\B, #0]
+ sfi_breg src, \
+ vldr d4, [\B, #64]
+ sfi_breg src, \
+ vldr d5, [\B, #128]
+ sfi_breg src, \
+ vldr d6, [\B, #192]
+ sfi_breg src, \
+ vldr d7, [\B, #256]
+
+ sfi_breg src, \
+ vldr d0, [\B, #8]
+ sfi_breg src, \
+ vldr d1, [\B, #16]
+ sfi_breg src, \
+ vldr d2, [\B, #24]
+ add src, src, #32
+
+ subs tmp2, tmp2, #prefetch_lines * 64 * 2
+ blt 2f
+1:
+ cpy_line_vfp d3, 0
+ cpy_line_vfp d4, 64
+ cpy_line_vfp d5, 128
+ add dst, dst, #3 * 64
+ add src, src, #3 * 64
+ cpy_line_vfp d6, 0
+ cpy_line_vfp d7, 64
+ add dst, dst, #2 * 64
+ add src, src, #2 * 64
+ subs tmp2, tmp2, #prefetch_lines * 64
+ bge 1b
+
+2:
+ cpy_tail_vfp d3, 0
+ cpy_tail_vfp d4, 64
+ cpy_tail_vfp d5, 128
+ add src, src, #3 * 64
+ add dst, dst, #3 * 64
+ cpy_tail_vfp d6, 0
+ sfi_breg dst, \
+ vstr d7, [\B, #64]
+ sfi_breg src, \
+ vldr d7, [\B, #64]
+ sfi_breg dst, \
+ vstr d0, [\B, #64 + 8]
+ sfi_breg src, \
+ vldr d0, [\B, #64 + 8]
+ sfi_breg dst, \
+ vstr d1, [\B, #64 + 16]
+ sfi_breg src, \
+ vldr d1, [\B, #64 + 16]
+ sfi_breg dst, \
+ vstr d2, [\B, #64 + 24]
+ sfi_breg src, \
+ vldr d2, [\B, #64 + 24]
+ sfi_breg dst, \
+ vstr d7, [\B, #64 + 32]
+ add src, src, #96
+ sfi_breg dst, \
+ vstr d0, [\B, #64 + 40]
+ sfi_breg dst, \
+ vstr d1, [\B, #64 + 48]
+ sfi_breg dst, \
+ vstr d2, [\B, #64 + 56]
+ add dst, dst, #128
+ add tmp2, tmp2, #prefetch_lines * 64
+ b .Lcpy_body_medium
+#else
+ /* Long copy. Use an SMS style loop to maximize the I/O
+ bandwidth of the core. We don't have enough spare registers
+ to synthesise prefetching, so use PLD operations. */
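+	/* "SMS" here means software (modulo) pipelining: each trip through
+	   the loop below stores the block loaded on the previous trip while
+	   loading the next one, so loads and stores overlap.  */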
+ /* Pre-bias src and dst. */
+ sub src, src, #8
+ sub dst, dst, #8
+ sfi_pld src, #8
+ sfi_pld src, #72
+ subs tmp2, tmp2, #64
+ sfi_pld src, #136
+ sfi_breg src, \
+ ldrd A_l, A_h, [\B, #8]
+ strd B_l, B_h, [sp, #8]
+ cfi_rel_offset (B_l, 8)
+ cfi_rel_offset (B_h, 12)
+ sfi_breg src, \
+ ldrd B_l, B_h, [\B, #16]
+ strd C_l, C_h, [sp, #16]
+ cfi_rel_offset (C_l, 16)
+ cfi_rel_offset (C_h, 20)
+ sfi_breg src, \
+ ldrd C_l, C_h, [\B, #24]
+ strd D_l, D_h, [sp, #24]
+ cfi_rel_offset (D_l, 24)
+ cfi_rel_offset (D_h, 28)
+ sfi_pld src, #200
+ sfi_breg src, \
+ ldrd D_l, D_h, [\B, #32]!
+ b 1f
+ .p2align 6
+2:
+ sfi_pld src, #232
+ sfi_breg dst, \
+ strd A_l, A_h, [\B, #40]
+ sfi_breg src, \
+ ldrd A_l, A_h, [\B, #40]
+ sfi_breg dst, \
+ strd B_l, B_h, [\B, #48]
+ sfi_breg src, \
+ ldrd B_l, B_h, [\B, #48]
+ sfi_breg dst, \
+ strd C_l, C_h, [\B, #56]
+ sfi_breg src, \
+ ldrd C_l, C_h, [\B, #56]
+ sfi_breg dst, \
+ strd D_l, D_h, [\B, #64]!
+ sfi_breg src, \
+ ldrd D_l, D_h, [\B, #64]!
+ subs tmp2, tmp2, #64
+1:
+ sfi_breg dst, \
+ strd A_l, A_h, [\B, #8]
+ sfi_breg src, \
+ ldrd A_l, A_h, [\B, #8]
+ sfi_breg dst, \
+ strd B_l, B_h, [\B, #16]
+ sfi_breg src, \
+ ldrd B_l, B_h, [\B, #16]
+ sfi_breg dst, \
+ strd C_l, C_h, [\B, #24]
+ sfi_breg src, \
+ ldrd C_l, C_h, [\B, #24]
+ sfi_breg dst, \
+ strd D_l, D_h, [\B, #32]
+ sfi_breg src, \
+ ldrd D_l, D_h, [\B, #32]
+ bcs 2b
+ /* Save the remaining bytes and restore the callee-saved regs. */
+ sfi_breg dst, \
+ strd A_l, A_h, [\B, #40]
+ add src, src, #40
+ sfi_breg dst, \
+ strd B_l, B_h, [\B, #48]
+ ldrd B_l, B_h, [sp, #8]
+ cfi_restore (B_l)
+ cfi_restore (B_h)
+ sfi_breg dst, \
+ strd C_l, C_h, [\B, #56]
+ ldrd C_l, C_h, [sp, #16]
+ cfi_restore (C_l)
+ cfi_restore (C_h)
+ sfi_breg dst, \
+ strd D_l, D_h, [\B, #64]
+ ldrd D_l, D_h, [sp, #24]
+ cfi_restore (D_l)
+ cfi_restore (D_h)
+ add dst, dst, #72
+ tst tmp2, #0x3f
+ bne .Ltail63aligned
+ ldr tmp2, [sp], #FRAME_SIZE
+ cfi_adjust_cfa_offset (-FRAME_SIZE)
+ cfi_restore (tmp2)
+ bx lr
+#endif
+
+ cfi_restore_state
+ cfi_remember_state
+
+.Lcpy_notaligned:
+ sfi_pld src
+ sfi_pld src, #64
+	/* There are at least 64 bytes to copy, but there is no mutual
+	   alignment.  */
+ /* Bring DST to 64-bit alignment. */
+ lsls tmp2, dst, #29
+ sfi_pld src, #(2 * 64)
+ beq 1f
+ rsbs tmp2, tmp2, #0
+ sub count, count, tmp2, lsr #29
+ sfi_breg src, \
+ ldrmi tmp1, [\B], #4
+ sfi_breg dst, \
+ strmi tmp1, [\B], #4
+ lsls tmp2, tmp2, #2
+ sfi_breg src, \
+ ldrbne tmp1, [\B], #1
+ sfi_breg src, \
+ ldrhcs tmp2, [\B], #2
+ sfi_breg dst, \
+ strbne tmp1, [\B], #1
+ sfi_breg dst, \
+ strhcs tmp2, [\B], #2
+1:
+ sfi_pld src, #(3 * 64)
+ subs count, count, #64
+ ldrmi tmp2, [sp], #FRAME_SIZE
+ bmi .Ltail63unaligned
+ sfi_pld src, #(4 * 64)
+
+#ifdef USE_NEON
+ /* These need an extra layer of macro just to work around a
+ bug in the assembler's parser when an operand starts with
+ a {...}. */
+ .macro neon_load_multi reglist, basereg
+ vld1.8 {\reglist}, [\basereg]!
+ .endm
+ .macro neon_store_multi reglist, basereg
+ vst1.8 {\reglist}, [ALIGN (\basereg, 64)]!
+ .endm
+
+ /* These are used by the NaCl sfi_breg macro. */
+ .macro _sfi_breg_dmask_neon_load_multi reg
+ _sfi_dmask \reg
+ .endm
+ .macro _sfi_breg_dmask_neon_store_multi reg
+ _sfi_dmask \reg
+ .endm
+
+ sfi_breg src, neon_load_multi d0-d3, \B
+ sfi_breg src, neon_load_multi d4-d7, \B
+ subs count, count, #64
+ bmi 2f
+1:
+ sfi_pld src, #(4 * 64)
+ sfi_breg dst, neon_store_multi d0-d3, \B
+ sfi_breg src, neon_load_multi d0-d3, \B
+ sfi_breg dst, neon_store_multi d4-d7, \B
+ sfi_breg src, neon_load_multi d4-d7, \B
+ subs count, count, #64
+ bpl 1b
+2:
+ sfi_breg dst, neon_store_multi d0-d3, \B
+ sfi_breg dst, neon_store_multi d4-d7, \B
+ ands count, count, #0x3f
+#else
+ /* Use an SMS style loop to maximize the I/O bandwidth. */
+ sub src, src, #4
+ sub dst, dst, #8
+ subs tmp2, count, #64 /* Use tmp2 for count. */
+ sfi_breg src, \
+ ldr A_l, [\B, #4]
+ sfi_breg src, \
+ ldr A_h, [\B, #8]
+ strd B_l, B_h, [sp, #8]
+ cfi_rel_offset (B_l, 8)
+ cfi_rel_offset (B_h, 12)
+ sfi_breg src, \
+ ldr B_l, [\B, #12]
+ sfi_breg src, \
+ ldr B_h, [\B, #16]
+ strd C_l, C_h, [sp, #16]
+ cfi_rel_offset (C_l, 16)
+ cfi_rel_offset (C_h, 20)
+ sfi_breg src, \
+ ldr C_l, [\B, #20]
+ sfi_breg src, \
+ ldr C_h, [\B, #24]
+ strd D_l, D_h, [sp, #24]
+ cfi_rel_offset (D_l, 24)
+ cfi_rel_offset (D_h, 28)
+ sfi_breg src, \
+ ldr D_l, [\B, #28]
+ sfi_breg src, \
+ ldr D_h, [\B, #32]!
+ b 1f
+ .p2align 6
+2:
+ sfi_pld src, #(5 * 64) - (32 - 4)
+ sfi_breg dst, \
+ strd A_l, A_h, [\B, #40]
+ sfi_breg src, \
+ ldr A_l, [\B, #36]
+ sfi_breg src, \
+ ldr A_h, [\B, #40]
+ sfi_breg dst, \
+ strd B_l, B_h, [\B, #48]
+ sfi_breg src, \
+ ldr B_l, [\B, #44]
+ sfi_breg src, \
+ ldr B_h, [\B, #48]
+ sfi_breg dst, \
+ strd C_l, C_h, [\B, #56]
+ sfi_breg src, \
+ ldr C_l, [\B, #52]
+ sfi_breg src, \
+ ldr C_h, [\B, #56]
+ sfi_breg dst, \
+ strd D_l, D_h, [\B, #64]!
+ sfi_breg src, \
+ ldr D_l, [\B, #60]
+ sfi_breg src, \
+ ldr D_h, [\B, #64]!
+ subs tmp2, tmp2, #64
+1:
+ sfi_breg dst, \
+ strd A_l, A_h, [\B, #8]
+ sfi_breg src, \
+ ldr A_l, [\B, #4]
+ sfi_breg src, \
+ ldr A_h, [\B, #8]
+ sfi_breg dst, \
+ strd B_l, B_h, [\B, #16]
+ sfi_breg src, \
+ ldr B_l, [\B, #12]
+ sfi_breg src, \
+ ldr B_h, [\B, #16]
+ sfi_breg dst, \
+ strd C_l, C_h, [\B, #24]
+ sfi_breg src, \
+ ldr C_l, [\B, #20]
+ sfi_breg src, \
+ ldr C_h, [\B, #24]
+ sfi_breg dst, \
+ strd D_l, D_h, [\B, #32]
+ sfi_breg src, \
+ ldr D_l, [\B, #28]
+ sfi_breg src, \
+ ldr D_h, [\B, #32]
+ bcs 2b
+
+ /* Save the remaining bytes and restore the callee-saved regs. */
+ sfi_breg dst, \
+ strd A_l, A_h, [\B, #40]
+ add src, src, #36
+ sfi_breg dst, \
+ strd B_l, B_h, [\B, #48]
+ ldrd B_l, B_h, [sp, #8]
+ cfi_restore (B_l)
+ cfi_restore (B_h)
+ sfi_breg dst, \
+ strd C_l, C_h, [\B, #56]
+ ldrd C_l, C_h, [sp, #16]
+ cfi_restore (C_l)
+ cfi_restore (C_h)
+ sfi_breg dst, \
+ strd D_l, D_h, [\B, #64]
+ ldrd D_l, D_h, [sp, #24]
+ cfi_restore (D_l)
+ cfi_restore (D_h)
+ add dst, dst, #72
+ ands count, tmp2, #0x3f
+#endif
+ ldr tmp2, [sp], #FRAME_SIZE
+ cfi_adjust_cfa_offset (-FRAME_SIZE)
+ cfi_restore (tmp2)
+ bne .Ltail63unaligned
+ bx lr
+
+END(memcpy)
+libc_hidden_builtin_def (memcpy)
diff --git a/sysdeps/arm/armv7/multiarch/memcpy_neon.S b/sysdeps/arm/armv7/multiarch/memcpy_neon.S
new file mode 100644
index 0000000000..e60d1cc0e1
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memcpy_neon.S
@@ -0,0 +1,9 @@
+#ifdef __ARM_NEON__
+/* Under __ARM_NEON__, this file defines memcpy directly. */
+libc_hidden_builtin_def (memcpy)
+#else
+# define memcpy __memcpy_neon
+#endif
+
+#define MEMCPY_NEON
+#include "memcpy_impl.S"
diff --git a/sysdeps/arm/armv7/multiarch/memcpy_vfp.S b/sysdeps/arm/armv7/multiarch/memcpy_vfp.S
new file mode 100644
index 0000000000..e008c041ed
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memcpy_vfp.S
@@ -0,0 +1,7 @@
+/* Under __ARM_NEON__, memcpy_neon.S defines memcpy directly
+ and the __memcpy_vfp code will never be used. */
+#ifndef __ARM_NEON__
+# define MEMCPY_VFP
+# define memcpy __memcpy_vfp
+# include "memcpy_impl.S"
+#endif