diff options
author | Chris Metcalf <cmetcalf@tilera.com> | 2014-02-10 10:54:47 -0500 |
---|---|---|
committer | Chris Metcalf <cmetcalf@tilera.com> | 2014-02-10 11:04:39 -0500 |
commit | 4372980f5881e7d537a52e3c49588ce116088061 (patch) | |
tree | 138fbc88313c8cb0cabee393e0c08afd67dac6f1 /sysdeps/tile/tilegx/memcpy.c | |
parent | a1ffb40e32741f992c743e7b16c061fefa3747ac (diff) | |
download | glibc-4372980f5881e7d537a52e3c49588ce116088061.tar.gz |
Move tilegx, tilepro, and linux-generic from ports to libc.
I've moved the TILE-Gx and TILEPro ports to the main sysdeps hierarchy,
along with the linux-generic ports infrastructure. Beyond the README
update, the move was just
git mv ports/sysdeps/tile sysdeps/tile
git mv ports/sysdeps/unix/sysv/linux/tile \
sysdeps/unix/sysv/linux/tile
git mv ports/sysdeps/unix/sysv/linux/generic \
sysdeps/unix/sysv/linux/generic
I updated the relevant ChangeLogs along the lines of the ARM move
in commit c6bfe5c4d75 and tested the 64-bit tilegx build to confirm that
there were no changes in "objdump -dr" output in the shared objects.
Diffstat (limited to 'sysdeps/tile/tilegx/memcpy.c')
-rw-r--r-- | sysdeps/tile/tilegx/memcpy.c | 272 |
1 files changed, 272 insertions, 0 deletions
diff --git a/sysdeps/tile/tilegx/memcpy.c b/sysdeps/tile/tilegx/memcpy.c new file mode 100644 index 0000000000..5d5df19ef3 --- /dev/null +++ b/sysdeps/tile/tilegx/memcpy.c @@ -0,0 +1,272 @@ +/* Copyright (C) 2011-2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Chris Metcalf <cmetcalf@tilera.com>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#include <string.h> +#include <stdint.h> +#include <stdlib.h> +#include <memcopy.h> +#include <arch/chip.h> + +/* How many cache lines ahead should we prefetch? */ +#define PREFETCH_LINES_AHEAD 3 + +void * +__memcpy (void *__restrict dstv, const void *__restrict srcv, size_t n) +{ + char *__restrict dst1 = (char *) dstv; + const char *__restrict src1 = (const char *) srcv; + const char *__restrict src1_end; + const char *__restrict prefetch; + op_t *__restrict dst8; /* 8-byte pointer to destination memory. */ + op_t final; /* Final bytes to write to trailing word, if any */ + long i; + + if (n < 16) + { + for (; n; n--) + *dst1++ = *src1++; + return dstv; + } + + /* Locate the end of source memory we will copy. Don't prefetch + past this. */ + src1_end = src1 + n - 1; + + /* Prefetch ahead a few cache lines, but not past the end. */ + prefetch = src1; + for (i = 0; i < PREFETCH_LINES_AHEAD; i++) + { + __insn_prefetch (prefetch); + prefetch += CHIP_L2_LINE_SIZE (); + prefetch = (prefetch < src1_end) ? prefetch : src1; + } + + /* Copy bytes until dst is word-aligned. */ + for (; (uintptr_t) dst1 & (sizeof (op_t) - 1); n--) + *dst1++ = *src1++; + + /* 8-byte pointer to destination memory. */ + dst8 = (op_t *) dst1; + + if (__builtin_expect ((uintptr_t) src1 & (sizeof (op_t) - 1), 0)) + { + /* Misaligned copy. Use glibc's _wordcopy_fwd_dest_aligned, but + inline it to avoid prologue/epilogue. TODO: Consider + prefetching and using wh64 as well. */ + void * srci; + op_t a0, a1, a2, a3; + long int dstp = (long int) dst1; + long int srcp = (long int) src1; + long int len = n / OPSIZ; + + /* Save the initial source pointer so we know the number of + bytes to shift for merging two unaligned results. */ + srci = (void *) srcp; + + /* Make SRCP aligned by rounding it down to the beginning of the + `op_t' it points in the middle of. */ + srcp &= -OPSIZ; + + switch (len % 4) + { + case 2: + a1 = ((op_t *) srcp)[0]; + a2 = ((op_t *) srcp)[1]; + len += 2; + srcp += 2 * OPSIZ; + goto do1; + case 3: + a0 = ((op_t *) srcp)[0]; + a1 = ((op_t *) srcp)[1]; + len += 1; + srcp += 2 * OPSIZ; + goto do2; + case 0: + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + return dstv; + a3 = ((op_t *) srcp)[0]; + a0 = ((op_t *) srcp)[1]; + len += 0; + srcp += 2 * OPSIZ; + goto do3; + case 1: + a2 = ((op_t *) srcp)[0]; + a3 = ((op_t *) srcp)[1]; + srcp += 2 * OPSIZ; + len -= 1; + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + goto do0; + goto do4; /* No-op. */ + } + + do + { + do4: + a0 = ((op_t *) srcp)[0]; + a2 = __insn_dblalign (a2, a3, srci); + ((op_t *) dstp)[0] = a2; + srcp += OPSIZ; + dstp += OPSIZ; + do3: + a1 = ((op_t *) srcp)[0]; + a3 = __insn_dblalign (a3, a0, srci); + ((op_t *) dstp)[0] = a3; + srcp += OPSIZ; + dstp += OPSIZ; + do2: + a2 = ((op_t *) srcp)[0]; + a0 = __insn_dblalign (a0, a1, srci); + ((op_t *) dstp)[0] = a0; + srcp += OPSIZ; + dstp += OPSIZ; + do1: + a3 = ((op_t *) srcp)[0]; + a1 = __insn_dblalign (a1, a2, srci); + ((op_t *) dstp)[0] = a1; + srcp += OPSIZ; + dstp += OPSIZ; + len -= 4; + } + while (len != 0); + + /* This is the right position for do0. Please don't move + it into the loop. */ + do0: + ((op_t *) dstp)[0] = __insn_dblalign (a2, a3, srci); + + n = n % OPSIZ; + if (n == 0) + return dstv; + + a0 = ((const char *) srcp <= src1_end) ? ((op_t *) srcp)[0] : 0; + + final = __insn_dblalign (a3, a0, srci); + dst8 = (op_t *)(dstp + OPSIZ); + } + else + { + /* Aligned copy. */ + + const op_t *__restrict src8 = (const op_t *) src1; + + /* src8 and dst8 are both word-aligned. */ + if (n >= CHIP_L2_LINE_SIZE ()) + { + /* Copy until 'dst' is cache-line-aligned. */ + for (; (uintptr_t) dst8 & (CHIP_L2_LINE_SIZE () - 1); + n -= sizeof (op_t)) + *dst8++ = *src8++; + + for (; n >= CHIP_L2_LINE_SIZE ();) + { + op_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + /* Prefetch and advance to next line to prefetch, but + don't go past the end. */ + __insn_prefetch (prefetch); + prefetch += CHIP_L2_LINE_SIZE (); + prefetch = (prefetch < src1_end) ? prefetch : + (const char *) src8; + + /* Do all the loads before wh64. This is necessary if + [src8, src8+7] and [dst8, dst8+7] share the same + cache line and dst8 <= src8, as can be the case when + called from memmove, or with code tested on x86 whose + memcpy always works with forward copies. */ + tmp0 = *src8++; + tmp1 = *src8++; + tmp2 = *src8++; + tmp3 = *src8++; + tmp4 = *src8++; + tmp5 = *src8++; + tmp6 = *src8++; + tmp7 = *src8++; + + __insn_wh64 (dst8); + + *dst8++ = tmp0; + *dst8++ = tmp1; + *dst8++ = tmp2; + *dst8++ = tmp3; + *dst8++ = tmp4; + *dst8++ = tmp5; + *dst8++ = tmp6; + *dst8++ = tmp7; + + n -= 64; + } +#if CHIP_L2_LINE_SIZE() != 64 +# error "Fix code that assumes particular L2 cache line size." +#endif + } + + for (; n >= sizeof (op_t); n -= sizeof (op_t)) + *dst8++ = *src8++; + + if (__builtin_expect (n == 0, 1)) + return dstv; + + final = *src8; + } + + /* n != 0 if we get here. Write out any trailing bytes. */ + dst1 = (char *) dst8; +#ifndef __BIG_ENDIAN__ + if (n & 4) + { + *(uint32_t *) dst1 = final; + dst1 += 4; + final >>= 32; + n &= 3; + } + if (n & 2) + { + *(uint16_t *) dst1 = final; + dst1 += 2; + final >>= 16; + n &= 1; + } + if (n) + *(uint8_t *) dst1 = final; +#else + if (n & 4) + { + *(uint32_t *) dst1 = final >> 32; + dst1 += 4; + } + else + { + final >>= 32; + } + if (n & 2) + { + *(uint16_t *) dst1 = final >> 16; + dst1 += 2; + } + else + { + final >>= 16; + } + if (n & 1) + *(uint8_t *) dst1 = final >> 8; +#endif + + return dstv; +} +weak_alias (__memcpy, memcpy) +libc_hidden_builtin_def (memcpy) |