summaryrefslogtreecommitdiff
path: root/sysdeps/powerpc/powerpc64/cell/memcpy.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/powerpc/powerpc64/cell/memcpy.S')
-rw-r--r--sysdeps/powerpc/powerpc64/cell/memcpy.S246
1 files changed, 0 insertions, 246 deletions
diff --git a/sysdeps/powerpc/powerpc64/cell/memcpy.S b/sysdeps/powerpc/powerpc64/cell/memcpy.S
deleted file mode 100644
index 1cc66456e3..0000000000
--- a/sysdeps/powerpc/powerpc64/cell/memcpy.S
+++ /dev/null
@@ -1,246 +0,0 @@
-/* Optimized memcpy implementation for CELL BE PowerPC.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#ifndef MEMCPY /* keep a MEMCPY name that was defined before this point */
-# define MEMCPY memcpy
-#endif
-
-#define PREFETCH_AHEAD 6 /* number of SRC cache lines to prefetch ahead */
-#define ZERO_AHEAD 4 /* number of DST cache lines to zero (dcbz) ahead */
-
-/* memcpy routine optimized for CELL-BE-PPC v2.0
- *
- * The CELL PPC core has 1 integer unit and 1 load/store unit
- * CELL:
- * 1st level data cache = 32K
- * 2nd level data cache = 512K
- * 3rd level data cache = 0K
- * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks,
- * latency to memory is >400 clocks
- * To improve copy performance we need to prefetch source data
- * far ahead to hide this latency
- * For best performance instruction forms ending in "." like "andi."
- * should be avoided as they are implemented in microcode on CELL.
- * The below code is loop unrolled for the CELL cache line of 128 bytes
- */
-
-.align 7
- /* Register use in the code below: r3 = dst (only read), r4 = src, r5 = size in bytes, r6 = working dst. */
-EALIGN (MEMCPY, 5, 0)
- CALL_MCOUNT 3
-
- dcbt 0,r4 /* Prefetch ONE SRC cacheline */
- cmpldi cr1,r5,16 /* is size < 16 ? */
- mr r6,r3 /* r6 = working dst; r3 is not written by the code below */
- blt+ cr1,.Lshortcopy /* size < 16: only the tail copy is needed */
-
-.Lbigcopy:
- neg r8,r3 /* LS 4 bits = # bytes to 16-byte dest bdry */
- clrldi r8,r8,64-4 /* keep low 4 bits: r8 = bytes to 16-byte boundary */
- sub r7,r4,r3 /* r7 = src - dst, so r7+r6 addresses the source */
- cmpldi cr0,r8,0
- beq+ .Ldst_aligned /* dst is already 16-byte aligned */
-
-.Ldst_unaligned:
- mtcrf 0x01,r8 /* put #bytes to boundary into cr7 */
- subf r5,r8,r5 /* size -= bytes copied here to align dst */
-
- bf cr7*4+3,1f /* cr7 bits select the 1/2/4/8 byte pieces */
- lbzx r0,r7,r6 /* copy 1 byte */
- stb r0,0(r6)
- addi r6,r6,1
-1: bf cr7*4+2,2f
- lhzx r0,r7,r6 /* copy 2 byte */
- sth r0,0(r6)
- addi r6,r6,2
-2: bf cr7*4+1,4f
- lwzx r0,r7,r6 /* copy 4 byte */
- stw r0,0(r6)
- addi r6,r6,4
-4: bf cr7*4+0,8f
- ldx r0,r7,r6 /* copy 8 byte */
- std r0,0(r6)
- addi r6,r6,8
-8:
- add r4,r7,r6 /* r4 = src advanced by the same amount as dst */
-
-.Ldst_aligned:
-
- cmpdi cr5,r5,128-1 /* cr5: remaining size vs 127 (signed compare) */
-
- neg r7,r6 /* taken before r6 gets its -8 bias */
- addi r6,r6,-8 /* prepare for stdu */
- addi r4,r4,-8 /* prepare for ldu */
-
- clrldi r7,r7,64-7 /* r7 = bytes from dst to the next 128-byte cacheline boundary */
- ble+ cr5,.Llessthancacheline /* size <= 127 */
-
- cmpldi cr6,r7,0 /* cr6.eq = dst already cacheline aligned; used at the beq cr6 below */
- subf r5,r7,r5 /* size -= bytes needed to reach the cacheline boundary */
- srdi r7,r7,4 /* r7 = number of 16-byte steps to the boundary */
- srdi r10,r5,7 /* number of cache lines to copy */
-
- cmpldi r10,0
- li r11,0 /* number cachelines to copy with prefetch */
- beq .Lnocacheprefetch /* no whole cache line left after alignment */
-
- cmpldi r10,PREFETCH_AHEAD
- li r12,128+8 /* prefetch distance: 128 bytes past src, +8 for the -8 bias on r4 */
- ble .Llessthanmaxprefetch /* few lines: prefetch them all, r11 stays 0 */
-
- subi r11,r10,PREFETCH_AHEAD /* r11 = lines copied by .Lloop (with prefetch) */
- li r10,PREFETCH_AHEAD /* r10 = lines prefetched now and copied later by .Lloop2 */
-
-.Llessthanmaxprefetch:
- mtctr r10
-
-.LprefetchSRC:
- dcbt r12,r4
- addi r12,r12,128 /* step to the next cache line */
- bdnz .LprefetchSRC
-
-.Lnocacheprefetch:
- mtctr r7 /* ctr = 16-byte steps for .Laligntocacheline */
- cmpldi cr1,r5,128 /* cr1 is consumed at .Lcachelinealigned */
- clrldi r5,r5,64-7 /* r5 = size mod 128 (tail after whole cache lines) */
- beq cr6,.Lcachelinealigned
-
-.Laligntocacheline: /* 16 bytes per iteration */
- ld r9,0x08(r4)
- ldu r7,0x10(r4)
- std r9,0x08(r6)
- stdu r7,0x10(r6)
- bdnz .Laligntocacheline
-
-
-.Lcachelinealigned: /* copy while cache lines */
-
- blt- cr1,.Llessthancacheline /* size <128 */
-
-.Louterloop:
- cmpdi r11,0
- mtctr r11
- beq- .Lendloop /* no lines for the prefetching loop */
-
- li r11,128*ZERO_AHEAD +8 /* DCBZ dist; r11 is reused as an offset now that its count is in ctr */
-
-.align 4
- /* Copy whole cachelines, optimized by prefetching SRC cacheline */
-.Lloop: /* Copy aligned body */
- dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */
- ld r9, 0x08(r4)
- dcbz r11,r6 /* zero the DST line ZERO_AHEAD lines ahead */
- ld r7, 0x10(r4) /* 4 register stride copy is optimal */
- ld r8, 0x18(r4) /* to hide 1st level cache latency. */
- ld r0, 0x20(r4)
- std r9, 0x08(r6)
- std r7, 0x10(r6)
- std r8, 0x18(r6)
- std r0, 0x20(r6)
- ld r9, 0x28(r4)
- ld r7, 0x30(r4)
- ld r8, 0x38(r4)
- ld r0, 0x40(r4)
- std r9, 0x28(r6)
- std r7, 0x30(r6)
- std r8, 0x38(r6)
- std r0, 0x40(r6)
- ld r9, 0x48(r4)
- ld r7, 0x50(r4)
- ld r8, 0x58(r4)
- ld r0, 0x60(r4)
- std r9, 0x48(r6)
- std r7, 0x50(r6)
- std r8, 0x58(r6)
- std r0, 0x60(r6)
- ld r9, 0x68(r4)
- ld r7, 0x70(r4)
- ld r8, 0x78(r4)
- ldu r0, 0x80(r4) /* update form: also advances r4 by 128 */
- std r9, 0x68(r6)
- std r7, 0x70(r6)
- std r8, 0x78(r6)
- stdu r0, 0x80(r6) /* update form: also advances r6 by 128 */
-
- bdnz .Lloop
-
-.Lendloop:
- cmpdi r10,0
- sldi r10,r10,2 /* adjust from 128 to 32 byte stride */
- beq- .Lendloop2 /* tests the cmpdi above (r10 == 0 before the shift) */
- mtctr r10
-
-.Lloop2: /* Copy aligned body */
- ld r9, 0x08(r4)
- ld r7, 0x10(r4)
- ld r8, 0x18(r4)
- ldu r0, 0x20(r4) /* 32 bytes per iteration */
- std r9, 0x08(r6)
- std r7, 0x10(r6)
- std r8, 0x18(r6)
- stdu r0, 0x20(r6)
-
- bdnz .Lloop2
-.Lendloop2:
-
-.Llessthancacheline: /* less than cache to do ? */
- cmpldi cr0,r5,16
- srdi r7,r5,4 /* divide size by 16 */
- blt- .Ldo_lt16
- mtctr r7
-
-.Lcopy_remaining: /* 16 bytes per iteration */
- ld r8,0x08(r4)
- ldu r7,0x10(r4)
- std r8,0x08(r6)
- stdu r7,0x10(r6)
- bdnz .Lcopy_remaining
-
-.Ldo_lt16: /* less than 16 ? */
- cmpldi cr0,r5,0 /* copy remaining bytes (0-15) */
- beqlr+ /* no rest to copy */
- addi r4,r4,8 /* undo the -8 bias used for ldu */
- addi r6,r6,8 /* undo the -8 bias used for stdu */
-
-.Lshortcopy: /* SIMPLE COPY to handle size <= 15 bytes */
- mtcrf 0x01,r5 /* only the low 4 bits of the size reach cr7 */
- sub r7,r4,r6 /* r7 = src - dst, so r7+r6 addresses the source */
- bf- cr7*4+0,8f
- ldx r0,r7,r6 /* copy 8 byte */
- std r0,0(r6)
- addi r6,r6,8
-8:
- bf cr7*4+1,4f
- lwzx r0,r7,r6 /* copy 4 byte */
- stw r0,0(r6)
- addi r6,r6,4
-4:
- bf cr7*4+2,2f
- lhzx r0,r7,r6 /* copy 2 byte */
- sth r0,0(r6)
- addi r6,r6,2
-2:
- bf cr7*4+3,1f
- lbzx r0,r7,r6 /* copy 1 byte */
- stb r0,0(r6)
-1: blr
-
-END_GEN_TB (MEMCPY,TB_TOCLESS)
-libc_hidden_builtin_def (memcpy)