diff options
author | Adhemerval Zanella <azanella@linux.vnet.ibm.com> | 2015-01-30 14:43:57 -0500 |
---|---|---|
committer | Adhemerval Zanella <adhemerval.zanella@linaro.org> | 2017-09-19 16:12:19 -0300 |
commit | 3988adaf704a30af6a9b1a889c1e5b6724301653 (patch) | |
tree | 187205d12e79a58f23309530ca4cc3a52d874998 | |
parent | 92892fdbfa5e4d9f3cc25601767da064d0a8818a (diff) | |
download | glibc-3988adaf704a30af6a9b1a889c1e5b6724301653.tar.gz |
powerpc: POWER8 memcpy optimization
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/Makefile | 4 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c | 2 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/memcpy-power8.S | 26 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/power8/memcpy.S | 181 |
4 files changed, 211 insertions, 2 deletions
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index f6491a2dc6..39af19eaec 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -1,6 +1,6 @@ ifeq ($(subdir),string) -sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \ - memcpy-power4 memcpy-ppc64 \ +sysdep_routines += memcpy-power8 memcpy-power7 memcpy-a2 memcpy-power6 \ + memcpy-cell memcpy-power4 memcpy-ppc64 \ memcmp-power8 memcmp-power7 memcmp-power4 memcmp-ppc64 \ memset-power7 memset-power6 memset-power4 \ memset-ppc64 memset-power8 \ diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index a1d3ca0088..630f52a9c8 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -51,6 +51,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, #ifdef SHARED /* Support sysdeps/powerpc/powerpc64/multiarch/memcpy.c. */ IFUNC_IMPL (i, name, memcpy, + IFUNC_IMPL_ADD (array, i, memcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07, + __memcpy_power8) IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_HAS_VSX, __memcpy_power7) IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_ARCH_2_06, diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy-power8.S b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power8.S new file mode 100644 index 0000000000..74b33302c4 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power8.S @@ -0,0 +1,26 @@ +/* Optimized memcpy implementation for PowerPC/POWER8. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#define MEMCPY __memcpy_power8 + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#include <sysdeps/powerpc/powerpc64/power8/memcpy.S> diff --git a/sysdeps/powerpc/powerpc64/power8/memcpy.S b/sysdeps/powerpc/powerpc64/power8/memcpy.S new file mode 100644 index 0000000000..8a6ae1bad0 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power8/memcpy.S @@ -0,0 +1,181 @@ +/* Optimized memcpy implementation for PowerPC64/POWER8. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + + +/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]); + Returns 'dst'. r3 is never written by this routine, so it still holds dst at every return. Dispatch by len: 0..15 uses a scalar copy driven by the bits of len; 16..32 and 33..64 copy the head and the tail with 16-byte vector accesses that may overlap each other; 65..127 copy 64 bytes and then a tail; 128 and up first run a loop that moves 128 bytes per iteration. */ + +#ifndef MEMCPY +# define MEMCPY memcpy +#endif + + .machine power8 +ENTRY_TOCLESS (MEMCPY, 5) + CALL_MCOUNT 3 + + cmpldi 7,5,15 + bgt 7,L(copy_gt_32) /* Taken for any len > 15. NOTE(review): the label name says gt_32 but the threshold tested here is 15. */ + /* len <= 15: copy 1, 2, 4, then 8 bytes, one step per set bit of len. r9 is the destination cursor, r4 the source cursor. */ andi. 9,5,0x1 + mr 9,3 + beq 0,1f + lbz 10,0(4) + addi 9,3,1 + addi 4,4,1 + stb 10,0(3) +1: + andi. 10,5,0x2 + beq 0,2f + lhz 10,0(4) + addi 9,9,2 + addi 4,4,2 + sth 10,-2(9) +2: + andi. 10,5,0x4 + beq 0,3f + lwz 10,0(4) + addi 9,9,4 + addi 4,4,4 + stw 10,-4(9) +3: + andi. 10,5,0x8 + beqlr 0 /* No 8-byte piece: done. */ + ld 10,0(4) + std 10,0(9) + blr + + .align 4 +L(copy_gt_32): + /* len 16..32 -> copy_gt_16_le_32; 33..64 -> copy_gt_32_le_64; 65..127 -> copy_gt_64_le_128 with r9 = dst; otherwise fall through to the loop setup. */ cmpldi 7,5,32 + ble 7,L(copy_gt_16_le_32) + cmpldi 7,5,64 + ble 7,L(copy_gt_32_le_64) + cmpldi 7,5,127 + mr 9,3 + ble 7,L(copy_gt_64_le_128) + /* Loop setup: r8 = (len - 128) with the low 7 bits cleared; ctr = r8 / 128 + 1 iterations; r0 = r8 + 128 = bytes the loop copies; r11, r6, r7 = 16, 32, 48 index offsets. */ addi 8,5,-128 + li 11,16 + rldicr 8,8,0,56 + li 6,32 + srdi 10,8,7 + addi 0,8,128 + addi 10,10,1 + li 7,48 + mtctr 10 + + .align 4 +L(copy_128): + /* One iteration moves 128 bytes as two 64-byte halves; r8 and r10 address the second half of the source and destination. */ lxvd2x 10,0,4 + lxvd2x 11,4,11 + addi 8,4,64 + addi 10,9,64 + lxvd2x 12,4,6 + lxvd2x 0,4,7 + addi 4,4,128 + stxvd2x 10,0,9 + stxvd2x 11,9,11 + stxvd2x 12,9,6 + stxvd2x 0,9,7 + addi 9,9,128 + lxvd2x 10,0,8 + lxvd2x 11,8,11 + lxvd2x 12,8,6 + lxvd2x 0,8,7 + stxvd2x 10,0,10 + stxvd2x 11,10,11 + stxvd2x 12,10,6 + stxvd2x 0,10,7 + bdnz L(copy_128) + add 9,3,0 /* r9 = dst + bytes copied by the loop. */ + rldicl 5,5,0,57 /* r5 = len & 127, the bytes still to copy. */ +L(copy_gt_64_le_128): + /* If more than 63 bytes remain, copy 64 bytes, advance r4 and r9 by 64 and reduce r5 by 64. */ cmpldi 7,5,63 + ble 7,L(copy_tail_le_64) + li 7,16 + li 8,32 + lxvd2x 10,0,4 + li 10,48 + addi 5,5,-64 + lxvd2x 11,4,7 + lxvd2x 12,4,8 + lxvd2x 0,4,10 + addi 4,4,64 + stxvd2x 10,0,9 + stxvd2x 11,9,7 + stxvd2x 12,9,8 + stxvd2x 0,9,10 + addi 9,9,64 +L(copy_tail_le_64): + /* Remaining 33..63 -> copy_tail_gt_32_le_64; 0 -> return; 1..32 -> copy the 32 bytes that end at the end of the buffer (offset r5 - 32 from the cursors), which rewrites bytes already copied when r5 < 32. Every path reaching this label has copied at least 64 bytes before it. */ cmpldi 7,5,32 + bgt 7,L(copy_tail_gt_32_le_64) + cmpdi 7,5,0 + beqlr 7 + addi 5,5,-32 + li 10,16 + add 7,4,5 + add 8,9,5 + lxvd2x 12,4,5 + lxvd2x 0,7,10 + stxvd2x 12,9,5 + stxvd2x 0,8,10 + blr + + .align 4 +L(copy_gt_16_le_32): + /* Copy the first 16 bytes and the last 16 bytes (offset len - 16); the two accesses overlap when len < 32. Both loads are issued before either store. */ lxvd2x 0,0,4 + addi 5,5,-16 + lxvd2x 12,4,5 + stxvd2x 0,0,3 + stxvd2x 12,3,5 + blr + + .align 4 +L(copy_gt_32_le_64): + /* Copy the first 32 bytes, then the last 32 bytes (offset len - 32); the two ranges overlap when len < 64. */ li 9,16 + lxvd2x 12,0,4 + addi 5,5,-32 + add 
8,4,5 + add 10,3,5 + lxvd2x 0,4,9 + stxvd2x 12,0,3 + stxvd2x 0,3,9 + lxvd2x 12,4,5 + lxvd2x 0,8,9 + stxvd2x 12,3,5 + stxvd2x 0,10,9 + blr + + .align 4 +L(copy_tail_gt_32_le_64): + /* Same head-plus-tail pattern as copy_gt_32_le_64, but storing through the r9 destination cursor instead of r3. */ li 10,16 + lxvd2x 12,0,4 + addi 5,5,-32 + add 7,4,5 + add 8,9,5 + lxvd2x 0,4,10 + stxvd2x 12,0,9 + stxvd2x 0,9,10 + lxvd2x 12,4,5 + lxvd2x 0,7,10 + stxvd2x 12,9,5 + stxvd2x 0,8,10 + blr + +END_GEN_TB (MEMCPY,TB_TOCLESS) +libc_hidden_builtin_def (memcpy) |