diff options
author | Alan Modra <amodra@gmail.com> | 2013-08-17 18:47:22 +0930 |
---|---|---|
committer | Alan Modra <amodra@gmail.com> | 2013-10-04 10:41:24 +0930 |
commit | 759cfef3ac4c07dba1ece0bbc1207e099348816d (patch) | |
tree | a0e8cadce4426afb90d39b330dd50688b8975484 /sysdeps/powerpc/powerpc32/power7 | |
parent | fe6e95d7171eba5f3e07848f081676fae4e86322 (diff) | |
download | glibc-759cfef3ac4c07dba1ece0bbc1207e099348816d.tar.gz |
PowerPC LE memcpy
http://sourceware.org/ml/libc-alpha/2013-08/msg00103.html
LIttle-endian support for memcpy. I spent some time cleaning up the
64-bit power7 memcpy, in order to avoid the extra alignment traps
power7 takes for little-endian. It probably would have been better
to copy the linux kernel version of memcpy.
* sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support.
* sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise.
* sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise.
* sysdeps/powerpc/powerpc32/power7/mempcpy.S: Likewise.
* sysdeps/powerpc/powerpc64/memcpy.S: Likewise.
* sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise.
* sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise.
* sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise.
* sysdeps/powerpc/powerpc64/power7/mempcpy.S: Likewise. Make better
use of regs. Use power7 mtocrf. Tidy function tails.
Diffstat (limited to 'sysdeps/powerpc/powerpc32/power7')
-rw-r--r-- | sysdeps/powerpc/powerpc32/power7/memcpy.S | 24 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc32/power7/mempcpy.S | 28 |
2 files changed, 40 insertions, 12 deletions
diff --git a/sysdeps/powerpc/powerpc32/power7/memcpy.S b/sysdeps/powerpc/powerpc32/power7/memcpy.S index 7f00778236..acf3c10198 100644 --- a/sysdeps/powerpc/powerpc32/power7/memcpy.S +++ b/sysdeps/powerpc/powerpc32/power7/memcpy.S @@ -383,7 +383,7 @@ L(copy_GE_32_unaligned): beq L(copy_GE_32_unaligned_cont) - /* SRC is not quadword aligned, get it aligned. */ + /* DST is not quadword aligned, get it aligned. */ mtcrf 0x01,0 subf 31,0,5 @@ -435,13 +435,21 @@ L(copy_GE_32_unaligned_cont): mr 11,12 mtcrf 0x01,9 cmplwi cr6,9,1 +#ifdef __LITTLE_ENDIAN__ + lvsr 5,0,12 +#else lvsl 5,0,12 +#endif lvx 3,0,12 bf 31,L(setup_unaligned_loop) /* Copy another 16 bytes to align to 32-bytes due to the loop . */ lvx 4,12,6 +#ifdef __LITTLE_ENDIAN__ + vperm 6,4,3,5 +#else vperm 6,3,4,5 +#endif addi 11,12,16 addi 10,3,16 stvx 6,0,3 @@ -461,11 +469,17 @@ L(unaligned_loop): vector instructions though. */ lvx 4,11,6 /* vr4 = r11+16. */ - vperm 6,3,4,5 /* Merge the correctly-aligned portions - of vr3/vr4 into vr6. */ +#ifdef __LITTLE_ENDIAN__ + vperm 6,4,3,5 +#else + vperm 6,3,4,5 +#endif lvx 3,11,7 /* vr3 = r11+32. */ - vperm 10,4,3,5 /* Merge the correctly-aligned portions - of vr3/vr4 into vr10. */ +#ifdef __LITTLE_ENDIAN__ + vperm 10,3,4,5 +#else + vperm 10,4,3,5 +#endif addi 11,11,32 stvx 6,0,10 stvx 10,10,6 diff --git a/sysdeps/powerpc/powerpc32/power7/mempcpy.S b/sysdeps/powerpc/powerpc32/power7/mempcpy.S index 5ad4edb580..4610ec5b56 100644 --- a/sysdeps/powerpc/powerpc32/power7/mempcpy.S +++ b/sysdeps/powerpc/powerpc32/power7/mempcpy.S @@ -325,7 +325,7 @@ L(copy_GE_32_unaligned): beq L(copy_GE_32_unaligned_cont) - /* SRC is not quadword aligned, get it aligned. */ + /* DST is not quadword aligned, get it aligned. */ mtcrf 0x01,0 subf 31,0,5 @@ -377,13 +377,21 @@ L(copy_GE_32_unaligned_cont): mr 11,12 mtcrf 0x01,9 cmplwi cr6,9,1 - lvsl 5,0,12 +#ifdef __LITTLE_ENDIAN__ + lvsr 5,0,12 +#else + lvsl 5,0,12 +#endif lvx 3,0,12 bf 31,L(setup_unaligned_loop) /* Copy another 16 bytes to align to 32-bytes due to the loop . */ lvx 4,12,6 - vperm 6,3,4,5 +#ifdef __LITTLE_ENDIAN__ + vperm 6,4,3,5 +#else + vperm 6,3,4,5 +#endif addi 11,12,16 addi 10,3,16 stvx 6,0,3 @@ -403,11 +411,17 @@ L(unaligned_loop): vector instructions though. */ lvx 4,11,6 /* vr4 = r11+16. */ - vperm 6,3,4,5 /* Merge the correctly-aligned portions - of vr3/vr4 into vr6. */ +#ifdef __LITTLE_ENDIAN__ + vperm 6,4,3,5 +#else + vperm 6,3,4,5 +#endif lvx 3,11,7 /* vr3 = r11+32. */ - vperm 10,4,3,5 /* Merge the correctly-aligned portions - of vr3/vr4 into vr10. */ +#ifdef __LITTLE_ENDIAN__ + vperm 10,3,4,5 +#else + vperm 10,4,3,5 +#endif addi 11,11,32 stvx 6,0,10 stvx 10,10,6 |