author     Alan Modra <amodra@gmail.com>   2013-08-17 18:47:22 +0930
committer  Alan Modra <amodra@gmail.com>   2013-10-04 10:41:24 +0930
commit     759cfef3ac4c07dba1ece0bbc1207e099348816d (patch)
tree       a0e8cadce4426afb90d39b330dd50688b8975484
parent     fe6e95d7171eba5f3e07848f081676fae4e86322 (diff)
PowerPC LE memcpy
http://sourceware.org/ml/libc-alpha/2013-08/msg00103.html

Little-endian support for memcpy.  I spent some time cleaning up the
64-bit power7 memcpy, in order to avoid the extra alignment traps
power7 takes for little-endian.  It probably would have been better
to copy the linux kernel version of memcpy.

	* sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support.
	* sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc32/power7/mempcpy.S: Likewise.
	* sysdeps/powerpc/powerpc64/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc64/power7/mempcpy.S: Likewise.  Make better
	use of regs.  Use power7 mtocrf.  Tidy function tails.
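[Editorial note: the heart of the integer copy loops below is a pair of shifts
that merges two aligned loads into one unaligned destination word, and the
shift directions swap between big- and little-endian.  A minimal C sketch of
that merge follows; it is illustrative only, not glibc code, and the function
name and arguments are invented for the example.]

#include <stdint.h>

/* Sketch of the shift-and-or merge done in the aligned-load copy loops.
   w0 and w1 are two consecutive aligned 64-bit loads; off (1..7) is the
   byte misalignment of the source.  On big-endian the first source byte
   sits at the high-order end of w0, so the code uses sld/srd (slw/srw in
   the 32-bit variants); on little-endian it sits at the low-order end,
   so the shifts swap direction, which is what the __LITTLE_ENDIAN__
   branches in the patch do.  */
static inline uint64_t
merge_unaligned (uint64_t w0, uint64_t w1, unsigned off)
{
  unsigned sh = 8 * off;
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return (w0 << sh) | (w1 >> (64 - sh));   /* sld 0,6,10 ; srd 8,7,9 */
#else
  return (w0 >> sh) | (w1 << (64 - sh));   /* srd 0,6,10 ; sld 8,7,9 */
#endif
}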
-rw-r--r--  ChangeLog                                    13
-rw-r--r--  sysdeps/powerpc/powerpc32/power4/memcpy.S    58
-rw-r--r--  sysdeps/powerpc/powerpc32/power6/memcpy.S    81
-rw-r--r--  sysdeps/powerpc/powerpc32/power7/memcpy.S    24
-rw-r--r--  sysdeps/powerpc/powerpc32/power7/mempcpy.S   28
-rw-r--r--  sysdeps/powerpc/powerpc64/memcpy.S           27
-rw-r--r--  sysdeps/powerpc/powerpc64/power4/memcpy.S    61
-rw-r--r--  sysdeps/powerpc/powerpc64/power6/memcpy.S   329
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memcpy.S   704
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/mempcpy.S   26
10 files changed, 941 insertions, 410 deletions
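[Editorial note: the vector (VSX/Altivec) paths get the analogous treatment:
lvsl becomes lvsr and the vperm operands are reversed under __LITTLE_ENDIAN__,
since the way lvx arranges bytes in a register interacts with vperm's fixed
big-endian byte numbering, so the control vector and operand order have to be
mirrored on little-endian.  At the byte level the operation performed is
endian-independent; the sketch below is a hedged C model of it, with names
invented for the example, not the actual vector semantics.]

/* Byte-level model of the permute-based realignment in the
   copy_GE_32_unaligned loops: two aligned 16-byte loads, prev and next,
   are combined into the 16 unaligned source bytes starting at offset
   off (0..15) within prev.  The lvsl->lvsr and vperm operand swaps in
   the patch are what produce this same result on little-endian.  */
static void
merge_quadword (const unsigned char prev[16], const unsigned char next[16],
                unsigned off, unsigned char out[16])
{
  for (unsigned i = 0; i < 16; i++)
    out[i] = (off + i < 16) ? prev[off + i] : next[off + i - 16];
}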
diff --git a/ChangeLog b/ChangeLog
index 51311857a6..959d3a30ab 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,18 @@
2013-10-04 Alan Modra <amodra@gmail.com>
+ * sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support.
+ * sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise.
+ * sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise.
+ * sysdeps/powerpc/powerpc32/power7/mempcpy.S: Likewise.
+ * sysdeps/powerpc/powerpc64/memcpy.S: Likewise.
+ * sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise.
+ * sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise.
+ * sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise.
+ * sysdeps/powerpc/powerpc64/power7/mempcpy.S: Likewise. Make better
+ use of regs. Use power7 mtocrf. Tidy function tails.
+
+2013-10-04 Alan Modra <amodra@gmail.com>
+
* sysdeps/powerpc/powerpc64/power7/memcmp.S: Add little-endian support.
Formatting. Consistently use rXXX register defines or rN defines.
Use early exit labels that avoid restoring unused non-volatile regs.
diff --git a/sysdeps/powerpc/powerpc32/power4/memcpy.S b/sysdeps/powerpc/powerpc32/power4/memcpy.S
index d9146631e3..338d3cce30 100644
--- a/sysdeps/powerpc/powerpc32/power4/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power4/memcpy.S
@@ -203,15 +203,28 @@ EALIGN (memcpy, 5, 0)
blt cr6,5f
srwi 7,6,16
bgt cr6,3f
+#ifdef __LITTLE_ENDIAN__
+ sth 7,0(3)
+#else
sth 6,0(3)
+#endif
b 7f
.align 4
3:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,24
+ stb 6,0(3)
+ sth 7,1(3)
+#else
stb 7,0(3)
sth 6,1(3)
+#endif
b 7f
.align 4
5:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,8
+#endif
stb 6,0(3)
7:
cmplwi cr1,10,16
@@ -339,13 +352,23 @@ EALIGN (memcpy, 5, 0)
bf 30,1f
/* there are at least two words to copy, so copy them */
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10 /* shift 1st src word to left align it in R0 */
srw 8,7,9 /* shift 2nd src word to right align it in R8 */
+#endif
or 0,0,8 /* or them to get word to store */
lwz 6,8(5) /* load the 3rd src word */
stw 0,0(4) /* store the 1st dst word */
+#ifdef __LITTLE_ENDIAN__
+ srw 0,7,10
+ slw 8,6,9
+#else
slw 0,7,10 /* now left align 2nd src word into R0 */
srw 8,6,9 /* shift 3rd src word to right align it in R8 */
+#endif
or 0,0,8 /* or them to get word to store */
lwz 7,12(5)
stw 0,4(4) /* store the 2nd dst word */
@@ -353,8 +376,13 @@ EALIGN (memcpy, 5, 0)
addi 5,5,16
bf 31,4f
/* there is a third word to copy, so copy it */
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10 /* shift 3rd src word to left align it in R0 */
srw 8,7,9 /* shift 4th src word to right align it in R8 */
+#endif
or 0,0,8 /* or them to get word to store */
stw 0,0(4) /* store 3rd dst word */
mr 6,7
@@ -364,8 +392,13 @@ EALIGN (memcpy, 5, 0)
b 4f
.align 4
1:
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10 /* shift 1st src word to left align it in R0 */
srw 8,7,9 /* shift 2nd src word to right align it in R8 */
+#endif
addi 5,5,8
or 0,0,8 /* or them to get word to store */
bf 31,4f
@@ -378,23 +411,43 @@ EALIGN (memcpy, 5, 0)
.align 4
4:
/* copy 16 bytes at a time */
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10
srw 8,7,9
+#endif
or 0,0,8
lwz 6,0(5)
stw 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srw 0,7,10
+ slw 8,6,9
+#else
slw 0,7,10
srw 8,6,9
+#endif
or 0,0,8
lwz 7,4(5)
stw 0,4(4)
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10
srw 8,7,9
+#endif
or 0,0,8
lwz 6,8(5)
stw 0,8(4)
+#ifdef __LITTLE_ENDIAN__
+ srw 0,7,10
+ slw 8,6,9
+#else
slw 0,7,10
srw 8,6,9
+#endif
or 0,0,8
lwz 7,12(5)
stw 0,12(4)
@@ -403,8 +456,13 @@ EALIGN (memcpy, 5, 0)
bdnz+ 4b
8:
/* calculate and store the final word */
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10
srw 8,7,9
+#endif
or 0,0,8
stw 0,0(4)
3:
diff --git a/sysdeps/powerpc/powerpc32/power6/memcpy.S b/sysdeps/powerpc/powerpc32/power6/memcpy.S
index a76f71e04f..f58114a0c5 100644
--- a/sysdeps/powerpc/powerpc32/power6/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power6/memcpy.S
@@ -219,15 +219,28 @@ L(word_unaligned_short):
blt cr6,5f
srwi 7,6,16
bgt cr6,3f
+#ifdef __LITTLE_ENDIAN__
+ sth 7,0(3)
+#else
sth 6,0(3)
+#endif
b 7f
.align 4
3:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,24
+ stb 6,0(3)
+ sth 7,1(3)
+#else
stb 7,0(3)
sth 6,1(3)
+#endif
b 7f
.align 4
5:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,8
+#endif
stb 6,0(3)
7:
cmplwi cr1,10,16
@@ -577,7 +590,11 @@ L(wdu1_32):
lwz 6,-1(4)
cmplwi cr6,31,4
srwi 8,31,5 /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,6,8
+#else
slwi 6,6,8
+#endif
clrlwi 31,31,27 /* The remaining bytes, < 32. */
blt cr5,L(wdu1_32tail)
mtctr 8
@@ -585,8 +602,12 @@ L(wdu1_32):
lwz 8,3(4)
lwz 7,4(4)
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,24,32
+#else
/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
rlwimi 6,8,8,(32-8),31
+#endif
b L(wdu1_loop32x)
.align 4
L(wdu1_loop32):
@@ -595,8 +616,12 @@ L(wdu1_loop32):
lwz 7,4(4)
stw 10,-8(3)
stw 11,-4(3)
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,24,32
+#else
/* Equivalent to srwi 8,8,32-8; or 6,6,8 */
rlwimi 6,8,8,(32-8),31
+#endif
L(wdu1_loop32x):
lwz 10,8(4)
lwz 11,12(4)
@@ -613,7 +638,11 @@ L(wdu1_loop32x):
stw 6,16(3)
stw 7,20(3)
addi 3,3,32
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,8,8
+#else
slwi 6,8,8
+#endif
bdnz+ L(wdu1_loop32)
stw 10,-8(3)
stw 11,-4(3)
@@ -624,8 +653,12 @@ L(wdu1_32tail):
blt cr6,L(wdu_4tail)
/* calculate and store the final word */
lwz 8,3(4)
-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,24,32
+#else
+/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
rlwimi 6,8,8,(32-8),31
+#endif
b L(wdu_32tailx)
L(wdu2_32):
@@ -633,7 +666,11 @@ L(wdu2_32):
lwz 6,-2(4)
cmplwi cr6,31,4
srwi 8,31,5 /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,6,16
+#else
slwi 6,6,16
+#endif
clrlwi 31,31,27 /* The remaining bytes, < 32. */
blt cr5,L(wdu2_32tail)
mtctr 8
@@ -641,8 +678,11 @@ L(wdu2_32):
lwz 8,2(4)
lwz 7,4(4)
-/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,16,32
+#else
rlwimi 6,8,16,(32-16),31
+#endif
b L(wdu2_loop32x)
.align 4
L(wdu2_loop32):
@@ -651,8 +691,11 @@ L(wdu2_loop32):
lwz 7,4(4)
stw 10,-8(3)
stw 11,-4(3)
-/* Equivalent to srwi 8,8,32-8; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,16,32
+#else
rlwimi 6,8,16,(32-16),31
+#endif
L(wdu2_loop32x):
lwz 10,8(4)
lwz 11,12(4)
@@ -670,7 +713,11 @@ L(wdu2_loop32x):
stw 6,16(3)
stw 7,20(3)
addi 3,3,32
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,8,16
+#else
slwi 6,8,16
+#endif
bdnz+ L(wdu2_loop32)
stw 10,-8(3)
stw 11,-4(3)
@@ -681,8 +728,11 @@ L(wdu2_32tail):
blt cr6,L(wdu_4tail)
/* calculate and store the final word */
lwz 8,2(4)
-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,16,32
+#else
rlwimi 6,8,16,(32-16),31
+#endif
b L(wdu_32tailx)
L(wdu3_32):
@@ -690,7 +740,11 @@ L(wdu3_32):
lwz 6,-3(4)
cmplwi cr6,31,4
srwi 8,31,5 /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,6,24
+#else
slwi 6,6,24
+#endif
clrlwi 31,31,27 /* The remaining bytes, < 32. */
blt cr5,L(wdu3_32tail)
mtctr 8
@@ -698,8 +752,11 @@ L(wdu3_32):
lwz 8,1(4)
lwz 7,4(4)
-/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,8,32
+#else
rlwimi 6,8,24,(32-24),31
+#endif
b L(wdu3_loop32x)
.align 4
L(wdu3_loop32):
@@ -708,8 +765,11 @@ L(wdu3_loop32):
lwz 7,4(4)
stw 10,-8(3)
stw 11,-4(3)
-/* Equivalent to srwi 8,8,32-8; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,8,32
+#else
rlwimi 6,8,24,(32-24),31
+#endif
L(wdu3_loop32x):
lwz 10,8(4)
lwz 11,12(4)
@@ -726,7 +786,11 @@ L(wdu3_loop32x):
stw 6,16(3)
stw 7,20(3)
addi 3,3,32
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,8,24
+#else
slwi 6,8,24
+#endif
bdnz+ L(wdu3_loop32)
stw 10,-8(3)
stw 11,-4(3)
@@ -737,8 +801,11 @@ L(wdu3_32tail):
blt cr6,L(wdu_4tail)
/* calculate and store the final word */
lwz 8,1(4)
-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,8,32
+#else
rlwimi 6,8,24,(32-24),31
+#endif
b L(wdu_32tailx)
.align 4
L(wdu_32tailx):
diff --git a/sysdeps/powerpc/powerpc32/power7/memcpy.S b/sysdeps/powerpc/powerpc32/power7/memcpy.S
index 7f00778236..acf3c10198 100644
--- a/sysdeps/powerpc/powerpc32/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power7/memcpy.S
@@ -383,7 +383,7 @@ L(copy_GE_32_unaligned):
beq L(copy_GE_32_unaligned_cont)
- /* SRC is not quadword aligned, get it aligned. */
+ /* DST is not quadword aligned, get it aligned. */
mtcrf 0x01,0
subf 31,0,5
@@ -435,13 +435,21 @@ L(copy_GE_32_unaligned_cont):
mr 11,12
mtcrf 0x01,9
cmplwi cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+ lvsr 5,0,12
+#else
lvsl 5,0,12
+#endif
lvx 3,0,12
bf 31,L(setup_unaligned_loop)
/* Copy another 16 bytes to align to 32-bytes due to the loop . */
lvx 4,12,6
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
vperm 6,3,4,5
+#endif
addi 11,12,16
addi 10,3,16
stvx 6,0,3
@@ -461,11 +469,17 @@ L(unaligned_loop):
vector instructions though. */
lvx 4,11,6 /* vr4 = r11+16. */
- vperm 6,3,4,5 /* Merge the correctly-aligned portions
- of vr3/vr4 into vr6. */
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
lvx 3,11,7 /* vr3 = r11+32. */
- vperm 10,4,3,5 /* Merge the correctly-aligned portions
- of vr3/vr4 into vr10. */
+#ifdef __LITTLE_ENDIAN__
+ vperm 10,3,4,5
+#else
+ vperm 10,4,3,5
+#endif
addi 11,11,32
stvx 6,0,10
stvx 10,10,6
diff --git a/sysdeps/powerpc/powerpc32/power7/mempcpy.S b/sysdeps/powerpc/powerpc32/power7/mempcpy.S
index 5ad4edb580..4610ec5b56 100644
--- a/sysdeps/powerpc/powerpc32/power7/mempcpy.S
+++ b/sysdeps/powerpc/powerpc32/power7/mempcpy.S
@@ -325,7 +325,7 @@ L(copy_GE_32_unaligned):
beq L(copy_GE_32_unaligned_cont)
- /* SRC is not quadword aligned, get it aligned. */
+ /* DST is not quadword aligned, get it aligned. */
mtcrf 0x01,0
subf 31,0,5
@@ -377,13 +377,21 @@ L(copy_GE_32_unaligned_cont):
mr 11,12
mtcrf 0x01,9
cmplwi cr6,9,1
- lvsl 5,0,12
+#ifdef __LITTLE_ENDIAN__
+ lvsr 5,0,12
+#else
+ lvsl 5,0,12
+#endif
lvx 3,0,12
bf 31,L(setup_unaligned_loop)
/* Copy another 16 bytes to align to 32-bytes due to the loop . */
lvx 4,12,6
- vperm 6,3,4,5
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
addi 11,12,16
addi 10,3,16
stvx 6,0,3
@@ -403,11 +411,17 @@ L(unaligned_loop):
vector instructions though. */
lvx 4,11,6 /* vr4 = r11+16. */
- vperm 6,3,4,5 /* Merge the correctly-aligned portions
- of vr3/vr4 into vr6. */
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
lvx 3,11,7 /* vr3 = r11+32. */
- vperm 10,4,3,5 /* Merge the correctly-aligned portions
- of vr3/vr4 into vr10. */
+#ifdef __LITTLE_ENDIAN__
+ vperm 10,3,4,5
+#else
+ vperm 10,4,3,5
+#endif
addi 11,11,32
stvx 6,0,10
stvx 10,10,6
diff --git a/sysdeps/powerpc/powerpc64/memcpy.S b/sysdeps/powerpc/powerpc64/memcpy.S
index b8c4cc8b10..5fc7401c99 100644
--- a/sysdeps/powerpc/powerpc64/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/memcpy.S
@@ -212,15 +212,28 @@ EALIGN (memcpy, 5, 0)
blt cr6,5f
srdi 7,6,16
bgt cr6,3f
+#ifdef __LITTLE_ENDIAN__
+ sth 7,0(3)
+#else
sth 6,0(3)
+#endif
b 7f
.align 4
3:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,24
+ stb 6,0(3)
+ sth 7,1(3)
+#else
stb 7,0(3)
sth 6,1(3)
+#endif
b 7f
.align 4
5:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,8
+#endif
stb 6,0(3)
7:
cmpldi cr1,10,16
@@ -328,7 +341,11 @@ EALIGN (memcpy, 5, 0)
ld 7,8(5)
subfic 9,10,64
beq 2f
+#ifdef __LITTLE_ENDIAN__
+ srd 0,6,10
+#else
sld 0,6,10
+#endif
cmpldi 11,1
mr 6,7
addi 4,4,-8
@@ -336,15 +353,25 @@ EALIGN (memcpy, 5, 0)
b 1f
2: addi 5,5,8
.align 4
+#ifdef __LITTLE_ENDIAN__
+0: srd 0,6,10
+ sld 8,7,9
+#else
0: sld 0,6,10
srd 8,7,9
+#endif
cmpldi 11,2
ld 6,8(5)
or 0,0,8
addi 11,11,-2
std 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srd 0,7,10
+1: sld 8,6,9
+#else
sld 0,7,10
1: srd 8,6,9
+#endif
or 0,0,8
beq 8f
ld 7,16(5)
diff --git a/sysdeps/powerpc/powerpc64/power4/memcpy.S b/sysdeps/powerpc/powerpc64/power4/memcpy.S
index 4317c7e786..f9a7260dcb 100644
--- a/sysdeps/powerpc/powerpc64/power4/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power4/memcpy.S
@@ -214,15 +214,28 @@ EALIGN (memcpy, 5, 0)
blt cr6,5f
srdi 7,6,16
bgt cr6,3f
+#ifdef __LITTLE_ENDIAN__
+ sth 7,0(3)
+#else
sth 6,0(3)
+#endif
b 7f
.align 4
3:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,24
+ stb 6,0(3)
+ sth 7,1(3)
+#else
stb 7,0(3)
sth 6,1(3)
+#endif
b 7f
.align 4
5:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,8
+#endif
stb 6,0(3)
7:
cmpldi cr1,10,16
@@ -334,13 +347,23 @@ EALIGN (memcpy, 5, 0)
bf 30,1f
/* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+ srd 0,6,10
+ sld 8,7,9
+#else
sld 0,6,10
srd 8,7,9
+#endif
or 0,0,8
ld 6,16(5)
std 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srd 0,7,10
+ sld 8,6,9
+#else
sld 0,7,10
srd 8,6,9
+#endif
or 0,0,8
ld 7,24(5)
std 0,8(4)
@@ -349,8 +372,13 @@ EALIGN (memcpy, 5, 0)
blt cr6,8f /* if total DWs = 3, then bypass loop */
bf 31,4f
/* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+ srd 0,6,10
+ sld 8,7,9
+#else
sld 0,6,10
srd 8,7,9
+#endif
or 0,0,8
std 0,0(4)
mr 6,7
@@ -361,8 +389,13 @@ EALIGN (memcpy, 5, 0)
b 4f
.align 4
1:
+#ifdef __LITTLE_ENDIAN__
+ srd 0,6,10
+ sld 8,7,9
+#else
sld 0,6,10
srd 8,7,9
+#endif
addi 5,5,16
or 0,0,8
bf 31,4f
@@ -373,23 +406,44 @@ EALIGN (memcpy, 5, 0)
addi 4,4,8
.align 4
/* copy 32 bytes at a time */
-4: sld 0,6,10
+4:
+#ifdef __LITTLE_ENDIAN__
+ srd 0,6,10
+ sld 8,7,9
+#else
+ sld 0,6,10
srd 8,7,9
+#endif
or 0,0,8
ld 6,0(5)
std 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srd 0,7,10
+ sld 8,6,9
+#else
sld 0,7,10
srd 8,6,9
+#endif
or 0,0,8
ld 7,8(5)
std 0,8(4)
+#ifdef __LITTLE_ENDIAN__
+ srd 0,6,10
+ sld 8,7,9
+#else
sld 0,6,10
srd 8,7,9
+#endif
or 0,0,8
ld 6,16(5)
std 0,16(4)
+#ifdef __LITTLE_ENDIAN__
+ srd 0,7,10
+ sld 8,6,9
+#else
sld 0,7,10
srd 8,6,9
+#endif
or 0,0,8
ld 7,24(5)
std 0,24(4)
@@ -399,8 +453,13 @@ EALIGN (memcpy, 5, 0)
.align 4
8:
/* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+ srd 0,6,10
+ sld 8,7,9
+#else
sld 0,6,10
srd 8,7,9
+#endif
or 0,0,8
std 0,0(4)
3:
diff --git a/sysdeps/powerpc/powerpc64/power6/memcpy.S b/sysdeps/powerpc/powerpc64/power6/memcpy.S
index d6d242d293..e3f3d8a303 100644
--- a/sysdeps/powerpc/powerpc64/power6/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power6/memcpy.S
@@ -400,15 +400,28 @@ L(das_tail2):
blt cr6,5f
srdi 7,6,16
bgt cr6,3f
+#ifdef __LITTLE_ENDIAN__
+ sth 7,0(3)
+#else
sth 6,0(3)
+#endif
b 7f
.align 4
3:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,24
+ stb 6,0(3)
+ sth 7,1(3)
+#else
stb 7,0(3)
sth 6,1(3)
+#endif
b 7f
.align 4
5:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,8
+#endif
stb 6,0(3)
7:
cmpldi cr1,10,16
@@ -595,13 +608,24 @@ L(du1_do):
bf 30,L(du1_1dw)
/* there are at least two DWs to copy */
+ /* FIXME: can combine last shift and "or" into "rldimi" */
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 8
+ sldi 8,7, 64-8
+#else
sldi 0,6, 8
srdi 8,7, 64-8
+#endif
or 0,0,8
ld 6,16(5)
std 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,7, 8
+ sldi 8,6, 64-8
+#else
sldi 0,7, 8
srdi 8,6, 64-8
+#endif
or 0,0,8
ld 7,24(5)
std 0,8(4)
@@ -610,8 +634,13 @@ L(du1_do):
blt cr6,L(du1_fini) /* if total DWs = 3, then bypass loop */
bf 31,L(du1_loop)
/* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 8
+ sldi 8,7, 64-8
+#else
sldi 0,6, 8
srdi 8,7, 64-8
+#endif
or 0,0,8
std 0,0(4)
mr 6,7
@@ -622,8 +651,13 @@ L(du1_do):
b L(du1_loop)
.align 4
L(du1_1dw):
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 8
+ sldi 8,7, 64-8
+#else
sldi 0,6, 8
srdi 8,7, 64-8
+#endif
addi 5,5,16
or 0,0,8
bf 31,L(du1_loop)
@@ -635,23 +669,43 @@ L(du1_1dw):
.align 4
/* copy 32 bytes at a time */
L(du1_loop):
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 8
+ sldi 8,7, 64-8
+#else
sldi 0,6, 8
srdi 8,7, 64-8
+#endif
or 0,0,8
ld 6,0(5)
std 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,7, 8
+ sldi 8,6, 64-8
+#else
sldi 0,7, 8
srdi 8,6, 64-8
+#endif
or 0,0,8
ld 7,8(5)
std 0,8(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 8
+ sldi 8,7, 64-8
+#else
sldi 0,6, 8
srdi 8,7, 64-8
+#endif
or 0,0,8
ld 6,16(5)
std 0,16(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,7, 8
+ sldi 8,6, 64-8
+#else
sldi 0,7, 8
srdi 8,6, 64-8
+#endif
or 0,0,8
ld 7,24(5)
std 0,24(4)
@@ -661,8 +715,13 @@ L(du1_loop):
.align 4
L(du1_fini):
/* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 8
+ sldi 8,7, 64-8
+#else
sldi 0,6, 8
srdi 8,7, 64-8
+#endif
or 0,0,8
std 0,0(4)
b L(du_done)
@@ -672,13 +731,23 @@ L(du2_do):
bf 30,L(du2_1dw)
/* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 16
+ sldi 8,7, 64-16
+#else
sldi 0,6, 16
srdi 8,7, 64-16
+#endif
or 0,0,8
ld 6,16(5)
std 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,7, 16
+ sldi 8,6, 64-16
+#else
sldi 0,7, 16
srdi 8,6, 64-16
+#endif
or 0,0,8
ld 7,24(5)
std 0,8(4)
@@ -687,8 +756,13 @@ L(du2_do):
blt cr6,L(du2_fini) /* if total DWs = 3, then bypass loop */
bf 31,L(du2_loop)
/* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 16
+ sldi 8,7, 64-16
+#else
sldi 0,6, 16
srdi 8,7, 64-16
+#endif
or 0,0,8
std 0,0(4)
mr 6,7
@@ -699,8 +773,13 @@ L(du2_do):
b L(du2_loop)
.align 4
L(du2_1dw):
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 16
+ sldi 8,7, 64-16
+#else
sldi 0,6, 16
srdi 8,7, 64-16
+#endif
addi 5,5,16
or 0,0,8
bf 31,L(du2_loop)
@@ -712,23 +791,43 @@ L(du2_1dw):
.align 4
/* copy 32 bytes at a time */
L(du2_loop):
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 16
+ sldi 8,7, 64-16
+#else
sldi 0,6, 16
srdi 8,7, 64-16
+#endif
or 0,0,8
ld 6,0(5)
std 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,7, 16
+ sldi 8,6, 64-16
+#else
sldi 0,7, 16
srdi 8,6, 64-16
+#endif
or 0,0,8
ld 7,8(5)
std 0,8(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 16
+ sldi 8,7, 64-16
+#else
sldi 0,6, 16
srdi 8,7, 64-16
+#endif
or 0,0,8
ld 6,16(5)
std 0,16(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,7, 16
+ sldi 8,6, 64-16
+#else
sldi 0,7, 16
srdi 8,6, 64-16
+#endif
or 0,0,8
ld 7,24(5)
std 0,24(4)
@@ -738,8 +837,13 @@ L(du2_loop):
.align 4
L(du2_fini):
/* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 16
+ sldi 8,7, 64-16
+#else
sldi 0,6, 16
srdi 8,7, 64-16
+#endif
or 0,0,8
std 0,0(4)
b L(du_done)
@@ -749,13 +853,23 @@ L(du3_do):
bf 30,L(du3_1dw)
/* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 24
+ sldi 8,7, 64-24
+#else
sldi 0,6, 24
srdi 8,7, 64-24
+#endif
or 0,0,8
ld 6,16(5)
std 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,7, 24
+ sldi 8,6, 64-24
+#else
sldi 0,7, 24
srdi 8,6, 64-24
+#endif
or 0,0,8
ld 7,24(5)
std 0,8(4)
@@ -764,8 +878,13 @@ L(du3_do):
blt cr6,L(du3_fini) /* if total DWs = 3, then bypass loop */
bf 31,L(du3_loop)
/* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 24
+ sldi 8,7, 64-24
+#else
sldi 0,6, 24
srdi 8,7, 64-24
+#endif
or 0,0,8
std 0,0(4)
mr 6,7
@@ -776,8 +895,13 @@ L(du3_do):
b L(du3_loop)
.align 4
L(du3_1dw):
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 24
+ sldi 8,7, 64-24
+#else
sldi 0,6, 24
srdi 8,7, 64-24
+#endif
addi 5,5,16
or 0,0,8
bf 31,L(du3_loop)
@@ -789,23 +913,43 @@ L(du3_1dw):
.align 4
/* copy 32 bytes at a time */
L(du3_loop):
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 24
+ sldi 8,7, 64-24
+#else
sldi 0,6, 24
srdi 8,7, 64-24
+#endif
or 0,0,8
ld 6,0(5)
std 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,7, 24
+ sldi 8,6, 64-24
+#else
sldi 0,7, 24
srdi 8,6, 64-24
+#endif
or 0,0,8
ld 7,8(5)
std 0,8(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 24
+ sldi 8,7, 64-24
+#else
sldi 0,6, 24
srdi 8,7, 64-24
+#endif
or 0,0,8
ld 6,16(5)
std 0,16(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,7, 24
+ sldi 8,6, 64-24
+#else
sldi 0,7, 24
srdi 8,6, 64-24
+#endif
or 0,0,8
ld 7,24(5)
std 0,24(4)
@@ -815,8 +959,13 @@ L(du3_loop):
.align 4
L(du3_fini):
/* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 24
+ sldi 8,7, 64-24
+#else
sldi 0,6, 24
srdi 8,7, 64-24
+#endif
or 0,0,8
std 0,0(4)
b L(du_done)
@@ -832,13 +981,23 @@ L(du4_dox):
bf 30,L(du4_1dw)
/* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 32
+ sldi 8,7, 64-32
+#else
sldi 0,6, 32
srdi 8,7, 64-32
+#endif
or 0,0,8
ld 6,16(5)
std 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,7, 32
+ sldi 8,6, 64-32
+#else
sldi 0,7, 32
srdi 8,6, 64-32
+#endif
or 0,0,8
ld 7,24(5)
std 0,8(4)
@@ -847,8 +1006,13 @@ L(du4_dox):
blt cr6,L(du4_fini) /* if total DWs = 3, then bypass loop */
bf 31,L(du4_loop)
/* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 32
+ sldi 8,7, 64-32
+#else
sldi 0,6, 32
srdi 8,7, 64-32
+#endif
or 0,0,8
std 0,0(4)
mr 6,7
@@ -859,8 +1023,13 @@ L(du4_dox):
b L(du4_loop)
.align 4
L(du4_1dw):
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 32
+ sldi 8,7, 64-32
+#else
sldi 0,6, 32
srdi 8,7, 64-32
+#endif
addi 5,5,16
or 0,0,8
bf 31,L(du4_loop)
@@ -872,23 +1041,43 @@ L(du4_1dw):
.align 4
/* copy 32 bytes at a time */
L(du4_loop):
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 32
+ sldi 8,7, 64-32
+#else
sldi 0,6, 32
srdi 8,7, 64-32
+#endif
or 0,0,8
ld 6,0(5)
std 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,7, 32
+ sldi 8,6, 64-32
+#else
sldi 0,7, 32
srdi 8,6, 64-32
+#endif
or 0,0,8
ld 7,8(5)
std 0,8(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 32
+ sldi 8,7, 64-32
+#else
sldi 0,6, 32
srdi 8,7, 64-32
+#endif
or 0,0,8
ld 6,16(5)
std 0,16(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,7, 32
+ sldi 8,6, 64-32
+#else
sldi 0,7, 32
srdi 8,6, 64-32
+#endif
or 0,0,8
ld 7,24(5)
std 0,24(4)
@@ -898,8 +1087,13 @@ L(du4_loop):
.align 4
L(du4_fini):
/* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 32
+ sldi 8,7, 64-32
+#else
sldi 0,6, 32
srdi 8,7, 64-32
+#endif
or 0,0,8
std 0,0(4)
b L(du_done)
@@ -909,13 +1103,23 @@ L(du5_do):
bf 30,L(du5_1dw)
/* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 40
+ sldi 8,7, 64-40
+#else
sldi 0,6, 40
srdi 8,7, 64-40
+#endif
or 0,0,8
ld 6,16(5)
std 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,7, 40
+ sldi 8,6, 64-40
+#else
sldi 0,7, 40
srdi 8,6, 64-40
+#endif
or 0,0,8
ld 7,24(5)
std 0,8(4)
@@ -924,8 +1128,13 @@ L(du5_do):
blt cr6,L(du5_fini) /* if total DWs = 3, then bypass loop */
bf 31,L(du5_loop)
/* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 40
+ sldi 8,7, 64-40
+#else
sldi 0,6, 40
srdi 8,7, 64-40
+#endif
or 0,0,8
std 0,0(4)
mr 6,7
@@ -936,8 +1145,13 @@ L(du5_do):
b L(du5_loop)
.align 4
L(du5_1dw):
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 40
+ sldi 8,7, 64-40
+#else
sldi 0,6, 40
srdi 8,7, 64-40
+#endif
addi 5,5,16
or 0,0,8
bf 31,L(du5_loop)
@@ -949,23 +1163,43 @@ L(du5_1dw):
.align 4
/* copy 32 bytes at a time */
L(du5_loop):
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 40
+ sldi 8,7, 64-40
+#else
sldi 0,6, 40
srdi 8,7, 64-40
+#endif
or 0,0,8
ld 6,0(5)
std 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,7, 40
+ sldi 8,6, 64-40
+#else
sldi 0,7, 40
srdi 8,6, 64-40
+#endif
or 0,0,8
ld 7,8(5)
std 0,8(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 40
+ sldi 8,7, 64-40
+#else
sldi 0,6, 40
srdi 8,7, 64-40
+#endif
or 0,0,8
ld 6,16(5)
std 0,16(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,7, 40
+ sldi 8,6, 64-40
+#else
sldi 0,7, 40
srdi 8,6, 64-40
+#endif
or 0,0,8
ld 7,24(5)
std 0,24(4)
@@ -975,8 +1209,13 @@ L(du5_loop):
.align 4
L(du5_fini):
/* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 40
+ sldi 8,7, 64-40
+#else
sldi 0,6, 40
srdi 8,7, 64-40
+#endif
or 0,0,8
std 0,0(4)
b L(du_done)
@@ -986,13 +1225,23 @@ L(du6_do):
bf 30,L(du6_1dw)
/* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 48
+ sldi 8,7, 64-48
+#else
sldi 0,6, 48
srdi 8,7, 64-48
+#endif
or 0,0,8
ld 6,16(5)
std 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,7, 48
+ sldi 8,6, 64-48
+#else
sldi 0,7, 48
srdi 8,6, 64-48
+#endif
or 0,0,8
ld 7,24(5)
std 0,8(4)
@@ -1001,8 +1250,13 @@ L(du6_do):
blt cr6,L(du6_fini) /* if total DWs = 3, then bypass loop */
bf 31,L(du6_loop)
/* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 48
+ sldi 8,7, 64-48
+#else
sldi 0,6, 48
srdi 8,7, 64-48
+#endif
or 0,0,8
std 0,0(4)
mr 6,7
@@ -1013,8 +1267,13 @@ L(du6_do):
b L(du6_loop)
.align 4
L(du6_1dw):
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 48
+ sldi 8,7, 64-48
+#else
sldi 0,6, 48
srdi 8,7, 64-48
+#endif
addi 5,5,16
or 0,0,8
bf 31,L(du6_loop)
@@ -1026,23 +1285,43 @@ L(du6_1dw):
.align 4
/* copy 32 bytes at a time */
L(du6_loop):
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 48
+ sldi 8,7, 64-48
+#else
sldi 0,6, 48
srdi 8,7, 64-48
+#endif
or 0,0,8
ld 6,0(5)
std 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,7, 48
+ sldi 8,6, 64-48
+#else
sldi 0,7, 48
srdi 8,6, 64-48
+#endif
or 0,0,8
ld 7,8(5)
std 0,8(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 48
+ sldi 8,7, 64-48
+#else
sldi 0,6, 48
srdi 8,7, 64-48
+#endif
or 0,0,8
ld 6,16(5)
std 0,16(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,7, 48
+ sldi 8,6, 64-48
+#else
sldi 0,7, 48
srdi 8,6, 64-48
+#endif
or 0,0,8
ld 7,24(5)
std 0,24(4)
@@ -1052,8 +1331,13 @@ L(du6_loop):
.align 4
L(du6_fini):
/* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 48
+ sldi 8,7, 64-48
+#else
sldi 0,6, 48
srdi 8,7, 64-48
+#endif
or 0,0,8
std 0,0(4)
b L(du_done)
@@ -1063,13 +1347,23 @@ L(du7_do):
bf 30,L(du7_1dw)
/* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 56
+ sldi 8,7, 64-56
+#else
sldi 0,6, 56
srdi 8,7, 64-56
+#endif
or 0,0,8
ld 6,16(5)
std 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,7, 56
+ sldi 8,6, 64-56
+#else
sldi 0,7, 56
srdi 8,6, 64-56
+#endif
or 0,0,8
ld 7,24(5)
std 0,8(4)
@@ -1078,8 +1372,13 @@ L(du7_do):
blt cr6,L(du7_fini) /* if total DWs = 3, then bypass loop */
bf 31,L(du7_loop)
/* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 56
+ sldi 8,7, 64-56
+#else
sldi 0,6, 56
srdi 8,7, 64-56
+#endif
or 0,0,8
std 0,0(4)
mr 6,7
@@ -1090,8 +1389,13 @@ L(du7_do):
b L(du7_loop)
.align 4
L(du7_1dw):
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 56
+ sldi 8,7, 64-56
+#else
sldi 0,6, 56
srdi 8,7, 64-56
+#endif
addi 5,5,16
or 0,0,8
bf 31,L(du7_loop)
@@ -1103,23 +1407,43 @@ L(du7_1dw):
.align 4
/* copy 32 bytes at a time */
L(du7_loop):
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 56
+ sldi 8,7, 64-56
+#else
sldi 0,6, 56
srdi 8,7, 64-56
+#endif
or 0,0,8
ld 6,0(5)
std 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,7, 56
+ sldi 8,6, 64-56
+#else
sldi 0,7, 56
srdi 8,6, 64-56
+#endif
or 0,0,8
ld 7,8(5)
std 0,8(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 56
+ sldi 8,7, 64-56
+#else
sldi 0,6, 56
srdi 8,7, 64-56
+#endif
or 0,0,8
ld 6,16(5)
std 0,16(4)
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,7, 56
+ sldi 8,6, 64-56
+#else
sldi 0,7, 56
srdi 8,6, 64-56
+#endif
or 0,0,8
ld 7,24(5)
std 0,24(4)
@@ -1129,8 +1453,13 @@ L(du7_loop):
.align 4
L(du7_fini):
/* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+ srdi 0,6, 56
+ sldi 8,7, 64-56
+#else
sldi 0,6, 56
srdi 8,7, 64-56
+#endif
or 0,0,8
std 0,0(4)
b L(du_done)
diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S
index 800a9f1bb1..e8df75f593 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
@@ -23,418 +23,361 @@
/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
Returns 'dst'. */
+#define dst 11 /* Use r11 so r3 kept unchanged. */
+#define src 4
+#define cnt 5
+
.machine power7
EALIGN (memcpy, 5, 0)
CALL_MCOUNT 3
- cmpldi cr1,5,31
+ cmpldi cr1,cnt,31
neg 0,3
- std 3,-16(1)
- std 31,-8(1)
- cfi_offset(31,-8)
ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move
code. */
- andi. 11,3,7 /* Check alignment of DST. */
-
-
- clrldi 10,4,61 /* Check alignment of SRC. */
- cmpld cr6,10,11 /* SRC and DST alignments match? */
- mr 12,4
- mr 31,5
+#ifdef __LITTLE_ENDIAN__
+/* In little-endian mode, power7 takes an alignment trap on any lxvd2x
+ or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy
+ loop is only used for quadword aligned copies. */
+ andi. 10,3,15
+ clrldi 11,4,60
+#else
+ andi. 10,3,7 /* Check alignment of DST. */
+ clrldi 11,4,61 /* Check alignment of SRC. */
+#endif
+ cmpld cr6,10,11 /* SRC and DST alignments match? */
+
+ mr dst,3
bne cr6,L(copy_GE_32_unaligned)
+ beq L(aligned_copy)
- srdi 9,5,3 /* Number of full quadwords remaining. */
-
- beq L(copy_GE_32_aligned_cont)
-
- clrldi 0,0,61
- mtcrf 0x01,0
- subf 31,0,5
-
- /* Get the SRC aligned to 8 bytes. */
-
-1: bf 31,2f
- lbz 6,0(12)
- addi 12,12,1
- stb 6,0(3)
- addi 3,3,1
-2: bf 30,4f
- lhz 6,0(12)
- addi 12,12,2
- sth 6,0(3)
- addi 3,3,2
-4: bf 29,0f
- lwz 6,0(12)
- addi 12,12,4
- stw 6,0(3)
- addi 3,3,4
-0:
- clrldi 10,12,61 /* Check alignment of SRC again. */
- srdi 9,31,3 /* Number of full doublewords remaining. */
-
-L(copy_GE_32_aligned_cont):
-
- clrldi 11,31,61
- mtcrf 0x01,9
-
- srdi 8,31,5
- cmpldi cr1,9,4
- cmpldi cr6,11,0
- mr 11,12
-
- /* Copy 1~3 doublewords so the main loop starts
- at a multiple of 32 bytes. */
+ mtocrf 0x01,0
+#ifdef __LITTLE_ENDIAN__
+ clrldi 0,0,60
+#else
+ clrldi 0,0,61
+#endif
- bf 30,1f
- ld 6,0(12)
- ld 7,8(12)
- addi 11,12,16
- mtctr 8
- std 6,0(3)
- std 7,8(3)
- addi 10,3,16
- bf 31,4f
- ld 0,16(12)
- std 0,16(3)
- blt cr1,3f
- addi 11,12,24
- addi 10,3,24
- b 4f
-
- .align 4
-1: /* Copy 1 doubleword and set the counter. */
- mr 10,3
- mtctr 8
- bf 31,4f
- ld 6,0(12)
- addi 11,12,8
- std 6,0(3)
- addi 10,3,8
-
-L(aligned_copy):
- /* Main aligned copy loop. Copies up to 128-bytes at a time. */
- .align 4
+/* Get the DST and SRC aligned to 8 bytes (16 for little-endian). */
+1:
+ bf 31,2f
+ lbz 6,0(src)
+ addi src,src,1
+ stb 6,0(dst)
+ addi dst,dst,1
+2:
+ bf 30,4f
+ lhz 6,0(src)
+ addi src,src,2
+ sth 6,0(dst)
+ addi dst,dst,2
4:
- /* check for any 32-byte or 64-byte lumps that are outside of a
- nice 128-byte range. R8 contains the number of 32-byte
- lumps, so drop this into the CR, and use the SO/EQ bits to help
- handle the 32- or 64- byte lumps. Then handle the rest with an
- unrolled 128-bytes-at-a-time copy loop. */
- mtocrf 1,8
- li 6,16 # 16() index
- li 7,32 # 32() index
- li 8,48 # 48() index
-
-L(aligned_32byte):
- /* if the SO bit (indicating a 32-byte lump) is not set, move along. */
- bns cr7,L(aligned_64byte)
- lxvd2x 6,0,11
- lxvd2x 7,11,6
- addi 11,11,32
- stxvd2x 6,0,10
- stxvd2x 7,10,6
- addi 10,10,32
-
-L(aligned_64byte):
- /* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
- bne cr7,L(aligned_128setup)
- lxvd2x 6,0,11
- lxvd2x 7,11,6
- lxvd2x 8,11,7
- lxvd2x 9,11,8
- addi 11,11,64
- stxvd2x 6,0,10
- stxvd2x 7,10,6
- stxvd2x 8,10,7
- stxvd2x 9,10,8
- addi 10,10,64
-
-L(aligned_128setup):
- /* Set up for the 128-byte at a time copy loop. */
- srdi 8,31,7
- cmpdi 8,0 # Any 4x lumps left?
- beq 3f # if not, move along.
- lxvd2x 6,0,11
- lxvd2x 7,11,6
- mtctr 8 # otherwise, load the ctr and begin.
- li 8,48 # 48() index
+ bf 29,8f
+ lwz 6,0(src)
+ addi src,src,4
+ stw 6,0(dst)
+ addi dst,dst,4
+8:
+#ifdef __LITTLE_ENDIAN__
+ bf 28,16f
+ ld 6,0(src)
+ addi src,src,8
+ std 6,0(dst)
+ addi dst,dst,8
+16:
+#endif
+ subf cnt,0,cnt
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy):
+ li 6,16
+ li 7,32
+ li 8,48
+ mtocrf 0x02,cnt
+ srdi 12,cnt,7
+ cmpdi 12,0
+ beq L(aligned_tail)
+ lxvd2x 6,0,src
+ lxvd2x 7,src,6
+ mtctr 12
b L(aligned_128loop)
+ .align 4
L(aligned_128head):
/* for the 2nd + iteration of this loop. */
- lxvd2x 6,0,11
- lxvd2x 7,11,6
+ lxvd2x 6,0,src
+ lxvd2x 7,src,6
L(aligned_128loop):
- lxvd2x 8,11,7
- lxvd2x 9,11,8
- stxvd2x 6,0,10
- addi 11,11,64
- stxvd2x 7,10,6
- stxvd2x 8,10,7
- stxvd2x 9,10,8
- lxvd2x 6,0,11
- lxvd2x 7,11,6
- addi 10,10,64
- lxvd2x 8,11,7
- lxvd2x 9,11,8
- addi 11,11,64
- stxvd2x 6,0,10
- stxvd2x 7,10,6
- stxvd2x 8,10,7
- stxvd2x 9,10,8
- addi 10,10,64
+ lxvd2x 8,src,7
+ lxvd2x 9,src,8
+ stxvd2x 6,0,dst
+ addi src,src,64
+ stxvd2x 7,dst,6
+ stxvd2x 8,dst,7
+ stxvd2x 9,dst,8
+ lxvd2x 6,0,src
+ lxvd2x 7,src,6
+ addi dst,dst,64
+ lxvd2x 8,src,7
+ lxvd2x 9,src,8
+ addi src,src,64
+ stxvd2x 6,0,dst
+ stxvd2x 7,dst,6
+ stxvd2x 8,dst,7
+ stxvd2x 9,dst,8
+ addi dst,dst,64
bdnz L(aligned_128head)
-3:
- /* Check for tail bytes. */
- rldicr 0,31,0,60
- mtcrf 0x01,31
- beq cr6,0f
-
-.L9:
- add 3,3,0
- add 12,12,0
-
- /* At this point we have a tail of 0-7 bytes and we know that the
- destination is doubleword-aligned. */
-4: /* Copy 4 bytes. */
- bf 29,2f
-
- lwz 6,0(12)
- addi 12,12,4
- stw 6,0(3)
- addi 3,3,4
-2: /* Copy 2 bytes. */
- bf 30,1f
-
- lhz 6,0(12)
- addi 12,12,2
- sth 6,0(3)
- addi 3,3,2
-1: /* Copy 1 byte. */
- bf 31,0f
-
- lbz 6,0(12)
- stb 6,0(3)
-0: /* Return original DST pointer. */
- ld 31,-8(1)
- ld 3,-16(1)
+L(aligned_tail):
+ mtocrf 0x01,cnt
+ bf 25,32f
+ lxvd2x 6,0,src
+ lxvd2x 7,src,6
+ lxvd2x 8,src,7
+ lxvd2x 9,src,8
+ addi src,src,64
+ stxvd2x 6,0,dst
+ stxvd2x 7,dst,6
+ stxvd2x 8,dst,7
+ stxvd2x 9,dst,8
+ addi dst,dst,64
+32:
+ bf 26,16f
+ lxvd2x 6,0,src
+ lxvd2x 7,src,6
+ addi src,src,32
+ stxvd2x 6,0,dst
+ stxvd2x 7,dst,6
+ addi dst,dst,32
+16:
+ bf 27,8f
+ lxvd2x 6,0,src
+ addi src,src,16
+ stxvd2x 6,0,dst
+ addi dst,dst,16
+8:
+ bf 28,4f
+ ld 6,0(src)
+ addi src,src,8
+ std 6,0(dst)
+ addi dst,dst,8
+4: /* Copies 4~7 bytes. */
+ bf 29,L(tail2)
+ lwz 6,0(src)
+ stw 6,0(dst)
+ bf 30,L(tail5)
+ lhz 7,4(src)
+ sth 7,4(dst)
+ bflr 31
+ lbz 8,6(src)
+ stb 8,6(dst)
+ /* Return original DST pointer. */
blr
- /* Handle copies of 0~31 bytes. */
- .align 4
+
+/* Handle copies of 0~31 bytes. */
+ .align 4
L(copy_LT_32):
- cmpldi cr6,5,8
- mr 12,4
- mtcrf 0x01,5
+ mr dst,3
+ cmpldi cr6,cnt,8
+ mtocrf 0x01,cnt
ble cr6,L(copy_LE_8)
/* At least 9 bytes to go. */
neg 8,4
- clrrdi 11,4,2
- andi. 0,8,3
- cmpldi cr1,5,16
- mr 10,5
+ andi. 0,8,3
+ cmpldi cr1,cnt,16
beq L(copy_LT_32_aligned)
- /* Force 4-bytes alignment for SRC. */
- mtocrf 0x01,0
- subf 10,0,5
-2: bf 30,1f
-
- lhz 6,0(12)
- addi 12,12,2
- sth 6,0(3)
- addi 3,3,2
-1: bf 31,L(end_4bytes_alignment)
-
- lbz 6,0(12)
- addi 12,12,1
- stb 6,0(3)
- addi 3,3,1
-
- .align 4
+ /* Force 4-byte alignment for SRC. */
+ mtocrf 0x01,0
+ subf cnt,0,cnt
+2:
+ bf 30,1f
+ lhz 6,0(src)
+ addi src,src,2
+ sth 6,0(dst)
+ addi dst,dst,2
+1:
+ bf 31,L(end_4bytes_alignment)
+ lbz 6,0(src)
+ addi src,src,1
+ stb 6,0(dst)
+ addi dst,dst,1
+
+ .align 4
L(end_4bytes_alignment):
- cmpldi cr1,10,16
- mtcrf 0x01,10
+ cmpldi cr1,cnt,16
+ mtocrf 0x01,cnt
L(copy_LT_32_aligned):
/* At least 6 bytes to go, and SRC is word-aligned. */
blt cr1,8f
/* Copy 16 bytes. */
- lwz 6,0(12)
- lwz 7,4(12)
- stw 6,0(3)
- lwz 8,8(12)
- stw 7,4(3)
- lwz 6,12(12)
- addi 12,12,16
- stw 8,8(3)
- stw 6,12(3)
- addi 3,3,16
+ lwz 6,0(src)
+ lwz 7,4(src)
+ stw 6,0(dst)
+ lwz 8,8(src)
+ stw 7,4(dst)
+ lwz 6,12(src)
+ addi src,src,16
+ stw 8,8(dst)
+ stw 6,12(dst)
+ addi dst,dst,16
8: /* Copy 8 bytes. */
- bf 28,4f
+ bf 28,L(tail4)
+ lwz 6,0(src)
+ lwz 7,4(src)
+ addi src,src,8
+ stw 6,0(dst)
+ stw 7,4(dst)
+ addi dst,dst,8
+
+ .align 4
+/* Copies 4~7 bytes. */
+L(tail4):
+ bf 29,L(tail2)
+ lwz 6,0(src)
+ stw 6,0(dst)
+ bf 30,L(tail5)
+ lhz 7,4(src)
+ sth 7,4(dst)
+ bflr 31
+ lbz 8,6(src)
+ stb 8,6(dst)
+ /* Return original DST pointer. */
+ blr
- lwz 6,0(12)
- lwz 7,4(12)
- addi 12,12,8
- stw 6,0(3)
- stw 7,4(3)
- addi 3,3,8
-4: /* Copy 4 bytes. */
- bf 29,2f
-
- lwz 6,0(12)
- addi 12,12,4
- stw 6,0(3)
- addi 3,3,4
-2: /* Copy 2-3 bytes. */
+ .align 4
+/* Copies 2~3 bytes. */
+L(tail2):
bf 30,1f
-
- lhz 6,0(12)
- sth 6,0(3)
- bf 31,0f
- lbz 7,2(12)
- stb 7,2(3)
- ld 3,-16(1)
+ lhz 6,0(src)
+ sth 6,0(dst)
+ bflr 31
+ lbz 7,2(src)
+ stb 7,2(dst)
blr
- .align 4
-1: /* Copy 1 byte. */
- bf 31,0f
+ .align 4
+L(tail5):
+ bflr 31
+ lbz 6,4(src)
+ stb 6,4(dst)
+ blr
- lbz 6,0(12)
- stb 6,0(3)
-0: /* Return original DST pointer. */
- ld 3,-16(1)
+ .align 4
+1:
+ bflr 31
+ lbz 6,0(src)
+ stb 6,0(dst)
+ /* Return original DST pointer. */
blr
- /* Handles copies of 0~8 bytes. */
- .align 4
+
+/* Handles copies of 0~8 bytes. */
+ .align 4
L(copy_LE_8):
- bne cr6,4f
+ bne cr6,L(tail4)
/* Though we could've used ld/std here, they are still
slow for unaligned cases. */
- lwz 6,0(4)
- lwz 7,4(4)
- stw 6,0(3)
- stw 7,4(3)
- ld 3,-16(1) /* Return original DST pointers. */
+ lwz 6,0(src)
+ lwz 7,4(src)
+ stw 6,0(dst)
+ stw 7,4(dst)
blr
- .align 4
-4: /* Copies 4~7 bytes. */
- bf 29,2b
-
- lwz 6,0(4)
- stw 6,0(3)
- bf 30,5f
- lhz 7,4(4)
- sth 7,4(3)
- bf 31,0f
- lbz 8,6(4)
- stb 8,6(3)
- ld 3,-16(1)
- blr
-
- .align 4
-5: /* Copy 1 byte. */
- bf 31,0f
-
- lbz 6,4(4)
- stb 6,4(3)
-
-0: /* Return original DST pointer. */
- ld 3,-16(1)
- blr
- /* Handle copies of 32+ bytes where DST is aligned (to quadword) but
- SRC is not. Use aligned quadword loads from SRC, shifted to realign
- the data, allowing for aligned DST stores. */
- .align 4
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+ SRC is not. Use aligned quadword loads from SRC, shifted to realign
+ the data, allowing for aligned DST stores. */
+ .align 4
L(copy_GE_32_unaligned):
- clrldi 0,0,60 /* Number of bytes until the 1st
- quadword. */
- andi. 11,3,15 /* Check alignment of DST (against
- quadwords). */
- srdi 9,5,4 /* Number of full quadwords remaining. */
+ clrldi 0,0,60 /* Number of bytes until the 1st dst quadword. */
+#ifndef __LITTLE_ENDIAN__
+ andi. 10,3,15 /* Check alignment of DST (against quadwords). */
+#endif
+ srdi 9,cnt,4 /* Number of full quadwords remaining. */
beq L(copy_GE_32_unaligned_cont)
- /* SRC is not quadword aligned, get it aligned. */
+ /* DST is not quadword aligned, get it aligned. */
- mtcrf 0x01,0
- subf 31,0,5
+ mtocrf 0x01,0
+ subf cnt,0,cnt
/* Vector instructions work best when proper alignment (16-bytes)
is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
-1: /* Copy 1 byte. */
+1:
bf 31,2f
-
- lbz 6,0(12)
- addi 12,12,1
- stb 6,0(3)
- addi 3,3,1
-2: /* Copy 2 bytes. */
+ lbz 6,0(src)
+ addi src,src,1
+ stb 6,0(dst)
+ addi dst,dst,1
+2:
bf 30,4f
-
- lhz 6,0(12)
- addi 12,12,2
- sth 6,0(3)
- addi 3,3,2
-4: /* Copy 4 bytes. */
+ lhz 6,0(src)
+ addi src,src,2
+ sth 6,0(dst)
+ addi dst,dst,2
+4:
bf 29,8f
-
- lwz 6,0(12)
- addi 12,12,4
- stw 6,0(3)
- addi 3,3,4
-8: /* Copy 8 bytes. */
+ lwz 6,0(src)
+ addi src,src,4
+ stw 6,0(dst)
+ addi dst,dst,4
+8:
bf 28,0f
-
- ld 6,0(12)
- addi 12,12,8
- std 6,0(3)
- addi 3,3,8
+ ld 6,0(src)
+ addi src,src,8
+ std 6,0(dst)
+ addi dst,dst,8
0:
- clrldi 10,12,60 /* Check alignment of SRC. */
- srdi 9,31,4 /* Number of full quadwords remaining. */
+ srdi 9,cnt,4 /* Number of full quadwords remaining. */
/* The proper alignment is present, it is OK to copy the bytes now. */
L(copy_GE_32_unaligned_cont):
/* Setup two indexes to speed up the indexed vector operations. */
- clrldi 11,31,60
- li 6,16 /* Index for 16-bytes offsets. */
+ clrldi 10,cnt,60
+ li 6,16 /* Index for 16-bytes offsets. */
li 7,32 /* Index for 32-bytes offsets. */
- cmpldi cr1,11,0
- srdi 8,31,5 /* Setup the loop counter. */
- mr 10,3
- mr 11,12
- mtcrf 0x01,9
- cmpldi cr6,9,1
- lvsl 5,0,12
- lvx 3,0,12
- bf 31,L(setup_unaligned_loop)
-
- /* Copy another 16 bytes to align to 32-bytes due to the loop . */
- lvx 4,12,6
- vperm 6,3,4,5
- addi 11,12,16
- addi 10,3,16
- stvx 6,0,3
+ cmpldi cr1,10,0
+ srdi 8,cnt,5 /* Setup the loop counter. */
+ mtocrf 0x01,9
+ cmpldi cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+ lvsr 5,0,src
+#else
+ lvsl 5,0,src
+#endif
+ lvx 3,0,src
+ li 0,0
+ bf 31,L(setup_unaligned_loop)
+
+ /* Copy another 16 bytes to align to 32-bytes due to the loop. */
+ lvx 4,src,6
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
+ addi src,src,16
+ stvx 6,0,dst
+ addi dst,dst,16
vor 3,4,4
+ clrrdi 0,src,60
L(setup_unaligned_loop):
- mtctr 8
- ble cr6,L(end_unaligned_loop)
+ mtctr 8
+ ble cr6,L(end_unaligned_loop)
/* Copy 32 bytes at a time using vector instructions. */
- .align 4
+ .align 4
L(unaligned_loop):
/* Note: vr6/vr10 may contain data that was already copied,
@@ -442,62 +385,55 @@ L(unaligned_loop):
some portions again. This is faster than having unaligned
vector instructions though. */
- lvx 4,11,6 /* vr4 = r11+16. */
- vperm 6,3,4,5 /* Merge the correctly-aligned portions
- of vr3/vr4 into vr6. */
- lvx 3,11,7 /* vr3 = r11+32. */
- vperm 10,4,3,5 /* Merge the correctly-aligned portions
- of vr3/vr4 into vr10. */
- addi 11,11,32
- stvx 6,0,10
- stvx 10,10,6
- addi 10,10,32
-
+ lvx 4,src,6
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
+ lvx 3,src,7
+#ifdef __LITTLE_ENDIAN__
+ vperm 10,3,4,5
+#else
+ vperm 10,4,3,5
+#endif
+ addi src,src,32
+ stvx 6,0,dst
+ stvx 10,dst,6
+ addi dst,dst,32
bdnz L(unaligned_loop)
- .align 4
+ clrrdi 0,src,60
+
+ .align 4
L(end_unaligned_loop):
/* Check for tail bytes. */
- rldicr 0,31,0,59
- mtcrf 0x01,31
- beq cr1,0f
+ mtocrf 0x01,cnt
+ beqlr cr1
- add 3,3,0
- add 12,12,0
+ add src,src,0
/* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
-8: /* Copy 8 bytes. */
+ /* Copy 8 bytes. */
bf 28,4f
-
- lwz 6,0(12)
- lwz 7,4(12)
- addi 12,12,8
- stw 6,0(3)
- stw 7,4(3)
- addi 3,3,8
-4: /* Copy 4 bytes. */
- bf 29,2f
-
- lwz 6,0(12)
- addi 12,12,4
- stw 6,0(3)
- addi 3,3,4
-2: /* Copy 2~3 bytes. */
- bf 30,1f
-
- lhz 6,0(12)
- addi 12,12,2
- sth 6,0(3)
- addi 3,3,2
-1: /* Copy 1 byte. */
- bf 31,0f
-
- lbz 6,0(12)
- stb 6,0(3)
-0: /* Return original DST pointer. */
- ld 31,-8(1)
- ld 3,-16(1)
+ lwz 6,0(src)
+ lwz 7,4(src)
+ addi src,src,8
+ stw 6,0(dst)
+ stw 7,4(dst)
+ addi dst,dst,8
+4: /* Copy 4~7 bytes. */
+ bf 29,L(tail2)
+ lwz 6,0(src)
+ stw 6,0(dst)
+ bf 30,L(tail5)
+ lhz 7,4(src)
+ sth 7,4(dst)
+ bflr 31
+ lbz 8,6(src)
+ stb 8,6(dst)
+ /* Return original DST pointer. */
blr
END_GEN_TB (memcpy,TB_TOCLESS)
diff --git a/sysdeps/powerpc/powerpc64/power7/mempcpy.S b/sysdeps/powerpc/powerpc64/power7/mempcpy.S
index f20be938d2..b93ab7da52 100644
--- a/sysdeps/powerpc/powerpc64/power7/mempcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/mempcpy.S
@@ -365,13 +365,21 @@ L(copy_GE_32_unaligned_cont):
mr 11,12
mtcrf 0x01,9
cmpldi cr6,9,1
- lvsl 5,0,12
+#ifdef __LITTLE_ENDIAN__
+ lvsr 5,0,12
+#else
+ lvsl 5,0,12
+#endif
lvx 3,0,12
bf 31,L(setup_unaligned_loop)
/* Copy another 16 bytes to align to 32-bytes due to the loop . */
lvx 4,12,6
- vperm 6,3,4,5
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
addi 11,12,16
addi 10,3,16
stvx 6,0,3
@@ -391,11 +399,17 @@ L(unaligned_loop):
vector instructions though. */
lvx 4,11,6 /* vr4 = r11+16. */
- vperm 6,3,4,5 /* Merge the correctly-aligned portions
- of vr3/vr4 into vr6. */
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
lvx 3,11,7 /* vr3 = r11+32. */
- vperm 10,4,3,5 /* Merge the correctly-aligned portions
- of vr3/vr4 into vr10. */
+#ifdef __LITTLE_ENDIAN__
+ vperm 10,3,4,5
+#else
+ vperm 10,4,3,5
+#endif
addi 11,11,32
stvx 6,0,10
stvx 10,10,6