author     Lynn Boger <laboger@linux.vnet.ibm.com>  2018-10-05 14:21:39 -0400
committer  Lynn Boger <laboger@linux.vnet.ibm.com>  2018-11-06 14:54:59 +0000
commit     aa9bcea3907a74f45303b3bdb603b9952cc72b7b (patch)
tree       55dbad3c975d51993c099d374c29f88230d301e3 /src/runtime/memclr_ppc64x.s
parent     e1978a2d7a6deac29aa778a17a1cbea25586abc6 (diff)
runtime: improve performance of memclr, memmove on ppc64x
This improves the asm implementations of memmove and memclr on ppc64x
through the use of VSX loads and stores when the size is >= 32 bytes.
For memclr, dcbz is used when the size is >= 512 and the address is
aligned to 128.

Memclr/64       13.3ns ± 0%    10.7ns ± 0%   -19.55%  (p=0.000 n=8+7)
Memclr/96       14.9ns ± 0%    11.4ns ± 0%   -23.49%  (p=0.000 n=8+8)
Memclr/128      16.3ns ± 0%    12.3ns ± 0%   -24.54%  (p=0.000 n=8+8)
Memclr/160      17.3ns ± 0%    13.0ns ± 0%   -24.86%  (p=0.000 n=8+8)
Memclr/256      20.0ns ± 0%    15.3ns ± 0%   -23.62%  (p=0.000 n=8+8)
Memclr/512      34.2ns ± 0%    10.2ns ± 0%   -70.20%  (p=0.000 n=8+8)
Memclr/4096      178ns ± 0%      23ns ± 0%   -87.13%  (p=0.000 n=8+8)
Memclr/65536    2.67µs ± 0%    0.30µs ± 0%   -88.89%  (p=0.000 n=7+8)
Memclr/1M       43.2µs ± 0%    10.0µs ± 0%   -76.85%  (p=0.000 n=8+8)
Memclr/4M        173µs ± 0%      40µs ± 0%   -76.88%  (p=0.000 n=8+8)
Memclr/8M        349µs ± 0%      82µs ± 0%   -76.58%  (p=0.000 n=8+8)
Memclr/16M       701µs ± 7%     672µs ± 0%    -4.05%  (p=0.040 n=8+7)
Memclr/64M      2.70ms ± 0%    2.67ms ± 0%    -0.96%  (p=0.000 n=8+7)
Memmove/32      6.59ns ± 0%    5.84ns ± 0%   -11.34%  (p=0.029 n=4+4)
Memmove/64      7.91ns ± 0%    6.97ns ± 0%   -11.92%  (p=0.029 n=4+4)
Memmove/128     10.5ns ± 0%     8.8ns ± 0%   -16.24%  (p=0.029 n=4+4)
Memmove/256     21.0ns ± 0%    12.9ns ± 0%   -38.57%  (p=0.029 n=4+4)
Memmove/512     28.4ns ± 0%    26.2ns ± 0%    -7.75%  (p=0.029 n=4+4)
Memmove/1024    48.2ns ± 1%    39.4ns ± 0%   -18.26%  (p=0.029 n=4+4)
Memmove/2048    85.4ns ± 0%    69.0ns ± 0%   -19.20%  (p=0.029 n=4+4)
Memmove/4096     159ns ± 0%     128ns ± 0%   -19.50%  (p=0.029 n=4+4)

Change-Id: I8c1adf88790845bf31444a15249456006eb5bf8b
Reviewed-on: https://go-review.googlesource.com/c/141217
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Michael Munday <mike.munday@ibm.com>
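The numbers above come from the runtime's own Memclr/Memmove benchmarks (old vs. new columns). As a rough standalone illustration of how ordinary Go code ends up in memclrNoHeapPointers, the sketch below benchmarks a zeroing range loop over a byte slice, a pattern the compiler typically lowers to a memclr call for pointer-free element types. The benchmark name and sizes are illustrative only and are not part of this change.

package memclr_test

import (
	"fmt"
	"testing"
)

// BenchmarkClear zeroes pointer-free byte slices with a range loop.
// The compiler recognizes this zeroing pattern and lowers it to a call
// into runtime.memclrNoHeapPointers, the routine this commit optimizes.
func BenchmarkClear(b *testing.B) {
	for _, n := range []int{64, 512, 4096, 1 << 20} {
		buf := make([]byte, n)
		b.Run(fmt.Sprint(n), func(b *testing.B) {
			b.SetBytes(int64(n))
			for i := 0; i < b.N; i++ {
				for j := range buf {
					buf[j] = 0
				}
			}
		})
	}
}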
Diffstat (limited to 'src/runtime/memclr_ppc64x.s')
-rw-r--r--  src/runtime/memclr_ppc64x.s  | 129
1 file changed, 115 insertions(+), 14 deletions(-)
diff --git a/src/runtime/memclr_ppc64x.s b/src/runtime/memclr_ppc64x.s
index 3b23ce89d8..072963f756 100644
--- a/src/runtime/memclr_ppc64x.s
+++ b/src/runtime/memclr_ppc64x.s
@@ -14,34 +14,68 @@ TEXT runtime·memclrNoHeapPointers(SB), NOSPLIT|NOFRAME, $0-16
// Determine if there are doublewords to clear
check:
ANDCC $7, R4, R5 // R5: leftover bytes to clear
- SRAD $3, R4, R6 // R6: double words to clear
+ SRD $3, R4, R6 // R6: double words to clear
CMP R6, $0, CR1 // CR1[EQ] set if no double words
- BC 12, 6, nozerolarge // only single bytes
- MOVD R6, CTR // R6 = number of double words
- SRADCC $2, R6, R7 // 32 byte chunks?
- BNE zero32setup
+ BC 12, 6, nozerolarge // only single bytes
+ CMP R4, $512
+ BLT under512 // special case for < 512
+ ANDCC $127, R3, R8 // check for 128 alignment of address
+ BEQ zero512setup
+
+ ANDCC $7, R3, R15
+ BEQ zero512xsetup // at least 8 byte aligned
+
+ // zero bytes up to 8 byte alignment
+
+ ANDCC $1, R3, R15 // check for byte alignment
+ BEQ byte2
+ MOVB R0, 0(R3) // zero 1 byte
+ ADD $1, R3 // bump ptr by 1
+ ADD $-1, R4
+
+byte2:
+ ANDCC $2, R3, R15 // check for 2 byte alignment
+ BEQ byte4
+ MOVH R0, 0(R3) // zero 2 bytes
+ ADD $2, R3 // bump ptr by 2
+ ADD $-2, R4
+
+byte4:
+ ANDCC $4, R3, R15 // check for 4 byte alignment
+ BEQ zero512xsetup
+ MOVW R0, 0(R3) // zero 4 bytes
+ ADD $4, R3 // bump ptr by 4
+ ADD $-4, R4
+ BR zero512xsetup // ptr should now be 8 byte aligned
+
+under512:
+ MOVD R6, CTR // R6 = number of double words
+ SRDCC $2, R6, R7 // 32 byte chunks?
+ BNE zero32setup
// Clear double words
zero8:
MOVD R0, 0(R3) // double word
ADD $8, R3
+ ADD $-8, R4
BC 16, 0, zero8 // dec ctr, br zero8 if ctr not 0
- BR nozerolarge // handle remainder
+ BR nozerolarge // handle leftovers
// Prepare to clear 32 bytes at a time.
zero32setup:
- DCBTST (R3) // prepare data cache
- MOVD R7, CTR // number of 32 byte chunks
+ DCBTST (R3) // prepare data cache
+ XXLXOR VS32, VS32, VS32 // clear VS32 (V0)
+ MOVD R7, CTR // number of 32 byte chunks
+ MOVD $16, R8
zero32:
- MOVD R0, 0(R3) // clear 4 double words
- MOVD R0, 8(R3)
- MOVD R0, 16(R3)
- MOVD R0, 24(R3)
+ STXVD2X VS32, (R3+R0) // store 16 bytes
+ STXVD2X VS32, (R3+R8)
ADD $32, R3
+ ADD $-32, R4
BC 16, 0, zero32 // dec ctr, br zero32 if ctr not 0
RLDCLCC $61, R4, $3, R6 // remaining doublewords
BEQ nozerolarge
@@ -49,8 +83,8 @@ zero32:
BR zero8
nozerolarge:
- CMP R5, $0 // any remaining bytes
- BC 4, 1, LR // ble lr
+ ANDCC $7, R4, R5 // any remaining bytes
+ BC 4, 1, LR // ble lr
zerotail:
MOVD R5, CTR // set up to clear tail bytes
@@ -60,3 +94,70 @@ zerotailloop:
ADD $1, R3
BC 16, 0, zerotailloop // dec ctr, br zerotailloop if ctr not 0
RET
+
+zero512xsetup: // 512 chunk with extra needed
+ ANDCC $8, R3, R11 // 8 byte alignment?
+ BEQ zero512setup16
+ MOVD R0, 0(R3) // clear 8 bytes
+ ADD $8, R3 // update ptr to next 8
+ ADD $-8, R4 // dec count by 8
+
+zero512setup16:
+ ANDCC $127, R3, R14 // < 128 byte alignment
+ BEQ zero512setup // handle 128 byte alignment
+ MOVD $128, R15
+ SUB R14, R15, R14 // find increment to 128 alignment
+ SRD $4, R14, R15 // number of 16 byte chunks
+
+zero512presetup:
+ MOVD R15, CTR // loop counter of 16 bytes
+ XXLXOR VS32, VS32, VS32 // clear VS32 (V0)
+
+zero512preloop: // clear up to 128 alignment
+ STXVD2X VS32, (R3+R0) // clear 16 bytes
+ ADD $16, R3 // update ptr
+ ADD $-16, R4 // dec count
+ BC 16, 0, zero512preloop
+
+zero512setup: // setup for dcbz loop
+ CMP R4, $512 // check if at least 512
+ BLT remain
+ SRD $9, R4, R8 // loop count for 512 chunks
+ MOVD R8, CTR // set up counter
+ MOVD $128, R9 // index regs for 128 bytes
+ MOVD $256, R10
+ MOVD $384, R11
+
+zero512:
+ DCBZ (R3+R0) // clear first chunk
+ DCBZ (R3+R9) // clear second chunk
+ DCBZ (R3+R10) // clear third chunk
+ DCBZ (R3+R11) // clear fourth chunk
+ ADD $512, R3
+ ADD $-512, R4
+ BC 16, 0, zero512
+
+remain:
+ CMP R4, $128 // check if 128 byte chunks left
+ BLT smaller
+ DCBZ (R3+R0) // clear 128
+ ADD $128, R3
+ ADD $-128, R4
+ BR remain
+
+smaller:
+ ANDCC $127, R4, R7 // find leftovers
+ BEQ done
+ CMP R7, $64 // more than 64, do 32 at a time
+ BLT zero8setup // less than 64, do 8 at a time
+ SRD $5, R7, R7 // set up counter for 32
+ BR zero32setup
+
+zero8setup:
+ SRDCC $3, R7, R7 // less than 8 bytes
+ BEQ nozerolarge
+ MOVD R7, CTR
+ BR zero8
+
+done:
+ RET
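Read as plain control flow, the new routine clears small leading pieces until the address is 128-byte aligned, zeroes whole cache lines with dcbz in 512-byte strides, and then falls back to progressively smaller stores (32-byte VSX pairs, 8-byte doublewords, single bytes) for whatever remains. The Go function below is only a sketch of that strategy; the name memclrSketch is invented here, and the dcbz and STXVD2X work is replaced by plain byte loops so that only the structure of the assembly is visible.

package main

import (
	"fmt"
	"unsafe"
)

// memclrSketch mirrors the shape of the ppc64x assembly above, for
// illustration only: cache-line and vector stores become byte loops,
// and only the control flow (alignment, big chunks, shrinking tails)
// is kept.
func memclrSketch(b []byte) {
	zero := func(n int) { // clear the next n bytes, then advance
		for i := 0; i < n; i++ {
			b[i] = 0
		}
		b = b[n:]
	}
	if len(b) >= 512 {
		// Walk the pointer up to 128-byte alignment, as the assembly
		// does with 1/2/4/8-byte stores followed by 16-byte VSX stores.
		addr := uintptr(unsafe.Pointer(&b[0]))
		zero(int((128 - addr%128) % 128))
		// Main loop: 512 bytes per iteration; the assembly issues four
		// DCBZ instructions, each zeroing one 128-byte cache line.
		for len(b) >= 512 {
			zero(512)
		}
		// Remaining whole cache lines, one DCBZ each in the assembly.
		for len(b) >= 128 {
			zero(128)
		}
	}
	// Small sizes and leftovers: 32-byte chunks (two 16-byte VSX stores
	// in the assembly), then 8-byte doublewords, then the byte tail.
	for len(b) >= 32 {
		zero(32)
	}
	for len(b) >= 8 {
		zero(8)
	}
	zero(len(b))
}

func main() {
	buf := make([]byte, 4096)
	for i := range buf {
		buf[i] = 0xff
	}
	memclrSketch(buf)
	fmt.Println(buf[0], buf[len(buf)-1]) // 0 0
}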