author	Wilco Dijkstra <wdijkstr@arm.com>	2018-11-20 12:37:00 +0000
committer	Wilco Dijkstra <wdijkstr@arm.com>	2018-11-20 12:37:00 +0000
commit	5770c0ad1e0c784e817464ca2cf9436a58c9beb7 (patch)
tree	6616d15f2d44823b4c70b0fe607b4c7927fe45ac /sysdeps/aarch64
parent	9a62a9397d0a25643922d8d053f04ee895100d9a (diff)
download	glibc-5770c0ad1e0c784e817464ca2cf9436a58c9beb7.tar.gz
[AArch64] Adjust writeback in non-zero memset
This fixes an inefficiency in the non-zero memset.  Delaying the writeback
until the end of the loop is slightly faster on some cores - this shows ~5%
performance gain on Cortex-A53 when doing large non-zero memsets.

	* sysdeps/aarch64/memset.S (MEMSET): Improve non-zero memset loop.
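In short, the patch moves the pointer writeback from the first store of the loop to the last one, so the address update is off the critical path of the stores. A condensed sketch of the inner loop before and after, taken from the hunk below (register names as in memset.S; the subs/b.hi pair actually sits under the shared L(tail64) label):

	/* Before: post-index writeback on the first store.  */
1:	stp	q0, q0, [dst], 64	/* store 32 bytes, then dst += 64 */
	stp	q0, q0, [dst, -32]	/* store the other 32 bytes */
	subs	count, count, 64
	b.hi	1b

	/* After: dst enters the loop biased by -32 and the pre-index
	   writeback on the second store advances it by 64.  */
1:	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]!
	subs	count, count, 64
	b.hi	1b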
Diffstat (limited to 'sysdeps/aarch64')
-rw-r--r--	sysdeps/aarch64/memset.S	|	7
1 file changed, 4 insertions, 3 deletions
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index 4a45459361..9738cf5fd5 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -89,10 +89,10 @@ L(set_long):
 	b.eq	L(try_zva)
 L(no_zva):
 	sub	count, dstend, dst	/* Count is 16 too large.  */
-	add	dst, dst, 16
+	sub	dst, dst, 16		/* Dst is biased by -32.  */
 	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
-1:	stp	q0, q0, [dst], 64
-	stp	q0, q0, [dst, -32]
+1:	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]!
 L(tail64):
 	subs	count, count, 64
 	b.hi	1b
@@ -183,6 +183,7 @@ L(zva_other):
 	subs	count, count, zva_len
 	b.hs	3b
 4:	add	count, count, zva_len
+	sub	dst, dst, 32		/* Bias dst for tail loop.  */
 	b	L(tail64)
 #endif
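The second hunk keeps the two code paths consistent: L(tail64) is shared, and after this change its loop stores through a dst that is biased by -32, so the ZVA fallback path applies the same bias before branching to it.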