math: use SIMD to accelerate some scalar math functions on s390x

Note, most math functions are structured to use stubs, so that they can be accelerated with assembly on any platform. Sinh, cosh, and tanh were not structured with stubs, so this CL does that. This set of routines was chosen as likely to produce good speedups with assembly on any platform. Technique used was minimax polynomial approximation using tables of polynomial coefficients, with argument range reduction. A table of scaling factors was also used for cosh and log10. before after speedup BenchmarkCos 22.1 ns/op 6.79 ns/op 3.25x BenchmarkCosh 125 ns/op 11.7 ns/op 10.68x BenchmarkLog10 48.4 ns/op 12.5 ns/op 3.87x BenchmarkSin 22.2 ns/op 6.55 ns/op 3.39x BenchmarkSinh 125 ns/op 14.2 ns/op 8.80x BenchmarkTanh 65.0 ns/op 15.1 ns/op 4.30x Accuracy was tested against a high precision reference function to determine maximum error. Approximately 4,000,000 points were tested for each function, producing the following result. Note: ulperr is error in "units in the last place" max ulperr sin 1.43 (returns NaN beyond +-2^50) cos 1.79 (returns NaN beyond +-2^50) cosh 1.05 sinh 3.02 tanh 3.69 log10 1.75 Also includes a set of tests to test non-vector functions even when SIMD is enabled Change-Id: Icb45f14d00864ee19ed973d209c3af21e4df4edc Reviewed-on: https://go-review.googlesource.com/32352 Run-TryBot: Michael Munday <munday@ca.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Michael Munday <munday@ca.ibm.com>
author: Bill O'Farrell <billo@ca.ibm.com> 2016-10-30 00:11:37 -0400
committer: Michael Munday <munday@ca.ibm.com> 2016-11-11 20:20:23 +0000
commit: b6a15683f0c4d177b3711b55724506aebb03f764 (patch)
tree: 9b8a4802f885983a80eb9d7d647b4ca42cbec3db /src/math/stubs_s390x.s
parent: 9f9d83404f938a0dfb98d3f4a4d420261606069a (diff)
download: go-git-b6a15683f0c4d177b3711b55724506aebb03f764.tar.gz
1 files changed, 149 insertions, 9 deletions
diff --git a/src/math/stubs_s390x.s b/src/math/stubs_s390x.s
index c3aed13e87..8da55c54ab 100644
--- a/src/math/stubs_s390x.s
+++ b/src/math/stubs_s390x.s
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include "../runtime/textflag.h"
+#include "textflag.h"
 
 TEXT ·Asin(SB),NOSPLIT,$0
 	BR ·asin(SB)
@@ -34,9 +34,6 @@ TEXT ·Hypot(SB),NOSPLIT,$0
 TEXT ·Ldexp(SB),NOSPLIT,$0
 	BR ·ldexp(SB)
 
-TEXT ·Log10(SB),NOSPLIT,$0
-	BR ·log10(SB)
-
 TEXT ·Log2(SB),NOSPLIT,$0
 	BR ·log2(SB)
 
@@ -58,11 +55,154 @@ TEXT ·Remainder(SB),NOSPLIT,$0
 TEXT ·Sincos(SB),NOSPLIT,$0
 	BR ·sincos(SB)
 
-TEXT ·Sin(SB),NOSPLIT,$0
-	BR ·sin(SB)
+TEXT ·Tan(SB),NOSPLIT,$0 // exported entry point for Tan
+	BR ·tan(SB) // branch directly to ·tan; no dispatch pointer is used for Tan
+
+// hasVectorFacility stores 1 in ret when STFLE reports facility bit 129 (the z/Architecture vector facility) and a vector-register write/read round trip succeeds; otherwise it stores 0.
+TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1
+	MOVD    $x-24(SP), R1     // R1 = address of the 24-byte frame buffer
+	XC      $24, 0(R1), 0(R1) // clear the storage
+	MOVD    $2, R0            // R0 is the number of double words stored -1
+	WORD    $0xB2B01000       // STFLE 0(R1)
+	XOR     R0, R0            // restore R0 to zero after using it as the STFLE operand
+	MOVBZ   z-8(SP), R1       // byte 16 of the buffer, i.e. facility bits 128-135
+	AND     $0x40, R1         // keep only bit 129 (second-highest bit of that byte)
+	BEQ     novector          // bit clear: report no vector support
+vectorinstalled:
+	// check that vector instructions are actually usable, not just reported as installed
+	VLEIB   $0, $0xF, V16     // put 0xF into byte element 0 of V16
+	VLGVB   $0, V16, R1       // read that byte element back into R1
+	CMPBNE  R1, $0xF, novector // value did not survive the round trip: report no vector support
+	MOVB    $1, ret+0(FP) // have vx
+	RET
+novector:
+	MOVB    $0, ret+0(FP) // no vx
+	RET
+
+TEXT ·Log10(SB),NOSPLIT,$0 // exported entry point: dispatches through the log10vectorfacility pointer
+	MOVD    log10vectorfacility+0x00(SB),R1 // R1 = code address currently stored in log10vectorfacility
+	BR      (R1) // branch to that address (no return here; the target returns to our caller)
+
+TEXT ·log10TrampolineSetup(SB),NOSPLIT, $0 // initial target of log10vectorfacility: picks an implementation, records it in the pointer, then branches to it
+	MOVB    ·hasVX(SB), R1 // load the ·hasVX flag byte
+	CMPBEQ  R1, $1, vectorimpl      // hasVX == 1: select the vector implementation
+	MOVD    $log10vectorfacility+0x00(SB), R1 // R1 = address of the dispatch pointer
+	MOVD    $·log10(SB), R2 // R2 = address of ·log10
+	MOVD    R2, 0(R1) // overwrite the pointer so ·Log10 branches straight to ·log10 from now on
+	BR      ·log10(SB) // serve the current call with ·log10
+vectorimpl:
+	MOVD    $log10vectorfacility+0x00(SB), R1 // R1 = address of the dispatch pointer
+	MOVD    $·log10Asm(SB), R2 // R2 = address of ·log10Asm
+	MOVD    R2, 0(R1) // overwrite the pointer so ·Log10 branches straight to ·log10Asm from now on
+	BR      ·log10Asm(SB) // serve the current call with ·log10Asm
+
+GLOBL log10vectorfacility+0x00(SB), NOPTR, $8 // 8-byte dispatch pointer read by ·Log10
+DATA log10vectorfacility+0x00(SB)/8, $·log10TrampolineSetup(SB) // initial value: address of ·log10TrampolineSetup
+
 
 TEXT ·Cos(SB),NOSPLIT,$0
-	BR ·cos(SB)
+	MOVD    cosvectorfacility+0x00(SB),R1 // R1 = code address currently stored in cosvectorfacility
+	BR      (R1) // branch to that address (no return here; the target returns to our caller)
+
+TEXT ·cosTrampolineSetup(SB),NOSPLIT, $0 // initial target of cosvectorfacility: picks an implementation, records it in the pointer, then branches to it
+	MOVB    ·hasVX(SB), R1 // load the ·hasVX flag byte
+	CMPBEQ  R1, $1, vectorimpl      // hasVX == 1: select the vector implementation
+	MOVD    $cosvectorfacility+0x00(SB), R1 // R1 = address of the dispatch pointer
+	MOVD    $·cos(SB), R2 // R2 = address of ·cos
+	MOVD    R2, 0(R1) // overwrite the pointer so ·Cos branches straight to ·cos from now on
+	BR      ·cos(SB) // serve the current call with ·cos
+vectorimpl:
+	MOVD    $cosvectorfacility+0x00(SB), R1 // R1 = address of the dispatch pointer
+	MOVD    $·cosAsm(SB), R2 // R2 = address of ·cosAsm
+	MOVD    R2, 0(R1) // overwrite the pointer so ·Cos branches straight to ·cosAsm from now on
+	BR      ·cosAsm(SB) // serve the current call with ·cosAsm
+
+GLOBL cosvectorfacility+0x00(SB), NOPTR, $8 // 8-byte dispatch pointer read by ·Cos
+DATA cosvectorfacility+0x00(SB)/8, $·cosTrampolineSetup(SB) // initial value: address of ·cosTrampolineSetup
+
+
+TEXT ·Cosh(SB),NOSPLIT,$0 // exported entry point: dispatches through the coshvectorfacility pointer
+	MOVD    coshvectorfacility+0x00(SB),R1 // R1 = code address currently stored in coshvectorfacility
+	BR      (R1) // branch to that address (no return here; the target returns to our caller)
+
+TEXT ·coshTrampolineSetup(SB),NOSPLIT, $0 // initial target of coshvectorfacility: picks an implementation, records it in the pointer, then branches to it
+	MOVB    ·hasVX(SB), R1 // load the ·hasVX flag byte
+	CMPBEQ  R1, $1, vectorimpl      // hasVX == 1: select the vector implementation
+	MOVD    $coshvectorfacility+0x00(SB), R1 // R1 = address of the dispatch pointer
+	MOVD    $·cosh(SB), R2 // R2 = address of ·cosh
+	MOVD    R2, 0(R1) // overwrite the pointer so ·Cosh branches straight to ·cosh from now on
+	BR      ·cosh(SB) // serve the current call with ·cosh
+vectorimpl:
+	MOVD    $coshvectorfacility+0x00(SB), R1 // R1 = address of the dispatch pointer
+	MOVD    $·coshAsm(SB), R2 // R2 = address of ·coshAsm
+	MOVD    R2, 0(R1) // overwrite the pointer so ·Cosh branches straight to ·coshAsm from now on
+	BR      ·coshAsm(SB) // serve the current call with ·coshAsm
+
+GLOBL coshvectorfacility+0x00(SB), NOPTR, $8 // 8-byte dispatch pointer read by ·Cosh
+DATA coshvectorfacility+0x00(SB)/8, $·coshTrampolineSetup(SB) // initial value: address of ·coshTrampolineSetup
+
+
+TEXT ·Sin(SB),NOSPLIT,$0 // exported entry point: dispatches through the sinvectorfacility pointer
+	MOVD    sinvectorfacility+0x00(SB),R1 // R1 = code address currently stored in sinvectorfacility
+	BR      (R1) // branch to that address (no return here; the target returns to our caller)
+
+TEXT ·sinTrampolineSetup(SB),NOSPLIT, $0 // initial target of sinvectorfacility: picks an implementation, records it in the pointer, then branches to it
+	MOVB    ·hasVX(SB), R1 // load the ·hasVX flag byte
+	CMPBEQ  R1, $1, vectorimpl      // hasVX == 1: select the vector implementation
+	MOVD    $sinvectorfacility+0x00(SB), R1 // R1 = address of the dispatch pointer
+	MOVD    $·sin(SB), R2 // R2 = address of ·sin
+	MOVD    R2, 0(R1) // overwrite the pointer so ·Sin branches straight to ·sin from now on
+	BR      ·sin(SB) // serve the current call with ·sin
+vectorimpl:
+	MOVD    $sinvectorfacility+0x00(SB), R1 // R1 = address of the dispatch pointer
+	MOVD    $·sinAsm(SB), R2 // R2 = address of ·sinAsm
+	MOVD    R2, 0(R1) // overwrite the pointer so ·Sin branches straight to ·sinAsm from now on
+	BR      ·sinAsm(SB) // serve the current call with ·sinAsm
+
+GLOBL sinvectorfacility+0x00(SB), NOPTR, $8 // 8-byte dispatch pointer read by ·Sin
+DATA sinvectorfacility+0x00(SB)/8, $·sinTrampolineSetup(SB) // initial value: address of ·sinTrampolineSetup
+
+
+TEXT ·Sinh(SB),NOSPLIT,$0 // exported entry point: dispatches through the sinhvectorfacility pointer
+	MOVD    sinhvectorfacility+0x00(SB),R1 // R1 = code address currently stored in sinhvectorfacility
+	BR      (R1) // branch to that address (no return here; the target returns to our caller)
+
+TEXT ·sinhTrampolineSetup(SB),NOSPLIT, $0 // initial target of sinhvectorfacility: picks an implementation, records it in the pointer, then branches to it
+	MOVB    ·hasVX(SB), R1 // load the ·hasVX flag byte
+	CMPBEQ  R1, $1, vectorimpl      // hasVX == 1: select the vector implementation
+	MOVD    $sinhvectorfacility+0x00(SB), R1 // R1 = address of the dispatch pointer
+	MOVD    $·sinh(SB), R2 // R2 = address of ·sinh
+	MOVD    R2, 0(R1) // overwrite the pointer so ·Sinh branches straight to ·sinh from now on
+	BR      ·sinh(SB) // serve the current call with ·sinh
+vectorimpl:
+	MOVD    $sinhvectorfacility+0x00(SB), R1 // R1 = address of the dispatch pointer
+	MOVD    $·sinhAsm(SB), R2 // R2 = address of ·sinhAsm
+	MOVD    R2, 0(R1) // overwrite the pointer so ·Sinh branches straight to ·sinhAsm from now on
+	BR      ·sinhAsm(SB) // serve the current call with ·sinhAsm
+
+GLOBL sinhvectorfacility+0x00(SB), NOPTR, $8 // 8-byte dispatch pointer read by ·Sinh
+DATA sinhvectorfacility+0x00(SB)/8, $·sinhTrampolineSetup(SB) // initial value: address of ·sinhTrampolineSetup
+
+
+
+TEXT ·Tanh(SB),NOSPLIT,$0 // exported entry point: dispatches through the tanhvectorfacility pointer
+	MOVD    tanhvectorfacility+0x00(SB),R1 // R1 = code address currently stored in tanhvectorfacility
+	BR      (R1) // branch to that address (no return here; the target returns to our caller)
+
+TEXT ·tanhTrampolineSetup(SB),NOSPLIT, $0 // initial target of tanhvectorfacility: picks an implementation, records it in the pointer, then branches to it
+	MOVB    ·hasVX(SB), R1 // load the ·hasVX flag byte
+	CMPBEQ  R1, $1, vectorimpl      // hasVX == 1: select the vector implementation
+	MOVD    $tanhvectorfacility+0x00(SB), R1 // R1 = address of the dispatch pointer
+	MOVD    $·tanh(SB), R2 // R2 = address of ·tanh
+	MOVD    R2, 0(R1) // overwrite the pointer so ·Tanh branches straight to ·tanh from now on
+	BR      ·tanh(SB) // serve the current call with ·tanh
+vectorimpl:
+	MOVD    $tanhvectorfacility+0x00(SB), R1 // R1 = address of the dispatch pointer
+	MOVD    $·tanhAsm(SB), R2 // R2 = address of ·tanhAsm
+	MOVD    R2, 0(R1) // overwrite the pointer so ·Tanh branches straight to ·tanhAsm from now on
+	BR      ·tanhAsm(SB) // serve the current call with ·tanhAsm
+
+GLOBL tanhvectorfacility+0x00(SB), NOPTR, $8 // 8-byte dispatch pointer read by ·Tanh
+DATA tanhvectorfacility+0x00(SB)/8, $·tanhTrampolineSetup(SB) // initial value: address of ·tanhTrampolineSetup
+
 
-TEXT ·Tan(SB),NOSPLIT,$0
-	BR ·tan(SB)
author	Bill O'Farrell <billo@ca.ibm.com>	2016-10-30 00:11:37 -0400
committer	Michael Munday <munday@ca.ibm.com>	2016-11-11 20:20:23 +0000
commit	b6a15683f0c4d177b3711b55724506aebb03f764 (patch)
tree	9b8a4802f885983a80eb9d7d647b4ca42cbec3db /src/math/stubs_s390x.s
parent	9f9d83404f938a0dfb98d3f4a4d420261606069a (diff)
download	go-git-b6a15683f0c4d177b3711b55724506aebb03f764.tar.gz