author    Sayed Adel <seiko@imavr.com>    2020-09-06 20:43:11 +0200
committer Sayed Adel <seiko@imavr.com>    2020-09-07 00:04:45 +0200
commit    5a642d221e6771307bfca408a18a8d43a470da21 (patch)
tree      b85aef73b2a18aea17a0ec12286e703033ad1306
parent    c970c04cbeb38c80901a02bc573a9333458d4c4a (diff)
download  numpy-5a642d221e6771307bfca408a18a8d43a470da21.tar.gz
NPYV: add fused multiply subtract/add intrinsics for all supported platforms
-rw-r--r--  numpy/core/src/common/simd/avx2/arithmetic.h    44
-rw-r--r--  numpy/core/src/common/simd/avx512/arithmetic.h  16
-rw-r--r--  numpy/core/src/common/simd/neon/arithmetic.h    43
-rw-r--r--  numpy/core/src/common/simd/sse/arithmetic.h     57
-rw-r--r--  numpy/core/src/common/simd/vsx/arithmetic.h     16
5 files changed, 175 insertions(+), 1 deletion(-)
diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h
index 9d8b4ab5e..4af9e4d17 100644
--- a/numpy/core/src/common/simd/avx2/arithmetic.h
+++ b/numpy/core/src/common/simd/avx2/arithmetic.h
@@ -72,4 +72,48 @@
#define npyv_div_f32 _mm256_div_ps
#define npyv_div_f64 _mm256_div_pd
+/***************************
+ * FUSED
+ ***************************/
+#ifdef NPY_HAVE_FMA3
+ // multiply and add, a*b + c
+ #define npyv_muladd_f32 _mm256_fmadd_ps
+ #define npyv_muladd_f64 _mm256_fmadd_pd
+ // multiply and subtract, a*b - c
+ #define npyv_mulsub_f32 _mm256_fmsub_ps
+ #define npyv_mulsub_f64 _mm256_fmsub_pd
+ // negate multiply and add, -(a*b) + c
+ #define npyv_nmuladd_f32 _mm256_fnmadd_ps
+ #define npyv_nmuladd_f64 _mm256_fnmadd_pd
+ // negate multiply and subtract, -(a*b) - c
+ #define npyv_nmulsub_f32 _mm256_fnmsub_ps
+ #define npyv_nmulsub_f64 _mm256_fnmsub_pd
+#else
+ // multiply and add, a*b + c
+ NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return npyv_add_f32(npyv_mul_f32(a, b), c); }
+ NPY_FINLINE npyv_f64 npyv_muladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return npyv_add_f64(npyv_mul_f64(a, b), c); }
+ // multiply and subtract, a*b - c
+ NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return npyv_sub_f32(npyv_mul_f32(a, b), c); }
+ NPY_FINLINE npyv_f64 npyv_mulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return npyv_sub_f64(npyv_mul_f64(a, b), c); }
+ // negate multiply and add, -(a*b) + c
+ NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return npyv_sub_f32(c, npyv_mul_f32(a, b)); }
+ NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return npyv_sub_f64(c, npyv_mul_f64(a, b)); }
+ // negate multiply and subtract, -(a*b) - c
+ NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ {
+ npyv_f32 neg_a = npyv_xor_f32(a, npyv_setall_f32(-0.0f));
+ return npyv_sub_f32(npyv_mul_f32(neg_a, b), c);
+ }
+ NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ {
+ npyv_f64 neg_a = npyv_xor_f64(a, npyv_setall_f64(-0.0));
+ return npyv_sub_f64(npyv_mul_f64(neg_a, b), c);
+ }
+#endif // !NPY_HAVE_FMA3
#endif // _NPY_SIMD_AVX2_ARITHMETIC_H
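Note on the non-FMA3 fallback above: `npyv_nmulsub` negates `a` by XOR-ing its sign bit with a vector of -0.0 rather than subtracting from zero, because 0.0 - (+0.0) yields +0.0 while a true negation must yield -0.0. A minimal scalar sketch of the trick, illustrative only and not part of the npyv API:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    /* Scalar model of the fallback: XOR-ing the IEEE-754 sign bit of `a`
     * with that of -0.0f is an exact negation, so -(a*b) - c can be
     * computed as (neg_a * b) - c. */
    static float xor_negate(float a)
    {
        uint32_t bits;
        memcpy(&bits, &a, sizeof bits);
        bits ^= UINT32_C(0x80000000); /* the bit pattern of -0.0f */
        memcpy(&a, &bits, sizeof a);
        return a;
    }

    int main(void)
    {
        float a = 1.5f, b = -2.0f, c = 4.0f;
        assert(xor_negate(a) == -a);
        assert(xor_negate(a) * b - c == -(a * b) - c);
        return 0;
    }

Since only the sign bit changes, the negation is exact and rounding is sign-symmetric, so (-a)*b - c matches -(a*b) - c up to the usual two roundings of the unfused path.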
diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h
index fcaef0efd..824ae818e 100644
--- a/numpy/core/src/common/simd/avx512/arithmetic.h
+++ b/numpy/core/src/common/simd/avx512/arithmetic.h
@@ -113,4 +113,20 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
#define npyv_div_f32 _mm512_div_ps
#define npyv_div_f64 _mm512_div_pd
+/***************************
+ * FUSED
+ ***************************/
+// multiply and add, a*b + c
+#define npyv_muladd_f32 _mm512_fmadd_ps
+#define npyv_muladd_f64 _mm512_fmadd_pd
+// multiply and subtract, a*b - c
+#define npyv_mulsub_f32 _mm512_fmsub_ps
+#define npyv_mulsub_f64 _mm512_fmsub_pd
+// negate multiply and add, -(a*b) + c
+#define npyv_nmuladd_f32 _mm512_fnmadd_ps
+#define npyv_nmuladd_f64 _mm512_fnmadd_pd
+// negate multiply and subtract, -(a*b) - c
+#define npyv_nmulsub_f32 _mm512_fnmsub_ps
+#define npyv_nmulsub_f64 _mm512_fnmsub_pd
+
#endif // _NPY_SIMD_AVX512_ARITHMETIC_H
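On AVX-512, FMA is always available, so the fused ops map directly to single instructions with no fallback branch. A hedged sketch of a typical consumer, Horner evaluation of p(x) = c2*x^2 + c1*x + c0, assuming the load/store/broadcast helpers of the same universal-intrinsics layer (npyv_load_f32, npyv_store_f32, npyv_setall_f32, npyv_nlanes_f32) and that `len` is a multiple of the lane count:

    /* Illustrative sketch, not code from this commit. */
    static void poly2_f32(float *dst, const float *src, npy_intp len)
    {
        const npyv_f32 c2 = npyv_setall_f32(0.5f);
        const npyv_f32 c1 = npyv_setall_f32(-1.0f);
        const npyv_f32 c0 = npyv_setall_f32(2.0f);
        for (npy_intp i = 0; i < len; i += npyv_nlanes_f32) {
            npyv_f32 x = npyv_load_f32(src + i);
            npyv_f32 r = npyv_muladd_f32(c2, x, c1); // c2*x + c1
            r = npyv_muladd_f32(r, x, c0);           // (c2*x + c1)*x + c0
            npyv_store_f32(dst + i, r);
        }
    }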
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
index ec8b8ecd0..5eeee1bb6 100644
--- a/numpy/core/src/common/simd/neon/arithmetic.h
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -75,4 +75,47 @@
#endif
#define npyv_div_f64 vdivq_f64
+/***************************
+ * FUSED F32
+ ***************************/
+#ifdef NPY_HAVE_NEON_VFPV4 // FMA
+ // multiply and add, a*b + c
+ NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vfmaq_f32(c, a, b); }
+ // multiply and subtract, a*b - c
+ NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vfmaq_f32(vnegq_f32(c), a, b); }
+ // negate multiply and add, -(a*b) + c
+ NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vfmsq_f32(c, a, b); }
+ // negate multiply and subtract, -(a*b) - c
+ NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vfmsq_f32(vnegq_f32(c), a, b); }
+#else
+ // multiply and add, a*b + c
+ NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vmlaq_f32(c, a, b); }
+ // multiply and subtract, a*b - c
+ NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vmlaq_f32(vnegq_f32(c), a, b); }
+ // negate multiply and add, -(a*b) + c
+ NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vmlsq_f32(c, a, b); }
+ // negate multiply and subtract, -(a*b) - c
+ NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vmlsq_f32(vnegq_f32(c), a, b); }
+#endif
+/***************************
+ * FUSED F64
+ ***************************/
+#if NPY_SIMD_F64
+ NPY_FINLINE npyv_f64 npyv_muladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return vfmaq_f64(c, a, b); }
+ NPY_FINLINE npyv_f64 npyv_mulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return vfmaq_f64(vnegq_f64(c), a, b); }
+ NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return vfmsq_f64(c, a, b); }
+ NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return vfmsq_f64(vnegq_f64(c), a, b); }
+#endif // NPY_SIMD_F64
#endif // _NPY_SIMD_NEON_ARITHMETIC_H
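On ARMv7 without VFPv4, vmlaq_f32/vmlsq_f32 are plain multiply-accumulate (multiply and accumulate each round separately), while vfmaq_f32/vfmsq_f32 are truly fused; both families take the addend as the first operand, so vfmaq(c, a, b) = c + a*b and vfmsq(c, a, b) = c - a*b, and negating c recovers the remaining two variants. A scalar reference of that mapping, illustrative only:

    #include <assert.h>

    static float ref_muladd (float a, float b, float c) { return  c + a * b; } /* vfmaq_f32(c, a, b)            */
    static float ref_mulsub (float a, float b, float c) { return -c + a * b; } /* vfmaq_f32(vnegq_f32(c), a, b) */
    static float ref_nmuladd(float a, float b, float c) { return  c - a * b; } /* vfmsq_f32(c, a, b)            */
    static float ref_nmulsub(float a, float b, float c) { return -c - a * b; } /* vfmsq_f32(vnegq_f32(c), a, b) */

    int main(void)
    {
        float a = 3.0f, b = 2.0f, c = 1.0f;
        assert(ref_muladd (a, b, c) ==  (a * b) + c);
        assert(ref_mulsub (a, b, c) ==  (a * b) - c);
        assert(ref_nmuladd(a, b, c) == -(a * b) + c);
        assert(ref_nmulsub(a, b, c) == -(a * b) - c);
        return 0;
    }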
diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h
index 12d0af05c..717dacd39 100644
--- a/numpy/core/src/common/simd/sse/arithmetic.h
+++ b/numpy/core/src/common/simd/sse/arithmetic.h
@@ -91,5 +91,60 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b)
// TODO: emulate integer division
#define npyv_div_f32 _mm_div_ps
#define npyv_div_f64 _mm_div_pd
-
+/***************************
+ * FUSED
+ ***************************/
+#ifdef NPY_HAVE_FMA3
+ // multiply and add, a*b + c
+ #define npyv_muladd_f32 _mm_fmadd_ps
+ #define npyv_muladd_f64 _mm_fmadd_pd
+ // multiply and subtract, a*b - c
+ #define npyv_mulsub_f32 _mm_fmsub_ps
+ #define npyv_mulsub_f64 _mm_fmsub_pd
+ // negate multiply and add, -(a*b) + c
+ #define npyv_nmuladd_f32 _mm_fnmadd_ps
+ #define npyv_nmuladd_f64 _mm_fnmadd_pd
+ // negate multiply and subtract, -(a*b) - c
+ #define npyv_nmulsub_f32 _mm_fnmsub_ps
+ #define npyv_nmulsub_f64 _mm_fnmsub_pd
+#elif defined(NPY_HAVE_FMA4)
+ // multiply and add, a*b + c
+ #define npyv_muladd_f32 _mm_macc_ps
+ #define npyv_muladd_f64 _mm_macc_pd
+ // multiply and subtract, a*b - c
+ #define npyv_mulsub_f32 _mm_msub_ps
+ #define npyv_mulsub_f64 _mm_msub_pd
+ // negate multiply and add, -(a*b) + c
+ #define npyv_nmuladd_f32 _mm_nmacc_ps
+ #define npyv_nmuladd_f64 _mm_nmacc_pd
+#else
+ // multiply and add, a*b + c
+ NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return npyv_add_f32(npyv_mul_f32(a, b), c); }
+ NPY_FINLINE npyv_f64 npyv_muladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return npyv_add_f64(npyv_mul_f64(a, b), c); }
+ // multiply and subtract, a*b - c
+ NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return npyv_sub_f32(npyv_mul_f32(a, b), c); }
+ NPY_FINLINE npyv_f64 npyv_mulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return npyv_sub_f64(npyv_mul_f64(a, b), c); }
+ // negate multiply and add, -(a*b) + c
+ NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return npyv_sub_f32(c, npyv_mul_f32(a, b)); }
+ NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return npyv_sub_f64(c, npyv_mul_f64(a, b)); }
+#endif // NPY_HAVE_FMA3
+#ifndef NPY_HAVE_FMA3 // for FMA4 and NON-FMA3
+ // negate multiply and subtract, -(a*b) - c
+ NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ {
+ npyv_f32 neg_a = npyv_xor_f32(a, npyv_setall_f32(-0.0f));
+ return npyv_sub_f32(npyv_mul_f32(neg_a, b), c);
+ }
+ NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ {
+ npyv_f64 neg_a = npyv_xor_f64(a, npyv_setall_f64(-0.0));
+ return npyv_sub_f64(npyv_mul_f64(neg_a, b), c);
+ }
+#endif // !NPY_HAVE_FMA3
#endif // _NPY_SIMD_SSE_ARITHMETIC_H
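Why the fused paths matter numerically: a hardware FMA rounds a*b + c once, while the mul-then-add fallback above rounds twice, so the two paths can differ in the last ulp. A small scalar demonstration using C99's fmaf as a stand-in for the hardware instruction (illustrative only):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        float a = 1.0f + 0x1.0p-13f;
        float b = 1.0f - 0x1.0p-13f;
        float c = -1.0f;
        float fused   = fmaf(a, b, c); /* rounds a*b + c once: -0x1p-26 */
        float unfused = a * b + c;     /* a*b rounds to 1.0f first: 0x0p+0 */
        printf("fused = %a, unfused = %a\n", fused, unfused);
        return 0;
    }

Here the exact product 1 - 2^-26 is not representable in single precision, so the unfused path rounds it to 1.0f and loses the residual that the fused path keeps.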
diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h
index dd23b5b11..6ef007676 100644
--- a/numpy/core/src/common/simd/vsx/arithmetic.h
+++ b/numpy/core/src/common/simd/vsx/arithmetic.h
@@ -100,4 +100,20 @@
#define npyv_div_f32 vec_div
#define npyv_div_f64 vec_div
+/***************************
+ * FUSED
+ ***************************/
+// multiply and add, a*b + c
+#define npyv_muladd_f32 vec_madd
+#define npyv_muladd_f64 vec_madd
+// multiply and subtract, a*b - c
+#define npyv_mulsub_f32 vec_msub
+#define npyv_mulsub_f64 vec_msub
+// negate multiply and add, -(a*b) + c
+#define npyv_nmuladd_f32 vec_nmsub // equivalent to -(a*b - c)
+#define npyv_nmuladd_f64 vec_nmsub
+// negate multiply and subtract, -(a*b) - c
+#define npyv_nmulsub_f32 vec_nmadd // equivalent to -(a*b + c)
+#define npyv_nmulsub_f64 vec_nmadd
+
#endif // _NPY_SIMD_VSX_ARITHMETIC_H
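VSX offers no direct -(a*b) + c or -(a*b) - c forms, but as the inline comments note, vec_nmsub(a, b, c) = -(a*b - c) and vec_nmadd(a, b, c) = -(a*b + c) are equivalent to them: IEEE-754 negation is exact, so the outer negation adds no rounding error. A scalar check of the identities, with plain doubles standing in for vector lanes:

    #include <assert.h>

    /*   -(a*b - c) = -(a*b) + c   -> npyv_nmuladd via vec_nmsub
     *   -(a*b + c) = -(a*b) - c   -> npyv_nmulsub via vec_nmadd  */
    int main(void)
    {
        double a = 2.5, b = -3.0, c = 0.75;
        assert(-(a * b - c) == -(a * b) + c);
        assert(-(a * b + c) == -(a * b) - c);
        return 0;
    }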