diff options
author | Sayed Adel <seiko@imavr.com> | 2020-09-06 20:43:11 +0200 |
---|---|---|
committer | Sayed Adel <seiko@imavr.com> | 2020-09-07 00:04:45 +0200 |
commit | 5a642d221e6771307bfca408a18a8d43a470da21 (patch) | |
tree | b85aef73b2a18aea17a0ec12286e703033ad1306 | |
parent | c970c04cbeb38c80901a02bc573a9333458d4c4a (diff) | |
download | numpy-5a642d221e6771307bfca408a18a8d43a470da21.tar.gz |
NPYV: add fused multiply subtract/add intrinics for all supported platforms
-rw-r--r-- | numpy/core/src/common/simd/avx2/arithmetic.h | 44 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx512/arithmetic.h | 16 | ||||
-rw-r--r-- | numpy/core/src/common/simd/neon/arithmetic.h | 43 | ||||
-rw-r--r-- | numpy/core/src/common/simd/sse/arithmetic.h | 57 | ||||
-rw-r--r-- | numpy/core/src/common/simd/vsx/arithmetic.h | 16 |
5 files changed, 175 insertions, 1 deletions
diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h index 9d8b4ab5e..4af9e4d17 100644 --- a/numpy/core/src/common/simd/avx2/arithmetic.h +++ b/numpy/core/src/common/simd/avx2/arithmetic.h @@ -72,4 +72,48 @@ #define npyv_div_f32 _mm256_div_ps #define npyv_div_f64 _mm256_div_pd +/*************************** + * FUSED + ***************************/ +#ifdef NPY_HAVE_FMA3 + // multiply and add, a*b + c + #define npyv_muladd_f32 _mm256_fmadd_ps + #define npyv_muladd_f64 _mm256_fmadd_pd + // multiply and subtract, a*b - c + #define npyv_mulsub_f32 _mm256_fmsub_ps + #define npyv_mulsub_f64 _mm256_fmsub_pd + // negate multiply and add, -(a*b) + c + #define npyv_nmuladd_f32 _mm256_fnmadd_ps + #define npyv_nmuladd_f64 _mm256_fnmadd_pd + // negate multiply and subtract, -(a*b) - c + #define npyv_nmulsub_f32 _mm256_fnmsub_ps + #define npyv_nmulsub_f64 _mm256_fnmsub_pd +#else + // multiply and add, a*b + c + NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) + { return npyv_add_f32(npyv_mul_f32(a, b), c); } + NPY_FINLINE npyv_f64 npyv_muladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { return npyv_add_f64(npyv_mul_f64(a, b), c); } + // multiply and subtract, a*b - c + NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) + { return npyv_sub_f32(npyv_mul_f32(a, b), c); } + NPY_FINLINE npyv_f64 npyv_mulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { return npyv_sub_f64(npyv_mul_f64(a, b), c); } + // negate multiply and add, -(a*b) + c + NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) + { return npyv_sub_f32(c, npyv_mul_f32(a, b)); } + NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { return npyv_sub_f64(c, npyv_mul_f64(a, b)); } + // negate multiply and subtract, -(a*b) - c + NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) + { + npyv_f32 neg_a = npyv_xor_f32(a, npyv_setall_f32(-0.0f)); + return npyv_sub_f32(npyv_mul_f32(neg_a, b), c); + } + NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { + npyv_f64 neg_a = npyv_xor_f64(a, npyv_setall_f64(-0.0)); + return npyv_sub_f64(npyv_mul_f64(neg_a, b), c); + } +#endif // !NPY_HAVE_FMA3 #endif // _NPY_SIMD_AVX2_ARITHMETIC_H diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h index fcaef0efd..824ae818e 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -113,4 +113,20 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) #define npyv_div_f32 _mm512_div_ps #define npyv_div_f64 _mm512_div_pd +/*************************** + * FUSED + ***************************/ +// multiply and add, a*b + c +#define npyv_muladd_f32 _mm512_fmadd_ps +#define npyv_muladd_f64 _mm512_fmadd_pd +// multiply and subtract, a*b - c +#define npyv_mulsub_f32 _mm512_fmsub_ps +#define npyv_mulsub_f64 _mm512_fmsub_pd +// negate multiply and add, -(a*b) + c +#define npyv_nmuladd_f32 _mm512_fnmadd_ps +#define npyv_nmuladd_f64 _mm512_fnmadd_pd +// negate multiply and subtract, -(a*b) - c +#define npyv_nmulsub_f32 _mm512_fnmsub_ps +#define npyv_nmulsub_f64 _mm512_fnmsub_pd + #endif // _NPY_SIMD_AVX512_ARITHMETIC_H diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h index ec8b8ecd0..5eeee1bb6 100644 --- a/numpy/core/src/common/simd/neon/arithmetic.h +++ b/numpy/core/src/common/simd/neon/arithmetic.h @@ -75,4 +75,47 @@ #endif #define npyv_div_f64 vdivq_f64 +/*************************** + * FUSED F32 + ***************************/ +#ifdef NPY_HAVE_NEON_VFPV4 // FMA + // multiply and add, a*b + c + NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) + { return vfmaq_f32(c, a, b); } + // multiply and subtract, a*b - c + NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) + { return vfmaq_f32(vnegq_f32(c), a, b); } + // negate multiply and add, -(a*b) + c + NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) + { return vfmsq_f32(c, a, b); } + // negate multiply and subtract, -(a*b) - c + NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) + { return vfmsq_f32(vnegq_f32(c), a, b); } +#else + // multiply and add, a*b + c + NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) + { return vmlaq_f32(c, a, b); } + // multiply and subtract, a*b - c + NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) + { return vmlaq_f32(vnegq_f32(c), a, b); } + // negate multiply and add, -(a*b) + c + NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) + { return vmlsq_f32(c, a, b); } + // negate multiply and subtract, -(a*b) - c + NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) + { return vmlsq_f32(vnegq_f32(c), a, b); } +#endif +/*************************** + * FUSED F64 + ***************************/ +#if NPY_SIMD_F64 + NPY_FINLINE npyv_f64 npyv_muladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { return vfmaq_f64(c, a, b); } + NPY_FINLINE npyv_f64 npyv_mulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { return vfmaq_f64(vnegq_f64(c), a, b); } + NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { return vfmsq_f64(c, a, b); } + NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { return vfmsq_f64(vnegq_f64(c), a, b); } +#endif // NPY_SIMD_F64 #endif // _NPY_SIMD_NEON_ARITHMETIC_H diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h index 12d0af05c..717dacd39 100644 --- a/numpy/core/src/common/simd/sse/arithmetic.h +++ b/numpy/core/src/common/simd/sse/arithmetic.h @@ -91,5 +91,60 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b) // TODO: emulate integer division #define npyv_div_f32 _mm_div_ps #define npyv_div_f64 _mm_div_pd - +/*************************** + * FUSED + ***************************/ +#ifdef NPY_HAVE_FMA3 + // multiply and add, a*b + c + #define npyv_muladd_f32 _mm_fmadd_ps + #define npyv_muladd_f64 _mm_fmadd_pd + // multiply and subtract, a*b - c + #define npyv_mulsub_f32 _mm_fmsub_ps + #define npyv_mulsub_f64 _mm_fmsub_pd + // negate multiply and add, -(a*b) + c + #define npyv_nmuladd_f32 _mm_fnmadd_ps + #define npyv_nmuladd_f64 _mm_fnmadd_pd + // negate multiply and subtract, -(a*b) - c + #define npyv_nmulsub_f32 _mm_fnmsub_ps + #define npyv_nmulsub_f64 _mm_fnmsub_pd +#elif defined(NPY_HAVE_FMA4) + // multiply and add, a*b + c + #define npyv_muladd_f32 _mm_macc_ps + #define npyv_muladd_f64 _mm_macc_pd + // multiply and subtract, a*b - c + #define npyv_mulsub_f32 _mm_msub_ps + #define npyv_mulsub_f64 _mm_msub_pd + // negate multiply and add, -(a*b) + c + #define npyv_nmuladd_f32 _mm_nmacc_ps + #define npyv_nmuladd_f64 _mm_nmacc_pd +#else + // multiply and add, a*b + c + NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) + { return npyv_add_f32(npyv_mul_f32(a, b), c); } + NPY_FINLINE npyv_f64 npyv_muladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { return npyv_add_f64(npyv_mul_f64(a, b), c); } + // multiply and subtract, a*b - c + NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) + { return npyv_sub_f32(npyv_mul_f32(a, b), c); } + NPY_FINLINE npyv_f64 npyv_mulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { return npyv_sub_f64(npyv_mul_f64(a, b), c); } + // negate multiply and add, -(a*b) + c + NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) + { return npyv_sub_f32(c, npyv_mul_f32(a, b)); } + NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { return npyv_sub_f64(c, npyv_mul_f64(a, b)); } +#endif // NPY_HAVE_FMA3 +#ifndef NPY_HAVE_FMA3 // for FMA4 and NON-FMA3 + // negate multiply and subtract, -(a*b) - c + NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) + { + npyv_f32 neg_a = npyv_xor_f32(a, npyv_setall_f32(-0.0f)); + return npyv_sub_f32(npyv_mul_f32(neg_a, b), c); + } + NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { + npyv_f64 neg_a = npyv_xor_f64(a, npyv_setall_f64(-0.0)); + return npyv_sub_f64(npyv_mul_f64(neg_a, b), c); + } +#endif // !NPY_HAVE_FMA3 #endif // _NPY_SIMD_SSE_ARITHMETIC_H diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h index dd23b5b11..6ef007676 100644 --- a/numpy/core/src/common/simd/vsx/arithmetic.h +++ b/numpy/core/src/common/simd/vsx/arithmetic.h @@ -100,4 +100,20 @@ #define npyv_div_f32 vec_div #define npyv_div_f64 vec_div +/*************************** + * FUSED + ***************************/ +// multiply and add, a*b + c +#define npyv_muladd_f32 vec_madd +#define npyv_muladd_f64 vec_madd +// multiply and subtract, a*b - c +#define npyv_mulsub_f32 vec_msub +#define npyv_mulsub_f64 vec_msub +// negate multiply and add, -(a*b) + c +#define npyv_nmuladd_f32 vec_nmsub // equivalent to -(a*b - c) +#define npyv_nmuladd_f64 vec_nmsub +// negate multiply and subtract, -(a*b) - c +#define npyv_nmulsub_f32 vec_nmadd // equivalent to -(a*b + c) +#define npyv_nmulsub_f64 vec_nmadd + #endif // _NPY_SIMD_VSX_ARITHMETIC_H |