author     Craig Topper <craig.topper@intel.com>   2018-06-21 16:41:28 +0000
committer  Craig Topper <craig.topper@intel.com>   2018-06-21 16:41:28 +0000
commit     0be86b8c934425669a4da497cbc08425370fda0a (patch)
tree       12a2d324bee1b0856206bea2bf0a9a996f055c11 /lib/Headers/avx512fintrin.h
parent     41e20a3f297fbf794689979fad39a2cc77e5aafb (diff)
download   clang-0be86b8c934425669a4da497cbc08425370fda0a.tar.gz
[X86] Rewrite the add/mul/or/and reduction intrinsics to make better use of other intrinsics and remove undef shuffle indices.
Similar to what was done to max/min recently. These already reduced the vector width to 256 and 128 bits as we go, unlike the original max/min code.

Differential Revision: https://reviews.llvm.org/D48346

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@335253 91177308-0d34-0410-b5e6-96231b3b80d8
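To make the pattern concrete, here is a minimal standalone sketch (not part of the patch; the function and local names are illustrative) of how the rewritten _mm512_reduce_add_epi64 arrives at its result: split the 512-bit vector into 256-bit halves and add them, split that sum into 128-bit halves and add them, then swap the two remaining 64-bit lanes and add once more, so an 8-element reduction takes log2(8) = 3 adds, matching the bisection comment kept in the header.

/* Hedged sketch only: compile with clang and -mavx512f. It reuses the same
 * internal vector typedefs (__v4du, __v2du) that the header relies on. */
#include <immintrin.h>

static long long reduce_add_epi64_sketch(__m512i __W) {
  __v4du __lo256 = (__v4du)_mm512_extracti64x4_epi64(__W, 0);    /* elements 0..3 */
  __v4du __hi256 = (__v4du)_mm512_extracti64x4_epi64(__W, 1);    /* elements 4..7 */
  __m256i __sum256 = (__m256i)(__lo256 + __hi256);               /* 8 -> 4 lanes */
  __v2du __lo128 = (__v2du)_mm256_extracti128_si256(__sum256, 0);
  __v2du __hi128 = (__v2du)_mm256_extracti128_si256(__sum256, 1);
  __v2du __sum128 = __lo128 + __hi128;                           /* 4 -> 2 lanes */
  __v2du __swap = __builtin_shufflevector(__sum128, __sum128, 1, 0);
  __v2du __sum64 = __sum128 + __swap;                            /* 2 -> 1 lane */
  return __sum64[0];
}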
Diffstat (limited to 'lib/Headers/avx512fintrin.h')
-rw-r--r--  lib/Headers/avx512fintrin.h | 266
1 file changed, 100 insertions(+), 166 deletions(-)
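For orientation, a hedged usage example (values and mask chosen only for illustration): the rewritten intrinsics still return a full horizontal reduction, while the masked variants first replace masked-off lanes with the operator's identity element (0 for add/or, 1 for mul, all-ones for and), which is what the _mm512_maskz_mov_* / _mm512_mask_mov_* calls in the hunk below do.

/* Illustration only: requires clang (which provides this header) and -mavx512f. */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  /* _mm512_set_epi64 lists elements high to low, so this vector is {1,2,...,8}. */
  __m512i v = _mm512_set_epi64(8, 7, 6, 5, 4, 3, 2, 1);
  long long full = _mm512_reduce_add_epi64(v);                       /* 1+2+...+8 = 36 */
  /* Mask 0x0F keeps elements 0..3; the rest are zeroed before the add. */
  long long part = _mm512_mask_reduce_add_epi64((__mmask8)0x0F, v);  /* 1+2+3+4 = 10 */
  printf("%lld %lld\n", full, part);
  return 0;
}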
diff --git a/lib/Headers/avx512fintrin.h b/lib/Headers/avx512fintrin.h
index b2f6e28031..534007b69f 100644
--- a/lib/Headers/avx512fintrin.h
+++ b/lib/Headers/avx512fintrin.h
@@ -9281,251 +9281,185 @@ _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
* Used bisection method. At each step, we partition the vector with previous
* step in half, and the operation is performed on its two halves.
* This takes log2(n) steps where n is the number of elements in the vector.
-
- * Vec512 - Vector with size 512.
- * Operator - Can be one of following: +,*,&,|
- * T2 - Can get 'i' for int and 'f' for float.
- * T1 - Can get 'i' for int and 'd' for double.
*/
-#define _mm512_reduce_operator_64bit(Vec512, Operator, T2, T1) \
- __extension__({ \
- __m256##T1 Vec256 = __builtin_shufflevector( \
- (__v8d##T2)Vec512, \
- (__v8d##T2)Vec512, \
- 0, 1, 2, 3) \
- Operator \
- __builtin_shufflevector( \
- (__v8d##T2)Vec512, \
- (__v8d##T2)Vec512, \
- 4, 5, 6, 7); \
- __m128##T1 Vec128 = __builtin_shufflevector( \
- (__v4d##T2)Vec256, \
- (__v4d##T2)Vec256, \
- 0, 1) \
- Operator \
- __builtin_shufflevector( \
- (__v4d##T2)Vec256, \
- (__v4d##T2)Vec256, \
- 2, 3); \
- Vec128 = __builtin_shufflevector((__v2d##T2)Vec128, \
- (__v2d##T2)Vec128, 0, -1) \
- Operator \
- __builtin_shufflevector((__v2d##T2)Vec128, \
- (__v2d##T2)Vec128, 1, -1); \
- return Vec128[0]; \
- })
+#define _mm512_mask_reduce_operator(op) \
+ __v4du __t1 = (__v4du)_mm512_extracti64x4_epi64(__W, 0); \
+ __v4du __t2 = (__v4du)_mm512_extracti64x4_epi64(__W, 1); \
+ __m256i __t3 = (__m256i)(__t1 op __t2); \
+ __v2du __t4 = (__v2du)_mm256_extracti128_si256(__t3, 0); \
+ __v2du __t5 = (__v2du)_mm256_extracti128_si256(__t3, 1); \
+ __v2du __t6 = __t4 op __t5; \
+ __v2du __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
+ __v2du __t8 = __t6 op __t7; \
+ return __t8[0];
static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_add_epi64(__m512i __W) {
- _mm512_reduce_operator_64bit(__W, +, i, i);
+ _mm512_mask_reduce_operator(+);
}
static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_mul_epi64(__m512i __W) {
- _mm512_reduce_operator_64bit(__W, *, i, i);
+ _mm512_mask_reduce_operator(*);
}
static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_and_epi64(__m512i __W) {
- _mm512_reduce_operator_64bit(__W, &, i, i);
+ _mm512_mask_reduce_operator(&);
}
static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_or_epi64(__m512i __W) {
- _mm512_reduce_operator_64bit(__W, |, i, i);
-}
-
-static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_add_pd(__m512d __W) {
- _mm512_reduce_operator_64bit(__W, +, f, d);
+ _mm512_mask_reduce_operator(|);
}
-static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_mul_pd(__m512d __W) {
- _mm512_reduce_operator_64bit(__W, *, f, d);
-}
-
-/* Vec512 - Vector with size 512.
- * Vec512Neutral - All vector elements set to the identity element.
- * Identity element: {+,0},{*,1},{&,0xFFFFFFFFFFFFFFFF},{|,0}
- * Operator - Can be one of following: +,*,&,|
- * Mask - Intrinsic Mask
- * T2 - Can get 'i' for int and 'f' for float.
- * T1 - Can get 'i' for int and 'd' for packed double-precision.
- * T3 - Can be Pd for packed double or q for q-word.
- */
-
-#define _mm512_mask_reduce_operator_64bit(Vec512, Vec512Neutral, Operator, \
- Mask, T2, T1, T3) \
- __extension__({ \
- Vec512 = __builtin_ia32_select##T3##_512( \
- (__mmask8)Mask, \
- (__v8d##T2)Vec512, \
- (__v8d##T2)Vec512Neutral); \
- _mm512_reduce_operator_64bit(Vec512, Operator, T2, T1); \
- })
-
static __inline__ long long __DEFAULT_FN_ATTRS
_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
- _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), +, __M, i, i, q);
+ __W = _mm512_maskz_mov_epi64(__M, __W);
+ _mm512_mask_reduce_operator(+);
}
static __inline__ long long __DEFAULT_FN_ATTRS
_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
- _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(1), *, __M, i, i, q);
+ __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W);
+ _mm512_mask_reduce_operator(*);
}
static __inline__ long long __DEFAULT_FN_ATTRS
_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
- _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF),
- &, __M, i, i, q);
+ __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __W);
+ _mm512_mask_reduce_operator(&);
}
static __inline__ long long __DEFAULT_FN_ATTRS
_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
- _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), |, __M,
- i, i, q);
+ __W = _mm512_maskz_mov_epi64(__M, __W);
+ _mm512_mask_reduce_operator(|);
+}
+#undef _mm512_mask_reduce_operator
+
+#define _mm512_mask_reduce_operator(op) \
+ __m256d __t1 = _mm512_extractf64x4_pd(__W, 0); \
+ __m256d __t2 = _mm512_extractf64x4_pd(__W, 1); \
+ __m256d __t3 = __t1 op __t2; \
+ __m128d __t4 = _mm256_extractf128_pd(__t3, 0); \
+ __m128d __t5 = _mm256_extractf128_pd(__t3, 1); \
+ __m128d __t6 = __t4 op __t5; \
+ __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
+ __m128d __t8 = __t6 op __t7; \
+ return __t8[0];
+
+static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_add_pd(__m512d __W) {
+ _mm512_mask_reduce_operator(+);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_mul_pd(__m512d __W) {
+ _mm512_mask_reduce_operator(*);
}
static __inline__ double __DEFAULT_FN_ATTRS
_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
- _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(0), +, __M,
- f, d, pd);
+ __W = _mm512_maskz_mov_pd(__M, __W);
+ _mm512_mask_reduce_operator(+);
}
static __inline__ double __DEFAULT_FN_ATTRS
_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
- _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(1), *, __M,
- f, d, pd);
+ __W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W);
+ _mm512_mask_reduce_operator(*);
}
-#undef _mm512_reduce_operator_64bit
-#undef _mm512_mask_reduce_operator_64bit
-
-/* Vec512 - Vector with size 512.
- * Operator - Can be one of following: +,*,&,|
- * T2 - Can get 'i' for int and ' ' for packed single.
- * T1 - Can get 'i' for int and 'f' for float.
- */
+#undef _mm512_mask_reduce_operator
-#define _mm512_reduce_operator_32bit(Vec512, Operator, T2, T1) __extension__({ \
- __m256##T1 Vec256 = \
- (__m256##T1)(__builtin_shufflevector( \
- (__v16s##T2)Vec512, \
- (__v16s##T2)Vec512, \
- 0, 1, 2, 3, 4, 5, 6, 7) \
- Operator \
- __builtin_shufflevector( \
- (__v16s##T2)Vec512, \
- (__v16s##T2)Vec512, \
- 8, 9, 10, 11, 12, 13, 14, 15)); \
- __m128##T1 Vec128 = \
- (__m128##T1)(__builtin_shufflevector( \
- (__v8s##T2)Vec256, \
- (__v8s##T2)Vec256, \
- 0, 1, 2, 3) \
- Operator \
- __builtin_shufflevector( \
- (__v8s##T2)Vec256, \
- (__v8s##T2)Vec256, \
- 4, 5, 6, 7)); \
- Vec128 = (__m128##T1)(__builtin_shufflevector( \
- (__v4s##T2)Vec128, \
- (__v4s##T2)Vec128, \
- 0, 1, -1, -1) \
- Operator \
- __builtin_shufflevector( \
- (__v4s##T2)Vec128, \
- (__v4s##T2)Vec128, \
- 2, 3, -1, -1)); \
- Vec128 = (__m128##T1)(__builtin_shufflevector( \
- (__v4s##T2)Vec128, \
- (__v4s##T2)Vec128, \
- 0, -1, -1, -1) \
- Operator \
- __builtin_shufflevector( \
- (__v4s##T2)Vec128, \
- (__v4s##T2)Vec128, \
- 1, -1, -1, -1)); \
- return Vec128[0]; \
- })
+#define _mm512_mask_reduce_operator(op) \
+ __v8su __t1 = (__v8su)_mm512_extracti64x4_epi64(__W, 0); \
+ __v8su __t2 = (__v8su)_mm512_extracti64x4_epi64(__W, 1); \
+ __m256i __t3 = (__m256i)(__t1 op __t2); \
+ __v4su __t4 = (__v4su)_mm256_extracti128_si256(__t3, 0); \
+ __v4su __t5 = (__v4su)_mm256_extracti128_si256(__t3, 1); \
+ __v4su __t6 = __t4 op __t5; \
+ __v4su __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \
+ __v4su __t8 = __t6 op __t7; \
+ __v4su __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
+ __v4su __t10 = __t8 op __t9; \
+ return __t10[0];
static __inline__ int __DEFAULT_FN_ATTRS
_mm512_reduce_add_epi32(__m512i __W) {
- _mm512_reduce_operator_32bit(__W, +, i, i);
+ _mm512_mask_reduce_operator(+);
}
static __inline__ int __DEFAULT_FN_ATTRS
_mm512_reduce_mul_epi32(__m512i __W) {
- _mm512_reduce_operator_32bit(__W, *, i, i);
+ _mm512_mask_reduce_operator(*);
}
static __inline__ int __DEFAULT_FN_ATTRS
_mm512_reduce_and_epi32(__m512i __W) {
- _mm512_reduce_operator_32bit(__W, &, i, i);
+ _mm512_mask_reduce_operator(&);
}
static __inline__ int __DEFAULT_FN_ATTRS
_mm512_reduce_or_epi32(__m512i __W) {
- _mm512_reduce_operator_32bit(__W, |, i, i);
-}
-
-static __inline__ float __DEFAULT_FN_ATTRS
-_mm512_reduce_add_ps(__m512 __W) {
- _mm512_reduce_operator_32bit(__W, +, f, );
+ _mm512_mask_reduce_operator(|);
}
-static __inline__ float __DEFAULT_FN_ATTRS
-_mm512_reduce_mul_ps(__m512 __W) {
- _mm512_reduce_operator_32bit(__W, *, f, );
-}
-
-/* Vec512 - Vector with size 512.
- * Vec512Neutral - All vector elements set to the identity element.
- * Identity element: {+,0},{*,1},{&,0xFFFFFFFF},{|,0}
- * Operator - Can be one of following: +,*,&,|
- * Mask - Intrinsic Mask
- * T2 - Can get 'i' for int and 'f' for float.
- * T1 - Can get 'i' for int and 'd' for double.
- * T3 - Can be Ps for packed single or d for d-word.
- */
-
-#define _mm512_mask_reduce_operator_32bit(Vec512, Vec512Neutral, Operator, \
- Mask, T2, T1, T3) \
- __extension__({ \
- Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \
- (__mmask16)Mask, \
- (__v16s##T2)Vec512, \
- (__v16s##T2)Vec512Neutral); \
- _mm512_reduce_operator_32bit(Vec512, Operator, T2, T1); \
- })
-
static __inline__ int __DEFAULT_FN_ATTRS
_mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {
- _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), +, __M, i, i, d);
+ __W = _mm512_maskz_mov_epi32(__M, __W);
+ _mm512_mask_reduce_operator(+);
}
static __inline__ int __DEFAULT_FN_ATTRS
_mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
- _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(1), *, __M, i, i, d);
+ __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W);
+ _mm512_mask_reduce_operator(*);
}
static __inline__ int __DEFAULT_FN_ATTRS
_mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
- _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0xFFFFFFFF), &, __M,
- i, i, d);
+ __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __W);
+ _mm512_mask_reduce_operator(&);
}
static __inline__ int __DEFAULT_FN_ATTRS
_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
- _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), |, __M, i, i, d);
+ __W = _mm512_maskz_mov_epi32(__M, __W);
+ _mm512_mask_reduce_operator(|);
+}
+#undef _mm512_mask_reduce_operator
+
+#define _mm512_mask_reduce_operator(op) \
+ __m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 0); \
+ __m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 1); \
+ __m256 __t3 = __t1 op __t2; \
+ __m128 __t4 = _mm256_extractf128_ps(__t3, 0); \
+ __m128 __t5 = _mm256_extractf128_ps(__t3, 1); \
+ __m128 __t6 = __t4 op __t5; \
+ __m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \
+ __m128 __t8 = __t6 op __t7; \
+ __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
+ __m128 __t10 = __t8 op __t9; \
+ return __t10[0];
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_reduce_add_ps(__m512 __W) {
+ _mm512_mask_reduce_operator(+);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_reduce_mul_ps(__m512 __W) {
+ _mm512_mask_reduce_operator(*);
}
static __inline__ float __DEFAULT_FN_ATTRS
_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
- _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(0), +, __M, f, , ps);
+ __W = _mm512_maskz_mov_ps(__M, __W);
+ _mm512_mask_reduce_operator(+);
}
static __inline__ float __DEFAULT_FN_ATTRS
_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
- _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(1), *, __M, f, , ps);
+ __W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W);
+ _mm512_mask_reduce_operator(*);
}
-#undef _mm512_reduce_operator_32bit
-#undef _mm512_mask_reduce_operator_32bit
+#undef _mm512_mask_reduce_operator
#define _mm512_mask_reduce_operator(op) \
__m512i __t1 = (__m512i)__builtin_shufflevector((__v8di)__V, (__v8di)__V, 4, 5, 6, 7, 0, 1, 2, 3); \