diff options
author | Craig Topper <craig.topper@intel.com> | 2018-06-08 21:50:07 +0000 |
---|---|---|
committer | Craig Topper <craig.topper@intel.com> | 2018-06-08 21:50:07 +0000 |
commit | 3cef3a6363ad17ab52f3e41d8de9511bbd07a26f (patch) | |
tree | 4bf190ff251ffc45e7376531f10bb7616fe3559b /lib/Headers/avx512fintrin.h | |
parent | 8c873daccce7ee5339b9fd82c81fe02b73543b65 (diff) | |
download | clang-3cef3a6363ad17ab52f3e41d8de9511bbd07a26f.tar.gz |
[X86] Fold masking into subvector extract builtins.
I'm looking into making the select builtins require avx512f, avx512bw, or avx512vl since masking operations generally require those features.
The extract builtins are funny because the 512-bit versions return a 128 or 256 bit vector with masking even when avx512vl is not supported.
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@334330 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Headers/avx512fintrin.h')
-rw-r--r-- | lib/Headers/avx512fintrin.h | 64 |
1 file changed, 36 insertions, 28 deletions
diff --git a/lib/Headers/avx512fintrin.h b/lib/Headers/avx512fintrin.h index 381011ed39..dbac414fff 100644 --- a/lib/Headers/avx512fintrin.h +++ b/lib/Headers/avx512fintrin.h @@ -3494,30 +3494,34 @@ _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, /* Vector Extract */ #define _mm512_extractf64x4_pd(A, I) \ - (__m256d)__builtin_ia32_extractf64x4((__v8df)(__m512d)(A), (int)(I)) + (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \ + (__v4df)_mm256_undefined_si256(), \ + (__mmask8)-1) #define _mm512_mask_extractf64x4_pd(W, U, A, imm) \ - (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm512_extractf64x4_pd((A), (imm)), \ - (__v4df)(__m256d)(W)) + (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ + (__v4df)(__m256d)(W), \ + (__mmask8)(U)) #define _mm512_maskz_extractf64x4_pd(U, A, imm) \ - (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm512_extractf64x4_pd((A), (imm)), \ - (__v4df)_mm256_setzero_pd()) + (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)(U)) #define _mm512_extractf32x4_ps(A, I) \ - (__m128)__builtin_ia32_extractf32x4((__v16sf)(__m512)(A), (int)(I)) + (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \ + (__v4sf)_mm_undefined_ps(), \ + (__mmask8)-1) #define _mm512_mask_extractf32x4_ps(W, U, A, imm) \ - (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ - (__v4sf)_mm512_extractf32x4_ps((A), (imm)), \ - (__v4sf)(__m128)(W)) + (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U)) #define _mm512_maskz_extractf32x4_ps(U, A, imm) \ - (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ - (__v4sf)_mm512_extractf32x4_ps((A), (imm)), \ - (__v4sf)_mm_setzero_ps()) + (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U)) /* 
Vector Blend */ @@ -7534,30 +7538,34 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) } #define _mm512_extracti32x4_epi32(A, imm) \ - (__m128i)__builtin_ia32_extracti32x4((__v16si)(__m512i)(A), (int)(imm)) + (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v4si)_mm_undefined_si128(), \ + (__mmask8)-1) #define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm512_extracti32x4_epi32((A), (imm)), \ - (__v4si)(__m128i)(W)) + (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v4si)(__m128i)(W), \ + (__mmask8)(U)) #define _mm512_maskz_extracti32x4_epi32(U, A, imm) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm512_extracti32x4_epi32((A), (imm)), \ - (__v4si)_mm_setzero_si128()) + (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v4si)_mm_setzero_si128(), \ + (__mmask8)(U)) #define _mm512_extracti64x4_epi64(A, imm) \ - (__m256i)__builtin_ia32_extracti64x4((__v8di)(__m512i)(A), (int)(imm)) + (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ + (__v4di)_mm256_undefined_si256(), \ + (__mmask8)-1) #define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \ - (__v4di)(__m256i)(W)) + (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ + (__v4di)(__m256i)(W), \ + (__mmask8)(U)) #define _mm512_maskz_extracti64x4_epi64(U, A, imm) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \ - (__v4di)_mm256_setzero_si256()) + (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ + (__v4di)_mm256_setzero_si256(), \ + (__mmask8)(U)) #define _mm512_insertf64x4(A, B, imm) \ (__m512d)__builtin_ia32_insertf64x4((__v8df)(__m512d)(A), \ |