| author | Sayed Adel <seiko@imavr.com> | 2022-12-15 20:07:18 +0200 |
|---|---|---|
| committer | Sayed Adel <seiko@imavr.com> | 2023-01-29 13:02:36 +0200 |
| commit | 640e85017aa8eac3e9be68b475acf27d623b16b7 (patch) | |
| tree | 7da0ef7833c58729080ce09275387b09a5d1ccdb | |
| parent | 5749d39a52048761555e384cd5c3575ebe8988b7 (diff) | |
| download | numpy-640e85017aa8eac3e9be68b475acf27d623b16b7.tar.gz | |
ENH: remove raw SIMD of complex operations
-rw-r--r-- | numpy/core/meson.build            |   1 |
-rw-r--r-- | numpy/core/setup.py               |   2 |
-rw-r--r-- | numpy/core/src/umath/loops.c.src  |  28 |
-rw-r--r-- | numpy/core/src/umath/loops.h.src  |  13 |
-rw-r--r-- | numpy/core/src/umath/simd.inc.src | 653 |

5 files changed, 10 insertions, 687 deletions
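The deleted simd.inc.src implemented complex conjugate on AVX512F by XOR-ing the IEEE-754 sign bit of the imaginary lanes (the `@neg_mask@` constants `0x80000000` / `0x8000000000000000` in the diff below) rather than multiplying by -1. A minimal scalar sketch of that same bit trick; the `flip_sign` helper is illustrative only, not NumPy code:

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Toggle the IEEE-754 sign bit of a float via an integer XOR,
 * mirroring the mask used by the removed avx512_conjugate_* kernels. */
static float flip_sign(float x)
{
    uint32_t bits;
    memcpy(&bits, &x, sizeof bits);   /* type-pun safely through memcpy */
    bits ^= 0x80000000u;              /* flip only the sign bit */
    memcpy(&x, &bits, sizeof x);
    return x;
}

int main(void)
{
    float z_re = 1.5f, z_im = -2.25f;
    /* conj(1.5 - 2.25i) = 1.5 + 2.25i */
    printf("conj(%g%+gi) = %g%+gi\n", z_re, z_im, z_re, flip_sign(z_im));
    return 0;
}
```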
diff --git a/numpy/core/meson.build b/numpy/core/meson.build
index 27d7ab851..2e349ff5a 100644
--- a/numpy/core/meson.build
+++ b/numpy/core/meson.build
@@ -761,7 +761,6 @@ src_umath = [
   src_file.process('src/umath/loops_unary_fp_le.dispatch.c.src'),
   src_file.process('src/umath/matmul.c.src'),
   src_file.process('src/umath/matmul.h.src'),
-  src_file.process('src/umath/simd.inc.src'),
   'src/umath/ufunc_type_resolution.c',
   'src/umath/clip.cpp',
   'src/umath/clip.h',
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index e509b9d11..128aed779 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -1001,7 +1001,6 @@ def configuration(parent_package='',top_path=None):
             join('src', 'umath', 'umathmodule.c'),
             join('src', 'umath', 'reduction.c'),
             join('src', 'umath', 'funcs.inc.src'),
-            join('src', 'umath', 'simd.inc.src'),
             join('src', 'umath', 'loops.h.src'),
             join('src', 'umath', 'loops_utils.h.src'),
             join('src', 'umath', 'loops.c.src'),
@@ -1042,7 +1041,6 @@ def configuration(parent_package='',top_path=None):
             join('src', 'multiarray', 'common.h'),
             join('src', 'multiarray', 'number.h'),
             join('src', 'common', 'templ_common.h.src'),
-            join('src', 'umath', 'simd.inc.src'),
             join('src', 'umath', 'override.h'),
             join(codegen_dir, 'generate_ufunc_api.py'),
             join(codegen_dir, 'ufunc_docstrings.py'),
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 2b70a4b9a..b3b70717a 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -13,6 +13,7 @@
 #include "numpy/npy_math.h"
 #include "numpy/halffloat.h"
 #include "lowlevel_strided_loops.h"
+#include "loops_utils.h"
 #include "npy_pycompat.h"
 
 
@@ -44,14 +45,6 @@
 /** Provides the various *_LOOP macros */
 #include "fast_loop_macros.h"
 
-/*
- * include vectorized functions and dispatchers
- * this file is safe to include also for generic builds
- * platform specific instructions are either masked via the proprocessor or
- * runtime detected
- */
-#include "simd.inc"
-
 /******************************************************************************
  **                          GENERIC FLOAT LOOPS                             **
  *****************************************************************************/
@@ -2116,6 +2109,8 @@ NPY_NO_EXPORT void
 }
 /**end repeat1**/
 
+#if !@SIMD@
+// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src'
 NPY_NO_EXPORT void
 @TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
 {
@@ -2126,6 +2121,7 @@ NPY_NO_EXPORT void
         ((@ftype@ *)op1)[1] = in1r*in1i + in1i*in1r;
     }
 }
+#endif
 
 NPY_NO_EXPORT void
 @TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
@@ -2156,6 +2152,8 @@ NPY_NO_EXPORT void
     }
 }
 
+#if !@SIMD@
+// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src'
 NPY_NO_EXPORT void
 @TYPE@_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
     UNARY_LOOP {
@@ -2175,20 +2173,6 @@ NPY_NO_EXPORT void
         *((@ftype@ *)op1) = npy_hypot@c@(in1r, in1i);
     }
 }
-
-#if @SIMD@ && defined(HAVE_ATTRIBUTE_TARGET_AVX512F)
-/**begin repeat1
- * arithmetic
- * #kind = conjugate, square, absolute#
- */
-NPY_NO_EXPORT void
-@TYPE@_@kind@_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func)
-{
-    if (!run_unary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
-        @TYPE@_@kind@(args, dimensions, steps, func);
-    }
-}
-/**end repeat1**/
 #endif
 
 NPY_NO_EXPORT void
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 26742946f..06945d091 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -540,7 +540,7 @@ NPY_NO_EXPORT void
  * #TYPE = CFLOAT, CDOUBLE#
  */
 /**begin repeat1
- * #kind = add, subtract, multiply#
+ * #kind = add, subtract, multiply, conjugate, absolute, square#
  */
 NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
@@ -609,19 +609,14 @@ C@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *step
 NPY_NO_EXPORT void
 C@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
 
-/**begin repeat1
- * #isa = , _avx512f#
- */
-
 NPY_NO_EXPORT void
-C@TYPE@_conjugate@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+C@TYPE@_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
 
 NPY_NO_EXPORT void
-C@TYPE@_absolute@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+C@TYPE@_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
 
 NPY_NO_EXPORT void
-C@TYPE@_square@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data));
-/**end repeat1**/
+C@TYPE@_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 C@TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
deleted file mode 100644
index 803c45948..000000000
--- a/numpy/core/src/umath/simd.inc.src
+++ /dev/null
@@ -1,653 +0,0 @@
-
-
-/*
- * This file is for the definitions of simd vectorized operations.
- *
- * Currently contains sse2 functions that are built on amd64, x32 or
- * non-generic builds (CFLAGS=-march=...)
- * In future it may contain other instruction sets like AVX or NEON detected
- * at runtime in which case it needs to be included indirectly via a file
- * compiled with special options (or use gcc target attributes) so the binary
- * stays portable.
- */
-
-
-#ifndef __NPY_SIMD_INC
-#define __NPY_SIMD_INC
-
-#include "lowlevel_strided_loops.h"
-#include "numpy/npy_common.h"
-#include "numpy/npy_math.h"
-#include "npy_simd_data.h"
-#ifdef NPY_HAVE_SSE2_INTRINSICS
-#include <emmintrin.h>
-#if !defined(_MSC_VER) || _MSC_VER >= 1600
-#include <immintrin.h>
-#else
-#undef __AVX2__
-#undef __AVX512F__
-#endif
-#endif
-#include "loops_utils.h" // nomemoverlap
-#include <assert.h>
-#include <stdlib.h>
-#include <float.h>
-#include <string.h> /* for memcpy */
-
-#define VECTOR_SIZE_BYTES 16
-
-/*
- * Dispatcher functions
- * decide whether the operation can be vectorized and run it
- * if it was run returns true and false if nothing was done
- */
-
-/*
- *****************************************************************************
- **                          CMPLX DISPATCHERS
- *****************************************************************************
- */
-
-/**begin repeat
- * #TYPE = CFLOAT, CDOUBLE#
- * #type= npy_float, npy_double#
- * #esize = 8, 16#
- */
-
-/**begin repeat1
- * #func = square, absolute, conjugate#
- * #outsize = 1, 2, 1#
- * #max_stride = 2, 8, 8#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static inline NPY_GCC_TARGET_AVX512F void
-AVX512F_@func@_@TYPE@(@type@*, @type@*, const npy_intp n, const npy_intp stride);
-#endif
-
-static inline int
-run_unary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
-{
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-    if ((IS_OUTPUT_BLOCKABLE_UNARY(@esize@, (npy_uint)(@esize@/@outsize@), 64)) && (labs(steps[0]) < 2*@max_stride@*@esize@)) {
-        AVX512F_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
-        return 1;
-    }
-    else
-        return 0;
-#endif
-    return 0;
-}
-
-/**end repeat1**/
-/**end repeat**/
-
-#ifdef NPY_HAVE_SSE2_INTRINSICS
-
-/*
- * Vectorized operations
- */
-/*
- *****************************************************************************
- **                           FLOAT LOOPS
- *****************************************************************************
- */
-
-/**begin repeat
-* horizontal reductions on a vector
-* # VOP = min, max#
-*/
-
-NPY_FINLINE npy_float sse2_horizontal_@VOP@___m128(__m128 v)
-{
-    npy_float r;
-    __m128 tmp = _mm_movehl_ps(v, v);                   /* c     d     ... */
-    __m128 m = _mm_@VOP@_ps(v, tmp);                    /* m(ac) m(bd) ... */
-    tmp = _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1));/* m(bd) m(bd) ... */
-    _mm_store_ss(&r, _mm_@VOP@_ps(tmp, m));             /* m(acbd) ... */
-    return r;
-}
-
-NPY_FINLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
-{
-    npy_double r;
-    __m128d tmp = _mm_unpackhi_pd(v, v);    /* b     b */
-    _mm_store_sd(&r, _mm_@VOP@_pd(tmp, v)); /* m(ab) m(bb) */
-    return r;
-}
-/**end repeat**/
-
-/* bunch of helper functions used in ISA_exp/log_FLOAT*/
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_get_full_load_mask_ps(void)
-{
-    return _mm256_set1_ps(-1.0);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
-fma_get_full_load_mask_pd(void)
-{
-    return _mm256_castpd_si256(_mm256_set1_pd(-1.0));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
-{
-    float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
-                          1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
-    float* addr = maskint + num_lanes - num_elem;
-    return _mm256_loadu_ps(addr);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
-fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
-{
-    npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1};
-    npy_int* addr = maskint + 2*num_lanes - 2*num_elem;
-    return _mm256_loadu_si256((__m256i*) addr);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_masked_gather_ps(__m256 src,
-                     npy_float* addr,
-                     __m256i vindex,
-                     __m256 mask)
-{
-    return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
-fma_masked_gather_pd(__m256d src,
-                     npy_double* addr,
-                     __m128i vindex,
-                     __m256d mask)
-{
-    return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_masked_load_ps(__m256 mask, npy_float* addr)
-{
-    return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
-fma_masked_load_pd(__m256i mask, npy_double* addr)
-{
-    return _mm256_maskload_pd(addr, mask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask)
-{
-    return _mm256_blendv_ps(x, val, mask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
-fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask)
-{
-    return _mm256_blendv_pd(x, val, mask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_blend(__m256 x, __m256 y, __m256 ymask)
-{
-    return _mm256_blendv_ps(x, y, ymask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_invert_mask_ps(__m256 ymask)
-{
-    return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
-fma_invert_mask_pd(__m256i ymask)
-{
-    return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
-}
-
-/**begin repeat
- * #vsub = ps, pd#
- * #vtype = __m256, __m256d#
- */
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_abs_@vsub@(@vtype@ x)
-{
-    return _mm256_andnot_@vsub@(_mm256_set1_@vsub@(-0.0), x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_reciprocal_@vsub@(@vtype@ x)
-{
-    return _mm256_div_@vsub@(_mm256_set1_@vsub@(1.0f), x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_rint_@vsub@(@vtype@ x)
-{
-    return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEAREST_INT);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_floor_@vsub@(@vtype@ x)
-{
-    return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEG_INF);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_trunc_@vsub@(@vtype@ x)
-{
-    return _mm256_round_@vsub@(x, _MM_FROUND_TO_ZERO);
-}
-/**end repeat**/
-#endif
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_get_full_load_mask_ps(void)
-{
-    return 0xFFFF;
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
-avx512_get_full_load_mask_pd(void)
-{
-    return 0xFF;
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
-{
-    return (0x0001 << num_elem) - 0x0001;
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
-avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
-{
-    return (0x01 << num_elem) - 0x01;
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_masked_gather_ps(__m512 src,
-                        npy_float* addr,
-                        __m512i vindex,
-                        __mmask16 kmask)
-{
-    return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
-avx512_masked_gather_pd(__m512d src,
-                        npy_double* addr,
-                        __m256i vindex,
-                        __mmask8 kmask)
-{
-    return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
-{
-    return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
-avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
-{
-    return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask)
-{
-    return _mm512_mask_blend_ps(mask, x, val);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
-avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask)
-{
-    return _mm512_mask_blend_pd(mask, x, val);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
-{
-    return _mm512_mask_mov_ps(x, ymask, y);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_invert_mask_ps(__mmask16 ymask)
-{
-    return _mm512_knot(ymask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
-avx512_invert_mask_pd(__mmask8 ymask)
-{
-    return _mm512_knot(ymask);
-}
-
-/**begin repeat
- * #vsub = ps, pd#
- * #type= npy_float, npy_double#
- * #epi_vsub = epi32, epi64#
- * #vtype = __m512, __m512d#
- * #mask = __mmask16, __mmask8#
- * #and_const = 0x7fffffff, 0x7fffffffffffffffLL#
- * #neg_mask = 0x80000000, 0x8000000000000000#
- * #perm_ = 0xb1, 0x55#
- * #cmpx_img_mask = 0xAAAA, 0xAA#
- * #cmpx_re_mask = 0x5555, 0x55#
- * #INF = NPY_INFINITYF, NPY_INFINITY#
- * #NAN = NPY_NANF, NPY_NAN#
- */
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_abs_@vsub@(@vtype@ x)
-{
-    return (@vtype@) _mm512_and_@epi_vsub@((__m512i) x,
-                         _mm512_set1_@epi_vsub@ (@and_const@));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_reciprocal_@vsub@(@vtype@ x)
-{
-    return _mm512_div_@vsub@(_mm512_set1_@vsub@(1.0f), x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_rint_@vsub@(@vtype@ x)
-{
-    return _mm512_roundscale_@vsub@(x, 0x08);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_floor_@vsub@(@vtype@ x)
-{
-    return _mm512_roundscale_@vsub@(x, 0x09);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_trunc_@vsub@(@vtype@ x)
-{
-    return _mm512_roundscale_@vsub@(x, 0x0B);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_hadd_@vsub@(const @vtype@ x)
-{
-    return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_hsub_@vsub@(const @vtype@ x)
-{
-    return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_cabsolute_@vsub@(const @vtype@ x1,
-                        const @vtype@ x2,
-                        const __m512i re_indices,
-                        const __m512i im_indices)
-{
-    @vtype@ inf = _mm512_set1_@vsub@(@INF@);
-    @vtype@ nan = _mm512_set1_@vsub@(@NAN@);
-    @vtype@ x1_abs = avx512_abs_@vsub@(x1);
-    @vtype@ x2_abs = avx512_abs_@vsub@(x2);
-    @vtype@ re = _mm512_permutex2var_@vsub@(x1_abs, re_indices, x2_abs);
-    @vtype@ im = _mm512_permutex2var_@vsub@(x1_abs, im_indices , x2_abs);
-    /*
-     * If real or imag = INF, then convert it to inf + j*inf
-     * Handles: inf + j*nan, nan + j*inf
-     */
-    @mask@ re_infmask = _mm512_cmp_@vsub@_mask(re, inf, _CMP_EQ_OQ);
-    @mask@ im_infmask = _mm512_cmp_@vsub@_mask(im, inf, _CMP_EQ_OQ);
-    im = _mm512_mask_mov_@vsub@(im, re_infmask, inf);
-    re = _mm512_mask_mov_@vsub@(re, im_infmask, inf);
-
-    /*
-     * If real or imag = NAN, then convert it to nan + j*nan
-     * Handles: x + j*nan, nan + j*x
-     */
-    @mask@ re_nanmask = _mm512_cmp_@vsub@_mask(re, re, _CMP_NEQ_UQ);
-    @mask@ im_nanmask = _mm512_cmp_@vsub@_mask(im, im, _CMP_NEQ_UQ);
-    im = _mm512_mask_mov_@vsub@(im, re_nanmask, nan);
-    re = _mm512_mask_mov_@vsub@(re, im_nanmask, nan);
-
-    @vtype@ larger  = _mm512_max_@vsub@(re, im);
-    @vtype@ smaller = _mm512_min_@vsub@(im, re);
-
-    /*
-     * Calculate div_mask to prevent 0./0. and inf/inf operations in div
-     */
-    @mask@ zeromask = _mm512_cmp_@vsub@_mask(larger, _mm512_setzero_@vsub@(), _CMP_EQ_OQ);
-    @mask@ infmask = _mm512_cmp_@vsub@_mask(smaller, inf, _CMP_EQ_OQ);
-    @mask@ div_mask = _mm512_knot(_mm512_kor(zeromask, infmask));
-    @vtype@ ratio = _mm512_maskz_div_@vsub@(div_mask, smaller, larger);
-    @vtype@ hypot = _mm512_sqrt_@vsub@(_mm512_fmadd_@vsub@(
-                                        ratio, ratio, _mm512_set1_@vsub@(1.0f)));
-    return _mm512_mul_@vsub@(hypot, larger);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_conjugate_@vsub@(const @vtype@ x)
-{
-    /*
-     * __mm512_mask_xor_ps/pd requires AVX512DQ. We cast it to __m512i and
-     * use the xor_epi32/64 uinstruction instead. Cast is a zero latency instruction
-     */
-    __m512i cast_x = _mm512_cast@vsub@_si512(x);
-    __m512i res = _mm512_mask_xor_@epi_vsub@(cast_x, @cmpx_img_mask@,
-                                    cast_x, _mm512_set1_@epi_vsub@(@neg_mask@));
-    return _mm512_castsi512_@vsub@(res);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
-{
-    // x1 = r1, i1
-    // x2 = r2, i2
-    @vtype@ x3  = _mm512_permute_@vsub@(x2, @perm_@);   // i2, r2
-    @vtype@ x12 = _mm512_mul_@vsub@(x1, x2);            // r1*r2, i1*i2
-    @vtype@ x13 = _mm512_mul_@vsub@(x1, x3);            // r1*i2, r2*i1
-    @vtype@ outreal = avx512_hsub_@vsub@(x12);          // r1*r2 - i1*i2, r1*r2 - i1*i2
-    @vtype@ outimg  = avx512_hadd_@vsub@(x13);          // r1*i2 + i1*r2, r1*i2 + i1*r2
-    return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_csquare_@vsub@(@vtype@ x)
-{
-    return avx512_cmul_@vsub@(x, x);
-}
-
-/**end repeat**/
-#endif
-
-/**begin repeat
- * #ISA = FMA, AVX512F#
- * #isa = fma, avx512#
- * #vtype = __m256, __m512#
- * #vsize = 256, 512#
- * #or = or_ps, kor#
- * #vsub = , _mask#
- * #mask = __m256, __mmask16#
- * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps#
- * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
- **/
-
-#if defined @CHK@
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
-@isa@_sqrt_ps(@vtype@ x)
-{
-    return _mm@vsize@_sqrt_ps(x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
-@isa@_sqrt_pd(@vtype@d x)
-{
-    return _mm@vsize@_sqrt_pd(x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
-@isa@_square_ps(@vtype@ x)
-{
-    return _mm@vsize@_mul_ps(x,x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
-@isa@_square_pd(@vtype@d x)
-{
-    return _mm@vsize@_mul_pd(x,x);
-}
-
-#endif
-/**end repeat**/
-
-/**begin repeat
- * #TYPE = CFLOAT, CDOUBLE#
- * #type = npy_float, npy_double#
- * #num_lanes = 16, 8#
- * #vsuffix = ps, pd#
- * #epi_vsub = epi32, epi64#
- * #mask = __mmask16, __mmask8#
- * #vtype = __m512, __m512d#
- * #scale = 4, 8#
- * #vindextype = __m512i, __m256i#
- * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
- * #storemask = 0xFF, 0xF#
- * #IS_FLOAT = 1, 0#
- */
-
-/**begin repeat1
- * #func = square, conjugate#
- * #vectorf = avx512_csquare, avx512_conjugate#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_GCC_OPT_3 inline NPY_GCC_TARGET_AVX512F void
-AVX512F_@func@_@TYPE@(@type@ * op,
-                      @type@ * ip,
-                      const npy_intp array_size,
-                      const npy_intp steps)
-{
-    npy_intp num_remaining_elements = 2*array_size;
-    const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2;
-
-    /*
-     * Note: while generally indices are npy_intp, we ensure that our maximum index
-     * will fit in an int32 as a precondition for this function via max_stride
-     */
-    npy_int32 index_ip1[16];
-    for (npy_int32 ii = 0; ii < @num_lanes@; ii=ii+2) {
-        index_ip1[ii] = ii*stride_ip1;
-        index_ip1[ii+1] = ii*stride_ip1 + 1;
-    }
-    @vindextype@ vindex = @vindexload@((@vindextype@*)index_ip1);
-    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
-    @vtype@ zeros = _mm512_setzero_@vsuffix@();
-
-    while (num_remaining_elements > 0) {
-        if (num_remaining_elements < @num_lanes@) {
-            load_mask = avx512_get_partial_load_mask_@vsuffix@(
-                    num_remaining_elements, @num_lanes@);
-        }
-        @vtype@ x1;
-        if (stride_ip1 == 1) {
-            x1 = avx512_masked_load_@vsuffix@(load_mask, ip);
-        }
-        else {
-            x1 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex, load_mask);
-        }
-
-        @vtype@ out = @vectorf@_@vsuffix@(x1);
-
-        _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
-        op += @num_lanes@;
-        ip += @num_lanes@*stride_ip1;
-        num_remaining_elements -= @num_lanes@;
-    }
-}
-#endif
-/**end repeat1**/
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_GCC_OPT_3 inline NPY_GCC_TARGET_AVX512F void
-AVX512F_absolute_@TYPE@(@type@ * op,
-                        @type@ * ip,
-                        const npy_intp array_size,
-                        const npy_intp steps)
-{
-    npy_intp num_remaining_elements = 2*array_size;
-    const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2;
-
-    /*
-     * Note: while generally indices are npy_intp, we ensure that our maximum index
-     * will fit in an int32 as a precondition for this function via max_stride
-     */
-    npy_int32 index_ip[32];
-    for (npy_int32 ii = 0; ii < 2*@num_lanes@; ii=ii+2) {
-        index_ip[ii] = ii*stride_ip1;
-        index_ip[ii+1] = ii*stride_ip1 + 1;
-    }
-    @vindextype@ vindex1 = @vindexload@((@vindextype@*)index_ip);
-    @vindextype@ vindex2 = @vindexload@((@vindextype@*)(index_ip+@num_lanes@));
-
-    @mask@ load_mask1 = avx512_get_full_load_mask_@vsuffix@();
-    @mask@ load_mask2 = avx512_get_full_load_mask_@vsuffix@();
-    @mask@ store_mask = avx512_get_full_load_mask_@vsuffix@();
-    @vtype@ zeros = _mm512_setzero_@vsuffix@();
-
-#if @IS_FLOAT@
-    __m512i re_index = _mm512_set_epi32(30,28,26,24,22,20,18,16,14,12,10,8,6,4,2,0);
-    __m512i im_index = _mm512_set_epi32(31,29,27,25,23,21,19,17,15,13,11,9,7,5,3,1);
-#else
-    __m512i re_index = _mm512_set_epi64(14,12,10,8,6,4,2,0);
-    __m512i im_index = _mm512_set_epi64(15,13,11,9,7,5,3,1);
-#endif
-
-    while (num_remaining_elements > 0) {
-        if (num_remaining_elements < @num_lanes@) {
-            load_mask1 = avx512_get_partial_load_mask_@vsuffix@(
-                    num_remaining_elements, @num_lanes@);
-            load_mask2 = 0x0000;
-            store_mask = avx512_get_partial_load_mask_@vsuffix@(
-                    num_remaining_elements/2, @num_lanes@);
-        } else if (num_remaining_elements < 2*@num_lanes@) {
-            load_mask1 = avx512_get_full_load_mask_@vsuffix@();
-            load_mask2 = avx512_get_partial_load_mask_@vsuffix@(
-                    num_remaining_elements - @num_lanes@, @num_lanes@);
-            store_mask = avx512_get_partial_load_mask_@vsuffix@(
-                    num_remaining_elements/2, @num_lanes@);
-        }
-        @vtype@ x1, x2;
-        if (stride_ip1 == 1) {
-            x1 = avx512_masked_load_@vsuffix@(load_mask1, ip);
-            x2 = avx512_masked_load_@vsuffix@(load_mask2, ip+@num_lanes@);
-        }
-        else {
-            x1 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex1, load_mask1);
-            x2 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex2, load_mask2);
-        }
-
-        @vtype@ out = avx512_cabsolute_@vsuffix@(x1, x2, re_index, im_index);
-
-        _mm512_mask_storeu_@vsuffix@(op, store_mask, out);
-        op += @num_lanes@;
-        ip += 2*@num_lanes@*stride_ip1;
-        num_remaining_elements -= 2*@num_lanes@;
-    }
-    npy_clear_floatstatus_barrier((char*)&num_remaining_elements);
-}
-
-#endif
-/**end repeat**/
-
-#undef VECTOR_SIZE_BYTES
-#endif  /* NPY_HAVE_SSE2_INTRINSICS */
-#endif
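For reference, the scalar fallback loops kept in loops.c.src (and the CFLOAT/CDOUBLE kinds now routed through 'loops_arithm_fp.dispatch.c.src') follow the usual complex identities: square computes (a*a - b*b) + (a*b + b*a)i, conjugate flips the sign of the imaginary part, and absolute reduces to hypot(a, b) via npy_hypot@c@. A self-contained C sketch of those semantics; the helper names below are illustrative only and not part of NumPy:

```c
#include <math.h>
#include <stdio.h>

/* Scalar semantics mirrored by the fallback complex loops:
 * square:    (a + bi)^2   = (a*a - b*b) + (a*b + b*a)i
 * conjugate: conj(a + bi) =  a - bi
 * absolute:  |a + bi|     =  hypot(a, b)
 */
static void csquare(const double in[2], double out[2])
{
    out[0] = in[0]*in[0] - in[1]*in[1];
    out[1] = in[0]*in[1] + in[1]*in[0];
}

static void cconjugate(const double in[2], double out[2])
{
    out[0] = in[0];
    out[1] = -in[1];
}

static double cabsolute(const double in[2])
{
    return hypot(in[0], in[1]);
}

int main(void)
{
    double z[2] = {3.0, 4.0}, sq[2], cj[2];
    csquare(z, sq);
    cconjugate(z, cj);
    printf("square   = %g%+gi\n", sq[0], sq[1]);   /* -7+24i */
    printf("conj     = %g%+gi\n", cj[0], cj[1]);   /* 3-4i   */
    printf("absolute = %g\n", cabsolute(z));       /* 5      */
    return 0;
}
```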