author     Sayed Adel <seiko@imavr.com>    2022-12-15 20:07:18 +0200
committer  Sayed Adel <seiko@imavr.com>    2023-01-29 13:02:36 +0200
commit     640e85017aa8eac3e9be68b475acf27d623b16b7
tree       7da0ef7833c58729080ce09275387b09a5d1ccdb
parent     5749d39a52048761555e384cd5c3575ebe8988b7
download   numpy-640e85017aa8eac3e9be68b475acf27d623b16b7.tar.gz
ENH: remove raw SIMD of complex operations
-rw-r--r--  numpy/core/meson.build             |   1
-rw-r--r--  numpy/core/setup.py                |   2
-rw-r--r--  numpy/core/src/umath/loops.c.src   |  28
-rw-r--r--  numpy/core/src/umath/loops.h.src   |  13
-rw-r--r--  numpy/core/src/umath/simd.inc.src  | 653
5 files changed, 10 insertions(+), 687 deletions(-)
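
This commit drops the hand-written SSE2/AVX512 kernels in simd.inc.src; the CFLOAT and CDOUBLE loops for square, conjugate and absolute are instead supplied by the universal-intrinsics dispatch source loops_arithm_fp.dispatch.c.src (see the comments added to loops.c.src below). For orientation, a minimal scalar sketch of what these three ufuncs compute, matching the generic loop bodies that remain in loops.c.src (helper names here are illustrative only, not NumPy API):

#include <math.h>

/* complex square: (r + i*j)^2 = (r*r - i*i) + (r*i + i*r)*j */
static void csquaref(const float in[2], float out[2])
{
    const float r = in[0], i = in[1];
    out[0] = r*r - i*i;
    out[1] = r*i + i*r;
}

/* complex conjugate: negate the imaginary part */
static void cconjugatef(const float in[2], float out[2])
{
    out[0] = in[0];
    out[1] = -in[1];
}

/* complex absolute: the generic loop calls npy_hypot@c@ */
static float cabsolutef(const float in[2])
{
    return hypotf(in[0], in[1]);
}
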
diff --git a/numpy/core/meson.build b/numpy/core/meson.build
index 27d7ab851..2e349ff5a 100644
--- a/numpy/core/meson.build
+++ b/numpy/core/meson.build
@@ -761,7 +761,6 @@ src_umath = [
src_file.process('src/umath/loops_unary_fp_le.dispatch.c.src'),
src_file.process('src/umath/matmul.c.src'),
src_file.process('src/umath/matmul.h.src'),
- src_file.process('src/umath/simd.inc.src'),
'src/umath/ufunc_type_resolution.c',
'src/umath/clip.cpp',
'src/umath/clip.h',
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index e509b9d11..128aed779 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -1001,7 +1001,6 @@ def configuration(parent_package='',top_path=None):
join('src', 'umath', 'umathmodule.c'),
join('src', 'umath', 'reduction.c'),
join('src', 'umath', 'funcs.inc.src'),
- join('src', 'umath', 'simd.inc.src'),
join('src', 'umath', 'loops.h.src'),
join('src', 'umath', 'loops_utils.h.src'),
join('src', 'umath', 'loops.c.src'),
@@ -1042,7 +1041,6 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'common.h'),
join('src', 'multiarray', 'number.h'),
join('src', 'common', 'templ_common.h.src'),
- join('src', 'umath', 'simd.inc.src'),
join('src', 'umath', 'override.h'),
join(codegen_dir, 'generate_ufunc_api.py'),
join(codegen_dir, 'ufunc_docstrings.py'),
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 2b70a4b9a..b3b70717a 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -13,6 +13,7 @@
#include "numpy/npy_math.h"
#include "numpy/halffloat.h"
#include "lowlevel_strided_loops.h"
+#include "loops_utils.h"
#include "npy_pycompat.h"
@@ -44,14 +45,6 @@
/** Provides the various *_LOOP macros */
#include "fast_loop_macros.h"
-/*
- * include vectorized functions and dispatchers
- * this file is safe to include also for generic builds
- * platform specific instructions are either masked via the preprocessor or
- * runtime detected
- */
-#include "simd.inc"
-
/******************************************************************************
** GENERIC FLOAT LOOPS **
*****************************************************************************/
@@ -2116,6 +2109,8 @@ NPY_NO_EXPORT void
}
/**end repeat1**/
+#if !@SIMD@
+// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src'
NPY_NO_EXPORT void
@TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
@@ -2126,6 +2121,7 @@ NPY_NO_EXPORT void
((@ftype@ *)op1)[1] = in1r*in1i + in1i*in1r;
}
}
+#endif
NPY_NO_EXPORT void
@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
@@ -2156,6 +2152,8 @@ NPY_NO_EXPORT void
}
}
+#if !@SIMD@
+// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src'
NPY_NO_EXPORT void
@TYPE@_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
UNARY_LOOP {
@@ -2175,20 +2173,6 @@ NPY_NO_EXPORT void
*((@ftype@ *)op1) = npy_hypot@c@(in1r, in1i);
}
}
-
-#if @SIMD@ && defined(HAVE_ATTRIBUTE_TARGET_AVX512F)
-/**begin repeat1
- * arithmetic
- * #kind = conjugate, square, absolute#
- */
-NPY_NO_EXPORT void
-@TYPE@_@kind@_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func)
-{
- if (!run_unary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
- @TYPE@_@kind@(args, dimensions, steps, func);
- }
-}
-/**end repeat1**/
#endif
NPY_NO_EXPORT void
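
The new #if !@SIMD@ guards above mean the scalar square and conjugate bodies are only emitted for the complex type whose loops are not supplied by the dispatch source. As a hedged illustration, assuming the usual substitution lists defined earlier in loops.c.src (@TYPE@ = CFLOAT, CDOUBLE, CLONGDOUBLE with @SIMD@ = 1, 1, 0; they are not visible in this hunk), the guarded template expands only for CLONGDOUBLE, roughly as:

NPY_NO_EXPORT void
CLONGDOUBLE_square(char **args, npy_intp const *dimensions,
                   npy_intp const *steps, void *NPY_UNUSED(data))
{
    UNARY_LOOP {
        const npy_longdouble in1r = ((npy_longdouble *)ip1)[0];
        const npy_longdouble in1i = ((npy_longdouble *)ip1)[1];
        ((npy_longdouble *)op1)[0] = in1r*in1r - in1i*in1i;  /* real part */
        ((npy_longdouble *)op1)[1] = in1r*in1i + in1i*in1r;  /* imaginary part */
    }
}
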
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 26742946f..06945d091 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -540,7 +540,7 @@ NPY_NO_EXPORT void
* #TYPE = CFLOAT, CDOUBLE#
*/
/**begin repeat1
- * #kind = add, subtract, multiply#
+ * #kind = add, subtract, multiply, conjugate, absolute, square#
*/
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
@@ -609,19 +609,14 @@ C@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *step
NPY_NO_EXPORT void
C@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-/**begin repeat1
- * #isa = , _avx512f#
- */
-
NPY_NO_EXPORT void
-C@TYPE@_conjugate@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+C@TYPE@_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
NPY_NO_EXPORT void
-C@TYPE@_absolute@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+C@TYPE@_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
NPY_NO_EXPORT void
-C@TYPE@_square@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data));
-/**end repeat1**/
+C@TYPE@_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data));
NPY_NO_EXPORT void
C@TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
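
With the @isa@ repeat removed, conjugate, absolute and square now sit in the same NPY_CPU_DISPATCH_DECLARE repeat as add, subtract and multiply, so target-specific symbols come from the dispatch macro instead of hand-written _avx512f variants. A sketch of what the extended repeat block generates for one combination (template expansion only; the dispatch macro itself lives in NumPy's CPU-dispatch headers):

/* @TYPE@ = CFLOAT, @kind@ = conjugate */
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CFLOAT_conjugate,
    (char **args, npy_intp const *dimensions, npy_intp const *steps,
     void *NPY_UNUSED(data)))

Roughly speaking, the macro declares one prototype per enabled dispatch target plus a baseline version, which is what replaces the per-call AVX512F wrappers deleted from loops.c.src above.
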
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
deleted file mode 100644
index 803c45948..000000000
--- a/numpy/core/src/umath/simd.inc.src
+++ /dev/null
@@ -1,653 +0,0 @@
-
-
-/*
- * This file is for the definitions of simd vectorized operations.
- *
- * Currently contains sse2 functions that are built on amd64, x32 or
- * non-generic builds (CFLAGS=-march=...)
- * In future it may contain other instruction sets like AVX or NEON detected
- * at runtime in which case it needs to be included indirectly via a file
- * compiled with special options (or use gcc target attributes) so the binary
- * stays portable.
- */
-
-
-#ifndef __NPY_SIMD_INC
-#define __NPY_SIMD_INC
-
-#include "lowlevel_strided_loops.h"
-#include "numpy/npy_common.h"
-#include "numpy/npy_math.h"
-#include "npy_simd_data.h"
-#ifdef NPY_HAVE_SSE2_INTRINSICS
-#include <emmintrin.h>
-#if !defined(_MSC_VER) || _MSC_VER >= 1600
-#include <immintrin.h>
-#else
-#undef __AVX2__
-#undef __AVX512F__
-#endif
-#endif
-#include "loops_utils.h" // nomemoverlap
-#include <assert.h>
-#include <stdlib.h>
-#include <float.h>
-#include <string.h> /* for memcpy */
-
-#define VECTOR_SIZE_BYTES 16
-
-/*
- * Dispatcher functions
- * decide whether the operation can be vectorized and run it
- * if it was run returns true and false if nothing was done
- */
-
-/*
- *****************************************************************************
- ** CMPLX DISPATCHERS
- *****************************************************************************
- */
-
-/**begin repeat
- * #TYPE = CFLOAT, CDOUBLE#
- * #type= npy_float, npy_double#
- * #esize = 8, 16#
- */
-
-/**begin repeat1
- * #func = square, absolute, conjugate#
- * #outsize = 1, 2, 1#
- * #max_stride = 2, 8, 8#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static inline NPY_GCC_TARGET_AVX512F void
-AVX512F_@func@_@TYPE@(@type@*, @type@*, const npy_intp n, const npy_intp stride);
-#endif
-
-static inline int
-run_unary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
-{
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
- if ((IS_OUTPUT_BLOCKABLE_UNARY(@esize@, (npy_uint)(@esize@/@outsize@), 64)) && (labs(steps[0]) < 2*@max_stride@*@esize@)) {
- AVX512F_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
- return 1;
- }
- else
- return 0;
-#endif
- return 0;
-}
-
-/**end repeat1**/
-/**end repeat**/
-
-#ifdef NPY_HAVE_SSE2_INTRINSICS
-
-/*
- * Vectorized operations
- */
-/*
- *****************************************************************************
- ** FLOAT LOOPS
- *****************************************************************************
- */
-
-/**begin repeat
-* horizontal reductions on a vector
-* # VOP = min, max#
-*/
-
-NPY_FINLINE npy_float sse2_horizontal_@VOP@___m128(__m128 v)
-{
- npy_float r;
- __m128 tmp = _mm_movehl_ps(v, v); /* c d ... */
- __m128 m = _mm_@VOP@_ps(v, tmp); /* m(ac) m(bd) ... */
- tmp = _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1));/* m(bd) m(bd) ... */
- _mm_store_ss(&r, _mm_@VOP@_ps(tmp, m)); /* m(acbd) ... */
- return r;
-}
-
-NPY_FINLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
-{
- npy_double r;
- __m128d tmp = _mm_unpackhi_pd(v, v); /* b b */
- _mm_store_sd(&r, _mm_@VOP@_pd(tmp, v)); /* m(ab) m(bb) */
- return r;
-}
-/**end repeat**/
-
-/* bunch of helper functions used in ISA_exp/log_FLOAT*/
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_get_full_load_mask_ps(void)
-{
- return _mm256_set1_ps(-1.0);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
-fma_get_full_load_mask_pd(void)
-{
- return _mm256_castpd_si256(_mm256_set1_pd(-1.0));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
-{
- float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
- 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
- float* addr = maskint + num_lanes - num_elem;
- return _mm256_loadu_ps(addr);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
-fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
-{
- npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1};
- npy_int* addr = maskint + 2*num_lanes - 2*num_elem;
- return _mm256_loadu_si256((__m256i*) addr);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_masked_gather_ps(__m256 src,
- npy_float* addr,
- __m256i vindex,
- __m256 mask)
-{
- return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
-fma_masked_gather_pd(__m256d src,
- npy_double* addr,
- __m128i vindex,
- __m256d mask)
-{
- return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_masked_load_ps(__m256 mask, npy_float* addr)
-{
- return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
-fma_masked_load_pd(__m256i mask, npy_double* addr)
-{
- return _mm256_maskload_pd(addr, mask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask)
-{
- return _mm256_blendv_ps(x, val, mask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
-fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask)
-{
- return _mm256_blendv_pd(x, val, mask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_blend(__m256 x, __m256 y, __m256 ymask)
-{
- return _mm256_blendv_ps(x, y, ymask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_invert_mask_ps(__m256 ymask)
-{
- return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
-fma_invert_mask_pd(__m256i ymask)
-{
- return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
-}
-
-/**begin repeat
- * #vsub = ps, pd#
- * #vtype = __m256, __m256d#
- */
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_abs_@vsub@(@vtype@ x)
-{
- return _mm256_andnot_@vsub@(_mm256_set1_@vsub@(-0.0), x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_reciprocal_@vsub@(@vtype@ x)
-{
- return _mm256_div_@vsub@(_mm256_set1_@vsub@(1.0f), x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_rint_@vsub@(@vtype@ x)
-{
- return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEAREST_INT);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_floor_@vsub@(@vtype@ x)
-{
- return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEG_INF);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_trunc_@vsub@(@vtype@ x)
-{
- return _mm256_round_@vsub@(x, _MM_FROUND_TO_ZERO);
-}
-/**end repeat**/
-#endif
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_get_full_load_mask_ps(void)
-{
- return 0xFFFF;
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
-avx512_get_full_load_mask_pd(void)
-{
- return 0xFF;
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
-{
- return (0x0001 << num_elem) - 0x0001;
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
-avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
-{
- return (0x01 << num_elem) - 0x01;
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_masked_gather_ps(__m512 src,
- npy_float* addr,
- __m512i vindex,
- __mmask16 kmask)
-{
- return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
-avx512_masked_gather_pd(__m512d src,
- npy_double* addr,
- __m256i vindex,
- __mmask8 kmask)
-{
- return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
-{
- return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
-avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
-{
- return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask)
-{
- return _mm512_mask_blend_ps(mask, x, val);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
-avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask)
-{
- return _mm512_mask_blend_pd(mask, x, val);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
-{
- return _mm512_mask_mov_ps(x, ymask, y);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_invert_mask_ps(__mmask16 ymask)
-{
- return _mm512_knot(ymask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
-avx512_invert_mask_pd(__mmask8 ymask)
-{
- return _mm512_knot(ymask);
-}
-
-/**begin repeat
- * #vsub = ps, pd#
- * #type= npy_float, npy_double#
- * #epi_vsub = epi32, epi64#
- * #vtype = __m512, __m512d#
- * #mask = __mmask16, __mmask8#
- * #and_const = 0x7fffffff, 0x7fffffffffffffffLL#
- * #neg_mask = 0x80000000, 0x8000000000000000#
- * #perm_ = 0xb1, 0x55#
- * #cmpx_img_mask = 0xAAAA, 0xAA#
- * #cmpx_re_mask = 0x5555, 0x55#
- * #INF = NPY_INFINITYF, NPY_INFINITY#
- * #NAN = NPY_NANF, NPY_NAN#
- */
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_abs_@vsub@(@vtype@ x)
-{
- return (@vtype@) _mm512_and_@epi_vsub@((__m512i) x,
- _mm512_set1_@epi_vsub@ (@and_const@));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_reciprocal_@vsub@(@vtype@ x)
-{
- return _mm512_div_@vsub@(_mm512_set1_@vsub@(1.0f), x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_rint_@vsub@(@vtype@ x)
-{
- return _mm512_roundscale_@vsub@(x, 0x08);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_floor_@vsub@(@vtype@ x)
-{
- return _mm512_roundscale_@vsub@(x, 0x09);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_trunc_@vsub@(@vtype@ x)
-{
- return _mm512_roundscale_@vsub@(x, 0x0B);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_hadd_@vsub@(const @vtype@ x)
-{
- return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_hsub_@vsub@(const @vtype@ x)
-{
- return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_cabsolute_@vsub@(const @vtype@ x1,
- const @vtype@ x2,
- const __m512i re_indices,
- const __m512i im_indices)
-{
- @vtype@ inf = _mm512_set1_@vsub@(@INF@);
- @vtype@ nan = _mm512_set1_@vsub@(@NAN@);
- @vtype@ x1_abs = avx512_abs_@vsub@(x1);
- @vtype@ x2_abs = avx512_abs_@vsub@(x2);
- @vtype@ re = _mm512_permutex2var_@vsub@(x1_abs, re_indices, x2_abs);
- @vtype@ im = _mm512_permutex2var_@vsub@(x1_abs, im_indices , x2_abs);
- /*
- * If real or imag = INF, then convert it to inf + j*inf
- * Handles: inf + j*nan, nan + j*inf
- */
- @mask@ re_infmask = _mm512_cmp_@vsub@_mask(re, inf, _CMP_EQ_OQ);
- @mask@ im_infmask = _mm512_cmp_@vsub@_mask(im, inf, _CMP_EQ_OQ);
- im = _mm512_mask_mov_@vsub@(im, re_infmask, inf);
- re = _mm512_mask_mov_@vsub@(re, im_infmask, inf);
-
- /*
- * If real or imag = NAN, then convert it to nan + j*nan
- * Handles: x + j*nan, nan + j*x
- */
- @mask@ re_nanmask = _mm512_cmp_@vsub@_mask(re, re, _CMP_NEQ_UQ);
- @mask@ im_nanmask = _mm512_cmp_@vsub@_mask(im, im, _CMP_NEQ_UQ);
- im = _mm512_mask_mov_@vsub@(im, re_nanmask, nan);
- re = _mm512_mask_mov_@vsub@(re, im_nanmask, nan);
-
- @vtype@ larger = _mm512_max_@vsub@(re, im);
- @vtype@ smaller = _mm512_min_@vsub@(im, re);
-
- /*
- * Calculate div_mask to prevent 0./0. and inf/inf operations in div
- */
- @mask@ zeromask = _mm512_cmp_@vsub@_mask(larger, _mm512_setzero_@vsub@(), _CMP_EQ_OQ);
- @mask@ infmask = _mm512_cmp_@vsub@_mask(smaller, inf, _CMP_EQ_OQ);
- @mask@ div_mask = _mm512_knot(_mm512_kor(zeromask, infmask));
- @vtype@ ratio = _mm512_maskz_div_@vsub@(div_mask, smaller, larger);
- @vtype@ hypot = _mm512_sqrt_@vsub@(_mm512_fmadd_@vsub@(
- ratio, ratio, _mm512_set1_@vsub@(1.0f)));
- return _mm512_mul_@vsub@(hypot, larger);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_conjugate_@vsub@(const @vtype@ x)
-{
- /*
- * __mm512_mask_xor_ps/pd requires AVX512DQ. We cast it to __m512i and
- * use the xor_epi32/64 instruction instead. Cast is a zero latency instruction
- */
- __m512i cast_x = _mm512_cast@vsub@_si512(x);
- __m512i res = _mm512_mask_xor_@epi_vsub@(cast_x, @cmpx_img_mask@,
- cast_x, _mm512_set1_@epi_vsub@(@neg_mask@));
- return _mm512_castsi512_@vsub@(res);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
-{
- // x1 = r1, i1
- // x2 = r2, i2
- @vtype@ x3 = _mm512_permute_@vsub@(x2, @perm_@); // i2, r2
- @vtype@ x12 = _mm512_mul_@vsub@(x1, x2); // r1*r2, i1*i2
- @vtype@ x13 = _mm512_mul_@vsub@(x1, x3); // r1*i2, r2*i1
- @vtype@ outreal = avx512_hsub_@vsub@(x12); // r1*r2 - i1*i2, r1*r2 - i1*i2
- @vtype@ outimg = avx512_hadd_@vsub@(x13); // r1*i2 + i1*r2, r1*i2 + i1*r2
- return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_csquare_@vsub@(@vtype@ x)
-{
- return avx512_cmul_@vsub@(x, x);
-}
-
-/**end repeat**/
-#endif
-
-/**begin repeat
- * #ISA = FMA, AVX512F#
- * #isa = fma, avx512#
- * #vtype = __m256, __m512#
- * #vsize = 256, 512#
- * #or = or_ps, kor#
- * #vsub = , _mask#
- * #mask = __m256, __mmask16#
- * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps#
- * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
- **/
-
-#if defined @CHK@
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
-@isa@_sqrt_ps(@vtype@ x)
-{
- return _mm@vsize@_sqrt_ps(x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
-@isa@_sqrt_pd(@vtype@d x)
-{
- return _mm@vsize@_sqrt_pd(x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
-@isa@_square_ps(@vtype@ x)
-{
- return _mm@vsize@_mul_ps(x,x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
-@isa@_square_pd(@vtype@d x)
-{
- return _mm@vsize@_mul_pd(x,x);
-}
-
-#endif
-/**end repeat**/
-
-/**begin repeat
- * #TYPE = CFLOAT, CDOUBLE#
- * #type = npy_float, npy_double#
- * #num_lanes = 16, 8#
- * #vsuffix = ps, pd#
- * #epi_vsub = epi32, epi64#
- * #mask = __mmask16, __mmask8#
- * #vtype = __m512, __m512d#
- * #scale = 4, 8#
- * #vindextype = __m512i, __m256i#
- * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
- * #storemask = 0xFF, 0xF#
- * #IS_FLOAT = 1, 0#
- */
-
-/**begin repeat1
- * #func = square, conjugate#
- * #vectorf = avx512_csquare, avx512_conjugate#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_GCC_OPT_3 inline NPY_GCC_TARGET_AVX512F void
-AVX512F_@func@_@TYPE@(@type@ * op,
- @type@ * ip,
- const npy_intp array_size,
- const npy_intp steps)
-{
- npy_intp num_remaining_elements = 2*array_size;
- const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2;
-
- /*
- * Note: while generally indices are npy_intp, we ensure that our maximum index
- * will fit in an int32 as a precondition for this function via max_stride
- */
- npy_int32 index_ip1[16];
- for (npy_int32 ii = 0; ii < @num_lanes@; ii=ii+2) {
- index_ip1[ii] = ii*stride_ip1;
- index_ip1[ii+1] = ii*stride_ip1 + 1;
- }
- @vindextype@ vindex = @vindexload@((@vindextype@*)index_ip1);
- @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
- @vtype@ zeros = _mm512_setzero_@vsuffix@();
-
- while (num_remaining_elements > 0) {
- if (num_remaining_elements < @num_lanes@) {
- load_mask = avx512_get_partial_load_mask_@vsuffix@(
- num_remaining_elements, @num_lanes@);
- }
- @vtype@ x1;
- if (stride_ip1 == 1) {
- x1 = avx512_masked_load_@vsuffix@(load_mask, ip);
- }
- else {
- x1 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex, load_mask);
- }
-
- @vtype@ out = @vectorf@_@vsuffix@(x1);
-
- _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
- op += @num_lanes@;
- ip += @num_lanes@*stride_ip1;
- num_remaining_elements -= @num_lanes@;
- }
-}
-#endif
-/**end repeat1**/
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_GCC_OPT_3 inline NPY_GCC_TARGET_AVX512F void
-AVX512F_absolute_@TYPE@(@type@ * op,
- @type@ * ip,
- const npy_intp array_size,
- const npy_intp steps)
-{
- npy_intp num_remaining_elements = 2*array_size;
- const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2;
-
- /*
- * Note: while generally indices are npy_intp, we ensure that our maximum index
- * will fit in an int32 as a precondition for this function via max_stride
- */
- npy_int32 index_ip[32];
- for (npy_int32 ii = 0; ii < 2*@num_lanes@; ii=ii+2) {
- index_ip[ii] = ii*stride_ip1;
- index_ip[ii+1] = ii*stride_ip1 + 1;
- }
- @vindextype@ vindex1 = @vindexload@((@vindextype@*)index_ip);
- @vindextype@ vindex2 = @vindexload@((@vindextype@*)(index_ip+@num_lanes@));
-
- @mask@ load_mask1 = avx512_get_full_load_mask_@vsuffix@();
- @mask@ load_mask2 = avx512_get_full_load_mask_@vsuffix@();
- @mask@ store_mask = avx512_get_full_load_mask_@vsuffix@();
- @vtype@ zeros = _mm512_setzero_@vsuffix@();
-
-#if @IS_FLOAT@
- __m512i re_index = _mm512_set_epi32(30,28,26,24,22,20,18,16,14,12,10,8,6,4,2,0);
- __m512i im_index = _mm512_set_epi32(31,29,27,25,23,21,19,17,15,13,11,9,7,5,3,1);
-#else
- __m512i re_index = _mm512_set_epi64(14,12,10,8,6,4,2,0);
- __m512i im_index = _mm512_set_epi64(15,13,11,9,7,5,3,1);
-#endif
-
- while (num_remaining_elements > 0) {
- if (num_remaining_elements < @num_lanes@) {
- load_mask1 = avx512_get_partial_load_mask_@vsuffix@(
- num_remaining_elements, @num_lanes@);
- load_mask2 = 0x0000;
- store_mask = avx512_get_partial_load_mask_@vsuffix@(
- num_remaining_elements/2, @num_lanes@);
- } else if (num_remaining_elements < 2*@num_lanes@) {
- load_mask1 = avx512_get_full_load_mask_@vsuffix@();
- load_mask2 = avx512_get_partial_load_mask_@vsuffix@(
- num_remaining_elements - @num_lanes@, @num_lanes@);
- store_mask = avx512_get_partial_load_mask_@vsuffix@(
- num_remaining_elements/2, @num_lanes@);
- }
- @vtype@ x1, x2;
- if (stride_ip1 == 1) {
- x1 = avx512_masked_load_@vsuffix@(load_mask1, ip);
- x2 = avx512_masked_load_@vsuffix@(load_mask2, ip+@num_lanes@);
- }
- else {
- x1 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex1, load_mask1);
- x2 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex2, load_mask2);
- }
-
- @vtype@ out = avx512_cabsolute_@vsuffix@(x1, x2, re_index, im_index);
-
- _mm512_mask_storeu_@vsuffix@(op, store_mask, out);
- op += @num_lanes@;
- ip += 2*@num_lanes@*stride_ip1;
- num_remaining_elements -= 2*@num_lanes@;
- }
- npy_clear_floatstatus_barrier((char*)&num_remaining_elements);
-}
-
-#endif
-/**end repeat**/
-
-#undef VECTOR_SIZE_BYTES
-#endif /* NPY_HAVE_SSE2_INTRINSICS */
-#endif
-
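
Much of the deleted file is the AVX512 complex-absolute kernel. Its core trick, visible in avx512_cabsolute_@vsub@ above, is to scale by the larger of |re| and |im| so the squared ratio cannot overflow, with masks guarding the 0./0. and inf/inf corner cases and explicit inf/nan fix-ups. A minimal scalar sketch of the same logic (illustrative only; the remaining generic loop simply calls npy_hypot@c@):

#include <math.h>

static double cabs_sketch(double re, double im)
{
    re = fabs(re);
    im = fabs(im);
    /* inf + j*nan, nan + j*inf  ->  inf */
    if (isinf(re) || isinf(im)) {
        return INFINITY;
    }
    /* x + j*nan, nan + j*x  ->  nan */
    if (isnan(re) || isnan(im)) {
        return NAN;
    }
    double larger  = re > im ? re : im;
    double smaller = re > im ? im : re;
    if (larger == 0.0) {          /* avoid 0./0. */
        return 0.0;
    }
    double ratio = smaller / larger;   /* <= 1, so ratio*ratio cannot overflow */
    return larger * sqrt(1.0 + ratio * ratio);
}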