| author | Raghuveer Devulapalli <raghuveer.devulapalli@intel.com> | 2019-11-18 09:04:23 -0800 |
|---|---|---|
| committer | Raghuveer Devulapalli <raghuveer.devulapalli@intel.com> | 2020-02-01 20:14:38 -0800 |
| commit | 5562a8c93fe18f0a51d9051f0c25c7cf525312fe (patch) | |
| tree | 450c5befca6dc7f36430101757b61960d63643b0 | |
| parent | a0fc6a1b784bc8b2dbbbc582db1abd06596ca600 (diff) | |
| download | numpy-5562a8c93fe18f0a51d9051f0c25c7cf525312fe.tar.gz | |
ENH: Use AVX-512F for complex number arithmetic, absolute, square and conjugate
| -rw-r--r-- | numpy/core/code_generators/generate_umath.py | 13 |
| -rw-r--r-- | numpy/core/src/umath/loops.c.src | 43 |
| -rw-r--r-- | numpy/core/src/umath/loops.h.src | 29 |
| -rw-r--r-- | numpy/core/src/umath/simd.inc.src | 292 |
4 files changed, 362 insertions, 15 deletions
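
The four hunks below register complex64/complex128 (`F`/`D`) loops for add, subtract, multiply, conjugate, square and absolute with the ufunc machinery and implement them with AVX-512F intrinsics. As an orientation aid (not part of the patch), here is a minimal scalar C sketch of the arithmetic those kernels vectorize; the `cfloat_ref` struct and the `*_ref` helper names are illustrative only:

```c
#include <math.h>

/* Interleaved complex layout, as in a NumPy complex64 buffer: {re, im}. */
typedef struct { float re, im; } cfloat_ref;

/* (r1 + j*i1)*(r2 + j*i2) = (r1*r2 - i1*i2) + j*(r1*i2 + i1*r2);
 * the AVX-512 loop forms the same products lane-wise and combines them
 * with a horizontal add/subtract of adjacent lanes. */
static cfloat_ref cmul_ref(cfloat_ref a, cfloat_ref b)
{
    cfloat_ref out;
    out.re = a.re * b.re - a.im * b.im;
    out.im = a.re * b.im + a.im * b.re;
    return out;
}

/* Conjugation only flips the sign of the imaginary part. */
static cfloat_ref conj_ref(cfloat_ref a)
{
    cfloat_ref out = { a.re, -a.im };
    return out;
}

/* Overflow-safe magnitude: scale by the larger component so the ratio
 * stays in [0, 1], mirroring the min/max + ratio trick used by
 * avx512_cabsolute_ps/pd below. */
static float cabs_ref(cfloat_ref a)
{
    float re = fabsf(a.re), im = fabsf(a.im);
    float larger  = re > im ? re : im;
    float smaller = re > im ? im : re;
    if (larger == 0.0f) {
        return 0.0f;
    }
    float ratio = smaller / larger;
    return larger * sqrtf(1.0f + ratio * ratio);
}
```

Square is simply `cmul_ref(a, a)`, which is also how `avx512_csquare_ps/pd` is defined in the patch.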
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 1fd08241d..9a262e01d 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -233,6 +233,7 @@ flts = 'efdg'
 fltsO = flts + O
 fltsP = flts + P
 cmplx = 'FDG'
+cmplxvec = 'FD'
 cmplxO = cmplx + O
 cmplxP = cmplx + P
 inexact = flts + cmplx
@@ -268,7 +269,7 @@ defdict = {
     Ufunc(2, 1, Zero,
           docstrings.get('numpy.core.umath.add'),
           'PyUFunc_AdditionTypeResolver',
-          TD(notimes_or_obj, simd=[('avx2', ints)]),
+          TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]),
           [TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
            TypeDescription('m', FullTypeDescr, 'mm', 'm'),
            TypeDescription('M', FullTypeDescr, 'mM', 'M'),
@@ -279,7 +280,7 @@ defdict = {
     Ufunc(2, 1, None, # Zero is only a unit to the right, not the left
           docstrings.get('numpy.core.umath.subtract'),
           'PyUFunc_SubtractionTypeResolver',
-          TD(ints + inexact, simd=[('avx2', ints)]),
+          TD(ints + inexact, simd=[('avx512f', cmplxvec),('avx2', ints)]),
           [TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
            TypeDescription('m', FullTypeDescr, 'mm', 'm'),
            TypeDescription('M', FullTypeDescr, 'MM', 'm'),
@@ -290,7 +291,7 @@ defdict = {
     Ufunc(2, 1, One,
           docstrings.get('numpy.core.umath.multiply'),
           'PyUFunc_MultiplicationTypeResolver',
-          TD(notimes_or_obj, simd=[('avx2', ints)]),
+          TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]),
           [TypeDescription('m', FullTypeDescr, 'mq', 'm'),
            TypeDescription('m', FullTypeDescr, 'qm', 'm'),
            TypeDescription('m', FullTypeDescr, 'md', 'm'),
@@ -325,7 +326,7 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.conjugate'),
           None,
-          TD(ints+flts+cmplx, simd=[('avx2', ints)]),
+          TD(ints+flts+cmplx, simd=[('avx2', ints), ('avx512f', cmplxvec)]),
           TD(P, f='conjugate'),
           ),
 'fmod':
@@ -340,7 +341,7 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.square'),
           None,
-          TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'fd')]),
+          TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'FDfd')]),
           TD(O, f='Py_square'),
           ),
 'reciprocal':
@@ -378,7 +379,7 @@ defdict = {
           docstrings.get('numpy.core.umath.absolute'),
           'PyUFunc_AbsoluteTypeResolver',
           TD(bints+flts+timedeltaonly, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
-          TD(cmplx, out=('f', 'd', 'g')),
+          TD(cmplx, simd=[('avx512f', cmplxvec)], out=('f', 'd', 'g')),
           TD(O, f='PyNumber_Absolute'),
           ),
 '_arg':
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index b310d73ff..9b43824cb 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -2509,6 +2509,7 @@ HALF_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps,
  * #ftype = npy_float, npy_double, npy_longdouble#
  * #c = f, , l#
  * #C = F, , L#
+ * #SIMD = 1, 1, 0#
  */

 /* similar to pairwise sum of real floats */
@@ -2584,6 +2585,7 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n,
     }
 }

+
 /**begin repeat1
  * arithmetic
  * #kind = add, subtract#
@@ -2662,6 +2664,32 @@ NPY_NO_EXPORT void
     }
 }

+#if @SIMD@
+NPY_NO_EXPORT void
+@TYPE@_add_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func)
+{
+    if (IS_BINARY_REDUCE) {
+        @TYPE@_add(args, dimensions, steps, func);
+    }
+    else if (!run_binary_avx512f_add_@TYPE@(args, dimensions, steps)) {
+        @TYPE@_add(args, dimensions, steps, func);
+    }
+}
+
+/**begin repeat1
+ * arithmetic
+ * #kind = subtract, multiply#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@kind@_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func)
+{
+    if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
+        @TYPE@_@kind@(args, dimensions, steps, func);
+    }
+}
+/**end repeat1**/
+#endif
+
 NPY_NO_EXPORT void
 @TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
@@ -2819,6 +2847,21 @@ NPY_NO_EXPORT void
     }
 }

+#if @SIMD@
+/**begin repeat1
+ * arithmetic
+ * #kind = conjugate, square, absolute#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@kind@_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func)
+{
+    if (!run_unary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
+        @TYPE@_@kind@(args, dimensions, steps, func);
+    }
+}
+/**end repeat1**/
+#endif
+
 NPY_NO_EXPORT void
 @TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 6c89627ca..e9d0b4c62 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -356,20 +356,27 @@ NPY_NO_EXPORT void
  * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
  * #c = f, , l#
  * #C = F, , L#
+ * #IFSIMD = 1, 1, 0#
  */

 /**begin repeat1
+ * #isa = , _avx512f#
+ */
+
+/**begin repeat2
  * arithmetic
  * #kind = add, subtract#
  * #OP = +, -#
  */
+
 NPY_NO_EXPORT void
-C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+C@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));

-/**end repeat1**/
+/**end repeat2**/

 NPY_NO_EXPORT void
-C@TYPE@_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+C@TYPE@_multiply@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat1**/

 NPY_NO_EXPORT void
 C@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
@@ -409,19 +416,24 @@ C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, v
 /**end repeat1**/

 NPY_NO_EXPORT void
-C@TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-
-NPY_NO_EXPORT void
 C@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));

 NPY_NO_EXPORT void
 C@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));

+/**begin repeat1
+ * #isa = , _avx512f#
+ */
+
 NPY_NO_EXPORT void
-C@TYPE@_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+C@TYPE@_conjugate@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));

 NPY_NO_EXPORT void
-C@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+C@TYPE@_absolute@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+C@TYPE@_square@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data));
+/**end repeat1**/

 NPY_NO_EXPORT void
 C@TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
@@ -444,7 +456,6 @@ C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, v
 NPY_NO_EXPORT void
 C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 /**end repeat1**/
-
 #define C@TYPE@_true_divide C@TYPE@_divide

 /**end repeat**/
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index cd485034e..e3c0ee3cc 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -55,6 +55,13 @@ abs_ptrdiff(char *a, char *b)
     return (a > b) ? (a - b) : (b - a);
 }

+#define IS_BINARY_STRIDE_ONE(esize, vsize) \
+    ((steps[0] == esize) && \
+     (steps[1] == esize) && \
+     (steps[2] == esize) && \
+     (abs_ptrdiff(args[2], args[0]) >= vsize) && \
+     (abs_ptrdiff(args[2], args[1]) >= vsize))
+
 /*
  * stride is equal to element size and input and destination are equal or
  * don't overlap within one register. The check of the steps against
@@ -158,6 +165,71 @@ abs_ptrdiff(char *a, char *b)

 /*
  *****************************************************************************
+ **                          CMPLX DISPATCHERS
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * #TYPE = CFLOAT, CDOUBLE#
+ * #type= npy_float, npy_double#
+ * #esize = 8, 16#
+ */
+
+/**begin repeat1
+ * #func = add, subtract, multiply#
+ */
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE NPY_GCC_TARGET_AVX512F void
+AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps);
+#endif
+
+static NPY_INLINE int
+run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
+{
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+    if (IS_BINARY_STRIDE_ONE(@esize@, 64)) {
+        AVX512F_@func@_@TYPE@(args, dimensions, steps);
+        return 1;
+    }
+    else
+        return 0;
+#endif
+    return 0;
+}
+
+/**end repeat1**/
+
+/**begin repeat1
+ * #func = square, absolute, conjugate#
+ * #outsize = 1, 2, 1#
+ * #max_stride = 2, 8, 8#
+ */
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE NPY_GCC_TARGET_AVX512F void
+AVX512F_@func@_@TYPE@(@type@*, @type@*, const npy_intp n, const npy_intp stride);
+#endif
+
+static NPY_INLINE int
+run_unary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
+{
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+    if ((IS_OUTPUT_BLOCKABLE_UNARY((npy_uint)(@esize@/@outsize@), 64)) && (labs(steps[0]) < 2*@max_stride@*@esize@)) {
+        AVX512F_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
+        return 1;
+    }
+    else
+        return 0;
+#endif
+    return 0;
+}
+
+/**end repeat1**/
+/**end repeat**/
+
+/*
+ *****************************************************************************
  **                          FLOAT DISPATCHERS
  *****************************************************************************
  */
@@ -1591,9 +1663,17 @@ avx512_scalef_ps(__m512 poly, __m512 quadrant)
 }
 /**begin repeat
  * #vsub = ps, pd#
+ * #type= npy_float, npy_double#
  * #epi_vsub = epi32, epi64#
  * #vtype = __m512, __m512d#
+ * #mask = __mmask16, __mmask8#
  * #and_const = 0x7fffffff, 0x7fffffffffffffffLL#
+ * #neg_mask = 0x80000000, 0x8000000000000000#
+ * #perm_ = 0xb1, 0x55#
+ * #cmpx_img_mask = 0xAAAA, 0xAA#
+ * #cmpx_re_mask = 0x5555, 0x55#
+ * #INF = NPY_INFINITYF, NPY_INFINITY#
+ * #NAN = NPY_NANF, NPY_NAN#
  */
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
 avx512_abs_@vsub@(@vtype@ x)
@@ -1631,6 +1711,92 @@ avx512_trunc_@vsub@(@vtype@ x)
 {
     return _mm512_roundscale_@vsub@(x, 0x0B);
 }
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_hadd_@vsub@(const @vtype@ x)
+{
+    return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_hsub_@vsub@(const @vtype@ x)
+{
+    return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_cabsolute_@vsub@(const @vtype@ x)
+{
+    @vtype@ inf = _mm512_set1_@vsub@(@INF@);
+    @vtype@ nan = _mm512_set1_@vsub@(@NAN@);
+    @vtype@ x_abs = avx512_abs_@vsub@(x);
+    @vtype@ re = _mm512_maskz_compress_@vsub@(@cmpx_re_mask@, x_abs);
+    @vtype@ im = _mm512_maskz_compress_@vsub@(@cmpx_img_mask@, x_abs);
+    /*
+     * If real or imag = INF, then convert it to inf + j*inf
+     * Handles: inf + j*nan, nan + j*inf
+     */
+    @mask@ re_infmask = _mm512_cmp_@vsub@_mask(re, inf, _CMP_EQ_OQ);
+    @mask@ im_infmask = _mm512_cmp_@vsub@_mask(im, inf, _CMP_EQ_OQ);
+    im = _mm512_mask_mov_@vsub@(im, re_infmask, inf);
+    re = _mm512_mask_mov_@vsub@(re, im_infmask, inf);
+
+    /*
+     * If real or imag = NAN, then convert it to nan + j*nan
+     * Handles: x + j*nan, nan + j*x
+     */
+    @mask@ re_nanmask = _mm512_cmp_@vsub@_mask(re, re, _CMP_NEQ_UQ);
+    @mask@ im_nanmask = _mm512_cmp_@vsub@_mask(im, im, _CMP_NEQ_UQ);
+    im = _mm512_mask_mov_@vsub@(im, re_nanmask, nan);
+    re = _mm512_mask_mov_@vsub@(re, im_nanmask, nan);
+
+    @vtype@ larger = _mm512_max_@vsub@(re, im);
+    @vtype@ smaller = _mm512_min_@vsub@(im, re);
+
+    /*
+     * Calculate div_mask to prevent 0./0. and inf/inf operations in div
+     */
+    @mask@ zeromask = _mm512_cmp_@vsub@_mask(larger, _mm512_setzero_@vsub@(), _CMP_EQ_OQ);
+    @mask@ infmask = _mm512_cmp_@vsub@_mask(smaller, inf, _CMP_EQ_OQ);
+    @mask@ div_mask = _mm512_knot(_mm512_kor(zeromask, infmask));
+    @vtype@ ratio = _mm512_maskz_div_@vsub@(div_mask, smaller, larger);
+    @vtype@ hypot = _mm512_sqrt_@vsub@(_mm512_fmadd_@vsub@(
+                                        ratio, ratio, _mm512_set1_@vsub@(1.0f)));
+    return _mm512_mul_@vsub@(hypot, larger);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_conjugate_@vsub@(const @vtype@ x)
+{
+    /*
+     * __mm512_mask_xor_ps/pd requires AVX512DQ. We cast it to __m512i and
+     * use the xor_epi32/64 uinstruction instead. Cast is a zero latency instruction
+     */
+    __m512i cast_x = _mm512_cast@vsub@_si512(x);
+    __m512i res = _mm512_mask_xor_@epi_vsub@(cast_x, @cmpx_img_mask@,
+                                             cast_x, _mm512_set1_@epi_vsub@(@neg_mask@));
+    return _mm512_castsi512_@vsub@(res);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
+{
+    // x1 = r1, i1
+    // x2 = r2, i2
+    @vtype@ x3 = _mm512_permute_@vsub@(x2, @perm_@);   // i2, r2
+    @vtype@ x12 = _mm512_mul_@vsub@(x1, x2);           // r1*r2, i1*i2
+    @vtype@ x13 = _mm512_mul_@vsub@(x1, x3);           // r1*i2, r2*i1
+    @vtype@ outreal = avx512_hsub_@vsub@(x12);         // r1*r2 - i1*i2, r1*r2 - i1*i2
+    @vtype@ outimg = avx512_hadd_@vsub@(x13);          // r1*i2 + i1*r2, r1*i2 + i1*r2
+    return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_csquare_@vsub@(@vtype@ x)
+{
+    return avx512_cmul_@vsub@(x, x);
+}
+
 /**end repeat**/
 #endif
@@ -2450,6 +2616,132 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
 #endif
 /**end repeat**/

+/**begin repeat
+ * #TYPE = CFLOAT, CDOUBLE#
+ * #type = npy_float, npy_double#
+ * #num_lanes = 16, 8#
+ * #vsuffix = ps, pd#
+ * #mask = __mmask16, __mmask8#
+ * #vtype = __m512, __m512d#
+ * #scale = 4, 8#
+ * #vindextype = __m512i, __m256i#
+ * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
+ * #storemask = 0xFF, 0xF#
+ */
+
+/**begin repeat1
+ * #func = add, subtract, multiply#
+ * #vectorf = _mm512_add, _mm512_sub, avx512_cmul#
+ */
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void
+AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
+{
+    const npy_intp array_size = dimensions[0];
+    npy_intp num_remaining_elements = 2*array_size;
+    @type@* ip1 = (@type@*) args[0];
+    @type@* ip2 = (@type@*) args[1];
+    @type@* op = (@type@*) args[2];
+
+    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
+
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < @num_lanes@) {
+            load_mask = avx512_get_partial_load_mask_@vsuffix@(
+                    num_remaining_elements, @num_lanes@);
+        }
+        @vtype@ x1, x2;
+        x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
+        x2 = avx512_masked_load_@vsuffix@(load_mask, ip2);
+
+        @vtype@ out = @vectorf@_@vsuffix@(x1, x2);
+
+        _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
+
+        ip1 += @num_lanes@;
+        ip2 += @num_lanes@;
+        op += @num_lanes@;
+        num_remaining_elements -= @num_lanes@;
+    }
+}
+#endif
+/**end repeat1**/
+
+/**begin repeat1
+ * #func = absolute, square, conjugate#
+ * #vectorf = avx512_cabsolute, avx512_csquare, avx512_conjugate#
+ * #is_out_real = 1, 0, 0#
+ */
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void
+AVX512F_@func@_@TYPE@(@type@ * op,
+                      @type@ * ip,
+                      const npy_intp array_size,
+                      const npy_intp steps)
+{
+    npy_intp num_remaining_elements = 2*array_size;
+    const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2;
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via max_stride
+     */
+    npy_int32 index_ip1[16];
+    for (npy_int32 ii = 0; ii < @num_lanes@; ii=ii+2) {
+        index_ip1[ii] = ii*stride_ip1;
+        index_ip1[ii+1] = ii*stride_ip1 + 1;
+    }
+    @vindextype@ vindex = @vindexload@((@vindextype@*)index_ip1);
+    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
+#if @is_out_real@
+    @mask@ store_mask = _mm512_kand(avx512_get_full_load_mask_@vsuffix@(), @storemask@);
+#endif
+    @vtype@ zeros = _mm512_setzero_@vsuffix@();
+
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < @num_lanes@) {
+            load_mask = avx512_get_partial_load_mask_@vsuffix@(
+                    num_remaining_elements, @num_lanes@);
+#if @is_out_real@
+            store_mask = avx512_get_partial_load_mask_@vsuffix@(
+                    num_remaining_elements/2, @num_lanes@);
+#endif
+        }
+        @vtype@ x1;
+        if (stride_ip1 == 1) {
+            x1 = avx512_masked_load_@vsuffix@(load_mask, ip);
+        }
+        else {
+            x1 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex, load_mask);
+        }
+
+        @vtype@ out = @vectorf@_@vsuffix@(x1);
+
+#if @is_out_real@
+        _mm512_mask_storeu_@vsuffix@(op, store_mask, out);
+        op += @num_lanes@/2;
+#else
+        _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
+        op += @num_lanes@;
+#endif
+
+        ip += @num_lanes@*stride_ip1;
+        num_remaining_elements -= @num_lanes@;
+    }
+#if @is_out_real@
+    /*
+     * Ignore invalid exception for cabsolute generated by vmaxps/vmaxpd
+     * and vminps/vminpd instructions
+     */
+    npy_clear_floatstatus_barrier((char*)op);
+#endif
+}
+#endif
+/**end repeat1**/
+/**end repeat**/
+
 /*
  *****************************************************************************
  **                             BOOL LOOPS
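
A note on the conjugate kernel above: because `_mm512_mask_xor_ps/pd` requires AVX512DQ, the patch routes the operation through the integer domain (`_mm512_mask_xor_epi32/epi64`) and flips only the sign bit of the imaginary lanes, selected by the `0xAAAA`/`0xAA` lane masks. The following scalar C sketch illustrates the same bit trick under the interleaved `{re, im}` layout; the helper names are invented for this illustration and do not appear in the patch:

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Flip IEEE-754 bit 31 to negate a float without a floating-point
 * operation; this is the scalar analogue of XOR-ing with @neg_mask@
 * (0x80000000 for single precision). It raises no FP exceptions and
 * leaves NaN payloads untouched. */
static float flip_sign(float x)
{
    uint32_t bits;
    memcpy(&bits, &x, sizeof bits);   /* well-defined type punning */
    bits ^= UINT32_C(0x80000000);
    memcpy(&x, &bits, sizeof x);
    return x;
}

/* For an interleaved {re0, im0, re1, im1, ...} buffer, conjugation touches
 * only the odd (imaginary) slots -- the role played by @cmpx_img_mask@
 * (0xAAAA over 16 float lanes) in the masked vector XOR. */
static void conjugate_buffer(float *buf, size_t n_complex)
{
    for (size_t i = 0; i < n_complex; i++) {
        buf[2*i + 1] = flip_sign(buf[2*i + 1]);
    }
}
```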