/*@targets
 ** $maxopt baseline
 ** neon asimd
 ** sse2 avx2 avx512_skx
 ** vsx2
 ** vx
 **/
#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "lowlevel_strided_loops.h"
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"

/*******************************************************************************
 ** Defining the SIMD kernels
 ******************************************************************************/
#if NPY_SIMD
/*
 * convert any bit set to boolean true so vectorized and normal operations are
 * consistent, should not be required if bool is used correctly everywhere but
 * you never know
 */
NPY_FINLINE npyv_u8 byte_to_true(npyv_u8 v)
{
    const npyv_u8 zero = npyv_zero_u8();
    const npyv_u8 truemask = npyv_setall_u8(1 == 1);
    // cmpeq(v, 0) turns 0x00 -> 0xff and non-zero -> 0x00
    npyv_u8 tmp = npyv_cvt_u8_b8(npyv_cmpeq_u8(v, zero));
    // tmp is filled with 0xff/0x00, negate and mask to boolean true
    return npyv_andc_u8(truemask, tmp);
}
/*
 * convert mask vector (0xff/0x00) to boolean true. similar to byte_to_true(),
 * but we've already got a mask and can skip negation.
 */
NPY_FINLINE npyv_u8 mask_to_true(npyv_b8 v)
{
    const npyv_u8 truemask = npyv_setall_u8(1 == 1);
    return npyv_and_u8(truemask, npyv_cvt_u8_b8(v));
}
/*
 * For logical_and, we have to be careful to handle non-bool inputs where
 * bits of each operand might not overlap. Example: a = 0x01, b = 0x80
 * Both evaluate to boolean true, however, a & b is false. Return value
 * should be consistent with byte_to_true().
 */
NPY_FINLINE npyv_u8 simd_logical_and_u8(npyv_u8 a, npyv_u8 b)
{
    const npyv_u8 zero = npyv_zero_u8();
    const npyv_u8 truemask = npyv_setall_u8(1 == 1);
    npyv_b8 ma = npyv_cmpeq_u8(a, zero);
    npyv_b8 mb = npyv_cmpeq_u8(b, zero);
    npyv_u8 r = npyv_cvt_u8_b8(npyv_or_b8(ma, mb));
    return npyv_andc_u8(truemask, r);
}
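/*
 * Illustration only (kept out of the build; the name below is hypothetical):
 * a scalar sketch of the per-lane semantics above. Any non-zero byte counts
 * as true and the result is normalized to 0/1, consistent with byte_to_true().
 */
#if 0
static unsigned char
scalar_logical_and_u8(unsigned char a, unsigned char b)
{
    /* e.g. a = 0x01, b = 0x80: a & b == 0x00, yet both are true -> 1 */
    return (a != 0) && (b != 0);
}
#endif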
/*
 * We don't really need the following, but it simplifies the templating code
 * below since it is paired with simd_logical_and_u8() above.
 */
NPY_FINLINE npyv_u8 simd_logical_or_u8(npyv_u8 a, npyv_u8 b)
{
    npyv_u8 r = npyv_or_u8(a, b);
    return byte_to_true(r);
}

/**begin repeat
 * #kind = logical_and, logical_or#
 * #and = 1, 0#
 * #scalar_op = &&, ||#
 * #intrin = and, or#
 * #reduce = min, max#
 * #scalar_cmp = ==, !=#
 * #anyall = all, any#
 */
static void
simd_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp len)
{
    #define UNROLL 16
    const int vstep = npyv_nlanes_u8;
    const int wstep = vstep * UNROLL;

    // Unrolled vectors loop
    for (; len >= wstep; len -= wstep, ip1 += wstep, ip2 += wstep, op += wstep) {
        /**begin repeat1
         * #unroll = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
         */
        #if UNROLL > @unroll@
        npyv_u8 a@unroll@ = npyv_load_u8(ip1 + vstep * @unroll@);
        npyv_u8 b@unroll@ = npyv_load_u8(ip2 + vstep * @unroll@);
        npyv_u8 r@unroll@ = simd_logical_@intrin@_u8(a@unroll@, b@unroll@);
        npyv_store_u8(op + vstep * @unroll@, r@unroll@);
        #endif
        /**end repeat1**/
    }
    #undef UNROLL

    // Single vectors loop
    for (; len >= vstep; len -= vstep, ip1 += vstep, ip2 += vstep, op += vstep) {
        npyv_u8 a = npyv_load_u8(ip1);
        npyv_u8 b = npyv_load_u8(ip2);
        npyv_u8 r = simd_logical_@intrin@_u8(a, b);
        npyv_store_u8(op, r);
    }

    // Scalar loop to finish off
    for (; len > 0; len--, ip1++, ip2++, op++) {
        *op = *ip1 @scalar_op@ *ip2;
    }
}

static void
simd_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp len)
{
    #define UNROLL 8
    const int vstep = npyv_nlanes_u8;
    const int wstep = vstep * UNROLL;

    // Unrolled vectors loop
    for (; len >= wstep; len -= wstep, ip += wstep) {
        #if defined(NPY_HAVE_SSE2)
        NPY_PREFETCH(ip + wstep, 0, 3);
        #endif
        npyv_u8 v0 = npyv_load_u8(ip + vstep * 0);
        npyv_u8 v1 = npyv_load_u8(ip + vstep * 1);
        npyv_u8 v2 = npyv_load_u8(ip + vstep * 2);
        npyv_u8 v3 = npyv_load_u8(ip + vstep * 3);
        npyv_u8 v4 = npyv_load_u8(ip + vstep * 4);
        npyv_u8 v5 = npyv_load_u8(ip + vstep * 5);
        npyv_u8 v6 = npyv_load_u8(ip + vstep * 6);
        npyv_u8 v7 = npyv_load_u8(ip + vstep * 7);

        npyv_u8 m01 = npyv_@reduce@_u8(v0, v1);
        npyv_u8 m23 = npyv_@reduce@_u8(v2, v3);
        npyv_u8 m45 = npyv_@reduce@_u8(v4, v5);
        npyv_u8 m67 = npyv_@reduce@_u8(v6, v7);

        npyv_u8 m0123 = npyv_@reduce@_u8(m01, m23);
        npyv_u8 m4567 = npyv_@reduce@_u8(m45, m67);

        npyv_u8 mv = npyv_@reduce@_u8(m0123, m4567);

        if (npyv_@anyall@_u8(mv) @scalar_cmp@ 0) {
            *op = !@and@;
            return;
        }
    }

    // Single vectors loop
    for (; len >= vstep; len -= vstep, ip += vstep) {
        npyv_u8 v0 = npyv_load_u8(ip);
        if (npyv_@anyall@_u8(v0) @scalar_cmp@ 0) {
            *op = !@and@;
            return;
        }
    }

    // Scalar loop to finish off
    for (; len > 0; --len, ++ip) {
        *op = *op @scalar_op@ *ip;
        if (*op @scalar_cmp@ 0) {
            return;
        }
    }
    #undef UNROLL
}
/**end repeat**/
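/*
 * Illustration only (kept out of the build): in the reduction above the
 * pairwise combine is `min` for logical_and and `max` for logical_or. With
 * `min`, a single zero byte anywhere in the eight vectors zeroes that lane
 * of `mv`, so the `all` test fails and the loop exits early with *op = 0;
 * with `max`, any non-zero byte survives into `mv`, so the `any` test fires
 * and the loop exits early with *op = 1. A scalar equivalent of the
 * logical_and case:
 */
#if 0
static void
scalar_reduce_logical_and(npy_bool *op, const npy_bool *ip, npy_intp len)
{
    for (npy_intp i = 0; i < len; ++i) {
        if (ip[i] == 0) {
            *op = 0;    /* mirrors the vector early exit */
            return;
        }
    }
    /* no zero found: *op keeps its incoming (accumulator) value */
}
#endif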
/**begin repeat
 * #kind = logical_not, absolute#
 * #op = ==, !=#
 * #not = 1, 0#
 */
static void
simd_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp len)
{
    #define UNROLL 16
    const int vstep = npyv_nlanes_u8;
    const int wstep = vstep * UNROLL;
    #if @not@
    const npyv_u8 zero = npyv_zero_u8();
    #endif

    // Unrolled vectors loop
    for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) {
        /**begin repeat1
         * #unroll = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
         */
        #if UNROLL > @unroll@
        npyv_u8 v@unroll@ = npyv_load_u8(ip + vstep * @unroll@);
        #if @not@
        npyv_u8 r@unroll@ = mask_to_true(npyv_cmpeq_u8(v@unroll@, zero));
        #else
        npyv_u8 r@unroll@ = byte_to_true(v@unroll@);
        #endif
        npyv_store_u8(op + vstep * @unroll@, r@unroll@);
        #endif
        /**end repeat1**/
    }
    #undef UNROLL

    // Single vectors loop
    for (; len >= vstep; len -= vstep, ip += vstep, op += vstep) {
        npyv_u8 v = npyv_load_u8(ip);
        #if @not@
        npyv_u8 r = mask_to_true(npyv_cmpeq_u8(v, zero));
        #else
        npyv_u8 r = byte_to_true(v);
        #endif
        npyv_store_u8(op, r);
    }

    // Scalar loop to finish off
    for (; len > 0; --len, ++ip, ++op) {
        *op = (*ip @op@ 0);
    }
}
/**end repeat**/
#endif // NPY_SIMD

/*******************************************************************************
 ** Defining ufunc inner functions
 ******************************************************************************/

/**begin repeat
 * # kind = logical_or, logical_and#
 */
static NPY_INLINE int
run_binary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (sizeof(npy_bool) == 1 &&
            IS_BLOCKABLE_BINARY(sizeof(npy_bool), NPY_SIMD_WIDTH)) {
        simd_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0],
                                (npy_bool*)args[1], dimensions[0]);
        return 1;
    }
#endif
    return 0;
}

static NPY_INLINE int
run_reduce_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (sizeof(npy_bool) == 1 &&
            IS_BLOCKABLE_REDUCE(sizeof(npy_bool), NPY_SIMD_WIDTH)) {
        simd_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1],
                                dimensions[0]);
        return 1;
    }
#endif
    return 0;
}
/**end repeat**/

/**begin repeat
 * #kind = logical_not, absolute#
 */
static NPY_INLINE int
run_unary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (sizeof(npy_bool) == 1 &&
            IS_BLOCKABLE_UNARY(sizeof(npy_bool), NPY_SIMD_WIDTH)) {
        simd_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]);
        return 1;
    }
#endif
    return 0;
}
/**end repeat**/

/**begin repeat
 * #kind = logical_and, logical_or#
 * #OP = &&, ||#
 * #SC = ==, !=#
 * #and = 1, 0#
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
#if NPY_SIMD
        /*
         * stick with our variant for more reliable performance, only known
         * platform which outperforms it by ~20% is an i7 with glibc 2.17
         */
        if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) {
            return;
        }
#else
        /* for now only use libc on 32-bit/non-x86 */
        if (steps[1] == 1) {
            npy_bool * op = (npy_bool *)args[0];
#if @and@
            /* np.all(), search for a zero (false) */
            if (*op) {
                *op = memchr(args[1], 0, dimensions[0]) == NULL;
            }
#else
            /*
             * np.any(), search for a non-zero (true) via comparing against
             * zero blocks, memcmp is faster than memchr on SSE4 machines
             * with glibc >= 2.12 and memchr can only check for equal 1
             */
            static const npy_bool zero[4096]; /* zero by C standard */
            npy_uintp i, n = dimensions[0];

            for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
                *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
            }
            if (!*op && n - i > 0) {
                *op = memcmp(&args[1][i], zero, n - i) != 0;
            }
#endif
            return;
        }
#endif
        else {
            BINARY_REDUCE_LOOP(npy_bool) {
                const npy_bool in2 = *(npy_bool *)ip2;
                io1 = io1 @OP@ in2;
                if (io1 @SC@ 0) {
                    break;
                }
            }
            *((npy_bool *)iop1) = io1;
        }
    }
    else {
        if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) {
            return;
        }
        else {
            BINARY_LOOP {
                const npy_bool in1 = *(npy_bool *)ip1;
                const npy_bool in2 = *(npy_bool *)ip2;
                *((npy_bool *)op1) = in1 @OP@ in2;
            }
        }
    }
}
/**end repeat**/
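/*
 * Note on the reduce path above (assumption: the standard ufunc inner-loop
 * layout, which this file does not spell out): for a reduction, args[0]
 * points at the scalar accumulator (also the output) and args[1] at the
 * array being reduced, which is why run_reduce_simd_*_BOOL passes args[0]
 * as `op` and args[1] as `ip`. The BINARY_REDUCE_LOOP fallback walks args[1]
 * with stride steps[1] and short-circuits once the accumulator reaches its
 * absorbing value (0 for logical_and, 1 for logical_or). Roughly:
 */
#if 0
/* hypothetical reference, approximate: not the actual macro expansion */
static void
reduce_logical_and_ref(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
    npy_bool io1 = *(npy_bool *)args[0];
    for (npy_intp i = 0; i < dimensions[0]; ++i) {
        const npy_bool in2 = *(npy_bool *)(args[1] + i * steps[1]);
        io1 = io1 && in2;
        if (io1 == 0) {
            break;  /* nothing later can turn the AND back on */
        }
    }
    *(npy_bool *)args[0] = io1;
}
#endif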
/**begin repeat
 * #kind = logical_not, absolute#
 * #OP = ==, !=#
 **/
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) {
        return;
    }
    else {
        UNARY_LOOP {
            npy_bool in1 = *(npy_bool *)ip1;
            *((npy_bool *)op1) = in1 @OP@ 0;
        }
    }
}
/**end repeat**/
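/*
 * BOOL_logical_not stores `in1 == 0` while BOOL_absolute stores `in1 != 0`,
 * so absolute doubles as a normalization of arbitrary byte values to
 * canonical 0/1 booleans, and logical_not is that normalization plus
 * inversion.
 */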