/*@targets
 ** $maxopt baseline
 ** neon asimd
 ** sse2 avx2 avx512_skx
 ** vsx2
 ** vx
 **/
#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "lowlevel_strided_loops.h"
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"

/*******************************************************************************
 ** Defining the SIMD kernels
 ******************************************************************************/
#if NPY_SIMD
/*
 * convert any bit set to boolean true so vectorized and normal operations are
 * consistent, should not be required if bool is used correctly everywhere but
 * you never know
 */
NPY_FINLINE npyv_u8 byte_to_true(npyv_u8 v)
{
    const npyv_u8 zero = npyv_zero_u8();
    const npyv_u8 truemask = npyv_setall_u8(1 == 1);
    // cmpeq(v, 0) turns 0x00 -> 0xff and non-zero -> 0x00
    npyv_u8 tmp = npyv_cvt_u8_b8(npyv_cmpeq_u8(v, zero));
    // tmp is filled with 0xff/0x00, negate and mask to boolean true
    return npyv_andc_u8(truemask, tmp);
}
/*
 * convert mask vector (0xff/0x00) to boolean true. similar to byte_to_true(),
 * but we've already got a mask and can skip negation.
 */
NPY_FINLINE npyv_u8 mask_to_true(npyv_b8 v)
{
    const npyv_u8 truemask = npyv_setall_u8(1 == 1);
    return npyv_and_u8(truemask, npyv_cvt_u8_b8(v));
}
/*
 * For logical_and, we have to be careful to handle non-bool inputs where
 * bits of each operand might not overlap. Example: a = 0x01, b = 0x80
 * Both evaluate to boolean true, however, a & b is false. Return value
 * should be consistent with byte_to_true().
 */
NPY_FINLINE npyv_u8 simd_logical_and_u8(npyv_u8 a, npyv_u8 b)
{
    const npyv_u8 zero = npyv_zero_u8();
    const npyv_u8 truemask = npyv_setall_u8(1 == 1);
    npyv_b8 ma = npyv_cmpeq_u8(a, zero);
    npyv_b8 mb = npyv_cmpeq_u8(b, zero);
    npyv_u8 r = npyv_cvt_u8_b8(npyv_or_b8(ma, mb));
    return npyv_andc_u8(truemask, r);
}
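/*
 * Illustration only (kept out of the build; the name below is hypothetical):
 * a scalar sketch of the per-lane semantics above. Any non-zero byte counts
 * as true and the result is normalized to 0/1, consistent with byte_to_true().
 */
#if 0
static unsigned char
scalar_logical_and_u8(unsigned char a, unsigned char b)
{
    /* e.g. a = 0x01, b = 0x80: a & b == 0x00, yet both are true -> 1 */
    return (a != 0) && (b != 0);
}
#endif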
/*
 * We don't really need the following, but it simplifies the templating code
 * below since it is paired with simd_logical_and_u8() above.
 */
NPY_FINLINE npyv_u8 simd_logical_or_u8(npyv_u8 a, npyv_u8 b)
{
    npyv_u8 r = npyv_or_u8(a, b);
    return byte_to_true(r);
}

/**begin repeat
 * #kind = logical_and, logical_or#
 * #and = 1, 0#
 * #scalar_op = &&, ||#
 * #intrin = and, or#
 * #reduce = min, max#
 * #scalar_cmp = ==, !=#
 * #anyall = all, any#
 */
static void
simd_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp len)
{
    #define UNROLL 16
    const int vstep = npyv_nlanes_u8;
    const int wstep = vstep * UNROLL;

    // Unrolled vectors loop
    for (; len >= wstep; len -= wstep, ip1 += wstep, ip2 += wstep, op += wstep) {
        /**begin repeat1
         * #unroll = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
         */
        #if UNROLL > @unroll@
        npyv_u8 a@unroll@ = npyv_load_u8(ip1 + vstep * @unroll@);
        npyv_u8 b@unroll@ = npyv_load_u8(ip2 + vstep * @unroll@);
        npyv_u8 r@unroll@ = simd_logical_@intrin@_u8(a@unroll@, b@unroll@);
        npyv_store_u8(op + vstep * @unroll@, r@unroll@);
        #endif
        /**end repeat1**/
    }
    #undef UNROLL

    // Single vectors loop
    for (; len >= vstep; len -= vstep, ip1 += vstep, ip2 += vstep, op += vstep) {
        npyv_u8 a = npyv_load_u8(ip1);
        npyv_u8 b = npyv_load_u8(ip2);
        npyv_u8 r = simd_logical_@intrin@_u8(a, b);
        npyv_store_u8(op, r);
    }

    // Scalar loop to finish off
    for (; len > 0; len--, ip1++, ip2++, op++) {
        *op = *ip1 @scalar_op@ *ip2;
    }
}

static void
simd_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp len)
{
    #define UNROLL 8
    const int vstep = npyv_nlanes_u8;
    const int wstep = vstep * UNROLL;

    // Unrolled vectors loop
    for (; len >= wstep; len -= wstep, ip += wstep) {
        #if defined(NPY_HAVE_SSE2)
        NPY_PREFETCH(ip + wstep, 0, 3);
        #endif
        npyv_u8 v0 = npyv_load_u8(ip + vstep * 0);
        npyv_u8 v1 = npyv_load_u8(ip + vstep * 1);
        npyv_u8 v2 = npyv_load_u8(ip + vstep * 2);
        npyv_u8 v3 = npyv_load_u8(ip + vstep * 3);
        npyv_u8 v4 = npyv_load_u8(ip + vstep * 4);
        npyv_u8 v5 = npyv_load_u8(ip + vstep * 5);
        npyv_u8 v6 = npyv_load_u8(ip + vstep * 6);
        npyv_u8 v7 = npyv_load_u8(ip + vstep * 7);

        npyv_u8 m01 = npyv_@reduce@_u8(v0, v1);
        npyv_u8 m23 = npyv_@reduce@_u8(v2, v3);
        npyv_u8 m45 = npyv_@reduce@_u8(v4, v5);
        npyv_u8 m67 = npyv_@reduce@_u8(v6, v7);

        npyv_u8 m0123 = npyv_@reduce@_u8(m01, m23);
        npyv_u8 m4567 = npyv_@reduce@_u8(m45, m67);

        npyv_u8 mv = npyv_@reduce@_u8(m0123, m4567);

        if (npyv_@anyall@_u8(mv) @scalar_cmp@ 0) {
            *op = !@and@;
            return;
        }
    }

    // Single vectors loop
    for (; len >= vstep; len -= vstep, ip += vstep) {
        npyv_u8 v0 = npyv_load_u8(ip);
        if (npyv_@anyall@_u8(v0) @scalar_cmp@ 0) {
            *op = !@and@;
            return;
        }
    }

    // Scalar loop to finish off
    for (; len > 0; --len, ++ip) {
        *op = *op @scalar_op@ *ip;
        if (*op @scalar_cmp@ 0) {
            return;
        }
    }
    #undef UNROLL
}
/**end repeat**/
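/*
 * Illustration only (kept out of the build): in the reduction above the
 * pairwise combine is `min` for logical_and and `max` for logical_or. With
 * `min`, a single zero byte anywhere in the eight vectors zeroes that lane
 * of `mv`, so the `all` test fails and the loop exits early with *op = 0;
 * with `max`, any non-zero byte survives into `mv`, so the `any` test fires
 * and the loop exits early with *op = 1. A scalar equivalent of the
 * logical_and case:
 */
#if 0
static void
scalar_reduce_logical_and(npy_bool *op, const npy_bool *ip, npy_intp len)
{
    for (npy_intp i = 0; i < len; ++i) {
        if (ip[i] == 0) {
            *op = 0;    /* mirrors the vector early exit */
            return;
        }
    }
    /* no zero found: *op keeps its incoming (accumulator) value */
}
#endif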
/**begin repeat
 * #kind = logical_not, absolute#
 * #op = ==, !=#
 * #not = 1, 0#
 */
static void
simd_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp len)
{
    #define UNROLL 16
    const int vstep = npyv_nlanes_u8;
    const int wstep = vstep * UNROLL;
    #if @not@
    const npyv_u8 zero = npyv_zero_u8();
    #endif

    // Unrolled vectors loop
    for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) {
        /**begin repeat1
         * #unroll = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
         */
        #if UNROLL > @unroll@
        npyv_u8 v@unroll@ = npyv_load_u8(ip + vstep * @unroll@);
        #if @not@
        npyv_u8 r@unroll@ = mask_to_true(npyv_cmpeq_u8(v@unroll@, zero));
        #else
        npyv_u8 r@unroll@ = byte_to_true(v@unroll@);
        #endif
        npyv_store_u8(op + vstep * @unroll@, r@unroll@);
        #endif
        /**end repeat1**/
    }
    #undef UNROLL

    // Single vectors loop
    for (; len >= vstep; len -= vstep, ip += vstep, op += vstep) {
        npyv_u8 v = npyv_load_u8(ip);
        #if @not@
        npyv_u8 r = mask_to_true(npyv_cmpeq_u8(v, zero));
        #else
        npyv_u8 r = byte_to_true(v);
        #endif
        npyv_store_u8(op, r);
    }

    // Scalar loop to finish off
    for (; len > 0; --len, ++ip, ++op) {
        *op = (*ip @op@ 0);
    }
}
/**end repeat**/
#endif // NPY_SIMD

/*******************************************************************************
 ** Defining ufunc inner functions
 ******************************************************************************/

/**begin repeat
 * # kind = logical_or, logical_and#
 */
static NPY_INLINE int
run_binary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (sizeof(npy_bool) == 1 &&
            IS_BLOCKABLE_BINARY(sizeof(npy_bool), NPY_SIMD_WIDTH)) {
        simd_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0],
                                (npy_bool*)args[1], dimensions[0]);
        return 1;
    }
#endif
    return 0;
}

static NPY_INLINE int
run_reduce_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (sizeof(npy_bool) == 1 &&
            IS_BLOCKABLE_REDUCE(sizeof(npy_bool), NPY_SIMD_WIDTH)) {
        simd_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1],
                                dimensions[0]);
        return 1;
    }
#endif
    return 0;
}
/**end repeat**/

/**begin repeat
 * #kind = logical_not, absolute#
 */
static NPY_INLINE int
run_unary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (sizeof(npy_bool) == 1 &&
            IS_BLOCKABLE_UNARY(sizeof(npy_bool), NPY_SIMD_WIDTH)) {
        simd_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]);
        return 1;
    }
#endif
    return 0;
}
/**end repeat**/

/**begin repeat
 * #kind = logical_and, logical_or#
 * #OP = &&, ||#
 * #SC = ==, !=#
 * #and = 1, 0#
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
#if NPY_SIMD
        /*
         * stick with our variant for more reliable performance, only known
         * platform which outperforms it by ~20% is an i7 with glibc 2.17
         */
        if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) {
            return;
        }
#else
        /* for now only use libc on 32-bit/non-x86 */
        if (steps[1] == 1) {
            npy_bool * op = (npy_bool *)args[0];
#if @and@
            /* np.all(), search for a zero (false) */
            if (*op) {
                *op = memchr(args[1], 0, dimensions[0]) == NULL;
            }
#else
            /*
             * np.any(), search for a non-zero (true) via comparing against
             * zero blocks, memcmp is faster than memchr on SSE4 machines
             * with glibc >= 2.12 and memchr can only check for equal 1
             */
            static const npy_bool zero[4096]; /* zero by C standard */
            npy_uintp i, n = dimensions[0];

            for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
                *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
            }
            if (!*op && n - i > 0) {
                *op = memcmp(&args[1][i], zero, n - i) != 0;
            }
#endif
            return;
        }
#endif
        else {
            BINARY_REDUCE_LOOP(npy_bool) {
                const npy_bool in2 = *(npy_bool *)ip2;
                io1 = io1 @OP@ in2;
                if (io1 @SC@ 0) {
                    break;
                }
            }
            *((npy_bool *)iop1) = io1;
        }
    }
    else {
        if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) {
            return;
        }
        else {
            BINARY_LOOP {
                const npy_bool in1 = *(npy_bool *)ip1;
                const npy_bool in2 = *(npy_bool *)ip2;
                *((npy_bool *)op1) = in1 @OP@ in2;
            }
        }
    }
}
/**end repeat**/
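/*
 * Note on the reduce path above (assumption: the standard ufunc inner-loop
 * layout, which this file does not spell out): for a reduction, args[0]
 * points at the scalar accumulator (also the output) and args[1] at the
 * array being reduced, which is why run_reduce_simd_*_BOOL passes args[0]
 * as `op` and args[1] as `ip`. The BINARY_REDUCE_LOOP fallback walks args[1]
 * with stride steps[1] and short-circuits once the accumulator reaches its
 * absorbing value (0 for logical_and, 1 for logical_or). Roughly:
 */
#if 0
/* hypothetical reference, approximate: not the actual macro expansion */
static void
reduce_logical_and_ref(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
    npy_bool io1 = *(npy_bool *)args[0];
    for (npy_intp i = 0; i < dimensions[0]; ++i) {
        const npy_bool in2 = *(npy_bool *)(args[1] + i * steps[1]);
        io1 = io1 && in2;
        if (io1 == 0) {
            break;  /* nothing later can turn the AND back on */
        }
    }
    *(npy_bool *)args[0] = io1;
}
#endif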
/**begin repeat
 * #kind = logical_not, absolute#
 * #OP = ==, !=#
 **/
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) {
        return;
    }
    else {
        UNARY_LOOP {
            npy_bool in1 = *(npy_bool *)ip1;
            *((npy_bool *)op1) = in1 @OP@ 0;
        }
    }
}
/**end repeat**/
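/*
 * BOOL_logical_not stores `in1 == 0` while BOOL_absolute stores `in1 != 0`,
 * so absolute doubles as a normalization of arbitrary byte values to
 * canonical 0/1 booleans, and logical_not is that normalization plus
 * inversion.
 */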