author     Charles Harris <charlesr.harris@gmail.com>   2020-12-19 13:58:37 -0700
committer  GitHub <noreply@github.com>                  2020-12-19 13:58:37 -0700
commit     f7bc512184a1de3992b0d154e6244a7a0d9a574f (patch)
tree       33bafb4dec61bd525493be4b9c908c75638275db
parent     811f9eea3b105919846760a6eca4edd22cc11c1d (diff)
parent     81bb563172c84eb1eba92b05fa99d902d28af13b (diff)
download   numpy-f7bc512184a1de3992b0d154e6244a7a0d9a574f.tar.gz
Merge pull request #17985 from seiko2plus/ditch_simd_arithmetic
ENH, SIMD: Ditching the old CPU dispatcher(Arithmetic)
-rw-r--r--  .gitignore                                           |   2
-rw-r--r--  numpy/core/code_generators/generate_umath.py         |  84
-rw-r--r--  numpy/core/setup.py                                  |   2
-rw-r--r--  numpy/core/src/umath/fast_loop_macros.h              | 112
-rw-r--r--  numpy/core/src/umath/loops.c.src                     | 258
-rw-r--r--  numpy/core/src/umath/loops.h.src                     |  53
-rw-r--r--  numpy/core/src/umath/loops_arithm_fp.dispatch.c.src  | 777
-rw-r--r--  numpy/core/src/umath/loops_utils.h                   |  42
-rw-r--r--  numpy/core/src/umath/loops_utils.h.src               | 213
-rw-r--r--  numpy/core/src/umath/simd.inc.src                    | 669

10 files changed, 1226 insertions, 986 deletions
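The core of this change is how generate_umath.py wires dispatched inner loops: type descriptions now carry a dispatch-able source name (and optionally a cfunc_alias), and the generator groups all loops per source so each ".dispatch.h" header is included once and every loop slot is bound through NPY_CPU_DISPATCH_CALL_XB. A minimal sketch of the C this emits for the float add loop, assuming an illustrative slot index of 2 (not taken from the real generated table):

/* Sketch of the registration code emitted by generate_umath.py after this
 * change; the slot index is illustrative. */
#ifndef NPY_DISABLE_OPTIMIZATION
    /* generated by the CPU dispatcher for loops_arithm_fp.dispatch.c.src */
    #include "loops_arithm_fp.dispatch.h"
#endif
/* bind the best available build (baseline/SSE2/AVX2/AVX512F) of FLOAT_add
 * into the ufunc's inner-loop table at module initialization */
NPY_CPU_DISPATCH_CALL_XB(add_functions[2] = FLOAT_add);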
diff --git a/.gitignore b/.gitignore index b28f0d3c8..b0fa037a6 100644 --- a/.gitignore +++ b/.gitignore @@ -170,6 +170,7 @@ numpy/core/src/umath/simd.inc numpy/core/src/umath/struct_ufunc_test.c numpy/core/src/umath/test_rational.c numpy/core/src/umath/umath_tests.c +numpy/core/src/umath/loops_utils.h numpy/distutils/__config__.py numpy/linalg/umath_linalg.c doc/source/**/generated/ @@ -216,3 +217,4 @@ numpy/core/src/_simd/_simd_data.inc numpy/core/src/_simd/_simd_inc.h # umath module numpy/core/src/umath/loops_unary_fp.dispatch.c +numpy/core/src/umath/loops_arithm_fp.dispatch.c diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index cb1147b93..4e9a2cfec 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -45,14 +45,20 @@ class TypeDescription: astype : dict or None, optional If astype['x'] is 'y', uses PyUFunc_x_x_As_y_y/PyUFunc_xx_x_As_yy_y instead of PyUFunc_x_x/PyUFunc_xx_x. + cfunc_alias : str or none, optional + Appended to inner loop C function name, e.g., FLOAT_{cfunc_alias}. See make_arrays. + NOTE: it doesn't support 'astype' simd: list Available SIMD ufunc loops, dispatched at runtime in specified order Currently only supported for simples types (see make_arrays) - dispatch: list - Available SIMD ufunc loops, dispatched at runtime in specified order - Currently only supported for simples types (see make_arrays) + dispatch: str or None, optional + Dispatch-able source name without its extension '.dispatch.c' that + contains the definition of ufunc, dispatched at runtime depending on the + specified targets of the dispatch-able source. + NOTE: it doesn't support 'astype' """ - def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None, dispatch=None): + def __init__(self, type, f=None, in_=None, out=None, astype=None, cfunc_alias=None, + simd=None, dispatch=None): self.type = type self.func_data = f if astype is None: @@ -64,6 +70,7 @@ class TypeDescription: if out is not None: out = out.replace('P', type) self.out = out + self.cfunc_alias = cfunc_alias self.simd = simd self.dispatch = dispatch @@ -90,7 +97,8 @@ def build_func_data(types, f): func_data = [_fdata_map.get(t, '%s') % (f,) for t in types] return func_data -def TD(types, f=None, astype=None, in_=None, out=None, simd=None, dispatch=None): +def TD(types, f=None, astype=None, in_=None, out=None, cfunc_alias=None, + simd=None, dispatch=None): if f is not None: if isinstance(f, str): func_data = build_func_data(types, f) @@ -119,13 +127,15 @@ def TD(types, f=None, astype=None, in_=None, out=None, simd=None, dispatch=None) simdt = [k for k, v in simd if t in v] else: simdt = [] + # [(dispatch file name without extension '.dispatch.c*', list of types)] if dispatch: - dispt = [k for k, v in dispatch if t in v] + dispt = ([k for k, v in dispatch if t in v]+[None])[0] else: - dispt = [] + dispt = None tds.append(TypeDescription( - t, f=fd, in_=i, out=o, astype=astype, simd=simdt, dispatch=dispt + t, f=fd, in_=i, out=o, astype=astype, cfunc_alias=cfunc_alias, + simd=simdt, dispatch=dispt )) return tds @@ -280,7 +290,7 @@ defdict = { Ufunc(2, 1, Zero, docstrings.get('numpy.core.umath.add'), 'PyUFunc_AdditionTypeResolver', - TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]), + TD(notimes_or_obj, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]), [TypeDescription('M', FullTypeDescr, 'Mm', 'M'), TypeDescription('m', FullTypeDescr, 'mm', 'm'), TypeDescription('M', 
FullTypeDescr, 'mM', 'M'), @@ -291,7 +301,7 @@ defdict = { Ufunc(2, 1, None, # Zero is only a unit to the right, not the left docstrings.get('numpy.core.umath.subtract'), 'PyUFunc_SubtractionTypeResolver', - TD(ints + inexact, simd=[('avx512f', cmplxvec),('avx2', ints)]), + TD(ints + inexact, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]), [TypeDescription('M', FullTypeDescr, 'Mm', 'M'), TypeDescription('m', FullTypeDescr, 'mm', 'm'), TypeDescription('M', FullTypeDescr, 'MM', 'm'), @@ -302,7 +312,7 @@ defdict = { Ufunc(2, 1, One, docstrings.get('numpy.core.umath.multiply'), 'PyUFunc_MultiplicationTypeResolver', - TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]), + TD(notimes_or_obj, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]), [TypeDescription('m', FullTypeDescr, 'mq', 'm'), TypeDescription('m', FullTypeDescr, 'qm', 'm'), TypeDescription('m', FullTypeDescr, 'md', 'm'), @@ -326,10 +336,10 @@ defdict = { Ufunc(2, 1, None, # One is only a unit to the right, not the left docstrings.get('numpy.core.umath.true_divide'), 'PyUFunc_TrueDivisionTypeResolver', - TD(flts+cmplx), - [TypeDescription('m', FullTypeDescr, 'mq', 'm'), - TypeDescription('m', FullTypeDescr, 'md', 'm'), - TypeDescription('m', FullTypeDescr, 'mm', 'd'), + TD(flts+cmplx, cfunc_alias='divide', dispatch=[('loops_arithm_fp', 'fd')]), + [TypeDescription('m', FullTypeDescr, 'mq', 'm', cfunc_alias='divide'), + TypeDescription('m', FullTypeDescr, 'md', 'm', cfunc_alias='divide'), + TypeDescription('m', FullTypeDescr, 'mm', 'd', cfunc_alias='divide'), ], TD(O, f='PyNumber_TrueDivide'), ), @@ -1000,6 +1010,7 @@ def make_arrays(funcdict): # later code1list = [] code2list = [] + dispdict = {} names = sorted(funcdict.keys()) for name in names: uf = funcdict[name] @@ -1010,44 +1021,33 @@ def make_arrays(funcdict): sub = 0 for t in uf.type_descriptions: + cfunc_alias = t.cfunc_alias if t.cfunc_alias else name + cfunc_fname = None if t.func_data is FullTypeDescr: tname = english_upper(chartoname[t.type]) datalist.append('(void *)NULL') - funclist.append( - '%s_%s_%s_%s' % (tname, t.in_, t.out, name)) + cfunc_fname = f"{tname}_{t.in_}_{t.out}_{cfunc_alias}" elif isinstance(t.func_data, FuncNameSuffix): datalist.append('(void *)NULL') tname = english_upper(chartoname[t.type]) - funclist.append( - '%s_%s_%s' % (tname, name, t.func_data.suffix)) + cfunc_fname = f"{tname}_{cfunc_alias}_{t.func_data.suffix}" elif t.func_data is None: datalist.append('(void *)NULL') tname = english_upper(chartoname[t.type]) - funclist.append('%s_%s' % (tname, name)) + cfunc_fname = f"{tname}_{cfunc_alias}" if t.simd is not None: for vt in t.simd: code2list.append(textwrap.dedent("""\ #ifdef HAVE_ATTRIBUTE_TARGET_{ISA} if (NPY_CPU_HAVE({ISA})) {{ - {fname}_functions[{idx}] = {type}_{fname}_{isa}; + {fname}_functions[{idx}] = {cname}_{isa}; }} #endif """).format( ISA=vt.upper(), isa=vt, - fname=name, type=tname, idx=k - )) - if t.dispatch is not None: - for dname in t.dispatch: - code2list.append(textwrap.dedent("""\ - #ifndef NPY_DISABLE_OPTIMIZATION - #include "{dname}.dispatch.h" - #endif - NPY_CPU_DISPATCH_CALL_XB({name}_functions[{k}] = {tname}_{name}); - """).format( - dname=dname, name=name, tname=tname, k=k + fname=name, cname=cfunc_fname, idx=k )) else: - funclist.append('NULL') try: thedict = arity_lookup[uf.nin, uf.nout] except KeyError as e: @@ -1077,6 +1077,13 @@ def make_arrays(funcdict): #datalist.append('(void *)%s' % t.func_data) sub += 1 + if cfunc_fname: + funclist.append(cfunc_fname) + if t.dispatch: 
+ dispdict.setdefault(t.dispatch, []).append((name, k, cfunc_fname)) + else: + funclist.append('NULL') + for x in t.in_ + t.out: siglist.append('NPY_%s' % (english_upper(chartoname[x]),)) @@ -1091,6 +1098,17 @@ def make_arrays(funcdict): % (name, datanames)) code1list.append("static char %s_signatures[] = {%s};" % (name, signames)) + + for dname, funcs in dispdict.items(): + code2list.append(textwrap.dedent(f""" + #ifndef NPY_DISABLE_OPTIMIZATION + #include "{dname}.dispatch.h" + #endif + """)) + for (ufunc_name, func_idx, cfunc_name) in funcs: + code2list.append(textwrap.dedent(f"""\ + NPY_CPU_DISPATCH_CALL_XB({ufunc_name}_functions[{func_idx}] = {cfunc_name}); + """)) return "\n".join(code1list), "\n".join(code2list) def make_ufuncs(funcdict): diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 2ec5e1a64..2e020a595 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -916,8 +916,10 @@ def configuration(parent_package='',top_path=None): join('src', 'umath', 'funcs.inc.src'), join('src', 'umath', 'simd.inc.src'), join('src', 'umath', 'loops.h.src'), + join('src', 'umath', 'loops_utils.h.src'), join('src', 'umath', 'loops.c.src'), join('src', 'umath', 'loops_unary_fp.dispatch.c.src'), + join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'), join('src', 'umath', 'matmul.h.src'), join('src', 'umath', 'matmul.c.src'), join('src', 'umath', 'clip.h.src'), diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h index 5c22c6f1c..dbcff8793 100644 --- a/numpy/core/src/umath/fast_loop_macros.h +++ b/numpy/core/src/umath/fast_loop_macros.h @@ -237,5 +237,117 @@ abs_ptrdiff(char *a, char *b) TYPE io1 = *(TYPE *)iop1; \ BINARY_REDUCE_LOOP_INNER +#define IS_BINARY_STRIDE_ONE(esize, vsize) \ + ((steps[0] == esize) && \ + (steps[1] == esize) && \ + (steps[2] == esize) && \ + (abs_ptrdiff(args[2], args[0]) >= vsize) && \ + (abs_ptrdiff(args[2], args[1]) >= vsize)) + +/* + * stride is equal to element size and input and destination are equal or + * don't overlap within one register. The check of the steps against + * esize also quarantees that steps are >= 0. + */ +#define IS_BLOCKABLE_UNARY(esize, vsize) \ + (steps[0] == (esize) && steps[0] == steps[1] && \ + (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \ + ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \ + ((abs_ptrdiff(args[1], args[0]) == 0)))) + +/* + * Avoid using SIMD for very large step sizes for several reasons: + * 1) Supporting large step sizes requires use of i64gather/scatter_ps instructions, + * in which case we need two i64gather instructions and an additional vinsertf32x8 + * instruction to load a single zmm register (since one i64gather instruction + * loads into a ymm register). This is not ideal for performance. + * 2) Gather and scatter instructions can be slow when the loads/stores + * cross page boundaries. + * + * We instead rely on i32gather/scatter_ps instructions which use a 32-bit index + * element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE + * ensures this. The condition also requires that the input and output arrays + * should have no overlap in memory. 
+ */ +#define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP \ + ((labs(steps[0]) < MAX_STEP_SIZE) && \ + (labs(steps[1]) < MAX_STEP_SIZE) && \ + (labs(steps[2]) < MAX_STEP_SIZE) && \ + (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \ + (nomemoverlap(args[1], steps[1] * dimensions[0], args[2], steps[2] * dimensions[0]))) + +#define IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP \ + ((labs(steps[0]) < MAX_STEP_SIZE) && \ + (labs(steps[1]) < MAX_STEP_SIZE) && \ + (labs(steps[2]) < MAX_STEP_SIZE) && \ + (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \ + (nomemoverlap(args[0], steps[0] * dimensions[0], args[1], steps[1] * dimensions[0]))) + +/* + * 1) Output should be contiguous, can handle strided input data + * 2) Input step should be smaller than MAX_STEP_SIZE for performance + * 3) Input and output arrays should have no overlap in memory + */ +#define IS_OUTPUT_BLOCKABLE_UNARY(esizein, esizeout, vsize) \ + ((steps[0] & (esizein-1)) == 0 && \ + steps[1] == (esizeout) && labs(steps[0]) < MAX_STEP_SIZE && \ + (nomemoverlap(args[1], steps[1] * dimensions[0], args[0], steps[0] * dimensions[0]))) + +#define IS_BLOCKABLE_REDUCE(esize, vsize) \ + (steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \ + npy_is_aligned(args[1], (esize)) && \ + npy_is_aligned(args[0], (esize))) + +#define IS_BLOCKABLE_BINARY(esize, vsize) \ + (steps[0] == steps[1] && steps[1] == steps[2] && steps[2] == (esize) && \ + npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \ + npy_is_aligned(args[0], (esize)) && \ + (abs_ptrdiff(args[2], args[0]) >= (vsize) || \ + abs_ptrdiff(args[2], args[0]) == 0) && \ + (abs_ptrdiff(args[2], args[1]) >= (vsize) || \ + abs_ptrdiff(args[2], args[1]) >= 0)) + +#define IS_BLOCKABLE_BINARY_SCALAR1(esize, vsize) \ + (steps[0] == 0 && steps[1] == steps[2] && steps[2] == (esize) && \ + npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \ + ((abs_ptrdiff(args[2], args[1]) >= (vsize)) || \ + (abs_ptrdiff(args[2], args[1]) == 0)) && \ + abs_ptrdiff(args[2], args[0]) >= (esize)) + +#define IS_BLOCKABLE_BINARY_SCALAR2(esize, vsize) \ + (steps[1] == 0 && steps[0] == steps[2] && steps[2] == (esize) && \ + npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[0], (esize)) && \ + ((abs_ptrdiff(args[2], args[0]) >= (vsize)) || \ + (abs_ptrdiff(args[2], args[0]) == 0)) && \ + abs_ptrdiff(args[2], args[1]) >= (esize)) + +#undef abs_ptrdiff + +#define IS_BLOCKABLE_BINARY_BOOL(esize, vsize) \ + (steps[0] == (esize) && steps[0] == steps[1] && steps[2] == (1) && \ + npy_is_aligned(args[1], (esize)) && \ + npy_is_aligned(args[0], (esize))) + +#define IS_BLOCKABLE_BINARY_SCALAR1_BOOL(esize, vsize) \ + (steps[0] == 0 && steps[1] == (esize) && steps[2] == (1) && \ + npy_is_aligned(args[1], (esize))) + +#define IS_BLOCKABLE_BINARY_SCALAR2_BOOL(esize, vsize) \ + (steps[0] == (esize) && steps[1] == 0 && steps[2] == (1) && \ + npy_is_aligned(args[0], (esize))) + +/* align var to alignment */ +#define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\ + npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\ + alignment, n);\ + for(i = 0; i < peel; i++) + +#define LOOP_BLOCKED(type, vsize)\ + for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\ + i += (vsize / sizeof(type))) + +#define LOOP_BLOCKED_END\ + for (; i < n; i++) + #endif /* _NPY_UMATH_FAST_LOOP_MACROS_H_ */ diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 6403efaee..839d2b3ae 100644 
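The LOOP_BLOCK_ALIGN_VAR / LOOP_BLOCKED / LOOP_BLOCKED_END macros moved into fast_loop_macros.h above implement the usual peel/main/tail pattern. A hand-expanded sketch for a float add with a 16-byte block, assuming plain C in place of the real npy_aligned_block_offset()/npy_blocked_end() helpers:

#include <stddef.h>
#include <stdint.h>

/* Illustrative expansion of the three blocked-loop macros. */
static void
blocked_add_f32(float *op, const float *ip1, const float *ip2, size_t n)
{
    size_t i = 0;
    /* LOOP_BLOCK_ALIGN_VAR: scalar peel until op is 16-byte aligned */
    while (i < n && ((uintptr_t)&op[i] % 16) != 0) {
        op[i] = ip1[i] + ip2[i];
        i++;
    }
    /* LOOP_BLOCKED: whole blocks of 16 / sizeof(float) = 4 elements,
     * where the body would normally be a single SIMD operation */
    for (; i + 4 <= n; i += 4) {
        for (size_t k = 0; k < 4; k++) {
            op[i + k] = ip1[i + k] + ip2[i + k];
        }
    }
    /* LOOP_BLOCKED_END: scalar tail */
    for (; i < n; i++) {
        op[i] = ip1[i] + ip2[i];
    }
}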
--- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1797,124 +1797,11 @@ DOUBLE_log_avx512f(char **args, npy_intp const *dimensions, npy_intp const *step /**begin repeat * Float types - * #type = npy_float, npy_double, npy_longdouble, npy_float# - * #dtype = npy_float, npy_double, npy_longdouble, npy_half# - * #TYPE = FLOAT, DOUBLE, LONGDOUBLE, HALF# - * #c = f, , l, # - * #C = F, , L, # - * #trf = , , , npy_half_to_float# - */ - -/* - * Pairwise summation, rounding error O(lg n) instead of O(n). - * The recursion depth is O(lg n) as well. - * when updating also update similar complex floats summation - */ -static @type@ -pairwise_sum_@TYPE@(char *a, npy_intp n, npy_intp stride) -{ - if (n < 8) { - npy_intp i; - @type@ res = 0.; - - for (i = 0; i < n; i++) { - res += @trf@(*((@dtype@*)(a + i * stride))); - } - return res; - } - else if (n <= PW_BLOCKSIZE) { - npy_intp i; - @type@ r[8], res; - - /* - * sum a block with 8 accumulators - * 8 times unroll reduces blocksize to 16 and allows vectorization with - * avx without changing summation ordering - */ - r[0] = @trf@(*((@dtype@ *)(a + 0 * stride))); - r[1] = @trf@(*((@dtype@ *)(a + 1 * stride))); - r[2] = @trf@(*((@dtype@ *)(a + 2 * stride))); - r[3] = @trf@(*((@dtype@ *)(a + 3 * stride))); - r[4] = @trf@(*((@dtype@ *)(a + 4 * stride))); - r[5] = @trf@(*((@dtype@ *)(a + 5 * stride))); - r[6] = @trf@(*((@dtype@ *)(a + 6 * stride))); - r[7] = @trf@(*((@dtype@ *)(a + 7 * stride))); - - for (i = 8; i < n - (n % 8); i += 8) { - /* small blocksizes seems to mess with hardware prefetch */ - NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(@dtype@))*stride, 0, 3); - r[0] += @trf@(*((@dtype@ *)(a + (i + 0) * stride))); - r[1] += @trf@(*((@dtype@ *)(a + (i + 1) * stride))); - r[2] += @trf@(*((@dtype@ *)(a + (i + 2) * stride))); - r[3] += @trf@(*((@dtype@ *)(a + (i + 3) * stride))); - r[4] += @trf@(*((@dtype@ *)(a + (i + 4) * stride))); - r[5] += @trf@(*((@dtype@ *)(a + (i + 5) * stride))); - r[6] += @trf@(*((@dtype@ *)(a + (i + 6) * stride))); - r[7] += @trf@(*((@dtype@ *)(a + (i + 7) * stride))); - } - - /* accumulate now to avoid stack spills for single peel loop */ - res = ((r[0] + r[1]) + (r[2] + r[3])) + - ((r[4] + r[5]) + (r[6] + r[7])); - - /* do non multiple of 8 rest */ - for (; i < n; i++) { - res += @trf@(*((@dtype@ *)(a + i * stride))); - } - return res; - } - else { - /* divide by two but avoid non-multiples of unroll factor */ - npy_intp n2 = n / 2; - - n2 -= n2 % 8; - return pairwise_sum_@TYPE@(a, n2, stride) + - pairwise_sum_@TYPE@(a + n2 * stride, n - n2, stride); - } -} - -/**end repeat**/ - -/**begin repeat - * Float types * #type = npy_float, npy_double, npy_longdouble# * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# * #c = f, , l# * #C = F, , L# */ - -/**begin repeat1 - * Arithmetic - * # kind = add, subtract, multiply, divide# - * # OP = +, -, *, /# - * # PW = 1, 0, 0, 0# - */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (IS_BINARY_REDUCE) { -#if @PW@ - @type@ * iop1 = (@type@ *)args[0]; - npy_intp n = dimensions[0]; - - *iop1 @OP@= pairwise_sum_@TYPE@(args[1], n, steps[1]); -#else - BINARY_REDUCE_LOOP(@type@) { - io1 @OP@= *(@type@ *)ip2; - } - *((@type@ *)iop1) = io1; -#endif - } - else if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - *((@type@ *)op1) = in1 @OP@ in2; - } - } -} -/**end repeat1**/ - /**begin repeat1 * #kind = 
equal, not_equal, less, less_equal, greater, greater_equal, * logical_and, logical_or# @@ -2244,8 +2131,6 @@ NPY_NO_EXPORT void } } -#define @TYPE@_true_divide @TYPE@_divide - /**end repeat**/ /* @@ -2254,6 +2139,38 @@ NPY_NO_EXPORT void ***************************************************************************** */ +/**begin repeat + * Arithmetic + * # kind = add, subtract, multiply, divide# + * # OP = +, -, *, /# + * # PW = 1, 0, 0, 0# + */ +NPY_NO_EXPORT void +LONGDOUBLE_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (IS_BINARY_REDUCE) { +#if @PW@ + npy_longdouble * iop1 = (npy_longdouble *)args[0]; + npy_intp n = dimensions[0]; + + *iop1 @OP@= LONGDOUBLE_pairwise_sum(args[1], n, steps[1]); +#else + BINARY_REDUCE_LOOP(npy_longdouble) { + io1 @OP@= *(npy_longdouble *)ip2; + } + *((npy_longdouble *)iop1) = io1; +#endif + } + else { + BINARY_LOOP { + const npy_longdouble in1 = *(npy_longdouble *)ip1; + const npy_longdouble in2 = *(npy_longdouble *)ip2; + *((npy_longdouble *)op1) = in1 @OP@ in2; + } + } +} +/**end repeat**/ + NPY_NO_EXPORT void LONGDOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { @@ -2306,7 +2223,7 @@ HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void #if @PW@ npy_intp n = dimensions[0]; - io1 @OP@= pairwise_sum_HALF(args[1], n, steps[1]); + io1 @OP@= HALF_pairwise_sum(args[1], n, steps[1]); #else BINARY_REDUCE_LOOP_INNER { io1 @OP@= npy_half_to_float(*(npy_half *)ip2); @@ -2614,9 +2531,6 @@ HALF_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, } } -#define HALF_true_divide HALF_divide - - /* ***************************************************************************** ** COMPLEX LOOPS ** @@ -2643,80 +2557,8 @@ HALF_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, * #SIMD = 1, 1, 0# */ -/* similar to pairwise sum of real floats */ -static void -pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n, - npy_intp stride) -{ - assert(n % 2 == 0); - if (n < 8) { - npy_intp i; - - *rr = 0.; - *ri = 0.; - for (i = 0; i < n; i += 2) { - *rr += *((@ftype@ *)(a + i * stride + 0)); - *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@))); - } - return; - } - else if (n <= PW_BLOCKSIZE) { - npy_intp i; - @ftype@ r[8]; - - /* - * sum a block with 8 accumulators - * 8 times unroll reduces blocksize to 16 and allows vectorization with - * avx without changing summation ordering - */ - r[0] = *((@ftype@ *)(a + 0 * stride)); - r[1] = *((@ftype@ *)(a + 0 * stride + sizeof(@ftype@))); - r[2] = *((@ftype@ *)(a + 2 * stride)); - r[3] = *((@ftype@ *)(a + 2 * stride + sizeof(@ftype@))); - r[4] = *((@ftype@ *)(a + 4 * stride)); - r[5] = *((@ftype@ *)(a + 4 * stride + sizeof(@ftype@))); - r[6] = *((@ftype@ *)(a + 6 * stride)); - r[7] = *((@ftype@ *)(a + 6 * stride + sizeof(@ftype@))); - - for (i = 8; i < n - (n % 8); i += 8) { - /* small blocksizes seems to mess with hardware prefetch */ - NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(@ftype@))*stride, 0, 3); - r[0] += *((@ftype@ *)(a + (i + 0) * stride)); - r[1] += *((@ftype@ *)(a + (i + 0) * stride + sizeof(@ftype@))); - r[2] += *((@ftype@ *)(a + (i + 2) * stride)); - r[3] += *((@ftype@ *)(a + (i + 2) * stride + sizeof(@ftype@))); - r[4] += *((@ftype@ *)(a + (i + 4) * stride)); - r[5] += *((@ftype@ *)(a + (i + 4) * stride + sizeof(@ftype@))); - r[6] += *((@ftype@ *)(a + (i + 6) * stride)); - r[7] += *((@ftype@ *)(a + (i + 6) * stride + 
sizeof(@ftype@))); - } - - /* accumulate now to avoid stack spills for single peel loop */ - *rr = ((r[0] + r[2]) + (r[4] + r[6])); - *ri = ((r[1] + r[3]) + (r[5] + r[7])); - - /* do non multiple of 8 rest */ - for (; i < n; i+=2) { - *rr += *((@ftype@ *)(a + i * stride + 0)); - *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@))); - } - return; - } - else { - /* divide by two but avoid non-multiples of unroll factor */ - @ftype@ rr1, ri1, rr2, ri2; - npy_intp n2 = n / 2; - - n2 -= n2 % 8; - pairwise_sum_@TYPE@(&rr1, &ri1, a, n2, stride); - pairwise_sum_@TYPE@(&rr2, &ri2, a + n2 * stride, n - n2, stride); - *rr = rr1 + rr2; - *ri = ri1 + ri2; - return; - } -} - - +#if !@SIMD@ +// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src' /**begin repeat1 * arithmetic * #kind = add, subtract# @@ -2733,7 +2575,7 @@ NPY_NO_EXPORT void @ftype@ * oi = ((@ftype@ *)args[0]) + 1; @ftype@ rr, ri; - pairwise_sum_@TYPE@(&rr, &ri, args[1], n * 2, steps[1] / 2); + @TYPE@_pairwise_sum(&rr, &ri, args[1], n * 2, steps[1] / 2); *or @OP@= rr; *oi @OP@= ri; return; @@ -2763,6 +2605,7 @@ NPY_NO_EXPORT void ((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r; } } +#endif // !SIMD NPY_NO_EXPORT void @TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) @@ -2796,31 +2639,6 @@ NPY_NO_EXPORT void } } -#if @SIMD@ -NPY_NO_EXPORT void -@TYPE@_add_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func) -{ - if (IS_BINARY_REDUCE) { - @TYPE@_add(args, dimensions, steps, func); - } - else if (!run_binary_avx512f_add_@TYPE@(args, dimensions, steps)) { - @TYPE@_add(args, dimensions, steps, func); - } -} - -/**begin repeat1 - * arithmetic - * #kind = subtract, multiply# - */ -NPY_NO_EXPORT void -@TYPE@_@kind@_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func) -{ - if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) { - @TYPE@_@kind@(args, dimensions, steps, func); - } -} -/**end repeat1**/ -#endif NPY_NO_EXPORT void @TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) @@ -3066,8 +2884,6 @@ NPY_NO_EXPORT void } /**end repeat1**/ -#define @TYPE@_true_divide @TYPE@_divide - /**end repeat**/ #undef CGE diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index a0b68d168..c15ff8e3b 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -185,6 +185,22 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, /**end repeat1**/ /**end repeat**/ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithm_fp.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * Arithmetic + * # kind = add, subtract, multiply, divide# + * # OP = +, -, *, /# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat1**/ +/**end repeat**/ + /**begin repeat * #TYPE = FLOAT, DOUBLE# */ @@ -356,9 +372,6 @@ NPY_NO_EXPORT void NPY_NO_EXPORT void @TYPE@_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -#define @TYPE@_true_divide @TYPE@_divide - /**end repeat**/ @@ -367,6 +380,19 @@ NPY_NO_EXPORT void ** COMPLEX LOOPS ** ***************************************************************************** */ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithm_fp.dispatch.h" +#endif +/**begin repeat + * #TYPE = CFLOAT, CDOUBLE# + 
*/ +/**begin repeat1 + * #kind = add, subtract, multiply# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ #define CGE(xr,xi,yr,yi) (xr > yr || (xr == yr && xi >= yi)); #define CLE(xr,xi,yr,yi) (xr < yr || (xr == yr && xi <= yi)); @@ -380,26 +406,14 @@ NPY_NO_EXPORT void * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# * #c = f, , l# * #C = F, , L# - * #IFSIMD = 1, 1, 0# */ /**begin repeat1 - * #isa = , _avx512f# - */ - -/**begin repeat2 * arithmetic - * #kind = add, subtract# - * #OP = +, -# + * #kind = add, subtract, multiply# */ - NPY_NO_EXPORT void -C@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**end repeat2**/ - -NPY_NO_EXPORT void -C@TYPE@_multiply@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**end repeat1**/ NPY_NO_EXPORT void @@ -480,7 +494,6 @@ C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, v NPY_NO_EXPORT void C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**end repeat1**/ -#define C@TYPE@_true_divide C@TYPE@_divide /**end repeat**/ @@ -593,10 +606,6 @@ NPY_NO_EXPORT void TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /* Special case equivalents to above functions */ - -#define TIMEDELTA_mq_m_true_divide TIMEDELTA_mq_m_divide -#define TIMEDELTA_md_m_true_divide TIMEDELTA_md_m_divide -#define TIMEDELTA_mm_d_true_divide TIMEDELTA_mm_d_divide #define TIMEDELTA_mq_m_floor_divide TIMEDELTA_mq_m_divide #define TIMEDELTA_md_m_floor_divide TIMEDELTA_md_m_divide /* #define TIMEDELTA_mm_d_floor_divide TIMEDELTA_mm_d_divide */ diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src new file mode 100644 index 000000000..d8c8fdc9e --- /dev/null +++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src @@ -0,0 +1,777 @@ +/*@targets + ** $maxopt baseline + ** sse2 avx2 avx512f + **/ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "lowlevel_strided_loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +// TODO: replace raw SIMD with NPYV +//############################################################################### +//## Real Single/Double precision +//############################################################################### +/******************************************************************************** + ** Defining the SIMD kernels + ********************************************************************************/ +#ifdef NPY_HAVE_SSE2 +/**begin repeat + * #type = npy_float, npy_double# + * #TYPE = FLOAT, DOUBLE# + * #scalarf = npy_sqrtf, npy_sqrt# + * #c = f, # + * #vtype = __m128, __m128d# + * #vtype256 = __m256, __m256d# + * #vtype512 = __m512, __m512d# + * #vpre = _mm, _mm# + * #vpre256 = _mm256, _mm256# + * #vpre512 = _mm512, _mm512# + * #vsuf = ps, pd# + * #vsufs = ss, sd# + * #nan = NPY_NANF, NPY_NAN# + * #double = 0, 1# + * #cast = _mm_castps_si128, _mm_castpd_si128# + */ +/**begin repeat1 +* Arithmetic +* # kind = add, subtract, multiply, divide# +* # OP = 
+, -, *, /# +* # VOP = add, sub, mul, div# +*/ +static void +sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ +#ifdef NPY_HAVE_AVX512F + const npy_intp vector_size_bytes = 64; + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[i] @OP@ ip2[i]; + /* lots of specializations, to squeeze out max performance */ + if (npy_is_aligned(&ip1[i], vector_size_bytes) && npy_is_aligned(&ip2[i], vector_size_bytes)) { + if (ip1 == ip2) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); + @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a); + @vpre512@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); + @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); + @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); + @vpre512@_store_@vsuf@(&op[i], c); + } + } + } + else if (npy_is_aligned(&ip1[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); + @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); + @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); + @vpre512@_store_@vsuf@(&op[i], c); + } + } + else if (npy_is_aligned(&ip2[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); + @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); + @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); + @vpre512@_store_@vsuf@(&op[i], c); + } + } + else { + if (ip1 == ip2) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); + @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a); + @vpre512@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); + @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); + @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); + @vpre512@_store_@vsuf@(&op[i], c); + } + } + } +#elif defined NPY_HAVE_AVX2 + const npy_intp vector_size_bytes = 32; + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[i] @OP@ ip2[i]; + /* lots of specializations, to squeeze out max performance */ + if (npy_is_aligned(&ip1[i], vector_size_bytes) && + npy_is_aligned(&ip2[i], vector_size_bytes)) { + if (ip1 == ip2) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); + @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a); + @vpre256@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); + @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); + @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); + @vpre256@_store_@vsuf@(&op[i], c); + } + } + } + else if (npy_is_aligned(&ip1[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); + @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); + @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); + @vpre256@_store_@vsuf@(&op[i], c); + } + } + else if (npy_is_aligned(&ip2[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); + @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); + @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); + @vpre256@_store_@vsuf@(&op[i], c); + } + } + else { + if (ip1 == ip2) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); + @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a); + @vpre256@_store_@vsuf@(&op[i], c); + } + } + else { + 
LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); + @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); + @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); + @vpre256@_store_@vsuf@(&op[i], c); + } + } + } +#else + const npy_intp vector_size_bytes = 16; + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[i] @OP@ ip2[i]; + /* lots of specializations, to squeeze out max performance */ + if (npy_is_aligned(&ip1[i], vector_size_bytes) && + npy_is_aligned(&ip2[i], vector_size_bytes)) { + if (ip1 == ip2) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); + @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + } + else if (npy_is_aligned(&ip1[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); + @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else if (npy_is_aligned(&ip2[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); + @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else { + if (ip1 == ip2) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); + @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + } +#endif + LOOP_BLOCKED_END { + op[i] = ip1[i] @OP@ ip2[i]; + } +} + +static void +sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ +#ifdef NPY_HAVE_AVX512F + const npy_intp vector_size_bytes = 64; + const @vtype512@ a = @vpre512@_set1_@vsuf@(ip1[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[0] @OP@ ip2[i]; + if (npy_is_aligned(&ip2[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); + @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); + @vpre512@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); + @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); + @vpre512@_store_@vsuf@(&op[i], c); + } + } + + +#elif defined NPY_HAVE_AVX2 + const npy_intp vector_size_bytes = 32; + const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[0] @OP@ ip2[i]; + if (npy_is_aligned(&ip2[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); + @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); + @vpre256@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); + @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); + @vpre256@_store_@vsuf@(&op[i], c); + } + } +#else + const npy_intp vector_size_bytes = 16; + const @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[0] @OP@ ip2[i]; + 
if (npy_is_aligned(&ip2[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } +#endif + LOOP_BLOCKED_END { + op[i] = ip1[0] @OP@ ip2[i]; + } +} + +static void +sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ +#ifdef NPY_HAVE_AVX512F + const npy_intp vector_size_bytes = 64; + const @vtype512@ b = @vpre512@_set1_@vsuf@(ip2[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[i] @OP@ ip2[0]; + if (npy_is_aligned(&ip1[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); + @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); + @vpre512@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); + @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); + @vpre512@_store_@vsuf@(&op[i], c); + } + } + +#elif defined NPY_HAVE_AVX2 + const npy_intp vector_size_bytes = 32; + const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[i] @OP@ ip2[0]; + if (npy_is_aligned(&ip1[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); + @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); + @vpre256@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); + @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); + @vpre256@_store_@vsuf@(&op[i], c); + } + } +#else + const npy_intp vector_size_bytes = 16; + const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[i] @OP@ ip2[0]; + if (npy_is_aligned(&ip1[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } +#endif + LOOP_BLOCKED_END { + op[i] = ip1[i] @OP@ ip2[0]; + } +} + +/**end repeat1**/ +/**end repeat**/ + +#else // NPY_HAVE_SSE2 + +/**begin repeat + * #type = npy_float, npy_double# + * #TYPE = FLOAT, DOUBLE# + * #sfx = f32, f64# + * #CHK = , _F64# + */ +#if NPY_SIMD@CHK@ +/**begin repeat1 +* Arithmetic +* # kind = add, subtract, multiply, divide# +* # OP = +, -, *, /# +* # VOP = add, sub, mul, div# +*/ + +static void +simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ + LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { + op[i] = ip1[i] @OP@ ip2[i]; + } + /* lots of specializations, to squeeze out max performance */ + if (ip1 == ip2) { + LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { + npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]); + npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, a); + npyv_store_@sfx@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { + npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]); + npyv_@sfx@ b = npyv_load_@sfx@(&ip2[i]); + npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, b); + npyv_store_@sfx@(&op[i], c); + } + } + LOOP_BLOCKED_END { + op[i] = ip1[i] @OP@ ip2[i]; + } +} + 
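For reference, instantiating the universal-intrinsics (NPYV) template above for single-precision add gives roughly the following; this is a hand-expanded sketch of the template, without the ip1 == ip2 specialization, not additional code in the commit:

#if NPY_SIMD
/* sketch: f32 instantiation of simd_binary_@kind@_@TYPE@ for add */
static void
simd_binary_add_FLOAT(npy_float *op, npy_float *ip1, npy_float *ip2, npy_intp n)
{
    /* scalar peel until the output pointer is aligned to the SIMD width */
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] + ip2[i];
    }
    /* main blocked loop: one full vector register per iteration */
    LOOP_BLOCKED(npy_float, NPY_SIMD_WIDTH) {
        npyv_f32 a = npyv_load_f32(&ip1[i]);
        npyv_f32 b = npyv_load_f32(&ip2[i]);
        npyv_store_f32(&op[i], npyv_add_f32(a, b));
    }
    /* scalar tail for the remainder */
    LOOP_BLOCKED_END {
        op[i] = ip1[i] + ip2[i];
    }
}
#endif  /* NPY_SIMD */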
+static void +simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ + const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { + op[i] = ip1[0] @OP@ ip2[i]; + } + LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { + npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]); + npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2); + npyv_store_@sfx@(&op[i], v3); + } + LOOP_BLOCKED_END { + op[i] = ip1[0] @OP@ ip2[i]; + } +} + +static void +simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ + const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { + op[i] = ip1[i] @OP@ ip2[0]; + } + LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { + npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]); + npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2); + npyv_store_@sfx@(&op[i], v3); + } + LOOP_BLOCKED_END { + op[i] = ip1[i] @OP@ ip2[0]; + } +} +/**end repeat1**/ +#endif /* NPY_SIMD@CHK@ */ +/**end repeat**/ +#endif // NPY_HAVE_SSE2 + +/**begin repeat + * Float types + * #type = npy_float, npy_double, npy_longdouble# + * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# + * #vector = 1, 1, 0# + * #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 # + */ +/**begin repeat1 + * Arithmetic + * # kind = add, subtract, multiply, divide# + */ +static NPY_INLINE int +run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ +#if @vector@ && defined NPY_HAVE_SSE2 + @type@ * ip1 = (@type@ *)args[0]; + @type@ * ip2 = (@type@ *)args[1]; + @type@ * op = (@type@ *)args[2]; + npy_intp n = dimensions[0]; +#if defined NPY_HAVE_AVX512F + const npy_uintp vector_size_bytes = 64; +#elif defined NPY_HAVE_AVX2 + const npy_uintp vector_size_bytes = 32; +#else + const npy_uintp vector_size_bytes = 32; +#endif + /* argument one scalar */ + if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) { + sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } + /* argument two scalar */ + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), vector_size_bytes)) { + sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } + else if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) { + sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } +#elif @VECTOR@ + @type@ * ip1 = (@type@ *)args[0]; + @type@ * ip2 = (@type@ *)args[1]; + @type@ * op = (@type@ *)args[2]; + npy_intp n = dimensions[0]; + /* argument one scalar */ + if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), NPY_SIMD_WIDTH)) { + simd_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } + /* argument two scalar */ + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH)) { + simd_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } + else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) { + simd_binary_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } +#endif + return 0; +} +/**end repeat1**/ +/**end repeat**/ + +/******************************************************************************** + ** Defining ufunc inner functions + ********************************************************************************/ +/**begin repeat + * Float types + * #type = npy_float, npy_double# + * #TYPE = FLOAT, DOUBLE# + * #c = f, # + * #C = F, # + */ +/**begin repeat1 + * Arithmetic + * # kind = add, subtract, multiply, divide# + * # OP = +, -, *, /# + * # PW = 1, 0, 0, 0# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void 
*NPY_UNUSED(func)) +{ + if (IS_BINARY_REDUCE) { +#if @PW@ + @type@ * iop1 = (@type@ *)args[0]; + npy_intp n = dimensions[0]; + + *iop1 @OP@= @TYPE@_pairwise_sum(args[1], n, steps[1]); +#else + BINARY_REDUCE_LOOP(@type@) { + io1 @OP@= *(@type@ *)ip2; + } + *((@type@ *)iop1) = io1; +#endif + } + else if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) { + BINARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; + *((@type@ *)op1) = in1 @OP@ in2; + } + } +} +/**end repeat1**/ +/**end repeat**/ + +//############################################################################### +//## Complex Single/Double precision +//############################################################################### +/******************************************************************************** + ** Defining the SIMD kernels + ********************************************************************************/ +#if !defined(_MSC_VER) && defined(NPY_HAVE_AVX512F) + /** + * For somehow MSVC commit aggressive optimization lead + * to raises 'RuntimeWarning: invalid value encountered in multiply' + * + * the issue mainly caused by '_mm512_maskz_loadu_ps', we need to + * investigate about it while moving to NPYV. + */ + #define AVX512F_NOMSVC +#endif + +#ifdef AVX512F_NOMSVC +static NPY_INLINE __mmask16 +avx512_get_full_load_mask_ps(void) +{ + return 0xFFFF; +} + +static NPY_INLINE __mmask8 +avx512_get_full_load_mask_pd(void) +{ + return 0xFF; +} +static NPY_INLINE __m512 +avx512_masked_load_ps(__mmask16 mask, npy_float* addr) +{ + return _mm512_maskz_loadu_ps(mask, (__m512 *)addr); +} + +static NPY_INLINE __m512d +avx512_masked_load_pd(__mmask8 mask, npy_double* addr) +{ + return _mm512_maskz_loadu_pd(mask, (__m512d *)addr); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 +avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem) +{ + return (0x0001 << num_elem) - 0x0001; +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8 +avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem) +{ + return (0x01 << num_elem) - 0x01; +} +/**begin repeat + * #vsub = ps, pd# + * #type= npy_float, npy_double# + * #epi_vsub = epi32, epi64# + * #vtype = __m512, __m512d# + * #mask = __mmask16, __mmask8# + * #and_const = 0x7fffffff, 0x7fffffffffffffffLL# + * #neg_mask = 0x80000000, 0x8000000000000000# + * #perm_ = 0xb1, 0x55# + * #cmpx_img_mask = 0xAAAA, 0xAA# + * #cmpx_re_mask = 0x5555, 0x55# + * #INF = NPY_INFINITYF, NPY_INFINITY# + * #NAN = NPY_NANF, NPY_NAN# + */ +static @vtype@ +avx512_hadd_@vsub@(const @vtype@ x) +{ + return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@)); +} + +static @vtype@ +avx512_hsub_@vsub@(const @vtype@ x) +{ + return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@)); +} +static NPY_INLINE @vtype@ +avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2) +{ + // x1 = r1, i1 + // x2 = r2, i2 + @vtype@ x3 = _mm512_permute_@vsub@(x2, @perm_@); // i2, r2 + @vtype@ x12 = _mm512_mul_@vsub@(x1, x2); // r1*r2, i1*i2 + @vtype@ x13 = _mm512_mul_@vsub@(x1, x3); // r1*i2, r2*i1 + @vtype@ outreal = avx512_hsub_@vsub@(x12); // r1*r2 - i1*i2, r1*r2 - i1*i2 + @vtype@ outimg = avx512_hadd_@vsub@(x13); // r1*i2 + i1*r2, r1*i2 + i1*r2 + return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg); +} +/**end repeat**/ +#endif + +/**begin repeat + * #TYPE = CFLOAT, CDOUBLE# + * #type = npy_float, npy_double# + * #num_lanes = 16, 8# + * #vsuffix = ps, pd# + * #epi_vsub = epi32, epi64# + * #mask = 
__mmask16, __mmask8# + * #vtype = __m512, __m512d# + * #scale = 4, 8# + * #vindextype = __m512i, __m256i# + * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256# + * #storemask = 0xFF, 0xF# + * #IS_FLOAT = 1, 0# + */ +/**begin repeat1 + * #func = add, subtract, multiply# + * #vectorf = _mm512_add, _mm512_sub, avx512_cmul# + */ +#if defined AVX512F_NOMSVC +static NPY_INLINE void +AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps) +{ + const npy_intp array_size = dimensions[0]; + npy_intp num_remaining_elements = 2*array_size; + @type@* ip1 = (@type@*) args[0]; + @type@* ip2 = (@type@*) args[1]; + @type@* op = (@type@*) args[2]; + + @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@(); + + while (num_remaining_elements > 0) { + if (num_remaining_elements < @num_lanes@) { + load_mask = avx512_get_partial_load_mask_@vsuffix@( + num_remaining_elements, @num_lanes@); + } + @vtype@ x1, x2; + x1 = avx512_masked_load_@vsuffix@(load_mask, ip1); + x2 = avx512_masked_load_@vsuffix@(load_mask, ip2); + + @vtype@ out = @vectorf@_@vsuffix@(x1, x2); + + _mm512_mask_storeu_@vsuffix@(op, load_mask, out); + + ip1 += @num_lanes@; + ip2 += @num_lanes@; + op += @num_lanes@; + num_remaining_elements -= @num_lanes@; + } +} +#endif // AVX512F_NOMSVC +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #TYPE = CFLOAT, CDOUBLE# + * #type= npy_float, npy_double# + * #esize = 8, 16# + */ +/**begin repeat1 + * #func = add, subtract, multiply# + */ +static NPY_INLINE int +run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps) +{ +#if defined AVX512F_NOMSVC + if (IS_BINARY_STRIDE_ONE(@esize@, 64)) { + AVX512F_@func@_@TYPE@(args, dimensions, steps); + return 1; + } + else + return 0; +#endif + return 0; +} +/**end repeat1**/ +/**end repeat**/ + +/******************************************************************************** + ** Defining ufunc inner functions + ********************************************************************************/ +/**begin repeat + * complex types + * #TYPE = CFLOAT, CDOUBLE# + * #ftype = npy_float, npy_double# + * #c = f, # + * #C = F, # + */ +/**begin repeat1 + * arithmetic + * #kind = add, subtract# + * #OP = +, -# + * #PW = 1, 0# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + // Parenthesis around @PW@ tells clang dead code is intentional + if (IS_BINARY_REDUCE && (@PW@)) { + npy_intp n = dimensions[0]; + @ftype@ * or = ((@ftype@ *)args[0]); + @ftype@ * oi = ((@ftype@ *)args[0]) + 1; + @ftype@ rr, ri; + + @TYPE@_pairwise_sum(&rr, &ri, args[1], n * 2, steps[1] / 2); + *or @OP@= rr; + *oi @OP@= ri; + return; + } + if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) { + BINARY_LOOP { + const @ftype@ in1r = ((@ftype@ *)ip1)[0]; + const @ftype@ in1i = ((@ftype@ *)ip1)[1]; + const @ftype@ in2r = ((@ftype@ *)ip2)[0]; + const @ftype@ in2i = ((@ftype@ *)ip2)[1]; + ((@ftype@ *)op1)[0] = in1r @OP@ in2r; + ((@ftype@ *)op1)[1] = in1i @OP@ in2i; + } + } +} +/**end repeat1**/ + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_multiply) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (!run_binary_avx512f_multiply_@TYPE@(args, dimensions, steps)) { + BINARY_LOOP { + const @ftype@ in1r = ((@ftype@ *)ip1)[0]; + const @ftype@ in1i = ((@ftype@ *)ip1)[1]; + const @ftype@ in2r = ((@ftype@ *)ip2)[0]; + const @ftype@ in2i = ((@ftype@ *)ip2)[1]; + 
((@ftype@ *)op1)[0] = in1r*in2r - in1i*in2i; + ((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r; + } + } +} +/**end repeat**/ diff --git a/numpy/core/src/umath/loops_utils.h b/numpy/core/src/umath/loops_utils.h deleted file mode 100644 index f5540bdae..000000000 --- a/numpy/core/src/umath/loops_utils.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef _NPY_UMATH_LOOPS_UTILS_H_ -#define _NPY_UMATH_LOOPS_UTILS_H_ - -#include "numpy/npy_common.h" // NPY_FINLINE -/* - * nomemoverlap - returns false if two strided arrays have an overlapping - * region in memory. ip_size/op_size = size of the arrays which can be negative - * indicating negative steps. - */ -NPY_FINLINE npy_bool -nomemoverlap(char *ip, npy_intp ip_size, char *op, npy_intp op_size) -{ - char *ip_start, *ip_end, *op_start, *op_end; - if (ip_size < 0) { - ip_start = ip + ip_size; - ip_end = ip; - } - else { - ip_start = ip; - ip_end = ip + ip_size; - } - if (op_size < 0) { - op_start = op + op_size; - op_end = op; - } - else { - op_start = op; - op_end = op + op_size; - } - return (ip_start == op_start && op_end == ip_end) || - (ip_start > op_end) || (op_start > ip_end); -} - -// returns true if two strided arrays have an overlapping region in memory -// same as `nomemoverlap()` but requires array length and step sizes -NPY_FINLINE npy_bool -is_mem_overlap(const void *src, npy_intp src_step, const void *dst, npy_intp dst_step, npy_intp len) -{ - return !(nomemoverlap((char*)src, src_step*len, (char*)dst, dst_step*len)); -} - -#endif // _NPY_UMATH_LOOPS_UTILS_H_ diff --git a/numpy/core/src/umath/loops_utils.h.src b/numpy/core/src/umath/loops_utils.h.src new file mode 100644 index 000000000..dfa790ed9 --- /dev/null +++ b/numpy/core/src/umath/loops_utils.h.src @@ -0,0 +1,213 @@ +#ifndef _NPY_UMATH_LOOPS_UTILS_H_ +#define _NPY_UMATH_LOOPS_UTILS_H_ + +#include "numpy/npy_common.h" // NPY_FINLINE +#include "numpy/halffloat.h" // npy_half_to_float +/* + * nomemoverlap - returns false if two strided arrays have an overlapping + * region in memory. ip_size/op_size = size of the arrays which can be negative + * indicating negative steps. 
+ */ +NPY_FINLINE npy_bool +nomemoverlap(char *ip, npy_intp ip_size, char *op, npy_intp op_size) +{ + char *ip_start, *ip_end, *op_start, *op_end; + if (ip_size < 0) { + ip_start = ip + ip_size; + ip_end = ip; + } + else { + ip_start = ip; + ip_end = ip + ip_size; + } + if (op_size < 0) { + op_start = op + op_size; + op_end = op; + } + else { + op_start = op; + op_end = op + op_size; + } + return (ip_start == op_start && op_end == ip_end) || + (ip_start > op_end) || (op_start > ip_end); +} + +// returns true if two strided arrays have an overlapping region in memory +// same as `nomemoverlap()` but requires array length and step sizes +NPY_FINLINE npy_bool +is_mem_overlap(const void *src, npy_intp src_step, const void *dst, npy_intp dst_step, npy_intp len) +{ + return !(nomemoverlap((char*)src, src_step*len, (char*)dst, dst_step*len)); +} + +/* + * cutoff blocksize for pairwise summation + * decreasing it decreases errors slightly as more pairs are summed but + * also lowers performance, as the inner loop is unrolled eight times it is + * effectively 16 + */ +#define PW_BLOCKSIZE 128 + +/**begin repeat + * Float types + * #type = npy_float, npy_double, npy_longdouble, npy_float# + * #dtype = npy_float, npy_double, npy_longdouble, npy_half# + * #TYPE = FLOAT, DOUBLE, LONGDOUBLE, HALF# + * #c = f, , l, # + * #C = F, , L, # + * #trf = , , , npy_half_to_float# + */ + +/* + * Pairwise summation, rounding error O(lg n) instead of O(n). + * The recursion depth is O(lg n) as well. + * when updating also update similar complex floats summation + */ +static NPY_INLINE @type@ +@TYPE@_pairwise_sum(char *a, npy_intp n, npy_intp stride) +{ + if (n < 8) { + npy_intp i; + @type@ res = 0.; + + for (i = 0; i < n; i++) { + res += @trf@(*((@dtype@*)(a + i * stride))); + } + return res; + } + else if (n <= PW_BLOCKSIZE) { + npy_intp i; + @type@ r[8], res; + + /* + * sum a block with 8 accumulators + * 8 times unroll reduces blocksize to 16 and allows vectorization with + * avx without changing summation ordering + */ + r[0] = @trf@(*((@dtype@ *)(a + 0 * stride))); + r[1] = @trf@(*((@dtype@ *)(a + 1 * stride))); + r[2] = @trf@(*((@dtype@ *)(a + 2 * stride))); + r[3] = @trf@(*((@dtype@ *)(a + 3 * stride))); + r[4] = @trf@(*((@dtype@ *)(a + 4 * stride))); + r[5] = @trf@(*((@dtype@ *)(a + 5 * stride))); + r[6] = @trf@(*((@dtype@ *)(a + 6 * stride))); + r[7] = @trf@(*((@dtype@ *)(a + 7 * stride))); + + for (i = 8; i < n - (n % 8); i += 8) { + /* small blocksizes seems to mess with hardware prefetch */ + NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(@dtype@))*stride, 0, 3); + r[0] += @trf@(*((@dtype@ *)(a + (i + 0) * stride))); + r[1] += @trf@(*((@dtype@ *)(a + (i + 1) * stride))); + r[2] += @trf@(*((@dtype@ *)(a + (i + 2) * stride))); + r[3] += @trf@(*((@dtype@ *)(a + (i + 3) * stride))); + r[4] += @trf@(*((@dtype@ *)(a + (i + 4) * stride))); + r[5] += @trf@(*((@dtype@ *)(a + (i + 5) * stride))); + r[6] += @trf@(*((@dtype@ *)(a + (i + 6) * stride))); + r[7] += @trf@(*((@dtype@ *)(a + (i + 7) * stride))); + } + + /* accumulate now to avoid stack spills for single peel loop */ + res = ((r[0] + r[1]) + (r[2] + r[3])) + + ((r[4] + r[5]) + (r[6] + r[7])); + + /* do non multiple of 8 rest */ + for (; i < n; i++) { + res += @trf@(*((@dtype@ *)(a + i * stride))); + } + return res; + } + else { + /* divide by two but avoid non-multiples of unroll factor */ + npy_intp n2 = n / 2; + + n2 -= n2 % 8; + return @TYPE@_pairwise_sum(a, n2, stride) + + @TYPE@_pairwise_sum(a + n2 * stride, n - n2, stride); + } +} + +/**end repeat**/ + 
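The pairwise summation helpers now live in loops_utils.h.src so both the scalar loops and the new dispatched sources can share them; the recursive split keeps the rounding error at O(log n) instead of O(n). A minimal scalar sketch of the same idea for a contiguous float array, without the stride handling, 8-way unrolled block and prefetching of the real helper:

#include <stddef.h>

#define PW_BLOCK 128  /* mirrors PW_BLOCKSIZE above */

/* minimal pairwise summation sketch: recurse until a chunk fits in one
 * block, then sum that block directly */
static float
pairwise_sum_f32(const float *a, size_t n)
{
    if (n <= PW_BLOCK) {
        float res = 0.0f;
        for (size_t i = 0; i < n; i++) {
            res += a[i];
        }
        return res;
    }
    /* split roughly in half, keeping the split a multiple of 8 to mirror
     * the unroll factor of the real implementation */
    size_t n2 = n / 2;
    n2 -= n2 % 8;
    return pairwise_sum_f32(a, n2) + pairwise_sum_f32(a + n2, n - n2);
}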
+/**begin repeat + * complex types + * #TYPE = CFLOAT, CDOUBLE, CLONGDOUBLE# + * #ftype = npy_float, npy_double, npy_longdouble# + * #c = f, , l# + * #C = F, , L# + * #SIMD = 1, 1, 0# + */ +/* similar to pairwise sum of real floats */ +static NPY_INLINE void +@TYPE@_pairwise_sum(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n, + npy_intp stride) +{ + assert(n % 2 == 0); + if (n < 8) { + npy_intp i; + + *rr = 0.; + *ri = 0.; + for (i = 0; i < n; i += 2) { + *rr += *((@ftype@ *)(a + i * stride + 0)); + *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@))); + } + return; + } + else if (n <= PW_BLOCKSIZE) { + npy_intp i; + @ftype@ r[8]; + + /* + * sum a block with 8 accumulators + * 8 times unroll reduces blocksize to 16 and allows vectorization with + * avx without changing summation ordering + */ + r[0] = *((@ftype@ *)(a + 0 * stride)); + r[1] = *((@ftype@ *)(a + 0 * stride + sizeof(@ftype@))); + r[2] = *((@ftype@ *)(a + 2 * stride)); + r[3] = *((@ftype@ *)(a + 2 * stride + sizeof(@ftype@))); + r[4] = *((@ftype@ *)(a + 4 * stride)); + r[5] = *((@ftype@ *)(a + 4 * stride + sizeof(@ftype@))); + r[6] = *((@ftype@ *)(a + 6 * stride)); + r[7] = *((@ftype@ *)(a + 6 * stride + sizeof(@ftype@))); + + for (i = 8; i < n - (n % 8); i += 8) { + /* small blocksizes seems to mess with hardware prefetch */ + NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(@ftype@))*stride, 0, 3); + r[0] += *((@ftype@ *)(a + (i + 0) * stride)); + r[1] += *((@ftype@ *)(a + (i + 0) * stride + sizeof(@ftype@))); + r[2] += *((@ftype@ *)(a + (i + 2) * stride)); + r[3] += *((@ftype@ *)(a + (i + 2) * stride + sizeof(@ftype@))); + r[4] += *((@ftype@ *)(a + (i + 4) * stride)); + r[5] += *((@ftype@ *)(a + (i + 4) * stride + sizeof(@ftype@))); + r[6] += *((@ftype@ *)(a + (i + 6) * stride)); + r[7] += *((@ftype@ *)(a + (i + 6) * stride + sizeof(@ftype@))); + } + + /* accumulate now to avoid stack spills for single peel loop */ + *rr = ((r[0] + r[2]) + (r[4] + r[6])); + *ri = ((r[1] + r[3]) + (r[5] + r[7])); + + /* do non multiple of 8 rest */ + for (; i < n; i+=2) { + *rr += *((@ftype@ *)(a + i * stride + 0)); + *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@))); + } + return; + } + else { + /* divide by two but avoid non-multiples of unroll factor */ + @ftype@ rr1, ri1, rr2, ri2; + npy_intp n2 = n / 2; + + n2 -= n2 % 8; + @TYPE@_pairwise_sum(&rr1, &ri1, a, n2, stride); + @TYPE@_pairwise_sum(&rr2, &ri2, a + n2 * stride, n - n2, stride); + *rr = rr1 + rr2; + *ri = ri1 + ri2; + return; + } +} +/**end repeat**/ + +#endif // _NPY_UMATH_LOOPS_UTILS_H_ diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index c2104810f..3d4e6de87 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -28,7 +28,6 @@ #undef __AVX512F__ #endif #endif -#include "simd/simd.h" #include "loops_utils.h" // nomemoverlap #include <assert.h> #include <stdlib.h> @@ -52,119 +51,6 @@ */ #define MAX_STEP_SIZE 2097152 -#define IS_BINARY_STRIDE_ONE(esize, vsize) \ - ((steps[0] == esize) && \ - (steps[1] == esize) && \ - (steps[2] == esize) && \ - (abs_ptrdiff(args[2], args[0]) >= vsize) && \ - (abs_ptrdiff(args[2], args[1]) >= vsize)) - -/* - * stride is equal to element size and input and destination are equal or - * don't overlap within one register. The check of the steps against - * esize also quarantees that steps are >= 0. 
- */ -#define IS_BLOCKABLE_UNARY(esize, vsize) \ - (steps[0] == (esize) && steps[0] == steps[1] && \ - (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \ - ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \ - ((abs_ptrdiff(args[1], args[0]) == 0)))) - -/* - * Avoid using SIMD for very large step sizes for several reasons: - * 1) Supporting large step sizes requires use of i64gather/scatter_ps instructions, - * in which case we need two i64gather instructions and an additional vinsertf32x8 - * instruction to load a single zmm register (since one i64gather instruction - * loads into a ymm register). This is not ideal for performance. - * 2) Gather and scatter instructions can be slow when the loads/stores - * cross page boundaries. - * - * We instead rely on i32gather/scatter_ps instructions which use a 32-bit index - * element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE - * ensures this. The condition also requires that the input and output arrays - * should have no overlap in memory. - */ -#define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP \ - ((labs(steps[0]) < MAX_STEP_SIZE) && \ - (labs(steps[1]) < MAX_STEP_SIZE) && \ - (labs(steps[2]) < MAX_STEP_SIZE) && \ - (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \ - (nomemoverlap(args[1], steps[1] * dimensions[0], args[2], steps[2] * dimensions[0]))) - -#define IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP \ - ((labs(steps[0]) < MAX_STEP_SIZE) && \ - (labs(steps[1]) < MAX_STEP_SIZE) && \ - (labs(steps[2]) < MAX_STEP_SIZE) && \ - (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \ - (nomemoverlap(args[0], steps[0] * dimensions[0], args[1], steps[1] * dimensions[0]))) - -/* - * 1) Output should be contiguous, can handle strided input data - * 2) Input step should be smaller than MAX_STEP_SIZE for performance - * 3) Input and output arrays should have no overlap in memory - */ -#define IS_OUTPUT_BLOCKABLE_UNARY(esizein, esizeout, vsize) \ - ((steps[0] & (esizein-1)) == 0 && \ - steps[1] == (esizeout) && labs(steps[0]) < MAX_STEP_SIZE && \ - (nomemoverlap(args[1], steps[1] * dimensions[0], args[0], steps[0] * dimensions[0]))) - -#define IS_BLOCKABLE_REDUCE(esize, vsize) \ - (steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \ - npy_is_aligned(args[1], (esize)) && \ - npy_is_aligned(args[0], (esize))) - -#define IS_BLOCKABLE_BINARY(esize, vsize) \ - (steps[0] == steps[1] && steps[1] == steps[2] && steps[2] == (esize) && \ - npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \ - npy_is_aligned(args[0], (esize)) && \ - (abs_ptrdiff(args[2], args[0]) >= (vsize) || \ - abs_ptrdiff(args[2], args[0]) == 0) && \ - (abs_ptrdiff(args[2], args[1]) >= (vsize) || \ - abs_ptrdiff(args[2], args[1]) >= 0)) - -#define IS_BLOCKABLE_BINARY_SCALAR1(esize, vsize) \ - (steps[0] == 0 && steps[1] == steps[2] && steps[2] == (esize) && \ - npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \ - ((abs_ptrdiff(args[2], args[1]) >= (vsize)) || \ - (abs_ptrdiff(args[2], args[1]) == 0)) && \ - abs_ptrdiff(args[2], args[0]) >= (esize)) - -#define IS_BLOCKABLE_BINARY_SCALAR2(esize, vsize) \ - (steps[1] == 0 && steps[0] == steps[2] && steps[2] == (esize) && \ - npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[0], (esize)) && \ - ((abs_ptrdiff(args[2], args[0]) >= (vsize)) || \ - (abs_ptrdiff(args[2], args[0]) == 0)) && \ - abs_ptrdiff(args[2], args[1]) >= (esize)) - -#undef abs_ptrdiff - -#define 
IS_BLOCKABLE_BINARY_BOOL(esize, vsize) \ - (steps[0] == (esize) && steps[0] == steps[1] && steps[2] == (1) && \ - npy_is_aligned(args[1], (esize)) && \ - npy_is_aligned(args[0], (esize))) - -#define IS_BLOCKABLE_BINARY_SCALAR1_BOOL(esize, vsize) \ - (steps[0] == 0 && steps[1] == (esize) && steps[2] == (1) && \ - npy_is_aligned(args[1], (esize))) - -#define IS_BLOCKABLE_BINARY_SCALAR2_BOOL(esize, vsize) \ - (steps[0] == (esize) && steps[1] == 0 && steps[2] == (1) && \ - npy_is_aligned(args[0], (esize))) - -/* align var to alignment */ -#define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\ - npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\ - alignment, n);\ - for(i = 0; i < peel; i++) - -#define LOOP_BLOCKED(type, vsize)\ - for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\ - i += (vsize / sizeof(type))) - -#define LOOP_BLOCKED_END\ - for (; i < n; i++) - - /* * Dispatcher functions * decide whether the operation can be vectorized and run it @@ -184,31 +70,6 @@ */ /**begin repeat1 - * #func = add, subtract, multiply# - */ - -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS -static NPY_INLINE NPY_GCC_TARGET_AVX512F void -AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps); -#endif - -static NPY_INLINE int -run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps) -{ -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS - if (IS_BINARY_STRIDE_ONE(@esize@, 64)) { - AVX512F_@func@_@TYPE@(args, dimensions, steps); - return 1; - } - else - return 0; -#endif - return 0; -} - -/**end repeat1**/ - -/**begin repeat1 * #func = square, absolute, conjugate# * #outsize = 1, 2, 1# * #max_stride = 2, 8, 8# @@ -508,92 +369,6 @@ run_@name@_simd_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp /**end repeat1**/ /**begin repeat1 - * Arithmetic - * # kind = add, subtract, multiply, divide# - */ - -#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS - -/* prototypes */ -static void -sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, - npy_intp n); -static void -sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, - npy_intp n); -static void -sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, - npy_intp n); - -#elif @VECTOR@ - -static void -simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, - npy_intp n); -static void -simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, - npy_intp n); -static void -simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, - npy_intp n); - -#endif - -static NPY_INLINE int -run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS - @type@ * ip1 = (@type@ *)args[0]; - @type@ * ip2 = (@type@ *)args[1]; - @type@ * op = (@type@ *)args[2]; - npy_intp n = dimensions[0]; -#if defined __AVX512F__ - const npy_uintp vector_size_bytes = 64; -#elif defined __AVX2__ - const npy_uintp vector_size_bytes = 32; -#else - const npy_uintp vector_size_bytes = 32; -#endif - /* argument one scalar */ - if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) { - sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - /* argument two scalar */ - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), vector_size_bytes)) { - sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - 
else if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) { - sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } -#elif @VECTOR@ - @type@ * ip1 = (@type@ *)args[0]; - @type@ * ip2 = (@type@ *)args[1]; - @type@ * op = (@type@ *)args[2]; - npy_intp n = dimensions[0]; - /* argument one scalar */ - if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), NPY_SIMD_WIDTH)) { - simd_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - /* argument two scalar */ - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH)) { - simd_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) { - simd_binary_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } -#endif - return 0; -} - -/**end repeat1**/ - -/**begin repeat1 * #kind = equal, not_equal, less, less_equal, greater, greater_equal, * logical_and, logical_or# * #simd = 1, 1, 1, 1, 1, 1, 0, 0# @@ -777,7 +552,6 @@ static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v) _mm_store_sd(&r, _mm_@VOP@_pd(tmp, v)); /* m(ab) m(bb) */ return r; } - /**end repeat**/ /**begin repeat @@ -797,327 +571,6 @@ static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v) * #double = 0, 1# * #cast = _mm_castps_si128, _mm_castpd_si128# */ - - -/**begin repeat1 -* Arithmetic -* # kind = add, subtract, multiply, divide# -* # OP = +, -, *, /# -* # VOP = add, sub, mul, div# -*/ - -static void -sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ -#ifdef __AVX512F__ - const npy_intp vector_size_bytes = 64; - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[i]; - /* lots of specializations, to squeeze out max performance */ - if (npy_is_aligned(&ip1[i], vector_size_bytes) && npy_is_aligned(&ip2[i], vector_size_bytes)) { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); - @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - } - else if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); - @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); - @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); - @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - } -#elif defined __AVX2__ - const npy_intp vector_size_bytes = 32; - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[i]; - /* lots of specializations, to squeeze out max performance */ 
- if (npy_is_aligned(&ip1[i], vector_size_bytes) && - npy_is_aligned(&ip2[i], vector_size_bytes)) { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); - @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - } - else if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); - @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); - @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); - @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - } -#else - LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) - op[i] = ip1[i] @OP@ ip2[i]; - /* lots of specializations, to squeeze out max performance */ - if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES) && - npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); - @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - } - else if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); - @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); - @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); - @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - } -#endif - LOOP_BLOCKED_END { - op[i] = ip1[i] @OP@ ip2[i]; - } -} - - -static void -sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ -#ifdef __AVX512F__ - const npy_intp vector_size_bytes = 64; - const @vtype512@ a = @vpre512@_set1_@vsuf@(ip1[0]); - 
LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[0] @OP@ ip2[i]; - if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - - -#elif defined __AVX2__ - const npy_intp vector_size_bytes = 32; - const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[0] @OP@ ip2[i]; - if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } -#else - const @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) - op[i] = ip1[0] @OP@ ip2[i]; - if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } -#endif - LOOP_BLOCKED_END { - op[i] = ip1[0] @OP@ ip2[i]; - } -} - - -static void -sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ -#ifdef __AVX512F__ - const npy_intp vector_size_bytes = 64; - const @vtype512@ b = @vpre512@_set1_@vsuf@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[0]; - if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - -#elif defined __AVX2__ - const npy_intp vector_size_bytes = 32; - const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[0]; - if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } -#else - const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) - op[i] = ip1[i] @OP@ ip2[0]; - if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - 
@vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } -#endif - LOOP_BLOCKED_END { - op[i] = ip1[i] @OP@ ip2[0]; - } -} - -/**end repeat1**/ - /* * compress 4 vectors to 4/8 bytes in op with filled with 0 or 1 * the last vector is passed as a pointer as MSVC 2010 is unable to ignore the @@ -3356,45 +2809,6 @@ AVX512F_log_DOUBLE(npy_double * op, */ /**begin repeat1 - * #func = add, subtract, multiply# - * #vectorf = _mm512_add, _mm512_sub, avx512_cmul# - */ - -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS -static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void -AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps) -{ - const npy_intp array_size = dimensions[0]; - npy_intp num_remaining_elements = 2*array_size; - @type@* ip1 = (@type@*) args[0]; - @type@* ip2 = (@type@*) args[1]; - @type@* op = (@type@*) args[2]; - - @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@(); - - while (num_remaining_elements > 0) { - if (num_remaining_elements < @num_lanes@) { - load_mask = avx512_get_partial_load_mask_@vsuffix@( - num_remaining_elements, @num_lanes@); - } - @vtype@ x1, x2; - x1 = avx512_masked_load_@vsuffix@(load_mask, ip1); - x2 = avx512_masked_load_@vsuffix@(load_mask, ip2); - - @vtype@ out = @vectorf@_@vsuffix@(x1, x2); - - _mm512_mask_storeu_@vsuffix@(op, load_mask, out); - - ip1 += @num_lanes@; - ip2 += @num_lanes@; - op += @num_lanes@; - num_remaining_elements -= @num_lanes@; - } -} -#endif -/**end repeat1**/ - -/**begin repeat1 * #func = square, conjugate# * #vectorf = avx512_csquare, avx512_conjugate# */ @@ -3656,86 +3070,5 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n) /**end repeat**/ #undef VECTOR_SIZE_BYTES -#else /* NPY_HAVE_SSE2_INTRINSICS */ - -/**begin repeat - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #sfx = f32, f64# - * #CHK = , _F64# - */ - -#if NPY_SIMD@CHK@ - -/**begin repeat1 -* Arithmetic -* # kind = add, subtract, multiply, divide# -* # OP = +, -, *, /# -* # VOP = add, sub, mul, div# -*/ - -static void -simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ - LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { - op[i] = ip1[i] @OP@ ip2[i]; - } - /* lots of specializations, to squeeze out max performance */ - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { - npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]); - npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, a); - npyv_store_@sfx@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { - npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]); - npyv_@sfx@ b = npyv_load_@sfx@(&ip2[i]); - npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, b); - npyv_store_@sfx@(&op[i], c); - } - } - LOOP_BLOCKED_END { - op[i] = ip1[i] @OP@ ip2[i]; - } -} - -static void -simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ - const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { - op[i] = ip1[0] @OP@ ip2[i]; - } - LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { - npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]); - npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2); - npyv_store_@sfx@(&op[i], v3); - } - LOOP_BLOCKED_END { - op[i] = ip1[0] @OP@ ip2[i]; - } -} - -static void -simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ - const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { - op[i] = ip1[i] @OP@ ip2[0]; - 
} - LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { - npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]); - npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2); - npyv_store_@sfx@(&op[i], v3); - } - LOOP_BLOCKED_END { - op[i] = ip1[i] @OP@ ip2[0]; - } -} -/**end repeat1**/ -#endif /* NPY_SIMD@CHK@ */ -/**end repeat**/ -#endif +#endif /* NPY_HAVE_SSE2_INTRINSICS */ #endif
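As a side note on the overlap helpers that loops_utils.h now provides (and that the deleted IS_*_NOMEMOVERLAP macros used to open-code), the sketch below restates the interval test in plain C and shows how a dispatcher might use it before picking a vector kernel; no_overlap_demo, binary_loop_is_vector_safe and the use of intptr_t in place of npy_intp are assumptions made for this illustration only, not names from the patch:

#include <stdbool.h>
#include <stdint.h>

/* Illustrative only: two strided byte ranges are safe for a vector loop
 * when they are exactly identical (a pure in-place operation) or fully
 * disjoint; any partial overlap would let stores clobber pending loads. */
static bool
no_overlap_demo(const char *p, intptr_t p_bytes, const char *q, intptr_t q_bytes)
{
    const char *p_lo = p_bytes < 0 ? p + p_bytes : p;
    const char *p_hi = p_bytes < 0 ? p : p + p_bytes;
    const char *q_lo = q_bytes < 0 ? q + q_bytes : q;
    const char *q_hi = q_bytes < 0 ? q : q + q_bytes;
    return (p_lo == q_lo && p_hi == q_hi) || p_lo > q_hi || q_lo > p_hi;
}

/* Hypothetical dispatcher guard for a binary loop over `len` elements
 * with the usual (src1, src2, dst) argument layout: a vector kernel is
 * taken only when neither input partially overlaps the output. */
static bool
binary_loop_is_vector_safe(char **args, const intptr_t *steps, intptr_t len)
{
    return no_overlap_demo(args[0], steps[0] * len, args[2], steps[2] * len)
        && no_overlap_demo(args[1], steps[1] * len, args[2], steps[2] * len);
}

Treating the exactly-equal case as safe keeps genuinely in-place calls such as np.add(a, b, out=a) on the fast path while rejecting shifted aliases of the same buffer.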