author     Charles Harris <charlesr.harris@gmail.com>   2020-12-19 13:58:37 -0700
committer  GitHub <noreply@github.com>                  2020-12-19 13:58:37 -0700
commit     f7bc512184a1de3992b0d154e6244a7a0d9a574f (patch)
tree       33bafb4dec61bd525493be4b9c908c75638275db
parent     811f9eea3b105919846760a6eca4edd22cc11c1d (diff)
parent     81bb563172c84eb1eba92b05fa99d902d28af13b (diff)
download   numpy-f7bc512184a1de3992b0d154e6244a7a0d9a574f.tar.gz
Merge pull request #17985 from seiko2plus/ditch_simd_arithmetic
ENH, SIMD: Ditching the old CPU dispatcher(Arithmetic)
-rw-r--r--   .gitignore                                              2
-rw-r--r--   numpy/core/code_generators/generate_umath.py           84
-rw-r--r--   numpy/core/setup.py                                      2
-rw-r--r--   numpy/core/src/umath/fast_loop_macros.h                112
-rw-r--r--   numpy/core/src/umath/loops.c.src                       258
-rw-r--r--   numpy/core/src/umath/loops.h.src                        53
-rw-r--r--   numpy/core/src/umath/loops_arithm_fp.dispatch.c.src    777
-rw-r--r--   numpy/core/src/umath/loops_utils.h                      42
-rw-r--r--   numpy/core/src/umath/loops_utils.h.src                 213
-rw-r--r--   numpy/core/src/umath/simd.inc.src                      669
10 files changed, 1226 insertions, 986 deletions
diff --git a/.gitignore b/.gitignore
index b28f0d3c8..b0fa037a6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -170,6 +170,7 @@ numpy/core/src/umath/simd.inc
numpy/core/src/umath/struct_ufunc_test.c
numpy/core/src/umath/test_rational.c
numpy/core/src/umath/umath_tests.c
+numpy/core/src/umath/loops_utils.h
numpy/distutils/__config__.py
numpy/linalg/umath_linalg.c
doc/source/**/generated/
@@ -216,3 +217,4 @@ numpy/core/src/_simd/_simd_data.inc
numpy/core/src/_simd/_simd_inc.h
# umath module
numpy/core/src/umath/loops_unary_fp.dispatch.c
+numpy/core/src/umath/loops_arithm_fp.dispatch.c
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index cb1147b93..4e9a2cfec 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -45,14 +45,20 @@ class TypeDescription:
astype : dict or None, optional
If astype['x'] is 'y', uses PyUFunc_x_x_As_y_y/PyUFunc_xx_x_As_yy_y
instead of PyUFunc_x_x/PyUFunc_xx_x.
+ cfunc_alias : str or None, optional
+ Appended to inner loop C function name, e.g., FLOAT_{cfunc_alias}. See make_arrays.
+ NOTE: it doesn't support 'astype'
simd: list
Available SIMD ufunc loops, dispatched at runtime in specified order
Currently only supported for simples types (see make_arrays)
- dispatch: list
- Available SIMD ufunc loops, dispatched at runtime in specified order
- Currently only supported for simples types (see make_arrays)
+ dispatch: str or None, optional
+ Dispatch-able source name without its extension '.dispatch.c' that
+ contains the definition of ufunc, dispatched at runtime depending on the
+ specified targets of the dispatch-able source.
+ NOTE: it doesn't support 'astype'
"""
- def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None, dispatch=None):
+ def __init__(self, type, f=None, in_=None, out=None, astype=None, cfunc_alias=None,
+ simd=None, dispatch=None):
self.type = type
self.func_data = f
if astype is None:
@@ -64,6 +70,7 @@ class TypeDescription:
if out is not None:
out = out.replace('P', type)
self.out = out
+ self.cfunc_alias = cfunc_alias
self.simd = simd
self.dispatch = dispatch
@@ -90,7 +97,8 @@ def build_func_data(types, f):
func_data = [_fdata_map.get(t, '%s') % (f,) for t in types]
return func_data
-def TD(types, f=None, astype=None, in_=None, out=None, simd=None, dispatch=None):
+def TD(types, f=None, astype=None, in_=None, out=None, cfunc_alias=None,
+ simd=None, dispatch=None):
if f is not None:
if isinstance(f, str):
func_data = build_func_data(types, f)
@@ -119,13 +127,15 @@ def TD(types, f=None, astype=None, in_=None, out=None, simd=None, dispatch=None)
simdt = [k for k, v in simd if t in v]
else:
simdt = []
+
# [(dispatch file name without extension '.dispatch.c*', list of types)]
if dispatch:
- dispt = [k for k, v in dispatch if t in v]
+ dispt = ([k for k, v in dispatch if t in v]+[None])[0]
else:
- dispt = []
+ dispt = None
tds.append(TypeDescription(
- t, f=fd, in_=i, out=o, astype=astype, simd=simdt, dispatch=dispt
+ t, f=fd, in_=i, out=o, astype=astype, cfunc_alias=cfunc_alias,
+ simd=simdt, dispatch=dispt
))
return tds
@@ -280,7 +290,7 @@ defdict = {
Ufunc(2, 1, Zero,
docstrings.get('numpy.core.umath.add'),
'PyUFunc_AdditionTypeResolver',
- TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]),
+ TD(notimes_or_obj, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
[TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
TypeDescription('m', FullTypeDescr, 'mm', 'm'),
TypeDescription('M', FullTypeDescr, 'mM', 'M'),
@@ -291,7 +301,7 @@ defdict = {
Ufunc(2, 1, None, # Zero is only a unit to the right, not the left
docstrings.get('numpy.core.umath.subtract'),
'PyUFunc_SubtractionTypeResolver',
- TD(ints + inexact, simd=[('avx512f', cmplxvec),('avx2', ints)]),
+ TD(ints + inexact, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
[TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
TypeDescription('m', FullTypeDescr, 'mm', 'm'),
TypeDescription('M', FullTypeDescr, 'MM', 'm'),
@@ -302,7 +312,7 @@ defdict = {
Ufunc(2, 1, One,
docstrings.get('numpy.core.umath.multiply'),
'PyUFunc_MultiplicationTypeResolver',
- TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]),
+ TD(notimes_or_obj, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
[TypeDescription('m', FullTypeDescr, 'mq', 'm'),
TypeDescription('m', FullTypeDescr, 'qm', 'm'),
TypeDescription('m', FullTypeDescr, 'md', 'm'),
@@ -326,10 +336,10 @@ defdict = {
Ufunc(2, 1, None, # One is only a unit to the right, not the left
docstrings.get('numpy.core.umath.true_divide'),
'PyUFunc_TrueDivisionTypeResolver',
- TD(flts+cmplx),
- [TypeDescription('m', FullTypeDescr, 'mq', 'm'),
- TypeDescription('m', FullTypeDescr, 'md', 'm'),
- TypeDescription('m', FullTypeDescr, 'mm', 'd'),
+ TD(flts+cmplx, cfunc_alias='divide', dispatch=[('loops_arithm_fp', 'fd')]),
+ [TypeDescription('m', FullTypeDescr, 'mq', 'm', cfunc_alias='divide'),
+ TypeDescription('m', FullTypeDescr, 'md', 'm', cfunc_alias='divide'),
+ TypeDescription('m', FullTypeDescr, 'mm', 'd', cfunc_alias='divide'),
],
TD(O, f='PyNumber_TrueDivide'),
),
@@ -1000,6 +1010,7 @@ def make_arrays(funcdict):
# later
code1list = []
code2list = []
+ dispdict = {}
names = sorted(funcdict.keys())
for name in names:
uf = funcdict[name]
@@ -1010,44 +1021,33 @@ def make_arrays(funcdict):
sub = 0
for t in uf.type_descriptions:
+ cfunc_alias = t.cfunc_alias if t.cfunc_alias else name
+ cfunc_fname = None
if t.func_data is FullTypeDescr:
tname = english_upper(chartoname[t.type])
datalist.append('(void *)NULL')
- funclist.append(
- '%s_%s_%s_%s' % (tname, t.in_, t.out, name))
+ cfunc_fname = f"{tname}_{t.in_}_{t.out}_{cfunc_alias}"
elif isinstance(t.func_data, FuncNameSuffix):
datalist.append('(void *)NULL')
tname = english_upper(chartoname[t.type])
- funclist.append(
- '%s_%s_%s' % (tname, name, t.func_data.suffix))
+ cfunc_fname = f"{tname}_{cfunc_alias}_{t.func_data.suffix}"
elif t.func_data is None:
datalist.append('(void *)NULL')
tname = english_upper(chartoname[t.type])
- funclist.append('%s_%s' % (tname, name))
+ cfunc_fname = f"{tname}_{cfunc_alias}"
if t.simd is not None:
for vt in t.simd:
code2list.append(textwrap.dedent("""\
#ifdef HAVE_ATTRIBUTE_TARGET_{ISA}
if (NPY_CPU_HAVE({ISA})) {{
- {fname}_functions[{idx}] = {type}_{fname}_{isa};
+ {fname}_functions[{idx}] = {cname}_{isa};
}}
#endif
""").format(
ISA=vt.upper(), isa=vt,
- fname=name, type=tname, idx=k
- ))
- if t.dispatch is not None:
- for dname in t.dispatch:
- code2list.append(textwrap.dedent("""\
- #ifndef NPY_DISABLE_OPTIMIZATION
- #include "{dname}.dispatch.h"
- #endif
- NPY_CPU_DISPATCH_CALL_XB({name}_functions[{k}] = {tname}_{name});
- """).format(
- dname=dname, name=name, tname=tname, k=k
+ fname=name, cname=cfunc_fname, idx=k
))
else:
- funclist.append('NULL')
try:
thedict = arity_lookup[uf.nin, uf.nout]
except KeyError as e:
@@ -1077,6 +1077,13 @@ def make_arrays(funcdict):
#datalist.append('(void *)%s' % t.func_data)
sub += 1
+ if cfunc_fname:
+ funclist.append(cfunc_fname)
+ if t.dispatch:
+ dispdict.setdefault(t.dispatch, []).append((name, k, cfunc_fname))
+ else:
+ funclist.append('NULL')
+
for x in t.in_ + t.out:
siglist.append('NPY_%s' % (english_upper(chartoname[x]),))
@@ -1091,6 +1098,17 @@ def make_arrays(funcdict):
% (name, datanames))
code1list.append("static char %s_signatures[] = {%s};"
% (name, signames))
+
+ for dname, funcs in dispdict.items():
+ code2list.append(textwrap.dedent(f"""
+ #ifndef NPY_DISABLE_OPTIMIZATION
+ #include "{dname}.dispatch.h"
+ #endif
+ """))
+ for (ufunc_name, func_idx, cfunc_name) in funcs:
+ code2list.append(textwrap.dedent(f"""\
+ NPY_CPU_DISPATCH_CALL_XB({ufunc_name}_functions[{func_idx}] = {cfunc_name});
+ """))
return "\n".join(code1list), "\n".join(code2list)
def make_ufuncs(funcdict):
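The generate_umath.py hunks above turn `dispatch` into a single dispatch-able source name per type and collect every dispatched inner loop in `dispdict`, keyed by that source, so each generated `.dispatch.h` include is emitted once and is followed by all of its `NPY_CPU_DISPATCH_CALL_XB` assignments (the old code re-emitted the include for every type). As a rough illustration, the glue emitted for the `add` entries tagged with `dispatch=[('loops_arithm_fp', 'fdFD')]` takes roughly the following shape; the function-table indices are hypothetical and the fragment only visualizes the generated code, it is not a standalone compilable unit:

/* illustrative output of make_arrays() for loops dispatched via
 * 'loops_arithm_fp'; indices into add_functions are hypothetical */
#ifndef NPY_DISABLE_OPTIMIZATION
    #include "loops_arithm_fp.dispatch.h"
#endif
NPY_CPU_DISPATCH_CALL_XB(add_functions[2] = FLOAT_add);
NPY_CPU_DISPATCH_CALL_XB(add_functions[3] = DOUBLE_add);
NPY_CPU_DISPATCH_CALL_XB(add_functions[4] = CFLOAT_add);
NPY_CPU_DISPATCH_CALL_XB(add_functions[5] = CDOUBLE_add);

Hoisting the include out of the per-type loop is exactly what the `dispdict.setdefault(...)` grouping at the end of make_arrays buys.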
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 2ec5e1a64..2e020a595 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -916,8 +916,10 @@ def configuration(parent_package='',top_path=None):
join('src', 'umath', 'funcs.inc.src'),
join('src', 'umath', 'simd.inc.src'),
join('src', 'umath', 'loops.h.src'),
+ join('src', 'umath', 'loops_utils.h.src'),
join('src', 'umath', 'loops.c.src'),
join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
+ join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
join('src', 'umath', 'matmul.h.src'),
join('src', 'umath', 'matmul.c.src'),
join('src', 'umath', 'clip.h.src'),
diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h
index 5c22c6f1c..dbcff8793 100644
--- a/numpy/core/src/umath/fast_loop_macros.h
+++ b/numpy/core/src/umath/fast_loop_macros.h
@@ -237,5 +237,117 @@ abs_ptrdiff(char *a, char *b)
TYPE io1 = *(TYPE *)iop1; \
BINARY_REDUCE_LOOP_INNER
+#define IS_BINARY_STRIDE_ONE(esize, vsize) \
+ ((steps[0] == esize) && \
+ (steps[1] == esize) && \
+ (steps[2] == esize) && \
+ (abs_ptrdiff(args[2], args[0]) >= vsize) && \
+ (abs_ptrdiff(args[2], args[1]) >= vsize))
+
+/*
+ * stride is equal to element size and input and destination are equal or
+ * don't overlap within one register. The check of the steps against
+ * esize also guarantees that steps are >= 0.
+ */
+#define IS_BLOCKABLE_UNARY(esize, vsize) \
+ (steps[0] == (esize) && steps[0] == steps[1] && \
+ (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
+ ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
+ ((abs_ptrdiff(args[1], args[0]) == 0))))
+
+/*
+ * Avoid using SIMD for very large step sizes for several reasons:
+ * 1) Supporting large step sizes requires use of i64gather/scatter_ps instructions,
+ * in which case we need two i64gather instructions and an additional vinsertf32x8
+ * instruction to load a single zmm register (since one i64gather instruction
+ * loads into a ymm register). This is not ideal for performance.
+ * 2) Gather and scatter instructions can be slow when the loads/stores
+ * cross page boundaries.
+ *
+ * We instead rely on i32gather/scatter_ps instructions which use a 32-bit index
+ * element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE
+ * ensures this. The condition also requires that the input and output arrays
+ * should have no overlap in memory.
+ */
+#define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP \
+ ((labs(steps[0]) < MAX_STEP_SIZE) && \
+ (labs(steps[1]) < MAX_STEP_SIZE) && \
+ (labs(steps[2]) < MAX_STEP_SIZE) && \
+ (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
+ (nomemoverlap(args[1], steps[1] * dimensions[0], args[2], steps[2] * dimensions[0])))
+
+#define IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP \
+ ((labs(steps[0]) < MAX_STEP_SIZE) && \
+ (labs(steps[1]) < MAX_STEP_SIZE) && \
+ (labs(steps[2]) < MAX_STEP_SIZE) && \
+ (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
+ (nomemoverlap(args[0], steps[0] * dimensions[0], args[1], steps[1] * dimensions[0])))
+
+/*
+ * 1) Output should be contiguous, can handle strided input data
+ * 2) Input step should be smaller than MAX_STEP_SIZE for performance
+ * 3) Input and output arrays should have no overlap in memory
+ */
+#define IS_OUTPUT_BLOCKABLE_UNARY(esizein, esizeout, vsize) \
+ ((steps[0] & (esizein-1)) == 0 && \
+ steps[1] == (esizeout) && labs(steps[0]) < MAX_STEP_SIZE && \
+ (nomemoverlap(args[1], steps[1] * dimensions[0], args[0], steps[0] * dimensions[0])))
+
+#define IS_BLOCKABLE_REDUCE(esize, vsize) \
+ (steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \
+ npy_is_aligned(args[1], (esize)) && \
+ npy_is_aligned(args[0], (esize)))
+
+#define IS_BLOCKABLE_BINARY(esize, vsize) \
+ (steps[0] == steps[1] && steps[1] == steps[2] && steps[2] == (esize) && \
+ npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \
+ npy_is_aligned(args[0], (esize)) && \
+ (abs_ptrdiff(args[2], args[0]) >= (vsize) || \
+ abs_ptrdiff(args[2], args[0]) == 0) && \
+ (abs_ptrdiff(args[2], args[1]) >= (vsize) || \
+ abs_ptrdiff(args[2], args[1]) >= 0))
+
+#define IS_BLOCKABLE_BINARY_SCALAR1(esize, vsize) \
+ (steps[0] == 0 && steps[1] == steps[2] && steps[2] == (esize) && \
+ npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \
+ ((abs_ptrdiff(args[2], args[1]) >= (vsize)) || \
+ (abs_ptrdiff(args[2], args[1]) == 0)) && \
+ abs_ptrdiff(args[2], args[0]) >= (esize))
+
+#define IS_BLOCKABLE_BINARY_SCALAR2(esize, vsize) \
+ (steps[1] == 0 && steps[0] == steps[2] && steps[2] == (esize) && \
+ npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[0], (esize)) && \
+ ((abs_ptrdiff(args[2], args[0]) >= (vsize)) || \
+ (abs_ptrdiff(args[2], args[0]) == 0)) && \
+ abs_ptrdiff(args[2], args[1]) >= (esize))
+
+#undef abs_ptrdiff
+
+#define IS_BLOCKABLE_BINARY_BOOL(esize, vsize) \
+ (steps[0] == (esize) && steps[0] == steps[1] && steps[2] == (1) && \
+ npy_is_aligned(args[1], (esize)) && \
+ npy_is_aligned(args[0], (esize)))
+
+#define IS_BLOCKABLE_BINARY_SCALAR1_BOOL(esize, vsize) \
+ (steps[0] == 0 && steps[1] == (esize) && steps[2] == (1) && \
+ npy_is_aligned(args[1], (esize)))
+
+#define IS_BLOCKABLE_BINARY_SCALAR2_BOOL(esize, vsize) \
+ (steps[0] == (esize) && steps[1] == 0 && steps[2] == (1) && \
+ npy_is_aligned(args[0], (esize)))
+
+/* align var to alignment */
+#define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
+ npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
+ alignment, n);\
+ for(i = 0; i < peel; i++)
+
+#define LOOP_BLOCKED(type, vsize)\
+ for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\
+ i += (vsize / sizeof(type)))
+
+#define LOOP_BLOCKED_END\
+ for (; i < n; i++)
+
#endif /* _NPY_UMATH_FAST_LOOP_MACROS_H_ */
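The LOOP_BLOCK_ALIGN_VAR / LOOP_BLOCKED / LOOP_BLOCKED_END trio moved into this header encodes the usual peel/main/tail blocking pattern: a few scalar iterations until the output pointer reaches vector alignment, whole vector-width blocks, then a scalar remainder. Below is a minimal, self-contained C sketch of the same structure, written without the NumPy helpers (npy_aligned_block_offset / npy_blocked_end are replaced by inline arithmetic, and the block body is plain scalar code where real kernels would issue aligned SIMD loads and stores):

#include <stdint.h>
#include <stdio.h>

#define VSIZE 32  /* stand-in vector width in bytes (e.g. AVX2) */

/* peel/main/tail addition of two double arrays into op */
static void
blocked_add(double *op, const double *ip1, const double *ip2, intptr_t n)
{
    const intptr_t vstep = VSIZE / (intptr_t)sizeof(double);
    /* peel: scalar iterations until op is VSIZE-aligned, clamped to n */
    intptr_t misalign = (intptr_t)((uintptr_t)op % VSIZE);
    intptr_t peel = misalign ? (VSIZE - misalign) / (intptr_t)sizeof(double) : 0;
    if (peel > n) {
        peel = n;
    }
    intptr_t i = 0;
    for (; i < peel; i++) {                 /* LOOP_BLOCK_ALIGN_VAR */
        op[i] = ip1[i] + ip2[i];
    }
    for (; i + vstep <= n; i += vstep) {    /* LOOP_BLOCKED */
        /* real kernels use aligned vector loads/stores here */
        for (intptr_t k = 0; k < vstep; k++) {
            op[i + k] = ip1[i + k] + ip2[i + k];
        }
    }
    for (; i < n; i++) {                    /* LOOP_BLOCKED_END */
        op[i] = ip1[i] + ip2[i];
    }
}

int main(void)
{
    double a[13], b[13], c[13];
    for (int k = 0; k < 13; k++) {
        a[k] = k;
        b[k] = 2.0 * k;
    }
    blocked_add(c, a, b, 13);
    printf("%g %g\n", c[0], c[12]);  /* expected: 0 36 */
    return 0;
}

The macro versions differ mainly in that they declare `i` and `peel` for the caller and leave the alignment corner cases to npy_aligned_block_offset and npy_blocked_end; the control flow is the same.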
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 6403efaee..839d2b3ae 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1797,124 +1797,11 @@ DOUBLE_log_avx512f(char **args, npy_intp const *dimensions, npy_intp const *step
/**begin repeat
* Float types
- * #type = npy_float, npy_double, npy_longdouble, npy_float#
- * #dtype = npy_float, npy_double, npy_longdouble, npy_half#
- * #TYPE = FLOAT, DOUBLE, LONGDOUBLE, HALF#
- * #c = f, , l, #
- * #C = F, , L, #
- * #trf = , , , npy_half_to_float#
- */
-
-/*
- * Pairwise summation, rounding error O(lg n) instead of O(n).
- * The recursion depth is O(lg n) as well.
- * when updating also update similar complex floats summation
- */
-static @type@
-pairwise_sum_@TYPE@(char *a, npy_intp n, npy_intp stride)
-{
- if (n < 8) {
- npy_intp i;
- @type@ res = 0.;
-
- for (i = 0; i < n; i++) {
- res += @trf@(*((@dtype@*)(a + i * stride)));
- }
- return res;
- }
- else if (n <= PW_BLOCKSIZE) {
- npy_intp i;
- @type@ r[8], res;
-
- /*
- * sum a block with 8 accumulators
- * 8 times unroll reduces blocksize to 16 and allows vectorization with
- * avx without changing summation ordering
- */
- r[0] = @trf@(*((@dtype@ *)(a + 0 * stride)));
- r[1] = @trf@(*((@dtype@ *)(a + 1 * stride)));
- r[2] = @trf@(*((@dtype@ *)(a + 2 * stride)));
- r[3] = @trf@(*((@dtype@ *)(a + 3 * stride)));
- r[4] = @trf@(*((@dtype@ *)(a + 4 * stride)));
- r[5] = @trf@(*((@dtype@ *)(a + 5 * stride)));
- r[6] = @trf@(*((@dtype@ *)(a + 6 * stride)));
- r[7] = @trf@(*((@dtype@ *)(a + 7 * stride)));
-
- for (i = 8; i < n - (n % 8); i += 8) {
- /* small blocksizes seems to mess with hardware prefetch */
- NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(@dtype@))*stride, 0, 3);
- r[0] += @trf@(*((@dtype@ *)(a + (i + 0) * stride)));
- r[1] += @trf@(*((@dtype@ *)(a + (i + 1) * stride)));
- r[2] += @trf@(*((@dtype@ *)(a + (i + 2) * stride)));
- r[3] += @trf@(*((@dtype@ *)(a + (i + 3) * stride)));
- r[4] += @trf@(*((@dtype@ *)(a + (i + 4) * stride)));
- r[5] += @trf@(*((@dtype@ *)(a + (i + 5) * stride)));
- r[6] += @trf@(*((@dtype@ *)(a + (i + 6) * stride)));
- r[7] += @trf@(*((@dtype@ *)(a + (i + 7) * stride)));
- }
-
- /* accumulate now to avoid stack spills for single peel loop */
- res = ((r[0] + r[1]) + (r[2] + r[3])) +
- ((r[4] + r[5]) + (r[6] + r[7]));
-
- /* do non multiple of 8 rest */
- for (; i < n; i++) {
- res += @trf@(*((@dtype@ *)(a + i * stride)));
- }
- return res;
- }
- else {
- /* divide by two but avoid non-multiples of unroll factor */
- npy_intp n2 = n / 2;
-
- n2 -= n2 % 8;
- return pairwise_sum_@TYPE@(a, n2, stride) +
- pairwise_sum_@TYPE@(a + n2 * stride, n - n2, stride);
- }
-}
-
-/**end repeat**/
-
-/**begin repeat
- * Float types
* #type = npy_float, npy_double, npy_longdouble#
* #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
* #c = f, , l#
* #C = F, , L#
*/
-
-/**begin repeat1
- * Arithmetic
- * # kind = add, subtract, multiply, divide#
- * # OP = +, -, *, /#
- * # PW = 1, 0, 0, 0#
- */
-NPY_NO_EXPORT void
-@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- if (IS_BINARY_REDUCE) {
-#if @PW@
- @type@ * iop1 = (@type@ *)args[0];
- npy_intp n = dimensions[0];
-
- *iop1 @OP@= pairwise_sum_@TYPE@(args[1], n, steps[1]);
-#else
- BINARY_REDUCE_LOOP(@type@) {
- io1 @OP@= *(@type@ *)ip2;
- }
- *((@type@ *)iop1) = io1;
-#endif
- }
- else if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) {
- BINARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- const @type@ in2 = *(@type@ *)ip2;
- *((@type@ *)op1) = in1 @OP@ in2;
- }
- }
-}
-/**end repeat1**/
-
/**begin repeat1
* #kind = equal, not_equal, less, less_equal, greater, greater_equal,
* logical_and, logical_or#
@@ -2244,8 +2131,6 @@ NPY_NO_EXPORT void
}
}
-#define @TYPE@_true_divide @TYPE@_divide
-
/**end repeat**/
/*
@@ -2254,6 +2139,38 @@ NPY_NO_EXPORT void
*****************************************************************************
*/
+/**begin repeat
+ * Arithmetic
+ * # kind = add, subtract, multiply, divide#
+ * # OP = +, -, *, /#
+ * # PW = 1, 0, 0, 0#
+ */
+NPY_NO_EXPORT void
+LONGDOUBLE_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ if (IS_BINARY_REDUCE) {
+#if @PW@
+ npy_longdouble * iop1 = (npy_longdouble *)args[0];
+ npy_intp n = dimensions[0];
+
+ *iop1 @OP@= LONGDOUBLE_pairwise_sum(args[1], n, steps[1]);
+#else
+ BINARY_REDUCE_LOOP(npy_longdouble) {
+ io1 @OP@= *(npy_longdouble *)ip2;
+ }
+ *((npy_longdouble *)iop1) = io1;
+#endif
+ }
+ else {
+ BINARY_LOOP {
+ const npy_longdouble in1 = *(npy_longdouble *)ip1;
+ const npy_longdouble in2 = *(npy_longdouble *)ip2;
+ *((npy_longdouble *)op1) = in1 @OP@ in2;
+ }
+ }
+}
+/**end repeat**/
+
NPY_NO_EXPORT void
LONGDOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
@@ -2306,7 +2223,7 @@ HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
#if @PW@
npy_intp n = dimensions[0];
- io1 @OP@= pairwise_sum_HALF(args[1], n, steps[1]);
+ io1 @OP@= HALF_pairwise_sum(args[1], n, steps[1]);
#else
BINARY_REDUCE_LOOP_INNER {
io1 @OP@= npy_half_to_float(*(npy_half *)ip2);
@@ -2614,9 +2531,6 @@ HALF_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps,
}
}
-#define HALF_true_divide HALF_divide
-
-
/*
*****************************************************************************
** COMPLEX LOOPS **
@@ -2643,80 +2557,8 @@ HALF_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps,
* #SIMD = 1, 1, 0#
*/
-/* similar to pairwise sum of real floats */
-static void
-pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n,
- npy_intp stride)
-{
- assert(n % 2 == 0);
- if (n < 8) {
- npy_intp i;
-
- *rr = 0.;
- *ri = 0.;
- for (i = 0; i < n; i += 2) {
- *rr += *((@ftype@ *)(a + i * stride + 0));
- *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
- }
- return;
- }
- else if (n <= PW_BLOCKSIZE) {
- npy_intp i;
- @ftype@ r[8];
-
- /*
- * sum a block with 8 accumulators
- * 8 times unroll reduces blocksize to 16 and allows vectorization with
- * avx without changing summation ordering
- */
- r[0] = *((@ftype@ *)(a + 0 * stride));
- r[1] = *((@ftype@ *)(a + 0 * stride + sizeof(@ftype@)));
- r[2] = *((@ftype@ *)(a + 2 * stride));
- r[3] = *((@ftype@ *)(a + 2 * stride + sizeof(@ftype@)));
- r[4] = *((@ftype@ *)(a + 4 * stride));
- r[5] = *((@ftype@ *)(a + 4 * stride + sizeof(@ftype@)));
- r[6] = *((@ftype@ *)(a + 6 * stride));
- r[7] = *((@ftype@ *)(a + 6 * stride + sizeof(@ftype@)));
-
- for (i = 8; i < n - (n % 8); i += 8) {
- /* small blocksizes seems to mess with hardware prefetch */
- NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(@ftype@))*stride, 0, 3);
- r[0] += *((@ftype@ *)(a + (i + 0) * stride));
- r[1] += *((@ftype@ *)(a + (i + 0) * stride + sizeof(@ftype@)));
- r[2] += *((@ftype@ *)(a + (i + 2) * stride));
- r[3] += *((@ftype@ *)(a + (i + 2) * stride + sizeof(@ftype@)));
- r[4] += *((@ftype@ *)(a + (i + 4) * stride));
- r[5] += *((@ftype@ *)(a + (i + 4) * stride + sizeof(@ftype@)));
- r[6] += *((@ftype@ *)(a + (i + 6) * stride));
- r[7] += *((@ftype@ *)(a + (i + 6) * stride + sizeof(@ftype@)));
- }
-
- /* accumulate now to avoid stack spills for single peel loop */
- *rr = ((r[0] + r[2]) + (r[4] + r[6]));
- *ri = ((r[1] + r[3]) + (r[5] + r[7]));
-
- /* do non multiple of 8 rest */
- for (; i < n; i+=2) {
- *rr += *((@ftype@ *)(a + i * stride + 0));
- *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
- }
- return;
- }
- else {
- /* divide by two but avoid non-multiples of unroll factor */
- @ftype@ rr1, ri1, rr2, ri2;
- npy_intp n2 = n / 2;
-
- n2 -= n2 % 8;
- pairwise_sum_@TYPE@(&rr1, &ri1, a, n2, stride);
- pairwise_sum_@TYPE@(&rr2, &ri2, a + n2 * stride, n - n2, stride);
- *rr = rr1 + rr2;
- *ri = ri1 + ri2;
- return;
- }
-}
-
-
+#if !@SIMD@
+// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src'
/**begin repeat1
* arithmetic
* #kind = add, subtract#
@@ -2733,7 +2575,7 @@ NPY_NO_EXPORT void
@ftype@ * oi = ((@ftype@ *)args[0]) + 1;
@ftype@ rr, ri;
- pairwise_sum_@TYPE@(&rr, &ri, args[1], n * 2, steps[1] / 2);
+ @TYPE@_pairwise_sum(&rr, &ri, args[1], n * 2, steps[1] / 2);
*or @OP@= rr;
*oi @OP@= ri;
return;
@@ -2763,6 +2605,7 @@ NPY_NO_EXPORT void
((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r;
}
}
+#endif // !SIMD
NPY_NO_EXPORT void
@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
@@ -2796,31 +2639,6 @@ NPY_NO_EXPORT void
}
}
-#if @SIMD@
-NPY_NO_EXPORT void
-@TYPE@_add_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func)
-{
- if (IS_BINARY_REDUCE) {
- @TYPE@_add(args, dimensions, steps, func);
- }
- else if (!run_binary_avx512f_add_@TYPE@(args, dimensions, steps)) {
- @TYPE@_add(args, dimensions, steps, func);
- }
-}
-
-/**begin repeat1
- * arithmetic
- * #kind = subtract, multiply#
- */
-NPY_NO_EXPORT void
-@TYPE@_@kind@_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func)
-{
- if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
- @TYPE@_@kind@(args, dimensions, steps, func);
- }
-}
-/**end repeat1**/
-#endif
NPY_NO_EXPORT void
@TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
@@ -3066,8 +2884,6 @@ NPY_NO_EXPORT void
}
/**end repeat1**/
-#define @TYPE@_true_divide @TYPE@_divide
-
/**end repeat**/
#undef CGE
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index a0b68d168..c15ff8e3b 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -185,6 +185,22 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
/**end repeat1**/
/**end repeat**/
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_arithm_fp.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ */
+/**begin repeat1
+ * Arithmetic
+ * # kind = add, subtract, multiply, divide#
+ * # OP = +, -, *, /#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat1**/
+/**end repeat**/
+
/**begin repeat
* #TYPE = FLOAT, DOUBLE#
*/
@@ -356,9 +372,6 @@ NPY_NO_EXPORT void
NPY_NO_EXPORT void
@TYPE@_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-#define @TYPE@_true_divide @TYPE@_divide
-
/**end repeat**/
@@ -367,6 +380,19 @@ NPY_NO_EXPORT void
** COMPLEX LOOPS **
*****************************************************************************
*/
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_arithm_fp.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = CFLOAT, CDOUBLE#
+ */
+/**begin repeat1
+ * #kind = add, subtract, multiply#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/**end repeat1**/
+/**end repeat**/
#define CGE(xr,xi,yr,yi) (xr > yr || (xr == yr && xi >= yi));
#define CLE(xr,xi,yr,yi) (xr < yr || (xr == yr && xi <= yi));
@@ -380,26 +406,14 @@ NPY_NO_EXPORT void
* #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
* #c = f, , l#
* #C = F, , L#
- * #IFSIMD = 1, 1, 0#
*/
/**begin repeat1
- * #isa = , _avx512f#
- */
-
-/**begin repeat2
* arithmetic
- * #kind = add, subtract#
- * #OP = +, -#
+ * #kind = add, subtract, multiply#
*/
-
NPY_NO_EXPORT void
-C@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**end repeat2**/
-
-NPY_NO_EXPORT void
-C@TYPE@_multiply@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
/**end repeat1**/
NPY_NO_EXPORT void
@@ -480,7 +494,6 @@ C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, v
NPY_NO_EXPORT void
C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
/**end repeat1**/
-#define C@TYPE@_true_divide C@TYPE@_divide
/**end repeat**/
@@ -593,10 +606,6 @@ NPY_NO_EXPORT void
TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
/* Special case equivalents to above functions */
-
-#define TIMEDELTA_mq_m_true_divide TIMEDELTA_mq_m_divide
-#define TIMEDELTA_md_m_true_divide TIMEDELTA_md_m_divide
-#define TIMEDELTA_mm_d_true_divide TIMEDELTA_mm_d_divide
#define TIMEDELTA_mq_m_floor_divide TIMEDELTA_mq_m_divide
#define TIMEDELTA_md_m_floor_divide TIMEDELTA_md_m_divide
/* #define TIMEDELTA_mm_d_floor_divide TIMEDELTA_mm_d_divide */
diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
new file mode 100644
index 000000000..d8c8fdc9e
--- /dev/null
+++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
@@ -0,0 +1,777 @@
+/*@targets
+ ** $maxopt baseline
+ ** sse2 avx2 avx512f
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+// TODO: replace raw SIMD with NPYV
+//###############################################################################
+//## Real Single/Double precision
+//###############################################################################
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+#ifdef NPY_HAVE_SSE2
+/**begin repeat
+ * #type = npy_float, npy_double#
+ * #TYPE = FLOAT, DOUBLE#
+ * #scalarf = npy_sqrtf, npy_sqrt#
+ * #c = f, #
+ * #vtype = __m128, __m128d#
+ * #vtype256 = __m256, __m256d#
+ * #vtype512 = __m512, __m512d#
+ * #vpre = _mm, _mm#
+ * #vpre256 = _mm256, _mm256#
+ * #vpre512 = _mm512, _mm512#
+ * #vsuf = ps, pd#
+ * #vsufs = ss, sd#
+ * #nan = NPY_NANF, NPY_NAN#
+ * #double = 0, 1#
+ * #cast = _mm_castps_si128, _mm_castpd_si128#
+ */
+/**begin repeat1
+* Arithmetic
+* # kind = add, subtract, multiply, divide#
+* # OP = +, -, *, /#
+* # VOP = add, sub, mul, div#
+*/
+static void
+sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+#ifdef NPY_HAVE_AVX512F
+ const npy_intp vector_size_bytes = 64;
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
+ op[i] = ip1[i] @OP@ ip2[i];
+ /* lots of specializations, to squeeze out max performance */
+ if (npy_is_aligned(&ip1[i], vector_size_bytes) && npy_is_aligned(&ip2[i], vector_size_bytes)) {
+ if (ip1 == ip2) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
+ @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
+ @vpre512@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
+ @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
+ @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
+ @vpre512@_store_@vsuf@(&op[i], c);
+ }
+ }
+ }
+ else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
+ @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
+ @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
+ @vpre512@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
+ @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
+ @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
+ @vpre512@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ if (ip1 == ip2) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
+ @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
+ @vpre512@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
+ @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
+ @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
+ @vpre512@_store_@vsuf@(&op[i], c);
+ }
+ }
+ }
+#elif defined NPY_HAVE_AVX2
+ const npy_intp vector_size_bytes = 32;
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
+ op[i] = ip1[i] @OP@ ip2[i];
+ /* lots of specializations, to squeeze out max performance */
+ if (npy_is_aligned(&ip1[i], vector_size_bytes) &&
+ npy_is_aligned(&ip2[i], vector_size_bytes)) {
+ if (ip1 == ip2) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
+ @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
+ @vpre256@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
+ @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
+ @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
+ @vpre256@_store_@vsuf@(&op[i], c);
+ }
+ }
+ }
+ else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
+ @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
+ @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
+ @vpre256@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
+ @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
+ @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
+ @vpre256@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ if (ip1 == ip2) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
+ @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
+ @vpre256@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
+ @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
+ @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
+ @vpre256@_store_@vsuf@(&op[i], c);
+ }
+ }
+ }
+#else
+ const npy_intp vector_size_bytes = 16;
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
+ op[i] = ip1[i] @OP@ ip2[i];
+ /* lots of specializations, to squeeze out max performance */
+ if (npy_is_aligned(&ip1[i], vector_size_bytes) &&
+ npy_is_aligned(&ip2[i], vector_size_bytes)) {
+ if (ip1 == ip2) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
+ @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
+ @vpre@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
+ @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
+ @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
+ @vpre@_store_@vsuf@(&op[i], c);
+ }
+ }
+ }
+ else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
+ @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
+ @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
+ @vpre@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
+ @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
+ @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
+ @vpre@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ if (ip1 == ip2) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
+ @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
+ @vpre@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
+ @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
+ @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
+ @vpre@_store_@vsuf@(&op[i], c);
+ }
+ }
+ }
+#endif
+ LOOP_BLOCKED_END {
+ op[i] = ip1[i] @OP@ ip2[i];
+ }
+}
+
+static void
+sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+#ifdef NPY_HAVE_AVX512F
+ const npy_intp vector_size_bytes = 64;
+ const @vtype512@ a = @vpre512@_set1_@vsuf@(ip1[0]);
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
+ op[i] = ip1[0] @OP@ ip2[i];
+ if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
+ @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
+ @vpre512@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
+ @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
+ @vpre512@_store_@vsuf@(&op[i], c);
+ }
+ }
+
+
+#elif defined NPY_HAVE_AVX2
+ const npy_intp vector_size_bytes = 32;
+ const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]);
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
+ op[i] = ip1[0] @OP@ ip2[i];
+ if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
+ @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
+ @vpre256@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
+ @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
+ @vpre256@_store_@vsuf@(&op[i], c);
+ }
+ }
+#else
+ const npy_intp vector_size_bytes = 16;
+ const @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]);
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
+ op[i] = ip1[0] @OP@ ip2[i];
+ if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
+ @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
+ @vpre@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
+ @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
+ @vpre@_store_@vsuf@(&op[i], c);
+ }
+ }
+#endif
+ LOOP_BLOCKED_END {
+ op[i] = ip1[0] @OP@ ip2[i];
+ }
+}
+
+static void
+sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+#ifdef NPY_HAVE_AVX512F
+ const npy_intp vector_size_bytes = 64;
+ const @vtype512@ b = @vpre512@_set1_@vsuf@(ip2[0]);
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
+ op[i] = ip1[i] @OP@ ip2[0];
+ if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
+ @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
+ @vpre512@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
+ @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
+ @vpre512@_store_@vsuf@(&op[i], c);
+ }
+ }
+
+#elif defined NPY_HAVE_AVX2
+ const npy_intp vector_size_bytes = 32;
+ const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]);
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
+ op[i] = ip1[i] @OP@ ip2[0];
+ if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
+ @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
+ @vpre256@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
+ @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
+ @vpre256@_store_@vsuf@(&op[i], c);
+ }
+ }
+#else
+ const npy_intp vector_size_bytes = 16;
+ const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]);
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
+ op[i] = ip1[i] @OP@ ip2[0];
+ if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
+ @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
+ @vpre@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
+ @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
+ @vpre@_store_@vsuf@(&op[i], c);
+ }
+ }
+#endif
+ LOOP_BLOCKED_END {
+ op[i] = ip1[i] @OP@ ip2[0];
+ }
+}
+
+/**end repeat1**/
+/**end repeat**/
+
+#else // NPY_HAVE_SSE2
+
+/**begin repeat
+ * #type = npy_float, npy_double#
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx = f32, f64#
+ * #CHK = , _F64#
+ */
+#if NPY_SIMD@CHK@
+/**begin repeat1
+* Arithmetic
+* # kind = add, subtract, multiply, divide#
+* # OP = +, -, *, /#
+* # VOP = add, sub, mul, div#
+*/
+
+static void
+simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
+ op[i] = ip1[i] @OP@ ip2[i];
+ }
+ /* lots of specializations, to squeeze out max performance */
+ if (ip1 == ip2) {
+ LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
+ npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
+ npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, a);
+ npyv_store_@sfx@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
+ npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
+ npyv_@sfx@ b = npyv_load_@sfx@(&ip2[i]);
+ npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, b);
+ npyv_store_@sfx@(&op[i], c);
+ }
+ }
+ LOOP_BLOCKED_END {
+ op[i] = ip1[i] @OP@ ip2[i];
+ }
+}
+
+static void
+simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+ const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]);
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
+ op[i] = ip1[0] @OP@ ip2[i];
+ }
+ LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
+ npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]);
+ npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
+ npyv_store_@sfx@(&op[i], v3);
+ }
+ LOOP_BLOCKED_END {
+ op[i] = ip1[0] @OP@ ip2[i];
+ }
+}
+
+static void
+simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+ const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]);
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
+ op[i] = ip1[i] @OP@ ip2[0];
+ }
+ LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
+ npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]);
+ npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
+ npyv_store_@sfx@(&op[i], v3);
+ }
+ LOOP_BLOCKED_END {
+ op[i] = ip1[i] @OP@ ip2[0];
+ }
+}
+/**end repeat1**/
+#endif /* NPY_SIMD@CHK@ */
+/**end repeat**/
+#endif // NPY_HAVE_SSE2
+
+/**begin repeat
+ * Float types
+ * #type = npy_float, npy_double, npy_longdouble#
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
+ * #vector = 1, 1, 0#
+ * #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 #
+ */
+/**begin repeat1
+ * Arithmetic
+ * # kind = add, subtract, multiply, divide#
+ */
+static NPY_INLINE int
+run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if @vector@ && defined NPY_HAVE_SSE2
+ @type@ * ip1 = (@type@ *)args[0];
+ @type@ * ip2 = (@type@ *)args[1];
+ @type@ * op = (@type@ *)args[2];
+ npy_intp n = dimensions[0];
+#if defined NPY_HAVE_AVX512F
+ const npy_uintp vector_size_bytes = 64;
+#elif defined NPY_HAVE_AVX2
+ const npy_uintp vector_size_bytes = 32;
+#else
+ const npy_uintp vector_size_bytes = 32;
+#endif
+ /* argument one scalar */
+ if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) {
+ sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
+ /* argument two scalar */
+ else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), vector_size_bytes)) {
+ sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
+ else if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) {
+ sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
+#elif @VECTOR@
+ @type@ * ip1 = (@type@ *)args[0];
+ @type@ * ip2 = (@type@ *)args[1];
+ @type@ * op = (@type@ *)args[2];
+ npy_intp n = dimensions[0];
+ /* argument one scalar */
+ if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), NPY_SIMD_WIDTH)) {
+ simd_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
+ /* argument two scalar */
+ else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH)) {
+ simd_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
+ else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) {
+ simd_binary_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
+#endif
+ return 0;
+}
+/**end repeat1**/
+/**end repeat**/
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+/**begin repeat
+ * Float types
+ * #type = npy_float, npy_double#
+ * #TYPE = FLOAT, DOUBLE#
+ * #c = f, #
+ * #C = F, #
+ */
+/**begin repeat1
+ * Arithmetic
+ * # kind = add, subtract, multiply, divide#
+ * # OP = +, -, *, /#
+ * # PW = 1, 0, 0, 0#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ if (IS_BINARY_REDUCE) {
+#if @PW@
+ @type@ * iop1 = (@type@ *)args[0];
+ npy_intp n = dimensions[0];
+
+ *iop1 @OP@= @TYPE@_pairwise_sum(args[1], n, steps[1]);
+#else
+ BINARY_REDUCE_LOOP(@type@) {
+ io1 @OP@= *(@type@ *)ip2;
+ }
+ *((@type@ *)iop1) = io1;
+#endif
+ }
+ else if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) {
+ BINARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ *((@type@ *)op1) = in1 @OP@ in2;
+ }
+ }
+}
+/**end repeat1**/
+/**end repeat**/
+
+//###############################################################################
+//## Complex Single/Double precision
+//###############################################################################
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+#if !defined(_MSC_VER) && defined(NPY_HAVE_AVX512F)
+ /**
+ * Under MSVC, aggressive optimization somehow leads to
+ * 'RuntimeWarning: invalid value encountered in multiply' being raised.
+ *
+ * The issue is mainly caused by '_mm512_maskz_loadu_ps'; we need to
+ * investigate it while moving to NPYV.
+ */
+ #define AVX512F_NOMSVC
+#endif
+
+#ifdef AVX512F_NOMSVC
+static NPY_INLINE __mmask16
+avx512_get_full_load_mask_ps(void)
+{
+ return 0xFFFF;
+}
+
+static NPY_INLINE __mmask8
+avx512_get_full_load_mask_pd(void)
+{
+ return 0xFF;
+}
+static NPY_INLINE __m512
+avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
+{
+ return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
+}
+
+static NPY_INLINE __m512d
+avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
+{
+ return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
+{
+ return (0x0001 << num_elem) - 0x0001;
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
+{
+ return (0x01 << num_elem) - 0x01;
+}
+/**begin repeat
+ * #vsub = ps, pd#
+ * #type= npy_float, npy_double#
+ * #epi_vsub = epi32, epi64#
+ * #vtype = __m512, __m512d#
+ * #mask = __mmask16, __mmask8#
+ * #and_const = 0x7fffffff, 0x7fffffffffffffffLL#
+ * #neg_mask = 0x80000000, 0x8000000000000000#
+ * #perm_ = 0xb1, 0x55#
+ * #cmpx_img_mask = 0xAAAA, 0xAA#
+ * #cmpx_re_mask = 0x5555, 0x55#
+ * #INF = NPY_INFINITYF, NPY_INFINITY#
+ * #NAN = NPY_NANF, NPY_NAN#
+ */
+static @vtype@
+avx512_hadd_@vsub@(const @vtype@ x)
+{
+ return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
+}
+
+static @vtype@
+avx512_hsub_@vsub@(const @vtype@ x)
+{
+ return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
+}
+static NPY_INLINE @vtype@
+avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
+{
+ // x1 = r1, i1
+ // x2 = r2, i2
+ @vtype@ x3 = _mm512_permute_@vsub@(x2, @perm_@); // i2, r2
+ @vtype@ x12 = _mm512_mul_@vsub@(x1, x2); // r1*r2, i1*i2
+ @vtype@ x13 = _mm512_mul_@vsub@(x1, x3); // r1*i2, r2*i1
+ @vtype@ outreal = avx512_hsub_@vsub@(x12); // r1*r2 - i1*i2, r1*r2 - i1*i2
+ @vtype@ outimg = avx512_hadd_@vsub@(x13); // r1*i2 + i1*r2, r1*i2 + i1*r2
+ return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg);
+}
+/**end repeat**/
+#endif
+
+/**begin repeat
+ * #TYPE = CFLOAT, CDOUBLE#
+ * #type = npy_float, npy_double#
+ * #num_lanes = 16, 8#
+ * #vsuffix = ps, pd#
+ * #epi_vsub = epi32, epi64#
+ * #mask = __mmask16, __mmask8#
+ * #vtype = __m512, __m512d#
+ * #scale = 4, 8#
+ * #vindextype = __m512i, __m256i#
+ * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
+ * #storemask = 0xFF, 0xF#
+ * #IS_FLOAT = 1, 0#
+ */
+/**begin repeat1
+ * #func = add, subtract, multiply#
+ * #vectorf = _mm512_add, _mm512_sub, avx512_cmul#
+ */
+#if defined AVX512F_NOMSVC
+static NPY_INLINE void
+AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
+{
+ const npy_intp array_size = dimensions[0];
+ npy_intp num_remaining_elements = 2*array_size;
+ @type@* ip1 = (@type@*) args[0];
+ @type@* ip2 = (@type@*) args[1];
+ @type@* op = (@type@*) args[2];
+
+ @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
+
+ while (num_remaining_elements > 0) {
+ if (num_remaining_elements < @num_lanes@) {
+ load_mask = avx512_get_partial_load_mask_@vsuffix@(
+ num_remaining_elements, @num_lanes@);
+ }
+ @vtype@ x1, x2;
+ x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
+ x2 = avx512_masked_load_@vsuffix@(load_mask, ip2);
+
+ @vtype@ out = @vectorf@_@vsuffix@(x1, x2);
+
+ _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
+
+ ip1 += @num_lanes@;
+ ip2 += @num_lanes@;
+ op += @num_lanes@;
+ num_remaining_elements -= @num_lanes@;
+ }
+}
+#endif // AVX512F_NOMSVC
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ * #TYPE = CFLOAT, CDOUBLE#
+ * #type= npy_float, npy_double#
+ * #esize = 8, 16#
+ */
+/**begin repeat1
+ * #func = add, subtract, multiply#
+ */
+static NPY_INLINE int
+run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
+{
+#if defined AVX512F_NOMSVC
+ if (IS_BINARY_STRIDE_ONE(@esize@, 64)) {
+ AVX512F_@func@_@TYPE@(args, dimensions, steps);
+ return 1;
+ }
+ else
+ return 0;
+#endif
+ return 0;
+}
+/**end repeat1**/
+/**end repeat**/
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+/**begin repeat
+ * complex types
+ * #TYPE = CFLOAT, CDOUBLE#
+ * #ftype = npy_float, npy_double#
+ * #c = f, #
+ * #C = F, #
+ */
+/**begin repeat1
+ * arithmetic
+ * #kind = add, subtract#
+ * #OP = +, -#
+ * #PW = 1, 0#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ // Parentheses around @PW@ tell clang the dead code is intentional
+ if (IS_BINARY_REDUCE && (@PW@)) {
+ npy_intp n = dimensions[0];
+ @ftype@ * or = ((@ftype@ *)args[0]);
+ @ftype@ * oi = ((@ftype@ *)args[0]) + 1;
+ @ftype@ rr, ri;
+
+ @TYPE@_pairwise_sum(&rr, &ri, args[1], n * 2, steps[1] / 2);
+ *or @OP@= rr;
+ *oi @OP@= ri;
+ return;
+ }
+ if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
+ BINARY_LOOP {
+ const @ftype@ in1r = ((@ftype@ *)ip1)[0];
+ const @ftype@ in1i = ((@ftype@ *)ip1)[1];
+ const @ftype@ in2r = ((@ftype@ *)ip2)[0];
+ const @ftype@ in2i = ((@ftype@ *)ip2)[1];
+ ((@ftype@ *)op1)[0] = in1r @OP@ in2r;
+ ((@ftype@ *)op1)[1] = in1i @OP@ in2i;
+ }
+ }
+}
+/**end repeat1**/
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_multiply)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ if (!run_binary_avx512f_multiply_@TYPE@(args, dimensions, steps)) {
+ BINARY_LOOP {
+ const @ftype@ in1r = ((@ftype@ *)ip1)[0];
+ const @ftype@ in1i = ((@ftype@ *)ip1)[1];
+ const @ftype@ in2r = ((@ftype@ *)ip2)[0];
+ const @ftype@ in2i = ((@ftype@ *)ip2)[1];
+ ((@ftype@ *)op1)[0] = in1r*in2r - in1i*in2i;
+ ((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r;
+ }
+ }
+}
+/**end repeat**/
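The avx512_cmul kernel defined earlier in this file evaluates the interleaved complex product with two lane-wise multiplies, one horizontal subtract and one horizontal add, following the scalar identity

$(r_1 + i_1 j)(r_2 + i_2 j) = (r_1 r_2 - i_1 i_2) + (r_1 i_2 + i_1 r_2)\,j.$

With each lane pair holding $x_1 = (r_1, i_1)$ and $x_2 = (r_2, i_2)$:

$\begin{aligned}
x_3 &= \mathrm{permute}(x_2) = (i_2,\ r_2), \\
x_{12} &= x_1 \odot x_2 = (r_1 r_2,\ i_1 i_2), & \mathrm{hsub}(x_{12}) &= r_1 r_2 - i_1 i_2 \ \text{(real part)}, \\
x_{13} &= x_1 \odot x_3 = (r_1 i_2,\ i_1 r_2), & \mathrm{hadd}(x_{13}) &= r_1 i_2 + i_1 r_2 \ \text{(imaginary part)},
\end{aligned}$

and the final mask blend keeps the hsub result in the even (real) lanes and the hadd result in the odd (imaginary) lanes, which is exactly what the inline comments in avx512_cmul describe.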
diff --git a/numpy/core/src/umath/loops_utils.h b/numpy/core/src/umath/loops_utils.h
deleted file mode 100644
index f5540bdae..000000000
--- a/numpy/core/src/umath/loops_utils.h
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef _NPY_UMATH_LOOPS_UTILS_H_
-#define _NPY_UMATH_LOOPS_UTILS_H_
-
-#include "numpy/npy_common.h" // NPY_FINLINE
-/*
- * nomemoverlap - returns false if two strided arrays have an overlapping
- * region in memory. ip_size/op_size = size of the arrays which can be negative
- * indicating negative steps.
- */
-NPY_FINLINE npy_bool
-nomemoverlap(char *ip, npy_intp ip_size, char *op, npy_intp op_size)
-{
- char *ip_start, *ip_end, *op_start, *op_end;
- if (ip_size < 0) {
- ip_start = ip + ip_size;
- ip_end = ip;
- }
- else {
- ip_start = ip;
- ip_end = ip + ip_size;
- }
- if (op_size < 0) {
- op_start = op + op_size;
- op_end = op;
- }
- else {
- op_start = op;
- op_end = op + op_size;
- }
- return (ip_start == op_start && op_end == ip_end) ||
- (ip_start > op_end) || (op_start > ip_end);
-}
-
-// returns true if two strided arrays have an overlapping region in memory
-// same as `nomemoverlap()` but requires array length and step sizes
-NPY_FINLINE npy_bool
-is_mem_overlap(const void *src, npy_intp src_step, const void *dst, npy_intp dst_step, npy_intp len)
-{
- return !(nomemoverlap((char*)src, src_step*len, (char*)dst, dst_step*len));
-}
-
-#endif // _NPY_UMATH_LOOPS_UTILS_H_
diff --git a/numpy/core/src/umath/loops_utils.h.src b/numpy/core/src/umath/loops_utils.h.src
new file mode 100644
index 000000000..dfa790ed9
--- /dev/null
+++ b/numpy/core/src/umath/loops_utils.h.src
@@ -0,0 +1,213 @@
+#ifndef _NPY_UMATH_LOOPS_UTILS_H_
+#define _NPY_UMATH_LOOPS_UTILS_H_
+
+#include "numpy/npy_common.h" // NPY_FINLINE
+#include "numpy/halffloat.h" // npy_half_to_float
+/*
+ * nomemoverlap - returns false if two strided arrays have an overlapping
+ * region in memory. ip_size/op_size = size of the arrays which can be negative
+ * indicating negative steps.
+ */
+NPY_FINLINE npy_bool
+nomemoverlap(char *ip, npy_intp ip_size, char *op, npy_intp op_size)
+{
+ char *ip_start, *ip_end, *op_start, *op_end;
+ if (ip_size < 0) {
+ ip_start = ip + ip_size;
+ ip_end = ip;
+ }
+ else {
+ ip_start = ip;
+ ip_end = ip + ip_size;
+ }
+ if (op_size < 0) {
+ op_start = op + op_size;
+ op_end = op;
+ }
+ else {
+ op_start = op;
+ op_end = op + op_size;
+ }
+ return (ip_start == op_start && op_end == ip_end) ||
+ (ip_start > op_end) || (op_start > ip_end);
+}
+
+// returns true if two strided arrays have an overlapping region in memory
+// same as `nomemoverlap()` but requires array length and step sizes
+NPY_FINLINE npy_bool
+is_mem_overlap(const void *src, npy_intp src_step, const void *dst, npy_intp dst_step, npy_intp len)
+{
+ return !(nomemoverlap((char*)src, src_step*len, (char*)dst, dst_step*len));
+}
+
+/*
+ * cutoff blocksize for pairwise summation
+ * decreasing it decreases errors slightly as more pairs are summed but
+ * also lowers performance, as the inner loop is unrolled eight times it is
+ * effectively 16
+ */
+#define PW_BLOCKSIZE 128
+
+/**begin repeat
+ * Float types
+ * #type = npy_float, npy_double, npy_longdouble, npy_float#
+ * #dtype = npy_float, npy_double, npy_longdouble, npy_half#
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE, HALF#
+ * #c = f, , l, #
+ * #C = F, , L, #
+ * #trf = , , , npy_half_to_float#
+ */
+
+/*
+ * Pairwise summation, rounding error O(lg n) instead of O(n).
+ * The recursion depth is O(lg n) as well.
+ * when updating also update similar complex floats summation
+ */
+static NPY_INLINE @type@
+@TYPE@_pairwise_sum(char *a, npy_intp n, npy_intp stride)
+{
+ if (n < 8) {
+ npy_intp i;
+ @type@ res = 0.;
+
+ for (i = 0; i < n; i++) {
+ res += @trf@(*((@dtype@*)(a + i * stride)));
+ }
+ return res;
+ }
+ else if (n <= PW_BLOCKSIZE) {
+ npy_intp i;
+ @type@ r[8], res;
+
+ /*
+ * sum a block with 8 accumulators
+ * 8 times unroll reduces blocksize to 16 and allows vectorization with
+ * avx without changing summation ordering
+ */
+ r[0] = @trf@(*((@dtype@ *)(a + 0 * stride)));
+ r[1] = @trf@(*((@dtype@ *)(a + 1 * stride)));
+ r[2] = @trf@(*((@dtype@ *)(a + 2 * stride)));
+ r[3] = @trf@(*((@dtype@ *)(a + 3 * stride)));
+ r[4] = @trf@(*((@dtype@ *)(a + 4 * stride)));
+ r[5] = @trf@(*((@dtype@ *)(a + 5 * stride)));
+ r[6] = @trf@(*((@dtype@ *)(a + 6 * stride)));
+ r[7] = @trf@(*((@dtype@ *)(a + 7 * stride)));
+
+ for (i = 8; i < n - (n % 8); i += 8) {
+ /* small blocksizes seems to mess with hardware prefetch */
+ NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(@dtype@))*stride, 0, 3);
+ r[0] += @trf@(*((@dtype@ *)(a + (i + 0) * stride)));
+ r[1] += @trf@(*((@dtype@ *)(a + (i + 1) * stride)));
+ r[2] += @trf@(*((@dtype@ *)(a + (i + 2) * stride)));
+ r[3] += @trf@(*((@dtype@ *)(a + (i + 3) * stride)));
+ r[4] += @trf@(*((@dtype@ *)(a + (i + 4) * stride)));
+ r[5] += @trf@(*((@dtype@ *)(a + (i + 5) * stride)));
+ r[6] += @trf@(*((@dtype@ *)(a + (i + 6) * stride)));
+ r[7] += @trf@(*((@dtype@ *)(a + (i + 7) * stride)));
+ }
+
+ /* accumulate now to avoid stack spills for single peel loop */
+ res = ((r[0] + r[1]) + (r[2] + r[3])) +
+ ((r[4] + r[5]) + (r[6] + r[7]));
+
+ /* do non multiple of 8 rest */
+ for (; i < n; i++) {
+ res += @trf@(*((@dtype@ *)(a + i * stride)));
+ }
+ return res;
+ }
+ else {
+ /* divide by two but avoid non-multiples of unroll factor */
+ npy_intp n2 = n / 2;
+
+ n2 -= n2 % 8;
+ return @TYPE@_pairwise_sum(a, n2, stride) +
+ @TYPE@_pairwise_sum(a + n2 * stride, n - n2, stride);
+ }
+}
+
+/**end repeat**/
+
+/**begin repeat
+ * complex types
+ * #TYPE = CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #ftype = npy_float, npy_double, npy_longdouble#
+ * #c = f, , l#
+ * #C = F, , L#
+ * #SIMD = 1, 1, 0#
+ */
+/* similar to pairwise sum of real floats */
+static NPY_INLINE void
+@TYPE@_pairwise_sum(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n,
+ npy_intp stride)
+{
+ assert(n % 2 == 0);
+ if (n < 8) {
+ npy_intp i;
+
+ *rr = 0.;
+ *ri = 0.;
+ for (i = 0; i < n; i += 2) {
+ *rr += *((@ftype@ *)(a + i * stride + 0));
+ *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
+ }
+ return;
+ }
+ else if (n <= PW_BLOCKSIZE) {
+ npy_intp i;
+ @ftype@ r[8];
+
+ /*
+ * sum a block with 8 accumulators
+ * 8 times unroll reduces blocksize to 16 and allows vectorization with
+ * avx without changing summation ordering
+ */
+ r[0] = *((@ftype@ *)(a + 0 * stride));
+ r[1] = *((@ftype@ *)(a + 0 * stride + sizeof(@ftype@)));
+ r[2] = *((@ftype@ *)(a + 2 * stride));
+ r[3] = *((@ftype@ *)(a + 2 * stride + sizeof(@ftype@)));
+ r[4] = *((@ftype@ *)(a + 4 * stride));
+ r[5] = *((@ftype@ *)(a + 4 * stride + sizeof(@ftype@)));
+ r[6] = *((@ftype@ *)(a + 6 * stride));
+ r[7] = *((@ftype@ *)(a + 6 * stride + sizeof(@ftype@)));
+
+ for (i = 8; i < n - (n % 8); i += 8) {
+            /* small blocksizes seem to interfere with hardware prefetch */
+ NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(@ftype@))*stride, 0, 3);
+ r[0] += *((@ftype@ *)(a + (i + 0) * stride));
+ r[1] += *((@ftype@ *)(a + (i + 0) * stride + sizeof(@ftype@)));
+ r[2] += *((@ftype@ *)(a + (i + 2) * stride));
+ r[3] += *((@ftype@ *)(a + (i + 2) * stride + sizeof(@ftype@)));
+ r[4] += *((@ftype@ *)(a + (i + 4) * stride));
+ r[5] += *((@ftype@ *)(a + (i + 4) * stride + sizeof(@ftype@)));
+ r[6] += *((@ftype@ *)(a + (i + 6) * stride));
+ r[7] += *((@ftype@ *)(a + (i + 6) * stride + sizeof(@ftype@)));
+ }
+
+        /* accumulate the 8 partials now to avoid stack spills across the scalar remainder loop */
+ *rr = ((r[0] + r[2]) + (r[4] + r[6]));
+ *ri = ((r[1] + r[3]) + (r[5] + r[7]));
+
+        /* handle the remainder that is not a multiple of 8 */
+ for (; i < n; i+=2) {
+ *rr += *((@ftype@ *)(a + i * stride + 0));
+ *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
+ }
+ return;
+ }
+ else {
+        /* split in half, but keep the first part a multiple of the unroll factor */
+ @ftype@ rr1, ri1, rr2, ri2;
+ npy_intp n2 = n / 2;
+
+ n2 -= n2 % 8;
+ @TYPE@_pairwise_sum(&rr1, &ri1, a, n2, stride);
+ @TYPE@_pairwise_sum(&rr2, &ri2, a + n2 * stride, n - n2, stride);
+ *rr = rr1 + rr2;
+ *ri = ri1 + ri2;
+ return;
+ }
+}
+/**end repeat**/
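
The complex variant reuses the same blocking; only the addressing changes. The following sketch (hypothetical name csum_interleaved_f32, not part of this patch) uses a naive two-accumulator sum purely to make the interleaved (real, imaginary) addressing explicit; the pairwise blocking itself is identical to the real-float case.

    #include <stddef.h>

    /*
     * Illustrative sketch only: n counts scalar elements (two per complex
     * number, hence the n % 2 == 0 requirement above) and stride is in
     * bytes, matching the generated @TYPE@_pairwise_sum loops.
     */
    static void
    csum_interleaved_f32(float *rr, float *ri, const char *a, size_t n,
                         size_t stride)
    {
        float sum_re = 0.0f, sum_im = 0.0f;
        for (size_t i = 0; i < n; i += 2) {
            sum_re += *(const float *)(a + i * stride);                  /* real */
            sum_im += *(const float *)(a + i * stride + sizeof(float));  /* imaginary */
        }
        *rr = sum_re;
        *ri = sum_im;
    }
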
+
+#endif // _NPY_UMATH_LOOPS_UTILS_H_
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index c2104810f..3d4e6de87 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -28,7 +28,6 @@
#undef __AVX512F__
#endif
#endif
-#include "simd/simd.h"
#include "loops_utils.h" // nomemoverlap
#include <assert.h>
#include <stdlib.h>
@@ -52,119 +51,6 @@
*/
#define MAX_STEP_SIZE 2097152
-#define IS_BINARY_STRIDE_ONE(esize, vsize) \
- ((steps[0] == esize) && \
- (steps[1] == esize) && \
- (steps[2] == esize) && \
- (abs_ptrdiff(args[2], args[0]) >= vsize) && \
- (abs_ptrdiff(args[2], args[1]) >= vsize))
-
-/*
- * stride is equal to element size and input and destination are equal or
- * don't overlap within one register. The check of the steps against
- * esize also quarantees that steps are >= 0.
- */
-#define IS_BLOCKABLE_UNARY(esize, vsize) \
- (steps[0] == (esize) && steps[0] == steps[1] && \
- (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
- ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
- ((abs_ptrdiff(args[1], args[0]) == 0))))
-
-/*
- * Avoid using SIMD for very large step sizes for several reasons:
- * 1) Supporting large step sizes requires use of i64gather/scatter_ps instructions,
- * in which case we need two i64gather instructions and an additional vinsertf32x8
- * instruction to load a single zmm register (since one i64gather instruction
- * loads into a ymm register). This is not ideal for performance.
- * 2) Gather and scatter instructions can be slow when the loads/stores
- * cross page boundaries.
- *
- * We instead rely on i32gather/scatter_ps instructions which use a 32-bit index
- * element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE
- * ensures this. The condition also requires that the input and output arrays
- * should have no overlap in memory.
- */
-#define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP \
- ((labs(steps[0]) < MAX_STEP_SIZE) && \
- (labs(steps[1]) < MAX_STEP_SIZE) && \
- (labs(steps[2]) < MAX_STEP_SIZE) && \
- (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
- (nomemoverlap(args[1], steps[1] * dimensions[0], args[2], steps[2] * dimensions[0])))
-
-#define IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP \
- ((labs(steps[0]) < MAX_STEP_SIZE) && \
- (labs(steps[1]) < MAX_STEP_SIZE) && \
- (labs(steps[2]) < MAX_STEP_SIZE) && \
- (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
- (nomemoverlap(args[0], steps[0] * dimensions[0], args[1], steps[1] * dimensions[0])))
-
-/*
- * 1) Output should be contiguous, can handle strided input data
- * 2) Input step should be smaller than MAX_STEP_SIZE for performance
- * 3) Input and output arrays should have no overlap in memory
- */
-#define IS_OUTPUT_BLOCKABLE_UNARY(esizein, esizeout, vsize) \
- ((steps[0] & (esizein-1)) == 0 && \
- steps[1] == (esizeout) && labs(steps[0]) < MAX_STEP_SIZE && \
- (nomemoverlap(args[1], steps[1] * dimensions[0], args[0], steps[0] * dimensions[0])))
-
-#define IS_BLOCKABLE_REDUCE(esize, vsize) \
- (steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \
- npy_is_aligned(args[1], (esize)) && \
- npy_is_aligned(args[0], (esize)))
-
-#define IS_BLOCKABLE_BINARY(esize, vsize) \
- (steps[0] == steps[1] && steps[1] == steps[2] && steps[2] == (esize) && \
- npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \
- npy_is_aligned(args[0], (esize)) && \
- (abs_ptrdiff(args[2], args[0]) >= (vsize) || \
- abs_ptrdiff(args[2], args[0]) == 0) && \
- (abs_ptrdiff(args[2], args[1]) >= (vsize) || \
- abs_ptrdiff(args[2], args[1]) >= 0))
-
-#define IS_BLOCKABLE_BINARY_SCALAR1(esize, vsize) \
- (steps[0] == 0 && steps[1] == steps[2] && steps[2] == (esize) && \
- npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \
- ((abs_ptrdiff(args[2], args[1]) >= (vsize)) || \
- (abs_ptrdiff(args[2], args[1]) == 0)) && \
- abs_ptrdiff(args[2], args[0]) >= (esize))
-
-#define IS_BLOCKABLE_BINARY_SCALAR2(esize, vsize) \
- (steps[1] == 0 && steps[0] == steps[2] && steps[2] == (esize) && \
- npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[0], (esize)) && \
- ((abs_ptrdiff(args[2], args[0]) >= (vsize)) || \
- (abs_ptrdiff(args[2], args[0]) == 0)) && \
- abs_ptrdiff(args[2], args[1]) >= (esize))
-
-#undef abs_ptrdiff
-
-#define IS_BLOCKABLE_BINARY_BOOL(esize, vsize) \
- (steps[0] == (esize) && steps[0] == steps[1] && steps[2] == (1) && \
- npy_is_aligned(args[1], (esize)) && \
- npy_is_aligned(args[0], (esize)))
-
-#define IS_BLOCKABLE_BINARY_SCALAR1_BOOL(esize, vsize) \
- (steps[0] == 0 && steps[1] == (esize) && steps[2] == (1) && \
- npy_is_aligned(args[1], (esize)))
-
-#define IS_BLOCKABLE_BINARY_SCALAR2_BOOL(esize, vsize) \
- (steps[0] == (esize) && steps[1] == 0 && steps[2] == (1) && \
- npy_is_aligned(args[0], (esize)))
-
-/* align var to alignment */
-#define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
- npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
- alignment, n);\
- for(i = 0; i < peel; i++)
-
-#define LOOP_BLOCKED(type, vsize)\
- for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\
- i += (vsize / sizeof(type)))
-
-#define LOOP_BLOCKED_END\
- for (; i < n; i++)
-
-
/*
* Dispatcher functions
* decide whether the operation can be vectorized and run it
@@ -184,31 +70,6 @@
*/
/**begin repeat1
- * #func = add, subtract, multiply#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_INLINE NPY_GCC_TARGET_AVX512F void
-AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps);
-#endif
-
-static NPY_INLINE int
-run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
-{
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
- if (IS_BINARY_STRIDE_ONE(@esize@, 64)) {
- AVX512F_@func@_@TYPE@(args, dimensions, steps);
- return 1;
- }
- else
- return 0;
-#endif
- return 0;
-}
-
-/**end repeat1**/
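
The run_* wrappers deleted in this hunk all follow one pattern: check the stride/alignment/overlap preconditions, run the specialized kernel and return 1, or return 0 so the generated loop falls back to its scalar code. A simplified sketch with hypothetical names (the real preconditions are the IS_* macros removed above, and npy_intp is replaced by ptrdiff_t to keep the sketch self-contained):

    #include <stddef.h>

    /* Stand-in for the vectorized kernel; illustrative only. */
    static void
    simd_add_contig_f32(float *op, const float *ip1, const float *ip2, size_t n)
    {
        for (size_t i = 0; i < n; i++) {
            op[i] = ip1[i] + ip2[i];
        }
    }

    /* Returns 1 if the SIMD path handled the call, 0 to fall back to scalar. */
    static int
    run_binary_simd_add_FLOAT(char **args, const ptrdiff_t *dimensions,
                              const ptrdiff_t *steps)
    {
        /* simplified precondition: unit strides only; the removed macros also
         * check alignment and that inputs and output do not overlap */
        if (steps[0] == (ptrdiff_t)sizeof(float) &&
            steps[1] == (ptrdiff_t)sizeof(float) &&
            steps[2] == (ptrdiff_t)sizeof(float)) {
            simd_add_contig_f32((float *)args[2], (const float *)args[0],
                                (const float *)args[1], (size_t)dimensions[0]);
            return 1;
        }
        return 0;
    }
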
-
-/**begin repeat1
* #func = square, absolute, conjugate#
* #outsize = 1, 2, 1#
* #max_stride = 2, 8, 8#
@@ -508,92 +369,6 @@ run_@name@_simd_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp
/**end repeat1**/
/**begin repeat1
- * Arithmetic
- * # kind = add, subtract, multiply, divide#
- */
-
-#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
-
-/* prototypes */
-static void
-sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
- npy_intp n);
-static void
-sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
- npy_intp n);
-static void
-sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
- npy_intp n);
-
-#elif @VECTOR@
-
-static void
-simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
- npy_intp n);
-static void
-simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
- npy_intp n);
-static void
-simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
- npy_intp n);
-
-#endif
-
-static NPY_INLINE int
-run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
- @type@ * ip1 = (@type@ *)args[0];
- @type@ * ip2 = (@type@ *)args[1];
- @type@ * op = (@type@ *)args[2];
- npy_intp n = dimensions[0];
-#if defined __AVX512F__
- const npy_uintp vector_size_bytes = 64;
-#elif defined __AVX2__
- const npy_uintp vector_size_bytes = 32;
-#else
- const npy_uintp vector_size_bytes = 32;
-#endif
- /* argument one scalar */
- if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) {
- sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
- /* argument two scalar */
- else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), vector_size_bytes)) {
- sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
- else if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) {
- sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
-#elif @VECTOR@
- @type@ * ip1 = (@type@ *)args[0];
- @type@ * ip2 = (@type@ *)args[1];
- @type@ * op = (@type@ *)args[2];
- npy_intp n = dimensions[0];
- /* argument one scalar */
- if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), NPY_SIMD_WIDTH)) {
- simd_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
- /* argument two scalar */
- else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH)) {
- simd_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
- else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) {
- simd_binary_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
-#endif
- return 0;
-}
-
-/**end repeat1**/
-
-/**begin repeat1
* #kind = equal, not_equal, less, less_equal, greater, greater_equal,
* logical_and, logical_or#
* #simd = 1, 1, 1, 1, 1, 1, 0, 0#
@@ -777,7 +552,6 @@ static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
_mm_store_sd(&r, _mm_@VOP@_pd(tmp, v)); /* m(ab) m(bb) */
return r;
}
-
/**end repeat**/
/**begin repeat
@@ -797,327 +571,6 @@ static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
* #double = 0, 1#
* #cast = _mm_castps_si128, _mm_castpd_si128#
*/
-
-
-/**begin repeat1
-* Arithmetic
-* # kind = add, subtract, multiply, divide#
-* # OP = +, -, *, /#
-* # VOP = add, sub, mul, div#
-*/
-
-static void
-sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
-{
-#ifdef __AVX512F__
- const npy_intp vector_size_bytes = 64;
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[i] @OP@ ip2[i];
- /* lots of specializations, to squeeze out max performance */
- if (npy_is_aligned(&ip1[i], vector_size_bytes) && npy_is_aligned(&ip2[i], vector_size_bytes)) {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
- @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
- }
- else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
- @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
- else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
- @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
- @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
- }
-#elif defined __AVX2__
- const npy_intp vector_size_bytes = 32;
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[i] @OP@ ip2[i];
- /* lots of specializations, to squeeze out max performance */
- if (npy_is_aligned(&ip1[i], vector_size_bytes) &&
- npy_is_aligned(&ip2[i], vector_size_bytes)) {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
- @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
- }
- else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
- @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
- else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
- @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
- @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
- }
-#else
- LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
- op[i] = ip1[i] @OP@ ip2[i];
- /* lots of specializations, to squeeze out max performance */
- if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES) &&
- npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
- @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
- }
- else if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
- @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
- else if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
- @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
- @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
- }
-#endif
- LOOP_BLOCKED_END {
- op[i] = ip1[i] @OP@ ip2[i];
- }
-}
-
-
-static void
-sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
-{
-#ifdef __AVX512F__
- const npy_intp vector_size_bytes = 64;
- const @vtype512@ a = @vpre512@_set1_@vsuf@(ip1[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[0] @OP@ ip2[i];
- if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
-
-
-#elif defined __AVX2__
- const npy_intp vector_size_bytes = 32;
- const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[0] @OP@ ip2[i];
- if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
-#else
- const @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
- op[i] = ip1[0] @OP@ ip2[i];
- if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
-#endif
- LOOP_BLOCKED_END {
- op[i] = ip1[0] @OP@ ip2[i];
- }
-}
-
-
-static void
-sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
-{
-#ifdef __AVX512F__
- const npy_intp vector_size_bytes = 64;
- const @vtype512@ b = @vpre512@_set1_@vsuf@(ip2[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[i] @OP@ ip2[0];
- if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
-
-#elif defined __AVX2__
- const npy_intp vector_size_bytes = 32;
- const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[i] @OP@ ip2[0];
- if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
-#else
- const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
- op[i] = ip1[i] @OP@ ip2[0];
- if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
-#endif
- LOOP_BLOCKED_END {
- op[i] = ip1[i] @OP@ ip2[0];
- }
-}
-
-/**end repeat1**/
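
The kernels removed above are built from the LOOP_BLOCK_ALIGN_VAR / LOOP_BLOCKED / LOOP_BLOCKED_END macros deleted earlier: peel scalar iterations until the output is aligned, run a vector body, then finish the tail in scalar. A stripped-down sketch of that structure for a contiguous float addition (hypothetical name add_f32_sse, one SSE path only, unaligned loads instead of the alignment specializations; not part of this patch):

    #include <immintrin.h>   /* SSE intrinsics; assumes an x86 build */
    #include <stdint.h>
    #include <stddef.h>

    static void
    add_f32_sse(float *op, const float *ip1, const float *ip2, size_t n)
    {
        size_t i = 0;
        /* peel: scalar iterations until the output pointer is 16-byte aligned */
        while (i < n && ((uintptr_t)&op[i] % 16u) != 0) {
            op[i] = ip1[i] + ip2[i];
            i++;
        }
        /* blocked body: 4 floats per 128-bit register */
        for (; i + 4 <= n; i += 4) {
            __m128 a = _mm_loadu_ps(&ip1[i]);
            __m128 b = _mm_loadu_ps(&ip2[i]);
            _mm_store_ps(&op[i], _mm_add_ps(a, b));  /* aligned store after the peel */
        }
        /* scalar remainder */
        for (; i < n; i++) {
            op[i] = ip1[i] + ip2[i];
        }
    }
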
-
/*
* compress 4 vectors to 4/8 bytes in op with filled with 0 or 1
* the last vector is passed as a pointer as MSVC 2010 is unable to ignore the
@@ -3356,45 +2809,6 @@ AVX512F_log_DOUBLE(npy_double * op,
*/
/**begin repeat1
- * #func = add, subtract, multiply#
- * #vectorf = _mm512_add, _mm512_sub, avx512_cmul#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void
-AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
-{
- const npy_intp array_size = dimensions[0];
- npy_intp num_remaining_elements = 2*array_size;
- @type@* ip1 = (@type@*) args[0];
- @type@* ip2 = (@type@*) args[1];
- @type@* op = (@type@*) args[2];
-
- @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
-
- while (num_remaining_elements > 0) {
- if (num_remaining_elements < @num_lanes@) {
- load_mask = avx512_get_partial_load_mask_@vsuffix@(
- num_remaining_elements, @num_lanes@);
- }
- @vtype@ x1, x2;
- x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
- x2 = avx512_masked_load_@vsuffix@(load_mask, ip2);
-
- @vtype@ out = @vectorf@_@vsuffix@(x1, x2);
-
- _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
-
- ip1 += @num_lanes@;
- ip2 += @num_lanes@;
- op += @num_lanes@;
- num_remaining_elements -= @num_lanes@;
- }
-}
-#endif
-/**end repeat1**/
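
The AVX512F loops removed here avoid a scalar cleanup loop entirely by loading and storing through a mask: a full 16-lane mask while enough elements remain, then a partial mask for the tail. A raw-intrinsics sketch with a hypothetical name (the original uses NumPy's avx512_get_*_load_mask and masked-load helpers; this is illustrative only, not part of the patch):

    #include <immintrin.h>   /* AVX-512F intrinsics; assumes -mavx512f */
    #include <stddef.h>

    static void
    avx512_add_f32(float *op, const float *ip1, const float *ip2, size_t n)
    {
        __mmask16 mask = 0xFFFF;                    /* all 16 lanes enabled */
        while (n > 0) {
            if (n < 16) {
                mask = (__mmask16)((1u << n) - 1u); /* enable only the tail lanes */
            }
            __m512 a = _mm512_maskz_loadu_ps(mask, ip1);
            __m512 b = _mm512_maskz_loadu_ps(mask, ip2);
            _mm512_mask_storeu_ps(op, mask, _mm512_add_ps(a, b));
            ip1 += 16;
            ip2 += 16;
            op  += 16;
            n = (n > 16) ? n - 16 : 0;
        }
    }
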
-
-/**begin repeat1
* #func = square, conjugate#
* #vectorf = avx512_csquare, avx512_conjugate#
*/
@@ -3656,86 +3070,5 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
/**end repeat**/
#undef VECTOR_SIZE_BYTES
-#else /* NPY_HAVE_SSE2_INTRINSICS */
-
-/**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #sfx = f32, f64#
- * #CHK = , _F64#
- */
-
-#if NPY_SIMD@CHK@
-
-/**begin repeat1
-* Arithmetic
-* # kind = add, subtract, multiply, divide#
-* # OP = +, -, *, /#
-* # VOP = add, sub, mul, div#
-*/
-
-static void
-simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
-{
- LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
- op[i] = ip1[i] @OP@ ip2[i];
- }
- /* lots of specializations, to squeeze out max performance */
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
- npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
- npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, a);
- npyv_store_@sfx@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
- npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
- npyv_@sfx@ b = npyv_load_@sfx@(&ip2[i]);
- npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, b);
- npyv_store_@sfx@(&op[i], c);
- }
- }
- LOOP_BLOCKED_END {
- op[i] = ip1[i] @OP@ ip2[i];
- }
-}
-
-static void
-simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
-{
- const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
- op[i] = ip1[0] @OP@ ip2[i];
- }
- LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
- npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]);
- npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
- npyv_store_@sfx@(&op[i], v3);
- }
- LOOP_BLOCKED_END {
- op[i] = ip1[0] @OP@ ip2[i];
- }
-}
-
-static void
-simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
-{
- const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
- op[i] = ip1[i] @OP@ ip2[0];
- }
- LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
- npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]);
- npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
- npyv_store_@sfx@(&op[i], v3);
- }
- LOOP_BLOCKED_END {
- op[i] = ip1[i] @OP@ ip2[0];
- }
-}
-/**end repeat1**/
-#endif /* NPY_SIMD@CHK@ */
-/**end repeat**/
-#endif
+#endif /* NPY_HAVE_SSE2_INTRINSICS */
#endif