author    Matti Picus <matti.picus@gmail.com>    2020-10-29 20:14:40 +0200
committer GitHub <noreply@github.com>            2020-10-29 20:14:40 +0200
commit    8829b807a841911ce18e79b308fee9fb92fb91b6 (patch)
tree      88b1c765b18a524dc89e98e94e412d5acea121d6
parent    43683b3256a86659f230dcadbcde1f8020398bfa (diff)
parent    ac4ffe1d39d9cc845948079a24facf7057effb24 (diff)
download  numpy-8829b807a841911ce18e79b308fee9fb92fb91b6.tar.gz
Merge pull request #16782 from seiko2plus/implement_npyv_pymod
ENH, TST: Bring the NumPy C SIMD vectorization interface "NPYV" to Python
-rw-r--r--  .gitattributes                                1
-rw-r--r--  numpy/core/setup.py                          23
-rw-r--r--  numpy/core/src/_simd/_simd.c                 73
-rw-r--r--  numpy/core/src/_simd/_simd.dispatch.c.src   549
-rw-r--r--  numpy/core/src/_simd/_simd.h                 30
-rw-r--r--  numpy/core/src/_simd/_simd_arg.inc           85
-rw-r--r--  numpy/core/src/_simd/_simd_convert.inc      209
-rw-r--r--  numpy/core/src/_simd/_simd_data.inc.src      93
-rw-r--r--  numpy/core/src/_simd/_simd_easyintrin.inc   214
-rw-r--r--  numpy/core/src/_simd/_simd_inc.h.src        421
-rw-r--r--  numpy/core/src/_simd/_simd_vector.inc       178
-rw-r--r--  numpy/core/tests/test_simd.py               550
-rw-r--r--  numpy/core/tests/test_simd_module.py         97
-rw-r--r--  numpy/distutils/ccompiler_opt.py              7
-rw-r--r--  numpy/distutils/command/build.py             12
-rw-r--r--  numpy/distutils/command/build_ext.py          8
-rwxr-xr-x  runtests.py                                   5
17 files changed, 2549 insertions, 6 deletions
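
For orientation, a rough Python sketch of how the new testing module can be exercised, based on the attributes registered by the module-init code in this diff (the `targets` dict, the `baseline` sub-module, the `nlanes_*` constants, and one function per intrinsic); which targets are actually usable depends on the build and the running CPU, so the availability checks below are illustrative assumptions rather than guaranteed behaviour:

    # illustrative sketch; names are taken from the module-init code below
    from numpy.core._simd import targets

    npyv = targets["baseline"]            # one entry per dispatched target, plus "baseline"
    if npyv is not None and npyv.simd:    # None / simd == 0 when a target or NPYV is unavailable
        n = npyv.nlanes_f32               # number of float32 lanes for this build
        a = npyv.setall_f32(1.0)          # npyv_setall_f32 exposed as a module-level function
        b = npyv.add_f32(a, a)            # returns a vector object that supports len() and indexing
        assert list(b) == [2.0] * n
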
diff --git a/.gitattributes b/.gitattributes
index ad7d3b227..bce3dbe6d 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -7,6 +7,7 @@ doc/release/*.rst merge=union
# Highlight our custom templating language as C, since it's hopefully better
# than nothing. This also affects repo statistics.
*.c.src linguist-language=C
+*.inc.src linguist-language=C
*.h.src linguist-language=C
# Mark some files as vendored
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index b3e17baed..68aa0a851 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -626,6 +626,7 @@ def configuration(parent_package='',top_path=None):
config.add_include_dirs(join('src', 'multiarray'))
config.add_include_dirs(join('src', 'umath'))
config.add_include_dirs(join('src', 'npysort'))
+ config.add_include_dirs(join('src', '_simd'))
config.add_define_macros([("NPY_INTERNAL_BUILD", "1")]) # this macro indicates that Numpy build is in process
config.add_define_macros([("HAVE_NPY_CONFIG_H", "1")])
@@ -974,6 +975,28 @@ def configuration(parent_package='',top_path=None):
config.add_extension('_operand_flag_tests',
sources=[join('src', 'umath', '_operand_flag_tests.c.src')])
+ #######################################################################
+ # SIMD module #
+ #######################################################################
+
+ config.add_extension('_simd', sources=[
+ join('src', 'common', 'npy_cpu_features.c.src'),
+ join('src', '_simd', '_simd.c'),
+ join('src', '_simd', '_simd_inc.h.src'),
+ join('src', '_simd', '_simd_data.inc.src'),
+ join('src', '_simd', '_simd.dispatch.c.src'),
+ ], depends=[
+ join('src', 'common', 'npy_cpu_dispatch.h'),
+ join('src', 'common', 'simd', 'simd.h'),
+ join('src', '_simd', '_simd.h'),
+ join('src', '_simd', '_simd_inc.h.src'),
+ join('src', '_simd', '_simd_data.inc.src'),
+ join('src', '_simd', '_simd_arg.inc'),
+ join('src', '_simd', '_simd_convert.inc'),
+ join('src', '_simd', '_simd_easyintrin.inc'),
+ join('src', '_simd', '_simd_vector.inc'),
+ ])
+
config.add_subpackage('tests')
config.add_data_dir('tests/data')
config.add_data_dir('tests/examples')
diff --git a/numpy/core/src/_simd/_simd.c b/numpy/core/src/_simd/_simd.c
new file mode 100644
index 000000000..b1fdd4478
--- /dev/null
+++ b/numpy/core/src/_simd/_simd.c
@@ -0,0 +1,73 @@
+#include "_simd.h"
+
+PyMODINIT_FUNC PyInit__simd(void)
+{
+ static struct PyModuleDef defs = {
+ .m_base = PyModuleDef_HEAD_INIT,
+ .m_name = "numpy.core._simd",
+ .m_size = -1
+ };
+ if (npy_cpu_init() < 0) {
+ return NULL;
+ }
+ PyObject *m = PyModule_Create(&defs);
+ if (m == NULL) {
+ return NULL;
+ }
+ PyObject *targets = PyDict_New();
+ if (targets == NULL) {
+ goto err;
+ }
+ if (PyModule_AddObject(m, "targets", targets) < 0) {
+ Py_DECREF(targets);
+ goto err;
+ }
+ // attach a sub-module for each dispatched target; non-supported optimizations get None as the value
+ #define ATTACH_MODULE(TESTED_FEATURES, TARGET_NAME, MAKE_MSVC_HAPPY) \
+ { \
+ PyObject *simd_mod; \
+ if (!TESTED_FEATURES) { \
+ Py_INCREF(Py_None); \
+ simd_mod = Py_None; \
+ } else { \
+ simd_mod = NPY_CAT(simd_create_module_, TARGET_NAME)(); \
+ if (simd_mod == NULL) { \
+ goto err; \
+ } \
+ } \
+ const char *target_name = NPY_TOSTRING(TARGET_NAME); \
+ if (PyDict_SetItemString(targets, target_name, simd_mod) < 0) { \
+ Py_DECREF(simd_mod); \
+ goto err; \
+ } \
+ Py_INCREF(simd_mod); \
+ if (PyModule_AddObject(m, target_name, simd_mod) < 0) { \
+ Py_DECREF(simd_mod); \
+ goto err; \
+ } \
+ }
+
+ #define ATTACH_BASELINE_MODULE(MAKE_MSVC_HAPPY) \
+ { \
+ PyObject *simd_mod = simd_create_module(); \
+ if (simd_mod == NULL) { \
+ goto err; \
+ } \
+ if (PyDict_SetItemString(targets, "baseline", simd_mod) < 0) { \
+ Py_DECREF(simd_mod); \
+ goto err; \
+ } \
+ Py_INCREF(simd_mod); \
+ if (PyModule_AddObject(m, "baseline", simd_mod) < 0) { \
+ Py_DECREF(simd_mod); \
+ goto err; \
+ } \
+ }
+
+ NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, ATTACH_MODULE, MAKE_MSVC_HAPPY)
+ NPY__CPU_DISPATCH_BASELINE_CALL(ATTACH_BASELINE_MODULE, MAKE_MSVC_HAPPY)
+ return m;
+err:
+ Py_DECREF(m);
+ return NULL;
+}
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
new file mode 100644
index 000000000..2d89b9df0
--- /dev/null
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -0,0 +1,549 @@
+/*@targets $werror #simd_test*/
+#include "_simd.h"
+#include "_simd_inc.h"
+
+#if NPY_SIMD
+#include "_simd_data.inc"
+#include "_simd_convert.inc"
+#include "_simd_vector.inc"
+#include "_simd_arg.inc"
+#include "_simd_easyintrin.inc"
+
+/*************************************************************************
+ * Defining NPYV intrinsics as module functions
+ *************************************************************************/
+/**begin repeat
+ * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
+ * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
+ * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
+ * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
+ * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
+ * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
+ * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
+ */
+#if @simd_sup@
+/***************************
+ * Memory
+ ***************************/
+/**begin repeat1
+ * # intrin = load, loada, loads, loadl#
+ */
+SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, q@sfx@)
+/**end repeat1**/
+/**begin repeat1
+ * # intrin = store, storea, stores, storel, storeh#
+ */
+// special definition due to the nature of @intrin@
+static PyObject *
+simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+ simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
+ simd_arg vec_arg = {.dtype = simd_data_v@sfx@};
+ if (!PyArg_ParseTuple(
+ args, "O&O&:@intrin@_@sfx@",
+ simd_arg_converter, &seq_arg,
+ simd_arg_converter, &vec_arg
+ )) {
+ return NULL;
+ }
+ npyv_@intrin@_@sfx@(seq_arg.data.q@sfx@, vec_arg.data.v@sfx@);
+ // write-back
+ if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) {
+ simd_arg_free(&seq_arg);
+ return NULL;
+ }
+ simd_arg_free(&seq_arg);
+ Py_RETURN_NONE;
+}
+/**end repeat1**/
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if @ncont_sup@
+// Partial Load
+SIMD_IMPL_INTRIN_3(load_till_@sfx@, v@sfx@, q@sfx@, u32, @sfx@)
+SIMD_IMPL_INTRIN_2(load_tillz_@sfx@, v@sfx@, q@sfx@, u32)
+
+// Partial Store
+static PyObject *
+simd__intrin_store_till_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+ simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
+ simd_arg nlane_arg = {.dtype = simd_data_u32};
+ simd_arg vec_arg = {.dtype = simd_data_v@sfx@};
+ if (!PyArg_ParseTuple(
+ args, "O&O&O&:store_till_@sfx@",
+ simd_arg_converter, &seq_arg,
+ simd_arg_converter, &nlane_arg,
+ simd_arg_converter, &vec_arg
+ )) {
+ return NULL;
+ }
+ npyv_store_till_@sfx@(
+ seq_arg.data.q@sfx@, nlane_arg.data.u32, vec_arg.data.v@sfx@
+ );
+ // write-back
+ if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) {
+ simd_arg_free(&seq_arg);
+ return NULL;
+ }
+ simd_arg_free(&seq_arg);
+ Py_RETURN_NONE;
+}
+
+// Non-contiguous Load
+/**begin repeat1
+ * #intrin = loadn, loadn_till, loadn_tillz#
+ * #till = 0, 1, 1#
+ * #fill = 0, 1, 0#
+ * #format = , O&O&, O&#
+ */
+static PyObject *
+simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+ simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
+ simd_arg stride_arg = {.dtype = simd_data_s64};
+#if @till@
+ simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if @fill@
+ simd_arg fill_arg = {.dtype = simd_data_@sfx@};
+#endif
+ if (!PyArg_ParseTuple(
+ args, "@format@O&O&:@intrin@_@sfx@",
+ simd_arg_converter, &seq_arg,
+ simd_arg_converter, &stride_arg
+#if @till@
+ ,simd_arg_converter, &nlane_arg
+#endif
+#if @fill@
+ ,simd_arg_converter, &fill_arg
+#endif
+ )) {
+ return NULL;
+ }
+ npyv_lanetype_@sfx@ *seq_ptr = seq_arg.data.q@sfx@;
+ npy_intp stride = (npy_intp)stride_arg.data.s64;
+ Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+ Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@;
+ if (stride < 0) {
+ seq_ptr += cur_seq_len -1;
+ min_seq_len = -min_seq_len;
+ }
+ if (cur_seq_len < min_seq_len) {
+ PyErr_Format(PyExc_ValueError,
+ "@intrin@_@sfx@(), according to provided stride %d, the "
+ "minimum acceptable size of the required sequence is %d, given(%d)",
+ stride, min_seq_len, cur_seq_len
+ );
+ goto err;
+ }
+ npyv_@sfx@ rvec = npyv_@intrin@_@sfx@(
+ seq_ptr, stride
+ #if @till@
+ , nlane_arg.data.u32
+ #endif
+ #if @fill@
+ , fill_arg.data.@sfx@
+ #endif
+ );
+ simd_arg ret = {
+ .dtype = simd_data_v@sfx@, .data = {.v@sfx@=rvec}
+ };
+ simd_arg_free(&seq_arg);
+ return simd_arg_to_obj(&ret);
+err:
+ simd_arg_free(&seq_arg);
+ return NULL;
+}
+/**end repeat1**/
+
+// Non-contiguous Store
+/**begin repeat1
+ * #intrin = storen, storen_till#
+ * #till = 0, 1#
+ * #format = , O&#
+ */
+static PyObject *
+simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+ simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
+ simd_arg stride_arg = {.dtype = simd_data_s64};
+ simd_arg vec_arg = {.dtype = simd_data_v@sfx@};
+#if @till@
+ simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+ if (!PyArg_ParseTuple(
+ args, "@format@O&O&O&:storen_@sfx@",
+ simd_arg_converter, &seq_arg,
+ simd_arg_converter, &stride_arg
+#if @till@
+ ,simd_arg_converter, &nlane_arg
+#endif
+ ,simd_arg_converter, &vec_arg
+ )) {
+ return NULL;
+ }
+ npyv_lanetype_@sfx@ *seq_ptr = seq_arg.data.q@sfx@;
+ npy_intp stride = (npy_intp)stride_arg.data.s64;
+ Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+ Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@;
+ if (stride < 0) {
+ seq_ptr += cur_seq_len -1;
+ min_seq_len = -min_seq_len;
+ }
+ // overflow guard
+ if (cur_seq_len < min_seq_len) {
+ PyErr_Format(PyExc_ValueError,
+ "@intrin@_@sfx@(), according to provided stride %d, the"
+ "minimum acceptable size of the required sequence is %d, given(%d)",
+ stride, min_seq_len, cur_seq_len
+ );
+ goto err;
+ }
+ npyv_@intrin@_@sfx@(
+ seq_ptr, stride
+ #if @till@
+ ,nlane_arg.data.u32
+ #endif
+ ,vec_arg.data.v@sfx@
+ );
+ // write-back
+ if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) {
+ goto err;
+ }
+ simd_arg_free(&seq_arg);
+ Py_RETURN_NONE;
+err:
+ simd_arg_free(&seq_arg);
+ return NULL;
+}
+/**end repeat1**/
+#endif // @ncont_sup@
+
+
+/***************************
+ * Misc
+ ***************************/
+SIMD_IMPL_INTRIN_0(zero_@sfx@, v@sfx@)
+SIMD_IMPL_INTRIN_1(setall_@sfx@, v@sfx@, @sfx@)
+SIMD_IMPL_INTRIN_3(select_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@)
+
+/**begin repeat1
+ * #sfx_to = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ * #simd_sup2 = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
+ */
+#if @simd_sup2@
+SIMD_IMPL_INTRIN_1(reinterpret_@sfx_to@_@sfx@, v@sfx_to@, v@sfx@)
+#endif // simd_sup2
+/**end repeat1**/
+
+/**
+ * Special definition due to the nature of the intrinsics
+ * npyv_setf_@sfx@ and npyv_set_@sfx@.
+ */
+/**begin repeat1
+ * #intrin = setf, set#
+ */
+static PyObject *
+simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+ npyv_lanetype_@sfx@ *data = simd_sequence_from_iterable(args, simd_data_q@sfx@, npyv_nlanes_@sfx@);
+ if (data == NULL) {
+ return NULL;
+ }
+ simd_data r = {.v@sfx@ = npyv_@intrin@_@sfx@(
+ data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
+ data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15],
+ data[16], data[17], data[18], data[19], data[20], data[21], data[22], data[23],
+ data[24], data[25], data[26], data[27], data[28], data[29], data[30], data[31],
+ data[32], data[33], data[34], data[35], data[36], data[37], data[38], data[39],
+ data[40], data[41], data[42], data[43], data[44], data[45], data[46], data[47],
+ data[48], data[49], data[50], data[51], data[52], data[53], data[54], data[55],
+ data[56], data[57], data[58], data[59], data[60], data[61], data[62], data[63],
+ data[64] // for setf
+ )};
+ simd_sequence_free(data);
+ return (PyObject*)PySIMDVector_FromData(r, simd_data_v@sfx@);
+}
+/**end repeat1**/
+
+/***************************
+ * Reorder
+ ***************************/
+/**begin repeat1
+ * # intrin = combinel, combineh#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+
+/**begin repeat1
+ * # intrin = combine, zip#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@x2, v@sfx@, v@sfx@)
+/**end repeat1**/
+
+/***************************
+ * Operators
+ ***************************/
+#if @shl_imm@ > 0
+SIMD_IMPL_INTRIN_2(shl_@sfx@, v@sfx@, v@sfx@, u8)
+SIMD_IMPL_INTRIN_2(shr_@sfx@, v@sfx@, v@sfx@, u8)
+// immediate constant
+SIMD_IMPL_INTRIN_2IMM(shli_@sfx@, v@sfx@, v@sfx@, @shl_imm@)
+SIMD_IMPL_INTRIN_2IMM(shri_@sfx@, v@sfx@, v@sfx@, @shr_imm@)
+#endif // shl_imm
+
+/**begin repeat1
+ * #intrin = and, or, xor#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+
+SIMD_IMPL_INTRIN_1(not_@sfx@, v@sfx@, v@sfx@)
+
+/**begin repeat1
+ * #intrin = cmpeq, cmpneq, cmpgt, cmpge, cmplt, cmple#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@bsfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+
+/***************************
+ * Conversion
+ ***************************/
+SIMD_IMPL_INTRIN_1(cvt_@sfx@_@bsfx@, v@sfx@, v@bsfx@)
+SIMD_IMPL_INTRIN_1(cvt_@bsfx@_@sfx@, v@bsfx@, v@sfx@)
+
+/***************************
+ * Arithmetic
+ ***************************/
+/**begin repeat1
+ * #intrin = add, sub#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+
+#if @sat_sup@
+/**begin repeat1
+ * #intrin = adds, subs#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+#endif // sat_sup
+
+#if @mul_sup@
+SIMD_IMPL_INTRIN_2(mul_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+#endif // mul_sup
+
+#if @div_sup@
+SIMD_IMPL_INTRIN_2(div_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+#endif // div_sup
+
+#if @fused_sup@
+/**begin repeat1
+ * #intrin = muladd, mulsub, nmuladd, nmulsub#
+ */
+SIMD_IMPL_INTRIN_3(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+#endif // fused_sup
+
+#endif // simd_sup
+/**end repeat**/
+/***************************
+ * Variant
+ ***************************/
+SIMD_IMPL_INTRIN_0N(cleanup)
+
+/*************************************************************************
+ * Attach module functions
+ *************************************************************************/
+static PyMethodDef simd__intrinsics_methods[] = {
+/**begin repeat
+ * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
+ * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
+ * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
+ * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
+ * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
+ * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
+ * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
+ */
+#if @simd_sup@
+
+/***************************
+ * Memory
+ ***************************/
+/**begin repeat1
+ * # intrin = load, loada, loads, loadl, store, storea, stores, storel, storeh#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if @ncont_sup@
+/**begin repeat1
+ * #intrin = load_till, load_tillz, loadn, loadn_till, loadn_tillz,
+ * store_till, storen, storen_till#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+#endif // ncont_sup
+
+
+/***************************
+ * Misc
+ ***************************/
+/**begin repeat1
+ * #sfx_to = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ * #simd_sup2 = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
+ */
+#if @simd_sup2@
+SIMD_INTRIN_DEF(reinterpret_@sfx_to@_@sfx@)
+#endif // simd_sup2
+/**end repeat1**/
+
+/**begin repeat1
+ * # intrin = set, setf, setall, zero, select#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+
+/***************************
+ * Reorder
+ ***************************/
+/**begin repeat1
+ * # intrin = combinel, combineh, combine, zip#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+
+SIMD_INTRIN_DEF(cvt_@sfx@_@bsfx@)
+SIMD_INTRIN_DEF(cvt_@bsfx@_@sfx@)
+
+/***************************
+ * Operators
+ ***************************/
+#if @shl_imm@ > 0
+/**begin repeat1
+ * # intrin = shl, shr, shli, shri#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+#endif // shl_imm
+
+/**begin repeat1
+ * #intrin = and, or, xor, not, cmpeq, cmpneq, cmpgt, cmpge, cmplt, cmple#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+
+/***************************
+ * Conversion
+ ***************************/
+SIMD_INTRIN_DEF(cvt_@sfx@_@bsfx@)
+SIMD_INTRIN_DEF(cvt_@bsfx@_@sfx@)
+
+/***************************
+ * Arithmetic
+ ***************************/
+/**begin repeat1
+ * #intrin = add, sub#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+
+#if @sat_sup@
+/**begin repeat1
+ * #intrin = adds, subs#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+#endif // sat_sup
+
+#if @mul_sup@
+SIMD_INTRIN_DEF(mul_@sfx@)
+#endif // mul_sup
+
+#if @div_sup@
+SIMD_INTRIN_DEF(div_@sfx@)
+#endif // div_sup
+
+#if @fused_sup@
+/**begin repeat1
+ * #intrin = muladd, mulsub, nmuladd, nmulsub#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+#endif // fused_sup
+
+#endif // simd_sup
+/**end repeat**/
+
+/***************************
+ * Variant
+ ***************************/
+SIMD_INTRIN_DEF(cleanup)
+/***************************/
+{NULL, NULL, 0, NULL}
+}; // PyMethodDef
+
+#endif // NPY_SIMD
+
+/*************************************************************************
+ * Defining a separate module for each target
+ *************************************************************************/
+NPY_VISIBILITY_HIDDEN PyObject *
+NPY_CPU_DISPATCH_CURFX(simd_create_module)(void)
+{
+ static struct PyModuleDef defs = {
+ .m_base = PyModuleDef_HEAD_INIT,
+ .m_size = -1,
+ #ifdef NPY__CPU_TARGET_CURRENT
+ .m_name = "numpy.core._simd." NPY_TOSTRING(NPY__CPU_TARGET_CURRENT),
+ #else
+ .m_name = "numpy.core._simd.baseline",
+ #endif
+ #if NPY_SIMD
+ .m_methods = simd__intrinsics_methods
+ #else
+ .m_methods = NULL
+ #endif
+ };
+ PyObject *m = PyModule_Create(&defs);
+ if (m == NULL) {
+ return NULL;
+ }
+ if (PyModule_AddIntConstant(m, "simd", NPY_SIMD)) {
+ goto err;
+ }
+ if (PyModule_AddIntConstant(m, "simd_f64", NPY_SIMD_F64)) {
+ goto err;
+ }
+ if (PyModule_AddIntConstant(m, "simd_width", NPY_SIMD_WIDTH)) {
+ goto err;
+ }
+#if NPY_SIMD
+ if (PySIMDVectorType_Init(m)) {
+ goto err;
+ }
+ /**begin repeat
+ * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ */
+ if (PyModule_AddIntConstant(m, "nlanes_@sfx@", npyv_nlanes_@sfx@)) {
+ goto err;
+ }
+ /**end repeat**/
+#endif // NPY_SIMD
+ return m;
+err:
+ Py_DECREF(m);
+ return NULL;
+}
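
The store intrinsics wrapped above accept a mutable Python sequence and write the result lanes back into it via simd_sequence_fill_iterable(). A hedged sketch of that behaviour, reusing the `npyv` baseline sub-module from the earlier example and assuming an NPYV-enabled build:

    buf = [0] * npyv.nlanes_u32
    npyv.store_u32(buf, npyv.setall_u32(7))   # lanes are written back into buf
    assert buf == [7] * npyv.nlanes_u32
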
diff --git a/numpy/core/src/_simd/_simd.h b/numpy/core/src/_simd/_simd.h
new file mode 100644
index 000000000..d9905c801
--- /dev/null
+++ b/numpy/core/src/_simd/_simd.h
@@ -0,0 +1,30 @@
+/**
+ * A module to expose the NumPy C SIMD vectorization interface "NPYV" for testing purposes.
+ *
+ * Please keep this module independent of other C extension modules,
+ * since NPYV intrinsics may be involved in their functionality,
+ * which would make it harder to track down and detect errors.
+ *
+ * TODO: Add an independent Sphinx doc.
+ *
+ * Please add any new NPYV intrinsics in '_simd.dispatch.c.src'.
+ */
+#ifndef _SIMD_SIMD_H_
+#define _SIMD_SIMD_H_
+
+#include <Python.h>
+#include "numpy/npy_common.h"
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+// autogenerated, required for CPU dispatch macros
+#include "_simd.dispatch.h"
+#endif
+/**
+ * Create a new module for each required optimization, containing all NPYV intrinsics.
+ *
+ * If the required optimization is not supported by NPYV, the module still provides
+ * access to the NPYV constants NPY_SIMD, NPY_SIMD_F64, and NPY_SIMD_WIDTH, but without
+ * any intrinsics.
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_VISIBILITY_HIDDEN PyObject *simd_create_module, (void))
+#endif // _SIMD_SIMD_H_
diff --git a/numpy/core/src/_simd/_simd_arg.inc b/numpy/core/src/_simd/_simd_arg.inc
new file mode 100644
index 000000000..f5bcf5487
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_arg.inc
@@ -0,0 +1,85 @@
+/**
+ * This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and
+ * therefore must be built multiple times. Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN`
+ * symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was
+ * deemed too harmful to readability.
+ */
+/************************************
+ ** Protected Definitions
+ ************************************/
+static int
+simd_arg_from_obj(PyObject *obj, simd_arg *arg)
+{
+ assert(arg->dtype != 0);
+ const simd_data_info *info = simd_data_getinfo(arg->dtype);
+ if (info->is_scalar) {
+ arg->data = simd_scalar_from_number(obj, arg->dtype);
+ }
+ else if (info->is_sequence) {
+ unsigned min_seq_size = simd_data_getinfo(info->to_vector)->nlanes;
+ arg->data.qu8 = simd_sequence_from_iterable(obj, arg->dtype, min_seq_size);
+ }
+ else if (info->is_vectorx) {
+ arg->data = simd_vectorx_from_tuple(obj, arg->dtype);
+ }
+ else if (info->is_vector) {
+ arg->data = PySIMDVector_AsData((PySIMDVectorObject*)obj, arg->dtype);
+ } else {
+ arg->data.u64 = 0;
+ PyErr_Format(PyExc_RuntimeError,
+ "unhandled arg from obj type id:%d, name:%s", arg->dtype, info->pyname
+ );
+ return -1;
+ }
+ if (PyErr_Occurred()) {
+ return -1;
+ }
+ return 0;
+}
+
+static PyObject *
+simd_arg_to_obj(const simd_arg *arg)
+{
+ assert(arg->dtype != 0);
+ const simd_data_info *info = simd_data_getinfo(arg->dtype);
+ if (info->is_scalar) {
+ return simd_scalar_to_number(arg->data, arg->dtype);
+ }
+ if (info->is_sequence) {
+ return simd_sequence_to_list(arg->data.qu8, arg->dtype);
+ }
+ if (info->is_vectorx) {
+ return simd_vectorx_to_tuple(arg->data, arg->dtype);
+ }
+ if (info->is_vector) {
+ return (PyObject*)PySIMDVector_FromData(arg->data, arg->dtype);
+ }
+ PyErr_Format(PyExc_RuntimeError,
+ "unhandled arg to object type id:%d, name:%s", arg->dtype, info->pyname
+ );
+ return NULL;
+}
+
+static void
+simd_arg_free(simd_arg *arg)
+{
+ const simd_data_info *info = simd_data_getinfo(arg->dtype);
+ if (info->is_sequence) {
+ simd_sequence_free(arg->data.qu8);
+ }
+}
+
+static int
+simd_arg_converter(PyObject *obj, simd_arg *arg)
+{
+ if (obj != NULL) {
+ if (simd_arg_from_obj(obj, arg) < 0) {
+ return 0;
+ }
+ arg->obj = obj;
+ return Py_CLEANUP_SUPPORTED;
+ } else {
+ simd_arg_free(arg);
+ }
+ return 1;
+}
diff --git a/numpy/core/src/_simd/_simd_convert.inc b/numpy/core/src/_simd/_simd_convert.inc
new file mode 100644
index 000000000..f5bfc3f50
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_convert.inc
@@ -0,0 +1,209 @@
+/**
+ * This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and
+ * therefore must be built multiple times. Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN`
+ * symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was
+ * deemed too harmful to readability.
+ */
+/************************************
+ ** Protected Definitions
+ ************************************/
+static simd_data
+simd_scalar_from_number(PyObject *obj, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ assert(info->is_scalar && info->lane_size > 0);
+ simd_data data;
+ if (info->is_float) {
+ data.f64 = PyFloat_AsDouble(obj);
+ if (dtype == simd_data_f32){
+ data.f32 = (float)data.f64;
+ }
+ } else {
+ data.u64 = PyLong_AsUnsignedLongLongMask(obj);
+ }
+ return data;
+}
+
+static PyObject *
+simd_scalar_to_number(simd_data data, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ assert(info->is_scalar && info->lane_size > 0);
+ if (info->is_float) {
+ if (dtype == simd_data_f32) {
+ return PyFloat_FromDouble(data.f32);
+ }
+ return PyFloat_FromDouble(data.f64);
+ }
+ int leftb = (sizeof(npyv_lanetype_u64) - info->lane_size) * 8;
+ data.u64 <<= leftb;
+ if (info->is_signed) {
+ return PyLong_FromLongLong(data.s64 >> leftb);
+ }
+ return PyLong_FromUnsignedLongLong(data.u64 >> leftb);
+}
+
+typedef struct {
+ Py_ssize_t len;
+ void *ptr;
+} simd__alloc_data;
+
+static void *
+simd_sequence_new(Py_ssize_t len, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ assert(len > 0 && info->is_sequence && info->lane_size > 0);
+ size_t size = sizeof(simd__alloc_data) + len * info->lane_size + NPY_SIMD_WIDTH;
+ void *ptr = malloc(size);
+ if (ptr == NULL) {
+ return PyErr_NoMemory();
+ }
+ // align the pointer
+ simd__alloc_data *a_ptr = (simd__alloc_data *)(
+ ((uintptr_t)ptr + sizeof(simd__alloc_data) + NPY_SIMD_WIDTH) & ~(uintptr_t)(NPY_SIMD_WIDTH-1)
+ );
+ a_ptr[-1].len = len;
+ a_ptr[-1].ptr = ptr;
+ return a_ptr;
+}
+
+static Py_ssize_t
+simd_sequence_len(void const *ptr)
+{
+ return ((simd__alloc_data const*)ptr)[-1].len;
+}
+
+static void
+simd_sequence_free(void *ptr)
+{
+ free(((simd__alloc_data *)ptr)[-1].ptr);
+}
+
+static void *
+simd_sequence_from_iterable(PyObject *obj, simd_data_type dtype, Py_ssize_t min_size)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ assert(info->is_sequence && info->lane_size > 0);
+ PyObject *seq_obj = PySequence_Fast(obj, "expected a sequence");
+ if (seq_obj == NULL) {
+ return NULL;
+ }
+ Py_ssize_t seq_size = PySequence_Fast_GET_SIZE(seq_obj);
+ if (seq_size < min_size) {
+ PyErr_Format(PyExc_ValueError,
+ "minimum acceptable size of the required sequence is %d, given(%d)",
+ min_size, seq_size
+ );
+ return NULL;
+ }
+ npyv_lanetype_u8 *dst = simd_sequence_new(seq_size, dtype);
+ if (dst == NULL) {
+ return NULL;
+ }
+ PyObject **seq_items = PySequence_Fast_ITEMS(seq_obj);
+ for (Py_ssize_t i = 0; i < seq_size; ++i) {
+ simd_data data = simd_scalar_from_number(seq_items[i], info->to_scalar);
+ npyv_lanetype_u8 *sdst = dst + i * info->lane_size;
+ memcpy(sdst, &data.u64, info->lane_size);
+ }
+ Py_DECREF(seq_obj);
+
+ if (PyErr_Occurred()) {
+ simd_sequence_free(dst);
+ return NULL;
+ }
+ return dst;
+}
+
+static int
+simd_sequence_fill_iterable(PyObject *obj, const void *ptr, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ if (!PySequence_Check(obj)) {
+ PyErr_Format(PyExc_TypeError,
+ "a sequence object is required to fill %s", info->pyname
+ );
+ return -1;
+ }
+ const npyv_lanetype_u8 *src = ptr;
+ Py_ssize_t seq_len = simd_sequence_len(ptr);
+ for (Py_ssize_t i = 0; i < seq_len; ++i) {
+ const npyv_lanetype_u8 *ssrc = src + i * info->lane_size;
+ simd_data data;
+ memcpy(&data.u64, ssrc, info->lane_size);
+ PyObject *item = simd_scalar_to_number(data, info->to_scalar);
+ if (item == NULL) {
+ return -1;
+ }
+ if (PySequence_SetItem(obj, i, item) < 0) {
+ Py_DECREF(item);
+ return -1;
+ }
+ }
+ return 0;
+}
+
+static PyObject *
+simd_sequence_to_list(const void *ptr, simd_data_type dtype)
+{
+ PyObject *list = PyList_New(simd_sequence_len(ptr));
+ if (list == NULL) {
+ return NULL;
+ }
+ if (simd_sequence_fill_iterable(list, ptr, dtype) < 0) {
+ Py_DECREF(list);
+ return NULL;
+ }
+ return list;
+}
+
+static simd_data
+simd_vectorx_from_tuple(PyObject *obj, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ // NPYV currently only supports x2 and x3
+ assert(info->is_vectorx > 1 && info->is_vectorx < 4);
+
+ simd_data data = {.u64 = 0};
+ if (!PyTuple_Check(obj) || PyTuple_GET_SIZE(obj) != info->is_vectorx) {
+ PyErr_Format(PyExc_TypeError,
+ "a tuple of %d vector type %s is required",
+ info->is_vectorx, simd_data_getinfo(info->to_vector)->pyname
+ );
+ return data;
+ }
+ for (int i = 0; i < info->is_vectorx; ++i) {
+ PyObject *item = PyTuple_GET_ITEM(obj, i);
+ // get the max multi-vec and let the compiler do the rest
+ data.vu64x3.val[i] = PySIMDVector_AsData((PySIMDVectorObject*)item, info->to_vector).vu64;
+ if (PyErr_Occurred()) {
+ return data;
+ }
+ }
+ return data;
+}
+
+static PyObject *
+simd_vectorx_to_tuple(simd_data data, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ // NPYV currently only supports x2 and x3
+ assert(info->is_vectorx > 1 && info->is_vectorx < 4);
+
+ PyObject *tuple = PyTuple_New(info->is_vectorx);
+ if (tuple == NULL) {
+ return NULL;
+ }
+ for (int i = 0; i < info->is_vectorx; ++i) {
+ // get the max multi-vector and let the compiler handle the rest
+ simd_data vdata = {.vu64 = data.vu64x3.val[i]};
+ PyObject *item = (PyObject*)PySIMDVector_FromData(vdata, info->to_vector);
+ if (item == NULL) {
+ // TODO: improve the error message to include the item number
+ Py_DECREF(tuple);
+ return NULL;
+ }
+ PyTuple_SET_ITEM(tuple, i, item);
+ }
+ return tuple;
+}
diff --git a/numpy/core/src/_simd/_simd_data.inc.src b/numpy/core/src/_simd/_simd_data.inc.src
new file mode 100644
index 000000000..5c796487c
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_data.inc.src
@@ -0,0 +1,93 @@
+/**
+ * This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and
+ * therefore must be built multiple times. Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN`
+ * symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was
+ * deemed too harmful to readability.
+ */
+/************************************
+ ** Private Definitions
+ ************************************/
+static simd_data_info simd__data_registry[simd_data_end] =
+{
+ [simd_data_none] = {.pyname="none"},
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ * #sig = 0*4, 1*4, 0*2#
+ * #fp = 0*4, 0*4, 1*2#
+ * #name = int*8, float, float#
+ */
+ [simd_data_@sfx@] = {
+ .pyname="@name@", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@,
+ .is_scalar=1, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+ .lane_size = sizeof(npyv_lanetype_@sfx@)
+ },
+ /**end repeat**/
+ // sequences
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ * #sig = 0*4, 1*4, 0*2#
+ * #fp = 0*4, 0*4, 1*2#
+ * #name = int*8, float, float#
+ */
+ [simd_data_q@sfx@] = {
+ .pyname="[@name@]", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@,
+ .is_sequence=1, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+ .nlanes=npyv_nlanes_@sfx@, .lane_size = sizeof(npyv_lanetype_@sfx@)
+ },
+ /**end repeat**/
+ // vectors
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ * #sig = 0*4, 1*4, 0*2#
+ * #fp = 0*4, 0*4, 1*2#
+ */
+ [simd_data_v@sfx@] = {
+ .pyname="npyv_@sfx@", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@,
+ .is_vector=1, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+ .nlanes=npyv_nlanes_@sfx@, .lane_size = sizeof(npyv_lanetype_@sfx@)
+ },
+ /**end repeat**/
+ // boolean vectors, treated as unsigned and converted internally
+ // to add compatibility among all SIMD extensions
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64#
+ * #bsfx = b8, b16, b32, b64#
+ */
+ [simd_data_v@bsfx@] = {
+ .pyname="npyv_@bsfx@", .is_bool=1, .is_vector=1,
+ .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+ .nlanes=npyv_nlanes_@sfx@, .lane_size = sizeof(npyv_lanetype_@sfx@)
+ },
+ /**end repeat**/
+ // multi-vectors x2
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ * #sig = 0*4, 1*4, 0*2#
+ * #fp = 0*4, 0*4, 1*2#
+ */
+ [simd_data_v@sfx@x2] = {
+ .pyname="npyv_@sfx@x2", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@,
+ .is_vectorx=2, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+ .nlanes=2, .lane_size = sizeof(npyv_lanetype_@sfx@)
+ },
+ /**end repeat**/
+ // multi-vectors x3
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ * #sig = 0*4, 1*4, 0*2#
+ * #fp = 0*4, 0*4, 1*2#
+ */
+ [simd_data_v@sfx@x3] = {
+ .pyname="npyv_@sfx@x3", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@,
+ .is_vectorx=3, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+ .nlanes=3, .lane_size = sizeof(npyv_lanetype_@sfx@)
+ },
+ /**end repeat**/
+};
+
+/************************************
+ ** Protected Definitions
+ ************************************/
+static const simd_data_info *
+simd_data_getinfo(simd_data_type dtype)
+{ return &simd__data_registry[dtype]; }
diff --git a/numpy/core/src/_simd/_simd_easyintrin.inc b/numpy/core/src/_simd/_simd_easyintrin.inc
new file mode 100644
index 000000000..54e7ccf01
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_easyintrin.inc
@@ -0,0 +1,214 @@
+/**
+ * This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and
+ * therefore must be built multiple times. Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN`
+ * symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was
+ * deemed too harmful to readability.
+ */
+#define SIMD_INTRIN_DEF(NAME) \
+ { NPY_TOSTRING(NAME), simd__intrin_##NAME, METH_VARARGS, NULL } , // comma
+
+#define SIMD_IMPL_INTRIN_0(NAME, RET) \
+ static PyObject *simd__intrin_##NAME \
+ (PyObject* NPY_UNUSED(self), PyObject *args) \
+ { \
+ if (!PyArg_ParseTuple( \
+ args, ":" NPY_TOSTRING(NAME)) \
+ ) return NULL; \
+ simd_arg a = { \
+ .dtype = simd_data_##RET, \
+ .data = {.RET = npyv_##NAME()}, \
+ }; \
+ return simd_arg_to_obj(&a); \
+ }
+
+#define SIMD_IMPL_INTRIN_0N(NAME) \
+ static PyObject *simd__intrin_##NAME \
+ (PyObject* NPY_UNUSED(self), PyObject *args) \
+ { \
+ if (!PyArg_ParseTuple( \
+ args, ":" NPY_TOSTRING(NAME)) \
+ ) return NULL; \
+ npyv_##NAME(); \
+ Py_RETURN_NONE; \
+ }
+
+#define SIMD_IMPL_INTRIN_1(NAME, RET, IN0) \
+ static PyObject *simd__intrin_##NAME \
+ (PyObject* NPY_UNUSED(self), PyObject *args) \
+ { \
+ simd_arg arg = {.dtype = simd_data_##IN0}; \
+ if (!PyArg_ParseTuple( \
+ args, "O&:"NPY_TOSTRING(NAME), \
+ simd_arg_converter, &arg \
+ )) return NULL; \
+ simd_data data = {.RET = npyv_##NAME( \
+ arg.data.IN0 \
+ )}; \
+ simd_arg_free(&arg); \
+ simd_arg ret = { \
+ .data = data, .dtype = simd_data_##RET \
+ }; \
+ return simd_arg_to_obj(&ret); \
+ }
+
+#define SIMD_IMPL_INTRIN_2(NAME, RET, IN0, IN1) \
+ static PyObject *simd__intrin_##NAME \
+ (PyObject* NPY_UNUSED(self), PyObject *args) \
+ { \
+ simd_arg arg1 = {.dtype = simd_data_##IN0}; \
+ simd_arg arg2 = {.dtype = simd_data_##IN1}; \
+ if (!PyArg_ParseTuple( \
+ args, "O&O&:"NPY_TOSTRING(NAME), \
+ simd_arg_converter, &arg1, \
+ simd_arg_converter, &arg2 \
+ )) return NULL; \
+ simd_data data = {.RET = npyv_##NAME( \
+ arg1.data.IN0, arg2.data.IN1 \
+ )}; \
+ simd_arg_free(&arg1); \
+ simd_arg_free(&arg2); \
+ simd_arg ret = { \
+ .data = data, .dtype = simd_data_##RET \
+ }; \
+ return simd_arg_to_obj(&ret); \
+ }
+
+#define SIMD__REPEAT_2IMM(C, NAME, IN0) \
+ C == arg2.data.u8 ? NPY_CAT(npyv_, NAME)(arg1.data.IN0, C) :
+
+#define SIMD_IMPL_INTRIN_2IMM(NAME, RET, IN0, CONST_RNG) \
+ static PyObject *simd__intrin_##NAME \
+ (PyObject* NPY_UNUSED(self), PyObject *args) \
+ { \
+ simd_arg arg1 = {.dtype = simd_data_##IN0}; \
+ simd_arg arg2 = {.dtype = simd_data_u8}; \
+ if (!PyArg_ParseTuple( \
+ args, "O&O&:"NPY_TOSTRING(NAME), \
+ simd_arg_converter, &arg1, \
+ simd_arg_converter, &arg2 \
+ )) return NULL; \
+ simd_data data; \
+ data.RET = NPY_CAT(SIMD__IMPL_COUNT_, CONST_RNG)( \
+ SIMD__REPEAT_2IMM, NAME, IN0 \
+ ) npyv_##NAME(arg1.data.IN0, 0); \
+ simd_arg_free(&arg1); \
+ simd_arg ret = { \
+ .data = data, .dtype = simd_data_##RET \
+ }; \
+ return simd_arg_to_obj(&ret); \
+ }
+
+#define SIMD_IMPL_INTRIN_3(NAME, RET, IN0, IN1, IN2) \
+ static PyObject *simd__intrin_##NAME \
+ (PyObject* NPY_UNUSED(self), PyObject *args) \
+ { \
+ simd_arg arg1 = {.dtype = simd_data_##IN0}; \
+ simd_arg arg2 = {.dtype = simd_data_##IN1}; \
+ simd_arg arg3 = {.dtype = simd_data_##IN2}; \
+ if (!PyArg_ParseTuple( \
+ args, "O&O&O&:"NPY_TOSTRING(NAME), \
+ simd_arg_converter, &arg1, \
+ simd_arg_converter, &arg2, \
+ simd_arg_converter, &arg3 \
+ )) return NULL; \
+ simd_data data = {.RET = npyv_##NAME( \
+ arg1.data.IN0, arg2.data.IN1, \
+ arg3.data.IN2 \
+ )}; \
+ simd_arg_free(&arg1); \
+ simd_arg_free(&arg2); \
+ simd_arg_free(&arg3); \
+ simd_arg ret = { \
+ .data = data, .dtype = simd_data_##RET \
+ }; \
+ return simd_arg_to_obj(&ret); \
+ }
+/**
+ * Helper macros for repeating and expanding a certain macro.
+ * Mainly used for converting a scalar to an immediate constant.
+ */
+#define SIMD__IMPL_COUNT_7(FN, ...) \
+ NPY_EXPAND(FN(0, __VA_ARGS__)) \
+ SIMD__IMPL_COUNT_7_(FN, __VA_ARGS__)
+
+#define SIMD__IMPL_COUNT_8(FN, ...) \
+ SIMD__IMPL_COUNT_7_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(8, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_15(FN, ...) \
+ NPY_EXPAND(FN(0, __VA_ARGS__)) \
+ SIMD__IMPL_COUNT_15_(FN, __VA_ARGS__)
+
+#define SIMD__IMPL_COUNT_16(FN, ...) \
+ SIMD__IMPL_COUNT_15_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(16, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_31(FN, ...) \
+ NPY_EXPAND(FN(0, __VA_ARGS__)) \
+ SIMD__IMPL_COUNT_31_(FN, __VA_ARGS__)
+
+#define SIMD__IMPL_COUNT_32(FN, ...) \
+ SIMD__IMPL_COUNT_31_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(32, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_47(FN, ...) \
+ NPY_EXPAND(FN(0, __VA_ARGS__)) \
+ SIMD__IMPL_COUNT_47_(FN, __VA_ARGS__)
+
+#define SIMD__IMPL_COUNT_48(FN, ...) \
+ SIMD__IMPL_COUNT_47_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(48, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_63(FN, ...) \
+ NPY_EXPAND(FN(0, __VA_ARGS__)) \
+ SIMD__IMPL_COUNT_63_(FN, __VA_ARGS__)
+
+#define SIMD__IMPL_COUNT_64(FN, ...) \
+ SIMD__IMPL_COUNT_63_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(64, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_7_(FN, ...) \
+ NPY_EXPAND(FN(1, __VA_ARGS__)) \
+ NPY_EXPAND(FN(2, __VA_ARGS__)) NPY_EXPAND(FN(3, __VA_ARGS__)) \
+ NPY_EXPAND(FN(4, __VA_ARGS__)) NPY_EXPAND(FN(5, __VA_ARGS__)) \
+ NPY_EXPAND(FN(6, __VA_ARGS__)) NPY_EXPAND(FN(7, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_15_(FN, ...) \
+ SIMD__IMPL_COUNT_7_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(8, __VA_ARGS__)) NPY_EXPAND(FN(9, __VA_ARGS__)) \
+ NPY_EXPAND(FN(10, __VA_ARGS__)) NPY_EXPAND(FN(11, __VA_ARGS__)) \
+ NPY_EXPAND(FN(12, __VA_ARGS__)) NPY_EXPAND(FN(13, __VA_ARGS__)) \
+ NPY_EXPAND(FN(14, __VA_ARGS__)) NPY_EXPAND(FN(15, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_31_(FN, ...) \
+ SIMD__IMPL_COUNT_15_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(16, __VA_ARGS__)) NPY_EXPAND(FN(17, __VA_ARGS__)) \
+ NPY_EXPAND(FN(18, __VA_ARGS__)) NPY_EXPAND(FN(19, __VA_ARGS__)) \
+ NPY_EXPAND(FN(20, __VA_ARGS__)) NPY_EXPAND(FN(21, __VA_ARGS__)) \
+ NPY_EXPAND(FN(22, __VA_ARGS__)) NPY_EXPAND(FN(23, __VA_ARGS__)) \
+ NPY_EXPAND(FN(24, __VA_ARGS__)) NPY_EXPAND(FN(25, __VA_ARGS__)) \
+ NPY_EXPAND(FN(26, __VA_ARGS__)) NPY_EXPAND(FN(27, __VA_ARGS__)) \
+ NPY_EXPAND(FN(28, __VA_ARGS__)) NPY_EXPAND(FN(29, __VA_ARGS__)) \
+ NPY_EXPAND(FN(30, __VA_ARGS__)) NPY_EXPAND(FN(31, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_47_(FN, ...) \
+ SIMD__IMPL_COUNT_31_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(32, __VA_ARGS__)) NPY_EXPAND(FN(33, __VA_ARGS__)) \
+ NPY_EXPAND(FN(34, __VA_ARGS__)) NPY_EXPAND(FN(35, __VA_ARGS__)) \
+ NPY_EXPAND(FN(36, __VA_ARGS__)) NPY_EXPAND(FN(37, __VA_ARGS__)) \
+ NPY_EXPAND(FN(38, __VA_ARGS__)) NPY_EXPAND(FN(39, __VA_ARGS__)) \
+ NPY_EXPAND(FN(40, __VA_ARGS__)) NPY_EXPAND(FN(41, __VA_ARGS__)) \
+ NPY_EXPAND(FN(42, __VA_ARGS__)) NPY_EXPAND(FN(43, __VA_ARGS__)) \
+ NPY_EXPAND(FN(44, __VA_ARGS__)) NPY_EXPAND(FN(45, __VA_ARGS__)) \
+ NPY_EXPAND(FN(46, __VA_ARGS__)) NPY_EXPAND(FN(47, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_63_(FN, ...) \
+ SIMD__IMPL_COUNT_47_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(48, __VA_ARGS__)) NPY_EXPAND(FN(49, __VA_ARGS__)) \
+ NPY_EXPAND(FN(50, __VA_ARGS__)) NPY_EXPAND(FN(51, __VA_ARGS__)) \
+ NPY_EXPAND(FN(52, __VA_ARGS__)) NPY_EXPAND(FN(53, __VA_ARGS__)) \
+ NPY_EXPAND(FN(54, __VA_ARGS__)) NPY_EXPAND(FN(55, __VA_ARGS__)) \
+ NPY_EXPAND(FN(56, __VA_ARGS__)) NPY_EXPAND(FN(57, __VA_ARGS__)) \
+ NPY_EXPAND(FN(58, __VA_ARGS__)) NPY_EXPAND(FN(59, __VA_ARGS__)) \
+ NPY_EXPAND(FN(60, __VA_ARGS__)) NPY_EXPAND(FN(61, __VA_ARGS__)) \
+ NPY_EXPAND(FN(62, __VA_ARGS__)) NPY_EXPAND(FN(63, __VA_ARGS__))
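
SIMD_IMPL_INTRIN_2IMM together with the SIMD__IMPL_COUNT_* helpers above maps a run-time Python argument onto a chain of compile-time immediates, so the shift-by-immediate intrinsics stay callable with an ordinary integer. A hedged sketch of the resulting Python-level behaviour, under the same assumptions as the earlier examples:

    v = npyv.setall_u16(1)
    assert list(npyv.shli_u16(v, 3)) == [1 << 3] * npyv.nlanes_u16
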
diff --git a/numpy/core/src/_simd/_simd_inc.h.src b/numpy/core/src/_simd/_simd_inc.h.src
new file mode 100644
index 000000000..9858fc0dc
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_inc.h.src
@@ -0,0 +1,421 @@
+#ifndef _SIMD_SIMD_INC_H_
+#define _SIMD_SIMD_INC_H_
+
+#include <Python.h>
+#include "simd/simd.h"
+
+#if NPY_SIMD
+/************************************
+ ** Types
+ ************************************/
+/**
+ * Gather all data types supported by the module.
+*/
+typedef union
+{
+ // scalars
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ npyv_lanetype_@sfx@ @sfx@;
+ /**end repeat**/
+ // sequence
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ npyv_lanetype_@sfx@ *q@sfx@;
+ /**end repeat**/
+ // vectors
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, b8, b16, b32, b64#
+ */
+ npyv_@sfx@ v@sfx@;
+ /**end repeat**/
+ // multi-vectors x2
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32#
+ */
+ npyv_@sfx@x2 v@sfx@x2;
+ /**end repeat**/
+ // multi-vectors x3
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32#
+ */
+ npyv_@sfx@x3 v@sfx@x3;
+ /**end repeat**/
+#if NPY_SIMD_F64
+ npyv_f64 vf64;
+ npyv_f64x2 vf64x2;
+ npyv_f64x3 vf64x3;
+#endif
+} simd_data;
+
+/**
+ * Data type IDs and suffixes. Must be the same data types as the ones
+ * in union 'simd_data' to fit the macros in '_simd_easyintrin.inc'.
+ */
+typedef enum
+{
+ simd_data_none = 0,
+ // scalars
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ simd_data_@sfx@,
+ /**end repeat**/
+ // sequences
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ simd_data_q@sfx@,
+ /**end repeat**/
+ // vectors
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64, b8, b16, b32, b64#
+ */
+ simd_data_v@sfx@,
+ /**end repeat**/
+ // multi-vectors x2
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ simd_data_v@sfx@x2,
+ /**end repeat**/
+ // multi-vectors x3
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ simd_data_v@sfx@x3,
+ /**end repeat**/
+ simd_data_end,
+} simd_data_type;
+/************************************
+ ** Declarations (inc_data)
+ ************************************/
+/**
+ * simd_data_type information
+ */
+typedef struct
+{
+ // type name compatible with Python style
+ const char *pyname;
+ // returns '1' if the type represents an unsigned integer
+ int is_unsigned:1;
+ // returns '1' if the type represents a signed integer
+ int is_signed:1;
+ // returns '1' if the type represents a single or double precision float
+ int is_float:1;
+ // returns '1' if the type represents a boolean
+ int is_bool:1;
+ // returns '1' if the type represents a sequence
+ int is_sequence:1;
+ // returns '1' if the type represents a scalar
+ int is_scalar:1;
+ // returns '1' if the type represents a vector
+ int is_vector:1;
+ // returns the length of the multi-vector if the type represents an x2 or x3 vector,
+ // otherwise returns 0, e.g. returns 2 if data type is simd_data_vu8x2
+ int is_vectorx;
+ // returns the equivalent scalar data type e.g. simd_data_vu8 -> simd_data_u8
+ simd_data_type to_scalar;
+ // returns the equivalent vector data type e.g. simd_data_s8 -> simd_data_vs8
+ // NOTE: returns the equivalent "unsigned" vector type in the case of a "boolean" vector,
+ // e.g. simd_data_vb8 -> simd_data_vu8
+ simd_data_type to_vector;
+ // number of vector lanes
+ int nlanes;
+ // size of the lane type in bytes
+ int lane_size;
+} simd_data_info;
+
+/**
+ * Returns the data info of a certain dtype.
+ *
+ * Example:
+ ** const simd_data_info *info = simd_data_getinfo(simd_data_vu8);
+ ** if (info->is_vector && info->is_unsigned) {
+ ** ...
+ ** }
+ */
+static const simd_data_info *
+simd_data_getinfo(simd_data_type dtype);
+
+/************************************
+ ** Declarations (inc_vector)
+ ************************************/
+typedef struct
+{
+ PyObject_HEAD
+ // vector type id
+ simd_data_type dtype;
+ // vector data, aligned for safe casting
+ npyv_lanetype_u8 NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) data[NPY_SIMD_WIDTH];
+} PySIMDVectorObject;
+/**
+ * Create a Python obj(PySIMDVectorObject) from an NPYV vector based on the contents
+ * of `data`(simd_data) and according to the vector data type `dtype`
+ * on range(simd_data_[vu8:vf64]).
+ * Return NULL and a Python exception on failure, otherwise new reference.
+ *
+ * Example:
+ ** simd_data data = {.vu8 = npyv_setall_u8(0xff)};
+ ** PySIMDVectorObject *obj = PySIMDVector_FromData(data, simd_data_vu8);
+ ** if (obj != NULL) {
+ ** printf("I have a valid vector obj and first element is \n", obj->data[0]);
+ ** Py_DECREF(obj);
+ ** }
+ */
+static PySIMDVectorObject *
+PySIMDVector_FromData(simd_data data, simd_data_type dtype);
+/**
+ * Return an NPYV vector(simd_data) representation of `obj`(PySIMDVectorObject)
+ * according to the vector data type `dtype` on range (simd_data_[vu8:vf64]).
+ * Raise a Python exception on failure.
+ *
+ * Example:
+ ** simd_data data = PySIMDVector_AsData(vec_obj, simd_data_vf32);
+ ** if (!PyErr_Occurred()) {
+ ** npyv_f32 add_1 = npyv_add_f32(data.vf32, npyv_setall_f32(1));
+ ** ...
+ ** }
+ */
+static simd_data
+PySIMDVector_AsData(PySIMDVectorObject *obj, simd_data_type dtype);
+/**
+ * Initialize and register PySIMDVectorType in a certain PyModule;
+ * PySIMDVectorType can be reached through the attribute 'vector_type'.
+ * Return -1 on error, 0 on success.
+ */
+static int
+PySIMDVectorType_Init(PyObject *module);
+
+/************************************
+ ** Declarations (inc_convert)
+ ************************************/
+/**
+ * Return a C scalar(simd_data) representation of `obj`
+ * according to the scalar data type `dtype` on range (simd_data_[u8:f64]).
+ * Raise a Python exception on failure.
+ *
+ * Example:
+ ** simd_data data = simd_scalar_from_number(obj, simd_data_f32);
+ ** if (!PyErr_Occurred()) {
+ ** printf("I have a valid float %d\n", data.f32);
+ ** }
+ */
+static simd_data
+simd_scalar_from_number(PyObject *obj, simd_data_type dtype);
+/**
+ * Create a Python scalar from a C scalar based on the contents
+ * of `data`(simd_data) and according to the scalar data type `dtype`
+ * on range(simd_data_[u8:f64]).
+ * Return NULL and a Python exception on failure, otherwise new reference.
+ *
+ * Example:
+ ** simd_data data = {.u32 = 0x7fffffff};
+ ** PyObject *obj = simd_scalar_to_number(data, simd_data_s32);
+ ** if (obj != NULL) {
+ ** printf("I have a valid Python integer %d\n", PyLong_AsLong(obj));
+ ** Py_DECREF(obj);
+ ** }
+ */
+static PyObject *
+simd_scalar_to_number(simd_data data, simd_data_type dtype);
+/**
+ * Allocate a C array in memory according to number of elements `len`
+ * and sequence data type `dtype` on range(simd_data_[qu8:qf64]).
+ *
+ * Return aligned pointer based on `NPY_SIMD_WIDTH` or NULL
+ * with a Python exception on failure.
+ *
+ * Example:
+ ** npyv_lanetype_f64 *aligned_ptr = simd_sequence_new(npyv_nlanes_f64, simd_data_f64);
+ ** if (aligned_ptr != NULL) {
+ ** // aligned store
+ ** npyv_storea_f64(aligned_ptr, npyv_setall_f64(1.0));
+ ** printf("The first element of my array %f\n", aligned_ptr[0]);
+ ** simd_sequence_free(aligned_ptr);
+ ** }
+ */
+static void *
+simd_sequence_new(Py_ssize_t len, simd_data_type dtype);
+/**
+ * Return the number of elements of the allocated C array `ptr`
+ * by `simd_sequence_new()` or `simd_sequence_from_iterable()`.
+ */
+static Py_ssize_t
+simd_sequence_len(const void *ptr);
+/**
+ * Free the allocated C array by `simd_sequence_new()` or
+ * `simd_sequence_from_iterable()`.
+ */
+static void
+simd_sequence_free(void *ptr);
+/**
+ * Return a C array representation of a PyObject sequence `obj`
+ * according to the sequence data type `dtype` on range (simd_data_[qu8:qf64]).
+ *
+ * Note: parameter `min_size` takes the number of minimum acceptable elements.
+ *
+ * Return aligned pointer based on `NPY_SIMD_WIDTH` or NULL
+ * with a Python exception on failure.
+ *
+ * Example:
+ ** npyv_lanetype_u32 *ptr = simd_sequence_from_iterable(seq_obj, simd_data_qu32, npyv_nlanes_u32);
+ ** if (ptr != NULL) {
+ ** npyv_u32 a = npyv_load_u32(ptr);
+ ** ...
+ ** simd_sequence_free(ptr);
+ ** }
+ **
+ */
+static void *
+simd_sequence_from_iterable(PyObject *obj, simd_data_type dtype, Py_ssize_t min_size);
+/**
+ * Fill a Python sequence object `obj` with a C array `ptr` allocated by
+ * `simd_sequence_new()` or `simd_sequence_from_iterable()` according
+ * to the sequence data type `dtype` on range (simd_data_[qu8:qf64]).
+ *
+ * Return 0 on success or -1 with a Python exception on failure.
+ */
+static int
+simd_sequence_fill_iterable(PyObject *obj, const void *ptr, simd_data_type dtype);
+/**
+ * Create a Python list from a C array `ptr` allocated by
+ * `simd_sequence_new()` or `simd_sequence_from_iterable()` according
+ * to the sequence data type `dtype` on range (simd_data_[qu8:qf64]).
+ *
+ * Return NULL and a Python exception on failure, otherwise new reference.
+ */
+static PyObject *
+simd_sequence_to_list(const void *ptr, simd_data_type dtype);
+/**
+ * Return a SIMD multi-vector(simd_data) representation of a Python tuple of
+ * (simd_vector*,) `obj` according to the multi-vector data type `dtype`
+ * on range (simd_data_[vu8x2:vf64x2])-(simd_data_[vu8x3:vf64x3]).
+ *
+ * Raise a Python exception on failure.
+ *
+ * Example:
+ ** simd_data data = simd_vectorx_from_tuple(tuple_obj, simd_data_vf32x2);
+ ** if (!PyErr_Occurred()) {
+ ** npyv_f32 sum = npyv_add_f32(data.vf32x2.val[0], data.vf32x2.val[1]);
+ ** ...
+ ** }
+ **
+ */
+static simd_data
+simd_vectorx_from_tuple(PyObject *obj, simd_data_type dtype);
+/**
+ * Create a Python tuple of 'simd_vector' from a SIMD multi-vector
+ * based on the contents of `data`(simd_data) and according to
+ * the multi-vector data type `dtype` on range
+ * (simd_data_[vu8x2:vf64x2])-(simd_data_[vu8x3:vf64x3]).
+ *
+ * Return NULL and a Python exception on failure, otherwise new reference.
+ */
+static PyObject *
+simd_vectorx_to_tuple(simd_data data, simd_data_type dtype);
+
+/************************************
+ ** Declarations (inc_arg)
+ ************************************/
+typedef struct
+{
+ simd_data_type dtype;
+ simd_data data;
+ // set by simd_arg_converter()
+ PyObject *obj;
+} simd_arg;
+/**
+ * The following functions gather all conversions between all data types,
+ * and they can be used instead of all the functions above.
+ */
+/**
+ * Convert a Python object `obj` into simd_data `arg->data` according to the
+ * required data type `arg->dtype`.
+ *
+ * Return -1 and raise Python exception on failure, otherwise return 0.
+ *
+ * Notes:
+ * - requires `simd_arg_free()` or `simd_sequence_free()`
+ * to free allocated C array, in case of sequence data types.
+ * - the number of minimum acceptable elements for sequence data
+ * types is the number of lanes of the equivalent vector data type.
+ *
+ * Example #1:
+ ** simd_arg arg = {.dtype = simd_data_qu8};
+ ** if (simd_arg_from_obj(seq_obj, &arg) < 0) {
+ ** // fails to convert a python sequence object to C array of uint8
+ ** return;
+ ** }
+ ** npyv_u8 v_u8 = npyv_load_u8(arg.data.qu8);
+ ** ...
+ ** simd_arg_free(&arg);
+ *
+ * Example #2:
+ ** simd_arg arg = {.dtype = simd_data_vf32};
+ ** if (simd_arg_from_obj(vector_obj, &arg) < 0) {
+ ** // fails to convert a python simd_vector to NPYV vector
+ ** return;
+ ** }
+ ** npyv_f32 add_one = npyv_add_f32(arg.data.vf32, npyv_setall_f32(1));
+ ** ...
+ */
+static int
+simd_arg_from_obj(PyObject *obj, simd_arg *arg);
+/**
+ * Convert a simd_data `arg->data` into a Python object according to the
+ * required data type `arg->dtype`.
+ *
+ * Return NULL and raise Python exception on failure, otherwise return
+ * new reference.
+ *
+ * Example:
+ ** simd_arg arg = {.dtype = simd_data_u32, .data = {.u32 = 0xffffffff}};
+ ** PyObject *obj = simd_arg_to_obj(&arg);
+ ** if (obj == NULL) {
+ ** // fails to convert C uint32 to a Python integer.
+ ** return;
+ ** }
+ **
+ */
+static PyObject *
+simd_arg_to_obj(const simd_arg *arg);
+/**
+ * Converter function similar to simd_arg_from_obj(), but
+ * intended for use with PyArg_Parse*().
+ *
+ * Notes:
+ * - requires `simd_arg_free()` or `simd_sequence_free()`
+ * to free allocated C array, in case of sequence data types.
+ * - the number of minimum acceptable elements for sequence data
+ * types is the number of lanes of the equivalent vector data type.
+ * - use 'arg->obj' to retrieve the parameter obj.
+ *
+ * Example:
+ ** simd_arg seq_f32 = {.dtype = simd_data_qf32};
+ ** simd_arg vec_f32 = {.dtype = simd_data_vf32};
+ ** if (!PyArg_ParseTuple(
+ ** args, "O&O&:add_sum_f32",
+ ** simd_arg_converter, &seq_f32,
+ ** simd_arg_converter, &vec_f32
+ ** )) {
+ ** // fail
+ ** return;
+ ** }
+ ** npyv_f32 load_a = npyv_load_f32(seq_f32.data.qf32);
+ ** npyv_f32 sum = npyv_add_f32(load_a, vec_f32.data.vf32);
+ ** ...
+ ** simd_arg_free(&seq_f32);
+ */
+static int
+simd_arg_converter(PyObject *obj, simd_arg *arg);
+/**
+ * Free the allocated C array if the arg holds a sequence data type.
+ */
+static void
+simd_arg_free(simd_arg *arg);
+
+#endif // NPY_SIMD
+#endif // _SIMD_SIMD_INC_H_
diff --git a/numpy/core/src/_simd/_simd_vector.inc b/numpy/core/src/_simd/_simd_vector.inc
new file mode 100644
index 000000000..2a1378f22
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_vector.inc
@@ -0,0 +1,178 @@
+/**
+ * This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and
+ * therefore must be built multiple times. Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN`
+ * symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was
+ * deemed too harmful to readability.
+ */
+/************************************
+ ** Private Definitions
+ ************************************/
+static Py_ssize_t
+simd__vector_length(PySIMDVectorObject *self)
+{
+ return simd_data_getinfo(self->dtype)->nlanes;
+}
+static PyObject *
+simd__vector_item(PySIMDVectorObject *self, Py_ssize_t i)
+{
+ const simd_data_info *info = simd_data_getinfo(self->dtype);
+ int nlanes = info->nlanes;
+ if (i >= nlanes) {
+ PyErr_SetString(PyExc_IndexError, "vector index out of range");
+ return NULL;
+ }
+ npyv_lanetype_u8 *src = self->data + i * info->lane_size;
+ simd_data data;
+ memcpy(&data.u64, src, info->lane_size);
+ return simd_scalar_to_number(data, info->to_scalar);
+}
+
+static PySequenceMethods simd__vector_as_sequence = {
+ .sq_length = (lenfunc) simd__vector_length,
+ .sq_item = (ssizeargfunc) simd__vector_item
+};
+
+static PyObject *
+simd__vector_name(PySIMDVectorObject *self)
+{
+ return PyUnicode_FromString(simd_data_getinfo(self->dtype)->pyname);
+}
+static PyGetSetDef simd__vector_getset[] = {
+ { "__name__", (getter)simd__vector_name, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL, NULL }
+};
+
+static PyObject *
+simd__vector_repr(PySIMDVectorObject *self)
+{
+ PyObject *obj = PySequence_List((PyObject*)self);
+ if (obj != NULL) {
+ const char *type_name = simd_data_getinfo(self->dtype)->pyname;
+ PyObject *repr = PyUnicode_FromFormat("<%s of %R>", type_name, obj);
+ Py_DECREF(obj);
+ return repr;
+ }
+ return obj;
+}
+static PyObject *
+simd__vector_compare(PyObject *self, PyObject *other, int cmp_op)
+{
+ PyObject *obj;
+ if (PyTuple_Check(other)) {
+ obj = PySequence_Tuple(self);
+ } else if (PyList_Check(other)) {
+ obj = PySequence_List(self);
+ } else {
+ obj = PySequence_Fast(self, "invalid argument, expected a vector");
+ }
+ if (obj != NULL) {
+ PyObject *rich = PyObject_RichCompare(obj, other, cmp_op);
+ Py_DECREF(obj);
+ return rich;
+ }
+ return obj;
+}
+static PyTypeObject PySIMDVectorType = {
+ PyVarObject_HEAD_INIT(NULL, 0)
+ .tp_name = NPY_TOSTRING(NPY_CPU_DISPATCH_CURFX(VECTOR)),
+ .tp_basicsize = sizeof(PySIMDVectorObject),
+ .tp_repr = (reprfunc)simd__vector_repr,
+ .tp_as_sequence = &simd__vector_as_sequence,
+ .tp_flags = Py_TPFLAGS_DEFAULT,
+ .tp_richcompare = simd__vector_compare,
+ .tp_getset = simd__vector_getset
+};
+
+/************************************
+ ** Protected Definitions
+ ************************************/
+static PySIMDVectorObject *
+PySIMDVector_FromData(simd_data data, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ assert(info->is_vector && info->nlanes > 0);
+
+ PySIMDVectorObject *vec = PyObject_New(PySIMDVectorObject, &PySIMDVectorType);
+ if (vec == NULL) {
+ return (PySIMDVectorObject*)PyErr_NoMemory();
+ }
+ vec->dtype = dtype;
+ if (info->is_bool) {
+        // boolean vectors are internally treated as unsigned
+        // vectors for compatibility across all SIMD extensions
+ switch(dtype) {
+ case simd_data_vb8:
+ data.vu8 = npyv_cvt_u8_b8(data.vb8);
+ break;
+ case simd_data_vb16:
+ data.vu16 = npyv_cvt_u16_b16(data.vb16);
+ break;
+ case simd_data_vb32:
+ data.vu32 = npyv_cvt_u32_b32(data.vb32);
+ break;
+ default:
+ data.vu64 = npyv_cvt_u64_b64(data.vb64);
+ }
+ }
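+    // store the raw vector bytes; `dtype` determines how they are interpreted later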
+ npyv_store_u8(vec->data, data.vu8);
+ return vec;
+}
+
+static simd_data
+PySIMDVector_AsData(PySIMDVectorObject *vec, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ assert(info->is_vector && info->nlanes > 0);
+
+ simd_data data = {.u64 = 0};
+ if (!PyObject_IsInstance(
+ (PyObject *)vec, (PyObject *)&PySIMDVectorType
+ )) {
+ PyErr_Format(PyExc_TypeError,
+ "a vector type %s is required", info->pyname
+ );
+ return data;
+ }
+ if (vec->dtype != dtype) {
+ PyErr_Format(PyExc_TypeError,
+ "a vector type %s is required, got(%s)",
+ info->pyname, simd_data_getinfo(vec->dtype)->pyname
+ );
+ return data;
+ }
+
+ data.vu8 = npyv_load_u8(vec->data);
+ if (info->is_bool) {
+        // boolean vectors are internally treated as unsigned
+        // vectors for compatibility across all SIMD extensions
+ switch(dtype) {
+ case simd_data_vb8:
+ data.vb8 = npyv_cvt_b8_u8(data.vu8);
+ break;
+ case simd_data_vb16:
+ data.vb16 = npyv_cvt_b16_u16(data.vu16);
+ break;
+ case simd_data_vb32:
+ data.vb32 = npyv_cvt_b32_u32(data.vu32);
+ break;
+ default:
+ data.vb64 = npyv_cvt_b64_u64(data.vu64);
+ }
+ }
+ return data;
+}
+
+static int
+PySIMDVectorType_Init(PyObject *module)
+{
+ Py_INCREF(&PySIMDVectorType);
+ if (PyType_Ready(&PySIMDVectorType)) {
+ return -1;
+ }
+ if (PyModule_AddObject(
+ module, "vector_type",(PyObject *)&PySIMDVectorType
+ )) {
+ return -1;
+ }
+ return 0;
+}
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
new file mode 100644
index 000000000..50e77a4b8
--- /dev/null
+++ b/numpy/core/tests/test_simd.py
@@ -0,0 +1,550 @@
+# NOTE: Please avoid using numpy.testing since NPYV intrinsics
+# may be involved in its functionality.
+import pytest
+from numpy.core._simd import targets
+
+class _Test_Utility:
+    # submodule of the desired SIMD extension, e.g. targets["AVX512F"]
+ npyv = None
+ # the current data type suffix e.g. 's8'
+ sfx = None
+
+ def __getattr__(self, attr):
+ """
+        Call NPYV intrinsics without the 'npyv' attribute prefix and
+        auto-suffix them according to the class attribute 'sfx'
+ """
+ return getattr(self.npyv, attr + "_" + self.sfx)
+
+ def _data(self, start=None, count=None, reverse=False):
+ """
+        Create a list of consecutive numbers according to the number of vector lanes.
+ """
+ if start is None:
+ start = 1
+ if count is None:
+ count = self.nlanes
+ rng = range(start, start + count)
+ if reverse:
+ rng = reversed(rng)
+ if self._is_fp():
+ return [x / 1.0 for x in rng]
+ return list(rng)
+
+ def _is_unsigned(self):
+ return self.sfx[0] == 'u'
+
+ def _is_signed(self):
+ return self.sfx[0] == 's'
+
+ def _is_fp(self):
+ return self.sfx[0] == 'f'
+
+ def _scalar_size(self):
+ return int(self.sfx[1:])
+
+ def _int_clip(self, seq):
+ if self._is_fp():
+ return seq
+ max_int = self._int_max()
+ min_int = self._int_min()
+ return [min(max(v, min_int), max_int) for v in seq]
+
+ def _int_max(self):
+ if self._is_fp():
+ return None
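+        # an all-ones bit pattern read back as unsigned gives the unsigned max;
+        # the signed max is half of that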
+ max_u = self._to_unsigned(self.setall(-1))[0]
+ if self._is_signed():
+ return max_u // 2
+ return max_u
+
+ def _int_min(self):
+ if self._is_fp():
+ return None
+ if self._is_unsigned():
+ return 0
+ return -(self._int_max() + 1)
+
+ def _true_mask(self):
+ max_unsig = getattr(self.npyv, "setall_u" + self.sfx[1:])(-1)
+ return max_unsig[0]
+
+ def _to_unsigned(self, vector):
+ if isinstance(vector, (list, tuple)):
+ return getattr(self.npyv, "load_u" + self.sfx[1:])(vector)
+ else:
+ sfx = vector.__name__.replace("npyv_", "")
+ if sfx[0] == "b":
+ cvt_intrin = "cvt_u{0}_b{0}"
+ else:
+ cvt_intrin = "reinterpret_u{0}_{1}"
+ return getattr(self.npyv, cvt_intrin.format(sfx[1:], sfx))(vector)
+
+class _SIMD_INT(_Test_Utility):
+ """
+ To test all integer vector types at once
+ """
+ def test_operators_shift(self):
+ if self.sfx in ("u8", "s8"):
+ return
+
+ data_a = self._data(self._int_max() - self.nlanes)
+ data_b = self._data(self._int_min(), reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+ for count in range(self._scalar_size()):
+ # load to cast
+ data_shl_a = self.load([a << count for a in data_a])
+ # left shift
+ shl = self.shl(vdata_a, count)
+ assert shl == data_shl_a
+ # left shift by an immediate constant
+ shli = self.shli(vdata_a, count)
+ assert shli == data_shl_a
+ # load to cast
+ data_shr_a = self.load([a >> count for a in data_a])
+ # right shift
+ shr = self.shr(vdata_a, count)
+ assert shr == data_shr_a
+ # right shift by an immediate constant
+ shri = self.shri(vdata_a, count)
+ assert shri == data_shr_a
+
+ def test_arithmetic_subadd_saturated(self):
+ if self.sfx in ("u32", "s32", "u64", "s64"):
+ return
+
+ data_a = self._data(self._int_max() - self.nlanes)
+ data_b = self._data(self._int_min(), reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+ data_adds = self._int_clip([a + b for a, b in zip(data_a, data_b)])
+ adds = self.adds(vdata_a, vdata_b)
+ assert adds == data_adds
+
+ data_subs = self._int_clip([a - b for a, b in zip(data_a, data_b)])
+ subs = self.subs(vdata_a, vdata_b)
+ assert subs == data_subs
+
+class _SIMD_FP(_Test_Utility):
+ """
+ To test all float vector types at once
+ """
+ def test_arithmetic_fused(self):
+ vdata_a, vdata_b, vdata_c = [self.load(self._data())]*3
+ vdata_cx2 = self.add(vdata_c, vdata_c)
+ # multiply and add, a*b + c
+ data_fma = self.load([a * b + c for a, b, c in zip(vdata_a, vdata_b, vdata_c)])
+ fma = self.muladd(vdata_a, vdata_b, vdata_c)
+ assert fma == data_fma
+ # multiply and subtract, a*b - c
+ fms = self.mulsub(vdata_a, vdata_b, vdata_c)
+ data_fms = self.sub(data_fma, vdata_cx2)
+ assert fms == data_fms
+ # negate multiply and add, -(a*b) + c
+ nfma = self.nmuladd(vdata_a, vdata_b, vdata_c)
+ data_nfma = self.sub(vdata_cx2, data_fma)
+ assert nfma == data_nfma
+ # negate multiply and subtract, -(a*b) - c
+ nfms = self.nmulsub(vdata_a, vdata_b, vdata_c)
+ data_nfms = self.mul(data_fma, self.setall(-1))
+ assert nfms == data_nfms
+
+class _SIMD_ALL(_Test_Utility):
+ """
+ To test all vector types at once
+ """
+ def test_memory_load(self):
+ data = self._data()
+ # unaligned load
+ load_data = self.load(data)
+ assert load_data == data
+ # aligned load
+ loada_data = self.loada(data)
+ assert loada_data == data
+ # stream load
+ loads_data = self.loads(data)
+ assert loads_data == data
+ # load lower part
+ loadl = self.loadl(data)
+ loadl_half = list(loadl)[:self.nlanes//2]
+ data_half = data[:self.nlanes//2]
+ assert loadl_half == data_half
+ assert loadl != data # detect overflow
+
+ def test_memory_store(self):
+ data = self._data()
+ vdata = self.load(data)
+ # unaligned store
+ store = [0] * self.nlanes
+ self.store(store, vdata)
+ assert store == data
+ # aligned store
+ store_a = [0] * self.nlanes
+ self.storea(store_a, vdata)
+ assert store_a == data
+ # stream store
+ store_s = [0] * self.nlanes
+ self.stores(store_s, vdata)
+ assert store_s == data
+ # store lower part
+ store_l = [0] * self.nlanes
+ self.storel(store_l, vdata)
+ assert store_l[:self.nlanes//2] == data[:self.nlanes//2]
+ assert store_l != vdata # detect overflow
+ # store higher part
+ store_h = [0] * self.nlanes
+ self.storeh(store_h, vdata)
+ assert store_h[:self.nlanes//2] == data[self.nlanes//2:]
+ assert store_h != vdata # detect overflow
+
+ def test_memory_partial_load(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ data = self._data()
+ lanes = list(range(1, self.nlanes + 1))
+ lanes += [self.nlanes**2, self.nlanes**4] # test out of range
+ for n in lanes:
+ load_till = self.load_till(data, n, 15)
+ data_till = data[:n] + [15] * (self.nlanes-n)
+ assert load_till == data_till
+ load_tillz = self.load_tillz(data, n)
+ data_tillz = data[:n] + [0] * (self.nlanes-n)
+ assert load_tillz == data_tillz
+
+ def test_memory_partial_store(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ data = self._data()
+ data_rev = self._data(reverse=True)
+ vdata = self.load(data)
+ lanes = list(range(1, self.nlanes + 1))
+ lanes += [self.nlanes**2, self.nlanes**4]
+ for n in lanes:
+ data_till = data_rev.copy()
+ data_till[:n] = data[:n]
+ store_till = self._data(reverse=True)
+ self.store_till(store_till, n, vdata)
+ assert store_till == data_till
+
+ def test_memory_noncont_load(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ for stride in range(1, 64):
+ data = self._data(count=stride*self.nlanes)
+ data_stride = data[::stride]
+ loadn = self.loadn(data, stride)
+ assert loadn == data_stride
+
+ for stride in range(-64, 0):
+ data = self._data(stride, -stride*self.nlanes)
+ data_stride = self.load(data[::stride]) # cast unsigned
+ loadn = self.loadn(data, stride)
+ assert loadn == data_stride
+
+ def test_memory_noncont_partial_load(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ lanes = list(range(1, self.nlanes + 1))
+ lanes += [self.nlanes**2, self.nlanes**4]
+ for stride in range(1, 64):
+ data = self._data(count=stride*self.nlanes)
+ data_stride = data[::stride]
+ for n in lanes:
+ data_stride_till = data_stride[:n] + [15] * (self.nlanes-n)
+ loadn_till = self.loadn_till(data, stride, n, 15)
+ assert loadn_till == data_stride_till
+ data_stride_tillz = data_stride[:n] + [0] * (self.nlanes-n)
+ loadn_tillz = self.loadn_tillz(data, stride, n)
+ assert loadn_tillz == data_stride_tillz
+
+ for stride in range(-64, 0):
+ data = self._data(stride, -stride*self.nlanes)
+ data_stride = list(self.load(data[::stride])) # cast unsigned
+ for n in lanes:
+ data_stride_till = data_stride[:n] + [15] * (self.nlanes-n)
+ loadn_till = self.loadn_till(data, stride, n, 15)
+ assert loadn_till == data_stride_till
+ data_stride_tillz = data_stride[:n] + [0] * (self.nlanes-n)
+ loadn_tillz = self.loadn_tillz(data, stride, n)
+ assert loadn_tillz == data_stride_tillz
+
+ def test_memory_noncont_store(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ vdata = self.load(self._data())
+ for stride in range(1, 64):
+ data = [15] * stride * self.nlanes
+ data[::stride] = vdata
+ storen = [15] * stride * self.nlanes
+ storen += [127]*64
+ self.storen(storen, stride, vdata)
+ assert storen[:-64] == data
+ assert storen[-64:] == [127]*64 # detect overflow
+
+ for stride in range(-64, 0):
+ data = [15] * -stride * self.nlanes
+ data[::stride] = vdata
+ storen = [127]*64
+ storen += [15] * -stride * self.nlanes
+ self.storen(storen, stride, vdata)
+ assert storen[64:] == data
+ assert storen[:64] == [127]*64 # detect overflow
+
+ def test_memory_noncont_partial_store(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ data = self._data()
+ vdata = self.load(data)
+ lanes = list(range(1, self.nlanes + 1))
+ lanes += [self.nlanes**2, self.nlanes**4]
+ for stride in range(1, 64):
+ for n in lanes:
+ data_till = [15] * stride * self.nlanes
+ data_till[::stride] = data[:n] + [15] * (self.nlanes-n)
+ storen_till = [15] * stride * self.nlanes
+ storen_till += [127]*64
+ self.storen_till(storen_till, stride, n, vdata)
+ assert storen_till[:-64] == data_till
+ assert storen_till[-64:] == [127]*64 # detect overflow
+
+ for stride in range(-64, 0):
+ for n in lanes:
+ data_till = [15] * -stride * self.nlanes
+ data_till[::stride] = data[:n] + [15] * (self.nlanes-n)
+ storen_till = [127]*64
+ storen_till += [15] * -stride * self.nlanes
+ self.storen_till(storen_till, stride, n, vdata)
+ assert storen_till[64:] == data_till
+ assert storen_till[:64] == [127]*64 # detect overflow
+
+ def test_misc(self):
+ broadcast_zero = self.zero()
+ assert broadcast_zero == [0] * self.nlanes
+ for i in range(1, 10):
+ broadcasti = self.setall(i)
+ assert broadcasti == [i] * self.nlanes
+
+ data_a, data_b = self._data(), self._data(reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+        # the Python level of npyv_set_* doesn't support ignoring extra
+        # specified lanes or filling non-specified lanes with zero.
+ vset = self.set(*data_a)
+ assert vset == data_a
+        # the Python level of npyv_setf_* doesn't support ignoring extra
+        # specified lanes or filling non-specified lanes with the specified scalar.
+ vsetf = self.setf(10, *data_a)
+ assert vsetf == data_a
+
+        # We're testing the sanity of _simd's vector type here; the
+        # reinterpret* intrinsics themselves are tested by the compiler
+        # during the build of the _simd module
+ sfxes = ["u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64", "f32"]
+ if self.npyv.simd_f64:
+ sfxes.append("f64")
+ for sfx in sfxes:
+ vec_name = getattr(self, "reinterpret_" + sfx)(vdata_a).__name__
+ assert vec_name == "npyv_" + sfx
+
+ # select & mask operations
+ select_a = self.select(self.cmpeq(self.zero(), self.zero()), vdata_a, vdata_b)
+ assert select_a == data_a
+ select_b = self.select(self.cmpneq(self.zero(), self.zero()), vdata_a, vdata_b)
+ assert select_b == data_b
+
+ # cleanup intrinsic is only used with AVX for
+ # zeroing registers to avoid the AVX-SSE transition penalty,
+ # so nothing to test here
+ self.npyv.cleanup()
+
+ def test_reorder(self):
+ data_a, data_b = self._data(), self._data(reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+ # lower half part
+ data_a_lo = data_a[:self.nlanes//2]
+ data_b_lo = data_b[:self.nlanes//2]
+ # higher half part
+ data_a_hi = data_a[self.nlanes//2:]
+ data_b_hi = data_b[self.nlanes//2:]
+ # combine two lower parts
+ combinel = self.combinel(vdata_a, vdata_b)
+ assert combinel == data_a_lo + data_b_lo
+ # combine two higher parts
+ combineh = self.combineh(vdata_a, vdata_b)
+ assert combineh == data_a_hi + data_b_hi
+ # combine x2
+ combine = self.combine(vdata_a, vdata_b)
+ assert combine == (data_a_lo + data_b_lo, data_a_hi + data_b_hi)
+ # zip(interleave)
+ data_zipl = [v for p in zip(data_a_lo, data_b_lo) for v in p]
+ data_ziph = [v for p in zip(data_a_hi, data_b_hi) for v in p]
+ vzip = self.zip(vdata_a, vdata_b)
+ assert vzip == (data_zipl, data_ziph)
+
+ def test_operators_comparison(self):
+ if self._is_fp():
+ data_a = self._data()
+ else:
+ data_a = self._data(self._int_max() - self.nlanes)
+ data_b = self._data(self._int_min(), reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+ mask_true = self._true_mask()
+ def to_bool(vector):
+ return [lane == mask_true for lane in vector]
+ # equal
+ data_eq = [a == b for a, b in zip(data_a, data_b)]
+ cmpeq = to_bool(self.cmpeq(vdata_a, vdata_b))
+ assert cmpeq == data_eq
+ # not equal
+ data_neq = [a != b for a, b in zip(data_a, data_b)]
+ cmpneq = to_bool(self.cmpneq(vdata_a, vdata_b))
+ assert cmpneq == data_neq
+ # greater than
+ data_gt = [a > b for a, b in zip(data_a, data_b)]
+ cmpgt = to_bool(self.cmpgt(vdata_a, vdata_b))
+ assert cmpgt == data_gt
+        # greater than or equal
+ data_ge = [a >= b for a, b in zip(data_a, data_b)]
+ cmpge = to_bool(self.cmpge(vdata_a, vdata_b))
+ assert cmpge == data_ge
+ # less than
+ data_lt = [a < b for a, b in zip(data_a, data_b)]
+ cmplt = to_bool(self.cmplt(vdata_a, vdata_b))
+ assert cmplt == data_lt
+        # less than or equal
+ data_le = [a <= b for a, b in zip(data_a, data_b)]
+ cmple = to_bool(self.cmple(vdata_a, vdata_b))
+ assert cmple == data_le
+
+ def test_operators_logical(self):
+ if self._is_fp():
+ data_a = self._data()
+ else:
+ data_a = self._data(self._int_max() - self.nlanes)
+ data_b = self._data(self._int_min(), reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+ if self._is_fp():
+ data_cast_a = self._to_unsigned(vdata_a)
+ data_cast_b = self._to_unsigned(vdata_b)
+ cast, cast_data = self._to_unsigned, self._to_unsigned
+ else:
+ data_cast_a, data_cast_b = data_a, data_b
+ cast, cast_data = lambda a: a, self.load
+
+ data_xor = cast_data([a ^ b for a, b in zip(data_cast_a, data_cast_b)])
+ vxor = cast(self.xor(vdata_a, vdata_b))
+ assert vxor == data_xor
+
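+        # 'or', 'and' and 'not' are Python keywords, so the suffixed intrinsics
+        # below are fetched via getattr() instead of normal attribute access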
+ data_or = cast_data([a | b for a, b in zip(data_cast_a, data_cast_b)])
+ vor = cast(getattr(self, "or")(vdata_a, vdata_b))
+ assert vor == data_or
+
+ data_and = cast_data([a & b for a, b in zip(data_cast_a, data_cast_b)])
+ vand = cast(getattr(self, "and")(vdata_a, vdata_b))
+ assert vand == data_and
+
+ data_not = cast_data([~a for a in data_cast_a])
+ vnot = cast(getattr(self, "not")(vdata_a))
+ assert vnot == data_not
+
+ def test_conversion_boolean(self):
+ bsfx = "b" + self.sfx[1:]
+ to_boolean = getattr(self.npyv, "cvt_%s_%s" % (bsfx, self.sfx))
+ from_boolean = getattr(self.npyv, "cvt_%s_%s" % (self.sfx, bsfx))
+
+ false_vb = to_boolean(self.setall(0))
+ true_vb = self.cmpeq(self.setall(0), self.setall(0))
+ assert false_vb != true_vb
+
+ false_vsfx = from_boolean(false_vb)
+ true_vsfx = from_boolean(true_vb)
+ assert false_vsfx != true_vsfx
+
+ def test_arithmetic_subadd(self):
+ if self._is_fp():
+ data_a = self._data()
+ else:
+ data_a = self._data(self._int_max() - self.nlanes)
+ data_b = self._data(self._int_min(), reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+ # non-saturated
+ data_add = self.load([a + b for a, b in zip(data_a, data_b)]) # load to cast
+ add = self.add(vdata_a, vdata_b)
+ assert add == data_add
+ data_sub = self.load([a - b for a, b in zip(data_a, data_b)])
+ sub = self.sub(vdata_a, vdata_b)
+ assert sub == data_sub
+
+ def test_arithmetic_mul(self):
+ if self.sfx in ("u64", "s64"):
+ return
+
+ if self._is_fp():
+ data_a = self._data()
+ else:
+ data_a = self._data(self._int_max() - self.nlanes)
+ data_b = self._data(self._int_min(), reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+ data_mul = self.load([a * b for a, b in zip(data_a, data_b)])
+ mul = self.mul(vdata_a, vdata_b)
+ assert mul == data_mul
+
+ def test_arithmetic_div(self):
+ if not self._is_fp():
+ return
+
+ data_a, data_b = self._data(), self._data(reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+ # load to truncate f64 to precision of f32
+ data_div = self.load([a / b for a, b in zip(data_a, data_b)])
+ div = self.div(vdata_a, vdata_b)
+ assert div == data_div
+
+
+int_sfx = ("u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64")
+fp_sfx = ("f32", "f64")
+all_sfx = int_sfx + fp_sfx
+tests_registry = {
+ int_sfx : _SIMD_INT,
+ fp_sfx : _SIMD_FP,
+ all_sfx : _SIMD_ALL
+}
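+# Generate one test class per (target, suffix) pair; the class name encodes the
+# suite, vector width, target and suffix, e.g. something along the lines of
+# Test_SIMD_ALL_256_AVX2_f32 (the exact width and target depend on the build machine).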
+for target_name, npyv in targets.items():
+ simd_width = npyv.simd if npyv else ''
+ pretty_name = target_name.split('__') # multi-target separator
+ if len(pretty_name) > 1:
+ # multi-target
+ pretty_name = f"({' '.join(pretty_name)})"
+ else:
+ pretty_name = pretty_name[0]
+
+ skip = ""
+ skip_sfx = dict()
+ if not npyv:
+ skip = f"target '{pretty_name}' isn't supported by current machine"
+ elif not npyv.simd:
+ skip = f"target '{pretty_name}' isn't supported by NPYV"
+ elif not npyv.simd_f64:
+ skip_sfx["f64"] = f"target '{pretty_name}' doesn't support double-precision"
+
+ for sfxes, cls in tests_registry.items():
+ for sfx in sfxes:
+ skip_m = skip_sfx.get(sfx, skip)
+ inhr = (cls,)
+ attr = dict(npyv=targets[target_name], sfx=sfx)
+ tcls = type(f"Test{cls.__name__}_{simd_width}_{target_name}_{sfx}", inhr, attr)
+ if skip_m:
+ pytest.mark.skip(reason=skip_m)(tcls)
+ globals()[tcls.__name__] = tcls
diff --git a/numpy/core/tests/test_simd_module.py b/numpy/core/tests/test_simd_module.py
new file mode 100644
index 000000000..3d710884a
--- /dev/null
+++ b/numpy/core/tests/test_simd_module.py
@@ -0,0 +1,97 @@
+import pytest
+from numpy.core._simd import targets
+"""
+This testing unit only checks the sanity of common functionality, so it is
+enough to take one submodule representing any of the enabled SIMD extensions
+and run the tests against it. A second submodule is only needed for the single
+check that covers mixing data types between submodules.
+"""
+npyvs = [npyv_mod for npyv_mod in targets.values() if npyv_mod and npyv_mod.simd]
+npyv, npyv2 = (npyvs + [None, None])[:2]
+
+unsigned_sfx = ["u8", "u16", "u32", "u64"]
+signed_sfx = ["s8", "s16", "s32", "s64"]
+fp_sfx = ["f32"]
+if npyv and npyv.simd_f64:
+ fp_sfx.append("f64")
+
+int_sfx = unsigned_sfx + signed_sfx
+all_sfx = int_sfx + fp_sfx
+
+@pytest.mark.skipif(not npyv, reason="could not find any SIMD extension with NPYV support")
+class Test_SIMD_MODULE:
+
+ @pytest.mark.parametrize('sfx', all_sfx)
+ def test_num_lanes(self, sfx):
+ nlanes = getattr(npyv, "nlanes_" + sfx)
+ vector = getattr(npyv, "setall_" + sfx)(1)
+ assert len(vector) == nlanes
+
+ @pytest.mark.parametrize('sfx', all_sfx)
+ def test_type_name(self, sfx):
+ vector = getattr(npyv, "setall_" + sfx)(1)
+ assert vector.__name__ == "npyv_" + sfx
+
+ def test_raises(self):
+ a, b = [npyv.setall_u32(1)]*2
+ for sfx in all_sfx:
+ vcb = lambda intrin: getattr(npyv, f"{intrin}_{sfx}")
+ pytest.raises(TypeError, vcb("add"), a)
+ pytest.raises(TypeError, vcb("add"), a, b, a)
+ pytest.raises(TypeError, vcb("setall"))
+ pytest.raises(TypeError, vcb("setall"), [1])
+ pytest.raises(TypeError, vcb("load"), 1)
+ pytest.raises(ValueError, vcb("load"), [1])
+ pytest.raises(ValueError, vcb("store"), [1], getattr(npyv, f"reinterpret_{sfx}_u32")(a))
+
+ @pytest.mark.skipif(not npyv2, reason=(
+ "could not find a second SIMD extension with NPYV support"
+ ))
+ def test_nomix(self):
+        # mixing vectors among submodules isn't allowed
+ a = npyv.setall_u32(1)
+ a2 = npyv2.setall_u32(1)
+ pytest.raises(TypeError, npyv.add_u32, a2, a2)
+ pytest.raises(TypeError, npyv2.add_u32, a, a)
+
+ @pytest.mark.parametrize('sfx', unsigned_sfx)
+ def test_unsigned_overflow(self, sfx):
+ nlanes = getattr(npyv, "nlanes_" + sfx)
+ maxu = (1 << int(sfx[1:])) - 1
+ maxu_72 = (1 << 72) - 1
+ lane = getattr(npyv, "setall_" + sfx)(maxu_72)[0]
+ assert lane == maxu
+ lanes = getattr(npyv, "load_" + sfx)([maxu_72] * nlanes)
+ assert lanes == [maxu] * nlanes
+ lane = getattr(npyv, "setall_" + sfx)(-1)[0]
+ assert lane == maxu
+ lanes = getattr(npyv, "load_" + sfx)([-1] * nlanes)
+ assert lanes == [maxu] * nlanes
+
+ @pytest.mark.parametrize('sfx', signed_sfx)
+ def test_signed_overflow(self, sfx):
+ nlanes = getattr(npyv, "nlanes_" + sfx)
+ maxs_72 = (1 << 71) - 1
+ lane = getattr(npyv, "setall_" + sfx)(maxs_72)[0]
+ assert lane == -1
+ lanes = getattr(npyv, "load_" + sfx)([maxs_72] * nlanes)
+ assert lanes == [-1] * nlanes
+ mins_72 = -1 << 71
+ lane = getattr(npyv, "setall_" + sfx)(mins_72)[0]
+ assert lane == 0
+ lanes = getattr(npyv, "load_" + sfx)([mins_72] * nlanes)
+ assert lanes == [0] * nlanes
+
+ def test_truncate_f32(self):
+ f32 = npyv.setall_f32(0.1)[0]
+ assert f32 != 0.1
+ assert round(f32, 1) == 0.1
+
+ def test_compare(self):
+ data_range = range(0, npyv.nlanes_u32)
+ vdata = npyv.load_u32(data_range)
+ assert vdata == list(data_range)
+ assert vdata == tuple(data_range)
+ for i in data_range:
+ assert vdata[i] == data_range[i]
diff --git a/numpy/distutils/ccompiler_opt.py b/numpy/distutils/ccompiler_opt.py
index 72ea0c388..3eba6e32a 100644
--- a/numpy/distutils/ccompiler_opt.py
+++ b/numpy/distutils/ccompiler_opt.py
@@ -2372,19 +2372,18 @@ class CCompilerOpt(_Config, _Distutils, _Cache, _CCompiler, _Feature, _Parse):
else:
dispatch_rows.append(("Generated", ''))
for tar in self.feature_sorted(target_sources):
- tar_as_seq = [tar] if isinstance(tar, str) else tar
sources = target_sources[tar]
- name = tar if isinstance(tar, str) else '(%s)' % ' '.join(tar)
+ pretty_name = tar if isinstance(tar, str) else '(%s)' % ' '.join(tar)
flags = ' '.join(self.feature_flags(tar))
implies = ' '.join(self.feature_sorted(self.feature_implies(tar)))
detect = ' '.join(self.feature_detect(tar))
extra_checks = []
- for name in tar_as_seq:
+ for name in ((tar,) if isinstance(tar, str) else tar):
extra_checks += self.feature_extra_checks(name)
extra_checks = (' '.join(extra_checks) if extra_checks else "none")
dispatch_rows.append(('', ''))
- dispatch_rows.append((name, implies))
+ dispatch_rows.append((pretty_name, implies))
dispatch_rows.append(("Flags", flags))
dispatch_rows.append(("Extra checks", extra_checks))
dispatch_rows.append(("Detect", detect))
diff --git a/numpy/distutils/command/build.py b/numpy/distutils/command/build.py
index 60ba4c917..a4fda537d 100644
--- a/numpy/distutils/command/build.py
+++ b/numpy/distutils/command/build.py
@@ -22,6 +22,8 @@ class build(old_build):
"specify a list of dispatched CPU optimizations"),
('disable-optimization', None,
"disable CPU optimized code(dispatch,simd,fast...)"),
+ ('simd-test=', None,
+ "specify a list of CPU optimizations to be tested against NumPy SIMD interface"),
]
help_options = old_build.help_options + [
@@ -36,6 +38,16 @@ class build(old_build):
self.cpu_baseline = "min"
self.cpu_dispatch = "max -xop -fma4" # drop AMD legacy features by default
self.disable_optimization = False
+ """
+        The '_simd' module is very large; adding more dispatched features
+        increases binary size and compile time. By default we minimize the
+        targeted features to those most commonly used by the NumPy SIMD
+        interface (NPYV).
+        NOTE: any specified features will be ignored if they are:
+            - part of the baseline (--cpu-baseline)
+            - not part of the dispatch-able features (--cpu-dispatch)
+            - not supported by the compiler or platform
+ """
+ self.simd_test = "BASELINE SSE2 SSE42 XOP FMA4 (FMA3 AVX2) AVX512F AVX512_SKX VSX VSX2 VSX3 NEON ASIMD"
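+        # For instance (hypothetical invocation):
+        #   python setup.py build --simd-test="BASELINE SSE42 (FMA3 AVX2)"
+        # limits the tested targets to that subset, subject to the notes above.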
def finalize_options(self):
build_scripts = self.build_scripts
diff --git a/numpy/distutils/command/build_ext.py b/numpy/distutils/command/build_ext.py
index 1a881c56a..ca6f8bcd2 100644
--- a/numpy/distutils/command/build_ext.py
+++ b/numpy/distutils/command/build_ext.py
@@ -19,8 +19,7 @@ from numpy.distutils.misc_util import (
has_cxx_sources, has_f_sources, is_sequence
)
from numpy.distutils.command.config_compiler import show_fortran_compilers
-from numpy.distutils.ccompiler_opt import new_ccompiler_opt
-
+from numpy.distutils.ccompiler_opt import new_ccompiler_opt, CCompilerOpt
class build_ext (old_build_ext):
@@ -39,6 +38,8 @@ class build_ext (old_build_ext):
"specify a list of dispatched CPU optimizations"),
('disable-optimization', None,
"disable CPU optimized code(dispatch,simd,fast...)"),
+ ('simd-test=', None,
+ "specify a list of CPU optimizations to be tested against NumPy SIMD interface"),
]
help_options = old_build_ext.help_options + [
@@ -56,6 +57,7 @@ class build_ext (old_build_ext):
self.cpu_baseline = None
self.cpu_dispatch = None
self.disable_optimization = None
+ self.simd_test = None
def finalize_options(self):
if self.parallel:
@@ -87,7 +89,9 @@ class build_ext (old_build_ext):
('cpu_baseline', 'cpu_baseline'),
('cpu_dispatch', 'cpu_dispatch'),
('disable_optimization', 'disable_optimization'),
+ ('simd_test', 'simd_test')
)
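+        # expose the requested features to CCompilerOpt as the "simd_test" target
+        # group, so dispatch-able sources can reference them by group name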
+ CCompilerOpt.conf_target_groups["simd_test"] = self.simd_test
def run(self):
if not self.extensions:
diff --git a/runtests.py b/runtests.py
index f8b70d936..87e26768b 100755
--- a/runtests.py
+++ b/runtests.py
@@ -122,6 +122,9 @@ def main(argv):
help="Specify a list of dispatched CPU optimizations"),
parser.add_argument("--disable-optimization", action="store_true",
help="Disable CPU optimized code(dispatch,simd,fast...)"),
+ parser.add_argument("--simd-test", default=None,
+ help="Specify a list of CPU optimizations to be "
+                             "tested against the NumPy SIMD interface"),
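+    # for instance (hypothetical): python runtests.py --simd-test="SSE42 (FMA3 AVX2)"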
parser.add_argument("--show-build-log", action="store_true",
help="Show build output rather than using a log file")
parser.add_argument("--bench", action="store_true",
@@ -439,6 +442,8 @@ def build_project(args):
cmd += ["--cpu-dispatch", args.cpu_dispatch]
if args.disable_optimization:
cmd += ["--disable-optimization"]
+ if args.simd_test is not None:
+ cmd += ["--simd-test", args.simd_test]
# Install; avoid producing eggs so numpy can be imported from dst_dir.
cmd += ['install', '--prefix=' + dst_dir,
'--single-version-externally-managed',