| author | Matti Picus <matti.picus@gmail.com> | 2020-10-29 20:14:40 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-10-29 20:14:40 +0200 |
| commit | 8829b807a841911ce18e79b308fee9fb92fb91b6 (patch) | |
| tree | 88b1c765b18a524dc89e98e94e412d5acea121d6 | |
| parent | 43683b3256a86659f230dcadbcde1f8020398bfa (diff) | |
| parent | ac4ffe1d39d9cc845948079a24facf7057effb24 (diff) | |
| download | numpy-8829b807a841911ce18e79b308fee9fb92fb91b6.tar.gz | |
Merge pull request #16782 from seiko2plus/implement_npyv_pymod
ENH, TST: Bring the NumPy C SIMD vectorization interface "NPYV" to Python
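A minimal sketch of how the new module can be exercised from Python once this patch is built. The `targets` dict and the per-target constants come from the sources below; the `"AVX2"` name in the comment is illustrative only, since the available keys depend on the compiler and CPU of a given build:

```python
# Hedged sketch: assumes a NumPy build that contains this patch.
# `targets` maps target names ("baseline", "AVX2", ...) to submodules,
# or to None when the optimization is not supported by the build/CPU.
from numpy.core._simd import targets

for name, npyv in targets.items():
    if npyv is None:
        print(name, "-> not supported on this CPU/build")
        continue
    print(name, "-> SIMD width:", npyv.simd_width, "bytes")

npyv = targets["baseline"]
if npyv.simd:                    # NPY_SIMD != 0 for this target
    a = npyv.setall_f32(2.0)     # broadcast 2.0 to every lane
    b = npyv.add_f32(a, a)       # lane-wise addition
    print(list(b))               # vectors implement the sequence protocol
```

Every enabled target exposes the same set of intrinsics, which is what lets the new tests loop over `targets` and run identical assertions against each extension.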
| -rw-r--r-- | .gitattributes | 1 |
| -rw-r--r-- | numpy/core/setup.py | 23 |
| -rw-r--r-- | numpy/core/src/_simd/_simd.c | 73 |
| -rw-r--r-- | numpy/core/src/_simd/_simd.dispatch.c.src | 549 |
| -rw-r--r-- | numpy/core/src/_simd/_simd.h | 30 |
| -rw-r--r-- | numpy/core/src/_simd/_simd_arg.inc | 85 |
| -rw-r--r-- | numpy/core/src/_simd/_simd_convert.inc | 209 |
| -rw-r--r-- | numpy/core/src/_simd/_simd_data.inc.src | 93 |
| -rw-r--r-- | numpy/core/src/_simd/_simd_easyintrin.inc | 214 |
| -rw-r--r-- | numpy/core/src/_simd/_simd_inc.h.src | 421 |
| -rw-r--r-- | numpy/core/src/_simd/_simd_vector.inc | 178 |
| -rw-r--r-- | numpy/core/tests/test_simd.py | 550 |
| -rw-r--r-- | numpy/core/tests/test_simd_module.py | 97 |
| -rw-r--r-- | numpy/distutils/ccompiler_opt.py | 7 |
| -rw-r--r-- | numpy/distutils/command/build.py | 12 |
| -rw-r--r-- | numpy/distutils/command/build_ext.py | 8 |
| -rwxr-xr-x | runtests.py | 5 |
17 files changed, 2549 insertions(+), 6 deletions(-)
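One behaviour worth calling out before the per-file diff: the dispatch sources give the store-family intrinsics write-back semantics, storing the vector into an aligned C buffer and then copying the lanes back into the caller's Python sequence (the `simd_sequence_fill_iterable` calls marked `// write-back` below). A hedged sketch, assuming a build where the baseline target has SIMD enabled:

```python
from numpy.core._simd import targets

npyv = targets["baseline"]
nlanes = npyv.nlanes_u32         # lanes per u32 vector on this target
buf = [0] * nlanes
vec = npyv.setall_u32(7)
npyv.store_u32(buf, vec)         # buf is filled in place (write-back)
assert buf == [7] * nlanes
```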
diff --git a/.gitattributes b/.gitattributes index ad7d3b227..bce3dbe6d 100644 --- a/.gitattributes +++ b/.gitattributes @@ -7,6 +7,7 @@ doc/release/*.rst merge=union # Highlight our custom templating language as C, since it's hopefully better # than nothing. This also affects repo statistics. *.c.src linguist-language=C +*.inc.src linguist-language=C *.h.src linguist-language=C # Mark some files as vendored diff --git a/numpy/core/setup.py b/numpy/core/setup.py index b3e17baed..68aa0a851 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -626,6 +626,7 @@ def configuration(parent_package='',top_path=None): config.add_include_dirs(join('src', 'multiarray')) config.add_include_dirs(join('src', 'umath')) config.add_include_dirs(join('src', 'npysort')) + config.add_include_dirs(join('src', '_simd')) config.add_define_macros([("NPY_INTERNAL_BUILD", "1")]) # this macro indicates that Numpy build is in process config.add_define_macros([("HAVE_NPY_CONFIG_H", "1")]) @@ -974,6 +975,28 @@ def configuration(parent_package='',top_path=None): config.add_extension('_operand_flag_tests', sources=[join('src', 'umath', '_operand_flag_tests.c.src')]) + ####################################################################### + # SIMD module # + ####################################################################### + + config.add_extension('_simd', sources=[ + join('src', 'common', 'npy_cpu_features.c.src'), + join('src', '_simd', '_simd.c'), + join('src', '_simd', '_simd_inc.h.src'), + join('src', '_simd', '_simd_data.inc.src'), + join('src', '_simd', '_simd.dispatch.c.src'), + ], depends=[ + join('src', 'common', 'npy_cpu_dispatch.h'), + join('src', 'common', 'simd', 'simd.h'), + join('src', '_simd', '_simd.h'), + join('src', '_simd', '_simd_inc.h.src'), + join('src', '_simd', '_simd_data.inc.src'), + join('src', '_simd', '_simd_arg.inc'), + join('src', '_simd', '_simd_convert.inc'), + join('src', '_simd', '_simd_easyintrin.inc'), + join('src', '_simd', '_simd_vector.inc'), + ]) + config.add_subpackage('tests') config.add_data_dir('tests/data') config.add_data_dir('tests/examples') diff --git a/numpy/core/src/_simd/_simd.c b/numpy/core/src/_simd/_simd.c new file mode 100644 index 000000000..b1fdd4478 --- /dev/null +++ b/numpy/core/src/_simd/_simd.c @@ -0,0 +1,73 @@ +#include "_simd.h" + +PyMODINIT_FUNC PyInit__simd(void) +{ + static struct PyModuleDef defs = { + .m_base = PyModuleDef_HEAD_INIT, + .m_name = "numpy.core._simd", + .m_size = -1 + }; + if (npy_cpu_init() < 0) { + return NULL; + } + PyObject *m = PyModule_Create(&defs); + if (m == NULL) { + return NULL; + } + PyObject *targets = PyDict_New(); + if (targets == NULL) { + goto err; + } + if (PyModule_AddObject(m, "targets", targets) < 0) { + Py_DECREF(targets); + goto err; + } + // add keys for non-supported optimizations with None value + #define ATTACH_MODULE(TESTED_FEATURES, TARGET_NAME, MAKE_MSVC_HAPPY) \ + { \ + PyObject *simd_mod; \ + if (!TESTED_FEATURES) { \ + Py_INCREF(Py_None); \ + simd_mod = Py_None; \ + } else { \ + simd_mod = NPY_CAT(simd_create_module_, TARGET_NAME)(); \ + if (simd_mod == NULL) { \ + goto err; \ + } \ + } \ + const char *target_name = NPY_TOSTRING(TARGET_NAME); \ + if (PyDict_SetItemString(targets, target_name, simd_mod) < 0) { \ + Py_DECREF(simd_mod); \ + goto err; \ + } \ + Py_INCREF(simd_mod); \ + if (PyModule_AddObject(m, target_name, simd_mod) < 0) { \ + Py_DECREF(simd_mod); \ + goto err; \ + } \ + } + + #define ATTACH_BASELINE_MODULE(MAKE_MSVC_HAPPY) \ + { \ + PyObject *simd_mod = simd_create_module(); 
\ + if (simd_mod == NULL) { \ + goto err; \ + } \ + if (PyDict_SetItemString(targets, "baseline", simd_mod) < 0) { \ + Py_DECREF(simd_mod); \ + goto err; \ + } \ + Py_INCREF(simd_mod); \ + if (PyModule_AddObject(m, "baseline", simd_mod) < 0) { \ + Py_DECREF(simd_mod); \ + goto err; \ + } \ + } + + NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, ATTACH_MODULE, MAKE_MSVC_HAPPY) + NPY__CPU_DISPATCH_BASELINE_CALL(ATTACH_BASELINE_MODULE, MAKE_MSVC_HAPPY) + return m; +err: + Py_DECREF(m); + return NULL; +} diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src new file mode 100644 index 000000000..2d89b9df0 --- /dev/null +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -0,0 +1,549 @@ +/*@targets $werror #simd_test*/ +#include "_simd.h" +#include "_simd_inc.h" + +#if NPY_SIMD +#include "_simd_data.inc" +#include "_simd_convert.inc" +#include "_simd_vector.inc" +#include "_simd_arg.inc" +#include "_simd_easyintrin.inc" + +/************************************************************************* + * Defining NPYV intrinsics as module functions + *************************************************************************/ +/**begin repeat + * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# + * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64# + * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64# + * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0# + * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1# + * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# + * #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# + * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1# + * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0# + * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0# + */ +#if @simd_sup@ +/*************************** + * Memory + ***************************/ +/**begin repeat1 + * # intrin = load, loada, loads, loadl# + */ +SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, q@sfx@) +/**end repeat1**/ +/**begin repeat1 + * # intrin = store, storea, stores, storel, storeh# + */ +// special definition due to the nature of @intrin@ +static PyObject * +simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) +{ + simd_arg seq_arg = {.dtype = simd_data_q@sfx@}; + simd_arg vec_arg = {.dtype = simd_data_v@sfx@}; + if (!PyArg_ParseTuple( + args, "O&O&:@intrin@_@sfx@", + simd_arg_converter, &seq_arg, + simd_arg_converter, &vec_arg + )) { + return NULL; + } + npyv_@intrin@_@sfx@(seq_arg.data.q@sfx@, vec_arg.data.v@sfx@); + // write-back + if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) { + simd_arg_free(&seq_arg); + return NULL; + } + simd_arg_free(&seq_arg); + Py_RETURN_NONE; +} +/**end repeat1**/ + +/**************************************** + * Non-contiguous/Partial Memory access + ****************************************/ +#if @ncont_sup@ +// Partial Load +SIMD_IMPL_INTRIN_3(load_till_@sfx@, v@sfx@, q@sfx@, u32, @sfx@) +SIMD_IMPL_INTRIN_2(load_tillz_@sfx@, v@sfx@, q@sfx@, u32) + +// Partial Store +static PyObject * +simd__intrin_store_till_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) +{ + simd_arg seq_arg = {.dtype = simd_data_q@sfx@}; + simd_arg nlane_arg = {.dtype = simd_data_u32}; + simd_arg vec_arg = {.dtype = simd_data_v@sfx@}; + if (!PyArg_ParseTuple( + args, "O&O&O&:store_till_@sfx@", + simd_arg_converter, &seq_arg, + simd_arg_converter, &nlane_arg, + simd_arg_converter, &vec_arg + )) { + return NULL; + } + npyv_store_till_@sfx@( + seq_arg.data.q@sfx@, nlane_arg.data.u32, vec_arg.data.v@sfx@ + ); + // write-back + if 
(simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) { + simd_arg_free(&seq_arg); + return NULL; + } + simd_arg_free(&seq_arg); + Py_RETURN_NONE; +} + +// Non-contiguous Load +/**begin repeat1 + * #intrin = loadn, loadn_till, loadn_tillz# + * #till = 0, 1, 1# + * #fill = 0, 1, 0# + * #format = , O&O&, O&# + */ +static PyObject * +simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) +{ + simd_arg seq_arg = {.dtype = simd_data_q@sfx@}; + simd_arg stride_arg = {.dtype = simd_data_s64}; +#if @till@ + simd_arg nlane_arg = {.dtype = simd_data_u32}; +#endif // till +#if @fill@ + simd_arg fill_arg = {.dtype = simd_data_@sfx@}; +#endif + if (!PyArg_ParseTuple( + args, "@format@O&O&:@intrin@_@sfx@", + simd_arg_converter, &seq_arg, + simd_arg_converter, &stride_arg +#if @till@ + ,simd_arg_converter, &nlane_arg +#endif +#if @fill@ + ,simd_arg_converter, &fill_arg +#endif + )) { + return NULL; + } + npyv_lanetype_@sfx@ *seq_ptr = seq_arg.data.q@sfx@; + npy_intp stride = (npy_intp)stride_arg.data.s64; + Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr); + Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@; + if (stride < 0) { + seq_ptr += cur_seq_len -1; + min_seq_len = -min_seq_len; + } + if (cur_seq_len < min_seq_len) { + PyErr_Format(PyExc_ValueError, + "@intrin@_@sfx@(), according to provided stride %d, the " + "minimum acceptable size of the required sequence is %d, given(%d)", + stride, min_seq_len, cur_seq_len + ); + goto err; + } + npyv_@sfx@ rvec = npyv_@intrin@_@sfx@( + seq_ptr, stride + #if @till@ + , nlane_arg.data.u32 + #endif + #if @fill@ + , fill_arg.data.@sfx@ + #endif + ); + simd_arg ret = { + .dtype = simd_data_v@sfx@, .data = {.v@sfx@=rvec} + }; + simd_arg_free(&seq_arg); + return simd_arg_to_obj(&ret); +err: + simd_arg_free(&seq_arg); + return NULL; +} +/**end repeat1**/ + +// Non-contiguous Store +/**begin repeat1 + * #intrin = storen, storen_till# + * #till = 0, 1# + * #format = , O&# + */ +static PyObject * +simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) +{ + simd_arg seq_arg = {.dtype = simd_data_q@sfx@}; + simd_arg stride_arg = {.dtype = simd_data_s64}; + simd_arg vec_arg = {.dtype = simd_data_v@sfx@}; +#if @till@ + simd_arg nlane_arg = {.dtype = simd_data_u32}; +#endif + if (!PyArg_ParseTuple( + args, "@format@O&O&O&:storen_@sfx@", + simd_arg_converter, &seq_arg, + simd_arg_converter, &stride_arg +#if @till@ + ,simd_arg_converter, &nlane_arg +#endif + ,simd_arg_converter, &vec_arg + )) { + return NULL; + } + npyv_lanetype_@sfx@ *seq_ptr = seq_arg.data.q@sfx@; + npy_intp stride = (npy_intp)stride_arg.data.s64; + Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr); + Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@; + if (stride < 0) { + seq_ptr += cur_seq_len -1; + min_seq_len = -min_seq_len; + } + // overflow guard + if (cur_seq_len < min_seq_len) { + PyErr_Format(PyExc_ValueError, + "@intrin@_@sfx@(), according to provided stride %d, the" + "minimum acceptable size of the required sequence is %d, given(%d)", + stride, min_seq_len, cur_seq_len + ); + goto err; + } + npyv_@intrin@_@sfx@( + seq_ptr, stride + #if @till@ + ,nlane_arg.data.u32 + #endif + ,vec_arg.data.v@sfx@ + ); + // write-back + if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) { + goto err; + } + simd_arg_free(&seq_arg); + Py_RETURN_NONE; +err: + simd_arg_free(&seq_arg); + return NULL; +} +/**end repeat1**/ +#endif // @ncont_sup@ + + +/*************************** + * Misc + 
***************************/ +SIMD_IMPL_INTRIN_0(zero_@sfx@, v@sfx@) +SIMD_IMPL_INTRIN_1(setall_@sfx@, v@sfx@, @sfx@) +SIMD_IMPL_INTRIN_3(select_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@) + +/**begin repeat1 + * #sfx_to = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# + * #simd_sup2 = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64# + */ +#if @simd_sup2@ +SIMD_IMPL_INTRIN_1(reinterpret_@sfx_to@_@sfx@, v@sfx_to@, v@sfx@) +#endif // simd_sup2 +/**end repeat1**/ + +/** + * special definition due to the nature of intrinsics + * npyv_setf_@sfx@ and npy_set_@sfx@. +*/ +/**begin repeat1 + * #intrin = setf, set# + */ +static PyObject * +simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) +{ + npyv_lanetype_@sfx@ *data = simd_sequence_from_iterable(args, simd_data_q@sfx@, npyv_nlanes_@sfx@); + if (data == NULL) { + return NULL; + } + simd_data r = {.v@sfx@ = npyv_@intrin@_@sfx@( + data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], + data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15], + data[16], data[17], data[18], data[19], data[20], data[21], data[22], data[23], + data[24], data[25], data[26], data[27], data[28], data[29], data[30], data[31], + data[32], data[33], data[34], data[35], data[36], data[37], data[38], data[39], + data[40], data[41], data[42], data[43], data[44], data[45], data[46], data[47], + data[48], data[49], data[50], data[51], data[52], data[53], data[54], data[55], + data[56], data[57], data[58], data[59], data[60], data[61], data[62], data[63], + data[64] // for setf + )}; + simd_sequence_free(data); + return (PyObject*)PySIMDVector_FromData(r, simd_data_v@sfx@); +} +/**end repeat1**/ + +/*************************** + * Reorder + ***************************/ +/**begin repeat1 + * # intrin = combinel, combineh# + */ +SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@) +/**end repeat1**/ + +/**begin repeat1 + * # intrin = combine, zip# + */ +SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@x2, v@sfx@, v@sfx@) +/**end repeat1**/ + +/*************************** + * Operators + ***************************/ +#if @shl_imm@ > 0 +SIMD_IMPL_INTRIN_2(shl_@sfx@, v@sfx@, v@sfx@, u8) +SIMD_IMPL_INTRIN_2(shr_@sfx@, v@sfx@, v@sfx@, u8) +// immediate constant +SIMD_IMPL_INTRIN_2IMM(shli_@sfx@, v@sfx@, v@sfx@, @shl_imm@) +SIMD_IMPL_INTRIN_2IMM(shri_@sfx@, v@sfx@, v@sfx@, @shr_imm@) +#endif // shl_imm + +/**begin repeat1 + * #intrin = and, or, xor# + */ +SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@) +/**end repeat1**/ + +SIMD_IMPL_INTRIN_1(not_@sfx@, v@sfx@, v@sfx@) + +/**begin repeat1 + * #intrin = cmpeq, cmpneq, cmpgt, cmpge, cmplt, cmple# + */ +SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@bsfx@, v@sfx@, v@sfx@) +/**end repeat1**/ + +/*************************** + * Conversion + ***************************/ +SIMD_IMPL_INTRIN_1(cvt_@sfx@_@bsfx@, v@sfx@, v@bsfx@) +SIMD_IMPL_INTRIN_1(cvt_@bsfx@_@sfx@, v@bsfx@, v@sfx@) + +/*************************** + * Arithmetic + ***************************/ +/**begin repeat1 + * #intrin = add, sub# + */ +SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@) +/**end repeat1**/ + +#if @sat_sup@ +/**begin repeat1 + * #intrin = adds, subs# + */ +SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@) +/**end repeat1**/ +#endif // sat_sup + +#if @mul_sup@ +SIMD_IMPL_INTRIN_2(mul_@sfx@, v@sfx@, v@sfx@, v@sfx@) +#endif // mul_sup + +#if @div_sup@ +SIMD_IMPL_INTRIN_2(div_@sfx@, v@sfx@, v@sfx@, v@sfx@) +#endif // div_sup + +#if @fused_sup@ +/**begin repeat1 + * #intrin = muladd, mulsub, nmuladd, 
nmulsub# + */ +SIMD_IMPL_INTRIN_3(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@, v@sfx@) +/**end repeat1**/ +#endif // fused_sup + +#endif // simd_sup +/**end repeat**/ +/*************************** + * Variant + ***************************/ +SIMD_IMPL_INTRIN_0N(cleanup) + +/************************************************************************* + * Attach module functions + *************************************************************************/ +static PyMethodDef simd__intrinsics_methods[] = { +/**begin repeat + * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# + * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64# + * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64# + * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0# + * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1# + * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# + * #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# + * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1# + * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0# + * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0# + */ +#if @simd_sup@ + +/*************************** + * Memory + ***************************/ +/**begin repeat1 + * # intrin = load, loada, loads, loadl, store, storea, stores, storel, storeh# + */ +SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ + +/**************************************** + * Non-contiguous/Partial Memory access + ****************************************/ +#if @ncont_sup@ +/**begin repeat1 + * #intrin = load_till, load_tillz, loadn, loadn_till, loadn_tillz, + * store_till, storen, storen_till# + */ +SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ +#endif // ncont_sup + + +/*************************** + * Misc + ***************************/ +/**begin repeat1 + * #sfx_to = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# + * #simd_sup2 = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64# + */ +#if @simd_sup2@ +SIMD_INTRIN_DEF(reinterpret_@sfx_to@_@sfx@) +#endif // simd_sup2 +/**end repeat1**/ + +/**begin repeat1 + * # intrin = set, setf, setall, zero, select# + */ +SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ + +/*************************** + * Reorder + ***************************/ +/**begin repeat1 + * # intrin = combinel, combineh, combine, zip# + */ +SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ + +SIMD_INTRIN_DEF(cvt_@sfx@_@bsfx@) +SIMD_INTRIN_DEF(cvt_@bsfx@_@sfx@) + +/*************************** + * Operators + ***************************/ +#if @shl_imm@ > 0 +/**begin repeat1 + * # intrin = shl, shr, shli, shri# + */ +SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ +#endif // shl_imm + +/**begin repeat1 + * #intrin = and, or, xor, not, cmpeq, cmpneq, cmpgt, cmpge, cmplt, cmple# + */ +SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ + +/*************************** + * Conversion + ***************************/ +SIMD_INTRIN_DEF(cvt_@sfx@_@bsfx@) +SIMD_INTRIN_DEF(cvt_@bsfx@_@sfx@) + +/*************************** + * Arithmetic + ***************************/ +/**begin repeat1 + * #intrin = add, sub# + */ +SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ + +#if @sat_sup@ +/**begin repeat1 + * #intrin = adds, subs# + */ +SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ +#endif // sat_sup + +#if @mul_sup@ +SIMD_INTRIN_DEF(mul_@sfx@) +#endif // mul_sup + +#if @div_sup@ +SIMD_INTRIN_DEF(div_@sfx@) +#endif // div_sup + +#if @fused_sup@ +/**begin repeat1 + * #intrin = muladd, mulsub, nmuladd, nmulsub# + */ +SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ +#endif // fused_sup + +#endif // simd_sup +/**end repeat**/ + +/*************************** + * Variant 
+ ***************************/ +SIMD_INTRIN_DEF(cleanup) +/***************************/ +{NULL, NULL, 0, NULL} +}; // PyMethodDef + +#endif // NPY_SIMD + +/************************************************************************* + * Defining a separate module for each target + *************************************************************************/ +NPY_VISIBILITY_HIDDEN PyObject * +NPY_CPU_DISPATCH_CURFX(simd_create_module)(void) +{ + static struct PyModuleDef defs = { + .m_base = PyModuleDef_HEAD_INIT, + .m_size = -1, + #ifdef NPY__CPU_TARGET_CURRENT + .m_name = "numpy.core._simd." NPY_TOSTRING(NPY__CPU_TARGET_CURRENT), + #else + .m_name = "numpy.core._simd.baseline", + #endif + #if NPY_SIMD + .m_methods = simd__intrinsics_methods + #else + .m_methods = NULL + #endif + }; + PyObject *m = PyModule_Create(&defs); + if (m == NULL) { + return NULL; + } + if (PyModule_AddIntConstant(m, "simd", NPY_SIMD)) { + goto err; + } + if (PyModule_AddIntConstant(m, "simd_f64", NPY_SIMD_F64)) { + goto err; + } + if (PyModule_AddIntConstant(m, "simd_width", NPY_SIMD_WIDTH)) { + goto err; + } +#if NPY_SIMD + if (PySIMDVectorType_Init(m)) { + goto err; + } + /**begin repeat + * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# + */ + if (PyModule_AddIntConstant(m, "nlanes_@sfx@", npyv_nlanes_@sfx@)) { + goto err; + } + /**end repeat**/ +#endif // NPY_SIMD + return m; +err: + Py_DECREF(m); + return NULL; +} diff --git a/numpy/core/src/_simd/_simd.h b/numpy/core/src/_simd/_simd.h new file mode 100644 index 000000000..d9905c801 --- /dev/null +++ b/numpy/core/src/_simd/_simd.h @@ -0,0 +1,30 @@ +/** + * A module to expose the NumPy C SIMD vectorization interface "NPYV" for testing purposes. + * + * Please keep this module independent from other c-extension modules, + * since NPYV intrinsics may be involved in their functionality, + * which increases the degree of complexity in tracking and detecting errors. + * + * TODO: Add an independent sphinx doc. + * + * Please add any new NPYV intrinsics in '_simd.dispatch.c.src'. + */ +#ifndef _SIMD_SIMD_H_ +#define _SIMD_SIMD_H_ + +#include <Python.h> +#include "numpy/npy_common.h" + +#ifndef NPY_DISABLE_OPTIMIZATION +// autogenerated, required for CPU dispatch macros +#include "_simd.dispatch.h" +#endif +/** + * Create a new module for each required optimization which contains all NPYV intrinsics, + * + * If required optimization is not supported by NPYV, the module will still provides + * access to NPYV constants NPY_SIMD, NPY_SIMD_F64, and NPY_SIMD_WIDTH but without + * any intrinsics. + */ +NPY_CPU_DISPATCH_DECLARE(NPY_VISIBILITY_HIDDEN PyObject *simd_create_module, (void)) +#endif // _SIMD_SIMD_H_ diff --git a/numpy/core/src/_simd/_simd_arg.inc b/numpy/core/src/_simd/_simd_arg.inc new file mode 100644 index 000000000..f5bcf5487 --- /dev/null +++ b/numpy/core/src/_simd/_simd_arg.inc @@ -0,0 +1,85 @@ +/** + * This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and + * therefore must be built multiple times. Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN` + * symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was + * deemed too harmful to readability. 
+ */ +/************************************ + ** Protected Definitions + ************************************/ +static int +simd_arg_from_obj(PyObject *obj, simd_arg *arg) +{ + assert(arg->dtype != 0); + const simd_data_info *info = simd_data_getinfo(arg->dtype); + if (info->is_scalar) { + arg->data = simd_scalar_from_number(obj, arg->dtype); + } + else if (info->is_sequence) { + unsigned min_seq_size = simd_data_getinfo(info->to_vector)->nlanes; + arg->data.qu8 = simd_sequence_from_iterable(obj, arg->dtype, min_seq_size); + } + else if (info->is_vectorx) { + arg->data = simd_vectorx_from_tuple(obj, arg->dtype); + } + else if (info->is_vector) { + arg->data = PySIMDVector_AsData((PySIMDVectorObject*)obj, arg->dtype); + } else { + arg->data.u64 = 0; + PyErr_Format(PyExc_RuntimeError, + "unhandled arg from obj type id:%d, name:%s", arg->dtype, info->pyname + ); + return -1; + } + if (PyErr_Occurred()) { + return -1; + } + return 0; +} + +static PyObject * +simd_arg_to_obj(const simd_arg *arg) +{ + assert(arg->dtype != 0); + const simd_data_info *info = simd_data_getinfo(arg->dtype); + if (info->is_scalar) { + return simd_scalar_to_number(arg->data, arg->dtype); + } + if (info->is_sequence) { + return simd_sequence_to_list(arg->data.qu8, arg->dtype); + } + if (info->is_vectorx) { + return simd_vectorx_to_tuple(arg->data, arg->dtype); + } + if (info->is_vector) { + return (PyObject*)PySIMDVector_FromData(arg->data, arg->dtype); + } + PyErr_Format(PyExc_RuntimeError, + "unhandled arg to object type id:%d, name:%s", arg->dtype, info->pyname + ); + return NULL; +} + +static void +simd_arg_free(simd_arg *arg) +{ + const simd_data_info *info = simd_data_getinfo(arg->dtype); + if (info->is_sequence) { + simd_sequence_free(arg->data.qu8); + } +} + +static int +simd_arg_converter(PyObject *obj, simd_arg *arg) +{ + if (obj != NULL) { + if (simd_arg_from_obj(obj, arg) < 0) { + return 0; + } + arg->obj = obj; + return Py_CLEANUP_SUPPORTED; + } else { + simd_arg_free(arg); + } + return 1; +} diff --git a/numpy/core/src/_simd/_simd_convert.inc b/numpy/core/src/_simd/_simd_convert.inc new file mode 100644 index 000000000..f5bfc3f50 --- /dev/null +++ b/numpy/core/src/_simd/_simd_convert.inc @@ -0,0 +1,209 @@ +/** + * This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and + * therefore must be built multiple times. Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN` + * symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was + * deemed too harmful to readability. 
+ */ +/************************************ + ** Protected Definitions + ************************************/ +static simd_data +simd_scalar_from_number(PyObject *obj, simd_data_type dtype) +{ + const simd_data_info *info = simd_data_getinfo(dtype); + assert(info->is_scalar && info->lane_size > 0); + simd_data data; + if (info->is_float) { + data.f64 = PyFloat_AsDouble(obj); + if (dtype == simd_data_f32){ + data.f32 = (float)data.f64; + } + } else { + data.u64 = PyLong_AsUnsignedLongLongMask(obj); + } + return data; +} + +static PyObject * +simd_scalar_to_number(simd_data data, simd_data_type dtype) +{ + const simd_data_info *info = simd_data_getinfo(dtype); + assert(info->is_scalar && info->lane_size > 0); + if (info->is_float) { + if (dtype == simd_data_f32) { + return PyFloat_FromDouble(data.f32); + } + return PyFloat_FromDouble(data.f64); + } + int leftb = (sizeof(npyv_lanetype_u64) - info->lane_size) * 8; + data.u64 <<= leftb; + if (info->is_signed) { + return PyLong_FromLongLong(data.s64 >> leftb); + } + return PyLong_FromUnsignedLongLong(data.u64 >> leftb); +} + +typedef struct { + Py_ssize_t len; + void *ptr; +} simd__alloc_data; + +static void * +simd_sequence_new(Py_ssize_t len, simd_data_type dtype) +{ + const simd_data_info *info = simd_data_getinfo(dtype); + assert(len > 0 && info->is_sequence && info->lane_size > 0); + size_t size = sizeof(simd__alloc_data) + len * info->lane_size + NPY_SIMD_WIDTH; + void *ptr = malloc(size); + if (ptr == NULL) { + return PyErr_NoMemory(); + } + // align the pointer + simd__alloc_data *a_ptr = (simd__alloc_data *)( + ((uintptr_t)ptr + sizeof(simd__alloc_data) + NPY_SIMD_WIDTH) & ~(uintptr_t)(NPY_SIMD_WIDTH-1) + ); + a_ptr[-1].len = len; + a_ptr[-1].ptr = ptr; + return a_ptr; +} + +static Py_ssize_t +simd_sequence_len(void const *ptr) +{ + return ((simd__alloc_data const*)ptr)[-1].len; +} + +static void +simd_sequence_free(void *ptr) +{ + free(((simd__alloc_data *)ptr)[-1].ptr); +} + +static void * +simd_sequence_from_iterable(PyObject *obj, simd_data_type dtype, Py_ssize_t min_size) +{ + const simd_data_info *info = simd_data_getinfo(dtype); + assert(info->is_sequence && info->lane_size > 0); + PyObject *seq_obj = PySequence_Fast(obj, "expected a sequence"); + if (seq_obj == NULL) { + return NULL; + } + Py_ssize_t seq_size = PySequence_Fast_GET_SIZE(seq_obj); + if (seq_size < min_size) { + PyErr_Format(PyExc_ValueError, + "minimum acceptable size of the required sequence is %d, given(%d)", + min_size, seq_size + ); + return NULL; + } + npyv_lanetype_u8 *dst = simd_sequence_new(seq_size, dtype); + if (dst == NULL) { + return NULL; + } + PyObject **seq_items = PySequence_Fast_ITEMS(seq_obj); + for (Py_ssize_t i = 0; i < seq_size; ++i) { + simd_data data = simd_scalar_from_number(seq_items[i], info->to_scalar); + npyv_lanetype_u8 *sdst = dst + i * info->lane_size; + memcpy(sdst, &data.u64, info->lane_size); + } + Py_DECREF(seq_obj); + + if (PyErr_Occurred()) { + simd_sequence_free(dst); + return NULL; + } + return dst; +} + +static int +simd_sequence_fill_iterable(PyObject *obj, const void *ptr, simd_data_type dtype) +{ + const simd_data_info *info = simd_data_getinfo(dtype); + if (!PySequence_Check(obj)) { + PyErr_Format(PyExc_TypeError, + "a sequence object is required to fill %s", info->pyname + ); + return -1; + } + const npyv_lanetype_u8 *src = ptr; + Py_ssize_t seq_len = simd_sequence_len(ptr); + for (Py_ssize_t i = 0; i < seq_len; ++i) { + const npyv_lanetype_u8 *ssrc = src + i * info->lane_size; + simd_data data; + memcpy(&data.u64, 
ssrc, info->lane_size); + PyObject *item = simd_scalar_to_number(data, info->to_scalar); + if (item == NULL) { + return -1; + } + if (PySequence_SetItem(obj, i, item) < 0) { + Py_DECREF(item); + return -1; + } + } + return 0; +} + +static PyObject * +simd_sequence_to_list(const void *ptr, simd_data_type dtype) +{ + PyObject *list = PyList_New(simd_sequence_len(ptr)); + if (list == NULL) { + return NULL; + } + if (simd_sequence_fill_iterable(list, ptr, dtype) < 0) { + Py_DECREF(list); + return NULL; + } + return list; +} + +static simd_data +simd_vectorx_from_tuple(PyObject *obj, simd_data_type dtype) +{ + const simd_data_info *info = simd_data_getinfo(dtype); + // NPYV currently only supports x2 and x3 + assert(info->is_vectorx > 1 && info->is_vectorx < 4); + + simd_data data = {.u64 = 0}; + if (!PyTuple_Check(obj) || PyTuple_GET_SIZE(obj) != info->is_vectorx) { + PyErr_Format(PyExc_TypeError, + "a tuple of %d vector type %s is required", + info->is_vectorx, simd_data_getinfo(info->to_vector)->pyname + ); + return data; + } + for (int i = 0; i < info->is_vectorx; ++i) { + PyObject *item = PyTuple_GET_ITEM(obj, i); + // get the max multi-vec and let the compiler do the rest + data.vu64x3.val[i] = PySIMDVector_AsData((PySIMDVectorObject*)item, info->to_vector).vu64; + if (PyErr_Occurred()) { + return data; + } + } + return data; +} + +static PyObject * +simd_vectorx_to_tuple(simd_data data, simd_data_type dtype) +{ + const simd_data_info *info = simd_data_getinfo(dtype); + // NPYV currently only supports x2 and x3 + assert(info->is_vectorx > 1 && info->is_vectorx < 4); + + PyObject *tuple = PyTuple_New(info->is_vectorx); + if (tuple == NULL) { + return NULL; + } + for (int i = 0; i < info->is_vectorx; ++i) { + // get the max multi-vector and let the compiler handle the rest + simd_data vdata = {.vu64 = data.vu64x3.val[i]}; + PyObject *item = (PyObject*)PySIMDVector_FromData(vdata, info->to_vector); + if (item == NULL) { + // TODO: improve log add item number + Py_DECREF(tuple); + return NULL; + } + PyTuple_SET_ITEM(tuple, i, item); + } + return tuple; +} diff --git a/numpy/core/src/_simd/_simd_data.inc.src b/numpy/core/src/_simd/_simd_data.inc.src new file mode 100644 index 000000000..5c796487c --- /dev/null +++ b/numpy/core/src/_simd/_simd_data.inc.src @@ -0,0 +1,93 @@ +/** + * This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and + * therefore must be built multiple times. Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN` + * symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was + * deemed too harmful to readability. 
+ */ +/************************************ + ** Private Definitions + ************************************/ +static simd_data_info simd__data_registry[simd_data_end] = +{ + [simd_data_none] = {.pyname="none"}, + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + * #sig = 0*4, 1*4, 0*2# + * #fp = 0*4, 0*4, 1*2# + * #name = int*8, float, float# + */ + [simd_data_@sfx@] = { + .pyname="@name@", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@, + .is_scalar=1, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@, + .lane_size = sizeof(npyv_lanetype_@sfx@) + }, + /**end repeat**/ + // sequences + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + * #sig = 0*4, 1*4, 0*2# + * #fp = 0*4, 0*4, 1*2# + * #name = int*8, float, float# + */ + [simd_data_q@sfx@] = { + .pyname="[@name@]", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@, + .is_sequence=1, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@, + .nlanes=npyv_nlanes_@sfx@, .lane_size = sizeof(npyv_lanetype_@sfx@) + }, + /**end repeat**/ + // vectors + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + * #sig = 0*4, 1*4, 0*2# + * #fp = 0*4, 0*4, 1*2# + */ + [simd_data_v@sfx@] = { + .pyname="npyv_@sfx@", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@, + .is_vector=1, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@, + .nlanes=npyv_nlanes_@sfx@, .lane_size = sizeof(npyv_lanetype_@sfx@) + }, + /**end repeat**/ + // boolean vectors, treated as unsigned and converted internally + // to add compatibility among all SIMD extensions + /**begin repeat + * #sfx = u8, u16, u32, u64# + * #bsfx = b8, b16, b32, b64# + */ + [simd_data_v@bsfx@] = { + .pyname="npyv_@bsfx@", .is_bool=1, .is_vector=1, + .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@, + .nlanes=npyv_nlanes_@sfx@, .lane_size = sizeof(npyv_lanetype_@sfx@) + }, + /**end repeat**/ + // multi-vectors x2 + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + * #sig = 0*4, 1*4, 0*2# + * #fp = 0*4, 0*4, 1*2# + */ + [simd_data_v@sfx@x2] = { + .pyname="npyv_@sfx@x2", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@, + .is_vectorx=2, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@, + .nlanes=2, .lane_size = sizeof(npyv_lanetype_@sfx@) + }, + /**end repeat**/ + // multi-vectors x3 + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + * #sig = 0*4, 1*4, 0*2# + * #fp = 0*4, 0*4, 1*2# + */ + [simd_data_v@sfx@x3] = { + .pyname="npyv_@sfx@x3", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@, + .is_vectorx=3, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@, + .nlanes=3, .lane_size = sizeof(npyv_lanetype_@sfx@) + }, + /**end repeat**/ +}; + +/************************************ + ** Protected Definitions + ************************************/ +static const simd_data_info * +simd_data_getinfo(simd_data_type dtype) +{ return &simd__data_registry[dtype]; } diff --git a/numpy/core/src/_simd/_simd_easyintrin.inc b/numpy/core/src/_simd/_simd_easyintrin.inc new file mode 100644 index 000000000..54e7ccf01 --- /dev/null +++ b/numpy/core/src/_simd/_simd_easyintrin.inc @@ -0,0 +1,214 @@ +/** + * This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and + * therefore must be built multiple times. 
Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN` + * symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was + * deemed too harmful to readability. + */ +#define SIMD_INTRIN_DEF(NAME) \ + { NPY_TOSTRING(NAME), simd__intrin_##NAME, METH_VARARGS, NULL } , // comma + +#define SIMD_IMPL_INTRIN_0(NAME, RET) \ + static PyObject *simd__intrin_##NAME \ + (PyObject* NPY_UNUSED(self), PyObject *args) \ + { \ + if (!PyArg_ParseTuple( \ + args, ":" NPY_TOSTRING(NAME)) \ + ) return NULL; \ + simd_arg a = { \ + .dtype = simd_data_##RET, \ + .data = {.RET = npyv_##NAME()}, \ + }; \ + return simd_arg_to_obj(&a); \ + } + +#define SIMD_IMPL_INTRIN_0N(NAME) \ + static PyObject *simd__intrin_##NAME \ + (PyObject* NPY_UNUSED(self), PyObject *args) \ + { \ + if (!PyArg_ParseTuple( \ + args, ":" NPY_TOSTRING(NAME)) \ + ) return NULL; \ + npyv_##NAME(); \ + Py_RETURN_NONE; \ + } + +#define SIMD_IMPL_INTRIN_1(NAME, RET, IN0) \ + static PyObject *simd__intrin_##NAME \ + (PyObject* NPY_UNUSED(self), PyObject *args) \ + { \ + simd_arg arg = {.dtype = simd_data_##IN0}; \ + if (!PyArg_ParseTuple( \ + args, "O&:"NPY_TOSTRING(NAME), \ + simd_arg_converter, &arg \ + )) return NULL; \ + simd_data data = {.RET = npyv_##NAME( \ + arg.data.IN0 \ + )}; \ + simd_arg_free(&arg); \ + simd_arg ret = { \ + .data = data, .dtype = simd_data_##RET \ + }; \ + return simd_arg_to_obj(&ret); \ + } + +#define SIMD_IMPL_INTRIN_2(NAME, RET, IN0, IN1) \ + static PyObject *simd__intrin_##NAME \ + (PyObject* NPY_UNUSED(self), PyObject *args) \ + { \ + simd_arg arg1 = {.dtype = simd_data_##IN0}; \ + simd_arg arg2 = {.dtype = simd_data_##IN1}; \ + if (!PyArg_ParseTuple( \ + args, "O&O&:"NPY_TOSTRING(NAME), \ + simd_arg_converter, &arg1, \ + simd_arg_converter, &arg2 \ + )) return NULL; \ + simd_data data = {.RET = npyv_##NAME( \ + arg1.data.IN0, arg2.data.IN1 \ + )}; \ + simd_arg_free(&arg1); \ + simd_arg_free(&arg2); \ + simd_arg ret = { \ + .data = data, .dtype = simd_data_##RET \ + }; \ + return simd_arg_to_obj(&ret); \ + } + +#define SIMD__REPEAT_2IMM(C, NAME, IN0) \ + C == arg2.data.u8 ? 
NPY_CAT(npyv_, NAME)(arg1.data.IN0, C) : + +#define SIMD_IMPL_INTRIN_2IMM(NAME, RET, IN0, CONST_RNG) \ + static PyObject *simd__intrin_##NAME \ + (PyObject* NPY_UNUSED(self), PyObject *args) \ + { \ + simd_arg arg1 = {.dtype = simd_data_##IN0}; \ + simd_arg arg2 = {.dtype = simd_data_u8}; \ + if (!PyArg_ParseTuple( \ + args, "O&O&:"NPY_TOSTRING(NAME), \ + simd_arg_converter, &arg1, \ + simd_arg_converter, &arg2 \ + )) return NULL; \ + simd_data data; \ + data.RET = NPY_CAT(SIMD__IMPL_COUNT_, CONST_RNG)( \ + SIMD__REPEAT_2IMM, NAME, IN0 \ + ) npyv_##NAME(arg1.data.IN0, 0); \ + simd_arg_free(&arg1); \ + simd_arg ret = { \ + .data = data, .dtype = simd_data_##RET \ + }; \ + return simd_arg_to_obj(&ret); \ + } + +#define SIMD_IMPL_INTRIN_3(NAME, RET, IN0, IN1, IN2) \ + static PyObject *simd__intrin_##NAME \ + (PyObject* NPY_UNUSED(self), PyObject *args) \ + { \ + simd_arg arg1 = {.dtype = simd_data_##IN0}; \ + simd_arg arg2 = {.dtype = simd_data_##IN1}; \ + simd_arg arg3 = {.dtype = simd_data_##IN2}; \ + if (!PyArg_ParseTuple( \ + args, "O&O&O&:"NPY_TOSTRING(NAME), \ + simd_arg_converter, &arg1, \ + simd_arg_converter, &arg2, \ + simd_arg_converter, &arg3 \ + )) return NULL; \ + simd_data data = {.RET = npyv_##NAME( \ + arg1.data.IN0, arg2.data.IN1, \ + arg3.data.IN2 \ + )}; \ + simd_arg_free(&arg1); \ + simd_arg_free(&arg2); \ + simd_arg_free(&arg3); \ + simd_arg ret = { \ + .data = data, .dtype = simd_data_##RET \ + }; \ + return simd_arg_to_obj(&ret); \ + } +/** + * Helper macros for repeating and expand a certain macro. + * Mainly used for converting a scalar to an immediate constant. + */ +#define SIMD__IMPL_COUNT_7(FN, ...) \ + NPY_EXPAND(FN(0, __VA_ARGS__)) \ + SIMD__IMPL_COUNT_7_(FN, __VA_ARGS__) + +#define SIMD__IMPL_COUNT_8(FN, ...) \ + SIMD__IMPL_COUNT_7_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(8, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_15(FN, ...) \ + NPY_EXPAND(FN(0, __VA_ARGS__)) \ + SIMD__IMPL_COUNT_15_(FN, __VA_ARGS__) + +#define SIMD__IMPL_COUNT_16(FN, ...) \ + SIMD__IMPL_COUNT_15_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(16, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_31(FN, ...) \ + NPY_EXPAND(FN(0, __VA_ARGS__)) \ + SIMD__IMPL_COUNT_31_(FN, __VA_ARGS__) + +#define SIMD__IMPL_COUNT_32(FN, ...) \ + SIMD__IMPL_COUNT_31_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(32, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_47(FN, ...) \ + NPY_EXPAND(FN(0, __VA_ARGS__)) \ + SIMD__IMPL_COUNT_47_(FN, __VA_ARGS__) + +#define SIMD__IMPL_COUNT_48(FN, ...) \ + SIMD__IMPL_COUNT_47_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(48, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_63(FN, ...) \ + NPY_EXPAND(FN(0, __VA_ARGS__)) \ + SIMD__IMPL_COUNT_63_(FN, __VA_ARGS__) + +#define SIMD__IMPL_COUNT_64(FN, ...) \ + SIMD__IMPL_COUNT_63_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(64, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_7_(FN, ...) \ + NPY_EXPAND(FN(1, __VA_ARGS__)) \ + NPY_EXPAND(FN(2, __VA_ARGS__)) NPY_EXPAND(FN(3, __VA_ARGS__)) \ + NPY_EXPAND(FN(4, __VA_ARGS__)) NPY_EXPAND(FN(5, __VA_ARGS__)) \ + NPY_EXPAND(FN(6, __VA_ARGS__)) NPY_EXPAND(FN(7, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_15_(FN, ...) \ + SIMD__IMPL_COUNT_7_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(8, __VA_ARGS__)) NPY_EXPAND(FN(9, __VA_ARGS__)) \ + NPY_EXPAND(FN(10, __VA_ARGS__)) NPY_EXPAND(FN(11, __VA_ARGS__)) \ + NPY_EXPAND(FN(12, __VA_ARGS__)) NPY_EXPAND(FN(13, __VA_ARGS__)) \ + NPY_EXPAND(FN(14, __VA_ARGS__)) NPY_EXPAND(FN(15, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_31_(FN, ...) 
\ + SIMD__IMPL_COUNT_15_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(16, __VA_ARGS__)) NPY_EXPAND(FN(17, __VA_ARGS__)) \ + NPY_EXPAND(FN(18, __VA_ARGS__)) NPY_EXPAND(FN(19, __VA_ARGS__)) \ + NPY_EXPAND(FN(20, __VA_ARGS__)) NPY_EXPAND(FN(21, __VA_ARGS__)) \ + NPY_EXPAND(FN(22, __VA_ARGS__)) NPY_EXPAND(FN(23, __VA_ARGS__)) \ + NPY_EXPAND(FN(24, __VA_ARGS__)) NPY_EXPAND(FN(25, __VA_ARGS__)) \ + NPY_EXPAND(FN(26, __VA_ARGS__)) NPY_EXPAND(FN(27, __VA_ARGS__)) \ + NPY_EXPAND(FN(28, __VA_ARGS__)) NPY_EXPAND(FN(29, __VA_ARGS__)) \ + NPY_EXPAND(FN(30, __VA_ARGS__)) NPY_EXPAND(FN(31, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_47_(FN, ...) \ + SIMD__IMPL_COUNT_31_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(32, __VA_ARGS__)) NPY_EXPAND(FN(33, __VA_ARGS__)) \ + NPY_EXPAND(FN(34, __VA_ARGS__)) NPY_EXPAND(FN(35, __VA_ARGS__)) \ + NPY_EXPAND(FN(36, __VA_ARGS__)) NPY_EXPAND(FN(37, __VA_ARGS__)) \ + NPY_EXPAND(FN(38, __VA_ARGS__)) NPY_EXPAND(FN(39, __VA_ARGS__)) \ + NPY_EXPAND(FN(40, __VA_ARGS__)) NPY_EXPAND(FN(41, __VA_ARGS__)) \ + NPY_EXPAND(FN(42, __VA_ARGS__)) NPY_EXPAND(FN(43, __VA_ARGS__)) \ + NPY_EXPAND(FN(44, __VA_ARGS__)) NPY_EXPAND(FN(45, __VA_ARGS__)) \ + NPY_EXPAND(FN(46, __VA_ARGS__)) NPY_EXPAND(FN(47, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_63_(FN, ...) \ + SIMD__IMPL_COUNT_47_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(48, __VA_ARGS__)) NPY_EXPAND(FN(49, __VA_ARGS__)) \ + NPY_EXPAND(FN(50, __VA_ARGS__)) NPY_EXPAND(FN(51, __VA_ARGS__)) \ + NPY_EXPAND(FN(52, __VA_ARGS__)) NPY_EXPAND(FN(53, __VA_ARGS__)) \ + NPY_EXPAND(FN(54, __VA_ARGS__)) NPY_EXPAND(FN(55, __VA_ARGS__)) \ + NPY_EXPAND(FN(56, __VA_ARGS__)) NPY_EXPAND(FN(57, __VA_ARGS__)) \ + NPY_EXPAND(FN(58, __VA_ARGS__)) NPY_EXPAND(FN(59, __VA_ARGS__)) \ + NPY_EXPAND(FN(60, __VA_ARGS__)) NPY_EXPAND(FN(61, __VA_ARGS__)) \ + NPY_EXPAND(FN(62, __VA_ARGS__)) NPY_EXPAND(FN(63, __VA_ARGS__)) diff --git a/numpy/core/src/_simd/_simd_inc.h.src b/numpy/core/src/_simd/_simd_inc.h.src new file mode 100644 index 000000000..9858fc0dc --- /dev/null +++ b/numpy/core/src/_simd/_simd_inc.h.src @@ -0,0 +1,421 @@ +#ifndef _SIMD_SIMD_INC_H_ +#define _SIMD_SIMD_INC_H_ + +#include <Python.h> +#include "simd/simd.h" + +#if NPY_SIMD +/************************************ + ** Types + ************************************/ +/** + * Gather all data types supported by the module. +*/ +typedef union +{ + // scalars + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + */ + npyv_lanetype_@sfx@ @sfx@; + /**end repeat**/ + // sequence + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + */ + npyv_lanetype_@sfx@ *q@sfx@; + /**end repeat**/ + // vectors + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, b8, b16, b32, b64# + */ + npyv_@sfx@ v@sfx@; + /**end repeat**/ + // multi-vectors x2 + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32# + */ + npyv_@sfx@x2 v@sfx@x2; + /**end repeat**/ + // multi-vectors x3 + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32# + */ + npyv_@sfx@x3 v@sfx@x3; + /**end repeat**/ +#if NPY_SIMD_F64 + npyv_f64 vf64; + npyv_f64x2 vf64x2; + npyv_f64x3 vf64x3; +#endif +} simd_data; + +/** + * Data types IDs and suffixes. Must be same data types as the ones + * in union 'simd_data' to fit the macros in '_simd_inc_easyintrin.h'. 
+*/ +typedef enum +{ + simd_data_none = 0, + // scalars + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + */ + simd_data_@sfx@, + /**end repeat**/ + // sequences + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + */ + simd_data_q@sfx@, + /**end repeat**/ + // vectors + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64, b8, b16, b32, b64# + */ + simd_data_v@sfx@, + /**end repeat**/ + // multi-vectors x2 + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + */ + simd_data_v@sfx@x2, + /**end repeat**/ + // multi-vectors x3 + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + */ + simd_data_v@sfx@x3, + /**end repeat**/ + simd_data_end, +} simd_data_type; +/************************************ + ** Declarations (inc_data) + ************************************/ +/** + * simd_data_type information + */ +typedef struct +{ + // type name compatible with python style + const char *pyname; + // returns '1' if the type represent a unsigned integer + int is_unsigned:1; + // returns '1' if the type represent a signed integer + int is_signed:1; + // returns '1' if the type represent a single or double precision + int is_float:1; + // returns '1' if the type represent a boolean + int is_bool:1; + // returns '1' if the type represent a sequence + int is_sequence:1; + // returns '1' if the type represent a scalar + int is_scalar:1; + // returns '1' if the type represent a vector + int is_vector:1; + // returns the len of multi-vector if the type reprsent x2 or x3 vector + // otherwise returns 0, e.g. returns 2 if data type is simd_data_vu8x2 + int is_vectorx; + // returns the equivalent scalar data type e.g. simd_data_vu8 -> simd_data_u8 + simd_data_type to_scalar; + // returns the equivalent scalar data type e.g. simd_data_s8 -> simd_data_vs8 + // NOTE: returns the will equivalent "unsigned" vector type in case of "boolean" vector + // e.g. simd_data_vb8 -> simd_data_vu8 + simd_data_type to_vector; + // number of vector lanes + int nlanes; + // sizeof lane type + int lane_size; +} simd_data_info; + +/** + * Returns data info of certain dtype. + * + * Example: + ** const simd_data_info *info = simd_data_getinfo(simd_data_vu8); + ** if (info->is_vector && info->is_unsigned) { + ** ... + ** } + */ +static const simd_data_info * +simd_data_getinfo(simd_data_type dtype); + +/************************************ + ** Declarations (inc_vector) + ************************************/ +typedef struct +{ + PyObject_HEAD + // vector type id + simd_data_type dtype; + // vector data, aligned for safe casting + npyv_lanetype_u8 NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) data[NPY_SIMD_WIDTH]; +} PySIMDVectorObject; +/** + * Create a Python obj(PySIMDVectorObject) from a NPYV vector based on the contents + * of `data`(simd_data) and according to the vector data type `dtype` + * on range(simd_data_[vu8:vf64]). + * Return NULL and a Python exception on failure, otherwise new reference. 
+ * + * Example: + ** simd_data data = {.vu8 = npyv_setall_u8(0xff)}; + ** PySIMDVectorObject *obj = PySIMDVector_FromData(data, simd_data_vu8); + ** if (obj != NULL) { + ** printf("I have a valid vector obj and first element is \n", obj->data[0]); + ** Py_DECREF(obj); + ** } + */ +static PySIMDVectorObject * +PySIMDVector_FromData(simd_data data, simd_data_type dtype); +/** + * Return a NPYV vector(simd_data) representation of `obj`(PySIMDVectorObject) and + * according to the vector data type `dtype` on range (simd_data_[vu8:vf64]). + * Raise a Python exception on failure. + * + * Example: + ** simd_data data = PySIMDVector_AsData(vec_obj, simd_data_vf32); + ** if (!PyErr_Occurred()) { + ** npyv_f32 add_1 = npyv_add_f32(data.vf32, npyv_setall_f32(1)); + ** ... + ** } + */ +static simd_data +PySIMDVector_AsData(PySIMDVectorObject *obj, simd_data_type dtype); +/** + * initialize and register PySIMDVectorType to certain PyModule, + * PySIMDVectorType can be reached through attribute 'vector_type'. + * return -1 on error, 0 on success. + */ +static int +PySIMDVectorType_Init(PyObject *module); + +/************************************ + ** Declarations (inc_convert) + ************************************/ +/** + * Return a C scalar(simd_data) representation of `obj` and + * according to the scalar data type `dtype` on range (simd_data_[u8:f64]). + * Raise a Python exception on failure. + * + * Example: + ** simd_data data = simd_scalar_from_number(obj, simd_data_f32); + ** if (!PyErr_Occurred()) { + ** printf("I have a valid float %d\n", data.f32); + ** } + */ +static simd_data +simd_scalar_from_number(PyObject *obj, simd_data_type dtype); +/** + * Create a Python scalar from a C scalar based on the contents + * of `data`(simd_data) and according to the scalar data type `dtype` + * on range(simd_data_[u8:f64]). + * Return NULL and a Python exception on failure, otherwise new reference. + * + * Example: + ** simd_data data = {.u32 = 0x7fffffff}; + ** PyObject *obj = simd_scalar_to_number(data, simd_data_s32); + ** if (obj != NULL) { + ** printf("I have a valid Python integer %d\n", PyLong_AsLong(obj)); + ** Py_DECREF(obj); + ** } + */ +static PyObject * +simd_scalar_to_number(simd_data data, simd_data_type dtype); +/** + * Allocate a C array in memory according to number of elements `len` + * and sequence data type `dtype` on range(simd_data_[qu8:qf64]). + * + * Return aligned pointer based on `NPY_SIMD_WIDTH` or NULL + * with a Python exception on failure. + * + * Example: + ** npyv_lanetype_f64 *aligned_ptr = simd_sequence_new(npyv_nlanes_f64, simd_data_f64); + ** if (aligned_ptr != NULL) { + ** // aligned store + ** npyv_storea_f64(aligned_ptr, npyv_setall_f64(1.0)); + ** printf("The first element of my array %f\n", aligned_ptr[0]); + ** simd_sequence_free(aligned_ptr); + ** } + */ +static void * +simd_sequence_new(Py_ssize_t len, simd_data_type dtype); +/** + * Return the number of elements of the allocated C array `ptr` + * by `simd_sequence_new()` or `simd_sequence_from_iterable()`. + */ +static Py_ssize_t +simd_sequence_len(const void *ptr); +/** + * Free the allocated C array by `simd_sequence_new()` or + * `simd_sequence_from_iterable()`. + */ +static void +simd_sequence_free(void *ptr); +/** + * Return a C array representation of a PyObject sequence `obj` and + * according to the sequence data type `dtype` on range (simd_data_[qu8:qf64]). + * + * Note: parameter `min_size` takes the number of minimum acceptable elements. 
+ * + * Return aligned pointer based on `NPY_SIMD_WIDTH` or NULL + * with a Python exception on failure. + * + * Example: + ** npyv_lanetype_u32 *ptr = simd_sequence_from_iterable(seq_obj, simd_data_qu32, npyv_nlanes_u32); + ** if (ptr != NULL) { + ** npyv_u32 a = npyv_load_u32(ptr); + ** ... + ** simd_sequence_free(ptr); + ** } + ** + */ +static void * +simd_sequence_from_iterable(PyObject *obj, simd_data_type dtype, Py_ssize_t min_size); +/** + * Fill a Python sequence object `obj` with a C array `ptr` allocated by + * `simd_sequence_new()` or `simd_sequence_from_iterable()` according to + * to the sequence data type `dtype` on range (simd_data_[qu8:qf64]). + * + * Return 0 on success or -1 with a Python exception on failure. + */ +static int +simd_sequence_fill_iterable(PyObject *obj, const void *ptr, simd_data_type dtype); +/** + * Create a Python list from a C array `ptr` allocated by + * `simd_sequence_new()` or `simd_sequence_from_iterable()` according to + * to the sequence data type `dtype` on range (simd_data_[qu8:qf64]). + * + * Return NULL and a Python exception on failure, otherwise new reference. + */ +static PyObject * +simd_sequence_to_list(const void *ptr, simd_data_type dtype); +/** + * Return a SIMD multi-vector(simd_data) representation of Python tuple of + * (simd_vector*,) `obj` according to the scalar data type `dtype` + * on range (simd_data_[vu8x2:vf64x2])-(simd_data_[vu8x3:vf64x3]). + * + * Raise a Python exception on failure. + * + * Example: + ** simd_data data = simd_vectorx_from_tuple(tuple_obj, simd_data_vf32x2); + ** if (!PyErr_Occurred()) { + ** npyv_f32 sum = npyv_add_f32(data.vf32x2.val[0], data.vf32x2.val[1]); + ** ... + ** } + ** + */ +static simd_data +simd_vectorx_from_tuple(PyObject *obj, simd_data_type dtype); +/** + * Create a Python tuple of 'simd_vector' from a SIMD multi-vector + * based on the contents of `data`(simd_data) and according to + * the multi-vector data type `dtype` on range + * (simd_data_[vu8x2:vf64x2])-(simd_data_[vu8x3:vf64x3]). + * + * Return NULL and a Python exception on failure, otherwise new reference. + */ +static PyObject * +simd_vectorx_to_tuple(simd_data data, simd_data_type dtype); + +/************************************ + ** Declarations (inc_arg) + ************************************/ +typedef struct +{ + simd_data_type dtype; + simd_data data; + // set by simd_arg_converter() + PyObject *obj; +} simd_arg; +/** + * The following functions gather all conversions between all data types + * and they can used instead of all above functions. + */ +/** + * Convert a Python object `obj` into simd_data `arg->data` according to the + * required data type `arg->dtype`. + * + * Return -1 and raise Python exception on failure, otherwise return 0. + * + * Notes: + * - requires `simd_arg_free()` or `simd_sequence_free()` + * to free allocated C array, in case of sequence data types. + * - the number of minimum acceptable elements for sequence data + * types is the number of lanes of the equivalent vector data type. + * + * Example #1: + ** simd_arg arg = {.dtype = simd_data_qu8}; + ** if (simd_arg_from_obj(seq_obj, &arg) < 0) { + ** // fails to convert a python sequence object to C array of uint8 + ** return; + ** } + ** npyv_u8 v_u8 = npyv_load_u8(arg->data.qu8); + ** ... 
+ ** simd_arg_free(&arg); + * + * Example #2: + ** simd_arg arg = {.dtype = simd_data_vf32}; + ** if (simd_arg_from_obj(vector_obj, &arg) < 0) { + ** // fails to convert a python simd_vector to NPYV vector + ** return; + ** } + ** npyv_f32 add_one = npyv_add_f32(arg->data.vu8, npyv_setall_f32(1)); + ** ... + */ +static int +simd_arg_from_obj(PyObject *obj, simd_arg *arg); +/** + * Convert a simd_data `arg->data` to into a Python object according to the + * required data type `arg->dtype`. + * + * Return NULL and raise Python exception on failure, otherwise return + * new reference. + * + * Example: + ** simd_arg arg = {.dtype = simd_data_u32, .data = {.u32 = 0xffffffff}}; + ** PyObject *obj = simd_arg_to_obj(&arg); + ** if (obj == NULL) { + ** // fails convert C uint32 to Python integer. + ** return; + ** } + ** + */ +static PyObject * +simd_arg_to_obj(const simd_arg *arg); +/** + * Converter function used similar to simd_arg_from_obj() but + * used with PyArg_Parse*(). + * + * Notes: + * - requires `simd_arg_free()` or `simd_sequence_free()` + * to free allocated C array, in case of sequence data types. + * - the number of minimum acceptable elements for sequence data + * types is the number of lanes of the equivalent vector data type. + * - use 'arg->obj' to retrieve the parameter obj. + * + * Example: + ** simd_arg seq_f32 = {.dtype = simd_data_qf32}; + ** simd_arg vec_f32 = {.dtype = simd_data_vf32}; + ** if (!PyArg_ParseTuple( + ** args, "O&O&:add_sum_f32", + ** simd_arg_converter, &seq_f32, + ** simd_arg_converter, &vec_f32 + ** )) { + ** // fail + ** return; + ** } + ** npyv_f32 load_a = npyv_load_f32(seq_f32.data.qf32); + ** npyv_f32 sum = npyv_add_f32(load_a, vec_f32.data.vf32); + ** ... + ** simd_arg_free(&seq_f32); + */ +static int +simd_arg_converter(PyObject *obj, simd_arg *arg); +/** + * Free the allocated C array, if the arg hold sequence data type. + */ +static void +simd_arg_free(simd_arg *arg); + +#endif // NPY_SIMD +#endif // _SIMD_SIMD_INC_H_ diff --git a/numpy/core/src/_simd/_simd_vector.inc b/numpy/core/src/_simd/_simd_vector.inc new file mode 100644 index 000000000..2a1378f22 --- /dev/null +++ b/numpy/core/src/_simd/_simd_vector.inc @@ -0,0 +1,178 @@ +/** + * This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and + * therefore must be built multiple times. Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN` + * symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was + * deemed too harmful to readability. 
+ */ +/************************************ + ** Private Definitions + ************************************/ +static Py_ssize_t +simd__vector_length(PySIMDVectorObject *self) +{ + return simd_data_getinfo(self->dtype)->nlanes; +} +static PyObject * +simd__vector_item(PySIMDVectorObject *self, Py_ssize_t i) +{ + const simd_data_info *info = simd_data_getinfo(self->dtype); + int nlanes = info->nlanes; + if (i >= nlanes) { + PyErr_SetString(PyExc_IndexError, "vector index out of range"); + return NULL; + } + npyv_lanetype_u8 *src = self->data + i * info->lane_size; + simd_data data; + memcpy(&data.u64, src, info->lane_size); + return simd_scalar_to_number(data, info->to_scalar); +} + +static PySequenceMethods simd__vector_as_sequence = { + .sq_length = (lenfunc) simd__vector_length, + .sq_item = (ssizeargfunc) simd__vector_item +}; + +static PyObject * +simd__vector_name(PySIMDVectorObject *self) +{ + return PyUnicode_FromString(simd_data_getinfo(self->dtype)->pyname); +} +static PyGetSetDef simd__vector_getset[] = { + { "__name__", (getter)simd__vector_name, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL, NULL } +}; + +static PyObject * +simd__vector_repr(PySIMDVectorObject *self) +{ + PyObject *obj = PySequence_List((PyObject*)self); + if (obj != NULL) { + const char *type_name = simd_data_getinfo(self->dtype)->pyname; + PyObject *repr = PyUnicode_FromFormat("<%s of %R>", type_name, obj); + Py_DECREF(obj); + return repr; + } + return obj; +} +static PyObject * +simd__vector_compare(PyObject *self, PyObject *other, int cmp_op) +{ + PyObject *obj; + if (PyTuple_Check(other)) { + obj = PySequence_Tuple(self); + } else if (PyList_Check(other)) { + obj = PySequence_List(self); + } else { + obj = PySequence_Fast(self, "invalid argument, expected a vector"); + } + if (obj != NULL) { + PyObject *rich = PyObject_RichCompare(obj, other, cmp_op); + Py_DECREF(obj); + return rich; + } + return obj; +} +static PyTypeObject PySIMDVectorType = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = NPY_TOSTRING(NPY_CPU_DISPATCH_CURFX(VECTOR)), + .tp_basicsize = sizeof(PySIMDVectorObject), + .tp_repr = (reprfunc)simd__vector_repr, + .tp_as_sequence = &simd__vector_as_sequence, + .tp_flags = Py_TPFLAGS_DEFAULT, + .tp_richcompare = simd__vector_compare, + .tp_getset = simd__vector_getset +}; + +/************************************ + ** Protected Definitions + ************************************/ +static PySIMDVectorObject * +PySIMDVector_FromData(simd_data data, simd_data_type dtype) +{ + const simd_data_info *info = simd_data_getinfo(dtype); + assert(info->is_vector && info->nlanes > 0); + + PySIMDVectorObject *vec = PyObject_New(PySIMDVectorObject, &PySIMDVectorType); + if (vec == NULL) { + return (PySIMDVectorObject*)PyErr_NoMemory(); + } + vec->dtype = dtype; + if (info->is_bool) { + // boolean vectors are internally treated as unsigned + // vectors to add compatibility among all SIMD extensions + switch(dtype) { + case simd_data_vb8: + data.vu8 = npyv_cvt_u8_b8(data.vb8); + break; + case simd_data_vb16: + data.vu16 = npyv_cvt_u16_b16(data.vb16); + break; + case simd_data_vb32: + data.vu32 = npyv_cvt_u32_b32(data.vb32); + break; + default: + data.vu64 = npyv_cvt_u64_b64(data.vb64); + } + } + npyv_store_u8(vec->data, data.vu8); + return vec; +} + +static simd_data +PySIMDVector_AsData(PySIMDVectorObject *vec, simd_data_type dtype) +{ + const simd_data_info *info = simd_data_getinfo(dtype); + assert(info->is_vector && info->nlanes > 0); + + simd_data data = {.u64 = 0}; + if (!PyObject_IsInstance( + (PyObject 
*)vec, (PyObject *)&PySIMDVectorType + )) { + PyErr_Format(PyExc_TypeError, + "a vector type %s is required", info->pyname + ); + return data; + } + if (vec->dtype != dtype) { + PyErr_Format(PyExc_TypeError, + "a vector type %s is required, got (%s)", + info->pyname, simd_data_getinfo(vec->dtype)->pyname + ); + return data; + } + + data.vu8 = npyv_load_u8(vec->data); + if (info->is_bool) { + // boolean vectors are internally treated as unsigned + // vectors to add compatibility among all SIMD extensions + switch(dtype) { + case simd_data_vb8: + data.vb8 = npyv_cvt_b8_u8(data.vu8); + break; + case simd_data_vb16: + data.vb16 = npyv_cvt_b16_u16(data.vu16); + break; + case simd_data_vb32: + data.vb32 = npyv_cvt_b32_u32(data.vu32); + break; + default: + data.vb64 = npyv_cvt_b64_u64(data.vu64); + } + } + return data; +} + +static int +PySIMDVectorType_Init(PyObject *module) +{ + Py_INCREF(&PySIMDVectorType); + if (PyType_Ready(&PySIMDVectorType)) { + return -1; + } + if (PyModule_AddObject( + module, "vector_type",(PyObject *)&PySIMDVectorType + )) { + return -1; + } + return 0; +} diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py new file mode 100644 index 000000000..50e77a4b8 --- /dev/null +++ b/numpy/core/tests/test_simd.py @@ -0,0 +1,550 @@ +# NOTE: Please avoid the use of numpy.testing since NPYV intrinsics +# may be involved in its functionality. +import pytest +from numpy.core._simd import targets + +class _Test_Utility: + # submodule of the desired SIMD extension, e.g. targets["AVX512F"] + npyv = None + # the current data type suffix, e.g. 's8' + sfx = None + + def __getattr__(self, attr): + """ + Call NPYV intrinsics without the 'npyv' attribute prefix, + auto-suffixing them according to the class attribute 'sfx' + """ + return getattr(self.npyv, attr + "_" + self.sfx) + + def _data(self, start=None, count=None, reverse=False): + """ + Create a list of consecutive numbers according to the number of vector lanes.
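+ + For example, assuming a 4-lane vector type: _data() gives [1, 2, 3, 4] + ([1.0, 2.0, 3.0, 4.0] for float suffixes), and _data(reverse=True) + gives [4, 3, 2, 1].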
+ """ + if start is None: + start = 1 + if count is None: + count = self.nlanes + rng = range(start, start + count) + if reverse: + rng = reversed(rng) + if self._is_fp(): + return [x / 1.0 for x in rng] + return list(rng) + + def _is_unsigned(self): + return self.sfx[0] == 'u' + + def _is_signed(self): + return self.sfx[0] == 's' + + def _is_fp(self): + return self.sfx[0] == 'f' + + def _scalar_size(self): + return int(self.sfx[1:]) + + def _int_clip(self, seq): + if self._is_fp(): + return seq + max_int = self._int_max() + min_int = self._int_min() + return [min(max(v, min_int), max_int) for v in seq] + + def _int_max(self): + if self._is_fp(): + return None + max_u = self._to_unsigned(self.setall(-1))[0] + if self._is_signed(): + return max_u // 2 + return max_u + + def _int_min(self): + if self._is_fp(): + return None + if self._is_unsigned(): + return 0 + return -(self._int_max() + 1) + + def _true_mask(self): + max_unsig = getattr(self.npyv, "setall_u" + self.sfx[1:])(-1) + return max_unsig[0] + + def _to_unsigned(self, vector): + if isinstance(vector, (list, tuple)): + return getattr(self.npyv, "load_u" + self.sfx[1:])(vector) + else: + sfx = vector.__name__.replace("npyv_", "") + if sfx[0] == "b": + cvt_intrin = "cvt_u{0}_b{0}" + else: + cvt_intrin = "reinterpret_u{0}_{1}" + return getattr(self.npyv, cvt_intrin.format(sfx[1:], sfx))(vector) + +class _SIMD_INT(_Test_Utility): + """ + To test all integer vector types at once + """ + def test_operators_shift(self): + if self.sfx in ("u8", "s8"): + return + + data_a = self._data(self._int_max() - self.nlanes) + data_b = self._data(self._int_min(), reverse=True) + vdata_a, vdata_b = self.load(data_a), self.load(data_b) + + for count in range(self._scalar_size()): + # load to cast + data_shl_a = self.load([a << count for a in data_a]) + # left shift + shl = self.shl(vdata_a, count) + assert shl == data_shl_a + # left shift by an immediate constant + shli = self.shli(vdata_a, count) + assert shli == data_shl_a + # load to cast + data_shr_a = self.load([a >> count for a in data_a]) + # right shift + shr = self.shr(vdata_a, count) + assert shr == data_shr_a + # right shift by an immediate constant + shri = self.shri(vdata_a, count) + assert shri == data_shr_a + + def test_arithmetic_subadd_saturated(self): + if self.sfx in ("u32", "s32", "u64", "s64"): + return + + data_a = self._data(self._int_max() - self.nlanes) + data_b = self._data(self._int_min(), reverse=True) + vdata_a, vdata_b = self.load(data_a), self.load(data_b) + + data_adds = self._int_clip([a + b for a, b in zip(data_a, data_b)]) + adds = self.adds(vdata_a, vdata_b) + assert adds == data_adds + + data_subs = self._int_clip([a - b for a, b in zip(data_a, data_b)]) + subs = self.subs(vdata_a, vdata_b) + assert subs == data_subs + +class _SIMD_FP(_Test_Utility): + """ + To test all float vector types at once + """ + def test_arithmetic_fused(self): + vdata_a, vdata_b, vdata_c = [self.load(self._data())]*3 + vdata_cx2 = self.add(vdata_c, vdata_c) + # multiply and add, a*b + c + data_fma = self.load([a * b + c for a, b, c in zip(vdata_a, vdata_b, vdata_c)]) + fma = self.muladd(vdata_a, vdata_b, vdata_c) + assert fma == data_fma + # multiply and subtract, a*b - c + fms = self.mulsub(vdata_a, vdata_b, vdata_c) + data_fms = self.sub(data_fma, vdata_cx2) + assert fms == data_fms + # negate multiply and add, -(a*b) + c + nfma = self.nmuladd(vdata_a, vdata_b, vdata_c) + data_nfma = self.sub(vdata_cx2, data_fma) + assert nfma == data_nfma + # negate multiply and subtract, -(a*b) - c + 
nfms = self.nmulsub(vdata_a, vdata_b, vdata_c) + data_nfms = self.mul(data_fma, self.setall(-1)) + assert nfms == data_nfms + +class _SIMD_ALL(_Test_Utility): + """ + To test all vector types at once + """ + def test_memory_load(self): + data = self._data() + # unaligned load + load_data = self.load(data) + assert load_data == data + # aligned load + loada_data = self.loada(data) + assert loada_data == data + # stream load + loads_data = self.loads(data) + assert loads_data == data + # load lower part + loadl = self.loadl(data) + loadl_half = list(loadl)[:self.nlanes//2] + data_half = data[:self.nlanes//2] + assert loadl_half == data_half + assert loadl != data # detect overflow + + def test_memory_store(self): + data = self._data() + vdata = self.load(data) + # unaligned store + store = [0] * self.nlanes + self.store(store, vdata) + assert store == data + # aligned store + store_a = [0] * self.nlanes + self.storea(store_a, vdata) + assert store_a == data + # stream store + store_s = [0] * self.nlanes + self.stores(store_s, vdata) + assert store_s == data + # store lower part + store_l = [0] * self.nlanes + self.storel(store_l, vdata) + assert store_l[:self.nlanes//2] == data[:self.nlanes//2] + assert store_l != vdata # detect overflow + # store higher part + store_h = [0] * self.nlanes + self.storeh(store_h, vdata) + assert store_h[:self.nlanes//2] == data[self.nlanes//2:] + assert store_h != vdata # detect overflow + + def test_memory_partial_load(self): + if self.sfx in ("u8", "s8", "u16", "s16"): + return + + data = self._data() + lanes = list(range(1, self.nlanes + 1)) + lanes += [self.nlanes**2, self.nlanes**4] # test out of range + for n in lanes: + load_till = self.load_till(data, n, 15) + data_till = data[:n] + [15] * (self.nlanes-n) + assert load_till == data_till + load_tillz = self.load_tillz(data, n) + data_tillz = data[:n] + [0] * (self.nlanes-n) + assert load_tillz == data_tillz + + def test_memory_partial_store(self): + if self.sfx in ("u8", "s8", "u16", "s16"): + return + + data = self._data() + data_rev = self._data(reverse=True) + vdata = self.load(data) + lanes = list(range(1, self.nlanes + 1)) + lanes += [self.nlanes**2, self.nlanes**4] + for n in lanes: + data_till = data_rev.copy() + data_till[:n] = data[:n] + store_till = self._data(reverse=True) + self.store_till(store_till, n, vdata) + assert store_till == data_till + + def test_memory_noncont_load(self): + if self.sfx in ("u8", "s8", "u16", "s16"): + return + + for stride in range(1, 64): + data = self._data(count=stride*self.nlanes) + data_stride = data[::stride] + loadn = self.loadn(data, stride) + assert loadn == data_stride + + for stride in range(-64, 0): + data = self._data(stride, -stride*self.nlanes) + data_stride = self.load(data[::stride]) # cast unsigned + loadn = self.loadn(data, stride) + assert loadn == data_stride + + def test_memory_noncont_partial_load(self): + if self.sfx in ("u8", "s8", "u16", "s16"): + return + + lanes = list(range(1, self.nlanes + 1)) + lanes += [self.nlanes**2, self.nlanes**4] + for stride in range(1, 64): + data = self._data(count=stride*self.nlanes) + data_stride = data[::stride] + for n in lanes: + data_stride_till = data_stride[:n] + [15] * (self.nlanes-n) + loadn_till = self.loadn_till(data, stride, n, 15) + assert loadn_till == data_stride_till + data_stride_tillz = data_stride[:n] + [0] * (self.nlanes-n) + loadn_tillz = self.loadn_tillz(data, stride, n) + assert loadn_tillz == data_stride_tillz + + for stride in range(-64, 0): + data = self._data(stride, 
-stride*self.nlanes) + data_stride = list(self.load(data[::stride])) # cast unsigned + for n in lanes: + data_stride_till = data_stride[:n] + [15] * (self.nlanes-n) + loadn_till = self.loadn_till(data, stride, n, 15) + assert loadn_till == data_stride_till + data_stride_tillz = data_stride[:n] + [0] * (self.nlanes-n) + loadn_tillz = self.loadn_tillz(data, stride, n) + assert loadn_tillz == data_stride_tillz + + def test_memory_noncont_store(self): + if self.sfx in ("u8", "s8", "u16", "s16"): + return + + vdata = self.load(self._data()) + for stride in range(1, 64): + data = [15] * stride * self.nlanes + data[::stride] = vdata + storen = [15] * stride * self.nlanes + storen += [127]*64 + self.storen(storen, stride, vdata) + assert storen[:-64] == data + assert storen[-64:] == [127]*64 # detect overflow + + for stride in range(-64, 0): + data = [15] * -stride * self.nlanes + data[::stride] = vdata + storen = [127]*64 + storen += [15] * -stride * self.nlanes + self.storen(storen, stride, vdata) + assert storen[64:] == data + assert storen[:64] == [127]*64 # detect overflow + + def test_memory_noncont_partial_store(self): + if self.sfx in ("u8", "s8", "u16", "s16"): + return + + data = self._data() + vdata = self.load(data) + lanes = list(range(1, self.nlanes + 1)) + lanes += [self.nlanes**2, self.nlanes**4] + for stride in range(1, 64): + for n in lanes: + data_till = [15] * stride * self.nlanes + data_till[::stride] = data[:n] + [15] * (self.nlanes-n) + storen_till = [15] * stride * self.nlanes + storen_till += [127]*64 + self.storen_till(storen_till, stride, n, vdata) + assert storen_till[:-64] == data_till + assert storen_till[-64:] == [127]*64 # detect overflow + + for stride in range(-64, 0): + for n in lanes: + data_till = [15] * -stride * self.nlanes + data_till[::stride] = data[:n] + [15] * (self.nlanes-n) + storen_till = [127]*64 + storen_till += [15] * -stride * self.nlanes + self.storen_till(storen_till, stride, n, vdata) + assert storen_till[64:] == data_till + assert storen_till[:64] == [127]*64 # detect overflow + + def test_misc(self): + broadcast_zero = self.zero() + assert broadcast_zero == [0] * self.nlanes + for i in range(1, 10): + broadcasti = self.setall(i) + assert broadcasti == [i] * self.nlanes + + data_a, data_b = self._data(), self._data(reverse=True) + vdata_a, vdata_b = self.load(data_a), self.load(data_b) + + # the py level of npyv_set_* doesn't support ignoring extra specified lanes or + # filling non-specified lanes with zero. + vset = self.set(*data_a) + assert vset == data_a + # the py level of npyv_setf_* doesn't support ignoring extra specified lanes or + # filling non-specified lanes with the specified scalar.
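+ # the leading 10 below is the C-level fill scalar; it has no effect here + # since every lane is explicitly specified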
+ vsetf = self.setf(10, *data_a) + assert vsetf == data_a + + # We're testing the sanity of _simd's vector types here; the + # reinterpret* intrinsics themselves are tested by the compiler + # during the build of the _simd module + sfxes = ["u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64", "f32"] + if self.npyv.simd_f64: + sfxes.append("f64") + for sfx in sfxes: + vec_name = getattr(self, "reinterpret_" + sfx)(vdata_a).__name__ + assert vec_name == "npyv_" + sfx + + # select & mask operations + select_a = self.select(self.cmpeq(self.zero(), self.zero()), vdata_a, vdata_b) + assert select_a == data_a + select_b = self.select(self.cmpneq(self.zero(), self.zero()), vdata_a, vdata_b) + assert select_b == data_b + + # the cleanup intrinsic is only used with AVX to zero registers and + # avoid the AVX-SSE transition penalty, so there is nothing to test here + self.npyv.cleanup() + + def test_reorder(self): + data_a, data_b = self._data(), self._data(reverse=True) + vdata_a, vdata_b = self.load(data_a), self.load(data_b) + # lower half part + data_a_lo = data_a[:self.nlanes//2] + data_b_lo = data_b[:self.nlanes//2] + # higher half part + data_a_hi = data_a[self.nlanes//2:] + data_b_hi = data_b[self.nlanes//2:] + # combine two lower parts + combinel = self.combinel(vdata_a, vdata_b) + assert combinel == data_a_lo + data_b_lo + # combine two higher parts + combineh = self.combineh(vdata_a, vdata_b) + assert combineh == data_a_hi + data_b_hi + # combine x2 + combine = self.combine(vdata_a, vdata_b) + assert combine == (data_a_lo + data_b_lo, data_a_hi + data_b_hi) + # zip (interleave) + data_zipl = [v for p in zip(data_a_lo, data_b_lo) for v in p] + data_ziph = [v for p in zip(data_a_hi, data_b_hi) for v in p] + vzip = self.zip(vdata_a, vdata_b) + assert vzip == (data_zipl, data_ziph) + + def test_operators_comparison(self): + if self._is_fp(): + data_a = self._data() + else: + data_a = self._data(self._int_max() - self.nlanes) + data_b = self._data(self._int_min(), reverse=True) + vdata_a, vdata_b = self.load(data_a), self.load(data_b) + + mask_true = self._true_mask() + def to_bool(vector): + return [lane == mask_true for lane in vector] + # equal + data_eq = [a == b for a, b in zip(data_a, data_b)] + cmpeq = to_bool(self.cmpeq(vdata_a, vdata_b)) + assert cmpeq == data_eq + # not equal + data_neq = [a != b for a, b in zip(data_a, data_b)] + cmpneq = to_bool(self.cmpneq(vdata_a, vdata_b)) + assert cmpneq == data_neq + # greater than + data_gt = [a > b for a, b in zip(data_a, data_b)] + cmpgt = to_bool(self.cmpgt(vdata_a, vdata_b)) + assert cmpgt == data_gt + # greater than or equal + data_ge = [a >= b for a, b in zip(data_a, data_b)] + cmpge = to_bool(self.cmpge(vdata_a, vdata_b)) + assert cmpge == data_ge + # less than + data_lt = [a < b for a, b in zip(data_a, data_b)] + cmplt = to_bool(self.cmplt(vdata_a, vdata_b)) + assert cmplt == data_lt + # less than or equal + data_le = [a <= b for a, b in zip(data_a, data_b)] + cmple = to_bool(self.cmple(vdata_a, vdata_b)) + assert cmple == data_le + + def test_operators_logical(self): + if self._is_fp(): + data_a = self._data() + else: + data_a = self._data(self._int_max() - self.nlanes) + data_b = self._data(self._int_min(), reverse=True) + vdata_a, vdata_b = self.load(data_a), self.load(data_b) + + if self._is_fp(): + data_cast_a = self._to_unsigned(vdata_a) + data_cast_b = self._to_unsigned(vdata_b) + cast, cast_data = self._to_unsigned, self._to_unsigned + else: + data_cast_a, data_cast_b = data_a, data_b + cast, cast_data = lambda a: a, self.load + + data_xor =
cast_data([a ^ b for a, b in zip(data_cast_a, data_cast_b)]) + vxor = cast(self.xor(vdata_a, vdata_b)) + assert vxor == data_xor + + data_or = cast_data([a | b for a, b in zip(data_cast_a, data_cast_b)]) + vor = cast(getattr(self, "or")(vdata_a, vdata_b)) + assert vor == data_or + + data_and = cast_data([a & b for a, b in zip(data_cast_a, data_cast_b)]) + vand = cast(getattr(self, "and")(vdata_a, vdata_b)) + assert vand == data_and + + data_not = cast_data([~a for a in data_cast_a]) + vnot = cast(getattr(self, "not")(vdata_a)) + assert vnot == data_not + + def test_conversion_boolean(self): + bsfx = "b" + self.sfx[1:] + to_boolean = getattr(self.npyv, "cvt_%s_%s" % (bsfx, self.sfx)) + from_boolean = getattr(self.npyv, "cvt_%s_%s" % (self.sfx, bsfx)) + + false_vb = to_boolean(self.setall(0)) + true_vb = self.cmpeq(self.setall(0), self.setall(0)) + assert false_vb != true_vb + + false_vsfx = from_boolean(false_vb) + true_vsfx = from_boolean(true_vb) + assert false_vsfx != true_vsfx + + def test_arithmetic_subadd(self): + if self._is_fp(): + data_a = self._data() + else: + data_a = self._data(self._int_max() - self.nlanes) + data_b = self._data(self._int_min(), reverse=True) + vdata_a, vdata_b = self.load(data_a), self.load(data_b) + + # non-saturated + data_add = self.load([a + b for a, b in zip(data_a, data_b)]) # load to cast + add = self.add(vdata_a, vdata_b) + assert add == data_add + data_sub = self.load([a - b for a, b in zip(data_a, data_b)]) + sub = self.sub(vdata_a, vdata_b) + assert sub == data_sub + + def test_arithmetic_mul(self): + if self.sfx in ("u64", "s64"): + return + + if self._is_fp(): + data_a = self._data() + else: + data_a = self._data(self._int_max() - self.nlanes) + data_b = self._data(self._int_min(), reverse=True) + vdata_a, vdata_b = self.load(data_a), self.load(data_b) + + data_mul = self.load([a * b for a, b in zip(data_a, data_b)]) + mul = self.mul(vdata_a, vdata_b) + assert mul == data_mul + + def test_arithmetic_div(self): + if not self._is_fp(): + return + + data_a, data_b = self._data(), self._data(reverse=True) + vdata_a, vdata_b = self.load(data_a), self.load(data_b) + + # load to truncate the Python float (f64) results to the precision of f32 + data_div = self.load([a / b for a, b in zip(data_a, data_b)]) + div = self.div(vdata_a, vdata_b) + assert div == data_div + + +int_sfx = ("u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64") +fp_sfx = ("f32", "f64") +all_sfx = int_sfx + fp_sfx +tests_registry = { + int_sfx : _SIMD_INT, + fp_sfx : _SIMD_FP, + all_sfx : _SIMD_ALL +} +for target_name, npyv in targets.items(): + simd_width = npyv.simd if npyv else '' + pretty_name = target_name.split('__') # multi-target separator + if len(pretty_name) > 1: + # multi-target + pretty_name = f"({' '.join(pretty_name)})" + else: + pretty_name = pretty_name[0] + + skip = "" + skip_sfx = dict() + if not npyv: + skip = f"target '{pretty_name}' isn't supported by the current machine" + elif not npyv.simd: + skip = f"target '{pretty_name}' isn't supported by NPYV" + elif not npyv.simd_f64: + skip_sfx["f64"] = f"target '{pretty_name}' doesn't support double-precision" + + for sfxes, cls in tests_registry.items(): + for sfx in sfxes: + skip_m = skip_sfx.get(sfx, skip) + inhr = (cls,) + attr = dict(npyv=targets[target_name], sfx=sfx) + tcls = type(f"Test{cls.__name__}_{simd_width}_{target_name}_{sfx}", inhr, attr) + if skip_m: + pytest.mark.skip(reason=skip_m)(tcls) + globals()[tcls.__name__] = tcls diff --git a/numpy/core/tests/test_simd_module.py b/numpy/core/tests/test_simd_module.py new file mode 100644
index 000000000..3d710884a --- /dev/null +++ b/numpy/core/tests/test_simd_module.py @@ -0,0 +1,97 @@ +import pytest +from numpy.core._simd import targets +""" +This test unit checks only the sanity of common functionality, so all we +need is one submodule representing any of the enabled SIMD extensions to +run the tests on, plus a second submodule, required solely for the single +check of mixing data types between submodules. +""" +npyvs = [npyv_mod for npyv_mod in targets.values() if npyv_mod and npyv_mod.simd] +npyv, npyv2 = (npyvs + [None, None])[:2] + +unsigned_sfx = ["u8", "u16", "u32", "u64"] +signed_sfx = ["s8", "s16", "s32", "s64"] +fp_sfx = ["f32"] +if npyv and npyv.simd_f64: + fp_sfx.append("f64") + +int_sfx = unsigned_sfx + signed_sfx +all_sfx = int_sfx + fp_sfx + +@pytest.mark.skipif(not npyv, reason="could not find any SIMD extension with NPYV support") +class Test_SIMD_MODULE: + + @pytest.mark.parametrize('sfx', all_sfx) + def test_num_lanes(self, sfx): + nlanes = getattr(npyv, "nlanes_" + sfx) + vector = getattr(npyv, "setall_" + sfx)(1) + assert len(vector) == nlanes + + @pytest.mark.parametrize('sfx', all_sfx) + def test_type_name(self, sfx): + vector = getattr(npyv, "setall_" + sfx)(1) + assert vector.__name__ == "npyv_" + sfx + + def test_raises(self): + a, b = [npyv.setall_u32(1)]*2 + for sfx in all_sfx: + vcb = lambda intrin: getattr(npyv, f"{intrin}_{sfx}") + pytest.raises(TypeError, vcb("add"), a) + pytest.raises(TypeError, vcb("add"), a, b, a) + pytest.raises(TypeError, vcb("setall")) + pytest.raises(TypeError, vcb("setall"), [1]) + pytest.raises(TypeError, vcb("load"), 1) + pytest.raises(ValueError, vcb("load"), [1]) + pytest.raises(ValueError, vcb("store"), [1], getattr(npyv, f"reinterpret_{sfx}_u32")(a)) + + @pytest.mark.skipif(not npyv2, reason=( + "could not find a second SIMD extension with NPYV support" + )) + def test_nomix(self): + # mixing vectors between submodules isn't allowed + a = npyv.setall_u32(1) + a2 = npyv2.setall_u32(1) + pytest.raises(TypeError, npyv.add_u32, a2, a2) + pytest.raises(TypeError, npyv2.add_u32, a, a) + + @pytest.mark.parametrize('sfx', unsigned_sfx) + def test_unsigned_overflow(self, sfx): + nlanes = getattr(npyv, "nlanes_" + sfx) + maxu = (1 << int(sfx[1:])) - 1 + maxu_72 = (1 << 72) - 1 + lane = getattr(npyv, "setall_" + sfx)(maxu_72)[0] + assert lane == maxu + lanes = getattr(npyv, "load_" + sfx)([maxu_72] * nlanes) + assert lanes == [maxu] * nlanes + lane = getattr(npyv, "setall_" + sfx)(-1)[0] + assert lane == maxu + lanes = getattr(npyv, "load_" + sfx)([-1] * nlanes) + assert lanes == [maxu] * nlanes + + @pytest.mark.parametrize('sfx', signed_sfx) + def test_signed_overflow(self, sfx): + nlanes = getattr(npyv, "nlanes_" + sfx) + maxs_72 = (1 << 71) - 1 + lane = getattr(npyv, "setall_" + sfx)(maxs_72)[0] + assert lane == -1 + lanes = getattr(npyv, "load_" + sfx)([maxs_72] * nlanes) + assert lanes == [-1] * nlanes + mins_72 = -1 << 71 + lane = getattr(npyv, "setall_" + sfx)(mins_72)[0] + assert lane == 0 + lanes = getattr(npyv, "load_" + sfx)([mins_72] * nlanes) + assert lanes == [0] * nlanes + + def test_truncate_f32(self): + f32 = npyv.setall_f32(0.1)[0] + assert f32 != 0.1 + assert round(f32, 1) == 0.1 + + def test_compare(self): + data_range = range(0, npyv.nlanes_u32) + vdata = npyv.load_u32(data_range) + assert vdata == list(data_range) + assert vdata == tuple(data_range) + for i in data_range: + assert vdata[i] == data_range[i] diff --git
a/numpy/distutils/ccompiler_opt.py b/numpy/distutils/ccompiler_opt.py index 72ea0c388..3eba6e32a 100644 --- a/numpy/distutils/ccompiler_opt.py +++ b/numpy/distutils/ccompiler_opt.py @@ -2372,19 +2372,18 @@ class CCompilerOpt(_Config, _Distutils, _Cache, _CCompiler, _Feature, _Parse): else: dispatch_rows.append(("Generated", '')) for tar in self.feature_sorted(target_sources): - tar_as_seq = [tar] if isinstance(tar, str) else tar sources = target_sources[tar] - name = tar if isinstance(tar, str) else '(%s)' % ' '.join(tar) + pretty_name = tar if isinstance(tar, str) else '(%s)' % ' '.join(tar) flags = ' '.join(self.feature_flags(tar)) implies = ' '.join(self.feature_sorted(self.feature_implies(tar))) detect = ' '.join(self.feature_detect(tar)) extra_checks = [] - for name in tar_as_seq: + for name in ((tar,) if isinstance(tar, str) else tar): extra_checks += self.feature_extra_checks(name) extra_checks = (' '.join(extra_checks) if extra_checks else "none") dispatch_rows.append(('', '')) - dispatch_rows.append((name, implies)) + dispatch_rows.append((pretty_name, implies)) dispatch_rows.append(("Flags", flags)) dispatch_rows.append(("Extra checks", extra_checks)) dispatch_rows.append(("Detect", detect)) diff --git a/numpy/distutils/command/build.py b/numpy/distutils/command/build.py index 60ba4c917..a4fda537d 100644 --- a/numpy/distutils/command/build.py +++ b/numpy/distutils/command/build.py @@ -22,6 +22,8 @@ class build(old_build): "specify a list of dispatched CPU optimizations"), ('disable-optimization', None, "disable CPU optimized code(dispatch,simd,fast...)"), + ('simd-test=', None, + "specify a list of CPU optimizations to be tested against the NumPy SIMD interface"), ] help_options = old_build.help_options + [ @@ -36,6 +38,16 @@ class build(old_build): self.cpu_baseline = "min" self.cpu_dispatch = "max -xop -fma4" # drop AMD legacy features by default self.disable_optimization = False + """ + the '_simd' module is very large; adding more dispatched features + will increase binary size and compile time.
By default we minimize + the targeted features to those most commonly used by the NumPy SIMD interface (NPYV). + NOTE: any specified features will be ignored if they're: + - part of the baseline (--cpu-baseline) + - not part of the dispatch-able features (--cpu-dispatch) + - not supported by the compiler or platform + """ + self.simd_test = "BASELINE SSE2 SSE42 XOP FMA4 (FMA3 AVX2) AVX512F AVX512_SKX VSX VSX2 VSX3 NEON ASIMD" def finalize_options(self): build_scripts = self.build_scripts diff --git a/numpy/distutils/command/build_ext.py b/numpy/distutils/command/build_ext.py index 1a881c56a..ca6f8bcd2 100644 --- a/numpy/distutils/command/build_ext.py +++ b/numpy/distutils/command/build_ext.py @@ -19,8 +19,7 @@ from numpy.distutils.misc_util import ( has_cxx_sources, has_f_sources, is_sequence ) from numpy.distutils.command.config_compiler import show_fortran_compilers -from numpy.distutils.ccompiler_opt import new_ccompiler_opt - +from numpy.distutils.ccompiler_opt import new_ccompiler_opt, CCompilerOpt class build_ext (old_build_ext): @@ -39,6 +38,8 @@ class build_ext (old_build_ext): "specify a list of dispatched CPU optimizations"), ('disable-optimization', None, "disable CPU optimized code(dispatch,simd,fast...)"), + ('simd-test=', None, + "specify a list of CPU optimizations to be tested against the NumPy SIMD interface"), ] help_options = old_build_ext.help_options + [ @@ -56,6 +57,7 @@ class build_ext (old_build_ext): self.cpu_baseline = None self.cpu_dispatch = None self.disable_optimization = None + self.simd_test = None def finalize_options(self): if self.parallel: @@ -87,7 +89,9 @@ class build_ext (old_build_ext): ('cpu_baseline', 'cpu_baseline'), ('cpu_dispatch', 'cpu_dispatch'), ('disable_optimization', 'disable_optimization'), + ('simd_test', 'simd_test') ) + CCompilerOpt.conf_target_groups["simd_test"] = self.simd_test def run(self): if not self.extensions: diff --git a/runtests.py b/runtests.py index f8b70d936..87e26768b 100755 --- a/runtests.py +++ b/runtests.py @@ -122,6 +122,9 @@ def main(argv): help="Specify a list of dispatched CPU optimizations"), parser.add_argument("--disable-optimization", action="store_true", help="Disable CPU optimized code(dispatch,simd,fast...)"), + parser.add_argument("--simd-test", default=None, + help="Specify a list of CPU optimizations to be " + "tested against the NumPy SIMD interface"), parser.add_argument("--show-build-log", action="store_true", help="Show build output rather than using a log file") parser.add_argument("--bench", action="store_true", @@ -439,6 +442,8 @@ def build_project(args): cmd += ["--cpu-dispatch", args.cpu_dispatch] if args.disable_optimization: cmd += ["--disable-optimization"] + if args.simd_test is not None: + cmd += ["--simd-test", args.simd_test] # Install; avoid producing eggs so numpy can be imported from dst_dir. cmd += ['install', '--prefix=' + dst_dir, '--single-version-externally-managed',