author | Matti Picus <matti.picus@gmail.com> | 2023-02-15 14:01:17 +0200
committer | GitHub <noreply@github.com> | 2023-02-15 14:01:17 +0200
commit | 0bd56e7ec12f8ceeb8d082340e71e60b873d5c57 (patch)
tree | 4151f2f93381a1eed7b29ea5cf2aca122711a2b8
parent | a5f4e8e417d815cb577a8f2f9414b07a689d69b1 (diff)
parent | d07d5584fc63df10025190a4ea38c4863c1b1723 (diff)
Merge pull request #22315 from r-devulap/avxsort
ENH: Vectorize quicksort for 16-bit and 64-bit dtype using AVX512
-rw-r--r-- | .gitmodules | 3
-rw-r--r-- | azure-pipelines.yml | 3
-rw-r--r-- | azure-steps-windows.yml | 2
-rw-r--r-- | benchmarks/benchmarks/bench_function_base.py | 2
-rw-r--r-- | numpy/core/meson.build | 11
-rw-r--r-- | numpy/core/setup.py | 3
-rw-r--r-- | numpy/core/src/common/common.hpp | 11
-rw-r--r-- | numpy/core/src/common/half.hpp | 63
-rw-r--r-- | numpy/core/src/common/meta.hpp | 54
-rw-r--r-- | numpy/core/src/common/npstd.hpp | 54
-rw-r--r-- | numpy/core/src/common/npy_config.h | 1
-rw-r--r-- | numpy/core/src/npysort/quicksort.cpp | 127
-rw-r--r-- | numpy/core/src/npysort/simd_qsort.dispatch.cpp | 44
-rw-r--r-- | numpy/core/src/npysort/simd_qsort.hpp | 19
-rw-r--r-- | numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp | 31
-rw-r--r-- | numpy/core/src/npysort/x86-qsort.dispatch.cpp | 835
-rw-r--r-- | numpy/core/src/npysort/x86-qsort.h | 28
m--------- | numpy/core/src/npysort/x86-simd-sort | 0
-rw-r--r-- | numpy/core/tests/test_multiarray.py | 39
19 files changed, 389 insertions, 941 deletions
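
For orientation before the raw diff: the core of the change is a small quicksort_dispatch<T>() shim in quicksort.cpp that maps the element type to a fixed-width type (via the new np::meta::FixedWidth), asks the CPU dispatcher for an AVX-512 kernel from the x86-simd-sort submodule, and falls back to the existing scalar quicksort when no kernel is available. The sketch below is a simplified, self-contained approximation of that pattern, not the NumPy sources; simd_qsort() here is a hypothetical stand-in for the np::qsort_simd::QSort kernels that the real patch resolves through NPY_CPU_DISPATCH_CALL_XB.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <type_traits>
#include <vector>

// Map any integer synonym (long, long long, ...) to a fixed-width type,
// loosely following np::meta::FixedWidth from the new meta.hpp.
template <typename T>
using FixedWidth = std::conditional_t<
    std::is_integral<T>::value,
    std::conditional_t<
        std::is_unsigned<T>::value,
        std::conditional_t<sizeof(T) == 2, std::uint16_t,
            std::conditional_t<sizeof(T) == 4, std::uint32_t, std::uint64_t>>,
        std::conditional_t<sizeof(T) == 2, std::int16_t,
            std::conditional_t<sizeof(T) == 4, std::int32_t, std::int64_t>>>,
    T>;

// Hypothetical stand-in for the dispatched AVX-512 kernel.
template <typename TF>
void simd_qsort(TF *arr, std::ptrdiff_t n)
{
    std::sort(arr, arr + n);
}

// Returns true when a vectorized kernel handled the array, false when the
// caller should fall back to the scalar quicksort_<Tag>() path.
template <typename T>
bool quicksort_dispatch(T *start, std::ptrdiff_t num)
{
    // Only 16/32/64-bit elements have vectorized kernels in the patch.
    if (sizeof(T) != 2 && sizeof(T) != 4 && sizeof(T) != 8) {
        return false;
    }
    using TF = FixedWidth<T>;
    simd_qsort(reinterpret_cast<TF *>(start), num);
    return true;
}

int main()
{
    std::vector<long long> v{5, -1, 3, 2, 4};
    if (!quicksort_dispatch(v.data(), static_cast<std::ptrdiff_t>(v.size()))) {
        std::sort(v.begin(), v.end());  // scalar fallback
    }
    for (auto x : v) {
        std::cout << x << ' ';
    }
    std::cout << '\n';
    return 0;
}
```
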
diff --git a/.gitmodules b/.gitmodules index 1ea274daf..d849a3caf 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "numpy/core/src/umath/svml"] path = numpy/core/src/umath/svml url = https://github.com/numpy/SVML.git +[submodule "numpy/core/src/npysort/x86-simd-sort"] + path = numpy/core/src/npysort/x86-simd-sort + url = https://github.com/intel/x86-simd-sort diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 18b72f490..7657ab87f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -184,6 +184,9 @@ stages: - script: /bin/bash -c "! vulture . --min-confidence 100 --exclude doc/,numpy/distutils/ | grep 'unreachable'" displayName: 'Check for unreachable code paths in Python modules' + - script: git submodule update --init + displayName: 'Fetch submodules' + # prefer usage of clang over gcc proper # to match likely scenario on many user mac machines - script: python setup.py build -j 4 build_src --verbose-cfg install diff --git a/azure-steps-windows.yml b/azure-steps-windows.yml index 318f46398..a147ffd7a 100644 --- a/azure-steps-windows.yml +++ b/azure-steps-windows.yml @@ -1,4 +1,6 @@ steps: +- script: git submodule update --init + displayName: 'Fetch submodules' - task: UsePythonVersion@0 inputs: versionSpec: $(PYTHON_VERSION) diff --git a/benchmarks/benchmarks/bench_function_base.py b/benchmarks/benchmarks/bench_function_base.py index 2e44ff76b..cc37bef39 100644 --- a/benchmarks/benchmarks/bench_function_base.py +++ b/benchmarks/benchmarks/bench_function_base.py @@ -248,7 +248,7 @@ class Sort(Benchmark): # In NumPy 1.17 and newer, 'merge' can be one of several # stable sorts, it isn't necessarily merge sort. ['quick', 'merge', 'heap'], - ['float64', 'int64', 'float32', 'uint32', 'int32', 'int16'], + ['float64', 'int64', 'float32', 'uint32', 'int32', 'int16', 'float16'], [ ('random',), ('ordered',), diff --git a/numpy/core/meson.build b/numpy/core/meson.build index 7cbfe6dc3..e6607d8ad 100644 --- a/numpy/core/meson.build +++ b/numpy/core/meson.build @@ -453,6 +453,11 @@ if cc.get_id() == 'msvc' staticlib_cflags += '-d2VolatileMetadata-' endif endif +# TODO: change to "feature" option in meson_options.txt? 
See +# https://mesonbuild.com/Build-options.html#build-options +if get_option('disable-simd-optimizations') + staticlib_cflags += '-DNPY_DISABLE_OPTIMIZATION' +endif npy_math_internal_h = custom_target( output: 'npy_math_internal.h', @@ -594,7 +599,8 @@ np_core_dep = declare_dependency( '.', 'include', 'src/common', - ] + ], + compile_args: disable_simd_optimizations ) @@ -718,7 +724,8 @@ src_multiarray = [ 'src/multiarray/usertypes.c', 'src/multiarray/vdot.c', src_file.process('src/common/npy_sort.h.src'), - 'src/npysort/x86-qsort.dispatch.cpp', + 'src/npysort/simd_qsort.dispatch.cpp', + 'src/npysort/simd_qsort_16bit.dispatch.cpp', 'src/npysort/quicksort.cpp', 'src/npysort/mergesort.cpp', 'src/npysort/timsort.cpp', diff --git a/numpy/core/setup.py b/numpy/core/setup.py index c7e28af9c..2cad4ba43 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -942,7 +942,6 @@ def configuration(parent_package='',top_path=None): join('src', 'multiarray', 'usertypes.c'), join('src', 'multiarray', 'vdot.c'), join('src', 'common', 'npy_sort.h.src'), - join('src', 'npysort', 'x86-qsort.dispatch.cpp'), join('src', 'npysort', 'quicksort.cpp'), join('src', 'npysort', 'mergesort.cpp'), join('src', 'npysort', 'timsort.cpp'), @@ -964,6 +963,8 @@ def configuration(parent_package='',top_path=None): # links to the arm64 npymath library, # see gh-22673 join('src', 'npymath', 'arm64_exports.c'), + join('src', 'npysort', 'simd_qsort.dispatch.cpp'), + join('src', 'npysort', 'simd_qsort_16bit.dispatch.cpp'), ] ####################################################################### diff --git a/numpy/core/src/common/common.hpp b/numpy/core/src/common/common.hpp new file mode 100644 index 000000000..47d790bcf --- /dev/null +++ b/numpy/core/src/common/common.hpp @@ -0,0 +1,11 @@ +#ifndef NUMPY_CORE_SRC_COMMON_COMMON_HPP +#define NUMPY_CORE_SRC_COMMON_COMMON_HPP +/* + * The following C++ headers are safe to be used standalone, however, + * they are gathered to make it easy for us and for the future need to support PCH. + */ +#include "npstd.hpp" +#include "half.hpp" +#include "meta.hpp" + +#endif // NUMPY_CORE_SRC_COMMON_COMMON_HPP diff --git a/numpy/core/src/common/half.hpp b/numpy/core/src/common/half.hpp new file mode 100644 index 000000000..399f2fa79 --- /dev/null +++ b/numpy/core/src/common/half.hpp @@ -0,0 +1,63 @@ +#ifndef NUMPY_CORE_SRC_COMMON_HALF_HPP +#define NUMPY_CORE_SRC_COMMON_HALF_HPP + +#include "npstd.hpp" + +// TODO(@seiko2plus): +// - covers half-precision operations that being supported by numpy/halffloat.h +// - support __fp16 +// - optimize x86 half<->single via cpu_fp16 +// - optimize ppc64 half<->single via cpu_vsx3 + +namespace np { + +/// @addtogroup cpp_core_types +/// @{ + +/// Provides a type that implements 16-bit floating point (half-precision). +/// This type is ensured to be 16-bit size. +class Half final { + public: + /// @name Public Constructors + /// @{ + + /// Default constructor. initialize nothing. + Half() = default; + /// Copy. + Half(const Half &r) + { + data_.u = r.data_.u; + } + + /// @} + + /// Returns a new Half constracted from the IEEE 754 binary16. + /// @param b the value of binary16. + static Half FromBits(uint16_t b) + { + Half f; + f.data_.u = b; + return f; + } + /// Returns the IEEE 754 binary16 representation. 
+ uint16_t Bits() const + { + return data_.u; + } + + private: + union { + uint16_t u; +/* +TODO(@seiko2plus): support __fp16 +#ifdef NPY_HAVE_HW_FP16 + __fp16 f; +#endif +*/ + } data_; +}; + +/// @} cpp_core_types + +} // namespace np +#endif // NUMPY_CORE_SRC_COMMON_HALF_HPP diff --git a/numpy/core/src/common/meta.hpp b/numpy/core/src/common/meta.hpp new file mode 100644 index 000000000..27ea1857e --- /dev/null +++ b/numpy/core/src/common/meta.hpp @@ -0,0 +1,54 @@ +#ifndef NUMPY_CORE_SRC_COMMON_META_HPP +#define NUMPY_CORE_SRC_COMMON_META_HPP + +#include "npstd.hpp" + +namespace np { namespace meta { +/// @addtogroup cpp_core_meta +/// @{ + +namespace details { +template<int size, bool unsig> +struct IntBySize; + +template<bool unsig> +struct IntBySize<sizeof(uint8_t), unsig> { + using Type = typename std::conditional< + unsig, uint8_t, int8_t>::type; +}; +template<bool unsig> +struct IntBySize<sizeof(uint16_t), unsig> { + using Type = typename std::conditional< + unsig, uint16_t, int16_t>::type; +}; +template<bool unsig> +struct IntBySize<sizeof(uint32_t), unsig> { + using Type = typename std::conditional< + unsig, uint32_t, int32_t>::type; +}; +template<bool unsig> +struct IntBySize<sizeof(uint64_t), unsig> { + using Type = typename std::conditional< + unsig, uint64_t, int64_t>::type; +}; +} // namespace details + +/// Provides safe conversion of any integer type synonyms +/// to a fixed-width integer type. +template<typename T> +struct FixedWidth { + using TF_ = typename details::IntBySize< + sizeof(T), std::is_unsigned<T>::value + >::Type; + + using Type = typename std::conditional< + std::is_integral<T>::value, TF_, T + >::type; +}; + +/// @} cpp_core_meta + +}} // namespace np::meta + +#endif // NUMPY_CORE_SRC_COMMON_META_HPP + diff --git a/numpy/core/src/common/npstd.hpp b/numpy/core/src/common/npstd.hpp new file mode 100644 index 000000000..71993bd7c --- /dev/null +++ b/numpy/core/src/common/npstd.hpp @@ -0,0 +1,54 @@ +#ifndef NUMPY_CORE_SRC_COMMON_NPSTD_HPP +#define NUMPY_CORE_SRC_COMMON_NPSTD_HPP + +#include <cstddef> +#include <cstring> +#include <cctype> + +#include <string> +#include <algorithm> +#include <utility> +#include <cstdlib> +#include <cmath> +#include <complex> +#include <type_traits> + +#include <numpy/npy_common.h> + +#include "npy_config.h" + +namespace np { +/// @addtogroup cpp_core_types +/// @{ +using std::uint8_t; +using std::int8_t; +using std::uint16_t; +using std::int16_t; +using std::uint32_t; +using std::int32_t; +using std::uint64_t; +using std::int64_t; +using std::uintptr_t; +using std::intptr_t; +using std::complex; + +/** Guard for long double. + * + * The C implementation defines long double as double + * on MinGW to provide compatibility with MSVC to unify + * one behavior under Windows OS, which makes npy_longdouble + * not fit to be used with template specialization or overloading. + * + * This type will be set to `void` when `npy_longdouble` is not defined + * as `long double`. 
+ */ +using LongDouble = typename std::conditional< + !std::is_same<npy_longdouble, long double>::value, + void, npy_longdouble +>::type; +/// @} cpp_core_types + +} // namespace np + +#endif // NUMPY_CORE_SRC_COMMON_NPSTD_HPP + diff --git a/numpy/core/src/common/npy_config.h b/numpy/core/src/common/npy_config.h index d6886c5ea..715b17777 100644 --- a/numpy/core/src/common/npy_config.h +++ b/numpy/core/src/common/npy_config.h @@ -2,6 +2,7 @@ #define NUMPY_CORE_SRC_COMMON_NPY_CONFIG_H_ #include "config.h" +#include "npy_cpu_dispatch.h" // brings NPY_HAVE_[CPU features] #include "numpy/numpyconfig.h" #include "numpy/utils.h" #include "numpy/npy_os.h" diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp index 3e351dd84..7497ebaa3 100644 --- a/numpy/core/src/npysort/quicksort.cpp +++ b/numpy/core/src/npysort/quicksort.cpp @@ -54,15 +54,11 @@ #include "npysort_common.h" #include "npysort_heapsort.h" #include "numpy_tag.h" +#include "simd_qsort.hpp" -#include "x86-qsort.h" #include <cstdlib> #include <utility> -#ifndef NPY_DISABLE_OPTIMIZATION -#include "x86-qsort.dispatch.h" -#endif - #define NOT_USED NPY_UNUSED(unused) /* * pushing largest partition has upper bound of log2(n) space @@ -73,70 +69,50 @@ #define SMALL_MERGESORT 20 #define SMALL_STRING 16 +// Temporarily disable AVX512 sorting on WIN32 and CYGWIN until we can figure +// out why it has test failures +#if defined(_MSC_VER) || defined(__CYGWIN__) +template<typename T> +inline bool quicksort_dispatch(T*, npy_intp) +{ + return false; +} +#else +template<typename T> +inline bool quicksort_dispatch(T *start, npy_intp num) +{ + using TF = typename np::meta::FixedWidth<T>::Type; + void (*dispfunc)(TF*, intptr_t) = nullptr; + if (sizeof(T) == sizeof(uint16_t)) { + #ifndef NPY_DISABLE_OPTIMIZATION + #include "simd_qsort_16bit.dispatch.h" + #endif + NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>); + } + else if (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) { + #ifndef NPY_DISABLE_OPTIMIZATION + #include "simd_qsort.dispatch.h" + #endif + NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>); + } + if (dispfunc) { + (*dispfunc)(reinterpret_cast<TF*>(start), static_cast<intptr_t>(num)); + return true; + } + return false; +} +#endif // _MSC_VER || CYGWIN + /* ***************************************************************************** ** NUMERIC SORTS ** ***************************************************************************** */ -namespace { - -template <typename Tag> -struct x86_dispatch { - static bool quicksort(typename Tag::type *, npy_intp) { return false; } -}; - -template <> -struct x86_dispatch<npy::int_tag> { - static bool quicksort(npy_int *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_int); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; - -template <> -struct x86_dispatch<npy::uint_tag> { - static bool quicksort(npy_uint *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_uint); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; - -template <> -struct x86_dispatch<npy::float_tag> { - static bool quicksort(npy_float *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_float); - if (dispfunc) { - (*dispfunc)(start, num); - return 
true; - } - return false; - } -}; - -} // namespace - template <typename Tag, typename type> static int quicksort_(type *start, npy_intp num) { - if (x86_dispatch<Tag>::quicksort(start, num)) - return 0; - type vp; type *pl = start; type *pr = pl + num - 1; @@ -729,56 +705,89 @@ quicksort_ubyte(void *start, npy_intp n, void *NPY_UNUSED(varr)) NPY_NO_EXPORT int quicksort_short(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_short *)start, n)) { + return 0; + } return quicksort_<npy::short_tag>((npy_short *)start, n); } NPY_NO_EXPORT int quicksort_ushort(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_ushort *)start, n)) { + return 0; + } return quicksort_<npy::ushort_tag>((npy_ushort *)start, n); } NPY_NO_EXPORT int quicksort_int(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_int *)start, n)) { + return 0; + } return quicksort_<npy::int_tag>((npy_int *)start, n); } NPY_NO_EXPORT int quicksort_uint(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_uint *)start, n)) { + return 0; + } return quicksort_<npy::uint_tag>((npy_uint *)start, n); } NPY_NO_EXPORT int quicksort_long(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_long *)start, n)) { + return 0; + } return quicksort_<npy::long_tag>((npy_long *)start, n); } NPY_NO_EXPORT int quicksort_ulong(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_ulong *)start, n)) { + return 0; + } return quicksort_<npy::ulong_tag>((npy_ulong *)start, n); } NPY_NO_EXPORT int quicksort_longlong(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_longlong *)start, n)) { + return 0; + } return quicksort_<npy::longlong_tag>((npy_longlong *)start, n); } NPY_NO_EXPORT int quicksort_ulonglong(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_ulonglong *)start, n)) { + return 0; + } return quicksort_<npy::ulonglong_tag>((npy_ulonglong *)start, n); } NPY_NO_EXPORT int quicksort_half(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((np::Half *)start, n)) { + return 0; + } return quicksort_<npy::half_tag>((npy_half *)start, n); } NPY_NO_EXPORT int quicksort_float(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_float *)start, n)) { + return 0; + } return quicksort_<npy::float_tag>((npy_float *)start, n); } NPY_NO_EXPORT int quicksort_double(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_double *)start, n)) { + return 0; + } return quicksort_<npy::double_tag>((npy_double *)start, n); } NPY_NO_EXPORT int diff --git a/numpy/core/src/npysort/simd_qsort.dispatch.cpp b/numpy/core/src/npysort/simd_qsort.dispatch.cpp new file mode 100644 index 000000000..101bb3dcc --- /dev/null +++ b/numpy/core/src/npysort/simd_qsort.dispatch.cpp @@ -0,0 +1,44 @@ +/*@targets + * $maxopt $keep_baseline avx512_skx + */ +// policy $keep_baseline is used to avoid skip building avx512_skx +// when its part of baseline features (--cpu-baseline), since +// 'baseline' option isn't specified within targets. 
+ +#include "simd_qsort.hpp" + +#if defined(NPY_HAVE_AVX512_SKX) && !defined(_MSC_VER) + #include "x86-simd-sort/src/avx512-32bit-qsort.hpp" + #include "x86-simd-sort/src/avx512-64bit-qsort.hpp" +#endif + +namespace np { namespace qsort_simd { + +#if defined(NPY_HAVE_AVX512_SKX) && !defined(_MSC_VER) +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int32_t *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint32_t *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int64_t *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint64_t *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(float *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +#endif // NPY_HAVE_AVX512_SKX + +}} // namespace np::simd diff --git a/numpy/core/src/npysort/simd_qsort.hpp b/numpy/core/src/npysort/simd_qsort.hpp new file mode 100644 index 000000000..7cdee774d --- /dev/null +++ b/numpy/core/src/npysort/simd_qsort.hpp @@ -0,0 +1,19 @@ +#ifndef NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP +#define NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP + +#include "common.hpp" + +namespace np { namespace qsort_simd { + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "simd_qsort.dispatch.h" +#endif +NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSort, (T *arr, intptr_t size)) + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "simd_qsort_16bit.dispatch.h" +#endif +NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSort, (T *arr, intptr_t size)) + +} } // np::qsort_simd +#endif // NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP diff --git a/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp b/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp new file mode 100644 index 000000000..a6465a883 --- /dev/null +++ b/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp @@ -0,0 +1,31 @@ +/*@targets + * $maxopt $keep_baseline avx512_icl + */ +// policy $keep_baseline is used to avoid skip building avx512_skx +// when its part of baseline features (--cpu-baseline), since +// 'baseline' option isn't specified within targets. + +#include "simd_qsort.hpp" + +#if defined(NPY_HAVE_AVX512_ICL) && !defined(_MSC_VER) + #include "x86-simd-sort/src/avx512-16bit-qsort.hpp" +#endif + +namespace np { namespace qsort_simd { + +#if defined(NPY_HAVE_AVX512_ICL) && !defined(_MSC_VER) +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(Half *arr, intptr_t size) +{ + avx512_qsort_fp16(reinterpret_cast<uint16_t*>(arr), size); +} +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint16_t *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int16_t *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +#endif // NPY_HAVE_AVX512_ICL + +}} // namespace np::qsort_simd diff --git a/numpy/core/src/npysort/x86-qsort.dispatch.cpp b/numpy/core/src/npysort/x86-qsort.dispatch.cpp deleted file mode 100644 index 8e88cc667..000000000 --- a/numpy/core/src/npysort/x86-qsort.dispatch.cpp +++ /dev/null @@ -1,835 +0,0 @@ -/*@targets - * $maxopt $keep_baseline avx512_skx - */ -// policy $keep_baseline is used to avoid skip building avx512_skx -// when its part of baseline features (--cpu-baseline), since -// 'baseline' option isn't specified within targets. 
- -#include "x86-qsort.h" -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#ifdef NPY_HAVE_AVX512_SKX -#include "numpy/npy_math.h" - -#include "npy_sort.h" -#include "numpy_tag.h" - -#include "simd/simd.h" -#include <immintrin.h> - -template <typename Tag, typename type> -NPY_NO_EXPORT int -heapsort_(type *start, npy_intp n); - -/* - * Quicksort using AVX-512 for int, uint32 and float. The ideas and code are - * based on these two research papers: - * (1) Fast and Robust Vectorized In-Place Sorting of Primitive Types - * https://drops.dagstuhl.de/opus/volltexte/2021/13775/ - * (2) A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel - * Skylake https://arxiv.org/pdf/1704.08579.pdf - * - * High level idea: Vectorize the quicksort partitioning using AVX-512 - * compressstore instructions. The algorithm to pick the pivot is to use median - * of 72 elements picked at random. If the array size is < 128, then use - * Bitonic sorting network. Good resource for bitonic sorting network: - * http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030 - * - * Refer to https://github.com/numpy/numpy/pull/20133#issuecomment-958110340 - * for potential problems when converting this code to universal intrinsics - * framework. - */ - -/* - * Constants used in sorting 16 elements in a ZMM registers. Based on Bitonic - * sorting network (see - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) - */ -#define NETWORK1 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 -#define NETWORK2 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 -#define NETWORK3 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 -#define NETWORK4 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 -#define NETWORK5 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -#define NETWORK6 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4 -#define NETWORK7 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 -#define ZMM_MAX_FLOAT _mm512_set1_ps(NPY_INFINITYF) -#define ZMM_MAX_UINT _mm512_set1_epi32(NPY_MAX_UINT32) -#define ZMM_MAX_INT _mm512_set1_epi32(NPY_MAX_INT32) -#define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d -#define SHUFFLE_ps(ZMM, MASK) _mm512_shuffle_ps(zmm, zmm, MASK) -#define SHUFFLE_epi32(ZMM, MASK) _mm512_shuffle_epi32(zmm, MASK) - -#define MAX(x, y) (((x) > (y)) ? (x) : (y)) -#define MIN(x, y) (((x) < (y)) ? (x) : (y)) - -/* - * Vectorized random number generator xoroshiro128+. 
Broken into 2 parts: - * (1) vnext generates 2 64-bit random integers - * (2) rnd_epu32 converts this to 4 32-bit random integers and bounds it to - * the length of the array - */ -#define VROTL(x, k) /* rotate each uint64_t value in vector */ \ - _mm256_or_si256(_mm256_slli_epi64((x), (k)), \ - _mm256_srli_epi64((x), 64 - (k))) - -static inline __m256i -vnext(__m256i *s0, __m256i *s1) -{ - *s1 = _mm256_xor_si256(*s0, *s1); /* modify vectors s1 and s0 */ - *s0 = _mm256_xor_si256(_mm256_xor_si256(VROTL(*s0, 24), *s1), - _mm256_slli_epi64(*s1, 16)); - *s1 = VROTL(*s1, 37); - return _mm256_add_epi64(*s0, *s1); /* return random vector */ -} - -/* transform random numbers to the range between 0 and bound - 1 */ -static inline __m256i -rnd_epu32(__m256i rnd_vec, __m256i bound) -{ - __m256i even = _mm256_srli_epi64(_mm256_mul_epu32(rnd_vec, bound), 32); - __m256i odd = _mm256_mul_epu32(_mm256_srli_epi64(rnd_vec, 32), bound); - return _mm256_blend_epi32(odd, even, 0b01010101); -} - -template <typename type> -struct vector; - -template <> -struct vector<npy_int> { - using tag = npy::int_tag; - using type_t = npy_int; - using zmm_t = __m512i; - using ymm_t = __m256i; - - static type_t type_max() { return NPY_MAX_INT32; } - static type_t type_min() { return NPY_MIN_INT32; } - static zmm_t zmm_max() { return _mm512_set1_epi32(type_max()); } - - static __mmask16 ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_NLT); - } - template <int scale> - static ymm_t i64gather(__m512i index, void const *base) - { - return _mm512_i64gather_epi32(index, base, scale); - } - static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); } - static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epi32(x, y); } - static void mask_compressstoreu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_compressstoreu_epi32(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, __mmask16 mask, void const *mem) - { - return _mm512_mask_loadu_epi32(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, __mmask16 mask, zmm_t y) - { - return _mm512_mask_mov_epi32(x, mask, y); - } - static void mask_storeu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_storeu_epi32(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epi32(x, y); } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_epi32(idx, zmm); - } - static type_t reducemax(zmm_t v) { return npyv_reduce_max_s32(v); } - static type_t reducemin(zmm_t v) { return npyv_reduce_min_s32(v); } - static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); } - template<__mmask16 mask> - static zmm_t shuffle(zmm_t zmm) - { - return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_si512(mem, x); - } - - static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_epi32(x, y); } - static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epi32(x, y); } -}; -template <> -struct vector<npy_uint> { - using tag = npy::uint_tag; - using type_t = npy_uint; - using zmm_t = __m512i; - using ymm_t = __m256i; - - static type_t type_max() { return NPY_MAX_UINT32; } - static type_t type_min() { return 0; } - static zmm_t zmm_max() { return _mm512_set1_epi32(type_max()); } - - template<int scale> - static ymm_t i64gather(__m512i index, void const *base) - { - return _mm512_i64gather_epi32(index, base, scale); - } - static __mmask16 ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_epu32_mask(x, y, _MM_CMPINT_NLT); - } - static zmm_t loadu(void 
const *mem) { return _mm512_loadu_si512(mem); } - static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epu32(x, y); } - static void mask_compressstoreu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_compressstoreu_epi32(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, __mmask16 mask, void const *mem) - { - return _mm512_mask_loadu_epi32(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, __mmask16 mask, zmm_t y) - { - return _mm512_mask_mov_epi32(x, mask, y); - } - static void mask_storeu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_storeu_epi32(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epu32(x, y); } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_epi32(idx, zmm); - } - static type_t reducemax(zmm_t v) { return npyv_reduce_max_u32(v); } - static type_t reducemin(zmm_t v) { return npyv_reduce_min_u32(v); } - static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); } - template<__mmask16 mask> - static zmm_t shuffle(zmm_t zmm) - { - return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_si512(mem, x); - } - - static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_epu32(x, y); } - static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epu32(x, y); } -}; -template <> -struct vector<npy_float> { - using tag = npy::float_tag; - using type_t = npy_float; - using zmm_t = __m512; - using ymm_t = __m256; - - static type_t type_max() { return NPY_INFINITYF; } - static type_t type_min() { return -NPY_INFINITYF; } - static zmm_t zmm_max() { return _mm512_set1_ps(type_max()); } - - static __mmask16 ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); - } - template<int scale> - static ymm_t i64gather(__m512i index, void const *base) - { - return _mm512_i64gather_ps(index, base, scale); - } - static zmm_t loadu(void const *mem) { return _mm512_loadu_ps(mem); } - static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_ps(x, y); } - static void mask_compressstoreu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_compressstoreu_ps(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, __mmask16 mask, void const *mem) - { - return _mm512_mask_loadu_ps(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, __mmask16 mask, zmm_t y) - { - return _mm512_mask_mov_ps(x, mask, y); - } - static void mask_storeu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_storeu_ps(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_ps(x, y); } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_ps(idx, zmm); - } - static type_t reducemax(zmm_t v) { return npyv_reduce_max_f32(v); } - static type_t reducemin(zmm_t v) { return npyv_reduce_min_f32(v); } - static zmm_t set1(type_t v) { return _mm512_set1_ps(v); } - template<__mmask16 mask> - static zmm_t shuffle(zmm_t zmm) - { - return _mm512_shuffle_ps(zmm, zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) { return _mm512_storeu_ps(mem, x); } - - static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_ps(x, y); } - static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_ps(x, y); } -}; - -/* - * COEX == Compare and Exchange two registers by swapping min and max values - */ -template <typename vtype, typename mm_t> -void -COEX(mm_t &a, mm_t &b) -{ - mm_t temp = a; - a = vtype::min(a, b); - b = vtype::max(temp, b); -} - -template <typename vtype, typename zmm_t = typename vtype::zmm_t> -static 
inline zmm_t -cmp_merge(zmm_t in1, zmm_t in2, __mmask16 mask) -{ - zmm_t min = vtype::min(in2, in1); - zmm_t max = vtype::max(in2, in1); - return vtype::mask_mov(min, mask, max); // 0 -> min, 1 -> max -} - -/* - * Assumes zmm is random and performs a full sorting network defined in - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg - */ -template <typename vtype, typename zmm_t = typename vtype::zmm_t> -static inline zmm_t -sort_zmm(zmm_t zmm) -{ - zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm), - 0xAAAA); - zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(0, 1, 2, 3)>(zmm), - 0xCCCC); - zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm), - 0xAAAA); - zmm = cmp_merge<vtype>( - zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK3), zmm), 0xF0F0); - zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm), - 0xCCCC); - zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm), - 0xAAAA); - zmm = cmp_merge<vtype>( - zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm), 0xFF00); - zmm = cmp_merge<vtype>( - zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK6), zmm), 0xF0F0); - zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm), - 0xCCCC); - zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm), - 0xAAAA); - return zmm; -} - -// Assumes zmm is bitonic and performs a recursive half cleaner -template <typename vtype, typename zmm_t = typename vtype::zmm_t> -static inline zmm_t -bitonic_merge_zmm(zmm_t zmm) -{ - // 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc .. - zmm = cmp_merge<vtype>( - zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK7), zmm), 0xFF00); - // 2) half_cleaner[8]: compare 1-5, 2-6, 3-7 etc .. 
- zmm = cmp_merge<vtype>( - zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK6), zmm), 0xF0F0); - // 3) half_cleaner[4] - zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm), - 0xCCCC); - // 3) half_cleaner[1] - zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm), - 0xAAAA); - return zmm; -} - -// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner -template <typename vtype, typename zmm_t = typename vtype::zmm_t> -static inline void -bitonic_merge_two_zmm(zmm_t *zmm1, zmm_t *zmm2) -{ - // 1) First step of a merging network: coex of zmm1 and zmm2 reversed - *zmm2 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), *zmm2); - zmm_t zmm3 = vtype::min(*zmm1, *zmm2); - zmm_t zmm4 = vtype::max(*zmm1, *zmm2); - // 2) Recursive half cleaner for each - *zmm1 = bitonic_merge_zmm<vtype>(zmm3); - *zmm2 = bitonic_merge_zmm<vtype>(zmm4); -} - -// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive -// half cleaner -template <typename vtype, typename zmm_t = typename vtype::zmm_t> -static inline void -bitonic_merge_four_zmm(zmm_t *zmm) -{ - zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[2]); - zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[3]); - zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r); - zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r); - zmm_t zmm_t3 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[1], zmm2r)); - zmm_t zmm_t4 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[0], zmm3r)); - zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2); - zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2); - zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4); - zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4); - zmm[0] = bitonic_merge_zmm<vtype>(zmm0); - zmm[1] = bitonic_merge_zmm<vtype>(zmm1); - zmm[2] = bitonic_merge_zmm<vtype>(zmm2); - zmm[3] = bitonic_merge_zmm<vtype>(zmm3); -} - -template <typename vtype, typename zmm_t = typename vtype::zmm_t> -static inline void -bitonic_merge_eight_zmm(zmm_t *zmm) -{ - zmm_t zmm4r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[4]); - zmm_t zmm5r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[5]); - zmm_t zmm6r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[6]); - zmm_t zmm7r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[7]); - zmm_t zmm_t1 = vtype::min(zmm[0], zmm7r); - zmm_t zmm_t2 = vtype::min(zmm[1], zmm6r); - zmm_t zmm_t3 = vtype::min(zmm[2], zmm5r); - zmm_t zmm_t4 = vtype::min(zmm[3], zmm4r); - zmm_t zmm_t5 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[3], zmm4r)); - zmm_t zmm_t6 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[2], zmm5r)); - zmm_t zmm_t7 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[1], zmm6r)); - zmm_t zmm_t8 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[0], zmm7r)); - COEX<vtype>(zmm_t1, zmm_t3); - COEX<vtype>(zmm_t2, zmm_t4); - COEX<vtype>(zmm_t5, zmm_t7); - COEX<vtype>(zmm_t6, zmm_t8); - COEX<vtype>(zmm_t1, zmm_t2); - COEX<vtype>(zmm_t3, zmm_t4); - COEX<vtype>(zmm_t5, zmm_t6); - COEX<vtype>(zmm_t7, zmm_t8); - zmm[0] = bitonic_merge_zmm<vtype>(zmm_t1); - zmm[1] = bitonic_merge_zmm<vtype>(zmm_t2); - zmm[2] = bitonic_merge_zmm<vtype>(zmm_t3); - zmm[3] = bitonic_merge_zmm<vtype>(zmm_t4); - zmm[4] = bitonic_merge_zmm<vtype>(zmm_t5); - zmm[5] = bitonic_merge_zmm<vtype>(zmm_t6); - zmm[6] = bitonic_merge_zmm<vtype>(zmm_t7); - zmm[7] = bitonic_merge_zmm<vtype>(zmm_t8); -} - -template <typename vtype, typename type_t> -static inline void 
-sort_16(type_t *arr, npy_int N) -{ - __mmask16 load_mask = (0x0001 << N) - 0x0001; - typename vtype::zmm_t zmm = - vtype::mask_loadu(vtype::zmm_max(), load_mask, arr); - vtype::mask_storeu(arr, load_mask, sort_zmm<vtype>(zmm)); -} - -template <typename vtype, typename type_t> -static inline void -sort_32(type_t *arr, npy_int N) -{ - if (N <= 16) { - sort_16<vtype>(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - zmm_t zmm1 = vtype::loadu(arr); - __mmask16 load_mask = (0x0001 << (N - 16)) - 0x0001; - zmm_t zmm2 = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr + 16); - zmm1 = sort_zmm<vtype>(zmm1); - zmm2 = sort_zmm<vtype>(zmm2); - bitonic_merge_two_zmm<vtype>(&zmm1, &zmm2); - vtype::storeu(arr, zmm1); - vtype::mask_storeu(arr + 16, load_mask, zmm2); -} - -template <typename vtype, typename type_t> -static inline void -sort_64(type_t *arr, npy_int N) -{ - if (N <= 32) { - sort_32<vtype>(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - zmm_t zmm[4]; - zmm[0] = vtype::loadu(arr); - zmm[1] = vtype::loadu(arr + 16); - __mmask16 load_mask1 = 0xFFFF, load_mask2 = 0xFFFF; - if (N < 48) { - load_mask1 = (0x0001 << (N - 32)) - 0x0001; - load_mask2 = 0x0000; - } - else if (N < 64) { - load_mask2 = (0x0001 << (N - 48)) - 0x0001; - } - zmm[2] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 32); - zmm[3] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 48); - zmm[0] = sort_zmm<vtype>(zmm[0]); - zmm[1] = sort_zmm<vtype>(zmm[1]); - zmm[2] = sort_zmm<vtype>(zmm[2]); - zmm[3] = sort_zmm<vtype>(zmm[3]); - bitonic_merge_two_zmm<vtype>(&zmm[0], &zmm[1]); - bitonic_merge_two_zmm<vtype>(&zmm[2], &zmm[3]); - bitonic_merge_four_zmm<vtype>(zmm); - vtype::storeu(arr, zmm[0]); - vtype::storeu(arr + 16, zmm[1]); - vtype::mask_storeu(arr + 32, load_mask1, zmm[2]); - vtype::mask_storeu(arr + 48, load_mask2, zmm[3]); -} - -template <typename vtype, typename type_t> -static inline void -sort_128(type_t *arr, npy_int N) -{ - if (N <= 64) { - sort_64<vtype>(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - zmm_t zmm[8]; - zmm[0] = vtype::loadu(arr); - zmm[1] = vtype::loadu(arr + 16); - zmm[2] = vtype::loadu(arr + 32); - zmm[3] = vtype::loadu(arr + 48); - zmm[0] = sort_zmm<vtype>(zmm[0]); - zmm[1] = sort_zmm<vtype>(zmm[1]); - zmm[2] = sort_zmm<vtype>(zmm[2]); - zmm[3] = sort_zmm<vtype>(zmm[3]); - __mmask16 load_mask1 = 0xFFFF, load_mask2 = 0xFFFF; - __mmask16 load_mask3 = 0xFFFF, load_mask4 = 0xFFFF; - if (N < 80) { - load_mask1 = (0x0001 << (N - 64)) - 0x0001; - load_mask2 = 0x0000; - load_mask3 = 0x0000; - load_mask4 = 0x0000; - } - else if (N < 96) { - load_mask2 = (0x0001 << (N - 80)) - 0x0001; - load_mask3 = 0x0000; - load_mask4 = 0x0000; - } - else if (N < 112) { - load_mask3 = (0x0001 << (N - 96)) - 0x0001; - load_mask4 = 0x0000; - } - else { - load_mask4 = (0x0001 << (N - 112)) - 0x0001; - } - zmm[4] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 64); - zmm[5] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 80); - zmm[6] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 96); - zmm[7] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 112); - zmm[4] = sort_zmm<vtype>(zmm[4]); - zmm[5] = sort_zmm<vtype>(zmm[5]); - zmm[6] = sort_zmm<vtype>(zmm[6]); - zmm[7] = sort_zmm<vtype>(zmm[7]); - bitonic_merge_two_zmm<vtype>(&zmm[0], &zmm[1]); - bitonic_merge_two_zmm<vtype>(&zmm[2], &zmm[3]); - bitonic_merge_two_zmm<vtype>(&zmm[4], &zmm[5]); - bitonic_merge_two_zmm<vtype>(&zmm[6], &zmm[7]); - bitonic_merge_four_zmm<vtype>(zmm); - 
bitonic_merge_four_zmm<vtype>(zmm + 4); - bitonic_merge_eight_zmm<vtype>(zmm); - vtype::storeu(arr, zmm[0]); - vtype::storeu(arr + 16, zmm[1]); - vtype::storeu(arr + 32, zmm[2]); - vtype::storeu(arr + 48, zmm[3]); - vtype::mask_storeu(arr + 64, load_mask1, zmm[4]); - vtype::mask_storeu(arr + 80, load_mask2, zmm[5]); - vtype::mask_storeu(arr + 96, load_mask3, zmm[6]); - vtype::mask_storeu(arr + 112, load_mask4, zmm[7]); -} - -template <typename type_t> -static inline void -swap(type_t *arr, npy_intp ii, npy_intp jj) -{ - type_t temp = arr[ii]; - arr[ii] = arr[jj]; - arr[jj] = temp; -} - -// Median of 3 strategy -// template<typename type_t> -// static inline -// npy_intp get_pivot_index(type_t *arr, const npy_intp left, const npy_intp -// right) { -// return (rand() % (right + 1 - left)) + left; -// //npy_intp middle = ((right-left)/2) + left; -// //type_t a = arr[left], b = arr[middle], c = arr[right]; -// //if ((b >= a && b <= c) || (b <= a && b >= c)) -// // return middle; -// //if ((a >= b && a <= c) || (a <= b && a >= c)) -// // return left; -// //else -// // return right; -//} - -/* - * Picking the pivot: Median of 72 array elements chosen at random. - */ - -template <typename vtype, typename type_t> -static inline type_t -get_pivot(type_t *arr, const npy_intp left, const npy_intp right) -{ - /* seeds for vectorized random number generator */ - __m256i s0 = _mm256_setr_epi64x(8265987198341093849, 3762817312854612374, - 1324281658759788278, 6214952190349879213); - __m256i s1 = _mm256_setr_epi64x(2874178529384792648, 1257248936691237653, - 7874578921548791257, 1998265912745817298); - s0 = _mm256_add_epi64(s0, _mm256_set1_epi64x(left)); - s1 = _mm256_sub_epi64(s1, _mm256_set1_epi64x(right)); - - npy_intp arrsize = right - left + 1; - __m256i bound = - _mm256_set1_epi32(arrsize > INT32_MAX ? INT32_MAX : arrsize); - __m512i left_vec = _mm512_set1_epi64(left); - __m512i right_vec = _mm512_set1_epi64(right); - using ymm_t = typename vtype::ymm_t; - ymm_t v[9]; - /* fill 9 vectors with random numbers */ - for (npy_int i = 0; i < 9; ++i) { - __m256i rand_64 = vnext(&s0, &s1); /* vector with 4 random uint64_t */ - __m512i rand_32 = _mm512_cvtepi32_epi64(rnd_epu32( - rand_64, bound)); /* random numbers between 0 and bound - 1 */ - __m512i indices; - if (i < 5) - indices = - _mm512_add_epi64(left_vec, rand_32); /* indices for arr */ - else - indices = - _mm512_sub_epi64(right_vec, rand_32); /* indices for arr */ - - v[i] = vtype::template i64gather<sizeof(type_t)>(indices, arr); - } - - /* median network for 9 elements */ - COEX<vtype>(v[0], v[1]); - COEX<vtype>(v[2], v[3]); - COEX<vtype>(v[4], v[5]); - COEX<vtype>(v[6], v[7]); - COEX<vtype>(v[0], v[2]); - COEX<vtype>(v[1], v[3]); - COEX<vtype>(v[4], v[6]); - COEX<vtype>(v[5], v[7]); - COEX<vtype>(v[0], v[4]); - COEX<vtype>(v[1], v[2]); - COEX<vtype>(v[5], v[6]); - COEX<vtype>(v[3], v[7]); - COEX<vtype>(v[1], v[5]); - COEX<vtype>(v[2], v[6]); - COEX<vtype>(v[3], v[5]); - COEX<vtype>(v[2], v[4]); - COEX<vtype>(v[3], v[4]); - COEX<vtype>(v[3], v[8]); - COEX<vtype>(v[4], v[8]); - - // technically v[4] needs to be sorted before we pick the correct median, - // picking the 4th element works just as well for performance - type_t *temp = (type_t *)&v[4]; - - return temp[4]; -} - -/* - * Partition one ZMM register based on the pivot and returns the index of the - * last element that is less than equal to the pivot. 
- */ -template <typename vtype, typename type_t, typename zmm_t> -static inline npy_int -partition_vec(type_t *arr, npy_intp left, npy_intp right, const zmm_t curr_vec, - const zmm_t pivot_vec, zmm_t *smallest_vec, zmm_t *biggest_vec) -{ - /* which elements are larger than the pivot */ - __mmask16 gt_mask = vtype::ge(curr_vec, pivot_vec); - npy_int amount_gt_pivot = _mm_popcnt_u32((npy_int)gt_mask); - vtype::mask_compressstoreu(arr + left, _mm512_knot(gt_mask), curr_vec); - vtype::mask_compressstoreu(arr + right - amount_gt_pivot, gt_mask, - curr_vec); - *smallest_vec = vtype::min(curr_vec, *smallest_vec); - *biggest_vec = vtype::max(curr_vec, *biggest_vec); - return amount_gt_pivot; -} - -/* - * Partition an array based on the pivot and returns the index of the - * last element that is less than equal to the pivot. - */ -template <typename vtype, typename type_t> -static inline npy_intp -partition_avx512(type_t *arr, npy_intp left, npy_intp right, type_t pivot, - type_t *smallest, type_t *biggest) -{ - /* make array length divisible by 16 , shortening the array */ - for (npy_int i = (right - left) % 16; i > 0; --i) { - *smallest = MIN(*smallest, arr[left]); - *biggest = MAX(*biggest, arr[left]); - if (arr[left] > pivot) { - swap(arr, left, --right); - } - else { - ++left; - } - } - - if (left == right) - return left; /* less than 16 elements in the array */ - - using zmm_t = typename vtype::zmm_t; - zmm_t pivot_vec = vtype::set1(pivot); - zmm_t min_vec = vtype::set1(*smallest); - zmm_t max_vec = vtype::set1(*biggest); - - if (right - left == 16) { - zmm_t vec = vtype::loadu(arr + left); - npy_int amount_gt_pivot = partition_vec<vtype>( - arr, left, left + 16, vec, pivot_vec, &min_vec, &max_vec); - *smallest = vtype::reducemin(min_vec); - *biggest = vtype::reducemax(max_vec); - return left + (16 - amount_gt_pivot); - } - - // first and last 16 values are partitioned at the end - zmm_t vec_left = vtype::loadu(arr + left); - zmm_t vec_right = vtype::loadu(arr + (right - 16)); - // store points of the vectors - npy_intp r_store = right - 16; - npy_intp l_store = left; - // indices for loading the elements - left += 16; - right -= 16; - while (right - left != 0) { - zmm_t curr_vec; - /* - * if fewer elements are stored on the right side of the array, - * then next elements are loaded from the right side, - * otherwise from the left side - */ - if ((r_store + 16) - right < left - l_store) { - right -= 16; - curr_vec = vtype::loadu(arr + right); - } - else { - curr_vec = vtype::loadu(arr + left); - left += 16; - } - // partition the current vector and save it on both sides of the array - npy_int amount_gt_pivot = - partition_vec<vtype>(arr, l_store, r_store + 16, curr_vec, - pivot_vec, &min_vec, &max_vec); - ; - r_store -= amount_gt_pivot; - l_store += (16 - amount_gt_pivot); - } - - /* partition and save vec_left and vec_right */ - npy_int amount_gt_pivot = - partition_vec<vtype>(arr, l_store, r_store + 16, vec_left, - pivot_vec, &min_vec, &max_vec); - l_store += (16 - amount_gt_pivot); - amount_gt_pivot = - partition_vec<vtype>(arr, l_store, l_store + 16, vec_right, - pivot_vec, &min_vec, &max_vec); - l_store += (16 - amount_gt_pivot); - *smallest = vtype::reducemin(min_vec); - *biggest = vtype::reducemax(max_vec); - return l_store; -} - -template <typename vtype, typename type_t> -static inline void -qsort_(type_t *arr, npy_intp left, npy_intp right, npy_int max_iters) -{ - /* - * Resort to heapsort if quicksort isn't making any progress - */ - if (max_iters <= 0) { - heapsort_<typename 
vtype::tag>(arr + left, right + 1 - left); - return; - } - /* - * Base case: use bitonic networks to sort arrays <= 128 - */ - if (right + 1 - left <= 128) { - sort_128<vtype>(arr + left, (npy_int)(right + 1 - left)); - return; - } - - type_t pivot = get_pivot<vtype>(arr, left, right); - type_t smallest = vtype::type_max(); - type_t biggest = vtype::type_min(); - npy_intp pivot_index = partition_avx512<vtype>(arr, left, right + 1, pivot, - &smallest, &biggest); - if (pivot != smallest) - qsort_<vtype>(arr, left, pivot_index - 1, max_iters - 1); - if (pivot != biggest) - qsort_<vtype>(arr, pivot_index, right, max_iters - 1); -} - -static inline npy_intp -replace_nan_with_inf(npy_float *arr, npy_intp arrsize) -{ - npy_intp nan_count = 0; - __mmask16 loadmask = 0xFFFF; - while (arrsize > 0) { - if (arrsize < 16) { - loadmask = (0x0001 << arrsize) - 0x0001; - } - __m512 in_zmm = _mm512_maskz_loadu_ps(loadmask, arr); - __mmask16 nanmask = _mm512_cmp_ps_mask(in_zmm, in_zmm, _CMP_NEQ_UQ); - nan_count += _mm_popcnt_u32((npy_int)nanmask); - _mm512_mask_storeu_ps(arr, nanmask, ZMM_MAX_FLOAT); - arr += 16; - arrsize -= 16; - } - return nan_count; -} - -static inline void -replace_inf_with_nan(npy_float *arr, npy_intp arrsize, npy_intp nan_count) -{ - for (npy_intp ii = arrsize - 1; nan_count > 0; --ii) { - arr[ii] = NPY_NANF; - nan_count -= 1; - } -} - -/*************************************** - * C > C++ dispatch - ***************************************/ - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_int)(void *arr, npy_intp arrsize) -{ - if (arrsize > 1) { - qsort_<vector<npy_int>, npy_int>((npy_int *)arr, 0, arrsize - 1, - 2 * (npy_int)log2(arrsize)); - } -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_uint)(void *arr, npy_intp arrsize) -{ - if (arrsize > 1) { - qsort_<vector<npy_uint>, npy_uint>((npy_uint *)arr, 0, arrsize - 1, - 2 * (npy_int)log2(arrsize)); - } -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_float)(void *arr, npy_intp arrsize) -{ - if (arrsize > 1) { - npy_intp nan_count = replace_nan_with_inf((npy_float *)arr, arrsize); - qsort_<vector<npy_float>, npy_float>((npy_float *)arr, 0, arrsize - 1, - 2 * (npy_int)log2(arrsize)); - replace_inf_with_nan((npy_float *)arr, arrsize, nan_count); - } -} - -#endif // NPY_HAVE_AVX512_SKX diff --git a/numpy/core/src/npysort/x86-qsort.h b/numpy/core/src/npysort/x86-qsort.h deleted file mode 100644 index 6340e2bc7..000000000 --- a/numpy/core/src/npysort/x86-qsort.h +++ /dev/null @@ -1,28 +0,0 @@ -#include "numpy/npy_common.h" - -#include "npy_cpu_dispatch.h" - -#ifndef NPY_NO_EXPORT -#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN -#endif - -#ifndef NPY_DISABLE_OPTIMIZATION -#include "x86-qsort.dispatch.h" -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_int, - (void *start, npy_intp num)) - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_uint, - (void *start, npy_intp num)) - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_float, - (void *start, npy_intp num)) - -#ifdef __cplusplus -} -#endif diff --git a/numpy/core/src/npysort/x86-simd-sort b/numpy/core/src/npysort/x86-simd-sort new file mode 160000 +Subproject 7d7591cf5927e83e4a1e7c4b6f2c4dc91a97889 diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index a7b72d54f..3f3fab54d 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -9858,39 +9858,48 @@ class TestViewDtype: # Test various array 
sizes that hit different code paths in quicksort-avx512 -@pytest.mark.parametrize("N", [8, 16, 24, 32, 48, 64, 96, 128, 151, 191, - 256, 383, 512, 1023, 2047]) -def test_sort_float(N): +@pytest.mark.parametrize("N", np.arange(1, 512)) +@pytest.mark.parametrize("dtype", ['e', 'f', 'd']) +def test_sort_float(N, dtype): # Regular data with nan sprinkled np.random.seed(42) - arr = -0.5 + np.random.sample(N).astype('f') + arr = -0.5 + np.random.sample(N).astype(dtype) arr[np.random.choice(arr.shape[0], 3)] = np.nan assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap')) # (2) with +INF - infarr = np.inf*np.ones(N, dtype='f') + infarr = np.inf*np.ones(N, dtype=dtype) infarr[np.random.choice(infarr.shape[0], 5)] = -1.0 assert_equal(np.sort(infarr, kind='quick'), np.sort(infarr, kind='heap')) # (3) with -INF - neginfarr = -np.inf*np.ones(N, dtype='f') + neginfarr = -np.inf*np.ones(N, dtype=dtype) neginfarr[np.random.choice(neginfarr.shape[0], 5)] = 1.0 assert_equal(np.sort(neginfarr, kind='quick'), np.sort(neginfarr, kind='heap')) # (4) with +/-INF - infarr = np.inf*np.ones(N, dtype='f') + infarr = np.inf*np.ones(N, dtype=dtype) infarr[np.random.choice(infarr.shape[0], (int)(N/2))] = -np.inf assert_equal(np.sort(infarr, kind='quick'), np.sort(infarr, kind='heap')) - -def test_sort_int(): - # Random data with NPY_MAX_INT32 and NPY_MIN_INT32 sprinkled - rng = np.random.default_rng(42) - N = 2047 - minv = np.iinfo(np.int32).min - maxv = np.iinfo(np.int32).max - arr = rng.integers(low=minv, high=maxv, size=N).astype('int32') +def test_sort_float16(): + arr = np.arange(65536, dtype=np.int16) + temp = np.frombuffer(arr.tobytes(), dtype=np.float16) + data = np.copy(temp) + np.random.shuffle(data) + data_backup = data + assert_equal(np.sort(data, kind='quick'), + np.sort(data_backup, kind='heap')) + + +@pytest.mark.parametrize("N", np.arange(1, 512)) +@pytest.mark.parametrize("dtype", ['h', 'H', 'i', 'I', 'l', 'L']) +def test_sort_int(N, dtype): + # Random data with MAX and MIN sprinkled + minv = np.iinfo(dtype).min + maxv = np.iinfo(dtype).max + arr = np.random.randint(low=minv, high=maxv-1, size=N, dtype=dtype) arr[np.random.choice(arr.shape[0], 10)] = minv arr[np.random.choice(arr.shape[0], 10)] = maxv assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap')) |
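
As a rough way to exercise the new paths from Python (not part of the patch; the sizes and dtypes below are only examples), the snippet mirrors the added tests by comparing kind='quick', which may take the vectorized route on AVX-512 capable CPUs, against the purely scalar kind='heap'. On machines without AVX-512, or on the MSVC/Cygwin builds this patch excludes, the dispatch falls back to scalar quicksort and the comparison still holds.

```python
# Illustrative check only -- loosely mirrors the new tests in test_multiarray.py.
import numpy as np
from numpy.testing import assert_equal

rng = np.random.default_rng(42)
for dtype in ['float16', 'float32', 'float64', 'int16', 'uint16', 'int64']:
    if np.issubdtype(dtype, np.floating):
        arr = (rng.random(1000) - 0.5).astype(dtype)
        arr[rng.choice(arr.size, 3)] = np.nan   # NaNs must end up sorted last
    else:
        info = np.iinfo(dtype)
        arr = rng.integers(info.min, info.max, size=1000, dtype=dtype)
    # heapsort has no SIMD path, so it serves as the scalar reference
    assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap'))
```
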