author     Matti Picus <matti.picus@gmail.com>    2023-02-15 14:01:17 +0200
committer  GitHub <noreply@github.com>            2023-02-15 14:01:17 +0200
commit     0bd56e7ec12f8ceeb8d082340e71e60b873d5c57 (patch)
tree       4151f2f93381a1eed7b29ea5cf2aca122711a2b8
parent     a5f4e8e417d815cb577a8f2f9414b07a689d69b1 (diff)
parent     d07d5584fc63df10025190a4ea38c4863c1b1723 (diff)
download   numpy-0bd56e7ec12f8ceeb8d082340e71e60b873d5c57.tar.gz
Merge pull request #22315 from r-devulap/avxsort
ENH: Vectorize quicksort for 16-bit and 64-bit dtype using AVX512
-rw-r--r--  .gitmodules                                            |   3
-rw-r--r--  azure-pipelines.yml                                    |   3
-rw-r--r--  azure-steps-windows.yml                                |   2
-rw-r--r--  benchmarks/benchmarks/bench_function_base.py           |   2
-rw-r--r--  numpy/core/meson.build                                 |  11
-rw-r--r--  numpy/core/setup.py                                    |   3
-rw-r--r--  numpy/core/src/common/common.hpp                       |  11
-rw-r--r--  numpy/core/src/common/half.hpp                         |  63
-rw-r--r--  numpy/core/src/common/meta.hpp                         |  54
-rw-r--r--  numpy/core/src/common/npstd.hpp                        |  54
-rw-r--r--  numpy/core/src/common/npy_config.h                     |   1
-rw-r--r--  numpy/core/src/npysort/quicksort.cpp                   | 127
-rw-r--r--  numpy/core/src/npysort/simd_qsort.dispatch.cpp         |  44
-rw-r--r--  numpy/core/src/npysort/simd_qsort.hpp                  |  19
-rw-r--r--  numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp   |  31
-rw-r--r--  numpy/core/src/npysort/x86-qsort.dispatch.cpp          | 835
-rw-r--r--  numpy/core/src/npysort/x86-qsort.h                     |  28
m---------  numpy/core/src/npysort/x86-simd-sort                   |   0
-rw-r--r--  numpy/core/tests/test_multiarray.py                    |  39
19 files changed, 389 insertions(+), 941 deletions(-)
diff --git a/.gitmodules b/.gitmodules
index 1ea274daf..d849a3caf 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,6 @@
[submodule "numpy/core/src/umath/svml"]
path = numpy/core/src/umath/svml
url = https://github.com/numpy/SVML.git
+[submodule "numpy/core/src/npysort/x86-simd-sort"]
+ path = numpy/core/src/npysort/x86-simd-sort
+ url = https://github.com/intel/x86-simd-sort
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 18b72f490..7657ab87f 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -184,6 +184,9 @@ stages:
- script: /bin/bash -c "! vulture . --min-confidence 100 --exclude doc/,numpy/distutils/ | grep 'unreachable'"
displayName: 'Check for unreachable code paths in Python modules'
+ - script: git submodule update --init
+ displayName: 'Fetch submodules'
+
# prefer usage of clang over gcc proper
# to match likely scenario on many user mac machines
- script: python setup.py build -j 4 build_src --verbose-cfg install
diff --git a/azure-steps-windows.yml b/azure-steps-windows.yml
index 318f46398..a147ffd7a 100644
--- a/azure-steps-windows.yml
+++ b/azure-steps-windows.yml
@@ -1,4 +1,6 @@
steps:
+- script: git submodule update --init
+ displayName: 'Fetch submodules'
- task: UsePythonVersion@0
inputs:
versionSpec: $(PYTHON_VERSION)
diff --git a/benchmarks/benchmarks/bench_function_base.py b/benchmarks/benchmarks/bench_function_base.py
index 2e44ff76b..cc37bef39 100644
--- a/benchmarks/benchmarks/bench_function_base.py
+++ b/benchmarks/benchmarks/bench_function_base.py
@@ -248,7 +248,7 @@ class Sort(Benchmark):
# In NumPy 1.17 and newer, 'merge' can be one of several
# stable sorts, it isn't necessarily merge sort.
['quick', 'merge', 'heap'],
- ['float64', 'int64', 'float32', 'uint32', 'int32', 'int16'],
+ ['float64', 'int64', 'float32', 'uint32', 'int32', 'int16', 'float16'],
[
('random',),
('ordered',),
diff --git a/numpy/core/meson.build b/numpy/core/meson.build
index 7cbfe6dc3..e6607d8ad 100644
--- a/numpy/core/meson.build
+++ b/numpy/core/meson.build
@@ -453,6 +453,11 @@ if cc.get_id() == 'msvc'
staticlib_cflags += '-d2VolatileMetadata-'
endif
endif
+# TODO: change to "feature" option in meson_options.txt? See
+# https://mesonbuild.com/Build-options.html#build-options
+if get_option('disable-simd-optimizations')
+ staticlib_cflags += '-DNPY_DISABLE_OPTIMIZATION'
+endif
npy_math_internal_h = custom_target(
output: 'npy_math_internal.h',
@@ -594,7 +599,8 @@ np_core_dep = declare_dependency(
'.',
'include',
'src/common',
- ]
+ ],
+ compile_args: disable_simd_optimizations
)
@@ -718,7 +724,8 @@ src_multiarray = [
'src/multiarray/usertypes.c',
'src/multiarray/vdot.c',
src_file.process('src/common/npy_sort.h.src'),
- 'src/npysort/x86-qsort.dispatch.cpp',
+ 'src/npysort/simd_qsort.dispatch.cpp',
+ 'src/npysort/simd_qsort_16bit.dispatch.cpp',
'src/npysort/quicksort.cpp',
'src/npysort/mergesort.cpp',
'src/npysort/timsort.cpp',
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index c7e28af9c..2cad4ba43 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -942,7 +942,6 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'usertypes.c'),
join('src', 'multiarray', 'vdot.c'),
join('src', 'common', 'npy_sort.h.src'),
- join('src', 'npysort', 'x86-qsort.dispatch.cpp'),
join('src', 'npysort', 'quicksort.cpp'),
join('src', 'npysort', 'mergesort.cpp'),
join('src', 'npysort', 'timsort.cpp'),
@@ -964,6 +963,8 @@ def configuration(parent_package='',top_path=None):
# links to the arm64 npymath library,
# see gh-22673
join('src', 'npymath', 'arm64_exports.c'),
+ join('src', 'npysort', 'simd_qsort.dispatch.cpp'),
+ join('src', 'npysort', 'simd_qsort_16bit.dispatch.cpp'),
]
#######################################################################
diff --git a/numpy/core/src/common/common.hpp b/numpy/core/src/common/common.hpp
new file mode 100644
index 000000000..47d790bcf
--- /dev/null
+++ b/numpy/core/src/common/common.hpp
@@ -0,0 +1,11 @@
+#ifndef NUMPY_CORE_SRC_COMMON_COMMON_HPP
+#define NUMPY_CORE_SRC_COMMON_COMMON_HPP
+/*
+ * The following C++ headers are safe to use standalone; they are gathered
+ * here to make inclusion easy and to prepare for possible PCH support.
+ */
+#include "npstd.hpp"
+#include "half.hpp"
+#include "meta.hpp"
+
+#endif // NUMPY_CORE_SRC_COMMON_COMMON_HPP
diff --git a/numpy/core/src/common/half.hpp b/numpy/core/src/common/half.hpp
new file mode 100644
index 000000000..399f2fa79
--- /dev/null
+++ b/numpy/core/src/common/half.hpp
@@ -0,0 +1,63 @@
+#ifndef NUMPY_CORE_SRC_COMMON_HALF_HPP
+#define NUMPY_CORE_SRC_COMMON_HALF_HPP
+
+#include "npstd.hpp"
+
+// TODO(@seiko2plus):
+// - covers half-precision operations that are supported by numpy/halffloat.h
+// - support __fp16
+// - optimize x86 half<->single via cpu_fp16
+// - optimize ppc64 half<->single via cpu_vsx3
+
+namespace np {
+
+/// @addtogroup cpp_core_types
+/// @{
+
+/// Provides a type that implements 16-bit floating point (half-precision).
+/// This type is guaranteed to be exactly 16 bits in size.
+class Half final {
+ public:
+ /// @name Public Constructors
+ /// @{
+
+ /// Default constructor; initializes nothing.
+ Half() = default;
+ /// Copy.
+ Half(const Half &r)
+ {
+ data_.u = r.data_.u;
+ }
+
+ /// @}
+
+ /// Returns a new Half constructed from the IEEE 754 binary16 representation.
+ /// @param b the value of binary16.
+ static Half FromBits(uint16_t b)
+ {
+ Half f;
+ f.data_.u = b;
+ return f;
+ }
+ /// Returns the IEEE 754 binary16 representation.
+ uint16_t Bits() const
+ {
+ return data_.u;
+ }
+
+ private:
+ union {
+ uint16_t u;
+/*
+TODO(@seiko2plus): support __fp16
+#ifdef NPY_HAVE_HW_FP16
+ __fp16 f;
+#endif
+*/
+ } data_;
+};
+
+/// @} cpp_core_types
+
+} // namespace np
+#endif // NUMPY_CORE_SRC_COMMON_HALF_HPP
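
A minimal standalone sketch (not part of this patch) of how the np::Half interface added above is meant to be used: the value is stored purely as its IEEE 754 binary16 bit pattern, so FromBits()/Bits() round-trip losslessly. The simplified class and the 0x3C00 constant (binary16 for 1.0) are illustrative assumptions, not the real implementation.

    #include <cassert>
    #include <cstdint>

    // Simplified stand-in for np::Half from half.hpp above.
    class Half final {
      public:
        Half() = default;
        static Half FromBits(std::uint16_t b) { Half f; f.bits_ = b; return f; }
        std::uint16_t Bits() const { return bits_; }
      private:
        std::uint16_t bits_;
    };

    int main()
    {
        Half one = Half::FromBits(0x3C00);  // 0x3C00 is binary16 for 1.0
        assert(one.Bits() == 0x3C00);       // the bit pattern is preserved exactly
        return 0;
    }
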
diff --git a/numpy/core/src/common/meta.hpp b/numpy/core/src/common/meta.hpp
new file mode 100644
index 000000000..27ea1857e
--- /dev/null
+++ b/numpy/core/src/common/meta.hpp
@@ -0,0 +1,54 @@
+#ifndef NUMPY_CORE_SRC_COMMON_META_HPP
+#define NUMPY_CORE_SRC_COMMON_META_HPP
+
+#include "npstd.hpp"
+
+namespace np { namespace meta {
+/// @addtogroup cpp_core_meta
+/// @{
+
+namespace details {
+template<int size, bool unsig>
+struct IntBySize;
+
+template<bool unsig>
+struct IntBySize<sizeof(uint8_t), unsig> {
+ using Type = typename std::conditional<
+ unsig, uint8_t, int8_t>::type;
+};
+template<bool unsig>
+struct IntBySize<sizeof(uint16_t), unsig> {
+ using Type = typename std::conditional<
+ unsig, uint16_t, int16_t>::type;
+};
+template<bool unsig>
+struct IntBySize<sizeof(uint32_t), unsig> {
+ using Type = typename std::conditional<
+ unsig, uint32_t, int32_t>::type;
+};
+template<bool unsig>
+struct IntBySize<sizeof(uint64_t), unsig> {
+ using Type = typename std::conditional<
+ unsig, uint64_t, int64_t>::type;
+};
+} // namespace details
+
+/// Provides safe conversion of any integer type synonym
+/// to a fixed-width integer type.
+template<typename T>
+struct FixedWidth {
+ using TF_ = typename details::IntBySize<
+ sizeof(T), std::is_unsigned<T>::value
+ >::Type;
+
+ using Type = typename std::conditional<
+ std::is_integral<T>::value, TF_, T
+ >::type;
+};
+
+/// @} cpp_core_meta
+
+}} // namespace np::meta
+
+#endif // NUMPY_CORE_SRC_COMMON_META_HPP
+
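
A standalone sketch (assumes C++14, not part of this patch) restating the FixedWidth mapping added above and checking what it resolves to for a few common types: platform integers funnel to the fixed-width type of the same size and signedness, while non-integral types pass through unchanged.

    #include <cstdint>
    #include <type_traits>

    // Simplified restatement of np::meta::IntBySize / FixedWidth from meta.hpp above.
    template<int size, bool unsig> struct IntBySize;
    template<bool u> struct IntBySize<1, u> { using Type = typename std::conditional<u, std::uint8_t,  std::int8_t >::type; };
    template<bool u> struct IntBySize<2, u> { using Type = typename std::conditional<u, std::uint16_t, std::int16_t>::type; };
    template<bool u> struct IntBySize<4, u> { using Type = typename std::conditional<u, std::uint32_t, std::int32_t>::type; };
    template<bool u> struct IntBySize<8, u> { using Type = typename std::conditional<u, std::uint64_t, std::int64_t>::type; };

    template<typename T>
    struct FixedWidth {
        using TF_  = typename IntBySize<sizeof(T), std::is_unsigned<T>::value>::Type;
        using Type = typename std::conditional<std::is_integral<T>::value, TF_, T>::type;
    };

    // Integer synonyms collapse to fixed-width types; floating-point types are untouched.
    static_assert(std::is_same<FixedWidth<unsigned short>::Type, std::uint16_t>::value, "");
    static_assert(std::is_same<FixedWidth<long long>::Type,      std::int64_t >::value, "");
    static_assert(std::is_same<FixedWidth<float>::Type,          float        >::value, "");

    int main() { return 0; }
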
diff --git a/numpy/core/src/common/npstd.hpp b/numpy/core/src/common/npstd.hpp
new file mode 100644
index 000000000..71993bd7c
--- /dev/null
+++ b/numpy/core/src/common/npstd.hpp
@@ -0,0 +1,54 @@
+#ifndef NUMPY_CORE_SRC_COMMON_NPSTD_HPP
+#define NUMPY_CORE_SRC_COMMON_NPSTD_HPP
+
+#include <cstddef>
+#include <cstring>
+#include <cctype>
+
+#include <string>
+#include <algorithm>
+#include <utility>
+#include <cstdlib>
+#include <cmath>
+#include <complex>
+#include <type_traits>
+
+#include <numpy/npy_common.h>
+
+#include "npy_config.h"
+
+namespace np {
+/// @addtogroup cpp_core_types
+/// @{
+using std::uint8_t;
+using std::int8_t;
+using std::uint16_t;
+using std::int16_t;
+using std::uint32_t;
+using std::int32_t;
+using std::uint64_t;
+using std::int64_t;
+using std::uintptr_t;
+using std::intptr_t;
+using std::complex;
+
+/** Guard for long double.
+ *
+ * The C implementation defines long double as double
+ * on MinGW to match MSVC and keep one behavior on Windows,
+ * which makes npy_longdouble unsuitable for template
+ * specialization or overloading.
+ *
+ * This type will be set to `void` when `npy_longdouble` is not defined
+ * as `long double`.
+ */
+using LongDouble = typename std::conditional<
+ !std::is_same<npy_longdouble, long double>::value,
+ void, npy_longdouble
+>::type;
+/// @} cpp_core_types
+
+} // namespace np
+
+#endif // NUMPY_CORE_SRC_COMMON_NPSTD_HPP
+
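
A standalone sketch (hypothetical typedef, not from this patch) of the problem the LongDouble guard above addresses: when npy_longdouble is merely an alias of double, a specialization written against it would redefine the double specialization, whereas the void mapping keeps the two distinct.

    #include <type_traits>

    // Hypothetical stand-in: pretend npy_longdouble is an alias of double,
    // as the comment above says happens on MinGW.
    using npy_longdouble = double;

    using LongDouble = std::conditional<
        !std::is_same<npy_longdouble, long double>::value,
        void, npy_longdouble
    >::type;

    // With the guard, this specialization is TypeName<void>, so it can never
    // collide with the specialization written against plain double.
    template<typename T> struct TypeName             { static const char *name() { return "other"; } };
    template<>           struct TypeName<double>     { static const char *name() { return "double"; } };
    template<>           struct TypeName<LongDouble> { static const char *name() { return "long double (guarded)"; } };

    int main() { return 0; }
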
diff --git a/numpy/core/src/common/npy_config.h b/numpy/core/src/common/npy_config.h
index d6886c5ea..715b17777 100644
--- a/numpy/core/src/common/npy_config.h
+++ b/numpy/core/src/common/npy_config.h
@@ -2,6 +2,7 @@
#define NUMPY_CORE_SRC_COMMON_NPY_CONFIG_H_
#include "config.h"
+#include "npy_cpu_dispatch.h" // brings NPY_HAVE_[CPU features]
#include "numpy/numpyconfig.h"
#include "numpy/utils.h"
#include "numpy/npy_os.h"
diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp
index 3e351dd84..7497ebaa3 100644
--- a/numpy/core/src/npysort/quicksort.cpp
+++ b/numpy/core/src/npysort/quicksort.cpp
@@ -54,15 +54,11 @@
#include "npysort_common.h"
#include "npysort_heapsort.h"
#include "numpy_tag.h"
+#include "simd_qsort.hpp"
-#include "x86-qsort.h"
#include <cstdlib>
#include <utility>
-#ifndef NPY_DISABLE_OPTIMIZATION
-#include "x86-qsort.dispatch.h"
-#endif
-
#define NOT_USED NPY_UNUSED(unused)
/*
* pushing largest partition has upper bound of log2(n) space
@@ -73,70 +69,50 @@
#define SMALL_MERGESORT 20
#define SMALL_STRING 16
+// Temporarily disable AVX512 sorting on WIN32 and CYGWIN until we can figure
+// out why it has test failures
+#if defined(_MSC_VER) || defined(__CYGWIN__)
+template<typename T>
+inline bool quicksort_dispatch(T*, npy_intp)
+{
+ return false;
+}
+#else
+template<typename T>
+inline bool quicksort_dispatch(T *start, npy_intp num)
+{
+ using TF = typename np::meta::FixedWidth<T>::Type;
+ void (*dispfunc)(TF*, intptr_t) = nullptr;
+ if (sizeof(T) == sizeof(uint16_t)) {
+ #ifndef NPY_DISABLE_OPTIMIZATION
+ #include "simd_qsort_16bit.dispatch.h"
+ #endif
+ NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
+ }
+ else if (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) {
+ #ifndef NPY_DISABLE_OPTIMIZATION
+ #include "simd_qsort.dispatch.h"
+ #endif
+ NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
+ }
+ if (dispfunc) {
+ (*dispfunc)(reinterpret_cast<TF*>(start), static_cast<intptr_t>(num));
+ return true;
+ }
+ return false;
+}
+#endif // _MSC_VER || CYGWIN
+
/*
*****************************************************************************
** NUMERIC SORTS **
*****************************************************************************
*/
-namespace {
-
-template <typename Tag>
-struct x86_dispatch {
- static bool quicksort(typename Tag::type *, npy_intp) { return false; }
-};
-
-template <>
-struct x86_dispatch<npy::int_tag> {
- static bool quicksort(npy_int *start, npy_intp num)
- {
- void (*dispfunc)(void *, npy_intp) = nullptr;
- NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_int);
- if (dispfunc) {
- (*dispfunc)(start, num);
- return true;
- }
- return false;
- }
-};
-
-template <>
-struct x86_dispatch<npy::uint_tag> {
- static bool quicksort(npy_uint *start, npy_intp num)
- {
- void (*dispfunc)(void *, npy_intp) = nullptr;
- NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_uint);
- if (dispfunc) {
- (*dispfunc)(start, num);
- return true;
- }
- return false;
- }
-};
-
-template <>
-struct x86_dispatch<npy::float_tag> {
- static bool quicksort(npy_float *start, npy_intp num)
- {
- void (*dispfunc)(void *, npy_intp) = nullptr;
- NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_float);
- if (dispfunc) {
- (*dispfunc)(start, num);
- return true;
- }
- return false;
- }
-};
-
-} // namespace
-
template <typename Tag, typename type>
static int
quicksort_(type *start, npy_intp num)
{
- if (x86_dispatch<Tag>::quicksort(start, num))
- return 0;
-
type vp;
type *pl = start;
type *pr = pl + num - 1;
@@ -729,56 +705,89 @@ quicksort_ubyte(void *start, npy_intp n, void *NPY_UNUSED(varr))
NPY_NO_EXPORT int
quicksort_short(void *start, npy_intp n, void *NPY_UNUSED(varr))
{
+ if (quicksort_dispatch((npy_short *)start, n)) {
+ return 0;
+ }
return quicksort_<npy::short_tag>((npy_short *)start, n);
}
NPY_NO_EXPORT int
quicksort_ushort(void *start, npy_intp n, void *NPY_UNUSED(varr))
{
+ if (quicksort_dispatch((npy_ushort *)start, n)) {
+ return 0;
+ }
return quicksort_<npy::ushort_tag>((npy_ushort *)start, n);
}
NPY_NO_EXPORT int
quicksort_int(void *start, npy_intp n, void *NPY_UNUSED(varr))
{
+ if (quicksort_dispatch((npy_int *)start, n)) {
+ return 0;
+ }
return quicksort_<npy::int_tag>((npy_int *)start, n);
}
NPY_NO_EXPORT int
quicksort_uint(void *start, npy_intp n, void *NPY_UNUSED(varr))
{
+ if (quicksort_dispatch((npy_uint *)start, n)) {
+ return 0;
+ }
return quicksort_<npy::uint_tag>((npy_uint *)start, n);
}
NPY_NO_EXPORT int
quicksort_long(void *start, npy_intp n, void *NPY_UNUSED(varr))
{
+ if (quicksort_dispatch((npy_long *)start, n)) {
+ return 0;
+ }
return quicksort_<npy::long_tag>((npy_long *)start, n);
}
NPY_NO_EXPORT int
quicksort_ulong(void *start, npy_intp n, void *NPY_UNUSED(varr))
{
+ if (quicksort_dispatch((npy_ulong *)start, n)) {
+ return 0;
+ }
return quicksort_<npy::ulong_tag>((npy_ulong *)start, n);
}
NPY_NO_EXPORT int
quicksort_longlong(void *start, npy_intp n, void *NPY_UNUSED(varr))
{
+ if (quicksort_dispatch((npy_longlong *)start, n)) {
+ return 0;
+ }
return quicksort_<npy::longlong_tag>((npy_longlong *)start, n);
}
NPY_NO_EXPORT int
quicksort_ulonglong(void *start, npy_intp n, void *NPY_UNUSED(varr))
{
+ if (quicksort_dispatch((npy_ulonglong *)start, n)) {
+ return 0;
+ }
return quicksort_<npy::ulonglong_tag>((npy_ulonglong *)start, n);
}
NPY_NO_EXPORT int
quicksort_half(void *start, npy_intp n, void *NPY_UNUSED(varr))
{
+ if (quicksort_dispatch((np::Half *)start, n)) {
+ return 0;
+ }
return quicksort_<npy::half_tag>((npy_half *)start, n);
}
NPY_NO_EXPORT int
quicksort_float(void *start, npy_intp n, void *NPY_UNUSED(varr))
{
+ if (quicksort_dispatch((npy_float *)start, n)) {
+ return 0;
+ }
return quicksort_<npy::float_tag>((npy_float *)start, n);
}
NPY_NO_EXPORT int
quicksort_double(void *start, npy_intp n, void *NPY_UNUSED(varr))
{
+ if (quicksort_dispatch((npy_double *)start, n)) {
+ return 0;
+ }
return quicksort_<npy::double_tag>((npy_double *)start, n);
}
NPY_NO_EXPORT int
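
A standalone sketch (illustrative only, not from this patch) of the size-based routing idea behind quicksort_dispatch above: every platform integer is funneled, via its byte size, onto one of the fixed-width SIMD kernels, so npy_long lands on the 32-bit or 64-bit path depending on the platform without needing its own template. The kernel names below are descriptive placeholders, not the real dispatch symbols.

    #include <cstdint>
    #include <cstdio>

    // Which SIMD kernel a type of this size would resolve to.
    template<typename T>
    const char *kernel_for()
    {
        if (sizeof(T) == sizeof(std::uint16_t)) return "16-bit kernel (avx512_icl target)";
        if (sizeof(T) == sizeof(std::uint32_t)) return "32/64-bit kernel (avx512_skx target)";
        if (sizeof(T) == sizeof(std::uint64_t)) return "32/64-bit kernel (avx512_skx target)";
        return "scalar fallback";
    }

    int main()
    {
        // `long` is 4 bytes on Windows and 8 bytes on LP64 Linux/macOS; either
        // way it maps onto an existing fixed-width kernel.
        std::printf("short -> %s\n", kernel_for<short>());
        std::printf("long  -> %s\n", kernel_for<long>());
        std::printf("float -> %s\n", kernel_for<float>());
        return 0;
    }
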
diff --git a/numpy/core/src/npysort/simd_qsort.dispatch.cpp b/numpy/core/src/npysort/simd_qsort.dispatch.cpp
new file mode 100644
index 000000000..101bb3dcc
--- /dev/null
+++ b/numpy/core/src/npysort/simd_qsort.dispatch.cpp
@@ -0,0 +1,44 @@
+/*@targets
+ * $maxopt $keep_baseline avx512_skx
+ */
+// The $keep_baseline policy is used to avoid skipping the build of avx512_skx
+// when it is part of the baseline features (--cpu-baseline), since the
+// 'baseline' option isn't specified within the targets.
+
+#include "simd_qsort.hpp"
+
+#if defined(NPY_HAVE_AVX512_SKX) && !defined(_MSC_VER)
+ #include "x86-simd-sort/src/avx512-32bit-qsort.hpp"
+ #include "x86-simd-sort/src/avx512-64bit-qsort.hpp"
+#endif
+
+namespace np { namespace qsort_simd {
+
+#if defined(NPY_HAVE_AVX512_SKX) && !defined(_MSC_VER)
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int32_t *arr, intptr_t size)
+{
+ avx512_qsort(arr, size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint32_t *arr, intptr_t size)
+{
+ avx512_qsort(arr, size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int64_t *arr, intptr_t size)
+{
+ avx512_qsort(arr, size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint64_t *arr, intptr_t size)
+{
+ avx512_qsort(arr, size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(float *arr, intptr_t size)
+{
+ avx512_qsort(arr, size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size)
+{
+ avx512_qsort(arr, size);
+}
+#endif // NPY_HAVE_AVX512_SKX
+
+}} // namespace np::qsort_simd
diff --git a/numpy/core/src/npysort/simd_qsort.hpp b/numpy/core/src/npysort/simd_qsort.hpp
new file mode 100644
index 000000000..7cdee774d
--- /dev/null
+++ b/numpy/core/src/npysort/simd_qsort.hpp
@@ -0,0 +1,19 @@
+#ifndef NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP
+#define NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP
+
+#include "common.hpp"
+
+namespace np { namespace qsort_simd {
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "simd_qsort.dispatch.h"
+#endif
+NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSort, (T *arr, intptr_t size))
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "simd_qsort_16bit.dispatch.h"
+#endif
+NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSort, (T *arr, intptr_t size))
+
+} } // np::qsort_simd
+#endif // NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP
diff --git a/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp b/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp
new file mode 100644
index 000000000..a6465a883
--- /dev/null
+++ b/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp
@@ -0,0 +1,31 @@
+/*@targets
+ * $maxopt $keep_baseline avx512_icl
+ */
+// The $keep_baseline policy is used to avoid skipping the build of avx512_skx
+// when it is part of the baseline features (--cpu-baseline), since the
+// 'baseline' option isn't specified within the targets.
+
+#include "simd_qsort.hpp"
+
+#if defined(NPY_HAVE_AVX512_ICL) && !defined(_MSC_VER)
+ #include "x86-simd-sort/src/avx512-16bit-qsort.hpp"
+#endif
+
+namespace np { namespace qsort_simd {
+
+#if defined(NPY_HAVE_AVX512_ICL) && !defined(_MSC_VER)
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(Half *arr, intptr_t size)
+{
+ avx512_qsort_fp16(reinterpret_cast<uint16_t*>(arr), size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint16_t *arr, intptr_t size)
+{
+ avx512_qsort(arr, size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int16_t *arr, intptr_t size)
+{
+ avx512_qsort(arr, size);
+}
+#endif // NPY_HAVE_AVX512_ICL
+
+}} // namespace np::qsort_simd
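
A standalone sketch (simplified Half stand-in, not from this patch) of the assumption behind the Half specialization above: np::Half is exactly 16 bits wide, which is what makes the reinterpret_cast to uint16_t* before the fp16 sort valid.

    #include <cstdint>

    // Simplified stand-in for np::Half; the real class also stores a uint16_t.
    struct Half { std::uint16_t bits; };

    static_assert(sizeof(Half) == sizeof(std::uint16_t),
                  "Half must be exactly 16 bits for the bit-pattern cast to hold");

    // Placeholder for the fp16 sorter; only the cast is the point here.
    void sort_half_bits(std::uint16_t * /*arr*/, std::intptr_t /*size*/) {}

    void sort_half(Half *arr, std::intptr_t size)
    {
        sort_half_bits(reinterpret_cast<std::uint16_t *>(arr), size);
    }

    int main() { return 0; }
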
diff --git a/numpy/core/src/npysort/x86-qsort.dispatch.cpp b/numpy/core/src/npysort/x86-qsort.dispatch.cpp
deleted file mode 100644
index 8e88cc667..000000000
--- a/numpy/core/src/npysort/x86-qsort.dispatch.cpp
+++ /dev/null
@@ -1,835 +0,0 @@
-/*@targets
- * $maxopt $keep_baseline avx512_skx
- */
-// policy $keep_baseline is used to avoid skip building avx512_skx
-// when its part of baseline features (--cpu-baseline), since
-// 'baseline' option isn't specified within targets.
-
-#include "x86-qsort.h"
-#define NPY_NO_DEPRECATED_API NPY_API_VERSION
-
-#ifdef NPY_HAVE_AVX512_SKX
-#include "numpy/npy_math.h"
-
-#include "npy_sort.h"
-#include "numpy_tag.h"
-
-#include "simd/simd.h"
-#include <immintrin.h>
-
-template <typename Tag, typename type>
-NPY_NO_EXPORT int
-heapsort_(type *start, npy_intp n);
-
-/*
- * Quicksort using AVX-512 for int, uint32 and float. The ideas and code are
- * based on these two research papers:
- * (1) Fast and Robust Vectorized In-Place Sorting of Primitive Types
- * https://drops.dagstuhl.de/opus/volltexte/2021/13775/
- * (2) A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel
- * Skylake https://arxiv.org/pdf/1704.08579.pdf
- *
- * High level idea: Vectorize the quicksort partitioning using AVX-512
- * compressstore instructions. The algorithm to pick the pivot is to use median
- * of 72 elements picked at random. If the array size is < 128, then use
- * Bitonic sorting network. Good resource for bitonic sorting network:
- * http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030
- *
- * Refer to https://github.com/numpy/numpy/pull/20133#issuecomment-958110340
- * for potential problems when converting this code to universal intrinsics
- * framework.
- */
-
-/*
- * Constants used in sorting 16 elements in a ZMM registers. Based on Bitonic
- * sorting network (see
- * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg)
- */
-#define NETWORK1 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
-#define NETWORK2 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
-#define NETWORK3 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
-#define NETWORK4 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2
-#define NETWORK5 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-#define NETWORK6 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4
-#define NETWORK7 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
-#define ZMM_MAX_FLOAT _mm512_set1_ps(NPY_INFINITYF)
-#define ZMM_MAX_UINT _mm512_set1_epi32(NPY_MAX_UINT32)
-#define ZMM_MAX_INT _mm512_set1_epi32(NPY_MAX_INT32)
-#define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d
-#define SHUFFLE_ps(ZMM, MASK) _mm512_shuffle_ps(zmm, zmm, MASK)
-#define SHUFFLE_epi32(ZMM, MASK) _mm512_shuffle_epi32(zmm, MASK)
-
-#define MAX(x, y) (((x) > (y)) ? (x) : (y))
-#define MIN(x, y) (((x) < (y)) ? (x) : (y))
-
-/*
- * Vectorized random number generator xoroshiro128+. Broken into 2 parts:
- * (1) vnext generates 2 64-bit random integers
- * (2) rnd_epu32 converts this to 4 32-bit random integers and bounds it to
- * the length of the array
- */
-#define VROTL(x, k) /* rotate each uint64_t value in vector */ \
- _mm256_or_si256(_mm256_slli_epi64((x), (k)), \
- _mm256_srli_epi64((x), 64 - (k)))
-
-static inline __m256i
-vnext(__m256i *s0, __m256i *s1)
-{
- *s1 = _mm256_xor_si256(*s0, *s1); /* modify vectors s1 and s0 */
- *s0 = _mm256_xor_si256(_mm256_xor_si256(VROTL(*s0, 24), *s1),
- _mm256_slli_epi64(*s1, 16));
- *s1 = VROTL(*s1, 37);
- return _mm256_add_epi64(*s0, *s1); /* return random vector */
-}
-
-/* transform random numbers to the range between 0 and bound - 1 */
-static inline __m256i
-rnd_epu32(__m256i rnd_vec, __m256i bound)
-{
- __m256i even = _mm256_srli_epi64(_mm256_mul_epu32(rnd_vec, bound), 32);
- __m256i odd = _mm256_mul_epu32(_mm256_srli_epi64(rnd_vec, 32), bound);
- return _mm256_blend_epi32(odd, even, 0b01010101);
-}
-
-template <typename type>
-struct vector;
-
-template <>
-struct vector<npy_int> {
- using tag = npy::int_tag;
- using type_t = npy_int;
- using zmm_t = __m512i;
- using ymm_t = __m256i;
-
- static type_t type_max() { return NPY_MAX_INT32; }
- static type_t type_min() { return NPY_MIN_INT32; }
- static zmm_t zmm_max() { return _mm512_set1_epi32(type_max()); }
-
- static __mmask16 ge(zmm_t x, zmm_t y)
- {
- return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_NLT);
- }
- template <int scale>
- static ymm_t i64gather(__m512i index, void const *base)
- {
- return _mm512_i64gather_epi32(index, base, scale);
- }
- static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); }
- static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epi32(x, y); }
- static void mask_compressstoreu(void *mem, __mmask16 mask, zmm_t x)
- {
- return _mm512_mask_compressstoreu_epi32(mem, mask, x);
- }
- static zmm_t mask_loadu(zmm_t x, __mmask16 mask, void const *mem)
- {
- return _mm512_mask_loadu_epi32(x, mask, mem);
- }
- static zmm_t mask_mov(zmm_t x, __mmask16 mask, zmm_t y)
- {
- return _mm512_mask_mov_epi32(x, mask, y);
- }
- static void mask_storeu(void *mem, __mmask16 mask, zmm_t x)
- {
- return _mm512_mask_storeu_epi32(mem, mask, x);
- }
- static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epi32(x, y); }
- static zmm_t permutexvar(__m512i idx, zmm_t zmm)
- {
- return _mm512_permutexvar_epi32(idx, zmm);
- }
- static type_t reducemax(zmm_t v) { return npyv_reduce_max_s32(v); }
- static type_t reducemin(zmm_t v) { return npyv_reduce_min_s32(v); }
- static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); }
- template<__mmask16 mask>
- static zmm_t shuffle(zmm_t zmm)
- {
- return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask);
- }
- static void storeu(void *mem, zmm_t x)
- {
- return _mm512_storeu_si512(mem, x);
- }
-
- static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_epi32(x, y); }
- static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epi32(x, y); }
-};
-template <>
-struct vector<npy_uint> {
- using tag = npy::uint_tag;
- using type_t = npy_uint;
- using zmm_t = __m512i;
- using ymm_t = __m256i;
-
- static type_t type_max() { return NPY_MAX_UINT32; }
- static type_t type_min() { return 0; }
- static zmm_t zmm_max() { return _mm512_set1_epi32(type_max()); }
-
- template<int scale>
- static ymm_t i64gather(__m512i index, void const *base)
- {
- return _mm512_i64gather_epi32(index, base, scale);
- }
- static __mmask16 ge(zmm_t x, zmm_t y)
- {
- return _mm512_cmp_epu32_mask(x, y, _MM_CMPINT_NLT);
- }
- static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); }
- static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epu32(x, y); }
- static void mask_compressstoreu(void *mem, __mmask16 mask, zmm_t x)
- {
- return _mm512_mask_compressstoreu_epi32(mem, mask, x);
- }
- static zmm_t mask_loadu(zmm_t x, __mmask16 mask, void const *mem)
- {
- return _mm512_mask_loadu_epi32(x, mask, mem);
- }
- static zmm_t mask_mov(zmm_t x, __mmask16 mask, zmm_t y)
- {
- return _mm512_mask_mov_epi32(x, mask, y);
- }
- static void mask_storeu(void *mem, __mmask16 mask, zmm_t x)
- {
- return _mm512_mask_storeu_epi32(mem, mask, x);
- }
- static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epu32(x, y); }
- static zmm_t permutexvar(__m512i idx, zmm_t zmm)
- {
- return _mm512_permutexvar_epi32(idx, zmm);
- }
- static type_t reducemax(zmm_t v) { return npyv_reduce_max_u32(v); }
- static type_t reducemin(zmm_t v) { return npyv_reduce_min_u32(v); }
- static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); }
- template<__mmask16 mask>
- static zmm_t shuffle(zmm_t zmm)
- {
- return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask);
- }
- static void storeu(void *mem, zmm_t x)
- {
- return _mm512_storeu_si512(mem, x);
- }
-
- static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_epu32(x, y); }
- static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epu32(x, y); }
-};
-template <>
-struct vector<npy_float> {
- using tag = npy::float_tag;
- using type_t = npy_float;
- using zmm_t = __m512;
- using ymm_t = __m256;
-
- static type_t type_max() { return NPY_INFINITYF; }
- static type_t type_min() { return -NPY_INFINITYF; }
- static zmm_t zmm_max() { return _mm512_set1_ps(type_max()); }
-
- static __mmask16 ge(zmm_t x, zmm_t y)
- {
- return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ);
- }
- template<int scale>
- static ymm_t i64gather(__m512i index, void const *base)
- {
- return _mm512_i64gather_ps(index, base, scale);
- }
- static zmm_t loadu(void const *mem) { return _mm512_loadu_ps(mem); }
- static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_ps(x, y); }
- static void mask_compressstoreu(void *mem, __mmask16 mask, zmm_t x)
- {
- return _mm512_mask_compressstoreu_ps(mem, mask, x);
- }
- static zmm_t mask_loadu(zmm_t x, __mmask16 mask, void const *mem)
- {
- return _mm512_mask_loadu_ps(x, mask, mem);
- }
- static zmm_t mask_mov(zmm_t x, __mmask16 mask, zmm_t y)
- {
- return _mm512_mask_mov_ps(x, mask, y);
- }
- static void mask_storeu(void *mem, __mmask16 mask, zmm_t x)
- {
- return _mm512_mask_storeu_ps(mem, mask, x);
- }
- static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_ps(x, y); }
- static zmm_t permutexvar(__m512i idx, zmm_t zmm)
- {
- return _mm512_permutexvar_ps(idx, zmm);
- }
- static type_t reducemax(zmm_t v) { return npyv_reduce_max_f32(v); }
- static type_t reducemin(zmm_t v) { return npyv_reduce_min_f32(v); }
- static zmm_t set1(type_t v) { return _mm512_set1_ps(v); }
- template<__mmask16 mask>
- static zmm_t shuffle(zmm_t zmm)
- {
- return _mm512_shuffle_ps(zmm, zmm, (_MM_PERM_ENUM)mask);
- }
- static void storeu(void *mem, zmm_t x) { return _mm512_storeu_ps(mem, x); }
-
- static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_ps(x, y); }
- static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_ps(x, y); }
-};
-
-/*
- * COEX == Compare and Exchange two registers by swapping min and max values
- */
-template <typename vtype, typename mm_t>
-void
-COEX(mm_t &a, mm_t &b)
-{
- mm_t temp = a;
- a = vtype::min(a, b);
- b = vtype::max(temp, b);
-}
-
-template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static inline zmm_t
-cmp_merge(zmm_t in1, zmm_t in2, __mmask16 mask)
-{
- zmm_t min = vtype::min(in2, in1);
- zmm_t max = vtype::max(in2, in1);
- return vtype::mask_mov(min, mask, max); // 0 -> min, 1 -> max
-}
-
-/*
- * Assumes zmm is random and performs a full sorting network defined in
- * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
- */
-template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static inline zmm_t
-sort_zmm(zmm_t zmm)
-{
- zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm),
- 0xAAAA);
- zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(0, 1, 2, 3)>(zmm),
- 0xCCCC);
- zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm),
- 0xAAAA);
- zmm = cmp_merge<vtype>(
- zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK3), zmm), 0xF0F0);
- zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm),
- 0xCCCC);
- zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm),
- 0xAAAA);
- zmm = cmp_merge<vtype>(
- zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm), 0xFF00);
- zmm = cmp_merge<vtype>(
- zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK6), zmm), 0xF0F0);
- zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm),
- 0xCCCC);
- zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm),
- 0xAAAA);
- return zmm;
-}
-
-// Assumes zmm is bitonic and performs a recursive half cleaner
-template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static inline zmm_t
-bitonic_merge_zmm(zmm_t zmm)
-{
- // 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc ..
- zmm = cmp_merge<vtype>(
- zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK7), zmm), 0xFF00);
- // 2) half_cleaner[8]: compare 1-5, 2-6, 3-7 etc ..
- zmm = cmp_merge<vtype>(
- zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK6), zmm), 0xF0F0);
- // 3) half_cleaner[4]
- zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm),
- 0xCCCC);
- // 3) half_cleaner[1]
- zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm),
- 0xAAAA);
- return zmm;
-}
-
-// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner
-template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static inline void
-bitonic_merge_two_zmm(zmm_t *zmm1, zmm_t *zmm2)
-{
- // 1) First step of a merging network: coex of zmm1 and zmm2 reversed
- *zmm2 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), *zmm2);
- zmm_t zmm3 = vtype::min(*zmm1, *zmm2);
- zmm_t zmm4 = vtype::max(*zmm1, *zmm2);
- // 2) Recursive half cleaner for each
- *zmm1 = bitonic_merge_zmm<vtype>(zmm3);
- *zmm2 = bitonic_merge_zmm<vtype>(zmm4);
-}
-
-// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive
-// half cleaner
-template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static inline void
-bitonic_merge_four_zmm(zmm_t *zmm)
-{
- zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[2]);
- zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[3]);
- zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r);
- zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r);
- zmm_t zmm_t3 = vtype::permutexvar(_mm512_set_epi32(NETWORK5),
- vtype::max(zmm[1], zmm2r));
- zmm_t zmm_t4 = vtype::permutexvar(_mm512_set_epi32(NETWORK5),
- vtype::max(zmm[0], zmm3r));
- zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2);
- zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2);
- zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4);
- zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4);
- zmm[0] = bitonic_merge_zmm<vtype>(zmm0);
- zmm[1] = bitonic_merge_zmm<vtype>(zmm1);
- zmm[2] = bitonic_merge_zmm<vtype>(zmm2);
- zmm[3] = bitonic_merge_zmm<vtype>(zmm3);
-}
-
-template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static inline void
-bitonic_merge_eight_zmm(zmm_t *zmm)
-{
- zmm_t zmm4r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[4]);
- zmm_t zmm5r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[5]);
- zmm_t zmm6r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[6]);
- zmm_t zmm7r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[7]);
- zmm_t zmm_t1 = vtype::min(zmm[0], zmm7r);
- zmm_t zmm_t2 = vtype::min(zmm[1], zmm6r);
- zmm_t zmm_t3 = vtype::min(zmm[2], zmm5r);
- zmm_t zmm_t4 = vtype::min(zmm[3], zmm4r);
- zmm_t zmm_t5 = vtype::permutexvar(_mm512_set_epi32(NETWORK5),
- vtype::max(zmm[3], zmm4r));
- zmm_t zmm_t6 = vtype::permutexvar(_mm512_set_epi32(NETWORK5),
- vtype::max(zmm[2], zmm5r));
- zmm_t zmm_t7 = vtype::permutexvar(_mm512_set_epi32(NETWORK5),
- vtype::max(zmm[1], zmm6r));
- zmm_t zmm_t8 = vtype::permutexvar(_mm512_set_epi32(NETWORK5),
- vtype::max(zmm[0], zmm7r));
- COEX<vtype>(zmm_t1, zmm_t3);
- COEX<vtype>(zmm_t2, zmm_t4);
- COEX<vtype>(zmm_t5, zmm_t7);
- COEX<vtype>(zmm_t6, zmm_t8);
- COEX<vtype>(zmm_t1, zmm_t2);
- COEX<vtype>(zmm_t3, zmm_t4);
- COEX<vtype>(zmm_t5, zmm_t6);
- COEX<vtype>(zmm_t7, zmm_t8);
- zmm[0] = bitonic_merge_zmm<vtype>(zmm_t1);
- zmm[1] = bitonic_merge_zmm<vtype>(zmm_t2);
- zmm[2] = bitonic_merge_zmm<vtype>(zmm_t3);
- zmm[3] = bitonic_merge_zmm<vtype>(zmm_t4);
- zmm[4] = bitonic_merge_zmm<vtype>(zmm_t5);
- zmm[5] = bitonic_merge_zmm<vtype>(zmm_t6);
- zmm[6] = bitonic_merge_zmm<vtype>(zmm_t7);
- zmm[7] = bitonic_merge_zmm<vtype>(zmm_t8);
-}
-
-template <typename vtype, typename type_t>
-static inline void
-sort_16(type_t *arr, npy_int N)
-{
- __mmask16 load_mask = (0x0001 << N) - 0x0001;
- typename vtype::zmm_t zmm =
- vtype::mask_loadu(vtype::zmm_max(), load_mask, arr);
- vtype::mask_storeu(arr, load_mask, sort_zmm<vtype>(zmm));
-}
-
-template <typename vtype, typename type_t>
-static inline void
-sort_32(type_t *arr, npy_int N)
-{
- if (N <= 16) {
- sort_16<vtype>(arr, N);
- return;
- }
- using zmm_t = typename vtype::zmm_t;
- zmm_t zmm1 = vtype::loadu(arr);
- __mmask16 load_mask = (0x0001 << (N - 16)) - 0x0001;
- zmm_t zmm2 = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr + 16);
- zmm1 = sort_zmm<vtype>(zmm1);
- zmm2 = sort_zmm<vtype>(zmm2);
- bitonic_merge_two_zmm<vtype>(&zmm1, &zmm2);
- vtype::storeu(arr, zmm1);
- vtype::mask_storeu(arr + 16, load_mask, zmm2);
-}
-
-template <typename vtype, typename type_t>
-static inline void
-sort_64(type_t *arr, npy_int N)
-{
- if (N <= 32) {
- sort_32<vtype>(arr, N);
- return;
- }
- using zmm_t = typename vtype::zmm_t;
- zmm_t zmm[4];
- zmm[0] = vtype::loadu(arr);
- zmm[1] = vtype::loadu(arr + 16);
- __mmask16 load_mask1 = 0xFFFF, load_mask2 = 0xFFFF;
- if (N < 48) {
- load_mask1 = (0x0001 << (N - 32)) - 0x0001;
- load_mask2 = 0x0000;
- }
- else if (N < 64) {
- load_mask2 = (0x0001 << (N - 48)) - 0x0001;
- }
- zmm[2] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 32);
- zmm[3] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 48);
- zmm[0] = sort_zmm<vtype>(zmm[0]);
- zmm[1] = sort_zmm<vtype>(zmm[1]);
- zmm[2] = sort_zmm<vtype>(zmm[2]);
- zmm[3] = sort_zmm<vtype>(zmm[3]);
- bitonic_merge_two_zmm<vtype>(&zmm[0], &zmm[1]);
- bitonic_merge_two_zmm<vtype>(&zmm[2], &zmm[3]);
- bitonic_merge_four_zmm<vtype>(zmm);
- vtype::storeu(arr, zmm[0]);
- vtype::storeu(arr + 16, zmm[1]);
- vtype::mask_storeu(arr + 32, load_mask1, zmm[2]);
- vtype::mask_storeu(arr + 48, load_mask2, zmm[3]);
-}
-
-template <typename vtype, typename type_t>
-static inline void
-sort_128(type_t *arr, npy_int N)
-{
- if (N <= 64) {
- sort_64<vtype>(arr, N);
- return;
- }
- using zmm_t = typename vtype::zmm_t;
- zmm_t zmm[8];
- zmm[0] = vtype::loadu(arr);
- zmm[1] = vtype::loadu(arr + 16);
- zmm[2] = vtype::loadu(arr + 32);
- zmm[3] = vtype::loadu(arr + 48);
- zmm[0] = sort_zmm<vtype>(zmm[0]);
- zmm[1] = sort_zmm<vtype>(zmm[1]);
- zmm[2] = sort_zmm<vtype>(zmm[2]);
- zmm[3] = sort_zmm<vtype>(zmm[3]);
- __mmask16 load_mask1 = 0xFFFF, load_mask2 = 0xFFFF;
- __mmask16 load_mask3 = 0xFFFF, load_mask4 = 0xFFFF;
- if (N < 80) {
- load_mask1 = (0x0001 << (N - 64)) - 0x0001;
- load_mask2 = 0x0000;
- load_mask3 = 0x0000;
- load_mask4 = 0x0000;
- }
- else if (N < 96) {
- load_mask2 = (0x0001 << (N - 80)) - 0x0001;
- load_mask3 = 0x0000;
- load_mask4 = 0x0000;
- }
- else if (N < 112) {
- load_mask3 = (0x0001 << (N - 96)) - 0x0001;
- load_mask4 = 0x0000;
- }
- else {
- load_mask4 = (0x0001 << (N - 112)) - 0x0001;
- }
- zmm[4] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 64);
- zmm[5] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 80);
- zmm[6] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 96);
- zmm[7] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 112);
- zmm[4] = sort_zmm<vtype>(zmm[4]);
- zmm[5] = sort_zmm<vtype>(zmm[5]);
- zmm[6] = sort_zmm<vtype>(zmm[6]);
- zmm[7] = sort_zmm<vtype>(zmm[7]);
- bitonic_merge_two_zmm<vtype>(&zmm[0], &zmm[1]);
- bitonic_merge_two_zmm<vtype>(&zmm[2], &zmm[3]);
- bitonic_merge_two_zmm<vtype>(&zmm[4], &zmm[5]);
- bitonic_merge_two_zmm<vtype>(&zmm[6], &zmm[7]);
- bitonic_merge_four_zmm<vtype>(zmm);
- bitonic_merge_four_zmm<vtype>(zmm + 4);
- bitonic_merge_eight_zmm<vtype>(zmm);
- vtype::storeu(arr, zmm[0]);
- vtype::storeu(arr + 16, zmm[1]);
- vtype::storeu(arr + 32, zmm[2]);
- vtype::storeu(arr + 48, zmm[3]);
- vtype::mask_storeu(arr + 64, load_mask1, zmm[4]);
- vtype::mask_storeu(arr + 80, load_mask2, zmm[5]);
- vtype::mask_storeu(arr + 96, load_mask3, zmm[6]);
- vtype::mask_storeu(arr + 112, load_mask4, zmm[7]);
-}
-
-template <typename type_t>
-static inline void
-swap(type_t *arr, npy_intp ii, npy_intp jj)
-{
- type_t temp = arr[ii];
- arr[ii] = arr[jj];
- arr[jj] = temp;
-}
-
-// Median of 3 strategy
-// template<typename type_t>
-// static inline
-// npy_intp get_pivot_index(type_t *arr, const npy_intp left, const npy_intp
-// right) {
-// return (rand() % (right + 1 - left)) + left;
-// //npy_intp middle = ((right-left)/2) + left;
-// //type_t a = arr[left], b = arr[middle], c = arr[right];
-// //if ((b >= a && b <= c) || (b <= a && b >= c))
-// // return middle;
-// //if ((a >= b && a <= c) || (a <= b && a >= c))
-// // return left;
-// //else
-// // return right;
-//}
-
-/*
- * Picking the pivot: Median of 72 array elements chosen at random.
- */
-
-template <typename vtype, typename type_t>
-static inline type_t
-get_pivot(type_t *arr, const npy_intp left, const npy_intp right)
-{
- /* seeds for vectorized random number generator */
- __m256i s0 = _mm256_setr_epi64x(8265987198341093849, 3762817312854612374,
- 1324281658759788278, 6214952190349879213);
- __m256i s1 = _mm256_setr_epi64x(2874178529384792648, 1257248936691237653,
- 7874578921548791257, 1998265912745817298);
- s0 = _mm256_add_epi64(s0, _mm256_set1_epi64x(left));
- s1 = _mm256_sub_epi64(s1, _mm256_set1_epi64x(right));
-
- npy_intp arrsize = right - left + 1;
- __m256i bound =
- _mm256_set1_epi32(arrsize > INT32_MAX ? INT32_MAX : arrsize);
- __m512i left_vec = _mm512_set1_epi64(left);
- __m512i right_vec = _mm512_set1_epi64(right);
- using ymm_t = typename vtype::ymm_t;
- ymm_t v[9];
- /* fill 9 vectors with random numbers */
- for (npy_int i = 0; i < 9; ++i) {
- __m256i rand_64 = vnext(&s0, &s1); /* vector with 4 random uint64_t */
- __m512i rand_32 = _mm512_cvtepi32_epi64(rnd_epu32(
- rand_64, bound)); /* random numbers between 0 and bound - 1 */
- __m512i indices;
- if (i < 5)
- indices =
- _mm512_add_epi64(left_vec, rand_32); /* indices for arr */
- else
- indices =
- _mm512_sub_epi64(right_vec, rand_32); /* indices for arr */
-
- v[i] = vtype::template i64gather<sizeof(type_t)>(indices, arr);
- }
-
- /* median network for 9 elements */
- COEX<vtype>(v[0], v[1]);
- COEX<vtype>(v[2], v[3]);
- COEX<vtype>(v[4], v[5]);
- COEX<vtype>(v[6], v[7]);
- COEX<vtype>(v[0], v[2]);
- COEX<vtype>(v[1], v[3]);
- COEX<vtype>(v[4], v[6]);
- COEX<vtype>(v[5], v[7]);
- COEX<vtype>(v[0], v[4]);
- COEX<vtype>(v[1], v[2]);
- COEX<vtype>(v[5], v[6]);
- COEX<vtype>(v[3], v[7]);
- COEX<vtype>(v[1], v[5]);
- COEX<vtype>(v[2], v[6]);
- COEX<vtype>(v[3], v[5]);
- COEX<vtype>(v[2], v[4]);
- COEX<vtype>(v[3], v[4]);
- COEX<vtype>(v[3], v[8]);
- COEX<vtype>(v[4], v[8]);
-
- // technically v[4] needs to be sorted before we pick the correct median,
- // picking the 4th element works just as well for performance
- type_t *temp = (type_t *)&v[4];
-
- return temp[4];
-}
-
-/*
- * Partition one ZMM register based on the pivot and returns the index of the
- * last element that is less than equal to the pivot.
- */
-template <typename vtype, typename type_t, typename zmm_t>
-static inline npy_int
-partition_vec(type_t *arr, npy_intp left, npy_intp right, const zmm_t curr_vec,
- const zmm_t pivot_vec, zmm_t *smallest_vec, zmm_t *biggest_vec)
-{
- /* which elements are larger than the pivot */
- __mmask16 gt_mask = vtype::ge(curr_vec, pivot_vec);
- npy_int amount_gt_pivot = _mm_popcnt_u32((npy_int)gt_mask);
- vtype::mask_compressstoreu(arr + left, _mm512_knot(gt_mask), curr_vec);
- vtype::mask_compressstoreu(arr + right - amount_gt_pivot, gt_mask,
- curr_vec);
- *smallest_vec = vtype::min(curr_vec, *smallest_vec);
- *biggest_vec = vtype::max(curr_vec, *biggest_vec);
- return amount_gt_pivot;
-}
-
-/*
- * Partition an array based on the pivot and returns the index of the
- * last element that is less than equal to the pivot.
- */
-template <typename vtype, typename type_t>
-static inline npy_intp
-partition_avx512(type_t *arr, npy_intp left, npy_intp right, type_t pivot,
- type_t *smallest, type_t *biggest)
-{
- /* make array length divisible by 16 , shortening the array */
- for (npy_int i = (right - left) % 16; i > 0; --i) {
- *smallest = MIN(*smallest, arr[left]);
- *biggest = MAX(*biggest, arr[left]);
- if (arr[left] > pivot) {
- swap(arr, left, --right);
- }
- else {
- ++left;
- }
- }
-
- if (left == right)
- return left; /* less than 16 elements in the array */
-
- using zmm_t = typename vtype::zmm_t;
- zmm_t pivot_vec = vtype::set1(pivot);
- zmm_t min_vec = vtype::set1(*smallest);
- zmm_t max_vec = vtype::set1(*biggest);
-
- if (right - left == 16) {
- zmm_t vec = vtype::loadu(arr + left);
- npy_int amount_gt_pivot = partition_vec<vtype>(
- arr, left, left + 16, vec, pivot_vec, &min_vec, &max_vec);
- *smallest = vtype::reducemin(min_vec);
- *biggest = vtype::reducemax(max_vec);
- return left + (16 - amount_gt_pivot);
- }
-
- // first and last 16 values are partitioned at the end
- zmm_t vec_left = vtype::loadu(arr + left);
- zmm_t vec_right = vtype::loadu(arr + (right - 16));
- // store points of the vectors
- npy_intp r_store = right - 16;
- npy_intp l_store = left;
- // indices for loading the elements
- left += 16;
- right -= 16;
- while (right - left != 0) {
- zmm_t curr_vec;
- /*
- * if fewer elements are stored on the right side of the array,
- * then next elements are loaded from the right side,
- * otherwise from the left side
- */
- if ((r_store + 16) - right < left - l_store) {
- right -= 16;
- curr_vec = vtype::loadu(arr + right);
- }
- else {
- curr_vec = vtype::loadu(arr + left);
- left += 16;
- }
- // partition the current vector and save it on both sides of the array
- npy_int amount_gt_pivot =
- partition_vec<vtype>(arr, l_store, r_store + 16, curr_vec,
- pivot_vec, &min_vec, &max_vec);
- ;
- r_store -= amount_gt_pivot;
- l_store += (16 - amount_gt_pivot);
- }
-
- /* partition and save vec_left and vec_right */
- npy_int amount_gt_pivot =
- partition_vec<vtype>(arr, l_store, r_store + 16, vec_left,
- pivot_vec, &min_vec, &max_vec);
- l_store += (16 - amount_gt_pivot);
- amount_gt_pivot =
- partition_vec<vtype>(arr, l_store, l_store + 16, vec_right,
- pivot_vec, &min_vec, &max_vec);
- l_store += (16 - amount_gt_pivot);
- *smallest = vtype::reducemin(min_vec);
- *biggest = vtype::reducemax(max_vec);
- return l_store;
-}
-
-template <typename vtype, typename type_t>
-static inline void
-qsort_(type_t *arr, npy_intp left, npy_intp right, npy_int max_iters)
-{
- /*
- * Resort to heapsort if quicksort isn't making any progress
- */
- if (max_iters <= 0) {
- heapsort_<typename vtype::tag>(arr + left, right + 1 - left);
- return;
- }
- /*
- * Base case: use bitonic networks to sort arrays <= 128
- */
- if (right + 1 - left <= 128) {
- sort_128<vtype>(arr + left, (npy_int)(right + 1 - left));
- return;
- }
-
- type_t pivot = get_pivot<vtype>(arr, left, right);
- type_t smallest = vtype::type_max();
- type_t biggest = vtype::type_min();
- npy_intp pivot_index = partition_avx512<vtype>(arr, left, right + 1, pivot,
- &smallest, &biggest);
- if (pivot != smallest)
- qsort_<vtype>(arr, left, pivot_index - 1, max_iters - 1);
- if (pivot != biggest)
- qsort_<vtype>(arr, pivot_index, right, max_iters - 1);
-}
-
-static inline npy_intp
-replace_nan_with_inf(npy_float *arr, npy_intp arrsize)
-{
- npy_intp nan_count = 0;
- __mmask16 loadmask = 0xFFFF;
- while (arrsize > 0) {
- if (arrsize < 16) {
- loadmask = (0x0001 << arrsize) - 0x0001;
- }
- __m512 in_zmm = _mm512_maskz_loadu_ps(loadmask, arr);
- __mmask16 nanmask = _mm512_cmp_ps_mask(in_zmm, in_zmm, _CMP_NEQ_UQ);
- nan_count += _mm_popcnt_u32((npy_int)nanmask);
- _mm512_mask_storeu_ps(arr, nanmask, ZMM_MAX_FLOAT);
- arr += 16;
- arrsize -= 16;
- }
- return nan_count;
-}
-
-static inline void
-replace_inf_with_nan(npy_float *arr, npy_intp arrsize, npy_intp nan_count)
-{
- for (npy_intp ii = arrsize - 1; nan_count > 0; --ii) {
- arr[ii] = NPY_NANF;
- nan_count -= 1;
- }
-}
-
-/***************************************
- * C > C++ dispatch
- ***************************************/
-
-NPY_NO_EXPORT void
-NPY_CPU_DISPATCH_CURFX(x86_quicksort_int)(void *arr, npy_intp arrsize)
-{
- if (arrsize > 1) {
- qsort_<vector<npy_int>, npy_int>((npy_int *)arr, 0, arrsize - 1,
- 2 * (npy_int)log2(arrsize));
- }
-}
-
-NPY_NO_EXPORT void
-NPY_CPU_DISPATCH_CURFX(x86_quicksort_uint)(void *arr, npy_intp arrsize)
-{
- if (arrsize > 1) {
- qsort_<vector<npy_uint>, npy_uint>((npy_uint *)arr, 0, arrsize - 1,
- 2 * (npy_int)log2(arrsize));
- }
-}
-
-NPY_NO_EXPORT void
-NPY_CPU_DISPATCH_CURFX(x86_quicksort_float)(void *arr, npy_intp arrsize)
-{
- if (arrsize > 1) {
- npy_intp nan_count = replace_nan_with_inf((npy_float *)arr, arrsize);
- qsort_<vector<npy_float>, npy_float>((npy_float *)arr, 0, arrsize - 1,
- 2 * (npy_int)log2(arrsize));
- replace_inf_with_nan((npy_float *)arr, arrsize, nan_count);
- }
-}
-
-#endif // NPY_HAVE_AVX512_SKX
diff --git a/numpy/core/src/npysort/x86-qsort.h b/numpy/core/src/npysort/x86-qsort.h
deleted file mode 100644
index 6340e2bc7..000000000
--- a/numpy/core/src/npysort/x86-qsort.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#include "numpy/npy_common.h"
-
-#include "npy_cpu_dispatch.h"
-
-#ifndef NPY_NO_EXPORT
-#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN
-#endif
-
-#ifndef NPY_DISABLE_OPTIMIZATION
-#include "x86-qsort.dispatch.h"
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_int,
- (void *start, npy_intp num))
-
-NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_uint,
- (void *start, npy_intp num))
-
-NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_float,
- (void *start, npy_intp num))
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/numpy/core/src/npysort/x86-simd-sort b/numpy/core/src/npysort/x86-simd-sort
new file mode 160000
+Subproject 7d7591cf5927e83e4a1e7c4b6f2c4dc91a97889
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index a7b72d54f..3f3fab54d 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -9858,39 +9858,48 @@ class TestViewDtype:
# Test various array sizes that hit different code paths in quicksort-avx512
-@pytest.mark.parametrize("N", [8, 16, 24, 32, 48, 64, 96, 128, 151, 191,
- 256, 383, 512, 1023, 2047])
-def test_sort_float(N):
+@pytest.mark.parametrize("N", np.arange(1, 512))
+@pytest.mark.parametrize("dtype", ['e', 'f', 'd'])
+def test_sort_float(N, dtype):
# Regular data with nan sprinkled
np.random.seed(42)
- arr = -0.5 + np.random.sample(N).astype('f')
+ arr = -0.5 + np.random.sample(N).astype(dtype)
arr[np.random.choice(arr.shape[0], 3)] = np.nan
assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap'))
# (2) with +INF
- infarr = np.inf*np.ones(N, dtype='f')
+ infarr = np.inf*np.ones(N, dtype=dtype)
infarr[np.random.choice(infarr.shape[0], 5)] = -1.0
assert_equal(np.sort(infarr, kind='quick'), np.sort(infarr, kind='heap'))
# (3) with -INF
- neginfarr = -np.inf*np.ones(N, dtype='f')
+ neginfarr = -np.inf*np.ones(N, dtype=dtype)
neginfarr[np.random.choice(neginfarr.shape[0], 5)] = 1.0
assert_equal(np.sort(neginfarr, kind='quick'),
np.sort(neginfarr, kind='heap'))
# (4) with +/-INF
- infarr = np.inf*np.ones(N, dtype='f')
+ infarr = np.inf*np.ones(N, dtype=dtype)
infarr[np.random.choice(infarr.shape[0], (int)(N/2))] = -np.inf
assert_equal(np.sort(infarr, kind='quick'), np.sort(infarr, kind='heap'))
-
-def test_sort_int():
- # Random data with NPY_MAX_INT32 and NPY_MIN_INT32 sprinkled
- rng = np.random.default_rng(42)
- N = 2047
- minv = np.iinfo(np.int32).min
- maxv = np.iinfo(np.int32).max
- arr = rng.integers(low=minv, high=maxv, size=N).astype('int32')
+def test_sort_float16():
+ arr = np.arange(65536, dtype=np.int16)
+ temp = np.frombuffer(arr.tobytes(), dtype=np.float16)
+ data = np.copy(temp)
+ np.random.shuffle(data)
+ data_backup = data
+ assert_equal(np.sort(data, kind='quick'),
+ np.sort(data_backup, kind='heap'))
+
+
+@pytest.mark.parametrize("N", np.arange(1, 512))
+@pytest.mark.parametrize("dtype", ['h', 'H', 'i', 'I', 'l', 'L'])
+def test_sort_int(N, dtype):
+ # Random data with MAX and MIN sprinkled
+ minv = np.iinfo(dtype).min
+ maxv = np.iinfo(dtype).max
+ arr = np.random.randint(low=minv, high=maxv-1, size=N, dtype=dtype)
arr[np.random.choice(arr.shape[0], 10)] = minv
arr[np.random.choice(arr.shape[0], 10)] = maxv
assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap'))