From 49278b961b7254bc6a4aee478587c69682a3827e Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 19 Sep 2022 10:35:03 -0700 Subject: ENH: Add x86-simd-sort source files --- .../x86-simd-sort/src/avx512-16bit-qsort.hpp | 527 +++++++++++++ .../x86-simd-sort/src/avx512-32bit-qsort.hpp | 712 ++++++++++++++++++ .../x86-simd-sort/src/avx512-64bit-qsort.hpp | 820 +++++++++++++++++++++ .../x86-simd-sort/src/avx512-common-qsort.h | 218 ++++++ 4 files changed, 2277 insertions(+) create mode 100644 numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp create mode 100644 numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp create mode 100644 numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp create mode 100644 numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp new file mode 100644 index 000000000..1673eb5da --- /dev/null +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp @@ -0,0 +1,527 @@ +/******************************************************************* + * Copyright (C) 2022 Intel Corporation + * SPDX-License-Identifier: BSD-3-Clause + * Authors: Raghuveer Devulapalli + * ****************************************************************/ + +#ifndef __AVX512_QSORT_16BIT__ +#define __AVX512_QSORT_16BIT__ + +#include "avx512-common-qsort.h" + +/* + * Constants used in sorting 32 elements in a ZMM registers. Based on Bitonic + * sorting network (see + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) + */ +// ZMM register: 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +#define NETWORK_16BIT_1 \ + 24, 25, 26, 27, 28, 29, 30, 31, 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, \ + 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +#define NETWORK_16BIT_2 \ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, \ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +#define NETWORK_16BIT_3 \ + 27, 26, 25, 24, 31, 30, 29, 28, 19, 18, 17, 16, 23, 22, 21, 20, 11, 10, 9, \ + 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4 +#define NETWORK_16BIT_4 \ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, \ + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 +#define NETWORK_16BIT_5 \ + 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24, 7, 6, 5, \ + 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 +#define NETWORK_16BIT_6 \ + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, \ + 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 + +template <> +struct vector { + using type_t = int16_t; + using zmm_t = __m512i; + using ymm_t = __m256i; + using opmask_t = __mmask32; + static const uint8_t numlanes = 32; + + static type_t type_max() + { + return X86_SIMD_SORT_MAX_INT16; + } + static type_t type_min() + { + return X86_SIMD_SORT_MIN_INT16; + } + static zmm_t zmm_max() + { + return _mm512_set1_epi16(type_max()); + } + + static opmask_t knot_opmask(opmask_t x) + { + return _knot_mask32(x); + } + static opmask_t ge(zmm_t x, zmm_t y) + { + return _mm512_cmp_epi16_mask(x, y, _MM_CMPINT_NLT); + } + //template + //static zmm_t i64gather(__m512i index, void const *base) + //{ + // return _mm512_i64gather_epi64(index, base, scale); + //} + static zmm_t loadu(void const *mem) + { + return _mm512_loadu_si512(mem); + } + static zmm_t max(zmm_t x, zmm_t y) + { + return _mm512_max_epi16(x, y); + } + static void 
mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) + { + // AVX512_VBMI2 + return _mm512_mask_compressstoreu_epi16(mem, mask, x); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) + { + // AVX512BW + return _mm512_mask_loadu_epi16(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) + { + return _mm512_mask_mov_epi16(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_storeu_epi16(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) + { + return _mm512_min_epi16(x, y); + } + static zmm_t permutexvar(__m512i idx, zmm_t zmm) + { + return _mm512_permutexvar_epi16(idx, zmm); + } + static type_t reducemax(zmm_t v) + { + zmm_t lo = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 0)); + zmm_t hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 1)); + type_t lo_max = (type_t)_mm512_reduce_max_epi32(lo); + type_t hi_max = (type_t)_mm512_reduce_max_epi32(hi); + return std::max(lo_max, hi_max); + } + static type_t reducemin(zmm_t v) + { + zmm_t lo = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 0)); + zmm_t hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 1)); + type_t lo_min = (type_t)_mm512_reduce_min_epi32(lo); + type_t hi_min = (type_t)_mm512_reduce_min_epi32(hi); + return std::min(lo_min, hi_min); + } + static zmm_t set1(type_t v) + { + return _mm512_set1_epi16(v); + } + template + static zmm_t shuffle(zmm_t zmm) + { + zmm = _mm512_shufflehi_epi16(zmm, (_MM_PERM_ENUM)mask); + return _mm512_shufflelo_epi16(zmm, (_MM_PERM_ENUM)mask); + } + static void storeu(void *mem, zmm_t x) + { + return _mm512_storeu_si512(mem, x); + } +}; +template <> +struct vector { + using type_t = uint16_t; + using zmm_t = __m512i; + using ymm_t = __m256i; + using opmask_t = __mmask32; + static const uint8_t numlanes = 32; + + static type_t type_max() + { + return X86_SIMD_SORT_MAX_UINT16; + } + static type_t type_min() + { + return 0; + } + static zmm_t zmm_max() + { + return _mm512_set1_epi16(type_max()); + } // TODO: this should broadcast bits as is? 
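    /*
     * Annotation, not part of the original sources: these static wrappers
     * form the type-trait layer that the generic sorting-network templates
     * dispatch through. As an illustration, the per-lane compare-exchange
     * helper cmp_merge() in avx512-common-qsort.h is composed of exactly
     * three of them:
     *
     *   __m512i lo = vector<uint16_t>::min(a, b);  // per-lane smaller value
     *   __m512i hi = vector<uint16_t>::max(a, b);  // per-lane larger value
     *   // keep lo where the mask bit is 0, take hi where it is 1
     *   __m512i r  = vector<uint16_t>::mask_mov(lo, mask, hi);
     */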
+ + //template + //static zmm_t i64gather(__m512i index, void const *base) + //{ + // return _mm512_i64gather_epi64(index, base, scale); + //} + static opmask_t knot_opmask(opmask_t x) + { + return _knot_mask32(x); + } + static opmask_t ge(zmm_t x, zmm_t y) + { + return _mm512_cmp_epu16_mask(x, y, _MM_CMPINT_NLT); + } + static zmm_t loadu(void const *mem) + { + return _mm512_loadu_si512(mem); + } + static zmm_t max(zmm_t x, zmm_t y) + { + return _mm512_max_epu16(x, y); + } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_compressstoreu_epi16(mem, mask, x); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) + { + return _mm512_mask_loadu_epi16(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) + { + return _mm512_mask_mov_epi16(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_storeu_epi16(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) + { + return _mm512_min_epu16(x, y); + } + static zmm_t permutexvar(__m512i idx, zmm_t zmm) + { + return _mm512_permutexvar_epi16(idx, zmm); + } + static type_t reducemax(zmm_t v) + { + zmm_t lo = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 0)); + zmm_t hi = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 1)); + type_t lo_max = (type_t)_mm512_reduce_max_epi32(lo); + type_t hi_max = (type_t)_mm512_reduce_max_epi32(hi); + return std::max(lo_max, hi_max); + } + static type_t reducemin(zmm_t v) + { + zmm_t lo = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 0)); + zmm_t hi = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 1)); + type_t lo_min = (type_t)_mm512_reduce_min_epi32(lo); + type_t hi_min = (type_t)_mm512_reduce_min_epi32(hi); + return std::min(lo_min, hi_min); + } + static zmm_t set1(type_t v) + { + return _mm512_set1_epi16(v); + } + template + static zmm_t shuffle(zmm_t zmm) + { + zmm = _mm512_shufflehi_epi16(zmm, (_MM_PERM_ENUM)mask); + return _mm512_shufflelo_epi16(zmm, (_MM_PERM_ENUM)mask); + } + static void storeu(void *mem, zmm_t x) + { + return _mm512_storeu_si512(mem, x); + } +}; + +/* + * Assumes zmm is random and performs a full sorting network defined in + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg + */ +template +static inline zmm_t sort_zmm_16bit(zmm_t zmm) +{ + // Level 1 + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xAAAAAAAA); + // Level 2 + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xCCCCCCCC); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xAAAAAAAA); + // Level 3 + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_1), zmm), + 0xF0F0F0F0); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xCCCCCCCC); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xAAAAAAAA); + // Level 4 + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_2), zmm), + 0xFF00FF00); + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_3), zmm), + 0xF0F0F0F0); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xCCCCCCCC); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xAAAAAAAA); + // Level 5 + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), zmm), + 0xFFFF0000); + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_5), zmm), + 0xFF00FF00); + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_3), zmm), + 0xF0F0F0F0); + zmm = 
cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xCCCCCCCC); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xAAAAAAAA); + return zmm; +} + +// Assumes zmm is bitonic and performs a recursive half cleaner +template +static inline zmm_t bitonic_merge_zmm_16bit(zmm_t zmm) +{ + // 1) half_cleaner[32]: compare 1-17, 2-18, 3-19 etc .. + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_6), zmm), + 0xFFFF0000); + // 2) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc .. + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_5), zmm), + 0xFF00FF00); + // 3) half_cleaner[8] + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_3), zmm), + 0xF0F0F0F0); + // 3) half_cleaner[4] + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xCCCCCCCC); + // 3) half_cleaner[2] + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xAAAAAAAA); + return zmm; +} + +// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner +template +static inline void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2) +{ + // 1) First step of a merging network: coex of zmm1 and zmm2 reversed + zmm2 = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), zmm2); + zmm_t zmm3 = vtype::min(zmm1, zmm2); + zmm_t zmm4 = vtype::max(zmm1, zmm2); + // 2) Recursive half cleaner for each + zmm1 = bitonic_merge_zmm_16bit(zmm3); + zmm2 = bitonic_merge_zmm_16bit(zmm4); +} + +// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive +// half cleaner +template +static inline void bitonic_merge_four_zmm_16bit(zmm_t *zmm) +{ + zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), zmm[2]); + zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), zmm[3]); + zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r); + zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r); + zmm_t zmm_t3 = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), + vtype::max(zmm[1], zmm2r)); + zmm_t zmm_t4 = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), + vtype::max(zmm[0], zmm3r)); + zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2); + zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2); + zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4); + zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4); + zmm[0] = bitonic_merge_zmm_16bit(zmm0); + zmm[1] = bitonic_merge_zmm_16bit(zmm1); + zmm[2] = bitonic_merge_zmm_16bit(zmm2); + zmm[3] = bitonic_merge_zmm_16bit(zmm3); +} + +template +static inline void sort_32_16bit(type_t *arr, int32_t N) +{ + typename vtype::opmask_t load_mask = ((0x1ull << N) - 0x1ull) & 0xFFFFFFFF; + typename vtype::zmm_t zmm + = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr); + vtype::mask_storeu(arr, load_mask, sort_zmm_16bit(zmm)); +} + +template +static inline void sort_64_16bit(type_t *arr, int32_t N) +{ + if (N <= 32) { + sort_32_16bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + typename vtype::opmask_t load_mask + = ((0x1ull << (N - 32)) - 0x1ull) & 0xFFFFFFFF; + zmm_t zmm1 = vtype::loadu(arr); + zmm_t zmm2 = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr + 32); + zmm1 = sort_zmm_16bit(zmm1); + zmm2 = sort_zmm_16bit(zmm2); + bitonic_merge_two_zmm_16bit(zmm1, zmm2); + vtype::storeu(arr, zmm1); + vtype::mask_storeu(arr + 32, load_mask, zmm2); +} + +template +static inline void sort_128_16bit(type_t *arr, int32_t N) +{ + if (N <= 64) { + sort_64_16bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + using opmask_t = typename vtype::opmask_t; + zmm_t zmm[4]; + zmm[0] = vtype::loadu(arr); + zmm[1] = vtype::loadu(arr + 
32); + opmask_t load_mask1 = 0xFFFFFFFF, load_mask2 = 0xFFFFFFFF; + if (N != 128) { + uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull; + load_mask1 = combined_mask & 0xFFFFFFFF; + load_mask2 = (combined_mask >> 32) & 0xFFFFFFFF; + } + zmm[2] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 64); + zmm[3] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 96); + zmm[0] = sort_zmm_16bit(zmm[0]); + zmm[1] = sort_zmm_16bit(zmm[1]); + zmm[2] = sort_zmm_16bit(zmm[2]); + zmm[3] = sort_zmm_16bit(zmm[3]); + bitonic_merge_two_zmm_16bit(zmm[0], zmm[1]); + bitonic_merge_two_zmm_16bit(zmm[2], zmm[3]); + bitonic_merge_four_zmm_16bit(zmm); + vtype::storeu(arr, zmm[0]); + vtype::storeu(arr + 32, zmm[1]); + vtype::mask_storeu(arr + 64, load_mask1, zmm[2]); + vtype::mask_storeu(arr + 96, load_mask2, zmm[3]); +} + +template +static inline type_t +get_pivot_16bit(type_t *arr, const int64_t left, const int64_t right) +{ + // median of 32 + int64_t size = (right - left) / 32; + __m512i rand_vec = _mm512_set_epi16(arr[left], + arr[left + size], + arr[left + 2 * size], + arr[left + 3 * size], + arr[left + 4 * size], + arr[left + 5 * size], + arr[left + 6 * size], + arr[left + 7 * size], + arr[left + 8 * size], + arr[left + 9 * size], + arr[left + 10 * size], + arr[left + 11 * size], + arr[left + 12 * size], + arr[left + 13 * size], + arr[left + 14 * size], + arr[left + 15 * size], + arr[left + 16 * size], + arr[left + 17 * size], + arr[left + 18 * size], + arr[left + 19 * size], + arr[left + 20 * size], + arr[left + 21 * size], + arr[left + 22 * size], + arr[left + 23 * size], + arr[left + 24 * size], + arr[left + 25 * size], + arr[left + 26 * size], + arr[left + 27 * size], + arr[left + 28 * size], + arr[left + 29 * size], + arr[left + 30 * size], + arr[left + 31 * size]); + __m512i sort = sort_zmm_16bit(rand_vec); + return ((type_t *)&sort)[16]; +} + +template +static inline void +qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) +{ + /* + * Resort to std::sort if quicksort isnt making any progress + */ + if (max_iters <= 0) { + std::sort(arr + left, arr + right + 1); + return; + } + /* + * Base case: use bitonic networks to sort arrays <= 128 + */ + if (right + 1 - left <= 128) { + sort_128_16bit(arr + left, (int32_t)(right + 1 - left)); + return; + } + + type_t pivot = get_pivot_16bit(arr, left, right); + type_t smallest = vtype::type_max(); + type_t biggest = vtype::type_min(); + int64_t pivot_index = partition_avx512( + arr, left, right + 1, pivot, &smallest, &biggest); + if (pivot != smallest) + qsort_16bit_(arr, left, pivot_index - 1, max_iters - 1); + if (pivot != biggest) + qsort_16bit_(arr, pivot_index, right, max_iters - 1); +} + +template <> +void avx512_qsort(int16_t *arr, int64_t arrsize) +{ + if (arrsize > 1) { + qsort_16bit_, int16_t>( + arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qsort(uint16_t *arr, int64_t arrsize) +{ + if (arrsize > 1) { + qsort_16bit_, uint16_t>( + arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} +#endif // __AVX512_QSORT_16BIT__ diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp new file mode 100644 index 000000000..cbc5368f0 --- /dev/null +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp @@ -0,0 +1,712 @@ +/******************************************************************* + * Copyright (C) 2022 Intel Corporation + * Copyright (C) 2021 Serge Sans Paille + * 
SPDX-License-Identifier: BSD-3-Clause + * Authors: Raghuveer Devulapalli + * Serge Sans Paille + * ****************************************************************/ +#ifndef __AVX512_QSORT_32BIT__ +#define __AVX512_QSORT_32BIT__ + +#include "avx512-common-qsort.h" + +/* + * Constants used in sorting 16 elements in a ZMM registers. Based on Bitonic + * sorting network (see + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) + */ +#define NETWORK_32BIT_1 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 +#define NETWORK_32BIT_2 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 +#define NETWORK_32BIT_3 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +#define NETWORK_32BIT_4 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 +#define NETWORK_32BIT_5 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +#define NETWORK_32BIT_6 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4 +#define NETWORK_32BIT_7 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 + +template <> +struct vector { + using type_t = int32_t; + using zmm_t = __m512i; + using ymm_t = __m256i; + using opmask_t = __mmask16; + static const uint8_t numlanes = 16; + + static type_t type_max() + { + return X86_SIMD_SORT_MAX_INT32; + } + static type_t type_min() + { + return X86_SIMD_SORT_MIN_INT32; + } + static zmm_t zmm_max() + { + return _mm512_set1_epi32(type_max()); + } + + static opmask_t knot_opmask(opmask_t x) + { + return _knot_mask16(x); + } + static opmask_t ge(zmm_t x, zmm_t y) + { + return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_NLT); + } + template + static ymm_t i64gather(__m512i index, void const *base) + { + return _mm512_i64gather_epi32(index, base, scale); + } + static zmm_t merge(ymm_t y1, ymm_t y2) + { + zmm_t z1 = _mm512_castsi256_si512(y1); + return _mm512_inserti32x8(z1, y2, 1); + } + static zmm_t loadu(void const *mem) + { + return _mm512_loadu_si512(mem); + } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_compressstoreu_epi32(mem, mask, x); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) + { + return _mm512_mask_loadu_epi32(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) + { + return _mm512_mask_mov_epi32(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_storeu_epi32(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) + { + return _mm512_min_epi32(x, y); + } + static zmm_t max(zmm_t x, zmm_t y) + { + return _mm512_max_epi32(x, y); + } + static zmm_t permutexvar(__m512i idx, zmm_t zmm) + { + return _mm512_permutexvar_epi32(idx, zmm); + } + static type_t reducemax(zmm_t v) + { + return _mm512_reduce_max_epi32(v); + } + static type_t reducemin(zmm_t v) + { + return _mm512_reduce_min_epi32(v); + } + static zmm_t set1(type_t v) + { + return _mm512_set1_epi32(v); + } + template + static zmm_t shuffle(zmm_t zmm) + { + return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask); + } + static void storeu(void *mem, zmm_t x) + { + return _mm512_storeu_si512(mem, x); + } + + static ymm_t max(ymm_t x, ymm_t y) + { + return _mm256_max_epi32(x, y); + } + static ymm_t min(ymm_t x, ymm_t y) + { + return _mm256_min_epi32(x, y); + } +}; +template <> +struct vector { + using type_t = uint32_t; + using zmm_t = __m512i; + using ymm_t = __m256i; + using opmask_t = __mmask16; + static const uint8_t numlanes = 16; + + static type_t type_max() + { + return X86_SIMD_SORT_MAX_UINT32; + } + static type_t type_min() + { + return 0; + } + static zmm_t 
zmm_max() + { + return _mm512_set1_epi32(type_max()); + } // TODO: this should broadcast bits as is? + + template + static ymm_t i64gather(__m512i index, void const *base) + { + return _mm512_i64gather_epi32(index, base, scale); + } + static zmm_t merge(ymm_t y1, ymm_t y2) + { + zmm_t z1 = _mm512_castsi256_si512(y1); + return _mm512_inserti32x8(z1, y2, 1); + } + static opmask_t knot_opmask(opmask_t x) + { + return _knot_mask16(x); + } + static opmask_t ge(zmm_t x, zmm_t y) + { + return _mm512_cmp_epu32_mask(x, y, _MM_CMPINT_NLT); + } + static zmm_t loadu(void const *mem) + { + return _mm512_loadu_si512(mem); + } + static zmm_t max(zmm_t x, zmm_t y) + { + return _mm512_max_epu32(x, y); + } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_compressstoreu_epi32(mem, mask, x); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) + { + return _mm512_mask_loadu_epi32(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) + { + return _mm512_mask_mov_epi32(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_storeu_epi32(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) + { + return _mm512_min_epu32(x, y); + } + static zmm_t permutexvar(__m512i idx, zmm_t zmm) + { + return _mm512_permutexvar_epi32(idx, zmm); + } + static type_t reducemax(zmm_t v) + { + return _mm512_reduce_max_epu32(v); + } + static type_t reducemin(zmm_t v) + { + return _mm512_reduce_min_epu32(v); + } + static zmm_t set1(type_t v) + { + return _mm512_set1_epi32(v); + } + template + static zmm_t shuffle(zmm_t zmm) + { + return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask); + } + static void storeu(void *mem, zmm_t x) + { + return _mm512_storeu_si512(mem, x); + } + + static ymm_t max(ymm_t x, ymm_t y) + { + return _mm256_max_epu32(x, y); + } + static ymm_t min(ymm_t x, ymm_t y) + { + return _mm256_min_epu32(x, y); + } +}; +template <> +struct vector { + using type_t = float; + using zmm_t = __m512; + using ymm_t = __m256; + using opmask_t = __mmask16; + static const uint8_t numlanes = 16; + + static type_t type_max() + { + return X86_SIMD_SORT_INFINITYF; + } + static type_t type_min() + { + return -X86_SIMD_SORT_INFINITYF; + } + static zmm_t zmm_max() + { + return _mm512_set1_ps(type_max()); + } + + static opmask_t knot_opmask(opmask_t x) + { + return _knot_mask16(x); + } + static opmask_t ge(zmm_t x, zmm_t y) + { + return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); + } + template + static ymm_t i64gather(__m512i index, void const *base) + { + return _mm512_i64gather_ps(index, base, scale); + } + static zmm_t merge(ymm_t y1, ymm_t y2) + { + zmm_t z1 = _mm512_castsi512_ps( + _mm512_castsi256_si512(_mm256_castps_si256(y1))); + return _mm512_insertf32x8(z1, y2, 1); + } + static zmm_t loadu(void const *mem) + { + return _mm512_loadu_ps(mem); + } + static zmm_t max(zmm_t x, zmm_t y) + { + return _mm512_max_ps(x, y); + } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_compressstoreu_ps(mem, mask, x); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) + { + return _mm512_mask_loadu_ps(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) + { + return _mm512_mask_mov_ps(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_storeu_ps(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) + { + return _mm512_min_ps(x, y); + } + static zmm_t permutexvar(__m512i idx, zmm_t 
zmm) + { + return _mm512_permutexvar_ps(idx, zmm); + } + static type_t reducemax(zmm_t v) + { + return _mm512_reduce_max_ps(v); + } + static type_t reducemin(zmm_t v) + { + return _mm512_reduce_min_ps(v); + } + static zmm_t set1(type_t v) + { + return _mm512_set1_ps(v); + } + template + static zmm_t shuffle(zmm_t zmm) + { + return _mm512_shuffle_ps(zmm, zmm, (_MM_PERM_ENUM)mask); + } + static void storeu(void *mem, zmm_t x) + { + return _mm512_storeu_ps(mem, x); + } + + static ymm_t max(ymm_t x, ymm_t y) + { + return _mm256_max_ps(x, y); + } + static ymm_t min(ymm_t x, ymm_t y) + { + return _mm256_min_ps(x, y); + } +}; + +/* + * Assumes zmm is random and performs a full sorting network defined in + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg + */ +template +static inline zmm_t sort_zmm_32bit(zmm_t zmm) +{ + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xAAAA); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xCCCC); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xAAAA); + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_3), zmm), + 0xF0F0); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xCCCC); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xAAAA); + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm), + 0xFF00); + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_6), zmm), + 0xF0F0); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xCCCC); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xAAAA); + return zmm; +} + +// Assumes zmm is bitonic and performs a recursive half cleaner +template +static inline zmm_t bitonic_merge_zmm_32bit(zmm_t zmm) +{ + // 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc .. + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_7), zmm), + 0xFF00); + // 2) half_cleaner[8]: compare 1-5, 2-6, 3-7 etc .. 
+ zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_6), zmm), + 0xF0F0); + // 3) half_cleaner[4] + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xCCCC); + // 3) half_cleaner[1] + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xAAAA); + return zmm; +} + +// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner +template +static inline void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2) +{ + // 1) First step of a merging network: coex of zmm1 and zmm2 reversed + *zmm2 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), *zmm2); + zmm_t zmm3 = vtype::min(*zmm1, *zmm2); + zmm_t zmm4 = vtype::max(*zmm1, *zmm2); + // 2) Recursive half cleaner for each + *zmm1 = bitonic_merge_zmm_32bit(zmm3); + *zmm2 = bitonic_merge_zmm_32bit(zmm4); +} + +// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive +// half cleaner +template +static inline void bitonic_merge_four_zmm_32bit(zmm_t *zmm) +{ + zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[2]); + zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[3]); + zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r); + zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r); + zmm_t zmm_t3 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), + vtype::max(zmm[1], zmm2r)); + zmm_t zmm_t4 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), + vtype::max(zmm[0], zmm3r)); + zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2); + zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2); + zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4); + zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4); + zmm[0] = bitonic_merge_zmm_32bit(zmm0); + zmm[1] = bitonic_merge_zmm_32bit(zmm1); + zmm[2] = bitonic_merge_zmm_32bit(zmm2); + zmm[3] = bitonic_merge_zmm_32bit(zmm3); +} + +template +static inline void bitonic_merge_eight_zmm_32bit(zmm_t *zmm) +{ + zmm_t zmm4r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[4]); + zmm_t zmm5r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[5]); + zmm_t zmm6r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[6]); + zmm_t zmm7r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[7]); + zmm_t zmm_t1 = vtype::min(zmm[0], zmm7r); + zmm_t zmm_t2 = vtype::min(zmm[1], zmm6r); + zmm_t zmm_t3 = vtype::min(zmm[2], zmm5r); + zmm_t zmm_t4 = vtype::min(zmm[3], zmm4r); + zmm_t zmm_t5 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), + vtype::max(zmm[3], zmm4r)); + zmm_t zmm_t6 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), + vtype::max(zmm[2], zmm5r)); + zmm_t zmm_t7 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), + vtype::max(zmm[1], zmm6r)); + zmm_t zmm_t8 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), + vtype::max(zmm[0], zmm7r)); + COEX(zmm_t1, zmm_t3); + COEX(zmm_t2, zmm_t4); + COEX(zmm_t5, zmm_t7); + COEX(zmm_t6, zmm_t8); + COEX(zmm_t1, zmm_t2); + COEX(zmm_t3, zmm_t4); + COEX(zmm_t5, zmm_t6); + COEX(zmm_t7, zmm_t8); + zmm[0] = bitonic_merge_zmm_32bit(zmm_t1); + zmm[1] = bitonic_merge_zmm_32bit(zmm_t2); + zmm[2] = bitonic_merge_zmm_32bit(zmm_t3); + zmm[3] = bitonic_merge_zmm_32bit(zmm_t4); + zmm[4] = bitonic_merge_zmm_32bit(zmm_t5); + zmm[5] = bitonic_merge_zmm_32bit(zmm_t6); + zmm[6] = bitonic_merge_zmm_32bit(zmm_t7); + zmm[7] = bitonic_merge_zmm_32bit(zmm_t8); +} + +template +static inline void sort_16_32bit(type_t *arr, int32_t N) +{ + typename vtype::opmask_t load_mask = (0x0001 << N) - 0x0001; + typename vtype::zmm_t zmm + = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr); + vtype::mask_storeu(arr, 
load_mask, sort_zmm_32bit(zmm)); +} + +template +static inline void sort_32_32bit(type_t *arr, int32_t N) +{ + if (N <= 16) { + sort_16_32bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + zmm_t zmm1 = vtype::loadu(arr); + typename vtype::opmask_t load_mask = (0x0001 << (N - 16)) - 0x0001; + zmm_t zmm2 = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr + 16); + zmm1 = sort_zmm_32bit(zmm1); + zmm2 = sort_zmm_32bit(zmm2); + bitonic_merge_two_zmm_32bit(&zmm1, &zmm2); + vtype::storeu(arr, zmm1); + vtype::mask_storeu(arr + 16, load_mask, zmm2); +} + +template +static inline void sort_64_32bit(type_t *arr, int32_t N) +{ + if (N <= 32) { + sort_32_32bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + using opmask_t = typename vtype::opmask_t; + zmm_t zmm[4]; + zmm[0] = vtype::loadu(arr); + zmm[1] = vtype::loadu(arr + 16); + opmask_t load_mask1 = 0xFFFF, load_mask2 = 0xFFFF; + uint64_t combined_mask = (0x1ull << (N - 32)) - 0x1ull; + load_mask1 &= combined_mask & 0xFFFF; + load_mask2 &= (combined_mask >> 16) & 0xFFFF; + zmm[2] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 32); + zmm[3] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 48); + zmm[0] = sort_zmm_32bit(zmm[0]); + zmm[1] = sort_zmm_32bit(zmm[1]); + zmm[2] = sort_zmm_32bit(zmm[2]); + zmm[3] = sort_zmm_32bit(zmm[3]); + bitonic_merge_two_zmm_32bit(&zmm[0], &zmm[1]); + bitonic_merge_two_zmm_32bit(&zmm[2], &zmm[3]); + bitonic_merge_four_zmm_32bit(zmm); + vtype::storeu(arr, zmm[0]); + vtype::storeu(arr + 16, zmm[1]); + vtype::mask_storeu(arr + 32, load_mask1, zmm[2]); + vtype::mask_storeu(arr + 48, load_mask2, zmm[3]); +} + +template +static inline void sort_128_32bit(type_t *arr, int32_t N) +{ + if (N <= 64) { + sort_64_32bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + using opmask_t = typename vtype::opmask_t; + zmm_t zmm[8]; + zmm[0] = vtype::loadu(arr); + zmm[1] = vtype::loadu(arr + 16); + zmm[2] = vtype::loadu(arr + 32); + zmm[3] = vtype::loadu(arr + 48); + zmm[0] = sort_zmm_32bit(zmm[0]); + zmm[1] = sort_zmm_32bit(zmm[1]); + zmm[2] = sort_zmm_32bit(zmm[2]); + zmm[3] = sort_zmm_32bit(zmm[3]); + opmask_t load_mask1 = 0xFFFF, load_mask2 = 0xFFFF; + opmask_t load_mask3 = 0xFFFF, load_mask4 = 0xFFFF; + if (N != 128) { + uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull; + load_mask1 &= combined_mask & 0xFFFF; + load_mask2 &= (combined_mask >> 16) & 0xFFFF; + load_mask3 &= (combined_mask >> 32) & 0xFFFF; + load_mask4 &= (combined_mask >> 48) & 0xFFFF; + } + zmm[4] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 64); + zmm[5] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 80); + zmm[6] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 96); + zmm[7] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 112); + zmm[4] = sort_zmm_32bit(zmm[4]); + zmm[5] = sort_zmm_32bit(zmm[5]); + zmm[6] = sort_zmm_32bit(zmm[6]); + zmm[7] = sort_zmm_32bit(zmm[7]); + bitonic_merge_two_zmm_32bit(&zmm[0], &zmm[1]); + bitonic_merge_two_zmm_32bit(&zmm[2], &zmm[3]); + bitonic_merge_two_zmm_32bit(&zmm[4], &zmm[5]); + bitonic_merge_two_zmm_32bit(&zmm[6], &zmm[7]); + bitonic_merge_four_zmm_32bit(zmm); + bitonic_merge_four_zmm_32bit(zmm + 4); + bitonic_merge_eight_zmm_32bit(zmm); + vtype::storeu(arr, zmm[0]); + vtype::storeu(arr + 16, zmm[1]); + vtype::storeu(arr + 32, zmm[2]); + vtype::storeu(arr + 48, zmm[3]); + vtype::mask_storeu(arr + 64, load_mask1, zmm[4]); + vtype::mask_storeu(arr + 80, load_mask2, zmm[5]); + vtype::mask_storeu(arr + 96, load_mask3, zmm[6]); + 
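    /*
     * Worked example of the tail masking in this function (added annotation):
     * for N == 100, combined_mask = (1ull << (100 - 64)) - 1 = 0xFFFFFFFFF,
     * so load_mask1 = 0xFFFF, load_mask2 = 0xFFFF, load_mask3 = 0x000F and
     * load_mask4 = 0x0000. Lanes that are masked off stay filled with
     * vtype::zmm_max() from the masked loads and are never written back.
     */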
vtype::mask_storeu(arr + 112, load_mask4, zmm[7]); +} + +template +static inline type_t +get_pivot_32bit(type_t *arr, const int64_t left, const int64_t right) +{ + // median of 16 + int64_t size = (right - left) / 16; + using zmm_t = typename vtype::zmm_t; + using ymm_t = typename vtype::ymm_t; + __m512i rand_index1 = _mm512_set_epi64(left + size, + left + 2 * size, + left + 3 * size, + left + 4 * size, + left + 5 * size, + left + 6 * size, + left + 7 * size, + left + 8 * size); + __m512i rand_index2 = _mm512_set_epi64(left + 9 * size, + left + 10 * size, + left + 11 * size, + left + 12 * size, + left + 13 * size, + left + 14 * size, + left + 15 * size, + left + 16 * size); + ymm_t rand_vec1 + = vtype::template i64gather(rand_index1, arr); + ymm_t rand_vec2 + = vtype::template i64gather(rand_index2, arr); + zmm_t rand_vec = vtype::merge(rand_vec1, rand_vec2); + zmm_t sort = sort_zmm_32bit(rand_vec); + // pivot will never be a nan, since there are no nan's! + return ((type_t *)&sort)[8]; +} + +template +static inline void +qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) +{ + /* + * Resort to std::sort if quicksort isnt making any progress + */ + if (max_iters <= 0) { + std::sort(arr + left, arr + right + 1); + return; + } + /* + * Base case: use bitonic networks to sort arrays <= 128 + */ + if (right + 1 - left <= 128) { + sort_128_32bit(arr + left, (int32_t)(right + 1 - left)); + return; + } + + type_t pivot = get_pivot_32bit(arr, left, right); + type_t smallest = vtype::type_max(); + type_t biggest = vtype::type_min(); + int64_t pivot_index = partition_avx512( + arr, left, right + 1, pivot, &smallest, &biggest); + if (pivot != smallest) + qsort_32bit_(arr, left, pivot_index - 1, max_iters - 1); + if (pivot != biggest) + qsort_32bit_(arr, pivot_index, right, max_iters - 1); +} + +static inline int64_t replace_nan_with_inf(float *arr, int64_t arrsize) +{ + int64_t nan_count = 0; + __mmask16 loadmask = 0xFFFF; + while (arrsize > 0) { + if (arrsize < 16) { loadmask = (0x0001 << arrsize) - 0x0001; } + __m512 in_zmm = _mm512_maskz_loadu_ps(loadmask, arr); + __mmask16 nanmask = _mm512_cmp_ps_mask(in_zmm, in_zmm, _CMP_NEQ_UQ); + nan_count += _mm_popcnt_u32((int32_t)nanmask); + _mm512_mask_storeu_ps(arr, nanmask, ZMM_MAX_FLOAT); + arr += 16; + arrsize -= 16; + } + return nan_count; +} + +static inline void +replace_inf_with_nan(float *arr, int64_t arrsize, int64_t nan_count) +{ + for (int64_t ii = arrsize - 1; nan_count > 0; --ii) { + arr[ii] = std::nanf("1"); + nan_count -= 1; + } +} + +template <> +void avx512_qsort(int32_t *arr, int64_t arrsize) +{ + if (arrsize > 1) { + qsort_32bit_, int32_t>( + arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qsort(uint32_t *arr, int64_t arrsize) +{ + if (arrsize > 1) { + qsort_32bit_, uint32_t>( + arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qsort(float *arr, int64_t arrsize) +{ + if (arrsize > 1) { + int64_t nan_count = replace_nan_with_inf(arr, arrsize); + qsort_32bit_, float>( + arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + replace_inf_with_nan(arr, arrsize, nan_count); + } +} + +#endif //__AVX512_QSORT_32BIT__ diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp new file mode 100644 index 000000000..f680c0704 --- /dev/null +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp @@ -0,0 +1,820 @@ 
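/*
 * Annotation (added for illustration, not part of the patch): the float
 * specialization at the end of avx512-32bit-qsort.hpp above handles NaNs by
 * counting them, overwriting them with +infinity, sorting, and then writing
 * NaNs back into the last nan_count slots. A minimal scalar sketch of that
 * idea, with a hypothetical helper name and only the standard library:
 */
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>

static inline void scalar_sort_with_nans_at_end(float *arr, int64_t arrsize)
{
    int64_t nan_count = 0;
    // 1) count NaNs and replace them with +inf so ordinary comparisons work
    for (int64_t i = 0; i < arrsize; ++i) {
        if (std::isnan(arr[i])) {
            arr[i] = std::numeric_limits<float>::infinity();
            ++nan_count;
        }
    }
    // 2) sort; stand-in for the vectorized qsort_32bit_ above
    std::sort(arr, arr + arrsize);
    // 3) the +inf sentinels sorted to the end; turn them back into NaNs
    for (int64_t i = arrsize - 1; nan_count > 0; --i, --nan_count) {
        arr[i] = std::nanf("1");
    }
}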
+/******************************************************************* + * Copyright (C) 2022 Intel Corporation + * SPDX-License-Identifier: BSD-3-Clause + * Authors: Raghuveer Devulapalli + * ****************************************************************/ + +#ifndef __AVX512_QSORT_64BIT__ +#define __AVX512_QSORT_64BIT__ + +#include "avx512-common-qsort.h" + +/* + * Constants used in sorting 8 elements in a ZMM registers. Based on Bitonic + * sorting network (see + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) + */ +// ZMM 7, 6, 5, 4, 3, 2, 1, 0 +#define NETWORK_64BIT_1 4, 5, 6, 7, 0, 1, 2, 3 +#define NETWORK_64BIT_2 0, 1, 2, 3, 4, 5, 6, 7 +#define NETWORK_64BIT_3 5, 4, 7, 6, 1, 0, 3, 2 +#define NETWORK_64BIT_4 3, 2, 1, 0, 7, 6, 5, 4 +static const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); + +template <> +struct vector { + using type_t = int64_t; + using zmm_t = __m512i; + using ymm_t = __m512i; + using opmask_t = __mmask8; + static const uint8_t numlanes = 8; + + static type_t type_max() + { + return X86_SIMD_SORT_MAX_INT64; + } + static type_t type_min() + { + return X86_SIMD_SORT_MIN_INT64; + } + static zmm_t zmm_max() + { + return _mm512_set1_epi64(type_max()); + } // TODO: this should broadcast bits as is? + + static zmm_t set(type_t v1, + type_t v2, + type_t v3, + type_t v4, + type_t v5, + type_t v6, + type_t v7, + type_t v8) + { + return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8); + } + + static opmask_t knot_opmask(opmask_t x) + { + return _knot_mask8(x); + } + static opmask_t ge(zmm_t x, zmm_t y) + { + return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_NLT); + } + template + static zmm_t i64gather(__m512i index, void const *base) + { + return _mm512_i64gather_epi64(index, base, scale); + } + static zmm_t loadu(void const *mem) + { + return _mm512_loadu_si512(mem); + } + static zmm_t max(zmm_t x, zmm_t y) + { + return _mm512_max_epi64(x, y); + } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_compressstoreu_epi64(mem, mask, x); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) + { + return _mm512_mask_loadu_epi64(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) + { + return _mm512_mask_mov_epi64(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_storeu_epi64(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) + { + return _mm512_min_epi64(x, y); + } + static zmm_t permutexvar(__m512i idx, zmm_t zmm) + { + return _mm512_permutexvar_epi64(idx, zmm); + } + static type_t reducemax(zmm_t v) + { + return _mm512_reduce_max_epi64(v); + } + static type_t reducemin(zmm_t v) + { + return _mm512_reduce_min_epi64(v); + } + static zmm_t set1(type_t v) + { + return _mm512_set1_epi64(v); + } + template + static zmm_t shuffle(zmm_t zmm) + { + __m512d temp = _mm512_castsi512_pd(zmm); + return _mm512_castpd_si512( + _mm512_shuffle_pd(temp, temp, (_MM_PERM_ENUM)mask)); + } + static void storeu(void *mem, zmm_t x) + { + return _mm512_storeu_si512(mem, x); + } +}; +template <> +struct vector { + using type_t = uint64_t; + using zmm_t = __m512i; + using ymm_t = __m512i; + using opmask_t = __mmask8; + static const uint8_t numlanes = 8; + + static type_t type_max() + { + return X86_SIMD_SORT_MAX_UINT64; + } + static type_t type_min() + { + return 0; + } + static zmm_t zmm_max() + { + return _mm512_set1_epi64(type_max()); + } + + static zmm_t set(type_t v1, + type_t v2, + type_t v3, + type_t v4, + type_t v5, + type_t 
v6, + type_t v7, + type_t v8) + { + return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8); + } + + template + static zmm_t i64gather(__m512i index, void const *base) + { + return _mm512_i64gather_epi64(index, base, scale); + } + static opmask_t knot_opmask(opmask_t x) + { + return _knot_mask8(x); + } + static opmask_t ge(zmm_t x, zmm_t y) + { + return _mm512_cmp_epu64_mask(x, y, _MM_CMPINT_NLT); + } + static zmm_t loadu(void const *mem) + { + return _mm512_loadu_si512(mem); + } + static zmm_t max(zmm_t x, zmm_t y) + { + return _mm512_max_epu64(x, y); + } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_compressstoreu_epi64(mem, mask, x); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) + { + return _mm512_mask_loadu_epi64(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) + { + return _mm512_mask_mov_epi64(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_storeu_epi64(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) + { + return _mm512_min_epu64(x, y); + } + static zmm_t permutexvar(__m512i idx, zmm_t zmm) + { + return _mm512_permutexvar_epi64(idx, zmm); + } + static type_t reducemax(zmm_t v) + { + return _mm512_reduce_max_epu64(v); + } + static type_t reducemin(zmm_t v) + { + return _mm512_reduce_min_epu64(v); + } + static zmm_t set1(type_t v) + { + return _mm512_set1_epi64(v); + } + template + static zmm_t shuffle(zmm_t zmm) + { + __m512d temp = _mm512_castsi512_pd(zmm); + return _mm512_castpd_si512( + _mm512_shuffle_pd(temp, temp, (_MM_PERM_ENUM)mask)); + } + static void storeu(void *mem, zmm_t x) + { + return _mm512_storeu_si512(mem, x); + } +}; +template <> +struct vector { + using type_t = double; + using zmm_t = __m512d; + using ymm_t = __m512d; + using opmask_t = __mmask8; + static const uint8_t numlanes = 8; + + static type_t type_max() + { + return X86_SIMD_SORT_INFINITY; + } + static type_t type_min() + { + return -X86_SIMD_SORT_INFINITY; + } + static zmm_t zmm_max() + { + return _mm512_set1_pd(type_max()); + } + + static zmm_t set(type_t v1, + type_t v2, + type_t v3, + type_t v4, + type_t v5, + type_t v6, + type_t v7, + type_t v8) + { + return _mm512_set_pd(v1, v2, v3, v4, v5, v6, v7, v8); + } + + static opmask_t knot_opmask(opmask_t x) + { + return _knot_mask8(x); + } + static opmask_t ge(zmm_t x, zmm_t y) + { + return _mm512_cmp_pd_mask(x, y, _CMP_GE_OQ); + } + template + static zmm_t i64gather(__m512i index, void const *base) + { + return _mm512_i64gather_pd(index, base, scale); + } + static zmm_t loadu(void const *mem) + { + return _mm512_loadu_pd(mem); + } + static zmm_t max(zmm_t x, zmm_t y) + { + return _mm512_max_pd(x, y); + } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_compressstoreu_pd(mem, mask, x); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) + { + return _mm512_mask_loadu_pd(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) + { + return _mm512_mask_mov_pd(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_storeu_pd(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) + { + return _mm512_min_pd(x, y); + } + static zmm_t permutexvar(__m512i idx, zmm_t zmm) + { + return _mm512_permutexvar_pd(idx, zmm); + } + static type_t reducemax(zmm_t v) + { + return _mm512_reduce_max_pd(v); + } + static type_t reducemin(zmm_t v) + { + return _mm512_reduce_min_pd(v); + } + 
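    /*
     * Note (added annotation): type_max()/type_min() are +/-infinity for
     * double, so lanes padded with zmm_max() by the masked loads sort to the
     * end of a register, and since avx512_qsort<double>() routes NaNs through
     * replace_nan_with_inf() further down in this file, the ordered
     * _CMP_GE_OQ comparison used in ge() never has to order a NaN.
     */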
static zmm_t set1(type_t v) + { + return _mm512_set1_pd(v); + } + template + static zmm_t shuffle(zmm_t zmm) + { + return _mm512_shuffle_pd(zmm, zmm, (_MM_PERM_ENUM)mask); + } + static void storeu(void *mem, zmm_t x) + { + return _mm512_storeu_pd(mem, x); + } +}; + +/* + * Assumes zmm is random and performs a full sorting network defined in + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg + */ +template +static inline zmm_t sort_zmm_64bit(zmm_t zmm) +{ + zmm = cmp_merge( + zmm, vtype::template shuffle(zmm), 0xAA); + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi64(NETWORK_64BIT_1), zmm), + 0xCC); + zmm = cmp_merge( + zmm, vtype::template shuffle(zmm), 0xAA); + zmm = cmp_merge(zmm, vtype::permutexvar(rev_index, zmm), 0xF0); + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi64(NETWORK_64BIT_3), zmm), + 0xCC); + zmm = cmp_merge( + zmm, vtype::template shuffle(zmm), 0xAA); + return zmm; +} + +// Assumes zmm is bitonic and performs a recursive half cleaner +template +static inline zmm_t bitonic_merge_zmm_64bit(zmm_t zmm) +{ + + // 1) half_cleaner[8]: compare 0-4, 1-5, 2-6, 3-7 + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi64(NETWORK_64BIT_4), zmm), + 0xF0); + // 2) half_cleaner[4] + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi64(NETWORK_64BIT_3), zmm), + 0xCC); + // 3) half_cleaner[1] + zmm = cmp_merge( + zmm, vtype::template shuffle(zmm), 0xAA); + return zmm; +} + +// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner +template +static inline void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2) +{ + // 1) First step of a merging network: coex of zmm1 and zmm2 reversed + zmm2 = vtype::permutexvar(rev_index, zmm2); + zmm_t zmm3 = vtype::min(zmm1, zmm2); + zmm_t zmm4 = vtype::max(zmm1, zmm2); + // 2) Recursive half cleaner for each + zmm1 = bitonic_merge_zmm_64bit(zmm3); + zmm2 = bitonic_merge_zmm_64bit(zmm4); +} + +// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive +// half cleaner +template +static inline void bitonic_merge_four_zmm_64bit(zmm_t *zmm) +{ + // 1) First step of a merging network + zmm_t zmm2r = vtype::permutexvar(rev_index, zmm[2]); + zmm_t zmm3r = vtype::permutexvar(rev_index, zmm[3]); + zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r); + zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r); + // 2) Recursive half clearer: 16 + zmm_t zmm_t3 = vtype::permutexvar(rev_index, vtype::max(zmm[1], zmm2r)); + zmm_t zmm_t4 = vtype::permutexvar(rev_index, vtype::max(zmm[0], zmm3r)); + zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2); + zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2); + zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4); + zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4); + zmm[0] = bitonic_merge_zmm_64bit(zmm0); + zmm[1] = bitonic_merge_zmm_64bit(zmm1); + zmm[2] = bitonic_merge_zmm_64bit(zmm2); + zmm[3] = bitonic_merge_zmm_64bit(zmm3); +} + +template +static inline void bitonic_merge_eight_zmm_64bit(zmm_t *zmm) +{ + zmm_t zmm4r = vtype::permutexvar(rev_index, zmm[4]); + zmm_t zmm5r = vtype::permutexvar(rev_index, zmm[5]); + zmm_t zmm6r = vtype::permutexvar(rev_index, zmm[6]); + zmm_t zmm7r = vtype::permutexvar(rev_index, zmm[7]); + zmm_t zmm_t1 = vtype::min(zmm[0], zmm7r); + zmm_t zmm_t2 = vtype::min(zmm[1], zmm6r); + zmm_t zmm_t3 = vtype::min(zmm[2], zmm5r); + zmm_t zmm_t4 = vtype::min(zmm[3], zmm4r); + zmm_t zmm_t5 = vtype::permutexvar(rev_index, vtype::max(zmm[3], zmm4r)); + zmm_t zmm_t6 = vtype::permutexvar(rev_index, vtype::max(zmm[2], zmm5r)); + zmm_t zmm_t7 = vtype::permutexvar(rev_index, 
vtype::max(zmm[1], zmm6r)); + zmm_t zmm_t8 = vtype::permutexvar(rev_index, vtype::max(zmm[0], zmm7r)); + COEX(zmm_t1, zmm_t3); + COEX(zmm_t2, zmm_t4); + COEX(zmm_t5, zmm_t7); + COEX(zmm_t6, zmm_t8); + COEX(zmm_t1, zmm_t2); + COEX(zmm_t3, zmm_t4); + COEX(zmm_t5, zmm_t6); + COEX(zmm_t7, zmm_t8); + zmm[0] = bitonic_merge_zmm_64bit(zmm_t1); + zmm[1] = bitonic_merge_zmm_64bit(zmm_t2); + zmm[2] = bitonic_merge_zmm_64bit(zmm_t3); + zmm[3] = bitonic_merge_zmm_64bit(zmm_t4); + zmm[4] = bitonic_merge_zmm_64bit(zmm_t5); + zmm[5] = bitonic_merge_zmm_64bit(zmm_t6); + zmm[6] = bitonic_merge_zmm_64bit(zmm_t7); + zmm[7] = bitonic_merge_zmm_64bit(zmm_t8); +} + +template +static inline void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm) +{ + zmm_t zmm8r = vtype::permutexvar(rev_index, zmm[8]); + zmm_t zmm9r = vtype::permutexvar(rev_index, zmm[9]); + zmm_t zmm10r = vtype::permutexvar(rev_index, zmm[10]); + zmm_t zmm11r = vtype::permutexvar(rev_index, zmm[11]); + zmm_t zmm12r = vtype::permutexvar(rev_index, zmm[12]); + zmm_t zmm13r = vtype::permutexvar(rev_index, zmm[13]); + zmm_t zmm14r = vtype::permutexvar(rev_index, zmm[14]); + zmm_t zmm15r = vtype::permutexvar(rev_index, zmm[15]); + zmm_t zmm_t1 = vtype::min(zmm[0], zmm15r); + zmm_t zmm_t2 = vtype::min(zmm[1], zmm14r); + zmm_t zmm_t3 = vtype::min(zmm[2], zmm13r); + zmm_t zmm_t4 = vtype::min(zmm[3], zmm12r); + zmm_t zmm_t5 = vtype::min(zmm[4], zmm11r); + zmm_t zmm_t6 = vtype::min(zmm[5], zmm10r); + zmm_t zmm_t7 = vtype::min(zmm[6], zmm9r); + zmm_t zmm_t8 = vtype::min(zmm[7], zmm8r); + zmm_t zmm_t9 = vtype::permutexvar(rev_index, vtype::max(zmm[7], zmm8r)); + zmm_t zmm_t10 = vtype::permutexvar(rev_index, vtype::max(zmm[6], zmm9r)); + zmm_t zmm_t11 = vtype::permutexvar(rev_index, vtype::max(zmm[5], zmm10r)); + zmm_t zmm_t12 = vtype::permutexvar(rev_index, vtype::max(zmm[4], zmm11r)); + zmm_t zmm_t13 = vtype::permutexvar(rev_index, vtype::max(zmm[3], zmm12r)); + zmm_t zmm_t14 = vtype::permutexvar(rev_index, vtype::max(zmm[2], zmm13r)); + zmm_t zmm_t15 = vtype::permutexvar(rev_index, vtype::max(zmm[1], zmm14r)); + zmm_t zmm_t16 = vtype::permutexvar(rev_index, vtype::max(zmm[0], zmm15r)); + // Recusive half clear 16 zmm regs + COEX(zmm_t1, zmm_t5); + COEX(zmm_t2, zmm_t6); + COEX(zmm_t3, zmm_t7); + COEX(zmm_t4, zmm_t8); + COEX(zmm_t9, zmm_t13); + COEX(zmm_t10, zmm_t14); + COEX(zmm_t11, zmm_t15); + COEX(zmm_t12, zmm_t16); + // + COEX(zmm_t1, zmm_t3); + COEX(zmm_t2, zmm_t4); + COEX(zmm_t5, zmm_t7); + COEX(zmm_t6, zmm_t8); + COEX(zmm_t9, zmm_t11); + COEX(zmm_t10, zmm_t12); + COEX(zmm_t13, zmm_t15); + COEX(zmm_t14, zmm_t16); + // + COEX(zmm_t1, zmm_t2); + COEX(zmm_t3, zmm_t4); + COEX(zmm_t5, zmm_t6); + COEX(zmm_t7, zmm_t8); + COEX(zmm_t9, zmm_t10); + COEX(zmm_t11, zmm_t12); + COEX(zmm_t13, zmm_t14); + COEX(zmm_t15, zmm_t16); + // + zmm[0] = bitonic_merge_zmm_64bit(zmm_t1); + zmm[1] = bitonic_merge_zmm_64bit(zmm_t2); + zmm[2] = bitonic_merge_zmm_64bit(zmm_t3); + zmm[3] = bitonic_merge_zmm_64bit(zmm_t4); + zmm[4] = bitonic_merge_zmm_64bit(zmm_t5); + zmm[5] = bitonic_merge_zmm_64bit(zmm_t6); + zmm[6] = bitonic_merge_zmm_64bit(zmm_t7); + zmm[7] = bitonic_merge_zmm_64bit(zmm_t8); + zmm[8] = bitonic_merge_zmm_64bit(zmm_t9); + zmm[9] = bitonic_merge_zmm_64bit(zmm_t10); + zmm[10] = bitonic_merge_zmm_64bit(zmm_t11); + zmm[11] = bitonic_merge_zmm_64bit(zmm_t12); + zmm[12] = bitonic_merge_zmm_64bit(zmm_t13); + zmm[13] = bitonic_merge_zmm_64bit(zmm_t14); + zmm[14] = bitonic_merge_zmm_64bit(zmm_t15); + zmm[15] = bitonic_merge_zmm_64bit(zmm_t16); +} + +template +static inline void 
sort_8_64bit(type_t *arr, int32_t N) +{ + typename vtype::opmask_t load_mask = (0x01 << N) - 0x01; + typename vtype::zmm_t zmm + = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr); + vtype::mask_storeu(arr, load_mask, sort_zmm_64bit(zmm)); +} + +template +static inline void sort_16_64bit(type_t *arr, int32_t N) +{ + if (N <= 8) { + sort_8_64bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + zmm_t zmm1 = vtype::loadu(arr); + typename vtype::opmask_t load_mask = (0x01 << (N - 8)) - 0x01; + zmm_t zmm2 = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr + 8); + zmm1 = sort_zmm_64bit(zmm1); + zmm2 = sort_zmm_64bit(zmm2); + bitonic_merge_two_zmm_64bit(zmm1, zmm2); + vtype::storeu(arr, zmm1); + vtype::mask_storeu(arr + 8, load_mask, zmm2); +} + +template +static inline void sort_32_64bit(type_t *arr, int32_t N) +{ + if (N <= 16) { + sort_16_64bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + using opmask_t = typename vtype::opmask_t; + zmm_t zmm[4]; + zmm[0] = vtype::loadu(arr); + zmm[1] = vtype::loadu(arr + 8); + opmask_t load_mask1 = 0xFF, load_mask2 = 0xFF; + uint64_t combined_mask = (0x1ull << (N - 16)) - 0x1ull; + load_mask1 = (combined_mask)&0xFF; + load_mask2 = (combined_mask >> 8) & 0xFF; + zmm[2] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 16); + zmm[3] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 24); + zmm[0] = sort_zmm_64bit(zmm[0]); + zmm[1] = sort_zmm_64bit(zmm[1]); + zmm[2] = sort_zmm_64bit(zmm[2]); + zmm[3] = sort_zmm_64bit(zmm[3]); + bitonic_merge_two_zmm_64bit(zmm[0], zmm[1]); + bitonic_merge_two_zmm_64bit(zmm[2], zmm[3]); + bitonic_merge_four_zmm_64bit(zmm); + vtype::storeu(arr, zmm[0]); + vtype::storeu(arr + 8, zmm[1]); + vtype::mask_storeu(arr + 16, load_mask1, zmm[2]); + vtype::mask_storeu(arr + 24, load_mask2, zmm[3]); +} + +template +static inline void sort_64_64bit(type_t *arr, int32_t N) +{ + if (N <= 32) { + sort_32_64bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + using opmask_t = typename vtype::opmask_t; + zmm_t zmm[8]; + zmm[0] = vtype::loadu(arr); + zmm[1] = vtype::loadu(arr + 8); + zmm[2] = vtype::loadu(arr + 16); + zmm[3] = vtype::loadu(arr + 24); + zmm[0] = sort_zmm_64bit(zmm[0]); + zmm[1] = sort_zmm_64bit(zmm[1]); + zmm[2] = sort_zmm_64bit(zmm[2]); + zmm[3] = sort_zmm_64bit(zmm[3]); + opmask_t load_mask1 = 0xFF, load_mask2 = 0xFF; + opmask_t load_mask3 = 0xFF, load_mask4 = 0xFF; + // N-32 >= 1 + uint64_t combined_mask = (0x1ull << (N - 32)) - 0x1ull; + load_mask1 = (combined_mask)&0xFF; + load_mask2 = (combined_mask >> 8) & 0xFF; + load_mask3 = (combined_mask >> 16) & 0xFF; + load_mask4 = (combined_mask >> 24) & 0xFF; + zmm[4] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 32); + zmm[5] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 40); + zmm[6] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 48); + zmm[7] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 56); + zmm[4] = sort_zmm_64bit(zmm[4]); + zmm[5] = sort_zmm_64bit(zmm[5]); + zmm[6] = sort_zmm_64bit(zmm[6]); + zmm[7] = sort_zmm_64bit(zmm[7]); + bitonic_merge_two_zmm_64bit(zmm[0], zmm[1]); + bitonic_merge_two_zmm_64bit(zmm[2], zmm[3]); + bitonic_merge_two_zmm_64bit(zmm[4], zmm[5]); + bitonic_merge_two_zmm_64bit(zmm[6], zmm[7]); + bitonic_merge_four_zmm_64bit(zmm); + bitonic_merge_four_zmm_64bit(zmm + 4); + bitonic_merge_eight_zmm_64bit(zmm); + vtype::storeu(arr, zmm[0]); + vtype::storeu(arr + 8, zmm[1]); + vtype::storeu(arr + 16, zmm[2]); + vtype::storeu(arr + 24, zmm[3]); + 
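    /*
     * Tail-mask example (added annotation): with N == 50, combined_mask =
     * (1ull << (50 - 32)) - 1 = 0x3FFFF, giving load_mask1 = 0xFF,
     * load_mask2 = 0xFF, load_mask3 = 0x03 and load_mask4 = 0x00, so the
     * masked stores below write only elements arr[32..49] and leave anything
     * past N untouched.
     */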
vtype::mask_storeu(arr + 32, load_mask1, zmm[4]); + vtype::mask_storeu(arr + 40, load_mask2, zmm[5]); + vtype::mask_storeu(arr + 48, load_mask3, zmm[6]); + vtype::mask_storeu(arr + 56, load_mask4, zmm[7]); +} + +template +static inline void sort_128_64bit(type_t *arr, int32_t N) +{ + if (N <= 64) { + sort_64_64bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + using opmask_t = typename vtype::opmask_t; + zmm_t zmm[16]; + zmm[0] = vtype::loadu(arr); + zmm[1] = vtype::loadu(arr + 8); + zmm[2] = vtype::loadu(arr + 16); + zmm[3] = vtype::loadu(arr + 24); + zmm[4] = vtype::loadu(arr + 32); + zmm[5] = vtype::loadu(arr + 40); + zmm[6] = vtype::loadu(arr + 48); + zmm[7] = vtype::loadu(arr + 56); + zmm[0] = sort_zmm_64bit(zmm[0]); + zmm[1] = sort_zmm_64bit(zmm[1]); + zmm[2] = sort_zmm_64bit(zmm[2]); + zmm[3] = sort_zmm_64bit(zmm[3]); + zmm[4] = sort_zmm_64bit(zmm[4]); + zmm[5] = sort_zmm_64bit(zmm[5]); + zmm[6] = sort_zmm_64bit(zmm[6]); + zmm[7] = sort_zmm_64bit(zmm[7]); + opmask_t load_mask1 = 0xFF, load_mask2 = 0xFF; + opmask_t load_mask3 = 0xFF, load_mask4 = 0xFF; + opmask_t load_mask5 = 0xFF, load_mask6 = 0xFF; + opmask_t load_mask7 = 0xFF, load_mask8 = 0xFF; + if (N != 128) { + uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull; + load_mask1 = (combined_mask)&0xFF; + load_mask2 = (combined_mask >> 8) & 0xFF; + load_mask3 = (combined_mask >> 16) & 0xFF; + load_mask4 = (combined_mask >> 24) & 0xFF; + load_mask5 = (combined_mask >> 32) & 0xFF; + load_mask6 = (combined_mask >> 40) & 0xFF; + load_mask7 = (combined_mask >> 48) & 0xFF; + load_mask8 = (combined_mask >> 56) & 0xFF; + } + zmm[8] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 64); + zmm[9] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 72); + zmm[10] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 80); + zmm[11] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 88); + zmm[12] = vtype::mask_loadu(vtype::zmm_max(), load_mask5, arr + 96); + zmm[13] = vtype::mask_loadu(vtype::zmm_max(), load_mask6, arr + 104); + zmm[14] = vtype::mask_loadu(vtype::zmm_max(), load_mask7, arr + 112); + zmm[15] = vtype::mask_loadu(vtype::zmm_max(), load_mask8, arr + 120); + zmm[8] = sort_zmm_64bit(zmm[8]); + zmm[9] = sort_zmm_64bit(zmm[9]); + zmm[10] = sort_zmm_64bit(zmm[10]); + zmm[11] = sort_zmm_64bit(zmm[11]); + zmm[12] = sort_zmm_64bit(zmm[12]); + zmm[13] = sort_zmm_64bit(zmm[13]); + zmm[14] = sort_zmm_64bit(zmm[14]); + zmm[15] = sort_zmm_64bit(zmm[15]); + bitonic_merge_two_zmm_64bit(zmm[0], zmm[1]); + bitonic_merge_two_zmm_64bit(zmm[2], zmm[3]); + bitonic_merge_two_zmm_64bit(zmm[4], zmm[5]); + bitonic_merge_two_zmm_64bit(zmm[6], zmm[7]); + bitonic_merge_two_zmm_64bit(zmm[8], zmm[9]); + bitonic_merge_two_zmm_64bit(zmm[10], zmm[11]); + bitonic_merge_two_zmm_64bit(zmm[12], zmm[13]); + bitonic_merge_two_zmm_64bit(zmm[14], zmm[15]); + bitonic_merge_four_zmm_64bit(zmm); + bitonic_merge_four_zmm_64bit(zmm + 4); + bitonic_merge_four_zmm_64bit(zmm + 8); + bitonic_merge_four_zmm_64bit(zmm + 12); + bitonic_merge_eight_zmm_64bit(zmm); + bitonic_merge_eight_zmm_64bit(zmm + 8); + bitonic_merge_sixteen_zmm_64bit(zmm); + vtype::storeu(arr, zmm[0]); + vtype::storeu(arr + 8, zmm[1]); + vtype::storeu(arr + 16, zmm[2]); + vtype::storeu(arr + 24, zmm[3]); + vtype::storeu(arr + 32, zmm[4]); + vtype::storeu(arr + 40, zmm[5]); + vtype::storeu(arr + 48, zmm[6]); + vtype::storeu(arr + 56, zmm[7]); + vtype::mask_storeu(arr + 64, load_mask1, zmm[8]); + vtype::mask_storeu(arr + 72, load_mask2, zmm[9]); + vtype::mask_storeu(arr + 80, 
load_mask3, zmm[10]); + vtype::mask_storeu(arr + 88, load_mask4, zmm[11]); + vtype::mask_storeu(arr + 96, load_mask5, zmm[12]); + vtype::mask_storeu(arr + 104, load_mask6, zmm[13]); + vtype::mask_storeu(arr + 112, load_mask7, zmm[14]); + vtype::mask_storeu(arr + 120, load_mask8, zmm[15]); +} + +template +static inline type_t +get_pivot_64bit(type_t *arr, const int64_t left, const int64_t right) +{ + // median of 8 + int64_t size = (right - left) / 8; + using zmm_t = typename vtype::zmm_t; + __m512i rand_index = _mm512_set_epi64(left + size, + left + 2 * size, + left + 3 * size, + left + 4 * size, + left + 5 * size, + left + 6 * size, + left + 7 * size, + left + 8 * size); + zmm_t rand_vec = vtype::template i64gather(rand_index, arr); + // pivot will never be a nan, since there are no nan's! + zmm_t sort = sort_zmm_64bit(rand_vec); + return ((type_t *)&sort)[4]; +} + +template +static inline void +qsort_64bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) +{ + /* + * Resort to std::sort if quicksort isnt making any progress + */ + if (max_iters <= 0) { + std::sort(arr + left, arr + right + 1); + return; + } + /* + * Base case: use bitonic networks to sort arrays <= 128 + */ + if (right + 1 - left <= 128) { + sort_128_64bit(arr + left, (int32_t)(right + 1 - left)); + return; + } + + type_t pivot = get_pivot_64bit(arr, left, right); + type_t smallest = vtype::type_max(); + type_t biggest = vtype::type_min(); + int64_t pivot_index = partition_avx512( + arr, left, right + 1, pivot, &smallest, &biggest); + if (pivot != smallest) + qsort_64bit_(arr, left, pivot_index - 1, max_iters - 1); + if (pivot != biggest) + qsort_64bit_(arr, pivot_index, right, max_iters - 1); +} + +static inline int64_t replace_nan_with_inf(double *arr, int64_t arrsize) +{ + int64_t nan_count = 0; + __mmask8 loadmask = 0xFF; + while (arrsize > 0) { + if (arrsize < 8) { loadmask = (0x01 << arrsize) - 0x01; } + __m512d in_zmm = _mm512_maskz_loadu_pd(loadmask, arr); + __mmask8 nanmask = _mm512_cmp_pd_mask(in_zmm, in_zmm, _CMP_NEQ_UQ); + nan_count += _mm_popcnt_u32((int32_t)nanmask); + _mm512_mask_storeu_pd(arr, nanmask, ZMM_MAX_DOUBLE); + arr += 8; + arrsize -= 8; + } + return nan_count; +} + +static inline void +replace_inf_with_nan(double *arr, int64_t arrsize, int64_t nan_count) +{ + for (int64_t ii = arrsize - 1; nan_count > 0; --ii) { + arr[ii] = std::nan("1"); + nan_count -= 1; + } +} + +template <> +void avx512_qsort(int64_t *arr, int64_t arrsize) +{ + if (arrsize > 1) { + qsort_64bit_, int64_t>( + arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qsort(uint64_t *arr, int64_t arrsize) +{ + if (arrsize > 1) { + qsort_64bit_, uint64_t>( + arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qsort(double *arr, int64_t arrsize) +{ + if (arrsize > 1) { + int64_t nan_count = replace_nan_with_inf(arr, arrsize); + qsort_64bit_, double>( + arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + replace_inf_with_nan(arr, arrsize, nan_count); + } +} +#endif // __AVX512_QSORT_64BIT__ diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h b/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h new file mode 100644 index 000000000..e713e1f20 --- /dev/null +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h @@ -0,0 +1,218 @@ +/******************************************************************* + * Copyright (C) 2022 Intel Corporation + * Copyright (C) 2021 Serge Sans Paille + * SPDX-License-Identifier: 
BSD-3-Clause
+ * Authors: Raghuveer Devulapalli
+ *          Serge Sans Paille
+ * ****************************************************************/
+
+#ifndef __AVX512_QSORT_COMMON__
+#define __AVX512_QSORT_COMMON__
+
+/*
+ * Quicksort using AVX-512. The ideas and code are based on these two research
+ * papers [1] and [2]. On a high level, the idea is to vectorize quicksort
+ * partitioning using AVX-512 compressstore instructions. If the array size is
+ * < 128, then use a Bitonic sorting network implemented on 512-bit registers.
+ * The precise network definitions depend on the dtype and are defined in
+ * separate files: avx512-16bit-qsort.hpp, avx512-32bit-qsort.hpp and
+ * avx512-64bit-qsort.hpp. Article [4] is a good resource on bitonic sorting
+ * networks. The core implementations of the vectorized qsort functions
+ * avx512_qsort(T*, int64_t) are modified versions of the AVX2 quicksort
+ * presented in the paper [2] and the source code associated with that paper [3].
+ *
+ * [1] Fast and Robust Vectorized In-Place Sorting of Primitive Types
+ *     https://drops.dagstuhl.de/opus/volltexte/2021/13775/
+ *
+ * [2] A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel
+ *     Skylake https://arxiv.org/pdf/1704.08579.pdf
+ *
+ * [3] https://github.com/simd-sorting/fast-and-robust: SPDX-License-Identifier: MIT
+ *
+ * [4] http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#define X86_SIMD_SORT_INFINITY std::numeric_limits<double>::infinity()
+#define X86_SIMD_SORT_INFINITYF std::numeric_limits<float>::infinity()
+#define X86_SIMD_SORT_MAX_UINT16 std::numeric_limits<uint16_t>::max()
+#define X86_SIMD_SORT_MAX_INT16 std::numeric_limits<int16_t>::max()
+#define X86_SIMD_SORT_MIN_INT16 std::numeric_limits<int16_t>::min()
+#define X86_SIMD_SORT_MAX_UINT32 std::numeric_limits<uint32_t>::max()
+#define X86_SIMD_SORT_MAX_INT32 std::numeric_limits<int32_t>::max()
+#define X86_SIMD_SORT_MIN_INT32 std::numeric_limits<int32_t>::min()
+#define X86_SIMD_SORT_MAX_UINT64 std::numeric_limits<uint64_t>::max()
+#define X86_SIMD_SORT_MAX_INT64 std::numeric_limits<int64_t>::max()
+#define X86_SIMD_SORT_MIN_INT64 std::numeric_limits<int64_t>::min()
+#define ZMM_MAX_DOUBLE _mm512_set1_pd(X86_SIMD_SORT_INFINITY)
+#define ZMM_MAX_UINT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_UINT64)
+#define ZMM_MAX_INT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_INT64)
+#define ZMM_MAX_FLOAT _mm512_set1_ps(X86_SIMD_SORT_INFINITYF)
+#define ZMM_MAX_UINT _mm512_set1_epi32(X86_SIMD_SORT_MAX_UINT32)
+#define ZMM_MAX_INT _mm512_set1_epi32(X86_SIMD_SORT_MAX_INT32)
+#define ZMM_MAX_UINT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_UINT16)
+#define ZMM_MAX_INT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_INT16)
+#define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d
+
+template
+struct vector;
+
+template
+void avx512_qsort(T *arr, int64_t arrsize);
+
+/*
+ * COEX == Compare and Exchange two registers by swapping min and max values
+ */
+template
+static void COEX(mm_t &a, mm_t &b)
+{
+    mm_t temp = a;
+    a = vtype::min(a, b);
+    b = vtype::max(temp, b);
+}
+
+template
+static inline zmm_t cmp_merge(zmm_t in1, zmm_t in2, opmask_t mask)
+{
+    zmm_t min = vtype::min(in2, in1);
+    zmm_t max = vtype::max(in2, in1);
+    return vtype::mask_mov(min, mask, max); // 0 -> min, 1 -> max
+}
+
+/*
+ * Partition one ZMM register based on the pivot and returns the index of the
+ * last element that is less than or equal to the pivot.
+ */
+template
+static inline int32_t partition_vec(type_t *arr,
+                                    int64_t left,
+                                    int64_t right,
+                                    const zmm_t curr_vec,
+                                    const zmm_t pivot_vec,
+                                    zmm_t *smallest_vec,
+                                    zmm_t *biggest_vec)
+{
+    /* which elements are larger than the pivot */
+    typename vtype::opmask_t gt_mask = vtype::ge(curr_vec, pivot_vec);
+    int32_t amount_gt_pivot = _mm_popcnt_u32((int32_t)gt_mask);
+    vtype::mask_compressstoreu(
+            arr + left, vtype::knot_opmask(gt_mask), curr_vec);
+    vtype::mask_compressstoreu(
+            arr + right - amount_gt_pivot, gt_mask, curr_vec);
+    *smallest_vec = vtype::min(curr_vec, *smallest_vec);
+    *biggest_vec = vtype::max(curr_vec, *biggest_vec);
+    return amount_gt_pivot;
+}
+
+/*
+ * Partition an array based on the pivot and returns the index of the
+ * last element that is less than or equal to the pivot.
+ */
+template
+static inline int64_t partition_avx512(type_t *arr,
+                                       int64_t left,
+                                       int64_t right,
+                                       type_t pivot,
+                                       type_t *smallest,
+                                       type_t *biggest)
+{
+    /* make array length divisible by vtype::numlanes, shortening the array */
+    for (int32_t i = (right - left) % vtype::numlanes; i > 0; --i) {
+        *smallest = std::min(*smallest, arr[left]);
+        *biggest = std::max(*biggest, arr[left]);
+        if (arr[left] > pivot) { std::swap(arr[left], arr[--right]); }
+        else {
+            ++left;
+        }
+    }
+
+    if (left == right)
+        return left; /* less than vtype::numlanes elements in the array */
+
+    using zmm_t = typename vtype::zmm_t;
+    zmm_t pivot_vec = vtype::set1(pivot);
+    zmm_t min_vec = vtype::set1(*smallest);
+    zmm_t max_vec = vtype::set1(*biggest);
+
+    if (right - left == vtype::numlanes) {
+        zmm_t vec = vtype::loadu(arr + left);
+        int32_t amount_gt_pivot = partition_vec(arr,
+                                                left,
+                                                left + vtype::numlanes,
+                                                vec,
+                                                pivot_vec,
+                                                &min_vec,
+                                                &max_vec);
+        *smallest = vtype::reducemin(min_vec);
+        *biggest = vtype::reducemax(max_vec);
+        return left + (vtype::numlanes - amount_gt_pivot);
+    }
+
+    // first and last vtype::numlanes values are partitioned at the end
+    zmm_t vec_left = vtype::loadu(arr + left);
+    zmm_t vec_right = vtype::loadu(arr + (right - vtype::numlanes));
+    // store positions of the vectors
+    int64_t r_store = right - vtype::numlanes;
+    int64_t l_store = left;
+    // indices for loading the elements
+    left += vtype::numlanes;
+    right -= vtype::numlanes;
+    while (right - left != 0) {
+        zmm_t curr_vec;
+        /*
+         * if fewer elements are stored on the right side of the array,
+         * then next elements are loaded from the right side,
+         * otherwise from the left side
+         */
+        if ((r_store + vtype::numlanes) - right < left - l_store) {
+            right -= vtype::numlanes;
+            curr_vec = vtype::loadu(arr + right);
+        }
+        else {
+            curr_vec = vtype::loadu(arr + left);
+            left += vtype::numlanes;
+        }
+        // partition the current vector and save it on both sides of the array
+        int32_t amount_gt_pivot
+                = partition_vec(arr,
+                                l_store,
+                                r_store + vtype::numlanes,
+                                curr_vec,
+                                pivot_vec,
+                                &min_vec,
+                                &max_vec);
+        ;
+        r_store -= amount_gt_pivot;
+        l_store += (vtype::numlanes - amount_gt_pivot);
+    }
+
+    /* partition and save vec_left and vec_right */
+    int32_t amount_gt_pivot = partition_vec(arr,
+                                            l_store,
+                                            r_store + vtype::numlanes,
+                                            vec_left,
+                                            pivot_vec,
+                                            &min_vec,
+                                            &max_vec);
+    l_store += (vtype::numlanes - amount_gt_pivot);
+    amount_gt_pivot = partition_vec(arr,
+                                    l_store,
+                                    l_store + vtype::numlanes,
+                                    vec_right,
+                                    pivot_vec,
+                                    &min_vec,
+                                    &max_vec);
+    l_store += (vtype::numlanes - amount_gt_pivot);
+    *smallest = vtype::reducemin(min_vec);
+    *biggest = vtype::reducemax(max_vec);
+    return l_store;
+}
+#endif //
__AVX512_QSORT_COMMON__ -- cgit v1.2.1 From ae978b8a2bc4e7b219d796519f9327feb08fe4e7 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 27 Sep 2022 21:58:29 -0700 Subject: ENH: Add AVX-512 based 64-bit dtype sort --- numpy/core/setup.py | 3 +- numpy/core/src/npysort/quicksort.cpp | 46 +- numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp | 54 ++ numpy/core/src/npysort/x86-qsort-skx.h | 37 + numpy/core/src/npysort/x86-qsort.dispatch.cpp | 835 ---------------------- numpy/core/src/npysort/x86-qsort.h | 28 - 6 files changed, 137 insertions(+), 866 deletions(-) create mode 100644 numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp create mode 100644 numpy/core/src/npysort/x86-qsort-skx.h delete mode 100644 numpy/core/src/npysort/x86-qsort.dispatch.cpp delete mode 100644 numpy/core/src/npysort/x86-qsort.h diff --git a/numpy/core/setup.py b/numpy/core/setup.py index e509b9d11..912867709 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -650,6 +650,7 @@ def configuration(parent_package='',top_path=None): config.add_include_dirs(join('src', 'multiarray')) config.add_include_dirs(join('src', 'umath')) config.add_include_dirs(join('src', 'npysort')) + config.add_include_dirs(join('src', 'npysort', 'x86-simd-sort', 'src')) config.add_include_dirs(join('src', '_simd')) config.add_define_macros([("NPY_INTERNAL_BUILD", "1")]) # this macro indicates that Numpy build is in process @@ -942,7 +943,7 @@ def configuration(parent_package='',top_path=None): join('src', 'multiarray', 'usertypes.c'), join('src', 'multiarray', 'vdot.c'), join('src', 'common', 'npy_sort.h.src'), - join('src', 'npysort', 'x86-qsort.dispatch.cpp'), + join('src', 'npysort', 'x86-qsort-skx.dispatch.cpp'), join('src', 'npysort', 'quicksort.cpp'), join('src', 'npysort', 'mergesort.cpp'), join('src', 'npysort', 'timsort.cpp'), diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp index 3e351dd84..06ac0c172 100644 --- a/numpy/core/src/npysort/quicksort.cpp +++ b/numpy/core/src/npysort/quicksort.cpp @@ -55,12 +55,12 @@ #include "npysort_heapsort.h" #include "numpy_tag.h" -#include "x86-qsort.h" +#include "x86-qsort-skx.h" #include #include #ifndef NPY_DISABLE_OPTIMIZATION -#include "x86-qsort.dispatch.h" +#include "x86-qsort-skx.dispatch.h" #endif #define NOT_USED NPY_UNUSED(unused) @@ -86,6 +86,48 @@ struct x86_dispatch { static bool quicksort(typename Tag::type *, npy_intp) { return false; } }; +template <> +struct x86_dispatch { + static bool quicksort(npy_long *start, npy_intp num) + { + void (*dispfunc)(void *, npy_intp) = nullptr; + NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_long); + if (dispfunc) { + (*dispfunc)(start, num); + return true; + } + return false; + } +}; + +template <> +struct x86_dispatch { + static bool quicksort(npy_ulong *start, npy_intp num) + { + void (*dispfunc)(void *, npy_intp) = nullptr; + NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_ulong); + if (dispfunc) { + (*dispfunc)(start, num); + return true; + } + return false; + } +}; + +template <> +struct x86_dispatch { + static bool quicksort(npy_double *start, npy_intp num) + { + void (*dispfunc)(void *, npy_intp) = nullptr; + NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_double); + if (dispfunc) { + (*dispfunc)(start, num); + return true; + } + return false; + } +}; + template <> struct x86_dispatch { static bool quicksort(npy_int *start, npy_intp num) diff --git a/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp b/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp new file mode 100644 index 
000000000..d26b8fc9f --- /dev/null +++ b/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp @@ -0,0 +1,54 @@ +/*@targets + * $maxopt $keep_baseline avx512_skx + */ +// policy $keep_baseline is used to avoid skip building avx512_skx +// when its part of baseline features (--cpu-baseline), since +// 'baseline' option isn't specified within targets. + +#include "x86-qsort-skx.h" +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#ifdef NPY_HAVE_AVX512_SKX +#include "avx512-32bit-qsort.hpp" +#include "avx512-64bit-qsort.hpp" + +/*************************************** + * C > C++ dispatch + ***************************************/ +NPY_NO_EXPORT void +NPY_CPU_DISPATCH_CURFX(x86_quicksort_long)(void *arr, npy_intp arrsize) +{ + avx512_qsort((npy_long*)arr, arrsize); +} + +NPY_NO_EXPORT void +NPY_CPU_DISPATCH_CURFX(x86_quicksort_ulong)(void *arr, npy_intp arrsize) +{ + avx512_qsort((npy_ulong*)arr, arrsize); +} + +NPY_NO_EXPORT void +NPY_CPU_DISPATCH_CURFX(x86_quicksort_double)(void *arr, npy_intp arrsize) +{ + avx512_qsort((npy_double*)arr, arrsize); +} + +NPY_NO_EXPORT void +NPY_CPU_DISPATCH_CURFX(x86_quicksort_int)(void *arr, npy_intp arrsize) +{ + avx512_qsort((npy_int*)arr, arrsize); +} + +NPY_NO_EXPORT void +NPY_CPU_DISPATCH_CURFX(x86_quicksort_uint)(void *arr, npy_intp arrsize) +{ + avx512_qsort((npy_uint*)arr, arrsize); +} + +NPY_NO_EXPORT void +NPY_CPU_DISPATCH_CURFX(x86_quicksort_float)(void *arr, npy_intp arrsize) +{ + avx512_qsort((npy_float*)arr, arrsize); +} + +#endif // NPY_HAVE_AVX512_SKX diff --git a/numpy/core/src/npysort/x86-qsort-skx.h b/numpy/core/src/npysort/x86-qsort-skx.h new file mode 100644 index 000000000..9a5cb2c9d --- /dev/null +++ b/numpy/core/src/npysort/x86-qsort-skx.h @@ -0,0 +1,37 @@ +#include "numpy/npy_common.h" + +#include "npy_cpu_dispatch.h" + +#ifndef NPY_NO_EXPORT +#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN +#endif + +#ifndef NPY_DISABLE_OPTIMIZATION +#include "x86-qsort-skx.dispatch.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_long, + (void *start, npy_intp num)) + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_ulong, + (void *start, npy_intp num)) + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_double, + (void *start, npy_intp num)) + + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_int, + (void *start, npy_intp num)) + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_uint, + (void *start, npy_intp num)) + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_float, + (void *start, npy_intp num)) + +#ifdef __cplusplus +} +#endif diff --git a/numpy/core/src/npysort/x86-qsort.dispatch.cpp b/numpy/core/src/npysort/x86-qsort.dispatch.cpp deleted file mode 100644 index 8e88cc667..000000000 --- a/numpy/core/src/npysort/x86-qsort.dispatch.cpp +++ /dev/null @@ -1,835 +0,0 @@ -/*@targets - * $maxopt $keep_baseline avx512_skx - */ -// policy $keep_baseline is used to avoid skip building avx512_skx -// when its part of baseline features (--cpu-baseline), since -// 'baseline' option isn't specified within targets. - -#include "x86-qsort.h" -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#ifdef NPY_HAVE_AVX512_SKX -#include "numpy/npy_math.h" - -#include "npy_sort.h" -#include "numpy_tag.h" - -#include "simd/simd.h" -#include - -template -NPY_NO_EXPORT int -heapsort_(type *start, npy_intp n); - -/* - * Quicksort using AVX-512 for int, uint32 and float. 
The ideas and code are - * based on these two research papers: - * (1) Fast and Robust Vectorized In-Place Sorting of Primitive Types - * https://drops.dagstuhl.de/opus/volltexte/2021/13775/ - * (2) A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel - * Skylake https://arxiv.org/pdf/1704.08579.pdf - * - * High level idea: Vectorize the quicksort partitioning using AVX-512 - * compressstore instructions. The algorithm to pick the pivot is to use median - * of 72 elements picked at random. If the array size is < 128, then use - * Bitonic sorting network. Good resource for bitonic sorting network: - * http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030 - * - * Refer to https://github.com/numpy/numpy/pull/20133#issuecomment-958110340 - * for potential problems when converting this code to universal intrinsics - * framework. - */ - -/* - * Constants used in sorting 16 elements in a ZMM registers. Based on Bitonic - * sorting network (see - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) - */ -#define NETWORK1 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 -#define NETWORK2 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 -#define NETWORK3 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 -#define NETWORK4 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 -#define NETWORK5 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -#define NETWORK6 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4 -#define NETWORK7 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 -#define ZMM_MAX_FLOAT _mm512_set1_ps(NPY_INFINITYF) -#define ZMM_MAX_UINT _mm512_set1_epi32(NPY_MAX_UINT32) -#define ZMM_MAX_INT _mm512_set1_epi32(NPY_MAX_INT32) -#define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d -#define SHUFFLE_ps(ZMM, MASK) _mm512_shuffle_ps(zmm, zmm, MASK) -#define SHUFFLE_epi32(ZMM, MASK) _mm512_shuffle_epi32(zmm, MASK) - -#define MAX(x, y) (((x) > (y)) ? (x) : (y)) -#define MIN(x, y) (((x) < (y)) ? (x) : (y)) - -/* - * Vectorized random number generator xoroshiro128+. 
Broken into 2 parts: - * (1) vnext generates 2 64-bit random integers - * (2) rnd_epu32 converts this to 4 32-bit random integers and bounds it to - * the length of the array - */ -#define VROTL(x, k) /* rotate each uint64_t value in vector */ \ - _mm256_or_si256(_mm256_slli_epi64((x), (k)), \ - _mm256_srli_epi64((x), 64 - (k))) - -static inline __m256i -vnext(__m256i *s0, __m256i *s1) -{ - *s1 = _mm256_xor_si256(*s0, *s1); /* modify vectors s1 and s0 */ - *s0 = _mm256_xor_si256(_mm256_xor_si256(VROTL(*s0, 24), *s1), - _mm256_slli_epi64(*s1, 16)); - *s1 = VROTL(*s1, 37); - return _mm256_add_epi64(*s0, *s1); /* return random vector */ -} - -/* transform random numbers to the range between 0 and bound - 1 */ -static inline __m256i -rnd_epu32(__m256i rnd_vec, __m256i bound) -{ - __m256i even = _mm256_srli_epi64(_mm256_mul_epu32(rnd_vec, bound), 32); - __m256i odd = _mm256_mul_epu32(_mm256_srli_epi64(rnd_vec, 32), bound); - return _mm256_blend_epi32(odd, even, 0b01010101); -} - -template -struct vector; - -template <> -struct vector { - using tag = npy::int_tag; - using type_t = npy_int; - using zmm_t = __m512i; - using ymm_t = __m256i; - - static type_t type_max() { return NPY_MAX_INT32; } - static type_t type_min() { return NPY_MIN_INT32; } - static zmm_t zmm_max() { return _mm512_set1_epi32(type_max()); } - - static __mmask16 ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_NLT); - } - template - static ymm_t i64gather(__m512i index, void const *base) - { - return _mm512_i64gather_epi32(index, base, scale); - } - static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); } - static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epi32(x, y); } - static void mask_compressstoreu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_compressstoreu_epi32(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, __mmask16 mask, void const *mem) - { - return _mm512_mask_loadu_epi32(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, __mmask16 mask, zmm_t y) - { - return _mm512_mask_mov_epi32(x, mask, y); - } - static void mask_storeu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_storeu_epi32(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epi32(x, y); } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_epi32(idx, zmm); - } - static type_t reducemax(zmm_t v) { return npyv_reduce_max_s32(v); } - static type_t reducemin(zmm_t v) { return npyv_reduce_min_s32(v); } - static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); } - template<__mmask16 mask> - static zmm_t shuffle(zmm_t zmm) - { - return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_si512(mem, x); - } - - static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_epi32(x, y); } - static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epi32(x, y); } -}; -template <> -struct vector { - using tag = npy::uint_tag; - using type_t = npy_uint; - using zmm_t = __m512i; - using ymm_t = __m256i; - - static type_t type_max() { return NPY_MAX_UINT32; } - static type_t type_min() { return 0; } - static zmm_t zmm_max() { return _mm512_set1_epi32(type_max()); } - - template - static ymm_t i64gather(__m512i index, void const *base) - { - return _mm512_i64gather_epi32(index, base, scale); - } - static __mmask16 ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_epu32_mask(x, y, _MM_CMPINT_NLT); - } - static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); } - static 
zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epu32(x, y); } - static void mask_compressstoreu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_compressstoreu_epi32(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, __mmask16 mask, void const *mem) - { - return _mm512_mask_loadu_epi32(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, __mmask16 mask, zmm_t y) - { - return _mm512_mask_mov_epi32(x, mask, y); - } - static void mask_storeu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_storeu_epi32(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epu32(x, y); } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_epi32(idx, zmm); - } - static type_t reducemax(zmm_t v) { return npyv_reduce_max_u32(v); } - static type_t reducemin(zmm_t v) { return npyv_reduce_min_u32(v); } - static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); } - template<__mmask16 mask> - static zmm_t shuffle(zmm_t zmm) - { - return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_si512(mem, x); - } - - static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_epu32(x, y); } - static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epu32(x, y); } -}; -template <> -struct vector { - using tag = npy::float_tag; - using type_t = npy_float; - using zmm_t = __m512; - using ymm_t = __m256; - - static type_t type_max() { return NPY_INFINITYF; } - static type_t type_min() { return -NPY_INFINITYF; } - static zmm_t zmm_max() { return _mm512_set1_ps(type_max()); } - - static __mmask16 ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); - } - template - static ymm_t i64gather(__m512i index, void const *base) - { - return _mm512_i64gather_ps(index, base, scale); - } - static zmm_t loadu(void const *mem) { return _mm512_loadu_ps(mem); } - static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_ps(x, y); } - static void mask_compressstoreu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_compressstoreu_ps(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, __mmask16 mask, void const *mem) - { - return _mm512_mask_loadu_ps(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, __mmask16 mask, zmm_t y) - { - return _mm512_mask_mov_ps(x, mask, y); - } - static void mask_storeu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_storeu_ps(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_ps(x, y); } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_ps(idx, zmm); - } - static type_t reducemax(zmm_t v) { return npyv_reduce_max_f32(v); } - static type_t reducemin(zmm_t v) { return npyv_reduce_min_f32(v); } - static zmm_t set1(type_t v) { return _mm512_set1_ps(v); } - template<__mmask16 mask> - static zmm_t shuffle(zmm_t zmm) - { - return _mm512_shuffle_ps(zmm, zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) { return _mm512_storeu_ps(mem, x); } - - static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_ps(x, y); } - static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_ps(x, y); } -}; - -/* - * COEX == Compare and Exchange two registers by swapping min and max values - */ -template -void -COEX(mm_t &a, mm_t &b) -{ - mm_t temp = a; - a = vtype::min(a, b); - b = vtype::max(temp, b); -} - -template -static inline zmm_t -cmp_merge(zmm_t in1, zmm_t in2, __mmask16 mask) -{ - zmm_t min = vtype::min(in2, in1); - zmm_t max = vtype::max(in2, in1); - return vtype::mask_mov(min, 
mask, max); // 0 -> min, 1 -> max -} - -/* - * Assumes zmm is random and performs a full sorting network defined in - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg - */ -template -static inline zmm_t -sort_zmm(zmm_t zmm) -{ - zmm = cmp_merge(zmm, vtype::template shuffle(zmm), - 0xAAAA); - zmm = cmp_merge(zmm, vtype::template shuffle(zmm), - 0xCCCC); - zmm = cmp_merge(zmm, vtype::template shuffle(zmm), - 0xAAAA); - zmm = cmp_merge( - zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK3), zmm), 0xF0F0); - zmm = cmp_merge(zmm, vtype::template shuffle(zmm), - 0xCCCC); - zmm = cmp_merge(zmm, vtype::template shuffle(zmm), - 0xAAAA); - zmm = cmp_merge( - zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm), 0xFF00); - zmm = cmp_merge( - zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK6), zmm), 0xF0F0); - zmm = cmp_merge(zmm, vtype::template shuffle(zmm), - 0xCCCC); - zmm = cmp_merge(zmm, vtype::template shuffle(zmm), - 0xAAAA); - return zmm; -} - -// Assumes zmm is bitonic and performs a recursive half cleaner -template -static inline zmm_t -bitonic_merge_zmm(zmm_t zmm) -{ - // 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc .. - zmm = cmp_merge( - zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK7), zmm), 0xFF00); - // 2) half_cleaner[8]: compare 1-5, 2-6, 3-7 etc .. - zmm = cmp_merge( - zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK6), zmm), 0xF0F0); - // 3) half_cleaner[4] - zmm = cmp_merge(zmm, vtype::template shuffle(zmm), - 0xCCCC); - // 3) half_cleaner[1] - zmm = cmp_merge(zmm, vtype::template shuffle(zmm), - 0xAAAA); - return zmm; -} - -// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner -template -static inline void -bitonic_merge_two_zmm(zmm_t *zmm1, zmm_t *zmm2) -{ - // 1) First step of a merging network: coex of zmm1 and zmm2 reversed - *zmm2 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), *zmm2); - zmm_t zmm3 = vtype::min(*zmm1, *zmm2); - zmm_t zmm4 = vtype::max(*zmm1, *zmm2); - // 2) Recursive half cleaner for each - *zmm1 = bitonic_merge_zmm(zmm3); - *zmm2 = bitonic_merge_zmm(zmm4); -} - -// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive -// half cleaner -template -static inline void -bitonic_merge_four_zmm(zmm_t *zmm) -{ - zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[2]); - zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[3]); - zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r); - zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r); - zmm_t zmm_t3 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[1], zmm2r)); - zmm_t zmm_t4 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[0], zmm3r)); - zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2); - zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2); - zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4); - zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4); - zmm[0] = bitonic_merge_zmm(zmm0); - zmm[1] = bitonic_merge_zmm(zmm1); - zmm[2] = bitonic_merge_zmm(zmm2); - zmm[3] = bitonic_merge_zmm(zmm3); -} - -template -static inline void -bitonic_merge_eight_zmm(zmm_t *zmm) -{ - zmm_t zmm4r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[4]); - zmm_t zmm5r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[5]); - zmm_t zmm6r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[6]); - zmm_t zmm7r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[7]); - zmm_t zmm_t1 = vtype::min(zmm[0], zmm7r); - zmm_t zmm_t2 = vtype::min(zmm[1], zmm6r); - zmm_t zmm_t3 = vtype::min(zmm[2], zmm5r); - zmm_t zmm_t4 = vtype::min(zmm[3], zmm4r); - zmm_t zmm_t5 = 
vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[3], zmm4r)); - zmm_t zmm_t6 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[2], zmm5r)); - zmm_t zmm_t7 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[1], zmm6r)); - zmm_t zmm_t8 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[0], zmm7r)); - COEX(zmm_t1, zmm_t3); - COEX(zmm_t2, zmm_t4); - COEX(zmm_t5, zmm_t7); - COEX(zmm_t6, zmm_t8); - COEX(zmm_t1, zmm_t2); - COEX(zmm_t3, zmm_t4); - COEX(zmm_t5, zmm_t6); - COEX(zmm_t7, zmm_t8); - zmm[0] = bitonic_merge_zmm(zmm_t1); - zmm[1] = bitonic_merge_zmm(zmm_t2); - zmm[2] = bitonic_merge_zmm(zmm_t3); - zmm[3] = bitonic_merge_zmm(zmm_t4); - zmm[4] = bitonic_merge_zmm(zmm_t5); - zmm[5] = bitonic_merge_zmm(zmm_t6); - zmm[6] = bitonic_merge_zmm(zmm_t7); - zmm[7] = bitonic_merge_zmm(zmm_t8); -} - -template -static inline void -sort_16(type_t *arr, npy_int N) -{ - __mmask16 load_mask = (0x0001 << N) - 0x0001; - typename vtype::zmm_t zmm = - vtype::mask_loadu(vtype::zmm_max(), load_mask, arr); - vtype::mask_storeu(arr, load_mask, sort_zmm(zmm)); -} - -template -static inline void -sort_32(type_t *arr, npy_int N) -{ - if (N <= 16) { - sort_16(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - zmm_t zmm1 = vtype::loadu(arr); - __mmask16 load_mask = (0x0001 << (N - 16)) - 0x0001; - zmm_t zmm2 = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr + 16); - zmm1 = sort_zmm(zmm1); - zmm2 = sort_zmm(zmm2); - bitonic_merge_two_zmm(&zmm1, &zmm2); - vtype::storeu(arr, zmm1); - vtype::mask_storeu(arr + 16, load_mask, zmm2); -} - -template -static inline void -sort_64(type_t *arr, npy_int N) -{ - if (N <= 32) { - sort_32(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - zmm_t zmm[4]; - zmm[0] = vtype::loadu(arr); - zmm[1] = vtype::loadu(arr + 16); - __mmask16 load_mask1 = 0xFFFF, load_mask2 = 0xFFFF; - if (N < 48) { - load_mask1 = (0x0001 << (N - 32)) - 0x0001; - load_mask2 = 0x0000; - } - else if (N < 64) { - load_mask2 = (0x0001 << (N - 48)) - 0x0001; - } - zmm[2] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 32); - zmm[3] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 48); - zmm[0] = sort_zmm(zmm[0]); - zmm[1] = sort_zmm(zmm[1]); - zmm[2] = sort_zmm(zmm[2]); - zmm[3] = sort_zmm(zmm[3]); - bitonic_merge_two_zmm(&zmm[0], &zmm[1]); - bitonic_merge_two_zmm(&zmm[2], &zmm[3]); - bitonic_merge_four_zmm(zmm); - vtype::storeu(arr, zmm[0]); - vtype::storeu(arr + 16, zmm[1]); - vtype::mask_storeu(arr + 32, load_mask1, zmm[2]); - vtype::mask_storeu(arr + 48, load_mask2, zmm[3]); -} - -template -static inline void -sort_128(type_t *arr, npy_int N) -{ - if (N <= 64) { - sort_64(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - zmm_t zmm[8]; - zmm[0] = vtype::loadu(arr); - zmm[1] = vtype::loadu(arr + 16); - zmm[2] = vtype::loadu(arr + 32); - zmm[3] = vtype::loadu(arr + 48); - zmm[0] = sort_zmm(zmm[0]); - zmm[1] = sort_zmm(zmm[1]); - zmm[2] = sort_zmm(zmm[2]); - zmm[3] = sort_zmm(zmm[3]); - __mmask16 load_mask1 = 0xFFFF, load_mask2 = 0xFFFF; - __mmask16 load_mask3 = 0xFFFF, load_mask4 = 0xFFFF; - if (N < 80) { - load_mask1 = (0x0001 << (N - 64)) - 0x0001; - load_mask2 = 0x0000; - load_mask3 = 0x0000; - load_mask4 = 0x0000; - } - else if (N < 96) { - load_mask2 = (0x0001 << (N - 80)) - 0x0001; - load_mask3 = 0x0000; - load_mask4 = 0x0000; - } - else if (N < 112) { - load_mask3 = (0x0001 << (N - 96)) - 0x0001; - load_mask4 = 0x0000; - } - else { - load_mask4 = (0x0001 << (N - 112)) - 0x0001; - } - zmm[4] = 
vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 64); - zmm[5] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 80); - zmm[6] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 96); - zmm[7] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 112); - zmm[4] = sort_zmm(zmm[4]); - zmm[5] = sort_zmm(zmm[5]); - zmm[6] = sort_zmm(zmm[6]); - zmm[7] = sort_zmm(zmm[7]); - bitonic_merge_two_zmm(&zmm[0], &zmm[1]); - bitonic_merge_two_zmm(&zmm[2], &zmm[3]); - bitonic_merge_two_zmm(&zmm[4], &zmm[5]); - bitonic_merge_two_zmm(&zmm[6], &zmm[7]); - bitonic_merge_four_zmm(zmm); - bitonic_merge_four_zmm(zmm + 4); - bitonic_merge_eight_zmm(zmm); - vtype::storeu(arr, zmm[0]); - vtype::storeu(arr + 16, zmm[1]); - vtype::storeu(arr + 32, zmm[2]); - vtype::storeu(arr + 48, zmm[3]); - vtype::mask_storeu(arr + 64, load_mask1, zmm[4]); - vtype::mask_storeu(arr + 80, load_mask2, zmm[5]); - vtype::mask_storeu(arr + 96, load_mask3, zmm[6]); - vtype::mask_storeu(arr + 112, load_mask4, zmm[7]); -} - -template -static inline void -swap(type_t *arr, npy_intp ii, npy_intp jj) -{ - type_t temp = arr[ii]; - arr[ii] = arr[jj]; - arr[jj] = temp; -} - -// Median of 3 strategy -// template -// static inline -// npy_intp get_pivot_index(type_t *arr, const npy_intp left, const npy_intp -// right) { -// return (rand() % (right + 1 - left)) + left; -// //npy_intp middle = ((right-left)/2) + left; -// //type_t a = arr[left], b = arr[middle], c = arr[right]; -// //if ((b >= a && b <= c) || (b <= a && b >= c)) -// // return middle; -// //if ((a >= b && a <= c) || (a <= b && a >= c)) -// // return left; -// //else -// // return right; -//} - -/* - * Picking the pivot: Median of 72 array elements chosen at random. - */ - -template -static inline type_t -get_pivot(type_t *arr, const npy_intp left, const npy_intp right) -{ - /* seeds for vectorized random number generator */ - __m256i s0 = _mm256_setr_epi64x(8265987198341093849, 3762817312854612374, - 1324281658759788278, 6214952190349879213); - __m256i s1 = _mm256_setr_epi64x(2874178529384792648, 1257248936691237653, - 7874578921548791257, 1998265912745817298); - s0 = _mm256_add_epi64(s0, _mm256_set1_epi64x(left)); - s1 = _mm256_sub_epi64(s1, _mm256_set1_epi64x(right)); - - npy_intp arrsize = right - left + 1; - __m256i bound = - _mm256_set1_epi32(arrsize > INT32_MAX ? 
INT32_MAX : arrsize); - __m512i left_vec = _mm512_set1_epi64(left); - __m512i right_vec = _mm512_set1_epi64(right); - using ymm_t = typename vtype::ymm_t; - ymm_t v[9]; - /* fill 9 vectors with random numbers */ - for (npy_int i = 0; i < 9; ++i) { - __m256i rand_64 = vnext(&s0, &s1); /* vector with 4 random uint64_t */ - __m512i rand_32 = _mm512_cvtepi32_epi64(rnd_epu32( - rand_64, bound)); /* random numbers between 0 and bound - 1 */ - __m512i indices; - if (i < 5) - indices = - _mm512_add_epi64(left_vec, rand_32); /* indices for arr */ - else - indices = - _mm512_sub_epi64(right_vec, rand_32); /* indices for arr */ - - v[i] = vtype::template i64gather(indices, arr); - } - - /* median network for 9 elements */ - COEX(v[0], v[1]); - COEX(v[2], v[3]); - COEX(v[4], v[5]); - COEX(v[6], v[7]); - COEX(v[0], v[2]); - COEX(v[1], v[3]); - COEX(v[4], v[6]); - COEX(v[5], v[7]); - COEX(v[0], v[4]); - COEX(v[1], v[2]); - COEX(v[5], v[6]); - COEX(v[3], v[7]); - COEX(v[1], v[5]); - COEX(v[2], v[6]); - COEX(v[3], v[5]); - COEX(v[2], v[4]); - COEX(v[3], v[4]); - COEX(v[3], v[8]); - COEX(v[4], v[8]); - - // technically v[4] needs to be sorted before we pick the correct median, - // picking the 4th element works just as well for performance - type_t *temp = (type_t *)&v[4]; - - return temp[4]; -} - -/* - * Partition one ZMM register based on the pivot and returns the index of the - * last element that is less than equal to the pivot. - */ -template -static inline npy_int -partition_vec(type_t *arr, npy_intp left, npy_intp right, const zmm_t curr_vec, - const zmm_t pivot_vec, zmm_t *smallest_vec, zmm_t *biggest_vec) -{ - /* which elements are larger than the pivot */ - __mmask16 gt_mask = vtype::ge(curr_vec, pivot_vec); - npy_int amount_gt_pivot = _mm_popcnt_u32((npy_int)gt_mask); - vtype::mask_compressstoreu(arr + left, _mm512_knot(gt_mask), curr_vec); - vtype::mask_compressstoreu(arr + right - amount_gt_pivot, gt_mask, - curr_vec); - *smallest_vec = vtype::min(curr_vec, *smallest_vec); - *biggest_vec = vtype::max(curr_vec, *biggest_vec); - return amount_gt_pivot; -} - -/* - * Partition an array based on the pivot and returns the index of the - * last element that is less than equal to the pivot. 
- */ -template -static inline npy_intp -partition_avx512(type_t *arr, npy_intp left, npy_intp right, type_t pivot, - type_t *smallest, type_t *biggest) -{ - /* make array length divisible by 16 , shortening the array */ - for (npy_int i = (right - left) % 16; i > 0; --i) { - *smallest = MIN(*smallest, arr[left]); - *biggest = MAX(*biggest, arr[left]); - if (arr[left] > pivot) { - swap(arr, left, --right); - } - else { - ++left; - } - } - - if (left == right) - return left; /* less than 16 elements in the array */ - - using zmm_t = typename vtype::zmm_t; - zmm_t pivot_vec = vtype::set1(pivot); - zmm_t min_vec = vtype::set1(*smallest); - zmm_t max_vec = vtype::set1(*biggest); - - if (right - left == 16) { - zmm_t vec = vtype::loadu(arr + left); - npy_int amount_gt_pivot = partition_vec( - arr, left, left + 16, vec, pivot_vec, &min_vec, &max_vec); - *smallest = vtype::reducemin(min_vec); - *biggest = vtype::reducemax(max_vec); - return left + (16 - amount_gt_pivot); - } - - // first and last 16 values are partitioned at the end - zmm_t vec_left = vtype::loadu(arr + left); - zmm_t vec_right = vtype::loadu(arr + (right - 16)); - // store points of the vectors - npy_intp r_store = right - 16; - npy_intp l_store = left; - // indices for loading the elements - left += 16; - right -= 16; - while (right - left != 0) { - zmm_t curr_vec; - /* - * if fewer elements are stored on the right side of the array, - * then next elements are loaded from the right side, - * otherwise from the left side - */ - if ((r_store + 16) - right < left - l_store) { - right -= 16; - curr_vec = vtype::loadu(arr + right); - } - else { - curr_vec = vtype::loadu(arr + left); - left += 16; - } - // partition the current vector and save it on both sides of the array - npy_int amount_gt_pivot = - partition_vec(arr, l_store, r_store + 16, curr_vec, - pivot_vec, &min_vec, &max_vec); - ; - r_store -= amount_gt_pivot; - l_store += (16 - amount_gt_pivot); - } - - /* partition and save vec_left and vec_right */ - npy_int amount_gt_pivot = - partition_vec(arr, l_store, r_store + 16, vec_left, - pivot_vec, &min_vec, &max_vec); - l_store += (16 - amount_gt_pivot); - amount_gt_pivot = - partition_vec(arr, l_store, l_store + 16, vec_right, - pivot_vec, &min_vec, &max_vec); - l_store += (16 - amount_gt_pivot); - *smallest = vtype::reducemin(min_vec); - *biggest = vtype::reducemax(max_vec); - return l_store; -} - -template -static inline void -qsort_(type_t *arr, npy_intp left, npy_intp right, npy_int max_iters) -{ - /* - * Resort to heapsort if quicksort isn't making any progress - */ - if (max_iters <= 0) { - heapsort_(arr + left, right + 1 - left); - return; - } - /* - * Base case: use bitonic networks to sort arrays <= 128 - */ - if (right + 1 - left <= 128) { - sort_128(arr + left, (npy_int)(right + 1 - left)); - return; - } - - type_t pivot = get_pivot(arr, left, right); - type_t smallest = vtype::type_max(); - type_t biggest = vtype::type_min(); - npy_intp pivot_index = partition_avx512(arr, left, right + 1, pivot, - &smallest, &biggest); - if (pivot != smallest) - qsort_(arr, left, pivot_index - 1, max_iters - 1); - if (pivot != biggest) - qsort_(arr, pivot_index, right, max_iters - 1); -} - -static inline npy_intp -replace_nan_with_inf(npy_float *arr, npy_intp arrsize) -{ - npy_intp nan_count = 0; - __mmask16 loadmask = 0xFFFF; - while (arrsize > 0) { - if (arrsize < 16) { - loadmask = (0x0001 << arrsize) - 0x0001; - } - __m512 in_zmm = _mm512_maskz_loadu_ps(loadmask, arr); - __mmask16 nanmask = _mm512_cmp_ps_mask(in_zmm, in_zmm, 
_CMP_NEQ_UQ); - nan_count += _mm_popcnt_u32((npy_int)nanmask); - _mm512_mask_storeu_ps(arr, nanmask, ZMM_MAX_FLOAT); - arr += 16; - arrsize -= 16; - } - return nan_count; -} - -static inline void -replace_inf_with_nan(npy_float *arr, npy_intp arrsize, npy_intp nan_count) -{ - for (npy_intp ii = arrsize - 1; nan_count > 0; --ii) { - arr[ii] = NPY_NANF; - nan_count -= 1; - } -} - -/*************************************** - * C > C++ dispatch - ***************************************/ - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_int)(void *arr, npy_intp arrsize) -{ - if (arrsize > 1) { - qsort_, npy_int>((npy_int *)arr, 0, arrsize - 1, - 2 * (npy_int)log2(arrsize)); - } -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_uint)(void *arr, npy_intp arrsize) -{ - if (arrsize > 1) { - qsort_, npy_uint>((npy_uint *)arr, 0, arrsize - 1, - 2 * (npy_int)log2(arrsize)); - } -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_float)(void *arr, npy_intp arrsize) -{ - if (arrsize > 1) { - npy_intp nan_count = replace_nan_with_inf((npy_float *)arr, arrsize); - qsort_, npy_float>((npy_float *)arr, 0, arrsize - 1, - 2 * (npy_int)log2(arrsize)); - replace_inf_with_nan((npy_float *)arr, arrsize, nan_count); - } -} - -#endif // NPY_HAVE_AVX512_SKX diff --git a/numpy/core/src/npysort/x86-qsort.h b/numpy/core/src/npysort/x86-qsort.h deleted file mode 100644 index 6340e2bc7..000000000 --- a/numpy/core/src/npysort/x86-qsort.h +++ /dev/null @@ -1,28 +0,0 @@ -#include "numpy/npy_common.h" - -#include "npy_cpu_dispatch.h" - -#ifndef NPY_NO_EXPORT -#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN -#endif - -#ifndef NPY_DISABLE_OPTIMIZATION -#include "x86-qsort.dispatch.h" -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_int, - (void *start, npy_intp num)) - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_uint, - (void *start, npy_intp num)) - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_float, - (void *start, npy_intp num)) - -#ifdef __cplusplus -} -#endif -- cgit v1.2.1 From 882503ac9383b3fff0ecf5423e732e64469347ba Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 28 Sep 2022 13:34:11 -0700 Subject: ENH: Add AVX-512 based 16-bit dtype sort --- numpy/core/setup.py | 1 + numpy/core/src/npysort/quicksort.cpp | 34 +++++++++++++++++++++++ numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp | 29 +++++++++++++++++++ numpy/core/src/npysort/x86-qsort-icl.h | 24 ++++++++++++++++ 4 files changed, 88 insertions(+) create mode 100644 numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp create mode 100644 numpy/core/src/npysort/x86-qsort-icl.h diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 912867709..0331a2f9b 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -944,6 +944,7 @@ def configuration(parent_package='',top_path=None): join('src', 'multiarray', 'vdot.c'), join('src', 'common', 'npy_sort.h.src'), join('src', 'npysort', 'x86-qsort-skx.dispatch.cpp'), + join('src', 'npysort', 'x86-qsort-icl.dispatch.cpp'), join('src', 'npysort', 'quicksort.cpp'), join('src', 'npysort', 'mergesort.cpp'), join('src', 'npysort', 'timsort.cpp'), diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp index 06ac0c172..d89dac173 100644 --- a/numpy/core/src/npysort/quicksort.cpp +++ b/numpy/core/src/npysort/quicksort.cpp @@ -56,6 +56,7 @@ #include "numpy_tag.h" #include "x86-qsort-skx.h" +#include "x86-qsort-icl.h" #include #include @@ -86,6 +87,7 @@ struct 
x86_dispatch { static bool quicksort(typename Tag::type *, npy_intp) { return false; } }; + template <> struct x86_dispatch { static bool quicksort(npy_long *start, npy_intp num) @@ -170,6 +172,38 @@ struct x86_dispatch { } }; +#ifndef NPY_DISABLE_OPTIMIZATION +#include "x86-qsort-icl.dispatch.h" +#endif + +template <> +struct x86_dispatch { + static bool quicksort(npy_short *start, npy_intp num) + { + void (*dispfunc)(void *, npy_intp) = nullptr; + NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_short); + if (dispfunc) { + (*dispfunc)(start, num); + return true; + } + return false; + } +}; + +template <> +struct x86_dispatch { + static bool quicksort(npy_ushort *start, npy_intp num) + { + void (*dispfunc)(void *, npy_intp) = nullptr; + NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_ushort); + if (dispfunc) { + (*dispfunc)(start, num); + return true; + } + return false; + } +}; + } // namespace template diff --git a/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp b/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp new file mode 100644 index 000000000..7d6dc331b --- /dev/null +++ b/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp @@ -0,0 +1,29 @@ +/*@targets + * $maxopt $keep_baseline avx512_icl + */ +// policy $keep_baseline is used to avoid skip building avx512_skx +// when its part of baseline features (--cpu-baseline), since +// 'baseline' option isn't specified within targets. + +#include "x86-qsort-icl.h" +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#ifdef NPY_HAVE_AVX512_ICL +#include "avx512-16bit-qsort.hpp" + +/*************************************** + * C > C++ dispatch + ***************************************/ +NPY_NO_EXPORT void +NPY_CPU_DISPATCH_CURFX(x86_quicksort_short)(void *arr, npy_intp arrsize) +{ + avx512_qsort((npy_short*)arr, arrsize); +} + +NPY_NO_EXPORT void +NPY_CPU_DISPATCH_CURFX(x86_quicksort_ushort)(void *arr, npy_intp arrsize) +{ + avx512_qsort((npy_ushort*)arr, arrsize); +} + +#endif // NPY_HAVE_AVX512_ICL diff --git a/numpy/core/src/npysort/x86-qsort-icl.h b/numpy/core/src/npysort/x86-qsort-icl.h new file mode 100644 index 000000000..2093e0bce --- /dev/null +++ b/numpy/core/src/npysort/x86-qsort-icl.h @@ -0,0 +1,24 @@ +#include "numpy/npy_common.h" + +#include "npy_cpu_dispatch.h" + +#ifndef NPY_NO_EXPORT +#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN +#endif + +#ifndef NPY_DISABLE_OPTIMIZATION +#include "x86-qsort-icl.dispatch.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_short, + (void *start, npy_intp num)) + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_ushort, + (void *start, npy_intp num)) + +#ifdef __cplusplus +} +#endif -- cgit v1.2.1 From 1b5f40c89634d9399c1f3a7906dedc153b202b69 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 28 Sep 2022 22:21:52 -0700 Subject: BUG: Use longlong when NPY_SIZEOF_LONG is 4 --- numpy/core/src/npysort/quicksort.cpp | 10 ++++++++++ numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp | 8 ++++++++ 2 files changed, 18 insertions(+) diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp index d89dac173..3af6b91d6 100644 --- a/numpy/core/src/npysort/quicksort.cpp +++ b/numpy/core/src/npysort/quicksort.cpp @@ -89,8 +89,13 @@ struct x86_dispatch { template <> +#if NPY_SIZEOF_LONG == 8 struct x86_dispatch { static bool quicksort(npy_long *start, npy_intp num) +#else +struct x86_dispatch { + static bool quicksort(npy_longlong *start, npy_intp num) +#endif { void (*dispfunc)(void *, npy_intp) = 
nullptr; NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_long); @@ -103,8 +108,13 @@ struct x86_dispatch { }; template <> +#if NPY_SIZEOF_LONG == 8 struct x86_dispatch { static bool quicksort(npy_ulong *start, npy_intp num) +#else +struct x86_dispatch { + static bool quicksort(npy_ulonglong *start, npy_intp num) +#endif { void (*dispfunc)(void *, npy_intp) = nullptr; NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_ulong); diff --git a/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp b/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp index d26b8fc9f..fb328f547 100644 --- a/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp +++ b/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp @@ -18,13 +18,21 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(x86_quicksort_long)(void *arr, npy_intp arrsize) { +#if NPY_SIZEOF_LONG == 8 avx512_qsort((npy_long*)arr, arrsize); +#else + avx512_qsort((npy_longlong*)arr, arrsize); +#endif } NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(x86_quicksort_ulong)(void *arr, npy_intp arrsize) { +#if NPY_SIZEOF_LONG == 8 avx512_qsort((npy_ulong*)arr, arrsize); +#else + avx512_qsort((npy_ulonglong*)arr, arrsize); +#endif } NPY_NO_EXPORT void -- cgit v1.2.1 From 9edebc521b13bc2aa5a3367635730a4c4b4efac4 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 28 Sep 2022 22:22:41 -0700 Subject: Revert "ENH: Add AVX-512 based 16-bit dtype sort" This reverts commit 225c8bab83d239d8888bc7b688efed97ab2284cf. --- numpy/core/setup.py | 1 - numpy/core/src/npysort/quicksort.cpp | 34 ----------------------- numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp | 29 ------------------- numpy/core/src/npysort/x86-qsort-icl.h | 24 ---------------- 4 files changed, 88 deletions(-) delete mode 100644 numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp delete mode 100644 numpy/core/src/npysort/x86-qsort-icl.h diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 0331a2f9b..912867709 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -944,7 +944,6 @@ def configuration(parent_package='',top_path=None): join('src', 'multiarray', 'vdot.c'), join('src', 'common', 'npy_sort.h.src'), join('src', 'npysort', 'x86-qsort-skx.dispatch.cpp'), - join('src', 'npysort', 'x86-qsort-icl.dispatch.cpp'), join('src', 'npysort', 'quicksort.cpp'), join('src', 'npysort', 'mergesort.cpp'), join('src', 'npysort', 'timsort.cpp'), diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp index 3af6b91d6..85b4a1e62 100644 --- a/numpy/core/src/npysort/quicksort.cpp +++ b/numpy/core/src/npysort/quicksort.cpp @@ -56,7 +56,6 @@ #include "numpy_tag.h" #include "x86-qsort-skx.h" -#include "x86-qsort-icl.h" #include #include @@ -87,7 +86,6 @@ struct x86_dispatch { static bool quicksort(typename Tag::type *, npy_intp) { return false; } }; - template <> #if NPY_SIZEOF_LONG == 8 struct x86_dispatch { @@ -182,38 +180,6 @@ struct x86_dispatch { } }; -#ifndef NPY_DISABLE_OPTIMIZATION -#include "x86-qsort-icl.dispatch.h" -#endif - -template <> -struct x86_dispatch { - static bool quicksort(npy_short *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_short); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; - -template <> -struct x86_dispatch { - static bool quicksort(npy_ushort *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_ushort); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - 
return false; - } -}; - } // namespace template diff --git a/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp b/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp deleted file mode 100644 index 7d6dc331b..000000000 --- a/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp +++ /dev/null @@ -1,29 +0,0 @@ -/*@targets - * $maxopt $keep_baseline avx512_icl - */ -// policy $keep_baseline is used to avoid skip building avx512_skx -// when its part of baseline features (--cpu-baseline), since -// 'baseline' option isn't specified within targets. - -#include "x86-qsort-icl.h" -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#ifdef NPY_HAVE_AVX512_ICL -#include "avx512-16bit-qsort.hpp" - -/*************************************** - * C > C++ dispatch - ***************************************/ -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_short)(void *arr, npy_intp arrsize) -{ - avx512_qsort((npy_short*)arr, arrsize); -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_ushort)(void *arr, npy_intp arrsize) -{ - avx512_qsort((npy_ushort*)arr, arrsize); -} - -#endif // NPY_HAVE_AVX512_ICL diff --git a/numpy/core/src/npysort/x86-qsort-icl.h b/numpy/core/src/npysort/x86-qsort-icl.h deleted file mode 100644 index 2093e0bce..000000000 --- a/numpy/core/src/npysort/x86-qsort-icl.h +++ /dev/null @@ -1,24 +0,0 @@ -#include "numpy/npy_common.h" - -#include "npy_cpu_dispatch.h" - -#ifndef NPY_NO_EXPORT -#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN -#endif - -#ifndef NPY_DISABLE_OPTIMIZATION -#include "x86-qsort-icl.dispatch.h" -#endif - -#ifdef __cplusplus -extern "C" { -#endif -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_short, - (void *start, npy_intp num)) - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_ushort, - (void *start, npy_intp num)) - -#ifdef __cplusplus -} -#endif -- cgit v1.2.1 From 57215f84ce60653908b99179338416dd7c2bbd36 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Fri, 30 Sep 2022 11:06:40 -0700 Subject: BUG: Ensure long/longlong is 8 bytes for 64-bit qsort --- numpy/core/src/npysort/quicksort.cpp | 36 +++++++++++++++++------ numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp | 12 ++------ 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp index 85b4a1e62..0674d25ac 100644 --- a/numpy/core/src/npysort/quicksort.cpp +++ b/numpy/core/src/npysort/quicksort.cpp @@ -86,14 +86,10 @@ struct x86_dispatch { static bool quicksort(typename Tag::type *, npy_intp) { return false; } }; -template <> #if NPY_SIZEOF_LONG == 8 +template <> struct x86_dispatch { static bool quicksort(npy_long *start, npy_intp num) -#else -struct x86_dispatch { - static bool quicksort(npy_longlong *start, npy_intp num) -#endif { void (*dispfunc)(void *, npy_intp) = nullptr; NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_long); @@ -104,15 +100,36 @@ struct x86_dispatch { return false; } }; - template <> -#if NPY_SIZEOF_LONG == 8 struct x86_dispatch { static bool quicksort(npy_ulong *start, npy_intp num) -#else + { + void (*dispfunc)(void *, npy_intp) = nullptr; + NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_ulong); + if (dispfunc) { + (*dispfunc)(start, num); + return true; + } + return false; + } +}; +#elif NPY_SIZEOF_LONGLONG == 8 +template <> +struct x86_dispatch { + static bool quicksort(npy_longlong *start, npy_intp num) + { + void (*dispfunc)(void *, npy_intp) = nullptr; + NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_long); + if (dispfunc) { + (*dispfunc)(start, num); 
+ return true; + } + return false; + } +}; +template <> struct x86_dispatch { static bool quicksort(npy_ulonglong *start, npy_intp num) -#endif { void (*dispfunc)(void *, npy_intp) = nullptr; NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_ulong); @@ -123,6 +140,7 @@ struct x86_dispatch { return false; } }; +#endif template <> struct x86_dispatch { diff --git a/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp b/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp index fb328f547..521b198ce 100644 --- a/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp +++ b/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp @@ -18,21 +18,13 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(x86_quicksort_long)(void *arr, npy_intp arrsize) { -#if NPY_SIZEOF_LONG == 8 - avx512_qsort((npy_long*)arr, arrsize); -#else - avx512_qsort((npy_longlong*)arr, arrsize); -#endif + avx512_qsort((int64_t*)arr, arrsize); } NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(x86_quicksort_ulong)(void *arr, npy_intp arrsize) { -#if NPY_SIZEOF_LONG == 8 - avx512_qsort((npy_ulong*)arr, arrsize); -#else - avx512_qsort((npy_ulonglong*)arr, arrsize); -#endif + avx512_qsort((uint64_t*)arr, arrsize); } NPY_NO_EXPORT void -- cgit v1.2.1 From 73280879df00c9542909779bc9fbd99747681be7 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 5 Oct 2022 22:18:39 -0700 Subject: MAINT: Force inline bitonic network functions --- .../x86-simd-sort/src/avx512-16bit-qsort.hpp | 18 ++++++------- .../x86-simd-sort/src/avx512-32bit-qsort.hpp | 26 +++++++++---------- .../x86-simd-sort/src/avx512-64bit-qsort.hpp | 30 +++++++++++----------- 3 files changed, 37 insertions(+), 37 deletions(-) diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp index 1673eb5da..26a54e36b 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp @@ -236,7 +236,7 @@ struct vector { * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg */ template -static inline zmm_t sort_zmm_16bit(zmm_t zmm) +NPY_FINLINE zmm_t sort_zmm_16bit(zmm_t zmm) { // Level 1 zmm = cmp_merge( @@ -308,7 +308,7 @@ static inline zmm_t sort_zmm_16bit(zmm_t zmm) // Assumes zmm is bitonic and performs a recursive half cleaner template -static inline zmm_t bitonic_merge_zmm_16bit(zmm_t zmm) +NPY_FINLINE zmm_t bitonic_merge_zmm_16bit(zmm_t zmm) { // 1) half_cleaner[32]: compare 1-17, 2-18, 3-19 etc .. 
zmm = cmp_merge( @@ -340,7 +340,7 @@ static inline zmm_t bitonic_merge_zmm_16bit(zmm_t zmm) // Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner template -static inline void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2) +NPY_FINLINE void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2) { // 1) First step of a merging network: coex of zmm1 and zmm2 reversed zmm2 = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), zmm2); @@ -354,7 +354,7 @@ static inline void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2) // Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive // half cleaner template -static inline void bitonic_merge_four_zmm_16bit(zmm_t *zmm) +NPY_FINLINE void bitonic_merge_four_zmm_16bit(zmm_t *zmm) { zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), zmm[2]); zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), zmm[3]); @@ -375,7 +375,7 @@ static inline void bitonic_merge_four_zmm_16bit(zmm_t *zmm) } template -static inline void sort_32_16bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_32_16bit(type_t *arr, int32_t N) { typename vtype::opmask_t load_mask = ((0x1ull << N) - 0x1ull) & 0xFFFFFFFF; typename vtype::zmm_t zmm @@ -384,7 +384,7 @@ static inline void sort_32_16bit(type_t *arr, int32_t N) } template -static inline void sort_64_16bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_64_16bit(type_t *arr, int32_t N) { if (N <= 32) { sort_32_16bit(arr, N); @@ -403,7 +403,7 @@ static inline void sort_64_16bit(type_t *arr, int32_t N) } template -static inline void sort_128_16bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_128_16bit(type_t *arr, int32_t N) { if (N <= 64) { sort_64_16bit(arr, N); @@ -436,7 +436,7 @@ static inline void sort_128_16bit(type_t *arr, int32_t N) } template -static inline type_t +NPY_FINLINE type_t get_pivot_16bit(type_t *arr, const int64_t left, const int64_t right) { // median of 32 @@ -478,7 +478,7 @@ get_pivot_16bit(type_t *arr, const int64_t left, const int64_t right) } template -static inline void +static void qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) { /* diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp index cbc5368f0..7899d8522 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp @@ -336,7 +336,7 @@ struct vector { * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg */ template -static inline zmm_t sort_zmm_32bit(zmm_t zmm) +NPY_FINLINE zmm_t sort_zmm_32bit(zmm_t zmm) { zmm = cmp_merge( zmm, @@ -383,7 +383,7 @@ static inline zmm_t sort_zmm_32bit(zmm_t zmm) // Assumes zmm is bitonic and performs a recursive half cleaner template -static inline zmm_t bitonic_merge_zmm_32bit(zmm_t zmm) +NPY_FINLINE zmm_t bitonic_merge_zmm_32bit(zmm_t zmm) { // 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc .. 
zmm = cmp_merge( @@ -410,7 +410,7 @@ static inline zmm_t bitonic_merge_zmm_32bit(zmm_t zmm) // Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner template -static inline void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2) +NPY_FINLINE void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2) { // 1) First step of a merging network: coex of zmm1 and zmm2 reversed *zmm2 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), *zmm2); @@ -424,7 +424,7 @@ static inline void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2) // Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive // half cleaner template -static inline void bitonic_merge_four_zmm_32bit(zmm_t *zmm) +NPY_FINLINE void bitonic_merge_four_zmm_32bit(zmm_t *zmm) { zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[2]); zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[3]); @@ -445,7 +445,7 @@ static inline void bitonic_merge_four_zmm_32bit(zmm_t *zmm) } template -static inline void bitonic_merge_eight_zmm_32bit(zmm_t *zmm) +NPY_FINLINE void bitonic_merge_eight_zmm_32bit(zmm_t *zmm) { zmm_t zmm4r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[4]); zmm_t zmm5r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[5]); @@ -482,7 +482,7 @@ static inline void bitonic_merge_eight_zmm_32bit(zmm_t *zmm) } template -static inline void sort_16_32bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_16_32bit(type_t *arr, int32_t N) { typename vtype::opmask_t load_mask = (0x0001 << N) - 0x0001; typename vtype::zmm_t zmm @@ -491,7 +491,7 @@ static inline void sort_16_32bit(type_t *arr, int32_t N) } template -static inline void sort_32_32bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_32_32bit(type_t *arr, int32_t N) { if (N <= 16) { sort_16_32bit(arr, N); @@ -509,7 +509,7 @@ static inline void sort_32_32bit(type_t *arr, int32_t N) } template -static inline void sort_64_32bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_64_32bit(type_t *arr, int32_t N) { if (N <= 32) { sort_32_32bit(arr, N); @@ -540,7 +540,7 @@ static inline void sort_64_32bit(type_t *arr, int32_t N) } template -static inline void sort_128_32bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_128_32bit(type_t *arr, int32_t N) { if (N <= 64) { sort_64_32bit(arr, N); @@ -592,7 +592,7 @@ static inline void sort_128_32bit(type_t *arr, int32_t N) } template -static inline type_t +NPY_FINLINE type_t get_pivot_32bit(type_t *arr, const int64_t left, const int64_t right) { // median of 16 @@ -626,7 +626,7 @@ get_pivot_32bit(type_t *arr, const int64_t left, const int64_t right) } template -static inline void +static void qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) { /* @@ -655,7 +655,7 @@ qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) qsort_32bit_(arr, pivot_index, right, max_iters - 1); } -static inline int64_t replace_nan_with_inf(float *arr, int64_t arrsize) +NPY_FINLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize) { int64_t nan_count = 0; __mmask16 loadmask = 0xFFFF; @@ -671,7 +671,7 @@ static inline int64_t replace_nan_with_inf(float *arr, int64_t arrsize) return nan_count; } -static inline void +NPY_FINLINE void replace_inf_with_nan(float *arr, int64_t arrsize, int64_t nan_count) { for (int64_t ii = arrsize - 1; nan_count > 0; --ii) { diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp index f680c0704..62a7fa54e 
100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp @@ -331,7 +331,7 @@ struct vector { * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg */ template -static inline zmm_t sort_zmm_64bit(zmm_t zmm) +NPY_FINLINE zmm_t sort_zmm_64bit(zmm_t zmm) { zmm = cmp_merge( zmm, vtype::template shuffle(zmm), 0xAA); @@ -353,7 +353,7 @@ static inline zmm_t sort_zmm_64bit(zmm_t zmm) // Assumes zmm is bitonic and performs a recursive half cleaner template -static inline zmm_t bitonic_merge_zmm_64bit(zmm_t zmm) +NPY_FINLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm) { // 1) half_cleaner[8]: compare 0-4, 1-5, 2-6, 3-7 @@ -374,7 +374,7 @@ static inline zmm_t bitonic_merge_zmm_64bit(zmm_t zmm) // Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner template -static inline void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2) +NPY_FINLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2) { // 1) First step of a merging network: coex of zmm1 and zmm2 reversed zmm2 = vtype::permutexvar(rev_index, zmm2); @@ -388,7 +388,7 @@ static inline void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2) // Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive // half cleaner template -static inline void bitonic_merge_four_zmm_64bit(zmm_t *zmm) +NPY_FINLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm) { // 1) First step of a merging network zmm_t zmm2r = vtype::permutexvar(rev_index, zmm[2]); @@ -409,7 +409,7 @@ static inline void bitonic_merge_four_zmm_64bit(zmm_t *zmm) } template -static inline void bitonic_merge_eight_zmm_64bit(zmm_t *zmm) +NPY_FINLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm) { zmm_t zmm4r = vtype::permutexvar(rev_index, zmm[4]); zmm_t zmm5r = vtype::permutexvar(rev_index, zmm[5]); @@ -442,7 +442,7 @@ static inline void bitonic_merge_eight_zmm_64bit(zmm_t *zmm) } template -static inline void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm) +NPY_FINLINE void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm) { zmm_t zmm8r = vtype::permutexvar(rev_index, zmm[8]); zmm_t zmm9r = vtype::permutexvar(rev_index, zmm[9]); @@ -515,7 +515,7 @@ static inline void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm) } template -static inline void sort_8_64bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_8_64bit(type_t *arr, int32_t N) { typename vtype::opmask_t load_mask = (0x01 << N) - 0x01; typename vtype::zmm_t zmm @@ -524,7 +524,7 @@ static inline void sort_8_64bit(type_t *arr, int32_t N) } template -static inline void sort_16_64bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_16_64bit(type_t *arr, int32_t N) { if (N <= 8) { sort_8_64bit(arr, N); @@ -542,7 +542,7 @@ static inline void sort_16_64bit(type_t *arr, int32_t N) } template -static inline void sort_32_64bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_32_64bit(type_t *arr, int32_t N) { if (N <= 16) { sort_16_64bit(arr, N); @@ -573,7 +573,7 @@ static inline void sort_32_64bit(type_t *arr, int32_t N) } template -static inline void sort_64_64bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_64_64bit(type_t *arr, int32_t N) { if (N <= 32) { sort_32_64bit(arr, N); @@ -624,7 +624,7 @@ static inline void sort_64_64bit(type_t *arr, int32_t N) } template -static inline void sort_128_64bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_128_64bit(type_t *arr, int32_t N) { if (N <= 64) { sort_64_64bit(arr, N); @@ -714,7 +714,7 @@ static inline void sort_128_64bit(type_t *arr, int32_t N) } template -static 
inline type_t +NPY_FINLINE type_t get_pivot_64bit(type_t *arr, const int64_t left, const int64_t right) { // median of 8 @@ -735,7 +735,7 @@ get_pivot_64bit(type_t *arr, const int64_t left, const int64_t right) } template -static inline void +static void qsort_64bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) { /* @@ -764,7 +764,7 @@ qsort_64bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) qsort_64bit_(arr, pivot_index, right, max_iters - 1); } -static inline int64_t replace_nan_with_inf(double *arr, int64_t arrsize) +NPY_FINLINE int64_t replace_nan_with_inf(double *arr, int64_t arrsize) { int64_t nan_count = 0; __mmask8 loadmask = 0xFF; @@ -780,7 +780,7 @@ static inline int64_t replace_nan_with_inf(double *arr, int64_t arrsize) return nan_count; } -static inline void +NPY_FINLINE void replace_inf_with_nan(double *arr, int64_t arrsize, int64_t nan_count) { for (int64_t ii = arrsize - 1; nan_count > 0; --ii) { -- cgit v1.2.1 From fba06e75e4865168f5c3b6637c8a792fc1d9a2d7 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Thu, 6 Oct 2022 13:51:18 -0700 Subject: ENH: Use npyv_* for missing intrinsics in gcc-6 --- .../npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp | 20 ++++++++++---------- .../npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp | 18 +++++++++--------- .../npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp | 18 +++++++++--------- .../npysort/x86-simd-sort/src/avx512-common-qsort.h | 1 + 4 files changed, 29 insertions(+), 28 deletions(-) diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp index 26a54e36b..51cb4dbb0 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp @@ -57,7 +57,7 @@ struct vector { static opmask_t knot_opmask(opmask_t x) { - return _knot_mask32(x); + return npyv_not_b16(x); } static opmask_t ge(zmm_t x, zmm_t y) { @@ -106,16 +106,16 @@ struct vector { { zmm_t lo = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 0)); zmm_t hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 1)); - type_t lo_max = (type_t)_mm512_reduce_max_epi32(lo); - type_t hi_max = (type_t)_mm512_reduce_max_epi32(hi); + type_t lo_max = (type_t)npyv_reduce_max_s32(lo); + type_t hi_max = (type_t)npyv_reduce_max_s32(hi); return std::max(lo_max, hi_max); } static type_t reducemin(zmm_t v) { zmm_t lo = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 0)); zmm_t hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 1)); - type_t lo_min = (type_t)_mm512_reduce_min_epi32(lo); - type_t hi_min = (type_t)_mm512_reduce_min_epi32(hi); + type_t lo_min = (type_t)npyv_reduce_min_s32(lo); + type_t hi_min = (type_t)npyv_reduce_min_s32(hi); return std::min(lo_min, hi_min); } static zmm_t set1(type_t v) @@ -161,7 +161,7 @@ struct vector { //} static opmask_t knot_opmask(opmask_t x) { - return _knot_mask32(x); + return npyv_not_b16(x); } static opmask_t ge(zmm_t x, zmm_t y) { @@ -203,16 +203,16 @@ struct vector { { zmm_t lo = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 0)); zmm_t hi = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 1)); - type_t lo_max = (type_t)_mm512_reduce_max_epi32(lo); - type_t hi_max = (type_t)_mm512_reduce_max_epi32(hi); + type_t lo_max = (type_t)npyv_reduce_max_s32(lo); + type_t hi_max = (type_t)npyv_reduce_max_s32(hi); return std::max(lo_max, hi_max); } static type_t reducemin(zmm_t v) { zmm_t lo = 
_mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 0)); zmm_t hi = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 1)); - type_t lo_min = (type_t)_mm512_reduce_min_epi32(lo); - type_t hi_min = (type_t)_mm512_reduce_min_epi32(hi); + type_t lo_min = (type_t)npyv_reduce_min_s32(lo); + type_t hi_min = (type_t)npyv_reduce_min_s32(hi); return std::min(lo_min, hi_min); } static zmm_t set1(type_t v) diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp index 7899d8522..ac5bece7a 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp @@ -46,7 +46,7 @@ struct vector { static opmask_t knot_opmask(opmask_t x) { - return _knot_mask16(x); + return _mm512_knot(x); } static opmask_t ge(zmm_t x, zmm_t y) { @@ -96,11 +96,11 @@ struct vector { } static type_t reducemax(zmm_t v) { - return _mm512_reduce_max_epi32(v); + return npyv_reduce_max_s32(v); } static type_t reducemin(zmm_t v) { - return _mm512_reduce_min_epi32(v); + return npyv_reduce_min_s32(v); } static zmm_t set1(type_t v) { @@ -158,7 +158,7 @@ struct vector { } static opmask_t knot_opmask(opmask_t x) { - return _knot_mask16(x); + return _mm512_knot(x); } static opmask_t ge(zmm_t x, zmm_t y) { @@ -198,11 +198,11 @@ struct vector { } static type_t reducemax(zmm_t v) { - return _mm512_reduce_max_epu32(v); + return npyv_reduce_max_u32(v); } static type_t reducemin(zmm_t v) { - return _mm512_reduce_min_epu32(v); + return npyv_reduce_min_u32(v); } static zmm_t set1(type_t v) { @@ -250,7 +250,7 @@ struct vector { static opmask_t knot_opmask(opmask_t x) { - return _knot_mask16(x); + return _mm512_knot(x); } static opmask_t ge(zmm_t x, zmm_t y) { @@ -301,11 +301,11 @@ struct vector { } static type_t reducemax(zmm_t v) { - return _mm512_reduce_max_ps(v); + return npyv_reduce_max_f32(v); } static type_t reducemin(zmm_t v) { - return _mm512_reduce_min_ps(v); + return npyv_reduce_min_f32(v); } static zmm_t set1(type_t v) { diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp index 62a7fa54e..e6b7f8943 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp @@ -56,7 +56,7 @@ struct vector { static opmask_t knot_opmask(opmask_t x) { - return _knot_mask8(x); + return npyv_not_b64(x); } static opmask_t ge(zmm_t x, zmm_t y) { @@ -101,11 +101,11 @@ struct vector { } static type_t reducemax(zmm_t v) { - return _mm512_reduce_max_epi64(v); + return npyv_reduce_max_s64(v); } static type_t reducemin(zmm_t v) { - return _mm512_reduce_min_epi64(v); + return npyv_reduce_min_s64(v); } static zmm_t set1(type_t v) { @@ -163,7 +163,7 @@ struct vector { } static opmask_t knot_opmask(opmask_t x) { - return _knot_mask8(x); + return npyv_not_b64(x); } static opmask_t ge(zmm_t x, zmm_t y) { @@ -203,11 +203,11 @@ struct vector { } static type_t reducemax(zmm_t v) { - return _mm512_reduce_max_epu64(v); + return npyv_reduce_max_u64(v); } static type_t reducemin(zmm_t v) { - return _mm512_reduce_min_epu64(v); + return npyv_reduce_min_u64(v); } static zmm_t set1(type_t v) { @@ -260,7 +260,7 @@ struct vector { static opmask_t knot_opmask(opmask_t x) { - return _knot_mask8(x); + return npyv_not_b64(x); } static opmask_t ge(zmm_t x, zmm_t y) { @@ -305,11 +305,11 @@ struct vector { } static type_t reducemax(zmm_t v) { - return 
_mm512_reduce_max_pd(v); + return npyv_reduce_max_f64(v); } static type_t reducemin(zmm_t v) { - return _mm512_reduce_min_pd(v); + return npyv_reduce_min_f64(v); } static zmm_t set1(type_t v) { diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h b/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h index e713e1f20..56560185c 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h @@ -38,6 +38,7 @@ #include #include #include +#include "simd/simd.h" #define X86_SIMD_SORT_INFINITY std::numeric_limits::infinity() #define X86_SIMD_SORT_INFINITYF std::numeric_limits::infinity() -- cgit v1.2.1 From 92bd9902d4233d9f5befe05fd47bfb8b2d4e102a Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 10 Oct 2022 22:31:02 -0700 Subject: MAINT: Disable AVX-512 qsort on macOS and WIN32 --- numpy/core/setup.py | 18 +++++++++++++++++- numpy/core/src/npysort/quicksort.cpp | 8 ++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 912867709..fb91f8e68 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -68,6 +68,15 @@ class CallOnceOnly: out = copy.deepcopy(pickle.loads(self._check_complex)) return out +# Temporarily disable AVX512 sorting on WIN32 and macOS until we can figure +# out why the build fails +def enable_avx512_qsort(): + enable = True + platform = sysconfig.get_platform() + if "win32" in platform or "macos" in platform: + enable = False + return enable + def can_link_svml(): """SVML library is supported only on x86_64 architecture and currently only on linux @@ -484,6 +493,9 @@ def configuration(parent_package='',top_path=None): if can_link_svml(): moredefs.append(('NPY_CAN_LINK_SVML', 1)) + if enable_avx512_qsort(): + moredefs.append(('NPY_ENABLE_AVX512_QSORT', 1)) + # Use bogus stride debug aid to flush out bugs where users use # strides of dimensions with length 1 to index a full contiguous # array. 
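For context on how the NPY_ENABLE_AVX512_QSORT define added above is meant to combine with the existing CPU-dispatch machinery: the macro decides at build time whether the AVX-512 entry points are compiled in at all, and the dispatch layer still decides at run time whether the current CPU can actually use them. A small self-contained sketch of that two-level fallback, with purely illustrative names (none of these are NumPy symbols):

    #include <algorithm>
    #include <cstdio>

    // Filled in by a hypothetical CPU-dispatch layer only when the running
    // CPU supports AVX-512; stays null otherwise.
    static void (*dispatched_qsort)(int *, int) = nullptr;

    static bool try_avx512_qsort(int *p, int n)
    {
    #ifdef NPY_ENABLE_AVX512_QSORT          // level 1: compiled in by the build?
        if (dispatched_qsort != nullptr) {  // level 2: supported at run time?
            dispatched_qsort(p, n);
            return true;
        }
    #endif
        (void)p;
        (void)n;
        return false;
    }

    int main()
    {
        int a[] = {3, 1, 2};
        if (!try_avx512_qsort(a, 3))
            std::sort(a, a + 3);            // portable fallback
        std::printf("%d %d %d\n", a[0], a[1], a[2]);
        return 0;
    }

The quicksort.cpp hunks later in this commit follow the same shape: the #ifdef guards the x86-qsort-skx includes, and NPY_CPU_DISPATCH_CALL_XB either supplies dispfunc or leaves it null.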
@@ -943,7 +955,6 @@ def configuration(parent_package='',top_path=None): join('src', 'multiarray', 'usertypes.c'), join('src', 'multiarray', 'vdot.c'), join('src', 'common', 'npy_sort.h.src'), - join('src', 'npysort', 'x86-qsort-skx.dispatch.cpp'), join('src', 'npysort', 'quicksort.cpp'), join('src', 'npysort', 'mergesort.cpp'), join('src', 'npysort', 'timsort.cpp'), @@ -967,6 +978,11 @@ def configuration(parent_package='',top_path=None): join('src', 'npymath', 'arm64_exports.c'), ] + if enable_avx512_qsort(): + multiarray_src += [ + join('src', 'npysort', 'x86-qsort-skx.dispatch.cpp'), + ] + ####################################################################### # _multiarray_umath module - umath part # ####################################################################### diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp index 0674d25ac..363daf46f 100644 --- a/numpy/core/src/npysort/quicksort.cpp +++ b/numpy/core/src/npysort/quicksort.cpp @@ -55,13 +55,15 @@ #include "npysort_heapsort.h" #include "numpy_tag.h" -#include "x86-qsort-skx.h" #include #include +#ifdef NPY_ENABLE_AVX512_QSORT +#include "x86-qsort-skx.h" #ifndef NPY_DISABLE_OPTIMIZATION #include "x86-qsort-skx.dispatch.h" -#endif +#endif // NPY_DISABLE_OPTIMIZATION +#endif // NPY_ENABLE_AVX512_QSORT #define NOT_USED NPY_UNUSED(unused) /* @@ -86,6 +88,7 @@ struct x86_dispatch { static bool quicksort(typename Tag::type *, npy_intp) { return false; } }; +#ifdef NPY_ENABLE_AVX512_QSORT #if NPY_SIZEOF_LONG == 8 template <> struct x86_dispatch { @@ -197,6 +200,7 @@ struct x86_dispatch { return false; } }; +#endif // NPY_ENABLE_AVX512_QSORT } // namespace -- cgit v1.2.1 From 37c52d4757e71e4ce33483181302807d5f72340a Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 11 Oct 2022 10:10:51 -0700 Subject: BUG: Do not use a global static const __m512 variable --- numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp index e6b7f8943..d882d78d9 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp @@ -19,7 +19,6 @@ #define NETWORK_64BIT_2 0, 1, 2, 3, 4, 5, 6, 7 #define NETWORK_64BIT_3 5, 4, 7, 6, 1, 0, 3, 2 #define NETWORK_64BIT_4 3, 2, 1, 0, 7, 6, 5, 4 -static const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); template <> struct vector { @@ -333,6 +332,7 @@ struct vector { template NPY_FINLINE zmm_t sort_zmm_64bit(zmm_t zmm) { + const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); zmm = cmp_merge( zmm, vtype::template shuffle(zmm), 0xAA); zmm = cmp_merge( @@ -376,6 +376,7 @@ NPY_FINLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm) template NPY_FINLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2) { + const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); // 1) First step of a merging network: coex of zmm1 and zmm2 reversed zmm2 = vtype::permutexvar(rev_index, zmm2); zmm_t zmm3 = vtype::min(zmm1, zmm2); @@ -390,6 +391,7 @@ NPY_FINLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2) template NPY_FINLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm) { + const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); // 1) First step of a merging network zmm_t zmm2r = vtype::permutexvar(rev_index, zmm[2]); zmm_t zmm3r = vtype::permutexvar(rev_index, 
zmm[3]); @@ -411,6 +413,7 @@ NPY_FINLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm) template NPY_FINLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm) { + const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); zmm_t zmm4r = vtype::permutexvar(rev_index, zmm[4]); zmm_t zmm5r = vtype::permutexvar(rev_index, zmm[5]); zmm_t zmm6r = vtype::permutexvar(rev_index, zmm[6]); @@ -444,6 +447,7 @@ NPY_FINLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm) template NPY_FINLINE void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm) { + const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); zmm_t zmm8r = vtype::permutexvar(rev_index, zmm[8]); zmm_t zmm9r = vtype::permutexvar(rev_index, zmm[9]); zmm_t zmm10r = vtype::permutexvar(rev_index, zmm[10]); -- cgit v1.2.1 From 0d3feb0a829ea53d525487ea351055442b467c2c Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 11 Oct 2022 10:34:15 -0700 Subject: ENH: Add AVX-512 based 16-bit dtype sort This reverts commit 138ba7583253e7540a206e7f0df3edcd5e26c518. --- numpy/core/setup.py | 3 +- numpy/core/src/npysort/quicksort.cpp | 51 +++++++++++++++++++---- numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp | 29 +++++++++++++ numpy/core/src/npysort/x86-qsort-icl.h | 24 +++++++++++ 4 files changed, 97 insertions(+), 10 deletions(-) create mode 100644 numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp create mode 100644 numpy/core/src/npysort/x86-qsort-icl.h diff --git a/numpy/core/setup.py b/numpy/core/setup.py index fb91f8e68..c5d8564f9 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -73,7 +73,7 @@ class CallOnceOnly: def enable_avx512_qsort(): enable = True platform = sysconfig.get_platform() - if "win32" in platform or "macos" in platform: + if "win32" in platform: enable = False return enable @@ -981,6 +981,7 @@ def configuration(parent_package='',top_path=None): if enable_avx512_qsort(): multiarray_src += [ join('src', 'npysort', 'x86-qsort-skx.dispatch.cpp'), + join('src', 'npysort', 'x86-qsort-icl.dispatch.cpp'), ] ####################################################################### diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp index 363daf46f..6c90bf0bb 100644 --- a/numpy/core/src/npysort/quicksort.cpp +++ b/numpy/core/src/npysort/quicksort.cpp @@ -58,13 +58,6 @@ #include #include -#ifdef NPY_ENABLE_AVX512_QSORT -#include "x86-qsort-skx.h" -#ifndef NPY_DISABLE_OPTIMIZATION -#include "x86-qsort-skx.dispatch.h" -#endif // NPY_DISABLE_OPTIMIZATION -#endif // NPY_ENABLE_AVX512_QSORT - #define NOT_USED NPY_UNUSED(unused) /* * pushing largest partition has upper bound of log2(n) space @@ -88,7 +81,15 @@ struct x86_dispatch { static bool quicksort(typename Tag::type *, npy_intp) { return false; } }; +// Currently disabled on WIN32 only #ifdef NPY_ENABLE_AVX512_QSORT +#include "x86-qsort-skx.h" +#include "x86-qsort-icl.h" + +#ifndef NPY_DISABLE_OPTIMIZATION +#include "x86-qsort-skx.dispatch.h" +#endif + #if NPY_SIZEOF_LONG == 8 template <> struct x86_dispatch { @@ -143,7 +144,7 @@ struct x86_dispatch { return false; } }; -#endif +#endif // NPY_SIZEOF_LONG template <> struct x86_dispatch { @@ -200,9 +201,41 @@ struct x86_dispatch { return false; } }; + +#ifndef NPY_DISABLE_OPTIMIZATION +#include "x86-qsort-icl.dispatch.h" +#endif + +template <> +struct x86_dispatch { + static bool quicksort(npy_short *start, npy_intp num) + { + void (*dispfunc)(void *, npy_intp) = nullptr; + NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_short); + if (dispfunc) { + (*dispfunc)(start, num); + return true; + } 
+ return false; + } +}; + +template <> +struct x86_dispatch { + static bool quicksort(npy_ushort *start, npy_intp num) + { + void (*dispfunc)(void *, npy_intp) = nullptr; + NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_ushort); + if (dispfunc) { + (*dispfunc)(start, num); + return true; + } + return false; + } +}; #endif // NPY_ENABLE_AVX512_QSORT -} // namespace +} // end namespace template static int diff --git a/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp b/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp new file mode 100644 index 000000000..7d6dc331b --- /dev/null +++ b/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp @@ -0,0 +1,29 @@ +/*@targets + * $maxopt $keep_baseline avx512_icl + */ +// policy $keep_baseline is used to avoid skip building avx512_skx +// when its part of baseline features (--cpu-baseline), since +// 'baseline' option isn't specified within targets. + +#include "x86-qsort-icl.h" +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#ifdef NPY_HAVE_AVX512_ICL +#include "avx512-16bit-qsort.hpp" + +/*************************************** + * C > C++ dispatch + ***************************************/ +NPY_NO_EXPORT void +NPY_CPU_DISPATCH_CURFX(x86_quicksort_short)(void *arr, npy_intp arrsize) +{ + avx512_qsort((npy_short*)arr, arrsize); +} + +NPY_NO_EXPORT void +NPY_CPU_DISPATCH_CURFX(x86_quicksort_ushort)(void *arr, npy_intp arrsize) +{ + avx512_qsort((npy_ushort*)arr, arrsize); +} + +#endif // NPY_HAVE_AVX512_ICL diff --git a/numpy/core/src/npysort/x86-qsort-icl.h b/numpy/core/src/npysort/x86-qsort-icl.h new file mode 100644 index 000000000..2093e0bce --- /dev/null +++ b/numpy/core/src/npysort/x86-qsort-icl.h @@ -0,0 +1,24 @@ +#include "numpy/npy_common.h" + +#include "npy_cpu_dispatch.h" + +#ifndef NPY_NO_EXPORT +#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN +#endif + +#ifndef NPY_DISABLE_OPTIMIZATION +#include "x86-qsort-icl.dispatch.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_short, + (void *start, npy_intp num)) + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_ushort, + (void *start, npy_intp num)) + +#ifdef __cplusplus +} +#endif -- cgit v1.2.1 From c71352232164ab7ddc4142ebc1db694493b34ff9 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 11 Oct 2022 14:38:30 -0700 Subject: MAINT: Fix comment --- numpy/core/setup.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/numpy/core/setup.py b/numpy/core/setup.py index c5d8564f9..3ab00205f 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -68,12 +68,11 @@ class CallOnceOnly: out = copy.deepcopy(pickle.loads(self._check_complex)) return out -# Temporarily disable AVX512 sorting on WIN32 and macOS until we can figure -# out why the build fails +# Temporarily disable AVX512 sorting on WIN32 until we can figure +# out why it has test failures def enable_avx512_qsort(): enable = True - platform = sysconfig.get_platform() - if "win32" in platform: + if "win32" in sysconfig.get_platform(): enable = False return enable -- cgit v1.2.1 From e91610af8ed4b9ba200086c7edea2f9a1a4ca280 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 12 Oct 2022 14:02:08 -0700 Subject: MAINT: Use loadu intrinsic instead of set1_epi16 gcc-8 is missing the _mm512_set1_epi16 intrinsic --- .../x86-simd-sort/src/avx512-16bit-qsort.hpp | 170 +++++++++------------ 1 file changed, 74 insertions(+), 96 deletions(-) diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp 
b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp index 51cb4dbb0..5fcb8902d 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp @@ -15,24 +15,20 @@ * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) */ // ZMM register: 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -#define NETWORK_16BIT_1 \ - 24, 25, 26, 27, 28, 29, 30, 31, 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, \ - 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 -#define NETWORK_16BIT_2 \ - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, \ - 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -#define NETWORK_16BIT_3 \ - 27, 26, 25, 24, 31, 30, 29, 28, 19, 18, 17, 16, 23, 22, 21, 20, 11, 10, 9, \ - 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4 -#define NETWORK_16BIT_4 \ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, \ - 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 -#define NETWORK_16BIT_5 \ - 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24, 7, 6, 5, \ - 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 -#define NETWORK_16BIT_6 \ - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, \ - 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 +static const uint16_t network[6][32] + = {{7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, + 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24}, + {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16}, + {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11, + 20, 21, 22, 23, 16, 17, 18, 19, 28, 29, 30, 31, 24, 25, 26, 27}, + {31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, + {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, + 24, 25, 26, 27, 28, 29, 30, 31, 16, 17, 18, 19, 20, 21, 22, 23}, + {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}; + template <> struct vector { @@ -42,6 +38,10 @@ struct vector { using opmask_t = __mmask32; static const uint8_t numlanes = 32; + static zmm_t get_network(int index) + { + return _mm512_loadu_si512(&network[index-1][0]); + } static type_t type_max() { return X86_SIMD_SORT_MAX_INT16; @@ -54,20 +54,15 @@ struct vector { { return _mm512_set1_epi16(type_max()); } - static opmask_t knot_opmask(opmask_t x) { return npyv_not_b16(x); } + static opmask_t ge(zmm_t x, zmm_t y) { return _mm512_cmp_epi16_mask(x, y, _MM_CMPINT_NLT); } - //template - //static zmm_t i64gather(__m512i index, void const *base) - //{ - // return _mm512_i64gather_epi64(index, base, scale); - //} static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); @@ -141,6 +136,10 @@ struct vector { using opmask_t = __mmask32; static const uint8_t numlanes = 32; + static zmm_t get_network(int index) + { + return _mm512_loadu_si512(&network[index-1][0]); + } static type_t type_max() { return X86_SIMD_SORT_MAX_UINT16; @@ -152,13 +151,8 @@ struct vector { static zmm_t zmm_max() { return _mm512_set1_epi16(type_max()); - } // TODO: this should broadcast bits as is? 
+ } - //template - //static zmm_t i64gather(__m512i index, void const *base) - //{ - // return _mm512_i64gather_epi64(index, base, scale); - //} static opmask_t knot_opmask(opmask_t x) { return npyv_not_b16(x); @@ -254,9 +248,7 @@ NPY_FINLINE zmm_t sort_zmm_16bit(zmm_t zmm) 0xAAAAAAAA); // Level 3 zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_1), zmm), - 0xF0F0F0F0); + zmm, vtype::permutexvar(vtype::get_network(1), zmm), 0xF0F0F0F0); zmm = cmp_merge( zmm, vtype::template shuffle(zmm), @@ -267,13 +259,9 @@ NPY_FINLINE zmm_t sort_zmm_16bit(zmm_t zmm) 0xAAAAAAAA); // Level 4 zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_2), zmm), - 0xFF00FF00); + zmm, vtype::permutexvar(vtype::get_network(2), zmm), 0xFF00FF00); zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_3), zmm), - 0xF0F0F0F0); + zmm, vtype::permutexvar(vtype::get_network(3), zmm), 0xF0F0F0F0); zmm = cmp_merge( zmm, vtype::template shuffle(zmm), @@ -284,17 +272,11 @@ NPY_FINLINE zmm_t sort_zmm_16bit(zmm_t zmm) 0xAAAAAAAA); // Level 5 zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), zmm), - 0xFFFF0000); + zmm, vtype::permutexvar(vtype::get_network(4), zmm), 0xFFFF0000); zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_5), zmm), - 0xFF00FF00); + zmm, vtype::permutexvar(vtype::get_network(5), zmm), 0xFF00FF00); zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_3), zmm), - 0xF0F0F0F0); + zmm, vtype::permutexvar(vtype::get_network(3), zmm), 0xF0F0F0F0); zmm = cmp_merge( zmm, vtype::template shuffle(zmm), @@ -312,19 +294,13 @@ NPY_FINLINE zmm_t bitonic_merge_zmm_16bit(zmm_t zmm) { // 1) half_cleaner[32]: compare 1-17, 2-18, 3-19 etc .. zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_6), zmm), - 0xFFFF0000); + zmm, vtype::permutexvar(vtype::get_network(6), zmm), 0xFFFF0000); // 2) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc .. 
zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_5), zmm), - 0xFF00FF00); + zmm, vtype::permutexvar(vtype::get_network(5), zmm), 0xFF00FF00); // 3) half_cleaner[8] zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_3), zmm), - 0xF0F0F0F0); + zmm, vtype::permutexvar(vtype::get_network(3), zmm), 0xF0F0F0F0); // 3) half_cleaner[4] zmm = cmp_merge( zmm, @@ -343,7 +319,7 @@ template NPY_FINLINE void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2) { // 1) First step of a merging network: coex of zmm1 and zmm2 reversed - zmm2 = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), zmm2); + zmm2 = vtype::permutexvar(vtype::get_network(4), zmm2); zmm_t zmm3 = vtype::min(zmm1, zmm2); zmm_t zmm4 = vtype::max(zmm1, zmm2); // 2) Recursive half cleaner for each @@ -356,13 +332,13 @@ NPY_FINLINE void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2) template NPY_FINLINE void bitonic_merge_four_zmm_16bit(zmm_t *zmm) { - zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), zmm[2]); - zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), zmm[3]); + zmm_t zmm2r = vtype::permutexvar(vtype::get_network(4), zmm[2]); + zmm_t zmm3r = vtype::permutexvar(vtype::get_network(4), zmm[3]); zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r); zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r); - zmm_t zmm_t3 = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), + zmm_t zmm_t3 = vtype::permutexvar(vtype::get_network(4), vtype::max(zmm[1], zmm2r)); - zmm_t zmm_t4 = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), + zmm_t zmm_t4 = vtype::permutexvar(vtype::get_network(4), vtype::max(zmm[0], zmm3r)); zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2); zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2); @@ -436,43 +412,45 @@ NPY_FINLINE void sort_128_16bit(type_t *arr, int32_t N) } template -NPY_FINLINE type_t -get_pivot_16bit(type_t *arr, const int64_t left, const int64_t right) +NPY_FINLINE type_t get_pivot_16bit(type_t *arr, + const int64_t left, + const int64_t right) { // median of 32 int64_t size = (right - left) / 32; - __m512i rand_vec = _mm512_set_epi16(arr[left], - arr[left + size], - arr[left + 2 * size], - arr[left + 3 * size], - arr[left + 4 * size], - arr[left + 5 * size], - arr[left + 6 * size], - arr[left + 7 * size], - arr[left + 8 * size], - arr[left + 9 * size], - arr[left + 10 * size], - arr[left + 11 * size], - arr[left + 12 * size], - arr[left + 13 * size], - arr[left + 14 * size], - arr[left + 15 * size], - arr[left + 16 * size], - arr[left + 17 * size], - arr[left + 18 * size], - arr[left + 19 * size], - arr[left + 20 * size], - arr[left + 21 * size], - arr[left + 22 * size], - arr[left + 23 * size], - arr[left + 24 * size], - arr[left + 25 * size], - arr[left + 26 * size], - arr[left + 27 * size], - arr[left + 28 * size], - arr[left + 29 * size], - arr[left + 30 * size], - arr[left + 31 * size]); + type_t vec_arr[32] = {arr[left], + arr[left + size], + arr[left + 2 * size], + arr[left + 3 * size], + arr[left + 4 * size], + arr[left + 5 * size], + arr[left + 6 * size], + arr[left + 7 * size], + arr[left + 8 * size], + arr[left + 9 * size], + arr[left + 10 * size], + arr[left + 11 * size], + arr[left + 12 * size], + arr[left + 13 * size], + arr[left + 14 * size], + arr[left + 15 * size], + arr[left + 16 * size], + arr[left + 17 * size], + arr[left + 18 * size], + arr[left + 19 * size], + arr[left + 20 * size], + arr[left + 21 * size], + arr[left + 22 * size], + arr[left + 23 * size], + arr[left + 24 * size], + arr[left + 25 * size], + arr[left + 26 
* size], + arr[left + 27 * size], + arr[left + 28 * size], + arr[left + 29 * size], + arr[left + 30 * size], + arr[left + 31 * size]}; + __m512i rand_vec = _mm512_loadu_si512(vec_arr); __m512i sort = sort_zmm_16bit(rand_vec); return ((type_t *)&sort)[16]; } -- cgit v1.2.1 From 73aa5ea217818b93631cdf61ae0530b75e27303e Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Thu, 13 Oct 2022 22:48:08 -0700 Subject: TST: Add quicksort test coverage for all 16, 32, 64 bit dtypes --- numpy/core/tests/test_multiarray.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index 2d6f9c38c..0dc697bb0 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -9858,39 +9858,39 @@ class TestViewDtype: # Test various array sizes that hit different code paths in quicksort-avx512 -@pytest.mark.parametrize("N", [8, 16, 24, 32, 48, 64, 96, 128, 151, 191, - 256, 383, 512, 1023, 2047]) -def test_sort_float(N): +@pytest.mark.parametrize("N", np.arange(1,512)) +@pytest.mark.parametrize("dtype", ['e', 'f', 'd']) +def test_sort_float(N, dtype): # Regular data with nan sprinkled np.random.seed(42) - arr = -0.5 + np.random.sample(N).astype('f') + arr = -0.5 + np.random.sample(N).astype(dtype) arr[np.random.choice(arr.shape[0], 3)] = np.nan assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap')) # (2) with +INF - infarr = np.inf*np.ones(N, dtype='f') + infarr = np.inf*np.ones(N, dtype=dtype) infarr[np.random.choice(infarr.shape[0], 5)] = -1.0 assert_equal(np.sort(infarr, kind='quick'), np.sort(infarr, kind='heap')) # (3) with -INF - neginfarr = -np.inf*np.ones(N, dtype='f') + neginfarr = -np.inf*np.ones(N, dtype=dtype) neginfarr[np.random.choice(neginfarr.shape[0], 5)] = 1.0 assert_equal(np.sort(neginfarr, kind='quick'), np.sort(neginfarr, kind='heap')) # (4) with +/-INF - infarr = np.inf*np.ones(N, dtype='f') + infarr = np.inf*np.ones(N, dtype=dtype) infarr[np.random.choice(infarr.shape[0], (int)(N/2))] = -np.inf assert_equal(np.sort(infarr, kind='quick'), np.sort(infarr, kind='heap')) -def test_sort_int(): - # Random data with NPY_MAX_INT32 and NPY_MIN_INT32 sprinkled - rng = np.random.default_rng(42) - N = 2047 - minv = np.iinfo(np.int32).min - maxv = np.iinfo(np.int32).max - arr = rng.integers(low=minv, high=maxv, size=N).astype('int32') +@pytest.mark.parametrize("N", np.arange(1,512)) +@pytest.mark.parametrize("dtype", ['h', 'H', 'i', 'I', 'l', 'L']) +def test_sort_int(N, dtype): + # Random data with MAX and MIN sprinkled + minv = np.iinfo(dtype).min + maxv = np.iinfo(dtype).max + arr = np.random.randint(low=minv, high=maxv-1, size=N, dtype=dtype) arr[np.random.choice(arr.shape[0], 10)] = minv arr[np.random.choice(arr.shape[0], 10)] = maxv assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap')) -- cgit v1.2.1 From e9b39401f51351fc05712c207a78fecaac6c02fa Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Thu, 13 Oct 2022 22:51:27 -0700 Subject: MAINT: Fix linter errors --- numpy/core/tests/test_multiarray.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index 0dc697bb0..31c57f9bc 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -9858,7 +9858,7 @@ class TestViewDtype: # Test various array sizes that hit different code paths in quicksort-avx512 -@pytest.mark.parametrize("N", 
np.arange(1,512)) +@pytest.mark.parametrize("N", np.arange(1, 512)) @pytest.mark.parametrize("dtype", ['e', 'f', 'd']) def test_sort_float(N, dtype): # Regular data with nan sprinkled @@ -9884,7 +9884,7 @@ def test_sort_float(N, dtype): assert_equal(np.sort(infarr, kind='quick'), np.sort(infarr, kind='heap')) -@pytest.mark.parametrize("N", np.arange(1,512)) +@pytest.mark.parametrize("N", np.arange(1, 512)) @pytest.mark.parametrize("dtype", ['h', 'H', 'i', 'I', 'l', 'L']) def test_sort_int(N, dtype): # Random data with MAX and MIN sprinkled -- cgit v1.2.1 From df915b889125948cb2461c3bacf892b6143515f0 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 31 Oct 2022 11:05:36 -0700 Subject: ENH: Use AVX-512 qsort for half precision float --- numpy/core/src/npysort/quicksort.cpp | 15 ++ numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp | 6 + numpy/core/src/npysort/x86-qsort-icl.h | 3 + .../x86-simd-sort/src/avx512-16bit-qsort.hpp | 211 ++++++++++++++++++++- .../x86-simd-sort/src/avx512-32bit-qsort.hpp | 5 +- .../x86-simd-sort/src/avx512-64bit-qsort.hpp | 5 +- .../x86-simd-sort/src/avx512-common-qsort.h | 19 +- 7 files changed, 253 insertions(+), 11 deletions(-) diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp index 6c90bf0bb..f2cada873 100644 --- a/numpy/core/src/npysort/quicksort.cpp +++ b/numpy/core/src/npysort/quicksort.cpp @@ -206,6 +206,21 @@ struct x86_dispatch { #include "x86-qsort-icl.dispatch.h" #endif +template <> +struct x86_dispatch { + static bool quicksort(npy_half *start, npy_intp num) + { + void (*dispfunc)(void *, npy_intp) = nullptr; + NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_half); + if (dispfunc) { + (*dispfunc)(start, num); + return true; + } + return false; + } +}; + + template <> struct x86_dispatch { static bool quicksort(npy_short *start, npy_intp num) diff --git a/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp b/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp index 7d6dc331b..3dce8a9b4 100644 --- a/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp +++ b/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp @@ -14,6 +14,12 @@ /*************************************** * C > C++ dispatch ***************************************/ +NPY_NO_EXPORT void +NPY_CPU_DISPATCH_CURFX(x86_quicksort_half)(void *arr, npy_intp arrsize) +{ + avx512_qsort_fp16((npy_half*)arr, arrsize); +} + NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(x86_quicksort_short)(void *arr, npy_intp arrsize) { diff --git a/numpy/core/src/npysort/x86-qsort-icl.h b/numpy/core/src/npysort/x86-qsort-icl.h index 2093e0bce..92cef9cbc 100644 --- a/numpy/core/src/npysort/x86-qsort-icl.h +++ b/numpy/core/src/npysort/x86-qsort-icl.h @@ -13,6 +13,9 @@ #ifdef __cplusplus extern "C" { #endif +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_half, + (void *start, npy_intp num)) + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_short, (void *start, npy_intp num)) diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp index 5fcb8902d..190188ecc 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp @@ -29,6 +29,142 @@ static const uint16_t network[6][32] {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}; +struct float16 { + uint16_t val; +}; + +template <> +struct vector { + using type_t = uint16_t; + using 
zmm_t = __m512i; + using ymm_t = __m256i; + using opmask_t = __mmask32; + static const uint8_t numlanes = 32; + + static zmm_t get_network(int index) + { + return _mm512_loadu_si512(&network[index - 1][0]); + } + static type_t type_max() + { + return X86_SIMD_SORT_INFINITYH; + } + static type_t type_min() + { + return X86_SIMD_SORT_NEGINFINITYH; + } + static zmm_t zmm_max() + { + return _mm512_set1_epi16(type_max()); + } + static opmask_t knot_opmask(opmask_t x) + { + return _knot_mask32(x); + } + + static opmask_t ge(zmm_t x, zmm_t y) + { + zmm_t sign_x = _mm512_and_si512(x, _mm512_set1_epi16(0x8000)); + zmm_t sign_y = _mm512_and_si512(y, _mm512_set1_epi16(0x8000)); + zmm_t exp_x = _mm512_and_si512(x, _mm512_set1_epi16(0x7c00)); + zmm_t exp_y = _mm512_and_si512(y, _mm512_set1_epi16(0x7c00)); + zmm_t mant_x = _mm512_and_si512(x, _mm512_set1_epi16(0x3ff)); + zmm_t mant_y = _mm512_and_si512(y, _mm512_set1_epi16(0x3ff)); + + __mmask32 mask_ge = _mm512_cmp_epu16_mask(sign_x, sign_y, _MM_CMPINT_LT); // only greater than + __mmask32 sign_eq = _mm512_cmpeq_epu16_mask(sign_x, sign_y); + __mmask32 neg = _mm512_mask_cmpeq_epu16_mask(sign_eq, sign_x, _mm512_set1_epi16(0x8000)); // both numbers are -ve + + // compare exponents only if signs are equal: + mask_ge = mask_ge | _mm512_mask_cmp_epu16_mask(sign_eq, exp_x, exp_y, _MM_CMPINT_NLE); + // get mask for elements for which both sign and exponents are equal: + __mmask32 exp_eq = _mm512_mask_cmpeq_epu16_mask(sign_eq, exp_x, exp_y); + + // compare mantissa for elements for which both sign and expponent are equal: + mask_ge = mask_ge | _mm512_mask_cmp_epu16_mask(exp_eq, mant_x, mant_y, _MM_CMPINT_NLT); + return _kxor_mask32(mask_ge, neg); + } + static zmm_t loadu(void const *mem) + { + return _mm512_loadu_si512(mem); + } + static zmm_t max(zmm_t x, zmm_t y) + { + return _mm512_mask_mov_epi16(y, ge(x, y), x); + } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) + { + // AVX512_VBMI2 + return _mm512_mask_compressstoreu_epi16(mem, mask, x); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) + { + // AVX512BW + return _mm512_mask_loadu_epi16(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) + { + return _mm512_mask_mov_epi16(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_storeu_epi16(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) + { + return _mm512_mask_mov_epi16(x, ge(x, y), y); + } + static zmm_t permutexvar(__m512i idx, zmm_t zmm) + { + return _mm512_permutexvar_epi16(idx, zmm); + } + // Apparently this is a terrible for perf, npy_half_to_float seems to work + // better + //static float uint16_to_float(uint16_t val) + //{ + // // Ideally use _mm_loadu_si16, but its only gcc > 11.x + // // TODO: use inline ASM? 
https://godbolt.org/z/aGYvh7fMM + // __m128i xmm = _mm_maskz_loadu_epi16(0x01, &val); + // __m128 xmm2 = _mm_cvtph_ps(xmm); + // return _mm_cvtss_f32(xmm2); + //} + static type_t float_to_uint16(float val) + { + __m128 xmm = _mm_load_ss(&val); + __m128i xmm2 = _mm_cvtps_ph(xmm, _MM_FROUND_NO_EXC); + return _mm_extract_epi16(xmm2, 0); + } + static type_t reducemax(zmm_t v) + { + __m512 lo = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 0)); + __m512 hi = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 1)); + float lo_max = _mm512_reduce_max_ps(lo); + float hi_max = _mm512_reduce_max_ps(hi); + return float_to_uint16(std::max(lo_max, hi_max)); + } + static type_t reducemin(zmm_t v) + { + __m512 lo = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 0)); + __m512 hi = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 1)); + float lo_max = _mm512_reduce_min_ps(lo); + float hi_max = _mm512_reduce_min_ps(hi); + return float_to_uint16(std::min(lo_max, hi_max)); + } + static zmm_t set1(type_t v) + { + return _mm512_set1_epi16(v); + } + template + static zmm_t shuffle(zmm_t zmm) + { + zmm = _mm512_shufflehi_epi16(zmm, (_MM_PERM_ENUM)mask); + return _mm512_shufflelo_epi16(zmm, (_MM_PERM_ENUM)mask); + } + static void storeu(void *mem, zmm_t x) + { + return _mm512_storeu_si512(mem, x); + } +}; template <> struct vector { @@ -40,7 +176,7 @@ struct vector { static zmm_t get_network(int index) { - return _mm512_loadu_si512(&network[index-1][0]); + return _mm512_loadu_si512(&network[index - 1][0]); } static type_t type_max() { @@ -138,7 +274,7 @@ struct vector { static zmm_t get_network(int index) { - return _mm512_loadu_si512(&network[index-1][0]); + return _mm512_loadu_si512(&network[index - 1][0]); } static type_t type_max() { @@ -455,6 +591,38 @@ NPY_FINLINE type_t get_pivot_16bit(type_t *arr, return ((type_t *)&sort)[16]; } +template <> +bool comparison_func>(const uint16_t &a, const uint16_t &b) +{ + uint16_t signa = a & 0x8000, signb = b & 0x8000; + uint16_t expa = a & 0x7c00, expb = b & 0x7c00; + uint16_t manta = a & 0x3ff, mantb = b & 0x3ff; + if (signa != signb) { + // opposite signs + return a > b; + } + else if (signa > 0) { + // both -ve + if (expa != expb) { + return expa > expb; + } + else { + return manta > mantb; + } + } + else { + // both +ve + if (expa != expb) { + return expa < expb; + } + else { + return manta < mantb; + } + } + + //return npy_half_to_float(a) < npy_half_to_float(b); +} + template static void qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) @@ -463,7 +631,7 @@ qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) * Resort to std::sort if quicksort isnt making any progress */ if (max_iters <= 0) { - std::sort(arr + left, arr + right + 1); + std::sort(arr + left, arr + right + 1, comparison_func); return; } /* @@ -485,6 +653,33 @@ qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) qsort_16bit_(arr, pivot_index, right, max_iters - 1); } +NPY_FINLINE int64_t replace_nan_with_inf(uint16_t *arr, int64_t arrsize) +{ + int64_t nan_count = 0; + __mmask16 loadmask = 0xFFFF; + while (arrsize > 0) { + if (arrsize < 16) { loadmask = (0x0001 << arrsize) - 0x0001; } + __m256i in_zmm = _mm256_maskz_loadu_epi16(loadmask, arr); + __m512 in_zmm_asfloat = _mm512_cvtph_ps(in_zmm); + __mmask16 nanmask = _mm512_cmp_ps_mask( + in_zmm_asfloat, in_zmm_asfloat, _CMP_NEQ_UQ); + nan_count += _mm_popcnt_u32((int32_t)nanmask); + _mm256_mask_storeu_epi16(arr, nanmask, YMM_MAX_HALF); + arr += 16; + arrsize -= 16; + } + return nan_count; +} + 
+NPY_FINLINE void +replace_inf_with_nan(uint16_t *arr, int64_t arrsize, int64_t nan_count) +{ + for (int64_t ii = arrsize - 1; nan_count > 0; --ii) { + arr[ii] = 0xFFFF; + nan_count -= 1; + } +} + template <> void avx512_qsort(int16_t *arr, int64_t arrsize) { @@ -502,4 +697,14 @@ void avx512_qsort(uint16_t *arr, int64_t arrsize) arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); } } + +void avx512_qsort_fp16(uint16_t *arr, int64_t arrsize) +{ + if (arrsize > 1) { + int64_t nan_count = replace_nan_with_inf(arr, arrsize); + qsort_16bit_, uint16_t>( + arr, 0, arrsize - 1, 2 * (63 - __builtin_clzll(arrsize))); + replace_inf_with_nan(arr, arrsize, nan_count); + } +} #endif // __AVX512_QSORT_16BIT__ diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp index ac5bece7a..877849d6c 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp @@ -592,8 +592,9 @@ NPY_FINLINE void sort_128_32bit(type_t *arr, int32_t N) } template -NPY_FINLINE type_t -get_pivot_32bit(type_t *arr, const int64_t left, const int64_t right) +NPY_FINLINE type_t get_pivot_32bit(type_t *arr, + const int64_t left, + const int64_t right) { // median of 16 int64_t size = (right - left) / 16; diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp index d882d78d9..b067f5eda 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp @@ -718,8 +718,9 @@ NPY_FINLINE void sort_128_64bit(type_t *arr, int32_t N) } template -NPY_FINLINE type_t -get_pivot_64bit(type_t *arr, const int64_t left, const int64_t right) +NPY_FINLINE type_t get_pivot_64bit(type_t *arr, + const int64_t left, + const int64_t right) { // median of 8 int64_t size = (right - left) / 8; diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h b/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h index 56560185c..639d2f788 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h @@ -33,15 +33,17 @@ * */ +#include "simd/simd.h" #include #include #include #include #include -#include "simd/simd.h" #define X86_SIMD_SORT_INFINITY std::numeric_limits::infinity() #define X86_SIMD_SORT_INFINITYF std::numeric_limits::infinity() +#define X86_SIMD_SORT_INFINITYH 0x7c00 +#define X86_SIMD_SORT_NEGINFINITYH 0xfc00 #define X86_SIMD_SORT_MAX_UINT16 std::numeric_limits::max() #define X86_SIMD_SORT_MAX_INT16 std::numeric_limits::max() #define X86_SIMD_SORT_MIN_INT16 std::numeric_limits::min() @@ -57,6 +59,7 @@ #define ZMM_MAX_FLOAT _mm512_set1_ps(X86_SIMD_SORT_INFINITYF) #define ZMM_MAX_UINT _mm512_set1_epi32(X86_SIMD_SORT_MAX_UINT32) #define ZMM_MAX_INT _mm512_set1_epi32(X86_SIMD_SORT_MAX_INT32) +#define YMM_MAX_HALF _mm256_set1_epi16(X86_SIMD_SORT_INFINITYH) #define ZMM_MAX_UINT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_UINT16) #define ZMM_MAX_INT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_INT16) #define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d @@ -67,6 +70,12 @@ struct vector; template void avx512_qsort(T *arr, int64_t arrsize); +template +bool comparison_func(const T &a, const T &b) +{ + return a < b; +} + /* * COEX == Compare and Exchange two registers by swapping min and max values */ @@ -127,9 +136,11 @@ static 
inline int64_t partition_avx512(type_t *arr, { /* make array length divisible by vtype::numlanes , shortening the array */ for (int32_t i = (right - left) % vtype::numlanes; i > 0; --i) { - *smallest = std::min(*smallest, arr[left]); - *biggest = std::max(*biggest, arr[left]); - if (arr[left] > pivot) { std::swap(arr[left], arr[--right]); } + *smallest = std::min(*smallest, arr[left], comparison_func); + *biggest = std::max(*biggest, arr[left], comparison_func); + if (!comparison_func(arr[left], pivot)) { + std::swap(arr[left], arr[--right]); + } else { ++left; } -- cgit v1.2.1 From 361a1a649b298e44e4233f2fec8276674248956d Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 1 Nov 2022 22:12:45 -0700 Subject: BENCH: Add float16 to sort benchmarks --- benchmarks/benchmarks/bench_function_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmarks/bench_function_base.py b/benchmarks/benchmarks/bench_function_base.py index 2e44ff76b..cc37bef39 100644 --- a/benchmarks/benchmarks/bench_function_base.py +++ b/benchmarks/benchmarks/bench_function_base.py @@ -248,7 +248,7 @@ class Sort(Benchmark): # In NumPy 1.17 and newer, 'merge' can be one of several # stable sorts, it isn't necessarily merge sort. ['quick', 'merge', 'heap'], - ['float64', 'int64', 'float32', 'uint32', 'int32', 'int16'], + ['float64', 'int64', 'float32', 'uint32', 'int32', 'int16', 'float16'], [ ('random',), ('ordered',), -- cgit v1.2.1 From 47ed2780364a270a427d74f0db642bcd4a37e6f5 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 1 Nov 2022 22:13:24 -0700 Subject: TST: Add test for float16 quicksort --- numpy/core/tests/test_multiarray.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index 31c57f9bc..796ee07c3 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -9883,6 +9883,13 @@ def test_sort_float(N, dtype): infarr[np.random.choice(infarr.shape[0], (int)(N/2))] = -np.inf assert_equal(np.sort(infarr, kind='quick'), np.sort(infarr, kind='heap')) +def test_sort_float16(): + arr = np.arange(65536, dtype=np.int16) + temp = np.frombuffer(arr.tobytes(), dtype=np.float16) + data = np.copy(temp) + np.random.shuffle(data) + data_backup = data + assert_equal(np.sort(data, kind='quick'), np.sort(data_backup, kind='heap')) @pytest.mark.parametrize("N", np.arange(1, 512)) @pytest.mark.parametrize("dtype", ['h', 'H', 'i', 'I', 'l', 'L']) -- cgit v1.2.1 From f4c835332426d518c9e99bd00b45e8f5f453d6c8 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 2 Nov 2022 14:17:27 -0700 Subject: Fix linter errors' --- numpy/core/tests/test_multiarray.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index 796ee07c3..1d4de8e6e 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -9889,7 +9889,9 @@ def test_sort_float16(): data = np.copy(temp) np.random.shuffle(data) data_backup = data - assert_equal(np.sort(data, kind='quick'), np.sort(data_backup, kind='heap')) + assert_equal(np.sort(data, kind='quick'), + np.sort(data_backup, kind='heap')) + @pytest.mark.parametrize("N", np.arange(1, 512)) @pytest.mark.parametrize("dtype", ['h', 'H', 'i', 'I', 'l', 'L']) -- cgit v1.2.1 From 6f2ea90d4d7f69ccc3c6389ef70d50652f3064b7 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 2 Nov 2022 16:05:33 -0700 Subject: BUG: Use 
log2 instead a builtin --- numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp index 190188ecc..ce8637e32 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp @@ -703,7 +703,7 @@ void avx512_qsort_fp16(uint16_t *arr, int64_t arrsize) if (arrsize > 1) { int64_t nan_count = replace_nan_with_inf(arr, arrsize); qsort_16bit_, uint16_t>( - arr, 0, arrsize - 1, 2 * (63 - __builtin_clzll(arrsize))); + arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); replace_inf_with_nan(arr, arrsize, nan_count); } } -- cgit v1.2.1 From 7c6615a229ec303b504dcacc695dabb4502e28b4 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 30 Jan 2023 13:03:39 -0800 Subject: Adding x86-simd-sort as submodule --- .gitmodules | 3 + numpy/core/src/npysort/x86-simd-sort | 1 + .../x86-simd-sort/src/avx512-16bit-qsort.hpp | 710 ------------------ .../x86-simd-sort/src/avx512-32bit-qsort.hpp | 713 ------------------ .../x86-simd-sort/src/avx512-64bit-qsort.hpp | 825 --------------------- .../x86-simd-sort/src/avx512-common-qsort.h | 230 ------ 6 files changed, 4 insertions(+), 2478 deletions(-) create mode 160000 numpy/core/src/npysort/x86-simd-sort delete mode 100644 numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp delete mode 100644 numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp delete mode 100644 numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp delete mode 100644 numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h diff --git a/.gitmodules b/.gitmodules index 1ea274daf..d849a3caf 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "numpy/core/src/umath/svml"] path = numpy/core/src/umath/svml url = https://github.com/numpy/SVML.git +[submodule "numpy/core/src/npysort/x86-simd-sort"] + path = numpy/core/src/npysort/x86-simd-sort + url = https://github.com/intel/x86-simd-sort diff --git a/numpy/core/src/npysort/x86-simd-sort b/numpy/core/src/npysort/x86-simd-sort new file mode 160000 index 000000000..0f1023bd0 --- /dev/null +++ b/numpy/core/src/npysort/x86-simd-sort @@ -0,0 +1 @@ +Subproject commit 0f1023bd0ffdabfe22883b85d4dfe55a6ed6ad3f diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp deleted file mode 100644 index ce8637e32..000000000 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp +++ /dev/null @@ -1,710 +0,0 @@ -/******************************************************************* - * Copyright (C) 2022 Intel Corporation - * SPDX-License-Identifier: BSD-3-Clause - * Authors: Raghuveer Devulapalli - * ****************************************************************/ - -#ifndef __AVX512_QSORT_16BIT__ -#define __AVX512_QSORT_16BIT__ - -#include "avx512-common-qsort.h" - -/* - * Constants used in sorting 32 elements in a ZMM registers. 
Based on Bitonic - * sorting network (see - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) - */ -// ZMM register: 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -static const uint16_t network[6][32] - = {{7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, - 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24}, - {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16}, - {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11, - 20, 21, 22, 23, 16, 17, 18, 19, 28, 29, 30, 31, 24, 25, 26, 27}, - {31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, - {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, - 24, 25, 26, 27, 28, 29, 30, 31, 16, 17, 18, 19, 20, 21, 22, 23}, - {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}; - -struct float16 { - uint16_t val; -}; - -template <> -struct vector { - using type_t = uint16_t; - using zmm_t = __m512i; - using ymm_t = __m256i; - using opmask_t = __mmask32; - static const uint8_t numlanes = 32; - - static zmm_t get_network(int index) - { - return _mm512_loadu_si512(&network[index - 1][0]); - } - static type_t type_max() - { - return X86_SIMD_SORT_INFINITYH; - } - static type_t type_min() - { - return X86_SIMD_SORT_NEGINFINITYH; - } - static zmm_t zmm_max() - { - return _mm512_set1_epi16(type_max()); - } - static opmask_t knot_opmask(opmask_t x) - { - return _knot_mask32(x); - } - - static opmask_t ge(zmm_t x, zmm_t y) - { - zmm_t sign_x = _mm512_and_si512(x, _mm512_set1_epi16(0x8000)); - zmm_t sign_y = _mm512_and_si512(y, _mm512_set1_epi16(0x8000)); - zmm_t exp_x = _mm512_and_si512(x, _mm512_set1_epi16(0x7c00)); - zmm_t exp_y = _mm512_and_si512(y, _mm512_set1_epi16(0x7c00)); - zmm_t mant_x = _mm512_and_si512(x, _mm512_set1_epi16(0x3ff)); - zmm_t mant_y = _mm512_and_si512(y, _mm512_set1_epi16(0x3ff)); - - __mmask32 mask_ge = _mm512_cmp_epu16_mask(sign_x, sign_y, _MM_CMPINT_LT); // only greater than - __mmask32 sign_eq = _mm512_cmpeq_epu16_mask(sign_x, sign_y); - __mmask32 neg = _mm512_mask_cmpeq_epu16_mask(sign_eq, sign_x, _mm512_set1_epi16(0x8000)); // both numbers are -ve - - // compare exponents only if signs are equal: - mask_ge = mask_ge | _mm512_mask_cmp_epu16_mask(sign_eq, exp_x, exp_y, _MM_CMPINT_NLE); - // get mask for elements for which both sign and exponents are equal: - __mmask32 exp_eq = _mm512_mask_cmpeq_epu16_mask(sign_eq, exp_x, exp_y); - - // compare mantissa for elements for which both sign and expponent are equal: - mask_ge = mask_ge | _mm512_mask_cmp_epu16_mask(exp_eq, mant_x, mant_y, _MM_CMPINT_NLT); - return _kxor_mask32(mask_ge, neg); - } - static zmm_t loadu(void const *mem) - { - return _mm512_loadu_si512(mem); - } - static zmm_t max(zmm_t x, zmm_t y) - { - return _mm512_mask_mov_epi16(y, ge(x, y), x); - } - static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) - { - // AVX512_VBMI2 - return _mm512_mask_compressstoreu_epi16(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) - { - // AVX512BW - return _mm512_mask_loadu_epi16(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) - { - return _mm512_mask_mov_epi16(x, mask, y); - } - static void mask_storeu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_storeu_epi16(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t 
y) - { - return _mm512_mask_mov_epi16(x, ge(x, y), y); - } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_epi16(idx, zmm); - } - // Apparently this is a terrible for perf, npy_half_to_float seems to work - // better - //static float uint16_to_float(uint16_t val) - //{ - // // Ideally use _mm_loadu_si16, but its only gcc > 11.x - // // TODO: use inline ASM? https://godbolt.org/z/aGYvh7fMM - // __m128i xmm = _mm_maskz_loadu_epi16(0x01, &val); - // __m128 xmm2 = _mm_cvtph_ps(xmm); - // return _mm_cvtss_f32(xmm2); - //} - static type_t float_to_uint16(float val) - { - __m128 xmm = _mm_load_ss(&val); - __m128i xmm2 = _mm_cvtps_ph(xmm, _MM_FROUND_NO_EXC); - return _mm_extract_epi16(xmm2, 0); - } - static type_t reducemax(zmm_t v) - { - __m512 lo = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 0)); - __m512 hi = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 1)); - float lo_max = _mm512_reduce_max_ps(lo); - float hi_max = _mm512_reduce_max_ps(hi); - return float_to_uint16(std::max(lo_max, hi_max)); - } - static type_t reducemin(zmm_t v) - { - __m512 lo = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 0)); - __m512 hi = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 1)); - float lo_max = _mm512_reduce_min_ps(lo); - float hi_max = _mm512_reduce_min_ps(hi); - return float_to_uint16(std::min(lo_max, hi_max)); - } - static zmm_t set1(type_t v) - { - return _mm512_set1_epi16(v); - } - template - static zmm_t shuffle(zmm_t zmm) - { - zmm = _mm512_shufflehi_epi16(zmm, (_MM_PERM_ENUM)mask); - return _mm512_shufflelo_epi16(zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_si512(mem, x); - } -}; - -template <> -struct vector { - using type_t = int16_t; - using zmm_t = __m512i; - using ymm_t = __m256i; - using opmask_t = __mmask32; - static const uint8_t numlanes = 32; - - static zmm_t get_network(int index) - { - return _mm512_loadu_si512(&network[index - 1][0]); - } - static type_t type_max() - { - return X86_SIMD_SORT_MAX_INT16; - } - static type_t type_min() - { - return X86_SIMD_SORT_MIN_INT16; - } - static zmm_t zmm_max() - { - return _mm512_set1_epi16(type_max()); - } - static opmask_t knot_opmask(opmask_t x) - { - return npyv_not_b16(x); - } - - static opmask_t ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_epi16_mask(x, y, _MM_CMPINT_NLT); - } - static zmm_t loadu(void const *mem) - { - return _mm512_loadu_si512(mem); - } - static zmm_t max(zmm_t x, zmm_t y) - { - return _mm512_max_epi16(x, y); - } - static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) - { - // AVX512_VBMI2 - return _mm512_mask_compressstoreu_epi16(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) - { - // AVX512BW - return _mm512_mask_loadu_epi16(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) - { - return _mm512_mask_mov_epi16(x, mask, y); - } - static void mask_storeu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_storeu_epi16(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) - { - return _mm512_min_epi16(x, y); - } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_epi16(idx, zmm); - } - static type_t reducemax(zmm_t v) - { - zmm_t lo = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 0)); - zmm_t hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 1)); - type_t lo_max = (type_t)npyv_reduce_max_s32(lo); - type_t hi_max = (type_t)npyv_reduce_max_s32(hi); - return std::max(lo_max, hi_max); - } - static type_t 
reducemin(zmm_t v) - { - zmm_t lo = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 0)); - zmm_t hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 1)); - type_t lo_min = (type_t)npyv_reduce_min_s32(lo); - type_t hi_min = (type_t)npyv_reduce_min_s32(hi); - return std::min(lo_min, hi_min); - } - static zmm_t set1(type_t v) - { - return _mm512_set1_epi16(v); - } - template - static zmm_t shuffle(zmm_t zmm) - { - zmm = _mm512_shufflehi_epi16(zmm, (_MM_PERM_ENUM)mask); - return _mm512_shufflelo_epi16(zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_si512(mem, x); - } -}; -template <> -struct vector { - using type_t = uint16_t; - using zmm_t = __m512i; - using ymm_t = __m256i; - using opmask_t = __mmask32; - static const uint8_t numlanes = 32; - - static zmm_t get_network(int index) - { - return _mm512_loadu_si512(&network[index - 1][0]); - } - static type_t type_max() - { - return X86_SIMD_SORT_MAX_UINT16; - } - static type_t type_min() - { - return 0; - } - static zmm_t zmm_max() - { - return _mm512_set1_epi16(type_max()); - } - - static opmask_t knot_opmask(opmask_t x) - { - return npyv_not_b16(x); - } - static opmask_t ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_epu16_mask(x, y, _MM_CMPINT_NLT); - } - static zmm_t loadu(void const *mem) - { - return _mm512_loadu_si512(mem); - } - static zmm_t max(zmm_t x, zmm_t y) - { - return _mm512_max_epu16(x, y); - } - static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_compressstoreu_epi16(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) - { - return _mm512_mask_loadu_epi16(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) - { - return _mm512_mask_mov_epi16(x, mask, y); - } - static void mask_storeu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_storeu_epi16(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) - { - return _mm512_min_epu16(x, y); - } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_epi16(idx, zmm); - } - static type_t reducemax(zmm_t v) - { - zmm_t lo = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 0)); - zmm_t hi = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 1)); - type_t lo_max = (type_t)npyv_reduce_max_s32(lo); - type_t hi_max = (type_t)npyv_reduce_max_s32(hi); - return std::max(lo_max, hi_max); - } - static type_t reducemin(zmm_t v) - { - zmm_t lo = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 0)); - zmm_t hi = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 1)); - type_t lo_min = (type_t)npyv_reduce_min_s32(lo); - type_t hi_min = (type_t)npyv_reduce_min_s32(hi); - return std::min(lo_min, hi_min); - } - static zmm_t set1(type_t v) - { - return _mm512_set1_epi16(v); - } - template - static zmm_t shuffle(zmm_t zmm) - { - zmm = _mm512_shufflehi_epi16(zmm, (_MM_PERM_ENUM)mask); - return _mm512_shufflelo_epi16(zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_si512(mem, x); - } -}; - -/* - * Assumes zmm is random and performs a full sorting network defined in - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg - */ -template -NPY_FINLINE zmm_t sort_zmm_16bit(zmm_t zmm) -{ - // Level 1 - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xAAAAAAAA); - // Level 2 - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xCCCCCCCC); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xAAAAAAAA); - // Level 3 - zmm = 
cmp_merge( - zmm, vtype::permutexvar(vtype::get_network(1), zmm), 0xF0F0F0F0); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xCCCCCCCC); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xAAAAAAAA); - // Level 4 - zmm = cmp_merge( - zmm, vtype::permutexvar(vtype::get_network(2), zmm), 0xFF00FF00); - zmm = cmp_merge( - zmm, vtype::permutexvar(vtype::get_network(3), zmm), 0xF0F0F0F0); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xCCCCCCCC); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xAAAAAAAA); - // Level 5 - zmm = cmp_merge( - zmm, vtype::permutexvar(vtype::get_network(4), zmm), 0xFFFF0000); - zmm = cmp_merge( - zmm, vtype::permutexvar(vtype::get_network(5), zmm), 0xFF00FF00); - zmm = cmp_merge( - zmm, vtype::permutexvar(vtype::get_network(3), zmm), 0xF0F0F0F0); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xCCCCCCCC); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xAAAAAAAA); - return zmm; -} - -// Assumes zmm is bitonic and performs a recursive half cleaner -template -NPY_FINLINE zmm_t bitonic_merge_zmm_16bit(zmm_t zmm) -{ - // 1) half_cleaner[32]: compare 1-17, 2-18, 3-19 etc .. - zmm = cmp_merge( - zmm, vtype::permutexvar(vtype::get_network(6), zmm), 0xFFFF0000); - // 2) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc .. - zmm = cmp_merge( - zmm, vtype::permutexvar(vtype::get_network(5), zmm), 0xFF00FF00); - // 3) half_cleaner[8] - zmm = cmp_merge( - zmm, vtype::permutexvar(vtype::get_network(3), zmm), 0xF0F0F0F0); - // 3) half_cleaner[4] - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xCCCCCCCC); - // 3) half_cleaner[2] - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xAAAAAAAA); - return zmm; -} - -// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner -template -NPY_FINLINE void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2) -{ - // 1) First step of a merging network: coex of zmm1 and zmm2 reversed - zmm2 = vtype::permutexvar(vtype::get_network(4), zmm2); - zmm_t zmm3 = vtype::min(zmm1, zmm2); - zmm_t zmm4 = vtype::max(zmm1, zmm2); - // 2) Recursive half cleaner for each - zmm1 = bitonic_merge_zmm_16bit(zmm3); - zmm2 = bitonic_merge_zmm_16bit(zmm4); -} - -// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive -// half cleaner -template -NPY_FINLINE void bitonic_merge_four_zmm_16bit(zmm_t *zmm) -{ - zmm_t zmm2r = vtype::permutexvar(vtype::get_network(4), zmm[2]); - zmm_t zmm3r = vtype::permutexvar(vtype::get_network(4), zmm[3]); - zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r); - zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r); - zmm_t zmm_t3 = vtype::permutexvar(vtype::get_network(4), - vtype::max(zmm[1], zmm2r)); - zmm_t zmm_t4 = vtype::permutexvar(vtype::get_network(4), - vtype::max(zmm[0], zmm3r)); - zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2); - zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2); - zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4); - zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4); - zmm[0] = bitonic_merge_zmm_16bit(zmm0); - zmm[1] = bitonic_merge_zmm_16bit(zmm1); - zmm[2] = bitonic_merge_zmm_16bit(zmm2); - zmm[3] = bitonic_merge_zmm_16bit(zmm3); -} - -template -NPY_FINLINE void sort_32_16bit(type_t *arr, int32_t N) -{ - typename vtype::opmask_t load_mask = ((0x1ull << N) - 0x1ull) & 0xFFFFFFFF; - typename vtype::zmm_t zmm - = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr); - vtype::mask_storeu(arr, load_mask, sort_zmm_16bit(zmm)); -} - -template -NPY_FINLINE void sort_64_16bit(type_t *arr, int32_t N) -{ - if (N <= 32) { - sort_32_16bit(arr, N); 
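        // The call above covers N <= 32 entirely within one ZMM register
        // (32 x 16-bit lanes): mask_loadu pads lanes >= N with
        // vtype::zmm_max() sentinels, a single pass of sort_zmm_16bit sorts
        // all 32 lanes, and mask_storeu writes only the first N lanes back.
        // For example, N == 20 gives load_mask == (1ull << 20) - 1 == 0xFFFFF.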
- return; - } - using zmm_t = typename vtype::zmm_t; - typename vtype::opmask_t load_mask - = ((0x1ull << (N - 32)) - 0x1ull) & 0xFFFFFFFF; - zmm_t zmm1 = vtype::loadu(arr); - zmm_t zmm2 = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr + 32); - zmm1 = sort_zmm_16bit(zmm1); - zmm2 = sort_zmm_16bit(zmm2); - bitonic_merge_two_zmm_16bit(zmm1, zmm2); - vtype::storeu(arr, zmm1); - vtype::mask_storeu(arr + 32, load_mask, zmm2); -} - -template -NPY_FINLINE void sort_128_16bit(type_t *arr, int32_t N) -{ - if (N <= 64) { - sort_64_16bit(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - using opmask_t = typename vtype::opmask_t; - zmm_t zmm[4]; - zmm[0] = vtype::loadu(arr); - zmm[1] = vtype::loadu(arr + 32); - opmask_t load_mask1 = 0xFFFFFFFF, load_mask2 = 0xFFFFFFFF; - if (N != 128) { - uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull; - load_mask1 = combined_mask & 0xFFFFFFFF; - load_mask2 = (combined_mask >> 32) & 0xFFFFFFFF; - } - zmm[2] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 64); - zmm[3] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 96); - zmm[0] = sort_zmm_16bit(zmm[0]); - zmm[1] = sort_zmm_16bit(zmm[1]); - zmm[2] = sort_zmm_16bit(zmm[2]); - zmm[3] = sort_zmm_16bit(zmm[3]); - bitonic_merge_two_zmm_16bit(zmm[0], zmm[1]); - bitonic_merge_two_zmm_16bit(zmm[2], zmm[3]); - bitonic_merge_four_zmm_16bit(zmm); - vtype::storeu(arr, zmm[0]); - vtype::storeu(arr + 32, zmm[1]); - vtype::mask_storeu(arr + 64, load_mask1, zmm[2]); - vtype::mask_storeu(arr + 96, load_mask2, zmm[3]); -} - -template -NPY_FINLINE type_t get_pivot_16bit(type_t *arr, - const int64_t left, - const int64_t right) -{ - // median of 32 - int64_t size = (right - left) / 32; - type_t vec_arr[32] = {arr[left], - arr[left + size], - arr[left + 2 * size], - arr[left + 3 * size], - arr[left + 4 * size], - arr[left + 5 * size], - arr[left + 6 * size], - arr[left + 7 * size], - arr[left + 8 * size], - arr[left + 9 * size], - arr[left + 10 * size], - arr[left + 11 * size], - arr[left + 12 * size], - arr[left + 13 * size], - arr[left + 14 * size], - arr[left + 15 * size], - arr[left + 16 * size], - arr[left + 17 * size], - arr[left + 18 * size], - arr[left + 19 * size], - arr[left + 20 * size], - arr[left + 21 * size], - arr[left + 22 * size], - arr[left + 23 * size], - arr[left + 24 * size], - arr[left + 25 * size], - arr[left + 26 * size], - arr[left + 27 * size], - arr[left + 28 * size], - arr[left + 29 * size], - arr[left + 30 * size], - arr[left + 31 * size]}; - __m512i rand_vec = _mm512_loadu_si512(vec_arr); - __m512i sort = sort_zmm_16bit(rand_vec); - return ((type_t *)&sort)[16]; -} - -template <> -bool comparison_func>(const uint16_t &a, const uint16_t &b) -{ - uint16_t signa = a & 0x8000, signb = b & 0x8000; - uint16_t expa = a & 0x7c00, expb = b & 0x7c00; - uint16_t manta = a & 0x3ff, mantb = b & 0x3ff; - if (signa != signb) { - // opposite signs - return a > b; - } - else if (signa > 0) { - // both -ve - if (expa != expb) { - return expa > expb; - } - else { - return manta > mantb; - } - } - else { - // both +ve - if (expa != expb) { - return expa < expb; - } - else { - return manta < mantb; - } - } - - //return npy_half_to_float(a) < npy_half_to_float(b); -} - -template -static void -qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) -{ - /* - * Resort to std::sort if quicksort isnt making any progress - */ - if (max_iters <= 0) { - std::sort(arr + left, arr + right + 1, comparison_func); - return; - } - /* - * Base case: use bitonic networks to sort arrays 
<= 128 - */ - if (right + 1 - left <= 128) { - sort_128_16bit(arr + left, (int32_t)(right + 1 - left)); - return; - } - - type_t pivot = get_pivot_16bit(arr, left, right); - type_t smallest = vtype::type_max(); - type_t biggest = vtype::type_min(); - int64_t pivot_index = partition_avx512( - arr, left, right + 1, pivot, &smallest, &biggest); - if (pivot != smallest) - qsort_16bit_(arr, left, pivot_index - 1, max_iters - 1); - if (pivot != biggest) - qsort_16bit_(arr, pivot_index, right, max_iters - 1); -} - -NPY_FINLINE int64_t replace_nan_with_inf(uint16_t *arr, int64_t arrsize) -{ - int64_t nan_count = 0; - __mmask16 loadmask = 0xFFFF; - while (arrsize > 0) { - if (arrsize < 16) { loadmask = (0x0001 << arrsize) - 0x0001; } - __m256i in_zmm = _mm256_maskz_loadu_epi16(loadmask, arr); - __m512 in_zmm_asfloat = _mm512_cvtph_ps(in_zmm); - __mmask16 nanmask = _mm512_cmp_ps_mask( - in_zmm_asfloat, in_zmm_asfloat, _CMP_NEQ_UQ); - nan_count += _mm_popcnt_u32((int32_t)nanmask); - _mm256_mask_storeu_epi16(arr, nanmask, YMM_MAX_HALF); - arr += 16; - arrsize -= 16; - } - return nan_count; -} - -NPY_FINLINE void -replace_inf_with_nan(uint16_t *arr, int64_t arrsize, int64_t nan_count) -{ - for (int64_t ii = arrsize - 1; nan_count > 0; --ii) { - arr[ii] = 0xFFFF; - nan_count -= 1; - } -} - -template <> -void avx512_qsort(int16_t *arr, int64_t arrsize) -{ - if (arrsize > 1) { - qsort_16bit_, int16_t>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - } -} - -template <> -void avx512_qsort(uint16_t *arr, int64_t arrsize) -{ - if (arrsize > 1) { - qsort_16bit_, uint16_t>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - } -} - -void avx512_qsort_fp16(uint16_t *arr, int64_t arrsize) -{ - if (arrsize > 1) { - int64_t nan_count = replace_nan_with_inf(arr, arrsize); - qsort_16bit_, uint16_t>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - replace_inf_with_nan(arr, arrsize, nan_count); - } -} -#endif // __AVX512_QSORT_16BIT__ diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp deleted file mode 100644 index 877849d6c..000000000 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp +++ /dev/null @@ -1,713 +0,0 @@ -/******************************************************************* - * Copyright (C) 2022 Intel Corporation - * Copyright (C) 2021 Serge Sans Paille - * SPDX-License-Identifier: BSD-3-Clause - * Authors: Raghuveer Devulapalli - * Serge Sans Paille - * ****************************************************************/ -#ifndef __AVX512_QSORT_32BIT__ -#define __AVX512_QSORT_32BIT__ - -#include "avx512-common-qsort.h" - -/* - * Constants used in sorting 16 elements in a ZMM registers. 
Based on Bitonic - * sorting network (see - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) - */ -#define NETWORK_32BIT_1 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 -#define NETWORK_32BIT_2 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 -#define NETWORK_32BIT_3 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 -#define NETWORK_32BIT_4 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 -#define NETWORK_32BIT_5 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -#define NETWORK_32BIT_6 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4 -#define NETWORK_32BIT_7 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 - -template <> -struct vector { - using type_t = int32_t; - using zmm_t = __m512i; - using ymm_t = __m256i; - using opmask_t = __mmask16; - static const uint8_t numlanes = 16; - - static type_t type_max() - { - return X86_SIMD_SORT_MAX_INT32; - } - static type_t type_min() - { - return X86_SIMD_SORT_MIN_INT32; - } - static zmm_t zmm_max() - { - return _mm512_set1_epi32(type_max()); - } - - static opmask_t knot_opmask(opmask_t x) - { - return _mm512_knot(x); - } - static opmask_t ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_NLT); - } - template - static ymm_t i64gather(__m512i index, void const *base) - { - return _mm512_i64gather_epi32(index, base, scale); - } - static zmm_t merge(ymm_t y1, ymm_t y2) - { - zmm_t z1 = _mm512_castsi256_si512(y1); - return _mm512_inserti32x8(z1, y2, 1); - } - static zmm_t loadu(void const *mem) - { - return _mm512_loadu_si512(mem); - } - static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_compressstoreu_epi32(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) - { - return _mm512_mask_loadu_epi32(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) - { - return _mm512_mask_mov_epi32(x, mask, y); - } - static void mask_storeu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_storeu_epi32(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) - { - return _mm512_min_epi32(x, y); - } - static zmm_t max(zmm_t x, zmm_t y) - { - return _mm512_max_epi32(x, y); - } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_epi32(idx, zmm); - } - static type_t reducemax(zmm_t v) - { - return npyv_reduce_max_s32(v); - } - static type_t reducemin(zmm_t v) - { - return npyv_reduce_min_s32(v); - } - static zmm_t set1(type_t v) - { - return _mm512_set1_epi32(v); - } - template - static zmm_t shuffle(zmm_t zmm) - { - return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_si512(mem, x); - } - - static ymm_t max(ymm_t x, ymm_t y) - { - return _mm256_max_epi32(x, y); - } - static ymm_t min(ymm_t x, ymm_t y) - { - return _mm256_min_epi32(x, y); - } -}; -template <> -struct vector { - using type_t = uint32_t; - using zmm_t = __m512i; - using ymm_t = __m256i; - using opmask_t = __mmask16; - static const uint8_t numlanes = 16; - - static type_t type_max() - { - return X86_SIMD_SORT_MAX_UINT32; - } - static type_t type_min() - { - return 0; - } - static zmm_t zmm_max() - { - return _mm512_set1_epi32(type_max()); - } // TODO: this should broadcast bits as is? 
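    // zmm_max() is the padding source for partial loads: the tail helpers
    // call vtype::mask_loadu(vtype::zmm_max(), load_mask, ptr), so lanes past
    // the end of the array hold the type's maximum, sort to the top of the
    // register, and are dropped again by the matching masked store. For an
    // unsigned type that maximum is all ones, which is presumably what the
    // TODO above is getting at: broadcasting the bit pattern 0xFFFFFFFF
    // directly would produce the same register.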
- - template - static ymm_t i64gather(__m512i index, void const *base) - { - return _mm512_i64gather_epi32(index, base, scale); - } - static zmm_t merge(ymm_t y1, ymm_t y2) - { - zmm_t z1 = _mm512_castsi256_si512(y1); - return _mm512_inserti32x8(z1, y2, 1); - } - static opmask_t knot_opmask(opmask_t x) - { - return _mm512_knot(x); - } - static opmask_t ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_epu32_mask(x, y, _MM_CMPINT_NLT); - } - static zmm_t loadu(void const *mem) - { - return _mm512_loadu_si512(mem); - } - static zmm_t max(zmm_t x, zmm_t y) - { - return _mm512_max_epu32(x, y); - } - static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_compressstoreu_epi32(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) - { - return _mm512_mask_loadu_epi32(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) - { - return _mm512_mask_mov_epi32(x, mask, y); - } - static void mask_storeu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_storeu_epi32(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) - { - return _mm512_min_epu32(x, y); - } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_epi32(idx, zmm); - } - static type_t reducemax(zmm_t v) - { - return npyv_reduce_max_u32(v); - } - static type_t reducemin(zmm_t v) - { - return npyv_reduce_min_u32(v); - } - static zmm_t set1(type_t v) - { - return _mm512_set1_epi32(v); - } - template - static zmm_t shuffle(zmm_t zmm) - { - return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_si512(mem, x); - } - - static ymm_t max(ymm_t x, ymm_t y) - { - return _mm256_max_epu32(x, y); - } - static ymm_t min(ymm_t x, ymm_t y) - { - return _mm256_min_epu32(x, y); - } -}; -template <> -struct vector { - using type_t = float; - using zmm_t = __m512; - using ymm_t = __m256; - using opmask_t = __mmask16; - static const uint8_t numlanes = 16; - - static type_t type_max() - { - return X86_SIMD_SORT_INFINITYF; - } - static type_t type_min() - { - return -X86_SIMD_SORT_INFINITYF; - } - static zmm_t zmm_max() - { - return _mm512_set1_ps(type_max()); - } - - static opmask_t knot_opmask(opmask_t x) - { - return _mm512_knot(x); - } - static opmask_t ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); - } - template - static ymm_t i64gather(__m512i index, void const *base) - { - return _mm512_i64gather_ps(index, base, scale); - } - static zmm_t merge(ymm_t y1, ymm_t y2) - { - zmm_t z1 = _mm512_castsi512_ps( - _mm512_castsi256_si512(_mm256_castps_si256(y1))); - return _mm512_insertf32x8(z1, y2, 1); - } - static zmm_t loadu(void const *mem) - { - return _mm512_loadu_ps(mem); - } - static zmm_t max(zmm_t x, zmm_t y) - { - return _mm512_max_ps(x, y); - } - static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_compressstoreu_ps(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) - { - return _mm512_mask_loadu_ps(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) - { - return _mm512_mask_mov_ps(x, mask, y); - } - static void mask_storeu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_storeu_ps(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) - { - return _mm512_min_ps(x, y); - } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_ps(idx, zmm); - } - static type_t reducemax(zmm_t v) - { - return 
npyv_reduce_max_f32(v); - } - static type_t reducemin(zmm_t v) - { - return npyv_reduce_min_f32(v); - } - static zmm_t set1(type_t v) - { - return _mm512_set1_ps(v); - } - template - static zmm_t shuffle(zmm_t zmm) - { - return _mm512_shuffle_ps(zmm, zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_ps(mem, x); - } - - static ymm_t max(ymm_t x, ymm_t y) - { - return _mm256_max_ps(x, y); - } - static ymm_t min(ymm_t x, ymm_t y) - { - return _mm256_min_ps(x, y); - } -}; - -/* - * Assumes zmm is random and performs a full sorting network defined in - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg - */ -template -NPY_FINLINE zmm_t sort_zmm_32bit(zmm_t zmm) -{ - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xAAAA); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xCCCC); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xAAAA); - zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_3), zmm), - 0xF0F0); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xCCCC); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xAAAA); - zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm), - 0xFF00); - zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_6), zmm), - 0xF0F0); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xCCCC); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xAAAA); - return zmm; -} - -// Assumes zmm is bitonic and performs a recursive half cleaner -template -NPY_FINLINE zmm_t bitonic_merge_zmm_32bit(zmm_t zmm) -{ - // 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc .. - zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_7), zmm), - 0xFF00); - // 2) half_cleaner[8]: compare 1-5, 2-6, 3-7 etc .. 
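    // cmp_merge(a, b, m) (defined in avx512-common-qsort.h) is one
    // compare-exchange layer: it takes the lane-wise min and max of a and b
    // and keeps the max only in lanes whose bit is set in m, roughly
    //     out[i] = ((m >> i) & 1) ? max(a[i], b[i]) : min(a[i], b[i]);
    // With b a permuted copy of a, each constant mask below selects which
    // half of every compared pair receives the larger element.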
- zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_6), zmm), - 0xF0F0); - // 3) half_cleaner[4] - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xCCCC); - // 3) half_cleaner[1] - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xAAAA); - return zmm; -} - -// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner -template -NPY_FINLINE void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2) -{ - // 1) First step of a merging network: coex of zmm1 and zmm2 reversed - *zmm2 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), *zmm2); - zmm_t zmm3 = vtype::min(*zmm1, *zmm2); - zmm_t zmm4 = vtype::max(*zmm1, *zmm2); - // 2) Recursive half cleaner for each - *zmm1 = bitonic_merge_zmm_32bit(zmm3); - *zmm2 = bitonic_merge_zmm_32bit(zmm4); -} - -// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive -// half cleaner -template -NPY_FINLINE void bitonic_merge_four_zmm_32bit(zmm_t *zmm) -{ - zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[2]); - zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[3]); - zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r); - zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r); - zmm_t zmm_t3 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), - vtype::max(zmm[1], zmm2r)); - zmm_t zmm_t4 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), - vtype::max(zmm[0], zmm3r)); - zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2); - zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2); - zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4); - zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4); - zmm[0] = bitonic_merge_zmm_32bit(zmm0); - zmm[1] = bitonic_merge_zmm_32bit(zmm1); - zmm[2] = bitonic_merge_zmm_32bit(zmm2); - zmm[3] = bitonic_merge_zmm_32bit(zmm3); -} - -template -NPY_FINLINE void bitonic_merge_eight_zmm_32bit(zmm_t *zmm) -{ - zmm_t zmm4r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[4]); - zmm_t zmm5r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[5]); - zmm_t zmm6r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[6]); - zmm_t zmm7r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[7]); - zmm_t zmm_t1 = vtype::min(zmm[0], zmm7r); - zmm_t zmm_t2 = vtype::min(zmm[1], zmm6r); - zmm_t zmm_t3 = vtype::min(zmm[2], zmm5r); - zmm_t zmm_t4 = vtype::min(zmm[3], zmm4r); - zmm_t zmm_t5 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), - vtype::max(zmm[3], zmm4r)); - zmm_t zmm_t6 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), - vtype::max(zmm[2], zmm5r)); - zmm_t zmm_t7 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), - vtype::max(zmm[1], zmm6r)); - zmm_t zmm_t8 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), - vtype::max(zmm[0], zmm7r)); - COEX(zmm_t1, zmm_t3); - COEX(zmm_t2, zmm_t4); - COEX(zmm_t5, zmm_t7); - COEX(zmm_t6, zmm_t8); - COEX(zmm_t1, zmm_t2); - COEX(zmm_t3, zmm_t4); - COEX(zmm_t5, zmm_t6); - COEX(zmm_t7, zmm_t8); - zmm[0] = bitonic_merge_zmm_32bit(zmm_t1); - zmm[1] = bitonic_merge_zmm_32bit(zmm_t2); - zmm[2] = bitonic_merge_zmm_32bit(zmm_t3); - zmm[3] = bitonic_merge_zmm_32bit(zmm_t4); - zmm[4] = bitonic_merge_zmm_32bit(zmm_t5); - zmm[5] = bitonic_merge_zmm_32bit(zmm_t6); - zmm[6] = bitonic_merge_zmm_32bit(zmm_t7); - zmm[7] = bitonic_merge_zmm_32bit(zmm_t8); -} - -template -NPY_FINLINE void sort_16_32bit(type_t *arr, int32_t N) -{ - typename vtype::opmask_t load_mask = (0x0001 << N) - 0x0001; - typename vtype::zmm_t zmm - = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr); - vtype::mask_storeu(arr, load_mask, 
sort_zmm_32bit(zmm)); -} - -template -NPY_FINLINE void sort_32_32bit(type_t *arr, int32_t N) -{ - if (N <= 16) { - sort_16_32bit(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - zmm_t zmm1 = vtype::loadu(arr); - typename vtype::opmask_t load_mask = (0x0001 << (N - 16)) - 0x0001; - zmm_t zmm2 = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr + 16); - zmm1 = sort_zmm_32bit(zmm1); - zmm2 = sort_zmm_32bit(zmm2); - bitonic_merge_two_zmm_32bit(&zmm1, &zmm2); - vtype::storeu(arr, zmm1); - vtype::mask_storeu(arr + 16, load_mask, zmm2); -} - -template -NPY_FINLINE void sort_64_32bit(type_t *arr, int32_t N) -{ - if (N <= 32) { - sort_32_32bit(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - using opmask_t = typename vtype::opmask_t; - zmm_t zmm[4]; - zmm[0] = vtype::loadu(arr); - zmm[1] = vtype::loadu(arr + 16); - opmask_t load_mask1 = 0xFFFF, load_mask2 = 0xFFFF; - uint64_t combined_mask = (0x1ull << (N - 32)) - 0x1ull; - load_mask1 &= combined_mask & 0xFFFF; - load_mask2 &= (combined_mask >> 16) & 0xFFFF; - zmm[2] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 32); - zmm[3] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 48); - zmm[0] = sort_zmm_32bit(zmm[0]); - zmm[1] = sort_zmm_32bit(zmm[1]); - zmm[2] = sort_zmm_32bit(zmm[2]); - zmm[3] = sort_zmm_32bit(zmm[3]); - bitonic_merge_two_zmm_32bit(&zmm[0], &zmm[1]); - bitonic_merge_two_zmm_32bit(&zmm[2], &zmm[3]); - bitonic_merge_four_zmm_32bit(zmm); - vtype::storeu(arr, zmm[0]); - vtype::storeu(arr + 16, zmm[1]); - vtype::mask_storeu(arr + 32, load_mask1, zmm[2]); - vtype::mask_storeu(arr + 48, load_mask2, zmm[3]); -} - -template -NPY_FINLINE void sort_128_32bit(type_t *arr, int32_t N) -{ - if (N <= 64) { - sort_64_32bit(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - using opmask_t = typename vtype::opmask_t; - zmm_t zmm[8]; - zmm[0] = vtype::loadu(arr); - zmm[1] = vtype::loadu(arr + 16); - zmm[2] = vtype::loadu(arr + 32); - zmm[3] = vtype::loadu(arr + 48); - zmm[0] = sort_zmm_32bit(zmm[0]); - zmm[1] = sort_zmm_32bit(zmm[1]); - zmm[2] = sort_zmm_32bit(zmm[2]); - zmm[3] = sort_zmm_32bit(zmm[3]); - opmask_t load_mask1 = 0xFFFF, load_mask2 = 0xFFFF; - opmask_t load_mask3 = 0xFFFF, load_mask4 = 0xFFFF; - if (N != 128) { - uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull; - load_mask1 &= combined_mask & 0xFFFF; - load_mask2 &= (combined_mask >> 16) & 0xFFFF; - load_mask3 &= (combined_mask >> 32) & 0xFFFF; - load_mask4 &= (combined_mask >> 48) & 0xFFFF; - } - zmm[4] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 64); - zmm[5] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 80); - zmm[6] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 96); - zmm[7] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 112); - zmm[4] = sort_zmm_32bit(zmm[4]); - zmm[5] = sort_zmm_32bit(zmm[5]); - zmm[6] = sort_zmm_32bit(zmm[6]); - zmm[7] = sort_zmm_32bit(zmm[7]); - bitonic_merge_two_zmm_32bit(&zmm[0], &zmm[1]); - bitonic_merge_two_zmm_32bit(&zmm[2], &zmm[3]); - bitonic_merge_two_zmm_32bit(&zmm[4], &zmm[5]); - bitonic_merge_two_zmm_32bit(&zmm[6], &zmm[7]); - bitonic_merge_four_zmm_32bit(zmm); - bitonic_merge_four_zmm_32bit(zmm + 4); - bitonic_merge_eight_zmm_32bit(zmm); - vtype::storeu(arr, zmm[0]); - vtype::storeu(arr + 16, zmm[1]); - vtype::storeu(arr + 32, zmm[2]); - vtype::storeu(arr + 48, zmm[3]); - vtype::mask_storeu(arr + 64, load_mask1, zmm[4]); - vtype::mask_storeu(arr + 80, load_mask2, zmm[5]); - vtype::mask_storeu(arr + 96, load_mask3, zmm[6]); - vtype::mask_storeu(arr 
+ 112, load_mask4, zmm[7]); -} - -template -NPY_FINLINE type_t get_pivot_32bit(type_t *arr, - const int64_t left, - const int64_t right) -{ - // median of 16 - int64_t size = (right - left) / 16; - using zmm_t = typename vtype::zmm_t; - using ymm_t = typename vtype::ymm_t; - __m512i rand_index1 = _mm512_set_epi64(left + size, - left + 2 * size, - left + 3 * size, - left + 4 * size, - left + 5 * size, - left + 6 * size, - left + 7 * size, - left + 8 * size); - __m512i rand_index2 = _mm512_set_epi64(left + 9 * size, - left + 10 * size, - left + 11 * size, - left + 12 * size, - left + 13 * size, - left + 14 * size, - left + 15 * size, - left + 16 * size); - ymm_t rand_vec1 - = vtype::template i64gather(rand_index1, arr); - ymm_t rand_vec2 - = vtype::template i64gather(rand_index2, arr); - zmm_t rand_vec = vtype::merge(rand_vec1, rand_vec2); - zmm_t sort = sort_zmm_32bit(rand_vec); - // pivot will never be a nan, since there are no nan's! - return ((type_t *)&sort)[8]; -} - -template -static void -qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) -{ - /* - * Resort to std::sort if quicksort isnt making any progress - */ - if (max_iters <= 0) { - std::sort(arr + left, arr + right + 1); - return; - } - /* - * Base case: use bitonic networks to sort arrays <= 128 - */ - if (right + 1 - left <= 128) { - sort_128_32bit(arr + left, (int32_t)(right + 1 - left)); - return; - } - - type_t pivot = get_pivot_32bit(arr, left, right); - type_t smallest = vtype::type_max(); - type_t biggest = vtype::type_min(); - int64_t pivot_index = partition_avx512( - arr, left, right + 1, pivot, &smallest, &biggest); - if (pivot != smallest) - qsort_32bit_(arr, left, pivot_index - 1, max_iters - 1); - if (pivot != biggest) - qsort_32bit_(arr, pivot_index, right, max_iters - 1); -} - -NPY_FINLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize) -{ - int64_t nan_count = 0; - __mmask16 loadmask = 0xFFFF; - while (arrsize > 0) { - if (arrsize < 16) { loadmask = (0x0001 << arrsize) - 0x0001; } - __m512 in_zmm = _mm512_maskz_loadu_ps(loadmask, arr); - __mmask16 nanmask = _mm512_cmp_ps_mask(in_zmm, in_zmm, _CMP_NEQ_UQ); - nan_count += _mm_popcnt_u32((int32_t)nanmask); - _mm512_mask_storeu_ps(arr, nanmask, ZMM_MAX_FLOAT); - arr += 16; - arrsize -= 16; - } - return nan_count; -} - -NPY_FINLINE void -replace_inf_with_nan(float *arr, int64_t arrsize, int64_t nan_count) -{ - for (int64_t ii = arrsize - 1; nan_count > 0; --ii) { - arr[ii] = std::nanf("1"); - nan_count -= 1; - } -} - -template <> -void avx512_qsort(int32_t *arr, int64_t arrsize) -{ - if (arrsize > 1) { - qsort_32bit_, int32_t>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - } -} - -template <> -void avx512_qsort(uint32_t *arr, int64_t arrsize) -{ - if (arrsize > 1) { - qsort_32bit_, uint32_t>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - } -} - -template <> -void avx512_qsort(float *arr, int64_t arrsize) -{ - if (arrsize > 1) { - int64_t nan_count = replace_nan_with_inf(arr, arrsize); - qsort_32bit_, float>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - replace_inf_with_nan(arr, arrsize, nan_count); - } -} - -#endif //__AVX512_QSORT_32BIT__ diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp deleted file mode 100644 index b067f5eda..000000000 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp +++ /dev/null @@ -1,825 +0,0 @@ -/******************************************************************* - * 
Copyright (C) 2022 Intel Corporation - * SPDX-License-Identifier: BSD-3-Clause - * Authors: Raghuveer Devulapalli - * ****************************************************************/ - -#ifndef __AVX512_QSORT_64BIT__ -#define __AVX512_QSORT_64BIT__ - -#include "avx512-common-qsort.h" - -/* - * Constants used in sorting 8 elements in a ZMM registers. Based on Bitonic - * sorting network (see - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) - */ -// ZMM 7, 6, 5, 4, 3, 2, 1, 0 -#define NETWORK_64BIT_1 4, 5, 6, 7, 0, 1, 2, 3 -#define NETWORK_64BIT_2 0, 1, 2, 3, 4, 5, 6, 7 -#define NETWORK_64BIT_3 5, 4, 7, 6, 1, 0, 3, 2 -#define NETWORK_64BIT_4 3, 2, 1, 0, 7, 6, 5, 4 - -template <> -struct vector { - using type_t = int64_t; - using zmm_t = __m512i; - using ymm_t = __m512i; - using opmask_t = __mmask8; - static const uint8_t numlanes = 8; - - static type_t type_max() - { - return X86_SIMD_SORT_MAX_INT64; - } - static type_t type_min() - { - return X86_SIMD_SORT_MIN_INT64; - } - static zmm_t zmm_max() - { - return _mm512_set1_epi64(type_max()); - } // TODO: this should broadcast bits as is? - - static zmm_t set(type_t v1, - type_t v2, - type_t v3, - type_t v4, - type_t v5, - type_t v6, - type_t v7, - type_t v8) - { - return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8); - } - - static opmask_t knot_opmask(opmask_t x) - { - return npyv_not_b64(x); - } - static opmask_t ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_NLT); - } - template - static zmm_t i64gather(__m512i index, void const *base) - { - return _mm512_i64gather_epi64(index, base, scale); - } - static zmm_t loadu(void const *mem) - { - return _mm512_loadu_si512(mem); - } - static zmm_t max(zmm_t x, zmm_t y) - { - return _mm512_max_epi64(x, y); - } - static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_compressstoreu_epi64(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) - { - return _mm512_mask_loadu_epi64(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) - { - return _mm512_mask_mov_epi64(x, mask, y); - } - static void mask_storeu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_storeu_epi64(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) - { - return _mm512_min_epi64(x, y); - } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_epi64(idx, zmm); - } - static type_t reducemax(zmm_t v) - { - return npyv_reduce_max_s64(v); - } - static type_t reducemin(zmm_t v) - { - return npyv_reduce_min_s64(v); - } - static zmm_t set1(type_t v) - { - return _mm512_set1_epi64(v); - } - template - static zmm_t shuffle(zmm_t zmm) - { - __m512d temp = _mm512_castsi512_pd(zmm); - return _mm512_castpd_si512( - _mm512_shuffle_pd(temp, temp, (_MM_PERM_ENUM)mask)); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_si512(mem, x); - } -}; -template <> -struct vector { - using type_t = uint64_t; - using zmm_t = __m512i; - using ymm_t = __m512i; - using opmask_t = __mmask8; - static const uint8_t numlanes = 8; - - static type_t type_max() - { - return X86_SIMD_SORT_MAX_UINT64; - } - static type_t type_min() - { - return 0; - } - static zmm_t zmm_max() - { - return _mm512_set1_epi64(type_max()); - } - - static zmm_t set(type_t v1, - type_t v2, - type_t v3, - type_t v4, - type_t v5, - type_t v6, - type_t v7, - type_t v8) - { - return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8); - } - - template - static zmm_t i64gather(__m512i index, 
void const *base) - { - return _mm512_i64gather_epi64(index, base, scale); - } - static opmask_t knot_opmask(opmask_t x) - { - return npyv_not_b64(x); - } - static opmask_t ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_epu64_mask(x, y, _MM_CMPINT_NLT); - } - static zmm_t loadu(void const *mem) - { - return _mm512_loadu_si512(mem); - } - static zmm_t max(zmm_t x, zmm_t y) - { - return _mm512_max_epu64(x, y); - } - static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_compressstoreu_epi64(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) - { - return _mm512_mask_loadu_epi64(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) - { - return _mm512_mask_mov_epi64(x, mask, y); - } - static void mask_storeu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_storeu_epi64(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) - { - return _mm512_min_epu64(x, y); - } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_epi64(idx, zmm); - } - static type_t reducemax(zmm_t v) - { - return npyv_reduce_max_u64(v); - } - static type_t reducemin(zmm_t v) - { - return npyv_reduce_min_u64(v); - } - static zmm_t set1(type_t v) - { - return _mm512_set1_epi64(v); - } - template - static zmm_t shuffle(zmm_t zmm) - { - __m512d temp = _mm512_castsi512_pd(zmm); - return _mm512_castpd_si512( - _mm512_shuffle_pd(temp, temp, (_MM_PERM_ENUM)mask)); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_si512(mem, x); - } -}; -template <> -struct vector { - using type_t = double; - using zmm_t = __m512d; - using ymm_t = __m512d; - using opmask_t = __mmask8; - static const uint8_t numlanes = 8; - - static type_t type_max() - { - return X86_SIMD_SORT_INFINITY; - } - static type_t type_min() - { - return -X86_SIMD_SORT_INFINITY; - } - static zmm_t zmm_max() - { - return _mm512_set1_pd(type_max()); - } - - static zmm_t set(type_t v1, - type_t v2, - type_t v3, - type_t v4, - type_t v5, - type_t v6, - type_t v7, - type_t v8) - { - return _mm512_set_pd(v1, v2, v3, v4, v5, v6, v7, v8); - } - - static opmask_t knot_opmask(opmask_t x) - { - return npyv_not_b64(x); - } - static opmask_t ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_pd_mask(x, y, _CMP_GE_OQ); - } - template - static zmm_t i64gather(__m512i index, void const *base) - { - return _mm512_i64gather_pd(index, base, scale); - } - static zmm_t loadu(void const *mem) - { - return _mm512_loadu_pd(mem); - } - static zmm_t max(zmm_t x, zmm_t y) - { - return _mm512_max_pd(x, y); - } - static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_compressstoreu_pd(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) - { - return _mm512_mask_loadu_pd(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) - { - return _mm512_mask_mov_pd(x, mask, y); - } - static void mask_storeu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_storeu_pd(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) - { - return _mm512_min_pd(x, y); - } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_pd(idx, zmm); - } - static type_t reducemax(zmm_t v) - { - return npyv_reduce_max_f64(v); - } - static type_t reducemin(zmm_t v) - { - return npyv_reduce_min_f64(v); - } - static zmm_t set1(type_t v) - { - return _mm512_set1_pd(v); - } - template - static zmm_t shuffle(zmm_t zmm) - { - return _mm512_shuffle_pd(zmm, zmm, 
(_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_pd(mem, x); - } -}; - -/* - * Assumes zmm is random and performs a full sorting network defined in - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg - */ -template -NPY_FINLINE zmm_t sort_zmm_64bit(zmm_t zmm) -{ - const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); - zmm = cmp_merge( - zmm, vtype::template shuffle(zmm), 0xAA); - zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi64(NETWORK_64BIT_1), zmm), - 0xCC); - zmm = cmp_merge( - zmm, vtype::template shuffle(zmm), 0xAA); - zmm = cmp_merge(zmm, vtype::permutexvar(rev_index, zmm), 0xF0); - zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi64(NETWORK_64BIT_3), zmm), - 0xCC); - zmm = cmp_merge( - zmm, vtype::template shuffle(zmm), 0xAA); - return zmm; -} - -// Assumes zmm is bitonic and performs a recursive half cleaner -template -NPY_FINLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm) -{ - - // 1) half_cleaner[8]: compare 0-4, 1-5, 2-6, 3-7 - zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi64(NETWORK_64BIT_4), zmm), - 0xF0); - // 2) half_cleaner[4] - zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi64(NETWORK_64BIT_3), zmm), - 0xCC); - // 3) half_cleaner[1] - zmm = cmp_merge( - zmm, vtype::template shuffle(zmm), 0xAA); - return zmm; -} - -// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner -template -NPY_FINLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2) -{ - const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); - // 1) First step of a merging network: coex of zmm1 and zmm2 reversed - zmm2 = vtype::permutexvar(rev_index, zmm2); - zmm_t zmm3 = vtype::min(zmm1, zmm2); - zmm_t zmm4 = vtype::max(zmm1, zmm2); - // 2) Recursive half cleaner for each - zmm1 = bitonic_merge_zmm_64bit(zmm3); - zmm2 = bitonic_merge_zmm_64bit(zmm4); -} - -// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive -// half cleaner -template -NPY_FINLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm) -{ - const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); - // 1) First step of a merging network - zmm_t zmm2r = vtype::permutexvar(rev_index, zmm[2]); - zmm_t zmm3r = vtype::permutexvar(rev_index, zmm[3]); - zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r); - zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r); - // 2) Recursive half clearer: 16 - zmm_t zmm_t3 = vtype::permutexvar(rev_index, vtype::max(zmm[1], zmm2r)); - zmm_t zmm_t4 = vtype::permutexvar(rev_index, vtype::max(zmm[0], zmm3r)); - zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2); - zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2); - zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4); - zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4); - zmm[0] = bitonic_merge_zmm_64bit(zmm0); - zmm[1] = bitonic_merge_zmm_64bit(zmm1); - zmm[2] = bitonic_merge_zmm_64bit(zmm2); - zmm[3] = bitonic_merge_zmm_64bit(zmm3); -} - -template -NPY_FINLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm) -{ - const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); - zmm_t zmm4r = vtype::permutexvar(rev_index, zmm[4]); - zmm_t zmm5r = vtype::permutexvar(rev_index, zmm[5]); - zmm_t zmm6r = vtype::permutexvar(rev_index, zmm[6]); - zmm_t zmm7r = vtype::permutexvar(rev_index, zmm[7]); - zmm_t zmm_t1 = vtype::min(zmm[0], zmm7r); - zmm_t zmm_t2 = vtype::min(zmm[1], zmm6r); - zmm_t zmm_t3 = vtype::min(zmm[2], zmm5r); - zmm_t zmm_t4 = vtype::min(zmm[3], zmm4r); - zmm_t zmm_t5 = vtype::permutexvar(rev_index, vtype::max(zmm[3], zmm4r)); - zmm_t zmm_t6 = 
vtype::permutexvar(rev_index, vtype::max(zmm[2], zmm5r)); - zmm_t zmm_t7 = vtype::permutexvar(rev_index, vtype::max(zmm[1], zmm6r)); - zmm_t zmm_t8 = vtype::permutexvar(rev_index, vtype::max(zmm[0], zmm7r)); - COEX(zmm_t1, zmm_t3); - COEX(zmm_t2, zmm_t4); - COEX(zmm_t5, zmm_t7); - COEX(zmm_t6, zmm_t8); - COEX(zmm_t1, zmm_t2); - COEX(zmm_t3, zmm_t4); - COEX(zmm_t5, zmm_t6); - COEX(zmm_t7, zmm_t8); - zmm[0] = bitonic_merge_zmm_64bit(zmm_t1); - zmm[1] = bitonic_merge_zmm_64bit(zmm_t2); - zmm[2] = bitonic_merge_zmm_64bit(zmm_t3); - zmm[3] = bitonic_merge_zmm_64bit(zmm_t4); - zmm[4] = bitonic_merge_zmm_64bit(zmm_t5); - zmm[5] = bitonic_merge_zmm_64bit(zmm_t6); - zmm[6] = bitonic_merge_zmm_64bit(zmm_t7); - zmm[7] = bitonic_merge_zmm_64bit(zmm_t8); -} - -template -NPY_FINLINE void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm) -{ - const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); - zmm_t zmm8r = vtype::permutexvar(rev_index, zmm[8]); - zmm_t zmm9r = vtype::permutexvar(rev_index, zmm[9]); - zmm_t zmm10r = vtype::permutexvar(rev_index, zmm[10]); - zmm_t zmm11r = vtype::permutexvar(rev_index, zmm[11]); - zmm_t zmm12r = vtype::permutexvar(rev_index, zmm[12]); - zmm_t zmm13r = vtype::permutexvar(rev_index, zmm[13]); - zmm_t zmm14r = vtype::permutexvar(rev_index, zmm[14]); - zmm_t zmm15r = vtype::permutexvar(rev_index, zmm[15]); - zmm_t zmm_t1 = vtype::min(zmm[0], zmm15r); - zmm_t zmm_t2 = vtype::min(zmm[1], zmm14r); - zmm_t zmm_t3 = vtype::min(zmm[2], zmm13r); - zmm_t zmm_t4 = vtype::min(zmm[3], zmm12r); - zmm_t zmm_t5 = vtype::min(zmm[4], zmm11r); - zmm_t zmm_t6 = vtype::min(zmm[5], zmm10r); - zmm_t zmm_t7 = vtype::min(zmm[6], zmm9r); - zmm_t zmm_t8 = vtype::min(zmm[7], zmm8r); - zmm_t zmm_t9 = vtype::permutexvar(rev_index, vtype::max(zmm[7], zmm8r)); - zmm_t zmm_t10 = vtype::permutexvar(rev_index, vtype::max(zmm[6], zmm9r)); - zmm_t zmm_t11 = vtype::permutexvar(rev_index, vtype::max(zmm[5], zmm10r)); - zmm_t zmm_t12 = vtype::permutexvar(rev_index, vtype::max(zmm[4], zmm11r)); - zmm_t zmm_t13 = vtype::permutexvar(rev_index, vtype::max(zmm[3], zmm12r)); - zmm_t zmm_t14 = vtype::permutexvar(rev_index, vtype::max(zmm[2], zmm13r)); - zmm_t zmm_t15 = vtype::permutexvar(rev_index, vtype::max(zmm[1], zmm14r)); - zmm_t zmm_t16 = vtype::permutexvar(rev_index, vtype::max(zmm[0], zmm15r)); - // Recusive half clear 16 zmm regs - COEX(zmm_t1, zmm_t5); - COEX(zmm_t2, zmm_t6); - COEX(zmm_t3, zmm_t7); - COEX(zmm_t4, zmm_t8); - COEX(zmm_t9, zmm_t13); - COEX(zmm_t10, zmm_t14); - COEX(zmm_t11, zmm_t15); - COEX(zmm_t12, zmm_t16); - // - COEX(zmm_t1, zmm_t3); - COEX(zmm_t2, zmm_t4); - COEX(zmm_t5, zmm_t7); - COEX(zmm_t6, zmm_t8); - COEX(zmm_t9, zmm_t11); - COEX(zmm_t10, zmm_t12); - COEX(zmm_t13, zmm_t15); - COEX(zmm_t14, zmm_t16); - // - COEX(zmm_t1, zmm_t2); - COEX(zmm_t3, zmm_t4); - COEX(zmm_t5, zmm_t6); - COEX(zmm_t7, zmm_t8); - COEX(zmm_t9, zmm_t10); - COEX(zmm_t11, zmm_t12); - COEX(zmm_t13, zmm_t14); - COEX(zmm_t15, zmm_t16); - // - zmm[0] = bitonic_merge_zmm_64bit(zmm_t1); - zmm[1] = bitonic_merge_zmm_64bit(zmm_t2); - zmm[2] = bitonic_merge_zmm_64bit(zmm_t3); - zmm[3] = bitonic_merge_zmm_64bit(zmm_t4); - zmm[4] = bitonic_merge_zmm_64bit(zmm_t5); - zmm[5] = bitonic_merge_zmm_64bit(zmm_t6); - zmm[6] = bitonic_merge_zmm_64bit(zmm_t7); - zmm[7] = bitonic_merge_zmm_64bit(zmm_t8); - zmm[8] = bitonic_merge_zmm_64bit(zmm_t9); - zmm[9] = bitonic_merge_zmm_64bit(zmm_t10); - zmm[10] = bitonic_merge_zmm_64bit(zmm_t11); - zmm[11] = bitonic_merge_zmm_64bit(zmm_t12); - zmm[12] = bitonic_merge_zmm_64bit(zmm_t13); - 
zmm[13] = bitonic_merge_zmm_64bit(zmm_t14); - zmm[14] = bitonic_merge_zmm_64bit(zmm_t15); - zmm[15] = bitonic_merge_zmm_64bit(zmm_t16); -} - -template -NPY_FINLINE void sort_8_64bit(type_t *arr, int32_t N) -{ - typename vtype::opmask_t load_mask = (0x01 << N) - 0x01; - typename vtype::zmm_t zmm - = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr); - vtype::mask_storeu(arr, load_mask, sort_zmm_64bit(zmm)); -} - -template -NPY_FINLINE void sort_16_64bit(type_t *arr, int32_t N) -{ - if (N <= 8) { - sort_8_64bit(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - zmm_t zmm1 = vtype::loadu(arr); - typename vtype::opmask_t load_mask = (0x01 << (N - 8)) - 0x01; - zmm_t zmm2 = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr + 8); - zmm1 = sort_zmm_64bit(zmm1); - zmm2 = sort_zmm_64bit(zmm2); - bitonic_merge_two_zmm_64bit(zmm1, zmm2); - vtype::storeu(arr, zmm1); - vtype::mask_storeu(arr + 8, load_mask, zmm2); -} - -template -NPY_FINLINE void sort_32_64bit(type_t *arr, int32_t N) -{ - if (N <= 16) { - sort_16_64bit(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - using opmask_t = typename vtype::opmask_t; - zmm_t zmm[4]; - zmm[0] = vtype::loadu(arr); - zmm[1] = vtype::loadu(arr + 8); - opmask_t load_mask1 = 0xFF, load_mask2 = 0xFF; - uint64_t combined_mask = (0x1ull << (N - 16)) - 0x1ull; - load_mask1 = (combined_mask)&0xFF; - load_mask2 = (combined_mask >> 8) & 0xFF; - zmm[2] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 16); - zmm[3] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 24); - zmm[0] = sort_zmm_64bit(zmm[0]); - zmm[1] = sort_zmm_64bit(zmm[1]); - zmm[2] = sort_zmm_64bit(zmm[2]); - zmm[3] = sort_zmm_64bit(zmm[3]); - bitonic_merge_two_zmm_64bit(zmm[0], zmm[1]); - bitonic_merge_two_zmm_64bit(zmm[2], zmm[3]); - bitonic_merge_four_zmm_64bit(zmm); - vtype::storeu(arr, zmm[0]); - vtype::storeu(arr + 8, zmm[1]); - vtype::mask_storeu(arr + 16, load_mask1, zmm[2]); - vtype::mask_storeu(arr + 24, load_mask2, zmm[3]); -} - -template -NPY_FINLINE void sort_64_64bit(type_t *arr, int32_t N) -{ - if (N <= 32) { - sort_32_64bit(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - using opmask_t = typename vtype::opmask_t; - zmm_t zmm[8]; - zmm[0] = vtype::loadu(arr); - zmm[1] = vtype::loadu(arr + 8); - zmm[2] = vtype::loadu(arr + 16); - zmm[3] = vtype::loadu(arr + 24); - zmm[0] = sort_zmm_64bit(zmm[0]); - zmm[1] = sort_zmm_64bit(zmm[1]); - zmm[2] = sort_zmm_64bit(zmm[2]); - zmm[3] = sort_zmm_64bit(zmm[3]); - opmask_t load_mask1 = 0xFF, load_mask2 = 0xFF; - opmask_t load_mask3 = 0xFF, load_mask4 = 0xFF; - // N-32 >= 1 - uint64_t combined_mask = (0x1ull << (N - 32)) - 0x1ull; - load_mask1 = (combined_mask)&0xFF; - load_mask2 = (combined_mask >> 8) & 0xFF; - load_mask3 = (combined_mask >> 16) & 0xFF; - load_mask4 = (combined_mask >> 24) & 0xFF; - zmm[4] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 32); - zmm[5] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 40); - zmm[6] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 48); - zmm[7] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 56); - zmm[4] = sort_zmm_64bit(zmm[4]); - zmm[5] = sort_zmm_64bit(zmm[5]); - zmm[6] = sort_zmm_64bit(zmm[6]); - zmm[7] = sort_zmm_64bit(zmm[7]); - bitonic_merge_two_zmm_64bit(zmm[0], zmm[1]); - bitonic_merge_two_zmm_64bit(zmm[2], zmm[3]); - bitonic_merge_two_zmm_64bit(zmm[4], zmm[5]); - bitonic_merge_two_zmm_64bit(zmm[6], zmm[7]); - bitonic_merge_four_zmm_64bit(zmm); - bitonic_merge_four_zmm_64bit(zmm + 4); - 
bitonic_merge_eight_zmm_64bit(zmm); - vtype::storeu(arr, zmm[0]); - vtype::storeu(arr + 8, zmm[1]); - vtype::storeu(arr + 16, zmm[2]); - vtype::storeu(arr + 24, zmm[3]); - vtype::mask_storeu(arr + 32, load_mask1, zmm[4]); - vtype::mask_storeu(arr + 40, load_mask2, zmm[5]); - vtype::mask_storeu(arr + 48, load_mask3, zmm[6]); - vtype::mask_storeu(arr + 56, load_mask4, zmm[7]); -} - -template -NPY_FINLINE void sort_128_64bit(type_t *arr, int32_t N) -{ - if (N <= 64) { - sort_64_64bit(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - using opmask_t = typename vtype::opmask_t; - zmm_t zmm[16]; - zmm[0] = vtype::loadu(arr); - zmm[1] = vtype::loadu(arr + 8); - zmm[2] = vtype::loadu(arr + 16); - zmm[3] = vtype::loadu(arr + 24); - zmm[4] = vtype::loadu(arr + 32); - zmm[5] = vtype::loadu(arr + 40); - zmm[6] = vtype::loadu(arr + 48); - zmm[7] = vtype::loadu(arr + 56); - zmm[0] = sort_zmm_64bit(zmm[0]); - zmm[1] = sort_zmm_64bit(zmm[1]); - zmm[2] = sort_zmm_64bit(zmm[2]); - zmm[3] = sort_zmm_64bit(zmm[3]); - zmm[4] = sort_zmm_64bit(zmm[4]); - zmm[5] = sort_zmm_64bit(zmm[5]); - zmm[6] = sort_zmm_64bit(zmm[6]); - zmm[7] = sort_zmm_64bit(zmm[7]); - opmask_t load_mask1 = 0xFF, load_mask2 = 0xFF; - opmask_t load_mask3 = 0xFF, load_mask4 = 0xFF; - opmask_t load_mask5 = 0xFF, load_mask6 = 0xFF; - opmask_t load_mask7 = 0xFF, load_mask8 = 0xFF; - if (N != 128) { - uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull; - load_mask1 = (combined_mask)&0xFF; - load_mask2 = (combined_mask >> 8) & 0xFF; - load_mask3 = (combined_mask >> 16) & 0xFF; - load_mask4 = (combined_mask >> 24) & 0xFF; - load_mask5 = (combined_mask >> 32) & 0xFF; - load_mask6 = (combined_mask >> 40) & 0xFF; - load_mask7 = (combined_mask >> 48) & 0xFF; - load_mask8 = (combined_mask >> 56) & 0xFF; - } - zmm[8] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 64); - zmm[9] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 72); - zmm[10] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 80); - zmm[11] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 88); - zmm[12] = vtype::mask_loadu(vtype::zmm_max(), load_mask5, arr + 96); - zmm[13] = vtype::mask_loadu(vtype::zmm_max(), load_mask6, arr + 104); - zmm[14] = vtype::mask_loadu(vtype::zmm_max(), load_mask7, arr + 112); - zmm[15] = vtype::mask_loadu(vtype::zmm_max(), load_mask8, arr + 120); - zmm[8] = sort_zmm_64bit(zmm[8]); - zmm[9] = sort_zmm_64bit(zmm[9]); - zmm[10] = sort_zmm_64bit(zmm[10]); - zmm[11] = sort_zmm_64bit(zmm[11]); - zmm[12] = sort_zmm_64bit(zmm[12]); - zmm[13] = sort_zmm_64bit(zmm[13]); - zmm[14] = sort_zmm_64bit(zmm[14]); - zmm[15] = sort_zmm_64bit(zmm[15]); - bitonic_merge_two_zmm_64bit(zmm[0], zmm[1]); - bitonic_merge_two_zmm_64bit(zmm[2], zmm[3]); - bitonic_merge_two_zmm_64bit(zmm[4], zmm[5]); - bitonic_merge_two_zmm_64bit(zmm[6], zmm[7]); - bitonic_merge_two_zmm_64bit(zmm[8], zmm[9]); - bitonic_merge_two_zmm_64bit(zmm[10], zmm[11]); - bitonic_merge_two_zmm_64bit(zmm[12], zmm[13]); - bitonic_merge_two_zmm_64bit(zmm[14], zmm[15]); - bitonic_merge_four_zmm_64bit(zmm); - bitonic_merge_four_zmm_64bit(zmm + 4); - bitonic_merge_four_zmm_64bit(zmm + 8); - bitonic_merge_four_zmm_64bit(zmm + 12); - bitonic_merge_eight_zmm_64bit(zmm); - bitonic_merge_eight_zmm_64bit(zmm + 8); - bitonic_merge_sixteen_zmm_64bit(zmm); - vtype::storeu(arr, zmm[0]); - vtype::storeu(arr + 8, zmm[1]); - vtype::storeu(arr + 16, zmm[2]); - vtype::storeu(arr + 24, zmm[3]); - vtype::storeu(arr + 32, zmm[4]); - vtype::storeu(arr + 40, zmm[5]); - vtype::storeu(arr + 48, zmm[6]); 
- vtype::storeu(arr + 56, zmm[7]); - vtype::mask_storeu(arr + 64, load_mask1, zmm[8]); - vtype::mask_storeu(arr + 72, load_mask2, zmm[9]); - vtype::mask_storeu(arr + 80, load_mask3, zmm[10]); - vtype::mask_storeu(arr + 88, load_mask4, zmm[11]); - vtype::mask_storeu(arr + 96, load_mask5, zmm[12]); - vtype::mask_storeu(arr + 104, load_mask6, zmm[13]); - vtype::mask_storeu(arr + 112, load_mask7, zmm[14]); - vtype::mask_storeu(arr + 120, load_mask8, zmm[15]); -} - -template -NPY_FINLINE type_t get_pivot_64bit(type_t *arr, - const int64_t left, - const int64_t right) -{ - // median of 8 - int64_t size = (right - left) / 8; - using zmm_t = typename vtype::zmm_t; - __m512i rand_index = _mm512_set_epi64(left + size, - left + 2 * size, - left + 3 * size, - left + 4 * size, - left + 5 * size, - left + 6 * size, - left + 7 * size, - left + 8 * size); - zmm_t rand_vec = vtype::template i64gather(rand_index, arr); - // pivot will never be a nan, since there are no nan's! - zmm_t sort = sort_zmm_64bit(rand_vec); - return ((type_t *)&sort)[4]; -} - -template -static void -qsort_64bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) -{ - /* - * Resort to std::sort if quicksort isnt making any progress - */ - if (max_iters <= 0) { - std::sort(arr + left, arr + right + 1); - return; - } - /* - * Base case: use bitonic networks to sort arrays <= 128 - */ - if (right + 1 - left <= 128) { - sort_128_64bit(arr + left, (int32_t)(right + 1 - left)); - return; - } - - type_t pivot = get_pivot_64bit(arr, left, right); - type_t smallest = vtype::type_max(); - type_t biggest = vtype::type_min(); - int64_t pivot_index = partition_avx512( - arr, left, right + 1, pivot, &smallest, &biggest); - if (pivot != smallest) - qsort_64bit_(arr, left, pivot_index - 1, max_iters - 1); - if (pivot != biggest) - qsort_64bit_(arr, pivot_index, right, max_iters - 1); -} - -NPY_FINLINE int64_t replace_nan_with_inf(double *arr, int64_t arrsize) -{ - int64_t nan_count = 0; - __mmask8 loadmask = 0xFF; - while (arrsize > 0) { - if (arrsize < 8) { loadmask = (0x01 << arrsize) - 0x01; } - __m512d in_zmm = _mm512_maskz_loadu_pd(loadmask, arr); - __mmask8 nanmask = _mm512_cmp_pd_mask(in_zmm, in_zmm, _CMP_NEQ_UQ); - nan_count += _mm_popcnt_u32((int32_t)nanmask); - _mm512_mask_storeu_pd(arr, nanmask, ZMM_MAX_DOUBLE); - arr += 8; - arrsize -= 8; - } - return nan_count; -} - -NPY_FINLINE void -replace_inf_with_nan(double *arr, int64_t arrsize, int64_t nan_count) -{ - for (int64_t ii = arrsize - 1; nan_count > 0; --ii) { - arr[ii] = std::nan("1"); - nan_count -= 1; - } -} - -template <> -void avx512_qsort(int64_t *arr, int64_t arrsize) -{ - if (arrsize > 1) { - qsort_64bit_, int64_t>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - } -} - -template <> -void avx512_qsort(uint64_t *arr, int64_t arrsize) -{ - if (arrsize > 1) { - qsort_64bit_, uint64_t>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - } -} - -template <> -void avx512_qsort(double *arr, int64_t arrsize) -{ - if (arrsize > 1) { - int64_t nan_count = replace_nan_with_inf(arr, arrsize); - qsort_64bit_, double>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - replace_inf_with_nan(arr, arrsize, nan_count); - } -} -#endif // __AVX512_QSORT_64BIT__ diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h b/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h deleted file mode 100644 index 639d2f788..000000000 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h +++ /dev/null @@ -1,230 +0,0 @@ 
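/*
 * The driver shown above (qsort_64bit_ together with the avx512_qsort
 * specializations) follows an introsort-style outline; restated as a scalar
 * sketch for clarity (illustrative only, not code taken from the library):
 *
 *     sort(arr, left, right, budget):            // budget starts at 2*log2(n)
 *         if budget <= 0:      std::sort(arr + left, arr + right + 1)
 *         elif length <= 128:  bitonic network on at most 16 ZMM registers
 *         else:
 *             pivot = median of 8 evenly spaced samples, sorted in one register
 *             p = partition_avx512(arr, left, right + 1, pivot, &smallest, &biggest)
 *             if pivot != smallest: sort(arr, left, p - 1, budget - 1)
 *             if pivot != biggest:  sort(arr, p, right, budget - 1)
 *
 * The two guards rely on partition_avx512 reporting the true minimum and
 * maximum of the partitioned range: a side whose keys all equal the pivot is
 * already sorted, so skipping it avoids unbounded recursion on inputs with
 * many repeated keys. For doubles, NaNs are swapped for +inf up front by
 * replace_nan_with_inf and written back at the end by replace_inf_with_nan,
 * so the sorting kernels themselves never see a NaN.
 */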
-/******************************************************************* - * Copyright (C) 2022 Intel Corporation - * Copyright (C) 2021 Serge Sans Paille - * SPDX-License-Identifier: BSD-3-Clause - * Authors: Raghuveer Devulapalli - * Serge Sans Paille - * ****************************************************************/ - -#ifndef __AVX512_QSORT_COMMON__ -#define __AVX512_QSORT_COMMON__ - -/* - * Quicksort using AVX-512. The ideas and code are based on these two research - * papers [1] and [2]. On a high level, the idea is to vectorize quicksort - * partitioning using AVX-512 compressstore instructions. If the array size is - * < 128, then use Bitonic sorting network implemented on 512-bit registers. - * The precise network definitions depend on the dtype and are defined in - * separate files: avx512-16bit-qsort.hpp, avx512-32bit-qsort.hpp and - * avx512-64bit-qsort.hpp. Article [4] is a good resource for bitonic sorting - * network. The core implementations of the vectorized qsort functions - * avx512_qsort(T*, int64_t) are modified versions of avx2 quicksort - * presented in the paper [2] and source code associated with that paper [3]. - * - * [1] Fast and Robust Vectorized In-Place Sorting of Primitive Types - * https://drops.dagstuhl.de/opus/volltexte/2021/13775/ - * - * [2] A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel - * Skylake https://arxiv.org/pdf/1704.08579.pdf - * - * [3] https://github.com/simd-sorting/fast-and-robust: SPDX-License-Identifier: MIT - * - * [4] http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030 - * - */ - -#include "simd/simd.h" -#include -#include -#include -#include -#include - -#define X86_SIMD_SORT_INFINITY std::numeric_limits::infinity() -#define X86_SIMD_SORT_INFINITYF std::numeric_limits::infinity() -#define X86_SIMD_SORT_INFINITYH 0x7c00 -#define X86_SIMD_SORT_NEGINFINITYH 0xfc00 -#define X86_SIMD_SORT_MAX_UINT16 std::numeric_limits::max() -#define X86_SIMD_SORT_MAX_INT16 std::numeric_limits::max() -#define X86_SIMD_SORT_MIN_INT16 std::numeric_limits::min() -#define X86_SIMD_SORT_MAX_UINT32 std::numeric_limits::max() -#define X86_SIMD_SORT_MAX_INT32 std::numeric_limits::max() -#define X86_SIMD_SORT_MIN_INT32 std::numeric_limits::min() -#define X86_SIMD_SORT_MAX_UINT64 std::numeric_limits::max() -#define X86_SIMD_SORT_MAX_INT64 std::numeric_limits::max() -#define X86_SIMD_SORT_MIN_INT64 std::numeric_limits::min() -#define ZMM_MAX_DOUBLE _mm512_set1_pd(X86_SIMD_SORT_INFINITY) -#define ZMM_MAX_UINT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_UINT64) -#define ZMM_MAX_INT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_INT64) -#define ZMM_MAX_FLOAT _mm512_set1_ps(X86_SIMD_SORT_INFINITYF) -#define ZMM_MAX_UINT _mm512_set1_epi32(X86_SIMD_SORT_MAX_UINT32) -#define ZMM_MAX_INT _mm512_set1_epi32(X86_SIMD_SORT_MAX_INT32) -#define YMM_MAX_HALF _mm256_set1_epi16(X86_SIMD_SORT_INFINITYH) -#define ZMM_MAX_UINT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_UINT16) -#define ZMM_MAX_INT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_INT16) -#define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d - -template -struct vector; - -template -void avx512_qsort(T *arr, int64_t arrsize); - -template -bool comparison_func(const T &a, const T &b) -{ - return a < b; -} - -/* - * COEX == Compare and Exchange two registers by swapping min and max values - */ -template -static void COEX(mm_t &a, mm_t &b) -{ - mm_t temp = a; - a = vtype::min(a, b); - b = vtype::max(temp, b); -} - -template -static inline zmm_t cmp_merge(zmm_t 
in1, zmm_t in2, opmask_t mask) -{ - zmm_t min = vtype::min(in2, in1); - zmm_t max = vtype::max(in2, in1); - return vtype::mask_mov(min, mask, max); // 0 -> min, 1 -> max -} - -/* - * Parition one ZMM register based on the pivot and returns the index of the - * last element that is less than equal to the pivot. - */ -template -static inline int32_t partition_vec(type_t *arr, - int64_t left, - int64_t right, - const zmm_t curr_vec, - const zmm_t pivot_vec, - zmm_t *smallest_vec, - zmm_t *biggest_vec) -{ - /* which elements are larger than the pivot */ - typename vtype::opmask_t gt_mask = vtype::ge(curr_vec, pivot_vec); - int32_t amount_gt_pivot = _mm_popcnt_u32((int32_t)gt_mask); - vtype::mask_compressstoreu( - arr + left, vtype::knot_opmask(gt_mask), curr_vec); - vtype::mask_compressstoreu( - arr + right - amount_gt_pivot, gt_mask, curr_vec); - *smallest_vec = vtype::min(curr_vec, *smallest_vec); - *biggest_vec = vtype::max(curr_vec, *biggest_vec); - return amount_gt_pivot; -} - -/* - * Parition an array based on the pivot and returns the index of the - * last element that is less than equal to the pivot. - */ -template -static inline int64_t partition_avx512(type_t *arr, - int64_t left, - int64_t right, - type_t pivot, - type_t *smallest, - type_t *biggest) -{ - /* make array length divisible by vtype::numlanes , shortening the array */ - for (int32_t i = (right - left) % vtype::numlanes; i > 0; --i) { - *smallest = std::min(*smallest, arr[left], comparison_func); - *biggest = std::max(*biggest, arr[left], comparison_func); - if (!comparison_func(arr[left], pivot)) { - std::swap(arr[left], arr[--right]); - } - else { - ++left; - } - } - - if (left == right) - return left; /* less than vtype::numlanes elements in the array */ - - using zmm_t = typename vtype::zmm_t; - zmm_t pivot_vec = vtype::set1(pivot); - zmm_t min_vec = vtype::set1(*smallest); - zmm_t max_vec = vtype::set1(*biggest); - - if (right - left == vtype::numlanes) { - zmm_t vec = vtype::loadu(arr + left); - int32_t amount_gt_pivot = partition_vec(arr, - left, - left + vtype::numlanes, - vec, - pivot_vec, - &min_vec, - &max_vec); - *smallest = vtype::reducemin(min_vec); - *biggest = vtype::reducemax(max_vec); - return left + (vtype::numlanes - amount_gt_pivot); - } - - // first and last vtype::numlanes values are partitioned at the end - zmm_t vec_left = vtype::loadu(arr + left); - zmm_t vec_right = vtype::loadu(arr + (right - vtype::numlanes)); - // store points of the vectors - int64_t r_store = right - vtype::numlanes; - int64_t l_store = left; - // indices for loading the elements - left += vtype::numlanes; - right -= vtype::numlanes; - while (right - left != 0) { - zmm_t curr_vec; - /* - * if fewer elements are stored on the right side of the array, - * then next elements are loaded from the right side, - * otherwise from the left side - */ - if ((r_store + vtype::numlanes) - right < left - l_store) { - right -= vtype::numlanes; - curr_vec = vtype::loadu(arr + right); - } - else { - curr_vec = vtype::loadu(arr + left); - left += vtype::numlanes; - } - // partition the current vector and save it on both sides of the array - int32_t amount_gt_pivot - = partition_vec(arr, - l_store, - r_store + vtype::numlanes, - curr_vec, - pivot_vec, - &min_vec, - &max_vec); - ; - r_store -= amount_gt_pivot; - l_store += (vtype::numlanes - amount_gt_pivot); - } - - /* partition and save vec_left and vec_right */ - int32_t amount_gt_pivot = partition_vec(arr, - l_store, - r_store + vtype::numlanes, - vec_left, - pivot_vec, - &min_vec, - 
&max_vec); - l_store += (vtype::numlanes - amount_gt_pivot); - amount_gt_pivot = partition_vec(arr, - l_store, - l_store + vtype::numlanes, - vec_right, - pivot_vec, - &min_vec, - &max_vec); - l_store += (vtype::numlanes - amount_gt_pivot); - *smallest = vtype::reducemin(min_vec); - *biggest = vtype::reducemax(max_vec); - return l_store; -} -#endif // __AVX512_QSORT_COMMON__ -- cgit v1.2.1 From e7240dcaf24aebca83c3f642a12fa070a557b9c4 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 31 Jan 2023 10:48:15 -0800 Subject: Add x86 simd sort dispatch files to meson.build --- numpy/core/meson.build | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/numpy/core/meson.build b/numpy/core/meson.build index 27d7ab851..74d983dbb 100644 --- a/numpy/core/meson.build +++ b/numpy/core/meson.build @@ -718,7 +718,8 @@ src_multiarray = [ 'src/multiarray/usertypes.c', 'src/multiarray/vdot.c', src_file.process('src/common/npy_sort.h.src'), - 'src/npysort/x86-qsort.dispatch.cpp', + 'src/npysort/x86-qsort-skx.dispatch.cpp', + 'src/npysort/x86-qsort-icl.dispatch.cpp', 'src/npysort/quicksort.cpp', 'src/npysort/mergesort.cpp', 'src/npysort/timsort.cpp', -- cgit v1.2.1 From a2f048f4886ef3bde2caef134a89c73a84163764 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 31 Jan 2023 12:46:47 -0800 Subject: Fetch submodules in macOS and Windows build --- azure-pipelines.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 18b72f490..9a95aad5f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -184,6 +184,9 @@ stages: - script: /bin/bash -c "! vulture . --min-confidence 100 --exclude doc/,numpy/distutils/ | grep 'unreachable'" displayName: 'Check for unreachable code paths in Python modules' + - script: git submodules update --init + displayName: 'Fetch submodules' + # prefer usage of clang over gcc proper # to match likely scenario on many user mac machines - script: python setup.py build -j 4 build_src --verbose-cfg install @@ -287,6 +290,7 @@ stages: steps: - template: azure-steps-windows.yml + submodules: true - job: Linux_conda -- cgit v1.2.1 From a5d416bd60ce1067108e99951131768dfd9ee440 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 31 Jan 2023 12:56:17 -0800 Subject: Update to latest commit x86-simd-sort --- numpy/core/src/npysort/x86-simd-sort | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/core/src/npysort/x86-simd-sort b/numpy/core/src/npysort/x86-simd-sort index 0f1023bd0..7d7591cf5 160000 --- a/numpy/core/src/npysort/x86-simd-sort +++ b/numpy/core/src/npysort/x86-simd-sort @@ -1 +1 @@ -Subproject commit 0f1023bd0ffdabfe22883b85d4dfe55a6ed6ad3f +Subproject commit 7d7591cf5927e83e4a1e7c4b6f2c4dc91a97889f -- cgit v1.2.1 From 774edbd8a572067556e9860d5e5c23f73107421a Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 1 Feb 2023 11:16:13 -0800 Subject: Fix azure-pipelines.yml to checkout submodules --- azure-pipelines.yml | 3 +-- azure-steps-windows.yml | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 9a95aad5f..7657ab87f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -184,7 +184,7 @@ stages: - script: /bin/bash -c "! vulture . 
--min-confidence 100 --exclude doc/,numpy/distutils/ | grep 'unreachable'" displayName: 'Check for unreachable code paths in Python modules' - - script: git submodules update --init + - script: git submodule update --init displayName: 'Fetch submodules' # prefer usage of clang over gcc proper @@ -290,7 +290,6 @@ stages: steps: - template: azure-steps-windows.yml - submodules: true - job: Linux_conda diff --git a/azure-steps-windows.yml b/azure-steps-windows.yml index 318f46398..a147ffd7a 100644 --- a/azure-steps-windows.yml +++ b/azure-steps-windows.yml @@ -1,4 +1,6 @@ steps: +- script: git submodule update --init + displayName: 'Fetch submodules' - task: UsePythonVersion@0 inputs: versionSpec: $(PYTHON_VERSION) -- cgit v1.2.1 From b358ba4fb3c42f296466d5a6271d253e7abb7db0 Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Tue, 7 Feb 2023 17:12:29 +0200 Subject: ENH: Towards modern C++ This patch initializes new C++ headers and also brings new namespace `np::` to break away from the current approach of using C++ which tends not to be drawn into modernity. --- numpy/core/src/common/common.hpp | 11 +++++++ numpy/core/src/common/half.hpp | 63 ++++++++++++++++++++++++++++++++++++++++ numpy/core/src/common/meta.hpp | 54 ++++++++++++++++++++++++++++++++++ numpy/core/src/common/npstd.hpp | 54 ++++++++++++++++++++++++++++++++++ 4 files changed, 182 insertions(+) create mode 100644 numpy/core/src/common/common.hpp create mode 100644 numpy/core/src/common/half.hpp create mode 100644 numpy/core/src/common/meta.hpp create mode 100644 numpy/core/src/common/npstd.hpp diff --git a/numpy/core/src/common/common.hpp b/numpy/core/src/common/common.hpp new file mode 100644 index 000000000..47d790bcf --- /dev/null +++ b/numpy/core/src/common/common.hpp @@ -0,0 +1,11 @@ +#ifndef NUMPY_CORE_SRC_COMMON_COMMON_HPP +#define NUMPY_CORE_SRC_COMMON_COMMON_HPP +/* + * The following C++ headers are safe to be used standalone, however, + * they are gathered to make it easy for us and for the future need to support PCH. + */ +#include "npstd.hpp" +#include "half.hpp" +#include "meta.hpp" + +#endif // NUMPY_CORE_SRC_COMMON_COMMON_HPP diff --git a/numpy/core/src/common/half.hpp b/numpy/core/src/common/half.hpp new file mode 100644 index 000000000..399f2fa79 --- /dev/null +++ b/numpy/core/src/common/half.hpp @@ -0,0 +1,63 @@ +#ifndef NUMPY_CORE_SRC_COMMON_HALF_HPP +#define NUMPY_CORE_SRC_COMMON_HALF_HPP + +#include "npstd.hpp" + +// TODO(@seiko2plus): +// - covers half-precision operations that being supported by numpy/halffloat.h +// - support __fp16 +// - optimize x86 half<->single via cpu_fp16 +// - optimize ppc64 half<->single via cpu_vsx3 + +namespace np { + +/// @addtogroup cpp_core_types +/// @{ + +/// Provides a type that implements 16-bit floating point (half-precision). +/// This type is ensured to be 16-bit size. +class Half final { + public: + /// @name Public Constructors + /// @{ + + /// Default constructor. initialize nothing. + Half() = default; + /// Copy. + Half(const Half &r) + { + data_.u = r.data_.u; + } + + /// @} + + /// Returns a new Half constracted from the IEEE 754 binary16. + /// @param b the value of binary16. + static Half FromBits(uint16_t b) + { + Half f; + f.data_.u = b; + return f; + } + /// Returns the IEEE 754 binary16 representation. 
+ uint16_t Bits() const + { + return data_.u; + } + + private: + union { + uint16_t u; +/* +TODO(@seiko2plus): support __fp16 +#ifdef NPY_HAVE_HW_FP16 + __fp16 f; +#endif +*/ + } data_; +}; + +/// @} cpp_core_types + +} // namespace np +#endif // NUMPY_CORE_SRC_COMMON_HALF_HPP diff --git a/numpy/core/src/common/meta.hpp b/numpy/core/src/common/meta.hpp new file mode 100644 index 000000000..27ea1857e --- /dev/null +++ b/numpy/core/src/common/meta.hpp @@ -0,0 +1,54 @@ +#ifndef NUMPY_CORE_SRC_COMMON_META_HPP +#define NUMPY_CORE_SRC_COMMON_META_HPP + +#include "npstd.hpp" + +namespace np { namespace meta { +/// @addtogroup cpp_core_meta +/// @{ + +namespace details { +template +struct IntBySize; + +template +struct IntBySize { + using Type = typename std::conditional< + unsig, uint8_t, int8_t>::type; +}; +template +struct IntBySize { + using Type = typename std::conditional< + unsig, uint16_t, int16_t>::type; +}; +template +struct IntBySize { + using Type = typename std::conditional< + unsig, uint32_t, int32_t>::type; +}; +template +struct IntBySize { + using Type = typename std::conditional< + unsig, uint64_t, int64_t>::type; +}; +} // namespace details + +/// Provides safe conversion of any integer type synonyms +/// to a fixed-width integer type. +template +struct FixedWidth { + using TF_ = typename details::IntBySize< + sizeof(T), std::is_unsigned::value + >::Type; + + using Type = typename std::conditional< + std::is_integral::value, TF_, T + >::type; +}; + +/// @} cpp_core_meta + +}} // namespace np::meta + +#endif // NUMPY_CORE_SRC_COMMON_META_HPP + diff --git a/numpy/core/src/common/npstd.hpp b/numpy/core/src/common/npstd.hpp new file mode 100644 index 000000000..71993bd7c --- /dev/null +++ b/numpy/core/src/common/npstd.hpp @@ -0,0 +1,54 @@ +#ifndef NUMPY_CORE_SRC_COMMON_NPSTD_HPP +#define NUMPY_CORE_SRC_COMMON_NPSTD_HPP + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "npy_config.h" + +namespace np { +/// @addtogroup cpp_core_types +/// @{ +using std::uint8_t; +using std::int8_t; +using std::uint16_t; +using std::int16_t; +using std::uint32_t; +using std::int32_t; +using std::uint64_t; +using std::int64_t; +using std::uintptr_t; +using std::intptr_t; +using std::complex; + +/** Guard for long double. + * + * The C implementation defines long double as double + * on MinGW to provide compatibility with MSVC to unify + * one behavior under Windows OS, which makes npy_longdouble + * not fit to be used with template specialization or overloading. + * + * This type will be set to `void` when `npy_longdouble` is not defined + * as `long double`. 
+ */ +using LongDouble = typename std::conditional< + !std::is_same::value, + void, npy_longdouble +>::type; +/// @} cpp_core_types + +} // namespace np + +#endif // NUMPY_CORE_SRC_COMMON_NPSTD_HPP + -- cgit v1.2.1 From 6d26364d4ca94f86acf7c813d3a69431a75455d0 Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Tue, 7 Feb 2023 17:15:10 +0200 Subject: ENH, SIMD: reimplement CPU dispatching of qsort For a Few C++ More --- numpy/core/meson.build | 4 +- numpy/core/setup.py | 4 +- numpy/core/src/npysort/quicksort.cpp | 238 +++++---------------- numpy/core/src/npysort/simd_qsort.dispatch.cpp | 44 ++++ numpy/core/src/npysort/simd_qsort.hpp | 19 ++ .../core/src/npysort/simd_qsort_16bit.dispatch.cpp | 31 +++ numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp | 35 --- numpy/core/src/npysort/x86-qsort-icl.h | 27 --- numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp | 54 ----- numpy/core/src/npysort/x86-qsort-skx.h | 37 ---- 10 files changed, 155 insertions(+), 338 deletions(-) create mode 100644 numpy/core/src/npysort/simd_qsort.dispatch.cpp create mode 100644 numpy/core/src/npysort/simd_qsort.hpp create mode 100644 numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp delete mode 100644 numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp delete mode 100644 numpy/core/src/npysort/x86-qsort-icl.h delete mode 100644 numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp delete mode 100644 numpy/core/src/npysort/x86-qsort-skx.h diff --git a/numpy/core/meson.build b/numpy/core/meson.build index 74d983dbb..05f286a50 100644 --- a/numpy/core/meson.build +++ b/numpy/core/meson.build @@ -718,8 +718,8 @@ src_multiarray = [ 'src/multiarray/usertypes.c', 'src/multiarray/vdot.c', src_file.process('src/common/npy_sort.h.src'), - 'src/npysort/x86-qsort-skx.dispatch.cpp', - 'src/npysort/x86-qsort-icl.dispatch.cpp', + 'src/npysort/simd_qsort.dispatch.cpp', + 'src/npysort/simd_qsort_16bit.dispatch.cpp', 'src/npysort/quicksort.cpp', 'src/npysort/mergesort.cpp', 'src/npysort/timsort.cpp', diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 3ab00205f..cfae34e31 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -979,8 +979,8 @@ def configuration(parent_package='',top_path=None): if enable_avx512_qsort(): multiarray_src += [ - join('src', 'npysort', 'x86-qsort-skx.dispatch.cpp'), - join('src', 'npysort', 'x86-qsort-icl.dispatch.cpp'), + join('src', 'npysort', 'simd_qsort.dispatch.cpp'), + join('src', 'npysort', 'simd_qsort_16bit.dispatch.cpp'), ] ####################################################################### diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp index f2cada873..0e65dc9bc 100644 --- a/numpy/core/src/npysort/quicksort.cpp +++ b/numpy/core/src/npysort/quicksort.cpp @@ -54,6 +54,7 @@ #include "npysort_common.h" #include "npysort_heapsort.h" #include "numpy_tag.h" +#include "simd_qsort.hpp" #include #include @@ -68,197 +69,39 @@ #define SMALL_MERGESORT 20 #define SMALL_STRING 16 +template +inline bool quicksort_dispatch(T *start, npy_intp num) +{ + using TF = typename np::meta::FixedWidth::Type; + void (*dispfunc)(TF*, intptr_t) = nullptr; + if (sizeof(T) == sizeof(uint16_t)) { + #ifndef NPY_DISABLE_OPTIMIZATION + #include "simd_qsort_16bit.dispatch.h" + #endif + NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, ); + } + else if (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) { + #ifndef NPY_DISABLE_OPTIMIZATION + #include "simd_qsort.dispatch.h" + #endif + NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, 
); + } + if (dispfunc) { + (*dispfunc)(reinterpret_cast(start), static_cast(num)); + return true; + } + return false; +} /* ***************************************************************************** ** NUMERIC SORTS ** ***************************************************************************** */ -namespace { - -template -struct x86_dispatch { - static bool quicksort(typename Tag::type *, npy_intp) { return false; } -}; - -// Currently disabled on WIN32 only -#ifdef NPY_ENABLE_AVX512_QSORT -#include "x86-qsort-skx.h" -#include "x86-qsort-icl.h" - -#ifndef NPY_DISABLE_OPTIMIZATION -#include "x86-qsort-skx.dispatch.h" -#endif - -#if NPY_SIZEOF_LONG == 8 -template <> -struct x86_dispatch { - static bool quicksort(npy_long *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_long); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; -template <> -struct x86_dispatch { - static bool quicksort(npy_ulong *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_ulong); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; -#elif NPY_SIZEOF_LONGLONG == 8 -template <> -struct x86_dispatch { - static bool quicksort(npy_longlong *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_long); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; -template <> -struct x86_dispatch { - static bool quicksort(npy_ulonglong *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_ulong); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; -#endif // NPY_SIZEOF_LONG - -template <> -struct x86_dispatch { - static bool quicksort(npy_double *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_double); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; - -template <> -struct x86_dispatch { - static bool quicksort(npy_int *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_int); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; - -template <> -struct x86_dispatch { - static bool quicksort(npy_uint *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_uint); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; - -template <> -struct x86_dispatch { - static bool quicksort(npy_float *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_float); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; - -#ifndef NPY_DISABLE_OPTIMIZATION -#include "x86-qsort-icl.dispatch.h" -#endif - -template <> -struct x86_dispatch { - static bool quicksort(npy_half *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_half); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; - - -template <> -struct x86_dispatch { - static bool quicksort(npy_short *start, npy_intp num) - { - void 
(*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_short); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; - -template <> -struct x86_dispatch { - static bool quicksort(npy_ushort *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_ushort); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; -#endif // NPY_ENABLE_AVX512_QSORT - -} // end namespace - template static int quicksort_(type *start, npy_intp num) { - if (x86_dispatch::quicksort(start, num)) - return 0; - type vp; type *pl = start; type *pr = pl + num - 1; @@ -851,56 +694,89 @@ quicksort_ubyte(void *start, npy_intp n, void *NPY_UNUSED(varr)) NPY_NO_EXPORT int quicksort_short(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_short *)start, n)) { + return 0; + } return quicksort_((npy_short *)start, n); } NPY_NO_EXPORT int quicksort_ushort(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_ushort *)start, n)) { + return 0; + } return quicksort_((npy_ushort *)start, n); } NPY_NO_EXPORT int quicksort_int(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_int *)start, n)) { + return 0; + } return quicksort_((npy_int *)start, n); } NPY_NO_EXPORT int quicksort_uint(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_uint *)start, n)) { + return 0; + } return quicksort_((npy_uint *)start, n); } NPY_NO_EXPORT int quicksort_long(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_long *)start, n)) { + return 0; + } return quicksort_((npy_long *)start, n); } NPY_NO_EXPORT int quicksort_ulong(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_ulong *)start, n)) { + return 0; + } return quicksort_((npy_ulong *)start, n); } NPY_NO_EXPORT int quicksort_longlong(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_longlong *)start, n)) { + return 0; + } return quicksort_((npy_longlong *)start, n); } NPY_NO_EXPORT int quicksort_ulonglong(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_ulonglong *)start, n)) { + return 0; + } return quicksort_((npy_ulonglong *)start, n); } NPY_NO_EXPORT int quicksort_half(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((np::Half *)start, n)) { + return 0; + } return quicksort_((npy_half *)start, n); } NPY_NO_EXPORT int quicksort_float(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_float *)start, n)) { + return 0; + } return quicksort_((npy_float *)start, n); } NPY_NO_EXPORT int quicksort_double(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_double *)start, n)) { + return 0; + } return quicksort_((npy_double *)start, n); } NPY_NO_EXPORT int diff --git a/numpy/core/src/npysort/simd_qsort.dispatch.cpp b/numpy/core/src/npysort/simd_qsort.dispatch.cpp new file mode 100644 index 000000000..36b5d799c --- /dev/null +++ b/numpy/core/src/npysort/simd_qsort.dispatch.cpp @@ -0,0 +1,44 @@ +/*@targets + * $maxopt $keep_baseline avx512_skx + */ +// policy $keep_baseline is used to avoid skip building avx512_skx +// when its part of baseline features (--cpu-baseline), since +// 'baseline' option isn't specified within targets. 
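// Reading guide for the dispatch path above (an illustrative note, not
// generated code): quicksort_dispatch<T> first maps T to a fixed-width type TF
// through np::meta::FixedWidth, so npy_int, npy_long, npy_longlong and friends
// all funnel into the same int32_t/int64_t instantiations. The generated
// headers simd_qsort.dispatch.h / simd_qsort_16bit.dispatch.h together with
// NPY_CPU_DISPATCH_CALL_XB then assign the target-specific build of
// np::qsort_simd::QSort<TF> (avx512_skx here, avx512_icl for the 16-bit
// flavour) to `dispfunc`, guarded by a runtime CPU-feature check; if no
// suitable target is available, dispfunc stays nullptr and the wrapper falls
// back to the scalar quicksort_ template. Roughly:
//
//     dispfunc = cpu_supports(avx512_skx) ? QSort<TF> built for avx512_skx
//                                         : nullptr;
//     if (dispfunc) (*dispfunc)(reinterpret_cast<TF *>(start), num);  // SIMD path
//     else          /* scalar quicksort_ fallback in the caller */;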
+ +#include "simd_qsort.hpp" + +#ifdef NPY_HAVE_AVX512_SKX + #include "avx512-32bit-qsort.hpp" + #include "avx512-64bit-qsort.hpp" +#endif + +namespace np { namespace qsort_simd { + +#ifdef NPY_HAVE_AVX512_SKX +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int32_t *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint32_t *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int64_t *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint64_t *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(float *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +#endif // NPY_HAVE_AVX512_SKX + +}} // namespace np::simd diff --git a/numpy/core/src/npysort/simd_qsort.hpp b/numpy/core/src/npysort/simd_qsort.hpp new file mode 100644 index 000000000..7cdee774d --- /dev/null +++ b/numpy/core/src/npysort/simd_qsort.hpp @@ -0,0 +1,19 @@ +#ifndef NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP +#define NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP + +#include "common.hpp" + +namespace np { namespace qsort_simd { + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "simd_qsort.dispatch.h" +#endif +NPY_CPU_DISPATCH_DECLARE(template void QSort, (T *arr, intptr_t size)) + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "simd_qsort_16bit.dispatch.h" +#endif +NPY_CPU_DISPATCH_DECLARE(template void QSort, (T *arr, intptr_t size)) + +} } // np::qsort_simd +#endif // NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP diff --git a/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp b/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp new file mode 100644 index 000000000..a816b8781 --- /dev/null +++ b/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp @@ -0,0 +1,31 @@ +/*@targets + * $maxopt $keep_baseline avx512_icl + */ +// policy $keep_baseline is used to avoid skip building avx512_skx +// when its part of baseline features (--cpu-baseline), since +// 'baseline' option isn't specified within targets. + +#include "simd_qsort.hpp" + +#ifdef NPY_HAVE_AVX512_ICL + #include "avx512-16bit-qsort.hpp" +#endif + +namespace np { namespace qsort_simd { + +#ifdef NPY_HAVE_AVX512_ICL +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(Half *arr, intptr_t size) +{ + avx512_qsort_fp16(reinterpret_cast(arr), size); +} +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint16_t *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int16_t *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +#endif // NPY_HAVE_AVX512_ICL + +}} // namespace np::qsort_simd diff --git a/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp b/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp deleted file mode 100644 index 3dce8a9b4..000000000 --- a/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp +++ /dev/null @@ -1,35 +0,0 @@ -/*@targets - * $maxopt $keep_baseline avx512_icl - */ -// policy $keep_baseline is used to avoid skip building avx512_skx -// when its part of baseline features (--cpu-baseline), since -// 'baseline' option isn't specified within targets. 
- -#include "x86-qsort-icl.h" -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#ifdef NPY_HAVE_AVX512_ICL -#include "avx512-16bit-qsort.hpp" - -/*************************************** - * C > C++ dispatch - ***************************************/ -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_half)(void *arr, npy_intp arrsize) -{ - avx512_qsort_fp16((npy_half*)arr, arrsize); -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_short)(void *arr, npy_intp arrsize) -{ - avx512_qsort((npy_short*)arr, arrsize); -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_ushort)(void *arr, npy_intp arrsize) -{ - avx512_qsort((npy_ushort*)arr, arrsize); -} - -#endif // NPY_HAVE_AVX512_ICL diff --git a/numpy/core/src/npysort/x86-qsort-icl.h b/numpy/core/src/npysort/x86-qsort-icl.h deleted file mode 100644 index 92cef9cbc..000000000 --- a/numpy/core/src/npysort/x86-qsort-icl.h +++ /dev/null @@ -1,27 +0,0 @@ -#include "numpy/npy_common.h" - -#include "npy_cpu_dispatch.h" - -#ifndef NPY_NO_EXPORT -#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN -#endif - -#ifndef NPY_DISABLE_OPTIMIZATION -#include "x86-qsort-icl.dispatch.h" -#endif - -#ifdef __cplusplus -extern "C" { -#endif -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_half, - (void *start, npy_intp num)) - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_short, - (void *start, npy_intp num)) - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_ushort, - (void *start, npy_intp num)) - -#ifdef __cplusplus -} -#endif diff --git a/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp b/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp deleted file mode 100644 index 521b198ce..000000000 --- a/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/*@targets - * $maxopt $keep_baseline avx512_skx - */ -// policy $keep_baseline is used to avoid skip building avx512_skx -// when its part of baseline features (--cpu-baseline), since -// 'baseline' option isn't specified within targets. 
- -#include "x86-qsort-skx.h" -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#ifdef NPY_HAVE_AVX512_SKX -#include "avx512-32bit-qsort.hpp" -#include "avx512-64bit-qsort.hpp" - -/*************************************** - * C > C++ dispatch - ***************************************/ -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_long)(void *arr, npy_intp arrsize) -{ - avx512_qsort((int64_t*)arr, arrsize); -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_ulong)(void *arr, npy_intp arrsize) -{ - avx512_qsort((uint64_t*)arr, arrsize); -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_double)(void *arr, npy_intp arrsize) -{ - avx512_qsort((npy_double*)arr, arrsize); -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_int)(void *arr, npy_intp arrsize) -{ - avx512_qsort((npy_int*)arr, arrsize); -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_uint)(void *arr, npy_intp arrsize) -{ - avx512_qsort((npy_uint*)arr, arrsize); -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_float)(void *arr, npy_intp arrsize) -{ - avx512_qsort((npy_float*)arr, arrsize); -} - -#endif // NPY_HAVE_AVX512_SKX diff --git a/numpy/core/src/npysort/x86-qsort-skx.h b/numpy/core/src/npysort/x86-qsort-skx.h deleted file mode 100644 index 9a5cb2c9d..000000000 --- a/numpy/core/src/npysort/x86-qsort-skx.h +++ /dev/null @@ -1,37 +0,0 @@ -#include "numpy/npy_common.h" - -#include "npy_cpu_dispatch.h" - -#ifndef NPY_NO_EXPORT -#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN -#endif - -#ifndef NPY_DISABLE_OPTIMIZATION -#include "x86-qsort-skx.dispatch.h" -#endif - -#ifdef __cplusplus -extern "C" { -#endif -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_long, - (void *start, npy_intp num)) - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_ulong, - (void *start, npy_intp num)) - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_double, - (void *start, npy_intp num)) - - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_int, - (void *start, npy_intp num)) - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_uint, - (void *start, npy_intp num)) - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_float, - (void *start, npy_intp num)) - -#ifdef __cplusplus -} -#endif -- cgit v1.2.1 From ba157435ab5c26350bb992149ae6a644a96ff06b Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Tue, 7 Feb 2023 17:16:56 +0200 Subject: ENH, SIMD: include npy_cpu_dipatch.h by npy_config.h To guarantee of having #defs NPY_HAVE_[CPU features] in the scope --- numpy/core/src/common/npy_config.h | 1 + 1 file changed, 1 insertion(+) diff --git a/numpy/core/src/common/npy_config.h b/numpy/core/src/common/npy_config.h index d6886c5ea..715b17777 100644 --- a/numpy/core/src/common/npy_config.h +++ b/numpy/core/src/common/npy_config.h @@ -2,6 +2,7 @@ #define NUMPY_CORE_SRC_COMMON_NPY_CONFIG_H_ #include "config.h" +#include "npy_cpu_dispatch.h" // brings NPY_HAVE_[CPU features] #include "numpy/numpyconfig.h" #include "numpy/utils.h" #include "numpy/npy_os.h" -- cgit v1.2.1 From 7ddb5daa866984caa78e3fa4b5cd4869f4ee94cf Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Tue, 7 Feb 2023 21:04:27 +0200 Subject: ENH, SIMD: removes #NPY_ENABLE_AVX512_QSORT and use #directives instead --- numpy/core/setup.py | 19 ++----------------- numpy/core/src/npysort/quicksort.cpp | 11 +++++++++++ numpy/core/src/npysort/simd_qsort.dispatch.cpp | 4 ++-- numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp | 4 ++-- 4 files changed, 17 insertions(+), 21 deletions(-) diff 
--git a/numpy/core/setup.py b/numpy/core/setup.py index cfae34e31..d6117f02d 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -68,14 +68,6 @@ class CallOnceOnly: out = copy.deepcopy(pickle.loads(self._check_complex)) return out -# Temporarily disable AVX512 sorting on WIN32 until we can figure -# out why it has test failures -def enable_avx512_qsort(): - enable = True - if "win32" in sysconfig.get_platform(): - enable = False - return enable - def can_link_svml(): """SVML library is supported only on x86_64 architecture and currently only on linux @@ -492,9 +484,6 @@ def configuration(parent_package='',top_path=None): if can_link_svml(): moredefs.append(('NPY_CAN_LINK_SVML', 1)) - if enable_avx512_qsort(): - moredefs.append(('NPY_ENABLE_AVX512_QSORT', 1)) - # Use bogus stride debug aid to flush out bugs where users use # strides of dimensions with length 1 to index a full contiguous # array. @@ -975,14 +964,10 @@ def configuration(parent_package='',top_path=None): # links to the arm64 npymath library, # see gh-22673 join('src', 'npymath', 'arm64_exports.c'), + join('src', 'npysort', 'simd_qsort.dispatch.cpp'), + join('src', 'npysort', 'simd_qsort_16bit.dispatch.cpp'), ] - if enable_avx512_qsort(): - multiarray_src += [ - join('src', 'npysort', 'simd_qsort.dispatch.cpp'), - join('src', 'npysort', 'simd_qsort_16bit.dispatch.cpp'), - ] - ####################################################################### # _multiarray_umath module - umath part # ####################################################################### diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp index 0e65dc9bc..625fdebbb 100644 --- a/numpy/core/src/npysort/quicksort.cpp +++ b/numpy/core/src/npysort/quicksort.cpp @@ -69,6 +69,15 @@ #define SMALL_MERGESORT 20 #define SMALL_STRING 16 +// Temporarily disable AVX512 sorting on WIN32 until we can figure +// out why it has test failures +#ifdef _MSC_VER +template +inline bool quicksort_dispatch(T*, npy_intp) +{ + return false; +} +#else template inline bool quicksort_dispatch(T *start, npy_intp num) { @@ -92,6 +101,8 @@ inline bool quicksort_dispatch(T *start, npy_intp num) } return false; } +#endif // _MSC_VER + /* ***************************************************************************** ** NUMERIC SORTS ** diff --git a/numpy/core/src/npysort/simd_qsort.dispatch.cpp b/numpy/core/src/npysort/simd_qsort.dispatch.cpp index 36b5d799c..c2ac5a2ae 100644 --- a/numpy/core/src/npysort/simd_qsort.dispatch.cpp +++ b/numpy/core/src/npysort/simd_qsort.dispatch.cpp @@ -7,14 +7,14 @@ #include "simd_qsort.hpp" -#ifdef NPY_HAVE_AVX512_SKX +#if defined(NPY_HAVE_AVX512_SKX) && !defined(_MSC_VER) #include "avx512-32bit-qsort.hpp" #include "avx512-64bit-qsort.hpp" #endif namespace np { namespace qsort_simd { -#ifdef NPY_HAVE_AVX512_SKX +#if defined(NPY_HAVE_AVX512_SKX) && !defined(_MSC_VER) template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int32_t *arr, intptr_t size) { avx512_qsort(arr, size); diff --git a/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp b/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp index a816b8781..673a2f81e 100644 --- a/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp +++ b/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp @@ -7,13 +7,13 @@ #include "simd_qsort.hpp" -#ifdef NPY_HAVE_AVX512_ICL +#if defined(NPY_HAVE_AVX512_ICL) && !defined(_MSC_VER) #include "avx512-16bit-qsort.hpp" #endif namespace np { namespace qsort_simd { -#ifdef NPY_HAVE_AVX512_ICL +#if defined(NPY_HAVE_AVX512_ICL) && 
!defined(_MSC_VER) template<> void NPY_CPU_DISPATCH_CURFX(QSort)(Half *arr, intptr_t size) { avx512_qsort_fp16(reinterpret_cast(arr), size); -- cgit v1.2.1 From 3e84a70000f27487f2cc680795620d92f2d9b3a4 Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Tue, 7 Feb 2023 21:20:09 +0200 Subject: fix up meson --- numpy/core/meson.build | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/numpy/core/meson.build b/numpy/core/meson.build index 05f286a50..fad6f462e 100644 --- a/numpy/core/meson.build +++ b/numpy/core/meson.build @@ -453,6 +453,11 @@ if cc.get_id() == 'msvc' staticlib_cflags += '-d2VolatileMetadata-' endif endif +# TODO: change to "feature" option in meson_options.txt? See +# https://mesonbuild.com/Build-options.html#build-options +if get_option('disable-simd-optimizations') + staticlib_cflags += '-DNPY_DISABLE_OPTIMIZATION' +endif npy_math_internal_h = custom_target( output: 'npy_math_internal.h', -- cgit v1.2.1 From 344fe0587ba0ed48e75eb358a3dfbbb27a013354 Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Tue, 7 Feb 2023 21:36:04 +0200 Subject: fix up up meson --- numpy/core/meson.build | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/numpy/core/meson.build b/numpy/core/meson.build index fad6f462e..eea31faac 100644 --- a/numpy/core/meson.build +++ b/numpy/core/meson.build @@ -599,7 +599,8 @@ np_core_dep = declare_dependency( '.', 'include', 'src/common', - ] + ], + compile_args: disable_simd_optimizations ) -- cgit v1.2.1 From 472a47f8ea9aa9ffe933c15ac4c0c148570b1781 Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Tue, 7 Feb 2023 21:56:43 +0200 Subject: No need for add x86-simd-sort as global directory --- numpy/core/setup.py | 1 - numpy/core/src/npysort/simd_qsort.dispatch.cpp | 4 ++-- numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/numpy/core/setup.py b/numpy/core/setup.py index d6117f02d..0793ad561 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -650,7 +650,6 @@ def configuration(parent_package='',top_path=None): config.add_include_dirs(join('src', 'multiarray')) config.add_include_dirs(join('src', 'umath')) config.add_include_dirs(join('src', 'npysort')) - config.add_include_dirs(join('src', 'npysort', 'x86-simd-sort', 'src')) config.add_include_dirs(join('src', '_simd')) config.add_define_macros([("NPY_INTERNAL_BUILD", "1")]) # this macro indicates that Numpy build is in process diff --git a/numpy/core/src/npysort/simd_qsort.dispatch.cpp b/numpy/core/src/npysort/simd_qsort.dispatch.cpp index c2ac5a2ae..101bb3dcc 100644 --- a/numpy/core/src/npysort/simd_qsort.dispatch.cpp +++ b/numpy/core/src/npysort/simd_qsort.dispatch.cpp @@ -8,8 +8,8 @@ #include "simd_qsort.hpp" #if defined(NPY_HAVE_AVX512_SKX) && !defined(_MSC_VER) - #include "avx512-32bit-qsort.hpp" - #include "avx512-64bit-qsort.hpp" + #include "x86-simd-sort/src/avx512-32bit-qsort.hpp" + #include "x86-simd-sort/src/avx512-64bit-qsort.hpp" #endif namespace np { namespace qsort_simd { diff --git a/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp b/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp index 673a2f81e..a6465a883 100644 --- a/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp +++ b/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp @@ -8,7 +8,7 @@ #include "simd_qsort.hpp" #if defined(NPY_HAVE_AVX512_ICL) && !defined(_MSC_VER) - #include "avx512-16bit-qsort.hpp" + #include "x86-simd-sort/src/avx512-16bit-qsort.hpp" #endif namespace np { namespace qsort_simd { -- cgit v1.2.1 From 
d07d5584fc63df10025190a4ea38c4863c1b1723 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli
Date: Tue, 7 Feb 2023 14:50:47 -0800
Subject: Disable on CYGWIN

---
 numpy/core/src/npysort/quicksort.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp
index 625fdebbb..7497ebaa3 100644
--- a/numpy/core/src/npysort/quicksort.cpp
+++ b/numpy/core/src/npysort/quicksort.cpp
@@ -69,9 +69,9 @@
 #define SMALL_MERGESORT 20
 #define SMALL_STRING 16
 
-// Temporarily disable AVX512 sorting on WIN32 until we can figure
-// out why it has test failures
-#ifdef _MSC_VER
+// Temporarily disable AVX512 sorting on WIN32 and CYGWIN until we can figure
+// out why it has test failures
+#if defined(_MSC_VER) || defined(__CYGWIN__)
 template <typename T>
 inline bool quicksort_dispatch(T*, npy_intp)
 {
@@ -101,7 +101,7 @@ inline bool quicksort_dispatch(T *start, npy_intp num)
     }
     return false;
 }
-#endif // _MSC_VER
+#endif // _MSC_VER || CYGWIN
 
 /*
  *****************************************************************************
--
cgit v1.2.1
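Taken together, the series replaces the per-type x86_quicksort_* C entry points with a single templated path: quicksort_dispatch normalizes NumPy's platform-dependent integer synonyms through np::meta::FixedWidth and reaches the AVX-512 kernels through np::qsort_simd::QSort only when the runtime CPU check succeeds. The snippet below is a minimal usage sketch of the two new np:: helpers on their own; it is illustrative rather than part of the patches, and it assumes it is compiled inside the NumPy source tree (so that common.hpp and npy_config.h resolve) with a C++11 compiler.

#include <cassert>
#include <type_traits>
#include "common.hpp"   // brings in npstd.hpp, half.hpp and meta.hpp

int main()
{
    // FixedWidth maps platform synonyms to fixed-width integers: `long` becomes
    // int32_t or int64_t depending on sizeof(long), which is what
    // quicksort_dispatch relies on to pick the right QSort instantiation.
    using FixedLong = np::meta::FixedWidth<long>::Type;
    static_assert(sizeof(FixedLong) == sizeof(long), "size is preserved");
    static_assert(std::is_integral<FixedLong>::value && std::is_signed<FixedLong>::value,
                  "long maps to a signed fixed-width integer");

    // np::Half only stores the raw IEEE 754 binary16 pattern; 0x3C00 encodes 1.0,
    // and the 16-bit sort path hands these raw bits straight to avx512_qsort_fp16.
    np::Half one = np::Half::FromBits(0x3C00);
    assert(one.Bits() == 0x3C00);
    return 0;
}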