From 49278b961b7254bc6a4aee478587c69682a3827e Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 19 Sep 2022 10:35:03 -0700 Subject: ENH: Add x86-simd-sort source files --- .../x86-simd-sort/src/avx512-16bit-qsort.hpp | 527 +++++++++++++ .../x86-simd-sort/src/avx512-32bit-qsort.hpp | 712 ++++++++++++++++++ .../x86-simd-sort/src/avx512-64bit-qsort.hpp | 820 +++++++++++++++++++++ .../x86-simd-sort/src/avx512-common-qsort.h | 218 ++++++ 4 files changed, 2277 insertions(+) create mode 100644 numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp create mode 100644 numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp create mode 100644 numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp create mode 100644 numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp new file mode 100644 index 000000000..1673eb5da --- /dev/null +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp @@ -0,0 +1,527 @@ +/******************************************************************* + * Copyright (C) 2022 Intel Corporation + * SPDX-License-Identifier: BSD-3-Clause + * Authors: Raghuveer Devulapalli + * ****************************************************************/ + +#ifndef __AVX512_QSORT_16BIT__ +#define __AVX512_QSORT_16BIT__ + +#include "avx512-common-qsort.h" + +/* + * Constants used in sorting 32 elements in a ZMM registers. Based on Bitonic + * sorting network (see + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) + */ +// ZMM register: 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +#define NETWORK_16BIT_1 \ + 24, 25, 26, 27, 28, 29, 30, 31, 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, \ + 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +#define NETWORK_16BIT_2 \ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, \ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +#define NETWORK_16BIT_3 \ + 27, 26, 25, 24, 31, 30, 29, 28, 19, 18, 17, 16, 23, 22, 21, 20, 11, 10, 9, \ + 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4 +#define NETWORK_16BIT_4 \ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, \ + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 +#define NETWORK_16BIT_5 \ + 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24, 7, 6, 5, \ + 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 +#define NETWORK_16BIT_6 \ + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, \ + 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 + +template <> +struct vector { + using type_t = int16_t; + using zmm_t = __m512i; + using ymm_t = __m256i; + using opmask_t = __mmask32; + static const uint8_t numlanes = 32; + + static type_t type_max() + { + return X86_SIMD_SORT_MAX_INT16; + } + static type_t type_min() + { + return X86_SIMD_SORT_MIN_INT16; + } + static zmm_t zmm_max() + { + return _mm512_set1_epi16(type_max()); + } + + static opmask_t knot_opmask(opmask_t x) + { + return _knot_mask32(x); + } + static opmask_t ge(zmm_t x, zmm_t y) + { + return _mm512_cmp_epi16_mask(x, y, _MM_CMPINT_NLT); + } + //template + //static zmm_t i64gather(__m512i index, void const *base) + //{ + // return _mm512_i64gather_epi64(index, base, scale); + //} + static zmm_t loadu(void const *mem) + { + return _mm512_loadu_si512(mem); + } + static zmm_t max(zmm_t x, zmm_t y) + { + return _mm512_max_epi16(x, y); + } + static void 
mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) + { + // AVX512_VBMI2 + return _mm512_mask_compressstoreu_epi16(mem, mask, x); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) + { + // AVX512BW + return _mm512_mask_loadu_epi16(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) + { + return _mm512_mask_mov_epi16(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_storeu_epi16(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) + { + return _mm512_min_epi16(x, y); + } + static zmm_t permutexvar(__m512i idx, zmm_t zmm) + { + return _mm512_permutexvar_epi16(idx, zmm); + } + static type_t reducemax(zmm_t v) + { + zmm_t lo = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 0)); + zmm_t hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 1)); + type_t lo_max = (type_t)_mm512_reduce_max_epi32(lo); + type_t hi_max = (type_t)_mm512_reduce_max_epi32(hi); + return std::max(lo_max, hi_max); + } + static type_t reducemin(zmm_t v) + { + zmm_t lo = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 0)); + zmm_t hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 1)); + type_t lo_min = (type_t)_mm512_reduce_min_epi32(lo); + type_t hi_min = (type_t)_mm512_reduce_min_epi32(hi); + return std::min(lo_min, hi_min); + } + static zmm_t set1(type_t v) + { + return _mm512_set1_epi16(v); + } + template + static zmm_t shuffle(zmm_t zmm) + { + zmm = _mm512_shufflehi_epi16(zmm, (_MM_PERM_ENUM)mask); + return _mm512_shufflelo_epi16(zmm, (_MM_PERM_ENUM)mask); + } + static void storeu(void *mem, zmm_t x) + { + return _mm512_storeu_si512(mem, x); + } +}; +template <> +struct vector { + using type_t = uint16_t; + using zmm_t = __m512i; + using ymm_t = __m256i; + using opmask_t = __mmask32; + static const uint8_t numlanes = 32; + + static type_t type_max() + { + return X86_SIMD_SORT_MAX_UINT16; + } + static type_t type_min() + { + return 0; + } + static zmm_t zmm_max() + { + return _mm512_set1_epi16(type_max()); + } // TODO: this should broadcast bits as is? 
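    /*
     * Annotation, not part of the original sources: these static wrappers
     * form the type-trait layer that the generic sorting-network templates
     * dispatch through. As an illustration, the per-lane compare-exchange
     * helper cmp_merge() in avx512-common-qsort.h is composed of exactly
     * three of them:
     *
     *   __m512i lo = vector<uint16_t>::min(a, b);  // per-lane smaller value
     *   __m512i hi = vector<uint16_t>::max(a, b);  // per-lane larger value
     *   // keep lo where the mask bit is 0, take hi where it is 1
     *   __m512i r  = vector<uint16_t>::mask_mov(lo, mask, hi);
     */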
+ + //template + //static zmm_t i64gather(__m512i index, void const *base) + //{ + // return _mm512_i64gather_epi64(index, base, scale); + //} + static opmask_t knot_opmask(opmask_t x) + { + return _knot_mask32(x); + } + static opmask_t ge(zmm_t x, zmm_t y) + { + return _mm512_cmp_epu16_mask(x, y, _MM_CMPINT_NLT); + } + static zmm_t loadu(void const *mem) + { + return _mm512_loadu_si512(mem); + } + static zmm_t max(zmm_t x, zmm_t y) + { + return _mm512_max_epu16(x, y); + } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_compressstoreu_epi16(mem, mask, x); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) + { + return _mm512_mask_loadu_epi16(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) + { + return _mm512_mask_mov_epi16(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_storeu_epi16(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) + { + return _mm512_min_epu16(x, y); + } + static zmm_t permutexvar(__m512i idx, zmm_t zmm) + { + return _mm512_permutexvar_epi16(idx, zmm); + } + static type_t reducemax(zmm_t v) + { + zmm_t lo = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 0)); + zmm_t hi = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 1)); + type_t lo_max = (type_t)_mm512_reduce_max_epi32(lo); + type_t hi_max = (type_t)_mm512_reduce_max_epi32(hi); + return std::max(lo_max, hi_max); + } + static type_t reducemin(zmm_t v) + { + zmm_t lo = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 0)); + zmm_t hi = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 1)); + type_t lo_min = (type_t)_mm512_reduce_min_epi32(lo); + type_t hi_min = (type_t)_mm512_reduce_min_epi32(hi); + return std::min(lo_min, hi_min); + } + static zmm_t set1(type_t v) + { + return _mm512_set1_epi16(v); + } + template + static zmm_t shuffle(zmm_t zmm) + { + zmm = _mm512_shufflehi_epi16(zmm, (_MM_PERM_ENUM)mask); + return _mm512_shufflelo_epi16(zmm, (_MM_PERM_ENUM)mask); + } + static void storeu(void *mem, zmm_t x) + { + return _mm512_storeu_si512(mem, x); + } +}; + +/* + * Assumes zmm is random and performs a full sorting network defined in + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg + */ +template +static inline zmm_t sort_zmm_16bit(zmm_t zmm) +{ + // Level 1 + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xAAAAAAAA); + // Level 2 + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xCCCCCCCC); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xAAAAAAAA); + // Level 3 + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_1), zmm), + 0xF0F0F0F0); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xCCCCCCCC); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xAAAAAAAA); + // Level 4 + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_2), zmm), + 0xFF00FF00); + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_3), zmm), + 0xF0F0F0F0); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xCCCCCCCC); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xAAAAAAAA); + // Level 5 + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), zmm), + 0xFFFF0000); + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_5), zmm), + 0xFF00FF00); + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_3), zmm), + 0xF0F0F0F0); + zmm = 
cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xCCCCCCCC); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xAAAAAAAA); + return zmm; +} + +// Assumes zmm is bitonic and performs a recursive half cleaner +template +static inline zmm_t bitonic_merge_zmm_16bit(zmm_t zmm) +{ + // 1) half_cleaner[32]: compare 1-17, 2-18, 3-19 etc .. + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_6), zmm), + 0xFFFF0000); + // 2) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc .. + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_5), zmm), + 0xFF00FF00); + // 3) half_cleaner[8] + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_3), zmm), + 0xF0F0F0F0); + // 3) half_cleaner[4] + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xCCCCCCCC); + // 3) half_cleaner[2] + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xAAAAAAAA); + return zmm; +} + +// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner +template +static inline void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2) +{ + // 1) First step of a merging network: coex of zmm1 and zmm2 reversed + zmm2 = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), zmm2); + zmm_t zmm3 = vtype::min(zmm1, zmm2); + zmm_t zmm4 = vtype::max(zmm1, zmm2); + // 2) Recursive half cleaner for each + zmm1 = bitonic_merge_zmm_16bit(zmm3); + zmm2 = bitonic_merge_zmm_16bit(zmm4); +} + +// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive +// half cleaner +template +static inline void bitonic_merge_four_zmm_16bit(zmm_t *zmm) +{ + zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), zmm[2]); + zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), zmm[3]); + zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r); + zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r); + zmm_t zmm_t3 = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), + vtype::max(zmm[1], zmm2r)); + zmm_t zmm_t4 = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), + vtype::max(zmm[0], zmm3r)); + zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2); + zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2); + zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4); + zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4); + zmm[0] = bitonic_merge_zmm_16bit(zmm0); + zmm[1] = bitonic_merge_zmm_16bit(zmm1); + zmm[2] = bitonic_merge_zmm_16bit(zmm2); + zmm[3] = bitonic_merge_zmm_16bit(zmm3); +} + +template +static inline void sort_32_16bit(type_t *arr, int32_t N) +{ + typename vtype::opmask_t load_mask = ((0x1ull << N) - 0x1ull) & 0xFFFFFFFF; + typename vtype::zmm_t zmm + = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr); + vtype::mask_storeu(arr, load_mask, sort_zmm_16bit(zmm)); +} + +template +static inline void sort_64_16bit(type_t *arr, int32_t N) +{ + if (N <= 32) { + sort_32_16bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + typename vtype::opmask_t load_mask + = ((0x1ull << (N - 32)) - 0x1ull) & 0xFFFFFFFF; + zmm_t zmm1 = vtype::loadu(arr); + zmm_t zmm2 = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr + 32); + zmm1 = sort_zmm_16bit(zmm1); + zmm2 = sort_zmm_16bit(zmm2); + bitonic_merge_two_zmm_16bit(zmm1, zmm2); + vtype::storeu(arr, zmm1); + vtype::mask_storeu(arr + 32, load_mask, zmm2); +} + +template +static inline void sort_128_16bit(type_t *arr, int32_t N) +{ + if (N <= 64) { + sort_64_16bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + using opmask_t = typename vtype::opmask_t; + zmm_t zmm[4]; + zmm[0] = vtype::loadu(arr); + zmm[1] = vtype::loadu(arr + 
32); + opmask_t load_mask1 = 0xFFFFFFFF, load_mask2 = 0xFFFFFFFF; + if (N != 128) { + uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull; + load_mask1 = combined_mask & 0xFFFFFFFF; + load_mask2 = (combined_mask >> 32) & 0xFFFFFFFF; + } + zmm[2] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 64); + zmm[3] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 96); + zmm[0] = sort_zmm_16bit(zmm[0]); + zmm[1] = sort_zmm_16bit(zmm[1]); + zmm[2] = sort_zmm_16bit(zmm[2]); + zmm[3] = sort_zmm_16bit(zmm[3]); + bitonic_merge_two_zmm_16bit(zmm[0], zmm[1]); + bitonic_merge_two_zmm_16bit(zmm[2], zmm[3]); + bitonic_merge_four_zmm_16bit(zmm); + vtype::storeu(arr, zmm[0]); + vtype::storeu(arr + 32, zmm[1]); + vtype::mask_storeu(arr + 64, load_mask1, zmm[2]); + vtype::mask_storeu(arr + 96, load_mask2, zmm[3]); +} + +template +static inline type_t +get_pivot_16bit(type_t *arr, const int64_t left, const int64_t right) +{ + // median of 32 + int64_t size = (right - left) / 32; + __m512i rand_vec = _mm512_set_epi16(arr[left], + arr[left + size], + arr[left + 2 * size], + arr[left + 3 * size], + arr[left + 4 * size], + arr[left + 5 * size], + arr[left + 6 * size], + arr[left + 7 * size], + arr[left + 8 * size], + arr[left + 9 * size], + arr[left + 10 * size], + arr[left + 11 * size], + arr[left + 12 * size], + arr[left + 13 * size], + arr[left + 14 * size], + arr[left + 15 * size], + arr[left + 16 * size], + arr[left + 17 * size], + arr[left + 18 * size], + arr[left + 19 * size], + arr[left + 20 * size], + arr[left + 21 * size], + arr[left + 22 * size], + arr[left + 23 * size], + arr[left + 24 * size], + arr[left + 25 * size], + arr[left + 26 * size], + arr[left + 27 * size], + arr[left + 28 * size], + arr[left + 29 * size], + arr[left + 30 * size], + arr[left + 31 * size]); + __m512i sort = sort_zmm_16bit(rand_vec); + return ((type_t *)&sort)[16]; +} + +template +static inline void +qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) +{ + /* + * Resort to std::sort if quicksort isnt making any progress + */ + if (max_iters <= 0) { + std::sort(arr + left, arr + right + 1); + return; + } + /* + * Base case: use bitonic networks to sort arrays <= 128 + */ + if (right + 1 - left <= 128) { + sort_128_16bit(arr + left, (int32_t)(right + 1 - left)); + return; + } + + type_t pivot = get_pivot_16bit(arr, left, right); + type_t smallest = vtype::type_max(); + type_t biggest = vtype::type_min(); + int64_t pivot_index = partition_avx512( + arr, left, right + 1, pivot, &smallest, &biggest); + if (pivot != smallest) + qsort_16bit_(arr, left, pivot_index - 1, max_iters - 1); + if (pivot != biggest) + qsort_16bit_(arr, pivot_index, right, max_iters - 1); +} + +template <> +void avx512_qsort(int16_t *arr, int64_t arrsize) +{ + if (arrsize > 1) { + qsort_16bit_, int16_t>( + arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qsort(uint16_t *arr, int64_t arrsize) +{ + if (arrsize > 1) { + qsort_16bit_, uint16_t>( + arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} +#endif // __AVX512_QSORT_16BIT__ diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp new file mode 100644 index 000000000..cbc5368f0 --- /dev/null +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp @@ -0,0 +1,712 @@ +/******************************************************************* + * Copyright (C) 2022 Intel Corporation + * Copyright (C) 2021 Serge Sans Paille + * 
SPDX-License-Identifier: BSD-3-Clause + * Authors: Raghuveer Devulapalli + * Serge Sans Paille + * ****************************************************************/ +#ifndef __AVX512_QSORT_32BIT__ +#define __AVX512_QSORT_32BIT__ + +#include "avx512-common-qsort.h" + +/* + * Constants used in sorting 16 elements in a ZMM registers. Based on Bitonic + * sorting network (see + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) + */ +#define NETWORK_32BIT_1 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 +#define NETWORK_32BIT_2 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 +#define NETWORK_32BIT_3 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +#define NETWORK_32BIT_4 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 +#define NETWORK_32BIT_5 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +#define NETWORK_32BIT_6 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4 +#define NETWORK_32BIT_7 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 + +template <> +struct vector { + using type_t = int32_t; + using zmm_t = __m512i; + using ymm_t = __m256i; + using opmask_t = __mmask16; + static const uint8_t numlanes = 16; + + static type_t type_max() + { + return X86_SIMD_SORT_MAX_INT32; + } + static type_t type_min() + { + return X86_SIMD_SORT_MIN_INT32; + } + static zmm_t zmm_max() + { + return _mm512_set1_epi32(type_max()); + } + + static opmask_t knot_opmask(opmask_t x) + { + return _knot_mask16(x); + } + static opmask_t ge(zmm_t x, zmm_t y) + { + return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_NLT); + } + template + static ymm_t i64gather(__m512i index, void const *base) + { + return _mm512_i64gather_epi32(index, base, scale); + } + static zmm_t merge(ymm_t y1, ymm_t y2) + { + zmm_t z1 = _mm512_castsi256_si512(y1); + return _mm512_inserti32x8(z1, y2, 1); + } + static zmm_t loadu(void const *mem) + { + return _mm512_loadu_si512(mem); + } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_compressstoreu_epi32(mem, mask, x); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) + { + return _mm512_mask_loadu_epi32(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) + { + return _mm512_mask_mov_epi32(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_storeu_epi32(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) + { + return _mm512_min_epi32(x, y); + } + static zmm_t max(zmm_t x, zmm_t y) + { + return _mm512_max_epi32(x, y); + } + static zmm_t permutexvar(__m512i idx, zmm_t zmm) + { + return _mm512_permutexvar_epi32(idx, zmm); + } + static type_t reducemax(zmm_t v) + { + return _mm512_reduce_max_epi32(v); + } + static type_t reducemin(zmm_t v) + { + return _mm512_reduce_min_epi32(v); + } + static zmm_t set1(type_t v) + { + return _mm512_set1_epi32(v); + } + template + static zmm_t shuffle(zmm_t zmm) + { + return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask); + } + static void storeu(void *mem, zmm_t x) + { + return _mm512_storeu_si512(mem, x); + } + + static ymm_t max(ymm_t x, ymm_t y) + { + return _mm256_max_epi32(x, y); + } + static ymm_t min(ymm_t x, ymm_t y) + { + return _mm256_min_epi32(x, y); + } +}; +template <> +struct vector { + using type_t = uint32_t; + using zmm_t = __m512i; + using ymm_t = __m256i; + using opmask_t = __mmask16; + static const uint8_t numlanes = 16; + + static type_t type_max() + { + return X86_SIMD_SORT_MAX_UINT32; + } + static type_t type_min() + { + return 0; + } + static zmm_t 
zmm_max() + { + return _mm512_set1_epi32(type_max()); + } // TODO: this should broadcast bits as is? + + template + static ymm_t i64gather(__m512i index, void const *base) + { + return _mm512_i64gather_epi32(index, base, scale); + } + static zmm_t merge(ymm_t y1, ymm_t y2) + { + zmm_t z1 = _mm512_castsi256_si512(y1); + return _mm512_inserti32x8(z1, y2, 1); + } + static opmask_t knot_opmask(opmask_t x) + { + return _knot_mask16(x); + } + static opmask_t ge(zmm_t x, zmm_t y) + { + return _mm512_cmp_epu32_mask(x, y, _MM_CMPINT_NLT); + } + static zmm_t loadu(void const *mem) + { + return _mm512_loadu_si512(mem); + } + static zmm_t max(zmm_t x, zmm_t y) + { + return _mm512_max_epu32(x, y); + } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_compressstoreu_epi32(mem, mask, x); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) + { + return _mm512_mask_loadu_epi32(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) + { + return _mm512_mask_mov_epi32(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_storeu_epi32(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) + { + return _mm512_min_epu32(x, y); + } + static zmm_t permutexvar(__m512i idx, zmm_t zmm) + { + return _mm512_permutexvar_epi32(idx, zmm); + } + static type_t reducemax(zmm_t v) + { + return _mm512_reduce_max_epu32(v); + } + static type_t reducemin(zmm_t v) + { + return _mm512_reduce_min_epu32(v); + } + static zmm_t set1(type_t v) + { + return _mm512_set1_epi32(v); + } + template + static zmm_t shuffle(zmm_t zmm) + { + return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask); + } + static void storeu(void *mem, zmm_t x) + { + return _mm512_storeu_si512(mem, x); + } + + static ymm_t max(ymm_t x, ymm_t y) + { + return _mm256_max_epu32(x, y); + } + static ymm_t min(ymm_t x, ymm_t y) + { + return _mm256_min_epu32(x, y); + } +}; +template <> +struct vector { + using type_t = float; + using zmm_t = __m512; + using ymm_t = __m256; + using opmask_t = __mmask16; + static const uint8_t numlanes = 16; + + static type_t type_max() + { + return X86_SIMD_SORT_INFINITYF; + } + static type_t type_min() + { + return -X86_SIMD_SORT_INFINITYF; + } + static zmm_t zmm_max() + { + return _mm512_set1_ps(type_max()); + } + + static opmask_t knot_opmask(opmask_t x) + { + return _knot_mask16(x); + } + static opmask_t ge(zmm_t x, zmm_t y) + { + return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); + } + template + static ymm_t i64gather(__m512i index, void const *base) + { + return _mm512_i64gather_ps(index, base, scale); + } + static zmm_t merge(ymm_t y1, ymm_t y2) + { + zmm_t z1 = _mm512_castsi512_ps( + _mm512_castsi256_si512(_mm256_castps_si256(y1))); + return _mm512_insertf32x8(z1, y2, 1); + } + static zmm_t loadu(void const *mem) + { + return _mm512_loadu_ps(mem); + } + static zmm_t max(zmm_t x, zmm_t y) + { + return _mm512_max_ps(x, y); + } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_compressstoreu_ps(mem, mask, x); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) + { + return _mm512_mask_loadu_ps(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) + { + return _mm512_mask_mov_ps(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_storeu_ps(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) + { + return _mm512_min_ps(x, y); + } + static zmm_t permutexvar(__m512i idx, zmm_t 
zmm) + { + return _mm512_permutexvar_ps(idx, zmm); + } + static type_t reducemax(zmm_t v) + { + return _mm512_reduce_max_ps(v); + } + static type_t reducemin(zmm_t v) + { + return _mm512_reduce_min_ps(v); + } + static zmm_t set1(type_t v) + { + return _mm512_set1_ps(v); + } + template + static zmm_t shuffle(zmm_t zmm) + { + return _mm512_shuffle_ps(zmm, zmm, (_MM_PERM_ENUM)mask); + } + static void storeu(void *mem, zmm_t x) + { + return _mm512_storeu_ps(mem, x); + } + + static ymm_t max(ymm_t x, ymm_t y) + { + return _mm256_max_ps(x, y); + } + static ymm_t min(ymm_t x, ymm_t y) + { + return _mm256_min_ps(x, y); + } +}; + +/* + * Assumes zmm is random and performs a full sorting network defined in + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg + */ +template +static inline zmm_t sort_zmm_32bit(zmm_t zmm) +{ + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xAAAA); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xCCCC); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xAAAA); + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_3), zmm), + 0xF0F0); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xCCCC); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xAAAA); + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm), + 0xFF00); + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_6), zmm), + 0xF0F0); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xCCCC); + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xAAAA); + return zmm; +} + +// Assumes zmm is bitonic and performs a recursive half cleaner +template +static inline zmm_t bitonic_merge_zmm_32bit(zmm_t zmm) +{ + // 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc .. + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_7), zmm), + 0xFF00); + // 2) half_cleaner[8]: compare 1-5, 2-6, 3-7 etc .. 
+ zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_6), zmm), + 0xF0F0); + // 3) half_cleaner[4] + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xCCCC); + // 3) half_cleaner[1] + zmm = cmp_merge( + zmm, + vtype::template shuffle(zmm), + 0xAAAA); + return zmm; +} + +// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner +template +static inline void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2) +{ + // 1) First step of a merging network: coex of zmm1 and zmm2 reversed + *zmm2 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), *zmm2); + zmm_t zmm3 = vtype::min(*zmm1, *zmm2); + zmm_t zmm4 = vtype::max(*zmm1, *zmm2); + // 2) Recursive half cleaner for each + *zmm1 = bitonic_merge_zmm_32bit(zmm3); + *zmm2 = bitonic_merge_zmm_32bit(zmm4); +} + +// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive +// half cleaner +template +static inline void bitonic_merge_four_zmm_32bit(zmm_t *zmm) +{ + zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[2]); + zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[3]); + zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r); + zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r); + zmm_t zmm_t3 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), + vtype::max(zmm[1], zmm2r)); + zmm_t zmm_t4 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), + vtype::max(zmm[0], zmm3r)); + zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2); + zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2); + zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4); + zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4); + zmm[0] = bitonic_merge_zmm_32bit(zmm0); + zmm[1] = bitonic_merge_zmm_32bit(zmm1); + zmm[2] = bitonic_merge_zmm_32bit(zmm2); + zmm[3] = bitonic_merge_zmm_32bit(zmm3); +} + +template +static inline void bitonic_merge_eight_zmm_32bit(zmm_t *zmm) +{ + zmm_t zmm4r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[4]); + zmm_t zmm5r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[5]); + zmm_t zmm6r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[6]); + zmm_t zmm7r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[7]); + zmm_t zmm_t1 = vtype::min(zmm[0], zmm7r); + zmm_t zmm_t2 = vtype::min(zmm[1], zmm6r); + zmm_t zmm_t3 = vtype::min(zmm[2], zmm5r); + zmm_t zmm_t4 = vtype::min(zmm[3], zmm4r); + zmm_t zmm_t5 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), + vtype::max(zmm[3], zmm4r)); + zmm_t zmm_t6 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), + vtype::max(zmm[2], zmm5r)); + zmm_t zmm_t7 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), + vtype::max(zmm[1], zmm6r)); + zmm_t zmm_t8 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), + vtype::max(zmm[0], zmm7r)); + COEX(zmm_t1, zmm_t3); + COEX(zmm_t2, zmm_t4); + COEX(zmm_t5, zmm_t7); + COEX(zmm_t6, zmm_t8); + COEX(zmm_t1, zmm_t2); + COEX(zmm_t3, zmm_t4); + COEX(zmm_t5, zmm_t6); + COEX(zmm_t7, zmm_t8); + zmm[0] = bitonic_merge_zmm_32bit(zmm_t1); + zmm[1] = bitonic_merge_zmm_32bit(zmm_t2); + zmm[2] = bitonic_merge_zmm_32bit(zmm_t3); + zmm[3] = bitonic_merge_zmm_32bit(zmm_t4); + zmm[4] = bitonic_merge_zmm_32bit(zmm_t5); + zmm[5] = bitonic_merge_zmm_32bit(zmm_t6); + zmm[6] = bitonic_merge_zmm_32bit(zmm_t7); + zmm[7] = bitonic_merge_zmm_32bit(zmm_t8); +} + +template +static inline void sort_16_32bit(type_t *arr, int32_t N) +{ + typename vtype::opmask_t load_mask = (0x0001 << N) - 0x0001; + typename vtype::zmm_t zmm + = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr); + vtype::mask_storeu(arr, 
load_mask, sort_zmm_32bit(zmm)); +} + +template +static inline void sort_32_32bit(type_t *arr, int32_t N) +{ + if (N <= 16) { + sort_16_32bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + zmm_t zmm1 = vtype::loadu(arr); + typename vtype::opmask_t load_mask = (0x0001 << (N - 16)) - 0x0001; + zmm_t zmm2 = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr + 16); + zmm1 = sort_zmm_32bit(zmm1); + zmm2 = sort_zmm_32bit(zmm2); + bitonic_merge_two_zmm_32bit(&zmm1, &zmm2); + vtype::storeu(arr, zmm1); + vtype::mask_storeu(arr + 16, load_mask, zmm2); +} + +template +static inline void sort_64_32bit(type_t *arr, int32_t N) +{ + if (N <= 32) { + sort_32_32bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + using opmask_t = typename vtype::opmask_t; + zmm_t zmm[4]; + zmm[0] = vtype::loadu(arr); + zmm[1] = vtype::loadu(arr + 16); + opmask_t load_mask1 = 0xFFFF, load_mask2 = 0xFFFF; + uint64_t combined_mask = (0x1ull << (N - 32)) - 0x1ull; + load_mask1 &= combined_mask & 0xFFFF; + load_mask2 &= (combined_mask >> 16) & 0xFFFF; + zmm[2] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 32); + zmm[3] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 48); + zmm[0] = sort_zmm_32bit(zmm[0]); + zmm[1] = sort_zmm_32bit(zmm[1]); + zmm[2] = sort_zmm_32bit(zmm[2]); + zmm[3] = sort_zmm_32bit(zmm[3]); + bitonic_merge_two_zmm_32bit(&zmm[0], &zmm[1]); + bitonic_merge_two_zmm_32bit(&zmm[2], &zmm[3]); + bitonic_merge_four_zmm_32bit(zmm); + vtype::storeu(arr, zmm[0]); + vtype::storeu(arr + 16, zmm[1]); + vtype::mask_storeu(arr + 32, load_mask1, zmm[2]); + vtype::mask_storeu(arr + 48, load_mask2, zmm[3]); +} + +template +static inline void sort_128_32bit(type_t *arr, int32_t N) +{ + if (N <= 64) { + sort_64_32bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + using opmask_t = typename vtype::opmask_t; + zmm_t zmm[8]; + zmm[0] = vtype::loadu(arr); + zmm[1] = vtype::loadu(arr + 16); + zmm[2] = vtype::loadu(arr + 32); + zmm[3] = vtype::loadu(arr + 48); + zmm[0] = sort_zmm_32bit(zmm[0]); + zmm[1] = sort_zmm_32bit(zmm[1]); + zmm[2] = sort_zmm_32bit(zmm[2]); + zmm[3] = sort_zmm_32bit(zmm[3]); + opmask_t load_mask1 = 0xFFFF, load_mask2 = 0xFFFF; + opmask_t load_mask3 = 0xFFFF, load_mask4 = 0xFFFF; + if (N != 128) { + uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull; + load_mask1 &= combined_mask & 0xFFFF; + load_mask2 &= (combined_mask >> 16) & 0xFFFF; + load_mask3 &= (combined_mask >> 32) & 0xFFFF; + load_mask4 &= (combined_mask >> 48) & 0xFFFF; + } + zmm[4] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 64); + zmm[5] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 80); + zmm[6] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 96); + zmm[7] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 112); + zmm[4] = sort_zmm_32bit(zmm[4]); + zmm[5] = sort_zmm_32bit(zmm[5]); + zmm[6] = sort_zmm_32bit(zmm[6]); + zmm[7] = sort_zmm_32bit(zmm[7]); + bitonic_merge_two_zmm_32bit(&zmm[0], &zmm[1]); + bitonic_merge_two_zmm_32bit(&zmm[2], &zmm[3]); + bitonic_merge_two_zmm_32bit(&zmm[4], &zmm[5]); + bitonic_merge_two_zmm_32bit(&zmm[6], &zmm[7]); + bitonic_merge_four_zmm_32bit(zmm); + bitonic_merge_four_zmm_32bit(zmm + 4); + bitonic_merge_eight_zmm_32bit(zmm); + vtype::storeu(arr, zmm[0]); + vtype::storeu(arr + 16, zmm[1]); + vtype::storeu(arr + 32, zmm[2]); + vtype::storeu(arr + 48, zmm[3]); + vtype::mask_storeu(arr + 64, load_mask1, zmm[4]); + vtype::mask_storeu(arr + 80, load_mask2, zmm[5]); + vtype::mask_storeu(arr + 96, load_mask3, zmm[6]); + 
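    /*
     * Worked example of the tail masking in this function (added annotation):
     * for N == 100, combined_mask = (1ull << (100 - 64)) - 1 = 0xFFFFFFFFF,
     * so load_mask1 = 0xFFFF, load_mask2 = 0xFFFF, load_mask3 = 0x000F and
     * load_mask4 = 0x0000. Lanes that are masked off stay filled with
     * vtype::zmm_max() from the masked loads and are never written back.
     */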
vtype::mask_storeu(arr + 112, load_mask4, zmm[7]); +} + +template +static inline type_t +get_pivot_32bit(type_t *arr, const int64_t left, const int64_t right) +{ + // median of 16 + int64_t size = (right - left) / 16; + using zmm_t = typename vtype::zmm_t; + using ymm_t = typename vtype::ymm_t; + __m512i rand_index1 = _mm512_set_epi64(left + size, + left + 2 * size, + left + 3 * size, + left + 4 * size, + left + 5 * size, + left + 6 * size, + left + 7 * size, + left + 8 * size); + __m512i rand_index2 = _mm512_set_epi64(left + 9 * size, + left + 10 * size, + left + 11 * size, + left + 12 * size, + left + 13 * size, + left + 14 * size, + left + 15 * size, + left + 16 * size); + ymm_t rand_vec1 + = vtype::template i64gather(rand_index1, arr); + ymm_t rand_vec2 + = vtype::template i64gather(rand_index2, arr); + zmm_t rand_vec = vtype::merge(rand_vec1, rand_vec2); + zmm_t sort = sort_zmm_32bit(rand_vec); + // pivot will never be a nan, since there are no nan's! + return ((type_t *)&sort)[8]; +} + +template +static inline void +qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) +{ + /* + * Resort to std::sort if quicksort isnt making any progress + */ + if (max_iters <= 0) { + std::sort(arr + left, arr + right + 1); + return; + } + /* + * Base case: use bitonic networks to sort arrays <= 128 + */ + if (right + 1 - left <= 128) { + sort_128_32bit(arr + left, (int32_t)(right + 1 - left)); + return; + } + + type_t pivot = get_pivot_32bit(arr, left, right); + type_t smallest = vtype::type_max(); + type_t biggest = vtype::type_min(); + int64_t pivot_index = partition_avx512( + arr, left, right + 1, pivot, &smallest, &biggest); + if (pivot != smallest) + qsort_32bit_(arr, left, pivot_index - 1, max_iters - 1); + if (pivot != biggest) + qsort_32bit_(arr, pivot_index, right, max_iters - 1); +} + +static inline int64_t replace_nan_with_inf(float *arr, int64_t arrsize) +{ + int64_t nan_count = 0; + __mmask16 loadmask = 0xFFFF; + while (arrsize > 0) { + if (arrsize < 16) { loadmask = (0x0001 << arrsize) - 0x0001; } + __m512 in_zmm = _mm512_maskz_loadu_ps(loadmask, arr); + __mmask16 nanmask = _mm512_cmp_ps_mask(in_zmm, in_zmm, _CMP_NEQ_UQ); + nan_count += _mm_popcnt_u32((int32_t)nanmask); + _mm512_mask_storeu_ps(arr, nanmask, ZMM_MAX_FLOAT); + arr += 16; + arrsize -= 16; + } + return nan_count; +} + +static inline void +replace_inf_with_nan(float *arr, int64_t arrsize, int64_t nan_count) +{ + for (int64_t ii = arrsize - 1; nan_count > 0; --ii) { + arr[ii] = std::nanf("1"); + nan_count -= 1; + } +} + +template <> +void avx512_qsort(int32_t *arr, int64_t arrsize) +{ + if (arrsize > 1) { + qsort_32bit_, int32_t>( + arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qsort(uint32_t *arr, int64_t arrsize) +{ + if (arrsize > 1) { + qsort_32bit_, uint32_t>( + arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qsort(float *arr, int64_t arrsize) +{ + if (arrsize > 1) { + int64_t nan_count = replace_nan_with_inf(arr, arrsize); + qsort_32bit_, float>( + arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + replace_inf_with_nan(arr, arrsize, nan_count); + } +} + +#endif //__AVX512_QSORT_32BIT__ diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp new file mode 100644 index 000000000..f680c0704 --- /dev/null +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp @@ -0,0 +1,820 @@ 
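/*
 * Annotation (added for illustration, not part of the patch): the float
 * specialization at the end of avx512-32bit-qsort.hpp above handles NaNs by
 * counting them, overwriting them with +infinity, sorting, and then writing
 * NaNs back into the last nan_count slots. A minimal scalar sketch of that
 * idea, with a hypothetical helper name and only the standard library:
 */
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>

static inline void scalar_sort_with_nans_at_end(float *arr, int64_t arrsize)
{
    int64_t nan_count = 0;
    // 1) count NaNs and replace them with +inf so ordinary comparisons work
    for (int64_t i = 0; i < arrsize; ++i) {
        if (std::isnan(arr[i])) {
            arr[i] = std::numeric_limits<float>::infinity();
            ++nan_count;
        }
    }
    // 2) sort; stand-in for the vectorized qsort_32bit_ above
    std::sort(arr, arr + arrsize);
    // 3) the +inf sentinels sorted to the end; turn them back into NaNs
    for (int64_t i = arrsize - 1; nan_count > 0; --i, --nan_count) {
        arr[i] = std::nanf("1");
    }
}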
+/******************************************************************* + * Copyright (C) 2022 Intel Corporation + * SPDX-License-Identifier: BSD-3-Clause + * Authors: Raghuveer Devulapalli + * ****************************************************************/ + +#ifndef __AVX512_QSORT_64BIT__ +#define __AVX512_QSORT_64BIT__ + +#include "avx512-common-qsort.h" + +/* + * Constants used in sorting 8 elements in a ZMM registers. Based on Bitonic + * sorting network (see + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) + */ +// ZMM 7, 6, 5, 4, 3, 2, 1, 0 +#define NETWORK_64BIT_1 4, 5, 6, 7, 0, 1, 2, 3 +#define NETWORK_64BIT_2 0, 1, 2, 3, 4, 5, 6, 7 +#define NETWORK_64BIT_3 5, 4, 7, 6, 1, 0, 3, 2 +#define NETWORK_64BIT_4 3, 2, 1, 0, 7, 6, 5, 4 +static const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); + +template <> +struct vector { + using type_t = int64_t; + using zmm_t = __m512i; + using ymm_t = __m512i; + using opmask_t = __mmask8; + static const uint8_t numlanes = 8; + + static type_t type_max() + { + return X86_SIMD_SORT_MAX_INT64; + } + static type_t type_min() + { + return X86_SIMD_SORT_MIN_INT64; + } + static zmm_t zmm_max() + { + return _mm512_set1_epi64(type_max()); + } // TODO: this should broadcast bits as is? + + static zmm_t set(type_t v1, + type_t v2, + type_t v3, + type_t v4, + type_t v5, + type_t v6, + type_t v7, + type_t v8) + { + return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8); + } + + static opmask_t knot_opmask(opmask_t x) + { + return _knot_mask8(x); + } + static opmask_t ge(zmm_t x, zmm_t y) + { + return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_NLT); + } + template + static zmm_t i64gather(__m512i index, void const *base) + { + return _mm512_i64gather_epi64(index, base, scale); + } + static zmm_t loadu(void const *mem) + { + return _mm512_loadu_si512(mem); + } + static zmm_t max(zmm_t x, zmm_t y) + { + return _mm512_max_epi64(x, y); + } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_compressstoreu_epi64(mem, mask, x); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) + { + return _mm512_mask_loadu_epi64(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) + { + return _mm512_mask_mov_epi64(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_storeu_epi64(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) + { + return _mm512_min_epi64(x, y); + } + static zmm_t permutexvar(__m512i idx, zmm_t zmm) + { + return _mm512_permutexvar_epi64(idx, zmm); + } + static type_t reducemax(zmm_t v) + { + return _mm512_reduce_max_epi64(v); + } + static type_t reducemin(zmm_t v) + { + return _mm512_reduce_min_epi64(v); + } + static zmm_t set1(type_t v) + { + return _mm512_set1_epi64(v); + } + template + static zmm_t shuffle(zmm_t zmm) + { + __m512d temp = _mm512_castsi512_pd(zmm); + return _mm512_castpd_si512( + _mm512_shuffle_pd(temp, temp, (_MM_PERM_ENUM)mask)); + } + static void storeu(void *mem, zmm_t x) + { + return _mm512_storeu_si512(mem, x); + } +}; +template <> +struct vector { + using type_t = uint64_t; + using zmm_t = __m512i; + using ymm_t = __m512i; + using opmask_t = __mmask8; + static const uint8_t numlanes = 8; + + static type_t type_max() + { + return X86_SIMD_SORT_MAX_UINT64; + } + static type_t type_min() + { + return 0; + } + static zmm_t zmm_max() + { + return _mm512_set1_epi64(type_max()); + } + + static zmm_t set(type_t v1, + type_t v2, + type_t v3, + type_t v4, + type_t v5, + type_t 
v6, + type_t v7, + type_t v8) + { + return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8); + } + + template + static zmm_t i64gather(__m512i index, void const *base) + { + return _mm512_i64gather_epi64(index, base, scale); + } + static opmask_t knot_opmask(opmask_t x) + { + return _knot_mask8(x); + } + static opmask_t ge(zmm_t x, zmm_t y) + { + return _mm512_cmp_epu64_mask(x, y, _MM_CMPINT_NLT); + } + static zmm_t loadu(void const *mem) + { + return _mm512_loadu_si512(mem); + } + static zmm_t max(zmm_t x, zmm_t y) + { + return _mm512_max_epu64(x, y); + } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_compressstoreu_epi64(mem, mask, x); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) + { + return _mm512_mask_loadu_epi64(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) + { + return _mm512_mask_mov_epi64(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_storeu_epi64(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) + { + return _mm512_min_epu64(x, y); + } + static zmm_t permutexvar(__m512i idx, zmm_t zmm) + { + return _mm512_permutexvar_epi64(idx, zmm); + } + static type_t reducemax(zmm_t v) + { + return _mm512_reduce_max_epu64(v); + } + static type_t reducemin(zmm_t v) + { + return _mm512_reduce_min_epu64(v); + } + static zmm_t set1(type_t v) + { + return _mm512_set1_epi64(v); + } + template + static zmm_t shuffle(zmm_t zmm) + { + __m512d temp = _mm512_castsi512_pd(zmm); + return _mm512_castpd_si512( + _mm512_shuffle_pd(temp, temp, (_MM_PERM_ENUM)mask)); + } + static void storeu(void *mem, zmm_t x) + { + return _mm512_storeu_si512(mem, x); + } +}; +template <> +struct vector { + using type_t = double; + using zmm_t = __m512d; + using ymm_t = __m512d; + using opmask_t = __mmask8; + static const uint8_t numlanes = 8; + + static type_t type_max() + { + return X86_SIMD_SORT_INFINITY; + } + static type_t type_min() + { + return -X86_SIMD_SORT_INFINITY; + } + static zmm_t zmm_max() + { + return _mm512_set1_pd(type_max()); + } + + static zmm_t set(type_t v1, + type_t v2, + type_t v3, + type_t v4, + type_t v5, + type_t v6, + type_t v7, + type_t v8) + { + return _mm512_set_pd(v1, v2, v3, v4, v5, v6, v7, v8); + } + + static opmask_t knot_opmask(opmask_t x) + { + return _knot_mask8(x); + } + static opmask_t ge(zmm_t x, zmm_t y) + { + return _mm512_cmp_pd_mask(x, y, _CMP_GE_OQ); + } + template + static zmm_t i64gather(__m512i index, void const *base) + { + return _mm512_i64gather_pd(index, base, scale); + } + static zmm_t loadu(void const *mem) + { + return _mm512_loadu_pd(mem); + } + static zmm_t max(zmm_t x, zmm_t y) + { + return _mm512_max_pd(x, y); + } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_compressstoreu_pd(mem, mask, x); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) + { + return _mm512_mask_loadu_pd(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) + { + return _mm512_mask_mov_pd(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_storeu_pd(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) + { + return _mm512_min_pd(x, y); + } + static zmm_t permutexvar(__m512i idx, zmm_t zmm) + { + return _mm512_permutexvar_pd(idx, zmm); + } + static type_t reducemax(zmm_t v) + { + return _mm512_reduce_max_pd(v); + } + static type_t reducemin(zmm_t v) + { + return _mm512_reduce_min_pd(v); + } + 
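    /*
     * Note (added annotation): type_max()/type_min() are +/-infinity for
     * double, so lanes padded with zmm_max() by the masked loads sort to the
     * end of a register, and since avx512_qsort<double>() routes NaNs through
     * replace_nan_with_inf() further down in this file, the ordered
     * _CMP_GE_OQ comparison used in ge() never has to order a NaN.
     */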
static zmm_t set1(type_t v) + { + return _mm512_set1_pd(v); + } + template + static zmm_t shuffle(zmm_t zmm) + { + return _mm512_shuffle_pd(zmm, zmm, (_MM_PERM_ENUM)mask); + } + static void storeu(void *mem, zmm_t x) + { + return _mm512_storeu_pd(mem, x); + } +}; + +/* + * Assumes zmm is random and performs a full sorting network defined in + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg + */ +template +static inline zmm_t sort_zmm_64bit(zmm_t zmm) +{ + zmm = cmp_merge( + zmm, vtype::template shuffle(zmm), 0xAA); + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi64(NETWORK_64BIT_1), zmm), + 0xCC); + zmm = cmp_merge( + zmm, vtype::template shuffle(zmm), 0xAA); + zmm = cmp_merge(zmm, vtype::permutexvar(rev_index, zmm), 0xF0); + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi64(NETWORK_64BIT_3), zmm), + 0xCC); + zmm = cmp_merge( + zmm, vtype::template shuffle(zmm), 0xAA); + return zmm; +} + +// Assumes zmm is bitonic and performs a recursive half cleaner +template +static inline zmm_t bitonic_merge_zmm_64bit(zmm_t zmm) +{ + + // 1) half_cleaner[8]: compare 0-4, 1-5, 2-6, 3-7 + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi64(NETWORK_64BIT_4), zmm), + 0xF0); + // 2) half_cleaner[4] + zmm = cmp_merge( + zmm, + vtype::permutexvar(_mm512_set_epi64(NETWORK_64BIT_3), zmm), + 0xCC); + // 3) half_cleaner[1] + zmm = cmp_merge( + zmm, vtype::template shuffle(zmm), 0xAA); + return zmm; +} + +// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner +template +static inline void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2) +{ + // 1) First step of a merging network: coex of zmm1 and zmm2 reversed + zmm2 = vtype::permutexvar(rev_index, zmm2); + zmm_t zmm3 = vtype::min(zmm1, zmm2); + zmm_t zmm4 = vtype::max(zmm1, zmm2); + // 2) Recursive half cleaner for each + zmm1 = bitonic_merge_zmm_64bit(zmm3); + zmm2 = bitonic_merge_zmm_64bit(zmm4); +} + +// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive +// half cleaner +template +static inline void bitonic_merge_four_zmm_64bit(zmm_t *zmm) +{ + // 1) First step of a merging network + zmm_t zmm2r = vtype::permutexvar(rev_index, zmm[2]); + zmm_t zmm3r = vtype::permutexvar(rev_index, zmm[3]); + zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r); + zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r); + // 2) Recursive half clearer: 16 + zmm_t zmm_t3 = vtype::permutexvar(rev_index, vtype::max(zmm[1], zmm2r)); + zmm_t zmm_t4 = vtype::permutexvar(rev_index, vtype::max(zmm[0], zmm3r)); + zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2); + zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2); + zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4); + zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4); + zmm[0] = bitonic_merge_zmm_64bit(zmm0); + zmm[1] = bitonic_merge_zmm_64bit(zmm1); + zmm[2] = bitonic_merge_zmm_64bit(zmm2); + zmm[3] = bitonic_merge_zmm_64bit(zmm3); +} + +template +static inline void bitonic_merge_eight_zmm_64bit(zmm_t *zmm) +{ + zmm_t zmm4r = vtype::permutexvar(rev_index, zmm[4]); + zmm_t zmm5r = vtype::permutexvar(rev_index, zmm[5]); + zmm_t zmm6r = vtype::permutexvar(rev_index, zmm[6]); + zmm_t zmm7r = vtype::permutexvar(rev_index, zmm[7]); + zmm_t zmm_t1 = vtype::min(zmm[0], zmm7r); + zmm_t zmm_t2 = vtype::min(zmm[1], zmm6r); + zmm_t zmm_t3 = vtype::min(zmm[2], zmm5r); + zmm_t zmm_t4 = vtype::min(zmm[3], zmm4r); + zmm_t zmm_t5 = vtype::permutexvar(rev_index, vtype::max(zmm[3], zmm4r)); + zmm_t zmm_t6 = vtype::permutexvar(rev_index, vtype::max(zmm[2], zmm5r)); + zmm_t zmm_t7 = vtype::permutexvar(rev_index, 
vtype::max(zmm[1], zmm6r)); + zmm_t zmm_t8 = vtype::permutexvar(rev_index, vtype::max(zmm[0], zmm7r)); + COEX(zmm_t1, zmm_t3); + COEX(zmm_t2, zmm_t4); + COEX(zmm_t5, zmm_t7); + COEX(zmm_t6, zmm_t8); + COEX(zmm_t1, zmm_t2); + COEX(zmm_t3, zmm_t4); + COEX(zmm_t5, zmm_t6); + COEX(zmm_t7, zmm_t8); + zmm[0] = bitonic_merge_zmm_64bit(zmm_t1); + zmm[1] = bitonic_merge_zmm_64bit(zmm_t2); + zmm[2] = bitonic_merge_zmm_64bit(zmm_t3); + zmm[3] = bitonic_merge_zmm_64bit(zmm_t4); + zmm[4] = bitonic_merge_zmm_64bit(zmm_t5); + zmm[5] = bitonic_merge_zmm_64bit(zmm_t6); + zmm[6] = bitonic_merge_zmm_64bit(zmm_t7); + zmm[7] = bitonic_merge_zmm_64bit(zmm_t8); +} + +template +static inline void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm) +{ + zmm_t zmm8r = vtype::permutexvar(rev_index, zmm[8]); + zmm_t zmm9r = vtype::permutexvar(rev_index, zmm[9]); + zmm_t zmm10r = vtype::permutexvar(rev_index, zmm[10]); + zmm_t zmm11r = vtype::permutexvar(rev_index, zmm[11]); + zmm_t zmm12r = vtype::permutexvar(rev_index, zmm[12]); + zmm_t zmm13r = vtype::permutexvar(rev_index, zmm[13]); + zmm_t zmm14r = vtype::permutexvar(rev_index, zmm[14]); + zmm_t zmm15r = vtype::permutexvar(rev_index, zmm[15]); + zmm_t zmm_t1 = vtype::min(zmm[0], zmm15r); + zmm_t zmm_t2 = vtype::min(zmm[1], zmm14r); + zmm_t zmm_t3 = vtype::min(zmm[2], zmm13r); + zmm_t zmm_t4 = vtype::min(zmm[3], zmm12r); + zmm_t zmm_t5 = vtype::min(zmm[4], zmm11r); + zmm_t zmm_t6 = vtype::min(zmm[5], zmm10r); + zmm_t zmm_t7 = vtype::min(zmm[6], zmm9r); + zmm_t zmm_t8 = vtype::min(zmm[7], zmm8r); + zmm_t zmm_t9 = vtype::permutexvar(rev_index, vtype::max(zmm[7], zmm8r)); + zmm_t zmm_t10 = vtype::permutexvar(rev_index, vtype::max(zmm[6], zmm9r)); + zmm_t zmm_t11 = vtype::permutexvar(rev_index, vtype::max(zmm[5], zmm10r)); + zmm_t zmm_t12 = vtype::permutexvar(rev_index, vtype::max(zmm[4], zmm11r)); + zmm_t zmm_t13 = vtype::permutexvar(rev_index, vtype::max(zmm[3], zmm12r)); + zmm_t zmm_t14 = vtype::permutexvar(rev_index, vtype::max(zmm[2], zmm13r)); + zmm_t zmm_t15 = vtype::permutexvar(rev_index, vtype::max(zmm[1], zmm14r)); + zmm_t zmm_t16 = vtype::permutexvar(rev_index, vtype::max(zmm[0], zmm15r)); + // Recusive half clear 16 zmm regs + COEX(zmm_t1, zmm_t5); + COEX(zmm_t2, zmm_t6); + COEX(zmm_t3, zmm_t7); + COEX(zmm_t4, zmm_t8); + COEX(zmm_t9, zmm_t13); + COEX(zmm_t10, zmm_t14); + COEX(zmm_t11, zmm_t15); + COEX(zmm_t12, zmm_t16); + // + COEX(zmm_t1, zmm_t3); + COEX(zmm_t2, zmm_t4); + COEX(zmm_t5, zmm_t7); + COEX(zmm_t6, zmm_t8); + COEX(zmm_t9, zmm_t11); + COEX(zmm_t10, zmm_t12); + COEX(zmm_t13, zmm_t15); + COEX(zmm_t14, zmm_t16); + // + COEX(zmm_t1, zmm_t2); + COEX(zmm_t3, zmm_t4); + COEX(zmm_t5, zmm_t6); + COEX(zmm_t7, zmm_t8); + COEX(zmm_t9, zmm_t10); + COEX(zmm_t11, zmm_t12); + COEX(zmm_t13, zmm_t14); + COEX(zmm_t15, zmm_t16); + // + zmm[0] = bitonic_merge_zmm_64bit(zmm_t1); + zmm[1] = bitonic_merge_zmm_64bit(zmm_t2); + zmm[2] = bitonic_merge_zmm_64bit(zmm_t3); + zmm[3] = bitonic_merge_zmm_64bit(zmm_t4); + zmm[4] = bitonic_merge_zmm_64bit(zmm_t5); + zmm[5] = bitonic_merge_zmm_64bit(zmm_t6); + zmm[6] = bitonic_merge_zmm_64bit(zmm_t7); + zmm[7] = bitonic_merge_zmm_64bit(zmm_t8); + zmm[8] = bitonic_merge_zmm_64bit(zmm_t9); + zmm[9] = bitonic_merge_zmm_64bit(zmm_t10); + zmm[10] = bitonic_merge_zmm_64bit(zmm_t11); + zmm[11] = bitonic_merge_zmm_64bit(zmm_t12); + zmm[12] = bitonic_merge_zmm_64bit(zmm_t13); + zmm[13] = bitonic_merge_zmm_64bit(zmm_t14); + zmm[14] = bitonic_merge_zmm_64bit(zmm_t15); + zmm[15] = bitonic_merge_zmm_64bit(zmm_t16); +} + +template +static inline void 
sort_8_64bit(type_t *arr, int32_t N) +{ + typename vtype::opmask_t load_mask = (0x01 << N) - 0x01; + typename vtype::zmm_t zmm + = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr); + vtype::mask_storeu(arr, load_mask, sort_zmm_64bit(zmm)); +} + +template +static inline void sort_16_64bit(type_t *arr, int32_t N) +{ + if (N <= 8) { + sort_8_64bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + zmm_t zmm1 = vtype::loadu(arr); + typename vtype::opmask_t load_mask = (0x01 << (N - 8)) - 0x01; + zmm_t zmm2 = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr + 8); + zmm1 = sort_zmm_64bit(zmm1); + zmm2 = sort_zmm_64bit(zmm2); + bitonic_merge_two_zmm_64bit(zmm1, zmm2); + vtype::storeu(arr, zmm1); + vtype::mask_storeu(arr + 8, load_mask, zmm2); +} + +template +static inline void sort_32_64bit(type_t *arr, int32_t N) +{ + if (N <= 16) { + sort_16_64bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + using opmask_t = typename vtype::opmask_t; + zmm_t zmm[4]; + zmm[0] = vtype::loadu(arr); + zmm[1] = vtype::loadu(arr + 8); + opmask_t load_mask1 = 0xFF, load_mask2 = 0xFF; + uint64_t combined_mask = (0x1ull << (N - 16)) - 0x1ull; + load_mask1 = (combined_mask)&0xFF; + load_mask2 = (combined_mask >> 8) & 0xFF; + zmm[2] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 16); + zmm[3] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 24); + zmm[0] = sort_zmm_64bit(zmm[0]); + zmm[1] = sort_zmm_64bit(zmm[1]); + zmm[2] = sort_zmm_64bit(zmm[2]); + zmm[3] = sort_zmm_64bit(zmm[3]); + bitonic_merge_two_zmm_64bit(zmm[0], zmm[1]); + bitonic_merge_two_zmm_64bit(zmm[2], zmm[3]); + bitonic_merge_four_zmm_64bit(zmm); + vtype::storeu(arr, zmm[0]); + vtype::storeu(arr + 8, zmm[1]); + vtype::mask_storeu(arr + 16, load_mask1, zmm[2]); + vtype::mask_storeu(arr + 24, load_mask2, zmm[3]); +} + +template +static inline void sort_64_64bit(type_t *arr, int32_t N) +{ + if (N <= 32) { + sort_32_64bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + using opmask_t = typename vtype::opmask_t; + zmm_t zmm[8]; + zmm[0] = vtype::loadu(arr); + zmm[1] = vtype::loadu(arr + 8); + zmm[2] = vtype::loadu(arr + 16); + zmm[3] = vtype::loadu(arr + 24); + zmm[0] = sort_zmm_64bit(zmm[0]); + zmm[1] = sort_zmm_64bit(zmm[1]); + zmm[2] = sort_zmm_64bit(zmm[2]); + zmm[3] = sort_zmm_64bit(zmm[3]); + opmask_t load_mask1 = 0xFF, load_mask2 = 0xFF; + opmask_t load_mask3 = 0xFF, load_mask4 = 0xFF; + // N-32 >= 1 + uint64_t combined_mask = (0x1ull << (N - 32)) - 0x1ull; + load_mask1 = (combined_mask)&0xFF; + load_mask2 = (combined_mask >> 8) & 0xFF; + load_mask3 = (combined_mask >> 16) & 0xFF; + load_mask4 = (combined_mask >> 24) & 0xFF; + zmm[4] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 32); + zmm[5] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 40); + zmm[6] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 48); + zmm[7] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 56); + zmm[4] = sort_zmm_64bit(zmm[4]); + zmm[5] = sort_zmm_64bit(zmm[5]); + zmm[6] = sort_zmm_64bit(zmm[6]); + zmm[7] = sort_zmm_64bit(zmm[7]); + bitonic_merge_two_zmm_64bit(zmm[0], zmm[1]); + bitonic_merge_two_zmm_64bit(zmm[2], zmm[3]); + bitonic_merge_two_zmm_64bit(zmm[4], zmm[5]); + bitonic_merge_two_zmm_64bit(zmm[6], zmm[7]); + bitonic_merge_four_zmm_64bit(zmm); + bitonic_merge_four_zmm_64bit(zmm + 4); + bitonic_merge_eight_zmm_64bit(zmm); + vtype::storeu(arr, zmm[0]); + vtype::storeu(arr + 8, zmm[1]); + vtype::storeu(arr + 16, zmm[2]); + vtype::storeu(arr + 24, zmm[3]); + 
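    /*
     * Tail-mask example (added annotation): with N == 50, combined_mask =
     * (1ull << (50 - 32)) - 1 = 0x3FFFF, giving load_mask1 = 0xFF,
     * load_mask2 = 0xFF, load_mask3 = 0x03 and load_mask4 = 0x00, so the
     * masked stores below write only elements arr[32..49] and leave anything
     * past N untouched.
     */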
vtype::mask_storeu(arr + 32, load_mask1, zmm[4]); + vtype::mask_storeu(arr + 40, load_mask2, zmm[5]); + vtype::mask_storeu(arr + 48, load_mask3, zmm[6]); + vtype::mask_storeu(arr + 56, load_mask4, zmm[7]); +} + +template +static inline void sort_128_64bit(type_t *arr, int32_t N) +{ + if (N <= 64) { + sort_64_64bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + using opmask_t = typename vtype::opmask_t; + zmm_t zmm[16]; + zmm[0] = vtype::loadu(arr); + zmm[1] = vtype::loadu(arr + 8); + zmm[2] = vtype::loadu(arr + 16); + zmm[3] = vtype::loadu(arr + 24); + zmm[4] = vtype::loadu(arr + 32); + zmm[5] = vtype::loadu(arr + 40); + zmm[6] = vtype::loadu(arr + 48); + zmm[7] = vtype::loadu(arr + 56); + zmm[0] = sort_zmm_64bit(zmm[0]); + zmm[1] = sort_zmm_64bit(zmm[1]); + zmm[2] = sort_zmm_64bit(zmm[2]); + zmm[3] = sort_zmm_64bit(zmm[3]); + zmm[4] = sort_zmm_64bit(zmm[4]); + zmm[5] = sort_zmm_64bit(zmm[5]); + zmm[6] = sort_zmm_64bit(zmm[6]); + zmm[7] = sort_zmm_64bit(zmm[7]); + opmask_t load_mask1 = 0xFF, load_mask2 = 0xFF; + opmask_t load_mask3 = 0xFF, load_mask4 = 0xFF; + opmask_t load_mask5 = 0xFF, load_mask6 = 0xFF; + opmask_t load_mask7 = 0xFF, load_mask8 = 0xFF; + if (N != 128) { + uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull; + load_mask1 = (combined_mask)&0xFF; + load_mask2 = (combined_mask >> 8) & 0xFF; + load_mask3 = (combined_mask >> 16) & 0xFF; + load_mask4 = (combined_mask >> 24) & 0xFF; + load_mask5 = (combined_mask >> 32) & 0xFF; + load_mask6 = (combined_mask >> 40) & 0xFF; + load_mask7 = (combined_mask >> 48) & 0xFF; + load_mask8 = (combined_mask >> 56) & 0xFF; + } + zmm[8] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 64); + zmm[9] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 72); + zmm[10] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 80); + zmm[11] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 88); + zmm[12] = vtype::mask_loadu(vtype::zmm_max(), load_mask5, arr + 96); + zmm[13] = vtype::mask_loadu(vtype::zmm_max(), load_mask6, arr + 104); + zmm[14] = vtype::mask_loadu(vtype::zmm_max(), load_mask7, arr + 112); + zmm[15] = vtype::mask_loadu(vtype::zmm_max(), load_mask8, arr + 120); + zmm[8] = sort_zmm_64bit(zmm[8]); + zmm[9] = sort_zmm_64bit(zmm[9]); + zmm[10] = sort_zmm_64bit(zmm[10]); + zmm[11] = sort_zmm_64bit(zmm[11]); + zmm[12] = sort_zmm_64bit(zmm[12]); + zmm[13] = sort_zmm_64bit(zmm[13]); + zmm[14] = sort_zmm_64bit(zmm[14]); + zmm[15] = sort_zmm_64bit(zmm[15]); + bitonic_merge_two_zmm_64bit(zmm[0], zmm[1]); + bitonic_merge_two_zmm_64bit(zmm[2], zmm[3]); + bitonic_merge_two_zmm_64bit(zmm[4], zmm[5]); + bitonic_merge_two_zmm_64bit(zmm[6], zmm[7]); + bitonic_merge_two_zmm_64bit(zmm[8], zmm[9]); + bitonic_merge_two_zmm_64bit(zmm[10], zmm[11]); + bitonic_merge_two_zmm_64bit(zmm[12], zmm[13]); + bitonic_merge_two_zmm_64bit(zmm[14], zmm[15]); + bitonic_merge_four_zmm_64bit(zmm); + bitonic_merge_four_zmm_64bit(zmm + 4); + bitonic_merge_four_zmm_64bit(zmm + 8); + bitonic_merge_four_zmm_64bit(zmm + 12); + bitonic_merge_eight_zmm_64bit(zmm); + bitonic_merge_eight_zmm_64bit(zmm + 8); + bitonic_merge_sixteen_zmm_64bit(zmm); + vtype::storeu(arr, zmm[0]); + vtype::storeu(arr + 8, zmm[1]); + vtype::storeu(arr + 16, zmm[2]); + vtype::storeu(arr + 24, zmm[3]); + vtype::storeu(arr + 32, zmm[4]); + vtype::storeu(arr + 40, zmm[5]); + vtype::storeu(arr + 48, zmm[6]); + vtype::storeu(arr + 56, zmm[7]); + vtype::mask_storeu(arr + 64, load_mask1, zmm[8]); + vtype::mask_storeu(arr + 72, load_mask2, zmm[9]); + vtype::mask_storeu(arr + 80, 
load_mask3, zmm[10]); + vtype::mask_storeu(arr + 88, load_mask4, zmm[11]); + vtype::mask_storeu(arr + 96, load_mask5, zmm[12]); + vtype::mask_storeu(arr + 104, load_mask6, zmm[13]); + vtype::mask_storeu(arr + 112, load_mask7, zmm[14]); + vtype::mask_storeu(arr + 120, load_mask8, zmm[15]); +} + +template +static inline type_t +get_pivot_64bit(type_t *arr, const int64_t left, const int64_t right) +{ + // median of 8 + int64_t size = (right - left) / 8; + using zmm_t = typename vtype::zmm_t; + __m512i rand_index = _mm512_set_epi64(left + size, + left + 2 * size, + left + 3 * size, + left + 4 * size, + left + 5 * size, + left + 6 * size, + left + 7 * size, + left + 8 * size); + zmm_t rand_vec = vtype::template i64gather(rand_index, arr); + // pivot will never be a nan, since there are no nan's! + zmm_t sort = sort_zmm_64bit(rand_vec); + return ((type_t *)&sort)[4]; +} + +template +static inline void +qsort_64bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) +{ + /* + * Resort to std::sort if quicksort isnt making any progress + */ + if (max_iters <= 0) { + std::sort(arr + left, arr + right + 1); + return; + } + /* + * Base case: use bitonic networks to sort arrays <= 128 + */ + if (right + 1 - left <= 128) { + sort_128_64bit(arr + left, (int32_t)(right + 1 - left)); + return; + } + + type_t pivot = get_pivot_64bit(arr, left, right); + type_t smallest = vtype::type_max(); + type_t biggest = vtype::type_min(); + int64_t pivot_index = partition_avx512( + arr, left, right + 1, pivot, &smallest, &biggest); + if (pivot != smallest) + qsort_64bit_(arr, left, pivot_index - 1, max_iters - 1); + if (pivot != biggest) + qsort_64bit_(arr, pivot_index, right, max_iters - 1); +} + +static inline int64_t replace_nan_with_inf(double *arr, int64_t arrsize) +{ + int64_t nan_count = 0; + __mmask8 loadmask = 0xFF; + while (arrsize > 0) { + if (arrsize < 8) { loadmask = (0x01 << arrsize) - 0x01; } + __m512d in_zmm = _mm512_maskz_loadu_pd(loadmask, arr); + __mmask8 nanmask = _mm512_cmp_pd_mask(in_zmm, in_zmm, _CMP_NEQ_UQ); + nan_count += _mm_popcnt_u32((int32_t)nanmask); + _mm512_mask_storeu_pd(arr, nanmask, ZMM_MAX_DOUBLE); + arr += 8; + arrsize -= 8; + } + return nan_count; +} + +static inline void +replace_inf_with_nan(double *arr, int64_t arrsize, int64_t nan_count) +{ + for (int64_t ii = arrsize - 1; nan_count > 0; --ii) { + arr[ii] = std::nan("1"); + nan_count -= 1; + } +} + +template <> +void avx512_qsort(int64_t *arr, int64_t arrsize) +{ + if (arrsize > 1) { + qsort_64bit_, int64_t>( + arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qsort(uint64_t *arr, int64_t arrsize) +{ + if (arrsize > 1) { + qsort_64bit_, uint64_t>( + arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qsort(double *arr, int64_t arrsize) +{ + if (arrsize > 1) { + int64_t nan_count = replace_nan_with_inf(arr, arrsize); + qsort_64bit_, double>( + arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + replace_inf_with_nan(arr, arrsize, nan_count); + } +} +#endif // __AVX512_QSORT_64BIT__ diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h b/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h new file mode 100644 index 000000000..e713e1f20 --- /dev/null +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h @@ -0,0 +1,218 @@ +/******************************************************************* + * Copyright (C) 2022 Intel Corporation + * Copyright (C) 2021 Serge Sans Paille + * SPDX-License-Identifier: 
BSD-3-Clause
+ * Authors: Raghuveer Devulapalli
+ *          Serge Sans Paille
+ * ****************************************************************/
+
+#ifndef __AVX512_QSORT_COMMON__
+#define __AVX512_QSORT_COMMON__
+
+/*
+ * Quicksort using AVX-512. The ideas and code are based on these two research
+ * papers [1] and [2]. On a high level, the idea is to vectorize quicksort
+ * partitioning using AVX-512 compressstore instructions. If the array size is
+ * < 128, then use a Bitonic sorting network implemented on 512-bit registers.
+ * The precise network definitions depend on the dtype and are defined in
+ * separate files: avx512-16bit-qsort.hpp, avx512-32bit-qsort.hpp and
+ * avx512-64bit-qsort.hpp. Article [4] is a good resource on bitonic sorting
+ * networks. The core implementations of the vectorized qsort functions
+ * avx512_qsort(T*, int64_t) are modified versions of the AVX2 quicksort
+ * presented in the paper [2] and the source code associated with that paper [3].
+ *
+ * [1] Fast and Robust Vectorized In-Place Sorting of Primitive Types
+ *     https://drops.dagstuhl.de/opus/volltexte/2021/13775/
+ *
+ * [2] A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel
+ *     Skylake https://arxiv.org/pdf/1704.08579.pdf
+ *
+ * [3] https://github.com/simd-sorting/fast-and-robust: SPDX-License-Identifier: MIT
+ *
+ * [4] http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#define X86_SIMD_SORT_INFINITY std::numeric_limits<double>::infinity()
+#define X86_SIMD_SORT_INFINITYF std::numeric_limits<float>::infinity()
+#define X86_SIMD_SORT_MAX_UINT16 std::numeric_limits<uint16_t>::max()
+#define X86_SIMD_SORT_MAX_INT16 std::numeric_limits<int16_t>::max()
+#define X86_SIMD_SORT_MIN_INT16 std::numeric_limits<int16_t>::min()
+#define X86_SIMD_SORT_MAX_UINT32 std::numeric_limits<uint32_t>::max()
+#define X86_SIMD_SORT_MAX_INT32 std::numeric_limits<int32_t>::max()
+#define X86_SIMD_SORT_MIN_INT32 std::numeric_limits<int32_t>::min()
+#define X86_SIMD_SORT_MAX_UINT64 std::numeric_limits<uint64_t>::max()
+#define X86_SIMD_SORT_MAX_INT64 std::numeric_limits<int64_t>::max()
+#define X86_SIMD_SORT_MIN_INT64 std::numeric_limits<int64_t>::min()
+#define ZMM_MAX_DOUBLE _mm512_set1_pd(X86_SIMD_SORT_INFINITY)
+#define ZMM_MAX_UINT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_UINT64)
+#define ZMM_MAX_INT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_INT64)
+#define ZMM_MAX_FLOAT _mm512_set1_ps(X86_SIMD_SORT_INFINITYF)
+#define ZMM_MAX_UINT _mm512_set1_epi32(X86_SIMD_SORT_MAX_UINT32)
+#define ZMM_MAX_INT _mm512_set1_epi32(X86_SIMD_SORT_MAX_INT32)
+#define ZMM_MAX_UINT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_UINT16)
+#define ZMM_MAX_INT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_INT16)
+#define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d
+
+template
+struct vector;
+
+template
+void avx512_qsort(T *arr, int64_t arrsize);
+
+/*
+ * COEX == Compare and Exchange two registers by swapping min and max values
+ */
+template
+static void COEX(mm_t &a, mm_t &b)
+{
+    mm_t temp = a;
+    a = vtype::min(a, b);
+    b = vtype::max(temp, b);
+}
+
+template
+static inline zmm_t cmp_merge(zmm_t in1, zmm_t in2, opmask_t mask)
+{
+    zmm_t min = vtype::min(in2, in1);
+    zmm_t max = vtype::max(in2, in1);
+    return vtype::mask_mov(min, mask, max); // 0 -> min, 1 -> max
+}
+
+/*
+ * Partition one ZMM register based on the pivot and returns the index of the
+ * last element that is less than or equal to the pivot.
+ */
+template
+static inline int32_t partition_vec(type_t *arr,
+                                    int64_t left,
+                                    int64_t right,
+                                    const zmm_t curr_vec,
+                                    const zmm_t pivot_vec,
+                                    zmm_t *smallest_vec,
+                                    zmm_t *biggest_vec)
+{
+    /* which elements are larger than the pivot */
+    typename vtype::opmask_t gt_mask = vtype::ge(curr_vec, pivot_vec);
+    int32_t amount_gt_pivot = _mm_popcnt_u32((int32_t)gt_mask);
+    vtype::mask_compressstoreu(
+            arr + left, vtype::knot_opmask(gt_mask), curr_vec);
+    vtype::mask_compressstoreu(
+            arr + right - amount_gt_pivot, gt_mask, curr_vec);
+    *smallest_vec = vtype::min(curr_vec, *smallest_vec);
+    *biggest_vec = vtype::max(curr_vec, *biggest_vec);
+    return amount_gt_pivot;
+}
+
+/*
+ * Partition an array based on the pivot and returns the index of the
+ * last element that is less than or equal to the pivot.
+ */
+template
+static inline int64_t partition_avx512(type_t *arr,
+                                       int64_t left,
+                                       int64_t right,
+                                       type_t pivot,
+                                       type_t *smallest,
+                                       type_t *biggest)
+{
+    /* make array length divisible by vtype::numlanes, shortening the array */
+    for (int32_t i = (right - left) % vtype::numlanes; i > 0; --i) {
+        *smallest = std::min(*smallest, arr[left]);
+        *biggest = std::max(*biggest, arr[left]);
+        if (arr[left] > pivot) { std::swap(arr[left], arr[--right]); }
+        else {
+            ++left;
+        }
+    }
+
+    if (left == right)
+        return left; /* less than vtype::numlanes elements in the array */
+
+    using zmm_t = typename vtype::zmm_t;
+    zmm_t pivot_vec = vtype::set1(pivot);
+    zmm_t min_vec = vtype::set1(*smallest);
+    zmm_t max_vec = vtype::set1(*biggest);
+
+    if (right - left == vtype::numlanes) {
+        zmm_t vec = vtype::loadu(arr + left);
+        int32_t amount_gt_pivot = partition_vec(arr,
+                                                left,
+                                                left + vtype::numlanes,
+                                                vec,
+                                                pivot_vec,
+                                                &min_vec,
+                                                &max_vec);
+        *smallest = vtype::reducemin(min_vec);
+        *biggest = vtype::reducemax(max_vec);
+        return left + (vtype::numlanes - amount_gt_pivot);
+    }
+
+    // first and last vtype::numlanes values are partitioned at the end
+    zmm_t vec_left = vtype::loadu(arr + left);
+    zmm_t vec_right = vtype::loadu(arr + (right - vtype::numlanes));
+    // store positions of the vectors
+    int64_t r_store = right - vtype::numlanes;
+    int64_t l_store = left;
+    // indices for loading the elements
+    left += vtype::numlanes;
+    right -= vtype::numlanes;
+    while (right - left != 0) {
+        zmm_t curr_vec;
+        /*
+         * if fewer elements are stored on the right side of the array,
+         * then next elements are loaded from the right side,
+         * otherwise from the left side
+         */
+        if ((r_store + vtype::numlanes) - right < left - l_store) {
+            right -= vtype::numlanes;
+            curr_vec = vtype::loadu(arr + right);
+        }
+        else {
+            curr_vec = vtype::loadu(arr + left);
+            left += vtype::numlanes;
+        }
+        // partition the current vector and save it on both sides of the array
+        int32_t amount_gt_pivot
+                = partition_vec(arr,
+                                l_store,
+                                r_store + vtype::numlanes,
+                                curr_vec,
+                                pivot_vec,
+                                &min_vec,
+                                &max_vec);
+        ;
+        r_store -= amount_gt_pivot;
+        l_store += (vtype::numlanes - amount_gt_pivot);
+    }
+
+    /* partition and save vec_left and vec_right */
+    int32_t amount_gt_pivot = partition_vec(arr,
+                                            l_store,
+                                            r_store + vtype::numlanes,
+                                            vec_left,
+                                            pivot_vec,
+                                            &min_vec,
+                                            &max_vec);
+    l_store += (vtype::numlanes - amount_gt_pivot);
+    amount_gt_pivot = partition_vec(arr,
+                                    l_store,
+                                    l_store + vtype::numlanes,
+                                    vec_right,
+                                    pivot_vec,
+                                    &min_vec,
+                                    &max_vec);
+    l_store += (vtype::numlanes - amount_gt_pivot);
+    *smallest = vtype::reducemin(min_vec);
+    *biggest = vtype::reducemax(max_vec);
+    return l_store;
+}
+#endif //
__AVX512_QSORT_COMMON__ -- cgit v1.2.1 From ae978b8a2bc4e7b219d796519f9327feb08fe4e7 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 27 Sep 2022 21:58:29 -0700 Subject: ENH: Add AVX-512 based 64-bit dtype sort --- numpy/core/setup.py | 3 +- numpy/core/src/npysort/quicksort.cpp | 46 +- numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp | 54 ++ numpy/core/src/npysort/x86-qsort-skx.h | 37 + numpy/core/src/npysort/x86-qsort.dispatch.cpp | 835 ---------------------- numpy/core/src/npysort/x86-qsort.h | 28 - 6 files changed, 137 insertions(+), 866 deletions(-) create mode 100644 numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp create mode 100644 numpy/core/src/npysort/x86-qsort-skx.h delete mode 100644 numpy/core/src/npysort/x86-qsort.dispatch.cpp delete mode 100644 numpy/core/src/npysort/x86-qsort.h diff --git a/numpy/core/setup.py b/numpy/core/setup.py index e509b9d11..912867709 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -650,6 +650,7 @@ def configuration(parent_package='',top_path=None): config.add_include_dirs(join('src', 'multiarray')) config.add_include_dirs(join('src', 'umath')) config.add_include_dirs(join('src', 'npysort')) + config.add_include_dirs(join('src', 'npysort', 'x86-simd-sort', 'src')) config.add_include_dirs(join('src', '_simd')) config.add_define_macros([("NPY_INTERNAL_BUILD", "1")]) # this macro indicates that Numpy build is in process @@ -942,7 +943,7 @@ def configuration(parent_package='',top_path=None): join('src', 'multiarray', 'usertypes.c'), join('src', 'multiarray', 'vdot.c'), join('src', 'common', 'npy_sort.h.src'), - join('src', 'npysort', 'x86-qsort.dispatch.cpp'), + join('src', 'npysort', 'x86-qsort-skx.dispatch.cpp'), join('src', 'npysort', 'quicksort.cpp'), join('src', 'npysort', 'mergesort.cpp'), join('src', 'npysort', 'timsort.cpp'), diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp index 3e351dd84..06ac0c172 100644 --- a/numpy/core/src/npysort/quicksort.cpp +++ b/numpy/core/src/npysort/quicksort.cpp @@ -55,12 +55,12 @@ #include "npysort_heapsort.h" #include "numpy_tag.h" -#include "x86-qsort.h" +#include "x86-qsort-skx.h" #include #include #ifndef NPY_DISABLE_OPTIMIZATION -#include "x86-qsort.dispatch.h" +#include "x86-qsort-skx.dispatch.h" #endif #define NOT_USED NPY_UNUSED(unused) @@ -86,6 +86,48 @@ struct x86_dispatch { static bool quicksort(typename Tag::type *, npy_intp) { return false; } }; +template <> +struct x86_dispatch { + static bool quicksort(npy_long *start, npy_intp num) + { + void (*dispfunc)(void *, npy_intp) = nullptr; + NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_long); + if (dispfunc) { + (*dispfunc)(start, num); + return true; + } + return false; + } +}; + +template <> +struct x86_dispatch { + static bool quicksort(npy_ulong *start, npy_intp num) + { + void (*dispfunc)(void *, npy_intp) = nullptr; + NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_ulong); + if (dispfunc) { + (*dispfunc)(start, num); + return true; + } + return false; + } +}; + +template <> +struct x86_dispatch { + static bool quicksort(npy_double *start, npy_intp num) + { + void (*dispfunc)(void *, npy_intp) = nullptr; + NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_double); + if (dispfunc) { + (*dispfunc)(start, num); + return true; + } + return false; + } +}; + template <> struct x86_dispatch { static bool quicksort(npy_int *start, npy_intp num) diff --git a/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp b/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp new file mode 100644 index 
000000000..d26b8fc9f --- /dev/null +++ b/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp @@ -0,0 +1,54 @@ +/*@targets + * $maxopt $keep_baseline avx512_skx + */ +// policy $keep_baseline is used to avoid skip building avx512_skx +// when its part of baseline features (--cpu-baseline), since +// 'baseline' option isn't specified within targets. + +#include "x86-qsort-skx.h" +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#ifdef NPY_HAVE_AVX512_SKX +#include "avx512-32bit-qsort.hpp" +#include "avx512-64bit-qsort.hpp" + +/*************************************** + * C > C++ dispatch + ***************************************/ +NPY_NO_EXPORT void +NPY_CPU_DISPATCH_CURFX(x86_quicksort_long)(void *arr, npy_intp arrsize) +{ + avx512_qsort((npy_long*)arr, arrsize); +} + +NPY_NO_EXPORT void +NPY_CPU_DISPATCH_CURFX(x86_quicksort_ulong)(void *arr, npy_intp arrsize) +{ + avx512_qsort((npy_ulong*)arr, arrsize); +} + +NPY_NO_EXPORT void +NPY_CPU_DISPATCH_CURFX(x86_quicksort_double)(void *arr, npy_intp arrsize) +{ + avx512_qsort((npy_double*)arr, arrsize); +} + +NPY_NO_EXPORT void +NPY_CPU_DISPATCH_CURFX(x86_quicksort_int)(void *arr, npy_intp arrsize) +{ + avx512_qsort((npy_int*)arr, arrsize); +} + +NPY_NO_EXPORT void +NPY_CPU_DISPATCH_CURFX(x86_quicksort_uint)(void *arr, npy_intp arrsize) +{ + avx512_qsort((npy_uint*)arr, arrsize); +} + +NPY_NO_EXPORT void +NPY_CPU_DISPATCH_CURFX(x86_quicksort_float)(void *arr, npy_intp arrsize) +{ + avx512_qsort((npy_float*)arr, arrsize); +} + +#endif // NPY_HAVE_AVX512_SKX diff --git a/numpy/core/src/npysort/x86-qsort-skx.h b/numpy/core/src/npysort/x86-qsort-skx.h new file mode 100644 index 000000000..9a5cb2c9d --- /dev/null +++ b/numpy/core/src/npysort/x86-qsort-skx.h @@ -0,0 +1,37 @@ +#include "numpy/npy_common.h" + +#include "npy_cpu_dispatch.h" + +#ifndef NPY_NO_EXPORT +#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN +#endif + +#ifndef NPY_DISABLE_OPTIMIZATION +#include "x86-qsort-skx.dispatch.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_long, + (void *start, npy_intp num)) + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_ulong, + (void *start, npy_intp num)) + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_double, + (void *start, npy_intp num)) + + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_int, + (void *start, npy_intp num)) + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_uint, + (void *start, npy_intp num)) + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_float, + (void *start, npy_intp num)) + +#ifdef __cplusplus +} +#endif diff --git a/numpy/core/src/npysort/x86-qsort.dispatch.cpp b/numpy/core/src/npysort/x86-qsort.dispatch.cpp deleted file mode 100644 index 8e88cc667..000000000 --- a/numpy/core/src/npysort/x86-qsort.dispatch.cpp +++ /dev/null @@ -1,835 +0,0 @@ -/*@targets - * $maxopt $keep_baseline avx512_skx - */ -// policy $keep_baseline is used to avoid skip building avx512_skx -// when its part of baseline features (--cpu-baseline), since -// 'baseline' option isn't specified within targets. - -#include "x86-qsort.h" -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#ifdef NPY_HAVE_AVX512_SKX -#include "numpy/npy_math.h" - -#include "npy_sort.h" -#include "numpy_tag.h" - -#include "simd/simd.h" -#include - -template -NPY_NO_EXPORT int -heapsort_(type *start, npy_intp n); - -/* - * Quicksort using AVX-512 for int, uint32 and float. 
The ideas and code are - * based on these two research papers: - * (1) Fast and Robust Vectorized In-Place Sorting of Primitive Types - * https://drops.dagstuhl.de/opus/volltexte/2021/13775/ - * (2) A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel - * Skylake https://arxiv.org/pdf/1704.08579.pdf - * - * High level idea: Vectorize the quicksort partitioning using AVX-512 - * compressstore instructions. The algorithm to pick the pivot is to use median - * of 72 elements picked at random. If the array size is < 128, then use - * Bitonic sorting network. Good resource for bitonic sorting network: - * http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030 - * - * Refer to https://github.com/numpy/numpy/pull/20133#issuecomment-958110340 - * for potential problems when converting this code to universal intrinsics - * framework. - */ - -/* - * Constants used in sorting 16 elements in a ZMM registers. Based on Bitonic - * sorting network (see - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) - */ -#define NETWORK1 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 -#define NETWORK2 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 -#define NETWORK3 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 -#define NETWORK4 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 -#define NETWORK5 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -#define NETWORK6 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4 -#define NETWORK7 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 -#define ZMM_MAX_FLOAT _mm512_set1_ps(NPY_INFINITYF) -#define ZMM_MAX_UINT _mm512_set1_epi32(NPY_MAX_UINT32) -#define ZMM_MAX_INT _mm512_set1_epi32(NPY_MAX_INT32) -#define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d -#define SHUFFLE_ps(ZMM, MASK) _mm512_shuffle_ps(zmm, zmm, MASK) -#define SHUFFLE_epi32(ZMM, MASK) _mm512_shuffle_epi32(zmm, MASK) - -#define MAX(x, y) (((x) > (y)) ? (x) : (y)) -#define MIN(x, y) (((x) < (y)) ? (x) : (y)) - -/* - * Vectorized random number generator xoroshiro128+. 
Broken into 2 parts: - * (1) vnext generates 2 64-bit random integers - * (2) rnd_epu32 converts this to 4 32-bit random integers and bounds it to - * the length of the array - */ -#define VROTL(x, k) /* rotate each uint64_t value in vector */ \ - _mm256_or_si256(_mm256_slli_epi64((x), (k)), \ - _mm256_srli_epi64((x), 64 - (k))) - -static inline __m256i -vnext(__m256i *s0, __m256i *s1) -{ - *s1 = _mm256_xor_si256(*s0, *s1); /* modify vectors s1 and s0 */ - *s0 = _mm256_xor_si256(_mm256_xor_si256(VROTL(*s0, 24), *s1), - _mm256_slli_epi64(*s1, 16)); - *s1 = VROTL(*s1, 37); - return _mm256_add_epi64(*s0, *s1); /* return random vector */ -} - -/* transform random numbers to the range between 0 and bound - 1 */ -static inline __m256i -rnd_epu32(__m256i rnd_vec, __m256i bound) -{ - __m256i even = _mm256_srli_epi64(_mm256_mul_epu32(rnd_vec, bound), 32); - __m256i odd = _mm256_mul_epu32(_mm256_srli_epi64(rnd_vec, 32), bound); - return _mm256_blend_epi32(odd, even, 0b01010101); -} - -template -struct vector; - -template <> -struct vector { - using tag = npy::int_tag; - using type_t = npy_int; - using zmm_t = __m512i; - using ymm_t = __m256i; - - static type_t type_max() { return NPY_MAX_INT32; } - static type_t type_min() { return NPY_MIN_INT32; } - static zmm_t zmm_max() { return _mm512_set1_epi32(type_max()); } - - static __mmask16 ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_NLT); - } - template - static ymm_t i64gather(__m512i index, void const *base) - { - return _mm512_i64gather_epi32(index, base, scale); - } - static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); } - static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epi32(x, y); } - static void mask_compressstoreu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_compressstoreu_epi32(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, __mmask16 mask, void const *mem) - { - return _mm512_mask_loadu_epi32(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, __mmask16 mask, zmm_t y) - { - return _mm512_mask_mov_epi32(x, mask, y); - } - static void mask_storeu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_storeu_epi32(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epi32(x, y); } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_epi32(idx, zmm); - } - static type_t reducemax(zmm_t v) { return npyv_reduce_max_s32(v); } - static type_t reducemin(zmm_t v) { return npyv_reduce_min_s32(v); } - static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); } - template<__mmask16 mask> - static zmm_t shuffle(zmm_t zmm) - { - return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_si512(mem, x); - } - - static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_epi32(x, y); } - static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epi32(x, y); } -}; -template <> -struct vector { - using tag = npy::uint_tag; - using type_t = npy_uint; - using zmm_t = __m512i; - using ymm_t = __m256i; - - static type_t type_max() { return NPY_MAX_UINT32; } - static type_t type_min() { return 0; } - static zmm_t zmm_max() { return _mm512_set1_epi32(type_max()); } - - template - static ymm_t i64gather(__m512i index, void const *base) - { - return _mm512_i64gather_epi32(index, base, scale); - } - static __mmask16 ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_epu32_mask(x, y, _MM_CMPINT_NLT); - } - static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); } - static 
zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epu32(x, y); } - static void mask_compressstoreu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_compressstoreu_epi32(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, __mmask16 mask, void const *mem) - { - return _mm512_mask_loadu_epi32(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, __mmask16 mask, zmm_t y) - { - return _mm512_mask_mov_epi32(x, mask, y); - } - static void mask_storeu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_storeu_epi32(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epu32(x, y); } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_epi32(idx, zmm); - } - static type_t reducemax(zmm_t v) { return npyv_reduce_max_u32(v); } - static type_t reducemin(zmm_t v) { return npyv_reduce_min_u32(v); } - static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); } - template<__mmask16 mask> - static zmm_t shuffle(zmm_t zmm) - { - return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_si512(mem, x); - } - - static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_epu32(x, y); } - static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epu32(x, y); } -}; -template <> -struct vector { - using tag = npy::float_tag; - using type_t = npy_float; - using zmm_t = __m512; - using ymm_t = __m256; - - static type_t type_max() { return NPY_INFINITYF; } - static type_t type_min() { return -NPY_INFINITYF; } - static zmm_t zmm_max() { return _mm512_set1_ps(type_max()); } - - static __mmask16 ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); - } - template - static ymm_t i64gather(__m512i index, void const *base) - { - return _mm512_i64gather_ps(index, base, scale); - } - static zmm_t loadu(void const *mem) { return _mm512_loadu_ps(mem); } - static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_ps(x, y); } - static void mask_compressstoreu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_compressstoreu_ps(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, __mmask16 mask, void const *mem) - { - return _mm512_mask_loadu_ps(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, __mmask16 mask, zmm_t y) - { - return _mm512_mask_mov_ps(x, mask, y); - } - static void mask_storeu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_storeu_ps(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_ps(x, y); } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_ps(idx, zmm); - } - static type_t reducemax(zmm_t v) { return npyv_reduce_max_f32(v); } - static type_t reducemin(zmm_t v) { return npyv_reduce_min_f32(v); } - static zmm_t set1(type_t v) { return _mm512_set1_ps(v); } - template<__mmask16 mask> - static zmm_t shuffle(zmm_t zmm) - { - return _mm512_shuffle_ps(zmm, zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) { return _mm512_storeu_ps(mem, x); } - - static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_ps(x, y); } - static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_ps(x, y); } -}; - -/* - * COEX == Compare and Exchange two registers by swapping min and max values - */ -template -void -COEX(mm_t &a, mm_t &b) -{ - mm_t temp = a; - a = vtype::min(a, b); - b = vtype::max(temp, b); -} - -template -static inline zmm_t -cmp_merge(zmm_t in1, zmm_t in2, __mmask16 mask) -{ - zmm_t min = vtype::min(in2, in1); - zmm_t max = vtype::max(in2, in1); - return vtype::mask_mov(min, 
mask, max); // 0 -> min, 1 -> max -} - -/* - * Assumes zmm is random and performs a full sorting network defined in - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg - */ -template -static inline zmm_t -sort_zmm(zmm_t zmm) -{ - zmm = cmp_merge(zmm, vtype::template shuffle(zmm), - 0xAAAA); - zmm = cmp_merge(zmm, vtype::template shuffle(zmm), - 0xCCCC); - zmm = cmp_merge(zmm, vtype::template shuffle(zmm), - 0xAAAA); - zmm = cmp_merge( - zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK3), zmm), 0xF0F0); - zmm = cmp_merge(zmm, vtype::template shuffle(zmm), - 0xCCCC); - zmm = cmp_merge(zmm, vtype::template shuffle(zmm), - 0xAAAA); - zmm = cmp_merge( - zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm), 0xFF00); - zmm = cmp_merge( - zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK6), zmm), 0xF0F0); - zmm = cmp_merge(zmm, vtype::template shuffle(zmm), - 0xCCCC); - zmm = cmp_merge(zmm, vtype::template shuffle(zmm), - 0xAAAA); - return zmm; -} - -// Assumes zmm is bitonic and performs a recursive half cleaner -template -static inline zmm_t -bitonic_merge_zmm(zmm_t zmm) -{ - // 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc .. - zmm = cmp_merge( - zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK7), zmm), 0xFF00); - // 2) half_cleaner[8]: compare 1-5, 2-6, 3-7 etc .. - zmm = cmp_merge( - zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK6), zmm), 0xF0F0); - // 3) half_cleaner[4] - zmm = cmp_merge(zmm, vtype::template shuffle(zmm), - 0xCCCC); - // 3) half_cleaner[1] - zmm = cmp_merge(zmm, vtype::template shuffle(zmm), - 0xAAAA); - return zmm; -} - -// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner -template -static inline void -bitonic_merge_two_zmm(zmm_t *zmm1, zmm_t *zmm2) -{ - // 1) First step of a merging network: coex of zmm1 and zmm2 reversed - *zmm2 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), *zmm2); - zmm_t zmm3 = vtype::min(*zmm1, *zmm2); - zmm_t zmm4 = vtype::max(*zmm1, *zmm2); - // 2) Recursive half cleaner for each - *zmm1 = bitonic_merge_zmm(zmm3); - *zmm2 = bitonic_merge_zmm(zmm4); -} - -// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive -// half cleaner -template -static inline void -bitonic_merge_four_zmm(zmm_t *zmm) -{ - zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[2]); - zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[3]); - zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r); - zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r); - zmm_t zmm_t3 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[1], zmm2r)); - zmm_t zmm_t4 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[0], zmm3r)); - zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2); - zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2); - zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4); - zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4); - zmm[0] = bitonic_merge_zmm(zmm0); - zmm[1] = bitonic_merge_zmm(zmm1); - zmm[2] = bitonic_merge_zmm(zmm2); - zmm[3] = bitonic_merge_zmm(zmm3); -} - -template -static inline void -bitonic_merge_eight_zmm(zmm_t *zmm) -{ - zmm_t zmm4r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[4]); - zmm_t zmm5r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[5]); - zmm_t zmm6r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[6]); - zmm_t zmm7r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[7]); - zmm_t zmm_t1 = vtype::min(zmm[0], zmm7r); - zmm_t zmm_t2 = vtype::min(zmm[1], zmm6r); - zmm_t zmm_t3 = vtype::min(zmm[2], zmm5r); - zmm_t zmm_t4 = vtype::min(zmm[3], zmm4r); - zmm_t zmm_t5 = 
vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[3], zmm4r)); - zmm_t zmm_t6 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[2], zmm5r)); - zmm_t zmm_t7 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[1], zmm6r)); - zmm_t zmm_t8 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[0], zmm7r)); - COEX(zmm_t1, zmm_t3); - COEX(zmm_t2, zmm_t4); - COEX(zmm_t5, zmm_t7); - COEX(zmm_t6, zmm_t8); - COEX(zmm_t1, zmm_t2); - COEX(zmm_t3, zmm_t4); - COEX(zmm_t5, zmm_t6); - COEX(zmm_t7, zmm_t8); - zmm[0] = bitonic_merge_zmm(zmm_t1); - zmm[1] = bitonic_merge_zmm(zmm_t2); - zmm[2] = bitonic_merge_zmm(zmm_t3); - zmm[3] = bitonic_merge_zmm(zmm_t4); - zmm[4] = bitonic_merge_zmm(zmm_t5); - zmm[5] = bitonic_merge_zmm(zmm_t6); - zmm[6] = bitonic_merge_zmm(zmm_t7); - zmm[7] = bitonic_merge_zmm(zmm_t8); -} - -template -static inline void -sort_16(type_t *arr, npy_int N) -{ - __mmask16 load_mask = (0x0001 << N) - 0x0001; - typename vtype::zmm_t zmm = - vtype::mask_loadu(vtype::zmm_max(), load_mask, arr); - vtype::mask_storeu(arr, load_mask, sort_zmm(zmm)); -} - -template -static inline void -sort_32(type_t *arr, npy_int N) -{ - if (N <= 16) { - sort_16(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - zmm_t zmm1 = vtype::loadu(arr); - __mmask16 load_mask = (0x0001 << (N - 16)) - 0x0001; - zmm_t zmm2 = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr + 16); - zmm1 = sort_zmm(zmm1); - zmm2 = sort_zmm(zmm2); - bitonic_merge_two_zmm(&zmm1, &zmm2); - vtype::storeu(arr, zmm1); - vtype::mask_storeu(arr + 16, load_mask, zmm2); -} - -template -static inline void -sort_64(type_t *arr, npy_int N) -{ - if (N <= 32) { - sort_32(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - zmm_t zmm[4]; - zmm[0] = vtype::loadu(arr); - zmm[1] = vtype::loadu(arr + 16); - __mmask16 load_mask1 = 0xFFFF, load_mask2 = 0xFFFF; - if (N < 48) { - load_mask1 = (0x0001 << (N - 32)) - 0x0001; - load_mask2 = 0x0000; - } - else if (N < 64) { - load_mask2 = (0x0001 << (N - 48)) - 0x0001; - } - zmm[2] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 32); - zmm[3] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 48); - zmm[0] = sort_zmm(zmm[0]); - zmm[1] = sort_zmm(zmm[1]); - zmm[2] = sort_zmm(zmm[2]); - zmm[3] = sort_zmm(zmm[3]); - bitonic_merge_two_zmm(&zmm[0], &zmm[1]); - bitonic_merge_two_zmm(&zmm[2], &zmm[3]); - bitonic_merge_four_zmm(zmm); - vtype::storeu(arr, zmm[0]); - vtype::storeu(arr + 16, zmm[1]); - vtype::mask_storeu(arr + 32, load_mask1, zmm[2]); - vtype::mask_storeu(arr + 48, load_mask2, zmm[3]); -} - -template -static inline void -sort_128(type_t *arr, npy_int N) -{ - if (N <= 64) { - sort_64(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - zmm_t zmm[8]; - zmm[0] = vtype::loadu(arr); - zmm[1] = vtype::loadu(arr + 16); - zmm[2] = vtype::loadu(arr + 32); - zmm[3] = vtype::loadu(arr + 48); - zmm[0] = sort_zmm(zmm[0]); - zmm[1] = sort_zmm(zmm[1]); - zmm[2] = sort_zmm(zmm[2]); - zmm[3] = sort_zmm(zmm[3]); - __mmask16 load_mask1 = 0xFFFF, load_mask2 = 0xFFFF; - __mmask16 load_mask3 = 0xFFFF, load_mask4 = 0xFFFF; - if (N < 80) { - load_mask1 = (0x0001 << (N - 64)) - 0x0001; - load_mask2 = 0x0000; - load_mask3 = 0x0000; - load_mask4 = 0x0000; - } - else if (N < 96) { - load_mask2 = (0x0001 << (N - 80)) - 0x0001; - load_mask3 = 0x0000; - load_mask4 = 0x0000; - } - else if (N < 112) { - load_mask3 = (0x0001 << (N - 96)) - 0x0001; - load_mask4 = 0x0000; - } - else { - load_mask4 = (0x0001 << (N - 112)) - 0x0001; - } - zmm[4] = 
vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 64); - zmm[5] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 80); - zmm[6] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 96); - zmm[7] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 112); - zmm[4] = sort_zmm(zmm[4]); - zmm[5] = sort_zmm(zmm[5]); - zmm[6] = sort_zmm(zmm[6]); - zmm[7] = sort_zmm(zmm[7]); - bitonic_merge_two_zmm(&zmm[0], &zmm[1]); - bitonic_merge_two_zmm(&zmm[2], &zmm[3]); - bitonic_merge_two_zmm(&zmm[4], &zmm[5]); - bitonic_merge_two_zmm(&zmm[6], &zmm[7]); - bitonic_merge_four_zmm(zmm); - bitonic_merge_four_zmm(zmm + 4); - bitonic_merge_eight_zmm(zmm); - vtype::storeu(arr, zmm[0]); - vtype::storeu(arr + 16, zmm[1]); - vtype::storeu(arr + 32, zmm[2]); - vtype::storeu(arr + 48, zmm[3]); - vtype::mask_storeu(arr + 64, load_mask1, zmm[4]); - vtype::mask_storeu(arr + 80, load_mask2, zmm[5]); - vtype::mask_storeu(arr + 96, load_mask3, zmm[6]); - vtype::mask_storeu(arr + 112, load_mask4, zmm[7]); -} - -template -static inline void -swap(type_t *arr, npy_intp ii, npy_intp jj) -{ - type_t temp = arr[ii]; - arr[ii] = arr[jj]; - arr[jj] = temp; -} - -// Median of 3 strategy -// template -// static inline -// npy_intp get_pivot_index(type_t *arr, const npy_intp left, const npy_intp -// right) { -// return (rand() % (right + 1 - left)) + left; -// //npy_intp middle = ((right-left)/2) + left; -// //type_t a = arr[left], b = arr[middle], c = arr[right]; -// //if ((b >= a && b <= c) || (b <= a && b >= c)) -// // return middle; -// //if ((a >= b && a <= c) || (a <= b && a >= c)) -// // return left; -// //else -// // return right; -//} - -/* - * Picking the pivot: Median of 72 array elements chosen at random. - */ - -template -static inline type_t -get_pivot(type_t *arr, const npy_intp left, const npy_intp right) -{ - /* seeds for vectorized random number generator */ - __m256i s0 = _mm256_setr_epi64x(8265987198341093849, 3762817312854612374, - 1324281658759788278, 6214952190349879213); - __m256i s1 = _mm256_setr_epi64x(2874178529384792648, 1257248936691237653, - 7874578921548791257, 1998265912745817298); - s0 = _mm256_add_epi64(s0, _mm256_set1_epi64x(left)); - s1 = _mm256_sub_epi64(s1, _mm256_set1_epi64x(right)); - - npy_intp arrsize = right - left + 1; - __m256i bound = - _mm256_set1_epi32(arrsize > INT32_MAX ? 
INT32_MAX : arrsize); - __m512i left_vec = _mm512_set1_epi64(left); - __m512i right_vec = _mm512_set1_epi64(right); - using ymm_t = typename vtype::ymm_t; - ymm_t v[9]; - /* fill 9 vectors with random numbers */ - for (npy_int i = 0; i < 9; ++i) { - __m256i rand_64 = vnext(&s0, &s1); /* vector with 4 random uint64_t */ - __m512i rand_32 = _mm512_cvtepi32_epi64(rnd_epu32( - rand_64, bound)); /* random numbers between 0 and bound - 1 */ - __m512i indices; - if (i < 5) - indices = - _mm512_add_epi64(left_vec, rand_32); /* indices for arr */ - else - indices = - _mm512_sub_epi64(right_vec, rand_32); /* indices for arr */ - - v[i] = vtype::template i64gather(indices, arr); - } - - /* median network for 9 elements */ - COEX(v[0], v[1]); - COEX(v[2], v[3]); - COEX(v[4], v[5]); - COEX(v[6], v[7]); - COEX(v[0], v[2]); - COEX(v[1], v[3]); - COEX(v[4], v[6]); - COEX(v[5], v[7]); - COEX(v[0], v[4]); - COEX(v[1], v[2]); - COEX(v[5], v[6]); - COEX(v[3], v[7]); - COEX(v[1], v[5]); - COEX(v[2], v[6]); - COEX(v[3], v[5]); - COEX(v[2], v[4]); - COEX(v[3], v[4]); - COEX(v[3], v[8]); - COEX(v[4], v[8]); - - // technically v[4] needs to be sorted before we pick the correct median, - // picking the 4th element works just as well for performance - type_t *temp = (type_t *)&v[4]; - - return temp[4]; -} - -/* - * Partition one ZMM register based on the pivot and returns the index of the - * last element that is less than equal to the pivot. - */ -template -static inline npy_int -partition_vec(type_t *arr, npy_intp left, npy_intp right, const zmm_t curr_vec, - const zmm_t pivot_vec, zmm_t *smallest_vec, zmm_t *biggest_vec) -{ - /* which elements are larger than the pivot */ - __mmask16 gt_mask = vtype::ge(curr_vec, pivot_vec); - npy_int amount_gt_pivot = _mm_popcnt_u32((npy_int)gt_mask); - vtype::mask_compressstoreu(arr + left, _mm512_knot(gt_mask), curr_vec); - vtype::mask_compressstoreu(arr + right - amount_gt_pivot, gt_mask, - curr_vec); - *smallest_vec = vtype::min(curr_vec, *smallest_vec); - *biggest_vec = vtype::max(curr_vec, *biggest_vec); - return amount_gt_pivot; -} - -/* - * Partition an array based on the pivot and returns the index of the - * last element that is less than equal to the pivot. 
- */ -template -static inline npy_intp -partition_avx512(type_t *arr, npy_intp left, npy_intp right, type_t pivot, - type_t *smallest, type_t *biggest) -{ - /* make array length divisible by 16 , shortening the array */ - for (npy_int i = (right - left) % 16; i > 0; --i) { - *smallest = MIN(*smallest, arr[left]); - *biggest = MAX(*biggest, arr[left]); - if (arr[left] > pivot) { - swap(arr, left, --right); - } - else { - ++left; - } - } - - if (left == right) - return left; /* less than 16 elements in the array */ - - using zmm_t = typename vtype::zmm_t; - zmm_t pivot_vec = vtype::set1(pivot); - zmm_t min_vec = vtype::set1(*smallest); - zmm_t max_vec = vtype::set1(*biggest); - - if (right - left == 16) { - zmm_t vec = vtype::loadu(arr + left); - npy_int amount_gt_pivot = partition_vec( - arr, left, left + 16, vec, pivot_vec, &min_vec, &max_vec); - *smallest = vtype::reducemin(min_vec); - *biggest = vtype::reducemax(max_vec); - return left + (16 - amount_gt_pivot); - } - - // first and last 16 values are partitioned at the end - zmm_t vec_left = vtype::loadu(arr + left); - zmm_t vec_right = vtype::loadu(arr + (right - 16)); - // store points of the vectors - npy_intp r_store = right - 16; - npy_intp l_store = left; - // indices for loading the elements - left += 16; - right -= 16; - while (right - left != 0) { - zmm_t curr_vec; - /* - * if fewer elements are stored on the right side of the array, - * then next elements are loaded from the right side, - * otherwise from the left side - */ - if ((r_store + 16) - right < left - l_store) { - right -= 16; - curr_vec = vtype::loadu(arr + right); - } - else { - curr_vec = vtype::loadu(arr + left); - left += 16; - } - // partition the current vector and save it on both sides of the array - npy_int amount_gt_pivot = - partition_vec(arr, l_store, r_store + 16, curr_vec, - pivot_vec, &min_vec, &max_vec); - ; - r_store -= amount_gt_pivot; - l_store += (16 - amount_gt_pivot); - } - - /* partition and save vec_left and vec_right */ - npy_int amount_gt_pivot = - partition_vec(arr, l_store, r_store + 16, vec_left, - pivot_vec, &min_vec, &max_vec); - l_store += (16 - amount_gt_pivot); - amount_gt_pivot = - partition_vec(arr, l_store, l_store + 16, vec_right, - pivot_vec, &min_vec, &max_vec); - l_store += (16 - amount_gt_pivot); - *smallest = vtype::reducemin(min_vec); - *biggest = vtype::reducemax(max_vec); - return l_store; -} - -template -static inline void -qsort_(type_t *arr, npy_intp left, npy_intp right, npy_int max_iters) -{ - /* - * Resort to heapsort if quicksort isn't making any progress - */ - if (max_iters <= 0) { - heapsort_(arr + left, right + 1 - left); - return; - } - /* - * Base case: use bitonic networks to sort arrays <= 128 - */ - if (right + 1 - left <= 128) { - sort_128(arr + left, (npy_int)(right + 1 - left)); - return; - } - - type_t pivot = get_pivot(arr, left, right); - type_t smallest = vtype::type_max(); - type_t biggest = vtype::type_min(); - npy_intp pivot_index = partition_avx512(arr, left, right + 1, pivot, - &smallest, &biggest); - if (pivot != smallest) - qsort_(arr, left, pivot_index - 1, max_iters - 1); - if (pivot != biggest) - qsort_(arr, pivot_index, right, max_iters - 1); -} - -static inline npy_intp -replace_nan_with_inf(npy_float *arr, npy_intp arrsize) -{ - npy_intp nan_count = 0; - __mmask16 loadmask = 0xFFFF; - while (arrsize > 0) { - if (arrsize < 16) { - loadmask = (0x0001 << arrsize) - 0x0001; - } - __m512 in_zmm = _mm512_maskz_loadu_ps(loadmask, arr); - __mmask16 nanmask = _mm512_cmp_ps_mask(in_zmm, in_zmm, 
_CMP_NEQ_UQ); - nan_count += _mm_popcnt_u32((npy_int)nanmask); - _mm512_mask_storeu_ps(arr, nanmask, ZMM_MAX_FLOAT); - arr += 16; - arrsize -= 16; - } - return nan_count; -} - -static inline void -replace_inf_with_nan(npy_float *arr, npy_intp arrsize, npy_intp nan_count) -{ - for (npy_intp ii = arrsize - 1; nan_count > 0; --ii) { - arr[ii] = NPY_NANF; - nan_count -= 1; - } -} - -/*************************************** - * C > C++ dispatch - ***************************************/ - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_int)(void *arr, npy_intp arrsize) -{ - if (arrsize > 1) { - qsort_, npy_int>((npy_int *)arr, 0, arrsize - 1, - 2 * (npy_int)log2(arrsize)); - } -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_uint)(void *arr, npy_intp arrsize) -{ - if (arrsize > 1) { - qsort_, npy_uint>((npy_uint *)arr, 0, arrsize - 1, - 2 * (npy_int)log2(arrsize)); - } -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_float)(void *arr, npy_intp arrsize) -{ - if (arrsize > 1) { - npy_intp nan_count = replace_nan_with_inf((npy_float *)arr, arrsize); - qsort_, npy_float>((npy_float *)arr, 0, arrsize - 1, - 2 * (npy_int)log2(arrsize)); - replace_inf_with_nan((npy_float *)arr, arrsize, nan_count); - } -} - -#endif // NPY_HAVE_AVX512_SKX diff --git a/numpy/core/src/npysort/x86-qsort.h b/numpy/core/src/npysort/x86-qsort.h deleted file mode 100644 index 6340e2bc7..000000000 --- a/numpy/core/src/npysort/x86-qsort.h +++ /dev/null @@ -1,28 +0,0 @@ -#include "numpy/npy_common.h" - -#include "npy_cpu_dispatch.h" - -#ifndef NPY_NO_EXPORT -#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN -#endif - -#ifndef NPY_DISABLE_OPTIMIZATION -#include "x86-qsort.dispatch.h" -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_int, - (void *start, npy_intp num)) - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_uint, - (void *start, npy_intp num)) - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_float, - (void *start, npy_intp num)) - -#ifdef __cplusplus -} -#endif -- cgit v1.2.1 From 882503ac9383b3fff0ecf5423e732e64469347ba Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 28 Sep 2022 13:34:11 -0700 Subject: ENH: Add AVX-512 based 16-bit dtype sort --- numpy/core/setup.py | 1 + numpy/core/src/npysort/quicksort.cpp | 34 +++++++++++++++++++++++ numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp | 29 +++++++++++++++++++ numpy/core/src/npysort/x86-qsort-icl.h | 24 ++++++++++++++++ 4 files changed, 88 insertions(+) create mode 100644 numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp create mode 100644 numpy/core/src/npysort/x86-qsort-icl.h diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 912867709..0331a2f9b 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -944,6 +944,7 @@ def configuration(parent_package='',top_path=None): join('src', 'multiarray', 'vdot.c'), join('src', 'common', 'npy_sort.h.src'), join('src', 'npysort', 'x86-qsort-skx.dispatch.cpp'), + join('src', 'npysort', 'x86-qsort-icl.dispatch.cpp'), join('src', 'npysort', 'quicksort.cpp'), join('src', 'npysort', 'mergesort.cpp'), join('src', 'npysort', 'timsort.cpp'), diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp index 06ac0c172..d89dac173 100644 --- a/numpy/core/src/npysort/quicksort.cpp +++ b/numpy/core/src/npysort/quicksort.cpp @@ -56,6 +56,7 @@ #include "numpy_tag.h" #include "x86-qsort-skx.h" +#include "x86-qsort-icl.h" #include #include @@ -86,6 +87,7 @@ struct 
x86_dispatch { static bool quicksort(typename Tag::type *, npy_intp) { return false; } }; + template <> struct x86_dispatch { static bool quicksort(npy_long *start, npy_intp num) @@ -170,6 +172,38 @@ struct x86_dispatch { } }; +#ifndef NPY_DISABLE_OPTIMIZATION +#include "x86-qsort-icl.dispatch.h" +#endif + +template <> +struct x86_dispatch { + static bool quicksort(npy_short *start, npy_intp num) + { + void (*dispfunc)(void *, npy_intp) = nullptr; + NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_short); + if (dispfunc) { + (*dispfunc)(start, num); + return true; + } + return false; + } +}; + +template <> +struct x86_dispatch { + static bool quicksort(npy_ushort *start, npy_intp num) + { + void (*dispfunc)(void *, npy_intp) = nullptr; + NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_ushort); + if (dispfunc) { + (*dispfunc)(start, num); + return true; + } + return false; + } +}; + } // namespace template diff --git a/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp b/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp new file mode 100644 index 000000000..7d6dc331b --- /dev/null +++ b/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp @@ -0,0 +1,29 @@ +/*@targets + * $maxopt $keep_baseline avx512_icl + */ +// policy $keep_baseline is used to avoid skip building avx512_skx +// when its part of baseline features (--cpu-baseline), since +// 'baseline' option isn't specified within targets. + +#include "x86-qsort-icl.h" +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#ifdef NPY_HAVE_AVX512_ICL +#include "avx512-16bit-qsort.hpp" + +/*************************************** + * C > C++ dispatch + ***************************************/ +NPY_NO_EXPORT void +NPY_CPU_DISPATCH_CURFX(x86_quicksort_short)(void *arr, npy_intp arrsize) +{ + avx512_qsort((npy_short*)arr, arrsize); +} + +NPY_NO_EXPORT void +NPY_CPU_DISPATCH_CURFX(x86_quicksort_ushort)(void *arr, npy_intp arrsize) +{ + avx512_qsort((npy_ushort*)arr, arrsize); +} + +#endif // NPY_HAVE_AVX512_ICL diff --git a/numpy/core/src/npysort/x86-qsort-icl.h b/numpy/core/src/npysort/x86-qsort-icl.h new file mode 100644 index 000000000..2093e0bce --- /dev/null +++ b/numpy/core/src/npysort/x86-qsort-icl.h @@ -0,0 +1,24 @@ +#include "numpy/npy_common.h" + +#include "npy_cpu_dispatch.h" + +#ifndef NPY_NO_EXPORT +#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN +#endif + +#ifndef NPY_DISABLE_OPTIMIZATION +#include "x86-qsort-icl.dispatch.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_short, + (void *start, npy_intp num)) + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_ushort, + (void *start, npy_intp num)) + +#ifdef __cplusplus +} +#endif -- cgit v1.2.1 From 1b5f40c89634d9399c1f3a7906dedc153b202b69 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 28 Sep 2022 22:21:52 -0700 Subject: BUG: Use longlong when NPY_SIZEOF_LONG is 4 --- numpy/core/src/npysort/quicksort.cpp | 10 ++++++++++ numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp | 8 ++++++++ 2 files changed, 18 insertions(+) diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp index d89dac173..3af6b91d6 100644 --- a/numpy/core/src/npysort/quicksort.cpp +++ b/numpy/core/src/npysort/quicksort.cpp @@ -89,8 +89,13 @@ struct x86_dispatch { template <> +#if NPY_SIZEOF_LONG == 8 struct x86_dispatch { static bool quicksort(npy_long *start, npy_intp num) +#else +struct x86_dispatch { + static bool quicksort(npy_longlong *start, npy_intp num) +#endif { void (*dispfunc)(void *, npy_intp) = 
nullptr; NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_long); @@ -103,8 +108,13 @@ struct x86_dispatch { }; template <> +#if NPY_SIZEOF_LONG == 8 struct x86_dispatch { static bool quicksort(npy_ulong *start, npy_intp num) +#else +struct x86_dispatch { + static bool quicksort(npy_ulonglong *start, npy_intp num) +#endif { void (*dispfunc)(void *, npy_intp) = nullptr; NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_ulong); diff --git a/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp b/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp index d26b8fc9f..fb328f547 100644 --- a/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp +++ b/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp @@ -18,13 +18,21 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(x86_quicksort_long)(void *arr, npy_intp arrsize) { +#if NPY_SIZEOF_LONG == 8 avx512_qsort((npy_long*)arr, arrsize); +#else + avx512_qsort((npy_longlong*)arr, arrsize); +#endif } NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(x86_quicksort_ulong)(void *arr, npy_intp arrsize) { +#if NPY_SIZEOF_LONG == 8 avx512_qsort((npy_ulong*)arr, arrsize); +#else + avx512_qsort((npy_ulonglong*)arr, arrsize); +#endif } NPY_NO_EXPORT void -- cgit v1.2.1 From 9edebc521b13bc2aa5a3367635730a4c4b4efac4 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 28 Sep 2022 22:22:41 -0700 Subject: Revert "ENH: Add AVX-512 based 16-bit dtype sort" This reverts commit 225c8bab83d239d8888bc7b688efed97ab2284cf. --- numpy/core/setup.py | 1 - numpy/core/src/npysort/quicksort.cpp | 34 ----------------------- numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp | 29 ------------------- numpy/core/src/npysort/x86-qsort-icl.h | 24 ---------------- 4 files changed, 88 deletions(-) delete mode 100644 numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp delete mode 100644 numpy/core/src/npysort/x86-qsort-icl.h diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 0331a2f9b..912867709 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -944,7 +944,6 @@ def configuration(parent_package='',top_path=None): join('src', 'multiarray', 'vdot.c'), join('src', 'common', 'npy_sort.h.src'), join('src', 'npysort', 'x86-qsort-skx.dispatch.cpp'), - join('src', 'npysort', 'x86-qsort-icl.dispatch.cpp'), join('src', 'npysort', 'quicksort.cpp'), join('src', 'npysort', 'mergesort.cpp'), join('src', 'npysort', 'timsort.cpp'), diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp index 3af6b91d6..85b4a1e62 100644 --- a/numpy/core/src/npysort/quicksort.cpp +++ b/numpy/core/src/npysort/quicksort.cpp @@ -56,7 +56,6 @@ #include "numpy_tag.h" #include "x86-qsort-skx.h" -#include "x86-qsort-icl.h" #include #include @@ -87,7 +86,6 @@ struct x86_dispatch { static bool quicksort(typename Tag::type *, npy_intp) { return false; } }; - template <> #if NPY_SIZEOF_LONG == 8 struct x86_dispatch { @@ -182,38 +180,6 @@ struct x86_dispatch { } }; -#ifndef NPY_DISABLE_OPTIMIZATION -#include "x86-qsort-icl.dispatch.h" -#endif - -template <> -struct x86_dispatch { - static bool quicksort(npy_short *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_short); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; - -template <> -struct x86_dispatch { - static bool quicksort(npy_ushort *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_ushort); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - 
return false; - } -}; - } // namespace template diff --git a/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp b/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp deleted file mode 100644 index 7d6dc331b..000000000 --- a/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp +++ /dev/null @@ -1,29 +0,0 @@ -/*@targets - * $maxopt $keep_baseline avx512_icl - */ -// policy $keep_baseline is used to avoid skip building avx512_skx -// when its part of baseline features (--cpu-baseline), since -// 'baseline' option isn't specified within targets. - -#include "x86-qsort-icl.h" -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#ifdef NPY_HAVE_AVX512_ICL -#include "avx512-16bit-qsort.hpp" - -/*************************************** - * C > C++ dispatch - ***************************************/ -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_short)(void *arr, npy_intp arrsize) -{ - avx512_qsort((npy_short*)arr, arrsize); -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_ushort)(void *arr, npy_intp arrsize) -{ - avx512_qsort((npy_ushort*)arr, arrsize); -} - -#endif // NPY_HAVE_AVX512_ICL diff --git a/numpy/core/src/npysort/x86-qsort-icl.h b/numpy/core/src/npysort/x86-qsort-icl.h deleted file mode 100644 index 2093e0bce..000000000 --- a/numpy/core/src/npysort/x86-qsort-icl.h +++ /dev/null @@ -1,24 +0,0 @@ -#include "numpy/npy_common.h" - -#include "npy_cpu_dispatch.h" - -#ifndef NPY_NO_EXPORT -#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN -#endif - -#ifndef NPY_DISABLE_OPTIMIZATION -#include "x86-qsort-icl.dispatch.h" -#endif - -#ifdef __cplusplus -extern "C" { -#endif -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_short, - (void *start, npy_intp num)) - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_ushort, - (void *start, npy_intp num)) - -#ifdef __cplusplus -} -#endif -- cgit v1.2.1 From 57215f84ce60653908b99179338416dd7c2bbd36 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Fri, 30 Sep 2022 11:06:40 -0700 Subject: BUG: Ensure long/longlong is 8 bytes for 64-bit qsort --- numpy/core/src/npysort/quicksort.cpp | 36 +++++++++++++++++------ numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp | 12 ++------ 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp index 85b4a1e62..0674d25ac 100644 --- a/numpy/core/src/npysort/quicksort.cpp +++ b/numpy/core/src/npysort/quicksort.cpp @@ -86,14 +86,10 @@ struct x86_dispatch { static bool quicksort(typename Tag::type *, npy_intp) { return false; } }; -template <> #if NPY_SIZEOF_LONG == 8 +template <> struct x86_dispatch { static bool quicksort(npy_long *start, npy_intp num) -#else -struct x86_dispatch { - static bool quicksort(npy_longlong *start, npy_intp num) -#endif { void (*dispfunc)(void *, npy_intp) = nullptr; NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_long); @@ -104,15 +100,36 @@ struct x86_dispatch { return false; } }; - template <> -#if NPY_SIZEOF_LONG == 8 struct x86_dispatch { static bool quicksort(npy_ulong *start, npy_intp num) -#else + { + void (*dispfunc)(void *, npy_intp) = nullptr; + NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_ulong); + if (dispfunc) { + (*dispfunc)(start, num); + return true; + } + return false; + } +}; +#elif NPY_SIZEOF_LONGLONG == 8 +template <> +struct x86_dispatch { + static bool quicksort(npy_longlong *start, npy_intp num) + { + void (*dispfunc)(void *, npy_intp) = nullptr; + NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_long); + if (dispfunc) { + (*dispfunc)(start, num); 
+ return true; + } + return false; + } +}; +template <> struct x86_dispatch { static bool quicksort(npy_ulonglong *start, npy_intp num) -#endif { void (*dispfunc)(void *, npy_intp) = nullptr; NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_ulong); @@ -123,6 +140,7 @@ struct x86_dispatch { return false; } }; +#endif template <> struct x86_dispatch { diff --git a/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp b/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp index fb328f547..521b198ce 100644 --- a/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp +++ b/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp @@ -18,21 +18,13 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(x86_quicksort_long)(void *arr, npy_intp arrsize) { -#if NPY_SIZEOF_LONG == 8 - avx512_qsort((npy_long*)arr, arrsize); -#else - avx512_qsort((npy_longlong*)arr, arrsize); -#endif + avx512_qsort((int64_t*)arr, arrsize); } NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(x86_quicksort_ulong)(void *arr, npy_intp arrsize) { -#if NPY_SIZEOF_LONG == 8 - avx512_qsort((npy_ulong*)arr, arrsize); -#else - avx512_qsort((npy_ulonglong*)arr, arrsize); -#endif + avx512_qsort((uint64_t*)arr, arrsize); } NPY_NO_EXPORT void -- cgit v1.2.1 From 73280879df00c9542909779bc9fbd99747681be7 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 5 Oct 2022 22:18:39 -0700 Subject: MAINT: Force inline bitonic network functions --- .../x86-simd-sort/src/avx512-16bit-qsort.hpp | 18 ++++++------- .../x86-simd-sort/src/avx512-32bit-qsort.hpp | 26 +++++++++---------- .../x86-simd-sort/src/avx512-64bit-qsort.hpp | 30 +++++++++++----------- 3 files changed, 37 insertions(+), 37 deletions(-) diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp index 1673eb5da..26a54e36b 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp @@ -236,7 +236,7 @@ struct vector { * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg */ template -static inline zmm_t sort_zmm_16bit(zmm_t zmm) +NPY_FINLINE zmm_t sort_zmm_16bit(zmm_t zmm) { // Level 1 zmm = cmp_merge( @@ -308,7 +308,7 @@ static inline zmm_t sort_zmm_16bit(zmm_t zmm) // Assumes zmm is bitonic and performs a recursive half cleaner template -static inline zmm_t bitonic_merge_zmm_16bit(zmm_t zmm) +NPY_FINLINE zmm_t bitonic_merge_zmm_16bit(zmm_t zmm) { // 1) half_cleaner[32]: compare 1-17, 2-18, 3-19 etc .. 
zmm = cmp_merge( @@ -340,7 +340,7 @@ static inline zmm_t bitonic_merge_zmm_16bit(zmm_t zmm) // Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner template -static inline void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2) +NPY_FINLINE void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2) { // 1) First step of a merging network: coex of zmm1 and zmm2 reversed zmm2 = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), zmm2); @@ -354,7 +354,7 @@ static inline void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2) // Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive // half cleaner template -static inline void bitonic_merge_four_zmm_16bit(zmm_t *zmm) +NPY_FINLINE void bitonic_merge_four_zmm_16bit(zmm_t *zmm) { zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), zmm[2]); zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), zmm[3]); @@ -375,7 +375,7 @@ static inline void bitonic_merge_four_zmm_16bit(zmm_t *zmm) } template -static inline void sort_32_16bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_32_16bit(type_t *arr, int32_t N) { typename vtype::opmask_t load_mask = ((0x1ull << N) - 0x1ull) & 0xFFFFFFFF; typename vtype::zmm_t zmm @@ -384,7 +384,7 @@ static inline void sort_32_16bit(type_t *arr, int32_t N) } template -static inline void sort_64_16bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_64_16bit(type_t *arr, int32_t N) { if (N <= 32) { sort_32_16bit(arr, N); @@ -403,7 +403,7 @@ static inline void sort_64_16bit(type_t *arr, int32_t N) } template -static inline void sort_128_16bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_128_16bit(type_t *arr, int32_t N) { if (N <= 64) { sort_64_16bit(arr, N); @@ -436,7 +436,7 @@ static inline void sort_128_16bit(type_t *arr, int32_t N) } template -static inline type_t +NPY_FINLINE type_t get_pivot_16bit(type_t *arr, const int64_t left, const int64_t right) { // median of 32 @@ -478,7 +478,7 @@ get_pivot_16bit(type_t *arr, const int64_t left, const int64_t right) } template -static inline void +static void qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) { /* diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp index cbc5368f0..7899d8522 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp @@ -336,7 +336,7 @@ struct vector { * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg */ template -static inline zmm_t sort_zmm_32bit(zmm_t zmm) +NPY_FINLINE zmm_t sort_zmm_32bit(zmm_t zmm) { zmm = cmp_merge( zmm, @@ -383,7 +383,7 @@ static inline zmm_t sort_zmm_32bit(zmm_t zmm) // Assumes zmm is bitonic and performs a recursive half cleaner template -static inline zmm_t bitonic_merge_zmm_32bit(zmm_t zmm) +NPY_FINLINE zmm_t bitonic_merge_zmm_32bit(zmm_t zmm) { // 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc .. 
zmm = cmp_merge( @@ -410,7 +410,7 @@ static inline zmm_t bitonic_merge_zmm_32bit(zmm_t zmm) // Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner template -static inline void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2) +NPY_FINLINE void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2) { // 1) First step of a merging network: coex of zmm1 and zmm2 reversed *zmm2 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), *zmm2); @@ -424,7 +424,7 @@ static inline void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2) // Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive // half cleaner template -static inline void bitonic_merge_four_zmm_32bit(zmm_t *zmm) +NPY_FINLINE void bitonic_merge_four_zmm_32bit(zmm_t *zmm) { zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[2]); zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[3]); @@ -445,7 +445,7 @@ static inline void bitonic_merge_four_zmm_32bit(zmm_t *zmm) } template -static inline void bitonic_merge_eight_zmm_32bit(zmm_t *zmm) +NPY_FINLINE void bitonic_merge_eight_zmm_32bit(zmm_t *zmm) { zmm_t zmm4r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[4]); zmm_t zmm5r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[5]); @@ -482,7 +482,7 @@ static inline void bitonic_merge_eight_zmm_32bit(zmm_t *zmm) } template -static inline void sort_16_32bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_16_32bit(type_t *arr, int32_t N) { typename vtype::opmask_t load_mask = (0x0001 << N) - 0x0001; typename vtype::zmm_t zmm @@ -491,7 +491,7 @@ static inline void sort_16_32bit(type_t *arr, int32_t N) } template -static inline void sort_32_32bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_32_32bit(type_t *arr, int32_t N) { if (N <= 16) { sort_16_32bit(arr, N); @@ -509,7 +509,7 @@ static inline void sort_32_32bit(type_t *arr, int32_t N) } template -static inline void sort_64_32bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_64_32bit(type_t *arr, int32_t N) { if (N <= 32) { sort_32_32bit(arr, N); @@ -540,7 +540,7 @@ static inline void sort_64_32bit(type_t *arr, int32_t N) } template -static inline void sort_128_32bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_128_32bit(type_t *arr, int32_t N) { if (N <= 64) { sort_64_32bit(arr, N); @@ -592,7 +592,7 @@ static inline void sort_128_32bit(type_t *arr, int32_t N) } template -static inline type_t +NPY_FINLINE type_t get_pivot_32bit(type_t *arr, const int64_t left, const int64_t right) { // median of 16 @@ -626,7 +626,7 @@ get_pivot_32bit(type_t *arr, const int64_t left, const int64_t right) } template -static inline void +static void qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) { /* @@ -655,7 +655,7 @@ qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) qsort_32bit_(arr, pivot_index, right, max_iters - 1); } -static inline int64_t replace_nan_with_inf(float *arr, int64_t arrsize) +NPY_FINLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize) { int64_t nan_count = 0; __mmask16 loadmask = 0xFFFF; @@ -671,7 +671,7 @@ static inline int64_t replace_nan_with_inf(float *arr, int64_t arrsize) return nan_count; } -static inline void +NPY_FINLINE void replace_inf_with_nan(float *arr, int64_t arrsize, int64_t nan_count) { for (int64_t ii = arrsize - 1; nan_count > 0; --ii) { diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp index f680c0704..62a7fa54e 
100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp @@ -331,7 +331,7 @@ struct vector { * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg */ template -static inline zmm_t sort_zmm_64bit(zmm_t zmm) +NPY_FINLINE zmm_t sort_zmm_64bit(zmm_t zmm) { zmm = cmp_merge( zmm, vtype::template shuffle(zmm), 0xAA); @@ -353,7 +353,7 @@ static inline zmm_t sort_zmm_64bit(zmm_t zmm) // Assumes zmm is bitonic and performs a recursive half cleaner template -static inline zmm_t bitonic_merge_zmm_64bit(zmm_t zmm) +NPY_FINLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm) { // 1) half_cleaner[8]: compare 0-4, 1-5, 2-6, 3-7 @@ -374,7 +374,7 @@ static inline zmm_t bitonic_merge_zmm_64bit(zmm_t zmm) // Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner template -static inline void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2) +NPY_FINLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2) { // 1) First step of a merging network: coex of zmm1 and zmm2 reversed zmm2 = vtype::permutexvar(rev_index, zmm2); @@ -388,7 +388,7 @@ static inline void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2) // Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive // half cleaner template -static inline void bitonic_merge_four_zmm_64bit(zmm_t *zmm) +NPY_FINLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm) { // 1) First step of a merging network zmm_t zmm2r = vtype::permutexvar(rev_index, zmm[2]); @@ -409,7 +409,7 @@ static inline void bitonic_merge_four_zmm_64bit(zmm_t *zmm) } template -static inline void bitonic_merge_eight_zmm_64bit(zmm_t *zmm) +NPY_FINLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm) { zmm_t zmm4r = vtype::permutexvar(rev_index, zmm[4]); zmm_t zmm5r = vtype::permutexvar(rev_index, zmm[5]); @@ -442,7 +442,7 @@ static inline void bitonic_merge_eight_zmm_64bit(zmm_t *zmm) } template -static inline void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm) +NPY_FINLINE void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm) { zmm_t zmm8r = vtype::permutexvar(rev_index, zmm[8]); zmm_t zmm9r = vtype::permutexvar(rev_index, zmm[9]); @@ -515,7 +515,7 @@ static inline void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm) } template -static inline void sort_8_64bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_8_64bit(type_t *arr, int32_t N) { typename vtype::opmask_t load_mask = (0x01 << N) - 0x01; typename vtype::zmm_t zmm @@ -524,7 +524,7 @@ static inline void sort_8_64bit(type_t *arr, int32_t N) } template -static inline void sort_16_64bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_16_64bit(type_t *arr, int32_t N) { if (N <= 8) { sort_8_64bit(arr, N); @@ -542,7 +542,7 @@ static inline void sort_16_64bit(type_t *arr, int32_t N) } template -static inline void sort_32_64bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_32_64bit(type_t *arr, int32_t N) { if (N <= 16) { sort_16_64bit(arr, N); @@ -573,7 +573,7 @@ static inline void sort_32_64bit(type_t *arr, int32_t N) } template -static inline void sort_64_64bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_64_64bit(type_t *arr, int32_t N) { if (N <= 32) { sort_32_64bit(arr, N); @@ -624,7 +624,7 @@ static inline void sort_64_64bit(type_t *arr, int32_t N) } template -static inline void sort_128_64bit(type_t *arr, int32_t N) +NPY_FINLINE void sort_128_64bit(type_t *arr, int32_t N) { if (N <= 64) { sort_64_64bit(arr, N); @@ -714,7 +714,7 @@ static inline void sort_128_64bit(type_t *arr, int32_t N) } template -static 
inline type_t +NPY_FINLINE type_t get_pivot_64bit(type_t *arr, const int64_t left, const int64_t right) { // median of 8 @@ -735,7 +735,7 @@ get_pivot_64bit(type_t *arr, const int64_t left, const int64_t right) } template -static inline void +static void qsort_64bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) { /* @@ -764,7 +764,7 @@ qsort_64bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) qsort_64bit_(arr, pivot_index, right, max_iters - 1); } -static inline int64_t replace_nan_with_inf(double *arr, int64_t arrsize) +NPY_FINLINE int64_t replace_nan_with_inf(double *arr, int64_t arrsize) { int64_t nan_count = 0; __mmask8 loadmask = 0xFF; @@ -780,7 +780,7 @@ static inline int64_t replace_nan_with_inf(double *arr, int64_t arrsize) return nan_count; } -static inline void +NPY_FINLINE void replace_inf_with_nan(double *arr, int64_t arrsize, int64_t nan_count) { for (int64_t ii = arrsize - 1; nan_count > 0; --ii) { -- cgit v1.2.1 From fba06e75e4865168f5c3b6637c8a792fc1d9a2d7 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Thu, 6 Oct 2022 13:51:18 -0700 Subject: ENH: Use npyv_* for missing intrinsics in gcc-6 --- .../npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp | 20 ++++++++++---------- .../npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp | 18 +++++++++--------- .../npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp | 18 +++++++++--------- .../npysort/x86-simd-sort/src/avx512-common-qsort.h | 1 + 4 files changed, 29 insertions(+), 28 deletions(-) diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp index 26a54e36b..51cb4dbb0 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp @@ -57,7 +57,7 @@ struct vector { static opmask_t knot_opmask(opmask_t x) { - return _knot_mask32(x); + return npyv_not_b16(x); } static opmask_t ge(zmm_t x, zmm_t y) { @@ -106,16 +106,16 @@ struct vector { { zmm_t lo = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 0)); zmm_t hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 1)); - type_t lo_max = (type_t)_mm512_reduce_max_epi32(lo); - type_t hi_max = (type_t)_mm512_reduce_max_epi32(hi); + type_t lo_max = (type_t)npyv_reduce_max_s32(lo); + type_t hi_max = (type_t)npyv_reduce_max_s32(hi); return std::max(lo_max, hi_max); } static type_t reducemin(zmm_t v) { zmm_t lo = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 0)); zmm_t hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 1)); - type_t lo_min = (type_t)_mm512_reduce_min_epi32(lo); - type_t hi_min = (type_t)_mm512_reduce_min_epi32(hi); + type_t lo_min = (type_t)npyv_reduce_min_s32(lo); + type_t hi_min = (type_t)npyv_reduce_min_s32(hi); return std::min(lo_min, hi_min); } static zmm_t set1(type_t v) @@ -161,7 +161,7 @@ struct vector { //} static opmask_t knot_opmask(opmask_t x) { - return _knot_mask32(x); + return npyv_not_b16(x); } static opmask_t ge(zmm_t x, zmm_t y) { @@ -203,16 +203,16 @@ struct vector { { zmm_t lo = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 0)); zmm_t hi = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 1)); - type_t lo_max = (type_t)_mm512_reduce_max_epi32(lo); - type_t hi_max = (type_t)_mm512_reduce_max_epi32(hi); + type_t lo_max = (type_t)npyv_reduce_max_s32(lo); + type_t hi_max = (type_t)npyv_reduce_max_s32(hi); return std::max(lo_max, hi_max); } static type_t reducemin(zmm_t v) { zmm_t lo = 
_mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 0)); zmm_t hi = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 1)); - type_t lo_min = (type_t)_mm512_reduce_min_epi32(lo); - type_t hi_min = (type_t)_mm512_reduce_min_epi32(hi); + type_t lo_min = (type_t)npyv_reduce_min_s32(lo); + type_t hi_min = (type_t)npyv_reduce_min_s32(hi); return std::min(lo_min, hi_min); } static zmm_t set1(type_t v) diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp index 7899d8522..ac5bece7a 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp @@ -46,7 +46,7 @@ struct vector { static opmask_t knot_opmask(opmask_t x) { - return _knot_mask16(x); + return _mm512_knot(x); } static opmask_t ge(zmm_t x, zmm_t y) { @@ -96,11 +96,11 @@ struct vector { } static type_t reducemax(zmm_t v) { - return _mm512_reduce_max_epi32(v); + return npyv_reduce_max_s32(v); } static type_t reducemin(zmm_t v) { - return _mm512_reduce_min_epi32(v); + return npyv_reduce_min_s32(v); } static zmm_t set1(type_t v) { @@ -158,7 +158,7 @@ struct vector { } static opmask_t knot_opmask(opmask_t x) { - return _knot_mask16(x); + return _mm512_knot(x); } static opmask_t ge(zmm_t x, zmm_t y) { @@ -198,11 +198,11 @@ struct vector { } static type_t reducemax(zmm_t v) { - return _mm512_reduce_max_epu32(v); + return npyv_reduce_max_u32(v); } static type_t reducemin(zmm_t v) { - return _mm512_reduce_min_epu32(v); + return npyv_reduce_min_u32(v); } static zmm_t set1(type_t v) { @@ -250,7 +250,7 @@ struct vector { static opmask_t knot_opmask(opmask_t x) { - return _knot_mask16(x); + return _mm512_knot(x); } static opmask_t ge(zmm_t x, zmm_t y) { @@ -301,11 +301,11 @@ struct vector { } static type_t reducemax(zmm_t v) { - return _mm512_reduce_max_ps(v); + return npyv_reduce_max_f32(v); } static type_t reducemin(zmm_t v) { - return _mm512_reduce_min_ps(v); + return npyv_reduce_min_f32(v); } static zmm_t set1(type_t v) { diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp index 62a7fa54e..e6b7f8943 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp @@ -56,7 +56,7 @@ struct vector { static opmask_t knot_opmask(opmask_t x) { - return _knot_mask8(x); + return npyv_not_b64(x); } static opmask_t ge(zmm_t x, zmm_t y) { @@ -101,11 +101,11 @@ struct vector { } static type_t reducemax(zmm_t v) { - return _mm512_reduce_max_epi64(v); + return npyv_reduce_max_s64(v); } static type_t reducemin(zmm_t v) { - return _mm512_reduce_min_epi64(v); + return npyv_reduce_min_s64(v); } static zmm_t set1(type_t v) { @@ -163,7 +163,7 @@ struct vector { } static opmask_t knot_opmask(opmask_t x) { - return _knot_mask8(x); + return npyv_not_b64(x); } static opmask_t ge(zmm_t x, zmm_t y) { @@ -203,11 +203,11 @@ struct vector { } static type_t reducemax(zmm_t v) { - return _mm512_reduce_max_epu64(v); + return npyv_reduce_max_u64(v); } static type_t reducemin(zmm_t v) { - return _mm512_reduce_min_epu64(v); + return npyv_reduce_min_u64(v); } static zmm_t set1(type_t v) { @@ -260,7 +260,7 @@ struct vector { static opmask_t knot_opmask(opmask_t x) { - return _knot_mask8(x); + return npyv_not_b64(x); } static opmask_t ge(zmm_t x, zmm_t y) { @@ -305,11 +305,11 @@ struct vector { } static type_t reducemax(zmm_t v) { - return 
_mm512_reduce_max_pd(v); + return npyv_reduce_max_f64(v); } static type_t reducemin(zmm_t v) { - return _mm512_reduce_min_pd(v); + return npyv_reduce_min_f64(v); } static zmm_t set1(type_t v) { diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h b/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h index e713e1f20..56560185c 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h @@ -38,6 +38,7 @@ #include #include #include +#include "simd/simd.h" #define X86_SIMD_SORT_INFINITY std::numeric_limits::infinity() #define X86_SIMD_SORT_INFINITYF std::numeric_limits::infinity() -- cgit v1.2.1 From 92bd9902d4233d9f5befe05fd47bfb8b2d4e102a Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 10 Oct 2022 22:31:02 -0700 Subject: MAINT: Disable AVX-512 qsort on macOS and WIN32 --- numpy/core/setup.py | 18 +++++++++++++++++- numpy/core/src/npysort/quicksort.cpp | 8 ++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 912867709..fb91f8e68 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -68,6 +68,15 @@ class CallOnceOnly: out = copy.deepcopy(pickle.loads(self._check_complex)) return out +# Temporarily disable AVX512 sorting on WIN32 and macOS until we can figure +# out why the build fails +def enable_avx512_qsort(): + enable = True + platform = sysconfig.get_platform() + if "win32" in platform or "macos" in platform: + enable = False + return enable + def can_link_svml(): """SVML library is supported only on x86_64 architecture and currently only on linux @@ -484,6 +493,9 @@ def configuration(parent_package='',top_path=None): if can_link_svml(): moredefs.append(('NPY_CAN_LINK_SVML', 1)) + if enable_avx512_qsort(): + moredefs.append(('NPY_ENABLE_AVX512_QSORT', 1)) + # Use bogus stride debug aid to flush out bugs where users use # strides of dimensions with length 1 to index a full contiguous # array. 
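For context on how the NPY_ENABLE_AVX512_QSORT define added above is meant to combine with the existing CPU-dispatch machinery: the macro decides at build time whether the AVX-512 entry points are compiled in at all, and the dispatch layer still decides at run time whether the current CPU can actually use them. A small self-contained sketch of that two-level fallback, with purely illustrative names (none of these are NumPy symbols):

    #include <algorithm>
    #include <cstdio>

    // Filled in by a hypothetical CPU-dispatch layer only when the running
    // CPU supports AVX-512; stays null otherwise.
    static void (*dispatched_qsort)(int *, int) = nullptr;

    static bool try_avx512_qsort(int *p, int n)
    {
    #ifdef NPY_ENABLE_AVX512_QSORT          // level 1: compiled in by the build?
        if (dispatched_qsort != nullptr) {  // level 2: supported at run time?
            dispatched_qsort(p, n);
            return true;
        }
    #endif
        (void)p;
        (void)n;
        return false;
    }

    int main()
    {
        int a[] = {3, 1, 2};
        if (!try_avx512_qsort(a, 3))
            std::sort(a, a + 3);            // portable fallback
        std::printf("%d %d %d\n", a[0], a[1], a[2]);
        return 0;
    }

The quicksort.cpp hunks later in this commit follow the same shape: the #ifdef guards the x86-qsort-skx includes, and NPY_CPU_DISPATCH_CALL_XB either supplies dispfunc or leaves it null.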
@@ -943,7 +955,6 @@ def configuration(parent_package='',top_path=None): join('src', 'multiarray', 'usertypes.c'), join('src', 'multiarray', 'vdot.c'), join('src', 'common', 'npy_sort.h.src'), - join('src', 'npysort', 'x86-qsort-skx.dispatch.cpp'), join('src', 'npysort', 'quicksort.cpp'), join('src', 'npysort', 'mergesort.cpp'), join('src', 'npysort', 'timsort.cpp'), @@ -967,6 +978,11 @@ def configuration(parent_package='',top_path=None): join('src', 'npymath', 'arm64_exports.c'), ] + if enable_avx512_qsort(): + multiarray_src += [ + join('src', 'npysort', 'x86-qsort-skx.dispatch.cpp'), + ] + ####################################################################### # _multiarray_umath module - umath part # ####################################################################### diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp index 0674d25ac..363daf46f 100644 --- a/numpy/core/src/npysort/quicksort.cpp +++ b/numpy/core/src/npysort/quicksort.cpp @@ -55,13 +55,15 @@ #include "npysort_heapsort.h" #include "numpy_tag.h" -#include "x86-qsort-skx.h" #include #include +#ifdef NPY_ENABLE_AVX512_QSORT +#include "x86-qsort-skx.h" #ifndef NPY_DISABLE_OPTIMIZATION #include "x86-qsort-skx.dispatch.h" -#endif +#endif // NPY_DISABLE_OPTIMIZATION +#endif // NPY_ENABLE_AVX512_QSORT #define NOT_USED NPY_UNUSED(unused) /* @@ -86,6 +88,7 @@ struct x86_dispatch { static bool quicksort(typename Tag::type *, npy_intp) { return false; } }; +#ifdef NPY_ENABLE_AVX512_QSORT #if NPY_SIZEOF_LONG == 8 template <> struct x86_dispatch { @@ -197,6 +200,7 @@ struct x86_dispatch { return false; } }; +#endif // NPY_ENABLE_AVX512_QSORT } // namespace -- cgit v1.2.1 From 37c52d4757e71e4ce33483181302807d5f72340a Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 11 Oct 2022 10:10:51 -0700 Subject: BUG: Do not use a global static const __m512 variable --- numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp index e6b7f8943..d882d78d9 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp @@ -19,7 +19,6 @@ #define NETWORK_64BIT_2 0, 1, 2, 3, 4, 5, 6, 7 #define NETWORK_64BIT_3 5, 4, 7, 6, 1, 0, 3, 2 #define NETWORK_64BIT_4 3, 2, 1, 0, 7, 6, 5, 4 -static const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); template <> struct vector { @@ -333,6 +332,7 @@ struct vector { template NPY_FINLINE zmm_t sort_zmm_64bit(zmm_t zmm) { + const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); zmm = cmp_merge( zmm, vtype::template shuffle(zmm), 0xAA); zmm = cmp_merge( @@ -376,6 +376,7 @@ NPY_FINLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm) template NPY_FINLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2) { + const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); // 1) First step of a merging network: coex of zmm1 and zmm2 reversed zmm2 = vtype::permutexvar(rev_index, zmm2); zmm_t zmm3 = vtype::min(zmm1, zmm2); @@ -390,6 +391,7 @@ NPY_FINLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2) template NPY_FINLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm) { + const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); // 1) First step of a merging network zmm_t zmm2r = vtype::permutexvar(rev_index, zmm[2]); zmm_t zmm3r = vtype::permutexvar(rev_index, 
zmm[3]); @@ -411,6 +413,7 @@ NPY_FINLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm) template NPY_FINLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm) { + const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); zmm_t zmm4r = vtype::permutexvar(rev_index, zmm[4]); zmm_t zmm5r = vtype::permutexvar(rev_index, zmm[5]); zmm_t zmm6r = vtype::permutexvar(rev_index, zmm[6]); @@ -444,6 +447,7 @@ NPY_FINLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm) template NPY_FINLINE void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm) { + const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); zmm_t zmm8r = vtype::permutexvar(rev_index, zmm[8]); zmm_t zmm9r = vtype::permutexvar(rev_index, zmm[9]); zmm_t zmm10r = vtype::permutexvar(rev_index, zmm[10]); -- cgit v1.2.1 From 0d3feb0a829ea53d525487ea351055442b467c2c Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 11 Oct 2022 10:34:15 -0700 Subject: ENH: Add AVX-512 based 16-bit dtype sort This reverts commit 138ba7583253e7540a206e7f0df3edcd5e26c518. --- numpy/core/setup.py | 3 +- numpy/core/src/npysort/quicksort.cpp | 51 +++++++++++++++++++---- numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp | 29 +++++++++++++ numpy/core/src/npysort/x86-qsort-icl.h | 24 +++++++++++ 4 files changed, 97 insertions(+), 10 deletions(-) create mode 100644 numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp create mode 100644 numpy/core/src/npysort/x86-qsort-icl.h diff --git a/numpy/core/setup.py b/numpy/core/setup.py index fb91f8e68..c5d8564f9 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -73,7 +73,7 @@ class CallOnceOnly: def enable_avx512_qsort(): enable = True platform = sysconfig.get_platform() - if "win32" in platform or "macos" in platform: + if "win32" in platform: enable = False return enable @@ -981,6 +981,7 @@ def configuration(parent_package='',top_path=None): if enable_avx512_qsort(): multiarray_src += [ join('src', 'npysort', 'x86-qsort-skx.dispatch.cpp'), + join('src', 'npysort', 'x86-qsort-icl.dispatch.cpp'), ] ####################################################################### diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp index 363daf46f..6c90bf0bb 100644 --- a/numpy/core/src/npysort/quicksort.cpp +++ b/numpy/core/src/npysort/quicksort.cpp @@ -58,13 +58,6 @@ #include #include -#ifdef NPY_ENABLE_AVX512_QSORT -#include "x86-qsort-skx.h" -#ifndef NPY_DISABLE_OPTIMIZATION -#include "x86-qsort-skx.dispatch.h" -#endif // NPY_DISABLE_OPTIMIZATION -#endif // NPY_ENABLE_AVX512_QSORT - #define NOT_USED NPY_UNUSED(unused) /* * pushing largest partition has upper bound of log2(n) space @@ -88,7 +81,15 @@ struct x86_dispatch { static bool quicksort(typename Tag::type *, npy_intp) { return false; } }; +// Currently disabled on WIN32 only #ifdef NPY_ENABLE_AVX512_QSORT +#include "x86-qsort-skx.h" +#include "x86-qsort-icl.h" + +#ifndef NPY_DISABLE_OPTIMIZATION +#include "x86-qsort-skx.dispatch.h" +#endif + #if NPY_SIZEOF_LONG == 8 template <> struct x86_dispatch { @@ -143,7 +144,7 @@ struct x86_dispatch { return false; } }; -#endif +#endif // NPY_SIZEOF_LONG template <> struct x86_dispatch { @@ -200,9 +201,41 @@ struct x86_dispatch { return false; } }; + +#ifndef NPY_DISABLE_OPTIMIZATION +#include "x86-qsort-icl.dispatch.h" +#endif + +template <> +struct x86_dispatch { + static bool quicksort(npy_short *start, npy_intp num) + { + void (*dispfunc)(void *, npy_intp) = nullptr; + NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_short); + if (dispfunc) { + (*dispfunc)(start, num); + return true; + } 
+ return false; + } +}; + +template <> +struct x86_dispatch { + static bool quicksort(npy_ushort *start, npy_intp num) + { + void (*dispfunc)(void *, npy_intp) = nullptr; + NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_ushort); + if (dispfunc) { + (*dispfunc)(start, num); + return true; + } + return false; + } +}; #endif // NPY_ENABLE_AVX512_QSORT -} // namespace +} // end namespace template static int diff --git a/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp b/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp new file mode 100644 index 000000000..7d6dc331b --- /dev/null +++ b/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp @@ -0,0 +1,29 @@ +/*@targets + * $maxopt $keep_baseline avx512_icl + */ +// policy $keep_baseline is used to avoid skip building avx512_skx +// when its part of baseline features (--cpu-baseline), since +// 'baseline' option isn't specified within targets. + +#include "x86-qsort-icl.h" +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#ifdef NPY_HAVE_AVX512_ICL +#include "avx512-16bit-qsort.hpp" + +/*************************************** + * C > C++ dispatch + ***************************************/ +NPY_NO_EXPORT void +NPY_CPU_DISPATCH_CURFX(x86_quicksort_short)(void *arr, npy_intp arrsize) +{ + avx512_qsort((npy_short*)arr, arrsize); +} + +NPY_NO_EXPORT void +NPY_CPU_DISPATCH_CURFX(x86_quicksort_ushort)(void *arr, npy_intp arrsize) +{ + avx512_qsort((npy_ushort*)arr, arrsize); +} + +#endif // NPY_HAVE_AVX512_ICL diff --git a/numpy/core/src/npysort/x86-qsort-icl.h b/numpy/core/src/npysort/x86-qsort-icl.h new file mode 100644 index 000000000..2093e0bce --- /dev/null +++ b/numpy/core/src/npysort/x86-qsort-icl.h @@ -0,0 +1,24 @@ +#include "numpy/npy_common.h" + +#include "npy_cpu_dispatch.h" + +#ifndef NPY_NO_EXPORT +#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN +#endif + +#ifndef NPY_DISABLE_OPTIMIZATION +#include "x86-qsort-icl.dispatch.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_short, + (void *start, npy_intp num)) + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_ushort, + (void *start, npy_intp num)) + +#ifdef __cplusplus +} +#endif -- cgit v1.2.1 From c71352232164ab7ddc4142ebc1db694493b34ff9 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 11 Oct 2022 14:38:30 -0700 Subject: MAINT: Fix comment --- numpy/core/setup.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/numpy/core/setup.py b/numpy/core/setup.py index c5d8564f9..3ab00205f 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -68,12 +68,11 @@ class CallOnceOnly: out = copy.deepcopy(pickle.loads(self._check_complex)) return out -# Temporarily disable AVX512 sorting on WIN32 and macOS until we can figure -# out why the build fails +# Temporarily disable AVX512 sorting on WIN32 until we can figure +# out why it has test failures def enable_avx512_qsort(): enable = True - platform = sysconfig.get_platform() - if "win32" in platform: + if "win32" in sysconfig.get_platform(): enable = False return enable -- cgit v1.2.1 From e91610af8ed4b9ba200086c7edea2f9a1a4ca280 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 12 Oct 2022 14:02:08 -0700 Subject: MAINT: Use loadu intrinsic instead of set1_epi16 gcc-8 is missing the _mm512_set1_epi16 intrinsic --- .../x86-simd-sort/src/avx512-16bit-qsort.hpp | 170 +++++++++------------ 1 file changed, 74 insertions(+), 96 deletions(-) diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp 
b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp index 51cb4dbb0..5fcb8902d 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp @@ -15,24 +15,20 @@ * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) */ // ZMM register: 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -#define NETWORK_16BIT_1 \ - 24, 25, 26, 27, 28, 29, 30, 31, 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, \ - 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 -#define NETWORK_16BIT_2 \ - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, \ - 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -#define NETWORK_16BIT_3 \ - 27, 26, 25, 24, 31, 30, 29, 28, 19, 18, 17, 16, 23, 22, 21, 20, 11, 10, 9, \ - 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4 -#define NETWORK_16BIT_4 \ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, \ - 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 -#define NETWORK_16BIT_5 \ - 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24, 7, 6, 5, \ - 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 -#define NETWORK_16BIT_6 \ - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, \ - 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 +static const uint16_t network[6][32] + = {{7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, + 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24}, + {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16}, + {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11, + 20, 21, 22, 23, 16, 17, 18, 19, 28, 29, 30, 31, 24, 25, 26, 27}, + {31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, + {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, + 24, 25, 26, 27, 28, 29, 30, 31, 16, 17, 18, 19, 20, 21, 22, 23}, + {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}; + template <> struct vector { @@ -42,6 +38,10 @@ struct vector { using opmask_t = __mmask32; static const uint8_t numlanes = 32; + static zmm_t get_network(int index) + { + return _mm512_loadu_si512(&network[index-1][0]); + } static type_t type_max() { return X86_SIMD_SORT_MAX_INT16; @@ -54,20 +54,15 @@ struct vector { { return _mm512_set1_epi16(type_max()); } - static opmask_t knot_opmask(opmask_t x) { return npyv_not_b16(x); } + static opmask_t ge(zmm_t x, zmm_t y) { return _mm512_cmp_epi16_mask(x, y, _MM_CMPINT_NLT); } - //template - //static zmm_t i64gather(__m512i index, void const *base) - //{ - // return _mm512_i64gather_epi64(index, base, scale); - //} static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); @@ -141,6 +136,10 @@ struct vector { using opmask_t = __mmask32; static const uint8_t numlanes = 32; + static zmm_t get_network(int index) + { + return _mm512_loadu_si512(&network[index-1][0]); + } static type_t type_max() { return X86_SIMD_SORT_MAX_UINT16; @@ -152,13 +151,8 @@ struct vector { static zmm_t zmm_max() { return _mm512_set1_epi16(type_max()); - } // TODO: this should broadcast bits as is? 
+ } - //template - //static zmm_t i64gather(__m512i index, void const *base) - //{ - // return _mm512_i64gather_epi64(index, base, scale); - //} static opmask_t knot_opmask(opmask_t x) { return npyv_not_b16(x); @@ -254,9 +248,7 @@ NPY_FINLINE zmm_t sort_zmm_16bit(zmm_t zmm) 0xAAAAAAAA); // Level 3 zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_1), zmm), - 0xF0F0F0F0); + zmm, vtype::permutexvar(vtype::get_network(1), zmm), 0xF0F0F0F0); zmm = cmp_merge( zmm, vtype::template shuffle(zmm), @@ -267,13 +259,9 @@ NPY_FINLINE zmm_t sort_zmm_16bit(zmm_t zmm) 0xAAAAAAAA); // Level 4 zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_2), zmm), - 0xFF00FF00); + zmm, vtype::permutexvar(vtype::get_network(2), zmm), 0xFF00FF00); zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_3), zmm), - 0xF0F0F0F0); + zmm, vtype::permutexvar(vtype::get_network(3), zmm), 0xF0F0F0F0); zmm = cmp_merge( zmm, vtype::template shuffle(zmm), @@ -284,17 +272,11 @@ NPY_FINLINE zmm_t sort_zmm_16bit(zmm_t zmm) 0xAAAAAAAA); // Level 5 zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), zmm), - 0xFFFF0000); + zmm, vtype::permutexvar(vtype::get_network(4), zmm), 0xFFFF0000); zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_5), zmm), - 0xFF00FF00); + zmm, vtype::permutexvar(vtype::get_network(5), zmm), 0xFF00FF00); zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_3), zmm), - 0xF0F0F0F0); + zmm, vtype::permutexvar(vtype::get_network(3), zmm), 0xF0F0F0F0); zmm = cmp_merge( zmm, vtype::template shuffle(zmm), @@ -312,19 +294,13 @@ NPY_FINLINE zmm_t bitonic_merge_zmm_16bit(zmm_t zmm) { // 1) half_cleaner[32]: compare 1-17, 2-18, 3-19 etc .. zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_6), zmm), - 0xFFFF0000); + zmm, vtype::permutexvar(vtype::get_network(6), zmm), 0xFFFF0000); // 2) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc .. 
zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_5), zmm), - 0xFF00FF00); + zmm, vtype::permutexvar(vtype::get_network(5), zmm), 0xFF00FF00); // 3) half_cleaner[8] zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_3), zmm), - 0xF0F0F0F0); + zmm, vtype::permutexvar(vtype::get_network(3), zmm), 0xF0F0F0F0); // 3) half_cleaner[4] zmm = cmp_merge( zmm, @@ -343,7 +319,7 @@ template NPY_FINLINE void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2) { // 1) First step of a merging network: coex of zmm1 and zmm2 reversed - zmm2 = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), zmm2); + zmm2 = vtype::permutexvar(vtype::get_network(4), zmm2); zmm_t zmm3 = vtype::min(zmm1, zmm2); zmm_t zmm4 = vtype::max(zmm1, zmm2); // 2) Recursive half cleaner for each @@ -356,13 +332,13 @@ NPY_FINLINE void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2) template NPY_FINLINE void bitonic_merge_four_zmm_16bit(zmm_t *zmm) { - zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), zmm[2]); - zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), zmm[3]); + zmm_t zmm2r = vtype::permutexvar(vtype::get_network(4), zmm[2]); + zmm_t zmm3r = vtype::permutexvar(vtype::get_network(4), zmm[3]); zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r); zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r); - zmm_t zmm_t3 = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), + zmm_t zmm_t3 = vtype::permutexvar(vtype::get_network(4), vtype::max(zmm[1], zmm2r)); - zmm_t zmm_t4 = vtype::permutexvar(_mm512_set_epi16(NETWORK_16BIT_4), + zmm_t zmm_t4 = vtype::permutexvar(vtype::get_network(4), vtype::max(zmm[0], zmm3r)); zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2); zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2); @@ -436,43 +412,45 @@ NPY_FINLINE void sort_128_16bit(type_t *arr, int32_t N) } template -NPY_FINLINE type_t -get_pivot_16bit(type_t *arr, const int64_t left, const int64_t right) +NPY_FINLINE type_t get_pivot_16bit(type_t *arr, + const int64_t left, + const int64_t right) { // median of 32 int64_t size = (right - left) / 32; - __m512i rand_vec = _mm512_set_epi16(arr[left], - arr[left + size], - arr[left + 2 * size], - arr[left + 3 * size], - arr[left + 4 * size], - arr[left + 5 * size], - arr[left + 6 * size], - arr[left + 7 * size], - arr[left + 8 * size], - arr[left + 9 * size], - arr[left + 10 * size], - arr[left + 11 * size], - arr[left + 12 * size], - arr[left + 13 * size], - arr[left + 14 * size], - arr[left + 15 * size], - arr[left + 16 * size], - arr[left + 17 * size], - arr[left + 18 * size], - arr[left + 19 * size], - arr[left + 20 * size], - arr[left + 21 * size], - arr[left + 22 * size], - arr[left + 23 * size], - arr[left + 24 * size], - arr[left + 25 * size], - arr[left + 26 * size], - arr[left + 27 * size], - arr[left + 28 * size], - arr[left + 29 * size], - arr[left + 30 * size], - arr[left + 31 * size]); + type_t vec_arr[32] = {arr[left], + arr[left + size], + arr[left + 2 * size], + arr[left + 3 * size], + arr[left + 4 * size], + arr[left + 5 * size], + arr[left + 6 * size], + arr[left + 7 * size], + arr[left + 8 * size], + arr[left + 9 * size], + arr[left + 10 * size], + arr[left + 11 * size], + arr[left + 12 * size], + arr[left + 13 * size], + arr[left + 14 * size], + arr[left + 15 * size], + arr[left + 16 * size], + arr[left + 17 * size], + arr[left + 18 * size], + arr[left + 19 * size], + arr[left + 20 * size], + arr[left + 21 * size], + arr[left + 22 * size], + arr[left + 23 * size], + arr[left + 24 * size], + arr[left + 25 * size], + arr[left + 26 
* size], + arr[left + 27 * size], + arr[left + 28 * size], + arr[left + 29 * size], + arr[left + 30 * size], + arr[left + 31 * size]}; + __m512i rand_vec = _mm512_loadu_si512(vec_arr); __m512i sort = sort_zmm_16bit(rand_vec); return ((type_t *)&sort)[16]; } -- cgit v1.2.1 From 73aa5ea217818b93631cdf61ae0530b75e27303e Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Thu, 13 Oct 2022 22:48:08 -0700 Subject: TST: Add quicksort test coverage for all 16, 32, 64 bit dtypes --- numpy/core/tests/test_multiarray.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index 2d6f9c38c..0dc697bb0 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -9858,39 +9858,39 @@ class TestViewDtype: # Test various array sizes that hit different code paths in quicksort-avx512 -@pytest.mark.parametrize("N", [8, 16, 24, 32, 48, 64, 96, 128, 151, 191, - 256, 383, 512, 1023, 2047]) -def test_sort_float(N): +@pytest.mark.parametrize("N", np.arange(1,512)) +@pytest.mark.parametrize("dtype", ['e', 'f', 'd']) +def test_sort_float(N, dtype): # Regular data with nan sprinkled np.random.seed(42) - arr = -0.5 + np.random.sample(N).astype('f') + arr = -0.5 + np.random.sample(N).astype(dtype) arr[np.random.choice(arr.shape[0], 3)] = np.nan assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap')) # (2) with +INF - infarr = np.inf*np.ones(N, dtype='f') + infarr = np.inf*np.ones(N, dtype=dtype) infarr[np.random.choice(infarr.shape[0], 5)] = -1.0 assert_equal(np.sort(infarr, kind='quick'), np.sort(infarr, kind='heap')) # (3) with -INF - neginfarr = -np.inf*np.ones(N, dtype='f') + neginfarr = -np.inf*np.ones(N, dtype=dtype) neginfarr[np.random.choice(neginfarr.shape[0], 5)] = 1.0 assert_equal(np.sort(neginfarr, kind='quick'), np.sort(neginfarr, kind='heap')) # (4) with +/-INF - infarr = np.inf*np.ones(N, dtype='f') + infarr = np.inf*np.ones(N, dtype=dtype) infarr[np.random.choice(infarr.shape[0], (int)(N/2))] = -np.inf assert_equal(np.sort(infarr, kind='quick'), np.sort(infarr, kind='heap')) -def test_sort_int(): - # Random data with NPY_MAX_INT32 and NPY_MIN_INT32 sprinkled - rng = np.random.default_rng(42) - N = 2047 - minv = np.iinfo(np.int32).min - maxv = np.iinfo(np.int32).max - arr = rng.integers(low=minv, high=maxv, size=N).astype('int32') +@pytest.mark.parametrize("N", np.arange(1,512)) +@pytest.mark.parametrize("dtype", ['h', 'H', 'i', 'I', 'l', 'L']) +def test_sort_int(N, dtype): + # Random data with MAX and MIN sprinkled + minv = np.iinfo(dtype).min + maxv = np.iinfo(dtype).max + arr = np.random.randint(low=minv, high=maxv-1, size=N, dtype=dtype) arr[np.random.choice(arr.shape[0], 10)] = minv arr[np.random.choice(arr.shape[0], 10)] = maxv assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap')) -- cgit v1.2.1 From e9b39401f51351fc05712c207a78fecaac6c02fa Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Thu, 13 Oct 2022 22:51:27 -0700 Subject: MAINT: Fix linter errors --- numpy/core/tests/test_multiarray.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index 0dc697bb0..31c57f9bc 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -9858,7 +9858,7 @@ class TestViewDtype: # Test various array sizes that hit different code paths in quicksort-avx512 -@pytest.mark.parametrize("N", 
np.arange(1,512)) +@pytest.mark.parametrize("N", np.arange(1, 512)) @pytest.mark.parametrize("dtype", ['e', 'f', 'd']) def test_sort_float(N, dtype): # Regular data with nan sprinkled @@ -9884,7 +9884,7 @@ def test_sort_float(N, dtype): assert_equal(np.sort(infarr, kind='quick'), np.sort(infarr, kind='heap')) -@pytest.mark.parametrize("N", np.arange(1,512)) +@pytest.mark.parametrize("N", np.arange(1, 512)) @pytest.mark.parametrize("dtype", ['h', 'H', 'i', 'I', 'l', 'L']) def test_sort_int(N, dtype): # Random data with MAX and MIN sprinkled -- cgit v1.2.1 From df915b889125948cb2461c3bacf892b6143515f0 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 31 Oct 2022 11:05:36 -0700 Subject: ENH: Use AVX-512 qsort for half precision float --- numpy/core/src/npysort/quicksort.cpp | 15 ++ numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp | 6 + numpy/core/src/npysort/x86-qsort-icl.h | 3 + .../x86-simd-sort/src/avx512-16bit-qsort.hpp | 211 ++++++++++++++++++++- .../x86-simd-sort/src/avx512-32bit-qsort.hpp | 5 +- .../x86-simd-sort/src/avx512-64bit-qsort.hpp | 5 +- .../x86-simd-sort/src/avx512-common-qsort.h | 19 +- 7 files changed, 253 insertions(+), 11 deletions(-) diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp index 6c90bf0bb..f2cada873 100644 --- a/numpy/core/src/npysort/quicksort.cpp +++ b/numpy/core/src/npysort/quicksort.cpp @@ -206,6 +206,21 @@ struct x86_dispatch { #include "x86-qsort-icl.dispatch.h" #endif +template <> +struct x86_dispatch { + static bool quicksort(npy_half *start, npy_intp num) + { + void (*dispfunc)(void *, npy_intp) = nullptr; + NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_half); + if (dispfunc) { + (*dispfunc)(start, num); + return true; + } + return false; + } +}; + + template <> struct x86_dispatch { static bool quicksort(npy_short *start, npy_intp num) diff --git a/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp b/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp index 7d6dc331b..3dce8a9b4 100644 --- a/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp +++ b/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp @@ -14,6 +14,12 @@ /*************************************** * C > C++ dispatch ***************************************/ +NPY_NO_EXPORT void +NPY_CPU_DISPATCH_CURFX(x86_quicksort_half)(void *arr, npy_intp arrsize) +{ + avx512_qsort_fp16((npy_half*)arr, arrsize); +} + NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(x86_quicksort_short)(void *arr, npy_intp arrsize) { diff --git a/numpy/core/src/npysort/x86-qsort-icl.h b/numpy/core/src/npysort/x86-qsort-icl.h index 2093e0bce..92cef9cbc 100644 --- a/numpy/core/src/npysort/x86-qsort-icl.h +++ b/numpy/core/src/npysort/x86-qsort-icl.h @@ -13,6 +13,9 @@ #ifdef __cplusplus extern "C" { #endif +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_half, + (void *start, npy_intp num)) + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_short, (void *start, npy_intp num)) diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp index 5fcb8902d..190188ecc 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp @@ -29,6 +29,142 @@ static const uint16_t network[6][32] {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}; +struct float16 { + uint16_t val; +}; + +template <> +struct vector { + using type_t = uint16_t; + using 
zmm_t = __m512i; + using ymm_t = __m256i; + using opmask_t = __mmask32; + static const uint8_t numlanes = 32; + + static zmm_t get_network(int index) + { + return _mm512_loadu_si512(&network[index - 1][0]); + } + static type_t type_max() + { + return X86_SIMD_SORT_INFINITYH; + } + static type_t type_min() + { + return X86_SIMD_SORT_NEGINFINITYH; + } + static zmm_t zmm_max() + { + return _mm512_set1_epi16(type_max()); + } + static opmask_t knot_opmask(opmask_t x) + { + return _knot_mask32(x); + } + + static opmask_t ge(zmm_t x, zmm_t y) + { + zmm_t sign_x = _mm512_and_si512(x, _mm512_set1_epi16(0x8000)); + zmm_t sign_y = _mm512_and_si512(y, _mm512_set1_epi16(0x8000)); + zmm_t exp_x = _mm512_and_si512(x, _mm512_set1_epi16(0x7c00)); + zmm_t exp_y = _mm512_and_si512(y, _mm512_set1_epi16(0x7c00)); + zmm_t mant_x = _mm512_and_si512(x, _mm512_set1_epi16(0x3ff)); + zmm_t mant_y = _mm512_and_si512(y, _mm512_set1_epi16(0x3ff)); + + __mmask32 mask_ge = _mm512_cmp_epu16_mask(sign_x, sign_y, _MM_CMPINT_LT); // only greater than + __mmask32 sign_eq = _mm512_cmpeq_epu16_mask(sign_x, sign_y); + __mmask32 neg = _mm512_mask_cmpeq_epu16_mask(sign_eq, sign_x, _mm512_set1_epi16(0x8000)); // both numbers are -ve + + // compare exponents only if signs are equal: + mask_ge = mask_ge | _mm512_mask_cmp_epu16_mask(sign_eq, exp_x, exp_y, _MM_CMPINT_NLE); + // get mask for elements for which both sign and exponents are equal: + __mmask32 exp_eq = _mm512_mask_cmpeq_epu16_mask(sign_eq, exp_x, exp_y); + + // compare mantissa for elements for which both sign and expponent are equal: + mask_ge = mask_ge | _mm512_mask_cmp_epu16_mask(exp_eq, mant_x, mant_y, _MM_CMPINT_NLT); + return _kxor_mask32(mask_ge, neg); + } + static zmm_t loadu(void const *mem) + { + return _mm512_loadu_si512(mem); + } + static zmm_t max(zmm_t x, zmm_t y) + { + return _mm512_mask_mov_epi16(y, ge(x, y), x); + } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) + { + // AVX512_VBMI2 + return _mm512_mask_compressstoreu_epi16(mem, mask, x); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) + { + // AVX512BW + return _mm512_mask_loadu_epi16(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) + { + return _mm512_mask_mov_epi16(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) + { + return _mm512_mask_storeu_epi16(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) + { + return _mm512_mask_mov_epi16(x, ge(x, y), y); + } + static zmm_t permutexvar(__m512i idx, zmm_t zmm) + { + return _mm512_permutexvar_epi16(idx, zmm); + } + // Apparently this is a terrible for perf, npy_half_to_float seems to work + // better + //static float uint16_to_float(uint16_t val) + //{ + // // Ideally use _mm_loadu_si16, but its only gcc > 11.x + // // TODO: use inline ASM? 
https://godbolt.org/z/aGYvh7fMM + // __m128i xmm = _mm_maskz_loadu_epi16(0x01, &val); + // __m128 xmm2 = _mm_cvtph_ps(xmm); + // return _mm_cvtss_f32(xmm2); + //} + static type_t float_to_uint16(float val) + { + __m128 xmm = _mm_load_ss(&val); + __m128i xmm2 = _mm_cvtps_ph(xmm, _MM_FROUND_NO_EXC); + return _mm_extract_epi16(xmm2, 0); + } + static type_t reducemax(zmm_t v) + { + __m512 lo = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 0)); + __m512 hi = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 1)); + float lo_max = _mm512_reduce_max_ps(lo); + float hi_max = _mm512_reduce_max_ps(hi); + return float_to_uint16(std::max(lo_max, hi_max)); + } + static type_t reducemin(zmm_t v) + { + __m512 lo = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 0)); + __m512 hi = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 1)); + float lo_max = _mm512_reduce_min_ps(lo); + float hi_max = _mm512_reduce_min_ps(hi); + return float_to_uint16(std::min(lo_max, hi_max)); + } + static zmm_t set1(type_t v) + { + return _mm512_set1_epi16(v); + } + template + static zmm_t shuffle(zmm_t zmm) + { + zmm = _mm512_shufflehi_epi16(zmm, (_MM_PERM_ENUM)mask); + return _mm512_shufflelo_epi16(zmm, (_MM_PERM_ENUM)mask); + } + static void storeu(void *mem, zmm_t x) + { + return _mm512_storeu_si512(mem, x); + } +}; template <> struct vector { @@ -40,7 +176,7 @@ struct vector { static zmm_t get_network(int index) { - return _mm512_loadu_si512(&network[index-1][0]); + return _mm512_loadu_si512(&network[index - 1][0]); } static type_t type_max() { @@ -138,7 +274,7 @@ struct vector { static zmm_t get_network(int index) { - return _mm512_loadu_si512(&network[index-1][0]); + return _mm512_loadu_si512(&network[index - 1][0]); } static type_t type_max() { @@ -455,6 +591,38 @@ NPY_FINLINE type_t get_pivot_16bit(type_t *arr, return ((type_t *)&sort)[16]; } +template <> +bool comparison_func>(const uint16_t &a, const uint16_t &b) +{ + uint16_t signa = a & 0x8000, signb = b & 0x8000; + uint16_t expa = a & 0x7c00, expb = b & 0x7c00; + uint16_t manta = a & 0x3ff, mantb = b & 0x3ff; + if (signa != signb) { + // opposite signs + return a > b; + } + else if (signa > 0) { + // both -ve + if (expa != expb) { + return expa > expb; + } + else { + return manta > mantb; + } + } + else { + // both +ve + if (expa != expb) { + return expa < expb; + } + else { + return manta < mantb; + } + } + + //return npy_half_to_float(a) < npy_half_to_float(b); +} + template static void qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) @@ -463,7 +631,7 @@ qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) * Resort to std::sort if quicksort isnt making any progress */ if (max_iters <= 0) { - std::sort(arr + left, arr + right + 1); + std::sort(arr + left, arr + right + 1, comparison_func); return; } /* @@ -485,6 +653,33 @@ qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) qsort_16bit_(arr, pivot_index, right, max_iters - 1); } +NPY_FINLINE int64_t replace_nan_with_inf(uint16_t *arr, int64_t arrsize) +{ + int64_t nan_count = 0; + __mmask16 loadmask = 0xFFFF; + while (arrsize > 0) { + if (arrsize < 16) { loadmask = (0x0001 << arrsize) - 0x0001; } + __m256i in_zmm = _mm256_maskz_loadu_epi16(loadmask, arr); + __m512 in_zmm_asfloat = _mm512_cvtph_ps(in_zmm); + __mmask16 nanmask = _mm512_cmp_ps_mask( + in_zmm_asfloat, in_zmm_asfloat, _CMP_NEQ_UQ); + nan_count += _mm_popcnt_u32((int32_t)nanmask); + _mm256_mask_storeu_epi16(arr, nanmask, YMM_MAX_HALF); + arr += 16; + arrsize -= 16; + } + return nan_count; +} + 
+NPY_FINLINE void +replace_inf_with_nan(uint16_t *arr, int64_t arrsize, int64_t nan_count) +{ + for (int64_t ii = arrsize - 1; nan_count > 0; --ii) { + arr[ii] = 0xFFFF; + nan_count -= 1; + } +} + template <> void avx512_qsort(int16_t *arr, int64_t arrsize) { @@ -502,4 +697,14 @@ void avx512_qsort(uint16_t *arr, int64_t arrsize) arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); } } + +void avx512_qsort_fp16(uint16_t *arr, int64_t arrsize) +{ + if (arrsize > 1) { + int64_t nan_count = replace_nan_with_inf(arr, arrsize); + qsort_16bit_, uint16_t>( + arr, 0, arrsize - 1, 2 * (63 - __builtin_clzll(arrsize))); + replace_inf_with_nan(arr, arrsize, nan_count); + } +} #endif // __AVX512_QSORT_16BIT__ diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp index ac5bece7a..877849d6c 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp @@ -592,8 +592,9 @@ NPY_FINLINE void sort_128_32bit(type_t *arr, int32_t N) } template -NPY_FINLINE type_t -get_pivot_32bit(type_t *arr, const int64_t left, const int64_t right) +NPY_FINLINE type_t get_pivot_32bit(type_t *arr, + const int64_t left, + const int64_t right) { // median of 16 int64_t size = (right - left) / 16; diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp index d882d78d9..b067f5eda 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp @@ -718,8 +718,9 @@ NPY_FINLINE void sort_128_64bit(type_t *arr, int32_t N) } template -NPY_FINLINE type_t -get_pivot_64bit(type_t *arr, const int64_t left, const int64_t right) +NPY_FINLINE type_t get_pivot_64bit(type_t *arr, + const int64_t left, + const int64_t right) { // median of 8 int64_t size = (right - left) / 8; diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h b/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h index 56560185c..639d2f788 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h @@ -33,15 +33,17 @@ * */ +#include "simd/simd.h" #include #include #include #include #include -#include "simd/simd.h" #define X86_SIMD_SORT_INFINITY std::numeric_limits::infinity() #define X86_SIMD_SORT_INFINITYF std::numeric_limits::infinity() +#define X86_SIMD_SORT_INFINITYH 0x7c00 +#define X86_SIMD_SORT_NEGINFINITYH 0xfc00 #define X86_SIMD_SORT_MAX_UINT16 std::numeric_limits::max() #define X86_SIMD_SORT_MAX_INT16 std::numeric_limits::max() #define X86_SIMD_SORT_MIN_INT16 std::numeric_limits::min() @@ -57,6 +59,7 @@ #define ZMM_MAX_FLOAT _mm512_set1_ps(X86_SIMD_SORT_INFINITYF) #define ZMM_MAX_UINT _mm512_set1_epi32(X86_SIMD_SORT_MAX_UINT32) #define ZMM_MAX_INT _mm512_set1_epi32(X86_SIMD_SORT_MAX_INT32) +#define YMM_MAX_HALF _mm256_set1_epi16(X86_SIMD_SORT_INFINITYH) #define ZMM_MAX_UINT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_UINT16) #define ZMM_MAX_INT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_INT16) #define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d @@ -67,6 +70,12 @@ struct vector; template void avx512_qsort(T *arr, int64_t arrsize); +template +bool comparison_func(const T &a, const T &b) +{ + return a < b; +} + /* * COEX == Compare and Exchange two registers by swapping min and max values */ @@ -127,9 +136,11 @@ static 
inline int64_t partition_avx512(type_t *arr, { /* make array length divisible by vtype::numlanes , shortening the array */ for (int32_t i = (right - left) % vtype::numlanes; i > 0; --i) { - *smallest = std::min(*smallest, arr[left]); - *biggest = std::max(*biggest, arr[left]); - if (arr[left] > pivot) { std::swap(arr[left], arr[--right]); } + *smallest = std::min(*smallest, arr[left], comparison_func); + *biggest = std::max(*biggest, arr[left], comparison_func); + if (!comparison_func(arr[left], pivot)) { + std::swap(arr[left], arr[--right]); + } else { ++left; } -- cgit v1.2.1 From 361a1a649b298e44e4233f2fec8276674248956d Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 1 Nov 2022 22:12:45 -0700 Subject: BENCH: Add float16 to sort benchmarks --- benchmarks/benchmarks/bench_function_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmarks/bench_function_base.py b/benchmarks/benchmarks/bench_function_base.py index 2e44ff76b..cc37bef39 100644 --- a/benchmarks/benchmarks/bench_function_base.py +++ b/benchmarks/benchmarks/bench_function_base.py @@ -248,7 +248,7 @@ class Sort(Benchmark): # In NumPy 1.17 and newer, 'merge' can be one of several # stable sorts, it isn't necessarily merge sort. ['quick', 'merge', 'heap'], - ['float64', 'int64', 'float32', 'uint32', 'int32', 'int16'], + ['float64', 'int64', 'float32', 'uint32', 'int32', 'int16', 'float16'], [ ('random',), ('ordered',), -- cgit v1.2.1 From 47ed2780364a270a427d74f0db642bcd4a37e6f5 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 1 Nov 2022 22:13:24 -0700 Subject: TST: Add test for float16 quicksort --- numpy/core/tests/test_multiarray.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index 31c57f9bc..796ee07c3 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -9883,6 +9883,13 @@ def test_sort_float(N, dtype): infarr[np.random.choice(infarr.shape[0], (int)(N/2))] = -np.inf assert_equal(np.sort(infarr, kind='quick'), np.sort(infarr, kind='heap')) +def test_sort_float16(): + arr = np.arange(65536, dtype=np.int16) + temp = np.frombuffer(arr.tobytes(), dtype=np.float16) + data = np.copy(temp) + np.random.shuffle(data) + data_backup = data + assert_equal(np.sort(data, kind='quick'), np.sort(data_backup, kind='heap')) @pytest.mark.parametrize("N", np.arange(1, 512)) @pytest.mark.parametrize("dtype", ['h', 'H', 'i', 'I', 'l', 'L']) -- cgit v1.2.1 From f4c835332426d518c9e99bd00b45e8f5f453d6c8 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 2 Nov 2022 14:17:27 -0700 Subject: Fix linter errors' --- numpy/core/tests/test_multiarray.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index 796ee07c3..1d4de8e6e 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -9889,7 +9889,9 @@ def test_sort_float16(): data = np.copy(temp) np.random.shuffle(data) data_backup = data - assert_equal(np.sort(data, kind='quick'), np.sort(data_backup, kind='heap')) + assert_equal(np.sort(data, kind='quick'), + np.sort(data_backup, kind='heap')) + @pytest.mark.parametrize("N", np.arange(1, 512)) @pytest.mark.parametrize("dtype", ['h', 'H', 'i', 'I', 'l', 'L']) -- cgit v1.2.1 From 6f2ea90d4d7f69ccc3c6389ef70d50652f3064b7 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 2 Nov 2022 16:05:33 -0700 Subject: BUG: Use 
log2 instead a builtin --- numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp index 190188ecc..ce8637e32 100644 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp +++ b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp @@ -703,7 +703,7 @@ void avx512_qsort_fp16(uint16_t *arr, int64_t arrsize) if (arrsize > 1) { int64_t nan_count = replace_nan_with_inf(arr, arrsize); qsort_16bit_, uint16_t>( - arr, 0, arrsize - 1, 2 * (63 - __builtin_clzll(arrsize))); + arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); replace_inf_with_nan(arr, arrsize, nan_count); } } -- cgit v1.2.1 From 7c6615a229ec303b504dcacc695dabb4502e28b4 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 30 Jan 2023 13:03:39 -0800 Subject: Adding x86-simd-sort as submodule --- .gitmodules | 3 + numpy/core/src/npysort/x86-simd-sort | 1 + .../x86-simd-sort/src/avx512-16bit-qsort.hpp | 710 ------------------ .../x86-simd-sort/src/avx512-32bit-qsort.hpp | 713 ------------------ .../x86-simd-sort/src/avx512-64bit-qsort.hpp | 825 --------------------- .../x86-simd-sort/src/avx512-common-qsort.h | 230 ------ 6 files changed, 4 insertions(+), 2478 deletions(-) create mode 160000 numpy/core/src/npysort/x86-simd-sort delete mode 100644 numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp delete mode 100644 numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp delete mode 100644 numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp delete mode 100644 numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h diff --git a/.gitmodules b/.gitmodules index 1ea274daf..d849a3caf 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "numpy/core/src/umath/svml"] path = numpy/core/src/umath/svml url = https://github.com/numpy/SVML.git +[submodule "numpy/core/src/npysort/x86-simd-sort"] + path = numpy/core/src/npysort/x86-simd-sort + url = https://github.com/intel/x86-simd-sort diff --git a/numpy/core/src/npysort/x86-simd-sort b/numpy/core/src/npysort/x86-simd-sort new file mode 160000 index 000000000..0f1023bd0 --- /dev/null +++ b/numpy/core/src/npysort/x86-simd-sort @@ -0,0 +1 @@ +Subproject commit 0f1023bd0ffdabfe22883b85d4dfe55a6ed6ad3f diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp deleted file mode 100644 index ce8637e32..000000000 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-16bit-qsort.hpp +++ /dev/null @@ -1,710 +0,0 @@ -/******************************************************************* - * Copyright (C) 2022 Intel Corporation - * SPDX-License-Identifier: BSD-3-Clause - * Authors: Raghuveer Devulapalli - * ****************************************************************/ - -#ifndef __AVX512_QSORT_16BIT__ -#define __AVX512_QSORT_16BIT__ - -#include "avx512-common-qsort.h" - -/* - * Constants used in sorting 32 elements in a ZMM registers. 
Based on Bitonic - * sorting network (see - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) - */ -// ZMM register: 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -static const uint16_t network[6][32] - = {{7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, - 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24}, - {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16}, - {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11, - 20, 21, 22, 23, 16, 17, 18, 19, 28, 29, 30, 31, 24, 25, 26, 27}, - {31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, - {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, - 24, 25, 26, 27, 28, 29, 30, 31, 16, 17, 18, 19, 20, 21, 22, 23}, - {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}; - -struct float16 { - uint16_t val; -}; - -template <> -struct vector { - using type_t = uint16_t; - using zmm_t = __m512i; - using ymm_t = __m256i; - using opmask_t = __mmask32; - static const uint8_t numlanes = 32; - - static zmm_t get_network(int index) - { - return _mm512_loadu_si512(&network[index - 1][0]); - } - static type_t type_max() - { - return X86_SIMD_SORT_INFINITYH; - } - static type_t type_min() - { - return X86_SIMD_SORT_NEGINFINITYH; - } - static zmm_t zmm_max() - { - return _mm512_set1_epi16(type_max()); - } - static opmask_t knot_opmask(opmask_t x) - { - return _knot_mask32(x); - } - - static opmask_t ge(zmm_t x, zmm_t y) - { - zmm_t sign_x = _mm512_and_si512(x, _mm512_set1_epi16(0x8000)); - zmm_t sign_y = _mm512_and_si512(y, _mm512_set1_epi16(0x8000)); - zmm_t exp_x = _mm512_and_si512(x, _mm512_set1_epi16(0x7c00)); - zmm_t exp_y = _mm512_and_si512(y, _mm512_set1_epi16(0x7c00)); - zmm_t mant_x = _mm512_and_si512(x, _mm512_set1_epi16(0x3ff)); - zmm_t mant_y = _mm512_and_si512(y, _mm512_set1_epi16(0x3ff)); - - __mmask32 mask_ge = _mm512_cmp_epu16_mask(sign_x, sign_y, _MM_CMPINT_LT); // only greater than - __mmask32 sign_eq = _mm512_cmpeq_epu16_mask(sign_x, sign_y); - __mmask32 neg = _mm512_mask_cmpeq_epu16_mask(sign_eq, sign_x, _mm512_set1_epi16(0x8000)); // both numbers are -ve - - // compare exponents only if signs are equal: - mask_ge = mask_ge | _mm512_mask_cmp_epu16_mask(sign_eq, exp_x, exp_y, _MM_CMPINT_NLE); - // get mask for elements for which both sign and exponents are equal: - __mmask32 exp_eq = _mm512_mask_cmpeq_epu16_mask(sign_eq, exp_x, exp_y); - - // compare mantissa for elements for which both sign and expponent are equal: - mask_ge = mask_ge | _mm512_mask_cmp_epu16_mask(exp_eq, mant_x, mant_y, _MM_CMPINT_NLT); - return _kxor_mask32(mask_ge, neg); - } - static zmm_t loadu(void const *mem) - { - return _mm512_loadu_si512(mem); - } - static zmm_t max(zmm_t x, zmm_t y) - { - return _mm512_mask_mov_epi16(y, ge(x, y), x); - } - static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) - { - // AVX512_VBMI2 - return _mm512_mask_compressstoreu_epi16(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) - { - // AVX512BW - return _mm512_mask_loadu_epi16(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) - { - return _mm512_mask_mov_epi16(x, mask, y); - } - static void mask_storeu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_storeu_epi16(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t 
y) - { - return _mm512_mask_mov_epi16(x, ge(x, y), y); - } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_epi16(idx, zmm); - } - // Apparently this is a terrible for perf, npy_half_to_float seems to work - // better - //static float uint16_to_float(uint16_t val) - //{ - // // Ideally use _mm_loadu_si16, but its only gcc > 11.x - // // TODO: use inline ASM? https://godbolt.org/z/aGYvh7fMM - // __m128i xmm = _mm_maskz_loadu_epi16(0x01, &val); - // __m128 xmm2 = _mm_cvtph_ps(xmm); - // return _mm_cvtss_f32(xmm2); - //} - static type_t float_to_uint16(float val) - { - __m128 xmm = _mm_load_ss(&val); - __m128i xmm2 = _mm_cvtps_ph(xmm, _MM_FROUND_NO_EXC); - return _mm_extract_epi16(xmm2, 0); - } - static type_t reducemax(zmm_t v) - { - __m512 lo = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 0)); - __m512 hi = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 1)); - float lo_max = _mm512_reduce_max_ps(lo); - float hi_max = _mm512_reduce_max_ps(hi); - return float_to_uint16(std::max(lo_max, hi_max)); - } - static type_t reducemin(zmm_t v) - { - __m512 lo = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 0)); - __m512 hi = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 1)); - float lo_max = _mm512_reduce_min_ps(lo); - float hi_max = _mm512_reduce_min_ps(hi); - return float_to_uint16(std::min(lo_max, hi_max)); - } - static zmm_t set1(type_t v) - { - return _mm512_set1_epi16(v); - } - template - static zmm_t shuffle(zmm_t zmm) - { - zmm = _mm512_shufflehi_epi16(zmm, (_MM_PERM_ENUM)mask); - return _mm512_shufflelo_epi16(zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_si512(mem, x); - } -}; - -template <> -struct vector { - using type_t = int16_t; - using zmm_t = __m512i; - using ymm_t = __m256i; - using opmask_t = __mmask32; - static const uint8_t numlanes = 32; - - static zmm_t get_network(int index) - { - return _mm512_loadu_si512(&network[index - 1][0]); - } - static type_t type_max() - { - return X86_SIMD_SORT_MAX_INT16; - } - static type_t type_min() - { - return X86_SIMD_SORT_MIN_INT16; - } - static zmm_t zmm_max() - { - return _mm512_set1_epi16(type_max()); - } - static opmask_t knot_opmask(opmask_t x) - { - return npyv_not_b16(x); - } - - static opmask_t ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_epi16_mask(x, y, _MM_CMPINT_NLT); - } - static zmm_t loadu(void const *mem) - { - return _mm512_loadu_si512(mem); - } - static zmm_t max(zmm_t x, zmm_t y) - { - return _mm512_max_epi16(x, y); - } - static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) - { - // AVX512_VBMI2 - return _mm512_mask_compressstoreu_epi16(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) - { - // AVX512BW - return _mm512_mask_loadu_epi16(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) - { - return _mm512_mask_mov_epi16(x, mask, y); - } - static void mask_storeu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_storeu_epi16(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) - { - return _mm512_min_epi16(x, y); - } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_epi16(idx, zmm); - } - static type_t reducemax(zmm_t v) - { - zmm_t lo = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 0)); - zmm_t hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 1)); - type_t lo_max = (type_t)npyv_reduce_max_s32(lo); - type_t hi_max = (type_t)npyv_reduce_max_s32(hi); - return std::max(lo_max, hi_max); - } - static type_t 
reducemin(zmm_t v) - { - zmm_t lo = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 0)); - zmm_t hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 1)); - type_t lo_min = (type_t)npyv_reduce_min_s32(lo); - type_t hi_min = (type_t)npyv_reduce_min_s32(hi); - return std::min(lo_min, hi_min); - } - static zmm_t set1(type_t v) - { - return _mm512_set1_epi16(v); - } - template - static zmm_t shuffle(zmm_t zmm) - { - zmm = _mm512_shufflehi_epi16(zmm, (_MM_PERM_ENUM)mask); - return _mm512_shufflelo_epi16(zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_si512(mem, x); - } -}; -template <> -struct vector { - using type_t = uint16_t; - using zmm_t = __m512i; - using ymm_t = __m256i; - using opmask_t = __mmask32; - static const uint8_t numlanes = 32; - - static zmm_t get_network(int index) - { - return _mm512_loadu_si512(&network[index - 1][0]); - } - static type_t type_max() - { - return X86_SIMD_SORT_MAX_UINT16; - } - static type_t type_min() - { - return 0; - } - static zmm_t zmm_max() - { - return _mm512_set1_epi16(type_max()); - } - - static opmask_t knot_opmask(opmask_t x) - { - return npyv_not_b16(x); - } - static opmask_t ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_epu16_mask(x, y, _MM_CMPINT_NLT); - } - static zmm_t loadu(void const *mem) - { - return _mm512_loadu_si512(mem); - } - static zmm_t max(zmm_t x, zmm_t y) - { - return _mm512_max_epu16(x, y); - } - static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_compressstoreu_epi16(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) - { - return _mm512_mask_loadu_epi16(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) - { - return _mm512_mask_mov_epi16(x, mask, y); - } - static void mask_storeu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_storeu_epi16(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) - { - return _mm512_min_epu16(x, y); - } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_epi16(idx, zmm); - } - static type_t reducemax(zmm_t v) - { - zmm_t lo = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 0)); - zmm_t hi = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 1)); - type_t lo_max = (type_t)npyv_reduce_max_s32(lo); - type_t hi_max = (type_t)npyv_reduce_max_s32(hi); - return std::max(lo_max, hi_max); - } - static type_t reducemin(zmm_t v) - { - zmm_t lo = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 0)); - zmm_t hi = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 1)); - type_t lo_min = (type_t)npyv_reduce_min_s32(lo); - type_t hi_min = (type_t)npyv_reduce_min_s32(hi); - return std::min(lo_min, hi_min); - } - static zmm_t set1(type_t v) - { - return _mm512_set1_epi16(v); - } - template - static zmm_t shuffle(zmm_t zmm) - { - zmm = _mm512_shufflehi_epi16(zmm, (_MM_PERM_ENUM)mask); - return _mm512_shufflelo_epi16(zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_si512(mem, x); - } -}; - -/* - * Assumes zmm is random and performs a full sorting network defined in - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg - */ -template -NPY_FINLINE zmm_t sort_zmm_16bit(zmm_t zmm) -{ - // Level 1 - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xAAAAAAAA); - // Level 2 - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xCCCCCCCC); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xAAAAAAAA); - // Level 3 - zmm = 
cmp_merge( - zmm, vtype::permutexvar(vtype::get_network(1), zmm), 0xF0F0F0F0); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xCCCCCCCC); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xAAAAAAAA); - // Level 4 - zmm = cmp_merge( - zmm, vtype::permutexvar(vtype::get_network(2), zmm), 0xFF00FF00); - zmm = cmp_merge( - zmm, vtype::permutexvar(vtype::get_network(3), zmm), 0xF0F0F0F0); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xCCCCCCCC); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xAAAAAAAA); - // Level 5 - zmm = cmp_merge( - zmm, vtype::permutexvar(vtype::get_network(4), zmm), 0xFFFF0000); - zmm = cmp_merge( - zmm, vtype::permutexvar(vtype::get_network(5), zmm), 0xFF00FF00); - zmm = cmp_merge( - zmm, vtype::permutexvar(vtype::get_network(3), zmm), 0xF0F0F0F0); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xCCCCCCCC); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xAAAAAAAA); - return zmm; -} - -// Assumes zmm is bitonic and performs a recursive half cleaner -template -NPY_FINLINE zmm_t bitonic_merge_zmm_16bit(zmm_t zmm) -{ - // 1) half_cleaner[32]: compare 1-17, 2-18, 3-19 etc .. - zmm = cmp_merge( - zmm, vtype::permutexvar(vtype::get_network(6), zmm), 0xFFFF0000); - // 2) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc .. - zmm = cmp_merge( - zmm, vtype::permutexvar(vtype::get_network(5), zmm), 0xFF00FF00); - // 3) half_cleaner[8] - zmm = cmp_merge( - zmm, vtype::permutexvar(vtype::get_network(3), zmm), 0xF0F0F0F0); - // 3) half_cleaner[4] - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xCCCCCCCC); - // 3) half_cleaner[2] - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xAAAAAAAA); - return zmm; -} - -// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner -template -NPY_FINLINE void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2) -{ - // 1) First step of a merging network: coex of zmm1 and zmm2 reversed - zmm2 = vtype::permutexvar(vtype::get_network(4), zmm2); - zmm_t zmm3 = vtype::min(zmm1, zmm2); - zmm_t zmm4 = vtype::max(zmm1, zmm2); - // 2) Recursive half cleaner for each - zmm1 = bitonic_merge_zmm_16bit(zmm3); - zmm2 = bitonic_merge_zmm_16bit(zmm4); -} - -// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive -// half cleaner -template -NPY_FINLINE void bitonic_merge_four_zmm_16bit(zmm_t *zmm) -{ - zmm_t zmm2r = vtype::permutexvar(vtype::get_network(4), zmm[2]); - zmm_t zmm3r = vtype::permutexvar(vtype::get_network(4), zmm[3]); - zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r); - zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r); - zmm_t zmm_t3 = vtype::permutexvar(vtype::get_network(4), - vtype::max(zmm[1], zmm2r)); - zmm_t zmm_t4 = vtype::permutexvar(vtype::get_network(4), - vtype::max(zmm[0], zmm3r)); - zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2); - zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2); - zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4); - zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4); - zmm[0] = bitonic_merge_zmm_16bit(zmm0); - zmm[1] = bitonic_merge_zmm_16bit(zmm1); - zmm[2] = bitonic_merge_zmm_16bit(zmm2); - zmm[3] = bitonic_merge_zmm_16bit(zmm3); -} - -template -NPY_FINLINE void sort_32_16bit(type_t *arr, int32_t N) -{ - typename vtype::opmask_t load_mask = ((0x1ull << N) - 0x1ull) & 0xFFFFFFFF; - typename vtype::zmm_t zmm - = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr); - vtype::mask_storeu(arr, load_mask, sort_zmm_16bit(zmm)); -} - -template -NPY_FINLINE void sort_64_16bit(type_t *arr, int32_t N) -{ - if (N <= 32) { - sort_32_16bit(arr, N); 
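        // The call above covers N <= 32 entirely within one ZMM register
        // (32 x 16-bit lanes): mask_loadu pads lanes >= N with
        // vtype::zmm_max() sentinels, a single pass of sort_zmm_16bit sorts
        // all 32 lanes, and mask_storeu writes only the first N lanes back.
        // For example, N == 20 gives load_mask == (1ull << 20) - 1 == 0xFFFFF.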
- return; - } - using zmm_t = typename vtype::zmm_t; - typename vtype::opmask_t load_mask - = ((0x1ull << (N - 32)) - 0x1ull) & 0xFFFFFFFF; - zmm_t zmm1 = vtype::loadu(arr); - zmm_t zmm2 = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr + 32); - zmm1 = sort_zmm_16bit(zmm1); - zmm2 = sort_zmm_16bit(zmm2); - bitonic_merge_two_zmm_16bit(zmm1, zmm2); - vtype::storeu(arr, zmm1); - vtype::mask_storeu(arr + 32, load_mask, zmm2); -} - -template -NPY_FINLINE void sort_128_16bit(type_t *arr, int32_t N) -{ - if (N <= 64) { - sort_64_16bit(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - using opmask_t = typename vtype::opmask_t; - zmm_t zmm[4]; - zmm[0] = vtype::loadu(arr); - zmm[1] = vtype::loadu(arr + 32); - opmask_t load_mask1 = 0xFFFFFFFF, load_mask2 = 0xFFFFFFFF; - if (N != 128) { - uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull; - load_mask1 = combined_mask & 0xFFFFFFFF; - load_mask2 = (combined_mask >> 32) & 0xFFFFFFFF; - } - zmm[2] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 64); - zmm[3] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 96); - zmm[0] = sort_zmm_16bit(zmm[0]); - zmm[1] = sort_zmm_16bit(zmm[1]); - zmm[2] = sort_zmm_16bit(zmm[2]); - zmm[3] = sort_zmm_16bit(zmm[3]); - bitonic_merge_two_zmm_16bit(zmm[0], zmm[1]); - bitonic_merge_two_zmm_16bit(zmm[2], zmm[3]); - bitonic_merge_four_zmm_16bit(zmm); - vtype::storeu(arr, zmm[0]); - vtype::storeu(arr + 32, zmm[1]); - vtype::mask_storeu(arr + 64, load_mask1, zmm[2]); - vtype::mask_storeu(arr + 96, load_mask2, zmm[3]); -} - -template -NPY_FINLINE type_t get_pivot_16bit(type_t *arr, - const int64_t left, - const int64_t right) -{ - // median of 32 - int64_t size = (right - left) / 32; - type_t vec_arr[32] = {arr[left], - arr[left + size], - arr[left + 2 * size], - arr[left + 3 * size], - arr[left + 4 * size], - arr[left + 5 * size], - arr[left + 6 * size], - arr[left + 7 * size], - arr[left + 8 * size], - arr[left + 9 * size], - arr[left + 10 * size], - arr[left + 11 * size], - arr[left + 12 * size], - arr[left + 13 * size], - arr[left + 14 * size], - arr[left + 15 * size], - arr[left + 16 * size], - arr[left + 17 * size], - arr[left + 18 * size], - arr[left + 19 * size], - arr[left + 20 * size], - arr[left + 21 * size], - arr[left + 22 * size], - arr[left + 23 * size], - arr[left + 24 * size], - arr[left + 25 * size], - arr[left + 26 * size], - arr[left + 27 * size], - arr[left + 28 * size], - arr[left + 29 * size], - arr[left + 30 * size], - arr[left + 31 * size]}; - __m512i rand_vec = _mm512_loadu_si512(vec_arr); - __m512i sort = sort_zmm_16bit(rand_vec); - return ((type_t *)&sort)[16]; -} - -template <> -bool comparison_func>(const uint16_t &a, const uint16_t &b) -{ - uint16_t signa = a & 0x8000, signb = b & 0x8000; - uint16_t expa = a & 0x7c00, expb = b & 0x7c00; - uint16_t manta = a & 0x3ff, mantb = b & 0x3ff; - if (signa != signb) { - // opposite signs - return a > b; - } - else if (signa > 0) { - // both -ve - if (expa != expb) { - return expa > expb; - } - else { - return manta > mantb; - } - } - else { - // both +ve - if (expa != expb) { - return expa < expb; - } - else { - return manta < mantb; - } - } - - //return npy_half_to_float(a) < npy_half_to_float(b); -} - -template -static void -qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) -{ - /* - * Resort to std::sort if quicksort isnt making any progress - */ - if (max_iters <= 0) { - std::sort(arr + left, arr + right + 1, comparison_func); - return; - } - /* - * Base case: use bitonic networks to sort arrays 
<= 128 - */ - if (right + 1 - left <= 128) { - sort_128_16bit(arr + left, (int32_t)(right + 1 - left)); - return; - } - - type_t pivot = get_pivot_16bit(arr, left, right); - type_t smallest = vtype::type_max(); - type_t biggest = vtype::type_min(); - int64_t pivot_index = partition_avx512( - arr, left, right + 1, pivot, &smallest, &biggest); - if (pivot != smallest) - qsort_16bit_(arr, left, pivot_index - 1, max_iters - 1); - if (pivot != biggest) - qsort_16bit_(arr, pivot_index, right, max_iters - 1); -} - -NPY_FINLINE int64_t replace_nan_with_inf(uint16_t *arr, int64_t arrsize) -{ - int64_t nan_count = 0; - __mmask16 loadmask = 0xFFFF; - while (arrsize > 0) { - if (arrsize < 16) { loadmask = (0x0001 << arrsize) - 0x0001; } - __m256i in_zmm = _mm256_maskz_loadu_epi16(loadmask, arr); - __m512 in_zmm_asfloat = _mm512_cvtph_ps(in_zmm); - __mmask16 nanmask = _mm512_cmp_ps_mask( - in_zmm_asfloat, in_zmm_asfloat, _CMP_NEQ_UQ); - nan_count += _mm_popcnt_u32((int32_t)nanmask); - _mm256_mask_storeu_epi16(arr, nanmask, YMM_MAX_HALF); - arr += 16; - arrsize -= 16; - } - return nan_count; -} - -NPY_FINLINE void -replace_inf_with_nan(uint16_t *arr, int64_t arrsize, int64_t nan_count) -{ - for (int64_t ii = arrsize - 1; nan_count > 0; --ii) { - arr[ii] = 0xFFFF; - nan_count -= 1; - } -} - -template <> -void avx512_qsort(int16_t *arr, int64_t arrsize) -{ - if (arrsize > 1) { - qsort_16bit_, int16_t>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - } -} - -template <> -void avx512_qsort(uint16_t *arr, int64_t arrsize) -{ - if (arrsize > 1) { - qsort_16bit_, uint16_t>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - } -} - -void avx512_qsort_fp16(uint16_t *arr, int64_t arrsize) -{ - if (arrsize > 1) { - int64_t nan_count = replace_nan_with_inf(arr, arrsize); - qsort_16bit_, uint16_t>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - replace_inf_with_nan(arr, arrsize, nan_count); - } -} -#endif // __AVX512_QSORT_16BIT__ diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp deleted file mode 100644 index 877849d6c..000000000 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-32bit-qsort.hpp +++ /dev/null @@ -1,713 +0,0 @@ -/******************************************************************* - * Copyright (C) 2022 Intel Corporation - * Copyright (C) 2021 Serge Sans Paille - * SPDX-License-Identifier: BSD-3-Clause - * Authors: Raghuveer Devulapalli - * Serge Sans Paille - * ****************************************************************/ -#ifndef __AVX512_QSORT_32BIT__ -#define __AVX512_QSORT_32BIT__ - -#include "avx512-common-qsort.h" - -/* - * Constants used in sorting 16 elements in a ZMM registers. 
Based on Bitonic - * sorting network (see - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) - */ -#define NETWORK_32BIT_1 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 -#define NETWORK_32BIT_2 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 -#define NETWORK_32BIT_3 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 -#define NETWORK_32BIT_4 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 -#define NETWORK_32BIT_5 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -#define NETWORK_32BIT_6 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4 -#define NETWORK_32BIT_7 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 - -template <> -struct vector { - using type_t = int32_t; - using zmm_t = __m512i; - using ymm_t = __m256i; - using opmask_t = __mmask16; - static const uint8_t numlanes = 16; - - static type_t type_max() - { - return X86_SIMD_SORT_MAX_INT32; - } - static type_t type_min() - { - return X86_SIMD_SORT_MIN_INT32; - } - static zmm_t zmm_max() - { - return _mm512_set1_epi32(type_max()); - } - - static opmask_t knot_opmask(opmask_t x) - { - return _mm512_knot(x); - } - static opmask_t ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_NLT); - } - template - static ymm_t i64gather(__m512i index, void const *base) - { - return _mm512_i64gather_epi32(index, base, scale); - } - static zmm_t merge(ymm_t y1, ymm_t y2) - { - zmm_t z1 = _mm512_castsi256_si512(y1); - return _mm512_inserti32x8(z1, y2, 1); - } - static zmm_t loadu(void const *mem) - { - return _mm512_loadu_si512(mem); - } - static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_compressstoreu_epi32(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) - { - return _mm512_mask_loadu_epi32(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) - { - return _mm512_mask_mov_epi32(x, mask, y); - } - static void mask_storeu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_storeu_epi32(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) - { - return _mm512_min_epi32(x, y); - } - static zmm_t max(zmm_t x, zmm_t y) - { - return _mm512_max_epi32(x, y); - } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_epi32(idx, zmm); - } - static type_t reducemax(zmm_t v) - { - return npyv_reduce_max_s32(v); - } - static type_t reducemin(zmm_t v) - { - return npyv_reduce_min_s32(v); - } - static zmm_t set1(type_t v) - { - return _mm512_set1_epi32(v); - } - template - static zmm_t shuffle(zmm_t zmm) - { - return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_si512(mem, x); - } - - static ymm_t max(ymm_t x, ymm_t y) - { - return _mm256_max_epi32(x, y); - } - static ymm_t min(ymm_t x, ymm_t y) - { - return _mm256_min_epi32(x, y); - } -}; -template <> -struct vector { - using type_t = uint32_t; - using zmm_t = __m512i; - using ymm_t = __m256i; - using opmask_t = __mmask16; - static const uint8_t numlanes = 16; - - static type_t type_max() - { - return X86_SIMD_SORT_MAX_UINT32; - } - static type_t type_min() - { - return 0; - } - static zmm_t zmm_max() - { - return _mm512_set1_epi32(type_max()); - } // TODO: this should broadcast bits as is? 
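    // zmm_max() is the padding source for partial loads: the tail helpers
    // call vtype::mask_loadu(vtype::zmm_max(), load_mask, ptr), so lanes past
    // the end of the array hold the type's maximum, sort to the top of the
    // register, and are dropped again by the matching masked store. For an
    // unsigned type that maximum is all ones, which is presumably what the
    // TODO above is getting at: broadcasting the bit pattern 0xFFFFFFFF
    // directly would produce the same register.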
- - template - static ymm_t i64gather(__m512i index, void const *base) - { - return _mm512_i64gather_epi32(index, base, scale); - } - static zmm_t merge(ymm_t y1, ymm_t y2) - { - zmm_t z1 = _mm512_castsi256_si512(y1); - return _mm512_inserti32x8(z1, y2, 1); - } - static opmask_t knot_opmask(opmask_t x) - { - return _mm512_knot(x); - } - static opmask_t ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_epu32_mask(x, y, _MM_CMPINT_NLT); - } - static zmm_t loadu(void const *mem) - { - return _mm512_loadu_si512(mem); - } - static zmm_t max(zmm_t x, zmm_t y) - { - return _mm512_max_epu32(x, y); - } - static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_compressstoreu_epi32(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) - { - return _mm512_mask_loadu_epi32(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) - { - return _mm512_mask_mov_epi32(x, mask, y); - } - static void mask_storeu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_storeu_epi32(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) - { - return _mm512_min_epu32(x, y); - } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_epi32(idx, zmm); - } - static type_t reducemax(zmm_t v) - { - return npyv_reduce_max_u32(v); - } - static type_t reducemin(zmm_t v) - { - return npyv_reduce_min_u32(v); - } - static zmm_t set1(type_t v) - { - return _mm512_set1_epi32(v); - } - template - static zmm_t shuffle(zmm_t zmm) - { - return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_si512(mem, x); - } - - static ymm_t max(ymm_t x, ymm_t y) - { - return _mm256_max_epu32(x, y); - } - static ymm_t min(ymm_t x, ymm_t y) - { - return _mm256_min_epu32(x, y); - } -}; -template <> -struct vector { - using type_t = float; - using zmm_t = __m512; - using ymm_t = __m256; - using opmask_t = __mmask16; - static const uint8_t numlanes = 16; - - static type_t type_max() - { - return X86_SIMD_SORT_INFINITYF; - } - static type_t type_min() - { - return -X86_SIMD_SORT_INFINITYF; - } - static zmm_t zmm_max() - { - return _mm512_set1_ps(type_max()); - } - - static opmask_t knot_opmask(opmask_t x) - { - return _mm512_knot(x); - } - static opmask_t ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); - } - template - static ymm_t i64gather(__m512i index, void const *base) - { - return _mm512_i64gather_ps(index, base, scale); - } - static zmm_t merge(ymm_t y1, ymm_t y2) - { - zmm_t z1 = _mm512_castsi512_ps( - _mm512_castsi256_si512(_mm256_castps_si256(y1))); - return _mm512_insertf32x8(z1, y2, 1); - } - static zmm_t loadu(void const *mem) - { - return _mm512_loadu_ps(mem); - } - static zmm_t max(zmm_t x, zmm_t y) - { - return _mm512_max_ps(x, y); - } - static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_compressstoreu_ps(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) - { - return _mm512_mask_loadu_ps(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) - { - return _mm512_mask_mov_ps(x, mask, y); - } - static void mask_storeu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_storeu_ps(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) - { - return _mm512_min_ps(x, y); - } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_ps(idx, zmm); - } - static type_t reducemax(zmm_t v) - { - return 
npyv_reduce_max_f32(v); - } - static type_t reducemin(zmm_t v) - { - return npyv_reduce_min_f32(v); - } - static zmm_t set1(type_t v) - { - return _mm512_set1_ps(v); - } - template - static zmm_t shuffle(zmm_t zmm) - { - return _mm512_shuffle_ps(zmm, zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_ps(mem, x); - } - - static ymm_t max(ymm_t x, ymm_t y) - { - return _mm256_max_ps(x, y); - } - static ymm_t min(ymm_t x, ymm_t y) - { - return _mm256_min_ps(x, y); - } -}; - -/* - * Assumes zmm is random and performs a full sorting network defined in - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg - */ -template -NPY_FINLINE zmm_t sort_zmm_32bit(zmm_t zmm) -{ - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xAAAA); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xCCCC); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xAAAA); - zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_3), zmm), - 0xF0F0); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xCCCC); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xAAAA); - zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm), - 0xFF00); - zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_6), zmm), - 0xF0F0); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xCCCC); - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xAAAA); - return zmm; -} - -// Assumes zmm is bitonic and performs a recursive half cleaner -template -NPY_FINLINE zmm_t bitonic_merge_zmm_32bit(zmm_t zmm) -{ - // 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc .. - zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_7), zmm), - 0xFF00); - // 2) half_cleaner[8]: compare 1-5, 2-6, 3-7 etc .. 
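    // cmp_merge(a, b, m) (defined in avx512-common-qsort.h) is one
    // compare-exchange layer: it takes the lane-wise min and max of a and b
    // and keeps the max only in lanes whose bit is set in m, roughly
    //     out[i] = ((m >> i) & 1) ? max(a[i], b[i]) : min(a[i], b[i]);
    // With b a permuted copy of a, each constant mask below selects which
    // half of every compared pair receives the larger element.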
- zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_6), zmm), - 0xF0F0); - // 3) half_cleaner[4] - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xCCCC); - // 3) half_cleaner[1] - zmm = cmp_merge( - zmm, - vtype::template shuffle(zmm), - 0xAAAA); - return zmm; -} - -// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner -template -NPY_FINLINE void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2) -{ - // 1) First step of a merging network: coex of zmm1 and zmm2 reversed - *zmm2 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), *zmm2); - zmm_t zmm3 = vtype::min(*zmm1, *zmm2); - zmm_t zmm4 = vtype::max(*zmm1, *zmm2); - // 2) Recursive half cleaner for each - *zmm1 = bitonic_merge_zmm_32bit(zmm3); - *zmm2 = bitonic_merge_zmm_32bit(zmm4); -} - -// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive -// half cleaner -template -NPY_FINLINE void bitonic_merge_four_zmm_32bit(zmm_t *zmm) -{ - zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[2]); - zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[3]); - zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r); - zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r); - zmm_t zmm_t3 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), - vtype::max(zmm[1], zmm2r)); - zmm_t zmm_t4 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), - vtype::max(zmm[0], zmm3r)); - zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2); - zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2); - zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4); - zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4); - zmm[0] = bitonic_merge_zmm_32bit(zmm0); - zmm[1] = bitonic_merge_zmm_32bit(zmm1); - zmm[2] = bitonic_merge_zmm_32bit(zmm2); - zmm[3] = bitonic_merge_zmm_32bit(zmm3); -} - -template -NPY_FINLINE void bitonic_merge_eight_zmm_32bit(zmm_t *zmm) -{ - zmm_t zmm4r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[4]); - zmm_t zmm5r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[5]); - zmm_t zmm6r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[6]); - zmm_t zmm7r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[7]); - zmm_t zmm_t1 = vtype::min(zmm[0], zmm7r); - zmm_t zmm_t2 = vtype::min(zmm[1], zmm6r); - zmm_t zmm_t3 = vtype::min(zmm[2], zmm5r); - zmm_t zmm_t4 = vtype::min(zmm[3], zmm4r); - zmm_t zmm_t5 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), - vtype::max(zmm[3], zmm4r)); - zmm_t zmm_t6 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), - vtype::max(zmm[2], zmm5r)); - zmm_t zmm_t7 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), - vtype::max(zmm[1], zmm6r)); - zmm_t zmm_t8 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), - vtype::max(zmm[0], zmm7r)); - COEX(zmm_t1, zmm_t3); - COEX(zmm_t2, zmm_t4); - COEX(zmm_t5, zmm_t7); - COEX(zmm_t6, zmm_t8); - COEX(zmm_t1, zmm_t2); - COEX(zmm_t3, zmm_t4); - COEX(zmm_t5, zmm_t6); - COEX(zmm_t7, zmm_t8); - zmm[0] = bitonic_merge_zmm_32bit(zmm_t1); - zmm[1] = bitonic_merge_zmm_32bit(zmm_t2); - zmm[2] = bitonic_merge_zmm_32bit(zmm_t3); - zmm[3] = bitonic_merge_zmm_32bit(zmm_t4); - zmm[4] = bitonic_merge_zmm_32bit(zmm_t5); - zmm[5] = bitonic_merge_zmm_32bit(zmm_t6); - zmm[6] = bitonic_merge_zmm_32bit(zmm_t7); - zmm[7] = bitonic_merge_zmm_32bit(zmm_t8); -} - -template -NPY_FINLINE void sort_16_32bit(type_t *arr, int32_t N) -{ - typename vtype::opmask_t load_mask = (0x0001 << N) - 0x0001; - typename vtype::zmm_t zmm - = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr); - vtype::mask_storeu(arr, load_mask, 
sort_zmm_32bit(zmm)); -} - -template -NPY_FINLINE void sort_32_32bit(type_t *arr, int32_t N) -{ - if (N <= 16) { - sort_16_32bit(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - zmm_t zmm1 = vtype::loadu(arr); - typename vtype::opmask_t load_mask = (0x0001 << (N - 16)) - 0x0001; - zmm_t zmm2 = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr + 16); - zmm1 = sort_zmm_32bit(zmm1); - zmm2 = sort_zmm_32bit(zmm2); - bitonic_merge_two_zmm_32bit(&zmm1, &zmm2); - vtype::storeu(arr, zmm1); - vtype::mask_storeu(arr + 16, load_mask, zmm2); -} - -template -NPY_FINLINE void sort_64_32bit(type_t *arr, int32_t N) -{ - if (N <= 32) { - sort_32_32bit(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - using opmask_t = typename vtype::opmask_t; - zmm_t zmm[4]; - zmm[0] = vtype::loadu(arr); - zmm[1] = vtype::loadu(arr + 16); - opmask_t load_mask1 = 0xFFFF, load_mask2 = 0xFFFF; - uint64_t combined_mask = (0x1ull << (N - 32)) - 0x1ull; - load_mask1 &= combined_mask & 0xFFFF; - load_mask2 &= (combined_mask >> 16) & 0xFFFF; - zmm[2] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 32); - zmm[3] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 48); - zmm[0] = sort_zmm_32bit(zmm[0]); - zmm[1] = sort_zmm_32bit(zmm[1]); - zmm[2] = sort_zmm_32bit(zmm[2]); - zmm[3] = sort_zmm_32bit(zmm[3]); - bitonic_merge_two_zmm_32bit(&zmm[0], &zmm[1]); - bitonic_merge_two_zmm_32bit(&zmm[2], &zmm[3]); - bitonic_merge_four_zmm_32bit(zmm); - vtype::storeu(arr, zmm[0]); - vtype::storeu(arr + 16, zmm[1]); - vtype::mask_storeu(arr + 32, load_mask1, zmm[2]); - vtype::mask_storeu(arr + 48, load_mask2, zmm[3]); -} - -template -NPY_FINLINE void sort_128_32bit(type_t *arr, int32_t N) -{ - if (N <= 64) { - sort_64_32bit(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - using opmask_t = typename vtype::opmask_t; - zmm_t zmm[8]; - zmm[0] = vtype::loadu(arr); - zmm[1] = vtype::loadu(arr + 16); - zmm[2] = vtype::loadu(arr + 32); - zmm[3] = vtype::loadu(arr + 48); - zmm[0] = sort_zmm_32bit(zmm[0]); - zmm[1] = sort_zmm_32bit(zmm[1]); - zmm[2] = sort_zmm_32bit(zmm[2]); - zmm[3] = sort_zmm_32bit(zmm[3]); - opmask_t load_mask1 = 0xFFFF, load_mask2 = 0xFFFF; - opmask_t load_mask3 = 0xFFFF, load_mask4 = 0xFFFF; - if (N != 128) { - uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull; - load_mask1 &= combined_mask & 0xFFFF; - load_mask2 &= (combined_mask >> 16) & 0xFFFF; - load_mask3 &= (combined_mask >> 32) & 0xFFFF; - load_mask4 &= (combined_mask >> 48) & 0xFFFF; - } - zmm[4] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 64); - zmm[5] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 80); - zmm[6] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 96); - zmm[7] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 112); - zmm[4] = sort_zmm_32bit(zmm[4]); - zmm[5] = sort_zmm_32bit(zmm[5]); - zmm[6] = sort_zmm_32bit(zmm[6]); - zmm[7] = sort_zmm_32bit(zmm[7]); - bitonic_merge_two_zmm_32bit(&zmm[0], &zmm[1]); - bitonic_merge_two_zmm_32bit(&zmm[2], &zmm[3]); - bitonic_merge_two_zmm_32bit(&zmm[4], &zmm[5]); - bitonic_merge_two_zmm_32bit(&zmm[6], &zmm[7]); - bitonic_merge_four_zmm_32bit(zmm); - bitonic_merge_four_zmm_32bit(zmm + 4); - bitonic_merge_eight_zmm_32bit(zmm); - vtype::storeu(arr, zmm[0]); - vtype::storeu(arr + 16, zmm[1]); - vtype::storeu(arr + 32, zmm[2]); - vtype::storeu(arr + 48, zmm[3]); - vtype::mask_storeu(arr + 64, load_mask1, zmm[4]); - vtype::mask_storeu(arr + 80, load_mask2, zmm[5]); - vtype::mask_storeu(arr + 96, load_mask3, zmm[6]); - vtype::mask_storeu(arr 
+ 112, load_mask4, zmm[7]); -} - -template -NPY_FINLINE type_t get_pivot_32bit(type_t *arr, - const int64_t left, - const int64_t right) -{ - // median of 16 - int64_t size = (right - left) / 16; - using zmm_t = typename vtype::zmm_t; - using ymm_t = typename vtype::ymm_t; - __m512i rand_index1 = _mm512_set_epi64(left + size, - left + 2 * size, - left + 3 * size, - left + 4 * size, - left + 5 * size, - left + 6 * size, - left + 7 * size, - left + 8 * size); - __m512i rand_index2 = _mm512_set_epi64(left + 9 * size, - left + 10 * size, - left + 11 * size, - left + 12 * size, - left + 13 * size, - left + 14 * size, - left + 15 * size, - left + 16 * size); - ymm_t rand_vec1 - = vtype::template i64gather(rand_index1, arr); - ymm_t rand_vec2 - = vtype::template i64gather(rand_index2, arr); - zmm_t rand_vec = vtype::merge(rand_vec1, rand_vec2); - zmm_t sort = sort_zmm_32bit(rand_vec); - // pivot will never be a nan, since there are no nan's! - return ((type_t *)&sort)[8]; -} - -template -static void -qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) -{ - /* - * Resort to std::sort if quicksort isnt making any progress - */ - if (max_iters <= 0) { - std::sort(arr + left, arr + right + 1); - return; - } - /* - * Base case: use bitonic networks to sort arrays <= 128 - */ - if (right + 1 - left <= 128) { - sort_128_32bit(arr + left, (int32_t)(right + 1 - left)); - return; - } - - type_t pivot = get_pivot_32bit(arr, left, right); - type_t smallest = vtype::type_max(); - type_t biggest = vtype::type_min(); - int64_t pivot_index = partition_avx512( - arr, left, right + 1, pivot, &smallest, &biggest); - if (pivot != smallest) - qsort_32bit_(arr, left, pivot_index - 1, max_iters - 1); - if (pivot != biggest) - qsort_32bit_(arr, pivot_index, right, max_iters - 1); -} - -NPY_FINLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize) -{ - int64_t nan_count = 0; - __mmask16 loadmask = 0xFFFF; - while (arrsize > 0) { - if (arrsize < 16) { loadmask = (0x0001 << arrsize) - 0x0001; } - __m512 in_zmm = _mm512_maskz_loadu_ps(loadmask, arr); - __mmask16 nanmask = _mm512_cmp_ps_mask(in_zmm, in_zmm, _CMP_NEQ_UQ); - nan_count += _mm_popcnt_u32((int32_t)nanmask); - _mm512_mask_storeu_ps(arr, nanmask, ZMM_MAX_FLOAT); - arr += 16; - arrsize -= 16; - } - return nan_count; -} - -NPY_FINLINE void -replace_inf_with_nan(float *arr, int64_t arrsize, int64_t nan_count) -{ - for (int64_t ii = arrsize - 1; nan_count > 0; --ii) { - arr[ii] = std::nanf("1"); - nan_count -= 1; - } -} - -template <> -void avx512_qsort(int32_t *arr, int64_t arrsize) -{ - if (arrsize > 1) { - qsort_32bit_, int32_t>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - } -} - -template <> -void avx512_qsort(uint32_t *arr, int64_t arrsize) -{ - if (arrsize > 1) { - qsort_32bit_, uint32_t>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - } -} - -template <> -void avx512_qsort(float *arr, int64_t arrsize) -{ - if (arrsize > 1) { - int64_t nan_count = replace_nan_with_inf(arr, arrsize); - qsort_32bit_, float>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - replace_inf_with_nan(arr, arrsize, nan_count); - } -} - -#endif //__AVX512_QSORT_32BIT__ diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp b/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp deleted file mode 100644 index b067f5eda..000000000 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-64bit-qsort.hpp +++ /dev/null @@ -1,825 +0,0 @@ -/******************************************************************* - * 
Copyright (C) 2022 Intel Corporation - * SPDX-License-Identifier: BSD-3-Clause - * Authors: Raghuveer Devulapalli - * ****************************************************************/ - -#ifndef __AVX512_QSORT_64BIT__ -#define __AVX512_QSORT_64BIT__ - -#include "avx512-common-qsort.h" - -/* - * Constants used in sorting 8 elements in a ZMM registers. Based on Bitonic - * sorting network (see - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) - */ -// ZMM 7, 6, 5, 4, 3, 2, 1, 0 -#define NETWORK_64BIT_1 4, 5, 6, 7, 0, 1, 2, 3 -#define NETWORK_64BIT_2 0, 1, 2, 3, 4, 5, 6, 7 -#define NETWORK_64BIT_3 5, 4, 7, 6, 1, 0, 3, 2 -#define NETWORK_64BIT_4 3, 2, 1, 0, 7, 6, 5, 4 - -template <> -struct vector { - using type_t = int64_t; - using zmm_t = __m512i; - using ymm_t = __m512i; - using opmask_t = __mmask8; - static const uint8_t numlanes = 8; - - static type_t type_max() - { - return X86_SIMD_SORT_MAX_INT64; - } - static type_t type_min() - { - return X86_SIMD_SORT_MIN_INT64; - } - static zmm_t zmm_max() - { - return _mm512_set1_epi64(type_max()); - } // TODO: this should broadcast bits as is? - - static zmm_t set(type_t v1, - type_t v2, - type_t v3, - type_t v4, - type_t v5, - type_t v6, - type_t v7, - type_t v8) - { - return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8); - } - - static opmask_t knot_opmask(opmask_t x) - { - return npyv_not_b64(x); - } - static opmask_t ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_NLT); - } - template - static zmm_t i64gather(__m512i index, void const *base) - { - return _mm512_i64gather_epi64(index, base, scale); - } - static zmm_t loadu(void const *mem) - { - return _mm512_loadu_si512(mem); - } - static zmm_t max(zmm_t x, zmm_t y) - { - return _mm512_max_epi64(x, y); - } - static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_compressstoreu_epi64(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) - { - return _mm512_mask_loadu_epi64(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) - { - return _mm512_mask_mov_epi64(x, mask, y); - } - static void mask_storeu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_storeu_epi64(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) - { - return _mm512_min_epi64(x, y); - } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_epi64(idx, zmm); - } - static type_t reducemax(zmm_t v) - { - return npyv_reduce_max_s64(v); - } - static type_t reducemin(zmm_t v) - { - return npyv_reduce_min_s64(v); - } - static zmm_t set1(type_t v) - { - return _mm512_set1_epi64(v); - } - template - static zmm_t shuffle(zmm_t zmm) - { - __m512d temp = _mm512_castsi512_pd(zmm); - return _mm512_castpd_si512( - _mm512_shuffle_pd(temp, temp, (_MM_PERM_ENUM)mask)); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_si512(mem, x); - } -}; -template <> -struct vector { - using type_t = uint64_t; - using zmm_t = __m512i; - using ymm_t = __m512i; - using opmask_t = __mmask8; - static const uint8_t numlanes = 8; - - static type_t type_max() - { - return X86_SIMD_SORT_MAX_UINT64; - } - static type_t type_min() - { - return 0; - } - static zmm_t zmm_max() - { - return _mm512_set1_epi64(type_max()); - } - - static zmm_t set(type_t v1, - type_t v2, - type_t v3, - type_t v4, - type_t v5, - type_t v6, - type_t v7, - type_t v8) - { - return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8); - } - - template - static zmm_t i64gather(__m512i index, 
void const *base) - { - return _mm512_i64gather_epi64(index, base, scale); - } - static opmask_t knot_opmask(opmask_t x) - { - return npyv_not_b64(x); - } - static opmask_t ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_epu64_mask(x, y, _MM_CMPINT_NLT); - } - static zmm_t loadu(void const *mem) - { - return _mm512_loadu_si512(mem); - } - static zmm_t max(zmm_t x, zmm_t y) - { - return _mm512_max_epu64(x, y); - } - static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_compressstoreu_epi64(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) - { - return _mm512_mask_loadu_epi64(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) - { - return _mm512_mask_mov_epi64(x, mask, y); - } - static void mask_storeu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_storeu_epi64(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) - { - return _mm512_min_epu64(x, y); - } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_epi64(idx, zmm); - } - static type_t reducemax(zmm_t v) - { - return npyv_reduce_max_u64(v); - } - static type_t reducemin(zmm_t v) - { - return npyv_reduce_min_u64(v); - } - static zmm_t set1(type_t v) - { - return _mm512_set1_epi64(v); - } - template - static zmm_t shuffle(zmm_t zmm) - { - __m512d temp = _mm512_castsi512_pd(zmm); - return _mm512_castpd_si512( - _mm512_shuffle_pd(temp, temp, (_MM_PERM_ENUM)mask)); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_si512(mem, x); - } -}; -template <> -struct vector { - using type_t = double; - using zmm_t = __m512d; - using ymm_t = __m512d; - using opmask_t = __mmask8; - static const uint8_t numlanes = 8; - - static type_t type_max() - { - return X86_SIMD_SORT_INFINITY; - } - static type_t type_min() - { - return -X86_SIMD_SORT_INFINITY; - } - static zmm_t zmm_max() - { - return _mm512_set1_pd(type_max()); - } - - static zmm_t set(type_t v1, - type_t v2, - type_t v3, - type_t v4, - type_t v5, - type_t v6, - type_t v7, - type_t v8) - { - return _mm512_set_pd(v1, v2, v3, v4, v5, v6, v7, v8); - } - - static opmask_t knot_opmask(opmask_t x) - { - return npyv_not_b64(x); - } - static opmask_t ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_pd_mask(x, y, _CMP_GE_OQ); - } - template - static zmm_t i64gather(__m512i index, void const *base) - { - return _mm512_i64gather_pd(index, base, scale); - } - static zmm_t loadu(void const *mem) - { - return _mm512_loadu_pd(mem); - } - static zmm_t max(zmm_t x, zmm_t y) - { - return _mm512_max_pd(x, y); - } - static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_compressstoreu_pd(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) - { - return _mm512_mask_loadu_pd(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) - { - return _mm512_mask_mov_pd(x, mask, y); - } - static void mask_storeu(void *mem, opmask_t mask, zmm_t x) - { - return _mm512_mask_storeu_pd(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) - { - return _mm512_min_pd(x, y); - } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_pd(idx, zmm); - } - static type_t reducemax(zmm_t v) - { - return npyv_reduce_max_f64(v); - } - static type_t reducemin(zmm_t v) - { - return npyv_reduce_min_f64(v); - } - static zmm_t set1(type_t v) - { - return _mm512_set1_pd(v); - } - template - static zmm_t shuffle(zmm_t zmm) - { - return _mm512_shuffle_pd(zmm, zmm, 
(_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_pd(mem, x); - } -}; - -/* - * Assumes zmm is random and performs a full sorting network defined in - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg - */ -template -NPY_FINLINE zmm_t sort_zmm_64bit(zmm_t zmm) -{ - const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); - zmm = cmp_merge( - zmm, vtype::template shuffle(zmm), 0xAA); - zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi64(NETWORK_64BIT_1), zmm), - 0xCC); - zmm = cmp_merge( - zmm, vtype::template shuffle(zmm), 0xAA); - zmm = cmp_merge(zmm, vtype::permutexvar(rev_index, zmm), 0xF0); - zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi64(NETWORK_64BIT_3), zmm), - 0xCC); - zmm = cmp_merge( - zmm, vtype::template shuffle(zmm), 0xAA); - return zmm; -} - -// Assumes zmm is bitonic and performs a recursive half cleaner -template -NPY_FINLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm) -{ - - // 1) half_cleaner[8]: compare 0-4, 1-5, 2-6, 3-7 - zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi64(NETWORK_64BIT_4), zmm), - 0xF0); - // 2) half_cleaner[4] - zmm = cmp_merge( - zmm, - vtype::permutexvar(_mm512_set_epi64(NETWORK_64BIT_3), zmm), - 0xCC); - // 3) half_cleaner[1] - zmm = cmp_merge( - zmm, vtype::template shuffle(zmm), 0xAA); - return zmm; -} - -// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner -template -NPY_FINLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2) -{ - const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); - // 1) First step of a merging network: coex of zmm1 and zmm2 reversed - zmm2 = vtype::permutexvar(rev_index, zmm2); - zmm_t zmm3 = vtype::min(zmm1, zmm2); - zmm_t zmm4 = vtype::max(zmm1, zmm2); - // 2) Recursive half cleaner for each - zmm1 = bitonic_merge_zmm_64bit(zmm3); - zmm2 = bitonic_merge_zmm_64bit(zmm4); -} - -// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive -// half cleaner -template -NPY_FINLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm) -{ - const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); - // 1) First step of a merging network - zmm_t zmm2r = vtype::permutexvar(rev_index, zmm[2]); - zmm_t zmm3r = vtype::permutexvar(rev_index, zmm[3]); - zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r); - zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r); - // 2) Recursive half clearer: 16 - zmm_t zmm_t3 = vtype::permutexvar(rev_index, vtype::max(zmm[1], zmm2r)); - zmm_t zmm_t4 = vtype::permutexvar(rev_index, vtype::max(zmm[0], zmm3r)); - zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2); - zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2); - zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4); - zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4); - zmm[0] = bitonic_merge_zmm_64bit(zmm0); - zmm[1] = bitonic_merge_zmm_64bit(zmm1); - zmm[2] = bitonic_merge_zmm_64bit(zmm2); - zmm[3] = bitonic_merge_zmm_64bit(zmm3); -} - -template -NPY_FINLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm) -{ - const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); - zmm_t zmm4r = vtype::permutexvar(rev_index, zmm[4]); - zmm_t zmm5r = vtype::permutexvar(rev_index, zmm[5]); - zmm_t zmm6r = vtype::permutexvar(rev_index, zmm[6]); - zmm_t zmm7r = vtype::permutexvar(rev_index, zmm[7]); - zmm_t zmm_t1 = vtype::min(zmm[0], zmm7r); - zmm_t zmm_t2 = vtype::min(zmm[1], zmm6r); - zmm_t zmm_t3 = vtype::min(zmm[2], zmm5r); - zmm_t zmm_t4 = vtype::min(zmm[3], zmm4r); - zmm_t zmm_t5 = vtype::permutexvar(rev_index, vtype::max(zmm[3], zmm4r)); - zmm_t zmm_t6 = 
vtype::permutexvar(rev_index, vtype::max(zmm[2], zmm5r)); - zmm_t zmm_t7 = vtype::permutexvar(rev_index, vtype::max(zmm[1], zmm6r)); - zmm_t zmm_t8 = vtype::permutexvar(rev_index, vtype::max(zmm[0], zmm7r)); - COEX(zmm_t1, zmm_t3); - COEX(zmm_t2, zmm_t4); - COEX(zmm_t5, zmm_t7); - COEX(zmm_t6, zmm_t8); - COEX(zmm_t1, zmm_t2); - COEX(zmm_t3, zmm_t4); - COEX(zmm_t5, zmm_t6); - COEX(zmm_t7, zmm_t8); - zmm[0] = bitonic_merge_zmm_64bit(zmm_t1); - zmm[1] = bitonic_merge_zmm_64bit(zmm_t2); - zmm[2] = bitonic_merge_zmm_64bit(zmm_t3); - zmm[3] = bitonic_merge_zmm_64bit(zmm_t4); - zmm[4] = bitonic_merge_zmm_64bit(zmm_t5); - zmm[5] = bitonic_merge_zmm_64bit(zmm_t6); - zmm[6] = bitonic_merge_zmm_64bit(zmm_t7); - zmm[7] = bitonic_merge_zmm_64bit(zmm_t8); -} - -template -NPY_FINLINE void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm) -{ - const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); - zmm_t zmm8r = vtype::permutexvar(rev_index, zmm[8]); - zmm_t zmm9r = vtype::permutexvar(rev_index, zmm[9]); - zmm_t zmm10r = vtype::permutexvar(rev_index, zmm[10]); - zmm_t zmm11r = vtype::permutexvar(rev_index, zmm[11]); - zmm_t zmm12r = vtype::permutexvar(rev_index, zmm[12]); - zmm_t zmm13r = vtype::permutexvar(rev_index, zmm[13]); - zmm_t zmm14r = vtype::permutexvar(rev_index, zmm[14]); - zmm_t zmm15r = vtype::permutexvar(rev_index, zmm[15]); - zmm_t zmm_t1 = vtype::min(zmm[0], zmm15r); - zmm_t zmm_t2 = vtype::min(zmm[1], zmm14r); - zmm_t zmm_t3 = vtype::min(zmm[2], zmm13r); - zmm_t zmm_t4 = vtype::min(zmm[3], zmm12r); - zmm_t zmm_t5 = vtype::min(zmm[4], zmm11r); - zmm_t zmm_t6 = vtype::min(zmm[5], zmm10r); - zmm_t zmm_t7 = vtype::min(zmm[6], zmm9r); - zmm_t zmm_t8 = vtype::min(zmm[7], zmm8r); - zmm_t zmm_t9 = vtype::permutexvar(rev_index, vtype::max(zmm[7], zmm8r)); - zmm_t zmm_t10 = vtype::permutexvar(rev_index, vtype::max(zmm[6], zmm9r)); - zmm_t zmm_t11 = vtype::permutexvar(rev_index, vtype::max(zmm[5], zmm10r)); - zmm_t zmm_t12 = vtype::permutexvar(rev_index, vtype::max(zmm[4], zmm11r)); - zmm_t zmm_t13 = vtype::permutexvar(rev_index, vtype::max(zmm[3], zmm12r)); - zmm_t zmm_t14 = vtype::permutexvar(rev_index, vtype::max(zmm[2], zmm13r)); - zmm_t zmm_t15 = vtype::permutexvar(rev_index, vtype::max(zmm[1], zmm14r)); - zmm_t zmm_t16 = vtype::permutexvar(rev_index, vtype::max(zmm[0], zmm15r)); - // Recusive half clear 16 zmm regs - COEX(zmm_t1, zmm_t5); - COEX(zmm_t2, zmm_t6); - COEX(zmm_t3, zmm_t7); - COEX(zmm_t4, zmm_t8); - COEX(zmm_t9, zmm_t13); - COEX(zmm_t10, zmm_t14); - COEX(zmm_t11, zmm_t15); - COEX(zmm_t12, zmm_t16); - // - COEX(zmm_t1, zmm_t3); - COEX(zmm_t2, zmm_t4); - COEX(zmm_t5, zmm_t7); - COEX(zmm_t6, zmm_t8); - COEX(zmm_t9, zmm_t11); - COEX(zmm_t10, zmm_t12); - COEX(zmm_t13, zmm_t15); - COEX(zmm_t14, zmm_t16); - // - COEX(zmm_t1, zmm_t2); - COEX(zmm_t3, zmm_t4); - COEX(zmm_t5, zmm_t6); - COEX(zmm_t7, zmm_t8); - COEX(zmm_t9, zmm_t10); - COEX(zmm_t11, zmm_t12); - COEX(zmm_t13, zmm_t14); - COEX(zmm_t15, zmm_t16); - // - zmm[0] = bitonic_merge_zmm_64bit(zmm_t1); - zmm[1] = bitonic_merge_zmm_64bit(zmm_t2); - zmm[2] = bitonic_merge_zmm_64bit(zmm_t3); - zmm[3] = bitonic_merge_zmm_64bit(zmm_t4); - zmm[4] = bitonic_merge_zmm_64bit(zmm_t5); - zmm[5] = bitonic_merge_zmm_64bit(zmm_t6); - zmm[6] = bitonic_merge_zmm_64bit(zmm_t7); - zmm[7] = bitonic_merge_zmm_64bit(zmm_t8); - zmm[8] = bitonic_merge_zmm_64bit(zmm_t9); - zmm[9] = bitonic_merge_zmm_64bit(zmm_t10); - zmm[10] = bitonic_merge_zmm_64bit(zmm_t11); - zmm[11] = bitonic_merge_zmm_64bit(zmm_t12); - zmm[12] = bitonic_merge_zmm_64bit(zmm_t13); - 
zmm[13] = bitonic_merge_zmm_64bit(zmm_t14); - zmm[14] = bitonic_merge_zmm_64bit(zmm_t15); - zmm[15] = bitonic_merge_zmm_64bit(zmm_t16); -} - -template -NPY_FINLINE void sort_8_64bit(type_t *arr, int32_t N) -{ - typename vtype::opmask_t load_mask = (0x01 << N) - 0x01; - typename vtype::zmm_t zmm - = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr); - vtype::mask_storeu(arr, load_mask, sort_zmm_64bit(zmm)); -} - -template -NPY_FINLINE void sort_16_64bit(type_t *arr, int32_t N) -{ - if (N <= 8) { - sort_8_64bit(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - zmm_t zmm1 = vtype::loadu(arr); - typename vtype::opmask_t load_mask = (0x01 << (N - 8)) - 0x01; - zmm_t zmm2 = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr + 8); - zmm1 = sort_zmm_64bit(zmm1); - zmm2 = sort_zmm_64bit(zmm2); - bitonic_merge_two_zmm_64bit(zmm1, zmm2); - vtype::storeu(arr, zmm1); - vtype::mask_storeu(arr + 8, load_mask, zmm2); -} - -template -NPY_FINLINE void sort_32_64bit(type_t *arr, int32_t N) -{ - if (N <= 16) { - sort_16_64bit(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - using opmask_t = typename vtype::opmask_t; - zmm_t zmm[4]; - zmm[0] = vtype::loadu(arr); - zmm[1] = vtype::loadu(arr + 8); - opmask_t load_mask1 = 0xFF, load_mask2 = 0xFF; - uint64_t combined_mask = (0x1ull << (N - 16)) - 0x1ull; - load_mask1 = (combined_mask)&0xFF; - load_mask2 = (combined_mask >> 8) & 0xFF; - zmm[2] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 16); - zmm[3] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 24); - zmm[0] = sort_zmm_64bit(zmm[0]); - zmm[1] = sort_zmm_64bit(zmm[1]); - zmm[2] = sort_zmm_64bit(zmm[2]); - zmm[3] = sort_zmm_64bit(zmm[3]); - bitonic_merge_two_zmm_64bit(zmm[0], zmm[1]); - bitonic_merge_two_zmm_64bit(zmm[2], zmm[3]); - bitonic_merge_four_zmm_64bit(zmm); - vtype::storeu(arr, zmm[0]); - vtype::storeu(arr + 8, zmm[1]); - vtype::mask_storeu(arr + 16, load_mask1, zmm[2]); - vtype::mask_storeu(arr + 24, load_mask2, zmm[3]); -} - -template -NPY_FINLINE void sort_64_64bit(type_t *arr, int32_t N) -{ - if (N <= 32) { - sort_32_64bit(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - using opmask_t = typename vtype::opmask_t; - zmm_t zmm[8]; - zmm[0] = vtype::loadu(arr); - zmm[1] = vtype::loadu(arr + 8); - zmm[2] = vtype::loadu(arr + 16); - zmm[3] = vtype::loadu(arr + 24); - zmm[0] = sort_zmm_64bit(zmm[0]); - zmm[1] = sort_zmm_64bit(zmm[1]); - zmm[2] = sort_zmm_64bit(zmm[2]); - zmm[3] = sort_zmm_64bit(zmm[3]); - opmask_t load_mask1 = 0xFF, load_mask2 = 0xFF; - opmask_t load_mask3 = 0xFF, load_mask4 = 0xFF; - // N-32 >= 1 - uint64_t combined_mask = (0x1ull << (N - 32)) - 0x1ull; - load_mask1 = (combined_mask)&0xFF; - load_mask2 = (combined_mask >> 8) & 0xFF; - load_mask3 = (combined_mask >> 16) & 0xFF; - load_mask4 = (combined_mask >> 24) & 0xFF; - zmm[4] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 32); - zmm[5] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 40); - zmm[6] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 48); - zmm[7] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 56); - zmm[4] = sort_zmm_64bit(zmm[4]); - zmm[5] = sort_zmm_64bit(zmm[5]); - zmm[6] = sort_zmm_64bit(zmm[6]); - zmm[7] = sort_zmm_64bit(zmm[7]); - bitonic_merge_two_zmm_64bit(zmm[0], zmm[1]); - bitonic_merge_two_zmm_64bit(zmm[2], zmm[3]); - bitonic_merge_two_zmm_64bit(zmm[4], zmm[5]); - bitonic_merge_two_zmm_64bit(zmm[6], zmm[7]); - bitonic_merge_four_zmm_64bit(zmm); - bitonic_merge_four_zmm_64bit(zmm + 4); - 
bitonic_merge_eight_zmm_64bit(zmm); - vtype::storeu(arr, zmm[0]); - vtype::storeu(arr + 8, zmm[1]); - vtype::storeu(arr + 16, zmm[2]); - vtype::storeu(arr + 24, zmm[3]); - vtype::mask_storeu(arr + 32, load_mask1, zmm[4]); - vtype::mask_storeu(arr + 40, load_mask2, zmm[5]); - vtype::mask_storeu(arr + 48, load_mask3, zmm[6]); - vtype::mask_storeu(arr + 56, load_mask4, zmm[7]); -} - -template -NPY_FINLINE void sort_128_64bit(type_t *arr, int32_t N) -{ - if (N <= 64) { - sort_64_64bit(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - using opmask_t = typename vtype::opmask_t; - zmm_t zmm[16]; - zmm[0] = vtype::loadu(arr); - zmm[1] = vtype::loadu(arr + 8); - zmm[2] = vtype::loadu(arr + 16); - zmm[3] = vtype::loadu(arr + 24); - zmm[4] = vtype::loadu(arr + 32); - zmm[5] = vtype::loadu(arr + 40); - zmm[6] = vtype::loadu(arr + 48); - zmm[7] = vtype::loadu(arr + 56); - zmm[0] = sort_zmm_64bit(zmm[0]); - zmm[1] = sort_zmm_64bit(zmm[1]); - zmm[2] = sort_zmm_64bit(zmm[2]); - zmm[3] = sort_zmm_64bit(zmm[3]); - zmm[4] = sort_zmm_64bit(zmm[4]); - zmm[5] = sort_zmm_64bit(zmm[5]); - zmm[6] = sort_zmm_64bit(zmm[6]); - zmm[7] = sort_zmm_64bit(zmm[7]); - opmask_t load_mask1 = 0xFF, load_mask2 = 0xFF; - opmask_t load_mask3 = 0xFF, load_mask4 = 0xFF; - opmask_t load_mask5 = 0xFF, load_mask6 = 0xFF; - opmask_t load_mask7 = 0xFF, load_mask8 = 0xFF; - if (N != 128) { - uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull; - load_mask1 = (combined_mask)&0xFF; - load_mask2 = (combined_mask >> 8) & 0xFF; - load_mask3 = (combined_mask >> 16) & 0xFF; - load_mask4 = (combined_mask >> 24) & 0xFF; - load_mask5 = (combined_mask >> 32) & 0xFF; - load_mask6 = (combined_mask >> 40) & 0xFF; - load_mask7 = (combined_mask >> 48) & 0xFF; - load_mask8 = (combined_mask >> 56) & 0xFF; - } - zmm[8] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 64); - zmm[9] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 72); - zmm[10] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 80); - zmm[11] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 88); - zmm[12] = vtype::mask_loadu(vtype::zmm_max(), load_mask5, arr + 96); - zmm[13] = vtype::mask_loadu(vtype::zmm_max(), load_mask6, arr + 104); - zmm[14] = vtype::mask_loadu(vtype::zmm_max(), load_mask7, arr + 112); - zmm[15] = vtype::mask_loadu(vtype::zmm_max(), load_mask8, arr + 120); - zmm[8] = sort_zmm_64bit(zmm[8]); - zmm[9] = sort_zmm_64bit(zmm[9]); - zmm[10] = sort_zmm_64bit(zmm[10]); - zmm[11] = sort_zmm_64bit(zmm[11]); - zmm[12] = sort_zmm_64bit(zmm[12]); - zmm[13] = sort_zmm_64bit(zmm[13]); - zmm[14] = sort_zmm_64bit(zmm[14]); - zmm[15] = sort_zmm_64bit(zmm[15]); - bitonic_merge_two_zmm_64bit(zmm[0], zmm[1]); - bitonic_merge_two_zmm_64bit(zmm[2], zmm[3]); - bitonic_merge_two_zmm_64bit(zmm[4], zmm[5]); - bitonic_merge_two_zmm_64bit(zmm[6], zmm[7]); - bitonic_merge_two_zmm_64bit(zmm[8], zmm[9]); - bitonic_merge_two_zmm_64bit(zmm[10], zmm[11]); - bitonic_merge_two_zmm_64bit(zmm[12], zmm[13]); - bitonic_merge_two_zmm_64bit(zmm[14], zmm[15]); - bitonic_merge_four_zmm_64bit(zmm); - bitonic_merge_four_zmm_64bit(zmm + 4); - bitonic_merge_four_zmm_64bit(zmm + 8); - bitonic_merge_four_zmm_64bit(zmm + 12); - bitonic_merge_eight_zmm_64bit(zmm); - bitonic_merge_eight_zmm_64bit(zmm + 8); - bitonic_merge_sixteen_zmm_64bit(zmm); - vtype::storeu(arr, zmm[0]); - vtype::storeu(arr + 8, zmm[1]); - vtype::storeu(arr + 16, zmm[2]); - vtype::storeu(arr + 24, zmm[3]); - vtype::storeu(arr + 32, zmm[4]); - vtype::storeu(arr + 40, zmm[5]); - vtype::storeu(arr + 48, zmm[6]); 
- vtype::storeu(arr + 56, zmm[7]); - vtype::mask_storeu(arr + 64, load_mask1, zmm[8]); - vtype::mask_storeu(arr + 72, load_mask2, zmm[9]); - vtype::mask_storeu(arr + 80, load_mask3, zmm[10]); - vtype::mask_storeu(arr + 88, load_mask4, zmm[11]); - vtype::mask_storeu(arr + 96, load_mask5, zmm[12]); - vtype::mask_storeu(arr + 104, load_mask6, zmm[13]); - vtype::mask_storeu(arr + 112, load_mask7, zmm[14]); - vtype::mask_storeu(arr + 120, load_mask8, zmm[15]); -} - -template -NPY_FINLINE type_t get_pivot_64bit(type_t *arr, - const int64_t left, - const int64_t right) -{ - // median of 8 - int64_t size = (right - left) / 8; - using zmm_t = typename vtype::zmm_t; - __m512i rand_index = _mm512_set_epi64(left + size, - left + 2 * size, - left + 3 * size, - left + 4 * size, - left + 5 * size, - left + 6 * size, - left + 7 * size, - left + 8 * size); - zmm_t rand_vec = vtype::template i64gather(rand_index, arr); - // pivot will never be a nan, since there are no nan's! - zmm_t sort = sort_zmm_64bit(rand_vec); - return ((type_t *)&sort)[4]; -} - -template -static void -qsort_64bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) -{ - /* - * Resort to std::sort if quicksort isnt making any progress - */ - if (max_iters <= 0) { - std::sort(arr + left, arr + right + 1); - return; - } - /* - * Base case: use bitonic networks to sort arrays <= 128 - */ - if (right + 1 - left <= 128) { - sort_128_64bit(arr + left, (int32_t)(right + 1 - left)); - return; - } - - type_t pivot = get_pivot_64bit(arr, left, right); - type_t smallest = vtype::type_max(); - type_t biggest = vtype::type_min(); - int64_t pivot_index = partition_avx512( - arr, left, right + 1, pivot, &smallest, &biggest); - if (pivot != smallest) - qsort_64bit_(arr, left, pivot_index - 1, max_iters - 1); - if (pivot != biggest) - qsort_64bit_(arr, pivot_index, right, max_iters - 1); -} - -NPY_FINLINE int64_t replace_nan_with_inf(double *arr, int64_t arrsize) -{ - int64_t nan_count = 0; - __mmask8 loadmask = 0xFF; - while (arrsize > 0) { - if (arrsize < 8) { loadmask = (0x01 << arrsize) - 0x01; } - __m512d in_zmm = _mm512_maskz_loadu_pd(loadmask, arr); - __mmask8 nanmask = _mm512_cmp_pd_mask(in_zmm, in_zmm, _CMP_NEQ_UQ); - nan_count += _mm_popcnt_u32((int32_t)nanmask); - _mm512_mask_storeu_pd(arr, nanmask, ZMM_MAX_DOUBLE); - arr += 8; - arrsize -= 8; - } - return nan_count; -} - -NPY_FINLINE void -replace_inf_with_nan(double *arr, int64_t arrsize, int64_t nan_count) -{ - for (int64_t ii = arrsize - 1; nan_count > 0; --ii) { - arr[ii] = std::nan("1"); - nan_count -= 1; - } -} - -template <> -void avx512_qsort(int64_t *arr, int64_t arrsize) -{ - if (arrsize > 1) { - qsort_64bit_, int64_t>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - } -} - -template <> -void avx512_qsort(uint64_t *arr, int64_t arrsize) -{ - if (arrsize > 1) { - qsort_64bit_, uint64_t>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - } -} - -template <> -void avx512_qsort(double *arr, int64_t arrsize) -{ - if (arrsize > 1) { - int64_t nan_count = replace_nan_with_inf(arr, arrsize); - qsort_64bit_, double>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - replace_inf_with_nan(arr, arrsize, nan_count); - } -} -#endif // __AVX512_QSORT_64BIT__ diff --git a/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h b/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h deleted file mode 100644 index 639d2f788..000000000 --- a/numpy/core/src/npysort/x86-simd-sort/src/avx512-common-qsort.h +++ /dev/null @@ -1,230 +0,0 @@ 
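/*
 * The driver shown above (qsort_64bit_ together with the avx512_qsort
 * specializations) follows an introsort-style outline; restated as a scalar
 * sketch for clarity (illustrative only, not code taken from the library):
 *
 *     sort(arr, left, right, budget):            // budget starts at 2*log2(n)
 *         if budget <= 0:      std::sort(arr + left, arr + right + 1)
 *         elif length <= 128:  bitonic network on at most 16 ZMM registers
 *         else:
 *             pivot = median of 8 evenly spaced samples, sorted in one register
 *             p = partition_avx512(arr, left, right + 1, pivot, &smallest, &biggest)
 *             if pivot != smallest: sort(arr, left, p - 1, budget - 1)
 *             if pivot != biggest:  sort(arr, p, right, budget - 1)
 *
 * The two guards rely on partition_avx512 reporting the true minimum and
 * maximum of the partitioned range: a side whose keys all equal the pivot is
 * already sorted, so skipping it avoids unbounded recursion on inputs with
 * many repeated keys. For doubles, NaNs are swapped for +inf up front by
 * replace_nan_with_inf and written back at the end by replace_inf_with_nan,
 * so the sorting kernels themselves never see a NaN.
 */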
-/******************************************************************* - * Copyright (C) 2022 Intel Corporation - * Copyright (C) 2021 Serge Sans Paille - * SPDX-License-Identifier: BSD-3-Clause - * Authors: Raghuveer Devulapalli - * Serge Sans Paille - * ****************************************************************/ - -#ifndef __AVX512_QSORT_COMMON__ -#define __AVX512_QSORT_COMMON__ - -/* - * Quicksort using AVX-512. The ideas and code are based on these two research - * papers [1] and [2]. On a high level, the idea is to vectorize quicksort - * partitioning using AVX-512 compressstore instructions. If the array size is - * < 128, then use Bitonic sorting network implemented on 512-bit registers. - * The precise network definitions depend on the dtype and are defined in - * separate files: avx512-16bit-qsort.hpp, avx512-32bit-qsort.hpp and - * avx512-64bit-qsort.hpp. Article [4] is a good resource for bitonic sorting - * network. The core implementations of the vectorized qsort functions - * avx512_qsort(T*, int64_t) are modified versions of avx2 quicksort - * presented in the paper [2] and source code associated with that paper [3]. - * - * [1] Fast and Robust Vectorized In-Place Sorting of Primitive Types - * https://drops.dagstuhl.de/opus/volltexte/2021/13775/ - * - * [2] A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel - * Skylake https://arxiv.org/pdf/1704.08579.pdf - * - * [3] https://github.com/simd-sorting/fast-and-robust: SPDX-License-Identifier: MIT - * - * [4] http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030 - * - */ - -#include "simd/simd.h" -#include -#include -#include -#include -#include - -#define X86_SIMD_SORT_INFINITY std::numeric_limits::infinity() -#define X86_SIMD_SORT_INFINITYF std::numeric_limits::infinity() -#define X86_SIMD_SORT_INFINITYH 0x7c00 -#define X86_SIMD_SORT_NEGINFINITYH 0xfc00 -#define X86_SIMD_SORT_MAX_UINT16 std::numeric_limits::max() -#define X86_SIMD_SORT_MAX_INT16 std::numeric_limits::max() -#define X86_SIMD_SORT_MIN_INT16 std::numeric_limits::min() -#define X86_SIMD_SORT_MAX_UINT32 std::numeric_limits::max() -#define X86_SIMD_SORT_MAX_INT32 std::numeric_limits::max() -#define X86_SIMD_SORT_MIN_INT32 std::numeric_limits::min() -#define X86_SIMD_SORT_MAX_UINT64 std::numeric_limits::max() -#define X86_SIMD_SORT_MAX_INT64 std::numeric_limits::max() -#define X86_SIMD_SORT_MIN_INT64 std::numeric_limits::min() -#define ZMM_MAX_DOUBLE _mm512_set1_pd(X86_SIMD_SORT_INFINITY) -#define ZMM_MAX_UINT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_UINT64) -#define ZMM_MAX_INT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_INT64) -#define ZMM_MAX_FLOAT _mm512_set1_ps(X86_SIMD_SORT_INFINITYF) -#define ZMM_MAX_UINT _mm512_set1_epi32(X86_SIMD_SORT_MAX_UINT32) -#define ZMM_MAX_INT _mm512_set1_epi32(X86_SIMD_SORT_MAX_INT32) -#define YMM_MAX_HALF _mm256_set1_epi16(X86_SIMD_SORT_INFINITYH) -#define ZMM_MAX_UINT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_UINT16) -#define ZMM_MAX_INT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_INT16) -#define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d - -template -struct vector; - -template -void avx512_qsort(T *arr, int64_t arrsize); - -template -bool comparison_func(const T &a, const T &b) -{ - return a < b; -} - -/* - * COEX == Compare and Exchange two registers by swapping min and max values - */ -template -static void COEX(mm_t &a, mm_t &b) -{ - mm_t temp = a; - a = vtype::min(a, b); - b = vtype::max(temp, b); -} - -template -static inline zmm_t cmp_merge(zmm_t 
in1, zmm_t in2, opmask_t mask) -{ - zmm_t min = vtype::min(in2, in1); - zmm_t max = vtype::max(in2, in1); - return vtype::mask_mov(min, mask, max); // 0 -> min, 1 -> max -} - -/* - * Parition one ZMM register based on the pivot and returns the index of the - * last element that is less than equal to the pivot. - */ -template -static inline int32_t partition_vec(type_t *arr, - int64_t left, - int64_t right, - const zmm_t curr_vec, - const zmm_t pivot_vec, - zmm_t *smallest_vec, - zmm_t *biggest_vec) -{ - /* which elements are larger than the pivot */ - typename vtype::opmask_t gt_mask = vtype::ge(curr_vec, pivot_vec); - int32_t amount_gt_pivot = _mm_popcnt_u32((int32_t)gt_mask); - vtype::mask_compressstoreu( - arr + left, vtype::knot_opmask(gt_mask), curr_vec); - vtype::mask_compressstoreu( - arr + right - amount_gt_pivot, gt_mask, curr_vec); - *smallest_vec = vtype::min(curr_vec, *smallest_vec); - *biggest_vec = vtype::max(curr_vec, *biggest_vec); - return amount_gt_pivot; -} - -/* - * Parition an array based on the pivot and returns the index of the - * last element that is less than equal to the pivot. - */ -template -static inline int64_t partition_avx512(type_t *arr, - int64_t left, - int64_t right, - type_t pivot, - type_t *smallest, - type_t *biggest) -{ - /* make array length divisible by vtype::numlanes , shortening the array */ - for (int32_t i = (right - left) % vtype::numlanes; i > 0; --i) { - *smallest = std::min(*smallest, arr[left], comparison_func); - *biggest = std::max(*biggest, arr[left], comparison_func); - if (!comparison_func(arr[left], pivot)) { - std::swap(arr[left], arr[--right]); - } - else { - ++left; - } - } - - if (left == right) - return left; /* less than vtype::numlanes elements in the array */ - - using zmm_t = typename vtype::zmm_t; - zmm_t pivot_vec = vtype::set1(pivot); - zmm_t min_vec = vtype::set1(*smallest); - zmm_t max_vec = vtype::set1(*biggest); - - if (right - left == vtype::numlanes) { - zmm_t vec = vtype::loadu(arr + left); - int32_t amount_gt_pivot = partition_vec(arr, - left, - left + vtype::numlanes, - vec, - pivot_vec, - &min_vec, - &max_vec); - *smallest = vtype::reducemin(min_vec); - *biggest = vtype::reducemax(max_vec); - return left + (vtype::numlanes - amount_gt_pivot); - } - - // first and last vtype::numlanes values are partitioned at the end - zmm_t vec_left = vtype::loadu(arr + left); - zmm_t vec_right = vtype::loadu(arr + (right - vtype::numlanes)); - // store points of the vectors - int64_t r_store = right - vtype::numlanes; - int64_t l_store = left; - // indices for loading the elements - left += vtype::numlanes; - right -= vtype::numlanes; - while (right - left != 0) { - zmm_t curr_vec; - /* - * if fewer elements are stored on the right side of the array, - * then next elements are loaded from the right side, - * otherwise from the left side - */ - if ((r_store + vtype::numlanes) - right < left - l_store) { - right -= vtype::numlanes; - curr_vec = vtype::loadu(arr + right); - } - else { - curr_vec = vtype::loadu(arr + left); - left += vtype::numlanes; - } - // partition the current vector and save it on both sides of the array - int32_t amount_gt_pivot - = partition_vec(arr, - l_store, - r_store + vtype::numlanes, - curr_vec, - pivot_vec, - &min_vec, - &max_vec); - ; - r_store -= amount_gt_pivot; - l_store += (vtype::numlanes - amount_gt_pivot); - } - - /* partition and save vec_left and vec_right */ - int32_t amount_gt_pivot = partition_vec(arr, - l_store, - r_store + vtype::numlanes, - vec_left, - pivot_vec, - &min_vec, - 
&max_vec); - l_store += (vtype::numlanes - amount_gt_pivot); - amount_gt_pivot = partition_vec(arr, - l_store, - l_store + vtype::numlanes, - vec_right, - pivot_vec, - &min_vec, - &max_vec); - l_store += (vtype::numlanes - amount_gt_pivot); - *smallest = vtype::reducemin(min_vec); - *biggest = vtype::reducemax(max_vec); - return l_store; -} -#endif // __AVX512_QSORT_COMMON__ -- cgit v1.2.1 From e7240dcaf24aebca83c3f642a12fa070a557b9c4 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 31 Jan 2023 10:48:15 -0800 Subject: Add x86 simd sort dispatch files to meson.build --- numpy/core/meson.build | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/numpy/core/meson.build b/numpy/core/meson.build index 27d7ab851..74d983dbb 100644 --- a/numpy/core/meson.build +++ b/numpy/core/meson.build @@ -718,7 +718,8 @@ src_multiarray = [ 'src/multiarray/usertypes.c', 'src/multiarray/vdot.c', src_file.process('src/common/npy_sort.h.src'), - 'src/npysort/x86-qsort.dispatch.cpp', + 'src/npysort/x86-qsort-skx.dispatch.cpp', + 'src/npysort/x86-qsort-icl.dispatch.cpp', 'src/npysort/quicksort.cpp', 'src/npysort/mergesort.cpp', 'src/npysort/timsort.cpp', -- cgit v1.2.1 From a2f048f4886ef3bde2caef134a89c73a84163764 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 31 Jan 2023 12:46:47 -0800 Subject: Fetch submodules in macOS and Windows build --- azure-pipelines.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 18b72f490..9a95aad5f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -184,6 +184,9 @@ stages: - script: /bin/bash -c "! vulture . --min-confidence 100 --exclude doc/,numpy/distutils/ | grep 'unreachable'" displayName: 'Check for unreachable code paths in Python modules' + - script: git submodules update --init + displayName: 'Fetch submodules' + # prefer usage of clang over gcc proper # to match likely scenario on many user mac machines - script: python setup.py build -j 4 build_src --verbose-cfg install @@ -287,6 +290,7 @@ stages: steps: - template: azure-steps-windows.yml + submodules: true - job: Linux_conda -- cgit v1.2.1 From a5d416bd60ce1067108e99951131768dfd9ee440 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 31 Jan 2023 12:56:17 -0800 Subject: Update to latest commit x86-simd-sort --- numpy/core/src/npysort/x86-simd-sort | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/core/src/npysort/x86-simd-sort b/numpy/core/src/npysort/x86-simd-sort index 0f1023bd0..7d7591cf5 160000 --- a/numpy/core/src/npysort/x86-simd-sort +++ b/numpy/core/src/npysort/x86-simd-sort @@ -1 +1 @@ -Subproject commit 0f1023bd0ffdabfe22883b85d4dfe55a6ed6ad3f +Subproject commit 7d7591cf5927e83e4a1e7c4b6f2c4dc91a97889f -- cgit v1.2.1 From 774edbd8a572067556e9860d5e5c23f73107421a Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 1 Feb 2023 11:16:13 -0800 Subject: Fix azure-pipelines.yml to checkout submodules --- azure-pipelines.yml | 3 +-- azure-steps-windows.yml | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 9a95aad5f..7657ab87f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -184,7 +184,7 @@ stages: - script: /bin/bash -c "! vulture . 
--min-confidence 100 --exclude doc/,numpy/distutils/ | grep 'unreachable'" displayName: 'Check for unreachable code paths in Python modules' - - script: git submodules update --init + - script: git submodule update --init displayName: 'Fetch submodules' # prefer usage of clang over gcc proper @@ -290,7 +290,6 @@ stages: steps: - template: azure-steps-windows.yml - submodules: true - job: Linux_conda diff --git a/azure-steps-windows.yml b/azure-steps-windows.yml index 318f46398..a147ffd7a 100644 --- a/azure-steps-windows.yml +++ b/azure-steps-windows.yml @@ -1,4 +1,6 @@ steps: +- script: git submodule update --init + displayName: 'Fetch submodules' - task: UsePythonVersion@0 inputs: versionSpec: $(PYTHON_VERSION) -- cgit v1.2.1 From b358ba4fb3c42f296466d5a6271d253e7abb7db0 Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Tue, 7 Feb 2023 17:12:29 +0200 Subject: ENH: Towards modern C++ This patch initializes new C++ headers and also brings new namespace `np::` to break away from the current approach of using C++ which tends not to be drawn into modernity. --- numpy/core/src/common/common.hpp | 11 +++++++ numpy/core/src/common/half.hpp | 63 ++++++++++++++++++++++++++++++++++++++++ numpy/core/src/common/meta.hpp | 54 ++++++++++++++++++++++++++++++++++ numpy/core/src/common/npstd.hpp | 54 ++++++++++++++++++++++++++++++++++ 4 files changed, 182 insertions(+) create mode 100644 numpy/core/src/common/common.hpp create mode 100644 numpy/core/src/common/half.hpp create mode 100644 numpy/core/src/common/meta.hpp create mode 100644 numpy/core/src/common/npstd.hpp diff --git a/numpy/core/src/common/common.hpp b/numpy/core/src/common/common.hpp new file mode 100644 index 000000000..47d790bcf --- /dev/null +++ b/numpy/core/src/common/common.hpp @@ -0,0 +1,11 @@ +#ifndef NUMPY_CORE_SRC_COMMON_COMMON_HPP +#define NUMPY_CORE_SRC_COMMON_COMMON_HPP +/* + * The following C++ headers are safe to be used standalone, however, + * they are gathered to make it easy for us and for the future need to support PCH. + */ +#include "npstd.hpp" +#include "half.hpp" +#include "meta.hpp" + +#endif // NUMPY_CORE_SRC_COMMON_COMMON_HPP diff --git a/numpy/core/src/common/half.hpp b/numpy/core/src/common/half.hpp new file mode 100644 index 000000000..399f2fa79 --- /dev/null +++ b/numpy/core/src/common/half.hpp @@ -0,0 +1,63 @@ +#ifndef NUMPY_CORE_SRC_COMMON_HALF_HPP +#define NUMPY_CORE_SRC_COMMON_HALF_HPP + +#include "npstd.hpp" + +// TODO(@seiko2plus): +// - covers half-precision operations that being supported by numpy/halffloat.h +// - support __fp16 +// - optimize x86 half<->single via cpu_fp16 +// - optimize ppc64 half<->single via cpu_vsx3 + +namespace np { + +/// @addtogroup cpp_core_types +/// @{ + +/// Provides a type that implements 16-bit floating point (half-precision). +/// This type is ensured to be 16-bit size. +class Half final { + public: + /// @name Public Constructors + /// @{ + + /// Default constructor. initialize nothing. + Half() = default; + /// Copy. + Half(const Half &r) + { + data_.u = r.data_.u; + } + + /// @} + + /// Returns a new Half constracted from the IEEE 754 binary16. + /// @param b the value of binary16. + static Half FromBits(uint16_t b) + { + Half f; + f.data_.u = b; + return f; + } + /// Returns the IEEE 754 binary16 representation. 
+ uint16_t Bits() const + { + return data_.u; + } + + private: + union { + uint16_t u; +/* +TODO(@seiko2plus): support __fp16 +#ifdef NPY_HAVE_HW_FP16 + __fp16 f; +#endif +*/ + } data_; +}; + +/// @} cpp_core_types + +} // namespace np +#endif // NUMPY_CORE_SRC_COMMON_HALF_HPP diff --git a/numpy/core/src/common/meta.hpp b/numpy/core/src/common/meta.hpp new file mode 100644 index 000000000..27ea1857e --- /dev/null +++ b/numpy/core/src/common/meta.hpp @@ -0,0 +1,54 @@ +#ifndef NUMPY_CORE_SRC_COMMON_META_HPP +#define NUMPY_CORE_SRC_COMMON_META_HPP + +#include "npstd.hpp" + +namespace np { namespace meta { +/// @addtogroup cpp_core_meta +/// @{ + +namespace details { +template +struct IntBySize; + +template +struct IntBySize { + using Type = typename std::conditional< + unsig, uint8_t, int8_t>::type; +}; +template +struct IntBySize { + using Type = typename std::conditional< + unsig, uint16_t, int16_t>::type; +}; +template +struct IntBySize { + using Type = typename std::conditional< + unsig, uint32_t, int32_t>::type; +}; +template +struct IntBySize { + using Type = typename std::conditional< + unsig, uint64_t, int64_t>::type; +}; +} // namespace details + +/// Provides safe conversion of any integer type synonyms +/// to a fixed-width integer type. +template +struct FixedWidth { + using TF_ = typename details::IntBySize< + sizeof(T), std::is_unsigned::value + >::Type; + + using Type = typename std::conditional< + std::is_integral::value, TF_, T + >::type; +}; + +/// @} cpp_core_meta + +}} // namespace np::meta + +#endif // NUMPY_CORE_SRC_COMMON_META_HPP + diff --git a/numpy/core/src/common/npstd.hpp b/numpy/core/src/common/npstd.hpp new file mode 100644 index 000000000..71993bd7c --- /dev/null +++ b/numpy/core/src/common/npstd.hpp @@ -0,0 +1,54 @@ +#ifndef NUMPY_CORE_SRC_COMMON_NPSTD_HPP +#define NUMPY_CORE_SRC_COMMON_NPSTD_HPP + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "npy_config.h" + +namespace np { +/// @addtogroup cpp_core_types +/// @{ +using std::uint8_t; +using std::int8_t; +using std::uint16_t; +using std::int16_t; +using std::uint32_t; +using std::int32_t; +using std::uint64_t; +using std::int64_t; +using std::uintptr_t; +using std::intptr_t; +using std::complex; + +/** Guard for long double. + * + * The C implementation defines long double as double + * on MinGW to provide compatibility with MSVC to unify + * one behavior under Windows OS, which makes npy_longdouble + * not fit to be used with template specialization or overloading. + * + * This type will be set to `void` when `npy_longdouble` is not defined + * as `long double`. 
+ */ +using LongDouble = typename std::conditional< + !std::is_same::value, + void, npy_longdouble +>::type; +/// @} cpp_core_types + +} // namespace np + +#endif // NUMPY_CORE_SRC_COMMON_NPSTD_HPP + -- cgit v1.2.1 From 6d26364d4ca94f86acf7c813d3a69431a75455d0 Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Tue, 7 Feb 2023 17:15:10 +0200 Subject: ENH, SIMD: reimplement CPU dispatching of qsort For a Few C++ More --- numpy/core/meson.build | 4 +- numpy/core/setup.py | 4 +- numpy/core/src/npysort/quicksort.cpp | 238 +++++---------------- numpy/core/src/npysort/simd_qsort.dispatch.cpp | 44 ++++ numpy/core/src/npysort/simd_qsort.hpp | 19 ++ .../core/src/npysort/simd_qsort_16bit.dispatch.cpp | 31 +++ numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp | 35 --- numpy/core/src/npysort/x86-qsort-icl.h | 27 --- numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp | 54 ----- numpy/core/src/npysort/x86-qsort-skx.h | 37 ---- 10 files changed, 155 insertions(+), 338 deletions(-) create mode 100644 numpy/core/src/npysort/simd_qsort.dispatch.cpp create mode 100644 numpy/core/src/npysort/simd_qsort.hpp create mode 100644 numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp delete mode 100644 numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp delete mode 100644 numpy/core/src/npysort/x86-qsort-icl.h delete mode 100644 numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp delete mode 100644 numpy/core/src/npysort/x86-qsort-skx.h diff --git a/numpy/core/meson.build b/numpy/core/meson.build index 74d983dbb..05f286a50 100644 --- a/numpy/core/meson.build +++ b/numpy/core/meson.build @@ -718,8 +718,8 @@ src_multiarray = [ 'src/multiarray/usertypes.c', 'src/multiarray/vdot.c', src_file.process('src/common/npy_sort.h.src'), - 'src/npysort/x86-qsort-skx.dispatch.cpp', - 'src/npysort/x86-qsort-icl.dispatch.cpp', + 'src/npysort/simd_qsort.dispatch.cpp', + 'src/npysort/simd_qsort_16bit.dispatch.cpp', 'src/npysort/quicksort.cpp', 'src/npysort/mergesort.cpp', 'src/npysort/timsort.cpp', diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 3ab00205f..cfae34e31 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -979,8 +979,8 @@ def configuration(parent_package='',top_path=None): if enable_avx512_qsort(): multiarray_src += [ - join('src', 'npysort', 'x86-qsort-skx.dispatch.cpp'), - join('src', 'npysort', 'x86-qsort-icl.dispatch.cpp'), + join('src', 'npysort', 'simd_qsort.dispatch.cpp'), + join('src', 'npysort', 'simd_qsort_16bit.dispatch.cpp'), ] ####################################################################### diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp index f2cada873..0e65dc9bc 100644 --- a/numpy/core/src/npysort/quicksort.cpp +++ b/numpy/core/src/npysort/quicksort.cpp @@ -54,6 +54,7 @@ #include "npysort_common.h" #include "npysort_heapsort.h" #include "numpy_tag.h" +#include "simd_qsort.hpp" #include #include @@ -68,197 +69,39 @@ #define SMALL_MERGESORT 20 #define SMALL_STRING 16 +template +inline bool quicksort_dispatch(T *start, npy_intp num) +{ + using TF = typename np::meta::FixedWidth::Type; + void (*dispfunc)(TF*, intptr_t) = nullptr; + if (sizeof(T) == sizeof(uint16_t)) { + #ifndef NPY_DISABLE_OPTIMIZATION + #include "simd_qsort_16bit.dispatch.h" + #endif + NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, ); + } + else if (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) { + #ifndef NPY_DISABLE_OPTIMIZATION + #include "simd_qsort.dispatch.h" + #endif + NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, 
); + } + if (dispfunc) { + (*dispfunc)(reinterpret_cast(start), static_cast(num)); + return true; + } + return false; +} /* ***************************************************************************** ** NUMERIC SORTS ** ***************************************************************************** */ -namespace { - -template -struct x86_dispatch { - static bool quicksort(typename Tag::type *, npy_intp) { return false; } -}; - -// Currently disabled on WIN32 only -#ifdef NPY_ENABLE_AVX512_QSORT -#include "x86-qsort-skx.h" -#include "x86-qsort-icl.h" - -#ifndef NPY_DISABLE_OPTIMIZATION -#include "x86-qsort-skx.dispatch.h" -#endif - -#if NPY_SIZEOF_LONG == 8 -template <> -struct x86_dispatch { - static bool quicksort(npy_long *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_long); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; -template <> -struct x86_dispatch { - static bool quicksort(npy_ulong *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_ulong); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; -#elif NPY_SIZEOF_LONGLONG == 8 -template <> -struct x86_dispatch { - static bool quicksort(npy_longlong *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_long); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; -template <> -struct x86_dispatch { - static bool quicksort(npy_ulonglong *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_ulong); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; -#endif // NPY_SIZEOF_LONG - -template <> -struct x86_dispatch { - static bool quicksort(npy_double *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_double); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; - -template <> -struct x86_dispatch { - static bool quicksort(npy_int *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_int); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; - -template <> -struct x86_dispatch { - static bool quicksort(npy_uint *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_uint); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; - -template <> -struct x86_dispatch { - static bool quicksort(npy_float *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_float); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; - -#ifndef NPY_DISABLE_OPTIMIZATION -#include "x86-qsort-icl.dispatch.h" -#endif - -template <> -struct x86_dispatch { - static bool quicksort(npy_half *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_half); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; - - -template <> -struct x86_dispatch { - static bool quicksort(npy_short *start, npy_intp num) - { - void 
(*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_short); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; - -template <> -struct x86_dispatch { - static bool quicksort(npy_ushort *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_ushort); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; -#endif // NPY_ENABLE_AVX512_QSORT - -} // end namespace - template static int quicksort_(type *start, npy_intp num) { - if (x86_dispatch::quicksort(start, num)) - return 0; - type vp; type *pl = start; type *pr = pl + num - 1; @@ -851,56 +694,89 @@ quicksort_ubyte(void *start, npy_intp n, void *NPY_UNUSED(varr)) NPY_NO_EXPORT int quicksort_short(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_short *)start, n)) { + return 0; + } return quicksort_((npy_short *)start, n); } NPY_NO_EXPORT int quicksort_ushort(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_ushort *)start, n)) { + return 0; + } return quicksort_((npy_ushort *)start, n); } NPY_NO_EXPORT int quicksort_int(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_int *)start, n)) { + return 0; + } return quicksort_((npy_int *)start, n); } NPY_NO_EXPORT int quicksort_uint(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_uint *)start, n)) { + return 0; + } return quicksort_((npy_uint *)start, n); } NPY_NO_EXPORT int quicksort_long(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_long *)start, n)) { + return 0; + } return quicksort_((npy_long *)start, n); } NPY_NO_EXPORT int quicksort_ulong(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_ulong *)start, n)) { + return 0; + } return quicksort_((npy_ulong *)start, n); } NPY_NO_EXPORT int quicksort_longlong(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_longlong *)start, n)) { + return 0; + } return quicksort_((npy_longlong *)start, n); } NPY_NO_EXPORT int quicksort_ulonglong(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_ulonglong *)start, n)) { + return 0; + } return quicksort_((npy_ulonglong *)start, n); } NPY_NO_EXPORT int quicksort_half(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((np::Half *)start, n)) { + return 0; + } return quicksort_((npy_half *)start, n); } NPY_NO_EXPORT int quicksort_float(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_float *)start, n)) { + return 0; + } return quicksort_((npy_float *)start, n); } NPY_NO_EXPORT int quicksort_double(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_double *)start, n)) { + return 0; + } return quicksort_((npy_double *)start, n); } NPY_NO_EXPORT int diff --git a/numpy/core/src/npysort/simd_qsort.dispatch.cpp b/numpy/core/src/npysort/simd_qsort.dispatch.cpp new file mode 100644 index 000000000..36b5d799c --- /dev/null +++ b/numpy/core/src/npysort/simd_qsort.dispatch.cpp @@ -0,0 +1,44 @@ +/*@targets + * $maxopt $keep_baseline avx512_skx + */ +// policy $keep_baseline is used to avoid skip building avx512_skx +// when its part of baseline features (--cpu-baseline), since +// 'baseline' option isn't specified within targets. 
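// Reading guide for the dispatch path above (an illustrative note, not
// generated code): quicksort_dispatch<T> first maps T to a fixed-width type TF
// through np::meta::FixedWidth, so npy_int, npy_long, npy_longlong and friends
// all funnel into the same int32_t/int64_t instantiations. The generated
// headers simd_qsort.dispatch.h / simd_qsort_16bit.dispatch.h together with
// NPY_CPU_DISPATCH_CALL_XB then assign the target-specific build of
// np::qsort_simd::QSort<TF> (avx512_skx here, avx512_icl for the 16-bit
// flavour) to `dispfunc`, guarded by a runtime CPU-feature check; if no
// suitable target is available, dispfunc stays nullptr and the wrapper falls
// back to the scalar quicksort_ template. Roughly:
//
//     dispfunc = cpu_supports(avx512_skx) ? QSort<TF> built for avx512_skx
//                                         : nullptr;
//     if (dispfunc) (*dispfunc)(reinterpret_cast<TF *>(start), num);  // SIMD path
//     else          /* scalar quicksort_ fallback in the caller */;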
+ +#include "simd_qsort.hpp" + +#ifdef NPY_HAVE_AVX512_SKX + #include "avx512-32bit-qsort.hpp" + #include "avx512-64bit-qsort.hpp" +#endif + +namespace np { namespace qsort_simd { + +#ifdef NPY_HAVE_AVX512_SKX +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int32_t *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint32_t *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int64_t *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint64_t *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(float *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +#endif // NPY_HAVE_AVX512_SKX + +}} // namespace np::simd diff --git a/numpy/core/src/npysort/simd_qsort.hpp b/numpy/core/src/npysort/simd_qsort.hpp new file mode 100644 index 000000000..7cdee774d --- /dev/null +++ b/numpy/core/src/npysort/simd_qsort.hpp @@ -0,0 +1,19 @@ +#ifndef NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP +#define NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP + +#include "common.hpp" + +namespace np { namespace qsort_simd { + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "simd_qsort.dispatch.h" +#endif +NPY_CPU_DISPATCH_DECLARE(template void QSort, (T *arr, intptr_t size)) + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "simd_qsort_16bit.dispatch.h" +#endif +NPY_CPU_DISPATCH_DECLARE(template void QSort, (T *arr, intptr_t size)) + +} } // np::qsort_simd +#endif // NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP diff --git a/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp b/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp new file mode 100644 index 000000000..a816b8781 --- /dev/null +++ b/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp @@ -0,0 +1,31 @@ +/*@targets + * $maxopt $keep_baseline avx512_icl + */ +// policy $keep_baseline is used to avoid skip building avx512_skx +// when its part of baseline features (--cpu-baseline), since +// 'baseline' option isn't specified within targets. + +#include "simd_qsort.hpp" + +#ifdef NPY_HAVE_AVX512_ICL + #include "avx512-16bit-qsort.hpp" +#endif + +namespace np { namespace qsort_simd { + +#ifdef NPY_HAVE_AVX512_ICL +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(Half *arr, intptr_t size) +{ + avx512_qsort_fp16(reinterpret_cast(arr), size); +} +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint16_t *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int16_t *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +#endif // NPY_HAVE_AVX512_ICL + +}} // namespace np::qsort_simd diff --git a/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp b/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp deleted file mode 100644 index 3dce8a9b4..000000000 --- a/numpy/core/src/npysort/x86-qsort-icl.dispatch.cpp +++ /dev/null @@ -1,35 +0,0 @@ -/*@targets - * $maxopt $keep_baseline avx512_icl - */ -// policy $keep_baseline is used to avoid skip building avx512_skx -// when its part of baseline features (--cpu-baseline), since -// 'baseline' option isn't specified within targets. 
- -#include "x86-qsort-icl.h" -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#ifdef NPY_HAVE_AVX512_ICL -#include "avx512-16bit-qsort.hpp" - -/*************************************** - * C > C++ dispatch - ***************************************/ -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_half)(void *arr, npy_intp arrsize) -{ - avx512_qsort_fp16((npy_half*)arr, arrsize); -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_short)(void *arr, npy_intp arrsize) -{ - avx512_qsort((npy_short*)arr, arrsize); -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_ushort)(void *arr, npy_intp arrsize) -{ - avx512_qsort((npy_ushort*)arr, arrsize); -} - -#endif // NPY_HAVE_AVX512_ICL diff --git a/numpy/core/src/npysort/x86-qsort-icl.h b/numpy/core/src/npysort/x86-qsort-icl.h deleted file mode 100644 index 92cef9cbc..000000000 --- a/numpy/core/src/npysort/x86-qsort-icl.h +++ /dev/null @@ -1,27 +0,0 @@ -#include "numpy/npy_common.h" - -#include "npy_cpu_dispatch.h" - -#ifndef NPY_NO_EXPORT -#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN -#endif - -#ifndef NPY_DISABLE_OPTIMIZATION -#include "x86-qsort-icl.dispatch.h" -#endif - -#ifdef __cplusplus -extern "C" { -#endif -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_half, - (void *start, npy_intp num)) - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_short, - (void *start, npy_intp num)) - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_ushort, - (void *start, npy_intp num)) - -#ifdef __cplusplus -} -#endif diff --git a/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp b/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp deleted file mode 100644 index 521b198ce..000000000 --- a/numpy/core/src/npysort/x86-qsort-skx.dispatch.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/*@targets - * $maxopt $keep_baseline avx512_skx - */ -// policy $keep_baseline is used to avoid skip building avx512_skx -// when its part of baseline features (--cpu-baseline), since -// 'baseline' option isn't specified within targets. 
- -#include "x86-qsort-skx.h" -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#ifdef NPY_HAVE_AVX512_SKX -#include "avx512-32bit-qsort.hpp" -#include "avx512-64bit-qsort.hpp" - -/*************************************** - * C > C++ dispatch - ***************************************/ -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_long)(void *arr, npy_intp arrsize) -{ - avx512_qsort((int64_t*)arr, arrsize); -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_ulong)(void *arr, npy_intp arrsize) -{ - avx512_qsort((uint64_t*)arr, arrsize); -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_double)(void *arr, npy_intp arrsize) -{ - avx512_qsort((npy_double*)arr, arrsize); -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_int)(void *arr, npy_intp arrsize) -{ - avx512_qsort((npy_int*)arr, arrsize); -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_uint)(void *arr, npy_intp arrsize) -{ - avx512_qsort((npy_uint*)arr, arrsize); -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_float)(void *arr, npy_intp arrsize) -{ - avx512_qsort((npy_float*)arr, arrsize); -} - -#endif // NPY_HAVE_AVX512_SKX diff --git a/numpy/core/src/npysort/x86-qsort-skx.h b/numpy/core/src/npysort/x86-qsort-skx.h deleted file mode 100644 index 9a5cb2c9d..000000000 --- a/numpy/core/src/npysort/x86-qsort-skx.h +++ /dev/null @@ -1,37 +0,0 @@ -#include "numpy/npy_common.h" - -#include "npy_cpu_dispatch.h" - -#ifndef NPY_NO_EXPORT -#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN -#endif - -#ifndef NPY_DISABLE_OPTIMIZATION -#include "x86-qsort-skx.dispatch.h" -#endif - -#ifdef __cplusplus -extern "C" { -#endif -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_long, - (void *start, npy_intp num)) - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_ulong, - (void *start, npy_intp num)) - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_double, - (void *start, npy_intp num)) - - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_int, - (void *start, npy_intp num)) - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_uint, - (void *start, npy_intp num)) - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_float, - (void *start, npy_intp num)) - -#ifdef __cplusplus -} -#endif -- cgit v1.2.1 From ba157435ab5c26350bb992149ae6a644a96ff06b Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Tue, 7 Feb 2023 17:16:56 +0200 Subject: ENH, SIMD: include npy_cpu_dipatch.h by npy_config.h To guarantee of having #defs NPY_HAVE_[CPU features] in the scope --- numpy/core/src/common/npy_config.h | 1 + 1 file changed, 1 insertion(+) diff --git a/numpy/core/src/common/npy_config.h b/numpy/core/src/common/npy_config.h index d6886c5ea..715b17777 100644 --- a/numpy/core/src/common/npy_config.h +++ b/numpy/core/src/common/npy_config.h @@ -2,6 +2,7 @@ #define NUMPY_CORE_SRC_COMMON_NPY_CONFIG_H_ #include "config.h" +#include "npy_cpu_dispatch.h" // brings NPY_HAVE_[CPU features] #include "numpy/numpyconfig.h" #include "numpy/utils.h" #include "numpy/npy_os.h" -- cgit v1.2.1 From 7ddb5daa866984caa78e3fa4b5cd4869f4ee94cf Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Tue, 7 Feb 2023 21:04:27 +0200 Subject: ENH, SIMD: removes #NPY_ENABLE_AVX512_QSORT and use #directives instead --- numpy/core/setup.py | 19 ++----------------- numpy/core/src/npysort/quicksort.cpp | 11 +++++++++++ numpy/core/src/npysort/simd_qsort.dispatch.cpp | 4 ++-- numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp | 4 ++-- 4 files changed, 17 insertions(+), 21 deletions(-) diff 
--git a/numpy/core/setup.py b/numpy/core/setup.py index cfae34e31..d6117f02d 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -68,14 +68,6 @@ class CallOnceOnly: out = copy.deepcopy(pickle.loads(self._check_complex)) return out -# Temporarily disable AVX512 sorting on WIN32 until we can figure -# out why it has test failures -def enable_avx512_qsort(): - enable = True - if "win32" in sysconfig.get_platform(): - enable = False - return enable - def can_link_svml(): """SVML library is supported only on x86_64 architecture and currently only on linux @@ -492,9 +484,6 @@ def configuration(parent_package='',top_path=None): if can_link_svml(): moredefs.append(('NPY_CAN_LINK_SVML', 1)) - if enable_avx512_qsort(): - moredefs.append(('NPY_ENABLE_AVX512_QSORT', 1)) - # Use bogus stride debug aid to flush out bugs where users use # strides of dimensions with length 1 to index a full contiguous # array. @@ -975,14 +964,10 @@ def configuration(parent_package='',top_path=None): # links to the arm64 npymath library, # see gh-22673 join('src', 'npymath', 'arm64_exports.c'), + join('src', 'npysort', 'simd_qsort.dispatch.cpp'), + join('src', 'npysort', 'simd_qsort_16bit.dispatch.cpp'), ] - if enable_avx512_qsort(): - multiarray_src += [ - join('src', 'npysort', 'simd_qsort.dispatch.cpp'), - join('src', 'npysort', 'simd_qsort_16bit.dispatch.cpp'), - ] - ####################################################################### # _multiarray_umath module - umath part # ####################################################################### diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp index 0e65dc9bc..625fdebbb 100644 --- a/numpy/core/src/npysort/quicksort.cpp +++ b/numpy/core/src/npysort/quicksort.cpp @@ -69,6 +69,15 @@ #define SMALL_MERGESORT 20 #define SMALL_STRING 16 +// Temporarily disable AVX512 sorting on WIN32 until we can figure +// out why it has test failures +#ifdef _MSC_VER +template +inline bool quicksort_dispatch(T*, npy_intp) +{ + return false; +} +#else template inline bool quicksort_dispatch(T *start, npy_intp num) { @@ -92,6 +101,8 @@ inline bool quicksort_dispatch(T *start, npy_intp num) } return false; } +#endif // _MSC_VER + /* ***************************************************************************** ** NUMERIC SORTS ** diff --git a/numpy/core/src/npysort/simd_qsort.dispatch.cpp b/numpy/core/src/npysort/simd_qsort.dispatch.cpp index 36b5d799c..c2ac5a2ae 100644 --- a/numpy/core/src/npysort/simd_qsort.dispatch.cpp +++ b/numpy/core/src/npysort/simd_qsort.dispatch.cpp @@ -7,14 +7,14 @@ #include "simd_qsort.hpp" -#ifdef NPY_HAVE_AVX512_SKX +#if defined(NPY_HAVE_AVX512_SKX) && !defined(_MSC_VER) #include "avx512-32bit-qsort.hpp" #include "avx512-64bit-qsort.hpp" #endif namespace np { namespace qsort_simd { -#ifdef NPY_HAVE_AVX512_SKX +#if defined(NPY_HAVE_AVX512_SKX) && !defined(_MSC_VER) template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int32_t *arr, intptr_t size) { avx512_qsort(arr, size); diff --git a/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp b/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp index a816b8781..673a2f81e 100644 --- a/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp +++ b/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp @@ -7,13 +7,13 @@ #include "simd_qsort.hpp" -#ifdef NPY_HAVE_AVX512_ICL +#if defined(NPY_HAVE_AVX512_ICL) && !defined(_MSC_VER) #include "avx512-16bit-qsort.hpp" #endif namespace np { namespace qsort_simd { -#ifdef NPY_HAVE_AVX512_ICL +#if defined(NPY_HAVE_AVX512_ICL) && 
!defined(_MSC_VER) template<> void NPY_CPU_DISPATCH_CURFX(QSort)(Half *arr, intptr_t size) { avx512_qsort_fp16(reinterpret_cast(arr), size); -- cgit v1.2.1 From 3e84a70000f27487f2cc680795620d92f2d9b3a4 Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Tue, 7 Feb 2023 21:20:09 +0200 Subject: fix up meson --- numpy/core/meson.build | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/numpy/core/meson.build b/numpy/core/meson.build index 05f286a50..fad6f462e 100644 --- a/numpy/core/meson.build +++ b/numpy/core/meson.build @@ -453,6 +453,11 @@ if cc.get_id() == 'msvc' staticlib_cflags += '-d2VolatileMetadata-' endif endif +# TODO: change to "feature" option in meson_options.txt? See +# https://mesonbuild.com/Build-options.html#build-options +if get_option('disable-simd-optimizations') + staticlib_cflags += '-DNPY_DISABLE_OPTIMIZATION' +endif npy_math_internal_h = custom_target( output: 'npy_math_internal.h', -- cgit v1.2.1 From 344fe0587ba0ed48e75eb358a3dfbbb27a013354 Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Tue, 7 Feb 2023 21:36:04 +0200 Subject: fix up up meson --- numpy/core/meson.build | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/numpy/core/meson.build b/numpy/core/meson.build index fad6f462e..eea31faac 100644 --- a/numpy/core/meson.build +++ b/numpy/core/meson.build @@ -599,7 +599,8 @@ np_core_dep = declare_dependency( '.', 'include', 'src/common', - ] + ], + compile_args: disable_simd_optimizations ) -- cgit v1.2.1 From 472a47f8ea9aa9ffe933c15ac4c0c148570b1781 Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Tue, 7 Feb 2023 21:56:43 +0200 Subject: No need for add x86-simd-sort as global directory --- numpy/core/setup.py | 1 - numpy/core/src/npysort/simd_qsort.dispatch.cpp | 4 ++-- numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/numpy/core/setup.py b/numpy/core/setup.py index d6117f02d..0793ad561 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -650,7 +650,6 @@ def configuration(parent_package='',top_path=None): config.add_include_dirs(join('src', 'multiarray')) config.add_include_dirs(join('src', 'umath')) config.add_include_dirs(join('src', 'npysort')) - config.add_include_dirs(join('src', 'npysort', 'x86-simd-sort', 'src')) config.add_include_dirs(join('src', '_simd')) config.add_define_macros([("NPY_INTERNAL_BUILD", "1")]) # this macro indicates that Numpy build is in process diff --git a/numpy/core/src/npysort/simd_qsort.dispatch.cpp b/numpy/core/src/npysort/simd_qsort.dispatch.cpp index c2ac5a2ae..101bb3dcc 100644 --- a/numpy/core/src/npysort/simd_qsort.dispatch.cpp +++ b/numpy/core/src/npysort/simd_qsort.dispatch.cpp @@ -8,8 +8,8 @@ #include "simd_qsort.hpp" #if defined(NPY_HAVE_AVX512_SKX) && !defined(_MSC_VER) - #include "avx512-32bit-qsort.hpp" - #include "avx512-64bit-qsort.hpp" + #include "x86-simd-sort/src/avx512-32bit-qsort.hpp" + #include "x86-simd-sort/src/avx512-64bit-qsort.hpp" #endif namespace np { namespace qsort_simd { diff --git a/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp b/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp index 673a2f81e..a6465a883 100644 --- a/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp +++ b/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp @@ -8,7 +8,7 @@ #include "simd_qsort.hpp" #if defined(NPY_HAVE_AVX512_ICL) && !defined(_MSC_VER) - #include "avx512-16bit-qsort.hpp" + #include "x86-simd-sort/src/avx512-16bit-qsort.hpp" #endif namespace np { namespace qsort_simd { -- cgit v1.2.1 From 
d07d5584fc63df10025190a4ea38c4863c1b1723 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli
Date: Tue, 7 Feb 2023 14:50:47 -0800
Subject: Disable on CYGWIN

---
 numpy/core/src/npysort/quicksort.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp
index 625fdebbb..7497ebaa3 100644
--- a/numpy/core/src/npysort/quicksort.cpp
+++ b/numpy/core/src/npysort/quicksort.cpp
@@ -69,9 +69,9 @@
 #define SMALL_MERGESORT 20
 #define SMALL_STRING 16
 
-// Temporarily disable AVX512 sorting on WIN32 until we can figure
-// out why it has test failures
-#ifdef _MSC_VER
+// Temporarily disable AVX512 sorting on WIN32 and CYGWIN until we can figure
+// out why it has test failures
+#if defined(_MSC_VER) || defined(__CYGWIN__)
 template <typename T>
 inline bool quicksort_dispatch(T*, npy_intp)
 {
@@ -101,7 +101,7 @@ inline bool quicksort_dispatch(T *start, npy_intp num)
     }
     return false;
 }
-#endif // _MSC_VER
+#endif // _MSC_VER || CYGWIN
 
 /*
  *****************************************************************************
--
cgit v1.2.1
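Taken together, the series replaces the per-type x86_quicksort_* C entry points with a single templated path: quicksort_dispatch normalizes NumPy's platform-dependent integer synonyms through np::meta::FixedWidth and reaches the AVX-512 kernels through np::qsort_simd::QSort only when the runtime CPU check succeeds. The snippet below is a minimal usage sketch of the two new np:: helpers on their own; it is illustrative rather than part of the patches, and it assumes it is compiled inside the NumPy source tree (so that common.hpp and npy_config.h resolve) with a C++11 compiler.

#include <cassert>
#include <type_traits>
#include "common.hpp"   // brings in npstd.hpp, half.hpp and meta.hpp

int main()
{
    // FixedWidth maps platform synonyms to fixed-width integers: `long` becomes
    // int32_t or int64_t depending on sizeof(long), which is what
    // quicksort_dispatch relies on to pick the right QSort instantiation.
    using FixedLong = np::meta::FixedWidth<long>::Type;
    static_assert(sizeof(FixedLong) == sizeof(long), "size is preserved");
    static_assert(std::is_integral<FixedLong>::value && std::is_signed<FixedLong>::value,
                  "long maps to a signed fixed-width integer");

    // np::Half only stores the raw IEEE 754 binary16 pattern; 0x3C00 encodes 1.0,
    // and the 16-bit sort path hands these raw bits straight to avx512_qsort_fp16.
    np::Half one = np::Half::FromBits(0x3C00);
    assert(one.Bits() == 0x3C00);
    return 0;
}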