diff options
author | Matti Picus <matti.picus@gmail.com> | 2020-12-23 12:04:18 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-12-23 12:04:18 +0200 |
commit | 85df388d344f4ebd70921dca0bc723770e05a37b (patch) | |
tree | fe80b510a7283f13b107b72ca5f448127d775271 | |
parent | 08d7e1163a70224b448469e072981b7347ed179d (diff) | |
parent | b3681b6af3be4220a6f380c0f34f63f77aaf4b07 (diff) | |
download | numpy-85df388d344f4ebd70921dca0bc723770e05a37b.tar.gz |
Merge pull request #17958 from Qiyu8/countnz
MAINT: Optimize the performance of count_nonzero by using universal intrinsics
-rw-r--r-- | numpy/core/src/_simd/_simd.dispatch.c.src | 16 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx2/arithmetic.h | 9 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx2/conversion.h | 15 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx512/arithmetic.h | 11 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx512/conversion.h | 39 | ||||
-rw-r--r-- | numpy/core/src/common/simd/neon/arithmetic.h | 7 | ||||
-rw-r--r-- | numpy/core/src/common/simd/neon/conversion.h | 15 | ||||
-rw-r--r-- | numpy/core/src/common/simd/sse/arithmetic.h | 8 | ||||
-rw-r--r-- | numpy/core/src/common/simd/sse/conversion.h | 17 | ||||
-rw-r--r-- | numpy/core/src/common/simd/vsx/arithmetic.h | 7 | ||||
-rw-r--r-- | numpy/core/src/common/simd/vsx/conversion.h | 19 | ||||
-rw-r--r-- | numpy/core/src/multiarray/item_selection.c | 85 | ||||
-rw-r--r-- | numpy/core/tests/test_simd.py | 28 |
13 files changed, 268 insertions, 8 deletions
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index eaff81338..af42192a9 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -15,13 +15,15 @@ /**begin repeat * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64# + * #esfx = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64# + * #expand_sup =1, 0, 1, 0, 0, 0, 0, 0, 0, 0# * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64# * #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0# * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1# * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# * #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# - * #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# + * #sum_sup = 0, 0, 0, 0, 1, 0, 0, 0, 1, 1# * #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0# * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1# * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0# @@ -323,7 +325,9 @@ SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@bsfx@, v@sfx@, v@sfx@) ***************************/ SIMD_IMPL_INTRIN_1(cvt_@sfx@_@bsfx@, v@sfx@, v@bsfx@) SIMD_IMPL_INTRIN_1(cvt_@bsfx@_@sfx@, v@bsfx@, v@sfx@) - +#if @expand_sup@ +SIMD_IMPL_INTRIN_1(expand_@esfx@_@sfx@, v@esfx@x2, v@sfx@) +#endif // expand_sup /*************************** * Arithmetic ***************************/ @@ -440,13 +444,15 @@ static PyMethodDef simd__intrinsics_methods[] = { /**begin repeat * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64# + * #esfx = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64# + * #expand_sup =1, 0, 1, 0, 0, 0, 0, 0, 0, 0# * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64# * #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0# * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1# * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# * #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# - * #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# + * #sum_sup = 0, 0, 0, 0, 1, 0, 0, 0, 1, 1# * #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0# * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1# * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0# @@ -528,7 +534,9 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@) ***************************/ SIMD_INTRIN_DEF(cvt_@sfx@_@bsfx@) SIMD_INTRIN_DEF(cvt_@bsfx@_@sfx@) - +#if @expand_sup@ +SIMD_INTRIN_DEF(expand_@esfx@_@sfx@) +#endif // expand_sup /*************************** * Arithmetic ***************************/ diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h index 3a6dc9535..3a3a82798 100644 --- a/numpy/core/src/common/simd/avx2/arithmetic.h +++ b/numpy/core/src/common/simd/avx2/arithmetic.h @@ -118,6 +118,15 @@ #endif // !NPY_HAVE_FMA3 // Horizontal add: Calculates the sum of all vector elements. +NPY_FINLINE npy_uint32 npyv_sum_u32(__m256i a) +{ + __m256i s0 = _mm256_hadd_epi32(a, a); + s0 = _mm256_hadd_epi32(s0, s0); + __m128i s1 = _mm256_extracti128_si256(s0, 1);; + s1 = _mm_add_epi32(_mm256_castsi256_si128(s0), s1); + return _mm_cvtsi128_si32(s1); +} + NPY_FINLINE float npyv_sum_f32(__m256 a) { __m256 sum_halves = _mm256_hadd_ps(a, a); diff --git a/numpy/core/src/common/simd/avx2/conversion.h b/numpy/core/src/common/simd/avx2/conversion.h index dc6b18766..64e051686 100644 --- a/numpy/core/src/common/simd/avx2/conversion.h +++ b/numpy/core/src/common/simd/avx2/conversion.h @@ -43,6 +43,21 @@ NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a) NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a) { return (npy_uint8)_mm256_movemask_pd(_mm256_castsi256_pd(a)); } +// expand +NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data) { + npyv_u16x2 r; + r.val[0] = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(data)); + r.val[1] = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(data, 1)); + return r; +} + +NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) { + npyv_u32x2 r; + r.val[0] = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(data)); + r.val[1] = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(data, 1)); + return r; +} + // round to nearest integer (assuming even) #define npyv_round_s32_f32 _mm256_cvtps_epi32 NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b) diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h index 7372ca29e..6f668f439 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -130,7 +130,7 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) #define npyv_nmulsub_f64 _mm512_fnmsub_pd /*************************** - * Reduce Sum + * Reduce Sum: Calculates the sum of all vector elements. * there are three ways to implement reduce sum for AVX512: * 1- split(256) /add /split(128) /add /hadd /hadd /extract * 2- shuff(cross) /add /shuff(cross) /add /shuff /add /shuff /add /extract @@ -144,6 +144,15 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) * The third one is almost the same as the second one but only works for * intel compiler/GCC 7.1/Clang 4, we still need to support older GCC. ***************************/ + +NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a) +{ + __m256i half = _mm256_add_epi32(npyv512_lower_si256(a), npyv512_higher_si256(a)); + __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); + quarter = _mm_hadd_epi32(quarter, quarter); + return _mm_cvtsi128_si32(_mm_hadd_epi32(quarter, quarter)); +} + #ifdef NPY_HAVE_AVX512F_REDUCE #define npyv_sum_f32 _mm512_reduce_add_ps #define npyv_sum_f64 _mm512_reduce_add_pd diff --git a/numpy/core/src/common/simd/avx512/conversion.h b/numpy/core/src/common/simd/avx512/conversion.h index 1d71d7b49..7f4ae484d 100644 --- a/numpy/core/src/common/simd/avx512/conversion.h +++ b/numpy/core/src/common/simd/avx512/conversion.h @@ -51,6 +51,45 @@ #define npyv_cvt_b32_f32(A) npyv_cvt_b32_u32(_mm512_castps_si512(A)) #define npyv_cvt_b64_f64(A) npyv_cvt_b64_u64(_mm512_castpd_si512(A)) +// expand +NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data) +{ + npyv_u16x2 r; + __m256i lo = npyv512_lower_si256(data); + __m256i hi = npyv512_higher_si256(data); +#ifdef NPY_HAVE_AVX512BW + r.val[0] = _mm512_cvtepu8_epi16(lo); + r.val[1] = _mm512_cvtepu8_epi16(hi); +#else + __m256i loelo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(lo)); + __m256i loehi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(lo, 1)); + __m256i hielo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(hi)); + __m256i hiehi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(hi, 1)); + r.val[0] = npyv512_combine_si256(loelo, loehi); + r.val[1] = npyv512_combine_si256(hielo, hiehi); +#endif + return r; +} + +NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) +{ + npyv_u32x2 r; + __m256i lo = npyv512_lower_si256(data); + __m256i hi = npyv512_higher_si256(data); +#ifdef NPY_HAVE_AVX512BW + r.val[0] = _mm512_cvtepu16_epi32(lo); + r.val[1] = _mm512_cvtepu16_epi32(hi); +#else + __m256i loelo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(lo)); + __m256i loehi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(lo, 1)); + __m256i hielo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(hi)); + __m256i hiehi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(hi, 1)); + r.val[0] = npyv512_combine_si256(loelo, loehi); + r.val[1] = npyv512_combine_si256(hielo, hiehi); +#endif + return r; +} + // convert boolean vectors to integer bitfield NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a) { diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h index 87e00d5d1..1c8bde15a 100644 --- a/numpy/core/src/common/simd/neon/arithmetic.h +++ b/numpy/core/src/common/simd/neon/arithmetic.h @@ -133,9 +133,16 @@ // Horizontal add: Calculates the sum of all vector elements. #if NPY_SIMD_F64 + #define npyv_sum_u32 vaddvq_u32 #define npyv_sum_f32 vaddvq_f32 #define npyv_sum_f64 vaddvq_f64 #else + NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a) + { + uint32x2_t a0 = vpadd_u32(vget_low_u32(a), vget_high_u32(a)); + return (unsigned)vget_lane_u32(vpadd_u32(a0, vget_high_u32(a)),0); + } + NPY_FINLINE float npyv_sum_f32(npyv_f32 a) { float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a)); diff --git a/numpy/core/src/common/simd/neon/conversion.h b/numpy/core/src/common/simd/neon/conversion.h index f92910b66..7487559d1 100644 --- a/numpy/core/src/common/simd/neon/conversion.h +++ b/numpy/core/src/common/simd/neon/conversion.h @@ -71,6 +71,21 @@ NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a) return vgetq_lane_u64(bit, 0) | ((int)vgetq_lane_u64(bit, 1) << 1); } +//expand +NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data) { + npyv_u16x2 r; + r.val[0] = vmovl_u8(vget_low_u8(data)); + r.val[1] = vmovl_u8(vget_high_u8(data)); + return r; +} + +NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) { + npyv_u32x2 r; + r.val[0] = vmovl_u16(vget_low_u16(data)); + r.val[1] = vmovl_u16(vget_high_u16(data)); + return r; +} + // round to nearest integer #if NPY_SIMD_F64 #define npyv_round_s32_f32 vcvtnq_s32_f32 diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h index 8440cc52e..faf5685d9 100644 --- a/numpy/core/src/common/simd/sse/arithmetic.h +++ b/numpy/core/src/common/simd/sse/arithmetic.h @@ -149,6 +149,14 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b) #endif // !NPY_HAVE_FMA3 // Horizontal add: Calculates the sum of all vector elements. + +NPY_FINLINE npy_uint32 npyv_sum_u32(__m128i a) +{ + __m128i t = _mm_add_epi32(a, _mm_srli_si128(a, 8)); + t = _mm_add_epi32(t, _mm_srli_si128(t, 4)); + return (unsigned)_mm_cvtsi128_si32(t); +} + NPY_FINLINE float npyv_sum_f32(__m128 a) { #ifdef NPY_HAVE_SSE3 diff --git a/numpy/core/src/common/simd/sse/conversion.h b/numpy/core/src/common/simd/sse/conversion.h index d690ec313..ab7eb4907 100644 --- a/numpy/core/src/common/simd/sse/conversion.h +++ b/numpy/core/src/common/simd/sse/conversion.h @@ -42,6 +42,23 @@ NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a) NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a) { return (npy_uint8)_mm_movemask_pd(_mm_castsi128_pd(a)); } +// expand +NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data) { + npyv_u16x2 r; + const __m128i z = _mm_setzero_si128(); + r.val[0] = _mm_unpacklo_epi8(data, z); + r.val[1] = _mm_unpackhi_epi8(data, z); + return r; +} + +NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) { + npyv_u32x2 r; + const __m128i z = _mm_setzero_si128(); + r.val[0] = _mm_unpacklo_epi16(data, z); + r.val[1] = _mm_unpackhi_epi16(data, z); + return r; +} + // round to nearest integer (assuming even) #define npyv_round_s32_f32 _mm_cvtps_epi32 NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b) diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h index 2f6762e63..1288a52a7 100644 --- a/numpy/core/src/common/simd/vsx/arithmetic.h +++ b/numpy/core/src/common/simd/vsx/arithmetic.h @@ -117,6 +117,13 @@ #define npyv_nmulsub_f64 vec_nmadd // Horizontal add: Calculates the sum of all vector elements. + +NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a) +{ + const npyv_u32 rs = vec_add(a, vec_sld(a, a, 8)); + return vec_extract(vec_add(rs, vec_sld(rs, rs, 4)), 0); +} + NPY_FINLINE float npyv_sum_f32(npyv_f32 a) { npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a)); diff --git a/numpy/core/src/common/simd/vsx/conversion.h b/numpy/core/src/common/simd/vsx/conversion.h index 72fe10e69..36bea7bba 100644 --- a/numpy/core/src/common/simd/vsx/conversion.h +++ b/numpy/core/src/common/simd/vsx/conversion.h @@ -29,6 +29,25 @@ #define npyv_cvt_b32_f32(A) ((npyv_b32) A) #define npyv_cvt_b64_f64(A) ((npyv_b64) A) +//expand +NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data) +{ + npyv_u16x2 r; + npyv_u8 zero = npyv_zero_u8(); + r.val[0] = (npyv_u16)vec_mergeh(data, zero); + r.val[1] = (npyv_u16)vec_mergel(data, zero); + return r; +} + +NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) +{ + npyv_u32x2 r; + npyv_u16 zero = npyv_zero_u16(); + r.val[0] = (npyv_u32)vec_mergeh(data, zero); + r.val[1] = (npyv_u32)vec_mergel(data, zero); + return r; +} + // convert boolean vector to integer bitfield NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a) { diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index b279ffc2f..77fff5eb4 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -4,6 +4,7 @@ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE + #include "numpy/arrayobject.h" #include "numpy/arrayscalars.h" @@ -27,7 +28,7 @@ #include "alloc.h" #include "arraytypes.h" #include "array_coercion.h" - +#include "simd/simd.h" static NPY_GCC_OPT_3 NPY_INLINE int npy_fasttake_impl( @@ -2128,6 +2129,80 @@ count_nonzero_bytes_384(const npy_uint64 * w) return r; } +#if NPY_SIMD + +/* Count the zero bytes between `*d` and `end`, updating `*d` to point to where to keep counting from. */ +static NPY_INLINE NPY_GCC_OPT_3 npyv_u8 +count_zero_bytes_u8(const npy_uint8 **d, const npy_uint8 *end, npy_uint8 max_count) +{ + const npyv_u8 vone = npyv_setall_u8(1); + const npyv_u8 vzero = npyv_zero_u8(); + + npy_intp lane_max = 0; + npyv_u8 vsum8 = npyv_zero_u8(); + while (*d < end && lane_max <= max_count - 1) { + // we count zeros because `cmpeq` cheaper than `cmpneq` for most archs + npyv_u8 vt = npyv_cvt_u8_b8(npyv_cmpeq_u8(npyv_load_u8(*d), vzero)); + vt = npyv_and_u8(vt, vone); + vsum8 = npyv_add_u8(vsum8, vt); + *d += npyv_nlanes_u8; + lane_max += 1; + } + return vsum8; +} + +static NPY_INLINE NPY_GCC_OPT_3 npyv_u16x2 +count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_count) +{ + npyv_u16x2 vsum16; + vsum16.val[0] = vsum16.val[1] = npyv_zero_u16(); + npy_intp lane_max = 0; + while (*d < end && lane_max <= max_count - NPY_MAX_UINT8) { + npyv_u8 vsum8 = count_zero_bytes_u8(d, end, NPY_MAX_UINT8); + npyv_u16x2 part = npyv_expand_u16_u8(vsum8); + vsum16.val[0] = npyv_add_u16(vsum16.val[0], part.val[0]); + vsum16.val[1] = npyv_add_u16(vsum16.val[1], part.val[1]); + lane_max += NPY_MAX_UINT8; + } + return vsum16; +} + +/* + * Counts the number of non-zero values in a raw array. + * The one loop process is shown below(take SSE2 with 128bits vector for example): + * |------------16 lanes---------| + *[vsum8] 255 255 255 ... 255 255 255 255 count_zero_bytes_u8: counting 255*16 elements + * !! + * |------------8 lanes---------| + *[vsum16] 65535 65535 65535 ... 65535 count_zero_bytes_u16: counting (2*16-1)*16 elements + * 65535 65535 65535 ... 65535 + * !! + * |------------4 lanes---------| + *[sum_32_0] 65535 65535 65535 65535 count_nonzero_bytes + * 65535 65535 65535 65535 + *[sum_32_1] 65535 65535 65535 65535 + * 65535 65535 65535 65535 + * !! + * (2*16-1)*16 +*/ +static NPY_INLINE NPY_GCC_OPT_3 npy_intp +count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx) +{ + npy_intp zero_count = 0; + const npy_uint8 *end = d + unrollx; + while (d < end) { + npyv_u16x2 vsum16 = count_zero_bytes_u16(&d, end, NPY_MAX_UINT16); + npyv_u32x2 sum_32_0 = npyv_expand_u32_u16(vsum16.val[0]); + npyv_u32x2 sum_32_1 = npyv_expand_u32_u16(vsum16.val[1]); + zero_count += npyv_sum_u32(npyv_add_u32( + npyv_add_u32(sum_32_0.val[0], sum_32_0.val[1]), + npyv_add_u32(sum_32_1.val[0], sum_32_1.val[1]) + )); + } + return unrollx - zero_count; +} + +#endif /* * Counts the number of True values in a raw boolean array. This * is a low-overhead function which does no heap allocations. @@ -2137,6 +2212,7 @@ count_nonzero_bytes_384(const npy_uint64 * w) NPY_NO_EXPORT npy_intp count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const *astrides) { + int idim; npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS]; npy_intp i, coord[NPY_MAXDIMS]; @@ -2158,13 +2234,17 @@ count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const } NPY_BEGIN_THREADS_THRESHOLDED(shape[0]); - /* Special case for contiguous inner loop */ if (strides[0] == 1) { NPY_RAW_ITER_START(idim, ndim, coord, shape) { /* Process the innermost dimension */ const char *d = data; const char *e = data + shape[0]; +#if NPY_SIMD + npy_uintp stride = shape[0] & -npyv_nlanes_u8; + count += count_nonzero_bytes((const npy_uint8 *)d, stride); + d += stride; +#else if (NPY_CPU_HAVE_UNALIGNED_ACCESS || npy_is_aligned(d, sizeof(npy_uint64))) { npy_uintp stride = 6 * sizeof(npy_uint64); @@ -2172,6 +2252,7 @@ count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const count += count_nonzero_bytes_384((const npy_uint64 *)d); } } +#endif for (; d < e; ++d) { count += (*d != 0); } diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py index 2f378667d..71356f812 100644 --- a/numpy/core/tests/test_simd.py +++ b/numpy/core/tests/test_simd.py @@ -663,6 +663,26 @@ class _SIMD_ALL(_Test_Utility): true_vsfx = from_boolean(true_vb) assert false_vsfx != true_vsfx + def test_conversion_expand(self): + """ + Test expand intrinics: + npyv_expand_u16_u8 + npyv_expand_u32_u16 + """ + if self.sfx not in ("u8", "u16"): + return + totype = self.sfx[0]+str(int(self.sfx[1:])*2) + expand = getattr(self.npyv, f"expand_{totype}_{self.sfx}") + # close enough from the edge to detect any deviation + data = self._data(self._int_max() - self.nlanes) + vdata = self.load(data) + edata = expand(vdata) + # lower half part + data_lo = data[:self.nlanes//2] + # higher half part + data_hi = data[self.nlanes//2:] + assert edata == (data_lo, data_hi) + def test_arithmetic_subadd(self): if self._is_fp(): data_a = self._data() @@ -707,7 +727,13 @@ class _SIMD_ALL(_Test_Utility): assert div == data_div def test_arithmetic_reduce_sum(self): - if not self._is_fp(): + """ + Test reduce sum intrinics: + npyv_sum_u32 + npyv_sum_f32 + npyv_sum_f64 + """ + if self.sfx not in ("u32", "f32", "f64"): return # reduce sum data = self._data() |