author     Matti Picus <matti.picus@gmail.com>        2020-12-23 12:04:18 +0200
committer  GitHub <noreply@github.com>                2020-12-23 12:04:18 +0200
commit     85df388d344f4ebd70921dca0bc723770e05a37b (patch)
tree       fe80b510a7283f13b107b72ca5f448127d775271
parent     08d7e1163a70224b448469e072981b7347ed179d (diff)
parent     b3681b6af3be4220a6f380c0f34f63f77aaf4b07 (diff)
download   numpy-85df388d344f4ebd70921dca0bc723770e05a37b.tar.gz
Merge pull request #17958 from Qiyu8/countnz
MAINT: Optimize the performance of count_nonzero by using universal intrinsics
-rw-r--r--  numpy/core/src/_simd/_simd.dispatch.c.src       16
-rw-r--r--  numpy/core/src/common/simd/avx2/arithmetic.h      9
-rw-r--r--  numpy/core/src/common/simd/avx2/conversion.h     15
-rw-r--r--  numpy/core/src/common/simd/avx512/arithmetic.h   11
-rw-r--r--  numpy/core/src/common/simd/avx512/conversion.h   39
-rw-r--r--  numpy/core/src/common/simd/neon/arithmetic.h      7
-rw-r--r--  numpy/core/src/common/simd/neon/conversion.h     15
-rw-r--r--  numpy/core/src/common/simd/sse/arithmetic.h       8
-rw-r--r--  numpy/core/src/common/simd/sse/conversion.h      17
-rw-r--r--  numpy/core/src/common/simd/vsx/arithmetic.h       7
-rw-r--r--  numpy/core/src/common/simd/vsx/conversion.h      19
-rw-r--r--  numpy/core/src/multiarray/item_selection.c       85
-rw-r--r--  numpy/core/tests/test_simd.py                    28
13 files changed, 268 insertions, 8 deletions
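
For orientation, the contract every new SIMD kernel in this patch must honor is the plain per-byte count that the scalar tail loop in count_boolean_trues already performs. A minimal standalone sketch of that contract (the function name is illustrative, not part of the patch):

#include <stddef.h>

/* Scalar reference: count the non-zero bytes in a contiguous buffer.
 * The vectorized path below must produce exactly this result, only faster. */
static size_t count_nonzero_bytes_ref(const unsigned char *d, size_t n)
{
    size_t count = 0;
    for (size_t i = 0; i < n; ++i) {
        count += (d[i] != 0);
    }
    return count;
}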
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index eaff81338..af42192a9 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -15,13 +15,15 @@
/**begin repeat
* #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
* #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
+ * #esfx = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64#
+ * #expand_sup =1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
* #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
* #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
* #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
* #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
- * #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #sum_sup = 0, 0, 0, 0, 1, 0, 0, 0, 1, 1#
* #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
@@ -323,7 +325,9 @@ SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@bsfx@, v@sfx@, v@sfx@)
***************************/
SIMD_IMPL_INTRIN_1(cvt_@sfx@_@bsfx@, v@sfx@, v@bsfx@)
SIMD_IMPL_INTRIN_1(cvt_@bsfx@_@sfx@, v@bsfx@, v@sfx@)
-
+#if @expand_sup@
+SIMD_IMPL_INTRIN_1(expand_@esfx@_@sfx@, v@esfx@x2, v@sfx@)
+#endif // expand_sup
/***************************
* Arithmetic
***************************/
@@ -440,13 +444,15 @@ static PyMethodDef simd__intrinsics_methods[] = {
/**begin repeat
* #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
* #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
+ * #esfx = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64#
+ * #expand_sup =1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
* #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
* #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
* #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
* #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
- * #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #sum_sup = 0, 0, 0, 0, 1, 0, 0, 0, 1, 1#
* #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
@@ -528,7 +534,9 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
***************************/
SIMD_INTRIN_DEF(cvt_@sfx@_@bsfx@)
SIMD_INTRIN_DEF(cvt_@bsfx@_@sfx@)
-
+#if @expand_sup@
+SIMD_INTRIN_DEF(expand_@esfx@_@sfx@)
+#endif // expand_sup
/***************************
* Arithmetic
***************************/
diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h
index 3a6dc9535..3a3a82798 100644
--- a/numpy/core/src/common/simd/avx2/arithmetic.h
+++ b/numpy/core/src/common/simd/avx2/arithmetic.h
@@ -118,6 +118,15 @@
#endif // !NPY_HAVE_FMA3
// Horizontal add: Calculates the sum of all vector elements.
+NPY_FINLINE npy_uint32 npyv_sum_u32(__m256i a)
+{
+ __m256i s0 = _mm256_hadd_epi32(a, a);
+ s0 = _mm256_hadd_epi32(s0, s0);
+    __m128i s1 = _mm256_extracti128_si256(s0, 1);
+ s1 = _mm_add_epi32(_mm256_castsi256_si128(s0), s1);
+ return _mm_cvtsi128_si32(s1);
+}
+
NPY_FINLINE float npyv_sum_f32(__m256 a)
{
__m256 sum_halves = _mm256_hadd_ps(a, a);
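
Across every backend touched by this patch (AVX2 above, then AVX512, NEON, SSE and VSX below), the new `npyv_sum_u32` has the same contract: reduce all unsigned 32-bit lanes to one scalar sum. A minimal scalar model of that contract, with the lane count left as a parameter (8 lanes for AVX2, 16 for AVX512, 4 for the 128-bit targets); names here are illustrative:

#include <stdint.h>

/* Scalar model of npyv_sum_u32: add up every 32-bit lane of a vector.
 * `lanes`/`nlanes` stand in for the register contents and npyv_nlanes_u32. */
static uint32_t sum_u32_model(const uint32_t *lanes, int nlanes)
{
    uint32_t total = 0;
    for (int i = 0; i < nlanes; ++i) {
        total += lanes[i];
    }
    return total;
}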
diff --git a/numpy/core/src/common/simd/avx2/conversion.h b/numpy/core/src/common/simd/avx2/conversion.h
index dc6b18766..64e051686 100644
--- a/numpy/core/src/common/simd/avx2/conversion.h
+++ b/numpy/core/src/common/simd/avx2/conversion.h
@@ -43,6 +43,21 @@ NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
{ return (npy_uint8)_mm256_movemask_pd(_mm256_castsi256_pd(a)); }
+// expand
+NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data) {
+ npyv_u16x2 r;
+ r.val[0] = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(data));
+ r.val[1] = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(data, 1));
+ return r;
+}
+
+NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) {
+ npyv_u32x2 r;
+ r.val[0] = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(data));
+ r.val[1] = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(data, 1));
+ return r;
+}
+
// round to nearest integer (assuming even)
#define npyv_round_s32_f32 _mm256_cvtps_epi32
NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
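
The expand intrinsics likewise share one contract on all backends: zero-extend each lane to twice its width, returning the low input half in `val[0]` and the high input half in `val[1]`. A scalar model (names illustrative), assuming `nlanes` is the number of 8-bit input lanes (16 for SSE/NEON/VSX, 32 for AVX2, 64 for AVX512); `npyv_expand_u32_u16` follows the same pattern one step wider:

#include <stdint.h>

/* Scalar model of npyv_expand_u16_u8: widen 2*half u8 lanes into two u16 halves. */
static void expand_u16_u8_model(const uint8_t *in, int nlanes,
                                uint16_t *out_lo, uint16_t *out_hi)
{
    int half = nlanes / 2;
    for (int i = 0; i < half; ++i) {
        out_lo[i] = (uint16_t)in[i];         /* low input half  -> val[0] */
        out_hi[i] = (uint16_t)in[half + i];  /* high input half -> val[1] */
    }
}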
diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h
index 7372ca29e..6f668f439 100644
--- a/numpy/core/src/common/simd/avx512/arithmetic.h
+++ b/numpy/core/src/common/simd/avx512/arithmetic.h
@@ -130,7 +130,7 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
#define npyv_nmulsub_f64 _mm512_fnmsub_pd
/***************************
- * Reduce Sum
+ * Reduce Sum: Calculates the sum of all vector elements.
* there are three ways to implement reduce sum for AVX512:
* 1- split(256) /add /split(128) /add /hadd /hadd /extract
* 2- shuff(cross) /add /shuff(cross) /add /shuff /add /shuff /add /extract
@@ -144,6 +144,15 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
* The third one is almost the same as the second one but only works for
* intel compiler/GCC 7.1/Clang 4, we still need to support older GCC.
***************************/
+
+NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a)
+{
+ __m256i half = _mm256_add_epi32(npyv512_lower_si256(a), npyv512_higher_si256(a));
+ __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
+ quarter = _mm_hadd_epi32(quarter, quarter);
+ return _mm_cvtsi128_si32(_mm_hadd_epi32(quarter, quarter));
+}
+
#ifdef NPY_HAVE_AVX512F_REDUCE
#define npyv_sum_f32 _mm512_reduce_add_ps
#define npyv_sum_f64 _mm512_reduce_add_pd
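
The comment block above lists the alternative reduction strategies. Where the compiler already provides the AVX512 reduce sequence intrinsics, gated here as NPY_HAVE_AVX512F_REDUCE for the float sums, the integer case could in principle collapse to a single call. This is a hedged sketch only, assuming `_mm512_reduce_add_epi32` is available under that same gate; the patch itself keeps the explicit split/hadd chain for all compilers:

#ifdef NPY_HAVE_AVX512F_REDUCE
    /* compiler-provided sequence intrinsic, equivalent to the manual
     * split(256) / split(128) / hadd chain above */
    #define npyv_sum_u32 _mm512_reduce_add_epi32
#endif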
diff --git a/numpy/core/src/common/simd/avx512/conversion.h b/numpy/core/src/common/simd/avx512/conversion.h
index 1d71d7b49..7f4ae484d 100644
--- a/numpy/core/src/common/simd/avx512/conversion.h
+++ b/numpy/core/src/common/simd/avx512/conversion.h
@@ -51,6 +51,45 @@
#define npyv_cvt_b32_f32(A) npyv_cvt_b32_u32(_mm512_castps_si512(A))
#define npyv_cvt_b64_f64(A) npyv_cvt_b64_u64(_mm512_castpd_si512(A))
+// expand
+NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data)
+{
+ npyv_u16x2 r;
+ __m256i lo = npyv512_lower_si256(data);
+ __m256i hi = npyv512_higher_si256(data);
+#ifdef NPY_HAVE_AVX512BW
+ r.val[0] = _mm512_cvtepu8_epi16(lo);
+ r.val[1] = _mm512_cvtepu8_epi16(hi);
+#else
+ __m256i loelo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(lo));
+ __m256i loehi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(lo, 1));
+ __m256i hielo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(hi));
+ __m256i hiehi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(hi, 1));
+ r.val[0] = npyv512_combine_si256(loelo, loehi);
+ r.val[1] = npyv512_combine_si256(hielo, hiehi);
+#endif
+ return r;
+}
+
+NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data)
+{
+ npyv_u32x2 r;
+ __m256i lo = npyv512_lower_si256(data);
+ __m256i hi = npyv512_higher_si256(data);
+#ifdef NPY_HAVE_AVX512BW
+ r.val[0] = _mm512_cvtepu16_epi32(lo);
+ r.val[1] = _mm512_cvtepu16_epi32(hi);
+#else
+ __m256i loelo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(lo));
+ __m256i loehi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(lo, 1));
+ __m256i hielo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(hi));
+ __m256i hiehi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(hi, 1));
+ r.val[0] = npyv512_combine_si256(loelo, loehi);
+ r.val[1] = npyv512_combine_si256(hielo, hiehi);
+#endif
+ return r;
+}
+
// convert boolean vectors to integer bitfield
NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
{
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
index 87e00d5d1..1c8bde15a 100644
--- a/numpy/core/src/common/simd/neon/arithmetic.h
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -133,9 +133,16 @@
// Horizontal add: Calculates the sum of all vector elements.
#if NPY_SIMD_F64
+ #define npyv_sum_u32 vaddvq_u32
#define npyv_sum_f32 vaddvq_f32
#define npyv_sum_f64 vaddvq_f64
#else
+ NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a)
+ {
+ uint32x2_t a0 = vpadd_u32(vget_low_u32(a), vget_high_u32(a));
+ return (unsigned)vget_lane_u32(vpadd_u32(a0, vget_high_u32(a)),0);
+ }
+
NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
{
float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a));
diff --git a/numpy/core/src/common/simd/neon/conversion.h b/numpy/core/src/common/simd/neon/conversion.h
index f92910b66..7487559d1 100644
--- a/numpy/core/src/common/simd/neon/conversion.h
+++ b/numpy/core/src/common/simd/neon/conversion.h
@@ -71,6 +71,21 @@ NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
return vgetq_lane_u64(bit, 0) | ((int)vgetq_lane_u64(bit, 1) << 1);
}
+// expand
+NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data) {
+ npyv_u16x2 r;
+ r.val[0] = vmovl_u8(vget_low_u8(data));
+ r.val[1] = vmovl_u8(vget_high_u8(data));
+ return r;
+}
+
+NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) {
+ npyv_u32x2 r;
+ r.val[0] = vmovl_u16(vget_low_u16(data));
+ r.val[1] = vmovl_u16(vget_high_u16(data));
+ return r;
+}
+
// round to nearest integer
#if NPY_SIMD_F64
#define npyv_round_s32_f32 vcvtnq_s32_f32
diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h
index 8440cc52e..faf5685d9 100644
--- a/numpy/core/src/common/simd/sse/arithmetic.h
+++ b/numpy/core/src/common/simd/sse/arithmetic.h
@@ -149,6 +149,14 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b)
#endif // !NPY_HAVE_FMA3
// Horizontal add: Calculates the sum of all vector elements.
+
+NPY_FINLINE npy_uint32 npyv_sum_u32(__m128i a)
+{
+ __m128i t = _mm_add_epi32(a, _mm_srli_si128(a, 8));
+ t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
+ return (unsigned)_mm_cvtsi128_si32(t);
+}
+
NPY_FINLINE float npyv_sum_f32(__m128 a)
{
#ifdef NPY_HAVE_SSE3
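
For reference, the lane arithmetic behind the SSE `npyv_sum_u32` above, written out as a trace (lane labels are illustrative):

/* a                     = {a0, a1, a2, a3}
 * _mm_srli_si128(a, 8)  = {a2, a3,  0,  0}       shift right by 8 bytes = 2 lanes
 * t = add               = {a0+a2, a1+a3, a2, a3}
 * _mm_srli_si128(t, 4)  = {a1+a3, a2, a3, 0}     shift right by 4 bytes = 1 lane
 * t = add               = {a0+a1+a2+a3, ...}     lane 0 now holds the full sum
 * _mm_cvtsi128_si32(t)  -> extracts lane 0
 */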
diff --git a/numpy/core/src/common/simd/sse/conversion.h b/numpy/core/src/common/simd/sse/conversion.h
index d690ec313..ab7eb4907 100644
--- a/numpy/core/src/common/simd/sse/conversion.h
+++ b/numpy/core/src/common/simd/sse/conversion.h
@@ -42,6 +42,23 @@ NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
{ return (npy_uint8)_mm_movemask_pd(_mm_castsi128_pd(a)); }
+// expand
+NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data) {
+ npyv_u16x2 r;
+ const __m128i z = _mm_setzero_si128();
+ r.val[0] = _mm_unpacklo_epi8(data, z);
+ r.val[1] = _mm_unpackhi_epi8(data, z);
+ return r;
+}
+
+NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) {
+ npyv_u32x2 r;
+ const __m128i z = _mm_setzero_si128();
+ r.val[0] = _mm_unpacklo_epi16(data, z);
+ r.val[1] = _mm_unpackhi_epi16(data, z);
+ return r;
+}
+
// round to nearest integer (assuming even)
#define npyv_round_s32_f32 _mm_cvtps_epi32
NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
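
The SSE expand helpers rely on interleaving with zero rather than a dedicated widening instruction; a short trace of why that amounts to zero-extension (byte values illustrative, lanes little-endian):

/* data                          = {b0, b1, ..., b15}            sixteen u8 lanes
 * _mm_unpacklo_epi8(data, zero) = {b0, 0, b1, 0, ..., b7, 0}     byte interleave
 *   read as eight u16 lanes     = {b0, b1, ..., b7}              i.e. zero-extended
 * _mm_unpackhi_epi8(data, zero) does the same for b8..b15.
 */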
diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h
index 2f6762e63..1288a52a7 100644
--- a/numpy/core/src/common/simd/vsx/arithmetic.h
+++ b/numpy/core/src/common/simd/vsx/arithmetic.h
@@ -117,6 +117,13 @@
#define npyv_nmulsub_f64 vec_nmadd
// Horizontal add: Calculates the sum of all vector elements.
+
+NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a)
+{
+ const npyv_u32 rs = vec_add(a, vec_sld(a, a, 8));
+ return vec_extract(vec_add(rs, vec_sld(rs, rs, 4)), 0);
+}
+
NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
{
npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a));
diff --git a/numpy/core/src/common/simd/vsx/conversion.h b/numpy/core/src/common/simd/vsx/conversion.h
index 72fe10e69..36bea7bba 100644
--- a/numpy/core/src/common/simd/vsx/conversion.h
+++ b/numpy/core/src/common/simd/vsx/conversion.h
@@ -29,6 +29,25 @@
#define npyv_cvt_b32_f32(A) ((npyv_b32) A)
#define npyv_cvt_b64_f64(A) ((npyv_b64) A)
+// expand
+NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data)
+{
+ npyv_u16x2 r;
+ npyv_u8 zero = npyv_zero_u8();
+ r.val[0] = (npyv_u16)vec_mergeh(data, zero);
+ r.val[1] = (npyv_u16)vec_mergel(data, zero);
+ return r;
+}
+
+NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data)
+{
+ npyv_u32x2 r;
+ npyv_u16 zero = npyv_zero_u16();
+ r.val[0] = (npyv_u32)vec_mergeh(data, zero);
+ r.val[1] = (npyv_u32)vec_mergel(data, zero);
+ return r;
+}
+
// convert boolean vector to integer bitfield
NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
{
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index b279ffc2f..77fff5eb4 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -4,6 +4,7 @@
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#define _MULTIARRAYMODULE
+
#include "numpy/arrayobject.h"
#include "numpy/arrayscalars.h"
@@ -27,7 +28,7 @@
#include "alloc.h"
#include "arraytypes.h"
#include "array_coercion.h"
-
+#include "simd/simd.h"
static NPY_GCC_OPT_3 NPY_INLINE int
npy_fasttake_impl(
@@ -2128,6 +2129,80 @@ count_nonzero_bytes_384(const npy_uint64 * w)
return r;
}
+#if NPY_SIMD
+
+/* Count the zero bytes between `*d` and `end`, updating `*d` to point to where to keep counting from. */
+static NPY_INLINE NPY_GCC_OPT_3 npyv_u8
+count_zero_bytes_u8(const npy_uint8 **d, const npy_uint8 *end, npy_uint8 max_count)
+{
+ const npyv_u8 vone = npyv_setall_u8(1);
+ const npyv_u8 vzero = npyv_zero_u8();
+
+ npy_intp lane_max = 0;
+ npyv_u8 vsum8 = npyv_zero_u8();
+ while (*d < end && lane_max <= max_count - 1) {
+        // we count zeros because `cmpeq` is cheaper than `cmpneq` for most archs
+ npyv_u8 vt = npyv_cvt_u8_b8(npyv_cmpeq_u8(npyv_load_u8(*d), vzero));
+ vt = npyv_and_u8(vt, vone);
+ vsum8 = npyv_add_u8(vsum8, vt);
+ *d += npyv_nlanes_u8;
+ lane_max += 1;
+ }
+ return vsum8;
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 npyv_u16x2
+count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_count)
+{
+ npyv_u16x2 vsum16;
+ vsum16.val[0] = vsum16.val[1] = npyv_zero_u16();
+ npy_intp lane_max = 0;
+ while (*d < end && lane_max <= max_count - NPY_MAX_UINT8) {
+ npyv_u8 vsum8 = count_zero_bytes_u8(d, end, NPY_MAX_UINT8);
+ npyv_u16x2 part = npyv_expand_u16_u8(vsum8);
+ vsum16.val[0] = npyv_add_u16(vsum16.val[0], part.val[0]);
+ vsum16.val[1] = npyv_add_u16(vsum16.val[1], part.val[1]);
+ lane_max += NPY_MAX_UINT8;
+ }
+ return vsum16;
+}
+
+/*
+ * Counts the number of non-zero values in a raw array.
+ * One iteration of the outer loop is shown below (taking SSE2 with 128-bit vectors as an example):
+ *
+ *              |---------------16 lanes---------------|
+ * [vsum8]       255 255 255 ... 255 255 255 255 255      count_zero_bytes_u8: counts up to 255*16 elements
+ *                                !!
+ *              |----------------8 lanes---------------|
+ * [vsum16]      65535 65535 65535 ...     65535 65535    count_zero_bytes_u16: counts up to (2^16-1)*16 elements
+ *               65535 65535 65535 ...     65535 65535
+ *                                !!
+ *              |----------------4 lanes---------------|
+ * [sum_32_0]    65535 65535 65535 65535                   count_nonzero_bytes: widens to u32 and reduces
+ *               65535 65535 65535 65535
+ * [sum_32_1]    65535 65535 65535 65535
+ *               65535 65535 65535 65535
+ *                                !!
+ *               total counted per outer iteration: up to (2^16-1)*16
+ */
+static NPY_INLINE NPY_GCC_OPT_3 npy_intp
+count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx)
+{
+ npy_intp zero_count = 0;
+ const npy_uint8 *end = d + unrollx;
+ while (d < end) {
+ npyv_u16x2 vsum16 = count_zero_bytes_u16(&d, end, NPY_MAX_UINT16);
+ npyv_u32x2 sum_32_0 = npyv_expand_u32_u16(vsum16.val[0]);
+ npyv_u32x2 sum_32_1 = npyv_expand_u32_u16(vsum16.val[1]);
+ zero_count += npyv_sum_u32(npyv_add_u32(
+ npyv_add_u32(sum_32_0.val[0], sum_32_0.val[1]),
+ npyv_add_u32(sum_32_1.val[0], sum_32_1.val[1])
+ ));
+ }
+ return unrollx - zero_count;
+}
+
+#endif
/*
* Counts the number of True values in a raw boolean array. This
* is a low-overhead function which does no heap allocations.
@@ -2137,6 +2212,7 @@ count_nonzero_bytes_384(const npy_uint64 * w)
NPY_NO_EXPORT npy_intp
count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const *astrides)
{
+
int idim;
npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
npy_intp i, coord[NPY_MAXDIMS];
@@ -2158,13 +2234,17 @@ count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const
}
NPY_BEGIN_THREADS_THRESHOLDED(shape[0]);
-
/* Special case for contiguous inner loop */
if (strides[0] == 1) {
NPY_RAW_ITER_START(idim, ndim, coord, shape) {
/* Process the innermost dimension */
const char *d = data;
const char *e = data + shape[0];
+#if NPY_SIMD
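+            /*
+             * Round the inner-loop length down to a multiple of the vector width,
+             * e.g. with 16-byte vectors: shape[0] = 100 -> 100 & -16 = 96; the
+             * scalar loop below then handles the remaining 4 bytes.
+             */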
+ npy_uintp stride = shape[0] & -npyv_nlanes_u8;
+ count += count_nonzero_bytes((const npy_uint8 *)d, stride);
+ d += stride;
+#else
if (NPY_CPU_HAVE_UNALIGNED_ACCESS ||
npy_is_aligned(d, sizeof(npy_uint64))) {
npy_uintp stride = 6 * sizeof(npy_uint64);
@@ -2172,6 +2252,7 @@ count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const
count += count_nonzero_bytes_384((const npy_uint64 *)d);
}
}
+#endif
for (; d < e; ++d) {
count += (*d != 0);
}
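
To make the overflow reasoning in the new kernels explicit, here is a scalar model of the hierarchical accumulation, a sketch only (a lane count of 16 mirrors a 128-bit register; all names are illustrative, this is not the patched code): each 8-bit counter is bumped at most 255 times before being flushed into a 16-bit counter, and the 16-bit counters are flushed into the running total before they in turn can wrap, which is exactly what the `lane_max` bounds in count_zero_bytes_u8/u16 guarantee.

#include <stdint.h>
#include <stddef.h>

#define LANES 16  /* stands in for npyv_nlanes_u8 on a 128-bit target */

/* Scalar model of count_nonzero_bytes(); `n` must be a multiple of LANES,
 * just as `unrollx` is a multiple of npyv_nlanes_u8 in the real code. */
static size_t count_nonzero_model(const uint8_t *d, size_t n)
{
    size_t zeros = 0;
    size_t i = 0;
    while (i < n) {
        uint16_t sum16[LANES] = {0};
        /* u16 stage: 65535/255 = 257 u8 rounds fit without wrapping a u16 lane */
        for (int r16 = 0; r16 < 65535 / 255 && i < n; ++r16) {
            uint8_t sum8[LANES] = {0};
            /* u8 stage: at most 255 rounds, so a u8 lane can never wrap */
            for (int r8 = 0; r8 < 255 && i < n; ++r8, i += LANES) {
                for (int l = 0; l < LANES; ++l) {
                    sum8[l] += (d[i + l] == 0);   /* count zeros, like npyv_cmpeq_u8 */
                }
            }
            for (int l = 0; l < LANES; ++l) {
                sum16[l] += sum8[l];              /* widen, like npyv_expand_u16_u8 */
            }
        }
        for (int l = 0; l < LANES; ++l) {
            zeros += sum16[l];                    /* widen + reduce, like npyv_sum_u32 */
        }
    }
    return n - zeros;                             /* non-zeros = total - zeros */
}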
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 2f378667d..71356f812 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -663,6 +663,26 @@ class _SIMD_ALL(_Test_Utility):
true_vsfx = from_boolean(true_vb)
assert false_vsfx != true_vsfx
+ def test_conversion_expand(self):
+ """
+        Test expand intrinsics:
+ npyv_expand_u16_u8
+ npyv_expand_u32_u16
+ """
+ if self.sfx not in ("u8", "u16"):
+ return
+ totype = self.sfx[0]+str(int(self.sfx[1:])*2)
+ expand = getattr(self.npyv, f"expand_{totype}_{self.sfx}")
+        # close enough to the edge to detect any deviation
+ data = self._data(self._int_max() - self.nlanes)
+ vdata = self.load(data)
+ edata = expand(vdata)
+ # lower half part
+ data_lo = data[:self.nlanes//2]
+ # higher half part
+ data_hi = data[self.nlanes//2:]
+ assert edata == (data_lo, data_hi)
+
def test_arithmetic_subadd(self):
if self._is_fp():
data_a = self._data()
@@ -707,7 +727,13 @@ class _SIMD_ALL(_Test_Utility):
assert div == data_div
def test_arithmetic_reduce_sum(self):
- if not self._is_fp():
+ """
+        Test reduce sum intrinsics:
+ npyv_sum_u32
+ npyv_sum_f32
+ npyv_sum_f64
+ """
+ if self.sfx not in ("u32", "f32", "f64"):
return
# reduce sum
data = self._data()