author     Matti Picus <matti.picus@gmail.com>        2020-12-23 12:04:18 +0200
committer  GitHub <noreply@github.com>                2020-12-23 12:04:18 +0200
commit     85df388d344f4ebd70921dca0bc723770e05a37b (patch)
tree       fe80b510a7283f13b107b72ca5f448127d775271
parent     08d7e1163a70224b448469e072981b7347ed179d (diff)
parent     b3681b6af3be4220a6f380c0f34f63f77aaf4b07 (diff)
download   numpy-85df388d344f4ebd70921dca0bc723770e05a37b.tar.gz
Merge pull request #17958 from Qiyu8/countnz
MAINT: Optimize the performance of count_nonzero by using universal intrinsics
-rw-r--r--  numpy/core/src/_simd/_simd.dispatch.c.src       16
-rw-r--r--  numpy/core/src/common/simd/avx2/arithmetic.h      9
-rw-r--r--  numpy/core/src/common/simd/avx2/conversion.h     15
-rw-r--r--  numpy/core/src/common/simd/avx512/arithmetic.h   11
-rw-r--r--  numpy/core/src/common/simd/avx512/conversion.h   39
-rw-r--r--  numpy/core/src/common/simd/neon/arithmetic.h      7
-rw-r--r--  numpy/core/src/common/simd/neon/conversion.h     15
-rw-r--r--  numpy/core/src/common/simd/sse/arithmetic.h       8
-rw-r--r--  numpy/core/src/common/simd/sse/conversion.h      17
-rw-r--r--  numpy/core/src/common/simd/vsx/arithmetic.h       7
-rw-r--r--  numpy/core/src/common/simd/vsx/conversion.h      19
-rw-r--r--  numpy/core/src/multiarray/item_selection.c       85
-rw-r--r--  numpy/core/tests/test_simd.py                    28
13 files changed, 268 insertions, 8 deletions
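
For orientation, the contract every new SIMD kernel in this patch must honor is the plain per-byte count that the scalar tail loop in count_boolean_trues already performs. A minimal standalone sketch of that contract (the function name is illustrative, not part of the patch):

#include <stddef.h>

/* Scalar reference: count the non-zero bytes in a contiguous buffer.
 * The vectorized path below must produce exactly this result, only faster. */
static size_t count_nonzero_bytes_ref(const unsigned char *d, size_t n)
{
    size_t count = 0;
    for (size_t i = 0; i < n; ++i) {
        count += (d[i] != 0);
    }
    return count;
}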
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index eaff81338..af42192a9 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -15,13 +15,15 @@
/**begin repeat
* #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
* #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
+ * #esfx = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64#
+ * #expand_sup =1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
* #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
* #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
* #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
* #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
- * #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #sum_sup = 0, 0, 0, 0, 1, 0, 0, 0, 1, 1#
* #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
@@ -323,7 +325,9 @@ SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@bsfx@, v@sfx@, v@sfx@)
***************************/
SIMD_IMPL_INTRIN_1(cvt_@sfx@_@bsfx@, v@sfx@, v@bsfx@)
SIMD_IMPL_INTRIN_1(cvt_@bsfx@_@sfx@, v@bsfx@, v@sfx@)
-
+#if @expand_sup@
+SIMD_IMPL_INTRIN_1(expand_@esfx@_@sfx@, v@esfx@x2, v@sfx@)
+#endif // expand_sup
/***************************
* Arithmetic
***************************/
@@ -440,13 +444,15 @@ static PyMethodDef simd__intrinsics_methods[] = {
/**begin repeat
* #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
* #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
+ * #esfx = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64#
+ * #expand_sup =1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
* #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
* #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
* #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
* #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
- * #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #sum_sup = 0, 0, 0, 0, 1, 0, 0, 0, 1, 1#
* #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
@@ -528,7 +534,9 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
***************************/
SIMD_INTRIN_DEF(cvt_@sfx@_@bsfx@)
SIMD_INTRIN_DEF(cvt_@bsfx@_@sfx@)
-
+#if @expand_sup@
+SIMD_INTRIN_DEF(expand_@esfx@_@sfx@)
+#endif // expand_sup
/***************************
* Arithmetic
***************************/
diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h
index 3a6dc9535..3a3a82798 100644
--- a/numpy/core/src/common/simd/avx2/arithmetic.h
+++ b/numpy/core/src/common/simd/avx2/arithmetic.h
@@ -118,6 +118,15 @@
#endif // !NPY_HAVE_FMA3
// Horizontal add: Calculates the sum of all vector elements.
+NPY_FINLINE npy_uint32 npyv_sum_u32(__m256i a)
+{
+ __m256i s0 = _mm256_hadd_epi32(a, a);
+ s0 = _mm256_hadd_epi32(s0, s0);
+    __m128i s1 = _mm256_extracti128_si256(s0, 1);
+ s1 = _mm_add_epi32(_mm256_castsi256_si128(s0), s1);
+ return _mm_cvtsi128_si32(s1);
+}
+
NPY_FINLINE float npyv_sum_f32(__m256 a)
{
__m256 sum_halves = _mm256_hadd_ps(a, a);
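
Across every backend touched by this patch (AVX2 above, then AVX512, NEON, SSE and VSX below), the new `npyv_sum_u32` has the same contract: reduce all unsigned 32-bit lanes to one scalar sum. A minimal scalar model of that contract, with the lane count left as a parameter (8 lanes for AVX2, 16 for AVX512, 4 for the 128-bit targets); names here are illustrative:

#include <stdint.h>

/* Scalar model of npyv_sum_u32: add up every 32-bit lane of a vector.
 * `lanes`/`nlanes` stand in for the register contents and npyv_nlanes_u32. */
static uint32_t sum_u32_model(const uint32_t *lanes, int nlanes)
{
    uint32_t total = 0;
    for (int i = 0; i < nlanes; ++i) {
        total += lanes[i];
    }
    return total;
}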
diff --git a/numpy/core/src/common/simd/avx2/conversion.h b/numpy/core/src/common/simd/avx2/conversion.h
index dc6b18766..64e051686 100644
--- a/numpy/core/src/common/simd/avx2/conversion.h
+++ b/numpy/core/src/common/simd/avx2/conversion.h
@@ -43,6 +43,21 @@ NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
{ return (npy_uint8)_mm256_movemask_pd(_mm256_castsi256_pd(a)); }
+// expand
+NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data) {
+ npyv_u16x2 r;
+ r.val[0] = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(data));
+ r.val[1] = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(data, 1));
+ return r;
+}
+
+NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) {
+ npyv_u32x2 r;
+ r.val[0] = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(data));
+ r.val[1] = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(data, 1));
+ return r;
+}
+
// round to nearest integer (assuming even)
#define npyv_round_s32_f32 _mm256_cvtps_epi32
NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
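
The expand intrinsics likewise share one contract on all backends: zero-extend each lane to twice its width, returning the low input half in `val[0]` and the high input half in `val[1]`. A scalar model (names illustrative), assuming `nlanes` is the number of 8-bit input lanes (16 for SSE/NEON/VSX, 32 for AVX2, 64 for AVX512); `npyv_expand_u32_u16` follows the same pattern one step wider:

#include <stdint.h>

/* Scalar model of npyv_expand_u16_u8: widen 2*half u8 lanes into two u16 halves. */
static void expand_u16_u8_model(const uint8_t *in, int nlanes,
                                uint16_t *out_lo, uint16_t *out_hi)
{
    int half = nlanes / 2;
    for (int i = 0; i < half; ++i) {
        out_lo[i] = (uint16_t)in[i];         /* low input half  -> val[0] */
        out_hi[i] = (uint16_t)in[half + i];  /* high input half -> val[1] */
    }
}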
diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h
index 7372ca29e..6f668f439 100644
--- a/numpy/core/src/common/simd/avx512/arithmetic.h
+++ b/numpy/core/src/common/simd/avx512/arithmetic.h
@@ -130,7 +130,7 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
#define npyv_nmulsub_f64 _mm512_fnmsub_pd
/***************************
- * Reduce Sum
+ * Reduce Sum: Calculates the sum of all vector elements.
* there are three ways to implement reduce sum for AVX512:
* 1- split(256) /add /split(128) /add /hadd /hadd /extract
* 2- shuff(cross) /add /shuff(cross) /add /shuff /add /shuff /add /extract
@@ -144,6 +144,15 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
* The third one is almost the same as the second one but only works for
* intel compiler/GCC 7.1/Clang 4, we still need to support older GCC.
***************************/
+
+NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a)
+{
+ __m256i half = _mm256_add_epi32(npyv512_lower_si256(a), npyv512_higher_si256(a));
+ __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
+ quarter = _mm_hadd_epi32(quarter, quarter);
+ return _mm_cvtsi128_si32(_mm_hadd_epi32(quarter, quarter));
+}
+
#ifdef NPY_HAVE_AVX512F_REDUCE
#define npyv_sum_f32 _mm512_reduce_add_ps
#define npyv_sum_f64 _mm512_reduce_add_pd
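
The comment block above lists the alternative reduction strategies. Where the compiler already provides the AVX512 reduce sequence intrinsics, gated here as NPY_HAVE_AVX512F_REDUCE for the float sums, the integer case could in principle collapse to a single call. This is a hedged sketch only, assuming `_mm512_reduce_add_epi32` is available under that same gate; the patch itself keeps the explicit split/hadd chain for all compilers:

#ifdef NPY_HAVE_AVX512F_REDUCE
    /* compiler-provided sequence intrinsic, equivalent to the manual
     * split(256) / split(128) / hadd chain above */
    #define npyv_sum_u32 _mm512_reduce_add_epi32
#endif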
diff --git a/numpy/core/src/common/simd/avx512/conversion.h b/numpy/core/src/common/simd/avx512/conversion.h
index 1d71d7b49..7f4ae484d 100644
--- a/numpy/core/src/common/simd/avx512/conversion.h
+++ b/numpy/core/src/common/simd/avx512/conversion.h
@@ -51,6 +51,45 @@
#define npyv_cvt_b32_f32(A) npyv_cvt_b32_u32(_mm512_castps_si512(A))
#define npyv_cvt_b64_f64(A) npyv_cvt_b64_u64(_mm512_castpd_si512(A))
+// expand
+NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data)
+{
+ npyv_u16x2 r;
+ __m256i lo = npyv512_lower_si256(data);
+ __m256i hi = npyv512_higher_si256(data);
+#ifdef NPY_HAVE_AVX512BW
+ r.val[0] = _mm512_cvtepu8_epi16(lo);
+ r.val[1] = _mm512_cvtepu8_epi16(hi);
+#else
+ __m256i loelo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(lo));
+ __m256i loehi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(lo, 1));
+ __m256i hielo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(hi));
+ __m256i hiehi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(hi, 1));
+ r.val[0] = npyv512_combine_si256(loelo, loehi);
+ r.val[1] = npyv512_combine_si256(hielo, hiehi);
+#endif
+ return r;
+}
+
+NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data)
+{
+ npyv_u32x2 r;
+ __m256i lo = npyv512_lower_si256(data);
+ __m256i hi = npyv512_higher_si256(data);
+#ifdef NPY_HAVE_AVX512BW
+ r.val[0] = _mm512_cvtepu16_epi32(lo);
+ r.val[1] = _mm512_cvtepu16_epi32(hi);
+#else
+ __m256i loelo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(lo));
+ __m256i loehi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(lo, 1));
+ __m256i hielo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(hi));
+ __m256i hiehi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(hi, 1));
+ r.val[0] = npyv512_combine_si256(loelo, loehi);
+ r.val[1] = npyv512_combine_si256(hielo, hiehi);
+#endif
+ return r;
+}
+
// convert boolean vectors to integer bitfield
NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
{
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
index 87e00d5d1..1c8bde15a 100644
--- a/numpy/core/src/common/simd/neon/arithmetic.h
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -133,9 +133,16 @@
// Horizontal add: Calculates the sum of all vector elements.
#if NPY_SIMD_F64
+ #define npyv_sum_u32 vaddvq_u32
#define npyv_sum_f32 vaddvq_f32
#define npyv_sum_f64 vaddvq_f64
#else
+ NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a)
+ {
+ uint32x2_t a0 = vpadd_u32(vget_low_u32(a), vget_high_u32(a));
+ return (unsigned)vget_lane_u32(vpadd_u32(a0, vget_high_u32(a)),0);
+ }
+
NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
{
float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a));
diff --git a/numpy/core/src/common/simd/neon/conversion.h b/numpy/core/src/common/simd/neon/conversion.h
index f92910b66..7487559d1 100644
--- a/numpy/core/src/common/simd/neon/conversion.h
+++ b/numpy/core/src/common/simd/neon/conversion.h
@@ -71,6 +71,21 @@ NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
return vgetq_lane_u64(bit, 0) | ((int)vgetq_lane_u64(bit, 1) << 1);
}
+// expand
+NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data) {
+ npyv_u16x2 r;
+ r.val[0] = vmovl_u8(vget_low_u8(data));
+ r.val[1] = vmovl_u8(vget_high_u8(data));
+ return r;
+}
+
+NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) {
+ npyv_u32x2 r;
+ r.val[0] = vmovl_u16(vget_low_u16(data));
+ r.val[1] = vmovl_u16(vget_high_u16(data));
+ return r;
+}
+
// round to nearest integer
#if NPY_SIMD_F64
#define npyv_round_s32_f32 vcvtnq_s32_f32
diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h
index 8440cc52e..faf5685d9 100644
--- a/numpy/core/src/common/simd/sse/arithmetic.h
+++ b/numpy/core/src/common/simd/sse/arithmetic.h
@@ -149,6 +149,14 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b)
#endif // !NPY_HAVE_FMA3
// Horizontal add: Calculates the sum of all vector elements.
+
+NPY_FINLINE npy_uint32 npyv_sum_u32(__m128i a)
+{
+ __m128i t = _mm_add_epi32(a, _mm_srli_si128(a, 8));
+ t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
+ return (unsigned)_mm_cvtsi128_si32(t);
+}
+
NPY_FINLINE float npyv_sum_f32(__m128 a)
{
#ifdef NPY_HAVE_SSE3
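
For reference, the lane arithmetic behind the SSE `npyv_sum_u32` above, written out as a trace (lane labels are illustrative):

/* a                     = {a0, a1, a2, a3}
 * _mm_srli_si128(a, 8)  = {a2, a3,  0,  0}       shift right by 8 bytes = 2 lanes
 * t = add               = {a0+a2, a1+a3, a2, a3}
 * _mm_srli_si128(t, 4)  = {a1+a3, a2, a3, 0}     shift right by 4 bytes = 1 lane
 * t = add               = {a0+a1+a2+a3, ...}     lane 0 now holds the full sum
 * _mm_cvtsi128_si32(t)  -> extracts lane 0
 */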
diff --git a/numpy/core/src/common/simd/sse/conversion.h b/numpy/core/src/common/simd/sse/conversion.h
index d690ec313..ab7eb4907 100644
--- a/numpy/core/src/common/simd/sse/conversion.h
+++ b/numpy/core/src/common/simd/sse/conversion.h
@@ -42,6 +42,23 @@ NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
{ return (npy_uint8)_mm_movemask_pd(_mm_castsi128_pd(a)); }
+// expand
+NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data) {
+ npyv_u16x2 r;
+ const __m128i z = _mm_setzero_si128();
+ r.val[0] = _mm_unpacklo_epi8(data, z);
+ r.val[1] = _mm_unpackhi_epi8(data, z);
+ return r;
+}
+
+NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) {
+ npyv_u32x2 r;
+ const __m128i z = _mm_setzero_si128();
+ r.val[0] = _mm_unpacklo_epi16(data, z);
+ r.val[1] = _mm_unpackhi_epi16(data, z);
+ return r;
+}
+
// round to nearest integer (assuming even)
#define npyv_round_s32_f32 _mm_cvtps_epi32
NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
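
The SSE expand helpers rely on interleaving with zero rather than a dedicated widening instruction; a short trace of why that amounts to zero-extension (byte values illustrative, lanes little-endian):

/* data                          = {b0, b1, ..., b15}            sixteen u8 lanes
 * _mm_unpacklo_epi8(data, zero) = {b0, 0, b1, 0, ..., b7, 0}     byte interleave
 *   read as eight u16 lanes     = {b0, b1, ..., b7}              i.e. zero-extended
 * _mm_unpackhi_epi8(data, zero) does the same for b8..b15.
 */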
diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h
index 2f6762e63..1288a52a7 100644
--- a/numpy/core/src/common/simd/vsx/arithmetic.h
+++ b/numpy/core/src/common/simd/vsx/arithmetic.h
@@ -117,6 +117,13 @@
#define npyv_nmulsub_f64 vec_nmadd
// Horizontal add: Calculates the sum of all vector elements.
+
+NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a)
+{
+ const npyv_u32 rs = vec_add(a, vec_sld(a, a, 8));
+ return vec_extract(vec_add(rs, vec_sld(rs, rs, 4)), 0);
+}
+
NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
{
npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a));
diff --git a/numpy/core/src/common/simd/vsx/conversion.h b/numpy/core/src/common/simd/vsx/conversion.h
index 72fe10e69..36bea7bba 100644
--- a/numpy/core/src/common/simd/vsx/conversion.h
+++ b/numpy/core/src/common/simd/vsx/conversion.h
@@ -29,6 +29,25 @@
#define npyv_cvt_b32_f32(A) ((npyv_b32) A)
#define npyv_cvt_b64_f64(A) ((npyv_b64) A)
+// expand
+NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data)
+{
+ npyv_u16x2 r;
+ npyv_u8 zero = npyv_zero_u8();
+ r.val[0] = (npyv_u16)vec_mergeh(data, zero);
+ r.val[1] = (npyv_u16)vec_mergel(data, zero);
+ return r;
+}
+
+NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data)
+{
+ npyv_u32x2 r;
+ npyv_u16 zero = npyv_zero_u16();
+ r.val[0] = (npyv_u32)vec_mergeh(data, zero);
+ r.val[1] = (npyv_u32)vec_mergel(data, zero);
+ return r;
+}
+
// convert boolean vector to integer bitfield
NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
{
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index b279ffc2f..77fff5eb4 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -4,6 +4,7 @@
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#define _MULTIARRAYMODULE
+
#include "numpy/arrayobject.h"
#include "numpy/arrayscalars.h"
@@ -27,7 +28,7 @@
#include "alloc.h"
#include "arraytypes.h"
#include "array_coercion.h"
-
+#include "simd/simd.h"
static NPY_GCC_OPT_3 NPY_INLINE int
npy_fasttake_impl(
@@ -2128,6 +2129,80 @@ count_nonzero_bytes_384(const npy_uint64 * w)
return r;
}
+#if NPY_SIMD
+
+/* Count the zero bytes between `*d` and `end`, updating `*d` to point to where to keep counting from. */
+static NPY_INLINE NPY_GCC_OPT_3 npyv_u8
+count_zero_bytes_u8(const npy_uint8 **d, const npy_uint8 *end, npy_uint8 max_count)
+{
+ const npyv_u8 vone = npyv_setall_u8(1);
+ const npyv_u8 vzero = npyv_zero_u8();
+
+ npy_intp lane_max = 0;
+ npyv_u8 vsum8 = npyv_zero_u8();
+ while (*d < end && lane_max <= max_count - 1) {
+        // we count zeros because `cmpeq` is cheaper than `cmpneq` for most archs
+ npyv_u8 vt = npyv_cvt_u8_b8(npyv_cmpeq_u8(npyv_load_u8(*d), vzero));
+ vt = npyv_and_u8(vt, vone);
+ vsum8 = npyv_add_u8(vsum8, vt);
+ *d += npyv_nlanes_u8;
+ lane_max += 1;
+ }
+ return vsum8;
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 npyv_u16x2
+count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_count)
+{
+ npyv_u16x2 vsum16;
+ vsum16.val[0] = vsum16.val[1] = npyv_zero_u16();
+ npy_intp lane_max = 0;
+ while (*d < end && lane_max <= max_count - NPY_MAX_UINT8) {
+ npyv_u8 vsum8 = count_zero_bytes_u8(d, end, NPY_MAX_UINT8);
+ npyv_u16x2 part = npyv_expand_u16_u8(vsum8);
+ vsum16.val[0] = npyv_add_u16(vsum16.val[0], part.val[0]);
+ vsum16.val[1] = npyv_add_u16(vsum16.val[1], part.val[1]);
+ lane_max += NPY_MAX_UINT8;
+ }
+ return vsum16;
+}
+
+/*
+ * Counts the number of non-zero values in a raw array.
+ * One iteration of the outer loop is shown below (taking SSE2 with 128-bit vectors as an example):
+ *
+ *              |---------------16 lanes---------------|
+ * [vsum8]       255 255 255 ... 255 255 255 255 255      count_zero_bytes_u8: counts up to 255*16 elements
+ *                                !!
+ *              |----------------8 lanes---------------|
+ * [vsum16]      65535 65535 65535 ...     65535 65535    count_zero_bytes_u16: counts up to (2^16-1)*16 elements
+ *               65535 65535 65535 ...     65535 65535
+ *                                !!
+ *              |----------------4 lanes---------------|
+ * [sum_32_0]    65535 65535 65535 65535                   count_nonzero_bytes: widens to u32 and reduces
+ *               65535 65535 65535 65535
+ * [sum_32_1]    65535 65535 65535 65535
+ *               65535 65535 65535 65535
+ *                                !!
+ *               total counted per outer iteration: up to (2^16-1)*16
+ */
+static NPY_INLINE NPY_GCC_OPT_3 npy_intp
+count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx)
+{
+ npy_intp zero_count = 0;
+ const npy_uint8 *end = d + unrollx;
+ while (d < end) {
+ npyv_u16x2 vsum16 = count_zero_bytes_u16(&d, end, NPY_MAX_UINT16);
+ npyv_u32x2 sum_32_0 = npyv_expand_u32_u16(vsum16.val[0]);
+ npyv_u32x2 sum_32_1 = npyv_expand_u32_u16(vsum16.val[1]);
+ zero_count += npyv_sum_u32(npyv_add_u32(
+ npyv_add_u32(sum_32_0.val[0], sum_32_0.val[1]),
+ npyv_add_u32(sum_32_1.val[0], sum_32_1.val[1])
+ ));
+ }
+ return unrollx - zero_count;
+}
+
+#endif
/*
* Counts the number of True values in a raw boolean array. This
* is a low-overhead function which does no heap allocations.
@@ -2137,6 +2212,7 @@ count_nonzero_bytes_384(const npy_uint64 * w)
NPY_NO_EXPORT npy_intp
count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const *astrides)
{
+
int idim;
npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
npy_intp i, coord[NPY_MAXDIMS];
@@ -2158,13 +2234,17 @@ count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const
}
NPY_BEGIN_THREADS_THRESHOLDED(shape[0]);
-
/* Special case for contiguous inner loop */
if (strides[0] == 1) {
NPY_RAW_ITER_START(idim, ndim, coord, shape) {
/* Process the innermost dimension */
const char *d = data;
const char *e = data + shape[0];
+#if NPY_SIMD
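+            /*
+             * Round the inner-loop length down to a multiple of the vector width,
+             * e.g. with 16-byte vectors: shape[0] = 100 -> 100 & -16 = 96; the
+             * scalar loop below then handles the remaining 4 bytes.
+             */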
+ npy_uintp stride = shape[0] & -npyv_nlanes_u8;
+ count += count_nonzero_bytes((const npy_uint8 *)d, stride);
+ d += stride;
+#else
if (NPY_CPU_HAVE_UNALIGNED_ACCESS ||
npy_is_aligned(d, sizeof(npy_uint64))) {
npy_uintp stride = 6 * sizeof(npy_uint64);
@@ -2172,6 +2252,7 @@ count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const
count += count_nonzero_bytes_384((const npy_uint64 *)d);
}
}
+#endif
for (; d < e; ++d) {
count += (*d != 0);
}
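
To make the overflow reasoning in the new kernels explicit, here is a scalar model of the hierarchical accumulation, a sketch only (a lane count of 16 mirrors a 128-bit register; all names are illustrative, this is not the patched code): each 8-bit counter is bumped at most 255 times before being flushed into a 16-bit counter, and the 16-bit counters are flushed into the running total before they in turn can wrap, which is exactly what the `lane_max` bounds in count_zero_bytes_u8/u16 guarantee.

#include <stdint.h>
#include <stddef.h>

#define LANES 16  /* stands in for npyv_nlanes_u8 on a 128-bit target */

/* Scalar model of count_nonzero_bytes(); `n` must be a multiple of LANES,
 * just as `unrollx` is a multiple of npyv_nlanes_u8 in the real code. */
static size_t count_nonzero_model(const uint8_t *d, size_t n)
{
    size_t zeros = 0;
    size_t i = 0;
    while (i < n) {
        uint16_t sum16[LANES] = {0};
        /* u16 stage: 65535/255 = 257 u8 rounds fit without wrapping a u16 lane */
        for (int r16 = 0; r16 < 65535 / 255 && i < n; ++r16) {
            uint8_t sum8[LANES] = {0};
            /* u8 stage: at most 255 rounds, so a u8 lane can never wrap */
            for (int r8 = 0; r8 < 255 && i < n; ++r8, i += LANES) {
                for (int l = 0; l < LANES; ++l) {
                    sum8[l] += (d[i + l] == 0);   /* count zeros, like npyv_cmpeq_u8 */
                }
            }
            for (int l = 0; l < LANES; ++l) {
                sum16[l] += sum8[l];              /* widen, like npyv_expand_u16_u8 */
            }
        }
        for (int l = 0; l < LANES; ++l) {
            zeros += sum16[l];                    /* widen + reduce, like npyv_sum_u32 */
        }
    }
    return n - zeros;                             /* non-zeros = total - zeros */
}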
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 2f378667d..71356f812 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -663,6 +663,26 @@ class _SIMD_ALL(_Test_Utility):
true_vsfx = from_boolean(true_vb)
assert false_vsfx != true_vsfx
+ def test_conversion_expand(self):
+ """
+        Test expand intrinsics:
+ npyv_expand_u16_u8
+ npyv_expand_u32_u16
+ """
+ if self.sfx not in ("u8", "u16"):
+ return
+ totype = self.sfx[0]+str(int(self.sfx[1:])*2)
+ expand = getattr(self.npyv, f"expand_{totype}_{self.sfx}")
+        # close enough to the edge to detect any deviation
+ data = self._data(self._int_max() - self.nlanes)
+ vdata = self.load(data)
+ edata = expand(vdata)
+ # lower half part
+ data_lo = data[:self.nlanes//2]
+ # higher half part
+ data_hi = data[self.nlanes//2:]
+ assert edata == (data_lo, data_hi)
+
def test_arithmetic_subadd(self):
if self._is_fp():
data_a = self._data()
@@ -707,7 +727,13 @@ class _SIMD_ALL(_Test_Utility):
assert div == data_div
def test_arithmetic_reduce_sum(self):
- if not self._is_fp():
+ """
+        Test reduce sum intrinsics:
+ npyv_sum_u32
+ npyv_sum_f32
+ npyv_sum_f64
+ """
+ if self.sfx not in ("u32", "f32", "f64"):
return
# reduce sum
data = self._data()