author    Matti Picus <matti.picus@gmail.com>  2021-03-08 09:24:50 +0200
committer GitHub <noreply@github.com>  2021-03-08 09:24:50 +0200
commit    df2cd0f7ef1dba763039514fc11669d9ff505c2d (patch)
tree      ab626376b0ee8be13a97d6615063160d8b4aaee8
parent    0cea1d9b585ebe9895c898c60f05572840579c89 (diff)
parent    8aae310556359517394df263f3b5819eff8c3daa (diff)
download  numpy-df2cd0f7ef1dba763039514fc11669d9ff505c2d.tar.gz
Merge pull request #18178 from seiko2plus/npyv_fast_div
SIMD: add fast integer division intrinsics for all supported platforms
-rw-r--r--  numpy/core/src/_simd/_simd.dispatch.c.src      |  19
-rw-r--r--  numpy/core/src/common/simd/avx2/arithmetic.h   | 160
-rw-r--r--  numpy/core/src/common/simd/avx512/arithmetic.h | 224
-rw-r--r--  numpy/core/src/common/simd/intdiv.h            | 473
-rw-r--r--  numpy/core/src/common/simd/neon/arithmetic.h   | 150
-rw-r--r--  numpy/core/src/common/simd/simd.h              |   5
-rw-r--r--  numpy/core/src/common/simd/sse/arithmetic.h    | 177
-rw-r--r--  numpy/core/src/common/simd/sse/misc.h          |  25
-rw-r--r--  numpy/core/src/common/simd/vsx/arithmetic.h    | 132
-rw-r--r--  numpy/core/tests/test_simd.py                  |  70
10 files changed, 1411 insertions, 24 deletions
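
For orientation, the sketch below mirrors the usage example from the simd/intdiv.h doc comment added by this merge: the divisor parameters are precomputed once with npyv_divisor_s32 and then applied repeatedly with npyv_divc_s32. It is an illustrative sketch, not part of the patch; the helper name divide_all_s32 is hypothetical, and it assumes compilation inside NumPy's core with the universal-intrinsics headers available and NPY_SIMD enabled.

    #include "simd/simd.h"   // NumPy's universal intrinsics; requires NPY_SIMD != 0

    // Divide every element of `src` by the run-time constant `d`.
    // The divisor parameters are computed once and reused for the whole loop.
    static void divide_all_s32(npy_int32 *dst, const npy_int32 *src, npy_intp len, npy_int32 d)
    {
        const int vstep = npyv_nlanes_s32;          // number of 32-bit lanes
        npyv_s32x3 divisor = npyv_divisor_s32(d);   // multiplier + shift counts (+ divisor sign)
        for (; len >= vstep; src += vstep, dst += vstep, len -= vstep) {
            npyv_s32 a = npyv_load_s32(src);        // load a vector of dividends
            a = npyv_divc_s32(a, divisor);          // a / d, truncated towards zero
            npyv_store_s32(dst, a);
        }
        for (; len > 0; ++src, ++dst, --len) {
            *dst = *src / d;                        // scalar tail
        }
    }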
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index e5b58a8d2..4acd20a75 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -15,8 +15,8 @@
/**begin repeat
* #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
* #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
- * #esfx = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64#
- * #expand_sup =1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
+ * #esfx = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64#
+ * #expand_sup= 1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
* #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
* #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
@@ -27,6 +27,7 @@
* #sum_sup = 0, 0, 0, 0, 1, 0, 1, 0, 1, 1#
* #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
+ * #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
*/
@@ -354,6 +355,11 @@ SIMD_IMPL_INTRIN_2(mul_@sfx@, v@sfx@, v@sfx@, v@sfx@)
SIMD_IMPL_INTRIN_2(div_@sfx@, v@sfx@, v@sfx@, v@sfx@)
#endif // div_sup
+#if @intdiv_sup@
+SIMD_IMPL_INTRIN_1(divisor_@sfx@, v@sfx@x3, @sfx@)
+SIMD_IMPL_INTRIN_2(divc_@sfx@, v@sfx@, v@sfx@, v@sfx@x3)
+#endif // intdiv_sup
+
#if @fused_sup@
/**begin repeat1
* #intrin = muladd, mulsub, nmuladd, nmulsub#
@@ -442,6 +448,7 @@ SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@)
SIMD_IMPL_INTRIN_1(tobits_@bsfx@, u64, v@bsfx@)
/**end repeat**/
+
//#########################################################################
//## Attach module functions
//#########################################################################
@@ -449,7 +456,7 @@ static PyMethodDef simd__intrinsics_methods[] = {
/**begin repeat
* #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
* #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
- * #esfx = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64#
+ * #esfx = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64#
* #expand_sup =1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
* #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
* #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
@@ -461,6 +468,7 @@ static PyMethodDef simd__intrinsics_methods[] = {
* #sum_sup = 0, 0, 0, 0, 1, 0, 1, 0, 1, 1#
* #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
+ * #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
*/
@@ -568,6 +576,11 @@ SIMD_INTRIN_DEF(mul_@sfx@)
SIMD_INTRIN_DEF(div_@sfx@)
#endif // div_sup
+#if @intdiv_sup@
+SIMD_INTRIN_DEF(divisor_@sfx@)
+SIMD_INTRIN_DEF(divc_@sfx@)
+#endif // intdiv_sup
+
#if @fused_sup@
/**begin repeat1
* #intrin = muladd, mulsub, nmuladd, nmulsub#
diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h
index 4b8258759..7cd5a0ea6 100644
--- a/numpy/core/src/common/simd/avx2/arithmetic.h
+++ b/numpy/core/src/common/simd/avx2/arithmetic.h
@@ -67,6 +67,164 @@
// TODO: after implementing Packs intrinsics
/***************************
+ * Integer Division
+ ***************************/
+// See simd/intdiv.h for more clarification
+// divide each unsigned 8-bit element by a precomputed divisor
+NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
+{
+ const __m256i bmask = _mm256_set1_epi32(0xFF00FF00);
+ const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
+ const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]);
+ const __m256i shf1b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1));
+ const __m256i shf2b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2));
+ // high part of unsigned multiplication
+ __m256i mulhi_odd = _mm256_mulhi_epu16(a, divisor.val[0]);
+ __m256i mulhi_even = _mm256_mulhi_epu16(_mm256_slli_epi16(a, 8), divisor.val[0]);
+ mulhi_even = _mm256_srli_epi16(mulhi_even, 8);
+ __m256i mulhi = _mm256_blendv_epi8(mulhi_even, mulhi_odd, bmask);
+ // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
+ __m256i q = _mm256_sub_epi8(a, mulhi);
+ q = _mm256_and_si256(_mm256_srl_epi16(q, shf1), shf1b);
+ q = _mm256_add_epi8(mulhi, q);
+ q = _mm256_and_si256(_mm256_srl_epi16(q, shf2), shf2b);
+ return q;
+}
+// divide each signed 8-bit element by a precomputed divisor (round towards zero)
+NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor);
+NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor)
+{
+ const __m256i bmask = _mm256_set1_epi32(0x00FF00FF);
+ // instead of _mm256_cvtepi8_epi16/_mm256_packs_epi16 to wrap around overflow
+ __m256i divc_even = npyv_divc_s16(_mm256_srai_epi16(_mm256_slli_epi16(a, 8), 8), divisor);
+ __m256i divc_odd = npyv_divc_s16(_mm256_srai_epi16(a, 8), divisor);
+ divc_odd = _mm256_slli_epi16(divc_odd, 8);
+ return _mm256_blendv_epi8(divc_odd, divc_even, bmask);
+}
+// divide each unsigned 16-bit element by a precomputed divisor
+NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor)
+{
+ const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
+ const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]);
+ // high part of unsigned multiplication
+ __m256i mulhi = _mm256_mulhi_epu16(a, divisor.val[0]);
+ // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
+ __m256i q = _mm256_sub_epi16(a, mulhi);
+ q = _mm256_srl_epi16(q, shf1);
+ q = _mm256_add_epi16(mulhi, q);
+ q = _mm256_srl_epi16(q, shf2);
+ return q;
+}
+// divide each signed 16-bit element by a precomputed divisor (round towards zero)
+NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor)
+{
+ const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
+ // high part of signed multiplication
+ __m256i mulhi = _mm256_mulhi_epi16(a, divisor.val[0]);
+ // q = ((a + mulhi) >> sh1) - XSIGN(a)
+ // trunc(a/d) = (q ^ dsign) - dsign
+ __m256i q = _mm256_sra_epi16(_mm256_add_epi16(a, mulhi), shf1);
+ q = _mm256_sub_epi16(q, _mm256_srai_epi16(a, 15));
+ q = _mm256_sub_epi16(_mm256_xor_si256(q, divisor.val[2]), divisor.val[2]);
+ return q;
+}
+// divide each unsigned 32-bit element by a precomputed divisor
+NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor)
+{
+ const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
+ const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]);
+ // high part of unsigned multiplication
+ __m256i mulhi_even = _mm256_srli_epi64(_mm256_mul_epu32(a, divisor.val[0]), 32);
+ __m256i mulhi_odd = _mm256_mul_epu32(_mm256_srli_epi64(a, 32), divisor.val[0]);
+ __m256i mulhi = _mm256_blend_epi32(mulhi_even, mulhi_odd, 0xAA);
+ // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
+ __m256i q = _mm256_sub_epi32(a, mulhi);
+ q = _mm256_srl_epi32(q, shf1);
+ q = _mm256_add_epi32(mulhi, q);
+ q = _mm256_srl_epi32(q, shf2);
+ return q;
+}
+// divide each signed 32-bit element by a precomputed divisor (round towards zero)
+NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor)
+{
+ const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
+ // high part of signed multiplication
+ __m256i mulhi_even = _mm256_srli_epi64(_mm256_mul_epi32(a, divisor.val[0]), 32);
+ __m256i mulhi_odd = _mm256_mul_epi32(_mm256_srli_epi64(a, 32), divisor.val[0]);
+ __m256i mulhi = _mm256_blend_epi32(mulhi_even, mulhi_odd, 0xAA);
+ // q = ((a + mulhi) >> sh1) - XSIGN(a)
+ // trunc(a/d) = (q ^ dsign) - dsign
+ __m256i q = _mm256_sra_epi32(_mm256_add_epi32(a, mulhi), shf1);
+ q = _mm256_sub_epi32(q, _mm256_srai_epi32(a, 31));
+ q = _mm256_sub_epi32(_mm256_xor_si256(q, divisor.val[2]), divisor.val[2]);
+ return q;
+}
+// returns the high 64 bits of unsigned 64-bit multiplication
+// xref https://stackoverflow.com/a/28827013
+NPY_FINLINE npyv_u64 npyv__mullhi_u64(npyv_u64 a, npyv_u64 b)
+{
+ __m256i lomask = npyv_setall_s64(0xffffffff);
+ __m256i a_hi = _mm256_srli_epi64(a, 32); // a0l, a0h, a1l, a1h
+ __m256i b_hi = _mm256_srli_epi64(b, 32); // b0l, b0h, b1l, b1h
+ // compute partial products
+ __m256i w0 = _mm256_mul_epu32(a, b); // a0l*b0l, a1l*b1l
+ __m256i w1 = _mm256_mul_epu32(a, b_hi); // a0l*b0h, a1l*b1h
+ __m256i w2 = _mm256_mul_epu32(a_hi, b); // a0h*b0l, a1h*b0l
+ __m256i w3 = _mm256_mul_epu32(a_hi, b_hi); // a0h*b0h, a1h*b1h
+ // sum partial products
+ __m256i w0h = _mm256_srli_epi64(w0, 32);
+ __m256i s1 = _mm256_add_epi64(w1, w0h);
+ __m256i s1l = _mm256_and_si256(s1, lomask);
+ __m256i s1h = _mm256_srli_epi64(s1, 32);
+
+ __m256i s2 = _mm256_add_epi64(w2, s1l);
+ __m256i s2h = _mm256_srli_epi64(s2, 32);
+
+ __m256i hi = _mm256_add_epi64(w3, s1h);
+ hi = _mm256_add_epi64(hi, s2h);
+ return hi;
+}
+// divide each unsigned 64-bit element by a divisor
+NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor)
+{
+ const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
+ const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]);
+ // high part of unsigned multiplication
+ __m256i mulhi = npyv__mullhi_u64(a, divisor.val[0]);
+ // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
+ __m256i q = _mm256_sub_epi64(a, mulhi);
+ q = _mm256_srl_epi64(q, shf1);
+ q = _mm256_add_epi64(mulhi, q);
+ q = _mm256_srl_epi64(q, shf2);
+ return q;
+}
+// divide each signed 64-bit element by a divisor (round towards zero)
+NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
+{
+ const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
+ // high part of unsigned multiplication
+ __m256i mulhi = npyv__mullhi_u64(a, divisor.val[0]);
+ // convert unsigned to signed high multiplication
+ // mulhi - ((a < 0) ? m : 0) - ((m < 0) ? a : 0);
+ __m256i asign = _mm256_cmpgt_epi64(_mm256_setzero_si256(), a);
+ __m256i msign = _mm256_cmpgt_epi64(_mm256_setzero_si256(), divisor.val[0]);
+ __m256i m_asign = _mm256_and_si256(divisor.val[0], asign);
+ __m256i a_msign = _mm256_and_si256(a, msign);
+ mulhi = _mm256_sub_epi64(mulhi, m_asign);
+ mulhi = _mm256_sub_epi64(mulhi, a_msign);
+ // q = (a + mulhi) >> sh
+ __m256i q = _mm256_add_epi64(a, mulhi);
+ // emulate arithmetic right shift
+ const __m256i sigb = npyv_setall_s64(1LL << 63);
+ q = _mm256_srl_epi64(_mm256_add_epi64(q, sigb), shf1);
+ q = _mm256_sub_epi64(q, _mm256_srl_epi64(sigb, shf1));
+ // q = q - XSIGN(a)
+ // trunc(a/d) = (q ^ dsign) - dsign
+ q = _mm256_sub_epi64(q, asign);
+ q = _mm256_sub_epi64(_mm256_xor_si256(q, divisor.val[2]), divisor.val[2]);
+ return q;
+}
+/***************************
* Division
***************************/
// TODO: emulate integer division
@@ -176,5 +334,3 @@ NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
}
#endif // _NPY_SIMD_AVX2_ARITHMETIC_H
-
-
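
The npyv__mullhi_u64 helper added above builds the high 64 bits of an unsigned 64x64-bit product out of four 32x32->64-bit partial products, since SSE/AVX2 only offer 32-bit widening multiplies (_mm256_mul_epu32). The scalar sketch below reproduces the same decomposition in portable C; it is illustrative only (the name mullhi_u64 and the self-check are not part of the patch, and the check assumes a compiler that provides __uint128_t).

    #include <assert.h>
    #include <stdint.h>

    // High 64 bits of a 64x64-bit unsigned multiplication, built from
    // four 32x32->64 partial products -- the same scheme the vectorized
    // npyv__mullhi_u64 uses with _mm256_mul_epu32.
    static uint64_t mullhi_u64(uint64_t a, uint64_t b)
    {
        uint64_t a_lo = (uint32_t)a, a_hi = a >> 32;
        uint64_t b_lo = (uint32_t)b, b_hi = b >> 32;
        uint64_t w0 = a_lo * b_lo;
        uint64_t w1 = a_lo * b_hi;
        uint64_t w2 = a_hi * b_lo;
        uint64_t w3 = a_hi * b_hi;
        // carry-propagating sum of the partial products
        uint64_t s1 = w1 + (w0 >> 32);
        uint64_t s2 = w2 + (uint64_t)(uint32_t)s1;
        return w3 + (s1 >> 32) + (s2 >> 32);
    }

    int main(void)
    {
    #ifdef __SIZEOF_INT128__
        uint64_t a = 0x9e3779b97f4a7c15ULL, b = 0xdeadbeefcafebabeULL;
        assert(mullhi_u64(a, b) == (uint64_t)(((__uint128_t)a * b) >> 64));
    #endif
        return 0;
    }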
diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h
index 450da7ea5..29e1af7e8 100644
--- a/numpy/core/src/common/simd/avx512/arithmetic.h
+++ b/numpy/core/src/common/simd/avx512/arithmetic.h
@@ -107,6 +107,226 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
// TODO: after implementing Packs intrinsics
/***************************
+ * Integer Division
+ ***************************/
+// See simd/intdiv.h for more clarification
+// divide each unsigned 8-bit element by divisor
+NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
+{
+ const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]);
+ const __m128i shf2 = _mm512_castsi512_si128(divisor.val[2]);
+#ifdef NPY_HAVE_AVX512BW
+ const __m512i shf1b = _mm512_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1));
+ const __m512i shf2b = _mm512_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2));
+ // high part of unsigned multiplication
+ __m512i mulhi_odd = _mm512_mulhi_epu16(a, divisor.val[0]);
+ __m512i mulhi_even = _mm512_mulhi_epu16(_mm512_slli_epi16(a, 8), divisor.val[0]);
+ mulhi_even = _mm512_srli_epi16(mulhi_even, 8);
+ __m512i mulhi = _mm512_mask_mov_epi8(mulhi_even, 0xAAAAAAAAAAAAAAAA, mulhi_odd);
+ // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
+ __m512i q = _mm512_sub_epi8(a, mulhi);
+ q = _mm512_and_si512(_mm512_srl_epi16(q, shf1), shf1b);
+ q = _mm512_add_epi8(mulhi, q);
+ q = _mm512_and_si512(_mm512_srl_epi16(q, shf2), shf2b);
+ return q;
+#else
+ const __m256i bmask = _mm256_set1_epi32(0xFF00FF00);
+ const __m256i shf1b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1));
+ const __m256i shf2b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2));
+ const __m512i shf2bw= npyv512_combine_si256(shf2b, shf2b);
+ const __m256i mulc = npyv512_lower_si256(divisor.val[0]);
+ //// lower 256-bit
+ __m256i lo_a = npyv512_lower_si256(a);
+ // high part of unsigned multiplication
+ __m256i mulhi_odd = _mm256_mulhi_epu16(lo_a, mulc);
+ __m256i mulhi_even = _mm256_mulhi_epu16(_mm256_slli_epi16(lo_a, 8), mulc);
+ mulhi_even = _mm256_srli_epi16(mulhi_even, 8);
+ __m256i mulhi = _mm256_blendv_epi8(mulhi_even, mulhi_odd, bmask);
+ // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
+ __m256i lo_q = _mm256_sub_epi8(lo_a, mulhi);
+ lo_q = _mm256_and_si256(_mm256_srl_epi16(lo_q, shf1), shf1b);
+ lo_q = _mm256_add_epi8(mulhi, lo_q);
+ lo_q = _mm256_srl_epi16(lo_q, shf2); // no sign extend
+
+ //// higher 256-bit
+ __m256i hi_a = npyv512_higher_si256(a);
+ // high part of unsigned multiplication
+ mulhi_odd = _mm256_mulhi_epu16(hi_a, mulc);
+ mulhi_even = _mm256_mulhi_epu16(_mm256_slli_epi16(hi_a, 8), mulc);
+ mulhi_even = _mm256_srli_epi16(mulhi_even, 8);
+ mulhi = _mm256_blendv_epi8(mulhi_even, mulhi_odd, bmask);
+ // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
+ __m256i hi_q = _mm256_sub_epi8(hi_a, mulhi);
+ hi_q = _mm256_and_si256(_mm256_srl_epi16(hi_q, shf1), shf1b);
+ hi_q = _mm256_add_epi8(mulhi, hi_q);
+ hi_q = _mm256_srl_epi16(hi_q, shf2); // no sign extend
+ return _mm512_and_si512(npyv512_combine_si256(lo_q, hi_q), shf2bw); // extend sign
+#endif
+}
+// divide each signed 8-bit element by divisor (round towards zero)
+NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor);
+NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor)
+{
+ __m512i divc_even = npyv_divc_s16(npyv_shri_s16(npyv_shli_s16(a, 8), 8), divisor);
+ __m512i divc_odd = npyv_divc_s16(npyv_shri_s16(a, 8), divisor);
+ divc_odd = npyv_shli_s16(divc_odd, 8);
+#ifdef NPY_HAVE_AVX512BW
+ return _mm512_mask_mov_epi8(divc_even, 0xAAAAAAAAAAAAAAAA, divc_odd);
+#else
+ const __m512i bmask = _mm512_set1_epi32(0x00FF00FF);
+ return npyv_select_u8(bmask, divc_even, divc_odd);
+#endif
+}
+// divide each unsigned 16-bit element by divisor
+NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor)
+{
+ const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]);
+ const __m128i shf2 = _mm512_castsi512_si128(divisor.val[2]);
+ // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
+ #define NPYV__DIVC_U16(RLEN, A, MULC, R) \
+ mulhi = _mm##RLEN##_mulhi_epu16(A, MULC); \
+ R = _mm##RLEN##_sub_epi16(A, mulhi); \
+ R = _mm##RLEN##_srl_epi16(R, shf1); \
+ R = _mm##RLEN##_add_epi16(mulhi, R); \
+ R = _mm##RLEN##_srl_epi16(R, shf2);
+
+#ifdef NPY_HAVE_AVX512BW
+ __m512i mulhi, q;
+ NPYV__DIVC_U16(512, a, divisor.val[0], q)
+ return q;
+#else
+ const __m256i m = npyv512_lower_si256(divisor.val[0]);
+ __m256i lo_a = npyv512_lower_si256(a);
+ __m256i hi_a = npyv512_higher_si256(a);
+
+ __m256i mulhi, lo_q, hi_q;
+ NPYV__DIVC_U16(256, lo_a, m, lo_q)
+ NPYV__DIVC_U16(256, hi_a, m, hi_q)
+ return npyv512_combine_si256(lo_q, hi_q);
+#endif
+ #undef NPYV__DIVC_U16
+}
+// divide each signed 16-bit element by divisor (round towards zero)
+NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor)
+{
+ const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]);
+ // q = ((a + mulhi) >> sh1) - XSIGN(a)
+ // trunc(a/d) = (q ^ dsign) - dsign
+ #define NPYV__DIVC_S16(RLEN, A, MULC, DSIGN, R) \
+ mulhi = _mm##RLEN##_mulhi_epi16(A, MULC); \
+ R = _mm##RLEN##_sra_epi16(_mm##RLEN##_add_epi16(A, mulhi), shf1); \
+ R = _mm##RLEN##_sub_epi16(R, _mm##RLEN##_srai_epi16(A, 15)); \
+ R = _mm##RLEN##_sub_epi16(_mm##RLEN##_xor_si##RLEN(R, DSIGN), DSIGN);
+
+#ifdef NPY_HAVE_AVX512BW
+ __m512i mulhi, q;
+ NPYV__DIVC_S16(512, a, divisor.val[0], divisor.val[2], q)
+ return q;
+#else
+ const __m256i m = npyv512_lower_si256(divisor.val[0]);
+ const __m256i dsign = npyv512_lower_si256(divisor.val[2]);
+ __m256i lo_a = npyv512_lower_si256(a);
+ __m256i hi_a = npyv512_higher_si256(a);
+
+ __m256i mulhi, lo_q, hi_q;
+ NPYV__DIVC_S16(256, lo_a, m, dsign, lo_q)
+ NPYV__DIVC_S16(256, hi_a, m, dsign, hi_q)
+ return npyv512_combine_si256(lo_q, hi_q);
+#endif
+ #undef NPYV__DIVC_S16
+}
+// divide each unsigned 32-bit element by divisor
+NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor)
+{
+ const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]);
+ const __m128i shf2 = _mm512_castsi512_si128(divisor.val[2]);
+ // high part of unsigned multiplication
+ __m512i mulhi_even = _mm512_srli_epi64(_mm512_mul_epu32(a, divisor.val[0]), 32);
+ __m512i mulhi_odd = _mm512_mul_epu32(_mm512_srli_epi64(a, 32), divisor.val[0]);
+ __m512i mulhi = _mm512_mask_mov_epi32(mulhi_even, 0xAAAA, mulhi_odd);
+ // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
+ __m512i q = _mm512_sub_epi32(a, mulhi);
+ q = _mm512_srl_epi32(q, shf1);
+ q = _mm512_add_epi32(mulhi, q);
+ q = _mm512_srl_epi32(q, shf2);
+ return q;
+}
+// divide each signed 32-bit element by divisor (round towards zero)
+NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor)
+{
+ const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]);
+ // high part of signed multiplication
+ __m512i mulhi_even = _mm512_srli_epi64(_mm512_mul_epi32(a, divisor.val[0]), 32);
+ __m512i mulhi_odd = _mm512_mul_epi32(_mm512_srli_epi64(a, 32), divisor.val[0]);
+ __m512i mulhi = _mm512_mask_mov_epi32(mulhi_even, 0xAAAA, mulhi_odd);
+ // q = ((a + mulhi) >> sh1) - XSIGN(a)
+ // trunc(a/d) = (q ^ dsign) - dsign
+ __m512i q = _mm512_sra_epi32(_mm512_add_epi32(a, mulhi), shf1);
+ q = _mm512_sub_epi32(q, _mm512_srai_epi32(a, 31));
+ q = _mm512_sub_epi32(_mm512_xor_si512(q, divisor.val[2]), divisor.val[2]);
+ return q;
+}
+// returns the high 64 bits of unsigned 64-bit multiplication
+// xref https://stackoverflow.com/a/28827013
+NPY_FINLINE npyv_u64 npyv__mullhi_u64(npyv_u64 a, npyv_u64 b)
+{
+ __m512i lomask = npyv_setall_s64(0xffffffff);
+ __m512i a_hi = _mm512_srli_epi64(a, 32); // a0l, a0h, a1l, a1h
+ __m512i b_hi = _mm512_srli_epi64(b, 32); // b0l, b0h, b1l, b1h
+ // compute partial products
+ __m512i w0 = _mm512_mul_epu32(a, b); // a0l*b0l, a1l*b1l
+ __m512i w1 = _mm512_mul_epu32(a, b_hi); // a0l*b0h, a1l*b1h
+ __m512i w2 = _mm512_mul_epu32(a_hi, b); // a0h*b0l, a1h*b0l
+ __m512i w3 = _mm512_mul_epu32(a_hi, b_hi); // a0h*b0h, a1h*b1h
+ // sum partial products
+ __m512i w0h = _mm512_srli_epi64(w0, 32);
+ __m512i s1 = _mm512_add_epi64(w1, w0h);
+ __m512i s1l = _mm512_and_si512(s1, lomask);
+ __m512i s1h = _mm512_srli_epi64(s1, 32);
+
+ __m512i s2 = _mm512_add_epi64(w2, s1l);
+ __m512i s2h = _mm512_srli_epi64(s2, 32);
+
+ __m512i hi = _mm512_add_epi64(w3, s1h);
+ hi = _mm512_add_epi64(hi, s2h);
+ return hi;
+}
+// divide each unsigned 64-bit element by a divisor
+NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor)
+{
+ const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]);
+ const __m128i shf2 = _mm512_castsi512_si128(divisor.val[2]);
+ // high part of unsigned multiplication
+ __m512i mulhi = npyv__mullhi_u64(a, divisor.val[0]);
+ // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
+ __m512i q = _mm512_sub_epi64(a, mulhi);
+ q = _mm512_srl_epi64(q, shf1);
+ q = _mm512_add_epi64(mulhi, q);
+ q = _mm512_srl_epi64(q, shf2);
+ return q;
+}
+// divide each signed 64-bit element by a divisor (round towards zero)
+NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
+{
+ const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]);
+ // high part of unsigned multiplication
+ __m512i mulhi = npyv__mullhi_u64(a, divisor.val[0]);
+ // convert unsigned to signed high multiplication
+ // mulhi - ((a < 0) ? m : 0) - ((m < 0) ? a : 0);
+ __m512i asign = _mm512_srai_epi64(a, 63);
+ __m512i msign = _mm512_srai_epi64(divisor.val[0], 63);
+ __m512i m_asign = _mm512_and_si512(divisor.val[0], asign);
+ __m512i a_msign = _mm512_and_si512(a, msign);
+ mulhi = _mm512_sub_epi64(mulhi, m_asign);
+ mulhi = _mm512_sub_epi64(mulhi, a_msign);
+ // q = ((a + mulhi) >> sh1) - XSIGN(a)
+ // trunc(a/d) = (q ^ dsign) - dsign
+ __m512i q = _mm512_sra_epi64(_mm512_add_epi64(a, mulhi), shf1);
+ q = _mm512_sub_epi64(q, asign);
+ q = _mm512_sub_epi64(_mm512_xor_si512(q, divisor.val[2]), divisor.val[2]);
+ return q;
+}
+/***************************
* Division
***************************/
// TODO: emulate integer division
@@ -136,11 +356,11 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
* 2- shuff(cross) /add /shuff(cross) /add /shuff /add /shuff /add /extract
* 3- _mm512_reduce_add_ps/pd
* The first one has been widely used by many projects
- *
+ *
* the second one is used by Intel Compiler, maybe because the
* latency of hadd increased by (2-3) starting from Skylake-X which makes two
* extra shuffles(non-cross) cheaper. check https://godbolt.org/z/s3G9Er for more info.
- *
+ *
* The third one is almost the same as the second one but only works for
* intel compiler/GCC 7.1/Clang 4, we still need to support older GCC.
***************************/
diff --git a/numpy/core/src/common/simd/intdiv.h b/numpy/core/src/common/simd/intdiv.h
new file mode 100644
index 000000000..a6c293d87
--- /dev/null
+++ b/numpy/core/src/common/simd/intdiv.h
@@ -0,0 +1,473 @@
+/**
+ * This header implements `npyv_divisor_*` intrinsics used for computing the parameters
+ * of fast integer division, while division intrinsics `npyv_divc_*` are defined in
+ * {extension}/arithmetic.h.
+ */
+#ifndef NPY_SIMD
+ #error "Not a standalone header, use simd/simd.h instead"
+#endif
+#ifndef _NPY_SIMD_INTDIV_H
+#define _NPY_SIMD_INTDIV_H
+/**********************************************************************************
+ ** Integer division
+ **********************************************************************************
+ * Almost all architectures (except Power10) lack support for integer vector division;
+ * in addition, the cost of scalar division on architectures like x86 is too high: it can take
+ * 30 to 40 cycles on modern chips and up to 100 on old ones.
+ *
+ * Therefore we perform division by multiplying with a precomputed reciprocal;
+ * the method used in this implementation is based on T. Granlund and P. L. Montgomery,
+ * "Division by invariant integers using multiplication" (see [Figure 4.1, 5.1],
+ * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556)
+ *
+ * It shows a good impact on all architectures, especially on x86;
+ * however, computing the divisor parameters is fairly expensive, so this implementation
+ * should only be used when the divisor is a scalar that is reused multiple times.
+ *
+ * The division process is separated into two intrinsics for each data type
+ *
+ * 1- npyv_{dtype}x3 npyv_divisor_{dtype} ({dtype} divisor);
+ * For computing the divisor parameters (multiplier + shifters + sign of divisor(signed only))
+ *
+ * 2- npyv_{dtype} npyv_divc_{dtype} (npyv_{dtype} dividend, npyv_{dtype}x3 divisor_parms);
+ * For performing the final division.
+ *
+ ** For example:
+ * int vstep = npyv_nlanes_s32; // number of lanes
+ * int x = 0x6e70;
+ * npyv_s32x3 divisor = npyv_divisor_s32(x); // init divisor params
+ * for (; len >= vstep; src += vstep, dst += vstep, len -= vstep) {
+ * npyv_s32 a = npyv_load_s32(src); // load s32 vector from memory
+ * a = npyv_divc_s32(a, divisor); // divide all elements by x
+ * npyv_store_s32(dst, a); // store s32 vector into memory
+ * }
+ *
+ ** NOTES:
+ * - For 64-bit division on AArch64 and IBM/Power, we fall back to scalar division
+ * since emulating multiply-high is expensive and both architectures have very fast dividers.
+ *
+ ** TODO:
+ * - Add support for Power10(VSX4)
+ *
+ ***************************************************************
+ ** Figure 4.1: Unsigned division by run-time invariant divisor
+ ***************************************************************
+ * Initialization (given uword d with 1 <= d < 2^N):
+ * int l = ceil(log2(d));
+ * uword m = 2^N * (2^l - d) / d + 1;
+ * int sh1 = min(l, 1);
+ * int sh2 = max(l - 1, 0);
+ *
+ * For q = FLOOR(a/d), all uword:
+ * uword t1 = MULUH(m, a);
+ * q = SRL(t1 + SRL(a - t1, sh1), sh2);
+ *
+ ************************************************************************************
+ ** Figure 5.1: Signed division by run-time invariant divisor, rounded towards zero
+ ************************************************************************************
+ * Initialization (given constant sword d with d != 0):
+ * int l = max(ceil(log2(abs(d))), 1);
+ * udword m0 = 1 + (2^(N+l-1)) / abs(d);
+ * sword m = m0 - 2^N;
+ * sword dsign = XSIGN(d);
+ * int sh = l - 1;
+ *
+ * For q = TRUNC(a/d), all sword:
+ * sword q0 = a + MULSH(m, a);
+ * q0 = SRA(q0, sh) - XSIGN(a);
+ * q = EOR(q0, dsign) - dsign;
+ */
+/**
+ * bit-scan reverse for non-zeros. returns the index of the highest set bit.
+ * equivalent to floor(log2(a))
+ */
+#ifdef _MSC_VER
+ #include <intrin.h> // _BitScanReverse
+#endif
+NPY_FINLINE unsigned npyv__bitscan_revnz_u32(npy_uint32 a)
+{
+ assert(a > 0); // due to the use of __builtin_clz
+ unsigned r;
+#if defined(NPY_HAVE_SSE2) && defined(_MSC_VER)
+ unsigned long rl;
+ (void)_BitScanReverse(&rl, (unsigned long)a);
+ r = (unsigned)rl;
+#elif defined(NPY_HAVE_SSE2) && (defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER))
+ __asm__("bsr %1, %0" : "=r" (r) : "r"(a));
+#elif defined(__GNUC__) || defined(__clang__)
+ r = 31 - __builtin_clz(a); // performs on arm -> clz, ppc -> cntlzw
+#else
+ r = 0;
+ while (a >>= 1) {
+ r++;
+ }
+#endif
+ return r;
+}
+NPY_FINLINE unsigned npyv__bitscan_revnz_u64(npy_uint64 a)
+{
+ assert(a > 0); // due to the use of __builtin_clzll
+#if defined(_M_AMD64) && defined(_MSC_VER)
+ unsigned long rl;
+ (void)_BitScanReverse64(&rl, a);
+ return (unsigned)rl;
+#elif defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER))
+ npy_uint64 r;
+ __asm__("bsrq %1, %0" : "=r"(r) : "r"(a));
+ return (unsigned)r;
+#elif defined(__GNUC__) || defined(__clang__)
+ return 63 - __builtin_clzll(a);
+#else
+ npy_uint64 a_hi = a >> 32;
+ if (a_hi == 0) {
+ return npyv__bitscan_revnz_u32((npy_uint32)a);
+ }
+ return 32 + npyv__bitscan_revnz_u32((npy_uint32)a_hi);
+#endif
+}
+/**
+ * Divides a 128-bit unsigned integer by a 64-bit divisor when the lower
+ * 64 bits of the dividend are zero.
+ *
+ * This function is needed to calculate the multiplier for 64-bit integer division;
+ * see npyv_divisor_u64/npyv_divisor_s64.
+ */
+NPY_FINLINE npy_uint64 npyv__divh128_u64(npy_uint64 high, npy_uint64 divisor)
+{
+ assert(divisor > 1);
+ npy_uint64 quotient;
+#if defined(_M_X64) && defined(_MSC_VER) && _MSC_VER >= 1920
+ npy_uint64 remainder;
+ quotient = _udiv128(high, 0, divisor, &remainder);
+ (void)remainder;
+#elif defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER))
+ __asm__("divq %[d]" : "=a"(quotient) : [d] "r"(divisor), "a"(0), "d"(high));
+#elif defined(__SIZEOF_INT128__)
+ quotient = (npy_uint64)((((__uint128_t)high) << 64) / divisor);
+#else
+ /**
+ * Minified version based on Donald Knuth’s Algorithm D (Division of nonnegative integers),
+ * and Generic implementation in Hacker’s Delight.
+ *
+ * See https://skanthak.homepage.t-online.de/division.html
+ * with respect to the license of the Hacker's Delight book
+ * (https://web.archive.org/web/20190408122508/http://www.hackersdelight.org/permissions.htm)
+ */
+ // shift amount for normalize
+ unsigned ldz = 63 - npyv__bitscan_revnz_u64(divisor);
+ // normalize divisor
+ divisor <<= ldz;
+ high <<= ldz;
+ // break divisor up into two 32-bit digits
+ npy_uint32 divisor_hi = divisor >> 32;
+ npy_uint32 divisor_lo = divisor & 0xFFFFFFFF;
+ // compute high quotient digit
+ npy_uint64 quotient_hi = high / divisor_hi;
+ npy_uint64 remainder = high - divisor_hi * quotient_hi;
+ npy_uint64 base32 = 1ULL << 32;
+ while (quotient_hi >= base32 || quotient_hi*divisor_lo > base32*remainder) {
+ --quotient_hi;
+ remainder += divisor_hi;
+ if (remainder >= base32) {
+ break;
+ }
+ }
+ // compute dividend digit pairs
+ npy_uint64 dividend_pairs = base32*high - divisor*quotient_hi;
+ // compute second quotient digit for lower zeros
+ npy_uint32 quotient_lo = (npy_uint32)(dividend_pairs / divisor_hi);
+ quotient = base32*quotient_hi + quotient_lo;
+#endif
+ return quotient;
+}
+// Initializing divisor parameters for unsigned 8-bit division
+NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d)
+{
+ unsigned l, l2, sh1, sh2, m;
+ switch (d) {
+ case 0: // LCOV_EXCL_LINE
+ // for a potential divide by zero, on x86 GCC inserts a `ud2` instruction
+ // instead of letting the HW/CPU trap it, which leads to an illegal-instruction exception.
+ // 'volatile' should suppress this behavior and allow us to raise a HW/CPU
+ // arithmetic exception.
+ m = sh1 = sh2 = 1 / ((npy_uint8 volatile *)&d)[0];
+ break;
+ case 1:
+ m = 1; sh1 = sh2 = 0;
+ break;
+ case 2:
+ m = 1; sh1 = 1; sh2 = 0;
+ break;
+ default:
+ l = npyv__bitscan_revnz_u32(d - 1) + 1; // ceil(log2(d))
+ l2 = (npy_uint8)(1 << l); // 2^l, overflow to 0 if l = 8
+ m = ((l2 - d) << 8) / d + 1; // multiplier
+ sh1 = 1; sh2 = l - 1; // shift counts
+ }
+ npyv_u8x3 divisor;
+ divisor.val[0] = npyv_setall_u8(m);
+#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
+ divisor.val[1] = npyv_set_u8(sh1);
+ divisor.val[2] = npyv_set_u8(sh2);
+#elif defined(NPY_HAVE_VSX2)
+ divisor.val[1] = npyv_setall_u8(sh1);
+ divisor.val[2] = npyv_setall_u8(sh2);
+#elif defined(NPY_HAVE_NEON)
+ divisor.val[1] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh1));
+ divisor.val[2] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh2));
+#else
+ #error "please initialize the shifting operand for the new architecture"
+#endif
+ return divisor;
+}
+// Initializing divisor parameters for signed 8-bit division
+NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d);
+NPY_FINLINE npyv_s8x3 npyv_divisor_s8(npy_int8 d)
+{
+#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
+ npyv_s16x3 p = npyv_divisor_s16(d);
+ npyv_s8x3 r;
+ r.val[0] = npyv_reinterpret_s8_s16(p.val[0]);
+ r.val[1] = npyv_reinterpret_s8_s16(p.val[1]);
+ r.val[2] = npyv_reinterpret_s8_s16(p.val[2]);
+ return r;
+#else
+ int d1 = abs(d);
+ int sh, m;
+ if (d1 > 1) {
+ sh = (int)npyv__bitscan_revnz_u32(d1-1); // ceil(log2(abs(d))) - 1
+ m = (1 << (8 + sh)) / d1 + 1; // multiplier
+ }
+ else if (d1 == 1) {
+ sh = 0; m = 1;
+ }
+ else {
+ // raise arithmetic exception for d == 0
+ sh = m = 1 / ((npy_int8 volatile *)&d)[0]; // LCOV_EXCL_LINE
+ }
+ npyv_s8x3 divisor;
+ divisor.val[0] = npyv_setall_s8(m);
+ divisor.val[2] = npyv_setall_s8(d < 0 ? -1 : 0);
+ #ifdef NPY_HAVE_VSX2
+ divisor.val[1] = npyv_setall_s8(sh);
+ #elif defined(NPY_HAVE_NEON)
+ divisor.val[1] = npyv_setall_s8(-sh);
+ #else
+ #error "please initialize the shifting operand for the new architecture"
+ #endif
+ return divisor;
+#endif
+}
+// Initializing divisor parameters for unsigned 16-bit division
+NPY_FINLINE npyv_u16x3 npyv_divisor_u16(npy_uint16 d)
+{
+ unsigned l, l2, sh1, sh2, m;
+ switch (d) {
+ case 0: // LCOV_EXCL_LINE
+ // raise arithmetic exception for d == 0
+ m = sh1 = sh2 = 1 / ((npy_uint16 volatile *)&d)[0];
+ break;
+ case 1:
+ m = 1; sh1 = sh2 = 0;
+ break;
+ case 2:
+ m = 1; sh1 = 1; sh2 = 0;
+ break;
+ default:
+ l = npyv__bitscan_revnz_u32(d - 1) + 1; // ceil(log2(d))
+ l2 = (npy_uint16)(1 << l); // 2^l, overflow to 0 if l = 16
+ m = ((l2 - d) << 16) / d + 1; // multiplier
+ sh1 = 1; sh2 = l - 1; // shift counts
+ }
+ npyv_u16x3 divisor;
+ divisor.val[0] = npyv_setall_u16(m);
+#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
+ divisor.val[1] = npyv_set_u16(sh1);
+ divisor.val[2] = npyv_set_u16(sh2);
+#elif defined(NPY_HAVE_VSX2)
+ divisor.val[1] = npyv_setall_u16(sh1);
+ divisor.val[2] = npyv_setall_u16(sh2);
+#elif defined(NPY_HAVE_NEON)
+ divisor.val[1] = npyv_reinterpret_u16_s16(npyv_setall_s16(-sh1));
+ divisor.val[2] = npyv_reinterpret_u16_s16(npyv_setall_s16(-sh2));
+#else
+ #error "please initialize the shifting operand for the new architecture"
+#endif
+ return divisor;
+}
+// Initializing divisor parameters for signed 16-bit division
+NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d)
+{
+ int d1 = abs(d);
+ int sh, m;
+ if (d1 > 1) {
+ sh = (int)npyv__bitscan_revnz_u32(d1 - 1); // ceil(log2(abs(d))) - 1
+ m = (1 << (16 + sh)) / d1 + 1; // multiplier
+ }
+ else if (d1 == 1) {
+ sh = 0; m = 1;
+ }
+ else {
+ // raise arithmetic exception for d == 0
+ sh = m = 1 / ((npy_int16 volatile *)&d)[0]; // LCOV_EXCL_LINE
+ }
+ npyv_s16x3 divisor;
+ divisor.val[0] = npyv_setall_s16(m);
+ divisor.val[2] = npyv_setall_s16(d < 0 ? -1 : 0); // sign of divisor
+#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
+ divisor.val[1] = npyv_set_s16(sh);
+#elif defined(NPY_HAVE_VSX2)
+ divisor.val[1] = npyv_setall_s16(sh);
+#elif defined(NPY_HAVE_NEON)
+ divisor.val[1] = npyv_setall_s16(-sh);
+#else
+ #error "please initialize the shifting operand for the new architecture"
+#endif
+ return divisor;
+}
+// Initializing divisor parameters for unsigned 32-bit division
+NPY_FINLINE npyv_u32x3 npyv_divisor_u32(npy_uint32 d)
+{
+ npy_uint32 l, l2, sh1, sh2, m;
+ switch (d) {
+ case 0: // LCOV_EXCL_LINE
+ // raise arithmetic exception for d == 0
+ m = sh1 = sh2 = 1 / ((npy_uint32 volatile *)&d)[0]; // LCOV_EXCL_LINE
+ break;
+ case 1:
+ m = 1; sh1 = sh2 = 0;
+ break;
+ case 2:
+ m = 1; sh1 = 1; sh2 = 0;
+ break;
+ default:
+ l = npyv__bitscan_revnz_u32(d - 1) + 1; // ceil(log2(d))
+ l2 = (npy_uint32)(1ULL << l); // 2^l, overflow to 0 if l = 32
+ m = ((npy_uint64)(l2 - d) << 32) / d + 1; // multiplier
+ sh1 = 1; sh2 = l - 1; // shift counts
+ }
+ npyv_u32x3 divisor;
+ divisor.val[0] = npyv_setall_u32(m);
+#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
+ divisor.val[1] = npyv_set_u32(sh1);
+ divisor.val[2] = npyv_set_u32(sh2);
+#elif defined(NPY_HAVE_VSX2)
+ divisor.val[1] = npyv_setall_u32(sh1);
+ divisor.val[2] = npyv_setall_u32(sh2);
+#elif defined(NPY_HAVE_NEON)
+ divisor.val[1] = npyv_reinterpret_u32_s32(npyv_setall_s32(-sh1));
+ divisor.val[2] = npyv_reinterpret_u32_s32(npyv_setall_s32(-sh2));
+#else
+ #error "please initialize the shifting operand for the new architecture"
+#endif
+ return divisor;
+}
+// Initializing divisor parameters for signed 32-bit division
+NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d)
+{
+ npy_int32 d1 = abs(d);
+ npy_int32 sh, m;
+ if (d1 > 1) {
+ sh = npyv__bitscan_revnz_u32(d1 - 1); // ceil(log2(abs(d))) - 1
+ m = (1ULL << (32 + sh)) / d1 + 1; // multiplier
+ }
+ else if (d1 == 1) {
+ sh = 0; m = 1;
+ }
+ // fix abs overflow
+ else if (d == (1 << 31)) {
+ m = d + 1;
+ sh = 30;
+ }
+ else {
+ // raise arithmetic exception for d == 0
+ sh = m = 1 / ((npy_int32 volatile *)&d)[0]; // LCOV_EXCL_LINE
+ }
+ npyv_s32x3 divisor;
+ divisor.val[0] = npyv_setall_s32(m);
+ divisor.val[2] = npyv_setall_s32(d < 0 ? -1 : 0); // sign of divisor
+#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
+ divisor.val[1] = npyv_set_s32(sh);
+#elif defined(NPY_HAVE_VSX2)
+ divisor.val[1] = npyv_setall_s32(sh);
+#elif defined(NPY_HAVE_NEON)
+ divisor.val[1] = npyv_setall_s32(-sh);
+#else
+ #error "please initialize the shifting operand for the new architecture"
+#endif
+ return divisor;
+}
+// Initializing divisor parameters for unsigned 64-bit division
+NPY_FINLINE npyv_u64x3 npyv_divisor_u64(npy_uint64 d)
+{
+ npyv_u64x3 divisor;
+#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_NEON)
+ divisor.val[0] = npyv_setall_u64(d);
+#else
+ npy_uint64 l, l2, sh1, sh2, m;
+ switch (d) {
+ case 0: // LCOV_EXCL_LINE
+ // raise arithmetic exception for d == 0
+ m = sh1 = sh2 = 1 / ((npy_uint64 volatile *)&d)[0]; // LCOV_EXCL_LINE
+ break;
+ case 1:
+ m = 1; sh1 = sh2 = 0;
+ break;
+ case 2:
+ m = 1; sh1 = 1; sh2 = 0;
+ break;
+ default:
+ l = npyv__bitscan_revnz_u64(d - 1) + 1; // ceil(log2(d))
+ l2 = l < 64 ? 1ULL << l : 0; // 2^l
+ m = npyv__divh128_u64(l2 - d, d) + 1; // multiplier
+ sh1 = 1; sh2 = l - 1; // shift counts
+ }
+ divisor.val[0] = npyv_setall_u64(m);
+ #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
+ divisor.val[1] = npyv_set_u64(sh1);
+ divisor.val[2] = npyv_set_u64(sh2);
+ #else
+ #error "please initialize the shifting operand for the new architecture"
+ #endif
+#endif
+ return divisor;
+}
+// Initializing divisor parameters for signed 64-bit division
+NPY_FINLINE npyv_s64x3 npyv_divisor_s64(npy_int64 d)
+{
+ npyv_s64x3 divisor;
+#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_NEON)
+ divisor.val[0] = npyv_setall_s64(d);
+ divisor.val[1] = npyv_cvt_s64_b64(
+ npyv_cmpeq_s64(npyv_setall_s64(-1), divisor.val[0])
+ );
+#else
+ npy_int64 d1 = llabs(d);
+ npy_int64 sh, m;
+ if (d1 > 1) {
+ sh = npyv__bitscan_revnz_u64(d1 - 1); // ceil(log2(abs(d))) - 1
+ m = npyv__divh128_u64(1ULL << sh, d1) + 1; // multiplier
+ }
+ else if (d1 == 1) {
+ sh = 0; m = 1;
+ }
+ // fix abs overflow
+ else if (d == (1LL << 63)) {
+ m = d + 1;
+ sh = 62;
+ }
+ else {
+ // raise arithmetic exception for d == 0
+ sh = m = 1 / ((npy_int64 volatile *)&d)[0]; // LCOV_EXCL_LINE
+ }
+ divisor.val[0] = npyv_setall_s64(m);
+ divisor.val[2] = npyv_setall_s64(d < 0 ? -1 : 0); // sign of divisor
+ #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
+ divisor.val[1] = npyv_set_s64(sh);
+ #else
+ #error "please initialize the shifting operand for the new architecture"
+ #endif
+#endif
+ return divisor;
+}
+
+#endif // _NPY_SIMD_INTDIV_H
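
To make the Figure 4.1 recipe above concrete, the following scalar model shows, for N = 16, what npyv_divisor_u16 precomputes and what npyv_divc_u16 then evaluates per lane. It is an illustration only, not part of the patch: the divisor_u16/divc_u16 names are hypothetical, the d == 0 trap and the packing of the parameters into vectors are omitted, and the exhaustive self-check is just a convenience.

    #include <assert.h>
    #include <stdint.h>

    // Figure 4.1 for N = 16: precompute (m, sh1, sh2) for an invariant divisor d,
    // then divide with one multiply-high and two shifts (cf. npyv_divisor_u16).
    typedef struct { uint16_t m; unsigned sh1, sh2; } divisor_u16;

    static divisor_u16 divisor_init_u16(uint16_t d)
    {
        divisor_u16 p;
        assert(d != 0);
        if (d == 1)      { p.m = 1; p.sh1 = 0; p.sh2 = 0; }
        else if (d == 2) { p.m = 1; p.sh1 = 1; p.sh2 = 0; }
        else {
            unsigned l = 0;
            while ((1u << l) < d) { ++l; }                  // l = ceil(log2(d))
            uint32_t l2 = (uint16_t)(1u << l);              // 2^l, wraps to 0 if l == 16
            p.m   = (uint16_t)(((l2 - d) << 16) / d + 1);   // multiplier
            p.sh1 = 1;
            p.sh2 = l - 1;
        }
        return p;
    }

    static uint16_t divc_u16(uint16_t a, divisor_u16 p)
    {
        uint16_t mulhi = (uint16_t)(((uint32_t)a * p.m) >> 16);  // MULUH(m, a)
        // floor(a/d) = (mulhi + ((a - mulhi) >> sh1)) >> sh2
        return (uint16_t)((mulhi + (uint16_t)((uint16_t)(a - mulhi) >> p.sh1)) >> p.sh2);
    }

    int main(void)
    {
        for (uint32_t d = 1; d <= 0xFFFF; ++d) {
            divisor_u16 p = divisor_init_u16((uint16_t)d);
            for (uint32_t a = 0; a <= 0xFFFF; a += 257) {        // spot-check
                assert(divc_u16((uint16_t)a, p) == a / d);
            }
        }
        return 0;
    }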
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
index 69a49f571..00994806d 100644
--- a/numpy/core/src/common/simd/neon/arithmetic.h
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -61,6 +61,154 @@
#define npyv_mul_f64 vmulq_f64
/***************************
+ * Integer Division
+ ***************************/
+// See simd/intdiv.h for more clarification
+// divide each unsigned 8-bit element by a precomputed divisor
+NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
+{
+ const uint8x8_t mulc_lo = vget_low_u8(divisor.val[0]);
+ // high part of unsigned multiplication
+ uint16x8_t mull_lo = vmull_u8(vget_low_u8(a), mulc_lo);
+#if NPY_SIMD_F64
+ uint16x8_t mull_hi = vmull_high_u8(a, divisor.val[0]);
+ // get the high unsigned bytes
+ uint8x16_t mulhi = vuzp2q_u8(vreinterpretq_u8_u16(mull_lo), vreinterpretq_u8_u16(mull_hi));
+#else
+ const uint8x8_t mulc_hi = vget_high_u8(divisor.val[0]);
+ uint16x8_t mull_hi = vmull_u8(vget_high_u8(a), mulc_hi);
+ uint8x16_t mulhi = vuzpq_u8(vreinterpretq_u8_u16(mull_lo), vreinterpretq_u8_u16(mull_hi)).val[1];
+#endif
+ // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
+ uint8x16_t q = vsubq_u8(a, mulhi);
+ q = vshlq_u8(q, vreinterpretq_s8_u8(divisor.val[1]));
+ q = vaddq_u8(mulhi, q);
+ q = vshlq_u8(q, vreinterpretq_s8_u8(divisor.val[2]));
+ return q;
+}
+// divide each signed 8-bit element by a precomputed divisor (round towards zero)
+NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor)
+{
+ const int8x8_t mulc_lo = vget_low_s8(divisor.val[0]);
+ // high part of signed multiplication
+ int16x8_t mull_lo = vmull_s8(vget_low_s8(a), mulc_lo);
+#if NPY_SIMD_F64
+ int16x8_t mull_hi = vmull_high_s8(a, divisor.val[0]);
+ // get the high unsigned bytes
+ int8x16_t mulhi = vuzp2q_s8(vreinterpretq_s8_s16(mull_lo), vreinterpretq_s8_s16(mull_hi));
+#else
+ const int8x8_t mulc_hi = vget_high_s8(divisor.val[0]);
+ int16x8_t mull_hi = vmull_s8(vget_high_s8(a), mulc_hi);
+ int8x16_t mulhi = vuzpq_s8(vreinterpretq_s8_s16(mull_lo), vreinterpretq_s8_s16(mull_hi)).val[1];
+#endif
+ // q = ((a + mulhi) >> sh1) - XSIGN(a)
+ // trunc(a/d) = (q ^ dsign) - dsign
+ int8x16_t q = vshlq_s8(vaddq_s8(a, mulhi), divisor.val[1]);
+ q = vsubq_s8(q, vshrq_n_s8(a, 7));
+ q = vsubq_s8(veorq_s8(q, divisor.val[2]), divisor.val[2]);
+ return q;
+}
+// divide each unsigned 16-bit element by a precomputed divisor
+NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor)
+{
+ const uint16x4_t mulc_lo = vget_low_u16(divisor.val[0]);
+ // high part of unsigned multiplication
+ uint32x4_t mull_lo = vmull_u16(vget_low_u16(a), mulc_lo);
+#if NPY_SIMD_F64
+ uint32x4_t mull_hi = vmull_high_u16(a, divisor.val[0]);
+ // get the high unsigned bytes
+ uint16x8_t mulhi = vuzp2q_u16(vreinterpretq_u16_u32(mull_lo), vreinterpretq_u16_u32(mull_hi));
+#else
+ const uint16x4_t mulc_hi = vget_high_u16(divisor.val[0]);
+ uint32x4_t mull_hi = vmull_u16(vget_high_u16(a), mulc_hi);
+ uint16x8_t mulhi = vuzpq_u16(vreinterpretq_u16_u32(mull_lo), vreinterpretq_u16_u32(mull_hi)).val[1];
+#endif
+ // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
+ uint16x8_t q = vsubq_u16(a, mulhi);
+ q = vshlq_u16(q, vreinterpretq_s16_u16(divisor.val[1]));
+ q = vaddq_u16(mulhi, q);
+ q = vshlq_u16(q, vreinterpretq_s16_u16(divisor.val[2]));
+ return q;
+}
+// divide each signed 16-bit element by a precomputed divisor (round towards zero)
+NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor)
+{
+ const int16x4_t mulc_lo = vget_low_s16(divisor.val[0]);
+ // high part of signed multiplication
+ int32x4_t mull_lo = vmull_s16(vget_low_s16(a), mulc_lo);
+#if NPY_SIMD_F64
+ int32x4_t mull_hi = vmull_high_s16(a, divisor.val[0]);
+ // get the high unsigned bytes
+ int16x8_t mulhi = vuzp2q_s16(vreinterpretq_s16_s32(mull_lo), vreinterpretq_s16_s32(mull_hi));
+#else
+ const int16x4_t mulc_hi = vget_high_s16(divisor.val[0]);
+ int32x4_t mull_hi = vmull_s16(vget_high_s16(a), mulc_hi);
+ int16x8_t mulhi = vuzpq_s16(vreinterpretq_s16_s32(mull_lo), vreinterpretq_s16_s32(mull_hi)).val[1];
+#endif
+ // q = ((a + mulhi) >> sh1) - XSIGN(a)
+ // trunc(a/d) = (q ^ dsign) - dsign
+ int16x8_t q = vshlq_s16(vaddq_s16(a, mulhi), divisor.val[1]);
+ q = vsubq_s16(q, vshrq_n_s16(a, 15));
+ q = vsubq_s16(veorq_s16(q, divisor.val[2]), divisor.val[2]);
+ return q;
+}
+// divide each unsigned 32-bit element by a precomputed divisor
+NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor)
+{
+ const uint32x2_t mulc_lo = vget_low_u32(divisor.val[0]);
+ // high part of unsigned multiplication
+ uint64x2_t mull_lo = vmull_u32(vget_low_u32(a), mulc_lo);
+#if NPY_SIMD_F64
+ uint64x2_t mull_hi = vmull_high_u32(a, divisor.val[0]);
+ // get the high unsigned bytes
+ uint32x4_t mulhi = vuzp2q_u32(vreinterpretq_u32_u64(mull_lo), vreinterpretq_u32_u64(mull_hi));
+#else
+ const uint32x2_t mulc_hi = vget_high_u32(divisor.val[0]);
+ uint64x2_t mull_hi = vmull_u32(vget_high_u32(a), mulc_hi);
+ uint32x4_t mulhi = vuzpq_u32(vreinterpretq_u32_u64(mull_lo), vreinterpretq_u32_u64(mull_hi)).val[1];
+#endif
+ // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
+ uint32x4_t q = vsubq_u32(a, mulhi);
+ q = vshlq_u32(q, vreinterpretq_s32_u32(divisor.val[1]));
+ q = vaddq_u32(mulhi, q);
+ q = vshlq_u32(q, vreinterpretq_s32_u32(divisor.val[2]));
+ return q;
+}
+// divide each signed 32-bit element by a precomputed divisor (round towards zero)
+NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor)
+{
+ const int32x2_t mulc_lo = vget_low_s32(divisor.val[0]);
+ // high part of signed multiplication
+ int64x2_t mull_lo = vmull_s32(vget_low_s32(a), mulc_lo);
+#if NPY_SIMD_F64
+ int64x2_t mull_hi = vmull_high_s32(a, divisor.val[0]);
+ // get the high unsigned bytes
+ int32x4_t mulhi = vuzp2q_s32(vreinterpretq_s32_s64(mull_lo), vreinterpretq_s32_s64(mull_hi));
+#else
+ const int32x2_t mulc_hi = vget_high_s32(divisor.val[0]);
+ int64x2_t mull_hi = vmull_s32(vget_high_s32(a), mulc_hi);
+ int32x4_t mulhi = vuzpq_s32(vreinterpretq_s32_s64(mull_lo), vreinterpretq_s32_s64(mull_hi)).val[1];
+#endif
+ // q = ((a + mulhi) >> sh1) - XSIGN(a)
+ // trunc(a/d) = (q ^ dsign) - dsign
+ int32x4_t q = vshlq_s32(vaddq_s32(a, mulhi), divisor.val[1]);
+ q = vsubq_s32(q, vshrq_n_s32(a, 31));
+ q = vsubq_s32(veorq_s32(q, divisor.val[2]), divisor.val[2]);
+ return q;
+}
+// divide each unsigned 64-bit element by a divisor
+NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor)
+{
+ const uint64_t d = vgetq_lane_u64(divisor.val[0], 0);
+ return npyv_set_u64(vgetq_lane_u64(a, 0) / d, vgetq_lane_u64(a, 1) / d);
+}
+// divide each signed 64-bit element by a divisor (round towards zero)
+NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
+{
+ const int64_t d = vgetq_lane_s64(divisor.val[0], 0);
+ return npyv_set_s64(vgetq_lane_s64(a, 0) / d, vgetq_lane_s64(a, 1) / d);
+}
+/***************************
* Division
***************************/
#if NPY_SIMD_F64
@@ -148,7 +296,7 @@
NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a)
{
- uint32x2_t a0 = vpadd_u32(vget_low_u32(a), vget_high_u32(a));
+ uint32x2_t a0 = vpadd_u32(vget_low_u32(a), vget_high_u32(a));
return (unsigned)vget_lane_u32(vpadd_u32(a0, vget_high_u32(a)),0);
}
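
The signed paths above (NEON here, and the SSE/AVX2/AVX512 ones earlier) all follow the Figure 5.1 recipe quoted in simd/intdiv.h: q = SRA(a + MULSH(m, a), sh) - XSIGN(a), then the divisor's sign is applied with an XOR/subtract. The scalar model below spells that out for N = 16; it is illustrative only (the sdiv16_t/sdivisor_s16/sdivc_s16 names are hypothetical, |d| >= 2 is assumed, and right-shifting negative values is assumed to be an arithmetic shift, as on mainstream compilers).

    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>

    // Figure 5.1 for N = 16: signed division by an invariant divisor, rounded
    // towards zero (multiplier m, shift count sh and the divisor's sign mask dsign).
    typedef struct { int16_t m; unsigned sh; int16_t dsign; } sdiv16_t;

    static sdiv16_t sdivisor_s16(int16_t d)
    {
        int d1 = abs(d);                          // |d|; promoted to int, so -32768 is fine
        assert(d1 >= 2);
        unsigned l = 1;
        while ((1 << l) < d1) { ++l; }            // l = max(ceil(log2(|d|)), 1)
        uint32_t m0 = (1u << (15 + l)) / (uint32_t)d1 + 1;   // 1 + 2^(N+l-1) / |d|
        sdiv16_t p;
        p.m     = (int16_t)((int32_t)m0 - 65536); // m = m0 - 2^N, always in (-2^15, 0)
        p.sh    = l - 1;
        p.dsign = (int16_t)(d < 0 ? -1 : 0);      // XSIGN(d)
        return p;
    }

    static int16_t sdivc_s16(int16_t a, sdiv16_t p)
    {
        int mulhi = ((int32_t)a * p.m) >> 16;     // MULSH(m, a): high half of the product
        int q = (a + mulhi) >> p.sh;              // SRA(a + mulhi, sh)
        q -= (a < 0) ? -1 : 0;                    // - XSIGN(a)
        return (int16_t)((q ^ p.dsign) - p.dsign);// apply the divisor's sign: trunc(a/d)
    }

    int main(void)
    {
        for (int d = -32768; d <= 32767; ++d) {
            if (-1 <= d && d <= 1) continue;      // this sketch only covers |d| >= 2
            sdiv16_t p = sdivisor_s16((int16_t)d);
            for (int a = -32768; a <= 32767; a += 97) {
                assert(sdivc_s16((int16_t)a, p) == a / d);
            }
        }
        return 0;
    }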
diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h
index d6c14228d..a3e2b95de 100644
--- a/numpy/core/src/common/simd/simd.h
+++ b/numpy/core/src/common/simd/simd.h
@@ -56,6 +56,11 @@ typedef double npyv_lanetype_f64;
#include "emulate_maskop.h"
#endif
+// enable integer divisor generator for all SIMD extensions
+#if NPY_SIMD
+ #include "intdiv.h"
+#endif
+
/**
* Some SIMD extensions currently(AVX2, AVX512F) require (de facto)
* a maximum number of strides sizes when dealing with non-contiguous memory access.
diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h
index c21b7da2d..a1ec88f56 100644
--- a/numpy/core/src/common/simd/sse/arithmetic.h
+++ b/numpy/core/src/common/simd/sse/arithmetic.h
@@ -86,6 +86,181 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b)
// TODO: after implementing Packs intrinsics
/***************************
+ * Integer Division
+ ***************************/
+// See simd/intdiv.h for more clarification
+// divide each unsigned 8-bit element by a precomputed divisor
+NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
+{
+ const __m128i bmask = _mm_set1_epi32(0xFF00FF00);
+ const __m128i shf1b = _mm_set1_epi8(0xFFU >> _mm_cvtsi128_si32(divisor.val[1]));
+ const __m128i shf2b = _mm_set1_epi8(0xFFU >> _mm_cvtsi128_si32(divisor.val[2]));
+ // high part of unsigned multiplication
+ __m128i mulhi_odd = _mm_mulhi_epu16(a, divisor.val[0]);
+ __m128i mulhi_even = _mm_mulhi_epu16(_mm_slli_epi16(a, 8), divisor.val[0]);
+ mulhi_even = _mm_srli_epi16(mulhi_even, 8);
+ __m128i mulhi = npyv_select_u8(bmask, mulhi_odd, mulhi_even);
+ // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
+ __m128i q = _mm_sub_epi8(a, mulhi);
+ q = _mm_and_si128(_mm_srl_epi16(q, divisor.val[1]), shf1b);
+ q = _mm_add_epi8(mulhi, q);
+ q = _mm_and_si128(_mm_srl_epi16(q, divisor.val[2]), shf2b);
+ return q;
+}
+// divide each signed 8-bit element by a precomputed divisor (round towards zero)
+NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor);
+NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor)
+{
+ const __m128i bmask = _mm_set1_epi32(0x00FF00FF);
+ // instead of _mm_cvtepi8_epi16/_mm_packs_epi16 to wrap around overflow
+ __m128i divc_even = npyv_divc_s16(_mm_srai_epi16(_mm_slli_epi16(a, 8), 8), divisor);
+ __m128i divc_odd = npyv_divc_s16(_mm_srai_epi16(a, 8), divisor);
+ divc_odd = _mm_slli_epi16(divc_odd, 8);
+ return npyv_select_u8(bmask, divc_even, divc_odd);
+}
+// divide each unsigned 16-bit element by a precomputed divisor
+NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor)
+{
+ // high part of unsigned multiplication
+ __m128i mulhi = _mm_mulhi_epu16(a, divisor.val[0]);
+ // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
+ __m128i q = _mm_sub_epi16(a, mulhi);
+ q = _mm_srl_epi16(q, divisor.val[1]);
+ q = _mm_add_epi16(mulhi, q);
+ q = _mm_srl_epi16(q, divisor.val[2]);
+ return q;
+}
+// divide each signed 16-bit element by a precomputed divisor (round towards zero)
+NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor)
+{
+ // high part of signed multiplication
+ __m128i mulhi = _mm_mulhi_epi16(a, divisor.val[0]);
+ // q = ((a + mulhi) >> sh1) - XSIGN(a)
+ // trunc(a/d) = (q ^ dsign) - dsign
+ __m128i q = _mm_sra_epi16(_mm_add_epi16(a, mulhi), divisor.val[1]);
+ q = _mm_sub_epi16(q, _mm_srai_epi16(a, 15));
+ q = _mm_sub_epi16(_mm_xor_si128(q, divisor.val[2]), divisor.val[2]);
+ return q;
+}
+// divide each unsigned 32-bit element by a precomputed divisor
+NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor)
+{
+ // high part of unsigned multiplication
+ __m128i mulhi_even = _mm_srli_epi64(_mm_mul_epu32(a, divisor.val[0]), 32);
+ __m128i mulhi_odd = _mm_mul_epu32(_mm_srli_epi64(a, 32), divisor.val[0]);
+#ifdef NPY_HAVE_SSE41
+ __m128i mulhi = _mm_blend_epi16(mulhi_even, mulhi_odd, 0xCC);
+#else
+ __m128i mask_13 = _mm_setr_epi32(0, -1, 0, -1);
+ mulhi_odd = _mm_and_si128(mulhi_odd, mask_13);
+ __m128i mulhi = _mm_or_si128(mulhi_even, mulhi_odd);
+#endif
+ // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
+ __m128i q = _mm_sub_epi32(a, mulhi);
+ q = _mm_srl_epi32(q, divisor.val[1]);
+ q = _mm_add_epi32(mulhi, q);
+ q = _mm_srl_epi32(q, divisor.val[2]);
+ return q;
+}
+// divide each signed 32-bit element by a precomputed divisor (round towards zero)
+NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor)
+{
+ __m128i asign = _mm_srai_epi32(a, 31);
+#ifdef NPY_HAVE_SSE41
+ // high part of signed multiplication
+ __m128i mulhi_even = _mm_srli_epi64(_mm_mul_epi32(a, divisor.val[0]), 32);
+ __m128i mulhi_odd = _mm_mul_epi32(_mm_srli_epi64(a, 32), divisor.val[0]);
+ __m128i mulhi = _mm_blend_epi16(mulhi_even, mulhi_odd, 0xCC);
+#else // not SSE4.1
+ // high part of "unsigned" multiplication
+ __m128i mulhi_even = _mm_srli_epi64(_mm_mul_epu32(a, divisor.val[0]), 32);
+ __m128i mulhi_odd = _mm_mul_epu32(_mm_srli_epi64(a, 32), divisor.val[0]);
+ __m128i mask_13 = _mm_setr_epi32(0, -1, 0, -1);
+ mulhi_odd = _mm_and_si128(mulhi_odd, mask_13);
+ __m128i mulhi = _mm_or_si128(mulhi_even, mulhi_odd);
+ // convert unsigned to signed high multiplication
+ // mulhi - ((a < 0) ? m : 0) - ((m < 0) ? a : 0);
+ const __m128i msign= _mm_srai_epi32(divisor.val[0], 31);
+ __m128i m_asign = _mm_and_si128(divisor.val[0], asign);
+ __m128i a_msign = _mm_and_si128(a, msign);
+ mulhi = _mm_sub_epi32(mulhi, m_asign);
+ mulhi = _mm_sub_epi32(mulhi, a_msign);
+#endif
+ // q = ((a + mulhi) >> sh1) - XSIGN(a)
+ // trunc(a/d) = (q ^ dsign) - dsign
+ __m128i q = _mm_sra_epi32(_mm_add_epi32(a, mulhi), divisor.val[1]);
+ q = _mm_sub_epi32(q, asign);
+ q = _mm_sub_epi32(_mm_xor_si128(q, divisor.val[2]), divisor.val[2]);
+ return q;
+}
+// returns the high 64 bits of unsigned 64-bit multiplication
+// xref https://stackoverflow.com/a/28827013
+NPY_FINLINE npyv_u64 npyv__mullhi_u64(npyv_u64 a, npyv_u64 b)
+{
+ __m128i lomask = npyv_setall_s64(0xffffffff);
+ __m128i a_hi = _mm_srli_epi64(a, 32); // a0l, a0h, a1l, a1h
+ __m128i b_hi = _mm_srli_epi64(b, 32); // b0l, b0h, b1l, b1h
+ // compute partial products
+ __m128i w0 = _mm_mul_epu32(a, b); // a0l*b0l, a1l*b1l
+ __m128i w1 = _mm_mul_epu32(a, b_hi); // a0l*b0h, a1l*b1h
+ __m128i w2 = _mm_mul_epu32(a_hi, b); // a0h*b0l, a1h*b0l
+ __m128i w3 = _mm_mul_epu32(a_hi, b_hi); // a0h*b0h, a1h*b1h
+ // sum partial products
+ __m128i w0h = _mm_srli_epi64(w0, 32);
+ __m128i s1 = _mm_add_epi64(w1, w0h);
+ __m128i s1l = _mm_and_si128(s1, lomask);
+ __m128i s1h = _mm_srli_epi64(s1, 32);
+
+ __m128i s2 = _mm_add_epi64(w2, s1l);
+ __m128i s2h = _mm_srli_epi64(s2, 32);
+
+ __m128i hi = _mm_add_epi64(w3, s1h);
+ hi = _mm_add_epi64(hi, s2h);
+ return hi;
+}
+// divide each unsigned 64-bit element by a precomputed divisor
+NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor)
+{
+ // high part of unsigned multiplication
+ __m128i mulhi = npyv__mullhi_u64(a, divisor.val[0]);
+ // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
+ __m128i q = _mm_sub_epi64(a, mulhi);
+ q = _mm_srl_epi64(q, divisor.val[1]);
+ q = _mm_add_epi64(mulhi, q);
+ q = _mm_srl_epi64(q, divisor.val[2]);
+ return q;
+}
+// divide each signed 64-bit element by a precomputed divisor (round towards zero)
+NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
+{
+ // high part of unsigned multiplication
+ __m128i mulhi = npyv__mullhi_u64(a, divisor.val[0]);
+ // convert unsigned to signed high multiplication
+ // mulhi - ((a < 0) ? m : 0) - ((m < 0) ? a : 0);
+#ifdef NPY_HAVE_SSE42
+ const __m128i msign= _mm_cmpgt_epi64(_mm_setzero_si128(), divisor.val[0]);
+ __m128i asign = _mm_cmpgt_epi64(_mm_setzero_si128(), a);
+#else
+ const __m128i msign= _mm_srai_epi32(_mm_shuffle_epi32(divisor.val[0], _MM_SHUFFLE(3, 3, 1, 1)), 31);
+ __m128i asign = _mm_srai_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 1, 1)), 31);
+#endif
+ __m128i m_asign = _mm_and_si128(divisor.val[0], asign);
+ __m128i a_msign = _mm_and_si128(a, msign);
+ mulhi = _mm_sub_epi64(mulhi, m_asign);
+ mulhi = _mm_sub_epi64(mulhi, a_msign);
+ // q = (a + mulhi) >> sh
+ __m128i q = _mm_add_epi64(a, mulhi);
+ // emulate arithmetic right shift
+ const __m128i sigb = npyv_setall_s64(1LL << 63);
+ q = _mm_srl_epi64(_mm_add_epi64(q, sigb), divisor.val[1]);
+ q = _mm_sub_epi64(q, _mm_srl_epi64(sigb, divisor.val[1]));
+ // q = q - XSIGN(a)
+ // trunc(a/d) = (q ^ dsign) - dsign
+ q = _mm_sub_epi64(q, asign);
+ q = _mm_sub_epi64(_mm_xor_si128(q, divisor.val[2]), divisor.val[2]);
+ return q;
+}
+/***************************
* Division
***************************/
// TODO: emulate integer division
@@ -175,7 +350,7 @@ NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
__m128 t2 = _mm_add_ps(a, t1);
__m128 t3 = _mm_shuffle_ps(t2, t2, 1);
__m128 t4 = _mm_add_ss(t2, t3);
- return _mm_cvtss_f32(t4);
+ return _mm_cvtss_f32(t4);
#endif
}
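
SSE2 has no 64-bit arithmetic right shift, so the npyv_divc_s64 path above emulates it: bias the value by the sign bit, shift logically, then subtract the shifted bias. The scalar sketch below demonstrates that identity on its own (assuming two's complement; the name sra64_emulated is hypothetical and the snippet is not part of the patch).

    #include <assert.h>
    #include <stdint.h>

    // Arithmetic right shift of a signed 64-bit value emulated with logical
    // shifts only: bias by the sign bit, shift logically, remove the shifted bias.
    // This is the `sigb` trick used in the SSE/AVX2 npyv_divc_s64 paths.
    static int64_t sra64_emulated(int64_t x, unsigned sh)
    {
        const uint64_t sigb = 1ULL << 63;
        uint64_t q = ((uint64_t)x + sigb) >> sh;   // logical shift of the biased value
        return (int64_t)(q - (sigb >> sh));        // back to signed; assumes two's complement
    }

    int main(void)
    {
        const int64_t samples[] = {0, 1, -1, 42, -42, INT64_MAX, INT64_MIN + 1};
        for (unsigned sh = 0; sh < 63; ++sh) {
            for (unsigned i = 0; i < sizeof(samples)/sizeof(samples[0]); ++i) {
                int64_t x = samples[i];
                // reference: >> on a negative value is implementation-defined in C,
                // but an arithmetic shift on all mainstream compilers
                assert(sra64_emulated(x, sh) == x >> sh);
            }
        }
        return 0;
    }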
diff --git a/numpy/core/src/common/simd/sse/misc.h b/numpy/core/src/common/simd/sse/misc.h
index 7ba47bc68..1099c491d 100644
--- a/numpy/core/src/common/simd/sse/misc.h
+++ b/numpy/core/src/common/simd/sse/misc.h
@@ -18,21 +18,16 @@
#define npyv_zero_f64 _mm_setzero_pd
// vector with a specific value set to all lanes
-#define npyv_setall_u8(VAL) _mm_set1_epi8((char)VAL)
-#define npyv_setall_s8(VAL) _mm_set1_epi8((char)VAL)
-#define npyv_setall_u16(VAL) _mm_set1_epi16((short)VAL)
-#define npyv_setall_s16(VAL) _mm_set1_epi16((short)VAL)
-#define npyv_setall_u32(VAL) _mm_set1_epi32((int)VAL)
-#define npyv_setall_s32(VAL) _mm_set1_epi32(VAL)
-#if !defined(__x86_64__) && !defined(_M_X64)
- #define npyv_setall_u64(VAL) _mm_set_epi32((int)(VAL >> 32), (int)VAL, (int)(VAL >> 32), (int)VAL)
- #define npyv_setall_s64 npyv_setall_u64
-#else
- #define npyv_setall_u64(VAL) _mm_set1_epi64x(VAL)
- #define npyv_setall_s64(VAL) _mm_set1_epi64x(VAL)
-#endif
-#define npyv_setall_f32(VAL) _mm_set1_ps(VAL)
-#define npyv_setall_f64(VAL) _mm_set1_pd(VAL)
+#define npyv_setall_u8(VAL) _mm_set1_epi8((char)(VAL))
+#define npyv_setall_s8(VAL) _mm_set1_epi8((char)(VAL))
+#define npyv_setall_u16(VAL) _mm_set1_epi16((short)(VAL))
+#define npyv_setall_s16(VAL) _mm_set1_epi16((short)(VAL))
+#define npyv_setall_u32(VAL) _mm_set1_epi32((int)(VAL))
+#define npyv_setall_s32(VAL) _mm_set1_epi32((int)(VAL))
+#define npyv_setall_u64(VAL) _mm_set1_epi64x((npy_int64)(VAL))
+#define npyv_setall_s64(VAL) _mm_set1_epi64x((npy_int64)(VAL))
+#define npyv_setall_f32 _mm_set1_ps
+#define npyv_setall_f64 _mm_set1_pd
/**
* vector with specific values set to each lane and
diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h
index 7c4e32f27..123fcaf92 100644
--- a/numpy/core/src/common/simd/vsx/arithmetic.h
+++ b/numpy/core/src/common/simd/vsx/arithmetic.h
@@ -95,6 +95,138 @@
#define npyv_mul_f64 vec_mul
/***************************
+ * Integer Division
+ ***************************/
+/***
+ * TODO: Add support for VSX4 (Power10)
+ */
+// See simd/intdiv.h for more clarification
+// divide each unsigned 8-bit element by a precomputed divisor
+NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
+{
+ const npyv_u8 mergeo_perm = {
+ 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31
+ };
+ // high part of unsigned multiplication
+ npyv_u16 mul_even = vec_mule(a, divisor.val[0]);
+ npyv_u16 mul_odd = vec_mulo(a, divisor.val[0]);
+ npyv_u8 mulhi = (npyv_u8)vec_perm(mul_even, mul_odd, mergeo_perm);
+ // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
+ npyv_u8 q = vec_sub(a, mulhi);
+ q = vec_sr(q, divisor.val[1]);
+ q = vec_add(mulhi, q);
+ q = vec_sr(q, divisor.val[2]);
+ return q;
+}
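
The unsigned comment "floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2" is the classic round-down, two-shift method. As a hypothetical scalar sketch of the u8 case only (names invented here; it assumes divisor.val[] carries the multiplier m and the shift counts sh1/sh2 produced by npyv_divisor_u8):

    #include <stdint.h>

    // Scalar mirror of npyv_divc_u8 above: one widening multiply and two
    // shifts replace the per-lane division.
    static inline uint8_t scalar_divc_u8(uint8_t a, uint8_t m, unsigned sh1, unsigned sh2)
    {
        uint8_t mulhi = (uint8_t)(((uint16_t)a * m) >> 8);  // high half of a*m
        uint8_t q = (uint8_t)(a - mulhi);
        q >>= sh1;
        q = (uint8_t)(mulhi + q);
        return (uint8_t)(q >> sh2);                         // == a / d for the divisor baked into m
    }
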
+// divide each signed 8-bit element by a precomputed divisor
+NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor)
+{
+ const npyv_u8 mergeo_perm = {
+ 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31
+ };
+ // high part of signed multiplication
+ npyv_s16 mul_even = vec_mule(a, divisor.val[0]);
+ npyv_s16 mul_odd = vec_mulo(a, divisor.val[0]);
+ npyv_s8 mulhi = (npyv_s8)vec_perm(mul_even, mul_odd, mergeo_perm);
+ // q = ((a + mulhi) >> sh1) - XSIGN(a)
+ // trunc(a/d) = (q ^ dsign) - dsign
+ npyv_s8 q = vec_sra(vec_add(a, mulhi), (npyv_u8)divisor.val[1]);
+ q = vec_sub(q, vec_sra(a, npyv_setall_u8(7)));
+ q = vec_sub(vec_xor(q, divisor.val[2]), divisor.val[2]);
+ return q;
+}
+// divide each unsigned 16-bit element by a precomputed divisor
+NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor)
+{
+ const npyv_u8 mergeo_perm = {
+ 2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31
+ };
+ // high part of unsigned multiplication
+ npyv_u32 mul_even = vec_mule(a, divisor.val[0]);
+ npyv_u32 mul_odd = vec_mulo(a, divisor.val[0]);
+ npyv_u16 mulhi = (npyv_u16)vec_perm(mul_even, mul_odd, mergeo_perm);
+ // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
+ npyv_u16 q = vec_sub(a, mulhi);
+ q = vec_sr(q, divisor.val[1]);
+ q = vec_add(mulhi, q);
+ q = vec_sr(q, divisor.val[2]);
+ return q;
+}
+// divide each signed 16-bit element by a precomputed divisor (round towards zero)
+NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor)
+{
+ const npyv_u8 mergeo_perm = {
+ 2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31
+ };
+ // high part of signed multiplication
+ npyv_s32 mul_even = vec_mule(a, divisor.val[0]);
+ npyv_s32 mul_odd = vec_mulo(a, divisor.val[0]);
+ npyv_s16 mulhi = (npyv_s16)vec_perm(mul_even, mul_odd, mergeo_perm);
+ // q = ((a + mulhi) >> sh1) - XSIGN(a)
+ // trunc(a/d) = (q ^ dsign) - dsign
+ npyv_s16 q = vec_sra(vec_add(a, mulhi), (npyv_u16)divisor.val[1]);
+ q = vec_sub(q, vec_sra(a, npyv_setall_u16(15)));
+ q = vec_sub(vec_xor(q, divisor.val[2]), divisor.val[2]);
+ return q;
+}
+// divide each unsigned 32-bit element by a precomputed divisor
+NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor)
+{
+#if defined(__GNUC__) && __GNUC__ < 8
+ // Doubleword-wide integer multiplication (vec_mule/vec_mulo on words) requires GCC 8+; use inline assembly on older GCC
+ npyv_u64 mul_even, mul_odd;
+ __asm__ ("vmulouw %0,%1,%2" : "=v" (mul_even) : "v" (a), "v" (divisor.val[0]));
+ __asm__ ("vmuleuw %0,%1,%2" : "=v" (mul_odd) : "v" (a), "v" (divisor.val[0]));
+#else
+ // Doubleword integer wide multiplication supported by GCC 8+
+ npyv_u64 mul_even = vec_mule(a, divisor.val[0]);
+ npyv_u64 mul_odd = vec_mulo(a, divisor.val[0]);
+#endif
+ // high part of unsigned multiplication
+ npyv_u32 mulhi = vec_mergeo((npyv_u32)mul_even, (npyv_u32)mul_odd);
+ // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
+ npyv_u32 q = vec_sub(a, mulhi);
+ q = vec_sr(q, divisor.val[1]);
+ q = vec_add(mulhi, q);
+ q = vec_sr(q, divisor.val[2]);
+ return q;
+}
+// divide each signed 32-bit element by a precomputed divisor (round towards zero)
+NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor)
+{
+#if defined(__GNUC__) && __GNUC__ < 8
+ // Doubleword-wide integer multiplication (vec_mule/vec_mulo on words) requires GCC 8+; use inline assembly on older GCC
+ npyv_s64 mul_even, mul_odd;
+ __asm__ ("vmulosw %0,%1,%2" : "=v" (mul_even) : "v" (a), "v" (divisor.val[0]));
+ __asm__ ("vmulesw %0,%1,%2" : "=v" (mul_odd) : "v" (a), "v" (divisor.val[0]));
+#else
+ // Doubleword integer wide multiplication supported by GCC 8+
+ npyv_s64 mul_even = vec_mule(a, divisor.val[0]);
+ npyv_s64 mul_odd = vec_mulo(a, divisor.val[0]);
+#endif
+ // high part of signed multiplication
+ npyv_s32 mulhi = vec_mergeo((npyv_s32)mul_even, (npyv_s32)mul_odd);
+ // q = ((a + mulhi) >> sh1) - XSIGN(a)
+ // trunc(a/d) = (q ^ dsign) - dsign
+ npyv_s32 q = vec_sra(vec_add(a, mulhi), (npyv_u32)divisor.val[1]);
+ q = vec_sub(q, vec_sra(a, npyv_setall_u32(31)));
+ q = vec_sub(vec_xor(q, divisor.val[2]), divisor.val[2]);
+ return q;
+}
+// divide each unsigned 64-bit element by a precomputed divisor
+NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor)
+{
+ const npy_uint64 d = vec_extract(divisor.val[0], 0);
+ return npyv_set_u64(vec_extract(a, 0) / d, vec_extract(a, 1) / d);
+}
+// divide each signed 64-bit element by a precomputed divisor (round towards zero)
+NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
+{
+ npyv_b64 overflow = vec_and(vec_cmpeq(a, npyv_setall_s64(-1LL << 63)), (npyv_b64)divisor.val[1]);
+ npyv_s64 d = vec_sel(divisor.val[0], npyv_setall_s64(1), overflow);
+ return vec_div(a, d);
+}
+/***************************
* Division
***************************/
#define npyv_div_f32 vec_div
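
On the s64 fallback just above, the only input that can overflow vec_div is INT64_MIN / -1, which is why that lane's divisor is swapped for 1 (a/1 == INT64_MIN, matching the wraparound the other platforms produce); divisor.val[1] is presumably the precomputed "divisor == -1" mask. A minimal scalar equivalent of that guard, purely illustrative:

    #include <stdint.h>

    // Illustrative scalar version of the guarded division above.
    static inline int64_t scalar_divc_s64_guarded(int64_t a, int64_t d)
    {
        if (a == INT64_MIN && d == -1)
            return INT64_MIN;   // pin the single overflowing case to its wraparound value
        return a / d;           // C '/' already truncates towards zero
    }
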
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 1d1a111be..8306a86d3 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -733,6 +733,76 @@ class _SIMD_ALL(_Test_Utility):
div = self.div(vdata_a, vdata_b)
assert div == data_div
+ def test_arithmetic_intdiv(self):
+ """
+ Test integer division intrinsics:
+ npyv_divisor_##sfx
+ npyv_divc_##sfx
+ """
+ if self._is_fp():
+ return
+
+ def trunc_div(a, d):
+ """
+ Divide towards zero (like int(a/d)), but exact for large
+ integers > 2^53 where float division would lose precision
+ """
+ sign_a, sign_d = a < 0, d < 0
+ if a == 0 or sign_a == sign_d:
+ return a // d
+ return (a + sign_d - sign_a) // d + 1
+
+ int_min = self._int_min() if self._is_signed() else 1
+ int_max = self._int_max()
+ rdata = (
+ 0, 1, self.nlanes, int_max-self.nlanes,
+ int_min, int_min//2 + 1
+ )
+ divisors = (1, 2, self.nlanes, int_min, int_max, int_max//2)
+
+ for x, d in zip(rdata, divisors):
+ data = self._data(x)
+ vdata = self.load(data)
+ data_divc = [trunc_div(a, d) for a in data]
+ divisor = self.divisor(d)
+ divc = self.divc(vdata, divisor)
+ assert divc == data_divc
+
+ if not self._is_signed():
+ return
+
+ safe_neg = lambda x: -x-1 if -x > int_max else -x
+ # test rounding (truncation towards zero) of signed integer division
+ for x, d in zip(rdata, divisors):
+ d_neg = safe_neg(d)
+ data = self._data(x)
+ data_neg = [safe_neg(a) for a in data]
+ vdata = self.load(data)
+ vdata_neg = self.load(data_neg)
+ divisor = self.divisor(d)
+ divisor_neg = self.divisor(d_neg)
+
+ # round towards zero
+ data_divc = [trunc_div(a, d_neg) for a in data]
+ divc = self.divc(vdata, divisor_neg)
+ assert divc == data_divc
+ data_divc = [trunc_div(a, d) for a in data_neg]
+ divc = self.divc(vdata_neg, divisor)
+ assert divc == data_divc
+
+ # test that a zero dividend stays zero, with no sign carried over from a negative divisor
+ vzero = self.zero()
+ for d in (-1, -10, -100, int_min//2, int_min):
+ divisor = self.divisor(d)
+ divc = self.divc(vzero, divisor)
+ assert divc == vzero
+
+ # test overflow
+ vmin = self.setall(int_min)
+ divisor = self.divisor(-1)
+ divc = self.divc(vmin, divisor)
+ assert divc == vmin
+
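
The rounding the test expects from npyv_divc_##sfx is that of native C integer division, which trunc_div re-creates in Python (Python's // floors instead: -7 // 2 == -4, while the intrinsics must return -3). A minimal scalar reference of the expected results (names invented here), including the INT_MIN / -1 wraparound checked at the end of the test:

    #include <stdint.h>

    // Expected per-lane result: truncation towards zero, with the one
    // overflowing signed case pinned to its wraparound value.
    static inline int64_t expected_divc(int64_t a, int64_t d)
    {
        if (a == INT64_MIN && d == -1)
            return INT64_MIN;
        return a / d;
    }
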
def test_arithmetic_reduce_sum(self):
"""
Test reduce sum intrinsics: