diff options
author | Matti Picus <matti.picus@gmail.com> | 2021-03-08 09:24:50 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-03-08 09:24:50 +0200 |
commit | df2cd0f7ef1dba763039514fc11669d9ff505c2d (patch) | |
tree | ab626376b0ee8be13a97d6615063160d8b4aaee8 | |
parent | 0cea1d9b585ebe9895c898c60f05572840579c89 (diff) | |
parent | 8aae310556359517394df263f3b5819eff8c3daa (diff) | |
download | numpy-df2cd0f7ef1dba763039514fc11669d9ff505c2d.tar.gz |
Merge pull request #18178 from seiko2plus/npyv_fast_div
SIMD: add fast integer division intrinsics for all supported platforms
-rw-r--r-- | numpy/core/src/_simd/_simd.dispatch.c.src | 19 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx2/arithmetic.h | 160 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx512/arithmetic.h | 224 | ||||
-rw-r--r-- | numpy/core/src/common/simd/intdiv.h | 473 | ||||
-rw-r--r-- | numpy/core/src/common/simd/neon/arithmetic.h | 150 | ||||
-rw-r--r-- | numpy/core/src/common/simd/simd.h | 5 | ||||
-rw-r--r-- | numpy/core/src/common/simd/sse/arithmetic.h | 177 | ||||
-rw-r--r-- | numpy/core/src/common/simd/sse/misc.h | 25 | ||||
-rw-r--r-- | numpy/core/src/common/simd/vsx/arithmetic.h | 132 | ||||
-rw-r--r-- | numpy/core/tests/test_simd.py | 70 |
10 files changed, 1411 insertions, 24 deletions
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index e5b58a8d2..4acd20a75 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -15,8 +15,8 @@ /**begin repeat * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64# - * #esfx = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64# - * #expand_sup =1, 0, 1, 0, 0, 0, 0, 0, 0, 0# + * #esfx = u16, s8, u32,s16, u32, s32, u64, s64, f32, f64# + * #expand_sup= 1, 0, 1, 0, 0, 0, 0, 0, 0, 0# * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64# * #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0# @@ -27,6 +27,7 @@ * #sum_sup = 0, 0, 0, 0, 1, 0, 1, 0, 1, 1# * #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0# * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1# + * #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0# * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0# * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0# */ @@ -354,6 +355,11 @@ SIMD_IMPL_INTRIN_2(mul_@sfx@, v@sfx@, v@sfx@, v@sfx@) SIMD_IMPL_INTRIN_2(div_@sfx@, v@sfx@, v@sfx@, v@sfx@) #endif // div_sup +#if @intdiv_sup@ +SIMD_IMPL_INTRIN_1(divisor_@sfx@, v@sfx@x3, @sfx@) +SIMD_IMPL_INTRIN_2(divc_@sfx@, v@sfx@, v@sfx@, v@sfx@x3) +#endif // intdiv_sup + #if @fused_sup@ /**begin repeat1 * #intrin = muladd, mulsub, nmuladd, nmulsub# @@ -442,6 +448,7 @@ SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@) SIMD_IMPL_INTRIN_1(tobits_@bsfx@, u64, v@bsfx@) /**end repeat**/ + //######################################################################### //## Attach module functions //######################################################################### @@ -449,7 +456,7 @@ static PyMethodDef simd__intrinsics_methods[] = { /**begin repeat * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64# - * #esfx = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64# + * #esfx 
= u16, s8, u32,s16, u32, s32, u64, s64, f32, f64# * #expand_sup =1, 0, 1, 0, 0, 0, 0, 0, 0, 0# * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64# * #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# @@ -461,6 +468,7 @@ static PyMethodDef simd__intrinsics_methods[] = { * #sum_sup = 0, 0, 0, 0, 1, 0, 1, 0, 1, 1# * #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0# * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1# + * #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0# * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0# * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0# */ @@ -568,6 +576,11 @@ SIMD_INTRIN_DEF(mul_@sfx@) SIMD_INTRIN_DEF(div_@sfx@) #endif // div_sup +#if @intdiv_sup@ +SIMD_INTRIN_DEF(divisor_@sfx@) +SIMD_INTRIN_DEF(divc_@sfx@) +#endif // intdiv_sup + #if @fused_sup@ /**begin repeat1 * #intrin = muladd, mulsub, nmuladd, nmulsub# diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h index 4b8258759..7cd5a0ea6 100644 --- a/numpy/core/src/common/simd/avx2/arithmetic.h +++ b/numpy/core/src/common/simd/avx2/arithmetic.h @@ -67,6 +67,164 @@ // TODO: after implment Packs intrins /*************************** + * Integer Division + ***************************/ +// See simd/intdiv.h for more clarification +// divide each unsigned 8-bit element by a precomputed divisor +NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) +{ + const __m256i bmask = _mm256_set1_epi32(0xFF00FF00); + const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]); + const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]); + const __m256i shf1b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1)); + const __m256i shf2b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2)); + // high part of unsigned multiplication + __m256i mulhi_odd = _mm256_mulhi_epu16(a, divisor.val[0]); + __m256i mulhi_even = _mm256_mulhi_epu16(_mm256_slli_epi16(a, 8), divisor.val[0]); + mulhi_even = _mm256_srli_epi16(mulhi_even, 8); + __m256i mulhi = _mm256_blendv_epi8(mulhi_even, 
mulhi_odd, bmask); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m256i q = _mm256_sub_epi8(a, mulhi); + q = _mm256_and_si256(_mm256_srl_epi16(q, shf1), shf1b); + q = _mm256_add_epi8(mulhi, q); + q = _mm256_and_si256(_mm256_srl_epi16(q, shf2), shf2b); + return q; +} +// divide each signed 8-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor); +NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor) +{ + const __m256i bmask = _mm256_set1_epi32(0x00FF00FF); + // instead of _mm256_cvtepi8_epi16/_mm256_packs_epi16 to wrap around overflow + __m256i divc_even = npyv_divc_s16(_mm256_srai_epi16(_mm256_slli_epi16(a, 8), 8), divisor); + __m256i divc_odd = npyv_divc_s16(_mm256_srai_epi16(a, 8), divisor); + divc_odd = _mm256_slli_epi16(divc_odd, 8); + return _mm256_blendv_epi8(divc_odd, divc_even, bmask); +} +// divide each unsigned 16-bit element by a precomputed divisor +NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor) +{ + const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]); + const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]); + // high part of unsigned multiplication + __m256i mulhi = _mm256_mulhi_epu16(a, divisor.val[0]); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m256i q = _mm256_sub_epi16(a, mulhi); + q = _mm256_srl_epi16(q, shf1); + q = _mm256_add_epi16(mulhi, q); + q = _mm256_srl_epi16(q, shf2); + return q; +} +// divide each signed 16-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor) +{ + const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]); + // high part of signed multiplication + __m256i mulhi = _mm256_mulhi_epi16(a, divisor.val[0]); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + __m256i q = _mm256_sra_epi16(_mm256_add_epi16(a, mulhi), shf1); + q = _mm256_sub_epi16(q, 
_mm256_srai_epi16(a, 15)); + q = _mm256_sub_epi16(_mm256_xor_si256(q, divisor.val[2]), divisor.val[2]); + return q; +} +// divide each unsigned 32-bit element by a precomputed divisor +NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor) +{ + const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]); + const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]); + // high part of unsigned multiplication + __m256i mulhi_even = _mm256_srli_epi64(_mm256_mul_epu32(a, divisor.val[0]), 32); + __m256i mulhi_odd = _mm256_mul_epu32(_mm256_srli_epi64(a, 32), divisor.val[0]); + __m256i mulhi = _mm256_blend_epi32(mulhi_even, mulhi_odd, 0xAA); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m256i q = _mm256_sub_epi32(a, mulhi); + q = _mm256_srl_epi32(q, shf1); + q = _mm256_add_epi32(mulhi, q); + q = _mm256_srl_epi32(q, shf2); + return q; +} +// divide each signed 32-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor) +{ + const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]); + // high part of signed multiplication + __m256i mulhi_even = _mm256_srli_epi64(_mm256_mul_epi32(a, divisor.val[0]), 32); + __m256i mulhi_odd = _mm256_mul_epi32(_mm256_srli_epi64(a, 32), divisor.val[0]); + __m256i mulhi = _mm256_blend_epi32(mulhi_even, mulhi_odd, 0xAA); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + __m256i q = _mm256_sra_epi32(_mm256_add_epi32(a, mulhi), shf1); + q = _mm256_sub_epi32(q, _mm256_srai_epi32(a, 31)); + q = _mm256_sub_epi32(_mm256_xor_si256(q, divisor.val[2]), divisor.val[2]); + return q; +} +// returns the high 64 bits of unsigned 64-bit multiplication +// xref https://stackoverflow.com/a/28827013 +NPY_FINLINE npyv_u64 npyv__mullhi_u64(npyv_u64 a, npyv_u64 b) +{ + __m256i lomask = npyv_setall_s64(0xffffffff); + __m256i a_hi = _mm256_srli_epi64(a, 32); // a0l, a0h, a1l, a1h + __m256i b_hi = _mm256_srli_epi64(b, 32); // b0l, 
b0h, b1l, b1h + // compute partial products + __m256i w0 = _mm256_mul_epu32(a, b); // a0l*b0l, a1l*b1l + __m256i w1 = _mm256_mul_epu32(a, b_hi); // a0l*b0h, a1l*b1h + __m256i w2 = _mm256_mul_epu32(a_hi, b); // a0h*b0l, a1h*b0l + __m256i w3 = _mm256_mul_epu32(a_hi, b_hi); // a0h*b0h, a1h*b1h + // sum partial products + __m256i w0h = _mm256_srli_epi64(w0, 32); + __m256i s1 = _mm256_add_epi64(w1, w0h); + __m256i s1l = _mm256_and_si256(s1, lomask); + __m256i s1h = _mm256_srli_epi64(s1, 32); + + __m256i s2 = _mm256_add_epi64(w2, s1l); + __m256i s2h = _mm256_srli_epi64(s2, 32); + + __m256i hi = _mm256_add_epi64(w3, s1h); + hi = _mm256_add_epi64(hi, s2h); + return hi; +} +// divide each unsigned 64-bit element by a divisor +NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor) +{ + const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]); + const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]); + // high part of unsigned multiplication + __m256i mulhi = npyv__mullhi_u64(a, divisor.val[0]); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m256i q = _mm256_sub_epi64(a, mulhi); + q = _mm256_srl_epi64(q, shf1); + q = _mm256_add_epi64(mulhi, q); + q = _mm256_srl_epi64(q, shf2); + return q; +} +// divide each unsigned 64-bit element by a divisor (round towards zero) +NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) +{ + const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]); + // high part of unsigned multiplication + __m256i mulhi = npyv__mullhi_u64(a, divisor.val[0]); + // convert unsigned to signed high multiplication + // mulhi - ((a < 0) ? m : 0) - ((m < 0) ? 
a : 0); + __m256i asign = _mm256_cmpgt_epi64(_mm256_setzero_si256(), a); + __m256i msign = _mm256_cmpgt_epi64(_mm256_setzero_si256(), divisor.val[0]); + __m256i m_asign = _mm256_and_si256(divisor.val[0], asign); + __m256i a_msign = _mm256_and_si256(a, msign); + mulhi = _mm256_sub_epi64(mulhi, m_asign); + mulhi = _mm256_sub_epi64(mulhi, a_msign); + // q = (a + mulhi) >> sh + __m256i q = _mm256_add_epi64(a, mulhi); + // emulate arithmetic right shift + const __m256i sigb = npyv_setall_s64(1LL << 63); + q = _mm256_srl_epi64(_mm256_add_epi64(q, sigb), shf1); + q = _mm256_sub_epi64(q, _mm256_srl_epi64(sigb, shf1)); + // q = q - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + q = _mm256_sub_epi64(q, asign); + q = _mm256_sub_epi64(_mm256_xor_si256(q, divisor.val[2]), divisor.val[2]); + return q; +} +/*************************** * Division ***************************/ // TODO: emulate integer division @@ -176,5 +334,3 @@ NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a) } #endif // _NPY_SIMD_AVX2_ARITHMETIC_H - - diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h index 450da7ea5..29e1af7e8 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -107,6 +107,226 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) // TODO: after implment Packs intrins /*************************** + * Integer Division + ***************************/ +// See simd/intdiv.h for more clarification +// divide each unsigned 8-bit element by divisor +NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) +{ + const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]); + const __m128i shf2 = _mm512_castsi512_si128(divisor.val[2]); +#ifdef NPY_HAVE_AVX512BW + const __m512i shf1b = _mm512_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1)); + const __m512i shf2b = _mm512_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2)); + // high part of unsigned multiplication + __m512i mulhi_odd 
= _mm512_mulhi_epu16(a, divisor.val[0]); + __m512i mulhi_even = _mm512_mulhi_epu16(_mm512_slli_epi16(a, 8), divisor.val[0]); + mulhi_even = _mm512_srli_epi16(mulhi_even, 8); + __m512i mulhi = _mm512_mask_mov_epi8(mulhi_even, 0xAAAAAAAAAAAAAAAA, mulhi_odd); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m512i q = _mm512_sub_epi8(a, mulhi); + q = _mm512_and_si512(_mm512_srl_epi16(q, shf1), shf1b); + q = _mm512_add_epi8(mulhi, q); + q = _mm512_and_si512(_mm512_srl_epi16(q, shf2), shf2b); + return q; +#else + const __m256i bmask = _mm256_set1_epi32(0xFF00FF00); + const __m256i shf1b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1)); + const __m256i shf2b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2)); + const __m512i shf2bw= npyv512_combine_si256(shf2b, shf2b); + const __m256i mulc = npyv512_lower_si256(divisor.val[0]); + //// lower 256-bit + __m256i lo_a = npyv512_lower_si256(a); + // high part of unsigned multiplication + __m256i mulhi_odd = _mm256_mulhi_epu16(lo_a, mulc); + __m256i mulhi_even = _mm256_mulhi_epu16(_mm256_slli_epi16(lo_a, 8), mulc); + mulhi_even = _mm256_srli_epi16(mulhi_even, 8); + __m256i mulhi = _mm256_blendv_epi8(mulhi_even, mulhi_odd, bmask); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m256i lo_q = _mm256_sub_epi8(lo_a, mulhi); + lo_q = _mm256_and_si256(_mm256_srl_epi16(lo_q, shf1), shf1b); + lo_q = _mm256_add_epi8(mulhi, lo_q); + lo_q = _mm256_srl_epi16(lo_q, shf2); // no sign extend + + //// higher 256-bit + __m256i hi_a = npyv512_higher_si256(a); + // high part of unsigned multiplication + mulhi_odd = _mm256_mulhi_epu16(hi_a, mulc); + mulhi_even = _mm256_mulhi_epu16(_mm256_slli_epi16(hi_a, 8), mulc); + mulhi_even = _mm256_srli_epi16(mulhi_even, 8); + mulhi = _mm256_blendv_epi8(mulhi_even, mulhi_odd, bmask); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m256i hi_q = _mm256_sub_epi8(hi_a, mulhi); + hi_q = _mm256_and_si256(_mm256_srl_epi16(hi_q, shf1), shf1b); + hi_q = _mm256_add_epi8(mulhi, hi_q); + 
hi_q = _mm256_srl_epi16(hi_q, shf2); // no sign extend + return _mm512_and_si512(npyv512_combine_si256(lo_q, hi_q), shf2bw); // extend sign +#endif +} +// divide each signed 8-bit element by divisor (round towards zero) +NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor); +NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor) +{ + __m512i divc_even = npyv_divc_s16(npyv_shri_s16(npyv_shli_s16(a, 8), 8), divisor); + __m512i divc_odd = npyv_divc_s16(npyv_shri_s16(a, 8), divisor); + divc_odd = npyv_shli_s16(divc_odd, 8); +#ifdef NPY_HAVE_AVX512BW + return _mm512_mask_mov_epi8(divc_even, 0xAAAAAAAAAAAAAAAA, divc_odd); +#else + const __m512i bmask = _mm512_set1_epi32(0x00FF00FF); + return npyv_select_u8(bmask, divc_even, divc_odd); +#endif +} +// divide each unsigned 16-bit element by divisor +NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor) +{ + const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]); + const __m128i shf2 = _mm512_castsi512_si128(divisor.val[2]); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + #define NPYV__DIVC_U16(RLEN, A, MULC, R) \ + mulhi = _mm##RLEN##_mulhi_epu16(A, MULC); \ + R = _mm##RLEN##_sub_epi16(A, mulhi); \ + R = _mm##RLEN##_srl_epi16(R, shf1); \ + R = _mm##RLEN##_add_epi16(mulhi, R); \ + R = _mm##RLEN##_srl_epi16(R, shf2); + +#ifdef NPY_HAVE_AVX512BW + __m512i mulhi, q; + NPYV__DIVC_U16(512, a, divisor.val[0], q) + return q; +#else + const __m256i m = npyv512_lower_si256(divisor.val[0]); + __m256i lo_a = npyv512_lower_si256(a); + __m256i hi_a = npyv512_higher_si256(a); + + __m256i mulhi, lo_q, hi_q; + NPYV__DIVC_U16(256, lo_a, m, lo_q) + NPYV__DIVC_U16(256, hi_a, m, hi_q) + return npyv512_combine_si256(lo_q, hi_q); +#endif + #undef NPYV__DIVC_U16 +} +// divide each signed 16-bit element by divisor (round towards zero) +NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor) +{ + const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]); + // q = 
((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + #define NPYV__DIVC_S16(RLEN, A, MULC, DSIGN, R) \ + mulhi = _mm##RLEN##_mulhi_epi16(A, MULC); \ + R = _mm##RLEN##_sra_epi16(_mm##RLEN##_add_epi16(A, mulhi), shf1); \ + R = _mm##RLEN##_sub_epi16(R, _mm##RLEN##_srai_epi16(A, 15)); \ + R = _mm##RLEN##_sub_epi16(_mm##RLEN##_xor_si##RLEN(R, DSIGN), DSIGN); + +#ifdef NPY_HAVE_AVX512BW + __m512i mulhi, q; + NPYV__DIVC_S16(512, a, divisor.val[0], divisor.val[2], q) + return q; +#else + const __m256i m = npyv512_lower_si256(divisor.val[0]); + const __m256i dsign = npyv512_lower_si256(divisor.val[2]); + __m256i lo_a = npyv512_lower_si256(a); + __m256i hi_a = npyv512_higher_si256(a); + + __m256i mulhi, lo_q, hi_q; + NPYV__DIVC_S16(256, lo_a, m, dsign, lo_q) + NPYV__DIVC_S16(256, hi_a, m, dsign, hi_q) + return npyv512_combine_si256(lo_q, hi_q); +#endif + #undef NPYV__DIVC_S16 +} +// divide each unsigned 32-bit element by divisor +NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor) +{ + const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]); + const __m128i shf2 = _mm512_castsi512_si128(divisor.val[2]); + // high part of unsigned multiplication + __m512i mulhi_even = _mm512_srli_epi64(_mm512_mul_epu32(a, divisor.val[0]), 32); + __m512i mulhi_odd = _mm512_mul_epu32(_mm512_srli_epi64(a, 32), divisor.val[0]); + __m512i mulhi = _mm512_mask_mov_epi32(mulhi_even, 0xAAAA, mulhi_odd); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m512i q = _mm512_sub_epi32(a, mulhi); + q = _mm512_srl_epi32(q, shf1); + q = _mm512_add_epi32(mulhi, q); + q = _mm512_srl_epi32(q, shf2); + return q; +} +// divide each signed 32-bit element by divisor (round towards zero) +NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor) +{ + const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]); + // high part of signed multiplication + __m512i mulhi_even = _mm512_srli_epi64(_mm512_mul_epi32(a, divisor.val[0]), 32); + __m512i mulhi_odd = 
_mm512_mul_epi32(_mm512_srli_epi64(a, 32), divisor.val[0]); + __m512i mulhi = _mm512_mask_mov_epi32(mulhi_even, 0xAAAA, mulhi_odd); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + __m512i q = _mm512_sra_epi32(_mm512_add_epi32(a, mulhi), shf1); + q = _mm512_sub_epi32(q, _mm512_srai_epi32(a, 31)); + q = _mm512_sub_epi32(_mm512_xor_si512(q, divisor.val[2]), divisor.val[2]); + return q; +} +// returns the high 64 bits of unsigned 64-bit multiplication +// xref https://stackoverflow.com/a/28827013 +NPY_FINLINE npyv_u64 npyv__mullhi_u64(npyv_u64 a, npyv_u64 b) +{ + __m512i lomask = npyv_setall_s64(0xffffffff); + __m512i a_hi = _mm512_srli_epi64(a, 32); // a0l, a0h, a1l, a1h + __m512i b_hi = _mm512_srli_epi64(b, 32); // b0l, b0h, b1l, b1h + // compute partial products + __m512i w0 = _mm512_mul_epu32(a, b); // a0l*b0l, a1l*b1l + __m512i w1 = _mm512_mul_epu32(a, b_hi); // a0l*b0h, a1l*b1h + __m512i w2 = _mm512_mul_epu32(a_hi, b); // a0h*b0l, a1h*b0l + __m512i w3 = _mm512_mul_epu32(a_hi, b_hi); // a0h*b0h, a1h*b1h + // sum partial products + __m512i w0h = _mm512_srli_epi64(w0, 32); + __m512i s1 = _mm512_add_epi64(w1, w0h); + __m512i s1l = _mm512_and_si512(s1, lomask); + __m512i s1h = _mm512_srli_epi64(s1, 32); + + __m512i s2 = _mm512_add_epi64(w2, s1l); + __m512i s2h = _mm512_srli_epi64(s2, 32); + + __m512i hi = _mm512_add_epi64(w3, s1h); + hi = _mm512_add_epi64(hi, s2h); + return hi; +} +// divide each unsigned 64-bit element by a divisor +NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor) +{ + const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]); + const __m128i shf2 = _mm512_castsi512_si128(divisor.val[2]); + // high part of unsigned multiplication + __m512i mulhi = npyv__mullhi_u64(a, divisor.val[0]); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m512i q = _mm512_sub_epi64(a, mulhi); + q = _mm512_srl_epi64(q, shf1); + q = _mm512_add_epi64(mulhi, q); + q = _mm512_srl_epi64(q, shf2); + return q; +} 
+// divide each unsigned 64-bit element by a divisor (round towards zero) +NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) +{ + const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]); + // high part of unsigned multiplication + __m512i mulhi = npyv__mullhi_u64(a, divisor.val[0]); + // convert unsigned to signed high multiplication + // mulhi - ((a < 0) ? m : 0) - ((m < 0) ? a : 0); + __m512i asign = _mm512_srai_epi64(a, 63); + __m512i msign = _mm512_srai_epi64(divisor.val[0], 63); + __m512i m_asign = _mm512_and_si512(divisor.val[0], asign); + __m512i a_msign = _mm512_and_si512(a, msign); + mulhi = _mm512_sub_epi64(mulhi, m_asign); + mulhi = _mm512_sub_epi64(mulhi, a_msign); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + __m512i q = _mm512_sra_epi64(_mm512_add_epi64(a, mulhi), shf1); + q = _mm512_sub_epi64(q, asign); + q = _mm512_sub_epi64(_mm512_xor_si512(q, divisor.val[2]), divisor.val[2]); + return q; +} +/*************************** * Division ***************************/ // TODO: emulate integer division @@ -136,11 +356,11 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) * 2- shuff(cross) /add /shuff(cross) /add /shuff /add /shuff /add /extract * 3- _mm512_reduce_add_ps/pd * The first one is been widely used by many projects - * + * * the second one is used by Intel Compiler, maybe because the * latency of hadd increased by (2-3) starting from Skylake-X which makes two * extra shuffles(non-cross) cheaper. check https://godbolt.org/z/s3G9Er for more info. - * + * * The third one is almost the same as the second one but only works for * intel compiler/GCC 7.1/Clang 4, we still need to support older GCC. 
***************************/ diff --git a/numpy/core/src/common/simd/intdiv.h b/numpy/core/src/common/simd/intdiv.h new file mode 100644 index 000000000..a6c293d87 --- /dev/null +++ b/numpy/core/src/common/simd/intdiv.h @@ -0,0 +1,473 @@ +/** + * This header implements `npyv_divisor_*` intrinsics used for computing the parameters + * of fast integer division, while division intrinsics `npyv_divc_*` are defined in + * {extension}/arithmetic.h. + */ +#ifndef NPY_SIMD + #error "Not a standalone header, use simd/simd.h instead" +#endif +#ifndef _NPY_SIMD_INTDIV_H +#define _NPY_SIMD_INTDIV_H +/********************************************************************************** + ** Integer division + ********************************************************************************** + * Almost all architectures (except Power10) don't support integer vector division, + * also the cost of scalar division in architectures like x86 is so high that it can take + * 30 to 40 cycles on modern chips and up to 100 on old ones. + * + * Therefore we are using division by multiplying with precomputed reciprocal technique, + * the method that has been used in this implementation is based on T. Granlund and P. L. Montgomery + * “Division by invariant integers using multiplication” (see [Figure 4.1, 5.1] + * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556) + * + * It shows a good impact for all architectures especially on X86, + * however computing divisor parameters is kind of expensive so this implementation + * should only be used when the divisor is a scalar and used multiple times. + * + * The division process is separated into two intrinsics for each data type + * + * 1- npyv_{dtype}x3 npyv_divisor_{dtype} ({dtype} divisor); + * For computing the divisor parameters (multiplier + shifters + sign of divisor(signed only)) + * + * 2- npyv_{dtype} npyv_divc_{dtype} (npyv_{dtype} dividend, npyv_{dtype}x3 divisor_parms); + * For performing the final division.
+ * + ** For example: + * int vstep = npyv_nlanes_s32; // number of lanes + * int x = 0x6e70; + * npyv_s32x3 divisor = npyv_divisor_s32(x); // init divisor params + * for (; len >= vstep; src += vstep, dst += vstep, len -= vstep) { + * npyv_s32 a = npyv_load_s32(*src); // load s32 vector from memory + * a = npyv_divc_s32(a, divisor); // divide all elements by x + * npyv_store_s32(dst, a); // store s32 vector into memory + * } + * + ** NOTES: + * - For 64-bit division on Aarch64 and IBM/Power, we fall back to the scalar division + * since emulating multiply-high is expensive and both architectures have very fast dividers. + * + ** TODO: + * - Add support for Power10(VSX4) + * + *************************************************************** + ** Figure 4.1: Unsigned division by run–time invariant divisor + *************************************************************** + * Initialization (given uword d with 1 ≤ d < 2^N): + * int l = ceil(log2(d)); + * uword m = 2^N * (2^l− d) / d + 1; + * int sh1 = min(l, 1); + * int sh2 = max(l − 1, 0); + * + * For q = FLOOR(a/d), all uword: + * uword t1 = MULUH(m, a); + * q = SRL(t1 + SRL(a − t1, sh1), sh2); + * + ************************************************************************************ + ** Figure 5.1: Signed division by run–time invariant divisor, rounded towards zero + ************************************************************************************ + * Initialization (given constant sword d with d !=0): + * int l = max(ceil(log2(abs(d))), 1); + * udword m0 = 1 + (2^(N+l-1)) / abs(d); + * sword m = m0 − 2^N; + * sword dsign = XSIGN(d); + * int sh = l − 1; + * + * For q = TRUNC(a/d), all sword: + * sword q0 = a + MULSH(m, a); + * q0 = SRA(q0, sh) − XSIGN(a); + * q = EOR(q0, dsign) − dsign; + */ +/** + * bit-scan reverse for non-zeros. returns the index of the highest set bit.
+ * equivalent to floor(log2(a)) + */ +#ifdef _MSC_VER + #include <intrin.h> // _BitScanReverse +#endif +NPY_FINLINE unsigned npyv__bitscan_revnz_u32(npy_uint32 a) +{ + assert(a > 0); // due to use __builtin_clz + unsigned r; +#if defined(NPY_HAVE_SSE2) && defined(_MSC_VER) + unsigned long rl; + (void)_BitScanReverse(&rl, (unsigned long)a); + r = (unsigned)rl; +#elif defined(NPY_HAVE_SSE2) && (defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)) + __asm__("bsr %1, %0" : "=r" (r) : "r"(a)); +#elif defined(__GNUC__) || defined(__clang__) + r = 31 - __builtin_clz(a); // performs on arm -> clz, ppc -> cntlzw +#else + r = 0; + while (a >>= 1) { + r++; + } +#endif + return r; +} +NPY_FINLINE unsigned npyv__bitscan_revnz_u64(npy_uint64 a) +{ + assert(a > 0); // due to use __builtin_clzll +#if defined(_M_AMD64) && defined(_MSC_VER) + unsigned long rl; + (void)_BitScanReverse64(&rl, a); + return (unsigned)rl; +#elif defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)) + npy_uint64 r; + __asm__("bsrq %1, %0" : "=r"(r) : "r"(a)); + return (unsigned)r; +#elif defined(__GNUC__) || defined(__clang__) + return 63 - __builtin_clzll(a); +#else + npy_uint64 a_hi = a >> 32; + if (a_hi == 0) { + return npyv__bitscan_revnz_u32((npy_uint32)a); + } + return 32 + npyv__bitscan_revnz_u32((npy_uint32)a_hi); +#endif +} +/** + * Divides 128-bit unsigned integer by a 64-bit when the lower + * 64-bit of the dividend is zero. + * + * This function is needed to calculate the multiplier of 64-bit integer division + * see npyv_divisor_u64/npyv_divisor_s64. 
+ */ +NPY_FINLINE npy_uint64 npyv__divh128_u64(npy_uint64 high, npy_uint64 divisor) +{ + assert(divisor > 1); + npy_uint64 quotient; +#if defined(_M_X64) && defined(_MSC_VER) && _MSC_VER >= 1920 + npy_uint64 remainder; + quotient = _udiv128(high, 0, divisor, &remainder); + (void)remainder; +#elif defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)) + __asm__("divq %[d]" : "=a"(quotient) : [d] "r"(divisor), "a"(0), "d"(high)); +#elif defined(__SIZEOF_INT128__) + quotient = (npy_uint64)((((__uint128_t)high) << 64) / divisor); +#else + /** + * Minified version based on Donald Knuth’s Algorithm D (Division of nonnegative integers), + * and Generic implementation in Hacker’s Delight. + * + * See https://skanthak.homepage.t-online.de/division.html + * with respect to the license of the Hacker's Delight book + * (https://web.archive.org/web/20190408122508/http://www.hackersdelight.org/permissions.htm) + */ + // shift amount for normalize + unsigned ldz = 63 - npyv__bitscan_revnz_u64(divisor); + // normalize divisor + divisor <<= ldz; + high <<= ldz; + // break divisor up into two 32-bit digits + npy_uint32 divisor_hi = divisor >> 32; + npy_uint32 divisor_lo = divisor & 0xFFFFFFFF; + // compute high quotient digit + npy_uint32 quotient_hi = (npy_uint32)(high / divisor_hi); + npy_uint64 remainder = high - divisor_hi * quotient_hi; + npy_uint64 base32 = 1ULL << 32; + while (quotient_hi >= base32 || quotient_hi*divisor_lo > base32*remainder) { + remainder += --divisor_hi; + if (remainder >= base32) { + break; + } + } + // compute dividend digit pairs + npy_uint64 dividend_pairs = base32*high - divisor*quotient_hi; + // compute second quotient digit for lower zeros + npy_uint32 quotient_lo = (npy_uint32)(dividend_pairs / divisor_hi); + quotient = base32*quotient_hi + quotient_lo; +#endif + return quotient; +} +// Initializing divisor parameters for unsigned 8-bit division +NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d) +{ + unsigned 
l, l2, sh1, sh2, m; + switch (d) { + case 0: // LCOV_EXCL_LINE + // for potential divide by zero, On x86 GCC inserts `ud2` instruction + // instead of letting the HW/CPU trap it which leads to illegal instruction exception. + // 'volatile' should suppress this behavior and allow us to raise HW/CPU + // arithmetic exception. + m = sh1 = sh2 = 1 / ((npy_uint8 volatile *)&d)[0]; + break; + case 1: + m = 1; sh1 = sh2 = 0; + break; + case 2: + m = 1; sh1 = 1; sh2 = 0; + break; + default: + l = npyv__bitscan_revnz_u32(d - 1) + 1; // ceil(log2(d)) + l2 = (npy_uint8)(1 << l); // 2^l, overflow to 0 if l = 8 + m = ((l2 - d) << 8) / d + 1; // multiplier + sh1 = 1; sh2 = l - 1; // shift counts + } + npyv_u8x3 divisor; + divisor.val[0] = npyv_setall_u8(m); +#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 + divisor.val[1] = npyv_set_u8(sh1); + divisor.val[2] = npyv_set_u8(sh2); +#elif defined(NPY_HAVE_VSX2) + divisor.val[1] = npyv_setall_u8(sh1); + divisor.val[2] = npyv_setall_u8(sh2); +#elif defined(NPY_HAVE_NEON) + divisor.val[1] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh1)); + divisor.val[2] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh2)); +#else + #error "please initialize the shifting operand for the new architecture" +#endif + return divisor; +} +// Initializing divisor parameters for signed 8-bit division +NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d); +NPY_FINLINE npyv_s8x3 npyv_divisor_s8(npy_int8 d) +{ +#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 + npyv_s16x3 p = npyv_divisor_s16(d); + npyv_s8x3 r; + r.val[0] = npyv_reinterpret_s8_s16(p.val[0]); + r.val[1] = npyv_reinterpret_s8_s16(p.val[1]); + r.val[2] = npyv_reinterpret_s8_s16(p.val[2]); + return r; +#else + int d1 = abs(d); + int sh, m; + if (d1 > 1) { + sh = (int)npyv__bitscan_revnz_u32(d1-1); // ceil(log2(abs(d))) - 1 + m = (1 << (8 + sh)) / d1 + 1; // multiplier + } + else if (d1 == 1) { + sh = 0; m = 1; + } + else { + // raise arithmetic exception for d == 0 + sh = m = 1 / ((npy_int8 volatile *)&d)[0]; // 
LCOV_EXCL_LINE + } + npyv_s8x3 divisor; + divisor.val[0] = npyv_setall_s8(m); + divisor.val[2] = npyv_setall_s8(d < 0 ? -1 : 0); + #ifdef NPY_HAVE_VSX2 + divisor.val[1] = npyv_setall_s8(sh); + #elif defined(NPY_HAVE_NEON) + divisor.val[1] = npyv_setall_s8(-sh); + #else + #error "please initialize the shifting operand for the new architecture" + #endif + return divisor; +#endif +} +// Initializing divisor parameters for unsigned 16-bit division +NPY_FINLINE npyv_u16x3 npyv_divisor_u16(npy_uint16 d) +{ + unsigned l, l2, sh1, sh2, m; + switch (d) { + case 0: // LCOV_EXCL_LINE + // raise arithmetic exception for d == 0 + m = sh1 = sh2 = 1 / ((npy_uint16 volatile *)&d)[0]; + break; + case 1: + m = 1; sh1 = sh2 = 0; + break; + case 2: + m = 1; sh1 = 1; sh2 = 0; + break; + default: + l = npyv__bitscan_revnz_u32(d - 1) + 1; // ceil(log2(d)) + l2 = (npy_uint16)(1 << l); // 2^l, overflow to 0 if l = 16 + m = ((l2 - d) << 16) / d + 1; // multiplier + sh1 = 1; sh2 = l - 1; // shift counts + } + npyv_u16x3 divisor; + divisor.val[0] = npyv_setall_u16(m); +#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 + divisor.val[1] = npyv_set_u16(sh1); + divisor.val[2] = npyv_set_u16(sh2); +#elif defined(NPY_HAVE_VSX2) + divisor.val[1] = npyv_setall_u16(sh1); + divisor.val[2] = npyv_setall_u16(sh2); +#elif defined(NPY_HAVE_NEON) + divisor.val[1] = npyv_reinterpret_u16_s16(npyv_setall_s16(-sh1)); + divisor.val[2] = npyv_reinterpret_u16_s16(npyv_setall_s16(-sh2)); +#else + #error "please initialize the shifting operand for the new architecture" +#endif + return divisor; +} +// Initializing divisor parameters for signed 16-bit division +NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d) +{ + int d1 = abs(d); + int sh, m; + if (d1 > 1) { + sh = (int)npyv__bitscan_revnz_u32(d1 - 1); // ceil(log2(abs(d))) - 1 + m = (1 << (16 + sh)) / d1 + 1; // multiplier + } + else if (d1 == 1) { + sh = 0; m = 1; + } + else { + // raise arithmetic exception for d == 0 + sh = m = 1 / ((npy_int16 volatile *)&d)[0]; // 
LCOV_EXCL_LINE + } + npyv_s16x3 divisor; + divisor.val[0] = npyv_setall_s16(m); + divisor.val[2] = npyv_setall_s16(d < 0 ? -1 : 0); // sign of divisor +#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 + divisor.val[1] = npyv_set_s16(sh); +#elif defined(NPY_HAVE_VSX2) + divisor.val[1] = npyv_setall_s16(sh); +#elif defined(NPY_HAVE_NEON) + divisor.val[1] = npyv_setall_s16(-sh); +#else + #error "please initialize the shifting operand for the new architecture" +#endif + return divisor; +} +// Initializing divisor parameters for unsigned 32-bit division +NPY_FINLINE npyv_u32x3 npyv_divisor_u32(npy_uint32 d) +{ + npy_uint32 l, l2, sh1, sh2, m; + switch (d) { + case 0: // LCOV_EXCL_LINE + // raise arithmetic exception for d == 0 + m = sh1 = sh2 = 1 / ((npy_uint32 volatile *)&d)[0]; // LCOV_EXCL_LINE + break; + case 1: + m = 1; sh1 = sh2 = 0; + break; + case 2: + m = 1; sh1 = 1; sh2 = 0; + break; + default: + l = npyv__bitscan_revnz_u32(d - 1) + 1; // ceil(log2(d)) + l2 = (npy_uint32)(1ULL << l); // 2^l, overflow to 0 if l = 32 + m = ((npy_uint64)(l2 - d) << 32) / d + 1; // multiplier + sh1 = 1; sh2 = l - 1; // shift counts + } + npyv_u32x3 divisor; + divisor.val[0] = npyv_setall_u32(m); +#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 + divisor.val[1] = npyv_set_u32(sh1); + divisor.val[2] = npyv_set_u32(sh2); +#elif defined(NPY_HAVE_VSX2) + divisor.val[1] = npyv_setall_u32(sh1); + divisor.val[2] = npyv_setall_u32(sh2); +#elif defined(NPY_HAVE_NEON) + divisor.val[1] = npyv_reinterpret_u32_s32(npyv_setall_s32(-sh1)); + divisor.val[2] = npyv_reinterpret_u32_s32(npyv_setall_s32(-sh2)); +#else + #error "please initialize the shifting operand for the new architecture" +#endif + return divisor; +} +// Initializing divisor parameters for signed 32-bit division +NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d) +{ + npy_int32 d1 = abs(d); + npy_int32 sh, m; + if (d1 > 1) { + sh = npyv__bitscan_revnz_u32(d1 - 1); // ceil(log2(abs(d))) - 1 + m = (1ULL << (32 + sh)) / d1 + 1; // multiplier + } + 
else if (d1 == 1) { + sh = 0; m = 1; + } + // fix abs overflow + else if (d == (1 << 31)) { + m = d + 1; + sh = 30; + } + else { + // raise arithmetic exception for d == 0 + sh = m = 1 / ((npy_int32 volatile *)&d)[0]; // LCOV_EXCL_LINE + } + npyv_s32x3 divisor; + divisor.val[0] = npyv_setall_s32(m); + divisor.val[2] = npyv_setall_s32(d < 0 ? -1 : 0); // sign of divisor +#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 + divisor.val[1] = npyv_set_s32(sh); +#elif defined(NPY_HAVE_VSX2) + divisor.val[1] = npyv_setall_s32(sh); +#elif defined(NPY_HAVE_NEON) + divisor.val[1] = npyv_setall_s32(-sh); +#else + #error "please initialize the shifting operand for the new architecture" +#endif + return divisor; +} +// Initializing divisor parameters for unsigned 64-bit division +NPY_FINLINE npyv_u64x3 npyv_divisor_u64(npy_uint64 d) +{ + npyv_u64x3 divisor; +#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_NEON) + divisor.val[0] = npyv_setall_u64(d); +#else + npy_uint64 l, l2, sh1, sh2, m; + switch (d) { + case 0: // LCOV_EXCL_LINE + // raise arithmetic exception for d == 0 + m = sh1 = sh2 = 1 / ((npy_uint64 volatile *)&d)[0]; // LCOV_EXCL_LINE + break; + case 1: + m = 1; sh1 = sh2 = 0; + break; + case 2: + m = 1; sh1 = 1; sh2 = 0; + break; + default: + l = npyv__bitscan_revnz_u64(d - 1) + 1; // ceil(log2(d)) + l2 = l < 64 ? 
1ULL << l : 0; // 2^l + m = npyv__divh128_u64(l2 - d, d) + 1; // multiplier + sh1 = 1; sh2 = l - 1; // shift counts + } + divisor.val[0] = npyv_setall_u64(m); + #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 + divisor.val[1] = npyv_set_u64(sh1); + divisor.val[2] = npyv_set_u64(sh2); + #else + #error "please initialize the shifting operand for the new architecture" + #endif +#endif + return divisor; +} +// Initializing divisor parameters for signed 64-bit division +NPY_FINLINE npyv_s64x3 npyv_divisor_s64(npy_int64 d) +{ + npyv_s64x3 divisor; +#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_NEON) + divisor.val[0] = npyv_setall_s64(d); + divisor.val[1] = npyv_cvt_s64_b64( + npyv_cmpeq_s64(npyv_setall_s64(-1), divisor.val[0]) + ); +#else + npy_int64 d1 = llabs(d); + npy_int64 sh, m; + if (d1 > 1) { + sh = npyv__bitscan_revnz_u64(d1 - 1); // ceil(log2(abs(d))) - 1 + m = npyv__divh128_u64(1ULL << sh, d1) + 1; // multiplier + } + else if (d1 == 1) { + sh = 0; m = 1; + } + // fix abs overflow + else if (d == (1LL << 63)) { + m = d + 1; + sh = 62; + } + else { + // raise arithmetic exception for d == 0 + sh = m = 1 / ((npy_int64 volatile *)&d)[0]; // LCOV_EXCL_LINE + } + divisor.val[0] = npyv_setall_s64(m); + divisor.val[2] = npyv_setall_s64(d < 0 ? 
-1 : 0); // sign of divisor + #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 + divisor.val[1] = npyv_set_s64(sh); + #else + #error "please initialize the shifting operand for the new architecture" + #endif +#endif + return divisor; +} + +#endif // _NPY_SIMD_INTDIV_H diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h index 69a49f571..00994806d 100644 --- a/numpy/core/src/common/simd/neon/arithmetic.h +++ b/numpy/core/src/common/simd/neon/arithmetic.h @@ -61,6 +61,154 @@ #define npyv_mul_f64 vmulq_f64 /*************************** + * Integer Division + ***************************/ +// See simd/intdiv.h for more clarification +// divide each unsigned 8-bit element by a precomputed divisor +NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) +{ + const uint8x8_t mulc_lo = vget_low_u8(divisor.val[0]); + // high part of unsigned multiplication + uint16x8_t mull_lo = vmull_u8(vget_low_u8(a), mulc_lo); +#if NPY_SIMD_F64 + uint16x8_t mull_hi = vmull_high_u8(a, divisor.val[0]); + // get the high unsigned bytes + uint8x16_t mulhi = vuzp2q_u8(vreinterpretq_u8_u16(mull_lo), vreinterpretq_u8_u16(mull_hi)); +#else + const uint8x8_t mulc_hi = vget_high_u8(divisor.val[0]); + uint16x8_t mull_hi = vmull_u8(vget_high_u8(a), mulc_hi); + uint8x16_t mulhi = vuzpq_u8(vreinterpretq_u8_u16(mull_lo), vreinterpretq_u8_u16(mull_hi)).val[1]; +#endif + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + uint8x16_t q = vsubq_u8(a, mulhi); + q = vshlq_u8(q, vreinterpretq_s8_u8(divisor.val[1])); + q = vaddq_u8(mulhi, q); + q = vshlq_u8(q, vreinterpretq_s8_u8(divisor.val[2])); + return q; +} +// divide each signed 8-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor) +{ + const int8x8_t mulc_lo = vget_low_s8(divisor.val[0]); + // high part of signed multiplication + int16x8_t mull_lo = vmull_s8(vget_low_s8(a), mulc_lo); +#if NPY_SIMD_F64 + int16x8_t mull_hi = 
vmull_high_s8(a, divisor.val[0]); + // get the high unsigned bytes + int8x16_t mulhi = vuzp2q_s8(vreinterpretq_s8_s16(mull_lo), vreinterpretq_s8_s16(mull_hi)); +#else + const int8x8_t mulc_hi = vget_high_s8(divisor.val[0]); + int16x8_t mull_hi = vmull_s8(vget_high_s8(a), mulc_hi); + int8x16_t mulhi = vuzpq_s8(vreinterpretq_s8_s16(mull_lo), vreinterpretq_s8_s16(mull_hi)).val[1]; +#endif + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + int8x16_t q = vshlq_s8(vaddq_s8(a, mulhi), divisor.val[1]); + q = vsubq_s8(q, vshrq_n_s8(a, 7)); + q = vsubq_s8(veorq_s8(q, divisor.val[2]), divisor.val[2]); + return q; +} +// divide each unsigned 16-bit element by a precomputed divisor +NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor) +{ + const uint16x4_t mulc_lo = vget_low_u16(divisor.val[0]); + // high part of unsigned multiplication + uint32x4_t mull_lo = vmull_u16(vget_low_u16(a), mulc_lo); +#if NPY_SIMD_F64 + uint32x4_t mull_hi = vmull_high_u16(a, divisor.val[0]); + // get the high unsigned bytes + uint16x8_t mulhi = vuzp2q_u16(vreinterpretq_u16_u32(mull_lo), vreinterpretq_u16_u32(mull_hi)); +#else + const uint16x4_t mulc_hi = vget_high_u16(divisor.val[0]); + uint32x4_t mull_hi = vmull_u16(vget_high_u16(a), mulc_hi); + uint16x8_t mulhi = vuzpq_u16(vreinterpretq_u16_u32(mull_lo), vreinterpretq_u16_u32(mull_hi)).val[1]; +#endif + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + uint16x8_t q = vsubq_u16(a, mulhi); + q = vshlq_u16(q, vreinterpretq_s16_u16(divisor.val[1])); + q = vaddq_u16(mulhi, q); + q = vshlq_u16(q, vreinterpretq_s16_u16(divisor.val[2])); + return q; +} +// divide each signed 16-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor) +{ + const int16x4_t mulc_lo = vget_low_s16(divisor.val[0]); + // high part of signed multiplication + int32x4_t mull_lo = vmull_s16(vget_low_s16(a), mulc_lo); +#if NPY_SIMD_F64 + int32x4_t mull_hi 
= vmull_high_s16(a, divisor.val[0]); + // get the high unsigned bytes + int16x8_t mulhi = vuzp2q_s16(vreinterpretq_s16_s32(mull_lo), vreinterpretq_s16_s32(mull_hi)); +#else + const int16x4_t mulc_hi = vget_high_s16(divisor.val[0]); + int32x4_t mull_hi = vmull_s16(vget_high_s16(a), mulc_hi); + int16x8_t mulhi = vuzpq_s16(vreinterpretq_s16_s32(mull_lo), vreinterpretq_s16_s32(mull_hi)).val[1]; +#endif + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + int16x8_t q = vshlq_s16(vaddq_s16(a, mulhi), divisor.val[1]); + q = vsubq_s16(q, vshrq_n_s16(a, 15)); + q = vsubq_s16(veorq_s16(q, divisor.val[2]), divisor.val[2]); + return q; +} +// divide each unsigned 32-bit element by a precomputed divisor +NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor) +{ + const uint32x2_t mulc_lo = vget_low_u32(divisor.val[0]); + // high part of unsigned multiplication + uint64x2_t mull_lo = vmull_u32(vget_low_u32(a), mulc_lo); +#if NPY_SIMD_F64 + uint64x2_t mull_hi = vmull_high_u32(a, divisor.val[0]); + // get the high unsigned bytes + uint32x4_t mulhi = vuzp2q_u32(vreinterpretq_u32_u64(mull_lo), vreinterpretq_u32_u64(mull_hi)); +#else + const uint32x2_t mulc_hi = vget_high_u32(divisor.val[0]); + uint64x2_t mull_hi = vmull_u32(vget_high_u32(a), mulc_hi); + uint32x4_t mulhi = vuzpq_u32(vreinterpretq_u32_u64(mull_lo), vreinterpretq_u32_u64(mull_hi)).val[1]; +#endif + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + uint32x4_t q = vsubq_u32(a, mulhi); + q = vshlq_u32(q, vreinterpretq_s32_u32(divisor.val[1])); + q = vaddq_u32(mulhi, q); + q = vshlq_u32(q, vreinterpretq_s32_u32(divisor.val[2])); + return q; +} +// divide each signed 32-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor) +{ + const int32x2_t mulc_lo = vget_low_s32(divisor.val[0]); + // high part of signed multiplication + int64x2_t mull_lo = vmull_s32(vget_low_s32(a), mulc_lo); +#if NPY_SIMD_F64 
+ int64x2_t mull_hi = vmull_high_s32(a, divisor.val[0]); + // get the high unsigned bytes + int32x4_t mulhi = vuzp2q_s32(vreinterpretq_s32_s64(mull_lo), vreinterpretq_s32_s64(mull_hi)); +#else + const int32x2_t mulc_hi = vget_high_s32(divisor.val[0]); + int64x2_t mull_hi = vmull_s32(vget_high_s32(a), mulc_hi); + int32x4_t mulhi = vuzpq_s32(vreinterpretq_s32_s64(mull_lo), vreinterpretq_s32_s64(mull_hi)).val[1]; +#endif + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + int32x4_t q = vshlq_s32(vaddq_s32(a, mulhi), divisor.val[1]); + q = vsubq_s32(q, vshrq_n_s32(a, 31)); + q = vsubq_s32(veorq_s32(q, divisor.val[2]), divisor.val[2]); + return q; +} +// divide each unsigned 64-bit element by a divisor +NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor) +{ + const uint64_t d = vgetq_lane_u64(divisor.val[0], 0); + return npyv_set_u64(vgetq_lane_u64(a, 0) / d, vgetq_lane_u64(a, 1) / d); +} +// returns the high 64 bits of signed 64-bit multiplication +NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) +{ + const int64_t d = vgetq_lane_s64(divisor.val[0], 0); + return npyv_set_s64(vgetq_lane_s64(a, 0) / d, vgetq_lane_s64(a, 1) / d); +} +/*************************** * Division ***************************/ #if NPY_SIMD_F64 @@ -148,7 +296,7 @@ NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a) { - uint32x2_t a0 = vpadd_u32(vget_low_u32(a), vget_high_u32(a)); + uint32x2_t a0 = vpadd_u32(vget_low_u32(a), vget_high_u32(a)); return (unsigned)vget_lane_u32(vpadd_u32(a0, vget_high_u32(a)),0); } diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h index d6c14228d..a3e2b95de 100644 --- a/numpy/core/src/common/simd/simd.h +++ b/numpy/core/src/common/simd/simd.h @@ -56,6 +56,11 @@ typedef double npyv_lanetype_f64; #include "emulate_maskop.h" #endif +// enable integer divisor generator for all SIMD extensions +#if NPY_SIMD + #include "intdiv.h" +#endif + /** * Some SIMD extensions 
currently(AVX2, AVX512F) require (de facto) * a maximum number of strides sizes when dealing with non-contiguous memory access. diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h index c21b7da2d..a1ec88f56 100644 --- a/numpy/core/src/common/simd/sse/arithmetic.h +++ b/numpy/core/src/common/simd/sse/arithmetic.h @@ -86,6 +86,181 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b) // TODO: after implment Packs intrins /*************************** + * Integer Division + ***************************/ +// See simd/intdiv.h for more clarification +// divide each unsigned 8-bit element by a precomputed divisor +NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) +{ + const __m128i bmask = _mm_set1_epi32(0xFF00FF00); + const __m128i shf1b = _mm_set1_epi8(0xFFU >> _mm_cvtsi128_si32(divisor.val[1])); + const __m128i shf2b = _mm_set1_epi8(0xFFU >> _mm_cvtsi128_si32(divisor.val[2])); + // high part of unsigned multiplication + __m128i mulhi_odd = _mm_mulhi_epu16(a, divisor.val[0]); + __m128i mulhi_even = _mm_mulhi_epu16(_mm_slli_epi16(a, 8), divisor.val[0]); + mulhi_even = _mm_srli_epi16(mulhi_even, 8); + __m128i mulhi = npyv_select_u8(bmask, mulhi_odd, mulhi_even); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m128i q = _mm_sub_epi8(a, mulhi); + q = _mm_and_si128(_mm_srl_epi16(q, divisor.val[1]), shf1b); + q = _mm_add_epi8(mulhi, q); + q = _mm_and_si128(_mm_srl_epi16(q, divisor.val[2]), shf2b); + return q; +} +// divide each signed 8-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor); +NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor) +{ + const __m128i bmask = _mm_set1_epi32(0x00FF00FF); + // instead of _mm_cvtepi8_epi16/_mm_packs_epi16 to wrap around overflow + __m128i divc_even = npyv_divc_s16(_mm_srai_epi16(_mm_slli_epi16(a, 8), 8), divisor); + __m128i divc_odd = 
npyv_divc_s16(_mm_srai_epi16(a, 8), divisor); + divc_odd = _mm_slli_epi16(divc_odd, 8); + return npyv_select_u8(bmask, divc_even, divc_odd); +} +// divide each unsigned 16-bit element by a precomputed divisor +NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor) +{ + // high part of unsigned multiplication + __m128i mulhi = _mm_mulhi_epu16(a, divisor.val[0]); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m128i q = _mm_sub_epi16(a, mulhi); + q = _mm_srl_epi16(q, divisor.val[1]); + q = _mm_add_epi16(mulhi, q); + q = _mm_srl_epi16(q, divisor.val[2]); + return q; +} +// divide each signed 16-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor) +{ + // high part of signed multiplication + __m128i mulhi = _mm_mulhi_epi16(a, divisor.val[0]); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + __m128i q = _mm_sra_epi16(_mm_add_epi16(a, mulhi), divisor.val[1]); + q = _mm_sub_epi16(q, _mm_srai_epi16(a, 15)); + q = _mm_sub_epi16(_mm_xor_si128(q, divisor.val[2]), divisor.val[2]); + return q; +} +// divide each unsigned 32-bit element by a precomputed divisor +NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor) +{ + // high part of unsigned multiplication + __m128i mulhi_even = _mm_srli_epi64(_mm_mul_epu32(a, divisor.val[0]), 32); + __m128i mulhi_odd = _mm_mul_epu32(_mm_srli_epi64(a, 32), divisor.val[0]); +#ifdef NPY_HAVE_SSE41 + __m128i mulhi = _mm_blend_epi16(mulhi_even, mulhi_odd, 0xCC); +#else + __m128i mask_13 = _mm_setr_epi32(0, -1, 0, -1); + mulhi_odd = _mm_and_si128(mulhi_odd, mask_13); + __m128i mulhi = _mm_or_si128(mulhi_even, mulhi_odd); +#endif + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m128i q = _mm_sub_epi32(a, mulhi); + q = _mm_srl_epi32(q, divisor.val[1]); + q = _mm_add_epi32(mulhi, q); + q = _mm_srl_epi32(q, divisor.val[2]); + return q; +} +// divide each signed 32-bit element by a 
precomputed divisor (round towards zero) +NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor) +{ + __m128i asign = _mm_srai_epi32(a, 31); +#ifdef NPY_HAVE_SSE41 + // high part of signed multiplication + __m128i mulhi_even = _mm_srli_epi64(_mm_mul_epi32(a, divisor.val[0]), 32); + __m128i mulhi_odd = _mm_mul_epi32(_mm_srli_epi64(a, 32), divisor.val[0]); + __m128i mulhi = _mm_blend_epi16(mulhi_even, mulhi_odd, 0xCC); +#else // not SSE4.1 + // high part of "unsigned" multiplication + __m128i mulhi_even = _mm_srli_epi64(_mm_mul_epu32(a, divisor.val[0]), 32); + __m128i mulhi_odd = _mm_mul_epu32(_mm_srli_epi64(a, 32), divisor.val[0]); + __m128i mask_13 = _mm_setr_epi32(0, -1, 0, -1); + mulhi_odd = _mm_and_si128(mulhi_odd, mask_13); + __m128i mulhi = _mm_or_si128(mulhi_even, mulhi_odd); + // convert unsigned to signed high multiplication + // mulhi - ((a < 0) ? m : 0) - ((m < 0) ? a : 0); + const __m128i msign= _mm_srai_epi32(divisor.val[0], 31); + __m128i m_asign = _mm_and_si128(divisor.val[0], asign); + __m128i a_msign = _mm_and_si128(a, msign); + mulhi = _mm_sub_epi32(mulhi, m_asign); + mulhi = _mm_sub_epi32(mulhi, a_msign); +#endif + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + __m128i q = _mm_sra_epi32(_mm_add_epi32(a, mulhi), divisor.val[1]); + q = _mm_sub_epi32(q, asign); + q = _mm_sub_epi32(_mm_xor_si128(q, divisor.val[2]), divisor.val[2]); + return q; +} +// returns the high 64 bits of unsigned 64-bit multiplication +// xref https://stackoverflow.com/a/28827013 +NPY_FINLINE npyv_u64 npyv__mullhi_u64(npyv_u64 a, npyv_u64 b) +{ + __m128i lomask = npyv_setall_s64(0xffffffff); + __m128i a_hi = _mm_srli_epi64(a, 32); // a0l, a0h, a1l, a1h + __m128i b_hi = _mm_srli_epi64(b, 32); // b0l, b0h, b1l, b1h + // compute partial products + __m128i w0 = _mm_mul_epu32(a, b); // a0l*b0l, a1l*b1l + __m128i w1 = _mm_mul_epu32(a, b_hi); // a0l*b0h, a1l*b1h + __m128i w2 = _mm_mul_epu32(a_hi, b); // a0h*b0l, a1h*b0l + __m128i w3 = 
_mm_mul_epu32(a_hi, b_hi); // a0h*b0h, a1h*b1h + // sum partial products + __m128i w0h = _mm_srli_epi64(w0, 32); + __m128i s1 = _mm_add_epi64(w1, w0h); + __m128i s1l = _mm_and_si128(s1, lomask); + __m128i s1h = _mm_srli_epi64(s1, 32); + + __m128i s2 = _mm_add_epi64(w2, s1l); + __m128i s2h = _mm_srli_epi64(s2, 32); + + __m128i hi = _mm_add_epi64(w3, s1h); + hi = _mm_add_epi64(hi, s2h); + return hi; +} +// divide each unsigned 64-bit element by a precomputed divisor +NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor) +{ + // high part of unsigned multiplication + __m128i mulhi = npyv__mullhi_u64(a, divisor.val[0]); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m128i q = _mm_sub_epi64(a, mulhi); + q = _mm_srl_epi64(q, divisor.val[1]); + q = _mm_add_epi64(mulhi, q); + q = _mm_srl_epi64(q, divisor.val[2]); + return q; +} +// divide each signed 64-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) +{ + // high part of unsigned multiplication + __m128i mulhi = npyv__mullhi_u64(a, divisor.val[0]); + // convert unsigned to signed high multiplication + // mulhi - ((a < 0) ? m : 0) - ((m < 0) ? 
a : 0); +#ifdef NPY_HAVE_SSE42 + const __m128i msign= _mm_cmpgt_epi64(_mm_setzero_si128(), divisor.val[0]); + __m128i asign = _mm_cmpgt_epi64(_mm_setzero_si128(), a); +#else + const __m128i msign= _mm_srai_epi32(_mm_shuffle_epi32(divisor.val[0], _MM_SHUFFLE(3, 3, 1, 1)), 31); + __m128i asign = _mm_srai_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 1, 1)), 31); +#endif + __m128i m_asign = _mm_and_si128(divisor.val[0], asign); + __m128i a_msign = _mm_and_si128(a, msign); + mulhi = _mm_sub_epi64(mulhi, m_asign); + mulhi = _mm_sub_epi64(mulhi, a_msign); + // q = (a + mulhi) >> sh + __m128i q = _mm_add_epi64(a, mulhi); + // emulate arithmetic right shift + const __m128i sigb = npyv_setall_s64(1LL << 63); + q = _mm_srl_epi64(_mm_add_epi64(q, sigb), divisor.val[1]); + q = _mm_sub_epi64(q, _mm_srl_epi64(sigb, divisor.val[1])); + // q = q - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + q = _mm_sub_epi64(q, asign); + q = _mm_sub_epi64(_mm_xor_si128(q, divisor.val[2]), divisor.val[2]); + return q; +} +/*************************** * Division ***************************/ // TODO: emulate integer division @@ -175,7 +350,7 @@ NPY_FINLINE float npyv_sum_f32(npyv_f32 a) __m128 t2 = _mm_add_ps(a, t1); __m128 t3 = _mm_shuffle_ps(t2, t2, 1); __m128 t4 = _mm_add_ss(t2, t3); - return _mm_cvtss_f32(t4); + return _mm_cvtss_f32(t4); #endif } diff --git a/numpy/core/src/common/simd/sse/misc.h b/numpy/core/src/common/simd/sse/misc.h index 7ba47bc68..1099c491d 100644 --- a/numpy/core/src/common/simd/sse/misc.h +++ b/numpy/core/src/common/simd/sse/misc.h @@ -18,21 +18,16 @@ #define npyv_zero_f64 _mm_setzero_pd // vector with a specific value set to all lanes -#define npyv_setall_u8(VAL) _mm_set1_epi8((char)VAL) -#define npyv_setall_s8(VAL) _mm_set1_epi8((char)VAL) -#define npyv_setall_u16(VAL) _mm_set1_epi16((short)VAL) -#define npyv_setall_s16(VAL) _mm_set1_epi16((short)VAL) -#define npyv_setall_u32(VAL) _mm_set1_epi32((int)VAL) -#define npyv_setall_s32(VAL) _mm_set1_epi32(VAL) -#if 
!defined(__x86_64__) && !defined(_M_X64) - #define npyv_setall_u64(VAL) _mm_set_epi32((int)(VAL >> 32), (int)VAL, (int)(VAL >> 32), (int)VAL) - #define npyv_setall_s64 npyv_setall_u64 -#else - #define npyv_setall_u64(VAL) _mm_set1_epi64x(VAL) - #define npyv_setall_s64(VAL) _mm_set1_epi64x(VAL) -#endif -#define npyv_setall_f32(VAL) _mm_set1_ps(VAL) -#define npyv_setall_f64(VAL) _mm_set1_pd(VAL) +#define npyv_setall_u8(VAL) _mm_set1_epi8((char)(VAL)) +#define npyv_setall_s8(VAL) _mm_set1_epi8((char)(VAL)) +#define npyv_setall_u16(VAL) _mm_set1_epi16((short)(VAL)) +#define npyv_setall_s16(VAL) _mm_set1_epi16((short)(VAL)) +#define npyv_setall_u32(VAL) _mm_set1_epi32((int)(VAL)) +#define npyv_setall_s32(VAL) _mm_set1_epi32((int)(VAL)) +#define npyv_setall_u64(VAL) _mm_set1_epi64x((npy_int64)(VAL)) +#define npyv_setall_s64(VAL) _mm_set1_epi64x((npy_int64)(VAL)) +#define npyv_setall_f32 _mm_set1_ps +#define npyv_setall_f64 _mm_set1_pd /** * vector with specific values set to each lane and diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h index 7c4e32f27..123fcaf92 100644 --- a/numpy/core/src/common/simd/vsx/arithmetic.h +++ b/numpy/core/src/common/simd/vsx/arithmetic.h @@ -95,6 +95,138 @@ #define npyv_mul_f64 vec_mul /*************************** + * Integer Division + ***************************/ +/*** + * TODO: Add support for VSX4(Power10) + */ +// See simd/intdiv.h for more clarification +// divide each unsigned 8-bit element by a precomputed divisor +NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) +{ + const npyv_u8 mergeo_perm = { + 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 + }; + // high part of unsigned multiplication + npyv_u16 mul_even = vec_mule(a, divisor.val[0]); + npyv_u16 mul_odd = vec_mulo(a, divisor.val[0]); + npyv_u8 mulhi = (npyv_u8)vec_perm(mul_even, mul_odd, mergeo_perm); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + npyv_u8 q = vec_sub(a, mulhi); + q = 
vec_sr(q, divisor.val[1]); + q = vec_add(mulhi, q); + q = vec_sr(q, divisor.val[2]); + return q; +} +// divide each signed 8-bit element by a precomputed divisor +NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor) +{ + const npyv_u8 mergeo_perm = { + 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 + }; + // high part of signed multiplication + npyv_s16 mul_even = vec_mule(a, divisor.val[0]); + npyv_s16 mul_odd = vec_mulo(a, divisor.val[0]); + npyv_s8 mulhi = (npyv_s8)vec_perm(mul_even, mul_odd, mergeo_perm); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + npyv_s8 q = vec_sra(vec_add(a, mulhi), (npyv_u8)divisor.val[1]); + q = vec_sub(q, vec_sra(a, npyv_setall_u8(7))); + q = vec_sub(vec_xor(q, divisor.val[2]), divisor.val[2]); + return q; +} +// divide each unsigned 16-bit element by a precomputed divisor +NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor) +{ + const npyv_u8 mergeo_perm = { + 2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 + }; + // high part of unsigned multiplication + npyv_u32 mul_even = vec_mule(a, divisor.val[0]); + npyv_u32 mul_odd = vec_mulo(a, divisor.val[0]); + npyv_u16 mulhi = (npyv_u16)vec_perm(mul_even, mul_odd, mergeo_perm); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + npyv_u16 q = vec_sub(a, mulhi); + q = vec_sr(q, divisor.val[1]); + q = vec_add(mulhi, q); + q = vec_sr(q, divisor.val[2]); + return q; +} +// divide each signed 16-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor) +{ + const npyv_u8 mergeo_perm = { + 2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 + }; + // high part of signed multiplication + npyv_s32 mul_even = vec_mule(a, divisor.val[0]); + npyv_s32 mul_odd = vec_mulo(a, divisor.val[0]); + npyv_s16 mulhi = (npyv_s16)vec_perm(mul_even, mul_odd, mergeo_perm); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ 
dsign) - dsign + npyv_s16 q = vec_sra(vec_add(a, mulhi), (npyv_u16)divisor.val[1]); + q = vec_sub(q, vec_sra(a, npyv_setall_u16(15))); + q = vec_sub(vec_xor(q, divisor.val[2]), divisor.val[2]); + return q; +} +// divide each unsigned 32-bit element by a precomputed divisor +NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor) +{ +#if defined(__GNUC__) && __GNUC__ < 8 + // Doubleword integer wide multiplication supported by GCC 8+ + npyv_u64 mul_even, mul_odd; + __asm__ ("vmulouw %0,%1,%2" : "=v" (mul_even) : "v" (a), "v" (divisor.val[0])); + __asm__ ("vmuleuw %0,%1,%2" : "=v" (mul_odd) : "v" (a), "v" (divisor.val[0])); +#else + // Doubleword integer wide multiplication supported by GCC 8+ + npyv_u64 mul_even = vec_mule(a, divisor.val[0]); + npyv_u64 mul_odd = vec_mulo(a, divisor.val[0]); +#endif + // high part of unsigned multiplication + npyv_u32 mulhi = vec_mergeo((npyv_u32)mul_even, (npyv_u32)mul_odd); + // floor(x/d) = (((a-mulhi) >> sh1) + mulhi) >> sh2 + npyv_u32 q = vec_sub(a, mulhi); + q = vec_sr(q, divisor.val[1]); + q = vec_add(mulhi, q); + q = vec_sr(q, divisor.val[2]); + return q; +} +// divide each signed 32-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor) +{ +#if defined(__GNUC__) && __GNUC__ < 8 + // Doubleword integer wide multiplication supported by GCC8+ + npyv_s64 mul_even, mul_odd; + __asm__ ("vmulosw %0,%1,%2" : "=v" (mul_even) : "v" (a), "v" (divisor.val[0])); + __asm__ ("vmulesw %0,%1,%2" : "=v" (mul_odd) : "v" (a), "v" (divisor.val[0])); +#else + // Doubleword integer wide multiplication supported by GCC8+ + npyv_s64 mul_even = vec_mule(a, divisor.val[0]); + npyv_s64 mul_odd = vec_mulo(a, divisor.val[0]); +#endif + // high part of signed multiplication + npyv_s32 mulhi = vec_mergeo((npyv_s32)mul_even, (npyv_s32)mul_odd); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + npyv_s32 q = vec_sra(vec_add(a, mulhi), 
(npyv_u32)divisor.val[1]); + q = vec_sub(q, vec_sra(a, npyv_setall_u32(31))); + q = vec_sub(vec_xor(q, divisor.val[2]), divisor.val[2]); + return q; +} +// divide each unsigned 64-bit element by a precomputed divisor +NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor) +{ + const npy_uint64 d = vec_extract(divisor.val[0], 0); + return npyv_set_u64(vec_extract(a, 0) / d, vec_extract(a, 1) / d); +} +// divide each signed 64-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) +{ + npyv_b64 overflow = vec_and(vec_cmpeq(a, npyv_setall_s64(-1LL << 63)), (npyv_b64)divisor.val[1]); + npyv_s64 d = vec_sel(divisor.val[0], npyv_setall_s64(1), overflow); + return vec_div(a, d); +} +/*************************** * Division ***************************/ #define npyv_div_f32 vec_div diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py index 1d1a111be..8306a86d3 100644 --- a/numpy/core/tests/test_simd.py +++ b/numpy/core/tests/test_simd.py @@ -733,6 +733,76 @@ class _SIMD_ALL(_Test_Utility): div = self.div(vdata_a, vdata_b) assert div == data_div + def test_arithmetic_intdiv(self): + """ + Test integer division intrinics: + npyv_divisor_##sfx + npyv_divc_##sfx + """ + if self._is_fp(): + return + + def trunc_div(a, d): + """ + Divide towards zero works with large integers > 2^53, + equivalent to int(a/d) + """ + sign_a, sign_d = a < 0, d < 0 + if a == 0 or sign_a == sign_d: + return a // d + return (a + sign_d - sign_a) // d + 1 + + int_min = self._int_min() if self._is_signed() else 1 + int_max = self._int_max() + rdata = ( + 0, 1, self.nlanes, int_max-self.nlanes, + int_min, int_min//2 + 1 + ) + divisors = (1, 2, self.nlanes, int_min, int_max, int_max//2) + + for x, d in zip(rdata, divisors): + data = self._data(x) + vdata = self.load(data) + data_divc = [trunc_div(a, d) for a in data] + divisor = self.divisor(d) + divc = self.divc(vdata, divisor) + assert 
divc == data_divc + + if not self._is_signed(): + return + + safe_neg = lambda x: -x-1 if -x > int_max else -x + # test round divison for signed integers + for x, d in zip(rdata, divisors): + d_neg = safe_neg(d) + data = self._data(x) + data_neg = [safe_neg(a) for a in data] + vdata = self.load(data) + vdata_neg = self.load(data_neg) + divisor = self.divisor(d) + divisor_neg = self.divisor(d_neg) + + # round towards zero + data_divc = [trunc_div(a, d_neg) for a in data] + divc = self.divc(vdata, divisor_neg) + assert divc == data_divc + data_divc = [trunc_div(a, d) for a in data_neg] + divc = self.divc(vdata_neg, divisor) + assert divc == data_divc + + # test truncate sign if the dividend is zero + vzero = self.zero() + for d in (-1, -10, -100, int_min//2, int_min): + divisor = self.divisor(d) + divc = self.divc(vzero, divisor) + assert divc == vzero + + # test overflow + vmin = self.setall(int_min) + divisor = self.divisor(-1) + divc = self.divc(vmin, divisor) + assert divc == vmin + def test_arithmetic_reduce_sum(self): """ Test reduce sum intrinics: |