diff options
author | Matti Picus <matti.picus@gmail.com> | 2021-03-08 09:24:50 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-03-08 09:24:50 +0200 |
commit | df2cd0f7ef1dba763039514fc11669d9ff505c2d (patch) | |
tree | ab626376b0ee8be13a97d6615063160d8b4aaee8 | |
parent | 0cea1d9b585ebe9895c898c60f05572840579c89 (diff) | |
parent | 8aae310556359517394df263f3b5819eff8c3daa (diff) | |
download | numpy-df2cd0f7ef1dba763039514fc11669d9ff505c2d.tar.gz |
Merge pull request #18178 from seiko2plus/npyv_fast_div
SIMD: add fast integer division intrinsics for all supported platforms
-rw-r--r-- | numpy/core/src/_simd/_simd.dispatch.c.src | 19 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx2/arithmetic.h | 160 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx512/arithmetic.h | 224 | ||||
-rw-r--r-- | numpy/core/src/common/simd/intdiv.h | 473 | ||||
-rw-r--r-- | numpy/core/src/common/simd/neon/arithmetic.h | 150 | ||||
-rw-r--r-- | numpy/core/src/common/simd/simd.h | 5 | ||||
-rw-r--r-- | numpy/core/src/common/simd/sse/arithmetic.h | 177 | ||||
-rw-r--r-- | numpy/core/src/common/simd/sse/misc.h | 25 | ||||
-rw-r--r-- | numpy/core/src/common/simd/vsx/arithmetic.h | 132 | ||||
-rw-r--r-- | numpy/core/tests/test_simd.py | 70 |
10 files changed, 1411 insertions, 24 deletions
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index e5b58a8d2..4acd20a75 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -15,8 +15,8 @@ /**begin repeat * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64# - * #esfx = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64# - * #expand_sup =1, 0, 1, 0, 0, 0, 0, 0, 0, 0# + * #esfx = u16, s8, u32,s16, u32, s32, u64, s64, f32, f64# + * #expand_sup= 1, 0, 1, 0, 0, 0, 0, 0, 0, 0# * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64# * #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0# @@ -27,6 +27,7 @@ * #sum_sup = 0, 0, 0, 0, 1, 0, 1, 0, 1, 1# * #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0# * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1# + * #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0# * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0# * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0# */ @@ -354,6 +355,11 @@ SIMD_IMPL_INTRIN_2(mul_@sfx@, v@sfx@, v@sfx@, v@sfx@) SIMD_IMPL_INTRIN_2(div_@sfx@, v@sfx@, v@sfx@, v@sfx@) #endif // div_sup +#if @intdiv_sup@ +SIMD_IMPL_INTRIN_1(divisor_@sfx@, v@sfx@x3, @sfx@) +SIMD_IMPL_INTRIN_2(divc_@sfx@, v@sfx@, v@sfx@, v@sfx@x3) +#endif // intdiv_sup + #if @fused_sup@ /**begin repeat1 * #intrin = muladd, mulsub, nmuladd, nmulsub# @@ -442,6 +448,7 @@ SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@) SIMD_IMPL_INTRIN_1(tobits_@bsfx@, u64, v@bsfx@) /**end repeat**/ + //######################################################################### //## Attach module functions //######################################################################### @@ -449,7 +456,7 @@ static PyMethodDef simd__intrinsics_methods[] = { /**begin repeat * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64# - * #esfx = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64# + * #esfx 
= u16, s8, u32,s16, u32, s32, u64, s64, f32, f64# * #expand_sup =1, 0, 1, 0, 0, 0, 0, 0, 0, 0# * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64# * #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# @@ -461,6 +468,7 @@ static PyMethodDef simd__intrinsics_methods[] = { * #sum_sup = 0, 0, 0, 0, 1, 0, 1, 0, 1, 1# * #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0# * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1# + * #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0# * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0# * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0# */ @@ -568,6 +576,11 @@ SIMD_INTRIN_DEF(mul_@sfx@) SIMD_INTRIN_DEF(div_@sfx@) #endif // div_sup +#if @intdiv_sup@ +SIMD_INTRIN_DEF(divisor_@sfx@) +SIMD_INTRIN_DEF(divc_@sfx@) +#endif // intdiv_sup + #if @fused_sup@ /**begin repeat1 * #intrin = muladd, mulsub, nmuladd, nmulsub# diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h index 4b8258759..7cd5a0ea6 100644 --- a/numpy/core/src/common/simd/avx2/arithmetic.h +++ b/numpy/core/src/common/simd/avx2/arithmetic.h @@ -67,6 +67,164 @@ // TODO: after implment Packs intrins /*************************** + * Integer Division + ***************************/ +// See simd/intdiv.h for more clarification +// divide each unsigned 8-bit element by a precomputed divisor +NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) +{ + const __m256i bmask = _mm256_set1_epi32(0xFF00FF00); + const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]); + const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]); + const __m256i shf1b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1)); + const __m256i shf2b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2)); + // high part of unsigned multiplication + __m256i mulhi_odd = _mm256_mulhi_epu16(a, divisor.val[0]); + __m256i mulhi_even = _mm256_mulhi_epu16(_mm256_slli_epi16(a, 8), divisor.val[0]); + mulhi_even = _mm256_srli_epi16(mulhi_even, 8); + __m256i mulhi = _mm256_blendv_epi8(mulhi_even, 
mulhi_odd, bmask); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m256i q = _mm256_sub_epi8(a, mulhi); + q = _mm256_and_si256(_mm256_srl_epi16(q, shf1), shf1b); + q = _mm256_add_epi8(mulhi, q); + q = _mm256_and_si256(_mm256_srl_epi16(q, shf2), shf2b); + return q; +} +// divide each signed 8-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor); +NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor) +{ + const __m256i bmask = _mm256_set1_epi32(0x00FF00FF); + // instead of _mm256_cvtepi8_epi16/_mm256_packs_epi16 to wrap around overflow + __m256i divc_even = npyv_divc_s16(_mm256_srai_epi16(_mm256_slli_epi16(a, 8), 8), divisor); + __m256i divc_odd = npyv_divc_s16(_mm256_srai_epi16(a, 8), divisor); + divc_odd = _mm256_slli_epi16(divc_odd, 8); + return _mm256_blendv_epi8(divc_odd, divc_even, bmask); +} +// divide each unsigned 16-bit element by a precomputed divisor +NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor) +{ + const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]); + const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]); + // high part of unsigned multiplication + __m256i mulhi = _mm256_mulhi_epu16(a, divisor.val[0]); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m256i q = _mm256_sub_epi16(a, mulhi); + q = _mm256_srl_epi16(q, shf1); + q = _mm256_add_epi16(mulhi, q); + q = _mm256_srl_epi16(q, shf2); + return q; +} +// divide each signed 16-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor) +{ + const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]); + // high part of signed multiplication + __m256i mulhi = _mm256_mulhi_epi16(a, divisor.val[0]); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + __m256i q = _mm256_sra_epi16(_mm256_add_epi16(a, mulhi), shf1); + q = _mm256_sub_epi16(q, 
_mm256_srai_epi16(a, 15)); + q = _mm256_sub_epi16(_mm256_xor_si256(q, divisor.val[2]), divisor.val[2]); + return q; +} +// divide each unsigned 32-bit element by a precomputed divisor +NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor) +{ + const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]); + const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]); + // high part of unsigned multiplication + __m256i mulhi_even = _mm256_srli_epi64(_mm256_mul_epu32(a, divisor.val[0]), 32); + __m256i mulhi_odd = _mm256_mul_epu32(_mm256_srli_epi64(a, 32), divisor.val[0]); + __m256i mulhi = _mm256_blend_epi32(mulhi_even, mulhi_odd, 0xAA); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m256i q = _mm256_sub_epi32(a, mulhi); + q = _mm256_srl_epi32(q, shf1); + q = _mm256_add_epi32(mulhi, q); + q = _mm256_srl_epi32(q, shf2); + return q; +} +// divide each signed 32-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor) +{ + const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]); + // high part of signed multiplication + __m256i mulhi_even = _mm256_srli_epi64(_mm256_mul_epi32(a, divisor.val[0]), 32); + __m256i mulhi_odd = _mm256_mul_epi32(_mm256_srli_epi64(a, 32), divisor.val[0]); + __m256i mulhi = _mm256_blend_epi32(mulhi_even, mulhi_odd, 0xAA); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + __m256i q = _mm256_sra_epi32(_mm256_add_epi32(a, mulhi), shf1); + q = _mm256_sub_epi32(q, _mm256_srai_epi32(a, 31)); + q = _mm256_sub_epi32(_mm256_xor_si256(q, divisor.val[2]), divisor.val[2]); + return q; +} +// returns the high 64 bits of unsigned 64-bit multiplication +// xref https://stackoverflow.com/a/28827013 +NPY_FINLINE npyv_u64 npyv__mullhi_u64(npyv_u64 a, npyv_u64 b) +{ + __m256i lomask = npyv_setall_s64(0xffffffff); + __m256i a_hi = _mm256_srli_epi64(a, 32); // a0l, a0h, a1l, a1h + __m256i b_hi = _mm256_srli_epi64(b, 32); // b0l, 
b0h, b1l, b1h + // compute partial products + __m256i w0 = _mm256_mul_epu32(a, b); // a0l*b0l, a1l*b1l + __m256i w1 = _mm256_mul_epu32(a, b_hi); // a0l*b0h, a1l*b1h + __m256i w2 = _mm256_mul_epu32(a_hi, b); // a0h*b0l, a1h*b0l + __m256i w3 = _mm256_mul_epu32(a_hi, b_hi); // a0h*b0h, a1h*b1h + // sum partial products + __m256i w0h = _mm256_srli_epi64(w0, 32); + __m256i s1 = _mm256_add_epi64(w1, w0h); + __m256i s1l = _mm256_and_si256(s1, lomask); + __m256i s1h = _mm256_srli_epi64(s1, 32); + + __m256i s2 = _mm256_add_epi64(w2, s1l); + __m256i s2h = _mm256_srli_epi64(s2, 32); + + __m256i hi = _mm256_add_epi64(w3, s1h); + hi = _mm256_add_epi64(hi, s2h); + return hi; +} +// divide each unsigned 64-bit element by a divisor +NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor) +{ + const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]); + const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]); + // high part of unsigned multiplication + __m256i mulhi = npyv__mullhi_u64(a, divisor.val[0]); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m256i q = _mm256_sub_epi64(a, mulhi); + q = _mm256_srl_epi64(q, shf1); + q = _mm256_add_epi64(mulhi, q); + q = _mm256_srl_epi64(q, shf2); + return q; +} +// divide each unsigned 64-bit element by a divisor (round towards zero) +NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) +{ + const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]); + // high part of unsigned multiplication + __m256i mulhi = npyv__mullhi_u64(a, divisor.val[0]); + // convert unsigned to signed high multiplication + // mulhi - ((a < 0) ? m : 0) - ((m < 0) ? 
a : 0); + __m256i asign = _mm256_cmpgt_epi64(_mm256_setzero_si256(), a); + __m256i msign = _mm256_cmpgt_epi64(_mm256_setzero_si256(), divisor.val[0]); + __m256i m_asign = _mm256_and_si256(divisor.val[0], asign); + __m256i a_msign = _mm256_and_si256(a, msign); + mulhi = _mm256_sub_epi64(mulhi, m_asign); + mulhi = _mm256_sub_epi64(mulhi, a_msign); + // q = (a + mulhi) >> sh + __m256i q = _mm256_add_epi64(a, mulhi); + // emulate arithmetic right shift + const __m256i sigb = npyv_setall_s64(1LL << 63); + q = _mm256_srl_epi64(_mm256_add_epi64(q, sigb), shf1); + q = _mm256_sub_epi64(q, _mm256_srl_epi64(sigb, shf1)); + // q = q - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + q = _mm256_sub_epi64(q, asign); + q = _mm256_sub_epi64(_mm256_xor_si256(q, divisor.val[2]), divisor.val[2]); + return q; +} +/*************************** * Division ***************************/ // TODO: emulate integer division @@ -176,5 +334,3 @@ NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a) } #endif // _NPY_SIMD_AVX2_ARITHMETIC_H - - diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h index 450da7ea5..29e1af7e8 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -107,6 +107,226 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) // TODO: after implment Packs intrins /*************************** + * Integer Division + ***************************/ +// See simd/intdiv.h for more clarification +// divide each unsigned 8-bit element by divisor +NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) +{ + const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]); + const __m128i shf2 = _mm512_castsi512_si128(divisor.val[2]); +#ifdef NPY_HAVE_AVX512BW + const __m512i shf1b = _mm512_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1)); + const __m512i shf2b = _mm512_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2)); + // high part of unsigned multiplication + __m512i mulhi_odd 
= _mm512_mulhi_epu16(a, divisor.val[0]); + __m512i mulhi_even = _mm512_mulhi_epu16(_mm512_slli_epi16(a, 8), divisor.val[0]); + mulhi_even = _mm512_srli_epi16(mulhi_even, 8); + __m512i mulhi = _mm512_mask_mov_epi8(mulhi_even, 0xAAAAAAAAAAAAAAAA, mulhi_odd); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m512i q = _mm512_sub_epi8(a, mulhi); + q = _mm512_and_si512(_mm512_srl_epi16(q, shf1), shf1b); + q = _mm512_add_epi8(mulhi, q); + q = _mm512_and_si512(_mm512_srl_epi16(q, shf2), shf2b); + return q; +#else + const __m256i bmask = _mm256_set1_epi32(0xFF00FF00); + const __m256i shf1b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1)); + const __m256i shf2b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2)); + const __m512i shf2bw= npyv512_combine_si256(shf2b, shf2b); + const __m256i mulc = npyv512_lower_si256(divisor.val[0]); + //// lower 256-bit + __m256i lo_a = npyv512_lower_si256(a); + // high part of unsigned multiplication + __m256i mulhi_odd = _mm256_mulhi_epu16(lo_a, mulc); + __m256i mulhi_even = _mm256_mulhi_epu16(_mm256_slli_epi16(lo_a, 8), mulc); + mulhi_even = _mm256_srli_epi16(mulhi_even, 8); + __m256i mulhi = _mm256_blendv_epi8(mulhi_even, mulhi_odd, bmask); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m256i lo_q = _mm256_sub_epi8(lo_a, mulhi); + lo_q = _mm256_and_si256(_mm256_srl_epi16(lo_q, shf1), shf1b); + lo_q = _mm256_add_epi8(mulhi, lo_q); + lo_q = _mm256_srl_epi16(lo_q, shf2); // no sign extend + + //// higher 256-bit + __m256i hi_a = npyv512_higher_si256(a); + // high part of unsigned multiplication + mulhi_odd = _mm256_mulhi_epu16(hi_a, mulc); + mulhi_even = _mm256_mulhi_epu16(_mm256_slli_epi16(hi_a, 8), mulc); + mulhi_even = _mm256_srli_epi16(mulhi_even, 8); + mulhi = _mm256_blendv_epi8(mulhi_even, mulhi_odd, bmask); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m256i hi_q = _mm256_sub_epi8(hi_a, mulhi); + hi_q = _mm256_and_si256(_mm256_srl_epi16(hi_q, shf1), shf1b); + hi_q = _mm256_add_epi8(mulhi, hi_q); + 
hi_q = _mm256_srl_epi16(hi_q, shf2); // no sign extend + return _mm512_and_si512(npyv512_combine_si256(lo_q, hi_q), shf2bw); // extend sign +#endif +} +// divide each signed 8-bit element by divisor (round towards zero) +NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor); +NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor) +{ + __m512i divc_even = npyv_divc_s16(npyv_shri_s16(npyv_shli_s16(a, 8), 8), divisor); + __m512i divc_odd = npyv_divc_s16(npyv_shri_s16(a, 8), divisor); + divc_odd = npyv_shli_s16(divc_odd, 8); +#ifdef NPY_HAVE_AVX512BW + return _mm512_mask_mov_epi8(divc_even, 0xAAAAAAAAAAAAAAAA, divc_odd); +#else + const __m512i bmask = _mm512_set1_epi32(0x00FF00FF); + return npyv_select_u8(bmask, divc_even, divc_odd); +#endif +} +// divide each unsigned 16-bit element by divisor +NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor) +{ + const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]); + const __m128i shf2 = _mm512_castsi512_si128(divisor.val[2]); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + #define NPYV__DIVC_U16(RLEN, A, MULC, R) \ + mulhi = _mm##RLEN##_mulhi_epu16(A, MULC); \ + R = _mm##RLEN##_sub_epi16(A, mulhi); \ + R = _mm##RLEN##_srl_epi16(R, shf1); \ + R = _mm##RLEN##_add_epi16(mulhi, R); \ + R = _mm##RLEN##_srl_epi16(R, shf2); + +#ifdef NPY_HAVE_AVX512BW + __m512i mulhi, q; + NPYV__DIVC_U16(512, a, divisor.val[0], q) + return q; +#else + const __m256i m = npyv512_lower_si256(divisor.val[0]); + __m256i lo_a = npyv512_lower_si256(a); + __m256i hi_a = npyv512_higher_si256(a); + + __m256i mulhi, lo_q, hi_q; + NPYV__DIVC_U16(256, lo_a, m, lo_q) + NPYV__DIVC_U16(256, hi_a, m, hi_q) + return npyv512_combine_si256(lo_q, hi_q); +#endif + #undef NPYV__DIVC_U16 +} +// divide each signed 16-bit element by divisor (round towards zero) +NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor) +{ + const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]); + // q = 
((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + #define NPYV__DIVC_S16(RLEN, A, MULC, DSIGN, R) \ + mulhi = _mm##RLEN##_mulhi_epi16(A, MULC); \ + R = _mm##RLEN##_sra_epi16(_mm##RLEN##_add_epi16(A, mulhi), shf1); \ + R = _mm##RLEN##_sub_epi16(R, _mm##RLEN##_srai_epi16(A, 15)); \ + R = _mm##RLEN##_sub_epi16(_mm##RLEN##_xor_si##RLEN(R, DSIGN), DSIGN); + +#ifdef NPY_HAVE_AVX512BW + __m512i mulhi, q; + NPYV__DIVC_S16(512, a, divisor.val[0], divisor.val[2], q) + return q; +#else + const __m256i m = npyv512_lower_si256(divisor.val[0]); + const __m256i dsign = npyv512_lower_si256(divisor.val[2]); + __m256i lo_a = npyv512_lower_si256(a); + __m256i hi_a = npyv512_higher_si256(a); + + __m256i mulhi, lo_q, hi_q; + NPYV__DIVC_S16(256, lo_a, m, dsign, lo_q) + NPYV__DIVC_S16(256, hi_a, m, dsign, hi_q) + return npyv512_combine_si256(lo_q, hi_q); +#endif + #undef NPYV__DIVC_S16 +} +// divide each unsigned 32-bit element by divisor +NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor) +{ + const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]); + const __m128i shf2 = _mm512_castsi512_si128(divisor.val[2]); + // high part of unsigned multiplication + __m512i mulhi_even = _mm512_srli_epi64(_mm512_mul_epu32(a, divisor.val[0]), 32); + __m512i mulhi_odd = _mm512_mul_epu32(_mm512_srli_epi64(a, 32), divisor.val[0]); + __m512i mulhi = _mm512_mask_mov_epi32(mulhi_even, 0xAAAA, mulhi_odd); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m512i q = _mm512_sub_epi32(a, mulhi); + q = _mm512_srl_epi32(q, shf1); + q = _mm512_add_epi32(mulhi, q); + q = _mm512_srl_epi32(q, shf2); + return q; +} +// divide each signed 32-bit element by divisor (round towards zero) +NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor) +{ + const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]); + // high part of signed multiplication + __m512i mulhi_even = _mm512_srli_epi64(_mm512_mul_epi32(a, divisor.val[0]), 32); + __m512i mulhi_odd = 
_mm512_mul_epi32(_mm512_srli_epi64(a, 32), divisor.val[0]); + __m512i mulhi = _mm512_mask_mov_epi32(mulhi_even, 0xAAAA, mulhi_odd); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + __m512i q = _mm512_sra_epi32(_mm512_add_epi32(a, mulhi), shf1); + q = _mm512_sub_epi32(q, _mm512_srai_epi32(a, 31)); + q = _mm512_sub_epi32(_mm512_xor_si512(q, divisor.val[2]), divisor.val[2]); + return q; +} +// returns the high 64 bits of unsigned 64-bit multiplication +// xref https://stackoverflow.com/a/28827013 +NPY_FINLINE npyv_u64 npyv__mullhi_u64(npyv_u64 a, npyv_u64 b) +{ + __m512i lomask = npyv_setall_s64(0xffffffff); + __m512i a_hi = _mm512_srli_epi64(a, 32); // a0l, a0h, a1l, a1h + __m512i b_hi = _mm512_srli_epi64(b, 32); // b0l, b0h, b1l, b1h + // compute partial products + __m512i w0 = _mm512_mul_epu32(a, b); // a0l*b0l, a1l*b1l + __m512i w1 = _mm512_mul_epu32(a, b_hi); // a0l*b0h, a1l*b1h + __m512i w2 = _mm512_mul_epu32(a_hi, b); // a0h*b0l, a1h*b0l + __m512i w3 = _mm512_mul_epu32(a_hi, b_hi); // a0h*b0h, a1h*b1h + // sum partial products + __m512i w0h = _mm512_srli_epi64(w0, 32); + __m512i s1 = _mm512_add_epi64(w1, w0h); + __m512i s1l = _mm512_and_si512(s1, lomask); + __m512i s1h = _mm512_srli_epi64(s1, 32); + + __m512i s2 = _mm512_add_epi64(w2, s1l); + __m512i s2h = _mm512_srli_epi64(s2, 32); + + __m512i hi = _mm512_add_epi64(w3, s1h); + hi = _mm512_add_epi64(hi, s2h); + return hi; +} +// divide each unsigned 64-bit element by a divisor +NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor) +{ + const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]); + const __m128i shf2 = _mm512_castsi512_si128(divisor.val[2]); + // high part of unsigned multiplication + __m512i mulhi = npyv__mullhi_u64(a, divisor.val[0]); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m512i q = _mm512_sub_epi64(a, mulhi); + q = _mm512_srl_epi64(q, shf1); + q = _mm512_add_epi64(mulhi, q); + q = _mm512_srl_epi64(q, shf2); + return q; +} 
+// divide each unsigned 64-bit element by a divisor (round towards zero) +NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) +{ + const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]); + // high part of unsigned multiplication + __m512i mulhi = npyv__mullhi_u64(a, divisor.val[0]); + // convert unsigned to signed high multiplication + // mulhi - ((a < 0) ? m : 0) - ((m < 0) ? a : 0); + __m512i asign = _mm512_srai_epi64(a, 63); + __m512i msign = _mm512_srai_epi64(divisor.val[0], 63); + __m512i m_asign = _mm512_and_si512(divisor.val[0], asign); + __m512i a_msign = _mm512_and_si512(a, msign); + mulhi = _mm512_sub_epi64(mulhi, m_asign); + mulhi = _mm512_sub_epi64(mulhi, a_msign); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + __m512i q = _mm512_sra_epi64(_mm512_add_epi64(a, mulhi), shf1); + q = _mm512_sub_epi64(q, asign); + q = _mm512_sub_epi64(_mm512_xor_si512(q, divisor.val[2]), divisor.val[2]); + return q; +} +/*************************** * Division ***************************/ // TODO: emulate integer division @@ -136,11 +356,11 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) * 2- shuff(cross) /add /shuff(cross) /add /shuff /add /shuff /add /extract * 3- _mm512_reduce_add_ps/pd * The first one is been widely used by many projects - * + * * the second one is used by Intel Compiler, maybe because the * latency of hadd increased by (2-3) starting from Skylake-X which makes two * extra shuffles(non-cross) cheaper. check https://godbolt.org/z/s3G9Er for more info. - * + * * The third one is almost the same as the second one but only works for * intel compiler/GCC 7.1/Clang 4, we still need to support older GCC. 
***************************/ diff --git a/numpy/core/src/common/simd/intdiv.h b/numpy/core/src/common/simd/intdiv.h new file mode 100644 index 000000000..a6c293d87 --- /dev/null +++ b/numpy/core/src/common/simd/intdiv.h @@ -0,0 +1,473 @@ +/** + * This header implements `npyv_divisor_*` intrinsics used for computing the parameters + * of fast integer division, while division intrinsics `npyv_divc_*` are defined in + * {extension}/arithmetic.h. + */ +#ifndef NPY_SIMD + #error "Not a standalone header, use simd/simd.h instead" +#endif +#ifndef _NPY_SIMD_INTDIV_H +#define _NPY_SIMD_INTDIV_H +/********************************************************************************** + ** Integer division + ********************************************************************************** + * Almost all architectures (except Power10) don't support integer vector division, + * also the cost of scalar division in architectures like x86 is so high that it can take + * 30 to 40 cycles on modern chips and up to 100 on old ones. + * + * Therefore we are using division by multiplying with precomputed reciprocal technique, + * the method that has been used in this implementation is based on T. Granlund and P. L. Montgomery + * “Division by invariant integers using multiplication” (see [Figure 4.1, 5.1] + * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556) + * + * It shows a good impact for all architectures especially on X86, + * however computing divisor parameters is kind of expensive so this implementation + * should only be used when the divisor is a scalar and used multiple times. + * + * The division process is separated into two intrinsics for each data type + * + * 1- npyv_{dtype}x3 npyv_divisor_{dtype} ({dtype} divisor); + * For computing the divisor parameters (multiplier + shifters + sign of divisor(signed only)) + * + * 2- npyv_{dtype} npyv_divc_{dtype} (npyv_{dtype} dividend, npyv_{dtype}x3 divisor_parms); + * For performing the final division.
+ * + ** For example: + * int vstep = npyv_nlanes_s32; // number of lanes + * int x = 0x6e70; + * npyv_s32x3 divisor = npyv_divisor_s32(x); // init divisor params + * for (; len >= vstep; src += vstep, dst += vstep, len -= vstep) { + * npyv_s32 a = npyv_load_s32(*src); // load s32 vector from memory + * a = npyv_divc_s32(a, divisor); // divide all elements by x + * npyv_store_s32(dst, a); // store s32 vector into memory + * } + * + ** NOTES: + * - For 64-bit division on Aarch64 and IBM/Power, we fall back to the scalar division + * since emulating multiply-high is expensive and both architectures have very fast dividers. + * + ** TODO: + * - Add support for Power10(VSX4) + * + *************************************************************** + ** Figure 4.1: Unsigned division by run–time invariant divisor + *************************************************************** + * Initialization (given uword d with 1 ≤ d < 2^N): + * int l = ceil(log2(d)); + * uword m = 2^N * (2^l− d) / d + 1; + * int sh1 = min(l, 1); + * int sh2 = max(l − 1, 0); + * + * For q = FLOOR(a/d), all uword: + * uword t1 = MULUH(m, a); + * q = SRL(t1 + SRL(a − t1, sh1), sh2); + * + ************************************************************************************ + ** Figure 5.1: Signed division by run–time invariant divisor, rounded towards zero + ************************************************************************************ + * Initialization (given constant sword d with d !=0): + * int l = max(ceil(log2(abs(d))), 1); + * udword m0 = 1 + (2^(N+l-1)) / abs(d); + * sword m = m0 − 2^N; + * sword dsign = XSIGN(d); + * int sh = l − 1; + * + * For q = TRUNC(a/d), all sword: + * sword q0 = a + MULSH(m, a); + * q0 = SRA(q0, sh) − XSIGN(a); + * q = EOR(q0, dsign) − dsign; + */ +/** + * bit-scan reverse for non-zeros. returns the index of the highest set bit.
+ * equivalent to floor(log2(a)) + */ +#ifdef _MSC_VER + #include <intrin.h> // _BitScanReverse +#endif +NPY_FINLINE unsigned npyv__bitscan_revnz_u32(npy_uint32 a) +{ + assert(a > 0); // due to use __builtin_clz + unsigned r; +#if defined(NPY_HAVE_SSE2) && defined(_MSC_VER) + unsigned long rl; + (void)_BitScanReverse(&rl, (unsigned long)a); + r = (unsigned)rl; +#elif defined(NPY_HAVE_SSE2) && (defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)) + __asm__("bsr %1, %0" : "=r" (r) : "r"(a)); +#elif defined(__GNUC__) || defined(__clang__) + r = 31 - __builtin_clz(a); // performs on arm -> clz, ppc -> cntlzw +#else + r = 0; + while (a >>= 1) { + r++; + } +#endif + return r; +} +NPY_FINLINE unsigned npyv__bitscan_revnz_u64(npy_uint64 a) +{ + assert(a > 0); // due to use __builtin_clzll +#if defined(_M_AMD64) && defined(_MSC_VER) + unsigned long rl; + (void)_BitScanReverse64(&rl, a); + return (unsigned)rl; +#elif defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)) + npy_uint64 r; + __asm__("bsrq %1, %0" : "=r"(r) : "r"(a)); + return (unsigned)r; +#elif defined(__GNUC__) || defined(__clang__) + return 63 - __builtin_clzll(a); +#else + npy_uint64 a_hi = a >> 32; + if (a_hi == 0) { + return npyv__bitscan_revnz_u32((npy_uint32)a); + } + return 32 + npyv__bitscan_revnz_u32((npy_uint32)a_hi); +#endif +} +/** + * Divides 128-bit unsigned integer by a 64-bit when the lower + * 64-bit of the dividend is zero. + * + * This function is needed to calculate the multiplier of 64-bit integer division + * see npyv_divisor_u64/npyv_divisor_s64. 
+ */ +NPY_FINLINE npy_uint64 npyv__divh128_u64(npy_uint64 high, npy_uint64 divisor) +{ + assert(divisor > 1); + npy_uint64 quotient; +#if defined(_M_X64) && defined(_MSC_VER) && _MSC_VER >= 1920 + npy_uint64 remainder; + quotient = _udiv128(high, 0, divisor, &remainder); + (void)remainder; +#elif defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)) + __asm__("divq %[d]" : "=a"(quotient) : [d] "r"(divisor), "a"(0), "d"(high)); +#elif defined(__SIZEOF_INT128__) + quotient = (npy_uint64)((((__uint128_t)high) << 64) / divisor); +#else + /** + * Minified version based on Donald Knuth’s Algorithm D (Division of nonnegative integers), + * and Generic implementation in Hacker’s Delight. + * + * See https://skanthak.homepage.t-online.de/division.html + * with respect to the license of the Hacker's Delight book + * (https://web.archive.org/web/20190408122508/http://www.hackersdelight.org/permissions.htm) + */ + // shift amount for normalize + unsigned ldz = 63 - npyv__bitscan_revnz_u64(divisor); + // normalize divisor + divisor <<= ldz; + high <<= ldz; + // break divisor up into two 32-bit digits + npy_uint32 divisor_hi = divisor >> 32; + npy_uint32 divisor_lo = divisor & 0xFFFFFFFF; + // compute high quotient digit + npy_uint32 quotient_hi = (npy_uint32)(high / divisor_hi); + npy_uint64 remainder = high - divisor_hi * quotient_hi; + npy_uint64 base32 = 1ULL << 32; + while (quotient_hi >= base32 || quotient_hi*divisor_lo > base32*remainder) { + remainder += --divisor_hi; + if (remainder >= base32) { + break; + } + } + // compute dividend digit pairs + npy_uint64 dividend_pairs = base32*high - divisor*quotient_hi; + // compute second quotient digit for lower zeros + npy_uint32 quotient_lo = (npy_uint32)(dividend_pairs / divisor_hi); + quotient = base32*quotient_hi + quotient_lo; +#endif + return quotient; +} +// Initializing divisor parameters for unsigned 8-bit division +NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d) +{ + unsigned 
l, l2, sh1, sh2, m; + switch (d) { + case 0: // LCOV_EXCL_LINE + // for potential divide by zero, On x86 GCC inserts `ud2` instruction + // instead of letting the HW/CPU trap it which leads to illegal instruction exception. + // 'volatile' should suppress this behavior and allow us to raise HW/CPU + // arithmetic exception. + m = sh1 = sh2 = 1 / ((npy_uint8 volatile *)&d)[0]; + break; + case 1: + m = 1; sh1 = sh2 = 0; + break; + case 2: + m = 1; sh1 = 1; sh2 = 0; + break; + default: + l = npyv__bitscan_revnz_u32(d - 1) + 1; // ceil(log2(d)) + l2 = (npy_uint8)(1 << l); // 2^l, overflow to 0 if l = 8 + m = ((l2 - d) << 8) / d + 1; // multiplier + sh1 = 1; sh2 = l - 1; // shift counts + } + npyv_u8x3 divisor; + divisor.val[0] = npyv_setall_u8(m); +#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 + divisor.val[1] = npyv_set_u8(sh1); + divisor.val[2] = npyv_set_u8(sh2); +#elif defined(NPY_HAVE_VSX2) + divisor.val[1] = npyv_setall_u8(sh1); + divisor.val[2] = npyv_setall_u8(sh2); +#elif defined(NPY_HAVE_NEON) + divisor.val[1] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh1)); + divisor.val[2] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh2)); +#else + #error "please initialize the shifting operand for the new architecture" +#endif + return divisor; +} +// Initializing divisor parameters for signed 8-bit division +NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d); +NPY_FINLINE npyv_s8x3 npyv_divisor_s8(npy_int8 d) +{ +#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 + npyv_s16x3 p = npyv_divisor_s16(d); + npyv_s8x3 r; + r.val[0] = npyv_reinterpret_s8_s16(p.val[0]); + r.val[1] = npyv_reinterpret_s8_s16(p.val[1]); + r.val[2] = npyv_reinterpret_s8_s16(p.val[2]); + return r; +#else + int d1 = abs(d); + int sh, m; + if (d1 > 1) { + sh = (int)npyv__bitscan_revnz_u32(d1-1); // ceil(log2(abs(d))) - 1 + m = (1 << (8 + sh)) / d1 + 1; // multiplier + } + else if (d1 == 1) { + sh = 0; m = 1; + } + else { + // raise arithmetic exception for d == 0 + sh = m = 1 / ((npy_int8 volatile *)&d)[0]; // 
LCOV_EXCL_LINE + } + npyv_s8x3 divisor; + divisor.val[0] = npyv_setall_s8(m); + divisor.val[2] = npyv_setall_s8(d < 0 ? -1 : 0); + #ifdef NPY_HAVE_VSX2 + divisor.val[1] = npyv_setall_s8(sh); + #elif defined(NPY_HAVE_NEON) + divisor.val[1] = npyv_setall_s8(-sh); + #else + #error "please initialize the shifting operand for the new architecture" + #endif + return divisor; +#endif +} +// Initializing divisor parameters for unsigned 16-bit division +NPY_FINLINE npyv_u16x3 npyv_divisor_u16(npy_uint16 d) +{ + unsigned l, l2, sh1, sh2, m; + switch (d) { + case 0: // LCOV_EXCL_LINE + // raise arithmetic exception for d == 0 + m = sh1 = sh2 = 1 / ((npy_uint16 volatile *)&d)[0]; + break; + case 1: + m = 1; sh1 = sh2 = 0; + break; + case 2: + m = 1; sh1 = 1; sh2 = 0; + break; + default: + l = npyv__bitscan_revnz_u32(d - 1) + 1; // ceil(log2(d)) + l2 = (npy_uint16)(1 << l); // 2^l, overflow to 0 if l = 16 + m = ((l2 - d) << 16) / d + 1; // multiplier + sh1 = 1; sh2 = l - 1; // shift counts + } + npyv_u16x3 divisor; + divisor.val[0] = npyv_setall_u16(m); +#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 + divisor.val[1] = npyv_set_u16(sh1); + divisor.val[2] = npyv_set_u16(sh2); +#elif defined(NPY_HAVE_VSX2) + divisor.val[1] = npyv_setall_u16(sh1); + divisor.val[2] = npyv_setall_u16(sh2); +#elif defined(NPY_HAVE_NEON) + divisor.val[1] = npyv_reinterpret_u16_s16(npyv_setall_s16(-sh1)); + divisor.val[2] = npyv_reinterpret_u16_s16(npyv_setall_s16(-sh2)); +#else + #error "please initialize the shifting operand for the new architecture" +#endif + return divisor; +} +// Initializing divisor parameters for signed 16-bit division +NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d) +{ + int d1 = abs(d); + int sh, m; + if (d1 > 1) { + sh = (int)npyv__bitscan_revnz_u32(d1 - 1); // ceil(log2(abs(d))) - 1 + m = (1 << (16 + sh)) / d1 + 1; // multiplier + } + else if (d1 == 1) { + sh = 0; m = 1; + } + else { + // raise arithmetic exception for d == 0 + sh = m = 1 / ((npy_int16 volatile *)&d)[0]; // 
LCOV_EXCL_LINE + } + npyv_s16x3 divisor; + divisor.val[0] = npyv_setall_s16(m); + divisor.val[2] = npyv_setall_s16(d < 0 ? -1 : 0); // sign of divisor +#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 + divisor.val[1] = npyv_set_s16(sh); +#elif defined(NPY_HAVE_VSX2) + divisor.val[1] = npyv_setall_s16(sh); +#elif defined(NPY_HAVE_NEON) + divisor.val[1] = npyv_setall_s16(-sh); +#else + #error "please initialize the shifting operand for the new architecture" +#endif + return divisor; +} +// Initializing divisor parameters for unsigned 32-bit division +NPY_FINLINE npyv_u32x3 npyv_divisor_u32(npy_uint32 d) +{ + npy_uint32 l, l2, sh1, sh2, m; + switch (d) { + case 0: // LCOV_EXCL_LINE + // raise arithmetic exception for d == 0 + m = sh1 = sh2 = 1 / ((npy_uint32 volatile *)&d)[0]; // LCOV_EXCL_LINE + break; + case 1: + m = 1; sh1 = sh2 = 0; + break; + case 2: + m = 1; sh1 = 1; sh2 = 0; + break; + default: + l = npyv__bitscan_revnz_u32(d - 1) + 1; // ceil(log2(d)) + l2 = (npy_uint32)(1ULL << l); // 2^l, overflow to 0 if l = 32 + m = ((npy_uint64)(l2 - d) << 32) / d + 1; // multiplier + sh1 = 1; sh2 = l - 1; // shift counts + } + npyv_u32x3 divisor; + divisor.val[0] = npyv_setall_u32(m); +#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 + divisor.val[1] = npyv_set_u32(sh1); + divisor.val[2] = npyv_set_u32(sh2); +#elif defined(NPY_HAVE_VSX2) + divisor.val[1] = npyv_setall_u32(sh1); + divisor.val[2] = npyv_setall_u32(sh2); +#elif defined(NPY_HAVE_NEON) + divisor.val[1] = npyv_reinterpret_u32_s32(npyv_setall_s32(-sh1)); + divisor.val[2] = npyv_reinterpret_u32_s32(npyv_setall_s32(-sh2)); +#else + #error "please initialize the shifting operand for the new architecture" +#endif + return divisor; +} +// Initializing divisor parameters for signed 32-bit division +NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d) +{ + npy_int32 d1 = abs(d); + npy_int32 sh, m; + if (d1 > 1) { + sh = npyv__bitscan_revnz_u32(d1 - 1); // ceil(log2(abs(d))) - 1 + m = (1ULL << (32 + sh)) / d1 + 1; // multiplier + } + 
else if (d1 == 1) { + sh = 0; m = 1; + } + // fix abs overflow + else if (d == (1 << 31)) { + m = d + 1; + sh = 30; + } + else { + // raise arithmetic exception for d == 0 + sh = m = 1 / ((npy_int32 volatile *)&d)[0]; // LCOV_EXCL_LINE + } + npyv_s32x3 divisor; + divisor.val[0] = npyv_setall_s32(m); + divisor.val[2] = npyv_setall_s32(d < 0 ? -1 : 0); // sign of divisor +#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 + divisor.val[1] = npyv_set_s32(sh); +#elif defined(NPY_HAVE_VSX2) + divisor.val[1] = npyv_setall_s32(sh); +#elif defined(NPY_HAVE_NEON) + divisor.val[1] = npyv_setall_s32(-sh); +#else + #error "please initialize the shifting operand for the new architecture" +#endif + return divisor; +} +// Initializing divisor parameters for unsigned 64-bit division +NPY_FINLINE npyv_u64x3 npyv_divisor_u64(npy_uint64 d) +{ + npyv_u64x3 divisor; +#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_NEON) + divisor.val[0] = npyv_setall_u64(d); +#else + npy_uint64 l, l2, sh1, sh2, m; + switch (d) { + case 0: // LCOV_EXCL_LINE + // raise arithmetic exception for d == 0 + m = sh1 = sh2 = 1 / ((npy_uint64 volatile *)&d)[0]; // LCOV_EXCL_LINE + break; + case 1: + m = 1; sh1 = sh2 = 0; + break; + case 2: + m = 1; sh1 = 1; sh2 = 0; + break; + default: + l = npyv__bitscan_revnz_u64(d - 1) + 1; // ceil(log2(d)) + l2 = l < 64 ? 
1ULL << l : 0; // 2^l + m = npyv__divh128_u64(l2 - d, d) + 1; // multiplier + sh1 = 1; sh2 = l - 1; // shift counts + } + divisor.val[0] = npyv_setall_u64(m); + #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 + divisor.val[1] = npyv_set_u64(sh1); + divisor.val[2] = npyv_set_u64(sh2); + #else + #error "please initialize the shifting operand for the new architecture" + #endif +#endif + return divisor; +} +// Initializing divisor parameters for signed 64-bit division +NPY_FINLINE npyv_s64x3 npyv_divisor_s64(npy_int64 d) +{ + npyv_s64x3 divisor; +#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_NEON) + divisor.val[0] = npyv_setall_s64(d); + divisor.val[1] = npyv_cvt_s64_b64( + npyv_cmpeq_s64(npyv_setall_s64(-1), divisor.val[0]) + ); +#else + npy_int64 d1 = llabs(d); + npy_int64 sh, m; + if (d1 > 1) { + sh = npyv__bitscan_revnz_u64(d1 - 1); // ceil(log2(abs(d))) - 1 + m = npyv__divh128_u64(1ULL << sh, d1) + 1; // multiplier + } + else if (d1 == 1) { + sh = 0; m = 1; + } + // fix abs overflow + else if (d == (1LL << 63)) { + m = d + 1; + sh = 62; + } + else { + // raise arithmetic exception for d == 0 + sh = m = 1 / ((npy_int64 volatile *)&d)[0]; // LCOV_EXCL_LINE + } + divisor.val[0] = npyv_setall_s64(m); + divisor.val[2] = npyv_setall_s64(d < 0 ? 
-1 : 0); // sign of divisor + #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 + divisor.val[1] = npyv_set_s64(sh); + #else + #error "please initialize the shifting operand for the new architecture" + #endif +#endif + return divisor; +} + +#endif // _NPY_SIMD_INTDIV_H diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h index 69a49f571..00994806d 100644 --- a/numpy/core/src/common/simd/neon/arithmetic.h +++ b/numpy/core/src/common/simd/neon/arithmetic.h @@ -61,6 +61,154 @@ #define npyv_mul_f64 vmulq_f64 /*************************** + * Integer Division + ***************************/ +// See simd/intdiv.h for more clarification +// divide each unsigned 8-bit element by a precomputed divisor +NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) +{ + const uint8x8_t mulc_lo = vget_low_u8(divisor.val[0]); + // high part of unsigned multiplication + uint16x8_t mull_lo = vmull_u8(vget_low_u8(a), mulc_lo); +#if NPY_SIMD_F64 + uint16x8_t mull_hi = vmull_high_u8(a, divisor.val[0]); + // get the high unsigned bytes + uint8x16_t mulhi = vuzp2q_u8(vreinterpretq_u8_u16(mull_lo), vreinterpretq_u8_u16(mull_hi)); +#else + const uint8x8_t mulc_hi = vget_high_u8(divisor.val[0]); + uint16x8_t mull_hi = vmull_u8(vget_high_u8(a), mulc_hi); + uint8x16_t mulhi = vuzpq_u8(vreinterpretq_u8_u16(mull_lo), vreinterpretq_u8_u16(mull_hi)).val[1]; +#endif + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + uint8x16_t q = vsubq_u8(a, mulhi); + q = vshlq_u8(q, vreinterpretq_s8_u8(divisor.val[1])); + q = vaddq_u8(mulhi, q); + q = vshlq_u8(q, vreinterpretq_s8_u8(divisor.val[2])); + return q; +} +// divide each signed 8-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor) +{ + const int8x8_t mulc_lo = vget_low_s8(divisor.val[0]); + // high part of signed multiplication + int16x8_t mull_lo = vmull_s8(vget_low_s8(a), mulc_lo); +#if NPY_SIMD_F64 + int16x8_t mull_hi = 
vmull_high_s8(a, divisor.val[0]); + // get the high unsigned bytes + int8x16_t mulhi = vuzp2q_s8(vreinterpretq_s8_s16(mull_lo), vreinterpretq_s8_s16(mull_hi)); +#else + const int8x8_t mulc_hi = vget_high_s8(divisor.val[0]); + int16x8_t mull_hi = vmull_s8(vget_high_s8(a), mulc_hi); + int8x16_t mulhi = vuzpq_s8(vreinterpretq_s8_s16(mull_lo), vreinterpretq_s8_s16(mull_hi)).val[1]; +#endif + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + int8x16_t q = vshlq_s8(vaddq_s8(a, mulhi), divisor.val[1]); + q = vsubq_s8(q, vshrq_n_s8(a, 7)); + q = vsubq_s8(veorq_s8(q, divisor.val[2]), divisor.val[2]); + return q; +} +// divide each unsigned 16-bit element by a precomputed divisor +NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor) +{ + const uint16x4_t mulc_lo = vget_low_u16(divisor.val[0]); + // high part of unsigned multiplication + uint32x4_t mull_lo = vmull_u16(vget_low_u16(a), mulc_lo); +#if NPY_SIMD_F64 + uint32x4_t mull_hi = vmull_high_u16(a, divisor.val[0]); + // get the high unsigned bytes + uint16x8_t mulhi = vuzp2q_u16(vreinterpretq_u16_u32(mull_lo), vreinterpretq_u16_u32(mull_hi)); +#else + const uint16x4_t mulc_hi = vget_high_u16(divisor.val[0]); + uint32x4_t mull_hi = vmull_u16(vget_high_u16(a), mulc_hi); + uint16x8_t mulhi = vuzpq_u16(vreinterpretq_u16_u32(mull_lo), vreinterpretq_u16_u32(mull_hi)).val[1]; +#endif + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + uint16x8_t q = vsubq_u16(a, mulhi); + q = vshlq_u16(q, vreinterpretq_s16_u16(divisor.val[1])); + q = vaddq_u16(mulhi, q); + q = vshlq_u16(q, vreinterpretq_s16_u16(divisor.val[2])); + return q; +} +// divide each signed 16-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor) +{ + const int16x4_t mulc_lo = vget_low_s16(divisor.val[0]); + // high part of signed multiplication + int32x4_t mull_lo = vmull_s16(vget_low_s16(a), mulc_lo); +#if NPY_SIMD_F64 + int32x4_t mull_hi 
= vmull_high_s16(a, divisor.val[0]); + // get the high unsigned bytes + int16x8_t mulhi = vuzp2q_s16(vreinterpretq_s16_s32(mull_lo), vreinterpretq_s16_s32(mull_hi)); +#else + const int16x4_t mulc_hi = vget_high_s16(divisor.val[0]); + int32x4_t mull_hi = vmull_s16(vget_high_s16(a), mulc_hi); + int16x8_t mulhi = vuzpq_s16(vreinterpretq_s16_s32(mull_lo), vreinterpretq_s16_s32(mull_hi)).val[1]; +#endif + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + int16x8_t q = vshlq_s16(vaddq_s16(a, mulhi), divisor.val[1]); + q = vsubq_s16(q, vshrq_n_s16(a, 15)); + q = vsubq_s16(veorq_s16(q, divisor.val[2]), divisor.val[2]); + return q; +} +// divide each unsigned 32-bit element by a precomputed divisor +NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor) +{ + const uint32x2_t mulc_lo = vget_low_u32(divisor.val[0]); + // high part of unsigned multiplication + uint64x2_t mull_lo = vmull_u32(vget_low_u32(a), mulc_lo); +#if NPY_SIMD_F64 + uint64x2_t mull_hi = vmull_high_u32(a, divisor.val[0]); + // get the high unsigned bytes + uint32x4_t mulhi = vuzp2q_u32(vreinterpretq_u32_u64(mull_lo), vreinterpretq_u32_u64(mull_hi)); +#else + const uint32x2_t mulc_hi = vget_high_u32(divisor.val[0]); + uint64x2_t mull_hi = vmull_u32(vget_high_u32(a), mulc_hi); + uint32x4_t mulhi = vuzpq_u32(vreinterpretq_u32_u64(mull_lo), vreinterpretq_u32_u64(mull_hi)).val[1]; +#endif + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + uint32x4_t q = vsubq_u32(a, mulhi); + q = vshlq_u32(q, vreinterpretq_s32_u32(divisor.val[1])); + q = vaddq_u32(mulhi, q); + q = vshlq_u32(q, vreinterpretq_s32_u32(divisor.val[2])); + return q; +} +// divide each signed 32-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor) +{ + const int32x2_t mulc_lo = vget_low_s32(divisor.val[0]); + // high part of signed multiplication + int64x2_t mull_lo = vmull_s32(vget_low_s32(a), mulc_lo); +#if NPY_SIMD_F64 
+ int64x2_t mull_hi = vmull_high_s32(a, divisor.val[0]); + // get the high unsigned bytes + int32x4_t mulhi = vuzp2q_s32(vreinterpretq_s32_s64(mull_lo), vreinterpretq_s32_s64(mull_hi)); +#else + const int32x2_t mulc_hi = vget_high_s32(divisor.val[0]); + int64x2_t mull_hi = vmull_s32(vget_high_s32(a), mulc_hi); + int32x4_t mulhi = vuzpq_s32(vreinterpretq_s32_s64(mull_lo), vreinterpretq_s32_s64(mull_hi)).val[1]; +#endif + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + int32x4_t q = vshlq_s32(vaddq_s32(a, mulhi), divisor.val[1]); + q = vsubq_s32(q, vshrq_n_s32(a, 31)); + q = vsubq_s32(veorq_s32(q, divisor.val[2]), divisor.val[2]); + return q; +} +// divide each unsigned 64-bit element by a divisor +NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor) +{ + const uint64_t d = vgetq_lane_u64(divisor.val[0], 0); + return npyv_set_u64(vgetq_lane_u64(a, 0) / d, vgetq_lane_u64(a, 1) / d); +} +// returns the high 64 bits of signed 64-bit multiplication +NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) +{ + const int64_t d = vgetq_lane_s64(divisor.val[0], 0); + return npyv_set_s64(vgetq_lane_s64(a, 0) / d, vgetq_lane_s64(a, 1) / d); +} +/*************************** * Division ***************************/ #if NPY_SIMD_F64 @@ -148,7 +296,7 @@ NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a) { - uint32x2_t a0 = vpadd_u32(vget_low_u32(a), vget_high_u32(a)); + uint32x2_t a0 = vpadd_u32(vget_low_u32(a), vget_high_u32(a)); return (unsigned)vget_lane_u32(vpadd_u32(a0, vget_high_u32(a)),0); } diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h index d6c14228d..a3e2b95de 100644 --- a/numpy/core/src/common/simd/simd.h +++ b/numpy/core/src/common/simd/simd.h @@ -56,6 +56,11 @@ typedef double npyv_lanetype_f64; #include "emulate_maskop.h" #endif +// enable integer divisor generator for all SIMD extensions +#if NPY_SIMD + #include "intdiv.h" +#endif + /** * Some SIMD extensions 
currently(AVX2, AVX512F) require (de facto) * a maximum number of strides sizes when dealing with non-contiguous memory access. diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h index c21b7da2d..a1ec88f56 100644 --- a/numpy/core/src/common/simd/sse/arithmetic.h +++ b/numpy/core/src/common/simd/sse/arithmetic.h @@ -86,6 +86,181 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b) // TODO: after implment Packs intrins /*************************** + * Integer Division + ***************************/ +// See simd/intdiv.h for more clarification +// divide each unsigned 8-bit element by a precomputed divisor +NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) +{ + const __m128i bmask = _mm_set1_epi32(0xFF00FF00); + const __m128i shf1b = _mm_set1_epi8(0xFFU >> _mm_cvtsi128_si32(divisor.val[1])); + const __m128i shf2b = _mm_set1_epi8(0xFFU >> _mm_cvtsi128_si32(divisor.val[2])); + // high part of unsigned multiplication + __m128i mulhi_odd = _mm_mulhi_epu16(a, divisor.val[0]); + __m128i mulhi_even = _mm_mulhi_epu16(_mm_slli_epi16(a, 8), divisor.val[0]); + mulhi_even = _mm_srli_epi16(mulhi_even, 8); + __m128i mulhi = npyv_select_u8(bmask, mulhi_odd, mulhi_even); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m128i q = _mm_sub_epi8(a, mulhi); + q = _mm_and_si128(_mm_srl_epi16(q, divisor.val[1]), shf1b); + q = _mm_add_epi8(mulhi, q); + q = _mm_and_si128(_mm_srl_epi16(q, divisor.val[2]), shf2b); + return q; +} +// divide each signed 8-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor); +NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor) +{ + const __m128i bmask = _mm_set1_epi32(0x00FF00FF); + // instead of _mm_cvtepi8_epi16/_mm_packs_epi16 to wrap around overflow + __m128i divc_even = npyv_divc_s16(_mm_srai_epi16(_mm_slli_epi16(a, 8), 8), divisor); + __m128i divc_odd = 
npyv_divc_s16(_mm_srai_epi16(a, 8), divisor); + divc_odd = _mm_slli_epi16(divc_odd, 8); + return npyv_select_u8(bmask, divc_even, divc_odd); +} +// divide each unsigned 16-bit element by a precomputed divisor +NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor) +{ + // high part of unsigned multiplication + __m128i mulhi = _mm_mulhi_epu16(a, divisor.val[0]); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m128i q = _mm_sub_epi16(a, mulhi); + q = _mm_srl_epi16(q, divisor.val[1]); + q = _mm_add_epi16(mulhi, q); + q = _mm_srl_epi16(q, divisor.val[2]); + return q; +} +// divide each signed 16-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor) +{ + // high part of signed multiplication + __m128i mulhi = _mm_mulhi_epi16(a, divisor.val[0]); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + __m128i q = _mm_sra_epi16(_mm_add_epi16(a, mulhi), divisor.val[1]); + q = _mm_sub_epi16(q, _mm_srai_epi16(a, 15)); + q = _mm_sub_epi16(_mm_xor_si128(q, divisor.val[2]), divisor.val[2]); + return q; +} +// divide each unsigned 32-bit element by a precomputed divisor +NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor) +{ + // high part of unsigned multiplication + __m128i mulhi_even = _mm_srli_epi64(_mm_mul_epu32(a, divisor.val[0]), 32); + __m128i mulhi_odd = _mm_mul_epu32(_mm_srli_epi64(a, 32), divisor.val[0]); +#ifdef NPY_HAVE_SSE41 + __m128i mulhi = _mm_blend_epi16(mulhi_even, mulhi_odd, 0xCC); +#else + __m128i mask_13 = _mm_setr_epi32(0, -1, 0, -1); + mulhi_odd = _mm_and_si128(mulhi_odd, mask_13); + __m128i mulhi = _mm_or_si128(mulhi_even, mulhi_odd); +#endif + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m128i q = _mm_sub_epi32(a, mulhi); + q = _mm_srl_epi32(q, divisor.val[1]); + q = _mm_add_epi32(mulhi, q); + q = _mm_srl_epi32(q, divisor.val[2]); + return q; +} +// divide each signed 32-bit element by a 
precomputed divisor (round towards zero) +NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor) +{ + __m128i asign = _mm_srai_epi32(a, 31); +#ifdef NPY_HAVE_SSE41 + // high part of signed multiplication + __m128i mulhi_even = _mm_srli_epi64(_mm_mul_epi32(a, divisor.val[0]), 32); + __m128i mulhi_odd = _mm_mul_epi32(_mm_srli_epi64(a, 32), divisor.val[0]); + __m128i mulhi = _mm_blend_epi16(mulhi_even, mulhi_odd, 0xCC); +#else // not SSE4.1 + // high part of "unsigned" multiplication + __m128i mulhi_even = _mm_srli_epi64(_mm_mul_epu32(a, divisor.val[0]), 32); + __m128i mulhi_odd = _mm_mul_epu32(_mm_srli_epi64(a, 32), divisor.val[0]); + __m128i mask_13 = _mm_setr_epi32(0, -1, 0, -1); + mulhi_odd = _mm_and_si128(mulhi_odd, mask_13); + __m128i mulhi = _mm_or_si128(mulhi_even, mulhi_odd); + // convert unsigned to signed high multiplication + // mulhi - ((a < 0) ? m : 0) - ((m < 0) ? a : 0); + const __m128i msign= _mm_srai_epi32(divisor.val[0], 31); + __m128i m_asign = _mm_and_si128(divisor.val[0], asign); + __m128i a_msign = _mm_and_si128(a, msign); + mulhi = _mm_sub_epi32(mulhi, m_asign); + mulhi = _mm_sub_epi32(mulhi, a_msign); +#endif + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + __m128i q = _mm_sra_epi32(_mm_add_epi32(a, mulhi), divisor.val[1]); + q = _mm_sub_epi32(q, asign); + q = _mm_sub_epi32(_mm_xor_si128(q, divisor.val[2]), divisor.val[2]); + return q; +} +// returns the high 64 bits of unsigned 64-bit multiplication +// xref https://stackoverflow.com/a/28827013 +NPY_FINLINE npyv_u64 npyv__mullhi_u64(npyv_u64 a, npyv_u64 b) +{ + __m128i lomask = npyv_setall_s64(0xffffffff); + __m128i a_hi = _mm_srli_epi64(a, 32); // a0l, a0h, a1l, a1h + __m128i b_hi = _mm_srli_epi64(b, 32); // b0l, b0h, b1l, b1h + // compute partial products + __m128i w0 = _mm_mul_epu32(a, b); // a0l*b0l, a1l*b1l + __m128i w1 = _mm_mul_epu32(a, b_hi); // a0l*b0h, a1l*b1h + __m128i w2 = _mm_mul_epu32(a_hi, b); // a0h*b0l, a1h*b0l + __m128i w3 = 
_mm_mul_epu32(a_hi, b_hi); // a0h*b0h, a1h*b1h + // sum partial products + __m128i w0h = _mm_srli_epi64(w0, 32); + __m128i s1 = _mm_add_epi64(w1, w0h); + __m128i s1l = _mm_and_si128(s1, lomask); + __m128i s1h = _mm_srli_epi64(s1, 32); + + __m128i s2 = _mm_add_epi64(w2, s1l); + __m128i s2h = _mm_srli_epi64(s2, 32); + + __m128i hi = _mm_add_epi64(w3, s1h); + hi = _mm_add_epi64(hi, s2h); + return hi; +} +// divide each unsigned 64-bit element by a precomputed divisor +NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor) +{ + // high part of unsigned multiplication + __m128i mulhi = npyv__mullhi_u64(a, divisor.val[0]); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m128i q = _mm_sub_epi64(a, mulhi); + q = _mm_srl_epi64(q, divisor.val[1]); + q = _mm_add_epi64(mulhi, q); + q = _mm_srl_epi64(q, divisor.val[2]); + return q; +} +// divide each signed 64-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) +{ + // high part of unsigned multiplication + __m128i mulhi = npyv__mullhi_u64(a, divisor.val[0]); + // convert unsigned to signed high multiplication + // mulhi - ((a < 0) ? m : 0) - ((m < 0) ? 
a : 0); +#ifdef NPY_HAVE_SSE42 + const __m128i msign= _mm_cmpgt_epi64(_mm_setzero_si128(), divisor.val[0]); + __m128i asign = _mm_cmpgt_epi64(_mm_setzero_si128(), a); +#else + const __m128i msign= _mm_srai_epi32(_mm_shuffle_epi32(divisor.val[0], _MM_SHUFFLE(3, 3, 1, 1)), 31); + __m128i asign = _mm_srai_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 1, 1)), 31); +#endif + __m128i m_asign = _mm_and_si128(divisor.val[0], asign); + __m128i a_msign = _mm_and_si128(a, msign); + mulhi = _mm_sub_epi64(mulhi, m_asign); + mulhi = _mm_sub_epi64(mulhi, a_msign); + // q = (a + mulhi) >> sh + __m128i q = _mm_add_epi64(a, mulhi); + // emulate arithmetic right shift + const __m128i sigb = npyv_setall_s64(1LL << 63); + q = _mm_srl_epi64(_mm_add_epi64(q, sigb), divisor.val[1]); + q = _mm_sub_epi64(q, _mm_srl_epi64(sigb, divisor.val[1])); + // q = q - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + q = _mm_sub_epi64(q, asign); + q = _mm_sub_epi64(_mm_xor_si128(q, divisor.val[2]), divisor.val[2]); + return q; +} +/*************************** * Division ***************************/ // TODO: emulate integer division @@ -175,7 +350,7 @@ NPY_FINLINE float npyv_sum_f32(npyv_f32 a) __m128 t2 = _mm_add_ps(a, t1); __m128 t3 = _mm_shuffle_ps(t2, t2, 1); __m128 t4 = _mm_add_ss(t2, t3); - return _mm_cvtss_f32(t4); + return _mm_cvtss_f32(t4); #endif } diff --git a/numpy/core/src/common/simd/sse/misc.h b/numpy/core/src/common/simd/sse/misc.h index 7ba47bc68..1099c491d 100644 --- a/numpy/core/src/common/simd/sse/misc.h +++ b/numpy/core/src/common/simd/sse/misc.h @@ -18,21 +18,16 @@ #define npyv_zero_f64 _mm_setzero_pd // vector with a specific value set to all lanes -#define npyv_setall_u8(VAL) _mm_set1_epi8((char)VAL) -#define npyv_setall_s8(VAL) _mm_set1_epi8((char)VAL) -#define npyv_setall_u16(VAL) _mm_set1_epi16((short)VAL) -#define npyv_setall_s16(VAL) _mm_set1_epi16((short)VAL) -#define npyv_setall_u32(VAL) _mm_set1_epi32((int)VAL) -#define npyv_setall_s32(VAL) _mm_set1_epi32(VAL) -#if 
!defined(__x86_64__) && !defined(_M_X64) - #define npyv_setall_u64(VAL) _mm_set_epi32((int)(VAL >> 32), (int)VAL, (int)(VAL >> 32), (int)VAL) - #define npyv_setall_s64 npyv_setall_u64 -#else - #define npyv_setall_u64(VAL) _mm_set1_epi64x(VAL) - #define npyv_setall_s64(VAL) _mm_set1_epi64x(VAL) -#endif -#define npyv_setall_f32(VAL) _mm_set1_ps(VAL) -#define npyv_setall_f64(VAL) _mm_set1_pd(VAL) +#define npyv_setall_u8(VAL) _mm_set1_epi8((char)(VAL)) +#define npyv_setall_s8(VAL) _mm_set1_epi8((char)(VAL)) +#define npyv_setall_u16(VAL) _mm_set1_epi16((short)(VAL)) +#define npyv_setall_s16(VAL) _mm_set1_epi16((short)(VAL)) +#define npyv_setall_u32(VAL) _mm_set1_epi32((int)(VAL)) +#define npyv_setall_s32(VAL) _mm_set1_epi32((int)(VAL)) +#define npyv_setall_u64(VAL) _mm_set1_epi64x((npy_int64)(VAL)) +#define npyv_setall_s64(VAL) _mm_set1_epi64x((npy_int64)(VAL)) +#define npyv_setall_f32 _mm_set1_ps +#define npyv_setall_f64 _mm_set1_pd /** * vector with specific values set to each lane and diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h index 7c4e32f27..123fcaf92 100644 --- a/numpy/core/src/common/simd/vsx/arithmetic.h +++ b/numpy/core/src/common/simd/vsx/arithmetic.h @@ -95,6 +95,138 @@ #define npyv_mul_f64 vec_mul /*************************** + * Integer Division + ***************************/ +/*** + * TODO: Add support for VSX4(Power10) + */ +// See simd/intdiv.h for more clarification +// divide each unsigned 8-bit element by a precomputed divisor +NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) +{ + const npyv_u8 mergeo_perm = { + 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 + }; + // high part of unsigned multiplication + npyv_u16 mul_even = vec_mule(a, divisor.val[0]); + npyv_u16 mul_odd = vec_mulo(a, divisor.val[0]); + npyv_u8 mulhi = (npyv_u8)vec_perm(mul_even, mul_odd, mergeo_perm); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + npyv_u8 q = vec_sub(a, mulhi); + q = 
vec_sr(q, divisor.val[1]); + q = vec_add(mulhi, q); + q = vec_sr(q, divisor.val[2]); + return q; +} +// divide each signed 8-bit element by a precomputed divisor +NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor) +{ + const npyv_u8 mergeo_perm = { + 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 + }; + // high part of signed multiplication + npyv_s16 mul_even = vec_mule(a, divisor.val[0]); + npyv_s16 mul_odd = vec_mulo(a, divisor.val[0]); + npyv_s8 mulhi = (npyv_s8)vec_perm(mul_even, mul_odd, mergeo_perm); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + npyv_s8 q = vec_sra(vec_add(a, mulhi), (npyv_u8)divisor.val[1]); + q = vec_sub(q, vec_sra(a, npyv_setall_u8(7))); + q = vec_sub(vec_xor(q, divisor.val[2]), divisor.val[2]); + return q; +} +// divide each unsigned 16-bit element by a precomputed divisor +NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor) +{ + const npyv_u8 mergeo_perm = { + 2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 + }; + // high part of unsigned multiplication + npyv_u32 mul_even = vec_mule(a, divisor.val[0]); + npyv_u32 mul_odd = vec_mulo(a, divisor.val[0]); + npyv_u16 mulhi = (npyv_u16)vec_perm(mul_even, mul_odd, mergeo_perm); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + npyv_u16 q = vec_sub(a, mulhi); + q = vec_sr(q, divisor.val[1]); + q = vec_add(mulhi, q); + q = vec_sr(q, divisor.val[2]); + return q; +} +// divide each signed 16-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor) +{ + const npyv_u8 mergeo_perm = { + 2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 + }; + // high part of signed multiplication + npyv_s32 mul_even = vec_mule(a, divisor.val[0]); + npyv_s32 mul_odd = vec_mulo(a, divisor.val[0]); + npyv_s16 mulhi = (npyv_s16)vec_perm(mul_even, mul_odd, mergeo_perm); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ 
dsign) - dsign + npyv_s16 q = vec_sra(vec_add(a, mulhi), (npyv_u16)divisor.val[1]); + q = vec_sub(q, vec_sra(a, npyv_setall_u16(15))); + q = vec_sub(vec_xor(q, divisor.val[2]), divisor.val[2]); + return q; +} +// divide each unsigned 32-bit element by a precomputed divisor +NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor) +{ +#if defined(__GNUC__) && __GNUC__ < 8 + // Doubleword integer wide multiplication supported by GCC 8+ + npyv_u64 mul_even, mul_odd; + __asm__ ("vmulouw %0,%1,%2" : "=v" (mul_even) : "v" (a), "v" (divisor.val[0])); + __asm__ ("vmuleuw %0,%1,%2" : "=v" (mul_odd) : "v" (a), "v" (divisor.val[0])); +#else + // Doubleword integer wide multiplication supported by GCC 8+ + npyv_u64 mul_even = vec_mule(a, divisor.val[0]); + npyv_u64 mul_odd = vec_mulo(a, divisor.val[0]); +#endif + // high part of unsigned multiplication + npyv_u32 mulhi = vec_mergeo((npyv_u32)mul_even, (npyv_u32)mul_odd); + // floor(x/d) = (((a-mulhi) >> sh1) + mulhi) >> sh2 + npyv_u32 q = vec_sub(a, mulhi); + q = vec_sr(q, divisor.val[1]); + q = vec_add(mulhi, q); + q = vec_sr(q, divisor.val[2]); + return q; +} +// divide each signed 32-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor) +{ +#if defined(__GNUC__) && __GNUC__ < 8 + // Doubleword integer wide multiplication supported by GCC8+ + npyv_s64 mul_even, mul_odd; + __asm__ ("vmulosw %0,%1,%2" : "=v" (mul_even) : "v" (a), "v" (divisor.val[0])); + __asm__ ("vmulesw %0,%1,%2" : "=v" (mul_odd) : "v" (a), "v" (divisor.val[0])); +#else + // Doubleword integer wide multiplication supported by GCC8+ + npyv_s64 mul_even = vec_mule(a, divisor.val[0]); + npyv_s64 mul_odd = vec_mulo(a, divisor.val[0]); +#endif + // high part of signed multiplication + npyv_s32 mulhi = vec_mergeo((npyv_s32)mul_even, (npyv_s32)mul_odd); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + npyv_s32 q = vec_sra(vec_add(a, mulhi), 
(npyv_u32)divisor.val[1]); + q = vec_sub(q, vec_sra(a, npyv_setall_u32(31))); + q = vec_sub(vec_xor(q, divisor.val[2]), divisor.val[2]); + return q; +} +// divide each unsigned 64-bit element by a precomputed divisor +NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor) +{ + const npy_uint64 d = vec_extract(divisor.val[0], 0); + return npyv_set_u64(vec_extract(a, 0) / d, vec_extract(a, 1) / d); +} +// divide each signed 64-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) +{ + npyv_b64 overflow = vec_and(vec_cmpeq(a, npyv_setall_s64(-1LL << 63)), (npyv_b64)divisor.val[1]); + npyv_s64 d = vec_sel(divisor.val[0], npyv_setall_s64(1), overflow); + return vec_div(a, d); +} +/*************************** * Division ***************************/ #define npyv_div_f32 vec_div diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py index 1d1a111be..8306a86d3 100644 --- a/numpy/core/tests/test_simd.py +++ b/numpy/core/tests/test_simd.py @@ -733,6 +733,76 @@ class _SIMD_ALL(_Test_Utility): div = self.div(vdata_a, vdata_b) assert div == data_div + def test_arithmetic_intdiv(self): + """ + Test integer division intrinics: + npyv_divisor_##sfx + npyv_divc_##sfx + """ + if self._is_fp(): + return + + def trunc_div(a, d): + """ + Divide towards zero works with large integers > 2^53, + equivalent to int(a/d) + """ + sign_a, sign_d = a < 0, d < 0 + if a == 0 or sign_a == sign_d: + return a // d + return (a + sign_d - sign_a) // d + 1 + + int_min = self._int_min() if self._is_signed() else 1 + int_max = self._int_max() + rdata = ( + 0, 1, self.nlanes, int_max-self.nlanes, + int_min, int_min//2 + 1 + ) + divisors = (1, 2, self.nlanes, int_min, int_max, int_max//2) + + for x, d in zip(rdata, divisors): + data = self._data(x) + vdata = self.load(data) + data_divc = [trunc_div(a, d) for a in data] + divisor = self.divisor(d) + divc = self.divc(vdata, divisor) + assert 
divc == data_divc + + if not self._is_signed(): + return + + safe_neg = lambda x: -x-1 if -x > int_max else -x + # test round divison for signed integers + for x, d in zip(rdata, divisors): + d_neg = safe_neg(d) + data = self._data(x) + data_neg = [safe_neg(a) for a in data] + vdata = self.load(data) + vdata_neg = self.load(data_neg) + divisor = self.divisor(d) + divisor_neg = self.divisor(d_neg) + + # round towards zero + data_divc = [trunc_div(a, d_neg) for a in data] + divc = self.divc(vdata, divisor_neg) + assert divc == data_divc + data_divc = [trunc_div(a, d) for a in data_neg] + divc = self.divc(vdata_neg, divisor) + assert divc == data_divc + + # test truncate sign if the dividend is zero + vzero = self.zero() + for d in (-1, -10, -100, int_min//2, int_min): + divisor = self.divisor(d) + divc = self.divc(vzero, divisor) + assert divc == vzero + + # test overflow + vmin = self.setall(int_min) + divisor = self.divisor(-1) + divc = self.divc(vmin, divisor) + assert divc == vmin + def test_arithmetic_reduce_sum(self): """ Test reduce sum intrinics: |