author    Sayed Adel <seiko@imavr.com>  2021-01-16 14:01:44 +0000
committer Sayed Adel <seiko@imavr.com>  2021-03-08 08:19:12 +0200
commit    2da9858aed5efca22d92682000f03e341dc46d64 (patch)
tree      76900553eaba01e5e48f6768956f9c4e4add5edd
parent    5c185cc7c104928cea93917ebb806797d5d8d7dd (diff)
download  numpy-2da9858aed5efca22d92682000f03e341dc46d64.tar.gz
SIMD: add NPYV fast integer division intrinsics for NEON
 numpy/core/src/common/simd/neon/arithmetic.h | 150 +++++++++++++++++++++++++++-
 1 file changed, 149 insertions(+), 1 deletion(-)
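
The new npyv_divc_* kernels pair with the divisor-precompute helpers declared in numpy/core/src/common/simd/intdiv.h (referenced in the patch comments). Below is a minimal usage sketch for the unsigned 8-bit case; it assumes the npyv_divisor_u8() helper and the usual NPYV load/store intrinsics, and the function name and loop are illustrative, not part of the patch.

#include <stddef.h>
#include <stdint.h>
#include "simd/simd.h"   // NPYV universal intrinsics

#if NPY_SIMD
// Divide src[0..len) by a runtime divisor d (d != 0) and store the quotients
// in dst. The divisor parameters are precomputed once and reused per vector.
static void divide_u8_by_scalar(uint8_t *dst, const uint8_t *src,
                                size_t len, uint8_t d)
{
    const npyv_u8x3 divisor = npyv_divisor_u8(d);    // multiplier + shift counts
    size_t i = 0;
    for (; i + npyv_nlanes_u8 <= len; i += npyv_nlanes_u8) {
        npyv_u8 a = npyv_load_u8(src + i);
        npyv_store_u8(dst + i, npyv_divc_u8(a, divisor));
    }
    for (; i < len; ++i) {
        dst[i] = (uint8_t)(src[i] / d);              // scalar tail
    }
}
#endif
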
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
index 69a49f571..00994806d 100644
--- a/numpy/core/src/common/simd/neon/arithmetic.h
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -61,6 +61,154 @@
#define npyv_mul_f64 vmulq_f64
/***************************
+ * Integer Division
+ ***************************/
+// See simd/intdiv.h for more clarification
+// divide each unsigned 8-bit element by a precomputed divisor
+NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
+{
+ const uint8x8_t mulc_lo = vget_low_u8(divisor.val[0]);
+ // high part of unsigned multiplication
+ uint16x8_t mull_lo = vmull_u8(vget_low_u8(a), mulc_lo);
+#if NPY_SIMD_F64
+ uint16x8_t mull_hi = vmull_high_u8(a, divisor.val[0]);
+ // get the high unsigned bytes
+ uint8x16_t mulhi = vuzp2q_u8(vreinterpretq_u8_u16(mull_lo), vreinterpretq_u8_u16(mull_hi));
+#else
+ const uint8x8_t mulc_hi = vget_high_u8(divisor.val[0]);
+ uint16x8_t mull_hi = vmull_u8(vget_high_u8(a), mulc_hi);
+ uint8x16_t mulhi = vuzpq_u8(vreinterpretq_u8_u16(mull_lo), vreinterpretq_u8_u16(mull_hi)).val[1];
+#endif
+ // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
+ // (intdiv.h stores sh1/sh2 negated, so the vshlq calls below shift right)
+ uint8x16_t q = vsubq_u8(a, mulhi);
+ q = vshlq_u8(q, vreinterpretq_s8_u8(divisor.val[1]));
+ q = vaddq_u8(mulhi, q);
+ q = vshlq_u8(q, vreinterpretq_s8_u8(divisor.val[2]));
+ return q;
+}
+// divide each signed 8-bit element by a precomputed divisor (round towards zero)
+NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor)
+{
+ const int8x8_t mulc_lo = vget_low_s8(divisor.val[0]);
+ // high part of signed multiplication
+ int16x8_t mull_lo = vmull_s8(vget_low_s8(a), mulc_lo);
+#if NPY_SIMD_F64
+ int16x8_t mull_hi = vmull_high_s8(a, divisor.val[0]);
+ // get the high signed bytes
+ int8x16_t mulhi = vuzp2q_s8(vreinterpretq_s8_s16(mull_lo), vreinterpretq_s8_s16(mull_hi));
+#else
+ const int8x8_t mulc_hi = vget_high_s8(divisor.val[0]);
+ int16x8_t mull_hi = vmull_s8(vget_high_s8(a), mulc_hi);
+ int8x16_t mulhi = vuzpq_s8(vreinterpretq_s8_s16(mull_lo), vreinterpretq_s8_s16(mull_hi)).val[1];
+#endif
+ // q = ((a + mulhi) >> sh1) - XSIGN(a), where XSIGN(a) = (a < 0) ? -1 : 0
+ // trunc(a/d) = (q ^ dsign) - dsign, i.e. negate q when the divisor is negative
+ int8x16_t q = vshlq_s8(vaddq_s8(a, mulhi), divisor.val[1]);
+ q = vsubq_s8(q, vshrq_n_s8(a, 7));
+ q = vsubq_s8(veorq_s8(q, divisor.val[2]), divisor.val[2]);
+ return q;
+}
+// divide each unsigned 16-bit element by a precomputed divisor
+NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor)
+{
+ const uint16x4_t mulc_lo = vget_low_u16(divisor.val[0]);
+ // high part of unsigned multiplication
+ uint32x4_t mull_lo = vmull_u16(vget_low_u16(a), mulc_lo);
+#if NPY_SIMD_F64
+ uint32x4_t mull_hi = vmull_high_u16(a, divisor.val[0]);
+ // get the high unsigned halfwords
+ uint16x8_t mulhi = vuzp2q_u16(vreinterpretq_u16_u32(mull_lo), vreinterpretq_u16_u32(mull_hi));
+#else
+ const uint16x4_t mulc_hi = vget_high_u16(divisor.val[0]);
+ uint32x4_t mull_hi = vmull_u16(vget_high_u16(a), mulc_hi);
+ uint16x8_t mulhi = vuzpq_u16(vreinterpretq_u16_u32(mull_lo), vreinterpretq_u16_u32(mull_hi)).val[1];
+#endif
+ // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
+ uint16x8_t q = vsubq_u16(a, mulhi);
+ q = vshlq_u16(q, vreinterpretq_s16_u16(divisor.val[1]));
+ q = vaddq_u16(mulhi, q);
+ q = vshlq_u16(q, vreinterpretq_s16_u16(divisor.val[2]));
+ return q;
+}
+// divide each signed 16-bit element by a precomputed divisor (round towards zero)
+NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor)
+{
+ const int16x4_t mulc_lo = vget_low_s16(divisor.val[0]);
+ // high part of signed multiplication
+ int32x4_t mull_lo = vmull_s16(vget_low_s16(a), mulc_lo);
+#if NPY_SIMD_F64
+ int32x4_t mull_hi = vmull_high_s16(a, divisor.val[0]);
+ // get the high signed halfwords
+ int16x8_t mulhi = vuzp2q_s16(vreinterpretq_s16_s32(mull_lo), vreinterpretq_s16_s32(mull_hi));
+#else
+ const int16x4_t mulc_hi = vget_high_s16(divisor.val[0]);
+ int32x4_t mull_hi = vmull_s16(vget_high_s16(a), mulc_hi);
+ int16x8_t mulhi = vuzpq_s16(vreinterpretq_s16_s32(mull_lo), vreinterpretq_s16_s32(mull_hi)).val[1];
+#endif
+ // q = ((a + mulhi) >> sh1) - XSIGN(a)
+ // trunc(a/d) = (q ^ dsign) - dsign
+ int16x8_t q = vshlq_s16(vaddq_s16(a, mulhi), divisor.val[1]);
+ q = vsubq_s16(q, vshrq_n_s16(a, 15));
+ q = vsubq_s16(veorq_s16(q, divisor.val[2]), divisor.val[2]);
+ return q;
+}
+// divide each unsigned 32-bit element by a precomputed divisor
+NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor)
+{
+ const uint32x2_t mulc_lo = vget_low_u32(divisor.val[0]);
+ // high part of unsigned multiplication
+ uint64x2_t mull_lo = vmull_u32(vget_low_u32(a), mulc_lo);
+#if NPY_SIMD_F64
+ uint64x2_t mull_hi = vmull_high_u32(a, divisor.val[0]);
+ // get the high unsigned words
+ uint32x4_t mulhi = vuzp2q_u32(vreinterpretq_u32_u64(mull_lo), vreinterpretq_u32_u64(mull_hi));
+#else
+ const uint32x2_t mulc_hi = vget_high_u32(divisor.val[0]);
+ uint64x2_t mull_hi = vmull_u32(vget_high_u32(a), mulc_hi);
+ uint32x4_t mulhi = vuzpq_u32(vreinterpretq_u32_u64(mull_lo), vreinterpretq_u32_u64(mull_hi)).val[1];
+#endif
+ // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
+ uint32x4_t q = vsubq_u32(a, mulhi);
+ q = vshlq_u32(q, vreinterpretq_s32_u32(divisor.val[1]));
+ q = vaddq_u32(mulhi, q);
+ q = vshlq_u32(q, vreinterpretq_s32_u32(divisor.val[2]));
+ return q;
+}
+// divide each signed 32-bit element by a precomputed divisor (round towards zero)
+NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor)
+{
+ const int32x2_t mulc_lo = vget_low_s32(divisor.val[0]);
+ // high part of signed multiplication
+ int64x2_t mull_lo = vmull_s32(vget_low_s32(a), mulc_lo);
+#if NPY_SIMD_F64
+ int64x2_t mull_hi = vmull_high_s32(a, divisor.val[0]);
+ // get the high signed words
+ int32x4_t mulhi = vuzp2q_s32(vreinterpretq_s32_s64(mull_lo), vreinterpretq_s32_s64(mull_hi));
+#else
+ const int32x2_t mulc_hi = vget_high_s32(divisor.val[0]);
+ int64x2_t mull_hi = vmull_s32(vget_high_s32(a), mulc_hi);
+ int32x4_t mulhi = vuzpq_s32(vreinterpretq_s32_s64(mull_lo), vreinterpretq_s32_s64(mull_hi)).val[1];
+#endif
+ // q = ((a + mulhi) >> sh1) - XSIGN(a)
+ // trunc(a/d) = (q ^ dsign) - dsign
+ int32x4_t q = vshlq_s32(vaddq_s32(a, mulhi), divisor.val[1]);
+ q = vsubq_s32(q, vshrq_n_s32(a, 31));
+ q = vsubq_s32(veorq_s32(q, divisor.val[2]), divisor.val[2]);
+ return q;
+}
+// divide each unsigned 64-bit element by a precomputed divisor
+// (scalar fallback; NEON has no 64-bit multiply-high)
+NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor)
+{
+ const uint64_t d = vgetq_lane_u64(divisor.val[0], 0);
+ return npyv_set_u64(vgetq_lane_u64(a, 0) / d, vgetq_lane_u64(a, 1) / d);
+}
+// divide each signed 64-bit element by a precomputed divisor (round towards zero)
+NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
+{
+ const int64_t d = vgetq_lane_s64(divisor.val[0], 0);
+ return npyv_set_s64(vgetq_lane_s64(a, 0) / d, vgetq_lane_s64(a, 1) / d);
+}
+/***************************
* Division
***************************/
#if NPY_SIMD_F64
@@ -148,7 +296,7 @@
NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a)
{
- uint32x2_t a0 = vpadd_u32(vget_low_u32(a), vget_high_u32(a));
+ uint32x2_t a0 = vpadd_u32(vget_low_u32(a), vget_high_u32(a));
return (unsigned)vget_lane_u32(vpadd_u32(a0, vget_high_u32(a)),0);
}
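
For reference, the unsigned kernels in this patch evaluate the scalar identity whose parameters simd/intdiv.h precomputes, and the signed kernels finish with a branch-free sign fix-up. The standalone sketch below is illustrative only (divisor_u8 and divc_u8_scalar are not NPYV names); it models the unsigned 8-bit path with parameters derived by the usual round-up multiplicative-inverse formula (which should match what intdiv.h computes), exhaustively checks floor(a/d) = (mulhi + ((a - mulhi) >> sh1)) >> sh2 against plain division, and demonstrates the (q ^ dsign) - dsign trick used by the signed kernels.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

// Precompute (m, sh1, sh2) for an 8-bit divisor d > 0:
//   l = ceil(log2(d)), m = floor(2^8 * (2^l - d) / d) + 1,
//   sh1 = min(l, 1),   sh2 = l - sh1
static void divisor_u8(uint8_t d, uint8_t *m, uint8_t *sh1, uint8_t *sh2)
{
    unsigned l = 0;
    while ((1u << l) < d) {
        ++l;
    }
    unsigned l2 = 1u << l;                       // 2^l, up to 256
    unsigned s1 = l > 1 ? 1 : l;                 // min(l, 1)
    *m   = (uint8_t)(((l2 - d) << 8) / d + 1);   // always fits in 8 bits
    *sh1 = (uint8_t)s1;
    *sh2 = (uint8_t)(l - s1);
}
// Evaluate floor(a/d) without a division instruction.
static uint8_t divc_u8_scalar(uint8_t a, uint8_t m, uint8_t sh1, uint8_t sh2)
{
    uint8_t mulhi = (uint8_t)(((unsigned)a * m) >> 8);        // high byte of a*m
    return (uint8_t)((mulhi + ((a - mulhi) >> sh1)) >> sh2);
}

int main(void)
{
    // exhaustive check of the unsigned identity for every 8-bit a and d
    for (unsigned d = 1; d < 256; ++d) {
        uint8_t m, sh1, sh2;
        divisor_u8((uint8_t)d, &m, &sh1, &sh2);
        for (unsigned a = 0; a < 256; ++a) {
            assert(divc_u8_scalar((uint8_t)a, m, sh1, sh2) == a / d);
        }
    }
    // the signed kernels apply the divisor's sign with (q ^ dsign) - dsign,
    // where dsign is 0 for a positive divisor and -1 for a negative one
    int q = 5;
    assert(((q ^ -1) - (-1)) == -q);   // dsign == -1: negates q
    assert(((q ^  0) -  0)   ==  q);   // dsign ==  0: leaves q unchanged
    puts("floor(a/d) identity verified for all 8-bit operands");
    return 0;
}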