summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTim Rowley <timothy.o.rowley@intel.com>2017-11-13 18:39:38 -0600
committerAndres Gomez <agomez@igalia.com>2017-11-21 18:16:46 +0200
commit8edbc8f1091595d677621b407c648512437f1dd7 (patch)
treea4c2d658a73b3517c05213593ece6f4398c8c31f
parent0f4dfee254e1f054c851a729deb4217cc4e14003 (diff)
downloadmesa-8edbc8f1091595d677621b407c648512437f1dd7.tar.gz
swr/rast: Faster emulated simd16 permute
Speed up simd16 frontend (default) on avx/avx2 platforms; fixes performance regression caused by switch to simdlib. Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com> Cc: mesa-stable@lists.freedesktop.org (cherry picked from commit d8489517a572c7e5c5405ebf510db9d20b1e2591)
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl34
1 files changed, 11 insertions, 23 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
index a45429f4b6b..fd248b3792b 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
@@ -518,36 +518,24 @@ SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32
static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
{
- Integer result;
-
- // Ugly slow implementation
- uint32_t const *pA = reinterpret_cast<uint32_t const*>(&a);
- uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
- uint32_t *pResult = reinterpret_cast<uint32_t *>(&result);
-
- for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
- {
- pResult[i] = pA[0xF & pSwiz[i]];
- }
-
- return result;
+ return castps_si(permute_ps(castsi_ps(a), swiz));
}
static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
- Float result;
+ const auto mask = SIMD256T::set1_epi32(7);
- // Ugly slow implementation
- float const *pA = reinterpret_cast<float const*>(&a);
- uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
- float *pResult = reinterpret_cast<float *>(&result);
+ auto lolo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[0], mask));
+ auto lohi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[0], mask));
- for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
- {
- pResult[i] = pA[0xF & pSwiz[i]];
- }
+ auto hilo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[1], mask));
+ auto hihi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[1], mask));
- return result;
+ return Float
+ {
+ SIMD256T::blendv_ps(lolo, lohi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))),
+ SIMD256T::blendv_ps(hilo, hihi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))),
+ };
}
// All of the 512-bit permute2f128_XX intrinsics do the following: