diff options
author | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2022-01-17 15:53:56 +0100 |
---|---|---|
committer | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2022-11-09 11:47:41 +0100 |
commit | 47169462cb21ee013c3db6bc179bd55ddbaaa42e (patch) | |
tree | d4f90ada21e563fd91d453a1f168f3bae562d37c | |
parent | c9b11379ee3e282a82a53995b64160aead8bfa94 (diff) | |
download | qtwebengine-chromium-47169462cb21ee013c3db6bc179bd55ddbaaa42e.tar.gz |
Optimize skvx::fma for GCC
Add specific SIMD specializations of skvx::fma, and make sure that the
32-entry-wide vectors currently in use are split down until they reach them.
Change-Id: I7b6f3636a63742c40e5953661709eeb0d5c676b3
Reviewed-by: Peter Varga <pvarga@inf.u-szeged.hu>
-rw-r--r-- | chromium/third_party/skia/include/private/SkVx.h | 42 |
1 file changed, 42 insertions, 0 deletions
diff --git a/chromium/third_party/skia/include/private/SkVx.h b/chromium/third_party/skia/include/private/SkVx.h
index 2fcc42ed3a3..8993d903dcf 100644
--- a/chromium/third_party/skia/include/private/SkVx.h
+++ b/chromium/third_party/skia/include/private/SkVx.h
@@ -649,11 +649,53 @@ SIN Vec<N,float> abs(const Vec<N,float>& x) { return map( fabsf, x); }
 SIN Vec<N,float> fma(const Vec<N,float>& x,
                      const Vec<N,float>& y,
                      const Vec<N,float>& z) {
+#if defined(__clang__)
     // I don't understand why Clang's codegen is terrible if we write map(fmaf, x,y,z) directly.
     auto fn = [](float x, float y, float z) { return fmaf(x,y,z); };
     return map(fn, x,y,z);
+#else
+    if constexpr(N > 8) {
+        Vec<N,float> out;
+        out.hi = fma(x.hi, y.hi, z.hi);
+        out.lo = fma(x.lo, y.lo, z.lo);
+        return out;
+    } else {
+        return map(fmaf, x,y,z);
+    }
+#endif
 }
 
+#if !defined(SKNX_NO_SIMD) && (defined(__GNUC__) || defined(__clang__))
+#if defined(__AVX2__)
+    SI Vec<4,float> fma(const Vec<4,float>& x, const Vec<4,float>& y, const Vec<4,float>& z) {
+        return to_vec<4,float>(_mm_fmadd_ps(to_vext<4,float>(x),
+                                            to_vext<4,float>(y),
+                                            to_vext<4,float>(z)));
+    }
+
+    SI Vec<8,float> fma(const Vec<8,float>& x, const Vec<8,float>& y, const Vec<8,float>& z) {
+        return to_vec<8,float>(_mm256_fmadd_ps(to_vext<8,float>(x),
+                                               to_vext<8,float>(y),
+                                               to_vext<8,float>(z)));
+    }
+#if defined(__AVX512F__)
+    SI Vec<16,float> fma(const Vec<16,float>& x, const Vec<16,float>& y, const Vec<16,float>& z) {
+        return to_vec<16,float>(_mm512_fmadd_ps(to_vext<16,float>(x),
+                                                to_vext<16,float>(y),
+                                                to_vext<16,float>(z)));
+    }
+#endif
+#elif defined(__aarch64__)
+    SI Vec<4,float> fma(const Vec<4,float>& x, const Vec<4,float>& y, const Vec<4,float>& z) {
+        // These instructions tend to work like z += xy, so the order here is z,x,y.
+        return to_vec<4,float>(vfmaq_f32(to_vext<4,float>(z),
+                                         to_vext<4,float>(x),
+                                         to_vext<4,float>(y)));
+    }
+#endif
+
+#endif // !defined(SKNX_NO_SIMD)
+
 SI Vec<1,int> lrint(const Vec<1,float>& x) {
     return (int)lrintf(x.val);
 }