summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Allan Sandfeld Jensen <allan.jensen@qt.io> 2022-01-17 15:53:56 +0100
committer: Allan Sandfeld Jensen <allan.jensen@qt.io> 2022-11-09 11:47:41 +0100
commit: 47169462cb21ee013c3db6bc179bd55ddbaaa42e (patch)
tree: d4f90ada21e563fd91d453a1f168f3bae562d37c
parent: c9b11379ee3e282a82a53995b64160aead8bfa94 (diff)
download: qtwebengine-chromium-47169462cb21ee013c3db6bc179bd55ddbaaa42e.tar.gz
Optimize skvx::fma for GCC
Add specific specializations, and make sure they are reached from the 32-lane vector width that is actually in use.

Change-Id: I7b6f3636a63742c40e5953661709eeb0d5c676b3
Reviewed-by: Peter Varga <pvarga@inf.u-szeged.hu>
-rw-r--r-- chromium/third_party/skia/include/private/SkVx.h | 42
1 files changed, 42 insertions, 0 deletions
diff --git a/chromium/third_party/skia/include/private/SkVx.h b/chromium/third_party/skia/include/private/SkVx.h
index 2fcc42ed3a3..8993d903dcf 100644
--- a/chromium/third_party/skia/include/private/SkVx.h
+++ b/chromium/third_party/skia/include/private/SkVx.h
@@ -649,11 +649,53 @@ SIN Vec<N,float> abs(const Vec<N,float>& x) { return map( fabsf, x); }
// Generic fma() for N-lane float vectors, built on fmaf(x, y, z).
// With Clang, fmaf is handed to map() through a lambda wrapper. With other compilers, vectors
// wider than 8 lanes call fma() again on their .hi and .lo halves (per the commit message, so
// that the narrower dedicated overloads are reached); vectors of 8 lanes or fewer hand fmaf
// to map() directly.
// NOTE(review): map() is defined outside this hunk; presumably it applies the function lane
// by lane -- confirm against its definition.
SIN Vec<N,float> fma(const Vec<N,float>& x,
const Vec<N,float>& y,
const Vec<N,float>& z) {
+#if defined(__clang__)
// I don't understand why Clang's codegen is terrible if we write map(fmaf, x,y,z) directly.
auto fn = [](float x, float y, float z) { return fmaf(x,y,z); };
return map(fn, x,y,z);
+#else  // any compiler other than Clang
+ if constexpr(N > 8) {  // wide vector: handle each half with a separate fma() call
+ Vec<N,float> out;
+ out.hi = fma(x.hi, y.hi, z.hi);
+ out.lo = fma(x.lo, y.lo, z.lo);
+ return out;
+ } else {  // N <= 8: pass fmaf straight to map()
+ return map(fmaf, x,y,z);
+ }
+#endif  // defined(__clang__)
}
+#if !defined(SKNX_NO_SIMD) && (defined(__GNUC__) || defined(__clang__))
+ // Fixed-width overloads of fma() that map x*y + z onto a single hardware fused
+ // multiply-add intrinsic. They are only declared when the target enables the required
+ // instruction set; otherwise callers fall back to the generic fma() template.
+#if defined(__AVX2__) && defined(__FMA__)
+ // _mm_fmadd_ps and _mm256_fmadd_ps belong to the FMA extension, which is a separate
+ // feature from AVX2 (-mavx2 alone does not define __FMA__), so both macros are required.
+ SI Vec<4,float> fma(const Vec<4,float>& x, const Vec<4,float>& y, const Vec<4,float>& z) {
+ return to_vec<4,float>(_mm_fmadd_ps(to_vext<4,float>(x),
+ to_vext<4,float>(y),
+ to_vext<4,float>(z)));
+ }
+
+ SI Vec<8,float> fma(const Vec<8,float>& x, const Vec<8,float>& y, const Vec<8,float>& z) {
+ return to_vec<8,float>(_mm256_fmadd_ps(to_vext<8,float>(x),
+ to_vext<8,float>(y),
+ to_vext<8,float>(z)));
+ }
+#if defined(__AVX512F__)
+ // 16-lane variant; _mm512_fmadd_ps is part of AVX-512F itself.
+ SI Vec<16,float> fma(const Vec<16,float>& x, const Vec<16,float>& y, const Vec<16,float>& z) {
+ return to_vec<16,float>(_mm512_fmadd_ps(to_vext<16,float>(x),
+ to_vext<16,float>(y),
+ to_vext<16,float>(z)));
+ }
+#endif
+#elif defined(__aarch64__)
+ SI Vec<4,float> fma(const Vec<4,float>& x, const Vec<4,float>& y, const Vec<4,float>& z) {
+ // These instructions tend to work like z += xy, so the order here is z,x,y.
+ return to_vec<4,float>(vfmaq_f32(to_vext<4,float>(z),
+ to_vext<4,float>(x),
+ to_vext<4,float>(y)));
+ }
+#endif
+
+#endif // !defined(SKNX_NO_SIMD)
+
// Single-lane lrint(): rounds x.val with lrintf() and converts the result to int.
SI Vec<1,int> lrint(const Vec<1,float>& x) {
return (int)lrintf(x.val);
}