Optimize skvx::fma for GCC

Add specific specializations and make sure we reach them from the 32-entry width being used.

Change-Id: I7b6f3636a63742c40e5953661709eeb0d5c676b3
Reviewed-by: Peter Varga <pvarga@inf.u-szeged.hu>
author: Allan Sandfeld Jensen <allan.jensen@qt.io> 2022-01-17 15:53:56 +0100
committer: Allan Sandfeld Jensen <allan.jensen@qt.io> 2022-11-09 11:47:41 +0100
commit: 47169462cb21ee013c3db6bc179bd55ddbaaa42e (patch)
tree: d4f90ada21e563fd91d453a1f168f3bae562d37c
parent: c9b11379ee3e282a82a53995b64160aead8bfa94 (diff)
download: qtwebengine-chromium-47169462cb21ee013c3db6bc179bd55ddbaaa42e.tar.gz
1 files changed, 42 insertions, 0 deletions
diff --git a/chromium/third_party/skia/include/private/SkVx.h b/chromium/third_party/skia/include/private/SkVx.h
index 2fcc42ed3a3..8993d903dcf 100644
--- a/chromium/third_party/skia/include/private/SkVx.h
+++ b/chromium/third_party/skia/include/private/SkVx.h
@@ -649,11 +649,53 @@ SIN Vec<N,float>   abs(const Vec<N,float>& x) { return map( fabsf, x); }
 SIN Vec<N,float>   fma(const Vec<N,float>& x,
                        const Vec<N,float>& y,
                        const Vec<N,float>& z) {  // Generic fmaf-based fused multiply-add for any lane count N.
+#if defined(__clang__)  // Clang keeps the lambda-wrapped map() form below.
     // I don't understand why Clang's codegen is terrible if we write map(fmaf, x,y,z) directly.
     auto fn = [](float x, float y, float z) { return fmaf(x,y,z); };
     return map(fn, x,y,z);
+#else  // Every other compiler (the commit message names GCC as the target).
+    if constexpr(N > 8) {  // Wider than 8 lanes: halve and call fma() again on each half.
+        Vec<N,float> out;
+        out.hi = fma(x.hi, y.hi, z.hi);
+        out.lo = fma(x.lo, y.lo, z.lo);
+        return out;
+    } else {  // 8 lanes or fewer: hand fmaf straight to map().
+        return map(fmaf, x,y,z);
+    }
+#endif  // defined(__clang__)
 }
 
+#if !defined(SKNX_NO_SIMD) && (defined(__GNUC__) || defined(__clang__))  // Fixed-width fma() overloads built on hardware FMA intrinsics.
+#if defined(__AVX2__) && defined(__FMA__)  // The *_fmadd_ps intrinsics need the FMA feature; -mavx2 alone does not enable it.
+    SI Vec<4,float> fma(const Vec<4,float>& x, const Vec<4,float>& y, const Vec<4,float>& z) {
+        return to_vec<4,float>(_mm_fmadd_ps(to_vext<4,float>(x),
+                                            to_vext<4,float>(y),
+                                            to_vext<4,float>(z)));
+    }
+
+    SI Vec<8,float> fma(const Vec<8,float>& x, const Vec<8,float>& y, const Vec<8,float>& z) {
+        return to_vec<8,float>(_mm256_fmadd_ps(to_vext<8,float>(x),
+                                               to_vext<8,float>(y),
+                                               to_vext<8,float>(z)));
+    }
+#if defined(__AVX512F__)  // 16-lane overload, only when AVX-512F is also available.
+    SI Vec<16,float> fma(const Vec<16,float>& x, const Vec<16,float>& y, const Vec<16,float>& z) {
+        return to_vec<16,float>(_mm512_fmadd_ps(to_vext<16,float>(x),
+                                                to_vext<16,float>(y),
+                                                to_vext<16,float>(z)));
+    }
+#endif  // defined(__AVX512F__)
+#elif defined(__aarch64__)  // 64-bit Arm: 4-lane overload via vfmaq_f32.
+    SI Vec<4,float> fma(const Vec<4,float>& x, const Vec<4,float>& y, const Vec<4,float>& z) {
+        // These instructions tend to work like z += xy, so the order here is z,x,y.
+        return to_vec<4,float>(vfmaq_f32(to_vext<4,float>(z),
+                                         to_vext<4,float>(x),
+                                         to_vext<4,float>(y)));
+    }
+#endif  // x86 FMA / aarch64; with neither, only the generic template fma() applies.
+
+#endif // !defined(SKNX_NO_SIMD)
+
 SI Vec<1,int> lrint(const Vec<1,float>& x) {  // Width-1 overload: rounds x.val with lrintf().
     return (int)lrintf(x.val);  // lrintf() returns a long; it is narrowed to int here.
 }
author	Allan Sandfeld Jensen <allan.jensen@qt.io>	2022-01-17 15:53:56 +0100
committer	Allan Sandfeld Jensen <allan.jensen@qt.io>	2022-11-09 11:47:41 +0100
commit	47169462cb21ee013c3db6bc179bd55ddbaaa42e (patch)
tree	d4f90ada21e563fd91d453a1f168f3bae562d37c
parent	c9b11379ee3e282a82a53995b64160aead8bfa94 (diff)
download	qtwebengine-chromium-47169462cb21ee013c3db6bc179bd55ddbaaa42e.tar.gz