summaryrefslogtreecommitdiff
path: root/chromium/third_party/libyuv/source/row_gcc.cc
diff options
context:
space:
mode:
Diffstat (limited to 'chromium/third_party/libyuv/source/row_gcc.cc')
-rw-r--r--chromium/third_party/libyuv/source/row_gcc.cc99
1 files changed, 97 insertions, 2 deletions
diff --git a/chromium/third_party/libyuv/source/row_gcc.cc b/chromium/third_party/libyuv/source/row_gcc.cc
index 86f0880be2a..b5c2e65c938 100644
--- a/chromium/third_party/libyuv/source/row_gcc.cc
+++ b/chromium/third_party/libyuv/source/row_gcc.cc
@@ -2753,6 +2753,87 @@ void MergeUVRow_SSE2(const uint8* src_u,
}
#endif // HAS_MERGEUVROW_SSE2
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 128 = 9 bits
+// 64 = 10 bits
+// 16 = 12 bits
+// 1 = 16 bits
+#ifdef HAS_MERGEUVROW_16_AVX2
+void MergeUVRow_16_AVX2(const uint16* src_u,  // First source row (U), 16-bit samples; read only.
+ const uint16* src_v,  // Second source row (V), 16-bit samples; read only.
+ uint16* dst_uv,  // Destination row; receives interleaved U,V,U,V,... samples.
+ int scale,  // Multiplier applied to every sample; only its low 16 bits are used.
+ int width) {  // Pixel count; 16 are handled per iteration and the body runs before the first check.
+ // clang-format off
+ asm volatile (
+ "vmovd %4,%%xmm3 \n"  // xmm3 low dword = scale.
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"  // Duplicate the low word so the low dword is (scale16, scale16).
+ "vbroadcastss %%xmm3,%%ymm3 \n"  // Replicate that dword: every 16-bit lane of ymm3 holds scale16.
+ "sub %0,%1 \n"  // %1 = src_v - src_u, so (%0,%1,1) addresses V while only %0 is advanced.
+
+ // 16 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"  // Load 16 U samples (32 bytes, unaligned).
+ "vmovdqu (%0,%1,1),%%ymm1 \n"  // Load the matching 16 V samples.
+ "add $0x20,%0 \n"  // Advance the shared source cursor by 32 bytes.
+
+ "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"  // U *= scale, keeping the low 16 bits of each product.
+ "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"  // V *= scale, keeping the low 16 bits of each product.
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates: interleaves the low 4 U/V words of each 128-bit lane.
+ "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"  // Interleaves the high 4 U/V words of each 128-bit lane.
+ "vextractf128 $0x0,%%ymm2,(%2) \n"  // Pixels 0-3: low lane of the low interleave.
+ "vextractf128 $0x0,%%ymm0,0x10(%2) \n"  // Pixels 4-7: low lane of the high interleave.
+ "vextractf128 $0x1,%%ymm2,0x20(%2) \n"  // Pixels 8-11: high lane of the low interleave.
+ "vextractf128 $0x1,%%ymm0,0x30(%2) \n"  // Pixels 12-15: high lane of the high interleave.
+ "add $0x40,%2 \n"  // Advance dst by 64 bytes (16 U/V pairs).
+ "sub $0x10,%3 \n"  // 16 pixels consumed.
+ "jg 1b \n"  // Loop while the remaining width is positive.
+ "vzeroupper \n"  // Clear the upper ymm halves before returning to non-VEX code.
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"(scale) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+ // clang-format on
+}
+#endif // HAS_MERGEUVROW_16_AVX2
+
+#ifdef HAS_MULTIPLYROW_16_AVX2
+void MultiplyRow_16_AVX2(const uint16* src_y,  // Source row of 16-bit samples; read only.
+ uint16* dst_y,  // Destination row; receives src_y[i] * scale (low 16 bits).
+ int scale,  // Multiplier applied to every sample; only its low 16 bits are used.
+ int width) {  // Sample count; 32 are handled per iteration and the body runs before the first check.
+ // clang-format off
+ asm volatile (
+ "vmovd %3,%%xmm3 \n"  // xmm3 low dword = scale.
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"  // Duplicate the low word so the low dword is (scale16, scale16).
+ "vbroadcastss %%xmm3,%%ymm3 \n"  // Replicate that dword: every 16-bit lane of ymm3 holds scale16.
+ "sub %0,%1 \n"  // %1 = dst_y - src_y, so (%0,%1) addresses dst while only %0 is advanced.
+
+ // 32 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"  // Load samples 0-15 (32 bytes, unaligned).
+ "vmovdqu 0x20(%0),%%ymm1 \n"  // Load samples 16-31.
+ "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"  // Multiply by scale, keeping the low 16 bits of each product.
+ "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"  // Same for the second vector.
+ "vmovdqu %%ymm0,(%0,%1) \n"  // Store samples 0-15 to dst.
+ "vmovdqu %%ymm1,0x20(%0,%1) \n"  // Store samples 16-31 to dst.
+ "add $0x40,%0 \n"  // Advance the shared cursor by 64 bytes.
+ "sub $0x20,%2 \n"  // 32 samples consumed.
+ "jg 1b \n"  // Loop while the remaining width is positive.
+ "vzeroupper \n"  // Clear the upper ymm halves before returning to non-VEX code.
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm3");
+ // clang-format on
+}
+#endif // HAS_MULTIPLYROW_16_AVX2
+
#ifdef HAS_SPLITRGBROW_SSSE3
// Shuffle table for converting RGB to Planar.
@@ -5645,6 +5726,7 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
#ifdef HAS_HALFFLOATROW_SSE2
static float kScaleBias = 1.9259299444e-34f;
void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
+ scale *= kScaleBias;
asm volatile (
"pshufd $0x0,%3,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
@@ -5671,7 +5753,11 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
- : "x"(scale * kScaleBias) // %3
+#if defined(__x86_64__)
+ : "x"(scale) // %3
+#else
+ : "m"(scale) // %3
+#endif
: "memory", "cc",
"xmm2", "xmm3", "xmm4", "xmm5"
);
@@ -5680,6 +5766,7 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
#ifdef HAS_HALFFLOATROW_AVX2
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
+ scale *= kScaleBias;
asm volatile (
"vbroadcastss %3, %%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
@@ -5707,7 +5794,11 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
- : "x"(scale * kScaleBias) // %3
+#if defined(__x86_64__)
+ : "x"(scale) // %3
+#else
+ : "m"(scale) // %3
+#endif
: "memory", "cc",
"xmm2", "xmm3", "xmm4", "xmm5"
);
@@ -5740,7 +5831,11 @@ void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
+#if defined(__x86_64__)
: "x"(scale) // %3
+#else
+ : "m"(scale) // %3
+#endif
: "memory", "cc",
"xmm2", "xmm3", "xmm4"
);