diff options
Diffstat (limited to 'chromium/third_party/libyuv/source/row_gcc.cc')
-rw-r--r-- | chromium/third_party/libyuv/source/row_gcc.cc | 99 |
1 files changed, 97 insertions, 2 deletions
diff --git a/chromium/third_party/libyuv/source/row_gcc.cc b/chromium/third_party/libyuv/source/row_gcc.cc index 86f0880be2a..b5c2e65c938 100644 --- a/chromium/third_party/libyuv/source/row_gcc.cc +++ b/chromium/third_party/libyuv/source/row_gcc.cc @@ -2753,6 +2753,87 @@ void MergeUVRow_SSE2(const uint8* src_u, } #endif // HAS_MERGEUVROW_SSE2 +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 128 = 9 bits +// 64 = 10 bits +// 16 = 12 bits +// 1 = 16 bits +#ifdef HAS_MERGEUVROW_16_AVX2 +void MergeUVRow_16_AVX2(const uint16* src_u, + const uint16* src_v, + uint16* dst_uv, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %4,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" + + // 16 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu (%0,%1,1),%%ymm1 \n" + "add $0x20,%0 \n" + + "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates + "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm2,(%2) \n" + "vextractf128 $0x0,%%ymm0,0x10(%2) \n" + "vextractf128 $0x1,%%ymm2,0x20(%2) \n" + "vextractf128 $0x1,%%ymm0,0x30(%2) \n" + "add $0x40,%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"(scale) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + // clang-format on +} +#endif // HAS_MERGEUVROW_AVX2 + +#ifdef HAS_MULTIPLYROW_16_AVX2 +void MultiplyRow_16_AVX2(const uint16* src_y, + uint16* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" + + // 16 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%0,%1) \n" + "vmovdqu %%ymm1,0x20(%0,%1) \n" + "add $0x40,%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm3"); + // clang-format on +} +#endif // HAS_MULTIPLYROW_16_AVX2 + #ifdef HAS_SPLITRGBROW_SSSE3 // Shuffle table for converting RGB to Planar. @@ -5645,6 +5726,7 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, #ifdef HAS_HALFFLOATROW_SSE2 static float kScaleBias = 1.9259299444e-34f; void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) { + scale *= kScaleBias; asm volatile ( "pshufd $0x0,%3,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" @@ -5671,7 +5753,11 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) { : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 - : "x"(scale * kScaleBias) // %3 +#if defined(__x86_64__) + : "x"(scale) // %3 +#else + : "m"(scale) // %3 +#endif : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5" ); @@ -5680,6 +5766,7 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) { #ifdef HAS_HALFFLOATROW_AVX2 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { + scale *= kScaleBias; asm volatile ( "vbroadcastss %3, %%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" @@ -5707,7 +5794,11 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 - : "x"(scale * kScaleBias) // %3 +#if defined(__x86_64__) + : "x"(scale) // %3 +#else + : "m"(scale) // %3 +#endif : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5" ); @@ -5740,7 +5831,11 @@ void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) { : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 +#if defined(__x86_64__) : "x"(scale) // %3 +#else + : "m"(scale) // %3 +#endif : "memory", "cc", "xmm2", "xmm3", "xmm4" ); |