diff options
Diffstat (limited to 'chromium/third_party/libyuv')
46 files changed, 1465 insertions, 582 deletions
diff --git a/chromium/third_party/libyuv/Android.bp b/chromium/third_party/libyuv/Android.bp new file mode 100644 index 00000000000..a3d8d834ac7 --- /dev/null +++ b/chromium/third_party/libyuv/Android.bp @@ -0,0 +1,135 @@ +cc_library { + name: "libyuv", + vendor_available: true, + vndk: { + enabled: true, + }, + + srcs: [ + "source/compare.cc", + "source/compare_common.cc", + "source/compare_gcc.cc", + "source/compare_neon.cc", + "source/compare_neon64.cc", + "source/compare_msa.cc", + "source/convert.cc", + "source/convert_argb.cc", + "source/convert_from.cc", + "source/convert_from_argb.cc", + "source/convert_to_argb.cc", + "source/convert_to_i420.cc", + "source/cpu_id.cc", + "source/planar_functions.cc", + "source/rotate.cc", + "source/rotate_any.cc", + "source/rotate_argb.cc", + "source/rotate_common.cc", + "source/rotate_dspr2.cc", + "source/rotate_gcc.cc", + "source/rotate_msa.cc", + "source/rotate_neon.cc", + "source/rotate_neon64.cc", + "source/row_any.cc", + "source/row_common.cc", + "source/row_dspr2.cc", + "source/row_gcc.cc", + "source/row_msa.cc", + "source/row_neon.cc", + "source/row_neon64.cc", + "source/scale.cc", + "source/scale_any.cc", + "source/scale_argb.cc", + "source/scale_common.cc", + "source/scale_dspr2.cc", + "source/scale_gcc.cc", + "source/scale_msa.cc", + "source/scale_neon.cc", + "source/scale_neon64.cc", + "source/video_common.cc", + + "source/convert_jpeg.cc", + "source/mjpeg_decoder.cc", + "source/mjpeg_validate.cc", + ], + + cflags: [ + "-Wall", + "-Werror", + "-Wno-unused-parameter", + "-fexceptions", + "-DHAVE_JPEG", + ], + + shared_libs: ["libjpeg"], + + export_include_dirs: ["include"], +} + +// compatibility static library until all uses of libyuv_static are replaced +// with libyuv (b/37646797) +cc_library_static { + name: "libyuv_static", + whole_static_libs: ["libyuv"], +} + +cc_test { + name: "libyuv_unittest", + static_libs: ["libyuv"], + shared_libs: ["libjpeg"], + cflags: ["-Wall", "-Werror"], + srcs: [ + 
"unit_test/unit_test.cc", + "unit_test/basictypes_test.cc", + "unit_test/color_test.cc", + "unit_test/compare_test.cc", + "unit_test/convert_test.cc", + "unit_test/cpu_test.cc", + "unit_test/cpu_thread_test.cc", + "unit_test/math_test.cc", + "unit_test/planar_test.cc", + "unit_test/rotate_argb_test.cc", + "unit_test/rotate_test.cc", + "unit_test/scale_argb_test.cc", + "unit_test/scale_test.cc", + "unit_test/video_common_test.cc", + ], +} + +cc_test { + name: "compare", + gtest: false, + srcs: [ + "util/compare.cc", + ], + static_libs: ["libyuv"], +} + +cc_test { + name: "cpuid", + gtest: false, + srcs: [ + "util/cpuid.c", + ], + static_libs: ["libyuv"], +} + +cc_test { + name: "psnr", + gtest: false, + srcs: [ + "util/psnr_main.cc", + "util/psnr.cc", + "util/ssim.cc", + ], + static_libs: ["libyuv"], +} + +cc_test { + name: "yuvconvert", + gtest: false, + srcs: [ + "util/yuvconvert.cc", + ], + static_libs: ["libyuv"], + shared_libs: ["libjpeg"], +} diff --git a/chromium/third_party/libyuv/BUILD.gn b/chromium/third_party/libyuv/BUILD.gn index 34a9975bf82..9badf08c846 100644 --- a/chromium/third_party/libyuv/BUILD.gn +++ b/chromium/third_party/libyuv/BUILD.gn @@ -158,9 +158,13 @@ static_library("libyuv_internal") { } # To enable AVX2 or other cpu optimization, pass flag here - # cflags = [ "-mavx2", "-mpopcnt", "-mavx2", "-mfma" ] if (!is_win) { - cflags = [ "-ffp-contract=fast" ] # Enable fma vectorization for NEON. + cflags = [ + # "-mpopcnt", + # "-mavx2", + # "-mfma", + "-ffp-contract=fast", # Enable fma vectorization for NEON. + ] } } if (libyuv_use_neon) { @@ -185,6 +189,7 @@ if (libyuv_use_neon) { configs -= [ "//build/config/compiler:default_optimization" ] # Enable optimize for speed (-O2) over size (-Os). + # TODO(fbarchard): Consider optimize_speed which is O3. 
configs += [ "//build/config/compiler:optimize_max" ] } diff --git a/chromium/third_party/libyuv/DEPS b/chromium/third_party/libyuv/DEPS index 852735e2c57..fdb133c7ac1 100644 --- a/chromium/third_party/libyuv/DEPS +++ b/chromium/third_party/libyuv/DEPS @@ -1,6 +1,6 @@ vars = { 'chromium_git': 'https://chromium.googlesource.com', - 'chromium_revision': '3a3410e0eb66727afa4f2557954ecfbd9b230c83', + 'chromium_revision': 'ff3b31782d552b03104a6d831c7530605b52b13f', 'swarming_revision': '5e8001d9a710121ce7a68efd0804430a34b4f9e4', # Three lines of non-changing comments so that # the commit queue can handle CLs rolling lss @@ -9,18 +9,18 @@ vars = { # Three lines of non-changing comments so that # the commit queue can handle CLs rolling catapult # and whatever else without interference from each other. - 'catapult_revision': '1a2a373481ce2376a7c2719802e1dbb6d0d83c6b', + 'catapult_revision': 'aa736cc76ee5e35215abcfb83a8c354f12d0c684', } deps = { 'src/build': - Var('chromium_git') + '/chromium/src/build' + '@' + '800cde0e9ec6dfb73e64d33f28d3858fdeea63b9', + Var('chromium_git') + '/chromium/src/build' + '@' + '156ba982d749902e3403c242e23ded87fd316494', 'src/buildtools': - Var('chromium_git') + '/chromium/buildtools.git' + '@' + 'cbc33b9c0a9d1bb913895a4319a742c504a2d541', + Var('chromium_git') + '/chromium/buildtools.git' + '@' + 'f6d165d9d842ddd29056c127a5f3a3c5d8e0d2e3', 'src/testing': - Var('chromium_git') + '/chromium/src/testing' + '@' + '8ca56a609f13dfd7808d48162338c4d8c7ce8997', + Var('chromium_git') + '/chromium/src/testing' + '@' + 'cc96d3d66b5b9613fd0fe055509cfec5eb54b19c', 'src/third_party': - Var('chromium_git') + '/chromium/src/third_party' + '@' + '5d202b9b7d090b8e92a3832a5257e8b94a44e1c9', + Var('chromium_git') + '/chromium/src/third_party' + '@' + '72c52c224cdd3c377f7caff8ffed0f5749e79549', 'src/third_party/catapult': Var('chromium_git') + '/external/github.com/catapult-project/catapult.git' + '@' + Var('catapult_revision'), 'src/third_party/colorama/src': @@ 
-32,7 +32,7 @@ deps = { 'src/third_party/yasm/source/patched-yasm': Var('chromium_git') + '/chromium/deps/yasm/patched-yasm.git' + '@' + 'b98114e18d8b9b84586b10d24353ab8616d4c5fc', 'src/tools': - Var('chromium_git') + '/chromium/src/tools' + '@' + '9069575d376649d4cfcec7333ba05315a8f7dd29', + Var('chromium_git') + '/chromium/src/tools' + '@' + 'eceb2c420b20350a2d2ba261953109280968647a', 'src/tools/gyp': Var('chromium_git') + '/external/gyp.git' + '@' + 'd61a9397e668fa9843c4aa7da9e79460fe590bfb', 'src/tools/swarming_client': @@ -50,7 +50,7 @@ deps = { deps_os = { 'android': { 'src/base': - Var('chromium_git') + '/chromium/src/base' + '@' + 'b120cdc8d849a20eb32129a7e8c02fb8491f3149', + Var('chromium_git') + '/chromium/src/base' + '@' + '9b543d487c7c38be191c6180001ff9ce186ae326', 'src/third_party/android_tools': Var('chromium_git') + '/android_tools.git' + '@' + 'aadb2fed04af8606545b0afe4e3060bc1a15fad7', 'src/third_party/ced/src': @@ -74,7 +74,7 @@ deps_os = { }, 'ios': { 'src/ios': - Var('chromium_git') + '/chromium/src/ios' + '@' + 'a4c7e589665f97e9900ee806bd8afa34b7e21d11', + Var('chromium_git') + '/chromium/src/ios' + '@' + '39c4b2fcf73f5b1e82af3b9c57267c17217d6a30', }, 'unix': { 'src/third_party/lss': diff --git a/chromium/third_party/libyuv/OWNERS b/chromium/third_party/libyuv/OWNERS index 2db52d30797..7b21adfe6c7 100644 --- a/chromium/third_party/libyuv/OWNERS +++ b/chromium/third_party/libyuv/OWNERS @@ -1,13 +1,8 @@ fbarchard@chromium.org magjed@chromium.org -torbjorng@chromium.org -per-file *.gyp=kjellander@chromium.org -per-file *.gn=kjellander@chromium.org +per-file *.gn=phoglund@chromium.org per-file .gitignore=* per-file AUTHORS=* per-file DEPS=* -per-file PRESUBMIT.py=kjellander@chromium.org -per-file gyp_libyuv.py=kjellander@chromium.org -per-file setup_links.py=* -per-file sync_chromium.py=kjellander@chromium.org +per-file PRESUBMIT.py=phoglund@chromium.org diff --git a/chromium/third_party/libyuv/README.chromium 
b/chromium/third_party/libyuv/README.chromium index 3de0df132c9..88c069734ac 100644 --- a/chromium/third_party/libyuv/README.chromium +++ b/chromium/third_party/libyuv/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1670 +Version: 1678 License: BSD License File: LICENSE diff --git a/chromium/third_party/libyuv/docs/environment_variables.md b/chromium/third_party/libyuv/docs/environment_variables.md index 5802599e9d3..9071c54de23 100644 --- a/chromium/third_party/libyuv/docs/environment_variables.md +++ b/chromium/third_party/libyuv/docs/environment_variables.md @@ -14,7 +14,7 @@ By default the cpu is detected and the most advanced form of SIMD is used. But LIBYUV_DISABLE_SSE42 LIBYUV_DISABLE_AVX LIBYUV_DISABLE_AVX2 - LIBYUV_DISABLE_AVX3 + LIBYUV_DISABLE_AVX512BW LIBYUV_DISABLE_ERMS LIBYUV_DISABLE_FMA3 LIBYUV_DISABLE_DSPR2 diff --git a/chromium/third_party/libyuv/docs/formats.md b/chromium/third_party/libyuv/docs/formats.md index cddfe027e2b..2b75d31ac75 100644 --- a/chromium/third_party/libyuv/docs/formats.md +++ b/chromium/third_party/libyuv/docs/formats.md @@ -138,3 +138,10 @@ Some are channel order agnostic (e.g. ARGBScale). Some functions are symmetric (e.g. ARGBToBGRA is the same as BGRAToARGB, so its a macro). ARGBBlend expects preattenuated ARGB. The R,G,B are premultiplied by alpha. Other functions don't care. 
+ +# RGB24 and RAW + +There are 2 RGB layouts - RGB24 (aka 24BG) and RAW + +RGB24 is B,G,R in memory +RAW is R,G,B in memory diff --git a/chromium/third_party/libyuv/docs/getting_started.md b/chromium/third_party/libyuv/docs/getting_started.md index be0b9a9d7a6..58e05f3cbcb 100644 --- a/chromium/third_party/libyuv/docs/getting_started.md +++ b/chromium/third_party/libyuv/docs/getting_started.md @@ -62,30 +62,15 @@ To get just the source (not buildable): ### Windows - call gn gen out/Release "--args=is_debug=false target_cpu=\"x86\"" - call gn gen out/Debug "--args=is_debug=true target_cpu=\"x86\"" - ninja -v -C out/Release - ninja -v -C out/Debug - - call gn gen out/Release "--args=is_debug=false target_cpu=\"x64\"" - call gn gen out/Debug "--args=is_debug=true target_cpu=\"x64\"" - ninja -v -C out/Release - ninja -v -C out/Debug - -#### Building with clang-cl - - set GYP_DEFINES=clang=1 target_arch=ia32 - call python tools\clang\scripts\update.py - - call gn gen out/Release "--args=is_debug=false is_official_build=false is_clang=true target_cpu=\"x86\"" - call gn gen out/Debug "--args=is_debug=true is_official_build=false is_clang=true target_cpu=\"x86\"" - ninja -v -C out/Release - ninja -v -C out/Debug + call gn gen out\Release "--args=is_debug=false target_cpu=\"x64\"" + call gn gen out\Debug "--args=is_debug=true target_cpu=\"x64\"" + ninja -v -C out\Release + ninja -v -C out\Debug - call gn gen out/Release "--args=is_debug=false is_official_build=false is_clang=true target_cpu=\"x64\"" - call gn gen out/Debug "--args=is_debug=true is_official_build=false is_clang=true target_cpu=\"x64\"" - ninja -v -C out/Release - ninja -v -C out/Debug + call gn gen out\Release "--args=is_debug=false target_cpu=\"x86\"" + call gn gen out\Debug "--args=is_debug=true target_cpu=\"x86\"" + ninja -v -C out\Release + ninja -v -C out\Debug ### macOS and Linux @@ -123,17 +108,17 @@ https://code.google.com/p/chromium/wiki/AndroidBuildInstructions Add to .gclient last line: 
`target_os=['android'];` -armv7 +arm64 - gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"arm\"" - gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"arm\"" + gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"arm64\"" + gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"arm64\"" ninja -v -C out/Debug libyuv_unittest ninja -v -C out/Release libyuv_unittest -arm64 +armv7 - gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"arm64\"" - gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"arm64\"" + gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"arm\"" + gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"arm\"" ninja -v -C out/Debug libyuv_unittest ninja -v -C out/Release libyuv_unittest @@ -181,7 +166,7 @@ Running test with C code: ninja -C out/Debug libyuv ninja -C out/Debug libyuv_unittest ninja -C out/Debug compare - ninja -C out/Debug convert + ninja -C out/Debug yuvconvert ninja -C out/Debug psnr ninja -C out/Debug cpuid @@ -251,16 +236,11 @@ See also https://www.ccoderun.ca/programming/2015-12-20_CrossCompiling/index.htm out\Release\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter="*" -### OSX - - out/Release/libyuv_unittest --gtest_filter="*" - -### Linux +### macOS and Linux out/Release/libyuv_unittest --gtest_filter="*" -Replace --gtest_filter="*" with specific unittest to run. May include wildcards. e.g. - +Replace --gtest_filter="*" with specific unittest to run. May include wildcards. 
out/Release/libyuv_unittest --gtest_filter=*I420ToARGB_Opt ## CPU Emulator tools @@ -275,12 +255,20 @@ Then run: ~/intelsde/sde -skx -- out/Release/libyuv_unittest --gtest_filter=**I420ToARGB_Opt +### Intel Architecture Code Analyzer + +Insert these 2 macros into assembly code to be analyzed: + IACA_ASM_START + IACA_ASM_END +Build the code as usual, then run iaca on the object file. + ~/iaca-lin64/bin/iaca.sh -reduceout -arch HSW out/Release/obj/libyuv_internal/compare_gcc.o + ## Sanitizers gn gen out/Release "--args=is_debug=false is_msan=true" ninja -v -C out/Release - Sanitizers available: asan, msan, tsan, ubsan, lsan, ubsan_vptr +Sanitizers available: asan, msan, tsan, ubsan, lsan, ubsan_vptr ### Running Dr Memory memcheck for Windows diff --git a/chromium/third_party/libyuv/include/libyuv/compare_row.h b/chromium/third_party/libyuv/include/libyuv/compare_row.h index 9316dc22bf5..2e5ebe508d1 100644 --- a/chromium/third_party/libyuv/include/libyuv/compare_row.h +++ b/chromium/third_party/libyuv/include/libyuv/compare_row.h @@ -19,7 +19,7 @@ extern "C" { #endif #if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__i386__) && !defined(__SSE2__)) + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 #endif // MemorySanitizer does not support assembly code yet. 
http://crbug.com/344505 @@ -42,13 +42,7 @@ extern "C" { #endif // clang >= 3.4 #endif // __clang__ -// clang 6 mips issue https://bugs.chromium.org/p/libyuv/issues/detail?id=715 -// broken in clang version 6.0.0 (trunk 308728) -// fixed in clang version 6.0.0 (trunk 310694) -#if defined(__clang__) -// #define DISABLE_CLANG_MSA 1 -#endif - +// The following are available for Visual C: #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) #define HAS_HASHDJB2_AVX2 @@ -59,7 +53,7 @@ extern "C" { (defined(__x86_64__) || defined(__i386__) || defined(_M_IX86)) #define HAS_HASHDJB2_SSE41 #define HAS_SUMSQUAREERROR_SSE2 -#define HAS_HAMMINGDISTANCE_X86 +#define HAS_HAMMINGDISTANCE_SSE42 #endif // The following are available for Visual C and clangcl 32 bit: @@ -69,6 +63,18 @@ extern "C" { #define HAS_SUMSQUAREERROR_AVX2 #endif +// The following are available for GCC and clangcl 64 bit: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#define HAS_HAMMINGDISTANCE_SSSE3 +#endif + +// The following are available for GCC and clangcl 64 bit: +#if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#define HAS_HAMMINGDISTANCE_AVX2 +#endif + // The following are available for Neon: #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) @@ -78,14 +84,13 @@ extern "C" { #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) #define HAS_HAMMINGDISTANCE_MSA - -#ifndef DISABLE_CLANG_MSA #define HAS_SUMSQUAREERROR_MSA #endif -#endif uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count); -uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count); +uint32 HammingDistance_SSE42(const uint8* src_a, const uint8* src_b, int count); +uint32 HammingDistance_SSSE3(const uint8* src_a, const uint8* src_b, 
int count); +uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count); uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count); uint32 HammingDistance_MSA(const uint8* src_a, const uint8* src_b, int count); diff --git a/chromium/third_party/libyuv/include/libyuv/convert_from.h b/chromium/third_party/libyuv/include/libyuv/convert_from.h index a050e4457f3..237f68f57aa 100644 --- a/chromium/third_party/libyuv/include/libyuv/convert_from.h +++ b/chromium/third_party/libyuv/include/libyuv/convert_from.h @@ -189,6 +189,30 @@ int I420ToRAW(const uint8* src_y, int height); LIBYUV_API +int H420ToRGB24(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_frame, + int dst_stride_frame, + int width, + int height); + +LIBYUV_API +int H420ToRAW(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_frame, + int dst_stride_frame, + int width, + int height); + +LIBYUV_API int I420ToRGB565(const uint8* src_y, int src_stride_y, const uint8* src_u, diff --git a/chromium/third_party/libyuv/include/libyuv/cpu_id.h b/chromium/third_party/libyuv/include/libyuv/cpu_id.h index 6d1afbdbb12..c2e9bbbd954 100644 --- a/chromium/third_party/libyuv/include/libyuv/cpu_id.h +++ b/chromium/third_party/libyuv/include/libyuv/cpu_id.h @@ -36,15 +36,19 @@ static const int kCpuHasAVX = 0x200; static const int kCpuHasAVX2 = 0x400; static const int kCpuHasERMS = 0x800; static const int kCpuHasFMA3 = 0x1000; -static const int kCpuHasAVX3 = 0x2000; -static const int kCpuHasF16C = 0x4000; - -// 0x8000 reserved for future X86 flags. 
+static const int kCpuHasF16C = 0x2000; +static const int kCpuHasGFNI = 0x4000; +static const int kCpuHasAVX512BW = 0x8000; +static const int kCpuHasAVX512VL = 0x10000; +static const int kCpuHasAVX512VBMI = 0x20000; +static const int kCpuHasAVX512VBMI2 = 0x40000; +static const int kCpuHasAVX512VBITALG = 0x80000; +static const int kCpuHasAVX512VPOPCNTDQ = 0x100000; // These flags are only valid on MIPS processors. -static const int kCpuHasMIPS = 0x10000; -static const int kCpuHasDSPR2 = 0x20000; -static const int kCpuHasMSA = 0x40000; +static const int kCpuHasMIPS = 0x200000; +static const int kCpuHasDSPR2 = 0x400000; +static const int kCpuHasMSA = 0x800000; // Optional init function. TestCpuFlag does an auto-init. // Returns cpu_info flags. diff --git a/chromium/third_party/libyuv/include/libyuv/planar_functions.h b/chromium/third_party/libyuv/include/libyuv/planar_functions.h index d97965cb88d..c91501a9c2c 100644 --- a/chromium/third_party/libyuv/include/libyuv/planar_functions.h +++ b/chromium/third_party/libyuv/include/libyuv/planar_functions.h @@ -746,7 +746,7 @@ int I420Interpolate(const uint8* src0_y, int interpolation); #if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__i386__) && !defined(__SSE2__)) + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 #endif // MemorySanitizer does not support assembly code yet. http://crbug.com/344505 diff --git a/chromium/third_party/libyuv/include/libyuv/rotate_row.h b/chromium/third_party/libyuv/include/libyuv/rotate_row.h index 2c51584eee8..973fc15284f 100644 --- a/chromium/third_party/libyuv/include/libyuv/rotate_row.h +++ b/chromium/third_party/libyuv/include/libyuv/rotate_row.h @@ -19,7 +19,7 @@ extern "C" { #endif #if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__i386__) && !defined(__SSE2__)) + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 #endif // MemorySanitizer does not support assembly code yet. 
http://crbug.com/344505 @@ -29,7 +29,7 @@ extern "C" { #endif #endif // The following are available for Visual C and clangcl 32 bit: -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) #define HAS_TRANSPOSEWX8_SSSE3 #define HAS_TRANSPOSEUVWX8_SSE2 #endif diff --git a/chromium/third_party/libyuv/include/libyuv/row.h b/chromium/third_party/libyuv/include/libyuv/row.h index 0b1e017448f..34d727641a8 100644 --- a/chromium/third_party/libyuv/include/libyuv/row.h +++ b/chromium/third_party/libyuv/include/libyuv/row.h @@ -31,7 +31,7 @@ extern "C" { var = 0 #if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__i386__) && !defined(__SSE2__)) + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 #endif // MemorySanitizer does not support assembly code yet. http://crbug.com/344505 @@ -75,13 +75,6 @@ extern "C" { #define VISUALC_HAS_AVX2 1 #endif // VisualStudio >= 2012 -// clang 6 mips issue https://bugs.chromium.org/p/libyuv/issues/detail?id=715 -// broken in clang version 6.0.0 (trunk 308728) -// fixed in clang version 6.0.0 (trunk 310694) -#if defined(__clang__) -// #define DISABLE_CLANG_MSA 1 -#endif - // The following are available on all x86 platforms: #if !defined(LIBYUV_DISABLE_X86) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) @@ -271,7 +264,7 @@ extern "C" { #define HAS_I422TOARGBROW_SSSE3 #endif -// The following are available forr gcc/clang x86 platforms: +// The following are available for gcc/clang x86 platforms: // TODO(fbarchard): Port to Visual C #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) @@ -279,6 +272,15 @@ extern "C" { #define HAS_SPLITRGBROW_SSSE3 #endif +// The following are available for AVX2 gcc/clang x86 platforms: +// TODO(fbarchard): Port to Visual C +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && 
!defined(_MSC_VER))) && \ + (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#define HAS_MERGEUVROW_16_AVX2 +#define HAS_MULTIPLYROW_16_AVX2 +#endif + // The following are available on Neon platforms: #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) @@ -406,12 +408,23 @@ extern "C" { #endif #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#define HAS_ABGRTOUVROW_MSA +#define HAS_ABGRTOYROW_MSA +#define HAS_ARGB1555TOARGBROW_MSA +#define HAS_ARGB1555TOUVROW_MSA +#define HAS_ARGB1555TOYROW_MSA #define HAS_ARGB4444TOARGBROW_MSA #define HAS_ARGBADDROW_MSA #define HAS_ARGBATTENUATEROW_MSA +#define HAS_ARGBBLENDROW_MSA +#define HAS_ARGBCOLORMATRIXROW_MSA +#define HAS_ARGBEXTRACTALPHAROW_MSA #define HAS_ARGBGRAYROW_MSA #define HAS_ARGBMIRRORROW_MSA #define HAS_ARGBMULTIPLYROW_MSA +#define HAS_ARGBQUANTIZEROW_MSA +#define HAS_ARGBSEPIAROW_MSA +#define HAS_ARGBSETROW_MSA #define HAS_ARGBSHADEROW_MSA #define HAS_ARGBSHUFFLEROW_MSA #define HAS_ARGBSUBTRACTROW_MSA @@ -419,33 +432,8 @@ extern "C" { #define HAS_ARGBTOARGB4444ROW_MSA #define HAS_ARGBTORAWROW_MSA #define HAS_ARGBTORGB24ROW_MSA -#define HAS_ARGBTORGB565ROW_MSA -#define HAS_I422TOUYVYROW_MSA -#define HAS_I422TOYUY2ROW_MSA -#define HAS_INTERPOLATEROW_MSA -#define HAS_MERGEUVROW_MSA -#define HAS_MIRRORROW_MSA -#define HAS_RAWTORGB24ROW_MSA -#define HAS_SOBELTOPLANEROW_MSA -#define HAS_UYVYTOUVROW_MSA -#define HAS_UYVYTOYROW_MSA -#define HAS_YUY2TOUV422ROW_MSA -#define HAS_YUY2TOUVROW_MSA -#define HAS_YUY2TOYROW_MSA -#define HAS_ARGBEXTRACTALPHAROW_MSA -#define HAS_SPLITUVROW_MSA -#define HAS_MIRRORUVROW_MSA -#define HAS_HALFFLOATROW_MSA - -#ifndef DISABLE_CLANG_MSA -#define HAS_ABGRTOUVROW_MSA -#define HAS_ABGRTOYROW_MSA -#define HAS_ARGB1555TOARGBROW_MSA -#define HAS_ARGB1555TOUVROW_MSA -#define HAS_ARGB1555TOYROW_MSA -#define HAS_ARGBSEPIAROW_MSA -#define HAS_ARGBSETROW_MSA #define HAS_ARGBTORGB565DITHERROW_MSA +#define HAS_ARGBTORGB565ROW_MSA 
#define HAS_ARGBTOUV444ROW_MSA #define HAS_ARGBTOUVJROW_MSA #define HAS_ARGBTOUVROW_MSA @@ -453,17 +441,25 @@ extern "C" { #define HAS_ARGBTOYROW_MSA #define HAS_BGRATOUVROW_MSA #define HAS_BGRATOYROW_MSA +#define HAS_HALFFLOATROW_MSA #define HAS_I400TOARGBROW_MSA #define HAS_I422ALPHATOARGBROW_MSA #define HAS_I422TOARGBROW_MSA #define HAS_I422TORGB24ROW_MSA #define HAS_I422TORGBAROW_MSA +#define HAS_I422TOUYVYROW_MSA +#define HAS_I422TOYUY2ROW_MSA #define HAS_I444TOARGBROW_MSA +#define HAS_INTERPOLATEROW_MSA #define HAS_J400TOARGBROW_MSA +#define HAS_MERGEUVROW_MSA +#define HAS_MIRRORROW_MSA +#define HAS_MIRRORUVROW_MSA #define HAS_NV12TOARGBROW_MSA #define HAS_NV12TORGB565ROW_MSA #define HAS_NV21TOARGBROW_MSA #define HAS_RAWTOARGBROW_MSA +#define HAS_RAWTORGB24ROW_MSA #define HAS_RAWTOUVROW_MSA #define HAS_RAWTOYROW_MSA #define HAS_RGB24TOARGBROW_MSA @@ -474,17 +470,20 @@ extern "C" { #define HAS_RGB565TOYROW_MSA #define HAS_RGBATOUVROW_MSA #define HAS_RGBATOYROW_MSA +#define HAS_SETROW_MSA #define HAS_SOBELROW_MSA +#define HAS_SOBELTOPLANEROW_MSA +#define HAS_SOBELXROW_MSA #define HAS_SOBELXYROW_MSA +#define HAS_SOBELYROW_MSA +#define HAS_SPLITUVROW_MSA #define HAS_UYVYTOARGBROW_MSA +#define HAS_UYVYTOUVROW_MSA +#define HAS_UYVYTOYROW_MSA #define HAS_YUY2TOARGBROW_MSA -#define HAS_ARGBBLENDROW_MSA -#define HAS_ARGBQUANTIZEROW_MSA -#define HAS_ARGBCOLORMATRIXROW_MSA -#define HAS_SETROW_MSA -#define HAS_SOBELXROW_MSA -#define HAS_SOBELYROW_MSA -#endif +#define HAS_YUY2TOUV422ROW_MSA +#define HAS_YUY2TOUVROW_MSA +#define HAS_YUY2TOYROW_MSA #endif #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) @@ -1523,6 +1522,23 @@ void MergeRGBRow_Any_NEON(const uint8* src_r, uint8* dst_rgb, int width); +void MergeUVRow_16_C(const uint16* src_u, + const uint16* src_v, + uint16* dst_uv, + int scale, /* 64 for 10 bit */ + int width); +void MergeUVRow_16_AVX2(const uint16* src_u, + const uint16* src_v, + uint16* dst_uv, + int scale, + int width); + +void 
MultiplyRow_16_AVX2(const uint16* src_y, + uint16* dst_y, + int scale, + int width); +void MultiplyRow_16_C(const uint16* src_y, uint16* dst_y, int scale, int width); + void CopyRow_SSE2(const uint8* src, uint8* dst, int count); void CopyRow_AVX(const uint8* src, uint8* dst, int count); void CopyRow_ERMS(const uint8* src, uint8* dst, int count); diff --git a/chromium/third_party/libyuv/include/libyuv/scale_row.h b/chromium/third_party/libyuv/include/libyuv/scale_row.h index ebafac4f679..c4a66aa07b1 100644 --- a/chromium/third_party/libyuv/include/libyuv/scale_row.h +++ b/chromium/third_party/libyuv/include/libyuv/scale_row.h @@ -20,7 +20,7 @@ extern "C" { #endif #if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__i386__) && !defined(__SSE2__)) + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 #endif // MemorySanitizer does not support assembly code yet. http://crbug.com/344505 @@ -50,13 +50,6 @@ extern "C" { #define VISUALC_HAS_AVX2 1 #endif // VisualStudio >= 2012 -// clang 6 mips issue https://bugs.chromium.org/p/libyuv/issues/detail?id=715 -// broken in clang version 6.0.0 (trunk 308728) -// fixed in clang version 6.0.0 (trunk 310694) -#if defined(__clang__) -// #define DISABLE_CLANG_MSA 1 -#endif - // The following are available on all x86 platforms: #if !defined(LIBYUV_DISABLE_X86) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) @@ -112,19 +105,16 @@ extern "C" { #endif #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) -#define HAS_SCALEARGBROWDOWN2_MSA -#define HAS_SCALEROWDOWN2_MSA -#define HAS_SCALEROWDOWN4_MSA #define HAS_SCALEADDROW_MSA #define HAS_SCALEARGBCOLS_MSA -#define HAS_SCALEROWDOWN34_MSA - -#ifndef DISABLE_CLANG_MSA +#define HAS_SCALEARGBFILTERCOLS_MSA +#define HAS_SCALEARGBROWDOWN2_MSA #define HAS_SCALEARGBROWDOWNEVEN_MSA -#define HAS_SCALEROWDOWN38_MSA #define HAS_SCALEFILTERCOLS_MSA -#define HAS_SCALEARGBFILTERCOLS_MSA -#endif +#define HAS_SCALEROWDOWN2_MSA 
+#define HAS_SCALEROWDOWN34_MSA +#define HAS_SCALEROWDOWN38_MSA +#define HAS_SCALEROWDOWN4_MSA #endif // Scale ARGB vertically with bilinear interpolation. diff --git a/chromium/third_party/libyuv/include/libyuv/version.h b/chromium/third_party/libyuv/include/libyuv/version.h index f27ccfbb0c8..838c70f1349 100644 --- a/chromium/third_party/libyuv/include/libyuv/version.h +++ b/chromium/third_party/libyuv/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1670 +#define LIBYUV_VERSION 1678 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/chromium/third_party/libyuv/infra/config/OWNERS b/chromium/third_party/libyuv/infra/config/OWNERS index 02eccd5eb50..b61b29d6c25 100644 --- a/chromium/third_party/libyuv/infra/config/OWNERS +++ b/chromium/third_party/libyuv/infra/config/OWNERS @@ -1,3 +1,3 @@ set noparent agable@chromium.org -kjellander@chromium.org +phoglund@chromium.org diff --git a/chromium/third_party/libyuv/infra/config/cq.cfg b/chromium/third_party/libyuv/infra/config/cq.cfg index 345723e17bc..604de7814d9 100644 --- a/chromium/third_party/libyuv/infra/config/cq.cfg +++ b/chromium/third_party/libyuv/infra/config/cq.cfg @@ -7,16 +7,8 @@ cq_status_url: "https://chromium-cq-status.appspot.com" git_repo_url: "https://chromium.googlesource.com/libyuv/libyuv.git" gerrit {} -rietveld { - url: "https://codereview.chromium.org" -} - verifiers { - reviewer_lgtm { - committer_list: "project-libyuv-committers" - dry_run_access_list: "project-libyuv-tryjob-access" - } gerrit_cq_ability { committer_list: "project-libyuv-committers" dry_run_access_list: "project-libyuv-tryjob-access" diff --git a/chromium/third_party/libyuv/linux.mk b/chromium/third_party/libyuv/linux.mk index 1dd527c7570..7e9aa5e4e8b 100644 --- a/chromium/third_party/libyuv/linux.mk +++ b/chromium/third_party/libyuv/linux.mk @@ -80,4 +80,4 @@ cpuid: util/cpuid.c libyuv.a $(CC) $(CFLAGS) -o $@ util/cpuid.c libyuv.a clean: - 
/bin/rm -f source/*.o *.ii *.s libyuv.a convert cpuid psnr + /bin/rm -f source/*.o *.ii *.s libyuv.a yuvconvert cpuid psnr diff --git a/chromium/third_party/libyuv/source/compare.cc b/chromium/third_party/libyuv/source/compare.cc index 20afa0cef36..8c379b59cb8 100644 --- a/chromium/third_party/libyuv/source/compare.cc +++ b/chromium/third_party/libyuv/source/compare.cc @@ -110,12 +110,17 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) { return fourcc; } +// NEON version accumulates in 16 bit shorts which overflow at 65536 bytes. +// So actual maximum is 1 less loop, which is 64436 - 32 bytes. + LIBYUV_API uint64 ComputeHammingDistance(const uint8* src_a, const uint8* src_b, int count) { - const int kBlockSize = 65536; - int remainder = count & (kBlockSize - 1) & ~31; + const int kBlockSize = 1 << 15; // 32768; + const int kSimdSize = 64; + // SIMD for multiple of 64, and C for remainder + int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1); uint64 diff = 0; int i; uint32 (*HammingDistance)(const uint8* src_a, const uint8* src_b, int count) = @@ -125,9 +130,14 @@ uint64 ComputeHammingDistance(const uint8* src_a, HammingDistance = HammingDistance_NEON; } #endif -#if defined(HAS_HAMMINGDISTANCE_X86) - if (TestCpuFlag(kCpuHasX86)) { - HammingDistance = HammingDistance_X86; +#if defined(HAS_HAMMINGDISTANCE_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + HammingDistance = HammingDistance_SSSE3; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_SSE42) + if (TestCpuFlag(kCpuHasSSE42)) { + HammingDistance = HammingDistance_SSE42; } #endif #if defined(HAS_HAMMINGDISTANCE_AVX2) @@ -153,7 +163,7 @@ uint64 ComputeHammingDistance(const uint8* src_a, src_a += remainder; src_b += remainder; } - remainder = count & 31; + remainder = count & (kSimdSize - 1); if (remainder) { diff += HammingDistance_C(src_a, src_b, remainder); } diff --git a/chromium/third_party/libyuv/source/compare_common.cc b/chromium/third_party/libyuv/source/compare_common.cc index 
d3e46fb502b..83564a1bcb5 100644 --- a/chromium/third_party/libyuv/source/compare_common.cc +++ b/chromium/third_party/libyuv/source/compare_common.cc @@ -18,7 +18,7 @@ extern "C" { #endif #if ORIGINAL_OPT -uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count) { +uint32 HammingDistance_C1(const uint8* src_a, const uint8* src_b, int count) { uint32 diff = 0u; int i; @@ -58,6 +58,16 @@ uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count) { src_a += 4; src_b += 4; } + + for (; i < count; ++i) { + uint32 x = *src_a ^ *src_b; + uint32 u = x - ((x >> 1) & 0x55); + u = ((u >> 2) & 0x33) + (u & 0x33); + diff += (u + (u >> 4)) & 0x0f; + src_a += 1; + src_b += 1; + } + return diff; } diff --git a/chromium/third_party/libyuv/source/compare_gcc.cc b/chromium/third_party/libyuv/source/compare_gcc.cc index 994fb10fd59..595c8ec4ae2 100644 --- a/chromium/third_party/libyuv/source/compare_gcc.cc +++ b/chromium/third_party/libyuv/source/compare_gcc.cc @@ -22,18 +22,210 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) -uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) { +#if defined(__x86_64__) +uint32 HammingDistance_SSE42(const uint8* src_a, + const uint8* src_b, + int count) { + uint64 diff = 0u; + + asm volatile( + "xor %3,%3 \n" + "xor %%r8,%%r8 \n" + "xor %%r9,%%r9 \n" + "xor %%r10,%%r10 \n" + + // Process 32 bytes per loop. 
+ LABELALIGN + "1: \n" + "mov (%0),%%rcx \n" + "mov 0x8(%0),%%rdx \n" + "xor (%1),%%rcx \n" + "xor 0x8(%1),%%rdx \n" + "popcnt %%rcx,%%rcx \n" + "popcnt %%rdx,%%rdx \n" + "mov 0x10(%0),%%rsi \n" + "mov 0x18(%0),%%rdi \n" + "xor 0x10(%1),%%rsi \n" + "xor 0x18(%1),%%rdi \n" + "popcnt %%rsi,%%rsi \n" + "popcnt %%rdi,%%rdi \n" + "add $0x20,%0 \n" + "add $0x20,%1 \n" + "add %%rcx,%3 \n" + "add %%rdx,%%r8 \n" + "add %%rsi,%%r9 \n" + "add %%rdi,%%r10 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + "add %%r8, %3 \n" + "add %%r9, %3 \n" + "add %%r10, %3 \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : + : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10"); + + return static_cast<uint32>(diff); +} +#else +uint32 HammingDistance_SSE42(const uint8* src_a, + const uint8* src_b, + int count) { + uint32 diff = 0u; + + asm volatile( + // Process 16 bytes per loop. + LABELALIGN + "1: \n" + "mov (%0),%%ecx \n" + "mov 0x4(%0),%%edx \n" + "xor (%1),%%ecx \n" + "xor 0x4(%1),%%edx \n" + "popcnt %%ecx,%%ecx \n" + "add %%ecx,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "mov 0x8(%0),%%ecx \n" + "mov 0xc(%0),%%edx \n" + "xor 0x8(%1),%%ecx \n" + "xor 0xc(%1),%%edx \n" + "popcnt %%ecx,%%ecx \n" + "add %%ecx,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "add $0x10,%0 \n" + "add $0x10,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "+r"(diff) // %3 + : + : "memory", "cc", "ecx", "edx"); + + return diff; +} +#endif + +static vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15}; +static vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; + +uint32 HammingDistance_SSSE3(const uint8* src_a, + const uint8* src_b, + int count) { + uint32 diff = 0u; + + asm volatile( + "movdqa %4,%%xmm2 \n" + "movdqa %5,%%xmm3 \n" + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqa (%0),%%xmm4 
\n" + "movdqa 0x10(%0), %%xmm5 \n" + "pxor (%0,%1), %%xmm4 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pand %%xmm2,%%xmm6 \n" + "psrlw $0x4,%%xmm4 \n" + "movdqa %%xmm3,%%xmm7 \n" + "pshufb %%xmm6,%%xmm7 \n" + "pand %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "paddb %%xmm7,%%xmm6 \n" + "pxor 0x10(%0,%1),%%xmm5 \n" + "add $0x20,%0 \n" + "movdqa %%xmm5,%%xmm4 \n" + "pand %%xmm2,%%xmm5 \n" + "psrlw $0x4,%%xmm4 \n" + "movdqa %%xmm3,%%xmm7 \n" + "pshufb %%xmm5,%%xmm7 \n" + "pand %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufb %%xmm4,%%xmm5 \n" + "paddb %%xmm7,%%xmm5 \n" + "paddb %%xmm5,%%xmm6 \n" + "psadbw %%xmm1,%%xmm6 \n" + "paddd %%xmm6,%%xmm0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + "pshufd $0xaa,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0, %3 \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : "m"(kNibbleMask), // %4 + "m"(kBitCount) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); + + return diff; +} + +#ifdef HAS_HAMMINGDISTANCE_AVX2 +uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count) { uint32 diff = 0u; - int i; - for (i = 0; i < count - 7; i += 8) { - uint64 x = *((uint64*)src_a) ^ *((uint64*)src_b); - src_a += 8; - src_b += 8; - diff += __builtin_popcountll(x); - } + asm volatile( + "vbroadcastf128 %4,%%ymm2 \n" + "vbroadcastf128 %5,%%ymm3 \n" + "vpxor %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm1,%%ymm1,%%ymm1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "vmovdqa (%0),%%ymm4 \n" + "vmovdqa 0x20(%0), %%ymm5 \n" + "vpxor (%0,%1), %%ymm4, %%ymm4 \n" + "vpand %%ymm2,%%ymm4,%%ymm6 \n" + "vpsrlw $0x4,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm6 \n" + "vpand %%ymm2,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" + "vpaddb %%ymm4,%%ymm6,%%ymm6 \n" + "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n" + "add $0x40,%0 \n" + "vpand %%ymm2,%%ymm4,%%ymm5 \n" + "vpsrlw $0x4,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm5 
\n" + "vpand %%ymm2,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" + "vpaddb %%ymm5,%%ymm4,%%ymm4 \n" + "vpaddb %%ymm6,%%ymm4,%%ymm4 \n" + "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n" + "vpaddd %%ymm0,%%ymm4,%%ymm0 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + + "vpermq $0xb1,%%ymm0,%%ymm1 \n" + "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xaa,%%ymm0,%%ymm1 \n" + "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" + "vmovd %%xmm0, %3 \n" + "vzeroupper \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : "m"(kNibbleMask), // %4 + "m"(kBitCount) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + return diff; } +#endif // HAS_HAMMINGDISTANCE_AVX2 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { uint32 sse; diff --git a/chromium/third_party/libyuv/source/compare_msa.cc b/chromium/third_party/libyuv/source/compare_msa.cc index d5db4452a71..57857cf5127 100644 --- a/chromium/third_party/libyuv/source/compare_msa.cc +++ b/chromium/third_party/libyuv/source/compare_msa.cc @@ -47,7 +47,6 @@ uint32 HammingDistance_MSA(const uint8* src_a, const uint8* src_b, int count) { return diff; } -#ifndef DISABLE_CLANG_MSA uint32 SumSquareError_MSA(const uint8* src_a, const uint8* src_b, int count) { uint32 sse = 0u; int i; @@ -85,7 +84,6 @@ uint32 SumSquareError_MSA(const uint8* src_a, const uint8* src_b, int count) { sse += (uint32)__msa_copy_u_w((v4i32)tmp0, 2); return sse; } -#endif #ifdef __cplusplus } // extern "C" diff --git a/chromium/third_party/libyuv/source/compare_win.cc b/chromium/third_party/libyuv/source/compare_win.cc index 5ca59178dbc..bcd6a88ebbb 100644 --- a/chromium/third_party/libyuv/source/compare_win.cc +++ b/chromium/third_party/libyuv/source/compare_win.cc @@ -23,9 +23,11 @@ extern "C" { #endif // This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) -uint32 
HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) { +uint32 HammingDistance_SSE42(const uint8* src_a, + const uint8* src_b, + int count) { uint32 diff = 0u; int i; diff --git a/chromium/third_party/libyuv/source/convert_from.cc b/chromium/third_party/libyuv/source/convert_from.cc index d623731d9ff..0f52f9ef9e0 100644 --- a/chromium/third_party/libyuv/source/convert_from.cc +++ b/chromium/third_party/libyuv/source/convert_from.cc @@ -657,6 +657,42 @@ int I420ToRAW(const uint8* src_y, width, height); } +// Convert H420 to RGB24. +LIBYUV_API +int H420ToRGB24(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvH709Constants, width, height); +} + +// Convert H420 to RAW. +LIBYUV_API +int H420ToRAW(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + // Convert I420 to ARGB1555. 
LIBYUV_API int I420ToARGB1555(const uint8* src_y, @@ -1075,8 +1111,8 @@ int I420ToRGB565Dither(const uint8* src_y, for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width); ARGBToRGB565DitherRow(row_argb, dst_rgb565, - *(uint32*)(dither4x4 + ((y & 3) << 2)), - width); // NOLINT + *(uint32*)(dither4x4 + ((y & 3) << 2)), // NOLINT + width); // NOLINT dst_rgb565 += dst_stride_rgb565; src_y += src_stride_y; if (y & 1) { diff --git a/chromium/third_party/libyuv/source/cpu_id.cc b/chromium/third_party/libyuv/source/cpu_id.cc index 12e03345b9f..344f3c06a2b 100644 --- a/chromium/third_party/libyuv/source/cpu_id.cc +++ b/chromium/third_party/libyuv/source/cpu_id.cc @@ -124,7 +124,7 @@ void CpuId(int eax, int ecx, int* cpu_info) { int GetXCR0() { int xcr0 = 0; #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) - xcr0 = (int) _xgetbv(0); // VS2010 SP1 required. NOLINT + xcr0 = (int)_xgetbv(0); // VS2010 SP1 required. NOLINT #elif defined(__i386__) || defined(__x86_64__) asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx"); #endif // defined(__i386__) || defined(__x86_64__) @@ -242,10 +242,17 @@ static SAFEBUFFERS int GetCpuFlags(void) { // Detect AVX512bw if ((GetXCR0() & 0xe0) == 0xe0) { - cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0; + cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0; + cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0; + cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0; + cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0; + cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0; + cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0; + cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0; } } + // TODO(fbarchard): Consider moving these to gtest // Environment variable overrides for testing. 
if (TestEnv("LIBYUV_DISABLE_X86")) { cpu_info &= ~kCpuHasX86; @@ -274,12 +281,12 @@ static SAFEBUFFERS int GetCpuFlags(void) { if (TestEnv("LIBYUV_DISABLE_FMA3")) { cpu_info &= ~kCpuHasFMA3; } - if (TestEnv("LIBYUV_DISABLE_AVX3")) { - cpu_info &= ~kCpuHasAVX3; - } if (TestEnv("LIBYUV_DISABLE_F16C")) { cpu_info &= ~kCpuHasF16C; } + if (TestEnv("LIBYUV_DISABLE_AVX512BW")) { + cpu_info &= ~kCpuHasAVX512BW; + } #endif #if defined(__mips__) && defined(__linux__) diff --git a/chromium/third_party/libyuv/source/mjpeg_validate.cc b/chromium/third_party/libyuv/source/mjpeg_validate.cc index 1a17dd7216b..bd760425359 100644 --- a/chromium/third_party/libyuv/source/mjpeg_validate.cc +++ b/chromium/third_party/libyuv/source/mjpeg_validate.cc @@ -24,7 +24,7 @@ static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) { const uint8* it = sample; while (it < end) { // TODO(fbarchard): scan for 0xd9 instead. - it = static_cast<const uint8*>(memchr(it, 0xff, end - it)); + it = (const uint8*)(memchr(it, 0xff, end - it)); if (it == NULL) { break; } diff --git a/chromium/third_party/libyuv/source/rotate_win.cc b/chromium/third_party/libyuv/source/rotate_win.cc index ee523a0b913..fb052f65212 100644 --- a/chromium/third_party/libyuv/source/rotate_win.cc +++ b/chromium/third_party/libyuv/source/rotate_win.cc @@ -17,7 +17,7 @@ extern "C" { #endif // This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) __declspec(naked) void TransposeWx8_SSSE3(const uint8* src, int src_stride, diff --git a/chromium/third_party/libyuv/source/row_common.cc b/chromium/third_party/libyuv/source/row_common.cc index 2d01a789b6a..6ffc4febbf6 100644 --- a/chromium/third_party/libyuv/source/row_common.cc +++ b/chromium/third_party/libyuv/source/row_common.cc @@ -1798,6 +1798,35 @@ void MergeRGBRow_C(const uint8* src_r, } } +void MergeUVRow_16_C(const uint16* src_u, + const uint16* 
src_v, + uint16* dst_uv, + int scale, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_uv[0] = src_u[x] * scale; + dst_uv[1] = src_v[x] * scale; + dst_uv[2] = src_u[x + 1] * scale; + dst_uv[3] = src_v[x + 1] * scale; + dst_uv += 4; + } + if (width & 1) { + dst_uv[0] = src_u[width - 1] * scale; + dst_uv[1] = src_v[width - 1] * scale; + } +} + +void MultiplyRow_16_C(const uint16* src_y, + uint16* dst_y, + int scale, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_y[x] = src_y[x] * scale; + } +} + void CopyRow_C(const uint8* src, uint8* dst, int count) { memcpy(dst, src, count); } diff --git a/chromium/third_party/libyuv/source/row_gcc.cc b/chromium/third_party/libyuv/source/row_gcc.cc index 86f0880be2a..b5c2e65c938 100644 --- a/chromium/third_party/libyuv/source/row_gcc.cc +++ b/chromium/third_party/libyuv/source/row_gcc.cc @@ -2753,6 +2753,87 @@ void MergeUVRow_SSE2(const uint8* src_u, } #endif // HAS_MERGEUVROW_SSE2 +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 128 = 9 bits +// 64 = 10 bits +// 16 = 12 bits +// 1 = 16 bits +#ifdef HAS_MERGEUVROW_16_AVX2 +void MergeUVRow_16_AVX2(const uint16* src_u, + const uint16* src_v, + uint16* dst_uv, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %4,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" + + // 16 pixels per loop. 
+ LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu (%0,%1,1),%%ymm1 \n" + "add $0x20,%0 \n" + + "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates + "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm2,(%2) \n" + "vextractf128 $0x0,%%ymm0,0x10(%2) \n" + "vextractf128 $0x1,%%ymm2,0x20(%2) \n" + "vextractf128 $0x1,%%ymm0,0x30(%2) \n" + "add $0x40,%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"(scale) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + // clang-format on +} +#endif // HAS_MERGEUVROW_16_AVX2 + +#ifdef HAS_MULTIPLYROW_16_AVX2 +void MultiplyRow_16_AVX2(const uint16* src_y, + uint16* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" + + // 32 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%0,%1) \n" + "vmovdqu %%ymm1,0x20(%0,%1) \n" + "add $0x40,%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm3"); + // clang-format on +} +#endif // HAS_MULTIPLYROW_16_AVX2 + #ifdef HAS_SPLITRGBROW_SSSE3 // Shuffle table for converting RGB to Planar. 
@@ -5645,6 +5726,7 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, #ifdef HAS_HALFFLOATROW_SSE2 static float kScaleBias = 1.9259299444e-34f; void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) { + scale *= kScaleBias; asm volatile ( "pshufd $0x0,%3,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" @@ -5671,7 +5753,11 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) { : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 - : "x"(scale * kScaleBias) // %3 +#if defined(__x86_64__) + : "x"(scale) // %3 +#else + : "m"(scale) // %3 +#endif : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5" ); @@ -5680,6 +5766,7 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) { #ifdef HAS_HALFFLOATROW_AVX2 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { + scale *= kScaleBias; asm volatile ( "vbroadcastss %3, %%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" @@ -5707,7 +5794,11 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 - : "x"(scale * kScaleBias) // %3 +#if defined(__x86_64__) + : "x"(scale) // %3 +#else + : "m"(scale) // %3 +#endif : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5" ); @@ -5740,7 +5831,11 @@ void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) { : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 +#if defined(__x86_64__) : "x"(scale) // %3 +#else + : "m"(scale) // %3 +#endif : "memory", "cc", "xmm2", "xmm3", "xmm4" ); diff --git a/chromium/third_party/libyuv/source/row_msa.cc b/chromium/third_party/libyuv/source/row_msa.cc index 89fc248f880..5cc23450a52 100644 --- a/chromium/third_party/libyuv/source/row_msa.cc +++ b/chromium/third_party/libyuv/source/row_msa.cc @@ -16,12 +16,6 @@ #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) #include "libyuv/macros_msa.h" -// caveat - as of clang 6, some functions do not build under clang. 
-// The macro DISABLE_CLANG_MSA is set for clang builds to disable -// affected functions. -// __msa_fill_w() is one affected intrinsic. -// See Also: https://bugs.chromium.org/p/libyuv/issues/detail?id=715 - #ifdef __cplusplus namespace libyuv { extern "C" { @@ -373,7 +367,6 @@ void I422ToUYVYRow_MSA(const uint8* src_y, } } -#ifndef DISABLE_CLANG_MSA void I422ToARGBRow_MSA(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -644,7 +637,6 @@ void I422ToARGB1555Row_MSA(const uint8* src_y, dst_argb1555 += 16; } } -#endif void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) { int x; @@ -776,8 +768,6 @@ void UYVYToUV422Row_MSA(const uint8* src_uyvy, } } -#ifndef DISABLE_CLANG_MSA - void ARGBToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; @@ -941,7 +931,6 @@ void ARGBToUVRow_MSA(const uint8* src_argb0, dst_v += 16; } } -#endif void ARGBToRGB24Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) { int x; @@ -1088,7 +1077,6 @@ void ARGBToARGB4444Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) { } } -#ifndef DISABLE_CLANG_MSA void ARGBToUV444Row_MSA(const uint8* src_argb, uint8* dst_u, uint8* dst_v, @@ -1160,7 +1148,6 @@ void ARGBToUV444Row_MSA(const uint8* src_argb, dst_v += 16; } } -#endif void ARGBMultiplyRow_MSA(const uint8* src_argb0, const uint8* src_argb1, @@ -1308,7 +1295,6 @@ void ARGBAttenuateRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) { } } -#ifndef DISABLE_CLANG_MSA void ARGBToRGB565DitherRow_MSA(const uint8* src_argb, uint8* dst_rgb, uint32 dither4, @@ -1352,7 +1338,6 @@ void ARGBToRGB565DitherRow_MSA(const uint8* src_argb, dst_rgb += 16; } } -#endif void ARGBShuffleRow_MSA(const uint8* src_argb, uint8* dst_argb, @@ -1442,7 +1427,6 @@ void ARGBGrayRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) { } } -#ifndef DISABLE_CLANG_MSA void ARGBSepiaRow_MSA(uint8* dst_argb, int width) { int x; v16u8 src0, src1, dst0, dst1, 
vec0, vec1, vec2, vec3, vec4, vec5; @@ -1483,7 +1467,6 @@ void ARGBSepiaRow_MSA(uint8* dst_argb, int width) { dst_argb += 32; } } -#endif void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444, uint8* dst_argb, @@ -1514,7 +1497,6 @@ void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444, } } -#ifndef DISABLE_CLANG_MSA void ARGB1555ToARGBRow_MSA(const uint8* src_argb1555, uint8* dst_argb, int width) { @@ -2372,7 +2354,6 @@ void SobelRow_MSA(const uint8* src_sobelx, dst_argb += 64; } } -#endif void SobelToPlaneRow_MSA(const uint8* src_sobelx, const uint8* src_sobely, @@ -2395,7 +2376,6 @@ void SobelToPlaneRow_MSA(const uint8* src_sobelx, } } -#ifndef DISABLE_CLANG_MSA void SobelXYRow_MSA(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, @@ -2870,7 +2850,6 @@ void UYVYToARGBRow_MSA(const uint8* src_uyvy, rgb_buf += 32; } } -#endif void InterpolateRow_MSA(uint8* dst_ptr, const uint8* src_ptr, @@ -2936,7 +2915,6 @@ void InterpolateRow_MSA(uint8* dst_ptr, } } -#ifndef DISABLE_CLANG_MSA void ARGBSetRow_MSA(uint8* dst_argb, uint32 v32, int width) { int x; v4i32 dst0 = __builtin_msa_fill_w(v32); @@ -2946,7 +2924,6 @@ void ARGBSetRow_MSA(uint8* dst_argb, uint32 v32, int width) { dst_argb += 16; } } -#endif void RAWToRGB24Row_MSA(const uint8* src_raw, uint8* dst_rgb24, int width) { int x; @@ -3010,7 +2987,6 @@ void ARGBExtractAlphaRow_MSA(const uint8* src_argb, uint8* dst_a, int width) { } } -#ifndef DISABLE_CLANG_MSA void ARGBBlendRow_MSA(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, @@ -3290,7 +3266,6 @@ void ARGBColorMatrixRow_MSA(const uint8* src_argb, dst_argb += 32; } } -#endif void SplitUVRow_MSA(const uint8* src_uv, uint8* dst_u, @@ -3316,7 +3291,6 @@ void SplitUVRow_MSA(const uint8* src_uv, } } -#ifndef DISABLE_CLANG_MSA void SetRow_MSA(uint8* dst, uint8 v8, int width) { int x; v16u8 dst0 = (v16u8)__msa_fill_b(v8); @@ -3326,7 +3300,6 @@ void SetRow_MSA(uint8* dst, uint8 v8, int width) { dst += 16; } } -#endif void MirrorUVRow_MSA(const 
uint8* src_uv, uint8* dst_u, @@ -3357,7 +3330,6 @@ void MirrorUVRow_MSA(const uint8* src_uv, } } -#ifndef DISABLE_CLANG_MSA void SobelXRow_MSA(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, @@ -3456,7 +3428,6 @@ void SobelYRow_MSA(const uint8* src_y0, dst_sobely += 16; } } -#endif void HalfFloatRow_MSA(const uint16* src, uint16* dst, float scale, int width) { int i; diff --git a/chromium/third_party/libyuv/source/row_neon64.cc b/chromium/third_party/libyuv/source/row_neon64.cc index de17f8b7314..5616d8a5b5f 100644 --- a/chromium/third_party/libyuv/source/row_neon64.cc +++ b/chromium/third_party/libyuv/source/row_neon64.cc @@ -628,19 +628,19 @@ void MergeRGBRow_NEON(const uint8* src_r, ); } -// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. +// Copy multiple of 32. void CopyRow_NEON(const uint8* src, uint8* dst, int count) { asm volatile( "1: \n" - "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 + "ldp q0, q1, [%0], #32 \n" "subs %w2, %w2, #32 \n" // 32 processed per loop - "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 + "stp q0, q1, [%1], #32 \n" "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(count) // %2 // Output registers - : // Input registers - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List ); } @@ -2646,11 +2646,11 @@ float ScaleMaxSamples_NEON(const float* src, "b.gt 1b \n" "fmax v5.4s, v5.4s, v6.4s \n" // max "fmaxv %s3, v5.4s \n" // signed max acculator - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width), // %2 - "=w"(fmax) // %3 - : "w"(scale) // %4 + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width), // %2 + "=w"(fmax) // %3 + : "w"(scale) // %4 : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); return fmax; } @@ -2676,11 +2676,11 @@ float ScaleSumSamples_NEON(const float* src, "faddp v5.4s, v5.4s, v6.4s \n" "faddp 
v5.4s, v5.4s, v5.4s \n" "faddp %3.4s, v5.4s, v5.4s \n" // sum - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width), // %2 - "=w"(fsum) // %3 - : "w"(scale) // %4 + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width), // %2 + "=w"(fsum) // %3 + : "w"(scale) // %4 : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); return fsum; } diff --git a/chromium/third_party/libyuv/source/scale.cc b/chromium/third_party/libyuv/source/scale.cc index 13e242ba030..9104acb95fc 100644 --- a/chromium/third_party/libyuv/source/scale.cc +++ b/chromium/third_party/libyuv/source/scale.cc @@ -822,11 +822,12 @@ static void ScaleAddCols2_16_C(int dst_width, static void ScaleAddCols0_C(int dst_width, int boxheight, int x, - int, + int dx, const uint16* src_ptr, uint8* dst_ptr) { int scaleval = 65536 / boxheight; int i; + (void)dx; src_ptr += (x >> 16); for (i = 0; i < dst_width; ++i) { *dst_ptr++ = src_ptr[i] * scaleval >> 16; @@ -1699,7 +1700,7 @@ void ScalePlane_16(const uint16* src, CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height); return; } - if (dst_width == src_width) { + if (dst_width == src_width && filtering != kFilterBox) { int dy = FixedDiv(src_height, dst_height); // Arbitrary scale vertically, but unscaled vertically. 
ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride, @@ -1728,7 +1729,7 @@ void ScalePlane_16(const uint16* src, return; } if (4 * dst_width == src_width && 4 * dst_height == src_height && - filtering != kFilterBilinear) { + (filtering == kFilterBox || filtering == kFilterNone)) { // optimized, 1/4 ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); diff --git a/chromium/third_party/libyuv/source/scale_msa.cc b/chromium/third_party/libyuv/source/scale_msa.cc index c246e5e8ec9..df1f482be6d 100644 --- a/chromium/third_party/libyuv/source/scale_msa.cc +++ b/chromium/third_party/libyuv/source/scale_msa.cc @@ -127,7 +127,6 @@ void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, } } -#ifndef DISABLE_CLANG_MSA void ScaleARGBRowDownEvenBox_MSA(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, @@ -182,7 +181,6 @@ void ScaleARGBRowDownEvenBox_MSA(const uint8* src_argb, dst_argb += 16; } } -#endif void ScaleRowDown2_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, @@ -385,7 +383,6 @@ void ScaleRowDown38_MSA(const uint8_t* src_ptr, } } -#ifndef DISABLE_CLANG_MSA void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -535,7 +532,6 @@ void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr, dst_ptr += 12; } } -#endif void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { int x; @@ -557,7 +553,6 @@ void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { } } -#ifndef DISABLE_CLANG_MSA void ScaleFilterCols_MSA(uint8* dst_ptr, const uint8* src_ptr, int dst_width, @@ -634,7 +629,6 @@ void ScaleFilterCols_MSA(uint8* dst_ptr, dst_ptr += 16; } } -#endif void ScaleARGBCols_MSA(uint8* dst_argb, const uint8* src_argb, @@ -663,7 +657,6 @@ void ScaleARGBCols_MSA(uint8* dst_argb, } } -#ifndef DISABLE_CLANG_MSA void ScaleARGBFilterCols_MSA(uint8* dst_argb, const uint8* src_argb, int dst_width, @@ -728,7 +721,6 @@ void 
ScaleARGBFilterCols_MSA(uint8* dst_argb, dst_argb += 32; } } -#endif void ScaleRowDown34_MSA(const uint8* src_ptr, ptrdiff_t src_stride, diff --git a/chromium/third_party/libyuv/source/scale_win.cc b/chromium/third_party/libyuv/source/scale_win.cc index 3e93312832a..b5fd6638262 100644 --- a/chromium/third_party/libyuv/source/scale_win.cc +++ b/chromium/third_party/libyuv/source/scale_win.cc @@ -17,7 +17,7 @@ extern "C" { #endif // This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) // Offsets for source bytes 0 to 9 static uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, diff --git a/chromium/third_party/libyuv/tools_libyuv/OWNERS b/chromium/third_party/libyuv/tools_libyuv/OWNERS index aca046d45e3..2cb971d2b72 100644 --- a/chromium/third_party/libyuv/tools_libyuv/OWNERS +++ b/chromium/third_party/libyuv/tools_libyuv/OWNERS @@ -1 +1 @@ -kjellander@chromium.org +phoglund@chromium.org diff --git a/chromium/third_party/libyuv/tools_libyuv/msan/OWNERS b/chromium/third_party/libyuv/tools_libyuv/msan/OWNERS index 60351e7ea2a..0a919805c2c 100644 --- a/chromium/third_party/libyuv/tools_libyuv/msan/OWNERS +++ b/chromium/third_party/libyuv/tools_libyuv/msan/OWNERS @@ -1,3 +1,3 @@ pbos@chromium.org -kjellander@chromium.org +phoglund@chromium.org diff --git a/chromium/third_party/libyuv/tools_libyuv/ubsan/OWNERS b/chromium/third_party/libyuv/tools_libyuv/ubsan/OWNERS index b608519abf6..da77b4ef23f 100644 --- a/chromium/third_party/libyuv/tools_libyuv/ubsan/OWNERS +++ b/chromium/third_party/libyuv/tools_libyuv/ubsan/OWNERS @@ -1,4 +1,4 @@ pbos@webrtc.org -kjellander@webrtc.org +phoglund@webrtc.org fbarchard@chromium.org diff --git a/chromium/third_party/libyuv/unit_test/color_test.cc b/chromium/third_party/libyuv/unit_test/color_test.cc index 0aa7a54ac01..30b6411283f 100644 --- a/chromium/third_party/libyuv/unit_test/color_test.cc +++ 
b/chromium/third_party/libyuv/unit_test/color_test.cc @@ -471,21 +471,22 @@ static void PrintHistogram(int rh[256], int gh[256], int bh[256]) { printf("\n"); } +// Step by 5 on inner loop goes from 0 to 255 inclusive. +// Set to 1 for better coverage. 3, 5 or 17 for faster testing. +#define FASTSTEP 5 TEST_F(LibYUVColorTest, TestFullYUV) { - int rh[256] = - { - 0, - }, - gh[256] = - { - 0, - }, - bh[256] = { - 0, - }; + int rh[256] = { + 0, + }; + int gh[256] = { + 0, + }; + int bh[256] = { + 0, + }; for (int u = 0; u < 256; ++u) { for (int v = 0; v < 256; ++v) { - for (int y2 = 0; y2 < 256; ++y2) { + for (int y2 = 0; y2 < 256; y2 += FASTSTEP) { int r0, g0, b0, r1, g1, b1; int y = RANDOM256(y2); YUVToRGBReference(y, u, v, &r0, &g0, &b0); @@ -503,20 +504,18 @@ TEST_F(LibYUVColorTest, TestFullYUV) { } TEST_F(LibYUVColorTest, TestFullYUVJ) { - int rh[256] = - { - 0, - }, - gh[256] = - { - 0, - }, - bh[256] = { - 0, - }; + int rh[256] = { + 0, + }; + int gh[256] = { + 0, + }; + int bh[256] = { + 0, + }; for (int u = 0; u < 256; ++u) { for (int v = 0; v < 256; ++v) { - for (int y2 = 0; y2 < 256; ++y2) { + for (int y2 = 0; y2 < 256; y2 += FASTSTEP) { int r0, g0, b0, r1, g1, b1; int y = RANDOM256(y2); YUVJToRGBReference(y, u, v, &r0, &g0, &b0); @@ -532,6 +531,7 @@ TEST_F(LibYUVColorTest, TestFullYUVJ) { } PrintHistogram(rh, gh, bh); } +#undef FASTSTEP TEST_F(LibYUVColorTest, TestGreyYUVJ) { int r0, g0, b0, r1, g1, b1, r2, g2, b2; diff --git a/chromium/third_party/libyuv/unit_test/compare_test.cc b/chromium/third_party/libyuv/unit_test/compare_test.cc index 149a9a13ae6..ff39b2b0f60 100644 --- a/chromium/third_party/libyuv/unit_test/compare_test.cc +++ b/chromium/third_party/libyuv/unit_test/compare_test.cc @@ -229,13 +229,34 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) { for (int i = 0; i < count; ++i) { #if defined(HAS_HAMMINGDISTANCE_NEON) h1 = HammingDistance_NEON(src_a, src_b, kMaxWidth); -#elif defined(HAS_HAMMINGDISTANCE_X86) - h1 = 
HammingDistance_X86(src_a, src_b, kMaxWidth); +#elif defined(HAS_HAMMINGDISTANCE_AVX2) + int has_avx2 = TestCpuFlag(kCpuHasAVX2); + if (has_avx2) { + h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth); + } else { + int has_sse42 = TestCpuFlag(kCpuHasSSE42); + if (has_sse42) { + h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth); + } else { + int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); + if (has_ssse3) { + h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth); + } else { + h1 = HammingDistance_C(src_a, src_b, kMaxWidth); + } + } + } +#elif defined(HAS_HAMMINGDISTANCE_SSE42) + int has_sse42 = TestCpuFlag(kCpuHasSSE42); + if (has_sse42) { + h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth); + } else { + h1 = HammingDistance_C(src_a, src_b, kMaxWidth); + } #else h1 = HammingDistance_C(src_a, src_b, kMaxWidth); #endif } - EXPECT_EQ(h0, h1); free_aligned_buffer_page_end(src_a); @@ -305,6 +326,99 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance) { free_aligned_buffer_page_end(src_b); } +// Tests low levels match reference C for specified size. +// The opt implementations have size limitations +// For NEON the counters are 16 bit so the shorts overflow after 65536 bytes. +// So doing one less iteration of the loop is the maximum. 
+#if defined(HAS_HAMMINGDISTANCE_NEON) +static const int kMaxOptCount = 65536 - 32; // 65504 +#else +static const int kMaxOptCount = (1 << (32 - 3)) - 64; // 536870848 +#endif + +TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) { + uint32 h1 = 0; + const int kMaxWidth = benchmark_width_ * benchmark_height_; + align_buffer_page_end(src_a, kMaxWidth); + align_buffer_page_end(src_b, kMaxWidth); + memset(src_a, 255u, kMaxWidth); + memset(src_b, 0u, kMaxWidth); + + uint64 h0 = ComputeHammingDistance(src_a, src_b, kMaxWidth); + EXPECT_EQ(kMaxWidth * 8ULL, h0); + + for (int i = 0; i < benchmark_iterations_; ++i) { +#if defined(HAS_HAMMINGDISTANCE_NEON) + h1 = HammingDistance_NEON(src_a, src_b, kMaxWidth); +#elif defined(HAS_HAMMINGDISTANCE_AVX2) + int has_avx2 = TestCpuFlag(kCpuHasAVX2); + if (has_avx2) { + h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth); + } else { + int has_sse42 = TestCpuFlag(kCpuHasSSE42); + if (has_sse42) { + h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth); + } else { + int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); + if (has_ssse3) { + h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth); + } else { + h1 = HammingDistance_C(src_a, src_b, kMaxWidth); + } + } + } +#elif defined(HAS_HAMMINGDISTANCE_SSE42) + int has_sse42 = TestCpuFlag(kCpuHasSSE42); + if (has_sse42) { + h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth); + } else { + h1 = HammingDistance_C(src_a, src_b, kMaxWidth); + } +#else + h1 = HammingDistance_C(src_a, src_b, kMaxWidth); +#endif + } + + // A large count will cause the low level to potentially overflow so the + // result can not be expected to be correct. + // TODO(fbarchard): Consider expecting the low 16 bits to match. 
+ if (kMaxWidth <= kMaxOptCount) { + EXPECT_EQ(kMaxWidth * 8U, h1); + } else { + if (kMaxWidth * 8ULL != static_cast<uint64>(h1)) { + printf( + "warning - HammingDistance_Opt %u does not match %llu " + "but length of %u is longer than guaranteed.\n", + h1, kMaxWidth * 8ULL, kMaxWidth); + } else { + printf( + "warning - HammingDistance_Opt %u matches but length of %u " + "is longer than guaranteed.\n", + h1, kMaxWidth); + } + } + + free_aligned_buffer_page_end(src_a); + free_aligned_buffer_page_end(src_b); +} + +TEST_F(LibYUVCompareTest, TestHammingDistance) { + align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_); + align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_); + memset(src_a, 255u, benchmark_width_ * benchmark_height_); + memset(src_b, 0, benchmark_width_ * benchmark_height_); + + uint64 h1 = 0; + for (int i = 0; i < benchmark_iterations_; ++i) { + h1 = ComputeHammingDistance(src_a, src_b, + benchmark_width_ * benchmark_height_); + } + EXPECT_EQ(benchmark_width_ * benchmark_height_ * 8ULL, h1); + + free_aligned_buffer_page_end(src_a); + free_aligned_buffer_page_end(src_b); +} + TEST_F(LibYUVCompareTest, BenchmarkSumSquareError_Opt) { const int kMaxWidth = 4096 * 3; align_buffer_page_end(src_a, kMaxWidth); diff --git a/chromium/third_party/libyuv/unit_test/convert_test.cc b/chromium/third_party/libyuv/unit_test/convert_test.cc index deeb30e4985..56b6364e5eb 100644 --- a/chromium/third_party/libyuv/unit_test/convert_test.cc +++ b/chromium/third_party/libyuv/unit_test/convert_test.cc @@ -572,6 +572,8 @@ TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1, 2, ARGB, 4) TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1, 2, ARGB, 4) TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1, 2, ARGB, 4) TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 1, 2, ARGB, 4) +TESTPLANARTOB(H420, 2, 2, RAW, 3, 3, 1, 2, ARGB, 4) +TESTPLANARTOB(H420, 2, 2, RGB24, 3, 3, 1, 2, ARGB, 4) TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 1, 9, ARGB, 4) TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 1, 9, ARGB, 4) 
TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 1, 17, ARGB, 4) @@ -1798,6 +1800,11 @@ TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, RGB24, 3) TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, RAW, 3) TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RAW, 3) TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, ARGB, 4) +TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, ARGB, 4) +TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, RGB24, 3) +TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, RAW, 3) +TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, RAW, 3) +TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, ARGB, 4) TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB565, 2) TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB1555, 2) TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB4444, 2) diff --git a/chromium/third_party/libyuv/unit_test/cpu_test.cc b/chromium/third_party/libyuv/unit_test/cpu_test.cc index c0b8910a115..4e694f55ce5 100644 --- a/chromium/third_party/libyuv/unit_test/cpu_test.cc +++ b/chromium/third_party/libyuv/unit_test/cpu_test.cc @@ -21,38 +21,55 @@ namespace libyuv { TEST_F(LibYUVBaseTest, TestCpuHas) { int cpu_flags = TestCpuFlag(-1); printf("Cpu Flags %x\n", cpu_flags); +#if defined(__arm__) || defined(__aarch64__) int has_arm = TestCpuFlag(kCpuHasARM); printf("Has ARM %x\n", has_arm); int has_neon = TestCpuFlag(kCpuHasNEON); printf("Has NEON %x\n", has_neon); +#endif int has_x86 = TestCpuFlag(kCpuHasX86); - printf("Has X86 %x\n", has_x86); int has_sse2 = TestCpuFlag(kCpuHasSSE2); - printf("Has SSE2 %x\n", has_sse2); int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); - printf("Has SSSE3 %x\n", has_ssse3); int has_sse41 = TestCpuFlag(kCpuHasSSE41); - printf("Has SSE4.1 %x\n", has_sse41); int has_sse42 = TestCpuFlag(kCpuHasSSE42); - printf("Has SSE4.2 %x\n", has_sse42); int has_avx = TestCpuFlag(kCpuHasAVX); - printf("Has AVX %x\n", has_avx); int has_avx2 = TestCpuFlag(kCpuHasAVX2); - printf("Has AVX2 %x\n", has_avx2); int has_erms = TestCpuFlag(kCpuHasERMS); - printf("Has ERMS %x\n", has_erms); int has_fma3 = TestCpuFlag(kCpuHasFMA3); - printf("Has FMA3 %x\n", has_fma3); - int has_avx3 = 
TestCpuFlag(kCpuHasAVX3); - printf("Has AVX3 %x\n", has_avx3); int has_f16c = TestCpuFlag(kCpuHasF16C); + int has_gfni = TestCpuFlag(kCpuHasGFNI); + int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW); + int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL); + int has_avx512vbmi = TestCpuFlag(kCpuHasAVX512VBMI); + int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2); + int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG); + int has_avx512vpopcntdq = TestCpuFlag(kCpuHasAVX512VPOPCNTDQ); + printf("Has X86 %x\n", has_x86); + printf("Has SSE2 %x\n", has_sse2); + printf("Has SSSE3 %x\n", has_ssse3); + printf("Has SSE4.1 %x\n", has_sse41); + printf("Has SSE4.2 %x\n", has_sse42); + printf("Has AVX %x\n", has_avx); + printf("Has AVX2 %x\n", has_avx2); + printf("Has ERMS %x\n", has_erms); + printf("Has FMA3 %x\n", has_fma3); printf("Has F16C %x\n", has_f16c); + printf("Has GFNI %x\n", has_gfni); + printf("Has AVX512BW %x\n", has_avx512bw); + printf("Has AVX512VL %x\n", has_avx512vl); + printf("Has AVX512VBMI %x\n", has_avx512vbmi); + printf("Has AVX512VBMI2 %x\n", has_avx512vbmi2); + printf("Has AVX512VBITALG %x\n", has_avx512vbitalg); + printf("Has AVX512VPOPCNTDQ %x\n", has_avx512vpopcntdq); + +#if defined(__mips__) int has_mips = TestCpuFlag(kCpuHasMIPS); printf("Has MIPS %x\n", has_mips); int has_dspr2 = TestCpuFlag(kCpuHasDSPR2); printf("Has DSPR2 %x\n", has_dspr2); int has_msa = TestCpuFlag(kCpuHasMSA); printf("Has MSA %x\n", has_msa); +#endif } TEST_F(LibYUVBaseTest, TestCpuCompilerEnabled) { diff --git a/chromium/third_party/libyuv/unit_test/planar_test.cc b/chromium/third_party/libyuv/unit_test/planar_test.cc index 32bcb871065..f9e6f8abb2f 100644 --- a/chromium/third_party/libyuv/unit_test/planar_test.cc +++ b/chromium/third_party/libyuv/unit_test/planar_test.cc @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <math.h> #include <stdlib.h> #include <time.h> @@ -2616,6 +2617,88 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) { free_aligned_buffer_page_end(dst_pixels_c); } +// TODO(fbarchard): improve test for platforms and cpu detect +#ifdef HAS_MERGEUVROW_16_AVX2 +TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) { + const int kPixels = benchmark_width_ * benchmark_height_; + align_buffer_page_end(src_pixels_u, kPixels * 2); + align_buffer_page_end(src_pixels_v, kPixels * 2); + align_buffer_page_end(dst_pixels_uv_opt, kPixels * 2 * 2); + align_buffer_page_end(dst_pixels_uv_c, kPixels * 2 * 2); + + MemRandomize(src_pixels_u, kPixels * 2); + MemRandomize(src_pixels_v, kPixels * 2); + memset(dst_pixels_uv_opt, 0, kPixels * 2 * 2); + memset(dst_pixels_uv_c, 1, kPixels * 2 * 2); + + MergeUVRow_16_C(reinterpret_cast<const uint16*>(src_pixels_u), + reinterpret_cast<const uint16*>(src_pixels_v), + reinterpret_cast<uint16*>(dst_pixels_uv_c), 64, kPixels); + + int has_avx2 = TestCpuFlag(kCpuHasAVX2); + for (int i = 0; i < benchmark_iterations_; ++i) { + if (has_avx2) { + MergeUVRow_16_AVX2(reinterpret_cast<const uint16*>(src_pixels_u), + reinterpret_cast<const uint16*>(src_pixels_v), + reinterpret_cast<uint16*>(dst_pixels_uv_opt), 64, + kPixels); + } else { + MergeUVRow_16_C(reinterpret_cast<const uint16*>(src_pixels_u), + reinterpret_cast<const uint16*>(src_pixels_v), + reinterpret_cast<uint16*>(dst_pixels_uv_opt), 64, + kPixels); + } + } + + for (int i = 0; i < kPixels * 2 * 2; ++i) { + EXPECT_EQ(dst_pixels_uv_opt[i], dst_pixels_uv_c[i]); + } + + free_aligned_buffer_page_end(src_pixels_u); + free_aligned_buffer_page_end(src_pixels_v); + free_aligned_buffer_page_end(dst_pixels_uv_opt); + free_aligned_buffer_page_end(dst_pixels_uv_c); +} +#endif + +// TODO(fbarchard): improve test for platforms and cpu detect +#ifdef HAS_MULTIPLYROW_16_AVX2 +TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) { + const int kPixels = benchmark_width_ * benchmark_height_; + 
align_buffer_page_end(src_pixels_y, kPixels * 2); + align_buffer_page_end(dst_pixels_y_opt, kPixels * 2); + align_buffer_page_end(dst_pixels_y_c, kPixels * 2); + + MemRandomize(src_pixels_y, kPixels * 2); + memset(dst_pixels_y_opt, 0, kPixels * 2); + memset(dst_pixels_y_c, 1, kPixels * 2); + + MultiplyRow_16_C(reinterpret_cast<const uint16*>(src_pixels_y), + reinterpret_cast<uint16*>(dst_pixels_y_c), 64, kPixels); + + int has_avx2 = TestCpuFlag(kCpuHasAVX2); + for (int i = 0; i < benchmark_iterations_; ++i) { + if (has_avx2) { + MultiplyRow_16_AVX2(reinterpret_cast<const uint16*>(src_pixels_y), + reinterpret_cast<uint16*>(dst_pixels_y_opt), 64, + kPixels); + } else { + MultiplyRow_16_C(reinterpret_cast<const uint16*>(src_pixels_y), + reinterpret_cast<uint16*>(dst_pixels_y_opt), 64, + kPixels); + } + } + + for (int i = 0; i < kPixels * 2; ++i) { + EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]); + } + + free_aligned_buffer_page_end(src_pixels_y); + free_aligned_buffer_page_end(dst_pixels_y_opt); + free_aligned_buffer_page_end(dst_pixels_y_c); +} +#endif + float TestScaleMaxSamples(int benchmark_width, int benchmark_height, int benchmark_iterations, @@ -2623,44 +2706,44 @@ float TestScaleMaxSamples(int benchmark_width, bool opt) { int i, j; float max_c, max_opt = 0.f; - const int y_plane_size = benchmark_width * benchmark_height * 4; - - align_buffer_page_end(orig_y, y_plane_size * 3); - uint8* dst_opt = orig_y + y_plane_size; - uint8* dst_c = orig_y + y_plane_size * 2; + // NEON does multiple of 8, so round count up + const int kPixels = (benchmark_width * benchmark_height + 7) & ~7; + align_buffer_page_end(orig_y, kPixels * 4 * 3 + 48); + uint8* dst_c = orig_y + kPixels * 4 + 16; + uint8* dst_opt = orig_y + kPixels * 4 * 2 + 32; // Randomize works but may contain some denormals affecting performance. 
- // MemRandomize(orig_y, y_plane_size); - for (i = 0; i < y_plane_size / 4; ++i) { - (reinterpret_cast<float*>(orig_y))[i] = (i - y_plane_size / 8) * 3.1415f; + // MemRandomize(orig_y, kPixels * 4); + // large values are problematic. audio is really -1 to 1. + for (i = 0; i < kPixels; ++i) { + (reinterpret_cast<float*>(orig_y))[i] = sinf(static_cast<float>(i) * 0.1f); } - memset(dst_c, 0, y_plane_size); - memset(dst_opt, 1, y_plane_size); + memset(dst_c, 0, kPixels * 4); + memset(dst_opt, 1, kPixels * 4); max_c = ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y), - reinterpret_cast<float*>(dst_c), scale, - benchmark_width * benchmark_height); + reinterpret_cast<float*>(dst_c), scale, kPixels); for (j = 0; j < benchmark_iterations; j++) { if (opt) { #ifdef HAS_SCALESUMSAMPLES_NEON max_opt = ScaleMaxSamples_NEON(reinterpret_cast<float*>(orig_y), reinterpret_cast<float*>(dst_opt), scale, - benchmark_width * benchmark_height); + kPixels); #else - max_opt = ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y), - reinterpret_cast<float*>(dst_opt), scale, - benchmark_width * benchmark_height); + max_opt = + ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y), + reinterpret_cast<float*>(dst_opt), scale, kPixels); #endif } else { - max_opt = ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y), - reinterpret_cast<float*>(dst_opt), scale, - benchmark_width * benchmark_height); + max_opt = + ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y), + reinterpret_cast<float*>(dst_opt), scale, kPixels); } } float max_diff = FAbs(max_opt - max_c); - for (i = 0; i < y_plane_size / 4; ++i) { + for (i = 0; i < kPixels; ++i) { float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) - (reinterpret_cast<float*>(dst_opt)[i])); if (abs_diff > max_diff) { @@ -2691,44 +2774,55 @@ float TestScaleSumSamples(int benchmark_width, bool opt) { int i, j; float sum_c, sum_opt = 0.f; - const int y_plane_size = benchmark_width * benchmark_height * 4; - - align_buffer_page_end(orig_y, y_plane_size * 3); - 
uint8* dst_opt = orig_y + y_plane_size; - uint8* dst_c = orig_y + y_plane_size * 2; + // NEON does multiple of 8, so round count up + const int kPixels = (benchmark_width * benchmark_height + 7) & ~7; + align_buffer_page_end(orig_y, kPixels * 4 * 3); + uint8* dst_c = orig_y + kPixels * 4; + uint8* dst_opt = orig_y + kPixels * 4 * 2; // Randomize works but may contain some denormals affecting performance. - // MemRandomize(orig_y, y_plane_size); - for (i = 0; i < y_plane_size / 4; ++i) { - (reinterpret_cast<float*>(orig_y))[i] = (i - y_plane_size / 8) * 3.1415f; + // MemRandomize(orig_y, kPixels * 4); + // large values are problematic. audio is really -1 to 1. + for (i = 0; i < kPixels; ++i) { + (reinterpret_cast<float*>(orig_y))[i] = sinf(static_cast<float>(i) * 0.1f); } - memset(dst_c, 0, y_plane_size); - memset(dst_opt, 1, y_plane_size); + memset(dst_c, 0, kPixels * 4); + memset(dst_opt, 1, kPixels * 4); sum_c = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y), - reinterpret_cast<float*>(dst_c), scale, - benchmark_width * benchmark_height); + reinterpret_cast<float*>(dst_c), scale, kPixels); for (j = 0; j < benchmark_iterations; j++) { if (opt) { #ifdef HAS_SCALESUMSAMPLES_NEON sum_opt = ScaleSumSamples_NEON(reinterpret_cast<float*>(orig_y), reinterpret_cast<float*>(dst_opt), scale, - benchmark_width * benchmark_height); + kPixels); #else - sum_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y), - reinterpret_cast<float*>(dst_opt), scale, - benchmark_width * benchmark_height); + sum_opt = + ScaleSumSamples_C(reinterpret_cast<float*>(orig_y), + reinterpret_cast<float*>(dst_opt), scale, kPixels); #endif } else { - sum_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y), - reinterpret_cast<float*>(dst_opt), scale, - benchmark_width * benchmark_height); + sum_opt = + ScaleSumSamples_C(reinterpret_cast<float*>(orig_y), + reinterpret_cast<float*>(dst_opt), scale, kPixels); } } - float max_diff = FAbs(sum_opt - sum_c); - for (i = 0; i < y_plane_size / 4; 
++i) { + float mse_opt = sum_opt / kPixels * 4; + float mse_c = sum_c / kPixels * 4; + float mse_error = FAbs(mse_opt - mse_c) / mse_c; + + // If the sum of a float is more than 4 million, small adds are round down on + // float and produce different results with vectorized sum vs scalar sum. + // Ignore the difference if the sum is large. + float max_diff = 0.f; + if (mse_error > 0.0001 && sum_c < 4000000) { // allow .01% difference of mse + max_diff = mse_error; + } + + for (i = 0; i < kPixels; ++i) { float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) - (reinterpret_cast<float*>(dst_opt)[i])); if (abs_diff > max_diff) { @@ -2758,45 +2852,41 @@ float TestScaleSamples(int benchmark_width, float scale, bool opt) { int i, j; - const int y_plane_size = benchmark_width * benchmark_height * 4; - - align_buffer_page_end(orig_y, y_plane_size * 3); - uint8* dst_opt = orig_y + y_plane_size; - uint8* dst_c = orig_y + y_plane_size * 2; + // NEON does multiple of 8, so round count up + const int kPixels = (benchmark_width * benchmark_height + 7) & ~7; + align_buffer_page_end(orig_y, kPixels * 4 * 3); + uint8* dst_c = orig_y + kPixels * 4; + uint8* dst_opt = orig_y + kPixels * 4 * 2; // Randomize works but may contain some denormals affecting performance. - // MemRandomize(orig_y, y_plane_size); - for (i = 0; i < y_plane_size / 4; ++i) { - (reinterpret_cast<float*>(orig_y))[i] = (i - y_plane_size / 8) * 3.1415f; + // MemRandomize(orig_y, kPixels * 4); + // large values are problematic. audio is really -1 to 1. 
+ for (i = 0; i < kPixels; ++i) { + (reinterpret_cast<float*>(orig_y))[i] = sinf(static_cast<float>(i) * 0.1f); } - - memset(dst_c, 0, y_plane_size); - memset(dst_opt, 1, y_plane_size); + memset(dst_c, 0, kPixels * 4); + memset(dst_opt, 1, kPixels * 4); ScaleSamples_C(reinterpret_cast<float*>(orig_y), - reinterpret_cast<float*>(dst_c), scale, - benchmark_width * benchmark_height); + reinterpret_cast<float*>(dst_c), scale, kPixels); for (j = 0; j < benchmark_iterations; j++) { if (opt) { #ifdef HAS_SCALESUMSAMPLES_NEON ScaleSamples_NEON(reinterpret_cast<float*>(orig_y), - reinterpret_cast<float*>(dst_opt), scale, - benchmark_width * benchmark_height); + reinterpret_cast<float*>(dst_opt), scale, kPixels); #else ScaleSamples_C(reinterpret_cast<float*>(orig_y), - reinterpret_cast<float*>(dst_opt), scale, - benchmark_width * benchmark_height); + reinterpret_cast<float*>(dst_opt), scale, kPixels); #endif } else { ScaleSamples_C(reinterpret_cast<float*>(orig_y), - reinterpret_cast<float*>(dst_opt), scale, - benchmark_width * benchmark_height); + reinterpret_cast<float*>(dst_opt), scale, kPixels); } } float max_diff = 0.f; - for (i = 0; i < y_plane_size / 4; ++i) { + for (i = 0; i < kPixels; ++i) { float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) - (reinterpret_cast<float*>(dst_opt)[i])); if (abs_diff > max_diff) { @@ -2820,6 +2910,66 @@ TEST_F(LibYUVPlanarTest, TestScaleSamples_Opt) { EXPECT_EQ(0, diff); } +float TestCopySamples(int benchmark_width, + int benchmark_height, + int benchmark_iterations, + bool opt) { + int i, j; + // NEON does multiple of 16 floats, so round count up + const int kPixels = (benchmark_width * benchmark_height + 15) & ~15; + align_buffer_page_end(orig_y, kPixels * 4 * 3); + uint8* dst_c = orig_y + kPixels * 4; + uint8* dst_opt = orig_y + kPixels * 4 * 2; + + // Randomize works but may contain some denormals affecting performance. + // MemRandomize(orig_y, kPixels * 4); + // large values are problematic. audio is really -1 to 1. 
+ for (i = 0; i < kPixels; ++i) { + (reinterpret_cast<float*>(orig_y))[i] = sinf(static_cast<float>(i) * 0.1f); + } + memset(dst_c, 0, kPixels * 4); + memset(dst_opt, 1, kPixels * 4); + + memcpy(reinterpret_cast<void*>(dst_c), reinterpret_cast<void*>(orig_y), + kPixels * 4); + + for (j = 0; j < benchmark_iterations; j++) { + if (opt) { +#ifdef HAS_COPYROW_NEON + CopyRow_NEON(orig_y, dst_opt, kPixels * 4); +#else + CopyRow_C(orig_y, dst_opt, kPixels * 4); +#endif + } else { + CopyRow_C(orig_y, dst_opt, kPixels * 4); + } + } + + float max_diff = 0.f; + for (i = 0; i < kPixels; ++i) { + float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) - + (reinterpret_cast<float*>(dst_opt)[i])); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + + free_aligned_buffer_page_end(orig_y); + return max_diff; +} + +TEST_F(LibYUVPlanarTest, TestCopySamples_C) { + float diff = TestCopySamples(benchmark_width_, benchmark_height_, + benchmark_iterations_, false); + EXPECT_EQ(0, diff); +} + +TEST_F(LibYUVPlanarTest, TestCopySamples_Opt) { + float diff = TestCopySamples(benchmark_width_, benchmark_height_, + benchmark_iterations_, true); + EXPECT_EQ(0, diff); +} + extern "C" void GaussRow_NEON(const uint32* src, uint16* dst, int width); extern "C" void GaussRow_C(const uint32* src, uint16* dst, int width); diff --git a/chromium/third_party/libyuv/unit_test/scale_test.cc b/chromium/third_party/libyuv/unit_test/scale_test.cc index 8046de7d4bf..c39211a161b 100644 --- a/chromium/third_party/libyuv/unit_test/scale_test.cc +++ b/chromium/third_party/libyuv/unit_test/scale_test.cc @@ -35,19 +35,19 @@ static int TestFilter(int src_width, } int i, j; - const int b = 0; // 128 to test for padding/stride. 
int src_width_uv = (Abs(src_width) + 1) >> 1; int src_height_uv = (Abs(src_height) + 1) >> 1; - int64 src_y_plane_size = (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2); - int64 src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2); + int64 src_y_plane_size = (Abs(src_width)) * (Abs(src_height)); + int64 src_uv_plane_size = (src_width_uv) * (src_height_uv); - int src_stride_y = b * 2 + Abs(src_width); - int src_stride_uv = b * 2 + src_width_uv; + int src_stride_y = Abs(src_width); + int src_stride_uv = src_width_uv; - align_buffer_page_end(src_y, src_y_plane_size) - align_buffer_page_end(src_u, src_uv_plane_size) align_buffer_page_end( - src_v, src_uv_plane_size) if (!src_y || !src_u || !src_v) { + align_buffer_page_end(src_y, src_y_plane_size); + align_buffer_page_end(src_u, src_uv_plane_size); + align_buffer_page_end(src_v, src_uv_plane_size); + if (!src_y || !src_u || !src_v) { printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } @@ -58,60 +58,51 @@ static int TestFilter(int src_width, int dst_width_uv = (dst_width + 1) >> 1; int dst_height_uv = (dst_height + 1) >> 1; - int64 dst_y_plane_size = (dst_width + b * 2) * (dst_height + b * 2); - int64 dst_uv_plane_size = (dst_width_uv + b * 2) * (dst_height_uv + b * 2); - - int dst_stride_y = b * 2 + dst_width; - int dst_stride_uv = b * 2 + dst_width_uv; - - align_buffer_page_end(dst_y_c, dst_y_plane_size) - align_buffer_page_end(dst_u_c, dst_uv_plane_size) - align_buffer_page_end(dst_v_c, dst_uv_plane_size) - align_buffer_page_end(dst_y_opt, dst_y_plane_size) - align_buffer_page_end(dst_u_opt, dst_uv_plane_size) - align_buffer_page_end( - dst_v_opt, - dst_uv_plane_size) if (!dst_y_c || !dst_u_c || - !dst_v_c || !dst_y_opt || - !dst_u_opt || !dst_v_opt) { + int64 dst_y_plane_size = (dst_width) * (dst_height); + int64 dst_uv_plane_size = (dst_width_uv) * (dst_height_uv); + + int dst_stride_y = dst_width; + int dst_stride_uv = dst_width_uv; + + 
align_buffer_page_end(dst_y_c, dst_y_plane_size); + align_buffer_page_end(dst_u_c, dst_uv_plane_size); + align_buffer_page_end(dst_v_c, dst_uv_plane_size); + align_buffer_page_end(dst_y_opt, dst_y_plane_size); + align_buffer_page_end(dst_u_opt, dst_uv_plane_size); + align_buffer_page_end(dst_v_opt, dst_uv_plane_size); + if (!dst_y_c || !dst_u_c || !dst_v_c || !dst_y_opt || !dst_u_opt || + !dst_v_opt) { printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. double c_time = get_time(); - I420Scale(src_y + (src_stride_y * b) + b, src_stride_y, - src_u + (src_stride_uv * b) + b, src_stride_uv, - src_v + (src_stride_uv * b) + b, src_stride_uv, src_width, - src_height, dst_y_c + (dst_stride_y * b) + b, dst_stride_y, - dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv, - dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv, dst_width, - dst_height, f); + I420Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv, + src_width, src_height, dst_y_c, dst_stride_y, dst_u_c, + dst_stride_uv, dst_v_c, dst_stride_uv, dst_width, dst_height, f); c_time = (get_time() - c_time); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. 
double opt_time = get_time(); for (i = 0; i < benchmark_iterations; ++i) { - I420Scale(src_y + (src_stride_y * b) + b, src_stride_y, - src_u + (src_stride_uv * b) + b, src_stride_uv, - src_v + (src_stride_uv * b) + b, src_stride_uv, src_width, - src_height, dst_y_opt + (dst_stride_y * b) + b, dst_stride_y, - dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv, - dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv, dst_width, - dst_height, f); + I420Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv, + src_width, src_height, dst_y_opt, dst_stride_y, dst_u_opt, + dst_stride_uv, dst_v_opt, dst_stride_uv, dst_width, dst_height, + f); } opt_time = (get_time() - opt_time) / benchmark_iterations; - // Report performance of C vs OPT + // Report performance of C vs OPT. printf("filter %d - %8d us C - %8d us OPT\n", f, static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6)); // C version may be a little off from the optimized. Order of // operations may introduce rounding somewhere. So do a difference - // of the buffers and look to see that the max difference isn't - // over 2. + // of the buffers and look to see that the max difference is not + // over 3. 
int max_diff = 0; - for (i = b; i < (dst_height + b); ++i) { - for (j = b; j < (dst_width + b); ++j) { + for (i = 0; i < (dst_height); ++i) { + for (j = 0; j < (dst_width); ++j) { int abs_diff = Abs(dst_y_c[(i * dst_stride_y) + j] - dst_y_opt[(i * dst_stride_y) + j]); if (abs_diff > max_diff) { @@ -120,8 +111,8 @@ static int TestFilter(int src_width, } } - for (i = b; i < (dst_height_uv + b); ++i) { - for (j = b; j < (dst_width_uv + b); ++j) { + for (i = 0; i < (dst_height_uv); ++i) { + for (j = 0; j < (dst_width_uv); ++j) { int abs_diff = Abs(dst_u_c[(i * dst_stride_uv) + j] - dst_u_opt[(i * dst_stride_uv) + j]); if (abs_diff > max_diff) { @@ -135,17 +126,17 @@ static int TestFilter(int src_width, } } - free_aligned_buffer_page_end(dst_y_c) free_aligned_buffer_page_end(dst_u_c) - free_aligned_buffer_page_end(dst_v_c) - free_aligned_buffer_page_end(dst_y_opt) - free_aligned_buffer_page_end(dst_u_opt) - free_aligned_buffer_page_end(dst_v_opt) - - free_aligned_buffer_page_end(src_y) - free_aligned_buffer_page_end(src_u) - free_aligned_buffer_page_end(src_v) - - return max_diff; + free_aligned_buffer_page_end(dst_y_c); + free_aligned_buffer_page_end(dst_u_c); + free_aligned_buffer_page_end(dst_v_c); + free_aligned_buffer_page_end(dst_y_opt); + free_aligned_buffer_page_end(dst_u_opt); + free_aligned_buffer_page_end(dst_v_opt); + free_aligned_buffer_page_end(src_y); + free_aligned_buffer_page_end(src_u); + free_aligned_buffer_page_end(src_v); + + return max_diff; } // Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference. @@ -155,28 +146,34 @@ static int TestFilter_16(int src_width, int dst_width, int dst_height, FilterMode f, - int benchmark_iterations) { + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { if (!SizeValid(src_width, src_height, dst_width, dst_height)) { return 0; } - int i, j; - const int b = 0; // 128 to test for padding/stride. 
+ int i; int src_width_uv = (Abs(src_width) + 1) >> 1; int src_height_uv = (Abs(src_height) + 1) >> 1; - int64 src_y_plane_size = (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2); - int64 src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2); + int64 src_y_plane_size = (Abs(src_width)) * (Abs(src_height)); + int64 src_uv_plane_size = (src_width_uv) * (src_height_uv); - int src_stride_y = b * 2 + Abs(src_width); - int src_stride_uv = b * 2 + src_width_uv; + int src_stride_y = Abs(src_width); + int src_stride_uv = src_width_uv; - align_buffer_page_end(src_y, src_y_plane_size) align_buffer_page_end( - src_u, src_uv_plane_size) align_buffer_page_end(src_v, src_uv_plane_size) - align_buffer_page_end(src_y_16, src_y_plane_size * 2) - align_buffer_page_end(src_u_16, src_uv_plane_size * 2) - align_buffer_page_end(src_v_16, src_uv_plane_size * 2) - uint16* p_src_y_16 = reinterpret_cast<uint16*>(src_y_16); + align_buffer_page_end(src_y, src_y_plane_size); + align_buffer_page_end(src_u, src_uv_plane_size); + align_buffer_page_end(src_v, src_uv_plane_size); + align_buffer_page_end(src_y_16, src_y_plane_size * 2); + align_buffer_page_end(src_u_16, src_uv_plane_size * 2); + align_buffer_page_end(src_v_16, src_uv_plane_size * 2); + if (!src_y || !src_u || !src_v || !src_y_16 || !src_u_16 || !src_v_16) { + printf("Skipped. 
Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); + return 0; + } + uint16* p_src_y_16 = reinterpret_cast<uint16*>(src_y_16); uint16* p_src_u_16 = reinterpret_cast<uint16*>(src_u_16); uint16* p_src_v_16 = reinterpret_cast<uint16*>(src_v_16); @@ -184,104 +181,84 @@ static int TestFilter_16(int src_width, MemRandomize(src_u, src_uv_plane_size); MemRandomize(src_v, src_uv_plane_size); - for (i = b; i < src_height + b; ++i) { - for (j = b; j < src_width + b; ++j) { - p_src_y_16[(i * src_stride_y) + j] = src_y[(i * src_stride_y) + j]; - } + for (i = 0; i < src_y_plane_size; ++i) { + p_src_y_16[i] = src_y[i]; } - - for (i = b; i < (src_height_uv + b); ++i) { - for (j = b; j < (src_width_uv + b); ++j) { - p_src_u_16[(i * src_stride_uv) + j] = src_u[(i * src_stride_uv) + j]; - p_src_v_16[(i * src_stride_uv) + j] = src_v[(i * src_stride_uv) + j]; - } + for (i = 0; i < src_uv_plane_size; ++i) { + p_src_u_16[i] = src_u[i]; + p_src_v_16[i] = src_v[i]; } int dst_width_uv = (dst_width + 1) >> 1; int dst_height_uv = (dst_height + 1) >> 1; - int dst_y_plane_size = (dst_width + b * 2) * (dst_height + b * 2); - int dst_uv_plane_size = (dst_width_uv + b * 2) * (dst_height_uv + b * 2); + int dst_y_plane_size = (dst_width) * (dst_height); + int dst_uv_plane_size = (dst_width_uv) * (dst_height_uv); - int dst_stride_y = b * 2 + dst_width; - int dst_stride_uv = b * 2 + dst_width_uv; + int dst_stride_y = dst_width; + int dst_stride_uv = dst_width_uv; - align_buffer_page_end(dst_y_8, dst_y_plane_size) - align_buffer_page_end(dst_u_8, dst_uv_plane_size) - align_buffer_page_end(dst_v_8, dst_uv_plane_size) - align_buffer_page_end(dst_y_16, dst_y_plane_size * 2) - align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2) - align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2) + align_buffer_page_end(dst_y_8, dst_y_plane_size); + align_buffer_page_end(dst_u_8, dst_uv_plane_size); + align_buffer_page_end(dst_v_8, dst_uv_plane_size); + align_buffer_page_end(dst_y_16, dst_y_plane_size * 2); + 
align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2); + align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2); - uint16* p_dst_y_16 = - reinterpret_cast<uint16*>(dst_y_16); + uint16* p_dst_y_16 = reinterpret_cast<uint16*>(dst_y_16); uint16* p_dst_u_16 = reinterpret_cast<uint16*>(dst_u_16); uint16* p_dst_v_16 = reinterpret_cast<uint16*>(dst_v_16); - I420Scale(src_y + (src_stride_y * b) + b, src_stride_y, - src_u + (src_stride_uv * b) + b, src_stride_uv, - src_v + (src_stride_uv * b) + b, src_stride_uv, src_width, - src_height, dst_y_8 + (dst_stride_y * b) + b, dst_stride_y, - dst_u_8 + (dst_stride_uv * b) + b, dst_stride_uv, - dst_v_8 + (dst_stride_uv * b) + b, dst_stride_uv, dst_width, - dst_height, f); - + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. + I420Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv, + src_width, src_height, dst_y_8, dst_stride_y, dst_u_8, + dst_stride_uv, dst_v_8, dst_stride_uv, dst_width, dst_height, f); + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. for (i = 0; i < benchmark_iterations; ++i) { - I420Scale_16(p_src_y_16 + (src_stride_y * b) + b, src_stride_y, - p_src_u_16 + (src_stride_uv * b) + b, src_stride_uv, - p_src_v_16 + (src_stride_uv * b) + b, src_stride_uv, src_width, - src_height, p_dst_y_16 + (dst_stride_y * b) + b, dst_stride_y, - p_dst_u_16 + (dst_stride_uv * b) + b, dst_stride_uv, - p_dst_v_16 + (dst_stride_uv * b) + b, dst_stride_uv, dst_width, - dst_height, f); + I420Scale_16(p_src_y_16, src_stride_y, p_src_u_16, src_stride_uv, + p_src_v_16, src_stride_uv, src_width, src_height, p_dst_y_16, + dst_stride_y, p_dst_u_16, dst_stride_uv, p_dst_v_16, + dst_stride_uv, dst_width, dst_height, f); } - // Expect an exact match + // Expect an exact match. 
int max_diff = 0; - for (i = b; i < (dst_height + b); ++i) { - for (j = b; j < (dst_width + b); ++j) { - int abs_diff = Abs(dst_y_8[(i * dst_stride_y) + j] - - p_dst_y_16[(i * dst_stride_y) + j]); - if (abs_diff > max_diff) { - max_diff = abs_diff; - } + for (i = 0; i < dst_y_plane_size; ++i) { + int abs_diff = Abs(dst_y_8[i] - p_dst_y_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; } } - - for (i = b; i < (dst_height_uv + b); ++i) { - for (j = b; j < (dst_width_uv + b); ++j) { - int abs_diff = Abs(dst_u_8[(i * dst_stride_uv) + j] - - p_dst_u_16[(i * dst_stride_uv) + j]); - if (abs_diff > max_diff) { - max_diff = abs_diff; - } - abs_diff = Abs(dst_v_8[(i * dst_stride_uv) + j] - - p_dst_v_16[(i * dst_stride_uv) + j]); - if (abs_diff > max_diff) { - max_diff = abs_diff; - } + for (i = 0; i < dst_uv_plane_size; ++i) { + int abs_diff = Abs(dst_u_8[i] - p_dst_u_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + abs_diff = Abs(dst_v_8[i] - p_dst_v_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; } } - free_aligned_buffer_page_end(dst_y_8) free_aligned_buffer_page_end(dst_u_8) - free_aligned_buffer_page_end(dst_v_8) - free_aligned_buffer_page_end(dst_y_16) - free_aligned_buffer_page_end(dst_u_16) - free_aligned_buffer_page_end(dst_v_16) - - free_aligned_buffer_page_end(src_y) - free_aligned_buffer_page_end(src_u) - free_aligned_buffer_page_end(src_v) - free_aligned_buffer_page_end(src_y_16) - free_aligned_buffer_page_end(src_u_16) - free_aligned_buffer_page_end(src_v_16) - - return max_diff; + free_aligned_buffer_page_end(dst_y_8); + free_aligned_buffer_page_end(dst_u_8); + free_aligned_buffer_page_end(dst_v_8); + free_aligned_buffer_page_end(dst_y_16); + free_aligned_buffer_page_end(dst_u_16); + free_aligned_buffer_page_end(dst_v_16); + free_aligned_buffer_page_end(src_y); + free_aligned_buffer_page_end(src_u); + free_aligned_buffer_page_end(src_v); + free_aligned_buffer_page_end(src_y_16); + free_aligned_buffer_page_end(src_u_16); + 
free_aligned_buffer_page_end(src_v_16); + + return max_diff; } // The following adjustments in dimensions ensure the scale factor will be // exactly achieved. -// 2 is chroma subsample +// 2 is chroma subsample. #define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2) #define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2) @@ -294,11 +271,12 @@ static int TestFilter_16(int src_width, benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(LibYUVScaleTest, DISABLED_ScaleDownBy##name##_##filter##_16) { \ + TEST_F(LibYUVScaleTest, ScaleDownBy##name##_##filter##_16) { \ int diff = TestFilter_16( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ - kFilter##filter, benchmark_iterations_); \ + kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } @@ -335,26 +313,26 @@ TEST_FACTOR(3, 1, 3, 0) benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(LibYUVScaleTest, \ - DISABLED_##name##To##width##x##height##_##filter##_16) { \ + TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter##_16) { \ int diff = TestFilter_16(benchmark_width_, benchmark_height_, width, \ - height, kFilter##filter, benchmark_iterations_); \ + height, kFilter##filter, benchmark_iterations_, \ + disable_cpu_flags_, benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(LibYUVScaleTest, \ - DISABLED_##name##From##width##x##height##_##filter##_16) { \ + TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter##_16) { \ int diff = TestFilter_16(width, height, Abs(benchmark_width_), \ Abs(benchmark_height_), kFilter##filter, \ - benchmark_iterations_); \ + benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } // Test scale to a specified size with all 4 filters. 
#define TEST_SCALETO(name, width, height) \ TEST_SCALETO1(name, width, height, None, 0) \ - TEST_SCALETO1(name, width, height, Linear, 0) \ - TEST_SCALETO1(name, width, height, Bilinear, 0) \ - TEST_SCALETO1(name, width, height, Box, 0) + TEST_SCALETO1(name, width, height, Linear, 3) \ + TEST_SCALETO1(name, width, height, Bilinear, 3) \ + TEST_SCALETO1(name, width, height, Box, 3) TEST_SCALETO(Scale, 1, 1) TEST_SCALETO(Scale, 320, 240) @@ -366,7 +344,7 @@ TEST_SCALETO(Scale, 1280, 720) #undef TEST_SCALETO #ifdef HAS_SCALEROWDOWN2_SSSE3 -TEST_F(LibYUVScaleTest, TestScaleOdd) { +TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) { SIMD_ALIGNED(uint8 orig_pixels[128 * 2]); SIMD_ALIGNED(uint8 dst_pixels_opt[64]); SIMD_ALIGNED(uint8 dst_pixels_c[64]); @@ -374,78 +352,83 @@ TEST_F(LibYUVScaleTest, TestScaleOdd) { memset(dst_pixels_opt, 0, sizeof(dst_pixels_opt)); memset(dst_pixels_c, 0, sizeof(dst_pixels_c)); - // TL - orig_pixels[0] = 255u; - orig_pixels[1] = 0u; - orig_pixels[128 + 0] = 0u; - orig_pixels[128 + 1] = 0u; - // TR - orig_pixels[2] = 0u; - orig_pixels[3] = 100u; - orig_pixels[128 + 2] = 0u; - orig_pixels[128 + 3] = 0u; - // BL - orig_pixels[4] = 0u; - orig_pixels[5] = 0u; - orig_pixels[128 + 4] = 50u; - orig_pixels[128 + 5] = 0u; - // BR - orig_pixels[6] = 0u; - orig_pixels[7] = 0u; - orig_pixels[128 + 6] = 0u; - orig_pixels[128 + 7] = 20u; - // Odd - orig_pixels[126] = 4u; - orig_pixels[127] = 255u; - orig_pixels[128 + 126] = 16u; - orig_pixels[128 + 127] = 255u; - - // Test regular half size. - ScaleRowDown2Box_C(orig_pixels, 128, dst_pixels_c, 64); - - EXPECT_EQ(64u, dst_pixels_c[0]); - EXPECT_EQ(25u, dst_pixels_c[1]); - EXPECT_EQ(13u, dst_pixels_c[2]); - EXPECT_EQ(5u, dst_pixels_c[3]); - EXPECT_EQ(0u, dst_pixels_c[4]); - EXPECT_EQ(133u, dst_pixels_c[63]); - - // Test Odd width version - Last pixel is just 1 horizontal pixel. 
- ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64); - - EXPECT_EQ(64u, dst_pixels_c[0]); - EXPECT_EQ(25u, dst_pixels_c[1]); - EXPECT_EQ(13u, dst_pixels_c[2]); - EXPECT_EQ(5u, dst_pixels_c[3]); - EXPECT_EQ(0u, dst_pixels_c[4]); - EXPECT_EQ(10u, dst_pixels_c[63]); - - // Test one pixel less, should skip the last pixel. - memset(dst_pixels_c, 0, sizeof(dst_pixels_c)); - ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 63); - - EXPECT_EQ(64u, dst_pixels_c[0]); - EXPECT_EQ(25u, dst_pixels_c[1]); - EXPECT_EQ(13u, dst_pixels_c[2]); - EXPECT_EQ(5u, dst_pixels_c[3]); - EXPECT_EQ(0u, dst_pixels_c[4]); - EXPECT_EQ(0u, dst_pixels_c[63]); - - // Test regular half size SSSE3. - ScaleRowDown2Box_SSSE3(orig_pixels, 128, dst_pixels_opt, 64); - - EXPECT_EQ(64u, dst_pixels_opt[0]); - EXPECT_EQ(25u, dst_pixels_opt[1]); - EXPECT_EQ(13u, dst_pixels_opt[2]); - EXPECT_EQ(5u, dst_pixels_opt[3]); - EXPECT_EQ(0u, dst_pixels_opt[4]); - EXPECT_EQ(133u, dst_pixels_opt[63]); - - // Compare C and SSSE3 match. - ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64); - ScaleRowDown2Box_Odd_SSSE3(orig_pixels, 128, dst_pixels_opt, 64); - for (int i = 0; i < 64; ++i) { - EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); + if (!has_ssse3) { + printf("Warning SSSE3 not detected; Skipping test.\n"); + } else { + // TL. + orig_pixels[0] = 255u; + orig_pixels[1] = 0u; + orig_pixels[128 + 0] = 0u; + orig_pixels[128 + 1] = 0u; + // TR. + orig_pixels[2] = 0u; + orig_pixels[3] = 100u; + orig_pixels[128 + 2] = 0u; + orig_pixels[128 + 3] = 0u; + // BL. + orig_pixels[4] = 0u; + orig_pixels[5] = 0u; + orig_pixels[128 + 4] = 50u; + orig_pixels[128 + 5] = 0u; + // BR. + orig_pixels[6] = 0u; + orig_pixels[7] = 0u; + orig_pixels[128 + 6] = 0u; + orig_pixels[128 + 7] = 20u; + // Odd. + orig_pixels[126] = 4u; + orig_pixels[127] = 255u; + orig_pixels[128 + 126] = 16u; + orig_pixels[128 + 127] = 255u; + + // Test regular half size. 
+ ScaleRowDown2Box_C(orig_pixels, 128, dst_pixels_c, 64); + + EXPECT_EQ(64u, dst_pixels_c[0]); + EXPECT_EQ(25u, dst_pixels_c[1]); + EXPECT_EQ(13u, dst_pixels_c[2]); + EXPECT_EQ(5u, dst_pixels_c[3]); + EXPECT_EQ(0u, dst_pixels_c[4]); + EXPECT_EQ(133u, dst_pixels_c[63]); + + // Test Odd width version - Last pixel is just 1 horizontal pixel. + ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64); + + EXPECT_EQ(64u, dst_pixels_c[0]); + EXPECT_EQ(25u, dst_pixels_c[1]); + EXPECT_EQ(13u, dst_pixels_c[2]); + EXPECT_EQ(5u, dst_pixels_c[3]); + EXPECT_EQ(0u, dst_pixels_c[4]); + EXPECT_EQ(10u, dst_pixels_c[63]); + + // Test one pixel less, should skip the last pixel. + memset(dst_pixels_c, 0, sizeof(dst_pixels_c)); + ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 63); + + EXPECT_EQ(64u, dst_pixels_c[0]); + EXPECT_EQ(25u, dst_pixels_c[1]); + EXPECT_EQ(13u, dst_pixels_c[2]); + EXPECT_EQ(5u, dst_pixels_c[3]); + EXPECT_EQ(0u, dst_pixels_c[4]); + EXPECT_EQ(0u, dst_pixels_c[63]); + + // Test regular half size SSSE3. + ScaleRowDown2Box_SSSE3(orig_pixels, 128, dst_pixels_opt, 64); + + EXPECT_EQ(64u, dst_pixels_opt[0]); + EXPECT_EQ(25u, dst_pixels_opt[1]); + EXPECT_EQ(13u, dst_pixels_opt[2]); + EXPECT_EQ(5u, dst_pixels_opt[3]); + EXPECT_EQ(0u, dst_pixels_opt[4]); + EXPECT_EQ(133u, dst_pixels_opt[63]); + + // Compare C and SSSE3 match. + ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64); + ScaleRowDown2Box_Odd_SSSE3(orig_pixels, 128, dst_pixels_opt, 64); + for (int i = 0; i < 64; ++i) { + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + } } } #endif // HAS_SCALEROWDOWN2_SSSE3 @@ -460,7 +443,7 @@ extern "C" void ScaleRowUp2_16_C(const uint16* src_ptr, int dst_width); TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) { - SIMD_ALIGNED(uint16 orig_pixels[640 * 2 + 1]); // 2 rows + 1 pixel overrun + SIMD_ALIGNED(uint16 orig_pixels[640 * 2 + 1]); // 2 rows + 1 pixel overrun. 
SIMD_ALIGNED(uint16 dst_pixels_opt[1280]); SIMD_ALIGNED(uint16 dst_pixels_c[1280]); @@ -531,4 +514,101 @@ TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) { EXPECT_EQ(dst_pixels_c[1279], 3839); } +// Test scaling plane with 8 bit C vs 16 bit C and return maximum pixel +// difference. +// 0 = exact. +static int TestPlaneFilter_16(int src_width, + int src_height, + int dst_width, + int dst_height, + FilterMode f, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { + if (!SizeValid(src_width, src_height, dst_width, dst_height)) { + return 0; + } + + int i; + int64 src_y_plane_size = (Abs(src_width)) * (Abs(src_height)); + int src_stride_y = Abs(src_width); + int dst_y_plane_size = dst_width * dst_height; + int dst_stride_y = dst_width; + + align_buffer_page_end(src_y, src_y_plane_size); + align_buffer_page_end(src_y_16, src_y_plane_size * 2); + align_buffer_page_end(dst_y_8, dst_y_plane_size); + align_buffer_page_end(dst_y_16, dst_y_plane_size * 2); + uint16* p_src_y_16 = reinterpret_cast<uint16*>(src_y_16); + uint16* p_dst_y_16 = reinterpret_cast<uint16*>(dst_y_16); + + MemRandomize(src_y, src_y_plane_size); + memset(dst_y_8, 0, dst_y_plane_size); + memset(dst_y_16, 1, dst_y_plane_size * 2); + + for (i = 0; i < src_y_plane_size; ++i) { + p_src_y_16[i] = src_y[i] & 255; + } + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. + ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y_8, dst_stride_y, + dst_width, dst_height, f); + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + + for (i = 0; i < benchmark_iterations; ++i) { + ScalePlane_16(p_src_y_16, src_stride_y, src_width, src_height, p_dst_y_16, + dst_stride_y, dst_width, dst_height, f); + } + + // Expect an exact match. 
+ int max_diff = 0; + for (i = 0; i < dst_y_plane_size; ++i) { + int abs_diff = Abs(dst_y_8[i] - p_dst_y_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + + free_aligned_buffer_page_end(dst_y_8); + free_aligned_buffer_page_end(dst_y_16); + free_aligned_buffer_page_end(src_y); + free_aligned_buffer_page_end(src_y_16); + + return max_diff; +} + +// The following adjustments in dimensions ensure the scale factor will be +// exactly achieved. +// 2 is chroma subsample. +#define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2) +#define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2) + +#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ + TEST_F(LibYUVScaleTest, ScalePlaneDownBy##name##_##filter##_16) { \ + int diff = TestPlaneFilter_16( \ + SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ + DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ + kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } + +// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but +// filtering is different fixed point implementations for SSSE3, Neon and C. 
+#define TEST_FACTOR(name, nom, denom, boxdiff) \ + TEST_FACTOR1(name, None, nom, denom, 0) \ + TEST_FACTOR1(name, Linear, nom, denom, boxdiff) \ + TEST_FACTOR1(name, Bilinear, nom, denom, boxdiff) \ + TEST_FACTOR1(name, Box, nom, denom, boxdiff) + +TEST_FACTOR(2, 1, 2, 0) +TEST_FACTOR(4, 1, 4, 0) +TEST_FACTOR(8, 1, 8, 0) +TEST_FACTOR(3by4, 3, 4, 1) +TEST_FACTOR(3by8, 3, 8, 1) +TEST_FACTOR(3, 1, 3, 0) +#undef TEST_FACTOR1 +#undef TEST_FACTOR +#undef SX +#undef DX } // namespace libyuv diff --git a/chromium/third_party/libyuv/unit_test/unit_test.h b/chromium/third_party/libyuv/unit_test/unit_test.h index 87201b11ddb..6454389d52d 100644 --- a/chromium/third_party/libyuv/unit_test/unit_test.h +++ b/chromium/third_party/libyuv/unit_test/unit_test.h @@ -69,19 +69,15 @@ static inline bool SizeValid(int src_width, return true; } -#define align_buffer_page_end(var, size) \ - uint8* var; \ - uint8* var##_mem; \ - var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095 + 63) & ~4095)); \ - var = (uint8*)((intptr_t)(var##_mem + \ - (((size) + 4095 + 63) & /* NOLINT */ \ - ~4095) - \ - (size)) & \ - ~63); +#define align_buffer_page_end(var, size) \ + uint8* var##_mem = \ + reinterpret_cast<uint8*>(malloc(((size) + 4095 + 63) & ~4095)); \ + uint8* var = reinterpret_cast<uint8*>( \ + (intptr_t)(var##_mem + (((size) + 4095 + 63) & ~4095) - (size)) & ~63) #define free_aligned_buffer_page_end(var) \ free(var##_mem); \ - var = 0; + var = 0 #ifdef WIN32 static inline double get_time() { diff --git a/chromium/third_party/libyuv/util/cpuid.c b/chromium/third_party/libyuv/util/cpuid.c index ec0217a65b6..9ff618e0d28 100644 --- a/chromium/third_party/libyuv/util/cpuid.c +++ b/chromium/third_party/libyuv/util/cpuid.c @@ -22,6 +22,9 @@ int main(int argc, const char* argv[]) { int has_arm = TestCpuFlag(kCpuHasARM); int has_mips = TestCpuFlag(kCpuHasMIPS); int has_x86 = TestCpuFlag(kCpuHasX86); + (void)argc; + (void)argv; + #if defined(__i386__) || defined(__x86_64__) || \ defined(_M_IX86) 
|| defined(_M_X64) if (has_x86) { @@ -76,20 +79,32 @@ int main(int argc, const char* argv[]) { int has_sse42 = TestCpuFlag(kCpuHasSSE42); int has_avx = TestCpuFlag(kCpuHasAVX); int has_avx2 = TestCpuFlag(kCpuHasAVX2); - int has_avx3 = TestCpuFlag(kCpuHasAVX3); int has_erms = TestCpuFlag(kCpuHasERMS); int has_fma3 = TestCpuFlag(kCpuHasFMA3); - int has_f16c = TestCpuFlag(kCpuHasF16C); + int has_f16c = TestCpuFlag(kCpuHasF16C); + int has_gfni = TestCpuFlag(kCpuHasGFNI); + int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW); + int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL); + int has_avx512vbmi = TestCpuFlag(kCpuHasAVX512VBMI); + int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2); + int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG); + int has_avx512vpopcntdq = TestCpuFlag(kCpuHasAVX512VPOPCNTDQ); printf("Has SSE2 %x\n", has_sse2); printf("Has SSSE3 %x\n", has_ssse3); printf("Has SSE4.1 %x\n", has_sse41); printf("Has SSE4.2 %x\n", has_sse42); printf("Has AVX %x\n", has_avx); printf("Has AVX2 %x\n", has_avx2); - printf("Has AVX3 %x\n", has_avx3); printf("Has ERMS %x\n", has_erms); printf("Has FMA3 %x\n", has_fma3); printf("Has F16C %x\n", has_f16c); + printf("Has GFNI %x\n", has_gfni); + printf("Has AVX512BW %x\n", has_avx512bw); + printf("Has AVX512VL %x\n", has_avx512vl); + printf("Has AVX512VBMI %x\n", has_avx512vbmi); + printf("Has AVX512VBMI2 %x\n", has_avx512vbmi2); + printf("Has AVX512VBITALG %x\n", has_avx512vbitalg); + printf("Has AVX512VPOPCNTDQ %x\n", has_avx512vpopcntdq); } return 0; } |